const phantom = require('phantom');
const fs = require('fs');
const cheerio = require('cheerio');
const request = require('request');
/**
 * Scrapes a Baidu image search result page with PhantomJS, collects the
 * detail-page link of every thumbnail, and downloads the image shown on each
 * detail page into `./downloads`.
 *
 * Every method that starts a PhantomJS process also shuts it down in a
 * `finally` block, so no headless browser outlives the call that created it.
 */
class ImageScraper {
  constructor() {
    this.searchUrl = 'https://image.baidu.com/search/index?ct=201326592&z=&tn=baiduimage&word=%E6%BC%AB%E5%A8%81%E5%9B%BE%E7%89%87&pn=0&ie=utf-8&oe=utf-8&cl=2&lm=-1&fr=ala&se=&sme=&width=1920&height=1080';
    this.currentPage = 1;
    // Filled by extractThumbnails(): [{ detailUrl: string }, ...]
    this.imageLinks = [];
    // Replaced by scrollPage() with a cheerio selection of `.imgitem` nodes.
    this.thumbnailElements = [];
  }

  /**
   * Resolves a possibly relative or protocol-relative link against a base URL.
   *
   * @param {string|undefined} href - Raw attribute value taken from the page.
   * @param {string} [base] - URL that relative links are resolved against.
   * @returns {string|null} Absolute URL, or null when `href` is missing or
   *   cannot be parsed.
   */
  static resolveUrl(href, base = 'https://image.baidu.com') {
    if (typeof href !== 'string' || href.trim() === '') {
      return null;
    }
    try {
      return new URL(href.trim(), base).href;
    } catch (parseError) {
      return null;
    }
  }

  /**
   * Derives a file extension from an image URL.
   *
   * Only the last path segment is inspected, so query strings, fragments and
   * dots in the host name can never leak into the resulting file name.
   *
   * @param {string} imageSrc - Image URL (absolute, relative or protocol-relative).
   * @param {string} [fallback] - Extension used when none can be derived.
   * @returns {string} Extension without the leading dot.
   */
  static fileExtensionFromUrl(imageSrc, fallback = 'jpg') {
    let pathname;
    try {
      pathname = new URL(imageSrc, 'https://image.baidu.com').pathname;
    } catch (parseError) {
      pathname = String(imageSrc).split(/[?#]/)[0];
    }
    const lastSegment = pathname.split('/').pop();
    const dotIndex = lastSegment.lastIndexOf('.');
    if (dotIndex === -1) {
      return fallback;
    }
    const extension = lastSegment.slice(dotIndex + 1);
    // Anything that does not look like a plain extension is unsafe in a file name.
    return /^[a-z0-9]{1,5}$/i.test(extension) ? extension : fallback;
  }

  /**
   * Runs the whole pipeline: load the result page, collect the detail links,
   * download the images. Errors are logged, not rethrown, so the returned
   * promise always fulfils.
   *
   * @returns {Promise<void>}
   */
  async initialize() {
    try {
      await this.loadWebpage();
      await this.extractThumbnails();
      await this.downloadHighResImages();
    } catch (error) {
      console.error('Scraping error:', error);
    }
  }

  /**
   * Resolves after the given number of seconds.
   *
   * @param {number} seconds
   * @returns {Promise<void>}
   */
  wait(seconds) {
    return new Promise((resolve) => {
      setTimeout(resolve, seconds * 1000);
    });
  }

  /**
   * Resolves after the given number of milliseconds and logs the pause.
   *
   * @param {number} milliseconds
   * @returns {Promise<void>}
   */
  pause(milliseconds) {
    return new Promise((resolve) => {
      console.log(`Pausing for ${milliseconds / 1000} seconds...`);
      setTimeout(resolve, milliseconds);
    });
  }

  /**
   * Loads the search result page starting from the top.
   *
   * @returns {Promise<void>}
   */
  async loadWebpage() {
    await this.scrollPage(0);
  }

  /**
   * Opens the search page once and scrolls it step by step until at least 20
   * `.imgbox` thumbnails are present or `maxScrolls` extra steps were made,
   * then stores the `.imgitem` selection in `this.thumbnailElements`.
   *
   * @param {number} [scrollCount] - Scroll step to start from (1000px per step).
   * @param {number} [maxScrolls] - Upper bound on additional scroll steps, so a
   *   page that never yields 20 thumbnails cannot loop forever.
   * @returns {Promise<void>}
   * @throws {Error} When the search page cannot be opened.
   */
  async scrollPage(scrollCount = 0, maxScrolls = 10) {
    const phantomInstance = await phantom.create();
    try {
      const page = await phantomInstance.createPage();
      // The viewport has to be configured before loading to affect the layout.
      await page.property('viewportSize', {
        width: 1920,
        height: 1080
      });
      const status = await page.open(this.searchUrl);
      if (status !== 'success') {
        throw new Error(`Failed to open search page (status: ${status})`);
      }

      let $;
      for (let step = scrollCount; ; step++) {
        // Give the page time to render what the previous step made visible.
        await this.wait(5);
        await page.property('scrollPosition', {
          left: 0,
          top: 1000 * step
        });
        const pageContent = await page.property('content');
        $ = cheerio.load(pageContent);
        const thumbnailCount = $('.imgbox').length;
        console.log('Thumbnail count:', thumbnailCount);
        if (thumbnailCount >= 20 || step - scrollCount >= maxScrolls) {
          break;
        }
      }
      this.thumbnailElements = $('.imgitem');
    } finally {
      await phantomInstance.exit();
    }
  }

  /**
   * Appends one `{ detailUrl }` entry to `this.imageLinks` for every collected
   * thumbnail that contains a link. Thumbnails without an `href` are skipped.
   * Does nothing when no thumbnails have been collected yet.
   *
   * @returns {Promise<void>}
   */
  async extractThumbnails() {
    const thumbnails = this.thumbnailElements;
    // Before scrollPage() ran this is still the plain array from the constructor.
    if (!thumbnails || typeof thumbnails.each !== 'function') {
      return;
    }
    thumbnails.each((index) => {
      const href = thumbnails.eq(index).find('a').attr('href');
      const detailUrl = ImageScraper.resolveUrl(href);
      if (detailUrl !== null) {
        this.imageLinks.push({ detailUrl });
      }
    });
  }

  /**
   * Streams a remote image into a local file.
   *
   * @param {string} imageSrc - Absolute image URL.
   * @param {string} destination - Path of the file to write.
   * @returns {Promise<void>} Fulfils once the file has been completely written;
   *   rejects on a request or file system error.
   */
  downloadImage(imageSrc, destination) {
    return new Promise((resolve, reject) => {
      const fileStream = fs.createWriteStream(destination);
      const fail = (error) => {
        fileStream.destroy();
        reject(error);
      };
      fileStream.on('finish', resolve);
      fileStream.on('error', fail);
      request(imageSrc).on('error', fail).pipe(fileStream);
    });
  }

  /**
   * Visits every collected detail page, reads the `#currentImg` source and
   * saves the image to `./downloads/<timestamp>.<ext>`. A failure on one image
   * is logged and does not stop the remaining downloads.
   *
   * @returns {Promise<void>}
   */
  async downloadHighResImages() {
    if (this.imageLinks.length === 0) {
      return;
    }
    fs.mkdirSync('./downloads', { recursive: true });

    const phantomInstance = await phantom.create();
    try {
      const page = await phantomInstance.createPage();
      for (let i = 0; i < this.imageLinks.length; i++) {
        const { detailUrl } = this.imageLinks[i];
        try {
          await page.open(detailUrl);
          await this.wait(2);
          const pageContent = await page.property('content');
          const $ = cheerio.load(pageContent);
          const imageSrc = ImageScraper.resolveUrl($('#currentImg').attr('src'), detailUrl);
          if (imageSrc === null) {
            throw new Error(`No #currentImg source found on ${detailUrl}`);
          }
          const fileExtension = ImageScraper.fileExtensionFromUrl(imageSrc);
          console.log('Downloading image', i + 1);
          await this.downloadImage(imageSrc, `./downloads/${Date.now()}.${fileExtension}`);
          console.log('Download completed');
          // Throttle requests to the remote host.
          await this.pause(3000);
        } catch (error) {
          console.error('Download error:', error);
        }
      }
    } finally {
      await phantomInstance.exit();
    }
  }
}
// Entry point: run the scraper once. The promise is handled explicitly so a
// rejection from initialize() is reported instead of becoming an unhandled
// rejection.
const scraper = new ImageScraper();
scraper.initialize().catch((error) => {
  console.error('Scraping error:', error);
  process.exitCode = 1;
});