Web Scraping Image Galleries with Python: Overcoming Referer Headers and Anti-Scraping Measures
When implementing a web scraper to download images from a gallery site, the initial attempt to download pictures resulted in corrupted files. Directly accessing the image URLs in a browser worked for previously viewed images but failed for new ones, suggesting a server-side check.
Analysis of network requests revealed that image downloads required specific headers. The Referer header, which indicates the page from which the request originated, was essential for successful image retrieval.
Additionally, to avoid connection interruptions due to user-agent detection, a random User-Agent rotation mechanism was implemented. The following code demonstrates the solution.
import requests
from bs4 import BeautifulSoup
import os
import random
class GalleryImageScraper:
    """Scrape an image gallery site and download every image per category.

    The target site rejects bare image requests: each image download must
    carry a ``Referer`` header naming the gallery page it was found on.
    A random ``User-Agent`` is also rotated per image request to reduce the
    chance of the server dropping connections for a repeated client string.
    """

    def __init__(self):
        # Root directory; one sub-folder is created per gallery category.
        self.base_folder = "D:/gallery_images"
        # Pool of desktop-browser User-Agent strings, one chosen at random
        # for every image request (see send_image_request).
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0"
        ]

    def scrape_category_list(self, start_url):
        """Fetch the category index page and process each listed category.

        Expects the index page to contain a ``<div class="all">`` holding
        one ``<a>`` per category. Categories whose folder already exists
        locally are skipped so the crawl can be resumed.
        """
        page_content = self.send_page_request(start_url)
        if not page_content:
            return
        soup = BeautifulSoup(page_content.text, 'lxml')
        # BUG FIX: the original dereferenced find(...) without a None check
        # and crashed with AttributeError when the layout changed or an
        # error page was served.
        category_container = soup.find('div', {'class': 'all'})
        if category_container is None:
            print(f'No category list found at {start_url}')
            return
        for link_element in category_container.find_all('a'):
            category_name = link_element.get_text()
            if category_name == '早期图片':
                continue  # skip the "early pictures" archive link
            print(f'Processing: {category_name}')
            # "?" is not a legal character in Windows folder names.
            folder_name = category_name.replace("?", "_")
            if not self.prepare_directory(folder_name):
                print(f'Skipping existing folder: {category_name}')
                continue
            category_url = link_element.get('href')
            if category_url:  # ignore anchors with no destination
                self.process_category(category_url)

    def process_category(self, category_url):
        """Determine a category's page count and scrape every image page."""
        category_content = self.send_page_request(category_url)
        if not category_content:
            return
        soup = BeautifulSoup(category_content.text, 'lxml')
        page_nav = soup.find('div', {'class': 'pagenavi'})
        # BUG FIX: total_pages was only assigned inside the `if page_nav`
        # branch, so single-page categories (no pagination bar) raised
        # UnboundLocalError at the loop below. Default to one page.
        total_pages = 1
        if page_nav:
            spans = page_nav.find_all('span')
            # Site convention: the second-to-last <span> holds the last
            # page number; guard against unexpected markup.
            if len(spans) >= 2:
                try:
                    total_pages = int(spans[-2].get_text())
                except ValueError:
                    pass  # non-numeric text: fall back to a single page
        for page_num in range(1, total_pages + 1):
            self.extract_image_url(f"{category_url}/{page_num}")

    def extract_image_url(self, image_page_url):
        """Pull the main image URL out of one gallery page and download it."""
        image_page = self.send_page_request(image_page_url)
        if not image_page:
            return
        soup = BeautifulSoup(image_page.text, 'lxml')
        main_image_div = soup.find('div', {'class': 'main-image'})
        if main_image_div:
            img_tag = main_image_div.find('img')
            if img_tag and 'src' in img_tag.attrs:
                # The page URL becomes the Referer for the image request.
                self.download_image_file(img_tag['src'], image_page_url)

    def download_image_file(self, image_url, referer_url):
        """Download one image into the current working directory.

        Relies on prepare_directory() having chdir'd into the category
        folder; the file is written with a relative path.
        """
        # BUG FIX: the original sliced image_url[-9:-4], which silently
        # produces garbage names when the URL's filename is not exactly
        # nine characters long. Derive the stem from the URL path instead.
        url_filename = image_url.rsplit('/', 1)[-1]
        file_identifier = os.path.splitext(url_filename)[0] or 'image'
        image_data = self.send_image_request(image_url, referer_url)
        if image_data:
            try:
                with open(file_identifier + '.jpg', 'wb') as img_file:
                    img_file.write(image_data.content)
            except OSError as error:
                print(f'Failed to save image {image_url}: {error}')

    def prepare_directory(self, folder_path):
        """Create the category folder and chdir into it.

        Returns:
            True if the folder was newly created (caller should scrape it),
            False if it already existed (caller skips — resume support).
        """
        folder_path = folder_path.strip()
        full_path = os.path.join(self.base_folder, folder_path)
        if not os.path.exists(full_path):
            print(f'Creating folder: {folder_path}')
            os.makedirs(full_path)
            # download_image_file writes relative paths, so move into the
            # new folder here.
            os.chdir(full_path)
            return True
        return False

    def send_image_request(self, target_url, referer_url):
        """GET an image with the Referer header the server requires.

        Returns the response on success, or None on any request failure.
        """
        request_headers = {
            'User-Agent': random.choice(self.user_agents),
            # Without this header the server returns a corrupted/denied body.
            'Referer': referer_url
        }
        try:
            # BUG FIX: added a timeout so a stalled connection cannot hang
            # the crawl, and a status check so 403/404 bodies are not saved
            # to disk as broken ".jpg" files.
            response = requests.get(target_url, headers=request_headers, timeout=10)
            response.raise_for_status()  # HTTPError is a RequestException
            return response
        except requests.RequestException as error:
            print(f'Error fetching image {target_url}: {error}')
            return None

    def send_page_request(self, target_url):
        """GET an HTML page with a fixed desktop User-Agent.

        Returns the response on success, or None on any request failure.
        """
        request_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        try:
            response = requests.get(target_url, headers=request_headers, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException as error:
            print(f'Error fetching page {target_url}: {error}')
            return None
if __name__ == '__main__':
    # Entry point: crawl the full category index and download every image.
    gallery_bot = GalleryImageScraper()
    gallery_bot.scrape_category_list('http://www.mzitu.com/all')
    print('Image scraping completed.')