Web Scraping Image Galleries with Python: Overcoming Referer Headers and Anti-Scraping Measures
When implementing a web scraper to download images from a gallery site, the initial attempt to download pictures resulted in corrupted files. Directly accessing the image URLs in a browser worked for previously viewed images but failed for new ones, suggesting a server-side check.
Analysis of network requests revealed that image downloads required specific headers. The Referer header, which indicates the page from which the request originated, was essential for successful image retrieval.
Additionally, to avoid connection interruptions due to user-agent detection, a random User-Agent rotation mechanism was implemented. The following code demonstrates the solution.
import requests
from bs4 import BeautifulSoup
import os
import random
class GalleryImageScraper:
    """Scrape an image gallery site and download every image per category.

    The target site rejects bare image requests: each image download must
    carry a ``Referer`` header naming the gallery page it was found on.
    A random ``User-Agent`` is also rotated per image request to reduce the
    chance of the server dropping connections for a repeated client string.
    """

    def __init__(self):
        # Root directory; one sub-folder is created per gallery category.
        self.base_folder = "D:/gallery_images"
        # Pool of desktop-browser User-Agent strings, one chosen at random
        # for every image request (see send_image_request).
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0"
        ]

    def scrape_category_list(self, start_url):
        """Fetch the category index page and process each listed category.

        Expects the index page to contain a ``<div class="all">`` holding
        one ``<a>`` per category. Categories whose folder already exists
        locally are skipped so the crawl can be resumed.
        """
        page_content = self.send_page_request(start_url)
        if not page_content:
            return
        soup = BeautifulSoup(page_content.text, 'lxml')
        # BUG FIX: the original dereferenced find(...) without a None check
        # and crashed with AttributeError when the layout changed or an
        # error page was served.
        category_container = soup.find('div', {'class': 'all'})
        if category_container is None:
            print(f'No category list found at {start_url}')
            return
        for link_element in category_container.find_all('a'):
            category_name = link_element.get_text()
            if category_name == '早期图片':
                continue  # skip the "early pictures" archive link
            print(f'Processing: {category_name}')
            # "?" is not a legal character in Windows folder names.
            folder_name = category_name.replace("?", "_")
            if not self.prepare_directory(folder_name):
                print(f'Skipping existing folder: {category_name}')
                continue
            category_url = link_element.get('href')
            if category_url:  # ignore anchors with no destination
                self.process_category(category_url)

    def process_category(self, category_url):
        """Determine a category's page count and scrape every image page."""
        category_content = self.send_page_request(category_url)
        if not category_content:
            return
        soup = BeautifulSoup(category_content.text, 'lxml')
        page_nav = soup.find('div', {'class': 'pagenavi'})
        # BUG FIX: total_pages was only assigned inside the `if page_nav`
        # branch, so single-page categories (no pagination bar) raised
        # UnboundLocalError at the loop below. Default to one page.
        total_pages = 1
        if page_nav:
            spans = page_nav.find_all('span')
            # Site convention: the second-to-last <span> holds the last
            # page number; guard against unexpected markup.
            if len(spans) >= 2:
                try:
                    total_pages = int(spans[-2].get_text())
                except ValueError:
                    pass  # non-numeric text: fall back to a single page
        for page_num in range(1, total_pages + 1):
            self.extract_image_url(f"{category_url}/{page_num}")

    def extract_image_url(self, image_page_url):
        """Pull the main image URL out of one gallery page and download it."""
        image_page = self.send_page_request(image_page_url)
        if not image_page:
            return
        soup = BeautifulSoup(image_page.text, 'lxml')
        main_image_div = soup.find('div', {'class': 'main-image'})
        if main_image_div:
            img_tag = main_image_div.find('img')
            if img_tag and 'src' in img_tag.attrs:
                # The page URL becomes the Referer for the image request.
                self.download_image_file(img_tag['src'], image_page_url)

    def download_image_file(self, image_url, referer_url):
        """Download one image into the current working directory.

        Relies on prepare_directory() having chdir'd into the category
        folder; the file is written with a relative path.
        """
        # BUG FIX: the original sliced image_url[-9:-4], which silently
        # produces garbage names when the URL's filename is not exactly
        # nine characters long. Derive the stem from the URL path instead.
        url_filename = image_url.rsplit('/', 1)[-1]
        file_identifier = os.path.splitext(url_filename)[0] or 'image'
        image_data = self.send_image_request(image_url, referer_url)
        if image_data:
            try:
                with open(file_identifier + '.jpg', 'wb') as img_file:
                    img_file.write(image_data.content)
            except OSError as error:
                print(f'Failed to save image {image_url}: {error}')

    def prepare_directory(self, folder_path):
        """Create the category folder and chdir into it.

        Returns:
            True if the folder was newly created (caller should scrape it),
            False if it already existed (caller skips — resume support).
        """
        folder_path = folder_path.strip()
        full_path = os.path.join(self.base_folder, folder_path)
        if not os.path.exists(full_path):
            print(f'Creating folder: {folder_path}')
            os.makedirs(full_path)
            # download_image_file writes relative paths, so move into the
            # new folder here.
            os.chdir(full_path)
            return True
        return False

    def send_image_request(self, target_url, referer_url):
        """GET an image with the Referer header the server requires.

        Returns the response on success, or None on any request failure.
        """
        request_headers = {
            'User-Agent': random.choice(self.user_agents),
            # Without this header the server returns a corrupted/denied body.
            'Referer': referer_url
        }
        try:
            # BUG FIX: added a timeout so a stalled connection cannot hang
            # the crawl, and a status check so 403/404 bodies are not saved
            # to disk as broken ".jpg" files.
            response = requests.get(target_url, headers=request_headers, timeout=10)
            response.raise_for_status()  # HTTPError is a RequestException
            return response
        except requests.RequestException as error:
            print(f'Error fetching image {target_url}: {error}')
            return None

    def send_page_request(self, target_url):
        """GET an HTML page with a fixed desktop User-Agent.

        Returns the response on success, or None on any request failure.
        """
        request_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        try:
            response = requests.get(target_url, headers=request_headers, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException as error:
            print(f'Error fetching page {target_url}: {error}')
            return None
if __name__ == '__main__':
    # Entry point: crawl the full category index and download every image.
    gallery_bot = GalleryImageScraper()
    gallery_bot.scrape_category_list('http://www.mzitu.com/all')
    print('Image scraping completed.')