Python Web Scraping: Single-Threaded vs Multi-Threaded Approaches
Overview
Web scraping is a common technique for extracting data from websites. This article demonstrates how to build an image scraper in Python using two different approaches: a sequential single-threaded version and a concurrent multi-threaded version. The code examples illustrate key concepts like HTTP request handling, HTML parsing, and process pool management.
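Both versions build on the same fetch-and-parse pattern: requests retrieves the raw HTML and BeautifulSoup turns it into a searchable tree. Here is a minimal sketch of that pattern, using example.com as a placeholder URL:

import requests
from bs4 import BeautifulSoup

# Fetch a page and parse its HTML into a searchable tree
response = requests.get('http://example.com')
soup = BeautifulSoup(response.text, 'html.parser')

# find_all returns every matching element as a list
for link in soup.find_all('a'):
    print(link.get('href'), link.get_text())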
Single-Threaded Implementation
The following implementation processes requests sequentially, one after another:
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import os
BASE_URL = 'http://www.mzitu.com'
# HTTP headers to mimic browser behavior and bypass hotlink protection
host_headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': BASE_URL
}
image_headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://i.meizitu.net'
}
# Base path for downloads; per-gallery folders are created beneath it
output_dir = os.getcwd() + '/mzitu/'
# Fetch the main page to determine total pages
response = requests.get(BASE_URL, headers=host_headers)
parser = BeautifulSoup(response.text, "html.parser")
pagination = parser.find_all('a', class_='page-numbers')
# The second-to-last pagination link holds the highest page number
total_pages = pagination[-2].text
# Iterate through each page
category_url = f'{BASE_URL}/page/'
for page_num in range(1, int(total_pages) + 1):
    page_url = f'{category_url}{page_num}'
    response = requests.get(page_url, headers=host_headers)
    soup = BeautifulSoup(response.text, "html.parser")
    gallery_links = soup.find('div', class_='postlist').find_all('a', target='_blank')
    for link in gallery_links:
        gallery_title = link.get_text()
        if not gallery_title:
            continue
        print(f"Processing: {gallery_title}")
        # Sanitize folder name by removing invalid characters
        sanitized_title = gallery_title.strip().replace('?', '')
        folder_path = os.path.join(output_dir, sanitized_title)
        if os.path.exists(folder_path):
            existing_files = len(os.listdir(folder_path))
            # Check if download is complete by comparing file count
            gallery_response = requests.get(link['href'], headers=host_headers)
            gallery_soup = BeautifulSoup(gallery_response.text, "html.parser")
            image_count = gallery_soup.find_all('span')[10].text
            if existing_files >= int(image_count):
                print('Already downloaded, skipping...')
                continue
        else:
            os.makedirs(folder_path)
        os.chdir(folder_path)
        # Fetch the gallery page to determine how many image pages it contains
        gallery_response = requests.get(link['href'], headers=host_headers)
        gallery_soup = BeautifulSoup(gallery_response.text, "html.parser")
        image_count = gallery_soup.find_all('span')[10].text
        for img_num in range(1, int(image_count) + 1):
            image_page_url = f"{link['href']}/{img_num}"
            page_response = requests.get(image_page_url, headers=host_headers)
            page_soup = BeautifulSoup(page_response.text, "html.parser")
            image_element = page_soup.find('img', alt=gallery_title)
            # Skip pages where the image element is missing or lacks a source attribute
            if image_element is None or 'src' not in image_element.attrs:
                continue
            image_url = image_element['src']
            print(image_url)
            # Download the image with the hotlink-bypass headers
            image_response = requests.get(image_url, headers=image_headers)
            filename = image_url.split('/')[-1]
            with open(filename, 'wb') as f:
                f.write(image_response.content)
    print(f'Page {page_num} completed')
Multi-Threaded Implementation
For improved performance, this version uses Python's multiprocessing module to download multiple galleries concurrently:
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import os
from multiprocessing import Pool, freeze_support
import sys
request_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36',
    'Referer': 'http://www.mzitu.com'
}
def retrieve_total_pages():
    """Fetch the total number of gallery pages from the website."""
    response = requests.get('http://www.mzitu.com', headers=request_headers)
    soup = BeautifulSoup(response.text, "html.parser")
    pagination = soup.find_all('a', class_='page-numbers')
    # The second-to-last pagination link holds the highest page number
    return pagination[-2].text

def fetch_image_gallery(gallery_url, gallery_title, base_path):
    """Download all images from a specific gallery."""
    response = requests.get(gallery_url, headers=request_headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    total_images = soup.find_all('span')[10].text
    sanitized_title = gallery_title.strip().replace('?', '')
    gallery_folder = os.path.join(base_path, sanitized_title)
    # Skip if already downloaded
    if os.path.exists(gallery_folder):
        if len(os.listdir(gallery_folder)) >= int(total_images):
            print(f'Already downloaded: {gallery_title}')
            return
    print(f"Downloading {total_images} images: {gallery_title}")
    os.makedirs(gallery_folder, exist_ok=True)
    os.chdir(gallery_folder)
    for image_index in range(1, int(total_images) + 1):
        image_page_url = f'{gallery_url}/{image_index}'
        page_response = requests.get(image_page_url, headers=request_headers)
        page_soup = BeautifulSoup(page_response.text, "html.parser")
        image_tag = page_soup.find('img', alt=gallery_title)
        # Skip pages where the image tag is missing or lacks a source attribute
        if image_tag is None or 'src' not in image_tag.attrs:
            continue
        image_url = image_tag['src']
        print(f"{gallery_title}: {image_url}")
        image_response = requests.get(image_url, headers=request_headers)
        filename = image_url.split('/')[-1]
        with open(filename, 'wb') as output_file:
            output_file.write(image_response.content)
    print(f'Completed: {gallery_title}')
if __name__ == '__main__':
    # Required when packaging as a frozen Windows executable
    freeze_support()
    # Configure process pool size from a command-line argument (default: 1)
    pool_size = 1
    if len(sys.argv) > 1:
        pool_size = int(sys.argv[1])
    process_pool = Pool(pool_size)
    print(f'Initializing process pool with {pool_size} workers')
    # Set up the output directory
    save_path = os.getcwd() + '/mzitu_parallel/'
    total_pages = retrieve_total_pages()
    print(f'Found {total_pages} pages to process')
    page_base = 'http://www.mzitu.com/page/'
    for page_number in range(1, int(total_pages) + 1):
        page_url = f'{page_base}{page_number}'
        response = requests.get(page_url, headers=request_headers)
        soup = BeautifulSoup(response.text, "html.parser")
        all_galleries = soup.find('div', class_='postlist').find_all('a', target='_blank')
        for gallery in all_galleries:
            title = gallery.get_text()
            if title:
                gallery_url = gallery['href']
                # Queue each gallery download as an asynchronous pool task
                process_pool.apply_async(
                    fetch_image_gallery,
                    args=(gallery_url, title, save_path)
                )
    process_pool.close()
    process_pool.join()
    print('All downloads completed')
Key Differences
The single-threaded approach processes requests sequentially, which means it waits for each HTTP request and image download to complete before starting the next one. This is simple to implement but can be slow when dealing with many resources.
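To make the difference concrete, here is a minimal timing sketch; it uses httpbin.org/delay/1 as a placeholder endpoint that simply waits one second before responding:

import time
import requests
from multiprocessing import Pool

# Each request takes about one second to complete
URLS = ['https://httpbin.org/delay/1'] * 4

def fetch(url):
    return requests.get(url).status_code

if __name__ == '__main__':
    start = time.time()
    for url in URLS:
        fetch(url)
    print(f'Sequential: {time.time() - start:.1f}s')  # roughly 4 seconds

    start = time.time()
    with Pool(4) as pool:
        pool.map(fetch, URLS)
    print(f'Pooled: {time.time() - start:.1f}s')  # roughly 1 second plus pool startup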
The multi-threaded version uses multiprocessing.Pool to create a pool of worker processes. Each worker handles downloading images from one gallery independently, allowing multiple downloads to occur simultaneously. The freeze_support() call is essential when packaging the application as a Windows executable.
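Stripped of the scraping logic, the structure the multi-threaded version follows looks like the sketch below; do_work is a hypothetical stand-in for fetch_image_gallery:

from multiprocessing import Pool, freeze_support

def do_work(item):
    # Hypothetical placeholder for per-gallery work like fetch_image_gallery
    print(f'Processing {item}')

if __name__ == '__main__':
    freeze_support()  # a no-op unless running as a frozen Windows executable
    pool = Pool(4)
    for item in range(10):
        pool.apply_async(do_work, args=(item,))
    pool.close()  # stop accepting new tasks
    pool.join()   # wait for all queued tasks to finish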
Both implementations include error handling for missing image attributes and check for existing downloads to avoid redundant work. The multi-threaded version accepts a command-line argument to configure the number of parallel workers.
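Assuming the multi-threaded script is saved as scraper_parallel.py (a hypothetical filename), launching it with four workers would look like this:

# Run with 4 worker processes; omitting the argument defaults to 1
python scraper_parallel.py 4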