Building a Multi-threaded Novel Scraper with Python
A straightforward approach to extracting novel content from websites using Python's requests library and lxml for HTML parsing, with multi-threaded download capabilities.
Core Configuration
# Shared crawl state, read/written by the UI thread and all workers.
worker_threads = 5                 # size of the download thread pool
stop_flag = False                  # set True to ask workers to exit early
running_state = False              # guards against starting a second crawl
thread_lock = threading.Lock()     # protects the shared URL queue / counters
Data Model
class Novel:
    """Container for one scraped novel: its metadata and text body."""

    def __init__(self, title=None, content=None, writer=None, source=None):
        # Instance attributes rather than shared class attributes, so two
        # Novel objects can never alias each other's data.
        self.title = title
        self.content = content
        self.writer = writer
        self.source = source

    def __repr__(self):
        return (f"{type(self).__name__}(title={self.title!r}, "
                f"writer={self.writer!r}, source={self.source!r})")
Thread Worker Function
Each worker thread continuously retrieves chapter URLs from the queue and processes them until the queue is empty or the user interrupts the operation.
def worker():
    """Drain chapter URLs from the shared queue until it is empty or the
    user cancels; updates the status display as it goes."""
    global url_queue, status_display, execution_status
    while True:
        if stop_flag:
            status_display.insert(END, f"{threading.current_thread().name} terminated due to user cancellation.\n")
            status_display.yview_moveto(1)
            execution_status = False
            return
        # The emptiness check and the pop must happen atomically under the
        # lock: checking `while url_queue` outside it races with the other
        # workers and can raise IndexError on an already-drained queue.
        # `with` also guarantees the lock is released if pop() raises.
        with thread_lock:
            if not url_queue:
                break
            chapter_url = url_queue.pop()
        fetch_chapter(chapter_url)
    status_display.insert(END, f"{threading.current_thread().name} finished processing.\n")
    status_display.yview_moveto(1)
    execution_status = False
Chapter Download Function
The URL parameter contains a pre-processed single chapter link stored in a collection. After fetching the page, parse the HTML and save the content to disk.
def fetch_chapter(chapter_url):
    """Download one chapter page, extract its title and paragraphs via
    XPath, and append the text to a per-chapter file under `save_path`."""
    global progress_label, progress_bar, progress_value, app_window, save_path
    client = requests.session()
    client.keep_alive = False  # force a fresh connection per request
    try:
        response = client.get(chapter_url)
        tree = etree.HTML(response.text)
        # xpath() returns a *list* of text nodes. The original code formatted
        # the raw lists into the filename, producing names like
        # "['Title']['Chapter 1'].txt" — take the first hit instead, with a
        # placeholder fallback so a site-layout change cannot yield "[]".
        novel_hits = tree.xpath("/html/body/div[2]/div[3]/div[2]/a[3]/text()")
        chapter_hits = tree.xpath("/html/body/div[2]/div[3]/div[3]/div/div[1]/div[2]/div[2]/text()")
        novel_title = novel_hits[0].strip() if novel_hits else "unknown"
        chapter_title = chapter_hits[0].strip() if chapter_hits else "unknown"
        paragraphs = tree.xpath("/html/body/div[2]/div[3]/div[3]/div/div[1]/div[5]/p/text()")
        formatted_content = '\n'.join(paragraphs)
        output_file = f"{save_path}\\{novel_title}{chapter_title}.txt"
        with open(output_file, "a+", encoding="utf-8") as f:
            f.write(formatted_content)
        # progress_value is shared by every worker thread; increment it
        # under the lock so concurrent chapters are not lost.
        with thread_lock:
            progress_value += 1
            progress_bar['value'] = progress_value
        # NOTE(review): tkinter widgets are generally not thread-safe;
        # calling update()/insert() from worker threads appears to work but
        # should be routed through app_window.after(...) — confirm.
        app_window.update()
        print(formatted_content)
        status_display.insert(END, f"{threading.current_thread().name} completed: {novel_title}{chapter_title}\n")
        status_display.yview_moveto(1)
    finally:
        # Close the session even if the request or parsing raises.
        client.close()
Initialization Logic
def start_crawl():
    """Launch the worker thread pool over the catalogue URLs.

    Does nothing if a crawl is already marked as running.
    """
    global running_state, stop_flag, url_queue
    if running_state:
        return
    url_queue = list(catalogue_urls)  # Populate from UI
    running_state = True
    stop_flag = False
    # NOTE(review): workers clear `execution_status` when they finish, not
    # `running_state`, so this guard may never re-open for a second crawl —
    # confirm which flag is intended.
    threads = [threading.Thread(target=worker) for _ in range(worker_threads)]
    for t in threads:
        t.start()
Stopping Mechanism
def halt_crawl():
    """Signal every worker thread to stop at its next queue check."""
    global stop_flag
    stop_flag = True
The implementation demonstrates several essential patterns: thread-safe queue management using locks to prevent duplicate downloads, short-lived sessions with keep-alive disabled so each request uses a fresh connection, XPath-based HTML traversal for extracting structured content, and progress tracking through UI updates.
Additional considerations for production use include implementing retry logic for failed requests, adding rate limiting to avoid server strain, and incorporating proper error handling for malformed HTML structures.