
Web Scraping a Chinese Fiction Site with Scrapy: Metadata and Chapter Capture


Initialize a Scrapy project under a unique project name. Run scrapy startproject novel_scraper to generate the base directory tree, which will look like this once the spider file is added:

├── novel_scraper
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── fiction_crawler.py
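
If you prefer, the spider stub can be generated instead of written by hand; scrapy genspider pre-fills the name and allowed_domains values used below:

cd novel_scraper
scrapy genspider fiction_crawler 23us.so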

Spider Implementation (fiction_crawler.py)

This class handles asynchronous request scheduling and content extraction across the genre, list, detail, and chapter-index pages. Only a single genre slice is enabled initially for testing.

import scrapy
from urllib.parse import urljoin
from novel_scraper.items import FictionMetadataItem, FictionChapterLinkItem

class GenreFictionSpider(scrapy.Spider):
    name = "fiction_crawler"
    allowed_domains = ["23us.so"]
    genre_base = "https://www.23us.so/fenlei/"
    genre_suffix = ".html"

    def start_requests(self):
        # Test with genre ID 8 (wuxia) only; the page cap is applied in parse_genre_pages
        for genre_id in range(8, 9):
            first_page = f"{self.genre_base}{genre_id}_1{self.genre_suffix}"
            yield scrapy.Request(first_page, callback=self.parse_genre_pages)

    def parse_genre_pages(self, response):
        # Strip the trailing "_1.html" (7 characters) to get the paginated URL stem
        base_paginated = response.url[:-7]
        max_page = response.xpath('//div[@id="pagelink"]/a[last()-1]/text()').get()
        # Fall back to a single page if the pagination block is absent
        max_page = int(max_page) if max_page else 1
        # Use a limited test range first (cap at 3 pages)
        for page_num in range(1, min(max_page, 3) + 1):
            target_url = f"{base_paginated}_{page_num}{self.genre_suffix}"
            yield scrapy.Request(target_url, callback=self.extract_fiction_links)

    def extract_fiction_links(self, response):
        for row in response.xpath('//table[@class="grid"]/tr')[1:]:
            detail_url = row.xpath('td[1]/a/@href').get()
            title = row.xpath('td[1]/a/text()').get()
            if detail_url:
                yield scrapy.Request(
                    response.urljoin(detail_url),  # list links may be relative
                    callback=self.parse_fiction_detail,
                    meta={"temp_title": title}
                )

    def parse_fiction_detail(self, response):
        temp_title = response.meta["temp_title"]
        metadata = FictionMetadataItem()
        metadata["title"] = temp_title
        metadata["author"] = response.xpath('//table[@class="grid"]/tr[1]/td[2]/text()').get()
        metadata["site_url"] = response.url
        metadata["status"] = response.xpath('//table[@class="grid"]/tr[1]/td[3]/text()').get()
        metadata["word_count"] = response.xpath('//table[@class="grid"]/tr[2]/td[2]/text()').get()
        metadata["genre"] = response.xpath('//table[@class="grid"]/tr[1]/td[1]/a/text()').get()
        metadata["site_id"] = response.url.split("/")[-1].split(".")[0]
        metadata["total_collects"] = response.xpath('//table[@class="grid"]/tr[2]/td[1]/text()').get()
        metadata["total_clicks"] = response.xpath('//table[@class="grid"]/tr[3]/td[1]/text()').get()
        metadata["summary"] = response.xpath('//div[@id="intro"]/p[2]').get()
        chapter_index_url = response.xpath('//p[@class="btnlinks"]/a[2]/@href').get()
        # Guard against detail pages with no chapter-index link
        if chapter_index_url:
            yield scrapy.Request(
                response.urljoin(chapter_index_url),
                callback=self.extract_chapter_urls,
                meta={"book_metadata": metadata}
            )

    def extract_chapter_urls(self, response):
        book_meta = response.meta["book_metadata"]
        for cell in response.xpath('//table[@class="css"]/tr/td'):
            chapter_text = cell.xpath('a/text()').get()
            chapter_href = cell.xpath('a/@href').get()
            if chapter_href:
                chapter_item = FictionChapterLinkItem()
                chapter_item.update(book_meta)
                chapter_item["chapter_title"] = chapter_text
                chapter_item["chapter_url"] = urljoin(response.url, chapter_href)
                yield chapter_item
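
Before launching a full crawl, it is worth verifying the XPath selectors interactively with scrapy shell against the same genre page used in start_requests (assuming the site is reachable from your machine):

scrapy shell "https://www.23us.so/fenlei/8_1.html"
>>> response.xpath('//div[@id="pagelink"]/a[last()-1]/text()').get()
>>> response.xpath('//table[@class="grid"]/tr[2]/td[1]/a/@href').get()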

Data Models (items.py)

Separate models keep metadata and chapter links organized; since FictionChapterLinkItem subclasses FictionMetadataItem, the two are merged per chapter before pipeline processing.

import scrapy

class FictionMetadataItem(scrapy.Item):
    title = scrapy.Field()
    author = scrapy.Field()
    site_url = scrapy.Field()
    status = scrapy.Field()
    word_count = scrapy.Field()
    genre = scrapy.Field()
    site_id = scrapy.Field()
    total_collects = scrapy.Field()
    total_clicks = scrapy.Field()
    summary = scrapy.Field()

class FictionChapterLinkItem(FictionMetadataItem):
    chapter_title = scrapy.Field()
    chapter_url = scrapy.Field()
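
Because every metadata field is inherited, the merge in extract_chapter_urls is a plain dict-style update. A minimal sketch of that behavior (the values here are made-up placeholders):

from novel_scraper.items import FictionMetadataItem, FictionChapterLinkItem

book = FictionMetadataItem(title="Example Book", author="Example Author")
chapter = FictionChapterLinkItem()
chapter.update(book)  # copies the inherited metadata fields onto the chapter item
chapter["chapter_title"] = "Chapter 1"
chapter["chapter_url"] = "https://www.23us.so/example/1.html"
print(dict(chapter))  # title, author, chapter_title, chapter_url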

Scrapy Configuration (settings.py)

Adjust concurrency, disable robots.txt compliance for testing, enable HTTP caching, and register the custom pipeline. NOVEL_STORAGE is a project-specific setting naming the local storage directory.

BOT_NAME = 'novel_scraper'
SPIDER_MODULES = ['novel_scraper.spiders']
NEWSPIDER_MODULE = 'novel_scraper.spiders'
NOVEL_STORAGE = "local_fiction_library"
ROBOTSTXT_OBEY = False
CONCURRENT_REQUESTS = 8
DOWNLOAD_DELAY = 0.5
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
ITEM_PIPELINES = {
    'novel_scraper.pipelines.LocalFileStoragePipeline': 200,
}
HTTPCACHE_ENABLED = True
HTTPCACHE_DIR = 'http_cache'
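
The pipeline below reads NOVEL_STORAGE by importing the settings module directly, which works but ignores per-run overrides. A minimal alternative sketch, using Scrapy's standard from_crawler hook, would go inside LocalFileStoragePipeline and pick the value up from the merged settings instead:

    @classmethod
    def from_crawler(cls, crawler):
        # Honors command-line overrides such as -s NOVEL_STORAGE=test_library
        pipeline = cls()
        pipeline.storage_dir = crawler.settings.get("NOVEL_STORAGE", "local_fiction_library")
        return pipeline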

Pipeline for File Saving (pipelines.py)

This pipeline creates nested directories, writes a metadata summary per book, fetches chapter content synchronously via requests, and saves each chapter as a UTF-8 text file. It skips files that already exist to avoid redundant downloads.

import os
import requests
from scrapy.exceptions import DropItem
from novel_scraper import settings
from novel_scraper.items import FictionChapterLinkItem
from bs4 import BeautifulSoup

class LocalFileStoragePipeline:
    def process_item(self, item, spider):
        if not isinstance(item, FictionChapterLinkItem):
            raise DropItem("Only processing chapter-linked items")
        
        # Create base storage path
        base_dir = os.path.join(settings.NOVEL_STORAGE, spider.name)
        os.makedirs(base_dir, exist_ok=True)
        
        # Create book-specific directory
        safe_title = item["title"].replace("/", "_").replace("\\", "_")
        book_dir = os.path.join(base_dir, safe_title)
        os.makedirs(book_dir, exist_ok=True)
        
        # Write metadata summary if missing
        summary_path = os.path.join(book_dir, f"{safe_title}_metadata.txt")
        if not os.path.exists(summary_path):
            with open(summary_path, 'w', encoding='utf-8') as f:
                f.write(f"Title: {item['title']}\n")
                f.write(f"Author: {item['author']}\n")
                f.write(f"URL: {item['site_url']}\n")
                f.write(f"Status: {item['status']}\n")
                f.write(f"Word Count: {item['word_count']}\n")
                f.write(f"Genre: {item['genre']}\n")
                f.write(f"Site ID: {item['site_id']}\n")
                f.write(f"Total Collects: {item['total_collects']}\n")
                f.write(f"Total Clicks: {item['total_clicks']}\n")
                f.write("\nSummary:\n")
                # The summary XPath can miss, leaving None; fall back to an empty string
                summary_soup = BeautifulSoup(item['summary'] or "", 'html.parser')
                f.write(summary_soup.get_text(separator="\n"))
        
        # Write chapter content if missing
        safe_chapter = item["chapter_title"].replace("/", "_").replace("\\", "_").replace(":", "_")
        chapter_path = os.path.join(book_dir, f"{safe_chapter}.txt")
        if not os.path.exists(chapter_path):
            try:
                chapter_res = requests.get(item["chapter_url"], headers={"User-Agent": settings.USER_AGENT}, timeout=10)
                chapter_res.raise_for_status()
                # Older Chinese sites often omit the charset header; fall back to detection
                chapter_res.encoding = chapter_res.apparent_encoding
                chapter_soup = BeautifulSoup(chapter_res.text, 'lxml')
                content_div = chapter_soup.find("dd", id="contents")
                if content_div:
                    with open(chapter_path, 'w', encoding='utf-8') as f:
                        f.write(f"{item['chapter_title']}\n\n")
                        f.write(content_div.get_text(separator="\n"))
            except Exception as e:
                spider.logger.error(f"Failed to fetch {item['chapter_url']}: {str(e)}")
        
        return item
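
One caveat about this design: requests.get is a blocking call running inside Scrapy's event loop, so every chapter download stalls the crawler for its duration. That is acceptable for a small validation run; for a full crawl, a cleaner approach is to yield the chapter URLs as ordinary scrapy.Request objects from the spider and let Scrapy's downloader fetch them concurrently.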

Execute the crawler with scrapy crawl fiction_crawler. For testing, keep the limited pagination and genre range; remove these restrictions once validation passes.
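
As an extra safety net during testing, Scrapy's built-in closespider extension (enabled by default) can cap a run from the command line with no code changes:

scrapy crawl fiction_crawler -s CLOSESPIDER_ITEMCOUNT=50 -s CLOSESPIDER_TIMEOUT=120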

Tags: Python
