Web Scraping a Chinese Fiction Site with Scrapy: Metadata and Chapter Capture
Initialize a Scrapy project with a descriptive name. Running scrapy startproject novel_scraper generates the base directory tree, which looks like this once the spider module is added:
├── novel_scraper
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── fiction_crawler.py
└── scrapy.cfg
Spider Implementation (fiction_crawler.py)
This class handles asynchronous request scheduling and content extraction across genre, list, detail, and chapter index pages. Only a single genre slice is enabled initially for testing.
import scrapy
from urllib.parse import urljoin
from novel_scraper.items import FictionMetadataItem, FictionChapterLinkItem
class GenreFictionSpider(scrapy.Spider):
name = "fiction_crawler"
allowed_domains = ["23us.so"]
genre_base = "https://www.23us.so/fenlei/"
genre_suffix = ".html"
def start_requests(self):
        # Test with genre ID 8 (wuxia) only; pagination is capped to 3 pages in parse_genre_pages
for genre_id in range(8, 9):
first_page = f"{self.genre_base}{genre_id}_1{self.genre_suffix}"
yield scrapy.Request(first_page, callback=self.parse_genre_pages)
def parse_genre_pages(self, response):
        # Strip the trailing "_1.html" (7 characters) to get the paginated URL stem
        base_paginated = response.url[:-7]
        max_page = response.xpath('//div[@id="pagelink"]/a[last()-1]/text()').get()
        last_page = int(max_page) if max_page and max_page.strip().isdigit() else 1
        # Use a limited test range first (at most 3 listing pages)
        for page_num in range(1, min(last_page, 3) + 1):
target_url = f"{base_paginated}_{page_num}{self.genre_suffix}"
yield scrapy.Request(target_url, callback=self.extract_fiction_links)
def extract_fiction_links(self, response):
for row in response.xpath('//table[@class="grid"]/tr')[1:]:
detail_url = row.xpath('td[1]/a/@href').get()
title = row.xpath('td[1]/a/text()').get()
if detail_url:
yield scrapy.Request(
detail_url,
callback=self.parse_fiction_detail,
meta={"temp_title": title}
)
def parse_fiction_detail(self, response):
temp_title = response.meta["temp_title"]
metadata = FictionMetadataItem()
metadata["title"] = temp_title
metadata["author"] = response.xpath('//table[@class="grid"]/tr[1]/td[2]/text()').get()
metadata["site_url"] = response.url
metadata["status"] = response.xpath('//table[@class="grid"]/tr[1]/td[3]/text()').get()
metadata["word_count"] = response.xpath('//table[@class="grid"]/tr[2]/td[2]/text()').get()
metadata["genre"] = response.xpath('//table[@class="grid"]/tr[1]/td[1]/a/text()').get()
metadata["site_id"] = response.url.split("/")[-1].split(".")[0]
metadata["total_collects"] = response.xpath('//table[@class="grid"]/tr[2]/td[1]/text()').get()
metadata["total_clicks"] = response.xpath('//table[@class="grid"]/tr[3]/td[1]/text()').get()
metadata["summary"] = response.xpath('//div[@id="intro"]/p[2]').get()
chapter_index_url = response.xpath('//p[@class="btnlinks"]/a[2]/@href').get()
yield scrapy.Request(
chapter_index_url,
callback=self.extract_chapter_urls,
meta={"book_metadata": metadata}
)
def extract_chapter_urls(self, response):
book_meta = response.meta["book_metadata"]
for cell in response.xpath('//table[@class="css"]/tr/td'):
chapter_text = cell.xpath('a/text()').get()
chapter_href = cell.xpath('a/@href').get()
if chapter_href:
chapter_item = FictionChapterLinkItem()
chapter_item.update(book_meta)
chapter_item["chapter_title"] = chapter_text
chapter_item["chapter_url"] = urljoin(response.url, chapter_href)
yield chapter_item
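Before the first crawl, it can help to sanity-check the XPath expressions interactively with Scrapy's shell against a single genre page (the URL below simply follows the pattern built in start_requests):
# In a terminal: scrapy shell "https://www.23us.so/fenlei/8_1.html"
# Inside the shell, response is already populated; try the selectors used above:
response.xpath('//div[@id="pagelink"]/a[last()-1]/text()').get()   # highest page number
response.xpath('//table[@class="grid"]/tr')[1:]                    # listing rows, header skipped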
Data Models (items.py)
Separate models keep book metadata and chapter links organized; since FictionChapterLinkItem inherits from FictionMetadataItem, the spider merges the metadata into each chapter item before it reaches the pipeline.
import scrapy
class FictionMetadataItem(scrapy.Item):
title = scrapy.Field()
author = scrapy.Field()
site_url = scrapy.Field()
status = scrapy.Field()
word_count = scrapy.Field()
genre = scrapy.Field()
site_id = scrapy.Field()
total_collects = scrapy.Field()
total_clicks = scrapy.Field()
summary = scrapy.Field()
class FictionChapterLinkItem(FictionMetadataItem):
chapter_title = scrapy.Field()
chapter_url = scrapy.Field()
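Because FictionChapterLinkItem subclasses FictionMetadataItem, a chapter item can hold every book-level field plus its own two; this is what lets extract_chapter_urls merge the metadata with a single update() call. A minimal illustration with made-up values:
book = FictionMetadataItem(title="示例书名", author="某作者")  # hypothetical values
chapter = FictionChapterLinkItem()
chapter.update(book)                           # copies the shared metadata fields
chapter["chapter_title"] = "第一章"             # hypothetical chapter title
chapter["chapter_url"] = "https://www.23us.so/example.html"  # hypothetical URL
print(dict(chapter))  # contains title, author, chapter_title and chapter_url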
Scrapy Configuration (settings.py)
Adjust concurrency and the download delay, disable robots.txt compliance for testing, enable HTTP caching, register the custom pipeline, and define a custom NOVEL_STORAGE directory setting.
BOT_NAME = 'novel_scraper'
SPIDER_MODULES = ['novel_scraper.spiders']
NEWSPIDER_MODULE = 'novel_scraper.spiders'
NOVEL_STORAGE = "local_fiction_library"
ROBOTSTXT_OBEY = False
CONCURRENT_REQUESTS = 8
DOWNLOAD_DELAY = 0.5
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
ITEM_PIPELINES = {
'novel_scraper.pipelines.LocalFileStoragePipeline': 200,
}
HTTPCACHE_ENABLED = True
HTTPCACHE_DIR = 'http_cache'
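The pipeline below reads NOVEL_STORAGE and USER_AGENT by importing the settings module directly, which works but ignores per-run overrides. A sketch of the alternative, pulling values from the crawler in from_crawler so options passed with -s are respected (the class name here is hypothetical):
class SettingsAwarePipeline:
    # Hypothetical variant of the storage pipeline; only the constructor differs.
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            storage_dir=crawler.settings.get("NOVEL_STORAGE"),
            user_agent=crawler.settings.get("USER_AGENT"),
        )

    def __init__(self, storage_dir, user_agent):
        self.storage_dir = storage_dir
        self.user_agent = user_agent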
Pipeline for File Saving (pipelines.py)
This pipeline creates nested directories, writes a metadata summary once per book, fetches chapter content with requests, and saves each chapter as a UTF-8 text file, skipping files that already exist to avoid redundant downloads. Note that calling requests inside the pipeline is synchronous and briefly blocks Scrapy's event loop, which is acceptable for a small test run.
import os
import requests
from scrapy.exceptions import DropItem
from novel_scraper import settings
from novel_scraper.items import FictionChapterLinkItem
from bs4 import BeautifulSoup
class LocalFileStoragePipeline:
def process_item(self, item, spider):
if not isinstance(item, FictionChapterLinkItem):
raise DropItem("Only processing chapter-linked items")
# Create base storage path
base_dir = os.path.join(settings.NOVEL_STORAGE, spider.name)
os.makedirs(base_dir, exist_ok=True)
# Create book-specific directory
safe_title = item["title"].replace("/", "_").replace("\\", "_")
book_dir = os.path.join(base_dir, safe_title)
os.makedirs(book_dir, exist_ok=True)
# Write metadata summary if missing
summary_path = os.path.join(book_dir, f"{safe_title}_metadata.txt")
if not os.path.exists(summary_path):
with open(summary_path, 'w', encoding='utf-8') as f:
f.write(f"Title: {item['title']}\n")
f.write(f"Author: {item['author']}\n")
f.write(f"URL: {item['site_url']}\n")
f.write(f"Status: {item['status']}\n")
f.write(f"Word Count: {item['word_count']}\n")
f.write(f"Genre: {item['genre']}\n")
f.write(f"Site ID: {item['site_id']}\n")
f.write(f"Total Collects: {item['total_collects']}\n")
f.write(f"Total Clicks: {item['total_clicks']}\n")
f.write("\nSummary:\n")
summary_soup = BeautifulSoup(item['summary'], 'html.parser')
f.write(summary_soup.get_text(separator="\n"))
# Write chapter content if missing
safe_chapter = item["chapter_title"].replace("/", "_").replace("\\", "_").replace(":", "_")
chapter_path = os.path.join(book_dir, f"{safe_chapter}.txt")
if not os.path.exists(chapter_path):
try:
chapter_res = requests.get(item["chapter_url"], headers={"User-Agent": settings.USER_AGENT}, timeout=10)
chapter_res.raise_for_status()
chapter_soup = BeautifulSoup(chapter_res.text, 'lxml')
content_div = chapter_soup.find("dd", id="contents")
if content_div:
with open(chapter_path, 'w', encoding='utf-8') as f:
f.write(f"{item['chapter_title']}\n\n")
f.write(content_div.get_text(separator="\n"))
except Exception as e:
spider.logger.error(f"Failed to fetch {item['chapter_url']}: {str(e)}")
return item
Execute the crawler with scrapy crawl fiction_crawler. For testing, keep the limited pagination and genre range; remove these restrictions once validation passes, as sketched below.
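A sketch of the widened start_requests and parse_genre_pages for the full crawl (the genre ID range 1-10 is an assumption; verify which category IDs the site actually exposes):
    def start_requests(self):
        # Assumed genre ID range; adjust after checking the site's category list.
        for genre_id in range(1, 11):
            first_page = f"{self.genre_base}{genre_id}_1{self.genre_suffix}"
            yield scrapy.Request(first_page, callback=self.parse_genre_pages)

    def parse_genre_pages(self, response):
        base_paginated = response.url[:-7]
        max_page = response.xpath('//div[@id="pagelink"]/a[last()-1]/text()').get()
        last_page = int(max_page) if max_page and max_page.strip().isdigit() else 1
        # No cap: walk every listing page for the genre
        for page_num in range(1, last_page + 1):
            target_url = f"{base_paginated}_{page_num}{self.genre_suffix}"
            yield scrapy.Request(target_url, callback=self.extract_fiction_links)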