Building a Web Crawler for Baidu Baike Using Python
This tutorial demonstrates how to build a web crawler to extract encyclopedia entries from Baidu Baike. The project follows a modular architecture with separate components for URL management, page downloading, content parsing, and data output.
Project Structure
baike_spider/
├── url_manager.py
├── page_downloader.py
├── content_parser.py
├── data_outputer.py
├── spider_core.py
└── __init__.py
Module Overview
url_manager.py - URL Management
The URL manager maintains two collections: pending URLs to crawl and URLs already processed. Using Python sets ensures automatic deduplication.
#!/usr/bin/python
# -*- coding: utf-8 -*-
class UrlManager:
def __init__(self):
self.pending_urls = set()
self.processed_urls = set()
def add_url(self, url):
if url is None:
return
if url not in self.pending_urls and url not in self.processed_urls:
self.pending_urls.add(url)
def add_urls(self, urls):
if urls is None or len(urls) == 0:
return
for url in urls:
self.add_url(url)
def has_pending(self):
return len(self.pending_urls) != 0
def get_url(self):
url = self.pending_urls.pop()
self.processed_urls.add(url)
return url
page_downloader.py - Web Page Downloader
This module handles HTTP requests to fetch web pages. It uses Python's urllib library for network operations.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib
class PageDownloader:
def download(self, url):
if url is None:
return None
try:
response = urllib.urlopen(url)
if response.getcode() != 200:
return None
return response.read()
except:
return None
content_parser.py - HTML Parsing
This module extracts article titles, summaries, and new article links from the downloaded HTML pages using BeautifulSoup.
#!/usr/bin/python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import re
import urlparse
class ContentParser:
def extract_links(self, page_url, soup):
links = set()
# Match article links: /view/12345.htm or /item/...
pattern = soup.find_all('a', href=re.compile(r"/view/\d+\.htm|/item/"))
for link in pattern:
href = link.get('href')
full_url = urlparse.urljoin(page_url, href)
links.add(full_url)
return links
def extract_data(self, page_url, soup):
data = {}
data['url'] = page_url
title_elem = soup.find('dd', class_="lemmaWgt-lemmaTitle-title")
if title_elem:
h1 = title_elem.find('h1')
if h1:
data['title'] = h1.get_text()
summary_elem = soup.find('div', class_="lemma-summary")
if summary_elem:
data['summary'] = summary_elem.get_text()
return data
def parse(self, page_url, html_content):
if page_url is None or html_content is None:
return None, None
soup = BeautifulSoup(html_content, 'html.parser', from_encoding='utf-8')
new_links = self.extract_links(page_url, soup)
article_data = self.extract_data(page_url, soup)
return new_links, article_data
data_outputer.py - Data Output
Collects scraped data and outputs it to an HTML file for viewing.
#!/usr/bin/python
# -*- coding: utf-8 -*-
class DataOutputer:
def __init__(self):
self.articles = []
def store(self, data):
if data is None:
return
print(data['url'])
print(data.get('title', '').encode('utf-8'))
self.articles.append(data)
def write_html(self):
with open('output.html', 'w') as f:
f.write("<html><body><table>")
for item in self.articles:
f.write("<tr>")
f.write("<td>%s</td>" % item['url'])
f.write("<td>%s</td>" % item.get('title', '').encode('utf-8'))
f.write("<td>%s</td>" % item.get('summary', '').encode('utf-8'))
f.write("</tr>")
f.write("</table></body></html>")
spider_core.py - Main Crawler
Coordinates all modules and implements the crawling loop.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import url_manager
import page_downloader
import content_parser
import data_outputer
class SpiderCore:
def __init__(self):
self.url_mgr = url_manager.UrlManager()
self.downloader = page_downloader.PageDownloader()
self.parser = content_parser.ContentParser()
self.outputer = data_outputer.DataOutputer()
def crawl(self, root_url):
count = 1
self.url_mgr.add_url(root_url)
while self.url_mgr.has_pending():
try:
current_url = self.url_mgr.get_url()
print('Crawling [%d]: %s' % (count, current_url))
html_content = self.downloader.download(current_url)
new_urls, article_data = self.parser.parse(current_url, html_content)
self.url_mgr.add_urls(new_urls)
self.outputer.store(article_data)
if count >= 1000:
break
count += 1
except Exception as e:
print('Error occurred: %s' % str(e))
self.outputer.write_html()
if __name__ == "__main__":
spider = SpiderCore()
spider.crawl("http://baike.baidu.com/view/21087.htm")
Execution
Run the crawler on Linux:
python spider_core.py
Or make it executable and run directly:
chmod +x spider_core.py
./spider_core.py
Important Notes
The original Baidu Baike URL format used /view/ID.htm but has since changed to /item/name. To adapt to this change, update the regex pattern in the parser:
# Old pattern
links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
# New pattern
links = soup.find_all('a', href=re.compile(r"/item/"))
This crawler requires Python 2.7 and the BeautifulSoup 4 library. On Ubuntu, dependencies are typically pre-installed. On Windows, install them via pip:
pip install beautifulsoup4
The output HTML file uses UTF-8 encoding. When opening in a browser, ensure the encoding is set to UTF-8 for proper Chinese character display.