Extracting Text-Based Jokes from Qiushibaike with Python
This guide details the process of building a Python scraper to extract text-based jokes from the Qiushibaike website. The focus is on retrieving structured data such as author names, joke content, upvote counts, and comment counts, while implementing pagination and user interaction.
Target Analysis and Strategy
The primary target is the text-only section of Qiushibaike, accessible at http://www.qiushibaike.com/text/. This page contains jokes without embedded images, simplifying the data extraction process. The goal is to systematically collect:
- Author of the joke.
- The joke content itself.
- Number of upvotes ("likes").
- Number of comments.
The scraper should handle multiple pages by following the site's pagination pattern, where subsequent pages follow the URL structure http://www.qiushibaike.com/text/page/2, http://www.qiushibaike.com/text/page/3, and so on.
Fetching the Initial Page
A direct request using urllib2 may be blocked. It's necessary to mimic a browser by adding a User-Agent header to the HTTP request.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib2
def fetch_page_content():
    """Fetch the HTML of the Qiushibaike text-only front page.

    Sends a browser-like User-Agent header (the site rejects the default
    urllib2 agent) and returns the page as a unicode string, or None on
    connection failure.
    """
    response = None
    try:
        target_url = "http://www.qiushibaike.com/text/"
        # Simulate a browser request
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        request_headers = {'User-Agent': user_agent}
        http_request = urllib2.Request(target_url, headers=request_headers)
        response = urllib2.urlopen(http_request)
        return response.read().decode('utf-8')
    except urllib2.URLError as e:
        if hasattr(e, "reason"):
            print("Failed to connect to Qiushibaike. Reason: %s" % e.reason)
        return None
    finally:
        # Always release the socket -- the original leaked the connection.
        if response is not None:
            response.close()
# Demo call: fetch the front page once at module load time.
html_content = fetch_page_content()
Parsing Page Content with Regular Expressions
With the HTML content retrieved, regular expressions can extract the required fields from the page structure.
import re
def extract_jokes_from_html(html):
    """Parse Qiushibaike HTML and return a list of jokes.

    Each joke is a 5-element list of strings:
    [page-local index, author, content, upvote count, comment count].
    Returns None when no author data is found (likely a site-layout change).
    Also prints each parsed joke to the console for verification.
    """
    joke_list = []
    # Compile regex patterns once; re.S lets '.' span newlines in the markup.
    # Backslashes are escaped so the classes survive in non-raw literals.
    author_pattern = re.compile(u'<h2>(.*?)</h2>', re.S)
    content_pattern = re.compile(u'<span>(.*?)</span>', re.S)
    upvote_pattern = re.compile(u'<i class="number">(\\d*)</i>\\s*好笑', re.S)
    comment_pattern = re.compile(u'<i class="number">(\\d*)</i>\\s*评论', re.S)
    # Hoisted out of the loop: the <br/> pattern is loop-invariant
    # (previously recompiled on every iteration).
    br_tag = re.compile(u'<br/>')

    authors = author_pattern.findall(html)
    contents = content_pattern.findall(html)
    upvotes = upvote_pattern.findall(html)
    comments = comment_pattern.findall(html)
    if not authors:
        print("No author data found.")
        return None
    for idx in range(len(authors)):
        # Guard against a shorter content list (the unguarded contents[idx]
        # previously raised IndexError); replace <br/> with real newlines.
        joke_text = u""
        if idx < len(contents):
            joke_text = br_tag.sub(u"\n", contents[idx])
        # Safely get upvote and comment counts, defaulting to "0"
        upvote_count = upvotes[idx].strip() if idx < len(upvotes) else "0"
        comment_count = comments[idx].strip() if idx < len(comments) else "0"
        # Append structured data for one joke
        joke_list.append([
            str(idx + 1),          # Joke number on page
            authors[idx].strip(),  # Author
            joke_text,             # Content
            upvote_count,          # Upvotes
            comment_count          # Comments
        ])
        # Print to console for verification; single-argument print form
        # produces identical space-separated output on Python 2 and 3.
        print(u" ".join(joke_list[idx]))
    return joke_list
# Usage
# Fetch the first page; if retrieval succeeded, parse it (the parser
# also prints each joke to the console for verification).
page_html = fetch_page_content()
if page_html:
    jokes = extract_jokes_from_html(page_html)
Implementing User Interaction and Pagination
To improve usability, the program can display one joke at a time, waiting for user input (Enter) to show the next, and quit on command ('Q'). The core logic is wrapped into a class for better state management.
import sys
import datetime
class QiushiScraper:
    """Interactive reader for Qiushibaike text jokes.

    Fetches pages one at a time, shows one joke per Enter press,
    quits on 'Q', and logs every displayed joke to a local file.
    """

    def __init__(self):
        self.current_page = 1
        self.user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        self.headers = {'User-Agent': self.user_agent}
        self.joke_cache = []  # Stores jokes from fetched pages
        self.is_running = False
        self.output_file = 'qiushi_jokes.txt'
        self.file_handle = open(self.output_file, 'wb')

    def get_page_html(self, page_num):
        """Fetch one pagination page; return unicode HTML or None on failure."""
        try:
            url = "http://www.qiushibaike.com/text/page/" + str(page_num)
            req = urllib2.Request(url, headers=self.headers)
            response = urllib2.urlopen(req)
            try:
                return response.read().decode('utf-8')
            finally:
                # Release the socket -- the original leaked the connection.
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, "reason"):
                print("Connection failed: %s" % e.reason)
            return None

    def parse_page(self, html):
        """Extract jokes from one page of HTML.

        BUG FIX: the previous stub body was `pass`, which returns None,
        so the cache never filled and no joke was ever displayed.
        Delegate to the module-level extractor instead.
        """
        return extract_jokes_from_html(html)

    def load_page_into_cache(self, html):
        """Parse html and cache the result, buffering at most two pages."""
        if self.is_running and len(self.joke_cache) < 2:
            page_jokes = self.parse_page(html)
            if page_jokes:
                self.joke_cache.append(page_jokes)

    def display_joke_interactively(self, page_jokes, page_num):
        """Show jokes one by one; Enter advances, 'Q' stops the reader."""
        for joke in page_jokes:
            timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S ')
            output_str = "Page %d, Joke %s\nAuthor: %s\nTime: %s\n%s\nUpvotes: %s Comments: %s\n" % \
                (page_num, joke[0], joke[1], timestamp, joke[2], joke[3], joke[4])
            print(output_str)
            # NOTE: writing text to a 'wb' handle relies on the
            # sys.setdefaultencoding('utf-8') hack applied at startup.
            self.file_handle.write(output_str + '\n')
            user_input = raw_input()
            if user_input.upper() == 'Q':
                self.is_running = False
                return

    def start(self):
        """Main loop: prompt for a starting page, then read interactively."""
        print("Reading Qiushibaike. Press Enter for next joke, 'Q' to quit.")
        self.is_running = True
        start_page = 1
        user_start = raw_input('Enter starting page (default 1): ')
        try:
            start_page = int(user_start)
        except ValueError:
            # Non-numeric input: 'Q' aborts, anything else keeps page 1.
            if user_start.upper() == 'Q':
                self.is_running = False
        try:
            while self.is_running:
                page_html = self.get_page_html(start_page)
                if not page_html:
                    print("Page load failed.")
                    break
                self.load_page_into_cache(page_html)
                if len(self.joke_cache) > 0:
                    jokes_to_show = self.joke_cache[0]
                    del self.joke_cache[0]
                    self.display_joke_interactively(jokes_to_show, start_page)
                start_page += 1
        finally:
            # Close the log file on every exit path -- the original leaked
            # the handle when a page failed to load (the `break` path).
            self.file_handle.close()
# Set default encoding and run
# Python 2 hack: reload(sys) restores the setdefaultencoding attribute
# (removed by site.py at startup) so implicit str/unicode conversions
# when writing jokes to the binary log file use UTF-8 instead of ASCII.
# The reload MUST precede the setdefaultencoding call.
reload(sys)
sys.setdefaultencoding("utf-8")
scraper = QiushiScraper()
scraper.start()
The final class-based implementation handles fetching, parsing, pagination, interactive display, and local file logging. It provides a robust foundation for scraping text content from Qiushibaike.