Building a GUI-Based Web Scraper for Qiushibaike Jokes in Python
The following Python implementation demonstrates how to create a GUI application for scraping jokes from Qiushibaike using Tkinter for the interface.
#!/usr/bin/python
#coding:utf-8
import urllib2
import re
import sys
import datetime
from Tkinter import *
from HTMLParser import HTMLParser
class JokeScraper:
    """Scrapes joke posts from qiushibaike.com and serves them one at a time.

    Parsed jokes are queued in ``page_cache``; when the queue runs dry the
    next listing page is fetched automatically.  Every joke displayed is
    also appended to ``jokes.txt``.
    """

    def __init__(self):
        self.current_page = 1                         # next page number to fetch
        self.headers = {'User-Agent': 'Mozilla/5.0'}  # minimal UA so the site serves us
        self.joke_cache = []                          # kept for interface compatibility
        self.page_cache = []                          # FIFO queue of parsed joke records
        self.output_file = 'jokes.txt'
        # Binary mode: joke text is explicitly UTF-8 encoded before writing.
        self.file_handle = open(self.output_file, 'wb')

    def fetch_page(self, page_num):
        """Download one listing page and return it as unescaped unicode text.

        Returns None on any network/HTTP error instead of raising.
        """
        try:
            url = "http://www.qiushibaike.com/text/page/" + str(page_num)
            request = urllib2.Request(url, headers=self.headers)
            response = urllib2.urlopen(request)
            return HTMLParser().unescape(response.read().decode('utf-8'))
        except Exception as e:
            # Single-argument print() form behaves identically on Py2 and Py3.
            print("Connection error: " + str(e))
            return None

    def parse_jokes(self, html, page_num):
        """Extract [index, author, text, page] records from a page's HTML.

        Each record is appended to ``page_cache``; the list of new records
        is also returned.  A falsy *html* (failed fetch) yields [].
        """
        jokes = []
        if not html:
            # fetch_page returned None (network failure) -- nothing to parse.
            # The original crashed here passing None to re.findall.
            return jokes
        author_pattern = re.compile(u'<h2>(.*?)</h2>', re.S)
        content_pattern = re.compile(u'<span>(.*?)</span>', re.S)
        authors = re.findall(author_pattern, html)
        contents = re.findall(content_pattern, html)
        # min() guards against a markup change returning unequal list
        # lengths; the original indexed contents[i] blindly (IndexError).
        for i in range(min(len(authors), len(contents))):
            text = re.sub(re.compile("<br/>"), "\n", contents[i])
            record = [str(i + 1), authors[i].strip(), text, page_num]
            jokes.append(record)
            self.page_cache.append(record)
        return jokes

    def load_more_jokes(self, html, page_num):
        """Top up the cache from *html* unless it already holds 20+ jokes."""
        if len(self.page_cache) < 20:
            self.parse_jokes(html, page_num)

    def get_next_joke(self):
        """Pop and return the next cached joke, fetching a new page if needed.

        Returns None when no joke can be obtained (e.g. network is down).
        """
        # Bug fix: the original required len(page_cache) > 1, which stranded
        # the last cached joke and triggered a premature network fetch.
        if self.page_cache:
            return self.page_cache.pop(0)
        html = self.fetch_page(self.current_page)
        self.load_more_jokes(html, self.current_page)
        self.current_page += 1
        if self.page_cache:
            return self.page_cache.pop(0)
        return None

    def display_joke(self):
        """Format the next joke, log it to the output file, and return it.

        Returns a short notice string when no joke is available instead of
        crashing (the original raised TypeError indexing None).
        """
        joke = self.get_next_joke()
        if joke is None:
            return "No jokes available.\n"
        timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S ')
        output = "Page %s Joke %s\nAuthor: %s\n%s\n\nPosted: %s\n" % \
            (joke[3], joke[0], joke[1], joke[2], timestamp)
        # The file is open in binary mode; encode explicitly so non-ASCII
        # joke text does not raise UnicodeEncodeError.
        self.file_handle.write(output.encode('utf-8'))
        return output
def clear_display():
    """Wipe everything currently shown in the joke display box."""
    display_box.delete(1.0, END)
def show_joke():
    """Clear the display box and fill it with the next scraped joke."""
    display_box.delete(1.0, END)
    next_joke_text = scraper.display_joke()
    display_box.insert(1.0, next_joke_text)
if __name__ == '__main__':
    # Build the scraper first so the button callbacks can reach it.
    scraper = JokeScraper()

    root = Tk()
    root.title('Qiushibaike Joke Viewer')

    # Header row.
    heading = Label(root, text='Qiushibaike Jokes', justify=CENTER)
    heading.grid()

    # Main text area shared with the show_joke/clear_display callbacks,
    # so the name `display_box` must stay module-level.
    display_box = Text(root)
    display_box.grid(row=1, column=0, columnspan=2)

    # Action buttons side by side beneath the text area.
    Button(root, text='Next Joke', command=show_joke).grid(row=2, column=0)
    Button(root, text='Clear', command=clear_display).grid(row=2, column=1)

    root.mainloop()
This implementation includes:
- A dedicated class for scraping and managing jokes
- Proper HTML parsing and error handling
- Simple GUI with Next and Clear buttons
- Persistent storage of scraped jokes
- Automatic loading of additional pages when needed