Five Python Approaches to Complete the First Heibanke Crawler Practice Level
The target practice site is http://www.heibanke.com/lesson/crawler_ex00/, which requires navigating through a sequence of 5-digit numeric values appended to the base URL path until reaching the final challenge page. Below are five Python-based automation methods to complete this level.
Method 1: Using urllib and Regular Expressions
# -*- coding: utf-8 -*-
# Method 1 (Python 3): follow the chain of 5-digit numbers using only the
# standard-library urllib package and regular expressions.
import datetime
import re
import urllib.request


def fetch_html(url):
    """Download *url* and return its body decoded as UTF-8 text."""
    # urlopen() returns bytes on Python 3; decode them so the str regexes
    # below can be applied. The timeout keeps a stalled server from hanging
    # the script forever.
    with urllib.request.urlopen(url, timeout=10) as response:
        return response.read().decode("utf-8", errors="replace")


start_time = datetime.datetime.now()
base_url = "http://www.heibanke.com/lesson/crawler_ex00/"
current_html = fetch_html(base_url)
# The first page is matched with a different phrase than the follow-up pages.
matched_nums = re.findall(r"Enter the number ([0-9]{5})", current_html)
while matched_nums:
    current_url = f"{base_url}{matched_nums[0]}/"
    print(current_url)
    current_html = fetch_html(current_url)
    matched_nums = re.findall(r"Number is ([0-9]{5})", current_html)

# current_html already holds the last page fetched (the one without a next
# number), so the pass link is read from it directly instead of downloading
# the same page a second time.
pass_links = re.findall(r'<a href="(.*?)" class', current_html)
if not pass_links:
    raise RuntimeError("No pass link found on the final page")
final_pass_url = "http://www.heibanke.com" + pass_links[0]
elapsed = datetime.datetime.now() - start_time
print(f"Final pass URL: {final_pass_url}, Elapsed time: {elapsed}")
Method 2: Using requests and Regular Expressions
# -*- coding: utf-8 -*-
# Method 2: follow the chain of numbers with requests and regular expressions.
import datetime
import re

import requests

start_time = datetime.datetime.now()
base_url = "http://www.heibanke.com/lesson/crawler_ex00/"
current_url = base_url
# Captures the run of digits inside the page's <h3> heading.
num_pattern = re.compile(r"<h3>[^\d<]*?(\d+)[^\d<]*?</h3")
# A single Session reuses the underlying connection across the many
# sequential requests this crawl makes.
with requests.Session() as session:
    while True:
        print(f"Fetching: {current_url}")
        # requests has no default timeout; without one a stalled server
        # would block the script indefinitely.
        response = session.get(current_url, timeout=10)
        # Fail loudly on an HTTP error instead of regex-searching an error page.
        response.raise_for_status()
        matched_nums = num_pattern.findall(response.text)
        if not matched_nums:
            # No further number in the heading: this is the final page, so
            # pull the link to the next level out of it.
            pass_links = re.findall(r'<a href="(.*?)" class', response.text)
            if not pass_links:
                raise RuntimeError("No pass link found on the final page")
            final_pass_url = "http://www.heibanke.com" + pass_links[0]
            break
        current_url = f"{base_url}{matched_nums[0]}"
elapsed = datetime.datetime.now() - start_time
print(f"Final pass URL: {final_pass_url}, Elapsed time: {elapsed}")
Alternative Regex Pattern
An alternative matching pattern can be used:
# Capture everything between the <h3> tags, then keep only its digit characters.
heading_matches = re.findall(r"<h3>(.*)</h3>", page_content)
try:
    # The lambda maps every non-digit character to "", so joining the results
    # leaves just the digits, which int() then converts.
    digit_chars = map(lambda char: char if char.isdigit() else "", heading_matches[0])
    extracted_num = int("".join(digit_chars))
except (IndexError, ValueError):
    # IndexError: no <h3> was matched; ValueError: the heading held no digits.
    # Either way extracted_num is simply not assigned.
    pass
str.join() Method
The str.join(iterable) method concatenates elements of an iterable into a single string using the calling string as a separator:
# Sample data: note the two empty strings in the middle of the list.
word_list = ["hello", "world", "", "", "j", "i", "m"]

# Example 1: an empty separator simply concatenates the elements.
joined_plain = "".join(word_list)
print(joined_plain)  # Output: helloworldjim

# Example 2: with a dot separator, the empty elements still get dots around them.
joined_dotted = ".".join(word_list)
print(joined_dotted)  # Output: hello.world...j.i.m
map() Function
The built-in map(func, iterable) function applies a given function to each item of an iterable and returns a new iterator of results:
def format_name(input_str):
    """Return *input_str* with its first character upper-cased and the rest lower-cased.

    An empty string is returned unchanged rather than raising IndexError.
    """
    # Slicing with [:1] instead of indexing [0] tolerates the empty string.
    return input_str[:1].upper() + input_str[1:].lower()


# map() applies format_name to every element lazily; list() collects the results.
print(list(map(format_name, ["adam", "LISA", "barT"])))
# Output: ['Adam', 'Lisa', 'Bart']
Lambda Expressions
Lambda expressions provide a concise way to define simple anonymous functions, equivalent to using def for short functions:
# A zero-argument lambda whose body is a conditional expression.
check_condition = lambda: True if 4 > 6 else False
condition_result = check_condition()
print(condition_result)  # Output: False

# A one-argument lambda performing simple arithmetic.
add_one = lambda num: num + 1
incremented = add_one(5)
print(incremented)  # Output: 6
Method 3: Using urllib2 and Regular Expressions
# -*- coding: utf-8 -*-
# Method 3 (Python 3): follow the chain of 5-digit numbers with regular
# expressions. Python 2's urllib2 module does not exist on Python 3; its
# urlopen() now lives in urllib.request, which is what this script uses.
import datetime
import re
import urllib.request


def fetch_html(url):
    """Download *url* and return its body decoded as UTF-8 text."""
    # urlopen() returns bytes on Python 3; decode them so the str regexes
    # below can be applied. The timeout keeps a stalled server from hanging
    # the script forever.
    with urllib.request.urlopen(url, timeout=10) as response:
        return response.read().decode("utf-8", errors="replace")


start_time = datetime.datetime.now()
base_url = "http://www.heibanke.com/lesson/crawler_ex00/"
current_html = fetch_html(base_url)
# The first page is matched with a different phrase than the follow-up pages.
matched_nums = re.findall(r"Enter the number ([0-9]{5})", current_html)
while matched_nums:
    current_url = f"{base_url}{matched_nums[0]}/"
    print(current_url)
    current_html = fetch_html(current_url)
    matched_nums = re.findall(r"Number is ([0-9]{5})", current_html)

# current_html already holds the last page fetched (the one without a next
# number), so the pass link is read from it directly instead of downloading
# the same page a second time.
pass_links = re.findall(r'<a href="(.*?)" class', current_html)
if not pass_links:
    raise RuntimeError("No pass link found on the final page")
final_pass_url = "http://www.heibanke.com" + pass_links[0]
elapsed = datetime.datetime.now() - start_time
print(f"Final pass URL: {final_pass_url}, Elapsed time: {elapsed}")
Method 4: Using urllib2, re and BeautifulSoup
# -*- coding: utf-8 -*-
# Method 4 (Python 3): parse each page with BeautifulSoup and pick the next
# number out of the <h3> heading with a regular expression. Python 2's urllib2
# does not exist on Python 3; its urlopen() now lives in urllib.request.
import datetime
import re
import urllib.request

from bs4 import BeautifulSoup

start_time = datetime.datetime.now()
base_url = "http://www.heibanke.com/lesson/crawler_ex00/"
current_url = base_url
while True:
    print(f"Scraping: {current_url}")
    # Read the body exactly once and keep it: the response is a stream, so a
    # second read() would return nothing.
    with urllib.request.urlopen(current_url, timeout=10) as response:
        page_bytes = response.read()
    soup = BeautifulSoup(page_bytes, "html.parser", from_encoding="utf8")
    heading = soup.find("h3")
    # A page without any <h3> is treated the same as one without a number.
    heading_text = heading.get_text() if heading is not None else ""
    matched_nums = re.findall(r"\d{5}", heading_text)
    if not matched_nums:
        # Final page: search the already-downloaded markup for the pass link.
        page_text = page_bytes.decode("utf-8", errors="replace")
        pass_links = re.findall(r'<a href="(.*?)" class', page_text)
        if not pass_links:
            raise RuntimeError("No pass link found on the final page")
        final_pass_url = "http://www.heibanke.com" + pass_links[0]
        break
    current_url = f"{base_url}{matched_nums[0]}"
elapsed = datetime.datetime.now() - start_time
print(f"Final pass URL: {final_pass_url}, Elapsed time: {elapsed}")
Method 5: Using Selenium WebDriver and Regular Expressions
# -*- coding: utf-8 -*-
# Method 5 (Python 3): drive a headless browser with Selenium WebDriver and
# read the next number from the rendered <h3> heading.
# The Python 2 "reload(sys); sys.setdefaultencoding(...)" hack is gone: those
# calls do not exist on Python 3, where str is already Unicode.
import datetime
import re

from selenium import webdriver
from selenium.webdriver.common.by import By

start_time = datetime.datetime.now()
base_url = "http://www.heibanke.com/lesson/crawler_ex00/"
# NOTE: webdriver.PhantomJS is only available in Selenium releases before 4.0;
# on newer releases swap in a headless Chrome or Firefox driver here.
driver = webdriver.PhantomJS()
try:
    driver.get(base_url)
    while True:
        # find_element(By..., ...) is the locator form that works across
        # Selenium versions; the find_element_by_* helpers were removed later.
        current_content = driver.find_element(By.TAG_NAME, "h3").text
        matched_nums = re.findall(r"([0-9]{5})", current_content)
        if not matched_nums:
            # No further number in the heading: grab the link to the next level.
            link_element = driver.find_element(By.XPATH, "/html/body/div/div/div[2]/a")
            final_pass_url = link_element.get_attribute("href")
            break
        current_url = f"{base_url}{matched_nums[0]}"
        driver.get(current_url)
finally:
    # Always shut the browser process down, even if a lookup above raised.
    driver.quit()
elapsed = datetime.datetime.now() - start_time
print(f"Final pass URL: {final_pass_url}, Elapsed time: {elapsed}")
Installation Tips
# Install BeautifulSoup (the PyPI distribution is named beautifulsoup4;
# it is still imported in code as "bs4")
pip install beautifulsoup4
# Install Selenium
# NOTE: the PhantomJS driver is only available in Selenium releases before 4.0
pip install selenium
Note: PhantomJS, used in Method 5, requires separate download and setup for your operating system.
Execution Time Measurement
To calculate total script execution time, capture timestamps with datetime.datetime.now() at the start and end of the script, then subtract the start timestamp from the end timestamp to get the elapsed duration.