Web Data Extraction Patterns: Seven Practical Python Scraping Implementations
Web scraping involves programmatically fetching and parsing web content to extract structured information. The following implementations demonstrate distinct approaches for various target architectures, ranging from static HTML parsing to dynamic JavaScript-rendered content retrieval.
1. Film Database Aggregation (Douban Top 250)
This implementation utilizes BeautifulSoup to extract cinematic metadata including titles, ratings, and review volumes from paginated listings, persisting results to CSV format.
import requests
from bs4 import BeautifulSoup
import csv
# Listing URL of the Douban Top 250 chart.
base_endpoint = "https://movie.douban.com/top250"

# Browser-like User-Agent header sent with every request.
request_headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
}
def extract_movies(page_content):
    """Yield one record per film found in a Douban listing page.

    Args:
        page_content: HTML text of a single listing page.

    Yields:
        dict with the keys ``title`` (text of ``span.title``), ``rating``
        (text of ``span.rating_num``) and ``reviews`` (text of the last
        ``span`` inside ``div.star``).

    List items that lack any of these elements are skipped instead of
    aborting the whole page with AttributeError/IndexError.
    """
    document = BeautifulSoup(page_content, "html.parser")
    for entry in document.select("ol.grid_view li"):
        title_node = entry.select_one("span.title")
        rating_node = entry.select_one("span.rating_num")
        star_spans = entry.select("div.star span")
        # select_one() gives None and select() gives [] when nothing matches.
        if title_node is None or rating_node is None or not star_spans:
            continue
        yield {
            "title": title_node.get_text(),
            "rating": rating_node.get_text(),
            "reviews": star_spans[-1].get_text(),
        }
def persist_douban_data(timeout=10):
    """Download every Douban Top 250 page and write the films to CSV.

    Requests the listing at ``start`` offsets 0, 25, ..., 225 and writes the
    records produced by ``extract_movies`` to ``douban_cinema_data.csv`` with
    the columns ``title``, ``rating`` and ``reviews``.

    Args:
        timeout: Seconds to wait for each HTTP request. Without a timeout
            ``requests`` can block indefinitely on an unresponsive server.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status (so an error page is never parsed as data).
    """
    fieldnames = ["title", "rating", "reviews"]
    with open("douban_cinema_data.csv", "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for page_offset in range(0, 250, 25):
            paginated_url = f"{base_endpoint}?start={page_offset}"
            response = requests.get(
                paginated_url, headers=request_headers, timeout=timeout
            )
            response.raise_for_status()
            writer.writerows(extract_movies(response.text))
# Run the Douban export only when the file is executed as a script.
if __name__ == "__main__":
    persist_douban_data()
2. Box Office Analytics (Maoyan Top 100)
Employing regular expressions for pattern matching against raw HTML, this solution captures theatrical release information and cast details from entertainment listings.
import requests
import re
# Board URL of the Maoyan Top 100 chart.
maoyan_base = "https://maoyan.com/board/4"

# Browser-like User-Agent header sent with every request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36"
    ),
}
def parse_cinema_listings(html_content):
    """Yield film records matched by regex in a Maoyan board page.

    Args:
        html_content: Raw HTML text of one board page.

    Yields:
        dict with the keys ``film_name`` (anchor text inside
        ``<p class="name">``), ``performers`` (content of
        ``<p class="star">``) and ``release_date`` (content of
        ``<p class="releasetime">``). Values are entity-decoded and stripped
        of surrounding whitespace.
    """
    # Local import keeps this block self-contained.
    from html import unescape

    extraction_pattern = re.compile(
        r'<p class="name">.*?<a.*?title="(.*?)".*?>(.*?)</a>.*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p>',
        re.DOTALL,
    )
    for match in extraction_pattern.finditer(html_content):
        # Group 1 (the anchor's title attribute) is captured but not exported.
        _title_attr, name, cast, released = match.groups()
        # The captures are raw markup, so entities such as "&amp;" must be
        # decoded to obtain the text a browser would display.
        yield {
            "film_name": unescape(name).strip(),
            "performers": unescape(cast).strip(),
            "release_date": unescape(released).strip(),
        }
def store_maoyan_results(timeout=10):
    """Download the Maoyan board pages and dump every record to a text file.

    Requests the board at ``offset`` values 0, 10, ..., 90 and writes the
    string form of each dict produced by ``parse_cinema_listings`` as one
    line of ``maoyan_cinema_records.txt``.

    Args:
        timeout: Seconds to wait for each HTTP request. Without a timeout
            ``requests`` can block indefinitely on an unresponsive server.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    with open("maoyan_cinema_records.txt", "w", encoding="utf-8") as output:
        for offset in range(0, 100, 10):
            target_url = f"{maoyan_base}?offset={offset}"
            resp = requests.get(target_url, headers=headers, timeout=timeout)
            resp.raise_for_status()
            for entry in parse_cinema_listings(resp.text):
                output.write(f"{entry}\n")
# Run the Maoyan export only when the file is executed as a script.
if __name__ == "__main__":
    store_maoyan_results()
3. Academic Institution Indexing (University Rankings)
This pattern demonstrates tabular data extraction from educational ranking platforms using regex-based HTML parsing.
import requests
import re
# Page holding the 2019 university ranking table.
ranking_url = "http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html"

# Browser-like User-Agent header sent with the request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36"
    ),
}
def scrape_academic_data(html):
    """Yield ranking rows matched by regex in the university ranking page.

    Args:
        html: Raw HTML text of the ranking page.

    Yields:
        dict with the keys ``rank``, ``institution``, ``region`` and
        ``total_score`` (all strings) for every ``<tr class="alt">`` row the
        pattern matches. Values are entity-decoded and stripped of
        surrounding whitespace.
    """
    # ``from ... import`` avoids rebinding the ``html`` parameter.
    from html import unescape

    row_pattern = re.compile(
        r'<tr class="alt">.*?<td>(\d+)</td>.*?<a href=".*?" target="_blank">(.*?)</a>.*?<td>(.*?)</td>.*?<td>([\d.]+)</td>.*?</tr>',
        re.S,
    )
    for row in row_pattern.finditer(html):
        # The captures are raw markup; decode entities and trim padding so
        # the output holds display text, as the sibling regex parser does.
        rank, name, province, score = (
            unescape(cell).strip() for cell in row.groups()
        )
        yield {
            "rank": rank,
            "institution": name,
            "region": province,
            "total_score": score,
        }
def export_university_data(timeout=10):
    """Download the ranking page and write one formatted line per university.

    Output goes to ``academic_rankings.txt`` in the form
    ``<rank>: <institution> (<region>) - Score: <total_score>``.

    Args:
        timeout: Seconds to wait for the HTTP request. Without a timeout
            ``requests`` can block indefinitely on an unresponsive server.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    # Fetch first so a failed download does not truncate an existing file.
    response = requests.get(ranking_url, headers=headers, timeout=timeout)
    response.raise_for_status()
    # When the server sends no charset, requests falls back to ISO-8859-1 for
    # text content, which garbles non-Latin text; use the detected encoding.
    if response.encoding is None or response.encoding.lower() == "iso-8859-1":
        response.encoding = response.apparent_encoding

    with open("academic_rankings.txt", "w", encoding="utf-8") as f:
        for data in scrape_academic_data(response.text):
            f.write(f"{data['rank']}: {data['institution']} ({data['region']}) - Score: {data['total_score']}\n")
# Run the ranking export only when the file is executed as a script.
if __name__ == "__main__":
    export_university_data()
4. Meteorological Data Collection (Weather Network)
Using XPath selectors to navigate the hierarchical HTML document tree, this implementation gathers atmospheric conditions and temperature ranges from meteorological services.
import requests
from lxml import etree
import csv
# Forecast page that the weather scraper downloads.
weather_endpoint = "http://www.weather.com.cn/textFC/hb.shtml"

# Browser-like User-Agent header sent with the request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36"
    ),
}
def extract_climate_data(html):
    """Yield per-city weather records from a forecast page.

    Only the first ``div.conMidtab`` container is read. Inside it, every
    ``div.conMidtab3`` block is scanned row by row, skipping the first
    ``<tr>`` of each block (presumably a header row; confirm against the
    live page).

    Args:
        html: HTML text of the forecast page.

    Yields:
        dict with the keys ``municipality`` (link text in the 2nd cell),
        ``condition`` (3rd cell), ``max_temp`` (4th cell) and ``min_temp``
        (5th cell). Rows missing any of these cells are skipped.

    A page without the ``conMidtab`` container yields nothing instead of
    raising IndexError.
    """
    tree = etree.HTML(html)
    containers = tree.xpath('//div[@class="conMidtab"]') if tree is not None else []
    if not containers:
        return

    for province in containers[0].xpath('.//div[@class="conMidtab3"]'):
        for city in province.xpath('.//tr')[1:]:
            try:
                city_name = city.xpath('./td[2]/a/text()')[0]
                weather_condition = city.xpath('./td[3]/text()')[0]
                temp_high = city.xpath('./td[4]/text()')[0]
                temp_low = city.xpath('./td[5]/text()')[0]
            except IndexError:
                # An expected cell is absent or has no text; ignore the row.
                continue
            yield {
                "municipality": city_name,
                "condition": weather_condition,
                "max_temp": temp_high,
                "min_temp": temp_low,
            }
def archive_weather_metrics(timeout=10):
    """Download the forecast page and write the city records to CSV.

    Output goes to ``climate_data.csv`` with the columns ``municipality``,
    ``condition``, ``max_temp`` and ``min_temp``.

    Args:
        timeout: Seconds to wait for the HTTP request. Without a timeout
            ``requests`` can block indefinitely on an unresponsive server.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    # Fetch first so a failed download does not truncate an existing file.
    response = requests.get(weather_endpoint, headers=headers, timeout=timeout)
    response.raise_for_status()
    # Decode the body as UTF-8 regardless of what the response headers say.
    response.encoding = "utf-8"

    with open("climate_data.csv", "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(
            csvfile,
            fieldnames=["municipality", "condition", "max_temp", "min_temp"],
        )
        writer.writeheader()
        writer.writerows(extract_climate_data(response.text))
# Run the weather export only when the file is executed as a script.
if __name__ == "__main__":
    archive_weather_metrics()
5. E-commerce Product Discovery (Dangdang Books)
This example illustrates structured data extraction from retail platforms using precise XPath expressions to locate product metadata including pricing and bibliographic details.
import requests
from lxml import etree
import csv
# Dangdang search URL with the query keyword "python".
search_url = "http://search.dangdang.com/?key=python&act=input"

# Browser-like User-Agent header sent with the request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36"
    ),
}
def harvest_publication_data(html):
    """Yield one dict per book found in a Dangdang search-result page.

    Args:
        html: HTML text of the search-result page.

    Yields:
        dict with the keys ``title``, ``url``, ``price`` (with the "¥" sign
        removed), ``author`` and ``publisher``. A list item for which any
        lookup finds nothing is skipped.
    """
    for node in etree.HTML(html).xpath('//ul[@class="bigimg"]/li'):
        try:
            title = node.xpath('./a/@title')[0]
            link = node.xpath('./a/@href')[0]
            raw_price = node.xpath('.//span[@class="search_now_price"]/text()')[0]
            writer_name = node.xpath('.//p[@class="search_book_author"]/span[1]/a/@title')[0]
            press_name = node.xpath('.//p[@class="search_book_author"]/span[3]/a/@title')[0]
            yield {
                "title": title,
                "url": link,
                "price": raw_price.replace("¥", ""),
                "author": writer_name,
                "publisher": press_name,
            }
        except (IndexError, AttributeError):
            # Incomplete listing: move on to the next item.
            continue
def catalog_books(timeout=10):
    """Download the search-result page and write the books to CSV.

    Output goes to ``publication_catalog.csv`` with the columns ``title``,
    ``price``, ``author``, ``publisher`` and ``url``.

    Args:
        timeout: Seconds to wait for the HTTP request. Without a timeout
            ``requests`` can block indefinitely on an unresponsive server.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    # Fetch first so a failed download does not truncate an existing file.
    resp = requests.get(search_url, headers=headers, timeout=timeout)
    resp.raise_for_status()

    with open("publication_catalog.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(
            f, fieldnames=["title", "price", "author", "publisher", "url"]
        )
        writer.writeheader()
        writer.writerows(harvest_publication_data(resp.text))
# Run the book catalogue export only when the file is executed as a script.
if __name__ == "__main__":
    catalog_books()
6. Content Aggregation (Humor Database)
This example demonstrates text extraction from content platforms, using XPath to isolate user-generated content blocks.
import requests
from lxml import etree
# URL template; "{}" is filled with the page number.
base_humor_url = "https://www.qiushibaike.com/text/page/{}/"

# Browser-like User-Agent header sent with every request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36"
    ),
}
def retrieve_anecdotes(page_html):
    """Yield the text of each story found in a listing page.

    Args:
        page_html: HTML text of one listing page.

    Yields:
        str: The text nodes under ``div.content/span`` of a story block,
        concatenated and stripped. Blocks without such text are skipped.
    """
    root = etree.HTML(page_html)
    for block in root.xpath('//div[@class="col1 old-style-col1"]/div'):
        try:
            fragments = block.xpath('.//div[@class="content"]/span/text()')
            if not fragments:
                continue
            yield "".join(fragments).strip()
        except Exception:
            # Best-effort: any failure on one block moves on to the next.
            continue
def compile_humor_archive(timeout=10):
    """Download listing pages 1-3 and append every story to a text file.

    Each story is written to ``anecdote_collection.txt`` followed by a line
    of 50 dashes.

    Args:
        timeout: Seconds to wait for each HTTP request. Without a timeout
            ``requests`` can block indefinitely on an unresponsive server.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    separator = "-" * 50
    with open("anecdote_collection.txt", "w", encoding="utf-8") as archive:
        for page_num in range(1, 4):
            url = base_humor_url.format(page_num)
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            for anecdote in retrieve_anecdotes(response.text):
                archive.write(anecdote + "\n" + separator + "\n")
# Run the anecdote export only when the file is executed as a script.
if __name__ == "__main__":
    compile_humor_archive()
7. Dynamic Content Retrieval (Social Media Platform)
For JavaScript-rendered content requiring browser automation, this implementation leverages Selenium WebDriver with explicit waits to handle authentication and dynamic DOM manipulation.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import requests
# Landing page opened in the browser and re-fetched after login.
social_platform = "https://weibo.com/"

# Browser-like User-Agent header sent with the follow-up HTTP request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36"
    ),
}
def capture_dynamic_feed(username="your_username", password="your_password", timeout=15):
    """Log in through headless Chrome, then fetch the page with the session cookies.

    The browser is used only for the login step. Once the feed element is
    present, the browser's cookies are copied into a ``requests`` session and
    the landing page is downloaded with a plain HTTP GET, so the returned
    markup is the server response rather than the browser-rendered DOM.

    Args:
        username: Account name typed into the login form. The default is a
            placeholder and must be replaced with real credentials.
        password: Password typed into the login form (placeholder default).
        timeout: Seconds to wait for the follow-up HTTP request.

    Returns:
        str: Body text of the authenticated GET response.

    Raises:
        selenium.common.exceptions.TimeoutException: If the login form or the
            feed element does not appear within 15 seconds.
        requests.RequestException: If the follow-up HTTP request fails or
            times out.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(social_platform)
        wait = WebDriverWait(driver, 15)

        # Wait for the form to exist before looking up its other controls.
        username_field = wait.until(
            EC.presence_of_element_located((By.NAME, "username"))
        )
        password_field = driver.find_element(By.NAME, "password")
        submit_btn = driver.find_element(By.CLASS_NAME, "W_btn_a")

        username_field.send_keys(username)
        password_field.send_keys(password)
        submit_btn.click()

        # The feed container appearing is treated as the login-success signal.
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "WB_feed")))

        # The context manager closes the session's pooled connections.
        with requests.Session() as session:
            for cookie in driver.get_cookies():
                session.cookies.set(cookie["name"], cookie["value"])
            response = session.get(social_platform, headers=headers, timeout=timeout)
            return response.text
    finally:
        # Always shut the browser down, even when a wait times out.
        driver.quit()
def save_social_data():
    """Capture the authenticated page and store it as an HTML snapshot.

    The markup returned by ``capture_dynamic_feed`` is written, UTF-8
    encoded, to ``social_feed_snapshot.html``.
    """
    # Fetch before opening the file so a failed capture leaves it untouched.
    page_markup = capture_dynamic_feed()
    with open("social_feed_snapshot.html", "w", encoding="utf-8") as snapshot:
        snapshot.write(page_markup)
# Run the snapshot capture only when the file is executed as a script.
if __name__ == "__main__":
    save_social_data()