Home > Tech > Content

Web Data Extraction Patterns: Seven Practical Python Scraping Implementations

Tech May 17 15

Web scraping involves programmatically fetching and parsing web content to extract structured information. The following implementations demonstrate distinct approaches for various target architectures, ranging from static HTML parsing to dynamic JavaScript-rendered content retrieval.

1. Film Database Aggregation (Douban Top 250)

This implementation utilizes BeautifulSoup to extract cinematic metadata including titles, ratings, and review volumes from paginated listings, persisting results to CSV format.

import requests
from bs4 import BeautifulSoup
import csv

# Root URL of the Top 250 listing; persist_douban_data() appends
# "?start=<offset>" to it to request each page.
base_endpoint = "https://movie.douban.com/top250"
# Headers sent with every listing request; only a desktop Chrome
# User-Agent string is set.
request_headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

def extract_movies(page_content):
    """Yield one record per film found in a Top 250 listing page.

    Args:
        page_content: HTML text of a single listing page.

    Yields:
        dict with the keys "title", "rating" and "reviews", each holding
        the text of the matching node. "reviews" is the text of the last
        <span> inside the entry's div.star element.

    Entries that lack any of the three nodes are skipped instead of
    raising, so one malformed <li> does not abort the whole page.
    """
    document = BeautifulSoup(page_content, "html.parser")

    for entry in document.select("ol.grid_view li"):
        title_node = entry.select_one("span.title")
        rating_node = entry.select_one("span.rating_num")
        star_spans = entry.select("div.star span")

        # select_one() returns None and select() returns an empty list when
        # nothing matches; calling get_text() / indexing [-1] would raise.
        if title_node is None or rating_node is None or not star_spans:
            continue

        yield {
            "title": title_node.get_text(),
            "rating": rating_node.get_text(),
            "reviews": star_spans[-1].get_text(),
        }

def persist_douban_data():
    """Download the ten Top 250 listing pages and write the films to CSV.

    Creates (or overwrites) "douban_cinema_data.csv" in the current
    directory with the columns title, rating and reviews.

    Raises:
        requests.RequestException: if a request times out, the connection
            fails, or the server answers with a 4xx/5xx status.
    """
    with open("douban_cinema_data.csv", "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["title", "rating", "reviews"])
        writer.writeheader()

        # 250 films at 25 per page -> offsets 0, 25, ..., 225.
        for page_offset in range(0, 250, 25):
            paginated_url = f"{base_endpoint}?start={page_offset}"
            # Without a timeout, requests waits indefinitely on a stalled server.
            response = requests.get(paginated_url, headers=request_headers, timeout=10)
            # Fail loudly on an error status instead of parsing an error
            # page into zero rows and silently producing an empty file.
            response.raise_for_status()
            writer.writerows(extract_movies(response.text))

# Run the scrape only when this file is executed as a script, not on import.
if __name__ == "__main__":
    persist_douban_data()

2. Box Office Analytics (Maoyan Top 100)

Employing regular expressions for pattern matching against raw HTML, this solution captures theatrical release information and cast details from entertainment listings.

import requests
import re

# Root URL of the board; store_maoyan_results() appends "?offset=<n>"
# to it to request each page.
maoyan_base = "https://maoyan.com/board/4"
# Headers sent with every board request; only a User-Agent is set.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
}

def parse_cinema_listings(html_content):
    """Yield the name, cast and release date of each film listed in the HTML.

    Args:
        html_content: raw HTML text of one board page.

    Yields:
        dict with the keys "film_name", "performers" and "release_date";
        every value has surrounding whitespace stripped.
    """
    listing_re = re.compile(
        r'<p class="name">.*?<a.*?title="(.*?)".*?>(.*?)</a>.*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p>',
        re.DOTALL,
    )

    for found in listing_re.finditer(html_content):
        # The first group (the anchor's title attribute) is captured by the
        # pattern but not used; the anchor text supplies the film name.
        _title_attr, link_text, cast, released = found.groups()
        yield {
            "film_name": link_text.strip(),
            "performers": cast.strip(),
            "release_date": released.strip(),
        }

def store_maoyan_results():
    """Download the ten board pages and write one record per line.

    Creates (or overwrites) "maoyan_cinema_records.txt" in the current
    directory. Each line is the str() form of a record dict produced by
    parse_cinema_listings().

    Raises:
        requests.RequestException: if a request times out, the connection
            fails, or the server answers with a 4xx/5xx status.
    """
    with open("maoyan_cinema_records.txt", "w", encoding="utf-8") as output:
        # 100 films at 10 per page -> offsets 0, 10, ..., 90.
        for offset in range(0, 100, 10):
            target_url = f"{maoyan_base}?offset={offset}"
            # Without a timeout, requests waits indefinitely on a stalled server.
            resp = requests.get(target_url, headers=headers, timeout=10)
            # An error page matches nothing in the regex; raise instead of
            # silently writing fewer records.
            resp.raise_for_status()
            for entry in parse_cinema_listings(resp.text):
                output.write(f"{entry}\n")

# Run the scrape only when this file is executed as a script, not on import.
if __name__ == "__main__":
    store_maoyan_results()

3. Academic Institution Indexing (University Rankings)

This pattern demonstrates tabular data extraction from educational ranking platforms using regex-based HTML parsing.

import requests
import re

# The single ranking page fetched by export_university_data().
ranking_url = "http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html"
# Headers sent with the ranking request; only a User-Agent is set.
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"}

def scrape_academic_data(html):
    """Yield rank, name, region and score for each matching table row.

    Only rows opened with <tr class="alt"> are matched by the pattern.

    Args:
        html: raw HTML text of the ranking page.

    Yields:
        dict with the keys "rank", "institution", "region" and
        "total_score"; all values are the captured strings, unconverted.
    """
    row_re = re.compile(
        r'<tr class="alt">.*?<td>(\d+)</td>.*?<a href=".*?" target="_blank">(.*?)</a>.*?<td>(.*?)</td>.*?<td>([\d.]+)</td>.*?</tr>',
        re.S,
    )
    # Output keys, in the same order as the pattern's four capture groups.
    field_names = ("rank", "institution", "region", "total_score")

    for hit in row_re.finditer(html):
        yield dict(zip(field_names, hit.groups()))

def export_university_data():
    """Download the ranking page and write one formatted line per institution.

    Creates (or overwrites) "academic_rankings.txt" in the current
    directory. The page is fetched before the file is opened, so a failed
    download does not leave an empty or truncated file behind.

    Raises:
        requests.RequestException: if the request times out, the connection
            fails, or the server answers with a 4xx/5xx status.
    """
    # Without a timeout, requests waits indefinitely on a stalled server.
    response = requests.get(ranking_url, headers=headers, timeout=10)
    # An error page matches no rows; raise instead of writing an empty file.
    response.raise_for_status()

    with open("academic_rankings.txt", "w", encoding="utf-8") as f:
        for data in scrape_academic_data(response.text):
            f.write(f"{data['rank']}: {data['institution']} ({data['region']}) - Score: {data['total_score']}\n")

# Run the scrape only when this file is executed as a script, not on import.
if __name__ == "__main__":
    export_university_data()

4. Meteorological Data Collection (Weather Network)

Utilizing XPath selectors to navigate the hierarchical structure of a parsed HTML document, this implementation gathers atmospheric conditions and temperature ranges from meteorological services.

import requests
from lxml import etree
import csv

# The single forecast page fetched by archive_weather_metrics().
weather_endpoint = "http://www.weather.com.cn/textFC/hb.shtml"
# Headers sent with the forecast request; only a User-Agent is set.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}

def extract_climate_data(html):
    """Yield one weather record per table row that has all expected cells.

    Only the first div with class "conMidtab" in the document is read.
    Within it, each div with class "conMidtab3" is scanned row by row; the
    first <tr> of each is skipped, and any row missing one of the four
    looked-up values is skipped as well.

    Args:
        html: HTML text of the forecast page.

    Yields:
        dict with the keys "municipality", "condition", "max_temp" and
        "min_temp", holding the first text node found at td[2]/a, td[3],
        td[4] and td[5] respectively.

    Raises:
        IndexError: if the document contains no div.conMidtab.
    """
    root = etree.HTML(html)
    first_block = root.xpath('//div[@class="conMidtab"]')[0]

    for province_div in first_block.xpath('.//div[@class="conMidtab3"]'):
        for row in province_div.xpath('.//tr')[1:]:
            names = row.xpath('./td[2]/a/text()')
            conditions = row.xpath('./td[3]/text()')
            highs = row.xpath('./td[4]/text()')
            lows = row.xpath('./td[5]/text()')

            # A row lacking any of the four values is not a usable record.
            if not (names and conditions and highs and lows):
                continue

            yield {
                "municipality": names[0],
                "condition": conditions[0],
                "max_temp": highs[0],
                "min_temp": lows[0],
            }

def archive_weather_metrics():
    """Download the forecast page and write its records to CSV.

    Creates (or overwrites) "climate_data.csv" in the current directory
    with the columns municipality, condition, max_temp and min_temp. The
    page is fetched before the file is opened, so a failed download does
    not leave a header-only file behind.

    Raises:
        requests.RequestException: if the request times out, the connection
            fails, or the server answers with a 4xx/5xx status.
    """
    # Without a timeout, requests waits indefinitely on a stalled server.
    response = requests.get(weather_endpoint, headers=headers, timeout=10)
    response.raise_for_status()
    # Force UTF-8 decoding of the body rather than relying on the encoding
    # requests infers from the response headers.
    response.encoding = "utf-8"

    with open("climate_data.csv", "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["municipality", "condition", "max_temp", "min_temp"])
        writer.writeheader()
        writer.writerows(extract_climate_data(response.text))

# Run the scrape only when this file is executed as a script, not on import.
if __name__ == "__main__":
    archive_weather_metrics()

5. E-commerce Product Discovery (Dangdang Books)

This example illustrates structured data extraction from retail platforms using precise XPath expressions to locate product metadata including pricing and bibliographic details.

import requests
from lxml import etree
import csv

# Search-results URL fetched by catalog_books(); the query string carries
# key=python.
search_url = "http://search.dangdang.com/?key=python&act=input"
# Headers sent with the search request; only a User-Agent is set.
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"}

def harvest_publication_data(html):
    """Yield one record per product <li> that exposes all expected fields.

    Products are the <li> children of the <ul class="bigimg"> element.
    A product missing any of the looked-up attributes or text nodes is
    skipped.

    Args:
        html: HTML text of the search-results page.

    Yields:
        dict with the keys "title", "url", "price", "author" and
        "publisher". "price" has every "¥" character removed.
    """
    tree = etree.HTML(html)

    for node in tree.xpath('//ul[@class="bigimg"]/li'):
        try:
            title = node.xpath('./a/@title')[0]
            link = node.xpath('./a/@href')[0]
            price = node.xpath('.//span[@class="search_now_price"]/text()')[0].replace("¥", "")
            # Author and publisher are read from the 1st and 3rd <span>
            # of the same "search_book_author" paragraph.
            author = node.xpath('.//p[@class="search_book_author"]/span[1]/a/@title')[0]
            publisher = node.xpath('.//p[@class="search_book_author"]/span[3]/a/@title')[0]
        except (IndexError, AttributeError):
            continue

        yield {
            "title": title,
            "url": link,
            "price": price,
            "author": author,
            "publisher": publisher,
        }

def catalog_books():
    """Download the search-results page and write its products to CSV.

    Creates (or overwrites) "publication_catalog.csv" in the current
    directory with the columns title, price, author, publisher and url.
    The page is fetched before the file is opened, so a failed download
    does not leave a header-only file behind.

    Raises:
        requests.RequestException: if the request times out, the connection
            fails, or the server answers with a 4xx/5xx status.
    """
    # Without a timeout, requests waits indefinitely on a stalled server.
    resp = requests.get(search_url, headers=headers, timeout=10)
    # An error page contains no product list; raise instead of writing
    # an empty catalog.
    resp.raise_for_status()

    with open("publication_catalog.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["title", "price", "author", "publisher", "url"])
        writer.writeheader()
        writer.writerows(harvest_publication_data(resp.text))

# Run the scrape only when this file is executed as a script, not on import.
if __name__ == "__main__":
    catalog_books()

6. Content Aggregation (Humor Database)

This example demonstrates text extraction from content platforms, using XPath to isolate user-generated content blocks.

import requests
from lxml import etree

# URL template; compile_humor_archive() fills "{}" with the page number.
base_humor_url = "https://www.qiushibaike.com/text/page/{}/"
# Headers sent with every page request; only a User-Agent is set.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}

def retrieve_anecdotes(page_html):
    """Yield the text of each content block found on a listing page.

    Blocks are the <div> children of the div whose class attribute is
    exactly "col1 old-style-col1". For each one, the text nodes under
    div.content/span are concatenated and stripped.

    Args:
        page_html: HTML text of one listing page.

    Yields:
        str: the stripped text of a block. Blocks with no text, or with
        only whitespace, are skipped so callers never receive "".
    """
    parser = etree.HTML(page_html)
    stories = parser.xpath('//div[@class="col1 old-style-col1"]/div')

    for story in stories:
        # Keep the try body to the one call that can fail inside lxml, so
        # unrelated errors are not swallowed along with it.
        try:
            fragments = story.xpath('.//div[@class="content"]/span/text()')
        except etree.LxmlError:
            continue

        text = "".join(fragments).strip()
        if text:
            yield text

def compile_humor_archive():
    """Download listing pages 1-3 and write every anecdote to a text file.

    Creates (or overwrites) "anecdote_collection.txt" in the current
    directory. Each anecdote is followed by a line of 50 dashes.

    Raises:
        requests.RequestException: if a request times out, the connection
            fails, or the server answers with a 4xx/5xx status.
    """
    separator = "-" * 50

    with open("anecdote_collection.txt", "w", encoding="utf-8") as archive:
        for page_num in range(1, 4):
            url = base_humor_url.format(page_num)
            # Without a timeout, requests waits indefinitely on a stalled server.
            response = requests.get(url, headers=headers, timeout=10)
            # An error page yields no anecdotes; raise instead of silently
            # producing a shorter archive.
            response.raise_for_status()
            for anecdote in retrieve_anecdotes(response.text):
                archive.write(anecdote + "\n" + separator + "\n")

# Run the scrape only when this file is executed as a script, not on import.
if __name__ == "__main__":
    compile_humor_archive()

7. Dynamic Content Retrieval (Social Media Platform)

For JavaScript-rendered content requiring browser automation, this implementation leverages Selenium WebDriver with explicit waits to handle authentication and dynamic DOM manipulation.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import requests

# Page opened in the browser for login and later re-fetched with requests
# by capture_dynamic_feed().
social_platform = "https://weibo.com/"
# Headers for the follow-up requests call only; they are not applied to
# the Selenium-driven browser.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}

def capture_dynamic_feed():
    """Log in through headless Chrome, then re-fetch the page with requests.

    The browser submits the login form and waits for an element with class
    "WB_feed" to appear. Its cookies are then copied into a requests
    session, which downloads the same URL again.

    Returns:
        str: body text of the requests response (not the browser's DOM).

    Raises:
        selenium.common.exceptions.TimeoutException: if the username field
            or the feed element does not appear within 15 seconds.
        selenium.common.exceptions.NoSuchElementException: if the password
            field or submit button is not found.
        requests.RequestException: if the follow-up request times out,
            fails to connect, or returns a 4xx/5xx status.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)

    try:
        driver.get(social_platform)
        wait = WebDriverWait(driver, 15)

        username_field = wait.until(EC.presence_of_element_located((By.NAME, "username")))
        password_field = driver.find_element(By.NAME, "password")
        submit_btn = driver.find_element(By.CLASS_NAME, "W_btn_a")

        # Placeholder credentials: replace with real values before running.
        username_field.send_keys("your_username")
        password_field.send_keys("your_password")
        submit_btn.click()

        # The feed element appearing is taken as the sign that login succeeded.
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "WB_feed")))

        session = requests.Session()
        for cookie in driver.get_cookies():
            # Carry over domain and path so each cookie stays scoped to the
            # site that set it; a cookie stored without a domain is not
            # restricted to any particular host.
            session.cookies.set(
                cookie["name"],
                cookie["value"],
                domain=cookie.get("domain", ""),
                path=cookie.get("path", "/"),
            )

        # Without a timeout, requests waits indefinitely on a stalled server.
        response = session.get(social_platform, headers=headers, timeout=15)
        response.raise_for_status()
        return response.text

    finally:
        # Always shut the browser down, including on failure.
        driver.quit()

def save_social_data():
    """Fetch the feed HTML and store it as "social_feed_snapshot.html".

    The file is created (or overwritten) in the current directory and
    written as UTF-8.
    """
    snapshot = capture_dynamic_feed()
    with open("social_feed_snapshot.html", mode="w", encoding="utf-8") as out:
        out.write(snapshot)

# Run the capture only when this file is executed as a script, not on import.
if __name__ == "__main__":
    save_social_data()

Tags: Python

Back to List

Prev: Architecture and Design of a Quest System in Unity3D

Next: Solving Word Break with Dynamic Programming and Understanding Multiple Knapsack

Fading Coder

Web Data Extraction Patterns: Seven Practical Python Scraping Implementations

Related Articles

Understanding Strong and Weak References in Java

Comprehensive Guide to SSTI Explained with Payload Bypass Techniques

Implement Image Upload Functionality for Django Integrated TinyMCE Editor

Leave a Comment

Copyright © fadingcoder.top

Fading Coder

Web Data Extraction Patterns: Seven Practical Python Scraping Implementations

Related Articles

Understanding Strong and Weak References in Java

Comprehensive Guide to SSTI Explained with Payload Bypass Techniques

Implement Image Upload Functionality for Django Integrated TinyMCE Editor

Leave a Comment | Cancel Reply

Copyright © fadingcoder.top

Leave a Comment