Fading Coder

One Final Commit for the Last Sprint

Home > Tech > Content

Web Data Extraction Patterns: Seven Practical Python Scraping Implementations

Tech May 17 2

Web scraping involves programmatically fetching and parsing web content to extract structured information. The following implementations demonstrate distinct approaches for various target architectures, ranging from static HTML parsing to dynamic JavaScript-rendered content retrieval.

1. Film Database Aggregation (Douban Top 250)

This implementation utilizes BeautifulSoup to extract cinematic metadata including titles, ratings, and review volumes from paginated listings, persisting results to CSV format.

import requests
from bs4 import BeautifulSoup
import csv

# Listing endpoint; individual pages are selected by appending a `start` query parameter.
base_endpoint = "https://movie.douban.com/top250"
# Headers sent with every request; only a desktop-browser User-Agent is set.
request_headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

def extract_movies(page_content):
    """Yield one record per film found in a Top 250 listing page.

    Args:
        page_content: HTML text of a single listing page.

    Yields:
        dict with the keys ``title``, ``rating`` and ``reviews``; every value
        is the text of the matched element, left unconverted.

    Raises:
        AttributeError: If a list item lacks ``span.title`` or
            ``span.rating_num`` (``select_one`` returns ``None``).
        IndexError: If a list item has no ``div.star span`` elements.
    """
    soup = BeautifulSoup(page_content, "html.parser")

    for item in soup.select("ol.grid_view li"):
        star_spans = item.select("div.star span")
        record = {
            # select_one returns the first match when an item has several titles.
            "title": item.select_one("span.title").get_text(),
            "rating": item.select_one("span.rating_num").get_text(),
            # The review count is taken from the last span inside div.star.
            "reviews": star_spans[-1].get_text(),
        }
        yield record

def persist_douban_data(output_path="douban_cinema_data.csv", timeout=10):
    """Download every Top 250 listing page and write the films to a CSV file.

    Pages are requested with ``start`` offsets 0, 25, ..., 225, and each record
    produced by ``extract_movies`` becomes one CSV row.

    Args:
        output_path: Destination CSV file; it is overwritten if it exists.
        timeout: Seconds to wait on each HTTP request before giving up.

    Raises:
        requests.RequestException: If a request fails, times out, or returns
            an HTTP error status.
    """
    fieldnames = ["title", "rating", "reviews"]

    with open(output_path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for page_offset in range(0, 250, 25):
            paginated_url = f"{base_endpoint}?start={page_offset}"
            # Without a timeout, requests waits indefinitely on a stalled server.
            response = requests.get(
                paginated_url, headers=request_headers, timeout=timeout
            )
            # An error page contains no listings; fail loudly instead of
            # silently producing a CSV with missing rows.
            response.raise_for_status()
            writer.writerows(extract_movies(response.text))

if __name__ == "__main__":
    persist_douban_data()

2. Box Office Analytics (Maoyan Top 100)

Employing regular expressions for pattern matching against raw HTML, this solution captures theatrical release information and cast details from entertainment listings.

import requests
import re

# Board endpoint; individual pages are selected by appending an `offset` query parameter.
maoyan_base = "https://maoyan.com/board/4"
# Headers sent with every request; only a browser-style User-Agent is set.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
}

def parse_cinema_listings(html_content):
    """Yield film name, performers and release date for each listing in the HTML.

    Args:
        html_content: Raw HTML text of one board page.

    Yields:
        dict with the keys ``film_name``, ``performers`` and ``release_date``,
        each stripped of surrounding whitespace.
    """
    listing_re = re.compile(
        r'<p class="name">.*?<a.*?title="(.*?)".*?>(.*?)</a>.*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p>',
        re.DOTALL
    )

    for found in listing_re.finditer(html_content):
        # The first group (the anchor's title attribute) is captured but unused;
        # the film name comes from the anchor text.
        _title_attr, anchor_text, cast, released = found.groups()
        yield {
            "film_name": anchor_text.strip(),
            "performers": cast.strip(),
            "release_date": released.strip(),
        }

def store_maoyan_results(output_path="maoyan_cinema_records.txt", timeout=10):
    """Download the ten board pages and write one parsed record per line.

    Pages are requested with ``offset`` values 0, 10, ..., 90. Each record from
    ``parse_cinema_listings`` is written using its ``str()`` form.

    Args:
        output_path: Destination text file; it is overwritten if it exists.
        timeout: Seconds to wait on each HTTP request before giving up.

    Raises:
        requests.RequestException: If a request fails, times out, or returns
            an HTTP error status.
    """
    with open(output_path, "w", encoding="utf-8") as output:
        for offset in range(0, 100, 10):
            target_url = f"{maoyan_base}?offset={offset}"
            # Without a timeout, requests waits indefinitely on a stalled server.
            resp = requests.get(target_url, headers=headers, timeout=timeout)
            # An error page matches no listings; fail loudly instead of
            # silently writing an incomplete file.
            resp.raise_for_status()
            output.writelines(
                f"{entry}\n" for entry in parse_cinema_listings(resp.text)
            )

if __name__ == "__main__":
    store_maoyan_results()

3. Academic Institution Indexing (University Rankings)

This pattern demonstrates tabular data extraction from educational ranking platforms using regex-based HTML parsing.

import requests
import re

# The single ranking page downloaded by export_university_data.
ranking_url = "http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html"
# Headers sent with the request; only a browser-style User-Agent is set.
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"}

def scrape_academic_data(html):
    """Yield rank, institution, region and total score for each matched table row.

    Only ``<tr class="alt">`` rows that fit the pattern are reported.

    Args:
        html: Raw HTML text of the ranking page.

    Yields:
        dict with the keys ``rank``, ``institution``, ``region`` and
        ``total_score``; all values are the captured strings, unconverted.
    """
    row_re = re.compile(
        r'<tr class="alt">.*?<td>(\d+)</td>.*?<a href=".*?" target="_blank">(.*?)</a>.*?<td>(.*?)</td>.*?<td>([\d.]+)</td>.*?</tr>',
        re.S
    )
    # Output keys, in the same order as the pattern's capture groups.
    columns = ("rank", "institution", "region", "total_score")

    for hit in row_re.finditer(html):
        yield dict(zip(columns, hit.groups()))

def export_university_data(output_path="academic_rankings.txt", timeout=10):
    """Download the ranking page and write one formatted line per institution.

    Args:
        output_path: Destination text file; it is overwritten if it exists.
        timeout: Seconds to wait on the HTTP request before giving up.

    Raises:
        requests.RequestException: If the request fails, times out, or returns
            an HTTP error status.
    """
    # Download first, so a failed request does not truncate an existing file.
    # Without a timeout, requests waits indefinitely on a stalled server.
    response = requests.get(ranking_url, headers=headers, timeout=timeout)
    response.raise_for_status()

    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(
            f"{data['rank']}: {data['institution']} ({data['region']}) - Score: {data['total_score']}\n"
            for data in scrape_academic_data(response.text)
        )

if __name__ == "__main__":
    export_university_data()

4. Meteorological Data Collection (Weather Network)

Utilizing XPath selectors to navigate hierarchical XML structures, this implementation gathers atmospheric conditions and temperature ranges from meteorological services.

import requests
from lxml import etree
import csv

# The single forecast page downloaded by archive_weather_metrics.
weather_endpoint = "http://www.weather.com.cn/textFC/hb.shtml"
# Headers sent with the request; only a browser-style User-Agent is set.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}

def extract_climate_data(html):
    """Yield city name, weather condition and high/low temperature per table row.

    Only the first ``div.conMidtab`` container in the document is read. Inside
    it, every ``div.conMidtab3`` is scanned and the first ``<tr>`` of each is
    skipped (presumably a header row; confirm against the live page). Rows that
    lack any of the expected cells are skipped silently.

    Args:
        html: HTML text of the forecast page.

    Yields:
        dict with the keys ``municipality``, ``condition``, ``max_temp`` and
        ``min_temp``; values are the raw cell text.

    Raises:
        IndexError: If the document has no ``div.conMidtab`` element.
    """
    root = etree.HTML(html)
    first_block = root.xpath('//div[@class="conMidtab"]')[0]

    for province_table in first_block.xpath('.//div[@class="conMidtab3"]'):
        for row in province_table.xpath('.//tr')[1:]:
            try:
                name = row.xpath('./td[2]/a/text()')[0]
                sky = row.xpath('./td[3]/text()')[0]
                high = row.xpath('./td[4]/text()')[0]
                low = row.xpath('./td[5]/text()')[0]
            except IndexError:
                # A row without every expected cell is not a usable city row.
                continue
            yield {
                "municipality": name,
                "condition": sky,
                "max_temp": high,
                "min_temp": low,
            }

def archive_weather_metrics(output_path="climate_data.csv", timeout=10):
    """Download the forecast page and write the extracted rows to a CSV file.

    Args:
        output_path: Destination CSV file; it is overwritten if it exists.
        timeout: Seconds to wait on the HTTP request before giving up.

    Raises:
        requests.RequestException: If the request fails, times out, or returns
            an HTTP error status.
    """
    # Download first, so a failed request does not truncate an existing file.
    # Without a timeout, requests waits indefinitely on a stalled server.
    response = requests.get(weather_endpoint, headers=headers, timeout=timeout)
    response.raise_for_status()
    # Force UTF-8 decoding rather than relying on the detected charset.
    response.encoding = "utf-8"

    with open(output_path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["municipality", "condition", "max_temp", "min_temp"])
        writer.writeheader()
        writer.writerows(extract_climate_data(response.text))

if __name__ == "__main__":
    archive_weather_metrics()

5. E-commerce Product Discovery (Dangdang Books)

This example illustrates structured data extraction from retail platforms using precise XPath expressions to locate product metadata including pricing and bibliographic details.

import requests
from lxml import etree
import csv

# Search-results URL; the query term is fixed to `python` via the `key` parameter.
search_url = "http://search.dangdang.com/?key=python&act=input"
# Headers sent with the request; only a browser-style User-Agent is set.
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"}

def harvest_publication_data(html):
    """Yield title, URL, price, author and publisher for each product in the page.

    Products are the ``<li>`` children of ``ul.bigimg``. An item missing any
    of the five fields is skipped silently.

    Args:
        html: HTML text of a search-results page.

    Yields:
        dict with the keys ``title``, ``url``, ``price``, ``author`` and
        ``publisher``. The price has every "¥" character removed and is
        otherwise left as text.
    """
    tree = etree.HTML(html)

    for node in tree.xpath('//ul[@class="bigimg"]/li'):
        try:
            title = node.xpath('./a/@title')[0]
            link = node.xpath('./a/@href')[0]
            price = node.xpath('.//span[@class="search_now_price"]/text()')[0].replace("¥", "")
            # Author and publisher are read from the 1st and 3rd <span> of the same paragraph.
            author = node.xpath('.//p[@class="search_book_author"]/span[1]/a/@title')[0]
            publisher = node.xpath('.//p[@class="search_book_author"]/span[3]/a/@title')[0]
        except (IndexError, AttributeError):
            continue
        yield {
            "title": title,
            "url": link,
            "price": price,
            "author": author,
            "publisher": publisher,
        }

def catalog_books(output_path="publication_catalog.csv", timeout=10):
    """Download the search-results page and write the products to a CSV file.

    Args:
        output_path: Destination CSV file; it is overwritten if it exists.
        timeout: Seconds to wait on the HTTP request before giving up.

    Raises:
        requests.RequestException: If the request fails, times out, or returns
            an HTTP error status.
    """
    # Download first, so a failed request does not truncate an existing file.
    # Without a timeout, requests waits indefinitely on a stalled server.
    resp = requests.get(search_url, headers=headers, timeout=timeout)
    resp.raise_for_status()

    with open(output_path, "w", newline="", encoding="utf-8") as f:
        # Column order is set here; DictWriter maps each record's keys by name.
        writer = csv.DictWriter(f, fieldnames=["title", "price", "author", "publisher", "url"])
        writer.writeheader()
        writer.writerows(harvest_publication_data(resp.text))

if __name__ == "__main__":
    catalog_books()

6. Content Aggregation (Humor Database)

This example demonstrates text extraction from content platforms, using XPath to isolate user-generated content blocks.

import requests
from lxml import etree

# URL template; the `{}` placeholder is filled with the page number.
base_humor_url = "https://www.qiushibaike.com/text/page/{}/"
# Headers sent with every request; only a browser-style User-Agent is set.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}

def retrieve_anecdotes(page_html):
    """Yield the text of each story block found in the page.

    Story blocks are the ``<div>`` children of the
    ``div.col1.old-style-col1`` container. For each block, the text nodes of
    ``div.content > span`` are concatenated and stripped. Blocks with no such
    text nodes are skipped.

    Args:
        page_html: HTML text of one listing page.

    Yields:
        str: The joined, stripped story text.
    """
    root = etree.HTML(page_html)

    for block in root.xpath('//div[@class="col1 old-style-col1"]/div'):
        try:
            fragments = block.xpath('.//div[@class="content"]/span/text()')
            text = "".join(fragments).strip() if fragments else None
        except Exception:
            continue
        # None marks "no text nodes"; an empty string after stripping is still yielded.
        if text is not None:
            yield text

def compile_humor_archive(output_path="anecdote_collection.txt", timeout=10):
    """Download pages 1-3 and write every story, followed by a separator line.

    Args:
        output_path: Destination text file; it is overwritten if it exists.
        timeout: Seconds to wait on each HTTP request before giving up.

    Raises:
        requests.RequestException: If a request fails, times out, or returns
            an HTTP error status.
    """
    # Written after every story: a newline, fifty dashes, a newline.
    separator = "\n" + "-" * 50 + "\n"

    with open(output_path, "w", encoding="utf-8") as archive:
        for page_num in range(1, 4):
            url = base_humor_url.format(page_num)
            # Without a timeout, requests waits indefinitely on a stalled server.
            response = requests.get(url, headers=headers, timeout=timeout)
            # An error page contains no stories; fail loudly instead of
            # silently writing an incomplete archive.
            response.raise_for_status()
            archive.writelines(
                anecdote + separator
                for anecdote in retrieve_anecdotes(response.text)
            )

if __name__ == "__main__":
    compile_humor_archive()

7. Dynamic Content Retrieval (Social Media Platform)

For JavaScript-rendered content requiring browser automation, this implementation leverages Selenium WebDriver with explicit waits to handle authentication and dynamic DOM manipulation.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import requests

# Page opened in the browser and later re-requested with the browser's cookies.
social_platform = "https://weibo.com/"
# Headers for the follow-up `requests` call; only a browser-style User-Agent is set.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}

def capture_dynamic_feed():
    """Log in through headless Chrome, then fetch the page with the browser's cookies.

    Steps:
      1. Open ``social_platform`` in a headless Chrome session.
      2. Wait up to 15 seconds for the ``username`` input, fill in the
         placeholder credentials and click the submit button.
      3. Wait up to 15 seconds for an element with class ``WB_feed``.
      4. Copy the browser cookies (name and value only) into a
         ``requests.Session`` and request ``social_platform`` again.

    Returns:
        str: Body text of the response obtained through ``requests``.

    The browser is always shut down, including when a step raises.
    """
    options = Options()
    options.add_argument("--headless")
    browser = webdriver.Chrome(options=options)

    try:
        browser.get(social_platform)
        waiter = WebDriverWait(browser, 15)

        # Only the username input is awaited; the other two controls are
        # looked up immediately afterwards.
        user_input = waiter.until(EC.presence_of_element_located((By.NAME, "username")))
        pass_input = browser.find_element(By.NAME, "password")
        login_button = browser.find_element(By.CLASS_NAME, "W_btn_a")

        user_input.send_keys("your_username")
        pass_input.send_keys("your_password")
        login_button.click()

        # Block until the feed container is present in the DOM.
        waiter.until(EC.presence_of_element_located((By.CLASS_NAME, "WB_feed")))

        http = requests.Session()
        for entry in browser.get_cookies():
            http.cookies.set(entry['name'], entry['value'])

        return http.get(social_platform, headers=headers).text

    finally:
        browser.quit()

def save_social_data():
    """Capture the feed HTML and write it to ``social_feed_snapshot.html``.

    The capture runs before the file is opened, so a failed capture leaves
    any existing snapshot untouched.
    """
    snapshot = capture_dynamic_feed()
    with open("social_feed_snapshot.html", "w", encoding="utf-8") as out_file:
        out_file.write(snapshot)

if __name__ == "__main__":
    save_social_data()
Tags: Python

Related Articles

Comprehensive Guide to SSTI Explained with Payload Bypass Techniques

Introduction Server-Side Template Injection (SSTI) is a vulnerability in web applications where user input is improperly handled within the template engine and executed on the server. This exploit can r...

Implement Image Upload Functionality for Django Integrated TinyMCE Editor

Django’s Admin panel is highly user-friendly, and pairing it with TinyMCE, an effective rich text editor, simplifies content management significantly. Combining the two is particularly useful for bloggi...

SBUS Signal Analysis and Communication Implementation Using STM32 with Fus Remote Controller

Overview In a recent project, I utilized the SBUS protocol with the Fus remote controller to control a vehicle's basic operations, including movement, lights, and mode switching. This article is aimed...

Leave a Comment

Anonymous

◎Feel free to join the discussion and share your thoughts.