Fading Coder

An Old Coder’s Final Dance

Home > Tech > Content

Scraping Lottery Draws, Exploring Frequencies with pyecharts, and a Basic SVR Baseline in Python

Tech 2

This walkthrough shows how to: (1) collect historical draw results from a static website, (2) explore number frequencies with pyecharts, and (3) build a simple SVR-based baseline model that maps dates/issue numbers to the seven drawn numbers.

1. Collect historical draw data

The target pages are static, so parsing with requests + BeautifulSoup is sufficient. The snippet below downloads multiple pages and writes a normalized CSV for downstream analysis.

import csv
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import List, Dict

import requests
from bs4 import BeautifulSoup

# Page template for the static draw-history listing; the placeholder is the
# 1-based page number, and the _ltype query parameter selects the game type.
BASE_URL = 'http://www.lottery.gov.cn/historykj/history_{}.jspx?_ltype=dlt'
# Browser-like request headers; Host is pinned to the scraped site.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/122.0.0.0 Safari/537.36',
    'Upgrade-Insecure-Requests': '1',
    'Host': 'www.lottery.gov.cn'
}

# Adjust the page range as needed
START_PAGE = 1
END_PAGE = 93


def fetch_page(page_no: int) -> List[Dict[str, str]]:
    """Download one history page and parse its draw rows.

    Returns a list of records with keys date, issue, r1-r5 (front area)
    and b1-b2 (back area). An empty list is returned when the expected
    markup is absent from the page.
    """
    response = requests.get(BASE_URL.format(page_no), headers=HEADERS, timeout=15)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'lxml')
    container = soup.find('div', class_='result')
    body = container.find('tbody') if container else None
    if body is None:
        return []

    records: List[Dict[str, str]] = []
    for row in body.find_all('tr'):
        texts = [cell.get_text(strip=True) for cell in row.find_all('td')]
        if not texts:
            continue
        joined = ' '.join(texts)

        # Pull out the draw date (YYYY-MM-DD) and the 5-digit issue number.
        found_date = re.search(r'\b\d{4}-\d{2}-\d{2}\b', joined)
        found_issue = re.search(r'\b\d{5}\b', joined)
        # All standalone two-digit tokens; the trailing seven are the balls.
        pairs = re.findall(r'\b\d{2}\b', joined)
        if found_date is None or found_issue is None or len(pairs) < 7:
            continue

        # Take the LAST seven tokens so stray two-digit matches earlier in
        # the row (e.g. date fragments) are ignored.
        balls = [int(tok) for tok in pairs[-7:]]
        record: Dict[str, str] = {
            'date': found_date.group(0),
            'issue': found_issue.group(0),
        }
        keys = ('r1', 'r2', 'r3', 'r4', 'r5', 'b1', 'b2')
        record.update({key: f'{num:02d}' for key, num in zip(keys, balls)})
        records.append(record)
    return records


def crawl_all(pages: range, output_csv: Path) -> None:
    """Fetch every page concurrently and stream all parsed rows into a CSV.

    Rows are written in completion order, so they are not guaranteed to be
    sorted by issue; failed pages are reported on stdout and skipped.
    """
    output_csv.parent.mkdir(parents=True, exist_ok=True)
    columns = ['date', 'issue', 'r1', 'r2', 'r3', 'r4', 'r5', 'b1', 'b2']
    with ThreadPoolExecutor(max_workers=8) as pool, output_csv.open('w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=columns)
        writer.writeheader()

        pending = {pool.submit(fetch_page, page): page for page in pages}
        for done in as_completed(pending):
            try:
                writer.writerows(done.result())
            except Exception as exc:
                print(f'Page {pending[done]} failed: {exc}')


if __name__ == '__main__':
    # Crawl the configured page range and write the combined CSV.
    crawl_all(range(START_PAGE, END_PAGE + 1), Path('data/draws.csv'))

Output schema: date, issue, r1–r5 (front/"red"), b1–b2 (back/"blue").

2. Quick frequency exploration with pyecharts

The next step aggregates occurrences of each number and renders simple bar charts for the front- and back-area balls.

import csv
from collections import Counter

from pyecharts import options as opts
from pyecharts.charts import Bar, Page

# CSV produced by the scraping step in section 1.
INPUT_CSV = 'data/draws.csv'


def load_numbers(path: str):
    """Read the draw CSV and return (front_numbers, back_numbers).

    Each returned value is a flat list of ints: five front-area numbers
    per draw in the first list, two back-area numbers per draw in the second.
    """
    front_cols = ('r1', 'r2', 'r3', 'r4', 'r5')
    back_cols = ('b1', 'b2')
    front, back = [], []
    with open(path, newline='', encoding='utf-8') as handle:
        for record in csv.DictReader(handle):
            for col in front_cols:
                front.append(int(record[col]))
            for col in back_cols:
                back.append(int(record[col]))
    return front, back


def build_bar(x_vals, y_vals, title: str, series_name: str, color: str = None) -> Bar:
    """Build a bar chart of counts per number.

    Adds a toolbox, axis names, and max/min mark points; an optional bar
    color may be supplied for the series.
    """
    item_style = opts.ItemStyleOpts(color=color) if color else None
    chart = (
        Bar()
        .add_xaxis([str(value) for value in x_vals])
        .add_yaxis(series_name, y_vals, itemstyle_opts=item_style)
        .set_global_opts(
            title_opts=opts.TitleOpts(title=title),
            toolbox_opts=opts.ToolboxOpts(),
            xaxis_opts=opts.AxisOpts(name='Number'),
            yaxis_opts=opts.AxisOpts(name='Count'),
        )
        .set_series_opts(
            markpoint_opts=opts.MarkPointOpts(
                data=[
                    opts.MarkPointItem(type_='max', name='Max'),
                    opts.MarkPointItem(type_='min', name='Min'),
                ]
            )
        )
    )
    return chart


def visualize_frequencies():
    """Aggregate number frequencies from the CSV and render two bar charts.

    Writes output/lottery_frequency.html. Fix: the output directory is
    created first — pyecharts' render() opens the target file directly and
    raises FileNotFoundError if the directory does not exist.
    """
    import os  # local import so the module's top-level imports stay untouched

    red_vals, blue_vals = load_numbers(INPUT_CSV)

    # Sort (number, count) pairs numerically so the x-axis is ordered.
    red_pairs = sorted(Counter(red_vals).items())
    blue_pairs = sorted(Counter(blue_vals).items())

    red_x, red_y = [k for k, _ in red_pairs], [v for _, v in red_pairs]
    blue_x, blue_y = [k for k, _ in blue_pairs], [v for _, v in blue_pairs]

    red_bar = build_bar(red_x, red_y, title='Front-Area Number Frequency', series_name='Front Count')
    blue_bar = build_bar(blue_x, blue_y, title='Back-Area Number Frequency', series_name='Back Count', color='blue')

    page = Page(page_title='Lottery Draw Frequency Overview')
    page.add(red_bar, blue_bar)

    # Bug fix: ensure the destination directory exists before rendering.
    os.makedirs('output', exist_ok=True)
    page.render('output/lottery_frequency.html')


if __name__ == '__main__':
    # Entry point: build the frequency charts from the scraped CSV.
    visualize_frequencies()

This produces output/lottery_frequency.html with two bar charts.

3. A simple SVR baseline for seven positions

Below is a minimal baseline using scikit-learn’s SVR. Each of the seven positions is modeled separately with features [yyyymmdd, issue]. The example demonstrates training and a single prediction.

import csv
from typing import List, Tuple

import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# CSV produced by the scraping step in section 1.
INPUT_CSV = 'data/draws.csv'


def load_supervised(path: str) -> Tuple[np.ndarray, List[np.ndarray]]:
    """Build a supervised dataset from the draw CSV.

    Returns (X, y_list) where X has one row per draw with features
    [yyyymmdd, issue], and y_list holds seven float target vectors,
    one per drawn position in the order r1..r5, b1..b2.
    """
    target_cols = ('r1', 'r2', 'r3', 'r4', 'r5', 'b1', 'b2')
    features = []
    targets = {col: [] for col in target_cols}

    with open(path, newline='', encoding='utf-8') as handle:
        for row in csv.DictReader(handle):
            # Features: compact numeric date (yyyymmdd) and issue number.
            features.append([int(row['date'].replace('-', '')), int(row['issue'])])
            for col in target_cols:
                targets[col].append(int(row[col]))

    X = np.asarray(features, dtype=np.float64)
    y_list = [np.asarray(targets[col], dtype=np.float64) for col in target_cols]
    return X, y_list


def train_models(X: np.ndarray, y_list: List[np.ndarray]):
    """Fit one standardized RBF-kernel SVR per target position.

    Returns a list of fitted pipelines in the same order as y_list.
    """
    def fit_one(targets: np.ndarray):
        # Standardize features, then fit the RBF SVR on this position.
        pipeline = make_pipeline(
            StandardScaler(with_mean=True, with_std=True),
            SVR(kernel='rbf', gamma='scale', C=10.0, epsilon=0.1),
        )
        pipeline.fit(X, targets)
        return pipeline

    return [fit_one(y) for y in y_list]


def predict_draw(models, date_str: str, issue_str: str) -> List[int]:
    """Predict all seven positions for a single (date, issue) pair.

    Each model's raw prediction is rounded to the nearest integer and
    clamped to the nominal ranges: 1..35 for the five front-area numbers,
    1..12 for the two back-area numbers.
    """
    features = np.array([[int(date_str.replace('-', '')), int(issue_str)]], dtype=np.float64)
    raw = [model.predict(features)[0] for model in models]

    def clamp(value, upper):
        return int(max(1, min(upper, round(value))))

    front = [clamp(v, 35) for v in raw[:5]]
    back = [clamp(v, 12) for v in raw[5:]]
    return front + back


if __name__ == '__main__':
    # Load the dataset, fit one SVR per position, and run one example query.
    X, y_list = load_supervised(INPUT_CSV)
    fitted = train_models(X, y_list)

    # Example future-like input; result is seven ints [r1..r5, b1..b2].
    prediction = predict_draw(fitted, '2019-08-03', '19089')
    print('Predicted numbers:', prediction)

This baseline uses a separate SVR for each position with simple numeric features. It illustrates end-to-end ingestion, aggregation, plotting, and model fitting for experimentation.

Related Articles

Understanding Strong and Weak References in Java

Strong References Strong references are the most prevalent type of object referencing in Java. When an object has a strong reference pointing to it, the garbage collector will not reclaim its memory. F...

Comprehensive Guide to SSTI Explained with Payload Bypass Techniques

Introduction Server-Side Template Injection (SSTI) is a vulnerability in web applications where user input is improperly handled within the template engine and executed on the server. This exploit can r...

Implement Image Upload Functionality for Django Integrated TinyMCE Editor

Django’s Admin panel is highly user-friendly, and pairing it with TinyMCE, an effective rich text editor, simplifies content management significantly. Combining the two is particularly useful for bloggi...

Leave a Comment

Anonymous

◎Feel free to join the discussion and share your thoughts.