Scraping Lottery Draws, Exploring Frequencies with pyecharts, and a Basic SVR Baseline in Python
This walkthrough shows how to: (1) collect historical draw results from a static website, (2) explore number frequencies with pyecharts, and (3) build a simple SVR-based baseline model that maps dates/issue numbers to the seven drawn numbers.
1. Collect historical draw data
The target pages are static, so parsing with requests + BeautifulSoup is sufficient. The snippet below downloads multiple pages and writes a normalized CSV for downstream analysis.
import csv
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import List, Dict
import requests
from bs4 import BeautifulSoup
# Page-template URL for the draw-history listing; {} is replaced with the
# 1-based page number in fetch_page().
BASE_URL = 'http://www.lottery.gov.cn/historykj/history_{}.jspx?_ltype=dlt'
# Browser-like request headers sent with every page request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/122.0.0.0 Safari/537.36',
    'Upgrade-Insecure-Requests': '1',
    'Host': 'www.lottery.gov.cn'
}
# Adjust the page range as needed
START_PAGE = 1
END_PAGE = 93
def fetch_page(page_no: int) -> List[Dict[str, str]]:
    """Download one history page and parse its draw rows.

    Args:
        page_no: 1-based page index substituted into BASE_URL.

    Returns:
        A list of records with keys: date, issue, r1-r5 (front area),
        b1-b2 (back area). Empty list when the page has no result table.

    Raises:
        requests.HTTPError: on a non-2xx HTTP response.
    """
    url = BASE_URL.format(page_no)
    resp = requests.get(url, headers=HEADERS, timeout=15)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'lxml')
    result_div = soup.find('div', class_='result')
    if not result_div:
        return []
    tbody = result_div.find('tbody')
    if not tbody:
        return []
    rows = []
    for tr in tbody.find_all('tr'):
        cells = [td.get_text(strip=True) for td in tr.find_all('td')]
        if not cells:
            continue
        row_text = ' '.join(cells)
        # Extract date (YYYY-MM-DD) and issue number (5 digits)
        date_match = re.search(r'\b\d{4}-\d{2}-\d{2}\b', row_text)
        issue_match = re.search(r'\b\d{5}\b', row_text)
        if not date_match or not issue_match:
            continue
        # BUG FIX: the date's month/day parts ("08", "03" in 2019-08-03) also
        # match \b\d{2}\b because '-' is a word boundary. If the date cell
        # comes after the number cells in the row, the old "take the last 7
        # tokens" heuristic silently captured date fragments as ball numbers.
        # Remove the matched date span before extracting two-digit tokens.
        numbers_text = row_text[:date_match.start()] + row_text[date_match.end():]
        twodigs = re.findall(r'\b\d{2}\b', numbers_text)
        if len(twodigs) < 7:
            continue
        # Use last seven two-digit tokens to avoid accidental matches earlier in the row
        nums = list(map(int, twodigs[-7:]))
        record = {
            'date': date_match.group(0),
            'issue': issue_match.group(0),
            'r1': f'{nums[0]:02d}', 'r2': f'{nums[1]:02d}', 'r3': f'{nums[2]:02d}',
            'r4': f'{nums[3]:02d}', 'r5': f'{nums[4]:02d}',
            'b1': f'{nums[5]:02d}', 'b2': f'{nums[6]:02d}',
        }
        rows.append(record)
    return rows
def crawl_all(pages: range, output_csv: Path) -> None:
    """Fetch all requested pages concurrently and write one combined CSV.

    Args:
        pages: page numbers to download (e.g. range(1, 94)).
        output_csv: destination file; parent directories are created.

    Pages are downloaded in parallel, but rows are written in ascending page
    order (fix: the original wrote in thread-completion order, producing a
    nondeterministic file). A failed page is reported on stdout and skipped
    rather than aborting the whole crawl.
    """
    output_csv.parent.mkdir(parents=True, exist_ok=True)
    results: Dict[int, List[Dict[str, str]]] = {}
    with ThreadPoolExecutor(max_workers=8) as pool:
        futures = {pool.submit(fetch_page, p): p for p in pages}
        for fut in as_completed(futures):
            page = futures[fut]
            try:
                results[page] = fut.result()
            except Exception as exc:  # best-effort crawl: report and continue
                print(f'Page {page} failed: {exc}')
    with output_csv.open('w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(
            f,
            fieldnames=['date', 'issue', 'r1', 'r2', 'r3', 'r4', 'r5', 'b1', 'b2']
        )
        writer.writeheader()
        # Deterministic output: iterate successfully fetched pages in order.
        for page in sorted(results):
            writer.writerows(results[page])
if __name__ == '__main__':
    # Crawl pages START_PAGE..END_PAGE and write the combined CSV.
    pages = range(START_PAGE, END_PAGE + 1)
    crawl_all(pages, Path('data/draws.csv'))
Output schema: date, issue, r1–r5 (front/"red"), b1–b2 (back/"blue").
2. Quick frequency exploration with pyecharts
The next step aggregates occurrences of each number and renders simple bar charts for the front- and back-area balls.
import csv
from collections import Counter
from pyecharts import options as opts
from pyecharts.charts import Bar, Page
# CSV produced by the crawler in step 1 (schema: date, issue, r1-r5, b1-b2).
INPUT_CSV = 'data/draws.csv'
def load_numbers(path: str):
    """Read the crawled CSV and return (front, back) as flat lists of ints.

    Front-area columns are r1..r5 (five per draw); back-area columns are
    b1..b2 (two per draw). Values are concatenated across all rows.
    """
    front_cols = ('r1', 'r2', 'r3', 'r4', 'r5')
    back_cols = ('b1', 'b2')
    front, back = [], []
    with open(path, newline='', encoding='utf-8') as handle:
        for record in csv.DictReader(handle):
            for col in front_cols:
                front.append(int(record[col]))
            for col in back_cols:
                back.append(int(record[col]))
    return front, back
def build_bar(x_vals, y_vals, title: str, series_name: str, color: str = None) -> Bar:
    """Build a pyecharts bar chart for one number-frequency series.

    Args:
        x_vals: category values for the x axis (stringified before plotting).
        y_vals: counts aligned with x_vals.
        title: chart title.
        series_name: legend label for the single series.
        color: optional bar color; default pyecharts styling when omitted.

    Returns:
        A configured Bar chart with toolbox, axis names, and min/max markers.
    """
    item_style = opts.ItemStyleOpts(color=color) if color else None
    chart = Bar()
    chart.add_xaxis([str(value) for value in x_vals])
    chart.add_yaxis(series_name, y_vals, itemstyle_opts=item_style)
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title=title),
        toolbox_opts=opts.ToolboxOpts(),
        xaxis_opts=opts.AxisOpts(name='Number'),
        yaxis_opts=opts.AxisOpts(name='Count'),
    )
    marker_items = [
        opts.MarkPointItem(type_='max', name='Max'),
        opts.MarkPointItem(type_='min', name='Min'),
    ]
    chart.set_series_opts(markpoint_opts=opts.MarkPointOpts(data=marker_items))
    return chart
def visualize_frequencies():
    """Count how often each number appears and render bar charts to HTML.

    Reads INPUT_CSV, aggregates front- and back-area frequencies, and writes
    both charts to output/lottery_frequency.html.
    """
    import os  # local import: only this function needs filesystem helpers

    red_vals, blue_vals = load_numbers(INPUT_CSV)
    red_count = Counter(red_vals)
    blue_count = Counter(blue_vals)
    # Sort by numeric order for axes (keys are unique, so tuple sort == key sort)
    red_pairs = sorted(red_count.items(), key=lambda x: x[0])
    blue_pairs = sorted(blue_count.items(), key=lambda x: x[0])
    red_x, red_y = [k for k, _ in red_pairs], [v for _, v in red_pairs]
    blue_x, blue_y = [k for k, _ in blue_pairs], [v for _, v in blue_pairs]
    red_bar = build_bar(red_x, red_y, title='Front-Area Number Frequency', series_name='Front Count')
    blue_bar = build_bar(blue_x, blue_y, title='Back-Area Number Frequency', series_name='Back Count', color='blue')
    page = Page(page_title='Lottery Draw Frequency Overview')
    page.add(red_bar, blue_bar)
    # BUG FIX: Page.render does not create missing directories; without this
    # the script crashes when output/ does not exist yet.
    os.makedirs('output', exist_ok=True)
    page.render('output/lottery_frequency.html')
if __name__ == '__main__':
    # Aggregate frequencies and render output/lottery_frequency.html.
    visualize_frequencies()
This produces output/lottery_frequency.html with two bar charts.
3. A simple SVR baseline for seven positions
Below is a minimal baseline using scikit-learn’s SVR. Each of the seven positions is modeled separately with features [yyyymmdd, issue]. The example demonstrates training and a single prediction.
import csv
from typing import List, Tuple
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
# CSV produced by the crawler in step 1 (schema: date, issue, r1-r5, b1-b2).
INPUT_CSV = 'data/draws.csv'
def load_supervised(path: str) -> Tuple[np.ndarray, List[np.ndarray]]:
    """Build a supervised dataset from the crawled CSV.

    Features per draw are [yyyymmdd, issue] as floats; targets are seven
    parallel float arrays for positions r1..r5 then b1..b2.

    Returns:
        (X, y_list) where X has shape (n_draws, 2) and y_list holds seven
        arrays of length n_draws.
    """
    target_cols = ('r1', 'r2', 'r3', 'r4', 'r5', 'b1', 'b2')
    features = []
    targets = {col: [] for col in target_cols}
    with open(path, newline='', encoding='utf-8') as handle:
        for record in csv.DictReader(handle):
            # Features: yyyymmdd (int), issue (int)
            features.append([
                int(record['date'].replace('-', '')),
                int(record['issue']),
            ])
            for col in target_cols:
                targets[col].append(int(record[col]))
    X = np.asarray(features, dtype=np.float64)
    y_list = [np.asarray(targets[col], dtype=np.float64) for col in target_cols]
    return X, y_list
def train_models(X: np.ndarray, y_list: List[np.ndarray]):
    """Fit one RBF-kernel SVR pipeline per draw position.

    Args:
        X: feature matrix of shape (n_draws, 2).
        y_list: seven target arrays (r1..r5, b1..b2), each length n_draws.

    Returns:
        A list of seven fitted StandardScaler+SVR pipelines, in the same
        order as y_list.
    """
    fitted = []
    for target in y_list:
        pipeline = make_pipeline(
            StandardScaler(with_mean=True, with_std=True),
            SVR(kernel='rbf', gamma='scale', C=10.0, epsilon=0.1),
        )
        pipeline.fit(X, target)
        fitted.append(pipeline)
    return fitted
def predict_draw(models, date_str: str, issue_str: str) -> List[int]:
    """Predict seven numbers for one (date, issue) input.

    Args:
        models: seven fitted regressors exposing .predict, ordered r1..r5, b1..b2.
        date_str: draw date as 'YYYY-MM-DD'.
        issue_str: issue number string (e.g. '19089').

    Returns:
        Seven ints: raw regression outputs rounded and clamped to the legal
        range, 1..35 for the five front positions and 1..12 for the two back.
    """
    features = np.array(
        [[int(date_str.replace('-', '')), int(issue_str)]], dtype=np.float64
    )
    raw = [model.predict(features)[0] for model in models]

    def clamp(value, upper):
        # round() then clamp into [1, upper]
        return int(min(upper, max(1, round(value))))

    front = [clamp(v, 35) for v in raw[:5]]
    back = [clamp(v, 12) for v in raw[5:]]
    return front + back
if __name__ == '__main__':
    # Train seven position models on the crawled history, then run a single
    # illustrative prediction.
    X, y_list = load_supervised(INPUT_CSV)
    models = train_models(X, y_list)
    # Example future-like input
    example_date = '2019-08-03'
    example_issue = '19089'
    pred = predict_draw(models, example_date, example_issue)
    # pred contains seven integers [r1..r5, b1..b2]
    print('Predicted numbers:', pred)
This baseline uses a separate SVR for each position with simple numeric features. It illustrates end-to-end ingestion, aggregation, plotting, and model fitting for experimentation.