Web Scraping Sina Weibo Hot Search List
Core Libraries
import json
import os
import time
import warnings
from urllib.parse import quote

import requests

warnings.filterwarnings('ignore')
Fetching Hot Search Data
This function retrieves the raw JSON data from Weibo's hot search endpoint.
def get_hot_search_feed():
    """Fetch the raw hot-search payload from Weibo's AJAX endpoint.

    Returns:
        The ``data`` object from the JSON response on HTTP 200,
        or ``None`` on any non-200 status or network failure.
    """
    target_url = 'https://weibo.com/ajax/side/hotSearch'
    try:
        # requests has NO default timeout: without one, a stalled
        # connection would hang this script indefinitely.
        resp = requests.get(target_url, timeout=10)
    except requests.RequestException:
        return None
    if resp.status_code == 200:
        return resp.json().get('data')
    return None
Parsing and Storing Data
The main processing function extracts information and saves it to a structured text file.
def process_hot_search(entry_count):
    """Fetch the Weibo hot-search list and persist it as CSV-style text.

    Writes one file per invocation under ``HotSearchData/<date>/``,
    named with the current timestamp. Columns:
    Topic,Heat,Label,Subject,Category,URL,Timestamp.

    Args:
        entry_count: Maximum number of real-time entries to record.
    """
    json_data = get_hot_search_feed()
    if json_data is None:
        print('Failed to retrieve Weibo hot search list.')
        return

    timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
    current_date_dir = time.strftime('HotSearchData/%Y-%m-%d')
    output_filename = f"{current_date_dir}/{time.strftime('%Y%m%d_%H%M%S')}.txt"
    try:
        os.makedirs(current_date_dir, exist_ok=True)
    except OSError as e:
        print(f"Directory creation warning: {e}")

    # Accumulate output lines in a list and join once — avoids
    # quadratic string concatenation.
    lines = ["Topic,Heat,Label,Subject,Category,URL,Timestamp"]

    # Pinned (government) topic has no heat value; original code
    # assumed 'hotgov' always exists, which raised KeyError when absent.
    hotgov = json_data.get('hotgov') or {}
    pinned_topic = hotgov.get('word', '').strip('#')
    if pinned_topic:
        lines.append(f"{pinned_topic},,Pinned,,,,{timestamp}")

    entries = json_data.get('realtime', [])[:entry_count]
    for idx, entry in enumerate(entries):
        rank = idx + 1
        topic = entry.get('word', '')
        encoded_topic = quote(topic)
        item_url = f"https://s.weibo.com/weibo?q={encoded_topic}&t=31&band_rank={rank}&Refer=top"
        heat_value = entry.get('raw_hot', '')
        tag = entry.get('label_name', '')
        subject_tag = entry.get('subject_label', '')
        # Categories may contain commas; swap for '|' to keep columns intact.
        entry_category = entry.get('category', '').replace(',', '|')
        # Entries without a heat value (e.g. ads) are skipped.
        if heat_value:
            lines.append(f"{topic},{heat_value},{tag},{subject_tag},{entry_category},{item_url},{timestamp}")

    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))
    print(f"Data saved to: {output_filename}")
Execution Script
if __name__ == '__main__':
    # Fetch and persist the top 50 hot-search entries.
    top_n = 50
    process_hot_search(top_n)