Extracting Danmaku and Comments from Bilibili
Extracting Danmaku and Comments from Bilibili
Scraping Danmaku
To extract danmaku from Bilibili, follow these steps:
- Analyze the Bilibili webpage: open the browser's developer tools with F12 and switch to the Network tab, which lists the requests that load the page's content.
- Identify the necessary parameters like aid and cid for the target Bilibili URL.
- Based on the analysis, modify the code to download the required content.
import re
import requests
def fetch_danmaku_data(url, file_counter):
    """Download the danmaku payload at *url* and save it as ``<file_counter>.so``.

    Args:
        url: Full URL of the danmaku endpoint to fetch.
        file_counter: Base name (without extension) of the output file; the
            ``.so`` suffix is appended here.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of saving an error page as if it were danmaku data.
    response.raise_for_status()

    filename = f"{file_counter}.so"
    # Binary mode: the response body is stored as bytes, with no text decoding.
    with open(filename, "wb") as file:
        file.write(response.content)

    print('File downloaded successfully!')
    print('-----------')
def calculate_entries():
    """Count the ``>text<`` matches in ``danmaku_data.so`` and set ``total_entries``.

    For every line of the file the number of ``>(.*?)<`` matches is printed
    and the module-level ``total_entries`` is set to that count minus 2, so
    the value left behind is the one for the last line. ``process_danmaku``
    uses it as the inclusive upper index bound when walking the matches.

    An empty file leaves ``total_entries`` at -2 (zero matches minus 2), so
    the global is always defined after this function returns.

    Raises:
        FileNotFoundError: If ``danmaku_data.so`` is not in the working directory.
    """
    global total_entries
    # Defined up front so that an empty input file still yields a usable value.
    total_entries = -2
    with open('danmaku_data.so', "r", encoding='utf-8') as file:
        for line in file:
            match_count = len(re.findall(r'>(.*?)<', line))
            print(f'Total elements in list: {match_count}')
            print('------------------')
            total_entries = match_count - 2
def process_danmaku():
    """Extract danmaku text from ``danmaku_data.so`` into ``processed_danmaku.txt``.

    Only the first line of the input is processed. Every ``>text<`` match on
    that line is collected, and the matches at indices 16, 18, 20, ... up to
    and including ``total_entries`` (set by ``calculate_entries``) are written
    to the output file, one per line.

    NOTE(review): the start index 16 and step 2 presumably skip the XML
    header fields and the empty gaps between adjacent tags; confirm against
    a real danmaku file before changing them.

    Raises:
        FileNotFoundError: If ``danmaku_data.so`` is not in the working directory.
    """
    # Read the input first so a missing file does not leave an empty output behind.
    with open('./danmaku_data.so', "r", encoding='utf-8') as input_file:
        first_line = input_file.readline()

    # Prints the match count and sets the module-level ``total_entries``.
    calculate_entries()

    with open('./processed_danmaku.txt', "w", encoding='utf-8') as output_file:
        if first_line:
            matches = re.findall(r'>(.*?)<', first_line)
            # ``total_entries`` reflects the last line of the file; clamp it so a
            # shorter first line cannot cause an IndexError.
            last_index = min(total_entries, len(matches) - 1)
            for i in range(16, last_index + 1, 2):
                output_file.write(matches[i])
                output_file.write('\n')

    print('Danmaku processing completed!')
if __name__ == '__main__':
    # process_danmaku() reads ./danmaku_data.so, so the download has to be
    # saved under the base name 'danmaku_data' (fetch_danmaku_data appends
    # the '.so' suffix itself).
    fetch_danmaku_data('https://api.bilibili.com/x/v1/dm/list.so?oid=314105897', 'danmaku_data')  # Example video ID
    # Further example downloads; these are saved under other names and are
    # therefore not picked up by process_danmaku().
    # fetch_danmaku_data('https://api.bilibili.com/x/v1/dm/list.so?oid=314094543', 'danmaku_2') # Another video ID
    # fetch_danmaku_data('https://api.bilibili.com/x/v1/dm/list.so?oid=314116841', 'danmaku_3') # Third video ID
    process_danmaku()
Scraping Comments
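Comments can be scraped in the same way: find the comment API request in the Network tab, copy your own cookie and user-agent into the request headers, and page through the JSON responses, writing each comment and its replies to a file.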
import requests
def _write_comments(output_file, comments):
    """Write each comment in *comments*, followed by its replies, to *output_file*."""
    for comment in comments:
        username = comment['member']['uname']
        comment_text = comment['content']['message']
        output_file.write(f"{username}:{comment_text}\n")
        # Handle replies to comments
        try:
            for reply in comment['replies']:
                output_file.write(f" {reply['member']['uname']}:")
                output_file.write(f"{reply['content']['message']}\n")
        except (TypeError, KeyError):
            # 'replies' is missing or not iterable (presumably null when a
            # comment has no replies); emit a blank separator line instead.
            output_file.write("\n")


def fetch_comments():
    """Scrape the comments of one Bilibili video into ``comments.txt``.

    The first page is requested with ``next=0``. While the response cursor's
    ``is_end`` flag is false, further pages are requested with ``next`` set
    to 2, 3, 4, ... Each comment is written as ``username:message`` and each
    of its replies as `` username:message`` on the following lines.

    The ``cookie`` and ``user-agent`` headers are placeholders and must be
    replaced with your own values before running.

    A ``TypeError``/``KeyError`` while writing comments or paging (an
    unexpected response shape) stops the scrape and prints "Program stopped".

    Raises:
        requests.RequestException: On connection failure or timeout.
        KeyError, TypeError: If the very first response lacks ``data``/``replies``.
    """
    # API endpoint for comments; only the ``next`` value changes between pages.
    url_template = (
        "https://api.bilibili.com/x/v2/reply/main"
        "?p=p&next={next}&type=1&oid=204809650&mode=3&plat=1"
    )
    # Headers need to include your own cookie and user-agent
    headers = {
        "cookie": "replace_with_your_own_cookie",
        'accept-language': 'zh-CN,zh;q=0.9',
        'referer': 'https://www.bilibili.com/video/your_video_url',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'script',
        'sec-fetch-mode': 'no-cors',
        'sec-fetch-site': 'same-site',
        'user-agent': 'replace_with_your_own_user_agent'
    }
    # A timeout keeps the script from hanging forever on a stalled connection.
    timeout_seconds = 30

    # The context manager guarantees the file is flushed and closed, even when
    # a request or a lookup fails part-way through.
    with open('comments.txt', "w", encoding='utf-8') as output_file:
        output_file.write("Comments" + "\n")
        output_file.write("\n")

        response = requests.get(
            url=url_template.format(next=0), headers=headers, timeout=timeout_seconds
        ).json()
        comments_data = response['data']['replies']
        try:
            _write_comments(output_file, comments_data)
            page = 1
            while not response['data']['cursor']['is_end']:
                page += 1
                response = requests.get(
                    url=url_template.format(next=page),
                    headers=headers,
                    timeout=timeout_seconds,
                ).json()
                _write_comments(output_file, response['data']['replies'])
            print('Comment scraping completed')
        except (TypeError, KeyError):
            print("Program stopped")
# Script entry point: run the comment scraper only when executed directly.
if __name__ == "__main__":
    fetch_comments()