Home > Tools > Content

Extracting Danmaku and Comments from Bilibili

Tools May 18 22

Extracting Danmaku and Comments from Bilibili

Scraping Danmaku

To extract danmaku from Bilibili, follow these steps:

Analyze the Bilibili page: open the browser's developer tools with F12 and switch to the Network tab, which lists the requests that load most of the page's content.
Identify the necessary parameters like aid and cid for the target Bilibili URL.
Based on the analysis, modify the code to download the required content.


import re
import requests

def fetch_danmaku_data(url, file_counter):
    """Download the danmaku payload at *url* and save it as ``<file_counter>.so``.

    Args:
        url: Address of the danmaku list to download.
        file_counter: Stem of the output file name; ``.so`` is appended and
            the file is written relative to the current working directory.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(url, timeout=30)
    # Fail before touching the disk so an error page is never saved and then
    # reported as a successful download.
    response.raise_for_status()
    filename = f"{file_counter}.so"
    with open(filename, "wb") as file:
        file.write(response.content)
    print('File downloaded successfully!')
    print('-----------')

def calculate_entries():
    """Count the ``>...<`` fragments on each line of ``danmaku_data.so``.

    For every line of the file the fragment count is printed, followed by a
    separator, and the module-level ``total_entries`` is set to that count
    minus two. After the call it therefore holds the value computed for the
    last line. If the file has no lines, ``total_entries`` is left untouched.
    """
    global total_entries
    with open('danmaku_data.so', "r", encoding='utf-8') as source:
        rows = source.readlines()
    for row in rows:
        # Only the number of fragments matters here, not their text.
        fragment_count = len(re.findall('>(.*?)<', row))
        print(f'Total elements in list: {fragment_count}')
        print('------------------')
        total_entries = fragment_count - 2

def process_danmaku():
    """Copy danmaku text from ``./danmaku_data.so`` to ``./processed_danmaku.txt``.

    Calls ``calculate_entries()`` to set the module-level ``total_entries``,
    then takes the ``>...<`` fragments of the first line of the input file
    and writes every second fragment, from index 16 up to and including
    ``total_entries``, to the output file, one per line.

    Raises:
        FileNotFoundError: If ``./danmaku_data.so`` does not exist.
        IndexError: If ``total_entries`` points past the fragments found on
            the first line.
    """
    global total_entries
    # Context managers guarantee both files are flushed and closed, even if
    # an exception is raised part-way through.
    with open('./processed_danmaku.txt', "w", encoding='utf-8') as output_file, \
            open('./danmaku_data.so', "r", encoding='utf-8') as input_file:
        lines = input_file.readlines()
        calculate_entries()
        # Only the first line is processed; this matches the original loop,
        # which always broke out after its first iteration.
        for line in lines[:1]:
            matches = re.findall('>(.*?)<', line)
            # NOTE(review): starting at 16 and stepping by 2 presumably skips
            # the XML header fields and the empty matches between adjacent
            # tags - confirm against an actual danmaku file.
            for i in range(16, total_entries + 1, 2):
                output_file.write(matches[i])
                output_file.write('\n')
    print('Danmaku processing completed!')

if __name__ == '__main__':
    # The stem must be 'danmaku_data' so the download lands in
    # danmaku_data.so, which is the file process_danmaku() reads.
    fetch_danmaku_data('https://api.bilibili.com/x/v1/dm/list.so?oid=314105897', 'danmaku_data') # Example video ID
    # fetch_danmaku_data('https://api.bilibili.com/x/v1/dm/list.so?oid=314094543', 'danmaku_2') # Another video ID
    # fetch_danmaku_data('https://api.bilibili.com/x/v1/dm/list.so?oid=314116841', 'danmaku_3') # Third video ID
    process_danmaku()

Scraping Comments

Similarly, we can scrape comments from Bilibili by identifying the required content and implementing appropriate extraction methods.


import requests

def _write_comment_thread(output_file, comment):
    """Write one top-level comment and its replies to *output_file*."""
    username = comment['member']['uname']
    comment_text = comment['content']['message']
    output_file.write(f"{username}:{comment_text}\n")

    # A comment without replies (missing or non-iterable 'replies') ends up
    # here as TypeError/KeyError; a blank line is written in that case.
    try:
        for reply in comment['replies']:
            output_file.write(f"  {reply['member']['uname']}:")
            output_file.write(f"{reply['content']['message']}\n")
    except (TypeError, KeyError):
        output_file.write("\n")


def fetch_comments():
    """Download the comments of one Bilibili video into ``comments.txt``.

    Requests the comment API page by page until the response's
    ``data.cursor.is_end`` flag is no longer ``False``, writing each comment
    as ``username:text`` followed by its replies indented by two spaces.

    Prints ``Comment scraping completed`` on success, or ``Program stopped``
    if a response does not have the expected structure. The ``cookie`` and
    ``user-agent`` header placeholders must be replaced before running.

    Raises:
        KeyError: If the first response lacks ``data`` or ``replies``.
    """
    # Headers need to include your own cookie and user-agent
    headers = {
        "cookie": "replace_with_your_own_cookie",
        'accept-language': 'zh-CN,zh;q=0.9',
        'referer': 'https://www.bilibili.com/video/your_video_url',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'script',
        'sec-fetch-mode': 'no-cors',
        'sec-fetch-site': 'same-site',
        'user-agent': 'replace_with_your_own_user_agent'
    }

    def fetch_page(cursor):
        """Return the decoded JSON body of the comment page at *cursor*."""
        # API endpoint for comments
        url = f"https://api.bilibili.com/x/v2/reply/main?p=p&next={cursor}&type=1&oid=204809650&mode=3&plat=1"
        return requests.get(url=url, headers=headers).json()

    # The context manager closes the file on every exit path.
    with open('comments.txt', "w", encoding='utf-8') as output_file:
        output_file.write("Comments" + "\n")
        output_file.write("\n")

        response = fetch_page(0)
        comments_data = response['data']['replies']

        try:
            for comment in comments_data:
                _write_comment_thread(output_file, comment)

            # The first request used next=0; follow-up requests start at
            # next=2, as in the original script.
            page = 1
            while response['data']['cursor']['is_end'] is False:
                page += 1
                response = fetch_page(page)
                for comment in response['data']['replies']:
                    _write_comment_thread(output_file, comment)

            print('Comment scraping completed')
        except (TypeError, KeyError):
            print("Program stopped")

# Run the comment scraper only when this file is executed as a script.
if __name__ == "__main__":
    fetch_comments()

Back to List

Prev: Entity Framework: Data Mapping, Query Execution, and State Management Techniques

Next: Chinese Localization of GitLab: Implementation Guide

Efficient Usage of HTTP Client in IntelliJ IDEA

IntelliJ IDEA incorporates a versatile HTTP client tool, enabling developers to interact with RESTful services and APIs effectively within the editor. This functionality streamlines workflows, replac...

Installing CocoaPods on macOS Catalina (10.15) Using a User-Managed Ruby

System Ruby on macOS 10.15 frequently fails to build native gems required by CocoaPods (for example, ffi), leading to errors like: ERROR: Failed to build gem native extension checking for ffi.h... no...

Resolve PhpStorm "Interpreter is not specified or invalid" on WAMP (Windows)

Symptom PhpStorm displays: "Interpreter is not specified or invalid. Press ‘Fix’ to edit your project configuration." This occurs when the IDE cannot locate a valid PHP CLI executable or when the debu...

Fading Coder

Extracting Danmaku and Comments from Bilibili

Extracting Danmaku and Comments from Bilibili

Scraping Danmaku

Scraping Comments

Related Articles

Efficient Usage of HTTP Client in IntelliJ IDEA

Installing CocoaPods on macOS Catalina (10.15) Using a User-Managed Ruby

Resolve PhpStorm "Interpreter is not specified or invalid" on WAMP (Windows)

Leave a Comment

Copyright © fadingcoder.top

Fading Coder

Extracting Danmaku and Comments from Bilibili

Extracting Danmaku and Comments from Bilibili

Scraping Danmaku

Scraping Comments

Related Articles

Efficient Usage of HTTP Client in IntelliJ IDEA

Installing CocoaPods on macOS Catalina (10.15) Using a User-Managed Ruby

Resolve PhpStorm "Interpreter is not specified or invalid" on WAMP (Windows)

Leave a Comment | Cancel Reply

Copyright © fadingcoder.top

Leave a Comment