Fading Coder

One Final Commit for the Last Sprint

Home > Tools > Content

Extracting Danmaku and Comments from Bilibili

Tools May 18 3

Extracting Danmaku and Comments from Bilibili

Scraping Danmaku

To extract danmaku from Bilibili, follow these steps:

  1. Analyze the Bilibili page: open the developer tools with F12, then find the Network panel, where most of the page's resources are listed.
  2. Identify the necessary parameters like aid and cid for the target Bilibili URL.
  3. Based on the analysis, modify the code to download the required content.

import re
import requests

def fetch_danmaku_data(url, file_counter):
    """Download the payload at ``url`` and save it as ``<file_counter>.so``.

    The response body is written unmodified, in binary mode, to the current
    working directory.

    Args:
        url: Address of the danmaku endpoint to fetch.
        file_counter: Stem of the output file name; ``.so`` is appended.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    # Bound the wait so a stalled connection cannot hang the script forever.
    response = requests.get(url, timeout=30)
    # Fail before touching the disk: otherwise an error page would be saved
    # and still be reported as a successful download.
    response.raise_for_status()

    filename = f"{file_counter}.so"
    with open(filename, "wb") as file:
        file.write(response.content)
    print('File downloaded successfully!')
    print('-----------')

def calculate_entries():
    """Count the ``>...<`` fragments in ``danmaku_data.so``.

    For every line of the file the number of regex matches is printed and
    the module-level ``total_entries`` is set to that count minus two, so the
    value that survives is the one computed from the last line. For an empty
    file ``total_entries`` is set to 0 instead of being left undefined.

    Returns:
        int: The value stored in the global ``total_entries``.
    """
    global total_entries
    # Give the global a defined value even when the file has no lines.
    total_entries = 0
    with open('danmaku_data.so', "r", encoding='utf-8') as file:
        for line in file:
            count = len(re.findall('>(.*?)<', line))
            print(f'Total elements in list: {count}')
            print('------------------')
            # NOTE(review): the "- 2" presumably turns the match count into
            # the index of the last comment text (the final match being the
            # empty gap before the closing tag) -- confirm against real data.
            total_entries = count - 2
    return total_entries

def process_danmaku():
    """Copy the comment texts from ``./danmaku_data.so`` into a text file.

    Calls ``calculate_entries()`` to refresh the global ``total_entries``,
    then takes every second ``>...<`` fragment of the first input line,
    starting at index 16 and ending at ``total_entries``, and writes each one
    on its own line to ``./processed_danmaku.txt``. Both files are closed
    when the function returns or raises.
    """
    global total_entries
    with open('./processed_danmaku.txt', "w", encoding='utf-8') as output_file, \
            open('./danmaku_data.so', "r", encoding='utf-8') as input_file:
        lines = input_file.readlines()
        calculate_entries()
        # Only the first line is parsed; later lines are ignored.
        for line in lines[:1]:
            matches = re.findall('>(.*?)<', line)
            # total_entries is derived from the LAST line of the file, so
            # clamp it to what this line actually contains instead of
            # running off the end of the list.
            last_index = min(total_entries, len(matches) - 1)
            # NOTE(review): 16 is presumably where the first comment text
            # lands after the fixed header fields, and the step of 2 skips
            # the empty fragments between adjacent tags -- confirm.
            for i in range(16, last_index + 1, 2):
                output_file.write(matches[i])
                output_file.write('\n')
    print('Danmaku processing completed!')

if __name__ == '__main__':
    # fetch_danmaku_data() appends '.so' to its second argument and
    # process_danmaku() reads 'danmaku_data.so', so the stem has to be
    # 'danmaku_data' for the download to be the file that gets processed.
    fetch_danmaku_data('https://api.bilibili.com/x/v1/dm/list.so?oid=314105897', 'danmaku_data') # Example video ID
    # Other example video IDs; swap one in above to process a different video.
    # fetch_danmaku_data('https://api.bilibili.com/x/v1/dm/list.so?oid=314094543', 'danmaku_2') # Another video ID
    # fetch_danmaku_data('https://api.bilibili.com/x/v1/dm/list.so?oid=314116841', 'danmaku_3') # Third video ID
    process_danmaku()

Scraping Comments

Similarly, we can scrape comments from Bilibili by identifying the required content and implementing appropriate extraction methods.


import requests

def _write_comment_page(output_file, comments):
    """Write one page of comments, each followed by its indented replies."""
    for comment in comments:
        username = comment['member']['uname']
        comment_text = comment['content']['message']
        output_file.write(f"{username}:{comment_text}\n")

        # Handle replies to comments. When 'replies' is missing or cannot be
        # iterated (presumably None for a comment without replies), a blank
        # line is written instead.
        try:
            for reply in comment['replies']:
                output_file.write(f"  {reply['member']['uname']}:")
                output_file.write(f"{reply['content']['message']}\n")
        except (TypeError, KeyError):
            output_file.write("\n")


def fetch_comments():
    """Download the comments of one video and write them to ``comments.txt``.

    The first page is requested with ``next=0``; further pages are requested
    with an increasing ``next`` value until the response's
    ``data.cursor.is_end`` flag is true. Each comment is written as
    ``username:text`` and each reply as an indented line below it.

    A ``TypeError``/``KeyError`` raised while writing or paging (an
    unexpected response shape) ends the run with the message
    "Program stopped". The output file is closed in every case.

    Raises:
        KeyError: If the first response lacks ``data`` or ``replies``.
    """
    # Headers need to include your own cookie and user-agent
    headers = {
        "cookie": "replace_with_your_own_cookie",
        'accept-language': 'zh-CN,zh;q=0.9',
        'referer': 'https://www.bilibili.com/video/your_video_url',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'script',
        'sec-fetch-mode': 'no-cors',
        'sec-fetch-site': 'same-site',
        'user-agent': 'replace_with_your_own_user_agent'
    }

    def fetch_page(next_value):
        # API endpoint for comments; only the 'next' parameter varies.
        url = f"https://api.bilibili.com/x/v2/reply/main?p=p&next={next_value}&type=1&oid=204809650&mode=3&plat=1"
        # The body is decoded as JSON because the caller subscripts it.
        return requests.get(url=url, headers=headers).json()

    with open('comments.txt', "w", encoding='utf-8') as output_file:
        output_file.write("Comments" + "\n")
        output_file.write("\n")

        response = fetch_page(0)
        comments_data = response['data']['replies']

        try:
            _write_comment_page(output_file, comments_data)

            page = 1
            while not response['data']['cursor']['is_end']:
                page += 1
                response = fetch_page(page)
                _write_comment_page(output_file, response['data']['replies'])

            print('Comment scraping completed')
        except (TypeError, KeyError):
            print("Program stopped")

# Run the comment scraper only when this file is executed as a script.
if __name__ == '__main__':
    fetch_comments()

Related Articles

Efficient Usage of HTTP Client in IntelliJ IDEA

IntelliJ IDEA incorporates a versatile HTTP client tool, enabling developers to interact with RESTful services and APIs effectively within the editor. This functionality streamlines workflows, replac...

Resolve PhpStorm "Interpreter is not specified or invalid" on WAMP (Windows)

Symptom PhpStorm displays: "Interpreter is not specified or invalid. Press ‘Fix’ to edit your project configuration." This occurs when the IDE cannot locate a valid PHP CLI executable or when the debu...

Capturing Android Screenshots and Screen Recordings with ADB

Two practical ways to grab images and videos from an Android device: Mirror the phone display to a computer and use desktop tools for screenshots and GIFs Use ADB commands (no UI mirroring required)...

Leave a Comment

Anonymous

◎Feel free to join the discussion and share your thoughts.