Home > Tech > Content

Web Scraping Baidu Search Results with Python

Tech 1

Method 1: Automated Browser Interaction with Selenium

The first approach involves using Selenium WebDriver to automate browser interactions. This method navigates to the Baidu homepage, locates the search input field, submits a query, and extracts the results.

import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

def scrape_baidu_via_automation():
    """Search Baidu through its homepage and open the first matching result.

    Launches Chrome, types the fixed query 'Python web scraping' into the
    search box (element id 'kw'), submits it with RETURN, and collects the
    anchors nested directly inside <h3> elements. The first anchor whose
    text contains 'Python' is printed with its URL and then opened in the
    same browser session; remaining results are ignored.

    The browser is always shut down, even when an element lookup or a
    navigation raises.
    """
    driver = webdriver.Chrome()
    try:
        driver.get('https://www.baidu.com')

        search_box = driver.find_element(By.ID, 'kw')
        search_box.send_keys('Python web scraping')
        search_box.send_keys(Keys.RETURN)

        # Fixed pause so the results page has time to render before it is queried.
        time.sleep(3)

        result_links = driver.find_elements(By.XPATH, '//h3/a')

        for link in result_links:
            title_text = link.text
            if 'Python' in title_text:
                target_url = link.get_attribute('href')
                print(f'Title: {title_text}')
                print(f'URL: {target_url}')
                driver.get(target_url)
                # Only the first matching result is followed.
                break
    finally:
        # Quit in `finally` so a failure above cannot leak the Chrome process.
        driver.quit()

# Run the homepage-automation demo only when this snippet is executed as a script.
if __name__ == '__main__':
    scrape_baidu_via_automation()

Method 2: Direct URL Request with Selenium

An alternative Selenium approach bypasses the homepage by directly accessing the search result URL. This requires constructing the proper query string beforehand.

import time
from selenium import webdriver
from selenium.webdriver.common.by import By

def scrape_baidu_direct_url(query='Python web scraping'):
    """Open a Baidu results URL directly and print every result heading.

    Builds ``https://www.baidu.com/s?wd=<query>`` with the query
    percent-encoded, loads it in Chrome, and prints ``headline: link`` for
    each anchor nested directly inside an <h3> element.

    Args:
        query: Search phrase to look up. Defaults to the phrase used by the
            original example.

    The browser is always shut down, even when loading or scraping raises.
    """
    from urllib.parse import quote

    # quote() escapes every reserved character (and encodes non-ASCII text as
    # UTF-8), not just spaces; safe='' also escapes '/', which must not appear
    # raw inside a query-string value.
    encoded_query = quote(query, safe='')
    search_url = f'https://www.baidu.com/s?wd={encoded_query}'

    driver = webdriver.Chrome()
    try:
        driver.get(search_url)

        # Fixed pause so the results page has time to render before it is queried.
        time.sleep(3)

        results = driver.find_elements(By.XPATH, '//h3/a')

        for item in results:
            headline = item.text
            link = item.get_attribute('href')
            print(f'{headline}: {link}')
    finally:
        # Quit in `finally` so a failure above cannot leak the Chrome process.
        driver.quit()

# Run the direct-URL demo only when this snippet is executed as a script.
if __name__ == '__main__':
    scrape_baidu_direct_url()

Method 3: HTTP Requests with Regular Expressions

For better performance, the requests library combined with regular expressions offers a lightweight alternative that avoids the overhead of launching a browser.

import re
import requests

def fetch_search_results(keyword):
    """Fetch the Baidu results page for *keyword* and return the result titles.

    Sends a GET request to ``https://www.baidu.com/s`` with ``wd=keyword``
    and a browser-like User-Agent header, then pulls the text of the first
    anchor inside every <h3> element out of the returned HTML.

    Args:
        keyword: Search phrase; requests URL-encodes it via ``params``.

    Returns:
        A list of title strings in page order, with any nested HTML tags
        removed and surrounding whitespace stripped.
    """
    base_url = 'https://www.baidu.com/s'
    params = {'wd': keyword}
    headers = {'User-Agent': 'Mozilla/5.0'}

    # Without an explicit timeout, requests can block forever on a stalled server.
    response = requests.get(base_url, params=params, headers=headers, timeout=10)
    html_content = response.text

    # Capture the inner HTML of the first <a> inside each <h3>; DOTALL lets
    # the lazy wildcards span line breaks within a heading.
    title_pattern = re.compile(r'<h3[^>]*>.*?<a[^>]*>(.*?)</a>.*?</h3>', re.DOTALL)
    # Titles may still contain inline markup such as <em>; strip any leftover tags.
    clean_pattern = re.compile(r'<[^>]+>')

    return [
        clean_pattern.sub('', title).strip()
        for title in title_pattern.findall(html_content)
    ]

def extract_urls(html_text):
    """Return the href values that follow ``class="result"`` markers.

    Each literal ``class="result"`` occurrence is paired with the nearest
    following non-empty ``href="..."`` value. The search is lazy and runs
    in DOTALL mode, so the marker and its href may sit on different lines.

    Args:
        html_text: Raw HTML to scan.

    Returns:
        A list of href strings in document order; empty when nothing matches.
    """
    return re.findall(
        r'class="result".*?href="([^"]+)"',
        html_text,
        flags=re.DOTALL,
    )

def main():
    """Prompt for a search keyword and print the fetched titles as a numbered list.

    Reads one line from standard input, passes it to
    ``fetch_search_results``, and prints each returned title prefixed with
    its 1-based position.
    """
    keyword = input('Enter search keyword: ')

    position = 0
    for heading in fetch_search_results(keyword):
        position += 1
        print(f'{position}. {heading}')

# Run the interactive requests/regex demo only when this snippet is executed as a script.
if __name__ == '__main__':
    main()

The regex pattern r'<h3[^>]*>.*?<a[^>]*>(.*?)</a>.*?</h3>' captures the title text within the heading tags. The clean_pattern removes any remaining HTML tags from the extracted text.

Tags: Python web-scraping Selenium

Back to List

Prev: Cost-Effective GPT-4 Access Using GitHub Copilot CLI Tokens

Next: Using ThreadLocal for Thread-Specific Variables and DTOs in Java Applications

Fading Coder

Web Scraping Baidu Search Results with Python

Method 1: Automated Browser Interaction with Selenium

Method 2: Direct URL Request with Selenium

Method 3: HTTP Requests with Regular Expressions

Related Articles

Understanding Strong and Weak References in Java

Comprehensive Guide to SSTI Explained with Payload Bypass Techniques

Implement Image Upload Functionality for Django Integrated TinyMCE Editor

Leave a Comment

Copyright © fadingcoder.top

Fading Coder

Web Scraping Baidu Search Results with Python

Method 1: Automated Browser Interaction with Selenium

Method 2: Direct URL Request with Selenium

Method 3: HTTP Requests with Regular Expressions

Related Articles

Understanding Strong and Weak References in Java

Comprehensive Guide to SSTI Explained with Payload Bypass Techniques

Implement Image Upload Functionality for Django Integrated TinyMCE Editor

Leave a Comment | Cancel Reply

Copyright © fadingcoder.top

Leave a Comment