Web Scraping Baidu Search Results with Python
Method 1: Automated Browser Interaction with Selenium
The first approach involves using Selenium WebDriver to automate browser interactions. This method navigates to the Baidu homepage, locates the search input field, submits a query, and extracts the results.
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
def scrape_baidu_via_automation():
    """Search Baidu for 'Python web scraping' through a live Chrome session.

    Opens the Baidu homepage, types the query into the search box, submits
    it, then prints and navigates to the first result whose title contains
    'Python'. The browser is always closed, even if a lookup fails.
    """
    driver = webdriver.Chrome()
    try:
        driver.get('https://www.baidu.com')
        # 'kw' is the id of Baidu's search input field.
        search_box = driver.find_element(By.ID, 'kw')
        search_box.send_keys('Python web scraping')
        search_box.send_keys(Keys.RETURN)
        # Fixed pause for the results to render; a WebDriverWait on
        # '//h3/a' would be more robust, but this keeps the example short.
        time.sleep(3)
        result_links = driver.find_elements(By.XPATH, '//h3/a')
        for link in result_links:
            title_text = link.text
            if 'Python' in title_text:
                target_url = link.get_attribute('href')
                print(f'Title: {title_text}')
                print(f'URL: {target_url}')
                # Navigating away invalidates the remaining elements,
                # so we must stop iterating here.
                driver.get(target_url)
                break
    finally:
        # Previously quit() was unreachable on error, leaking the browser.
        driver.quit()


if __name__ == '__main__':
    scrape_baidu_via_automation()
Method 2: Direct URL Request with Selenium
An alternative Selenium approach bypasses the homepage by directly accessing the search result URL. This requires constructing the proper query string beforehand.
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
def scrape_baidu_direct_url():
    """Fetch Baidu results by loading the search-results URL directly.

    Builds the '/s?wd=' URL for a fixed query, loads it in Chrome, and
    prints every result headline with its link. The browser is closed
    even if an error occurs mid-scrape.
    """
    from urllib.parse import quote  # stdlib; percent-encodes all reserved chars

    query = 'Python web scraping'
    # quote() yields '%20' for spaces (same as the old str.replace), but
    # also escapes characters like '&' or '#' that would otherwise corrupt
    # the query string for other search terms.
    search_url = f'https://www.baidu.com/s?wd={quote(query)}'
    driver = webdriver.Chrome()
    try:
        driver.get(search_url)
        time.sleep(3)  # crude wait for the results page to render
        results = driver.find_elements(By.XPATH, '//h3/a')
        for item in results:
            headline = item.text
            link = item.get_attribute('href')
            print(f'{headline}: {link}')
    finally:
        driver.quit()


if __name__ == '__main__':
    scrape_baidu_direct_url()
Method 3: HTTP Requests with Regular Expressions
For better performance, using the requests library combined with regular expressions offers a lightweight solution without browser overhead.
import re
import requests
def fetch_search_results(keyword):
    """Fetch a Baidu results page and return the cleaned result titles.

    Sends a GET to https://www.baidu.com/s with the query in the 'wd'
    parameter, extracts the text inside each result's <h3><a>...</a></h3>
    heading, and strips any nested markup (e.g. <em> highlights) from it.

    Args:
        keyword: Search term to submit.

    Returns:
        list[str]: Plain-text result titles, in page order.
    """
    base_url = 'https://www.baidu.com/s'
    params = {'wd': keyword}
    # Without a browser-like User-Agent Baidu may serve a block page.
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(base_url, params=params, headers=headers)
    html_content = response.text
    # Result titles sit inside <h3 ...><a ...>title</a></h3>; DOTALL lets
    # '.*?' span newlines within a single result block. (The pattern as
    # published was garbled by HTML escaping; this is the intended form.)
    title_pattern = re.compile(r'<h3[^>]*>.*?<a[^>]*>(.*?)</a>.*?</h3>', re.DOTALL)
    clean_pattern = re.compile(r'<[^>]+>')  # strips tags nested inside the title
    titles = title_pattern.findall(html_content)
    return [clean_pattern.sub('', title).strip() for title in titles]
def extract_urls(html_text):
    """Return every href that follows a class="result" marker in *html_text*.

    DOTALL lets the non-greedy gap between the marker and its link span
    multiple lines of markup.
    """
    return re.findall(r'class="result".*?href="([^"]+)"', html_text, flags=re.DOTALL)
def main():
    """Prompt for a keyword, fetch its Baidu titles, and print them numbered."""
    term = input('Enter search keyword: ')
    numbered_titles = enumerate(fetch_search_results(term), start=1)
    for position, headline in numbered_titles:
        print(f'{position}. {headline}')


if __name__ == '__main__':
    main()
The regex pattern r'<h3[^>]*>.*?<a[^>]*>(.*?)</a>.*?</h3>' captures the title text within the heading tags. The clean_pattern removes any remaining HTML tags from the extracted text.