Automating Huya Video Download with Selenium
Exploring the use of Selenium to extract video URLs and save them locally, focusing on videos under five minutes in duration from the first page only.
Selenium is preferred over requests due to challenges such as complex data structures, encrypted APIs, or difficult-to-determine video URL patterns.
Install the Selenium library and set up headless mode. Details on driver setup can be found in another blog: https://www.cnblogs.com/pfeiliu/p/12275239.html
Code example:
import re

import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
def _build_chrome_options():
    """Return Chrome options with the headless and disable-gpu flags set."""
    opts = Options()
    opts.add_argument('--headless')
    opts.add_argument('--disable-gpu')
    return opts


# Shared browser configuration, read by VideoScraper when it starts Chrome.
chrome_options = _build_chrome_options()
class VideoScraper:
    """Scrape short videos from a Huya listing page and save them locally.

    A Chrome driver is opened on ``url`` at construction time. ``start()``
    reads the listing currently loaded in the driver, keeps the entries whose
    displayed duration is shorter than ``MAX_DURATION_SECONDS``, visits each
    detail page, and downloads the file referenced by the page's video
    element into the current working directory.
    """

    # Sources of every video saved so far. Class-level, so the list is shared
    # by all instances (kept that way for backward compatibility).
    video_urls = []

    # chromedriver location used when no ``driver_path`` is passed.
    DEFAULT_DRIVER_PATH = (
        r"C:\Program Files\python\Lib\site-packages\selenium\webdriver\chrome\chromedriver.exe"
    )

    # A video is downloaded only if it is strictly shorter than this.
    MAX_DURATION_SECONDS = 5 * 60

    # Seconds to wait on the download connection before giving up.
    REQUEST_TIMEOUT = 30

    # Bytes written to disk per iteration while streaming a download.
    CHUNK_SIZE = 64 * 1024

    # Characters that are invalid in file names on common file systems.
    _UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')

    def __init__(self, url, driver_path=None):
        """Start Chrome and load the listing page.

        Args:
            url: Listing page to scrape.
            driver_path: Path to the chromedriver executable; defaults to
                ``DEFAULT_DRIVER_PATH``.
        """
        self.driver = webdriver.Chrome(
            executable_path=driver_path or self.DEFAULT_DRIVER_PATH,
            options=chrome_options,
        )
        self.driver.get(url)

    def __del__(self):
        """Report what was collected and shut the browser down."""
        print("Scraping complete.", len(VideoScraper.video_urls), VideoScraper.video_urls)
        # ``driver`` is missing when __init__ failed before assigning it.
        driver = getattr(self, "driver", None)
        if driver is not None:
            # quit() ends the whole driver session, not just the open window.
            driver.quit()

    def start(self):
        """Run the scrape on the page loaded by the constructor."""
        self.collect_video_links()

    def collect_video_links(self):
        """Download every sufficiently short video on the loaded listing."""
        link_elements = self.driver.find_elements(
            By.XPATH, '//a[@class="video-wrap statpid"]'
        )
        duration_elements = self.driver.find_elements(
            By.XPATH, '//span[@class="video-duration"]'
        )
        # Copy the plain strings out first so nothing is read from the
        # listing after the driver has navigated to a detail page.
        candidates = [
            (link.get_attribute('href'), element.text)
            for link, element in zip(link_elements, duration_elements)
        ]
        for url, duration in candidates:
            if url and self._is_short(duration):
                self.extract_video_info(url, duration)

    def extract_video_info(self, url, duration):
        """Open a detail page, locate its video source, and download it.

        Pages without a usable video source, and downloads that fail, are
        reported and skipped so the remaining videos are still processed.

        Args:
            url: Detail page of one video.
            duration: Duration text shown on the listing, e.g. ``"03:20"``.
        """
        self.driver.get(url)
        video_elements = self.driver.find_elements(By.XPATH, '//video')
        video_source = video_elements[0].get_attribute('src') if video_elements else None
        if not video_source:
            print(f'No video source found at {url}, skipping')
            return

        title_elements = self.driver.find_elements(By.XPATH, '//h1[@class="video-title"]')
        title = title_elements[0].text if title_elements else ''
        print(f'Video source: {video_source}, Title: {title}, Duration: {duration}')

        try:
            self.save_video(video_source, title, duration)
        except requests.RequestException as error:
            print(f'Failed to download {video_source}: {error}')
            return
        VideoScraper.video_urls.append(video_source)

    def save_video(self, source, title, duration):
        """Stream ``source`` into an ``.mp4`` file in the working directory.

        Args:
            source: Direct URL of the video file.
            title: Video title; sanitised before use in the file name.
            duration: Duration text; becomes part of the file name.

        Raises:
            requests.RequestException: On connection errors, timeouts, or a
                non-success HTTP status.
        """
        filename = self._build_filename(title, duration)
        with requests.get(source, stream=True, timeout=self.REQUEST_TIMEOUT) as response:
            # Fail before creating the file so an error page is never saved.
            response.raise_for_status()
            with open(filename, 'wb') as file:
                for chunk in response.iter_content(chunk_size=self.CHUNK_SIZE):
                    file.write(chunk)
        print(f'{filename} saved successfully')

    @staticmethod
    def _duration_to_seconds(text):
        """Convert ``SS``, ``MM:SS`` or ``HH:MM:SS`` text to seconds.

        Returns ``None`` when the text does not have that shape.
        """
        parts = text.strip().split(':')
        if len(parts) > 3:
            return None
        try:
            numbers = [int(part) for part in parts]
        except ValueError:
            return None
        if any(number < 0 for number in numbers):
            return None
        seconds = 0
        for number in numbers:
            seconds = seconds * 60 + number
        return seconds

    @classmethod
    def _is_short(cls, duration):
        """Return True when ``duration`` parses and is under the limit."""
        seconds = cls._duration_to_seconds(duration)
        return seconds is not None and seconds < cls.MAX_DURATION_SECONDS

    @classmethod
    def _build_filename(cls, title, duration):
        """Build the output file name with unsafe characters neutralised."""
        safe_title = cls._UNSAFE_FILENAME_CHARS.sub('_', title).strip() or 'untitled'
        safe_duration = cls._UNSAFE_FILENAME_CHARS.sub('', duration).strip()
        return f"video_{safe_title}_{safe_duration}.mp4"
if __name__ == '__main__':
    # Listing page handed to the scraper when the file is run as a script.
    listing_page = 'https://v.huya.com/cat/7'
    huya_scraper = VideoScraper(listing_page)
    huya_scraper.start()