Adapting a Selenium Web Scraper for Windows Environments
To run the web scraping script on a Windows operating system, specific environment configurations are required. Begin by installing the Selenium bindings via the Python package manager.
pip install selenium
Verify the installation by attempting to import the module in a Python shell. No errors should be reported.
Subsequently, the PhantomJS binary must be integrated. After downloading the distribution, extract the executable and place it within the Python Scripts directory (e.g., C:\Python27\Scripts\phantomjs.exe) to ensure it is discoverable.
Two primary modifications are necessary to adapt the logic for Windows.
- Driver Initialization: Explicitly define the path to the executable when instantiating the webdriver object.
self.main_browser = webdriver.PhantomJS(executable_path=r'C:\Python27\Scripts\phantomjs.exe')
self.detail_browser = webdriver.PhantomJS(executable_path=r'C:\Python27\Scripts\phantomjs.exe')
- Path Handling: Windows file systems may truncate long directory names, causing errors during creation. Appending a timestamp or unique suffix to the directory name resolves this issue.
output_folder = base_path + '/' + str(album_info['id']) + '/' + album_info['title'] + str(datetime.date(2017, 3, 17))
The following is the refactored code, optimized for Python 3 and Windows compatibility.
# -*- coding: utf-8 -*-
import os
import sys
import time
import uuid
import datetime
import urllib.request
from selenium import webdriver
class TaobaoScraper:
    """Scrapes model profiles and photo albums from mm.taobao.com.

    Uses two PhantomJS browser instances: one for the listing pages and one
    for the per-profile detail pages. Downloaded data is written beneath
    ``self.root_dir``. Paths are tailored for Windows (explicit driver path,
    suffix appended to album folders to avoid name-truncation errors).

    NOTE(review): ``webdriver.PhantomJS`` is deprecated in Selenium 3.8+ and
    removed in Selenium 4; this code targets the older Selenium API
    (``find_element(s)_by_xpath``) already in use throughout the file.
    """

    def __init__(self):
        # First listing page to crawl and output root directory.
        self.start_page = 1
        self.root_dir = 'Taobao_Data'
        # Initialize browsers with an explicit executable path for Windows,
        # where phantomjs.exe is typically not on PATH.
        driver_path = r'C:\Python27\Scripts\phantomjs.exe'
        self.main_browser = webdriver.PhantomJS(executable_path=driver_path)
        self.detail_browser = webdriver.PhantomJS(executable_path=driver_path)

    def log_message(self, enabled, message):
        """Print *message* when *enabled* is truthy; otherwise do nothing."""
        if enabled:
            print(message)

    def run_crawler(self, limit):
        """Crawl listing pages 1..limit, then shut both browsers down.

        NOTE: browsers are only quit on the success path; an exception that
        escapes ``process_list_page`` would leak them (original behavior).
        """
        for page_num in range(1, limit + 1):
            self.log_message(True, f"Processing page {page_num}")
            self.process_list_page(page_num)
        self.main_browser.quit()
        self.detail_browser.quit()

    def process_list_page(self, page_index):
        """Open one listing page and process every profile linked from it."""
        target_url = f"https://mm.taobao.com//request_top_list.htm?page={page_index}"
        self.main_browser.get(target_url)
        profile_links = self.main_browser.find_elements_by_xpath('//div[@class="list-item"]/div[1]/div[1]/p/a')
        for link in profile_links:
            # The listing links point at the "card" view; rewrite to the
            # full "info" view before scraping.
            profile_url = link.get_attribute('href').replace("model_card", "model_info")
            self.process_profile(profile_url)

    def process_profile(self, url):
        """Scrape one profile page: name, avatar, bio, then all albums.

        Any failure while scraping a single profile is logged and skipped so
        one broken page does not abort the whole crawl.
        """
        self.detail_browser.get(url)
        try:
            name = self.detail_browser.find_element_by_xpath('//div[@class="mm-p-model-info-left-top"]/dl/dd/a').text
            self.log_message(True, f"Found profile: {name} at {url}")
            avatar_url = self.detail_browser.find_element_by_xpath('//div[@class="mm-p-model-info-left-top"]/dl/dt/a/img').get_attribute('src')
            bio_elements = self.detail_browser.find_elements_by_xpath('//div[@class="mm-p-info mm-p-base-info"]/ul/li')
            bio_data = "\n".join([item.text for item in bio_elements])
            save_path = os.path.join(self.root_dir, name).strip()
            self.save_profile_data(save_path, name, avatar_url, bio_data, url)
            # Access albums
            album_link = self.detail_browser.find_element_by_xpath('//ul[@class="mm-p-menu"]//a').get_attribute('href')
            self.log_message(True, f"Album link: {album_link}")
            album_list = self.fetch_album_list(album_link, name)
            self.download_images(album_list, save_path)
        except Exception as e:
            self.log_message(True, f"Error processing profile: {e}")

    def save_profile_data(self, path, name, avatar_url, bio, profile_url):
        """Persist one profile: avatar image plus a UTF-8 text summary."""
        if not os.path.exists(path):
            os.makedirs(path)
        # Save avatar — best effort: a failed download must not abort the
        # profile. Was a bare ``except:``; narrowed so Ctrl-C still works.
        avatar_path = os.path.join(path, f"{name}.jpg")
        try:
            urllib.request.urlretrieve(avatar_url, avatar_path)
        except Exception:
            pass
        # Save text info
        info_path = os.path.join(path, f"{name}.txt")
        with open(info_path, 'w', encoding='utf-8') as f:
            f.write(bio)
            f.write(f"\nProfile URL: {profile_url}")

    def check_end_page(self):
        """Return False when the album pagination's end marker is present.

        Counter-intuitively named: the True return means "keep paginating".
        The absent-element probe raises inside Selenium; was a bare
        ``except:``, narrowed to ``Exception``.
        """
        try:
            self.detail_browser.find_element_by_xpath('//div[@class="pagination"]/a[@class="page-end"]')
            self.log_message(True, "Reached the last page of albums")
            return False
        except Exception:
            return True

    def get_next_page(self):
        """Return the next-page anchor element, or None when absent."""
        try:
            next_btn = self.detail_browser.find_element_by_xpath('//div[@class="pagination"]/a[@class="page-next J_AjaxifyTrigger"]')
            return next_btn
        except Exception:
            self.log_message(True, "Album pagination complete")
            return None

    def fetch_album_list(self, base_url, user_name):
        """Walk the album pagination and collect metadata for every album.

        Returns a list of dicts with keys ``url``, ``title``, ``id`` (the
        1-based page number the album was found on) and ``user``.
        """
        collected_albums = []
        self.detail_browser.get(base_url)
        current_page = 1
        while self.check_end_page():
            # current_page is zeroed below as the "no next page" sentinel.
            if current_page == 0:
                break
            album_items = self.detail_browser.find_elements_by_xpath('//div[@class="mm-photo-cell"]/div/h4/a')
            for item in album_items:
                collected_albums.append({
                    'url': item.get_attribute('href'),
                    'title': item.text,
                    'id': current_page,
                    'user': user_name,
                })
            current_page += 1
            nav_btn = self.get_next_page()
            if nav_btn is None:
                current_page = 0
            else:
                nav_btn.click()
        return collected_albums

    def download_images(self, albums, base_path):
        """Download every image of every album into per-album directories."""
        self.log_message(True, f"Total albums found: {len(albums)}")
        if not albums:
            self.log_message(True, "No albums available for this user.")
            return
        for album in albums:
            self.detail_browser.get(album['url'])
            # Scroll repeatedly so lazily-loaded images appear in the DOM.
            for _ in range(15):
                self.detail_browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(0.1)
            images = self.detail_browser.find_elements_by_xpath('//div[@class="mm-photoW-cell-middle"]/div/a/img')
            self.log_message(True, f"Found {len(images)} images in '{album['title']}'")
            # Create directory with a date suffix to avoid Windows truncation
            # issues. NOTE(review): the date is hard-coded, not "today" — a
            # fixed suffix still disambiguates truncated titles, but confirm
            # whether datetime.date.today() was intended.
            folder_suffix = str(datetime.date(2017, 3, 17))
            album_path = os.path.join(base_path, str(album['id']), f"{album['title']}{folder_suffix}")
            self.log_message(True, f"Target directory: {album_path}")
            if not os.path.exists(album_path):
                try:
                    os.makedirs(album_path)
                except OSError as e:
                    self.log_message(True, f"Directory creation failed: {e}")
                    continue
            for idx, img_tag in enumerate(images):
                src_url = img_tag.get_attribute('src')
                # uuid1 guarantees unique file names within the album folder.
                file_name = os.path.join(album_path, f"{uuid.uuid1()}.jpg")
                try:
                    urllib.request.urlretrieve(src_url, file_name)
                    time.sleep(0.1)
                    self.log_message(True, f"Saved image {idx+1} to {file_name}")
                except Exception as e:
                    self.log_message(True, f"Failed to save image: {e}")
if __name__ == '__main__':
    # Entry point: crawl only the first listing page.
    TaobaoScraper().run_crawler(1)