Fading Coder

One Final Commit for the Last Sprint

Home > Tech > Content

Adapting a Selenium Web Scraper for Windows Environments

Tech 1

To run the web scraping script on a Windows operating system, specific environment configurations are required. Begin by installing the Selenium bindings via the Python package manager.

pip install selenium

Verify the installation by attempting to import the module in a Python shell. No errors should be reported.

Subsequently, the PhantomJS binary must be integrated. After downloading the distribution, extract the executable and place it within the Python Scripts directory (e.g., C:\Python27\Scripts\phantomjs.exe) to ensure it is discoverable.

Two primary modifications are necessary to adapt the logic for Windows.

  1. Driver Initialization: Explicitly define the path to the executable when instantiating the webdriver object.
self.main_browser = webdriver.PhantomJS(executable_path=r'C:\Python27\Scripts\phantomjs.exe')
self.detail_browser = webdriver.PhantomJS(executable_path=r'C:\Python27\Scripts\phantomjs.exe')
  2. Path Handling: Windows file systems may truncate long directory names, causing errors during creation. Appending a timestamp or unique suffix to the directory name resolves this issue.
output_folder = base_path + '/' + str(album_info['id']) + '/' + album_info['title'] + str(datetime.date(2017, 3, 17))

The following is the refactored code, optimized for Python 3 and Windows compatibility.

# -*- coding: utf-8 -*-

import os
import sys
import time
import uuid
import datetime
import urllib.request
from selenium import webdriver

class TaobaoScraper:
    """Scrape model profiles and album images from mm.taobao.com via PhantomJS.

    Two separate PhantomJS instances are held: ``main_browser`` walks the
    listing pages while ``detail_browser`` navigates profile and album
    pages, so detail navigation never disturbs the listing iteration.
    """

    def __init__(self):
        self.start_page = 1
        self.root_dir = 'Taobao_Data'
        # Initialize browsers with explicit paths for Windows: the driver
        # is not assumed to be on PATH (see the article text above).
        driver_path = r'C:\Python27\Scripts\phantomjs.exe'
        self.main_browser = webdriver.PhantomJS(executable_path=driver_path)
        self.detail_browser = webdriver.PhantomJS(executable_path=driver_path)

    def log_message(self, enabled, message):
        """Print *message* when *enabled* is truthy; otherwise stay silent."""
        if enabled:
            print(message)

    def run_crawler(self, limit):
        """Crawl listing pages 1..*limit* inclusive, then shut down.

        FIX: both browsers are now quit in a ``finally`` block so the
        PhantomJS child processes are not leaked when a page raises.
        """
        try:
            for page_num in range(1, limit + 1):
                self.log_message(True, f"Processing page {page_num}")
                self.process_list_page(page_num)
        finally:
            self.main_browser.quit()
            self.detail_browser.quit()

    def process_list_page(self, page_index):
        """Open listing page *page_index* and process every profile on it."""
        target_url = f"https://mm.taobao.com//request_top_list.htm?page={page_index}"
        self.main_browser.get(target_url)
        profile_links = self.main_browser.find_elements_by_xpath('//div[@class="list-item"]/div[1]/div[1]/p/a')

        for link in profile_links:
            # The card URL points at a popup; the model_info variant is the
            # full profile page.
            profile_url = link.get_attribute('href').replace("model_card", "model_info")
            self.process_profile(profile_url)

    def process_profile(self, url):
        """Scrape one profile page: name, avatar, bio text, then its albums.

        Any failure while scraping a single profile is logged and skipped so
        one broken page does not abort the whole crawl.
        """
        self.detail_browser.get(url)

        try:
            name = self.detail_browser.find_element_by_xpath('//div[@class="mm-p-model-info-left-top"]/dl/dd/a').text
            self.log_message(True, f"Found profile: {name} at {url}")

            avatar_url = self.detail_browser.find_element_by_xpath('//div[@class="mm-p-model-info-left-top"]/dl/dt/a/img').get_attribute('src')

            bio_elements = self.detail_browser.find_elements_by_xpath('//div[@class="mm-p-info mm-p-base-info"]/ul/li')
            bio_data = "\n".join([item.text for item in bio_elements])

            save_path = os.path.join(self.root_dir, name).strip()
            self.save_profile_data(save_path, name, avatar_url, bio_data, url)

            # Access albums
            album_link = self.detail_browser.find_element_by_xpath('//ul[@class="mm-p-menu"]//a').get_attribute('href')
            self.log_message(True, f"Album link: {album_link}")

            album_list = self.fetch_album_list(album_link, name)
            self.download_images(album_list, save_path)

        except Exception as e:
            self.log_message(True, f"Error processing profile: {e}")

    def save_profile_data(self, path, name, avatar_url, bio, profile_url):
        """Create *path* if needed and write the avatar image and a bio text file."""
        if not os.path.exists(path):
            os.makedirs(path)

        # Save avatar.  FIX: was a bare ``except: pass`` that silently
        # swallowed every error (including KeyboardInterrupt); failures are
        # now logged and only Exception subclasses are caught.
        avatar_path = os.path.join(path, f"{name}.jpg")
        try:
            urllib.request.urlretrieve(avatar_url, avatar_path)
        except Exception as e:
            self.log_message(True, f"Avatar download failed: {e}")

        # Save text info
        info_path = os.path.join(path, f"{name}.txt")
        with open(info_path, 'w', encoding='utf-8') as f:
            f.write(bio)
            f.write(f"\nProfile URL: {profile_url}")

    def check_end_page(self):
        """Return False once the album pagination's "last page" marker exists.

        Presence of the ``page-end`` anchor means we are on the final page;
        its absence (lookup raises) means there may be more pages to walk.
        """
        try:
            self.detail_browser.find_element_by_xpath('//div[@class="pagination"]/a[@class="page-end"]')
            self.log_message(True, "Reached the last page of albums")
            return False
        except Exception:  # FIX: narrowed from a bare except
            return True

    def get_next_page(self):
        """Return the "next page" element of the album pagination, or None."""
        try:
            next_btn = self.detail_browser.find_element_by_xpath('//div[@class="pagination"]/a[@class="page-next J_AjaxifyTrigger"]')
            return next_btn
        except Exception:  # FIX: narrowed from a bare except
            self.log_message(True, "Album pagination complete")
            return None

    def fetch_album_list(self, base_url, user_name):
        """Walk every album-listing page under *base_url* and collect metadata.

        Returns a list of dicts with keys ``url``, ``title``, ``id`` (a
        running 1-based counter unique per album) and ``user``.

        FIX: the original assigned the *page* index as every album's id, so
        all albums on one page shared an id, while the per-album counter it
        maintained was never used; the counter is now the id.  The
        ``current_page == 0`` sentinel was replaced by a plain ``break``.
        """
        collected_albums = []
        self.detail_browser.get(base_url)

        album_id = 1
        while self.check_end_page():
            album_items = self.detail_browser.find_elements_by_xpath('//div[@class="mm-photo-cell"]/div/h4/a')
            for item in album_items:
                collected_albums.append({
                    'url': item.get_attribute('href'),
                    'title': item.text,
                    'id': album_id,
                    'user': user_name,
                })
                album_id += 1

            nav_btn = self.get_next_page()
            if nav_btn is None:
                break
            nav_btn.click()

        return collected_albums

    def download_images(self, albums, base_path):
        """Download every image of every album in *albums* under *base_path*."""
        self.log_message(True, f"Total albums found: {len(albums)}")
        if not albums:
            self.log_message(True, "No albums available for this user.")
            return

        for album in albums:
            self.detail_browser.get(album['url'])

            # Scroll to load images (the gallery lazy-loads on scroll).
            for _ in range(15):
                self.detail_browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(0.1)

            images = self.detail_browser.find_elements_by_xpath('//div[@class="mm-photoW-cell-middle"]/div/a/img')
            self.log_message(True, f"Found {len(images)} images in '{album['title']}'")

            # Create directory with timestamp suffix to avoid Windows truncation
            # issues.  FIX: the suffix was hard-coded to 2017-03-17, which
            # defeats its stated purpose; use the current date instead.
            folder_suffix = str(datetime.date.today())
            album_path = os.path.join(base_path, str(album['id']), f"{album['title']}{folder_suffix}")
            self.log_message(True, f"Target directory: {album_path}")

            if not os.path.exists(album_path):
                try:
                    os.makedirs(album_path)
                except OSError as e:
                    self.log_message(True, f"Directory creation failed: {e}")
                    continue

            for idx, img_tag in enumerate(images):
                src_url = img_tag.get_attribute('src')
                # uuid1 gives a unique, collision-free file name per image.
                file_name = os.path.join(album_path, f"{uuid.uuid1()}.jpg")

                try:
                    urllib.request.urlretrieve(src_url, file_name)
                    time.sleep(0.1)
                    self.log_message(True, f"Saved image {idx+1} to {file_name}")
                except Exception as e:
                    self.log_message(True, f"Failed to save image: {e}")
if __name__ == '__main__':
    # Entry point: build the scraper and crawl a single listing page.
    app = TaobaoScraper()
    app.run_crawler(1)

Related Articles

Comprehensive Guide to SSTI Explained with Payload Bypass Techniques

Introduction Server-Side Template Injection (SSTI) is a vulnerability in web applications where user input is improperly handled within the template engine and executed on the server. This exploit can r...

Implement Image Upload Functionality for Django Integrated TinyMCE Editor

Django’s Admin panel is highly user-friendly, and pairing it with TinyMCE, an effective rich text editor, simplifies content management significantly. Combining the two is particularly useful for bloggi...

SBUS Signal Analysis and Communication Implementation Using STM32 with Fus Remote Controller

Overview In a recent project, I utilized the SBUS protocol with the Fus remote controller to control a vehicle's basic operations, including movement, lights, and mode switching. This article is aimed...

Leave a Comment

Anonymous

◎Feel free to join the discussion and share your thoughts.