Fading Coder

One Final Commit for the Last Sprint

Home > Tech > Content

Adapting a Selenium Web Scraper for Windows Environments

Tech 1

To run the web scraping script on a Windows operating system, specific environment configurations are required. Begin by installing the Selenium bindings via the Python package manager.

pip install selenium

Verify the installation by attempting to import the module in a Python shell. No errors should be reported.

Subsequently, the PhantomJS binary must be integrated. After downloading the distribution, extract the executable and place it within the Python Scripts directory (e.g., C:\Python27\Scripts\phantomjs.exe) to ensure it is discoverable.

Two primary modifications are necessary to adapt the logic for Windows.

  1. Driver Initialization: Explicitly define the path to the executable when instantiating the webdriver object.
self.main_browser = webdriver.PhantomJS(executable_path=r'C:\Python27\Scripts\phantomjs.exe')
self.detail_browser = webdriver.PhantomJS(executable_path=r'C:\Python27\Scripts\phantomjs.exe')
  2. Path Handling: Windows file systems may truncate long directory names, causing errors during creation. Appending a timestamp or unique suffix to the directory name resolves this issue.
output_folder = base_path + '/' + str(album_info['id']) + '/' + album_info['title'] + str(datetime.date(2017, 3, 17))

The following is the refactored code, optimized for Python 3 and Windows compatibility.

# -*- coding: utf-8 -*-

import os
import sys
import time
import uuid
import datetime
import urllib.request
from selenium import webdriver

class TaobaoScraper:
    """Scrape model profiles and album images from mm.taobao.com via PhantomJS.

    Two separate PhantomJS instances are held: ``main_browser`` walks the
    listing pages while ``detail_browser`` navigates profile and album
    pages, so detail navigation never disturbs the listing iteration.
    """

    def __init__(self):
        self.start_page = 1
        self.root_dir = 'Taobao_Data'
        # Initialize browsers with explicit paths for Windows: the driver
        # is not assumed to be on PATH (see the article text above).
        driver_path = r'C:\Python27\Scripts\phantomjs.exe'
        self.main_browser = webdriver.PhantomJS(executable_path=driver_path)
        self.detail_browser = webdriver.PhantomJS(executable_path=driver_path)

    def log_message(self, enabled, message):
        """Print *message* when *enabled* is truthy; otherwise stay silent."""
        if enabled:
            print(message)

    def run_crawler(self, limit):
        """Crawl listing pages 1..*limit* inclusive, then shut down.

        FIX: both browsers are now quit in a ``finally`` block so the
        PhantomJS child processes are not leaked when a page raises.
        """
        try:
            for page_num in range(1, limit + 1):
                self.log_message(True, f"Processing page {page_num}")
                self.process_list_page(page_num)
        finally:
            self.main_browser.quit()
            self.detail_browser.quit()

    def process_list_page(self, page_index):
        """Open listing page *page_index* and process every profile on it."""
        target_url = f"https://mm.taobao.com//request_top_list.htm?page={page_index}"
        self.main_browser.get(target_url)
        profile_links = self.main_browser.find_elements_by_xpath('//div[@class="list-item"]/div[1]/div[1]/p/a')

        for link in profile_links:
            # The card URL points at a popup; the model_info variant is the
            # full profile page.
            profile_url = link.get_attribute('href').replace("model_card", "model_info")
            self.process_profile(profile_url)

    def process_profile(self, url):
        """Scrape one profile page: name, avatar, bio text, then its albums.

        Any failure while scraping a single profile is logged and skipped so
        one broken page does not abort the whole crawl.
        """
        self.detail_browser.get(url)

        try:
            name = self.detail_browser.find_element_by_xpath('//div[@class="mm-p-model-info-left-top"]/dl/dd/a').text
            self.log_message(True, f"Found profile: {name} at {url}")

            avatar_url = self.detail_browser.find_element_by_xpath('//div[@class="mm-p-model-info-left-top"]/dl/dt/a/img').get_attribute('src')

            bio_elements = self.detail_browser.find_elements_by_xpath('//div[@class="mm-p-info mm-p-base-info"]/ul/li')
            bio_data = "\n".join([item.text for item in bio_elements])

            save_path = os.path.join(self.root_dir, name).strip()
            self.save_profile_data(save_path, name, avatar_url, bio_data, url)

            # Access albums
            album_link = self.detail_browser.find_element_by_xpath('//ul[@class="mm-p-menu"]//a').get_attribute('href')
            self.log_message(True, f"Album link: {album_link}")

            album_list = self.fetch_album_list(album_link, name)
            self.download_images(album_list, save_path)

        except Exception as e:
            self.log_message(True, f"Error processing profile: {e}")

    def save_profile_data(self, path, name, avatar_url, bio, profile_url):
        """Create *path* if needed and write the avatar image and a bio text file."""
        if not os.path.exists(path):
            os.makedirs(path)

        # Save avatar.  FIX: was a bare ``except: pass`` that silently
        # swallowed every error (including KeyboardInterrupt); failures are
        # now logged and only Exception subclasses are caught.
        avatar_path = os.path.join(path, f"{name}.jpg")
        try:
            urllib.request.urlretrieve(avatar_url, avatar_path)
        except Exception as e:
            self.log_message(True, f"Avatar download failed: {e}")

        # Save text info
        info_path = os.path.join(path, f"{name}.txt")
        with open(info_path, 'w', encoding='utf-8') as f:
            f.write(bio)
            f.write(f"\nProfile URL: {profile_url}")

    def check_end_page(self):
        """Return False once the album pagination's "last page" marker exists.

        Presence of the ``page-end`` anchor means we are on the final page;
        its absence (lookup raises) means there may be more pages to walk.
        """
        try:
            self.detail_browser.find_element_by_xpath('//div[@class="pagination"]/a[@class="page-end"]')
            self.log_message(True, "Reached the last page of albums")
            return False
        except Exception:  # FIX: narrowed from a bare except
            return True

    def get_next_page(self):
        """Return the "next page" element of the album pagination, or None."""
        try:
            next_btn = self.detail_browser.find_element_by_xpath('//div[@class="pagination"]/a[@class="page-next J_AjaxifyTrigger"]')
            return next_btn
        except Exception:  # FIX: narrowed from a bare except
            self.log_message(True, "Album pagination complete")
            return None

    def fetch_album_list(self, base_url, user_name):
        """Walk every album-listing page under *base_url* and collect metadata.

        Returns a list of dicts with keys ``url``, ``title``, ``id`` (a
        running 1-based counter unique per album) and ``user``.

        FIX: the original assigned the *page* index as every album's id, so
        all albums on one page shared an id, while the per-album counter it
        maintained was never used; the counter is now the id.  The
        ``current_page == 0`` sentinel was replaced by a plain ``break``.
        """
        collected_albums = []
        self.detail_browser.get(base_url)

        album_id = 1
        while self.check_end_page():
            album_items = self.detail_browser.find_elements_by_xpath('//div[@class="mm-photo-cell"]/div/h4/a')
            for item in album_items:
                collected_albums.append({
                    'url': item.get_attribute('href'),
                    'title': item.text,
                    'id': album_id,
                    'user': user_name,
                })
                album_id += 1

            nav_btn = self.get_next_page()
            if nav_btn is None:
                break
            nav_btn.click()

        return collected_albums

    def download_images(self, albums, base_path):
        """Download every image of every album in *albums* under *base_path*."""
        self.log_message(True, f"Total albums found: {len(albums)}")
        if not albums:
            self.log_message(True, "No albums available for this user.")
            return

        for album in albums:
            self.detail_browser.get(album['url'])

            # Scroll to load images (the gallery lazy-loads on scroll).
            for _ in range(15):
                self.detail_browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(0.1)

            images = self.detail_browser.find_elements_by_xpath('//div[@class="mm-photoW-cell-middle"]/div/a/img')
            self.log_message(True, f"Found {len(images)} images in '{album['title']}'")

            # Create directory with timestamp suffix to avoid Windows truncation
            # issues.  FIX: the suffix was hard-coded to 2017-03-17, which
            # defeats its stated purpose; use the current date instead.
            folder_suffix = str(datetime.date.today())
            album_path = os.path.join(base_path, str(album['id']), f"{album['title']}{folder_suffix}")
            self.log_message(True, f"Target directory: {album_path}")

            if not os.path.exists(album_path):
                try:
                    os.makedirs(album_path)
                except OSError as e:
                    self.log_message(True, f"Directory creation failed: {e}")
                    continue

            for idx, img_tag in enumerate(images):
                src_url = img_tag.get_attribute('src')
                # uuid1 gives a unique, collision-free file name per image.
                file_name = os.path.join(album_path, f"{uuid.uuid1()}.jpg")

                try:
                    urllib.request.urlretrieve(src_url, file_name)
                    time.sleep(0.1)
                    self.log_message(True, f"Saved image {idx+1} to {file_name}")
                except Exception as e:
                    self.log_message(True, f"Failed to save image: {e}")
if __name__ == '__main__':
    # Entry point: build the scraper and crawl a single listing page.
    app = TaobaoScraper()
    app.run_crawler(1)

Related Articles

Comprehensive Guide to SSTI Explained with Payload Bypass Techniques

Introduction Server-Side Template Injection (SSTI) is a vulnerability in web applications where user input is improperly handled within the template engine and executed on the server. This exploit can r...

Implement Image Upload Functionality for Django Integrated TinyMCE Editor

Django’s Admin panel is highly user-friendly, and pairing it with TinyMCE, an effective rich text editor, simplifies content management significantly. Combining the two is particularly useful for bloggi...

SBUS Signal Analysis and Communication Implementation Using STM32 with Fus Remote Controller

Overview In a recent project, I utilized the SBUS protocol with the Fus remote controller to control a vehicle's basic operations, including movement, lights, and mode switching. This article is aimed...

Leave a Comment

Anonymous

◎Feel free to join the discussion and share your thoughts.