Fading Coder

One Final Commit for the Last Sprint

Home > Tech > Content

Web Scraping Taobao Models: Extracting Profile Data and Images

Tech May 18 2

The following code implements a single-threaded scraper for gathering profile information and photographs from a Taobao models directory. This implementation serves as a practical exercise for understanding web scraping logic and workflow, despite limitations such as the absence of multi-threading, control mechanisms, selection options, and a graphical interface.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
import time
import urllib
import urllib.request
import uuid

from selenium import webdriver

# No default-encoding override is needed: Python 3 (required by the f-strings
# used in this script) already treats text as Unicode, and the Python 2 idiom
# reload(sys) + sys.setdefaultencoding('utf-8') raises NameError on Python 3.

class TaobaoModelScraper:
    """Single-threaded scraper for a Taobao models directory.

    For every profile linked from a listing page the scraper stores the avatar
    and a text summary under ``<base_dir>/<model name>/`` and then downloads
    the images of each photo album into
    ``<base_dir>/<model name>/<album index page>/<album title>/``.

    Two browser sessions are used: ``main_driver`` loads the listing pages and
    ``detail_driver`` loads profile, album-index and album pages.
    """

    def __init__(self):
        self.start_page = 1
        self.base_dir = 'taobao_models_data'
        # One session for listing pages, one for profile/album pages.
        # NOTE(review): webdriver.PhantomJS and the find_element(s)_by_xpath
        # helpers used below are gone from Selenium 4 - confirm the pinned
        # selenium version before upgrading.
        self.main_driver = webdriver.PhantomJS()
        self.detail_driver = webdriver.PhantomJS()

    @staticmethod
    def _safe_name(raw_name, fallback='unnamed'):
        """Turn scraped text into a single, safe path component.

        Path separators and NUL are replaced with ``_`` so a scraped name
        cannot create nested directories or escape the output directory.
        Empty names and the special entries ``.``/``..`` yield *fallback*.
        """
        cleaned = (raw_name or '').strip()
        for forbidden in ('/', '\\', '\0'):
            cleaned = cleaned.replace(forbidden, '_')
        if cleaned in ('', '.', '..'):
            return fallback
        return cleaned

    def _find_optional(self, xpath):
        """Return the first element matching *xpath* in the detail session, or None."""
        # Imported here so the rest of the module keeps its original imports.
        from selenium.common.exceptions import NoSuchElementException
        try:
            return self.detail_driver.find_element_by_xpath(xpath)
        except NoSuchElementException:
            return None

    def log_message(self, enable_log, message):
        """Print *message* when *enable_log* is truthy."""
        if enable_log:
            print(message)

    def start_scraping(self, total_pages):
        """Scrape listing pages 1..total_pages, then shut both browsers down.

        The drivers are quit even if scraping raises, so no browser process
        is left behind.
        """
        try:
            for current_page in range(1, total_pages + 1):
                self.log_message(True, f'Processing page {current_page}')
                self.extract_model_links(current_page)
        finally:
            try:
                self.main_driver.quit()
            finally:
                self.detail_driver.quit()

    def extract_model_links(self, page_number):
        """Load one listing page and process every profile it links to.

        A failure while processing one profile is logged and the remaining
        profiles are still processed.
        """
        list_url = f'https://mm.taobao.com/json/request_top_list.htm?page={page_number}'
        self.main_driver.get(list_url)
        # Locate all profile link elements on the list page
        link_elements = self.main_driver.find_elements_by_xpath('//div[@class="list-item"]/div[1]/div[1]/p/a')

        # Read every href up front; anchors without one are skipped.
        profile_urls = []
        for element in link_elements:
            href = element.get_attribute('href')
            if href:
                # Modify the URL to access the full profile page
                profile_urls.append(href.replace("model_card", "model_info"))

        for profile_url in profile_urls:
            try:
                self.process_model_profile(profile_url)
            except Exception as e:
                self.log_message(True, f'Skipping {profile_url}: {e}')

    def process_model_profile(self, profile_url):
        """Save one model's profile data and download all of its albums.

        Raises whatever the driver raises when the name, avatar or album-menu
        element is missing from the profile page.
        """
        self.detail_driver.get(profile_url)
        self.log_message(False, self.detail_driver.current_url)

        # Extract the model's name
        name_element = self.detail_driver.find_element_by_xpath('//div[@class="mm-p-model-info-left-top"]/dl/dd/a')
        model_name = name_element.text
        self.log_message(True, f'Found model: {model_name} at {profile_url}')

        # Extract profile picture URL
        avatar_element = self.detail_driver.find_element_by_xpath('//div[@class="mm-p-model-info-left-top"]/dl/dt/a/img')
        avatar_url = avatar_element.get_attribute('src')
        self.log_message(False, avatar_url)

        # Collect basic profile information, one list item per line
        info_items = self.detail_driver.find_elements_by_xpath('//div[@class="mm-p-info mm-p-base-info"]/ul/li')
        profile_summary = ''.join(item.text + '\n' for item in info_items)

        # The scraped name becomes a directory name, so sanitise it first.
        save_path = os.path.join(self.base_dir, self._safe_name(model_name))
        self.save_profile_data(save_path, model_name, avatar_url, profile_summary, profile_url)

        # Navigate to the model's photo album page
        album_link = self.detail_driver.find_element_by_xpath('//ul[@class="mm-p-menu"]//a')
        album_url = album_link.get_attribute('href')
        self.log_message(True, f'Album URL: {album_url}')

        # Retrieve all album links and names
        albums_data = self.fetch_album_list(album_url, model_name)
        # Download images from each album
        self.download_album_images(albums_data, save_path)

    def check_last_album_page(self):
        """Return True while more album-index pages may follow, else None.

        The check runs in ``detail_driver``, the session that actually holds
        the album index page.
        NOTE(review): the "page-end" anchor is presumed to appear only on the
        final page - confirm against the live markup.
        """
        last_page_indicator = self._find_optional('//div[@class="pagination"]/a[@class="page-end"]')
        if last_page_indicator is None:
            return True
        self.log_message(True, "Reached the final album page")
        return None

    def get_next_album_page(self):
        """Return the "next page" element of the album index, or None if absent."""
        next_page_button = self._find_optional('//div[@class="pagination"]/a[@class="page-next J_AjaxifyTrigger"]')
        if next_page_button is None:
            self.log_message(True, "Album scraping completed.")
        return next_page_button

    def fetch_album_list(self, album_index_url, model_name):
        """Collect every album listed across the pages of the album index.

        Returns a list of dicts with the keys ``album_title``, ``album_url``,
        ``page_number`` (1-based index page the album was found on) and
        ``model_name``. If an error interrupts the walk it is logged and the
        albums gathered so far are returned, so the result is always a list.
        """
        album_collection = []
        try:
            self.detail_driver.get(album_index_url)

            current_album_page = 1
            while True:
                # Collect the current page before deciding whether to go on,
                # so the final page's albums are never dropped.
                album_entries = self.detail_driver.find_elements_by_xpath('//div[@class="mm-photo-cell"]/div/h4/a')
                for entry in album_entries:
                    album_collection.append({
                        'album_title': entry.text,
                        'album_url': entry.get_attribute('href'),
                        'page_number': current_album_page,
                        'model_name': model_name,
                    })

                if not self.check_last_album_page():
                    break
                next_page_element = self.get_next_album_page()
                if next_page_element is None:
                    break
                next_page_element.click()
                current_album_page += 1
        except Exception as e:
            self.log_message(True, e)
        return album_collection

    def download_album_images(self, albums_list, base_path):
        """Download the images of every album in *albums_list* below *base_path*.

        *albums_list* holds dicts as produced by ``fetch_album_list``; ``None``
        or an empty list is reported and ignored. An image without a ``src``
        or one whose download fails is logged and skipped.
        """
        total_albums = len(albums_list) if albums_list else 0
        self.log_message(True, f'Total albums found: {total_albums}')
        if not total_albums:
            self.log_message(True, 'No albums available for this model.')
            return

        # Scroll to load all images on the page
        scroll_script = "document.body.scrollTop = document.body.scrollHeight"
        for album in albums_list:
            album_url = album['album_url']
            album_title = album['album_title']
            self.detail_driver.get(album_url)
            for _ in range(10):
                self.detail_driver.execute_script(scroll_script)
                time.sleep(0.1)

            image_elements = self.detail_driver.find_elements_by_xpath('//div[@class="mm-photoW-cell-middle"]/div/a/img')
            self.log_message(True, f"{album_url} contains {len(image_elements)} images")

            # The album title is scraped text; sanitise it before using it as a directory.
            album_save_path = os.path.join(base_path, str(album['page_number']), self._safe_name(album_title))
            self.log_message(True, album_save_path)
            os.makedirs(album_save_path, exist_ok=True)

            for image_count, img_element in enumerate(image_elements, start=1):
                image_source = img_element.get_attribute('src')
                if not image_source:
                    self.log_message(True, f"Image {image_count} from album '{album_title}' has no source URL; skipped")
                    continue
                # uuid1 gives every download its own file name.
                unique_filename = os.path.join(album_save_path, f"{uuid.uuid1()}.jpg")
                try:
                    urllib.request.urlretrieve(image_source, unique_filename)
                except (OSError, ValueError) as e:
                    # URLError/HTTPError are OSError subclasses; ValueError covers malformed URLs.
                    self.log_message(True, e)
                    continue
                time.sleep(0.1)
                self.log_message(True, f"Saved image {image_count} from album '{album_title}' to {album_save_path}")

    def save_profile_data(self, directory_path, name, avatar_url, summary, profile_url):
        """Store the avatar image and a UTF-8 text summary in *directory_path*.

        Writes ``<name>.jpg`` (downloaded from *avatar_url*) and ``<name>.txt``
        (*summary* followed by the profile URL), with *name* sanitised for use
        as a file name. Returns True on success; on any error the exception is
        logged and False is returned.
        """
        try:
            os.makedirs(directory_path, exist_ok=True)
            file_stem = self._safe_name(name)

            # Download and save the profile picture
            avatar_filepath = os.path.join(directory_path, f'{file_stem}.jpg')
            urllib.request.urlretrieve(avatar_url, avatar_filepath)

            # Save the profile summary to a text file; the file handles the
            # encoding, so the text is written as str rather than bytes.
            info_file = os.path.join(directory_path, f'{file_stem}.txt')
            with open(info_file, 'w', encoding='utf-8') as info_handle:
                self.log_message(True, f'Saving profile info for {name} to {directory_path}')
                info_handle.write(summary)
                info_handle.write("Profile URL: " + profile_url)
        except Exception as e:
            self.log_message(True, e)
            return False
        return True

if __name__ == "__main__":
    # Script entry point: crawl only the first listing page.
    scraper = TaobaoModelScraper()
    scraper.start_scraping(total_pages=1)

Executing this script initiates the scraping process, which logs its progress to the console as it downloads profile information and images, organizing them into a local directory structure.

Tags: Python

Related Articles

Understanding Strong and Weak References in Java

Strong References Strong references are the most prevalent type of object referencing in Java. When an object has a strong reference pointing to it, the garbage collector will not reclaim its memory. F...

Comprehensive Guide to SSTI Explained with Payload Bypass Techniques

Introduction Server-Side Template Injection (SSTI) is a vulnerability in web applications where user input is improperly handled within the template engine and executed on the server. This exploit can r...

Implement Image Upload Functionality for Django Integrated TinyMCE Editor

Django’s Admin panel is highly user-friendly, and pairing it with TinyMCE, an effective rich text editor, simplifies content management significantly. Combining the two is particularly useful for bloggi...

Leave a Comment

Anonymous

◎Feel free to join the discussion and share your thoughts.