Web Scraping Taobao Models: Extracting Profile Data and Images
The following code implements a single-threaded scraper for gathering profile information and photographs from a Taobao models directory. This implementation serves as a practical exercise for understanding web scraping logic and workflow, despite limitations such as the absence of multi-threading, control mechanisms, selection options, and a graphical interface.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
import time
import urllib
import urllib.request
import uuid

from selenium import webdriver
# Encoding note: Python 3 source files and ``str`` objects are Unicode already,
# so no interpreter-wide default-encoding override is needed here.  The
# Python 2 idiom ``reload(sys)`` + ``sys.setdefaultencoding('utf-8')`` is
# intentionally omitted: neither ``reload`` (as a builtin) nor
# ``sys.setdefaultencoding`` exists on Python 3 -- which the f-strings used
# throughout this script require -- so executing it would abort the script
# with a NameError before any scraping started.
class TaobaoModelScraper:
    """Single-threaded scraper for profiles and photo albums of Taobao models.

    Two PhantomJS drivers are used: ``main_driver`` keeps the listing page
    loaded while ``detail_driver`` navigates profile, album-index and album
    pages.  Everything is stored below ``base_dir``, one directory per model.

    The class relies on the legacy ``find_element(s)_by_xpath`` WebDriver
    helpers and on ``webdriver.PhantomJS``, exactly as the original script did.
    """

    def __init__(self):
        # Kept for compatibility; the paging loop in start_scraping always
        # begins at page 1 and does not read this attribute.
        self.start_page = 1
        self.base_dir = 'taobao_models_data'
        # Two separate drivers so the listing page stays loaded in one of
        # them while the other one navigates away to the detail pages.
        self.main_driver = webdriver.PhantomJS()
        self.detail_driver = webdriver.PhantomJS()

    def log_message(self, enable_log, message):
        """Print ``message`` when ``enable_log`` is truthy."""
        if enable_log:
            print(message)

    def start_scraping(self, total_pages):
        """Scrape listing pages ``1..total_pages`` and then quit both drivers.

        The drivers are released in a ``finally`` block so that a failure on
        any page does not leave browser processes running.  Exceptions raised
        while scraping still propagate to the caller.
        """
        try:
            for current_page in range(1, total_pages + 1):
                self.log_message(True, f'Processing page {current_page}')
                self.extract_model_links(current_page)
        finally:
            for driver in (self.main_driver, self.detail_driver):
                # A failing quit() must neither mask the original error nor
                # prevent the second driver from being shut down.
                try:
                    driver.quit()
                except Exception as e:
                    self.log_message(True, e)

    def extract_model_links(self, page_number):
        """Open listing page ``page_number`` and process every linked profile."""
        list_url = f'https://mm.taobao.com/json/request_top_list.htm?page={page_number}'
        self.main_driver.get(list_url)
        # Locate all profile link elements on the list page
        link_elements = self.main_driver.find_elements_by_xpath(
            '//div[@class="list-item"]/div[1]/div[1]/p/a')
        # Read every href up front; anchors without an href are skipped
        # instead of crashing on ``None.replace``.
        hrefs = [element.get_attribute('href') for element in link_elements]
        # Rewrite the "card" URL into the URL of the full profile page
        profile_urls = [href.replace("model_card", "model_info") for href in hrefs if href]
        for profile_url in profile_urls:
            self.process_model_profile(profile_url)

    def process_model_profile(self, profile_url):
        """Save one model's profile data, then download all of her albums."""
        self.detail_driver.get(profile_url)
        self.log_message(False, self.detail_driver.current_url)
        # Extract the model's name
        name_element = self.detail_driver.find_element_by_xpath(
            '//div[@class="mm-p-model-info-left-top"]/dl/dd/a')
        model_name = name_element.text
        self.log_message(True, f'Found model: {model_name} at {profile_url}')
        # Extract profile picture URL
        avatar_element = self.detail_driver.find_element_by_xpath(
            '//div[@class="mm-p-model-info-left-top"]/dl/dt/a/img')
        avatar_url = avatar_element.get_attribute('src')
        self.log_message(False, avatar_url)
        # Collect basic profile information, one list item per line
        info_items = self.detail_driver.find_elements_by_xpath(
            '//div[@class="mm-p-info mm-p-base-info"]/ul/li')
        profile_summary = ''.join(f'{item.text}\n' for item in info_items)
        # Save the collected profile data
        save_path = os.path.join(self.base_dir, model_name).strip()
        self.save_profile_data(save_path, model_name, avatar_url, profile_summary, profile_url)
        # The first link of the profile menu is used as the album index page
        album_link = self.detail_driver.find_element_by_xpath('//ul[@class="mm-p-menu"]//a')
        album_url = album_link.get_attribute('href')
        self.log_message(True, f'Album URL: {album_url}')
        # Retrieve all album links and names
        albums_data = self.fetch_album_list(album_url, model_name)
        # Download images from each album
        self.download_album_images(albums_data, save_path)

    def check_last_album_page(self):
        """Return ``True`` while album paging may continue, ``None`` otherwise.

        ``None`` is returned when a ``page-end`` pagination link is found;
        ``True`` when the lookup fails.
        """
        # NOTE(review): this probes main_driver, whereas album pages are
        # loaded in detail_driver (see fetch_album_list).  It is left
        # untouched on purpose: fetch_album_list evaluates this check *before*
        # harvesting a page, so switching drivers could skip the final page.
        # Confirm the intended markup/semantics before changing it.
        try:
            self.main_driver.find_element_by_xpath(
                '//div[@class="pagination"]/a[@class="page-end"]')
        except Exception:
            return True
        self.log_message(True, "Reached the final album page")
        return None

    def get_next_album_page(self):
        """Return the "next page" element of the album index, or ``None``."""
        try:
            return self.detail_driver.find_element_by_xpath(
                '//div[@class="pagination"]/a[@class="page-next J_AjaxifyTrigger"]')
        except Exception:
            self.log_message(True, "Album scraping completed.")
            return None

    def fetch_album_list(self, album_index_url, model_name):
        """Collect every album listed for a model.

        Returns a list of dicts with the keys ``album_title``, ``album_url``,
        ``page_number`` (1-based index page the album was found on) and
        ``model_name``.  If an error occurs it is logged and the albums
        gathered so far are returned, so the result is always a list.
        """
        album_collection = []
        try:
            self.detail_driver.get(album_index_url)
            current_album_page = 1
            while self.check_last_album_page():
                album_entries = self.detail_driver.find_elements_by_xpath(
                    '//div[@class="mm-photo-cell"]/div/h4/a')
                for entry in album_entries:
                    album_collection.append({
                        'album_title': entry.text,
                        'album_url': entry.get_attribute('href'),
                        'page_number': current_album_page,
                        'model_name': model_name,
                    })
                next_page_element = self.get_next_album_page()
                if next_page_element is None:
                    break
                next_page_element.click()
                current_album_page += 1
        except Exception as e:
            self.log_message(True, e)
        return album_collection

    def download_album_images(self, albums_list, base_path):
        """Download all images of every album into per-album directories.

        Images are stored under ``base_path/<page_number>/<album_title>/``
        with random UUID file names.  ``albums_list`` may be empty or
        ``None``; in both cases nothing is downloaded.
        """
        albums_list = albums_list or []
        total_albums = len(albums_list)
        self.log_message(True, f'Total albums found: {total_albums}')
        if not total_albums:
            self.log_message(True, 'No albums available for this model.')
            return
        scroll_script = "document.body.scrollTop = document.body.scrollHeight"
        for album in albums_list:
            self.detail_driver.get(album['album_url'])
            # Scroll to the bottom repeatedly so the page gets a chance to
            # load all of its images before they are collected.
            for _ in range(10):
                self.detail_driver.execute_script(scroll_script)
                time.sleep(0.1)
            image_elements = self.detail_driver.find_elements_by_xpath(
                '//div[@class="mm-photoW-cell-middle"]/div/a/img')
            self.log_message(True, f"{album['album_url']} contains {len(image_elements)} images")
            album_save_path = os.path.join(base_path, str(album['page_number']), album['album_title'])
            self.log_message(True, album_save_path)
            os.makedirs(album_save_path, exist_ok=True)
            image_count = 0
            for img_element in image_elements:
                image_source = img_element.get_attribute('src')
                if not image_source:
                    # Nothing to download for an <img> without a src.
                    continue
                unique_filename = os.path.join(album_save_path, f"{uuid.uuid1()}.jpg")
                urllib.request.urlretrieve(image_source, unique_filename)
                time.sleep(0.1)
                image_count += 1
                self.log_message(True, f"Saved image {image_count} from album '{album['album_title']}' to {album_save_path}")

    def save_profile_data(self, directory_path, name, avatar_url, summary, profile_url):
        """Store a model's avatar and profile summary in ``directory_path``.

        Writes ``<name>.jpg`` (downloaded from ``avatar_url``) and
        ``<name>.txt`` (``summary`` followed by the profile URL, UTF-8).

        Returns ``True`` on success; on any error the exception is logged and
        ``False`` is returned.
        """
        try:
            os.makedirs(directory_path, exist_ok=True)
            # Download and save the profile picture
            avatar_filepath = os.path.join(directory_path, f'{name}.jpg')
            urllib.request.urlretrieve(avatar_url, avatar_filepath)
            # Save the profile summary to a text file.  The file is opened in
            # text mode with an explicit encoding, so str is written directly.
            info_file = os.path.join(directory_path, f'{name}.txt')
            with open(info_file, 'w', encoding='utf-8') as info_handle:
                self.log_message(True, f'Saving profile info for {name} to {directory_path}')
                info_handle.write(summary)
                info_handle.write("Profile URL: " + profile_url)
        except Exception as e:
            self.log_message(True, e)
            return False
        return True
if __name__ == '__main__':
    # Script entry point: build the scraper and crawl only the first listing page.
    TaobaoModelScraper().start_scraping(total_pages=1)
Executing this script initiates the scraping process, which logs its progress to the console as it downloads profile information and images, organizing them into a local directory structure.