Scraping Property Listings from Lianjia Using Python Scrapy
Define a Scrapy Item class to structure the extracted real estate attributes including community identifiers, geographic locations, and transaction URLs.
import scrapy
class HousingData(scrapy.Item):
    """Container for one scraped real-estate listing.

    Declares the Scrapy Item fields populated by the spider: community
    identifiers, location text, the listing's layout and price strings,
    and the transaction URL of the detail page.
    """

    estate_name = scrapy.Field()     # community / residential-complex name
    listing_link = scrapy.Field()    # URL of the listing's detail page
    street_address = scrapy.Field()  # street-level address text from the card
    zone_name = scrapy.Field()       # district/zone label from positionInfo
    apartment_type = scrapy.Field()  # layout text, first segment of houseInfo
    total_cost = scrapy.Field()      # total price text as scraped (unit not shown here)
Configure the crawler to limit server load. Setting ROBOTSTXT_OBEY to False makes Scrapy ignore the site's robots.txt directives (note: bypassing robots.txt may violate the target site's terms of service — confirm this is acceptable before deploying), while DOWNLOAD_DELAY and CONCURRENT_REQUESTS_PER_DOMAIN throttle request volume to reduce the risk of IP blocking.
# Scrapy project settings for the Lianjia property scraper.

BOT_NAME = 'property_scraper'

SPIDER_MODULES = ['property_scraper.spiders']
NEWSPIDER_MODULE = 'property_scraper.spiders'

# NOTE(review): ignoring robots.txt may violate the target site's terms
# of service — confirm this is acceptable before running the crawler.
ROBOTSTXT_OBEY = False

# Throttle traffic to keep server load low and reduce the chance of an IP ban.
DOWNLOAD_DELAY = 3                   # seconds to wait between requests
CONCURRENT_REQUESTS_PER_DOMAIN = 4   # parallel requests per domain

# Browser-like User-Agent sent with every request.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
The spider navigates paginated search results for second-hand properties in specific districts. It extracts summary cards from listing pages, then dispatches asynchronous requests to individual property pages while preserving item state through request metadata.
import scrapy
from scrapy.http import Request
from property_scraper.items import HousingData
class RealEstateSpider(scrapy.Spider):
    """Crawl Lianjia second-hand listing pages and yield HousingData items.

    Walks a fixed number of paginated search-result pages for the
    Tongzhou district, extracts summary cards from each page, then
    follows every listing URL to pull the community name from the
    detail page, carrying the partially built item in request meta.
    """

    name = 'lianjia_crawler'
    allowed_domains = ['lianjia.com']

    def __init__(self, *args, **kwargs):
        # Bug fix: the original override never called super().__init__()
        # and swallowed all arguments, skipping scrapy.Spider's standard
        # initialisation (name binding, kwargs-to-attribute handling).
        super().__init__(*args, **kwargs)
        self.base_path = 'https://bj.lianjia.com/ershoufang/tongzhou/pg'
        self.page_limit = 3  # number of result pages to crawl

    def start_requests(self):
        """Yield one request per paginated search-results URL (pg1..pgN)."""
        for idx in range(1, self.page_limit + 1):
            url = f"{self.base_path}{idx}/"
            yield Request(url, callback=self.extract_summaries)

    def extract_summaries(self, response):
        """Parse listing cards and dispatch a detail request per listing.

        For each <li> card: fill a HousingData item from the summary
        fields, then (when a detail URL exists) request the detail page
        with the item attached via request meta.
        """
        properties = response.xpath('//ul[@class="sellListContent"]/li')
        for unit in properties:
            payload = HousingData()
            payload['estate_name'] = unit.xpath('.//div[@class="title"]/a/text()').get()
            payload['listing_link'] = unit.xpath('.//div[@class="title"]/a/@href').get()
            house_info = unit.xpath('.//div[@class="houseInfo"]/text()').get()
            # Fix: strip the whitespace the '|'-split leaves around the
            # first segment (houseInfo text looks like "X | Y | ...").
            payload['apartment_type'] = house_info.split('|')[0].strip() if house_info else None
            position_data = unit.xpath('.//div[@class="positionInfo"]/a/text()').getall()
            payload['zone_name'] = position_data[0] if position_data else None
            address_text = unit.xpath('.//div[@class="positionInfo"]/text()').get()
            payload['street_address'] = address_text.strip() if address_text else None
            price_text = unit.xpath('.//div[@class="totalPrice"]/span/text()').get()
            payload['total_cost'] = price_text
            if payload['listing_link']:
                yield Request(
                    payload['listing_link'],
                    callback=self.extract_details,
                    meta={'payload': payload},
                )

    def extract_details(self, response):
        """Refine the carried item with the detail page's community name, then yield it."""
        current = response.meta['payload']
        community = response.xpath('//a[@class="info no_resblock_a"]/text()').get()
        if community:
            current['estate_name'] = community.strip()
        yield current