2026/4/18 2:54:15
网站建设
项目流程
# =============================================================================
# Article intro (recovered from garbled scrape):
# In the era of real-estate big data, accurate second-hand listing data matters
# for market analysis, investment decisions and research. This tutorial builds
# an efficient, stable Lianjia second-hand-housing crawler with a modern
# Python stack: Python 3.10, Playwright (headless browser, replacing
# Selenium), BeautifulSoup4 + lxml (HTML parsing), asyncio + aiohttp (async
# IO), Pandas/NumPy (analysis), SQLAlchemy (ORM), optional FastAPI (data API).
#
# Project layout:
#   lianjia_crawler/
#   ├── config/settings.py          # configuration
#   ├── spiders/lianjia_spider.py   # crawler core
#   ├── models/{database,models}.py # data models
#   ├── utils/{logger,proxy_manager,user_agent}.py
#   ├── middlewares/anti_anti_crawler.py
#   ├── storage/data_handler.py
#   ├── requirements.txt
#   └── main.py
#
# requirements.txt (as listed by the article):
#   playwright==1.40.0
#   beautifulsoup4==4.12.0
#   lxml==5.0.0
#   aiohttp==3.9.0
#   pandas==2.1.0
#   sqlalchemy==2.0.0
#   asyncpg==0.29.0
#   pydantic==2.5.0
#   python-dotenv==1.0.0
# =============================================================================

# config/settings.py
from pydantic_settings import BaseSettings
from typing import List


class Settings(BaseSettings):
    """Crawler-wide configuration; every field can be overridden via .env."""

    # Crawler behaviour
    MAX_CONCURRENT_REQUESTS: int = 5
    REQUEST_TIMEOUT: int = 30        # seconds, applied to the aiohttp session
    DELAY_RANGE: tuple = (1, 3)      # random politeness sleep between requests

    # Lianjia endpoints (Beijing second-hand listings; {} is the page number)
    BASE_URL: str = "https://bj.lianjia.com"
    START_URLS: List[str] = [
        "https://bj.lianjia.com/ershoufang/pg{}/",
    ]

    # Async PostgreSQL DSN (asyncpg driver)
    DATABASE_URL: str = "postgresql+asyncpg://user:password@localhost:5432/lianjia"

    # Proxy settings (pool URL unused unless USE_PROXY is enabled)
    USE_PROXY: bool = False
    PROXY_POOL_URL: str = ""

    # Logging
    LOG_LEVEL: str = "INFO"

    class Config:
        env_file = ".env"


settings = Settings()
# models/models.py
from sqlalchemy import Column, Integer, String, Float, DateTime, Text
from sqlalchemy.ext.declarative import declarative_base
from datetime import datetime

Base = declarative_base()


class HouseListing(Base):
    """ORM model for one second-hand listing row."""

    __tablename__ = "house_listings"

    id = Column(String(50), primary_key=True)   # Lianjia house id
    title = Column(String(200))
    total_price = Column(Float)                 # 万元 (10k CNY)
    unit_price = Column(Float)                  # 元/平米
    district = Column(String(50))
    region = Column(String(50))
    community = Column(String(100))
    floor = Column(String(50))
    area = Column(Float)                        # 平米
    room_type = Column(String(50))
    orientation = Column(String(50))
    decoration = Column(String(50))
    build_year = Column(Integer)
    listing_date = Column(DateTime)
    follow_count = Column(Integer)
    view_count = Column(Integer)
    tags = Column(Text)                         # JSON-encoded list of tag strings
    url = Column(String(500))
    created_at = Column(DateTime, default=datetime.now)
    updated_at = Column(DateTime, default=datetime.now, onupdate=datetime.now)

    def __repr__(self):
        return f"<HouseListing {self.title} - {self.total_price}万>"


# models/database.py
# NOTE(review): the article's database.py referenced `Base` without importing
# it (NameError at init_db); merged into one module here so the name resolves.
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
from sqlalchemy.orm import sessionmaker
from config.settings import settings


class Database:
    """Owns the async engine and session factory for the whole app."""

    def __init__(self):
        self.engine = create_async_engine(
            settings.DATABASE_URL,
            echo=False,
            future=True,
            pool_size=20,
            max_overflow=0,
        )
        self.async_session = sessionmaker(
            self.engine, class_=AsyncSession, expire_on_commit=False
        )

    async def init_db(self):
        """Create all tables declared on Base (idempotent)."""
        async with self.engine.begin() as conn:
            await conn.run_sync(Base.metadata.create_all)

    async def get_session(self) -> AsyncSession:
        """Yield a session (async-generator dependency style, e.g. FastAPI)."""
        async with self.async_session() as session:
            yield session


db = Database()
# spiders/lianjia_spider.py
import asyncio
import json
import random
import re
from datetime import datetime
from typing import Dict, List, Optional
from urllib.parse import urljoin

import aiohttp
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from pydantic import BaseModel, ValidationError

from config.settings import settings
from models.models import HouseListing
from utils.logger import setup_logger
from utils.user_agent import get_random_user_agent
from middlewares.anti_anti_crawler import AntiAntiCrawlerMiddleware

logger = setup_logger(__name__)


class HouseItem(BaseModel):
    """Validates one scraped listing before it is persisted."""

    id: str
    title: str
    total_price: float
    unit_price: float
    district: str
    region: str
    community: str
    floor: str
    area: float
    room_type: str
    # FIX: list pages never carry orientation/decoration, so these must have
    # defaults — as required fields every record would fail validation.
    orientation: str = ""
    decoration: str = ""
    build_year: Optional[int] = None
    listing_date: datetime
    follow_count: int
    view_count: int
    # FIX: stored JSON-encoded upstream (json.dumps), so the type is str,
    # not List[str] as the article declared.
    tags: str
    url: str


class LianJiaSpider:
    """Async Lianjia second-hand-housing spider (list pages + detail pages)."""

    def __init__(self):
        self.base_url = settings.BASE_URL
        self.start_urls = settings.START_URLS
        self.session = None          # aiohttp.ClientSession
        self.playwright = None
        self.browser = None
        self.context = None
        self.middleware = AntiAntiCrawlerMiddleware()

    async def init_session(self):
        """Create the aiohttp session with browser-like default headers."""
        timeout = aiohttp.ClientTimeout(total=settings.REQUEST_TIMEOUT)
        self.session = aiohttp.ClientSession(
            timeout=timeout,
            headers={
                "User-Agent": get_random_user_agent(),
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Connection": "keep-alive",
            },
        )

    async def init_browser(self):
        """Start headless Chromium with mild anti-detection flags."""
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=True,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--disable-dev-shm-usage",
                "--no-sandbox",
                "--disable-setuid-sandbox",
            ],
        )
        self.context = await self.browser.new_context(
            user_agent=get_random_user_agent(),
            viewport={"width": 1920, "height": 1080},
            ignore_https_errors=True,
        )
        # Hide the webdriver flag before any page script runs (stealth-lite).
        await self.context.add_init_script(
            """
            Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
            window.chrome = { runtime: {} };
            """
        )

    async def crawl_list_pages(self, max_pages: int = 100) -> List[Dict]:
        """Walk the paginated list pages and collect raw listing dicts."""
        listings = []
        for start_url in self.start_urls:
            for page_no in range(1, max_pages + 1):
                url = start_url.format(page_no)
                logger.info(f"爬取列表页: {url}")
                try:
                    # Playwright renders the dynamic content for us.
                    page_content = await self.fetch_with_playwright(url)
                    if not page_content:
                        logger.warning(f"页面内容为空: {url}")
                        continue
                    page_listings = await self.parse_list_page(page_content, url)
                    listings.extend(page_listings)
                    # Randomised politeness delay between pages.
                    await asyncio.sleep(random.uniform(*settings.DELAY_RANGE))
                    if not await self.has_next_page(page_content):
                        logger.info("已到达最后一页")
                        break
                except Exception as e:
                    logger.error(f"爬取列表页失败 {url}: {str(e)}")
                    continue
        return listings

    async def fetch_with_playwright(self, url: str) -> Optional[str]:
        """Render *url* in the headless browser and return its HTML (or None)."""
        page = None
        try:
            page = await self.context.new_page()
            await page.set_extra_http_headers({
                "User-Agent": get_random_user_agent(),
                "Referer": self.base_url,
                "Accept-Language": "zh-CN,zh;q=0.9",
            })
            response = await page.goto(url, wait_until="networkidle", timeout=60000)
            if response.status != 200:
                logger.error(f"请求失败状态码: {response.status}")
                return None
            # Wait until the listing container is present.
            await page.wait_for_selector(".sellListContent", timeout=30000)
            content = await page.content()
            # Best-effort: also read any JSON-LD blobs (debug only).
            additional_data = await page.evaluate(
                """
                () => {
                    const data = {};
                    const scriptTags = document.querySelectorAll('script[type="application/ld+json"]');
                    scriptTags.forEach(script => {
                        try {
                            Object.assign(data, JSON.parse(script.textContent));
                        } catch (e) {}
                    });
                    return data;
                }
                """
            )
            if additional_data:
                logger.debug(f"获取到额外数据: {additional_data}")
            return content
        except Exception as e:
            logger.error(f"Playwright获取页面失败: {str(e)}")
            return None
        finally:
            if page:
                await page.close()

    async def parse_list_page(self, html: str, page_url: str) -> List[Dict]:
        """Parse one rendered list page into listing dicts."""
        soup = BeautifulSoup(html, "lxml")
        listings = []
        house_items = soup.select('.sellListContent li[class^="clear LOGCLICKDATA"]')
        for item in house_items:
            try:
                house_data = await self.extract_house_data(item, page_url)
                if house_data:
                    listings.append(house_data)
            except Exception as e:
                logger.error(f"解析房源失败: {str(e)}")
                continue
        return listings

    async def extract_house_data(self, item, page_url: str) -> Optional[Dict]:
        """Pull the fields of one <li> card; returns None when invalid."""
        try:
            house_id = item.get("data-lj_action_housedel_id", "")
            if not house_id:
                return None
            title_elem = item.select_one(".title a")
            title = title_elem.text.strip() if title_elem else ""
            price_elem = item.select_one(".totalPrice span")
            total_price = float(price_elem.text.strip()) if price_elem else 0
            unit_price_elem = item.select_one(".unitPrice")
            unit_price_text = unit_price_elem.text.strip() if unit_price_elem else ""
            unit_match = re.search(r"(\d+)", unit_price_text)
            unit_price = float(unit_match.group(1)) if unit_match else 0
            position_elem = item.select_one(".positionInfo a")
            position_text = position_elem.text.strip() if position_elem else ""
            district, region = self.parse_position(position_text)
            community_elem = item.select_one(".houseInfo a")
            community = community_elem.text.strip() if community_elem else ""
            house_info_elem = item.select_one(".houseInfo")
            house_info = house_info_elem.text.strip() if house_info_elem else ""
            area, room_type, floor = self.parse_house_info(house_info)
            follow_elem = item.select_one(".followInfo")
            follow_text = follow_elem.text.strip() if follow_elem else ""
            follow_count, view_count = self.parse_follow_info(follow_text)
            tag_elems = item.select(".tag span")
            tags = [tag.text.strip() for tag in tag_elems]
            relative_url = title_elem.get("href", "") if title_elem else ""
            url = urljoin(self.base_url, relative_url)
            house_data = {
                "id": house_id,
                "title": title,
                "total_price": total_price,
                "unit_price": unit_price,
                "district": district,
                "region": region,
                "community": community,
                "area": area,
                "room_type": room_type,
                "floor": floor,
                "follow_count": follow_count,
                "view_count": view_count,
                "tags": json.dumps(tags, ensure_ascii=False),
                "url": url,
                "listing_date": datetime.now(),
            }
            # Validate before handing downstream.
            try:
                validated_data = HouseItem(**house_data)
                return validated_data.dict()
            except ValidationError as e:
                logger.warning(f"数据验证失败: {e}")
                return None
        except Exception as e:
            logger.error(f"提取房源数据失败: {str(e)}")
            return None

    def parse_position(self, position_text: str) -> tuple:
        """Split 'district·region' into its two parts."""
        if "·" in position_text:
            parts = position_text.split("·")
            if len(parts) >= 2:
                return parts[0].strip(), parts[1].strip()
        return "", ""

    def parse_house_info(self, house_info: str) -> tuple:
        """Parse 'room_type | area平米 | floor | ...' into (area, type, floor)."""
        area = 0
        room_type = ""
        floor = ""
        try:
            parts = house_info.split("|")
            if len(parts) >= 3:
                area_match = re.search(r"([\d.]+)平米", parts[1].strip())
                if area_match:
                    area = float(area_match.group(1))
                room_type = parts[0].strip()
                floor = parts[2].strip()
        except Exception as e:
            logger.debug(f"解析房屋信息失败: {e}")
        return area, room_type, floor

    def parse_follow_info(self, follow_text: str) -> tuple:
        """Parse 'N人关注 / M次带看' into (follow_count, view_count)."""
        follow_count = 0
        view_count = 0
        try:
            parts = follow_text.split("/")
            if len(parts) >= 2:
                follow_match = re.search(r"(\d+)", parts[0])
                view_match = re.search(r"(\d+)", parts[1])
                if follow_match:
                    follow_count = int(follow_match.group(1))
                if view_match:
                    view_count = int(view_match.group(1))
        except Exception as e:
            logger.debug(f"解析关注信息失败: {e}")
        return follow_count, view_count

    async def has_next_page(self, html: str) -> bool:
        """True while the pager's 'next' button exists and is not disabled."""
        soup = BeautifulSoup(html, "lxml")
        next_button = soup.select_one(".page-box .next")
        return next_button is not None and "disabled" not in next_button.get("class", [])

    async def crawl_detail_pages(self, listings: List[Dict]) -> List[Dict]:
        """Enrich listings with detail-page fields (capped to limit traffic)."""
        detailed_listings = []
        for listing in listings[:10]:  # cap request volume
            try:
                detail_data = await self.fetch_detail_page(listing["url"])
                if detail_data:
                    listing.update(detail_data)
                    detailed_listings.append(listing)
                # Longer randomised delay for detail pages.
                await asyncio.sleep(random.uniform(2, 5))
            except Exception as e:
                logger.error(f"爬取详情页失败 {listing['url']}: {str(e)}")
                continue
        return detailed_listings

    async def fetch_detail_page(self, url: str) -> Optional[Dict]:
        """Scrape decoration / orientation / build year from a detail page."""
        page = None
        try:
            page = await self.context.new_page()
            await page.goto(url, wait_until="networkidle", timeout=60000)
            await page.wait_for_selector(".overview", timeout=30000)
            detail_data = await page.evaluate(
                """
                () => {
                    const data = {};
                    const decorationElem = document.querySelector('.base .content ul li:last-child');
                    if (decorationElem) {
                        data.decoration = decorationElem.textContent.replace('装修', '').trim();
                    }
                    const orientationElem = document.querySelector('.base .content ul li:nth-child(7)');
                    if (orientationElem) {
                        data.orientation = orientationElem.textContent.replace('房屋朝向', '').trim();
                    }
                    const yearElem = document.querySelector('.area .subInfo');
                    if (yearElem) {
                        const yearMatch = yearElem.textContent.match(/(\\d{4})/);
                        if (yearMatch) {
                            data.build_year = parseInt(yearMatch[1]);
                        }
                    }
                    return data;
                }
                """
            )
            return detail_data
        except Exception as e:
            logger.error(f"获取详情页数据失败: {str(e)}")
            return None
        finally:
            if page:
                await page.close()

    async def save_to_database(self, listings: List[Dict]):
        """Upsert listings by primary key; rolls back on any failure."""
        from models.database import db  # local import avoids an import cycle
        async with db.async_session() as session:
            try:
                for listing in listings:
                    existing = await session.get(HouseListing, listing["id"])
                    if existing:
                        for key, value in listing.items():
                            setattr(existing, key, value)
                        existing.updated_at = datetime.now()
                    else:
                        session.add(HouseListing(**listing))
                await session.commit()
                logger.info(f"成功保存 {len(listings)} 条数据到数据库")
            except Exception as e:
                await session.rollback()
                logger.error(f"保存数据失败: {str(e)}")
                raise

    async def run(self, max_pages: int = 10):
        """Full pipeline: init → list pages → detail pages → persist/export."""
        try:
            await self.init_browser()
            await self.init_session()
            logger.info("开始爬取列表页...")
            listings = await self.crawl_list_pages(max_pages)
            logger.info(f"爬取到 {len(listings)} 条房源数据")
            if listings:
                logger.info("开始爬取详情页...")
                detailed_listings = await self.crawl_detail_pages(listings)
                await self.save_to_database(detailed_listings)
                await self.export_to_csv(detailed_listings)
            logger.info("爬虫任务完成")
        except Exception as e:
            logger.error(f"爬虫运行失败: {str(e)}")
            raise
        finally:
            await self.cleanup()

    async def export_to_csv(self, listings: List[Dict]):
        """Dump listings to a timestamped CSV (utf-8-sig for Excel)."""
        import pandas as pd
        if not listings:
            return
        df = pd.DataFrame(listings)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"lianjia_houses_{timestamp}.csv"
        columns_to_export = [
            "id", "title", "total_price", "unit_price", "district", "region",
            "community", "area", "room_type", "floor", "orientation",
            "decoration", "build_year", "follow_count", "view_count",
            "listing_date", "url",
        ]
        df = df[[col for col in columns_to_export if col in df.columns]]
        df.to_csv(filename, index=False, encoding="utf-8-sig")
        # FIX: the scrape had '(unknown)' here; log the actual filename.
        logger.info(f"数据已导出到 {filename}")

    async def cleanup(self):
        """Close session, context, browser and Playwright, in that order."""
        if self.session:
            await self.session.close()
        if self.context:
            await self.context.close()
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()
# utils/user_agent.py
import random

# Small rotation pool; extend as needed.
USER_AGENTS = [
    # Chrome
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    # Firefox
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
    # Safari
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
]


def get_random_user_agent():
    """Return a random User-Agent string from the pool."""
    return random.choice(USER_AGENTS)


# middlewares/anti_anti_crawler.py
import asyncio
import hashlib
import logging
import time
from typing import Dict

# FIX: the article's handle_blocked used an undefined `logger`; bind one here.
logger = logging.getLogger(__name__)


class AntiAntiCrawlerMiddleware:
    """Request decoration, block detection and backoff helpers."""

    def __init__(self):
        self.request_history = []   # reserved for rate tracking
        self.blocked_count = 0      # consecutive block detections

    async def process_request(self, url: str, headers: Dict) -> Dict:
        """Add a random delay and browser-like fetch-metadata headers."""
        delay = random.uniform(1, 5)
        await asyncio.sleep(delay)
        headers.update({
            "X-Requested-With": "XMLHttpRequest",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "same-origin",
            "Sec-Fetch-User": "?1",
        })
        headers["X-Timestamp"] = str(int(time.time() * 1000))
        # Simplified fingerprint header.
        headers["X-Fingerprint"] = self.generate_fingerprint(headers)
        return headers

    def generate_fingerprint(self, headers: Dict) -> str:
        """MD5 of User-Agent + current second (simplified fingerprint)."""
        fingerprint_str = f"{headers.get('User-Agent', '')}{int(time.time())}"
        return hashlib.md5(fingerprint_str.encode()).hexdigest()

    def check_anti_crawler(self, html: str) -> bool:
        """True when the page looks like a block/captcha interstitial."""
        warning_signs = [
            "访问验证",
            "安全检查",
            "restricted",
            "blocked",
            "请输入验证码",
            "访问过于频繁",
        ]
        return any(sign in html for sign in warning_signs)

    async def handle_blocked(self):
        """Exponential backoff after a detected block (exponent capped at 4)."""
        self.blocked_count += 1
        wait_time = 60 * (2 ** min(self.blocked_count, 4))
        logger.warning(f"检测到反爬虫，等待 {wait_time} 秒")
        await asyncio.sleep(wait_time)
# main.py
import argparse
import asyncio
import json
import sys
from typing import List

from spiders.lianjia_spider import LianJiaSpider
from utils.logger import setup_logger

logger = setup_logger(__name__)


async def main():
    """CLI entry point: parse arguments and run the spider."""
    parser = argparse.ArgumentParser(description="链家二手房爬虫")
    parser.add_argument("--pages", type=int, default=10, help="爬取的页数")
    parser.add_argument("--max-workers", type=int, default=5, help="最大并发数")
    parser.add_argument("--export-csv", action="store_true", help="导出CSV文件")
    args = parser.parse_args()
    try:
        spider = LianJiaSpider()
        await spider.run(max_pages=args.pages)
    except KeyboardInterrupt:
        logger.info("用户中断程序")
        sys.exit(0)
    except Exception as e:
        logger.error(f"程序运行失败: {e}")
        sys.exit(1)


if __name__ == "__main__":
    # Ensure the Playwright browser binary is present before starting.
    import subprocess
    subprocess.run(["playwright", "install", "chromium"])
    asyncio.run(main())


# spiders/distributed_spider.py — optional distributed extension
import redis


class DistributedLianJiaSpider(LianJiaSpider):
    """Redis-queue-based distributed variant of the spider."""

    def __init__(self, redis_url: str = "redis://localhost:6379/0"):
        super().__init__()
        self.redis = redis.Redis.from_url(redis_url)
        self.task_queue = "lianjia:task:queue"
        self.result_queue = "lianjia:result:queue"

    async def distribute_tasks(self, urls: List[str]):
        """Push one JSON task per URL onto the shared queue."""
        for url in urls:
            task = {
                "url": url,
                "timestamp": asyncio.get_event_loop().time(),
                "retry_count": 0,
            }
            self.redis.rpush(self.task_queue, json.dumps(task))

    async def consume_tasks(self):
        """Blocking-pop tasks forever; push successful results onward."""
        while True:
            task_json = self.redis.blpop(self.task_queue, timeout=30)
            if task_json:
                task = json.loads(task_json[1])
                # NOTE(review): process_task is not defined in the article —
                # a subclass or later section must supply it.
                result = await self.process_task(task)
                if result:
                    self.redis.rpush(self.result_queue, json.dumps(result))
# monitor/dashboard.py
from fastapi import FastAPI
import pandas as pd               # imported by the article (unused here)
import plotly.express as px       # imported by the article (unused here)
from sqlalchemy import func, select

from models.database import db
from models.models import HouseListing

app = FastAPI(title="链家数据监控面板")


# FIX: the scrape lost the '@' — this is a route decorator.
@app.get("/api/stats")
async def get_statistics():
    """Aggregate listing count and average unit price, overall and per district."""
    async with db.async_session() as session:
        # FIX: SQLAlchemy 2.0 async requires select(...) around the columns;
        # the article passed bare func expressions to scalar()/execute().
        total_count = await session.scalar(
            select(func.count(HouseListing.id))
        )
        avg_price = await session.scalar(
            select(func.avg(HouseListing.unit_price))
        )
        district_result = await session.execute(
            select(
                func.count(HouseListing.id).label("count"),
                func.avg(HouseListing.unit_price).label("avg_price"),
                HouseListing.district,
            ).group_by(HouseListing.district)
        )
        district_stats = district_result.all()
        return {
            "total_count": total_count,
            "avg_price": round(avg_price, 2) if avg_price else 0,
            "district_stats": [
                {
                    "district": stat[2],
                    "count": stat[0],
                    "avg_price": round(stat[1], 2) if stat[1] is not None else 0,
                }
                for stat in district_stats
            ],
        }

# -----------------------------------------------------------------------------
# Article closing notes (recovered from the scrape, translated):
# Optimisation strategies: adaptive delays based on request rate; rotating
# User-Agents and cookies; paid/free IP proxy pools; third-party captcha
# solving; checkpoint/resume of crawl state; Bloom-filter deduplication.
# Legal & ethical notice: read the site's robots.txt and terms of service;
# throttle request rates to avoid burdening the target; learning/research use
# only, no commercial use; never collect personal or sensitive data.
# Summary: combining Playwright, async IO, data validation and anti-detection
# techniques yields an efficient, stable crawler — used lawfully.
# -----------------------------------------------------------------------------