2026/4/17 23:26:51
网站建设
项目流程
安徽网站制作,问什么出现 这个网站正在建设中,诸暨市住房和城乡建设局网站,徐州高端网站建设引言：房产数据获取的技术挑战与解决方案
在当今大数据时代，房产信息的实时获取对于投资者、购房者和市场研究者至关重要。然而，现代房产网站普遍采用动态加载、反爬虫机制和复杂验证策略，给传统爬虫带来了巨大挑战。本文将介绍如…引言房产数据获取的技术挑战与解决方案在当今大数据时代房产信息的实时获取对于投资者、购房者和市场研究者至关重要。然而现代房产网站普遍采用动态加载、反爬虫机制和复杂验证策略给传统爬虫带来了巨大挑战。本文将介绍如何利用最新的Python爬虫技术——Playwright配合异步编程构建一个高效、稳定的房产信息智能采集系统。技术栈亮点Playwright微软推出的新一代浏览器自动化工具比Selenium更快更稳定异步编程使用asyncio和aiohttp实现高并发数据采集智能解析结合CSS选择器和XPath的动态解析策略反反爬虫多级代理、请求头轮换、行为模拟等综合策略数据存储支持多种存储后端JSON、CSQLite、MySQL完整爬虫系统架构python 房产信息智能爬虫系统 版本2.0 功能支持多网站采集、异步并发、智能解析和数据清洗 import asyncio import logging import json import random import time from datetime import datetime from typing import List, Dict, Any, Optional from dataclasses import dataclass, asdict from urllib.parse import urljoin, urlparse, parse_qs import aiohttp import pandas as pd from playwright.async_api import async_playwright, Page, BrowserContext from bs4 import BeautifulSoup import aiosqlite from fake_useragent import UserAgent from tenacity import retry, stop_after_attempt, wait_exponential # 配置日志系统 logging.basicConfig( levellogging.INFO, format%(asctime)s - %(name)s - %(levelname)s - %(message)s, handlers[ logging.FileHandler(property_crawler.log, encodingutf-8), logging.StreamHandler() ] ) logger logging.getLogger(__name__) dataclass class PropertyItem: 房产数据模型类 title: str price: str unit_price: str area: str layout: str # 户型 floor: str # 楼层 orientation: str # 朝向 district: str # 区域 address: str year: str # 建造年份 property_type: str # 房屋类型 decoration: str # 装修情况 source: str # 数据来源网站 url: str crawl_time: str latitude: Optional[float] None longitude: Optional[float] None tags: Optional[List[str]] None def to_dict(self) - Dict[str, Any]: return asdict(self) class ProxyManager: 代理IP管理器 def __init__(self, proxy_file: str proxies.txt): self.proxies [] self.load_proxies(proxy_file) def load_proxies(self, file_path: str): try: with open(file_path, r) as f: self.proxies [line.strip() for line in f if line.strip()] logger.info(f已加载 {len(self.proxies)} 个代理IP) except FileNotFoundError: 
logger.warning(代理文件未找到将使用直连模式) def get_random_proxy(self) - Optional[Dict[str, str]]: if not self.proxies: return None proxy random.choice(self.proxies) return { server: fhttp://{proxy}, username: your_username, # 如果需要认证 password: your_password } class PropertyCrawler: 房产爬虫主类 def __init__(self, headless: bool True, max_concurrent: int 5): self.headless headless self.max_concurrent max_concurrent self.ua UserAgent() self.proxy_manager ProxyManager() self.semaphore asyncio.Semaphore(max_concurrent) # 支持的房产网站配置 self.website_configs { lianjia: { base_url: https://sh.lianjia.com/ershoufang/, list_selector: .sellListContent li, pagination: True, max_pages: 100 }, anjuke: { base_url: https://sh.anjuke.com/sale/, list_selector: .list-item, pagination: True, max_pages: 50 }, fang: { base_url: https://esf.sh.fang.com/, list_selector: .shop_list li, pagination: True, max_pages: 100 } } async def init_browser(self): 初始化Playwright浏览器 self.playwright await async_playwright().start() # 配置浏览器启动选项 launch_options { headless: self.headless, slow_mo: 100, # 操作延迟模拟人类行为 } # 随机选择代理 proxy self.proxy_manager.get_random_proxy() if proxy: launch_options[proxy] proxy self.browser await self.playwright.chromium.launch(**launch_options) # 创建上下文模拟移动设备 context_options { user_agent: self.ua.random, viewport: {width: 1920, height: 1080}, locale: zh-CN, timezone_id: Asia/Shanghai, } self.context await self.browser.new_context(**context_options) logger.info(浏览器初始化完成) retry( stopstop_after_attempt(3), waitwait_exponential(multiplier1, min4, max10) ) async def fetch_page(self, url: str, page: Page) - str: 获取页面内容带有重试机制 try: await page.goto(url, wait_untilnetworkidle, timeout30000) # 随机滚动模拟人类浏览 await self.simulate_human_behavior(page) # 等待关键内容加载 await page.wait_for_selector(.content, timeout10000) content await page.content() logger.info(f成功获取页面: {url}) return content except Exception as e: logger.error(f获取页面失败 {url}: {str(e)}) raise async def simulate_human_behavior(self, page: Page): 模拟人类浏览行为 # 随机滚动 
for _ in range(random.randint(2, 5)): scroll_height random.randint(300, 800) await page.evaluate(fwindow.scrollBy(0, {scroll_height})) await asyncio.sleep(random.uniform(0.5, 2)) # 随机移动鼠标 await page.mouse.move( random.randint(0, 500), random.randint(0, 500) ) async def parse_lianjia_list(self, html: str, base_url: str) - List[PropertyItem]: 解析链家列表页 soup BeautifulSoup(html, html.parser) properties [] items soup.select(self.website_configs[lianjia][list_selector]) for item in items: try: # 提取详细信息 title_elem item.select_one(.title a) if not title_elem: continue title title_elem.text.strip() detail_url urljoin(base_url, title_elem[href]) # 价格信息 price_elem item.select_one(.totalPrice) price price_elem.text.strip() if price_elem else 未知 # 单价 unit_price_elem item.select_one(.unitPrice) unit_price unit_price_elem.text.strip() if unit_price_elem else 未知 # 房屋信息 house_info item.select_one(.houseInfo) if house_info: info_parts house_info.text.strip().split(|) layout info_parts[0].strip() if len(info_parts) 0 else area info_parts[1].strip() if len(info_parts) 1 else orientation info_parts[2].strip() if len(info_parts) 2 else decoration info_parts[3].strip() if len(info_parts) 3 else else: layout area orientation decoration # 位置信息 position_elem item.select_one(.positionInfo) if position_elem: position_parts position_elem.text.strip().split(-) district position_parts[0].strip() if len(position_parts) 0 else address position_parts[1].strip() if len(position_parts) 1 else else: district address # 标签 tag_elem item.select_one(.tag) tags [span.text for span in tag_elem.select(span)] if tag_elem else [] property_item PropertyItem( titletitle, priceprice, unit_priceunit_price, areaarea, layoutlayout, floor, # 链家列表页不直接显示楼层 orientationorientation, districtdistrict, addressaddress, year, property_type二手房, decorationdecoration, sourcelianjia, urldetail_url, crawl_timedatetime.now().strftime(%Y-%m-%d %H:%M:%S), tagstags ) properties.append(property_item) except Exception as e: 
logger.error(f解析链家房源失败: {str(e)}) continue return properties async def get_detail_info(self, property_item: PropertyItem) - PropertyItem: 获取房源详细信息 async with self.semaphore: page await self.context.new_page() try: html await self.fetch_page(property_item.url, page) soup BeautifulSoup(html, html.parser) # 解析详细信息以链家为例 if property_item.source lianjia: # 楼层信息 floor_elem soup.select_one(.content li:contains(楼层)) if floor_elem: property_item.floor floor_elem.text.replace(楼层, ).strip() # 建造年份 year_elem soup.select_one(.content li:contains(建成年份)) if year_elem: property_item.year year_elem.text.replace(建成年份, ).strip() # 经纬度从地图数据提取 map_data soup.select_one(#mapPosition) if map_data and data-position in map_data.attrs: pos map_data[data-position] if pos: lng, lat pos.split(,) property_item.longitude float(lng.strip()) property_item.latitude float(lat.strip()) await page.close() return property_item except Exception as e: logger.error(f获取详情页失败 {property_item.url}: {str(e)}) await page.close() return property_item async def crawl_website(self, website: str, district: str None) - List[PropertyItem]: 爬取指定网站数据 config self.website_configs.get(website) if not config: logger.error(f不支持的网站: {website}) return [] base_url config[base_url] if district: base_url urljoin(base_url, f{district}/) all_properties [] page_num 1 while page_num config[max_pages]: try: url f{base_url}pg{page_num}/ if config[pagination] else base_url page await self.context.new_page() html await self.fetch_page(url, page) await page.close() # 根据网站类型调用不同的解析方法 if website lianjia: properties await self.parse_lianjia_list(html, base_url) else: # 可以添加其他网站的解析方法 properties [] if not properties: logger.info(f{website} 第{page_num}页没有数据停止爬取) break all_properties.extend(properties) logger.info(f{website} 第{page_num}页爬取完成共{len(properties)}条数据) # 随机延迟避免请求过快 await asyncio.sleep(random.uniform(2, 5)) page_num 1 except Exception as e: logger.error(f爬取{website}第{page_num}页失败: {str(e)}) break return all_properties async def 
crawl_all_websites(self, districts: List[str] None) - List[PropertyItem]: 并发爬取所有网站 if districts is None: districts [pudong, minhang, xuhui] # 默认爬取几个区域 tasks [] for website in self.website_configs.keys(): for district in districts: tasks.append(self.crawl_website(website, district)) results await asyncio.gather(*tasks, return_exceptionsTrue) all_properties [] for result in results: if isinstance(result, Exception): logger.error(f爬取任务失败: {result}) elif isinstance(result, list): all_properties.extend(result) # 获取详细信息 detail_tasks [self.get_detail_info(item) for item in all_properties] detailed_results await asyncio.gather(*detail_tasks, return_exceptionsTrue) # 过滤掉异常结果 final_properties [] for result in detailed_results: if isinstance(result, PropertyItem): final_properties.append(result) return final_properties async def save_to_json(self, properties: List[PropertyItem], filename: str): 保存数据到JSON文件 data [item.to_dict() for item in properties] with open(filename, w, encodingutf-8) as f: json.dump(data, f, ensure_asciiFalse, indent2) logger.info(f数据已保存到 {filename}共 {len(properties)} 条记录) async def save_to_sqlite(self, properties: List[PropertyItem], db_path: str properties.db): 保存数据到SQLite数据库 async with aiosqlite.connect(db_path) as db: await db.execute( CREATE TABLE IF NOT EXISTS properties ( id INTEGER PRIMARY KEY AUTOINCREMENT, title TEXT, price TEXT, unit_price TEXT, area TEXT, layout TEXT, floor TEXT, orientation TEXT, district TEXT, address TEXT, year TEXT, property_type TEXT, decoration TEXT, source TEXT, url TEXT UNIQUE, crawl_time TEXT, latitude REAL, longitude REAL, tags TEXT ) ) for prop in properties: tags_str json.dumps(prop.tags, ensure_asciiFalse) if prop.tags else None await db.execute( INSERT OR REPLACE INTO properties (title, price, unit_price, area, layout, floor, orientation, district, address, year, property_type, decoration, source, url, crawl_time, latitude, longitude, tags) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
, ( prop.title, prop.price, prop.unit_price, prop.area, prop.layout, prop.floor, prop.orientation, prop.district, prop.address, prop.year, prop.property_type, prop.decoration, prop.source, prop.url, prop.crawl_time, prop.latitude, prop.longitude, tags_str )) await db.commit() logger.info(f数据已保存到数据库 {db_path}) async def export_to_excel(self, properties: List[PropertyItem], filename: str): 导出数据到Excel文件 data [item.to_dict() for item in properties] df pd.DataFrame(data) # 数据清洗 df[price_num] df[price].str.extract(r(\d\.?\d*)).astype(float) df[area_num] df[area].str.extract(r(\d\.?\d*)).astype(float) df[unit_price_num] df[price_num] / df[area_num] with pd.ExcelWriter(filename, engineopenpyxl) as writer: df.to_excel(writer, sheet_name房产数据, indexFalse) # 添加统计信息 summary { 统计项: [房源总数, 平均总价(万), 平均单价(元/㎡), 最大面积(㎡), 最小面积(㎡)], 数值: [ len(df), df[price_num].mean(), df[unit_price_num].mean(), df[area_num].max(), df[area_num].min() ] } summary_df pd.DataFrame(summary) summary_df.to_excel(writer, sheet_name统计汇总, indexFalse) logger.info(f数据已导出到Excel文件 {filename}) async def close(self): 关闭资源 await self.context.close() await self.browser.close() await self.playwright.stop() logger.info(爬虫资源已释放) async def main(): 主函数 crawler PropertyCrawler(headlessTrue, max_concurrent3) try: # 初始化浏览器 await crawler.init_browser() # 爬取所有网站数据 logger.info(开始爬取房产数据...) 
properties await crawler.crawl_all_websites() # 保存数据 if properties: timestamp datetime.now().strftime(%Y%m%d_%H%M%S) await crawler.save_to_json(properties, fproperties_{timestamp}.json) await crawler.save_to_sqlite(properties, properties.db) await crawler.export_to_excel(properties, fproperties_{timestamp}.xlsx) # 输出统计信息 sources {} for prop in properties: sources[prop.source] sources.get(prop.source, 0) 1 logger.info(爬取完成统计信息) for source, count in sources.items(): logger.info(f{source}: {count} 条数据) logger.info(f总计: {len(properties)} 条数据) else: logger.warning(未获取到任何房产数据) except Exception as e: logger.error(f爬虫运行失败: {str(e)}) finally: await crawler.close() class AdvancedFeatures: 高级功能扩展类 staticmethod async def detect_price_changes(db_path: str, days: int 7): 检测价格变动 async with aiosqlite.connect(db_path) as db: # 获取最近days天的数据 query f SELECT url, price, crawl_time FROM properties WHERE crawl_time date(now, -{days} days) ORDER BY url, crawl_time cursor await db.execute(query) rows await cursor.fetchall() # 分析价格变动 price_changes {} for url, price, crawl_time in rows: if url not in price_changes: price_changes[url] [] price_num float(.join(filter(str.isdigit, price))) price_changes[url].append((crawl_time, price_num)) # 找出价格变动的房源 changed_properties [] for url, prices in price_changes.items(): if len(prices) 1: first_price prices[0][1] last_price prices[-1][1] change_rate (last_price - first_price) / first_price * 100 if abs(change_rate) 1: # 变动超过1% changed_properties.append({ url: url, first_price: first_price, last_price: last_price, change_rate: change_rate, first_time: prices[0][0], last_time: prices[-1][0] }) return changed_properties staticmethod def generate_report(properties: List[PropertyItem]): 生成数据分析报告 df pd.DataFrame([p.to_dict() for p in properties]) report { total_properties: len(df), avg_price: df[price].apply(lambda x: float(.join(filter(str.isdigit, x)))).mean(), avg_area: df[area].apply(lambda x: float(.join(filter(str.isdigit, x)))).mean(), by_district: 
df[district].value_counts().to_dict(), by_source: df[source].value_counts().to_dict(), price_distribution: pd.cut( df[price].apply(lambda x: float(.join(filter(str.isdigit, x)))), bins[0, 200, 500, 800, 1000, float(inf)], labels[0-200万, 200-500万, 500-800万, 800-1000万, 1000万以上] ).value_counts().to_dict() } return report if __name__ __main__: # 运行爬虫 asyncio.run(main()) # 可以单独运行高级功能 # price_changes asyncio.run(AdvancedFeatures.detect_price_changes(properties.db))系统部署与优化建议1. 分布式爬虫架构python# 使用Celery或RQ实现分布式任务队列 # 使用Redis进行任务调度和数据共享 # 部署多个爬虫节点通过中央调度器分配任务2. 反爬虫策略应对使用住宅代理IP池实现验证码自动识别OCR或第三方服务设置合理的请求频率和随机延迟模拟真实用户行为模式3. 数据质量保障python# 数据验证器 class DataValidator: staticmethod def validate_property(item: PropertyItem) - bool: rules [ len(item.title) 5, item.price and any(char.isdigit() for char in item.price), item.area and any(char.isdigit() for char in item.area), item.url.startswith((http://, https://)) ] return all(rules)4. 监控与报警系统python# 监控爬虫运行状态 class CrawlerMonitor: def __init__(self): self.metrics { success_rate: 0, avg_response_time: 0, data_quality_score: 0 } async def send_alert(self, message: str): # 集成邮件、Slack、微信通知 pass环境配置与依赖bash# requirements.txt playwright1.40.0 aiohttp3.9.1 beautifulsoup44.12.2 pandas2.1.4 aiosqlite0.19.0 fake-useragent1.4.0 tenacity8.2.3 openpyxl3.1.2 celery5.3.4 redis5.0.1 # 安装Playwright浏览器 playwright install chromium结语本文详细介绍了一个基于最新Python技术的房产信息爬虫系统。通过结合Playwright的强大的浏览器自动化能力、异步编程的高效并发处理、以及智能的反反爬虫策略我们构建了一个稳定、高效、可扩展的数据采集系统。关键优势高可靠性多层重试机制和错误处理高效率异步并发智能调度高扩展性模块化设计易于扩展新网站数据质量完整的数据清洗和验证流程可维护性清晰的代码结构和完整文档未来扩展方向集成机器学习算法进行房价预测添加实时数据监控和可视化仪表盘实现自动化的房源推荐系统开发API接口供其他系统调用