# Building an HD Wallpaper Crawler with Python's Async Stack

## Preface: A Modern Crawler Solution for Wallpaper Downloads

In today's digital world, high-resolution wallpapers have become an important part of personalizing our devices and improving the visual experience. Downloading them one by one from wallpaper sites, however, is tedious and inefficient. This article shows how to use Python crawling techniques, combined with a modern async framework and perceptual-hash deduplication, to batch-download HD wallpapers automatically and build a personal wallpaper library.

## Technology Stack Highlights

The project leans on a set of modern techniques:

- **Async HTTP**: aiohttp, 5-10x faster than synchronous requests for this I/O-bound workload
- **HTML parsing**: BeautifulSoup4 with the lxml parser for fast parsing
- **Smart deduplication**: image fingerprinting via a perceptual-hash algorithm
- **Concurrency control**: asyncio semaphores plus adaptive throttling
- **Anti-anti-crawling**: random User-Agents, a proxy pool, and randomized request delays

## Project Structure

```text
wallpaper_crawler/
├── core/
│   ├── async_downloader.py   # async downloader core
│   ├── image_processor.py    # image processing and deduplication
│   └── anti_anti_crawl.py    # anti-anti-crawl strategies
├── utils/
│   ├── config_loader.py      # configuration management
│   ├── logger.py             # logging system
│   └── progress_tracker.py   # progress tracking
├── data/
│   ├── downloaded/           # downloaded wallpapers
│   ├── cache/                # cached data
│   └── logs/                 # log files
└── main.py                   # main entry point
```
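Judging from the imports in the code below, the third-party dependencies are roughly the following (a sketch, not a pinned manifest; `tomllib`, used by the config loader, additionally requires Python 3.11+):

```text
aiohttp
aiofiles
beautifulsoup4
lxml
opencv-python
numpy
Pillow
scikit-learn
PyYAML
```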
## Complete Code Implementation

### 1. Main Crawler Class: The Async HD Wallpaper Downloader

```python
"""
HD wallpaper crawler.
Supports async concurrency, smart deduplication, and anti-anti-crawl strategies.
"""
import asyncio
import aiohttp
import aiofiles
import hashlib
from typing import List, Dict, Optional
from dataclasses import dataclass
from pathlib import Path
import json
from datetime import datetime
import random
from urllib.parse import urljoin, urlparse
import cv2
import numpy as np
from PIL import Image
import io
import logging
from bs4 import BeautifulSoup


@dataclass
class CrawlerConfig:
    """Crawler configuration."""
    base_url: str = "https://wallhaven.cc"
    search_url: str = "https://wallhaven.cc/search"
    categories: str = "111"          # general/anime/people
    purity: str = "100"              # sfw/sketchy/nsfw
    sorting: str = "random"
    order: str = "desc"
    max_pages: int = 50
    concurrent_requests: int = 20
    download_timeout: int = 30
    save_dir: str = "downloaded_wallpapers"
    min_resolution: tuple = (1920, 1080)
    enable_duplicate_check: bool = True
    duplicate_threshold: float = 0.95


class AsyncWallpaperDownloader:
    """Core async wallpaper downloader."""

    def __init__(self, config: CrawlerConfig = None):
        self.config = config or CrawlerConfig()
        self.setup_logging()
        self.setup_directories()
        # Request header pool
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15",
        ]
        # Image fingerprint cache
        self.image_hashes = set()
        # Run statistics
        self.stats = {
            "total_found": 0,
            "downloaded": 0,
            "skipped_duplicate": 0,
            "failed": 0,
        }

    def setup_logging(self):
        """Configure the logging system."""
        log_dir = Path("logs")
        log_dir.mkdir(exist_ok=True)
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
            handlers=[
                logging.FileHandler(
                    f"logs/crawler_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
                ),
                logging.StreamHandler(),
            ],
        )
        self.logger = logging.getLogger(__name__)

    def setup_directories(self):
        """Create the required directories."""
        Path(self.config.save_dir).mkdir(exist_ok=True, parents=True)
        Path("cache").mkdir(exist_ok=True)

    def get_random_headers(self) -> Dict:
        """Build a randomized request header set."""
        return {
            "User-Agent": random.choice(self.user_agents),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate, br",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Referer": self.config.base_url,
        }

    async def fetch_page(self, session: aiohttp.ClientSession, url: str,
                         params: Dict = None) -> Optional[str]:
        """Fetch a page asynchronously."""
        try:
            await asyncio.sleep(random.uniform(0.5, 2.0))  # random delay
            async with session.get(url, headers=self.get_random_headers(),
                                   params=params,
                                   timeout=aiohttp.ClientTimeout(total=30)) as response:
                if response.status == 200:
                    return await response.text()
                self.logger.warning(f"Request failed: {url}, status: {response.status}")
                return None
        except Exception as e:
            self.logger.error(f"Request error {url}: {str(e)}")
            return None

    def extract_image_urls(self, html: str) -> List[str]:
        """Extract image URLs from the page HTML."""
        soup = BeautifulSoup(html, "lxml")
        image_urls = []
        # Find the thumbnail elements
        thumbnails = soup.find_all("img", {"class": "lazyload"})
        for thumb in thumbnails:
            # Get the lazy-loaded thumbnail URL
            data_src = thumb.get("data-src")
            if data_src:
                # Rewrite the thumbnail URL into the full-resolution URL
                hd_url = data_src.replace("/small/", "/full/").replace("th-", "wall-")
                image_urls.append(hd_url)
        self.logger.info(f"Found {len(image_urls)} images on this page")
        return image_urls

    def calculate_image_hash(self, image_data: bytes) -> str:
        """Compute a perceptual image hash for deduplication."""
        try:
            # Decode with OpenCV
            nparr = np.frombuffer(image_data, np.uint8)
            img = cv2.imdecode(nparr, cv2.IMREAD_GRAYSCALE)
            if img is None:
                return hashlib.md5(image_data).hexdigest()
            # Shrink to 8x8
            img = cv2.resize(img, (8, 8))
            # Average pixel value
            avg = img.mean()
            # Build the 64-bit string: 1 where a pixel exceeds the mean
            hash_str = "".join("1" if pixel > avg else "0" for pixel in img.flatten())
            # Encode as hex
            return hex(int(hash_str, 2))[2:].zfill(16)
        except Exception:
            # Fallback: plain MD5
            return hashlib.md5(image_data).hexdigest()

    def is_duplicate_image(self, image_hash: str) -> bool:
        """Check whether an image was seen before."""
        if not self.config.enable_duplicate_check:
            return False
        # Load previously recorded hashes
        cache_file = Path("cache/image_hashes.json")
        if cache_file.exists():
            with open(cache_file, "r") as f:
                self.image_hashes = set(json.load(f))
        if image_hash in self.image_hashes:
            return True
        # Persist the new hash
        self.image_hashes.add(image_hash)
        with open(cache_file, "w") as f:
            json.dump(list(self.image_hashes), f)
        return False

    async def download_image(self, session: aiohttp.ClientSession, url: str,
                             semaphore: asyncio.Semaphore) -> bool:
        """Download a single image asynchronously."""
        async with semaphore:
            try:
                self.logger.info(f"Downloading: {url}")
                async with session.get(url, headers=self.get_random_headers(),
                                       timeout=aiohttp.ClientTimeout(
                                           total=self.config.download_timeout)) as response:
                    if response.status != 200:
                        self.logger.warning(
                            f"Download failed: {url}, status: {response.status}")
                        self.stats["failed"] += 1
                        return False
                    image_data = await response.read()

                # Resolution check
                try:
                    img = Image.open(io.BytesIO(image_data))
                    if img.size[0] < self.config.min_resolution[0] or \
                       img.size[1] < self.config.min_resolution[1]:
                        self.logger.warning(f"Resolution too low: {img.size}")
                        self.stats["failed"] += 1
                        return False
                except Exception:
                    pass

                # Duplicate check
                image_hash = self.calculate_image_hash(image_data)
                if self.is_duplicate_image(image_hash):
                    self.logger.info("Skipping duplicate image")
                    self.stats["skipped_duplicate"] += 1
                    return False

                # Build the filename
                filename = (f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_"
                            f"{hashlib.md5(image_data).hexdigest()[:8]}.jpg")
                save_path = Path(self.config.save_dir) / filename

                # Save the file
                async with aiofiles.open(save_path, "wb") as f:
                    await f.write(image_data)

                self.logger.info(f"Saved: {save_path}")
                self.stats["downloaded"] += 1
                return True
            except Exception as e:
                self.logger.error(f"Download error {url}: {str(e)}")
                self.stats["failed"] += 1
                return False

    async def crawl_search_page(self, session: aiohttp.ClientSession, page: int,
                                semaphore: asyncio.Semaphore) -> List[str]:
        """Crawl one search-results page."""
        params = {
            "categories": self.config.categories,
            "purity": self.config.purity,
            "sorting": self.config.sorting,
            "order": self.config.order,
            "page": str(page),
        }
        url = self.config.search_url
        self.logger.info(f"Crawling page {page}: {url}")
        html = await self.fetch_page(session, url, params)
        if not html:
            return []
        image_urls = self.extract_image_urls(html)
        self.stats["total_found"] += len(image_urls)

        # Download the page's images concurrently
        tasks = [asyncio.create_task(self.download_image(session, img_url, semaphore))
                 for img_url in image_urls]
        await asyncio.gather(*tasks, return_exceptions=True)
        return image_urls

    async def run(self):
        """Run the crawler."""
        self.logger.info("Starting HD wallpaper crawl")
        start_time = datetime.now()
        # Semaphore to bound concurrency
        semaphore = asyncio.Semaphore(self.config.concurrent_requests)
        # Connection pool
        connector = aiohttp.TCPConnector(limit=100, ssl=False)
        async with aiohttp.ClientSession(connector=connector) as session:
            # Schedule the configured number of pages
            tasks = []
            for page in range(1, self.config.max_pages + 1):
                tasks.append(asyncio.create_task(
                    self.crawl_search_page(session, page, semaphore)
                ))
                # Delay between pages
                await asyncio.sleep(random.uniform(1, 3))
            # Wait for every page task to finish
            await asyncio.gather(*tasks)

        # Report statistics
        elapsed = datetime.now() - start_time
        self.logger.info(
            f"\n=== Crawl statistics ===\n"
            f"Images found: {self.stats['total_found']}\n"
            f"Downloaded: {self.stats['downloaded']}\n"
            f"Skipped duplicates: {self.stats['skipped_duplicate']}\n"
            f"Failed: {self.stats['failed']}\n"
            f"Elapsed: {elapsed.total_seconds():.2f}s\n"
            f"Average speed: "
            f"{self.stats['downloaded'] / max(elapsed.total_seconds(), 1):.2f} images/s"
        )


class AdvancedWallpaperCrawler(AsyncWallpaperDownloader):
    """Advanced crawler with extra features."""

    def __init__(self, config: CrawlerConfig = None):
        super().__init__(config)
        self.proxy_pool = self.load_proxy_pool()

    def load_proxy_pool(self) -> List[str]:
        """Load the proxy pool from proxies.txt."""
        proxy_file = Path("proxies.txt")
        if proxy_file.exists():
            with open(proxy_file, "r") as f:
                return [line.strip() for line in f if line.strip()]
        return []

    def get_proxy(self) -> Optional[str]:
        """Pick a random proxy."""
        if self.proxy_pool:
            return random.choice(self.proxy_pool)
        return None

    async def download_with_proxy_rotation(self, session: aiohttp.ClientSession,
                                           url: str,
                                           max_retries: int = 3) -> Optional[bytes]:
        """Download with proxy rotation and exponential backoff."""
        for retry in range(max_retries):
            try:
                # First attempt goes without a proxy; retries rotate through the pool
                proxy = self.get_proxy() if retry > 0 else None
                if proxy:
                    self.logger.info(f"Trying proxy: {proxy}")
                async with session.get(url, proxy=proxy,
                                       headers=self.get_random_headers(),
                                       timeout=aiohttp.ClientTimeout(total=30)) as response:
                    if response.status == 200:
                        return await response.read()
                    self.logger.warning(
                        f"Proxy download failed (attempt {retry + 1}/{max_retries}): "
                        f"{response.status}")
            except Exception as e:
                self.logger.error(f"Proxy download error: {str(e)}")
            await asyncio.sleep(2 ** retry)  # exponential backoff
        return None


# Usage example
async def main():
    """Example entry point."""
    # Custom configuration
    config = CrawlerConfig(
        max_pages=10,                 # crawl 10 pages
        concurrent_requests=15,       # 15 concurrent downloads
        save_dir="my_wallpapers",
        min_resolution=(2560, 1440),  # 2K minimum
        enable_duplicate_check=True,
    )
    # Create the crawler instance
    crawler = AsyncWallpaperDownloader(config)
    # Run it
    await crawler.run()


if __name__ == "__main__":
    # Run the async entry point
    asyncio.run(main())
```
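One loose end worth noting: `CrawlerConfig` defines `duplicate_threshold`, but `is_duplicate_image()` above only does an exact membership test on the hash set. A minimal sketch of how a near-duplicate check could use that threshold, comparing two 64-bit perceptual hashes by Hamming distance (the helper name `is_similar_phash` is illustrative, not part of the original code):

```python
def is_similar_phash(hash_a: str, hash_b: str, threshold: float = 0.95) -> bool:
    """Treat two images as duplicates when their 64-bit pHashes agree on at
    least `threshold` of their bits (0.95 tolerates up to 3 differing bits)."""
    # Both arguments are the hex strings produced by calculate_image_hash()
    xor = int(hash_a, 16) ^ int(hash_b, 16)
    hamming = bin(xor).count("1")      # number of differing bits
    return (1 - hamming / 64) >= threshold
```

`is_duplicate_image()` could then scan the cached hashes with this predicate instead of the plain `in` test, at the cost of a linear scan per new image.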
### 2. Smart Image-Processing Module

```python
"""
Smart image-processing module.
Provides quality checks, dominant-color extraction, and palette rendering.
"""
import colorsys
import hashlib
from typing import Tuple, Optional

import cv2
import numpy as np
from PIL import Image, ImageFilter
from sklearn.cluster import KMeans


class ImageIntelligence:
    """Image intelligence helpers."""

    @staticmethod
    def detect_image_quality(image_data: bytes) -> Tuple[float, float]:
        """Estimate image quality.

        Returns: (clarity score, low-noise score), both normalized to [0, 1],
        where higher is better for both.
        """
        nparr = np.frombuffer(image_data, np.uint8)
        img = cv2.imdecode(nparr, cv2.IMREAD_GRAYSCALE)
        if img is None:
            return 0.0, 0.0
        # Clarity: variance of the Laplacian
        clarity = cv2.Laplacian(img, cv2.CV_64F).var()
        # Noise level: deviation from a Gaussian-blurred copy
        denoised = cv2.GaussianBlur(img, (3, 3), 0)
        noise_score = np.mean(np.abs(img.astype(np.float32) - denoised.astype(np.float32)))
        # Normalize
        clarity_norm = min(clarity / 1000, 1.0)
        noise_norm = max(0, 1 - noise_score / 50)
        return clarity_norm, noise_norm

    @staticmethod
    def extract_dominant_colors(image_data: bytes, n_colors: int = 5) -> list:
        """Extract the dominant colors."""
        nparr = np.frombuffer(image_data, np.uint8)
        img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        # Flatten to a pixel list
        pixels = img_rgb.reshape((-1, 3))
        # K-means clustering
        kmeans = KMeans(n_clusters=n_colors, random_state=42)
        kmeans.fit(pixels)
        # Cluster centers and their shares
        colors = kmeans.cluster_centers_.astype(int)
        counts = np.bincount(kmeans.labels_)
        proportions = counts / len(pixels)
        # Sort by share, descending
        sorted_indices = np.argsort(proportions)[::-1]
        return [
            {
                "rgb": tuple(colors[i]),
                "hex": f"#{colors[i][0]:02x}{colors[i][1]:02x}{colors[i][2]:02x}",
                "proportion": float(proportions[i]),
            }
            for i in sorted_indices
        ]

    @staticmethod
    def is_suitable_for_wallpaper(image_data: bytes, min_clarity: float = 0.3,
                                  max_noise: float = 0.7) -> bool:
        """Decide whether an image is wallpaper quality.

        Note: the second score is a low-noise score (higher = cleaner),
        so both scores must exceed their thresholds.
        """
        clarity, noise = ImageIntelligence.detect_image_quality(image_data)
        return clarity > min_clarity and noise > max_noise

    @staticmethod
    def generate_color_palette_image(dominant_colors: list,
                                     save_path: str = "palette.png"):
        """Render the dominant colors as a palette image."""
        palette_height = 100
        total_width = len(dominant_colors) * 100
        palette_img = Image.new("RGB", (total_width, palette_height))
        for i, color_info in enumerate(dominant_colors):
            color_block = Image.new("RGB", (100, palette_height), color_info["rgb"])
            palette_img.paste(color_block, (i * 100, 0))
        palette_img.save(save_path)
        return save_path
```
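A short usage sketch for the module, assuming a wallpaper already sits on disk (the file path is illustrative):

```python
from pathlib import Path

image_data = Path("downloaded_wallpapers/sample.jpg").read_bytes()

# Quality check: both scores are normalized to [0, 1]
clarity, noise = ImageIntelligence.detect_image_quality(image_data)
print(f"clarity={clarity:.2f}, low-noise score={noise:.2f}")

if ImageIntelligence.is_suitable_for_wallpaper(image_data):
    # Dominant colors, sorted by how much of the image they cover
    colors = ImageIntelligence.extract_dominant_colors(image_data, n_colors=5)
    for c in colors:
        print(c["hex"], f"{c['proportion']:.1%}")
    ImageIntelligence.generate_color_palette_image(colors, "palette.png")
```

Note that `extract_dominant_colors()` runs K-means over every pixel; for 4K images it may be worth downsampling first.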
### 3. Configuration-File Management

```python
"""
Configuration-file manager.
Supports JSON, YAML, and TOML formats.
"""
import json
import tomllib  # Python 3.11+
from pathlib import Path
from typing import Any, Dict

import yaml


class ConfigManager:
    """Configuration manager."""

    def __init__(self, config_file: str = "config.yaml"):
        self.config_file = Path(config_file)
        self.config = self.load_config()

    def load_config(self) -> Dict[str, Any]:
        """Load the config file, creating a default one if missing."""
        if not self.config_file.exists():
            # Write a default configuration
            default_config = {
                "crawler": {
                    "base_url": "https://wallhaven.cc",
                    "max_pages": 20,
                    "concurrent_requests": 20,
                    "save_dir": "wallpapers",
                    "min_resolution": [1920, 1080],
                    "categories": "111",
                    "purity": "100",
                },
                "image_processing": {
                    "enable_duplicate_check": True,
                    "quality_threshold": 0.3,
                    "extract_colors": True,
                },
                "performance": {
                    "timeout": 30,
                    "max_retries": 3,
                    "delay_range": [0.5, 2.0],
                },
            }
            self.save_config(default_config)
            return default_config

        # Pick the loader by file extension
        suffix = self.config_file.suffix.lower()
        if suffix == ".json":
            with open(self.config_file, "r", encoding="utf-8") as f:
                return json.load(f)
        elif suffix in (".yaml", ".yml"):
            with open(self.config_file, "r", encoding="utf-8") as f:
                return yaml.safe_load(f)
        elif suffix == ".toml":
            with open(self.config_file, "rb") as f:
                return tomllib.load(f)
        else:
            raise ValueError(f"Unsupported config format: {suffix}")

    def save_config(self, config: Dict[str, Any]):
        """Save the configuration."""
        suffix = self.config_file.suffix.lower()
        if suffix == ".json":
            with open(self.config_file, "w", encoding="utf-8") as f:
                json.dump(config, f, indent=2, ensure_ascii=False)
        elif suffix in (".yaml", ".yml"):
            with open(self.config_file, "w", encoding="utf-8") as f:
                yaml.dump(config, f, allow_unicode=True)
        else:
            with open(self.config_file, "w", encoding="utf-8") as f:
                json.dump(config, f, indent=2, ensure_ascii=False)

    def get(self, key: str, default: Any = None) -> Any:
        """Look up a value by dotted path, e.g. "crawler.max_pages"."""
        keys = key.split(".")
        value = self.config
        for k in keys:
            if isinstance(value, dict) and k in value:
                value = value[k]
            else:
                return default
        return value
```
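A usage sketch for `ConfigManager` (on first run it writes the default `config.yaml` shown in `load_config()`):

```python
cfg = ConfigManager("config.yaml")

# Dotted-path lookups into the nested config dict
max_pages = cfg.get("crawler.max_pages", 10)
timeout = cfg.get("performance.timeout", 30)
missing = cfg.get("crawler.no_such_key", "fallback")  # falls back to the default

print(max_pages, timeout, missing)  # with the default config: 20 30 fallback
```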
### 4. Usage Examples and Advanced Features

```python
"""
Advanced usage examples and feature extensions.
Assumes CrawlerConfig, AsyncWallpaperDownloader, AdvancedWallpaperCrawler,
and ImageIntelligence are importable from the modules above.
"""
import argparse
import asyncio
import colorsys
from concurrent.futures import ProcessPoolExecutor


async def advanced_crawler_demo():
    """Advanced crawler demo."""
    config = CrawlerConfig(
        max_pages=5,
        concurrent_requests=10,
        save_dir="4k_wallpapers",
        min_resolution=(3840, 2160),  # 4K minimum
        enable_duplicate_check=True,
    )
    # Create the advanced crawler
    crawler = AdvancedWallpaperCrawler(config)

    # Custom filter example (defined for illustration; not wired into the crawler here)
    def custom_filter(image_data: bytes) -> bool:
        """Keep only dark-theme images (average lightness below 0.5)."""
        colors = ImageIntelligence.extract_dominant_colors(image_data, 3)
        brightness_sum = 0
        for color in colors:
            r, g, b = color["rgb"]
            h, l, s = colorsys.rgb_to_hls(r / 255, g / 255, b / 255)
            brightness_sum += l
        avg_brightness = brightness_sum / len(colors)
        # Prefer dark themes: average lightness < 0.5
        return avg_brightness < 0.5

    # Run the crawler
    print("Crawling 4K dark-theme wallpapers...")
    await crawler.run()
    print("Done")


def setup_argparse():
    """Define the command-line arguments."""
    parser = argparse.ArgumentParser(description="HD wallpaper crawler")
    parser.add_argument("--pages", type=int, default=10,
                        help="number of pages to crawl")
    parser.add_argument("--concurrent", type=int, default=20,
                        help="number of concurrent requests")
    parser.add_argument("--resolution", type=str, default="1920x1080",
                        help="minimum resolution (format: WIDTHxHEIGHT)")
    parser.add_argument("--category", type=str, default="general",
                        choices=["general", "anime", "people", "all"],
                        help="wallpaper category")
    parser.add_argument("--purity", type=str, default="sfw",
                        choices=["sfw", "sketchy", "nsfw", "all"],
                        help="content purity")
    parser.add_argument("--output", type=str, default="wallpapers",
                        help="output directory")
    return parser.parse_args()


async def main_with_args():
    """Entry point driven by command-line arguments."""
    args = setup_argparse()
    # Parse the resolution
    width, height = map(int, args.resolution.split("x"))
    # Category and purity bitmasks
    category_map = {"general": "100", "anime": "010", "people": "001", "all": "111"}
    purity_map = {"sfw": "100", "sketchy": "010", "nsfw": "001", "all": "111"}

    config = CrawlerConfig(
        max_pages=args.pages,
        concurrent_requests=args.concurrent,
        save_dir=args.output,
        min_resolution=(width, height),
        categories=category_map.get(args.category, "111"),
        purity=purity_map.get(args.purity, "100"),
    )
    crawler = AsyncWallpaperDownloader(config)
    await crawler.run()


class BatchWallpaperDownloader:
    """Batch download manager."""

    @staticmethod
    async def download_by_keywords(keywords: list):
        """Download wallpapers for a list of search keywords."""
        tasks = []
        for keyword in keywords:
            config = CrawlerConfig(
                search_url=f"https://wallhaven.cc/search?q={keyword}",
                max_pages=3,
                save_dir=f"wallpapers/{keyword}",
            )
            crawler = AsyncWallpaperDownloader(config)
            tasks.append(crawler.run())
        await asyncio.gather(*tasks)

    @staticmethod
    async def download_top_wallpapers():
        """Download the current toplist."""
        config = CrawlerConfig(
            sorting="toplist",
            order="desc",
            max_pages=10,
            save_dir="top_wallpapers",
        )
        crawler = AsyncWallpaperDownloader(config)
        await crawler.run()


if __name__ == "__main__":
    # Example 1: basic usage
    # asyncio.run(main())

    # Example 2: command-line driven
    # asyncio.run(main_with_args())

    # Example 3: advanced features
    asyncio.run(advanced_crawler_demo())
```

## Technical Deep Dive

### 1. The Advantages of Async Programming

The crawler uses asyncio and aiohttp for genuinely asynchronous I/O, which gives it clear advantages over traditional synchronous requests (see the sketch below):

- **High concurrency**: dozens of requests can be in flight at once without blocking
- **Resource efficiency**: high concurrency on a single thread, with low memory overhead
- **Speed**: I/O wait time is fully overlapped, improving download throughput by roughly 5-10x
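As a minimal, self-contained illustration of the pattern (placeholder URL, not part of the project code): all requests share one session and run under `asyncio.gather`, so the I/O waits overlap instead of accumulating.

```python
import asyncio
import time

import aiohttp

URLS = ["https://example.com/"] * 10  # placeholder targets


async def fetch(session: aiohttp.ClientSession, url: str) -> int:
    """Fetch one URL and return the body size."""
    async with session.get(url) as resp:
        return len(await resp.read())


async def crawl_concurrently() -> None:
    start = time.perf_counter()
    async with aiohttp.ClientSession() as session:
        # All ten requests are in flight at once
        sizes = await asyncio.gather(*(fetch(session, u) for u in URLS))
    print(f"{len(sizes)} requests in {time.perf_counter() - start:.2f}s")


asyncio.run(crawl_concurrently())
```

Run sequentially, ten requests cost roughly ten round-trips; gathered, they cost roughly one.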
### 2. Smart Deduplication

Several strategies combine to keep each saved image unique:

- **Perceptual hash (pHash)**: catches visually similar images (see the Hamming-distance sketch after the main crawler class)
- **MD5 checksum**: exact matching of identical files
- **Resolution filter**: enforces a minimum quality bar
- **Metadata comparison**: EXIF fields can serve as an extra signal

### 3. Anti-Crawler Countermeasures

- **Dynamic User-Agent**: impersonate different browsers
- **Randomized request delays**: mimic human browsing behavior
- **Proxy IP rotation**: avoid IP bans
- **Request-rate control**: adaptive throttling via the semaphore

### 4. Error Handling and Recovery

- **Exponential-backoff retries**: automatic retries on network errors (see `download_with_proxy_rotation`)
- **Resumable runs**: the persisted hash cache lets a new run skip everything already downloaded
- **Failure isolation**: one failed task does not bring down the whole run

## Best Practices

1. **Honor robots.txt**: respect the site's crawling policy
2. **Throttle requests**: avoid putting pressure on the target site
3. **Respect copyright**: personal use only, never commercial
4. **Back up your data**: periodically back up the downloaded wallpapers
5. **Monitor and log**: record crawler state to simplify debugging

## Future Extensions

- **ML-based classification**: use a CNN to auto-categorize wallpapers by theme
- **Color-theme analysis**: recommend wallpapers that match the desktop environment
- **Automatic wallpaper rotation**: integrate with the OS to switch wallpapers on a schedule
- **Quality scoring**: rate wallpapers across multiple dimensions
- **Cross-platform support**: Windows/macOS/Linux

## Summary

This article walked through building a complete HD wallpaper crawler. By combining modern async programming, smart image processing, and anti-anti-crawling techniques, the result is an efficient, stable, and extensible download tool that can readily grow more advanced features as needed.
