(Original) Scraping images from any website with Python, for learning purposes only
import os
import time
import argparse
import requests
import re
import io
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Proxy and browser User-Agent settings
proxies = {
    'http': 'http://127.0.0.1:20171',
    'https': 'http://127.0.0.1:20171'
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'
}

# Regular expression used to match image links
img_regex = re.compile(r'https?://\S+\.(jpg|jpeg|png|gif|bmp)', re.IGNORECASE)

# Collect all image links and page links found on a page
def get_links(url, timeout=10, use_proxy=False):
    try:
        if use_proxy:
            response = requests.get(url, proxies=proxies, headers=headers, timeout=timeout, verify=False)
        else:
            response = requests.get(url, headers=headers, timeout=timeout, verify=False)
        response.raise_for_status()  # Raise an exception if the status code is not 200
    except requests.exceptions.RequestException as e:
        print(f"Error requesting {url}: {e}")
        return ([], [])
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    img_links = []
    page_links = []
    for img in soup.find_all('img'):
        img_links.append(img.get('src'))
    for a in soup.find_all('a', href=True):
        page_links.append(a.get('href'))
    return (img_links, page_links)

# Download a single image
def download_img(img_url, save_path, timeout=10, use_proxy=False):
    try:
        img_name = os.path.basename(img_url)
        if use_proxy:
            img_data = requests.get(img_url, proxies=proxies, headers=headers, timeout=timeout, verify=False).content
        else:
            img_data = requests.get(img_url, headers=headers, timeout=timeout, verify=False).content
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {img_url}: {e}")
        return

    # Verify that the image data is complete
    if not is_valid_image(img_data):
        print(f"Error downloading {img_url}: image is incomplete or corrupted")
        return

    # Read the image dimensions
    img = Image.open(io.BytesIO(img_data))
    width, height = img.size

    # Skip images smaller than 224x224
    if width < 224 or height < 224:
        return

    # Save the image
    with open(os.path.join(save_path, img_name), 'wb') as f:
        f.write(img_data)

# Check whether the image data is a complete, valid image
def is_valid_image(img_data):
    try:
        Image.open(io.BytesIO(img_data)).verify()
        return True
    except Exception:
        return False

# Crawl pages breadth-first and download the images found on every page
def download_all_images(url, save_path, max_depth=3, delay=0.5, timeout=10, use_proxy=False):
    visited_links = set()        # Links that have already been visited
    download_queue = [(url, 0)]  # Queue of (link, depth) pairs waiting to be processed
    page_count = 0               # Number of pages visited successfully
    img_count = 0                # Number of images downloaded successfully

    # Create a headless Chrome instance
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # Headless mode: no browser window
    driver = webdriver.Chrome(options=chrome_options)

    while download_queue:
        url, depth = download_queue.pop(0)
        if depth > max_depth:
            continue
        if url in visited_links:
            continue

        # Open the page with selenium so the browser executes its JavaScript
        try:
            driver.get(url)
            time.sleep(1)
            html = driver.page_source
            soup = BeautifulSoup(html, 'html.parser')
            img_links = []
            for img in soup.find_all('img'):
                img_src = img.get('src')
                if img_src and img_regex.match(img_src):
                    img_links.append(img_src)
        except Exception as e:
            print(f"Error visiting {url}: {e}")
            continue

        # Download all images on the current page using a thread pool
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = []
            for img_url in img_links:
                if not img_url.startswith('http'):
                    img_url = urljoin(url, img_url)
                futures.append(executor.submit(download_img, img_url, save_path, timeout=timeout, use_proxy=use_proxy))
            for future in as_completed(futures):
                if future.exception() is not None:
                    print(f"Error downloading image: {future.exception()}")
                else:
                    img_count += 1

        # Add all links found on the current page to the queue
        for page_link in set(get_links(url, use_proxy=use_proxy)[1]):
            if not page_link.startswith('http'):
                page_link = urljoin(url, page_link)
            if page_link not in visited_links:
                download_queue.append((page_link, depth + 1))

        visited_links.add(url)
        page_count += 1
        print(f"Visited {page_count} pages, downloaded {img_count} images")

        # Pause briefly so we do not hit the site too fast and get the IP banned
        time.sleep(delay)

    driver.quit()

# Read the list of website URLs to crawl from a txt file
def read_urls_from_file(file_path):
    urls = []
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            urls.append(line)
    return urls

# Command-line arguments
parser = argparse.ArgumentParser(description='Download all images from a website.')
parser.add_argument('-u', '--url', help='The URL of the website to download images from.')
parser.add_argument('-f', '--file', help='The path to a file containing URLs of websites to download images from.')
parser.add_argument('-d', '--depth', type=int, default=3, help='The maximum depth to crawl.')
parser.add_argument('-o', '--output', default='images', help='The output directory for the downloaded images.')
parser.add_argument('-t', '--timeout', type=int, default=10, help='The timeout for requests.')
parser.add_argument('-p', '--proxy', action='store_true', help='Use proxy to download images.')
args = parser.parse_args()

# Collect the website URLs to crawl
urls = []
if args.url:
    urls.append(args.url)
elif args.file:
    urls = read_urls_from_file(args.file)
else:
    print('Please provide the URL of the website to crawl, or the path to a file of URLs')

# Create the output directory
if not os.path.exists(args.output):
    os.makedirs(args.output)

# Crawl the images from every website
for url in urls:
    print(f'Start crawling images from {url}...')
    download_all_images(url, args.output, max_depth=args.depth, timeout=args.timeout, use_proxy=args.proxy)
    print(f'Finished crawling images from {url}')
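Before running the script, install its dependencies. A minimal setup, assuming pip and Chrome are already available (older selenium releases also need a matching chromedriver on the PATH; selenium 4.6+ can fetch the driver automatically):

pip install requests beautifulsoup4 pillow selenium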
Usage:
python download_images.py -u <url>
You can also pass -d to set the crawl depth; the default is 3 levels.
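A few more example invocations based on the arguments defined in the script (urls.txt and the site address below are placeholders to replace with your own):

# Crawl every site listed in urls.txt (one URL per line, blank lines ignored),
# save the images to ./photos, and send the requests-based downloads through
# the local proxy configured at the top of the script
python download_images.py -f urls.txt -o photos -p

# Crawl a single site only one level deep with a 5-second request timeout
python download_images.py -u https://example.com -d 1 -t 5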