(Original) Combining a web image crawler with PaddlePaddle to crawl and classify images
# First, the test code in Python. For server-side deployment, see the end of this post.
import os
import time
import argparse
import requests
import re
import io
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
import cv2
from shutil import copyfile
import numpy as np
import paddlex as pdx
import urllib3
urllib3.disable_warnings()  # the requests below use verify=False, so silence the InsecureRequestWarning
model = pdx.load_model('./inference_model')  # load the exported PaddleX classification model
headers = {
'User-Agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'
}
img_regex = re.compile(r'(https?:)?//\S+\.(jpg|jpeg|png|gif|bmp)', re.IGNORECASE)  # absolute or protocol-relative image URLs
# Get all image links and page links from a page
def get_links(url, timeout=10):
try:
response = requests.get(url, headers=headers, timeout=timeout, verify=False)
        response.raise_for_status()  # raise if the status code is not 2xx
    except requests.exceptions.RequestException as e:
        print(f"Error requesting {url}: {e}")
        return ([], [])
html = response.text
soup = BeautifulSoup(html, 'html.parser')
img_links = [img.get('src') for img in soup.find_all('img')]
page_links = [a.get('href') for a in soup.find_all('a', href=True)]
return (img_links, page_links)
# Download a single image, then classify it with the PaddleX model
def download_img(img_url, save_path, timeout=10):
try:
img_name = os.path.basename(img_url)
img_data = requests.get(img_url, headers=headers, timeout=timeout, verify=False).content
except requests.exceptions.RequestException as e:
print(f"下载 {img_url} 时出错:{e}")
return
# 校验图片是否完整
if not is_valid_image(img_data):
print(f"下载 {img_url} 时出错:图片不完整或者损坏")
return
# 获取图片尺寸
img = Image.open(io.BytesIO(img_data))
width, height = img.size
# 过滤掉尺寸小于 224x224 的图片
if width < 224 or height < 224:
return
# 保存图片
with open(os.path.join(save_path, img_name), 'wb') as f:
f.write(img_data)
    # Classify the image with the loaded model
    result_path = "./PredictImg"
    try:
        im = cv2.imdecode(np.fromfile(os.path.join(save_path, img_name), dtype=np.uint8), -1)
        im = im.astype('float32')
        result = model.predict(im)
        category_dir = os.path.join(result_path, result[0]['category'])
        if not os.path.exists(category_dir):  # create the per-category folder if needed
            os.makedirs(category_dir)
        if result[0]['score'] > 0.9:
            copyfile(os.path.join(save_path, img_name), os.path.join(category_dir, img_name))
            print('OK:' + os.path.join(category_dir, img_name))
            os.remove(os.path.join(save_path, img_name))
    except Exception as e:
        print('ERROR:' + os.path.join(save_path, img_name) + f' ({e})')
# Check whether the image data is complete
def is_valid_image(img_data):
try:
Image.open(io.BytesIO(img_data)).verify()
return True
    except Exception:
        return False
# Crawl pages up to max_depth and download every image found
def download_all_images(url, save_path, max_depth=3, delay=0.5, timeout=10):
    visited_links = set()  # links that have already been visited
    download_queue = [(url, 0)]  # queue of (link, depth) pairs still to crawl
    page_count = 0  # number of pages visited successfully
    img_count = 0  # number of images downloaded successfully
while download_queue:
url, depth = download_queue.pop(0)
if depth > max_depth:
continue
if url in visited_links:
continue
try:
response = requests.get(url, headers=headers, timeout=timeout, verify=False)
            response.raise_for_status()  # raise if the status code is not 2xx
html = response.text
soup = BeautifulSoup(html, 'html.parser')
img_links = [
img.get('src')
for img in soup.find_all('img')
if img.get('src') and img_regex.match(img.get('src'))
]
except Exception as e:
print(f"访问 {url} 时出错:{e}")
continue
        # Download all images on the current page
with ThreadPoolExecutor(max_workers=10) as executor:
futures = []
            for img_url in img_links:
                if not img_url.startswith('http'):
                    img_url = urljoin(url, img_url)
                futures.append(
                    executor.submit(download_img, img_url, save_path, timeout=timeout))
            for future in as_completed(futures):
                if future.exception() is not None:
                    print(f"Error downloading an image: {future.exception()}")
                else:
                    img_count += 1
        # Queue every link found on the current page
for page_link in set(get_links(url)[1]):
if not page_link.startswith('http'):
page_link = urljoin(url, page_link)
if page_link not in visited_links:
download_queue.append((page_link, depth + 1))
visited_links.add(url)
page_count += 1
print(f"已成功访问 {page_count} 个页面,已成功下载 {img_count} 张图片")
# 暂停一段时间,防止访问过快被封 IP
time.sleep(delay)
# Read the list of start URLs from a txt file
def read_urls_from_file(file_path):
urls = []
try:
        with open(file_path, 'r') as file:
            urls = [line.strip() for line in file if line.strip()]
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
return urls
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Download Images from URLs')
parser.add_argument('--url_file', type=str, help='Path to the file containing the URLs', required=True)
parser.add_argument('--save_path', type=str, help='Path to save the downloaded images', default='./Images')
parser.add_argument('--max_depth', type=int, help='Maximum depth for crawling', default=3)
args = parser.parse_args()
urls = read_urls_from_file(args.url_file)
if not os.path.exists(args.save_path):
os.makedirs(args.save_path)
for url in urls:
        download_all_images(url, args.save_path, max_depth=args.max_depth)

This code is a program for downloading images from a given list of website URLs. It uses Python's requests library to fetch page content over HTTP and BeautifulSoup to parse the HTML and extract image links and page links.
The main logic of the code is as follows:
Import the required libraries and modules.
Define global variables and constants, including the request headers and the regular expression for image links.
Define several functions:
get_links(url, timeout): fetch all image links and page links from a given page.
download_img(img_url, save_path, timeout): download an image and run the classification prediction on it.
is_valid_image(img_data): check whether the image data is complete.
download_all_images(url, save_path, max_depth, delay, timeout): recursively download the images from all pages within the given depth.
read_urls_from_file(file_path): read the list of start URLs from a text file.
In the if __name__ == '__main__': block, parse the command-line arguments with argparse, including the save path and the maximum crawl depth.
Read the start URLs from the given text file.
Iterate over the URLs, calling download_all_images for each one and saving the images to the given path.
The code downloads images with multiple threads, collects image and page links by parsing the HTML, and uses the PaddleX library to load a pre-trained model for image classification. It also includes basic error handling, such as catching request exceptions and verifying that each image is complete. Finally, a delay between page visits keeps the crawl rate low enough to avoid getting the IP blocked. A sample invocation is shown below.
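For reference, a typical invocation could look like the following (the script name crawler_classify.py and the URLs in urls.txt are only placeholders, not part of the original code):

python crawler_classify.py --url_file urls.txt --save_path ./Images --max_depth 2

where urls.txt simply lists one start URL per line, for example:

https://example.com/gallery/
https://example.com/news/

Images are first written to --save_path; those classified with a score above 0.9 are then copied into a per-category folder under ./PredictImg and removed from the download folder.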
Of course, you can also deploy this on the server side. The PHP code below is still at the testing stage and is provided for reference only.
<?php
use GuzzleHttp\Client;
use GuzzleHttp\Exception\RequestException;
use GuzzleHttp\Pool;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;
use Symfony\Component\DomCrawler\Crawler;
require 'vendor/autoload.php';
$client = new Client();
$headers = [
'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'
];
$img_regex = '/(http|https)?:\/\/[^\s]*\.(jpg|jpeg|png|gif|bmp)/i';
// Define a function to get all image links and page links from a website
function getLinks($url, $timeout = 10)
{
    global $headers;
    try {
        $client = new Client();
        $response = $client->get($url, ['headers' => $headers, 'timeout' => $timeout]);
$html = $response->getBody()->getContents();
} catch (RequestException $e) {
echo "请求 $url 时出错:" . $e->getMessage();
return [[], []];
}
$crawler = new Crawler($html);
$imgLinks = $crawler->filter('img')->each(function ($node) {
return $node->attr('src');
});
$pageLinks = $crawler->filter('a[href]')->each(function ($node) {
return $node->attr('href');
});
return [$imgLinks, $pageLinks];
}
// Define a function to download an image
function downloadImg($imgUrl, $savePath, $timeout = 10)
{
    global $client, $headers;
    try {
        $imgName = basename($imgUrl);
        $response = $client->get($imgUrl, ['headers' => $headers, 'timeout' => $timeout]);
$imgData = $response->getBody()->getContents();
} catch (RequestException $e) {
echo "下载 $imgUrl 时出错:" . $e->getMessage();
return;
}
// Check if the image is valid
if (!isValidImage($imgData)) {
echo "下载 $imgUrl 时出错:图片不完整或损坏";
return;
}
// Save the image
file_put_contents($savePath . '/' . $imgName, $imgData);
// Predict the image
$resultPath = './PredictImg';
try {
$im = imagecreatefromstring($imgData);
imagealphablending($im, true);
imagesavealpha($im, true);
imagepng($im, $savePath . '/' . $imgName);
imagedestroy($im);
$im = imagecreatefrompng($savePath . '/' . $imgName);
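        // NOTE: modelPredict() is not defined anywhere in this script. It stands in for a call to
        // your own prediction backend, since PHP cannot run the PaddleX model directly;
        // one possible Python service it could call is sketched at the end of this post.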
$result = modelPredict($im);
$isExists = is_dir($resultPath . '/' . $result[0]['category']);
if (!$isExists) {
mkdir($resultPath . '/' . $result[0]['category'], 0755, true);
}
if ($result[0]['score'] > 0.9) {
copy($savePath . '/' . $imgName, $resultPath . '/' . $result[0]['category'] . '/' . $imgName);
            echo 'OK: ' . $resultPath . '/' . $result[0]['category'] . '/' . $imgName . "\n";
unlink($savePath . '/' . $imgName);
}
} catch (\Throwable $th) {
echo 'ERROR: ' . $savePath . '/' . $imgName . "\n";
}
}
// Define a function to check if an image is valid
function isValidImage($imgData)
{
try {
$im = imagecreatefromstring($imgData);
if ($im !== false) {
imagedestroy($im);
return true;
} else {
return false;
}
} catch (\Throwable $th) {
return false;
}
}
// Define a function to download all images from all pages
function downloadAllImages($url, $savePath, $maxDepth = 3, $delay = 0.5, $timeout = 10)
{
    global $client, $headers, $img_regex;
    $visitedLinks = [];
$downloadQueue = [[$url, 0]];
$pageCount = 0;
$imgCount = 0;
while (!empty($downloadQueue)) {
[$url, $depth] = array_shift($downloadQueue);
if ($depth > $maxDepth) {
continue;
}
if (in_array($url, $visitedLinks)) {
continue;
}
try {
$response = $client->get($url, ['headers' => $headers, 'timeout' => $timeout]);
$html = $response->getBody()->getContents();
} catch (RequestException $e) {
echo "访问 $url 时出错:" . $e->getMessage();
continue;
}
$crawler = new Crawler($html);
        $imgLinks = $crawler->filter('img')->each(function ($node) use ($img_regex) {
            $imgSrc = $node->attr('src');
            if ($imgSrc !== null && preg_match($img_regex, $imgSrc)) {
                return $imgSrc;
            }
            return null;
        });
        $imgLinks = array_values(array_filter($imgLinks)); // drop nodes whose src did not match
        // Download images from the current page concurrently
        $requests = function () use ($imgLinks, $headers) {
            foreach ($imgLinks as $imgUrl) {
                yield new Request('GET', $imgUrl, $headers);
            }
        };
        $pool = new Pool($client, $requests(), [
            'concurrency' => 10,
            'fulfilled' => function (Response $response, $index) use ($imgLinks, $savePath, &$imgCount) {
                // The image is reachable; let downloadImg fetch, validate, save and classify it
                downloadImg($imgLinks[$index], $savePath);
                $imgCount++;
            },
            'rejected' => function ($reason, $index) use ($imgLinks) {
                echo "Error downloading {$imgLinks[$index]}: " . $reason->getMessage() . "\n";
            },
        ]);
        $promise = $pool->promise();
        $promise->wait();
        // Get all page links from the current page
        $pageLinks = $crawler->filter('a[href]')->each(function ($node) {
            return $node->attr('href');
        });
// Add the page links to the download queue
foreach ($pageLinks as $pageLink) {
            if (!empty($pageLink) && preg_match('/^https?:\/\//i', $pageLink) && !in_array($pageLink, $visitedLinks)) { // only queue absolute links
$downloadQueue[] = [$pageLink, $depth + 1];
}
}
$visitedLinks[] = $url;
$pageCount++;
echo "已成功访问 $pageCount 个页面,已成功下载 $imgCount 张图片\n";
// Pause for a while to avoid being blocked
usleep($delay * 1000000);
}
}
// Define a function to read URLs from a file
function readUrlsFromFile($filePath)
{
$urls = [];
try {
$file = fopen($filePath, 'r');
if ($file) {
while (($line = fgets($file)) !== false) {
$urls[] = trim($line);
}
fclose($file);
}
} catch (\Throwable $th) {
echo "读取文件 $filePath 时出错:" . $th->getMessage();
}
return $urls;
}
// Main code
$urlFile = isset($argv[1]) ? $argv[1] : null;
$savePath = './Images';
$maxDepth = 3;
if (empty($urlFile)) {
echo "请提供包含URL的文件路径\n";
exit(1);
}
$urls = readUrlsFromFile($urlFile);
if (!is_dir($savePath)) {
mkdir($savePath, 0755, true);
}
foreach ($urls as $url) {
downloadAllImages($url, $savePath, $maxDepth);
}
?>
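As noted above, modelPredict() in the PHP version is only a placeholder. One way to back it is a small HTTP service that wraps the same pdx.load_model / model.predict calls used in the Python crawler. The sketch below assumes Flask is installed; the /predict route, the "image" form field, and port 8866 are arbitrary choices for illustration, not part of the original code.

# Minimal prediction service sketch (assumes Flask; route name, field name and port are illustrative)
import cv2
import numpy as np
import paddlex as pdx
from flask import Flask, request, jsonify

app = Flask(__name__)
model = pdx.load_model('./inference_model')  # same exported PaddleX model as in the crawler

@app.route('/predict', methods=['POST'])
def predict():
    # Expect the raw image bytes in the "image" field of a multipart POST
    img_bytes = request.files['image'].read()
    im = cv2.imdecode(np.frombuffer(img_bytes, dtype=np.uint8), -1)
    if im is None:
        return jsonify({'error': 'cannot decode image'}), 400
    im = im.astype('float32')
    result = model.predict(im)
    # result[0] carries 'category' and 'score', the same fields used by the crawler above
    return jsonify({'category': result[0]['category'], 'score': float(result[0]['score'])})

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8866)

On the PHP side, modelPredict() would then POST the image to this endpoint (for example with Guzzle's multipart option) and decode the JSON response into the same ['category' => ..., 'score' => ...] shape the code above expects.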