基于Searx搜索引擎的Python信息搜索工具
Python版本:3.7
代码如下:
import logging
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List

import requests
from bs4 import BeautifulSoup

# Log to both a file and the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.FileHandler('searx_search.log'),
        logging.StreamHandler()
    ]
)

# URL serving a newline-separated list of Searx instance base URLs.
SEARX_INSTANCES_URL = 'https://data.myfpga.cn/searx.txt'
# Maximum number of instances queried concurrently.
MAX_CONCURRENT = 3


class SearxSearcher:
    """Query several Searx instances concurrently and scrape their HTML results.

    Usable as a context manager so the thread pool and HTTP session are
    released deterministically (the original version leaked both).
    """

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'Mozilla/5.0'})
        # NOTE: performs a network request at construction time.
        self.search_instances = self._load_instances()
        self.executor = ThreadPoolExecutor(max_workers=MAX_CONCURRENT)

    def __enter__(self) -> 'SearxSearcher':
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> bool:
        self.close()
        return False

    def close(self) -> None:
        """Shut down the thread pool and close the HTTP session.

        Fixes the resource leak in the original, which never released either.
        """
        self.executor.shutdown(wait=True)
        self.session.close()

    def _load_instances(self) -> List[str]:
        """Fetch the Searx instance list from SEARX_INSTANCES_URL.

        Returns at most 10 non-empty instance URLs; on any failure, logs the
        error and falls back to a single known-good instance.
        """
        try:
            response = self.session.get(SEARX_INSTANCES_URL, timeout=10)
            lines = response.text.split('\n')
            return [line.strip() for line in lines if line.strip()][:10]
        except Exception as e:
            logging.error("实例加载失败: %s", e)
            return ["https://search.us.projectsegfau.lt"]

    def search(self, query: str, pages: int = 10) -> List[Dict]:
        """Search *query* on up to MAX_CONCURRENT instances in parallel.

        Results from all instances are merged (in completion order) and
        truncated to at most ``pages * 10`` entries — i.e. roughly 10
        results per requested page.
        """
        futures = {
            self.executor.submit(self._search_instance, instance, query, pages): instance
            for instance in self.search_instances[:MAX_CONCURRENT]
        }
        results = []
        for future in as_completed(futures):
            # _search_instance swallows its own errors, so result() is safe.
            instance_results = future.result()
            if instance_results:
                results.extend(instance_results)
        # Cap the merged list at ~10 results per requested page.
        return results[:pages * 10]

    @staticmethod
    def _parse_article(article) -> Dict:
        """Extract title/url/content from one result <article> element.

        Each tag is looked up once (the original called find() twice per
        field); missing fields fall back to placeholder strings.
        """
        title_tag = article.find('h3')
        link_tag = article.find('a', class_='url_header')
        content_tag = article.find('p', class_='content')
        return {
            'title': title_tag.get_text(strip=True) if title_tag else '无标题',
            'url': link_tag['href'] if link_tag else '无URL',
            'content': content_tag.get_text(strip=True) if content_tag else '无内容',
        }

    def _search_instance(self, instance: str, query: str, pages: int) -> List[Dict]:
        """Scrape up to *pages* result pages from a single Searx instance.

        Stops early (returning what was gathered so far) on an HTTP error,
        a page with no results container, or any parsing exception.
        """
        results = []
        for page in range(1, pages + 1):
            try:
                response = self.session.get(
                    f"{instance}/search",
                    params={
                        'q': query,
                        'category_general': 1,
                        'language': 'auto',
                        'time_range': '',
                        'safesearch': 0,
                        'theme': 'simple',
                        'pageno': page
                    },
                    timeout=15
                )
                if not response.ok:
                    logging.warning("请求失败: %s 第 %s 页", instance, page)
                    break
                soup = BeautifulSoup(response.text, 'html.parser')
                main_div = soup.find('div', id='results')
                if not main_div:
                    logging.warning("未找到结果: %s 第 %s 页", instance, page)
                    break
                for article in main_div.find_all('article', class_='result'):
                    results.append(self._parse_article(article))
                time.sleep(0.5)  # throttle to avoid hammering the instance
            except Exception as e:
                logging.error("解析失败: %s 第 %s 页 - %s", instance, page, e)
                break
        return results


if __name__ == "__main__":
    # Context manager guarantees the pool/session are released on exit.
    with SearxSearcher() as searcher:
        query = "myfpga.cn"
        results = searcher.search(query, pages=10)
        for i, result in enumerate(results, 1):
            print(f"结果 {i}:")
            print(f"标题: {result['title']}")
            print(f"URL: {result['url']}")
            print(f"内容: {result['content']}")
            print("-" * 80)