基于Searx搜索引擎的Python信息搜索工具
Python版本:3.7
代码如下:
import logging
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List

import requests
from bs4 import BeautifulSoup

# Log to both a file and the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.FileHandler('searx_search.log'),
        logging.StreamHandler()
    ]
)

# URL serving a newline-separated list of Searx instance base URLs.
SEARX_INSTANCES_URL = 'https://data.myfpga.cn/searx.txt'
# Maximum number of instances queried concurrently.
MAX_CONCURRENT = 3


class SearxSearcher:
    """Query several Searx instances concurrently and scrape their HTML results.

    Usable as a context manager so the thread pool and HTTP session are
    released deterministically (the original version leaked both).
    """

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'Mozilla/5.0'})
        # NOTE: performs a network request at construction time.
        self.search_instances = self._load_instances()
        self.executor = ThreadPoolExecutor(max_workers=MAX_CONCURRENT)

    def __enter__(self) -> 'SearxSearcher':
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> bool:
        self.close()
        return False

    def close(self) -> None:
        """Shut down the thread pool and close the HTTP session.

        Fixes the resource leak in the original, which never released either.
        """
        self.executor.shutdown(wait=True)
        self.session.close()

    def _load_instances(self) -> List[str]:
        """Fetch the Searx instance list from SEARX_INSTANCES_URL.

        Returns at most 10 non-empty instance URLs; on any failure, logs the
        error and falls back to a single known-good instance.
        """
        try:
            response = self.session.get(SEARX_INSTANCES_URL, timeout=10)
            lines = response.text.split('\n')
            return [line.strip() for line in lines if line.strip()][:10]
        except Exception as e:
            logging.error("实例加载失败: %s", e)
            return ["https://search.us.projectsegfau.lt"]

    def search(self, query: str, pages: int = 10) -> List[Dict]:
        """Search *query* on up to MAX_CONCURRENT instances in parallel.

        Results from all instances are merged (in completion order) and
        truncated to at most ``pages * 10`` entries — i.e. roughly 10
        results per requested page.
        """
        futures = {
            self.executor.submit(self._search_instance, instance, query, pages): instance
            for instance in self.search_instances[:MAX_CONCURRENT]
        }
        results = []
        for future in as_completed(futures):
            # _search_instance swallows its own errors, so result() is safe.
            instance_results = future.result()
            if instance_results:
                results.extend(instance_results)
        # Cap the merged list at ~10 results per requested page.
        return results[:pages * 10]

    @staticmethod
    def _parse_article(article) -> Dict:
        """Extract title/url/content from one result <article> element.

        Each tag is looked up once (the original called find() twice per
        field); missing fields fall back to placeholder strings.
        """
        title_tag = article.find('h3')
        link_tag = article.find('a', class_='url_header')
        content_tag = article.find('p', class_='content')
        return {
            'title': title_tag.get_text(strip=True) if title_tag else '无标题',
            'url': link_tag['href'] if link_tag else '无URL',
            'content': content_tag.get_text(strip=True) if content_tag else '无内容',
        }

    def _search_instance(self, instance: str, query: str, pages: int) -> List[Dict]:
        """Scrape up to *pages* result pages from a single Searx instance.

        Stops early (returning what was gathered so far) on an HTTP error,
        a page with no results container, or any parsing exception.
        """
        results = []
        for page in range(1, pages + 1):
            try:
                response = self.session.get(
                    f"{instance}/search",
                    params={
                        'q': query,
                        'category_general': 1,
                        'language': 'auto',
                        'time_range': '',
                        'safesearch': 0,
                        'theme': 'simple',
                        'pageno': page
                    },
                    timeout=15
                )
                if not response.ok:
                    logging.warning("请求失败: %s 第 %s 页", instance, page)
                    break
                soup = BeautifulSoup(response.text, 'html.parser')
                main_div = soup.find('div', id='results')
                if not main_div:
                    logging.warning("未找到结果: %s 第 %s 页", instance, page)
                    break
                for article in main_div.find_all('article', class_='result'):
                    results.append(self._parse_article(article))
                time.sleep(0.5)  # throttle to avoid hammering the instance
            except Exception as e:
                logging.error("解析失败: %s 第 %s 页 - %s", instance, page, e)
                break
        return results


if __name__ == "__main__":
    # Context manager guarantees the pool/session are released on exit.
    with SearxSearcher() as searcher:
        query = "myfpga.cn"
        results = searcher.search(query, pages=10)
        for i, result in enumerate(results, 1):
            print(f"结果 {i}:")
            print(f"标题: {result['title']}")
            print(f"URL: {result['url']}")
            print(f"内容: {result['content']}")
            print("-" * 80)