init: Search Hub - 统一多搜索引擎聚合服务

This commit is contained in:
2026-05-09 18:46:05 +08:00
commit 81d726179c
27 changed files with 3179 additions and 0 deletions

0
providers/__init__.py Normal file
View File

137
providers/ai_provider.py Normal file
View File

@@ -0,0 +1,137 @@
"""AI 总结服务 — 基于 OpenCodeZen / OpenAI 兼容 API"""
import json
import time
import requests
from providers.base import BaseProvider
def build_prompts(query, results):
    """Build the (system_prompt, user_prompt) pair for the summary request.

    Args:
        query: The user's search query; interpolated verbatim into the prompt.
        results: Iterable of search results. Each item is either a dict or an
            object exposing ``to_dict()``; the ``title``, ``url`` and
            ``content`` keys are read, with missing keys rendered as empty
            strings. ``None`` is treated as "no results".

    Returns:
        A ``(system_prompt, user_prompt)`` tuple of strings.
    """
    entries = []
    for index, item in enumerate(results or [], 1):
        # Accept result objects as well as plain dicts so callers do not have
        # to convert before asking for a summary.
        if not isinstance(item, dict) and hasattr(item, 'to_dict'):
            item = item.to_dict()
        entries.append(
            f"[{index}] 标题: {item.get('title', '')}\n"
            f" 来源: {item.get('url', '')}\n"
            f" 内容: {item.get('content', '')}\n\n"
        )
    # join() instead of repeated += keeps prompt assembly linear.
    search_text = ''.join(entries)
    system_prompt = (
        '你是一个专业的搜索结果分析助手。'
        '请根据用户搜索词和搜索结果,生成一份结构化的中文总结报告。'
        '要求:\n'
        '1. 先概括本次搜索的核心主题\n'
        '2. 列出关键发现和重要信息点(分点说明)\n'
        '3. 指出不同信息来源之间的共识与分歧(如有)\n'
        '4. 给出综合结论\n'
        '使用简洁的 Markdown 格式,不要过于冗长。'
    )
    user_prompt = f'## 搜索词\n{query}\n\n## 搜索结果\n{search_text}\n\n请根据以上搜索结果生成总结报告。'
    return system_prompt, user_prompt
class AIProvider(BaseProvider):
    """Summarise search results through an OpenAI-compatible chat API.

    Connection settings are read from ``config['opencodezen']`` (``api_key``,
    ``base_url``, ``model``). This provider does not search: ``search()``
    always returns an empty list.
    """

    name = 'ai'
    display_name = 'AI 总结'
    needs_api_key = True
    enabled = False
    priority = 50

    def __init__(self, config: dict):
        super().__init__(config)
        oc = config.get('opencodezen') or {}
        self.api_key = oc.get('api_key')
        # `or` instead of a .get() default: an explicit null/empty value in the
        # config falls back to the default rather than breaking .rstrip().
        self.base_url = (oc.get('base_url') or 'https://opencode.ai/zen/go/v1').rstrip('/')
        self.model = oc.get('model') or 'deepseek-v4-flash'

    def is_available(self) -> bool:
        """Return True when an API key is configured."""
        return bool(self.api_key)

    def search(self, query: str, max_results: int = 5) -> list:
        """Satisfy the provider interface; this provider yields no results."""
        return []

    def _build_request(self, query: str, results: list, stream: bool = False):
        """Return ``(url, headers, payload)`` for a chat-completions call."""
        system_prompt, user_prompt = build_prompts(query, results)
        url = f'{self.base_url}/chat/completions'
        headers = {
            'Authorization': f'Bearer {self.api_key}',
            'Content-Type': 'application/json',
        }
        payload = {
            'model': self.model,
            'messages': [
                {'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': user_prompt},
            ],
            'max_tokens': 4096,
            'temperature': 0.3,
        }
        if stream:
            payload['stream'] = True
        return url, headers, payload

    @staticmethod
    def _sse(event: str, payload: dict) -> str:
        """Format one server-sent event frame."""
        return f"event: {event}\ndata: {json.dumps(payload)}\n\n"

    @staticmethod
    def _parse_stream_line(raw):
        """Decode one upstream stream line into ``(done, content)``.

        ``done`` is True for the ``[DONE]`` terminator. ``content`` is the
        text delta carried by the line, or ``''`` when the line carries none
        (blank line, non-data field, malformed JSON, empty ``choices``, ...).
        """
        if not raw:
            return False, ''
        text = raw.decode('utf-8', errors='replace') if isinstance(raw, bytes) else raw
        # Accept both "data: {...}" and "data:{...}".
        if not text.startswith('data:'):
            return False, ''
        data_str = text[5:].strip()
        if data_str == '[DONE]':
            return True, ''
        try:
            chunk = json.loads(data_str)
        except json.JSONDecodeError:
            return False, ''
        choices = chunk.get('choices') if isinstance(chunk, dict) else None
        # A chunk with no usable choice must be skipped, not abort the stream.
        if not isinstance(choices, list) or not choices or not isinstance(choices[0], dict):
            return False, ''
        delta = choices[0].get('delta')
        if not isinstance(delta, dict):
            return False, ''
        return False, delta.get('content') or ''

    def summarize(self, query: str, results: list) -> dict:
        """Request a one-shot summary.

        Returns:
            On success: ``summary``, ``model``, ``elapsed`` (seconds) and
            ``usage``. On failure: ``error`` plus an empty ``summary`` (and
            ``elapsed`` once a request was attempted). Request errors are
            reported in the dict rather than raised.
        """
        if not self.is_available():
            return {'error': 'AI 总结未配置', 'summary': ''}
        url, headers, payload = self._build_request(query, results)
        start = time.time()
        try:
            resp = requests.post(url, json=payload, headers=headers, timeout=60)
            elapsed = round(time.time() - start, 2)
            if resp.status_code != 200:
                return {'error': f'AI API 返回 {resp.status_code}', 'summary': '', 'elapsed': elapsed}
            data = resp.json()
            try:
                summary = data['choices'][0]['message']['content']
            except (KeyError, IndexError, TypeError):
                # Report an unexpected body explicitly instead of leaking a
                # bare KeyError text such as "'choices'" to the caller.
                return {'error': 'AI API 响应格式异常', 'summary': '', 'elapsed': elapsed}
            return {
                'summary': summary or '',
                'model': self.model,
                'elapsed': elapsed,
                'usage': data.get('usage', {}),
            }
        except Exception as e:
            # Deliberate boundary: callers receive an error dict, never a raise.
            return {'error': str(e), 'summary': '', 'elapsed': round(time.time() - start, 2)}

    def summarize_stream(self, query: str, results: list):
        """Stream a summary as server-sent-event frames.

        Yields ``event: delta`` frames carrying ``{'content': ...}``, then one
        ``event: meta`` frame with ``model`` and ``elapsed``. Any failure is
        reported as a single ``event: error`` frame.
        """
        if not self.is_available():
            yield self._sse('error', {'error': 'AI 总结未配置'})
            return
        url, headers, payload = self._build_request(query, results, stream=True)
        start = time.time()
        try:
            # The context manager closes the streamed response on every exit
            # path, including the consumer abandoning this generator.
            with requests.post(url, json=payload, headers=headers,
                               stream=True, timeout=120) as resp:
                if resp.status_code != 200:
                    yield self._sse('error', {'error': f'AI API 返回 {resp.status_code}'})
                    return
                for line in resp.iter_lines():
                    done, content = self._parse_stream_line(line)
                    if done:
                        break
                    if content:
                        yield self._sse('delta', {'content': content})
            elapsed = round(time.time() - start, 2)
            yield self._sse('meta', {'model': self.model, 'elapsed': elapsed})
        except Exception as e:
            yield self._sse('error', {'error': str(e)})

168
providers/baidu_provider.py Normal file
View File

@@ -0,0 +1,168 @@
"""百度搜索源 — 通过百度千帆官方 API"""
import json
import time
import requests
from providers.base import BaseProvider, SearchResult
class BaiduProvider(BaseProvider):
    """Baidu search provider backed by the Qianfan HTTP endpoints.

    Two modes share this class: ``'web'`` (plain web search) and
    ``'intelligent'`` (AI answer with cited references).
    """

    name = 'baidu'
    display_name = '百度搜索'
    needs_api_key = True
    enabled = True
    priority = 10  # preferred source in auto mode

    def __init__(self, config: dict, mode='web'):
        """Configure the provider.

        Args:
            config: Application config; the ``baidu`` section supplies
                ``api_key`` and optional ``intelligent_url`` /
                ``web_search_url`` overrides.
            mode: ``'web'`` for web search, ``'intelligent'`` for AI-assisted
                retrieval.
        """
        super().__init__(config)
        self._mode = mode
        if mode == 'intelligent':
            self.name = 'baidu-intelligent'
            self.display_name = '百度智能检索'
            self.priority = 21
            self.enabled = False  # manual selection only; excluded from auto
        bc = config.get('baidu') or {}
        self.api_key = bc.get('api_key')
        self.intelligent_url = bc.get(
            'intelligent_url',
            'https://qianfan.baidubce.com/v2/ai_search/chat/completions',
        )
        self.web_search_url = bc.get(
            'web_search_url',
            'https://qianfan.baidubce.com/v2/ai_search/web_search',
        )

    def is_available(self) -> bool:
        """Return True when an API key is configured."""
        return bool(self.api_key)

    def search(self, query: str, max_results: int = 10) -> list:
        """Run a search in the configured mode; returns SearchResult items."""
        if not self.api_key:
            return []
        if self._mode == 'intelligent':
            # Intelligent retrieval is capped at 3 references to save quota.
            return self._intelligent_search(query, min(max_results, 3))
        return self._web_search(query, max_results)

    def _post_json(self, url: str, payload: dict, timeout: int):
        """POST *payload* and return the decoded JSON object, or None.

        None covers every failure: transport error, non-200 status, a body
        that is not JSON, or JSON that is not an object.
        """
        headers = {
            'Authorization': f'Bearer {self.api_key}',
            'Content-Type': 'application/json',
        }
        try:
            resp = requests.post(url, json=payload, headers=headers, timeout=timeout)
            if resp.status_code != 200:
                return None
            data = resp.json()
        except (requests.exceptions.RequestException, ValueError):
            return None
        return data if isinstance(data, dict) else None

    @staticmethod
    def _result_section(data: dict) -> dict:
        """Return ``data['result']`` when it is a dict, else an empty dict."""
        section = data.get('result')
        return section if isinstance(section, dict) else {}

    def _intelligent_search(self, query: str, max_results: int) -> list:
        """AI-assisted retrieval: cited references, else the AI answer."""
        if max_results <= 0:
            # Mirrors _web_search: do not spend a call when nothing is wanted.
            return []
        payload = {
            'messages': [{'content': query, 'role': 'user'}],
            'stream': False,
            'model': 'ernie-4.5-turbo-128k',
            'enable_corner_markers': True,
            'enable_deep_search': True,
        }
        data = self._post_json(self.intelligent_url, payload, timeout=60)
        if data is None:
            return []
        results = []
        # Prefer the cited references as search results.
        references = (data.get('references')
                      or self._result_section(data).get('references')
                      or [])
        for ref in references[:max_results]:
            if not isinstance(ref, dict):
                continue
            title = ref.get('title', '') or ref.get('name', '')
            url = ref.get('url', '') or ref.get('link', '')
            content = ref.get('summary', '') or ref.get('content', '') or ref.get('snippet', '')
            if title and url:
                results.append(SearchResult(
                    title=title,
                    url=url,
                    content=content,
                    score=0.8,
                    source=self.name,
                ))
        if results:
            return results
        # No usable references: fall back to the AI answer text itself.
        try:
            ai_content = data['choices'][0]['message']['content']
        except (KeyError, IndexError, TypeError):
            ai_content = self._result_section(data).get('answer', '')
        if ai_content:
            from urllib.parse import quote_plus
            results.append(SearchResult(
                title=f'百度AI: {query}',
                # Encode the query so spaces, '&' and '#' cannot break the URL.
                url=f'https://www.baidu.com/s?wd={quote_plus(query)}',
                content=ai_content[:500],
                score=0.7,
                source=self.name,
            ))
        return results

    def _web_search(self, query: str, max_results: int) -> list:
        """Plain web search through the web_search endpoint."""
        if max_results <= 0:
            return []
        payload = {
            'messages': [{'content': query, 'role': 'user'}],
            'search_source': 'baidu_search_v2',
            'resource_type_filter': [{'type': 'web', 'top_k': max_results}],
        }
        data = self._post_json(self.web_search_url, payload, timeout=25)
        if data is None:
            return []
        results = []
        refs = (data.get('references')
                or self._result_section(data).get('items')
                or [])
        for ref in refs[:max_results]:
            if not isinstance(ref, dict):
                continue
            title = ref.get('title', '') or ref.get('name', '')
            url = ref.get('url', '') or ref.get('link', '')
            # Prefer the short snippet over the full content field.
            snippet = ref.get('snippet', '') or ref.get('content', '') or ''
            published = ref.get('date', '') or ref.get('published_date', '')
            if title and url:
                results.append(SearchResult(
                    title=title,
                    url=url,
                    content=snippet[:500],
                    score=0.6,
                    source=self.name,
                    published_date=published,
                ))
        return results

63
providers/base.py Normal file
View File

@@ -0,0 +1,63 @@
"""搜索源抽象基类 — 所有搜索源统一接口"""
from abc import ABC, abstractmethod
class SearchResult:
    """Uniform record for one hit, shared by every search provider."""

    def __init__(self, title='', url='', content='', score=0.0, source='',
                 published_date=''):
        self.title, self.url, self.content = title, url, content
        self.score, self.source = score, source
        self.published_date = published_date

    def to_dict(self):
        """Return the record as a plain dict; a falsy date becomes ''."""
        published = self.published_date if self.published_date else ''
        return dict(
            title=self.title,
            url=self.url,
            content=self.content,
            score=self.score,
            source=self.source,
            published_date=published,
        )
class BaseProvider(ABC):
    """Abstract base that every search provider derives from."""

    name = ''              # unique identifier of the provider
    display_name = ''      # human-readable label
    needs_api_key = False  # whether an API key is required
    enabled = False        # whether the provider is on by default
    priority = 100         # lower number = higher priority

    def __init__(self, config: dict):
        self.config = config

    @abstractmethod
    def search(self, query: str, max_results: int = 10) -> list:
        """Run a search and return a list of SearchResult objects."""
        ...

    def is_available(self) -> bool:
        """Tell whether the provider can currently be used (default: yes)."""
        return True

    def get_status(self) -> dict:
        """Describe the provider's identity and current state as a dict."""
        status = dict(name=self.name, display_name=self.display_name)
        status['available'] = self.is_available()
        status['enabled'] = self.enabled
        status['needs_api_key'] = self.needs_api_key
        status['priority'] = self.priority
        return status

View File

@@ -0,0 +1,142 @@
"""DuckDuckGo 搜索源 — 免费,无需 API key"""
import time
import requests
from providers.base import BaseProvider, SearchResult
class DuckDuckGoProvider(BaseProvider):
    """DuckDuckGo provider: Instant Answer API first, HTML endpoint second.

    Needs no API key. Network failures of either request are tolerated and
    simply contribute no results.
    """

    name = 'duckduckgo'
    display_name = 'DuckDuckGo'
    needs_api_key = False
    enabled = False  # off by default: unreachable from mainland China networks
    priority = 30

    def __init__(self, config: dict):
        super().__init__(config)

    def search(self, query: str, max_results: int = 10) -> list:
        """Return up to *max_results* results, de-duplicated by URL."""
        if max_results <= 0:
            return []
        # De-duplicate before deciding whether more are needed, so unusable
        # or repeated entries do not eat into the requested count.
        results = self._dedupe(self._instant_answers(query))
        if len(results) < max_results:
            # Ask for a full page; overlap with the instant answers is
            # removed by the second de-duplication pass.
            results = self._dedupe(results + self._html_results(query, max_results))
        return results[:max_results]

    @staticmethod
    def _dedupe(results: list) -> list:
        """Drop entries without a URL and keep the first hit per URL."""
        seen_urls = set()
        unique = []
        for item in results:
            if item.url and item.url not in seen_urls:
                seen_urls.add(item.url)
                unique.append(item)
        return unique

    def _instant_answers(self, query: str) -> list:
        """Collect the abstract and related topics from the Instant Answer API."""
        results = []
        params = {
            'q': query,
            'format': 'json',
            'no_html': 1,
            'skip_disambig': 1,
        }
        try:
            resp = requests.get('https://api.duckduckgo.com/', params=params, timeout=8)
            if resp.status_code != 200:
                return results
            data = resp.json()
        except (requests.exceptions.RequestException, ValueError):
            return results
        if not isinstance(data, dict):
            return results
        abstract = data.get('AbstractText', '')
        if abstract and data.get('AbstractURL'):
            results.append(SearchResult(
                title=data.get('Heading') or 'DuckDuckGo 摘要',
                url=data['AbstractURL'],
                content=abstract,
                score=0.9,
                source=self.name,
            ))
        for topic in data.get('RelatedTopics') or []:
            if not isinstance(topic, dict):
                continue
            # A grouped entry carries its items under 'Topics'; take at most 3.
            entries = topic['Topics'][:3] if 'Topics' in topic else [topic]
            for entry in entries:
                if isinstance(entry, dict) and entry.get('Text'):
                    results.append(SearchResult(
                        title=entry.get('Text', '')[:80],
                        url=entry.get('FirstURL', ''),
                        content=entry.get('Text', ''),
                        score=0.7,
                        source=self.name,
                    ))
        return results

    def _html_results(self, query: str, limit: int) -> list:
        """Fetch the HTML results page and parse up to *limit* hits."""
        try:
            resp = requests.post('https://html.duckduckgo.com/html/',
                                 data={'q': query}, timeout=15,
                                 headers={'User-Agent': 'Mozilla/5.0'})
        except requests.exceptions.RequestException:
            return []
        if resp.status_code != 200:
            return []
        return self._parse_html_results(resp.text, limit)

    def _parse_html_results(self, html: str, limit: int) -> list:
        """Extract results from the HTML page with plain string scanning.

        Each ``<a rel="nofollow"`` anchor is treated as one candidate result.
        Candidates whose markup is incomplete (no closing quote or ``</a>``)
        are skipped instead of producing mis-sliced text.
        """
        results = []
        for block in html.split('<a rel="nofollow"')[1:]:
            if len(results) >= limit:
                break
            href_start = block.find('href="')
            if href_start == -1:
                continue
            href_start += 6
            href_end = block.find('"', href_start)
            if href_end == -1:
                continue
            url = self._resolve_url(block[href_start:href_end])
            title_start = block.find('>', href_end)
            if title_start == -1:
                continue
            title_start += 1
            title_end = block.find('</a>', title_start)
            if title_end == -1:
                continue
            title = self._clean_text(block[title_start:title_end])
            snippet = self._extract_snippet(block)
            if url and title:
                results.append(SearchResult(
                    title=title,
                    url=url,
                    content=snippet or title,
                    score=0.6,
                    source=self.name,
                ))
        return results

    @classmethod
    def _extract_snippet(cls, block: str) -> str:
        """Return the cleaned snippet text found in *block*, or ''."""
        # NOTE(review): 'result__snippet' is believed to be the class used by
        # the html.duckduckgo.com markup; confirm against a live page.
        for marker in ('class="result-snippet"', 'class="result__snippet"',
                       'class="snippet"'):
            idx = block.find(marker)
            if idx == -1:
                continue
            tag_open = block.rfind('<', 0, idx)
            tag_close = block.find('>', idx)
            if tag_open == -1 or tag_close == -1:
                continue
            body_start = tag_close + 1
            # Read through to the element's own closing tag so inline markup
            # such as <b> does not cut the snippet short.
            name_parts = block[tag_open + 1:idx].split(None, 1)
            body_end = block.find(f'</{name_parts[0]}', body_start) if name_parts else -1
            if body_end == -1:
                body_end = block.find('<', body_start)
            if body_end == -1:
                continue
            return cls._clean_text(block[body_start:body_end])
        return ''

    @staticmethod
    def _resolve_url(href: str) -> str:
        """Turn an href from the results page into a usable absolute URL.

        Unescapes HTML entities, completes protocol-relative and site-relative
        links, and unwraps DuckDuckGo ``/l/?uddg=<target>`` redirect links.
        """
        import html as html_lib
        from urllib.parse import parse_qs, urlsplit
        url = html_lib.unescape(href).strip()
        if url.startswith('//'):
            url = 'https:' + url
        elif url.startswith('/'):
            url = 'https://duckduckgo.com' + url
        try:
            parts = urlsplit(url)
            if parts.netloc.endswith('duckduckgo.com') and parts.path.rstrip('/') == '/l':
                target = parse_qs(parts.query).get('uddg')
                if target:
                    return target[0]
        except ValueError:
            # Unparseable href: hand back the unescaped text unchanged.
            pass
        return url

    @staticmethod
    def _clean_text(text: str) -> str:
        """Strip tags, decode HTML entities and collapse whitespace."""
        import html as html_lib
        import re
        text = re.sub(r'<[^>]+>', '', text)
        # Unescape only after tag removal so an escaped "&lt;b&gt;" survives
        # as literal text instead of being stripped as a tag.
        text = html_lib.unescape(text)
        return re.sub(r'\s+', ' ', text).strip()

View File

@@ -0,0 +1,61 @@
"""SearXNG 搜索源 — 自托管元搜索引擎"""
import time
import requests
from providers.base import BaseProvider, SearchResult
class SearXNGProvider(BaseProvider):
    """Provider that queries a SearXNG instance through its JSON search API."""

    name = 'searxng'
    display_name = 'SearXNG'
    needs_api_key = False
    enabled = True
    priority = 30

    def __init__(self, config: dict):
        super().__init__(config)
        section = config.get('searxng', {})
        configured = section.get('base_url')
        chosen = configured if configured else 'http://localhost:8888'
        self.base_url = chosen.rstrip('/')

    def is_available(self) -> bool:
        """Always reports the provider as available."""
        return True

    def _to_result(self, item):
        """Convert one raw result entry into a SearchResult."""
        published = item.get('publishedDate', '')
        if published:
            try:
                # "2024-01-02T03:04:05+00:00" -> "2024-01-02 03:04:05"
                spaced = published.replace('T', ' ')
                published = spaced.split('+')[0].split('Z')[0]
            except Exception:
                pass
        return SearchResult(
            title=item.get('title', ''),
            url=item.get('url', ''),
            content=item.get('content', ''),
            score=item.get('score', 0.5),
            source=self.name,
            published_date=published,
        )

    def search(self, query: str, max_results: int = 10) -> list:
        """Query ``<base_url>/search`` and return at most *max_results* hits.

        Returns an empty list on a non-200 status or a request error.
        """
        endpoint = f'{self.base_url}/search'
        query_params = {
            'q': query,
            'format': 'json',
            'language': 'zh-CN',
            'categories': 'general',
            'pageno': 1,
        }
        request_headers = {'User-Agent': 'SearchHub/1.0',
                           'Accept': 'application/json'}
        try:
            response = requests.get(endpoint, params=query_params, timeout=15,
                                    headers=request_headers)
            if response.status_code != 200:
                return []
            body = response.json()
            hits = [self._to_result(entry) for entry in body.get('results', [])]
            return hits[:max_results]
        except requests.exceptions.RequestException:
            return []

View File

@@ -0,0 +1,59 @@
"""Tavily 搜索源"""
import time
import requests
from providers.base import BaseProvider, SearchResult
class TavilyProvider(BaseProvider):
    """Provider that queries the Tavily search API."""

    name = 'tavily'
    display_name = 'Tavily'
    needs_api_key = True
    enabled = True
    priority = 20

    def __init__(self, config: dict):
        super().__init__(config)
        section = config.get('tavily', {})
        self.api_key = section.get('api_key')
        raw_base = section.get('base_url', 'https://api.tavily.com')
        self.base_url = raw_base.rstrip('/')
        self.depth = section.get('depth', 'basic')
        self.max_results = section.get('max_results', 10)

    def is_available(self) -> bool:
        """Return True when an API key is configured."""
        return bool(self.api_key)

    def search(self, query: str, max_results: int = None) -> list:
        """POST the query to ``<base_url>/search`` and map hits to SearchResult.

        A falsy *max_results* falls back to the configured default. Returns
        an empty list without an API key, on a non-200 status, or on a
        request error.
        """
        if not self.api_key:
            return []
        limit = max_results if max_results else self.max_results
        body = {
            'api_key': self.api_key,
            'query': query,
            'search_depth': self.depth,
            'max_results': limit,
            'include_answer': False,
            'include_images': False,
        }
        endpoint = f'{self.base_url}/search'
        try:
            response = requests.post(endpoint, json=body, timeout=30)
            if response.status_code != 200:
                return []
            hits = response.json().get('results', [])
            return [
                SearchResult(
                    title=hit.get('title', ''),
                    url=hit.get('url', ''),
                    content=hit.get('content', ''),
                    score=hit.get('score', 0),
                    source=self.name,
                    published_date=hit.get('published_date', ''),
                )
                for hit in hits
            ]
        except requests.exceptions.RequestException:
            return []