init: Search Hub - 统一多搜索引擎聚合服务
This commit is contained in:
0
providers/__init__.py
Normal file
0
providers/__init__.py
Normal file
137
providers/ai_provider.py
Normal file
137
providers/ai_provider.py
Normal file
@@ -0,0 +1,137 @@
|
||||
"""AI 总结服务 — 基于 OpenCodeZen / OpenAI 兼容 API"""
|
||||
|
||||
import json
|
||||
import time
|
||||
import requests
|
||||
from providers.base import BaseProvider
|
||||
|
||||
|
||||
def build_prompts(query, results):
    """Build the system and user prompts for an AI summary request.

    Args:
        query: The search phrase entered by the user.
        results: Iterable of result mappings. The ``title``, ``url`` and
            ``content`` keys are read, each defaulting to an empty string.
            ``None`` is treated the same as an empty list.

    Returns:
        A ``(system_prompt, user_prompt)`` tuple of strings.
    """
    # Render every entry and join once instead of growing a string with
    # ``+=`` inside the loop.
    search_text = ''.join(
        f"[{i}] 标题: {r.get('title', '')}\n"
        f" 来源: {r.get('url', '')}\n"
        f" 内容: {r.get('content', '')}\n\n"
        for i, r in enumerate(results or [], 1)
    )

    system_prompt = (
        '你是一个专业的搜索结果分析助手。'
        '请根据用户搜索词和搜索结果,生成一份结构化的中文总结报告。'
        '要求:\n'
        '1. 先概括本次搜索的核心主题\n'
        '2. 列出关键发现和重要信息点(分点说明)\n'
        '3. 指出不同信息来源之间的共识与分歧(如有)\n'
        '4. 给出综合结论\n'
        '使用简洁的 Markdown 格式,不要过于冗长。'
    )
    user_prompt = f'## 搜索词\n{query}\n\n## 搜索结果\n{search_text}\n\n请根据以上搜索结果生成总结报告。'
    return system_prompt, user_prompt
|
||||
|
||||
|
||||
class AIProvider(BaseProvider):
    """AI summary provider backed by an OpenAI-compatible chat completions API.

    The provider does not produce search results itself (``search`` always
    returns an empty list); it summarizes results passed to ``summarize`` or
    ``summarize_stream``.
    """

    name = 'ai'
    display_name = 'AI 总结'
    needs_api_key = True
    enabled = False
    priority = 50

    def __init__(self, config: dict):
        """Read the ``opencodezen`` section of *config*.

        Args:
            config: Application configuration. The optional ``opencodezen``
                mapping may provide ``api_key``, ``base_url`` and ``model``.
        """
        super().__init__(config)
        oc = config.get('opencodezen') or {}
        self.api_key = oc.get('api_key')
        # ``or`` instead of a ``.get`` default so that an explicit null/empty
        # value in the config also falls back (and ``.rstrip`` never sees None).
        self.base_url = (oc.get('base_url') or 'https://opencode.ai/zen/go/v1').rstrip('/')
        self.model = oc.get('model') or 'deepseek-v4-flash'

    def is_available(self) -> bool:
        """Return True when an API key is configured."""
        return bool(self.api_key)

    def search(self, query: str, max_results: int = 5) -> list:
        """Return an empty list; this provider only summarizes."""
        return []

    def _build_request(self, query: str, results: list, stream: bool = False):
        """Return ``(url, headers, payload)`` for a chat completions call."""
        system_prompt, user_prompt = build_prompts(query, results)
        url = f'{self.base_url}/chat/completions'
        headers = {
            'Authorization': f'Bearer {self.api_key}',
            'Content-Type': 'application/json',
        }
        payload = {
            'model': self.model,
            'messages': [
                {'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': user_prompt},
            ],
            'max_tokens': 4096,
            'temperature': 0.3,
        }
        if stream:
            payload['stream'] = True
        return url, headers, payload

    @staticmethod
    def _sse(event: str, payload: dict) -> str:
        """Format one server-sent event with a JSON ``data`` field."""
        return f"event: {event}\ndata: {json.dumps(payload)}\n\n"

    @staticmethod
    def _delta_content(data_str: str) -> str:
        """Extract the text delta from one streamed JSON chunk.

        Returns an empty string for anything that carries no text: invalid
        JSON, a chunk whose ``choices`` list is missing or empty, or a null
        ``delta`` / ``content``.
        """
        try:
            chunk = json.loads(data_str)
        except json.JSONDecodeError:
            return ''
        if not isinstance(chunk, dict):
            return ''
        choices = chunk.get('choices') or []
        if not choices or not isinstance(choices[0], dict):
            return ''
        delta = choices[0].get('delta') or {}
        if not isinstance(delta, dict):
            return ''
        return delta.get('content') or ''

    def summarize(self, query: str, results: list) -> dict:
        """Request a one-shot summary of *results* for *query*.

        Returns:
            On success a dict with ``summary``, ``model``, ``elapsed``
            (seconds, rounded to 2 decimals) and ``usage``. On failure a dict
            with ``error`` and an empty ``summary`` (plus ``elapsed`` once the
            request has been attempted). HTTP and response-parsing failures
            are reported through the returned dict rather than raised.
        """
        if not self.is_available():
            return {'error': 'AI 总结未配置', 'summary': ''}

        url, headers, payload = self._build_request(query, results)

        start = time.time()
        try:
            resp = requests.post(url, json=payload, headers=headers, timeout=60)
            elapsed = round(time.time() - start, 2)
            if resp.status_code != 200:
                return {'error': f'AI API 返回 {resp.status_code}', 'summary': '', 'elapsed': elapsed}
            data = resp.json()
            return {
                'summary': data['choices'][0]['message']['content'],
                'model': self.model,
                'elapsed': elapsed,
                'usage': data.get('usage', {}),
            }
        except Exception as e:
            # Deliberate boundary: callers receive an error dict for network
            # failures and malformed responses alike.
            return {'error': str(e), 'summary': '', 'elapsed': round(time.time() - start, 2)}

    def summarize_stream(self, query: str, results: list):
        """Stream a summary as server-sent-event strings.

        Yields:
            ``event: delta`` frames carrying ``{"content": ...}`` for each
            text fragment, then one ``event: meta`` frame with the model name
            and elapsed seconds. Any failure yields a single ``event: error``
            frame instead.
        """
        if not self.is_available():
            yield self._sse('error', {'error': 'AI 总结未配置'})
            return

        url, headers, payload = self._build_request(query, results, stream=True)

        start = time.time()
        try:
            # The context manager releases the streamed connection even when
            # the consumer stops iterating early or an error occurs.
            with requests.post(url, json=payload, headers=headers,
                               stream=True, timeout=120) as resp:
                if resp.status_code != 200:
                    yield self._sse('error', {'error': f'AI API 返回 {resp.status_code}'})
                    return

                for line in resp.iter_lines():
                    if not line:
                        continue
                    line_str = line.decode('utf-8', errors='replace')
                    # Accept the field with or without the optional space
                    # after the colon.
                    if not line_str.startswith('data:'):
                        continue
                    data_str = line_str[5:].strip()
                    if data_str == '[DONE]':
                        break
                    content = self._delta_content(data_str)
                    if content:
                        yield self._sse('delta', {'content': content})

            elapsed = round(time.time() - start, 2)
            yield self._sse('meta', {'model': self.model, 'elapsed': elapsed})

        except Exception as e:
            # Deliberate boundary: surface any failure to the client as an
            # error event instead of breaking the stream.
            yield self._sse('error', {'error': str(e)})
|
||||
168
providers/baidu_provider.py
Normal file
168
providers/baidu_provider.py
Normal file
@@ -0,0 +1,168 @@
|
||||
"""百度搜索源 — 通过百度千帆官方 API"""
|
||||
|
||||
import json
|
||||
import time
|
||||
import requests
|
||||
from providers.base import BaseProvider, SearchResult
|
||||
|
||||
|
||||
class BaiduProvider(BaseProvider):
    """Baidu search provider using the Baidu Qianfan HTTP API.

    Two modes are supported: plain web search and "intelligent" search, where
    results are taken from the references of an AI-generated answer.
    """

    name = 'baidu'
    display_name = '百度搜索'
    needs_api_key = True
    enabled = True
    priority = 10  # first choice in auto mode

    def __init__(self, config: dict, mode='web'):
        """Configure the provider from the ``baidu`` section of *config*.

        Args:
            config: Application configuration. The optional ``baidu`` mapping
                may provide ``api_key``, ``intelligent_url`` and
                ``web_search_url``.
            mode: ``'web'`` for web search (fast) or ``'intelligent'`` for
                AI-assisted retrieval. Intelligent mode overrides the name,
                display name and priority on the instance and disables the
                provider by default.
        """
        super().__init__(config)
        self._mode = mode
        if mode == 'intelligent':
            self.name = 'baidu-intelligent'
            self.display_name = '百度智能检索'
            self.priority = 21
            self.enabled = False  # manual selection only; not part of auto mode

        bc = config.get('baidu') or {}
        self.api_key = bc.get('api_key')
        # ``or`` so that explicit null/empty config values fall back as well.
        self.intelligent_url = (
            bc.get('intelligent_url')
            or 'https://qianfan.baidubce.com/v2/ai_search/chat/completions'
        )
        self.web_search_url = (
            bc.get('web_search_url')
            or 'https://qianfan.baidubce.com/v2/ai_search/web_search'
        )

    def is_available(self) -> bool:
        """Return True when an API key is configured."""
        return bool(self.api_key)

    def search(self, query: str, max_results: int = 10) -> list:
        """Run a search in the configured mode.

        Returns:
            A list of ``SearchResult`` objects; empty when no API key is set
            or the request fails.
        """
        if not self.api_key:
            return []

        if self._mode == 'intelligent':
            # Intelligent search is billed per cited reference, so cap the
            # request at 3 to save quota.
            return self._intelligent_search(query, min(max_results, 3))
        return self._web_search(query, max_results)

    def _auth_headers(self) -> dict:
        """Return the bearer-token headers shared by both endpoints."""
        return {
            'Authorization': f'Bearer {self.api_key}',
            'Content-Type': 'application/json',
        }

    def _post_json(self, url: str, payload: dict, timeout: int):
        """POST *payload* and return the decoded JSON object.

        Returns ``None`` on a transport error, a non-200 status, a body that
        is not valid JSON, or a JSON value that is not an object.
        """
        try:
            resp = requests.post(url, json=payload, headers=self._auth_headers(),
                                 timeout=timeout)
            if resp.status_code != 200:
                return None
            data = resp.json()
        except (requests.exceptions.RequestException, ValueError):
            # ValueError covers JSON decoding failures on requests versions
            # where that error is not a RequestException.
            return None
        return data if isinstance(data, dict) else None

    @staticmethod
    def _as_dict(value) -> dict:
        """Return *value* if it is a dict, otherwise an empty dict."""
        return value if isinstance(value, dict) else {}

    def _intelligent_search(self, query: str, max_results: int) -> list:
        """Intelligent retrieval: build results from the answer's references.

        When the response carries no usable references, the AI answer text
        itself is returned as a single result.
        """
        if max_results <= 0:
            return []

        payload = {
            'messages': [{'content': query, 'role': 'user'}],
            'stream': False,
            'model': 'ernie-4.5-turbo-128k',
            'enable_corner_markers': True,
            'enable_deep_search': True,
        }
        data = self._post_json(self.intelligent_url, payload, timeout=60)
        if data is None:
            return []

        results = []

        # Extract search results from the cited references.
        references = (
            data.get('references')
            or self._as_dict(data.get('result')).get('references')
            or []
        )
        for ref in references[:max_results]:
            if not isinstance(ref, dict):
                continue
            title = ref.get('title', '') or ref.get('name', '')
            url = ref.get('url', '') or ref.get('link', '')
            content = ref.get('summary', '') or ref.get('content', '') or ref.get('snippet', '')
            if title and url:
                results.append(SearchResult(
                    title=title,
                    url=url,
                    content=content,
                    score=0.8,
                    source=self.name,
                ))

        # No reference links: fall back to the AI answer text.
        if not results:
            try:
                ai_content = data['choices'][0]['message']['content']
            except (KeyError, IndexError, TypeError):
                ai_content = self._as_dict(data.get('result')).get('answer', '')

            if ai_content and isinstance(ai_content, str):
                # Local import keeps this block self-contained.
                from urllib.parse import quote

                results.append(SearchResult(
                    title=f'百度AI: {query}',
                    # Percent-encode the query so spaces, '&', '#', etc. do
                    # not break the generated link.
                    url=f"https://www.baidu.com/s?wd={quote(query, safe='')}",
                    content=ai_content[:500],
                    score=0.7,
                    source=self.name,
                ))

        return results

    def _web_search(self, query: str, max_results: int) -> list:
        """Query the Baidu web search endpoint."""
        if max_results <= 0:
            return []

        payload = {
            'messages': [{'content': query, 'role': 'user'}],
            'search_source': 'baidu_search_v2',
            'resource_type_filter': [{'type': 'web', 'top_k': max_results}],
        }
        data = self._post_json(self.web_search_url, payload, timeout=25)
        if data is None:
            return []

        # Expected response shape: {"request_id": "...", "references": [...]}
        refs = (
            data.get('references')
            or self._as_dict(data.get('result')).get('items')
            or []
        )

        results = []
        for ref in refs[:max_results]:
            if not isinstance(ref, dict):
                continue
            title = ref.get('title', '') or ref.get('name', '')
            url = ref.get('url', '') or ref.get('link', '')
            # Prefer the short ``snippet``; fall back to the full ``content``.
            snippet = ref.get('snippet', '') or ref.get('content', '') or ''
            published = ref.get('date', '') or ref.get('published_date', '')

            if title and url:
                results.append(SearchResult(
                    title=title,
                    url=url,
                    content=snippet[:500],  # slicing is a no-op for short text
                    score=0.6,
                    source=self.name,
                    published_date=published,
                ))

        return results
|
||||
63
providers/base.py
Normal file
63
providers/base.py
Normal file
@@ -0,0 +1,63 @@
|
||||
"""搜索源抽象基类 — 所有搜索源统一接口"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
|
||||
class SearchResult:
    """Uniform container for one search hit, shared by all providers."""

    def __init__(self, title='', url='', content='', score=0.0, source='',
                 published_date=''):
        """Store the given fields as same-named attributes."""
        self.title, self.url, self.content = title, url, content
        self.score, self.source = score, source
        self.published_date = published_date

    def to_dict(self):
        """Return the result as a plain dict.

        A falsy ``published_date`` (for example ``None``) is emitted as an
        empty string.
        """
        payload = {
            field: getattr(self, field)
            for field in ('title', 'url', 'content', 'score', 'source')
        }
        payload['published_date'] = self.published_date if self.published_date else ''
        return payload
|
||||
|
||||
|
||||
class BaseProvider(ABC):
    """Abstract base class defining the interface of every search provider."""

    name = ''              # unique identifier of the provider
    display_name = ''      # name shown to users
    needs_api_key = False  # whether an API key is required
    enabled = False        # whether the provider is enabled by default
    priority = 100         # lower numbers are preferred

    def __init__(self, config: dict):
        """Keep a reference to the application configuration."""
        self.config = config

    @abstractmethod
    def search(self, query: str, max_results: int = 10) -> list:
        """Run a search and return a list of ``SearchResult`` objects."""
        ...

    def is_available(self) -> bool:
        """Report whether the provider can currently be used (default: True)."""
        return True

    def get_status(self) -> dict:
        """Return a dict describing the provider's identity and state."""
        status = {attr: getattr(self, attr) for attr in ('name', 'display_name')}
        status['available'] = self.is_available()
        for attr in ('enabled', 'needs_api_key', 'priority'):
            status[attr] = getattr(self, attr)
        return status
|
||||
142
providers/duckduckgo_provider.py
Normal file
142
providers/duckduckgo_provider.py
Normal file
@@ -0,0 +1,142 @@
|
||||
"""DuckDuckGo 搜索源 — 免费,无需 API key"""
|
||||
|
||||
import time
|
||||
import requests
|
||||
from providers.base import BaseProvider, SearchResult
|
||||
|
||||
|
||||
class DuckDuckGoProvider(BaseProvider):
    """DuckDuckGo search provider; free and requires no API key.

    Results come from the Instant Answer JSON API first and are topped up by
    scraping the HTML search page when more are needed.
    """

    name = 'duckduckgo'
    display_name = 'DuckDuckGo'
    needs_api_key = False
    enabled = False  # off by default: not reachable from mainland China networks
    priority = 30

    def __init__(self, config: dict):
        """Store the application configuration."""
        super().__init__(config)

    def search(self, query: str, max_results: int = 10) -> list:
        """Search DuckDuckGo and return up to *max_results* unique results.

        Results without a URL and results whose URL was already seen are
        dropped. Network and decoding failures yield fewer (possibly zero)
        results instead of raising.
        """
        if max_results <= 0:
            return []

        # 1. Instant Answer API (abstract and related topics).
        results = self._instant_answer(query)

        # 2. Not enough yet: scrape the HTML version for more.
        if len(results) < max_results:
            results.extend(self._html_search(query, max_results - len(results)))

        # De-duplicate by URL, keeping first occurrences in order.
        seen_urls = set()
        unique = []
        for r in results:
            if r.url and r.url not in seen_urls:
                seen_urls.add(r.url)
                unique.append(r)

        return unique[:max_results]

    def _instant_answer(self, query: str) -> list:
        """Return results built from the Instant Answer API response."""
        params = {
            'q': query,
            'format': 'json',
            'no_html': 1,
            'skip_disambig': 1,
        }
        try:
            resp = requests.get('https://api.duckduckgo.com/', params=params, timeout=8)
            if resp.status_code != 200:
                return []
            data = resp.json()
        except (requests.exceptions.RequestException, ValueError):
            # ValueError covers JSON decoding failures on requests versions
            # where that error is not a RequestException.
            return []
        if not isinstance(data, dict):
            return []

        results = []

        # Abstract
        abstract = data.get('AbstractText', '')
        if abstract and data.get('AbstractURL'):
            results.append(SearchResult(
                # ``or`` so an empty heading also gets the fallback title.
                title=data.get('Heading') or 'DuckDuckGo 摘要',
                url=data['AbstractURL'],
                content=abstract,
                score=0.9,
                source=self.name,
            ))

        # Related topics; grouped topics contribute at most 3 entries each.
        for topic in data.get('RelatedTopics') or []:
            if not isinstance(topic, dict):
                continue
            if 'Topics' in topic:
                for sub in (topic['Topics'] or [])[:3]:
                    if isinstance(sub, dict) and sub.get('Text'):
                        results.append(self._topic_result(sub))
            elif topic.get('Text'):
                results.append(self._topic_result(topic))

        return results

    def _topic_result(self, topic: dict):
        """Convert one related-topic entry into a ``SearchResult``."""
        text = topic.get('Text', '')
        return SearchResult(
            title=text[:80],
            url=topic.get('FirstURL', ''),
            content=text,
            score=0.7,
            source=self.name,
        )

    def _html_search(self, query: str, limit: int) -> list:
        """Fetch the HTML results page and parse up to *limit* results."""
        try:
            resp = requests.post('https://html.duckduckgo.com/html/',
                                 data={'q': query}, timeout=15,
                                 headers={'User-Agent': 'Mozilla/5.0'})
            if resp.status_code != 200:
                return []
            page = resp.text
        except requests.exceptions.RequestException:
            return []
        return self._parse_html_results(page, limit)

    def _parse_html_results(self, html: str, limit: int) -> list:
        """Parse DuckDuckGo HTML results with plain string scanning.

        The page is split on ``<a rel="nofollow"``; each fragment is expected
        to start with a result link. Fragments missing any delimiter are
        skipped instead of producing a mis-sliced result.
        """
        results = []
        for block in html.split('<a rel="nofollow"')[1:]:
            if len(results) >= limit:
                break

            # URL: the text between href=" and the next double quote.
            href_start = block.find('href="')
            if href_start == -1:
                continue
            href_start += 6
            href_end = block.find('"', href_start)
            if href_end == -1:
                continue
            url = self._resolve_result_url(block[href_start:href_end])

            # Title: the text between the end of the <a ...> tag and </a>.
            title_start = block.find('>', href_end)
            if title_start == -1:
                continue
            title_start += 1
            title_end = block.find('</a>', title_start)
            if title_end == -1:
                continue
            title = self._clean_text(block[title_start:title_end])

            snippet = self._extract_snippet(block)

            if url and title:
                results.append(SearchResult(
                    title=title,
                    url=url,
                    content=snippet or title,
                    score=0.6,
                    source=self.name,
                ))
        return results

    def _extract_snippet(self, block: str) -> str:
        """Return the text after the first snippet marker in *block*.

        Only the first marker found is considered, and the text runs up to
        the next ``<``. Returns an empty string when nothing usable is found.
        """
        for kw in ('class="result-snippet"', 'class="snippet"'):
            idx = block.find(kw)
            if idx == -1:
                continue
            tag_close = block.find('>', idx)
            if tag_close == -1:
                return ''
            tag_close += 1
            next_tag = block.find('<', tag_close)
            if next_tag == -1:
                return ''
            return self._clean_text(block[tag_close:next_tag])
        return ''

    @staticmethod
    def _resolve_result_url(href: str) -> str:
        """Turn a raw ``href`` attribute value into a usable URL.

        HTML entities are unescaped and protocol-relative links get an
        ``https:`` scheme. NOTE(review): result links are assumed to be
        DuckDuckGo redirects of the form ``//duckduckgo.com/l/?uddg=<target>``;
        when that shape is present the decoded target is returned, otherwise
        the href is returned as-is.
        """
        from html import unescape
        from urllib.parse import parse_qs, urlsplit

        href = unescape(href.strip())
        if href.startswith('//'):
            href = 'https:' + href
        try:
            parts = urlsplit(href)
        except ValueError:
            return href
        if parts.netloc.endswith('duckduckgo.com') and parts.path.startswith('/l/'):
            target = parse_qs(parts.query).get('uddg')
            if target:
                return target[0]
        return href

    @staticmethod
    def _clean_text(text: str) -> str:
        """Strip HTML tags, decode entities and collapse whitespace."""
        import re
        from html import unescape

        text = re.sub(r'<[^>]+>', '', text)
        # Unescape after tag removal so escaped angle brackets in the text
        # are not mistaken for tags.
        text = unescape(text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text
|
||||
61
providers/searxng_provider.py
Normal file
61
providers/searxng_provider.py
Normal file
@@ -0,0 +1,61 @@
|
||||
"""SearXNG 搜索源 — 自托管元搜索引擎"""
|
||||
|
||||
import time
|
||||
import requests
|
||||
from providers.base import BaseProvider, SearchResult
|
||||
|
||||
|
||||
class SearXNGProvider(BaseProvider):
    """SearXNG provider: queries a self-hosted meta search engine."""

    name = 'searxng'
    display_name = 'SearXNG'
    needs_api_key = False
    enabled = True
    priority = 30

    def __init__(self, config: dict):
        """Read ``base_url`` from the ``searxng`` section of *config*.

        Falls back to ``http://localhost:8888`` when the section or the value
        is missing or empty.
        """
        super().__init__(config)
        sc = config.get('searxng') or {}
        self.base_url = (sc.get('base_url') or 'http://localhost:8888').rstrip('/')

    def is_available(self) -> bool:
        """Always True; no credentials are needed and no probe is made."""
        return True

    @staticmethod
    def _normalize_date(published):
        """Reduce an ISO-like timestamp to ``YYYY-MM-DD HH:MM:SS`` form.

        The ``T`` separator becomes a space and everything from the first
        ``+`` or ``Z`` onward is dropped. Falsy or non-string values are
        returned unchanged.
        """
        if not published or not isinstance(published, str):
            return published
        return published.replace('T', ' ').split('+')[0].split('Z')[0]

    def search(self, query: str, max_results: int = 10) -> list:
        """Query the instance's JSON search endpoint.

        Returns:
            Up to *max_results* ``SearchResult`` objects; an empty list on a
            transport error, non-200 status or undecodable body.
        """
        if max_results <= 0:
            return []

        url = f'{self.base_url}/search'
        params = {
            'q': query,
            'format': 'json',
            'language': 'zh-CN',
            'categories': 'general',
            'pageno': 1,
        }

        try:
            resp = requests.get(url, params=params, timeout=15,
                                headers={'User-Agent': 'SearchHub/1.0',
                                         'Accept': 'application/json'})
            if resp.status_code != 200:
                return []
            data = resp.json()
        except (requests.exceptions.RequestException, ValueError):
            # ValueError covers JSON decoding failures on requests versions
            # where that error is not a RequestException.
            return []

        items = data.get('results') if isinstance(data, dict) else None

        results = []
        for item in items or []:
            if len(results) >= max_results:
                break
            if not isinstance(item, dict):
                continue
            results.append(SearchResult(
                title=item.get('title', ''),
                url=item.get('url', ''),
                content=item.get('content', ''),
                score=item.get('score', 0.5),
                source=self.name,
                published_date=self._normalize_date(item.get('publishedDate', '')),
            ))
        return results
|
||||
59
providers/tavily_provider.py
Normal file
59
providers/tavily_provider.py
Normal file
@@ -0,0 +1,59 @@
|
||||
"""Tavily 搜索源"""
|
||||
|
||||
import time
|
||||
import requests
|
||||
from providers.base import BaseProvider, SearchResult
|
||||
|
||||
|
||||
class TavilyProvider(BaseProvider):
    """Tavily search provider."""

    name = 'tavily'
    display_name = 'Tavily'
    needs_api_key = True
    enabled = True
    priority = 20

    def __init__(self, config: dict):
        """Read the ``tavily`` section of *config*.

        Recognized keys: ``api_key``, ``base_url``, ``depth`` (default
        ``'basic'``) and ``max_results`` (default 10).
        """
        super().__init__(config)
        tc = config.get('tavily') or {}
        self.api_key = tc.get('api_key')
        # ``or`` so an explicit null/empty base_url falls back instead of
        # failing on ``.rstrip``.
        self.base_url = (tc.get('base_url') or 'https://api.tavily.com').rstrip('/')
        self.depth = tc.get('depth', 'basic')
        self.max_results = tc.get('max_results', 10)

    def is_available(self) -> bool:
        """Return True when an API key is configured."""
        return bool(self.api_key)

    def search(self, query: str, max_results: int = None) -> list:
        """Run a Tavily search.

        Args:
            query: The search phrase.
            max_results: Number of results to request; a falsy value (the
                default ``None``, or 0) uses the configured ``max_results``.

        Returns:
            A list of ``SearchResult`` objects; empty when no API key is set,
            on a transport error, non-200 status or undecodable body.
        """
        if not self.api_key:
            return []

        url = f'{self.base_url}/search'
        payload = {
            'api_key': self.api_key,
            'query': query,
            'search_depth': self.depth,
            'max_results': max_results or self.max_results,
            'include_answer': False,
            'include_images': False,
        }

        try:
            resp = requests.post(url, json=payload, timeout=30)
            if resp.status_code != 200:
                return []
            data = resp.json()
        except (requests.exceptions.RequestException, ValueError):
            # ValueError covers JSON decoding failures on requests versions
            # where that error is not a RequestException.
            return []

        items = data.get('results') if isinstance(data, dict) else None

        return [
            SearchResult(
                title=item.get('title', ''),
                url=item.get('url', ''),
                content=item.get('content', ''),
                score=item.get('score', 0),
                source=self.name,
                published_date=item.get('published_date', ''),
            )
            for item in items or []
            if isinstance(item, dict)
        ]
|
||||
Reference in New Issue
Block a user