init: Search Hub - 统一多搜索引擎聚合服务

2026-05-09 18:46:05 +08:00
commit 81d726179c
27 changed files with 3179 additions and 0 deletions
--- a/providers/__init__.py
+++ b/providers/__init__.py
--- a/providers/ai_provider.py
+++ b/providers/ai_provider.py
@@ -0,0 +1,137 @@
+"""AI 总结服务 — 基于 OpenCodeZen / OpenAI 兼容 API"""
+
+import json
+import time
+import requests
+from providers.base import BaseProvider
+
+
+def build_prompts(query, results):
+    """Build the system and user prompts for an AI summary of search results.
+
+    Args:
+        query: Search term, embedded verbatim in the user prompt.
+        results: Iterable of dict-like results; the ``title``, ``url`` and
+            ``content`` keys are read with ``.get``. ``None`` is treated as
+            an empty list.
+
+    Returns:
+        tuple: ``(system_prompt, user_prompt)`` strings.
+    """
+    entries = []
+    for i, r in enumerate(results or [], 1):
+        # `or ''` also covers keys that are present but null, which would
+        # otherwise be rendered into the prompt as the literal text "None".
+        title = r.get('title') or ''
+        url = r.get('url') or ''
+        content = r.get('content') or ''
+        entries.append(
+            f"[{i}] 标题: {title}\n"
+            f"    来源: {url}\n"
+            f"    内容: {content}\n\n"
+        )
+    # Join once instead of growing a string with += inside the loop.
+    search_text = ''.join(entries)
+
+    system_prompt = (
+        '你是一个专业的搜索结果分析助手。'
+        '请根据用户搜索词和搜索结果，生成一份结构化的中文总结报告。'
+        '要求：\n'
+        '1. 先概括本次搜索的核心主题\n'
+        '2. 列出关键发现和重要信息点（分点说明）\n'
+        '3. 指出不同信息来源之间的共识与分歧（如有）\n'
+        '4. 给出综合结论\n'
+        '使用简洁的 Markdown 格式，不要过于冗长。'
+    )
+    user_prompt = f'## 搜索词\n{query}\n\n## 搜索结果\n{search_text}\n\n请根据以上搜索结果生成总结报告。'
+    return system_prompt, user_prompt
+
+
+class AIProvider(BaseProvider):
+    """AI summary provider that calls an OpenAI-compatible chat-completions API.
+
+    ``search`` always returns an empty list; the useful entry points are
+    ``summarize`` (blocking) and ``summarize_stream`` (server-sent events).
+    """
+
+    name = 'ai'
+    display_name = 'AI 总结'
+    needs_api_key = True
+    enabled = False
+    priority = 50
+
+    def __init__(self, config: dict):
+        """Read the ``opencodezen`` section of ``config``.
+
+        Args:
+            config: Application config; ``opencodezen.api_key``,
+                ``opencodezen.base_url`` and ``opencodezen.model`` are used.
+        """
+        super().__init__(config)
+        # `or` fallbacks also cover keys that are present but null/empty, so
+        # `.rstrip` is never called on None.
+        oc = config.get('opencodezen') or {}
+        self.api_key = oc.get('api_key')
+        self.base_url = (oc.get('base_url') or 'https://opencode.ai/zen/go/v1').rstrip('/')
+        self.model = oc.get('model') or 'deepseek-v4-flash'
+
+    def is_available(self) -> bool:
+        """Return True when an API key is configured."""
+        return bool(self.api_key)
+
+    def search(self, query: str, max_results: int = 5) -> list:
+        """Return an empty list; this provider produces no search results."""
+        return []
+
+    def _request_parts(self, query: str, results: list, stream: bool = False):
+        """Return ``(url, headers, payload)`` for a chat-completions request."""
+        system_prompt, user_prompt = build_prompts(query, results)
+        url = f'{self.base_url}/chat/completions'
+        headers = {
+            'Authorization': f'Bearer {self.api_key}',
+            'Content-Type': 'application/json',
+        }
+        payload = {
+            'model': self.model,
+            'messages': [
+                {'role': 'system', 'content': system_prompt},
+                {'role': 'user', 'content': user_prompt},
+            ],
+            'max_tokens': 4096,
+            'temperature': 0.3,
+        }
+        if stream:
+            payload['stream'] = True
+        return url, headers, payload
+
+    @staticmethod
+    def _sse(event: str, data: dict) -> str:
+        """Format one server-sent event with a JSON-encoded data line."""
+        return f"event: {event}\ndata: {json.dumps(data)}\n\n"
+
+    def summarize(self, query: str, results: list) -> dict:
+        """Request a summary of ``results`` in a single blocking call.
+
+        Args:
+            query: The search term the results belong to.
+            results: Dict-like search results passed to ``build_prompts``.
+
+        Returns:
+            dict: Always contains ``summary``. On success it also carries
+            ``model``, ``elapsed`` and ``usage``; on failure it carries
+            ``error`` (plus ``elapsed`` once a request was attempted).
+        """
+        if not self.is_available():
+            return {'error': 'AI 总结未配置', 'summary': ''}
+
+        url, headers, payload = self._request_parts(query, results)
+
+        start = time.time()
+        try:
+            resp = requests.post(url, json=payload, headers=headers, timeout=60)
+            elapsed = round(time.time() - start, 2)
+            if resp.status_code != 200:
+                return {'error': f'AI API 返回 {resp.status_code}', 'summary': '', 'elapsed': elapsed}
+            data = resp.json()
+            return {
+                'summary': data['choices'][0]['message']['content'],
+                'model': self.model,
+                'elapsed': elapsed,
+                'usage': data.get('usage', {}),
+            }
+        except Exception as e:
+            # Boundary handler: network, decoding and response-shape errors
+            # are all reported to the caller as an error dict, never raised.
+            return {'error': str(e), 'summary': '', 'elapsed': round(time.time() - start, 2)}
+
+    def summarize_stream(self, query: str, results: list):
+        """Stream a summary of ``results`` as server-sent-event strings.
+
+        Args:
+            query: The search term the results belong to.
+            results: Dict-like search results passed to ``build_prompts``.
+
+        Yields:
+            str: ``delta`` events carrying ``{'content': ...}`` pieces, then
+            one ``meta`` event with ``model`` and ``elapsed``; an ``error``
+            event is emitted instead when the call fails.
+        """
+        if not self.is_available():
+            yield self._sse('error', {'error': 'AI 总结未配置'})
+            return
+
+        url, headers, payload = self._request_parts(query, results, stream=True)
+
+        start = time.time()
+        resp = None
+        try:
+            resp = requests.post(url, json=payload, headers=headers, stream=True, timeout=120)
+            if resp.status_code != 200:
+                yield self._sse('error', {'error': f'AI API 返回 {resp.status_code}'})
+                return
+
+            for line in resp.iter_lines():
+                if not line:
+                    continue
+                line_str = line.decode('utf-8', errors='replace')
+                # Accept both "data: {...}" and "data:{...}".
+                if not line_str.startswith('data:'):
+                    continue
+                data_str = line_str[5:].strip()
+                if data_str == '[DONE]':
+                    break
+                try:
+                    chunk = json.loads(data_str)
+                except json.JSONDecodeError:
+                    continue
+                if not isinstance(chunk, dict):
+                    continue
+                # A chunk with an empty or null `choices` list carries no
+                # text; skip it instead of failing on choices[0].
+                choices = chunk.get('choices') or [{}]
+                delta = choices[0].get('delta') or {}
+                content = delta.get('content', '')
+                if content:
+                    yield self._sse('delta', {'content': content})
+
+            elapsed = round(time.time() - start, 2)
+            yield self._sse('meta', {'model': self.model, 'elapsed': elapsed})
+
+        except Exception as e:
+            # Boundary handler: surface any failure as an SSE error event.
+            yield self._sse('error', {'error': str(e)})
+        finally:
+            # A streamed response holds its connection until closed; release
+            # it on normal completion, on error, and when the consumer stops
+            # iterating early.
+            if resp is not None:
+                resp.close()
--- a/providers/baidu_provider.py
+++ b/providers/baidu_provider.py
@@ -0,0 +1,168 @@
+"""百度搜索源 — 通过百度千帆官方 API"""
+
+import json
+import time
+import requests
+from providers.base import BaseProvider, SearchResult
+
+
+class BaiduProvider(BaseProvider):
+    """Baidu search provider backed by the Baidu Qianfan HTTP API.
+
+    One class serves two modes, chosen at construction time: plain web
+    search and "intelligent" search (an AI answer plus cited references).
+    """
+
+    name = 'baidu'
+    display_name = '百度搜索'
+    needs_api_key = True
+    enabled = True
+    priority = 10  # first choice in auto mode
+
+    def __init__(self, config: dict, mode='web'):
+        """Configure the provider from the ``baidu`` section of ``config``.
+
+        Args:
+            config: Application config; ``baidu.api_key``,
+                ``baidu.intelligent_url`` and ``baidu.web_search_url`` are used.
+            mode: ``'web'`` for web search (fast) or ``'intelligent'`` for
+                intelligent retrieval and generation (AI analysis).
+        """
+        super().__init__(config)
+        self._mode = mode
+        if mode == 'intelligent':
+            self.name = 'baidu-intelligent'
+            self.display_name = '百度智能检索'
+            self.priority = 21
+            self.enabled = False  # manual selection only; not part of auto mode
+
+        # `or {}` also covers a `baidu` key that is present but null.
+        bc = config.get('baidu') or {}
+        self.api_key = bc.get('api_key')
+        self.intelligent_url = bc.get(
+            'intelligent_url',
+            'https://qianfan.baidubce.com/v2/ai_search/chat/completions',
+        )
+        self.web_search_url = bc.get(
+            'web_search_url',
+            'https://qianfan.baidubce.com/v2/ai_search/web_search',
+        )
+
+    def is_available(self) -> bool:
+        """Return True when an API key is configured."""
+        return bool(self.api_key)
+
+    def search(self, query: str, max_results: int = 10) -> list:
+        """Run a search in the configured mode.
+
+        Args:
+            query: Search term.
+            max_results: Upper bound on returned results; intelligent mode
+                is additionally capped at 3.
+
+        Returns:
+            list: ``SearchResult`` objects; empty without an API key or on
+            any request failure.
+        """
+        if not self.api_key:
+            return []
+
+        if self._mode == 'intelligent':
+            # Intelligent search is billed per cited reference, so cap it at
+            # 3 to save quota.
+            return self._intelligent_search(query, min(max_results, 3))
+        return self._web_search(query, max_results)
+
+    def _auth_headers(self) -> dict:
+        """Return the bearer-token JSON headers used by both endpoints."""
+        return {
+            'Authorization': f'Bearer {self.api_key}',
+            'Content-Type': 'application/json',
+        }
+
+    def _post_json(self, url: str, payload: dict, timeout: int):
+        """POST ``payload`` and return the decoded JSON object, or None.
+
+        None is returned for transport errors, non-200 responses, bodies
+        that are not valid JSON, and JSON that is not an object.
+        """
+        try:
+            resp = requests.post(
+                url,
+                json=payload,
+                headers=self._auth_headers(),
+                timeout=timeout,
+            )
+            if resp.status_code != 200:
+                return None
+            data = resp.json()
+        except (requests.exceptions.RequestException, ValueError):
+            # ValueError covers JSON decode failures on `requests` versions
+            # whose decode error is not a RequestException subclass.
+            return None
+        return data if isinstance(data, dict) else None
+
+    @staticmethod
+    def _result_section(data: dict) -> dict:
+        """Return ``data['result']`` when it is a dict, else an empty dict."""
+        section = data.get('result')
+        return section if isinstance(section, dict) else {}
+
+    def _intelligent_search(self, query: str, max_results: int) -> list:
+        """Intelligent search: return the cited references, or the AI answer.
+
+        When the response carries no usable references, the AI answer text
+        (truncated to 500 characters) is returned as a single result that
+        links to a Baidu search for ``query``.
+        """
+        from urllib.parse import quote_plus
+
+        # Same guard as _web_search: never spend a request on zero results.
+        if max_results <= 0:
+            return []
+
+        payload = {
+            'messages': [{'content': query, 'role': 'user'}],
+            'stream': False,
+            'model': 'ernie-4.5-turbo-128k',
+            'enable_corner_markers': True,
+            'enable_deep_search': True,
+        }
+        data = self._post_json(self.intelligent_url, payload, 60)
+        if data is None:
+            return []
+
+        results = []
+
+        # Extract search results from the cited references.
+        references = data.get('references') or self._result_section(data).get('references') or []
+        for ref in references[:max_results]:
+            if not isinstance(ref, dict):
+                continue
+            title = ref.get('title') or ref.get('name') or ''
+            url = ref.get('url') or ref.get('link') or ''
+            content = ref.get('summary') or ref.get('content') or ref.get('snippet') or ''
+            if title and url:
+                results.append(SearchResult(
+                    title=title,
+                    url=url,
+                    content=content,
+                    score=0.8,
+                    source=self.name,
+                ))
+
+        # Without any reference links, fall back to the AI answer itself.
+        if not results:
+            try:
+                ai_content = data['choices'][0]['message']['content']
+            except (KeyError, IndexError, TypeError):
+                ai_content = self._result_section(data).get('answer', '')
+
+            if ai_content:
+                results.append(SearchResult(
+                    title=f'百度AI: {query}',
+                    # URL-encode the query so spaces, '&', '#' and non-ASCII
+                    # text produce a valid link.
+                    url=f'https://www.baidu.com/s?wd={quote_plus(query)}',
+                    content=ai_content[:500],
+                    score=0.7,
+                    source=self.name,
+                ))
+
+        return results
+
+    def _web_search(self, query: str, max_results: int) -> list:
+        """Web search: return up to ``max_results`` web references."""
+        if max_results <= 0:
+            return []
+
+        payload = {
+            'messages': [{'content': query, 'role': 'user'}],
+            'search_source': 'baidu_search_v2',
+            'resource_type_filter': [{'type': 'web', 'top_k': max_results}],
+        }
+        data = self._post_json(self.web_search_url, payload, 25)
+        if data is None:
+            return []
+
+        results = []
+
+        # Response format: {"request_id":"...", "references":[...]}
+        refs = data.get('references') or self._result_section(data).get('items') or []
+
+        for ref in refs[:max_results]:
+            if not isinstance(ref, dict):
+                continue
+            title = ref.get('title') or ref.get('name') or ''
+            url = ref.get('url') or ref.get('link') or ''
+            # `snippet` is the short summary, `content` the full text.
+            snippet = ref.get('snippet') or ref.get('content') or ''
+            published = ref.get('date') or ref.get('published_date') or ''
+
+            if title and url:
+                results.append(SearchResult(
+                    title=title,
+                    url=url,
+                    content=snippet[:500],
+                    score=0.6,
+                    source=self.name,
+                    published_date=published,
+                ))
+
+        return results
--- a/providers/base.py
+++ b/providers/base.py
@@ -0,0 +1,63 @@
+"""搜索源抽象基类 — 所有搜索源统一接口"""
+
+from abc import ABC, abstractmethod
+
+
+class SearchResult:
+    """Uniform record that every provider returns for one search hit."""
+
+    def __init__(self, title='', url='', content='', score=0.0, source='',
+                 published_date=''):
+        """Store the given fields unchanged as instance attributes."""
+        self.title, self.url, self.content = title, url, content
+        self.score, self.source = score, source
+        self.published_date = published_date
+
+    def to_dict(self):
+        """Return the fields as a plain dict; a falsy date becomes ''."""
+        payload = {
+            field: getattr(self, field)
+            for field in ('title', 'url', 'content', 'score', 'source')
+        }
+        payload['published_date'] = self.published_date if self.published_date else ''
+        return payload
+
+
+class BaseProvider(ABC):
+    """Abstract base class shared by all search providers."""
+
+    name = ''              # unique provider identifier
+    display_name = ''      # human-readable name
+    needs_api_key = False  # whether an API key is required
+    enabled = False        # whether the provider is enabled by default
+    priority = 100         # lower numbers are preferred
+
+    def __init__(self, config: dict):
+        """Keep a reference to the application config."""
+        self.config = config
+
+    @abstractmethod
+    def search(self, query: str, max_results: int = 10) -> list:
+        """Run a search and return a list of SearchResult objects."""
+
+    def is_available(self) -> bool:
+        """Report whether this provider can currently be used."""
+        return True
+
+    def get_status(self) -> dict:
+        """Return a dict describing this provider's current state."""
+        status = {'name': self.name, 'display_name': self.display_name}
+        status['available'] = self.is_available()
+        status.update(
+            enabled=self.enabled,
+            needs_api_key=self.needs_api_key,
+            priority=self.priority,
+        )
+        return status
--- a/providers/duckduckgo_provider.py
+++ b/providers/duckduckgo_provider.py
@@ -0,0 +1,142 @@
+"""DuckDuckGo 搜索源 — 免费，无需 API key"""
+
+import time
+import requests
+from providers.base import BaseProvider, SearchResult
+
+
+class DuckDuckGoProvider(BaseProvider):
+    """DuckDuckGo search provider; free and needs no API key.
+
+    Results come from the Instant Answer JSON API first and are topped up
+    from the HTML search page when that does not yield enough.
+    """
+
+    name = 'duckduckgo'
+    display_name = 'DuckDuckGo'
+    needs_api_key = False
+    enabled = False  # unreachable from mainland-China networks; off by default
+    priority = 30
+
+    def __init__(self, config: dict):
+        """Store the application config; no provider-specific keys are read."""
+        super().__init__(config)
+
+    def search(self, query: str, max_results: int = 10) -> list:
+        """Search DuckDuckGo and return de-duplicated results.
+
+        Args:
+            query: Search term.
+            max_results: Upper bound on returned results.
+
+        Returns:
+            list: Up to ``max_results`` ``SearchResult`` objects with unique,
+            non-empty URLs; empty when both requests fail.
+        """
+        if max_results <= 0:
+            return []
+
+        # 1. Try the Instant Answer API first (abstract and related topics).
+        results = self._instant_answers(query)
+
+        # 2. Not enough yet: scrape the HTML version for more results.
+        if len(results) < max_results:
+            results.extend(self._html_results(query, max_results - len(results)))
+
+        # De-duplicate by URL, keeping the first occurrence.
+        seen_urls = set()
+        unique = []
+        for r in results:
+            if r.url and r.url not in seen_urls:
+                seen_urls.add(r.url)
+                unique.append(r)
+
+        return unique[:max_results]
+
+    def _instant_answers(self, query: str) -> list:
+        """Return results from the Instant Answer API, or [] on failure."""
+        params = {
+            'q': query,
+            'format': 'json',
+            'no_html': 1,
+            'skip_disambig': 1,
+        }
+        try:
+            resp = requests.get('https://api.duckduckgo.com/', params=params, timeout=8)
+            if resp.status_code != 200:
+                return []
+            data = resp.json()
+        except (requests.exceptions.RequestException, ValueError):
+            # ValueError covers a body that is not valid JSON.
+            return []
+        if not isinstance(data, dict):
+            return []
+
+        results = []
+
+        # Abstract
+        abstract = data.get('AbstractText', '')
+        if abstract and data.get('AbstractURL'):
+            results.append(SearchResult(
+                # `or` also replaces a Heading that is present but empty.
+                title=data.get('Heading') or 'DuckDuckGo 摘要',
+                url=data['AbstractURL'],
+                content=abstract,
+                score=0.9,
+                source=self.name,
+            ))
+
+        # Related topics: an entry is either a topic or a group of topics
+        # (of which at most the first 3 are used).
+        for topic in data.get('RelatedTopics') or []:
+            if not isinstance(topic, dict):
+                continue
+            if 'Topics' in topic:
+                entries = (topic['Topics'] or [])[:3]
+            else:
+                entries = [topic]
+            for entry in entries:
+                if not isinstance(entry, dict):
+                    continue
+                text = entry.get('Text')
+                link = entry.get('FirstURL')
+                # Topics without a URL would be dropped by de-duplication
+                # anyway; skipping them here keeps them from counting as
+                # usable results when deciding on the HTML fallback.
+                if text and link:
+                    results.append(SearchResult(
+                        title=text[:80],
+                        url=link,
+                        content=text,
+                        score=0.7,
+                        source=self.name,
+                    ))
+        return results
+
+    def _html_results(self, query: str, limit: int) -> list:
+        """Return up to ``limit`` results scraped from the HTML search page."""
+        try:
+            resp = requests.post('https://html.duckduckgo.com/html/',
+                                 data={'q': query}, timeout=15,
+                                 headers={'User-Agent': 'Mozilla/5.0'})
+        except requests.exceptions.RequestException:
+            return []
+        if resp.status_code != 200:
+            return []
+        return self._parse_html_results(resp.text, limit)
+
+    def _parse_html_results(self, html: str, limit: int) -> list:
+        """Parse DuckDuckGo HTML search results with plain string scanning.
+
+        Args:
+            html: Raw HTML of the results page.
+            limit: Maximum number of results to return.
+
+        Returns:
+            list: ``SearchResult`` objects; blocks with a malformed anchor
+            are skipped.
+        """
+        results = []
+        # Split on `<a rel="nofollow"` to locate the result links.
+        for block in html.split('<a rel="nofollow"')[1:]:
+            if len(results) >= limit:
+                break
+
+            # Extract the URL.
+            href_start = block.find('href="')
+            if href_start == -1:
+                continue
+            href_start += 6
+            href_end = block.find('"', href_start)
+            if href_end == -1:
+                continue
+            try:
+                url = self._normalize_href(block[href_start:href_end])
+            except ValueError:
+                # urlsplit rejects some malformed URLs; skip those blocks.
+                continue
+
+            # Extract the title (the text between the tag's '>' and '</a>').
+            title_start = block.find('>', href_end)
+            if title_start == -1:
+                continue
+            title_start += 1
+            title_end = block.find('</a>', title_start)
+            if title_end == -1:
+                continue
+            title = self._clean_text(block[title_start:title_end])
+
+            # Extract the snippet (text up to the next tag after the
+            # snippet element opens).
+            snippet = ''
+            for kw in ('class="result-snippet"', 'class="snippet"'):
+                idx = block.find(kw)
+                if idx == -1:
+                    continue
+                tag_close = block.find('>', idx)
+                if tag_close != -1:
+                    text_start = tag_close + 1
+                    next_tag = block.find('<', text_start)
+                    if next_tag != -1:
+                        snippet = self._clean_text(block[text_start:next_tag])
+                break
+
+            if url and title:
+                results.append(SearchResult(
+                    title=title,
+                    url=url,
+                    content=snippet or title,
+                    score=0.6,
+                    source=self.name,
+                ))
+        return results
+
+    @staticmethod
+    def _normalize_href(href: str) -> str:
+        """Turn an ``href`` attribute value into a usable absolute URL.
+
+        HTML entities are decoded and a protocol-relative ``//host/...``
+        link gets an ``https:`` scheme. A duckduckgo.com link that carries
+        its target in a ``uddg`` query parameter is replaced by that target.
+
+        Raises:
+            ValueError: If ``urlsplit`` rejects the URL as malformed.
+        """
+        from html import unescape
+        from urllib.parse import parse_qs, urlsplit
+
+        url = unescape(href).strip()
+        if url.startswith('//'):
+            url = 'https:' + url
+        parts = urlsplit(url)
+        if parts.netloc.endswith('duckduckgo.com'):
+            target = parse_qs(parts.query).get('uddg')
+            if target and target[0]:
+                return target[0]
+        return url
+
+    @staticmethod
+    def _clean_text(text: str) -> str:
+        """Strip HTML tags, decode entities and collapse whitespace."""
+        import re
+        from html import unescape
+
+        text = re.sub(r'<[^>]+>', '', text)
+        # Decode entities only after tag removal so an escaped "&lt;b&gt;"
+        # stays visible text instead of being stripped as a tag.
+        text = unescape(text)
+        text = re.sub(r'\s+', ' ', text).strip()
+        return text
--- a/providers/searxng_provider.py
+++ b/providers/searxng_provider.py
@@ -0,0 +1,61 @@
+"""SearXNG 搜索源 — 自托管元搜索引擎"""
+
+import time
+import requests
+from providers.base import BaseProvider, SearchResult
+
+
+class SearXNGProvider(BaseProvider):
+    """SearXNG search provider; queries a SearXNG instance's JSON API."""
+
+    name = 'searxng'
+    display_name = 'SearXNG'
+    needs_api_key = False
+    enabled = True
+    priority = 30
+
+    def __init__(self, config: dict):
+        """Read ``searxng.base_url`` from ``config``.
+
+        A missing or empty value falls back to ``http://localhost:8888``.
+        """
+        super().__init__(config)
+        sc = config.get('searxng') or {}
+        self.base_url = (sc.get('base_url') or 'http://localhost:8888').rstrip('/')
+
+    def is_available(self) -> bool:
+        """Always True; reachability of the instance is not probed here."""
+        return True
+
+    @staticmethod
+    def _normalize_published(value):
+        """Reduce an ISO-style timestamp string to ``YYYY-MM-DD HH:MM:SS``.
+
+        The ``T`` separator becomes a space and everything from the first
+        ``+`` or ``Z`` on is dropped. Falsy values become ``''``; other
+        non-string values are returned unchanged.
+        """
+        if not value:
+            return ''
+        if not isinstance(value, str):
+            return value
+        return value.replace('T', ' ').split('+')[0].split('Z')[0]
+
+    def search(self, query: str, max_results: int = 10) -> list:
+        """Query the first page of general results.
+
+        Args:
+            query: Search term.
+            max_results: Upper bound on returned results.
+
+        Returns:
+            list: Up to ``max_results`` ``SearchResult`` objects; empty on a
+            transport error, a non-200 response or an undecodable body.
+        """
+        if max_results <= 0:
+            return []
+
+        url = f'{self.base_url}/search'
+        params = {
+            'q': query,
+            'format': 'json',
+            'language': 'zh-CN',
+            'categories': 'general',
+            'pageno': 1,
+        }
+
+        try:
+            resp = requests.get(url, params=params, timeout=15,
+                                headers={'User-Agent': 'SearchHub/1.0',
+                                         'Accept': 'application/json'})
+            if resp.status_code != 200:
+                return []
+            data = resp.json()
+        except (requests.exceptions.RequestException, ValueError):
+            # ValueError covers a body that is not valid JSON.
+            return []
+        if not isinstance(data, dict):
+            return []
+
+        results = []
+        for item in data.get('results') or []:
+            if len(results) >= max_results:
+                break
+            if not isinstance(item, dict):
+                continue
+            # A null score falls back to the same default as a missing one.
+            score = item.get('score')
+            if score is None:
+                score = 0.5
+            results.append(SearchResult(
+                title=item.get('title') or '',
+                url=item.get('url') or '',
+                content=item.get('content') or '',
+                score=score,
+                source=self.name,
+                published_date=self._normalize_published(item.get('publishedDate')),
+            ))
+        return results
--- a/providers/tavily_provider.py
+++ b/providers/tavily_provider.py
@@ -0,0 +1,59 @@
+"""Tavily 搜索源"""
+
+import time
+import requests
+from providers.base import BaseProvider, SearchResult
+
+
+class TavilyProvider(BaseProvider):
+    """Tavily search provider; posts queries to the Tavily search endpoint."""
+
+    name = 'tavily'
+    display_name = 'Tavily'
+    needs_api_key = True
+    enabled = True
+    priority = 20
+
+    def __init__(self, config: dict):
+        """Read the ``tavily`` section of ``config``.
+
+        Args:
+            config: Application config; ``tavily.api_key``,
+                ``tavily.base_url``, ``tavily.depth`` and
+                ``tavily.max_results`` are used.
+        """
+        super().__init__(config)
+        # `or` fallbacks also cover keys that are present but null/empty, so
+        # `.rstrip` is never called on None.
+        tc = config.get('tavily') or {}
+        self.api_key = tc.get('api_key')
+        self.base_url = (tc.get('base_url') or 'https://api.tavily.com').rstrip('/')
+        self.depth = tc.get('depth') or 'basic'
+        self.max_results = tc.get('max_results') or 10
+
+    def is_available(self) -> bool:
+        """Return True when an API key is configured."""
+        return bool(self.api_key)
+
+    def search(self, query: str, max_results: int = None) -> list:
+        """Run a Tavily search.
+
+        Args:
+            query: Search term.
+            max_results: Upper bound on returned results; ``None`` or ``0``
+                falls back to the configured default.
+
+        Returns:
+            list: ``SearchResult`` objects; empty without an API key, on a
+            transport error, a non-200 response or an undecodable body.
+        """
+        if not self.api_key:
+            return []
+
+        limit = max_results or self.max_results
+        url = f'{self.base_url}/search'
+        payload = {
+            'api_key': self.api_key,
+            'query': query,
+            'search_depth': self.depth,
+            'max_results': limit,
+            'include_answer': False,
+            'include_images': False,
+        }
+
+        try:
+            resp = requests.post(url, json=payload, timeout=30)
+            if resp.status_code != 200:
+                return []
+            data = resp.json()
+        except (requests.exceptions.RequestException, ValueError):
+            # ValueError covers a body that is not valid JSON.
+            return []
+        if not isinstance(data, dict):
+            return []
+
+        results = []
+        for item in data.get('results') or []:
+            if not isinstance(item, dict):
+                continue
+            results.append(SearchResult(
+                title=item.get('title') or '',
+                url=item.get('url') or '',
+                content=item.get('content') or '',
+                score=item.get('score') or 0,
+                source=self.name,
+                published_date=item.get('published_date') or '',
+            ))
+
+        # Enforce the requested cap locally as well, in case the service
+        # returns more entries than asked for.
+        if isinstance(limit, int) and limit > 0:
+            return results[:limit]
+        return results