init: Search Hub - 统一多搜索引擎聚合服务

2026-05-09 18:46:05 +08:00
commit 81d726179c
27 changed files with 3179 additions and 0 deletions
--- a/providers/duckduckgo_provider.py
+++ b/providers/duckduckgo_provider.py
@@ -0,0 +1,142 @@
+"""DuckDuckGo 搜索源 — 免费，无需 API key"""
+
+import logging
+import re
+import time
+from html import unescape
+from urllib.parse import parse_qs, urlparse
+
+import requests
+
+from providers.base import BaseProvider, SearchResult
+
+
+class DuckDuckGoProvider(BaseProvider):
+    name = 'duckduckgo'
+    display_name = 'DuckDuckGo'
+    needs_api_key = False
+    enabled = False  # 国内网络不可用，默认关闭
+    priority = 30
+
+    def __init__(self, config: dict):
+        super().__init__(config)
+
+    def search(self, query: str, max_results: int = 10) -> list:
+        results = []
+
+        # 1. 先尝试 Instant Answer API（获取摘要和主题）
+        try:
+            url = 'https://api.duckduckgo.com/'
+            params = {
+                'q': query,
+                'format': 'json',
+                'no_html': 1,
+                'skip_disambig': 1,
+            }
+            resp = requests.get(url, params=params, timeout=8)
+            if resp.status_code == 200:
+                data = resp.json()
+                # Abstract
+                abstract = data.get('AbstractText', '')
+                if abstract and data.get('AbstractURL'):
+                    results.append(SearchResult(
+                        title=data.get('Heading', 'DuckDuckGo 摘要'),
+                        url=data['AbstractURL'],
+                        content=abstract,
+                        score=0.9,
+                        source=self.name,
+                    ))
+                # Related topics
+                for topic in data.get('RelatedTopics', []):
+                    if 'Topics' in topic:
+                        for sub in topic['Topics'][:3]:
+                            if sub.get('Text'):
+                                results.append(SearchResult(
+                                    title=sub.get('Text', '')[:80],
+                                    url=sub.get('FirstURL', ''),
+                                    content=sub.get('Text', ''),
+                                    score=0.7,
+                                    source=self.name,
+                                ))
+                    elif topic.get('Text'):
+                        results.append(SearchResult(
+                            title=topic.get('Text', '')[:80],
+                            url=topic.get('FirstURL', ''),
+                            content=topic.get('Text', ''),
+                            score=0.7,
+                            source=self.name,
+                        ))
+        except requests.exceptions.RequestException:
+            pass
+
+        # 2. 如果结果不够，再抓取 HTML 版本获取更多结果
+        if len(results) < max_results:
+            try:
+                url = 'https://html.duckduckgo.com/html/'
+                resp = requests.post(url, data={'q': query}, timeout=15,
+                                     headers={'User-Agent': 'Mozilla/5.0'})
+                if resp.status_code == 200:
+                    html = resp.text
+                    more = self._parse_html_results(html, max_results - len(results))
+                    results.extend(more)
+            except requests.exceptions.RequestException:
+                pass
+
+        # 去重
+        seen_urls = set()
+        unique = []
+        for r in results:
+            if r.url and r.url not in seen_urls:
+                seen_urls.add(r.url)
+                unique.append(r)
+
+        return unique[:max_results]
+
+    def _parse_html_results(self, html: str, limit: int) -> list:
+        """简单解析 DuckDuckGo HTML 搜索结果"""
+        results = []
+        # 按 <a rel="nofollow" 分割找链接
+        for block in html.split('<a rel="nofollow"')[1:]:
+            if len(results) >= limit:
+                break
+            try:
+                # 提取 URL
+                href_start = block.find('href="')
+                if href_start == -1:
+                    continue
+                href_start += 6
+                href_end = block.find('"', href_start)
+                url = block[href_start:href_end]
+
+                # 提取标题 (在 <a 标签后找 >xxx</a>)
+                title_start = block.find('>', href_end)
+                if title_start == -1:
+                    continue
+                title_start += 1
+                title_end = block.find('</a>', title_start)
+                title = self._clean_text(block[title_start:title_end])
+
+                # 提取摘要（在 <a> 后的某个 <td> 或 <div> 中）
+                snippet = ''
+                for kw in ['class="result-snippet"', 'class="snippet"']:
+                    idx = block.find(kw)
+                    if idx != -1:
+                        tag_close = block.find('>', idx) + 1
+                        next_tag = block.find('<', tag_close)
+                        if next_tag != -1:
+                            snippet = self._clean_text(block[tag_close:next_tag])
+                        break
+
+                if url and title:
+                    results.append(SearchResult(
+                        title=title,
+                        url=url,
+                        content=snippet or title,
+                        score=0.6,
+                        source=self.name,
+                    ))
+            except Exception:
+                continue
+        return results
+
+    @staticmethod
+    def _clean_text(text: str) -> str:
+        """清理 HTML 标签和多余空白"""
+        import re
+        text = re.sub(r'<[^>]+>', '', text)
+        text = re.sub(r'\s+', ' ', text).strip()
+        return text