Add Discord attachment reading and web search capabilities

- Discord channel now downloads and extracts text from attachments (text files, PDFs)
- Added WebSearchTool using DuckDuckGo for researcher and coder agents
- Improved WebFetchTool with User-Agent header and HTML-to-text stripping
- Added pypdf and duckduckgo-search dependencies

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-18 18:07:19 +02:00
parent b3608b35fa
commit e24e3026b6
8 changed files with 189 additions and 3 deletions
@@ -216,6 +216,26 @@ class BashTool(Tool):
            return f"Error: Command timed out after {self._timeout}s"


+def _strip_html(html: str) -> str:
+    """Convert an HTML document into readable plain text.
+
+    Script and style blocks are dropped together with their contents, common
+    block-level opening tags become line breaks, every remaining tag is
+    removed, character references are decoded, and whitespace is collapsed.
+
+    Args:
+        html: Raw HTML markup.
+
+    Returns:
+        The extracted text with leading and trailing whitespace removed.
+    """
+    # The parameter is named ``html`` and shadows the stdlib module of the
+    # same name, so import the decoder function directly.
+    from html import unescape
+
+    # Remove script and style blocks
+    text = re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE)
+    # Replace <br>, <p>, <div>, <li> etc. with newlines
+    text = re.sub(r"<(br|p|div|li|h[1-6]|tr)[^>]*/?>", "\n", text, flags=re.IGNORECASE)
+    # Strip remaining tags
+    text = re.sub(r"<[^>]+>", "", text)
+    # Decode character references in a single pass. Chained str.replace calls
+    # that handle "&amp;" first decode escaped text twice ("&amp;lt;" -> "<")
+    # and leave numeric and less common named references untouched.
+    # Non-breaking spaces are mapped to plain spaces so they collapse below.
+    text = unescape(text).replace("\xa0", " ")
+    # Collapse whitespace
+    text = re.sub(r"[ \t]+", " ", text)
+    # Drop the single space that may remain on either side of a newline;
+    # otherwise whitespace-only lines hide runs of blank lines from the
+    # collapse that follows.
+    text = re.sub(r" ?\n ?", "\n", text)
+    text = re.sub(r"\n{3,}", "\n\n", text)
+    return text.strip()
+
+
+# User-Agent header value that WebFetchTool passes to its httpx client.
+_WEB_USER_AGENT = "Mozilla/5.0 (compatible; XtrmAgent/1.0; +https://github.com)"
+
+
 class WebFetchTool(Tool):
    @property
    def name(self) -> str:
@@ -223,7 +243,7 @@ class WebFetchTool(Tool):

    @property
    def description(self) -> str:
-        return "Fetch the content of a URL."
+        return "Fetch the content of a URL and return it as readable text."

    @property
    def parameters(self) -> dict[str, Any]:
@@ -237,9 +257,14 @@ class WebFetchTool(Tool):

    async def execute(self, url: str, **_: Any) -> str:
        try:
-            async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
+            async with httpx.AsyncClient(
+                timeout=30, follow_redirects=True, headers={"User-Agent": _WEB_USER_AGENT}
+            ) as client:
                resp = await client.get(url)
+                content_type = resp.headers.get("content-type", "")
                text = resp.text
+                if "html" in content_type:
+                    text = _strip_html(text)
                if len(text) > 20_000:
                    text = text[:20_000] + "\n... (truncated)"
                return text
@@ -247,6 +272,51 @@ class WebFetchTool(Tool):
            return f"Error fetching URL: {e}"


+class WebSearchTool(Tool):
+    """Tool that runs a DuckDuckGo text search and formats the hits as text."""
+
+    @property
+    def name(self) -> str:
+        """Return the tool's name, ``web_search``."""
+        return "web_search"
+
+    @property
+    def description(self) -> str:
+        """Return a one-sentence summary of what the tool does."""
+        return "Search the web using DuckDuckGo and return a list of results with title, URL, and snippet."
+
+    @property
+    def parameters(self) -> dict[str, Any]:
+        """Return the JSON-Schema-style description of the accepted arguments."""
+        query_schema = {"type": "string", "description": "Search query"}
+        max_results_schema = {
+            "type": "integer",
+            "description": "Maximum number of results (default: 5)",
+            "default": 5,
+        }
+        return {
+            "type": "object",
+            "properties": {
+                "query": query_schema,
+                "max_results": max_results_schema,
+            },
+            "required": ["query"],
+        }
+
+    async def execute(self, query: str, max_results: int = 5, **_: Any) -> str:
+        """Search DuckDuckGo for *query* and return the formatted results.
+
+        Each result contributes four lines: the title in bold markers, the
+        URL, the snippet, and a ``---`` separator. Any exception, including
+        a failed import of ``duckduckgo_search``, is reported as an
+        ``Error searching: ...`` string instead of being raised.
+
+        Args:
+            query: Search query text.
+            max_results: Value forwarded to the search call as its result limit.
+
+        Returns:
+            The formatted results, ``"No results found."`` when the search
+            returns nothing, or an error message.
+        """
+        try:
+            from duckduckgo_search import AsyncDDGS
+
+            async with AsyncDDGS() as ddgs:
+                hits = await ddgs.atext(query, max_results=max_results)
+
+            if not hits:
+                return "No results found."
+
+            chunks: list[str] = []
+            for hit in hits:
+                chunks.extend(
+                    (
+                        f"**{hit.get('title', '')}**",
+                        hit.get("href", ""),
+                        hit.get("body", ""),
+                        "---",
+                    )
+                )
+            return "\n".join(chunks)
+        except Exception as exc:
+            return f"Error searching: {exc}"
+
+
 def register_builtin_tools(registry: Any, workspace: Path) -> None:
    """Register all built-in tools into a ToolRegistry."""
    registry.register(ReadFileTool(workspace))
@@ -255,3 +325,4 @@ def register_builtin_tools(registry: Any, workspace: Path) -> None:
    registry.register(ListDirTool(workspace))
    registry.register(BashTool(workspace))
    registry.register(WebFetchTool())
+    registry.register(WebSearchTool())