feat: add arxiv-search tool #143

Closed
wants to merge 1 commit
Binary file added tools/arxiv-search/banner.png
Binary file added tools/arxiv-search/icon.png
48 changes: 48 additions & 0 deletions tools/arxiv-search/index.test.ts
@@ -0,0 +1,48 @@
import { expect } from '@jest/globals';
import { getToolTestClient } from '../../src/test/utils';
import * as path from 'path';

describe('Arxiv Search Tool', () => {
  const toolPath = path.join(__dirname, 'tool.py');
  const client = getToolTestClient();

  it('searches for basic query successfully', async () => {
    const response = await client.executeToolFromFile(toolPath, {
      query: 'deep learning'
    });
    // Log the response at several levels of nesting for easier debugging.
    console.log('Full response:', JSON.stringify(response, null, 2));
    console.log('\nPapers array:', JSON.stringify(response.papers, null, 2));
    if (response.papers && response.papers.length > 0) {
      console.log('\nFirst paper details:', JSON.stringify(response.papers[0], null, 2));
    }

    expect(Array.isArray(response.papers)).toBe(true);
    expect(response).toHaveProperty('total_results');
    expect(response.total_results).toBeGreaterThanOrEqual(0);
    if (response.total_results > 0) {
      const paper = response.papers[0];
      expect(paper).toHaveProperty('title');
      expect(paper).toHaveProperty('pdf_url');
    }
  }, 30000);

  it('applies a maximum results limit', async () => {
    const response = await client.executeToolFromFile(toolPath, {
      query: 'blockchain',
      max_results: 3
    });
    expect(Array.isArray(response.papers)).toBe(true);
    expect(response.papers.length).toBeLessThanOrEqual(3);
  }, 30000);

  it('handles date range filters (optional)', async () => {
    const response = await client.executeToolFromFile(toolPath, {
      query: 'quantum computing',
      date_from: '2023-01-01',
      date_to: '2023-12-31',
      max_results: 5
    });
    expect(response.papers.length).toBeLessThanOrEqual(5);
  }, 30000);
});
76 changes: 76 additions & 0 deletions tools/arxiv-search/metadata.json
@@ -0,0 +1,76 @@
{
  "id": "arxiv-search",
  "name": "arxiv-search",
  "version": "1.0.0",
  "description": "Search for papers on arXiv with optional date range and category filters",
  "author": "Shinkai",
  "keywords": [
    "arxiv",
    "search",
    "papers",
    "research",
    "academic",
    "scientific"
  ],
  "configurations": {
    "type": "object",
    "properties": {},
    "required": []
  },
  "parameters": {
    "type": "object",
    "properties": {
      "query": {
        "type": "string",
        "description": "Search query string"
      },
      "max_results": {
        "type": "number",
        "description": "Maximum number of results to return",
        "default": 10
      },
      "date_from": {
        "type": "string",
        "description": "Earliest publication date, as a parseable date string (optional)"
      },
      "date_to": {
        "type": "string",
        "description": "Latest publication date, as a parseable date string (optional)"
      },
      "categories": {
        "type": "array",
        "description": "List of category filters (e.g. [\"cs.LG\"])",
        "items": { "type": "string" }
      }
    },
    "required": ["query"]
  },
  "result": {
    "type": "object",
    "properties": {
      "papers": {
        "type": "array",
        "items": {
          "type": "object",
          "properties": {
            "id": { "type": "string" },
            "title": { "type": "string" },
            "authors": {
              "type": "array",
              "items": { "type": "string" }
            },
            "abstract": { "type": "string" },
            "published": { "type": "string" },
            "categories": {
              "type": "array",
              "items": { "type": "string" }
            },
            "pdf_url": { "type": "string" }
          }
        }
      },
      "total_results": { "type": "number" }
    },
    "required": ["papers", "total_results"]
  }
}
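For reference, a parameters object satisfying this schema might look like the sketch below (values are illustrative; only "query" is required):

params = {
    "query": "quantum computing",
    "max_results": 5,
    "date_from": "2023-01-01",
    "date_to": "2023-12-31",
    "categories": ["quant-ph", "cs.ET"]
}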
3 changes: 3 additions & 0 deletions tools/arxiv-search/store.json
@@ -0,0 +1,3 @@
{
  "categoryId": "16fa3d8c-305c-410b-a7c9-6708f04c9976"
}
103 changes: 103 additions & 0 deletions tools/arxiv-search/tool.py
@@ -0,0 +1,103 @@
# /// script
# dependencies = [
#   "requests",
#   "arxiv>=1.4.7",
#   "python-dateutil"
# ]
# ///

import arxiv
from datetime import timezone
from typing import List, Dict, Any
from dateutil import parser

class CONFIG:
    # This tool needs no configuration fields; the class is kept as a
    # placeholder in case environment variables are added later.
    pass

class INPUTS:
    query: str                   # The search query string
    max_results: int = 10
    date_from: str = ""
    date_to: str = ""
    categories: List[str] = []   # A list of arXiv category strings

class OUTPUT:
    papers: List[Dict[str, Any]]
    total_results: int

async def run(c: CONFIG, p: INPUTS) -> OUTPUT:
    """
    Search for papers on arXiv with optional date range and category filters.
    """
    # For safety, clamp max_results to the range 1..50.
    max_results = max(1, min(p.max_results, 50))

    # If categories were provided, combine them into a single query, e.g.
    # query "transformers" with categories ["cs.LG", "cs.CL"] becomes
    # "(transformers) AND (cat:cs.LG OR cat:cs.CL)".
    search_query = p.query.strip()
    if p.categories:
        cat_filter = " OR ".join(f"cat:{cat.strip()}" for cat in p.categories)
        search_query = f"({search_query}) AND ({cat_filter})"

    search = arxiv.Search(
        query=search_query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate
    )

    # Attempt to parse the date range if provided; invalid date strings
    # are ignored rather than treated as fatal.
    date_from = None
    date_to = None

    if p.date_from:
        try:
            date_from = parser.parse(p.date_from)
        except Exception:
            pass

    if p.date_to:
        try:
            date_to = parser.parse(p.date_to)
        except Exception:
            pass

    # arXiv publication timestamps are timezone-aware (UTC); make naive
    # parsed bounds aware as well so the comparisons below don't raise a
    # TypeError when mixing naive and aware datetimes.
    if date_from and date_from.tzinfo is None:
        date_from = date_from.replace(tzinfo=timezone.utc)
    if date_to and date_to.tzinfo is None:
        date_to = date_to.replace(tzinfo=timezone.utc)

    papers = []
    client = arxiv.Client()
    count = 0

    def is_within(date, start, end):
        if not date:
            return True
        if start and date < start:
            return False
        if end and date > end:
            return False
        return True

    for result in client.results(search):
        if is_within(result.published, date_from, date_to):
            short_id = result.get_short_id()
            # Convert authors and categories to plain lists of strings
            # before adding them to the dictionary.
            authors_list = [str(a.name) for a in result.authors]
            categories_list = list(result.categories)

            papers.append({
                "id": short_id,
                "title": result.title,
                "authors": authors_list,
                "abstract": result.summary,
                "published": result.published.isoformat(),
                "categories": categories_list,
                "pdf_url": result.pdf_url
            })
            count += 1
            if count >= max_results:
                break

    out = OUTPUT()
    out.papers = papers
    out.total_results = len(papers)
    return out
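For local experimentation outside the Shinkai runtime (which normally constructs CONFIG and INPUTS from metadata.json), the run coroutine can be driven by hand. A minimal sketch, assuming the script's dependencies are installed and the code above is in scope; the query and limit are illustrative:

import asyncio

# Hypothetical local driver, not part of the tool itself: fill INPUTS by
# hand and execute the coroutine with asyncio.
inputs = INPUTS()
inputs.query = "deep learning"
inputs.max_results = 3

output = asyncio.run(run(CONFIG(), inputs))
print(output.total_results)
for paper in output.papers:
    print(paper["title"], "->", paper["pdf_url"])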