dcSpark · guillevalin · Feb 4, 2025 · Feb 4, 2025
diff --git a/tools/arxiv-download/banner.png b/tools/arxiv-download/banner.png
diff --git a/tools/arxiv-download/icon.png b/tools/arxiv-download/icon.png
diff --git a/tools/arxiv-download/index.test.ts b/tools/arxiv-download/index.test.ts
@@ -0,0 +1,58 @@
+import { expect } from '@jest/globals';
+import { getToolTestClient } from '../../src/test/utils';
+import * as path from 'path';
+import * as fs from 'fs';
+
+describe('Arxiv Download Tool', () => {
+  // Integration tests: each case runs tool.py through the test client; nothing is mocked, so arXiv must be reachable.
+  const toolPath = path.join(__dirname, 'tool.py');
+  const client = getToolTestClient();
+  const storageFolder = path.join(process.cwd(), 'test-arxiv-download');
+
+  beforeAll(() => {
+    fs.mkdirSync(storageFolder, { recursive: true });
+  });
+  // Clean up downloads: a leftover .md makes the tool answer "exists" on the next run.
+  afterAll(() => {
+    fs.rmSync(storageFolder, { recursive: true, force: true });
+  });
+
+  it('handles unknown paper gracefully', async () => {
+    const response = await client.executeToolFromFile(
+      toolPath,
+      { paper_id: 'nonexistentid' },
+      { storage_folder: storageFolder }
+    );
+    expect(response.status).toBe('error');
+    expect(response.message).toMatch(/Paper not found/i);
+  }, 60000);
+
+  it('downloads a known paper (will skip real invalid IDs, you can mock)', async () => {
+    const testPaperId = '2101.00001'; // real-looking arXiv ID; an error result is tolerated when offline
+    const response = await client.executeToolFromFile(
+      toolPath,
+      { paper_id: testPaperId, convert_to_md: false },
+      { storage_folder: storageFolder }
+    );
+    if (response.status === 'error') {
+      console.warn('Could not test real paper - possibly invalid or network offline');
+      expect(response.message).toMatch(/Paper not found|Error/);
+    } else {
+      expect(response.status).toBe('success');
+      expect(typeof response.message).toBe('string');
+      expect(fs.existsSync(path.join(storageFolder, `${testPaperId}.pdf`))).toBe(true);
+    }
+  }, 90000);
+
+  it('downloads and converts to md', async () => {
+    const response = await client.executeToolFromFile(
+      toolPath,
+      { paper_id: '2101.00001v1', convert_to_md: true },
+      { storage_folder: storageFolder }
+    );
+    expect(['success', 'exists', 'error']).toContain(response.status);
+    if (response.status !== 'error') {
+      expect(fs.existsSync(response.md_file)).toBe(true);
+    }
+  }, 90000);
+});
diff --git a/tools/arxiv-download/metadata.json b/tools/arxiv-download/metadata.json
@@ -0,0 +1,49 @@
+{
+  "id": "arxiv-download",
+  "name": "arxiv-download",
+  "version": "1.0.0",
+  "description": "Download an arXiv paper PDF and optionally convert it to Markdown",
+  "author": "Shinkai",
+  "keywords": [
+    "arxiv",
+    "pdf",
+    "download",
+    "markdown",
+    "research",
+    "paper"
+  ],
+  "configurations": {
+    "type": "object",
+    "properties": {
+      "storage_folder": {
+        "type": "string",
+        "description": "Where to store PDFs/MD outputs",
+        "default": "arxiv_papers"
+      }
+    }
+  },
+  "parameters": {
+    "type": "object",
+    "properties": {
+      "paper_id": {
+        "type": "string",
+        "description": "ArXiv paper ID to download"
+      },
+      "convert_to_md": {
+        "type": "boolean",
+        "description": "Whether to convert the downloaded PDF to .md",
+        "default": true
+      }
+    },
+    "required": ["paper_id"]
+  },
+  "result": {
+    "type": "object",
+    "properties": {
+      "status": { "type": "string" },
+      "message": { "type": "string" },
+      "md_file": { "type": "string" }
+    },
+    "required": ["status", "message"]
+  }
+} 
diff --git a/tools/arxiv-download/store.json b/tools/arxiv-download/store.json
@@ -0,0 +1,3 @@
+{
+  "categoryId": "16fa3d8c-305c-410b-a7c9-6708f04c9976"
+}
diff --git a/tools/arxiv-download/tool.py b/tools/arxiv-download/tool.py
@@ -0,0 +1,81 @@
+# /// script
+# dependencies = [
+#   "requests",
+#   "arxiv>=1.4.7",
+#   "pymupdf4llm",
+#   "pathlib"
+# ]
+# ///
+
+import arxiv
+import requests
+import json
+import pymupdf4llm
+from pathlib import Path
+from typing import Dict, Any
+
+class CONFIG:  # Tool configuration; run() receives an instance as `c`.
+    storage_folder: str = "arxiv_papers"  # directory run() creates and writes the PDF/.md files into
+
+class INPUTS:  # Per-call parameters; run() receives an instance as `p`.
+    paper_id: str  # arXiv paper ID, e.g. "2101.00001"
+    convert_to_md: bool = True  # when true, run() also converts the downloaded PDF to Markdown
+
+class OUTPUT:  # Result object built and returned by run().
+    status: str  # "success", "exists" or "error" (the values run() assigns)
+    message: str  # human-readable description of the outcome
+    md_file: str  # path of the Markdown file, or "" when none was produced
+
+async def run(c: CONFIG, p: INPUTS) -> OUTPUT:
+    """
+    Download a paper from arXiv by ID, store as PDF in the storage folder, optionally convert to .md
+    """
+    folder = Path(c.storage_folder)
+    folder.mkdir(parents=True, exist_ok=True)
+
+    # if we already have .md for that paper, skip
+    md_path = folder / f"{p.paper_id}.md"
+    if md_path.exists():
+        out = OUTPUT()
+        out.status = "exists"
+        out.message = f"Paper {p.paper_id} already downloaded/converted."
+        out.md_file = str(md_path)
+        return out
+
+    # otherwise, we do the download
+    search = arxiv.Search(id_list=[p.paper_id])
+    client = arxiv.Client()
+    try:
+        paper = next(client.results(search))
+    except StopIteration:
+        out = OUTPUT()
+        out.status = "error"
+        out.message = f"Paper not found: {p.paper_id}"
+        out.md_file = ""
+        return out
+
+    # Download PDF
+    pdf_path = folder / f"{p.paper_id}.pdf"
+    if not pdf_path.exists():
+        paper.download_pdf(dirpath=str(folder), filename=pdf_path.name)
+
+    # Optionally convert
+    if p.convert_to_md:
+        # Convert using pymupdf4llm
+        try:
+            markdown_text = pymupdf4llm.to_markdown(str(pdf_path), show_progress=False)
+            md_path.write_text(markdown_text, encoding='utf-8')
+            # remove pdf if you want
+            # pdf_path.unlink()
+        except Exception as e:
+            out = OUTPUT()
+            out.status = "error"
+            out.message = f"Conversion failed: {str(e)}"
+            out.md_file = ""
+            return out
+
+    out = OUTPUT()
+    out.status = "success"
+    out.message = f"Paper {p.paper_id} downloaded successfully."
+    out.md_file = str(md_path) if p.convert_to_md else ""
+    return out