/**
 * Integration tests for the arXiv download tool (`tool.py`).
 *
 * The tests run the real tool through the shared tool test client and may hit
 * the network, so the download tests tolerate an `error` response instead of
 * failing outright when the paper cannot be fetched.
 */
describe('Arxiv Download Tool', () => {
  const toolPath = path.join(__dirname, 'tool.py');
  const client = getToolTestClient();
  const storageFolder = path.join(process.cwd(), 'test-arxiv-download');

  beforeAll(() => {
    fs.mkdirSync(storageFolder, { recursive: true });
  });

  // Remove everything the tool wrote so a later run does not start from
  // cached PDFs/Markdown files left behind by this one.
  afterAll(() => {
    fs.rmSync(storageFolder, { recursive: true, force: true });
  });

  it('handles unknown paper gracefully', async () => {
    const response = await client.executeToolFromFile(
      toolPath,
      { paper_id: 'nonexistentid' },
      { storage_folder: storageFolder }
    );
    expect(response.status).toBe('error');
    expect(response.message).toMatch(/Paper not found/i);
  }, 60000);

  it('downloads a known paper (will skip real invalid IDs, you can mock)', async () => {
    // Uses a real-looking arXiv ID; the lookup needs network access, so an
    // error response is accepted (with a warning) rather than failing the run.
    const testPaperId = '2101.00001';
    const response = await client.executeToolFromFile(
      toolPath,
      { paper_id: testPaperId, convert_to_md: false },
      { storage_folder: storageFolder }
    );
    if (response.status === 'error') {
      // Possibly paper not found or network error
      console.warn('Could not test real paper - possibly invalid or network offline');
      expect(response.message).toMatch(/Paper not found|Error/);
    } else {
      expect(response.status).toBe('success');
      expect(typeof response.message).toBe('string');
      // On success the PDF must have been written into the storage folder.
      const pdfPath = path.join(storageFolder, `${testPaperId}.pdf`);
      expect(fs.existsSync(pdfPath)).toBe(true);
    }
  }, 90000);

  it('downloads and converts to md', async () => {
    const testPaperId = '2101.00001v1';
    const response = await client.executeToolFromFile(
      toolPath,
      { paper_id: testPaperId, convert_to_md: true },
      { storage_folder: storageFolder }
    );
    if (response.status === 'success' || response.status === 'exists') {
      // Both statuses report a Markdown file path that must exist on disk.
      expect(fs.existsSync(response.md_file)).toBe(true);
    } else {
      // Never pass silently: anything else has to be an explicit error result.
      console.warn('Could not test conversion - possibly invalid paper or network offline');
      expect(response.status).toBe('error');
    }
  }, 90000);
});
class CONFIG:
    """Tool configuration."""
    # Folder where downloaded PDFs and generated Markdown files are stored.
    storage_folder: str = "arxiv_papers"


class INPUTS:
    """Per-call parameters."""
    paper_id: str  # e.g. "2101.00001"
    convert_to_md: bool = True


class OUTPUT:
    """Result returned by `run`."""
    status: str   # "success", "exists" or "error"
    message: str  # human-readable description of the outcome
    md_file: str  # path of the Markdown file, or "" when there is none


def _make_output(status: str, message: str, md_file: str = "") -> OUTPUT:
    """Build an OUTPUT with all three fields populated."""
    out = OUTPUT()
    out.status = status
    out.message = message
    out.md_file = md_file
    return out


def _file_stem(paper_id: str) -> str:
    """Turn an arXiv ID into a single file-name component.

    Old-style IDs (e.g. "hep-th/9901001") contain a slash; used verbatim they
    would point into a subdirectory that does not exist, and an ID containing
    "../" could escape the storage folder. Replacing path separators keeps
    every output file directly inside the storage folder. IDs without
    separators (e.g. "2101.00001") are returned unchanged.
    """
    return paper_id.replace("/", "_").replace("\\", "_")


async def run(c: CONFIG, p: INPUTS) -> OUTPUT:
    """
    Download a paper from arXiv by ID, store it as a PDF in the storage folder,
    and optionally convert it to Markdown.

    Args:
        c: configuration; `storage_folder` is created if it does not exist.
        p: inputs; `paper_id` selects the paper, `convert_to_md` controls
           whether a `.md` file is produced next to the PDF.

    Returns:
        OUTPUT with status
          - "exists":  a Markdown file for this ID is already present,
          - "success": the PDF is available (and converted when requested),
          - "error":   the ID is empty, the paper was not found, or the
                       Markdown conversion failed.
        `md_file` is the Markdown path for "exists" and for "success" with
        conversion enabled, otherwise "".

    Raises:
        Whatever the arXiv lookup or the PDF download raises; such failures
        are not converted into an "error" result.
    """
    requested_id = getattr(p, "paper_id", "") or ""
    lookup_id = str(requested_id).strip()
    if not lookup_id:
        # An empty ID cannot identify a paper; do not send it to the API.
        return _make_output("error", f"Paper not found: {requested_id}")

    folder = Path(c.storage_folder)
    folder.mkdir(parents=True, exist_ok=True)

    stem = _file_stem(lookup_id)
    md_path = folder / f"{stem}.md"
    pdf_path = folder / f"{stem}.pdf"

    # If we already have the .md for that paper there is nothing left to do.
    if md_path.exists():
        return _make_output(
            "exists",
            f"Paper {requested_id} already downloaded/converted.",
            str(md_path),
        )

    # Only query arXiv when the PDF is not already on disk.
    if not pdf_path.exists():
        search = arxiv.Search(id_list=[lookup_id])
        client = arxiv.Client()
        paper = next(client.results(search), None)
        if paper is None:
            return _make_output("error", f"Paper not found: {requested_id}")

        # Download under a temporary name and rename on completion, so an
        # interrupted download is never mistaken for a finished PDF later.
        partial_path = folder / f"{stem}.pdf.part"
        try:
            paper.download_pdf(dirpath=str(folder), filename=partial_path.name)
            partial_path.replace(pdf_path)
        finally:
            # No-op after a successful rename; removes leftovers on failure.
            partial_path.unlink(missing_ok=True)

    # Optionally convert the PDF to Markdown using pymupdf4llm.
    if p.convert_to_md:
        try:
            markdown_text = pymupdf4llm.to_markdown(str(pdf_path), show_progress=False)
            md_path.write_text(markdown_text, encoding="utf-8")
        except Exception as e:
            return _make_output("error", f"Conversion failed: {str(e)}")

    return _make_output(
        "success",
        f"Paper {requested_id} downloaded successfully.",
        str(md_path) if p.convert_to_md else "",
    )