Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add arxiv-download tool #122

Merged
merged 1 commit into from
Feb 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added tools/arxiv-download/banner.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tools/arxiv-download/icon.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
58 changes: 58 additions & 0 deletions tools/arxiv-download/index.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import { expect } from '@jest/globals';
import { getToolTestClient } from '../../src/test/utils';
import * as path from 'path';
import * as fs from 'fs';

/**
 * Integration tests for the arxiv-download tool.
 *
 * The tests run `tool.py` through the shared tool test client against a
 * scratch storage folder. They depend on network access to arXiv, so the
 * download tests tolerate an `error` response and only check that the
 * failure is reported in a structured way.
 */
describe('Arxiv Download Tool', () => {
  const toolPath = path.join(__dirname, 'tool.py');
  const client = getToolTestClient();
  const storageFolder = path.join(process.cwd(), 'test-arxiv-download');

  beforeAll(() => {
    fs.mkdirSync(storageFolder, { recursive: true });
  });

  // Remove the scratch folder so downloaded PDFs/Markdown do not pile up in
  // the working directory, and so a stale .md from a previous run cannot make
  // the tool short-circuit with status "exists".
  afterAll(() => {
    fs.rmSync(storageFolder, { recursive: true, force: true });
  });

  it('handles unknown paper gracefully', async () => {
    const response = await client.executeToolFromFile(
      toolPath,
      { paper_id: 'nonexistentid' },
      { storage_folder: storageFolder }
    );
    expect(response.status).toBe('error');
    expect(response.message).toMatch(/Paper not found/i);
  }, 60000);

  it('downloads a known paper (will skip real invalid IDs, you can mock)', async () => {
    // Provide a real arXiv ID for testing. This test might need to be mocked
    // for offline usage.
    const testPaperId = '2101.00001'; // Example only, might exist
    const response = await client.executeToolFromFile(
      toolPath,
      { paper_id: testPaperId, convert_to_md: false },
      { storage_folder: storageFolder }
    );
    if (response.status === 'error') {
      // Possibly paper not found or network error
      console.warn('Could not test real paper - possibly invalid or network offline');
      expect(response.message).toMatch(/Paper not found|Error/);
    } else {
      expect(response.status).toBe('success');
      expect(typeof response.message).toBe('string');
      // If it succeeded, we should have a PDF in the folder
      const pdfPath = path.join(storageFolder, testPaperId + '.pdf');
      expect(fs.existsSync(pdfPath)).toBe(true);
    }
  }, 90000);

  it('downloads and converts to md', async () => {
    const testPaperId = '2101.00001v1';
    const response = await client.executeToolFromFile(
      toolPath,
      { paper_id: testPaperId, convert_to_md: true },
      { storage_folder: storageFolder }
    );
    if (response.status === 'error') {
      // Offline or paper unavailable: still require a structured error
      // instead of letting the test pass without asserting anything.
      console.warn('Could not test conversion - possibly invalid or network offline');
      expect(typeof response.message).toBe('string');
    } else {
      // "exists" is what the tool reports when the .md is already on disk.
      expect(['success', 'exists']).toContain(response.status);
      expect(fs.existsSync(response.md_file)).toBe(true);
    }
  }, 90000);
});
49 changes: 49 additions & 0 deletions tools/arxiv-download/metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
{
"id": "arxiv-download",
"name": "arxiv-download",
"version": "1.0.0",
"description": "Download an arXiv paper PDF and optionally convert it to Markdown",
"author": "Shinkai",
"keywords": [
"arxiv",
"pdf",
"download",
"markdown",
"research",
"paper"
],
"configurations": {
"type": "object",
"properties": {
"storage_folder": {
"type": "string",
"description": "Where to store PDFs/MD outputs",
"default": "arxiv_papers"
}
}
},
"parameters": {
"type": "object",
"properties": {
"paper_id": {
"type": "string",
"description": "ArXiv paper ID to download"
},
"convert_to_md": {
"type": "boolean",
"description": "Whether to convert the downloaded PDF to .md",
"default": true
}
},
"required": ["paper_id"]
},
"result": {
"type": "object",
"properties": {
"status": { "type": "string" },
"message": { "type": "string" },
"md_file": { "type": "string" }
},
"required": ["status", "message"]
}
}
3 changes: 3 additions & 0 deletions tools/arxiv-download/store.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"categoryId": "16fa3d8c-305c-410b-a7c9-6708f04c9976"
}
81 changes: 81 additions & 0 deletions tools/arxiv-download/tool.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# /// script
# dependencies = [
# "requests",
# "arxiv>=1.4.7",
# "pymupdf4llm",
# "pathlib"
# ]
# ///

import arxiv
import requests
import json
import pymupdf4llm
from pathlib import Path
from typing import Dict, Any

class CONFIG:
    """Configuration values read by `run`."""

    # Directory in which `run` stores the downloaded PDF and the generated
    # Markdown file; `run` creates it (with parents) when it is missing.
    storage_folder: str = "arxiv_papers"

class INPUTS:
    """Per-call parameters read by `run`."""

    # arXiv identifier of the paper to fetch; also used to name the output files.
    paper_id: str  # e.g. "2101.00001"
    # When true, the downloaded PDF is additionally converted to a .md file.
    convert_to_md: bool = True

class OUTPUT:
    """Result object returned by `run`."""

    # `run` sets this to "success", "exists" (Markdown already on disk) or "error".
    status: str
    # Human-readable description of the outcome or of the failure.
    message: str
    # Path of the Markdown file as a string; `run` sets it to "" when no
    # Markdown file was produced.
    md_file: str

def _make_output(status: str, message: str, md_file: str = "") -> OUTPUT:
    """Build an OUTPUT carrying the given status, message and Markdown path."""
    out = OUTPUT()
    out.status = status
    out.message = message
    out.md_file = md_file
    return out


def _file_stem(paper_id: str) -> str:
    """Map an arXiv ID to a name that is safe to use as a single file name."""
    # IDs such as "hep-th/9901001" contain a path separator; used verbatim they
    # would point into a sub-directory of the storage folder that is never
    # created. Replacing separators keeps every output directly in the folder.
    return paper_id.replace("/", "_").replace("\\", "_")


async def run(c: CONFIG, p: INPUTS) -> OUTPUT:
    """
    Download a paper from arXiv by ID, store it as a PDF in the storage folder
    and optionally convert it to Markdown.

    Args:
        c: Configuration; `storage_folder` is created if it does not exist.
        p: Inputs; `paper_id` selects the paper and `convert_to_md` controls
           whether a .md file is written next to the PDF.

    Returns:
        An OUTPUT whose `status` is:
        - "exists"  when the .md for this paper is already in the folder
                    (`md_file` holds its path, nothing is downloaded);
        - "success" when the PDF is present/downloaded and, if requested,
                    converted (`md_file` is the .md path, or "" when
                    `convert_to_md` is false);
        - "error"   when the ID is empty, the lookup yields no paper or fails,
                    the download fails, or the conversion fails
                    (`md_file` is "").

    Failures of the lookup, download and conversion steps are reported through
    the returned OUTPUT rather than raised. All library calls are made
    synchronously; nothing is awaited inside this coroutine.
    """
    paper_id = (p.paper_id or "").strip()
    if not paper_id:
        return _make_output("error", "Paper not found: empty paper_id")

    folder = Path(c.storage_folder)
    folder.mkdir(parents=True, exist_ok=True)

    stem = _file_stem(paper_id)
    md_path = folder / f"{stem}.md"
    pdf_path = folder / f"{stem}.pdf"

    # If we already have the .md for that paper, skip all network work.
    if md_path.exists():
        return _make_output(
            "exists",
            f"Paper {p.paper_id} already downloaded/converted.",
            str(md_path),
        )

    # Look the paper up on arXiv.
    try:
        paper = next(arxiv.Client().results(arxiv.Search(id_list=[paper_id])))
    except StopIteration:
        # The result iterator was empty: no paper with that ID.
        return _make_output("error", f"Paper not found: {p.paper_id}")
    except Exception as e:
        # The lookup itself failed (presumably a network problem or an ID the
        # API rejects). The message keeps the "Paper not found" prefix so
        # callers matching on it still recognise the failure.
        return _make_output(
            "error", f"Paper not found: {p.paper_id} (lookup failed: {e})"
        )

    # Download the PDF unless a previous call already stored it.
    if not pdf_path.exists():
        try:
            paper.download_pdf(dirpath=str(folder), filename=pdf_path.name)
        except Exception as e:
            # Remove whatever was written so a later call does not mistake a
            # partial file for a finished download.
            if pdf_path.exists():
                pdf_path.unlink()
            return _make_output(
                "error", f"Error downloading paper {p.paper_id}: {e}"
            )

    # Optionally convert the PDF to Markdown using pymupdf4llm.
    if p.convert_to_md:
        try:
            markdown_text = pymupdf4llm.to_markdown(str(pdf_path), show_progress=False)
            md_path.write_text(markdown_text, encoding='utf-8')
            # The PDF is intentionally kept next to the Markdown file.
        except Exception as e:
            return _make_output("error", f"Conversion failed: {str(e)}")

    return _make_output(
        "success",
        f"Paper {p.paper_id} downloaded successfully.",
        str(md_path) if p.convert_to_md else "",
    )
Loading