feat: add arxiv-search tool #143

Closed
wants to merge 1 commit
Binary file added tools/arxiv-search/banner.png
Binary file added tools/arxiv-search/icon.png
48 changes: 48 additions & 0 deletions tools/arxiv-search/index.test.ts
@@ -0,0 +1,48 @@
import { expect } from '@jest/globals';
import { getToolTestClient } from '../../src/test/utils';
import * as path from 'path';

describe('Arxiv Search Tool', () => {
  const toolPath = path.join(__dirname, 'tool.py');
  const client = getToolTestClient();

  it('searches for basic query successfully', async () => {
    const response = await client.executeToolFromFile(toolPath, {
      query: 'deep learning'
    });
    // Log the response at several levels of nesting for easier debugging.
    console.log('Full response:', JSON.stringify(response, null, 2));
    console.log('\nPapers array:', JSON.stringify(response.papers, null, 2));
    if (response.papers && response.papers.length > 0) {
      console.log('\nFirst paper details:', JSON.stringify(response.papers[0], null, 2));
    }

    expect(Array.isArray(response.papers)).toBe(true);
    expect(response).toHaveProperty('total_results');
    expect(response.total_results).toBeGreaterThanOrEqual(0);
    if (response.total_results > 0) {
      const paper = response.papers[0];
      expect(paper).toHaveProperty('title');
      expect(paper).toHaveProperty('pdf_url');
    }
  }, 30000);

  it('applies a maximum results limit', async () => {
    const response = await client.executeToolFromFile(toolPath, {
      query: 'blockchain',
      max_results: 3
    });
    expect(Array.isArray(response.papers)).toBe(true);
    expect(response.papers.length).toBeLessThanOrEqual(3);
  }, 30000);

  it('handles date range filters (optional)', async () => {
    const response = await client.executeToolFromFile(toolPath, {
      query: 'quantum computing',
      date_from: '2023-01-01',
      date_to: '2023-12-31',
      max_results: 5
    });
    expect(response.papers.length).toBeLessThanOrEqual(5);
  }, 30000);
});
76 changes: 76 additions & 0 deletions tools/arxiv-search/metadata.json
@@ -0,0 +1,76 @@
{
  "id": "arxiv-search",
  "name": "arxiv-search",
  "version": "1.0.0",
  "description": "Search for papers on arXiv with optional date range and category filters",
  "author": "Shinkai",
  "keywords": [
    "arxiv",
    "search",
    "papers",
    "research",
    "academic",
    "scientific"
  ],
  "configurations": {
    "type": "object",
    "properties": {},
    "required": []
  },
  "parameters": {
    "type": "object",
    "properties": {
      "query": {
        "type": "string",
        "description": "Search query string"
      },
      "max_results": {
        "type": "number",
        "description": "Maximum number of results to return",
        "default": 10
      },
      "date_from": {
        "type": "string",
        "description": "Earliest publication date, as a parseable date string (optional)"
      },
      "date_to": {
        "type": "string",
        "description": "Latest publication date, as a parseable date string (optional)"
      },
      "categories": {
        "type": "array",
        "description": "List of category filters (e.g. [\"cs.LG\"])",
        "items": { "type": "string" }
      }
    },
    "required": ["query"]
  },
  "result": {
    "type": "object",
    "properties": {
      "papers": {
        "type": "array",
        "items": {
          "type": "object",
          "properties": {
            "id": { "type": "string" },
            "title": { "type": "string" },
            "authors": {
              "type": "array",
              "items": { "type": "string" }
            },
            "abstract": { "type": "string" },
            "published": { "type": "string" },
            "categories": {
              "type": "array",
              "items": { "type": "string" }
            },
            "pdf_url": { "type": "string" }
          }
        }
      },
      "total_results": { "type": "number" }
    },
    "required": ["papers", "total_results"]
  }
}
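For reference, a parameters object satisfying this schema might look like the sketch below (values are illustrative; only "query" is required):

params = {
    "query": "quantum computing",
    "max_results": 5,
    "date_from": "2023-01-01",
    "date_to": "2023-12-31",
    "categories": ["quant-ph", "cs.ET"]
}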
3 changes: 3 additions & 0 deletions tools/arxiv-search/store.json
@@ -0,0 +1,3 @@
{
  "categoryId": "16fa3d8c-305c-410b-a7c9-6708f04c9976"
}
103 changes: 103 additions & 0 deletions tools/arxiv-search/tool.py
@@ -0,0 +1,103 @@
# /// script
# dependencies = [
#   "requests",
#   "arxiv>=1.4.7",
#   "python-dateutil"
# ]
# ///

import arxiv
from datetime import timezone
from typing import List, Dict, Any
from dateutil import parser

class CONFIG:
    # This tool needs no configuration fields; the class is kept as a
    # placeholder in case environment variables are added later.
    pass

class INPUTS:
    query: str                   # The search query string
    max_results: int = 10
    date_from: str = ""
    date_to: str = ""
    categories: List[str] = []   # A list of arXiv category strings

class OUTPUT:
    papers: List[Dict[str, Any]]
    total_results: int

async def run(c: CONFIG, p: INPUTS) -> OUTPUT:
    """
    Search for papers on arXiv with optional date range and category filters.
    """
    # For safety, clamp max_results to the range 1..50.
    max_results = max(1, min(p.max_results, 50))

    # If categories were provided, combine them into a single query, e.g.
    # query "transformers" with categories ["cs.LG", "cs.CL"] becomes
    # "(transformers) AND (cat:cs.LG OR cat:cs.CL)".
    search_query = p.query.strip()
    if p.categories:
        cat_filter = " OR ".join(f"cat:{cat.strip()}" for cat in p.categories)
        search_query = f"({search_query}) AND ({cat_filter})"

    search = arxiv.Search(
        query=search_query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate
    )

    # Attempt to parse the date range if provided; invalid date strings
    # are ignored rather than treated as fatal.
    date_from = None
    date_to = None

    if p.date_from:
        try:
            date_from = parser.parse(p.date_from)
        except Exception:
            pass

    if p.date_to:
        try:
            date_to = parser.parse(p.date_to)
        except Exception:
            pass

    # arXiv publication timestamps are timezone-aware (UTC); make naive
    # parsed bounds aware as well so the comparisons below don't raise a
    # TypeError when mixing naive and aware datetimes.
    if date_from and date_from.tzinfo is None:
        date_from = date_from.replace(tzinfo=timezone.utc)
    if date_to and date_to.tzinfo is None:
        date_to = date_to.replace(tzinfo=timezone.utc)

    papers = []
    client = arxiv.Client()
    count = 0

    def is_within(date, start, end):
        if not date:
            return True
        if start and date < start:
            return False
        if end and date > end:
            return False
        return True

    for result in client.results(search):
        if is_within(result.published, date_from, date_to):
            short_id = result.get_short_id()
            # Convert authors and categories to plain lists of strings
            # before adding them to the dictionary.
            authors_list = [str(a.name) for a in result.authors]
            categories_list = list(result.categories)

            papers.append({
                "id": short_id,
                "title": result.title,
                "authors": authors_list,
                "abstract": result.summary,
                "published": result.published.isoformat(),
                "categories": categories_list,
                "pdf_url": result.pdf_url
            })
            count += 1
            if count >= max_results:
                break

    out = OUTPUT()
    out.papers = papers
    out.total_results = len(papers)
    return out
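For local experimentation outside the Shinkai runtime (which normally constructs CONFIG and INPUTS from metadata.json), the run coroutine can be driven by hand. A minimal sketch, assuming the script's dependencies are installed and the code above is in scope; the query and limit are illustrative:

import asyncio

# Hypothetical local driver, not part of the tool itself: fill INPUTS by
# hand and execute the coroutine with asyncio.
inputs = INPUTS()
inputs.query = "deep learning"
inputs.max_results = 3

output = asyncio.run(run(CONFIG(), inputs))
print(output.total_results)
for paper in output.papers:
    print(paper["title"], "->", paper["pdf_url"])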