
Commit f16b7e8

Project setup
1 parent 7a90c3a commit f16b7e8

11 files changed: +334 -0 lines changed

.env.example

+1
@@ -0,0 +1 @@
RUFUS_API_KEY=your-openai-api-key-here

.gitignore

+25
@@ -0,0 +1,25 @@
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
.env
venv/
.venv/
.idea/
.vscode/

examples/basic_usage.py

+21
@@ -0,0 +1,21 @@
from rufus import RufusClient

def main():
    # Initialize client
    client = RufusClient()

    # Example: Scraping HR information
    documents = client.scrape(
        "https://www.sfgov.com",  # Government site as mentioned in case study
        instructions="Find information about HR policies and employee benefits"
    )

    # Print results
    for doc in documents:
        print(f"\nTitle: {doc['title']}")
        print(f"Summary: {doc['summary']}")
        print(f"Topics: {', '.join(doc['metadata']['topics'])}")
        print(f"Relevance: {doc['metadata']['relevance_score']}")

if __name__ == "__main__":
    main()

requirements.txt

+6
@@ -0,0 +1,6 @@
setuptools>=68.0.0
aiohttp==3.9.1
beautifulsoup4==4.12.2
openai==1.3.5
python-dotenv==1.0.0
requests==2.31.0

rufus/__init__.py

+4
@@ -0,0 +1,4 @@
from .client import RufusClient

__version__ = "0.1.0"
__all__ = ["RufusClient"]

rufus/client.py

+24
@@ -0,0 +1,24 @@
import os
from typing import List, Dict
from dotenv import load_dotenv
from .scraper import WebScraper
from .processor import ContentProcessor

class RufusClient:
    def __init__(self, api_key: str = None):
        """Initialize Rufus client with API key."""
        load_dotenv()
        self.api_key = api_key or os.getenv('RUFUS_API_KEY')
        if not self.api_key:
            raise ValueError("API key is required. Set RUFUS_API_KEY environment variable")

        self.scraper = WebScraper()
        self.processor = ContentProcessor(api_key=self.api_key)

    def scrape(self, url: str, instructions: str = None) -> List[Dict]:
        """Scrape website content based on instructions."""
        if not url.startswith(('http://', 'https://')):
            raise ValueError("Invalid URL format")

        raw_content = self.scraper.crawl(url)
        return self.processor.process(raw_content, instructions)
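
A minimal usage sketch, assuming the package is installed; the key value is a placeholder and the URL and instructions are illustrative only:

from rufus import RufusClient

# Passing the key directly skips the RUFUS_API_KEY lookup via .env (placeholder key).
client = RufusClient(api_key="sk-your-key-here")
docs = client.scrape(
    "https://example.com",
    instructions="Find pricing and contact details",
)
print(f"Extracted {len(docs)} documents")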

rufus/processor.py

+85
@@ -0,0 +1,85 @@
from typing import List, Dict
from openai import OpenAI
import logging
import json
from .utils import chunk_text

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

class ContentProcessor:
    def __init__(self, api_key: str):
        self.client = OpenAI(api_key=api_key)

    def generate_extraction_prompt(self, content: str, instructions: str) -> str:
        return f"""
        Given the following web content and instructions, extract and structure the relevant information.
        Create a well-organized document that can be used for RAG applications.

        Instructions: {instructions}

        Content:
        {content}

        Return the content in the following JSON format:
        {{
            "title": "Brief title describing the content",
            "summary": "Brief summary of the key points",
            "content": "Main extracted content, relevant to the instructions",
            "metadata": {{
                "topics": ["relevant", "topics", "covered"],
                "relevance_score": 0-1 score indicating relevance to instructions
            }}
        }}
        """

    def process_chunk(self, chunk: str, instructions: str) -> Dict:
        """Process a single chunk of content using GPT."""
        try:
            response = self.client.chat.completions.create(
                model="gpt-4",  # Changed from gpt-4-turbo-preview
                messages=[{
                    "role": "system",
                    "content": "You are a content extraction AI that processes web content into structured documents for RAG systems."
                }, {
                    "role": "user",
                    "content": self.generate_extraction_prompt(chunk, instructions)
                }],
                temperature=0.3
            )

            try:
                content = response.choices[0].message.content
                return json.loads(content)
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse GPT response as JSON: {e}")
                return None

        except Exception as e:
            logger.error(f"Error processing chunk: {str(e)}")
            return None

    def process(self, pages: List[Dict], instructions: str = None) -> List[Dict]:
        processed_documents = []

        for page in pages:
            try:
                chunks = chunk_text(page['content'], max_length=4000)

                for chunk in chunks:
                    processed = self.process_chunk(chunk, instructions)
                    if processed:
                        if 'metadata' in processed:
                            processed['metadata']['source_url'] = page['url']
                        processed_documents.append(processed)
                    else:
                        logger.warning(f"Failed to process chunk from {page['url']}")

            except Exception as e:
                logger.error(f"Error processing page {page['url']}: {str(e)}")
                continue

        return processed_documents
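
A minimal sketch of driving ContentProcessor directly, assuming RUFUS_API_KEY is set; the page dict is hand-built to mirror the shape WebScraper.fetch_page returns:

import os
from rufus.processor import ContentProcessor

# Hand-built page dict with illustrative content; real pages come from WebScraper.crawl().
processor = ContentProcessor(api_key=os.getenv("RUFUS_API_KEY"))
pages = [{
    "url": "https://example.com/benefits",
    "content": "Employees receive 20 days of paid leave per year. Health coverage starts on day one.",
}]
docs = processor.process(pages, instructions="Summarize employee benefits")
# Each successful result should carry title, summary, content, and metadata keys.
print(docs)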

rufus/scraper.py

+93
@@ -0,0 +1,93 @@
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from typing import Set, Dict, List
import asyncio
import aiohttp
import logging

class WebScraper:
    def __init__(self):
        self.visited_urls: Set[str] = set()
        self.max_depth = 3
        self.max_pages = 100
        self.timeout = 30
        self.max_concurrent = 5
        self.headers = {
            'User-Agent': 'Rufus Bot 0.1 - AI-Powered Web Scraper for RAG Systems'
        }

    def should_crawl(self, url: str, base_domain: str) -> bool:
        if url in self.visited_urls:
            return False

        try:
            parsed = urlparse(url)
            if parsed.netloc != base_domain:
                return False

            skip_extensions = ('.pdf', '.jpg', '.png', '.gif', '.css', '.js')
            if any(url.lower().endswith(ext) for ext in skip_extensions):
                return False

            return True
        except Exception:
            return False

    async def fetch_page(self, url: str, session: aiohttp.ClientSession) -> Dict:
        try:
            async with session.get(url, headers=self.headers) as response:
                if response.status == 200:
                    html = await response.text()
                    soup = BeautifulSoup(html, 'html.parser')

                    for tag in soup(['script', 'style', 'meta', 'noscript']):
                        tag.decompose()

                    return {
                        'url': url,
                        'title': soup.title.string if soup.title else '',
                        'content': soup.get_text(separator=' ', strip=True),
                        'links': [
                            urljoin(url, link.get('href'))
                            for link in soup.find_all('a', href=True)
                        ]
                    }
                return None
        except Exception as e:
            logging.error(f"Error fetching {url}: {str(e)}")
            return None

    async def crawl_async(self, start_url: str) -> List[Dict]:
        base_domain = urlparse(start_url).netloc
        to_visit = {start_url}
        results = []

        connector = aiohttp.TCPConnector(limit=self.max_concurrent)
        timeout = aiohttp.ClientTimeout(total=self.timeout)

        async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
            while to_visit and len(self.visited_urls) < self.max_pages:
                current_batch = list(to_visit)[:self.max_concurrent]
                to_visit = set(list(to_visit)[self.max_concurrent:])

                tasks = [
                    self.fetch_page(url, session)
                    for url in current_batch
                    if self.should_crawl(url, base_domain)
                ]

                pages = await asyncio.gather(*tasks, return_exceptions=True)

                for page in pages:
                    if isinstance(page, dict):
                        self.visited_urls.add(page['url'])
                        results.append(page)
                        for link in page.get('links', []):
                            if self.should_crawl(link, base_domain):
                                to_visit.add(link)

        return results

    def crawl(self, url: str) -> List[Dict]:
        return asyncio.run(self.crawl_async(url))
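
The scraper can also be exercised on its own; a minimal sketch with a placeholder URL that prints the first few crawled pages:

from rufus.scraper import WebScraper

# Placeholder URL; crawl() blocks until the async crawl finishes.
scraper = WebScraper()
pages = scraper.crawl("https://example.com")
for page in pages[:3]:
    print(page['url'], '->', page['title'])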

rufus/utils.py

+25
@@ -0,0 +1,25 @@
from typing import List
import re

def chunk_text(text: str, max_length: int = 4000) -> List[str]:
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(sentence)

        if current_length + sentence_length > max_length:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_length = sentence_length
        else:
            current_chunk.append(sentence)
            current_length += sentence_length

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
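
A small sketch of the chunking behaviour; max_length=50 is deliberately tiny so the split is visible (the library default is 4000):

from rufus.utils import chunk_text

text = "First sentence here. Second sentence follows. Third one is a bit longer than the rest."
for i, chunk in enumerate(chunk_text(text, max_length=50)):
    print(i, len(chunk), repr(chunk))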

setup.py

+26
@@ -0,0 +1,26 @@
from setuptools import setup, find_packages

setup(
    name="rufus",
    version="0.1.0",
    packages=find_packages(),
    install_requires=[
        'aiohttp>=3.9.1',
        'beautifulsoup4>=4.12.2',
        'openai>=1.3.5',
        'python-dotenv>=1.0.0',
        'requests>=2.31.0',
    ],
    author="Your Name",
    author_email="your.email@example.com",
    description="AI-powered web scraper for RAG systems",
    long_description=open("README.md").read(),
    long_description_content_type="text/markdown",
    url="https://github.com/yourusername/rufus",
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.8",
)

tests/test_rufus.py

+24
@@ -0,0 +1,24 @@
import unittest
from rufus import RufusClient

class TestRufus(unittest.TestCase):
    def setUp(self):
        self.client = RufusClient()

    def test_invalid_url(self):
        with self.assertRaises(ValueError):
            self.client.scrape("invalid-url")

    def test_scraping(self):
        docs = self.client.scrape(
            "https://example.com",
            instructions="Find product information"
        )
        self.assertIsInstance(docs, list)
        if docs:
            self.assertIn('title', docs[0])
            self.assertIn('summary', docs[0])
            self.assertIn('metadata', docs[0])

if __name__ == '__main__':
    unittest.main()
