Commit e79ad01: "Corrected code"
1 parent f16b7e8
6 files changed, 212 additions and 147 deletions

examples/basic_usage.py (+16 -12)

@@ -1,21 +1,25 @@
 from rufus import RufusClient
+import json
 
 def main():
-    # Initialize client
     client = RufusClient()
-
-    # Example: Scraping HR information
     documents = client.scrape(
-        "https://www.sfgov.com",  # Government site as mentioned in case study
-        instructions="Find information about HR policies and employee benefits"
-    )
+        "https://www.sfgov.com",
+        instructions="Find information about HR policies and employee benefits"
+    )
 
-    # Print results
-    for doc in documents:
-        print(f"\nTitle: {doc['title']}")
-        print(f"Summary: {doc['summary']}")
-        print(f"Topics: {', '.join(doc['metadata']['topics'])}")
-        print(f"Relevance: {doc['metadata']['relevance_score']}")
+    if documents:
+        print("\nExtracted Documents:")
+        for doc in documents:
+            print(f"\nTitle: {doc['metadata']['title']}")
+            print(f"Text: {doc['text'][:200]}...")
+            print(f"Topics: {', '.join(doc['metadata']['topics'])}")
+            print(f"Type: {doc['metadata']['chunk_type']}")
+            print(f"Source: {doc['metadata']['source_url']}")
+            print(f"Relevance: {doc['metadata']['relevance_score']}")
+            print("-" * 50)
+    else:
+        print("No documents were extracted")
 
 if __name__ == "__main__":
     main()
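The updated example imports `json` but never calls it. If the goal is to reuse the scraped output, a minimal follow-on sketch (a hypothetical helper, not part of this commit) could persist the documents list returned by `client.scrape()`:

```python
import json
from typing import Dict, List

def save_documents(documents: List[Dict], path: str = "rufus_output.json") -> None:
    """Hypothetical helper: write the scraped documents to disk for later indexing."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(documents, f, indent=2)

# e.g. at the end of main(), after the print loop:
# save_documents(documents)
```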

requirements.txt (+5 -1)

@@ -1,6 +1,10 @@
-setuptools>=68.0.0
 aiohttp==3.9.1
 beautifulsoup4==4.12.2
 openai==1.3.5
 python-dotenv==1.0.0
 requests==2.31.0
+playwright==1.40.0
+nltk==3.8.1
+uuid==1.30
+python-dateutil==2.8.2
+tqdm==4.66.1

rufus/client.py (-4)

@@ -6,19 +6,15 @@
 
 class RufusClient:
     def __init__(self, api_key: str = None):
-        """Initialize Rufus client with API key."""
         load_dotenv()
         self.api_key = api_key or os.getenv('RUFUS_API_KEY')
         if not self.api_key:
            raise ValueError("API key is required. Set RUFUS_API_KEY environment variable")
-
         self.scraper = WebScraper()
         self.processor = ContentProcessor(api_key=self.api_key)
 
     def scrape(self, url: str, instructions: str = None) -> List[Dict]:
-        """Scrape website content based on instructions."""
         if not url.startswith(('http://', 'https://')):
             raise ValueError("Invalid URL format")
-
         raw_content = self.scraper.crawl(url)
         return self.processor.process(raw_content, instructions)
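With the docstrings removed here, the key-lookup behaviour is worth spelling out: when no `api_key` argument is given, `load_dotenv()` runs and the constructor falls back to the `RUFUS_API_KEY` environment variable, raising `ValueError` if neither is set. A minimal sketch, assuming the key is exported or stored in a `.env` file:

```python
import os
from rufus import RufusClient

# Option 1: rely on RUFUS_API_KEY from the environment (or from a .env file
# that load_dotenv() picks up inside __init__).
client = RufusClient()

# Option 2: pass the key explicitly.
client = RufusClient(api_key=os.environ["RUFUS_API_KEY"])
```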

rufus/processor.py (+89 -44)

@@ -1,85 +1,130 @@
-from typing import List, Dict
-from openai import OpenAI
-import logging
 import json
-from .utils import chunk_text
+import logging
+import uuid
+from datetime import datetime
+from time import sleep
+from typing import Dict, List, Optional
+from tqdm import tqdm
+from openai import OpenAI
+import nltk
+from nltk.tokenize import sent_tokenize
 
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-)
+logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 class ContentProcessor:
     def __init__(self, api_key: str):
         self.client = OpenAI(api_key=api_key)
+        self.rate_limit_delay = 1
+        nltk.download('punkt', quiet=True)
 
-    def generate_extraction_prompt(self, content: str, instructions: str) -> str:
-        return f"""
-        Given the following web content and instructions, extract and structure the relevant information.
-        Create a well-organized document that can be used for RAG applications.
+    def preprocess_content(self, content: str, max_length: int = 4000) -> List[str]:
+        sentences = sent_tokenize(content)
+        chunks = []
+        current_chunk = []
+        current_length = 0
 
-        Instructions: {instructions}
+        for sentence in sentences:
+            if current_length + len(sentence) > max_length:
+                if current_chunk:
+                    chunks.append(' '.join(current_chunk))
+                current_chunk = [sentence]
+                current_length = len(sentence)
+            else:
+                current_chunk.append(sentence)
+                current_length += len(sentence)
+
+        if current_chunk:
+            chunks.append(' '.join(current_chunk))
+        return chunks
 
-        Content:
-        {content}
+    def generate_rag_prompt(self, content: str, instructions: str, metadata: Dict) -> str:
+        return f"""
+        Process this content for RAG system integration.
+        Instructions: {instructions}
+        Source URL: {metadata.get('url', 'Unknown')}
+        Content: {content}
 
-        Return the content in the following JSON format:
+        Return strictly valid JSON matching this structure:
         {{
-            "title": "Brief title describing the content",
-            "summary": "Brief summary of the key points",
-            "content": "Main extracted content, relevant to the instructions",
-            "metadata": {{
-                "topics": ["relevant", "topics", "covered"],
-                "relevance_score": 0-1 score indicating relevance to instructions
-            }}
+            "text": "Main content for embedding",
+            "title": "Descriptive section title",
+            "source_url": "Origin URL",
+            "chunk_type": "policy|procedure|faq|general",
+            "topics": ["topic1", "topic2"],
+            "context": "Additional retrieval context",
+            "relevance_score": 0.0 to 1.0
         }}
         """
 
-    def process_chunk(self, chunk: str, instructions: str) -> Dict:
-        # """Process a single chunk of content using GPT."""
+    def process_chunk(self, chunk: str, instructions: str, metadata: Dict) -> Optional[Dict]:
         try:
+            sleep(self.rate_limit_delay)
             response = self.client.chat.completions.create(
-                model="gpt-4",  # Changed from gpt-4-turbo-preview
+                model="gpt-3.5-turbo",
                 messages=[{
                     "role": "system",
-                    "content": "You are a content extraction AI that processes web content into structured documents for RAG systems."
+                    "content": "You are a RAG content processor. Return only valid JSON."
                 }, {
                     "role": "user",
-                    "content": self.generate_extraction_prompt(chunk, instructions)
+                    "content": self.generate_rag_prompt(chunk, instructions, metadata)
                 }],
-                temperature=0.3
+                temperature=0.3,
+                max_tokens=1000
             )
 
-            try:
-                content = response.choices[0].message.content
-                return json.loads(content)
-            except json.JSONDecodeError as e:
-                logger.error(f"Failed to parse GPT response as JSON: {e}")
+            content = response.choices[0].message.content.strip()
+            if content.startswith("```json"):
+                content = content[7:-3]
+
+            result = json.loads(content)
+
+            if result.get('relevance_score', 0) < 0.5:
                 return None
 
+            return {
+                "id": str(uuid.uuid4()),
+                "text": result["text"],
+                "metadata": {
+                    "title": result["title"],
+                    "source_url": result["source_url"],
+                    "chunk_type": result["chunk_type"],
+                    "timestamp": datetime.now().isoformat(),
+                    "topics": result["topics"],
+                    "context": result["context"],
+                    "relevance_score": result["relevance_score"]
+                }
+            }
+
         except Exception as e:
-            logger.error(f"Error processing chunk: {str(e)}")
+            logger.error(f"Processing error: {str(e)}")
             return None
 
     def process(self, pages: List[Dict], instructions: str = None) -> List[Dict]:
         processed_documents = []
 
-        for page in pages:
+        for page in tqdm(pages, desc="Processing pages"):
             try:
-                chunks = chunk_text(page['content'], max_length=4000)
+                chunks = self.preprocess_content(page['content'])
+                metadata = {
+                    "url": page['url'],
+                    "title": page['title'],
+                    "structured_data": page.get('structured_data', {})
+                }
 
                 for chunk in chunks:
-                    processed = self.process_chunk(chunk, instructions)
-                    if processed:
-                        if 'metadata' in processed:
-                            processed['metadata']['source_url'] = page['url']
-                        processed_documents.append(processed)
-                    else:
-                        logger.warning(f"Failed to process chunk from {page['url']}")
+                    doc = self.process_chunk(chunk, instructions, metadata)
+                    if doc:
+                        processed_documents.append(doc)
 
             except Exception as e:
                 logger.error(f"Error processing page {page['url']}: {str(e)}")
                 continue
 
         return processed_documents
+
+    def save_to_jsonl(self, documents: List[Dict], output_file: str):
+        """Save documents in JSONL format for RAG systems."""
+        with open(output_file, 'w', encoding='utf-8') as f:
+            for doc in documents:
+                f.write(json.dumps(doc) + '\n')
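The new `save_to_jsonl` method writes one JSON document per line. A sketch of the consuming side (the `load_jsonl` helper and file name are illustrative, not part of the commit) shows how a downstream RAG indexer might read the records back:

```python
import json
from typing import Dict, List

def load_jsonl(path: str) -> List[Dict]:
    """Illustrative reader for the JSONL written by ContentProcessor.save_to_jsonl."""
    docs = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                docs.append(json.loads(line))
    return docs

# Each record mirrors the structure built in process_chunk:
# {"id": ..., "text": ..., "metadata": {"title": ..., "source_url": ..., ...}}
docs = load_jsonl("rufus_documents.jsonl")
texts = [d["text"] for d in docs]          # strings to embed
metadatas = [d["metadata"] for d in docs]  # metadata for retrieval filtering
```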
