
Commit 0fd39b7

Modified output to be RAG-friendly
1 parent e79ad01 commit 0fd39b7

File tree (3 files changed, +61 -45 lines)

examples/basic_usage.py
output/scraped_content.jsonl
rufus/scraper.py

examples/basic_usage.py (+34, -19)

@@ -1,25 +1,40 @@
 from rufus import RufusClient
-import json
+import logging
+import os
 
 def main():
-    client = RufusClient()
-    documents = client.scrape(
-        "https://www.sfgov.com",
-        instructions="Find information about HR policies and employee benefits"
-    )
+    logging.basicConfig(level=logging.INFO)
+    logger = logging.getLogger(__name__)
 
-    if documents:
-        print("\nExtracted Documents:")
-        for doc in documents:
-            print(f"\nTitle: {doc['metadata']['title']}")
-            print(f"Text: {doc['text'][:200]}...")
-            print(f"Topics: {', '.join(doc['metadata']['topics'])}")
-            print(f"Type: {doc['metadata']['chunk_type']}")
-            print(f"Source: {doc['metadata']['source_url']}")
-            print(f"Relevance: {doc['metadata']['relevance_score']}")
-            print("-" * 50)
-    else:
-        print("No documents were extracted")
+    try:
+        output_dir = "output"
+        os.makedirs(output_dir, exist_ok=True)
+        output_file = os.path.join(output_dir, "scraped_content.jsonl")
+
+        client = RufusClient()
+        documents = client.scrape(
+            "https://www.example.com",
+            instructions="Extract main content and information"
+        )
+
+        if documents:
+            # Save using built-in method
+            client.processor.save_to_jsonl(documents, output_file)
+            print(f"\nDocuments saved to: {output_file}")
+
+            print("\nExtracted Documents Preview:")
+            for doc in documents:
+                print(f"\nTitle: {doc['metadata']['title']}")
+                print(f"Text: {doc['text'][:200]}...")
+                print(f"Topics: {', '.join(doc['metadata']['topics'])}")
+                print(f"Type: {doc['metadata']['chunk_type']}")
+                print(f"Relevance: {doc['metadata']['relevance_score']}")
+                print("-" * 50)
+        else:
+            print("No documents were extracted")
+
+    except Exception as e:
+        logger.error(f"Error during scraping: {str(e)}")
 
 if __name__ == "__main__":
-    main()
+    main()
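
Note: the updated example calls client.processor.save_to_jsonl, whose implementation is not part of this commit. A minimal sketch of what such a JSONL writer might do, assuming each document is a plain dict shaped like the record in output/scraped_content.jsonl below:

import json

def save_to_jsonl(documents, output_file):
    # Write one JSON object per line so downstream RAG loaders can
    # stream records without parsing the whole file at once.
    with open(output_file, "w", encoding="utf-8") as f:
        for doc in documents:
            f.write(json.dumps(doc, ensure_ascii=False) + "\n")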

output/scraped_content.jsonl (+1)

@@ -0,0 +1 @@
+{"id": "da33722e-2646-470c-9fc3-f64c534ac6d2", "text": "Example Domain Example Domain This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission. More information...", "metadata": {"title": "Main content", "source_url": "https://www.example.com", "chunk_type": "general", "timestamp": "2024-11-15T12:46:41.907621", "topics": ["example", "domain", "illustrative", "literature"], "context": "Content extracted from the Example Domain website.", "relevance_score": 0.8}}
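
Because the file stores one JSON object per line, a downstream RAG pipeline can stream it record by record. A hypothetical reader, not part of this commit, just to illustrate the format:

import json

def load_jsonl(path):
    # Yield one document dict per line, skipping blank lines.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)

for doc in load_jsonl("output/scraped_content.jsonl"):
    print(doc["metadata"]["title"], doc["metadata"]["relevance_score"])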

rufus/scraper.py (+26, -26)

@@ -33,32 +33,32 @@ async def fetch_with_playwright(self, url: str):
         return None
 
     async def fetch_page(self, url: str, session: aiohttp.ClientSession) -> Dict:
-        try:
-            async with session.get(url, headers=self.headers) as response:
-                if response.status == 200:
-                    html = await response.text()
-
-                    if 'loading' in html.lower() or 'spinner' in html.lower():
-                        html = await self.fetch_with_playwright(url) or html
-
-                    soup = BeautifulSoup(html, 'html.parser')
-
-                    for tag in soup(['script', 'style', 'meta']):
-                        tag.decompose()
-
-                    return {
-                        'url': url,
-                        'title': soup.title.string if soup.title else '',
-                        'content': soup.get_text(separator=' ', strip=True),
-                        'links': [
-                            urljoin(url, link.get('href'))
-                            for link in soup.find_all('a', href=True)
-                        ]
-                    }
-                return None
-        except Exception as e:
-            logging.error(f"Error fetching {url}: {str(e)}")
-            return None
+        try:
+            async with session.get(url, headers=self.headers, timeout=self.timeout) as response:
+                if response.status == 200:
+                    html = await response.text()
+                    soup = BeautifulSoup(html, 'html.parser')
+
+                    # Basic content extraction
+                    text_content = soup.get_text(separator=' ', strip=True)
+                    if not text_content:
+                        return None
+
+                    return {
+                        'url': url,
+                        'title': soup.title.string if soup.title else '',
+                        'content': text_content,
+                        'links': [
+                            urljoin(url, link.get('href'))
+                            for link in soup.find_all('a', href=True)
+                        ]
+                    }
+                else:
+                    logging.error(f"HTTP {response.status} for {url}")
+                    return None
+        except Exception as e:
+            logging.error(f"Error fetching {url}: {str(e)}")
+            return None
 
     def should_crawl(self, url: str, base_domain: str) -> bool:
         if url in self.visited_urls:
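
The revised fetch_page expects an aiohttp.ClientSession from the caller and relies on the scraper's self.headers and self.timeout attributes. A standalone sketch of the same fetch-and-extract pattern; the URL, headers, and timeout values here are placeholders, not taken from the commit:

import asyncio
import logging
from urllib.parse import urljoin

import aiohttp
from bs4 import BeautifulSoup

async def fetch_page(url, session, headers=None):
    # Same pattern as the revised method: GET the page, parse it with
    # BeautifulSoup, and return plain text plus absolute links.
    timeout = aiohttp.ClientTimeout(total=10)  # placeholder timeout
    try:
        async with session.get(url, headers=headers, timeout=timeout) as response:
            if response.status != 200:
                logging.error(f"HTTP {response.status} for {url}")
                return None
            html = await response.text()
            soup = BeautifulSoup(html, 'html.parser')
            text_content = soup.get_text(separator=' ', strip=True)
            if not text_content:
                return None
            return {
                'url': url,
                'title': soup.title.string if soup.title else '',
                'content': text_content,
                'links': [
                    urljoin(url, link.get('href'))
                    for link in soup.find_all('a', href=True)
                ],
            }
    except Exception as e:
        logging.error(f"Error fetching {url}: {str(e)}")
        return None

async def main():
    async with aiohttp.ClientSession() as session:
        page = await fetch_page("https://www.example.com", session)
        if page:
            print(page['title'], len(page['links']))

if __name__ == "__main__":
    asyncio.run(main())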
