
Commit 0fd39b7

Modified output to be RAG-friendly
1 parent e79ad01 commit 0fd39b7

File tree (3 files changed, +61 -45 lines)

examples/basic_usage.py
output/scraped_content.jsonl
rufus/scraper.py

examples/basic_usage.py (+34, -19)

@@ -1,25 +1,40 @@
 from rufus import RufusClient
-import json
+import logging
+import os
 
 def main():
-    client = RufusClient()
-    documents = client.scrape(
-        "https://www.sfgov.com",
-        instructions="Find information about HR policies and employee benefits"
-    )
+    logging.basicConfig(level=logging.INFO)
+    logger = logging.getLogger(__name__)
 
-    if documents:
-        print("\nExtracted Documents:")
-        for doc in documents:
-            print(f"\nTitle: {doc['metadata']['title']}")
-            print(f"Text: {doc['text'][:200]}...")
-            print(f"Topics: {', '.join(doc['metadata']['topics'])}")
-            print(f"Type: {doc['metadata']['chunk_type']}")
-            print(f"Source: {doc['metadata']['source_url']}")
-            print(f"Relevance: {doc['metadata']['relevance_score']}")
-            print("-" * 50)
-    else:
-        print("No documents were extracted")
+    try:
+        output_dir = "output"
+        os.makedirs(output_dir, exist_ok=True)
+        output_file = os.path.join(output_dir, "scraped_content.jsonl")
+
+        client = RufusClient()
+        documents = client.scrape(
+            "https://www.example.com",
+            instructions="Extract main content and information"
+        )
+
+        if documents:
+            # Save using built-in method
+            client.processor.save_to_jsonl(documents, output_file)
+            print(f"\nDocuments saved to: {output_file}")
+
+            print("\nExtracted Documents Preview:")
+            for doc in documents:
+                print(f"\nTitle: {doc['metadata']['title']}")
+                print(f"Text: {doc['text'][:200]}...")
+                print(f"Topics: {', '.join(doc['metadata']['topics'])}")
+                print(f"Type: {doc['metadata']['chunk_type']}")
+                print(f"Relevance: {doc['metadata']['relevance_score']}")
+                print("-" * 50)
+        else:
+            print("No documents were extracted")
+
+    except Exception as e:
+        logger.error(f"Error during scraping: {str(e)}")
 
 if __name__ == "__main__":
-    main()
+    main()
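
Note: the updated example calls client.processor.save_to_jsonl, whose implementation is not part of this commit. A minimal sketch of what such a JSONL writer might do, assuming each document is a plain dict shaped like the record in output/scraped_content.jsonl below:

import json

def save_to_jsonl(documents, output_file):
    # Write one JSON object per line so downstream RAG loaders can
    # stream records without parsing the whole file at once.
    with open(output_file, "w", encoding="utf-8") as f:
        for doc in documents:
            f.write(json.dumps(doc, ensure_ascii=False) + "\n")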

output/scraped_content.jsonl (+1)

@@ -0,0 +1 @@
+{"id": "da33722e-2646-470c-9fc3-f64c534ac6d2", "text": "Example Domain Example Domain This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission. More information...", "metadata": {"title": "Main content", "source_url": "https://www.example.com", "chunk_type": "general", "timestamp": "2024-11-15T12:46:41.907621", "topics": ["example", "domain", "illustrative", "literature"], "context": "Content extracted from the Example Domain website.", "relevance_score": 0.8}}
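
Because the file stores one JSON object per line, a downstream RAG pipeline can stream it record by record. A hypothetical reader, not part of this commit, just to illustrate the format:

import json

def load_jsonl(path):
    # Yield one document dict per line, skipping blank lines.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)

for doc in load_jsonl("output/scraped_content.jsonl"):
    print(doc["metadata"]["title"], doc["metadata"]["relevance_score"])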

rufus/scraper.py (+26, -26)

@@ -33,32 +33,32 @@ async def fetch_with_playwright(self, url: str):
         return None
 
     async def fetch_page(self, url: str, session: aiohttp.ClientSession) -> Dict:
-        try:
-            async with session.get(url, headers=self.headers) as response:
-                if response.status == 200:
-                    html = await response.text()
-
-                    if 'loading' in html.lower() or 'spinner' in html.lower():
-                        html = await self.fetch_with_playwright(url) or html
-
-                    soup = BeautifulSoup(html, 'html.parser')
-
-                    for tag in soup(['script', 'style', 'meta']):
-                        tag.decompose()
-
-                    return {
-                        'url': url,
-                        'title': soup.title.string if soup.title else '',
-                        'content': soup.get_text(separator=' ', strip=True),
-                        'links': [
-                            urljoin(url, link.get('href'))
-                            for link in soup.find_all('a', href=True)
-                        ]
-                    }
-                return None
-        except Exception as e:
-            logging.error(f"Error fetching {url}: {str(e)}")
-            return None
+        try:
+            async with session.get(url, headers=self.headers, timeout=self.timeout) as response:
+                if response.status == 200:
+                    html = await response.text()
+                    soup = BeautifulSoup(html, 'html.parser')
+
+                    # Basic content extraction
+                    text_content = soup.get_text(separator=' ', strip=True)
+                    if not text_content:
+                        return None
+
+                    return {
+                        'url': url,
+                        'title': soup.title.string if soup.title else '',
+                        'content': text_content,
+                        'links': [
+                            urljoin(url, link.get('href'))
+                            for link in soup.find_all('a', href=True)
+                        ]
+                    }
+                else:
+                    logging.error(f"HTTP {response.status} for {url}")
+                    return None
+        except Exception as e:
+            logging.error(f"Error fetching {url}: {str(e)}")
+            return None
 
     def should_crawl(self, url: str, base_domain: str) -> bool:
         if url in self.visited_urls:
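
The revised fetch_page expects an aiohttp.ClientSession from the caller and relies on the scraper's self.headers and self.timeout attributes. A standalone sketch of the same fetch-and-extract pattern; the URL, headers, and timeout values here are placeholders, not taken from the commit:

import asyncio
import logging
from urllib.parse import urljoin

import aiohttp
from bs4 import BeautifulSoup

async def fetch_page(url, session, headers=None):
    # Same pattern as the revised method: GET the page, parse it with
    # BeautifulSoup, and return plain text plus absolute links.
    timeout = aiohttp.ClientTimeout(total=10)  # placeholder timeout
    try:
        async with session.get(url, headers=headers, timeout=timeout) as response:
            if response.status != 200:
                logging.error(f"HTTP {response.status} for {url}")
                return None
            html = await response.text()
            soup = BeautifulSoup(html, 'html.parser')
            text_content = soup.get_text(separator=' ', strip=True)
            if not text_content:
                return None
            return {
                'url': url,
                'title': soup.title.string if soup.title else '',
                'content': text_content,
                'links': [
                    urljoin(url, link.get('href'))
                    for link in soup.find_all('a', href=True)
                ],
            }
    except Exception as e:
        logging.error(f"Error fetching {url}: {str(e)}")
        return None

async def main():
    async with aiohttp.ClientSession() as session:
        page = await fetch_page("https://www.example.com", session)
        if page:
            print(page['title'], len(page['links']))

if __name__ == "__main__":
    asyncio.run(main())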
