-from typing import List, Dict
-from openai import OpenAI
-import logging
 import json
-from .utils import chunk_text
+import logging
+import uuid
+from datetime import datetime
+from time import sleep
+from typing import Dict, List, Optional
+from tqdm import tqdm
+from openai import OpenAI
+import nltk
+from nltk.tokenize import sent_tokenize

-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-)
+logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

 class ContentProcessor:
     def __init__(self, api_key: str):
         self.client = OpenAI(api_key=api_key)
+        self.rate_limit_delay = 1
+        nltk.download('punkt', quiet=True)

-    def generate_extraction_prompt(self, content: str, instructions: str) -> str:
-        return f"""
-        Given the following web content and instructions, extract and structure the relevant information.
-        Create a well-organized document that can be used for RAG applications.
+    def preprocess_content(self, content: str, max_length: int = 4000) -> List[str]:
+        sentences = sent_tokenize(content)
+        chunks = []
+        current_chunk = []
+        current_length = 0

-        Instructions: {instructions}
+        for sentence in sentences:
+            if current_length + len(sentence) > max_length:
+                if current_chunk:
+                    chunks.append(' '.join(current_chunk))
+                current_chunk = [sentence]
+                current_length = len(sentence)
+            else:
+                current_chunk.append(sentence)
+                current_length += len(sentence)
+
+        if current_chunk:
+            chunks.append(' '.join(current_chunk))
+        return chunks

-        Content:
-        {content}
+    def generate_rag_prompt(self, content: str, instructions: str, metadata: Dict) -> str:
+        return f"""
+        Process this content for RAG system integration.
+        Instructions: {instructions}
+        Source URL: {metadata.get('url', 'Unknown')}
+        Content: {content}

-        Return the content in the following JSON format:
+        Return strictly valid JSON matching this structure:
         {{
-            "title": "Brief title describing the content",
-            "summary": "Brief summary of the key points",
-            "content": "Main extracted content, relevant to the instructions",
-            "metadata": {{
-                "topics": ["relevant", "topics", "covered"],
-                "relevance_score": 0-1 score indicating relevance to instructions
-            }}
+            "text": "Main content for embedding",
+            "title": "Descriptive section title",
+            "source_url": "Origin URL",
+            "chunk_type": "policy|procedure|faq|general",
+            "topics": ["topic1", "topic2"],
+            "context": "Additional retrieval context",
+            "relevance_score": 0.0 to 1.0
         }}
         """

-    def process_chunk(self, chunk: str, instructions: str) -> Dict:
-        # """Process a single chunk of content using GPT."""
+    def process_chunk(self, chunk: str, instructions: str, metadata: Dict) -> Optional[Dict]:
         try:
+            sleep(self.rate_limit_delay)
             response = self.client.chat.completions.create(
-                model="gpt-4",  # Changed from gpt-4-turbo-preview
+                model="gpt-3.5-turbo",
                 messages=[{
                     "role": "system",
-                    "content": "You are a content extraction AI that processes web content into structured documents for RAG systems."
+                    "content": "You are a RAG content processor. Return only valid JSON."
                 }, {
                     "role": "user",
-                    "content": self.generate_extraction_prompt(chunk, instructions)
+                    "content": self.generate_rag_prompt(chunk, instructions, metadata)
                 }],
-                temperature=0.3
+                temperature=0.3,
+                max_tokens=1000
             )

-            try:
-                content = response.choices[0].message.content
-                return json.loads(content)
-            except json.JSONDecodeError as e:
-                logger.error(f"Failed to parse GPT response as JSON: {e}")
+            content = response.choices[0].message.content.strip()
+            if content.startswith("```json"):
+                content = content[7:-3]
+
+            result = json.loads(content)
+
+            if result.get('relevance_score', 0) < 0.5:
                 return None

+            return {
+                "id": str(uuid.uuid4()),
+                "text": result["text"],
+                "metadata": {
+                    "title": result["title"],
+                    "source_url": result["source_url"],
+                    "chunk_type": result["chunk_type"],
+                    "timestamp": datetime.now().isoformat(),
+                    "topics": result["topics"],
+                    "context": result["context"],
+                    "relevance_score": result["relevance_score"]
+                }
+            }
+
         except Exception as e:
-            logger.error(f"Error processing chunk: {str(e)}")
+            logger.error(f"Processing error: {str(e)}")
             return None

     def process(self, pages: List[Dict], instructions: str = None) -> List[Dict]:
         processed_documents = []

-        for page in pages:
+        for page in tqdm(pages, desc="Processing pages"):
             try:
-                chunks = chunk_text(page['content'], max_length=4000)
+                chunks = self.preprocess_content(page['content'])
+                metadata = {
+                    "url": page['url'],
+                    "title": page['title'],
+                    "structured_data": page.get('structured_data', {})
+                }

                 for chunk in chunks:
-                    processed = self.process_chunk(chunk, instructions)
-                    if processed:
-                        if 'metadata' in processed:
-                            processed['metadata']['source_url'] = page['url']
-                        processed_documents.append(processed)
-                    else:
-                        logger.warning(f"Failed to process chunk from {page['url']}")
+                    doc = self.process_chunk(chunk, instructions, metadata)
+                    if doc:
+                        processed_documents.append(doc)

             except Exception as e:
                 logger.error(f"Error processing page {page['url']}: {str(e)}")
                 continue

         return processed_documents
+
+    def save_to_jsonl(self, documents: List[Dict], output_file: str):
+        """Save documents in JSONL format for RAG systems."""
+        with open(output_file, 'w', encoding='utf-8') as f:
+            for doc in documents:
+                f.write(json.dumps(doc) + '\n')
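For context, a minimal usage sketch of the class after this change. The module path, sample page, and output filename below are illustrative assumptions, not part of the diff; process() expects a list of dicts with 'url', 'title', and 'content' keys (plus optional 'structured_data'), and save_to_jsonl() writes one JSON document per line.

import os

from content_processor import ContentProcessor  # assumed module path for the class above

# Pages in the shape process() expects; the URL, title, and content are placeholders.
pages = [
    {
        "url": "https://example.com/returns-policy",
        "title": "Returns Policy",
        "content": "Items may be returned within 30 days of delivery. Refunds go to the original payment method.",
    },
]

processor = ContentProcessor(api_key=os.environ["OPENAI_API_KEY"])
documents = processor.process(pages, instructions="Extract customer-facing policy details.")
processor.save_to_jsonl(documents, "rag_documents.jsonl")
print(f"Wrote {len(documents)} document chunks")

Each line of the resulting JSONL file carries the generated id, the embedding text, and the per-chunk metadata built in process_chunk(), so it can be loaded directly by a downstream indexing step.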