import asyncio
import logging
from typing import Dict, List, Optional, Set
from urllib.parse import urljoin, urlparse

import aiohttp
from bs4 import BeautifulSoup


class WebScraper:
    def __init__(self):
        self.visited_urls: Set[str] = set()
        self.max_depth = 3        # currently unused; reserved for depth-limited crawling
        self.max_pages = 100      # stop after this many pages have been crawled
        self.timeout = 30         # total request timeout, in seconds
        self.max_concurrent = 5   # cap on simultaneous requests
        self.headers = {
            'User-Agent': 'Rufus Bot 0.1 - AI-Powered Web Scraper for RAG Systems'
        }

    def should_crawl(self, url: str, base_domain: str) -> bool:
        """Return True if the URL is unvisited, on the same domain, and not a static asset."""
        if url in self.visited_urls:
            return False

        try:
            parsed = urlparse(url)
            if parsed.netloc != base_domain:
                return False

            # Skip binary and static assets that carry no useful text content.
            skip_extensions = ('.pdf', '.jpg', '.png', '.gif', '.css', '.js')
            if any(url.lower().endswith(ext) for ext in skip_extensions):
                return False

            return True
        except Exception:
            return False

    async def fetch_page(self, url: str, session: aiohttp.ClientSession) -> Optional[Dict]:
        """Fetch a single page and return its title, visible text, and outbound links."""
        try:
            async with session.get(url, headers=self.headers) as response:
                if response.status != 200:
                    return None

                html = await response.text()
                soup = BeautifulSoup(html, 'html.parser')

                # Remove tags that contribute no visible text.
                for tag in soup(['script', 'style', 'meta', 'noscript']):
                    tag.decompose()

                return {
                    'url': url,
                    'title': soup.title.string.strip() if soup.title and soup.title.string else '',
                    'content': soup.get_text(separator=' ', strip=True),
                    'links': [
                        urljoin(url, link['href'])
                        for link in soup.find_all('a', href=True)
                    ],
                }
        except Exception as e:
            logging.error(f"Error fetching {url}: {e}")
            return None

    async def crawl_async(self, start_url: str) -> List[Dict]:
        """Breadth-first crawl of start_url's domain, bounded by max_pages."""
        base_domain = urlparse(start_url).netloc
        to_visit = {start_url}
        results = []

        connector = aiohttp.TCPConnector(limit=self.max_concurrent)
        timeout = aiohttp.ClientTimeout(total=self.timeout)

        async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
            while to_visit and len(self.visited_urls) < self.max_pages:
                # Take the next batch from the frontier; the rest waits for later rounds.
                queue = list(to_visit)
                current_batch = [
                    url for url in queue[:self.max_concurrent]
                    if self.should_crawl(url, base_domain)
                ]
                to_visit = set(queue[self.max_concurrent:])

                # Mark the batch as visited up front so failing URLs are not re-queued indefinitely.
                self.visited_urls.update(current_batch)

                tasks = [self.fetch_page(url, session) for url in current_batch]
                pages = await asyncio.gather(*tasks, return_exceptions=True)

                for page in pages:
                    if isinstance(page, dict):
                        results.append(page)
                        for link in page.get('links', []):
                            if self.should_crawl(link, base_domain):
                                to_visit.add(link)

        return results

    def crawl(self, url: str) -> List[Dict]:
        """Synchronous entry point around the async crawler."""
        return asyncio.run(self.crawl_async(url))
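
# Example usage: an illustrative sketch only. The start URL and logging setup
# below are placeholder assumptions, not part of the listing above.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    scraper = WebScraper()
    pages = scraper.crawl('https://example.com')
    for page in pages:
        print(f"{page['url']} - {page['title']} ({len(page['content'])} chars)")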