From 1bd01bb46d810af7dedcd80ca8db1b8d41f08e20 Mon Sep 17 00:00:00 2001
From: Thomas Luechtefeld
Date: Thu, 12 Dec 2024 12:59:32 -0500
Subject: [PATCH] cir-reports works

---
 .gitignore                 |  5 +++-
 cache/.gitignore           |  2 ++
 dvc.lock                   | 50 +++++++++++++++++++++++++++++++++++++-
 dvc.yaml                   |  2 +-
 stages/02_get_pdf_links.py | 16 ++++++++----
 stages/03_download_pdfs.py |  2 +-
 stages/utils/scraperapi.py | 14 +++++++++++
 7 files changed, 82 insertions(+), 9 deletions(-)
 create mode 100644 cache/.gitignore
 create mode 100644 stages/utils/scraperapi.py

diff --git a/.gitignore b/.gitignore
index f9496fd..769013b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,7 @@ logs
 
 /list
 /brick
-__pycache__*
\ No newline at end of file
+__pycache__*
+.env
+log/*
+cache/*
\ No newline at end of file
diff --git a/cache/.gitignore b/cache/.gitignore
new file mode 100644
index 0000000..8874dea
--- /dev/null
+++ b/cache/.gitignore
@@ -0,0 +1,2 @@
+/ingredient_page_links.json
+/all_pdf_links.json
diff --git a/dvc.lock b/dvc.lock
index 3137d7a..2368e85 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -1,2 +1,50 @@
 schema: '2.0'
-stages: {}
+stages:
+  get_ingredient_links:
+    cmd: python stages/01_get_ingredient_links.py
+    deps:
+    - path: stages/01_get_ingredient_links.py
+      hash: md5
+      md5: 419ee6fb2fbec591e466a1a58cd02af7
+      size: 2527
+      isexec: true
+    outs:
+    - path: cache/ingredient_page_links.json
+      hash: md5
+      md5: 8871c26dfaf39cdb150ba2a1dd8f3c0b
+      size: 223951
+  get_pdf_links:
+    cmd: python stages/02_get_pdf_links.py
+    deps:
+    - path: cache/ingredient_page_links.json
+      hash: md5
+      md5: 8871c26dfaf39cdb150ba2a1dd8f3c0b
+      size: 223951
+    - path: stages/02_get_pdf_links.py
+      hash: md5
+      md5: a9f8b66eec9c4dba1b85aa531d19b651
+      size: 2215
+      isexec: true
+    outs:
+    - path: cache/all_pdf_links.json
+      hash: md5
+      md5: b0f1fc13abfa2e6d2e15d4ae34f58cc7
+      size: 34390
+  download_pdfs:
+    cmd: python stages/03_download_pdfs.py
+    deps:
+    - path: cache/all_pdf_links.json
+      hash: md5
+      md5: b0f1fc13abfa2e6d2e15d4ae34f58cc7
+      size: 34390
+    - path: stages/03_download_pdfs.py
+      hash: md5
+      md5: b968f299304438b687e2d51c329cf5ab
+      size: 775
+      isexec: true
+    outs:
+    - path: brick/cir_reports.pdf
+      hash: md5
+      md5: 480ab5f19049da242446fff3044bb088.dir
+      size: 395549978
+      nfiles: 364
diff --git a/dvc.yaml b/dvc.yaml
index 16da75c..e2ed937 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -20,4 +20,4 @@ stages:
     - stages/03_download_pdfs.py
     - cache/all_pdf_links.json
     outs:
-    - brick/cir_reports_pdf
+    - brick/cir_reports.pdf
diff --git a/stages/02_get_pdf_links.py b/stages/02_get_pdf_links.py
index 653f9b0..5e98acd 100755
--- a/stages/02_get_pdf_links.py
+++ b/stages/02_get_pdf_links.py
@@ -5,13 +5,15 @@
 import time
 import json
 import pathlib
+import logging
 import requests
-
 from utils.simple_cache import simple_cache
 from tqdm import tqdm
 from tenacity import retry, stop_after_attempt, wait_exponential
 
 cachedir = pathlib.Path('./cache')
+format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+logging.basicConfig(filename='log/get_pdf_links.log', level=logging.INFO, format=format)
 
 # GET PDF LINKS ================================================================
 ingredient_page_links = json.load(open(cachedir / 'ingredient_page_links.json'))
@@ -24,7 +26,6 @@
 @simple_cache(simple_cache_dir.as_posix(), expiry_seconds=60*60*48)
 @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=15))
 def download_pdfs_from_ingredient_page(ingredient_page_link):
-    # response = scraperapi.scrape(ingredient_page_link, ultra_premium=True)
     response = requests.get(ingredient_page_link)
     response.raise_for_status()
     time.sleep(0.5)
@@ -37,17 +38,22 @@ def download_pdfs_from_ingredient_page(ingredient_page_link):
 for ingredient_page_link in tqdm(ingredient_page_links):
     try:
         pdf_links = download_pdfs_from_ingredient_page(ingredient_page_link)
+        if len(pdf_links) == 0:
+            logging.info(f"No pdf links found for {ingredient_page_link}")
+            continue
         all_pdf_links.extend(pdf_links)
     except Exception as e:
         print(f"Error downloading {ingredient_page_link}: {e}")
         continue
 
 
 # write all_pdf_links to cache/all_pdf_links.txt
-all_pdf_links = list(set(all_pdf_links))
-json.dump(all_pdf_links, open(cachedir / 'all_pdf_links.json', 'w'))
+unique_pdf_links = list(set(all_pdf_links))
+logging.info(f"Total pdf links found: {len(all_pdf_links)}")
+logging.info(f"Unique pdf links found: {len(unique_pdf_links)}")
+json.dump(unique_pdf_links, open(cachedir / 'all_pdf_links.json', 'w'))
 
 # TEST RESULT =====================================================================
 res = json.load(open(cachedir / 'all_pdf_links.json'))
 assert len(res) > 1000
-assert len(res) == len(set(res))
+assert len(res) == len(all_pdf_links)
diff --git a/stages/03_download_pdfs.py b/stages/03_download_pdfs.py
index 3ce9517..92a9b19 100755
--- a/stages/03_download_pdfs.py
+++ b/stages/03_download_pdfs.py
@@ -6,7 +6,7 @@
 from tqdm import tqdm
 
 cachedir = pathlib.Path('./cache')
-brickdir = pathlib.Path('./brick') / 'cir_reports_pdf'
+brickdir = pathlib.Path('./brick') / 'cir_reports.pdf'
 brickdir.mkdir(parents=True, exist_ok=True)
 
 # DOWNLOAD PDFS ================================================================
diff --git a/stages/utils/scraperapi.py b/stages/utils/scraperapi.py
new file mode 100644
index 0000000..6d6ed1b
--- /dev/null
+++ b/stages/utils/scraperapi.py
@@ -0,0 +1,14 @@
+import os, requests, re, sqlite3, boto3, json, pathlib, dotenv
+
+dotenv.load_dotenv()
+scraperapi_key = os.getenv('SCRAPER_API')
+
+def scrape(scrape_url, autoparse=False, binary=False, ultra_premium=False):
+    params = {
+        'api_key': scraperapi_key,
+        'url': scrape_url,
+        'autoparse': autoparse,
+        'binary_target': binary,
+        'ultra_premium': ultra_premium
+    }
+    return requests.get('http://api.scraperapi.com', params=params)