From 1bd01bb46d810af7dedcd80ca8db1b8d41f08e20 Mon Sep 17 00:00:00 2001
From: Thomas Luechtefeld
Date: Thu, 12 Dec 2024 12:59:32 -0500
Subject: [PATCH] cir-reports works

---
 .gitignore                 |  5 +++-
 cache/.gitignore           |  2 ++
 dvc.lock                   | 50 +++++++++++++++++++++++++++++++++++++-
 dvc.yaml                   |  2 +-
 stages/02_get_pdf_links.py | 16 ++++++++----
 stages/03_download_pdfs.py |  2 +-
 stages/utils/scraperapi.py | 14 +++++++++++
 7 files changed, 82 insertions(+), 9 deletions(-)
 create mode 100644 cache/.gitignore
 create mode 100644 stages/utils/scraperapi.py

diff --git a/.gitignore b/.gitignore
index f9496fd..769013b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,7 @@ logs
 
 /list
 /brick
-__pycache__*
\ No newline at end of file
+__pycache__*
+.env
+log/*
+cache/*
\ No newline at end of file
diff --git a/cache/.gitignore b/cache/.gitignore
new file mode 100644
index 0000000..8874dea
--- /dev/null
+++ b/cache/.gitignore
@@ -0,0 +1,2 @@
+/ingredient_page_links.json
+/all_pdf_links.json
diff --git a/dvc.lock b/dvc.lock
index 3137d7a..2368e85 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -1,2 +1,50 @@
 schema: '2.0'
-stages: {}
+stages:
+  get_ingredient_links:
+    cmd: python stages/01_get_ingredient_links.py
+    deps:
+    - path: stages/01_get_ingredient_links.py
+      hash: md5
+      md5: 419ee6fb2fbec591e466a1a58cd02af7
+      size: 2527
+      isexec: true
+    outs:
+    - path: cache/ingredient_page_links.json
+      hash: md5
+      md5: 8871c26dfaf39cdb150ba2a1dd8f3c0b
+      size: 223951
+  get_pdf_links:
+    cmd: python stages/02_get_pdf_links.py
+    deps:
+    - path: cache/ingredient_page_links.json
+      hash: md5
+      md5: 8871c26dfaf39cdb150ba2a1dd8f3c0b
+      size: 223951
+    - path: stages/02_get_pdf_links.py
+      hash: md5
+      md5: a9f8b66eec9c4dba1b85aa531d19b651
+      size: 2215
+      isexec: true
+    outs:
+    - path: cache/all_pdf_links.json
+      hash: md5
+      md5: b0f1fc13abfa2e6d2e15d4ae34f58cc7
+      size: 34390
+  download_pdfs:
+    cmd: python stages/03_download_pdfs.py
+    deps:
+    - path: cache/all_pdf_links.json
+      hash: md5
+      md5: b0f1fc13abfa2e6d2e15d4ae34f58cc7
+      size: 34390
+    - path: stages/03_download_pdfs.py
+      hash: md5
+      md5: b968f299304438b687e2d51c329cf5ab
+      size: 775
+      isexec: true
+    outs:
+    - path: brick/cir_reports.pdf
+      hash: md5
+      md5: 480ab5f19049da242446fff3044bb088.dir
+      size: 395549978
+      nfiles: 364
diff --git a/dvc.yaml b/dvc.yaml
index 16da75c..e2ed937 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -20,4 +20,4 @@ stages:
     - stages/03_download_pdfs.py
     - cache/all_pdf_links.json
     outs:
-    - brick/cir_reports_pdf
+    - brick/cir_reports.pdf
diff --git a/stages/02_get_pdf_links.py b/stages/02_get_pdf_links.py
index 653f9b0..5e98acd 100755
--- a/stages/02_get_pdf_links.py
+++ b/stages/02_get_pdf_links.py
@@ -5,13 +5,15 @@
 import time
 import json
 import pathlib
+import logging
 import requests
-
 from utils.simple_cache import simple_cache
 from tqdm import tqdm
 from tenacity import retry, stop_after_attempt, wait_exponential
 
 cachedir = pathlib.Path('./cache')
+format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+logging.basicConfig(filename='log/get_pdf_links.log', level=logging.INFO, format=format)
 
 # GET PDF LINKS ================================================================
 ingredient_page_links = json.load(open(cachedir / 'ingredient_page_links.json'))
@@ -24,7 +26,6 @@
 @simple_cache(simple_cache_dir.as_posix(), expiry_seconds=60*60*48)
 @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=15))
 def download_pdfs_from_ingredient_page(ingredient_page_link):
-    # response = scraperapi.scrape(ingredient_page_link, ultra_premium=True)
     response = requests.get(ingredient_page_link)
     response.raise_for_status()
     time.sleep(0.5)
@@ -37,17 +38,22 @@ def download_pdfs_from_ingredient_page(ingredient_page_link):
 for ingredient_page_link in tqdm(ingredient_page_links):
     try:
         pdf_links = download_pdfs_from_ingredient_page(ingredient_page_link)
+        if len(pdf_links) == 0:
+            logging.info(f"No pdf links found for {ingredient_page_link}")
+            continue
         all_pdf_links.extend(pdf_links)
     except Exception as e:
         print(f"Error downloading {ingredient_page_link}: {e}")
         continue
 
 
 # write all_pdf_links to cache/all_pdf_links.txt
-all_pdf_links = list(set(all_pdf_links))
-json.dump(all_pdf_links, open(cachedir / 'all_pdf_links.json', 'w'))
+unique_pdf_links = list(set(all_pdf_links))
+logging.info(f"Total pdf links found: {len(all_pdf_links)}")
+logging.info(f"Unique pdf links found: {len(unique_pdf_links)}")
+json.dump(unique_pdf_links, open(cachedir / 'all_pdf_links.json', 'w'))
 
 # TEST RESULT =====================================================================
 res = json.load(open(cachedir / 'all_pdf_links.json'))
 assert len(res) > 1000
-assert len(res) == len(set(res))
+assert len(res) == len(all_pdf_links)
diff --git a/stages/03_download_pdfs.py b/stages/03_download_pdfs.py
index 3ce9517..92a9b19 100755
--- a/stages/03_download_pdfs.py
+++ b/stages/03_download_pdfs.py
@@ -6,7 +6,7 @@
 from tqdm import tqdm
 
 cachedir = pathlib.Path('./cache')
-brickdir = pathlib.Path('./brick') / 'cir_reports_pdf'
+brickdir = pathlib.Path('./brick') / 'cir_reports.pdf'
 brickdir.mkdir(parents=True, exist_ok=True)
 
 # DOWNLOAD PDFS ================================================================
diff --git a/stages/utils/scraperapi.py b/stages/utils/scraperapi.py
new file mode 100644
index 0000000..6d6ed1b
--- /dev/null
+++ b/stages/utils/scraperapi.py
@@ -0,0 +1,14 @@
+import os, requests, re, sqlite3, boto3, json, pathlib, dotenv
+
+dotenv.load_dotenv()
+scraperapi_key = os.getenv('SCRAPER_API')
+
+def scrape(scrape_url, autoparse=False, binary=False, ultra_premium=False):
+    params = {
+        'api_key': scraperapi_key,
+        'url': scrape_url,
+        'autoparse': autoparse,
+        'binary_target': binary,
+        'ultra_premium': ultra_premium
+    }
+    return requests.get('http://api.scraperapi.com', params=params)