diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..fabc417 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "Code/resources/advisory-database"] + path = Code/resources/advisory-database + url = https://github.com/github/advisory-database.git diff --git a/Code/collect_commits.py b/Code/collect_commits.py index f13fbb5..b21f401 100644 --- a/Code/collect_commits.py +++ b/Code/collect_commits.py @@ -28,6 +28,15 @@ session = create_session() conn = session.connection() +def download_patch(repo_url, patch_file_address, hashsha): + if os.path.exists(patch_file_address): return + if 'github.com' in repo_url: + cf.logger.info(f'Trying to download patch file directly from github: {repo_url}/commit/{hashsha}.patch') + res = requests.get(f'{repo_url}/commit/{hashsha}.patch') + if res.status_code != 200: return + patch_text=res.text + cf.logger.info(patch_file_address) + open(patch_file_address, 'w+').write(patch_text) def extract_commit_url_from_refs(ref_list, cve_id): # Direct commit URLS @@ -373,9 +382,8 @@ def extract_commits(repo_url, hashes, cached_repo_address=None): } repo_commits.append(commit_row) # Create patch file from commit - if os.path.exists(patch_file_address): + if not os.path.exists(patch_file_address): create_git_patch(cached_repo_address, single_hash, cf.PATCH_FILE_STORAGE_PATH, f"{repo_name}_{single_hash}.patch") - print(patch_file_address) patch_size = get_file_size(patch_file_address) if patch_size > cf.MAXIMUM_PATCH_SIZE_FOR_DB_STORAGE: continue @@ -385,23 +393,18 @@ def extract_commits(repo_url, hashes, cached_repo_address=None): repo_files.extend(commit_files) repo_methods.extend(commit_methods) except Exception as e: - print(f'Problem while fetching the commits1: {e}') + cf.logger.error(f'Problem while fetching the commits1: {e}') except Exception as e: - print(f'Problem while fetching the commits2: {e}') + cf.logger.error(f'Problem while fetching the commits2: {e}') except Exception as e: + cf.logger.error(f'Extracting commits failed: {e}') try: if not os.path.exists(patch_file_address): - print('Trying to extract commits directly from github') - if 'github' in repo_url: - patch_text = requests.get(f'{repo_url}/commit/{single_hash}.patch').text - open(patch_file_address, 'w+').write(patch_text) + download_patch(repo_url, patch_file_address, single_hash) except Exception as e: print(f'Trying to extract commits directly from github failed {str(e)}') - print(f'Error: {str(e)}') if repo_commits: - print('4') df_repo_commits = pd.DataFrame.from_dict(repo_commits) - print('5') df_repo_commits = df_repo_commits[COMMIT_COLUMNS] # ordering the columns else: df_repo_commits = None @@ -414,12 +417,8 @@ def extract_commits(repo_url, hashes, cached_repo_address=None): df_repo_files = None if repo_methods: - print('10') - df_repo_methods = pd.DataFrame.from_dict(repo_methods) - print('11') df_repo_methods = df_repo_methods[METHOD_COLUMNS] # ordering the - print('12') else: df_repo_methods = None diff --git a/Code/collect_projects.py b/Code/collect_projects.py index a1f709f..2f4f776 100644 --- a/Code/collect_projects.py +++ b/Code/collect_projects.py @@ -1,24 +1,29 @@ import os import shutil +import sys import time from urllib.parse import urlparse +from pathlib import Path import pandas as pd import requests import github from sqlalchemy import text +from tqdm import tqdm import configuration as cf import cve_importer import database as db from Code.cpe_parser import parse_cpe_dict +from Code.registry_to_github import clean_git_url from 
Code.resources.cpe_to_github_search import search_missing_cpes_in_github from Code.resources.cveprojectdatabase import create_cve_mapper_table -from Code.resources.dynamic_commit_collector import add_missing_commits, execute_command, remove_all_directories +from Code.resources.dynamic_commit_collector import add_missing_commits, execute_command, remove_all_directories, \ + is_repo_available, PROJECT_STATUS_REPO_REMOVED from Code.resources.extract_github_repo_from_ghsd import parse_and_append_ghsd_dataset from resources.find_repo_url import apply_cve_cpe_mappers from database import create_session -from collect_commits import extract_commits, extract_project_links +from collect_commits import extract_commits, extract_project_links, download_patch from constants import REPO_COLUMNS from utils import prune_tables @@ -26,6 +31,26 @@ conn = session.connection() + +def add_missing_patches(): + # Specify the directory + directory = Path(cf.PATCH_FILE_STORAGE_PATH) + # Get the list of files in the directory + patches = {f.name: True for f in directory.iterdir() if f.is_file()} + + res = db.get_query("SELECT hash, repo_url from fixes where extraction_status='COMPLETED'") + for r in tqdm(res): + if len(r['hash'])<=5: continue + repo_name = (r['repo_url'].lstrip('https://')).replace('/', '_') + patch_path = os.path.join(cf.PATCH_FILE_STORAGE_PATH, f"{repo_name}_{r['hash']}.patch") + if patches.get(f"{repo_name}_{r['hash']}.patch"): + # Do not download if patch already exists + continue + download_patch(r['repo_url'], patch_path, r['hash']) + time.sleep(5) + + + def create_fixes_table(): query = text('''CREATE TABLE IF NOT EXISTS fixes ( @@ -216,36 +241,42 @@ def get_github_repo_meta(repo_url: str, username: str, token): """ returns github meta-information of the repo_url """ - - # handle renamed repos - repo_url = extract_location_header(repo_url) - - repo_url = repo_url.rstrip('/') - owner, project = repo_url.split('/')[-2], repo_url.split('/')[-1] - meta_row = {} - - if username == 'None': - git_link = github.Github() - else: - git_link = github.Github(login_or_token=token, user_agent=username) - print(owner) - git_user = git_link.get_user(owner) - print(project) - repo = git_user.get_repo(project) - meta_row = {'repo_url': repo_url, - 'repo_name': repo.full_name, - 'description': repo.description, - 'date_created': repo.created_at, - 'date_last_push': repo.pushed_at, - 'homepage': repo.homepage, - 'repo_language': repo.language, - 'forks_count': repo.forks, - 'stars_count': repo.stargazers_count, - 'owner': owner} + try: + repo_status = is_repo_available(repo_url) + print(repo_status) + if repo_status == 'Removed': + return None + # handle renamed repos + repo_url = extract_location_header(repo_url) + + repo_url = repo_url.rstrip('/') + owner, project = repo_url.split('/')[-2], repo_url.split('/')[-1] + meta_row = {} + + if username == 'None': + git_link = github.Github() + else: + git_link = github.Github(login_or_token=token, user_agent=username) + cf.logger.info(f"Getting github meta information for {repo_url}") + git_user = git_link.get_user(owner) + repo = git_user.get_repo(project) + meta_row = {'repo_url': repo_url, + 'repo_name': repo.full_name, + 'description': repo.description, + 'date_created': repo.created_at, + 'date_last_push': repo.pushed_at, + 'homepage': repo.homepage, + 'repo_language': repo.language, + 'forks_count': repo.forks, + 'stars_count': repo.stargazers_count, + 'owner': owner} + return meta_row + except Exception as e: + cf.logger.error(f"Getting meta information failed 
for repo url failed {e}") + return None # except BadCredentialsException as e: # cf.logger.warning(f'Credential problem while accessing GitHub repository {repo_url}: {e}') # pass # or exit(1) - return meta_row def save_repo_meta(repo_url): @@ -257,16 +288,27 @@ def save_repo_meta(repo_url): new_conn = new_session.connection() # ignore when the meta-information of the given repo is already saved. - + repo_url = clean_git_url(repo_url) try: - if db.table_exists('repository') and db.get_one_query(f"select * from repository where repo_url='{repo_url}'"): - return + if db.get_one_query(f"select * from repository where repo_url='{repo_url}'"): + return 'FIX_WAS_AVAILABLE' if 'github.' in repo_url: meta_dict = get_github_repo_meta(repo_url, cf.USER, cf.TOKEN) - df_meta = pd.DataFrame([meta_dict], columns=REPO_COLUMNS) - df_meta.to_sql(name='repository', con=new_conn, if_exists="append", index=False) + if not meta_dict: + # Repository removed, most likely the next extraction process will fail. + return 'REPO_REMOVED' + # Build the insert SQL query + insert_query = text(f""" + INSERT INTO repository ({', '.join(REPO_COLUMNS)}) + VALUES ({', '.join([':{}'.format(col) for col in REPO_COLUMNS])}) + ON CONFLICT (repo_url) DO NOTHING; + """) + # Execute the SQL statement using parameterized query + new_conn.execute(insert_query, meta_dict) new_conn.commit() + return True except Exception as e: + print("Inserting new repository resulted in error.") cf.logger.warning(f'Problem while fetching repository meta-information: {e}') finally: new_conn.close() @@ -327,11 +369,6 @@ def fetch_and_store_commits(): commit_fixes_query = f"SELECT * FROM fixes where score >= {THRESHOLD_SCORE} and extraction_status = 'NOT_STARTED' " if db.table_exists('commits'): commit_fixes_query += ' and hash not in (select distinct hash from commits)' - try: - db.exec_query('ALTER TABLE commits ADD CONSTRAINT hash_unique_constraint UNIQUE (hash, repo_url);') - conn.commit() - except Exception as e: - print(e) # if db.table_exists('file_change'): # try: # db.exec_query('CREATE UNIQUE INDEX hashdiffoldpath_unique_index ON file_change (hash, diff, old_path);') @@ -350,12 +387,16 @@ def fetch_and_store_commits(): pcount = 0 for repo_url in repo_urls: - save_repo_meta(repo_url) + status = save_repo_meta(repo_url) + if status == 'REPO_REMOVED': + cf.logger.info(f"Repo removed! 
{repo_url}") + db.exec_query(f"UPDATE fixes SET extraction_status='{PROJECT_STATUS_REPO_REMOVED}' where repo_url='{repo_url}' and extraction_status='NOT_STARTED'") + continue pcount += 1 session = create_session() conn = session.connection() - + repo_path='' try: df_single_repo = df_fixes[df_fixes.repo_url == repo_url] hashes = list(df_single_repo.hash.unique()) @@ -365,6 +406,13 @@ def fetch_and_store_commits(): repo_path = repo_cache_dict.get(repo_url, clone_memo_repo(repo_url)) df_commit, df_file, df_method = extract_commits(repo_url, hashes, repo_path) # remove_directory(repo_path) + for single_hash in hashes: + repo_name = (repo_url.lstrip('https://')).replace('/', '_') + patch_path = os.path.join(cf.PATCH_FILE_STORAGE_PATH, f"{repo_name}_{single_hash}.patch") + if os.path.exists(patch_path): continue + cf.logger.warning(f'Trying to download patch files directly {repo_url}-{single_hash}') + cf.logger.info(f"DL Path: {patch_path}") + download_patch(repo_url, patch_path, single_hash) if df_commit is None: cf.logger.warning(f'Could not retrieve commit information from: {repo_url}') @@ -386,7 +434,7 @@ def fetch_and_store_commits(): conn.execute(sql, { 'hash': row['hash'], - 'repo_url': row['repo_url'], + 'repo_url': clean_git_url(row['repo_url']), 'author': row['author'], 'committer': row['committer'], 'msg': row['msg'], @@ -410,6 +458,7 @@ def fetch_and_store_commits(): df_file.to_sql(name="file_change", con=conn, if_exists="append", index=False) else: for index, row in df_file.iterrows(): + #TODO: Require a double check for duplicated in file_change, method_change sql = text(''' INSERT INTO file_change (file_change_id,hash,filename,old_path,new_path,change_type,diff,diff_parsed,num_lines_added,num_lines_deleted,code_after,code_before,nloc,complexity,token_count,programming_language) VALUES (:file_change_id,:hash,:filename,:old_path,:new_path,:change_type,:diff,:diff_parsed,:num_lines_added,:num_lines_deleted,:code_after,:code_before,:nloc,:complexity,:token_count,:programming_language) @@ -443,13 +492,14 @@ def fetch_and_store_commits(): conn.commit() hash_query = str(hashes)[1:-1] - print(f"UPDATE fixes SET extraction_status='COMPLETED' where hash in ({hash_query})") db.exec_query(f"UPDATE fixes SET extraction_status='COMPLETED' where hash in ({hash_query})") except Exception as e: # cf.logger.warning(f'Problem occurred while retrieving the project: {repo_url}: {e}') print(f'Problem occurred while retrieving the project: {repo_url}: {e}') # pass # skip fetching repository if is not available. 
+ finally: + remove_directory(repo_path) conn.commit() cf.logger.debug('-' * 70) @@ -491,35 +541,33 @@ def remove_lowscore_fixes(min_score): if __name__ == '__main__': + print('Starting ...') + start_time = time.perf_counter() - if False: - print('Starting ...') - start_time = time.perf_counter() + print('Importing CVEs') + # Step (1) save CVEs(cve) and cwe tables + cve_importer.import_cves() - print('Importing CVEs') - # Step (1) save CVEs(cve) and cwe tables - cve_importer.import_cves() + print('Parsing & extracting NVD dataset') + populate_fixes_table() - print('Parsing & extracting NVD dataset') - populate_fixes_table() + print('Parsing & Adding GHSD dataset') - print('Parsing & Adding GHSD dataset') + # Parse & append GHSD dataset + parse_and_append_ghsd_dataset() - # Parse & append GHSD dataset - parse_and_append_ghsd_dataset() + # Step (2.2) Find any CVE that have no Github fix using CPE + # Parse official CPE dictionary + parse_cpe_dict() - # Step (2.2) Find any CVE that have no Github fix using CPE - # Parse official CPE dictionary - parse_cpe_dict() - - apply_cve_cpe_mappers() - # - # end_time = time.perf_counter() - # hours, minutes, seconds = convert_runtime(start_time, end_time) - # cf.logger.info(f'Time elapsed to pull the data {hours:02.0f}:{minutes:02.0f}:{seconds:02.0f} (hh:mm:ss).') + apply_cve_cpe_mappers() + # + # end_time = time.perf_counter() + # hours, minutes, seconds = convert_runtime(start_time, end_time) + # cf.logger.info(f'Time elapsed to pull the data {hours:02.0f}:{minutes:02.0f}:{seconds:02.0f} (hh:mm:ss).') - # Step (2.3) Run prospector on cve_project table, to find out all fixing commits. - add_missing_commits() + # Step (2.3) Run prospector on cve_project table, to find out all fixing commits. + add_missing_commits() # remove_lowscore_fixes(cf.MINIMUM_COMMIT_SCORE) # Step (3) save commit-, file-, and method- level data tables to the database @@ -533,6 +581,7 @@ def remove_lowscore_fixes(min_score): # fix_column_types() # else: # cf.logger.warning('Data pruning is not possible because there is no information in method_change table') + add_missing_patches() cf.logger.info('The database is up-to-date.') cf.logger.info('-' * 70) # --------------------------------------------------------------------------------------------------------------------- diff --git a/Code/configuration.py b/Code/configuration.py index 3e18759..8998914 100644 --- a/Code/configuration.py +++ b/Code/configuration.py @@ -4,16 +4,17 @@ from pathlib import Path from dotenv import load_dotenv import os - +import multiprocessing as mp load_dotenv('.env') # set sensible defaults for thTe configurable fields DATA_PATH = 'Data' DATABASE_NAME = 'CVEfixes_sample.db' -USER = None -TOKEN = None +USER = os.getenv('GITHUB_USER', None) +TOKEN = os.getenv('GITHUB_TOKEN', None) SAMPLE_LIMIT = 0 -NUM_WORKERS = 30 +NUM_WORKERS = 8 +PROSPECTOR_WORKERS = min(mp.cpu_count() - 1, 15) # Anything more than 20 will result in rate limits. 
LOGGING_LEVEL = logging.WARNING PROSPECTOR_PYTHON_PATH = os.getenv('PROSPECTOR_PYTHON_PATH') diff --git a/Code/constants.py b/Code/constants.py index 08c11e0..abf3de5 100644 --- a/Code/constants.py +++ b/Code/constants.py @@ -45,7 +45,7 @@ 'rel_type' ] -REPO_BLACK_LIST_WORDS_PATTERN = re.compile(r'bugbounty|0day|injection|advisor|GHSA-|zero-day|exploit|poc|cve|vulnerabil|\.github\.io', +REPO_BLACK_LIST_WORDS_PATTERN = re.compile(r'bugbounty|0day|injection|advisor|GHSA-|zero-day|exploit|poc|cve|vulnerabil|malware|\.github\.io', re.IGNORECASE) REPO_BLACK_LIST_EXACT_WORDS_PATTERN = [ 'research', @@ -118,6 +118,9 @@ '0day', 'IBOS_4.4.3', 'ttt', + 'IoT-vulnerable', + 'security-research', + 'cxcxcxcxcxcxcxc', ] REPO_BLACK_LIST_EXACT_WORDS_PATTERN = list(map(str.lower, REPO_BLACK_LIST_EXACT_WORDS_PATTERN)) diff --git a/Code/cpe_parser.py b/Code/cpe_parser.py index 73e6afd..02a712e 100644 --- a/Code/cpe_parser.py +++ b/Code/cpe_parser.py @@ -90,12 +90,10 @@ def parse_cpe_dict(): 'repo_url': repo_url, 'rel_type': rel_type, }) + session.commit() print(f"Inserted {iz} cpe->repository mapping tuples") print(f"Total blacklisted CPEs: {total_blacklisted_count}") print('Adding missing CPEs based on Github availability') - - # TODO: UNCOMMENT BELOW search_missing_cpes_in_github() - session.commit() \ No newline at end of file diff --git a/Code/registry_to_github.py b/Code/registry_to_github.py index c1cd037..582852d 100644 --- a/Code/registry_to_github.py +++ b/Code/registry_to_github.py @@ -43,14 +43,11 @@ def clean_git_url(url): for prefix in prefixes: if url.startswith(prefix): url = url[len(prefix):] - # Remove '.git' extension if present if url.endswith(".git"): url = url[:-len(".git")] - # Remove user info from ssh urls (e.g., git@) url = url.replace("git@", "").replace('/tree/main', '') - return f"https://{url}" @@ -78,6 +75,7 @@ def get_version(package_name, ecosystem): return try: package_name = quote(package_name, safe="") + if package_name.startswith('vuln%2FGO'): return None version = get_version(package_name, ecosystem) api = f'https://api.deps.dev/v3alpha/systems/{ecosystem}/packages/{package_name}/versions/{version}' response = requests.get(api) diff --git a/Code/resources/cpe_to_github_search.py b/Code/resources/cpe_to_github_search.py index 28cebe9..fcdbc8d 100644 --- a/Code/resources/cpe_to_github_search.py +++ b/Code/resources/cpe_to_github_search.py @@ -11,6 +11,7 @@ import requests from Code.resources.dynamic_commit_collector import execute_command, is_repo_available +import Code.configuration as cf def exists_in_github(cpe): @@ -33,6 +34,7 @@ def exists_in_github(cpe): def search_missing_cpes_in_github(): + print("Search missing CPEs started") cpes = get_query( 'select distinct cpe_name from cve_cpe_mapper where cpe_name not in(select distinct cpe_name from cpe_project)') @@ -40,7 +42,7 @@ def search_missing_cpes_in_github(): conn = session.connection() print(f'Searching for missing CPES... 
{len(cpes)}') total_blacklisted = 0 - with mp.Pool(processes=5) as pool, tqdm(total=len(cpes)) as progress_bar: + with mp.Pool(processes=cf.NUM_WORKERS) as pool, tqdm(total=len(cpes)) as progress_bar: new_cpes = list(tqdm(pool.imap_unordered(exists_in_github, cpes), total=len(cpes))) for repo in new_cpes: if not repo: @@ -55,4 +57,6 @@ def search_missing_cpes_in_github(): 'repo_url': repo_address, 'rel_type': GITREF_CPE_SEARCH, }) + print("Commiting") conn.commit() + print("Done") diff --git a/Code/resources/dynamic_commit_collector.py b/Code/resources/dynamic_commit_collector.py index 6412585..f4f97f6 100644 --- a/Code/resources/dynamic_commit_collector.py +++ b/Code/resources/dynamic_commit_collector.py @@ -9,9 +9,11 @@ from bs4 import BeautifulSoup import psutil from sqlalchemy import text - +from pathlib import Path import multiprocessing as mp +import Code.configuration as cf + from tqdm import tqdm from Code.database import create_session, fetchone_query, get_query, get_one_query, exec_query @@ -19,6 +21,8 @@ PROSPECTOR_GIT_CACHE, HARDWARE_RESOURCE_THRESHOLD_PERCENT, TOKEN import orjson +from Code.registry_to_github import is_black_list + # A global lock to prevent prospector working on multpiple CVES from save project @@ -29,18 +33,21 @@ PROJECT_STATUS_REPO_REMOVED = 'REPO_REMOVED' PROJECT_STATUS_PROSPECTOR_FAILED = 'PROSPECTOR_FAILED' PROJECT_STATUS_NO_FIX_FOUND = 'NO_FIX_WAS_FOUND' +PROJECT_STATUS_BLOCK_LISTED = 'BLOCK_LISTED' PROJECT_STATUS_FIX_FOUND = 'Success' DISK_USAGE_THRESHOLD = 50 +Path(PROSPECTOR_GIT_CACHE).mkdir(parents=True, exist_ok=True) + + def is_repo_available(url): try: # Send an HTTP GET request to the repository's web page response = requests.get(url, headers={ 'Authorization': f'Bearer {TOKEN}' }) - # Check if the request was successful # What if it's renamed? if response.status_code == 200: @@ -50,6 +57,8 @@ def is_repo_available(url): # TODO: Add specific access rate limit? Code: 429 if response.status_code == 429: + cf.logger.error("We reached a rate limit! Better stop now") + time.sleep(60) return 'Unavailable' # if response.status_code == 404: return 'Removed' @@ -218,6 +227,9 @@ def process_commits(dict_input): git_repo_lock_list.append(project_url) print(f'{project_url} -> LOCKING {git_repo_lock_list}') print(f'Starting finding candidates for {cve} ...') + if is_black_list(project_url): + exec_query(f"UPDATE cve_project SET checked = '{PROJECT_STATUS_BLOCK_LISTED}' WHERE id = '{id}'") + return exec_query(f"UPDATE cve_project SET checked = '{PROJECT_STATUS_FINDING_FIX}' WHERE id = '{id}'") repo_status = is_repo_available(project_url) if repo_status == 'Removed': @@ -311,8 +323,8 @@ def add_missing_commits(years=None): print(f"Still to low ... performing full cache wipe") cleanup() # cpu_count = mp.cpu_count() - 1 - # cpu_count = mp.cpu_count() - 1 - cpu_count = 20 # Anything more than it will result in rate limit ... + cpu_count = cf.PROSPECTOR_WORKERS + # cpu_count = 20 # Anything more than it will result in rate limit ... 
for i in range(len(cve_projects)): cve_projects[i]['lock_list'] = git_repo_lock_list diff --git a/Code/resources/extract_github_repo_from_ghsd.py b/Code/resources/extract_github_repo_from_ghsd.py index 3a87aee..2cea227 100644 --- a/Code/resources/extract_github_repo_from_ghsd.py +++ b/Code/resources/extract_github_repo_from_ghsd.py @@ -155,6 +155,6 @@ def parse_and_append_ghsd_dataset(): }) conn.commit() print("Data inserted into 'fixes' table successfully.") - cf.logger.error(f"After black list counter {total_blacklisted}") + cf.logger.info(f"After black list counter {total_blacklisted}") # populate_fixes_table() # parse_and_append_ghsd_dataset() \ No newline at end of file diff --git a/Code/utils.py b/Code/utils.py index bd634e1..3467abd 100644 --- a/Code/utils.py +++ b/Code/utils.py @@ -7,6 +7,7 @@ import configuration as cf import database as db +from Code.resources.dynamic_commit_collector import PROJECT_STATUS_REPO_REMOVED from database import create_session session = create_session() @@ -102,7 +103,7 @@ def prune_tables(datafile): # copyfile(datafile, str(datafile).split('.')[0] + '_raw.db') from Code.collect_projects import save_repo_meta - for r in db.get_query('select distinct repo_url from fixes where repo_url not in (select repo_url from repository)'): + for r in db.get_query(f'select distinct repo_url from fixes where extraction_status!="{PROJECT_STATUS_REPO_REMOVED}" and repo_url not in (select repo_url from repository)'): save_repo_meta(r['repo_url']) @@ -118,7 +119,6 @@ def prune_tables(datafile): # processing commit, file and method tables for filtering out some invalid records df_commit['repo_url'] = df_commit.repo_url.apply(lambda x: x.rsplit('.git')[0]) df_commit = df_commit.drop_duplicates().reset_index(drop=True) - df_repo = df_repo.drop_duplicates().reset_index(drop=True) invalid_hashes = set(list(df_commit.hash.unique())).difference(set(list(df_fixes.hash.unique()))) # replace short hash of fix table with long hash from the commits table diff --git a/README.md b/README.md index b442f2e..af6304e 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,9 @@ It's highly recommended to first read the MoreFixes paper and understand differe This tool is consisted of two main components(Morefixes and Prospector) and two data sources(NVD and GSAD) ### Configure Morefixes +> [!CAUTION] +> You have to configure Prospector as well to make Morefixes work +Make sure you have cloned this repository with submodules. MoreFixes structure itself is based on [CVEFixes project](https://github.com/secureIT-project/CVEfixes). Add the Github security advisory database(https://github.com/github/advisory-database) in `Code/resources/ghsd` to get latest vulnerabilities list. Then, create a virtual python environment(recommended) in the repo root directory, and install dependencies: @@ -64,17 +67,27 @@ Then, create a virtual python environment(recommended) in the repo root director Renamed `env.sample` to `.env` and update the fields in `.env` and `.CVEfixes.ini` in tool root directory, Note that these values should be same for similar services(for example posgtresql database credentials) related to each other. -### Configure prospector -We are not planning to keep prospector in this repository, and instead, fetch latest Prospector form ProjectKB. As a temporary workaround, you'll need to update the modified version of prospector(which is available in this repository). -Update 'config.yaml' in `/prospector` path, and copy the current `.env` file to `/prospector` directory as well. 
This mess will be fixed in the future :)
-Create a separate virtual environment in `/prospector` and install requirements for prospector(`pip install -r requirements.txt`). Update python executor path in `runner.sh` if the virtual environment directory name is not 'venv'.
+### Configure prospector
+Update 'config.yaml' in the `/prospector` path, and copy the current `.env` file to the `/prospector` directory as well.
+Create a separate virtual environment in `/prospector` named `venv` and install the requirements for Prospector (`pip install -r requirements.txt`).
+In the Prospector venv, run:
+```
+python -m spacy download en_core_web_sm
+python -m spacy download en_core_web_lg
+python -m spacy download en
+```
+> [!WARNING]
+> If the virtual environment directory name is not 'venv', update the Python executor path in `runner.sh`.
+
+> [!WARNING]
+> We are not planning to keep Prospector in this repository; this is a temporary workaround.

 ### Run the tool
-If you want to update the dataset for new CVEs, run the tool by executing `bash Code/run.sh`. This will first update the GHSA dataset in `/Code/resources/advisory-database` and download latest CPE Dictionary from NVD and starts the whole flow mentioned in the figure 1 of the paper.
+We HIGHLY recommend restoring the latest backup first and then running the tool so that it only processes new advisories. Otherwise,
+**deactivate the Prospector venv and switch to the Morefixes venv**, then run the tool by executing `bash Code/run.sh`. This will first update the GHSA dataset in `/Code/resources/advisory-database`, download the latest CPE dictionary from NVD, and start the whole flow shown in Figure 1 of the paper.

 Please note we don't recommend running it on a low-end device and using the published dataset should be an easier choice if you just need to work with the available data.
-
 ### Troubleshooting
 One of the heaviest modules of this software, is located at `Code/resources/dynamic_commit_collector.py`, which will process possible fix commits in parallel. If you need to run the software from scratch, make sure to double-check parameters in this page to make sure your system won't break during processing.
diff --git a/docker-compose.yml b/docker-compose.yml index 07d192b..7464615 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -9,7 +9,7 @@ services: env_file: - .env ports: - - "127.0.0.1:5432:5432" + - "${POSTGRES_PORT}:${POSTGRES_PORT}" volumes: - postgres_data2:/var/lib/postgresql/data - ./dump_morefixes_27-03-2024_19_52_58.sql:/docker-entrypoint-initdb.d/dump_morefixes_27-03-2024_19_52_58.sql diff --git a/env.sample b/env.sample index 18a3f47..baf2923 100644 --- a/env.sample +++ b/env.sample @@ -1,12 +1,12 @@ MIN_COMMIT_SCORE=65 -GIT_CACHE=/ssddata/jafar/tmp -PROSPECTOR_GIT_CACHE=/ssddata/jafar/tmp -PROSPECTOR_PATH=/ssddata/jafar/project-kb/prospector/ -PROSPECTOR_PYTHON_PATH=/ssddata/jafar/project-kb/prospector/myenv/bin/python -PATCH_FILE_STORAGE_PATH=/pool0/data/jef/cvedataset-patches/ -CVE_DATA_PATH=/ssddata/jafar/tmp/cvedata1112222 +GIT_CACHE=/tmp/gitcache +PROSPECTOR_GIT_CACHE=/tmp/proscache +PROSPECTOR_PATH=/home/jef/projects/Morefixes/prospector/ +PROSPECTOR_PYTHON_PATH=/home/jef/projects/Morefixes/prospector/venv/bin/python +PATCH_FILE_STORAGE_PATH=/home/jef/projects/Morefixes/patchesdir/cvedataset-patches +CVE_DATA_PATH=/tmp/cvedata1112222 POSTGRES_USER=postgrescvedumper -POSTGRES_PORT=9921 +POSTGRES_PORT=5432 POSTGRES_DBNAME=postgrescvedumper POSTGRES_DB=postgrescvedumper POSTGRES_PASSWORD=a42a18537d74c3b7e584c769152c3d diff --git a/prospector/config.yaml b/prospector/config.yaml index 5c73552..7cb29bd 100644 --- a/prospector/config.yaml +++ b/prospector/config.yaml @@ -4,7 +4,7 @@ preprocess_only: False # Maximum number of commits to process -max_candidates: 5000 +max_candidates: 3000 fetch_references: True @@ -41,7 +41,7 @@ report: log_level: INFO # The directory used to cache the cloned repositories -git_cache: /ssddata/jafar/tmp +git_cache: /tmp/gitcache # The GitHub API token github_token: diff --git a/prospector/runner.sh b/prospector/runner.sh index 5e918c2..bac3a47 100755 --- a/prospector/runner.sh +++ b/prospector/runner.sh @@ -1,5 +1,5 @@ #!/bin/bash cd $(dirname "$0") source venv/bin/activate -#timeout 3000 python3 cli/main.py "$@" -python3 cli/main.py "$@" +timeout 3000 python3 cli/main.py "$@" +#python3 cli/main.py "$@"
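
Reviewer-style sketch, not part of the patch above: the new `download_patch()` helper calls `requests.get()` without a timeout and writes the patch through an unclosed file handle, and the companion code in `add_missing_patches()`/`fetch_and_store_commits()` builds patch filenames with `repo_url.lstrip('https://')`, which strips a *character set* (h, t, p, s, :, /) rather than the literal prefix, so hosts whose name begins with one of those characters would be mangled. The snippet below shows one possible hardened variant under those assumptions; the names `repo_slug` and the `timeout` parameter are illustrative, not existing project APIs.

```python
# Hedged sketch of a hardened patch-download helper, mirroring the
# "<repo_url>/commit/<sha>.patch" endpoint and the "<repo>_<sha>.patch"
# filename convention used in the diff above.
import os
from urllib.parse import urlparse

import requests


def repo_slug(repo_url: str) -> str:
    """Turn e.g. https://github.com/owner/project into github.com_owner_project.

    Parsing the URL avoids the str.lstrip('https://') pitfall, which strips
    characters (h, t, p, s, :, /) rather than the literal "https://" prefix.
    """
    parsed = urlparse(repo_url)
    return f"{parsed.netloc}{parsed.path}".rstrip('/').replace('/', '_')


def download_patch(repo_url: str, patch_file_address: str, sha: str, timeout: int = 30) -> bool:
    """Fetch <repo_url>/commit/<sha>.patch and store it, skipping existing files."""
    if os.path.exists(patch_file_address):
        return True
    if 'github.com' not in repo_url:
        return False
    # Bounded request so a stalled download cannot hang the pipeline.
    res = requests.get(f"{repo_url}/commit/{sha}.patch", timeout=timeout)
    if res.status_code != 200:
        return False
    # Context manager guarantees the file handle is flushed and closed.
    with open(patch_file_address, 'w') as fh:
        fh.write(res.text)
    return True


if __name__ == '__main__':
    print(repo_slug('https://github.com/github/advisory-database'))
    # -> github.com_github_advisory-database
```

If the lstrip-based naming is kept for compatibility with patch files already on disk, the request timeout and the context-managed write are still worth folding in on their own.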