diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..fabc417 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "Code/resources/advisory-database"] + path = Code/resources/advisory-database + url = https://github.com/github/advisory-database.git diff --git a/Code/collect_commits.py b/Code/collect_commits.py index f13fbb5..b21f401 100644 --- a/Code/collect_commits.py +++ b/Code/collect_commits.py @@ -28,6 +28,15 @@ session = create_session() conn = session.connection() +def download_patch(repo_url, patch_file_address, hashsha): + if os.path.exists(patch_file_address): return + if 'github.com' in repo_url: + cf.logger.info(f'Trying to download patch file directly from github: {repo_url}/commit/{hashsha}.patch') + res = requests.get(f'{repo_url}/commit/{hashsha}.patch') + if res.status_code != 200: return + patch_text=res.text + cf.logger.info(patch_file_address) + open(patch_file_address, 'w+').write(patch_text) def extract_commit_url_from_refs(ref_list, cve_id): # Direct commit URLS @@ -373,9 +382,8 @@ def extract_commits(repo_url, hashes, cached_repo_address=None): } repo_commits.append(commit_row) # Create patch file from commit - if os.path.exists(patch_file_address): + if not os.path.exists(patch_file_address): create_git_patch(cached_repo_address, single_hash, cf.PATCH_FILE_STORAGE_PATH, f"{repo_name}_{single_hash}.patch") - print(patch_file_address) patch_size = get_file_size(patch_file_address) if patch_size > cf.MAXIMUM_PATCH_SIZE_FOR_DB_STORAGE: continue @@ -385,23 +393,18 @@ def extract_commits(repo_url, hashes, cached_repo_address=None): repo_files.extend(commit_files) repo_methods.extend(commit_methods) except Exception as e: - print(f'Problem while fetching the commits1: {e}') + cf.logger.error(f'Problem while fetching the commits1: {e}') except Exception as e: - print(f'Problem while fetching the commits2: {e}') + cf.logger.error(f'Problem while fetching the commits2: {e}') except Exception as e: + cf.logger.error(f'Extracting commits failed: {e}') try: if not os.path.exists(patch_file_address): - print('Trying to extract commits directly from github') - if 'github' in repo_url: - patch_text = requests.get(f'{repo_url}/commit/{single_hash}.patch').text - open(patch_file_address, 'w+').write(patch_text) + download_patch(repo_url, patch_file_address, single_hash) except Exception as e: print(f'Trying to extract commits directly from github failed {str(e)}') - print(f'Error: {str(e)}') if repo_commits: - print('4') df_repo_commits = pd.DataFrame.from_dict(repo_commits) - print('5') df_repo_commits = df_repo_commits[COMMIT_COLUMNS] # ordering the columns else: df_repo_commits = None @@ -414,12 +417,8 @@ def extract_commits(repo_url, hashes, cached_repo_address=None): df_repo_files = None if repo_methods: - print('10') - df_repo_methods = pd.DataFrame.from_dict(repo_methods) - print('11') df_repo_methods = df_repo_methods[METHOD_COLUMNS] # ordering the - print('12') else: df_repo_methods = None diff --git a/Code/collect_projects.py b/Code/collect_projects.py index a1f709f..2f4f776 100644 --- a/Code/collect_projects.py +++ b/Code/collect_projects.py @@ -1,24 +1,29 @@ import os import shutil +import sys import time from urllib.parse import urlparse +from pathlib import Path import pandas as pd import requests import github from sqlalchemy import text +from tqdm import tqdm import configuration as cf import cve_importer import database as db from Code.cpe_parser import parse_cpe_dict +from Code.registry_to_github import clean_git_url from 
Code.resources.cpe_to_github_search import search_missing_cpes_in_github from Code.resources.cveprojectdatabase import create_cve_mapper_table -from Code.resources.dynamic_commit_collector import add_missing_commits, execute_command, remove_all_directories +from Code.resources.dynamic_commit_collector import add_missing_commits, execute_command, remove_all_directories, \ + is_repo_available, PROJECT_STATUS_REPO_REMOVED from Code.resources.extract_github_repo_from_ghsd import parse_and_append_ghsd_dataset from resources.find_repo_url import apply_cve_cpe_mappers from database import create_session -from collect_commits import extract_commits, extract_project_links +from collect_commits import extract_commits, extract_project_links, download_patch from constants import REPO_COLUMNS from utils import prune_tables @@ -26,6 +31,26 @@ conn = session.connection() + +def add_missing_patches(): + # Specify the directory + directory = Path(cf.PATCH_FILE_STORAGE_PATH) + # Get the list of files in the directory + patches = {f.name: True for f in directory.iterdir() if f.is_file()} + + res = db.get_query("SELECT hash, repo_url from fixes where extraction_status='COMPLETED'") + for r in tqdm(res): + if len(r['hash'])<=5: continue + repo_name = (r['repo_url'].lstrip('https://')).replace('/', '_') + patch_path = os.path.join(cf.PATCH_FILE_STORAGE_PATH, f"{repo_name}_{r['hash']}.patch") + if patches.get(f"{repo_name}_{r['hash']}.patch"): + # Do not download if patch already exists + continue + download_patch(r['repo_url'], patch_path, r['hash']) + time.sleep(5) + + + def create_fixes_table(): query = text('''CREATE TABLE IF NOT EXISTS fixes ( @@ -216,36 +241,42 @@ def get_github_repo_meta(repo_url: str, username: str, token): """ returns github meta-information of the repo_url """ - - # handle renamed repos - repo_url = extract_location_header(repo_url) - - repo_url = repo_url.rstrip('/') - owner, project = repo_url.split('/')[-2], repo_url.split('/')[-1] - meta_row = {} - - if username == 'None': - git_link = github.Github() - else: - git_link = github.Github(login_or_token=token, user_agent=username) - print(owner) - git_user = git_link.get_user(owner) - print(project) - repo = git_user.get_repo(project) - meta_row = {'repo_url': repo_url, - 'repo_name': repo.full_name, - 'description': repo.description, - 'date_created': repo.created_at, - 'date_last_push': repo.pushed_at, - 'homepage': repo.homepage, - 'repo_language': repo.language, - 'forks_count': repo.forks, - 'stars_count': repo.stargazers_count, - 'owner': owner} + try: + repo_status = is_repo_available(repo_url) + print(repo_status) + if repo_status == 'Removed': + return None + # handle renamed repos + repo_url = extract_location_header(repo_url) + + repo_url = repo_url.rstrip('/') + owner, project = repo_url.split('/')[-2], repo_url.split('/')[-1] + meta_row = {} + + if username == 'None': + git_link = github.Github() + else: + git_link = github.Github(login_or_token=token, user_agent=username) + cf.logger.info(f"Getting github meta information for {repo_url}") + git_user = git_link.get_user(owner) + repo = git_user.get_repo(project) + meta_row = {'repo_url': repo_url, + 'repo_name': repo.full_name, + 'description': repo.description, + 'date_created': repo.created_at, + 'date_last_push': repo.pushed_at, + 'homepage': repo.homepage, + 'repo_language': repo.language, + 'forks_count': repo.forks, + 'stars_count': repo.stargazers_count, + 'owner': owner} + return meta_row + except Exception as e: + cf.logger.error(f"Getting meta information failed 
for repo url failed {e}") + return None # except BadCredentialsException as e: # cf.logger.warning(f'Credential problem while accessing GitHub repository {repo_url}: {e}') # pass # or exit(1) - return meta_row def save_repo_meta(repo_url): @@ -257,16 +288,27 @@ def save_repo_meta(repo_url): new_conn = new_session.connection() # ignore when the meta-information of the given repo is already saved. - + repo_url = clean_git_url(repo_url) try: - if db.table_exists('repository') and db.get_one_query(f"select * from repository where repo_url='{repo_url}'"): - return + if db.get_one_query(f"select * from repository where repo_url='{repo_url}'"): + return 'FIX_WAS_AVAILABLE' if 'github.' in repo_url: meta_dict = get_github_repo_meta(repo_url, cf.USER, cf.TOKEN) - df_meta = pd.DataFrame([meta_dict], columns=REPO_COLUMNS) - df_meta.to_sql(name='repository', con=new_conn, if_exists="append", index=False) + if not meta_dict: + # Repository removed, most likely the next extraction process will fail. + return 'REPO_REMOVED' + # Build the insert SQL query + insert_query = text(f""" + INSERT INTO repository ({', '.join(REPO_COLUMNS)}) + VALUES ({', '.join([':{}'.format(col) for col in REPO_COLUMNS])}) + ON CONFLICT (repo_url) DO NOTHING; + """) + # Execute the SQL statement using parameterized query + new_conn.execute(insert_query, meta_dict) new_conn.commit() + return True except Exception as e: + print("Inserting new repository resulted in error.") cf.logger.warning(f'Problem while fetching repository meta-information: {e}') finally: new_conn.close() @@ -327,11 +369,6 @@ def fetch_and_store_commits(): commit_fixes_query = f"SELECT * FROM fixes where score >= {THRESHOLD_SCORE} and extraction_status = 'NOT_STARTED' " if db.table_exists('commits'): commit_fixes_query += ' and hash not in (select distinct hash from commits)' - try: - db.exec_query('ALTER TABLE commits ADD CONSTRAINT hash_unique_constraint UNIQUE (hash, repo_url);') - conn.commit() - except Exception as e: - print(e) # if db.table_exists('file_change'): # try: # db.exec_query('CREATE UNIQUE INDEX hashdiffoldpath_unique_index ON file_change (hash, diff, old_path);') @@ -350,12 +387,16 @@ def fetch_and_store_commits(): pcount = 0 for repo_url in repo_urls: - save_repo_meta(repo_url) + status = save_repo_meta(repo_url) + if status == 'REPO_REMOVED': + cf.logger.info(f"Repo removed! 
{repo_url}") + db.exec_query(f"UPDATE fixes SET extraction_status='{PROJECT_STATUS_REPO_REMOVED}' where repo_url='{repo_url}' and extraction_status='NOT_STARTED'") + continue pcount += 1 session = create_session() conn = session.connection() - + repo_path='' try: df_single_repo = df_fixes[df_fixes.repo_url == repo_url] hashes = list(df_single_repo.hash.unique()) @@ -365,6 +406,13 @@ def fetch_and_store_commits(): repo_path = repo_cache_dict.get(repo_url, clone_memo_repo(repo_url)) df_commit, df_file, df_method = extract_commits(repo_url, hashes, repo_path) # remove_directory(repo_path) + for single_hash in hashes: + repo_name = (repo_url.lstrip('https://')).replace('/', '_') + patch_path = os.path.join(cf.PATCH_FILE_STORAGE_PATH, f"{repo_name}_{single_hash}.patch") + if os.path.exists(patch_path): continue + cf.logger.warning(f'Trying to download patch files directly {repo_url}-{single_hash}') + cf.logger.info(f"DL Path: {patch_path}") + download_patch(repo_url, patch_path, single_hash) if df_commit is None: cf.logger.warning(f'Could not retrieve commit information from: {repo_url}') @@ -386,7 +434,7 @@ def fetch_and_store_commits(): conn.execute(sql, { 'hash': row['hash'], - 'repo_url': row['repo_url'], + 'repo_url': clean_git_url(row['repo_url']), 'author': row['author'], 'committer': row['committer'], 'msg': row['msg'], @@ -410,6 +458,7 @@ def fetch_and_store_commits(): df_file.to_sql(name="file_change", con=conn, if_exists="append", index=False) else: for index, row in df_file.iterrows(): + #TODO: Require a double check for duplicated in file_change, method_change sql = text(''' INSERT INTO file_change (file_change_id,hash,filename,old_path,new_path,change_type,diff,diff_parsed,num_lines_added,num_lines_deleted,code_after,code_before,nloc,complexity,token_count,programming_language) VALUES (:file_change_id,:hash,:filename,:old_path,:new_path,:change_type,:diff,:diff_parsed,:num_lines_added,:num_lines_deleted,:code_after,:code_before,:nloc,:complexity,:token_count,:programming_language) @@ -443,13 +492,14 @@ def fetch_and_store_commits(): conn.commit() hash_query = str(hashes)[1:-1] - print(f"UPDATE fixes SET extraction_status='COMPLETED' where hash in ({hash_query})") db.exec_query(f"UPDATE fixes SET extraction_status='COMPLETED' where hash in ({hash_query})") except Exception as e: # cf.logger.warning(f'Problem occurred while retrieving the project: {repo_url}: {e}') print(f'Problem occurred while retrieving the project: {repo_url}: {e}') # pass # skip fetching repository if is not available. 
+ finally: + remove_directory(repo_path) conn.commit() cf.logger.debug('-' * 70) @@ -491,35 +541,33 @@ def remove_lowscore_fixes(min_score): if __name__ == '__main__': + print('Starting ...') + start_time = time.perf_counter() - if False: - print('Starting ...') - start_time = time.perf_counter() + print('Importing CVEs') + # Step (1) save CVEs(cve) and cwe tables + cve_importer.import_cves() - print('Importing CVEs') - # Step (1) save CVEs(cve) and cwe tables - cve_importer.import_cves() + print('Parsing & extracting NVD dataset') + populate_fixes_table() - print('Parsing & extracting NVD dataset') - populate_fixes_table() + print('Parsing & Adding GHSD dataset') - print('Parsing & Adding GHSD dataset') + # Parse & append GHSD dataset + parse_and_append_ghsd_dataset() - # Parse & append GHSD dataset - parse_and_append_ghsd_dataset() + # Step (2.2) Find any CVE that have no Github fix using CPE + # Parse official CPE dictionary + parse_cpe_dict() - # Step (2.2) Find any CVE that have no Github fix using CPE - # Parse official CPE dictionary - parse_cpe_dict() - - apply_cve_cpe_mappers() - # - # end_time = time.perf_counter() - # hours, minutes, seconds = convert_runtime(start_time, end_time) - # cf.logger.info(f'Time elapsed to pull the data {hours:02.0f}:{minutes:02.0f}:{seconds:02.0f} (hh:mm:ss).') + apply_cve_cpe_mappers() + # + # end_time = time.perf_counter() + # hours, minutes, seconds = convert_runtime(start_time, end_time) + # cf.logger.info(f'Time elapsed to pull the data {hours:02.0f}:{minutes:02.0f}:{seconds:02.0f} (hh:mm:ss).') - # Step (2.3) Run prospector on cve_project table, to find out all fixing commits. - add_missing_commits() + # Step (2.3) Run prospector on cve_project table, to find out all fixing commits. + add_missing_commits() # remove_lowscore_fixes(cf.MINIMUM_COMMIT_SCORE) # Step (3) save commit-, file-, and method- level data tables to the database @@ -533,6 +581,7 @@ def remove_lowscore_fixes(min_score): # fix_column_types() # else: # cf.logger.warning('Data pruning is not possible because there is no information in method_change table') + add_missing_patches() cf.logger.info('The database is up-to-date.') cf.logger.info('-' * 70) # --------------------------------------------------------------------------------------------------------------------- diff --git a/Code/configuration.py b/Code/configuration.py index 3e18759..8998914 100644 --- a/Code/configuration.py +++ b/Code/configuration.py @@ -4,16 +4,17 @@ from pathlib import Path from dotenv import load_dotenv import os - +import multiprocessing as mp load_dotenv('.env') # set sensible defaults for thTe configurable fields DATA_PATH = 'Data' DATABASE_NAME = 'CVEfixes_sample.db' -USER = None -TOKEN = None +USER = os.getenv('GITHUB_USER', None) +TOKEN = os.getenv('GITHUB_TOKEN', None) SAMPLE_LIMIT = 0 -NUM_WORKERS = 30 +NUM_WORKERS = 8 +PROSPECTOR_WORKERS = min(mp.cpu_count() - 1, 15) # Anything more than 20 will result in rate limits. 
LOGGING_LEVEL = logging.WARNING PROSPECTOR_PYTHON_PATH = os.getenv('PROSPECTOR_PYTHON_PATH') diff --git a/Code/constants.py b/Code/constants.py index 08c11e0..abf3de5 100644 --- a/Code/constants.py +++ b/Code/constants.py @@ -45,7 +45,7 @@ 'rel_type' ] -REPO_BLACK_LIST_WORDS_PATTERN = re.compile(r'bugbounty|0day|injection|advisor|GHSA-|zero-day|exploit|poc|cve|vulnerabil|\.github\.io', +REPO_BLACK_LIST_WORDS_PATTERN = re.compile(r'bugbounty|0day|injection|advisor|GHSA-|zero-day|exploit|poc|cve|vulnerabil|malware|\.github\.io', re.IGNORECASE) REPO_BLACK_LIST_EXACT_WORDS_PATTERN = [ 'research', @@ -118,6 +118,9 @@ '0day', 'IBOS_4.4.3', 'ttt', + 'IoT-vulnerable', + 'security-research', + 'cxcxcxcxcxcxcxc', ] REPO_BLACK_LIST_EXACT_WORDS_PATTERN = list(map(str.lower, REPO_BLACK_LIST_EXACT_WORDS_PATTERN)) diff --git a/Code/cpe_parser.py b/Code/cpe_parser.py index 73e6afd..02a712e 100644 --- a/Code/cpe_parser.py +++ b/Code/cpe_parser.py @@ -90,12 +90,10 @@ def parse_cpe_dict(): 'repo_url': repo_url, 'rel_type': rel_type, }) + session.commit() print(f"Inserted {iz} cpe->repository mapping tuples") print(f"Total blacklisted CPEs: {total_blacklisted_count}") print('Adding missing CPEs based on Github availability') - - # TODO: UNCOMMENT BELOW search_missing_cpes_in_github() - session.commit() \ No newline at end of file diff --git a/Code/registry_to_github.py b/Code/registry_to_github.py index c1cd037..582852d 100644 --- a/Code/registry_to_github.py +++ b/Code/registry_to_github.py @@ -43,14 +43,11 @@ def clean_git_url(url): for prefix in prefixes: if url.startswith(prefix): url = url[len(prefix):] - # Remove '.git' extension if present if url.endswith(".git"): url = url[:-len(".git")] - # Remove user info from ssh urls (e.g., git@) url = url.replace("git@", "").replace('/tree/main', '') - return f"https://{url}" @@ -78,6 +75,7 @@ def get_version(package_name, ecosystem): return try: package_name = quote(package_name, safe="") + if package_name.startswith('vuln%2FGO'): return None version = get_version(package_name, ecosystem) api = f'https://api.deps.dev/v3alpha/systems/{ecosystem}/packages/{package_name}/versions/{version}' response = requests.get(api) diff --git a/Code/resources/cpe_to_github_search.py b/Code/resources/cpe_to_github_search.py index 28cebe9..fcdbc8d 100644 --- a/Code/resources/cpe_to_github_search.py +++ b/Code/resources/cpe_to_github_search.py @@ -11,6 +11,7 @@ import requests from Code.resources.dynamic_commit_collector import execute_command, is_repo_available +import Code.configuration as cf def exists_in_github(cpe): @@ -33,6 +34,7 @@ def exists_in_github(cpe): def search_missing_cpes_in_github(): + print("Search missing CPEs started") cpes = get_query( 'select distinct cpe_name from cve_cpe_mapper where cpe_name not in(select distinct cpe_name from cpe_project)') @@ -40,7 +42,7 @@ def search_missing_cpes_in_github(): conn = session.connection() print(f'Searching for missing CPES... 
{len(cpes)}') total_blacklisted = 0 - with mp.Pool(processes=5) as pool, tqdm(total=len(cpes)) as progress_bar: + with mp.Pool(processes=cf.NUM_WORKERS) as pool, tqdm(total=len(cpes)) as progress_bar: new_cpes = list(tqdm(pool.imap_unordered(exists_in_github, cpes), total=len(cpes))) for repo in new_cpes: if not repo: @@ -55,4 +57,6 @@ def search_missing_cpes_in_github(): 'repo_url': repo_address, 'rel_type': GITREF_CPE_SEARCH, }) + print("Commiting") conn.commit() + print("Done") diff --git a/Code/resources/dynamic_commit_collector.py b/Code/resources/dynamic_commit_collector.py index 6412585..f4f97f6 100644 --- a/Code/resources/dynamic_commit_collector.py +++ b/Code/resources/dynamic_commit_collector.py @@ -9,9 +9,11 @@ from bs4 import BeautifulSoup import psutil from sqlalchemy import text - +from pathlib import Path import multiprocessing as mp +import Code.configuration as cf + from tqdm import tqdm from Code.database import create_session, fetchone_query, get_query, get_one_query, exec_query @@ -19,6 +21,8 @@ PROSPECTOR_GIT_CACHE, HARDWARE_RESOURCE_THRESHOLD_PERCENT, TOKEN import orjson +from Code.registry_to_github import is_black_list + # A global lock to prevent prospector working on multpiple CVES from save project @@ -29,18 +33,21 @@ PROJECT_STATUS_REPO_REMOVED = 'REPO_REMOVED' PROJECT_STATUS_PROSPECTOR_FAILED = 'PROSPECTOR_FAILED' PROJECT_STATUS_NO_FIX_FOUND = 'NO_FIX_WAS_FOUND' +PROJECT_STATUS_BLOCK_LISTED = 'BLOCK_LISTED' PROJECT_STATUS_FIX_FOUND = 'Success' DISK_USAGE_THRESHOLD = 50 +Path(PROSPECTOR_GIT_CACHE).mkdir(parents=True, exist_ok=True) + + def is_repo_available(url): try: # Send an HTTP GET request to the repository's web page response = requests.get(url, headers={ 'Authorization': f'Bearer {TOKEN}' }) - # Check if the request was successful # What if it's renamed? if response.status_code == 200: @@ -50,6 +57,8 @@ def is_repo_available(url): # TODO: Add specific access rate limit? Code: 429 if response.status_code == 429: + cf.logger.error("We reached a rate limit! Better stop now") + time.sleep(60) return 'Unavailable' # if response.status_code == 404: return 'Removed' @@ -218,6 +227,9 @@ def process_commits(dict_input): git_repo_lock_list.append(project_url) print(f'{project_url} -> LOCKING {git_repo_lock_list}') print(f'Starting finding candidates for {cve} ...') + if is_black_list(project_url): + exec_query(f"UPDATE cve_project SET checked = '{PROJECT_STATUS_BLOCK_LISTED}' WHERE id = '{id}'") + return exec_query(f"UPDATE cve_project SET checked = '{PROJECT_STATUS_FINDING_FIX}' WHERE id = '{id}'") repo_status = is_repo_available(project_url) if repo_status == 'Removed': @@ -311,8 +323,8 @@ def add_missing_commits(years=None): print(f"Still to low ... performing full cache wipe") cleanup() # cpu_count = mp.cpu_count() - 1 - # cpu_count = mp.cpu_count() - 1 - cpu_count = 20 # Anything more than it will result in rate limit ... + cpu_count = cf.PROSPECTOR_WORKERS + # cpu_count = 20 # Anything more than it will result in rate limit ... 
for i in range(len(cve_projects)): cve_projects[i]['lock_list'] = git_repo_lock_list diff --git a/Code/resources/extract_github_repo_from_ghsd.py b/Code/resources/extract_github_repo_from_ghsd.py index 3a87aee..2cea227 100644 --- a/Code/resources/extract_github_repo_from_ghsd.py +++ b/Code/resources/extract_github_repo_from_ghsd.py @@ -155,6 +155,6 @@ def parse_and_append_ghsd_dataset(): }) conn.commit() print("Data inserted into 'fixes' table successfully.") - cf.logger.error(f"After black list counter {total_blacklisted}") + cf.logger.info(f"After black list counter {total_blacklisted}") # populate_fixes_table() # parse_and_append_ghsd_dataset() \ No newline at end of file diff --git a/Code/utils.py b/Code/utils.py index bd634e1..3467abd 100644 --- a/Code/utils.py +++ b/Code/utils.py @@ -7,6 +7,7 @@ import configuration as cf import database as db +from Code.resources.dynamic_commit_collector import PROJECT_STATUS_REPO_REMOVED from database import create_session session = create_session() @@ -102,7 +103,7 @@ def prune_tables(datafile): # copyfile(datafile, str(datafile).split('.')[0] + '_raw.db') from Code.collect_projects import save_repo_meta - for r in db.get_query('select distinct repo_url from fixes where repo_url not in (select repo_url from repository)'): + for r in db.get_query(f'select distinct repo_url from fixes where extraction_status!="{PROJECT_STATUS_REPO_REMOVED}" and repo_url not in (select repo_url from repository)'): save_repo_meta(r['repo_url']) @@ -118,7 +119,6 @@ def prune_tables(datafile): # processing commit, file and method tables for filtering out some invalid records df_commit['repo_url'] = df_commit.repo_url.apply(lambda x: x.rsplit('.git')[0]) df_commit = df_commit.drop_duplicates().reset_index(drop=True) - df_repo = df_repo.drop_duplicates().reset_index(drop=True) invalid_hashes = set(list(df_commit.hash.unique())).difference(set(list(df_fixes.hash.unique()))) # replace short hash of fix table with long hash from the commits table diff --git a/README.md b/README.md index b442f2e..af6304e 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,9 @@ It's highly recommended to first read the MoreFixes paper and understand differe This tool is consisted of two main components(Morefixes and Prospector) and two data sources(NVD and GSAD) ### Configure Morefixes +> [!CAUTION] +> You have to configure Prospector as well to make Morefixes work +Make sure you have cloned this repository with submodules. MoreFixes structure itself is based on [CVEFixes project](https://github.com/secureIT-project/CVEfixes). Add the Github security advisory database(https://github.com/github/advisory-database) in `Code/resources/ghsd` to get latest vulnerabilities list. Then, create a virtual python environment(recommended) in the repo root directory, and install dependencies: @@ -64,17 +67,27 @@ Then, create a virtual python environment(recommended) in the repo root director Renamed `env.sample` to `.env` and update the fields in `.env` and `.CVEfixes.ini` in tool root directory, Note that these values should be same for similar services(for example posgtresql database credentials) related to each other. -### Configure prospector -We are not planning to keep prospector in this repository, and instead, fetch latest Prospector form ProjectKB. As a temporary workaround, you'll need to update the modified version of prospector(which is available in this repository). -Update 'config.yaml' in `/prospector` path, and copy the current `.env` file to `/prospector` directory as well. 
This mess will be fixed in the future :)
-Create a separate virtual environment in `/prospector` and install requirements for prospector(`pip install -r requirements.txt`). Update python executor path in `runner.sh` if the virtual environment directory name is not 'venv'.
+### Configure prospector
+Update 'config.yaml' in the `/prospector` path, and copy the current `.env` file to the `/prospector` directory as well.
+Create a separate virtual environment in `/prospector` named `venv` and install the requirements for Prospector (`pip install -r requirements.txt`).
+In the Prospector venv, run:
+```
+python -m spacy download en_core_web_sm
+python -m spacy download en_core_web_lg
+python -m spacy download en
+```
+> [!WARNING]
+> If the virtual environment directory name is not 'venv', update the Python executor path in `runner.sh`.
+
+> [!WARNING]
+> We are not planning to keep Prospector in this repository; this is a temporary workaround.

 ### Run the tool
-If you want to update the dataset for new CVEs, run the tool by executing `bash Code/run.sh`. This will first update the GHSA dataset in `/Code/resources/advisory-database` and download latest CPE Dictionary from NVD and starts the whole flow mentioned in the figure 1 of the paper.
+We HIGHLY recommend restoring the latest backup first and then running the tool so that it only processes new advisories. Otherwise,
+**deactivate the Prospector venv and switch to the Morefixes venv**, then run the tool by executing `bash Code/run.sh`. This will first update the GHSA dataset in `/Code/resources/advisory-database`, download the latest CPE dictionary from NVD, and start the whole flow shown in Figure 1 of the paper.

 Please note we don't recommend running it on a low-end device and using the published dataset should be an easier choice if you just need to work with the available data.
-
 ### Troubleshooting
 One of the heaviest modules of this software, is located at `Code/resources/dynamic_commit_collector.py`, which will process possible fix commits in parallel. If you need to run the software from scratch, make sure to double-check parameters in this page to make sure your system won't break during processing.
diff --git a/docker-compose.yml b/docker-compose.yml index 07d192b..7464615 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -9,7 +9,7 @@ services: env_file: - .env ports: - - "127.0.0.1:5432:5432" + - "${POSTGRES_PORT}:${POSTGRES_PORT}" volumes: - postgres_data2:/var/lib/postgresql/data - ./dump_morefixes_27-03-2024_19_52_58.sql:/docker-entrypoint-initdb.d/dump_morefixes_27-03-2024_19_52_58.sql diff --git a/env.sample b/env.sample index 18a3f47..baf2923 100644 --- a/env.sample +++ b/env.sample @@ -1,12 +1,12 @@ MIN_COMMIT_SCORE=65 -GIT_CACHE=/ssddata/jafar/tmp -PROSPECTOR_GIT_CACHE=/ssddata/jafar/tmp -PROSPECTOR_PATH=/ssddata/jafar/project-kb/prospector/ -PROSPECTOR_PYTHON_PATH=/ssddata/jafar/project-kb/prospector/myenv/bin/python -PATCH_FILE_STORAGE_PATH=/pool0/data/jef/cvedataset-patches/ -CVE_DATA_PATH=/ssddata/jafar/tmp/cvedata1112222 +GIT_CACHE=/tmp/gitcache +PROSPECTOR_GIT_CACHE=/tmp/proscache +PROSPECTOR_PATH=/home/jef/projects/Morefixes/prospector/ +PROSPECTOR_PYTHON_PATH=/home/jef/projects/Morefixes/prospector/venv/bin/python +PATCH_FILE_STORAGE_PATH=/home/jef/projects/Morefixes/patchesdir/cvedataset-patches +CVE_DATA_PATH=/tmp/cvedata1112222 POSTGRES_USER=postgrescvedumper -POSTGRES_PORT=9921 +POSTGRES_PORT=5432 POSTGRES_DBNAME=postgrescvedumper POSTGRES_DB=postgrescvedumper POSTGRES_PASSWORD=a42a18537d74c3b7e584c769152c3d diff --git a/prospector/config.yaml b/prospector/config.yaml index 5c73552..7cb29bd 100644 --- a/prospector/config.yaml +++ b/prospector/config.yaml @@ -4,7 +4,7 @@ preprocess_only: False # Maximum number of commits to process -max_candidates: 5000 +max_candidates: 3000 fetch_references: True @@ -41,7 +41,7 @@ report: log_level: INFO # The directory used to cache the cloned repositories -git_cache: /ssddata/jafar/tmp +git_cache: /tmp/gitcache # The GitHub API token github_token: diff --git a/prospector/runner.sh b/prospector/runner.sh index 5e918c2..bac3a47 100755 --- a/prospector/runner.sh +++ b/prospector/runner.sh @@ -1,5 +1,5 @@ #!/bin/bash cd $(dirname "$0") source venv/bin/activate -#timeout 3000 python3 cli/main.py "$@" -python3 cli/main.py "$@" +timeout 3000 python3 cli/main.py "$@" +#python3 cli/main.py "$@"
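
Reviewer-style sketch, not part of the patch above: the new `download_patch()` helper calls `requests.get()` without a timeout and writes the patch through an unclosed file handle, and the companion code in `add_missing_patches()`/`fetch_and_store_commits()` builds patch filenames with `repo_url.lstrip('https://')`, which strips a *character set* (h, t, p, s, :, /) rather than the literal prefix, so hosts whose name begins with one of those characters would be mangled. The snippet below shows one possible hardened variant under those assumptions; the names `repo_slug` and the `timeout` parameter are illustrative, not existing project APIs.

```python
# Hedged sketch of a hardened patch-download helper, mirroring the
# "<repo_url>/commit/<sha>.patch" endpoint and the "<repo>_<sha>.patch"
# filename convention used in the diff above.
import os
from urllib.parse import urlparse

import requests


def repo_slug(repo_url: str) -> str:
    """Turn e.g. https://github.com/owner/project into github.com_owner_project.

    Parsing the URL avoids the str.lstrip('https://') pitfall, which strips
    characters (h, t, p, s, :, /) rather than the literal "https://" prefix.
    """
    parsed = urlparse(repo_url)
    return f"{parsed.netloc}{parsed.path}".rstrip('/').replace('/', '_')


def download_patch(repo_url: str, patch_file_address: str, sha: str, timeout: int = 30) -> bool:
    """Fetch <repo_url>/commit/<sha>.patch and store it, skipping existing files."""
    if os.path.exists(patch_file_address):
        return True
    if 'github.com' not in repo_url:
        return False
    # Bounded request so a stalled download cannot hang the pipeline.
    res = requests.get(f"{repo_url}/commit/{sha}.patch", timeout=timeout)
    if res.status_code != 200:
        return False
    # Context manager guarantees the file handle is flushed and closed.
    with open(patch_file_address, 'w') as fh:
        fh.write(res.text)
    return True


if __name__ == '__main__':
    print(repo_slug('https://github.com/github/advisory-database'))
    # -> github.com_github_advisory-database
```

If the lstrip-based naming is kept for compatibility with patch files already on disk, the request timeout and the context-managed write are still worth folding in on their own.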