
Commit 9270e55
- Fix duplicate repository issues
- Improve adding missing patch files
- Add new block listing
- Better handling of removed projects
JafarAkhondali committed Oct 21, 2024
1 parent cbd57ca commit 9270e55
Showing 16 changed files with 197 additions and 117 deletions.
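The first bullet is implemented in collect_projects.py below by replacing a pandas `to_sql` append with a parameterized `INSERT ... ON CONFLICT (repo_url) DO NOTHING`. A minimal sketch of that duplicate-safe insert pattern, assuming a `repository` table whose `repo_url` column carries a UNIQUE constraint; the column list here is abbreviated (the project's full list lives in constants.REPO_COLUMNS) and the engine setup is illustrative, not the project's database.create_session():

```python
from sqlalchemy import create_engine, text

# Illustrative standalone engine; the project builds its connection in database.py.
engine = create_engine("sqlite:///CVEfixes_sample.db")

REPO_COLUMNS = ["repo_url", "repo_name", "owner"]  # abbreviated for the sketch

def upsert_repository(meta_row: dict) -> None:
    """Insert repository metadata, silently skipping a repo_url that already exists.

    ON CONFLICT ... DO NOTHING only suppresses duplicates when repo_url is covered by
    a UNIQUE constraint or unique index; without one, the same URL can be stored twice.
    """
    insert_query = text(
        f"INSERT INTO repository ({', '.join(REPO_COLUMNS)}) "
        f"VALUES ({', '.join(':' + col for col in REPO_COLUMNS)}) "
        "ON CONFLICT (repo_url) DO NOTHING"
    )
    with engine.begin() as conn:  # begin() commits on success, rolls back on error
        conn.execute(
            text("CREATE TABLE IF NOT EXISTS repository "
                 "(repo_url TEXT UNIQUE, repo_name TEXT, owner TEXT)")
        )
        conn.execute(insert_query, meta_row)

upsert_repository({"repo_url": "https://github.com/github/advisory-database",
                   "repo_name": "github/advisory-database",
                   "owner": "github"})
```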
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "Code/resources/advisory-database"]
path = Code/resources/advisory-database
url = https://github.com/github/advisory-database.git
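The new submodule pins GitHub's advisory-database repository under `Code/resources/`; its contents are not fetched automatically when this commit is pulled. A small sketch of initializing it from Python, equivalent to running `git submodule update --init Code/resources/advisory-database` at the repository root (requires git on PATH):

```python
import subprocess

# Fetch the advisory-database submodule declared in .gitmodules.
subprocess.run(
    ["git", "submodule", "update", "--init", "--", "Code/resources/advisory-database"],
    check=True,
)
```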
29 changes: 14 additions & 15 deletions Code/collect_commits.py
@@ -28,6 +28,15 @@
session = create_session()
conn = session.connection()

def download_patch(repo_url, patch_file_address, hashsha):
if os.path.exists(patch_file_address): return
if 'github.com' in repo_url:
cf.logger.info(f'Trying to download patch file directly from github: {repo_url}/commit/{hashsha}.patch')
res = requests.get(f'{repo_url}/commit/{hashsha}.patch')
if res.status_code != 200: return
patch_text=res.text
cf.logger.info(patch_file_address)
open(patch_file_address, 'w+').write(patch_text)

def extract_commit_url_from_refs(ref_list, cve_id):
# Direct commit URLS
@@ -373,9 +382,8 @@ def extract_commits(repo_url, hashes, cached_repo_address=None):
}
repo_commits.append(commit_row)
# Create patch file from commit
if os.path.exists(patch_file_address):
if not os.path.exists(patch_file_address):
create_git_patch(cached_repo_address, single_hash, cf.PATCH_FILE_STORAGE_PATH, f"{repo_name}_{single_hash}.patch")
print(patch_file_address)
patch_size = get_file_size(patch_file_address)
if patch_size > cf.MAXIMUM_PATCH_SIZE_FOR_DB_STORAGE:
continue
@@ -385,23 +393,18 @@ def extract_commits(repo_url, hashes, cached_repo_address=None):
repo_files.extend(commit_files)
repo_methods.extend(commit_methods)
except Exception as e:
print(f'Problem while fetching the commits1: {e}')
cf.logger.error(f'Problem while fetching the commits1: {e}')
except Exception as e:
print(f'Problem while fetching the commits2: {e}')
cf.logger.error(f'Problem while fetching the commits2: {e}')
except Exception as e:
cf.logger.error(f'Extracting commits failed: {e}')
try:
if not os.path.exists(patch_file_address):
print('Trying to extract commits directly from github')
if 'github' in repo_url:
patch_text = requests.get(f'{repo_url}/commit/{single_hash}.patch').text
open(patch_file_address, 'w+').write(patch_text)
download_patch(repo_url, patch_file_address, single_hash)
except Exception as e:
print(f'Trying to extract commits directly from github failed {str(e)}')
print(f'Error: {str(e)}')
if repo_commits:
print('4')
df_repo_commits = pd.DataFrame.from_dict(repo_commits)
print('5')
df_repo_commits = df_repo_commits[COMMIT_COLUMNS] # ordering the columns
else:
df_repo_commits = None
@@ -414,12 +417,8 @@ def extract_commits(repo_url, hashes, cached_repo_address=None):
df_repo_files = None

if repo_methods:
print('10')

df_repo_methods = pd.DataFrame.from_dict(repo_methods)
print('11')
df_repo_methods = df_repo_methods[METHOD_COLUMNS] # ordering the
print('12')
else:
df_repo_methods = None

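The `download_patch` helper added above relies on github.com serving a plain-text patch for any commit at `<repo_url>/commit/<sha>.patch`. A minimal standalone sketch of the same idea with a request timeout and explicit file handling; the repository URL and hash below are placeholders for illustration:

```python
import os
import requests

def fetch_commit_patch(repo_url: str, sha: str, dest_path: str) -> bool:
    """Download <repo_url>/commit/<sha>.patch to dest_path; True if the file exists afterwards."""
    if os.path.exists(dest_path):
        return True                                   # patch already cached locally
    if 'github.com' not in repo_url:
        return False                                  # the .patch endpoint is GitHub-specific
    resp = requests.get(f"{repo_url.rstrip('/')}/commit/{sha}.patch", timeout=30)
    if resp.status_code != 200:
        return False
    with open(dest_path, 'w', encoding='utf-8') as fh:
        fh.write(resp.text)
    return True

# Placeholder values; a real call would use a fix commit recorded in the fixes table.
fetch_commit_patch("https://github.com/github/advisory-database",
                   "0000000000000000000000000000000000000000",
                   "example.patch")
```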
179 changes: 114 additions & 65 deletions Code/collect_projects.py
@@ -1,31 +1,56 @@
import os
import shutil
import sys
import time
from urllib.parse import urlparse
from pathlib import Path

import pandas as pd
import requests
import github
from sqlalchemy import text
from tqdm import tqdm

import configuration as cf
import cve_importer
import database as db
from Code.cpe_parser import parse_cpe_dict
from Code.registry_to_github import clean_git_url
from Code.resources.cpe_to_github_search import search_missing_cpes_in_github
from Code.resources.cveprojectdatabase import create_cve_mapper_table
from Code.resources.dynamic_commit_collector import add_missing_commits, execute_command, remove_all_directories
from Code.resources.dynamic_commit_collector import add_missing_commits, execute_command, remove_all_directories, \
is_repo_available, PROJECT_STATUS_REPO_REMOVED
from Code.resources.extract_github_repo_from_ghsd import parse_and_append_ghsd_dataset
from resources.find_repo_url import apply_cve_cpe_mappers
from database import create_session
from collect_commits import extract_commits, extract_project_links
from collect_commits import extract_commits, extract_project_links, download_patch
from constants import REPO_COLUMNS
from utils import prune_tables

session = create_session()
conn = session.connection()



def add_missing_patches():
# Specify the directory
directory = Path(cf.PATCH_FILE_STORAGE_PATH)
# Get the list of files in the directory
patches = {f.name: True for f in directory.iterdir() if f.is_file()}

res = db.get_query("SELECT hash, repo_url from fixes where extraction_status='COMPLETED'")
for r in tqdm(res):
if len(r['hash'])<=5: continue
repo_name = (r['repo_url'].lstrip('https://')).replace('/', '_')
patch_path = os.path.join(cf.PATCH_FILE_STORAGE_PATH, f"{repo_name}_{r['hash']}.patch")
if patches.get(f"{repo_name}_{r['hash']}.patch"):
# Do not download if patch already exists
continue
download_patch(r['repo_url'], patch_path, r['hash'])
time.sleep(5)



def create_fixes_table():
query = text('''CREATE TABLE IF NOT EXISTS fixes
(
@@ -216,36 +241,42 @@ def get_github_repo_meta(repo_url: str, username: str, token):
"""
returns github meta-information of the repo_url
"""

# handle renamed repos
repo_url = extract_location_header(repo_url)

repo_url = repo_url.rstrip('/')
owner, project = repo_url.split('/')[-2], repo_url.split('/')[-1]
meta_row = {}

if username == 'None':
git_link = github.Github()
else:
git_link = github.Github(login_or_token=token, user_agent=username)
print(owner)
git_user = git_link.get_user(owner)
print(project)
repo = git_user.get_repo(project)
meta_row = {'repo_url': repo_url,
'repo_name': repo.full_name,
'description': repo.description,
'date_created': repo.created_at,
'date_last_push': repo.pushed_at,
'homepage': repo.homepage,
'repo_language': repo.language,
'forks_count': repo.forks,
'stars_count': repo.stargazers_count,
'owner': owner}
try:
repo_status = is_repo_available(repo_url)
print(repo_status)
if repo_status == 'Removed':
return None
# handle renamed repos
repo_url = extract_location_header(repo_url)

repo_url = repo_url.rstrip('/')
owner, project = repo_url.split('/')[-2], repo_url.split('/')[-1]
meta_row = {}

if username == 'None':
git_link = github.Github()
else:
git_link = github.Github(login_or_token=token, user_agent=username)
cf.logger.info(f"Getting github meta information for {repo_url}")
git_user = git_link.get_user(owner)
repo = git_user.get_repo(project)
meta_row = {'repo_url': repo_url,
'repo_name': repo.full_name,
'description': repo.description,
'date_created': repo.created_at,
'date_last_push': repo.pushed_at,
'homepage': repo.homepage,
'repo_language': repo.language,
'forks_count': repo.forks,
'stars_count': repo.stargazers_count,
'owner': owner}
return meta_row
except Exception as e:
cf.logger.error(f"Getting meta information failed for repo url failed {e}")
return None
# except BadCredentialsException as e:
# cf.logger.warning(f'Credential problem while accessing GitHub repository {repo_url}: {e}')
# pass # or exit(1)
return meta_row


def save_repo_meta(repo_url):
@@ -257,16 +288,27 @@ def save_repo_meta(repo_url):
new_conn = new_session.connection()

# ignore when the meta-information of the given repo is already saved.

repo_url = clean_git_url(repo_url)
try:
if db.table_exists('repository') and db.get_one_query(f"select * from repository where repo_url='{repo_url}'"):
return
if db.get_one_query(f"select * from repository where repo_url='{repo_url}'"):
return 'FIX_WAS_AVAILABLE'
if 'github.' in repo_url:
meta_dict = get_github_repo_meta(repo_url, cf.USER, cf.TOKEN)
df_meta = pd.DataFrame([meta_dict], columns=REPO_COLUMNS)
df_meta.to_sql(name='repository', con=new_conn, if_exists="append", index=False)
if not meta_dict:
# Repository removed, most likely the next extraction process will fail.
return 'REPO_REMOVED'
# Build the insert SQL query
insert_query = text(f"""
INSERT INTO repository ({', '.join(REPO_COLUMNS)})
VALUES ({', '.join([':{}'.format(col) for col in REPO_COLUMNS])})
ON CONFLICT (repo_url) DO NOTHING;
""")
# Execute the SQL statement using parameterized query
new_conn.execute(insert_query, meta_dict)
new_conn.commit()
return True
except Exception as e:
print("Inserting new repository resulted in error.")
cf.logger.warning(f'Problem while fetching repository meta-information: {e}')
finally:
new_conn.close()
@@ -327,11 +369,6 @@ def fetch_and_store_commits():
commit_fixes_query = f"SELECT * FROM fixes where score >= {THRESHOLD_SCORE} and extraction_status = 'NOT_STARTED' "
if db.table_exists('commits'):
commit_fixes_query += ' and hash not in (select distinct hash from commits)'
try:
db.exec_query('ALTER TABLE commits ADD CONSTRAINT hash_unique_constraint UNIQUE (hash, repo_url);')
conn.commit()
except Exception as e:
print(e)
# if db.table_exists('file_change'):
# try:
# db.exec_query('CREATE UNIQUE INDEX hashdiffoldpath_unique_index ON file_change (hash, diff, old_path);')
@@ -350,12 +387,16 @@ def fetch_and_store_commits():
pcount = 0

for repo_url in repo_urls:
save_repo_meta(repo_url)
status = save_repo_meta(repo_url)
if status == 'REPO_REMOVED':
cf.logger.info(f"Repo removed! {repo_url}")
db.exec_query(f"UPDATE fixes SET extraction_status='{PROJECT_STATUS_REPO_REMOVED}' where repo_url='{repo_url}' and extraction_status='NOT_STARTED'")
continue
pcount += 1

session = create_session()
conn = session.connection()

repo_path=''
try:
df_single_repo = df_fixes[df_fixes.repo_url == repo_url]
hashes = list(df_single_repo.hash.unique())
@@ -365,6 +406,13 @@ def fetch_and_store_commits():
repo_path = repo_cache_dict.get(repo_url, clone_memo_repo(repo_url))
df_commit, df_file, df_method = extract_commits(repo_url, hashes, repo_path)
# remove_directory(repo_path)
for single_hash in hashes:
repo_name = (repo_url.lstrip('https://')).replace('/', '_')
patch_path = os.path.join(cf.PATCH_FILE_STORAGE_PATH, f"{repo_name}_{single_hash}.patch")
if os.path.exists(patch_path): continue
cf.logger.warning(f'Trying to download patch files directly {repo_url}-{single_hash}')
cf.logger.info(f"DL Path: {patch_path}")
download_patch(repo_url, patch_path, single_hash)

if df_commit is None:
cf.logger.warning(f'Could not retrieve commit information from: {repo_url}')
@@ -386,7 +434,7 @@ def fetch_and_store_commits():

conn.execute(sql, {
'hash': row['hash'],
'repo_url': row['repo_url'],
'repo_url': clean_git_url(row['repo_url']),
'author': row['author'],
'committer': row['committer'],
'msg': row['msg'],
@@ -410,6 +458,7 @@ def fetch_and_store_commits():
df_file.to_sql(name="file_change", con=conn, if_exists="append", index=False)
else:
for index, row in df_file.iterrows():
#TODO: Require a double check for duplicated in file_change, method_change
sql = text('''
INSERT INTO file_change (file_change_id,hash,filename,old_path,new_path,change_type,diff,diff_parsed,num_lines_added,num_lines_deleted,code_after,code_before,nloc,complexity,token_count,programming_language)
VALUES (:file_change_id,:hash,:filename,:old_path,:new_path,:change_type,:diff,:diff_parsed,:num_lines_added,:num_lines_deleted,:code_after,:code_before,:nloc,:complexity,:token_count,:programming_language)
@@ -443,13 +492,14 @@ def fetch_and_store_commits():
conn.commit()

hash_query = str(hashes)[1:-1]
print(f"UPDATE fixes SET extraction_status='COMPLETED' where hash in ({hash_query})")
db.exec_query(f"UPDATE fixes SET extraction_status='COMPLETED' where hash in ({hash_query})")

except Exception as e:
# cf.logger.warning(f'Problem occurred while retrieving the project: {repo_url}: {e}')
print(f'Problem occurred while retrieving the project: {repo_url}: {e}')
# pass # skip fetching repository if is not available.
finally:
remove_directory(repo_path)
conn.commit()
cf.logger.debug('-' * 70)

@@ -491,35 +541,33 @@ def remove_lowscore_fixes(min_score):


if __name__ == '__main__':
print('Starting ...')
start_time = time.perf_counter()

if False:
print('Starting ...')
start_time = time.perf_counter()
print('Importing CVEs')
# Step (1) save CVEs(cve) and cwe tables
cve_importer.import_cves()

print('Importing CVEs')
# Step (1) save CVEs(cve) and cwe tables
cve_importer.import_cves()
print('Parsing & extracting NVD dataset')
populate_fixes_table()

print('Parsing & extracting NVD dataset')
populate_fixes_table()
print('Parsing & Adding GHSD dataset')

print('Parsing & Adding GHSD dataset')
# Parse & append GHSD dataset
parse_and_append_ghsd_dataset()

# Parse & append GHSD dataset
parse_and_append_ghsd_dataset()
# Step (2.2) Find any CVE that have no Github fix using CPE
# Parse official CPE dictionary
parse_cpe_dict()

# Step (2.2) Find any CVE that have no Github fix using CPE
# Parse official CPE dictionary
parse_cpe_dict()

apply_cve_cpe_mappers()
#
# end_time = time.perf_counter()
# hours, minutes, seconds = convert_runtime(start_time, end_time)
# cf.logger.info(f'Time elapsed to pull the data {hours:02.0f}:{minutes:02.0f}:{seconds:02.0f} (hh:mm:ss).')
apply_cve_cpe_mappers()
#
# end_time = time.perf_counter()
# hours, minutes, seconds = convert_runtime(start_time, end_time)
# cf.logger.info(f'Time elapsed to pull the data {hours:02.0f}:{minutes:02.0f}:{seconds:02.0f} (hh:mm:ss).')

# Step (2.3) Run prospector on cve_project table, to find out all fixing commits.
add_missing_commits()
# Step (2.3) Run prospector on cve_project table, to find out all fixing commits.
add_missing_commits()

# remove_lowscore_fixes(cf.MINIMUM_COMMIT_SCORE)
# Step (3) save commit-, file-, and method- level data tables to the database
@@ -533,6 +581,7 @@ def remove_lowscore_fixes(min_score):
# fix_column_types()
# else:
# cf.logger.warning('Data pruning is not possible because there is no information in method_change table')
add_missing_patches()
cf.logger.info('The database is up-to-date.')
cf.logger.info('-' * 70)
# ---------------------------------------------------------------------------------------------------------------------
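Both `add_missing_patches` and the download loop in `fetch_and_store_commits` above derive the on-disk patch name from the repository URL by dropping the scheme and replacing slashes with underscores. One note on the idiom they use: `str.lstrip('https://')` strips a leading run of the characters h, t, p, s, :, / rather than the literal prefix, which gives the intended result for github.com URLs but not for every host; `removeprefix` (Python 3.9+) states the intent directly. A sketch of an equivalent helper, with a name that is illustrative and not part of the project:

```python
import os

def patch_file_path(storage_dir: str, repo_url: str, commit_hash: str) -> str:
    """Mirror the naming scheme used above: scheme dropped, '/' replaced by '_'."""
    repo_name = (repo_url.removeprefix('https://')
                         .removeprefix('http://')
                         .replace('/', '_'))
    return os.path.join(storage_dir, f"{repo_name}_{commit_hash}.patch")

# e.g. patches/github.com_github_advisory-database_0000....patch
print(patch_file_path("patches",
                      "https://github.com/github/advisory-database",
                      "0000000000000000000000000000000000000000"))
```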
9 changes: 5 additions & 4 deletions Code/configuration.py
@@ -4,16 +4,17 @@
from pathlib import Path
from dotenv import load_dotenv
import os

import multiprocessing as mp

load_dotenv('.env')
# set sensible defaults for the configurable fields
DATA_PATH = 'Data'
DATABASE_NAME = 'CVEfixes_sample.db'
USER = None
TOKEN = None
USER = os.getenv('GITHUB_USER', None)
TOKEN = os.getenv('GITHUB_TOKEN', None)
SAMPLE_LIMIT = 0
NUM_WORKERS = 30
NUM_WORKERS = 8
PROSPECTOR_WORKERS = min(mp.cpu_count() - 1, 15) # Anything more than 20 will result in rate limits.
LOGGING_LEVEL = logging.WARNING

PROSPECTOR_PYTHON_PATH = os.getenv('PROSPECTOR_PYTHON_PATH')
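The configuration change above sources the GitHub credentials from environment variables (loaded with python-dotenv) instead of hard-coding `None`, and caps `PROSPECTOR_WORKERS`, per the inline comment, to avoid rate limits. A minimal sketch of the consumption side, not the project's exact code: an authenticated PyGithub client when a token is present, otherwise an unauthenticated one with a much lower API rate limit. The `.env` keys match the diff; the values are placeholders:

```python
# .env (next to the script), placeholder values:
#   GITHUB_USER=your-github-username
#   GITHUB_TOKEN=ghp_xxxxxxxxxxxxxxxxxxxx
import os
import github                      # PyGithub
from dotenv import load_dotenv

load_dotenv('.env')                # populate os.environ from .env, if the file exists
user = os.getenv('GITHUB_USER')    # None when the variable is not set
token = os.getenv('GITHUB_TOKEN')

if token:
    gh = github.Github(login_or_token=token, user_agent=user or 'CVEfixes')
else:
    gh = github.Github()           # unauthenticated client: far lower rate limit

print(gh.get_rate_limit().core.limit)   # e.g. 5000 authenticated vs. 60 unauthenticated
```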
(Diffs for the remaining 12 changed files are not shown.)
