Skip to content

Commit

Permalink
Merge pull request #137 from attilamester/feature/129-augmentation-on…
Browse files Browse the repository at this point in the history
…-callgraph-object

Feature/129 augmentation on callgraph object
  • Loading branch information
attilamester authored Mar 28, 2024
2 parents fab30c5 + f7ea9f2 commit 1db3dc5
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 17 deletions.
6 changes: 3 additions & 3 deletions src/core/data/bodmas.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,13 +108,13 @@ def filename_from_sha256(cls, sha256):
return f"unpacked_{sha256}.exe"


class BodmasPymetangined(Bodmas):
class BodmasAugmented(Bodmas):

@classmethod
def get_dir_samples(cls):
return os.path.join(os.path.dirname(Bodmas.get_dir_samples()), "pymetangined")
return os.path.join(os.path.dirname(Bodmas.get_dir_samples()), "augmented")

@classmethod
def get_dir_analysis(cls):
return os.path.join(os.path.dirname(Bodmas.get_dir_analysis()),
os.path.basename(Bodmas.get_dir_analysis()) + "_pymetangined")
os.path.basename(Bodmas.get_dir_analysis()) + "_augmented")
53 changes: 40 additions & 13 deletions src/core/processors/r2_scanner/augmentation_metaengine.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,18 @@
import random
import shutil
import subprocess
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from typing import Type

import pandas as pd

from core.data import DatasetProvider
from core.data.bodmas import Bodmas, BodmasPymetangined
from core.data.bodmas import Bodmas, BodmasAugmented
from core.model.sample import Sample
from core.processors.cg_image_classification.paths import get_cg_image_classification_env
from core.processors.r2_scanner.scan_samples import scan_sample
from core.processors.util import decorator_sample_processor
from helpers.ground_truth import BODMAS_GROUND_TRUTH_CSV, BODMAS_GROUND_TRUTH_EXT_AUGM_CSV
from core.processors.util import decorator_sample_processor, process_samples
from helpers.ground_truth import BODMAS_GROUND_TRUTH_CSV, BODMAS_GROUND_TRUTH_WITH_AUGM_CSV
from helpers.ground_truth import (BODMAS_GT_COL0,
BODMAS_GT_COL1_ts,
BODMAS_GT_COL2_fam,
Expand All @@ -26,6 +26,7 @@
BODMAS_GT_COL9_augmof_sha)
from util import config
from util.logger import Logger
from util.misc import ensure_dir

config.load_env(get_cg_image_classification_env())

Expand Down Expand Up @@ -64,7 +65,7 @@ def augment_pymetangine(input_path: str, output_path: str, random: bool = True):

def augment_pymetangine_sample(sample: Sample):
input_path = sample.filepath
output_path = get_augmented_filename(BodmasPymetangined.get_dir_samples(), os.path.basename(input_path))
output_path = get_augmented_filename(BodmasAugmented.get_dir_samples(), os.path.basename(input_path))

augmented = False
for trial in range(2):
Expand Down Expand Up @@ -135,14 +136,14 @@ def create_augmentation_with_pymetangine(pct=0.2, original_min_occurencies: int
def create_augmentation_ground_truth():
df = pd.read_csv(BODMAS_GROUND_TRUTH_CSV, index_col=BODMAS_GT_COL0)
augm_data = []
for filename in os.listdir(BodmasPymetangined.get_dir_samples()):
for filename in os.listdir(BodmasAugmented.get_dir_samples()):
if "_augm" in filename:
original_sha256 = filename.split("_")[0]
orig_sample = Bodmas.get_sample(sha256=original_sha256)
augm_sample = Sample(filepath=os.path.join(BodmasPymetangined.get_dir_samples(), filename),
augm_sample = Sample(filepath=os.path.join(BodmasAugmented.get_dir_samples(), filename),
sha256=None, check_hashes=False)
shutil.copyfile(augm_sample.filepath, os.path.join(BodmasPymetangined.get_dir_samples(),
BodmasPymetangined.filename_from_sha256(
shutil.copyfile(augm_sample.filepath, os.path.join(BodmasAugmented.get_dir_samples(),
BodmasAugmented.filename_from_sha256(
augm_sample.sha256)))
family = df.loc[original_sha256, "family"]
augm_data.append([original_sha256, None, family,
Expand All @@ -162,11 +163,35 @@ def create_augmentation_ground_truth():
df[BODMAS_GT_COL9_augmof_sha] = ""
df_augm = pd.concat([df, df_augm], ignore_index=True)
df_augm = df_augm[column_order]
df_augm.to_csv(BODMAS_GROUND_TRUTH_EXT_AUGM_CSV, index=False)
df_augm.to_csv(BODMAS_GROUND_TRUTH_WITH_AUGM_CSV, index=False)


@decorator_sample_processor(BodmasPymetangined)
def scan_pymetangine_sample(dset: Type[DatasetProvider], sample: Sample):
def complete_image_collection(augm_images_dir, dim, subdir=None):
    """Copy the images of augmented samples into the ``*_with_augm`` image folder.

    Reads the ground-truth CSV that includes augmented samples, selects every
    row whose "augmentation of (md5)" column is set, and copies that sample's
    image from ``augm_images_dir/subdir`` into
    ``<Bodmas images dir>/images_<W>x<H>_with_augm`` (created if missing).
    A missing source image is reported on stdout and skipped.

    Args:
        augm_images_dir: Root directory holding the augmented samples' images.
        dim: Image dimensions; ``dim[0]`` and ``dim[1]`` are used to build the
            folder and file names.
        subdir: Sub-folder of ``augm_images_dir`` that contains the images.
            The original code read an undefined name ``subdir`` here, so it is
            now an explicit, optional parameter.
            NOTE(review): the default ``images_<W>x<H>`` mirrors the naming of
            the destination folder and is an assumption -- confirm it matches
            the on-disk layout or pass the value explicitly.
    """
    width, height = dim[0], dim[1]
    if subdir is None:
        subdir = f"images_{width}x{height}"

    df = pd.read_csv(BODMAS_GROUND_TRUTH_WITH_AUGM_CSV)
    df.set_index("md5", inplace=True)

    original_images_dir = os.path.join(Bodmas.get_dir_images(), f"images_{width}x{height}_with_augm")
    ensure_dir(original_images_dir)

    # Loop-invariant pieces of the source path and file name.
    source_dir = os.path.join(augm_images_dir, subdir)
    file_suffix = f"_{width}x{height}_True_True.png"

    # Only rows that reference an original sample are augmented samples.
    augmented_rows = df[df[BODMAS_GT_COL8_augmof_md5].notna()]

    # n numbers the augmented rows from 1, whether or not the copy succeeds.
    for n, index in enumerate(augmented_rows.index, start=1):
        file_to_copy = f"{index}{file_suffix}"
        path_to_copy_to = os.path.join(original_images_dir, file_to_copy)
        path_to_copy_from = os.path.join(source_dir, file_to_copy)

        if not os.path.isfile(path_to_copy_from):
            print(f"{n} Error: File not found: {path_to_copy_from}")
            continue

        shutil.copy(path_to_copy_from, path_to_copy_to)

        print(f"{n} Copied \n\t{path_to_copy_from} --> \n\t{path_to_copy_to}")


@decorator_sample_processor(BodmasAugmented)
def scan_augmented_sample(dset: Type[DatasetProvider], sample: Sample):
    """Scan one sample by delegating to ``scan_sample``.

    The function is registered through ``decorator_sample_processor`` with the
    ``BodmasAugmented`` dataset; the body only forwards ``dset`` and ``sample``
    unchanged to ``scan_sample``.

    NOTE(review): what the decorator wraps around this call is defined in
    ``core.processors.util`` and is not visible here -- check it there.
    """
    scan_sample(dset, sample)


Expand All @@ -175,5 +200,7 @@ def scan_pymetangine_sample(dset: Type[DatasetProvider], sample: Sample):
# create_augmentation_with_pymetangine()
# create_augmentation_ground_truth()

# process_samples(BodmasPymetangined, scan_pymetangine_sample, batch_size=1000, max_batches=None,
# process_samples(BodmasAugmented, scan_augmented_sample, batch_size=1000, max_batches=None,
# pool=ThreadPoolExecutor(max_workers=8))

# complete_image_collection(BodmasAugmented.get_dir_images(), (100, 100))
2 changes: 1 addition & 1 deletion src/helpers/ground_truth.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

BODMAS_METADATA_CSV = "/opt/work/bd/BODMAS/bodmas_metadata.csv"
BODMAS_GROUND_TRUTH_CSV = "/opt/work/bd/BODMAS_ground_truth/BODMAS_ground_truth.csv"
BODMAS_GROUND_TRUTH_EXT_AUGM_CSV = "/opt/work/bd/BODMAS_ground_truth/BODMAS_ground_truth_ext-augm.csv"
BODMAS_GROUND_TRUTH_WITH_AUGM_CSV = "/opt/work/bd/BODMAS_ground_truth/BODMAS_ground_truth_with_augm.csv"
"""
extended with unpacked and augmented samples
"""
Expand Down

0 comments on commit 1db3dc5

Please sign in to comment.