Skip to content

Commit

Permalink
dynamic analysis 2
Browse files Browse the repository at this point in the history
  • Loading branch information
EtomicBomb committed Jan 12, 2025
1 parent 9aa507e commit 2d9b9a0
Show file tree
Hide file tree
Showing 265 changed files with 1,403 additions and 156 deletions.
5 changes: 5 additions & 0 deletions aurpkg/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ mkdir -p ${OUT}

script="./scripts/pacaur.sh"

BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}
export BENCHMARK_CATEGORY="aurpkg"
export BENCHMARK_SCRIPT="$(realpath "$script")"
export BENCHMARK_INPUT_FILE="$(realpath "$IN")"

# Switch to user "user" to avoid permission issues

echo "$script"
Expand Down
4 changes: 3 additions & 1 deletion bio/deps.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/bin/bash
# install dependencies
required_version="1.7"

Expand Down Expand Up @@ -43,4 +44,5 @@ else
echo "Failed to install the correct version of Samtools."
exit 1
fi
fi
fi

2 changes: 2 additions & 0 deletions bio/input.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/bin/bash

IN=inputs
IN_NAME=input.txt

Expand Down
9 changes: 8 additions & 1 deletion bio/run.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/bin/bash

# create bam files with regions
################### 1KG SAMPLES
IN=inputs
Expand All @@ -8,6 +10,11 @@ if [[ "$@" == *"--small"* ]]; then
IN_NAME=input_small.txt
fi

export BENCHMARK_CATEGORY="bio"
BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}

"$BENCHMARK_SHELL" ./scripts/bio.sh "$IN" "$IN_NAME" "$OUT"
script_file=./scripts/bio.sh
export BENCHMARK_SCRIPT="$(realpath "$script_file")"
export BENCHMARK_INPUT_FILE="$(realpath "$IN_NAME")"

$BENCHMARK_SHELL "$script_file" "$IN" "$IN_NAME" "$OUT"
6 changes: 6 additions & 0 deletions covid-mts/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,15 @@ output_scoped="$outputs_dir/outputs$suffix"
mkdir -p "$output_scoped"

BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}
export BENCHMARK_CATEGORY="covid-mts"
export BENCHMARK_INPUT_FILE="$(realpath "$input_file")"

export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/1.sh")"
$BENCHMARK_SHELL "$scripts_dir/1.sh" "$input_file" > "$output_scoped/1.out"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/2.sh")"
$BENCHMARK_SHELL "$scripts_dir/2.sh" "$input_file" > "$output_scoped/2.out"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/3.sh")"
$BENCHMARK_SHELL "$scripts_dir/3.sh" "$input_file" > "$output_scoped/3.out"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/4.sh")"
$BENCHMARK_SHELL "$scripts_dir/4.sh" "$input_file" > "$output_scoped/4.out"

2 changes: 2 additions & 0 deletions file-enc/deps.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/bin/bash

sudo apt-get update

pkgs='ffmpeg unrtf imagemagick libarchive-tools libncurses5-dev libncursesw5-dev zstd liblzma-dev libbz2-dev zip unzip nodejs tcpdump'
Expand Down
9 changes: 7 additions & 2 deletions file-enc/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ if [[ "$1" == "--small" ]]; then
suffix=".small"
fi

export BENCHMARK_CATEGORY="file-enc"
export BENCHMARK_INPUT_FILE="$(realpath "$input_pcaps")"
BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}
$BENCHMARK_SHELL $scripts_dir/compress_files.sh $input_pcaps $results_dir/compress_files$suffix
$BENCHMARK_SHELL $scripts_dir/encrypt_files.sh $input_pcaps $results_dir/encrypt_files$suffix

export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/compress_files.sh")"
$BENCHMARK_SHELL "$scripts_dir/compress_files.sh" "$input_pcaps" "$results_dir/compress_files$suffix"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/encrypt_files.sh")"
$BENCHMARK_SHELL "$scripts_dir/encrypt_files.sh" "$input_pcaps" "$results_dir/encrypt_files$suffix"
10 changes: 4 additions & 6 deletions infrastructure/Makefile
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
STATIC_OUTPUTS = target/lines_of_code.csv target/nodes_in_scripts.csv target/scripts_to_benchmark.csv target/cyclomatic.csv target/shellmetrics.sh
STATIC_OUTPUTS = target/lines_of_code.csv target/nodes_in_scripts.csv target/cyclomatic.csv target/shellmetrics.sh target/dynamic_analysis.jsonl

static: $(STATIC_OUTPUTS)

target/scripts_to_benchmark.csv: scripts_to_benchmark.py
target/dynamic_analysis.jsonl: dynamic_analysis.py
python3 $< | sort > $@

target/lines_of_code.csv: count_lines_of_code.py
Expand All @@ -22,8 +22,6 @@ target/shellmetrics.sh:
chmod +x $@

target/cyclomatic.csv: get_cyclomatic.py target/shellmetrics.sh
python3 get_cyclomatic.py > $@
python3 get_cyclomatic.py | sort > $@

dynamic:

.PHONY: static dynamic clean-static static-test
.PHONY: static clean-static static-test
Empty file modified infrastructure/colossal_table.py
100644 → 100755
Empty file.
40 changes: 30 additions & 10 deletions infrastructure/data/script-globs.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
{
"aurpkg": {
"scripts": ["aurpkg/scripts/*.sh"]
},
"bio": {
"scripts": ["bio/scripts/bio.sh"]
},
"covid-mts": {
"scripts": ["covid-mts/scripts/*.sh"]
},
Expand All @@ -24,10 +30,7 @@
"scripts": ["sklearn/scripts/run.sh"]
},
"riker": {
"scripts": ["riker/scripts/*/build.sh"]
},
"uniq-ips": {
"scripts": ["uniq-ips/scripts/run.sh"]
"scripts": ["riker/scripts/*/run.sh"]
},
"unix50": {
"scripts": ["unix50/scripts/*.sh"]
Expand All @@ -36,19 +39,36 @@
"scripts": ["web-index/scripts/*.sh"]
},
"makeself": {
"scripts": ["makeself/makeself/*.sh",
"makeself/makeself/test/*/*.sh"]
"scripts": ["makeself/makeself/makeself.sh", "makeself/makeself/test/*/*.sh"]
},
"vps-audit": {
"scripts": ["vps-audit/scripts/*.sh"]
},
"vps-audit-negate": {
"scripts": ["vps-audit-negate/scripts/*.sh"]
},
"bio": {
"scripts": ["bio/scripts/bio.sh"]
"infrastructure/standards/100-files": {
"scripts": ["infrastructure/standards/100-files/scripts/*.sh"]
},
"aurpkg": {
"scripts": ["aurpkg/scripts/pacaur.sh"]
"infrastructure/standards/read-write": {
"scripts": ["infrastructure/standards/read-write/scripts/*.sh"]
},
"infrastructure/standards/shell-memory": {
"scripts": ["infrastructure/standards/shell-memory/scripts/*.sh"]
},
"infrastructure/standards/sleep": {
"scripts": ["infrastructure/standards/sleep/scripts/*.sh"]
},
"infrastructure/standards/time-in-shell-subprocess": {
"scripts": ["infrastructure/standards/time-in-shell-subprocess/scripts/*.sh"]
},
"infrastructure/standards/user-time": {
"scripts": ["infrastructure/standards/user-time/scripts/*.sh"]
},
"infrastructure/standards/user-time-in-shell": {
"scripts": ["infrastructure/standards/user-time-in-shell/scripts/*.sh"]
},
"infrastructure/standards/write-only": {
"scripts": ["infrastructure/standards/write-only/scripts/*.sh"]
}
}
214 changes: 214 additions & 0 deletions infrastructure/dynamic_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
#!/usr/bin/env python3

from typing import Optional
import lzma
from collections import defaultdict
from pathlib import Path
import json
import math
import sys
from dataclasses import dataclass

from project_root import get_project_root

def correct_base(path):
    """Return True when *path* lives under the /benchmarks tree."""
    benchmarks_root = Path('/benchmarks')
    return Path(path).is_relative_to(benchmarks_root)

def rebase(path):
    """Strip the leading /benchmarks component, yielding a repo-relative Path.

    Raises ValueError when *path* is not under /benchmarks.
    """
    absolute = Path(path)
    return absolute.relative_to('/benchmarks')

def is_shell(pid, processes):
    """True when *pid*'s first recorded sample looks like a shell (argv[0] ends in 'sh')."""
    first_sample = next(iter(processes[pid].values()))
    cmdline = first_sample.cmdline
    if not cmdline:
        return False
    return cmdline[0].endswith('sh')

def get_input_files(record):
    """Collect the paths under /benchmarks that *record* had open for reading.

    Write-only ('w') handles and script files (any '.sh' suffix) are
    excluded; an open-file mode outside the expected set is a hard error.
    """
    inputs = set()
    for file_path, _, _, mode, _ in record.open_files:
        assert mode in {'r', 'r+', 'w', 'a'}, f"unknown mode {mode}"
        if mode == 'w':
            continue
        if '.sh' in Path(file_path).suffixes:
            continue
        if correct_base(file_path):
            inputs.add(file_path)
    return inputs

@dataclass(slots=True)
class MortemEntry:
    """Post-mortem record for one monitored top-level process.

    One JSON line per process in a ``*.mortem`` file, matched by pid
    against the samples in the companion ``*.jsonl.xz`` log.
    """
    io_zombie: str                    # /proc/<pid>/io text of the finished process (parsed by get_io)
    stat_zombie: str                  # /proc/<pid>/stat text of the finished process (parsed by get_stat)
    elapsed_secs: str                 # wall-clock duration in seconds, stored as a string (converted with float())
    stat_before: str                  # /proc stat snapshot taken before the run — presumably the monitor's own; cutime/cstime deltas are used as child CPU time (TODO confirm)
    stat_after: str                   # /proc stat snapshot taken after the run (same process as stat_before)
    script: str                       # absolute path of the benchmark script, under /benchmarks
    pid: int                          # pid of the monitored top-level process; keys this record
    benchmark_experiment_start: str   # timestamp at which the experiment started
    category: str                     # benchmark category label (e.g. BENCHMARK_CATEGORY)
    input_file: Optional[str]         # primary input file when the harness recorded one, else None
    sc_clk_tck: int                   # kernel clock ticks per second; converts stat tick counts to seconds

@dataclass(slots=True)
class LogEntry:
    """One periodic sample of a live process, as stored in the ``*.jsonl.xz`` log."""
    pid: int                          # sampled process id
    parent: int                       # parent pid — used to build the process tree
    times: list                       # CPU-time tuple unpacked as PsutilTimes: user, system, children_user, children_system, iowait
    log_current_time: str             # timestamp of this sample; keys the per-pid reading dict
    benchmark_experiment_start: str   # experiment start timestamp (same for all samples of a run)
    cmdline: list[str]                # argv of the process at sample time
    cwd: str                          # working directory at sample time
    create_time: float                # process creation time
    uss: int                          # unique set size — private memory, in bytes (psutil uss)
    num_fds: int                      # number of open file descriptors
    open_files: list                  # 5-tuples; positions 0 and 3 are (path, mode) — see get_input_files

def read_log_file(path):
    """Parse one xz-compressed JSONL process log.

    Each line of the file is a JSON array of sample objects. Returns
    ``(processes, parents, children)`` where ``processes`` maps
    pid -> {log_current_time: LogEntry} (missing pids map to None),
    ``parents`` maps pid -> parent pid (None when never seen), and
    ``children`` maps pid -> set of direct child pids.
    """
    parents = defaultdict(lambda: None)
    children = defaultdict(set)
    samples = defaultdict(list)
    with lzma.open(path, 'r') as lines:
        for batch in lines:
            for raw in json.loads(batch):
                entry = LogEntry(**raw)
                samples[entry.pid].append(entry)
                children[entry.parent].add(entry.pid)
                parents[entry.pid] = entry.parent
    by_time = {
        pid: {entry.log_current_time: entry for entry in entries}
        for pid, entries in samples.items()
    }
    processes = defaultdict(lambda: None, by_time)
    return processes, parents, children

def get_descendents(pid: int, children: dict[int, set[int]]) -> set[int]:
    """Return *pid* plus every transitive child recorded in *children*.

    *children* maps a pid to the set of its direct children. Each node is
    expanded at most once, so a cycle or a shared child in corrupted log
    data cannot cause an infinite loop or duplicated work (the previous
    version pushed children unconditionally and could spin forever).
    """
    descendents: set[int] = set()
    stack = [pid]
    while stack:
        current = stack.pop()
        if current in descendents:
            continue  # already expanded — guards against cycles
        descendents.add(current)
        stack += children[current]
    return descendents

@dataclass
class Stat:
    """CPU-time fields from /proc/<pid>/stat, already converted to seconds.

    Fields are floats: get_stat divides the raw clock-tick counts by
    SC_CLK_TCK when building this record.
    """
    utime: float   # user-mode CPU time of the process itself
    stime: float   # kernel-mode CPU time of the process itself
    cutime: float  # user-mode CPU time of waited-for children
    cstime: float  # kernel-mode CPU time of waited-for children

@dataclass
class PsutilTimes:
    """Per-process CPU times in seconds, unpacked from LogEntry.times.

    Field order matches psutil's cpu_times() result (the log stores it as
    a plain 5-element list).
    """
    user: float             # time in user mode
    system: float           # time in kernel mode
    children_user: float    # user time of terminated children
    children_system: float  # kernel time of terminated children
    iowait: float           # time waiting for I/O — Linux-specific field

def get_stat(stat_file_contents: str, sc_clk_tck: int):
    """Parse CPU times (in seconds) out of a /proc/<pid>/stat snapshot.

    Fields 14-17 of the stat line hold utime/stime/cutime/cstime in clock
    ticks; dividing by *sc_clk_tck* converts them to seconds.
    See https://linux.die.net/man/5/proc.
    """
    fields = stat_file_contents.split()
    utime, stime, cutime, cstime = (
        float(field) / sc_clk_tck for field in fields[13:17]
    )
    return Stat(utime=utime, stime=stime, cutime=cutime, cstime=cstime)

@dataclass
class Io:
    """I/O counters parsed from /proc/<pid>/io."""
    rchar: int  # characters read by the process (rchar line)
    wchar: int  # characters written by the process (wchar line)

def get_io(io_file_contents: str):
    """Parse the rchar/wchar counters out of a /proc/<pid>/io snapshot."""
    counter_lines = io_file_contents.splitlines()

    def counter(prefix):
        # each counter line is "<name>: <value>"; exactly two tokens expected
        _, value = next(l for l in counter_lines if l.startswith(prefix)).split()
        return int(value)

    return Io(rchar=counter('rchar'), wchar=counter('wchar'))

def get_desc(pid, processes):
    """Return the first recorded sample for *pid* (any reading will do)."""
    return next(iter(processes[pid].values()))

def find_shell_process(pid, processes):
    """Locate the pid of the shell that ran the benchmark script.

    Finds the process whose argv[1] ends in 'io_shell.py' (apparently the
    harness wrapper), asserts it has exactly one child, and returns that
    child after checking it really is a shell.

    Raises StopIteration when no io_shell.py process appears in the log.
    The previous version indexed cmdline[1] unconditionally, so any logged
    process with fewer than two argv entries raised IndexError and aborted
    the scan; the length guard below skips such processes instead.
    """
    target = next(
        p for p in processes
        if len(get_desc(p, processes).cmdline) > 1
        and get_desc(p, processes).cmdline[1].endswith('io_shell.py')
    )
    all_children = [p for p in processes if get_desc(p, processes).parent == target]
    assert len(all_children) == 1, "one bash process"
    pid = all_children[0]
    assert get_desc(pid, processes).cmdline[0].endswith('sh')
    return pid

def print_statistics(pid, processes, parents, children, mortem):
    """Aggregate one benchmark run into a single JSON line on stdout.

    *pid* is a top-level monitored process; the shell that actually ran the
    benchmark script is located beneath it, and all metrics are aggregated
    over that shell's descendant tree plus the post-mortem *mortem* record.
    """
    # narrow from the top-level process down to the shell running the script
    pid = find_shell_process(pid, processes)
    assert is_shell(pid, processes)
    descendents = get_descendents(pid, children)

    # the timestamps at which the shell itself was sampled
    all_readings = list(processes[pid].keys())

    stat_zombie = get_stat(mortem.stat_zombie, mortem.sc_clk_tck)
    stat_before = get_stat(mortem.stat_before, mortem.sc_clk_tck)
    stat_after = get_stat(mortem.stat_after, mortem.sc_clk_tck)
    io_zombie = get_io(mortem.io_zombie)

    script = str(rebase(mortem.script))
    # child-CPU-time delta across the run — presumably measured from the
    # monitoring process's own stat (TODO confirm against the harness)
    user = stat_after.cutime - stat_before.cutime
    system = stat_after.cstime - stat_before.cstime
    # peak over sample times of the tree-wide sum of unique set sizes;
    # a pid missing at a given reading simply contributes nothing
    uss = max(
        sum(processes[d][r].uss for d in descendents if r in processes[d])
        for r in all_readings
    )
    read_chars = io_zombie.rchar
    write_chars = io_zombie.wchar

    # "time in shell": CPU used by shell descendants other than the root
    # shell; max over a process's readings picks its last (largest) counter
    tis_user = sum(max(PsutilTimes(*r.times).user for r in processes[d].values()) for d in descendents - {pid} if is_shell(d, processes))
    tis_system = sum(max(PsutilTimes(*r.times).system for r in processes[d].values()) for d in descendents - {pid} if is_shell(d, processes))
    tis_user += stat_zombie.utime # we have a more accurate measurement of the first process
    tis_system += stat_zombie.stime # we have a more accurate measurement of the first process

    # every /benchmarks file any descendant had open for reading,
    # plus the harness-declared input file when present
    input_files = set(p for d in descendents for r in processes[d].values() for p in get_input_files(r))
    if mortem.input_file is not None:
        input_files |= {mortem.input_file}
    input_files = list(str(rebase(p)) for p in input_files)

    duration = float(mortem.elapsed_secs)

    start = mortem.benchmark_experiment_start

    category = mortem.category

    # fd counts: peak for the shell itself, and peak tree-wide sum
    num_fds = max(r.num_fds for r in processes[pid].values())
    children_num_fds = max(
        sum(processes[d][r].num_fds for d in descendents if r in processes[d])
        for r in all_readings
    )

    # one JSONL record per benchmark run
    data = dict(
        script=script,
        user_time=user,
        system_time=system,
        max_unique_set_size=uss,
        read_chars=read_chars,
        write_chars=write_chars,
        user_time_in_shell=tis_user,
        system_time_in_shell=tis_system,
        all_input_files=input_files,
        wall_time=duration,
        start_time=start,
        category=category,
        num_fds=num_fds,
        children_num_fds=children_num_fds,
    )
    print(json.dumps(data))

if __name__ == '__main__':
    root = get_project_root()
    # each benchmark run leaves a <name>.mortem summary plus a matching
    # <name>.jsonl.xz per-process sample log in target/process-logs
    process_logs = root / 'infrastructure' / 'target' / 'process-logs'
    for mortem_path in process_logs.glob('*.mortem'):
        print('processing log', mortem_path.relative_to(root), file=sys.stderr)
        mortems = [
            MortemEntry(**json.loads(line))
            for line in mortem_path.read_text().splitlines()
        ]
        # index post-mortem records by the pid of the monitored process
        mortems = {
            mortem.pid: mortem
            for mortem in mortems
        }
        path = mortem_path.with_suffix('.jsonl.xz')
        processes, parents, children = read_log_file(path)
        # top-level pids are those whose grandparent was never logged
        # (parents is a defaultdict returning None for unknown pids)
        top_level = [pid for pid in processes if parents[parents[pid]] is None]
        for pid in top_level:
            print_statistics(pid, processes, parents, children, mortem=mortems[pid])
2 changes: 1 addition & 1 deletion infrastructure/get_cyclomatic.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
file, _func, _lineno, _lloc, ccn, _lines, _comment, _blank = line.split(',')
file = json.loads(file)
file = Path(file).relative_to(root)
datas[file].append((ccn))
datas[file].append((ccn,))

for file, datas in datas.items():
ccn = sum(float(ccn) for ccn, in datas)
Expand Down
Loading

0 comments on commit 2d9b9a0

Please sign in to comment.