Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dynamic analysis 2 #50

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions aurpkg/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ mkdir -p ${OUT}

script="./scripts/pacaur.sh"

BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}
export BENCHMARK_CATEGORY="aurpkg"
export BENCHMARK_SCRIPT="$(realpath "$script")"
export BENCHMARK_INPUT_FILE="$(realpath "$IN")"

# Switch to user "user" to avoid permission issues

echo "$script"
Expand Down
4 changes: 3 additions & 1 deletion bio/deps.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/bin/bash
# install dependencies
required_version="1.7"

Expand Down Expand Up @@ -43,4 +44,5 @@ else
echo "Failed to install the correct version of Samtools."
exit 1
fi
fi
fi

2 changes: 2 additions & 0 deletions bio/input.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/bin/bash

IN=inputs
IN_NAME=input.txt

Expand Down
9 changes: 8 additions & 1 deletion bio/run.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/bin/bash

# create bam files with regions
################### 1KG SAMPLES
IN=inputs
Expand All @@ -8,6 +10,11 @@ if [[ "$@" == *"--small"* ]]; then
IN_NAME=input_small.txt
fi

export BENCHMARK_CATEGORY="bio"
BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}

"$BENCHMARK_SHELL" ./scripts/bio.sh "$IN" "$IN_NAME" "$OUT"
script_file=./scripts/bio.sh
export BENCHMARK_SCRIPT="$(realpath "$script_file")"
export BENCHMARK_INPUT_FILE="$(realpath "$IN_NAME")"

$BENCHMARK_SHELL "$script_file" "$IN" "$IN_NAME" "$OUT"
6 changes: 6 additions & 0 deletions covid-mts/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,15 @@ output_scoped="$outputs_dir/outputs$suffix"
mkdir -p "$output_scoped"

BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}
export BENCHMARK_CATEGORY="covid-mts"
export BENCHMARK_INPUT_FILE="$(realpath "$input_file")"

export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/1.sh")"
$BENCHMARK_SHELL "$scripts_dir/1.sh" "$input_file" > "$output_scoped/1.out"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/2.sh")"
$BENCHMARK_SHELL "$scripts_dir/2.sh" "$input_file" > "$output_scoped/2.out"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/3.sh")"
$BENCHMARK_SHELL "$scripts_dir/3.sh" "$input_file" > "$output_scoped/3.out"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/4.sh")"
$BENCHMARK_SHELL "$scripts_dir/4.sh" "$input_file" > "$output_scoped/4.out"

2 changes: 2 additions & 0 deletions file-enc/deps.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/bin/bash

sudo apt-get update

pkgs='ffmpeg unrtf imagemagick libarchive-tools libncurses5-dev libncursesw5-dev zstd liblzma-dev libbz2-dev zip unzip nodejs tcpdump'
Expand Down
9 changes: 7 additions & 2 deletions file-enc/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ if [[ "$1" == "--small" ]]; then
suffix=".small"
fi

export BENCHMARK_CATEGORY="file-enc"
export BENCHMARK_INPUT_FILE="$(realpath "$input_pcaps")"
BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}
$BENCHMARK_SHELL $scripts_dir/compress_files.sh $input_pcaps $results_dir/compress_files$suffix
$BENCHMARK_SHELL $scripts_dir/encrypt_files.sh $input_pcaps $results_dir/encrypt_files$suffix

export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/compress_files.sh")"
$BENCHMARK_SHELL "$scripts_dir/compress_files.sh" "$input_pcaps" "$results_dir/compress_files$suffix"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/encrypt_files.sh")"
$BENCHMARK_SHELL "$scripts_dir/encrypt_files.sh" "$input_pcaps" "$results_dir/encrypt_files$suffix"
10 changes: 4 additions & 6 deletions infrastructure/Makefile
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
STATIC_OUTPUTS = target/lines_of_code.csv target/nodes_in_scripts.csv target/scripts_to_benchmark.csv target/cyclomatic.csv target/shellmetrics.sh
STATIC_OUTPUTS = target/lines_of_code.csv target/nodes_in_scripts.csv target/cyclomatic.csv target/shellmetrics.sh target/dynamic_analysis.jsonl

static: $(STATIC_OUTPUTS)

target/scripts_to_benchmark.csv: scripts_to_benchmark.py
target/dynamic_analysis.jsonl: dynamic_analysis.py
python3 $< | sort > $@

target/lines_of_code.csv: count_lines_of_code.py
Expand All @@ -22,8 +22,6 @@ target/shellmetrics.sh:
chmod +x $@

target/cyclomatic.csv: get_cyclomatic.py target/shellmetrics.sh
python3 get_cyclomatic.py > $@
python3 get_cyclomatic.py | sort > $@

dynamic:

.PHONY: static dynamic clean-static static-test
.PHONY: static clean-static static-test
Empty file modified infrastructure/colossal_table.py
100644 → 100755
Empty file.
38 changes: 29 additions & 9 deletions infrastructure/data/script-globs.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
{
"aurpkg": {
"scripts": ["aurpkg/scripts/*.sh"]
},
"bio": {
"scripts": ["bio/scripts/bio.sh"]
},
"covid-mts": {
"scripts": ["covid-mts/scripts/*.sh"]
},
Expand Down Expand Up @@ -26,29 +32,43 @@
"riker": {
"scripts": ["riker/scripts/*/build.sh"]
},
"uniq-ips": {
"scripts": ["uniq-ips/scripts/run.sh"]
},
"unix50": {
"scripts": ["unix50/scripts/*.sh"]
},
"web-index": {
"scripts": ["web-index/scripts/*.sh"]
},
"makeself": {
"scripts": ["makeself/makeself/*.sh",
"makeself/makeself/test/*/*.sh"]
"scripts": ["makeself/makeself/makeself.sh", "makeself/makeself/test/*/*.sh"]
},
"vps-audit": {
"scripts": ["vps-audit/scripts/*.sh"]
},
"vps-audit-negate": {
"scripts": ["vps-audit-negate/scripts/*.sh"]
},
"bio": {
"scripts": ["bio/scripts/bio.sh"]
"infrastructure/standards/100-files": {
"scripts": ["infrastructure/standards/100-files/scripts/*.sh"]
},
"aurpkg": {
"scripts": ["aurpkg/scripts/pacaur.sh"]
"infrastructure/standards/read-write": {
"scripts": ["infrastructure/standards/read-write/scripts/*.sh"]
},
"infrastructure/standards/shell-memory": {
"scripts": ["infrastructure/standards/shell-memory/scripts/*.sh"]
},
"infrastructure/standards/sleep": {
"scripts": ["infrastructure/standards/sleep/scripts/*.sh"]
},
"infrastructure/standards/time-in-shell-subprocess": {
"scripts": ["infrastructure/standards/time-in-shell-subprocess/scripts/*.sh"]
},
"infrastructure/standards/user-time": {
"scripts": ["infrastructure/standards/user-time/scripts/*.sh"]
},
"infrastructure/standards/user-time-in-shell": {
"scripts": ["infrastructure/standards/user-time-in-shell/scripts/*.sh"]
},
"infrastructure/standards/write-only": {
"scripts": ["infrastructure/standards/write-only/scripts/*.sh"]
}
}
214 changes: 214 additions & 0 deletions infrastructure/dynamic_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
#!/usr/bin/env python3

from typing import Optional
import lzma
from collections import defaultdict
from pathlib import Path
import json
import math
import sys
from dataclasses import dataclass

from project_root import get_project_root

def correct_base(path):
    """Report whether *path* lies inside the ``/benchmarks`` tree.

    The check is purely lexical; the filesystem is not consulted.
    """
    try:
        Path(path).relative_to('/benchmarks')
    except ValueError:
        # relative_to() raises when '/benchmarks' is not a prefix of path.
        return False
    return True

def rebase(path):
    """Return *path* expressed relative to the ``/benchmarks`` root.

    Raises ``ValueError`` (from ``Path.relative_to``) when *path* is not
    located under ``/benchmarks``.
    """
    benchmarks_root = Path('/benchmarks')
    absolute = Path(path)
    return absolute.relative_to(benchmarks_root)

def is_shell(pid, processes):
    """Tell whether the process *pid* looks like a shell.

    Only the first recorded reading of the process is inspected: the
    process counts as a shell when its command line is non-empty and the
    executable name ends in ``sh``.
    """
    first_reading = next(iter(processes[pid].values()))
    argv = first_reading.cmdline
    if len(argv) == 0:
        return False
    return argv[0].endswith('sh')

def get_input_files(record):
    """Collect the benchmark input files a process reading had open.

    Each item of ``record.open_files`` is unpacked as a 5-tuple whose first
    element is the file path and whose fourth element is the open mode.
    A path is reported when it was not opened in plain write mode (``'w'``),
    is not a shell script (no ``.sh`` among its suffixes) and lives under
    the benchmarks root (see ``correct_base``).

    Returns:
        A set of file paths, exactly as they appear in the record.

    Raises:
        ValueError: if a file was opened with a mode this analysis does not
            know how to classify.
    """
    # 'a+' is included because psutil documents it as a possible mode next
    # to 'r', 'w', 'a' and 'r+'; rejecting it would abort the whole analysis.
    known_modes = {'r', 'r+', 'w', 'a', 'a+'}
    inputs = set()
    for file_path, _, _, mode, _ in record.open_files:
        if mode not in known_modes:
            # Explicit raise rather than assert so the check survives -O.
            raise ValueError(f"unknown mode {mode}")
        if mode == 'w':
            continue
        if '.sh' in Path(file_path).suffixes:
            continue
        if correct_base(file_path):
            inputs.add(file_path)
    return inputs

@dataclass(slots=True)
class MortemEntry:
io_zombie: str
stat_zombie: str
elapsed_secs: str
stat_before: str
stat_after: str
script: str
pid: int
benchmark_experiment_start: str
category: str
input_file: Optional[str]
sc_clk_tck: int

@dataclass(slots=True)
class LogEntry:
pid: int
parent: int
times: list
log_current_time: str
benchmark_experiment_start: str
cmdline: list[str]
cwd: str
create_time: float
uss: int
num_fds: int
open_files: list

def read_log_file(path):
    """Load an xz-compressed process log.

    Every line of the file is a JSON array of objects, each of which is
    turned into a ``LogEntry``.

    Returns:
        A ``(processes, parents, children)`` triple:

        * ``processes`` maps pid -> {log_current_time: LogEntry}; unknown
          pids yield ``None``.
        * ``parents`` maps pid -> parent pid; unknown pids yield ``None``.
        * ``children`` maps pid -> set of child pids.
    """
    parent_of = defaultdict(lambda: None)
    children_of = defaultdict(set)
    readings_by_pid = defaultdict(list)

    with lzma.open(path, 'r') as log:
        for raw_line in log:
            for fields in json.loads(raw_line):
                entry = LogEntry(**fields)
                readings_by_pid[entry.pid].append(entry)
                children_of[entry.parent].add(entry.pid)
                parent_of[entry.pid] = entry.parent

    # Re-key each process's readings by timestamp; a later reading with the
    # same timestamp replaces an earlier one.
    keyed = {}
    for pid, readings in readings_by_pid.items():
        keyed[pid] = {reading.log_current_time: reading for reading in readings}
    processes = defaultdict(lambda: None, keyed)
    return processes, parent_of, children_of

def get_descendents(pid: int, children: dict[int, set[int]]) -> set[int]:
    """Return *pid* together with every pid reachable through *children*.

    Args:
        pid: Root of the process tree to collect.
        children: Mapping from a pid to the set of its direct children.
            Pids without children may simply be absent from the mapping.

    Returns:
        The set containing *pid* and all of its transitive children.
    """
    descendents = set()
    pending = [pid]
    while pending:
        current = pending.pop()
        if current in descendents:
            # Already expanded: guards against cycles in the mapping
            # (for example after pid reuse) looping forever.
            continue
        descendents.add(current)
        # .get() avoids a KeyError on leaf pids in a plain dict and avoids
        # inserting empty entries into a defaultdict.
        pending.extend(children.get(current, ()))
    return descendents

@dataclass
class Stat:
    """CPU times of a process, in seconds, as produced by ``get_stat``."""

    # Time the process itself spent in user / kernel mode.
    utime: float
    stime: float
    # Time its waited-for children spent in user / kernel mode.
    cutime: float
    cstime: float


@dataclass
class PsutilTimes:
    """Named view over the positional ``times`` list of a log entry.

    NOTE(review): the field order presumably mirrors psutil's Linux
    ``cpu_times()`` tuple -- confirm against the logger.
    """

    user: float
    system: float
    children_user: float
    children_system: float
    iowait: float


def get_stat(stat_file_contents: str, sc_clk_tck: int):
    """Parse the CPU time fields out of a ``/proc/<pid>/stat`` line.

    Args:
        stat_file_contents: Raw contents of the stat file.
        sc_clk_tck: Clock ticks per second, used to convert ticks to seconds.

    Returns:
        A ``Stat`` whose fields are expressed in seconds.
    """
    # https://linux.die.net/man/5/proc
    # The second field (comm) is wrapped in parentheses and may itself hold
    # spaces or ')' characters, so a plain split() can shift every later
    # field. Everything after the LAST ')' is safe to split on whitespace.
    _, closing_paren, tail = stat_file_contents.rpartition(')')
    if closing_paren:
        fields = tail.split()
        first = 11  # utime is field 14; tail starts at field 3 (state)
    else:
        # No parenthesised comm found: keep the plain positional layout.
        fields = stat_file_contents.split()
        first = 13
    return Stat(
        utime=float(fields[first]) / sc_clk_tck,
        stime=float(fields[first + 1]) / sc_clk_tck,
        cutime=float(fields[first + 2]) / sc_clk_tck,
        cstime=float(fields[first + 3]) / sc_clk_tck,
    )

@dataclass
class Io:
    """The ``rchar`` and ``wchar`` counters extracted by ``get_io``."""

    rchar: int
    wchar: int


def get_io(io_file_contents: str):
    """Extract the ``rchar`` and ``wchar`` counters from io-file text.

    The first line starting with each label is used and must consist of
    exactly two whitespace-separated tokens (label and value).

    Returns:
        An ``Io`` holding both counters as integers.
    """
    rows = io_file_contents.splitlines()

    def raw_counter(label):
        # First row carrying the label; its second token is the value.
        row = next(candidate for candidate in rows if candidate.startswith(label))
        _, text = row.split()
        return text

    read_text = raw_counter('rchar')
    write_text = raw_counter('wchar')
    return Io(rchar=int(read_text), wchar=int(write_text))

def get_desc(pid, processes):
    """Return the first recorded reading of process *pid*."""
    return next(iter(processes[pid].values()))


def find_shell_process(pid, processes):
    """Locate the shell that was launched by the ``io_shell.py`` wrapper.

    The wrapper is the first process whose second command-line argument ends
    in ``io_shell.py``; the shell is its one and only child.

    Args:
        pid: Unused; kept so existing callers keep working.
        processes: Mapping pid -> {timestamp: reading}.

    Returns:
        The pid of the shell process.

    Raises:
        StopIteration: if no ``io_shell.py`` process is present.
        AssertionError: if the wrapper does not have exactly one child, or
            that child's executable name does not end in ``sh``.
    """
    def runs_io_shell(candidate):
        argv = get_desc(candidate, processes).cmdline
        # Processes with fewer than two argv entries (e.g. an empty command
        # line) cannot be the wrapper; skip them instead of raising
        # IndexError.
        return len(argv) > 1 and argv[1].endswith('io_shell.py')

    target = next(p for p in processes if runs_io_shell(p))
    all_children = [p for p in processes if get_desc(p, processes).parent == target]
    assert len(all_children) == 1, "one bash process"
    shell_pid = all_children[0]
    assert get_desc(shell_pid, processes).cmdline[0].endswith('sh')
    return shell_pid

def print_statistics(pid, processes, parents, children, mortem):
    """Print one JSON line summarising the resource usage of a script run.

    Args:
        pid: Passed to ``find_shell_process``; the shell pid it returns is
            the one actually analysed.
        processes: Mapping pid -> {timestamp: LogEntry}.
        parents: Unused; kept so existing callers keep working.
        children: Mapping pid -> set of child pids.
        mortem: The ``MortemEntry`` recorded for this run.

    The JSON object is written to standard output.
    """
    pid = find_shell_process(pid, processes)
    assert is_shell(pid, processes)
    descendents = get_descendents(pid, children)

    # Timestamps at which the top-level shell was sampled; tree-wide totals
    # are evaluated at these instants only.
    all_readings = list(processes[pid].keys())

    stat_zombie = get_stat(mortem.stat_zombie, mortem.sc_clk_tck)
    stat_before = get_stat(mortem.stat_before, mortem.sc_clk_tck)
    stat_after = get_stat(mortem.stat_after, mortem.sc_clk_tck)
    io_zombie = get_io(mortem.io_zombie)

    def peak_tree_total(attribute):
        # Largest per-timestamp sum of `attribute` over the process tree.
        return max(
            sum(
                getattr(processes[d][r], attribute)
                for d in descendents
                if r in processes[d]
            )
            for r in all_readings
        )

    # Shells spawned below the top-level shell; the top-level shell itself
    # is accounted for separately from its stat snapshot.
    sub_shells = [d for d in descendents - {pid} if is_shell(d, processes)]

    def shell_cpu(field):
        # Sum over sub-shells of the largest sampled value of `field`.
        return sum(
            max(getattr(PsutilTimes(*r.times), field) for r in processes[d].values())
            for d in sub_shells
        )

    tis_user = shell_cpu('user')
    tis_system = shell_cpu('system')
    tis_user += stat_zombie.utime  # we have a more accurate measurement of the first process
    tis_system += stat_zombie.stime  # we have a more accurate measurement of the first process

    input_files = set(
        p
        for d in descendents
        for r in processes[d].values()
        for p in get_input_files(r)
    )
    if mortem.input_file is not None:
        input_files |= {mortem.input_file}
    # Sorted so the emitted line does not depend on set iteration order,
    # which changes between interpreter runs.
    input_files = sorted(str(rebase(p)) for p in input_files)

    data = dict(
        script=str(rebase(mortem.script)),
        # Children CPU time accrued between the two stat snapshots.
        user_time=stat_after.cutime - stat_before.cutime,
        system_time=stat_after.cstime - stat_before.cstime,
        max_unique_set_size=peak_tree_total('uss'),
        read_chars=io_zombie.rchar,
        write_chars=io_zombie.wchar,
        user_time_in_shell=tis_user,
        system_time_in_shell=tis_system,
        all_input_files=input_files,
        wall_time=float(mortem.elapsed_secs),
        start_time=mortem.benchmark_experiment_start,
        category=mortem.category,
        num_fds=max(r.num_fds for r in processes[pid].values()),
        children_num_fds=peak_tree_total('num_fds'),
    )
    print(json.dumps(data))

if __name__ == '__main__':
    # For every *.mortem file under infrastructure/target/process-logs, pair
    # it with the process log of the same stem and print one statistics line
    # per top-level process.
    project_root = get_project_root()
    log_dir = project_root / 'infrastructure' / 'target' / 'process-logs'
    for mortem_path in log_dir.glob('*.mortem'):
        print('processing log', mortem_path.relative_to(project_root), file=sys.stderr)

        # One JSON object per line; a later entry for the same pid wins.
        mortem_by_pid = {}
        for line in mortem_path.read_text().splitlines():
            entry = MortemEntry(**json.loads(line))
            mortem_by_pid[entry.pid] = entry

        log_path = mortem_path.with_suffix('.jsonl.xz')
        processes, parents, children = read_log_file(log_path)

        # A process is top-level when its parent has no recorded parent.
        top_level = []
        for candidate in processes:
            if parents[parents[candidate]] is None:
                top_level.append(candidate)

        for top_pid in top_level:
            print_statistics(top_pid, processes, parents, children, mortem=mortem_by_pid[top_pid])
2 changes: 1 addition & 1 deletion infrastructure/get_cyclomatic.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
file, _func, _lineno, _lloc, ccn, _lines, _comment, _blank = line.split(',')
file = json.loads(file)
file = Path(file).relative_to(root)
datas[file].append((ccn))
datas[file].append((ccn,))

for file, datas in datas.items():
ccn = sum(float(ccn) for ccn, in datas)
Expand Down
Loading
Loading