Skip to content

Commit

Permalink
dynamic analysis 2
Browse files Browse the repository at this point in the history
  • Loading branch information
EtomicBomb committed Jan 12, 2025
1 parent 9aa507e commit 2d9b9a0
Show file tree
Hide file tree
Showing 265 changed files with 1,403 additions and 156 deletions.
5 changes: 5 additions & 0 deletions aurpkg/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ mkdir -p ${OUT}

script="./scripts/pacaur.sh"

BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}
export BENCHMARK_CATEGORY="aurpkg"
export BENCHMARK_SCRIPT="$(realpath "$script")"
export BENCHMARK_INPUT_FILE="$(realpath "$IN")"

# Switch to user "user" to avoid permission issues

echo "$script"
Expand Down
4 changes: 3 additions & 1 deletion bio/deps.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/bin/bash
# install dependencies
required_version="1.7"

Expand Down Expand Up @@ -43,4 +44,5 @@ else
echo "Failed to install the correct version of Samtools."
exit 1
fi
fi
fi

2 changes: 2 additions & 0 deletions bio/input.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/bin/bash

IN=inputs
IN_NAME=input.txt

Expand Down
9 changes: 8 additions & 1 deletion bio/run.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/bin/bash

# create bam files with regions
################### 1KG SAMPLES
IN=inputs
Expand All @@ -8,6 +10,11 @@ if [[ "$@" == *"--small"* ]]; then
IN_NAME=input_small.txt
fi

export BENCHMARK_CATEGORY="bio"
BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}

"$BENCHMARK_SHELL" ./scripts/bio.sh "$IN" "$IN_NAME" "$OUT"
script_file=./scripts/bio.sh
export BENCHMARK_SCRIPT="$(realpath "$script_file")"
export BENCHMARK_INPUT_FILE="$(realpath "$IN_NAME")"

$BENCHMARK_SHELL "$script_file" "$IN" "$IN_NAME" "$OUT"
6 changes: 6 additions & 0 deletions covid-mts/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,15 @@ output_scoped="$outputs_dir/outputs$suffix"
mkdir -p "$output_scoped"

BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}
export BENCHMARK_CATEGORY="covid-mts"
export BENCHMARK_INPUT_FILE="$(realpath "$input_file")"

export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/1.sh")"
$BENCHMARK_SHELL "$scripts_dir/1.sh" "$input_file" > "$output_scoped/1.out"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/2.sh")"
$BENCHMARK_SHELL "$scripts_dir/2.sh" "$input_file" > "$output_scoped/2.out"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/3.sh")"
$BENCHMARK_SHELL "$scripts_dir/3.sh" "$input_file" > "$output_scoped/3.out"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/4.sh")"
$BENCHMARK_SHELL "$scripts_dir/4.sh" "$input_file" > "$output_scoped/4.out"

2 changes: 2 additions & 0 deletions file-enc/deps.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/bin/bash

sudo apt-get update

pkgs='ffmpeg unrtf imagemagick libarchive-tools libncurses5-dev libncursesw5-dev zstd liblzma-dev libbz2-dev zip unzip nodejs tcpdump'
Expand Down
9 changes: 7 additions & 2 deletions file-enc/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ if [[ "$1" == "--small" ]]; then
suffix=".small"
fi

export BENCHMARK_CATEGORY="file-enc"
export BENCHMARK_INPUT_FILE="$(realpath "$input_pcaps")"
BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}
$BENCHMARK_SHELL $scripts_dir/compress_files.sh $input_pcaps $results_dir/compress_files$suffix
$BENCHMARK_SHELL $scripts_dir/encrypt_files.sh $input_pcaps $results_dir/encrypt_files$suffix

export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/compress_files.sh")"
$BENCHMARK_SHELL "$scripts_dir/compress_files.sh" "$input_pcaps" "$results_dir/compress_files$suffix"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/encrypt_files.sh")"
$BENCHMARK_SHELL "$scripts_dir/encrypt_files.sh" "$input_pcaps" "$results_dir/encrypt_files$suffix"
10 changes: 4 additions & 6 deletions infrastructure/Makefile
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
STATIC_OUTPUTS = target/lines_of_code.csv target/nodes_in_scripts.csv target/scripts_to_benchmark.csv target/cyclomatic.csv target/shellmetrics.sh
STATIC_OUTPUTS = target/lines_of_code.csv target/nodes_in_scripts.csv target/cyclomatic.csv target/shellmetrics.sh target/dynamic_analysis.jsonl

static: $(STATIC_OUTPUTS)

target/scripts_to_benchmark.csv: scripts_to_benchmark.py
target/dynamic_analysis.jsonl: dynamic_analysis.py
python3 $< | sort > $@

target/lines_of_code.csv: count_lines_of_code.py
Expand All @@ -22,8 +22,6 @@ target/shellmetrics.sh:
chmod +x $@

target/cyclomatic.csv: get_cyclomatic.py target/shellmetrics.sh
python3 get_cyclomatic.py > $@
python3 get_cyclomatic.py | sort > $@

dynamic:

.PHONY: static dynamic clean-static static-test
.PHONY: static clean-static static-test
Empty file modified infrastructure/colossal_table.py
100644 → 100755
Empty file.
40 changes: 30 additions & 10 deletions infrastructure/data/script-globs.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
{
"aurpkg": {
"scripts": ["aurpkg/scripts/*.sh"]
},
"bio": {
"scripts": ["bio/scripts/bio.sh"]
},
"covid-mts": {
"scripts": ["covid-mts/scripts/*.sh"]
},
Expand All @@ -24,10 +30,7 @@
"scripts": ["sklearn/scripts/run.sh"]
},
"riker": {
"scripts": ["riker/scripts/*/build.sh"]
},
"uniq-ips": {
"scripts": ["uniq-ips/scripts/run.sh"]
"scripts": ["riker/scripts/*/run.sh"]
},
"unix50": {
"scripts": ["unix50/scripts/*.sh"]
Expand All @@ -36,19 +39,36 @@
"scripts": ["web-index/scripts/*.sh"]
},
"makeself": {
"scripts": ["makeself/makeself/*.sh",
"makeself/makeself/test/*/*.sh"]
"scripts": ["makeself/makeself/makeself.sh", "makeself/makeself/test/*/*.sh"]
},
"vps-audit": {
"scripts": ["vps-audit/scripts/*.sh"]
},
"vps-audit-negate": {
"scripts": ["vps-audit-negate/scripts/*.sh"]
},
"bio": {
"scripts": ["bio/scripts/bio.sh"]
"infrastructure/standards/100-files": {
"scripts": ["infrastructure/standards/100-files/scripts/*.sh"]
},
"aurpkg": {
"scripts": ["aurpkg/scripts/pacaur.sh"]
"infrastructure/standards/read-write": {
"scripts": ["infrastructure/standards/read-write/scripts/*.sh"]
},
"infrastructure/standards/shell-memory": {
"scripts": ["infrastructure/standards/shell-memory/scripts/*.sh"]
},
"infrastructure/standards/sleep": {
"scripts": ["infrastructure/standards/sleep/scripts/*.sh"]
},
"infrastructure/standards/time-in-shell-subprocess": {
"scripts": ["infrastructure/standards/time-in-shell-subprocess/scripts/*.sh"]
},
"infrastructure/standards/user-time": {
"scripts": ["infrastructure/standards/user-time/scripts/*.sh"]
},
"infrastructure/standards/user-time-in-shell": {
"scripts": ["infrastructure/standards/user-time-in-shell/scripts/*.sh"]
},
"infrastructure/standards/write-only": {
"scripts": ["infrastructure/standards/write-only/scripts/*.sh"]
}
}
214 changes: 214 additions & 0 deletions infrastructure/dynamic_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
#!/usr/bin/env python3

from typing import Optional
import lzma
from collections import defaultdict
from pathlib import Path
import json
import math
import sys
from dataclasses import dataclass

from project_root import get_project_root

def correct_base(path):
    """Return True when *path* lives under the /benchmarks tree."""
    benchmarks_root = Path('/benchmarks')
    return Path(path).is_relative_to(benchmarks_root)

def rebase(path):
    """Strip the leading /benchmarks component, yielding a repo-relative Path.

    Raises ValueError when *path* is not under /benchmarks.
    """
    absolute = Path(path)
    return absolute.relative_to('/benchmarks')

def is_shell(pid, processes):
    """True when *pid*'s first recorded sample looks like a shell (argv[0] ends in 'sh')."""
    first_sample = next(iter(processes[pid].values()))
    cmdline = first_sample.cmdline
    if not cmdline:
        return False
    return cmdline[0].endswith('sh')

def get_input_files(record):
    """Collect the paths under /benchmarks that *record* had open for reading.

    Write-only ('w') handles and script files (any '.sh' suffix) are
    excluded; an open-file mode outside the expected set is a hard error.
    """
    inputs = set()
    for file_path, _, _, mode, _ in record.open_files:
        assert mode in {'r', 'r+', 'w', 'a'}, f"unknown mode {mode}"
        if mode == 'w':
            continue
        if '.sh' in Path(file_path).suffixes:
            continue
        if correct_base(file_path):
            inputs.add(file_path)
    return inputs

@dataclass(slots=True)
class MortemEntry:
    """Post-mortem record for one monitored top-level process.

    One JSON line per process in a ``*.mortem`` file, matched by pid
    against the samples in the companion ``*.jsonl.xz`` log.
    """
    io_zombie: str                    # /proc/<pid>/io text of the finished process (parsed by get_io)
    stat_zombie: str                  # /proc/<pid>/stat text of the finished process (parsed by get_stat)
    elapsed_secs: str                 # wall-clock duration in seconds, stored as a string (converted with float())
    stat_before: str                  # /proc stat snapshot taken before the run — presumably the monitor's own; cutime/cstime deltas are used as child CPU time (TODO confirm)
    stat_after: str                   # /proc stat snapshot taken after the run (same process as stat_before)
    script: str                       # absolute path of the benchmark script, under /benchmarks
    pid: int                          # pid of the monitored top-level process; keys this record
    benchmark_experiment_start: str   # timestamp at which the experiment started
    category: str                     # benchmark category label (e.g. BENCHMARK_CATEGORY)
    input_file: Optional[str]         # primary input file when the harness recorded one, else None
    sc_clk_tck: int                   # kernel clock ticks per second; converts stat tick counts to seconds

@dataclass(slots=True)
class LogEntry:
    """One periodic sample of a live process, as stored in the ``*.jsonl.xz`` log."""
    pid: int                          # sampled process id
    parent: int                       # parent pid — used to build the process tree
    times: list                       # CPU-time tuple unpacked as PsutilTimes: user, system, children_user, children_system, iowait
    log_current_time: str             # timestamp of this sample; keys the per-pid reading dict
    benchmark_experiment_start: str   # experiment start timestamp (same for all samples of a run)
    cmdline: list[str]                # argv of the process at sample time
    cwd: str                          # working directory at sample time
    create_time: float                # process creation time
    uss: int                          # unique set size — private memory, in bytes (psutil uss)
    num_fds: int                      # number of open file descriptors
    open_files: list                  # 5-tuples; positions 0 and 3 are (path, mode) — see get_input_files

def read_log_file(path):
    """Parse one xz-compressed JSONL process log.

    Each line of the file is a JSON array of sample objects. Returns
    ``(processes, parents, children)`` where ``processes`` maps
    pid -> {log_current_time: LogEntry} (missing pids map to None),
    ``parents`` maps pid -> parent pid (None when never seen), and
    ``children`` maps pid -> set of direct child pids.
    """
    parents = defaultdict(lambda: None)
    children = defaultdict(set)
    samples = defaultdict(list)
    with lzma.open(path, 'r') as lines:
        for batch in lines:
            for raw in json.loads(batch):
                entry = LogEntry(**raw)
                samples[entry.pid].append(entry)
                children[entry.parent].add(entry.pid)
                parents[entry.pid] = entry.parent
    by_time = {
        pid: {entry.log_current_time: entry for entry in entries}
        for pid, entries in samples.items()
    }
    processes = defaultdict(lambda: None, by_time)
    return processes, parents, children

def get_descendents(pid: int, children: dict[int, set[int]]) -> set[int]:
    """Return *pid* plus every transitive child recorded in *children*.

    *children* maps a pid to the set of its direct children. Each node is
    expanded at most once, so a cycle or a shared child in corrupted log
    data cannot cause an infinite loop or duplicated work (the previous
    version pushed children unconditionally and could spin forever).
    """
    descendents: set[int] = set()
    stack = [pid]
    while stack:
        current = stack.pop()
        if current in descendents:
            continue  # already expanded — guards against cycles
        descendents.add(current)
        stack += children[current]
    return descendents

@dataclass
class Stat:
    """CPU-time fields from /proc/<pid>/stat, already converted to seconds.

    Fields are floats: get_stat divides the raw clock-tick counts by
    SC_CLK_TCK when building this record.
    """
    utime: float   # user-mode CPU time of the process itself
    stime: float   # kernel-mode CPU time of the process itself
    cutime: float  # user-mode CPU time of waited-for children
    cstime: float  # kernel-mode CPU time of waited-for children

@dataclass
class PsutilTimes:
    """Per-process CPU times in seconds, unpacked from LogEntry.times.

    Field order matches psutil's cpu_times() result (the log stores it as
    a plain 5-element list).
    """
    user: float             # time in user mode
    system: float           # time in kernel mode
    children_user: float    # user time of terminated children
    children_system: float  # kernel time of terminated children
    iowait: float           # time waiting for I/O — Linux-specific field

def get_stat(stat_file_contents: str, sc_clk_tck: int):
    """Parse CPU times (in seconds) out of a /proc/<pid>/stat snapshot.

    Fields 14-17 of the stat line hold utime/stime/cutime/cstime in clock
    ticks; dividing by *sc_clk_tck* converts them to seconds.
    See https://linux.die.net/man/5/proc.
    """
    fields = stat_file_contents.split()
    utime, stime, cutime, cstime = (
        float(field) / sc_clk_tck for field in fields[13:17]
    )
    return Stat(utime=utime, stime=stime, cutime=cutime, cstime=cstime)

@dataclass
class Io:
    """I/O counters parsed from /proc/<pid>/io."""
    rchar: int  # characters read by the process (rchar line)
    wchar: int  # characters written by the process (wchar line)

def get_io(io_file_contents: str):
    """Parse the rchar/wchar counters out of a /proc/<pid>/io snapshot."""
    counter_lines = io_file_contents.splitlines()

    def counter(prefix):
        # each counter line is "<name>: <value>"; exactly two tokens expected
        _, value = next(l for l in counter_lines if l.startswith(prefix)).split()
        return int(value)

    return Io(rchar=counter('rchar'), wchar=counter('wchar'))

def get_desc(pid, processes):
    """Return the first recorded sample for *pid* (any reading will do)."""
    return next(iter(processes[pid].values()))

def find_shell_process(pid, processes):
    """Locate the pid of the shell that ran the benchmark script.

    Finds the process whose argv[1] ends in 'io_shell.py' (apparently the
    harness wrapper), asserts it has exactly one child, and returns that
    child after checking it really is a shell.

    Raises StopIteration when no io_shell.py process appears in the log.
    The previous version indexed cmdline[1] unconditionally, so any logged
    process with fewer than two argv entries raised IndexError and aborted
    the scan; the length guard below skips such processes instead.
    """
    target = next(
        p for p in processes
        if len(get_desc(p, processes).cmdline) > 1
        and get_desc(p, processes).cmdline[1].endswith('io_shell.py')
    )
    all_children = [p for p in processes if get_desc(p, processes).parent == target]
    assert len(all_children) == 1, "one bash process"
    pid = all_children[0]
    assert get_desc(pid, processes).cmdline[0].endswith('sh')
    return pid

def print_statistics(pid, processes, parents, children, mortem):
    """Aggregate one benchmark run into a single JSON line on stdout.

    *pid* is a top-level monitored process; the shell that actually ran the
    benchmark script is located beneath it, and all metrics are aggregated
    over that shell's descendant tree plus the post-mortem *mortem* record.
    """
    # narrow from the top-level process down to the shell running the script
    pid = find_shell_process(pid, processes)
    assert is_shell(pid, processes)
    descendents = get_descendents(pid, children)

    # the timestamps at which the shell itself was sampled
    all_readings = list(processes[pid].keys())

    stat_zombie = get_stat(mortem.stat_zombie, mortem.sc_clk_tck)
    stat_before = get_stat(mortem.stat_before, mortem.sc_clk_tck)
    stat_after = get_stat(mortem.stat_after, mortem.sc_clk_tck)
    io_zombie = get_io(mortem.io_zombie)

    script = str(rebase(mortem.script))
    # child-CPU-time delta across the run — presumably measured from the
    # monitoring process's own stat (TODO confirm against the harness)
    user = stat_after.cutime - stat_before.cutime
    system = stat_after.cstime - stat_before.cstime
    # peak over sample times of the tree-wide sum of unique set sizes;
    # a pid missing at a given reading simply contributes nothing
    uss = max(
        sum(processes[d][r].uss for d in descendents if r in processes[d])
        for r in all_readings
    )
    read_chars = io_zombie.rchar
    write_chars = io_zombie.wchar

    # "time in shell": CPU used by shell descendants other than the root
    # shell; max over a process's readings picks its last (largest) counter
    tis_user = sum(max(PsutilTimes(*r.times).user for r in processes[d].values()) for d in descendents - {pid} if is_shell(d, processes))
    tis_system = sum(max(PsutilTimes(*r.times).system for r in processes[d].values()) for d in descendents - {pid} if is_shell(d, processes))
    tis_user += stat_zombie.utime # we have a more accurate measurement of the first process
    tis_system += stat_zombie.stime # we have a more accurate measurement of the first process

    # every /benchmarks file any descendant had open for reading,
    # plus the harness-declared input file when present
    input_files = set(p for d in descendents for r in processes[d].values() for p in get_input_files(r))
    if mortem.input_file is not None:
        input_files |= {mortem.input_file}
    input_files = list(str(rebase(p)) for p in input_files)

    duration = float(mortem.elapsed_secs)

    start = mortem.benchmark_experiment_start

    category = mortem.category

    # fd counts: peak for the shell itself, and peak tree-wide sum
    num_fds = max(r.num_fds for r in processes[pid].values())
    children_num_fds = max(
        sum(processes[d][r].num_fds for d in descendents if r in processes[d])
        for r in all_readings
    )

    # one JSONL record per benchmark run
    data = dict(
        script=script,
        user_time=user,
        system_time=system,
        max_unique_set_size=uss,
        read_chars=read_chars,
        write_chars=write_chars,
        user_time_in_shell=tis_user,
        system_time_in_shell=tis_system,
        all_input_files=input_files,
        wall_time=duration,
        start_time=start,
        category=category,
        num_fds=num_fds,
        children_num_fds=children_num_fds,
    )
    print(json.dumps(data))

if __name__ == '__main__':
    root = get_project_root()
    # each benchmark run leaves a <name>.mortem summary plus a matching
    # <name>.jsonl.xz per-process sample log in target/process-logs
    process_logs = root / 'infrastructure' / 'target' / 'process-logs'
    for mortem_path in process_logs.glob('*.mortem'):
        print('processing log', mortem_path.relative_to(root), file=sys.stderr)
        mortems = [
            MortemEntry(**json.loads(line))
            for line in mortem_path.read_text().splitlines()
        ]
        # index post-mortem records by the pid of the monitored process
        mortems = {
            mortem.pid: mortem
            for mortem in mortems
        }
        path = mortem_path.with_suffix('.jsonl.xz')
        processes, parents, children = read_log_file(path)
        # top-level pids are those whose grandparent was never logged
        # (parents is a defaultdict returning None for unknown pids)
        top_level = [pid for pid in processes if parents[parents[pid]] is None]
        for pid in top_level:
            print_statistics(pid, processes, parents, children, mortem=mortems[pid])
2 changes: 1 addition & 1 deletion infrastructure/get_cyclomatic.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
file, _func, _lineno, _lloc, ccn, _lines, _comment, _blank = line.split(',')
file = json.loads(file)
file = Path(file).relative_to(root)
datas[file].append((ccn))
datas[file].append((ccn,))

for file, datas in datas.items():
ccn = sum(float(ccn) for ccn, in datas)
Expand Down
Loading

0 comments on commit 2d9b9a0

Please sign in to comment.