Skip to content

Commit

Permalink
Static and dynamic viz presentation updates for paper
Browse files Browse the repository at this point in the history
  • Loading branch information
LLazarek committed Jan 13, 2025
1 parent 9aa507e commit e82edd9
Show file tree
Hide file tree
Showing 2 changed files with 119 additions and 75 deletions.
163 changes: 95 additions & 68 deletions infrastructure/viz/dynamic.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/bin/env python3

import argparse
from sys import stderr
from pathlib import Path
import pandas as pd
Expand All @@ -16,6 +17,8 @@
data_path = root / 'infrastructure/target/dynamic_analysis.jsonl'
input_size_path = root / 'infrastructure/data/size_inputs.jsonl'

figsize = (5, 3)

def get_input_sizes_df(df):
sizes_df = pd.read_json(input_size_path, lines=True)
def find_input_size(row):
Expand Down Expand Up @@ -60,79 +63,70 @@ def get_map_df():
]
return pd.DataFrame(items, columns=['script', 'benchmark'])

def plot_benchmark_times_split(df):
    """Plot per-benchmark user and system time as two bar series and show it.

    Reads the 'benchmark', 'user_time' and 'system_time' columns of ``df``,
    draws both series into one new 10x6 figure with a symlog y-axis, and
    displays the figure with ``plt.show()``. Returns nothing.
    """
    sns.set_theme(style="whitegrid")
    plt.figure(figsize=(10, 6))

    # Each series is drawn in turn onto the figure created above.
    series = (
        ('user_time', 'blue', 'User time'),
        ('system_time', 'red', 'System time'),
    )
    for column, colour, caption in series:
        sns.barplot(x='benchmark', y=column, data=df, color=colour, label=caption)

    # Slanted tick labels need extra room below the axes.
    plt.xticks(rotation=60, ha='right')
    plt.subplots_adjust(bottom=0.25)

    # Linear below 0.1, logarithmic above it.
    plt.yscale('symlog', linthresh=0.1)
    plt.legend()
    plt.show()

def plot_benchmark_times(df,
ax,
legend=True,
ticks = ([0, 0.1, 1, 10, 100, 1000, 10000],
['0', '0.1s', '1s', '10s', '100s', '1,000s', '10,000s']),
ylabel='CPU time',
linthresh=0.1):
sns.set(style="whitegrid")
plt.figure(figsize=(10, 6))
sns.barplot(x='benchmark', y='time_in_commands', data=df, color='blue', label='Commands')
sns.barplot(x='benchmark', y='time_in_shell', data=df, color='green', label='Shell')
plt.xticks(rotation=60, ha='right')
plt.xlabel('')
plt.subplots_adjust(bottom=0.25)
plt.yscale('symlog', linthresh=linthresh)
plt.yticks(*ticks)
plt.ylabel(ylabel)
plt.legend()
plt.show()
ax.grid(True, which='both', axis='y')
sns.barplot(x='benchmark', y='time_in_commands', data=df, color='#117733', label='Commands', ax=ax, zorder=3, hatch='//')
sns.barplot(x='benchmark', y='time_in_shell', data=df, color='#88CCEE', label='Shell', ax=ax, zorder=3, hatch='\\\\')
#ax.set_xticklabels(ax.get_xticklabels(), rotation=60, ha='right')
ax.set_xlabel('')
ax.set_yscale('symlog', linthresh=linthresh)
ax.set_yticks(ticks[0])
ax.set_yticklabels(ticks[1])
ax.set_ylabel(ylabel)
if legend:
ax.legend(loc=('best' if legend == True else legend))
else:
ax.legend().set_visible(False)

def plot_io(df,
ax,
ticks=([0, 100000000, 1000000000, 10000000000, 100000000000, 1000000000000],
['0', '100MB', '1GB', '10GB', '100GB', '1TB']),
ylabel='IO bytes',
linthresh=100000000):
sns.set(style="whitegrid")
plt.figure(figsize=(10, 6))
sns.barplot(x='benchmark', y='io_chars', data=df, color='green', label=None)
plt.yscale('symlog', linthresh=linthresh)
plt.yticks(*ticks)
plt.ylabel(ylabel)
plt.xticks(rotation=60, ha='right')
plt.xlabel('')
plt.subplots_adjust(bottom=0.25)
plt.show()
ax.grid(True, which='both', axis='y')
sns.barplot(x='benchmark', y='io_chars', data=df, color='#882255', ax=ax, zorder=3)
ax.set_yscale('symlog', linthresh=linthresh)
ax.set_yticks(ticks[0])
ax.set_yticklabels(ticks[1])
ax.set_ylabel(ylabel)
#ax.set_xticklabels(ax.get_xticklabels(), rotation=60, ha='right')
ax.set_xlabel('')

def plot_memory(df,
ax,
ticks=([0, 1000000, 10000000, 100000000, 1000000000],
['0', '1MB', '10MB', '100MB', '1GB']),
ylabel='Memory high water mark (bytes)',
ylabel='Memory (high water, bytes)',
linthresh=1000000):
sns.set(style="whitegrid")
plt.figure(figsize=(10, 6))
sns.barplot(x='benchmark', y='max_unique_set_size', data=df, color='purple', label=None)
plt.xticks(rotation=60, ha='right')
plt.xlabel('')
plt.subplots_adjust(bottom=0.25)
plt.yscale('symlog', linthresh=linthresh)
plt.yticks(*ticks)
plt.ylabel(ylabel)
plt.show()
ax.grid(True, which='both', axis='y')
sns.barplot(x='benchmark', y='max_unique_set_size', data=df, color='#CC6677', ax=ax, zorder=3)
#ax.set_xticklabels(ax.get_xticklabels(), rotation=60, ha='right')
ax.set_xlabel('')
ax.set_yscale('symlog', linthresh=linthresh)
ax.set_yticks(ticks[0])
ax.set_yticklabels(ticks[1])
ax.set_ylabel(ylabel)

def plot_time_vs_wall(df):
def plot_time_vs_wall(df, ax):
# what fraction of the real (wall) runtime of the process is user or system time?
df['time_occupied'] = (df['user_time'] + df['system_time']) / df['wall_time']
sns.set(style="whitegrid")
plt.figure(figsize=(10, 6))
sns.barplot(x='benchmark', y='time_occupied', data=df, color='blue')
plt.subplots_adjust(bottom=0.25)
plt.xticks(rotation=60, ha='right')
plt.xlabel('')
plt.yticks(np.linspace(0, 6, 10))
plt.ylabel('Proporion of CPU time to wall time')
plt.show()
ax.grid(True, which='both', axis='y')
sns.barplot(x='benchmark', y='time_occupied', data=df, color='#44AA99', ax=ax, zorder=3)
#ax.set_xticklabels(ax.get_xticklabels(), rotation=60, ha='right')
ax.set_xlabel('')
ax.set_yticks(np.linspace(0, 6, 7))
ax.set_ylabel('CPU time / wall time')

dynamic_analysis_script_translations = {
"riker/scripts/vim/run.sh": "riker/scripts/vim/build.sh",
Expand Down Expand Up @@ -190,16 +184,18 @@ def read_data():

return df, bench_df

def main():
def main(output_dir=None):
_, df = read_data()

def name(str):
return os.path.join(output_dir, f"bensh-dyn-{str}.pdf") if output_dir else None


# # this metric is bad in the cases that we don't have the data for the inputs
# # the benchmark has a specific input on disk that it processed. how much of this input did it process per second?
# df['bytes_per_second_input'] = df['input_size'] / (df['wall_time'])
# sns.set(style="whitegrid")
# plt.figure(figsize=(10, 6))
# plt.figure(figsize=figsize)
# sns.barplot(x='benchmark', y='bytes_per_second_input', data=df, color='blue', label='Commands')
# plt.xticks(rotation=60, ha='right')
# plt.title('missing input')
Expand All @@ -212,7 +208,7 @@ def main():
# # what fraction of the actual scheduled (user + system) time is spent in the shell, i.e. how much cpu work is done in the launched shell itself?
# df['time_in_shell_frac'] = (df['user_time_in_shell'] + df['system_time_in_shell']) / (df['user_time'] + df['system_time'])
# sns.set(style="whitegrid")
# plt.figure(figsize=(10, 6))
# plt.figure(figsize=figsize)
# sns.barplot(x='benchmark', y='time_in_shell_frac', data=df, color='blue', label='Commands')
# plt.xticks(rotation=60, ha='right')
# plt.subplots_adjust(bottom=0.25)
Expand All @@ -221,38 +217,69 @@ def main():
# plt.legend()
# plt.show()

num_plots = 8
cols = 2
rows = num_plots // cols
fig, axes = plt.subplots(rows, cols, figsize=(12 , 9), sharex=True)
axes = axes.flatten()

plot_time_vs_wall(df)
plot_benchmark_times(df)
plot_io(df)
plot_memory(df)
# the benchmark did a certain amount of io. how many bytes per second was this?
df_rel_to_wall = df.copy()
df_rel_to_wall['io_chars'] = (df_rel_to_wall['read_chars'] + df_rel_to_wall['write_chars']) / (df_rel_to_wall['wall_time'])
plot_io(df_rel_to_wall,
axes[1],
ylabel='IO per second wall time',
ticks=([0, 1000000, 10000000, 100000000, 1000000000, 10000000000],
['0', '1MB', '10MB', '100MB', '1GB', '10GB']),
linthresh=1000000)

plot_time_vs_wall(df, axes[0])
plot_benchmark_times(df, axes[2], legend=False)
plot_memory(df, axes[4])
plot_io(df, axes[6])

df_rel_to_input = df.copy()
df_rel_to_input['io_chars'] = df_rel_to_input['io_chars'] / df_rel_to_input['input_size']
df_rel_to_input['max_unique_set_size'] = df_rel_to_input['max_unique_set_size'] / df_rel_to_input['input_size']
df_rel_to_input['time_in_shell'] = df_rel_to_input['time_in_shell'] / df_rel_to_input['input_size']
df_rel_to_input['time_in_commands'] = df_rel_to_input['time_in_commands'] / df_rel_to_input['input_size']

plot_benchmark_times(df_rel_to_input, ylabel='CPU time per input byte',
plot_benchmark_times(df_rel_to_input,
axes[3],
legend=(0.1, 0.65),
ylabel='CPU time per input byte',
ticks=([0, 0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01],
['0', '10ns', '100ns', '1us', '10us', '100us', '1ms', '10ms']),
linthresh=0.00000001)
plot_io(df_rel_to_input, ylabel='IO per input byte',
plot_io(df_rel_to_input,
axes[7],
ylabel='IO per input byte',
ticks=([0, 1, 10, 100, 1000, 10000, 100000, 1000000],
['0', '1B', '10B', '100B', '1KB', '10KB', '100KB', '1MB']),
linthresh=1)
plot_memory(df_rel_to_input, ylabel='Memory per input byte',
plot_memory(df_rel_to_input,
axes[5],
ylabel='Memory per input byte',
ticks=([0, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
['0', '0.001B', '0.01B', '0.1B', '1B', '10B', '100B', '1KB', '10KB']),
linthresh=0.001)

# the benchmark did a certain amount of io. how many bytes per second was this?
df_rel_to_wall = df.copy()
df_rel_to_wall['io_chars'] = (df_rel_to_wall['read_chars'] + df_rel_to_wall['write_chars']) / (df_rel_to_wall['wall_time'])
plot_io(df_rel_to_wall, ylabel='IO bytes per second wall time',
ticks=([0, 1000000, 10000000, 100000000, 1000000000, 10000000000],
['0', '1MB', '10MB', '100MB', '1GB', '10GB']),
linthresh=1000000)
plt.setp(axes[6].get_xticklabels(), visible=True, rotation=60, ha='right')
plt.setp(axes[7].get_xticklabels(), visible=True, rotation=60, ha='right')

plt.rcParams.update({
"text.usetex": True,
"font.family": "serif",
"font.serif": ["Times New Roman"], # Replace with your LaTeX font if different
})
plt.tight_layout()
if output_dir:
plt.savefig(name('trellis'))
else:
plt.show()

if __name__ == '__main__':
main()
parser = argparse.ArgumentParser(description='Generate dynamic characterization plots.')
parser.add_argument('output_dir', nargs='?', help='Directory to save the plots as PDF')
args = parser.parse_args()
main(args.output_dir)
31 changes: 24 additions & 7 deletions infrastructure/viz/syntax.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from all_scripts import get_all_scripts
from project_root import get_project_root
import argparse

# Data format example:
# covid-mts/scripts/1.sh,command(cat):1;quoted_control:5;command(sed):1;command(cut):2;command(sort):2;command(uniq):1;command(awk):1;pipeline:1
Expand Down Expand Up @@ -100,7 +101,7 @@ def get_map_df():
]
return pd.DataFrame(items, columns=['script', 'benchmark'])

def node_heatmap(df):
def node_heatmap(df, outdir=None):
# todo which of these are missing entirely?
#unique_node_names = list(set(df['nodes'].apply(lambda x: [x for x in x.keys()]).sum()))

Expand All @@ -122,15 +123,28 @@ def node_heatmap(df):
heatmap_data = heatmap_data.loc[[x for x in heatmap_data.index if x not in node_order] + list(reversed(node_order))]
annot_data = annot_data.loc[[x for x in annot_data.index if x not in node_order] + list(reversed(node_order))]

plt.figure(figsize=(50, 8))
sns.heatmap(heatmap_data, cmap='Reds', annot=annot_data, fmt='', cbar_kws={'label': 'Occurrences (* denotes more than 5)'})
plt.figure(figsize=(8, 6))
sns.heatmap(heatmap_data,
cmap='Reds',
annot=annot_data,
fmt='',
cbar_kws={'label': 'Occurrences (* denotes more than 5)'})
# sns.clustermap(heatmap_data, col_cluster=False, cmap='Reds', annot=annot_data, fmt='', cbar_kws={'label': 'Occurrences (* denotes more than 5)'})
plt.xlabel('')
plt.xticks(rotation=60, ha='right')
plt.ylabel('')
plt.title('')
plt.subplots_adjust(bottom=0.15)
plt.show()
plt.tight_layout()
if outdir:
plt.rcParams.update({
"text.usetex": True,
"font.family": "serif",
"font.serif": ["Times New Roman"], # Replace with your LaTeX font if different
})
plt.savefig(os.path.join(outdir, 'bensh-stx-analysis.pdf'))
else:
plt.show()

def extract_special_command(node):
for sc in special_commands:
Expand Down Expand Up @@ -163,9 +177,12 @@ def read_data(merge_commands=True):

return (df, bench_df)

def main():
def main(outdir=None):
_, df = read_data()
node_heatmap(df)
node_heatmap(df, outdir)

if __name__ == '__main__':
main()
parser = argparse.ArgumentParser(description='Generate node heatmap.')
parser.add_argument('output_dir', nargs='?', help='Directory to save the plot as PDF')
args = parser.parse_args()
main(args.output_dir)

0 comments on commit e82edd9

Please sign in to comment.