Static and dynamic viz presentation updates for paper

binpash · Jan 13, 2025 · e82edd9 · e82edd9
1 parent 9aa507e
commit e82edd9
Show file tree

Hide file tree

Showing 2 changed files with 119 additions and 75 deletions.
diff --git a/infrastructure/viz/dynamic.py b/infrastructure/viz/dynamic.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 
+import argparse
 from sys import stderr
 from pathlib import Path
 import pandas as pd
@@ -16,6 +17,8 @@
 data_path = root / 'infrastructure/target/dynamic_analysis.jsonl'
 input_size_path = root / 'infrastructure/data/size_inputs.jsonl'
 
+figsize = (5, 3)
+
 def get_input_sizes_df(df):
     sizes_df = pd.read_json(input_size_path, lines=True)
     def find_input_size(row):
@@ -60,79 +63,70 @@ def get_map_df():
     ]
     return pd.DataFrame(items, columns=['script', 'benchmark'])
 
-def plot_benchmark_times_split(df):
-    sns.set_theme(style="whitegrid")
-    plt.figure(figsize=(10, 6))
-    sns.barplot(x='benchmark', y='user_time', data=df, color='blue', label='User time')
-    sns.barplot(x='benchmark', y='system_time', data=df, color='red', label='System time')
-    plt.xticks(rotation=60, ha='right')
-    plt.subplots_adjust(bottom=0.25)
-    plt.yscale('symlog', linthresh=0.1)
-    plt.legend()
-    plt.show()
-
 def plot_benchmark_times(df,
+                         ax,
+                         legend=True,
                          ticks = ([0, 0.1, 1, 10, 100, 1000, 10000], 
                                   ['0', '0.1s', '1s', '10s', '100s', '1,000s', '10,000s']),
                          ylabel='CPU time',
                          linthresh=0.1):
     sns.set(style="whitegrid")
-    plt.figure(figsize=(10, 6))
-    sns.barplot(x='benchmark', y='time_in_commands', data=df, color='blue', label='Commands')
-    sns.barplot(x='benchmark', y='time_in_shell', data=df, color='green', label='Shell')
-    plt.xticks(rotation=60, ha='right')
-    plt.xlabel('')
-    plt.subplots_adjust(bottom=0.25)
-    plt.yscale('symlog', linthresh=linthresh)
-    plt.yticks(*ticks)
-    plt.ylabel(ylabel)
-    plt.legend()
-    plt.show()
+    ax.grid(True, which='both', axis='y')
+    sns.barplot(x='benchmark', y='time_in_commands', data=df, color='#117733', label='Commands', ax=ax, zorder=3, hatch='//')
+    sns.barplot(x='benchmark', y='time_in_shell', data=df, color='#88CCEE', label='Shell', ax=ax, zorder=3, hatch='\\\\')
+    #ax.set_xticklabels(ax.get_xticklabels(), rotation=60, ha='right')
+    ax.set_xlabel('')
+    ax.set_yscale('symlog', linthresh=linthresh)
+    ax.set_yticks(ticks[0])
+    ax.set_yticklabels(ticks[1])
+    ax.set_ylabel(ylabel)
+    if legend:
+        ax.legend(loc=('best' if legend == True else legend))
+    else:
+        ax.legend().set_visible(False)
 
 def plot_io(df,
+            ax,
             ticks=([0, 100000000, 1000000000, 10000000000, 100000000000, 1000000000000], 
                    ['0', '100MB', '1GB', '10GB', '100GB', '1TB']),
             ylabel='IO bytes',
             linthresh=100000000):
     sns.set(style="whitegrid")
-    plt.figure(figsize=(10, 6))
-    sns.barplot(x='benchmark', y='io_chars', data=df, color='green', label=None)
-    plt.yscale('symlog', linthresh=linthresh)
-    plt.yticks(*ticks)
-    plt.ylabel(ylabel)
-    plt.xticks(rotation=60, ha='right')
-    plt.xlabel('')
-    plt.subplots_adjust(bottom=0.25)
-    plt.show()
+    ax.grid(True, which='both', axis='y')
+    sns.barplot(x='benchmark', y='io_chars', data=df, color='#882255', ax=ax, zorder=3)
+    ax.set_yscale('symlog', linthresh=linthresh)
+    ax.set_yticks(ticks[0])
+    ax.set_yticklabels(ticks[1])
+    ax.set_ylabel(ylabel)
+    #ax.set_xticklabels(ax.get_xticklabels(), rotation=60, ha='right')
+    ax.set_xlabel('')
 
 def plot_memory(df,
+                ax,
                 ticks=([0, 1000000, 10000000, 100000000, 1000000000], 
                        ['0', '1MB', '10MB', '100MB', '1GB']),
-                ylabel='Memory high water mark (bytes)',
+                ylabel='Memory (high water, bytes)',
                 linthresh=1000000):
     sns.set(style="whitegrid")
-    plt.figure(figsize=(10, 6))
-    sns.barplot(x='benchmark', y='max_unique_set_size', data=df, color='purple', label=None)
-    plt.xticks(rotation=60, ha='right')
-    plt.xlabel('')
-    plt.subplots_adjust(bottom=0.25)
-    plt.yscale('symlog', linthresh=linthresh)
-    plt.yticks(*ticks)
-    plt.ylabel(ylabel)
-    plt.show()
+    ax.grid(True, which='both', axis='y')
+    sns.barplot(x='benchmark', y='max_unique_set_size', data=df, color='#CC6677', ax=ax, zorder=3)
+    #ax.set_xticklabels(ax.get_xticklabels(), rotation=60, ha='right')
+    ax.set_xlabel('')
+    ax.set_yscale('symlog', linthresh=linthresh)
+    ax.set_yticks(ticks[0])
+    ax.set_yticklabels(ticks[1])
+    ax.set_ylabel(ylabel)
 
-def plot_time_vs_wall(df):
+def plot_time_vs_wall(df, ax):
     # what fraction of the real (wall) runtime of the process is user or system time?
     df['time_occupied'] = (df['user_time'] + df['system_time']) / df['wall_time']
     sns.set(style="whitegrid")
-    plt.figure(figsize=(10, 6))
-    sns.barplot(x='benchmark', y='time_occupied', data=df, color='blue')
-    plt.subplots_adjust(bottom=0.25)
-    plt.xticks(rotation=60, ha='right')
-    plt.xlabel('')
-    plt.yticks(np.linspace(0, 6, 10))
-    plt.ylabel('Proporion of CPU time to wall time')
-    plt.show()
+    ax.grid(True, which='both', axis='y')
+    sns.barplot(x='benchmark', y='time_occupied', data=df, color='#44AA99', ax=ax, zorder=3)
+    #ax.set_xticklabels(ax.get_xticklabels(), rotation=60, ha='right')
+    ax.set_xlabel('')
+    ax.set_yticks(np.linspace(0, 6, 7))
+    ax.set_ylabel('CPU time / wall time')
 
 dynamic_analysis_script_translations = {
     "riker/scripts/vim/run.sh": "riker/scripts/vim/build.sh",
@@ -190,16 +184,18 @@ def read_data():
 
     return df, bench_df
 
-def main():
+def main(output_dir=None):
     _, df = read_data()
 
+    def name(str):
+        return os.path.join(output_dir, f"bensh-dyn-{str}.pdf") if output_dir else None
 
 
 #     # this metric is bad in the cases that we don't have the data for the inputs
 #     # the benchmark has a specific input on disk that it processed. how much of this input did it process per second?
 #     df['bytes_per_second_input'] = df['input_size'] / (df['wall_time'])
 #     sns.set(style="whitegrid")
-#     plt.figure(figsize=(10, 6))
+#     plt.figure(figsize=figsize)
 #     sns.barplot(x='benchmark', y='bytes_per_second_input', data=df, color='blue', label='Commands')
 #     plt.xticks(rotation=60, ha='right')
 #     plt.title('missing input')
@@ -212,7 +208,7 @@ def main():
     # # fraction of the actual scheduled (user + system) time is in the shell. how much cpu work is in the launched shell
     # df['time_in_shell_frac'] = (df['user_time_in_shell'] + df['system_time_in_shell']) / (df['user_time'] + df['system_time'])
     # sns.set(style="whitegrid")
-    # plt.figure(figsize=(10, 6))
+    # plt.figure(figsize=figsize)
     # sns.barplot(x='benchmark', y='time_in_shell_frac', data=df, color='blue', label='Commands')
     # plt.xticks(rotation=60, ha='right')
     # plt.subplots_adjust(bottom=0.25)
@@ -221,38 +217,69 @@ def main():
     # plt.legend()
     # plt.show()
 
+    num_plots = 8
+    cols = 2
+    rows = num_plots // cols
+    fig, axes = plt.subplots(rows, cols, figsize=(12 , 9), sharex=True)
+    axes = axes.flatten()
 
-    plot_time_vs_wall(df)
-    plot_benchmark_times(df)
-    plot_io(df)
-    plot_memory(df)
+    # the benchmark did a certain amount of io. how many bytes per second was this?
+    df_rel_to_wall = df.copy()
+    df_rel_to_wall['io_chars'] = (df_rel_to_wall['read_chars'] + df_rel_to_wall['write_chars']) / (df_rel_to_wall['wall_time'])
+    plot_io(df_rel_to_wall, 
+            axes[1],
+            ylabel='IO per second wall time',
+            ticks=([0, 1000000, 10000000, 100000000, 1000000000, 10000000000], 
+                   ['0', '1MB', '10MB', '100MB', '1GB', '10GB']),
+            linthresh=1000000)
+
+    plot_time_vs_wall(df, axes[0])
+    plot_benchmark_times(df, axes[2], legend=False)
+    plot_memory(df, axes[4])
+    plot_io(df, axes[6])
 
     df_rel_to_input = df.copy()
     df_rel_to_input['io_chars'] = df_rel_to_input['io_chars'] / df_rel_to_input['input_size']
     df_rel_to_input['max_unique_set_size'] = df_rel_to_input['max_unique_set_size'] / df_rel_to_input['input_size']
     df_rel_to_input['time_in_shell'] = df_rel_to_input['time_in_shell'] / df_rel_to_input['input_size']
     df_rel_to_input['time_in_commands'] = df_rel_to_input['time_in_commands'] / df_rel_to_input['input_size']
 
-    plot_benchmark_times(df_rel_to_input, ylabel='CPU time per input byte',
+    plot_benchmark_times(df_rel_to_input, 
+                         axes[3],
+                         legend=(0.1, 0.65),
+                         ylabel='CPU time per input byte',
                          ticks=([0, 0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01], 
                                 ['0', '10ns', '100ns',    '1us',    '10us', '100us',  '1ms', '10ms']),
                                 linthresh=0.00000001)
-    plot_io(df_rel_to_input, ylabel='IO per input byte',
+    plot_io(df_rel_to_input, 
+            axes[7],
+            ylabel='IO per input byte',
             ticks=([0,    1,   10,     100,    1000,  10000,  100000, 1000000], 
                    ['0', '1B', '10B', '100B', '1KB', '10KB', '100KB', '1MB']),
                    linthresh=1)
-    plot_memory(df_rel_to_input, ylabel='Memory per input byte',
+    plot_memory(df_rel_to_input, 
+                axes[5],
+                ylabel='Memory per input byte',
                 ticks=([0,   0.001,   0.01,     0.1,  1,    10,    100,   1000,  10000], 
                        ['0', '0.001B', '0.01B', '0.1B', '1B', '10B', '100B', '1KB', '10KB']),
                 linthresh=0.001)
 
-    # the benchmark did a certain amount of io. how many bytes per second was this?
-    df_rel_to_wall = df.copy()
-    df_rel_to_wall['io_chars'] = (df_rel_to_wall['read_chars'] + df_rel_to_wall['write_chars']) / (df_rel_to_wall['wall_time'])
-    plot_io(df_rel_to_wall, ylabel='IO bytes per second wall time',
-            ticks=([0, 1000000, 10000000, 100000000, 1000000000, 10000000000], 
-                   ['0', '1MB', '10MB', '100MB', '1GB', '10GB']),
-            linthresh=1000000)
+    plt.setp(axes[6].get_xticklabels(), visible=True, rotation=60, ha='right')
+    plt.setp(axes[7].get_xticklabels(), visible=True, rotation=60, ha='right')
+
+    plt.rcParams.update({
+        "text.usetex": True,
+        "font.family": "serif",
+        "font.serif": ["Times New Roman"],  # Replace with your LaTeX font if different
+    })
+    plt.tight_layout()
+    if output_dir:
+        plt.savefig(name('trellis'))
+    else:
+        plt.show()
 
 if __name__ == '__main__':
-    main()
+    parser = argparse.ArgumentParser(description='Generate dynamic characterization plots.')
+    parser.add_argument('output_dir', nargs='?', help='Directory to save the plots as PDF')
+    args = parser.parse_args()
+    main(args.output_dir)
diff --git a/infrastructure/viz/syntax.py b/infrastructure/viz/syntax.py
@@ -11,6 +11,7 @@
 sys.path.insert(1, os.path.join(sys.path[0], '..'))
 from all_scripts import get_all_scripts
 from project_root import get_project_root
+import argparse
 
 # Data format example:
 # covid-mts/scripts/1.sh,command(cat):1;quoted_control:5;command(sed):1;command(cut):2;command(sort):2;command(uniq):1;command(awk):1;pipeline:1
@@ -100,7 +101,7 @@ def get_map_df():
     ]
     return pd.DataFrame(items, columns=['script', 'benchmark'])
 
-def node_heatmap(df):
+def node_heatmap(df, outdir=None):
     # todo which of these are missing entirely?
     #unique_node_names = list(set(df['nodes'].apply(lambda x: [x for x in x.keys()]).sum()))
 
@@ -122,15 +123,28 @@ def node_heatmap(df):
     heatmap_data = heatmap_data.loc[[x for x in heatmap_data.index if x not in node_order] + list(reversed(node_order))]
     annot_data = annot_data.loc[[x for x in annot_data.index if x not in node_order] + list(reversed(node_order))]
 
-    plt.figure(figsize=(50, 8))
-    sns.heatmap(heatmap_data, cmap='Reds', annot=annot_data, fmt='', cbar_kws={'label': 'Occurrences (* denotes more than 5)'})
+    plt.figure(figsize=(8, 6))
+    sns.heatmap(heatmap_data, 
+                cmap='Reds', 
+                annot=annot_data, 
+                fmt='', 
+                cbar_kws={'label': 'Occurrences (* denotes more than 5)'})
     # sns.clustermap(heatmap_data, col_cluster=False, cmap='Reds', annot=annot_data, fmt='', cbar_kws={'label': 'Occurrences (* denotes more than 5)'})
     plt.xlabel('')
     plt.xticks(rotation=60, ha='right')
     plt.ylabel('')
     plt.title('')
     plt.subplots_adjust(bottom=0.15)
-    plt.show()
+    plt.tight_layout()
+    if outdir:
+        plt.rcParams.update({
+            "text.usetex": True,
+            "font.family": "serif",
+            "font.serif": ["Times New Roman"],  # Replace with your LaTeX font if different
+        })
+        plt.savefig(os.path.join(outdir, 'bensh-stx-analysis.pdf'))
+    else:
+        plt.show()
 
 def extract_special_command(node):
     for sc in special_commands:
@@ -163,9 +177,12 @@ def read_data(merge_commands=True):
 
     return (df, bench_df)
 
-def main():
+def main(outdir=None):
     _, df = read_data()
-    node_heatmap(df)
+    node_heatmap(df, outdir)
 
 if __name__ == '__main__':
-    main()
+    parser = argparse.ArgumentParser(description='Generate node heatmap.')
+    parser.add_argument('output_dir', nargs='?', help='Directory to save the plot as PDF')
+    args = parser.parse_args()
+    main(args.output_dir)