
Commit a229b45

XuehaiPan authored and pytorchmergebot committed
[BE] Prefer dash over underscore in command-line options (pytorch#94505)
Prefer dashes over underscores in command-line options: add `--command-arg-name` to each argument parser while keeping the old underscore spelling `--command_arg_name` as an alias for backward compatibility.

Both dashes and underscores are currently used across the PyTorch codebase, and some argument parsers accept only one of the two. For example, the `torchrun` utility for distributed training only accepts underscore arguments (e.g., `--master_port`). Dashes are the more common convention in other command-line tools, and they appear to be the default choice in the Python standard library: `argparse.BooleanOptionalAction` (https://github.com/python/cpython/blob/4a9dff0e5adc91cbb1ed68c495dac64ccfe608bd/Lib/argparse.py#L893-L895) generates `--no-argname`, not `--no_argname`:

```python
class BooleanOptionalAction(Action):
    def __init__(...):
        if option_string.startswith('--'):
            option_string = '--no-' + option_string[2:]
            _option_strings.append(option_string)
```

Typing `_` also requires the Shift (or Caps Lock) key, whereas `-` does not.

Pull Request resolved: pytorch#94505
Approved by: https://github.com/ezyang, https://github.com/seemethere
1 parent a635246 commit a229b45
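Not part of the commit itself, but a minimal sketch of the `argparse` behavior the change relies on: a single `add_argument` call may register several option strings, and `dest` is derived from the first long option (leading `--` stripped, remaining dashes turned into underscores), so code that reads the underscore attribute keeps working. The `--batch-size` flag below is just an illustrative example.

```python
import argparse

# Sketch only: one argument registered under both spellings.
parser = argparse.ArgumentParser()
parser.add_argument("--batch-size", "--batch_size", type=int, default=32)

# dest comes from the first long option: "--batch-size" -> "batch_size".
assert parser.parse_args(["--batch-size", "8"]).batch_size == 8   # new spelling
assert parser.parse_args(["--batch_size", "8"]).batch_size == 8   # old spelling still accepted
```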

91 files changed (+631, -456 lines)


benchmarks/distributed/rpc/parameter_server/launcher.py (+16)

```diff
@@ -448,11 +448,13 @@ def main(args):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="RPC server Benchmark")
     parser.add_argument(
+        "--master-addr",
         "--master_addr",
         type=str,
         help="IP address of the machine that will host the process with rank 0"
     )
     parser.add_argument(
+        "--master-port",
         "--master_port",
         type=str,
         help="A free port on the machine that will host the process with rank 0"
@@ -493,6 +495,7 @@ def main(args):
         help="cudaserver count for benchmark run"
     )
     parser.add_argument(
+        "--rpc-timeout",
         "--rpc_timeout",
         type=int,
         help="timeout in seconds to use for RPC"
@@ -508,6 +511,7 @@ def main(args):
         help="epoch count for training"
     )
     parser.add_argument(
+        "--batch-size",
         "--batch_size",
         type=int,
         help="number of training examples used in one iteration"
@@ -523,62 +527,74 @@ def main(args):
         help="id for model configuration"
     )
     parser.add_argument(
+        "--data-config-path",
         "--data_config_path",
         type=str,
         help="path to data configuration file"
     )
     parser.add_argument(
+        "--model-config-path",
         "--model_config_path",
         type=str,
         help="path to model configuration file"
     )
     parser.add_argument(
+        "--server-config-path",
         "--server_config_path",
         type=str,
         help="path to server configuration file"
     )
     parser.add_argument(
+        "--trainer-config-path",
         "--trainer_config_path",
         type=str,
         help="path to trainer configuration file"
     )
     parser.add_argument(
+        "--torch-seed",
         "--torch_seed",
         type=int,
         help="seed for generating random numbers to a non-deterministic random number"
     )
     parser.add_argument(
+        "--cuda-seed",
         "--cuda_seed",
         type=int,
         help="seed for generating random numbers to a random number for the current GPU"
     )
     parser.add_argument(
+        "--preprocess-data",
         "--preprocess_data",
         type=str,
         help="this function will be used to preprocess data before training"
     )
     parser.add_argument(
+        "--create-criterion",
         "--create_criterion",
         type=str,
         help="this function will be used to create the criterion used for model loss calculation"
     )
     parser.add_argument(
+        "--create-ddp-model",
         "--create_ddp_model",
         type=str,
         help="this function will be used to create the ddp model used during training"
     )
     parser.add_argument(
+        "--hook-state",
         "--hook_state",
         type=str,
         help="this will be the state class used when registering the ddp communication hook"
     )
     parser.add_argument(
+        "--ddp-hook",
         "--ddp_hook",
         type=str,
         default="allreduce_hook",
         help="ddp communication hook"
     )
     parser.add_argument(
+        "--iteration-step",
         "--iteration_step",
         type=str,
         help="this will be the function called for each iteration of training"
```

benchmarks/distributed/rpc/rl/README.md (+1, -1)

````diff
@@ -20,7 +20,7 @@ This benchmark depends on PyTorch.
 
 For any environments you are interested in, pass the corresponding arguments to `python launcher.py`.
 
-```python launcher.py --world_size="10,20" --master_addr="127.0.0.1" --master_port="29501 --batch="True" --state_size="10-20-10" --nlayers="5" --out_features="10" --output_file_path="benchmark_report.json"```
+```python launcher.py --world-size="10,20" --master-addr="127.0.0.1" --master-port="29501 --batch="True" --state-size="10-20-10" --nlayers="5" --out-features="10" --output-file-path="benchmark_report.json"```
 
 Example Output:
 
````

benchmarks/distributed/rpc/rl/launcher.py (+6, -6)

```diff
@@ -29,15 +29,15 @@ def str2bool(v):
 
 
 parser = argparse.ArgumentParser(description='PyTorch RPC RL Benchmark')
-parser.add_argument('--world_size', type=str, default='10')
-parser.add_argument('--master_addr', type=str, default='127.0.0.1')
-parser.add_argument('--master_port', type=str, default='29501')
+parser.add_argument('--world-size', '--world_size', type=str, default='10')
+parser.add_argument('--master-addr', '--master_addr', type=str, default='127.0.0.1')
+parser.add_argument('--master-port', '--master_port', type=str, default='29501')
 parser.add_argument('--batch', type=str, default='True')
 
-parser.add_argument('--state_size', type=str, default='10-20-10')
+parser.add_argument('--state-size', '--state_size', type=str, default='10-20-10')
 parser.add_argument('--nlayers', type=str, default='5')
-parser.add_argument('--out_features', type=str, default='10')
-parser.add_argument('--output_file_path', type=str, default='benchmark_report.json')
+parser.add_argument('--out-features', '--out_features', type=str, default='10')
+parser.add_argument('--output-file-path', '--output_file_path', type=str, default='benchmark_report.json')
 
 args = parser.parse_args()
 args = vars(args)
```
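For illustration only (this snippet is not part of the diff): with the aliases added above, either spelling of the RL launcher's flags ends up under the same underscore key once the namespace is converted with `vars(args)`, so the rest of the script is unchanged.

```python
import argparse

# Hypothetical stand-in for the launcher's parser, reduced to one flag.
parser = argparse.ArgumentParser(description='PyTorch RPC RL Benchmark')
parser.add_argument('--master-port', '--master_port', type=str, default='29501')

# Both spellings populate the same 'master_port' entry.
assert vars(parser.parse_args(['--master-port', '29600']))['master_port'] == '29600'
assert vars(parser.parse_args(['--master_port', '29600']))['master_port'] == '29600'
```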

benchmarks/dynamo/common.py (+10, -2)

```diff
@@ -1520,7 +1520,9 @@ def parse_args(args=None):
         default=False,
         help="use channels last format",
     )
-    parser.add_argument("--batch_size", type=int, help="batch size for benchmarking")
+    parser.add_argument(
+        "--batch-size", "--batch_size", type=int, help="batch size for benchmarking"
+    )
     parser.add_argument(
         "--iterations", type=int, default=2, help="how many iterations to run"
     )
@@ -1651,7 +1653,11 @@ def get_example_inputs(self):
         action="store_true",
         help="exports trace of kineto profiler",
     )
-    parser.add_argument("--profiler_trace_name", help="Overwrites exported trace name")
+    parser.add_argument(
+        "--profiler-trace-name",
+        "--profiler_trace_name",
+        help="Overwrites exported trace name",
+    )
 
     parser.add_argument(
         "--diff-branch",
@@ -1670,6 +1676,7 @@ def get_example_inputs(self):
     )
 
     parser.add_argument(
+        "--cold-start-latency",
         "--cold_start_latency",
         action="store_true",
         help="Use a fresh triton cachedir when running each model, to force cold-start compile.",
@@ -1787,6 +1794,7 @@ def get_example_inputs(self):
         help="Dump convolution input/weight/bias's shape/stride/dtype and other options to json",
     )
     group.add_argument(
+        "--recompile-profiler",
         "--recompile_profiler",
         action="store_true",
         help="Run the dynamo recompilation profiler on each model.",
```

benchmarks/dynamo/distributed.py (+11, -4)

```diff
@@ -121,24 +121,29 @@ def print_compile(gm, ex):
         help="if set to a str, uses dynamo[str] backend. else, eager",
     )
     parser.add_argument("--verbose", action="store_true")
-    parser.add_argument("--batch_size", default=None)
+    parser.add_argument("--batch-size", "--batch_size", default=None)
     parser.add_argument(
         "--torchviz", action="store_true", help="Dump autograd graph with torchviz"
     )
     parser.add_argument("--profile", action="store_true", help="Run the profiler")
-    parser.add_argument("--trace_file", default="profile.json", help="Run the profiler")
+    parser.add_argument(
+        "--trace-file", "--trace_file", default="profile.json", help="Run the profiler"
+    )
     parser.add_argument("--repeat", default=10, help="Repeats for timing run")
     parser.add_argument(
+        "--dynamo-no-optimize-ddp",
         "--dynamo_no_optimize_ddp",
         action="store_true",
         help="Disable dynamo's ddp optimizer (enabled by default)",
     )
     parser.add_argument(
+        "--fsdp-checkpoint",
         "--fsdp_checkpoint",
         action="store_true",
         help="Use gradient checkpointing via model-specific policy",
     )
     parser.add_argument(
+        "--fsdp-wrap",
         "--fsdp_wrap",
         action="store_true",
         help="Apply fsdp to submodules via model-specific policy",
@@ -150,10 +155,12 @@ def print_compile(gm, ex):
 
     model_arg = parser.add_mutually_exclusive_group(required=True)
     model_arg.add_argument(
-        "--torchbench_model", help="name of torchbench model, e.g. hf_Bert"
+        "--torchbench-model",
+        "--torchbench_model",
+        help="name of torchbench model, e.g. hf_Bert",
     )
     model_arg.add_argument(
-        "--toy_model", action="store_true", help="use toy model instead"
+        "--toy-model", "--toy_model", action="store_true", help="use toy model instead"
     )
     args = parser.parse_args()
 
```

benchmarks/dynamo/runner.py (+13, -5)

```diff
@@ -13,10 +13,10 @@
 below) for inference, run them and visualize the logs.
 
 If you want to just print the commands, you could use the following command
--> python benchmarks/runner.py --print_run_commands --suites=torchbench --inference
+-> python benchmarks/runner.py --print-run-commands --suites=torchbench --inference
 
 Similarly, if you want to just visualize the already finished logs
--> python benchmarks/runner.py --visualize_logs --suites=torchbench --inference
+-> python benchmarks/runner.py --visualize-logs --suites=torchbench --inference
 
 If you want to test float16
 -> python benchmarks/runner.py --suites=torchbench --inference --dtypes=float16
@@ -178,11 +178,13 @@ def parse_args():
     # Choose either generation of commands, pretty parsing or e2e runs
     group = parser.add_mutually_exclusive_group(required=False)
     group.add_argument(
+        "--print-run-commands",
         "--print_run_commands",
         action="store_true",
         help="Generate commands and saves them to run.sh",
     )
     group.add_argument(
+        "--visualize-logs",
         "--visualize_logs",
         action="store_true",
         help="Pretty print the log files and draw graphs",
@@ -265,7 +267,11 @@ def parse_args():
         help="Github CLI path",
     )
     parser.add_argument(
-        "--batch_size", type=int, default=None, help="batch size for benchmarking"
+        "--batch-size",
+        "--batch_size",
+        type=int,
+        default=None,
+        help="batch size for benchmarking",
     )
     parser.add_argument(
         "--threads",
@@ -276,12 +282,14 @@ def parse_args():
     )
     launcher_group = parser.add_argument_group("CPU Launcher Parameters")
     launcher_group.add_argument(
+        "--enable-cpu-launcher",
        "--enable_cpu_launcher",
         action="store_true",
         default=False,
         help="Use torch.backends.xeon.run_cpu to get the peak performance on Intel(R) Xeon(R) Scalable Processors.",
     )
     launcher_group.add_argument(
+        "--cpu-launcher-args",
         "--cpu_launcher_args",
         type=str,
         default="",
@@ -370,10 +378,10 @@ def generate_commands(args, dtypes, suites, devices, compilers, output_dir):
             "inductor",
             "inductor_no_cudagraphs",
         ):
-            cmd = f"{cmd} --cold_start_latency"
+            cmd = f"{cmd} --cold-start-latency"
 
         if args.batch_size is not None:
-            cmd = f"{cmd} --batch_size {args.batch_size}"
+            cmd = f"{cmd} --batch-size {args.batch_size}"
 
         if args.threads is not None:
             cmd = f"{cmd} --threads {args.threads}"
```

benchmarks/dynamo/test.py (+1, -1)

```diff
@@ -36,7 +36,7 @@ def test_benchmark_infra_runs(self) -> None:
                 "--performance",
                 "--only=BERT_pytorch",
                 "-n1",
-                "--batch_size=1",
+                "--batch-size=1",
             ]
         )
         run(TorchBenchmarkRunner(), args, original_dir)
```

benchmarks/fastrnns/bench.py (+4, -4)

```diff
@@ -209,7 +209,7 @@ def bench_group(model_list, bench_name, bench_group, bench_args):
     parser.add_argument('--warmup', default='10', type=int)
     parser.add_argument('--nloops', default='100', type=int)
     parser.add_argument('--device', default='cuda', type=str)
-    parser.add_argument('--variable_lstms', action='store_true',
+    parser.add_argument('--variable-lstms', '--variable_lstms', action='store_true',
                         help='Also benchmark variable sequence length lstms '
                         'Note that some of these run really slowly '
                         'and that the `seqLength` flag will be ignored.')
@@ -224,9 +224,9 @@ def bench_group(model_list, bench_name, bench_group, bench_args):
                         help='The fuser backend to use. One of: te, old, or none')
     parser.add_argument('--executor', default=None, type=str,
                         help='The executor to use. One of: legacy, simple, profiling')
-    parser.add_argument('--cuda_pointwise_loop_level', default=None, type=int)
-    parser.add_argument('--cuda_pointwise_block_count', default=None, type=int)
-    parser.add_argument('--cuda_pointwise_block_size', default=None, type=int)
+    parser.add_argument('--cuda-pointwise-loop-level', '--cuda_pointwise_loop_level', default=None, type=int)
+    parser.add_argument('--cuda-pointwise-block-count', '--cuda_pointwise_block_count', default=None, type=int)
+    parser.add_argument('--cuda-pointwise-block-size', '--cuda_pointwise_block_size', default=None, type=int)
 
     args = parser.parse_args()
     set_fuser(args.fuser, args.executor)
```

benchmarks/fastrnns/profile.py (+3, -3)

```diff
@@ -95,7 +95,7 @@ def full_profile(rnns, **args):
     for k, v in args.items():
         profile_args.append('--{}={}'.format(k, v))
     profile_args.append('--rnns {}'.format(' '.join(rnns)))
-    profile_args.append('--internal_run')
+    profile_args.append('--internal-run')
 
     outpath = nvprof_output_filename(rnns, **args)
 
@@ -114,15 +114,15 @@ def full_profile(rnns, **args):
     parser.add_argument('--inputSize', default='512', type=int)
     parser.add_argument('--hiddenSize', default='512', type=int)
     parser.add_argument('--miniBatch', default='64', type=int)
-    parser.add_argument('--sleep_between_seconds', default='1', type=int)
+    parser.add_argument('--sleep-between-seconds', '--sleep_between_seconds', default='1', type=int)
     parser.add_argument('--nloops', default='5', type=int)
 
     parser.add_argument('--rnns', nargs='*',
                         help='What to run. cudnn, aten, jit, etc')
 
     # if internal_run, we actually run the rnns.
     # if not internal_run, we shell out to nvprof with internal_run=T
-    parser.add_argument('--internal_run', default=False, action='store_true',
+    parser.add_argument('--internal-run', '--internal_run', default=False, action='store_true',
                         help='Don\'t use this')
     args = parser.parse_args()
     if args.rnns is None:
```

benchmarks/fastrnns/test.py (+2, -2)

```diff
@@ -128,8 +128,8 @@ def test_vl_py(**test_args):
     parser.add_argument('--hiddenSize', default='512', type=int)
     parser.add_argument('--miniBatch', default='64', type=int)
     parser.add_argument('--device', default='cuda', type=str)
-    parser.add_argument('--check_grad', default='True', type=bool)
-    parser.add_argument('--variable_lstms', action='store_true')
+    parser.add_argument('--check-grad', '--check_grad', default='True', type=bool)
+    parser.add_argument('--variable-lstms', '--variable_lstms', action='store_true')
     parser.add_argument('--seed', default='17', type=int)
     parser.add_argument('--verbose', action='store_true')
     parser.add_argument('--rnns', nargs='*',
```
