
Commit 4373fa4

finish updating gpt2- --> mistral-

1 parent f8bb469 commit 4373fa4

18 files changed: +31 -110 lines

README.md (+2 -2)

@@ -40,7 +40,7 @@ Environments and non-Python dependencies can be managed with conda, and Python d
 
 #### Prerequisites
 
-First, make sure to update `conf/tutorial-gpt2-micro.yaml` with the directories you want to store the Hugging Face
+First, make sure to update `conf/mistral-micro.yaml` with the directories you want to store the Hugging Face
 cache and model runs.
 
 ```
@@ -59,7 +59,7 @@ For single-node single-gpu training, run:
 ```bash
 conda activate mistral
 cd mistral
-CUDA_VISIBLE_DEVICES=0 python train.py --config conf/tutorial-gpt2-micro.yaml --nnodes 1 --nproc_per_node 1 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 2 --run_id tutorial-gpt2-micro
+CUDA_VISIBLE_DEVICES=0 python train.py --config conf/mistral-micro.yaml --nnodes 1 --nproc_per_node 1 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 2 --run_id tutorial-gpt2-micro
 ```
 
 #### Multi-node multi-GPU training with DeepSpeed
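The prerequisite in the first hunk amounts to pointing two directories in `conf/mistral-micro.yaml` at storage you control. A minimal sketch of that block, assuming the same `artifacts` keys (`cache_dir`, `run_dir`) that appear in the test configs later in this commit; the paths are placeholders:

```yaml
# Hypothetical sketch of the artifacts block in conf/mistral-micro.yaml
artifacts:
  cache_dir: /path/to/hf-cache    # where the Hugging Face cache is stored
  run_dir: /path/to/runs          # where checkpoints and logs for each run are written
```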

conf/archive/v1/gpt2-debug-config.yaml (+1 -1)

@@ -6,7 +6,7 @@
 inherit:
 - datasets/openwebtext.yaml
 - models/gpt2-small.yaml
-- trainers/gpt2-small-short.yaml
+- trainers/debug.yaml
 
 # Run ID -- defaults to `null`; override as you like!
 run_id: null

conf/mistral-medium.yaml (+2 -2)

@@ -1,5 +1,5 @@
-# gpt2-mistral-small-config.yaml
-# Full Mistral GPT-2 Small Training Config, currently working with the OpenWebText Dataset, GPT-2 Small Architecture,
+# mistral-medium-config.yaml
+# Full Mistral GPT-2 Medium Training Config, currently working with the OpenWebText Dataset, GPT-2 Small Architecture,
 # and full batch size (512). Runs with DeepSpeed ZeRO-2, with a per-device BSZ of 16.
 #
 # Inheritance and core paths can all be overridden from the command line or by re-writing these files.

conf/mistral-micro.yaml (+1 -1)

@@ -1,4 +1,4 @@
-# tutorial-gpt2-micro.yaml
+# mistral2-micro.yaml
 # Demo GPT-2 Micro Training Config, currently working with the WikiText103 Dataset, GPT-2 Micro Architecture,
 # and batch size of 2. Runs with DeepSpeed ZeRO-2, with a per-device BSZ of 2.
 #

conf/mistral-small.yaml (+1 -1)

@@ -1,4 +1,4 @@
-# gpt2-mistral-small-config.yaml
+# mistral-small-config.yaml
 # Full Mistral GPT-2 Small Training Config, currently working with the OpenWebText Dataset, GPT-2 Small Architecture,
 # and full batch size (512). Runs with DeepSpeed ZeRO-2, with a per-device BSZ of 16.
 #

conf/models/mistral-medium.yaml (+1 -1)

@@ -1,4 +1,4 @@
-# gpt2-medium-config.yaml
+# mistral-medium-config.yaml
 # Configuration for the GPT-2 Medium Model.
 ---
 model:

conf/models/mistral-micro.yaml (+1 -1)

@@ -1,4 +1,4 @@
-# gpt2-micro-config.yaml
+# mistral-micro-config.yaml
 # Configuration for the GPT-2 Micro Model.
 ---
 model:

conf/models/mistral-small.yaml (+1 -1)

@@ -1,4 +1,4 @@
-# gpt2-small-config.yaml
+# mistral-small.yaml
 # Configuration for the GPT-2 Small Model.
 ---
 model:

conf/trainers/gpt2-medium.yaml (+1 -1)

@@ -1,4 +1,4 @@
-# gpt2-small.yaml
+# gpt2-medium.yaml
 # Trainer config for Full GPT-2 Medium, with the full fixed batch size of 512 (with gradient accumulation).
 # This contract exactly follows that of HF.TrainingArguments so we can pass as a simple **kwargs -- make sure this
 # continues to stay valid!

conf/tutorial-shakespeare-gpt2-micro.yaml (+1 -1)

@@ -7,7 +7,7 @@
 # Inherit Dataset, Tokenization, Model, and Training Details
 inherit:
 - datasets/shakespeare.yaml
-- models/gpt2-micro.yaml
+- models/mistral-micro.yaml
 - trainers/gpt2-small-short.yaml
 
 # Run ID -- make sure to override!

docs/getting_started/config.rst (+6 -6)

@@ -9,20 +9,20 @@ Configurations are specified using the `Quinine <https://github.com/krandiash/qu
 Quinine allows users to integrate multiple config files and layer configs on top of each other.
 It is designed for machine learning projects with large sets of nested hyperparameters.
 
-The easiest way to understand Quinine is to study ``conf/tutorial-gpt2-micro.yaml`` which is presented below.
+The easiest way to understand Quinine is to study ``conf/mistral-micro.yaml`` which is presented below.
 
 This config specifies a variety of settings, and draws configurations from ``conf/datasets/wikitext103.yaml``,
-``conf/models/gpt2-micro.yaml`` and ``conf/trainers/gpt2-small.yaml``. This allows for clean separation of the
+``conf/models/mistral-micro.yaml`` and ``conf/trainers/gpt2-small.yaml``. This allows for clean separation of the
 configs for the dataset (e.g. name or number of pre-processing workers), the model (e.g. number of layers),
 and the trainer (e.g. learning rate), while high level configs are specified in the main config file.
 
-Most of the defaults in ``conf/tutorial-gpt2-micro.yaml`` will work, but you will need to change
+Most of the defaults in ``conf/mistral-micro.yaml`` will work, but you will need to change
 the Weights & Biases settings and specify the artifacts directories ``cache_dir`` and ``run_dir``.
 
-Example config: tutorial-gpt2-micro.yaml
+Example config: mistral-micro.yaml
 ----------------------------------------
 
-``conf/tutorial-gpt2-micro.yaml`` is a basic configuration file that can be used for an introductory training run
+``conf/mistral-micro.yaml`` is a basic configuration file that can be used for an introductory training run
 
-.. include:: ../../conf/tutorial-gpt2-micro.yaml
+.. include:: ../../conf/mistral-micro.yaml
 :literal:
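To make the layering concrete, here is a hedged sketch of the inherit block the docs describe for ``conf/mistral-micro.yaml``; the file names are taken from the paragraph above, and the real config may arrange them differently:

```yaml
# Hypothetical sketch of a layered Quinine config (conf/mistral-micro.yaml)
inherit:
- datasets/wikitext103.yaml    # dataset settings, e.g. name, number of pre-processing workers
- models/mistral-micro.yaml    # model settings, e.g. number of layers
- trainers/gpt2-small.yaml     # trainer settings, e.g. learning rate

# High-level settings such as the run id, Weights & Biases project,
# and the artifacts directories (cache_dir, run_dir) stay in this main file.
run_id: null
```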

docs/getting_started/evaluate.rst (+2 -2)

@@ -11,11 +11,11 @@ To run evaluation, use this command: ::
 
 cd mistral
 conda activate mistral
-CUDA_VISIBLE_DEVICES=0 python train.py --config conf/tutorial-gpt2-micro.yaml --nnodes 1 --nproc_per_node 1 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 2 --model.initial_weights /path/to/runs/my-run/checkpoint-400000 --run_training False
+CUDA_VISIBLE_DEVICES=0 python train.py --config conf/mistral-micro.yaml --nnodes 1 --nproc_per_node 1 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 2 --model.initial_weights /path/to/runs/my-run/checkpoint-400000 --run_training False
 
 This will skip the training process and run a final evaluation, initializing from the weights of the checkpoint.
 
-To evaluate a particular model, you need to supply the same config that was used to train the model (e.g. ``conf/tutorial-gpt2-micro.yaml``) in this example.
+To evaluate a particular model, you need to supply the same config that was used to train the model (e.g. ``conf/mistral-micro.yaml``) in this example.
 
 Example Output
 --------------

docs/getting_started/train.rst (+2 -2)

@@ -5,15 +5,15 @@ Training "Hello World"
 ----------------------
 
 You should now be ready to launch a demo training run. There are example
-configurations for training on WikiText-103 in ``conf/tutorial-gpt2-micro.yaml``. You
+configurations for training on WikiText-103 in ``conf/mistral-micro.yaml``. You
 will need to update the artifacts directories and the wandb settings in this file before
 running training.
 
 To launch a training run, use this command (found in ``scripts/run/single-node.sh``) ::
 
 cd mistral
 conda activate mistral
-CUDA_VISIBLE_DEVICES=0 python train.py --config conf/tutorial-gpt2-micro.yaml --nnodes 1 --nproc_per_node 1 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 2
+CUDA_VISIBLE_DEVICES=0 python train.py --config conf/mistral-micro.yaml --nnodes 1 --nproc_per_node 1 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 2
 
 You may need to adjust your batch size depending on the available GPU memory.
 
docs/tutorials/gcp_plus_kubernetes.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ The demo script ``gcp/run-demo-job.sh`` simply launches training with DeepSpeed:
199199
.. include:: ../../gcp/run-demo-job.sh
200200
:literal:
201201

202-
Make sure to update ``conf/tutorial-gpt2-micro.yaml`` to include your project specific values for Weights & Biases
202+
Make sure to update ``conf/mistral-micro.yaml`` to include your project specific values for Weights & Biases
203203
and the directories to store the cache and models, as described in the :doc:`Configuration section<../getting_started/config>`.
204204

205205
You can learn more about DeepSpeed training in the :doc:`DeepSpeed tutorial<deepspeed>`.

docs/tutorials/generate.rst (+1 -1)

@@ -18,7 +18,7 @@ Run run_generation.py With Your Model
 -------------------------------------
 
 As your model training runs, it should save checkpoints with all of the model resources in the directory
-you specified with ``articfacts.run_dir`` in the ``conf/tutorial-gpt2-micro.yaml`` config file.
+you specified with ``artifacts.run_dir`` in the ``conf/mistral-micro.yaml`` config file.
 
 For this example, lets assume you have saved the checkpoints in ``/home/tutorial-gpt2-micro/runs/run-1``. If you trained
 for 400000 steps, you should have a corresponding checkpoint at ``/home/tutorial-gpt2-micro/runs/run-1/checkpoint-400000``.

tests/conf/train-diff.yaml (+1 -34)

@@ -6,29 +6,14 @@
 ---
 # Inherit Dataset, Tokenization, Model, and Training Details
 inherit:
-- datasets/wikitext2-detokenized.yaml
-- models/gpt2-micro.yaml
+- train.yaml
 - trainers/gpt2-small-diff.yaml
 
-# Run ID -- make sure to override!
-run_id: null
-
-# Weights & Biases
-wandb: hello-world
-group: gpt2-small
-
 # Artifacts & Caching
 artifacts:
 cache_dir: /nlp/scr/jebolton/mistral-hello-world/artifacts
 run_dir: /nlp/scr/jebolton/mistral-hello-world/runs
 
-# Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this!
-effective_bsz: 16
-
-# Resume from Checkpoint
-resume: false
-resume_checkpoint: null
-
 # List of frequencies at which to save checkpoints, provided as a list of two-element tuples:
 # - Frequency (`freq`) at which to save checkpoints (# steps)
 # - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`)
@@ -38,26 +23,8 @@ checkpoint_frequency:
 - [100, 20000]
 - [1000, 400000]
 
-# `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch`
-local_rank: -1
-nnodes: -1
-nproc_per_node: -1
-
-# DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed`
-num_gpus: -1
-num_nodes: -1
-world_size: -1
-
-# Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL
-log_level: 20
-
 # Random Seed
 seed: 40
 
-online_eval:
-do_wikitext: false
-do_lambada: false
-stride: 256
-
 run_training: false
 run_final_eval: false
tests/conf/trainers/gpt2-small-diff.yaml (+5 -51)

@@ -1,67 +1,21 @@
-# gpt2-small.yaml
+# gpt2-small-diff.yaml
 # Trainer config for Full GPT-2 Small, with the full fixed batch size of 512 (with gradient accumulation).
 # This contract exactly follows that of HF.TrainingArguments so we can pass as a simple **kwargs -- make sure this
 # continues to stay valid!
 # Reference: https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments
 ---
-training_arguments:
-# Overwrite from Top-Level Config
-output_dir: null
-
-# Generally sticks to order from HF.TrainingArguments() Docs, skipping over sane defaults/implicitly set args...
-do_train: true
-evaluation_strategy: steps
-
-# Set these based on GPU RAM/your available hardware
-per_device_train_batch_size: 8
-per_device_eval_batch_size: 16
-
-# We set this dynamically based on DDP Computation [steps = effective_batch / (per_gpu_batch * gpus * nodes)]
-gradient_accumulation_steps: null
 
-# For Online Evaluation, only keep around the Losses
-prediction_loss_only: true
+inherit:
+- gpt2-small.yaml
 
+training_arguments:
 # Learning Rate & Optimization Parameters, assumes AdamW
-learning_rate: 0.0006
 weight_decay: 0.2
 adam_beta1: 0.7
 adam_beta2: 0.3
-adam_epsilon: 1.0e-8
 
 # Gradient Norm
 max_grad_norm: 2.0
 
 # Maximum Training Steps (Overrides epochs!)
-max_steps: 100000
-
-# LR Scheduling Parameters -- Warmup Steps should be 1% of total steps (Could use ratio)
-lr_scheduler_type: linear # Cosine not supported if we want to use DeepSpeed Optimizers (gets overwritten!)
-warmup_steps: 4000
-
-# Logging Parameters -- Logging Directory (Tensorboard - is this necessary?) should be Overwritten at Runtime!
-run_name: null
-logging_dir: null
-logging_first_step: true
-logging_steps: 50
-
-# Saving and Evaluation Steps
-eval_steps: 1000
-save_steps: 1000
-
-# Resume Behavior --> ignore "full determinism" on resume (saves time for debugging)
-ignore_data_skip: false
-
-# Seeds -- Should be Overwritten at Runtime!
-seed: null
-
-### Optimization -- Precision, DeepSpeed, and FairScale Parameters -- all off for `simple` config
-fp16: true
-sharded_ddp: null
-deepspeed: null
-
-# Dataloader Parallelism
-dataloader_num_workers: 0
-
-# Should be overwritten from the Top-Level Config or CLI!
-local_rank: null
+max_steps: 100000
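The two test configs above illustrate the pattern this commit settles on: a small "diff" config inherits a complete base config and overrides only the keys that change. A hedged sketch of that pattern, using values from the diff above and a hypothetical file name:

```yaml
# my-trainer-diff.yaml -- hypothetical diff-style config
# Pull in every default from the base trainer, then override a handful of keys.
inherit:
- gpt2-small.yaml               # full base trainer config

training_arguments:
  weight_decay: 0.2             # overrides the base value
  adam_beta1: 0.7
  adam_beta2: 0.3
  max_grad_norm: 2.0
  max_steps: 100000             # everything not listed here comes from gpt2-small.yaml
```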

tutorials/custom-dataset/README.md (+1 -1)

@@ -45,7 +45,7 @@ typically done at the top in the inherit section. For example,
 # Inherit Dataset, Tokenization, Model, and Training Details
 inherit:
 - datasets/pubmed_local.yaml
-- models/gpt2-small.yaml
+- models/mistral-small.yaml
 - trainers/gpt2-small.yaml
 ```
 