
Commit edca591 (parent a99cd5c)

Add initial training flow, with working single and multi-GPU (DP)

23 files changed: +589 / -241 lines

.flake8 (+2)

```diff
@@ -2,3 +2,5 @@
 exclude = .git
 max-line-length = 119
 ignore = E203, E501, W503, W605
+per-file-ignores =
+    */__init__.py: F401
```

ARCHITECTURE.md (+10)

```diff
@@ -1 +1,11 @@
 # Mistral Architecture
+
+Sidd will write this up later -- essentially, it's probably worth walking through Hydra setup and general architectural
+and design choices.
+
+Might be a good way to establish general design patterns that will be helpful in the long-term.
+
+## Configuration
+
+Configuration is hard, especially with something as monolithic as trying to keep track of all the possible Hugging Face
+trainer configurations; to this end we use
```

CONTRIBUTING.md (+4)

```diff
@@ -1,5 +1,9 @@
 # Contributing to Mistral
 
+TL;DR: Follow the Quickstart in the README and make sure to `pre-commit install`!
+
+---
+
 Mostly a work in progress - Sidd/Laurel will fill in with necessary information. Generally, get folks set up with
 style and testing (:yikes:) pipeline, PR flow, etc.
```

Makefile (+30)

```diff
@@ -1 +1,31 @@
 .PHONY: help serialize-env check autoformat
+.DEFAULT: help
+
+# Create Valid Architectures
+ARCHITECTURES := cpu gpu
+
+# Generates a useful overview/help message for various make features - add to this as necessary!
+help:
+	@echo "make serialize-env arch=<ID>"
+	@echo "    After (un)installing dependencies, dump environment.yaml for arch :: < cpu | gpu >"
+	@echo "make check"
+	@echo "    Run code style and linting (black, flake, isort) *without* changing files!"
+	@echo "make autoformat"
+	@echo "    Run code styling (black, isort) and update in place - committing with pre-commit also does this."
+
+serialize-env:
+ifneq ($(filter $(arch),$(ARCHITECTURES)),)
+	rm -f environments/environment-$(arch).yaml
+	conda env export --no-builds | grep -v "^prefix: " > environments/environment-$(arch).yaml
+else
+	@echo "Argument 'arch' is not set - try calling 'make serialize-env arch=<ID>' with ID = < cpu | gpu >"
+endif
+
+check:
+	isort --check .
+	black --check .
+	flake8 .
+
+autoformat:
+	isort --atomic .
+	black .
```

README.md (+56 / -24)

````diff
@@ -12,16 +12,16 @@ A Project Mercury Endeavor.
 
 If contributing to this repository, please make sure to do the following:
 
-+ Read the instructions in [`CONTRIBUTING.md`](./CONTRIBUTING.md)
++ Read the instructions in [`CONTRIBUTING.md`](./CONTRIBUTING.md) - Notably, before committing to the repository, *make
+  sure to set up your dev environment and pre-commit install (`pre-commit install`)!*
 
 + Install and activate the Conda Environment using the `QUICKSTART` instructions below.
 
 + On installing new dependencies (via `pip` or `conda`), please make sure to update the `environment-<ID>.yaml` files
   via the following command (note that you need to separately create the `environment-cpu.yaml` file by exporting from
   your local development environment!):
 
-`rm environments/environment-<ID>.yaml; conda env export --no-builds |
-grep -v "^prefix: " > environments/environment-<ID>.yaml`
+`make serialize-env arch=<cpu | gpu>`
 
 ---
 
@@ -32,40 +32,69 @@ Clones `mistral` to the working directory, then walks through dependency setup,
 `transformers` repo, you may have to refresh the `transformers` install via `pip install git+https://github.com
 /huggingface/transformers`. On any shared resources (NLP Cluster, DGX Boxes) @Sidd will monitor this.
 
-### GPU & Cluster Environments (Shared Resources)
+### Shared NLP Environment (Stanford Folks)
 
-Ensure that you're using the appropriate `environment-<ID>.yaml` file --> if PyTorch doesn't build properly for your
-setup, checking the CUDA Toolkit is usually a good place to start. We have `environment-<ID>.yaml` files for CUDA
-10.1, 11 (and any additional support can be added -- file an issue if necessary).
+Note for @Stanford folks - the NLP Cluster (with the DGX Boxes pending) have all of the following Conda environments
+already set up - the only necessary steps are cloning the repo, activating the appropriate env, and running the
+`pre-commit install` command.
 
----
+#### Interactive Session (from a Jagupard Machine) -- Direct Development on Cluster
 
-## Start-Up (from Scratch)
+```bash
+cd /nlp/scr/$USER  # Replace $USER with you!
+git clone https://github.com/stanford-mercury/mistral.git
+cd mistral
+conda activate mistral
+pre-commit install  # Important!
+```
 
-Use these commands if you're starting a repository from scratch (this shouldn't be necessary to use this repo, but is
-included for completeness). If you're just trying to run/use this code, look at the Quickstart section above.
+### Local Development - Linux w/ GPU & CUDA 11.0
 
-### GPU & Cluster Environments (CUDA 10.1, 11.0)
+Note: Assumes that `conda` (Miniconda or Anaconda are both fine) is installed and on your path.
 
-CUDA 10.1 & 11.0 (note only CUDA Toolkit dependency version needs to change for building the below).
+Ensure that you're using the appropriate `environment-<gpu | cpu>.yaml` file --> if PyTorch doesn't build properly for
+your setup, checking the CUDA Toolkit is usually a good place to start. We have `environment-<gpu>.yaml` files for CUDA
+11.0 (and any additional CUDA Toolkit support can be added -- file an issue if necessary).
 
 ```bash
-conda create --name mistral-10.1 python=3.8
-conda install pytorch torchvision torchaudio cudatoolkit=10.1 -c pytorch  # CUDA=10.1 on NLP Cluster
-conda install ipython jupyter
+git clone https://github.com/stanford-mercury/mistral.git
+cd mistral
+conda env create -f environments/environment-gpu.yaml  # Choose CUDA Kernel based on Hardware!
+conda activate mistral
+pre-commit install  # Important!
+```
 
-pip install black datasets flake8 h5py hydra-core hydra_colorlog isort matplotlib pre-commit
+### Local Development - CPU (Mac OS & Linux)
 
-# Install Bleeding-Edge Transformers Library!
-pip install git+https://github.com/huggingface/transformers
+Note: Assumes that `conda` (Miniconda or Anaconda are both fine) is installed and on your path. Use the `-cpu`
+environment file.
+
+```bash
+git clone https://github.com/stanford-mercury/mistral.git
+cd mistral
+conda env create -f environments/environment-cpu.yaml
+conda activate mistral
+pre-commit install  # Important!
 ```
 
+---
+
+## Start-Up (from Scratch)
+
+Use these commands if you're starting a repository from scratch (this shouldn't be necessary to use this repo, but is
+included for completeness). If you're just trying to run/use this code, look at the Quickstart section above.
+
+### GPU & Cluster Environments (CUDA 11.0)
+
 ```bash
-conda create --name mistral-11.0 python=3.8
-conda install pytorch torchvision torchaudio cudatoolkit=11.0 -c pytorch  # CUDA=11.0 on DGX Boxes, GCP/AWS
+conda create --name mistral python=3.8
+conda install pytorch torchvision torchaudio cudatoolkit=11.0 -c pytorch  # CUDA=11.0 on most of Cluster!
 conda install ipython jupyter
 
-pip install black datasets flake8 h5py hydra-core hydra_colorlog isort matplotlib pre-commit
+pip install black datasets flake8 h5py isort matplotlib pre-commit
+
+# Install Bleeding-Edge Quinine Library!
+pip install git+https://github.com/krandiash/quinine.git
 
 # Install Bleeding-Edge Transformers Library!
 pip install git+https://github.com/huggingface/transformers
@@ -76,11 +105,14 @@ pip install git+https://github.com/huggingface/transformers
 Similar to the above, but installs the CPU-only versions of Torch and similar dependencies.
 
 ```bash
-conda create --name mistral-cpu python=3.8
+conda create --name mistral python=3.8
 conda install pytorch torchvision torchaudio -c pytorch
 conda install ipython jupyter
 
-pip install black datasets flake8 h5py hydra-core hydra_colorlog isort matplotlib pre-commit
+pip install black datasets flake8 h5py isort matplotlib pre-commit
+
+# Install Bleeding-Edge Quinine Library!
+pip install git+https://github.com/krandiash/quinine.git
 
 # Install Bleeding-Edge Transformers Library!
 pip install git+https://github.com/huggingface/transformers
````

conf/datasets/wikitext103.yaml (new file, +9)

```yaml
# wikitext103.yaml
#   Configuration for WikiText-103 Dataset.
---
dataset:
    id: wikitext
    name: wikitext-103-raw-v1

    # Number of Preprocessing Workers -- TODO 13 :: I have no idea the effect this number has when running distributed!
    num_proc: 4
```
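For orientation, `id` and `name` line up with the two positional arguments of Hugging Face `datasets.load_dataset`, and `num_proc` is the usual knob for parallel workers in `Dataset.map`. A minimal sketch of how a preprocessing step might consume this file; the tokenizer choice and the `preprocess` function are illustrative assumptions, not the repo's actual pipeline:

```python
# Hypothetical sketch: consume conf/datasets/wikitext103.yaml with Hugging Face `datasets`.
import yaml
from datasets import load_dataset
from transformers import AutoTokenizer

with open("conf/datasets/wikitext103.yaml") as f:
    cfg = yaml.safe_load(f)["dataset"]

# `id` and `name` map onto load_dataset's (path, name) arguments.
raw = load_dataset(cfg["id"], cfg["name"])

tokenizer = AutoTokenizer.from_pretrained("gpt2")

def preprocess(examples):
    # Assumed tokenization step; the truncation length would come from the model config.
    return tokenizer(examples["text"], truncation=True, max_length=1024)

# `num_proc` controls the number of preprocessing workers in Dataset.map.
tokenized = raw.map(preprocess, batched=True, num_proc=cfg["num_proc"])
```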

conf/gpt2-config.yaml (new file, +42)

```yaml
# gpt-config.yaml
#   Core GPT-2 Config, currently working with the WikiText-103 Dataset, GPT-2 Small Architecture, and Single-Node
#   Trainer. Inheritance and core paths can all be overridden from the command line or by re-writing these files.
---
# Inherit Dataset, Tokenization, Model, and Training Details
inherit:
    - datasets/wikitext103.yaml
    - models/gpt2-small.yaml
    - trainers/toy.yaml

# Run ID -- defaults to `null`; override as you like!
run_id: null

# Weights & Biases (Set os.environ["WANDB_PROJECT"])
wandb: null

# Artifacts & Caching
artifacts:
    cache_dir: /u/scr/nlp/mercury/mistral/artifacts
    run_dir: /u/scr/nlp/mercury/mistral/runs

# Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this!
#   TODO 8 :: Do we want to dynamically set gradient accumulation based on effective batch size?
bsz: 2

# Resume from Checkpoint
resume: false

# Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL :: Fix w/ TODO 1
log_level: 20

# Top-Level Infrastructure Parameters
infra:
    # Local Rank -- for Distributed Training :: -1 refers to non-distributed training, 0-8 (16?) otherwise
    rank: -1

    # GPUs assumed to be uniform *across* nodes
    nodes: 1
    gpus: 1

# Random Seed
seed: 21
```
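The `inherit` list stitches the dataset, model, and trainer files above into one top-level config, with this file (or the command line) winning on conflicts, and `bsz` records the effective batch size the main code is expected to assert against (per-device batch size x gradient accumulation x number of devices). The actual merging lives in the repo's config tooling; the snippet below is only a rough PyYAML sketch of the idea, and the `load_config` helper is a made-up name:

```python
# Rough, hypothetical sketch of how `inherit` could be resolved -- not the repo's actual mechanism.
from pathlib import Path
import yaml

CONF_DIR = Path("conf")

def load_config(path):
    """Load a YAML file, folding in any `inherit` entries first (shallow merge, parent keys win)."""
    with open(path) as f:
        cfg = yaml.safe_load(f) or {}

    merged = {}
    for child in cfg.pop("inherit", []):
        merged.update(load_config(CONF_DIR / child))  # e.g., conf/datasets/wikitext103.yaml

    merged.update(cfg)  # keys defined in the parent file override inherited ones
    return merged

config = load_config(CONF_DIR / "gpt2-config.yaml")
print(config["dataset"]["id"], config["training_arguments"]["max_steps"], config["bsz"])
```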

conf/models/gpt2-small.yaml (new file, +11)

```yaml
# gpt2-small-config.yaml
#   Configuration for the GPT-2 Small Model.
---
model:
    id: "gpt2-small"

    # Sequence Length
    seq_len: 1024

    # Boolean whether to use the pre-existing Hugging Face AutoTokenizer (or train a new one from scratch)
    pretrained_tokenizer: True
```
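Here `pretrained_tokenizer` toggles between reusing the stock Hugging Face GPT-2 tokenizer and training a new one, while `seq_len` would set the model's positional budget. A small sketch under those assumptions; the hub id `"gpt2"` and the from-scratch branch are illustrative, not confirmed by this commit:

```python
# Hypothetical sketch of consuming conf/models/gpt2-small.yaml -- illustrative, not the repo's code.
import yaml
from transformers import AutoTokenizer, GPT2Config, GPT2LMHeadModel

with open("conf/models/gpt2-small.yaml") as f:
    model_cfg = yaml.safe_load(f)["model"]

if model_cfg["pretrained_tokenizer"]:
    # Reuse the existing Hugging Face GPT-2 tokenizer.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
else:
    # Otherwise a new tokenizer would be trained from the corpus (omitted in this sketch).
    raise NotImplementedError("Training a tokenizer from scratch is out of scope here.")

# GPT-2 Small sizes are the GPT2Config defaults; only the sequence length is taken from the config.
config = GPT2Config(n_positions=model_cfg["seq_len"])
model = GPT2LMHeadModel(config)
```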

conf/trainers/toy.yaml (new file, +59)

```yaml
# toy.yaml
#   Toy trainer config for Single-GPU training, with a fixed batch size of 2 (with gradient accumulation).
#   This contract exactly follows that of HF.TrainingArguments so we can pass as a simple **kwargs -- make sure this
#   continues to stay valid!
#   Reference: https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments
---
training_arguments:
    # Overwrite from Top-Level Config
    output_dir: null

    # Generally sticks to order from HF.TrainingArguments() Docs, skipping over sane defaults/implicitly set args...
    do_train: true
    evaluation_strategy: steps

    # Set these based on GPU RAM available...
    per_device_train_batch_size: 2
    per_device_eval_batch_size: 4

    # TODO 9 :: Set this dynamically?
    gradient_accumulation_steps: 4

    # TODO 10 :: Unclear what a good value is here -- this is somewhat arbitrary...
    eval_accumulation_steps: 8

    # Learning Rate & Optimization Parameters, assumes AdamW -- TODO 11 :: Check these and then double check them!
    learning_rate: 5.0e-5
    weight_decay: 0.01
    adam_beta1: 0.9
    adam_beta2: 0.999
    adam_epsilon: 1.0e-8

    # Gradient Norm
    max_grad_norm: 1.0

    # Maximum Training Steps (Overrides epochs!) -- TODO 12 :: Check this!
    max_steps: 50

    # LR Scheduling Parameters -- TODO 13 :: Check these and then double check them!
    lr_scheduler_type: cosine
    warmup_steps: 10

    # Logging Parameters -- Logging Directory (Tensorboard - is this necessary?) should be Overwritten at Runtime!
    run_name: null
    logging_dir: null
    logging_first_step: True
    logging_steps: 10

    # Saving and Evaluation Steps
    eval_steps: 10
    save_steps: 10

    # Seeds -- Should be Overwritten at Runtime!
    seed: null

    ### Optimization -- Precision, DeepSpeed, and FairScale Parameters -- all off for `simple` config
    fp16: False

    # Should be overwritten from the Top-Level Config or CLI!
    local_rank: null
```
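As the header comment states, the `training_arguments` block mirrors `transformers.TrainingArguments` field-for-field so it can be splatted in as plain keyword arguments once the `null` fields owned by the runtime are filled in. A short sketch of that contract; the concrete runtime values below are placeholders, not the repo's real wiring:

```python
# Sketch of the **kwargs contract described in the toy.yaml header comment.
import yaml
from transformers import TrainingArguments

with open("conf/trainers/toy.yaml") as f:
    args = yaml.safe_load(f)["training_arguments"]

# Fields left as `null` in the YAML are owned by the runtime / top-level config, so fill them in first.
args.update(
    output_dir="runs/toy-run",       # placeholder; would be derived from artifacts.run_dir + run_id
    run_name="toy-run",
    logging_dir="runs/toy-run/logs",
    seed=21,                         # from the top-level config
    local_rank=-1,                   # -1 => non-distributed
)

# Because every key matches the TrainingArguments signature, the dict passes straight through.
training_args = TrainingArguments(**args)
print(training_args.max_steps, training_args.learning_rate)
```

Keeping the YAML keys identical to the `TrainingArguments` signature is what makes the `**kwargs` hand-off safe; any key that drifts from the upstream API fails loudly at construction time rather than silently being ignored.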
