Commit
Merge branch 'inference' of https://github.com/flexflow/FlexFlow into update_legion
jiazhihao committed Aug 12, 2024
2 parents 1fbf673 + 6a1a188 commit 95075c0
Showing 63 changed files with 2,956 additions and 24 deletions.
8 changes: 1 addition & 7 deletions .github/README.md
@@ -4,12 +4,6 @@

---

-## News🔥:
-
-* [09/02/2023] Adding AMD GPU support, released Docker images for ROCM 5.3->5.6
-* [08/16/2023] Adding Starcoder model support
-* [08/14/2023] Released Docker images for different CUDA versions
-
## What is FlexFlow Serve

The high computational and memory requirements of generative large language
@@ -178,7 +172,7 @@ A C++ example is available at [this folder](../inference/spec_infer/). After bui
For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use two collectively boost-tuned LLaMA-68M models for speculative inference.

```bash
-./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion
+./inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion
```
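For reference, a minimal Python sketch of the same speculative-inference setup, modeled on the Python examples elsewhere in this repository. The `ff.init()` keyword names here are assumptions mirroring the command-line flags above, not a verified snippet:

```python
# Hedged sketch: speculative inference via FlexFlow Serve's Python API.
# Keyword names mirror -ll:gpu / -ll:fsize / -ll:zsize / -tensor-parallelism-degree.
import flexflow.serve as ff

ff.init(
    num_gpus=4,
    memory_per_gpu=14000,             # MB per GPU, like -ll:fsize 14000
    zero_copy_memory_per_node=30000,  # MB of zero-copy memory, like -ll:zsize 30000
    tensor_parallelism_degree=4,
)

llm = ff.LLM("meta-llama/Llama-2-7b-hf")  # the large model being served
ssm = ff.SSM("JackFram/llama-68m")        # the small speculative model

llm.compile(ff.GenerationConfig(do_sample=False), ssms=[ssm])
result = llm.generate("Three tips for staying healthy are:")
```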
</details>

41 changes: 28 additions & 13 deletions .github/workflows/docker-build.yml
@@ -9,9 +9,9 @@ on:
    branches:
      - "inference"
      - "master"
-  # schedule:
-  #   # Run every week on Sunday at midnight PT (3am ET / 8am UTC) to keep the docker images updated
-  #   - cron: "0 8 * * 0"
+  schedule:
+    # At 00:00 on day-of-month 1, 14, and 28.
+    - cron: "0 0 1,14,28 * *"
  workflow_dispatch:

# Cancel outdated workflows if they are still running
@@ -58,13 +58,28 @@ jobs:

      - name: Check availability of flexflow modules in Python
        run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${hip_version}:latest -c "python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"

  keep-runner-registered:
    name: Keep runner alive
    if: ${{ github.event_name == 'schedule' }}
    runs-on: [self-hosted, rocm_builder]
    defaults:
      run:
        shell: bash -l {0} # required to use an activated conda environment
    env:
      CONDA: "3"
    needs: rocm-builder-start
    steps:
      - name: Keep alive
        run: |
          echo "Keep self-hosted runner registered with Github"
          sleep 10m

  docker-build-and-publish-rocm:
    name: Build and Deploy FlexFlow Docker Containers (ROCm backend)
    needs: rocm-builder-start
    runs-on: [self-hosted, rocm_builder]
-    if: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
+    if: ${{ ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
    strategy:
      matrix:
        hip_version: ["5.3", "5.4", "5.5", "5.6"]
@@ -106,19 +121,19 @@ jobs:
      cuda_version: ${{ matrix.cuda_version }}
    steps:
      - name: Checkout Git Repository
-        if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
+        if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
        uses: actions/checkout@v3
        with:
          submodules: recursive

      - name: Free additional space on runner
-        if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
+        if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
        run: .github/workflows/helpers/free_space_on_runner.sh

      - name: Build Docker container
-        if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
+        if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
        env:
-          deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
+          deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
          build_needed: ${{ matrix.cuda_version == '12.0' }}
        run: |
          # On push to inference, build for all compatible architectures, so that we can publish
@@ -133,19 +148,19 @@ jobs:
          fi
      - name: Check availability of flexflow modules in Python
-        if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
+        if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
        run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"

      - name: Publish Docker environment image (on push to inference)
-        if: ${{ github.repository_owner == 'flexflow' && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
+        if: ${{ github.repository_owner == 'flexflow' && ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
        env:
          FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }}
        run: |
          ./docker/publish.sh flexflow-environment
          ./docker/publish.sh flexflow

  rocm-builder-stop:
-    needs: docker-build-and-publish-rocm
+    needs: [docker-build-and-publish-rocm, keep-runner-registered]
    if: ${{ always() && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
    runs-on: ubuntu-latest
    name: Stop the AWS instance we used to build the ROCM Docker images
@@ -166,7 +181,7 @@ jobs:
    name: Notify Slack in case of failure
    runs-on: ubuntu-20.04
    needs: [docker-build-cuda, docker-build-and-publish-rocm]
-    if: ${{ failure() && github.event_name == 'schedule' && github.repository_owner == 'flexflow' }}
+    if: ${{ failure() && github.event_name == 'workflow_dispatch' && github.repository_owner == 'flexflow' }}
    steps:
      - name: Send Slack message
        env:
24 changes: 24 additions & 0 deletions .github/workflows/gpu-ci.yml
@@ -1,5 +1,7 @@
name: "gpu-ci"
on:
schedule:
- cron: "0 0 1,14,28 * *" # At 00:00 on day-of-month 1, 14, and 28.
push:
branches:
- "inference"
@@ -43,8 +45,28 @@ jobs:
      pip3 install pygithub
      python3 .github/workflows/helpers/gpu_ci_helper.py

  keep-runner-registered:
    name: Keep runner alive
    if: ${{ github.event_name == 'schedule' }}
    runs-on: [self-hosted, gpu]
    defaults:
      run:
        shell: bash -l {0} # required to use an activated conda environment
    env:
      CONDA: "3"
    needs: gpu-ci-concierge
    container:
      image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
      options: --gpus all --shm-size=8192m
    steps:
      - name: Keep alive
        run: |
          echo "Keep self-hosted runner registered with Github"
          sleep 10m

  python-interface-check:
    name: Check Python Interface
+    if: ${{ github.event_name != 'schedule' }}
    runs-on: [self-hosted, gpu]
    defaults:
      run:
@@ -119,6 +141,7 @@ jobs:
  inference-tests:
    name: Inference Tests
+    if: ${{ github.event_name != 'schedule' }}
    runs-on: [self-hosted, gpu]
    defaults:
      run:
@@ -195,6 +218,7 @@ jobs:

  training-tests:
    name: Training Tests
+    if: ${{ github.event_name != 'schedule' }}
    runs-on: [self-hosted, gpu]
    # skip this time-consuming test for PRs to the inference branch
    # if: ${{ github.event_name != 'pull_request' || github.base_ref != 'inference' }}
2 changes: 1 addition & 1 deletion SERVE.md
@@ -126,7 +126,7 @@ A C++ example is available at [this folder](../inference/spec_infer/). After bui
For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use two collectively boost-tuned LLaMA-68M models for speculative inference.

```bash
-./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion
+./inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion
```
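The `-prompt` flag points at a JSON file of prompts. A minimal sketch for producing one, assuming (based on the prompt files shipped with the inference examples) that the loader expects a flat JSON array of prompt strings:

```python
# Hedged sketch: write a prompt file consumable via -prompt /path/to/prompt.json.
# Assumption: the loader expects a flat JSON array of prompt strings.
import json

prompts = [
    "Three tips for staying healthy are:",
    "Give three reasons to learn a new language:",
]
with open("/path/to/prompt.json", "w") as f:
    json.dump(prompts, f, indent=2)
```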
</details>

1 change: 1 addition & 0 deletions docker/flexflow-environment/Dockerfile
@@ -37,6 +37,7 @@ RUN MINICONDA_SCRIPT_NAME=Miniconda3-py311_23.5.2-0-Linux-x86_64.sh; \
    chmod +x ~/${MINICONDA_SCRIPT_NAME} && \
    bash ~/${MINICONDA_SCRIPT_NAME} -b -p /opt/conda && \
    rm ~/${MINICONDA_SCRIPT_NAME} && \
+    /opt/conda/bin/conda config --set solver classic && \
    /opt/conda/bin/conda upgrade --all && \
    /opt/conda/bin/conda install conda-build conda-verify && \
    /opt/conda/bin/conda clean -ya
2 changes: 1 addition & 1 deletion docs/source/python/layers.rst
@@ -3,7 +3,7 @@ Layers API
**********

Layers are the basic building blocks of neural networks in FlexFlow. The inputs of a layer consist of a tensor or a list of tensors and some state variables,
-and the outputs of a layer is a tensor or a list of tensors.
+and the outputs of a layer are a tensor or a list of tensors. See https://github.com/flexflow/FlexFlow/examples/python/native/ops for an example of every layer.

.. automodule:: flexflow.core.flexflow_cffi
   :noindex:
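A minimal sketch of the call pattern this describes, assuming the `flexflow.core` API used by the example scripts below (`dense` stands in for any layer):

```python
# Hedged sketch: one tensor in, one tensor out, matching the description above.
from flexflow.core import FFConfig, FFModel, DataType, init_flexflow_runtime

init_flexflow_runtime()
ffconfig = FFConfig()
ffmodel = FFModel(ffconfig)

x = ffmodel.create_tensor([ffconfig.batch_size, 16], DataType.DT_FLOAT)
y = ffmodel.dense(x, 32)  # a Dense layer: input tensor -> output tensor
```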
45 changes: 45 additions & 0 deletions examples/python/native/ops/add.py
@@ -0,0 +1,45 @@
# The basis for this test of the 'add' operation is generated by ChatGPT using the manually created conv2d.py as a template.


import flexflow.core
import numpy as np
from flexflow.core import *

def test_add(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> flexflow.core.Tensor:
    ffmodel = FFModel(ffconfig)

    input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT)
    input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT)

    out = ffmodel.add(input_tensor1, input_tensor2)

    ffoptimizer = SGDOptimizer(ffmodel, 0.001)
    ffmodel.optimizer = ffoptimizer
    ffmodel.compile(
        loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
        metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
    dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1)
    dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2)

    ffmodel.init_layers()

    dataloader_input1.reset()
    dataloader_input1.next_batch(ffmodel)

    dataloader_input2.reset()
    dataloader_input2.next_batch(ffmodel)

    ffmodel.forward()

    out.inline_map(ffmodel, ffconfig)
    return out.get_array(ffmodel, ffconfig)


if __name__ == '__main__':
    init_flexflow_runtime()
    ffconfig = FFConfig()

    input1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
    input2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)

    _ = test_add(ffconfig, input1, input2)
78 changes: 78 additions & 0 deletions examples/python/native/ops/add_bias_residual_layer_norm.py
@@ -0,0 +1,78 @@
from typing import List

import flexflow.core
import numpy as np
from flexflow.core import *


def test_add_bias_residual_layer_norm(ffconfig, input_arr: np.ndarray, residual_arr: np.ndarray, axes: List[int], elementwise_affine: bool = True, eps: float = 1e-5, use_bias: bool = True, name=None):
    ffmodel = FFModel(ffconfig)

    input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
    residual_tensor = ffmodel.create_tensor(residual_arr.shape, DataType.DT_FLOAT)

    output_tensor, layer_norm_output = ffmodel.add_bias_residual_layer_norm(
        input_tensor,
        residual_tensor,
        axes=axes,
        elementwise_affine=elementwise_affine,
        eps=eps,
        use_bias=use_bias,
        name="add_bias_residual_layer_norm_layer"
    )

    ffoptimizer = SGDOptimizer(ffmodel, 0.001)
    ffmodel.optimizer = ffoptimizer
    ffmodel.compile(
        loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
        metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
    )

    dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
    dataloader_residual = ffmodel.create_data_loader(residual_tensor, residual_arr)

    ffmodel.init_layers()

    dataloader_input.reset()
    dataloader_residual.reset()

    dataloader_input.next_batch(ffmodel)
    dataloader_residual.next_batch(ffmodel)

    ffmodel.forward()

    output_tensor.inline_map(ffmodel, ffconfig)
    layer_norm_output.inline_map(ffmodel, ffconfig)
    output_result = output_tensor.get_array(ffmodel, ffconfig)
    layer_norm_result = layer_norm_output.get_array(ffmodel, ffconfig)

    return output_result, layer_norm_result


if __name__ == '__main__':
    init_flexflow_runtime()
    ffconfig = FFConfig()

    input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
    residual_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)

    axes_to_normalize = [1, 2]  # Example axes to normalize

    output_result, layer_norm_result = test_add_bias_residual_layer_norm(
        ffconfig,
        input_data,
        residual_data,
        axes=axes_to_normalize,
        elementwise_affine=True,
        eps=1e-5,
        use_bias=True
    )

    print("Input Array:")
    print(input_data)
    print("\nResidual Array:")
    print(residual_data)
    print(f"\nOutput Array after applying add_bias_residual_layer_norm along axes {axes_to_normalize}:")
    print(output_result)
    print("\nLayer Norm Result:")
    print(layer_norm_result)
61 changes: 61 additions & 0 deletions examples/python/native/ops/arg_top_k.py
@@ -0,0 +1,61 @@
import flexflow.core
import numpy as np
from flexflow.core import *


def test_arg_top_k(ffconfig, input_arr: np.ndarray, k: int, sorted: bool, speculative_decoding: bool, name=None):
    ffmodel = FFModel(ffconfig)

    input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)

    arg_top_k_output = ffmodel.arg_top_k(
        input_tensor,
        k,
        sorted,
        speculative_decoding,
        name="arg_top_k_layer",
    )

    ffoptimizer = SGDOptimizer(ffmodel, 0.001)
    ffmodel.optimizer = ffoptimizer
    ffmodel.compile(
        loss_type=LossType.LOSS_MEAN_SQUARED_ERROR,
        metrics=[MetricsType.METRICS_MEAN_SQUARED_ERROR],
    )

    dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)

    ffmodel.init_layers()

    dataloader_input.reset()
    dataloader_input.next_batch(ffmodel)

    ffmodel.forward()

    arg_top_k_output.inline_map(ffmodel, ffconfig)
    output_result = arg_top_k_output.get_array(ffmodel, ffconfig)

    return output_result


if __name__ == '__main__':
    init_flexflow_runtime()
    ffconfig = FFConfig()

    input_data = np.random.randn(ffconfig.batch_size, 10).astype(np.float32)
    k_value = 5
    sorted_value = True
    speculative_decoding_value = False  # Example value for speculative_decoding

    output_result = test_arg_top_k(
        ffconfig,
        input_data,
        k=k_value,
        sorted=sorted_value,
        speculative_decoding=speculative_decoding_value,
    )

    print("Input Array:")
    print(input_data)
    print("\nOutput Array after applying arg_top_k:")
    print(output_result)