Merge pull request ROCm#26 from ROCmSoftwarePlatform/IFU-main-2022-05-02
liligwu authored May 4, 2022
2 parents 9a5a33b + 18b48e9 commit c6f77ae
Showing 40 changed files with 1,235 additions and 284 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/fbgemmci.yml
@@ -206,7 +206,10 @@ jobs:
run: |
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin
sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600
sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub
# https://forums.developer.nvidia.com/t/notice-cuda-linux-repository-key-rotation/212772
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb
sudo dpkg -i cuda-keyring_1.0-1_all.deb
# sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub
sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /"
sudo apt-get update
sudo apt-get -y install cuda-minimal-build-11-3 cuda-nvrtc-dev-11-3 cuda-nvtx-11-3 cuda-libraries-dev-11-3
39 changes: 38 additions & 1 deletion fbgemm_gpu/CMakeLists.txt
@@ -54,6 +54,14 @@ if(USE_CUDA)
message("Building for cuda_architectures = \"${cuda_architectures}\"")
message("${message_line}")

if(DEFINED GLIBCXX_USE_CXX11_ABI)
if(${GLIBCXX_USE_CXX11_ABI} EQUAL 1)
set(CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
message("${CMAKE_CXX_FLAGS}")
endif()
endif()
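
As an illustration of how this new option might be driven (the `torch` query and the direct CMake configure below are assumptions, not part of this diff), one could match the ABI reported by the installed PyTorch:
```
# Hedged sketch: query the C++ ABI that the installed PyTorch was built with
# and forward it, so the block above adds -D_GLIBCXX_USE_CXX11_ABI=1 when needed.
ABI=$(python -c "import torch; print(int(torch.compiled_with_cxx11_abi()))")
cmake -B build -S fbgemm_gpu -DUSE_CUDA=ON -DGLIBCXX_USE_CXX11_ABI="${ABI}"
```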

#
# Torch Cuda Extensions are normally compiled with the flags below. However we
# disabled -D__CUDA_NO_HALF_CONVERSIONS__ here as it caused "error: no suitable
@@ -112,13 +120,15 @@ set(OPTIMIZERS
adam
approx_rowwise_adagrad
approx_rowwise_adagrad_with_weight_decay
approx_rowwise_adagrad_with_counter
approx_sgd
lamb
lars_sgd
partial_rowwise_adam
partial_rowwise_lamb
rowwise_adagrad
rowwise_adagrad_with_weight_decay
rowwise_adagrad_with_counter
rowwise_weighted_adagrad
sgd)

@@ -297,6 +307,22 @@ set_source_files_properties(
# Actual static SOURCES
#

# Ensure NVML_LIB_PATH is empty if it wasn't set and if the
# default lib path doesn't exist.
if(NOT NVML_LIB_PATH)
set(DEFAULT_NVML_LIB_PATH
"${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libnvidia-ml.so")

if(EXISTS ${DEFAULT_NVML_LIB_PATH})
message(
STATUS
"Setting NVML_LIB_PATH: \
${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libnvidia-ml.so"
)
set(NVML_LIB_PATH "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libnvidia-ml.so")
endif()
endif()
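
To skip the stub fallback above, the library path can also be given explicitly. A minimal sketch, assuming a direct CMake configure rather than the usual `setup.py` driven build, with an example path that varies by system:
```
# Assumed invocation: an explicit NVML_LIB_PATH bypasses the stub detection above.
cmake -B build -S fbgemm_gpu -DUSE_CUDA=ON \
      -DNVML_LIB_PATH=/usr/lib/x86_64-linux-gnu/libnvidia-ml.so
```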

set(fbgemm_gpu_sources_cpu
codegen/embedding_forward_split_cpu.cpp
codegen/embedding_forward_quantized_host_cpu.cpp
@@ -316,11 +342,18 @@ if(NOT FBGEMM_CPU_ONLY)
codegen/embedding_bounds_check_host.cpp
src/cumem_utils_host.cpp
src/layout_transform_ops_gpu.cpp
# src/merge_pooled_embeddings_cpu.cpp src/merge_pooled_embeddings_gpu.cpp
src/permute_pooled_embedding_ops_gpu.cpp
src/permute_pooled_embedding_ops_split_gpu.cpp
src/permute_pooled_embedding_ops_split_cpu.cpp
src/quantize_ops_gpu.cpp
src/sparse_ops_gpu.cpp
src/split_table_batched_embeddings.cpp)

if(NVML_LIB_PATH)
list(APPEND fbgemm_gpu_sources_cpu
src/merge_pooled_embeddings_cpu.cpp
src/merge_pooled_embeddings_gpu.cpp)
endif()
endif()

set(fbgemm_gpu_sources_cpu_option "-mavx;-mf16c;-mfma;-mavx2")
@@ -335,6 +368,7 @@ if(NOT FBGEMM_CPU_ONLY)
codegen/embedding_bounds_check.cu src/cumem_utils.cu
src/histogram_binning_calibration_ops.cu src/jagged_tensor_ops.cu
src/layout_transform_ops.cu src/permute_pooled_embedding_ops.cu
src/permute_pooled_embedding_ops_split.cu
src/quantize_ops.cu src/sparse_ops.cu src/split_embeddings_cache_cuda.cu
src/split_embeddings_utils.cu)

@@ -397,6 +431,9 @@ endif()
set_target_properties(fbgemm_gpu_py PROPERTIES PREFIX "")

target_link_libraries(fbgemm_gpu_py ${TORCH_LIBRARIES})
if(NVML_LIB_PATH)
target_link_libraries(fbgemm_gpu_py ${NVML_LIB_PATH})
endif()
target_include_directories(fbgemm_gpu_py PRIVATE ${TORCH_INCLUDE_DIRS})
if(USE_CUDA)
set_property(TARGET fbgemm_gpu_py PROPERTY CXX_STANDARD 17)
9 changes: 9 additions & 0 deletions fbgemm_gpu/README.md
@@ -27,6 +27,15 @@ conda install pytorch cudatoolkit=11.3 -c pytorch-nightly
conda install scikit-build jinja2 ninja cmake hypothesis
```

**If you're planning to build from source** and **don't** have `nvml.h` on your system, you can install it via the command below.
```
conda install -c conda-forge cudatoolkit-dev
```
Certain operations require this library to be present. Be sure to provide the path to `libnvidia-ml.so` to
`--nvml_lib_path` if installing from source (e.g. `python setup.py install --nvml_lib_path path_to_libnvidia-ml.so`).
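One possible workflow, assuming `cudatoolkit-dev` was installed into the active conda environment (the `find` lookup and the resulting path are only illustrative):
```
# Locate the NVML stub inside the conda environment and pass it to setup.py.
NVML_PATH=$(find "${CONDA_PREFIX}" -name 'libnvidia-ml.so*' -print -quit)
python setup.py install --nvml_lib_path "${NVML_PATH}"
```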


## PIP install

Currently the wheel is only built with sm70/80 (V100/A100 GPU) support: