Skip to content

Commit 39c09d4

Browse files
jjsjann123 authored
pytorchmergebot committed
Revert "Revert "Nvfuser code removal (pytorch#111093)"" (pytorch#111604)
This reverts commit 715dfce. The original PR pytorch#111093 is reverted due to broken internal build. Pull Request resolved: pytorch#111604 Approved by: https://github.com/davidberard98
1 parent ce48d36 commit 39c09d4

12 files changed: +4 −188 lines changed

.ci/pytorch/test.sh

+1-1
Original file line number | Diff line number | Diff line change
@@ -615,7 +615,7 @@ test_libtorch_jit() {
615615

616616
# Run jit and lazy tensor cpp tests together to finish them faster
617617
if [[ "$BUILD_ENVIRONMENT" == *cuda* && "$TEST_CONFIG" != *nogpu* ]]; then
618-
LTC_TS_CUDA=1 python test/run_test.py --cpp --verbose -i cpp/test_jit cpp/nvfuser_tests cpp/test_lazy
618+
LTC_TS_CUDA=1 python test/run_test.py --cpp --verbose -i cpp/test_jit cpp/test_lazy
619619
else
620620
# CUDA tests have already been skipped when CUDA is not available
621621
python test/run_test.py --cpp --verbose -i cpp/test_jit cpp/test_lazy -k "not CUDA"

CMakeLists.txt

-16
Original file line number | Diff line number | Diff line change
@@ -197,9 +197,6 @@ option(USE_TSAN "Use Thread Sanitizer" OFF)
197197
option(USE_CUDA "Use CUDA" ON)
198198
cmake_dependent_option(
199199
BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF)
200-
cmake_dependent_option(
201-
BUILD_NVFUSER "Build NVFUSER" ON
202-
"USE_CUDA OR USE_ROCM" OFF)
203200
cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF)
204201
option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF)
205202
cmake_dependent_option(
@@ -1206,19 +1203,6 @@ if(BUILD_JNI)
12061203
add_subdirectory(android/pytorch_android)
12071204
endif()
12081205

1209-
if(NOT USE_CUDA AND NOT USE_ROCM)
1210-
set(BUILD_NVFUSER OFF CACHE BOOL "BUILD nvfuser" FORCE)
1211-
endif()
1212-
1213-
if(BUILD_NVFUSER)
1214-
if(DEFINED ENV{NVFUSER_SOURCE_DIR})
1215-
add_subdirectory($ENV{NVFUSER_SOURCE_DIR} nvfuser)
1216-
else()
1217-
add_subdirectory(third_party/nvfuser nvfuser)
1218-
endif()
1219-
add_compile_definitions(BUILD_NVFUSER)
1220-
endif()
1221-
12221206
include(cmake/Summary.cmake)
12231207
caffe2_print_configuration_summary()
12241208

build_variables.bzl

-1
Original file line number | Diff line number | Diff line change
@@ -255,7 +255,6 @@ core_sources_full_mobile_no_backend_interface_xplat = [
255255
"torch/csrc/jit/passes/constant_propagation.cpp",
256256
"torch/csrc/jit/passes/restore_mutation.cpp",
257257
"torch/csrc/jit/passes/create_autodiff_subgraphs.cpp",
258-
"torch/csrc/jit/passes/cuda_graph_fuser.cpp",
259258
"torch/csrc/jit/passes/dead_code_elimination.cpp",
260259
"torch/csrc/jit/passes/eliminate_no_ops.cpp",
261260
"torch/csrc/jit/passes/remove_redundant_profiles.cpp",

setup.py

-44
Original file line number | Diff line number | Diff line change
@@ -189,9 +189,6 @@
189189
# NCCL_INCLUDE_DIR
190190
# specify where nccl is installed
191191
#
192-
# NVFUSER_SOURCE_DIR
193-
# specify nvfuser root directory
194-
#
195192
# NVTOOLSEXT_PATH (Windows only)
196193
# specify where nvtoolsext is installed
197194
#
@@ -632,11 +629,6 @@ def run(self):
632629
else:
633630
report("-- Not using ITT")
634631

635-
if cmake_cache_vars["BUILD_NVFUSER"]:
636-
report("-- Building nvfuser")
637-
else:
638-
report("-- Not Building nvfuser")
639-
640632
# Do not use clang to compile extensions if `-fstack-clash-protection` is defined
641633
# in system CFLAGS
642634
c_flags = str(os.getenv("CFLAGS", ""))
@@ -736,22 +728,6 @@ def build_extensions(self):
736728
os.makedirs(dst_dir)
737729
self.copy_file(src, dst)
738730

739-
# Copy nvfuser extension
740-
for i, ext in enumerate(self.extensions):
741-
if ext.name != "nvfuser._C":
742-
continue
743-
fullname = self.get_ext_fullname(ext.name)
744-
filename = self.get_ext_filename(fullname)
745-
fileext = os.path.splitext(filename)[1]
746-
src = os.path.join(os.path.dirname(filename), "nvfuser" + fileext)
747-
dst = os.path.join(os.path.realpath(self.build_lib), filename)
748-
if os.path.exists(src):
749-
report(f"Copying {ext.name} from {src} to {dst}")
750-
dst_dir = os.path.dirname(dst)
751-
if not os.path.exists(dst_dir):
752-
os.makedirs(dst_dir)
753-
self.copy_file(src, dst)
754-
755731
setuptools.command.build_ext.build_ext.build_extensions(self)
756732

757733
def get_outputs(self):
@@ -1011,8 +987,6 @@ def make_relative_rpath_args(path):
1011987
excludes.extend(["caffe2", "caffe2.*"])
1012988
if not cmake_cache_vars["BUILD_FUNCTORCH"]:
1013989
excludes.extend(["functorch", "functorch.*"])
1014-
if not cmake_cache_vars["BUILD_NVFUSER"]:
1015-
excludes.extend(["nvfuser", "nvfuser.*"])
1016990
packages = find_packages(exclude=excludes)
1017991
C = Extension(
1018992
"torch._C",
@@ -1046,10 +1020,6 @@ def make_relative_rpath_args(path):
10461020
extensions.append(
10471021
Extension(name="functorch._C", sources=[]),
10481022
)
1049-
if cmake_cache_vars["BUILD_NVFUSER"]:
1050-
extensions.append(
1051-
Extension(name="nvfuser._C", sources=[]),
1052-
)
10531023

10541024
cmdclass = {
10551025
"bdist_wheel": wheel_concatenate,
@@ -1312,8 +1282,6 @@ def main():
13121282
"include/torch/csrc/jit/tensorexpr/*.h",
13131283
"include/torch/csrc/jit/tensorexpr/operators/*.h",
13141284
"include/torch/csrc/jit/codegen/cuda/*.h",
1315-
"include/torch/csrc/jit/codegen/cuda/ops/*.h",
1316-
"include/torch/csrc/jit/codegen/cuda/scheduler/*.h",
13171285
"include/torch/csrc/onnx/*.h",
13181286
"include/torch/csrc/profiler/*.h",
13191287
"include/torch/csrc/profiler/orchestration/*.h",
@@ -1355,18 +1323,6 @@ def main():
13551323
"utils/model_dump/code.js",
13561324
"utils/model_dump/*.mjs",
13571325
]
1358-
if get_cmake_cache_vars()["BUILD_NVFUSER"]:
1359-
torch_package_data.extend(
1360-
[
1361-
"share/cmake/nvfuser/*.cmake",
1362-
"include/nvfuser/*.h",
1363-
"include/nvfuser/kernel_db/*.h",
1364-
"include/nvfuser/multidevice/*.h",
1365-
"include/nvfuser/ops/*.h",
1366-
"include/nvfuser/python_frontend/*.h",
1367-
"include/nvfuser/scheduler/*.h",
1368-
]
1369-
)
13701326

13711327
if get_cmake_cache_vars()["BUILD_CAFFE2"]:
13721328
torch_package_data.extend(

torch/csrc/jit/codegen/cuda/interface.cpp

+1-34
Original file line number | Diff line number | Diff line change
@@ -14,43 +14,10 @@ namespace jit {
1414
namespace fuser {
1515
namespace cuda {
1616

17-
class LoadingNvfuserLibrary {
18-
public:
19-
#ifdef USE_CUDA
20-
LoadingNvfuserLibrary() {
21-
std::string library_name;
22-
if (const char* path = std::getenv("TORCH_NVFUSER_LIBRARY_PATH")) {
23-
library_name = path;
24-
}
25-
#if defined(_WIN32)
26-
library_name += "nvfuser_codegen.dll";
27-
#elif defined(__APPLE__)
28-
library_name += "libnvfuser_codegen.dylib";
29-
#else
30-
library_name += "libnvfuser_codegen.so";
31-
#endif
32-
try {
33-
// NOTE: we need to refactor this to a lazy load instead. We could end up
34-
// with double de-allocation with our python API loading the library.
35-
// Leaking the handle should solve the problem for now
36-
nvfuserLib_ = std::make_shared<at::DynamicLibrary>(
37-
library_name.c_str(), nullptr, true);
38-
} catch (const c10::DynamicLibraryError& e) {
39-
#if defined(BUILD_NVFUSER) || !defined(NDEBUG)
40-
TORCH_WARN_ONCE("Loading nvfuser library failed with: ", e.msg());
41-
#endif
42-
}
43-
}
44-
45-
#endif // USE_CUDA
46-
std::shared_ptr<at::DynamicLibrary> nvfuserLib_;
47-
};
48-
49-
static LoadingNvfuserLibrary loading_nvfuser_library_;
50-
5117
static std::atomic<bool> cuda_fusion_guard_mode{true};
5218

5319
bool isEnabled() {
20+
TORCH_WARN_ONCE("torch::jit::fuser::cuda::isEnabled() is deprecated");
5421
return false;
5522
}
5623

torch/csrc/jit/passes/autocast.cpp

-1
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,6 @@
77
#include <c10/util/Optional.h>
88
#include <torch/csrc/jit/ir/ir.h>
99
#include <torch/csrc/jit/jit_log.h>
10-
#include <torch/csrc/jit/passes/cuda_graph_fuser.h>
1110
#include <torch/csrc/jit/passes/quantization/helper.h>
1211

1312
#include <stack>

torch/csrc/jit/passes/cuda_graph_fuser.cpp

-21
This file was deleted.

torch/csrc/jit/passes/cuda_graph_fuser.h

-42
This file was deleted.

torch/csrc/jit/passes/tensorexpr_fuser.cpp

-5
Original file line number | Diff line number | Diff line change
@@ -857,11 +857,6 @@ class TensorExprFuser {
857857
if (device->is_cpu()) {
858858
return canFuseOnCPU();
859859
} else if (device->is_cuda()) {
860-
#ifndef C10_MOBILE
861-
if (fuser::cuda::isEnabled()) {
862-
return false;
863-
}
864-
#endif
865860
return canFuseOnGPU();
866861
} else if (device->is_xpu()) {
867862
return false;

torch/csrc/jit/python/init.cpp

-1
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,6 @@
2727
#include <torch/csrc/jit/passes/constant_propagation.h>
2828
#include <torch/csrc/jit/passes/create_autodiff_subgraphs.h>
2929
#include <torch/csrc/jit/passes/create_functional_graphs.h>
30-
#include <torch/csrc/jit/passes/cuda_graph_fuser.h>
3130
#include <torch/csrc/jit/passes/dbr_quantization/remove_redundant_aliases.h>
3231
#include <torch/csrc/jit/passes/dead_code_elimination.h>
3332
#include <torch/csrc/jit/passes/decompose_ops.h>

torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp

-8
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,6 @@
1414
#include <torch/csrc/jit/passes/constant_pooling.h>
1515
#include <torch/csrc/jit/passes/constant_propagation.h>
1616
#include <torch/csrc/jit/passes/create_autodiff_subgraphs.h>
17-
#include <torch/csrc/jit/passes/cuda_graph_fuser.h>
1817
#include <torch/csrc/jit/passes/dead_code_elimination.h>
1918
#include <torch/csrc/jit/passes/decompose_ops.h>
2019
#include <torch/csrc/jit/passes/graph_fuser.h>
@@ -646,13 +645,6 @@ const ExecutionPlan& ProfilingGraphExecutorImpl::getOptimizedPlanFor(
646645
// before any other pass that could insert `prim::iprofile_value` node on
647646
// `aten::_grad_sum_to_size` input.
648647
InsertProfileNodesForSpecializeAutogradZero(pr_.get());
649-
// `InsertProfileNodesForCUDAFuser` inserts profile node for non-tensor
650-
// value
651-
#ifndef C10_MOBILE
652-
if (torch::jit::fuser::cuda::isEnabled()) {
653-
torch::jit::fuser::cuda::InsertProfileNodesForCUDAFuser(pr_.get());
654-
}
655-
#endif
656648
GRAPH_DUMP("Profiled Graph: ", pr_->graph());
657649
profiling_plan_ = ExecutionPlan(pr_->graph(), function_name_);
658650
// fall-through

torch/csrc/jit/runtime/profiling_record.cpp

+2-14
Original file line number | Diff line number | Diff line change
@@ -207,13 +207,7 @@ void ProfilingRecord::insertShapeProfile(
207207
}
208208

209209
static bool needsProfiledInputs(Node* n) {
210-
if (tensorexpr::isSupported(n) ||
211-
#ifndef C10_MOBILE
212-
(fuser::cuda::isEnabled() && fuser::cuda::profileNode(n))
213-
#else
214-
false
215-
#endif
216-
) {
210+
if (tensorexpr::isSupported(n)) {
217211
return true;
218212
}
219213

@@ -244,13 +238,7 @@ static bool needsProfiledInputs(Node* n) {
244238
}
245239

246240
static bool needsProfiledOutput(Node* n) {
247-
if (tensorexpr::isSupported(n) ||
248-
#ifndef C10_MOBILE
249-
(fuser::cuda::isEnabled() && fuser::cuda::profileNode(n))
250-
#else
251-
false
252-
#endif
253-
) {
241+
if (tensorexpr::isSupported(n)) {
254242
return true;
255243
}
256244

0 commit comments

Comments (0)