kblaszczak-intel · Nov 3, 2021
diff --git a/caffe2/cuda_rtc/common_rtc.h b/caffe2/cuda_rtc/common_rtc.h
+36-25
diff --git a/caffe2/cuda_rtc/elemenntwise_rtc_gpu.cc b/caffe2/cuda_rtc/elemenntwise_rtc_gpu.cc
+20-21
@@ -7,14 +7,14 @@
 #include <cuda.h>
 #include <nvrtc.h>
 
-#define NVRTC_CHECK(condition)                                                 \
-  do {                                                                         \
-    nvrtcResult result = condition;                                            \
-    if (result != NVRTC_SUCCESS) {                                             \
-      LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": "   \
-                      << nvrtcGetErrorString(result);                          \
-    }                                                                          \
-  } while(0)
+#define NVRTC_CHECK(condition)                                          \
+  do {                                                                  \
+    nvrtcResult result = condition;                                     \
+    if (result != NVRTC_SUCCESS) {                                      \
+      LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
+                 << nvrtcGetErrorString(result);                        \
+    }                                                                   \
+  } while (0)
 
 namespace caffe2 {
 
@@ -39,15 +39,14 @@ class CudaRTCFunction {
     VLOG(1) << "function src:\n" << src;
     // Actually do the compiling.
     nvrtcProgram prog;
-    NVRTC_CHECK(nvrtcCreateProgram(
-        &prog, src.c_str(), nullptr, 0, nullptr, nullptr));
+    NVRTC_CHECK(
+        nvrtcCreateProgram(&prog, src.c_str(), nullptr, 0, nullptr, nullptr));
     // Compile the program.
     // TODO(Yangqing): how to find the current gpu architecture instead of hard
     // coding it?
-    const char *nvrtc_opts[] = {"--gpu-architecture=compute_35",
-                                "--use_fast_math"};
-    nvrtcResult compile_result = nvrtcCompileProgram(
-        prog, 2, nvrtc_opts);
+    const char* nvrtc_opts[] = {
+        "--gpu-architecture=compute_35", "--use_fast_math"};
+    nvrtcResult compile_result = nvrtcCompileProgram(prog, 2, nvrtc_opts);
     if (compile_result != NVRTC_SUCCESS) {
       size_t log_size;
       NVRTC_CHECK(nvrtcGetProgramLogSize(prog, &log_size));
@@ -74,21 +73,33 @@ class CudaRTCFunction {
   }
 
   template <typename... Args>
-  void Launch(unsigned int gx, unsigned int gy, unsigned int gz,
-              unsigned int bx, unsigned int by, unsigned int bz,
-              unsigned int shared_mem, cudaStream_t stream,
-              Args... args) {
+  void Launch(
+      unsigned int gx,
+      unsigned int gy,
+      unsigned int gz,
+      unsigned int bx,
+      unsigned int by,
+      unsigned int bz,
+      unsigned int shared_mem,
+      cudaStream_t stream,
+      Args... args) {
     CAFFE_ENFORCE(
         module_loaded_, "Cannot call Launch before a module is loaded.");
-    void * args_voidp[] = {&args...};
+    void* args_voidp[] = {&args...};
     CUDA_DRIVERAPI_ENFORCE(cuLaunchKernel(
         kernel_, gx, gy, gz, bx, by, bz, shared_mem, stream, args_voidp, 0));
   }
 
-  void LaunchEx(unsigned int gx, unsigned int gy, unsigned int gz,
-                unsigned int bx, unsigned int by, unsigned int bz,
-                unsigned int shared_mem, cudaStream_t stream,
-                void** extra) {
+  void LaunchEx(
+      unsigned int gx,
+      unsigned int gy,
+      unsigned int gz,
+      unsigned int bx,
+      unsigned int by,
+      unsigned int bz,
+      unsigned int shared_mem,
+      cudaStream_t stream,
+      void** extra) {
     CAFFE_ENFORCE(
         module_loaded_, "Cannot call Launch before a module is loaded.");
     CUDA_DRIVERAPI_ENFORCE(cuLaunchKernel(
@@ -115,6 +126,6 @@ inline std::string GetUniqueName() {
   return ss.str();
 }
 
-}  // namepsace caffe2
+} // namespace caffe2
 
-#endif  // CAFFE2_CUDA_RTC_COMMON_RTC_H_
+#endif // CAFFE2_CUDA_RTC_COMMON_RTC_H_
@@ -5,8 +5,7 @@
 
 namespace caffe2 {
 namespace {
-class ElementwiseRTCFunction
-    : public CudaRTCFunction<ElementwiseRTCFunction> {
+class ElementwiseRTCFunction : public CudaRTCFunction<ElementwiseRTCFunction> {
  public:
   ElementwiseRTCFunction() : CudaRTCFunction(), name_(GetUniqueName()) {}
 
@@ -22,22 +21,21 @@ class ElementwiseRTCFunction
   string name_;
 };
 
-template<>
+template <>
 string ElementwiseRTCFunction::GetSource(
-    int input_size, int output_size,
+    int input_size,
+    int output_size,
     const string command_string) {
   std::stringstream ss;
-  ss << "extern \"C\" __global__ void " << name_ <<
-        "(const size_t nthreads, \n";
+  ss << "extern \"C\" __global__ void " << name_
+     << "(const size_t nthreads, \n";
   // Insert the parameter list.
   int remain_params = input_size + output_size;
   for (int i = 0; i < input_size; ++i) {
-    ss << "const float* in" << i
-       << ((remain_params--) ? ", \n" : "");
+    ss << "const float* in" << i << ((remain_params--) ? ", \n" : "");
   }
   for (int i = 0; i < output_size; ++i) {
-    ss << "float* out" << i
-       << ((remain_params--) ? ", \n" : "");
+    ss << "float* out" << i << ((remain_params--) ? ", \n" : "");
   }
   ss << ") {\n"
         "for (int index = blockIdx.x * blockDim.x + threadIdx.x;\n"
@@ -46,7 +44,7 @@ string ElementwiseRTCFunction::GetSource(
      << "}\n}";
   return ss.str();
 }
-}  // namespace
+} // namespace
 
 /**
  * A GPU operator that can generate limited elementwise operations.
@@ -75,17 +73,17 @@ class ElementwiseRTCOp final : public Operator<CUDAContext> {
  public:
   ElementwiseRTCOp(const OperatorDef& operator_def, Workspace* ws)
       : Operator<CUDAContext>(operator_def, ws) {
-    const string src = OperatorBase::GetSingleArgument<string>(
-        "rtc_src", "");
+    const string src = OperatorBase::GetSingleArgument<string>("rtc_src", "");
     CAFFE_ENFORCE(src.size(), "Op should have a non-zero source code size.");
     func_.Compile(InputSize(), OutputSize(), src);
   }
   ~ElementwiseRTCOp() override {}
 
   bool RunOnDevice() override {
-    static_assert(sizeof(void*) == sizeof(size_t),
-                  "The argbuffer relies on the assumption that void* and "
-                  "size_t have the same size.");
+    static_assert(
+        sizeof(void*) == sizeof(size_t),
+        "The argbuffer relies on the assumption that void* and "
+        "size_t have the same size.");
     vector<size_t> argBuffer_vec(InputSize() + OutputSize() + 1);
     size_t* argBuffer = argBuffer_vec.data();
     CAFFE_ENFORCE(
@@ -102,10 +100,11 @@ class ElementwiseRTCOp final : public Operator<CUDAContext> {
     }
     size_t argBufferSize = sizeof(argBuffer);
     void* config[] = {
-      CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
-      CU_LAUNCH_PARAM_BUFFER_SIZE, &argBufferSize,
-      CU_LAUNCH_PARAM_END
-    };
+        CU_LAUNCH_PARAM_BUFFER_POINTER,
+        argBuffer,
+        CU_LAUNCH_PARAM_BUFFER_SIZE,
+        &argBufferSize,
+        CU_LAUNCH_PARAM_END};
     func_.LaunchEx(
         CAFFE_GET_BLOCKS(Input(0).numel()),
         1,
@@ -127,4 +126,4 @@ namespace {
 REGISTER_CUDA_OPERATOR_WITH_ENGINE(ElementwiseRTC, NVRTC, ElementwiseRTCOp);
 }
 
-}  // namespace caffe2
+} // namespace caffe2