SWDEV-340007 - Add per-thread stream support in hip documents

jujiang-del · jujiang-del · commit 233b8a835622 · 2022-09-09T12:24:17.000-04:00
Change-Id: Ib32d768b296966fc28e6e30875e8fda366e6eff7
diff --git a/docs/markdown/hip_faq.md b/docs/markdown/hip_faq.md
@@ -33,6 +33,7 @@
 - [Why _OpenMP is undefined when compiling with -fopenmp?](#why-_openmp-is-undefined-when-compiling-with--fopenmp)
 - [Does the HIP-Clang compiler support extern shared declarations?](#does-the-hip-clang-compiler-support-extern-shared-declarations)
 - [I have multiple HIP enabled devices and I am getting an error message hipErrorNoBinaryForGpu: Unable to find code object for all current devices?](#i-have-multiple-hip-enabled-devices-and-i-am-getting-an-error-message-hipErrorNoBinaryForGpu-unable-to-find-code-object-for-all-current-devices)
+- [How to use per-thread default stream in HIP?](#how-to-use-per-thread-default-stream-in-hip)
 - [How can I know the version of HIP?](#how-can-I-know-the-version-of-hip)
 <!-- tocstop -->
 
@@ -94,7 +95,7 @@ However, we can provide a rough summary of the features included in each CUDA SD
 - CUDA 6.5 :
     - __shfl intriniscs (supported)
 - CUDA 7.0 :
-    - Per-thread-streams (under development)
+    - Per-thread default streams (supported)
     - C++11 (Hip-Clang supports all of C++11, all of C++14 and some C++17 features)
 - CUDA 7.5 :
     - float16 (supported)
@@ -260,6 +261,18 @@ If you have a precompiled application/library (like rocblas, tensorflow etc) whi
  - The application/library does not ship code object bundles for *all* of your device(s): in this case you need to recompile the application/library yourself with correct `--offload-arch`.
  - The application/library does not ship code object bundles for *some* of your device(s), for example you have a system with an APU + GPU and the library does not ship code objects for your APU. For this you can set the environment variable `HIP_VISIBLE_DEVICES` to only enable GPUs for which code object is available. This will limit the GPUs visible to your application and allow it to run.
 
+### How to use per-thread default stream in HIP?
+
+The per-thread default stream is an implicit stream local to both the thread and the current device. It does not implicitly synchronize with other streams (such as explicitly created streams) or with the per-thread default streams of other threads.
+
+The per-thread default stream is a blocking stream and will synchronize with the default null stream if both are used in a program.
+
+In ROCm, a compilation option must be added to compile a translation unit with the per-thread default stream enabled:
+`-fgpu-default-stream=per-thread`.
+Once the source is compiled with the per-thread default stream enabled, all APIs are executed on the per-thread default stream, so there is no implicit synchronization with other streams.
+
+In addition, the per-thread default stream can be enabled per translation unit, so users can compile some files with the feature enabled and others with it disabled. A translation unit compiled with the feature enabled uses the per-thread default stream and performs no implicit synchronization, while other modules use the legacy default stream, which does synchronize implicitly.
+
 ### How can I know the version of HIP?
 
 HIP version definition has been updated since ROCm 4.2 release as the following:
diff --git a/docs/markdown/hip_programming_guide.md b/docs/markdown/hip_programming_guide.md
@@ -139,6 +139,15 @@ This implementation does not require the use of `hipDeviceSetLimit(hipLimitMallo
 
 The test codes in the link (https://github.com/ROCm-Developer-Tools/HIP/blob/develop/tests/src/deviceLib/hipDeviceMalloc.cpp) show how to implement application using malloc and free functions in device kernels.
 
+## Use of Per-thread default stream
+
+The per-thread default stream is supported in HIP. It is an implicit stream local to both the thread and the current device. This means that commands issued to the per-thread default stream by a thread do not implicitly synchronize with other streams (such as explicitly created streams) or with the per-thread default streams of other threads.
+The per-thread default stream is a blocking stream and will synchronize with the default null stream if both are used in a program.
+The per-thread default stream can be enabled by adding the compilation option
+`-fgpu-default-stream=per-thread`.
+
+Users can also explicitly pass `hipStreamPerThread` as the per-thread default stream handle to API calls. Example test code is available in the link (https://github.com/ROCm-Developer-Tools/HIP/tree/develop/tests/catch/unit/streamperthread).
+
 ## Use of Long Double Type
 
 In HIP-Clang, long double type is 80-bit extended precision format for x86_64, which is not supported by AMDGPU.  HIP-Clang treats long double type as IEEE double type for AMDGPU. Using long double type in HIP source code will not cause issue as long as data of long double type is not transferred between host and device. However, long double type should not be used as kernel argument type.
diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h
@@ -1246,7 +1246,7 @@ hipError_t hipInit(unsigned int flags);
  *
  * @param [out] driverVersion
  *
- * @returns #hipSuccess, #hipErrorInavlidValue
+ * @returns #hipSuccess, #hipErrorInvalidValue
  *
  * @warning The HIP feature set does not correspond to an exact CUDA SDK driver revision.
  * This function always set *driverVersion to 4 as an approximation though HIP supports
@@ -1262,7 +1262,8 @@ hipError_t hipDriverGetVersion(int* driverVersion);
  *
  * @param [out] runtimeVersion
  *
- * @returns #hipSuccess, #hipErrorInavlidValue
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
  *
  * @warning The version definition of HIP runtime is different from CUDA.
  * On AMD platform, the function returns HIP runtime version,
@@ -1277,7 +1278,7 @@ hipError_t hipRuntimeGetVersion(int* runtimeVersion);
  * @param [out] device
  * @param [in] ordinal
  *
- * @returns #hipSuccess, #hipErrorInavlidDevice
+ * @returns #hipSuccess, #hipErrorInvalidDevice
  */
 hipError_t hipDeviceGet(hipDevice_t* device, int ordinal);
 
@@ -1287,7 +1288,7 @@ hipError_t hipDeviceGet(hipDevice_t* device, int ordinal);
  * @param [out] minor
  * @param [in] device
  *
- * @returns #hipSuccess, #hipErrorInavlidDevice
+ * @returns #hipSuccess, #hipErrorInvalidDevice
  */
 hipError_t hipDeviceComputeCapability(int* major, int* minor, hipDevice_t device);
 /**
@@ -1296,7 +1297,7 @@ hipError_t hipDeviceComputeCapability(int* major, int* minor, hipDevice_t device
  * @param [in] len
  * @param [in] device
  *
- * @returns #hipSuccess, #hipErrorInavlidDevice
+ * @returns #hipSuccess, #hipErrorInvalidDevice
  */
 hipError_t hipDeviceGetName(char* name, int len, hipDevice_t device);
 /**
@@ -1318,7 +1319,7 @@ hipError_t hipDeviceGetUuid(hipUUID* uuid, hipDevice_t device);
  * @param [in] srcDevice
  * @param [in] dstDevice
  *
- * @returns #hipSuccess, #hipErrorInavlidDevice
+ * @returns #hipSuccess, #hipErrorInvalidDevice
  */
 hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr,
                                     int srcDevice, int dstDevice);
@@ -1328,23 +1329,23 @@ hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr,
  * @param [in] len
  * @param [in] device
  *
- * @returns #hipSuccess, #hipErrorInavlidDevice
+ * @returns #hipSuccess, #hipErrorInvalidDevice
  */
 hipError_t hipDeviceGetPCIBusId(char* pciBusId, int len, int device);
 /**
  * @brief Returns a handle to a compute device.
  * @param [out] device handle
  * @param [in] PCI Bus ID
  *
- * @returns #hipSuccess, #hipErrorInavlidDevice, #hipErrorInvalidValue
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
  */
 hipError_t hipDeviceGetByPCIBusId(int* device, const char* pciBusId);
 /**
  * @brief Returns the total amount of memory on the device.
  * @param [out] bytes
  * @param [in] device
  *
- * @returns #hipSuccess, #hipErrorInavlidDevice
+ * @returns #hipSuccess, #hipErrorInvalidDevice
  */
 hipError_t hipDeviceTotalMem(size_t* bytes, hipDevice_t device);
 // doxygen end initialization