Skip to content

Commit c9eda39

Browse files
committed
CUB Memory Manager + cuDNN v4 and v5 support
1 parent 5b5b438 commit c9eda39

Some content is hidden

Large commits have some of their content hidden by default. Use the search box below to locate content that may be hidden.

48 files changed

+1194
-319
lines changed

.travis.yml

+3
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ env:
88
- WITH_CUDA=true WITH_CMAKE=true WITH_IO=true
99
- WITH_CUDA=false WITH_CMAKE=false WITH_IO=false
1010
- WITH_CUDA=false WITH_CMAKE=true WITH_IO=false PYTHON_VERSION=3
11+
# Currently there is no way to install cudnn via apt-get. Uncomment when it's available.
12+
# - WITH_CUDA=true WITH_CMAKE=false WITH_IO=true WITH_CUDNN=true
13+
# - WITH_CUDA=true WITH_CMAKE=true WITH_IO=true WITH_CUDNN=true
1114

1215
language: cpp
1316

CMakeLists.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,8 @@ configure_file(cmake/Templates/caffe_config.h.in "${PROJECT_BINARY_DIR}/caffe_co
6464

6565
# ---[ Includes
6666
set(Caffe_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include)
67-
include_directories(${Caffe_INCLUDE_DIR} ${PROJECT_BINARY_DIR})
67+
set(THIRDPARTY_DIR ${PROJECT_SOURCE_DIR}/3rdparty)
68+
include_directories(${Caffe_INCLUDE_DIR} ${PROJECT_BINARY_DIR} ${THIRDPARTY_DIR})
6869
include_directories(BEFORE src) # This is needed for gtest.
6970

7071
# ---[ Subdirectories

Makefile

+15-4
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,19 @@ $(error $(CONFIG_FILE) not found. See $(CONFIG_FILE).example.)
77
endif
88
include $(CONFIG_FILE)
99

10+
# Rectify input parameters
11+
ifeq ($(CPU_ONLY),1)
12+
USE_CUDNN=0
13+
endif
14+
15+
PROJECT_DIR=$(PWD)
16+
1017
BUILD_DIR_LINK := $(BUILD_DIR)
1118
ifeq ($(RELEASE_BUILD_DIR),)
12-
RELEASE_BUILD_DIR := .$(BUILD_DIR)_release
19+
RELEASE_BUILD_DIR := $(PROJECT_DIR)/.$(BUILD_DIR)_release
1320
endif
1421
ifeq ($(DEBUG_BUILD_DIR),)
15-
DEBUG_BUILD_DIR := .$(BUILD_DIR)_debug
22+
DEBUG_BUILD_DIR := $(PROJECT_DIR)/.$(BUILD_DIR)_debug
1623
endif
1724

1825
DEBUG ?= 0
@@ -24,6 +31,8 @@ else
2431
OTHER_BUILD_DIR := $(DEBUG_BUILD_DIR)
2532
endif
2633

34+
THIRDPARTY_DIR=$(PROJECT_DIR)/3rdparty
35+
2736
# All of the directories containing code.
2837
SRC_DIRS := $(shell find * -type d -exec bash -c "find {} -maxdepth 1 \
2938
\( -name '*.cpp' -o -name '*.proto' \) | grep -q ." \; -print)
@@ -171,7 +180,7 @@ ifneq ("$(wildcard $(CUDA_DIR)/lib64)","")
171180
endif
172181
CUDA_LIB_DIR += $(CUDA_DIR)/lib
173182

174-
INCLUDE_DIRS += $(BUILD_INCLUDE_DIR) ./src ./include
183+
INCLUDE_DIRS += $(BUILD_INCLUDE_DIR) ./src ./include $(THIRDPARTY_DIR)
175184
ifneq ($(CPU_ONLY), 1)
176185
INCLUDE_DIRS += $(CUDA_INCLUDE_DIR)
177186
LIBRARY_DIRS += $(CUDA_LIB_DIR)
@@ -325,6 +334,8 @@ endif
325334
# cuDNN acceleration configuration.
326335
ifeq ($(USE_CUDNN), 1)
327336
LIBRARIES += cudnn
337+
INCLUDE_DIRS += ${CUDNN_DIR}/include
338+
LIBRARY_DIRS += ${CUDNN_DIR}/install/cuda/lib64
328339
COMMON_FLAGS += -DUSE_CUDNN
329340
endif
330341

@@ -440,7 +451,7 @@ endif
440451
# Define build targets
441452
##############################
442453
.PHONY: all lib test clean docs linecount lint lintclean tools examples $(DIST_ALIASES) \
443-
py mat py$(PROJECT) mat$(PROJECT) proto runtest \
454+
py mat py$(PROJECT) mat$(PROJECT) thirdparty proto runtest \
444455
superclean supercleanlist supercleanfiles warn everything
445456

446457
all: lib tools examples

Makefile.config.example

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# Contributions simplifying and improving our build system are welcome!
33

44
# cuDNN acceleration switch (uncomment to build with cuDNN).
5+
# cuDNN version 4 or higher is required.
56
# USE_CUDNN := 1
67

78
# CPU-only switch (uncomment to build without GPU support).

include/caffe/common.hpp

+6
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,9 @@ class Caffe {
136136
inline static curandGenerator_t curand_generator() {
137137
return Get().curand_generator_;
138138
}
139+
#ifdef USE_CUDNN
140+
inline static cudnnHandle_t cudnn_handle() { return Get().cudnn_handle_; }
141+
#endif
139142
#endif
140143

141144
// Returns the mode: running on CPU or GPU.
@@ -168,6 +171,9 @@ class Caffe {
168171
#ifndef CPU_ONLY
169172
cublasHandle_t cublas_handle_;
170173
curandGenerator_t curand_generator_;
174+
#ifdef USE_CUDNN
175+
cudnnHandle_t cudnn_handle_;
176+
#endif
171177
#endif
172178
shared_ptr<RNG> random_generator_;
173179

include/caffe/layer.hpp

+20-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,8 @@ class Layer {
3838
* layer.
3939
*/
4040
explicit Layer(const LayerParameter& param)
41-
: layer_param_(param), is_shared_(false) {
41+
: layer_param_(param), is_shared_(false),
42+
forward_passed_(false), backward_passed_(false) {
4243
// Set phase and copy blobs (if there are any).
4344
phase_ = param.phase();
4445
if (layer_param_.blobs_size() > 0) {
@@ -316,6 +317,21 @@ class Layer {
316317
param_propagate_down_[param_id] = value;
317318
}
318319

320+
bool IsForwardPassed() const {
321+
return forward_passed_;
322+
}
323+
324+
void ForwardPassed(bool passed) {
325+
forward_passed_ = passed;
326+
}
327+
328+
bool IsBackwardPassed() const {
329+
return backward_passed_;
330+
}
331+
332+
void BackwardPassed(bool passed) {
333+
backward_passed_ = passed;
334+
}
319335

320336
protected:
321337
/** The protobuf that stores the layer parameters */
@@ -431,6 +447,9 @@ class Layer {
431447
/** Whether this layer is actually shared by other nets*/
432448
bool is_shared_;
433449

450+
bool forward_passed_;
451+
bool backward_passed_;
452+
434453
/** The mutex for sequential forward if this layer is shared */
435454
shared_ptr<boost::mutex> forward_mutex_;
436455

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
#ifndef CAFFE_CUDNN_BATCH_NORM_LAYER_HPP_
#define CAFFE_CUDNN_BATCH_NORM_LAYER_HPP_

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

#include "caffe/layers/batch_norm_layer.hpp"

namespace caffe {

#ifdef USE_CUDNN
/**
 * @brief cuDNN-accelerated batch normalization layer.
 *
 * Derives from BatchNormLayer and overrides only the GPU paths, so the CPU
 * implementation and the layer's parameter handling come from the base class.
 * The whole class is compiled only when USE_CUDNN is defined.
 * NOTE(review): cuDNN types (cudnnTensorDescriptor_t, cudnnBatchNormMode_t)
 * are not included directly here — presumably they arrive transitively via
 * "caffe/layer.hpp"; confirm against the project's include graph.
 */
template <typename Dtype>
class CuDNNBatchNormLayer : public BatchNormLayer<Dtype> {
 public:
  // Descriptors are created lazily in LayerSetUp, hence handles_setup_(false).
  // epsilon_ defaults to 1e-4; cuDNN requires epsilon >= CUDNN_BN_MIN_EPSILON.
  explicit CuDNNBatchNormLayer(const LayerParameter& param)
      : BatchNormLayer<Dtype>(param), epsilon_(1e-4), handles_setup_(false) {}
  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  // Destructor must release the cuDNN descriptors created during setup.
  virtual ~CuDNNBatchNormLayer();

 protected:
  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

  // cuDNN descriptors / handles
  cudnnTensorDescriptor_t bottom_desc_, top_desc_;
  // Shared descriptor for the scale/bias and running mean/variance tensors,
  // as required by the cudnnBatchNormalization* API.
  cudnnTensorDescriptor_t scale_bias_mean_var_desc_;
  // Batch-norm mode (e.g. per-activation vs. spatial); set during setup.
  cudnnBatchNormMode_t mode_;

  // Numerical-stability constant passed to the cuDNN batch-norm calls.
  double epsilon_;
  // Per-batch saved statistics; names suggest the mean and inverse variance
  // cached by the forward pass for reuse in backward — TODO confirm in .cu.
  Blob<Dtype> save_mean_, save_inv_var_;
  // True once the cuDNN descriptors have been created (guards teardown).
  bool handles_setup_;
};
#endif

}  // namespace caffe

#endif  // CAFFE_CUDNN_BATCH_NORM_LAYER_HPP_

include/caffe/layers/cudnn_conv_layer.hpp

+6-6
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88
#include "caffe/proto/caffe.pb.h"
99

1010
#include "caffe/layers/conv_layer.hpp"
11+
#ifndef CPU_ONLY
12+
#include "caffe/util/gpu_memory.hpp"
13+
#endif
1114

1215
namespace caffe {
1316

@@ -44,8 +47,6 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
4447
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
4548

4649
bool handles_setup_;
47-
cudnnHandle_t* handle_;
48-
cudaStream_t* stream_;
4950

5051
// algorithms for forward and backwards convolutions
5152
cudnnConvolutionFwdAlgo_t *fwd_algo_;
@@ -56,14 +57,13 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
5657
cudnnTensorDescriptor_t bias_desc_;
5758
cudnnFilterDescriptor_t filter_desc_;
5859
vector<cudnnConvolutionDescriptor_t> conv_descs_;
59-
int bottom_offset_, top_offset_, bias_offset_;
60+
61+
int bottom_offset_, top_offset_, weight_offset_, bias_offset_;
6062

6163
size_t *workspace_fwd_sizes_;
6264
size_t *workspace_bwd_data_sizes_;
6365
size_t *workspace_bwd_filter_sizes_;
64-
size_t workspaceSizeInBytes; // size of underlying storage
65-
void *workspaceData; // underlying storage
66-
void **workspace; // aliases into workspaceData
66+
gpu_memory::buffer workspace;
6767
};
6868
#endif
6969

include/caffe/layers/cudnn_lcn_layer.hpp

+6-5
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99

1010
#include "caffe/layers/lrn_layer.hpp"
1111
#include "caffe/layers/power_layer.hpp"
12+
#ifndef CPU_ONLY
13+
#include "caffe/util/gpu_memory.hpp"
14+
#endif
1215

1316
namespace caffe {
1417

@@ -17,8 +20,7 @@ template <typename Dtype>
1720
class CuDNNLCNLayer : public LRNLayer<Dtype> {
1821
public:
1922
explicit CuDNNLCNLayer(const LayerParameter& param)
20-
: LRNLayer<Dtype>(param), handles_setup_(false), tempDataSize(0),
21-
tempData1(NULL), tempData2(NULL) {}
23+
: LRNLayer<Dtype>(param), handles_setup_(false), tempDataSize_(0) {}
2224
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
2325
const vector<Blob<Dtype>*>& top);
2426
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -32,15 +34,14 @@ class CuDNNLCNLayer : public LRNLayer<Dtype> {
3234
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
3335

3436
bool handles_setup_;
35-
cudnnHandle_t handle_;
3637
cudnnLRNDescriptor_t norm_desc_;
3738
cudnnTensorDescriptor_t bottom_desc_, top_desc_;
3839

3940
int size_, pre_pad_;
4041
Dtype alpha_, beta_, k_;
4142

42-
size_t tempDataSize;
43-
void *tempData1, *tempData2;
43+
size_t tempDataSize_;
44+
gpu_memory::buffer temp1_, temp2_;
4445
};
4546
#endif
4647

include/caffe/layers/cudnn_relu_layer.hpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@ class CuDNNReLULayer : public ReLULayer<Dtype> {
3434
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
3535

3636
bool handles_setup_;
37-
cudnnHandle_t handle_;
3837
cudnnTensorDescriptor_t bottom_desc_;
3938
cudnnTensorDescriptor_t top_desc_;
39+
cudnnActivationDescriptor_t activ_desc_;
4040
};
4141
#endif
4242

include/caffe/layers/cudnn_sigmoid_layer.hpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@ class CuDNNSigmoidLayer : public SigmoidLayer<Dtype> {
3434
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
3535

3636
bool handles_setup_;
37-
cudnnHandle_t handle_;
3837
cudnnTensorDescriptor_t bottom_desc_;
3938
cudnnTensorDescriptor_t top_desc_;
39+
cudnnActivationDescriptor_t activ_desc_;
4040
};
4141
#endif
4242

include/caffe/layers/cudnn_tanh_layer.hpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@ class CuDNNTanHLayer : public TanHLayer<Dtype> {
3434
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
3535

3636
bool handles_setup_;
37-
cudnnHandle_t handle_;
3837
cudnnTensorDescriptor_t bottom_desc_;
3938
cudnnTensorDescriptor_t top_desc_;
39+
cudnnActivationDescriptor_t activ_desc_;
4040
};
4141
#endif
4242

include/caffe/parallel.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ class GPUParams : public Params<Dtype> {
5757
using Params<Dtype>::size_;
5858
using Params<Dtype>::data_;
5959
using Params<Dtype>::diff_;
60+
private:
61+
int buffer_device_;
6062
};
6163

6264
class DevicePair {

include/caffe/util/cudnn.hpp

+16-4
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
#include "caffe/proto/caffe.pb.h"
99

1010
#define CUDNN_VERSION_MIN(major, minor, patch) \
11-
(CUDNN_VERSION >= (major * 1000 + minor * 100 + patch))
11+
(CUDNN_VERSION >= (major * 1000 + minor * 100 + patch))
1212

1313
#define CUDNN_CHECK(condition) \
1414
do { \
@@ -91,8 +91,13 @@ template <typename Dtype>
9191
inline void createFilterDesc(cudnnFilterDescriptor_t* desc,
9292
int n, int c, int h, int w) {
9393
CUDNN_CHECK(cudnnCreateFilterDescriptor(desc));
94+
#if CUDNN_VERSION_MIN(5, 0, 0)
9495
CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType<Dtype>::type,
95-
n, c, h, w));
96+
CUDNN_TENSOR_NCHW, n, c, h, w));
97+
#else
98+
CUDNN_CHECK(cudnnSetFilter4dDescriptor_v4(*desc, dataType<Dtype>::type,
99+
CUDNN_TENSOR_NCHW, n, c, h, w));
100+
#endif
96101
}
97102

98103
template <typename Dtype>
@@ -123,8 +128,15 @@ inline void createPoolingDesc(cudnnPoolingDescriptor_t* pool_desc,
123128
LOG(FATAL) << "Unknown pooling method.";
124129
}
125130
CUDNN_CHECK(cudnnCreatePoolingDescriptor(pool_desc));
126-
CUDNN_CHECK(cudnnSetPooling2dDescriptor(*pool_desc, *mode, h, w,
127-
pad_h, pad_w, stride_h, stride_w));
131+
#if CUDNN_VERSION_MIN(5, 0, 0)
132+
CUDNN_CHECK(cudnnSetPooling2dDescriptor(*pool_desc, *mode,
133+
CUDNN_PROPAGATE_NAN, h, w,
134+
pad_h, pad_w, stride_h, stride_w));
135+
#else
136+
CUDNN_CHECK(cudnnSetPooling2dDescriptor_v4(*pool_desc, *mode,
137+
CUDNN_PROPAGATE_NAN, h, w,
138+
pad_h, pad_w, stride_h, stride_w));
139+
#endif
128140
}
129141

130142
} // namespace cudnn

0 commit comments

Comments
 (0)