// pybind_state_gpu.cc (caffe2/python, forked from pytorch/pytorch)
// Note(jiayq): the import_array() call is made in caffe2_python.cc, so this
// translation unit must not call it again (hence NO_IMPORT_ARRAY below). See
// http://docs.scipy.org/doc/numpy-1.10.1/reference/c-api.array.html#miscellaneous
// for more details.
#define NO_IMPORT_ARRAY
#include "pybind_state.h"
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#ifdef CAFFE2_USE_CUDNN
#include "caffe2/core/common_cudnn.h"
#endif // CAFFE2_USE_CUDNN
#include <c10/cuda/CUDAGuard.h>
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/operator_fallback_gpu.h"
#include "caffe2/python/pybind_state_registry.h"
#ifdef CAFFE2_USE_TRT
#include "caffe2/contrib/tensorrt/tensorrt_tranformer.h"
#endif // CAFFE2_USE_TRT
namespace caffe2 {
namespace python {
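// Register the Python* operators for CUDA through GPUFallbackOp: inputs are
// copied to CPU, the CPU implementation of the op runs, and outputs are
// copied back to the GPU.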
REGISTER_CUDA_OPERATOR(Python, GPUFallbackOp);
REGISTER_CUDA_OPERATOR(PythonGradient, GPUFallbackOp);
REGISTER_CUDA_OPERATOR(PythonDLPack, GPUFallbackOp);
REGISTER_CUDA_OPERATOR(PythonDLPackGradient, GPUFallbackOp);
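// Feed blobs into CUDA tensors (e.g. numpy arrays passed from Python) via
// TensorFeeder specialized for CUDAContext.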
REGISTER_BLOB_FEEDER(CUDA, TensorFeeder<CUDAContext>);
namespace py = pybind11;
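// Module-level helpers for querying the CUDA/cuDNN build and runtime:
// device counts, versions, memory info, peer access, device properties,
// and the TensorRT conversion entry points.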
void addCUDAGlobalMethods(py::module& m) {
m.def("num_cuda_devices", &NumCudaDevices);
m.def("get_cuda_version", &CudaVersion);
#ifdef CAFFE2_USE_CUDNN
m.def("get_cudnn_version", &cudnnCompiledVersion);
m.attr("cudnn_convolution_fwd_algo_count") =
py::int_((int)CUDNN_CONVOLUTION_FWD_ALGO_COUNT);
m.attr("cudnn_convolution_bwd_data_algo_count") =
py::int_((int)CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT);
m.attr("cudnn_convolution_bwd_filter_algo_count") =
py::int_((int)CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT);
#else
m.def("get_cudnn_version", []() { return static_cast<size_t>(0); });
m.attr("cudnn_convolution_fwd_algo_count") = py::int_(0);
m.attr("cudnn_convolution_bwd_data_algo_count") = py::int_(0);
m.attr("cudnn_convolution_bwd_filter_algo_count") = py::int_(0);
#endif
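  // Returns a (free_bytes, total_bytes) pair for the given device.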
m.def("get_gpu_memory_info", [](int device_id) {
CUDAGuard guard(device_id);
size_t device_free, device_total;
CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
return std::pair<size_t, size_t>{device_free, device_total};
});
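  // Returns a boolean matrix; pattern[i][j] is true when device i can
  // peer-access the memory of device j.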
m.def("get_cuda_peer_access_pattern", []() {
std::vector<std::vector<bool>> pattern;
CAFFE_ENFORCE(caffe2::GetCudaPeerAccessPattern(&pattern));
return pattern;
});
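  // Exposes a small subset of cudaDeviceProp fields for the given device.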
m.def("get_device_properties", [](int deviceid) {
auto& prop = GetDeviceProperty(deviceid);
std::map<std::string, py::object> obj;
obj["name"] = py::cast(prop.name);
obj["major"] = py::cast(prop.major);
obj["minor"] = py::cast(prop.minor);
obj["totalGlobalMem"] = py::cast(prop.totalGlobalMem);
return obj;
});
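  // Converts a serialized ONNX model into a single serialized TensorRT
  // operator definition. Only available when Caffe2 is built with TensorRT
  // support (CAFFE2_USE_TRT); otherwise this throws.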
m.def(
"onnx_to_trt_op",
[](const py::bytes& onnx_model_str,
const std::unordered_map<std::string, std::vector<int>>&
output_size_hints,
int max_batch_size,
int max_workspace_size,
int verbosity,
bool debug_builder) -> py::bytes {
#ifdef CAFFE2_USE_TRT
TensorRTTransformer t(
max_batch_size, max_workspace_size, verbosity, debug_builder);
auto op_def =
t.BuildTrtOp(onnx_model_str.cast<std::string>(), output_size_hints);
std::string out;
op_def.SerializeToString(&out);
return py::bytes(out);
#else
CAFFE_THROW("Please build Caffe2 with USE_TENSORRT=1");
#endif // CAFFE2_USE_TRT
});
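  // Rewrites a serialized predict net, replacing TensorRT-supported subgraphs
  // with TensorRT ops, and returns the transformed net. The shape map provides
  // input tensor shape hints (all treated as FLOAT). Throws unless built with
  // TensorRT support.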
m.def(
"transform_trt",
[](const py::bytes& pred_net_str,
const std::unordered_map<std::string, std::vector<int>>& shapes,
int max_batch_size,
int max_workspace_size,
int verbosity,
bool debug_builder,
bool build_serializable_op) -> py::bytes {
#ifdef CAFFE2_USE_TRT
caffe2::NetDef pred_net;
if (!ParseProtoFromLargeString(
pred_net_str.cast<std::string>(), &pred_net)) {
LOG(ERROR) << "broken pred_net protobuf";
}
std::unordered_map<std::string, TensorShape> tensor_shapes;
for (const auto& it : shapes) {
tensor_shapes.emplace(
it.first, CreateTensorShape(it.second, TensorProto::FLOAT));
}
TensorRTTransformer ts(
max_batch_size,
max_workspace_size,
verbosity,
debug_builder,
build_serializable_op);
ts.Transform(GetCurrentWorkspace(), &pred_net, tensor_shapes);
std::string pred_net_str2;
pred_net.SerializeToString(&pred_net_str2);
return py::bytes(pred_net_str2);
#else
CAFFE_THROW("Please build Caffe2 with USE_TENSORRT=1");
#endif // CAFFE2_USE_TRT
});
}
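// Binds DLPackWrapper<CUDAContext> as DLPackTensorCUDA so CUDA tensors can be
// exchanged with other frameworks through the DLPack protocol.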
void addCUDAObjectMethods(py::module& m) {
py::class_<DLPackWrapper<CUDAContext>>(m, "DLPackTensorCUDA")
.def_property_readonly(
"data",
[](DLPackWrapper<CUDAContext>* t) -> py::object {
CAFFE_ENFORCE_EQ(
t->device_option.device_type(),
PROTO_CUDA,
"Expected CUDA device option for CUDA tensor");
return t->data();
},
"Return DLPack tensor with tensor's data.")
.def(
"feed",
[](DLPackWrapper<CUDAContext>* t, py::object obj) {
CAFFE_ENFORCE_EQ(
t->device_option.device_type(),
PROTO_CUDA,
"Expected CUDA device option for CUDA tensor");
t->feed(obj);
},
"Copy data from given DLPack tensor into this tensor.")
.def_property_readonly(
"_shape",
[](const DLPackWrapper<CUDAContext>& t) { return t.tensor->sizes(); })
.def(
"_reshape",
[](DLPackWrapper<CUDAContext>* t, std::vector<int64_t> dims) {
t->tensor->Resize(dims);
});
}
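// Module entry point. Registers the shared (CPU) bindings first, then the
// CUDA-specific ones, and finally any extensions recorded in
// PybindAdditionRegistry. From Python this module is normally loaded
// indirectly (e.g. through caffe2.python.workspace) rather than imported
// directly.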
PYBIND11_MODULE(caffe2_pybind11_state_gpu, m) {
m.doc() = "pybind11 stateful interface to Caffe2 workspaces - GPU edition";
addGlobalMethods(m);
addCUDAGlobalMethods(m);
addObjectMethods(m);
addCUDAObjectMethods(m);
for (const auto& addition : PybindAdditionRegistry()->Keys()) {
PybindAdditionRegistry()->Create(addition, m);
}
}
} // namespace python
} // namespace caffe2