
Commit 8af835a

[GPU] Convert input to output data type in ActivationJitConstants (openvinotoolkit#23054)
### Details:
- *Convert the input to the output data type to fix a CL kernel build failure caused by an ambiguous overload of the fmax/fmin functions when the input and output data types differ.*

### Tickets:
- *133562*
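For context, here is a minimal OpenCL C sketch of the kind of overload ambiguity this commit works around; the kernel name, buffers, and the relu_negative_slope formula are illustrative assumptions, not the actual JIT-generated code:

```c
// Illustrative sketch only, not the generated kernel.
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

__kernel void activation_sketch(__global const half* src, __global float* dst, float slope) {
    const uint i = get_global_id(0);
    half in = src[i];

    // Mixed half/float arguments: fmax(in, 0.0f) / fmin(in, 0.0f) have no single matching
    // overload, so the kernel fails to build:
    // dst[i] = fmax(in, 0.0f) + fmin(in, 0.0f) * slope;

    // Fix mirrored by this commit: convert the input to the output data type first,
    // so fmax/fmin see one consistent (float, float) overload.
    float in_f = convert_float(in);
    dst[i] = fmax(in_f, 0.0f) + fmin(in_f, 0.0f) * slope;
}
```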
1 parent 530b9b5 commit 8af835a

4 files changed: +72 −5 lines changed

src/plugins/intel_gpu/src/kernel_selector/jitter.cpp (+10 −2)
```diff
@@ -1573,7 +1573,8 @@ JitConstants MakeActivationJitConstants(std::vector<kernel_selector::base_activa
                                          Datatype out_dt,
                                          const std::string& suffix,
                                          bool use_type_parameter,
-                                         bool disable_type_conversion) {
+                                         bool disable_type_conversion,
+                                         bool convert_input_to_output_dt) {
     JitConstants res = {};
     if (params.empty()) {
         return MakeActivationJitConstants({ActivationFunction::NONE, 0.f, 0.f}, out_dt,
@@ -1602,7 +1603,14 @@ JitConstants MakeActivationJitConstants(std::vector<kernel_selector::base_activa
 
         if (i == 0) {
             activation_params = use_type_parameter ? "(jit_type, input, params)" : "(input, params)";
-            res_activation = "ACTIVATION_FUNC" + activation_suffix + activation_params;
+            if (convert_input_to_output_dt) {
+                // Convert the input to the output data type to fix that cl kernel build failed for an ambiguous issue of the fmax/fmin functions
+                // occurring by the different data types between input and output.
+                res_activation = "ACTIVATION_FUNC" + activation_suffix
+                                 + "(" + (use_type_parameter? "jit_type, ":"") + "convert_" + toCLType(out_dt) + "(input), params)";
+            } else {
+                res_activation = "ACTIVATION_FUNC" + activation_suffix + activation_params;
+            }
         } else {
             res_activation = "ACTIVATION" + activation_suffix + "(" + (use_type_parameter ? "jit_type, " : "") +
                              res_activation + ", ACTIVATION_PARAMS" + activation_suffix + ")";
```
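To picture the effect on the generated code, here is a rough, hypothetical before/after of the emitted JIT string for the common case in this commit (out_dt == F32, empty suffix, use_type_parameter == false); the exact macro layering in the real generated kernels may differ:

```c
/* Hypothetical sketch, not the literal generated header:
 *   before:  #define ACTIVATION(input, params) ACTIVATION_FUNC(input, params)
 *   after:   #define ACTIVATION(input, params) ACTIVATION_FUNC(convert_float(input), params)
 * "convert_float" is built from "convert_" + toCLType(out_dt), so an F32 output always hands
 * ACTIVATION_FUNC a float argument even when the producing primitive computes in FP16.
 */
```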

src/plugins/intel_gpu/src/kernel_selector/jitter.h (+2 −1)
```diff
@@ -310,7 +310,8 @@ JitConstants MakeActivationJitConstants(std::vector<kernel_selector::base_activa
                                         Datatype output_dt,
                                         const std::string& suffix = "",
                                         bool use_type_parameter = false,
-                                        bool disable_type_conversion = false);
+                                        bool disable_type_conversion = false,
+                                        bool convert_input_to_output_dt = false);
 JitConstants MakeBaseParamsJitConstants(const base_params& params);
 JitConstants MakeLoopUnrollParamsJitConstants(uint32_t loopCount);
 
```

src/plugins/intel_gpu/src/kernel_selector/kernel_base.cpp (+4 −2)
```diff
@@ -90,7 +90,9 @@ JitConstants KernelBase::MakeBaseParamsJitConstants(const base_params& params, b
     // Changed data type from unit type to output data type to fix the issue case that
     // the activation function makes cl kernel build error when the output data type
     // and unit type are different and activation param is existed
-    jit.Merge(MakeActivationJitConstants(params.activations, params.outputs[0].GetDType()));
+    bool convert_input_to_output_dt = (params.outputs[0].GetDType() == Datatype::F32 && params.inputs[0].GetDType() == Datatype::F16);
+    // If input is FP16 and output is FP32, convert input to float before running activation function.
+    jit.Merge(MakeActivationJitConstants(params.activations, params.outputs[0].GetDType(), "", false, false, convert_input_to_output_dt));
 
     if (add_tensor_definitions) {
         for (size_t i = 0; i < params.inputs.size(); i++) {
@@ -128,7 +130,7 @@ bool KernelBase::IsSIMDSizeSupported(const EngineInfo &info, size_t simd_size) c
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// MakeBaseParamsJitConstants
+// MakeFusedOpsJitConstants
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 JitConstants KernelBase::MakeFusedOpsJitConstants(const kernel_selector::base_params &params,
                                                   const std::vector<FusedOpsConfiguration> &conf) const {
```

src/plugins/intel_gpu/tests/unit/test_cases/depth_to_space_gpu_test.cpp (+56 −0)
```diff
@@ -10,6 +10,7 @@
 #include <intel_gpu/primitives/reshape.hpp>
 #include <intel_gpu/primitives/permute.hpp>
 #include <intel_gpu/primitives/reorder.hpp>
+#include <intel_gpu/primitives/eltwise.hpp>
 
 #include <cstddef>
 
@@ -423,3 +424,58 @@ TEST(depth_to_space_fp32_gpu, d1822_bs2_depth_first) {
 TEST(export_import_depth_to_space_fp32_gpu, d1822_bs2_depth_first) {
     test_depth_to_space_fp32_gpu_d1822_bs2_depth_first<float>(true);
 }
+
+static void test_depth_to_space_fp16_input_fp32_output(bool is_caching_test) {
+    auto& engine = get_test_engine();
+
+    auto input = engine.allocate_memory({ data_types::f32, format::bfyx, { 1, 1, 4, 5 } });
+    auto weights = engine.allocate_memory({ data_types::f16, format::bfyx, { 1, 1, 3, 2 } });
+
+    size_t block_size = 1;
+
+    set_values(input, {
+        1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+        2.0f, 2.0f, 3.0f, 4.0f, 6.0f,
+        3.0f, 3.0f, 3.0f, 5.0f, 1.0f,
+        1.0f, 1.0f, 1.0f, 1.0f, 1.0f
+    });
+    set_values(weights, {
+        ov::float16(1.0f), ov::float16(2.0f), ov::float16(1.0f),
+        ov::float16(2.0f), ov::float16(1.0f), ov::float16(2.0f)
+    });
+
+    // Apply existed topology that makes kernel build failure because of input and output data types are different.
+    topology topology;
+    topology.add(cldnn::input_layout("input", input->get_layout()));
+    topology.add(cldnn::data("weights", weights));
+    topology.add(cldnn::reorder("reorder_input", input_info("input"), cldnn::layout(data_types::f16, format::byxf, { 1, 1, 4, 5 })));
+    topology.add(cldnn::convolution("conv", input_info("reorder_input"), "weights", "", 1, { 2, 1 }, {1, 1}, {0, 0}, {0, 0}, false));
+    topology.add(cldnn::depth_to_space("depth_to_space", input_info("conv"), block_size, depth_to_space_mode::depth_first));
+    topology.add(cldnn::activation("activate", input_info("depth_to_space"), cldnn::activation_func::relu_negative_slope, {0.25f, 0.f}));
+    topology.add(cldnn::reorder("convert:output", input_info("activate"), format::any, data_types::f32, {}, reorder_mean_mode::subtract, padding(), true));
+    topology.add(cldnn::reorder("result:output/sink_port_0", input_info("convert:output"), format::bfyx, data_types::f32, {}, reorder_mean_mode::subtract, padding(), false));
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::optimize_data(true));
+
+    cldnn::network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
+
+    network->set_input_data("input", input);
+
+    auto outputs = network->execute();
+
+    auto output = outputs.at("result:output/sink_port_0").get_memory();
+    cldnn::mem_lock<float> output_ptr(output, get_test_stream());
+
+    std::vector<float> expected_results = {
+        24.0f, 24.0f, 32.0f, 28.0f
+    };
+
+    for (size_t i = 0; i < expected_results.size(); ++i) {
+        ASSERT_EQ(expected_results[i], output_ptr[i]);
+    }
+}
+
+TEST(depth_to_space_gpu, fp16_input_fp32_output) {
+    test_depth_to_space_fp16_input_fp32_output(false);
+}
```
