Commit 6f11b74: Paged attention
Parent: 757caa1

24 files changed (+1932, -2 lines)

src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp (+1)

@@ -172,6 +172,7 @@ class ProgramBuilder final {
     void CreateSingleLayerPrimitive(cldnn::topology& topology, const std::shared_ptr<ov::Node>& op);
 };
 
+void CreatePagedAttention(ProgramBuilder& p, const std::shared_ptr<ov::Node>& op);
 void CreateCustomOp(ProgramBuilder& p, const std::shared_ptr<ov::Node>& node, CustomLayerPtr customLayer);
 void CreateUnaryEltwiseOp(ProgramBuilder& p, const std::shared_ptr<ov::Node>& node,
                           cldnn::activation_func func, cldnn::activation_additional_params params);
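
The declaration above registers the translation entry point for the PagedAttention operation. Below is a minimal sketch of what such a factory typically looks like in this plugin; it is an assumption based on the plugin's usual op-translation helpers (validate_inputs_count, GetInputInfo, layer_type_name_ID, add_primitive), not the implementation from this commit, and the include path of the new primitive header is assumed.

// Hedged sketch, not taken from this commit: translate the ov::Node into the
// cldnn::paged_attention primitive introduced by this change.
#include "intel_gpu/plugin/program_builder.hpp"
#include "intel_gpu/plugin/common_utils.hpp"
#include "intel_gpu/primitives/paged_attention.hpp"  // assumed header location

void CreatePagedAttention(ProgramBuilder& p, const std::shared_ptr<ov::Node>& op) {
    validate_inputs_count(op, {13});          // the primitive asserts exactly 13 inputs
    auto inputs = p.GetInputInfo(op);         // query/key/value, KV-cache and metadata inputs
    auto prim = cldnn::paged_attention(layer_type_name_ID(op), inputs);
    // head_size / heads_num / kv_heads_num / block_size / x_block_size would be
    // filled from the operation's configuration before the primitive is added.
    p.add_primitive(*op, prim);
}
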
New file (+53)

@@ -0,0 +1,53 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include "primitive.hpp"
+#include "intel_gpu/graph/program.hpp"
+
+#include <vector>
+
+namespace cldnn {
+
+struct paged_attention : public primitive_base<paged_attention> {
+    CLDNN_DECLARE_PRIMITIVE(paged_attention)
+
+    paged_attention() : primitive_base("", {}) {}
+
+    paged_attention(const primitive_id& id,
+                    const std::vector<input_info>& inputs,
+                    const padding& output_padding = padding())
+        : primitive_base(id, inputs, {output_padding}) {
+        OPENVINO_ASSERT(inputs.size() == 13, "[GPU] Unexpected inputs number for PagedAttention primitive: ", inputs.size());
+    }
+
+    bool operator==(const primitive& rhs) const override {
+        return compare_common_params(rhs);
+    }
+
+    void save(BinaryOutputBuffer& ob) const override {
+        primitive_base<paged_attention>::save(ob);
+        ob << head_size;
+        ob << heads_num;
+        ob << kv_heads_num;
+        ob << block_size;
+        ob << x_block_size;
+    }
+
+    void load(BinaryInputBuffer& ib) override {
+        primitive_base<paged_attention>::load(ib);
+        ib >> head_size;
+        ib >> heads_num;
+        ib >> kv_heads_num;
+        ib >> block_size;
+        ib >> x_block_size;
+    }
+
+    size_t head_size;
+    size_t heads_num;
+    size_t kv_heads_num;
+    size_t block_size;
+    size_t x_block_size;
+};
+}  // namespace cldnn
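
The new primitive carries the paged-attention geometry (head_size, heads_num, kv_heads_num, block_size, x_block_size) and serializes it through save/load for model caching. A minimal usage sketch follows; the include paths, input names, and parameter values are assumptions for illustration, not part of this commit.

// Hedged sketch: build a cldnn::paged_attention primitive and add it to a topology.
// Header locations are assumed; the 13 inputs are placeholders standing in for the
// real query/key/value, KV-cache, block-table and sequence-metadata inputs.
#include <string>
#include <vector>

#include "intel_gpu/graph/topology.hpp"              // assumed path
#include "intel_gpu/primitives/paged_attention.hpp"  // assumed path of the new header

void add_paged_attention_example(cldnn::topology& topology) {
    std::vector<cldnn::input_info> inputs;
    for (int i = 0; i < 13; ++i)                     // the constructor asserts exactly 13 inputs
        inputs.emplace_back("pa_input_" + std::to_string(i));

    cldnn::paged_attention pa("paged_attention", inputs);
    pa.head_size    = 128;                           // example values only
    pa.heads_num    = 32;
    pa.kv_heads_num = 32;
    pa.block_size   = 16;
    pa.x_block_size = 8;
    topology.add(pa);
}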

src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp (+6)

@@ -33,6 +33,12 @@ struct gemm_impl : multi_stage_primitive<gemm> {
         return make_unique<gemm_impl>(*this);
     }
 
+    gemm_impl() = default;
+
+    gemm_impl(const std::vector<kernel_selector::kernel_data>& kd) : parent(kd) {
+        this->can_reuse_memory = true;
+    }
+
     void load(BinaryInputBuffer& ib) override {
         parent::load(ib);
         if (is_dynamic()) {

src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp (+1)

@@ -53,6 +53,7 @@ struct multi_stage_primitive : public typed_primitive_impl<PType> {
         }
         this->can_reuse_memory = false;
         this->_kernel_name = other._kernel_name;
+        this->can_reuse_memory = other.can_reuse_memory;
         this->_is_dynamic = other._is_dynamic;
     }
 
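
Together, the last two changes let multi-stage implementations opt in to memory reuse: gemm_impl now sets can_reuse_memory = true when built from kernel data, and the multi_stage_primitive copy constructor propagates the flag instead of always leaving it false. The standalone snippet below is a simplified illustration of that behavior with hypothetical types, not the plugin's actual classes.

// Simplified, self-contained illustration of the flag propagation
// (impl_base / gemm_like_impl are invented stand-ins for the real classes).
#include <cassert>

struct impl_base {
    bool can_reuse_memory = false;

    impl_base() = default;
    impl_base(const impl_base& other)
        : can_reuse_memory(other.can_reuse_memory) {}  // copy keeps the source's setting

};

struct gemm_like_impl : impl_base {
    gemm_like_impl() { can_reuse_memory = true; }      // opts in, like gemm_impl(kd)
};

int main() {
    gemm_like_impl original;
    impl_base copy(original);
    assert(copy.can_reuse_memory);  // holds only because the copy preserves the flag
    return 0;
}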
