Migrate/jit/matmul tiling 2d #1472

Merged · 28 commits · Mar 22, 2024
Changes from 7 commits
11 changes: 8 additions & 3 deletions crates/burn-jit/src/codegen/dialect/gpu/macros.rs
@@ -295,9 +295,14 @@ macro_rules! gpu {
};
// out = vec4(a, b, c, d)
($scope:expr, $out:ident = vec4($a:ident,$b:ident,$c:ident,$d:ident)) => {
$scope.register($crate::codegen::dialect::gpu::Operator::AssignVec4(
$crate::codegen::dialect::gpu::AssignVec4Operator{a:$a,b:$b,c:$c,d:$d,out:$out}
));
let i = $scope.zero(Elem::UInt);
gpu!($scope, $out[i] = $a);
gpu!($scope, i = i + 1u32);
gpu!($scope, $out[i] = $b);
gpu!($scope, i = i + 1u32);
gpu!($scope, $out[i] = $c);
gpu!($scope, i = i + 1u32);
gpu!($scope, $out[i] = $d);
};
// out = input
($scope:expr, $out:ident = $input:ident) => {
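With this change, the `vec4(a, b, c, d)` arm of `gpu!` no longer emits a dedicated `Operator::AssignVec4`; it unrolls into four indexed assignments driven by a `UInt` counter. A minimal plain-Rust sketch of the same pattern, using an ordinary array in place of gpu IR variables (names and types here are illustrative only):

```rust
// Illustrative sketch only: the real macro registers gpu IR operations via
// `$scope`, not array writes. This mirrors the unrolled assignment pattern.
fn assign_vec4(out: &mut [f32; 4], a: f32, b: f32, c: f32, d: f32) {
    let mut i = 0usize; // counterpart of `scope.zero(Elem::UInt)`
    for value in [a, b, c, d] {
        out[i] = value; // counterpart of `gpu!($scope, $out[i] = ...)`
        i += 1;         // counterpart of `gpu!($scope, i = i + 1u32)`
    }
}
```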
11 changes: 0 additions & 11 deletions crates/burn-jit/src/codegen/dialect/gpu/operation.rs
@@ -59,7 +59,6 @@ pub enum Operator {
BitwiseXor(BinaryOperator),
ShiftLeft(BinaryOperator),
ShiftRight(BinaryOperator),
AssignVec4(AssignVec4Operator),
}

/// All metadata that can be access in a shader.
@@ -108,16 +107,6 @@ pub struct ClampOperator {
pub out: Variable,
}

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[allow(missing_docs)]
pub struct AssignVec4Operator {
pub a: Variable,
pub b: Variable,
pub c: Variable,
pub d: Variable,
pub out: Variable,
}

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[allow(missing_docs)]
pub struct ReadGlobalOperator {
18 changes: 1 addition & 17 deletions crates/burn-jit/src/codegen/dialect/gpu/vectorization.rs
@@ -1,7 +1,4 @@
use super::{
AssignVec4Operator, BinaryOperator, ClampOperator, Item, Operation, Operator, UnaryOperator,
Variable,
};
use super::{BinaryOperator, ClampOperator, Item, Operation, Operator, UnaryOperator, Variable};

/// Define a vectorization scheme.
#[allow(dead_code)]
@@ -83,7 +80,6 @@ impl Operator {
Operator::BitwiseXor(op) => Operator::BitwiseXor(op.vectorize(vectorization)),
Operator::ShiftLeft(op) => Operator::ShiftLeft(op.vectorize(vectorization)),
Operator::ShiftRight(op) => Operator::ShiftRight(op.vectorize(vectorization)),
Operator::AssignVec4(op) => Operator::AssignVec4(op.vectorize(vectorization)),
}
}
}
@@ -118,18 +114,6 @@ impl ClampOperator {
}
}

impl AssignVec4Operator {
pub(crate) fn vectorize(&self, vectorization: Vectorization) -> Self {
Self {
a: self.a,
b: self.b,
c: self.c,
d: self.d,
out: self.out.vectorize(vectorization),
}
}
}

impl Variable {
pub(crate) fn vectorize(&self, vectorize: Vectorization) -> Self {
match self {
7 changes: 0 additions & 7 deletions crates/burn-jit/src/fusion/tracing/builder.rs
@@ -356,13 +356,6 @@ impl TraceBuilder {
&mut local_tensor_ids_input,
&mut local_tensor_ids_output,
),
gpu::Operator::AssignVec4(op) => {
mark(&op.a, &mut local_tensor_ids_input);
mark(&op.b, &mut local_tensor_ids_input);
mark(&op.c, &mut local_tensor_ids_input);
mark(&op.d, &mut local_tensor_ids_input);
mark(&op.out, &mut local_tensor_ids_output);
}
},
Operation::Procedure(proc) => {
match proc {
8 changes: 0 additions & 8 deletions crates/burn-jit/src/kernel/matmul/base.rs
@@ -8,14 +8,6 @@ use super::{
init_matmul_output, matmul_autotune, matmul_simple, matmul_tiling_2d, matmul_tiling_2d_padded,
};

#[derive(Debug, Clone)]
pub(crate) enum Tiling2DAssumption {
// Input shapes are divisible by their corresponding block sizes
Round,
// Bounds must be checked
None,
}

#[derive(Debug, Clone)]
/// Tiling 2D parameters
pub struct Tiling2dConfig {
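The removed `Tiling2DAssumption` enum is replaced throughout the PR by a plain `bounds_check_required: bool`. A small sketch of the correspondence (the enum below is the removed type, reproduced only to show how the variants map onto the new flag):

```rust
// For reference only: how the old variants map onto the new flag.
enum Tiling2DAssumption {
    Round, // shapes divide evenly by the block sizes
    None,  // bounds must be checked in the kernel
}

fn bounds_check_required(assumption: &Tiling2DAssumption) -> bool {
    matches!(assumption, Tiling2DAssumption::None) // Round -> false, None -> true
}
```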
1 change: 0 additions & 1 deletion crates/burn-jit/src/kernel/matmul/mod.rs
@@ -20,4 +20,3 @@ pub mod padding;
mod padding;

pub use tiling2d::*;
use tiling2d_shader::*;
32 changes: 15 additions & 17 deletions crates/burn-jit/src/kernel/matmul/tiling2d.rs
@@ -14,7 +14,9 @@ use std::marker::PhantomData;

use super::{
padding::{crop, pad_round, PaddingOutput},
shape_out, tiling2d_launch_options, MatmulTiling2dShader, Tiling2DAssumption, Tiling2dConfig,
shape_out, tiling2d_launch_options,
tiling2d_shader::MatmulTiling2dShader,
Tiling2dConfig,
};

#[derive(new, Debug)]
@@ -25,7 +27,7 @@ struct MatmulTiling2d<E: JitElement> {
#[derive(new, Debug)]
struct MatmulTiling2dEagerKernel<R: Runtime> {
config: Tiling2dConfig,
assumption: Tiling2DAssumption,
bounds_check_required: bool,
_runtime: PhantomData<R>,
}

@@ -41,7 +43,7 @@ impl<R: Runtime> DynamicKernelSource for MatmulTiling2dEagerKernel<R> {
MatmulTiling2dShader {
variables: gpu::BinaryOperator { lhs, rhs, out },
config: self.config.clone(),
assumption: self.assumption.clone(),
bounds_check_required: self.bounds_check_required,
unroll: false,
}
.expand(&mut scope);
@@ -76,10 +78,10 @@ impl<R: Runtime> DynamicKernelSource for MatmulTiling2dEagerKernel<R> {

fn id(&self) -> String {
format!(
"{:?}config={:?}assumption={:?}",
"{:?}config={:?}boundcheck={:?}",
core::any::TypeId::of::<Self>(),
self.config,
self.assumption
self.bounds_check_required
)
}
}
@@ -92,9 +94,9 @@ pub fn matmul_tiling_2d<R: Runtime, E: JitElement + Element, const D: usize>(
out: JitTensor<R, E, D>,
config: Tiling2dConfig,
) -> JitTensor<R, E, D> {
let assumption = check_assumption(&lhs.shape, &rhs.shape, &config);
let bounds_check_required = check_bound_requirement(&lhs.shape, &rhs.shape, &config);

let kernel = MatmulTiling2dEagerKernel::<R>::new(config.clone(), assumption);
let kernel = MatmulTiling2dEagerKernel::<R>::new(config.clone(), bounds_check_required);
let client = lhs.client.clone();

let lhs = match lhs.batch_swapped_with_row_col() {
@@ -126,7 +128,7 @@ pub fn matmul_tiling_2d_padded<R: Runtime, E: JitElement + Element, const D: usize>(
out: JitTensor<R, E, D>,
config: Tiling2dConfig,
) -> JitTensor<R, E, D> {
let kernel = MatmulTiling2dEagerKernel::<R>::new(config.clone(), Tiling2DAssumption::Round);
let kernel = MatmulTiling2dEagerKernel::<R>::new(config.clone(), false);
let client = lhs.client.clone();

// A tensor may need to be padded, in which case it will implicitly become contiguous
@@ -177,16 +179,12 @@ pub fn matmul_tiling_2d_padded<R: Runtime, E: JitElement + Element, const D: usize>(
crop(rounded_output, out)
}

fn check_assumption<const D: usize>(
fn check_bound_requirement<const D: usize>(
lhs_shape: &Shape<D>,
rhs_shape: &Shape<D>,
config: &Tiling2dConfig,
) -> Tiling2DAssumption {
let m_divisible = lhs_shape.dims[D - 2] % config.block_size_m == 0;
let k_divisible = lhs_shape.dims[D - 1] % config.block_size_k == 0;
let n_divisible = rhs_shape.dims[D - 1] % config.block_size_n == 0;
match m_divisible && k_divisible && n_divisible {
true => Tiling2DAssumption::Round,
false => Tiling2DAssumption::None,
}
) -> bool {
lhs_shape.dims[D - 2] % config.block_size_m != 0
|| lhs_shape.dims[D - 1] % config.block_size_k != 0
|| rhs_shape.dims[D - 1] % config.block_size_n != 0
}
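`check_bound_requirement` returns `true` as soon as any of the m, k, or n dimensions is not divisible by its block size, in which case the kernel must guard its loads and stores. A standalone sketch of the same test with hypothetical shapes and block sizes (the values below are examples, not taken from the PR):

```rust
// Same divisibility test, extracted for illustration; all values are examples.
fn bounds_check_required(m: usize, k: usize, n: usize, bm: usize, bk: usize, bn: usize) -> bool {
    m % bm != 0 || k % bk != 0 || n % bn != 0
}

fn main() {
    // k = 96 is not divisible by block_size_k = 64, so bounds checks are required.
    assert!(bounds_check_required(128, 96, 64, 64, 64, 64));
    // Every dimension divides evenly by its block size: the unchecked path is safe.
    assert!(!bounds_check_required(128, 128, 64, 64, 64, 64));
}
```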