Skip to content

Commit bee37a8

Browse files
committed
drm/xe/uapi: Use hint for guc to set GT frequency
Allow user to provide a low latency hint. When set, KMD sends a hint to GuC which results in special handling for that process. SLPC will ramp the GT frequency aggressively every time it switches to this process. We need to enable the use of SLPC Compute strategy during init, but it will apply only to processes that set this bit during process creation. Improvement with this approach is shown below: Before, :~$ NEOReadDebugKeys=1 EnableDirectSubmission=0 clpeak --kernel-latency Platform: Intel(R) OpenCL Graphics Device: Intel(R) Graphics [0xe20b] Driver version : 24.52.0 (Linux x64) Compute units : 160 Clock frequency : 2850 MHz Kernel launch latency : 283.16 us After, :~$ NEOReadDebugKeys=1 EnableDirectSubmission=0 clpeak --kernel-latency Platform: Intel(R) OpenCL Graphics Device: Intel(R) Graphics [0xe20b] Driver version : 24.52.0 (Linux x64) Compute units : 160 Clock frequency : 2850 MHz Kernel launch latency : 63.38 us UMD Compute PR : intel/compute-runtime#794 Mesa PR: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33214 v9(Vinay): - remove extra line, align commit message v8(Vinay): - Add separate example for using low latency hint v7(Jose): - Update UMD PR - applicable to all gpus V6: - init flags, remove redundant flags check (MAuld) V5: - Move uapi doc to documentation and GuC ABI specific change (Rodrigo) - Modify logic to restrict exec queue flags (MAuld) V4: - To make it clear, don't use exec queue word (Vinay) - Correct typo in description of flag (Jose/Vinay) - rename set_strategy api and replace ctx with exec queue(Vinay) - Start with 0th bit to identify user flags (Jose) V3: - Convert user flag to kernel internal flag and use (Oak) - Support query config for user to check kernel support (Jose) - Don't need to take runtime pm (Vinay) V2: - DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT 1 planned for other hint(Szymon) - Add motivation to description (Lucas) Reviewed-by: Vinay Belgaumkar <vinay.belgaumkar@intel.com> Link: 
https://patchwork.freedesktop.org/patch/msgid/20250212121814.9947-1-tejas.upadhyay@intel.com Signed-off-by: Tejas Upadhyay <tejas.upadhyay@intel.com>
1 parent b7b68c6 commit bee37a8

File tree

8 files changed

+75
-4
lines changed

8 files changed

+75
-4
lines changed

Documentation/gpu/drm-uapi.rst

+18
Original file line numberDiff line numberDiff line change
@@ -583,3 +583,21 @@ dma-buf interoperability
583583

584584
Please see Documentation/userspace-api/dma-buf-alloc-exchange.rst for
585585
information on how dma-buf is integrated and exposed within DRM.
586+
587+
Low latency hint by user
588+
========================
589+
590+
Allow users to provide a hint to the kernel for cases demanding a low latency
591+
profile. Please note that it will have an impact on power consumption. Users can
592+
indicate the low latency hint with a flag while creating an exec queue, as
593+
shown below:
594+
595+
struct drm_xe_exec_queue_create exec_queue_create = {
596+
.flags = DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT,
597+
.extensions = 0,
598+
.vm_id = vm,
599+
.num_bb_per_exec = 1,
600+
.num_eng_per_bb = 1,
601+
.instances = to_user_pointer(&instance),
602+
};
603+
ioctl(fd, DRM_IOCTL_XE_EXEC_QUEUE_CREATE, &exec_queue_create);

drivers/gpu/drm/xe/abi/guc_actions_slpc_abi.h

+3
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,9 @@ struct slpc_task_state_data {
174174
};
175175
} __packed;
176176

177+
#define SLPC_CTX_FREQ_REQ_IS_COMPUTE REG_BIT(28)
178+
#define SLPC_OPTIMIZED_STRATEGY_COMPUTE REG_BIT(0)
179+
177180
struct slpc_shared_data_header {
178181
/* Total size in bytes of this shared buffer. */
179182
u32 size;

drivers/gpu/drm/xe/xe_exec_queue.c

+7-3
Original file line numberDiff line numberDiff line change
@@ -604,11 +604,12 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
604604
struct xe_tile *tile;
605605
struct xe_exec_queue *q = NULL;
606606
u32 logical_mask;
607+
u32 flags = 0;
607608
u32 id;
608609
u32 len;
609610
int err;
610611

611-
if (XE_IOCTL_DBG(xe, args->flags) ||
612+
if (XE_IOCTL_DBG(xe, args->flags & ~DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT) ||
612613
XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
613614
return -EINVAL;
614615

@@ -625,6 +626,9 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
625626
if (XE_IOCTL_DBG(xe, eci[0].gt_id >= xe->info.gt_count))
626627
return -EINVAL;
627628

629+
if (args->flags & DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT)
630+
flags |= EXEC_QUEUE_FLAG_LOW_LATENCY;
631+
628632
if (eci[0].engine_class == DRM_XE_ENGINE_CLASS_VM_BIND) {
629633
if (XE_IOCTL_DBG(xe, args->width != 1) ||
630634
XE_IOCTL_DBG(xe, args->num_placements != 1) ||
@@ -633,8 +637,8 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
633637

634638
for_each_tile(tile, xe, id) {
635639
struct xe_exec_queue *new;
636-
u32 flags = EXEC_QUEUE_FLAG_VM;
637640

641+
flags |= EXEC_QUEUE_FLAG_VM;
638642
if (id)
639643
flags |= EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD;
640644

@@ -680,7 +684,7 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
680684
}
681685

682686
q = xe_exec_queue_create(xe, vm, logical_mask,
683-
args->width, hwe, 0,
687+
args->width, hwe, flags,
684688
args->extensions);
685689
up_read(&vm->lock);
686690
xe_vm_put(vm);

drivers/gpu/drm/xe/xe_exec_queue_types.h

+2
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@ struct xe_exec_queue {
8585
#define EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD BIT(3)
8686
/* kernel exec_queue only, set priority to highest level */
8787
#define EXEC_QUEUE_FLAG_HIGH_PRIORITY BIT(4)
88+
/* flag to indicate low latency hint to guc */
89+
#define EXEC_QUEUE_FLAG_LOW_LATENCY BIT(5)
8890

8991
/**
9092
* @flags: flags for this exec queue, should statically setup aside from ban

drivers/gpu/drm/xe/xe_guc_pc.c

+16
Original file line numberDiff line numberDiff line change
@@ -995,6 +995,17 @@ static int pc_init_freqs(struct xe_guc_pc *pc)
995995
return ret;
996996
}
997997

998+
static int pc_action_set_strategy(struct xe_guc_pc *pc, u32 val)
999+
{
1000+
int ret = 0;
1001+
1002+
ret = pc_action_set_param(pc,
1003+
SLPC_PARAM_STRATEGIES,
1004+
val);
1005+
1006+
return ret;
1007+
}
1008+
9981009
/**
9991010
* xe_guc_pc_start - Start GuC's Power Conservation component
10001011
* @pc: Xe_GuC_PC instance
@@ -1054,6 +1065,11 @@ int xe_guc_pc_start(struct xe_guc_pc *pc)
10541065
}
10551066

10561067
ret = pc_action_setup_gucrc(pc, GUCRC_FIRMWARE_CONTROL);
1068+
if (ret)
1069+
goto out;
1070+
1071+
/* Enable SLPC Optimized Strategy for compute */
1072+
ret = pc_action_set_strategy(pc, SLPC_OPTIMIZED_STRATEGY_COMPUTE);
10571073

10581074
out:
10591075
xe_force_wake_put(gt_to_fw(gt), fw_ref);

drivers/gpu/drm/xe/xe_guc_submit.c

+8
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include <drm/drm_managed.h>
1616

1717
#include "abi/guc_actions_abi.h"
18+
#include "abi/guc_actions_slpc_abi.h"
1819
#include "abi/guc_klvs_abi.h"
1920
#include "regs/xe_lrc_layout.h"
2021
#include "xe_assert.h"
@@ -400,6 +401,7 @@ static void __guc_exec_queue_policy_add_##func(struct exec_queue_policy *policy,
400401
MAKE_EXEC_QUEUE_POLICY_ADD(execution_quantum, EXECUTION_QUANTUM)
401402
MAKE_EXEC_QUEUE_POLICY_ADD(preemption_timeout, PREEMPTION_TIMEOUT)
402403
MAKE_EXEC_QUEUE_POLICY_ADD(priority, SCHEDULING_PRIORITY)
404+
MAKE_EXEC_QUEUE_POLICY_ADD(slpc_exec_queue_freq_req, SLPM_GT_FREQUENCY)
403405
#undef MAKE_EXEC_QUEUE_POLICY_ADD
404406

405407
static const int xe_exec_queue_prio_to_guc[] = {
@@ -414,14 +416,20 @@ static void init_policies(struct xe_guc *guc, struct xe_exec_queue *q)
414416
struct exec_queue_policy policy;
415417
enum xe_exec_queue_priority prio = q->sched_props.priority;
416418
u32 timeslice_us = q->sched_props.timeslice_us;
419+
u32 slpc_exec_queue_freq_req = 0;
417420
u32 preempt_timeout_us = q->sched_props.preempt_timeout_us;
418421

419422
xe_gt_assert(guc_to_gt(guc), exec_queue_registered(q));
420423

424+
if (q->flags & EXEC_QUEUE_FLAG_LOW_LATENCY)
425+
slpc_exec_queue_freq_req |= SLPC_CTX_FREQ_REQ_IS_COMPUTE;
426+
421427
__guc_exec_queue_policy_start_klv(&policy, q->guc->id);
422428
__guc_exec_queue_policy_add_priority(&policy, xe_exec_queue_prio_to_guc[prio]);
423429
__guc_exec_queue_policy_add_execution_quantum(&policy, timeslice_us);
424430
__guc_exec_queue_policy_add_preemption_timeout(&policy, preempt_timeout_us);
431+
__guc_exec_queue_policy_add_slpc_exec_queue_freq_req(&policy,
432+
slpc_exec_queue_freq_req);
425433

426434
xe_guc_ct_send(&guc->ct, (u32 *)&policy.h2g,
427435
__guc_exec_queue_policy_action_size(&policy), 0, 0);

drivers/gpu/drm/xe/xe_query.c

+2
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,8 @@ static int query_config(struct xe_device *xe, struct drm_xe_device_query *query)
339339
if (xe_device_get_root_tile(xe)->mem.vram.usable_size)
340340
config->info[DRM_XE_QUERY_CONFIG_FLAGS] =
341341
DRM_XE_QUERY_CONFIG_FLAG_HAS_VRAM;
342+
config->info[DRM_XE_QUERY_CONFIG_FLAGS] |=
343+
DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY;
342344
config->info[DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT] =
343345
xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K ? SZ_64K : SZ_4K;
344346
config->info[DRM_XE_QUERY_CONFIG_VA_BITS] = xe->info.va_bits;

include/uapi/drm/xe_drm.h

+19-1
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,8 @@ struct drm_xe_query_mem_regions {
393393
*
394394
* - %DRM_XE_QUERY_CONFIG_FLAG_HAS_VRAM - Flag is set if the device
395395
* has usable VRAM
396+
* - %DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY - Flag is set if the device
397+
* has low latency hint support
396398
* - %DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT - Minimal memory alignment
397399
* required by this device, typically SZ_4K or SZ_64K
398400
* - %DRM_XE_QUERY_CONFIG_VA_BITS - Maximum bits of a virtual address
@@ -409,6 +411,7 @@ struct drm_xe_query_config {
409411
#define DRM_XE_QUERY_CONFIG_REV_AND_DEVICE_ID 0
410412
#define DRM_XE_QUERY_CONFIG_FLAGS 1
411413
#define DRM_XE_QUERY_CONFIG_FLAG_HAS_VRAM (1 << 0)
414+
#define DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY (1 << 1)
412415
#define DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT 2
413416
#define DRM_XE_QUERY_CONFIG_VA_BITS 3
414417
#define DRM_XE_QUERY_CONFIG_MAX_EXEC_QUEUE_PRIORITY 4
@@ -1204,6 +1207,20 @@ struct drm_xe_vm_bind {
12041207
* };
12051208
* ioctl(fd, DRM_IOCTL_XE_EXEC_QUEUE_CREATE, &exec_queue_create);
12061209
*
1210+
* Allow users to provide a hint to the kernel for cases demanding a low latency
1211+
* profile. Users can indicate the low latency hint with a flag while creating
1212+
* an exec queue, as shown below:
1213+
*
1214+
* struct drm_xe_exec_queue_create exec_queue_create = {
1215+
* .flags = DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT,
1216+
* .extensions = 0,
1217+
* .vm_id = vm,
1218+
* .num_bb_per_exec = 1,
1219+
* .num_eng_per_bb = 1,
1220+
* .instances = to_user_pointer(&instance),
1221+
* };
1222+
* ioctl(fd, DRM_IOCTL_XE_EXEC_QUEUE_CREATE, &exec_queue_create);
1223+
*
12071224
*/
12081225
struct drm_xe_exec_queue_create {
12091226
#define DRM_XE_EXEC_QUEUE_EXTENSION_SET_PROPERTY 0
@@ -1222,7 +1239,8 @@ struct drm_xe_exec_queue_create {
12221239
/** @vm_id: VM to use for this exec queue */
12231240
__u32 vm_id;
12241241

1225-
/** @flags: MBZ */
1242+
#define DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT (1 << 0)
1243+
/** @flags: flags to use for this exec queue */
12261244
__u32 flags;
12271245

12281246
/** @exec_queue_id: Returned exec queue ID */

0 commit comments

Comments
 (0)