Skip to content

Commit 561bd1c

Browse files
Jonathan-Cavitt and intel-lab-lkp
authored and committed
drm/xe/xe_vm: Add per VM fault info
Add additional information to each VM so they can report up to the first 50 seen faults. Only pagefaults are saved this way currently, though in the future, all faults should be tracked by the VM for future reporting. Additionally, of the pagefaults reported, only failed pagefaults are saved this way, as successful pagefaults should recover silently and not need to be reported to userspace. v2: - Free vm after use (Shuicheng) - Compress pf copy logic (Shuicheng) - Update fault_unsuccessful before storing (Shuicheng) - Fix old struct name in comments (Shuicheng) - Keep first 50 pagefaults instead of last 50 (Jianxun) v3: - Avoid unnecessary execution by checking MAX_PFS earlier (jcavitt) - Fix double-locking error (jcavitt) - Assert kmemdump is successful (Shuicheng) v4: - Rename xe_vm.pfs to xe_vm.faults (jcavitt) - Store fault data and not pagefault in xe_vm faults list (jcavitt) - Store address, address type, and address precision per fault (jcavitt) - Store engine class and instance data per fault (Jianxun) - Add and fix kernel docs (Michal W) - Properly handle kzalloc error (Michal W) - s/MAX_PFS/MAX_FAULTS_SAVED_PER_VM (Michal W) - Store fault level per fault (Micahl M) v5: - Store fault and access type instead of address type (Jianxun) v6: - Store pagefaults in non-fault-mode VMs as well (Jianxun) v7: - Fix kernel docs and comments (Michal W) v8: - Fix double-locking issue (Jianxun) v9: - Do not report faults from reserved engines (Jianxun) v10: - Remove engine class and instance (Ivan) v11: - Perform kzalloc outside of lock (Auld) v12: - Fix xe_vm_fault_entry kernel docs (Shuicheng) v13: - Rebase and refactor (jcavitt) v14: - Correctly ignore fault mode in save_pagefault_to_vm (jcavitt) v15: - s/save_pagefault_to_vm/xe_pagefault_save_to_vm (Matt Brost) - Use guard instead of spin_lock/unlock (Matt Brost) - GT was added to xe_pagefault struct. 
Use xe_gt_hw_engine instead of creating a new helper function (Matt Brost) v16: - Set address precision programmatically (Matt Brost) v17: - Set address precision to fixed value (Matt Brost) v18: - s/uAPI/Link in commit log links - Use kzalloc_obj Link: intel/compute-runtime#878 Signed-off-by: Jonathan Cavitt <jonathan.cavitt@intel.com> Suggested-by: Matthew Brost <matthew.brost@intel.com> Reviewed-by: Matthew Brost <matthew.brost@intel.com> Cc: Shuicheng Lin <shuicheng.lin@intel.com> Cc: Jianxun Zhang <jianxun.zhang@intel.com> Cc: Michal Wajdeczko <Michal.Wajdeczko@intel.com> Cc: Michal Mzorek <michal.mzorek@intel.com> Cc: Ivan Briano <ivan.briano@intel.com> Cc: Matthew Auld <matthew.auld@intel.com> Cc: Matthew Brost <matthew.brost@intel.com>
1 parent 0daad6c commit 561bd1c

4 files changed

Lines changed: 138 additions & 0 deletions

File tree

drivers/gpu/drm/xe/xe_pagefault.c

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,31 @@ static void xe_pagefault_print(struct xe_pagefault *pf)
250250
pf->consumer.engine_instance);
251251
}
252252

253+
static void xe_pagefault_save_to_vm(struct xe_device *xe, struct xe_pagefault *pf)
254+
{
255+
struct xe_vm *vm;
256+
257+
/*
258+
* Pagefault may be asociated to VM that is not in fault mode.
259+
* Perform asid_to_vm behavior, except if VM is not in fault
260+
* mode, return VM anyways.
261+
*/
262+
down_read(&xe->usm.lock);
263+
vm = xa_load(&xe->usm.asid_to_vm, pf->consumer.asid);
264+
if (vm)
265+
xe_vm_get(vm);
266+
else
267+
vm = ERR_PTR(-EINVAL);
268+
up_read(&xe->usm.lock);
269+
270+
if (IS_ERR(vm))
271+
return;
272+
273+
xe_vm_add_fault_entry_pf(vm, pf);
274+
275+
xe_vm_put(vm);
276+
}
277+
253278
static void xe_pagefault_queue_work(struct work_struct *w)
254279
{
255280
struct xe_pagefault_queue *pf_queue =
@@ -268,6 +293,7 @@ static void xe_pagefault_queue_work(struct work_struct *w)
268293

269294
err = xe_pagefault_service(&pf);
270295
if (err) {
296+
xe_pagefault_save_to_vm(gt_to_xe(pf.gt), &pf);
271297
if (!(pf.consumer.access_type & XE_PAGEFAULT_ACCESS_PREFETCH)) {
272298
xe_pagefault_print(&pf);
273299
xe_gt_info(pf.gt, "Fault response: Unsuccessful %pe\n",

drivers/gpu/drm/xe/xe_vm.c

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include "xe_device.h"
2828
#include "xe_drm_client.h"
2929
#include "xe_exec_queue.h"
30+
#include "xe_gt.h"
3031
#include "xe_migrate.h"
3132
#include "xe_pat.h"
3233
#include "xe_pm.h"
@@ -577,6 +578,74 @@ static void preempt_rebind_work_func(struct work_struct *w)
577578
trace_xe_vm_rebind_worker_exit(vm);
578579
}
579580

581+
/**
582+
* xe_vm_add_fault_entry_pf() - Add pagefault to vm fault list
583+
* @vm: The VM.
584+
* @pf: The pagefault.
585+
*
586+
* This function takes the data from the pagefault @pf and saves it to @vm->faults.list.
587+
*
588+
* The function exits silently if the list is full, and reports a warning if the pagefault
589+
* could not be saved to the list.
590+
*/
591+
void xe_vm_add_fault_entry_pf(struct xe_vm *vm, struct xe_pagefault *pf)
592+
{
593+
struct xe_vm_fault_entry *e;
594+
struct xe_hw_engine *hwe;
595+
596+
/* Do not report faults on reserved engines */
597+
hwe = xe_gt_hw_engine(pf->gt, pf->consumer.engine_class,
598+
pf->consumer.engine_instance, false);
599+
if (!hwe || xe_hw_engine_is_reserved(hwe))
600+
return;
601+
602+
e = kzalloc_obj(*e);
603+
if (!e) {
604+
drm_warn(&vm->xe->drm,
605+
"Could not allocate memory for fault!\n");
606+
return;
607+
}
608+
609+
guard(spinlock)(&vm->faults.lock);
610+
611+
/*
612+
* Limit the number of faults in the fault list to prevent
613+
* memory overuse.
614+
*/
615+
if (vm->faults.len >= MAX_FAULTS_SAVED_PER_VM) {
616+
kfree(e);
617+
return;
618+
}
619+
620+
e->address = pf->consumer.page_addr;
621+
/*
622+
* TODO:
623+
* Address precision is currently always SZ_4K, but this may change
624+
* in the future.
625+
*/
626+
e->address_precision = SZ_4K;
627+
e->access_type = pf->consumer.access_type;
628+
e->fault_type = FIELD_GET(XE_PAGEFAULT_TYPE_MASK,
629+
pf->consumer.fault_type_level),
630+
e->fault_level = FIELD_GET(XE_PAGEFAULT_LEVEL_MASK,
631+
pf->consumer.fault_type_level),
632+
633+
list_add_tail(&e->list, &vm->faults.list);
634+
vm->faults.len++;
635+
}
636+
637+
static void xe_vm_clear_fault_entries(struct xe_vm *vm)
638+
{
639+
struct xe_vm_fault_entry *e, *tmp;
640+
641+
guard(spinlock)(&vm->faults.lock);
642+
list_for_each_entry_safe(e, tmp, &vm->faults.list, list) {
643+
list_del(&e->list);
644+
kfree(e);
645+
}
646+
vm->faults.len = 0;
647+
}
648+
580649
static int xe_vma_ops_alloc(struct xe_vma_ops *vops, bool array_of_binds)
581650
{
582651
int i;
@@ -1538,6 +1607,9 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags, struct xe_file *xef)
15381607
INIT_LIST_HEAD(&vm->userptr.invalidated);
15391608
spin_lock_init(&vm->userptr.invalidated_lock);
15401609

1610+
INIT_LIST_HEAD(&vm->faults.list);
1611+
spin_lock_init(&vm->faults.lock);
1612+
15411613
ttm_lru_bulk_move_init(&vm->lru_bulk_move);
15421614

15431615
INIT_WORK(&vm->destroy_work, vm_destroy_work_func);
@@ -1854,6 +1926,8 @@ void xe_vm_close_and_put(struct xe_vm *vm)
18541926
}
18551927
up_write(&xe->usm.lock);
18561928

1929+
xe_vm_clear_fault_entries(vm);
1930+
18571931
for_each_tile(tile, xe, id)
18581932
xe_range_fence_tree_fini(&vm->rftree[id]);
18591933

drivers/gpu/drm/xe/xe_vm.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,12 @@
1212
#include "xe_map.h"
1313
#include "xe_vm_types.h"
1414

15+
/**
16+
* MAX_FAULTS_SAVED_PER_VM - Maximum number of faults each vm can store before future
17+
* faults are discarded to prevent memory overuse
18+
*/
19+
#define MAX_FAULTS_SAVED_PER_VM 50
20+
1521
struct drm_device;
1622
struct drm_printer;
1723
struct drm_file;
@@ -22,6 +28,7 @@ struct dma_fence;
2228

2329
struct xe_exec_queue;
2430
struct xe_file;
31+
struct xe_pagefault;
2532
struct xe_sync_entry;
2633
struct xe_svm_range;
2734
struct drm_exec;
@@ -318,6 +325,8 @@ void xe_vm_snapshot_capture_delayed(struct xe_vm_snapshot *snap);
318325
void xe_vm_snapshot_print(struct xe_vm_snapshot *snap, struct drm_printer *p);
319326
void xe_vm_snapshot_free(struct xe_vm_snapshot *snap);
320327

328+
void xe_vm_add_fault_entry_pf(struct xe_vm *vm, struct xe_pagefault *pf);
329+
321330
/**
322331
* xe_vm_set_validating() - Register this task as currently making bos resident
323332
* @allow_res_evict: Allow eviction of buffer objects bound to @vm when

drivers/gpu/drm/xe/xe_vm_types.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
struct drm_pagemap;
2525

2626
struct xe_bo;
27+
struct xe_pagefault;
2728
struct xe_svm_range;
2829
struct xe_sync_entry;
2930
struct xe_user_fence;
@@ -176,6 +177,24 @@ struct xe_userptr_vma {
176177

177178
struct xe_device;
178179

180+
/**
 * struct xe_vm_fault_entry - Elements of vm->faults.list
 * @list: link into @xe_vm.faults.list
 * @address: faulted GPU address
 * @address_precision: precision of the faulted address
 * @fault_type: type of fault reported
 * @access_type: type of address access that resulted in the fault
 * @fault_level: fault level of the fault
 */
struct xe_vm_fault_entry {
	struct list_head list;
	u64 address;
	u32 address_precision;
	u8 access_type;
	u8 fault_type;
	u8 fault_level;
};
197+
179198
struct xe_vm {
180199
/** @gpuvm: base GPUVM used to track VMAs */
181200
struct drm_gpuvm gpuvm;
@@ -333,6 +352,16 @@ struct xe_vm {
333352
bool capture_once;
334353
} error_capture;
335354

355+
/** @faults: List of all faults associated with this VM */
356+
struct {
357+
/** @faults.lock: lock protecting @faults.list */
358+
spinlock_t lock;
359+
/** @faults.list: list of xe_vm_fault_entry entries */
360+
struct list_head list;
361+
/** @faults.len: length of @faults.list */
362+
unsigned int len;
363+
} faults;
364+
336365
/**
337366
* @validation: Validation data only valid with the vm resv held.
338367
* Note: This is really task state of the task holding the vm resv,

0 commit comments

Comments
 (0)