Currently XE lacks a clean-on-free implementation, so using
TTM_TT_FLAG_CLEARED_ON_FREE is invalid. Remove usage of
TTM_TT_FLAG_CLEARED_ON_FREE and limit GPU system page clearing
to WB-cached BOs only; those are not pooled, so there is no need to
return zeroed pages to a pool.

Without the patch:
api_overhead_benchmark_l0 --testFilter=UsmMemoryAllocation:
UsmMemoryAllocation(api=l0 type=Host size=4KB) 79.439 us
UsmMemoryAllocation(api=l0 type=Host size=1GB) 98677.75 us
Perf tool top 5 entries:
11.16%  api_overhead_be [kernel.kallsyms] [k] __pageblock_pfn_to_page
7.85%  api_overhead_be  [kernel.kallsyms] [k] cpa_flush
7.59%  api_overhead_be  [kernel.kallsyms] [k] find_next_iomem_res
7.24%  api_overhead_be  [kernel.kallsyms] [k] pages_are_mergeable
5.53%  api_overhead_be  [kernel.kallsyms] [k] lookup_address_in_pgd_attr

With the patch:
UsmMemoryAllocation(api=l0 type=Host size=4KB) 78.164 us
UsmMemoryAllocation(api=l0 type=Host size=1GB) 98880.39 us
Perf tool top 5 entries:
25.40% api_overhead_be  [kernel.kallsyms] [k] clear_page_erms
9.89%  api_overhead_be  [kernel.kallsyms] [k] pages_are_mergeable
4.64%  api_overhead_be  [kernel.kallsyms] [k] cpa_flush
4.04%  api_overhead_be  [kernel.kallsyms] [k] find_next_iomem_res
3.96%  api_overhead_be  [kernel.kallsyms] [k] mod_find

This is still better than the base case where there was no
page clearing offloading.

Cc: Christian König <christian.koe...@amd.com>
Cc: Himal Prasad Ghimiray <himal.prasad.ghimi...@intel.com>
Cc: Matthew Auld <matthew.a...@intel.com>
Cc: Matthew Brost <matthew.br...@intel.com>
Cc: Thomas Hellström <thomas.hellst...@linux.intel.com>
Signed-off-by: Nirmoy Das <nirmoy....@intel.com>
---
 drivers/gpu/drm/xe/xe_bo.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
index 6ed0e1955215..a18408d5d185 100644
--- a/drivers/gpu/drm/xe/xe_bo.c
+++ b/drivers/gpu/drm/xe/xe_bo.c
@@ -283,6 +283,7 @@ struct xe_ttm_tt {
        struct device *dev;
        struct sg_table sgt;
        struct sg_table *sg;
+       bool clear_system_pages;
 };
 
 static int xe_tt_map_sg(struct ttm_tt *tt)
@@ -397,12 +398,17 @@ static struct ttm_tt *xe_ttm_tt_create(struct 
ttm_buffer_object *ttm_bo,
        }
 
        /*
-        * If the device can support gpu clear system pages then set proper ttm
+        * If the device can support gpu clear system pages then set proper
         * flag. Zeroed pages are only required for ttm_bo_type_device so
         * unwanted data is not leaked to userspace.
+        *
+        * XE currently does clear-on-alloc, so GPU clearing will only work
+        * on non-pooled (DRM_XE_GEM_CPU_CACHING_WB) BOs; otherwise the
+        * global pool would get poisoned with non-zeroed pages.
         */
-       if (ttm_bo->type == ttm_bo_type_device && xe->mem.gpu_page_clear_sys)
-               page_flags |= TTM_TT_FLAG_CLEARED_ON_FREE;
+       if (ttm_bo->type == ttm_bo_type_device && xe->mem.gpu_page_clear_sys &&
+           bo->cpu_caching == DRM_XE_GEM_CPU_CACHING_WB)
+               tt->clear_system_pages = true;
 
        err = ttm_tt_init(&tt->ttm, &bo->ttm, page_flags, caching, extra_pages);
        if (err) {
@@ -416,8 +422,11 @@ static struct ttm_tt *xe_ttm_tt_create(struct 
ttm_buffer_object *ttm_bo,
 static int xe_ttm_tt_populate(struct ttm_device *ttm_dev, struct ttm_tt *tt,
                              struct ttm_operation_ctx *ctx)
 {
+       struct xe_ttm_tt *xe_tt;
        int err;
 
+       xe_tt = container_of(tt, struct xe_ttm_tt, ttm);
+
        /*
         * dma-bufs are not populated with pages, and the dma-
         * addresses are set up when moved to XE_PL_TT.
@@ -426,7 +435,7 @@ static int xe_ttm_tt_populate(struct ttm_device *ttm_dev, 
struct ttm_tt *tt,
                return 0;
 
        /* Clear TTM_TT_FLAG_ZERO_ALLOC when GPU is set to clear system pages */
-       if (tt->page_flags & TTM_TT_FLAG_CLEARED_ON_FREE)
+       if (xe_tt->clear_system_pages)
                tt->page_flags &= ~TTM_TT_FLAG_ZERO_ALLOC;
 
        err = ttm_pool_alloc(&ttm_dev->pool, tt, ctx);
@@ -664,6 +673,7 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, 
bool evict,
        struct ttm_resource *old_mem = ttm_bo->resource;
        u32 old_mem_type = old_mem ? old_mem->mem_type : XE_PL_SYSTEM;
        struct ttm_tt *ttm = ttm_bo->ttm;
+       struct xe_ttm_tt *xe_tt = container_of(ttm, struct xe_ttm_tt, ttm);
        struct xe_migrate *migrate = NULL;
        struct dma_fence *fence;
        bool move_lacks_source;
@@ -671,15 +681,16 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, 
bool evict,
        bool needs_clear;
        bool handle_system_ccs = (!IS_DGFX(xe) && xe_bo_needs_ccs_pages(bo) &&
                                  ttm && ttm_tt_is_populated(ttm)) ? true : 
false;
-       bool clear_system_pages;
+       bool clear_system_pages = false;
        int ret = 0;
 
        /*
         * Clear TTM_TT_FLAG_CLEARED_ON_FREE on bo creation path when
         * moving to system as the bo doesn't have dma_mapping.
         */
-       if (!old_mem && ttm && !ttm_tt_is_populated(ttm))
-               ttm->page_flags &= ~TTM_TT_FLAG_CLEARED_ON_FREE;
+       if (!old_mem && ttm && !ttm_tt_is_populated(ttm) &&
+           xe_tt->clear_system_pages)
+               xe_tt->clear_system_pages = false;
 
        /* Bo creation path, moving to system or TT. */
        if ((!old_mem && ttm) && !handle_system_ccs) {
@@ -703,7 +714,7 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, 
bool evict,
        move_lacks_source = handle_system_ccs ? (!bo->ccs_cleared)  :
                                                
(!mem_type_is_vram(old_mem_type) && !tt_has_data);
 
-       clear_system_pages = ttm && (ttm->page_flags & 
TTM_TT_FLAG_CLEARED_ON_FREE);
+       clear_system_pages = ttm && xe_tt->clear_system_pages;
        needs_clear = (ttm && ttm->page_flags & TTM_TT_FLAG_ZERO_ALLOC) ||
                (!ttm && ttm_bo->type == ttm_bo_type_device) ||
                clear_system_pages;
-- 
2.42.0

Reply via email to