Hi Mika,

kernel test robot noticed the following build warnings:

[auto build test WARNING on drm-misc/drm-misc-next]
[also build test WARNING on drm-tip/drm-tip v6.12 next-20241122]
[If your patch is applied to the wrong git tree, kindly drop us a note.
When submitting a patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Mika-Laitio/ammdgpu-fix-for-gfx1103-queue-evict-restore-crash/20241122-035602
base:   git://anongit.freedesktop.org/drm/drm-misc drm-misc-next
patch link:    https://lore.kernel.org/r/20241121195233.10679-1-lamikr%40gmail.com
patch subject: [PATCH] ammdgpu fix for gfx1103 queue evict/restore crash
config: arm64-allmodconfig (https://download.01.org/0day-ci/archive/20241123/202411231603.pmbyckko-...@intel.com/config)
compiler: clang version 20.0.0git (https://github.com/llvm/llvm-project 592c0fe55f6d9a811028b5f3507be91458ab2713)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241123/202411231603.pmbyckko-...@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot <l...@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202411231603.pmbyckko-...@intel.com/

All warnings (new ones prefixed by >>):

   In file included from drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_device_queue_manager.c:32:
   In file included from drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_priv.h:37:
   In file included from include/linux/kfifo.h:40:
   In file included from include/linux/dma-mapping.h:11:
   In file included from include/linux/scatterlist.h:8:
   In file included from include/linux/mm.h:2213:
   include/linux/vmstat.h:504:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
     504 |         return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
         |                            ~~~~~~~~~~~~~~~~~~~~~ ^
     505 |                            item];
         |                            ~~~~
   include/linux/vmstat.h:511:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
     511 |         return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
         |                            ~~~~~~~~~~~~~~~~~~~~~ ^
     512 |                            NR_VM_NUMA_EVENT_ITEMS +
         |                            ~~~~~~~~~~~~~~~~~~~~~~
   include/linux/vmstat.h:518:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
     518 |         return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
         |                               ~~~~~~~~~~~ ^ ~~~
   include/linux/vmstat.h:524:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
     524 |         return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
         |                            ~~~~~~~~~~~~~~~~~~~~~ ^
     525 |                            NR_VM_NUMA_EVENT_ITEMS +
         |                            ~~~~~~~~~~~~~~~~~~~~~~
>> drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_device_queue_manager.c:1354:1: warning: unused label 'out_unlock' [-Wunused-label]
    1354 | out_unlock:
         | ^~~~~~~~~~~
   5 warnings generated.


vim +/out_unlock +1354 drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_device_queue_manager.c

  1292  
  1293  static int restore_process_queues_cpsch(struct device_queue_manager 
*dqm,
  1294                                          struct qcm_process_device *qpd)
  1295  {
  1296          struct queue *q;
  1297          struct device *dev = dqm->dev->adev->dev;
  1298          struct kfd_process_device *pdd;
  1299          uint64_t eviction_duration;
  1300          int retval = 0;
  1301  
  1302          // gfx1103 APU fails to remove the queue usually after 10-50 
attempts
  1303          if (dqm->dev->adev->flags & AMD_IS_APU)
  1304                  goto out;
  1305          pdd = qpd_to_pdd(qpd);
  1306  
  1307          dqm_lock(dqm);
  1308          if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do 
nothing */
  1309                  goto out;
  1310          if (qpd->evicted > 1) { /* ref count still > 0, decrement & 
quit */
  1311                  qpd->evicted--;
  1312                  goto out;
  1313          }
  1314  
  1315          /* The debugger creates processes that temporarily have not 
acquired
  1316           * all VMs for all devices and has no VMs itself.
  1317           * Skip queue restore on process restore.
  1318           */
  1319          if (!pdd->drm_priv)
  1320                  goto vm_not_acquired;
  1321  
  1322          pr_debug_ratelimited("Restoring PASID 0x%x queues\n",
  1323                              pdd->process->pasid);
  1324  
  1325          /* Update PD Base in QPD */
  1326          qpd->page_table_base = 
amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);
  1327          pr_debug("Updated PD address to 0x%llx\n", 
qpd->page_table_base);
  1328  
  1329          /* activate all active queues on the qpd */
  1330          list_for_each_entry(q, &qpd->queues_list, list) {
  1331                  q->properties.is_evicted = false;
  1332                  if (!QUEUE_IS_ACTIVE(q->properties))
  1333                          continue;
  1334  
  1335                  q->properties.is_active = true;
  1336                  increment_queue_count(dqm, &pdd->qpd, q);
  1337  
  1338                  if (dqm->dev->kfd->shared_resources.enable_mes) {
  1339                          retval = add_queue_mes(dqm, q, qpd);
  1340                          if (retval) {
  1341                                  dev_err(dev, "Failed to restore queue 
%d\n",
  1342                                          q->properties.queue_id);
  1343                                  goto out;
  1344                          }
  1345                  }
  1346          }
  1347          if (!dqm->dev->kfd->shared_resources.enable_mes)
  1348                  retval = execute_queues_cpsch(dqm,
  1349                                                
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD);
  1350          eviction_duration = get_jiffies_64() - 
pdd->last_evict_timestamp;
  1351          atomic64_add(eviction_duration, &pdd->evict_duration_counter);
  1352  vm_not_acquired:
  1353          qpd->evicted = 0;
> 1354  out_unlock:
  1355          dqm_unlock(dqm);
  1356  out:
  1357          return retval;
  1358  }
  1359  

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

Reply via email to