from:"Kamal, Asad"

RE: [PATCH] drm/amdgpu: Fix XCP instance mask calculation

2024-09-12 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Thursday, September 12, 2024 4:41 PM
To: amd-gfx@lists.freedesktop.org; Kamal, Asad 
Cc: Zhang, Hawking ; Deucher, Alexander 

Subject: [PATCH] drm/amdgpu: Fix XCP instance mask calculation

Fix instance mask calculation for VCN IP. There are cases where VCN instance 
could be shared across partitions. Fix here so that other blocks don't need to 
check for any shared instances based on partition mode.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c | 32 +-
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c 
b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
index 8381dcaa68e2..719f1aa6a429 100644
--- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
+++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
@@ -94,8 +94,6 @@ static void aqua_vanjaram_set_xcp_id(struct amdgpu_device 
*adev,
case AMDGPU_RING_TYPE_VCN_ENC:
case AMDGPU_RING_TYPE_VCN_JPEG:
ip_blk = AMDGPU_XCP_VCN;
-   if (aqua_vanjaram_xcp_vcn_shared(adev))
-   inst_mask = 1 << (inst_idx * 2);
break;
default:
DRM_ERROR("Not support ring type %d!", ring->funcs->type); @@ 
-105,6 +103,8 @@ static void aqua_vanjaram_set_xcp_id(struct amdgpu_device 
*adev,
for (xcp_id = 0; xcp_id < adev->xcp_mgr->num_xcps; xcp_id++) {
if (adev->xcp_mgr->xcp[xcp_id].ip[ip_blk].inst_mask & 
inst_mask) {
ring->xcp_id = xcp_id;
+   dev_dbg(adev->dev, "ring:%s xcp_id :%u", ring->name,
+   ring->xcp_id);
if (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
adev->gfx.enforce_isolation[xcp_id].xcp_id = 
xcp_id;
break;
@@ -394,38 +394,31 @@ static int __aqua_vanjaram_get_xcp_ip_info(struct 
amdgpu_xcp_mgr *xcp_mgr, int x
struct amdgpu_xcp_ip *ip)
 {
struct amdgpu_device *adev = xcp_mgr->adev;
+   int num_sdma, num_vcn, num_shared_vcn, num_xcp;
int num_xcc_xcp, num_sdma_xcp, num_vcn_xcp;
-   int num_sdma, num_vcn;

num_sdma = adev->sdma.num_instances;
num_vcn = adev->vcn.num_vcn_inst;
+   num_shared_vcn = 1;
+
+   num_xcc_xcp = adev->gfx.num_xcc_per_xcp;
+   num_xcp = NUM_XCC(adev->gfx.xcc_mask) / num_xcc_xcp;

switch (xcp_mgr->mode) {
case AMDGPU_SPX_PARTITION_MODE:
-   num_sdma_xcp = num_sdma;
-   num_vcn_xcp = num_vcn;
-   break;
case AMDGPU_DPX_PARTITION_MODE:
-   num_sdma_xcp = num_sdma / 2;
-   num_vcn_xcp = num_vcn / 2;
-   break;
case AMDGPU_TPX_PARTITION_MODE:
-   num_sdma_xcp = num_sdma / 3;
-   num_vcn_xcp = num_vcn / 3;
-   break;
case AMDGPU_QPX_PARTITION_MODE:
-   num_sdma_xcp = num_sdma / 4;
-   num_vcn_xcp = num_vcn / 4;
-   break;
case AMDGPU_CPX_PARTITION_MODE:
-   num_sdma_xcp = 2;
-   num_vcn_xcp = num_vcn ? 1 : 0;
+   num_sdma_xcp = DIV_ROUND_UP(num_sdma, num_xcp);
+   num_vcn_xcp = DIV_ROUND_UP(num_vcn, num_xcp);
break;
default:
return -EINVAL;
}

-   num_xcc_xcp = adev->gfx.num_xcc_per_xcp;
+   if (num_vcn && num_xcp > num_vcn)
+   num_shared_vcn = num_xcp / num_vcn;

switch (ip_id) {
case AMDGPU_XCP_GFXHUB:
@@ -441,7 +434,8 @@ static int __aqua_vanjaram_get_xcp_ip_info(struct 
amdgpu_xcp_mgr *xcp_mgr, int x
ip->ip_funcs = &sdma_v4_4_2_xcp_funcs;
break;
case AMDGPU_XCP_VCN:
-   ip->inst_mask = XCP_INST_MASK(num_vcn_xcp, xcp_id);
+   ip->inst_mask =
+   XCP_INST_MASK(num_vcn_xcp, xcp_id / num_shared_vcn);
/* TODO : Assign IP funcs */
break;
default:
--
2.25.1

RE: [PATCH v2] drm/amd/pm: Allow setting max UCLK on SMU v13.0.6

2024-02-13 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal 
Tested-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Friday, February 9, 2024 1:21 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad ; Ma, Le 

Subject: [PATCH v2] drm/amd/pm: Allow setting max UCLK on SMU v13.0.6

Allow reducing max UCLK in MANUAL performance level. New UCLK value
should be less than the max DPM level UCLK level value.

Ex:
echo manual > "/sys/bus/pci/devices/.../power_dpm_force_performance_level"
echo m 1 900 > "/sys/bus/pci/devices/.../pp_od_clk_voltage"
echo c > "/sys/bus/pci/devices/.../pp_od_clk_voltage"

Signed-off-by: Lijo Lazar 
---
v2:
On switching perf level to auto, restore GFX and UCLK levels only if 
needed.

 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 122 +++---
 1 file changed, 102 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 03873d784be6..6e8a7eb1864d 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -1578,6 +1578,8 @@ static int smu_v13_0_6_set_performance_level(struct 
smu_context *smu,
struct smu_13_0_dpm_context *dpm_context = smu_dpm->dpm_context;
struct smu_13_0_dpm_table *gfx_table =
&dpm_context->dpm_tables.gfx_table;
+   struct smu_13_0_dpm_table *uclk_table =
+   &dpm_context->dpm_tables.uclk_table;
struct smu_umd_pstate_table *pstate_table = &smu->pstate_table;
int ret;

@@ -1593,17 +1595,27 @@ static int smu_v13_0_6_set_performance_level(struct 
smu_context *smu,
return 0;

case AMD_DPM_FORCED_LEVEL_AUTO:
-   if ((gfx_table->min == pstate_table->gfxclk_pstate.curr.min) &&
-   (gfx_table->max == pstate_table->gfxclk_pstate.curr.max))
-   return 0;
+   if ((gfx_table->min != pstate_table->gfxclk_pstate.curr.min) ||
+   (gfx_table->max != pstate_table->gfxclk_pstate.curr.max)) {
+   ret = smu_v13_0_6_set_gfx_soft_freq_limited_range(
+   smu, gfx_table->min, gfx_table->max);
+   if (ret)
+   return ret;

-   ret = smu_v13_0_6_set_gfx_soft_freq_limited_range(
-   smu, gfx_table->min, gfx_table->max);
-   if (ret)
-   return ret;
+   pstate_table->gfxclk_pstate.curr.min = gfx_table->min;
+   pstate_table->gfxclk_pstate.curr.max = gfx_table->max;
+   }
+
+   if (uclk_table->max != pstate_table->uclk_pstate.curr.max) {
+   /* Min UCLK is not expected to be changed */
+   ret = smu_v13_0_set_soft_freq_limited_range(
+   smu, SMU_UCLK, 0, uclk_table->max);
+   if (ret)
+   return ret;
+   pstate_table->uclk_pstate.curr.max = uclk_table->max;
+   }
+   pstate_table->uclk_pstate.custom.max = 0;

-   pstate_table->gfxclk_pstate.curr.min = gfx_table->min;
-   pstate_table->gfxclk_pstate.curr.max = gfx_table->max;
return 0;
case AMD_DPM_FORCED_LEVEL_MANUAL:
return 0;
@@ -1626,7 +1638,8 @@ static int smu_v13_0_6_set_soft_freq_limited_range(struct 
smu_context *smu,
uint32_t max_clk;
int ret = 0;

-   if (clk_type != SMU_GFXCLK && clk_type != SMU_SCLK)
+   if (clk_type != SMU_GFXCLK && clk_type != SMU_SCLK &&
+   clk_type != SMU_UCLK)
return -EINVAL;

if ((smu_dpm->dpm_level != AMD_DPM_FORCED_LEVEL_MANUAL) &&
@@ -1636,18 +1649,31 @@ static int 
smu_v13_0_6_set_soft_freq_limited_range(struct smu_context *smu,
if (smu_dpm->dpm_level == AMD_DPM_FORCED_LEVEL_MANUAL) {
if (min >= max) {
dev_err(smu->adev->dev,
-   "Minimum GFX clk should be less than the 
maximum allowed clock\n");
+   "Minimum clk should be less than the maximum 
allowed clock\n");
return -EINVAL;
}

-   if ((min == pstate_table->gfxclk_pstate.curr.min) &&
-   (max == pstate_table->gfxclk_pstate.curr.max))
-   return 0;
+   if (clk_type == SMU_GFXCLK) {
+   if ((min == pstate_table->gfxclk_pstate.curr.min) &&
+   (m

RE: [PATCH 1/2] drm/amdgpu: Add fatal error detected flag

2024-02-22 Thread Kamal, Asad

[AMD Official Use Only - General]

Series is Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Thursday, February 22, 2024 3:47 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kuehling, Felix ; Joshi, 
Mukul ; Kamal, Asad 
Subject: [PATCH 1/2] drm/amdgpu: Add fatal error detected flag

For a RAS error that needs a full reset to recover, set the fatal error status. 
Clear the status once the device is reset.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 32 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h|  6 
 3 files changed, 39 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 1ef892bea488..d475c54c0a08 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5308,6 +5308,7 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
if (need_full_reset) {
/* post card */
+   amdgpu_ras_set_fed(tmp_adev, false);
r = amdgpu_device_asic_init(tmp_adev);
if (r) {
dev_warn(tmp_adev->dev, "asic atom init 
failed!"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 46f3d1013e8c..2c94de305c69 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2439,6 +2439,18 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
ras->gpu_reset_flags &= 
~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
set_bit(AMDGPU_NEED_FULL_RESET, 
&reset_context.flags);

+   /* For any RAS error that needs a full reset to
+* recover, set the fatal error status
+*/
+   if (hive) {
+   list_for_each_entry(remote_adev,
+   &hive->device_list,
+   gmc.xgmi.head)
+   amdgpu_ras_set_fed(remote_adev,
+  true);
+   } else {
+   amdgpu_ras_set_fed(adev, true);
+   }
psp_fatal_error_recovery_quirk(&adev->psp);
}
}
@@ -3440,6 +3452,26 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
return 0;
 }

+bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev) {
+   struct amdgpu_ras *ras;
+
+   ras = amdgpu_ras_get_context(adev);
+   if (!ras)
+   return false;
+
+   return atomic_read(&ras->fed);
+}
+
+void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status) {
+   struct amdgpu_ras *ras;
+
+   ras = amdgpu_ras_get_context(adev);
+   if (ras)
+   atomic_set(&ras->fed, !!status);
+}
+
 void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)  {
if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index d10e5bb0e52f..e0f8ce9d8440 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -477,6 +477,8 @@ struct amdgpu_ras {
wait_queue_head_t page_retirement_wq;
struct mutex page_retirement_lock;
atomic_t page_retirement_req_cnt;
+   /* Fatal error detected flag */
+   atomic_t fed;
 };

 struct ras_fs_data {
@@ -873,4 +875,8 @@ void amdgpu_ras_add_mca_err_addr(struct ras_err_info 
*err_info,

 void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info,
struct ras_err_addr *mca_err_addr);
+
+void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status); bool
+amdgpu_ras_get_fed_status(struct amdgpu_device *adev);
+
 #endif
--
2.25.1

RE: [PATCH] drm/amd/pm: Increase SMUv13.0.6 mode-2 reset time

2024-02-26 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Monday, February 26, 2024 4:08 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH] drm/amd/pm: Increase SMUv13.0.6 mode-2 reset time

On SOCs with SMUv13.0.6, mode-2 reset takes a bit longer. Wait for 200ms before 
trying to restore config space after mode-2 reset.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 29c102fe650d..2b7a60b23d6b 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2307,8 +2307,8 @@ static int smu_v13_0_6_mode2_reset(struct smu_context 
*smu)
ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index,
   SMU_RESET_MODE_2);

-   /* This is similar to FLR, wait till max FLR timeout */
-   msleep(100);
+   /* Reset takes a bit longer, wait for 200ms. */
+   msleep(200);

dev_dbg(smu->adev->dev, "restore config space...\n");
/* Restore the config space saved during init */
--
2.25.1

RE: [PATCH] drm/amdgpu: Do a basic health check before reset

2024-03-13 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Wednesday, March 13, 2024 3:12 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH] drm/amdgpu: Do a basic health check before reset

Check if the device is present in the bus before trying to recover. It could be 
that device itself is lost from the bus in some hang situations.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24 ++
 1 file changed, 24 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 1e9454e6e4cb..b37113b79483 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5536,6 +5536,23 @@ static inline void 
amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)

 }

+static int amdgpu_device_health_check(struct list_head
+*device_list_handle) {
+   struct amdgpu_device *tmp_adev;
+   int ret = 0;
+   u32 status;
+
+   list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+   pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
+   if (PCI_POSSIBLE_ERROR(status)) {
+   dev_err(tmp_adev->dev, "device lost from bus!");
+   ret = -ENODEV;
+   }
+   }
+
+   return ret;
+}
+
 /**
  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
  *
@@ -5607,6 +5624,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
device_list_handle = &device_list;
}

+   if (!amdgpu_sriov_vf(adev)) {
+   r = amdgpu_device_health_check(device_list_handle);
+   if (r)
+   goto end_reset;
+   }
+
/* We need to lock reset domain only once both for XGMI and single 
device */
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
reset_list);
@@ -5772,6 +5795,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
reset_list);
amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

+end_reset:
if (hive) {
mutex_unlock(&hive->hive_lock);
amdgpu_put_xgmi_hive(hive);
--
2.25.1

RE: [PATCH 2/2] drm/amd/pm: Use metric table for pcie speed/width

2024-03-14 Thread Kamal, Asad

[AMD Official Use Only - General]

-Original Message-
From: Lazar, Lijo 
Sent: Friday, March 15, 2024 12:25 PM
To: Kamal, Asad ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Ma, Le ; Zhang, 
Morris ; Cheung, Donald ; Khatir, 
Sepehr ; Oliveira, Daniel ; 
Poag, Charis 
Subject: Re: [PATCH 2/2] drm/amd/pm: Use metric table for pcie speed/width



On 3/15/2024 11:11 AM, Asad Kamal wrote:
> Report pcie link speed/width using metric table in case of one vf & if
> pmfw support is available, else report directly from registers in case
> of pf. Skip reporting it for other cases.
>
> Signed-off-by: Asad Kamal 
> ---
>  .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c   | 14 +-
>  1 file changed, 13 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
> b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
> index 744c84f3029f..2a934864b5eb 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
> @@ -2167,6 +2167,7 @@ static ssize_t smu_v13_0_6_get_gpu_metrics(struct 
> smu_context *smu, void **table
>   (struct gpu_metrics_v1_5 *)smu_table->gpu_metrics_table;
>   struct amdgpu_device *adev = smu->adev;
>   int ret = 0, xcc_id, inst, i, j;
> + enum amdgpu_sriov_vf_mode mode;
>   MetricsTableX_t *metrics_x;
>   MetricsTableA_t *metrics_a;
>   u16 link_width_level;
> @@ -2229,7 +2230,17 @@ static ssize_t smu_v13_0_6_get_gpu_metrics(struct 
> smu_context *smu, void **table
>   gpu_metrics->gfxclk_lock_status = GET_METRIC_FIELD(GfxLockXCDMak) >>
> GET_INST(GC, 0);
>
>   if (!(adev->flags & AMD_IS_APU)) {
> - if (!amdgpu_sriov_vf(adev)) {
> + /*Check smu version, PCIE link speed and width will be reported 
> from pmfw metric
> +  * table for both pf & one vf for smu version 85.99.0 or higher 
> else report only
> +  * for pf from registers
> +  */
> + mode = amdgpu_virt_get_sriov_vf_mode(adev);
> + if (smu->smc_fw_version >= 0x556300 &&
> +         mode != SRIOV_VF_MODE_MULTI_VF) {

gpu_metrics is only available in one-VF mode, so there is no need to do this 
extra check there.

Thanks,
Lijo
[Kamal, Asad] Thank you, will send a v2 with changes

> + gpu_metrics->pcie_link_width = metrics_x->PCIeLinkWidth;
> + gpu_metrics->pcie_link_speed =
> + pcie_gen_to_speed(metrics_x->PCIeLinkSpeed);
> + } else if (!amdgpu_sriov_vf(adev)) {
>   link_width_level = 
> smu_v13_0_6_get_current_pcie_link_width_level(smu);
>   if (link_width_level > MAX_LINK_WIDTH)
>   link_width_level = 0;
> @@ -2239,6 +2250,7 @@ static ssize_t smu_v13_0_6_get_gpu_metrics(struct 
> smu_context *smu, void **table
>   gpu_metrics->pcie_link_speed =
>   smu_v13_0_6_get_current_pcie_link_speed(smu);
>   }
> +
>   gpu_metrics->pcie_bandwidth_acc =
>   SMUQ10_ROUND(metrics_x->PcieBandwidthAcc[0]);
>   gpu_metrics->pcie_bandwidth_inst =

RE: [PATCH] drm/amdgpu: Refine IB schedule error logging

2024-03-21 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Thursday, March 21, 2024 6:06 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Koenig, Christian ; 
Kamal, Asad 
Subject: [PATCH] drm/amdgpu: Refine IB schedule error logging

Downgrade to debug information when IBs are skipped. Also, use dev_* to 
identify the device.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 4b3000c21ef2..e4742b65032d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -304,12 +304,15 @@ static struct dma_fence *amdgpu_job_run(struct 
drm_sched_job *sched_job)
dma_fence_set_error(finished, -ECANCELED);

if (finished->error < 0) {
-   DRM_INFO("Skip scheduling IBs!\n");
+   dev_dbg(adev->dev, "Skip scheduling IBs in ring(%s)",
+   ring->name);
} else {
r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
   &fence);
if (r)
-   DRM_ERROR("Error scheduling IBs (%d)\n", r);
+   dev_err(adev->dev,
+   "Error scheduling IBs (%d) in ring(%s)", r,
+   ring->name);
}

job->job_run_counter++;
--
2.25.1

RE: [PATCH v2 2/4] drm/amd/pm: Add PMFW message and capability flags

2024-04-01 Thread Kamal, Asad

[AMD Official Use Only - General]

-Original Message-
From: amd-gfx  On Behalf Of Lijo Lazar
Sent: Thursday, March 28, 2024 8:06 AM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Wang, Yang(Kevin) 
Subject: [PATCH v2 2/4] drm/amd/pm: Add PMFW message and capability flags

Add flags to categorize messages and PMFW capabilities.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 3 ++-  
drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h  | 7 +++
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c| 2 +-
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h 
b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
index a870bdd49a4e..aa835df7ba1a 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
@@ -458,7 +458,7 @@ struct smu_umd_pstate_table {  struct cmn2asic_msg_mapping {
int valid_mapping;
int map_to;
-   int valid_in_vf;
+   uint32_t flags;
 };

[Kamal, Asad] Do we need to change the following macro to take flags rather
than valid_in_vf?
#define MSG_MAP(msg, index, valid_in_vf) \
[SMU_MSG_##msg] = {1, (index), (valid_in_vf)}

Thanks & Regards
Asad

 struct cmn2asic_mapping {
@@ -538,6 +538,7 @@ struct smu_context {
uint32_t smc_driver_if_version;
uint32_t smc_fw_if_version;
uint32_t smc_fw_version;
+   uint32_t smc_fw_caps;

bool uploading_custom_pp_table;
bool dc_controlled_by_gpio;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h 
b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
index af427cc7dbb8..c48214e3dc8e 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
@@ -445,4 +445,11 @@ enum smu_feature_mask {
SMU_FEATURE_COUNT,
 };

+/* Message category flags */
+#define SMU_MSG_VF_FLAG (1U << 0)
+#define SMU_MSG_RAS_PRI (1U << 1)
+
+/* Firmware capability flags */
+#define SMU_FW_CAP_RAS_PRI (1U << 0)
+
 #endif
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index b8dbd4e25348..3227e514e8ae 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -437,7 +437,7 @@ int smu_cmn_to_asic_specific_index(struct smu_context *smu,
return -EINVAL;

if (amdgpu_sriov_vf(smu->adev) &&
-   !msg_mapping.valid_in_vf)
+   !(msg_mapping.flags & SMU_MSG_VF_FLAG))
return -EACCES;

return msg_mapping.map_to;
--
2.25.1

RE: [PATCH v2 1/4] drm/amdgpu: Set fatal error detected flag earlier

2024-04-01 Thread Kamal, Asad

[AMD Official Use Only - General]

Series is
Reviewed-by: Asad Kamal 

Thanks & Regards
Asad


-Original Message-
From: amd-gfx  On Behalf Of Lijo Lazar
Sent: Thursday, March 28, 2024 8:06 AM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Wang, Yang(Kevin) 
Subject: [PATCH v2 1/4] drm/amdgpu: Set fatal error detected flag earlier

In case of fatal errors, set FED status when interrupt is received. Set the 
flag on other devices in the hive before RAS recovery work.

Signed-off-by: Lijo Lazar 
---
v2: Avoid accessing hive in interrupt handler as it may take mutex path (Kevin)

 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 41 +
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index b8c7d0bf8fb1..352ce16a0963 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2399,6 +2399,19 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device 
*adev,
return ret;
 }

+static void amdgpu_ras_set_fed_all(struct amdgpu_device *adev,
+  struct amdgpu_hive_info *hive, bool status) {
+   struct amdgpu_device *tmp_adev;
+
+   if (hive) {
+   list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+   amdgpu_ras_set_fed(tmp_adev, status);
+   } else {
+   amdgpu_ras_set_fed(adev, status);
+   }
+}
+
 static void amdgpu_ras_do_recovery(struct work_struct *work)  {
struct amdgpu_ras *ras =
@@ -2408,8 +2421,21 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
struct list_head device_list, *device_list_handle =  NULL;
struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

-   if (hive)
+   if (hive) {
atomic_set(&hive->ras_recovery, 1);
+
+   /* If any device which is part of the hive received RAS fatal
+* error interrupt, set fatal error status on all. This
+* condition will need a recovery, and flag will be cleared
+* as part of recovery.
+*/
+   list_for_each_entry(remote_adev, &hive->device_list,
+   gmc.xgmi.head)
+   if (amdgpu_ras_get_fed_status(remote_adev)) {
+   amdgpu_ras_set_fed_all(adev, hive, true);
+   break;
+   }
+   }
if (!ras->disable_ras_err_cnt_harvest) {

/* Build list of devices to query RAS related errors */ @@ 
-2454,18 +2480,6 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
ras->gpu_reset_flags &= 
~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
set_bit(AMDGPU_NEED_FULL_RESET, 
&reset_context.flags);

-   /* For any RAS error that needs a full reset to
-* recover, set the fatal error status
-*/
-   if (hive) {
-   list_for_each_entry(remote_adev,
-   &hive->device_list,
-   gmc.xgmi.head)
-   amdgpu_ras_set_fed(remote_adev,
-  true);
-   } else {
-   amdgpu_ras_set_fed(adev, true);
-   }
psp_fatal_error_recovery_quirk(&adev->psp);
}
}
@@ -3550,6 +3564,7 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware error"
  "(ERREVENT_ATHUB_INTERRUPT) detected!\n");

+   amdgpu_ras_set_fed(adev, true);
ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
amdgpu_ras_reset_gpu(adev);
}
--
2.25.1

RE: [PATCH] drm/amd/pm: Report uclk and sclk limit

2024-04-02 Thread Kamal, Asad

[AMD Official Use Only - General]

Please ignore this patch; I will send a fresh one.

-Original Message-
From: Kamal, Asad 
Sent: Tuesday, April 2, 2024 3:45 PM
To: amd-gfx@lists.freedesktop.org
Cc: Lazar, Lijo ; Zhang, Hawking ; 
Ma, Le ; Zhang, Morris ; Kamal, Asad 
; Cheung, Donald ; Khatir, Sepehr 
; Oliveira, Daniel ; Poag, 
Charis ; Liu, Shuzhou (Bill) 
Subject: [PATCH] drm/amd/pm: Report uclk and sclk limit

Report max set uclk and sclk for smu_v_13_0_6

Signed-off-by: Asad Kamal 
---
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c   | 14 ++
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 575292314f57..f81096bfbf2c 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -1010,8 +1010,11 @@ static int smu_v13_0_6_print_clk_levels(struct 
smu_context *smu,

switch (type) {
case SMU_OD_SCLK:
-   size += sysfs_emit_at(buf, size, "%s:\n", "GFXCLK");
-   fallthrough;
+   size += sysfs_emit_at(buf, size, "%s:\n", "OD_SCLK");
+   size += sysfs_emit_at(buf, size, "0: %uMhz\n1: %uMhz\n",
+ pstate_table->gfxclk_pstate.curr.min,
+ pstate_table->gfxclk_pstate.curr.max);
+   break;
case SMU_SCLK:
ret = smu_v13_0_6_get_current_clk_freq_by_table(smu, SMU_GFXCLK,
&now);
@@ -1052,8 +1055,11 @@ static int smu_v13_0_6_print_clk_levels(struct 
smu_context *smu,
break;

case SMU_OD_MCLK:
-   size += sysfs_emit_at(buf, size, "%s:\n", "MCLK");
-   fallthrough;
+   size += sysfs_emit_at(buf, size, "%s:\n", "OD_MCLK");
+   size += sysfs_emit_at(buf, size, "0: %uMhz\n1: %uMhz\n",
+ pstate_table->uclk_pstate.curr.min,
+ pstate_table->uclk_pstate.curr.max);
+   break;
case SMU_MCLK:
ret = smu_v13_0_6_get_current_clk_freq_by_table(smu, SMU_UCLK,
&now);
--
2.42.0

RE: [PATCH v2] drm/amd/pm: Ignore initial value in smu response register

2024-07-09 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: amd-gfx  On Behalf Of Lazar, Lijo
Sent: Tuesday, July 9, 2024 3:03 PM
To: Slivka, Danijel ; amd-gfx@lists.freedesktop.org
Cc: Wang, Yang(Kevin) ; Feng, Kenneth 

Subject: Re: [PATCH v2] drm/amd/pm: Ignore initial value in smu response 
register



On 7/8/2024 7:01 PM, Danijel Slivka wrote:
> Why:
> If the reg mmMP1_SMN_C2PMSG_90 is being written to during amdgpu
> driver load or driver unload, subsequent amdgpu driver load will fail
> at smu_hw_init. The default of mmMP1_SMN_C2PMSG_90 register at a clean
> environment is 0x1 and if value differs from expected, amdgpu driver
> load will fail.
>
> How to fix:
> Ignore the initial value in smu response register before the first smu
> message is sent; if smc is in SMU_FW_INIT state, just proceed further to
> send the message. If the register holds an unexpected value after an smu
> message was sent, set smc_state to SMU_FW_HANG state and no further
> smu messages will be sent.
>
> v2:
> Set SMU_FW_INIT state at the start of smu hw_init/resume.
> Check smc_fw_state before sending smu message if in hang state skip
> sending message.
> Set SMU_FW_HANG only in case unexpected value is detected
>
> Signed-off-by: Danijel Slivka 

Patch looks good to me

Reviewed-by: Lijo Lazar 

Copying Kenneth/Kevin as well.

Thanks,
Lijo

> ---
>  drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c |  2 ++
>  drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |  7 
>  drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c| 34 ---
>  3 files changed, 38 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> index d79bdb1e8cdf..fb8643d25d1b 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> @@ -1755,6 +1755,8 @@ static int smu_start_smc_engine(struct smu_context *smu)
>   struct amdgpu_device *adev = smu->adev;
>   int ret = 0;
>
> + smu->smc_fw_state = SMU_FW_INIT;
> +
>   if (adev->firmware.load_type != AMDGPU_FW_LOAD_PSP) {
>   if (amdgpu_ip_version(adev, MP1_HWIP, 0) < IP_VERSION(11, 0, 
> 0)) {
>   if (smu->ppt_funcs->load_microcode) { diff --git
> a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> index a34c802f52be..b44a185d07e8 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> @@ -495,6 +495,12 @@ struct stb_context {
>   spinlock_t lock;
>  };
>
> +enum smu_fw_status {
> + SMU_FW_INIT = 0,
> + SMU_FW_RUNTIME,
> + SMU_FW_HANG,
> +};
> +
>  #define WORKLOAD_POLICY_MAX 7
>
>  /*
> @@ -562,6 +568,7 @@ struct smu_context {
>   uint32_t smc_fw_if_version;
>   uint32_t smc_fw_version;
>   uint32_t smc_fw_caps;
> + uint8_t smc_fw_state;
>
>   bool uploading_custom_pp_table;
>   bool dc_controlled_by_gpio;
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> index 5592fd825aa3..d7c983a1f3f5 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> @@ -315,11 +315,20 @@ int smu_cmn_send_msg_without_waiting(struct smu_context 
> *smu,
>   if (adev->no_hw_access)
>   return 0;
>
> - reg = __smu_cmn_poll_stat(smu);
> - res = __smu_cmn_reg2errno(smu, reg);
> - if (reg == SMU_RESP_NONE ||
> - res == -EREMOTEIO)
> + if (smu->smc_fw_state == SMU_FW_HANG) {
> + dev_err(adev->dev, "SMU is in hanged state, failed to send smu
> +message!\n");
>   goto Out;
> + }
> +
> + if (smu->smc_fw_state == SMU_FW_INIT) {
> + smu->smc_fw_state = SMU_FW_RUNTIME;
> + } else {
> + reg = __smu_cmn_poll_stat(smu);
> + res = __smu_cmn_reg2errno(smu, reg);
> + if (reg == SMU_RESP_NONE || res == -EREMOTEIO)
> + goto Out;
> + }
> +
>   __smu_cmn_send_msg(smu, msg_index, param);
>   res = 0;
>  Out:
> @@ -350,6 +359,9 @@ int smu_cmn_wait_for_response(struct smu_context *smu)
>   reg = __smu_cmn_poll_stat(smu);
>   res = __smu_cmn_reg2errno(smu, reg);
>
> + if (res == -EREMOTEIO)
> + smu->smc_fw_state = SMU_FW_HANG;
> +
>   if (unlikely(smu->adev->pm.smu_debug_mask & SMU_DEBUG_HALT_ON_ERROR) &&
>   res && (res != -ETIME)) {
>   amdgpu_device_halt(smu->adev);
> @@ -418,6 +430,15 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context 
> *smu,
>   goto Out;
>   }
>
> + if (smu->smc_fw_state == SMU_FW_HANG) {
> + dev_err(adev->dev, "SMU is in hanged state, failed to send smu 
> message!\n");
> + goto Out;
> + } else if (smu->smc_fw_state == SMU_FW_INIT) {
> + /* Ignore initial smu response

RE: [PATCH] drm/amd/pm: Ignore throttle events on SMUv13.0.6

2024-07-25 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Wednesday, July 24, 2024 4:26 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH] drm/amd/pm: Ignore throttle events on SMUv13.0.6

Spurious events are seen, temporarily ignore the events altogether.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 1d76916ed056..b2059663883a 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -1739,6 +1739,8 @@ static int smu_v13_0_6_irq_process(struct amdgpu_device 
*adev,
 */
switch (ctxid) {
case IH_INTERRUPT_CONTEXT_ID_THERMAL_THROTTLING:
+   /* Ignore throttle events temporarily as some 
are spurious.*/
+   return 0;
/*
 * Increment the throttle interrupt counter
 */
--
2.25.1

RE: [PATCH] drm/amd/pm: Restore config space after reset

2024-04-17 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Wednesday, April 17, 2024 5:32 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH] drm/amd/pm: Restore config space after reset

During mode-2 reset, pci config space registers are affected at device side. 
However, certain platforms have switches which assign virtual BAR addresses and 
return the same even after the device is reset. This affects pci_restore_state() 
as it doesn't issue another config write if the value read is the same as the 
saved value.

Add a workaround to write saved config space values from driver side.
Presently, these switches are in platforms with SMU v13.0.6 SOCs, hence 
restrict the workaround only to those.

Signed-off-by: Lijo Lazar 
---
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 25 +++
 1 file changed, 25 insertions(+)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 59e5c6256ea2..ef17c8c1bf39 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2315,6 +2315,17 @@ static ssize_t smu_v13_0_6_get_gpu_metrics(struct 
smu_context *smu, void **table
return sizeof(*gpu_metrics);
 }

+static void smu_v13_0_6_restore_pci_config(struct smu_context *smu) {
+   struct amdgpu_device *adev = smu->adev;
+   int i;
+
+   for (i = 0; i < 16; i++)
+   pci_write_config_dword(adev->pdev, i * 4,
+  adev->pdev->saved_config_space[i]);
+   pci_restore_msi_state(adev->pdev);
+}
+
 static int smu_v13_0_6_mode2_reset(struct smu_context *smu)  {
int ret = 0, index;
@@ -2336,6 +2347,20 @@ static int smu_v13_0_6_mode2_reset(struct smu_context 
*smu)
/* Restore the config space saved during init */
amdgpu_device_load_pci_state(adev->pdev);

+   /* Certain platforms have switches which assign virtual BAR values to
+* devices. OS uses the virtual BAR values and device behind the switch
+* is assigned another BAR value. When device's config space registers
+* are queried, switch returns the virtual BAR values. When mode-2 reset
+* is performed, switch is unaware of it, and will continue to return
+* the same virtual values to the OS. This affects
+* pci_restore_config_space() API as it doesn't write the value saved if
+* the current value read from config space is the same as what is
+* saved. As a workaround, make sure the config space is restored
+* always.
+*/
+   if (!(adev->flags & AMD_IS_APU))
+   smu_v13_0_6_restore_pci_config(smu);
+
dev_dbg(smu->adev->dev, "wait for reset ack\n");
do {
ret = smu_cmn_wait_for_response(smu);
--
2.25.1

RE: [PATCH] drm/amd/amdxcp: Use unique name for partition dev

2024-04-30 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: amd-gfx  On Behalf Of Lijo Lazar
Sent: Tuesday, April 30, 2024 5:06 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Koenig, Christian ; Zhu, 
James 
Subject: [PATCH] drm/amd/amdxcp: Use unique name for partition dev

amdxcp is a platform driver for creating partition devices. libdrm library 
identifies a platform device based on 'OF_FULLNAME' or 'MODALIAS'. If two or 
more devices have the same platform name, drm library only picks the first 
device. Platform driver core uses name of the device to populate 'MODALIAS'. 
When 'amdxcp' is used as the base name, only first partition device gets 
identified. Assign unique name so that drm library identifies partition devices 
separately.

amdxcp doesn't support probe of partitions, so it doesn't bother about modaliases.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.c 
b/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.c
index 90ddd8371176..b4131053b31b 100644
--- a/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.c
+++ b/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.c
@@ -50,12 +50,14 @@ int amdgpu_xcp_drm_dev_alloc(struct drm_device **ddev)  {
struct platform_device *pdev;
struct xcp_device *pxcp_dev;
+   char dev_name[20];
int ret;

if (pdev_num >= MAX_XCP_PLATFORM_DEVICE)
return -ENODEV;

-   pdev = platform_device_register_simple("amdgpu_xcp", pdev_num, NULL, 0);
+   snprintf(dev_name, sizeof(dev_name), "amdgpu_xcp_%d", pdev_num);
+   pdev = platform_device_register_simple(dev_name, -1, NULL, 0);
if (IS_ERR(pdev))
return PTR_ERR(pdev);

--
2.25.1

RE: [PATCH] drm/amd/pm: Fix aldebaran pcie speed reporting

2024-05-09 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: amd-gfx  On Behalf Of Lijo Lazar
Sent: Thursday, May 9, 2024 2:29 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Wang, Yang(Kevin) 
Subject: [PATCH] drm/amd/pm: Fix aldebaran pcie speed reporting

Fix the field definitions for LC_CURRENT_DATA_RATE.

Signed-off-by: Lijo Lazar 

Fixes: c05d1c401572 ("drm/amd/swsmu: add aldebaran smu13 ip support")
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index 0fd25b72a40c..9c0445fa9f9b 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -79,8 +79,8 @@ MODULE_FIRMWARE("amdgpu/smu_13_0_10.bin");
 #define PCIE_LC_LINK_WIDTH_CNTL__LC_LINK_WIDTH_RD_MASK 0x0070L  #define 
PCIE_LC_LINK_WIDTH_CNTL__LC_LINK_WIDTH_RD__SHIFT 0x4
 #define smnPCIE_LC_SPEED_CNTL  0x11140290
-#define PCIE_LC_SPEED_CNTL__LC_CURRENT_DATA_RATE_MASK 0xC000 -#define 
PCIE_LC_SPEED_CNTL__LC_CURRENT_DATA_RATE__SHIFT 0xE
+#define PCIE_LC_SPEED_CNTL__LC_CURRENT_DATA_RATE_MASK 0xE0 #define
+PCIE_LC_SPEED_CNTL__LC_CURRENT_DATA_RATE__SHIFT 0x5

 #define ENABLE_IMU_ARG_GFXOFF_ENABLE   1

--
2.25.1

RE: [PATCH v4 01/10] drm/amd/pm: Add support for DPM policies

2024-05-15 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Series is
Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Wang, Yang(Kevin) 
Sent: Wednesday, May 15, 2024 1:10 PM
To: Lazar, Lijo ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad ; Ma, Le 

Subject: RE: [PATCH v4 01/10] drm/amd/pm: Add support for DPM policies

[AMD Official Use Only - AMD Internal Distribution Only]

-Original Message-
From: amd-gfx  On Behalf Of Lijo Lazar
Sent: Tuesday, May 14, 2024 7:06 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad ; Ma, Le 

Subject: [PATCH v4 01/10] drm/amd/pm: Add support for DPM policies

Add support to set/get information about different DPM policies. The support is 
only available on SOCs which use swsmu architecture.

A DPM policy type may be defined with different levels. For example, a policy 
may be defined to select Pstate preference and then later a pstate preference 
may be chosen.

Signed-off-by: Lijo Lazar 
Reviewed-by: Hawking Zhang 
---
v2: Add NULL checks before accessing smu_dpm_policy_ctxt
v3: Rebase to add device_attr_id__pm_policy
v4: Use macro to define policy type for consistency.

 .../gpu/drm/amd/include/kgd_pp_interface.h| 16 +++
 drivers/gpu/drm/amd/pm/amdgpu_dpm.c   | 29 ++
 drivers/gpu/drm/amd/pm/amdgpu_pm.c| 94 ++
 drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h   |  4 +
 drivers/gpu/drm/amd/pm/inc/amdgpu_pm.h|  1 +
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 98 +++
 drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 31 ++
 7 files changed, 273 insertions(+)

diff --git a/drivers/gpu/drm/amd/include/kgd_pp_interface.h 
b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
index 805c9d37a2b4..8ed9aa9a990d 100644
--- a/drivers/gpu/drm/amd/include/kgd_pp_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
@@ -273,6 +273,22 @@ enum pp_xgmi_plpd_mode {
XGMI_PLPD_COUNT,
 };

+enum pp_pm_policy {
+   PP_PM_POLICY_NONE = -1,
+   PP_PM_POLICY_SOC_PSTATE = 0,
+   PP_PM_POLICY_NUM,
+};
+
+enum pp_policy_soc_pstate {
+   SOC_PSTATE_DEFAULT = 0,
+   SOC_PSTATE_0,
+   SOC_PSTATE_1,
+   SOC_PSTATE_2,
+   SOC_PSTAT_COUNT,
+};
+
+#define PP_POLICY_MAX_LEVELS 5
+
 #define PP_GROUP_MASK0xF000
 #define PP_GROUP_SHIFT   28

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
index eee919577b44..b443906484e7 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
@@ -411,6 +411,35 @@ int amdgpu_dpm_set_xgmi_plpd_mode(struct amdgpu_device 
*adev, int mode)
return ret;
 }

+ssize_t amdgpu_dpm_get_pm_policy_info(struct amdgpu_device *adev, char
+*buf) {
+   struct smu_context *smu = adev->powerplay.pp_handle;
+   int ret = -EOPNOTSUPP;
+
+   if (is_support_sw_smu(adev)) {
+   mutex_lock(&adev->pm.mutex);
+   ret = smu_get_pm_policy_info(smu, buf);
+   mutex_unlock(&adev->pm.mutex);
+   }
+
+   return ret;
+}
+
+int amdgpu_dpm_set_pm_policy(struct amdgpu_device *adev, int policy_type,
+int policy_level) {
+   struct smu_context *smu = adev->powerplay.pp_handle;
+   int ret = -EOPNOTSUPP;
+
+   if (is_support_sw_smu(adev)) {
+   mutex_lock(&adev->pm.mutex);
+   ret = smu_set_pm_policy(smu, policy_type, policy_level);
+   mutex_unlock(&adev->pm.mutex);
+   }
+
+   return ret;
+}
+
 int amdgpu_dpm_enable_mgpu_fan_boost(struct amdgpu_device *adev)  {
void *pp_handle = adev->powerplay.pp_handle; diff --git 
a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index 110f2fc31754..6dab0b085239 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -2278,6 +2278,98 @@ static ssize_t amdgpu_set_xgmi_plpd_policy(struct device 
*dev,
return count;
 }

+static ssize_t amdgpu_get_pm_policy(struct device *dev,
+   struct device_attribute *attr, char *buf) {
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+
+   if (amdgpu_in_reset(adev))
+   return -EPERM;
+   if (adev->in_suspend && !adev->in_runpm)
+   return -EPERM;
+
+   return amdgpu_dpm_get_pm_policy_info(adev, buf); }
+
+#define STR_SOC_PSTATE_POLICY "soc_pstate"
[Kevin]:

Better to move above macro to top of file.

Best Regards,
Kevin
+
+static ssize_t amdgpu_set_pm_policy(struct device *dev,
+   struct device_attribute *attr,
+   const char *buf, size_t count) {
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev

RE: [PATCH v5 10/10] Documentation/amdgpu: Add PM policy documentation

2024-05-16 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]


Series is

Reviewed-by: Asad Kamal mailto:asad.ka...@amd.com>>



Thanks & Regards

Asad

From: Deucher, Alexander 
Sent: Friday, May 17, 2024 12:57 AM
To: Lazar, Lijo ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Kamal, Asad ; 
Ma, Le 
Subject: Re: [PATCH v5 10/10] Documentation/amdgpu: Add PM policy documentation


[AMD Official Use Only - AMD Internal Distribution Only]

I didn't have time to go through every patch in detail, but overall it looks 
good to me.  The series is:
Acked-by: Alex Deucher 
mailto:alexander.deuc...@amd.com>>

From: Lazar, Lijo mailto:lijo.la...@amd.com>>
Sent: Thursday, May 16, 2024 8:43 AM
To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> 
mailto:amd-gfx@lists.freedesktop.org>>
Cc: Zhang, Hawking mailto:hawking.zh...@amd.com>>; 
Deucher, Alexander 
mailto:alexander.deuc...@amd.com>>; Kamal, Asad 
mailto:asad.ka...@amd.com>>; Ma, Le 
mailto:le...@amd.com>>
Subject: [PATCH v5 10/10] Documentation/amdgpu: Add PM policy documentation

Add documentation about the newly added pm_policy node in sysfs.

Signed-off-by: Lijo Lazar mailto:lijo.la...@amd.com>>
---

v5: Update documentation to reflect pm_policy nodes and sub nodes for each
policy type

 Documentation/gpu/amdgpu/thermal.rst |  6 
 drivers/gpu/drm/amd/pm/amdgpu_pm.c   | 53 
 2 files changed, 59 insertions(+)

diff --git a/Documentation/gpu/amdgpu/thermal.rst 
b/Documentation/gpu/amdgpu/thermal.rst
index 2f6166f81e6a..6d942b5c58f0 100644
--- a/Documentation/gpu/amdgpu/thermal.rst
+++ b/Documentation/gpu/amdgpu/thermal.rst
@@ -49,6 +49,12 @@ pp_power_profile_mode
 .. kernel-doc:: drivers/gpu/drm/amd/pm/amdgpu_pm.c
:doc: pp_power_profile_mode

+pm_policy
+-
+
+.. kernel-doc:: drivers/gpu/drm/amd/pm/amdgpu_pm.c
+   :doc: pm_policy
+
 \*_busy_percent
 ---

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index 5ff7783dfc43..3e5ffb83f398 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -2220,6 +2220,59 @@ struct amdgpu_pm_policy_attr {
 enum pp_pm_policy id;
 };

+/**
+ * DOC: pm_policy
+ *
+ * Certain SOCs can support different power policies to optimize application
+ * performance. However, this policy is provided only at SOC level and not at a
+ * per-process level. This is useful especially when entire SOC is utilized for
+ * dedicated workload.
+ *
+ * The amdgpu driver provides a sysfs API for selecting the policy. Presently,
+ * only two types of policies are supported through this interface.
+ *
+ *  Pstate Policy Selection - This is to select different Pstate profiles which
+ *  decides clock/throttling preferences.
+ *
+ *  XGMI PLPD Policy Selection - When multiple devices are connected over XGMI,
+ *  this helps to select policy to be applied for per link power down.
+ *
+ * The list of available policies and policy levels vary between SOCs. They can
+ * be viewed under pm_policy node directory. If SOC doesn't support any policy,
+ * this node won't be available. The different policies supported will be
+ * available as separate nodes under pm_policy.
+ *
+ * cat /sys/bus/pci/devices/.../pm_policy/<policy_type>
+ *
+ * Reading the policy file shows the different levels supported. The level which
+ * is applied presently is denoted by * (asterisk). E.g.,
+ *
+ * .. code-block:: console
+ *
+ * cat /sys/bus/pci/devices/.../pm_policy/soc_pstate
+ * 0 : soc_pstate_default
+ * 1 : soc_pstate_0
+ * 2 : soc_pstate_1*
+ * 3 : soc_pstate_2
+ *
+ * cat /sys/bus/pci/devices/.../pm_policy/xgmi_plpd
+ * 0 : plpd_disallow
+ * 1 : plpd_default
+ * 2 : plpd_optimized*
+ *
+ * To apply a specific policy
+ *
+ * "echo  <level> > /sys/bus/pci/devices/.../pm_policy/<policy_type>"
+ *
+ * For the levels listed in the example above, to select "plpd_optimized" for
+ * XGMI and "soc_pstate_2" for soc pstate policy -
+ *
+ * .. code-block:: console
+ *
+ * echo "2" > /sys/bus/pci/devices/.../pm_policy/xgmi_plpd
+ * echo "3" > /sys/bus/pci/devices/.../pm_policy/soc_pstate
+ *
+ */
 static ssize_t amdgpu_get_pm_policy_attr(struct device *dev,
  struct device_attribute *attr,
  char *buf)
--
2.25.1

RE: [PATCH] drm/amdgpu: Remove duplicate check for *is_queue_unmap in sdma_v7_0_ring_set_wptr

2024-05-16 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: amd-gfx  On Behalf Of Gao, Likun
Sent: Friday, May 17, 2024 9:32 AM
To: SHANMUGAM, SRINIVASAN ; Koenig, Christian 
; Deucher, Alexander 
Cc: amd-gfx@lists.freedesktop.org; Zhang, Hawking ; Dan 
Carpenter 
Subject: RE: [PATCH] drm/amdgpu: Remove duplicate check for *is_queue_unmap in 
sdma_v7_0_ring_set_wptr

[AMD Official Use Only - AMD Internal Distribution Only]

[AMD Official Use Only - AMD Internal Distribution Only]

This patch was
Reviewed-by: Likun Gao .

Regards,
Likun

-Original Message-
From: SHANMUGAM, SRINIVASAN 
Sent: Friday, May 17, 2024 11:33 AM
To: Koenig, Christian ; Deucher, Alexander 

Cc: amd-gfx@lists.freedesktop.org; SHANMUGAM, SRINIVASAN 
; Gao, Likun ; Zhang, Hawking 
; Dan Carpenter 
Subject: [PATCH] drm/amdgpu: Remove duplicate check for *is_queue_unmap in 
sdma_v7_0_ring_set_wptr

This commit removes a duplicate check for *is_queue_unmap in the 
sdma_v7_0_ring_set_wptr function. The check at line 171 was considered dead 
code because at this point in the code, we already know that *is_queue_unmap is 
false due to the check at line 161.

Removing this unnecessary check improves the readability of the code.

Fixes the below:
drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c:171 sdma_v7_0_ring_set_wptr()
warn: duplicate check '*is_queue_unmap' (previous on line 161)

drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
140 static void sdma_v7_0_ring_set_wptr(struct amdgpu_ring *ring)
141 {
142 struct amdgpu_device *adev = ring->adev;
143 uint32_t *wptr_saved;
144 uint32_t *is_queue_unmap;
145 uint64_t aggregated_db_index;
146 uint32_t mqd_size = adev->mqds[AMDGPU_HW_IP_DMA].mqd_size;
147
148 DRM_DEBUG("Setting write pointer\n");
149
150 if (ring->is_mes_queue) {
151 wptr_saved = (uint32_t *)(ring->mqd_ptr + mqd_size);
152 is_queue_unmap = (uint32_t *)(ring->mqd_ptr + mqd_size +
 Set here

153   sizeof(uint32_t));
154 aggregated_db_index =
155 amdgpu_mes_get_aggregated_doorbell_index(adev,
156  ring->hw_prio);
157
158 atomic64_set((atomic64_t *)ring->wptr_cpu_addr,
159  ring->wptr << 2);
160 *wptr_saved = ring->wptr << 2;
161 if (*is_queue_unmap) {
^^^ Checked here

162 WDOORBELL64(aggregated_db_index, ring->wptr << 
2);
163 DRM_DEBUG("calling WDOORBELL64(0x%08x, 
0x%016llx)\n",
164 ring->doorbell_index, 
ring->wptr << 2);
165 WDOORBELL64(ring->doorbell_index, ring->wptr << 
2);
166 } else {
167 DRM_DEBUG("calling WDOORBELL64(0x%08x, 
0x%016llx)\n",
168 ring->doorbell_index, 
ring->wptr << 2);
169 WDOORBELL64(ring->doorbell_index, ring->wptr << 
2);
170
--> 171 if (*is_queue_unmap)
^^^ This is dead code.  We know 
it's false.

172 WDOORBELL64(aggregated_db_index,
173 ring->wptr << 2);
174 }
175 } else {
176 if (ring->use_doorbell) {
177 DRM_DEBUG("Using doorbell -- "
178   "wptr_offs == 0x%08x "

Fixes: 6d9c711786e6 ("drm/amdgpu: Add sdma v7_0 ip block support (v7)")
Cc: Likun Gao 
Cc: Hawking Zhang 
Cc: Christian König 
Cc: Alex Deucher 
Reported-by: Dan Carpenter 
Signed-off-by: Srinivasan Shanmugam 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
index 4a5252e08883..ab1dea77be6e 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
@@ -167,10 +167,6 @@ static void sdma_v7_0_ring_set_wptr(struct amdgpu_ring 
*ring)
DRM_DEBUG("calling WDOORBELL64(0x%08x, 0x%016llx)\n",
ring->doorbell_index, ring->wptr << 2);
WDOORBELL64(ring->doorbell_index, ring->wptr << 2);
-
-   if (*is_queue_unmap)
-   WDOORBELL64(aggregated_db_index,
-   ring->wptr << 2);
}
} else {
if (ring->use_doorbell) {
--

RE: [PATCH] drm/amdgpu: Skip coredump during resets for debug

2024-05-31 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: amd-gfx  On Behalf Of Lijo Lazar
Sent: Friday, May 31, 2024 6:04 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Koenig, Christian ; 
Khatri, Sunil 
Subject: [PATCH] drm/amdgpu: Skip coredump during resets for debug

Skip scheduling coredump when gpu reset is intentionally triggered through 
debugfs.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 10832b470448..1a9fda1d20fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -981,6 +981,7 @@ static void amdgpu_debugfs_reset_work(struct work_struct 
*work)
reset_context.method = AMD_RESET_METHOD_NONE;
reset_context.reset_req_dev = adev;
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+   set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);

amdgpu_device_gpu_recover(adev, NULL, &reset_context);  }
--
2.25.1

RE: [PATCH] drm/amdgpu: Skip execution of pending reset jobs

2023-11-10 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: amd-gfx  On Behalf Of Lazar, Lijo
Sent: Friday, November 10, 2023 4:19 PM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Zhang, Hawking 

Subject: Re: [PATCH] drm/amdgpu: Skip execution of pending reset jobs



On 11/9/2023 1:08 PM, Lijo Lazar wrote:
> cancel_work is not backported to all custom kernels. Add a workaround
> to skip execution of already queued recovery jobs, if the device is
> already reset.
>
> Signed-off-by: Lijo Lazar 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  5 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  |  9 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 16 
>   3 files changed, 30 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index bebc73c6822c..c66524e2a56a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5411,6 +5411,8 @@ static inline void 
> amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
>   {
>   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>
> + amdgpu_reset_domain_clear_pending(adev->reset_domain);
> +
>   #if defined(CONFIG_DEBUG_FS)
>   if (!amdgpu_sriov_vf(adev))
>   cancel_work(&adev->reset_work);
> @@ -5452,6 +5454,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
> *adev,
>   bool audio_suspended = false;
>   bool gpu_reset_for_dev_remove = false;
>
> + if (amdgpu_reset_domain_in_drain_mode(adev->reset_domain))
> + return 0;
> +
>   gpu_reset_for_dev_remove =
>   test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, 
> &reset_context->flags) &&
>   test_bit(AMDGPU_NEED_FULL_RESET, 
> &reset_context->flags); diff
> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index 4baa300121d8..3ece7267d6ea 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -120,6 +120,14 @@ void amdgpu_reset_destroy_reset_domain(struct kref *ref)
>   kvfree(reset_domain);
>   }
>
> +static void amdgpu_reset_domain_cancel_all_work(struct work_struct
> +*work) {
> + struct amdgpu_reset_domain *reset_domain =
> + container_of(work, struct amdgpu_reset_domain, clear);
> +
> + reset_domain->drain = false;
> +}
> +
>   struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum 
> amdgpu_reset_domain_type type,
>char *wq_name)
>   {
> @@ -142,6 +150,7 @@ struct amdgpu_reset_domain
> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>
>   }
>
> + INIT_WORK(&reset_domain->clear,
> +amdgpu_reset_domain_cancel_all_work);
>   atomic_set(&reset_domain->in_gpu_reset, 0);
>   atomic_set(&reset_domain->reset_res, 0);
>   init_rwsem(&reset_domain->sem);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> index b0335a1c5e90..70059eea7e2f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> @@ -87,6 +87,8 @@ struct amdgpu_reset_domain {
>   struct rw_semaphore sem;
>   atomic_t in_gpu_reset;
>   atomic_t reset_res;
> + struct work_struct clear;
> + bool drain;
>   };
>
>   #ifdef CONFIG_DEV_COREDUMP
> @@ -137,6 +139,20 @@ static inline bool amdgpu_reset_domain_schedule(struct 
> amdgpu_reset_domain *doma
>   return queue_work(domain->wq, work);
>   }
>
> +static inline void amdgpu_reset_domain_clear_pending(struct
> +amdgpu_reset_domain *domain) {
> + domain->drain = true;
> + /* queue one more work to the domain queue. Till this work is finished,
> +  * domain is in drain mode.
> +  */
> + queue_work(domain->wq, &domain->clear); }
> +
> +static inline bool amdgpu_reset_domain_in_drain_mode(struct
> +amdgpu_reset_domain *domain) {
> + return domain->drain;
> +}
> +
>   void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain
> *reset_domain);
>
>   void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain
> *reset_domain);

RE: [PATCH] drm/amd/pm: Don't send unload message for reset

2023-11-14 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal mailto:asad.ka...@amd.com>>

Thanks & Regards
Asad

From: Wang, Yang(Kevin) 
Sent: Wednesday, November 15, 2023 8:42 AM
To: Lazar, Lijo ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad ; Ma, Le 

Subject: Re: [PATCH] drm/amd/pm: Don't send unload message for reset


[AMD Official Use Only - General]

Reviewed-by: Yang Wang mailto:kevinyang.w...@amd.com>>

Best Regards,
Kevin

From: Lazar, Lijo mailto:lijo.la...@amd.com>>
Sent: Wednesday, November 15, 2023 11:04
To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> 
mailto:amd-gfx@lists.freedesktop.org>>
Cc: Zhang, Hawking mailto:hawking.zh...@amd.com>>; 
Deucher, Alexander 
mailto:alexander.deuc...@amd.com>>; Kamal, Asad 
mailto:asad.ka...@amd.com>>; Ma, Le 
mailto:le...@amd.com>>; Wang, Yang(Kevin) 
mailto:kevinyang.w...@amd.com>>
Subject: [PATCH] drm/amd/pm: Don't send unload message for reset

No need to notify about unload during reset. Also remove the FW version
check.

Signed-off-by: Lijo Lazar mailto:lijo.la...@amd.com>>
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index b51cd9e50e64..d431553ad8b8 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -1500,7 +1500,7 @@ static int smu_v13_0_6_register_irq_handler(struct 
smu_context *smu)

 static int smu_v13_0_6_notify_unload(struct smu_context *smu)
 {
-   if (smu->smc_fw_version <= 0x553500)
+   if (amdgpu_in_reset(smu->adev))
 return 0;

 dev_dbg(smu->adev->dev, "Notify PMFW about driver unload");
--
2.25.1

RE: [PATCH] drm/amdgpu: Restrict extended wait to PSP v13.0.6

2023-11-29 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Wednesday, November 29, 2023 6:07 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad ; Limonciello, 
Mario 
Subject: [PATCH] drm/amdgpu: Restrict extended wait to PSP v13.0.6

Only PSPv13.0.6 SOCs take a longer time to reach steady state. Other
PSPv13 based SOCs don't need extended wait. Also, reduce PSPv13.0.6 wait time.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index 32048b805200..d335d1d2e93e 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -60,7 +60,7 @@ MODULE_FIRMWARE("amdgpu/psp_14_0_0_ta.bin");
 #define GFX_CMD_USB_PD_USE_LFB 0x480

 /* Retry times for vmbx ready wait */
-#define PSP_VMBX_POLLING_LIMIT 2
+#define PSP_VMBX_POLLING_LIMIT 3000

 /* VBIOS gfl defines */
 #define MBOX_READY_MASK 0x8000
@@ -161,14 +161,18 @@ static int psp_v13_0_wait_for_vmbx_ready(struct 
psp_context *psp)  static int psp_v13_0_wait_for_bootloader(struct psp_context 
*psp)  {
struct amdgpu_device *adev = psp->adev;
-   int retry_loop, ret;
+   int retry_loop, retry_cnt, ret;

+   retry_cnt =
+   (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) ?
+   PSP_VMBX_POLLING_LIMIT :
+   10;
/* Wait for bootloader to signify that it is ready having bit 31 of
 * C2PMSG_35 set to 1. All other bits are expected to be cleared.
 * If there is an error in processing command, bits[7:0] will be set.
 * This is applicable for PSP v13.0.6 and newer.
 */
-   for (retry_loop = 0; retry_loop < PSP_VMBX_POLLING_LIMIT; retry_loop++) 
{
+   for (retry_loop = 0; retry_loop < retry_cnt; retry_loop++) {
ret = psp_wait_for(
psp, SOC15_REG_OFFSET(MP0, 0, regMP0_SMN_C2PMSG_35),
0x8000, 0x, false);
--
2.25.1

RE: [PATCH v4] drm/amdgpu : Add register read/write debugfs support for AID's

2023-12-19 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: amd-gfx  On Behalf Of Mangesh Gadre
Sent: Wednesday, December 20, 2023 10:43 AM
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking ; 
Lazar, Lijo 
Cc: Gadre, Mangesh ; Koenig, Christian 

Subject: [PATCH v4] drm/amdgpu : Add register read/write debugfs support for 
AID's

SMN address is larger than 32 bits for registers on different AIDs. Update the 
existing interface to support access to such registers.

Signed-off-by: Mangesh Gadre 
Reviewed-by: Christian König 
---
v2 : Adding hardware family check for creating
 debugfs interface for PCIe register access
v3 : Instead of creating new debugfs interface,now using
 existing interface with address range check for
 call to appropriate interface (Lijo)
v4 : Using available helper instead of explicit right
 shift operations (Christian)


 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 96d634bfa448..391af8060704 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -559,7 +559,11 @@ static ssize_t amdgpu_debugfs_regs_pcie_read(struct file 
*f, char __user *buf,
while (size) {
uint32_t value;

-   value = RREG32_PCIE(*pos);
+   if (upper_32_bits(*pos))
+   value = RREG32_PCIE_EXT(*pos);
+   else
+   value = RREG32_PCIE(*pos);
+
r = put_user(value, (uint32_t *)buf);
if (r)
goto out;
@@ -619,7 +623,10 @@ static ssize_t amdgpu_debugfs_regs_pcie_write(struct file 
*f, const char __user
if (r)
goto out;

-   WREG32_PCIE(*pos, value);
+   if (upper_32_bits(*pos))
+   WREG32_PCIE_EXT(*pos, value);
+   else
+   WREG32_PCIE(*pos, value);

result += 4;
buf += 4;
--
2.34.1

RE: [PATCH] drm/amdgpu: Update irq disable flow during unload

2024-01-08 Thread Kamal, Asad

[AMD Official Use Only - General]

Hi Christian,

Thank you for the comment.

This is not a normal reset; it is a reset done during unload for smu v_13_0_2.

Thanks & Regards
Asad

-Original Message-
From: Koenig, Christian 
Sent: Monday, January 8, 2024 1:33 PM
To: Kamal, Asad 
Subject: Re: [PATCH] drm/amdgpu: Update irq disable flow during unload

Am 05.01.24 um 16:21 schrieb Asad Kamal:
> In certain special cases, e.g device reset before module unload, irq
> gets disabled as part of reset sequence and won't get enabled back.
> Add special check to cover such scenarios

Well complete NAK to that. Resets shouldn't affect the IRQ state at all!

If this is an issue then something else is broken.

Regards,
Christian.

>
> Signed-off-by: Asad Kamal 
> Suggested-by: Lijo Lazar 
> ---
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 12 ++--
>   drivers/gpu/drm/amd/amdgpu/soc15.c| 13 +++--
>   2 files changed, 21 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 372de9f1ce59..a4e1b9a58679 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -2361,6 +2361,7 @@ static void gmc_v9_0_gart_disable(struct amdgpu_device 
> *adev)
>   static int gmc_v9_0_hw_fini(void *handle)
>   {
>   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> + bool irq_release = true;
>
>   gmc_v9_0_gart_disable(adev);
>
> @@ -2378,9 +2379,16 @@ static int gmc_v9_0_hw_fini(void *handle)
>   if (adev->mmhub.funcs->update_power_gating)
>   adev->mmhub.funcs->update_power_gating(adev, false);
>
> - amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
> + if (adev->shutdown)
> + irq_release = amdgpu_irq_enabled(adev, &adev->gmc.vm_fault, 0);
>
> - if (adev->gmc.ecc_irq.funcs &&
> + if (irq_release)
> + amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
> +
> + if (adev->shutdown)
> + irq_release = amdgpu_irq_enabled(adev, &adev->gmc.ecc_irq, 0);
> +
> + if (adev->gmc.ecc_irq.funcs && irq_release &&
>   amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
>   amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c
> b/drivers/gpu/drm/amd/amdgpu/soc15.c
> index 15033efec2ba..7ee835049d57 100644
> --- a/drivers/gpu/drm/amd/amdgpu/soc15.c
> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
> @@ -1266,6 +1266,7 @@ static int soc15_common_hw_init(void *handle)
>   static int soc15_common_hw_fini(void *handle)
>   {
>   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> + bool irq_release = true;
>
>   /* Disable the doorbell aperture and selfring doorbell aperture
>* separately in hw_fini because soc15_enable_doorbell_aperture @@
> -1280,10 +1281,18 @@ static int soc15_common_hw_fini(void *handle)
>
>   if (adev->nbio.ras_if &&
>   amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) {
> - if (adev->nbio.ras &&
> + if (adev->shutdown)
> + irq_release = amdgpu_irq_enabled(adev,
> +&adev->nbio.ras_controller_irq, 0);
> +
> + if (adev->nbio.ras && irq_release &&
>   adev->nbio.ras->init_ras_controller_interrupt)
>   amdgpu_irq_put(adev, &adev->nbio.ras_controller_irq, 0);
> - if (adev->nbio.ras &&
> +
> + if (adev->shutdown)
> + irq_release = amdgpu_irq_enabled(adev,
> + &adev->nbio.ras_err_event_athub_irq, 0);
> +
> + if (adev->nbio.ras && irq_release &&
>   adev->nbio.ras->init_ras_err_event_athub_interrupt)
>   amdgpu_irq_put(adev, 
> &adev->nbio.ras_err_event_athub_irq, 0);
>   }

RE: [PATCH] drm/amd/pm: Add error log for smu v13.0.6 reset

2024-01-10 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Thursday, January 11, 2024 9:52 AM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH] drm/amd/pm: Add error log for smu v13.0.6 reset

For all mode-2 reset fail cases, add error log.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 4ebc6b421c2c..7513d1cfeebd 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2235,17 +2235,18 @@ static int smu_v13_0_6_mode2_reset(struct smu_context 
*smu)
continue;
}

-   if (ret) {
-   dev_err(adev->dev,
-   "failed to send mode2 message \tparam: 0x%08x 
error code %d\n",
-   SMU_RESET_MODE_2, ret);
+   if (ret)
goto out;
-   }
+
} while (ret == -ETIME && timeout);

 out:
mutex_unlock(&smu->message_lock);

+   if (ret)
+   dev_err(adev->dev, "failed to send mode2 reset, error code %d",
+   ret);
+
return ret;
 }

--
2.25.1

RE: [PATCH] drm/amd/pm: Fix smuv13.0.6 current clock reporting

2024-01-11 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Thursday, January 11, 2024 4:02 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad ; Ma, Le 
; Wang, Yang(Kevin) 
Subject: [PATCH] drm/amd/pm: Fix smuv13.0.6 current clock reporting

When current clock is equal to max dpm level clock, the level is not indicated 
correctly with *. Fix by comparing current clock against dpm level value.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 7513d1cfeebd..a28649f21093 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -970,7 +970,9 @@ static int smu_v13_0_6_print_clks(struct smu_context *smu, 
char *buf, int size,
if (i < (clocks.num_levels - 1))
clk2 = clocks.data[i + 1].clocks_in_khz / 1000;

-   if (curr_clk >= clk1 && curr_clk < clk2) {
+   if (curr_clk == clk1) {
+   level = i;
+   } else if (curr_clk >= clk1 && curr_clk < clk2) {
level = (curr_clk - clk1) <= (clk2 - curr_clk) ?
i :
i + 1;
--
2.25.1

RE: [PATCH] drm/amdgpu: fix sdma ecc irq unbalanced issue

2024-01-15 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: amd-gfx  On Behalf Of Yang Wang
Sent: Monday, January 15, 2024 5:33 PM
To: amd-gfx@lists.freedesktop.org
Cc: Wang, Yang(Kevin) ; Zhang, Hawking 

Subject: [PATCH] drm/amdgpu: fix sdma ecc irq unbalanced issue

Fix sdma ecc irq unbalanced issue when doing mode2 reset.

Fixes: 90b87f67124a ("drm/amdgpu: add sdma v4.4.2 ACA support")

Signed-off-by: Yang Wang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 15 +++
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 4bb055eacad5..fec5a3d1c4bc 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -2267,21 +2267,12 @@ static int sdma_v4_4_2_ras_late_init(struct 
amdgpu_device *adev, struct ras_comm  {
int r;

-   r = amdgpu_ras_block_late_init(adev, ras_block);
+   r = amdgpu_sdma_ras_late_init(adev, ras_block);
if (r)
return r;

-   r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__SDMA,
-   &sdma_v4_4_2_aca_info, NULL);
-   if (r)
-   goto late_fini;
-
-   return 0;
-
-late_fini:
-   amdgpu_ras_block_late_fini(adev, ras_block);
-
-   return r;
+   return amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__SDMA,
+  &sdma_v4_4_2_aca_info, NULL);
 }

 static struct amdgpu_sdma_ras sdma_v4_4_2_ras = {
--
2.34.1

RE: [PATCH] drm/amd/pm: Fetch current power limit from FW

2024-01-18 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Thursday, January 18, 2024 2:33 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH] drm/amd/pm: Fetch current power limit from FW

Power limit of SMUv13.0.6 SOCs can be updated by out-of-band ways. Fetch the 
limit from firmware instead of using cached values.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index c16703868e5c..88cacb4770e2 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -2710,6 +2710,7 @@ int smu_get_power_limit(void *handle,
case SMU_PPT_LIMIT_CURRENT:
switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
case IP_VERSION(13, 0, 2):
+   case IP_VERSION(13, 0, 6):
case IP_VERSION(11, 0, 7):
case IP_VERSION(11, 0, 11):
case IP_VERSION(11, 0, 12):
--
2.25.1

RE: [PATCH] drm/amdgpu: Restrict bootloader wait to SMUv13.0.6

2023-09-04 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal asad.ka...@amd.com

Thanks & Regards
Asad

-Original Message-
From: amd-gfx  On Behalf Of Lijo Lazar
Sent: Monday, September 4, 2023 6:32 PM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Ma, Le ; 
Kamal, Asad ; Zhang, Hawking 
Subject: [PATCH] drm/amdgpu: Restrict bootloader wait to SMUv13.0.6

Restrict the wait for boot loader steady state only to SMUv13.0.6. For older 
SOCs, ASIC init has a longer wait period and that takes care of it.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 18 ++
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index d2a88bc630d2..469eed084976 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -160,9 +160,6 @@ static int psp_v13_0_wait_for_bootloader(struct psp_context 
*psp)
struct amdgpu_device *adev = psp->adev;
int retry_loop, ret;

-   if (adev->ip_versions[MP0_HWIP][0] == IP_VERSION(13, 0, 6))
-   psp_v13_0_wait_for_vmbx_ready(psp);
-
/* Wait for bootloader to signify that it is ready having bit 31 of
 * C2PMSG_35 set to 1. All other bits are expected to be cleared.
 * If there is an error in processing command, bits[7:0] will be set.
@@ -180,6 +177,19 @@ static int psp_v13_0_wait_for_bootloader(struct 
psp_context *psp)
return ret;
 }

+static int psp_v13_0_wait_for_bootloader_steady_state(struct
+psp_context *psp) {
+   struct amdgpu_device *adev = psp->adev;
+
+   if (adev->ip_versions[MP0_HWIP][0] == IP_VERSION(13, 0, 6)) {
+   psp_v13_0_wait_for_vmbx_ready(psp);
+
+   return psp_v13_0_wait_for_bootloader(psp);
+   }
+
+   return 0;
+}
+
 static int psp_v13_0_bootloader_load_component(struct psp_context  *psp,
   struct psp_bin_desc  
*bin_desc,
   enum psp_bootloader_cmd  bl_cmd) 
@@ -737,7 +747,7 @@ static int psp_v13_0_fatal_error_recovery_quirk(struct 
psp_context *psp)

 static const struct psp_funcs psp_v13_0_funcs = {
.init_microcode = psp_v13_0_init_microcode,
-   .wait_for_bootloader = psp_v13_0_wait_for_bootloader,
+   .wait_for_bootloader = psp_v13_0_wait_for_bootloader_steady_state,
.bootloader_load_kdb = psp_v13_0_bootloader_load_kdb,
.bootloader_load_spl = psp_v13_0_bootloader_load_spl,
.bootloader_load_sysdrv = psp_v13_0_bootloader_load_sysdrv,
--
2.25.1

RE: [PATCH] drm/amdgpu: Use default reset method handler

2023-09-07 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal 
Tested-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Wednesday, September 6, 2023 4:56 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH] drm/amdgpu: Use default reset method handler

When reset method is not passed in reset context, look for the handler for 
default reset method. On Aldebaran, default reset method for SOCs connected to 
CPU over XGMI is MODE2.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/aldebaran.c | 16 +++-
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c 
b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
index 82e1c83a7ccc..5d2516210a3a 100644
--- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
@@ -50,6 +50,13 @@ aldebaran_get_reset_handler(struct amdgpu_reset_control 
*reset_ctl,
struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
int i;

+   if (reset_context->method == AMD_RESET_METHOD_NONE) {
+   if (aldebaran_is_mode2_default(reset_ctl))
+   reset_context->method = AMD_RESET_METHOD_MODE2;
+   else
+   reset_context->method = amdgpu_asic_reset_method(adev);
+   }
+
if (reset_context->method != AMD_RESET_METHOD_NONE) {
dev_dbg(adev->dev, "Getting reset handler for method %d\n",
reset_context->method);
@@ -59,15 +66,6 @@ aldebaran_get_reset_handler(struct amdgpu_reset_control 
*reset_ctl,
}
}

-   if (aldebaran_is_mode2_default(reset_ctl)) {
-   for_each_handler(i, handler, reset_ctl) {
-   if (handler->reset_method == AMD_RESET_METHOD_MODE2) {
-   reset_context->method = AMD_RESET_METHOD_MODE2;
-   return handler;
-   }
-   }
-   }
-
dev_dbg(adev->dev, "Reset handler not found!\n");

return NULL;
--
2.25.1

RE: [PATCH] drm/amdgpu: Restore partition mode after reset

2023-09-11 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal 
Tested-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Friday, September 8, 2023 4:10 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH] drm/amdgpu: Restore partition mode after reset

On a full device reset, PSP FW gets unloaded. Hence restore the partition mode 
by placing a new request.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  5 
 drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c| 28 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h|  1 +
 drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c |  2 +-
 4 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 5f32e8d4f3d3..5d2b6a7c5f6e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5115,6 +5115,11 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
if (r)
return r;

+   r = amdgpu_xcp_restore_partition_mode(
+   tmp_adev->xcp_mgr);
+   if (r)
+   goto out;
+
r = amdgpu_device_ip_resume_phase2(tmp_adev);
if (r)
goto out;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
index 565a1fa436d4..2b99eed5ba19 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
@@ -163,16 +163,11 @@ int amdgpu_xcp_init(struct amdgpu_xcp_mgr *xcp_mgr, int 
num_xcps, int mode)
return 0;
 }

-int amdgpu_xcp_switch_partition_mode(struct amdgpu_xcp_mgr *xcp_mgr, int mode)
+static int __amdgpu_xcp_switch_partition_mode(struct amdgpu_xcp_mgr *xcp_mgr,
+ int mode)
 {
int ret, curr_mode, num_xcps = 0;

-   if (!xcp_mgr || mode == AMDGPU_XCP_MODE_NONE)
-   return -EINVAL;
-
-   if (xcp_mgr->mode == mode)
-   return 0;
-
if (!xcp_mgr->funcs || !xcp_mgr->funcs->switch_partition_mode)
return 0;

@@ -201,6 +196,25 @@ int amdgpu_xcp_switch_partition_mode(struct amdgpu_xcp_mgr 
*xcp_mgr, int mode)
return ret;
 }

+int amdgpu_xcp_switch_partition_mode(struct amdgpu_xcp_mgr *xcp_mgr,
+int mode) {
+   if (!xcp_mgr || mode == AMDGPU_XCP_MODE_NONE)
+   return -EINVAL;
+
+   if (xcp_mgr->mode == mode)
+   return 0;
+
+   return __amdgpu_xcp_switch_partition_mode(xcp_mgr, mode); }
+
+int amdgpu_xcp_restore_partition_mode(struct amdgpu_xcp_mgr *xcp_mgr) {
+   if (!xcp_mgr || xcp_mgr->mode == AMDGPU_XCP_MODE_NONE)
+   return 0;
+
+   return __amdgpu_xcp_switch_partition_mode(xcp_mgr, xcp_mgr->mode); }
+
 int amdgpu_xcp_query_partition_mode(struct amdgpu_xcp_mgr *xcp_mgr, u32 flags) 
 {
int mode;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
index 9a1036aeec2a..90138bc5f03d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
@@ -129,6 +129,7 @@ int amdgpu_xcp_mgr_init(struct amdgpu_device *adev, int 
init_mode,  int amdgpu_xcp_init(struct amdgpu_xcp_mgr *xcp_mgr, int num_xcps, 
int mode);  int amdgpu_xcp_query_partition_mode(struct amdgpu_xcp_mgr *xcp_mgr, 
u32 flags);  int amdgpu_xcp_switch_partition_mode(struct amdgpu_xcp_mgr 
*xcp_mgr, int mode);
+int amdgpu_xcp_restore_partition_mode(struct amdgpu_xcp_mgr *xcp_mgr);
 int amdgpu_xcp_get_partition(struct amdgpu_xcp_mgr *xcp_mgr,
 enum AMDGPU_XCP_IP_BLOCK ip, int instance);

diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c 
b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
index d0fc62784e82..3f715e7fe1a9 100644
--- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
+++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
@@ -500,7 +500,7 @@ static int aqua_vanjaram_switch_partition_mode(struct 
amdgpu_xcp_mgr *xcp_mgr,
return -EINVAL;
}

-   if (adev->kfd.init_complete)
+   if (adev->kfd.init_complete && !amdgpu_in_reset(adev))
flags |= AMDGPU_XCP_OPS_KFD;

if (flags & AMDGPU_XCP_OPS_KFD) {
--
2.25.1

Re: [PATCH] drm/amd/pm: Round Q10 format values in SMU v13.0.6

2023-09-18 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal <asad.ka...@amd.com>

Thanks & Regards
Asad

From: amd-gfx  on behalf of Lijo Lazar 

Sent: Friday, September 15, 2023 6:29:37 PM
To: amd-gfx@lists.freedesktop.org 
Cc: Deucher, Alexander ; Kamal, Asad 
; Zhang, Hawking 
Subject: [PATCH] drm/amd/pm: Round Q10 format values in SMU v13.0.6

Instead of neglecting fractional part, round the Q10 format values in
SMU v13.0.6 metrics table.

Signed-off-by: Lijo Lazar 
---
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 70 ++-
 1 file changed, 36 insertions(+), 34 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index f9c1219f0c4f..11a6cd96c601 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -244,6 +244,8 @@ struct PPTable_t {
 };

 #define SMUQ10_TO_UINT(x) ((x) >> 10)
+#define SMUQ10_FRAC(x) ((x) & 0x3ff)
+#define SMUQ10_ROUND(x) ((SMUQ10_TO_UINT(x)) + ((SMUQ10_FRAC(x)) >= 0x200))

 struct smu_v13_0_6_dpm_map {
 enum smu_clk_type clk_type;
@@ -389,25 +391,25 @@ static int smu_v13_0_6_setup_driver_pptable(struct 
smu_context *smu)
 return -ETIME;

 pptable->MaxSocketPowerLimit =
-   SMUQ10_TO_UINT(metrics->MaxSocketPowerLimit);
+   SMUQ10_ROUND(metrics->MaxSocketPowerLimit);
 pptable->MaxGfxclkFrequency =
-   SMUQ10_TO_UINT(metrics->MaxGfxclkFrequency);
+   SMUQ10_ROUND(metrics->MaxGfxclkFrequency);
 pptable->MinGfxclkFrequency =
-   SMUQ10_TO_UINT(metrics->MinGfxclkFrequency);
+   SMUQ10_ROUND(metrics->MinGfxclkFrequency);

 for (i = 0; i < 4; ++i) {
 pptable->FclkFrequencyTable[i] =
-   SMUQ10_TO_UINT(metrics->FclkFrequencyTable[i]);
+   SMUQ10_ROUND(metrics->FclkFrequencyTable[i]);
 pptable->UclkFrequencyTable[i] =
-   SMUQ10_TO_UINT(metrics->UclkFrequencyTable[i]);
-   pptable->SocclkFrequencyTable[i] = SMUQ10_TO_UINT(
+   SMUQ10_ROUND(metrics->UclkFrequencyTable[i]);
+   pptable->SocclkFrequencyTable[i] = SMUQ10_ROUND(
 metrics->SocclkFrequencyTable[i]);
 pptable->VclkFrequencyTable[i] =
-   SMUQ10_TO_UINT(metrics->VclkFrequencyTable[i]);
+   SMUQ10_ROUND(metrics->VclkFrequencyTable[i]);
 pptable->DclkFrequencyTable[i] =
-   SMUQ10_TO_UINT(metrics->DclkFrequencyTable[i]);
+   SMUQ10_ROUND(metrics->DclkFrequencyTable[i]);
 pptable->LclkFrequencyTable[i] =
-   SMUQ10_TO_UINT(metrics->LclkFrequencyTable[i]);
+   SMUQ10_ROUND(metrics->LclkFrequencyTable[i]);
 }

 /* use AID0 serial number by default */
@@ -730,50 +732,50 @@ static int smu_v13_0_6_get_smu_metrics_data(struct 
smu_context *smu,
 smu_cmn_get_smc_version(smu, NULL, &smu_version);
 if (smu_version >= 0x552F00) {
 xcc_id = GET_INST(GC, 0);
-   *value = 
SMUQ10_TO_UINT(metrics->GfxclkFrequency[xcc_id]);
+   *value = SMUQ10_ROUND(metrics->GfxclkFrequency[xcc_id]);
 } else {
 *value = 0;
 }
 break;
 case METRICS_CURR_SOCCLK:
 case METRICS_AVERAGE_SOCCLK:
-   *value = SMUQ10_TO_UINT(metrics->SocclkFrequency[0]);
+   *value = SMUQ10_ROUND(metrics->SocclkFrequency[0]);
 break;
 case METRICS_CURR_UCLK:
 case METRICS_AVERAGE_UCLK:
-   *value = SMUQ10_TO_UINT(metrics->UclkFrequency);
+   *value = SMUQ10_ROUND(metrics->UclkFrequency);
 break;
 case METRICS_CURR_VCLK:
-   *value = SMUQ10_TO_UINT(metrics->VclkFrequency[0]);
+   *value = SMUQ10_ROUND(metrics->VclkFrequency[0]);
 break;
 case METRICS_CURR_DCLK:
-   *value = SMUQ10_TO_UINT(metrics->DclkFrequency[0]);
+   *value = SMUQ10_ROUND(metrics->DclkFrequency[0]);
 break;
 case METRICS_CURR_FCLK:
-   *value = SMUQ10_TO_UINT(metrics->FclkFrequency);
+   *value = SMUQ10_ROUND(metrics->FclkFrequency);
 break;

RE: [PATCH] drm/amdgpu:Expose physical id of device in XGMI hive

2023-09-21 Thread Kamal, Asad

[AMD Official Use Only - General]

-Original Message-
From: amd-gfx  On Behalf Of Mangesh Gadre
Sent: Thursday, September 21, 2023 10:06 AM
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking ; 
Lazar, Lijo ; Ma, Le ; Zhang, Morris 

Cc: Gadre, Mangesh ; Lazar, Lijo 
Subject: [PATCH] drm/amdgpu:Expose physical id of device in XGMI hive

This identifies the physical ordering of devices in the hive

Signed-off-by: Mangesh Gadre 
Reviewed-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 061534e845a7..4cf38164d72c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -325,6 +325,17 @@ static ssize_t amdgpu_xgmi_show_device_id(struct device 
*dev,

 }

+static ssize_t amdgpu_xgmi_show_physical_id(struct device *dev,
+struct device_attribute *attr,
+char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+
+   return sysfs_emit(buf, "%llu\n", adev->gmc.xgmi.physical_node_id);
+
+}
+
 static ssize_t amdgpu_xgmi_show_num_hops(struct device *dev,
struct device_attribute *attr,
char *buf)
@@ -390,6 +401,7 @@ static ssize_t amdgpu_xgmi_show_error(struct device *dev,


 static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL);
+static DEVICE_ATTR(xgmi_physical_id, 0444,
+amdgpu_xgmi_show_physical_id, NULL);
[Kamal, Asad]  Can we use S_IRUGO in place of hard code value 0444?

Regards
Asad
 static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL);  static 
DEVICE_ATTR(xgmi_num_hops, S_IRUGO, amdgpu_xgmi_show_num_hops, NULL);  static 
DEVICE_ATTR(xgmi_num_links, S_IRUGO, amdgpu_xgmi_show_num_links, NULL); @@ 
-407,6 +419,12 @@ static int amdgpu_xgmi_sysfs_add_dev_info(struct 
amdgpu_device *adev,
return ret;
}

+   ret = device_create_file(adev->dev, &dev_attr_xgmi_physical_id);
+   if (ret) {
+   dev_err(adev->dev, "XGMI: Failed to create device file 
xgmi_physical_id\n");
+   return ret;
+   }
+
/* Create xgmi error file */
ret = device_create_file(adev->dev, &dev_attr_xgmi_error);
if (ret)
@@ -448,6 +466,7 @@ static int amdgpu_xgmi_sysfs_add_dev_info(struct 
amdgpu_device *adev,

 remove_file:
device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
+   device_remove_file(adev->dev, &dev_attr_xgmi_physical_id);
device_remove_file(adev->dev, &dev_attr_xgmi_error);
device_remove_file(adev->dev, &dev_attr_xgmi_num_hops);
device_remove_file(adev->dev, &dev_attr_xgmi_num_links); @@ -463,6 
+482,7 @@ static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev,
memset(node, 0, sizeof(node));

device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
+   device_remove_file(adev->dev, &dev_attr_xgmi_physical_id);
device_remove_file(adev->dev, &dev_attr_xgmi_error);
device_remove_file(adev->dev, &dev_attr_xgmi_num_hops);
device_remove_file(adev->dev, &dev_attr_xgmi_num_links);
--
2.34.1

RE: [PATCH] drm/amdgpu:Expose physical id of device in XGMI hive

2023-09-21 Thread Kamal, Asad

[AMD Official Use Only - General]

With clarification from Lijo, patch is,

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad


-Original Message-
From: Lazar, Lijo 
Sent: Thursday, September 21, 2023 2:44 PM
To: Kamal, Asad ; Gadre, Mangesh ; 
amd-gfx@lists.freedesktop.org; Zhang, Hawking ; Ma, Le 
; Zhang, Morris 
Subject: Re: [PATCH] drm/amdgpu:Expose physical id of device in XGMI hive



On 9/21/2023 12:34 PM, Kamal, Asad wrote:
> [AMD Official Use Only - General]
>
> -Original Message-
> From: amd-gfx  On Behalf Of
> Mangesh Gadre
> Sent: Thursday, September 21, 2023 10:06 AM
> To: amd-gfx@lists.freedesktop.org; Zhang, Hawking
> ; Lazar, Lijo ; Ma, Le
> ; Zhang, Morris 
> Cc: Gadre, Mangesh ; Lazar, Lijo
> 
> Subject: [PATCH] drm/amdgpu:Expose physical id of device in XGMI hive
>
> This identifies the physical ordering of devices in the hive
>
> Signed-off-by: Mangesh Gadre 
> Reviewed-by: Lijo Lazar 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 20 
>   1 file changed, 20 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index 061534e845a7..4cf38164d72c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -325,6 +325,17 @@ static ssize_t amdgpu_xgmi_show_device_id(struct
> device *dev,
>
>   }
>
> +static ssize_t amdgpu_xgmi_show_physical_id(struct device *dev,
> +struct device_attribute *attr,
> +char *buf) {
> +   struct drm_device *ddev = dev_get_drvdata(dev);
> +   struct amdgpu_device *adev = drm_to_adev(ddev);
> +
> +   return sysfs_emit(buf, "%llu\n",
> + adev->gmc.xgmi.physical_node_id);
> +
> +}
> +
>   static ssize_t amdgpu_xgmi_show_num_hops(struct device *dev,
>  struct device_attribute *attr,
>  char *buf) @@ -390,6 +401,7
> @@ static ssize_t amdgpu_xgmi_show_error(struct device *dev,
>
>
>   static DEVICE_ATTR(xgmi_device_id, S_IRUGO,
> amdgpu_xgmi_show_device_id, NULL);
> +static DEVICE_ATTR(xgmi_physical_id, 0444,
> +amdgpu_xgmi_show_physical_id, NULL);
> [Kamal, Asad]  Can we use S_IRUGO in place of hard code value 0444?
>
This is the recommended way by checkpatch. S_IR* will result in "Symbolic 
permissions are not preferred. Consider using octal permissions"
[Kamal, Asad] Ok.

Thanks,
Lijo

> Regards
> Asad
>   static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL);  
> static DEVICE_ATTR(xgmi_num_hops, S_IRUGO, amdgpu_xgmi_show_num_hops, NULL);  
> static DEVICE_ATTR(xgmi_num_links, S_IRUGO, amdgpu_xgmi_show_num_links, 
> NULL); @@ -407,6 +419,12 @@ static int amdgpu_xgmi_sysfs_add_dev_info(struct 
> amdgpu_device *adev,
>  return ret;
>  }
>
> +   ret = device_create_file(adev->dev, &dev_attr_xgmi_physical_id);
> +   if (ret) {
> +   dev_err(adev->dev, "XGMI: Failed to create device file 
> xgmi_physical_id\n");
> +   return ret;
> +   }
> +
>  /* Create xgmi error file */
>  ret = device_create_file(adev->dev, &dev_attr_xgmi_error);
>  if (ret)
> @@ -448,6 +466,7 @@ static int amdgpu_xgmi_sysfs_add_dev_info(struct
> amdgpu_device *adev,
>
>   remove_file:
>  device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
> +   device_remove_file(adev->dev, &dev_attr_xgmi_physical_id);
>  device_remove_file(adev->dev, &dev_attr_xgmi_error);
>  device_remove_file(adev->dev, &dev_attr_xgmi_num_hops);
>  device_remove_file(adev->dev, &dev_attr_xgmi_num_links); @@ -463,6 
> +482,7 @@ static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device 
> *adev,
>  memset(node, 0, sizeof(node));
>
>  device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
> +   device_remove_file(adev->dev, &dev_attr_xgmi_physical_id);
>  device_remove_file(adev->dev, &dev_attr_xgmi_error);
>  device_remove_file(adev->dev, &dev_attr_xgmi_num_hops);
>  device_remove_file(adev->dev, &dev_attr_xgmi_num_links);
> --
> 2.34.1
>

RE: [PATCH 1/2] drm/amd/pm: Add throttle limit for SMU v13.0.6

2023-09-29 Thread Kamal, Asad

[AMD Official Use Only - General]

Series is Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: amd-gfx  On Behalf Of Lijo Lazar
Sent: Wednesday, September 20, 2023 4:10 PM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Zhang, Hawking 

Subject: [PATCH 1/2] drm/amd/pm: Add throttle limit for SMU v13.0.6

CTF limit represents the max operating temperature and thermal limit gives the 
limit at which throttling starts. Add support for both limits.
SOC and HBM may have different limit values. *_emergency_max gives the max 
operating temperature and *_crit_max represents the throttle limit.

Signed-off-by: Lijo Lazar 
---
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 34 +++
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 11a6cd96c601..73db595bf6d9 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -165,6 +165,7 @@ static const struct cmn2asic_msg_mapping 
smu_v13_0_6_message_map[SMU_MSG_MAX_COU
MSG_MAP(SetSoftMaxGfxClk,
PPSMC_MSG_SetSoftMaxGfxClk,0),
MSG_MAP(PrepareMp1ForUnload, 
PPSMC_MSG_PrepareForDriverUnload,  0),
MSG_MAP(GetCTFLimit, PPSMC_MSG_GetCTFLimit, 
0),
+   MSG_MAP(GetThermalLimit, 
PPSMC_MSG_ReadThrottlerLimit,  0),
MSG_MAP(ClearMcaOnRead,  PPSMC_MSG_ClearMcaOnRead,  
0),
MSG_MAP(QueryValidMcaCount,  
PPSMC_MSG_QueryValidMcaCount,  0),
MSG_MAP(QueryValidMcaCeCount,
PPSMC_MSG_QueryValidMcaCeCount,0),
@@ -2110,7 +2111,7 @@ static int 
smu_v13_0_6_get_thermal_temperature_range(struct smu_context *smu,
 struct 
smu_temperature_range *range)  {
struct amdgpu_device *adev = smu->adev;
-   u32 aid_temp, xcd_temp, mem_temp;
+   u32 aid_temp, xcd_temp, max_temp;
uint32_t smu_version;
u32 ccd_temp = 0;
int ret;
@@ -2126,31 +2127,50 @@ static int 
smu_v13_0_6_get_thermal_temperature_range(struct smu_context *smu,
if (smu_version < 0x554500)
return 0;

+   /* Get SOC Max operating temperature */
ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_GetCTFLimit,
  PPSMC_AID_THM_TYPE, &aid_temp);
if (ret)
goto failed;
-
if (adev->flags & AMD_IS_APU) {
ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_GetCTFLimit,
  PPSMC_CCD_THM_TYPE, 
&ccd_temp);
if (ret)
goto failed;
}
-
ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_GetCTFLimit,
  PPSMC_XCD_THM_TYPE, &xcd_temp);
if (ret)
goto failed;
-
-   range->hotspot_crit_max = max3(aid_temp, xcd_temp, ccd_temp) *
+   range->hotspot_emergency_max = max3(aid_temp, xcd_temp, ccd_temp) *
   SMU_TEMPERATURE_UNITS_PER_CENTIGRADES;
+
+   /* Get HBM Max operating temperature */
ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_GetCTFLimit,
- PPSMC_HBM_THM_TYPE, &mem_temp);
+ PPSMC_HBM_THM_TYPE, &max_temp);
if (ret)
goto failed;
+   range->mem_emergency_max =
+   max_temp * SMU_TEMPERATURE_UNITS_PER_CENTIGRADES;
+
+   /* Get SOC thermal throttle limit */
+   ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_GetThermalLimit,
+ 
PPSMC_THROTTLING_LIMIT_TYPE_SOCKET,
+ &max_temp);
+   if (ret)
+   goto failed;
+   range->hotspot_crit_max =
+   max_temp * SMU_TEMPERATURE_UNITS_PER_CENTIGRADES;
+
+   /* Get HBM thermal throttle limit */
+   ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_GetThermalLimit,
+ PPSMC_THROTTLING_LIMIT_TYPE_HBM,
+ &max_temp);
+   if (ret)
+   goto failed;
+
+   range->mem_crit_max = max_temp *
+SMU_TEMPERATURE_UNITS_PER_CENTIGRADES;

-   range->mem_crit_max = mem_temp * SMU_TEMPERATURE_UNITS_PER_CENTIGRADES;
 failed:
return ret;
 }
--
2.25.1

RE: [PATCH v2 3/3] drm/amd/pm: Use gpu_metrics_v1_4 for SMUv13.0.6

2023-10-09 Thread Kamal, Asad

[AMD Official Use Only - General]

-Original Message-
From: Lazar, Lijo 
Sent: Monday, October 9, 2023 7:59 PM
To: Kamal, Asad ; amd-gfx@lists.freedesktop.org
Cc: Ma, Le ; Zhang, Morris ; Zhang, Hawking 

Subject: Re: [PATCH v2 3/3] drm/amd/pm: Use gpu_metrics_v1_4 for SMUv13.0.6



On 10/9/2023 5:28 PM, Lazar, Lijo wrote:
>
>
> On 10/6/2023 8:11 PM, Asad Kamal wrote:
>> Use gpu_metrics_v1_4 for SMUv13.0.6 to fill gpu metric info
>>
>> Signed-off-by: Asad Kamal 
>
> Series is:
>  Reviewed-by: Lijo Lazar 
>

On second thought, since there is no FW release yet with FW metrics table v9 
support, I suggest dropping patch 1 and the pcie_bandwidth_inst value assignment. 
The field can be kept as a placeholder until there is a FW update.
[Kamal, Asad] Will send a v3 with the changes.

Thanks & Regards
Asad

Thanks,
Lijo

> Thanks,
> Lijo
>
>> ---
>>   .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 67
>> ---
>>   1 file changed, 43 insertions(+), 24 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
>> b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
>> index ce971a93d28b..3a07f1c95e45 100644
>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
>> @@ -279,7 +279,7 @@ static int smu_v13_0_6_tables_init(struct
>> smu_context *smu)
>>   return -ENOMEM;
>>   smu_table->metrics_time = 0;
>> -smu_table->gpu_metrics_table_size = sizeof(struct
>> gpu_metrics_v1_3);
>> +smu_table->gpu_metrics_table_size = sizeof(struct
>> +gpu_metrics_v1_4);
>>   smu_table->gpu_metrics_table =
>>   kzalloc(smu_table->gpu_metrics_table_size, GFP_KERNEL);
>>   if (!smu_table->gpu_metrics_table) { @@ -1969,22 +1969,19 @@
>> static int smu_v13_0_6_get_current_pcie_link_speed(struct smu_context
>> *smu)
>>   static ssize_t smu_v13_0_6_get_gpu_metrics(struct smu_context *smu,
>> void **table)
>>   {
>>   struct smu_table_context *smu_table = &smu->smu_table;
>> -struct gpu_metrics_v1_3 *gpu_metrics =
>> -(struct gpu_metrics_v1_3 *)smu_table->gpu_metrics_table;
>> +struct gpu_metrics_v1_4 *gpu_metrics =
>> +(struct gpu_metrics_v1_4 *)smu_table->gpu_metrics_table;
>>   struct amdgpu_device *adev = smu->adev;
>> -int ret = 0, inst0, xcc0;
>> +int ret = 0, xcc_id, inst, i;
>>   MetricsTable_t *metrics;
>>   u16 link_width_level;
>> -inst0 = adev->sdma.instance[0].aid_id;
>> -xcc0 = GET_INST(GC, 0);
>> -
>>   metrics = kzalloc(sizeof(MetricsTable_t), GFP_KERNEL);
>>   ret = smu_v13_0_6_get_metrics_table(smu, metrics, true);
>>   if (ret)
>>   return ret;
>> -smu_cmn_init_soft_gpu_metrics(gpu_metrics, 1, 3);
>> +smu_cmn_init_soft_gpu_metrics(gpu_metrics, 1, 4);
>>   gpu_metrics->temperature_hotspot =
>>   SMUQ10_ROUND(metrics->MaxSocketTemperature);
>> @@ -2000,30 +1997,38 @@ static ssize_t
>> smu_v13_0_6_get_gpu_metrics(struct smu_context *smu, void **table
>>   gpu_metrics->average_umc_activity =
>>   SMUQ10_ROUND(metrics->DramBandwidthUtilization);
>> -gpu_metrics->average_socket_power =
>> +gpu_metrics->curr_socket_power =
>>   SMUQ10_ROUND(metrics->SocketPower);
>>   /* Energy counter reported in 15.259uJ (2^-16) units */
>>   gpu_metrics->energy_accumulator = metrics->SocketEnergyAcc;
>> -gpu_metrics->current_gfxclk =
>> -SMUQ10_ROUND(metrics->GfxclkFrequency[xcc0]);
>> -gpu_metrics->current_socclk =
>> -SMUQ10_ROUND(metrics->SocclkFrequency[inst0]);
>> -gpu_metrics->current_uclk =
>> SMUQ10_ROUND(metrics->UclkFrequency);
>> -gpu_metrics->current_vclk0 =
>> -SMUQ10_ROUND(metrics->VclkFrequency[inst0]);
>> -gpu_metrics->current_dclk0 =
>> -SMUQ10_ROUND(metrics->DclkFrequency[inst0]);
>> +for (i = 0; i < MAX_GFX_CLKS; i++) {
>> +xcc_id = GET_INST(GC, i);
>> +if (xcc_id >= 0)
>> +gpu_metrics->current_gfxclk[i] =
>> +SMUQ10_ROUND(metrics->GfxclkFrequency[xcc_id]);
>> +
>> +if (i < MAX_CLKS) {
>> +gpu_metrics->current_socclk[i] =
>> +SMUQ10_ROUND(metrics->SocclkFrequency[i]);
>> +inst = GET_INST(VCN, i);
>> +if (inst >= 0) {
>> +gpu_metrics

RE: [PATCH] Revert "drm/amdgpu: Program xcp_ctl registers as needed"

2023-10-11 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Gadre, Mangesh 
Sent: Wednesday, October 11, 2023 3:11 PM
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking ; 
Lazar, Lijo ; Ma, Le ; Zhang, Morris 
; Kamal, Asad 
Cc: Gadre, Mangesh ; Lazar, Lijo 
Subject: [PATCH] Revert "drm/amdgpu: Program xcp_ctl registers as needed"

This reverts commit 3cf01336313894419498a0d5eb367f092a436195.

XCP_CTL register is programmed by firmware and register access is protected.

Signed-off-by: Mangesh Gadre 
Reviewed-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 23 +++
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index fbfe0a1c4b19..39bc441695f9 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -623,7 +623,7 @@ static int gfx_v9_4_3_switch_compute_partition(struct 
amdgpu_device *adev,
int num_xccs_per_xcp)
 {
int ret, i, num_xcc;
-   u32 tmp = 0, regval;
+   u32 tmp = 0;

if (adev->psp.funcs) {
ret = psp_spatial_partition(&adev->psp, @@ -631,24 +631,23 @@ 
static int gfx_v9_4_3_switch_compute_partition(struct amdgpu_device *adev,
num_xccs_per_xcp);
if (ret)
return ret;
-   }
-
-   num_xcc = NUM_XCC(adev->gfx.xcc_mask);
+   } else {
+   num_xcc = NUM_XCC(adev->gfx.xcc_mask);

-   for (i = 0; i < num_xcc; i++) {
-   tmp = REG_SET_FIELD(tmp, CP_HYP_XCP_CTL, NUM_XCC_IN_XCP,
-   num_xccs_per_xcp);
-   tmp = REG_SET_FIELD(tmp, CP_HYP_XCP_CTL, VIRTUAL_XCC_ID,
-   i % num_xccs_per_xcp);
-   regval = RREG32_SOC15(GC, GET_INST(GC, i), regCP_HYP_XCP_CTL);
-   if (regval != tmp)
+   for (i = 0; i < num_xcc; i++) {
+   tmp = REG_SET_FIELD(tmp, CP_HYP_XCP_CTL, NUM_XCC_IN_XCP,
+   num_xccs_per_xcp);
+   tmp = REG_SET_FIELD(tmp, CP_HYP_XCP_CTL, VIRTUAL_XCC_ID,
+   i % num_xccs_per_xcp);
WREG32_SOC15(GC, GET_INST(GC, i), regCP_HYP_XCP_CTL,
 tmp);
+   }
+   ret = 0;
}

adev->gfx.num_xcc_per_xcp = num_xccs_per_xcp;

-   return 0;
+   return ret;
 }

 static int gfx_v9_4_3_ih_to_xcc_inst(struct amdgpu_device *adev, int ih_node)
--
2.34.1

RE: [PATCH] drm/amdgpu: Add a read to GFX v9.4.3 ring test

2023-10-20 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Zhang, Hawking 
Sent: Friday, October 20, 2023 12:36 PM
To: Lazar, Lijo ; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Kamal, Asad 
; Ma, Le 
Subject: RE: [PATCH] drm/amdgpu: Add a read to GFX v9.4.3 ring test

[AMD Official Use Only - General]

Acked-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: Lazar, Lijo 
Sent: Friday, October 20, 2023 15:02
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad ; Ma, Le 

Subject: [PATCH] drm/amdgpu: Add a read to GFX v9.4.3 ring test

Issue a read to confirm the register write before ringing the doorbell. With 
multiple XCCs there is a chance of a race condition.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index a1c2c952d882..5861e4d0eda9 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -256,6 +256,7 @@ static int gfx_v9_4_3_ring_test_ring(struct amdgpu_ring 
*ring)
xcc_offset = SOC15_REG_OFFSET(GC, 0, regSCRATCH_REG0);
scratch_reg0_offset = SOC15_REG_OFFSET(GC, GET_INST(GC, ring->xcc_id), 
regSCRATCH_REG0);
WREG32(scratch_reg0_offset, 0xCAFEDEAD);
+   tmp = RREG32(scratch_reg0_offset);

r = amdgpu_ring_alloc(ring, 3);
if (r)
--
2.25.1

RE: [PATCH 1/2] drm/amd/pm: Hide irrelevant pm device attributes

2023-11-03 Thread Kamal, Asad

[AMD Official Use Only - General]

Series is Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: amd-gfx  On Behalf Of Lijo Lazar
Sent: Friday, November 3, 2023 11:30 AM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Zhang, Hawking 

Subject: [PATCH 1/2] drm/amd/pm: Hide irrelevant pm device attributes

Change return code to EOPNOTSUPP for unsupported functions. Use the error code 
information to hide sysfs nodes not valid for the SOC.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/amdgpu_dpm.c   | 12 ++--
 drivers/gpu/drm/amd/pm/amdgpu_pm.c| 12 
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c |  4 ++--
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
index aed635e2da9c..aed232d107b6 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
@@ -491,7 +491,7 @@ int amdgpu_dpm_read_sensor(struct amdgpu_device *adev, enum 
amd_pp_sensors senso  int amdgpu_dpm_get_apu_thermal_limit(struct amdgpu_device 
*adev, uint32_t *limit)  {
const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
-   int ret = -EINVAL;
+   int ret = -EOPNOTSUPP;

if (pp_funcs && pp_funcs->get_apu_thermal_limit) {
mutex_lock(&adev->pm.mutex);
@@ -505,7 +505,7 @@ int amdgpu_dpm_get_apu_thermal_limit(struct amdgpu_device 
*adev, uint32_t *limit  int amdgpu_dpm_set_apu_thermal_limit(struct 
amdgpu_device *adev, uint32_t limit)  {
const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
-   int ret = -EINVAL;
+   int ret = -EOPNOTSUPP;

if (pp_funcs && pp_funcs->set_apu_thermal_limit) {
mutex_lock(&adev->pm.mutex);
@@ -1182,7 +1182,7 @@ int amdgpu_dpm_get_sclk_od(struct amdgpu_device *adev)
int ret = 0;

if (!pp_funcs->get_sclk_od)
-   return 0;
+   return -EOPNOTSUPP;

mutex_lock(&adev->pm.mutex);
ret = pp_funcs->get_sclk_od(adev->powerplay.pp_handle);
@@ -1196,7 +1196,7 @@ int amdgpu_dpm_set_sclk_od(struct amdgpu_device *adev, 
uint32_t value)
const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;

if (is_support_sw_smu(adev))
-   return 0;
+   return -EOPNOTSUPP;

mutex_lock(&adev->pm.mutex);
if (pp_funcs->set_sclk_od)
@@ -1219,7 +1219,7 @@ int amdgpu_dpm_get_mclk_od(struct amdgpu_device *adev)
int ret = 0;

if (!pp_funcs->get_mclk_od)
-   return 0;
+   return -EOPNOTSUPP;

mutex_lock(&adev->pm.mutex);
ret = pp_funcs->get_mclk_od(adev->powerplay.pp_handle);
@@ -1233,7 +1233,7 @@ int amdgpu_dpm_set_mclk_od(struct amdgpu_device *adev, 
uint32_t value)
const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;

if (is_support_sw_smu(adev))
-   return 0;
+   return -EOPNOTSUPP;

mutex_lock(&adev->pm.mutex);
if (pp_funcs->set_mclk_od)
diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index 6ad957aaef3c..083048131bca 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -2237,6 +2237,18 @@ static int default_attr_update(struct amdgpu_device 
*adev, struct amdgpu_device_
} else if (DEVICE_ATTR_IS(xgmi_plpd_policy)) {
if (amdgpu_dpm_get_xgmi_plpd_mode(adev, NULL) == XGMI_PLPD_NONE)
*states = ATTR_STATE_UNSUPPORTED;
+   } else if (DEVICE_ATTR_IS(pp_dpm_mclk_od)) {
+   if (amdgpu_dpm_get_mclk_od(adev) == -EOPNOTSUPP)
+   *states = ATTR_STATE_UNSUPPORTED;
+   } else if (DEVICE_ATTR_IS(pp_dpm_sclk_od)) {
+   if (amdgpu_dpm_get_sclk_od(adev) == -EOPNOTSUPP)
+   *states = ATTR_STATE_UNSUPPORTED;
+   } else if (DEVICE_ATTR_IS(apu_thermal_cap)) {
+   u32 limit;
+
+   if (amdgpu_dpm_get_apu_thermal_limit(adev, &limit) ==
+   -EOPNOTSUPP)
+   *states = ATTR_STATE_UNSUPPORTED;
}

switch (gc_ver) {
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 7fe32cdea5a8..6d6221024d7e 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -2747,7 +2747,7 @@ static int smu_read_sensor(void *handle,

 static int smu_get_apu_thermal_limit(void *handle, uint32_t *limit)  {
-   int ret = -EINVAL;
+   int ret = -EOPNOTSUPP;
struct smu_context *smu = handle;

if (smu->ppt_funcs && smu->ppt_funcs->get_apu_thermal_limit)
@@ -2758,7 +2758,7 @@ static int smu_get_apu_thermal_limit(void *handle, 
uint32_t *limit)

 static int smu_set_apu_thermal_limit(void *handle, uint32_t limit)  {
-   int ret = -EINVAL;
+   int ret = -EOPNOTSUPP;
struct smu_co

RE: [PATCH] drm/amdgpu: Fix sdma 4.4.2 doorbell rptr/wptr init

2023-11-05 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal 
Tested-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Monday, November 6, 2023 9:51 AM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad ; Ma, Le 

Subject: [PATCH] drm/amdgpu: Fix sdma 4.4.2 doorbell rptr/wptr init

Doorbell rptr/wptr can be set through multiple ways including direct register 
initialization. Disable doorbell during hw_fini once the ring is disabled so 
that during next module reload direct initialization takes effect. Also, move 
the direct initialization after minor update is set to 1 since rptr/wptr are 
reinitialized back to 0 which could be lower than the previous doorbell value 
(ex: cases like module reload).

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 25 ++--
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index c46bc6aa4f48..bd65a62f8903 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -427,6 +427,7 @@ static void sdma_v4_4_2_inst_gfx_stop(struct amdgpu_device 
*adev,
  uint32_t inst_mask)
 {
struct amdgpu_ring *sdma[AMDGPU_MAX_SDMA_INSTANCES];
+   u32 doorbell_offset, doorbell;
u32 rb_cntl, ib_cntl;
int i, unset = 0;

@@ -444,6 +445,18 @@ static void sdma_v4_4_2_inst_gfx_stop(struct amdgpu_device 
*adev,
ib_cntl = RREG32_SDMA(i, regSDMA_GFX_IB_CNTL);
ib_cntl = REG_SET_FIELD(ib_cntl, SDMA_GFX_IB_CNTL, IB_ENABLE, 
0);
WREG32_SDMA(i, regSDMA_GFX_IB_CNTL, ib_cntl);
+
+   if (sdma[i]->use_doorbell) {
+   doorbell = RREG32_SDMA(i, regSDMA_GFX_DOORBELL);
+   doorbell_offset = RREG32_SDMA(i, 
regSDMA_GFX_DOORBELL_OFFSET);
+
+   doorbell = REG_SET_FIELD(doorbell, SDMA_GFX_DOORBELL, 
ENABLE, 0);
+   doorbell_offset = REG_SET_FIELD(doorbell_offset,
+   SDMA_GFX_DOORBELL_OFFSET,
+   OFFSET, 0);
+   WREG32_SDMA(i, regSDMA_GFX_DOORBELL, doorbell);
+   WREG32_SDMA(i, regSDMA_GFX_DOORBELL_OFFSET, 
doorbell_offset);
+   }
}
 }

@@ -631,12 +644,6 @@ static void sdma_v4_4_2_gfx_resume(struct amdgpu_device 
*adev, unsigned int i)
rb_cntl = sdma_v4_4_2_rb_cntl(ring, rb_cntl);
WREG32_SDMA(i, regSDMA_GFX_RB_CNTL, rb_cntl);

-   /* Initialize the ring buffer's read and write pointers */
-   WREG32_SDMA(i, regSDMA_GFX_RB_RPTR, 0);
-   WREG32_SDMA(i, regSDMA_GFX_RB_RPTR_HI, 0);
-   WREG32_SDMA(i, regSDMA_GFX_RB_WPTR, 0);
-   WREG32_SDMA(i, regSDMA_GFX_RB_WPTR_HI, 0);
-
/* set the wb address whether it's enabled or not */
WREG32_SDMA(i, regSDMA_GFX_RB_RPTR_ADDR_HI,
   upper_32_bits(adev->wb.gpu_addr + wb_offset) & 0x); @@ 
-654,6 +661,12 @@ static void sdma_v4_4_2_gfx_resume(struct amdgpu_device 
*adev, unsigned int i)
/* before programing wptr to a less value, need set minor_ptr_update 
first */
WREG32_SDMA(i, regSDMA_GFX_MINOR_PTR_UPDATE, 1);

+   /* Initialize the ring buffer's read and write pointers */
+   WREG32_SDMA(i, regSDMA_GFX_RB_RPTR, 0);
+   WREG32_SDMA(i, regSDMA_GFX_RB_RPTR_HI, 0);
+   WREG32_SDMA(i, regSDMA_GFX_RB_WPTR, 0);
+   WREG32_SDMA(i, regSDMA_GFX_RB_WPTR_HI, 0);
+
doorbell = RREG32_SDMA(i, regSDMA_GFX_DOORBELL);
doorbell_offset = RREG32_SDMA(i, regSDMA_GFX_DOORBELL_OFFSET);

--
2.25.1

RE: [PATCH] drm/amdgpu: Change golden settings for GFX v9.4.3

2023-07-04 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal 

-Original Message-
From: Lazar, Lijo 
Sent: Tuesday, July 4, 2023 9:09 AM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad ; Ma, Le 
; Gadre, Mangesh 
Subject: [PATCH] drm/amdgpu: Change golden settings for GFX v9.4.3

Change the settings applicable for A0. GRBM_MCM_ADDR setting will be applied by 
firmware.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index c1e3625ad136..51532d0dd7a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -195,14 +195,11 @@ static void gfx_v9_4_3_init_golden_registers(struct 
amdgpu_device *adev)
num_xcc = NUM_XCC(adev->gfx.xcc_mask);
for (i = 0; i < num_xcc; i++) {
dev_inst = GET_INST(GC, i);
-   if (dev_inst >= 2)
-   WREG32_SOC15(GC, dev_inst, regGRBM_MCM_ADDR, 0x4);

+   WREG32_SOC15(GC, dev_inst, regGB_ADDR_CONFIG,
+GOLDEN_GB_ADDR_CONFIG);
/* Golden settings applied by driver for ASIC with rev_id 0 */
if (adev->rev_id == 0) {
-   WREG32_SOC15(GC, dev_inst, regGB_ADDR_CONFIG,
-GOLDEN_GB_ADDR_CONFIG);
-
WREG32_FIELD15_PREREG(GC, dev_inst, TCP_UTCL1_CNTL1,
  REDUCE_FIFO_DEPTH_BY_2, 2);
}
--
2.25.1

RE: [PATCH] drm/amdgpu: Remove redundant GFX v9.4.3 sequence

2023-07-05 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal 

-Original Message-
From: Ma, Le 
Sent: Wednesday, July 5, 2023 11:52 AM
To: Lazar, Lijo ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad ; Gadre, Mangesh 

Subject: RE: [PATCH] drm/amdgpu: Remove redundant GFX v9.4.3 sequence

[AMD Official Use Only - General]

Reviewed-by: Le Ma 

> -Original Message-
> From: Lazar, Lijo 
> Sent: Wednesday, July 5, 2023 1:31 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking ; Deucher, Alexander
> ; Kamal, Asad ; Ma, Le
> ; Gadre, Mangesh 
> Subject: [PATCH] drm/amdgpu: Remove redundant GFX v9.4.3 sequence
>
> Programming of XCC id is already taken care with partition mode change.
>
> Signed-off-by: Lijo Lazar 
> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 29
> -
>  1 file changed, 29 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> index 51532d0dd7a7..548b1123f7c6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> @@ -1034,32 +1034,6 @@ static void
> gfx_v9_4_3_xcc_disable_gpa_mode(struct amdgpu_device *adev, int xcc_
>   WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCPC_PSP_DEBUG, data);
> }
>
> -static void gfx_v9_4_3_xcc_program_xcc_id(struct amdgpu_device *adev,
> -   int xcc_id)
> -{
> - uint32_t tmp = 0;
> - int num_xcc;
> -
> - num_xcc = NUM_XCC(adev->gfx.xcc_mask);
> - switch (num_xcc) {
> - /* directly config VIRTUAL_XCC_ID to 0 for 1-XCC */
> - case 1:
> - WREG32_SOC15(GC, GET_INST(GC, xcc_id),
> regCP_HYP_XCP_CTL, 0x8);
> - break;
> - case 2:
> - case 4:
> - case 6:
> - case 8:
> - tmp = (xcc_id % adev->gfx.num_xcc_per_xcp) <<
> REG_FIELD_SHIFT(CP_HYP_XCP_CTL, VIRTUAL_XCC_ID);
> - tmp = tmp | (adev->gfx.num_xcc_per_xcp <<
> REG_FIELD_SHIFT(CP_HYP_XCP_CTL, NUM_XCC_IN_XCP));
> - WREG32_SOC15(GC, GET_INST(GC, xcc_id),
> regCP_HYP_XCP_CTL, tmp);
> -
> - break;
> - default:
> - break;
> - }
> -}
> -
>  static bool gfx_v9_4_3_is_rlc_enabled(struct amdgpu_device *adev)  {
>   uint32_t rlc_setting;
> @@ -1917,9 +1891,6 @@ static int gfx_v9_4_3_xcc_cp_resume(struct
> amdgpu_device *adev, int xcc_id)
>   return r;
>   }
>
> - /* set the virtual and physical id based on partition_mode */
> - gfx_v9_4_3_xcc_program_xcc_id(adev, xcc_id);
> -
>   r = gfx_v9_4_3_xcc_kiq_resume(adev, xcc_id);
>   if (r)
>   return r;
> --
> 2.25.1

RE: [PATCH] drm/amdgpu: Restore HQD persistent state register

2023-07-25 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Zhang, Hawking 
Sent: Tuesday, July 25, 2023 11:23 AM
To: Lazar, Lijo ; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Kamal, Asad 

Subject: RE: [PATCH] drm/amdgpu: Restore HQD persistent state register

[AMD Official Use Only - General]

Reviewed-by: Hawking Zhang 

Regards,
Hawking

-Original Message-
From: Lazar, Lijo 
Sent: Tuesday, July 25, 2023 13:46
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH] drm/amdgpu: Restore HQD persistent state register

On GFX v9.4.3, compute queue MQD is populated using the values in HQD 
persistent state register. Hence don't clear the values on module unload, 
instead restore it to the default reset value so that MQD is initialized 
correctly during next module load. In particular, preload flag needs to be set 
on compute queue MQD, otherwise it could cause uninitialized values being used 
at device reset state resulting in EDC.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index 8b361aa87d01..306dc6533397 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -46,6 +46,7 @@ MODULE_FIRMWARE("amdgpu/gc_9_4_3_rlc.bin");
 #define RLCG_UCODE_LOADING_START_ADDRESS 0x2000L

 #define GOLDEN_GB_ADDR_CONFIG 0x2a114042
+#define CP_HQD_PERSISTENT_STATE_DEFAULT 0xbe05301

 struct amdgpu_gfx_ras gfx_v9_4_3_ras;

@@ -1726,7 +1727,7 @@ static int gfx_v9_4_3_xcc_q_fini_register(struct 
amdgpu_ring *ring,

WREG32_SOC15_RLC(GC, GET_INST(GC, xcc_id), regCP_HQD_IQ_TIMER, 0);
WREG32_SOC15_RLC(GC, GET_INST(GC, xcc_id), regCP_HQD_IB_CONTROL, 0);
-   WREG32_SOC15_RLC(GC, GET_INST(GC, xcc_id), regCP_HQD_PERSISTENT_STATE, 
0);
+   WREG32_SOC15_RLC(GC, GET_INST(GC, xcc_id),
+regCP_HQD_PERSISTENT_STATE, CP_HQD_PERSISTENT_STATE_DEFAULT);
WREG32_SOC15_RLC(GC, GET_INST(GC, xcc_id), 
regCP_HQD_PQ_DOORBELL_CONTROL, 0x4000);
WREG32_SOC15_RLC(GC, GET_INST(GC, xcc_id), 
regCP_HQD_PQ_DOORBELL_CONTROL, 0);
WREG32_SOC15_RLC(GC, GET_INST(GC, xcc_id), regCP_HQD_PQ_RPTR, 0);
--
2.25.1

RE: [PATCH] drm/amdgpu: Keep reset handlers shared

2023-08-16 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal 
Tested-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Ma, Le 
Sent: Wednesday, August 16, 2023 4:17 PM
To: Lazar, Lijo ; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Kamal, Asad 
; Zhang, Hawking 
Subject: RE: [PATCH] drm/amdgpu: Keep reset handlers shared

[AMD Official Use Only - General]

Reviewed-by: Le Ma 

> -Original Message-
> From: amd-gfx  On Behalf Of
> Lazar, Lijo
> Sent: Wednesday, August 16, 2023 1:38 PM
> To: Lazar, Lijo ; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander ; Kamal, Asad
> ; Zhang, Hawking 
> Subject: RE: [PATCH] drm/amdgpu: Keep reset handlers shared
>
> [AMD Official Use Only - General]
>
> [AMD Official Use Only - General]
>
> 
>
> Thanks,
> Lijo
>
> -Original Message-
> From: amd-gfx  On Behalf Of
> Lijo Lazar
> Sent: Thursday, August 10, 2023 5:14 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander ; Kamal, Asad
> ; Zhang, Hawking 
> Subject: [PATCH] drm/amdgpu: Keep reset handlers shared
>
> Instead of maintaining a list per device, keep the reset handlers
> common per ASIC family. A pointer to the list of handlers is maintained in 
> reset control.
>
> Signed-off-by: Lijo Lazar 
> ---
>  drivers/gpu/drm/amd/amdgpu/aldebaran.c  | 19 +++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c   |  8 
>  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h   | 16 
>  drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c | 20 +++-
>  drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c   | 19 +++
>  5 files changed, 45 insertions(+), 37 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> index 2b97b8a96fb4..82e1c83a7ccc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> +++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> @@ -48,20 +48,19 @@ aldebaran_get_reset_handler(struct
> amdgpu_reset_control *reset_ctl,  {
> struct amdgpu_reset_handler *handler;
> struct amdgpu_device *adev = (struct amdgpu_device
> *)reset_ctl->handle;
> +   int i;
>
> if (reset_context->method != AMD_RESET_METHOD_NONE) {
> dev_dbg(adev->dev, "Getting reset handler for method %d\n",
> reset_context->method);
> -   list_for_each_entry(handler, &reset_ctl->reset_handlers,
> -handler_list) {
> +   for_each_handler(i, handler, reset_ctl) {
> if (handler->reset_method == reset_context->method)
> return handler;
> }
> }
>
> if (aldebaran_is_mode2_default(reset_ctl)) {
> -   list_for_each_entry(handler, &reset_ctl->reset_handlers,
> -handler_list) {
> +   for_each_handler(i, handler, reset_ctl) {
> if (handler->reset_method == AMD_RESET_METHOD_MODE2) {
> reset_context->method = 
> AMD_RESET_METHOD_MODE2;
> return handler; @@ -124,9 +123,9 @@
> static void aldebaran_async_reset(struct work_struct *work)
> struct amdgpu_reset_control *reset_ctl =
> container_of(work, struct amdgpu_reset_control, reset_work);
> struct amdgpu_device *adev = (struct amdgpu_device
> *)reset_ctl->handle;
> +   int i;
>
> -   list_for_each_entry(handler, &reset_ctl->reset_handlers,
> -handler_list) {
> +   for_each_handler(i, handler, reset_ctl) {
> if (handler->reset_method == reset_ctl->active_reset) {
> dev_dbg(adev->dev, "Resetting device\n");
> handler->do_reset(adev); @@ -395,6 +394,11 @@
> static struct amdgpu_reset_handler aldebaran_mode2_handler = {
> .do_reset   = aldebaran_mode2_reset,
>  };
>
> +static struct amdgpu_reset_handler
> +   *aldebaran_rst_handlers[AMDGPU_RESET_MAX_HANDLERS] = {
> +   &aldebaran_mode2_handler,
> +   };
> +
>  int aldebaran_reset_init(struct amdgpu_device *adev)  {
> struct amdgpu_reset_control *reset_ctl; @@ -408,10 +412,9 @@
> int aldebaran_reset_init(struct amdgpu_device *adev)
> reset_ctl->active_reset = AMD_RESET_METHOD_NONE;
> reset_ctl->get_reset_handler = aldebaran_get_reset_handler;
>
> -   INIT_LIST_HEAD(&reset_ctl->reset_handlers);
> INIT_WORK(&reset_ctl->reset_work, reset_ctl->async_reset)

RE: [PATCH] drm/amdgpu: Fix the return for gpu mode1_reset

2023-08-19 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Hawking Zhang 
Sent: Saturday, August 19, 2023 12:11 PM
To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Lazar, Lijo 
; Deucher, Alexander ; Kamal, 
Asad 
Cc: Zhang, Hawking 
Subject: [PATCH] drm/amdgpu: Fix the return for gpu mode1_reset

amdgpu_device_mode1_reset reports that the GPU mode1 reset succeeded (ret = 0) 
as long as the wait_for_bootloader call succeeds, regardless of the status 
reported by the smu or psp firmware. As a result, the driver continues executing 
recovery even when smu or psp fails to perform the mode1 reset.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 5586146b8c76..533daba2accb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4701,12 +4701,12 @@ int amdgpu_device_mode1_reset(struct amdgpu_device 
*adev)
}

if (ret)
-   dev_err(adev->dev, "GPU mode1 reset failed\n");
+   goto mode1_reset_failed;

amdgpu_device_load_pci_state(adev->pdev);
ret = amdgpu_psp_wait_for_bootloader(adev);
if (ret)
-   return ret;
+   goto mode1_reset_failed;

/* wait for asic to come out of reset */
for (i = 0; i < adev->usec_timeout; i++) { @@ -4717,8 +4717,17 @@ int 
amdgpu_device_mode1_reset(struct amdgpu_device *adev)
udelay(1);
}

+   if (i >= adev->usec_timeout) {
+   ret = -ETIMEDOUT;
+   goto mode1_reset_failed;
+   }
+
amdgpu_atombios_scratch_regs_engine_hung(adev, false);

+   return 0;
+
+mode1_reset_failed:
+   dev_err(adev->dev, "GPU mode1 reset failed\n");
return ret;
 }

--
2.17.1

RE: [PATCH] drm/amd/pm: Fixes incorrect type in 'amdgpu_hwmon_show_power_avg() & _input()'

2023-08-20 Thread Kamal, Asad

[AMD Official Use Only - General]

-Original Message-
From: amd-gfx  On Behalf Of Srinivasan 
Shanmugam
Sent: Monday, August 21, 2023 11:36 AM
To: Koenig, Christian ; Deucher, Alexander 
; Chen, Guchun 
Cc: Pan, Xinhui ; SHANMUGAM, SRINIVASAN 
; amd-gfx@lists.freedesktop.org
Subject: [PATCH] drm/amd/pm: Fixes incorrect type in 
'amdgpu_hwmon_show_power_avg() & _input()'

val is defined as unsigned int, so the check if (val < 0) can never be true; 
hence change its type to ssize_t.

Fixes the below:

drivers/gpu/drm/amd/pm/amdgpu_pm.c:2800:5-8: WARNING: Unsigned expression 
compared with zero: val < 0
drivers/gpu/drm/amd/pm/amdgpu_pm.c:2813:5-8: WARNING: Unsigned expression 
compared with zero: val < 0

Cc: Guchun Chen 
Cc: Christian König 
Cc: Alex Deucher 
Cc: "Pan, Xinhui" 
Signed-off-by: Srinivasan Shanmugam 
---
 drivers/gpu/drm/amd/pm/amdgpu_pm.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index f03647fa3df6..cdc28e4da0c9 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -2794,26 +2794,26 @@ static ssize_t amdgpu_hwmon_show_power_avg(struct 
device *dev,
   struct device_attribute *attr,
   char *buf)
 {
-   unsigned int val;
+   ssize_t val;

val = amdgpu_hwmon_get_power(dev, AMDGPU_PP_SENSOR_GPU_AVG_POWER);

[Kamal, Asad] amdgpu_hwmon_get_power() returns an unsigned int value. 
Shouldn't we change the return type of amdgpu_hwmon_get_power()?
if (val < 0)
return val;

-   return sysfs_emit(buf, "%u\n", val);
+   return sysfs_emit(buf, "%zd\n", val);
 }

 static ssize_t amdgpu_hwmon_show_power_input(struct device *dev,
 struct device_attribute *attr,
 char *buf)
 {
-   unsigned int val;
+   ssize_t val;

val = amdgpu_hwmon_get_power(dev, AMDGPU_PP_SENSOR_GPU_INPUT_POWER);
if (val < 0)
return val;

-   return sysfs_emit(buf, "%u\n", val);
+   return sysfs_emit(buf, "%zd\n", val);
 }

 static ssize_t amdgpu_hwmon_show_power_cap_min(struct device *dev,
--
2.25.1

<>

RE: [PATCH] drm/amd/pm: Fixes incorrect type in 'amdgpu_hwmon_show_power_avg() & _input()'

2023-08-21 Thread Kamal, Asad

[AMD Official Use Only - General]

With the following patch in place
https://patchwork.freedesktop.org/patch/553433/?series=122640&rev=1

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: SHANMUGAM, SRINIVASAN 
Sent: Monday, August 21, 2023 12:06 PM
To: Kamal, Asad ; SHANMUGAM, SRINIVASAN 
; Koenig, Christian ; 
Deucher, Alexander ; Chen, Guchun 

Cc: Pan, Xinhui ; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amd/pm: Fixes incorrect type in 
'amdgpu_hwmon_show_power_avg() & _input()'


On 8/21/2023 11:59 AM, Kamal, Asad wrote:
> [AMD Official Use Only - General]
>
> -Original Message-
> From: amd-gfx  On Behalf Of
> Srinivasan Shanmugam
> Sent: Monday, August 21, 2023 11:36 AM
> To: Koenig, Christian ; Deucher, Alexander
> ; Chen, Guchun 
> Cc: Pan, Xinhui ; SHANMUGAM, SRINIVASAN
> ; amd-gfx@lists.freedesktop.org
> Subject: [PATCH] drm/amd/pm: Fixes incorrect type in 
> 'amdgpu_hwmon_show_power_avg() & _input()'
>
> The val is defined as unsigned int type, if(val<0) is invalid, hence
> modified its type to ssize_t
>
> Fixes the below:
>
> drivers/gpu/drm/amd/pm/amdgpu_pm.c:2800:5-8: WARNING: Unsigned
> expression compared with zero: val < 0
> drivers/gpu/drm/amd/pm/amdgpu_pm.c:2813:5-8: WARNING: Unsigned
> expression compared with zero: val < 0
>
> Cc: Guchun Chen 
> Cc: Christian König 
> Cc: Alex Deucher 
> Cc: "Pan, Xinhui" 
> Signed-off-by: Srinivasan Shanmugam 
> ---
>   drivers/gpu/drm/amd/pm/amdgpu_pm.c | 8 
>   1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
> b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
> index f03647fa3df6..cdc28e4da0c9 100644
> --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
> +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
> @@ -2794,26 +2794,26 @@ static ssize_t amdgpu_hwmon_show_power_avg(struct 
> device *dev,
> struct device_attribute *attr,
>     char *buf)
>   {
> -   unsigned int val;
> +   ssize_t val;
>
>  val = amdgpu_hwmon_get_power(dev,
> AMDGPU_PP_SENSOR_GPU_AVG_POWER);
>
> [Kamal, Asad] amdgpu_hwmon_get_power() returns an unsigned int value. 
> Shouldn't we change the return type of amdgpu_hwmon_get_power()?
Thanks! It looks like a fix for 'amdgpu_hwmon_get_power' has already been 
posted here:
https://patchwork.freedesktop.org/patch/553433/?series=122640&rev=1
> if (val < 0)
>  return val;
>
> -   return sysfs_emit(buf, "%u\n", val);
> +   return sysfs_emit(buf, "%zd\n", val);
>   }
>
>   static ssize_t amdgpu_hwmon_show_power_input(struct device *dev,
>   struct device_attribute *attr,
>   char *buf)
>   {
> -   unsigned int val;
> +   ssize_t val;
>
>  val = amdgpu_hwmon_get_power(dev, AMDGPU_PP_SENSOR_GPU_INPUT_POWER);
>  if (val < 0)
>  return val;
>
> -   return sysfs_emit(buf, "%u\n", val);
> +   return sysfs_emit(buf, "%zd\n", val);
>   }
>
>   static ssize_t amdgpu_hwmon_show_power_cap_min(struct device *dev,
> --
> 2.25.1
>

RE: [PATCH] drm/amdgpu: Add only valid firmware version nodes

2023-08-25 Thread Kamal, Asad

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal asad.ka...@amd.com

Thanks & Regards
Asad

From: amd-gfx  On Behalf Of Wang, 
Yang(Kevin)
Sent: Friday, August 25, 2023 4:46 PM
To: amd-gfx@lists.freedesktop.org; Lazar, Lijo 
Cc: Deucher, Alexander ; Zhang, Hawking 

Subject: Re: [PATCH] drm/amdgpu: Add only valid firmware version nodes


[AMD Official Use Only - General]


[AMD Official Use Only - General]

Reviewed-by: Yang Wang <kevinyang.w...@amd.com>

Best Regards,
Kevin

发件人: amd-gfx <amd-gfx-boun...@lists.freedesktop.org>
 代表 Lijo Lazar <lijo.la...@amd.com>
发送时间: 星期五, 八月 25, 2023 17:29
收件人: amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
抄送: Deucher, Alexander <alexander.deuc...@amd.com>; Zhang, Hawking 
<hawking.zh...@amd.com>
主题: [PATCH] drm/amdgpu: Add only valid firmware version nodes

Show only firmware version attributes that have valid version. Hide
others.

Signed-off-by: Lijo Lazar <lijo.la...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c | 33 ---
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
index 8beefc045e14..b0b37c056c36 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
@@ -669,15 +669,25 @@ const char *amdgpu_ucode_name(enum AMDGPU_UCODE_ID 
ucode_id)
 }
 }

+static inline int amdgpu_ucode_is_valid(uint32_t fw_version)
+{
+   if (!fw_version)
+   return -EINVAL;
+
+   return 0;
+}
+
 #define FW_VERSION_ATTR(name, mode, field)  \
 static ssize_t show_##name(struct device *dev,  \
- struct device_attribute *attr, \
- char *buf) \
+  struct device_attribute *attr, char *buf) \
 {   \
 struct drm_device *ddev = dev_get_drvdata(dev); \
 struct amdgpu_device *adev = drm_to_adev(ddev); \
 \
-   return sysfs_emit(buf, "0x%08x\n", adev->field);\
+   if (!buf)   \
+   return amdgpu_ucode_is_valid(adev->field);  \
+   \
+   return sysfs_emit(buf, "0x%08x\n", adev->field);\
 }   \
 static DEVICE_ATTR(name, mode, show_##name, NULL)

@@ -722,9 +732,24 @@ static struct attribute *fw_attrs[] = {
 NULL
 };

+#define to_dev_attr(x) container_of(x, struct device_attribute, attr)
+
+static umode_t amdgpu_ucode_sys_visible(struct kobject *kobj,
+   struct attribute *attr, int idx)
+{
+   struct device_attribute *dev_attr = to_dev_attr(attr);
+   struct device *dev = kobj_to_dev(kobj);
+
+   if (dev_attr->show(dev, dev_attr, NULL) == -EINVAL)
+   return 0;
+
+   return attr->mode;
+}
+
 static const struct attribute_group fw_attr_group = {
 .name = "fw_version",
-   .attrs = fw_attrs
+   .attrs = fw_attrs,
+   .is_visible = amdgpu_ucode_sys_visible
 };

 int amdgpu_ucode_sysfs_init(struct amdgpu_device *adev)
--
2.25.1

RE: [PATCH 1/2] drm/amd/pm: Update SMUv13.0.6 PMFW headers

2024-10-08 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Ping

-Original Message-
From: Kamal, Asad 
Sent: Friday, October 4, 2024 8:31 PM
To: amd-gfx@lists.freedesktop.org; Lazar, Lijo 
Cc: Ma, Le ; Zhang, Hawking ; Zhang, 
Morris ; Kamal, Asad ; Poag, Charis 
; Cheung, Donald ; Khatir, Sepehr 
; Oliveira, Daniel 
Subject: [PATCH 1/2] drm/amd/pm: Update SMUv13.0.6 PMFW headers

Update pmfw headers for smuv13.0.6 to version 0xE

Signed-off-by: Asad Kamal 
---
 drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h 
b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h
index 822c6425d90e..0f96b8c59a0e 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h
@@ -123,7 +123,7 @@ typedef enum {
   VOLTAGE_GUARDBAND_COUNT
 } GFX_GUARDBAND_e;

-#define SMU_METRICS_TABLE_VERSION 0xD
+#define SMU_METRICS_TABLE_VERSION 0xE

 typedef struct __attribute__((packed, aligned(4))) {
   uint32_t AccumulationCounter;
@@ -231,6 +231,9 @@ typedef struct __attribute__((packed, aligned(4))) {
   // PER XCD ACTIVITY
   uint32_t GfxBusy[8];
   uint64_t GfxBusyAcc[8];
+
+  //PCIE BW Data and error count
+  uint32_t PCIeOtherEndRecoveryAcc;   // The Pcie counter itself is 
accumulated
 } MetricsTableX_t;

 typedef struct __attribute__((packed, aligned(4))) {
--
2.46.0

RE: [PATCH] drm/amdgpu: Use SPX as default in partition config

2024-10-14 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Monday, October 14, 2024 2:49 PM
To: amd-gfx@lists.freedesktop.org; Kamal, Asad 
Cc: Zhang, Hawking ; Deucher, Alexander 
; Zhou, Hao (Claire) 
Subject: [PATCH] drm/amdgpu: Use SPX as default in partition config

In certain cases - ex: when a reset is required on initialization - XCP manager 
won't have a valid partition mode. In such cases, use SPX as the default 
selected mode for which partition configuration details are populated.

Signed-off-by: Lijo Lazar 
Reported-by: Hao Zhou 

Fixes: c7de57033d9b ("drm/amdgpu: Add sysfs nodes to get xcp details")
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
index 111bf897e72e..83a16918ea76 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
@@ -606,7 +606,7 @@ void amdgpu_xcp_cfg_sysfs_init(struct amdgpu_device *adev)  
{
struct amdgpu_xcp_res_details *xcp_res;
struct amdgpu_xcp_cfg *xcp_cfg;
-   int i, r, j, rid;
+   int i, r, j, rid, mode;

if (!adev->xcp_mgr)
return;
@@ -625,11 +625,15 @@ void amdgpu_xcp_cfg_sysfs_init(struct amdgpu_device *adev)
if (r)
goto err1;

-   r = amdgpu_xcp_get_res_info(xcp_cfg->xcp_mgr, xcp_cfg->xcp_mgr->mode, 
xcp_cfg);
+   mode = (xcp_cfg->xcp_mgr->mode ==
+   AMDGPU_UNKNOWN_COMPUTE_PARTITION_MODE) ?
+  AMDGPU_SPX_PARTITION_MODE :
+  xcp_cfg->xcp_mgr->mode;
+   r = amdgpu_xcp_get_res_info(xcp_cfg->xcp_mgr, mode, xcp_cfg);
if (r)
goto err1;

-   xcp_cfg->mode = xcp_cfg->xcp_mgr->mode;
+   xcp_cfg->mode = mode;
for (i = 0; i < xcp_cfg->num_res; i++) {
xcp_res = &xcp_cfg->xcp_res[i];
rid = xcp_res->id;
--
2.25.1

RE: [PATCH] drm/amdgpu: Add compatible NPS mode info

2024-11-01 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: amd-gfx  On Behalf Of Lijo Lazar
Sent: Wednesday, October 30, 2024 2:00 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH] drm/amdgpu: Add compatible NPS mode info

Populate the compatible NPS modes also for providing partition configuration 
details through sysfs.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h|  1 +
 drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c | 11 +++
 2 files changed, 12 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
index 7ac89d78a5bf..b63f53242c57 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
@@ -77,6 +77,7 @@ struct amdgpu_xcp_cfg {
u8 num_res;
struct amdgpu_xcp_mgr *xcp_mgr;
struct kobject kobj;
+   u16 compatible_nps_modes;
 };

 struct amdgpu_xcp_ip_funcs {
diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c 
b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
index 890976b7ce77..fea0d2d0 100644
--- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
+++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
@@ -455,6 +455,7 @@ static int aqua_vanjaram_get_xcp_res_info(struct 
amdgpu_xcp_mgr *xcp_mgr,
int max_res[AMDGPU_XCP_RES_MAX] = {};
bool res_lt_xcp;
int num_xcp, i;
+   u16 nps_modes;

if (!(xcp_mgr->supp_xcp_modes & BIT(mode)))
return -EINVAL;
@@ -467,23 +468,33 @@ static int aqua_vanjaram_get_xcp_res_info(struct 
amdgpu_xcp_mgr *xcp_mgr,
switch (mode) {
case AMDGPU_SPX_PARTITION_MODE:
num_xcp = 1;
+   nps_modes = BIT(AMDGPU_NPS1_PARTITION_MODE);
break;
case AMDGPU_DPX_PARTITION_MODE:
num_xcp = 2;
+   nps_modes = BIT(AMDGPU_NPS1_PARTITION_MODE);
break;
case AMDGPU_TPX_PARTITION_MODE:
num_xcp = 3;
+   nps_modes = BIT(AMDGPU_NPS1_PARTITION_MODE) |
+   BIT(AMDGPU_NPS4_PARTITION_MODE);
break;
case AMDGPU_QPX_PARTITION_MODE:
num_xcp = 4;
+   nps_modes = BIT(AMDGPU_NPS1_PARTITION_MODE) |
+   BIT(AMDGPU_NPS4_PARTITION_MODE);
break;
case AMDGPU_CPX_PARTITION_MODE:
num_xcp = NUM_XCC(adev->gfx.xcc_mask);
+   nps_modes = BIT(AMDGPU_NPS1_PARTITION_MODE) |
+   BIT(AMDGPU_NPS4_PARTITION_MODE);
break;
default:
return -EINVAL;
}

+   xcp_cfg->compatible_nps_modes =
+   (adev->gmc.supported_nps_modes & nps_modes);
xcp_cfg->num_res = ARRAY_SIZE(max_res);

for (i = 0; i < xcp_cfg->num_res; i++) {
--
2.25.1

RE: [PATCH 1/3] drm/amd/pm: Add APIs for device access checks

2025-02-05 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Series is

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad


-Original Message-
From: amd-gfx  On Behalf Of Lijo Lazar
Sent: Tuesday, February 4, 2025 12:08 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Pelloux-Prayer, Pierre-Eric 
; Feng, Kenneth ; 
Limonciello, Mario 
Subject: [PATCH 1/3] drm/amd/pm: Add APIs for device access checks

Wrap the checks before device access in helper functions and use them for 
device access. The generic order of APIs now is to do input argument validation 
first and check if device access is allowed.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/amdgpu_pm.c | 616 +++--
 1 file changed, 229 insertions(+), 387 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index 0aca0803514e..0fe0b798f559 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -97,6 +97,77 @@ const char * const amdgpu_pp_profile_name[] = {
"UNCAPPED",
 };

+/**
+ * amdgpu_pm_dev_state_check - Check if device can be accessed.
+ * @adev: Target device.
+ *
+ * Checks the state of the @adev for access. Return 0 if the device is
+ * accessible or a negative error code otherwise.
+ */
+static int amdgpu_pm_dev_state_check(struct amdgpu_device *adev) {
+   if (amdgpu_in_reset(adev))
+   return -EPERM;
+   if (adev->in_suspend && !adev->in_runpm)
+   return -EPERM;
+
+   return 0;
+}
+
+/**
+ * amdgpu_pm_get_access - Check if device can be accessed, resume if needed.
+ * @adev: Target device.
+ *
+ * Checks the state of the @adev for access. Use runtime pm API to
+resume if
+ * needed. Return 0 if the device is accessible or a negative error
+code
+ * otherwise.
+ */
+static int amdgpu_pm_get_access(struct amdgpu_device *adev) {
+   int ret;
+
+   ret = amdgpu_pm_dev_state_check(adev);
+   if (ret)
+   return ret;
+
+   return pm_runtime_resume_and_get(adev->dev);
+}
+
+/**
+ * amdgpu_pm_get_access_if_active - Check if device is active for access.
+ * @adev: Target device.
+ *
+ * Checks the state of the @adev for access. Use runtime pm API to
+determine
+ * if device is active. Allow access only if device is active.Return 0
+if the
+ * device is accessible or a negative error code otherwise.
+ */
+static int amdgpu_pm_get_access_if_active(struct amdgpu_device *adev) {
+   int ret;
+
+   ret = amdgpu_pm_dev_state_check(adev);
+   if (ret)
+   return ret;
+
+   ret = pm_runtime_get_if_active(adev->dev);
+   if (ret <= 0)
+   return ret ?: -EPERM;
+
+   return 0;
+}
+
+/**
+ * amdgpu_pm_put_access - Put to auto suspend mode after a device access.
+ * @adev: Target device.
+ *
+ * Should be paired with amdgpu_pm_get_access* calls  */ static inline
+void amdgpu_pm_put_access(struct amdgpu_device *adev) {
+   pm_runtime_mark_last_busy(adev->dev);
+   pm_runtime_put_autosuspend(adev->dev);
+}
+
 /**
  * DOC: power_dpm_state
  *
@@ -140,18 +211,13 @@ static ssize_t amdgpu_get_power_dpm_state(struct device 
*dev,
enum amd_pm_state_type pm;
int ret;

-   if (amdgpu_in_reset(adev))
-   return -EPERM;
-   if (adev->in_suspend && !adev->in_runpm)
-   return -EPERM;
-
-   ret = pm_runtime_get_if_active(ddev->dev);
-   if (ret <= 0)
-   return ret ?: -EPERM;
+   ret = amdgpu_pm_get_access_if_active(adev);
+   if (ret)
+   return ret;

amdgpu_dpm_get_current_power_state(adev, &pm);

-   pm_runtime_put_autosuspend(ddev->dev);
+   amdgpu_pm_put_access(adev);

return sysfs_emit(buf, "%s\n",
  (pm == POWER_STATE_TYPE_BATTERY) ? "battery" :
@@ -168,11 +234,6 @@ static ssize_t amdgpu_set_power_dpm_state(struct device 
*dev,
enum amd_pm_state_type  state;
int ret;

-   if (amdgpu_in_reset(adev))
-   return -EPERM;
-   if (adev->in_suspend && !adev->in_runpm)
-   return -EPERM;
-
if (strncmp("battery", buf, strlen("battery")) == 0)
state = POWER_STATE_TYPE_BATTERY;
else if (strncmp("balanced", buf, strlen("balanced")) == 0) @@ -182,14 
+243,13 @@ static ssize_t amdgpu_set_power_dpm_state(struct device *dev,
else
return -EINVAL;

-   ret = pm_runtime_resume_and_get(ddev->dev);
+   ret = amdgpu_pm_get_access(adev);
if (ret < 0)
return ret;

amdgpu_dpm_set_power_state(adev, state);

-   pm_runtime_mark_last_busy(ddev->dev);
-   pm_runtime_put_autosuspend(ddev->dev);
+   amdgpu_pm_put_access(adev);

return count;
 }
@@ -263,18 +323,13 @@ static ssize_t 
amdgpu_get_power_dpm_force_performance_level(struct device *dev,
enum amd_dpm_forced_level level = 0xff;
int ret;

-   if (amdgpu_in

RE: [PATCH] drm/amdgpu: Refine ip detection log message

2024-12-17 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Tuesday, December 17, 2024 12:20 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH] drm/amdgpu: Refine ip detection log message

'add ip block' causes confusion if the blocks are disabled later with 
ip_block_mask. Change it to 'detected' instead, and also add device context.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 0a121aab5c74..182b6288df9b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2365,8 +2365,8 @@ int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
break;
}

-   DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
- ip_block_version->funcs->name);
+   dev_info(adev->dev, "detected ip block number %d <%s>\n",
+adev->num_ip_blocks, ip_block_version->funcs->name);

adev->ip_blocks[adev->num_ip_blocks].adev = adev;

--
2.25.1

RE: [PATCH] drm/amdgpu: Increase FRU File Id buffer size

2024-12-03 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Wednesday, December 4, 2024 9:52 AM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH] drm/amdgpu: Increase FRU File Id buffer size

Some boards use longer File Ids.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.h
index bc58dca18035..98f3196599ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.h
@@ -32,7 +32,7 @@ struct amdgpu_fru_info {
charproduct_name[AMDGPU_PRODUCT_NAME_LEN];
charserial[20];
charmanufacturer_name[32];
-   charfru_id[32];
+   charfru_id[50];
 };

 int amdgpu_fru_get_product_info(struct amdgpu_device *adev);
--
2.25.1

RE: [PATCH v4] drm/amd/pm: Fix smu v13.0.6 caps initialization

2025-01-20 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Tuesday, January 21, 2025 1:12 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad ; Wang, 
Yang(Kevin) ; Deucher, Alexander 

Subject: [PATCH v4] drm/amd/pm: Fix smu v13.0.6 caps initialization

Fix the initialization and usage of SMU v13.0.6 capability values. Use 
caps_set/clear functions to set/clear capability.

Also, fix SET_UCLK_MAX capability on APUs, it is supported on APUs.

Signed-off-by: Lijo Lazar 
Reviewed-by: Alex Deucher 
Reviewed-by: Yang Wang 

Fixes: 9bb53d2ce109 ("drm/amd/pm: Add capability flags for SMU v13.0.6")
---
v1: ("drm/amd/pm: Use correct macros for smu caps")
v2:
Use caps_set/clear instead of macros (Alex). Commit message changed.
Use BIT_ULL (Kevin)
Fix SET_UCLK_MAX capability on APUs
v3:
Rename to cap to indicate operations on single capability (Alex)
Use SMU_CAP in enum value definition also for consistency
v4:
Rebase on top of the new checks for SDMA RESET cap.
Add SMU v13.0.12 initial caps values.
Keep CTF_LIMIT cap check common for SMU v13.0.6 (Asad)

 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 239 ++
 1 file changed, 134 insertions(+), 105 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index f8821783a099..fa11e30bff24 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -101,26 +101,25 @@ MODULE_FIRMWARE("amdgpu/smu_13_0_14.bin");
 #define MCA_BANK_IPID(_ip, _hwid, _type) \
[AMDGPU_MCA_IP_##_ip] = { .hwid = _hwid, .mcatype = _type, }

+#define SMU_CAP(x) SMU_13_0_6_CAPS_##x
+
 enum smu_v13_0_6_caps {
-   SMU_13_0_6_CAPS_DPM,
-   SMU_13_0_6_CAPS_UNI_METRICS,
-   SMU_13_0_6_CAPS_DPM_POLICY,
-   SMU_13_0_6_CAPS_OTHER_END_METRICS,
-   SMU_13_0_6_CAPS_SET_UCLK_MAX,
-   SMU_13_0_6_CAPS_PCIE_METRICS,
-   SMU_13_0_6_CAPS_HST_LIMIT_METRICS,
-   SMU_13_0_6_CAPS_MCA_DEBUG_MODE,
-   SMU_13_0_6_CAPS_PER_INST_METRICS,
-   SMU_13_0_6_CAPS_CTF_LIMIT,
-   SMU_13_0_6_CAPS_RMA_MSG,
-   SMU_13_0_6_CAPS_ACA_SYND,
-   SMU_13_0_6_CAPS_SDMA_RESET,
-   SMU_13_0_6_CAPS_ALL,
+   SMU_CAP(DPM),
+   SMU_CAP(UNI_METRICS),
+   SMU_CAP(DPM_POLICY),
+   SMU_CAP(OTHER_END_METRICS),
+   SMU_CAP(SET_UCLK_MAX),
+   SMU_CAP(PCIE_METRICS),
+   SMU_CAP(HST_LIMIT_METRICS),
+   SMU_CAP(MCA_DEBUG_MODE),
+   SMU_CAP(PER_INST_METRICS),
+   SMU_CAP(CTF_LIMIT),
+   SMU_CAP(RMA_MSG),
+   SMU_CAP(ACA_SYND),
+   SMU_CAP(SDMA_RESET),
+   SMU_CAP(ALL),
 };

-#define SMU_CAPS_MASK(x) (ULL(1) << x)
-#define SMU_CAPS(x) SMU_CAPS_MASK(SMU_13_0_6_CAPS_##x)
-
 struct mca_bank_ipid {
enum amdgpu_mca_ip ip;
uint16_t hwid;
@@ -283,100 +282,143 @@ struct smu_v13_0_6_dpm_map {
uint32_t *freq_table;
 };

-static void smu_v13_0_14_init_caps(struct smu_context *smu)
+static inline void smu_v13_0_6_cap_set(struct smu_context *smu,
+  enum smu_v13_0_6_caps cap)
+{
+   struct smu_13_0_dpm_context *dpm_context = smu->smu_dpm.dpm_context;
+
+   dpm_context->caps |= BIT_ULL(cap);
+}
+
+static inline void smu_v13_0_6_cap_clear(struct smu_context *smu,
+enum smu_v13_0_6_caps cap)
 {
struct smu_13_0_dpm_context *dpm_context = smu->smu_dpm.dpm_context;
-   uint64_t caps = SMU_CAPS(DPM) | SMU_CAPS(UNI_METRICS) |
-   SMU_CAPS(SET_UCLK_MAX) | SMU_CAPS(DPM_POLICY) |
-   SMU_CAPS(PCIE_METRICS) | SMU_CAPS(CTF_LIMIT) |
-   SMU_CAPS(MCA_DEBUG_MODE) | SMU_CAPS(RMA_MSG) |
-   SMU_CAPS(ACA_SYND);
+
+   dpm_context->caps &= ~BIT_ULL(cap);
+}
+
+static inline bool smu_v13_0_6_cap_supported(struct smu_context *smu,
+enum smu_v13_0_6_caps cap)
+{
+   struct smu_13_0_dpm_context *dpm_context = smu->smu_dpm.dpm_context;
+
+   return !!(dpm_context->caps & BIT_ULL(cap)); }
+
+static void smu_v13_0_14_init_caps(struct smu_context *smu) {
+   enum smu_v13_0_6_caps default_cap_list[] = { SMU_CAP(DPM),
+SMU_CAP(UNI_METRICS),
+SMU_CAP(SET_UCLK_MAX),
+SMU_CAP(DPM_POLICY),
+SMU_CAP(PCIE_METRICS),
+SMU_CAP(CTF_LIMIT),
+SMU_CAP(MCA_DEBUG_MODE),
+

RE: [PATCH 1/2] drm/amdgpu: Clean up GFX v9.4.3 IP version checks

2025-01-28 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Series is

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Tuesday, January 28, 2025 1:22 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad ; Ma, Le 

Subject: [PATCH 1/2] drm/amdgpu: Clean up GFX v9.4.3 IP version checks

Remove unnecessary IP version checks for GFX 9.4.3 and similar variants.
Wrap checks inside meaningful function.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c  | 68 ++--  
drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c | 22 
 2 files changed, 29 insertions(+), 61 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index 2ba185875baa..f4635fc8a7ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -942,21 +942,12 @@ static int gfx_v9_4_3_gpu_early_init(struct amdgpu_device 
*adev)
adev->gfx.funcs = &gfx_v9_4_3_gfx_funcs;
adev->gfx.ras = &gfx_v9_4_3_ras;

-   switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
-   case IP_VERSION(9, 4, 3):
-   case IP_VERSION(9, 4, 4):
-   case IP_VERSION(9, 5, 0):
-   adev->gfx.config.max_hw_contexts = 8;
-   adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
-   adev->gfx.config.sc_prim_fifo_size_backend = 0x100;
-   adev->gfx.config.sc_hiz_tile_fifo_size = 0x30;
-   adev->gfx.config.sc_earlyz_tile_fifo_size = 0x4C0;
-   gb_addr_config = RREG32_SOC15(GC, GET_INST(GC, 0), 
regGB_ADDR_CONFIG);
-   break;
-   default:
-   BUG();
-   break;
-   }
+   adev->gfx.config.max_hw_contexts = 8;
+   adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
+   adev->gfx.config.sc_prim_fifo_size_backend = 0x100;
+   adev->gfx.config.sc_hiz_tile_fifo_size = 0x30;
+   adev->gfx.config.sc_earlyz_tile_fifo_size = 0x4C0;
+   gb_addr_config = RREG32_SOC15(GC, GET_INST(GC, 0), regGB_ADDR_CONFIG);

adev->gfx.config.gb_addr_config = gb_addr_config;

@@ -2795,16 +2786,10 @@ static int gfx_v9_4_3_set_clockgating_state(struct 
amdgpu_ip_block *ip_block,
return 0;

num_xcc = NUM_XCC(adev->gfx.xcc_mask);
-   switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
-   case IP_VERSION(9, 4, 3):
-   case IP_VERSION(9, 4, 4):
-   for (i = 0; i < num_xcc; i++)
-   gfx_v9_4_3_xcc_update_gfx_clock_gating(
-   adev, state == AMD_CG_STATE_GATE, i);
-   break;
-   default:
-   break;
-   }
+   for (i = 0; i < num_xcc; i++)
+   gfx_v9_4_3_xcc_update_gfx_clock_gating(
+   adev, state == AMD_CG_STATE_GATE, i);
+
return 0;
 }

@@ -4867,34 +4852,13 @@ static void gfx_v9_4_3_set_rlc_funcs(struct 
amdgpu_device *adev)

 static void gfx_v9_4_3_set_gds_init(struct amdgpu_device *adev)  {
-   /* init asci gds info */
-   switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
-   case IP_VERSION(9, 4, 3):
-   case IP_VERSION(9, 4, 4):
-   case IP_VERSION(9, 5, 0):
-   /* 9.4.3 removed all the GDS internal memory,
-* only support GWS opcode in kernel, like barrier
-* semaphore.etc */
-   adev->gds.gds_size = 0;
-   break;
-   default:
-   adev->gds.gds_size = 0x1;
-   break;
-   }
-
-   switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
-   case IP_VERSION(9, 4, 3):
-   case IP_VERSION(9, 4, 4):
-   case IP_VERSION(9, 5, 0):
-   /* deprecated for 9.4.3, no usage at all */
-   adev->gds.gds_compute_max_wave_id = 0;
-   break;
-   default:
-   /* this really depends on the chip */
-   adev->gds.gds_compute_max_wave_id = 0x7ff;
-   break;
-   }
+   /* 9.4.3 variants removed all the GDS internal memory,
+* only support GWS opcode in kernel, like barrier
+* semaphore.etc */

+   /* init asic gds info */
+   adev->gds.gds_size = 0;
+   adev->gds.gds_compute_max_wave_id = 0;
adev->gds.gws_size = 64;
adev->gds.oa_size = 16;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c 
b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
index 5470cef7e9bd..cb25f7f0dfc1 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
@@ -313,6 +313,16 @@ gfxhub_v1_2_xcc_disable_identity_aperture(struct 
amdgpu_device *adev,
}
 }

+static inline bool
+gfxhub_v1_2_per_process_xnack_support(struct amdgpu_device *adev) {
+   /*
+* TODO: Check if this function is really needed, so far only 9.4.3
+* va

RE: [PATCH] drm/amd/pm: Use one level table if dpm not enabled

2025-01-30 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Thursday, January 30, 2025 2:50 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH] drm/amd/pm: Use one level table if dpm not enabled

For SMU v13.0.6 variants, if dpm is disabled for a clock, fill current 
frequency as the only level in frequency table. Also, drop Lclk table as it is 
not used.

Signed-off-by: Lijo Lazar 
---
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 33 +++
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index de533076e157..7f0b4cc1141b 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -279,6 +279,20 @@ struct PPTable_t {
   sizeof(MetricsTableV1_t),\
   sizeof(MetricsTableV2_t)))

+#define METRICS_CURR_FREQ_F0(clk) \
+   SMUQ10_ROUND(GET_METRIC_FIELD(clk##Frequency, version)) #define
+METRICS_CURR_FREQ_F1(clk) \
+   SMUQ10_ROUND(GET_METRIC_FIELD(clk##Frequency, version)[0])
+
+#define INIT_FREQ_TABLE(type, clk, levels, f)   \
+   if (!smu_cmn_clk_dpm_is_enabled(smu, type)) \
+   pptable->clk##FrequencyTable[0] = f(clk);   \
+   else\
+   for (i = 0; i < levels; i++)\
+   pptable->clk##FrequencyTable[i] = SMUQ10_ROUND( \
+   GET_METRIC_FIELD(clk##FrequencyTable,   \
+version)[i]);
+
 struct smu_v13_0_6_dpm_map {
enum smu_clk_type clk_type;
uint32_t feature_num;
@@ -806,20 +820,11 @@ static int smu_v13_0_6_setup_driver_pptable(struct 
smu_context *smu)
pptable->MinGfxclkFrequency =
SMUQ10_ROUND(GET_METRIC_FIELD(MinGfxclkFrequency, 
version));

-   for (i = 0; i < 4; ++i) {
-   pptable->FclkFrequencyTable[i] =
-   
SMUQ10_ROUND(GET_METRIC_FIELD(FclkFrequencyTable, version)[i]);
-   pptable->UclkFrequencyTable[i] =
-   
SMUQ10_ROUND(GET_METRIC_FIELD(UclkFrequencyTable, version)[i]);
-   pptable->SocclkFrequencyTable[i] = SMUQ10_ROUND(
-   GET_METRIC_FIELD(SocclkFrequencyTable, 
version)[i]);
-   pptable->VclkFrequencyTable[i] =
-   
SMUQ10_ROUND(GET_METRIC_FIELD(VclkFrequencyTable, version)[i]);
-   pptable->DclkFrequencyTable[i] =
-   
SMUQ10_ROUND(GET_METRIC_FIELD(DclkFrequencyTable, version)[i]);
-   pptable->LclkFrequencyTable[i] =
-   
SMUQ10_ROUND(GET_METRIC_FIELD(LclkFrequencyTable, version)[i]);
-   }
+   INIT_FREQ_TABLE(SMU_FCLK, Fclk, 4, METRICS_CURR_FREQ_F0);
+   INIT_FREQ_TABLE(SMU_UCLK, Uclk, 4, METRICS_CURR_FREQ_F0);
+   INIT_FREQ_TABLE(SMU_SOCCLK, Socclk, 4, METRICS_CURR_FREQ_F1);
+   INIT_FREQ_TABLE(SMU_VCLK, Vclk, 4, METRICS_CURR_FREQ_F1);
+   INIT_FREQ_TABLE(SMU_DCLK, Dclk, 4, METRICS_CURR_FREQ_F1);

/* use AID0 serial number by default */
pptable->PublicSerialNumber_AID =
--
2.25.1

RE: [PATCH 1/2] drm/amd/pm: Add SMUv13.0.12 PMFW headers

2025-01-16 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Hi @Deucher, Alexander,

Please hold off on this series; we are currently working on a refined version, 
and the current series will be dropped.

Thanks & Regards
Asad

-Original Message-
From: Deucher, Alexander 
Sent: Wednesday, January 15, 2025 9:58 PM
To: amd-gfx@lists.freedesktop.org
Cc: Kamal, Asad ; Lazar, Lijo ; 
Deucher, Alexander 
Subject: [PATCH 1/2] drm/amd/pm: Add SMUv13.0.12 PMFW headers

From: Asad Kamal 

Add pmfw headers for smuv13.0.12 to pmfw version 86.24.0

Signed-off-by: Asad Kamal 
Reviewed-by: Lijo Lazar 
Signed-off-by: Alex Deucher 
---
 .../pm/swsmu/inc/pmfw_if/smu_v13_0_12_pmfw.h  | 248 ++
 1 file changed, 248 insertions(+)
 create mode 100644 drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_12_pmfw.h

diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_12_pmfw.h 
b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_12_pmfw.h
new file mode 100644
index 0..859e7a3813bb5
--- /dev/null
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_12_pmfw.h
@@ -0,0 +1,248 @@
+/*
+ * Copyright 2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+obtaining a
+ * copy of this software and associated documentation files (the
+"Software"),
+ * to deal in the Software without restriction, including without
+limitation
+ * the rights to use, copy, modify, merge, publish, distribute,
+sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom
+the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT
+SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
+DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#ifndef SMU_13_0_12_PMFW_H
+#define SMU_13_0_12_PMFW_H
+
+#define NUM_VCLK_DPM_LEVELS   4
+#define NUM_DCLK_DPM_LEVELS   4
+#define NUM_SOCCLK_DPM_LEVELS 4
+#define NUM_LCLK_DPM_LEVELS   4
+#define NUM_UCLK_DPM_LEVELS   4
+#define NUM_FCLK_DPM_LEVELS   4
+#define NUM_XGMI_DPM_LEVELS   2
+#define NUM_CXL_BITRATES  4
+#define NUM_PCIE_BITRATES 4
+#define NUM_XGMI_BITRATES 4
+#define NUM_XGMI_WIDTHS   3
+#define NUM_TDP_GROUPS4
+#define NUM_SOC_P2S_TABLES6
+#define NUM_GFX_P2S_TABLES8
+#define NUM_PSM_DIDT_THRESHOLDS 3
+
+typedef enum {
+/*0*/   FEATURE_DATA_CALCULATION= 0,
+/*1*/   FEATURE_DPM_FCLK= 1,
+/*2*/   FEATURE_DPM_GFXCLK  = 2,
+/*3*/   FEATURE_DPM_LCLK= 3,
+/*4*/   FEATURE_DPM_SOCCLK  = 4,
+/*5*/   FEATURE_DPM_UCLK= 5,
+/*6*/   FEATURE_DPM_VCN = 6,
+/*7*/   FEATURE_DPM_XGMI= 7,
+/*8*/   FEATURE_DS_FCLK = 8,
+/*9*/   FEATURE_DS_GFXCLK   = 9,
+/*10*/  FEATURE_DS_LCLK = 10,
+/*11*/  FEATURE_DS_MP0CLK   = 11,
+/*12*/  FEATURE_DS_MP1CLK   = 12,
+/*13*/  FEATURE_DS_MPIOCLK  = 13,
+/*14*/  FEATURE_DS_SOCCLK   = 14,
+/*15*/  FEATURE_DS_VCN  = 15,
+/*16*/  FEATURE_APCC_DFLL   = 16,
+/*17*/  FEATURE_APCC_PLUS   = 17,
+/*18*/  FEATURE_PPT = 18,
+/*19*/  FEATURE_TDC = 19,
+/*20*/  FEATURE_THERMAL = 20,
+/*21*/  FEATURE_SOC_PCC = 21,
+/*22*/  FEATURE_PROCHOT = 22,
+/*23*/  FEATURE_FDD_AID_HBM = 23,
+/*24*/  FEATURE_FDD_AID_SOC = 24,
+/*25*/  FEATURE_FDD_XCD_EDC = 25,
+/*26*/  FEATURE_FDD_XCD_XVMIN   = 26,
+/*27*/  FEATURE_FW_CTF  = 27,
+/*28*/  FEATURE_SMU_CG  = 28,
+/*29*/  FEATURE_PSI7= 29,
+/*30*/  FEATURE_XGMI_PER_LINK_PWR_DOWN  = 30,
+/*31*/  FEATURE_SOC_DC_RTC  = 31,
+/*32*/  FEATURE_GFX_DC_RTC  = 32,
+/*33*/  FEATURE_DVM_MIN_PSM = 33,
+/*34*/  FEATURE_PRC = 34,
+/*35*/  FEATURE_PSM_SQ_THROTTLER= 35,
+/*36*/  FEATURE_PIT = 36,
+/*37*/  FEATURE_DVO = 37,
+/*38*/  FEATURE_XVMINORPSM_CLKSTOP_DS   = 38,
+
+/*39*/  NUM_FEATURES= 39
+} FEATURE_LIST_e;
+
+//enum for MPIO PCIe gen speed msgs
+typedef enum {
+  PCIE_LIN

RE: [PATCH] drm/amd/pm: Add capability flags for SMU v13.0.6

2025-01-16 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: amd-gfx  On Behalf Of Alex Deucher
Sent: Thursday, January 16, 2025 8:37 PM
To: Lazar, Lijo 
Cc: amd-gfx@lists.freedesktop.org; Zhang, Hawking ; 
Deucher, Alexander ; Kamal, Asad 
; Wang, Yang(Kevin) 
Subject: Re: [PATCH] drm/amd/pm: Add capability flags for SMU v13.0.6

On Thu, Jan 16, 2025 at 7:29 AM Lijo Lazar  wrote:
>
> Add capability flags for SMU v13.0.6 variants. Initialize the flags
> based on firmware support. As there are multiple IP versions
> maintained, it is more manageable with one time initialization caps
> flags based on IP version and firmware feature support.
>
> Signed-off-by: Lijo Lazar 

This is a nice cleanup.
Reviewed-by: Alex Deucher 

> ---
>  drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h  |   1 +
>  .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 225
> --
>  2 files changed, 158 insertions(+), 68 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h
> b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h
> index 356d9422b411..8d4a96e23326 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h
> @@ -107,6 +107,7 @@ struct smu_13_0_dpm_context {
> struct smu_13_0_dpm_tables  dpm_tables;
> uint32_tworkload_policy_mask;
> uint32_tdcef_min_ds_clk;
> +   uint64_tcaps;
>  };
>
>  enum smu_13_0_power_state {
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
> b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
> index c12959a36d78..56e26fcd3066 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
> @@ -101,38 +101,25 @@ MODULE_FIRMWARE("amdgpu/smu_13_0_14.bin");
>  #define MCA_BANK_IPID(_ip, _hwid, _type) \
> [AMDGPU_MCA_IP_##_ip] = { .hwid = _hwid, .mcatype = _type, }
>
> -static inline bool smu_v13_0_6_is_unified_metrics(struct smu_context
> *smu) -{
> -   return (smu->adev->flags & AMD_IS_APU) &&
> -   smu->smc_fw_version <= 0x4556900;
> -}
> -
> -static inline bool smu_v13_0_6_is_other_end_count_available(struct
> smu_context *smu) -{
> -   switch (amdgpu_ip_version(smu->adev, MP1_HWIP, 0)) {
> -   case IP_VERSION(13, 0, 6):
> -   return smu->smc_fw_version >= 0x557600;
> -   case IP_VERSION(13, 0, 14):
> -   return smu->smc_fw_version >= 0x05550E00;
> -   default:
> -   return false;
> -   }
> -}
> -
> -static inline bool smu_v13_0_6_is_blw_host_limit_available(struct
> smu_context *smu) -{
> -   if (smu->adev->flags & AMD_IS_APU)
> -   return smu->smc_fw_version >= 0x04556F00;
> +enum smu_v13_0_6_caps {
> +   SMU_13_0_6_CAPS_DPM,
> +   SMU_13_0_6_CAPS_UNI_METRICS,
> +   SMU_13_0_6_CAPS_DPM_POLICY,
> +   SMU_13_0_6_CAPS_OTHER_END_METRICS,
> +   SMU_13_0_6_CAPS_SET_UCLK_MAX,
> +   SMU_13_0_6_CAPS_PCIE_METRICS,
> +   SMU_13_0_6_CAPS_HST_LIMIT_METRICS,
> +   SMU_13_0_6_CAPS_MCA_DEBUG_MODE,
> +   SMU_13_0_6_CAPS_PER_INST_METRICS,
> +   SMU_13_0_6_CAPS_CTF_LIMIT,
> +   SMU_13_0_6_CAPS_RMA_MSG,
> +   SMU_13_0_6_CAPS_ACA_SYND,
> +   SMU_13_0_6_CAPS_SDMA_RESET,
> +   SMU_13_0_6_CAPS_ALL,
> +};
>
> -   switch (amdgpu_ip_version(smu->adev, MP1_HWIP, 0)) {
> -   case IP_VERSION(13, 0, 6):
> -   return smu->smc_fw_version >= 0x557900;
> -   case IP_VERSION(13, 0, 14):
> -   return smu->smc_fw_version >= 0x05551000;
> -   default:
> -   return false;
> -   }
> -}
> +#define SMU_CAPS_MASK(x) (ULL(1) << x) #define SMU_CAPS(x)
> +SMU_CAPS_MASK(SMU_13_0_6_CAPS_##x)
>
>  struct mca_bank_ipid {
> enum amdgpu_mca_ip ip;
> @@ -297,6 +284,119 @@ struct smu_v13_0_6_dpm_map {
> uint32_t *freq_table;
>  };
>
> +static void smu_v13_0_14_init_caps(struct smu_context *smu) {
> +   struct smu_13_0_dpm_context *dpm_context = smu->smu_dpm.dpm_context;
> +   uint64_t caps = SMU_CAPS(DPM) | SMU_CAPS(UNI_METRICS) |
> +   SMU_CAPS(SET_UCLK_MAX) | SMU_CAPS(DPM_POLICY) |
> +   SMU_CAPS(PCIE_METRICS) | SMU_CAPS(CTF_LIMIT) |
> +   SMU_CAPS(MCA_DEBUG_MODE) | SMU_CAPS(RMA_MSG) |
> +   SMU_CAPS(ACA_SYND);
> +   uint32_t fw_ver = smu->smc_fw_version;
> +
> +   if (fw_ver >= 0x05550E00)
> +

RE: [PATCH 1/2] drm/amd/pm: Add debug bit for smu pool allocation

2025-03-18 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: amd-gfx  On Behalf Of Lazar, Lijo
Sent: Tuesday, March 18, 2025 6:52 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 

Subject: Re: [PATCH 1/2] drm/amd/pm: Add debug bit for smu pool allocation



On 3/7/2025 11:50 AM, Lijo Lazar wrote:
> In certain cases, it's desirable to avoid PMFW log transactions to
> system memory. Add a mask bit to decide whether to allocate smu pool
> in device memory or system memory.
>
> Signed-off-by: Lijo Lazar 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c   | 5 +
>  drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h   | 3 ++-
>  drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 5 -
>  3 files changed, 11 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index b161daa90019..22775c204632 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -140,6 +140,7 @@ enum AMDGPU_DEBUG_MASK {
>   AMDGPU_DEBUG_ENABLE_RAS_ACA = BIT(4),
>   AMDGPU_DEBUG_ENABLE_EXP_RESETS = BIT(5),
>   AMDGPU_DEBUG_DISABLE_GPU_RING_RESET = BIT(6),
> + AMDGPU_DEBUG_SMU_POOL = BIT(7),
>  };
>
>  unsigned int amdgpu_vram_limit = UINT_MAX; @@ -2231,6 +2232,10 @@
> static void amdgpu_init_debug_options(struct amdgpu_device *adev)
>   pr_info("debug: ring reset disabled\n");
>   adev->debug_disable_gpu_ring_reset = true;
>   }
> + if (amdgpu_debug_mask & AMDGPU_DEBUG_SMU_POOL) {
> + pr_info("debug: use vram for smu pool\n");
> + adev->pm.smu_debug_mask |= SMU_DEBUG_POOL_USE_VRAM;
> + }
>  }
>
>  static unsigned long amdgpu_fix_asic_type(struct pci_dev *pdev,
> unsigned long flags) diff --git
> a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> index 9fb26b5c8ae7..f93d287dbf13 100644
> --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> @@ -295,7 +295,8 @@ enum ip_power_state {  };
>
>  /* Used to mask smu debug modes */
> -#define SMU_DEBUG_HALT_ON_ERROR  0x1
> +#define SMU_DEBUG_HALT_ON_ERROR  BIT(0)
> +#define SMU_DEBUG_POOL_USE_VRAM  BIT(1)
>
>  #define MAX_SMU_I2C_BUSES   2
>
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> index 54a31d586d55..f6def50ba22d 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> @@ -1027,7 +1027,10 @@ static int smu_alloc_memory_pool(struct
> smu_context *smu)
>
>   memory_pool->size = pool_size;
>   memory_pool->align = PAGE_SIZE;
> - memory_pool->domain = AMDGPU_GEM_DOMAIN_GTT;
> + memory_pool->domain =
> + (adev->pm.smu_debug_mask & SMU_DEBUG_POOL_USE_VRAM) ?
> + AMDGPU_GEM_DOMAIN_VRAM :
> + AMDGPU_GEM_DOMAIN_GTT;
>
>   switch (pool_size) {
>   case SMU_MEMORY_POOL_SIZE_256_MB:

RE: [PATCH] drm/amdgpu: Fix xgmi v6.4.1 link status reporting

2025-04-01 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Tuesday, April 1, 2025 4:55 PM
To: amd-gfx@lists.freedesktop.org; Lazar, Lijo 
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH] drm/amdgpu: Fix xgmi v6.4.1 link status reporting

Use the right register offsets for getting link status.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 24 ++--
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 477424472bbe..95231de26cb1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -296,15 +296,27 @@ static const struct amdgpu_pcs_ras_field 
xgmi3x16_pcs_ras_fields[] = {

 static u32 xgmi_v6_4_get_link_status(struct amdgpu_device *adev, int 
global_link_num)  {
-   const u32 smnpcs_xgmi3x16_pcs_state_hist1 = 0x11a00070;
-   const int xgmi_inst = 2;
-   u32 link_inst;
+   const u32 smn_xgmi_6_4_pcs_state_hist1[2] = { 0x11a00070, 0x11b00070 };
+   const u32 smn_xgmi_6_4_1_pcs_state_hist1[2] = { 0x11b00070,
+   0x12100070 };
+   u32 i, n;
u64 addr;

-   link_inst = global_link_num % xgmi_inst;
+   switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
+   case IP_VERSION(6, 4, 0):
+   n = ARRAY_SIZE(smn_xgmi_6_4_pcs_state_hist1);
+   addr = smn_xgmi_6_4_pcs_state_hist1[global_link_num % n];
+   break;
+   case IP_VERSION(6, 4, 1):
+   n = ARRAY_SIZE(smn_xgmi_6_4_1_pcs_state_hist1);
+   addr = smn_xgmi_6_4_1_pcs_state_hist1[global_link_num % n];
+   break;
+   default:
+   return U32_MAX;
+   }

-   addr = (smnpcs_xgmi3x16_pcs_state_hist1 | (link_inst << 20)) +
-   adev->asic_funcs->encode_ext_smn_addressing(global_link_num / 
xgmi_inst);
+   i = global_link_num / n;
+   addr += adev->asic_funcs->encode_ext_smn_addressing(i);

return RREG32_PCIE_EXT(addr);
 }
--
2.25.1

RE: [PATCH v3 1/7] drm/amd/pm: Add ip version check for smu_v13_0_12 functions

2025-04-22 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Hi Hawking,

There are changes in the firmware again to report voltage in mV. Please ignore 
v3 of this series; I will send a fresh v3.

Thanks & Regards
Asad

-Original Message-
From: Zhang, Hawking 
Sent: Tuesday, April 22, 2025 12:06 PM
To: Kamal, Asad ; amd-gfx@lists.freedesktop.org; Lazar, 
Lijo 
Cc: Ma, Le ; Zhang, Morris ; Deucher, 
Alexander 
Subject: RE: [PATCH v3 1/7] drm/amd/pm: Add ip version check for smu_v13_0_12 
functions

[AMD Official Use Only - AMD Internal Distribution Only]

Series is

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: Kamal, Asad 
Sent: Friday, April 18, 2025 22:06
To: amd-gfx@lists.freedesktop.org; Lazar, Lijo 
Cc: Zhang, Hawking ; Ma, Le ; Zhang, 
Morris ; Kamal, Asad ; Deucher, 
Alexander 
Subject: [PATCH v3 1/7] drm/amd/pm: Add ip version check for smu_v13_0_12 
functions

Add ip version check to use smu_v13_0_12 specific functions

Signed-off-by: Asad Kamal 
Reviewed-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 6d84257b5301..177c55f457f4 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -748,7 +748,8 @@ static int smu_v13_0_6_setup_driver_pptable(struct 
smu_context *smu)
int ret, i, retry = 100;
uint32_t table_version;

-   if (smu_v13_0_6_cap_supported(smu, SMU_CAP(STATIC_METRICS)))
+   if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 12) 
&&
+   smu_v13_0_6_cap_supported(smu, SMU_CAP(STATIC_METRICS)))
return smu_v13_0_12_setup_driver_pptable(smu);

/* Store one-time values in driver PPTable */ @@ -1131,7 +1132,8 @@ 
static int smu_v13_0_6_get_smu_metrics_data(struct smu_context *smu,
if (ret)
return ret;

-   if (smu_v13_0_6_cap_supported(smu, SMU_CAP(STATIC_METRICS)))
+   if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 12) 
&&
+   smu_v13_0_6_cap_supported(smu, SMU_CAP(STATIC_METRICS)))
return smu_v13_0_12_get_smu_metrics_data(smu, member, value);

/* For clocks with multiple instances, only report the first one */ @@ 
-2496,7 +2498,8 @@ static ssize_t smu_v13_0_6_get_gpu_metrics(struct 
smu_context *smu, void **table
return ret;
}

-   if (smu_v13_0_6_cap_supported(smu, SMU_CAP(STATIC_METRICS)))
+   if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 12) 
&&
+   smu_v13_0_6_cap_supported(smu, SMU_CAP(STATIC_METRICS)))
return smu_v13_0_12_get_gpu_metrics(smu, table);

metrics_v1 = (MetricsTableV1_t *)metrics_v0;
--
2.46.0

RE: [PATCH] drm/amdgpu: Print bootloader status for long waits

2025-04-28 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Monday, April 28, 2025 3:40 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad ; Wang, 
Yang(Kevin) 
Subject: [PATCH] drm/amdgpu: Print bootloader status for long waits

If it needs a long wait for completion of bootloader execution, report the 
status in between. That helps to know if there is some issue during bootloader 
execution.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 31 ++
 1 file changed, 31 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index f5f616ab20e7..f8af2cc63446 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -85,6 +85,8 @@ MODULE_FIRMWARE("amdgpu/psp_14_0_4_ta.bin");

 #define regMP1_PUB_SCRATCH00x3b10090

+#define PSP13_BL_STATUS_SIZE 100
+
 static int psp_v13_0_init_microcode(struct psp_context *psp)  {
struct amdgpu_device *adev = psp->adev; @@ -151,6 +153,32 @@ static 
bool psp_v13_0_is_sos_alive(struct psp_context *psp)
return sol_reg != 0x0;
 }

+static void psp_v13_0_bootloader_print_status(struct psp_context *psp,
+ const char *msg)
+{
+   struct amdgpu_device *adev = psp->adev;
+   u32 bl_status_reg;
+   char bl_status_msg[PSP13_BL_STATUS_SIZE];
+   int i, at;
+
+   if (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6) ||
+   amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 12) ||
+   amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 14)) {
+   at = 0;
+   for_each_inst(i, adev->aid_mask) {
+   bl_status_reg =
+   (SOC15_REG_OFFSET(MP0, 0, regMP0_SMN_C2PMSG_92)
+<< 2) +
+   adev->asic_funcs->encode_ext_smn_addressing(i);
+   at += snprintf(bl_status_msg + at,
+  PSP13_BL_STATUS_SIZE - at,
+  " status(%02i): 0x%08x", i,
+  RREG32_PCIE_EXT(bl_status_reg));
+   }
+   dev_info(adev->dev, "%s - %s", msg, bl_status_msg);
+   }
+}
+
 static int psp_v13_0_wait_for_vmbx_ready(struct psp_context *psp)  {
struct amdgpu_device *adev = psp->adev; @@ -196,6 +224,9 @@ static int 
psp_v13_0_wait_for_bootloader(struct psp_context *psp)

if (ret == 0)
return 0;
+   if (retry_loop && !(retry_loop % 10))
+   psp_v13_0_bootloader_print_status(
+   psp, "Waiting for bootloader completion");
}

return ret;
--
2.25.1

RE: [PATCH v2] drm/amdgpu: Disallow partition query during reset

2025-04-16 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Wednesday, April 16, 2025 1:42 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH v2] drm/amdgpu: Disallow partition query during reset

Reject queries to get current partition modes during reset. Also, don't accept 
sysfs interface requests to switch compute partition mode while in reset.

Signed-off-by: Lijo Lazar 
---
v2: Keep consistent error code, return EPERM

 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 10 ++  
drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c |  4 
 2 files changed, 14 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 2c933d436e56..67ebeed77d71 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -1353,6 +1353,10 @@ static ssize_t 
amdgpu_gfx_get_current_compute_partition(struct device *dev,
struct amdgpu_device *adev = drm_to_adev(ddev);
int mode;

+   /* Only minimal precaution taken to reject requests while in reset.*/
+   if (amdgpu_in_reset(adev))
+   return -EPERM;
+
mode = amdgpu_xcp_query_partition_mode(adev->xcp_mgr,
   AMDGPU_XCP_FL_NONE);

@@ -1396,8 +1400,14 @@ static ssize_t amdgpu_gfx_set_compute_partition(struct 
device *dev,
return -EINVAL;
}

+   /* Don't allow a switch while under reset */
+   if (!down_read_trylock(&adev->reset_domain->sem))
+   return -EPERM;
+
ret = amdgpu_xcp_switch_partition_mode(adev->xcp_mgr, mode);

+   up_read(&adev->reset_domain->sem);
+
if (ret)
return ret;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index ecb74ccf1d90..6b0fbbb91e57 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -1230,6 +1230,10 @@ static ssize_t current_memory_partition_show(
struct amdgpu_device *adev = drm_to_adev(ddev);
enum amdgpu_memory_partition mode;

+   /* Only minimal precaution taken to reject requests while in reset */
+   if (amdgpu_in_reset(adev))
+   return -EPERM;
+
mode = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
if ((mode >= ARRAY_SIZE(nps_desc)) ||
(BIT(mode) & AMDGPU_ALL_NPS_MASK) != BIT(mode))
--
2.25.1

RE: [PATCH] drm/amdgpu: Fix comment style

2025-04-27 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Monday, April 28, 2025 10:38 AM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad ; kernel test 
robot 
Subject: [PATCH] drm/amdgpu: Fix comment style

Fix code comment style

Signed-off-by: Lijo Lazar 
Reported-by: kernel test robot 
Closes: 
https://lore.kernel.org/oe-kbuild-all/202504271826.xy2ffo28-...@intel.com/
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 17f0911ee7e9..82013b495436 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2165,7 +2165,7 @@ void amdgpu_ras_interrupt_fatal_error_handler(struct 
amdgpu_device *adev)
/* Fatal error events are handled on host side */
if (amdgpu_sriov_vf(adev))
return;
-   /**
+   /*
 * If the current interrupt is caused by a non-fatal RAS error, skip
 * check for fatal error. For fatal errors, FED status of all devices
 * in XGMI hive gets set when the first device gets fatal error
--
2.25.1

RE: [PATCH] drm/amd/pm: Use macro to initialize metrics table

2025-04-28 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Tuesday, April 29, 2025 8:45 AM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad ; Wang, 
Yang(Kevin) 
Subject: [PATCH] drm/amd/pm: Use macro to initialize metrics table

Helps to keep a build-time check on usage of the right datatype and avoids 
maintenance as new versions get added.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 67 --  
drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h | 13 -
 2 files changed, 11 insertions(+), 69 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index 80eb1a03b3ca..7eaf58fd7f9a 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -1051,73 +1051,6 @@ int smu_cmn_get_combo_pptable(struct smu_context *smu)
false);
 }

-void smu_cmn_init_soft_gpu_metrics(void *table, uint8_t frev, uint8_t crev) -{
-   struct metrics_table_header *header = (struct metrics_table_header 
*)table;
-   uint16_t structure_size;
-
-#define METRICS_VERSION(a, b)  ((a << 16) | b)
-
-   switch (METRICS_VERSION(frev, crev)) {
-   case METRICS_VERSION(1, 0):
-   structure_size = sizeof(struct gpu_metrics_v1_0);
-   break;
-   case METRICS_VERSION(1, 1):
-   structure_size = sizeof(struct gpu_metrics_v1_1);
-   break;
-   case METRICS_VERSION(1, 2):
-   structure_size = sizeof(struct gpu_metrics_v1_2);
-   break;
-   case METRICS_VERSION(1, 3):
-   structure_size = sizeof(struct gpu_metrics_v1_3);
-   break;
-   case METRICS_VERSION(1, 4):
-   structure_size = sizeof(struct gpu_metrics_v1_4);
-   break;
-   case METRICS_VERSION(1, 5):
-   structure_size = sizeof(struct gpu_metrics_v1_5);
-   break;
-   case METRICS_VERSION(1, 6):
-   structure_size = sizeof(struct gpu_metrics_v1_6);
-   break;
-   case METRICS_VERSION(1, 7):
-   structure_size = sizeof(struct gpu_metrics_v1_7);
-   break;
-   case METRICS_VERSION(1, 8):
-   structure_size = sizeof(struct gpu_metrics_v1_8);
-   break;
-   case METRICS_VERSION(2, 0):
-   structure_size = sizeof(struct gpu_metrics_v2_0);
-   break;
-   case METRICS_VERSION(2, 1):
-   structure_size = sizeof(struct gpu_metrics_v2_1);
-   break;
-   case METRICS_VERSION(2, 2):
-   structure_size = sizeof(struct gpu_metrics_v2_2);
-   break;
-   case METRICS_VERSION(2, 3):
-   structure_size = sizeof(struct gpu_metrics_v2_3);
-   break;
-   case METRICS_VERSION(2, 4):
-   structure_size = sizeof(struct gpu_metrics_v2_4);
-   break;
-   case METRICS_VERSION(3, 0):
-   structure_size = sizeof(struct gpu_metrics_v3_0);
-   break;
-   default:
-   return;
-   }
-
-#undef METRICS_VERSION
-
-   memset(header, 0xFF, structure_size);
-
-   header->format_revision = frev;
-   header->content_revision = crev;
-   header->structure_size = structure_size;
-
-}
-
 int smu_cmn_set_mp1_state(struct smu_context *smu,
  enum pp_mp1_state mp1_state)
 {
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h 
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
index a020277dec3e..cd75fdfd6b4a 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
@@ -40,6 +40,17 @@
 #define SMU_IH_INTERRUPT_CONTEXT_ID_FAN_ABNORMAL0x8
 #define SMU_IH_INTERRUPT_CONTEXT_ID_FAN_RECOVERY0x9

+#define smu_cmn_init_soft_gpu_metrics(ptr, a, b)  \
+   do {  \
+   typecheck(struct gpu_metrics_v##a##_##b, typeof(*(ptr))); \
+   struct metrics_table_header *header = \
+   (struct metrics_table_header *)table; \
+   memset(header, 0xFF, sizeof(*(ptr))); \
+   header->format_revision = a;  \
+   header->content_revision = b; \
+   header->structure_size = sizeof(*(ptr));  \
+   } while (0)
+
 extern const int link_speed[];

 /* Helper to Convert from PCIE Gen 1/2/3/4/5/6 to 0.1 GT/s speed units */ @@ 
-125,8 +136,6 @@ int smu_cmn_get_metrics_table(struct smu_context *smu,

 int smu_cmn_get_combo_pptable(struct smu_context *smu);

-void smu_cmn_init_soft_gpu_metrics(void *table, uint8_t

RE: [PATCH] drm/amd/pm: Fetch current power limit from PMFW

2025-02-18 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Tuesday, February 18, 2025 5:47 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH] drm/amd/pm: Fetch current power limit from PMFW

On SMU v13.0.12, always query the firmware to get the current power limit as it 
could be updated through other means also.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 7fdd7190a0c6..f0a14496d2b2 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -2791,6 +2791,7 @@ int smu_get_power_limit(void *handle,
switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
case IP_VERSION(13, 0, 2):
case IP_VERSION(13, 0, 6):
+   case IP_VERSION(13, 0, 12):
case IP_VERSION(13, 0, 14):
case IP_VERSION(11, 0, 7):
case IP_VERSION(11, 0, 11):
--
2.25.1

RE: [PATCH] drm/amd/pm: Fix indentation issue

2025-03-02 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

ping

-Original Message-
From: amd-gfx  On Behalf Of Asad Kamal
Sent: Thursday, February 27, 2025 8:33 PM
To: amd-gfx@lists.freedesktop.org; Lazar, Lijo 
Cc: Zhang, Hawking ; Ma, Le ; Zhang, 
Morris ; Kamal, Asad ; Deucher, 
Alexander ; kernel test robot 
Subject: [PATCH] drm/amd/pm: Fix indentation issue

Fix indentation issue for smu_v_13_0_12 get_gpu_metrics

Reported-by: kernel test robot 
Closes: 
https://lore.kernel.org/oe-kbuild-all/202502272246.oisqunc1-...@intel.com

Signed-off-by: Asad Kamal 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c
index 5e80b9aabfc9..285dbfe10303 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c
@@ -469,7 +469,7 @@ ssize_t smu_v13_0_12_get_gpu_metrics(struct smu_context 
*smu, void **table)
SMUQ10_ROUND(metrics->GfxBusy[inst]);
gpu_metrics->xcp_stats[i].gfx_busy_acc[idx] =
SMUQ10_ROUND(metrics->GfxBusyAcc[inst]);
-   idx++;
+   idx++;
}
}

--
2.46.0

RE: [PATCH] drm/amdgpu: Use wafl version for xgmi

2025-02-25 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Tuesday, February 25, 2025 4:56 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad ; Ma, Le 
; Zhang, Morris 
Subject: [PATCH] drm/amdgpu: Use wafl version for xgmi

XGMI and WAFL share the same versions. Use WAFL version if XGMI version is not 
present in discovery.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index 967a992829bd..8425e5dbf80e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -1290,6 +1290,7 @@ static int amdgpu_discovery_reg_base_init(struct 
amdgpu_device *adev)
uint16_t die_offset;
uint16_t ip_offset;
uint16_t num_dies;
+   uint32_t wafl_ver;
uint16_t num_ips;
uint16_t hw_id;
uint8_t inst;
@@ -1303,6 +1304,7 @@ static int amdgpu_discovery_reg_base_init(struct 
amdgpu_device *adev)
return r;
}

+   wafl_ver = 0;
adev->gfx.xcc_mask = 0;
adev->sdma.sdma_mask = 0;
adev->vcn.inst_mask = 0;
@@ -1403,6 +1405,10 @@ static int amdgpu_discovery_reg_base_init(struct 
amdgpu_device *adev)
adev->gfx.xcc_mask |=
(1U << ip->instance_number);

+   if (!wafl_ver && le16_to_cpu(ip->hw_id) == WAFLC_HWID)
+   wafl_ver = IP_VERSION_FULL(ip->major, ip->minor,
+  ip->revision, 0, 0);
+
for (k = 0; k < num_base_address; k++) {
/*
 * convert the endianness of base addresses in 
place, @@ -1468,6 +1474,9 @@ static int amdgpu_discovery_reg_base_init(struct 
amdgpu_device *adev)
}
}

+   if (wafl_ver && !adev->ip_versions[XGMI_HWIP][0])
+   adev->ip_versions[XGMI_HWIP][0] = wafl_ver;
+
return 0;
 }

@@ -2772,10 +2781,6 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device 
*adev)
break;
}

-   if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
-   amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4))
-   adev->ip_versions[XGMI_HWIP][0] = IP_VERSION(6, 4, 0);
-
/* set NBIO version */
switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) {
case IP_VERSION(6, 1, 0):
--
2.25.1

RE: [PATCH] drm/amdgpu: Add NPS2 to DPX compatible mode

2025-03-24 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: amd-gfx  On Behalf Of Lijo Lazar
Sent: Monday, March 24, 2025 1:29 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 

Subject: [PATCH] drm/amdgpu: Add NPS2 to DPX compatible mode

Compute partition DPX is possible in NPS2 mode. Update the compatible modes for 
DPX.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c 
b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
index 3c07517be09a..ae071985f26e 100644
--- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
+++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
@@ -473,7 +473,8 @@ static int aqua_vanjaram_get_xcp_res_info(struct 
amdgpu_xcp_mgr *xcp_mgr,
break;
case AMDGPU_DPX_PARTITION_MODE:
num_xcp = 2;
-   nps_modes = BIT(AMDGPU_NPS1_PARTITION_MODE);
+   nps_modes = BIT(AMDGPU_NPS1_PARTITION_MODE) |
+   BIT(AMDGPU_NPS2_PARTITION_MODE);
break;
case AMDGPU_TPX_PARTITION_MODE:
num_xcp = 3;
--
2.25.1

RE: [PATCH] drm/amd/pm: Move SMUv13.0.12 function declarations

2025-05-13 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Tuesday, May 13, 2025 7:20 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH] drm/amd/pm: Move SMUv13.0.12 function declarations

Move the SMU v13.0.12 function declarations to the SMU v13.0.6 header file, as they are used only in SMU v13.0.6.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h | 9 -
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h | 8 
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h 
b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h
index 9d4cb54a45de..4263798d716b 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h
@@ -298,14 +298,5 @@ int smu_v13_0_get_boot_freq_by_index(struct smu_context 
*smu,

 void smu_v13_0_interrupt_work(struct smu_context *smu);  void 
smu_v13_0_reset_custom_level(struct smu_context *smu); -bool 
smu_v13_0_12_is_dpm_running(struct smu_context *smu); -int 
smu_v13_0_12_get_max_metrics_size(void);
-int smu_v13_0_12_setup_driver_pptable(struct smu_context *smu); -int 
smu_v13_0_12_get_smu_metrics_data(struct smu_context *smu,
- MetricsMember_t member,
- uint32_t *value);
-ssize_t smu_v13_0_12_get_gpu_metrics(struct smu_context *smu, void **table); 
-extern const struct cmn2asic_mapping smu_v13_0_12_feature_mask_map[]; -extern 
const struct cmn2asic_msg_mapping smu_v13_0_12_message_map[];  #endif  #endif 
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h
index 6e7293d3f264..1ccc150882eb 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h
@@ -75,4 +75,12 @@ extern void smu_v13_0_6_set_ppt_funcs(struct smu_context 
*smu);  bool smu_v13_0_6_cap_supported(struct smu_context *smu, enum 
smu_v13_0_6_caps cap);  int smu_v13_0_6_get_static_metrics_table(struct 
smu_context *smu);

+bool smu_v13_0_12_is_dpm_running(struct smu_context *smu); int
+smu_v13_0_12_get_max_metrics_size(void);
+int smu_v13_0_12_setup_driver_pptable(struct smu_context *smu); int
+smu_v13_0_12_get_smu_metrics_data(struct smu_context *smu,
+ MetricsMember_t member, uint32_t *value); 
ssize_t
+smu_v13_0_12_get_gpu_metrics(struct smu_context *smu, void **table);
+extern const struct cmn2asic_mapping smu_v13_0_12_feature_mask_map[];
+extern const struct cmn2asic_msg_mapping smu_v13_0_12_message_map[];
 #endif
--
2.25.1

RE: [PATCH 1/2] drm/amdgpu: Check pcie replays reporting support

2025-05-29 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Series is Reviewed-by: Asad Kamal 

Thanks & Regards
Asad


-Original Message-
From: Lazar, Lijo 
Sent: Thursday, May 29, 2025 2:00 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad ; Gadre, Mangesh 

Subject: [PATCH 1/2] drm/amdgpu: Check pcie replays reporting support

Check if PCIe replay count reporting is supported before creating the sysfs 
attribute.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c   | 9 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h   | 2 ++
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 72e41781afb0..9eaee8dacea7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -232,7 +232,7 @@ static int amdgpu_device_attr_sysfs_init(struct 
amdgpu_device *adev)  {
int ret = 0;

-   if (!amdgpu_sriov_vf(adev))
+   if (amdgpu_nbio_is_replay_cnt_supported(adev))
ret = sysfs_create_file(&adev->dev->kobj,
&dev_attr_pcie_replay_count.attr);

@@ -241,7 +241,7 @@ static int amdgpu_device_attr_sysfs_init(struct 
amdgpu_device *adev)

 static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev)  {
-   if (!amdgpu_sriov_vf(adev))
+   if (amdgpu_nbio_is_replay_cnt_supported(adev))
sysfs_remove_file(&adev->dev->kobj,
  &dev_attr_pcie_replay_count.attr);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
index d085687a47ea..e56ba93a8df6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
@@ -53,6 +53,15 @@ u64 amdgpu_nbio_get_pcie_replay_count(struct amdgpu_device 
*adev)
return 0;
 }

+bool amdgpu_nbio_is_replay_cnt_supported(struct amdgpu_device *adev) {
+   if (amdgpu_sriov_vf(adev) || !adev->asic_funcs->get_pcie_replay_count ||
+   (!adev->nbio.funcs || !adev->nbio.funcs->get_pcie_replay_count))
+   return false;
+
+   return true;
+}
+
 int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, struct ras_common_if 
*ras_block)  {
int r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
index 79c2f807b9fe..b528de6a01f6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
@@ -119,4 +119,6 @@ int amdgpu_nbio_ras_sw_init(struct amdgpu_device *adev);  
int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, struct ras_common_if 
*ras_block);
 u64 amdgpu_nbio_get_pcie_replay_count(struct amdgpu_device *adev);

+bool amdgpu_nbio_is_replay_cnt_supported(struct amdgpu_device *adev);
+
 #endif
--
2.25.1

RE: [PATCH 1/4] drm/amdgpu: Deprecate xgmi_link_speed enum

2025-06-16 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Series is Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Monday, June 16, 2025 12:24 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH 1/4] drm/amdgpu: Deprecate xgmi_link_speed enum

XGMI doesn't have discrete max speeds defined. Speed numbers can be arbitrary, 
depending on the SOC. Deprecate the enum.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 6 --  
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 8 +---
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index d9ad37711c3e..6f9997198518 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -1771,13 +1771,15 @@ void amdgpu_xgmi_early_init(struct amdgpu_device *adev)
case IP_VERSION(9, 4, 0):
case IP_VERSION(9, 4, 1):
case IP_VERSION(9, 4, 2):
-   adev->gmc.xgmi.max_speed = XGMI_SPEED_25GT;
+   /* 25 GT/s */
+   adev->gmc.xgmi.max_speed = 25;
adev->gmc.xgmi.max_width = 16;
break;
case IP_VERSION(9, 4, 3):
case IP_VERSION(9, 4, 4):
case IP_VERSION(9, 5, 0):
-   adev->gmc.xgmi.max_speed = XGMI_SPEED_32GT;
+   /* 32 GT/s */
+   adev->gmc.xgmi.max_speed = 32;
adev->gmc.xgmi.max_width = 16;
break;
default:
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index f994be985f42..433d94f52ac3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -25,12 +25,6 @@
 #include 
 #include "amdgpu_ras.h"

-enum amdgpu_xgmi_link_speed {
-   XGMI_SPEED_16GT = 16,
-   XGMI_SPEED_25GT = 25,
-   XGMI_SPEED_32GT = 32
-};
-
 struct amdgpu_hive_info {
struct kobject kobj;
uint64_t hive_id;
@@ -97,7 +91,7 @@ struct amdgpu_xgmi {
struct ras_common_if *ras_if;
bool connected_to_cpu;
struct amdgpu_xgmi_ras *ras;
-   enum amdgpu_xgmi_link_speed max_speed;
+   uint16_t max_speed;
uint8_t max_width;
 };

--
2.25.1

RE: [PATCH] drm/amdgpu: Extend bus status check to more cases

2025-06-15 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Friday, June 13, 2025 4:37 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH] drm/amdgpu: Extend bus status check to more cases

In case of unexpected errors, check if device is alive on the bus.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h| 15 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  7 +--  
drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c |  5 -
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 10 ++
 4 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index ea9206ab3ed1..928d28d100f6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1780,4 +1780,19 @@ extern const struct attribute_group 
amdgpu_flash_attr_group;

 void amdgpu_set_init_level(struct amdgpu_device *adev,
   enum amdgpu_init_lvl_id lvl);
+
+static inline int amdgpu_device_bus_status_check(struct amdgpu_device
+*adev) {
+   u32 status;
+   int r;
+
+   r = pci_read_config_dword(adev->pdev, PCI_COMMAND, &status);
+   if (r || PCI_POSSIBLE_ERROR(status)) {
+   dev_err(adev->dev, "device lost from bus!");
+   return -ENODEV;
+   }
+
+   return 0;
+}
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2795a34563f4..a968a9cab27e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6066,14 +6066,9 @@ static int amdgpu_device_health_check(struct list_head 
*device_list_handle)  {
struct amdgpu_device *tmp_adev;
int ret = 0;
-   u32 status;

list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
-   pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
-   if (PCI_POSSIBLE_ERROR(status)) {
-   dev_err(tmp_adev->dev, "device lost from bus!");
-   ret = -ENODEV;
-   }
+   ret |= amdgpu_device_bus_status_check(tmp_adev);
}

return ret;
diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c 
b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
index 1c083304ae77..41acc1ce0b3e 100644
--- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
+++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
@@ -353,11 +353,14 @@ static int aqua_vanjaram_query_partition_mode(struct 
amdgpu_xcp_mgr *xcp_mgr)

if (adev->nbio.funcs->get_compute_partition_mode) {
mode = adev->nbio.funcs->get_compute_partition_mode(adev);
-   if (mode != derv_mode)
+   if (mode != derv_mode) {
dev_warn(
adev->dev,
"Mismatch in compute partition mode - reported 
: %d derived : %d",
mode, derv_mode);
+   if (derv_mode == AMDGPU_UNKNOWN_COMPUTE_PARTITION_MODE)
+   amdgpu_device_bus_status_check(adev);
+   }
}

return mode;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index 7eaf58fd7f9a..59f9abd0f7b8 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -86,6 +86,7 @@ static void smu_cmn_read_arg(struct smu_context *smu,
 #define SMU_RESP_BUSY_OTHER 0xFC
 #define SMU_RESP_DEBUG_END  0xFB

+#define SMU_RESP_UNEXP (~0U)
 /**
  * __smu_cmn_poll_stat -- poll for a status from the SMU
  * @smu: a pointer to SMU context
@@ -171,6 +172,15 @@ static void __smu_cmn_reg_print_error(struct smu_context 
*smu,
dev_err_ratelimited(adev->dev,
"SMU: I'm debugging!");
break;
+   case SMU_RESP_UNEXP:
+   if (amdgpu_device_bus_status_check(smu->adev)) {
+   /* print error immediately if device is off the bus */
+   dev_err(adev->dev,
+   "SMU: response:0x%08X for index:%d param:0x%08X 
message:%s?",
+   reg_c2pmsg_90, msg_index, param, message);
+   break;
+   }
+   fallthrough;
default:
dev_err_ratelimited(adev->dev,
"SMU: response:0x%08X for index:%d 
param:0x%08X message:%s?",
--
2.25.1

RE: [PATCH v2 1/2] drm/amd/pm: Update SMU v13.0.12 pmfw header

2025-06-15 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Series is Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Monday, June 16, 2025 11:09 AM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH v2 1/2] drm/amd/pm: Update SMU v13.0.12 pmfw header

Update PMFW metrics table definition to version 0x13.

Signed-off-by: Lijo Lazar 
---
 .../amd/pm/swsmu/inc/pmfw_if/smu_v13_0_12_pmfw.h | 16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_12_pmfw.h 
b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_12_pmfw.h
index d7505cfc433a..0a2ca544f4e3 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_12_pmfw.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_12_pmfw.h
@@ -86,8 +86,10 @@ typedef enum {
 /*36*/  FEATURE_PIT = 36,
 /*37*/  FEATURE_DVO = 37,
 /*38*/  FEATURE_XVMINORPSM_CLKSTOP_DS   = 38,
+/*39*/  FEATURE_GLOBAL_DPM  = 39,
+/*40*/  FEATURE_NODE_POWER_MANAGER  = 40,

-/*39*/  NUM_FEATURES= 39
+/*41*/  NUM_FEATURES= 41
 } FEATURE_LIST_e;

 //enum for MPIO PCIe gen speed msgs
@@ -133,7 +135,7 @@ typedef enum {
   GFX_DVM_MARGIN_COUNT
 } GFX_DVM_MARGIN_e;

-#define SMU_METRICS_TABLE_VERSION 0x12
+#define SMU_METRICS_TABLE_VERSION 0x13

 typedef struct __attribute__((packed, aligned(4))) {
   uint64_t AccumulationCounter;
@@ -275,6 +277,16 @@ typedef struct {
   //PSNs
   uint64_t PublicSerialNumber_AID[4];
   uint64_t PublicSerialNumber_XCD[8];
+
+  //XGMI
+  uint32_t MaxXgmiWidth;
+  uint32_t MaxXgmiBitrate;
+
+  // Telemetry
+  uint32_t InputTelemetryVoltageInmV;
+
+  // General info
+  uint32_t pldmVersion[2];
 } StaticMetricsTable_t;
 #pragma pack(pop)

--
2.25.1

RE: [PATCH] drm/amdgpu: Convert from DRM_* to dev_*

2025-06-24 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Tuesday, June 24, 2025 1:47 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH] drm/amdgpu: Convert from DRM_* to dev_*

Convert from generic DRM_* to dev_* calls to have device context info.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 271 +++---
 .../gpu/drm/amd/amdgpu/amdgpu_doorbell_mgr.c  |  15 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c |   4 +-
 .../gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c|  24 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c   |  20 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c|   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c   |  24 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c  |   3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c   |  22 +-
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c|  77 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   |  56 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c|  21 +-
 12 files changed, 320 insertions(+), 219 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index c8a6b3689dea..334e442c95ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1288,14 +1288,14 @@ u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
  */
 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
 {
-   DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
+   dev_err(adev->dev, "Invalid callback to read register 0x%04X\n", reg);
BUG();
return 0;
 }

 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t 
reg)
 {
-   DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
+   dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
BUG();
return 0;
 }
@@ -1312,15 +1312,17 @@ static uint32_t amdgpu_invalid_rreg_ext(struct 
amdgpu_device *adev, uint64_t reg
  */
 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, 
uint32_t v)
 {
-   DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
- reg, v);
+   dev_err(adev->dev,
+   "Invalid callback to write register 0x%04X with 0x%08X\n", reg,
+   v);
BUG();
 }

 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, 
uint32_t v)
 {
-   DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
- reg, v);
+   dev_err(adev->dev,
+   "Invalid callback to write register 0x%llX with 0x%08X\n", reg,
+   v);
BUG();
 }

@@ -1336,14 +1338,15 @@ static void amdgpu_invalid_wreg_ext(struct 
amdgpu_device *adev, uint64_t reg, ui
  */
 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
 {
-   DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
+   dev_err(adev->dev, "Invalid callback to read 64 bit register 0x%04X\n",
+   reg);
BUG();
return 0;
 }

 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t 
reg)
 {
-   DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
+   dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
BUG();
return 0;
 }
@@ -1360,15 +1363,17 @@ static uint64_t amdgpu_invalid_rreg64_ext(struct 
amdgpu_device *adev, uint64_t r
  */
 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, 
uint64_t v)
 {
-   DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 
0x%08llX\n",
- reg, v);
+   dev_err(adev->dev,
+   "Invalid callback to write 64 bit register 0x%04X with 
0x%08llX\n",
+   reg, v);
BUG();
 }

 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t 
reg, uint64_t v)
 {
-   DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 
0x%08llX\n",
- reg, v);
+   dev_err(adev->dev,
+   "Invalid callback to write 64 bit register 0x%llX with 
0x%08llX\n",
+   reg, v);
BUG();
 }

@@ -1386,8 +1391,9 @@ static void amdgpu_invalid_wreg64_ext(struct 
amdgpu_device *adev, uint64_t reg,
 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
  uint32_t block, uint32_t reg)
 {
-   DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
- reg, block);
+   dev_err(adev->dev,
+   "Invalid

RE: [PATCH] drm/amd/pm: Show default gfx clock levels

2025-06-06 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Friday, June 6, 2025 12:25 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH] drm/amd/pm: Show default gfx clock levels

For SMU v13.0.6 SOCs, always show default clock levels for gfx in pp_dpm_sclk. 
Any custom min/max levels set by the user will be available in pp_od_clk_voltage.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 32bdffa360ee..36f210698bea 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -1416,8 +1416,9 @@ static int smu_v13_0_6_print_clk_levels(struct 
smu_context *smu,
return ret;
}

-   min_clk = pstate_table->gfxclk_pstate.curr.min;
-   max_clk = pstate_table->gfxclk_pstate.curr.max;
+   single_dpm_table = &(dpm_context->dpm_tables.gfx_table);
+   min_clk = single_dpm_table->min;
+   max_clk = single_dpm_table->max;

if (now < SMU_13_0_6_DSCLK_THRESHOLD) {
size += sysfs_emit_at(buf, size, "S: %uMhz *\n",
--
2.25.1

RE: [PATCH] drm/amdgpu: Suspend IH during mode-2 reset

2025-06-09 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Monday, June 9, 2025 10:11 AM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH] drm/amdgpu: Suspend IH during mode-2 reset

On multi-aid SOCs, there could be a continuous stream of interrupts from GC 
after poison consumption. Suspend IH to disable them before doing
mode-2 reset. This avoids conflicts in hardware accesses during interrupt 
handlers while a reset is ongoing.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/aldebaran.c | 33 ++
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c 
b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
index 49bf8f3a748f..48c09addb29f 100644
--- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
@@ -71,18 +71,29 @@ aldebaran_get_reset_handler(struct amdgpu_reset_control 
*reset_ctl,
return NULL;
 }

+static inline uint32_t aldebaran_get_ip_block_mask(struct amdgpu_device
+*adev) {
+   uint32_t ip_block_mask = BIT(AMD_IP_BLOCK_TYPE_GFX) |
+BIT(AMD_IP_BLOCK_TYPE_SDMA);
+
+   if (adev->aid_mask)
+   ip_block_mask |= BIT(AMD_IP_BLOCK_TYPE_IH);
+
+   return ip_block_mask;
+}
+
 static int aldebaran_mode2_suspend_ip(struct amdgpu_device *adev)  {
+   uint32_t ip_block_mask = aldebaran_get_ip_block_mask(adev);
+   uint32_t ip_block;
int r, i;

amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);

for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
-   if (!(adev->ip_blocks[i].version->type ==
- AMD_IP_BLOCK_TYPE_GFX ||
- adev->ip_blocks[i].version->type ==
- AMD_IP_BLOCK_TYPE_SDMA))
+   ip_block = BIT(adev->ip_blocks[i].version->type);
+   if (!(ip_block_mask & ip_block))
continue;

r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
@@ -200,8 +211,10 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control 
*reset_ctl,  static int aldebaran_mode2_restore_ip(struct amdgpu_device *adev)  
{
struct amdgpu_firmware_info *ucode_list[AMDGPU_UCODE_ID_MAXIMUM];
+   uint32_t ip_block_mask = aldebaran_get_ip_block_mask(adev);
struct amdgpu_firmware_info *ucode;
struct amdgpu_ip_block *cmn_block;
+   struct amdgpu_ip_block *ih_block;
int ucode_count = 0;
int i, r;

@@ -243,6 +256,18 @@ static int aldebaran_mode2_restore_ip(struct amdgpu_device 
*adev)
if (r)
return r;

+   if (ip_block_mask & BIT(AMD_IP_BLOCK_TYPE_IH)) {
+   ih_block = amdgpu_device_ip_get_ip_block(adev,
+AMD_IP_BLOCK_TYPE_IH);
+   if (unlikely(!ih_block)) {
+   dev_err(adev->dev, "Failed to get IH handle\n");
+   return -EINVAL;
+   }
+   r = amdgpu_ip_block_resume(ih_block);
+   if (r)
+   return r;
+   }
+
/* Reinit GFXHUB */
adev->gfxhub.funcs->init(adev);
r = adev->gfxhub.funcs->gart_enable(adev);
--
2.25.1

RE: [PATCH] drm/amd/pm: Get max/min frequency on aldebaran VF

2025-07-14 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Tuesday, July 15, 2025 8:53 AM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH] drm/amd/pm: Get max/min frequency on aldebaran VF

PMFW interface to get max/min frequencies is not available on aldebaran VFs. 
Use data, if available, in DPM tables to get the max/min frequencies.

Signed-off-by: Lijo Lazar 
---
 .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c| 57 ++-
 1 file changed, 56 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index 9a61cf904275..b067147b7c41 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -342,6 +342,61 @@ static int aldebaran_get_allowed_feature_mask(struct 
smu_context *smu,
return 0;
 }

+static int aldebaran_get_dpm_ultimate_freq(struct smu_context *smu,
+  enum smu_clk_type clk_type,
+  uint32_t *min, uint32_t *max)
+{
+   struct smu_13_0_dpm_context *dpm_context = smu->smu_dpm.dpm_context;
+   struct smu_13_0_dpm_table *dpm_table;
+   uint32_t min_clk, max_clk;
+
+   if (amdgpu_sriov_vf(smu->adev)) {
+   switch (clk_type) {
+   case SMU_MCLK:
+   case SMU_UCLK:
+   dpm_table = &dpm_context->dpm_tables.uclk_table;
+   break;
+   case SMU_GFXCLK:
+   case SMU_SCLK:
+   dpm_table = &dpm_context->dpm_tables.gfx_table;
+   break;
+   case SMU_SOCCLK:
+   dpm_table = &dpm_context->dpm_tables.soc_table;
+   break;
+   case SMU_FCLK:
+   dpm_table = &dpm_context->dpm_tables.fclk_table;
+   break;
+   case SMU_VCLK:
+   dpm_table = &dpm_context->dpm_tables.vclk_table;
+   break;
+   case SMU_DCLK:
+   dpm_table = &dpm_context->dpm_tables.dclk_table;
+   break;
+   default:
+   return -EINVAL;
+   }
+
+   min_clk = dpm_table->min;
+   max_clk = dpm_table->max;
+
+   if (min) {
+   if (!min_clk)
+   return -ENODATA;
+   *min = min_clk;
+   }
+   if (max) {
+   if (!max_clk)
+   return -ENODATA;
+   *max = max_clk;
+   }
+
+   } else {
+   return smu_v13_0_get_dpm_ultimate_freq(smu, clk_type, min, max);
+   }
+
+   return 0;
+}
+
 static int aldebaran_set_default_dpm_table(struct smu_context *smu)  {
struct smu_13_0_dpm_context *dpm_context = smu->smu_dpm.dpm_context; @@ 
-2081,7 +2136,7 @@ static const struct pptable_funcs aldebaran_ppt_funcs = {
.set_azalia_d3_pme = smu_v13_0_set_azalia_d3_pme,
.get_max_sustainable_clocks_by_dc = 
smu_v13_0_get_max_sustainable_clocks_by_dc,
.get_bamaco_support = aldebaran_get_bamaco_support,
-   .get_dpm_ultimate_freq = smu_v13_0_get_dpm_ultimate_freq,
+   .get_dpm_ultimate_freq = aldebaran_get_dpm_ultimate_freq,
.set_soft_freq_limited_range = aldebaran_set_soft_freq_limited_range,
.od_edit_dpm_table = aldebaran_usr_edit_dpm_table,
.set_df_cstate = aldebaran_set_df_cstate,
--
2.49.0

RE: [PATCH v2] drm/amd/pm: Use cached data for min/max clocks

2025-07-14 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Monday, July 14, 2025 8:56 AM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH v2] drm/amd/pm: Use cached data for min/max clocks

If dpm tables are already populated on SMU v13.0.6 SOCs, use the cached data. 
Otherwise, fetch values from firmware.

Signed-off-by: Lijo Lazar 
---
v2: Coding style - reorder declarations and remove unwanted initializations

 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 38 +--
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 68624afe7d83..f2cf333b2e40 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -881,51 +881,51 @@ static int smu_v13_0_6_get_dpm_ultimate_freq(struct 
smu_context *smu,
 enum smu_clk_type clk_type,
 uint32_t *min, uint32_t *max)  {
+   struct smu_13_0_dpm_context *dpm_context = smu->smu_dpm.dpm_context;
struct smu_table_context *smu_table = &smu->smu_table;
struct PPTable_t *pptable =
(struct PPTable_t *)smu_table->driver_pptable;
-   uint32_t clock_limit = 0, param;
+   struct smu_13_0_dpm_table *dpm_table;
+   uint32_t min_clk, max_clk, param;
int ret = 0, clk_id = 0;

-   if (!smu_cmn_clk_dpm_is_enabled(smu, clk_type)) {
+   /* Use dpm tables, if data is already fetched */
+   if (pptable->Init) {
switch (clk_type) {
case SMU_MCLK:
case SMU_UCLK:
-   if (pptable->Init)
-   clock_limit = pptable->UclkFrequencyTable[0];
+   dpm_table = &dpm_context->dpm_tables.uclk_table;
break;
case SMU_GFXCLK:
case SMU_SCLK:
-   if (pptable->Init)
-   clock_limit = pptable->MinGfxclkFrequency;
+   dpm_table = &dpm_context->dpm_tables.gfx_table;
break;
case SMU_SOCCLK:
-   if (pptable->Init)
-   clock_limit = pptable->SocclkFrequencyTable[0];
+   dpm_table = &dpm_context->dpm_tables.soc_table;
break;
case SMU_FCLK:
-   if (pptable->Init)
-   clock_limit = pptable->FclkFrequencyTable[0];
+   dpm_table = &dpm_context->dpm_tables.fclk_table;
break;
case SMU_VCLK:
-   if (pptable->Init)
-   clock_limit = pptable->VclkFrequencyTable[0];
+   dpm_table = &dpm_context->dpm_tables.vclk_table;
break;
case SMU_DCLK:
-   if (pptable->Init)
-   clock_limit = pptable->DclkFrequencyTable[0];
+   dpm_table = &dpm_context->dpm_tables.dclk_table;
break;
default:
-   break;
+   return -EINVAL;
}

-   if (min)
-   *min = clock_limit;
+   min_clk = dpm_table->min;
+   max_clk = dpm_table->max;

+   if (min)
+   *min = min_clk;
if (max)
-   *max = clock_limit;
+   *max = max_clk;

-   return 0;
+   if (min_clk && max_clk)
+   return 0;
}

if (!(clk_type == SMU_GFXCLK || clk_type == SMU_SCLK)) {
--
2.49.0

RE: [PATCH 1/3] drm/amd/pm: Use cached metrics data on SMUv13.0.6

2025-07-14 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Series is Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Friday, July 11, 2025 3:05 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH 1/3] drm/amd/pm: Use cached metrics data on SMUv13.0.6

Cached metrics data validity is 1ms on SMUv13.0.6 SOCs. It's not reasonable for 
any client to query gpu_metrics at a faster rate and constantly interrupt PMFW.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 68624afe7d83..0fa8652b603a 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2693,7 +2693,7 @@ static ssize_t smu_v13_0_6_get_gpu_metrics(struct 
smu_context *smu, void **table
bool per_inst;

metrics_v0 = kzalloc(METRICS_TABLE_SIZE, GFP_KERNEL);
-   ret = smu_v13_0_6_get_metrics_table(smu, metrics_v0, true);
+   ret = smu_v13_0_6_get_metrics_table(smu, metrics_v0, false);
if (ret) {
kfree(metrics_v0);
return ret;
--
2.49.0

RE: [PATCH] drm/amdgpu: Add a noverbose flag to psp_wait_for

2025-07-01 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Apart from a minor comment inline, the patch looks good to me.

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad


-Original Message-
From: Lazar, Lijo 
Sent: Tuesday, July 1, 2025 2:54 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad ; Tomasevic, 
Vojislav 
Subject: [PATCH] drm/amdgpu: Add a noverbose flag to psp_wait_for

For extended wait with retries on a PSP register value, add a noverbose flag to 
avoid excessive error messages on each timeout.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c  | 13 +++---  
drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h  |  7 ++-
 drivers/gpu/drm/amd/amdgpu/psp_v10_0.c   |  4 +-
 drivers/gpu/drm/amd/amdgpu/psp_v11_0.c   | 35 ---
 drivers/gpu/drm/amd/amdgpu/psp_v11_0_8.c | 10 ++---
 drivers/gpu/drm/amd/amdgpu/psp_v12_0.c   | 20 -
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c   | 41 ++
 drivers/gpu/drm/amd/amdgpu/psp_v13_0_4.c | 22 +-
 drivers/gpu/drm/amd/amdgpu/psp_v14_0.c   | 55 +---
 drivers/gpu/drm/amd/amdgpu/psp_v3_1.c| 41 +-
 10 files changed, 130 insertions(+), 118 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 04dedf38eb0d..25aa35de1e41 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -575,9 +575,11 @@ static int psp_sw_fini(struct amdgpu_ip_block *ip_block)
return 0;
 }

-int psp_wait_for(struct psp_context *psp, uint32_t reg_index,
-uint32_t reg_val, uint32_t mask, bool check_changed)
+int psp_wait_for(struct psp_context *psp, uint32_t reg_index, uint32_t reg_val,
+uint32_t mask, uint32_t flags)
 {
+   bool check_changed = flags & PSP_WAITREG_CHANGED;
+   bool verbose = !(flags & PSP_WAITREG_NOVERBOSE);
uint32_t val;
int i;
struct amdgpu_device *adev = psp->adev; @@ -597,9 +599,10 @@ int 
psp_wait_for(struct psp_context *psp, uint32_t reg_index,
udelay(1);
}

-   dev_err(adev->dev,
-   "psp reg (0x%x) wait timed out, mask: %x, read: %x exp: %x",
-   reg_index, mask, val, reg_val);
+   if (verbose)
+   dev_err(adev->dev,
+   "psp reg (0x%x) wait timed out, mask: %x, read: %x exp: 
%x",
+   reg_index, mask, val, reg_val);

return -ETIME;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index 4bc0ec49d2e9..35888f1937bc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -134,6 +134,9 @@ enum psp_reg_prog_id {
PSP_REG_LAST
 };

+#define PSP_WAITREG_CHANGED (1U << 0) /* check if the value has changed
+*/ #define PSP_WAITREG_NOVERBOSE (1U << 1) /* No error verbose */
+
AK: Can we use BIT macro here?

 struct psp_funcs {
int (*init_microcode)(struct psp_context *psp);
int (*wait_for_bootloader)(struct psp_context *psp); @@ -532,8 +535,8 
@@ extern const struct amdgpu_ip_block_version psp_v13_0_ip_block;  extern 
const struct amdgpu_ip_block_version psp_v13_0_4_ip_block;  extern const struct 
amdgpu_ip_block_version psp_v14_0_ip_block;

-extern int psp_wait_for(struct psp_context *psp, uint32_t reg_index,
-   uint32_t field_val, uint32_t mask, bool check_changed);
+int psp_wait_for(struct psp_context *psp, uint32_t reg_index,
+uint32_t field_val, uint32_t mask, uint32_t flags);
 extern int psp_wait_for_spirom_update(struct psp_context *psp, uint32_t 
reg_index,
uint32_t field_val, uint32_t mask, uint32_t 
msec_timeout);

diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v10_0.c
index 2c4ebd98927f..3584b8c18fd9 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v10_0.c
@@ -94,7 +94,7 @@ static int psp_v10_0_ring_create(struct psp_context *psp,

/* Wait for response flag (bit 31) in C2PMSG_64 */
ret = psp_wait_for(psp, SOC15_REG_OFFSET(MP0, 0, mmMP0_SMN_C2PMSG_64),
-  MBOX_TOS_RESP_FLAG, MBOX_TOS_RESP_MASK, false);
+  MBOX_TOS_RESP_FLAG, MBOX_TOS_RESP_MASK, 0);

return ret;
 }
@@ -115,7 +115,7 @@ static int psp_v10_0_ring_stop(struct psp_context *psp,

/* Wait for response flag (bit 31) in C2PMSG_64 */
ret = psp_wait_for(psp, SOC15_REG_OFFSET(MP0, 0, mmMP0_SMN_C2PMSG_64),
-  MBOX_TOS_RESP_FLAG, MBOX_TOS_RESP_MASK, false);
+  MBOX_TOS_RESP_FLAG, MBOX_TOS_RESP_MASK, 0);

return ret;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c
index 1a4a26e6ffd2..6cc05d36e359 100644
--- a/dri

RE: [PATCH] drm/amdgpu: Update supported modes for GC v9.5.0

2025-07-21 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: amd-gfx  On Behalf Of Lijo Lazar
Sent: Tuesday, July 22, 2025 9:16 AM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Gadre, Mangesh ; Lin, Amber 

Subject: [PATCH] drm/amdgpu: Update supported modes for GC v9.5.0

For GC v9.5.0 SOCs, both CPX and QPX compute modes are also supported in
NPS2 mode.

Signed-off-by: Lijo Lazar 
Acked-by: Mangesh Gadre 
---
 drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c 
b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
index 914cf4bfb033..811124ff88a8 100644
--- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
+++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
@@ -227,6 +227,7 @@ static int __aqua_vanjaram_get_px_mode_info(struct 
amdgpu_xcp_mgr *xcp_mgr,
uint16_t *nps_modes)
 {
struct amdgpu_device *adev = xcp_mgr->adev;
+   uint32_t gc_ver = amdgpu_ip_version(adev, GC_HWIP, 0);

if (!num_xcp || !nps_modes || !(xcp_mgr->supp_xcp_modes & BIT(px_mode)))
return -EINVAL;
@@ -250,12 +251,14 @@ static int __aqua_vanjaram_get_px_mode_info(struct 
amdgpu_xcp_mgr *xcp_mgr,
*num_xcp = 4;
*nps_modes = BIT(AMDGPU_NPS1_PARTITION_MODE) |
 BIT(AMDGPU_NPS4_PARTITION_MODE);
+   if (gc_ver == IP_VERSION(9, 5, 0))
+   *nps_modes |= BIT(AMDGPU_NPS2_PARTITION_MODE);
break;
case AMDGPU_CPX_PARTITION_MODE:
*num_xcp = NUM_XCC(adev->gfx.xcc_mask);
*nps_modes = BIT(AMDGPU_NPS1_PARTITION_MODE) |
 BIT(AMDGPU_NPS4_PARTITION_MODE);
-   if (amdgpu_sriov_vf(adev))
+   if (gc_ver == IP_VERSION(9, 5, 0))
*nps_modes |= BIT(AMDGPU_NPS2_PARTITION_MODE);
break;
default:
--
2.49.0

RE: [PATCH] drm/amdgpu: Log reset source during recovery

2025-07-23 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Wednesday, July 23, 2025 10:45 AM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH] drm/amdgpu: Log reset source during recovery

To get more context, add reset source to identify the source of gpu recovery - 
job timeout, RAS, HWS hang etc.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 685cc602357e..4fe74341aa13 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6442,8 +6442,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
emergency_restart();
}

-   dev_info(adev->dev, "GPU %s begin!\n",
-   need_emergency_restart ? "jobs stop":"reset");
+   dev_info(adev->dev, "GPU %s begin!. Source:  %d\n",
+need_emergency_restart ? "jobs stop" : "reset",
+reset_context->src);

if (!amdgpu_sriov_vf(adev))
hive = amdgpu_get_xgmi_hive(adev);
--
2.49.0

RE: [PATCH] drm/amdgpu: Update external revid for GC v9.5.0

2025-07-24 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: amd-gfx  On Behalf Of Lijo Lazar
Sent: Friday, July 25, 2025 11:26 AM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 

Subject: [PATCH] drm/amdgpu: Update external revid for GC v9.5.0

Use different external revid for GC v9.5.0 SOCs.

Signed-off-by: Lijo Lazar 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/soc15.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c 
b/drivers/gpu/drm/amd/amdgpu/soc15.c
index c457be3a3c56..9e74c9822e62 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -1218,6 +1218,8 @@ static int soc15_common_early_init(struct amdgpu_ip_block 
*ip_block)
AMD_PG_SUPPORT_JPEG;
/*TODO: need a new external_rev_id for GC 9.4.4? */
adev->external_rev_id = adev->rev_id + 0x46;
+   if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0))
+   adev->external_rev_id = adev->rev_id + 0x50;
break;
default:
/* FIXME: not supported yet */
--
2.49.0

RE: [PATCH] drm/amd/pm: Allow static metrics table query in VF

2025-08-01 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Friday, August 1, 2025 11:49 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad 
Subject: [PATCH] drm/amd/pm: Allow static metrics table query in VF

Allow the static metrics table to be queried on SMUv13.0.6 SOCs in VF mode.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 9cc294f4708b..148941d7ba51 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -177,7 +177,7 @@ static const struct cmn2asic_msg_mapping 
smu_v13_0_6_message_map[SMU_MSG_MAX_COU
MSG_MAP(SetThrottlingPolicy, 
PPSMC_MSG_SetThrottlingPolicy, 0),
MSG_MAP(ResetSDMA,   PPSMC_MSG_ResetSDMA,   
0),
MSG_MAP(ResetVCN,PPSMC_MSG_ResetVCN,
   0),
-   MSG_MAP(GetStaticMetricsTable,   
PPSMC_MSG_GetStaticMetricsTable,   0),
+   MSG_MAP(GetStaticMetricsTable,   
PPSMC_MSG_GetStaticMetricsTable,   1),
 };

 // clang-format on
--
2.49.0

RE: [PATCH 1/4] drm/amdgpu: Add helpers to set/get unique ids

2025-08-05 Thread Kamal, Asad

[AMD Official Use Only - AMD Internal Distribution Only]

Series is
Reviewed-by: Asad Kamal 

Thanks & Regards
Asad

-Original Message-
From: Lazar, Lijo 
Sent: Monday, August 4, 2025 5:28 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad ; Huang, 
JinHuiEric 
Subject: [PATCH 1/4] drm/amdgpu: Add helpers to set/get unique ids

Add a struct to store unique id information for each type. Add helpers to set
and fetch the unique id.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h| 20 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 71 ++
 2 files changed, 91 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index ef3af170dda4..c2771e32d0bf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -819,6 +819,20 @@ struct amdgpu_ip_map_info {
uint32_t mask);
 };

+enum amdgpu_uid_type {
+   AMDGPU_UID_TYPE_XCD,
+   AMDGPU_UID_TYPE_AID,
+   AMDGPU_UID_TYPE_SOC,
+   AMDGPU_UID_TYPE_MAX
+};
+
+#define AMDGPU_UID_INST_MAX 8 /* max number of instances for each UID
+type */
+
+struct amdgpu_uid {
+   uint64_t uid[AMDGPU_UID_TYPE_MAX][AMDGPU_UID_INST_MAX];
+   struct amdgpu_device *adev;
+};
+
 struct amd_powerplay {
void *pp_handle;
const struct amd_pm_funcs *pp_funcs;
@@ -1302,6 +1316,7 @@ struct amdgpu_device {
struct list_headuserq_mgr_list;
struct mutexuserq_mutex;
booluserq_halt_for_enforce_isolation;
+   struct amdgpu_uid *uid_info;
 };

 static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev, @@ 
-1785,4 +1800,9 @@ static inline int amdgpu_device_bus_status_check(struct 
amdgpu_device *adev)
return 0;
 }

+void amdgpu_device_set_uid(struct amdgpu_uid *uid_info,
+  enum amdgpu_uid_type type, uint8_t inst,
+  uint64_t uid);
+uint64_t amdgpu_device_get_uid(struct amdgpu_uid *uid_info,
+  enum amdgpu_uid_type type, uint8_t inst);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 370c8daada8c..9eee1d4e1eaa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2675,6 +2675,24 @@ static int amdgpu_device_parse_gpu_info_fw(struct 
amdgpu_device *adev)
return err;
 }

+static void amdgpu_uid_init(struct amdgpu_device *adev) {
+   /* Initialize the UID for the device */
+   adev->uid_info = kzalloc(sizeof(struct amdgpu_uid), GFP_KERNEL);
+   if (!adev->uid_info) {
+   dev_warn(adev->dev, "Failed to allocate memory for UID\n");
+   return;
+   }
+   adev->uid_info->adev = adev;
+}
+
+static void amdgpu_uid_fini(struct amdgpu_device *adev) {
+   /* Free the UID memory */
+   kfree(adev->uid_info);
+   adev->uid_info = NULL;
+}
+
 /**
  * amdgpu_device_ip_early_init - run early init for hardware IPs
  *
@@ -2858,6 +2876,8 @@ static int amdgpu_device_ip_early_init(struct 
amdgpu_device *adev)
if (adev->gmc.xgmi.supported)
amdgpu_xgmi_early_init(adev);

+   if (amdgpu_is_multi_aid(adev))
+   amdgpu_uid_init(adev);
ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
if (ip_block->status.valid != false)
amdgpu_amdkfd_device_probe(adev);
@@ -3648,6 +3668,7 @@ static int amdgpu_device_ip_fini(struct amdgpu_device 
*adev)
}

amdgpu_ras_fini(adev);
+   amdgpu_uid_fini(adev);

return 0;
 }
@@ -7467,3 +7488,53 @@ ssize_t amdgpu_show_reset_mask(char *buf, uint32_t 
supported_reset)
size += sysfs_emit_at(buf, size, "\n");
return size;
 }
+
+void amdgpu_device_set_uid(struct amdgpu_uid *uid_info,
+  enum amdgpu_uid_type type, uint8_t inst,
+  uint64_t uid)
+{
+   if (!uid_info)
+   return;
+
+   if (type >= AMDGPU_UID_TYPE_MAX) {
+   dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n",
+type);
+   return;
+   }
+
+   if (inst >= AMDGPU_UID_INST_MAX) {
+   dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n",
+inst);
+   return;
+   }
+
+   if (uid_info->uid[type][inst] != 0) {
+   dev_warn_once(
+   uid_info->adev->dev,
+   "Overwriting existing UID %llu for type %d instance 
%d\n",
+   uid_info->uid[type][inst], type, inst);
+   }
+
+   uid_info->uid[type][inst] = uid;
+}
+
+u64 am

RE: [PATCH v2] drm/amd/pm: Increase cache interval time

2025-08-05 Thread Kamal, Asad

[Public]

Hi Lijo,

As per our offline discussion, I understand the requirement. I will drop this
patch and send a fresh one later, as discussed.

Thanks & Regards
Asad

From: Lazar, Lijo 
Sent: Tuesday, August 5, 2025 9:54 PM
To: Kamal, Asad ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Ma, Le ; Zhang, 
Morris ; Deucher, Alexander ; 
Kamal, Asad 
Subject: Re: [PATCH v2] drm/amd/pm: Increase cache interval time


[Public]

Hi Asad,

Sorry, after initializing the cache interval time, I meant to move the cache
time check logic to the swsmu level rather than keep it in smu v13.0.12. I
believe this was the original ask from Alex.

Other SOCs can customize if required by adjusting the cache interval.

Thanks,
Lijo

From: Kamal, Asad mailto:asad.ka...@amd.com>>
Sent: Tuesday, August 5, 2025 9:20:58 PM
To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> 
mailto:amd-gfx@lists.freedesktop.org>>; Lazar, 
Lijo mailto:lijo.la...@amd.com>>
Cc: Zhang, Hawking mailto:hawking.zh...@amd.com>>; Ma, 
Le mailto:le...@amd.com>>; Zhang, Morris 
mailto:shiwu.zh...@amd.com>>; Deucher, Alexander 
mailto:alexander.deuc...@amd.com>>; Kamal, Asad 
mailto:asad.ka...@amd.com>>
Subject: [PATCH v2] drm/amd/pm: Increase cache interval time

Increase the cache interval time to 50 ms when fetching the system
metrics table for smu_v13_0_12, since this data is polled less
frequently.

v2: Make the caching interval SoC-independent; customization can still be
done in SoC-specific callbacks (Alex/Lijo)

Signed-off-by: Asad Kamal mailto:asad.ka...@amd.com>>
---
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 4 
 drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 3 +++
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c | 9 -
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index dc48a1dd8be4..c62d68d7410f 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -1162,8 +1162,12 @@ static void smu_free_dummy_read_table(struct smu_context 
*smu)

 static int smu_smc_table_sw_init(struct smu_context *smu)
 {
+   struct smu_table_context *smu_table = &smu->smu_table;
 int ret;

+   smu_table->tables[SMU_TABLE_TEMP_METRICS].cache_interval =
+   AMDGPU_TEMP_METRICS_CACHE_INTERVAL;
+
 /**
  * Create smu_table structure, and init smc tables such as
  * TABLE_PPTABLE, TABLE_WATERMARKS, TABLE_SMU_METRICS, and etc.
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h 
b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
index 611b381b9147..7a52c00c700e 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
@@ -32,6 +32,8 @@
 #include "smu_types.h"
 #include "linux/firmware.h"

+#define AMDGPU_TEMP_METRICS_CACHE_INTERVAL 50
+
 #define SMU_THERMAL_MINIMUM_ALERT_TEMP  0
 #define SMU_THERMAL_MAXIMUM_ALERT_TEMP  255
 #define SMU_TEMPERATURE_UNITS_PER_CENTIGRADES   1000
@@ -258,6 +260,7 @@ struct smu_table {
 struct amdgpu_bo *bo;
 uint32_t version;
 unsigned long  metrics_time;
+   uint32_t cache_interval;
 };

 enum smu_perf_level_designation {
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c
index fca50f6a8ef6..5ead66375d38 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c
@@ -361,18 +361,17 @@ int smu_v13_0_12_get_smu_metrics_data(struct smu_context 
*smu,
 return 0;
 }

-static int smu_v13_0_12_get_system_metrics_table(struct smu_context *smu, void 
*metrics_table,
-bool bypass_cache)
+static int smu_v13_0_12_get_system_metrics_table(struct smu_context *smu, void 
*metrics_table)
 {
 struct smu_table_context *smu_table = &smu->smu_table;
 uint32_t table_size = smu_table->tables[SMU_TABLE_SMU_METRICS].size;
 struct smu_table *table = &smu_table->driver_table;
 int ret;

-   if (bypass_cache || 
!smu_table->tables[SMU_TABLE_TEMP_METRICS].metrics_time ||
+   if (!smu_table->tables[SMU_TABLE_TEMP_METRICS].metrics_time ||
 time_after(jiffies,
smu_table->tables[SMU_TABLE_TEMP_METRICS].metrics_time +
-  msecs_to_jiffies(1))) {
+  
msecs_to_jiffies(smu_table->tables[SMU_TABLE_TEMP_METRICS].cache_interval))) {
 ret = smu_cmn_send_smc_msg(smu, SMU_MSG_GetSystemMetricsTable, 
NULL);
 if (ret) {
 dev_info(smu->adev->dev,
@@ -544,7 +543,7 @@ static ssize_t smu_v13_0_12_get_temp_metrics(str

88 matches

Mail list logo