On Wed, Jan 23, 2019 at 02:00:49AM +0200, Oded Gabbay wrote:
> This patch adds the H/W queues module and the code to initialize Goya's
> various compute and DMA engines and their queues.
> 
> Goya has 5 DMA channels, 8 TPC engines and a single MME engine. For each
> channel/engine, there is a H/W queue logic which is used to pass commands
> from the user to the H/W. That logic is called QMAN.
> 
> There are two types of QMANs: external and internal. The DMA QMANs are
> considered external while the TPC and MME QMANs are considered internal.
> For each external queue there is a completion queue, which is located on
> the Host memory.
> 
> The differences between external and internal QMANs are:
> 
> 1. The location of the queue's memory. External QMANs are located on the
>    Host memory while internal QMANs are located on the on-chip memory.
> 
> 2. The external QMAN write an entry to a completion queue and sends an
>    MSI-X interrupt upon completion of a command buffer that was given to
>    it. The internal QMAN doesn't do that.
> 
> Signed-off-by: Oded Gabbay <oded.gab...@gmail.com>
> ---
>  drivers/misc/habanalabs/Makefile              |    2 +-
>  drivers/misc/habanalabs/device.c              |   74 +-
>  drivers/misc/habanalabs/goya/goya.c           | 1518 +++++++++++++++--
>  drivers/misc/habanalabs/goya/goyaP.h          |    6 +
>  drivers/misc/habanalabs/habanalabs.h          |  176 +-
>  drivers/misc/habanalabs/habanalabs_drv.c      |    6 +
>  drivers/misc/habanalabs/hw_queue.c            |  404 +++++
>  .../habanalabs/include/goya/goya_packets.h    |  234 +++
>  .../habanalabs/include/habanalabs_device_if.h |  272 +++
>  drivers/misc/habanalabs/irq.c                 |  150 ++
>  10 files changed, 2721 insertions(+), 121 deletions(-)
>  create mode 100644 drivers/misc/habanalabs/hw_queue.c
>  create mode 100644 drivers/misc/habanalabs/include/goya/goya_packets.h
>  create mode 100644 drivers/misc/habanalabs/irq.c
> 
> diff --git a/drivers/misc/habanalabs/Makefile 
> b/drivers/misc/habanalabs/Makefile
> index 2530c9b78ca4..c07f3ccb57dc 100644
> --- a/drivers/misc/habanalabs/Makefile
> +++ b/drivers/misc/habanalabs/Makefile
> @@ -5,7 +5,7 @@
>  obj-m        := habanalabs.o
>  
>  habanalabs-y := habanalabs_drv.o device.o context.o asid.o 
> habanalabs_ioctl.o \
> -             command_buffer.o
> +             command_buffer.o hw_queue.o irq.o
>  
>  include $(src)/goya/Makefile
>  habanalabs-y += $(HL_GOYA_FILES)
> diff --git a/drivers/misc/habanalabs/device.c 
> b/drivers/misc/habanalabs/device.c
> index 9fc7218a973c..98220628a467 100644
> --- a/drivers/misc/habanalabs/device.c
> +++ b/drivers/misc/habanalabs/device.c
> @@ -170,13 +170,22 @@ static int device_early_init(struct hl_device *hdev)
>       if (rc)
>               goto early_fini;
>  
> +     hdev->cq_wq = alloc_workqueue("hl-free-jobs", WQ_UNBOUND, 0);
> +     if (hdev->cq_wq == NULL) {
> +             dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
> +             goto asid_fini;
> +     }
> +
>       hl_cb_mgr_init(&hdev->kernel_cb_mgr);
>  
>       mutex_init(&hdev->device_open);
> +     mutex_init(&hdev->send_cpu_message_lock);
>       atomic_set(&hdev->fd_open_cnt, 0);
>  
>       return 0;
>  
> +asid_fini:
> +     hl_asid_fini(hdev);
>  early_fini:
>       if (hdev->asic_funcs->early_fini)
>               hdev->asic_funcs->early_fini(hdev);
> @@ -192,9 +201,12 @@ static int device_early_init(struct hl_device *hdev)
>   */
>  static void device_early_fini(struct hl_device *hdev)
>  {
> +     mutex_destroy(&hdev->send_cpu_message_lock);
>  
>       hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr);
>  
> +     destroy_workqueue(hdev->cq_wq);
> +
>       hl_asid_fini(hdev);
>  
>       if (hdev->asic_funcs->early_fini)
> @@ -273,7 +285,7 @@ int hl_device_resume(struct hl_device *hdev)
>   */
>  int hl_device_init(struct hl_device *hdev, struct class *hclass)
>  {
> -     int rc;
> +     int i, rc, cq_ready_cnt;
>  
>       /* Create device */
>       rc = device_setup_cdev(hdev, hclass, hdev->id, &hl_ops);
> @@ -294,11 +306,48 @@ int hl_device_init(struct hl_device *hdev, struct class 
> *hclass)
>       if (rc)
>               goto early_fini;
>  
> +     /*
> +      * Initialize the H/W queues. Must be done before hw_init, because
> +      * there the addresses of the kernel queue are being written to the
> +      * registers of the device
> +      */
> +     rc = hl_hw_queues_create(hdev);
> +     if (rc) {
> +             dev_err(hdev->dev, "failed to initialize kernel queues\n");
> +             goto sw_fini;
> +     }
> +
> +     /*
> +      * Initialize the completion queues. Must be done before hw_init,
> +      * because there the addresses of the completion queues are being
> +      * passed as arguments to request_irq
> +      */
> +     hdev->completion_queue =
> +                     kcalloc(hdev->asic_prop.completion_queues_count,
> +                             sizeof(*hdev->completion_queue), GFP_KERNEL);
> +
> +     if (!hdev->completion_queue) {
> +             dev_err(hdev->dev, "failed to allocate completion queues\n");
> +             rc = -ENOMEM;
> +             goto hw_queues_destroy;
> +     }
> +
> +     for (i = 0, cq_ready_cnt = 0;
> +                     i < hdev->asic_prop.completion_queues_count;
> +                     i++, cq_ready_cnt++) {
> +             rc = hl_cq_init(hdev, &hdev->completion_queue[i], i);
> +             if (rc) {
> +                     dev_err(hdev->dev,
> +                             "failed to initialize completion queue\n");
> +                     goto cq_fini;
> +             }
> +     }
> +
>       /* Allocate the kernel context */
>       hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL);
>       if (!hdev->kernel_ctx) {
>               rc = -ENOMEM;
> -             goto sw_fini;
> +             goto cq_fini;
>       }
>  
>       hdev->user_ctx = NULL;
> @@ -324,6 +373,14 @@ int hl_device_init(struct hl_device *hdev, struct class 
> *hclass)
>  
>       hdev->disabled = false;
>  
> +     /* Check that the communication with the device is working */
> +     rc = hdev->asic_funcs->test_queues(hdev);
> +     if (rc) {
> +             dev_err(hdev->dev, "Failed to detect if device is alive\n");
> +             rc = 0;

Why is rc set to 0 here, right after test_queues() reported a failure?

> +             goto out_disabled;
> +     }
> +
>       dev_notice(hdev->dev,
>               "Successfully added device to habanalabs driver\n");
>  
> @@ -335,6 +392,12 @@ int hl_device_init(struct hl_device *hdev, struct class 
> *hclass)
>                       "kernel ctx is still alive on initialization 
> failure\n");
>  free_ctx:
>       kfree(hdev->kernel_ctx);
> +cq_fini:
> +     for (i = 0 ; i < cq_ready_cnt ; i++)
> +             hl_cq_fini(hdev, &hdev->completion_queue[i]);
> +     kfree(hdev->completion_queue);
> +hw_queues_destroy:
> +     hl_hw_queues_destroy(hdev);
>  sw_fini:
>       hdev->asic_funcs->sw_fini(hdev);
>  early_fini:
> @@ -364,6 +427,7 @@ int hl_device_init(struct hl_device *hdev, struct class 
> *hclass)
>   */
>  void hl_device_fini(struct hl_device *hdev)
>  {
> +     int i;
>       dev_info(hdev->dev, "Removing device\n");
>  
>       /* Mark device as disabled */
> @@ -378,6 +442,12 @@ void hl_device_fini(struct hl_device *hdev)
>       /* Reset the H/W. It will be in idle state after this returns */
>       hdev->asic_funcs->hw_fini(hdev, true);
>  
> +     for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
> +             hl_cq_fini(hdev, &hdev->completion_queue[i]);
> +     kfree(hdev->completion_queue);
> +
> +     hl_hw_queues_destroy(hdev);
> +
>       /* Call ASIC S/W finalize function */
>       hdev->asic_funcs->sw_fini(hdev);
>  
> diff --git a/drivers/misc/habanalabs/goya/goya.c 
> b/drivers/misc/habanalabs/goya/goya.c
> index f715e01838b3..08d5227eaf1d 100644
> --- a/drivers/misc/habanalabs/goya/goya.c
> +++ b/drivers/misc/habanalabs/goya/goya.c
> @@ -98,6 +98,26 @@
>  static void goya_get_fixed_properties(struct hl_device *hdev)
>  {
>       struct asic_fixed_properties *prop = &hdev->asic_prop;
> +     int i;
> +
> +     for (i = 0 ; i < NUMBER_OF_EXT_HW_QUEUES ; i++) {
> +             prop->hw_queues_props[i].type = QUEUE_TYPE_EXT;
> +             prop->hw_queues_props[i].kmd_only = 0;
> +     }
> +
> +     for (; i < NUMBER_OF_EXT_HW_QUEUES + NUMBER_OF_CPU_HW_QUEUES ; i++) {
> +             prop->hw_queues_props[i].type = QUEUE_TYPE_CPU;
> +             prop->hw_queues_props[i].kmd_only = 1;
> +     }
> +
> +     for (; i < NUMBER_OF_EXT_HW_QUEUES + NUMBER_OF_CPU_HW_QUEUES +
> +                     NUMBER_OF_INT_HW_QUEUES; i++) {
> +             prop->hw_queues_props[i].type = QUEUE_TYPE_INT;
> +             prop->hw_queues_props[i].kmd_only = 0;
> +     }
> +
> +     for (; i < HL_MAX_QUEUES; i++)
> +             prop->hw_queues_props[i].type = QUEUE_TYPE_NA;
>  
>       prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES;
>  
> @@ -126,6 +146,18 @@ static void goya_get_fixed_properties(struct hl_device 
> *hdev)
>       prop->high_pll = PLL_HIGH_DEFAULT;
>  }
>  
> +int goya_send_pci_access_msg(struct hl_device *hdev, u32 opcode)
> +{
> +     struct armcp_packet pkt;
> +
> +     memset(&pkt, 0, sizeof(pkt));
> +
> +     pkt.opcode = opcode;
> +
> +     return hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt,
> +                     sizeof(pkt), HL_DEVICE_TIMEOUT_USEC, NULL);
> +}
> +
>  /**
>   * goya_pci_bars_map - Map PCI BARS of Goya device
>   *
> @@ -509,6 +541,8 @@ static int goya_sw_init(struct hl_device *hdev)
>       if (!goya)
>               return -ENOMEM;
>  
> +     goya->test_cpu_queue = goya_test_cpu_queue;
> +
>       /* according to goya_init_iatu */
>       goya->ddr_bar_cur_addr = DRAM_PHYS_BASE;
>       hdev->asic_specific = goya;
> @@ -595,6 +629,299 @@ int goya_sw_fini(struct hl_device *hdev)
>       return 0;
>  }
>  
> +static void goya_init_dma_qman(struct hl_device *hdev, int dma_id,
> +             dma_addr_t bus_address)
> +{
> +     struct goya_device *goya = hdev->asic_specific;
> +     u32 mtr_base_lo, mtr_base_hi;
> +     u32 so_base_lo, so_base_hi;
> +     u32 gic_base_lo, gic_base_hi;
> +     u32 reg_off = dma_id * (mmDMA_QM_1_PQ_PI - mmDMA_QM_0_PQ_PI);
> +
> +     mtr_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
> +     mtr_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
> +     so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
> +     so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
> +
> +     gic_base_lo =
> +             lower_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
> +     gic_base_hi =
> +             upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
> +
> +     WREG32(mmDMA_QM_0_PQ_BASE_LO + reg_off, lower_32_bits(bus_address));
> +     WREG32(mmDMA_QM_0_PQ_BASE_HI + reg_off, upper_32_bits(bus_address));
> +
> +     WREG32(mmDMA_QM_0_PQ_SIZE + reg_off, ilog2(HL_QUEUE_LENGTH));
> +     WREG32(mmDMA_QM_0_PQ_PI + reg_off, 0);
> +     WREG32(mmDMA_QM_0_PQ_CI + reg_off, 0);
> +
> +     WREG32(mmDMA_QM_0_CP_MSG_BASE0_ADDR_LO + reg_off, mtr_base_lo);
> +     WREG32(mmDMA_QM_0_CP_MSG_BASE0_ADDR_HI + reg_off, mtr_base_hi);
> +     WREG32(mmDMA_QM_0_CP_MSG_BASE1_ADDR_LO + reg_off, so_base_lo);
> +     WREG32(mmDMA_QM_0_CP_MSG_BASE1_ADDR_HI + reg_off, so_base_hi);
> +     WREG32(mmDMA_QM_0_GLBL_ERR_ADDR_LO + reg_off, gic_base_lo);
> +     WREG32(mmDMA_QM_0_GLBL_ERR_ADDR_HI + reg_off, gic_base_hi);
> +     WREG32(mmDMA_QM_0_GLBL_ERR_WDATA + reg_off,
> +                     GOYA_ASYNC_EVENT_ID_DMA0_QM + dma_id);
> +
> +     /* PQ has buffer of 2 cache lines, while CQ has 8 lines */
> +     WREG32(mmDMA_QM_0_PQ_CFG1 + reg_off, 0x00020002);
> +     WREG32(mmDMA_QM_0_CQ_CFG1 + reg_off, 0x00080008);
> +
> +     if (dma_id == 0)
> +             WREG32(mmDMA_QM_0_GLBL_PROT + reg_off, QMAN_DMA_FULLY_TRUSTED);
> +     else
> +             if (goya->hw_cap_initialized & HW_CAP_MMU)
> +                     WREG32(mmDMA_QM_0_GLBL_PROT + reg_off,
> +                                     QMAN_DMA_PARTLY_TRUSTED);
> +             else
> +                     WREG32(mmDMA_QM_0_GLBL_PROT + reg_off,
> +                                     QMAN_DMA_FULLY_TRUSTED);
> +
> +     WREG32(mmDMA_QM_0_GLBL_ERR_CFG + reg_off, QMAN_DMA_ERR_MSG_EN);
> +     WREG32(mmDMA_QM_0_GLBL_CFG0 + reg_off, QMAN_DMA_ENABLE);
> +}
> +
> +static void goya_init_dma_ch(struct hl_device *hdev, int dma_id)
> +{
> +     u32 gic_base_lo, gic_base_hi;
> +     u64 sob_addr;
> +     u32 reg_off = dma_id * (mmDMA_CH_1_CFG1 - mmDMA_CH_0_CFG1);
> +
> +     gic_base_lo =
> +             lower_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
> +     gic_base_hi =
> +             upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
> +
> +     WREG32(mmDMA_CH_0_ERRMSG_ADDR_LO + reg_off, gic_base_lo);
> +     WREG32(mmDMA_CH_0_ERRMSG_ADDR_HI + reg_off, gic_base_hi);
> +     WREG32(mmDMA_CH_0_ERRMSG_WDATA + reg_off,
> +                     GOYA_ASYNC_EVENT_ID_DMA0_CH + dma_id);
> +
> +     if (dma_id) {
> +             sob_addr = CFG_BASE + mmSYNC_MNGR_SOB_OBJ_1000 +
> +                             (dma_id - 1) * 4;
> +             WREG32(mmDMA_CH_0_WR_COMP_ADDR_LO + reg_off,
> +                             lower_32_bits(sob_addr));
> +             WREG32(mmDMA_CH_0_WR_COMP_ADDR_HI + reg_off,
> +                             upper_32_bits(sob_addr));
> +             WREG32(mmDMA_CH_0_WR_COMP_WDATA + reg_off, 0x80000001);
> +     }
> +}
> +
> +/**
> + * goya_init_dma_qmans - Initialize QMAN DMA registers
> + *
> + * @hdev: pointer to hl_device structure
> + *
> + * Initialize the H/W registers of the QMAN DMA channels
> + *
> + */
> +static void goya_init_dma_qmans(struct hl_device *hdev)
> +{
> +     struct goya_device *goya = hdev->asic_specific;
> +     struct hl_hw_queue *q;
> +     dma_addr_t bus_address;
> +     int i;
> +
> +     if (goya->hw_cap_initialized & HW_CAP_DMA)
> +             return;
> +
> +     q = &hdev->kernel_queues[0];
> +
> +     for (i = 0 ; i < NUMBER_OF_EXT_HW_QUEUES ; i++, q++) {
> +             bus_address = q->bus_address +
> +                             hdev->asic_prop.host_phys_base_address;
> +
> +             goya_init_dma_qman(hdev, i, bus_address);
> +             goya_init_dma_ch(hdev, i);
> +     }
> +
> +     goya->hw_cap_initialized |= HW_CAP_DMA;
> +}
> +
> +/**
> + * goya_disable_external_queues - Disable external queues
> + *
> + * @hdev: pointer to hl_device structure
> + *
> + */
> +static void goya_disable_external_queues(struct hl_device *hdev)
> +{
> +     WREG32(mmDMA_QM_0_GLBL_CFG0, 0);
> +     WREG32(mmDMA_QM_1_GLBL_CFG0, 0);
> +     WREG32(mmDMA_QM_2_GLBL_CFG0, 0);
> +     WREG32(mmDMA_QM_3_GLBL_CFG0, 0);
> +     WREG32(mmDMA_QM_4_GLBL_CFG0, 0);
> +}
> +
> +static int goya_stop_queue(struct hl_device *hdev, u32 cfg_reg,
> +                             u32 cp_sts_reg, u32 glbl_sts0_reg)
> +{
> +     int rc;
> +     u32 status;
> +
> +     /* use the values of TPC0 as they are all the same*/
> +
> +     WREG32(cfg_reg, 1 << TPC0_QM_GLBL_CFG1_CP_STOP_SHIFT);
> +
> +     status = RREG32(cp_sts_reg);
> +     if (status & TPC0_QM_CP_STS_FENCE_IN_PROGRESS_MASK) {
> +             rc = hl_poll_timeout(
> +                     hdev,
> +                     cp_sts_reg,
> +                     status,
> +                     !(status & TPC0_QM_CP_STS_FENCE_IN_PROGRESS_MASK),
> +                     1000,
> +                     QMAN_FENCE_TIMEOUT_USEC);
> +
> +             /* if QMAN is stuck in fence no need to check for stop */
> +             if (rc)
> +                     return 0;

Isn't this an error? The fence poll failed, yet the function returns 0.

> +     }
> +
> +     rc = hl_poll_timeout(
> +             hdev,
> +             glbl_sts0_reg,
> +             status,
> +             (status & TPC0_QM_GLBL_STS0_CP_IS_STOP_MASK),
> +             1000,
> +             QMAN_STOP_TIMEOUT_USEC);
> +
> +     if (rc) {
> +             dev_err(hdev->dev,
> +                     "Timeout while waiting for QMAN to stop\n");
> +             return -EINVAL;
> +     }
> +
> +     return 0;
> +}
> +
> +/**
> + * goya_stop_external_queues - Stop external queues
> + *
> + * @hdev: pointer to hl_device structure
> + *
> + * Returns 0 on success
> + *
> + */
> +static int goya_stop_external_queues(struct hl_device *hdev)
> +{
> +     int rc = goya_stop_queue(hdev,
> +                     mmDMA_QM_0_GLBL_CFG1,
> +                     mmDMA_QM_0_CP_STS,
> +                     mmDMA_QM_0_GLBL_STS0);
> +
> +     if (rc)
> +             dev_err(hdev->dev, "failed to stop DMA QMAN 0\n");
> +
> +     rc = goya_stop_queue(hdev,
> +                     mmDMA_QM_1_GLBL_CFG1,
> +                     mmDMA_QM_1_CP_STS,
> +                     mmDMA_QM_1_GLBL_STS0);
> +
> +     if (rc)
> +             dev_err(hdev->dev, "failed to stop DMA QMAN 1\n");
> +
> +     rc = goya_stop_queue(hdev,
> +                     mmDMA_QM_2_GLBL_CFG1,
> +                     mmDMA_QM_2_CP_STS,
> +                     mmDMA_QM_2_GLBL_STS0);
> +
> +     if (rc)
> +             dev_err(hdev->dev, "failed to stop DMA QMAN 2\n");
> +
> +     rc = goya_stop_queue(hdev,
> +                     mmDMA_QM_3_GLBL_CFG1,
> +                     mmDMA_QM_3_CP_STS,
> +                     mmDMA_QM_3_GLBL_STS0);
> +
> +     if (rc)
> +             dev_err(hdev->dev, "failed to stop DMA QMAN 3\n");
> +
> +     rc = goya_stop_queue(hdev,
> +                     mmDMA_QM_4_GLBL_CFG1,
> +                     mmDMA_QM_4_CP_STS,
> +                     mmDMA_QM_4_GLBL_STS0);
> +
> +     if (rc)
> +             dev_err(hdev->dev, "failed to stop DMA QMAN 4\n");
> +
> +     return rc;
> +}
> +
> +static void goya_resume_external_queues(struct hl_device *hdev)
> +{
> +     WREG32(mmDMA_QM_0_GLBL_CFG1, 0);
> +     WREG32(mmDMA_QM_1_GLBL_CFG1, 0);
> +     WREG32(mmDMA_QM_2_GLBL_CFG1, 0);
> +     WREG32(mmDMA_QM_3_GLBL_CFG1, 0);
> +     WREG32(mmDMA_QM_4_GLBL_CFG1, 0);
> +}
> +
> +/**
> + * goya_init_cpu_queues - Initialize PQ/CQ/EQ of CPU
> + *
> + * @hdev: pointer to hl_device structure
> + *
> + * Returns 0 on success
> + *
> + */
> +int goya_init_cpu_queues(struct hl_device *hdev)
> +{
> +     struct goya_device *goya = hdev->asic_specific;
> +     dma_addr_t bus_address;
> +     u32 status;
> +     struct hl_hw_queue *cpu_pq = &hdev->kernel_queues[GOYA_QUEUE_ID_CPU_PQ];
> +     int err;
> +
> +     if (!hdev->cpu_queues_enable)
> +             return 0;
> +
> +     if (goya->hw_cap_initialized & HW_CAP_CPU_Q)
> +             return 0;
> +
> +     bus_address = cpu_pq->bus_address +
> +                     hdev->asic_prop.host_phys_base_address;
> +     WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_0, lower_32_bits(bus_address));
> +     WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_1, upper_32_bits(bus_address));
> +
> +     bus_address = hdev->cpu_accessible_dma_address +
> +                     hdev->asic_prop.host_phys_base_address;
> +     WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_8, lower_32_bits(bus_address));
> +     WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_9, upper_32_bits(bus_address));
> +
> +     WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_5, HL_QUEUE_SIZE_IN_BYTES);
> +     WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_10, CPU_ACCESSIBLE_MEM_SIZE);
> +
> +     /* Used for EQ CI */
> +     WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_6, 0);
> +
> +     WREG32(mmCPU_IF_PF_PQ_PI, 0);
> +
> +     WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_7, PQ_INIT_STATUS_READY_FOR_CP);
> +
> +     WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR,
> +                     GOYA_ASYNC_EVENT_ID_PI_UPDATE);
> +
> +     err = hl_poll_timeout(
> +             hdev,
> +             mmPSOC_GLOBAL_CONF_SCRATCHPAD_7,
> +             status,
> +             (status == PQ_INIT_STATUS_READY_FOR_HOST),
> +             1000,
> +             GOYA_CPU_TIMEOUT_USEC);
> +
> +     if (err) {
> +             dev_err(hdev->dev,
> +                     "Failed to communicate with ARM CPU (ArmCP timeout)\n");
> +             return -EIO;
> +     }
> +
> +     goya->hw_cap_initialized |= HW_CAP_CPU_Q;
> +     return 0;
> +}
> +
>  /**
>   * goya_init_pll - Initialize pll registers
>   *
> @@ -1960,152 +2287,646 @@ static void goya_init_golden_registers(struct 
> hl_device *hdev)
>       goya->hw_cap_initialized |= HW_CAP_GOLDEN;
>  }
>  
> -
> -/**
> - * goya_push_uboot_to_device - Push u-boot FW code to device
> - *
> - * @hdev: pointer to hl_device structure
> - *
> - * Copy u-boot fw code from firmware file to SRAM BAR.
> - * Returns 0 on success
> - *
> - */
> -static int goya_push_uboot_to_device(struct hl_device *hdev)
> +static void goya_init_mme_qman(struct hl_device *hdev)
>  {
> -     char fw_name[200];
> -     const u64 *fw_data;
> -     void __iomem *dst;
> -     size_t fw_size, i;
> -     int rc;
> +     u32 mtr_base_lo, mtr_base_hi;
> +     u32 so_base_lo, so_base_hi;
> +     u32 gic_base_lo, gic_base_hi;
> +     u64 qman_base_addr;
>  
> -     snprintf(fw_name, sizeof(fw_name), "habanalabs/goya/goya-u-boot.bin");
> +     mtr_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
> +     mtr_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
> +     so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
> +     so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
>  
> -     rc = request_firmware(&hdev->spl_fw, fw_name, hdev->dev);
> +     gic_base_lo =
> +             lower_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
> +     gic_base_hi =
> +             upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
>  
> -     if (rc) {
> -             dev_err(hdev->dev, "Failed to request u-boot fw image\n");
> -             goto out;
> -     }
> +     qman_base_addr = hdev->asic_prop.sram_base_address +
> +                             MME_QMAN_BASE_OFFSET;
>  
> -     fw_size = hdev->spl_fw->size;
> -     if ((fw_size % 4) != 0) {
> -             dev_err(hdev->dev, "illegal u-boot firmware size %lu\n",
> -                     fw_size);
> -             rc = -EINVAL;
> -             goto out;
> -     }
> +     WREG32(mmMME_QM_PQ_BASE_LO, lower_32_bits(qman_base_addr));
> +     WREG32(mmMME_QM_PQ_BASE_HI, upper_32_bits(qman_base_addr));
> +     WREG32(mmMME_QM_PQ_SIZE, ilog2(MME_QMAN_LENGTH));
> +     WREG32(mmMME_QM_PQ_PI, 0);
> +     WREG32(mmMME_QM_PQ_CI, 0);
> +     WREG32(mmMME_QM_CP_LDMA_SRC_BASE_LO_OFFSET, 0x10C0);
> +     WREG32(mmMME_QM_CP_LDMA_SRC_BASE_HI_OFFSET, 0x10C4);
> +     WREG32(mmMME_QM_CP_LDMA_TSIZE_OFFSET, 0x10C8);
> +     WREG32(mmMME_QM_CP_LDMA_COMMIT_OFFSET, 0x10CC);
>  
> -     dev_dbg(hdev->dev, "u-boot firmware size == %lu\n", fw_size);
> +     WREG32(mmMME_QM_CP_MSG_BASE0_ADDR_LO, mtr_base_lo);
> +     WREG32(mmMME_QM_CP_MSG_BASE0_ADDR_HI, mtr_base_hi);
> +     WREG32(mmMME_QM_CP_MSG_BASE1_ADDR_LO, so_base_lo);
> +     WREG32(mmMME_QM_CP_MSG_BASE1_ADDR_HI, so_base_hi);
>  
> -     fw_data = (const u64 *) hdev->spl_fw->data;
> -     dst = hdev->pcie_bar[SRAM_CFG_BAR_ID] + UBOOT_FW_OFFSET;
> +     /* QMAN CQ has 8 cache lines */
> +     WREG32(mmMME_QM_CQ_CFG1, 0x00080008);
>  
> -     if ((hdev->spl_fw->size % 8) != 0)
> -             fw_size -= 8;
> +     WREG32(mmMME_QM_GLBL_ERR_ADDR_LO, gic_base_lo);
> +     WREG32(mmMME_QM_GLBL_ERR_ADDR_HI, gic_base_hi);
>  
> -     for (i = 0 ; i < fw_size ; i += 8, fw_data++, dst += 8) {
> -             if (!(i & (0x80000 - 1)))
> -                     dev_dbg(hdev->dev,
> -                             "u-boot copied so far %lu out of %lu",
> -                             i, fw_size);
> +     WREG32(mmMME_QM_GLBL_ERR_WDATA, GOYA_ASYNC_EVENT_ID_MME_QM);
>  
> -             writeq(*fw_data, dst);
> -     }
> +     WREG32(mmMME_QM_GLBL_ERR_CFG, QMAN_MME_ERR_MSG_EN);
>  
> -     if ((hdev->spl_fw->size % 8) != 0)
> -             writel(*(const u32 *) fw_data, dst);
> +     WREG32(mmMME_QM_GLBL_PROT, QMAN_MME_ERR_PROT);
>  
> -out:
> -     release_firmware(hdev->spl_fw);
> -     return rc;
> +     WREG32(mmMME_QM_GLBL_CFG0, QMAN_MME_ENABLE);
>  }
>  
> -/**
> - * goya_push_linux_to_device - Push LINUX FW code to device
> - *
> - * @hdev: pointer to hl_device structure
> - *
> - * Copy LINXU fw code from firmware file to DDR BAR.
> - * Returns 0 on success
> - *
> - */
> -static int goya_push_linux_to_device(struct hl_device *hdev)
> +static void goya_init_mme_cmdq(struct hl_device *hdev)
>  {
> -     char fw_name[200];
> -     const u64 *fw_data;
> -     void __iomem *dst;
> -     size_t fw_size, i;
> -     int rc;
> +     u32 mtr_base_lo, mtr_base_hi;
> +     u32 so_base_lo, so_base_hi;
> +     u32 gic_base_lo, gic_base_hi;
> +     u64 qman_base_addr;
>  
> -     snprintf(fw_name, sizeof(fw_name), "habanalabs/goya/goya-fit.itb");
> +     mtr_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
> +     mtr_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
> +     so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
> +     so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
>  
> -     rc = request_firmware(&hdev->spl_fw, fw_name, hdev->dev);
> +     gic_base_lo =
> +             lower_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
> +     gic_base_hi =
> +             upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
>  
> -     if (rc) {
> -             dev_err(hdev->dev, "Failed to request Linux fw image\n");
> -             goto out;
> -     }
> +     qman_base_addr = hdev->asic_prop.sram_base_address +
> +                             MME_QMAN_BASE_OFFSET;
>  
> -     fw_size = hdev->spl_fw->size;
> -     if ((fw_size % 4) != 0) {
> -             dev_err(hdev->dev, "illegal Linux firmware size %lu\n",
> -                     fw_size);
> -             rc = -EINVAL;
> -             goto out;
> -     }
> +     WREG32(mmMME_CMDQ_CP_MSG_BASE0_ADDR_LO, mtr_base_lo);
> +     WREG32(mmMME_CMDQ_CP_MSG_BASE0_ADDR_HI, mtr_base_hi);
> +     WREG32(mmMME_CMDQ_CP_MSG_BASE1_ADDR_LO, so_base_lo);
> +     WREG32(mmMME_CMDQ_CP_MSG_BASE1_ADDR_HI, so_base_hi);
>  
> -     dev_dbg(hdev->dev, "Linux firmware size == %lu\n", fw_size);
> +     /* CMDQ CQ has 20 cache lines */
> +     WREG32(mmMME_CMDQ_CQ_CFG1, 0x00140014);
>  
> -     fw_data = (const u64 *) hdev->spl_fw->data;
> -     dst = hdev->pcie_bar[DDR_BAR_ID] + LINUX_FW_OFFSET;
> +     WREG32(mmMME_CMDQ_GLBL_ERR_ADDR_LO, gic_base_lo);
> +     WREG32(mmMME_CMDQ_GLBL_ERR_ADDR_HI, gic_base_hi);
>  
> -     if ((hdev->spl_fw->size % 8) != 0)
> -             fw_size -= 8;
> +     WREG32(mmMME_CMDQ_GLBL_ERR_WDATA, GOYA_ASYNC_EVENT_ID_MME_CMDQ);
>  
> -     for (i = 0 ; i < fw_size ; i += 8, fw_data++, dst += 8) {
> -             if (!(i & (0x80000 - 1))) {
> -                     dev_dbg(hdev->dev,
> -                             "Linux copied so far %lu out of %lu",
> -                             i, fw_size);
> -                     usleep_range(20, 100);
> -             }
> -             writeq(*fw_data, dst);
> -     }
> +     WREG32(mmMME_CMDQ_GLBL_ERR_CFG, CMDQ_MME_ERR_MSG_EN);
>  
> -     if ((hdev->spl_fw->size % 8) != 0)
> -             writel(*(const u32 *) fw_data, dst);
> +     WREG32(mmMME_CMDQ_GLBL_PROT, CMDQ_MME_ERR_PROT);
>  
> -out:
> -     release_firmware(hdev->spl_fw);
> -     return rc;
> +     WREG32(mmMME_CMDQ_GLBL_CFG0, CMDQ_MME_ENABLE);
>  }
>  
> -static int goya_pldm_init_cpu(struct hl_device *hdev)
> +static void goya_init_mme_qmans(struct hl_device *hdev)
>  {
> -     u32 val, unit_rst_val;
> -     int rc;
> +     struct goya_device *goya = hdev->asic_specific;
> +     u32 so_base_lo, so_base_hi;
>  
> -     /* Must initialize SRAM scrambler before pushing u-boot to SRAM */
> -     goya_init_golden_registers(hdev);
> +     if (goya->hw_cap_initialized & HW_CAP_MME)
> +             return;
>  
> -     /* Put ARM cores into reset */
> -     WREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL, CPU_RESET_ASSERT);
> -     val = RREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL);
> +     so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
> +     so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
>  
> -     /* Reset the CA53 MACRO */
> -     unit_rst_val = RREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N);
> -     WREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N, CA53_RESET);
> -     val = RREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N);
> -     WREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N, unit_rst_val);
> -     val = RREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N);
> +     WREG32(mmMME_SM_BASE_ADDRESS_LOW, so_base_lo);
> +     WREG32(mmMME_SM_BASE_ADDRESS_HIGH, so_base_hi);
>  
> -     rc = goya_push_uboot_to_device(hdev);
> -     if (rc)
> -             return rc;
> +     goya_init_mme_qman(hdev);
> +     goya_init_mme_cmdq(hdev);
>  
> -     rc = goya_push_linux_to_device(hdev);
> -     if (rc)
> -             return rc;
> +     goya->hw_cap_initialized |= HW_CAP_MME;
> +}
> +
> +static void goya_init_tpc_qman(struct hl_device *hdev, u32 base_off, int 
> tpc_id)
> +{
> +     u32 mtr_base_lo, mtr_base_hi;
> +     u32 so_base_lo, so_base_hi;
> +     u32 gic_base_lo, gic_base_hi;
> +     u64 qman_base_addr;
> +     u32 reg_off = tpc_id * (mmTPC1_QM_PQ_PI - mmTPC0_QM_PQ_PI);
> +
> +     mtr_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
> +     mtr_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
> +     so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
> +     so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
> +
> +     gic_base_lo =
> +             lower_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
> +     gic_base_hi =
> +             upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
> +
> +     qman_base_addr = hdev->asic_prop.sram_base_address + base_off;
> +
> +     WREG32(mmTPC0_QM_PQ_BASE_LO + reg_off, lower_32_bits(qman_base_addr));
> +     WREG32(mmTPC0_QM_PQ_BASE_HI + reg_off, upper_32_bits(qman_base_addr));
> +     WREG32(mmTPC0_QM_PQ_SIZE + reg_off, ilog2(TPC_QMAN_LENGTH));
> +     WREG32(mmTPC0_QM_PQ_PI + reg_off, 0);
> +     WREG32(mmTPC0_QM_PQ_CI + reg_off, 0);
> +     WREG32(mmTPC0_QM_CP_LDMA_SRC_BASE_LO_OFFSET + reg_off, 0x10C0);
> +     WREG32(mmTPC0_QM_CP_LDMA_SRC_BASE_HI_OFFSET + reg_off, 0x10C4);
> +     WREG32(mmTPC0_QM_CP_LDMA_TSIZE_OFFSET + reg_off, 0x10C8);
> +     WREG32(mmTPC0_QM_CP_LDMA_COMMIT_OFFSET + reg_off, 0x10CC);
> +
> +     WREG32(mmTPC0_QM_CP_MSG_BASE0_ADDR_LO + reg_off, mtr_base_lo);
> +     WREG32(mmTPC0_QM_CP_MSG_BASE0_ADDR_HI + reg_off, mtr_base_hi);
> +     WREG32(mmTPC0_QM_CP_MSG_BASE1_ADDR_LO + reg_off, so_base_lo);
> +     WREG32(mmTPC0_QM_CP_MSG_BASE1_ADDR_HI + reg_off, so_base_hi);
> +
> +     WREG32(mmTPC0_QM_CQ_CFG1 + reg_off, 0x00080008);
> +
> +     WREG32(mmTPC0_QM_GLBL_ERR_ADDR_LO + reg_off, gic_base_lo);
> +     WREG32(mmTPC0_QM_GLBL_ERR_ADDR_HI + reg_off, gic_base_hi);
> +
> +     WREG32(mmTPC0_QM_GLBL_ERR_WDATA + reg_off,
> +                     GOYA_ASYNC_EVENT_ID_TPC0_QM + tpc_id);
> +
> +     WREG32(mmTPC0_QM_GLBL_ERR_CFG + reg_off, QMAN_TPC_ERR_MSG_EN);
> +
> +     WREG32(mmTPC0_QM_GLBL_PROT + reg_off, QMAN_TPC_ERR_PROT);
> +
> +     WREG32(mmTPC0_QM_GLBL_CFG0 + reg_off, QMAN_TPC_ENABLE);
> +}
> +
> +static void goya_init_tpc_cmdq(struct hl_device *hdev, int tpc_id)
> +{
> +     u32 mtr_base_lo, mtr_base_hi;
> +     u32 so_base_lo, so_base_hi;
> +     u32 gic_base_lo, gic_base_hi;
> +     u32 reg_off = tpc_id * (mmTPC1_CMDQ_CQ_CFG1 - mmTPC0_CMDQ_CQ_CFG1);
> +
> +     mtr_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
> +     mtr_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
> +     so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
> +     so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
> +
> +     gic_base_lo =
> +             lower_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
> +     gic_base_hi =
> +             upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
> +
> +     WREG32(mmTPC0_CMDQ_CP_MSG_BASE0_ADDR_LO + reg_off, mtr_base_lo);
> +     WREG32(mmTPC0_CMDQ_CP_MSG_BASE0_ADDR_HI + reg_off, mtr_base_hi);
> +     WREG32(mmTPC0_CMDQ_CP_MSG_BASE1_ADDR_LO + reg_off, so_base_lo);
> +     WREG32(mmTPC0_CMDQ_CP_MSG_BASE1_ADDR_HI + reg_off, so_base_hi);
> +
> +     WREG32(mmTPC0_CMDQ_CQ_CFG1 + reg_off, 0x00140014);
> +
> +     WREG32(mmTPC0_CMDQ_GLBL_ERR_ADDR_LO + reg_off, gic_base_lo);
> +     WREG32(mmTPC0_CMDQ_GLBL_ERR_ADDR_HI + reg_off, gic_base_hi);
> +
> +     WREG32(mmTPC0_CMDQ_GLBL_ERR_WDATA + reg_off,
> +                     GOYA_ASYNC_EVENT_ID_TPC0_CMDQ + tpc_id);
> +
> +     WREG32(mmTPC0_CMDQ_GLBL_ERR_CFG + reg_off, CMDQ_TPC_ERR_MSG_EN);
> +
> +     WREG32(mmTPC0_CMDQ_GLBL_PROT + reg_off, CMDQ_TPC_ERR_PROT);
> +
> +     WREG32(mmTPC0_CMDQ_GLBL_CFG0 + reg_off, CMDQ_TPC_ENABLE);
> +}
> +
> +static void goya_init_tpc_qmans(struct hl_device *hdev)
> +{
> +     struct goya_device *goya = hdev->asic_specific;
> +     u32 so_base_lo, so_base_hi;
> +     u32 cfg_off = mmTPC1_CFG_SM_BASE_ADDRESS_LOW -
> +                     mmTPC0_CFG_SM_BASE_ADDRESS_LOW;
> +     int i;
> +
> +     if (goya->hw_cap_initialized & HW_CAP_TPC)
> +             return;
> +
> +     so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
> +     so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
> +
> +     for (i = 0 ; i < TPC_MAX_NUM ; i++) {
> +             WREG32(mmTPC0_CFG_SM_BASE_ADDRESS_LOW + i * cfg_off,
> +                             so_base_lo);
> +             WREG32(mmTPC0_CFG_SM_BASE_ADDRESS_HIGH + i * cfg_off,
> +                             so_base_hi);
> +     }
> +
> +     goya_init_tpc_qman(hdev, TPC0_QMAN_BASE_OFFSET, 0);
> +     goya_init_tpc_qman(hdev, TPC1_QMAN_BASE_OFFSET, 1);
> +     goya_init_tpc_qman(hdev, TPC2_QMAN_BASE_OFFSET, 2);
> +     goya_init_tpc_qman(hdev, TPC3_QMAN_BASE_OFFSET, 3);
> +     goya_init_tpc_qman(hdev, TPC4_QMAN_BASE_OFFSET, 4);
> +     goya_init_tpc_qman(hdev, TPC5_QMAN_BASE_OFFSET, 5);
> +     goya_init_tpc_qman(hdev, TPC6_QMAN_BASE_OFFSET, 6);
> +     goya_init_tpc_qman(hdev, TPC7_QMAN_BASE_OFFSET, 7);
> +
> +     for (i = 0 ; i < TPC_MAX_NUM ; i++)
> +             goya_init_tpc_cmdq(hdev, i);
> +
> +     goya->hw_cap_initialized |= HW_CAP_TPC;
> +}
> +
> +/**
> + * goya_disable_internal_queues - Disable internal queues
> + *
> + * @hdev: pointer to hl_device structure
> + *
> + */
> +static void goya_disable_internal_queues(struct hl_device *hdev)
> +{
> +     WREG32(mmMME_QM_GLBL_CFG0, 0);
> +     WREG32(mmMME_CMDQ_GLBL_CFG0, 0);
> +
> +     WREG32(mmTPC0_QM_GLBL_CFG0, 0);
> +     WREG32(mmTPC0_CMDQ_GLBL_CFG0, 0);
> +
> +     WREG32(mmTPC1_QM_GLBL_CFG0, 0);
> +     WREG32(mmTPC1_CMDQ_GLBL_CFG0, 0);
> +
> +     WREG32(mmTPC2_QM_GLBL_CFG0, 0);
> +     WREG32(mmTPC2_CMDQ_GLBL_CFG0, 0);
> +
> +     WREG32(mmTPC3_QM_GLBL_CFG0, 0);
> +     WREG32(mmTPC3_CMDQ_GLBL_CFG0, 0);
> +
> +     WREG32(mmTPC4_QM_GLBL_CFG0, 0);
> +     WREG32(mmTPC4_CMDQ_GLBL_CFG0, 0);
> +
> +     WREG32(mmTPC5_QM_GLBL_CFG0, 0);
> +     WREG32(mmTPC5_CMDQ_GLBL_CFG0, 0);
> +
> +     WREG32(mmTPC6_QM_GLBL_CFG0, 0);
> +     WREG32(mmTPC6_CMDQ_GLBL_CFG0, 0);
> +
> +     WREG32(mmTPC7_QM_GLBL_CFG0, 0);
> +     WREG32(mmTPC7_CMDQ_GLBL_CFG0, 0);
> +}
> +
> +/**
> + * goya_stop_internal_queues - Stop internal queues
> + *
> + * @hdev: pointer to hl_device structure
> + *
> + * Returns 0 on success
> + *
> + */
> +static int goya_stop_internal_queues(struct hl_device *hdev)
> +{
> +     int rc, retval = 0;
> +
> +     rc = goya_stop_queue(hdev,
> +                     mmMME_QM_GLBL_CFG1,
> +                     mmMME_QM_CP_STS,
> +                     mmMME_QM_GLBL_STS0);
> +
> +     if (rc) {
> +             dev_err(hdev->dev, "failed to stop MME QMAN\n");
> +             retval = -EIO;
> +     }
> +
> +     rc = goya_stop_queue(hdev,
> +                     mmMME_CMDQ_GLBL_CFG1,
> +                     mmMME_CMDQ_CP_STS,
> +                     mmMME_CMDQ_GLBL_STS0);
> +
> +     if (rc) {
> +             dev_err(hdev->dev, "failed to stop MME CMDQ\n");
> +             retval = -EIO;
> +     }

If I understand correctly, the queues can and should be stopped
independently, and a failure to stop one of them does not prevent
stopping the others.
If that's the case, a comment explaining that would be nice.

> +     rc = goya_stop_queue(hdev,
> +                     mmTPC0_QM_GLBL_CFG1,
> +                     mmTPC0_QM_CP_STS,
> +                     mmTPC0_QM_GLBL_STS0);
> +
> +     if (rc) {
> +             dev_err(hdev->dev, "failed to stop TPC 0 QMAN\n");
> +             retval = -EIO;
> +     }
> +
> +     rc = goya_stop_queue(hdev,
> +                     mmTPC0_CMDQ_GLBL_CFG1,
> +                     mmTPC0_CMDQ_CP_STS,
> +                     mmTPC0_CMDQ_GLBL_STS0);
> +
> +     if (rc) {
> +             dev_err(hdev->dev, "failed to stop TPC 0 CMDQ\n");
> +             retval = -EIO;
> +     }
> +
> +     rc = goya_stop_queue(hdev,
> +                     mmTPC1_QM_GLBL_CFG1,
> +                     mmTPC1_QM_CP_STS,
> +                     mmTPC1_QM_GLBL_STS0);
> +
> +     if (rc) {
> +             dev_err(hdev->dev, "failed to stop TPC 1 QMAN\n");
> +             retval = -EIO;
> +     }
> +
> +     rc = goya_stop_queue(hdev,
> +                     mmTPC1_CMDQ_GLBL_CFG1,
> +                     mmTPC1_CMDQ_CP_STS,
> +                     mmTPC1_CMDQ_GLBL_STS0);
> +
> +     if (rc) {
> +             dev_err(hdev->dev, "failed to stop TPC 1 CMDQ\n");
> +             retval = -EIO;
> +     }
> +
> +     rc = goya_stop_queue(hdev,
> +                     mmTPC2_QM_GLBL_CFG1,
> +                     mmTPC2_QM_CP_STS,
> +                     mmTPC2_QM_GLBL_STS0);
> +
> +     if (rc) {
> +             dev_err(hdev->dev, "failed to stop TPC 2 QMAN\n");
> +             retval = -EIO;
> +     }
> +
> +     rc = goya_stop_queue(hdev,
> +                     mmTPC2_CMDQ_GLBL_CFG1,
> +                     mmTPC2_CMDQ_CP_STS,
> +                     mmTPC2_CMDQ_GLBL_STS0);
> +
> +     if (rc) {
> +             dev_err(hdev->dev, "failed to stop TPC 2 CMDQ\n");
> +             retval = -EIO;
> +     }
> +
> +     rc = goya_stop_queue(hdev,
> +                     mmTPC3_QM_GLBL_CFG1,
> +                     mmTPC3_QM_CP_STS,
> +                     mmTPC3_QM_GLBL_STS0);
> +
> +     if (rc) {
> +             dev_err(hdev->dev, "failed to stop TPC 3 QMAN\n");
> +             retval = -EIO;
> +     }
> +
> +     rc = goya_stop_queue(hdev,
> +                     mmTPC3_CMDQ_GLBL_CFG1,
> +                     mmTPC3_CMDQ_CP_STS,
> +                     mmTPC3_CMDQ_GLBL_STS0);
> +
> +     if (rc) {
> +             dev_err(hdev->dev, "failed to stop TPC 3 CMDQ\n");
> +             retval = -EIO;
> +     }
> +
> +     rc = goya_stop_queue(hdev,
> +                     mmTPC4_QM_GLBL_CFG1,
> +                     mmTPC4_QM_CP_STS,
> +                     mmTPC4_QM_GLBL_STS0);
> +
> +     if (rc) {
> +             dev_err(hdev->dev, "failed to stop TPC 4 QMAN\n");
> +             retval = -EIO;
> +     }
> +
> +     rc = goya_stop_queue(hdev,
> +                     mmTPC4_CMDQ_GLBL_CFG1,
> +                     mmTPC4_CMDQ_CP_STS,
> +                     mmTPC4_CMDQ_GLBL_STS0);
> +
> +     if (rc) {
> +             dev_err(hdev->dev, "failed to stop TPC 4 CMDQ\n");
> +             retval = -EIO;
> +     }
> +
> +     rc = goya_stop_queue(hdev,
> +                     mmTPC5_QM_GLBL_CFG1,
> +                     mmTPC5_QM_CP_STS,
> +                     mmTPC5_QM_GLBL_STS0);
> +
> +     if (rc) {
> +             dev_err(hdev->dev, "failed to stop TPC 5 QMAN\n");
> +             retval = -EIO;
> +     }
> +
> +     rc = goya_stop_queue(hdev,
> +                     mmTPC5_CMDQ_GLBL_CFG1,
> +                     mmTPC5_CMDQ_CP_STS,
> +                     mmTPC5_CMDQ_GLBL_STS0);
> +
> +     if (rc) {
> +             dev_err(hdev->dev, "failed to stop TPC 5 CMDQ\n");
> +             retval = -EIO;
> +     }
> +
> +     rc = goya_stop_queue(hdev,
> +                     mmTPC6_QM_GLBL_CFG1,
> +                     mmTPC6_QM_CP_STS,
> +                     mmTPC6_QM_GLBL_STS0);
> +
> +     if (rc) {
> +             dev_err(hdev->dev, "failed to stop TPC 6 QMAN\n");
> +             retval = -EIO;
> +     }
> +
> +     rc = goya_stop_queue(hdev,
> +                     mmTPC6_CMDQ_GLBL_CFG1,
> +                     mmTPC6_CMDQ_CP_STS,
> +                     mmTPC6_CMDQ_GLBL_STS0);
> +
> +     if (rc) {
> +             dev_err(hdev->dev, "failed to stop TPC 6 CMDQ\n");
> +             retval = -EIO;
> +     }
> +
> +     rc = goya_stop_queue(hdev,
> +                     mmTPC7_QM_GLBL_CFG1,
> +                     mmTPC7_QM_CP_STS,
> +                     mmTPC7_QM_GLBL_STS0);
> +
> +     if (rc) {
> +             dev_err(hdev->dev, "failed to stop TPC 7 QMAN\n");
> +             retval = -EIO;
> +     }
> +
> +     rc = goya_stop_queue(hdev,
> +                     mmTPC7_CMDQ_GLBL_CFG1,
> +                     mmTPC7_CMDQ_CP_STS,
> +                     mmTPC7_CMDQ_GLBL_STS0);
> +
> +     if (rc) {
> +             dev_err(hdev->dev, "failed to stop TPC 7 CMDQ\n");
> +             retval = -EIO;
> +     }
> +
> +     return rc;
> +}
> +
> +static void goya_resume_internal_queues(struct hl_device *hdev)
> +{
> +     WREG32(mmMME_QM_GLBL_CFG1, 0);
> +     WREG32(mmMME_CMDQ_GLBL_CFG1, 0);
> +
> +     WREG32(mmTPC0_QM_GLBL_CFG1, 0);
> +     WREG32(mmTPC0_CMDQ_GLBL_CFG1, 0);
> +
> +     WREG32(mmTPC1_QM_GLBL_CFG1, 0);
> +     WREG32(mmTPC1_CMDQ_GLBL_CFG1, 0);
> +
> +     WREG32(mmTPC2_QM_GLBL_CFG1, 0);
> +     WREG32(mmTPC2_CMDQ_GLBL_CFG1, 0);
> +
> +     WREG32(mmTPC3_QM_GLBL_CFG1, 0);
> +     WREG32(mmTPC3_CMDQ_GLBL_CFG1, 0);
> +
> +     WREG32(mmTPC4_QM_GLBL_CFG1, 0);
> +     WREG32(mmTPC4_CMDQ_GLBL_CFG1, 0);
> +
> +     WREG32(mmTPC5_QM_GLBL_CFG1, 0);
> +     WREG32(mmTPC5_CMDQ_GLBL_CFG1, 0);
> +
> +     WREG32(mmTPC6_QM_GLBL_CFG1, 0);
> +     WREG32(mmTPC6_CMDQ_GLBL_CFG1, 0);
> +
> +     WREG32(mmTPC7_QM_GLBL_CFG1, 0);
> +     WREG32(mmTPC7_CMDQ_GLBL_CFG1, 0);
> +}
> +
> +
> +/**
> + * goya_push_uboot_to_device - Push u-boot FW code to device
> + *
> + * @hdev: pointer to hl_device structure
> + *
> + * Copy u-boot fw code from firmware file to SRAM BAR.
> + * Returns 0 on success
> + *
> + */
> +static int goya_push_uboot_to_device(struct hl_device *hdev)
> +{
> +     char fw_name[200];
> +     const u64 *fw_data;
> +     void __iomem *dst;
> +     size_t fw_size, i;
> +     int rc;
> +
> +     snprintf(fw_name, sizeof(fw_name), "habanalabs/goya/goya-u-boot.bin");
> +
> +     rc = request_firmware(&hdev->spl_fw, fw_name, hdev->dev);
> +
> +     if (rc) {
> +             dev_err(hdev->dev, "Failed to request u-boot fw image\n");
> +             goto out;
> +     }
> +
> +     fw_size = hdev->spl_fw->size;
> +     if ((fw_size % 4) != 0) {
> +             dev_err(hdev->dev, "illegal u-boot firmware size %lu\n",
> +                     fw_size);
> +             rc = -EINVAL;
> +             goto out;
> +     }
> +
> +     dev_dbg(hdev->dev, "u-boot firmware size == %lu\n", fw_size);
> +
> +     fw_data = (const u64 *) hdev->spl_fw->data;
> +     dst = hdev->pcie_bar[SRAM_CFG_BAR_ID] + UBOOT_FW_OFFSET;
> +
> +     if ((hdev->spl_fw->size % 8) != 0)
> +             fw_size -= 8;
> +
> +     for (i = 0 ; i < fw_size ; i += 8, fw_data++, dst += 8) {
> +             if (!(i & (0x80000 - 1)))
> +                     dev_dbg(hdev->dev,
> +                             "u-boot copied so far %lu out of %lu",
> +                             i, fw_size);
> +
> +             writeq(*fw_data, dst);
> +     }
> +
> +     if ((hdev->spl_fw->size % 8) != 0)
> +             writel(*(const u32 *) fw_data, dst);
> +
> +out:
> +     release_firmware(hdev->spl_fw);
> +     return rc;
> +}
> +
> +/**
> + * goya_push_linux_to_device - Push LINUX FW code to device
> + *
> + * @hdev: pointer to hl_device structure
> + *
> + * Copy LINXU fw code from firmware file to DDR BAR.
> + * Returns 0 on success
> + *
> + */
> +static int goya_push_linux_to_device(struct hl_device *hdev)
> +{
> +     char fw_name[200];
> +     const u64 *fw_data;
> +     void __iomem *dst;
> +     size_t fw_size, i;
> +     int rc;
> +
> +     snprintf(fw_name, sizeof(fw_name), "habanalabs/goya/goya-fit.itb");
> +
> +     rc = request_firmware(&hdev->spl_fw, fw_name, hdev->dev);
> +
> +     if (rc) {
> +             dev_err(hdev->dev, "Failed to request Linux fw image\n");
> +             goto out;
> +     }
> +
> +     fw_size = hdev->spl_fw->size;
> +     if ((fw_size % 4) != 0) {
> +             dev_err(hdev->dev, "illegal Linux firmware size %lu\n",
> +                     fw_size);
> +             rc = -EINVAL;
> +             goto out;
> +     }
> +
> +     dev_dbg(hdev->dev, "Linux firmware size == %lu\n", fw_size);
> +
> +     fw_data = (const u64 *) hdev->spl_fw->data;
> +     dst = hdev->pcie_bar[DDR_BAR_ID] + LINUX_FW_OFFSET;
> +
> +     if ((hdev->spl_fw->size % 8) != 0)
> +             fw_size -= 8;
> +
> +     for (i = 0 ; i < fw_size ; i += 8, fw_data++, dst += 8) {
> +             if (!(i & (0x80000 - 1))) {
> +                     dev_dbg(hdev->dev,
> +                             "Linux copied so far %lu out of %lu",
> +                             i, fw_size);
> +                     usleep_range(20, 100);
> +             }
> +             writeq(*fw_data, dst);
> +     }
> +
> +     if ((hdev->spl_fw->size % 8) != 0)
> +             writel(*(const u32 *) fw_data, dst);
> +
> +out:
> +     release_firmware(hdev->spl_fw);
> +     return rc;
> +}
> +
> +static int goya_pldm_init_cpu(struct hl_device *hdev)
> +{
> +     u32 val, unit_rst_val;
> +     int rc;
> +
> +     /* Must initialize SRAM scrambler before pushing u-boot to SRAM */
> +     goya_init_golden_registers(hdev);
> +
> +     /* Put ARM cores into reset */
> +     WREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL, CPU_RESET_ASSERT);
> +     val = RREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL);
> +
> +     /* Reset the CA53 MACRO */
> +     unit_rst_val = RREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N);
> +     WREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N, CA53_RESET);
> +     val = RREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N);
> +     WREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N, unit_rst_val);
> +     val = RREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N);
> +
> +     rc = goya_push_uboot_to_device(hdev);
> +     if (rc)
> +             return rc;
> +
> +     rc = goya_push_linux_to_device(hdev);
> +     if (rc)
> +             return rc;
>  
>       WREG32(mmPSOC_GLOBAL_CONF_UBOOT_MAGIC, KMD_MSG_FIT_RDY);
>       WREG32(mmPSOC_GLOBAL_CONF_WARM_REBOOT, CPU_BOOT_STATUS_NA);
> @@ -2339,6 +3160,19 @@ static int goya_hw_init(struct hl_device *hdev)
>  
>       goya_init_security(hdev);
>  
> +     goya_init_dma_qmans(hdev);
> +
> +     goya_init_mme_qmans(hdev);
> +
> +     goya_init_tpc_qmans(hdev);
> +
> +     rc = goya_init_cpu_queues(hdev);
> +     if (rc) {
> +             dev_err(hdev->dev, "failed to initialize CPU H/W queues %d\n",
> +                     rc);
> +             goto disable_queues;
> +     }
> +
>       /* CPU initialization is finished, we can now move to 48 bit DMA mask */
>       rc = pci_set_dma_mask(hdev->pdev, DMA_BIT_MASK(48));
>       if (rc) {
> @@ -2347,7 +3181,7 @@ static int goya_hw_init(struct hl_device *hdev)
>               if (rc) {
>                       dev_err(hdev->dev,
>                               "Unable to set pci dma mask to 32 bits\n");
> -                     return rc;
> +                     goto disable_pci_access;
>               }
>       }
>  
> @@ -2359,7 +3193,7 @@ static int goya_hw_init(struct hl_device *hdev)
>               if (rc) {
>                       dev_err(hdev->dev,
>                               "Unable to set pci consistent dma mask to 32 bits\n");
> -                     return rc;
> +                     goto disable_pci_access;
>               }
>       }
>  
> @@ -2367,6 +3201,14 @@ static int goya_hw_init(struct hl_device *hdev)
>       val = RREG32(mmPCIE_DBI_DEVICE_ID_VENDOR_ID_REG);
>  
>       return 0;
> +
> +disable_pci_access:
> +     goya_send_pci_access_msg(hdev, ARMCP_PACKET_DISABLE_PCI_ACCESS);
> +disable_queues:
> +     goya_disable_internal_queues(hdev);
> +     goya_disable_external_queues(hdev);
> +
> +     return rc;
>  }
>  
>  /**
> @@ -2473,12 +3315,40 @@ static void goya_hw_fini(struct hl_device *hdev, bool hard_reset)
>  
>  int goya_suspend(struct hl_device *hdev)
>  {
> -     return 0;
> +     int rc;
> +
> +     rc = goya_stop_internal_queues(hdev);
> +
> +     if (rc) {
> +             dev_err(hdev->dev, "failed to stop internal queues\n");
> +             return rc;
> +     }
> +
> +     rc = goya_stop_external_queues(hdev);
> +
> +     if (rc) {
> +             dev_err(hdev->dev, "failed to stop external queues\n");
> +             return rc;
> +     }
> +
> +     rc = goya_send_pci_access_msg(hdev, ARMCP_PACKET_DISABLE_PCI_ACCESS);
> +     if (rc)
> +             dev_err(hdev->dev, "Failed to disable PCI access from CPU\n");
> +
> +     return rc;
>  }
>  
>  int goya_resume(struct hl_device *hdev)
>  {
> -     return 0;
> +     int rc;
> +
> +     goya_resume_external_queues(hdev);
> +     goya_resume_internal_queues(hdev);
> +
> +     rc = goya_send_pci_access_msg(hdev, ARMCP_PACKET_ENABLE_PCI_ACCESS);
> +     if (rc)
> +             dev_err(hdev->dev, "Failed to enable PCI access from CPU\n");
> +     return rc;
>  }
>  
>  int goya_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
> @@ -2502,6 +3372,104 @@ int goya_cb_mmap(struct hl_device *hdev, struct vm_area_struct *vma,
>       return rc;
>  }
>  
> +void goya_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi)
> +{
> +     u32 db_reg_offset, db_value;
> +     bool invalid_queue = false;
> +
> +     switch (hw_queue_id) {
> +     case GOYA_QUEUE_ID_DMA_0:
> +             db_reg_offset = mmDMA_QM_0_PQ_PI;
> +             break;
> +
> +     case GOYA_QUEUE_ID_DMA_1:
> +             db_reg_offset = mmDMA_QM_1_PQ_PI;
> +             break;
> +
> +     case GOYA_QUEUE_ID_DMA_2:
> +             db_reg_offset = mmDMA_QM_2_PQ_PI;
> +             break;
> +
> +     case GOYA_QUEUE_ID_DMA_3:
> +             db_reg_offset = mmDMA_QM_3_PQ_PI;
> +             break;
> +
> +     case GOYA_QUEUE_ID_DMA_4:
> +             db_reg_offset = mmDMA_QM_4_PQ_PI;
> +             break;
> +
> +     case GOYA_QUEUE_ID_CPU_PQ:
> +             if (hdev->cpu_queues_enable)
> +                     db_reg_offset = mmCPU_IF_PF_PQ_PI;
> +             else
> +                     invalid_queue = true;
> +             break;
> +
> +     case GOYA_QUEUE_ID_MME:
> +             db_reg_offset = mmMME_QM_PQ_PI;
> +             break;
> +
> +     case GOYA_QUEUE_ID_TPC0:
> +             db_reg_offset = mmTPC0_QM_PQ_PI;
> +             break;
> +
> +     case GOYA_QUEUE_ID_TPC1:
> +             db_reg_offset = mmTPC1_QM_PQ_PI;
> +             break;
> +
> +     case GOYA_QUEUE_ID_TPC2:
> +             db_reg_offset = mmTPC2_QM_PQ_PI;
> +             break;
> +
> +     case GOYA_QUEUE_ID_TPC3:
> +             db_reg_offset = mmTPC3_QM_PQ_PI;
> +             break;
> +
> +     case GOYA_QUEUE_ID_TPC4:
> +             db_reg_offset = mmTPC4_QM_PQ_PI;
> +             break;
> +
> +     case GOYA_QUEUE_ID_TPC5:
> +             db_reg_offset = mmTPC5_QM_PQ_PI;
> +             break;
> +
> +     case GOYA_QUEUE_ID_TPC6:
> +             db_reg_offset = mmTPC6_QM_PQ_PI;
> +             break;
> +
> +     case GOYA_QUEUE_ID_TPC7:
> +             db_reg_offset = mmTPC7_QM_PQ_PI;
> +             break;
> +
> +     default:
> +             invalid_queue = true;
> +     }
> +
> +     if (invalid_queue) {
> +             /* Should never get here */
> +             dev_err(hdev->dev, "h/w queue %d is invalid. Can't set pi\n",
> +                     hw_queue_id);
> +             return;
> +     }
> +
> +     db_value = pi;
> +
> +     if (hdev->ifh)
> +             return;
> +
> +     /* ring the doorbell */
> +     WREG32(db_reg_offset, db_value);
> +
> +     if (hw_queue_id == GOYA_QUEUE_ID_CPU_PQ)
> +             WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR,
> +                             GOYA_ASYNC_EVENT_ID_PI_UPDATE);
> +}
> +
> +void goya_flush_pq_write(struct hl_device *hdev, u64 *pq, u64 exp_val)
> +{
> +     /* Not needed in Goya */
> +}
> +
>  void *goya_dma_alloc_coherent(struct hl_device *hdev, size_t size,
>                                       dma_addr_t *dma_handle, gfp_t flags)
>  {
> @@ -2514,6 +3482,311 @@ void goya_dma_free_coherent(struct hl_device *hdev, size_t size, void *cpu_addr,
>       dma_free_coherent(&hdev->pdev->dev, size, cpu_addr, dma_handle);
>  }
>  
> +void *goya_get_int_queue_base(struct hl_device *hdev, u32 queue_id,
> +                             dma_addr_t *dma_handle, u16 *queue_len)
> +{
> +     void *base;
> +     u32 offset;
> +
> +     *dma_handle = hdev->asic_prop.sram_base_address;
> +
> +     base = hdev->pcie_bar[SRAM_CFG_BAR_ID];
> +
> +     switch (queue_id) {
> +     case GOYA_QUEUE_ID_MME:
> +             offset = MME_QMAN_BASE_OFFSET;
> +             *queue_len = MME_QMAN_LENGTH;
> +             break;
> +     case GOYA_QUEUE_ID_TPC0:
> +             offset = TPC0_QMAN_BASE_OFFSET;
> +             *queue_len = TPC_QMAN_LENGTH;
> +             break;
> +     case GOYA_QUEUE_ID_TPC1:
> +             offset = TPC1_QMAN_BASE_OFFSET;
> +             *queue_len = TPC_QMAN_LENGTH;
> +             break;
> +     case GOYA_QUEUE_ID_TPC2:
> +             offset = TPC2_QMAN_BASE_OFFSET;
> +             *queue_len = TPC_QMAN_LENGTH;
> +             break;
> +     case GOYA_QUEUE_ID_TPC3:
> +             offset = TPC3_QMAN_BASE_OFFSET;
> +             *queue_len = TPC_QMAN_LENGTH;
> +             break;
> +     case GOYA_QUEUE_ID_TPC4:
> +             offset = TPC4_QMAN_BASE_OFFSET;
> +             *queue_len = TPC_QMAN_LENGTH;
> +             break;
> +     case GOYA_QUEUE_ID_TPC5:
> +             offset = TPC5_QMAN_BASE_OFFSET;
> +             *queue_len = TPC_QMAN_LENGTH;
> +             break;
> +     case GOYA_QUEUE_ID_TPC6:
> +             offset = TPC6_QMAN_BASE_OFFSET;
> +             *queue_len = TPC_QMAN_LENGTH;
> +             break;
> +     case GOYA_QUEUE_ID_TPC7:
> +             offset = TPC7_QMAN_BASE_OFFSET;
> +             *queue_len = TPC_QMAN_LENGTH;
> +             break;
> +     default:
> +             dev_err(hdev->dev, "Got invalid queue id %d\n", queue_id);
> +             return NULL;
> +     }
> +
> +     base += offset;
> +     *dma_handle += offset;
> +
> +     return base;
> +}
> +
> +int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len,
> +                             u32 timeout, long *result)
> +{
> +     struct goya_device *goya = hdev->asic_specific;
> +     struct armcp_packet *pkt;
> +     dma_addr_t pkt_dma_addr;
> +     u32 tmp;
> +     int rc = 0;
> +
> +     if (!(goya->hw_cap_initialized & HW_CAP_CPU_Q)) {
> +             if (result)
> +                     *result = 0;
> +             return 0;
> +     }
> +
> +     if (len > CPU_CB_SIZE) {
> +             dev_err(hdev->dev, "Invalid CPU message size of %d bytes\n",
> +                     len);
> +             return -ENOMEM;
> +     }
> +
> +     pkt = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, len,
> +                                                             &pkt_dma_addr);
> +     if (!pkt) {
> +             dev_err(hdev->dev,
> +                     "Failed to allocate DMA memory for packet to CPU\n");
> +             return -ENOMEM;
> +     }
> +
> +     memcpy(pkt, msg, len);
> +
> +     mutex_lock(&hdev->send_cpu_message_lock);
> +
> +     if (hdev->disabled)
> +             goto out;
> +
> +     rc = hl_hw_queue_send_cb_no_cmpl(hdev, GOYA_QUEUE_ID_CPU_PQ, len,
> +                     pkt_dma_addr);
> +     if (rc) {
> +             dev_err(hdev->dev, "Failed to send CB on CPU PQ (%d)\n", rc);
> +             goto out;
> +     }
> +
> +     rc = hl_poll_timeout_memory(hdev, (u64) &pkt->fence, timeout, &tmp);
> +
> +     hl_hw_queue_inc_ci_kernel(hdev, GOYA_QUEUE_ID_CPU_PQ);
> +
> +     if (rc == -ETIMEDOUT) {
> +             dev_err(hdev->dev,
> +                     "Timeout while waiting for CPU packet fence\n");
> +             goto out;
> +     }
> +
> +     if (tmp == ARMCP_PACKET_FENCE_VAL) {
> +             if (pkt->rc) {
> +                     dev_err(hdev->dev,
> +                             "failed to execute CPU packet, rc: %d\n",
> +                                     pkt->rc);
> +                     rc = -EINVAL;
> +             } else if (result) {
> +                     *result = pkt->result;

In some of the error paths above, *result is left uninitialized.

> +             }
> +     } else {
> +             dev_err(hdev->dev, "CPU packet wrong fence value\n");
> +             rc = -EINVAL;
> +     }
> +
> +out:
> +     mutex_unlock(&hdev->send_cpu_message_lock);
> +
> +     hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, len, pkt);
> +
> +     return rc;
> +}
> +
> +int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id)
> +{
> +     struct packet_msg_prot *fence_pkt;
> +     dma_addr_t pkt_dma_addr;
> +     u32 fence_val, tmp;
> +     dma_addr_t fence_dma_addr;
> +     u32 *fence_ptr;
> +     int rc;
> +
> +     fence_val = GOYA_QMAN0_FENCE_VAL;
> +
> +     fence_ptr = hdev->asic_funcs->dma_pool_zalloc(hdev, 4, GFP_KERNEL,
> +                                                     &fence_dma_addr);
> +     if (!fence_ptr) {
> +             dev_err(hdev->dev,
> +                     "Failed to allocate memory for queue testing\n");
> +             return -ENOMEM;
> +     }
> +
> +     *fence_ptr = 0;
> +
> +     fence_pkt = hdev->asic_funcs->dma_pool_zalloc(hdev,
> +                                     sizeof(struct packet_msg_prot),
> +                                     GFP_KERNEL, &pkt_dma_addr);
> +     if (!fence_pkt) {
> +             dev_err(hdev->dev,
> +                     "Failed to allocate packet for queue testing\n");
> +             rc = -ENOMEM;
> +             goto free_fence_ptr;
> +     }
> +
> +     fence_pkt->opcode = PACKET_MSG_PROT;
> +     fence_pkt->value = fence_val;
> +     fence_pkt->addr = fence_dma_addr +
> +                             hdev->asic_prop.host_phys_base_address;
> +
> +     rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id,
> +                                     sizeof(struct packet_msg_prot),
> +                                     pkt_dma_addr);
> +     if (rc) {
> +             dev_err(hdev->dev,
> +                     "Failed to send fence packet\n");
> +             goto free_pkt;
> +     }
> +
> +     rc = hl_poll_timeout_memory(hdev, (u64) fence_ptr,
> +                                     GOYA_TEST_QUEUE_WAIT_USEC, &tmp);
> +
> +     hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id);
> +
> +     if ((!rc) && (tmp == fence_val)) {
> +             dev_info(hdev->dev,
> +                     "queue test on H/W queue %d succeeded\n",
> +                     hw_queue_id);
> +     } else {
> +             dev_err(hdev->dev,
> +                     "H/W queue %d test failed (scratch(0x%08llX) == 0x%08X)\n",
> +                     hw_queue_id, fence_dma_addr, tmp);
> +             rc = -EINVAL;
> +     }
> +
> +free_pkt:
> +     hdev->asic_funcs->dma_pool_free(hdev, (void *) fence_pkt,
> +                                     pkt_dma_addr);
> +free_fence_ptr:
> +     hdev->asic_funcs->dma_pool_free(hdev, (void *) fence_ptr,
> +                                     fence_dma_addr);
> +     return rc;
> +}
> +
> +int goya_test_cpu_queue(struct hl_device *hdev)
> +{
> +     struct armcp_packet test_pkt;
> +     long result;
> +     int rc;
> +
> +     /* cpu_queues_enable flag is always checked in send cpu message */
> +
> +     memset(&test_pkt, 0, sizeof(test_pkt));
> +
> +     test_pkt.opcode = ARMCP_PACKET_TEST;
> +     test_pkt.value = ARMCP_PACKET_FENCE_VAL;
> +
> +     rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &test_pkt,
> +                     sizeof(test_pkt), HL_DEVICE_TIMEOUT_USEC, &result);
> +
> +     if (!rc)
> +             dev_info(hdev->dev, "queue test on CPU queue succeeded\n");
> +     else
> +             dev_err(hdev->dev, "CPU queue test failed (0x%08lX)\n", result);
> +
> +     return rc;
> +}
> +
> +static int goya_test_queues(struct hl_device *hdev)
> +{
> +     struct goya_device *goya = hdev->asic_specific;
> +     int i, rc, ret_val = 0;
> +
> +     if (hdev->ifh)
> +             return 0;
> +
> +     for (i = 0 ; i < NUMBER_OF_EXT_HW_QUEUES ; i++) {
> +             rc = goya_test_queue(hdev, i);
> +             if (rc)
> +                     ret_val = -EINVAL;
> +     }
> +
> +     if (hdev->cpu_queues_enable) {
> +             rc = goya->test_cpu_queue(hdev);
> +             if (rc)
> +                     ret_val = -EINVAL;
> +     }
> +
> +     return ret_val;
> +}
> +
> +void *goya_dma_pool_zalloc(struct hl_device *hdev, size_t size, gfp_t mem_flags,
> +                             dma_addr_t *dma_handle)
> +{
> +     if (size > GOYA_DMA_POOL_BLK_SIZE)
> +             return NULL;
> +
> +     return dma_pool_zalloc(hdev->dma_pool, mem_flags, dma_handle);
> +}
> +
> +void goya_dma_pool_free(struct hl_device *hdev, void *vaddr,
> +                     dma_addr_t dma_addr)
> +{
> +     dma_pool_free(hdev->dma_pool, vaddr, dma_addr);
> +}
> +
> +void *goya_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size,
> +                     dma_addr_t *dma_handle)
> +{
> +     u64 kernel_addr;
> +
> +     /* roundup to CPU_PKT_SIZE */
> +     size = (size + (CPU_PKT_SIZE - 1)) & CPU_PKT_MASK;
> +
> +     kernel_addr = gen_pool_alloc(hdev->cpu_accessible_dma_pool, size);
> +
> +     *dma_handle = hdev->cpu_accessible_dma_address +
> +                     (kernel_addr - (u64) hdev->cpu_accessible_dma_mem);
> +
> +     return (void *) kernel_addr;
> +}
> +
> +void goya_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
> +                     void *vaddr)
> +{
> +     /* roundup to CPU_PKT_SIZE */
> +     size = (size + (CPU_PKT_SIZE - 1)) & CPU_PKT_MASK;
> +
> +     gen_pool_free(hdev->cpu_accessible_dma_pool, (u64) vaddr, size);
> +}
> +
> +
> +static void goya_hw_queues_lock(struct hl_device *hdev)
> +{
> +     struct goya_device *goya = hdev->asic_specific;
> +
> +     spin_lock(&goya->hw_queues_lock);
> +}
> +
> +static void goya_hw_queues_unlock(struct hl_device *hdev)
> +{
> +     struct goya_device *goya = hdev->asic_specific;
> +
> +     spin_unlock(&goya->hw_queues_lock);
> +}
> +
>  static const struct hl_asic_funcs goya_funcs = {
>       .early_init = goya_early_init,
>       .early_fini = goya_early_fini,
> @@ -2525,8 +3798,19 @@ static const struct hl_asic_funcs goya_funcs = {
>       .resume = goya_resume,
>       .mmap = goya_mmap,
>       .cb_mmap = goya_cb_mmap,
> +     .ring_doorbell = goya_ring_doorbell,
> +     .flush_pq_write = goya_flush_pq_write,
>       .dma_alloc_coherent = goya_dma_alloc_coherent,
>       .dma_free_coherent = goya_dma_free_coherent,
> +     .get_int_queue_base = goya_get_int_queue_base,
> +     .test_queues = goya_test_queues,
> +     .dma_pool_zalloc = goya_dma_pool_zalloc,
> +     .dma_pool_free = goya_dma_pool_free,
> +     .cpu_accessible_dma_pool_alloc = goya_cpu_accessible_dma_pool_alloc,
> +     .cpu_accessible_dma_pool_free = goya_cpu_accessible_dma_pool_free,
> +     .hw_queues_lock = goya_hw_queues_lock,
> +     .hw_queues_unlock = goya_hw_queues_unlock,
> +     .send_cpu_message = goya_send_cpu_message
>  };
>  
>  /**
> diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h
> index 45a6d2ca2752..598a718d3df1 100644
> --- a/drivers/misc/habanalabs/goya/goyaP.h
> +++ b/drivers/misc/habanalabs/goya/goyaP.h
> @@ -9,6 +9,7 @@
>  #define GOYAP_H_
>  
>  #include "habanalabs.h"
> +#include "include/goya/goya_packets.h"
>  #include "include/goya/goya_boot_if.h"
>  #include "include/goya/goya.h"
>  
> @@ -117,12 +118,17 @@ enum goya_fw_component {
>  };
>  
>  struct goya_device {
> +     int (*test_cpu_queue)(struct hl_device *hdev);
> +
>       /* TODO: remove hw_queues_lock after moving to scheduler code */
>       spinlock_t      hw_queues_lock;
>       u64             ddr_bar_cur_addr;
>       u32             hw_cap_initialized;
>  };
>  
> +int goya_test_cpu_queue(struct hl_device *hdev);
> +int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len,
> +                             u32 timeout, long *result);
>  void goya_init_security(struct hl_device *hdev);
>  
>  #endif /* GOYAP_H_ */
> diff --git a/drivers/misc/habanalabs/habanalabs.h 
> b/drivers/misc/habanalabs/habanalabs.h
> index adda281ec2af..8232e2259463 100644
> --- a/drivers/misc/habanalabs/habanalabs.h
> +++ b/drivers/misc/habanalabs/habanalabs.h
> @@ -30,10 +30,36 @@
>  struct hl_device;
>  struct hl_fpriv;
>  
> +/**
> + * enum hl_queue_type - Supported QUEUE types.
> + * @QUEUE_TYPE_NA: queue is not available.
> + * @QUEUE_TYPE_EXT: external queue which is a DMA channel that may access the
> + *                  host.
> + * @QUEUE_TYPE_INT: internal queue that performs DMA inside the device's
> + *                   memories and/or operates the compute engines.
> + * @QUEUE_TYPE_CPU: S/W queue for communication with the device's CPU.
> + */
> +enum hl_queue_type {
> +     QUEUE_TYPE_NA,
> +     QUEUE_TYPE_EXT,
> +     QUEUE_TYPE_INT,
> +     QUEUE_TYPE_CPU
> +};
>  
> +/**
> + * struct hw_queue_properties - queue information.
> + * @type: queue type.
> + * @kmd_only: true if only KMD is allowed to send a job to this queue, false
> + *            otherwise.
> + */
> +struct hw_queue_properties {
> +     enum hl_queue_type      type;
> +     u8                      kmd_only;
> +};
>  
>  /**
>   * struct asic_fixed_properties - ASIC specific immutable properties.
> + * @hw_queues_props: H/W queues properties.
>   * @uboot_ver: F/W U-boot version.
>   * @preboot_ver: F/W Preboot version.
>   * @sram_base_address: SRAM physical start address.
> @@ -64,6 +90,7 @@ struct hl_fpriv;
>   * @tpc_enabled_mask: which TPCs are enabled.
>   */
>  struct asic_fixed_properties {
> +     struct hw_queue_properties      hw_queues_props[HL_MAX_QUEUES];
>       char                    uboot_ver[VERSION_MAX_LEN];
>       char                    preboot_ver[VERSION_MAX_LEN];
>       u64                     sram_base_address;
> @@ -145,7 +172,92 @@ struct hl_cb {
>  
>  
>  
> +/*
> + * QUEUES
> + */
> +
> +struct hl_cs_job;
> +
> +/*
> + * Currently, there are two limitations on the maximum length of a queue:
> + *
> + * 1. The memory footprint of the queue. The current allocated space for the
> + *    queue is PAGE_SIZE. Because each entry in the queue is HL_BD_SIZE,
> + *    the maximum length of the queue can be PAGE_SIZE / HL_BD_SIZE,
> + *    which currently is 4096/16 = 256 entries.
> + *
> + *    To increase that, we need either to decrease the size of the
> + *    BD (difficult), or allocate more than a single page (easier).
> + *
> + * 2. Because the size of the JOB handle field in the BD CTL / completion 
> queue
> + *    is 10-bit, we can have up to 1024 open jobs per hardware queue.
> + *    Therefore, each queue can hold up to 1024 entries.
> + *
> + * HL_QUEUE_LENGTH is in units of struct hl_bd.
> + * HL_QUEUE_LENGTH * sizeof(struct hl_bd) should be <= HL_PAGE_SIZE
> + */
> +
> +#define HL_PAGE_SIZE                 4096 /* minimum page size */
> +/* Must be power of 2 (HL_PAGE_SIZE / HL_BD_SIZE) */
>  #define HL_QUEUE_LENGTH                      256
> +#define HL_QUEUE_SIZE_IN_BYTES               (HL_QUEUE_LENGTH * HL_BD_SIZE)
> +
> +/*
> + * HL_CQ_LENGTH is in units of struct hl_cq_entry.
> + * HL_CQ_LENGTH * sizeof(struct hl_cq_entry) should be <= HL_PAGE_SIZE
> + */
> +#define HL_CQ_LENGTH                 HL_QUEUE_LENGTH
> +#define HL_CQ_SIZE_IN_BYTES          (HL_CQ_LENGTH * HL_CQ_ENTRY_SIZE)
> +
> +
> +
> +/**
> + * struct hl_hw_queue - describes a H/W transport queue.
> + * @shadow_queue: pointer to a shadow queue that holds pointers to jobs.
> + * @queue_type: type of queue.
> + * @kernel_address: holds the queue's kernel virtual address.
> + * @bus_address: holds the queue's DMA address.
> + * @pi: holds the queue's pi value.
> + * @ci: holds the queue's ci value, AS CALCULATED BY THE DRIVER (not real 
> ci).
> + * @hw_queue_id: the id of the H/W queue.
> + * @int_queue_len: length of internal queue (number of entries).
> + * @valid: is the queue valid (we have array of 32 queues, not all of them
> + *           exist).
> + */
> +struct hl_hw_queue {
> +     struct hl_cs_job        **shadow_queue;
> +     enum hl_queue_type      queue_type;
> +     u64                     kernel_address;
> +     dma_addr_t              bus_address;
> +     u32                     pi;
> +     u32                     ci;
> +     u32                     hw_queue_id;
> +     u16                     int_queue_len;
> +     u8                      valid;
> +};
> +
> +/**
> + * struct hl_cq - describes a completion queue
> + * @hdev: pointer to the device structure
> + * @kernel_address: holds the queue's kernel virtual address
> + * @bus_address: holds the queue's DMA address
> + * @hw_queue_id: the id of the matching H/W queue
> + * @ci: ci inside the queue
> + * @pi: pi inside the queue
> + * @free_slots_cnt: counter of free slots in queue
> + */
> +struct hl_cq {
> +     struct hl_device        *hdev;
> +     u64                     kernel_address;
> +     dma_addr_t              bus_address;
> +     u32                     hw_queue_id;
> +     u32                     ci;
> +     u32                     pi;
> +     atomic_t                free_slots_cnt;
> +};
> +
> +
> +
>  
>  
>  /*
> @@ -180,8 +292,20 @@ enum hl_asic_type {
>   * @resume: handles IP specific H/W or SW changes for resume.
>   * @mmap: mmap function, does nothing.
>   * @cb_mmap: maps a CB.
> + * @ring_doorbell: increment PI on a given QMAN.
> + * @flush_pq_write: flush PQ entry write if necessary, WARN if flushing 
> failed.
>   * @dma_alloc_coherent: DMA allocate coherent memory.
>   * @dma_free_coherent: free DMA allocation.
> + * @get_int_queue_base: get the internal queue base address.
> + * @test_queues: run simple test on all queues for sanity check.
> + * @dma_pool_zalloc: small DMA allocation of coherent memory from DMA pool.
> + *                   size of allocation is HL_DMA_POOL_BLK_SIZE.
> + * @dma_pool_free: free small DMA allocation from pool.
> + * @cpu_accessible_dma_pool_alloc: allocate CPU PQ packet from DMA pool.
> + * @cpu_accessible_dma_pool_free: free CPU PQ packet from DMA pool.
> + * @hw_queues_lock: acquire H/W queues lock.
> + * @hw_queues_unlock: release H/W queues lock.
> + * @send_cpu_message: send buffer to ArmCP.
>   */
>  struct hl_asic_funcs {
>       int (*early_init)(struct hl_device *hdev);
> @@ -195,10 +319,27 @@ struct hl_asic_funcs {
>       int (*mmap)(struct hl_fpriv *hpriv, struct vm_area_struct *vma);
>       int (*cb_mmap)(struct hl_device *hdev, struct vm_area_struct *vma,
>                       u64 kaddress, phys_addr_t paddress, u32 size);
> +     void (*ring_doorbell)(struct hl_device *hdev, u32 hw_queue_id, u32 pi);
> +     void (*flush_pq_write)(struct hl_device *hdev, u64 *pq, u64 exp_val);
>       void* (*dma_alloc_coherent)(struct hl_device *hdev, size_t size,
>                                       dma_addr_t *dma_handle, gfp_t flag);
>       void (*dma_free_coherent)(struct hl_device *hdev, size_t size,
>                                       void *cpu_addr, dma_addr_t dma_handle);
> +     void* (*get_int_queue_base)(struct hl_device *hdev, u32 queue_id,
> +                             dma_addr_t *dma_handle, u16 *queue_len);
> +     int (*test_queues)(struct hl_device *hdev);
> +     void* (*dma_pool_zalloc)(struct hl_device *hdev, size_t size,
> +                             gfp_t mem_flags, dma_addr_t *dma_handle);
> +     void (*dma_pool_free)(struct hl_device *hdev, void *vaddr,
> +                             dma_addr_t dma_addr);
> +     void* (*cpu_accessible_dma_pool_alloc)(struct hl_device *hdev,
> +                             size_t size, dma_addr_t *dma_handle);
> +     void (*cpu_accessible_dma_pool_free)(struct hl_device *hdev,
> +                             size_t size, void *vaddr);
> +     void (*hw_queues_lock)(struct hl_device *hdev);
> +     void (*hw_queues_unlock)(struct hl_device *hdev);
> +     int (*send_cpu_message)(struct hl_device *hdev, u32 *msg,
> +                             u16 len, u32 timeout, long *result);
>  };
>  
>  
> @@ -240,6 +381,17 @@ struct hl_ctx_mgr {
>  
>  
>  
> +/**
> + * struct hl_cs_job - command submission job.
> + * @finish_work: workqueue object to run when job is completed.
> + * @id: the id of this job inside a CS.
> + */
> +struct hl_cs_job {
> +     struct work_struct      finish_work;
> +     u32                     id;
> +};
> +
> +
>  /*
>   * FILE PRIVATE STRUCTURE
>   */
> @@ -316,7 +468,11 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
>   * @dev: realted kernel basic device structure.
>   * @asic_name: ASIC specific nmae.
>   * @asic_type: ASIC specific type.
> + * @completion_queue: array of hl_cq.
> + * @cq_wq: work queue of completion queues for executing work in process 
> context
> + * @eq_wq: work queue of event queue for executing work in process context.
>   * @kernel_ctx: KMD context structure.
> + * @kernel_queues: array of hl_hw_queue.
>   * @kernel_cb_mgr: command buffer manager for creating/destroying/handling 
> CGs.
>   * @dma_pool: DMA pool for small allocations.
>   * @cpu_accessible_dma_mem: KMD <-> ArmCP shared memory CPU address.
> @@ -326,6 +482,7 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
>   * @asid_bitmap: holds used/available ASIDs.
>   * @asid_mutex: protects asid_bitmap.
>   * @device_open: lock for sanity checks upon FD open.
> + * @send_cpu_message_lock: enforces only one message in KMD <-> ArmCP queue.
>   * @asic_prop: ASIC specific immutable properties.
>   * @asic_funcs: ASIC specific functions.
>   * @asic_specific: ASIC specific information to use only from ASIC files.
> @@ -345,7 +502,10 @@ struct hl_device {
>       struct device                   *dev;
>       char                            asic_name[16];
>       enum hl_asic_type               asic_type;
> +     struct hl_cq                    *completion_queue;
> +     struct workqueue_struct         *cq_wq;
>       struct hl_ctx                   *kernel_ctx;
> +     struct hl_hw_queue              *kernel_queues;
>       struct hl_cb_mgr                kernel_cb_mgr;
>       struct dma_pool                 *dma_pool;
>       void                            *cpu_accessible_dma_mem;
> @@ -356,6 +516,7 @@ struct hl_device {
>       struct mutex                    asid_mutex;
>       /* TODO: change to rw_sem for multiple contexts (same as other IOCTL) */
>       struct mutex                    device_open;
> +     struct mutex                    send_cpu_message_lock;
>       struct asic_fixed_properties    asic_prop;
>       const struct hl_asic_funcs      *asic_funcs;
>       void                            *asic_specific;
> @@ -374,7 +535,9 @@ struct hl_device {
>       u8                              cpu_enable;
>       u8                              reset_pcilink;
>       u8                              config_pll;
> +     u8                              cpu_queues_enable;
>       u8                              fw_loading;
> +     u8                              ifh;
>       u8                              pldm;
>  };
>  
> @@ -418,7 +581,18 @@ int hl_poll_timeout_memory(struct hl_device *hdev, u64 
> addr, u32 timeout_us,
>                               u32 *val);
>  int hl_poll_timeout_device_memory(struct hl_device *hdev, void __iomem *addr,
>                               u32 timeout_us, u32 *val);
> -
> +int hl_hw_queues_create(struct hl_device *hdev);
> +void hl_hw_queues_destroy(struct hl_device *hdev);
> +int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
> +                             u32 cb_size, u64 cb_ptr);
> +u32 hl_hw_queue_add_ptr(u32 ptr, u16 val);
> +void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id);
> +
> +#define hl_queue_inc_ptr(p)          hl_hw_queue_add_ptr(p, 1)
> +#define hl_pi_2_offset(pi)           ((pi) & (HL_QUEUE_LENGTH - 1))
> +
> +int hl_cq_init(struct hl_device *hdev, struct hl_cq *q, u32 hw_queue_id);
> +void hl_cq_fini(struct hl_device *hdev, struct hl_cq *q);
>  int hl_asid_init(struct hl_device *hdev);
>  void hl_asid_fini(struct hl_device *hdev);
>  unsigned long hl_asid_alloc(struct hl_device *hdev);
> diff --git a/drivers/misc/habanalabs/habanalabs_drv.c 
> b/drivers/misc/habanalabs/habanalabs_drv.c
> index bd80683118d3..b64f58ad0f5d 100644
> --- a/drivers/misc/habanalabs/habanalabs_drv.c
> +++ b/drivers/misc/habanalabs/habanalabs_drv.c
> @@ -184,13 +184,19 @@ int create_hdev(struct hl_device **dev, struct pci_dev 
> *pdev,
>       hdev->cpu_enable = 1;
>       hdev->reset_pcilink = 0;
>       hdev->config_pll = 0;
> +     hdev->cpu_queues_enable = 1;
>       hdev->fw_loading = 1;
> +     hdev->ifh = 0;
>       hdev->pldm = 0;
>  
>       /* If CPU is disabled, no point in loading FW */
>       if (!hdev->cpu_enable)
>               hdev->fw_loading = 0;
>  
> +     /* If we don't load FW, no need to initialize CPU queues */
> +     if (!hdev->fw_loading)
> +             hdev->cpu_queues_enable = 0;
> +
>       hdev->disabled = true;
>       hdev->pdev = pdev; /* can be NULL in case of simulator device */
>  
> diff --git a/drivers/misc/habanalabs/hw_queue.c 
> b/drivers/misc/habanalabs/hw_queue.c
> new file mode 100644
> index 000000000000..65102a5bc2ca
> --- /dev/null
> +++ b/drivers/misc/habanalabs/hw_queue.c
> @@ -0,0 +1,404 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/*
> + * Copyright 2016-2018 HabanaLabs, Ltd.
> + * All Rights Reserved.
> + */
> +
> +#include "habanalabs.h"
> +
> +#include <linux/dma-mapping.h>
> +#include <linux/sched.h>
> +#include <linux/wait.h>
> +#include <linux/delay.h>
> +
> +/**
> + * hl_hw_queue_add_ptr - add to pi or ci and check if it wraps around
> + *
> + * @ptr: the current pi/ci value
> + * @val: the amount to add
> + *
> + * Add val to ptr. It can go until twice the queue length.
> + */
> +inline u32 hl_hw_queue_add_ptr(u32 ptr, u16 val)
> +{
> +     ptr += val;
> +     ptr &= ((HL_QUEUE_LENGTH << 1) - 1);
> +     return ptr;
> +}
> +
> +static inline int queue_free_slots(struct hl_hw_queue *q, u32 queue_len)
> +{
> +     int delta = (q->pi - q->ci);
> +
> +     if (delta >= 0)
> +             return (queue_len - delta);
> +     else
> +             return (abs(delta) - queue_len);
> +}
> +
> +/**
> + * ext_queue_submit_bd - Submit a buffer descriptor to an external queue
> + *
> + * @hdev: pointer to habanalabs device structure
> + * @q: pointer to habanalabs queue structure
> + * @ctl: BD's control word
> + * @len: BD's length
> + * @ptr: BD's pointer
> + *
> + * This function assumes there is enough space on the queue to submit a new
> + * BD to it. It initializes the next BD and calls the device specific
> + * function to set the pi (and doorbell)
> + *
> + * This function must be called when the scheduler mutex is taken
> + *
> + */
> +static void ext_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue 
> *q,
> +                             u32 ctl, u32 len, u64 ptr)
> +{
> +     struct hl_bd *bd;
> +
> +     bd = (struct hl_bd *) q->kernel_address;
> +     bd += hl_pi_2_offset(q->pi);
> +     bd->ctl = ctl;
> +     bd->len = len;
> +     bd->ptr = ptr + hdev->asic_prop.host_phys_base_address;
> +
> +     q->pi = hl_queue_inc_ptr(q->pi);
> +     hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi);
> +}
> +
> +/**
> + * ext_queue_sanity_checks - perform some sanity checks on external queue
> + *
> + * @hdev              : pointer to hl_device structure
> + * @q                 :      pointer to hl_hw_queue structure
> + * @num_of_entries    : how many entries to check for space
> + * @reserve_cq_entry  :      whether to reserve an entry in the cq
> + *
> + * H/W queues spinlock should be taken before calling this function
> + *
> + * Perform the following:
> + * - Make sure we have enough space in the h/w queue
> + * - Make sure we have enough space in the completion queue
> + * - Reserve space in the completion queue (needs to be reversed if there
> + *   is a failure down the road before the actual submission of work). Only
> + *   do this action if reserve_cq_entry is true
> + *
> + */
> +static int ext_queue_sanity_checks(struct hl_device *hdev,
> +                             struct hl_hw_queue *q, int num_of_entries,
> +                             bool reserve_cq_entry)
> +{
> +     atomic_t *free_slots =
> +                     &hdev->completion_queue[q->hw_queue_id].free_slots_cnt;
> +     int free_slots_cnt;
> +
> +     /* Check we have enough space in the queue */
> +     free_slots_cnt = queue_free_slots(q, HL_QUEUE_LENGTH);
> +
> +     if (free_slots_cnt < num_of_entries) {
> +             dev_dbg(hdev->dev, "Queue %d doesn't have room for %d CBs\n",
> +                     q->hw_queue_id, num_of_entries);
> +             return -EAGAIN;
> +     }
> +
> +     if (reserve_cq_entry) {
> +             /*
> +              * Check we have enough space in the completion queue.
> +              * Subtract num_of_entries from the free slots counter.
> +              * atomic_add_negative returns true if the result is negative,
> +              * which means the CQ doesn't have enough room, so we can't
> +              * submit a new CB because we won't get ack on its completion.
> +              * In that case, the subtraction is reverted before returning
> +              */
> +             if (atomic_add_negative(num_of_entries * -1, free_slots)) {
> +                     dev_dbg(hdev->dev, "No space for %d on CQ %d\n",
> +                             num_of_entries, q->hw_queue_id);
> +                     atomic_add(num_of_entries, free_slots);
> +                     return -EAGAIN;
> +             }
> +     }
> +
> +     return 0;
> +}
> +
> +/**
> + * hl_hw_queue_send_cb_no_cmpl - send a single CB (not a JOB) without 
> completion
> + *
> + * @hdev: pointer to hl_device structure
> + * @hw_queue_id: Queue's ID
> + * @cb_size: size of CB
> + * @cb_ptr: pointer to CB location
> + *
> + * This function sends a single CB, that must NOT generate a completion entry
> + *
> + */
> +int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
> +                             u32 cb_size, u64 cb_ptr)
> +{
> +     struct hl_hw_queue *q = &hdev->kernel_queues[hw_queue_id];
> +     int rc;
> +
> +     /*
> +      * The CPU queue is a synchronous queue with an effective depth of
> +      * a single entry (although it is allocated with room for multiple
> +      * entries). Therefore, there is a different lock, called
> +      * send_cpu_message_lock, that serializes accesses to the CPU queue.
> +      * As a result, we don't need to lock the access to the entire H/W
> +      * queues module when submitting a JOB to the CPU queue
> +      */
> +     if (q->queue_type != QUEUE_TYPE_CPU)
> +             hdev->asic_funcs->hw_queues_lock(hdev);
> +
> +     if (hdev->disabled) {
> +             rc = -EPERM;
> +             goto out;
> +     }
> +
> +     rc = ext_queue_sanity_checks(hdev, q, 1, false);
> +     if (rc)
> +             goto out;
> +
> +     ext_queue_submit_bd(hdev, q, 0, cb_size, cb_ptr);
> +
> +out:
> +     if (q->queue_type != QUEUE_TYPE_CPU)
> +             hdev->asic_funcs->hw_queues_unlock(hdev);
> +
> +     return rc;
> +}
> +
> +/**
> + * hl_hw_queue_inc_ci_kernel - increment ci for kernel's queue
> + *
> + * @hdev: pointer to hl_device structure
> + * @hw_queue_id: which queue to increment its ci
> + */
> +void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id)
> +{
> +     struct hl_hw_queue *q = &hdev->kernel_queues[hw_queue_id];
> +
> +     q->ci = hl_queue_inc_ptr(q->ci);
> +}
> +
> +static int ext_and_cpu_hw_queue_init(struct hl_device *hdev,
> +                                     struct hl_hw_queue *q)
> +{
> +     void *p;
> +     int rc;
> +
> +     p = hdev->asic_funcs->dma_alloc_coherent(hdev,
> +                             HL_QUEUE_SIZE_IN_BYTES,
> +                             &q->bus_address, GFP_KERNEL | __GFP_ZERO);
> +     if (!p)
> +             return -ENOMEM;
> +
> +     q->kernel_address = (u64) p;
> +
> +     q->shadow_queue = kmalloc_array(HL_QUEUE_LENGTH,
> +                                     sizeof(*q->shadow_queue),
> +                                     GFP_KERNEL);
> +     if (!q->shadow_queue) {
> +             dev_err(hdev->dev,
> +                     "Failed to allocate shadow queue for H/W queue %d\n",
> +                     q->hw_queue_id);
> +             rc = -ENOMEM;
> +             goto free_queue;
> +     }
> +
> +     /* Make sure read/write pointers are initialized to start of queue */
> +     q->ci = 0;
> +     q->pi = 0;
> +
> +     return 0;
> +
> +free_queue:
> +     hdev->asic_funcs->dma_free_coherent(hdev, HL_QUEUE_SIZE_IN_BYTES,
> +                     (void *) q->kernel_address, q->bus_address);
> +
> +     return rc;
> +}
> +
> +static int int_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
> +{
> +     void *p;
> +
> +     p = hdev->asic_funcs->get_int_queue_base(hdev, q->hw_queue_id,
> +                                     &q->bus_address, &q->int_queue_len);
> +     if (!p) {
> +             dev_err(hdev->dev,
> +                     "Failed to get base address for internal queue %d\n",
> +                     q->hw_queue_id);
> +             return -EFAULT;
> +     }
> +
> +     q->kernel_address = (u64) p;
> +     q->pi = 0;
> +     q->ci = 0;
> +
> +     return 0;
> +}
> +
> +static int cpu_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
> +{
> +     return ext_and_cpu_hw_queue_init(hdev, q);
> +}
> +
> +static int ext_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
> +{
> +     return ext_and_cpu_hw_queue_init(hdev, q);
> +}
> +
> +/**
> + * hw_queue_init - main initialization function for H/W queue object
> + *
> + * @hdev: pointer to hl_device device structure
> + * @q: pointer to hl_hw_queue queue structure
> + * @hw_queue_id: The id of the H/W queue
> + *
> + * Allocate dma-able memory for the queue and initialize fields
> + * Returns 0 on success
> + */
> +static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
> +                     u32 hw_queue_id)
> +{
> +     int rc;
> +
> +     BUILD_BUG_ON(HL_QUEUE_SIZE_IN_BYTES > HL_PAGE_SIZE);
> +
> +     q->hw_queue_id = hw_queue_id;
> +
> +     switch (q->queue_type) {
> +     case QUEUE_TYPE_EXT:
> +             rc = ext_hw_queue_init(hdev, q);
> +             break;
> +
> +     case QUEUE_TYPE_INT:
> +             rc = int_hw_queue_init(hdev, q);
> +             break;
> +
> +     case QUEUE_TYPE_CPU:
> +             rc = cpu_hw_queue_init(hdev, q);
> +             break;
> +
> +     case QUEUE_TYPE_NA:
> +             q->valid = 0;
> +             return 0;
> +
> +     default:
> +             dev_crit(hdev->dev, "wrong queue type %d during init\n",
> +                     q->queue_type);
> +             rc = -EINVAL;
> +             break;
> +     }
> +
> +     if (rc)
> +             return rc;
> +
> +     q->valid = 1;
> +
> +     return 0;
> +}
> +
> +/**
> + * hw_queue_fini - destroy queue
> + *
> + * @hdev: pointer to hl_device device structure
> + * @q: pointer to hl_hw_queue queue structure
> + *
> + * Free the queue memory
> + */
> +static void hw_queue_fini(struct hl_device *hdev, struct hl_hw_queue *q)
> +{
> +     if (!q->valid)
> +             return;
> +
> +     /*
> +      * If we arrived here, there are no jobs waiting on this queue
> +      * so we can safely remove it.
> +      * This is because this function can only be called when:
> +      * 1. Either a context is deleted, which only can occur if all its
> +      *    jobs were finished
> +      * 2. A context wasn't able to be created due to failure or timeout,
> +      *    which means there are no jobs on the queue yet
> +      *
> +      * The only exception are the queues of the kernel context, but
> +      * if they are being destroyed, it means that the entire module is
> +      * being removed. If the module is removed, it means there is no open
> +      * user context. It also means that if a job was submitted by
> +      * the kernel driver (e.g. context creation), the job itself was
> +      * released by the kernel driver when a timeout occurred on its
> +      * Completion. Thus, we don't need to release it again.
> +      */
> +
> +     if (q->queue_type == QUEUE_TYPE_INT)
> +             return;
> +
> +     kfree(q->shadow_queue);
> +
> +     hdev->asic_funcs->dma_free_coherent(hdev,
> +                     HL_QUEUE_SIZE_IN_BYTES,
> +                     (void *) q->kernel_address, q->bus_address);
> +}
> +
> +int hl_hw_queues_create(struct hl_device *hdev)
> +{
> +     struct asic_fixed_properties *asic = &hdev->asic_prop;
> +     struct hl_hw_queue *q;
> +     int i, rc, q_ready_cnt;
> +
> +     hdev->kernel_queues = kcalloc(HL_MAX_QUEUES,
> +                             sizeof(*hdev->kernel_queues), GFP_KERNEL);
> +
> +     if (!hdev->kernel_queues) {
> +             dev_err(hdev->dev, "Not enough memory for H/W queues\n");
> +             return -ENOMEM;
> +     }
> +
> +     /* Initialize the H/W queues */
> +     for (i = 0, q_ready_cnt = 0, q = hdev->kernel_queues;
> +                     i < HL_MAX_QUEUES ; i++, q_ready_cnt++, q++) {
> +
> +             q->queue_type = asic->hw_queues_props[i].type;
> +             rc = hw_queue_init(hdev, q, i);
> +             if (rc) {
> +                     dev_err(hdev->dev,
> +                             "failed to initialize queue %d\n", i);
> +                     goto release_queues;
> +             }
> +     }
> +
> +     return 0;
> +
> +release_queues:
> +     for (i = 0, q = hdev->kernel_queues ; i < q_ready_cnt ; i++, q++)
> +             hw_queue_fini(hdev, q);
> +
> +     kfree(hdev->kernel_queues);
> +
> +     return rc;
> +}
> +
> +void hl_hw_queues_destroy(struct hl_device *hdev)
> +{
> +     struct hl_hw_queue *q;
> +     int i;
> +
> +     for (i = 0, q = hdev->kernel_queues ; i < HL_MAX_QUEUES ; i++, q++)
> +             hw_queue_fini(hdev, q);
> +
> +     kfree(hdev->kernel_queues);
> +}
> +
> +void hl_hw_queue_reset(struct hl_device *hdev, bool hard_reset)
> +{
> +     struct hl_hw_queue *q;
> +     int i;
> +
> +     for (i = 0, q = hdev->kernel_queues ; i < HL_MAX_QUEUES ; i++, q++) {
> +             if ((!q->valid) ||
> +                     ((!hard_reset) && (q->queue_type == QUEUE_TYPE_CPU)))
> +                     continue;
> +             q->pi = q->ci = 0;
> +     }
> +}
> diff --git a/drivers/misc/habanalabs/include/goya/goya_packets.h 
> b/drivers/misc/habanalabs/include/goya/goya_packets.h
> new file mode 100644
> index 000000000000..669a3f37ccb7
> --- /dev/null
> +++ b/drivers/misc/habanalabs/include/goya/goya_packets.h
> @@ -0,0 +1,234 @@
> +/* SPDX-License-Identifier: GPL-2.0
> + *
> + * Copyright 2017-2018 HabanaLabs, Ltd.
> + * All Rights Reserved.
> + *
> + * Authors:
> + *
> + * Oded Gabbay <oded.gab...@gmail.com>
> + * Guy Eilat <gei...@habana.ai>
> + *
> + */
> +
> +#ifndef GOYA_PACKETS_H
> +#define GOYA_PACKETS_H
> +
> +#include <linux/types.h>
> +
> +#define PACKET_HEADER_PACKET_ID_SHIFT                56
> +#define PACKET_HEADER_PACKET_ID_MASK         0x1F00000000000000ull
> +
> +enum packet_id {
> +     PACKET_WREG_32 = 0x1,
> +     PACKET_WREG_BULK = 0x2,
> +     PACKET_MSG_LONG = 0x3,
> +     PACKET_MSG_SHORT = 0x4,
> +     PACKET_CP_DMA = 0x5,
> +     PACKET_MSG_PROT = 0x7,
> +     PACKET_FENCE = 0x8,
> +     PACKET_LIN_DMA = 0x9,
> +     PACKET_NOP = 0xA,
> +     PACKET_STOP = 0xB,
> +     MAX_PACKET_ID = (PACKET_HEADER_PACKET_ID_MASK >>
> +                             PACKET_HEADER_PACKET_ID_SHIFT) + 1
> +};
> +
> +enum goya_dma_direction {
> +     DMA_HOST_TO_DRAM,
> +     DMA_HOST_TO_SRAM,
> +     DMA_DRAM_TO_SRAM,
> +     DMA_SRAM_TO_DRAM,
> +     DMA_SRAM_TO_HOST,
> +     DMA_DRAM_TO_HOST,
> +     DMA_DRAM_TO_DRAM,
> +     DMA_SRAM_TO_SRAM,
> +     DMA_ENUM_MAX
> +};
> +
> +struct packet_nop {
> +     __u32 reserved;
> +     union {
> +             struct {
> +                     __u32:24;
> +                     __u32 opcode :5;
> +                     __u32 eng_barrier :1;
> +                     __u32 reg_barrier :1;
> +                     __u32 msg_barrier :1;
> +             };
> +             __u32 ctl;
> +     };
> +};
> +
> +struct packet_stop {
> +     __u32 reserved;
> +     union {
> +             struct {
> +                     __u32:24;
> +                     __u32 opcode :5;
> +                     __u32 eng_barrier :1;
> +                     __u32 reg_barrier :1; /* must be 0 */
> +                     __u32 msg_barrier :1; /* must be 0 */
> +             };
> +             __u32 ctl;
> +     };
> +};
> +
> +struct packet_wreg32 {
> +     __u32 value;
> +     union {
> +             struct {
> +                     __u32 reg_offset :16;
> +                     __u32:7;
> +                     __u32 local :1; /* 0: write to TCL regs,
> +                                      * 1: write to CMDQ regs
> +                                      */
> +                     __u32 opcode :5;
> +                     __u32 eng_barrier :1;
> +                     __u32 reg_barrier :1; /* must be 1 */
> +                     __u32 msg_barrier :1;
> +             };
> +             __u32 ctl;
> +     };
> +};
> +
> +struct packet_wreg_bulk {
> +     __u32 size64 :16;
> +     __u32:16;
> +     __u32 reg_offset :16;
> +     __u32:8;
> +     __u32 opcode :5;
> +     __u32 eng_barrier :1;
> +     __u32 reg_barrier :1; /* must be 1 */
> +     __u32 msg_barrier :1;
> +     __u64 values[0]; /* data starts here */
> +};
> +
> +struct packet_msg_long {
> +     __u32 value;
> +     union {
> +             struct {
> +                     __u32:16;
> +                     __u32 weakly_ordered :1;
> +                     __u32 no_snoop :1;
> +                     __u32:2;
> +                     __u32 op :2; /* 0: write <value>. 1: write timestamp. */
> +                     __u32:2;
> +                     __u32 opcode :5;
> +                     __u32 eng_barrier :1;
> +                     __u32 reg_barrier :1;
> +                     __u32 msg_barrier :1;
> +             };
> +             __u32 ctl;
> +     };
> +     __u64 addr;
> +};
> +
> +struct packet_msg_short {
> +     union {
> +             struct {
> +                     __u32 sync_id :10;
> +                     __u32:5;
> +                     __u32 mode : 1;
> +                     __u32 sync_value :16;
> +             } mon_arm_register;
> +             struct {
> +                     __u32 sync_value :16;
> +                     __u32:15;
> +                     __u32 mode :1;
> +             } so_upd;
> +             __u32 value;
> +     };
> +     union {
> +             struct {
> +                     __u32 msg_addr_offset :16;
> +                     __u32 weakly_ordered :1;
> +                     __u32 no_snoop :1;
> +                     __u32:2;
> +                     __u32 op :2;
> +                     __u32 base :2;
> +                     __u32 opcode :5;
> +                     __u32 eng_barrier :1;
> +                     __u32 reg_barrier :1;
> +                     __u32 msg_barrier :1;
> +             };
> +             __u32 ctl;
> +     };
> +};
> +
> +struct packet_msg_prot {
> +     __u32 value;
> +     union {
> +             struct {
> +                     __u32:16;
> +                     __u32 weakly_ordered :1;
> +                     __u32 no_snoop :1;
> +                     __u32:2;
> +                     __u32 op :2; /* 0: write <value>. 1: write timestamp. */
> +                     __u32:2;
> +                     __u32 opcode :5;
> +                     __u32 eng_barrier :1;
> +                     __u32 reg_barrier :1;
> +                     __u32 msg_barrier :1;
> +             };
> +             __u32 ctl;
> +     };
> +     __u64 addr;
> +};
> +
> +struct packet_fence {
> +     __u32 dec_val :4;
> +     __u32:12;
> +     __u32 gate_val :8;
> +     __u32:6;
> +     __u32 id :2;
> +     __u32:24;
> +     __u32 opcode :5;
> +     __u32 eng_barrier :1;
> +     __u32 reg_barrier :1;
> +     __u32 msg_barrier :1;
> +};
> +
> +struct packet_lin_dma {
> +     __u32 tsize;
> +     union {
> +             struct {
> +                     __u32 weakly_ordered :1; /* H/W bug, must be 1 */
> +                     __u32 rdcomp :1;
> +                     __u32 wrcomp :1;
> +                     __u32 no_snoop :1;
> +                     __u32 src_disable :1;
> +                     __u32 dst_disable :1;
> +                     __u32 memset_mode :1;
> +                     __u32 tensor_dma :1; /* N/A, must be 0 */
> +                     __u32 cntrl :12;
> +                     __u32 dma_dir :3; /* S/W only, no effect on HW */
> +                     __u32:1;
> +                     __u32 opcode :5;
> +                     __u32 eng_barrier :1;
> +                     __u32 reg_barrier :1; /* must be 1 */
> +                     __u32 msg_barrier :1;
> +             };
> +             __u32 ctl;
> +     };
> +     __u64 src_addr;
> +     __u64 dst_addr;
> +};
> +
> +struct packet_cp_dma {
> +     __u32 tsize;
> +     union {
> +             struct {
> +                     __u32 weakly_ordered :1;
> +                     __u32 no_snoop :1;
> +                     __u32:22;
> +                     __u32 opcode :5;
> +                     __u32 eng_barrier :1;
> +                     __u32 reg_barrier :1; /* must be 1 */
> +                     __u32 msg_barrier :1;
> +             };
> +             __u32 ctl;
> +     };
> +     __u64 src_addr;
> +};
> +
> +#endif /* GOYA_PACKETS_H */
> diff --git a/drivers/misc/habanalabs/include/habanalabs_device_if.h b/drivers/misc/habanalabs/include/habanalabs_device_if.h
> index 9dbb7077eabd..62df9981f68a 100644
> --- a/drivers/misc/habanalabs/include/habanalabs_device_if.h
> +++ b/drivers/misc/habanalabs/include/habanalabs_device_if.h
> @@ -97,6 +97,278 @@ enum pq_init_status {
>       PQ_INIT_STATUS_READY_FOR_HOST
>  };
>  
> +/*
> + * ArmCP Primary Queue Packets
> + *
> + * During normal operation, KMD needs to send various messages to ArmCP,
> + * usually either to SET some value into a H/W periphery or to GET the current
> + * value of some H/W periphery. For example, SET the frequency of MME/TPC and
> + * GET the value of the thermal sensor.
> + *
> + * These messages can be initiated either by the User application or by KMD
> + * itself, e.g. power management code. In either case, the communication from
> + * KMD to ArmCP will *always* be in synchronous mode, meaning that KMD will
> + * send a single message and poll until the message was acknowledged and the
> + * results are ready (if results are needed).
> + *
> + * This means that only a single message can be sent at a time and KMD must
> + * wait for its result before sending the next message. Having said that,
> + * because these are control messages which are sent in a relatively low
> + * frequency, this limitation seems acceptable. It's important to note that
> + * in case of multiple devices, messages to different devices *can* be sent
> + * at the same time.
> + *
> + * The message, inputs/outputs (if relevant) and fence object will be located
> + * on the device DDR at an address that will be determined by KMD. During
> + * device initialization phase, KMD will pass to ArmCP that address.  Most of
> + * the message types will contain inputs/outputs inside the message itself.
> + * The common part of each message will contain the opcode of the message (its
> + * type) and a field representing a fence object.
> + *
> + * When KMD wishes to send a message to ArmCP, it will write the message
> + * contents to the device DDR, clear the fence object and then write the
> + * value 484 to the mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR register to issue
> + * the 484 interrupt-id to the ARM core.
> + *
> + * Upon receiving the 484 interrupt-id, ArmCP will read the message from the
> + * DDR. In case the message is a SET operation, ArmCP will first perform the
> + * operation and then write to the fence object on the device DDR. In case the
> + * message is a GET operation, ArmCP will first fill the results section on the
> + * device DDR and then write to the fence object. If an error occurred, ArmCP
> + * will fill the rc field with the right error code.
> + *
> + * In the meantime, KMD will poll on the fence object. Once KMD sees that the
> + * fence object is signaled, it will read the results from the device DDR
> + * (if relevant) and resume the code execution in KMD.
> + *
> + * To use QMAN packets, the opcode must be the QMAN opcode, shifted by 8
> + * so the value being put by the KMD matches the value read by ArmCP.
> + *
> + * Non-QMAN packets should be limited to values 1 through (2^8 - 1).
> + *
> + * Detailed description:
> + *
> + * ARMCP_PACKET_DISABLE_PCI_ACCESS -
> + *       After receiving this packet the embedded CPU must NOT issue PCI
> + *       transactions (read/write) towards the Host CPU. This also includes
> + *       sending MSI-X interrupts.
> + *       This packet is usually sent before the device is moved to D3Hot state.
> + *
> + * ARMCP_PACKET_ENABLE_PCI_ACCESS -
> + *       After receiving this packet the embedded CPU is allowed to issue PCI
> + *       transactions towards the Host CPU, including sending MSI-X interrupts.
> + *       This packet is usually sent after the device is moved to D0 state.
> + *
> + * ARMCP_PACKET_TEMPERATURE_GET -
> + *       Fetch the current temperature / Max / Max Hyst / Critical /
> + *       Critical Hyst of a specified thermal sensor. The packet's
> + *       arguments specify the desired sensor and the field to get.
> + *
> + * ARMCP_PACKET_VOLTAGE_GET -
> + *       Fetch the voltage / Max / Min of a specified sensor. The packet's
> + *       arguments specify the sensor and type.
> + *
> + * ARMCP_PACKET_CURRENT_GET -
> + *       Fetch the current / Max / Min of a specified sensor. The packet's
> + *       arguments specify the sensor and type.
> + *
> + * ARMCP_PACKET_FAN_SPEED_GET -
> + *       Fetch the speed / Max / Min of a specified fan. The packet's
> + *       arguments specify the sensor and type.
> + *
> + * ARMCP_PACKET_PWM_GET -
> + *       Fetch the pwm value / mode of a specified pwm. The packet's
> + *       arguments specify the sensor and type.
> + *
> + * ARMCP_PACKET_PWM_SET -
> + *       Set the pwm value / mode of a specified pwm. The packet's
> + *       arguments specify the sensor, type and value.
> + *
> + * ARMCP_PACKET_FREQUENCY_SET -
> + *       Set the frequency of a specified PLL. The packet's arguments specify
> + *       the PLL and the desired frequency. The actual frequency in the device
> + *       might differ from the requested frequency.
> + *
> + * ARMCP_PACKET_FREQUENCY_GET -
> + *       Fetch the frequency of a specified PLL. The packet's arguments specify
> + *       the PLL.
> + *
> + * ARMCP_PACKET_LED_SET -
> + *       Set the state of a specified led. The packet's arguments
> + *       specify the led and the desired state.
> + *
> + * ARMCP_PACKET_I2C_WR -
> + *       Write 32-bit value to I2C device. The packet's arguments specify the
> + *       I2C bus, address and value.
> + *
> + * ARMCP_PACKET_I2C_RD -
> + *       Read 32-bit value from I2C device. The packet's arguments specify the
> + *       I2C bus and address.
> + *
> + * ARMCP_PACKET_INFO_GET -
> + *       Fetch information from the device as specified in the packet's
> + *       structure. KMD passes the max size it allows the ArmCP to write to
> + *       the structure, to prevent data corruption in case of mismatched
> + *       KMD/FW versions.
> + *
> + * ARMCP_PACKET_FLASH_PROGRAM_REMOVED - this packet was removed
> + *
> + * ARMCP_PACKET_UNMASK_RAZWI_IRQ -
> + *       Unmask the given IRQ. The IRQ number is specified in the value field.
> + *       The packet is sent after receiving an interrupt and printing its
> + *       relevant information.
> + *
> + * ARMCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY -
> + *       Unmask the given IRQs. The IRQ numbers are specified in an array right
> + *       after the armcp_packet structure, where its first element is the array
> + *       length. The packet is sent after a soft reset was done in order to
> + *       handle any interrupts that were sent during the reset process.
> + *
> + * ARMCP_PACKET_TEST -
> + *       Test packet for ArmCP connectivity. The CPU will put the fence value
> + *       in the result field.
> + *
> + * ARMCP_PACKET_FREQUENCY_CURR_GET -
> + *       Fetch the current frequency of a specified PLL. The packet's arguments
> + *       specify the PLL.
> + *
> + * ARMCP_PACKET_MAX_POWER_GET -
> + *       Fetch the maximal power of the device.
> + *
> + * ARMCP_PACKET_MAX_POWER_SET -
> + *       Set the maximal power of the device. The packet's arguments specify
> + *       the power.
> + *
> + * ARMCP_PACKET_EEPROM_DATA_GET -
> + *       Get EEPROM data from the ArmCP kernel. The buffer is specified in the
> + *       addr field. The CPU will put the returned data size in the result
> + *       field. In addition, KMD passes the max size it allows the ArmCP to
> + *       write to the structure, to prevent data corruption in case of
> + *       mismatched KMD/FW versions.
> + *
> + */
> +
> +enum armcp_packet_id {
> +     ARMCP_PACKET_DISABLE_PCI_ACCESS = 1,    /* internal */
> +     ARMCP_PACKET_ENABLE_PCI_ACCESS,         /* internal */
> +     ARMCP_PACKET_TEMPERATURE_GET,           /* sysfs */
> +     ARMCP_PACKET_VOLTAGE_GET,               /* sysfs */
> +     ARMCP_PACKET_CURRENT_GET,               /* sysfs */
> +     ARMCP_PACKET_FAN_SPEED_GET,             /* sysfs */
> +     ARMCP_PACKET_PWM_GET,                   /* sysfs */
> +     ARMCP_PACKET_PWM_SET,                   /* sysfs */
> +     ARMCP_PACKET_FREQUENCY_SET,             /* sysfs */
> +     ARMCP_PACKET_FREQUENCY_GET,             /* sysfs */
> +     ARMCP_PACKET_LED_SET,                   /* debugfs */
> +     ARMCP_PACKET_I2C_WR,                    /* debugfs */
> +     ARMCP_PACKET_I2C_RD,                    /* debugfs */
> +     ARMCP_PACKET_INFO_GET,                  /* IOCTL */
> +     ARMCP_PACKET_FLASH_PROGRAM_REMOVED,
> +     ARMCP_PACKET_UNMASK_RAZWI_IRQ,          /* internal */
> +     ARMCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY,    /* internal */
> +     ARMCP_PACKET_TEST,                      /* internal */
> +     ARMCP_PACKET_FREQUENCY_CURR_GET,        /* sysfs */
> +     ARMCP_PACKET_MAX_POWER_GET,             /* sysfs */
> +     ARMCP_PACKET_MAX_POWER_SET,             /* sysfs */
> +     ARMCP_PACKET_EEPROM_DATA_GET,           /* sysfs */
> +};
> +
> +#define ARMCP_PACKET_FENCE_VAL       0xFE8CE7A5
> +
> +struct armcp_packet {
> +     union {
> +             __u64 value;    /* For SET packets */
> +             __u64 result;   /* For GET packets */
> +             __u64 addr;     /* For PQ */
> +     };
> +
> +     union {
> +             struct {
> +                     __u32:12;
> +                     __u32 rc :4;
> +                     __u32 opcode :13;
> +                     __u32 eng_barrier :1;
> +                     __u32 reg_barrier :1;
> +                     __u32 msg_barrier :1;
> +             };
> +             __u32 ctl;
> +     };
> +
> +     __u32 fence;            /* Signal to KMD that message is completed */
> +
> +     union {
> +             struct {/* For temperature/current/voltage/fan/pwm get/set */
> +                     __u16 sensor_index;
> +                     __u16 type;
> +             };
> +
> +             struct {        /* For I2C read/write */
> +                     __u8 i2c_bus;
> +                     __u8 i2c_addr;
> +                     __u8 i2c_reg;
> +                     __u8 pad; /* unused */
> +             };
> +
> +             /* For frequency get/set */
> +             __u32 pll_index;
> +
> +             /* For led set */
> +             __u32 led_index;
> +
> +             /* For get Armcp info/EEPROM data */
> +             __u32 data_max_size;
> +     };
> +};
> +
> +struct armcp_unmask_irq_arr_packet {
> +     struct armcp_packet armcp_pkt;
> +     __u32 length;
> +     __u32 irqs[0];
> +};
> +
> +enum armcp_packet_rc {
> +     armcp_packet_success,
> +     armcp_packet_invalid,
> +     armcp_packet_fault
> +};
> +
> +enum armcp_temp_type {
> +     armcp_temp_input,
> +     armcp_temp_max = 6,
> +     armcp_temp_max_hyst,
> +     armcp_temp_crit,
> +     armcp_temp_crit_hyst
> +};
> +
> +enum armcp_in_attributes {
> +     armcp_in_input,
> +     armcp_in_min,
> +     armcp_in_max
> +};
> +
> +enum armcp_curr_attributes {
> +     armcp_curr_input,
> +     armcp_curr_min,
> +     armcp_curr_max
> +};
> +
> +enum armcp_fan_attributes {
> +     armcp_fan_input,
> +     armcp_fan_min = 2,
> +     armcp_fan_max
> +};
> +
> +enum armcp_pwm_attributes {
> +     armcp_pwm_input,
> +     armcp_pwm_enable
> +};
> +
> +/* Event Queue Packets */
> +
> +struct eq_generic_event {
> +     __u64 data[7];
> +};
> +
>  /*
>   * ArmCP info
>   */
> diff --git a/drivers/misc/habanalabs/irq.c b/drivers/misc/habanalabs/irq.c
> new file mode 100644
> index 000000000000..97b0de7ea5c2
> --- /dev/null
> +++ b/drivers/misc/habanalabs/irq.c
> @@ -0,0 +1,150 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/*
> + * Copyright 2016-2018 HabanaLabs, Ltd.
> + * All Rights Reserved.
> + */
> +
> +#include "habanalabs.h"
> +
> +#include <linux/dma-mapping.h>
> +
> +
> +/**
> + * hl_cq_inc_ptr - increment ci or pi of cq
> + *
> + * @ptr: the current ci or pi value of the completion queue
> + *
> + * Increment ptr by 1. If it reaches the number of completion queue
> + * entries, set it to 0
> + */
> +inline u32 hl_cq_inc_ptr(u32 ptr)
> +{
> +     ptr++;
> +     if (unlikely(ptr == HL_CQ_LENGTH))
> +             ptr = 0;
> +     return ptr;
> +}
> +
> +/**
> + * hl_irq_handler_cq - irq handler for completion queue
> + *
> + * @irq: irq number
> + * @arg: pointer to completion queue structure
> + *
> + */
> +irqreturn_t hl_irq_handler_cq(int irq, void *arg)
> +{
> +     struct hl_cq *cq = arg;
> +     struct hl_device *hdev = cq->hdev;
> +     struct hl_hw_queue *queue;
> +     struct hl_cs_job *job;
> +     bool shadow_index_valid;
> +     u16 shadow_index;
> +     u32 *cq_entry;
> +     u32 *cq_base;
> +
> +     if (hdev->disabled) {
> +             dev_dbg(hdev->dev,
> +                     "Device disabled but received IRQ %d for CQ %d\n",
> +                     irq, cq->hw_queue_id);
> +             return IRQ_HANDLED;
> +     }
> +
> +     cq_base = (u32 *) cq->kernel_address;
> +
> +     while (1) {
> +             bool entry_ready = ((cq_base[cq->ci] & CQ_ENTRY_READY_MASK)
> +                                             >> CQ_ENTRY_READY_SHIFT);
> +
> +             if (!entry_ready)
> +                     break;
> +
> +             cq_entry = (u32 *) &cq_base[cq->ci];
> +
> +             /*
> +              * Make sure we read CQ entry contents after we've
> +              * checked the ownership bit.
> +              */
> +             dma_rmb();
> +
> +             shadow_index_valid =
> +                     ((*cq_entry & CQ_ENTRY_SHADOW_INDEX_VALID_MASK)
> +                                     >> CQ_ENTRY_SHADOW_INDEX_VALID_SHIFT);
> +
> +             shadow_index = (u16)
> +                     ((*cq_entry & CQ_ENTRY_SHADOW_INDEX_MASK)
> +                                     >> CQ_ENTRY_SHADOW_INDEX_SHIFT);
> +
> +             queue = &hdev->kernel_queues[cq->hw_queue_id];
> +
> +             if ((shadow_index_valid) && (!hdev->disabled)) {
> +                     job = queue->shadow_queue[hl_pi_2_offset(shadow_index)];
> +                     queue_work(hdev->cq_wq, &job->finish_work);
> +             }
> +
> +             /*
> +              * Update ci of the context's queue. There is no
> +              * need to protect it with spinlock because this update is
> +              * done only inside IRQ and there is a different IRQ per
> +              * queue
> +              */
> +             queue->ci = hl_queue_inc_ptr(queue->ci);
> +
> +             /* Clear CQ entry ready bit */
> +             cq_base[cq->ci] &= ~CQ_ENTRY_READY_MASK;
> +
> +             cq->ci = hl_cq_inc_ptr(cq->ci);
> +
> +             /* Increment free slots */
> +             atomic_inc(&cq->free_slots_cnt);
> +     }
> +
> +     return IRQ_HANDLED;
> +}
> +
> +/**
> + * hl_cq_init - main initialization function for a cq object
> + *
> + * @hdev: pointer to device structure
> + * @q: pointer to cq structure
> + * @hw_queue_id: The H/W queue ID this completion queue belongs to
> + *
> + * Allocate dma-able memory for the completion queue and initialize fields
> + * Returns 0 on success
> + */
> +int hl_cq_init(struct hl_device *hdev, struct hl_cq *q, u32 hw_queue_id)
> +{
> +     void *p;
> +
> +     BUILD_BUG_ON(HL_CQ_SIZE_IN_BYTES > HL_PAGE_SIZE);
> +
> +     p = hdev->asic_funcs->dma_alloc_coherent(hdev, HL_CQ_SIZE_IN_BYTES,
> +                             &q->bus_address, GFP_KERNEL | __GFP_ZERO);
> +     if (!p)
> +             return -ENOMEM;
> +
> +     q->hdev = hdev;
> +     q->kernel_address = (u64) p;
> +     q->hw_queue_id = hw_queue_id;
> +     q->ci = 0;
> +     q->pi = 0;
> +
> +     atomic_set(&q->free_slots_cnt, HL_CQ_LENGTH);
> +
> +     return 0;
> +}
> +
> +/**
> + * hl_cq_fini - destroy completion queue
> + *
> + * @hdev: pointer to device structure
> + * @q: pointer to cq structure
> + *
> + * Free the completion queue memory
> + */
> +void hl_cq_fini(struct hl_device *hdev, struct hl_cq *q)
> +{
> +     hdev->asic_funcs->dma_free_coherent(hdev, HL_CQ_SIZE_IN_BYTES,
> +                     (void *) q->kernel_address, q->bus_address);
> +}
> -- 
> 2.17.1
> 

-- 
Sincerely yours,
Mike.

Reply via email to