On Mon, 27 Mar 2017 16:10:25 -0700
Eric Anholt <e...@anholt.net> wrote:

> Until now, we've had to limit Raspberry Pi to 256MB of CMA memory to
> keep from triggering the hardware addressing bug between of the tile
> binner of the tile alloc memory (where the top 4 bits come from the
> tile state data array's address).
> 
> To work around that and allow more memory to be reserved for graphics,
> allocate a single BO to store tile state data arrays and tile
> alloc/overflow memory while the GPU is active, and make sure that that
> one BO doesn't happen to cross a 256MB boundary.  With that in place,
> we can allocate textures and shaders anywhere in system memory (still
> contiguous, of course).

It's hard to review something I don't quite understand, but I didn't
spot any problem and the code seems to follow what the commit message
says: make sure the memory used by the tile binner (still have to look
at what a tile binner is :-)) does not cross a 256MB.

So, not sure my review has a real value here, but

Reviewed-by: Boris Brezillon <boris.brezil...@free-electrons.com>

> 
> Signed-off-by: Eric Anholt <e...@anholt.net>
> ---
>  drivers/gpu/drm/vc4/vc4_drv.h       |  28 +++++--
>  drivers/gpu/drm/vc4/vc4_gem.c       |  12 ++-
>  drivers/gpu/drm/vc4/vc4_irq.c       |  61 +++++++--------
>  drivers/gpu/drm/vc4/vc4_render_cl.c |   3 +-
>  drivers/gpu/drm/vc4/vc4_v3d.c       | 150 
> ++++++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/vc4/vc4_validate.c  |  54 ++++++-------
>  6 files changed, 234 insertions(+), 74 deletions(-)
> 
> diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
> index dffce6293d87..5d9532163cbe 100644
> --- a/drivers/gpu/drm/vc4/vc4_drv.h
> +++ b/drivers/gpu/drm/vc4/vc4_drv.h
> @@ -95,12 +95,23 @@ struct vc4_dev {
>        */
>       struct list_head seqno_cb_list;
>  
> -     /* The binner overflow memory that's currently set up in
> -      * BPOA/BPOS registers.  When overflow occurs and a new one is
> -      * allocated, the previous one will be moved to
> -      * vc4->current_exec's free list.
> +     /* The memory used for storing binner tile alloc, tile state,
> +      * and overflow memory allocations.  This is freed when V3D
> +      * powers down.
>        */
> -     struct vc4_bo *overflow_mem;
> +     struct vc4_bo *bin_bo;
> +
> +     /* Size of blocks allocated within bin_bo. */
> +     uint32_t bin_alloc_size;
> +
> +     /* Bitmask of the bin_alloc_size chunks in bin_bo that are
> +      * used.
> +      */
> +     uint32_t bin_alloc_used;
> +
> +     /* Bitmask of the current bin_alloc used for overflow memory. */
> +     uint32_t bin_alloc_overflow;
> +
>       struct work_struct overflow_mem_work;
>  
>       int power_refcount;
> @@ -293,8 +304,12 @@ struct vc4_exec_info {
>       bool found_increment_semaphore_packet;
>       bool found_flush;
>       uint8_t bin_tiles_x, bin_tiles_y;
> -     struct drm_gem_cma_object *tile_bo;
> +     /* Physical address of the start of the tile alloc array
> +      * (where each tile's binned CL will start)
> +      */
>       uint32_t tile_alloc_offset;
> +     /* Bitmask of which binner slots are freed when this job completes. */
> +     uint32_t bin_slots;
>  
>       /**
>        * Computed addresses pointing into exec_bo where we start the
> @@ -522,6 +537,7 @@ void vc4_plane_async_set_fb(struct drm_plane *plane,
>  extern struct platform_driver vc4_v3d_driver;
>  int vc4_v3d_debugfs_ident(struct seq_file *m, void *unused);
>  int vc4_v3d_debugfs_regs(struct seq_file *m, void *unused);
> +int vc4_v3d_get_bin_slot(struct vc4_dev *vc4);
>  
>  /* vc4_validate.c */
>  int
> diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
> index e9c381c42139..3ecd1ba7af75 100644
> --- a/drivers/gpu/drm/vc4/vc4_gem.c
> +++ b/drivers/gpu/drm/vc4/vc4_gem.c
> @@ -705,6 +705,7 @@ static void
>  vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
>  {
>       struct vc4_dev *vc4 = to_vc4_dev(dev);
> +     unsigned long irqflags;
>       unsigned i;
>  
>       if (exec->bo) {
> @@ -720,6 +721,11 @@ vc4_complete_exec(struct drm_device *dev, struct 
> vc4_exec_info *exec)
>               drm_gem_object_unreference_unlocked(&bo->base.base);
>       }
>  
> +     /* Free up the allocation of any bin slots we used. */
> +     spin_lock_irqsave(&vc4->job_lock, irqflags);
> +     vc4->bin_alloc_used &= ~exec->bin_slots;
> +     spin_unlock_irqrestore(&vc4->job_lock, irqflags);
> +
>       mutex_lock(&vc4->power_lock);
>       if (--vc4->power_refcount == 0) {
>               pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
> @@ -968,9 +974,9 @@ vc4_gem_destroy(struct drm_device *dev)
>       /* V3D should already have disabled its interrupt and cleared
>        * the overflow allocation registers.  Now free the object.
>        */
> -     if (vc4->overflow_mem) {
> -             
> drm_gem_object_unreference_unlocked(&vc4->overflow_mem->base.base);
> -             vc4->overflow_mem = NULL;
> +     if (vc4->bin_bo) {
> +             drm_gem_object_put_unlocked(&vc4->bin_bo->base.base);
> +             vc4->bin_bo = NULL;
>       }
>  
>       if (vc4->hang_state)
> diff --git a/drivers/gpu/drm/vc4/vc4_irq.c b/drivers/gpu/drm/vc4/vc4_irq.c
> index cdc6e6760705..47bbec962f45 100644
> --- a/drivers/gpu/drm/vc4/vc4_irq.c
> +++ b/drivers/gpu/drm/vc4/vc4_irq.c
> @@ -59,50 +59,45 @@ vc4_overflow_mem_work(struct work_struct *work)
>  {
>       struct vc4_dev *vc4 =
>               container_of(work, struct vc4_dev, overflow_mem_work);
> -     struct drm_device *dev = vc4->dev;
> -     struct vc4_bo *bo;
> +     struct vc4_bo *bo = vc4->bin_bo;
> +     int bin_bo_slot;
> +     struct vc4_exec_info *exec;
> +     unsigned long irqflags;
>  
> -     bo = vc4_bo_create(dev, 256 * 1024, true);
> -     if (IS_ERR(bo)) {
> +     bin_bo_slot = vc4_v3d_get_bin_slot(vc4);
> +     if (bin_bo_slot < 0) {
>               DRM_ERROR("Couldn't allocate binner overflow mem\n");
>               return;
>       }
>  
> -     /* If there's a job executing currently, then our previous
> -      * overflow allocation is getting used in that job and we need
> -      * to queue it to be released when the job is done.  But if no
> -      * job is executing at all, then we can free the old overflow
> -      * object direcctly.
> -      *
> -      * No lock necessary for this pointer since we're the only
> -      * ones that update the pointer, and our workqueue won't
> -      * reenter.
> -      */
> -     if (vc4->overflow_mem) {
> -             struct vc4_exec_info *current_exec;
> -             unsigned long irqflags;
> -
> -             spin_lock_irqsave(&vc4->job_lock, irqflags);
> -             current_exec = vc4_first_bin_job(vc4);
> -             if (!current_exec)
> -                     current_exec = vc4_last_render_job(vc4);
> -             if (current_exec) {
> -                     vc4->overflow_mem->seqno = current_exec->seqno;
> -                     list_add_tail(&vc4->overflow_mem->unref_head,
> -                                   &current_exec->unref_list);
> -                     vc4->overflow_mem = NULL;
> +     spin_lock_irqsave(&vc4->job_lock, irqflags);
> +
> +     if (vc4->bin_alloc_overflow) {
> +             /* If we had overflow memory allocated previously,
> +              * then that chunk will free when the current bin job
> +              * is done.  If we don't have a bin job running, then
> +              * the chunk will be done whenever the list of render
> +              * jobs has drained.
> +              */
> +             exec = vc4_first_bin_job(vc4);
> +             if (!exec)
> +                     exec = vc4_last_render_job(vc4);
> +             if (exec) {
> +                     exec->bin_slots |= vc4->bin_alloc_overflow;
> +             } else {
> +                     /* There's nothing queued in the hardware, so
> +                      * the old slot is free immediately.
> +                      */
> +                     vc4->bin_alloc_used &= ~vc4->bin_alloc_overflow;
>               }
> -             spin_unlock_irqrestore(&vc4->job_lock, irqflags);
>       }
> +     vc4->bin_alloc_overflow = BIT(bin_bo_slot);
>  
> -     if (vc4->overflow_mem)
> -             
> drm_gem_object_unreference_unlocked(&vc4->overflow_mem->base.base);
> -     vc4->overflow_mem = bo;
> -
> -     V3D_WRITE(V3D_BPOA, bo->base.paddr);
> +     V3D_WRITE(V3D_BPOA, bo->base.paddr + bin_bo_slot * vc4->bin_alloc_size);
>       V3D_WRITE(V3D_BPOS, bo->base.base.size);
>       V3D_WRITE(V3D_INTCTL, V3D_INT_OUTOMEM);
>       V3D_WRITE(V3D_INTENA, V3D_INT_OUTOMEM);
> +     spin_unlock_irqrestore(&vc4->job_lock, irqflags);
>  }
>  
>  static void
> diff --git a/drivers/gpu/drm/vc4/vc4_render_cl.c 
> b/drivers/gpu/drm/vc4/vc4_render_cl.c
> index 4339471f517f..5dc19429d4ae 100644
> --- a/drivers/gpu/drm/vc4/vc4_render_cl.c
> +++ b/drivers/gpu/drm/vc4/vc4_render_cl.c
> @@ -182,8 +182,7 @@ static void emit_tile(struct vc4_exec_info *exec,
>  
>       if (has_bin) {
>               rcl_u8(setup, VC4_PACKET_BRANCH_TO_SUB_LIST);
> -             rcl_u32(setup, (exec->tile_bo->paddr +
> -                             exec->tile_alloc_offset +
> +             rcl_u32(setup, (exec->tile_alloc_offset +
>                               (y * exec->bin_tiles_x + x) * 32));
>       }
>  
> diff --git a/drivers/gpu/drm/vc4/vc4_v3d.c b/drivers/gpu/drm/vc4/vc4_v3d.c
> index 7cc346ad9b0b..a88078d7c9d1 100644
> --- a/drivers/gpu/drm/vc4/vc4_v3d.c
> +++ b/drivers/gpu/drm/vc4/vc4_v3d.c
> @@ -156,6 +156,144 @@ static void vc4_v3d_init_hw(struct drm_device *dev)
>       V3D_WRITE(V3D_VPMBASE, 0);
>  }
>  
> +int vc4_v3d_get_bin_slot(struct vc4_dev *vc4)
> +{
> +     struct drm_device *dev = vc4->dev;
> +     unsigned long irqflags;
> +     int slot;
> +     uint64_t seqno = 0;
> +     struct vc4_exec_info *exec;
> +
> +try_again:
> +     spin_lock_irqsave(&vc4->job_lock, irqflags);
> +     slot = ffs(~vc4->bin_alloc_used);
> +     if (slot != 0) {
> +             /* Switch from ffs() bit index to a 0-based index. */
> +             slot--;
> +             vc4->bin_alloc_used |= BIT(slot);
> +             spin_unlock_irqrestore(&vc4->job_lock, irqflags);
> +             return slot;
> +     }
> +
> +     /* Couldn't find an open slot.  Wait for render to complete
> +      * and try again.
> +      */
> +     exec = vc4_last_render_job(vc4);
> +     if (exec)
> +             seqno = exec->seqno;
> +     spin_unlock_irqrestore(&vc4->job_lock, irqflags);
> +
> +     if (seqno) {
> +             int ret = vc4_wait_for_seqno(dev, seqno, ~0ull, true);
> +
> +             if (ret == 0)
> +                     goto try_again;
> +
> +             return ret;
> +     }
> +
> +     return -ENOMEM;
> +}
> +
> +/**
> + * vc4_allocate_bin_bo() - allocates the memory that will be used for
> + * tile binning.
> + *
> + * The binner has a limitation that the addresses in the tile state
> + * buffer that point into the tile alloc buffer or binner overflow
> + * memory only have 28 bits (256MB), and the top 4 on the bus for
> + * tile alloc references end up coming from the tile state buffer's
> + * address.
> + *
> + * To work around this, we allocate a single large buffer while V3D is
> + * in use, make sure that it has the top 4 bits constant across its
> + * entire extent, and then put the tile state, tile alloc, and binner
> + * overflow memory inside that buffer.
> + *
> + * This creates a limitation where we may not be able to execute a job
> + * if it doesn't fit within the buffer that we allocated up front.
> + * However, it turns out that 16MB is "enough for anybody", and
> + * real-world applications run into allocation failures from the
> + * overall CMA pool before they make scenes complicated enough to run
> + * out of bin space.
> + */
> +int
> +vc4_allocate_bin_bo(struct drm_device *drm)
> +{
> +     struct vc4_dev *vc4 = to_vc4_dev(drm);
> +     struct vc4_v3d *v3d = vc4->v3d;
> +     uint32_t size = 16 * 1024 * 1024;
> +     int ret = 0;
> +     struct list_head list;
> +
> +     /* We may need to try allocating more than once to get a BO
> +      * that doesn't cross 256MB.  Track the ones we've allocated
> +      * that failed so far, so that we can free them when we've got
> +      * one that succeeded (if we freed them right away, our next
> +      * allocation would probably be the same chunk of memory).
> +      */
> +     INIT_LIST_HEAD(&list);
> +
> +     while (true) {
> +             struct vc4_bo *bo = vc4_bo_create(drm, size, true);
> +
> +             if (IS_ERR(bo)) {
> +                     ret = PTR_ERR(bo);
> +
> +                     dev_err(&v3d->pdev->dev,
> +                             "Failed to allocate memory for tile binning: "
> +                             "%d. You may need to enable CMA or give it "
> +                             "more memory.",
> +                             ret);
> +                     break;
> +             }
> +
> +             /* Check if this BO won't trigger the addressing bug. */
> +             if ((bo->base.paddr & 0xf0000000) ==
> +                 ((bo->base.paddr + bo->base.base.size - 1) & 0xf0000000)) {
> +                     vc4->bin_bo = bo;
> +
> +                     /* Set up for allocating 512KB chunks of
> +                      * binner memory.  The biggest allocation we
> +                      * need to do is for the initial tile alloc +
> +                      * tile state buffer.  We can render to a
> +                      * maximum of ((2048*2048) / (32*32) = 4096
> +                      * tiles in a frame (until we do floating
> +                      * point rendering, at which point it would be
> +                      * 8192).  Tile state is 48b/tile (rounded to
> +                      * a page), and tile alloc is 32b/tile
> +                      * (rounded to a page), plus a page of extra,
> +                      * for a total of 320kb for our worst-case.
> +                      * We choose 512kb so that it divides evenly
> +                      * into our 16MB, and the rest of the 512kb
> +                      * will be used as storage for the overflow
> +                      * from the initial 32b CL per bin.
> +                      */
> +                     vc4->bin_alloc_size = 512 * 1024;
> +                     vc4->bin_alloc_used = 0;
> +                     vc4->bin_alloc_overflow = 0;
> +                     WARN_ON_ONCE(sizeof(vc4->bin_alloc_used) * 8 !=
> +                                  bo->base.base.size / vc4->bin_alloc_size);
> +
> +                     break;
> +             }
> +
> +             /* Put it on the list to free later, and try again. */
> +             list_add(&bo->unref_head, &list);
> +     }
> +
> +     /* Free all the BOs we allocated but didn't choose. */
> +     while (!list_empty(&list)) {
> +             struct vc4_bo *bo = list_last_entry(&list,
> +                                                 struct vc4_bo, unref_head);
> +
> +             list_del(&bo->unref_head);
> +             drm_gem_object_put_unlocked(&bo->base.base);
> +     }
> +
> +     return ret;
> +}
> +
>  #ifdef CONFIG_PM
>  static int vc4_v3d_runtime_suspend(struct device *dev)
>  {
> @@ -164,6 +302,9 @@ static int vc4_v3d_runtime_suspend(struct device *dev)
>  
>       vc4_irq_uninstall(vc4->dev);
>  
> +     drm_gem_object_put_unlocked(&vc4->bin_bo->base.base);
> +     vc4->bin_bo = NULL;
> +
>       return 0;
>  }
>  
> @@ -171,6 +312,11 @@ static int vc4_v3d_runtime_resume(struct device *dev)
>  {
>       struct vc4_v3d *v3d = dev_get_drvdata(dev);
>       struct vc4_dev *vc4 = v3d->vc4;
> +     int ret;
> +
> +     ret = vc4_allocate_bin_bo(vc4->dev);
> +     if (ret)
> +             return ret;
>  
>       vc4_v3d_init_hw(vc4->dev);
>       vc4_irq_postinstall(vc4->dev);
> @@ -208,6 +354,10 @@ static int vc4_v3d_bind(struct device *dev, struct 
> device *master, void *data)
>               return -EINVAL;
>       }
>  
> +     ret = vc4_allocate_bin_bo(drm);
> +     if (ret)
> +             return ret;
> +
>       /* Reset the binner overflow address/size at setup, to be sure
>        * we don't reuse an old one.
>        */
> diff --git a/drivers/gpu/drm/vc4/vc4_validate.c 
> b/drivers/gpu/drm/vc4/vc4_validate.c
> index da6f1e138e8d..3de8f11595c0 100644
> --- a/drivers/gpu/drm/vc4/vc4_validate.c
> +++ b/drivers/gpu/drm/vc4/vc4_validate.c
> @@ -348,10 +348,11 @@ static int
>  validate_tile_binning_config(VALIDATE_ARGS)
>  {
>       struct drm_device *dev = exec->exec_bo->base.dev;
> -     struct vc4_bo *tile_bo;
> +     struct vc4_dev *vc4 = to_vc4_dev(dev);
>       uint8_t flags;
> -     uint32_t tile_state_size, tile_alloc_size;
> -     uint32_t tile_count;
> +     uint32_t tile_state_size;
> +     uint32_t tile_count, bin_addr;
> +     int bin_slot;
>  
>       if (exec->found_tile_binning_mode_config_packet) {
>               DRM_ERROR("Duplicate VC4_PACKET_TILE_BINNING_MODE_CONFIG\n");
> @@ -377,13 +378,28 @@ validate_tile_binning_config(VALIDATE_ARGS)
>               return -EINVAL;
>       }
>  
> +     bin_slot = vc4_v3d_get_bin_slot(vc4);
> +     if (bin_slot < 0) {
> +             if (bin_slot != -EINTR && bin_slot != -ERESTARTSYS) {
> +                     DRM_ERROR("Failed to allocate binner memory: %d\n",
> +                               bin_slot);
> +             }
> +             return bin_slot;
> +     }
> +
> +     /* The slot we allocated will only be used by this job, and is
> +      * free when the job completes rendering.
> +      */
> +     exec->bin_slots |= BIT(bin_slot);
> +     bin_addr = vc4->bin_bo->base.paddr + bin_slot * vc4->bin_alloc_size;
> +
>       /* The tile state data array is 48 bytes per tile, and we put it at
>        * the start of a BO containing both it and the tile alloc.
>        */
>       tile_state_size = 48 * tile_count;
>  
>       /* Since the tile alloc array will follow us, align. */
> -     exec->tile_alloc_offset = roundup(tile_state_size, 4096);
> +     exec->tile_alloc_offset = bin_addr + roundup(tile_state_size, 4096);
>  
>       *(uint8_t *)(validated + 14) =
>               ((flags & ~(VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_MASK |
> @@ -394,35 +410,13 @@ validate_tile_binning_config(VALIDATE_ARGS)
>                VC4_SET_FIELD(VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_128,
>                              VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE));
>  
> -     /* Initial block size. */
> -     tile_alloc_size = 32 * tile_count;
> -
> -     /*
> -      * The initial allocation gets rounded to the next 256 bytes before
> -      * the hardware starts fulfilling further allocations.
> -      */
> -     tile_alloc_size = roundup(tile_alloc_size, 256);
> -
> -     /* Add space for the extra allocations.  This is what gets used first,
> -      * before overflow memory.  It must have at least 4096 bytes, but we
> -      * want to avoid overflow memory usage if possible.
> -      */
> -     tile_alloc_size += 1024 * 1024;
> -
> -     tile_bo = vc4_bo_create(dev, exec->tile_alloc_offset + tile_alloc_size,
> -                             true);
> -     exec->tile_bo = &tile_bo->base;
> -     if (IS_ERR(exec->tile_bo))
> -             return PTR_ERR(exec->tile_bo);
> -     list_add_tail(&tile_bo->unref_head, &exec->unref_list);
> -
>       /* tile alloc address. */
> -     *(uint32_t *)(validated + 0) = (exec->tile_bo->paddr +
> -                                     exec->tile_alloc_offset);
> +     *(uint32_t *)(validated + 0) = exec->tile_alloc_offset;
>       /* tile alloc size. */
> -     *(uint32_t *)(validated + 4) = tile_alloc_size;
> +     *(uint32_t *)(validated + 4) = (bin_addr + vc4->bin_alloc_size -
> +                                     exec->tile_alloc_offset);
>       /* tile state address. */
> -     *(uint32_t *)(validated + 8) = exec->tile_bo->paddr;
> +     *(uint32_t *)(validated + 8) = bin_addr;
>  
>       return 0;
>  }

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Reply via email to