On Sat, Oct 22, 2016 at 10:50:34AM -0700, Jason Ekstrand wrote: > This method of doing copies has the advantage of touching very little of > the GPU state. While it does disable all the shader stages, it doesn't > have to blow away binding tables, viewports, scissors, or any other bits of > dynamic state other than VBO 32 which is already reserved. All of the > state that it does touch is contained within a pipeline anyway so that's > the only thing that has to be dirtied. > --- > src/intel/vulkan/Makefile.sources | 4 + > src/intel/vulkan/anv_genX.h | 5 + > src/intel/vulkan/genX_gpu_memcpy.c | 223 > +++++++++++++++++++++++++++++++++++++ > 3 files changed, 232 insertions(+) > create mode 100644 src/intel/vulkan/genX_gpu_memcpy.c > > diff --git a/src/intel/vulkan/Makefile.sources > b/src/intel/vulkan/Makefile.sources > index c51c146..910996e 100644 > --- a/src/intel/vulkan/Makefile.sources > +++ b/src/intel/vulkan/Makefile.sources > @@ -66,6 +66,7 @@ VULKAN_GENERATED_FILES := \ > GEN7_FILES := \ > genX_cmd_buffer.c \ > genX_blorp_exec.c \ > + genX_gpu_memcpy.c \ > genX_pipeline.c \ > gen7_cmd_buffer.c \ > gen7_pipeline.c \ > @@ -74,6 +75,7 @@ GEN7_FILES := \ > GEN75_FILES := \ > genX_cmd_buffer.c \ > genX_blorp_exec.c \ > + genX_gpu_memcpy.c \ > genX_pipeline.c \ > gen7_cmd_buffer.c \ > gen7_pipeline.c \ > @@ -82,6 +84,7 @@ GEN75_FILES := \ > GEN8_FILES := \ > genX_cmd_buffer.c \ > genX_blorp_exec.c \ > + genX_gpu_memcpy.c \ > genX_pipeline.c \ > gen8_cmd_buffer.c \ > gen8_pipeline.c \ > @@ -90,6 +93,7 @@ GEN8_FILES := \ > GEN9_FILES := \ > genX_cmd_buffer.c \ > genX_blorp_exec.c \ > + genX_gpu_memcpy.c \ > genX_pipeline.c \ > gen8_cmd_buffer.c \ > gen8_pipeline.c \ > diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h > index d4ed325..6bf9381 100644 > --- a/src/intel/vulkan/anv_genX.h > +++ b/src/intel/vulkan/anv_genX.h > @@ -69,5 +69,10 @@ genX(graphics_pipeline_create)(VkDevice _device, > const VkAllocationCallbacks *alloc, > VkPipeline 
*pPipeline); > > +void genX(cmd_buffer_gpu_memcpy)(struct anv_cmd_buffer *cmd_buffer, > + struct anv_bo *dst, uint32_t dst_offset, > + struct anv_bo *src, uint32_t src_offset, > + uint32_t size); > + > void genX(blorp_exec)(struct blorp_batch *batch, > const struct blorp_params *params); > diff --git a/src/intel/vulkan/genX_gpu_memcpy.c > b/src/intel/vulkan/genX_gpu_memcpy.c > new file mode 100644 > index 0000000..65df376 > --- /dev/null > +++ b/src/intel/vulkan/genX_gpu_memcpy.c > @@ -0,0 +1,223 @@ > +/* > + * Copyright © 2016 Intel Corporation > + * > + * Permission is hereby granted, free of charge, to any person obtaining a > + * copy of this software and associated documentation files (the "Software"), > + * to deal in the Software without restriction, including without limitation > + * the rights to use, copy, modify, merge, publish, distribute, sublicense, > + * and/or sell copies of the Software, and to permit persons to whom the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice (including the next > + * paragraph) shall be included in all copies or substantial portions of the > + * Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING > + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER > DEALINGS > + * IN THE SOFTWARE. 
> + */ > + > +#include "anv_private.h" > + > +#include "genxml/gen_macros.h" > +#include "genxml/genX_pack.h" > + > +#include "common/gen_l3_config.h" > + > +/** > + * This file implements some lightweight memcpy/memset operations on the GPU > + * using a vertex buffer and streamout. > + */ > + > +/** > + * Returns the greatest common divisor of a and b that is a power of two. > + */ > +static inline uint64_t > +gcd_pow2_u64(uint64_t a, uint64_t b) > +{ > + assert(a > 0 || b > 0); > + > + unsigned a_log2 = ffsll(a) - 1; > + unsigned b_log2 = ffsll(b) - 1;
These could be const. > + > + /* If either a or b is 0, then a_log2 or b_log2 till be UINT_MAX in which will? > + * case, the MIN2() will take the other one. If both are 0 then we will > + * hit the assert above. > + */ > + return 1 << MIN2(a_log2, b_log2); > + } > + > +void > +genX(cmd_buffer_gpu_memcpy)(struct anv_cmd_buffer *cmd_buffer, > + struct anv_bo *dst, uint32_t dst_offset, > + struct anv_bo *src, uint32_t src_offset, > + uint32_t size) > +{ > + if (size == 0) > + return; > + > + assert(dst_offset + size <= dst->size); > + assert(src_offset + size <= src->size); > + > + unsigned bs = 16; Perhaps a short comment noting that at most four 32-bit components can be written at a time. > + bs = gcd_pow2_u64(bs, src_offset); > + bs = gcd_pow2_u64(bs, dst_offset); > + bs = gcd_pow2_u64(bs, size); > + > + enum isl_format format; > + switch (bs) { > + case 4: format = ISL_FORMAT_R32_UINT; break; > + case 8: format = ISL_FORMAT_R32G32_UINT; break; > + case 16: format = ISL_FORMAT_R32G32B32A32_UINT; break; > + default: > + unreachable("Invalid size"); > + } > + > + if (!cmd_buffer->state.current_l3_config) { > + const struct gen_l3_config *cfg = > + gen_get_default_l3_config(&cmd_buffer->device->info); > + genX(cmd_buffer_config_l3)(cmd_buffer, cfg); > + } I was wondering about this, but reading further down I realized that the URB setup relies on this. 
> + > + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); > + > + genX(flush_pipeline_select_3d)(cmd_buffer); > + > + uint32_t *dw; > + dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(3DSTATE_VERTEX_BUFFERS)); > + GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, dw + 1, > + &(struct GENX(VERTEX_BUFFER_STATE)) { > + .VertexBufferIndex = 32, /* Reserved for this */ > + .AddressModifyEnable = true, > + .BufferStartingAddress = { src, src_offset }, > + .BufferPitch = bs, > +#if (GEN_GEN >= 8) > + .MemoryObjectControlState = GENX(MOCS), > + .BufferSize = size, > +#else > + .VertexBufferMemoryObjectControlState = GENX(MOCS), > + .EndAddress = { src, src_offset + size - 1 }, > +#endif > + }); > + > + dw = anv_batch_emitn(&cmd_buffer->batch, 3, > GENX(3DSTATE_VERTEX_ELEMENTS)); > + GENX(VERTEX_ELEMENT_STATE_pack)(&cmd_buffer->batch, dw + 1, > + &(struct GENX(VERTEX_ELEMENT_STATE)) { > + .VertexBufferIndex = 32, > + .Valid = true, > + .SourceElementFormat = format, > + .SourceElementOffset = 0, > + .Component0Control = (bs >= 4) ? VFCOMP_STORE_SRC : VFCOMP_STORE_0, > + .Component1Control = (bs >= 8) ? VFCOMP_STORE_SRC : VFCOMP_STORE_0, > + .Component2Control = (bs >= 12) ? VFCOMP_STORE_SRC : VFCOMP_STORE_0, I suppose this could test against 16 as well (bs can be only 4,8 or 16). Just wondering aloud really. > + .Component3Control = (bs >= 16) ? 
VFCOMP_STORE_SRC : VFCOMP_STORE_0, > + }); > + > +#if GEN_GEN >= 8 > + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_SGVS), sgvs); > +#endif > + > + /* Disable all shader stages */ > + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VS), vs); > + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HS), hs); > + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TE), te); > + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DS), DS); > + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_GS), gs); > + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_PS), gs); > + > + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SBE), sbe) { > + sbe.VertexURBEntryReadOffset = 1; > + sbe.NumberofSFOutputAttributes = 1; > + sbe.VertexURBEntryReadLength = 1; > +#if GEN_GEN >= 8 > + sbe.ForceVertexURBEntryReadLength = true; > + sbe.ForceVertexURBEntryReadOffset = true; > +#endif > + > +#if GEN_GEN >= 9 > + for (unsigned i = 0; i < 32; i++) > + sbe.AttributeActiveComponentFormat[i] = ACF_XYZW; > +#endif > + } > + > + genX(emit_urb_setup)(cmd_buffer->device, &cmd_buffer->batch, > + VK_SHADER_STAGE_VERTEX_BIT | > + VK_SHADER_STAGE_FRAGMENT_BIT, Perhaps a small comment explaining why the VS and FS URB entries need to be reset while the rest can be left as they were. 
> + DIV_ROUND_UP(32, 64), 0, > + cmd_buffer->state.current_l3_config); > + > + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) { > + sob.SOBufferIndex = 0; > + sob.SOBufferObjectControlState = GENX(MOCS); > + sob.SurfaceBaseAddress = (struct anv_address) { dst, dst_offset }; > + > +#if GEN_GEN >= 8 > + sob.SOBufferEnable = true; > + sob.SurfaceSize = size - 1; > +#else > + sob.SurfacePitch = bs; > + sob.SurfaceEndAddress = sob.SurfaceBaseAddress; > + sob.SurfaceEndAddress.offset += size; > +#endif > + > +#if GEN_GEN >= 8 > + /* Always start the stream off at offset 0 */ > + sob.StreamOffsetWriteEnable = true; > + sob.StreamOffset = 0; > +#endif > + } > + > +#if GEN_GEN <= 7 > + /* The hardware can do this for us on BDW+ */ Checking that I understood correctly: this is needed because the SOL stage automatically updates the offset register and we need to reset it before we issue a new operation? If so, perhaps a short comment. > + anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), load) { > + load.RegisterOffset = GENX(SO_WRITE_OFFSET0_num); > + load.DataDWord = 0; > + } > +#endif > + > + dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(3DSTATE_SO_DECL_LIST), > + .StreamtoBufferSelects0 = (1 << 0), > + .NumEntries0 = 1); > + GENX(SO_DECL_ENTRY_pack)(&cmd_buffer->batch, dw + 3, > + &(struct GENX(SO_DECL_ENTRY)) { > + .Stream0Decl = { > + .OutputBufferSlot = 0, > + .RegisterIndex = 0, > + .ComponentMask = (1 << (bs / 4)) - 1, > + }, > + }); > + > + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT), so) { > + so.SOFunctionEnable = true; > + so.RenderingDisable = true; > + so.Stream0VertexReadOffset = 0; > + so.Stream0VertexReadLength = DIV_ROUND_UP(32, 64); > +#if GEN_GEN >= 8 > + so.Buffer0SurfacePitch = bs; > +#else > + so.SOBufferEnable0 = true; > +#endif > + } > + > +#if GEN_GEN >= 8 > + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), topo) { > + topo.PrimitiveTopologyType = _3DPRIM_POINTLIST; > + } > +#endif > + > + 
anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { > + prim.VertexAccessType = SEQUENTIAL; > + prim.PrimitiveTopologyType = _3DPRIM_POINTLIST; > + prim.VertexCountPerInstance = size / bs; > + prim.StartVertexLocation = 0; > + prim.InstanceCount = 1; > + prim.StartInstanceLocation = 0; > + prim.BaseVertexLocation = 0; > + } > + > + cmd_buffer->state.dirty |= ANV_CMD_DIRTY_PIPELINE; > +} > -- > 2.5.0.400.gff86faf > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev