Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com> > On Oct 26, 2016, at 7:08 PM, George Kyriazis <george.kyria...@intel.com> > wrote: > > Used for common code reuse and simplification > --- > .../drivers/swr/rasterizer/core/backend.cpp | 561 ++++++++------------- > src/gallium/drivers/swr/rasterizer/core/backend.h | 150 +++++- > 2 files changed, 351 insertions(+), 360 deletions(-) > > diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp > b/src/gallium/drivers/swr/rasterizer/core/backend.cpp > index f71c2b2..3b22892 100644 > --- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp > +++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp > @@ -451,134 +451,95 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t > workerId, uint32_t x, uint3 > AR_BEGIN(BESingleSampleBackend, pDC->drawId); > AR_BEGIN(BESetup, pDC->drawId); > > - const API_STATE& state = GetApiState(pDC); > - const SWR_RASTSTATE& rastState = state.rastState; > - const SWR_PS_STATE *pPSState = &state.psState; > - const SWR_BLEND_STATE *pBlendState = &state.blendState; > - uint64_t coverageMask = work.coverageMask[0]; > + const API_STATE &state = GetApiState(pDC); > > - // broadcast scalars > BarycentricCoeffs coeffs; > - coeffs.vIa = _simd_broadcast_ss(&work.I[0]); > - coeffs.vIb = _simd_broadcast_ss(&work.I[1]); > - coeffs.vIc = _simd_broadcast_ss(&work.I[2]); > - > - coeffs.vJa = _simd_broadcast_ss(&work.J[0]); > - coeffs.vJb = _simd_broadcast_ss(&work.J[1]); > - coeffs.vJc = _simd_broadcast_ss(&work.J[2]); > - > - coeffs.vZa = _simd_broadcast_ss(&work.Z[0]); > - coeffs.vZb = _simd_broadcast_ss(&work.Z[1]); > - coeffs.vZc = _simd_broadcast_ss(&work.Z[2]); > + SetupBarycentricCoeffs(&coeffs, work); > > - coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet); > + uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, > *pStencilBuffer; > + SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, > state.psState.numRenderTargets, renderBuffers); > > - coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]); > - coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]); > - coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]); > + SWR_PS_CONTEXT psContext; > + SetupPixelShaderContext<T>(&psContext, work); > > - uint8_t *pColorBase[SWR_NUM_RENDERTARGETS]; > - uint32_t NumRT = state.psState.numRenderTargets; > - for(uint32_t rt = 0; rt < NumRT; ++rt) > - { > - pColorBase[rt] = renderBuffers.pColor[rt]; > - } > - uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = > renderBuffers.pStencil; > AR_END(BESetup, 1); > > - SWR_PS_CONTEXT psContext; > - psContext.pAttribs = work.pAttribs; > - psContext.pPerspAttribs = work.pPerspAttribs; > - psContext.frontFace = work.triFlags.frontFacing; > - psContext.primID = work.triFlags.primID; > - > - // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the > shader because of pull attribs > - psContext.I = work.I; > - psContext.J = work.J; > - psContext.recipDet = work.recipDet; > - psContext.pRecipW = work.pRecipW; > - psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX; > - psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY; > - psContext.rasterizerSampleCount = T::MultisampleT::numSamples; > + psContext.vY.UL = _simd_add_ps(vULOffsetsY, > _simd_set1_ps(static_cast<float>(y))); > + psContext.vY.center = _simd_add_ps(vCenterOffsetsY, > _simd_set1_ps(static_cast<float>(y))); > > - for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) > + const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM)); > + > + for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) > { > - // UL pixel corner > - psContext.vY.UL = _simd_add_ps(vULOffsetsY, > _simd_set1_ps((float)yy)); > - // pixel center > - psContext.vY.center = _simd_add_ps(vCenterOffsetsY, > _simd_set1_ps((float)yy)); > + psContext.vX.UL = _simd_add_ps(vULOffsetsX, > _simd_set1_ps(static_cast<float>(x))); > + psContext.vX.center = _simd_add_ps(vCenterOffsetsX, > _simd_set1_ps(static_cast<float>(x))); > > - for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) > + const simdscalar dx = > _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM)); > + > + for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += > SIMD_TILE_X_DIM) > { > #if USE_8x2_TILE_BACKEND > const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); > > #endif > - if(coverageMask & MASK) > + simdmask coverageMask = work.coverageMask[0] & MASK; > + > + if (coverageMask) > { > - psContext.vX.UL = _simd_add_ps(vULOffsetsX, > _simd_set1_ps((float)xx)); > - // pixel center > - psContext.vX.center = _simd_add_ps(vCenterOffsetsX, > _simd_set1_ps((float)xx)); > + if (state.depthHottileEnable && > state.depthBoundsState.depthBoundsTestEnable) > + { > + static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, > "Unsupported depth hot tile format"); > + > + const simdscalar z = > _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer)); > > - if(T::InputCoverage != SWR_INPUT_COVERAGE_NONE) > + const float minz = > state.depthBoundsState.depthBoundsTestMinValue; > + const float maxz = > state.depthBoundsState.depthBoundsTestMaxValue; > + > + coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz); > + } > + > + if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) > { > - const uint64_t* pCoverageMask = (T::InputCoverage == > SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : > - &work.coverageMask[0]; > - generateInputCoverage<T, > T::InputCoverage>(pCoverageMask, psContext.inputMask, > pBlendState->sampleMask); > + const uint64_t* pCoverageMask = (T::InputCoverage == > SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : > &work.coverageMask[0]; > + > + generateInputCoverage<T, > T::InputCoverage>(pCoverageMask, psContext.inputMask, > state.blendState.sampleMask); > } > > AR_BEGIN(BEBarycentric, pDC->drawId); > + > CalcPixelBarycentrics(coeffs, psContext); > > - // for 1x case, centroid is pixel center > - psContext.vX.centroid = psContext.vX.center; > - psContext.vY.centroid = psContext.vY.center; > - psContext.vI.centroid = psContext.vI.center; > - psContext.vJ.centroid = psContext.vJ.center; > - psContext.vOneOverW.centroid = psContext.vOneOverW.center; > + CalcCentroid<T, true>(&psContext, coeffs, work.coverageMask, > state.blendState.sampleMask); > > // interpolate and quantize z > psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, > psContext.vI.center, psContext.vJ.center); > psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); > + > AR_END(BEBarycentric, 1); > > - simdmask clipCoverageMask = coverageMask & MASK; > // interpolate user clip distance if available > - if (rastState.clipDistanceMask) > - { > - clipCoverageMask &= > ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer, > - psContext.vI.center, psContext.vJ.center); > - } > - > - if (state.depthHottileEnable && > state.depthBoundsState.depthBoundsTestEnable) > + if (state.rastState.clipDistanceMask) > { > - static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, > "Unsupported depth hot tile format"); > - > - const simdscalar z = > _simd_load_ps(reinterpret_cast<const float *>(pDepthBase)); > - > - const float minz = > state.depthBoundsState.depthBoundsTestMinValue; > - const float maxz = > state.depthBoundsState.depthBoundsTestMaxValue; > - > - clipCoverageMask &= CalcDepthBoundsAcceptMask(z, minz, > maxz); > + coverageMask &= > ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, > psContext.vI.center, psContext.vJ.center); > } > > - simdscalar vCoverageMask = vMask(clipCoverageMask); > + simdscalar vCoverageMask = vMask(coverageMask); > simdscalar depthPassMask = vCoverageMask; > simdscalar stencilPassMask = vCoverageMask; > > // Early-Z? > - if(T::bCanEarlyZ) > + if (T::bCanEarlyZ) > { > AR_BEGIN(BEEarlyDepthTest, pDC->drawId); > depthPassMask = DepthStencilTest(&state, > work.triFlags.frontFacing, work.triFlags.viewportIndex, > - psContext.vZ, > pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask); > + psContext.vZ, > pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask); > AR_END(BEEarlyDepthTest, 0); > > // early-exit if no pixels passed depth or earlyZ is > forced on > - if(pPSState->forceEarlyZ || > !_simd_movemask_ps(depthPassMask)) > + if (state.psState.forceEarlyZ || > !_simd_movemask_ps(depthPassMask)) > { > > DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], > &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, > - pDepthBase, depthPassMask, > vCoverageMask, pStencilBase, stencilPassMask); > + pDepthBuffer, depthPassMask, vCoverageMask, > pStencilBuffer, stencilPassMask); > > if (!_simd_movemask_ps(depthPassMask)) > { > @@ -599,18 +560,18 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t > workerId, uint32_t x, uint3 > vCoverageMask = _simd_castsi_ps(psContext.activeMask); > > // late-Z > - if(!T::bCanEarlyZ) > + if (!T::bCanEarlyZ) > { > AR_BEGIN(BELateDepthTest, pDC->drawId); > depthPassMask = DepthStencilTest(&state, > work.triFlags.frontFacing, work.triFlags.viewportIndex, > - psContext.vZ, > pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask); > + psContext.vZ, > pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask); > AR_END(BELateDepthTest, 0); > > - if(!_simd_movemask_ps(depthPassMask)) > + if (!_simd_movemask_ps(depthPassMask)) > { > // need to call depth/stencil write for stencil write > > DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], > &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, > - pDepthBase, depthPassMask, > vCoverageMask, pStencilBase, stencilPassMask); > + pDepthBuffer, depthPassMask, vCoverageMask, > pStencilBuffer, stencilPassMask); > goto Endtile; > } > } > @@ -622,47 +583,56 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t > workerId, uint32_t x, uint3 > // output merger > AR_BEGIN(BEOutputMerger, pDC->drawId); > #if USE_8x2_TILE_BACKEND > - OutputMerger(psContext, pColorBase, 0, pBlendState, > state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets, > useAlternateOffset); > + OutputMerger(psContext, pColorBuffer, 0, &state.blendState, > state.pfnBlendFunc, vCoverageMask, depthPassMask, > state.psState.numRenderTargets, useAlternateOffset); > #else > - OutputMerger(psContext, pColorBase, 0, pBlendState, > state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets); > + OutputMerger(psContext, pColorBuffer, 0, &state.blendState, > state.pfnBlendFunc, vCoverageMask, depthPassMask, > state.psState.numRenderTargets); > #endif > > // do final depth write after all pixel kills > - if (!pPSState->forceEarlyZ) > + if (!state.psState.forceEarlyZ) > { > DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], > &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, > - pDepthBase, depthPassMask, vCoverageMask, > pStencilBase, stencilPassMask); > + pDepthBuffer, depthPassMask, vCoverageMask, > pStencilBuffer, stencilPassMask); > } > AR_END(BEOutputMerger, 0); > } > > Endtile: > AR_BEGIN(BEEndTile, pDC->drawId); > - coverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); > + > + work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); > if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) > { > work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * > SIMD_TILE_X_DIM); > } > - pDepthBase += (KNOB_SIMD_WIDTH * > FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; > - pStencilBase += (KNOB_SIMD_WIDTH * > FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; > > #if USE_8x2_TILE_BACKEND > if (useAlternateOffset) > { > - for (uint32_t rt = 0; rt < NumRT; ++rt) > + for (uint32_t rt = 0; rt < state.psState.numRenderTargets; > ++rt) > { > - pColorBase[rt] += (2 * KNOB_SIMD_WIDTH * > FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; > + pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * > FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; > } > } > #else > - for (uint32_t rt = 0; rt < NumRT; ++rt) > + for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) > { > - pColorBase[rt] += (KNOB_SIMD_WIDTH * > FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; > + pColorBuffer[rt] += (KNOB_SIMD_WIDTH * > FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; > } > #endif > + pDepthBuffer += (KNOB_SIMD_WIDTH * > FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; > + pStencilBuffer += (KNOB_SIMD_WIDTH * > FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; > + > AR_END(BEEndTile, 0); > + > + psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx); > + psContext.vX.center = _simd_add_ps(psContext.vX.center, dx); > } > + > + psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy); > + psContext.vY.center = _simd_add_ps(psContext.vY.center, dy); > } > + > AR_END(BESingleSampleBackend, 0); > } > > @@ -674,111 +644,76 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t > workerId, uint32_t x, uint32_ > AR_BEGIN(BESampleRateBackend, pDC->drawId); > AR_BEGIN(BESetup, pDC->drawId); > > - const API_STATE& state = GetApiState(pDC); > - const SWR_RASTSTATE& rastState = state.rastState; > - const SWR_PS_STATE *pPSState = &state.psState; > - const SWR_BLEND_STATE *pBlendState = &state.blendState; > + const API_STATE &state = GetApiState(pDC); > > - // broadcast scalars > BarycentricCoeffs coeffs; > - coeffs.vIa = _simd_broadcast_ss(&work.I[0]); > - coeffs.vIb = _simd_broadcast_ss(&work.I[1]); > - coeffs.vIc = _simd_broadcast_ss(&work.I[2]); > + SetupBarycentricCoeffs(&coeffs, work); > > - coeffs.vJa = _simd_broadcast_ss(&work.J[0]); > - coeffs.vJb = _simd_broadcast_ss(&work.J[1]); > - coeffs.vJc = _simd_broadcast_ss(&work.J[2]); > + uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, > *pStencilBuffer; > + SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, > state.psState.numRenderTargets, renderBuffers); > > - coeffs.vZa = _simd_broadcast_ss(&work.Z[0]); > - coeffs.vZb = _simd_broadcast_ss(&work.Z[1]); > - coeffs.vZc = _simd_broadcast_ss(&work.Z[2]); > - > - coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet); > - > - coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]); > - coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]); > - coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]); > + SWR_PS_CONTEXT psContext; > + SetupPixelShaderContext<T>(&psContext, work); > > - uint8_t *pColorBase[SWR_NUM_RENDERTARGETS]; > - uint32_t NumRT = state.psState.numRenderTargets; > - for(uint32_t rt = 0; rt < NumRT; ++rt) > - { > - pColorBase[rt] = renderBuffers.pColor[rt]; > - } > - uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = > renderBuffers.pStencil; > AR_END(BESetup, 0); > > - SWR_PS_CONTEXT psContext; > - psContext.pAttribs = work.pAttribs; > - psContext.pPerspAttribs = work.pPerspAttribs; > - psContext.pRecipW = work.pRecipW; > - psContext.frontFace = work.triFlags.frontFacing; > - psContext.primID = work.triFlags.primID; > - > - // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the > shader because of pull attribs > - psContext.I = work.I; > - psContext.J = work.J; > - psContext.recipDet = work.recipDet; > - psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX; > - psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY; > - psContext.rasterizerSampleCount = T::MultisampleT::numSamples; > + psContext.vY.UL = _simd_add_ps(vULOffsetsY, > _simd_set1_ps(static_cast<float>(y))); > + psContext.vY.center = _simd_add_ps(vCenterOffsetsY, > _simd_set1_ps(static_cast<float>(y))); > + > + const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM)); > > for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) > { > - // UL pixel corner > - psContext.vY.UL = _simd_add_ps(vULOffsetsY, > _simd_set1_ps((float)yy)); > - // pixel center > - psContext.vY.center = _simd_add_ps(vCenterOffsetsY, > _simd_set1_ps((float)yy)); > - > + psContext.vX.UL = _simd_add_ps(vULOffsetsX, > _simd_set1_ps(static_cast<float>(x))); > + psContext.vX.center = _simd_add_ps(vCenterOffsetsX, > _simd_set1_ps(static_cast<float>(x))); > + > + const simdscalar dx = > _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM)); > + > for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) > { > #if USE_8x2_TILE_BACKEND > const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); > > #endif > - psContext.vX.UL = _simd_add_ps(vULOffsetsX, > _simd_set1_ps((float)xx)); > - // pixel center > - psContext.vX.center = _simd_add_ps(vCenterOffsetsX, > _simd_set1_ps((float)xx)); > + if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) > + { > + const uint64_t* pCoverageMask = (T::InputCoverage == > SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : > &work.coverageMask[0]; > + > + generateInputCoverage<T, T::InputCoverage>(pCoverageMask, > psContext.inputMask, state.blendState.sampleMask); > + } > > AR_BEGIN(BEBarycentric, pDC->drawId); > + > CalcPixelBarycentrics(coeffs, psContext); > - AR_END(BEBarycentric, 0); > > - if(T::InputCoverage != SWR_INPUT_COVERAGE_NONE) > - { > - const uint64_t* pCoverageMask = (T::InputCoverage == > SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : > - &work.coverageMask[0]; > - generateInputCoverage<T, T::InputCoverage>(pCoverageMask, > psContext.inputMask, pBlendState->sampleMask); > - } > + CalcCentroid<T, false>(&psContext, coeffs, work.coverageMask, > state.blendState.sampleMask); > > - if(T::bCentroidPos) > - { > - ///@ todo: don't need to genererate input coverage 2x if > input coverage and centroid > - AR_BEGIN(BEBarycentric, pDC->drawId); > - if(T::bIsStandardPattern) > - { > - CalcCentroidPos<T>(psContext, &work.coverageMask[0], > pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL); > - } > - else > - { > - psContext.vX.centroid = _simd_add_ps(psContext.vX.UL, > _simd_set1_ps(0.5f)); > - psContext.vY.centroid = _simd_add_ps(psContext.vY.UL, > _simd_set1_ps(0.5f)); > - } > - CalcCentroidBarycentrics(coeffs, psContext, psContext.vX.UL, > psContext.vY.UL); > - AR_END(BEBarycentric, 0); > - } > - else > - { > - psContext.vX.centroid = psContext.vX.sample; > - psContext.vY.centroid = psContext.vY.sample; > - } > + AR_END(BEBarycentric, 0); > > - for(uint32_t sample = 0; sample < T::MultisampleT::numSamples; > sample++) > + for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; > sample++) > { > simdmask coverageMask = work.coverageMask[sample] & MASK; > + > if (coverageMask) > { > + // offset depth/stencil buffers current sample > + uint8_t *pDepthSample = pDepthBuffer + > RasterTileDepthOffset(sample); > + uint8_t *pStencilSample = pStencilBuffer + > RasterTileStencilOffset(sample); > + > + if (state.depthHottileEnable && > state.depthBoundsState.depthBoundsTestEnable) > + { > + static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == > R32_FLOAT, "Unsupported depth hot tile format"); > + > + const simdscalar z = > _simd_load_ps(reinterpret_cast<const float *>(pDepthSample)); > + > + const float minz = > state.depthBoundsState.depthBoundsTestMinValue; > + const float maxz = > state.depthBoundsState.depthBoundsTestMaxValue; > + > + coverageMask &= CalcDepthBoundsAcceptMask(z, minz, > maxz); > + } > + > AR_BEGIN(BEBarycentric, pDC->drawId); > + > // calculate per sample positions > psContext.vX.sample = _simd_add_ps(psContext.vX.UL, > T::MultisampleT::vX(sample)); > psContext.vY.sample = _simd_add_ps(psContext.vY.UL, > T::MultisampleT::vY(sample)); > @@ -788,29 +723,13 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t > workerId, uint32_t x, uint32_ > // interpolate and quantize z > psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, > coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); > psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); > + > AR_END(BEBarycentric, 0); > > // interpolate user clip distance if available > - if (rastState.clipDistanceMask) > + if (state.rastState.clipDistanceMask) > { > - coverageMask &= > ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer, > - psContext.vI.sample, psContext.vJ.sample); > - } > - > - // offset depth/stencil buffers current sample > - uint8_t *pDepthSample = pDepthBase + > RasterTileDepthOffset(sample); > - uint8_t *pStencilSample = pStencilBase + > RasterTileStencilOffset(sample); > - > - if (state.depthHottileEnable && > state.depthBoundsState.depthBoundsTestEnable) > - { > - static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == > R32_FLOAT, "Unsupported depth hot tile format"); > - > - const simdscalar z = > _simd_load_ps(reinterpret_cast<const float *>(pDepthSample)); > - > - const float minz = > state.depthBoundsState.depthBoundsTestMinValue; > - const float maxz = > state.depthBoundsState.depthBoundsTestMaxValue; > - > - coverageMask &= CalcDepthBoundsAcceptMask(z, minz, > maxz); > + coverageMask &= > ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, > psContext.vI.sample, psContext.vJ.sample); > } > > simdscalar vCoverageMask = vMask(coverageMask); > @@ -826,7 +745,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t > workerId, uint32_t x, uint32_ > AR_END(BEEarlyDepthTest, 0); > > // early-exit if no samples passed depth or earlyZ is > forced on. > - if (pPSState->forceEarlyZ || > !_simd_movemask_ps(depthPassMask)) > + if (state.psState.forceEarlyZ || > !_simd_movemask_ps(depthPassMask)) > { > > DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], > &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, > pDepthSample, depthPassMask, vCoverageMask, > pStencilSample, stencilPassMask); > @@ -876,13 +795,13 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t > workerId, uint32_t x, uint32_ > // output merger > AR_BEGIN(BEOutputMerger, pDC->drawId); > #if USE_8x2_TILE_BACKEND > - OutputMerger(psContext, pColorBase, sample, pBlendState, > state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets, > useAlternateOffset); > + OutputMerger(psContext, pColorBuffer, sample, > &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, > state.psState.numRenderTargets, useAlternateOffset); > #else > - OutputMerger(psContext, pColorBase, sample, pBlendState, > state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets); > + OutputMerger(psContext, pColorBuffer, sample, > &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, > state.psState.numRenderTargets); > #endif > > // do final depth write after all pixel kills > - if (!pPSState->forceEarlyZ) > + if (!state.psState.forceEarlyZ) > { > > DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], > &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, > pDepthSample, depthPassMask, vCoverageMask, > pStencilSample, stencilPassMask); > @@ -894,31 +813,41 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t > workerId, uint32_t x, uint32_ > > Endtile: > ATTR_UNUSED; > + > AR_BEGIN(BEEndTile, pDC->drawId); > + > if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) > { > work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * > SIMD_TILE_X_DIM); > } > - pDepthBase += (KNOB_SIMD_WIDTH * > FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; > - pStencilBase += (KNOB_SIMD_WIDTH * > FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; > > #if USE_8x2_TILE_BACKEND > if (useAlternateOffset) > { > - for (uint32_t rt = 0; rt < NumRT; ++rt) > + for (uint32_t rt = 0; rt < state.psState.numRenderTargets; > ++rt) > { > - pColorBase[rt] += (2 * KNOB_SIMD_WIDTH * > FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; > + pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * > FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; > } > } > #else > - for (uint32_t rt = 0; rt < NumRT; ++rt) > + for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) > { > - pColorBase[rt] += (KNOB_SIMD_WIDTH * > FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; > + pColorBuffer[rt] += (KNOB_SIMD_WIDTH * > FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; > } > #endif > + pDepthBuffer += (KNOB_SIMD_WIDTH * > FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; > + pStencilBuffer += (KNOB_SIMD_WIDTH * > FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; > + > AR_END(BEEndTile, 0); > + > + psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx); > + psContext.vX.center = _simd_add_ps(psContext.vX.center, dx); > } > + > + psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy); > + psContext.vY.center = _simd_add_ps(psContext.vY.center, dy); > } > + > AR_END(BESampleRateBackend, 0); > } > > @@ -930,62 +859,33 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t > workerId, uint32_t x, uint32_t > AR_BEGIN(BEPixelRateBackend, pDC->drawId); > AR_BEGIN(BESetup, pDC->drawId); > > - const API_STATE& state = GetApiState(pDC); > - const SWR_RASTSTATE& rastState = state.rastState; > - const SWR_PS_STATE *pPSState = &state.psState; > - const SWR_BLEND_STATE *pBlendState = &state.blendState; > + const API_STATE &state = GetApiState(pDC); > > - // broadcast scalars > BarycentricCoeffs coeffs; > - coeffs.vIa = _simd_broadcast_ss(&work.I[0]); > - coeffs.vIb = _simd_broadcast_ss(&work.I[1]); > - coeffs.vIc = _simd_broadcast_ss(&work.I[2]); > + SetupBarycentricCoeffs(&coeffs, work); > > - coeffs.vJa = _simd_broadcast_ss(&work.J[0]); > - coeffs.vJb = _simd_broadcast_ss(&work.J[1]); > - coeffs.vJc = _simd_broadcast_ss(&work.J[2]); > + uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, > *pStencilBuffer; > + SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, > state.psState.numRenderTargets, renderBuffers); > > - coeffs.vZa = _simd_broadcast_ss(&work.Z[0]); > - coeffs.vZb = _simd_broadcast_ss(&work.Z[1]); > - coeffs.vZc = _simd_broadcast_ss(&work.Z[2]); > + SWR_PS_CONTEXT psContext; > + SetupPixelShaderContext<T>(&psContext, work); > > - coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet); > + AR_END(BESetup, 0); > > - coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]); > - coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]); > - coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]); > + PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, > pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask); > > - uint8_t *pColorBase[SWR_NUM_RENDERTARGETS]; > - uint32_t NumRT = state.psState.numRenderTargets; > - for(uint32_t rt = 0; rt < NumRT; ++rt) > - { > - pColorBase[rt] = renderBuffers.pColor[rt]; > - } > - uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = > renderBuffers.pStencil; > - AR_END(BESetup, 0); > + psContext.vY.UL = _simd_add_ps(vULOffsetsY, > _simd_set1_ps(static_cast<float>(y))); > + psContext.vY.center = _simd_add_ps(vCenterOffsetsY, > _simd_set1_ps(static_cast<float>(y))); > > - SWR_PS_CONTEXT psContext; > - psContext.pAttribs = work.pAttribs; > - psContext.pPerspAttribs = work.pPerspAttribs; > - psContext.frontFace = work.triFlags.frontFacing; > - psContext.primID = work.triFlags.primID; > - psContext.pRecipW = work.pRecipW; > - // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the > shader because of pull attribs > - psContext.I = work.I; > - psContext.J = work.J; > - psContext.recipDet = work.recipDet; > - psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX; > - psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY; > - psContext.rasterizerSampleCount = T::MultisampleT::numSamples; > - > - psContext.sampleIndex = 0; > - > - PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, > pDepthBase, pStencilBase, rastState.clipDistanceMask); > + const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM)); > > for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) > { > - psContext.vY.UL = _simd_add_ps(vULOffsetsY, > _simd_set1_ps((float)yy)); > - psContext.vY.center = _simd_add_ps(vCenterOffsetsY, > _simd_set1_ps((float)yy)); > + psContext.vX.UL = _simd_add_ps(vULOffsetsX, > _simd_set1_ps(static_cast<float>(x))); > + psContext.vX.center = _simd_add_ps(vCenterOffsetsX, > _simd_set1_ps(static_cast<float>(x))); > + > + const simdscalar dx = > _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM)); > + > for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) > { > #if USE_8x2_TILE_BACKEND > @@ -996,48 +896,25 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t > workerId, uint32_t x, uint32_t > if(!(work.anyCoveredSamples & MASK)) {goto Endtile;}; > activeLanes = vMask(work.anyCoveredSamples & MASK); > > - psContext.vX.UL = _simd_add_ps(vULOffsetsX, > _simd_set1_ps((float)xx)); > - // set pixel center positions > - psContext.vX.center = _simd_add_ps(vCenterOffsetsX, > _simd_set1_ps((float)xx)); > - > - AR_BEGIN(BEBarycentric, pDC->drawId); > - CalcPixelBarycentrics(coeffs, psContext); > - AR_END(BEBarycentric, 0); > - > if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) > { > - const uint64_t* pCoverageMask = (T::InputCoverage == > SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : > - &work.coverageMask[0]; > - generateInputCoverage<T, T::InputCoverage>(pCoverageMask, > psContext.inputMask, pBlendState->sampleMask); > + const uint64_t* pCoverageMask = (T::InputCoverage == > SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : > &work.coverageMask[0]; > + > + generateInputCoverage<T, T::InputCoverage>(pCoverageMask, > psContext.inputMask, state.blendState.sampleMask); > } > > - if(T::bCentroidPos) > - { > - ///@ todo: don't need to genererate input coverage 2x if > input coverage and centroid > - AR_BEGIN(BEBarycentric, pDC->drawId); > - if(T::bIsStandardPattern) > - { > - CalcCentroidPos<T>(psContext, &work.coverageMask[0], > pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL); > - } > - else > - { > - psContext.vX.centroid = _simd_add_ps(psContext.vX.UL, > _simd_set1_ps(0.5f)); > - psContext.vY.centroid = _simd_add_ps(psContext.vY.UL, > _simd_set1_ps(0.5f)); > - } > + AR_BEGIN(BEBarycentric, pDC->drawId); > > - CalcCentroidBarycentrics(coeffs, psContext, psContext.vX.UL, > psContext.vY.UL); > - AR_END(BEBarycentric, 0); > - } > - else > - { > - psContext.vX.centroid = _simd_add_ps(psContext.vX.UL, > _simd_set1_ps(0.5f)); > - psContext.vY.centroid = _simd_add_ps(psContext.vY.UL, > _simd_set1_ps(0.5f)); > - } > + CalcPixelBarycentrics(coeffs, psContext); > + > + CalcCentroid<T, false>(&psContext, coeffs, work.coverageMask, > state.blendState.sampleMask); > + > + AR_END(BEBarycentric, 0); > > if(T::bForcedSampleCount) > { > // candidate pixels (that passed coverage) will cause shader > invocation if any bits in the samplemask are set > - const simdscalar vSampleMask = > _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState->sampleMask), > _simd_setzero_si())); > + const simdscalar vSampleMask = > _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), > _simd_setzero_si())); > activeLanes = _simd_and_ps(activeLanes, vSampleMask); > } > > @@ -1051,7 +928,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t > workerId, uint32_t x, uint32_t > // if we have no covered samples that passed depth at this point, > go to next tile > if(!_simd_movemask_ps(activeLanes)) { goto Endtile; }; > > - if(pPSState->usesSourceDepth) > + if(state.psState.usesSourceDepth) > { > AR_BEGIN(BEBarycentric, pDC->drawId); > // interpolate and quantize z > @@ -1086,7 +963,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t > workerId, uint32_t x, uint32_t > > // output merger > // loop over all samples, broadcasting the results of the PS to > all passing pixels > - for(uint32_t sample = 0; sample < > GetNumOMSamples<T>(pBlendState->sampleCount); sample++) > + for(uint32_t sample = 0; sample < > GetNumOMSamples<T>(state.blendState.sampleCount); sample++) > { > AR_BEGIN(BEOutputMerger, pDC->drawId); > // center pattern does a single coverage/depth/stencil test, > standard pattern tests all samples > @@ -1110,15 +987,15 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t > workerId, uint32_t x, uint32_t > > // broadcast the results of the PS to all passing pixels > #if USE_8x2_TILE_BACKEND > - OutputMerger(psContext, pColorBase, sample, pBlendState, > state.pfnBlendFunc, coverageMask, depthMask, pPSState->numRenderTargets, > useAlternateOffset); > + OutputMerger(psContext, pColorBuffer, sample, > &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, > state.psState.numRenderTargets, useAlternateOffset); > #else > - OutputMerger(psContext, pColorBase, sample, pBlendState, > state.pfnBlendFunc, coverageMask, depthMask, pPSState->numRenderTargets); > + OutputMerger(psContext, pColorBuffer, sample, > &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, > state.psState.numRenderTargets); > #endif > > - if(!pPSState->forceEarlyZ && !T::bForcedSampleCount) > + if(!state.psState.forceEarlyZ && !T::bForcedSampleCount) > { > - uint8_t *pDepthSample = pDepthBase + > RasterTileDepthOffset(sample); > - uint8_t * pStencilSample = pStencilBase + > RasterTileStencilOffset(sample); > + uint8_t *pDepthSample = pDepthBuffer + > RasterTileDepthOffset(sample); > + uint8_t * pStencilSample = pStencilBuffer + > RasterTileStencilOffset(sample); > > DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], > &state.depthStencilState, work.triFlags.frontFacing, > PixelRateZTest.vZ[coverageSampleNum], > pDepthSample, depthMask, coverageMask, > pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]); > @@ -1127,6 +1004,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t > workerId, uint32_t x, uint32_t > } > Endtile: > AR_BEGIN(BEEndTile, pDC->drawId); > + > for(uint32_t sample = 0; sample < > T::MultisampleT::numCoverageSamples; sample++) > { > work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * > SIMD_TILE_X_DIM); > @@ -1137,26 +1015,34 @@ Endtile: > work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * > SIMD_TILE_X_DIM); > } > work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); > - pDepthBase += (KNOB_SIMD_WIDTH * > FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; > - pStencilBase += (KNOB_SIMD_WIDTH * > FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; > > #if USE_8x2_TILE_BACKEND > if (useAlternateOffset) > { > - for (uint32_t rt = 0; rt < NumRT; ++rt) > + for (uint32_t rt = 0; rt < state.psState.numRenderTargets; > ++rt) > { > - pColorBase[rt] += (2 * KNOB_SIMD_WIDTH * > FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; > + pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * > FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; > } > } > #else > - for(uint32_t rt = 0; rt < NumRT; ++rt) > + for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) > { > - pColorBase[rt] += (KNOB_SIMD_WIDTH * > FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; > + pColorBuffer[rt] += (KNOB_SIMD_WIDTH * > FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; > } > + pDepthBuffer += (KNOB_SIMD_WIDTH * > FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; > + pStencilBuffer += (KNOB_SIMD_WIDTH * > FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; > #endif > + > AR_END(BEEndTile, 0); > + > + psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx); > + psContext.vX.center = _simd_add_ps(psContext.vX.center, dx); > } > + > + psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy); > + psContext.vY.center = _simd_add_ps(psContext.vY.center, dy); > } > + > AR_END(BEPixelRateBackend, 0); > } > // optimized backend flow with NULL PS > @@ -1170,50 +1056,60 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t > workerId, uint32_t x, uint32_t y, > typedef SwrBackendTraits<sampleCountT, SWR_MSAA_STANDARD_PATTERN> T; > AR_BEGIN(BESetup, pDC->drawId); > > - const API_STATE& state = GetApiState(pDC); > - const SWR_RASTSTATE& rastState = pDC->pState->state.rastState; > + const API_STATE &state = GetApiState(pDC); > > - // broadcast scalars > BarycentricCoeffs coeffs; > - coeffs.vIa = _simd_broadcast_ss(&work.I[0]); > - coeffs.vIb = _simd_broadcast_ss(&work.I[1]); > - coeffs.vIc = _simd_broadcast_ss(&work.I[2]); > + SetupBarycentricCoeffs(&coeffs, work); > > - coeffs.vJa = _simd_broadcast_ss(&work.J[0]); > - coeffs.vJb = _simd_broadcast_ss(&work.J[1]); > - coeffs.vJc = _simd_broadcast_ss(&work.J[2]); > + uint8_t *pDepthBuffer, *pStencilBuffer; > + SetupRenderBuffers(NULL, &pDepthBuffer, &pStencilBuffer, 0, > renderBuffers); > > - coeffs.vZa = _simd_broadcast_ss(&work.Z[0]); > - coeffs.vZb = _simd_broadcast_ss(&work.Z[1]); > - coeffs.vZc = _simd_broadcast_ss(&work.Z[2]); > + SWR_PS_CONTEXT psContext; > + // skip SetupPixelShaderContext(&psContext, ...); // not needed here > > - coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet); > + AR_END(BESetup, 0); > > - uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = > renderBuffers.pStencil; > + simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, > _simd_set1_ps(static_cast<float>(y))); > > - AR_END(BESetup, 0); > + const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM)); > > - SWR_PS_CONTEXT psContext; > for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) > { > - // UL pixel corner > - simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, > _simd_set1_ps((float)yy)); > + simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, > _simd_set1_ps(static_cast<float>(x))); > + > + const simdscalar dx = > _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM)); > > for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) > { > - // UL pixel corners > - simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, > _simd_set1_ps((float)xx)); > - > // iterate over active samples > unsigned long sample = 0; > uint32_t sampleMask = state.blendState.sampleMask; > while (_BitScanForward(&sample, sampleMask)) > { > sampleMask &= ~(1 << sample); > + > simdmask coverageMask = work.coverageMask[sample] & MASK; > + > if (coverageMask) > { > + // offset depth/stencil buffers current sample > + uint8_t *pDepthSample = pDepthBuffer + > RasterTileDepthOffset(sample); > + uint8_t *pStencilSample = pStencilBuffer + > RasterTileStencilOffset(sample); > + > + if (state.depthHottileEnable && > state.depthBoundsState.depthBoundsTestEnable) > + { > + static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == > R32_FLOAT, "Unsupported depth hot tile format"); > + > + const simdscalar z = > _simd_load_ps(reinterpret_cast<const float *>(pDepthSample)); > + > + const float minz = > state.depthBoundsState.depthBoundsTestMinValue; > + const float maxz = > state.depthBoundsState.depthBoundsTestMaxValue; > + > + coverageMask &= CalcDepthBoundsAcceptMask(z, minz, > maxz); > + } > + > AR_BEGIN(BEBarycentric, pDC->drawId); > + > // calculate per sample positions > psContext.vX.sample = _simd_add_ps(vXSamplePosUL, > T::MultisampleT::vX(sample)); > psContext.vY.sample = _simd_add_ps(vYSamplePosUL, > T::MultisampleT::vY(sample)); > @@ -1227,26 +1123,9 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t > workerId, uint32_t x, uint32_t y, > AR_END(BEBarycentric, 0); > > // interpolate user clip distance if available > - if (rastState.clipDistanceMask) > + if (state.rastState.clipDistanceMask) > { > - coverageMask &= > ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer, > - psContext.vI.sample, psContext.vJ.sample); > - } > - > - // offset depth/stencil buffers current sample > - uint8_t *pDepthSample = pDepthBase + > RasterTileDepthOffset(sample); > - uint8_t *pStencilSample = pStencilBase + > RasterTileStencilOffset(sample); > - > - if (state.depthHottileEnable && > state.depthBoundsState.depthBoundsTestEnable) > - { > - static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == > R32_FLOAT, "Unsupported depth hot tile format"); > - > - const simdscalar z = > _simd_load_ps(reinterpret_cast<const float *>(pDepthSample)); > - > - const float minz = > state.depthBoundsState.depthBoundsTestMinValue; > - const float maxz = > state.depthBoundsState.depthBoundsTestMaxValue; > - > - coverageMask &= CalcDepthBoundsAcceptMask(z, minz, > maxz); > + coverageMask &= > ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, > psContext.vI.sample, psContext.vJ.sample); > } > > simdscalar vCoverageMask = vMask(coverageMask); > @@ -1268,10 +1147,16 @@ Endtile: > ATTR_UNUSED; > work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * > SIMD_TILE_X_DIM); > } > - pDepthBase += (KNOB_SIMD_WIDTH * > FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; > - pStencilBase += (KNOB_SIMD_WIDTH * > FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; > + > + pDepthBuffer += (KNOB_SIMD_WIDTH * > FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; > + pStencilBuffer += (KNOB_SIMD_WIDTH * > FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; > + > + vXSamplePosUL = _simd_add_ps(vXSamplePosUL, dx); > } > + > + vYSamplePosUL = _simd_add_ps(vYSamplePosUL, dy); > } > + > AR_END(BENullBackend, 0); > } > > diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h > b/src/gallium/drivers/swr/rasterizer/core/backend.h > index 53222ea..dc0be90 100644 > --- a/src/gallium/drivers/swr/rasterizer/core/backend.h > +++ b/src/gallium/drivers/swr/rasterizer/core/backend.h > @@ -438,13 +438,117 @@ INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT > blendSampleCount) > } > } > > +inline void SetupBarycentricCoeffs(BarycentricCoeffs *coeffs, const > SWR_TRIANGLE_DESC &work) > +{ > + // broadcast scalars > + > + coeffs->vIa = _simd_broadcast_ss(&work.I[0]); > + coeffs->vIb = _simd_broadcast_ss(&work.I[1]); > + coeffs->vIc = _simd_broadcast_ss(&work.I[2]); > + > + coeffs->vJa = _simd_broadcast_ss(&work.J[0]); > + coeffs->vJb = _simd_broadcast_ss(&work.J[1]); > + coeffs->vJc = _simd_broadcast_ss(&work.J[2]); > + > + coeffs->vZa = _simd_broadcast_ss(&work.Z[0]); > + coeffs->vZb = _simd_broadcast_ss(&work.Z[1]); > + coeffs->vZc = _simd_broadcast_ss(&work.Z[2]); > + > + coeffs->vRecipDet = _simd_broadcast_ss(&work.recipDet); > + > + coeffs->vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]); > + coeffs->vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]); > + coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]); > +} > + > +inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], > uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorBufferCount, > RenderOutputBuffers &renderBuffers) > +{ > + SWR_ASSERT(colorBufferCount <= SWR_NUM_RENDERTARGETS); > + > + if (pColorBuffer) > + { > + for (uint32_t index = 0; index < colorBufferCount; index += 1) > + { > + pColorBuffer[index] = renderBuffers.pColor[index]; > + } > + } > + > + if (pDepthBuffer) > + { > + *pDepthBuffer = renderBuffers.pDepth; > + } > + > + if (pStencilBuffer) > + { > + *pStencilBuffer = renderBuffers.pStencil;; > + } > +} > + > +template<typename T> > +void SetupPixelShaderContext(SWR_PS_CONTEXT *psContext, const > SWR_TRIANGLE_DESC &work) > +{ > + psContext->pAttribs = work.pAttribs; > + psContext->pPerspAttribs = work.pPerspAttribs; > + psContext->frontFace = work.triFlags.frontFacing; > + psContext->primID = work.triFlags.primID; > + > + // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the > shader because of pull attribs > + psContext->I = work.I; > + psContext->J = work.J; > + > + psContext->recipDet = work.recipDet; > + psContext->pRecipW = work.pRecipW; > + psContext->pSamplePosX = reinterpret_cast<const float > *>(&T::MultisampleT::samplePosX); > + psContext->pSamplePosY = reinterpret_cast<const float > *>(&T::MultisampleT::samplePosY); > + psContext->rasterizerSampleCount = T::MultisampleT::numSamples; > + psContext->sampleIndex = 0; > +} > + > +template<typename T, bool IsSingleSample> > +void CalcCentroid(SWR_PS_CONTEXT *psContext, const BarycentricCoeffs > &coeffs, const uint64_t * const coverageMask, uint32_t sampleMask) > +{ > + if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't > cut it, the centroid positions are still different > + { > + // for 1x case, centroid is pixel center > + psContext->vX.centroid = psContext->vX.center; > + psContext->vY.centroid = psContext->vY.center; > + psContext->vI.centroid = psContext->vI.center; > + psContext->vJ.centroid = psContext->vJ.center; > + psContext->vOneOverW.centroid = psContext->vOneOverW.center; > + } > + else > + { > + if (T::bCentroidPos) > + { > + ///@ todo: don't need to genererate input coverage 2x if input > coverage and centroid > + if (T::bIsStandardPattern) > + { > + // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to > eliminate 'generate coverage 2X'.. > + CalcCentroidPos<T>(*psContext, coverageMask, sampleMask, > psContext->vX.UL, psContext->vY.UL); > + } > + else > + { > + psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, > _simd_set1_ps(0.5f)); > + psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, > _simd_set1_ps(0.5f)); > + } > + > + CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, > psContext->vY.UL); > + } > + else > + { > + psContext->vX.centroid = psContext->vX.sample; > + psContext->vY.centroid = psContext->vY.sample; > + } > + } > +} > + > template<typename T> > struct PixelRateZTestLoop > { > PixelRateZTestLoop(DRAW_CONTEXT *DC, uint32_t _workerId, const > SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& > apiState, > - uint8_t*& depthBase, uint8_t*& stencilBase, const > uint8_t ClipDistanceMask) : > + uint8_t*& depthBuffer, uint8_t*& stencilBuffer, const > uint8_t ClipDistanceMask) : > pDC(DC), workerId(_workerId), work(Work), > coeffs(Coeffs), state(apiState), psState(apiState.psState), > - clipDistanceMask(ClipDistanceMask), > pDepthBase(depthBase), pStencilBase(stencilBase) {}; > + clipDistanceMask(ClipDistanceMask), > pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer) {}; > > INLINE > uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext, > @@ -465,7 +569,24 @@ struct PixelRateZTestLoop > continue; > } > > + // offset depth/stencil buffers current sample > + uint8_t *pDepthSample = pDepthBuffer + > RasterTileDepthOffset(sample); > + uint8_t * pStencilSample = pStencilBuffer + > RasterTileStencilOffset(sample); > + > + if (state.depthHottileEnable && > state.depthBoundsState.depthBoundsTestEnable) > + { > + static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, > "Unsupported depth hot tile format"); > + > + const simdscalar z = _simd_load_ps(reinterpret_cast<const > float *>(pDepthSample)); > + > + const float minz = > state.depthBoundsState.depthBoundsTestMinValue; > + const float maxz = > state.depthBoundsState.depthBoundsTestMaxValue; > + > + vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], > vMask(CalcDepthBoundsAcceptMask(z, minz, maxz))); > + } > + > AR_BEGIN(BEBarycentric, pDC->drawId); > + > // calculate per sample positions > psContext.vX.sample = _simd_add_ps(psContext.vX.UL, > T::MultisampleT::vX(sample)); > psContext.vY.sample = _simd_add_ps(psContext.vY.UL, > T::MultisampleT::vY(sample)); > @@ -483,31 +604,16 @@ struct PixelRateZTestLoop > vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, > psContext.vI.sample, psContext.vJ.sample); > vZ[sample] = state.pfnQuantizeDepth(vZ[sample]); > } > + > AR_END(BEBarycentric, 0); > > ///@todo: perspective correct vs non-perspective correct clipping? > // if clip distances are enabled, we need to interpolate for each > sample > if(clipDistanceMask) > { > - uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, > work.pUserClipBuffer, > - psContext.vI.sample, > psContext.vJ.sample); > - vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], > vMask(~clipMask)); > - } > - > - // offset depth/stencil buffers current sample > - uint8_t *pDepthSample = pDepthBase + > RasterTileDepthOffset(sample); > - uint8_t * pStencilSample = pStencilBase + > RasterTileStencilOffset(sample); > + uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, > work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample); > > - if (state.depthHottileEnable && > state.depthBoundsState.depthBoundsTestEnable) > - { > - static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, > "Unsupported depth hot tile format"); > - > - const simdscalar z = _simd_load_ps(reinterpret_cast<const > float *>(pDepthSample)); > - > - const float minz = > state.depthBoundsState.depthBoundsTestMinValue; > - const float maxz = > state.depthBoundsState.depthBoundsTestMaxValue; > - > - vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], > vMask(CalcDepthBoundsAcceptMask(z, minz, maxz))); > + vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], > vMask(~clipMask)); > } > > // ZTest for this sample > @@ -557,8 +663,8 @@ private: > const API_STATE& state; > const SWR_PS_STATE& psState; > const uint8_t clipDistanceMask; > - uint8_t*& pDepthBase; > - uint8_t*& pStencilBase; > + uint8_t*& pDepthBuffer; > + uint8_t*& pStencilBuffer; > }; > > INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, > SWR_PS_CONTEXT &psContext) > -- > 2.7.4 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev