() On Tue, Jul 24, 2018 at 9:00 AM Jonathan Marek <jonat...@marek.ca> wrote: > > this patch brings a number of changes to ir2: > -ir2 now generates CF clauses as necessary during assembly. this simplifies > fd2_program/fd2_compiler and is necessary to implement optimization passes > -ir2 now has separate vector/scalar instructions. this will make it easier > to implement scheduling of scalar+vector instructions together. dst_reg > is also now separate from src registers instead of a single list > -ir2 now implements register allocation. this makes it possible to compile > shaders which have more than 64 TGSI registers > -ir2 now implements the following optimizations: removal of IN/OUT MOV > instructions generated by TGSI and removal of unused instructions when > some exports are disabled > -ir2 now allows full 8-bit index for constants > -ir2_alloc no longer allocates 4 times too many bytes >
So, this might be easier to review if it were split up a bit better into multiple patches. That said, I think I'll merge it as is, mostly because you folks are the main ones using and working on a2xx currently, and it isn't something that would break a3xx+. However, a few recommendations for the future: 1) You probably want to start running piglit and/or deqp_gles2. (Piglit has better desktop gl coverage, deqp has better gles coverage. Not sure whether you care more about gl or gles.) Due to the feature level of a2xx (and because, iirc, I didn't start running piglit much until a3xx), I guess there will be a lot of skips and fails, but the main thing you want to watch for is tests that transition pass->fail. piglit-summary.py can compare before/after piglit runs. (Not really sure how to do that best with deqp, but you can use piglit to run deqp tests.) 2) shader-db is good for measuring the effect of compiler changes across a bunch of shaders. Probably worth wiring up shader-db traces for a2xx, see dump_shader_info() in ir3_shader.c for example. I suppose you could use the same format for the traces and re-use fd-report.py in the shader-db tree to parse before/after results. (That script could probably use some improvements, like splitting VS/FS results. I guess I'll do that next time I work up the courage to hack on python.) 3) Seems like eventually you'll want to stop re-inventing register_allocate.[ch]. Perhaps it is overkill for a2xx; I guess there wasn't anything complicated like multiple register banks or conflicting register classes. So maybe this is fine for now. 
BR, -R > Signed-off-by: Jonathan Marek <jonat...@marek.ca> > --- > .../drivers/freedreno/a2xx/fd2_compiler.c | 210 ++--- > .../drivers/freedreno/a2xx/fd2_program.c | 75 +- > .../drivers/freedreno/a2xx/instr-a2xx.h | 28 +- > src/gallium/drivers/freedreno/a2xx/ir-a2xx.c | 734 +++++++++++------- > src/gallium/drivers/freedreno/a2xx/ir-a2xx.h | 113 +-- > 5 files changed, 615 insertions(+), 545 deletions(-) > > diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c > b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c > index 3ad47f9850..12f9a1ce0a 100644 > --- a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c > +++ b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c > @@ -93,9 +93,6 @@ struct fd2_compile_context { > unsigned position, psize; > > uint64_t need_sync; > - > - /* current exec CF instruction */ > - struct ir2_cf *cf; > }; > > static int > @@ -130,7 +127,6 @@ compile_init(struct fd2_compile_context *ctx, struct > fd_program_stateobj *prog, > > ctx->prog = prog; > ctx->so = so; > - ctx->cf = NULL; > ctx->pred_depth = 0; > > ret = tgsi_parse_init(&ctx->parser, so->tokens); > @@ -236,15 +232,6 @@ compile_free(struct fd2_compile_context *ctx) > tgsi_parse_free(&ctx->parser); > } > > -static struct ir2_cf * > -next_exec_cf(struct fd2_compile_context *ctx) > -{ > - struct ir2_cf *cf = ctx->cf; > - if (!cf || cf->exec.instrs_count >= ARRAY_SIZE(ctx->cf->exec.instrs)) > - ctx->cf = cf = ir2_cf_create(ctx->so->ir, EXEC); > - return cf; > -} > - > static void > compile_vtx_fetch(struct fd2_compile_context *ctx) > { > @@ -252,13 +239,13 @@ compile_vtx_fetch(struct fd2_compile_context *ctx) > int i; > for (i = 0; i < ctx->num_regs[TGSI_FILE_INPUT]; i++) { > struct ir2_instruction *instr = ir2_instr_create( > - next_exec_cf(ctx), IR2_FETCH); > + ctx->so->ir, IR2_FETCH); > instr->fetch.opc = VTX_FETCH; > > ctx->need_sync |= 1 << (i+1); > > - ir2_reg_create(instr, i+1, "xyzw", 0); > - ir2_reg_create(instr, 0, "x", 0); > + ir2_dst_create(instr, i+1, "xyzw", 0); > 
+ ir2_reg_create(instr, 0, "x", IR2_REG_INPUT); > > if (i == 0) > instr->sync = true; > @@ -266,7 +253,6 @@ compile_vtx_fetch(struct fd2_compile_context *ctx) > vfetch_instrs[i] = instr; > } > ctx->so->num_vfetch_instrs = i; > - ctx->cf = NULL; > } > > /* > @@ -312,7 +298,7 @@ get_temp_gpr(struct fd2_compile_context *ctx, int idx) > return num; > } > > -static struct ir2_register * > +static struct ir2_dst_register * > add_dst_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu, > const struct tgsi_dst_register *dst) > { > @@ -351,10 +337,10 @@ add_dst_reg(struct fd2_compile_context *ctx, struct > ir2_instruction *alu, > swiz[3] = (dst->WriteMask & TGSI_WRITEMASK_W) ? 'w' : '_'; > swiz[4] = '\0'; > > - return ir2_reg_create(alu, num, swiz, flags); > + return ir2_dst_create(alu, num, swiz, flags); > } > > -static struct ir2_register * > +static struct ir2_src_register * > add_src_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu, > const struct tgsi_src_register *src) > { > @@ -373,6 +359,7 @@ add_src_reg(struct fd2_compile_context *ctx, struct > ir2_instruction *alu, > if (ctx->type == PIPE_SHADER_VERTEX) { > num = src->Index + 1; > } else { > + flags |= IR2_REG_INPUT; > num = export_linkage(ctx, > ctx->input_export_idx[src->Index]); > } > @@ -415,7 +402,7 @@ static void > add_vector_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction > *alu) > { > if (inst->Instruction.Saturate) { > - alu->alu.vector_clamp = true; > + alu->alu_vector.clamp = true; > } > } > > @@ -423,7 +410,7 @@ static void > add_scalar_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction > *alu) > { > if (inst->Instruction.Saturate) { > - alu->alu.scalar_clamp = true; > + alu->alu_scalar.clamp = true; > } > } > > @@ -461,27 +448,12 @@ add_regs_vector_3(struct fd2_compile_context *ctx, > assert(inst->Instruction.NumDstRegs == 1); > > add_dst_reg(ctx, alu, &inst->Dst[0].Register); > - /* maybe should re-arrange the syntax some day, but > - * in 
assembler/disassembler and what ir.c expects > - * is: MULADDv Rdst = Rsrc2 + Rsrc0 * Rscr1 > - */ > - add_src_reg(ctx, alu, &inst->Src[2].Register); > add_src_reg(ctx, alu, &inst->Src[0].Register); > add_src_reg(ctx, alu, &inst->Src[1].Register); > + add_src_reg(ctx, alu, &inst->Src[2].Register); > add_vector_clamp(inst, alu); > } > > -static void > -add_regs_dummy_vector(struct ir2_instruction *alu) > -{ > - /* create dummy, non-written vector dst/src regs > - * for unused vector instr slot: > - */ > - ir2_reg_create(alu, 0, "____", 0); /* vector dst */ > - ir2_reg_create(alu, 0, NULL, 0); /* vector src1 */ > - ir2_reg_create(alu, 0, NULL, 0); /* vector src2 */ > -} > - > static void > add_regs_scalar_1(struct fd2_compile_context *ctx, > struct tgsi_full_instruction *inst, struct ir2_instruction > *alu) > @@ -489,8 +461,6 @@ add_regs_scalar_1(struct fd2_compile_context *ctx, > assert(inst->Instruction.NumSrcRegs == 1); > assert(inst->Instruction.NumDstRegs == 1); > > - add_regs_dummy_vector(alu); > - > add_dst_reg(ctx, alu, &inst->Dst[0].Register); > add_src_reg(ctx, alu, &inst->Src[0].Register); > add_scalar_clamp(inst, alu); > @@ -567,19 +537,13 @@ push_predicate(struct fd2_compile_context *ctx, struct > tgsi_src_register *src) > struct ir2_instruction *alu; > struct tgsi_dst_register pred_dst; > > - /* NOTE blob compiler seems to always puts PRED_* instrs in a CF by > - * themselves: > - */ > - ctx->cf = NULL; > - > if (ctx->pred_depth == 0) { > /* assign predicate register: */ > ctx->pred_reg = ctx->num_regs[TGSI_FILE_TEMPORARY]; > > get_predicate(ctx, &pred_dst, NULL); > > - alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, > PRED_SETNEs); > - add_regs_dummy_vector(alu); > + alu = ir2_instr_create_alu_s(ctx->so->ir, PRED_SETNEs); > add_dst_reg(ctx, alu, &pred_dst); > add_src_reg(ctx, alu, src); > } else { > @@ -587,7 +551,7 @@ push_predicate(struct fd2_compile_context *ctx, struct > tgsi_src_register *src) > > get_predicate(ctx, &pred_dst, &pred_src); > > - 
alu = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0); > + alu = ir2_instr_create_alu_v(ctx->so->ir, MULv); > add_dst_reg(ctx, alu, &pred_dst); > add_src_reg(ctx, alu, &pred_src); > add_src_reg(ctx, alu, src); > @@ -600,18 +564,11 @@ push_predicate(struct fd2_compile_context *ctx, struct > tgsi_src_register *src) > > /* save previous pred state to restore in pop_predicate(): */ > ctx->pred_stack[ctx->pred_depth++] = ctx->so->ir->pred; > - > - ctx->cf = NULL; > } > > static void > pop_predicate(struct fd2_compile_context *ctx) > { > - /* NOTE blob compiler seems to always puts PRED_* instrs in a CF by > - * themselves: > - */ > - ctx->cf = NULL; > - > /* restore previous predicate state: */ > ctx->so->ir->pred = ctx->pred_stack[--ctx->pred_depth]; > > @@ -622,8 +579,7 @@ pop_predicate(struct fd2_compile_context *ctx) > > get_predicate(ctx, &pred_dst, &pred_src); > > - alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, > PRED_SET_POPs); > - add_regs_dummy_vector(alu); > + alu = ir2_instr_create_alu_s(ctx->so->ir, PRED_SET_POPs); > add_dst_reg(ctx, alu, &pred_dst); > add_src_reg(ctx, alu, &pred_src); > alu->pred = IR2_PRED_NONE; > @@ -631,8 +587,6 @@ pop_predicate(struct fd2_compile_context *ctx) > /* predicate register no longer needed: */ > ctx->pred_reg = -1; > } > - > - ctx->cf = NULL; > } > > static void > @@ -693,12 +647,11 @@ translate_pow(struct fd2_compile_context *ctx, > > get_internal_temp(ctx, &tmp_dst, &tmp_src); > > - alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, LOG_CLAMP); > - add_regs_dummy_vector(alu); > + alu = ir2_instr_create_alu_s(ctx->so->ir, LOG_CLAMP); > add_dst_reg(ctx, alu, &tmp_dst); > add_src_reg(ctx, alu, &inst->Src[0].Register); > > - alu = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0); > + alu = ir2_instr_create_alu_v(ctx->so->ir, MULv); > add_dst_reg(ctx, alu, &tmp_dst); > add_src_reg(ctx, alu, &tmp_src); > add_src_reg(ctx, alu, &inst->Src[1].Register); > @@ -725,8 +678,7 @@ translate_pow(struct fd2_compile_context *ctx, > 
break; > } > > - alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, EXP_IEEE); > - add_regs_dummy_vector(alu); > + alu = ir2_instr_create_alu_s(ctx->so->ir, EXP_IEEE); > add_dst_reg(ctx, alu, &inst->Dst[0].Register); > add_src_reg(ctx, alu, &tmp_src); > add_scalar_clamp(inst, alu); > @@ -737,7 +689,7 @@ translate_tex(struct fd2_compile_context *ctx, > struct tgsi_full_instruction *inst, unsigned opc) > { > struct ir2_instruction *instr; > - struct ir2_register *reg; > + struct ir2_src_register *reg; > struct tgsi_dst_register tmp_dst; > struct tgsi_src_register tmp_src; > const struct tgsi_src_register *coord; > @@ -766,19 +718,18 @@ translate_tex(struct fd2_compile_context *ctx, > * > * dst = texture_sample(unit, coord, bias) > */ > - instr = ir2_instr_create_alu(next_exec_cf(ctx), MAXv, > RECIP_IEEE); > > - /* MAXv: */ > + instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv); > add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "___w"; > add_src_reg(ctx, instr, &inst->Src[0].Register); > add_src_reg(ctx, instr, &inst->Src[0].Register); > > - /* RECIP_IEEE: */ > + instr = ir2_instr_create_alu_s(ctx->so->ir, RECIP_IEEE); > add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "x___"; > add_src_reg(ctx, instr, &inst->Src[0].Register)->swizzle = > swiz[inst->Src[0].Register.SwizzleW]; > > - instr = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0); > + instr = ir2_instr_create_alu_v(ctx->so->ir, MULv); > add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "xyz_"; > add_src_reg(ctx, instr, &tmp_src)->swizzle = "xxxx"; > add_src_reg(ctx, instr, &inst->Src[0].Register); > @@ -788,7 +739,7 @@ translate_tex(struct fd2_compile_context *ctx, > coord = &inst->Src[0].Register; > } > > - instr = ir2_instr_create(next_exec_cf(ctx), IR2_FETCH); > + instr = ir2_instr_create(ctx->so->ir, IR2_FETCH); > instr->fetch.opc = TEX_FETCH; > instr->fetch.is_cube = (inst->Texture.Texture == TGSI_TEXTURE_3D); > instr->fetch.is_rect = (inst->Texture.Texture == TGSI_TEXTURE_RECT); > @@ -807,7 +758,7 @@ 
translate_tex(struct fd2_compile_context *ctx, > reg->swizzle[2] = reg->swizzle[0]; > > /* dst register needs to be marked for sync: */ > - ctx->need_sync |= 1 << instr->regs[0]->num; > + ctx->need_sync |= 1 << instr->dst_reg.num; > > /* TODO we need some way to know if the tex fetch needs to sync on > alu pipe.. */ > instr->sync = true; > @@ -818,7 +769,7 @@ translate_tex(struct fd2_compile_context *ctx, > * the texture to a temp and the use ALU instruction to move > * to output > */ > - instr = ir2_instr_create_alu(next_exec_cf(ctx), MAXv, ~0); > + instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv); > > add_dst_reg(ctx, instr, &inst->Dst[0].Register); > add_src_reg(ctx, instr, &tmp_src); > @@ -869,22 +820,18 @@ translate_sge_slt_seq_sne(struct fd2_compile_context > *ctx, > > get_internal_temp(ctx, &tmp_dst, &tmp_src); > > - instr = ir2_instr_create_alu(next_exec_cf(ctx), ADDv, ~0); > + instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv); > add_dst_reg(ctx, instr, &tmp_dst); > add_src_reg(ctx, instr, &inst->Src[0].Register)->flags |= > IR2_REG_NEGATE; > add_src_reg(ctx, instr, &inst->Src[1].Register); > > - instr = ir2_instr_create_alu(next_exec_cf(ctx), vopc, ~0); > + instr = ir2_instr_create_alu_v(ctx->so->ir, vopc); > add_dst_reg(ctx, instr, &inst->Dst[0].Register); > - /* maybe should re-arrange the syntax some day, but > - * in assembler/disassembler and what ir.c expects > - * is: MULADDv Rdst = Rsrc2 + Rsrc0 * Rscr1 > - */ > - get_immediate(ctx, &tmp_const, fui(c0)); > - add_src_reg(ctx, instr, &tmp_const); > add_src_reg(ctx, instr, &tmp_src); > get_immediate(ctx, &tmp_const, fui(c1)); > add_src_reg(ctx, instr, &tmp_const); > + get_immediate(ctx, &tmp_const, fui(c0)); > + add_src_reg(ctx, instr, &tmp_const); > } > > /* LRP(a,b,c) = (a * b) + ((1 - a) * c) */ > @@ -904,25 +851,25 @@ translate_lrp(struct fd2_compile_context *ctx, > get_immediate(ctx, &tmp_const, fui(1.0)); > > /* tmp1 = (a * b) */ > - instr = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0); > 
+ instr = ir2_instr_create_alu_v(ctx->so->ir, MULv); > add_dst_reg(ctx, instr, &tmp_dst1); > add_src_reg(ctx, instr, &inst->Src[0].Register); > add_src_reg(ctx, instr, &inst->Src[1].Register); > > /* tmp2 = (1 - a) */ > - instr = ir2_instr_create_alu(next_exec_cf(ctx), ADDv, ~0); > + instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv); > add_dst_reg(ctx, instr, &tmp_dst2); > add_src_reg(ctx, instr, &tmp_const); > add_src_reg(ctx, instr, &inst->Src[0].Register)->flags |= > IR2_REG_NEGATE; > > /* tmp2 = tmp2 * c */ > - instr = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0); > + instr = ir2_instr_create_alu_v(ctx->so->ir, MULv); > add_dst_reg(ctx, instr, &tmp_dst2); > add_src_reg(ctx, instr, &tmp_src2); > add_src_reg(ctx, instr, &inst->Src[2].Register); > > /* dst = tmp1 + tmp2 */ > - instr = ir2_instr_create_alu(next_exec_cf(ctx), ADDv, ~0); > + instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv); > add_dst_reg(ctx, instr, &inst->Dst[0].Register); > add_src_reg(ctx, instr, &tmp_src1); > add_src_reg(ctx, instr, &tmp_src2); > @@ -956,33 +903,28 @@ translate_trig(struct fd2_compile_context *ctx, > tmp_src.SwizzleX = tmp_src.SwizzleY = > tmp_src.SwizzleZ = tmp_src.SwizzleW = TGSI_SWIZZLE_X; > > - /* maybe should re-arrange the syntax some day, but > - * in assembler/disassembler and what ir.c expects > - * is: MULADDv Rdst = Rsrc2 + Rsrc0 * Rscr1 > - */ > - instr = ir2_instr_create_alu(next_exec_cf(ctx), MULADDv, ~0); > + instr = ir2_instr_create_alu_v(ctx->so->ir, MULADDv); > add_dst_reg(ctx, instr, &tmp_dst); > - get_immediate(ctx, &tmp_const, fui(0.5)); > - add_src_reg(ctx, instr, &tmp_const); > add_src_reg(ctx, instr, &inst->Src[0].Register); > get_immediate(ctx, &tmp_const, fui(0.159155)); > add_src_reg(ctx, instr, &tmp_const); > + get_immediate(ctx, &tmp_const, fui(0.5)); > + add_src_reg(ctx, instr, &tmp_const); > > - instr = ir2_instr_create_alu(next_exec_cf(ctx), FRACv, ~0); > + instr = ir2_instr_create_alu_v(ctx->so->ir, FRACv); > add_dst_reg(ctx, instr, 
&tmp_dst); > add_src_reg(ctx, instr, &tmp_src); > add_src_reg(ctx, instr, &tmp_src); > > - instr = ir2_instr_create_alu(next_exec_cf(ctx), MULADDv, ~0); > + instr = ir2_instr_create_alu_v(ctx->so->ir, MULADDv); > add_dst_reg(ctx, instr, &tmp_dst); > - get_immediate(ctx, &tmp_const, fui(-3.141593)); > - add_src_reg(ctx, instr, &tmp_const); > add_src_reg(ctx, instr, &tmp_src); > get_immediate(ctx, &tmp_const, fui(6.283185)); > add_src_reg(ctx, instr, &tmp_const); > + get_immediate(ctx, &tmp_const, fui(-3.141593)); > + add_src_reg(ctx, instr, &tmp_const); > > - instr = ir2_instr_create_alu(next_exec_cf(ctx), ~0, op); > - add_regs_dummy_vector(instr); > + instr = ir2_instr_create_alu_s(ctx->so->ir, op); > add_dst_reg(ctx, instr, &inst->Dst[0].Register); > add_src_reg(ctx, instr, &tmp_src); > } > @@ -996,12 +938,12 @@ translate_dp2(struct fd2_compile_context *ctx, > struct ir2_instruction *instr; > /* DP2ADD c,a,b -> dot2(a,b) + c */ > /* for c we use the constant 0.0 */ > - instr = ir2_instr_create_alu(next_exec_cf(ctx), DOT2ADDv, ~0); > - get_immediate(ctx, &tmp_const, fui(0.0f)); > + instr = ir2_instr_create_alu_v(ctx->so->ir, DOT2ADDv); > add_dst_reg(ctx, instr, &inst->Dst[0].Register); > - add_src_reg(ctx, instr, &tmp_const); > add_src_reg(ctx, instr, &inst->Src[0].Register); > add_src_reg(ctx, instr, &inst->Src[1].Register); > + get_immediate(ctx, &tmp_const, fui(0.0f)); > + add_src_reg(ctx, instr, &tmp_const); > add_vector_clamp(inst, instr); > } > > @@ -1015,80 +957,53 @@ translate_instruction(struct fd2_compile_context *ctx, > { > unsigned opc = inst->Instruction.Opcode; > struct ir2_instruction *instr; > - static struct ir2_cf *cf; > > if (opc == TGSI_OPCODE_END) > return; > > - if (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) { > - unsigned num = inst->Dst[0].Register.Index; > - /* seems like we need to ensure that position vs param/pixel > - * exports don't end up in the same EXEC clause.. 
easy way > - * to do this is force a new EXEC clause on first appearance > - * of an position or param/pixel export. > - */ > - if ((num == ctx->position) || (num == ctx->psize)) { > - if (ctx->num_position > 0) { > - ctx->cf = NULL; > - ir2_cf_create_alloc(ctx->so->ir, SQ_POSITION, > - ctx->num_position - 1); > - ctx->num_position = 0; > - } > - } else { > - if (ctx->num_param > 0) { > - ctx->cf = NULL; > - ir2_cf_create_alloc(ctx->so->ir, > SQ_PARAMETER_PIXEL, > - ctx->num_param - 1); > - ctx->num_param = 0; > - } > - } > - } > - > - cf = next_exec_cf(ctx); > - > /* TODO turn this into a table: */ > switch (opc) { > case TGSI_OPCODE_MOV: > - instr = ir2_instr_create_alu(cf, MAXv, ~0); > + instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv); > add_regs_vector_1(ctx, inst, instr); > break; > case TGSI_OPCODE_RCP: > - instr = ir2_instr_create_alu(cf, ~0, RECIP_IEEE); > + instr = ir2_instr_create_alu_s(ctx->so->ir, RECIP_IEEE); > add_regs_scalar_1(ctx, inst, instr); > break; > case TGSI_OPCODE_RSQ: > - instr = ir2_instr_create_alu(cf, ~0, RECIPSQ_IEEE); > + instr = ir2_instr_create_alu_s(ctx->so->ir, RECIPSQ_IEEE); > add_regs_scalar_1(ctx, inst, instr); > break; > case TGSI_OPCODE_SQRT: > - instr = ir2_instr_create_alu(cf, ~0, SQRT_IEEE); > + instr = ir2_instr_create_alu_s(ctx->so->ir, SQRT_IEEE); > add_regs_scalar_1(ctx, inst, instr); > break; > case TGSI_OPCODE_MUL: > - instr = ir2_instr_create_alu(cf, MULv, ~0); > + instr = ir2_instr_create_alu_v(ctx->so->ir, MULv); > add_regs_vector_2(ctx, inst, instr); > break; > case TGSI_OPCODE_ADD: > - instr = ir2_instr_create_alu(cf, ADDv, ~0); > + instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv); > add_regs_vector_2(ctx, inst, instr); > break; > case TGSI_OPCODE_DP2: > translate_dp2(ctx, inst, opc); > break; > case TGSI_OPCODE_DP3: > - instr = ir2_instr_create_alu(cf, DOT3v, ~0); > + instr = ir2_instr_create_alu_v(ctx->so->ir, DOT3v); > add_regs_vector_2(ctx, inst, instr); > break; > case TGSI_OPCODE_DP4: > - instr = 
ir2_instr_create_alu(cf, DOT4v, ~0); > + instr = ir2_instr_create_alu_v(ctx->so->ir, DOT4v); > add_regs_vector_2(ctx, inst, instr); > break; > case TGSI_OPCODE_MIN: > - instr = ir2_instr_create_alu(cf, MINv, ~0); > + instr = ir2_instr_create_alu_v(ctx->so->ir, MINv); > add_regs_vector_2(ctx, inst, instr); > break; > case TGSI_OPCODE_MAX: > - instr = ir2_instr_create_alu(cf, MAXv, ~0); > + instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv); > add_regs_vector_2(ctx, inst, instr); > break; > case TGSI_OPCODE_SLT: > @@ -1098,22 +1013,22 @@ translate_instruction(struct fd2_compile_context *ctx, > translate_sge_slt_seq_sne(ctx, inst, opc); > break; > case TGSI_OPCODE_MAD: > - instr = ir2_instr_create_alu(cf, MULADDv, ~0); > + instr = ir2_instr_create_alu_v(ctx->so->ir, MULADDv); > add_regs_vector_3(ctx, inst, instr); > break; > case TGSI_OPCODE_LRP: > translate_lrp(ctx, inst, opc); > break; > case TGSI_OPCODE_FRC: > - instr = ir2_instr_create_alu(cf, FRACv, ~0); > + instr = ir2_instr_create_alu_v(ctx->so->ir, FRACv); > add_regs_vector_1(ctx, inst, instr); > break; > case TGSI_OPCODE_FLR: > - instr = ir2_instr_create_alu(cf, FLOORv, ~0); > + instr = ir2_instr_create_alu_v(ctx->so->ir, FLOORv); > add_regs_vector_1(ctx, inst, instr); > break; > case TGSI_OPCODE_EX2: > - instr = ir2_instr_create_alu(cf, ~0, EXP_IEEE); > + instr = ir2_instr_create_alu_s(ctx->so->ir, EXP_IEEE); > add_regs_scalar_1(ctx, inst, instr); > break; > case TGSI_OPCODE_POW: > @@ -1128,10 +1043,9 @@ translate_instruction(struct fd2_compile_context *ctx, > translate_tex(ctx, inst, opc); > break; > case TGSI_OPCODE_CMP: > - instr = ir2_instr_create_alu(cf, CNDGTEv, ~0); > + instr = ir2_instr_create_alu_v(ctx->so->ir, CNDGTEv); > add_regs_vector_3(ctx, inst, instr); > - // TODO this should be src0 if regs where in sane order.. 
> - instr->regs[2]->flags ^= IR2_REG_NEGATE; /* src1 */ > + instr->src_reg[0].flags ^= IR2_REG_NEGATE; /* src1 */ > break; > case TGSI_OPCODE_IF: > push_predicate(ctx, &inst->Src[0].Register); > @@ -1139,16 +1053,12 @@ translate_instruction(struct fd2_compile_context *ctx, > break; > case TGSI_OPCODE_ELSE: > ctx->so->ir->pred = IR2_PRED_NE; > - /* not sure if this is required in all cases, but blob > compiler > - * won't combine EQ and NE in same CF: > - */ > - ctx->cf = NULL; > break; > case TGSI_OPCODE_ENDIF: > pop_predicate(ctx); > break; > case TGSI_OPCODE_F2I: > - instr = ir2_instr_create_alu(cf, TRUNCv, ~0); > + instr = ir2_instr_create_alu_v(ctx->so->ir, TRUNCv); > add_regs_vector_1(ctx, inst, instr); > break; > default: > @@ -1179,8 +1089,6 @@ compile_instructions(struct fd2_compile_context *ctx) > break; > } > } > - > - ctx->cf->cf_type = EXEC_END; > } > > int > diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_program.c > b/src/gallium/drivers/freedreno/a2xx/fd2_program.c > index 834a7c7fcd..34622eaba0 100644 > --- a/src/gallium/drivers/freedreno/a2xx/fd2_program.c > +++ b/src/gallium/drivers/freedreno/a2xx/fd2_program.c > @@ -199,7 +199,7 @@ patch_vtx_fetches(struct fd_context *ctx, struct > fd2_shader_stateobj *so, > instr->fetch.offset = elem->src_offset; > > for (j = 0; j < 4; j++) > - instr->regs[0]->swizzle[j] = > "xyzw01__"[desc->swizzle[j]]; > + instr->dst_reg.swizzle[j] = > "xyzw01__"[desc->swizzle[j]]; > > assert(instr->fetch.fmt != ~0); > > @@ -210,7 +210,7 @@ patch_vtx_fetches(struct fd_context *ctx, struct > fd2_shader_stateobj *so, > instr->fetch.const_idx, > instr->fetch.const_idx_sel, > elem->instance_divisor, > - instr->regs[0]->swizzle, > + instr->dst_reg.swizzle, > instr->fetch.stride, > instr->fetch.offset); > } > @@ -307,7 +307,6 @@ static struct fd2_shader_stateobj * > create_blit_fp(void) > { > struct fd2_shader_stateobj *so = create_shader(SHADER_FRAGMENT); > - struct ir2_cf *cf; > struct ir2_instruction *instr; > > if (!so) > @@ 
-315,18 +314,13 @@ create_blit_fp(void) > > so->ir = ir2_shader_create(); > > - cf = ir2_cf_create(so->ir, EXEC); > - > - instr = ir2_instr_create_tex_fetch(cf, 0); > - ir2_reg_create(instr, 0, "xyzw", 0); > - ir2_reg_create(instr, 0, "xyx", 0); > + instr = ir2_instr_create_tex_fetch(so->ir, 0); > + ir2_dst_create(instr, 0, "xyzw", 0); > + ir2_reg_create(instr, 0, "xyx", IR2_REG_INPUT); > instr->sync = true; > > - cf = ir2_cf_create_alloc(so->ir, SQ_PARAMETER_PIXEL, 0); > - cf = ir2_cf_create(so->ir, EXEC_END); > - > - instr = ir2_instr_create_alu(cf, MAXv, ~0); > - ir2_reg_create(instr, 0, NULL, IR2_REG_EXPORT); > + instr = ir2_instr_create_alu_v(so->ir, MAXv); > + ir2_dst_create(instr, 0, NULL, IR2_REG_EXPORT); > ir2_reg_create(instr, 0, NULL, 0); > ir2_reg_create(instr, 0, NULL, 0); > > @@ -349,7 +343,6 @@ static struct fd2_shader_stateobj * > create_blit_vp(void) > { > struct fd2_shader_stateobj *so = create_shader(SHADER_VERTEX); > - struct ir2_cf *cf; > struct ir2_instruction *instr; > > if (!so) > @@ -357,31 +350,23 @@ create_blit_vp(void) > > so->ir = ir2_shader_create(); > > - cf = ir2_cf_create(so->ir, EXEC); > - > - instr = ir2_instr_create_vtx_fetch(cf, 26, 1, FMT_32_32_FLOAT, false, > 8); > + instr = ir2_instr_create_vtx_fetch(so->ir, 26, 1, FMT_32_32_FLOAT, > false, 8); > instr->fetch.is_normalized = true; > - ir2_reg_create(instr, 1, "xy01", 0); > - ir2_reg_create(instr, 0, "x", 0); > + ir2_dst_create(instr, 1, "xy01", 0); > + ir2_reg_create(instr, 0, "x", IR2_REG_INPUT); > > - instr = ir2_instr_create_vtx_fetch(cf, 26, 0, FMT_32_32_32_FLOAT, > false, 12); > + instr = ir2_instr_create_vtx_fetch(so->ir, 26, 0, FMT_32_32_32_FLOAT, > false, 12); > instr->fetch.is_normalized = true; > - ir2_reg_create(instr, 2, "xyz1", 0); > - ir2_reg_create(instr, 0, "x", 0); > - > - cf = ir2_cf_create_alloc(so->ir, SQ_POSITION, 0); > - cf = ir2_cf_create(so->ir, EXEC); > + ir2_dst_create(instr, 2, "xyz1", 0); > + ir2_reg_create(instr, 0, "x", IR2_REG_INPUT); > > - 
instr = ir2_instr_create_alu(cf, MAXv, ~0); > - ir2_reg_create(instr, 62, NULL, IR2_REG_EXPORT); > + instr = ir2_instr_create_alu_v(so->ir, MAXv); > + ir2_dst_create(instr, 62, NULL, IR2_REG_EXPORT); > ir2_reg_create(instr, 2, NULL, 0); > ir2_reg_create(instr, 2, NULL, 0); > > - cf = ir2_cf_create_alloc(so->ir, SQ_PARAMETER_PIXEL, 0); > - cf = ir2_cf_create(so->ir, EXEC_END); > - > - instr = ir2_instr_create_alu(cf, MAXv, ~0); > - ir2_reg_create(instr, 0, NULL, IR2_REG_EXPORT); > + instr = ir2_instr_create_alu_v(so->ir, MAXv); > + ir2_dst_create(instr, 0, NULL, IR2_REG_EXPORT); > ir2_reg_create(instr, 1, NULL, 0); > ir2_reg_create(instr, 1, NULL, 0); > > @@ -397,7 +382,6 @@ static struct fd2_shader_stateobj * > create_solid_fp(void) > { > struct fd2_shader_stateobj *so = create_shader(SHADER_FRAGMENT); > - struct ir2_cf *cf; > struct ir2_instruction *instr; > > if (!so) > @@ -405,11 +389,8 @@ create_solid_fp(void) > > so->ir = ir2_shader_create(); > > - cf = ir2_cf_create_alloc(so->ir, SQ_PARAMETER_PIXEL, 0); > - cf = ir2_cf_create(so->ir, EXEC_END); > - > - instr = ir2_instr_create_alu(cf, MAXv, ~0); > - ir2_reg_create(instr, 0, NULL, IR2_REG_EXPORT); > + instr = ir2_instr_create_alu_v(so->ir, MAXv); > + ir2_dst_create(instr, 0, NULL, IR2_REG_EXPORT); > ir2_reg_create(instr, 0, NULL, IR2_REG_CONST); > ir2_reg_create(instr, 0, NULL, IR2_REG_CONST); > > @@ -430,7 +411,6 @@ static struct fd2_shader_stateobj * > create_solid_vp(void) > { > struct fd2_shader_stateobj *so = create_shader(SHADER_VERTEX); > - struct ir2_cf *cf; > struct ir2_instruction *instr; > > if (!so) > @@ -438,22 +418,15 @@ create_solid_vp(void) > > so->ir = ir2_shader_create(); > > - cf = ir2_cf_create(so->ir, EXEC); > - > - instr = ir2_instr_create_vtx_fetch(cf, 26, 0, FMT_32_32_32_FLOAT, > false, 12); > - ir2_reg_create(instr, 1, "xyz1", 0); > - ir2_reg_create(instr, 0, "x", 0); > - > - cf = ir2_cf_create_alloc(so->ir, SQ_POSITION, 0); > - cf = ir2_cf_create(so->ir, EXEC); > + instr = 
ir2_instr_create_vtx_fetch(so->ir, 26, 0, FMT_32_32_32_FLOAT, > false, 12); > + ir2_dst_create(instr, 1, "xyz1", 0); > + ir2_reg_create(instr, 0, "x", IR2_REG_INPUT); > > - instr = ir2_instr_create_alu(cf, MAXv, ~0); > - ir2_reg_create(instr, 62, NULL, IR2_REG_EXPORT); > + instr = ir2_instr_create_alu_v(so->ir, MAXv); > + ir2_dst_create(instr, 62, NULL, IR2_REG_EXPORT); > ir2_reg_create(instr, 1, NULL, 0); > ir2_reg_create(instr, 1, NULL, 0); > > - cf = ir2_cf_create_alloc(so->ir, SQ_PARAMETER_PIXEL, 0); > - cf = ir2_cf_create(so->ir, EXEC_END); > > return assemble(so); > } > diff --git a/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h > b/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h > index ac972ed35a..5a9f93ec79 100644 > --- a/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h > +++ b/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h > @@ -147,15 +147,25 @@ typedef struct PACKED { > uint8_t const_1_rel_abs : 1; > uint8_t const_0_rel_abs : 1; > /* dword2: */ > - uint8_t src3_reg : 6; > - uint8_t src3_reg_select : 1; > - uint8_t src3_reg_abs : 1; > - uint8_t src2_reg : 6; > - uint8_t src2_reg_select : 1; > - uint8_t src2_reg_abs : 1; > - uint8_t src1_reg : 6; > - uint8_t src1_reg_select : 1; > - uint8_t src1_reg_abs : 1; > + union { > + struct { > + uint8_t src3_reg : 6; > + uint8_t src3_reg_select : 1; > + uint8_t src3_reg_abs : 1; > + uint8_t src2_reg : 6; > + uint8_t src2_reg_select : 1; > + uint8_t src2_reg_abs : 1; > + uint8_t src1_reg : 6; > + uint8_t src1_reg_select : 1; > + uint8_t src1_reg_abs : 1; > + }; > + /* constants have full 8-bit index */ > + struct { > + uint8_t src3_reg_const : 8; > + uint8_t src2_reg_const : 8; > + uint8_t src1_reg_const : 8; > + }; > + }; > instr_vector_opc_t vector_opc : 5; > uint8_t src3_sel : 1; > uint8_t src2_sel : 1; > diff --git a/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c > b/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c > index 42a9ab494e..af9811864f 100644 > --- a/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c > +++ 
b/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c > @@ -35,19 +35,13 @@ > #define WARN_MSG(f, ...) DBG("WARN: "f, ##__VA_ARGS__) > #define ERROR_MSG(f, ...) DBG("ERROR: "f, ##__VA_ARGS__) > > -#define REG_MASK 0x3f > - > -static int cf_emit(struct ir2_cf *cf, instr_cf_t *instr); > - > static int instr_emit(struct ir2_instruction *instr, uint32_t *dwords, > uint32_t idx, struct ir2_shader_info *info); > > -static void reg_update_stats(struct ir2_register *reg, > - struct ir2_shader_info *info, bool dest); > -static uint32_t reg_fetch_src_swiz(struct ir2_register *reg, uint32_t n); > -static uint32_t reg_fetch_dst_swiz(struct ir2_register *reg); > -static uint32_t reg_alu_dst_swiz(struct ir2_register *reg); > -static uint32_t reg_alu_src_swiz(struct ir2_register *reg); > +static uint32_t reg_fetch_src_swiz(struct ir2_src_register *reg, uint32_t n); > +static uint32_t reg_fetch_dst_swiz(struct ir2_dst_register *reg); > +static uint32_t reg_alu_dst_swiz(struct ir2_dst_register *reg); > +static uint32_t reg_alu_src_swiz(struct ir2_src_register *reg); > > /* simple allocator to carve allocations out of an up-front allocated heap, > * so that we can free everything easily in one shot. 
> @@ -55,7 +49,7 @@ static uint32_t reg_alu_src_swiz(struct ir2_register *reg); > static void * ir2_alloc(struct ir2_shader *shader, int sz) > { > void *ptr = &shader->heap[shader->heap_idx]; > - shader->heap_idx += align(sz, 4); > + shader->heap_idx += align(sz, 4) / 4; > return ptr; > } > > @@ -74,7 +68,9 @@ static char * ir2_strdup(struct ir2_shader *shader, const > char *str) > struct ir2_shader * ir2_shader_create(void) > { > DEBUG_MSG(""); > - return calloc(1, sizeof(struct ir2_shader)); > + struct ir2_shader *shader = calloc(1, sizeof(struct ir2_shader)); > + shader->max_reg = -1; > + return shader; > } > > void ir2_shader_destroy(struct ir2_shader *shader) > @@ -83,189 +79,344 @@ void ir2_shader_destroy(struct ir2_shader *shader) > free(shader); > } > > -/* resolve addr/cnt/sequence fields in the individual CF's */ > -static int shader_resolve(struct ir2_shader *shader, struct ir2_shader_info > *info) > +/* check if an instruction is a simple MOV > + */ > +static struct ir2_instruction * simple_mov(struct ir2_instruction *instr, > + bool output) > { > - uint32_t addr; > - unsigned i; > - int j; > - > - addr = shader->cfs_count / 2; > - for (i = 0; i < shader->cfs_count; i++) { > - struct ir2_cf *cf = shader->cfs[i]; > - if ((cf->cf_type == EXEC) || (cf->cf_type == EXEC_END)) { > - uint32_t sequence = 0; > - > - if (cf->exec.addr && (cf->exec.addr != addr)) > - WARN_MSG("invalid addr '%d' at CF %d", > cf->exec.addr, i); > - if (cf->exec.cnt && (cf->exec.cnt != > cf->exec.instrs_count)) > - WARN_MSG("invalid cnt '%d' at CF %d", > cf->exec.cnt, i); > - > - for (j = cf->exec.instrs_count - 1; j >= 0; j--) { > - struct ir2_instruction *instr = > cf->exec.instrs[j]; > - sequence <<= 2; > - if (instr->instr_type == IR2_FETCH) > - sequence |= 0x1; > - if (instr->sync) > - sequence |= 0x2; > - } > + struct ir2_src_register *src_reg = instr->src_reg; > + struct ir2_dst_register *dst_reg = &instr->dst_reg; > + struct ir2_register *reg; > + unsigned i; > + > + /* MAXv 
used for MOV */ > + if (instr->instr_type != IR2_ALU_VECTOR || > + instr->alu_vector.opc != MAXv) > + return NULL; > + > + /* non identical srcs */ > + if (src_reg[0].num != src_reg[1].num) > + return NULL; > + > + /* flags */ > + int flags = IR2_REG_NEGATE | IR2_REG_ABS; > + if (output) > + flags |= IR2_REG_INPUT | IR2_REG_CONST; > + if ((src_reg[0].flags & flags) || (src_reg[1].flags & flags)) > + return NULL; > + > + /* clamping */ > + if (instr->alu_vector.clamp) > + return NULL; > + > + /* swizzling */ > + for (i = 0; i < 4; i++) { > + char swiz = (dst_reg->swizzle ? dst_reg->swizzle : "xyzw")[i]; > + if (swiz == '_') > + continue; > + > + if (swiz != (src_reg[0].swizzle ? src_reg[0].swizzle : > "xyzw")[i] || > + swiz != (src_reg[1].swizzle ? src_reg[1].swizzle : > "xyzw")[i]) > + return NULL; > + } > + > + if (output) > + reg = &instr->shader->reg[src_reg[0].num]; > + else > + reg = &instr->shader->reg[dst_reg->num]; > + > + assert(reg->write_idx >= 0); > + if (reg->write_idx != reg->write_idx2) > + return NULL; > + > + if (!output) > + return instr; > + > + instr = instr->shader->instr[reg->write_idx]; > + return instr->instr_type != IR2_ALU_VECTOR ? 
NULL : instr; > +} > > - cf->exec.addr = addr; > - cf->exec.cnt = cf->exec.instrs_count; > - cf->exec.sequence = sequence; > +static int src_to_reg(struct ir2_instruction *instr, > + struct ir2_src_register *reg) > +{ > + if (reg->flags & IR2_REG_CONST) > + return reg->num; > > - addr += cf->exec.instrs_count; > - } > - } > + return instr->shader->reg[reg->num].reg; > +} > + > +static int dst_to_reg(struct ir2_instruction *instr, > + struct ir2_dst_register *reg) > +{ > + if (reg->flags & IR2_REG_EXPORT) > + return reg->num; > > - info->sizedwords = 3 * addr; > + return instr->shader->reg[reg->num].reg; > +} > > - return 0; > +static bool mask_get(uint32_t *mask, unsigned index) > +{ > + return !!(mask[index / 32] & 1 << index % 32); > } > > -void * ir2_shader_assemble(struct ir2_shader *shader, struct ir2_shader_info > *info) > +static void mask_set(uint32_t *mask, struct ir2_register *reg, int index) > { > - uint32_t i, j; > - uint32_t *ptr, *dwords = NULL; > - uint32_t idx = 0; > - int ret; > - > - info->sizedwords = 0; > - info->max_reg = -1; > - info->max_input_reg = 0; > - info->regs_written = 0; > - > - /* we need an even # of CF's.. 
insert a NOP if needed */ > - if (shader->cfs_count != align(shader->cfs_count, 2)) > - ir2_cf_create(shader, NOP); > - > - /* first pass, resolve sizes and addresses: */ > - ret = shader_resolve(shader, info); > - if (ret) { > - ERROR_MSG("resolve failed: %d", ret); > - goto fail; > + if (reg) { > + unsigned i; > + for (i = 0; i < ARRAY_SIZE(reg->regmask); i++) > + mask[i] |= reg->regmask[i]; > } > + if (index >= 0) > + mask[index / 32] |= 1 << index % 32; > +} > > - ptr = dwords = calloc(4, info->sizedwords); > +static bool sets_pred(struct ir2_instruction *instr) > +{ > + return instr->instr_type == IR2_ALU_SCALAR && > + instr->alu_scalar.opc >= PRED_SETEs && > + instr->alu_scalar.opc <= PRED_SET_RESTOREs; > +} > > - /* second pass, emit CF program in pairs: */ > - for (i = 0; i < shader->cfs_count; i += 2) { > - instr_cf_t *cfs = (instr_cf_t *)ptr; > - ret = cf_emit(shader->cfs[i], &cfs[0]); > - if (ret) { > - ERROR_MSG("CF emit failed: %d\n", ret); > - goto fail; > + > + > +void* ir2_shader_assemble(struct ir2_shader *shader, > + struct ir2_shader_info *info) > +{ > + /* NOTES > + * blob compiler seems to always puts PRED_* instrs in a CF by > + * themselves, and wont combine EQ/NE in the same CF > + * (not doing this - doesn't seem to make a difference) > + * > + * TODO: implement scheduling for combining vector+scalar instructions > + * -some vector instructions can be replaced by scalar > + */ > + > + /* first step: > + * 1. remove "NOP" MOV instructions generated by TGSI for > input/output: > + * 2. track information for register allocation, and to remove > + * the dead code when some exports are not needed > + * 3. 
add additional instructions for a20x hw binning if needed > + * NOTE: modifies the shader instrs > + * this step could be done as instructions are added by compiler > instead > + */ > + > + /* mask of exports that must be generated > + * used to avoid calculating ps exports with hw binning > + */ > + uint64_t export = ~0ull; > + /* bitmask of variables required for exports defined by "export" */ > + uint32_t export_mask[REG_MASK/32+1] = {}; > + > + unsigned idx, reg_idx; > + unsigned max_input = 0; > + int export_size = -1; > + > + for (idx = 0; idx < shader->instr_count; idx++) { > + struct ir2_instruction *instr = shader->instr[idx], *prev; > + struct ir2_dst_register dst_reg = instr->dst_reg; > + > + if (dst_reg.flags & IR2_REG_EXPORT) { > + if (dst_reg.num < 32) > + export_size++; > + > + if ((prev = simple_mov(instr, true))) { > + /* copy instruction but keep dst */ > + *instr = *prev; > + instr->dst_reg = dst_reg; > + } > } > - ret = cf_emit(shader->cfs[i+1], &cfs[1]); > - if (ret) { > - ERROR_MSG("CF emit failed: %d\n", ret); > - goto fail; > + > + for (reg_idx = 0; reg_idx < instr->src_reg_count; reg_idx++) { > + struct ir2_src_register *src_reg = > &instr->src_reg[reg_idx]; > + struct ir2_register *reg; > + int num; > + > + if (src_reg->flags & IR2_REG_CONST) > + continue; > + > + num = src_reg->num; > + reg = &shader->reg[num]; > + reg->read_idx = idx; > + > + if (src_reg->flags & IR2_REG_INPUT) { > + max_input = MAX2(max_input, num); > + } else { > + /* bypass simple mov used to set src_reg */ > + assert(reg->write_idx >= 0); > + prev = shader->instr[reg->write_idx]; > + if (simple_mov(prev, false)) { > + *src_reg = prev->src_reg[0]; > + /* process same src_reg again */ > + reg_idx -= 1; > + continue; > + } > + } > + > + /* update dependencies */ > + uint32_t *mask = (dst_reg.flags & IR2_REG_EXPORT) ? 
> + export_mask : > shader->reg[dst_reg.num].regmask; > + mask_set(mask, reg, num); > + if (sets_pred(instr)) > + mask_set(export_mask, reg, num); > } > - ptr += 3; > - assert((ptr - dwords) <= info->sizedwords); > } > > - /* third pass, emit ALU/FETCH: */ > - for (i = 0; i < shader->cfs_count; i++) { > - struct ir2_cf *cf = shader->cfs[i]; > - if ((cf->cf_type == EXEC) || (cf->cf_type == EXEC_END)) { > - for (j = 0; j < cf->exec.instrs_count; j++) { > - ret = instr_emit(cf->exec.instrs[j], ptr, > idx++, info); > - if (ret) { > - ERROR_MSG("instruction emit failed: > %d", ret); > - goto fail; > - } > - ptr += 3; > - assert((ptr - dwords) <= info->sizedwords); > + /* second step: > + * emit instructions (with CFs) + RA > + */ > + instr_cf_t cfs[128], *cf = cfs; > + uint32_t alufetch[3*256], *af = alufetch; > + > + /* RA is done on write, so inputs must be allocated here */ > + for (reg_idx = 0; reg_idx <= max_input; reg_idx++) > + shader->reg[reg_idx].reg = reg_idx; > + info->max_reg = max_input; > + > + /* CF instr state */ > + instr_cf_exec_t exec = { .opc = EXEC }; > + instr_cf_alloc_t alloc = { .opc = ALLOC }; > + bool need_alloc = 0; > + bool pos_export = 0; > + > + export_size = MAX2(export_size, 0); > + > + for (idx = 0; idx < shader->instr_count; idx++) { > + struct ir2_instruction *instr = shader->instr[idx]; > + struct ir2_dst_register *dst_reg = &instr->dst_reg; > + unsigned num = dst_reg->num; > + struct ir2_register *reg; > + > + /* a2xx only has 64 registers, so we can use a single 64-bit > mask */ > + uint64_t regmask = 0ull; > + > + /* compute the current regmask */ > + for (reg_idx = 0; (int) reg_idx <= shader->max_reg; > reg_idx++) { > + reg = &shader->reg[reg_idx]; > + if ((int) idx > reg->write_idx && idx < reg->read_idx) > + regmask |= (1ull << reg->reg); > + } > + > + if (dst_reg->flags & IR2_REG_EXPORT) { > + /* skip if export is not needed */ > + if (!(export & (1ull << num))) > + continue; > + > + /* ALLOC CF: > + * want to alloc all < 32 at 
once > + * 32/33 and 62/63 come in pairs > + * XXX assuming all 3 types are never interleaved > + */ > + if (num < 32) { > + alloc.size = export_size; > + alloc.buffer_select = SQ_PARAMETER_PIXEL; > + need_alloc = export_size >= 0; > + export_size = -1; > + } else if (num == 32 || num == 33) { > + alloc.size = 0; > + alloc.buffer_select = SQ_MEMORY; > + need_alloc = num != 33; > + } else { > + alloc.size = 0; > + alloc.buffer_select = SQ_POSITION; > + need_alloc = !pos_export; > + pos_export = true; > } > + > + } else { > + /* skip if dst register not needed to compute exports > */ > + if (!mask_get(export_mask, num)) > + continue; > + > + /* RA on first write */ > + reg = &shader->reg[num]; > + if (reg->write_idx == idx) { > + reg->reg = ffsll(~regmask) - 1; > + info->max_reg = MAX2(info->max_reg, reg->reg); > + } > + } > + > + if (exec.count == 6 || (exec.count && need_alloc)) { > + *cf++ = *(instr_cf_t*) &exec; > + exec.address += exec.count; > + exec.serialize = 0; > + exec.count = 0; > } > + > + if (need_alloc) { > + *cf++ = *(instr_cf_t*) &alloc; > + need_alloc = false; > + } > + > + int ret = instr_emit(instr, af, idx, info); af += 3; > + assert(!ret); > + > + if (instr->instr_type == IR2_FETCH) > + exec.serialize |= 0x1 << exec.count * 2; > + if (instr->sync) > + exec.serialize |= 0x2 << exec.count * 2; > + exec.count += 1; > } > > - return dwords; > > -fail: > - free(dwords); > - return NULL; > -} > + exec.opc = !export_size ? 
EXEC : EXEC_END; > + *cf++ = *(instr_cf_t*) &exec; > + exec.address += exec.count; > + exec.serialize = 0; > + exec.count = 0; > > + /* GPU will hang without at least one pixel alloc */ > + if (!export_size) { > + alloc.size = 0; > + alloc.buffer_select = SQ_PARAMETER_PIXEL; > + *cf++ = *(instr_cf_t*) &alloc; > > -struct ir2_cf * ir2_cf_create(struct ir2_shader *shader, instr_cf_opc_t > cf_type) > -{ > - struct ir2_cf *cf = ir2_alloc(shader, sizeof(struct ir2_cf)); > - DEBUG_MSG("%d", cf_type); > - cf->shader = shader; > - cf->cf_type = cf_type; > - assert(shader->cfs_count < ARRAY_SIZE(shader->cfs)); > - shader->cfs[shader->cfs_count++] = cf; > - return cf; > -} > + exec.opc = EXEC_END; > + *cf++ = *(instr_cf_t*) &exec; > + } > > + unsigned num_cfs = cf - cfs; > > -/* > - * CF instructions: > - */ > + /* insert nop to get an even # of CFs */ > + if (num_cfs % 2) { > + *cf++ = (instr_cf_t) { .opc = NOP }; > + num_cfs++; > + } > > -static int cf_emit(struct ir2_cf *cf, instr_cf_t *instr) > -{ > - memset(instr, 0, sizeof(*instr)); > - > - instr->opc = cf->cf_type; > - > - switch (cf->cf_type) { > - case NOP: > - break; > - case EXEC: > - case EXEC_END: > - assert(cf->exec.addr <= 0x1ff); > - assert(cf->exec.cnt <= 0x6); > - assert(cf->exec.sequence <= 0xfff); > - instr->exec.address = cf->exec.addr; > - instr->exec.count = cf->exec.cnt; > - instr->exec.serialize = cf->exec.sequence; > - break; > - case ALLOC: > - assert(cf->alloc.size <= 0xf); > - instr->alloc.size = cf->alloc.size; > - switch (cf->alloc.type) { > - case SQ_POSITION: > - case SQ_PARAMETER_PIXEL: > - instr->alloc.buffer_select = cf->alloc.type; > + /* offset cf addrs */ > + for (idx = 0; idx < num_cfs; idx++) { > + switch (cfs[idx].opc) { > + case EXEC: > + case EXEC_END: > + cfs[idx].exec.address += num_cfs / 2; > break; > default: > - ERROR_MSG("invalid alloc type: %d", cf->alloc.type); > - return -1; > + break; > + /* XXX and any other address using cf that gets implemented > */ > } > - break; > - 
case COND_EXEC: > - case COND_EXEC_END: > - case COND_PRED_EXEC: > - case COND_PRED_EXEC_END: > - case LOOP_START: > - case LOOP_END: > - case COND_CALL: > - case RETURN: > - case COND_JMP: > - case COND_EXEC_PRED_CLEAN: > - case COND_EXEC_PRED_CLEAN_END: > - case MARK_VS_FETCH_DONE: > - ERROR_MSG("TODO"); > - return -1; > } > > - return 0; > + /* concatenate cfs+alufetchs */ > + uint32_t cfdwords = num_cfs / 2 * 3; > + uint32_t alufetchdwords = exec.address * 3; > + info->sizedwords = cfdwords + alufetchdwords; > + uint32_t *dwords = malloc(info->sizedwords * 4); > + assert(dwords); > + memcpy(dwords, cfs, cfdwords * 4); > + memcpy(&dwords[cfdwords], alufetch, alufetchdwords * 4); > + return dwords; > } > > - > -struct ir2_instruction * ir2_instr_create(struct ir2_cf *cf, int instr_type) > +struct ir2_instruction * ir2_instr_create(struct ir2_shader *shader, > + int instr_type) > { > struct ir2_instruction *instr = > - ir2_alloc(cf->shader, sizeof(struct ir2_instruction)); > + ir2_alloc(shader, sizeof(struct ir2_instruction)); > DEBUG_MSG("%d", instr_type); > - instr->shader = cf->shader; > - instr->pred = cf->shader->pred; > + instr->shader = shader; > + instr->idx = shader->instr_count; > + instr->pred = shader->pred; > instr->instr_type = instr_type; > - assert(cf->exec.instrs_count < ARRAY_SIZE(cf->exec.instrs)); > - cf->exec.instrs[cf->exec.instrs_count++] = instr; > + shader->instr[shader->instr_count++] = instr; > return instr; > } > > @@ -279,15 +430,11 @@ static int instr_emit_fetch(struct ir2_instruction > *instr, > struct ir2_shader_info *info) > { > instr_fetch_t *fetch = (instr_fetch_t *)dwords; > - int reg = 0; > - struct ir2_register *dst_reg = instr->regs[reg++]; > - struct ir2_register *src_reg = instr->regs[reg++]; > + struct ir2_dst_register *dst_reg = &instr->dst_reg; > + struct ir2_src_register *src_reg = &instr->src_reg[0]; > > memset(fetch, 0, sizeof(*fetch)); > > - reg_update_stats(dst_reg, info, true); > - reg_update_stats(src_reg, info, 
false); > - > fetch->opc = instr->fetch.opc; > > if (instr->fetch.opc == VTX_FETCH) { > @@ -298,9 +445,9 @@ static int instr_emit_fetch(struct ir2_instruction *instr, > assert(instr->fetch.const_idx <= 0x1f); > assert(instr->fetch.const_idx_sel <= 0x3); > > - vtx->src_reg = src_reg->num; > + vtx->src_reg = src_to_reg(instr, src_reg); > vtx->src_swiz = reg_fetch_src_swiz(src_reg, 1); > - vtx->dst_reg = dst_reg->num; > + vtx->dst_reg = dst_to_reg(instr, dst_reg); > vtx->dst_swiz = reg_fetch_dst_swiz(dst_reg); > vtx->must_be_one = 1; > vtx->const_index = instr->fetch.const_idx; > @@ -326,9 +473,9 @@ static int instr_emit_fetch(struct ir2_instruction *instr, > > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev