On 09/18/2015 05:03 PM, gang.chen.5...@gmail.com wrote:
+uint64_t helper_v1add(uint64_t a, uint64_t b)
+{
+    uint64_t r = 0;
+    int i;
+
+    for (i = 0; i < 64; i += 8) {
+        int64_t ae = (int8_t)(a >> i);
+        int64_t be = (int8_t)(b >> i);
+        r |= ((ae + be) & 0xff) << i;
+    }
+    return r;
+}
+
+uint64_t helper_v2add(uint64_t a, uint64_t b)
+{
+    uint64_t r = 0;
+    int i;
+
+    for (i = 0; i < 64; i += 16) {
+        int64_t ae = (int16_t)(a >> i);
+        int64_t be = (int16_t)(b >> i);
+        r |= ((ae + be) & 0xffff) << i;
+    }
+    return r;
+}
There's a trick for this that's more efficient for 4 or more elements per
vector (i.e. good for v2 and v1, but not v4):
    a + b = ((a & 0x7f7f7f7f) + (b & 0x7f7f7f7f)) ^ ((a ^ b) & 0x80808080)
    a - b = ((a | 0x80808080) - (b & 0x7f7f7f7f)) ^ ((a ^ ~b) & 0x80808080)
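For v1, that would look something like this (untested sketch, just the
64-bit versions of those masks):

uint64_t helper_v1add(uint64_t a, uint64_t b)
{
    /* Add the low 7 bits of every byte, then fix up the top bit of
       each byte separately, so no carry crosses a lane boundary.  */
    uint64_t l = (a & 0x7f7f7f7f7f7f7f7fULL) + (b & 0x7f7f7f7f7f7f7f7fULL);
    return l ^ ((a ^ b) & 0x8080808080808080ULL);
}

and the v2 version is the same thing with 0x7fff.../0x8000... masks.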
+uint64_t helper_v4add(uint64_t a, uint64_t b)
+{
+    uint64_t r = 0;
+    int i;
+
+    for (i = 0; i < 64; i += 32) {
+        int64_t ae = (int32_t)(a >> i);
+        int64_t be = (int32_t)(b >> i);
+        r |= ((ae + be) & 0xffffffff) << i;
+    }
+    return r;
+}
I should have mentioned this in the previous patch...
I think it would probably be best to open-code all, or most of, the v4
operations. Something like:
static void gen_v4op(TCGv d64, TCGv a64, TCGv b64,
                     void (*generate)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 al = tcg_temp_new_i32();
    TCGv_i32 ah = tcg_temp_new_i32();
    TCGv_i32 bl = tcg_temp_new_i32();
    TCGv_i32 bh = tcg_temp_new_i32();

    tcg_gen_extr_i64_i32(al, ah, a64);
    tcg_gen_extr_i64_i32(bl, bh, b64);
    generate(al, al, bl);
    generate(ah, ah, bh);
    tcg_gen_concat_i32_i64(d64, al, ah);

    tcg_temp_free_i32(al);
    tcg_temp_free_i32(ah);
    tcg_temp_free_i32(bl);
    tcg_temp_free_i32(bh);
}
    case OE_RRR(V4ADD, 0, X0):
    case OE_RRR(V4ADD, 0, X1):
-        return TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+        gen_helper_v4add(tdest, tsrca, tsrcb);
And then
        gen_v4op(tdest, tsrca, tsrcb, tcg_gen_add_i32);
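Presumably the same pattern extends to the other v4 ops that have a
direct i32 counterpart, e.g. (assuming V4SUB decodes through the same
switch):

    case OE_RRR(V4SUB, 0, X0):
    case OE_RRR(V4SUB, 0, X1):
        gen_v4op(tdest, tsrca, tsrcb, tcg_gen_sub_i32);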
r~