Hello!

The motivation for this patch was exposed miscompilation of gfortran
on alpha that resulted in:

FAIL: gfortran.dg/assumed_rank_3.f90:15.20:

    print *, ubound(x,dim=3)  ! << wrong dim
                    1
Error: Assumed-rank variable x at (1) may only be used as actual argument

The problem was in the miscompilation of resolve_actual_arglist from
resolve.c.  This function initializes two nearby global bool variables
with the following sequence:

  actual_arg = true;
  first_actual_arg = true;

but due to the miscompilation, the actual_arg was never set to true.
This happened due to the way stores to QImode and HImode locations are
implemented on non-BWX targets. The sequence reads full word, does its
magic to the part and stores the full word with changed part back to
the memory. However - the scheduler mixed two sequences, violating the
atomicity of RMW sequence.

This can be illustrated with following test:

--cut here--
static char *a;
static char *b;

void foo (void)
{
  a[1] = 1;
  b[2] = 1;
}

int bar (void)
{
  return a && b;
}
--cut here--

$foo..ng:
        .prologue 1
        lda $1,1($31)
        lda $2,a
        ldq $3,0($2)   <-- load a
        lda $2,b
        lda $7,1($3)
        ldq_u $5,1($3)   <-- load a
        insbl $1,$7,$4
        ldq $2,0($2)   <-- load b
        mskbl $5,$7,$5
        lda $6,2($2)
        bis $4,$5,$4
        stq_u $4,1($3)  <-- store a
        insbl $1,$6,$1
        ldq_u $3,2($2)   <-- load b
        mskbl $3,$6,$3
        cpys $f31,$f31,$f31
        bis $1,$3,$1
        stq_u $1,2($2)   <-- store b
        ret $31,($26),1
        .end foo

if a and b alias to the same wide memory location, then "b"  RMW
sequence corrupts "a".

The solution is to wrap the sequences with memory blockages to prevent
scheduler to move read and write around. These blockages thus ensure
atomicity of the sequence.

2014-06-27  Uros Bizjak  <ubiz...@gmail.com>

    * config/alpha/alpha.md (unspec): Add UNSPEC_MEMORY_BLOCKAGE.
    (*memory_blockage): New.
    (aligned_store): Wrap the expanded sequence with memory blockages
    (unaligned_store<mode>): Ditto.

The patch was bootstrapped and regression tested on alpha-linux-gnu,
with the compiler, configured with "--target=alpha-linux-gnu
--host=alpha-linux-gnu --build=alpha-linux-gnu".

OK for mainline and release branches?

Uros.
Index: config/alpha/alpha.md
===================================================================
--- config/alpha/alpha.md       (revision 212063)
+++ config/alpha/alpha.md       (working copy)
@@ -58,6 +58,9 @@
   UNSPEC_ATOMIC
   UNSPEC_CMPXCHG
   UNSPEC_XCHG
+
+  ;; Scheduling
+  UNSPEC_MEMORY_BLOCKAGE
 ])
 
 ;; UNSPEC_VOLATILE:
@@ -3689,6 +3692,16 @@
   [(set_attr "length" "0")
    (set_attr "type" "none")])
 
+;; Do not schedule instructions accessing memory across this point.
+
+(define_insn "*memory_blockage"
+  [(set (match_operand:BLK 0)
+       (unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BLOCKAGE))]
+  ""
+  ""
+  [(set_attr "length" "0")
+   (set_attr "type" "none")])
+
 (define_insn "jump"
   [(set (pc)
        (label_ref (match_operand 0)))]
@@ -4269,7 +4282,9 @@
 ;; the value should be placed.  Operands 3 and 4 are SImode temporaries.
 
 (define_expand "aligned_store"
-  [(set (match_operand:SI 3 "register_operand")
+  [(set (match_dup 6)
+       (unspec:BLK [(match_dup 6)] UNSPEC_MEMORY_BLOCKAGE))
+   (set (match_operand:SI 3 "register_operand")
        (match_operand:SI 0 "memory_operand"))
    (set (subreg:DI (match_dup 3) 0)
        (and:DI (subreg:DI (match_dup 3) 0) (match_dup 5)))
@@ -4278,11 +4293,15 @@
                   (match_operand:DI 2 "const_int_operand")))
    (set (subreg:DI (match_dup 4) 0)
        (ior:DI (subreg:DI (match_dup 4) 0) (subreg:DI (match_dup 3) 0)))
-   (set (match_dup 0) (match_dup 4))]
+   (set (match_dup 0) (match_dup 4))
+   (set (match_dup 6)
+       (unspec:BLK [(match_dup 6)] UNSPEC_MEMORY_BLOCKAGE))]
   ""
 {
   operands[5] = GEN_INT (~ (GET_MODE_MASK (GET_MODE (operands[1]))
                            << INTVAL (operands[2])));
+  operands[6] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+  MEM_VOLATILE_P (operands[6]) = 1;
 })
 
 ;; For the unaligned byte and halfword cases, we use code similar to that
@@ -4293,7 +4312,9 @@
 ;; operand 2 can be that register.
 
 (define_expand "unaligned_store<mode>"
-  [(set (match_operand:DI 3 "register_operand")
+  [(set (match_dup 6)
+       (unspec:BLK [(match_dup 6)] UNSPEC_MEMORY_BLOCKAGE))
+   (set (match_operand:DI 3 "register_operand")
        (mem:DI (and:DI (match_operand:DI 0 "address_operand")
                        (const_int -8))))
    (set (match_operand:DI 2 "register_operand")
@@ -4308,9 +4329,15 @@
                   (ashift:DI (match_dup 2) (const_int 3))))
    (set (match_dup 4) (ior:DI (match_dup 4) (match_dup 3)))
    (set (mem:DI (and:DI (match_dup 0) (const_int -8)))
-       (match_dup 4))]
+       (match_dup 4))
+   (set (match_dup 6)
+       (unspec:BLK [(match_dup 6)] UNSPEC_MEMORY_BLOCKAGE))]
   ""
-  "operands[5] = GEN_INT (GET_MODE_MASK (<MODE>mode));")
+{
+  operands[5] = GEN_INT (GET_MODE_MASK (<MODE>mode));
+  operands[6] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+  MEM_VOLATILE_P (operands[6]) = 1;
+})
 
 ;; Here are the define_expand's for QI and HI moves that use the above
 ;; patterns.  We have the normal sets, plus the ones that need scratch

Reply via email to