OK, here is a new version. There is now an option for setting a maximum on the array size, which takes its default from the BLAS limit (if specified).
Currently, only setting the maximum size to zero as a way of disabling the unrolling is supported. I have done this in a few test cases. The bugs reported by Dominique have now been fixed. Still to do: Bounds checking (a rather big one), honoring the maximum size, test cases, ChangeLog, some more comments to the code. Anything else? Regards Thomas
Index: fortran/frontend-passes.c =================================================================== --- fortran/frontend-passes.c (Revision 221944) +++ fortran/frontend-passes.c (Arbeitskopie) @@ -43,7 +43,11 @@ static void doloop_warn (gfc_namespace *); static void optimize_reduction (gfc_namespace *); static int callback_reduction (gfc_expr **, int *, void *); static void realloc_strings (gfc_namespace *); -static gfc_expr *create_var (gfc_expr *); +static gfc_expr *create_var (gfc_expr *, const char *vname=NULL); +static int optimize_matmul_assign (gfc_code **, int *, void *); +static gfc_code * create_do_loop (gfc_expr *, gfc_expr *, gfc_expr *, + locus *, gfc_namespace *, + char *vname=NULL); /* How deep we are inside an argument list. */ @@ -93,8 +97,24 @@ struct my_struct *evec; static bool in_assoc_list; +/* Counter for temporary variables. */ + +static int var_num = 1; + +/* What sort of matrix we are dealing with when optimizing MATMUL. */ + +enum matrix_case { none=0, A2B2, A2B1, A1B2 }; + +/* Keep track of the number of expressions we have inserted so far + using create_var. */ + +int n_vars; + +void gfc_debug_expr (gfc_expr *); + /* Entry point - run all passes for a namespace. */ + void gfc_run_passes (gfc_namespace *ns) { @@ -157,7 +177,7 @@ realloc_string_callback (gfc_code **c, int *walk_s return 0; current_code = c; - n = create_var (expr2); + n = create_var (expr2, "trim"); co->expr2 = n; return 0; } @@ -524,29 +544,11 @@ constant_string_length (gfc_expr *e) } -/* Returns a new expression (a variable) to be used in place of the old one, - with an assignment statement before the current statement to set - the value of the variable. Creates a new BLOCK for the statement if - that hasn't already been done and puts the statement, plus the - newly created variables, in that block. Special cases: If the - expression is constant or a temporary which has already - been created, just copy it. */ - -static gfc_expr* -create_var (gfc_expr * e) +static gfc_namespace* +insert_block () { - char name[GFC_MAX_SYMBOL_LEN +1]; - static int num = 1; - gfc_symtree *symtree; - gfc_symbol *symbol; - gfc_expr *result; - gfc_code *n; gfc_namespace *ns; - int i; - if (e->expr_type == EXPR_CONSTANT || is_fe_temp (e)) - return gfc_copy_expr (e); - /* If the block hasn't already been created, do so. */ if (inserted_block == NULL) { @@ -578,7 +580,37 @@ constant_string_length (gfc_expr *e) else ns = inserted_block->ext.block.ns; - sprintf(name, "__var_%d",num++); + return ns; +} + +/* Returns a new expression (a variable) to be used in place of the old one, + with an optional assignment statement before the current statement to set + the value of the variable. Creates a new BLOCK for the statement if that + hasn't already been done and puts the statement, plus the newly created + variables, in that block. Special cases: If the expression is constant or + a temporary which has already been created, just copy it. */ + +static gfc_expr* +create_var (gfc_expr * e, const char *vname) +{ + char name[GFC_MAX_SYMBOL_LEN +1]; + gfc_symtree *symtree; + gfc_symbol *symbol; + gfc_expr *result; + gfc_code *n; + gfc_namespace *ns; + int i; + + if (e->expr_type == EXPR_CONSTANT || is_fe_temp (e)) + return gfc_copy_expr (e); + + ns = insert_block (); + + if (vname) + snprintf (name, GFC_MAX_SYMBOL_LEN, "__var_%d_%s", var_num++, vname); + else + snprintf (name, GFC_MAX_SYMBOL_LEN, "__var_%d", var_num++); + if (gfc_get_sym_tree (name, ns, &symtree, false) != 0) gcc_unreachable (); @@ -651,6 +683,7 @@ constant_string_length (gfc_expr *e) result->ref->type = REF_ARRAY; result->ref->u.ar.type = AR_FULL; result->ref->u.ar.where = e->where; + result->ref->u.ar.dimen = e->rank; result->ref->u.ar.as = symbol->ts.type == BT_CLASS ? CLASS_DATA (symbol)->as : symbol->as; if (warn_array_temporaries) @@ -658,6 +691,7 @@ constant_string_length (gfc_expr *e) "Creating array temporary at %L", &(e->where)); } + /* Generate the new assignment. */ n = XCNEW (gfc_code); n->op = EXEC_ASSIGN; @@ -666,6 +700,7 @@ constant_string_length (gfc_expr *e) n->expr1 = gfc_copy_expr (result); n->expr2 = e; *changed_statement = n; + n_vars ++; return result; } @@ -724,7 +759,7 @@ cfe_expr_0 (gfc_expr **e, int *walk_subtrees, if (gfc_dep_compare_functions (*ei, *ej, true) == 0) { if (newvar == NULL) - newvar = create_var (*ei); + newvar = create_var (*ei, "fcn"); if (warn_function_elimination) do_warn_function_elimination (*ej); @@ -931,6 +966,7 @@ convert_elseif (gfc_code **c, int *walk_subtrees A /* Don't walk subtrees. */ return 0; } + /* Optimize a namespace, including all contained namespaces. */ static void @@ -947,6 +983,9 @@ optimize_namespace (gfc_namespace *ns) gfc_code_walker (&ns->code, convert_elseif, dummy_expr_callback, NULL); gfc_code_walker (&ns->code, cfe_code, cfe_expr_0, NULL); gfc_code_walker (&ns->code, optimize_code, optimize_expr, NULL); + if (flag_inline_matmul_limit != 0) + gfc_code_walker (&ns->code, optimize_matmul_assign, dummy_expr_callback, + NULL); /* BLOCKs are handled in the expression walker below. */ for (ns = ns->contained; ns; ns = ns->sibling) @@ -1222,7 +1261,7 @@ combine_array_constructor (gfc_expr *e) if (op2->ts.type == BT_CHARACTER) return false; - scalar = create_var (gfc_copy_expr (op2)); + scalar = create_var (gfc_copy_expr (op2), "constr"); oldbase = op1->value.constructor; newbase = NULL; @@ -1939,7 +1978,795 @@ doloop_warn (gfc_namespace *ns) gfc_code_walker (&ns->code, doloop_code, do_function, NULL); } +/* Auxiliary function to build and simplify an array inquiry function. + dim is zero-based. */ +static gfc_expr * +get_array_inq_function (gfc_expr *e, int dim, gfc_isym_id id) +{ + gfc_expr *fcn; + gfc_expr *dim_arg, *kind; + const char *name; + + switch (id) + { + case GFC_ISYM_LBOUND: + name = "_gfortran_lbound"; + break; + + case GFC_ISYM_UBOUND: + name = "_gfortran_ubound"; + break; + + case GFC_ISYM_SIZE: + name = "_gfortran_size"; + break; + + default: + gcc_unreachable (); + } + + dim_arg = gfc_get_int_expr (gfc_default_integer_kind, &e->where, dim); + kind = gfc_get_int_expr (gfc_default_integer_kind, &e->where, + gfc_index_integer_kind); + + fcn = gfc_build_intrinsic_call (current_ns, id, name, e->where, 3, + gfc_copy_expr(e), dim_arg, kind); + gfc_simplify_expr (fcn, 0); + return fcn; +} + +/* Build a not equals - expression. */ + +static gfc_expr* +build_logical_expr (gfc_expr *e1, gfc_expr *e2, gfc_intrinsic_op op) +{ + gfc_typespec ts; + gfc_expr *res; + + ts.type = BT_LOGICAL; + ts.kind = gfc_default_logical_kind; + res = gfc_get_expr (); + res->where = e1->where; + res->expr_type = EXPR_OP; + res->value.op.op = op; + res->value.op.op1 = e1; + res->value.op.op2 = e2; + res->ts = ts; + + return res; +} +/* Handle matrix reallocation. Caller is responsible to insert into + the code tree. + + For the two-dimensional case, build + + if (allocated(c)) then + if (size(c,1) /= size(a,1) .or. size(c,2) /= size(b,2)) then + deallocate(c) + allocate (c(size(a,1), size(b,2))) + end if + else + allocate (c(size(a,1),size(b,2))) + end if + +*/ + +static gfc_code * +matmul_lhs_realloc (gfc_expr *c, gfc_expr *a, gfc_expr *b, + enum matrix_case m_case) +{ + + gfc_expr *allocated, *alloc_expr; + gfc_code *if_alloc_1, *if_alloc_2, *if_size_1, *if_size_2; + gfc_code *else_alloc; + gfc_code *deallocate, *allocate1, *allocate_else; + gfc_ref *ref; + gfc_array_ref *ar; + gfc_expr *cond, *ne1, *ne2; + + alloc_expr = gfc_copy_expr (c); + + ref = alloc_expr->ref; + + while (ref) + { + if (ref->type == REF_ARRAY && ref->u.ar.type != AR_ELEMENT) + break; + + ref = ref->next; + + } + ar = &ref->u.ar; + gcc_assert (ar && ar->type == AR_FULL); + + /* c comes in as a full ref. Change it into a copy and make it into an + element ref so it has the right for for ALLOCATE. In the same switch, + also generate the size comparison for the secod IF statement. */ + + ar->type = AR_ELEMENT; + + switch (m_case) + { + case A2B2: + ar->start[0] = get_array_inq_function (a, 1, GFC_ISYM_SIZE); + ar->start[1] = get_array_inq_function (b, 2, GFC_ISYM_SIZE); + ne1 = build_logical_expr (get_array_inq_function (c, 1, GFC_ISYM_SIZE), + get_array_inq_function (a, 1, GFC_ISYM_SIZE), + INTRINSIC_NE); + ne2 = build_logical_expr (get_array_inq_function (c, 2, GFC_ISYM_SIZE), + get_array_inq_function (b, 2, GFC_ISYM_SIZE), + INTRINSIC_NE); + cond = build_logical_expr (ne1, ne2, INTRINSIC_OR); + break; + + case A2B1: + ar->start[0] = get_array_inq_function (a, 1, GFC_ISYM_SIZE); + cond = build_logical_expr (get_array_inq_function (c, 1, GFC_ISYM_SIZE), + get_array_inq_function (a, 2, GFC_ISYM_SIZE), + INTRINSIC_NE); + break; + + case A1B2: + ar->start[0] = get_array_inq_function (b, 1, GFC_ISYM_SIZE); + cond = build_logical_expr (get_array_inq_function (c, 1, GFC_ISYM_SIZE), + get_array_inq_function (b, 2, GFC_ISYM_SIZE), + INTRINSIC_NE); + break; + + default: + gcc_unreachable(); + + } + + gfc_simplify_expr (cond, 0); + + /* We need two identical allocate statements in two + branches of the IF statement. */ + + allocate1 = XCNEW (gfc_code); + allocate1->op = EXEC_ALLOCATE; + allocate1->ext.alloc.list = gfc_get_alloc (); + allocate1->loc = c->where; + allocate1->ext.alloc.list->expr = gfc_copy_expr (alloc_expr); + + allocate_else = XCNEW (gfc_code); + allocate_else->op = EXEC_ALLOCATE; + allocate_else->ext.alloc.list = gfc_get_alloc (); + allocate_else->loc = c->where; + allocate_else->ext.alloc.list->expr = alloc_expr; + + allocated = gfc_build_intrinsic_call (current_ns, GFC_ISYM_ALLOCATED, + "_gfortran_allocated", c->where, + 1, gfc_copy_expr (c)); + + deallocate = XCNEW (gfc_code); + deallocate->op = EXEC_DEALLOCATE; + deallocate->ext.alloc.list = gfc_get_alloc (); + deallocate->ext.alloc.list->expr = gfc_copy_expr (c); + deallocate->next = allocate1; + deallocate->loc = c->where; + + if_size_2 = XCNEW (gfc_code); + if_size_2->op = EXEC_IF; + if_size_2->expr1 = cond; + if_size_2->loc = c->where; + if_size_2->next = deallocate; + + if_size_1 = XCNEW (gfc_code); + if_size_1->op = EXEC_IF; + if_size_1->block = if_size_2; + if_size_1->loc = c->where; + + else_alloc = XCNEW (gfc_code); + else_alloc->op = EXEC_IF; + else_alloc->loc = c->where; + else_alloc->next = allocate_else; + + if_alloc_2 = XCNEW (gfc_code); + if_alloc_2->op = EXEC_IF; + if_alloc_2->expr1 = allocated; + if_alloc_2->loc = c->where; + if_alloc_2->next = if_size_1; + if_alloc_2->block = else_alloc; + + if_alloc_1 = XCNEW (gfc_code); + if_alloc_1->op = EXEC_IF; + if_alloc_1->block = if_alloc_2; + if_alloc_1->loc = c->where; + + return if_alloc_1; +} + + +/* Callback function for has_function_or_op. */ + +static int +is_function_or_op (gfc_expr **e, int *walk_subtrees ATTRIBUTE_UNUSED, + void *data ATTRIBUTE_UNUSED) +{ + if ((*e) == 0) + return 0; + else + return (*e)->expr_type == EXPR_FUNCTION + || (*e)->expr_type == EXPR_OP; +} + +/* Returns true if the expression contains a function. */ + +static bool +has_function_or_op (gfc_expr **e) +{ + if (e == NULL) + return false; + else + return gfc_expr_walker (e, is_function_or_op, NULL); +} + +/* Freeze a single expression. */ + +static void +freeze_expr (gfc_expr **ep) +{ + gfc_expr *ne; + if (has_function_or_op (ep)) + { + ne = create_var (*ep, "freeze"); + *ep = ne; + } +} + +/* Go through an expression's references and assign them to temporary + variables if they contain functions. This is usually done prior to + front-end scalarization to avoid multiple invocations of functions. */ + +static void +freeze_references (gfc_expr *e) +{ + gfc_ref *r; + gfc_array_ref *ar; + int i; + + for (r=e->ref; r; r=r->next) + { + if (r->type == REF_SUBSTRING) + { + if (r->u.ss.start != NULL) + freeze_expr (&r->u.ss.start); + + if (r->u.ss.end != NULL) + freeze_expr (&r->u.ss.end); + } + else if (r->type == REF_ARRAY) + { + ar = &r->u.ar; + switch (ar->type) + { + case AR_FULL: + break; + + case AR_SECTION: + for (i=0; i<ar->dimen; i++) + { + if (ar->dimen_type[i] == DIMEN_RANGE) + { + freeze_expr (&ar->start[i]); + freeze_expr (&ar->end[i]); + freeze_expr (&ar->stride[i]); + } + } + default: + break; + } + } + } +} + +/* Convert to gfc_index_integer_kind if needed, just do a copy otherwise. */ + +static gfc_expr * +convert_to_index_kind (gfc_expr *e) +{ + gfc_expr *res; + + gcc_assert (e != NULL); + + res = gfc_copy_expr (e); + + gcc_assert (e->ts.type == BT_INTEGER); + + if (res->ts.kind != gfc_index_integer_kind) + { + gfc_typespec ts; + gfc_clear_ts (&ts); + ts.type = BT_INTEGER; + ts.kind = gfc_index_integer_kind; + + gfc_convert_type_warn (e, &ts, 2, 0); + } + + return res; +} + +/* Function to create a DO loop including creation of the + iteration variable. gfc_expr are copied.*/ + +static gfc_code * +create_do_loop (gfc_expr *start, gfc_expr *end, gfc_expr *step, locus *where, + gfc_namespace *ns, char *vname) +{ + + char name[GFC_MAX_SYMBOL_LEN +1]; + gfc_symtree *symtree; + gfc_symbol *symbol; + gfc_expr *i; + gfc_code *n, *n2; + + /* Create an expression for the iteration variable. */ + if (vname) + sprintf (name, "__var_%d_do_%s", var_num++, vname); + else + sprintf (name, "__var_%d_do", var_num++); + + + if (gfc_get_sym_tree (name, ns, &symtree, false) != 0) + gcc_unreachable (); + + symbol = symtree->n.sym; + symbol->ts.type = BT_INTEGER; + symbol->ts.kind = gfc_index_integer_kind; + symbol->attr.flavor = FL_VARIABLE; + symbol->attr.referenced = 1; + symbol->attr.dimension = 0; + symbol->attr.fe_temp = 1; + gfc_commit_symbol (symbol); + + i = gfc_get_expr (); + i->expr_type = EXPR_VARIABLE; + i->ts = symbol->ts; + i->rank = 0; + i->where = *where; + i->symtree = symtree; + + n = XCNEW (gfc_code); + n->op = EXEC_DO; + n->loc = *where; + n->ext.iterator = gfc_get_iterator (); + n->ext.iterator->var = i; + n->ext.iterator->start = convert_to_index_kind (start); + n->ext.iterator->end = convert_to_index_kind (end); + if (step) + n->ext.iterator->step = convert_to_index_kind (step); + else + n->ext.iterator->step = gfc_get_int_expr (gfc_index_integer_kind, + where, 1); + + n2 = XCNEW (gfc_code); + n2->op = EXEC_DO; + n2->loc = *where; + n2->next = NULL; + n->block = n2; + return n; +} + +/* Return an operation of one two gfc_expr (one if e2 is NULL. This assumes + compatible typespecs. */ + +static gfc_expr * +get_operand (gfc_intrinsic_op op, gfc_expr *e1, gfc_expr *e2) +{ + gfc_expr *res; + + res = gfc_get_expr (); + res->ts = e1->ts; + res->where = e1->where; + res->expr_type = EXPR_OP; + res->value.op.op = op; + res->value.op.op1 = e1; + res->value.op.op2 = e2; + gfc_simplify_expr (res, 0); + return res; +} + +/* Get the upper bound of the DO loops for matmul along + a dimension, i.e. size minus one. */ +static gfc_expr* +get_size_m1 (gfc_expr *e, int dimen) +{ + mpz_t size; + gfc_expr *res; + + if (gfc_array_dimen_size (e, dimen, &size)) + { + res = gfc_get_constant_expr (BT_INTEGER, + gfc_index_integer_kind, &e->where); + mpz_sub_ui (res->value.integer, size, 1); + mpz_clear (size); + } + else + { + res = get_operand (INTRINSIC_MINUS, + get_array_inq_function (e, dimen + 1, GFC_ISYM_SIZE), + gfc_get_int_expr (gfc_index_integer_kind, + &e->where, 1)); + gfc_simplify_expr (res, 0); + } + + return res; +} + +/* Function to return a scalarized expression. It is assumed that indices are + zero based to make generation of DO loops easier. A zero as index will + access the first element along a dimension. Single element references will + be skipped. A NULL as an expression will be replaced by a full reference. + This assumes that the index loops have gfc_index_integer_kind, and that all + references have been frozen. */ + +static gfc_expr* +scalarized_expr (gfc_expr *e_in, gfc_expr **index, int count_index) +{ + gfc_ref *ref; + gfc_array_ref *ar; + int i; + int rank; + gfc_expr *e; + int i_index; + bool was_fullref; + + e = gfc_copy_expr(e_in); + + rank = e->rank; + + ref = e->ref; + + while (ref) + { + if (ref->type == REF_ARRAY + && (ref->u.ar.type == AR_FULL || ref->u.ar.type == AR_SECTION)) + break; + ref = ref->next; + } + ar = &ref->u.ar; + + /* We scalarize count_index variables, reducing the rank by count_index. */ + + e->rank = rank - count_index; + + was_fullref = ar->type == AR_FULL; + + if (e->rank == 0) + ar->type = AR_ELEMENT; + else + ar->type = AR_SECTION; + + /* Loop over the indices. For each index, create the expression + index + lbound(e, dim). */ + + i_index = 0; + for (i=0; i < ar->dimen; i++) + { + if (was_fullref || ar->dimen_type[i] == DIMEN_RANGE) + { + if (index[i_index] != NULL) + { + gfc_expr *lbound, *nindex; + gfc_expr *loopvar; + + loopvar = gfc_copy_expr (index[i_index]); + + if (ar->stride[i]) + { + gfc_expr *tmp; + + tmp = gfc_copy_expr(ar->stride[i]); + if (tmp->ts.kind != gfc_index_integer_kind) + { + gfc_typespec ts; + gfc_clear_ts (&ts); + ts.type = BT_INTEGER; + ts.kind = gfc_index_integer_kind; + gfc_convert_type (tmp, &ts, 2); + } + nindex = get_operand (INTRINSIC_TIMES, loopvar, tmp); + } + else + nindex = loopvar; + + if (ar->start[i]) + { + lbound = gfc_copy_expr (ar->start[i]); + if (lbound->ts.kind != gfc_index_integer_kind) + { + gfc_typespec ts; + gfc_clear_ts (&ts); + ts.type = BT_INTEGER; + ts.kind = gfc_index_integer_kind; + gfc_convert_type (lbound, &ts, 2); + + } + } + else + lbound = get_array_inq_function (e_in, i+1, GFC_ISYM_LBOUND); + + ar->dimen_type[i] = DIMEN_ELEMENT; + ar->start[i] = get_operand (INTRINSIC_PLUS, nindex, lbound); + ar->end[i] = NULL; + ar->stride[i] = NULL; + gfc_simplify_expr (ar->start[i], 0); + } + else if (was_fullref) + { + ar->dimen_type[i] = DIMEN_RANGE; + ar->start[i] = NULL; + ar->end[i] = NULL; + ar->stride[i] = NULL; + } + i_index ++; + } + } + return e; +} + + +/* Optimize assignments of the form c = matmul(a,b). + Handle only the cases currently where b and c are rank-two arrays. + + This translates the code to + + BLOCK + integer i,j,k + c = 0 + do j=0, size(b,2)-1 + do k=0, size(a, 2)-1 + do i=0, size(a, 1)-1 + c(i * stride(c,1) + lbound(c,1), j * stride(c,2) + lbound(c,2)) = + c(i * stride(c,1) + lbound(c,1), j * stride(c,2) + lbound(c,2)) + + a(i * stride(a,1) + lbound(a,1), k * stride(a,2) + lbound(a,2)) * + b(k * stride(b,1) + lbound(b,1), j * stride(b,2) + lbound(b,2)) + end do + end do + end do + END BLOCK + +*/ + +static int +optimize_matmul_assign (gfc_code **c, + int *walk_subtrees ATTRIBUTE_UNUSED, + void *data ATTRIBUTE_UNUSED) +{ + gfc_code *co = *c; + gfc_expr *expr1, *expr2; + gfc_expr *matrix_a, *matrix_b; + gfc_actual_arglist *a, *b; + gfc_code *do_1, *do_2, *do_3, *assign_zero, *assign_matmul; + gfc_expr *zero_e; + gfc_expr *u1, *u2, *u3; + gfc_expr *list[2]; + gfc_expr *ascalar, *bscalar, *cscalar; + gfc_expr *mult; + gfc_expr *var_1, *var_2, *var_3; + gfc_expr *zero; + gfc_namespace *ns; + gfc_intrinsic_op op_times, op_plus; + enum matrix_case m_case; + gfc_code *next_code; + gfc_code *p; + int i; + + + if (co->op != EXEC_ASSIGN) + return 0; + + expr1 = co->expr1; + expr2 = co->expr2; + if (expr2->expr_type != EXPR_FUNCTION + || expr2->value.function.isym == NULL + || expr2->value.function.isym->id != GFC_ISYM_MATMUL) + return 0; + + /* Handling only two matrices of dimension two for the time being. */ + + current_code = c; + inserted_block = NULL; + changed_statement = NULL; + + a = expr2->value.function.actual; + matrix_a = a->expr; + b = a->next; + matrix_b = b->expr; + + if (matrix_a->expr_type != EXPR_VARIABLE + || matrix_b->expr_type != EXPR_VARIABLE) + return 0; + + if (matrix_a->rank == 2) + m_case = matrix_b->rank == 1 ? A2B1 : A2B2; + else + m_case = A1B2; + + if (gfc_check_dependency (expr1, matrix_a, true) + || gfc_check_dependency (expr1, matrix_b, true)) + return 0; + + ns = insert_block (); + + switch(expr1->ts.type) + { + case BT_INTEGER: + zero_e = gfc_get_int_expr (expr1->ts.kind, &expr1->where, 0); + op_times = INTRINSIC_TIMES; + op_plus = INTRINSIC_PLUS; + break; + + case BT_LOGICAL: + op_times = INTRINSIC_AND; + op_plus = INTRINSIC_OR; + zero_e = gfc_get_logical_expr (expr1->ts.kind, &expr1->where, + 0); + break; + case BT_REAL: + zero_e = gfc_get_constant_expr (BT_REAL, expr1->ts.kind, + &expr1->where); + mpfr_set_si (zero_e->value.real, 0, GFC_RND_MODE); + op_times = INTRINSIC_TIMES; + op_plus = INTRINSIC_PLUS; + break; + + case BT_COMPLEX: + zero_e = gfc_get_constant_expr (BT_COMPLEX, expr1->ts.kind, + &expr1->where); + mpc_set_si_si (zero_e->value.complex, 0, 0, GFC_RND_MODE); + op_times = INTRINSIC_TIMES; + op_plus = INTRINSIC_PLUS; + + break; + + default: + gcc_unreachable(); + } + + current_code = &ns->code; + + n_vars = 0; + freeze_references (matrix_a); + freeze_references (matrix_b); + + assign_zero = XCNEW (gfc_code); + assign_zero->op = EXEC_ASSIGN; + assign_zero->loc = co->loc; + assign_zero->expr1 = gfc_copy_expr (expr1); + assign_zero->expr2 = zero_e; + + if (flag_realloc_lhs && gfc_is_reallocatable_lhs (expr1)) + { + gfc_code *lhs_alloc; + + lhs_alloc = matmul_lhs_realloc (expr1, matrix_a, matrix_b, m_case); + lhs_alloc->next = assign_zero; + next_code = lhs_alloc; + } + else + next_code = assign_zero; + + if (n_vars == 0) + *current_code = next_code; + else + { + p = ns->code; + for (i=0; i<n_vars-1; i++) + p = p->next; + + p->next = next_code; + } + + zero = gfc_get_int_expr (gfc_index_integer_kind, &co->loc, 0); + + assign_matmul = XCNEW (gfc_code); + assign_matmul->op = EXEC_ASSIGN; + assign_matmul->loc = co->loc; + + switch (m_case) + { + case A2B2: + u1 = get_size_m1 (matrix_b, 1); + u2 = get_size_m1 (matrix_a, 1); + u3 = get_size_m1 (matrix_a, 0); + + do_1 = create_do_loop (gfc_copy_expr (zero), u1, NULL, &co->loc, ns); + do_2 = create_do_loop (gfc_copy_expr (zero), u2, NULL, &co->loc, ns); + do_3 = create_do_loop (gfc_copy_expr (zero), u3, NULL, &co->loc, ns); + + do_1->block->next = do_2; + do_2->block->next = do_3; + do_3->block->next = assign_matmul; + + var_1 = do_1->ext.iterator->var; + var_2 = do_2->ext.iterator->var; + var_3 = do_3->ext.iterator->var; + + list[0] = var_3; + list[1] = var_1; + cscalar = scalarized_expr (gfc_copy_expr (co->expr1), list, 2); + + list[0] = var_3; + list[1] = var_2; + ascalar = scalarized_expr (gfc_copy_expr (matrix_a), list, 2); + + list[0] = var_2; + list[1] = var_1; + bscalar = scalarized_expr (gfc_copy_expr (matrix_b), list, 2); + + break; + + case A2B1: + u1 = get_size_m1 (matrix_b, 0); + u2 = get_size_m1 (matrix_a, 0); + + do_1 = create_do_loop (gfc_copy_expr (zero), u1, NULL, &co->loc, ns); + do_2 = create_do_loop (gfc_copy_expr (zero), u2, NULL, &co->loc, ns); + + do_1->block->next = do_2; + do_2->block->next = assign_matmul; + + var_1 = do_1->ext.iterator->var; + var_2 = do_2->ext.iterator->var; + + list[0] = var_2; + cscalar = scalarized_expr (gfc_copy_expr (co->expr1), list, 1); + + list[0] = var_2; + list[1] = var_1; + ascalar = scalarized_expr (gfc_copy_expr (matrix_a), list, 2); + + list[0] = var_1; + bscalar = scalarized_expr (gfc_copy_expr (matrix_b), list, 1); + + break; + + case A1B2: + u1 = get_size_m1 (matrix_b, 1); + u2 = get_size_m1 (matrix_a, 0); + + do_1 = create_do_loop (gfc_copy_expr (zero), u1, NULL, &co->loc, ns); + do_2 = create_do_loop (gfc_copy_expr (zero), u2, NULL, &co->loc, ns); + + do_1->block->next = do_2; + do_2->block->next = assign_matmul; + + var_1 = do_1->ext.iterator->var; + var_2 = do_2->ext.iterator->var; + + list[0] = var_1; + cscalar = scalarized_expr (gfc_copy_expr (co->expr1), list, 1); + + list[0] = var_2; + ascalar = scalarized_expr (gfc_copy_expr (matrix_a), list, 1); + + list[0] = var_2; + list[1] = var_1; + bscalar = scalarized_expr (gfc_copy_expr (matrix_b), list, 2); + + break; + + default: + gcc_unreachable(); + } + + assign_zero->next = do_1; + + + assign_matmul->expr1 = gfc_copy_expr (cscalar); + + mult = get_operand (op_times, ascalar, bscalar); + assign_matmul->expr2 = get_operand (op_plus, cscalar, mult); + + co->next = NULL; + gfc_free_statements(co); + gfc_free_expr (zero); + return 0; +} + #define WALK_SUBEXPR(NODE) \ do \ { \ Index: fortran/gfortran.h =================================================================== --- fortran/gfortran.h (Revision 221944) +++ fortran/gfortran.h (Arbeitskopie) @@ -3225,4 +3225,8 @@ int gfc_code_walker (gfc_code **, walk_code_fn_t, void gfc_convert_mpz_to_signed (mpz_t, int); +/* trans-array.c */ + +bool gfc_is_reallocatable_lhs (gfc_expr *); + #endif /* GCC_GFORTRAN_H */ Index: fortran/lang.opt =================================================================== --- fortran/lang.opt (Revision 221944) +++ fortran/lang.opt (Arbeitskopie) @@ -542,6 +542,10 @@ Enum(gfc_init_local_real) String(inf) Value(GFC_IN EnumValue Enum(gfc_init_local_real) String(-inf) Value(GFC_INIT_REAL_NEG_INF) +finline-matmul-limit= +Fortran RejectNegative Joined UInteger Var(flag_inline_matmul_limit) Init(-1) +-finline-matmul-limit=<n> Specify the size of the largest matrix for which matmul will be inlined + fmax-array-constructor= Fortran RejectNegative Joined UInteger Var(flag_max_array_constructor) Init(65535) -fmax-array-constructor=<n> Maximum number of objects in an array constructor Index: fortran/options.c =================================================================== --- fortran/options.c (Revision 221944) +++ fortran/options.c (Arbeitskopie) @@ -378,6 +378,11 @@ gfc_post_options (const char **pfilename) if (!flag_automatic) flag_max_stack_var_size = 0; + /* If we call BLAS directly, only inline up to the BLAS limit. */ + + if (flag_external_blas && flag_inline_matmul_limit < 0) + flag_inline_matmul_limit = flag_blas_matmul_limit; + /* Optimization implies front end optimization, unless the user specified it directly. */ Index: fortran/simplify.c =================================================================== --- fortran/simplify.c (Revision 221944) +++ fortran/simplify.c (Arbeitskopie) @@ -3445,6 +3445,32 @@ simplify_bound (gfc_expr *array, gfc_expr *dim, gf done: + + if (!upper && as && as->type == AS_ASSUMED_SHAPE && dim + && dim->expr_type == EXPR_CONSTANT && ref->u.ar.type != AR_SECTION) + { + if (!(array->symtree && array->symtree->n.sym + && (array->symtree->n.sym->attr.allocatable + || array->symtree->n.sym->attr.pointer))) + { + unsigned long int ndim; + gfc_expr *lower, *res; + + ndim = mpz_get_si (dim->value.integer) - 1; + lower = as->lower[ndim]; + if (lower->expr_type == EXPR_CONSTANT) + { + res = gfc_copy_expr (lower); + if (kind) + { + int nkind = mpz_get_si (kind->value.integer); + res->ts.kind = nkind; + } + return res; + } + } + } + if (as && (as->type == AS_DEFERRED || as->type == AS_ASSUMED_SHAPE || as->type == AS_ASSUMED_RANK)) return NULL; Index: fortran/trans-array.h =================================================================== --- fortran/trans-array.h (Revision 221944) +++ fortran/trans-array.h (Arbeitskopie) @@ -64,8 +64,6 @@ tree gfc_copy_only_alloc_comp (gfc_symbol *, tree, tree gfc_alloc_allocatable_for_assignment (gfc_loopinfo*, gfc_expr*, gfc_expr*); -bool gfc_is_reallocatable_lhs (gfc_expr *); - /* Add initialization for deferred arrays. */ void gfc_trans_deferred_array (gfc_symbol *, gfc_wrapped_block *); /* Generate an initializer for a static pointer or allocatable array. */ Index: testsuite/gfortran.dg/function_optimize_1.f90 =================================================================== --- testsuite/gfortran.dg/function_optimize_1.f90 (Revision 221944) +++ testsuite/gfortran.dg/function_optimize_1.f90 (Arbeitskopie) @@ -1,5 +1,5 @@ ! { dg-do compile } -! { dg-options "-O -fdump-tree-original -Warray-temporaries" } +! { dg-options "-O -fdump-tree-original -finline-matmul-limit=0 -Warray-temporaries" } program main implicit none real, dimension(2,2) :: a, b, c, d Index: testsuite/gfortran.dg/function_optimize_2.f90 =================================================================== --- testsuite/gfortran.dg/function_optimize_2.f90 (Revision 221944) +++ testsuite/gfortran.dg/function_optimize_2.f90 (Arbeitskopie) @@ -1,5 +1,5 @@ ! { dg-do compile } -! { dg-options "-O -faggressive-function-elimination -fdump-tree-original" } +! { dg-options "-O -finline-matmul-limit=0 -faggressive-function-elimination -fdump-tree-original" } program main implicit none real, dimension(2,2) :: a, b, c, d Index: testsuite/gfortran.dg/function_optimize_5.f90 =================================================================== --- testsuite/gfortran.dg/function_optimize_5.f90 (Revision 221944) +++ testsuite/gfortran.dg/function_optimize_5.f90 (Arbeitskopie) @@ -1,5 +1,5 @@ ! { dg-do compile } -! { dg-options "-ffrontend-optimize -Wfunction-elimination" } +! { dg-options "-ffrontend-optimize -finline-matmul-limit=0 -Wfunction-elimination" } ! Check the -ffrontend-optimize (in the absence of -O) and ! -Wfunction-elimination options. program main Index: testsuite/gfortran.dg/function_optimize_7.f90 =================================================================== --- testsuite/gfortran.dg/function_optimize_7.f90 (Revision 221944) +++ testsuite/gfortran.dg/function_optimize_7.f90 (Arbeitskopie) @@ -1,5 +1,5 @@ ! { dg-do compile } -! { dg-options "-O -fdump-tree-original -Warray-temporaries" } +! { dg-options "-O -fdump-tree-original -Warray-temporaries -finline-matmul-limit=0" } subroutine xx(n, m, a, b, c, d, x, z, i, s_in, s_out) implicit none integer, intent(in) :: n, m