Hi, The attached patch lowers zeroing array assignment to memset for allocatable arrays.
For example: subroutine test(z, n) implicit none integer :: n real(4), allocatable :: z(:,:,:) allocate(z(n, 8192, 2048)) z = 0 end subroutine results in following call to memset instead of 3 nested loops for z = 0: (void) __builtin_memset ((void *) z->data, 0, (unsigned long) ((((MAX_EXPR <z->dim[0].ubound - z->dim[0].lbound, -1> + 1) * (MAX_EXPR <z->dim[1].ubound - z->dim[1].lbound, -1> + 1)) * (MAX_EXPR <z->dim[2].ubound - z->dim[2].lbound, -1> + 1)) * 4)); The patch significantly improves speedup for an internal Fortran application on AArch64 -mcpu=grace (and potentially on other AArch64 cores too). Bootstrapped+tested on aarch64-linux-gnu. Does the patch look OK to commit ? Signed-off-by: Prathamesh Kulkarni <prathame...@nvidia.com> Thanks, Prathamesh
Lower zeroing array assignment to memset for allocatable arrays. gcc/fortran/ChangeLog: * trans-expr.cc (gfc_trans_zero_assign): Handle allocatable arrays. gcc/testsuite/ChangeLog: * gfortran.dg/array_memset_3.f90: New test. Signed-off-by: Prathamesh Kulkarni <prathame...@nvidia.com> diff --git a/gcc/fortran/trans-expr.cc b/gcc/fortran/trans-expr.cc index 605434f4ddb..7773a24f9d4 100644 --- a/gcc/fortran/trans-expr.cc +++ b/gcc/fortran/trans-expr.cc @@ -11421,18 +11421,23 @@ gfc_trans_zero_assign (gfc_expr * expr) type = TREE_TYPE (dest); if (POINTER_TYPE_P (type)) type = TREE_TYPE (type); - if (!GFC_ARRAY_TYPE_P (type)) - return NULL_TREE; - - /* Determine the length of the array. */ - len = GFC_TYPE_ARRAY_SIZE (type); - if (!len || TREE_CODE (len) != INTEGER_CST) + if (GFC_ARRAY_TYPE_P (type)) + { + /* Determine the length of the array. */ + len = GFC_TYPE_ARRAY_SIZE (type); + if (!len || TREE_CODE (len) != INTEGER_CST) + return NULL_TREE; + } + else if (GFC_DESCRIPTOR_TYPE_P (type)) + { + if (POINTER_TYPE_P (TREE_TYPE (dest))) + dest = build_fold_indirect_ref_loc (input_location, dest); + len = gfc_conv_descriptor_size (dest, GFC_TYPE_ARRAY_RANK (type)); + dest = gfc_conv_descriptor_data_get (dest); + } + else return NULL_TREE; - tmp = TYPE_SIZE_UNIT (gfc_get_element_type (type)); - len = fold_build2_loc (input_location, MULT_EXPR, gfc_array_index_type, len, - fold_convert (gfc_array_index_type, tmp)); - /* If we are zeroing a local array avoid taking its address by emitting a = {} instead. */ if (!POINTER_TYPE_P (TREE_TYPE (dest))) @@ -11440,6 +11445,11 @@ gfc_trans_zero_assign (gfc_expr * expr) dest, build_constructor (TREE_TYPE (dest), NULL)); + /* Multiply len by element size. */ + tmp = TYPE_SIZE_UNIT (gfc_get_element_type (type)); + len = fold_build2_loc (input_location, MULT_EXPR, gfc_array_index_type, + len, fold_convert (gfc_array_index_type, tmp)); + /* Convert arguments to the correct types. */ dest = fold_convert (pvoid_type_node, dest); len = fold_convert (size_type_node, len); diff --git a/gcc/testsuite/gfortran.dg/array_memset_3.f90 b/gcc/testsuite/gfortran.dg/array_memset_3.f90 new file mode 100644 index 00000000000..b750c8de67d --- /dev/null +++ b/gcc/testsuite/gfortran.dg/array_memset_3.f90 @@ -0,0 +1,31 @@ +! { dg-do compile } +! { dg-options "-O2 -fdump-tree-original" } + +subroutine test1(n) + implicit none + integer(8) :: n + real(4), allocatable :: z(:,:,:) + + allocate(z(n, 100, 200)) + z = 0 +end subroutine + +subroutine test2(n) + implicit none + integer(8) :: n + integer, allocatable :: z(:,:,:) + + allocate(z(n, 100, 200)) + z = 0 +end subroutine + +subroutine test3(n) + implicit none + integer(8) :: n + logical, allocatable :: z(:,:,:) + + allocate(z(n, 100, 200)) + z = .false. +end subroutine + +! { dg-final { scan-tree-dump-times "__builtin_memset" 3 "original" } }