On Fri, 11 Dec 2015, Jan Hubicka wrote:

> Hi,
> this patch further reduces memory use and time of the WPA stage, especially 
> without -g
>  phase opt and generate  :  75.66 (39%) usr   1.78 (14%) sys  77.44 (37%) 
> wall  855644 kB (21%) ggc
>  phase stream in         :  34.62 (18%) usr   1.95 (16%) sys  36.57 (18%) 
> wall 3245604 kB (79%) ggc
>  phase stream out        :  81.89 (42%) usr   8.49 (69%) sys  90.37 (44%) 
> wall      50 kB ( 0%) ggc
>  ipa dead code removal   :   4.33 ( 2%) usr   0.06 ( 0%) sys   4.24 ( 2%) 
> wall       0 kB ( 0%) ggc
>  ipa virtual call target :  25.15 (13%) usr   0.14 ( 1%) sys  25.42 (12%) 
> wall       0 kB ( 0%) ggc
>  ipa cp                  :   3.92 ( 2%) usr   0.21 ( 2%) sys   4.18 ( 2%) 
> wall  340698 kB ( 8%) ggc
>  ipa inlining heuristics :  24.12 (12%) usr   0.38 ( 3%) sys  24.37 (12%) 
> wall  500427 kB (12%) ggc
>  lto stream inflate      :   7.07 ( 4%) usr   0.38 ( 3%) sys   7.33 ( 4%) 
> wall       0 kB ( 0%) ggc
>  ipa lto gimple in       :   1.95 ( 1%) usr   0.61 ( 5%) sys   2.42 ( 1%) 
> wall  324875 kB ( 8%) ggc
>  ipa lto gimple out      :   9.16 ( 5%) usr   1.64 (13%) sys  10.49 ( 5%) 
> wall      50 kB ( 0%) ggc
>  ipa lto decl in         :  21.25 (11%) usr   1.01 ( 8%) sys  22.37 (11%) 
> wall 2348869 kB (57%) ggc
>  ipa lto decl out        :  67.33 (34%) usr   1.66 (13%) sys  68.96 (33%) 
> wall       0 kB ( 0%) ggc
>  ipa lto constructors out:   1.39 ( 1%) usr   0.38 ( 3%) sys   2.18 ( 1%) 
> wall       0 kB ( 0%) ggc
>  ipa lto decl merge      :   2.12 ( 2%) usr   0.00 ( 0%) sys   2.12 ( 2%) 
> wall   13737 kB ( 0%) ggc
>  ipa reference           :   2.14 ( 2%) usr   0.00 ( 0%) sys   2.13 ( 2%) 
> wall       0 kB ( 0%) ggc
>  ipa pure const          :   2.29 ( 2%) usr   0.01 ( 0%) sys   2.35 ( 2%) 
> wall       0 kB ( 0%) ggc
>  ipa icf                 :   9.02 ( 7%) usr   0.18 ( 2%) sys   9.72 ( 7%) 
> wall   19203 kB ( 0%) ggc
>  TOTAL                 : 195.27            12.37           207.64            
> 4103297 kB
> 
> to:
> 
>  phase setup             :   0.00 ( 0%) usr   0.00 ( 0%) sys   0.01 ( 0%) 
> wall    1996 kB ( 0%) ggc
>  phase opt and generate  :  77.17 (53%) usr   1.69 ( 9%) sys  79.45 (48%) 
> wall  856874 kB (26%) ggc
>  phase stream in         :  25.92 (18%) usr   1.75 (10%) sys  27.66 (17%) 
> wall 2418654 kB (74%) ggc
>  phase stream out        :  39.90 (27%) usr  14.74 (81%) sys  54.82 (33%) 
> wall      50 kB ( 0%) ggc
>  phase finalize          :   2.52 ( 2%) usr   0.11 ( 1%) sys   2.63 ( 2%) 
> wall       0 kB ( 0%) ggc
>  garbage collection      :   4.56 ( 3%) usr   0.01 ( 0%) sys   4.56 ( 3%) 
> wall       0 kB ( 0%) ggc
>  ipa dead code removal   :   4.32 ( 3%) usr   0.03 ( 0%) sys   4.59 ( 3%) 
> wall       2 kB ( 0%) ggc
>  ipa virtual call target :  23.19 (16%) usr   0.18 ( 1%) sys  23.31 (14%) 
> wall       0 kB ( 0%) ggc
>  ipa cp                  :   4.06 ( 3%) usr   0.18 ( 1%) sys   4.10 ( 2%) 
> wall  339974 kB (10%) ggc
>  ipa inlining heuristics :  25.05 (17%) usr   0.32 ( 2%) sys  25.86 (16%) 
> wall  500986 kB (15%) ggc
>  lto stream inflate      :   5.50 ( 4%) usr   0.42 ( 2%) sys   5.73 ( 3%) 
> wall       0 kB ( 0%) ggc
>  ipa lto gimple in       :   1.97 ( 1%) usr   0.51 ( 3%) sys   2.70 ( 2%) 
> wall  324937 kB (10%) ggc
>  ipa lto gimple out      :   9.00 ( 6%) usr   1.59 ( 9%) sys  10.22 ( 6%) 
> wall      50 kB ( 0%) ggc
>  ipa lto decl in         :  14.29 (10%) usr   0.73 ( 4%) sys  15.18 ( 9%) 
> wall 1522854 kB (46%) ggc
>  ipa lto decl out        :  25.35 (17%) usr   0.59 ( 3%) sys  25.91 (16%) 
> wall       0 kB ( 0%) ggc
>  ipa lto constructors out:   1.48 ( 1%) usr   0.51 ( 3%) sys   2.38 ( 1%) 
> wall       0 kB ( 0%) ggc
>  ipa lto cgraph I/O      :   0.74 ( 1%) usr   0.22 ( 1%) sys   0.97 ( 1%) 
> wall  408576 kB (12%) ggc
>  ipa lto decl merge      :   1.94 ( 1%) usr   0.00 ( 0%) sys   1.95 ( 1%) 
> wall   13556 kB ( 0%) ggc
>  whopr wpa I/O           :   2.95 ( 2%) usr  12.03 (66%) sys  15.17 ( 9%) 
> wall       0 kB ( 0%) ggc
>  whopr partitioning      :   3.99 ( 3%) usr   0.03 ( 0%) sys   4.01 ( 2%) 
> wall   13619 kB ( 0%) ggc
>  ipa reference           :   2.45 ( 2%) usr   0.01 ( 0%) sys   2.46 ( 1%) 
> wall       0 kB ( 0%) ggc
>  ipa pure const          :   2.30 ( 2%) usr   0.03 ( 0%) sys   2.33 ( 1%) 
> wall       0 kB ( 0%) ggc
>  ipa icf                 :   8.30 ( 6%) usr   0.26 ( 1%) sys   8.37 ( 5%) 
> wall   19276 kB ( 1%) ggc
>  TOTAL                 : 145.51            18.29           164.57            
> 3277576 kB
> 
> With debug output the numbers are not that impressive, but still about 17% 
> down from decl in.
> It also leads to about 63% code size reduction for global decl streams.
> 
> I built WPA with -flto-partition=max and looked into one of the partitions that 
> seemed most absurd.
> We used about 180k type decls to produce about 700 lines of assembler that 
> mostly contained
> calls to various methods. The thing is that each method brought in a lot of 
> declarations,
> so I looked into why and noticed that TYPE_FIELDS contains TYPE_DECLS that 
> are mostly ignored
> by the back-end except for dwarf2out, and dwarf2out actually ignores a good 
> portion of them, too.
> 
> I thus made a predicate to tell what decls are going to be useful for 
> dwarf2out and removed
> the rest in free_lang_data.  Clearly with early debug, we will be able to remove 
> them all.
> 
> Honza
> 
> 
>       * tree.c (free_lang_data_in_type): Skip irrelevant typedecls.
>       (find_decls_types_r): Likewise.
>       * tree.h (type_decl_relevant_for_debug_p): Declare.
>       * dwarf2out.c (type_decl_relevant_for_debug_p): New function.
> Index: tree.c
> ===================================================================
> --- tree.c    (revision 231546)
> +++ tree.c    (working copy)
> @@ -5191,7 +5191,8 @@ free_lang_data_in_type (tree type)
>        while (member)
>       {
>         if (TREE_CODE (member) == FIELD_DECL
> -           || TREE_CODE (member) == TYPE_DECL)
> +           || (TREE_CODE (member) == TYPE_DECL
> +               && type_decl_relevant_for_debug_p (member)))
>           {
>             if (prev)
>               TREE_CHAIN (prev) = member;
> @@ -5666,7 +5667,8 @@ find_decls_types_r (tree *tp, int *ws, v
>         while (tem)
>           {
>             if (TREE_CODE (tem) == FIELD_DECL
> -               || TREE_CODE (tem) == TYPE_DECL)
> +               || (TREE_CODE (tem) == TYPE_DECL
> +                   && type_decl_relevant_for_debug_p (tem)))
>               fld_worklist_push (tem, fld);
>             tem = TREE_CHAIN (tem);
>           }
> Index: tree.h
> ===================================================================
> --- tree.h    (revision 231546)
> +++ tree.h    (working copy)
> @@ -5417,4 +5417,6 @@ desired_pro_or_demotion_p (const_tree to
>    return to_type_precision <= TYPE_PRECISION (from_type);
>  }
>  
> +extern bool type_decl_relevant_for_debug_p (const_tree);
> +
>  #endif  /* GCC_TREE_H  */
> Index: dwarf2out.c
> ===================================================================
> --- dwarf2out.c       (revision 231546)
> +++ dwarf2out.c       (working copy)
> @@ -21134,6 +21134,15 @@ is_redundant_typedef (const_tree decl)
>    return 0;
>  }
>  
> +/* Return true if DECL is going to be useful for debug output.  */
> +bool
> +type_decl_relevant_for_debug_p (const_tree decl)
> +{
> +  if (debug_info_level <= DINFO_LEVEL_TERSE)
> +    return false;

We explicitly do not use debug-info-level tests in free-lang-data
to allow mixing -g and -g0 objects.  Are you sure doing the above
doesn't mess up tree merging enough to effectively enlarge WPA
memory use and the merged decl sections?

[I'm quite sure firefox build system manages to mess up -g vs. -g0
in some places ;)]

> +  return (!DECL_IGNORED_P (decl) && !is_redundant_typedef (decl));
> +}
> +

The patch would be ok if you simply export is_redundant_typedef
and inline the DECL_IGNORED_P check into free-lang-data.

Thanks,
Richard.

Reply via email to