Hello Nathan,
Would it be possible to do the sort "in place", without the extra vmalloc'ed buffer? I mean, would reordering the relocation table itself upset any other part of the kernel? Something along the lines of the sketch below is roughly what I have in mind.
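Completely untested, 32-bit flavour only, and the names (rela_cmp, count_relocs_inplace) are made up, so treat it as a sketch of the question rather than a proposal: sort the Elf32_Rela table itself and count neighbours that differ, at the price of dropping the const from the rela parameter and reordering the entries in the section image the loader is working on -- which is exactly the part I'm unsure about:

static int rela_cmp(const void *_a, const void *_b)
{
	const Elf32_Rela *a = _a, *b = _b;

	if (a->r_info != b->r_info)
		return a->r_info < b->r_info ? -1 : 1;
	if (a->r_addend != b->r_addend)
		return a->r_addend < b->r_addend ? -1 : 1;
	return 0;
}

/* Count unique (r_info, r_addend) pairs without a temporary buffer. */
static unsigned int count_relocs_inplace(Elf32_Rela *rela, unsigned int num)
{
	unsigned int i, ret;

	if (num == 0)
		return 0;

	/* This reorders the relocation entries in the section itself. */
	sort(rela, num, sizeof(rela[0]), rela_cmp, NULL);

	ret = 1;
	for (i = 1; i < num; i++)
		if (rela[i].r_info != rela[i - 1].r_info ||
		    rela[i].r_addend != rela[i - 1].r_addend)
			ret++;
	return ret;
}

If reordering the table is not allowed (say something later walks it and relies on the original order), then your vmalloc'ed pointer array is clearly the way to go; the in-place variant would only save the allocation and the -ENOMEM handling that comes with it.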
Thanks,
Emil.


> -----Original Message-----
> From: Nathan Lynch [mailto:[EMAIL PROTECTED]
> Sent: Monday, November 05, 2007 8:34 PM
> To: Medve Emilian
> Cc: linuxppc-dev@ozlabs.org
> Subject: [RFC/PATCH] reduce load time for modules with lots of relocs
>
> Nathan Lynch wrote:
> > Medve Emilian-EMMEDVE1 wrote:
> > >
> > > I have a module with 36K relocation entries (I know, I know, how could
> > > that be...) and the count_relocs() function takes ~17 seconds to crunch
> > > through the relocation table from the .rela.text section. I don't think
> > > I can reduce the number of entries in the relocation table (can I?) so
> > > I'm thinking of improving the performance of count_relocs() (currently
> > > O(n^2)). Does anybody have a simpler idea? Does anyone have any
> > > constructive suggestion on how to improve the situation?
> >
> > This seems to come up every few months. There was a patch submitted
> > here:
> >
> > http://ozlabs.org/pipermail/linuxppc-dev/2007-June/037641.html
>
> I think this comes up often enough for us to fix it (IIRC unionfs
> people complained about it long ago too), and it's kind of lame to
> spend 17 seconds in the kernel without scheduling. So I dug up some
> old patches that should reduce the complexity to O(n log n), using the
> sort() function, which IMO is preferable to doing our own hash thing
> as that patch from June does. Only the 64-bit code is tested. Does
> this help your situation?
>
>  arch/powerpc/kernel/module_32.c |   85 +++++++++++++++++++++++++++++++---------
>  arch/powerpc/kernel/module_64.c |   80 ++++++++++++++++++++++++++++++-------
>  2 files changed, 132 insertions(+), 33 deletions(-)
>
>
> diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c
> index 07a89a3..001b941 100644
> --- a/arch/powerpc/kernel/module_32.c
> +++ b/arch/powerpc/kernel/module_32.c
> @@ -24,6 +24,7 @@
>  #include <linux/kernel.h>
>  #include <linux/cache.h>
>  #include <linux/bug.h>
> +#include <linux/sort.h>
>
>  #include "setup.h"
>
> @@ -50,26 +51,64 @@ void module_free(struct module *mod, void *module_region)
>             table entries. */
>  }
>
> +static int reloc_cmp(const void *_a, const void *_b)
> +{
> +	const Elf32_Rela *a = *(Elf32_Rela **)_a, *b = *(Elf32_Rela **)_b;
> +
> +	if (a->r_info < b->r_info)
> +		return -1;
> +	else if (a->r_info > b->r_info)
> +		return 1;
> +	else if (a->r_addend < b->r_addend)
> +		return -1;
> +	else if (a->r_addend > b->r_addend)
> +		return 1;
> +
> +	return 0;
> +}
> +
> +
>  /* Count how many different relocations (different symbol, different
>     addend) */
>  static unsigned int count_relocs(const Elf32_Rela *rela, unsigned int num)
>  {
> -	unsigned int i, j, ret = 0;
> +	unsigned int i, sorted_count = 0;
> +	Elf32_Word last_info;
> +	Elf32_Sword last_addend;
> +	Elf32_Rela **sortbuf = NULL;
> +
> +	if (num == 0)
> +		return 0;
> +
> +	sortbuf = vmalloc(num * sizeof(*sortbuf));
> +
> +	if (!sortbuf)
> +		return -ENOMEM;
> +
> +	for (i = 0; i < num; i++)
> +		sortbuf[i] = (Elf32_Rela *)&rela[i];
> +
> +	sort(sortbuf, i, sizeof(sortbuf[0]), reloc_cmp, NULL);
> +
> +	last_info = sortbuf[0]->r_info;
> +	last_addend = sortbuf[0]->r_addend;
> +	sorted_count = 1;
>
> -	/* Sure, this is order(n^2), but it's usually short, and not
> -	   time critical */
>  	for (i = 0; i < num; i++) {
> -		for (j = 0; j < i; j++) {
> -			/* If this addend appeared before, it's
> -			   already been counted */
> -			if (ELF32_R_SYM(rela[i].r_info)
> -			    == ELF32_R_SYM(rela[j].r_info)
> -			    && rela[i].r_addend == rela[j].r_addend)
> -				break;
> -		}
> -		if (j == i) ret++;
> +		/* If this r_info,r_addend tuple matches the previous
> +		 * entry, don't count it again
> +		 */
> +		if (sortbuf[i]->r_info == last_info &&
> +		    sortbuf[i]->r_addend == last_addend)
> +			continue;
> +
> +		last_info = sortbuf[i]->r_info;
> +		last_addend = sortbuf[i]->r_addend;
> +		sorted_count++;
>  	}
> -	return ret;
> +
> +	vfree(sortbuf);
> +	return sorted_count;
>  }
>
>  /* Get the potential trampolines size required of the init and
> @@ -96,15 +135,19 @@ static unsigned long get_plt_size(const Elf32_Ehdr *hdr,
>  			continue;
>
>  		if (sechdrs[i].sh_type == SHT_RELA) {
> +			int count;
>  			DEBUGP("Found relocations in section %u\n", i);
>  			DEBUGP("Ptr: %p.  Number: %u\n",
>  			       (void *)hdr + sechdrs[i].sh_offset,
>  			       sechdrs[i].sh_size / sizeof(Elf32_Rela));
> -			ret += count_relocs((void *)hdr
> +			count = count_relocs((void *)hdr
>  					     + sechdrs[i].sh_offset,
>  					     sechdrs[i].sh_size
>  					     / sizeof(Elf32_Rela))
>  				* sizeof(struct ppc_plt_entry);
> +			if (count < 0)
> +				return count;
> +			ret += count;
>  		}
>  	}
>
> @@ -117,6 +160,7 @@ int module_frob_arch_sections(Elf32_Ehdr *hdr,
>  			      struct module *me)
>  {
>  	unsigned int i;
> +	int ret;
>
>  	/* Find .plt and .init.plt sections */
>  	for (i = 0; i < hdr->e_shnum; i++) {
> @@ -131,10 +175,15 @@ int module_frob_arch_sections(Elf32_Ehdr *hdr,
>  	}
>
>  	/* Override their sizes */
> -	sechdrs[me->arch.core_plt_section].sh_size
> -		= get_plt_size(hdr, sechdrs, secstrings, 0);
> -	sechdrs[me->arch.init_plt_section].sh_size
> -		= get_plt_size(hdr, sechdrs, secstrings, 1);
> +	ret = get_plt_size(hdr, sechdrs, secstrings, 0);
> +	if (ret < 0)
> +		return ret;
> +	sechdrs[me->arch.core_plt_section].sh_size = ret;
> +
> +	ret = get_plt_size(hdr, sechdrs, secstrings, 1);
> +	if (ret < 0)
> +		return ret;
> +	sechdrs[me->arch.init_plt_section].sh_size = ret;
>  	return 0;
>  }
>
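I can give the 32-bit side a spin on my module with the 36K entries and report what happens to the load time. Just to convince myself that "sort, then skip entries equal to their predecessor" counts exactly the same set of (r_info, r_addend) pairs as a brute-force first-occurrence scan, I hacked up a quick userspace test -- purely illustrative, with qsort() standing in for the kernel's sort() and a made-up struct fake_rela standing in for Elf32_Rela:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct fake_rela {
	unsigned long r_info;
	long r_addend;
};

static int cmp(const void *_a, const void *_b)
{
	const struct fake_rela *a = _a, *b = _b;

	if (a->r_info != b->r_info)
		return a->r_info < b->r_info ? -1 : 1;
	if (a->r_addend != b->r_addend)
		return a->r_addend < b->r_addend ? -1 : 1;
	return 0;
}

/* Brute force: count each pair the first time it is seen, O(n^2). */
static unsigned int count_slow(const struct fake_rela *r, unsigned int num)
{
	unsigned int i, j, ret = 0;

	for (i = 0; i < num; i++) {
		for (j = 0; j < i; j++)
			if (r[i].r_info == r[j].r_info &&
			    r[i].r_addend == r[j].r_addend)
				break;
		if (j == i)
			ret++;
	}
	return ret;
}

/* Proposed way: sort a copy, count entries that differ from their neighbour. */
static unsigned int count_sorted(const struct fake_rela *r, unsigned int num)
{
	struct fake_rela *buf;
	unsigned int i, ret;

	if (num == 0)
		return 0;
	buf = malloc(num * sizeof(*buf));
	memcpy(buf, r, num * sizeof(*buf));
	qsort(buf, num, sizeof(*buf), cmp);

	ret = 1;
	for (i = 1; i < num; i++)
		if (cmp(&buf[i], &buf[i - 1]) != 0)
			ret++;
	free(buf);
	return ret;
}

int main(void)
{
	static struct fake_rela r[10000];
	unsigned int i;

	srand(1);
	for (i = 0; i < 10000; i++) {
		r[i].r_info = rand() % 50;	/* few symbols, many duplicates */
		r[i].r_addend = rand() % 4;
	}
	printf("slow=%u sorted=%u\n",
	       count_slow(r, 10000), count_sorted(r, 10000));
	return 0;
}

The two have to agree, since they both just count distinct (r_info, r_addend) pairs; the win is only in how the duplicates are found, O(n log n) instead of O(n^2).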
> diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
> index 75c7c4f..bfc5b98 100644
> --- a/arch/powerpc/kernel/module_64.c
> +++ b/arch/powerpc/kernel/module_64.c
> @@ -21,6 +21,7 @@
>  #include <linux/err.h>
>  #include <linux/vmalloc.h>
>  #include <linux/bug.h>
> +#include <linux/sort.h>
>  #include <asm/module.h>
>  #include <asm/uaccess.h>
>  #include <asm/firmware.h>
> @@ -77,28 +78,68 @@ static struct ppc64_stub_entry ppc64_stub =
>  	0x4e, 0x80, 0x04, 0x20	/* bctr */
>  } };
>
> +static int reloc_cmp(const void *_a, const void *_b)
> +{
> +	const Elf64_Rela *a = *(Elf64_Rela **)_a, *b = *(Elf64_Rela **)_b;
> +
> +	if (a->r_info < b->r_info)
> +		return -1;
> +	else if (a->r_info > b->r_info)
> +		return 1;
> +	else if (a->r_addend < b->r_addend)
> +		return -1;
> +	else if (a->r_addend > b->r_addend)
> +		return 1;
> +
> +	return 0;
> +}
> +
>  /* Count how many different 24-bit relocations (different symbol,
>     different addend) */
> -static unsigned int count_relocs(const Elf64_Rela *rela, unsigned int num)
> +static int count_relocs(const Elf64_Rela *rela, unsigned int num)
>  {
>  	unsigned int i, j, ret = 0;
> +	Elf64_Xword last_info;
> +	Elf64_Sxword last_addend;
> +	Elf64_Rela **sortbuf = NULL;
> +
> +	if (num == 0)
> +		return 0;
> +
> +	sortbuf = vmalloc(num * sizeof(*sortbuf));
>
> -	/* FIXME: Only count external ones --RR */
> -	/* Sure, this is order(n^2), but it's usually short, and not
> -	   time critical */
> -	for (i = 0; i < num; i++) {
> +	if (!sortbuf)
> +		return -ENOMEM;
> +
> +	for (j = 0, i = 0; i < num; i++) {
>  		/* Only count 24-bit relocs, others don't need stubs */
>  		if (ELF64_R_TYPE(rela[i].r_info) != R_PPC_REL24)
>  			continue;
> -		for (j = 0; j < i; j++) {
> -			/* If this addend appeared before, it's
> -			   already been counted */
> -			if (rela[i].r_info == rela[j].r_info
> -			    && rela[i].r_addend == rela[j].r_addend)
> -				break;
> -		}
> -		if (j == i) ret++;
> +		sortbuf[j++] = (Elf64_Rela *)&rela[i];
>  	}
> +
> +	if (j == 0)
> +		goto out;
> +
> +	sort(sortbuf, j, sizeof(sortbuf[0]), reloc_cmp, NULL);
> +
> +	last_info = sortbuf[0]->r_info;
> +	last_addend = sortbuf[0]->r_addend;
> +	ret = 1;
> +	for (i = 0; i < j; i++) {
> +		/* If this r_info,r_addend tuple matches the previous
> +		 * entry, don't count it again
> +		 */
> +		if (sortbuf[i]->r_info == last_info &&
> +		    sortbuf[i]->r_addend == last_addend)
> +			continue;
> +
> +		last_info = sortbuf[i]->r_info;
> +		last_addend = sortbuf[i]->r_addend;
> +		ret++;
> +	}
> +out:
> +	vfree(sortbuf);
>  	return ret;
>  }
>
> @@ -129,13 +170,17 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr,
>  	/* Every relocated section... */
>  	for (i = 1; i < hdr->e_shnum; i++) {
>  		if (sechdrs[i].sh_type == SHT_RELA) {
> +			int count;
>  			DEBUGP("Found relocations in section %u\n", i);
>  			DEBUGP("Ptr: %p.  Number: %lu\n",
>  			       (void *)sechdrs[i].sh_addr,
>  			       sechdrs[i].sh_size / sizeof(Elf64_Rela));
> -			relocs += count_relocs((void *)sechdrs[i].sh_addr,
> +			count = count_relocs((void *)sechdrs[i].sh_addr,
>  					       sechdrs[i].sh_size
>  					       / sizeof(Elf64_Rela));
> +			if (count < 0)
> +				return count;
> +			relocs += count;
>  		}
>  	}
>
> @@ -173,6 +218,7 @@ int module_frob_arch_sections(Elf64_Ehdr *hdr,
>  			      struct module *me)
>  {
>  	unsigned int i;
> +	int stubs_size;
>
>  	/* Find .toc and .stubs sections, symtab and strtab */
>  	for (i = 1; i < hdr->e_shnum; i++) {
> @@ -209,7 +255,11 @@ int module_frob_arch_sections(Elf64_Ehdr *hdr,
>  		me->arch.toc_section = me->arch.stubs_section;
>
>  	/* Override the stubs size */
> -	sechdrs[me->arch.stubs_section].sh_size = get_stubs_size(hdr, sechdrs);
> +	stubs_size = get_stubs_size(hdr, sechdrs);
> +	if (stubs_size < 0)
> +		return stubs_size;
> +
> +	sechdrs[me->arch.stubs_section].sh_size = stubs_size;
>  	return 0;
>  }

_______________________________________________
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev