[RFC/PATCH] reduce load time for modules with lots of relocs
Medve Emilian
Emilian.Medve at freescale.com
Wed Nov 7 10:58:24 EST 2007
Hello Nathan,
Would it be possible to do the sort "in place" (without the extra
buffer)? I mean would that upset any other part of the kernel?
Thanks,
Emil.
> -----Original Message-----
> From: Nathan Lynch [mailto:ntl at pobox.com]
> Sent: Monday, November 05, 2007 8:34 PM
> To: Medve Emilian
> Cc: linuxppc-dev at ozlabs.org
> Subject: [RFC/PATCH] reduce load time for modules with lots of relocs
>
> Nathan Lynch wrote:
> > Medve Emilian-EMMEDVE1 wrote:
> > >
> > > I have a module with 36K relocation entries (I know, I
> know, how could
> > > that be...) and the count_relocs() function takes ~17
> seconds to crunch
> > > through the relocation table from the .rela.text section.
> I don't think
> > > I can reduce the number of entries in the relocation
> table (can I?) so
> > > I'm thinking at improving the performance of
> count_relocs() (currently
> > > O(n^2)). Does anybody have a simpler idea? Does anyone have any
> > > constructive suggestion on how to improve the situation?
> >
> > This seems to come up every few months. There was a patch submitted
> > here:
> >
> > http://ozlabs.org/pipermail/linuxppc-dev/2007-June/037641.html
>
> I think this comes up often enough for us to fix it (IIRC unionfs
> people complained about it long ago too), and it's kind of lame to
> spend 17 seconds in the kernel without scheduling. So I dug up some
> old patches that should reduce the complexity to O(n log n), using the
> sort() function, which IMO is preferable to doing our own hash thing
> as that patch from June does. Only the 64-bit code is tested. Does
> this help your situation?
>
> arch/powerpc/kernel/module_32.c | 85
> +++++++++++++++++++++++++++++++---------
> arch/powerpc/kernel/module_64.c | 80
> ++++++++++++++++++++++++++++++-------
> 2 files changed, 132 insertions(+), 33 deletions(-)
>
>
> diff --git a/arch/powerpc/kernel/module_32.c
> b/arch/powerpc/kernel/module_32.c
> index 07a89a3..001b941 100644
> --- a/arch/powerpc/kernel/module_32.c
> +++ b/arch/powerpc/kernel/module_32.c
> @@ -24,6 +24,7 @@
> #include <linux/kernel.h>
> #include <linux/cache.h>
> #include <linux/bug.h>
> +#include <linux/sort.h>
>
> #include "setup.h"
>
> @@ -50,26 +51,64 @@ void module_free(struct module *mod, void
> *module_region)
> table entries. */
> }
>
> +static int reloc_cmp(const void *_a, const void *_b)
> +{
> + const Elf32_Rela *a = *(Elf32_Rela **)_a, *b =
> *(Elf32_Rela **)_b;
> +
> + if (a->r_info < b->r_info)
> + return -1;
> + else if (a->r_info > b->r_info)
> + return 1;
> + else if (a->r_addend < b->r_addend)
> + return -1;
> + else if (a->r_addend > b->r_addend)
> + return 1;
> +
> + return 0;
> +}
> +
> +
> /* Count how many different relocations (different symbol, different
> addend) */
> static unsigned int count_relocs(const Elf32_Rela *rela,
> unsigned int num)
> {
> - unsigned int i, j, ret = 0;
> + unsigned int i, sorted_count = 0;
> + Elf32_Word last_info;
> + Elf32_Sword last_addend;
> + Elf32_Rela **sortbuf = NULL;
> +
> + if (num == 0)
> + return 0;
> +
> + sortbuf = vmalloc(num * sizeof(*sortbuf));
> +
> + if (!sortbuf)
> + return -ENOMEM;
> +
> + for (i = 0; i < num; i++)
> + sortbuf[i] = (Elf32_Rela *)&rela[i];
> +
> + sort(sortbuf, i, sizeof(sortbuf[0]), reloc_cmp, NULL);
> +
> + last_info = sortbuf[0]->r_info;
> + last_addend = sortbuf[0]->r_addend;
> + sorted_count = 1;
>
> - /* Sure, this is order(n^2), but it's usually short, and not
> - time critical */
> for (i = 0; i < num; i++) {
> - for (j = 0; j < i; j++) {
> - /* If this addend appeared before, it's
> - already been counted */
> - if (ELF32_R_SYM(rela[i].r_info)
> - == ELF32_R_SYM(rela[j].r_info)
> - && rela[i].r_addend == rela[j].r_addend)
> - break;
> - }
> - if (j == i) ret++;
> + /* If this r_info,r_addend tuple matches the previous
> + * entry, don't count it again
> + */
> + if (sortbuf[i]->r_info == last_info &&
> + sortbuf[i]->r_addend == last_addend)
> + continue;
> +
> + last_info = sortbuf[i]->r_info;
> + last_addend = sortbuf[i]->r_addend;
> + sorted_count++;
> }
> - return ret;
> +
> + vfree(sortbuf);
> + return sorted_count;
> }
>
> /* Get the potential trampolines size required of the init and
> @@ -96,15 +135,19 @@ static unsigned long get_plt_size(const
> Elf32_Ehdr *hdr,
> continue;
>
> if (sechdrs[i].sh_type == SHT_RELA) {
> + int count;
> DEBUGP("Found relocations in section %u\n", i);
> DEBUGP("Ptr: %p. Number: %u\n",
> (void *)hdr + sechdrs[i].sh_offset,
> sechdrs[i].sh_size / sizeof(Elf32_Rela));
> - ret += count_relocs((void *)hdr
> + count = count_relocs((void *)hdr
> + sechdrs[i].sh_offset,
> sechdrs[i].sh_size
> / sizeof(Elf32_Rela))
> * sizeof(struct ppc_plt_entry);
> + if (count < 0)
> + return count;
> + ret += count;
> }
> }
>
> @@ -117,6 +160,7 @@ int module_frob_arch_sections(Elf32_Ehdr *hdr,
> struct module *me)
> {
> unsigned int i;
> + int ret;
>
> /* Find .plt and .init.plt sections */
> for (i = 0; i < hdr->e_shnum; i++) {
> @@ -131,10 +175,15 @@ int module_frob_arch_sections(Elf32_Ehdr *hdr,
> }
>
> /* Override their sizes */
> - sechdrs[me->arch.core_plt_section].sh_size
> - = get_plt_size(hdr, sechdrs, secstrings, 0);
> - sechdrs[me->arch.init_plt_section].sh_size
> - = get_plt_size(hdr, sechdrs, secstrings, 1);
> + ret = get_plt_size(hdr, sechdrs, secstrings, 0);
> + if (ret < 0)
> + return ret;
> + sechdrs[me->arch.core_plt_section].sh_size = ret;
> +
> + ret = get_plt_size(hdr, sechdrs, secstrings, 1);
> + if (ret < 0)
> + return ret;
> + sechdrs[me->arch.init_plt_section].sh_size = ret;
> return 0;
> }
>
> diff --git a/arch/powerpc/kernel/module_64.c
> b/arch/powerpc/kernel/module_64.c
> index 75c7c4f..bfc5b98 100644
> --- a/arch/powerpc/kernel/module_64.c
> +++ b/arch/powerpc/kernel/module_64.c
> @@ -21,6 +21,7 @@
> #include <linux/err.h>
> #include <linux/vmalloc.h>
> #include <linux/bug.h>
> +#include <linux/sort.h>
> #include <asm/module.h>
> #include <asm/uaccess.h>
> #include <asm/firmware.h>
> @@ -77,28 +78,68 @@ static struct ppc64_stub_entry ppc64_stub =
> 0x4e, 0x80, 0x04, 0x20 /* bctr */
> } };
>
> +static int reloc_cmp(const void *_a, const void *_b)
> +{
> + const Elf64_Rela *a = *(Elf64_Rela **)_a, *b =
> *(Elf64_Rela **)_b;
> +
> + if (a->r_info < b->r_info)
> + return -1;
> + else if (a->r_info > b->r_info)
> + return 1;
> + else if (a->r_addend < b->r_addend)
> + return -1;
> + else if (a->r_addend > b->r_addend)
> + return 1;
> +
> + return 0;
> +}
> +
> /* Count how many different 24-bit relocations (different symbol,
> different addend) */
> -static unsigned int count_relocs(const Elf64_Rela *rela,
> unsigned int num)
> +static int count_relocs(const Elf64_Rela *rela, unsigned int num)
> {
> unsigned int i, j, ret = 0;
> + Elf64_Xword last_info;
> + Elf64_Sxword last_addend;
> + Elf64_Rela **sortbuf = NULL;
> +
> + if (num == 0)
> + return 0;
> +
> + sortbuf = vmalloc(num * sizeof(*sortbuf));
>
> - /* FIXME: Only count external ones --RR */
> - /* Sure, this is order(n^2), but it's usually short, and not
> - time critical */
> - for (i = 0; i < num; i++) {
> + if (!sortbuf)
> + return -ENOMEM;
> +
> + for (j = 0, i = 0; i < num; i++) {
> /* Only count 24-bit relocs, others don't need stubs */
> if (ELF64_R_TYPE(rela[i].r_info) != R_PPC_REL24)
> continue;
> - for (j = 0; j < i; j++) {
> - /* If this addend appeared before, it's
> - already been counted */
> - if (rela[i].r_info == rela[j].r_info
> - && rela[i].r_addend == rela[j].r_addend)
> - break;
> - }
> - if (j == i) ret++;
> + sortbuf[j++] = (Elf64_Rela *)&rela[i];
> }
> +
> + if (j == 0)
> + goto out;
> +
> + sort(sortbuf, j, sizeof(sortbuf[0]), reloc_cmp, NULL);
> +
> + last_info = sortbuf[0]->r_info;
> + last_addend = sortbuf[0]->r_addend;
> + ret = 1;
> + for (i = 0; i < j; i++) {
> + /* If this r_info,r_addend tuple matches the previous
> + * entry, don't count it again
> + */
> + if (sortbuf[i]->r_info == last_info &&
> + sortbuf[i]->r_addend == last_addend)
> + continue;
> +
> + last_info = sortbuf[i]->r_info;
> + last_addend = sortbuf[i]->r_addend;
> + ret++;
> + }
> +out:
> + vfree(sortbuf);
> return ret;
> }
>
> @@ -129,13 +170,17 @@ static unsigned long
> get_stubs_size(const Elf64_Ehdr *hdr,
> /* Every relocated section... */
> for (i = 1; i < hdr->e_shnum; i++) {
> if (sechdrs[i].sh_type == SHT_RELA) {
> + int count;
> DEBUGP("Found relocations in section %u\n", i);
> DEBUGP("Ptr: %p. Number: %lu\n",
> (void *)sechdrs[i].sh_addr,
> sechdrs[i].sh_size / sizeof(Elf64_Rela));
> - relocs += count_relocs((void
> *)sechdrs[i].sh_addr,
> + count = count_relocs((void *)sechdrs[i].sh_addr,
> sechdrs[i].sh_size
> / sizeof(Elf64_Rela));
> + if (count < 0)
> + return count;
> + relocs += count;
> }
> }
>
> @@ -173,6 +218,7 @@ int module_frob_arch_sections(Elf64_Ehdr *hdr,
> struct module *me)
> {
> unsigned int i;
> + int stubs_size;
>
> /* Find .toc and .stubs sections, symtab and strtab */
> for (i = 1; i < hdr->e_shnum; i++) {
> @@ -209,7 +255,11 @@ int module_frob_arch_sections(Elf64_Ehdr *hdr,
> me->arch.toc_section = me->arch.stubs_section;
>
> /* Override the stubs size */
> - sechdrs[me->arch.stubs_section].sh_size =
> get_stubs_size(hdr, sechdrs);
> + stubs_size = get_stubs_size(hdr, sechdrs);
> + if (stubs_size < 0)
> + return stubs_size;
> +
> + sechdrs[me->arch.stubs_section].sh_size = stubs_size;
> return 0;
> }
More information about the Linuxppc-dev
mailing list