From apw at shadowen.org Thu Dec 1 04:34:50 2005 From: apw at shadowen.org (Andy Whitcroft) Date: Wed, 30 Nov 2005 17:34:50 +0000 Subject: [PATCH 1/2] powerpc powermac adb fix dependancy on btext_drawchar References: Message-ID: <20051130173450.GA851@shadowen.org> powerpc: powermac, adb fix dependancy on btext_drawchar udbg_adb_init() has become dependant on btext_drawchar, even when BOOTX_TEXT support is not selected. This leads to the error below. Make the check dependant on BOOTX_TEXT. LD .tmp_vmlinux1 arch/powerpc/platforms/built-in.o(.toc1+0xa40): undefined reference to `btext_drawchar' Signed-off-by: Andy Whitcroft --- diff -upN reference/arch/powerpc/platforms/powermac/udbg_adb.c current/arch/powerpc/platforms/powermac/udbg_adb.c --- reference/arch/powerpc/platforms/powermac/udbg_adb.c +++ current/arch/powerpc/platforms/powermac/udbg_adb.c @@ -171,9 +171,12 @@ int udbg_adb_init(int force_btext) udbg_adb_old_getc_poll = udbg_getc_poll; /* Check if our early init was already called */ - if (udbg_adb_old_putc == udbg_adb_putc || - udbg_adb_old_putc == btext_drawchar) + if (udbg_adb_old_putc == udbg_adb_putc) udbg_adb_old_putc = NULL; +#ifdef CONFIG_BOOTX_TEXT + if (udbg_adb_old_putc == btext_drawchar) + udbg_adb_old_putc = NULL; +#endif /* Set ours as output */ udbg_putc = udbg_adb_putc; From apw at shadowen.org Thu Dec 1 04:34:40 2005 From: apw at shadowen.org (Andy Whitcroft) Date: Wed, 30 Nov 2005 17:34:40 +0000 Subject: [PATCH 0/2] 2.6.15rc3mm1 ppc64 compile problems References: <20051129203134.13b93f48.akpm@osdl.org> Message-ID: Testing 2.6.15-rc3-mm1 seems to have issues on ppc64 systems using the powerpc architecture. The problems are in the powermac support relating to the BOOTX_TEXT support. 
Following this email are a couple of patches to clean up this build: powerpc-powermac-adb-fix-dependancy-on-btext_drawchar: fix up a dependancy problem on BOOTX_TEXT powerpc-powermac-adb-fix-udbg_adb_use_btext-warning: clean up a warning with unused externals Comments? -apw From apw at shadowen.org Thu Dec 1 04:35:01 2005 From: apw at shadowen.org (Andy Whitcroft) Date: Wed, 30 Nov 2005 17:35:01 +0000 Subject: [PATCH 2/2] powerpc powermac adb fix udbg_adb_use_btext warning References: Message-ID: <20051130173501.GA863@shadowen.org> powerpc: powermac, adb fix udbg_adb_use_btext warning When compiling without BOOTX_TEXT the following warning is emitted. Fix up the definition to only be made when required. CC arch/powerpc/platforms/powermac/udbg_adb.o .../arch/powerpc/platforms/powermac/udbg_adb.c:41: warning: `udbg_adb_use_btext' defined but not used Signed-off-by: Andy Whitcroft --- diff -upN reference/arch/powerpc/platforms/powermac/udbg_adb.c current/arch/powerpc/platforms/powermac/udbg_adb.c --- reference/arch/powerpc/platforms/powermac/udbg_adb.c +++ current/arch/powerpc/platforms/powermac/udbg_adb.c @@ -38,8 +38,6 @@ static enum { input_adb_cuda, } input_type = input_adb_none; -static int udbg_adb_use_btext; - int xmon_wants_key, xmon_adb_keycode; static inline void udbg_adb_poll(void) @@ -55,6 +53,8 @@ static inline void udbg_adb_poll(void) } #ifdef CONFIG_BOOTX_TEXT + +static int udbg_adb_use_btext; static int xmon_adb_shiftstate; static unsigned char xmon_keytab[128] = From kravetz at us.ibm.com Thu Dec 1 08:47:23 2005 From: kravetz at us.ibm.com (Mike Kravetz) Date: Wed, 30 Nov 2005 13:47:23 -0800 Subject: [PATCH] updated: Minor numa memory code cleanup Message-ID: <20051130214723.GC29166@w-mikek2.ibm.com> Here is an updated version of the patch that panics if no memory is found as Nathan suggested. I'm still concerned that panic strings (not just the one added here) at this stage of booting do not show up on my system. 
But, that is an issue separate from this patch. Combine get_mem_*_cells() routines to avoid multiple memory node lookups. Added missing of_node_put() call. Changed variable names to help with some confusion as to meaning. Signed-off-by: Mike Kravetz diff -Naupr linux-2.6.15-rc3-git1/arch/powerpc/mm/numa.c linux-2.6.15-rc3-git1.work/arch/powerpc/mm/numa.c --- linux-2.6.15-rc3-git1/arch/powerpc/mm/numa.c 2005-11-29 03:51:27.000000000 +0000 +++ linux-2.6.15-rc3-git1.work/arch/powerpc/mm/numa.c 2005-11-30 19:53:41.000000000 +0000 @@ -254,29 +254,17 @@ static int __init find_min_common_depth( return depth; } -static int __init get_mem_addr_cells(void) +static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells) { struct device_node *memory = NULL; - int rc; memory = of_find_node_by_type(memory, "memory"); if (!memory) - return 0; /* it won't matter */ + panic("numa.c: No memory nodes found!"); - rc = prom_n_addr_cells(memory); - return rc; -} - -static int __init get_mem_size_cells(void) -{ - struct device_node *memory = NULL; - int rc; - - memory = of_find_node_by_type(memory, "memory"); - if (!memory) - return 0; /* it won't matter */ - rc = prom_n_size_cells(memory); - return rc; + *n_addr_cells = prom_n_addr_cells(memory); + *n_size_cells = prom_n_size_cells(memory); + of_node_put(memory); } static unsigned long __init read_n_cells(int n, unsigned int **buf) @@ -386,7 +374,7 @@ static int __init parse_numa_properties( { struct device_node *cpu = NULL; struct device_node *memory = NULL; - int addr_cells, size_cells; + int n_addr_cells, n_size_cells; int max_domain; unsigned long i; @@ -425,8 +413,7 @@ static int __init parse_numa_properties( } } - addr_cells = get_mem_addr_cells(); - size_cells = get_mem_size_cells(); + get_n_mem_cells(&n_addr_cells, &n_size_cells); memory = NULL; while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { unsigned long start; @@ -443,8 +430,8 @@ static int __init parse_numa_properties( ranges = memory->n_addrs; 
new_range: /* these are order-sensitive, and modify the buffer pointer */ - start = read_n_cells(addr_cells, &memcell_buf); - size = read_n_cells(size_cells, &memcell_buf); + start = read_n_cells(n_addr_cells, &memcell_buf); + size = read_n_cells(n_size_cells, &memcell_buf); numa_domain = of_node_numa_domain(memory); From michael at ellerman.id.au Thu Dec 1 09:33:36 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Wed, 30 Nov 2005 16:33:36 -0600 Subject: [PATCH] updated: Minor numa memory code cleanup In-Reply-To: <20051130214723.GC29166@w-mikek2.ibm.com> References: <20051130214723.GC29166@w-mikek2.ibm.com> Message-ID: <200511301633.42773.michael@ellerman.id.au> On Wed, 30 Nov 2005 15:47, Mike Kravetz wrote: > Here is an updated version of the patch that panics if no memory is > found as Nathan suggested. I'm still concerned that panic strings > (not just the one added here) at this stage of booting do not show > up on my system. But, that is an issue separate from this patch. You probably need to enable one of the EARLY_DEBUG_INIT macros, in arch/powerpc/kernel/setup_64.c. I'm guessing you're on some LPAR machine if you're debugging NUMA? If so you'll want the LPAR debugging. It'll only work if you have a 'hvterm1' compatible console as your /chosen/linux,stdout-path, and it has to be vterm 0 (check the reg property). cheers -- Michael Ellerman IBM OzLabs email: michael:ellerman.id.au inmsg: mpe:jabber.org wwweb: http://michael.ellerman.id.au phone: +61 2 6212 1183 (tie line 70 21183) We do not inherit the earth from our ancestors, we borrow it from our children. - S.M.A.R.T Person -------------- next part -------------- A non-text attachment was scrubbed... 
Name: not available Type: application/pgp-signature Size: 189 bytes Desc: not available Url : http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051130/63f3acac/attachment.pgp From kravetz at us.ibm.com Thu Dec 1 09:49:00 2005 From: kravetz at us.ibm.com (Mike Kravetz) Date: Wed, 30 Nov 2005 14:49:00 -0800 Subject: [PATCH] updated: Minor numa memory code cleanup In-Reply-To: <200511301633.42773.michael@ellerman.id.au> References: <20051130214723.GC29166@w-mikek2.ibm.com> <200511301633.42773.michael@ellerman.id.au> Message-ID: <20051130224900.GE29166@w-mikek2.ibm.com> On Wed, Nov 30, 2005 at 04:33:36PM -0600, Michael Ellerman wrote: > On Wed, 30 Nov 2005 15:47, Mike Kravetz wrote: > > Here is an updated version of the patch that panics if no memory is > > found as Nathan suggested. I'm still concerned that panic strings > > (not just the one added here) at this stage of booting do not show > > up on my system. But, that is an issue separate from this patch. > > You probably need to enable one of the EARLY_DEBUG_INIT macros, in > arch/powerpc/kernel/setup_64.c. I was thinking more about debugging production systems in the field where we may not have the luxury of booting a debug kernel. Seem to recall a situation in the past where someone ran into a problem in numa.c that called panic. Didn't get the panic message displayed on the console. Had them enable xmon, and dig the panic message out of the console buffer. Sure would be nice if we could get all those early catastrophic failure messages to the console. 
-- Mike From michael at ellerman.id.au Thu Dec 1 10:22:21 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Wed, 30 Nov 2005 17:22:21 -0600 Subject: [PATCH] updated: Minor numa memory code cleanup In-Reply-To: <20051130224900.GE29166@w-mikek2.ibm.com> References: <20051130214723.GC29166@w-mikek2.ibm.com> <200511301633.42773.michael@ellerman.id.au> <20051130224900.GE29166@w-mikek2.ibm.com> Message-ID: <200511301722.26991.michael@ellerman.id.au> On Wed, 30 Nov 2005 16:49, Mike Kravetz wrote: > On Wed, Nov 30, 2005 at 04:33:36PM -0600, Michael Ellerman wrote: > > On Wed, 30 Nov 2005 15:47, Mike Kravetz wrote: > > > Here is an updated version of the patch that panics if no memory is > > > found as Nathan suggested. I'm still concerned that panic strings > > > (not just the one added here) at this stage of booting do not show > > > up on my system. But, that is an issue separate from this patch. > > > > You probably need to enable one of the EARLY_DEBUG_INIT macros, in > > arch/powerpc/kernel/setup_64.c. > > I was thinking more about debugging production systems in the field > where we may not have the luxury of booting a debug kernel. Sure, the nature of early debug is it's trying to tap things that may or may not be around and/or configured - so it's not enabled by default because it will cause some machines to not boot. That's just the way it is. > Seem to recall a situation in the past where someone ran into a > problem in numa.c that called panic. Didn't get the panic message > displayed on the console. Had them enable xmon, and dig the panic > message out of the console buffer. Sure would be nice if we could > get all those early catastrophic failure messages to the console. Hmm, I'd have to check the code, but if xmon is working then you should be able to see the panic. If it's a real problem you could look at generalising init/main.c's panic_later mechanism, which delays a panic until after the console is initialised? 
cheers -- Michael Ellerman IBM OzLabs email: michael:ellerman.id.au inmsg: mpe:jabber.org wwweb: http://michael.ellerman.id.au phone: +61 2 6212 1183 (tie line 70 21183) We do not inherit the earth from our ancestors, we borrow it from our children. - S.M.A.R.T Person -------------- next part -------------- A non-text attachment was scrubbed... Name: not available Type: application/pgp-signature Size: 189 bytes Desc: not available Url : http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051130/81a6a7a7/attachment.pgp From j_vvprasad at yahoo.co.in Thu Dec 1 19:48:11 2005 From: j_vvprasad at yahoo.co.in (veera venkata prasad j) Date: Thu, 1 Dec 2005 00:48:11 -0800 (PST) Subject: Booting OS on PowerPC Message-ID: <20051201084811.53930.qmail@web8508.mail.in.yahoo.com> Hi all, Can anybody tell me how Linux boots on a PowerPC machine when Open Firmware is up? To be more precise, what is the "known-environment" that the OS expects from Open Firmware? Regards Prasad Jvv. __________________________________ Yahoo! Mail - PC Magazine Editors' Choice 2005 http://mail.yahoo.com From vatsa at in.ibm.com Fri Dec 2 01:26:19 2005 From: vatsa at in.ibm.com (Srivatsa Vaddagiri) Date: Thu, 1 Dec 2005 19:56:19 +0530 Subject: [PATCH] NO_IDLE_HZ patch updated to 2.6.15-rc3-mm1 Message-ID: <20051201142619.GA6157@in.ibm.com> Hello, Here's an updated patch to implement NO_IDLE_HZ on PPC64. The patch is against 2.6.15-rc3-mm1 and has been tested on a Power5 LPAR. The patches attached are: boot_cpu_fix.patch -> Lets do_timer be called from any CPU no_idle_hz.patch -> Implement tickless idle CPUs for PPC64 debug.patch -> Debug patch that I used for getting decrementer statistics. We need a cleaner solution if we have to expose those statistics. Let me know if you have any comments on these patches. 
-- Thanks and Regards, Srivatsa Vaddagiri, Linux Technology Center, IBM Software Labs, Bangalore, INDIA - 560017 -------------- next part -------------- Currently xtime/jiffies is updated by only boot CPU which makes it difficult for an idle boot CPU to skip ticks. The patch overcomes this limitation and lets xtime/jiffies be updated from any CPU. Signed-off-by: Srivatsa Vaddagiri --- diff -puN arch/powerpc/kernel/time.c~boot_cpu_fix arch/powerpc/kernel/time.c --- linux-2.6.15-rc3-mm1/arch/powerpc/kernel/time.c~boot_cpu_fix 2005-12-01 13:14:55.000000000 -0800 +++ linux-2.6.15-rc3-mm1-root/arch/powerpc/kernel/time.c 2005-12-01 13:21:52.000000000 -0800 @@ -420,6 +420,7 @@ void timer_interrupt(struct pt_regs * re int next_dec; int cpu = smp_processor_id(); unsigned long ticks; + int end_singleshot = 0; #ifdef CONFIG_PPC32 if (atomic_read(&ppc_n_lost_interrupts) != 0) @@ -452,23 +453,29 @@ void timer_interrupt(struct pt_regs * re if (!cpu_is_offline(cpu)) update_process_times(user_mode(regs)); - /* - * No need to check whether cpu is offline here; boot_cpuid - * should have been fixed up by now. 
- */ - if (cpu != boot_cpuid) - continue; - write_seqlock(&xtime_lock); - tb_last_jiffy += tb_ticks_per_jiffy; - tb_last_stamp = per_cpu(last_jiffy, cpu); - timer_recalc_offset(tb_last_jiffy); - do_timer(regs); - timer_sync_xtime(tb_last_jiffy); - timer_check_rtc(); + if (tb_ticks_since(tb_last_stamp) >= tb_ticks_per_jiffy) { + tb_last_jiffy += tb_ticks_per_jiffy; + tb_last_stamp += tb_ticks_per_jiffy; + if (__USE_RTC() && tb_last_stamp >= 1000000000) + tb_last_stamp -= 1000000000; + timer_recalc_offset(tb_last_jiffy); + do_timer(regs); + timer_sync_xtime(tb_last_jiffy); + timer_check_rtc(); + } + if (adjusting_time && (time_adjust == 0)) { + adjusting_time = 0; + end_singleshot = 1; + } write_sequnlock(&xtime_lock); - if (adjusting_time && (time_adjust == 0)) + + if (end_singleshot) { +#ifdef DEBUG_PPC_ADJTIMEX + printk("ppc_adjtimex: ending single shot time_adjust\n"); +#endif ppc_adjtimex(); + } } next_dec = tb_ticks_per_jiffy - ticks; @@ -826,13 +833,6 @@ void ppc_adjtimex(void) if ( time_adjust < 0 ) singleshot_ppm = -singleshot_ppm; } - else { -#ifdef DEBUG_PPC_ADJTIMEX - if ( adjusting_time ) - printk("ppc_adjtimex: ending single shot time_adjust\n"); -#endif - adjusting_time = 0; - } /* Add up all of the frequency adjustments */ delta_freq = time_freq + ltemp + singleshot_ppm; _ -------------- next part -------------- This patch causes idle CPUs to skip timer ticks until the next scheduled event (next_timer_interrupt()) or until some max duration allowed by the decrementer. This helps to conserve power and on virtual partitions using shared processors, allows for efficient CPU utilization. Currently, only few idle routines have been converted over to use this feature. Other idle routine could be converted over later depending on the requirement. 
Signed-off-by : Srivatsa Vaddagiri --- diff -puN arch/powerpc/kernel/time.c~no_idle_hz arch/powerpc/kernel/time.c --- linux-2.6.15-rc3-mm1/arch/powerpc/kernel/time.c~no_idle_hz 2005-12-01 16:06:28.000000000 -0800 +++ linux-2.6.15-rc3-mm1-root/arch/powerpc/kernel/time.c 2005-12-01 16:18:52.000000000 -0800 @@ -401,40 +401,13 @@ static void iSeries_tb_recal(void) } #endif -/* - * For iSeries shared processors, we have to let the hypervisor - * set the hardware decrementer. We set a virtual decrementer - * in the lppaca and call the hypervisor if the virtual - * decrementer is less than the current value in the hardware - * decrementer. (almost always the new decrementer value will - * be greater than the current hardware decementer so the hypervisor - * call will not be needed) - */ - -/* - * timer_interrupt - gets called when the decrementer overflows, - * with interrupts disabled. - */ -void timer_interrupt(struct pt_regs * regs) +static void account_ticks(struct pt_regs *regs) { int next_dec; int cpu = smp_processor_id(); unsigned long ticks; int end_singleshot = 0; -#ifdef CONFIG_PPC32 - if (atomic_read(&ppc_n_lost_interrupts) != 0) - do_IRQ(regs); -#endif - - irq_enter(); - - profile_tick(CPU_PROFILING, regs); - -#ifdef CONFIG_PPC_ISERIES - get_paca()->lppaca.int_dword.fields.decr_int = 0; -#endif - while ((ticks = tb_ticks_since(per_cpu(last_jiffy, cpu))) >= tb_ticks_per_jiffy) { /* Update last_jiffy */ @@ -480,6 +453,58 @@ void timer_interrupt(struct pt_regs * re next_dec = tb_ticks_per_jiffy - ticks; set_dec(next_dec); +} + +#ifdef CONFIG_NO_IDLE_HZ +/* Returns 1 if this CPU was set in the mask */ +static inline int clear_hzless_mask(void) +{ + unsigned long cpu = smp_processor_id(); + int rc = 0; + + if (unlikely(cpu_isset(cpu, nohz_cpu_mask))) { + cpu_clear(cpu, nohz_cpu_mask); + rc = 1; + } + + return rc; +} +#else +static inline int clear_hzless_mask(void) { return 0;} +#endif + +/* + * For iSeries shared processors, we have to let the hypervisor + * set 
the hardware decrementer. We set a virtual decrementer + * in the lppaca and call the hypervisor if the virtual + * decrementer is less than the current value in the hardware + * decrementer. (almost always the new decrementer value will + * be greater than the current hardware decementer so the hypervisor + * call will not be needed) + */ + +/* + * timer_interrupt - gets called when the decrementer overflows, + * with interrupts disabled. + */ +void timer_interrupt(struct pt_regs * regs) +{ +#ifdef CONFIG_PPC32 + if (atomic_read(&ppc_n_lost_interrupts) != 0) + do_IRQ(regs); +#endif + + irq_enter(); + + clear_hzless_mask(); + + profile_tick(CPU_PROFILING, regs); + +#ifdef CONFIG_PPC_ISERIES + get_paca()->lppaca.int_dword.fields.decr_int = 0; +#endif + + account_ticks(regs); #ifdef CONFIG_PPC_ISERIES if (hvlpevent_is_pending()) @@ -497,6 +522,72 @@ void timer_interrupt(struct pt_regs * re irq_exit(); } +#ifdef CONFIG_NO_IDLE_HZ + +#define MAX_DEC_COUNT (UINT_MAX) /* Decrementer is 32-bit */ +#define MIN_SKIP 2 +#define MAX_SKIP (MAX_DEC_COUNT/tb_ticks_per_jiffy) + +int sysctl_hz_timer = 1; + +/* Avoid the HZ timer (decrementer) interrupt on this CPU for "some" time. + * This is accomplished by loading the decrementer with some large calculated + * value. The CPU exits this "tickless" state upon the occurence of an + * exception or external interrupt, at which point the decrementer is again + * reprogrammed to restore the timer interrupt frequency (see start_hz_timer). + * Caller has to ensure that the CPU does not exit the "tickless" idle state + * via other means. + * + * Has to be called with interrupts disabled. 
+ */ +void stop_hz_timer(void) +{ + unsigned long cpu = smp_processor_id(), seq, delta; + int next_dec; + + if (sysctl_hz_timer != 0) + return; + + cpu_set(cpu, nohz_cpu_mask); + smp_mb(); + if (rcu_pending(cpu) || local_softirq_pending()) { + cpu_clear(cpu, nohz_cpu_mask); + return; + } + + do { + seq = read_seqbegin(&xtime_lock); + + delta = next_timer_interrupt() - jiffies; + + if (delta < MIN_SKIP) { + cpu_clear(cpu, nohz_cpu_mask); + return; + } + + if (delta > MAX_SKIP) + delta = MAX_SKIP; + + next_dec = tb_last_stamp + delta * tb_ticks_per_jiffy; + + } while (read_seqretry(&xtime_lock, seq)); + + next_dec -= get_tbl(); + set_dec(next_dec); + + return; +} + +/* Take into account skipped ticks and restore the HZ timer frequency */ +void start_hz_timer(struct pt_regs *regs) +{ + if (clear_hzless_mask()) + account_ticks(regs); +} + +#endif /* CONFIG_NO_IDLE_HZ */ + + void wakeup_decrementer(void) { int i; diff -puN arch/powerpc/kernel/irq.c~no_idle_hz arch/powerpc/kernel/irq.c --- linux-2.6.15-rc3-mm1/arch/powerpc/kernel/irq.c~no_idle_hz 2005-12-01 16:06:28.000000000 -0800 +++ linux-2.6.15-rc3-mm1-root/arch/powerpc/kernel/irq.c 2005-12-01 16:18:52.000000000 -0800 @@ -59,6 +59,7 @@ #include #include #include +#include #ifdef CONFIG_PPC_ISERIES #include #endif @@ -192,6 +193,8 @@ void do_IRQ(struct pt_regs *regs) irq_enter(); + start_hz_timer(regs); + #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: is there less than 2KB free? 
*/ { diff -puN include/asm-powerpc/time.h~no_idle_hz include/asm-powerpc/time.h --- linux-2.6.15-rc3-mm1/include/asm-powerpc/time.h~no_idle_hz 2005-12-01 16:06:39.000000000 -0800 +++ linux-2.6.15-rc3-mm1-root/include/asm-powerpc/time.h 2005-12-01 16:06:39.000000000 -0800 @@ -198,6 +198,14 @@ static inline unsigned long tb_ticks_sin return get_tbl() - tstamp; } +#ifdef CONFIG_NO_IDLE_HZ +extern void stop_hz_timer(void); +extern void start_hz_timer(struct pt_regs *); +#else +static inline void stop_hz_timer(void) { } +static inline void start_hz_timer(struct pt_regs *regs) { } +#endif + #define mulhwu(x,y) \ ({unsigned z; asm ("mulhwu %0,%1,%2" : "=r" (z) : "r" (x), "r" (y)); z;}) diff -puN arch/powerpc/Kconfig~no_idle_hz arch/powerpc/Kconfig --- linux-2.6.15-rc3-mm1/arch/powerpc/Kconfig~no_idle_hz 2005-12-01 16:06:28.000000000 -0800 +++ linux-2.6.15-rc3-mm1-root/arch/powerpc/Kconfig 2005-12-01 16:06:28.000000000 -0800 @@ -532,6 +532,12 @@ config HOTPLUG_CPU Say N if you are unsure. +config NO_IDLE_HZ + depends on EXPERIMENTAL && (PPC_PSERIES || PPC_PMAC || PPC_MAPLE) + bool "Skip timer ticks on idle CPUs (EXPERIMENTAL)" + help + Switches the HZ timer interrupts off when a CPU is idle. 
+ config KEXEC bool "kexec system call (EXPERIMENTAL)" depends on PPC_MULTIPLATFORM && EXPERIMENTAL diff -puN kernel/sysctl.c~no_idle_hz kernel/sysctl.c --- linux-2.6.15-rc3-mm1/kernel/sysctl.c~no_idle_hz 2005-12-01 16:06:36.000000000 -0800 +++ linux-2.6.15-rc3-mm1-root/kernel/sysctl.c 2005-12-01 16:06:36.000000000 -0800 @@ -542,6 +542,16 @@ static ctl_table kern_table[] = { .extra1 = &minolduid, .extra2 = &maxolduid, }, +#ifdef CONFIG_NO_IDLE_HZ + { + .ctl_name = KERN_HZ_TIMER, + .procname = "hz_timer", + .data = &sysctl_hz_timer, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif #ifdef CONFIG_ARCH_S390 #ifdef CONFIG_MATHEMU { @@ -553,16 +563,6 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif -#ifdef CONFIG_NO_IDLE_HZ - { - .ctl_name = KERN_HZ_TIMER, - .procname = "hz_timer", - .data = &sysctl_hz_timer, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif { .ctl_name = KERN_S390_USER_DEBUG_LOGGING, .procname = "userprocess_debug", diff -puN arch/powerpc/platforms/pseries/setup.c~no_idle_hz arch/powerpc/platforms/pseries/setup.c --- linux-2.6.15-rc3-mm1/arch/powerpc/platforms/pseries/setup.c~no_idle_hz 2005-12-01 16:06:28.000000000 -0800 +++ linux-2.6.15-rc3-mm1-root/arch/powerpc/platforms/pseries/setup.c 2005-12-01 16:18:52.000000000 -0800 @@ -461,9 +461,10 @@ static inline void dedicated_idle_sleep( * a prod occurs. Returning from the cede enables external * interrupts. */ - if (!need_resched()) + if (!need_resched()) { + stop_hz_timer(); cede_processor(); - else + } else local_irq_enable(); set_thread_flag(TIF_POLLING_NRFLAG); } else { @@ -553,9 +554,10 @@ static void pseries_shared_idle(void) * Check need_resched() again with interrupts disabled * to avoid a race. 
*/ - if (!need_resched()) + if (!need_resched()) { + stop_hz_timer(); cede_processor(); - else + } else local_irq_enable(); HMT_medium(); diff -puN arch/powerpc/kernel/traps.c~no_idle_hz arch/powerpc/kernel/traps.c --- linux-2.6.15-rc3-mm1/arch/powerpc/kernel/traps.c~no_idle_hz 2005-12-01 16:06:28.000000000 -0800 +++ linux-2.6.15-rc3-mm1-root/arch/powerpc/kernel/traps.c 2005-12-01 16:06:28.000000000 -0800 @@ -40,6 +40,7 @@ #include #include #include +#include #ifdef CONFIG_PPC32 #include #endif @@ -889,6 +890,7 @@ void altivec_unavailable_exception(struc #if defined(CONFIG_PPC64) || defined(CONFIG_E500) void performance_monitor_exception(struct pt_regs *regs) { + start_hz_timer(regs); perf_irq(regs); } #endif diff -puN arch/powerpc/kernel/idle_64.c~no_idle_hz arch/powerpc/kernel/idle_64.c --- linux-2.6.15-rc3-mm1/arch/powerpc/kernel/idle_64.c~no_idle_hz 2005-12-01 16:06:28.000000000 -0800 +++ linux-2.6.15-rc3-mm1-root/arch/powerpc/kernel/idle_64.c 2005-12-01 16:18:52.000000000 -0800 @@ -66,8 +66,12 @@ void native_idle(void) while (1) { ppc64_runlatch_off(); - if (!need_resched()) - power4_idle(); + local_irq_disable(); + if (!need_resched()) { + stop_hz_timer(); + local_irq_enable(); + power4_idle(); + } if (need_resched()) { ppc64_runlatch_on(); _ -------------- next part -------------- This patch is a quick hack to get decrementer statistics. Not meant for inclusion. 
--- diff -puN arch/powerpc/kernel/time.c~debug arch/powerpc/kernel/time.c --- linux-2.6.15-rc3-mm1/arch/powerpc/kernel/time.c~debug 2005-12-01 16:19:07.000000000 -0800 +++ linux-2.6.15-rc3-mm1-root/arch/powerpc/kernel/time.c 2005-12-01 16:19:07.000000000 -0800 @@ -489,6 +489,8 @@ static inline int clear_hzless_mask(void */ void timer_interrupt(struct pt_regs * regs) { + int cpu = smp_processor_id(); + #ifdef CONFIG_PPC32 if (atomic_read(&ppc_n_lost_interrupts) != 0) do_IRQ(regs); @@ -498,6 +500,8 @@ void timer_interrupt(struct pt_regs * re clear_hzless_mask(); + kstat_cpu(cpu).irqs[0]++; + profile_tick(CPU_PROFILING, regs); #ifdef CONFIG_PPC_ISERIES @@ -548,6 +552,9 @@ void stop_hz_timer(void) if (sysctl_hz_timer != 0) return; + if (cpu_isset(cpu, nohz_cpu_mask)) + return; + cpu_set(cpu, nohz_cpu_mask); smp_mb(); if (rcu_pending(cpu) || local_softirq_pending()) { diff -puN arch/powerpc/kernel/idle_64.c~debug arch/powerpc/kernel/idle_64.c --- linux-2.6.15-rc3-mm1/arch/powerpc/kernel/idle_64.c~debug 2005-12-01 16:19:07.000000000 -0800 +++ linux-2.6.15-rc3-mm1-root/arch/powerpc/kernel/idle_64.c 2005-12-01 16:19:07.000000000 -0800 @@ -41,6 +41,11 @@ void default_idle(void) while (!need_resched() && !cpu_is_offline(cpu)) { ppc64_runlatch_off(); + local_irq_disable(); + if (!need_resched()) + stop_hz_timer(); + local_irq_enable(); + /* * Go into low thread priority and possibly * low power mode. 
diff -puN arch/powerpc/kernel/irq.c~debug arch/powerpc/kernel/irq.c --- linux-2.6.15-rc3-mm1/arch/powerpc/kernel/irq.c~debug 2005-12-01 16:19:07.000000000 -0800 +++ linux-2.6.15-rc3-mm1-root/arch/powerpc/kernel/irq.c 2005-12-01 16:19:07.000000000 -0800 @@ -107,6 +107,10 @@ int show_interrupts(struct seq_file *p, for_each_online_cpu(j) seq_printf(p, "CPU%d ", j); seq_putc(p, '\n'); + seq_printf(p, "%3d: ", i); + for_each_online_cpu(j) + seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); + seq_putc(p, '\n'); } if (i < NR_IRQS) { diff -puN arch/powerpc/platforms/pseries/setup.c~debug arch/powerpc/platforms/pseries/setup.c --- linux-2.6.15-rc3-mm1/arch/powerpc/platforms/pseries/setup.c~debug 2005-12-01 16:19:07.000000000 -0800 +++ linux-2.6.15-rc3-mm1-root/arch/powerpc/platforms/pseries/setup.c 2005-12-01 16:19:07.000000000 -0800 @@ -498,6 +498,11 @@ static void pseries_dedicated_idle(void) while (!need_resched() && !cpu_is_offline(cpu)) { ppc64_runlatch_off(); + local_irq_disable(); + if (!need_resched()) + stop_hz_timer(); + local_irq_enable(); + /* * Go into low thread priority and possibly * low power mode. _ From linas at austin.ibm.com Fri Dec 2 03:57:22 2005 From: linas at austin.ibm.com (linas) Date: Thu, 1 Dec 2005 10:57:22 -0600 Subject: Booting OS on PowerPC In-Reply-To: <20051201084811.53930.qmail@web8508.mail.in.yahoo.com> References: <20051201084811.53930.qmail@web8508.mail.in.yahoo.com> Message-ID: <20051201165721.GJ31651@austin.ibm.com> On Thu, Dec 01, 2005 at 12:48:11AM -0800, veera venkata prasad j was heard to remark: > Can any body tell me how Linux boot on PowerPC machine > when Open Firmware is up. To be more preciese, what is > the "known-environment" that the OS expect from Open > Firmware. Can you be more specific? What answer are you looking for? 
--linas From linas at austin.ibm.com Fri Dec 2 11:42:32 2005 From: linas at austin.ibm.com (linas) Date: Thu, 1 Dec 2005 18:42:32 -0600 Subject: [PATCH] powerpc/pseries: dlpar-add crash on null pointer deref Message-ID: <20051202004232.GN31651@austin.ibm.com> Paul, Please apply. This patch fixes a crash on null-pointer deref during dlpar slot addition. Signed-off-by: Linas Vepstas Index: linux-2.6.15-rc3-mm1/arch/powerpc/platforms/pseries/eeh.c =================================================================== --- linux-2.6.15-rc3-mm1.orig/arch/powerpc/platforms/pseries/eeh.c 2005-12-01 17:30:21.000000000 -0600 +++ linux-2.6.15-rc3-mm1/arch/powerpc/platforms/pseries/eeh.c 2005-12-01 18:18:29.808112099 -0600@@ -698,7 +698,7 @@ int enable; struct pci_dn *pdn = PCI_DN(dn); - pdn->class_code = *class_code; + pdn->class_code = 0; pdn->eeh_mode = 0; pdn->eeh_check_count = 0; pdn->eeh_freeze_count = 0; @@ -715,6 +715,7 @@ pdn->eeh_mode |= EEH_MODE_NOCHECK; return NULL; } + pdn->class_code = *class_code; /* * Now decide if we are going to "Disable" EEH checking From linas at austin.ibm.com Fri Dec 2 11:56:14 2005 From: linas at austin.ibm.com (linas) Date: Thu, 1 Dec 2005 18:56:14 -0600 Subject: [PATCH 1/2] PCI Hotplug/powerpc: remove duplicated code Message-ID: <20051202005614.GO31651@austin.ibm.com> Greg, Please apply! --linas The RPAPHP code contains a routine that duplicates some existing code. This patch removes the rpaphp version of the code. 
Signed-off-by: Linas Vepstas Index: linux-2.6.15-rc3-mm1/drivers/pci/hotplug/rpaphp_pci.c =================================================================== --- linux-2.6.15-rc3-mm1.orig/drivers/pci/hotplug/rpaphp_pci.c 2005-12-01 18:36:40.897900661 -0600 +++ linux-2.6.15-rc3-mm1/drivers/pci/hotplug/rpaphp_pci.c 2005-12-01 18:51:18.686712139 -0600 @@ -287,18 +287,6 @@ return dev; } -void rpaphp_eeh_init_nodes(struct device_node *dn) -{ - struct device_node *sib; - - for (sib = dn->child; sib; sib = sib->sibling) - rpaphp_eeh_init_nodes(sib); - eeh_add_device_early(dn); - return; - -} -EXPORT_SYMBOL_GPL(rpaphp_eeh_init_nodes); - static void print_slot_pci_funcs(struct pci_bus *bus) { struct device_node *dn; @@ -324,7 +312,7 @@ if (!dn) goto exit; - rpaphp_eeh_init_nodes(dn); + eeh_add_device_tree_early(dn); dev = rpaphp_pci_config_slot(bus); if (!dev) { err("%s: can't find any devices.\n", __FUNCTION__); Index: linux-2.6.15-rc3-mm1/drivers/pci/hotplug/rpadlpar_core.c =================================================================== --- linux-2.6.15-rc3-mm1.orig/drivers/pci/hotplug/rpadlpar_core.c 2005-12-01 18:36:40.898900520 -0600 +++ linux-2.6.15-rc3-mm1/drivers/pci/hotplug/rpadlpar_core.c 2005-12-01 18:51:18.687711999 -0600 @@ -154,7 +154,8 @@ struct pci_controller *phb = pdn->phb; struct pci_dev *dev = NULL; - rpaphp_eeh_init_nodes(dn); + eeh_add_device_tree_early(dn); + /* Add EADS device to PHB bus, adding new entry to bus->devices */ dev = of_create_pci_dev(dn, phb->bus, pdn->devfn); if (!dev) { From linas at austin.ibm.com Fri Dec 2 11:59:58 2005 From: linas at austin.ibm.com (linas) Date: Thu, 1 Dec 2005 18:59:58 -0600 Subject: [PATCH 2/2] PCI Hotplug/powerpc: more removal of duplicated code In-Reply-To: <20051202005614.GO31651@austin.ibm.com> References: <20051202005614.GO31651@austin.ibm.com> Message-ID: <20051202005957.GP31651@austin.ibm.com> Greg, Please apply! John Rose, Please review this code! 
--linas The RPAPHP code contains two routines that appear to be gratuitous copies of very similar pci code. In particular, rpaphp_claim_resource ~~ pci_claim_resource (there is a minor, non-functional difference) rpadlpar_claim_one_bus == pcibios_claim_one_bus (the code is identical) This patch removes the rpaphp versions of the code. Signed-off-by: Linas Vepstas Index: linux-2.6.15-rc3-mm1/drivers/pci/hotplug/rpaphp_pci.c =================================================================== --- linux-2.6.15-rc3-mm1.orig/drivers/pci/hotplug/rpaphp_pci.c 2005-12-01 18:51:18.686712139 -0600 +++ linux-2.6.15-rc3-mm1/drivers/pci/hotplug/rpaphp_pci.c 2005-12-01 18:51:26.357635444 -0600 @@ -62,28 +62,6 @@ } EXPORT_SYMBOL_GPL(rpaphp_find_pci_bus); -int rpaphp_claim_resource(struct pci_dev *dev, int resource) -{ - struct resource *res = &dev->resource[resource]; - struct resource *root = pci_find_parent_resource(dev, res); - char *dtype = resource < PCI_BRIDGE_RESOURCES ? "device" : "bridge"; - int err = -EINVAL; - - if (root != NULL) { - err = request_resource(root, res); - } - - if (err) { - err("PCI: %s region %d of %s %s [%lx:%lx]\n", - root ? 
"Address space collision on" : - "No parent found for", - resource, dtype, pci_name(dev), res->start, res->end); - } - return err; -} - -EXPORT_SYMBOL_GPL(rpaphp_claim_resource); - static int rpaphp_get_sensor_state(struct slot *slot, int *state) { int rc; @@ -177,7 +155,7 @@ if (r->parent || !r->start || !r->flags) continue; - rpaphp_claim_resource(dev, i); + pci_claim_resource(dev, i); } } } Index: linux-2.6.15-rc3-mm1/drivers/pci/hotplug/rpadlpar_core.c =================================================================== --- linux-2.6.15-rc3-mm1.orig/drivers/pci/hotplug/rpadlpar_core.c 2005-12-01 18:51:18.687711999 -0600 +++ linux-2.6.15-rc3-mm1/drivers/pci/hotplug/rpadlpar_core.c 2005-12-01 18:51:26.358635304 -0600 @@ -112,28 +112,6 @@ return NULL; } -static void rpadlpar_claim_one_bus(struct pci_bus *b) -{ - struct list_head *ld; - struct pci_bus *child_bus; - - for (ld = b->devices.next; ld != &b->devices; ld = ld->next) { - struct pci_dev *dev = pci_dev_b(ld); - int i; - - for (i = 0; i < PCI_NUM_RESOURCES; i++) { - struct resource *r = &dev->resource[i]; - - if (r->parent || !r->start || !r->flags) - continue; - rpaphp_claim_resource(dev, i); - } - } - - list_for_each_entry(child_bus, &b->children, node) - rpadlpar_claim_one_bus(child_bus); -} - static struct pci_dev *dlpar_find_new_dev(struct pci_bus *parent, struct device_node *dev_dn) { @@ -171,7 +149,7 @@ rpaphp_init_new_devs(dev->subordinate); /* Claim new bus resources */ - rpadlpar_claim_one_bus(dev->bus); + pcibios_claim_one_bus(dev->bus); /* ioremap() for child bus, which may or may not succeed */ (void) remap_bus_range(dev->bus); From greg at kroah.com Fri Dec 2 12:07:07 2005 From: greg at kroah.com (Greg KH) Date: Thu, 1 Dec 2005 17:07:07 -0800 Subject: [PATCH 1/2] PCI Hotplug/powerpc: remove duplicated code In-Reply-To: <20051202005614.GO31651@austin.ibm.com> References: <20051202005614.GO31651@austin.ibm.com> Message-ID: <20051202010707.GA29258@kroah.com> On Thu, Dec 01, 2005 at 06:56:14PM 
-0600, linas wrote: > > Greg, > Please apply! I need an ack from John before I'll apply either of these. John? thanks, greg k-h From kravetz at us.ibm.com Fri Dec 2 12:22:08 2005 From: kravetz at us.ibm.com (Mike Kravetz) Date: Thu, 1 Dec 2005 17:22:08 -0800 Subject: [PATCH] powerpc/pseries: dlpar-add crash on null pointer deref In-Reply-To: <20051202004232.GN31651@austin.ibm.com> References: <20051202004232.GN31651@austin.ibm.com> Message-ID: <20051202012208.GB9576@monkey.ibm.com> On Thu, Dec 01, 2005 at 06:42:32PM -0600, linas wrote: > > This patch fixs a crash on null-pointer deref during dlpar slot addition. Just curious is this specific to adapters? I experienced a crash when trying to add CPUs. Haven't debugged it yet. But, was able to successfully add memory. -- Mike From kravetz at us.ibm.com Fri Dec 2 12:28:39 2005 From: kravetz at us.ibm.com (Mike Kravetz) Date: Thu, 1 Dec 2005 17:28:39 -0800 Subject: [PATCH] numa placement for dynamically added memory Message-ID: <20051202012839.GA14241@monkey.ibm.com> This patch places dynamically added memory within the appropriate numa node. A new routine hot_add_scn_to_nid() replicates most of the memory scanning code in parse_numa_properties(). I'd appreciate it if Anton or Nathan could take a look. I seem to break something every time I touch numa.c. 
This patch depends on the patch I sent yesterday that hits numa.c http://ozlabs.org/pipermail/linuxppc64-dev/2005-December/006923.html Signed-off-by: Mike Kravetz diff -Naupr linux-2.6.15-rc4.dep/arch/powerpc/mm/mem.c linux-2.6.15-rc4.work/arch/powerpc/mm/mem.c --- linux-2.6.15-rc4.dep/arch/powerpc/mm/mem.c 2005-12-01 06:25:15.000000000 +0000 +++ linux-2.6.15-rc4.work/arch/powerpc/mm/mem.c 2005-12-02 00:11:19.000000000 +0000 @@ -121,11 +121,15 @@ void online_page(struct page *page) */ int __devinit add_memory(u64 start, u64 size) { - struct pglist_data *pgdata = NODE_DATA(0); + struct pglist_data *pgdata; struct zone *zone; + int nid; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; + nid = hot_add_scn_to_nid(start); + pgdata = NODE_DATA(nid); + start += KERNELBASE; create_section_mapping(start, start + size); diff -Naupr linux-2.6.15-rc4.dep/arch/powerpc/mm/numa.c linux-2.6.15-rc4.work/arch/powerpc/mm/numa.c --- linux-2.6.15-rc4.dep/arch/powerpc/mm/numa.c 2005-12-01 19:46:21.000000000 +0000 +++ linux-2.6.15-rc4.work/arch/powerpc/mm/numa.c 2005-12-02 00:11:19.000000000 +0000 @@ -37,6 +37,7 @@ EXPORT_SYMBOL(node_data); static bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES]; static int min_common_depth; +static int n_mem_addr_cells, n_mem_size_cells; /* * We need somewhere to store start/end/node for each region until we have @@ -267,7 +268,11 @@ static void __init get_n_mem_cells(int * of_node_put(memory); } +#ifdef CONFIG_MEMORY_HOTPLUG +static unsigned long read_n_cells(int n, unsigned int **buf) +#else static unsigned long __init read_n_cells(int n, unsigned int **buf) +#endif { unsigned long result = 0; @@ -374,7 +379,6 @@ static int __init parse_numa_properties( { struct device_node *cpu = NULL; struct device_node *memory = NULL; - int n_addr_cells, n_size_cells; int max_domain; unsigned long i; @@ -413,7 +417,7 @@ static int __init parse_numa_properties( } } - get_n_mem_cells(&n_addr_cells, &n_size_cells); + 
get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells); memory = NULL; while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { unsigned long start; @@ -430,8 +434,8 @@ static int __init parse_numa_properties( ranges = memory->n_addrs; new_range: /* these are order-sensitive, and modify the buffer pointer */ - start = read_n_cells(n_addr_cells, &memcell_buf); - size = read_n_cells(n_size_cells, &memcell_buf); + start = read_n_cells(n_mem_addr_cells, &memcell_buf); + size = read_n_cells(n_mem_size_cells, &memcell_buf); numa_domain = of_node_numa_domain(memory); @@ -717,3 +721,50 @@ static int __init early_numa(char *p) return 0; } early_param("numa", early_numa); + +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * Find the node associated with a hot added memory section. Section + * corresponds to a SPARSEMEM section, not an LMB. It is assumed that + * sections are fully contained within a single LMB. + */ +int hot_add_scn_to_nid(unsigned long scn_addr) +{ + struct device_node *memory = NULL; + + if (!numa_enabled || (min_common_depth < 0)) + return 0; + + while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { + unsigned long start, size; + int numa_domain, ranges; + unsigned int *memcell_buf; + unsigned int len; + + memcell_buf = (unsigned int *)get_property(memory, "reg", &len); + if (!memcell_buf || len <= 0) + continue; + + ranges = memory->n_addrs; /* ranges in cell */ +ha_new_range: + start = read_n_cells(n_mem_addr_cells, &memcell_buf); + size = read_n_cells(n_mem_size_cells, &memcell_buf); + numa_domain = of_node_numa_domain(memory); + + /* Domains not present at boot default to 0 */ + if (!node_online(numa_domain)) + numa_domain = 0; + + if ((scn_addr >= start) && (scn_addr < (start + size))) { + of_node_put(memory); + return numa_domain; + } + + if (--ranges) /* process all ranges in cell */ + goto ha_new_range; + } + + BUG(); /* section address should be found above */ + return 0; +} +#endif /* CONFIG_MEMORY_HOTPLUG */ diff -Naupr 
linux-2.6.15-rc4.dep/include/asm-powerpc/sparsemem.h linux-2.6.15-rc4.work/include/asm-powerpc/sparsemem.h --- linux-2.6.15-rc4.dep/include/asm-powerpc/sparsemem.h 2005-12-01 06:25:15.000000000 +0000 +++ linux-2.6.15-rc4.work/include/asm-powerpc/sparsemem.h 2005-12-01 19:57:03.000000000 +0000 @@ -13,6 +13,11 @@ #ifdef CONFIG_MEMORY_HOTPLUG extern void create_section_mapping(unsigned long start, unsigned long end); +#ifdef CONFIG_NUMA +extern int hot_add_scn_to_nid(unsigned long scn_addr); +#else +#define hot_add_scn_to_nid(scn_addr) (0) +#endif #endif /* CONFIG_MEMORY_HOTPLUG */ #endif /* CONFIG_SPARSEMEM */ From linas at austin.ibm.com Fri Dec 2 13:22:53 2005 From: linas at austin.ibm.com (linas) Date: Thu, 1 Dec 2005 20:22:53 -0600 Subject: [PATCH] powerpc/pseries: dlpar-add crash on null pointer deref In-Reply-To: <20051202012208.GB9576@monkey.ibm.com> References: <20051202004232.GN31651@austin.ibm.com> <20051202012208.GB9576@monkey.ibm.com> Message-ID: <20051202022253.GQ31651@austin.ibm.com> On Thu, Dec 01, 2005 at 05:22:08PM -0800, Mike Kravetz was heard to remark: > On Thu, Dec 01, 2005 at 06:42:32PM -0600, linas wrote: > > > > This patch fixs a crash on null-pointer deref during dlpar slot addition. > > Just curious is this specific to adapters? I experienced a crash when > trying to add CPUs. Haven't debugged it yet. But, was able to > successfully add memory. This should only affect PCI devices. --linas From ntl at pobox.com Fri Dec 2 14:02:30 2005 From: ntl at pobox.com (Nathan Lynch) Date: Thu, 1 Dec 2005 22:02:30 -0500 Subject: [PATCH] numa placement for dynamically added memory In-Reply-To: <20051202012839.GA14241@monkey.ibm.com> References: <20051202012839.GA14241@monkey.ibm.com> Message-ID: <20051202030229.GA7836@localhost.localdomain> Hi Mike- Mike Kravetz wrote: > This patch places dynamically added memory within the appropriate > numa node. A new routine hot_add_scn_to_nid() replicates most of > the memory scanning code in parse_numa_properties(). 
> > +#ifdef CONFIG_MEMORY_HOTPLUG > +static unsigned long read_n_cells(int n, unsigned int **buf) > +#else > static unsigned long __init read_n_cells(int n, unsigned int **buf) > +#endif Any reason not to use __devinit here? Or maybe look into devising a macro like __cpuinit for memory hotplug. > +#ifdef CONFIG_MEMORY_HOTPLUG > +/* > + * Find the node associated with a hot added memory section. Section > + * corresponds to a SPARSEMEM section, not an LMB. It is assumed that > + * sections are fully contained within a single LMB. > + */ > +int hot_add_scn_to_nid(unsigned long scn_addr) > +{ > + struct device_node *memory = NULL; > + > + if (!numa_enabled || (min_common_depth < 0)) > + return 0; > + > + while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { > + unsigned long start, size; > + int numa_domain, ranges; > + unsigned int *memcell_buf; > + unsigned int len; > + > + memcell_buf = (unsigned int *)get_property(memory, "reg", &len); > + if (!memcell_buf || len <= 0) > + continue; > + > + ranges = memory->n_addrs; /* ranges in cell */ > +ha_new_range: > + start = read_n_cells(n_mem_addr_cells, &memcell_buf); > + size = read_n_cells(n_mem_size_cells, &memcell_buf); > + numa_domain = of_node_numa_domain(memory); > + > + /* Domains not present at boot default to 0 */ > + if (!node_online(numa_domain)) > + numa_domain = 0; Nope, 0 is not always a valid node on pSeries lpar. I suggest using any_online_node(), or revisiting the idea of logical<->physical mapping of node/domain ids. I tried the latter a few months ago but I've been working on other stuff lately and haven't been able to revisit it. > +#ifdef CONFIG_NUMA > +extern int hot_add_scn_to_nid(unsigned long scn_addr); > +#else > +#define hot_add_scn_to_nid(scn_addr) (0) > +#endif Make hot_add_scn_to_nid a static inline in the !CONFIG_NUMA case, please. 
Nathan From paulus at samba.org Fri Dec 2 15:57:05 2005 From: paulus at samba.org (Paul Mackerras) Date: Fri, 2 Dec 2005 15:57:05 +1100 Subject: please pull powerpc-merge.git Message-ID: <17295.54305.921349.174302@cargo.ozlabs.ibm.com> Linus, Please pull git://git.kernel.org/pub/scm/linux/kernel/git/paulus/powerpc-merge.git There is a fix for a bug in making IOMMU entries on partitioned pSeries systems when 64k pages are used, and a correction for the help text of a config option. Thanks, Paul. Michal Ostrowski: powerpc/pseries: Fix TCE building with 64k pagesize Olaf Hering: powerpc: correct the NR_CPUS description text arch/powerpc/Kconfig | 2 +- arch/powerpc/platforms/pseries/iommu.c | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) From olof at lixom.net Fri Dec 2 16:09:03 2005 From: olof at lixom.net (Olof Johansson) Date: Thu, 1 Dec 2005 23:09:03 -0600 Subject: please pull powerpc-merge.git In-Reply-To: <17295.54305.921349.174302@cargo.ozlabs.ibm.com> References: <17295.54305.921349.174302@cargo.ozlabs.ibm.com> Message-ID: <20051202050903.GC13870@pb15.lixom.net> On Fri, Dec 02, 2005 at 03:57:05PM +1100, Paul Mackerras wrote: > Michal Ostrowski: > powerpc/pseries: Fix TCE building with 64k pagesize Did I miss this one when it went by on the list, or was it never posted there? That's not a good way to do it -- tce_build_pSeriesLP will be called for 1 64K page, but it will actually insert 16 4K pages. It's definitely a case for buildmulti. I suggest the following instead. Thanks, Olof ---- Fix adjustment of TCE_PAGE_FACTOR in fallbacks to tce_build_pSeriesLP.
Signed-off-by: Olof Johansson Index: 2.6/arch/powerpc/platforms/pseries/iommu.c =================================================================== --- 2.6.orig/arch/powerpc/platforms/pseries/iommu.c 2005-11-29 09:11:47.000000000 -0600 +++ 2.6/arch/powerpc/platforms/pseries/iommu.c 2005-12-01 23:06:36.000000000 -0600 @@ -147,7 +147,8 @@ static void tce_buildmulti_pSeriesLP(str npages <<= TCE_PAGE_FACTOR; if (npages == 1) - return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, + return tce_build_pSeriesLP(tbl, tcenum >> TCE_PAGE_FACTOR, + npages >> TCE_PAGE_FACTOR, uaddr, direction); tcep = __get_cpu_var(tce_page); @@ -159,7 +160,8 @@ static void tce_buildmulti_pSeriesLP(str tcep = (void *)__get_free_page(GFP_ATOMIC); /* If allocation fails, fall back to the loop implementation */ if (!tcep) - return tce_build_pSeriesLP(tbl, tcenum, npages, + return tce_build_pSeriesLP(tbl, tcenum >> TCE_PAGE_FACTOR, + npages >> TCE_PAGE_FACTOR, uaddr, direction); __get_cpu_var(tce_page) = tcep; } From olof at lixom.net Fri Dec 2 16:13:55 2005 From: olof at lixom.net (Olof Johansson) Date: Thu, 1 Dec 2005 23:13:55 -0600 Subject: please pull powerpc-merge.git In-Reply-To: <20051202050903.GC13870@pb15.lixom.net> References: <17295.54305.921349.174302@cargo.ozlabs.ibm.com> <20051202050903.GC13870@pb15.lixom.net> Message-ID: <20051202051355.GD13870@pb15.lixom.net> On Thu, Dec 01, 2005 at 11:09:03PM -0600, olof wrote: > That's not a good way to do it -- tce_build_pSeriesLP will be called > for 1 64K page, but it will actually insert 16 4K pages. It's definately > a case for buildmulti. > > I suggest the following instead. ..and I forgot to include the first fix of regular build_pSeriesLP. Crap. New patch: --- Fix adjustment of TCE_PAGE_FACTOR in tce_build_pSeriesLP and fallbacks to it. 
Signed-off-by: Olof Johansson Index: 2.6/arch/powerpc/platforms/pseries/iommu.c =================================================================== --- 2.6.orig/arch/powerpc/platforms/pseries/iommu.c 2005-11-29 09:11:47.000000000 -0600 +++ 2.6/arch/powerpc/platforms/pseries/iommu.c 2005-12-01 23:12:57.000000000 -0600 @@ -109,6 +109,9 @@ static void tce_build_pSeriesLP(struct i u64 rc; union tce_entry tce; + tcenum <<= TCE_PAGE_FACTOR; + npages <<= TCE_PAGE_FACTOR; + tce.te_word = 0; tce.te_rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT; tce.te_rdwr = 1; @@ -147,7 +150,8 @@ static void tce_buildmulti_pSeriesLP(str npages <<= TCE_PAGE_FACTOR; if (npages == 1) - return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, + return tce_build_pSeriesLP(tbl, tcenum >> TCE_PAGE_FACTOR, + npages >>TCE_PAGE_FACTOR, uaddr, direction); tcep = __get_cpu_var(tce_page); @@ -159,7 +163,8 @@ static void tce_buildmulti_pSeriesLP(str tcep = (void *)__get_free_page(GFP_ATOMIC); /* If allocation fails, fall back to the loop implementation */ if (!tcep) - return tce_build_pSeriesLP(tbl, tcenum, npages, + return tce_build_pSeriesLP(tbl, tcenum >> TCE_PAGE_FACTOR, + npages >> TCE_PAGE_FACTOR, uaddr, direction); __get_cpu_var(tce_page) = tcep; } From paulus at samba.org Fri Dec 2 16:39:58 2005 From: paulus at samba.org (Paul Mackerras) Date: Fri, 2 Dec 2005 16:39:58 +1100 Subject: please pull powerpc-merge.git In-Reply-To: <20051202050903.GC13870@pb15.lixom.net> References: <17295.54305.921349.174302@cargo.ozlabs.ibm.com> <20051202050903.GC13870@pb15.lixom.net> Message-ID: <17295.56878.842081.525602@cargo.ozlabs.ibm.com> Olof Johansson writes: > On Fri, Dec 02, 2005 at 03:57:05PM +1100, Paul Mackerras wrote: > > > Michal Ostrowski: > > powerpc/pseries: Fix TCE building with 64k pagesize > > Did I miss this one when it went by on the list, or was it never posted > there? Michal sent it just to me, for some reason. I convinced myself that it did actually fix a bug, so I sent it on. 
Next time maybe Michal can cc linuxppc64-dev. > That's not a good way to do it -- tce_build_pSeriesLP will be called > for 1 64K page, but it will actually insert 16 4K pages. It's definately > a case for buildmulti. > > I suggest the following instead. Or better still, we could do: if (TCE_PAGE_FACTOR == 0 && npages == 1) return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, direction); which will let the whole tce_build_pSeriesLP call get optimized out when we have 64k pages selected. Paul. From olof at lixom.net Fri Dec 2 16:57:21 2005 From: olof at lixom.net (Olof Johansson) Date: Thu, 1 Dec 2005 23:57:21 -0600 Subject: please pull powerpc-merge.git In-Reply-To: <17295.56878.842081.525602@cargo.ozlabs.ibm.com> References: <17295.54305.921349.174302@cargo.ozlabs.ibm.com> <20051202050903.GC13870@pb15.lixom.net> <17295.56878.842081.525602@cargo.ozlabs.ibm.com> Message-ID: <20051202055721.GG13870@pb15.lixom.net> On Fri, Dec 02, 2005 at 04:39:58PM +1100, Paul Mackerras wrote: > > I suggest the following instead. > > Or better still, we could do: > > if (TCE_PAGE_FACTOR == 0 && npages == 1) > return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, > direction); > > which will let the whole tce_build_pSeriesLP call get optimized out > when we have 64k pages selected. Yep, that's even better. Yet another twist is to do: if ((npages << TCE_PAGE_FACTOR) == 1) Same result, maybe a little easier to read. Patch below if it's in your taste, if not go with what you have. :) -Olof --- Fix adjustment of TCE_PAGE_FACTOR in fallbacks to tce_build_pSeriesLP. 
Signed-off-by: Olof Johansson Index: 2.6/arch/powerpc/platforms/pseries/iommu.c =================================================================== --- 2.6.orig/arch/powerpc/platforms/pseries/iommu.c 2005-11-29 09:11:47.000000000 -0600 +++ 2.6/arch/powerpc/platforms/pseries/iommu.c 2005-12-01 23:53:04.000000000 -0600 @@ -109,6 +109,9 @@ static void tce_build_pSeriesLP(struct i u64 rc; union tce_entry tce; + tcenum <<= TCE_PAGE_FACTOR; + npages <<= TCE_PAGE_FACTOR; + tce.te_word = 0; tce.te_rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT; tce.te_rdwr = 1; @@ -143,10 +146,8 @@ static void tce_buildmulti_pSeriesLP(str union tce_entry tce, *tcep; long l, limit; - tcenum <<= TCE_PAGE_FACTOR; - npages <<= TCE_PAGE_FACTOR; - - if (npages == 1) + /* For performance reasons, only fall back for single TCE insert */ + if ((npages << TCE_PAGE_FACTOR) == 1) return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, direction); @@ -164,6 +165,9 @@ static void tce_buildmulti_pSeriesLP(str __get_cpu_var(tce_page) = tcep; } + tcenum <<= TCE_PAGE_FACTOR; + npages <<= TCE_PAGE_FACTOR; + tce.te_word = 0; tce.te_rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT; tce.te_rdwr = 1; From olof at lixom.net Fri Dec 2 16:59:36 2005 From: olof at lixom.net (Olof Johansson) Date: Thu, 1 Dec 2005 23:59:36 -0600 Subject: please pull powerpc-merge.git In-Reply-To: <20051202055721.GG13870@pb15.lixom.net> References: <17295.54305.921349.174302@cargo.ozlabs.ibm.com> <20051202050903.GC13870@pb15.lixom.net> <17295.56878.842081.525602@cargo.ozlabs.ibm.com> <20051202055721.GG13870@pb15.lixom.net> Message-ID: <20051202055935.GH13870@pb15.lixom.net> On Thu, Dec 01, 2005 at 11:57:21PM -0600, olof wrote: > Yep, that's even better. Yet another twist is to do: > > if ((npages << TCE_PAGE_FACTOR) == 1) Nevermind, my verification of the above was bad, I tried it with a constant instead of variable. GCC isn't smart enough to optimize that away for variable statements. Go with your solution. 
-Olof From mostrows at watson.ibm.com Fri Dec 2 23:54:07 2005 From: mostrows at watson.ibm.com (Michal Ostrowski) Date: Fri, 02 Dec 2005 07:54:07 -0500 Subject: please pull powerpc-merge.git In-Reply-To: <17295.56878.842081.525602@cargo.ozlabs.ibm.com> References: <17295.54305.921349.174302@cargo.ozlabs.ibm.com> <20051202050903.GC13870@pb15.lixom.net> <17295.56878.842081.525602@cargo.ozlabs.ibm.com> Message-ID: <1133528047.8137.80.camel@brick.watson.ibm.com> On Fri, 2005-12-02 at 16:39 +1100, Paul Mackerras wrote: > Olof Johansson writes: > > > On Fri, Dec 02, 2005 at 03:57:05PM +1100, Paul Mackerras wrote: > > > > > Michal Ostrowski: > > > powerpc/pseries: Fix TCE building with 64k pagesize > > > > Did I miss this one when it went by on the list, or was it never posted > > there? > > Michal sent it just to me, for some reason. I convinced myself that > it did actually fix a bug, so I sent it on. Next time maybe Michal > can cc linuxppc64-dev. > Yes... bit of an oops on my part. My original patch fixed a real bug I saw with tce_build_pSeriesLP being called directly, not from tce_buildmulti_pSeriesLP. This was due to the fact that firmware_has_feature(FW_FEATURE_MULTITCE) == 0 (see iommu_init_early_pSeries). -- Michal Ostrowski From johnrose at austin.ibm.com Sat Dec 3 03:46:28 2005 From: johnrose at austin.ibm.com (John Rose) Date: Fri, 02 Dec 2005 10:46:28 -0600 Subject: [PATCH 1/2] PCI Hotplug/powerpc: remove duplicated code In-Reply-To: <20051202005614.GO31651@austin.ibm.com> References: <20051202005614.GO31651@austin.ibm.com> Message-ID: <1133541988.9364.8.camel@sinatra.austin.ibm.com> The RPAPHP code contains a routine that duplicates some existing code. This patch removes the rpaphp version of the code. 
Signed-off-by: Linas Vepstas Acked-by: John Rose From msdemlei at cl.uni-heidelberg.de Sat Dec 3 02:30:50 2005 From: msdemlei at cl.uni-heidelberg.de (Markus Demleitner) Date: Fri, 2 Dec 2005 16:30:50 +0100 Subject: Windfarm/modules trouble Message-ID: <20051202153050.GA1368@victor.cl.uni-heidelberg.de> Hi, I've been trying out the windfarm system in 2.6.15-rc3 on a iMac G5 today. I like the general architecture a lot, but of course with abstraction comes a somewhat steep learning curve, in particular if you (like me) aren't really a kernel guy. So, sorry for not sending useful patches. First off, compiling the stuff statically works, but the fans are more active than they are in OS X or with my hacked simpleTemp. I wanted to find out why, and to save me some rebooting, I tried to compile windfarm as modules. Minor trouble: windfarm_pid.c is missing MODULE_AUTHOR("Benjamin Herrenschmidt "); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("PID algorithm for thermal control"); at its end, so you get "kernel tainted" messages. What is worse, the control loop doesn't run after you say modprobe windfarm_pm81. To get some idea why, I sprinkled windfarm_core with DBG-statements. wf_notify says "No notifiers" if the notifier chain is empty and otherwise prints the list head and next. Here's what happens: Dec 2 15:59:10 miller kernel: windfarm: Initializing for iMacG5 model ID 5 Dec 2 15:59:10 miller kernel: Windfarm compatible, init: 0 Dec 2 15:59:10 miller kernel: wf: Registered control system-fan Dec 2 15:59:10 miller kernel: No notifiers! 
Dec 2 15:59:10 miller kernel: wf: Registered control cpu-fan Dec 2 15:59:10 miller kernel: wf: Registered sensor cpu-temp Dec 2 15:59:10 miller kernel: wf: Registered sensor cpu-current Dec 2 15:59:10 miller kernel: wf: Registered sensor cpu-voltage Dec 2 15:59:10 miller kernel: wf: Registered sensor cpu-power Dec 2 15:59:10 miller kernel: wf: Registered sensor hd-temp (up to here, No notifiers continued, but I've clipped it) Dec 2 15:59:10 miller kernel: Driver register. Dec 2 15:59:10 miller kernel: wf... PROBE....register called... (these are my DBGs from windfarm_pm81:wf_smu_probe and wf_register_client, so that one works, so there's now one function in the notifier chain, wf_smu_notify:) Dec 2 15:59:10 miller kernel: Current chain head: 103638 Dec 2 15:59:10 miller kernel: Current chain next: 0 (these continue throughout, but I've clipped them again) Dec 2 15:59:10 miller kernel: wf: new control cpu-fan detected Dec 2 15:59:10 miller kernel: wf: new control system-fan detected Dec 2 15:59:10 miller kernel: wf: new sensor hd-temp detected Dec 2 15:59:10 miller kernel: wf: new sensor cpu-power detected Dec 2 15:59:10 miller kernel: wf: new sensor cpu-voltage detected Dec 2 15:59:10 miller kernel: wf: new sensor cpu-current detected Dec 2 15:59:10 miller kernel: wf: new sensor cpu-temp detected Dec 2 15:59:10 miller kernel: wf: thread started Dec 2 15:59:10 miller kernel: wf: notify called (this guy comes from wf_thread_func, after time_after_eq) Dec 2 15:59:10 miller kernel: register failed... (and this one now from wf_register_client, after the bail: label. I guess that's where the trouble starts, but I have no idea why this fails) Well, that's it, afterwards one sees the thread running and calling wf_smu_probe, but no pid. Hints, anyone?
Other issues (I haven't really looked into any of them yet): (1) You cannot unload the windfarm_core once it's loaded because there still remain references into windfarm_smu_sensors: miller$ sudo modprobe windfarm_pm81 miller$ lsmod Module Size Used by windfarm_lm75_sensor 8872 1 windfarm_smu_sensors 10864 4 windfarm_smu_controls 8608 2 windfarm_pm81 18216 0 windfarm_core 20824 4 windfarm_lm75_sensor,windfarm_smu_sensors,windfarm_smu_controls,windfarm_pm81 windfarm_pid 4984 1 windfarm_pm81 [crap clipped] miller$ sudo rmmod windfarm_pm81 windfarm_pid windfarm_smu_controls windfarm_lm75_sensor miller$ lsmod Module Size Used by windfarm_smu_sensors 10864 2 windfarm_core 20824 1 windfarm_smu_sensors Of course, kwindfarm still runs. (2) After that, modprobing windfarm_pm81 again results in an Oops: Unable to handle kernel paging request for data at address 0x17f03280302b8b91 Faulting instruction address: 0xc00000000019b098 Oops: Kernel access of bad area, sig: 11 [#1] PREEMPT SMP NR_CPUS=2 POWERMAC Modules linked in: windfarm_lm75_sensor windfarm_smu_controls windfarm_pm81 windfarm_pid windfarm_smu_sensors windfarm_core cpufreq_powersave cpufreq_conservative cpufreq_ondemand usb_storage NIP: C00000000019B098 LR: C00000000032FF90 CTR: C00000000028E250 REGS: c00000001b233680 TRAP: 0300 Not tainted (2.6.15-rc3) MSR: 9000000000009032 CR: 24002488 XER: 20000000 DAR: 17F03280302B8B91, DSISR: 0000000040000000 TASK = c00000001b19e040[1551] 'modprobe' THREAD: c00000001b230000 CPU: 0 GPR00: C00000000032FFCC C00000001B233900 C0000000004BD4C0 17F03280302B8B91 GPR04: C0000000004B53B8 C000000000EE0568 FFFFFFFFFFFFFFED C000000000408D38 GPR08: C0000000004F1C00 C000000000430DA8 0000000000000000 0000000000000000 GPR12: 0000000024002442 C0000000003F7C00 00000000100170B8 0000000000000000 GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR20: 0000000000000000 00000000100013A4 000000001001DF18 000000001001DC98 GPR24: 0000000000000000 0000000000000000 
0000000000000000 0000000000000000 GPR28: D00000000010ACE0 17F03280302B8B79 C00000001B233A80 17F03280302B8B81 NIP [C00000000019B098] .kref_get+0x0/0x24 Call Trace: [C00000001B233990] [C00000000022541C] .next_device+0x10/0x38 [C00000001B233A10] [C0000000002254CC] .bus_for_each_dev+0x88/0xcc [C00000001B233AC0] [C00000000022666C] .driver_attach+0x28/0x40 [C00000001B233B40] [C000000000225C54] .bus_add_driver+0xc8/0x1dc [C00000001B233BF0] [C000000000226D0C] .driver_register+0x58/0x74 [C00000001B233C80] [C00000000028E9C8] .i2c_add_driver+0x78/0x188 [C00000001B233D10] [D000000000109588] .wf_lm75_sensor_init+0x1c/0x40 [windfarm_lm75_sensor] [C00000001B233D90] [C0000000000667AC] .sys_init_module+0x2a0/0x4f8 [C00000001B233E30] [C000000000008600] syscall_exit+0x0/0x18 Instruction dump: 7d635b78 e8010010 eba1ffe8 ebc1fff0 ebe1fff8 7c0803a6 4e800020 7c0c0378 4bffff88 38000001 90030000 4e800020 <80030000> 21200000 7c090114 0b000000 <6>note: modprobe[1551] exited with preempt_count 1 Cheers, Markus From brking at us.ibm.com Sat Dec 3 04:21:19 2005 From: brking at us.ibm.com (Brian King) Date: Fri, 02 Dec 2005 11:21:19 -0600 Subject: p615 boot hang with current GIT kernel Message-ID: <4390828F.6030705@us.ibm.com> I'm having trouble booting the current GIT tree on a p615. Not sure if it is a .config problem or a real bug... There are a few boot messages that look a bit strange and not sure if I should worry about: Failed to request PCI IO region on PCI domain 0000 EEH: event on unsupported device, rc=0 dn=/pci@400000000110/IBM,sp@1 The system then hangs when trying to talk with the CDROM. I tried removing the CDROM, and was then able to get to ipr loading, but it isn't getting any PCI interrupts either... Not sure if this is a PCI interrupt routing issue or not, but that is my hunch at this point. I also tried disabling distributing interrupts to all CPUs, but that didn't help either. Attached is my boot log and .config.
-- Brian King eServer Storage I/O IBM Linux Technology Center -------------- next part -------------- An embedded and charset-unspecified text was scrubbed... Name: boot_hang_short Url: http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051202/e12396e7/attachment.txt -------------- next part -------------- An embedded and charset-unspecified text was scrubbed... Name: boot-hang.config Url: http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051202/e12396e7/attachment-0001.txt From kravetz at us.ibm.com Sat Dec 3 05:43:33 2005 From: kravetz at us.ibm.com (Mike Kravetz) Date: Fri, 2 Dec 2005 10:43:33 -0800 Subject: [PATCH] numa placement for dynamically added memory In-Reply-To: <20051202030229.GA7836@localhost.localdomain> References: <20051202012839.GA14241@monkey.ibm.com> <20051202030229.GA7836@localhost.localdomain> Message-ID: <20051202184333.GB6927@w-mikek2.ibm.com> On Thu, Dec 01, 2005 at 10:02:30PM -0500, Nathan Lynch wrote: > > +#ifdef CONFIG_MEMORY_HOTPLUG > > +static unsigned long read_n_cells(int n, unsigned int **buf) > > +#else > > static unsigned long __init read_n_cells(int n, unsigned int **buf) > > +#endif > > Any reason not to use __devinit here? Or maybe look into devising a > macro like __cpuinit for memory hotplug. Nope that makes sense. Thanks. > > + /* Domains not present at boot default to 0 */ > > + if (!node_online(numa_domain)) > > + numa_domain = 0; > > Nope, 0 is not always a valid node on pSeries lpar. I suggest using > any_online_node(), or revisiting the idea of logical<->physical > mapping of node/domain ids. I tried the latter a few months ago but > I've been working on other stuff lately and haven't been able to > revisit it. Yeah, I can do that. As a side note, it looks like 0 will always be a valid node in the current code. If we successfully execute parse_numa_properties(), then this code will be run. 
for (i = 0; i <= max_domain; i++) node_set_online(i); If we execute setup_nonnuma() instead, then the following is executed: node_set_online(0); I've previously wondered about the above code in parse_numa_properties(). You seem to confirm that is not the desired behavior. Should this be changed? > > +#ifdef CONFIG_NUMA > > +extern int hot_add_scn_to_nid(unsigned long scn_addr); > > +#else > > +#define hot_add_scn_to_nid(scn_addr) (0) > > +#endif > > Make hot_add_scn_to_nid a static inline in the !CONFIG_NUMA case, > please. OK -- Mike From zarniwhoop at ntlworld.com Sat Dec 3 05:49:37 2005 From: zarniwhoop at ntlworld.com (Ken Moffat) Date: Fri, 2 Dec 2005 18:49:37 +0000 (GMT) Subject: atkbd keys missing Message-ID: Hi, my powermac G5 SMU now runs nicely with 2.6.15-rc4 (ignoring sound, of course) except for one awkward problem - I'm using a PC PS/2 keyboard (108 keys, if I can count) through a kvm switch, then a PS2-to-usb adaptor. This is a British keyboard, and two of the keys don't work (nothing at all shows from them, not even scancodes). One of these keys is the 'wake' key (code 143 on my other boxes) which I don't miss, but the other is the '\' key (or '|' when shifted) which I find somewhat important. On british keyboards we fit this between the left shift and z keys, and it shows up as code 86 on my other boxes. I guess not many people use these keyboards on macs (the standard keyboard is almost american), so I'm not surprised this is broken. I'm at a loss to know where to look to try to fix this - there are no logged messages from atkbd about unknown keys (although atkbd is built in to my config), and the keycode tables in drivers/char/keyboard.c and drivers/input/keyboard/atkbd.c seem to be constant for almost all architectures. Is there a powerpc(64) keyboard driver that I'm overlooking, or has anybody any pointers to where I should be looking, please ? 
Ken -- das eine Mal als Trag?die, das andere Mal als Farce From ntl at pobox.com Sat Dec 3 06:20:54 2005 From: ntl at pobox.com (Nathan Lynch) Date: Fri, 2 Dec 2005 14:20:54 -0500 Subject: [PATCH] numa placement for dynamically added memory In-Reply-To: <20051202184333.GB6927@w-mikek2.ibm.com> References: <20051202012839.GA14241@monkey.ibm.com> <20051202030229.GA7836@localhost.localdomain> <20051202184333.GB6927@w-mikek2.ibm.com> Message-ID: <20051202192054.GB7836@localhost.localdomain> Mike Kravetz wrote: > On Thu, Dec 01, 2005 at 10:02:30PM -0500, Nathan Lynch wrote: > > > + /* Domains not present at boot default to 0 */ > > > + if (!node_online(numa_domain)) > > > + numa_domain = 0; > > > > Nope, 0 is not always a valid node on pSeries lpar. I suggest using > > any_online_node(), or revisiting the idea of logical<->physical > > mapping of node/domain ids. I tried the latter a few months ago but > > I've been working on other stuff lately and haven't been able to > > revisit it. > > Yeah, I can do that. As a side note, it looks like 0 will always be a > valid node in the current code. If we successfully execute > parse_numa_properties(), then this code will be run. > > for (i = 0; i <= max_domain; i++) > node_set_online(i); Yes, the code erroneously assumes that we can just mark nodes 0 through max_domain - 1 online. Explained below. > If we execute setup_nonnuma() instead, then the following is executed: > > node_set_online(0); > > I've previously wondered about the above code in parse_numa_properties(). > You seem to confirm that is not the desired behavior. Should this be > changed? I think so. The fundamental issue is that the numa code does not distinguish between logical node numbers and the identifiers given by the platform in the ibm,associativity properties to denote "affinity domains". This is ok for cases such as larger Power4 machines running without a hypervisor and LPARs on smaller Power5 machines (e.g. just 2 nodes). 
But with larger Power5 systems, we're getting into trouble over this. We need to be able to handle situations where the domain numbering as given by the platform doesn't necessarily begin at zero and isn't necessarily continuous -- for example a partition with domains numbered 2, 7, and 9. So I think a logical to "physical" mapping makes sense, similar to what we do for cpus. Nathan From flar at allandria.com Sat Dec 3 06:43:02 2005 From: flar at allandria.com (Brad Boyer) Date: Fri, 2 Dec 2005 11:43:02 -0800 Subject: atkbd keys missing In-Reply-To: References: Message-ID: <20051202194302.GA9023@pants.nu> On Fri, Dec 02, 2005 at 06:49:37PM +0000, Ken Moffat wrote: > Hi, my powermac G5 SMU now runs nicely with 2.6.15-rc4 (ignoring sound, > of course) except for one awkward problem - I'm using a PC PS/2 keyboard > (108 keys, if I can count) through a kvm switch, then a PS2-to-usb > adaptor. This is a British keyboard, and two of the keys don't work > (nothing at all shows from them, not even scancodes). > > I'm at a loss to know where to look to try to fix this - there are no > logged messages from atkbd about unknown keys (although atkbd is built > in to my config), and the keycode tables in drivers/char/keyboard.c and > drivers/input/keyboard/atkbd.c seem to be constant for almost all > architectures. > > Is there a powerpc(64) keyboard driver that I'm overlooking, or has > anybody any pointers to where I should be looking, please ? Since you are running it through a USB adaptor, I would expect it to show up as a USB keyboard to the system. The atkbd driver is for a keyboard directly connected to an AT or PS/2 style port on the system, which actually acts more like a normal serial port. Take a look at usbhid (in drivers/usb/input) to see if that is where it's actually getting handled. You probably have messages in the logs from the USB detection code if that is the case. 
Here's some from one of my boxes: input: USB HID v1.00 Keyboard [Macally Macally iKey ] on usb-0001:02:0b.1-2.2.5.1 The name will probably be the model of the adapter, whereas I have a directly connected USB keyboard in this case. It is possible the adapter is eating the keypress, but it's hard to say without more investigation of the problem. Brad Boyer flar at allandria.com From johnrose at austin.ibm.com Sat Dec 3 07:11:41 2005 From: johnrose at austin.ibm.com (John Rose) Date: Fri, 02 Dec 2005 14:11:41 -0600 Subject: [PATCH 2/2] PCI Hotplug/powerpc: more removal of duplicated code In-Reply-To: <20051202005957.GP31651@austin.ibm.com> References: <20051202005614.GO31651@austin.ibm.com> <20051202005957.GP31651@austin.ibm.com> Message-ID: <1133554301.11039.11.camel@sinatra.austin.ibm.com> The RPAPHP code contains two routines that appear to be gratuitous copies of very similar pci code. In particular, rpaphp_claim_resource ~~ pci_claim_resource (there is a minor, non-functional difference) rpadlpar_claim_one_bus == pcibios_claim_one_bus (the code is identical) This patch removes the rpaphp versions of the code. Signed-off-by: Linas Vepstas Acked-by: John Rose From zarniwhoop at ntlworld.com Sat Dec 3 08:26:39 2005 From: zarniwhoop at ntlworld.com (Ken Moffat) Date: Fri, 2 Dec 2005 21:26:39 +0000 (GMT) Subject: atkbd keys missing In-Reply-To: <20051202194302.GA9023@pants.nu> References: <20051202194302.GA9023@pants.nu> Message-ID: On Fri, 2 Dec 2005, Brad Boyer wrote: >> Is there a powerpc(64) keyboard driver that I'm overlooking, or has >> anybody any pointers to where I should be looking, please ? > > Since you are running it through a USB adaptor, I would expect it to > show up as a USB keyboard to the system. The atkbd driver is for a > keyboard directly connected to an AT or PS/2 style port on the system, > which actually acts more like a normal serial port. Take a look at > usbhid (in drivers/usb/input) to see if that is where it's actually > getting handled. 
You probably have messages in the logs from the USB > detection code if that is the case. Here's some from one of my boxes: > > input: USB HID v1.00 Keyboard [Macally Macally iKey ] on usb-0001:02:0b.1-2.2.5.1 > Brad, thanks for clarifying the role of atkbd. I'll take a look at the usb code, and dig through my logs. Cheers. Ken -- das eine Mal als Trag?die, das andere Mal als Farce From haren at us.ibm.com Sat Dec 3 09:36:53 2005 From: haren at us.ibm.com (Haren Myneni) Date: Fri, 02 Dec 2005 14:36:53 -0800 Subject: compilation error for CONFIG_SMP=n Message-ID: <4390CC85.8030808@us.ibm.com> Getting undeclared symbol `H_SET_ASR' for CONFIG_SMP=n. Thanks Haren -------------- next part -------------- A non-text attachment was scrubbed... Name: UP_compile_error.patch Type: text/x-patch Size: 332 bytes Desc: not available Url : http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051202/df342ccf/attachment.bin From benh at kernel.crashing.org Sat Dec 3 09:31:54 2005 From: benh at kernel.crashing.org (Benjamin Herrenschmidt) Date: Sat, 03 Dec 2005 09:31:54 +1100 Subject: Windfarm/modules trouble In-Reply-To: <20051202153050.GA1368@victor.cl.uni-heidelberg.de> References: <20051202153050.GA1368@victor.cl.uni-heidelberg.de> Message-ID: <1133562714.6100.78.camel@gaston> On Fri, 2005-12-02 at 16:30 +0100, Markus Demleitner wrote: > Hi, > > I've been trying out the windfarm system in 2.6.15-rc3 on a iMac G5 > today. I like the general architecture a lot, but of course with > abstraction comes a somewhat steep learning curve, in particular if > you (like me) aren't really a kernel guy. So, sorry for not sending > useful patches. > > First off, compiling the stuff statically works, but the fans are > more active than they are in OS X or with my hacked simpleTemp. I > wanted to find out why, and to save me some rebooting, I tried to > compile windfarm as modules. Yes, current windfarm has issues being in a module, plus some non-trivial problems with the module refcounting. 
I'll look into it. >From my experience, the fans are not faster than OS X if you also use something like powernowd to throttle down your CPU speed when idle... Ben. From linas at austin.ibm.com Sat Dec 3 11:55:24 2005 From: linas at austin.ibm.com (linas) Date: Fri, 2 Dec 2005 18:55:24 -0600 Subject: [PATCH] powerpc: export pcibios_fixup_new_pci_devices() Message-ID: <20051203005524.GV31651@austin.ibm.com> Hi Paul, Please apply. --linas There is code in the RPAPHP directory that is identical to this routine; I'll be removing that code in an upcoming patch, but this patch is needed to expose the function to make it callable. Signed-off-by: Linas Vepstas Index: linux-2.6.15-rc3-mm1/arch/powerpc/platforms/pseries/pci_dlpar.c =================================================================== --- linux-2.6.15-rc3-mm1.orig/arch/powerpc/platforms/pseries/pci_dlpar.c 2005-12-02 17:30:02.997471195 -0600 +++ linux-2.6.15-rc3-mm1/arch/powerpc/platforms/pseries/pci_dlpar.c 2005-12-02 17:31:37.444176026 -0600 @@ -77,7 +77,7 @@ } /* Must be called before pci_bus_add_devices */ -static void +void pcibios_fixup_new_pci_devices(struct pci_bus *bus, int fix_bus) { struct pci_dev *dev; Index: linux-2.6.15-rc3-mm1/include/asm-powerpc/pci-bridge.h =================================================================== --- linux-2.6.15-rc3-mm1.orig/include/asm-powerpc/pci-bridge.h 2005-12-01 15:17:23.000000000 -0600 +++ linux-2.6.15-rc3-mm1/include/asm-powerpc/pci-bridge.h 2005-12-02 17:34:37.386846527 -0600 @@ -137,6 +137,7 @@ /** Discover new pci devices under this bus, and add them */ void pcibios_add_pci_devices(struct pci_bus * bus); +void pcibios_fixup_new_pci_devices(struct pci_bus *bus, int fix_bus); extern int pcibios_remove_root_bus(struct pci_controller *phb); From linas at austin.ibm.com Sat Dec 3 11:59:52 2005 From: linas at austin.ibm.com (linas) Date: Fri, 2 Dec 2005 18:59:52 -0600 Subject: [PATCH] PCI Error Recovery: documentation Message-ID: 
<20051203005951.GW31651@austin.ibm.com> Greg, Please apply. --linas pci-error-recovery_docs.patch Various PCI bus errors can be signaled by newer PCI controllers. Recovering from those errors requires an infrastructure to notify affected device drivers of the error, and a way of walking through a reset sequence. This patch adds documentation describing the current error recovery proposal. Signed-off-by: Linas Vepstas Documentation/pci-error-recovery.txt | 246 +++++++++++++++++++++++++++++++++++ MAINTAINERS | 7 2 files changed, 253 insertions(+) Index: linux-2.6.14-git10/Documentation/pci-error-recovery.txt =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.14-git10/Documentation/pci-error-recovery.txt 2005-11-07 17:33:26.920560069 -0600 @@ -0,0 +1,246 @@ + + PCI Error Recovery + ------------------ + May 31, 2005 + + Current document maintainer: + Linas Vepstas + + +Some PCI bus controllers are able to detect certain "hard" PCI errors +on the bus, such as parity errors on the data and address busses, as +well as SERR and PERR errors. These chipsets are then able to disable +I/O to/from the affected device, so that, for example, a bad DMA +address doesn't end up corrupting system memory. These same chipsets +are also able to reset the affected PCI device, and return it to +working condition. This document describes a generic API form +performing error recovery. + +The core idea is that after a PCI error has been detected, there must +be a way for the kernel to coordinate with all affected device drivers +so that the pci card can be made operational again, possibly after +performing a full electrical #RST of the PCI card. The API below +provides a generic API for device drivers to be notified of PCI +errors, and to be notified of, and respond to, a reset sequence. 
+ +Preliminary sketch of API, cut-n-pasted-n-modified email from +Ben Herrenschmidt, circa 5 April 2005 + +The error recovery API support is exposed to the driver in the form of +a structure of function pointers pointed to by a new field in struct +pci_driver. The absence of this pointer in pci_driver denotes a +"non-aware" driver; behaviour on these is platform dependent. +Platforms like ppc64 can try to simulate pci hotplug remove/add. + +The definition of "pci_error_token" is not covered here. It is based on +Seto's work on the synchronous error detection. We still need to define +functions for extracting information out of an opaque error token. This is +separate from this API. + +This structure has the form: + +struct pci_error_handlers +{ + int (*error_detected)(struct pci_dev *dev, pci_error_token error); + int (*mmio_enabled)(struct pci_dev *dev); + int (*resume)(struct pci_dev *dev); + int (*link_reset)(struct pci_dev *dev); + int (*slot_reset)(struct pci_dev *dev); +}; + +A driver doesn't have to implement all of these callbacks. The +only mandatory one is error_detected(). If a callback is not +implemented, the corresponding feature is considered unsupported. +For example, if mmio_enabled() and resume() aren't there, then the +driver is assumed as not doing any direct recovery and requires +a reset. If link_reset() is not implemented, the card is assumed as +not caring about link resets, in which case, if recovery is supported, +the core can try recovery (but not slot_reset() unless it really did +reset the slot). If slot_reset() is not supported, link_reset() can +be called instead on a slot reset. + +At first, the call will always be: + + 1) error_detected() + + Error detected. This is sent once after an error has been detected. At +this point, the device might not be accessible anymore depending on the +platform (the slot will be isolated on ppc64).
The driver may already +have "noticed" the error because of a failing IO, but this is the proper +"synchronisation point", that is, it gives a chance to the driver to +cleanup, waiting for pending stuff (timers, whatever, etc...) to +complete; it can take semaphores, schedule, etc... everything but touch +the device. Within this function and after it returns, the driver +shouldn't do any new IOs. Called in task context. This is sort of a +"quiesce" point. See note about interrupts at the end of this doc. + + Result codes: + - PCIERR_RESULT_CAN_RECOVER: + Driver returns this if it thinks it might be able to recover + the HW by just banging IOs or if it wants to be given + a chance to extract some diagnostic information (see + below). + - PCIERR_RESULT_NEED_RESET: + Driver returns this if it thinks it can't recover unless the + slot is reset. + - PCIERR_RESULT_DISCONNECT: + Return this if driver thinks it won't recover at all, + (this will detach the driver ? or just leave it + dangling ? to be decided) + +So at this point, we have called error_detected() for all drivers +on the segment that had the error. On ppc64, the slot is isolated. What +happens now typically depends on the result from the drivers. If all +drivers on the segment/slot return PCIERR_RESULT_CAN_RECOVER, we would +re-enable IOs on the slot (or do nothing special if the platform doesn't +isolate slots) and call 2). If not and we can reset slots, we go to 4), +if neither, we have a dead slot. If it's a hotplug slot, we might +"simulate" reset by triggering HW unplug/replug though. + +>>> Current ppc64 implementation assumes that a device driver will +>>> *not* schedule or semaphore in this routine; the current ppc64 +>>> implementation uses one kernel thread to notify all devices; +>>> thus, if one device sleeps/schedules, all devices are affected. +>>> Doing better requires complex multi-threaded logic in the error +>>> recovery implementation (e.g.
waiting for all notification threads +>>> to "join" before proceeding with recovery.) This seems excessively +>>> complex and not worth implementing. + +>>> The current ppc64 implementation doesn't much care if the device +>>> attempts i/o at this point, or not. I/O's will fail, returning +>>> a value of 0xff on read, and writes will be dropped. If the device +>>> driver attempts more than 10K I/O's to a frozen adapter, it will +>>> assume that the device driver has gone into an infinite loop, and +>>> it will panic the kernel. + + 2) mmio_enabled() + + This is the "early recovery" call. IOs are allowed again, but DMA is +not (hrm... to be discussed, I prefer not), with some restrictions. This +is NOT a callback for the driver to start operations again, only to +peek/poke at the device, extract diagnostic information, if any, and +eventually do things like trigger a device local reset or some such, +but not restart operations. This is sent if all drivers on a segment +agree that they can try to recover and no automatic link reset was +performed by the HW. If the platform can't just re-enable IOs without +a slot reset or a link reset, it doesn't call this callback and goes +directly to 3) or 4). All IOs should be done _synchronously_ from +within this callback, errors triggered by them will be returned via +the normal pci_check_whatever() api, no new error_detected() callback +will be issued due to an error happening here. However, such an error +might cause IOs to be re-blocked for the whole segment, and thus +invalidate the recovery that other devices on the same segment might +have done, forcing the whole segment into one of the next states, +that is link reset or slot reset. + + Result codes: + - PCIERR_RESULT_RECOVERED + Driver returns this if it thinks the device is fully + functional and thinks it is ready to start + normal driver operations again.
There is no + guarantee that the driver will actually be + allowed to proceed, as another driver on the + same segment might have failed and thus triggered a + slot reset on platforms that support it. + + - PCIERR_RESULT_NEED_RESET + Driver returns this if it thinks the device is not + recoverable in its current state and it needs a slot + reset to proceed. + + - PCIERR_RESULT_DISCONNECT + Same as above. Total failure, no recovery even after + reset; driver dead. (To be defined more precisely) + +>>> The current ppc64 implementation does not implement this callback. + + 3) link_reset() + + This is called after the link has been reset. This is typically +a PCI Express specific state at this point and is done whenever a +non-fatal error has been detected that can be "solved" by resetting +the link. This call informs the driver of the reset and the driver +should check if the device appears to be in working condition. +This function acts a bit like 2) mmio_enabled(), in that the driver +is not supposed to restart normal driver I/O operations right away. +Instead, it should just "probe" the device to check its recoverability +status. If all is right, then the core will call resume() once all +drivers have ack'd link_reset(). + + Result codes: + (identical to mmio_enabled) + +>>> The current ppc64 implementation does not implement this callback. + + 4) slot_reset() + + This is called after the slot has been soft or hard reset by the +platform. A soft reset consists of asserting the adapter #RST line +and then restoring the PCI BARs and PCI configuration header. If the +platform supports PCI hotplug, then it might instead perform a hard +reset by toggling power on the slot off/on. This call gives drivers +the chance to re-initialize the hardware (re-download firmware, etc.), +but drivers shouldn't restart normal I/O processing operations at +this point.
(See note about interrupts; interrupts aren't guaranteed +to be delivered until the resume() callback has been called). If all +device drivers report success on this callback, the platform will call +resume() to complete the error handling and let the driver restart +normal I/O processing. + +A driver can still return a critical failure for this function if +it can't get the device operational after reset. If the platform +previously tried a soft reset, it might now try a hard reset (power +cycle) and then call slot_reset() again. If the device still can't +be recovered, there is nothing more that can be done; the platform +will typically report a "permanent failure" in such a case. The +device will be considered "dead" in this case. + + Result codes: + - PCIERR_RESULT_DISCONNECT + Same as above. + +>>> The current ppc64 implementation does not try a power-cycle reset +>>> if the driver returned PCIERR_RESULT_DISCONNECT. However, it should. + + 5) resume() + + This is called if all drivers on the segment have returned +PCIERR_RESULT_RECOVERED from one of the 3 previous callbacks. +That basically tells the driver to restart activity, that everything +is back and running. No result code is taken into account here. If +a new error happens, it will restart a new error handling process. + +That's it. I think this covers all the possibilities. The way those +callbacks are called is platform policy. A platform with no slot reset +capability for example may want to just "ignore" drivers that can't +recover (disconnect them) and try to let other cards on the same segment +recover. Keep in mind that in most real life cases, though, there will +be only one driver per segment. + +Now, there is a note about interrupts. If you get an interrupt and your +device is dead or has been isolated, there is a problem :) + +After much thinking, I decided to leave that to the platform.
That is, +the recovery API only specifies that: + + - There is no guarantee that interrupt delivery can proceed from any +device on the segment starting from the error detection and until the +resume() callback is sent, at which point interrupts are expected to be +fully operational. + + - There is no guarantee that interrupt delivery is stopped, that is, a +driver that gets an interrupt after detecting an error, or that detects +an error within the interrupt handler such that it prevents proper +ack'ing of the interrupt (and thus removal of the source) should just +return IRQ_NOTHANDLED. It's up to the platform to deal with that +condition, typically by masking the irq source during the duration of +the error handling. It is expected that the platform "knows" which +interrupts are routed to error-management capable slots and can deal +with temporarily disabling that irq number during error processing (this +isn't terribly complex). That means some IRQ latency for other devices +sharing the interrupt, but there is simply no other way.
High end +platforms aren't supposed to share interrupts between many devices +anyway :) + + +Revised: 31 May 2005 Linas Vepstas Index: linux-2.6.14-git10/MAINTAINERS =================================================================== --- linux-2.6.14-git10.orig/MAINTAINERS 2005-11-07 17:23:59.053340654 -0600 +++ linux-2.6.14-git10/MAINTAINERS 2005-11-07 17:33:26.933558243 -0600 @@ -1899,6 +1899,13 @@ L: linux-abi-devel at lists.sourceforge.net S: Maintained +PCI ERROR RECOVERY +P: Linas Vepstas +M: linas at austin.ibm.com +L: linux-kernel at vger.kernel.org +L: linux-pci at atrey.karlin.mff.cuni.cz +S: Supported + PCI SOUND DRIVERS (ES1370, ES1371 and SONICVIBES) P: Thomas Sailer M: sailer at ife.ee.ethz.ch From greg at kroah.com Sat Dec 3 12:02:53 2005 From: greg at kroah.com (Greg KH) Date: Fri, 2 Dec 2005 17:02:53 -0800 Subject: [PATCH] PCI Error Recovery: documentation In-Reply-To: <20051203005951.GW31651@austin.ibm.com> References: <20051203005951.GW31651@austin.ibm.com> Message-ID: <20051203010253.GA31826@kroah.com> On Fri, Dec 02, 2005 at 06:59:52PM -0600, linas wrote: > +PCI ERROR RECOVERY > +P: Linas Vepstas > +M: linas at austin.ibm.com > +L: linux-kernel at vger.kernel.org > +L: linux-pci at atrey.karlin.mff.cuni.cz > +S: Supported Tab vs space problem here :( Care to redo? thanks, greg k-h From linas at austin.ibm.com Sat Dec 3 12:03:14 2005 From: linas at austin.ibm.com (linas) Date: Fri, 2 Dec 2005 19:03:14 -0600 Subject: [PATCH]: rpaphp: find_bus() -- remove duplicate code Message-ID: <20051203010314.GX31651@austin.ibm.com> John Rose, Please review and sign off, and forward to Greg KH! --linas The function rpaphp_find_pci_bus() has been migrated to pcibios_find_pci_bus() in arch/powerpc/platforms/pseries/pci_dlpar.c This patch removes the old version. 
Signed-off-by: Linas Vepstas Index: linux-2.6.15-rc3-mm1/drivers/pci/hotplug/rpaphp_pci.c =================================================================== --- linux-2.6.15-rc3-mm1.orig/drivers/pci/hotplug/rpaphp_pci.c 2005-12-01 18:51:26.000000000 -0600 +++ linux-2.6.15-rc3-mm1/drivers/pci/hotplug/rpaphp_pci.c 2005-12-02 14:17:19.834504074 -0600 @@ -32,36 +32,6 @@ #include "../pci.h" /* for pci_add_new_bus */ #include "rpaphp.h" -static struct pci_bus *find_bus_among_children(struct pci_bus *bus, - struct device_node *dn) -{ - struct pci_bus *child = NULL; - struct list_head *tmp; - struct device_node *busdn; - - busdn = pci_bus_to_OF_node(bus); - if (busdn == dn) - return bus; - - list_for_each(tmp, &bus->children) { - child = find_bus_among_children(pci_bus_b(tmp), dn); - if (child) - break; - } - return child; -} - -struct pci_bus *rpaphp_find_pci_bus(struct device_node *dn) -{ - struct pci_dn *pdn = dn->data; - - if (!pdn || !pdn->phb || !pdn->phb->bus) - return NULL; - - return find_bus_among_children(pdn->phb->bus, dn); -} -EXPORT_SYMBOL_GPL(rpaphp_find_pci_bus); - static int rpaphp_get_sensor_state(struct slot *slot, int *state) { int rc; @@ -120,7 +90,7 @@ /* config/unconfig adapter */ *value = slot->state; } else { - bus = rpaphp_find_pci_bus(slot->dn); + bus = pcibios_find_pci_bus(slot->dn); if (bus && !list_empty(&bus->devices)) *value = CONFIGURED; else @@ -369,7 +339,7 @@ struct pci_bus *bus; BUG_ON(!dn); - bus = rpaphp_find_pci_bus(dn); + bus = pcibios_find_pci_bus(dn); if (!bus) { err("%s: no pci_bus for dn %s\n", __FUNCTION__, dn->full_name); goto exit_rc; Index: linux-2.6.15-rc3-mm1/drivers/pci/hotplug/rpadlpar_core.c =================================================================== --- linux-2.6.15-rc3-mm1.orig/drivers/pci/hotplug/rpadlpar_core.c 2005-12-01 18:51:26.000000000 -0600 +++ linux-2.6.15-rc3-mm1/drivers/pci/hotplug/rpadlpar_core.c 2005-12-02 14:18:23.226614153 -0600 @@ -174,7 +174,7 @@ { struct pci_dev *dev; - if 
(rpaphp_find_pci_bus(dn)) + if (pcibios_find_pci_bus(dn)) return -EINVAL; /* Add pci bus */ @@ -221,7 +221,7 @@ struct pci_dn *pdn; int rc = 0; - if (!rpaphp_find_pci_bus(dn)) + if (!pcibios_find_pci_bus(dn)) return -EINVAL; slot = find_slot(dn); @@ -366,7 +366,7 @@ struct pci_bus *bus; struct slot *slot; - bus = rpaphp_find_pci_bus(dn); + bus = pcibios_find_pci_bus(dn); if (!bus) return -EINVAL; Index: linux-2.6.15-rc3-mm1/drivers/pci/hotplug/rpaphp.h =================================================================== --- linux-2.6.15-rc3-mm1.orig/drivers/pci/hotplug/rpaphp.h 2005-12-01 15:14:48.000000000 -0600 +++ linux-2.6.15-rc3-mm1/drivers/pci/hotplug/rpaphp.h 2005-12-02 14:19:24.050084110 -0600 @@ -88,13 +88,10 @@ /* function prototypes */ /* rpaphp_pci.c */ -extern struct pci_bus *rpaphp_find_pci_bus(struct device_node *dn); -extern int rpaphp_claim_resource(struct pci_dev *dev, int resource); extern int rpaphp_enable_pci_slot(struct slot *slot); extern int register_pci_slot(struct slot *slot); extern int rpaphp_get_pci_adapter_status(struct slot *slot, int is_init, u8 * value); extern void rpaphp_init_new_devs(struct pci_bus *bus); -extern void rpaphp_eeh_init_nodes(struct device_node *dn); extern int rpaphp_config_pci_adapter(struct pci_bus *bus); extern int rpaphp_unconfig_pci_adapter(struct pci_bus *bus); From linas at austin.ibm.com Sat Dec 3 12:16:18 2005 From: linas at austin.ibm.com (linas) Date: Fri, 2 Dec 2005 19:16:18 -0600 Subject: [PATCH] PCI Error Recovery: documentation In-Reply-To: <20051203010253.GA31826@kroah.com> References: <20051203005951.GW31651@austin.ibm.com> <20051203010253.GA31826@kroah.com> Message-ID: <20051203011618.GZ31651@austin.ibm.com> On Fri, Dec 02, 2005 at 05:02:53PM -0800, Greg KH was heard to remark: > > Tab vs space problem here :( > Care to redo? Below: pci-error-recovery_docs.patch Various PCI bus errors can be signaled by newer PCI controllers. 
Recovering from those errors requires an infrastructure to notify affected device drivers of the error, and a way of walking through a reset sequence. This patch adds documentation describing the current error recovery proposal. Signed-off-by: Linas Vepstas Documentation/pci-error-recovery.txt | 246 +++++++++++++++++++++++++++++++++++ MAINTAINERS | 7 2 files changed, 253 insertions(+) Index: linux-2.6.15-rc3-mm1/Documentation/pci-error-recovery.txt =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.15-rc3-mm1/Documentation/pci-error-recovery.txt 2005-12-02 19:12:23.715528104 -0600 @@ -0,0 +1,246 @@ + + PCI Error Recovery + ------------------ + May 31, 2005 + + Current document maintainer: + Linas Vepstas + + +Some PCI bus controllers are able to detect certain "hard" PCI errors +on the bus, such as parity errors on the data and address busses, as +well as SERR and PERR errors. These chipsets are then able to disable +I/O to/from the affected device, so that, for example, a bad DMA +address doesn't end up corrupting system memory. These same chipsets +are also able to reset the affected PCI device, and return it to +working condition. This document describes a generic API form +performing error recovery. + +The core idea is that after a PCI error has been detected, there must +be a way for the kernel to coordinate with all affected device drivers +so that the pci card can be made operational again, possibly after +performing a full electrical #RST of the PCI card. The API below +provides a generic API for device drivers to be notified of PCI +errors, and to be notified of, and respond to, a reset sequence. + +Preliminary sketch of API, cut-n-pasted-n-modified email from +Ben Herrenschmidt, circa 5 april 2005 + +The error recovery API support is exposed to the driver in the form of +a structure of function pointers pointed to by a new field in struct +pci_driver. 
The absence of this pointer in pci_driver denotes an +"non-aware" driver, behaviour on these is platform dependant. +Platforms like ppc64 can try to simulate pci hotplug remove/add. + +The definition of "pci_error_token" is not covered here. It is based on +Seto's work on the synchronous error detection. We still need to define +functions for extracting infos out of an opaque error token. This is +separate from this API. + +This structure has the form: + +struct pci_error_handlers +{ + int (*error_detected)(struct pci_dev *dev, pci_error_token error); + int (*mmio_enabled)(struct pci_dev *dev); + int (*resume)(struct pci_dev *dev); + int (*link_reset)(struct pci_dev *dev); + int (*slot_reset)(struct pci_dev *dev); +}; + +A driver doesn't have to implement all of these callbacks. The +only mandatory one is error_detected(). If a callback is not +implemented, the corresponding feature is considered unsupported. +For example, if mmio_enabled() and resume() aren't there, then the +driver is assumed as not doing any direct recovery and requires +a reset. If link_reset() is not implemented, the card is assumed as +not caring about link resets, in which case, if recover is supported, +the core can try recover (but not slot_reset() unless it really did +reset the slot). If slot_reset() is not supported, link_reset() can +be called instead on a slot reset. + +At first, the call will always be : + + 1) error_detected() + + Error detected. This is sent once after an error has been detected. At +this point, the device might not be accessible anymore depending on the +platform (the slot will be isolated on ppc64). The driver may already +have "noticed" the error because of a failing IO, but this is the proper +"synchronisation point", that is, it gives a chance to the driver to +cleanup, waiting for pending stuff (timers, whatever, etc...) to +complete; it can take semaphores, schedule, etc... everything but touch +the device. 
Within this function and after it returns, the driver +shouldn't do any new IOs. Called in task context. This is sort of a +"quiesce" point. See note about interrupts at the end of this doc. + + Result codes: + - PCIERR_RESULT_CAN_RECOVER: + Driver returns this if it thinks it might be able to recover + the HW by just banging IOs or if it wants to be given + a chance to extract some diagnostic information (see + below). + - PCIERR_RESULT_NEED_RESET: + Driver returns this if it thinks it can't recover unless the + slot is reset. + - PCIERR_RESULT_DISCONNECT: + Return this if driver thinks it won't recover at all, + (this will detach the driver ? or just leave it + dangling ? to be decided) + +So at this point, we have called error_detected() for all drivers +on the segment that had the error. On ppc64, the slot is isolated. What +happens now typically depends on the result from the drivers. If all +drivers on the segment/slot return PCIERR_RESULT_CAN_RECOVER, we would +re-enable IOs on the slot (or do nothing special if the platform doesn't +isolate slots) and call 2). If not and we can reset slots, we go to 4), +if neither, we have a dead slot. If it's a hotplug slot, we might +"simulate" reset by triggering HW unplug/replug though. + +>>> Current ppc64 implementation assumes that a device driver will +>>> *not* schedule or semaphore in this routine; the current ppc64 +>>> implementation uses one kernel thread to notify all devices; +>>> thus, if one device sleeps/schedules, all devices are affected. +>>> Doing better requires complex multi-threaded logic in the error +>>> recovery implementation (e.g. waiting for all notification threads +>>> to "join" before proceeding with recovery.) This seems excessively +>>> complex and not worth implementing. + +>>> The current ppc64 implementation doesn't much care if the device +>>> attempts i/o at this point, or not. I/O's will fail, returning +>>> a value of 0xff on read, and writes will be dropped.
If the device +>>> driver attempts more than 10K I/O's to a frozen adapter, it will +>>> assume that the device driver has gone into an infinite loop, and +>>> it will panic the kernel. + + 2) mmio_enabled() + + This is the "early recovery" call. IOs are allowed again, but DMA is +not (hrm... to be discussed, I prefer not), with some restrictions. This +is NOT a callback for the driver to start operations again, only to +peek/poke at the device, extract diagnostic information, if any, and +eventually do things like trigger a device local reset or some such, +but not restart operations. This is sent if all drivers on a segment +agree that they can try to recover and no automatic link reset was +performed by the HW. If the platform can't just re-enable IOs without +a slot reset or a link reset, it doesn't call this callback and goes +directly to 3) or 4). All IOs should be done _synchronously_ from +within this callback, errors triggered by them will be returned via +the normal pci_check_whatever() api, no new error_detected() callback +will be issued due to an error happening here. However, such an error +might cause IOs to be re-blocked for the whole segment, and thus +invalidate the recovery that other devices on the same segment might +have done, forcing the whole segment into one of the next states, +that is link reset or slot reset. + + Result codes: + - PCIERR_RESULT_RECOVERED + Driver returns this if it thinks the device is fully + functional and thinks it is ready to start + normal driver operations again. There is no + guarantee that the driver will actually be + allowed to proceed, as another driver on the + same segment might have failed and thus triggered a + slot reset on platforms that support it. + + - PCIERR_RESULT_NEED_RESET + Driver returns this if it thinks the device is not + recoverable in its current state and it needs a slot + reset to proceed. + + - PCIERR_RESULT_DISCONNECT + Same as above.
Total failure, no recovery even after + reset, driver dead. (To be defined more precisely) + +>>> The current ppc64 implementation does not implement this callback. + + 3) link_reset() + + This is called after the link has been reset. This is typically +a PCI Express specific state at this point and is done whenever a +non-fatal error has been detected that can be "solved" by resetting +the link. This call informs the driver of the reset and the driver +should check if the device appears to be in working condition. +This function acts a bit like 2) mmio_enabled(), in that the driver +is not supposed to restart normal driver I/O operations right away. +Instead, it should just "probe" the device to check its recoverability +status. If all is right, then the core will call resume() once all +drivers have ack'd link_reset(). + + Result codes: + (identical to mmio_enabled) + +>>> The current ppc64 implementation does not implement this callback. + + 4) slot_reset() + + This is called after the slot has been soft or hard reset by the +platform. A soft reset consists of asserting the adapter #RST line +and then restoring the PCI BARs and PCI configuration header. If the +platform supports PCI hotplug, then it might instead perform a hard +reset by toggling power on the slot off/on. This call gives drivers +the chance to re-initialize the hardware (re-download firmware, etc.), +but drivers shouldn't restart normal I/O processing operations at +this point. (See note about interrupts; interrupts aren't guaranteed +to be delivered until the resume() callback has been called). If all +device drivers report success on this callback, the platform will call +resume() to complete the error handling and let the driver restart +normal I/O processing. + +A driver can still return a critical failure for this function if +it can't get the device operational after reset.
If the platform +previously tried a soft reset, it might now try a hard reset (power +cycle) and then call slot_reset() again. If the device still can't +be recovered, there is nothing more that can be done; the platform +will typically report a "permanent failure" in such a case. The +device will be considered "dead" in this case. + + Result codes: + - PCIERR_RESULT_DISCONNECT + Same as above. + +>>> The current ppc64 implementation does not try a power-cycle reset +>>> if the driver returned PCIERR_RESULT_DISCONNECT. However, it should. + + 5) resume() + + This is called if all drivers on the segment have returned +PCIERR_RESULT_RECOVERED from one of the 3 previous callbacks. +That basically tells the driver to restart activity, that everything +is back and running. No result code is taken into account here. If +a new error happens, it will restart a new error handling process. + +That's it. I think this covers all the possibilities. The way those +callbacks are called is platform policy. A platform with no slot reset +capability for example may want to just "ignore" drivers that can't +recover (disconnect them) and try to let other cards on the same segment +recover. Keep in mind that in most real life cases, though, there will +be only one driver per segment. + +Now, there is a note about interrupts. If you get an interrupt and your +device is dead or has been isolated, there is a problem :) + +After much thinking, I decided to leave that to the platform. That is, +the recovery API only requires that: + + - There is no guarantee that interrupt delivery can proceed from any +device on the segment starting from the error detection and until the +restart callback is sent, at which point interrupts are expected to be +fully operational.
+ + - There is no guarantee that interrupt delivery is stopped, that is, a +driver that gets an interrupt after detecting an error, or that detects +an error within the interrupt handler such that it prevents proper +ack'ing of the interrupt (and thus removal of the source) should just +return IRQ_NOTHANDLED. It's up to the platform to deal with that +condition, typically by masking the irq source during the duration of +the error handling. It is expected that the platform "knows" which +interrupts are routed to error-management capable slots and can deal +with temporarily disabling that irq number during error processing (this +isn't terribly complex). That means some IRQ latency for other devices +sharing the interrupt, but there is simply no other way. High end +platforms aren't supposed to share interrupts between many devices +anyway :) + + +Revised: 31 May 2005 Linas Vepstas Index: linux-2.6.15-rc3-mm1/MAINTAINERS =================================================================== --- linux-2.6.15-rc3-mm1.orig/MAINTAINERS 2005-12-01 15:17:24.000000000 -0600 +++ linux-2.6.15-rc3-mm1/MAINTAINERS 2005-12-02 19:14:19.126269787 -0600 @@ -1997,6 +1997,13 @@ L: linux-abi-devel at lists.sourceforge.net S: Maintained +PCI ERROR RECOVERY +P: Linas Vepstas +M: linas at austin.ibm.com +L: linux-kernel at vger.kernel.org +L: linux-pci at atrey.karlin.mff.cuni.cz +S: Supported + PCI SOUND DRIVERS (ES1370, ES1371 and SONICVIBES) P: Thomas Sailer M: sailer at ife.ee.ethz.ch From tom_gall at mac.com Sun Dec 4 09:29:42 2005 From: tom_gall at mac.com (Thomas Gall) Date: Sat, 3 Dec 2005 16:29:42 -0600 Subject: power3 / matrox problems on current git Message-ID: <3528CCF2-D2CD-496B-821D-E3714EC885DF@mac.com> Greetings, Trying to work with ben on some vdso support for glibc and I've run into an interesting problem on my power3 box. 44p-270 rs6000 It would appear things are busticated on 2.6.15 current git.
(Pull as of dec 3, this afternoon ~2pm CST) I suspect it's something related to the matrox card but that's only a theory at this point. Box has 2 gig of memory and G200 matrox card. Screen is all garbled and on the panel I get panic, VFS: can't find r which is probably a reference to the root partition. I've checked the yaboot entry, it has the same root reference as my working 2.6.12 kernel so I'm sure something else isn't quite right. I've tried both with and without video=matroxfb:1280x1024 at 60,memtype: 3 which in the past was required in order to get all 8 megs on the card working (and thus have a working X) Yes I did make sure optimize for POWER4 is off. :-) Appreciate any comments or suggestions, Regards, Tom From tom_gall at mac.com Sun Dec 4 09:57:34 2005 From: tom_gall at mac.com (Thomas Gall) Date: Sat, 3 Dec 2005 16:57:34 -0600 Subject: power3 / matrox problems on current git In-Reply-To: <3528CCF2-D2CD-496B-821D-E3714EC885DF@mac.com> References: <3528CCF2-D2CD-496B-821D-E3714EC885DF@mac.com> Message-ID: <02CADBC9-5D0F-4146-A29F-56F543D664D1@mac.com> On Dec 3, 2005, at 4:29 PM, Thomas Gall wrote: > > > Appreciate any comments or suggestions, As a follow up, I did try both with and without the NUMA settings Flat and Sparse memory models ... no change ... I'll see if I can get a serial cable on it later tonight... Regards, Tom From rsa at us.ibm.com Mon Dec 5 08:12:09 2005 From: rsa at us.ibm.com (Ryan S. Arnold) Date: Sun, 04 Dec 2005 15:12:09 -0600 Subject: [RFC PATCH 1/5] CELL bogus_console port to hvc_console backend driver Message-ID: <43935BA9.5020602@us.ibm.com> This patch removes the old bogus_console.c driver file. Signed-off-by: Ryan S. Arnold -------------- next part -------------- A non-text attachment was scrubbed... 
Name: hvc_fss.1.patch Type: text/x-patch Size: 7878 bytes Desc: not available Url : http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051204/895b1270/attachment.bin From rsa at us.ibm.com Mon Dec 5 08:12:16 2005 From: rsa at us.ibm.com (Ryan S. Arnold) Date: Sun, 04 Dec 2005 15:12:16 -0600 Subject: [RFC PATCH 2/5] CELL bogus_console port to hvc_console backend driver Message-ID: <43935BB0.9050306@us.ibm.com> This patch shuffles around some data-type declarations and moves some functions out of include/asm-ppc64/hvconsole.h and into a new drivers/char/hvc_console.h file. Signed-off-by: Ryan S. Arnold -------------- next part -------------- A non-text attachment was scrubbed... Name: hvc_fss.2.patch Type: text/x-patch Size: 6230 bytes Desc: not available Url : http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051204/2c07a770/attachment.bin From michael at ellerman.id.au Sun Dec 4 17:28:05 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Sun, 04 Dec 2005 17:28:05 Subject: [PATCH] powerpc: Fix compile warning in __eeh_mark_slot() Message-ID: <20051204232819.9FABD6884B@ozlabs.org> Fix a compile warning in the powerpc.git tree: arch/powerpc/platforms/pseries/eeh.c: In function `__eeh_mark_slot': arch/powerpc/platforms/pseries/eeh.c:214: warning: ISO C90 forbids mixed declarations and code Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/eeh.c | 2 +- 1 files changed, 1 insertion(+), 1 deletion(-) Index: kexec/arch/powerpc/platforms/pseries/eeh.c =================================================================== --- kexec.orig/arch/powerpc/platforms/pseries/eeh.c +++ kexec/arch/powerpc/platforms/pseries/eeh.c @@ -208,10 +208,10 @@ static void __eeh_mark_slot (struct devi { while (dn) { if (PCI_DN(dn)) { + struct pci_dev *dev = PCI_DN(dn)->pcidev; PCI_DN(dn)->eeh_mode |= mode_flag; /* Mark the pci device driver too */ - struct pci_dev *dev = PCI_DN(dn)->pcidev; if (dev && dev->driver) dev->error_state = pci_channel_io_frozen;
From michael at ellerman.id.au Sun Dec 4 18:39:09 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Sun, 04 Dec 2005 18:39:09 Subject: [PATCH 0/11] powerpc: Kdump support Message-ID: <1133743149.268607.418162138937.qpush@concordia> This patch series implements basic support for kdump on powerpc, on top of the current powerpc.git tree. Paulus please merge. From michael at ellerman.id.au Sun Dec 4 18:39:12 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Sun, 04 Dec 2005 18:39:12 Subject: [PATCH 1/11] powerpc: Propagate regs through to machine_crash_shutdown In-Reply-To: <1133743149.268607.418162138937.qpush@concordia> Message-ID: <20051205003926.BD45B68869@ozlabs.org> Currently machine_crash_shutdown() gets a struct pt_regs, but doesn't pass it through to the ppc_md function, it should. Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/machine_kexec.c | 2 +- include/asm-powerpc/machdep.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) Index: kexec/include/asm-powerpc/machdep.h =================================================================== --- kexec.orig/include/asm-powerpc/machdep.h +++ kexec/include/asm-powerpc/machdep.h @@ -222,7 +222,7 @@ struct machdep_calls { * to run successfully. * XXX Should we move this one out of kexec scope? */ - void (*machine_crash_shutdown)(void); + void (*machine_crash_shutdown)(struct pt_regs *regs); /* Called to do what every setup is needed on image and the * reboot code buffer. Returns 0 on success. 
Index: kexec/arch/powerpc/kernel/machine_kexec.c =================================================================== --- kexec.orig/arch/powerpc/kernel/machine_kexec.c +++ kexec/arch/powerpc/kernel/machine_kexec.c @@ -23,7 +23,7 @@ note_buf_t crash_notes[NR_CPUS]; void machine_crash_shutdown(struct pt_regs *regs) { if (ppc_md.machine_crash_shutdown) - ppc_md.machine_crash_shutdown(); + ppc_md.machine_crash_shutdown(regs); } /* From michael at ellerman.id.au Sun Dec 4 18:39:15 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Sun, 04 Dec 2005 18:39:15 Subject: [PATCH 2/11] powerpc: Add a is_kernel_addr() macro In-Reply-To: <1133743149.268607.418162138937.qpush@concordia> Message-ID: <20051205003930.04D9768871@ozlabs.org> There's a bunch of code that compares an address with KERNELBASE to see if it's a "kernel address", ie. >= KERNELBASE. The proper test is actually to compare with PAGE_OFFSET, since we're going to change KERNELBASE soon. So replace all of them with an is_kernel_addr() macro that does that. Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/prom_init.c | 2 +- arch/powerpc/kernel/setup-common.c | 2 +- arch/powerpc/mm/slb.c | 6 +++--- arch/powerpc/mm/stab.c | 6 +++--- arch/powerpc/mm/tlb_64.c | 2 +- arch/powerpc/oprofile/op_model_power4.c | 4 ++-- arch/powerpc/oprofile/op_model_rs64.c | 3 +-- arch/powerpc/xmon/xmon.c | 4 ++-- include/asm-powerpc/page.h | 6 ++++++ 9 files changed, 20 insertions(+), 15 deletions(-) Index: kexec/arch/powerpc/mm/stab.c =================================================================== --- kexec.orig/arch/powerpc/mm/stab.c +++ kexec/arch/powerpc/mm/stab.c @@ -122,7 +122,7 @@ static int __ste_allocate(unsigned long unsigned long offset; /* Kernel or user address? */ - if (ea >= KERNELBASE) { + if (is_kernel_addr(ea)) { vsid = get_kernel_vsid(ea); } else { if ((ea >= TASK_SIZE_USER64) || (! 
mm)) @@ -133,7 +133,7 @@ static int __ste_allocate(unsigned long stab_entry = make_ste(get_paca()->stab_addr, GET_ESID(ea), vsid); - if (ea < KERNELBASE) { + if (!is_kernel_addr(ea)) { offset = __get_cpu_var(stab_cache_ptr); if (offset < NR_STAB_CACHE_ENTRIES) __get_cpu_var(stab_cache[offset++]) = stab_entry; @@ -190,7 +190,7 @@ void switch_stab(struct task_struct *tsk entry++, ste++) { unsigned long ea; ea = ste->esid_data & ESID_MASK; - if (ea < KERNELBASE) { + if (!is_kernel_addr(ea)) { ste->esid_data = 0; } } Index: kexec/arch/powerpc/kernel/prom_init.c =================================================================== --- kexec.orig/arch/powerpc/kernel/prom_init.c +++ kexec/arch/powerpc/kernel/prom_init.c @@ -1994,7 +1994,7 @@ static void __init prom_check_initrd(uns if (r3 && r4 && r4 != 0xdeadbeef) { unsigned long val; - RELOC(prom_initrd_start) = (r3 >= KERNELBASE) ? __pa(r3) : r3; + RELOC(prom_initrd_start) = is_kernel_addr(r3) ? __pa(r3) : r3; RELOC(prom_initrd_end) = RELOC(prom_initrd_start) + r4; val = RELOC(prom_initrd_start); Index: kexec/arch/powerpc/kernel/setup-common.c =================================================================== --- kexec.orig/arch/powerpc/kernel/setup-common.c +++ kexec/arch/powerpc/kernel/setup-common.c @@ -319,7 +319,7 @@ void __init check_for_initrd(void) /* If we were passed an initrd, set the ROOT_DEV properly if the values * look sensible. If not, clear initrd reference. 
*/ - if (initrd_start >= KERNELBASE && initrd_end >= KERNELBASE && + if (is_kernel_addr(initrd_start) && is_kernel_addr(initrd_end) && initrd_end > initrd_start) ROOT_DEV = Root_RAM0; else Index: kexec/arch/powerpc/mm/slb.c =================================================================== --- kexec.orig/arch/powerpc/mm/slb.c +++ kexec/arch/powerpc/mm/slb.c @@ -134,14 +134,14 @@ void switch_slb(struct task_struct *tsk, else unmapped_base = TASK_UNMAPPED_BASE_USER64; - if (pc >= KERNELBASE) + if (is_kernel_addr(pc)) return; slb_allocate(pc); if (GET_ESID(pc) == GET_ESID(stack)) return; - if (stack >= KERNELBASE) + if (is_kernel_addr(stack)) return; slb_allocate(stack); @@ -149,7 +149,7 @@ void switch_slb(struct task_struct *tsk, || (GET_ESID(stack) == GET_ESID(unmapped_base))) return; - if (unmapped_base >= KERNELBASE) + if (is_kernel_addr(unmapped_base)) return; slb_allocate(unmapped_base); } Index: kexec/arch/powerpc/oprofile/op_model_power4.c =================================================================== --- kexec.orig/arch/powerpc/oprofile/op_model_power4.c +++ kexec/arch/powerpc/oprofile/op_model_power4.c @@ -252,7 +252,7 @@ static unsigned long get_pc(struct pt_re return (unsigned long)__va(pc); /* Not sure where we were */ - if (pc < KERNELBASE) + if (!is_kernel_addr(pc)) /* function descriptor madness */ return *((unsigned long *)kernel_unknown_bucket); @@ -264,7 +264,7 @@ static int get_kernel(unsigned long pc) int is_kernel; if (!mmcra_has_sihv) { - is_kernel = (pc >= KERNELBASE); + is_kernel = is_kernel_addr(pc); } else { unsigned long mmcra = mfspr(SPRN_MMCRA); is_kernel = ((mmcra & MMCRA_SIPR) == 0); Index: kexec/arch/powerpc/xmon/xmon.c =================================================================== --- kexec.orig/arch/powerpc/xmon/xmon.c +++ kexec/arch/powerpc/xmon/xmon.c @@ -1013,7 +1013,7 @@ static long check_bp_loc(unsigned long a unsigned int instr; addr &= ~3; - if (addr < KERNELBASE) { + if (!is_kernel_addr(addr)) { printf("Breakpoints 
may only be placed at kernel addresses\n"); return 0; } @@ -1064,7 +1064,7 @@ bpt_cmds(void) dabr.address = 0; dabr.enabled = 0; if (scanhex(&dabr.address)) { - if (dabr.address < KERNELBASE) { + if (!is_kernel_addr(dabr.address)) { printf(badaddr); break; } Index: kexec/include/asm-powerpc/page.h =================================================================== --- kexec.orig/include/asm-powerpc/page.h +++ kexec/include/asm-powerpc/page.h @@ -86,6 +86,12 @@ /* to align the pointer to the (next) page boundary */ #define PAGE_ALIGN(addr) _ALIGN(addr, PAGE_SIZE) +/* + * Don't compare things with KERNELBASE or PAGE_OFFSET to test for + * "kernelness", use is_kernel_addr() - it should do what you want. + */ +#define is_kernel_addr(x) ((x) >= PAGE_OFFSET) + #ifndef __ASSEMBLY__ #undef STRICT_MM_TYPECHECKS Index: kexec/arch/powerpc/oprofile/op_model_rs64.c =================================================================== --- kexec.orig/arch/powerpc/oprofile/op_model_rs64.c +++ kexec/arch/powerpc/oprofile/op_model_rs64.c @@ -178,7 +178,6 @@ static void rs64_handle_interrupt(struct int val; int i; unsigned long pc = mfspr(SPRN_SIAR); - int is_kernel = (pc >= KERNELBASE); /* set the PMM bit (see comment below) */ mtmsrd(mfmsr() | MSR_PMM); @@ -187,7 +186,7 @@ static void rs64_handle_interrupt(struct val = ctr_read(i); if (val < 0) { if (ctr[i].enabled) { - oprofile_add_pc(pc, is_kernel, i); + oprofile_add_pc(pc, is_kernel_addr(pc), i); ctr_write(i, reset_value[i]); } else { ctr_write(i, 0); Index: kexec/arch/powerpc/mm/tlb_64.c =================================================================== --- kexec.orig/arch/powerpc/mm/tlb_64.c +++ kexec/arch/powerpc/mm/tlb_64.c @@ -168,7 +168,7 @@ void hpte_update(struct mm_struct *mm, u batch->mm = mm; batch->psize = psize; } - if (addr < KERNELBASE) { + if (!is_kernel_addr(addr)) { vsid = get_vsid(mm->context.id, addr); WARN_ON(vsid == 0); } else From michael at ellerman.id.au Sun Dec 4 18:39:20 2005 From: michael at 
ellerman.id.au (Michael Ellerman) Date: Sun, 04 Dec 2005 18:39:20 Subject: [PATCH 3/11] powerpc: Seperate usage of KERNELBASE and PAGE_OFFSET In-Reply-To: <1133743149.268607.418162138937.qpush@concordia> Message-ID: <20051205003934.643E26887C@ozlabs.org> This patch seperates usage of KERNELBASE and PAGE_OFFSET. I haven't looked at any of the PPC code, if we ever want to support Kdump on PPC we'll have to do another audit, ditto for iSeries. This patch makes PAGE_OFFSET the constant, it'll always be 0xC * 1 gazillion. To get a physical address from a virtual one you subtract PAGE_OFFSET, _not_ KERNELBASE. KERNELBASE is the virtual address of the start of the kernel, it's often the same as PAGE_OFFSET, but _might not be_. If you want to know something's offset from the start of the kernel you should subtract KERNELBASE. Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/btext.c | 4 ++-- arch/powerpc/kernel/entry_64.S | 4 ++-- arch/powerpc/kernel/lparmap.c | 6 +++--- arch/powerpc/kernel/machine_kexec_64.c | 5 ++--- arch/powerpc/mm/hash_utils_64.c | 6 +++--- arch/powerpc/mm/slb.c | 4 ++-- arch/powerpc/mm/slb_low.S | 6 +++--- arch/powerpc/mm/stab.c | 10 +++++----- include/asm-powerpc/page.h | 2 +- 9 files changed, 23 insertions(+), 24 deletions(-) Index: kexec/arch/powerpc/mm/stab.c =================================================================== --- kexec.orig/arch/powerpc/mm/stab.c +++ kexec/arch/powerpc/mm/stab.c @@ -40,7 +40,7 @@ static int make_ste(unsigned long stab, unsigned long entry, group, old_esid, castout_entry, i; unsigned int global_entry; struct stab_entry *ste, *castout_ste; - unsigned long kernel_segment = (esid << SID_SHIFT) >= KERNELBASE; + unsigned long kernel_segment = (esid << SID_SHIFT) >= PAGE_OFFSET; vsid_data = vsid << STE_VSID_SHIFT; esid_data = esid << SID_SHIFT | STE_ESID_KP | STE_ESID_V; @@ -83,7 +83,7 @@ static int make_ste(unsigned long stab, } /* Dont cast out the first kernel segment */ - if ((castout_ste->esid_data & 
ESID_MASK) != KERNELBASE) + if ((castout_ste->esid_data & ESID_MASK) != PAGE_OFFSET) break; castout_entry = (castout_entry + 1) & 0xf; @@ -251,7 +251,7 @@ void stabs_alloc(void) panic("Unable to allocate segment table for CPU %d.\n", cpu); - newstab += KERNELBASE; + newstab = (unsigned long)__va(newstab); memset((void *)newstab, 0, HW_PAGE_SIZE); @@ -270,11 +270,11 @@ void stabs_alloc(void) */ void stab_initialize(unsigned long stab) { - unsigned long vsid = get_kernel_vsid(KERNELBASE); + unsigned long vsid = get_kernel_vsid(PAGE_OFFSET); unsigned long stabreal; asm volatile("isync; slbia; isync":::"memory"); - make_ste(stab, GET_ESID(KERNELBASE), vsid); + make_ste(stab, GET_ESID(PAGE_OFFSET), vsid); /* Order update */ asm volatile("sync":::"memory"); Index: kexec/arch/powerpc/kernel/machine_kexec_64.c =================================================================== --- kexec.orig/arch/powerpc/kernel/machine_kexec_64.c +++ kexec/arch/powerpc/kernel/machine_kexec_64.c @@ -153,9 +153,8 @@ void kexec_copy_flush(struct kimage *ima * including ones that were in place on the original copy */ for (i = 0; i < nr_segments; i++) - flush_icache_range(ranges[i].mem + KERNELBASE, - ranges[i].mem + KERNELBASE + - ranges[i].memsz); + flush_icache_range((unsigned long)__va(ranges[i].mem), + (unsigned long)__va(ranges[i].mem + ranges[i].memsz)); } #ifdef CONFIG_SMP Index: kexec/arch/powerpc/mm/hash_utils_64.c =================================================================== --- kexec.orig/arch/powerpc/mm/hash_utils_64.c +++ kexec/arch/powerpc/mm/hash_utils_64.c @@ -456,7 +456,7 @@ void __init htab_initialize(void) /* create bolted the linear mapping in the hash table */ for (i=0; i < lmb.memory.cnt; i++) { - base = lmb.memory.region[i].base + KERNELBASE; + base = (unsigned long)__va(lmb.memory.region[i].base); size = lmb.memory.region[i].size; DBG("creating mapping for region: %lx : %lx\n", base, size); @@ -498,8 +498,8 @@ void __init htab_initialize(void) * for either 4K or 
16MB pages. */ if (tce_alloc_start) { - tce_alloc_start += KERNELBASE; - tce_alloc_end += KERNELBASE; + tce_alloc_start = (unsigned long)__va(tce_alloc_start); + tce_alloc_end = (unsigned long)__va(tce_alloc_end); if (base + size >= tce_alloc_start) tce_alloc_start = base + size + 1; Index: kexec/arch/powerpc/mm/slb.c =================================================================== --- kexec.orig/arch/powerpc/mm/slb.c +++ kexec/arch/powerpc/mm/slb.c @@ -75,7 +75,7 @@ static void slb_flush_and_rebolt(void) vflags = SLB_VSID_KERNEL | virtual_llp; ksp_esid_data = mk_esid_data(get_paca()->kstack, 2); - if ((ksp_esid_data & ESID_MASK) == KERNELBASE) + if ((ksp_esid_data & ESID_MASK) == PAGE_OFFSET) ksp_esid_data &= ~SLB_ESID_V; /* We need to do this all in asm, so we're sure we don't touch @@ -213,7 +213,7 @@ void slb_initialize(void) asm volatile("isync":::"memory"); asm volatile("slbmte %0,%0"::"r" (0) : "memory"); asm volatile("isync; slbia; isync":::"memory"); - create_slbe(KERNELBASE, lflags, 0); + create_slbe(PAGE_OFFSET, lflags, 0); /* VMALLOC space has 4K pages always for now */ create_slbe(VMALLOCBASE, vflags, 1); Index: kexec/arch/powerpc/kernel/entry_64.S =================================================================== --- kexec.orig/arch/powerpc/kernel/entry_64.S +++ kexec/arch/powerpc/kernel/entry_64.S @@ -690,7 +690,7 @@ _GLOBAL(enter_rtas) /* Setup our real return addr */ SET_REG_TO_LABEL(r4,.rtas_return_loc) - SET_REG_TO_CONST(r9,KERNELBASE) + SET_REG_TO_CONST(r9,PAGE_OFFSET) sub r4,r4,r9 mtlr r4 @@ -718,7 +718,7 @@ _GLOBAL(enter_rtas) _STATIC(rtas_return_loc) /* relocation is off at this point */ mfspr r4,SPRN_SPRG3 /* Get PACA */ - SET_REG_TO_CONST(r5, KERNELBASE) + SET_REG_TO_CONST(r5, PAGE_OFFSET) sub r4,r4,r5 /* RELOC the PACA base pointer */ mfmsr r6 Index: kexec/arch/powerpc/mm/slb_low.S =================================================================== --- kexec.orig/arch/powerpc/mm/slb_low.S +++ kexec/arch/powerpc/mm/slb_low.S @@ -37,9 
+37,9 @@ _GLOBAL(slb_allocate_realmode) srdi r9,r3,60 /* get region */ srdi r10,r3,28 /* get esid */ - cmpldi cr7,r9,0xc /* cmp KERNELBASE for later use */ + cmpldi cr7,r9,0xc /* cmp PAGE_OFFSET for later use */ - /* r3 = address, r10 = esid, cr7 = <>KERNELBASE */ + /* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */ blt cr7,0f /* user or kernel? */ /* kernel address: proto-VSID = ESID */ @@ -166,7 +166,7 @@ _GLOBAL(slb_allocate_user) /* * Finish loading of an SLB entry and return * - * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9, cr7 = <>KERNELBASE + * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET */ slb_finish_load: ASM_VSID_SCRAMBLE(r10,r9) Index: kexec/arch/powerpc/kernel/lparmap.c =================================================================== --- kexec.orig/arch/powerpc/kernel/lparmap.c +++ kexec/arch/powerpc/kernel/lparmap.c @@ -16,8 +16,8 @@ const struct LparMap __attribute__((__se .xSegmentTableOffs = STAB0_PAGE, .xEsids = { - { .xKernelEsid = GET_ESID(KERNELBASE), - .xKernelVsid = KERNEL_VSID(KERNELBASE), }, + { .xKernelEsid = GET_ESID(PAGE_OFFSET), + .xKernelVsid = KERNEL_VSID(PAGE_OFFSET), }, { .xKernelEsid = GET_ESID(VMALLOCBASE), .xKernelVsid = KERNEL_VSID(VMALLOCBASE), }, }, @@ -25,7 +25,7 @@ const struct LparMap __attribute__((__se .xRanges = { { .xPages = HvPagesToMap, .xOffset = 0, - .xVPN = KERNEL_VSID(KERNELBASE) << (SID_SHIFT - HW_PAGE_SHIFT), + .xVPN = KERNEL_VSID(PAGE_OFFSET) << (SID_SHIFT - HW_PAGE_SHIFT), }, }, }; Index: kexec/include/asm-powerpc/page.h =================================================================== --- kexec.orig/include/asm-powerpc/page.h +++ kexec/include/asm-powerpc/page.h @@ -56,7 +56,7 @@ #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) -#define __va(x) ((void *)((unsigned long)(x) + KERNELBASE)) +#define __va(x) ((void *)((unsigned long)(x) + PAGE_OFFSET)) #define __pa(x) ((unsigned long)(x) - 
PAGE_OFFSET) /* Index: kexec/arch/powerpc/kernel/btext.c =================================================================== --- kexec.orig/arch/powerpc/kernel/btext.c +++ kexec/arch/powerpc/kernel/btext.c @@ -60,7 +60,7 @@ int force_printk_to_btext = 0; * * The display is mapped to virtual address 0xD0000000, rather * than 1:1, because some some CHRP machines put the frame buffer - * in the region starting at 0xC0000000 (KERNELBASE). + * in the region starting at 0xC0000000 (PAGE_OFFSET). * This mapping is temporary and will disappear as soon as the * setup done by MMU_Init() is applied. * @@ -71,7 +71,7 @@ int force_printk_to_btext = 0; */ void __init btext_prepare_BAT(void) { - unsigned long vaddr = KERNELBASE + 0x10000000; + unsigned long vaddr = PAGE_OFFSET + 0x10000000; unsigned long addr; unsigned long lowbits; From michael at ellerman.id.au Sun Dec 4 18:39:23 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Sun, 04 Dec 2005 18:39:23 Subject: [PATCH 4/11] powerpc: Add CONFIG_CRASH_DUMP In-Reply-To: <1133743149.268607.418162138937.qpush@concordia> Message-ID: <20051205003942.A65C468851@ozlabs.org> This patch adds a Kconfig variable, CONFIG_CRASH_DUMP, which configures the built kernel for use as a Kdump kernel. Currently "all" this involves is changing the value of KERNELBASE to 32 MB. Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 11 +++++++++++ arch/powerpc/kernel/setup_64.c | 3 +++ include/asm-powerpc/page.h | 9 ++++++++- 3 files changed, 22 insertions(+), 1 deletion(-) Index: kexec/arch/powerpc/Kconfig =================================================================== --- kexec.orig/arch/powerpc/Kconfig +++ kexec/arch/powerpc/Kconfig @@ -379,6 +379,17 @@ config CELL_IIC bool default y +config CRASH_DUMP + bool "kernel crash dumps (EXPERIMENTAL)" + depends on PPC_MULTIPLATFORM + depends on EXPERIMENTAL + help + Build a kernel suitable for use as a kdump capture kernel. 
+ The kernel will be linked at a different address than normal, and + so can only be used for Kdump. + + Don't change this unless you know what you are doing. + config IBMVIO depends on PPC_PSERIES || PPC_ISERIES bool Index: kexec/arch/powerpc/kernel/setup_64.c =================================================================== --- kexec.orig/arch/powerpc/kernel/setup_64.c +++ kexec/arch/powerpc/kernel/setup_64.c @@ -504,6 +504,9 @@ void __init setup_system(void) ppc64_caches.iline_size); printk("htab_address = 0x%p\n", htab_address); printk("htab_hash_mask = 0x%lx\n", htab_hash_mask); +#if PHYSICAL_START > 0 + printk("physical_start = 0x%x\n", PHYSICAL_START); +#endif printk("-----------------------------------------------------\n"); mm_init_ppc64(); Index: kexec/include/asm-powerpc/page.h =================================================================== --- kexec.orig/include/asm-powerpc/page.h +++ kexec/include/asm-powerpc/page.h @@ -37,8 +37,15 @@ */ #define PAGE_MASK (~((1 << PAGE_SHIFT) - 1)) +#ifdef CONFIG_CRASH_DUMP +/* Kdump kernel runs at 32 MB, change at your peril. */ +#define PHYSICAL_START 0x2000000 +#else +#define PHYSICAL_START 0x0 +#endif + #define PAGE_OFFSET ASM_CONST(CONFIG_KERNEL_START) -#define KERNELBASE PAGE_OFFSET +#define KERNELBASE (PAGE_OFFSET + PHYSICAL_START) #ifdef CONFIG_DISCONTIGMEM #define page_to_pfn(page) discontigmem_page_to_pfn(page) From michael at ellerman.id.au Sun Dec 4 18:39:33 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Sun, 04 Dec 2005 18:39:33 Subject: [PATCH 5/11] powerpc: Create a trampoline for the fwnmi vectors In-Reply-To: <1133743149.268607.418162138937.qpush@concordia> Message-ID: <20051205003947.EA0A068876@ozlabs.org> The fwnmi vectors can be anywhere < 32 MB, so we need to use a trampoline for them. The kdump kernel will register the trampoline addresses, which will then jump up to the real code above 32 MB. 
Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_64.S | 2 ++ arch/powerpc/platforms/pseries/ras.c | 6 ++---- arch/powerpc/platforms/pseries/setup.c | 18 ++++++++++-------- include/asm-powerpc/firmware.h | 6 ++++++ 4 files changed, 20 insertions(+), 12 deletions(-) Index: kexec/arch/powerpc/kernel/head_64.S =================================================================== --- kexec.orig/arch/powerpc/kernel/head_64.S +++ kexec/arch/powerpc/kernel/head_64.S @@ -553,6 +553,7 @@ slb_miss_user_pseries: * Vectors for the FWNMI option. Share common code. */ .globl system_reset_fwnmi + .align 7 system_reset_fwnmi: HMT_MEDIUM mtspr SPRN_SPRG1,r13 /* save r13 */ @@ -560,6 +561,7 @@ system_reset_fwnmi: EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common) .globl machine_check_fwnmi + .align 7 machine_check_fwnmi: HMT_MEDIUM mtspr SPRN_SPRG1,r13 /* save r13 */ Index: kexec/arch/powerpc/platforms/pseries/setup.c =================================================================== --- kexec.orig/arch/powerpc/platforms/pseries/setup.c +++ kexec/arch/powerpc/platforms/pseries/setup.c @@ -77,8 +77,6 @@ #endif extern void find_udbg_vterm(void); -extern void system_reset_fwnmi(void); /* from head.S */ -extern void machine_check_fwnmi(void); /* from head.S */ int fwnmi_active; /* TRUE if an FWNMI handler is present */ @@ -104,18 +102,22 @@ void pSeries_show_cpuinfo(struct seq_fil /* Initialize firmware assisted non-maskable interrupts if * the firmware supports this feature. 
- * */ static void __init fwnmi_init(void) { - int ret; + unsigned long system_reset_addr, machine_check_addr; + int ibm_nmi_register = rtas_token("ibm,nmi-register"); if (ibm_nmi_register == RTAS_UNKNOWN_SERVICE) return; - ret = rtas_call(ibm_nmi_register, 2, 1, NULL, - __pa((unsigned long)system_reset_fwnmi), - __pa((unsigned long)machine_check_fwnmi)); - if (ret == 0) + + /* If the kernel's not linked at zero we point the firmware at low + * addresses anyway, and use a trampoline to get to the real code. */ + system_reset_addr = __pa(system_reset_fwnmi) - PHYSICAL_START; + machine_check_addr = __pa(machine_check_fwnmi) - PHYSICAL_START; + + if (0 == rtas_call(ibm_nmi_register, 2, 1, NULL, system_reset_addr, + machine_check_addr)) fwnmi_active = 1; } Index: kexec/include/asm-powerpc/firmware.h =================================================================== --- kexec.orig/include/asm-powerpc/firmware.h +++ kexec/include/asm-powerpc/firmware.h @@ -98,6 +98,12 @@ typedef struct { extern firmware_feature_t firmware_features_table[]; #endif +extern void system_reset_fwnmi(void); +extern void machine_check_fwnmi(void); + +/* This is true if we are using the firmware NMI handler (typically LPAR) */ +extern int fwnmi_active; + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* __ASM_POWERPC_FIRMWARE_H */ Index: kexec/arch/powerpc/platforms/pseries/ras.c =================================================================== --- kexec.orig/arch/powerpc/platforms/pseries/ras.c +++ kexec/arch/powerpc/platforms/pseries/ras.c @@ -49,14 +49,12 @@ #include #include #include +#include static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX]; static DEFINE_SPINLOCK(ras_log_buf_lock); -char mce_data_buf[RTAS_ERROR_LOG_MAX] -; -/* This is true if we are using the firmware NMI handler (typically LPAR) */ -extern int fwnmi_active; +char mce_data_buf[RTAS_ERROR_LOG_MAX]; static int ras_get_sensor_state_token; static int ras_check_exception_token; From michael at ellerman.id.au 
Sun Dec 4 18:39:37 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Sun, 04 Dec 2005 18:39:37 Subject: [PATCH 6/11] powerpc: Reroute interrupts from 0 + offset to PHYSICAL_START + offset In-Reply-To: <1133743149.268607.418162138937.qpush@concordia> Message-ID: <20051205003951.3E3CC6887E@ozlabs.org> Regardless of where the kernel's linked we always get interrupts at low addresses. This patch creates a trampoline in the first 3 pages of memory, where interrupts land, and patches those addresses to jump into the real kernel code at PHYSICAL_START. We also need to reserve the trampoline code and a bit more in prom.c Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/Makefile | 1 arch/powerpc/kernel/crash_dump.c | 53 +++++++++++++++++++++++++++++++++++++++ arch/powerpc/kernel/prom.c | 6 +++- arch/powerpc/kernel/setup_64.c | 5 +++ include/asm-powerpc/kdump.h | 13 +++++++++ 5 files changed, 77 insertions(+), 1 deletion(-) Index: kexec/arch/powerpc/kernel/setup_64.c =================================================================== --- kexec.orig/arch/powerpc/kernel/setup_64.c +++ kexec/arch/powerpc/kernel/setup_64.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -260,6 +261,10 @@ void __init early_setup(unsigned long dt } ppc_md = **mach; +#ifdef CONFIG_CRASH_DUMP + kdump_setup(); +#endif + DBG("Found, Initializing memory management...\n"); /* Index: kexec/arch/powerpc/kernel/prom.c =================================================================== --- kexec.orig/arch/powerpc/kernel/prom.c +++ kexec/arch/powerpc/kernel/prom.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -1335,11 +1336,14 @@ void __init early_init_devtree(void *par of_scan_flat_dt(early_init_dt_scan_memory, NULL); lmb_enforce_memory_limit(memory_limit); lmb_analyze(); - lmb_reserve(0, __pa(klimit)); DBG("Phys. mem: %lx\n", lmb_phys_mem_size()); /* Reserve LMB regions used by kernel, initrd, dt, etc... 
*/ + lmb_reserve(PHYSICAL_START, __pa(klimit) - PHYSICAL_START); +#ifdef CONFIG_CRASH_DUMP + lmb_reserve(0, KDUMP_RESERVE_LIMIT); +#endif early_reserve_mem(); DBG("Scanning CPUs ...\n"); Index: kexec/include/asm-powerpc/kdump.h =================================================================== --- /dev/null +++ kexec/include/asm-powerpc/kdump.h @@ -0,0 +1,13 @@ +#ifndef _PPC64_KDUMP_H +#define _PPC64_KDUMP_H + +/* How many bytes to reserve at zero for kdump. The reserve limit should + * be greater or equal to the trampoline's end address. */ +#define KDUMP_RESERVE_LIMIT 0x8000 + +#define KDUMP_TRAMPOLINE_START 0x0100 +#define KDUMP_TRAMPOLINE_END 0x3000 + +extern void kdump_setup(void); + +#endif /* __PPC64_KDUMP_H */ Index: kexec/arch/powerpc/kernel/Makefile =================================================================== --- kexec.orig/arch/powerpc/kernel/Makefile +++ kexec/arch/powerpc/kernel/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_IBMVIO) += vio.o obj-$(CONFIG_IBMEBUS) += ibmebus.o obj-$(CONFIG_GENERIC_TBSYNC) += smp-tbsync.o obj64-$(CONFIG_PPC_MULTIPLATFORM) += nvram_64.o +obj-$(CONFIG_CRASH_DUMP) += crash_dump.o ifeq ($(CONFIG_PPC_MERGE),y) Index: kexec/arch/powerpc/kernel/crash_dump.c =================================================================== --- /dev/null +++ kexec/arch/powerpc/kernel/crash_dump.c @@ -0,0 +1,53 @@ +/* + * Routines for doing kexec-based kdump. + * + * Copyright (C) 2005, IBM Corp. + * + * Created by: Michael Ellerman + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#undef DEBUG + +#include +#include +#include + +#ifdef DEBUG +#include +#define DBG(fmt...) udbg_printf(fmt) +#else +#define DBG(fmt...) +#endif + +static void __init create_trampoline(unsigned long addr) +{ + /* The maximum range of a single instruction branch, is the current + * instruction's address + (32 MB - 4) bytes. 
For the trampoline we + * need to branch to current address + 32 MB. So we insert a nop at + * the trampoline address, then the next instruction (+ 4 bytes) + * does a branch to (32 MB - 4). The net effect is that when we + * branch to "addr" we jump to ("addr" + 32 MB). Although it requires + * two instructions it doesn't require any registers. + */ + create_instruction(addr, 0x60000000); /* nop */ + create_branch(addr + 4, addr + PHYSICAL_START, 0); +} + +void __init kdump_setup(void) +{ + unsigned long i; + + DBG(" -> kdump_setup()\n"); + + for (i = KDUMP_TRAMPOLINE_START; i < KDUMP_TRAMPOLINE_END; i += 8) { + create_trampoline(i); + } + + create_trampoline(__pa(system_reset_fwnmi) - PHYSICAL_START); + create_trampoline(__pa(machine_check_fwnmi) - PHYSICAL_START); + + DBG(" <- kdump_setup()\n"); +} From michael at ellerman.id.au Sun Dec 4 18:39:40 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Sun, 04 Dec 2005 18:39:40 Subject: [PATCH 7/11] powerpc: Fixups for kernel linked at 32 MB In-Reply-To: <1133743149.268607.418162138937.qpush@concordia> Message-ID: <20051205003954.6E56168802@ozlabs.org> There's a few places where we need to fix things up for the kernel to work if it's linked at 32MB: - platforms/powermac/smp.c To start secondary cpus on pmac we patch the reset vector, which is fine. Except if we're above 32MB we don't have enough bits for an absolute branch, it needs to be relative. - kernel/head_64.S - A few branches in the cpu hold code need to load the full target address and do a bctr. - after_prom_start needs to load PHYSICAL_START as the dest address, not 0. - The exception prolog needs to load the low word of the target address, not just the low halfword. - Fixup handling of the initial stab address. - kernel/setup_64.c smp_release_cpus() needs to write 1 to the spinloop flag near 0, not 32 MB. 
Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_64.S | 30 ++++++++++++++++++++++++------ arch/powerpc/kernel/setup_64.c | 5 ++++- arch/powerpc/platforms/powermac/smp.c | 16 +++++++--------- include/asm-powerpc/mmu.h | 3 ++- 4 files changed, 37 insertions(+), 17 deletions(-) Index: kexec/arch/powerpc/platforms/powermac/smp.c =================================================================== --- kexec.orig/arch/powerpc/platforms/powermac/smp.c +++ kexec/arch/powerpc/platforms/powermac/smp.c @@ -753,14 +753,15 @@ static int __init smp_core99_probe(void) static void __devinit smp_core99_kick_cpu(int nr) { unsigned int save_vector; - unsigned long new_vector; - unsigned long flags; + unsigned long target, flags; volatile unsigned int *vector = ((volatile unsigned int *)(KERNELBASE+0x100)); if (nr < 0 || nr > 3) return; - if (ppc_md.progress) ppc_md.progress("smp_core99_kick_cpu", 0x346); + + if (ppc_md.progress) + ppc_md.progress("smp_core99_kick_cpu", 0x346); local_irq_save(flags); local_irq_disable(); @@ -768,14 +769,11 @@ static void __devinit smp_core99_kick_cp /* Save reset vector */ save_vector = *vector; - /* Setup fake reset vector that does + /* Setup fake reset vector that does * b __secondary_start_pmac_0 + nr*8 - KERNELBASE */ - new_vector = (unsigned long) __secondary_start_pmac_0 + nr * 8; - *vector = 0x48000002 + new_vector - KERNELBASE; - - /* flush data cache and inval instruction cache */ - flush_icache_range((unsigned long) vector, (unsigned long) vector + 4); + target = (unsigned long) __secondary_start_pmac_0 + nr * 8; + create_branch((unsigned long)vector, target, BRANCH_SET_LINK); /* Put some life in our friend */ pmac_call_feature(PMAC_FTR_RESET_CPU, NULL, nr, 0); Index: kexec/arch/powerpc/kernel/head_64.S =================================================================== --- kexec.orig/arch/powerpc/kernel/head_64.S +++ kexec/arch/powerpc/kernel/head_64.S @@ -154,11 +154,15 @@ _GLOBAL(__secondary_hold) bne 100b #ifdef CONFIG_HMT - 
b .hmt_init + LOADADDR(r4, .hmt_init) + mtctr r4 + bctr #else #ifdef CONFIG_SMP + LOADADDR(r4, .pSeries_secondary_smp_init) + mtctr r4 mr r3,r24 - b .pSeries_secondary_smp_init + bctr #else BUG_OPCODE #endif @@ -200,6 +204,20 @@ exception_marker: #define EX_R3 64 #define EX_LR 72 +/* + * We're short on space and time in the exception prolog, so we can't use + * the normal LOADADDR macro. Normally we just need the low halfword of the + * address, but for Kdump we need the whole low word. + */ +#ifdef CONFIG_CRASH_DUMP +#define LOAD_HANDLER(reg, label) \ + oris r12,r12,(label)@h; /* virt addr of handler ... */ \ + ori r12,r12,(label)@l; /* .. and the rest */ +#else +#define LOAD_HANDLER(reg, label) \ + ori r12,r12,(label)@l; /* virt addr of handler ... */ +#endif + #define EXCEPTION_PROLOG_PSERIES(area, label) \ mfspr r13,SPRN_SPRG3; /* get paca address into r13 */ \ std r9,area+EX_R9(r13); /* save r9 - r12 */ \ @@ -212,7 +230,7 @@ exception_marker: clrrdi r12,r13,32; /* get high part of &label */ \ mfmsr r10; \ mfspr r11,SPRN_SRR0; /* save SRR0 */ \ - ori r12,r12,(label)@l; /* virt addr of handler */ \ + LOAD_HANDLER(r12,label) \ ori r10,r10,MSR_IR|MSR_DR|MSR_RI; \ mtspr SPRN_SRR0,r12; \ mfspr r12,SPRN_SRR1; /* and SRR1 */ \ @@ -1348,7 +1366,7 @@ _GLOBAL(do_stab_bolted) * fixed address (the linker can't compute (u64)&initial_stab >> * PAGE_SHIFT). */ - . = STAB0_PHYS_ADDR /* 0x6000 */ + . = STAB0_OFFSET /* 0x6000 */ .globl initial_stab initial_stab: .space 4096 @@ -1553,7 +1571,7 @@ _STATIC(__boot_from_prom) _STATIC(__after_prom_start) /* - * We need to run with __start at physical address 0. + * We need to run with __start at physical address PHYSICAL_START. * This will leave some code in the first 256B of * real memory, which are reserved for software use. 
* The remainder of the first page is loaded with the fixed @@ -1568,7 +1586,7 @@ _STATIC(__after_prom_start) mr r26,r3 SET_REG_TO_CONST(r27,KERNELBASE) - li r3,0 /* target addr */ + LOADADDR(r3, PHYSICAL_START) /* target addr */ // XXX FIXME: Use phys returned by OF (r30) add r4,r27,r26 /* source addr */ Index: kexec/arch/powerpc/kernel/setup_64.c =================================================================== --- kexec.orig/arch/powerpc/kernel/setup_64.c +++ kexec/arch/powerpc/kernel/setup_64.c @@ -314,6 +314,7 @@ void early_setup_secondary(void) void smp_release_cpus(void) { extern unsigned long __secondary_hold_spinloop; + unsigned long *ptr; DBG(" -> smp_release_cpus()\n"); @@ -324,7 +325,9 @@ void smp_release_cpus(void) * This is useless but harmless on iSeries, secondaries are already * waiting on their paca spinloops. */ - __secondary_hold_spinloop = 1; + ptr = (unsigned long *)((unsigned long)&__secondary_hold_spinloop + - PHYSICAL_START); + *ptr = 1; mb(); DBG(" <- smp_release_cpus()\n"); Index: kexec/include/asm-powerpc/mmu.h =================================================================== --- kexec.orig/include/asm-powerpc/mmu.h +++ kexec/include/asm-powerpc/mmu.h @@ -33,7 +33,8 @@ /* Location of cpu0's segment table */ #define STAB0_PAGE 0x6 -#define STAB0_PHYS_ADDR (STAB0_PAGE<<12) +#define STAB0_OFFSET (STAB0_PAGE << 12) +#define STAB0_PHYS_ADDR (STAB0_OFFSET + PHYSICAL_START) #ifndef __ASSEMBLY__ extern char initial_stab[]; From michael at ellerman.id.au Sun Dec 4 18:39:43 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Sun, 04 Dec 2005 18:39:43 Subject: [PATCH 8/11] powerpc: Add arch dependent basic infrastructure for Kdump. In-Reply-To: <1133743149.268607.418162138937.qpush@concordia> Message-ID: <20051205003957.4CD9868887@ozlabs.org> Implementing the machine_crash_shutdown which will be called by crash_kexec (called in case of a panic, sysrq etc.). 
Disable the interrupts, shootdown cpus using debugger IPI and collect regs for all CPUs. elfcorehdr= specifies the location of elf core header stored by the crashed kernel. This command line option will be passed by the kexec-tools to capture kernel. savemaxmem= specifies the actual memory size that the first kernel has and this value will be used for dumping in the capture kernel. This command line option will be passed by the kexec-tools to capture kernel. Signed-off-by: Haren Myneni Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/Makefile | 2 arch/powerpc/kernel/crash.c | 264 ++++++++++++++++++++++++++++++++ arch/powerpc/kernel/crash_dump.c | 20 ++ arch/powerpc/kernel/machine_kexec_64.c | 13 + arch/powerpc/kernel/smp.c | 22 ++ arch/powerpc/kernel/traps.c | 17 +- arch/powerpc/platforms/cell/setup.c | 1 arch/powerpc/platforms/maple/setup.c | 1 arch/powerpc/platforms/powermac/setup.c | 1 arch/powerpc/platforms/pseries/setup.c | 1 arch/powerpc/platforms/pseries/xics.c | 2 include/asm-powerpc/kexec.h | 10 + 12 files changed, 345 insertions(+), 9 deletions(-) Index: kexec/arch/powerpc/kernel/smp.c =================================================================== --- kexec.orig/arch/powerpc/kernel/smp.c +++ kexec/arch/powerpc/kernel/smp.c @@ -75,6 +75,8 @@ void smp_call_function_interrupt(void); int smt_enabled_at_boot = 1; +static void (*crash_ipi_function_ptr)(struct pt_regs *) = NULL; + #ifdef CONFIG_MPIC int __init smp_mpic_probe(void) { @@ -123,11 +125,16 @@ void smp_message_recv(int msg, struct pt /* XXX Do we have to do this? 
*/ set_need_resched(); break; -#ifdef CONFIG_DEBUGGER case PPC_MSG_DEBUGGER_BREAK: + if (crash_ipi_function_ptr) { + crash_ipi_function_ptr(regs); + break; + } +#ifdef CONFIG_DEBUGGER debugger_ipi(regs); break; -#endif +#endif /* CONFIG_DEBUGGER */ + /* FALLTHROUGH */ default: printk("SMP %d: smp_message_recv(): unknown msg %d\n", smp_processor_id(), msg); @@ -147,6 +154,17 @@ void smp_send_debugger_break(int cpu) } #endif +#ifdef CONFIG_KEXEC +void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *)) +{ + crash_ipi_function_ptr = crash_ipi_callback; + if (crash_ipi_callback) { + mb(); + smp_ops->message_pass(MSG_ALL_BUT_SELF, PPC_MSG_DEBUGGER_BREAK); + } +} +#endif + static void stop_this_cpu(void *dummy) { local_irq_disable(); Index: kexec/arch/powerpc/kernel/crash.c =================================================================== --- /dev/null +++ kexec/arch/powerpc/kernel/crash.c @@ -0,0 +1,264 @@ +/* + * Architecture specific (PPC64) functions for kexec based crash dumps. + * + * Copyright (C) 2005, IBM Corp. + * + * Created by: Haren Myneni + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + * + */ + +#undef DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef DEBUG +#include +#define DBG(fmt...) udbg_printf(fmt) +#else +#define DBG(fmt...) +#endif + +/* This keeps a track of which one is crashing cpu. 
*/ +int crashing_cpu = -1; + +static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, + size_t data_len) +{ + struct elf_note note; + + note.n_namesz = strlen(name) + 1; + note.n_descsz = data_len; + note.n_type = type; + memcpy(buf, &note, sizeof(note)); + buf += (sizeof(note) +3)/4; + memcpy(buf, name, note.n_namesz); + buf += (note.n_namesz + 3)/4; + memcpy(buf, data, note.n_descsz); + buf += (note.n_descsz + 3)/4; + + return buf; +} + +static void final_note(u32 *buf) +{ + struct elf_note note; + + note.n_namesz = 0; + note.n_descsz = 0; + note.n_type = 0; + memcpy(buf, &note, sizeof(note)); +} + +static void crash_save_this_cpu(struct pt_regs *regs, int cpu) +{ + struct elf_prstatus prstatus; + u32 *buf; + + if ((cpu < 0) || (cpu >= NR_CPUS)) + return; + + /* Using ELF notes here is opportunistic. + * I need a well defined structure format + * for the data I pass, and I need tags + * on the data to indicate what information I have + * squirrelled away. ELF notes happen to provide + * all of that that no need to invent something new. + */ + buf = &crash_notes[cpu][0]; + memset(&prstatus, 0, sizeof(prstatus)); + prstatus.pr_pid = current->pid; + elf_core_copy_regs(&prstatus.pr_reg, regs); + buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus, + sizeof(prstatus)); + final_note(buf); +} + +/* FIXME Merge this with xmon_save_regs ?? 
*/ +static inline void crash_get_current_regs(struct pt_regs *regs) +{ + unsigned long tmp1, tmp2; + + __asm__ __volatile__ ( + "std 0,0(%2)\n" + "std 1,8(%2)\n" + "std 2,16(%2)\n" + "std 3,24(%2)\n" + "std 4,32(%2)\n" + "std 5,40(%2)\n" + "std 6,48(%2)\n" + "std 7,56(%2)\n" + "std 8,64(%2)\n" + "std 9,72(%2)\n" + "std 10,80(%2)\n" + "std 11,88(%2)\n" + "std 12,96(%2)\n" + "std 13,104(%2)\n" + "std 14,112(%2)\n" + "std 15,120(%2)\n" + "std 16,128(%2)\n" + "std 17,136(%2)\n" + "std 18,144(%2)\n" + "std 19,152(%2)\n" + "std 20,160(%2)\n" + "std 21,168(%2)\n" + "std 22,176(%2)\n" + "std 23,184(%2)\n" + "std 24,192(%2)\n" + "std 25,200(%2)\n" + "std 26,208(%2)\n" + "std 27,216(%2)\n" + "std 28,224(%2)\n" + "std 29,232(%2)\n" + "std 30,240(%2)\n" + "std 31,248(%2)\n" + "mfmsr %0\n" + "std %0, 264(%2)\n" + "mfctr %0\n" + "std %0, 280(%2)\n" + "mflr %0\n" + "std %0, 288(%2)\n" + "bl 1f\n" + "1: mflr %1\n" + "std %1, 256(%2)\n" + "mtlr %0\n" + "mfxer %0\n" + "std %0, 296(%2)\n" + : "=&r" (tmp1), "=&r" (tmp2) + : "b" (regs)); +} + +/* We may have saved_regs from where the error came from + * or it is NULL if via a direct panic(). 
+ */ +static void crash_save_self(struct pt_regs *saved_regs) +{ + struct pt_regs regs; + int cpu; + + cpu = smp_processor_id(); + if (saved_regs) + memcpy(&regs, saved_regs, sizeof(regs)); + else + crash_get_current_regs(&regs); + crash_save_this_cpu(&regs, cpu); +} + +#ifdef CONFIG_SMP +static atomic_t waiting_for_crash_ipi; + +void crash_ipi_callback(struct pt_regs *regs) +{ + int cpu = smp_processor_id(); + + if (cpu == crashing_cpu) + return; + + if (!cpu_online(cpu)) + return; + + if (ppc_md.kexec_cpu_down) + ppc_md.kexec_cpu_down(1, 1); + + local_irq_disable(); + + crash_save_this_cpu(regs, cpu); + atomic_dec(&waiting_for_crash_ipi); + kexec_smp_wait(); + /* NOTREACHED */ +} + +static void crash_kexec_prepare_cpus(void) +{ + unsigned int msecs; + + atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); + + crash_send_ipi(crash_ipi_callback); + smp_wmb(); + + /* + * FIXME: Until we will have the way to stop other CPUSs reliabally, + * the crash CPU will send an IPI and wait for other CPUs to + * respond. If not, proceed the kexec boot even though we failed to + * capture other CPU states. + */ + msecs = 1000000; + while ((atomic_read(&waiting_for_crash_ipi) > 0) && (--msecs > 0)) { + barrier(); + mdelay(1); + } + + /* Would it be better to replace the trap vector here? */ + + /* + * FIXME: In case if we do not get all CPUs, one possibility: ask the + * user to do soft reset such that we get all. + * IPI handler is already set by the panic cpu initially. Therefore, + * all cpus could invoke this handler from die() and the panic CPU + * will call machine_kexec() directly from this handler to do + * kexec boot. 
+ */ + if (atomic_read(&waiting_for_crash_ipi)) + printk(KERN_ALERT "done waiting: %d cpus not responding\n", + atomic_read(&waiting_for_crash_ipi)); + /* Leave the IPI callback set */ +} +#else +static void crash_kexec_prepare_cpus(void) +{ + /* + * move the secondarys to us so that we can copy + * the new kernel 0-0x100 safely + * + * do this if kexec in setup.c ? + */ + smp_release_cpus(); +} + +#endif + +void default_machine_crash_shutdown(struct pt_regs *regs) +{ + /* + * This function is only called after the system + * has paniced or is otherwise in a critical state. + * The minimum amount of code to allow a kexec'd kernel + * to run successfully needs to happen here. + * + * In practice this means stopping other cpus in + * an SMP system. + * The kernel is broken so disable interrupts. + */ + local_irq_disable(); + + if (ppc_md.kexec_cpu_down) + ppc_md.kexec_cpu_down(1, 0); + + /* + * Make a note of crashing cpu. Will be used in machine_kexec + * such that another IPI will not be sent. 
+ */ + crashing_cpu = smp_processor_id(); + crash_kexec_prepare_cpus(); + crash_save_self(regs); +} Index: kexec/arch/powerpc/kernel/traps.c =================================================================== --- kexec.orig/arch/powerpc/kernel/traps.c +++ kexec/arch/powerpc/kernel/traps.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -95,7 +96,7 @@ static DEFINE_SPINLOCK(die_lock); int die(const char *str, struct pt_regs *regs, long err) { - static int die_counter; + static int die_counter, crash_dump_start = 0; int nl = 0; if (debugger(regs)) @@ -156,7 +157,21 @@ int die(const char *str, struct pt_regs print_modules(); show_regs(regs); bust_spinlocks(0); + + if (!crash_dump_start && kexec_should_crash(current)) { + crash_dump_start = 1; + spin_unlock_irq(&die_lock); + crash_kexec(regs); + /* NOTREACHED */ + } spin_unlock_irq(&die_lock); + if (crash_dump_start) + /* + * Only for soft-reset: Other CPUs will be responded to an IPI + * sent by first kexec CPU. + */ + for(;;) + ; if (in_interrupt()) panic("Fatal exception in interrupt"); Index: kexec/arch/powerpc/kernel/machine_kexec_64.c =================================================================== --- kexec.orig/arch/powerpc/kernel/machine_kexec_64.c +++ kexec/arch/powerpc/kernel/machine_kexec_64.c @@ -265,11 +265,18 @@ extern NORET_TYPE void kexec_sequence(vo /* too late to fail here */ void default_machine_kexec(struct kimage *image) { - /* prepare control code if any */ - /* shutdown other cpus into our wait loop and quiesce interrupts */ - kexec_prepare_cpus(); + /* + * If the kexec boot is the normal one, need to shutdown other cpus + * into our wait loop and quiesce interrupts. + * Otherwise, in the case of crashed mode (crashing_cpu >= 0), + * stopping other CPUs and collecting their pt_regs is done before + * using debugger IPI. + */ + + if (crashing_cpu == -1) + kexec_prepare_cpus(); /* switch to a staticly allocated stack. Based on irq stack code. 
* XXX: the task struct will likely be invalid once we do the copy! Index: kexec/include/asm-powerpc/kexec.h =================================================================== --- kexec.orig/include/asm-powerpc/kexec.h +++ kexec/include/asm-powerpc/kexec.h @@ -32,6 +32,8 @@ #ifndef __ASSEMBLY__ +#ifdef CONFIG_KEXEC + #define MAX_NOTE_BYTES 1024 typedef u32 note_buf_t[MAX_NOTE_BYTES / sizeof(u32)]; @@ -41,11 +43,17 @@ extern note_buf_t crash_notes[]; extern void kexec_smp_wait(void); /* get and clear naca physid, wait for master to copy new code to 0 */ extern void __init kexec_setup(void); -#endif +extern int crashing_cpu; +extern void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *)); +#endif /* __powerpc64 __ */ struct kimage; +struct pt_regs; extern void default_machine_kexec(struct kimage *image); extern int default_machine_kexec_prepare(struct kimage *image); +extern void default_machine_crash_shutdown(struct pt_regs *regs); + +#endif /* !CONFIG_KEXEC */ #endif /* ! __ASSEMBLY__ */ #endif /* _ASM_POWERPC_KEXEC_H */ Index: kexec/arch/powerpc/platforms/pseries/xics.c =================================================================== --- kexec.orig/arch/powerpc/platforms/pseries/xics.c +++ kexec/arch/powerpc/platforms/pseries/xics.c @@ -417,7 +417,7 @@ irqreturn_t xics_ipi_action(int irq, voi smp_message_recv(PPC_MSG_MIGRATE_TASK, regs); } #endif -#ifdef CONFIG_DEBUGGER +#if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC) if (test_and_clear_bit(PPC_MSG_DEBUGGER_BREAK, &xics_ipi_message[cpu].value)) { mb(); Index: kexec/arch/powerpc/platforms/cell/setup.c =================================================================== --- kexec.orig/arch/powerpc/platforms/cell/setup.c +++ kexec/arch/powerpc/platforms/cell/setup.c @@ -217,5 +217,6 @@ struct machdep_calls __initdata cell_md #ifdef CONFIG_KEXEC .machine_kexec = default_machine_kexec, .machine_kexec_prepare = default_machine_kexec_prepare, + .machine_crash_shutdown = default_machine_crash_shutdown, 
#endif }; Index: kexec/arch/powerpc/platforms/maple/setup.c =================================================================== --- kexec.orig/arch/powerpc/platforms/maple/setup.c +++ kexec/arch/powerpc/platforms/maple/setup.c @@ -282,5 +282,6 @@ struct machdep_calls __initdata maple_md #ifdef CONFIG_KEXEC .machine_kexec = default_machine_kexec, .machine_kexec_prepare = default_machine_kexec_prepare, + .machine_crash_shutdown = default_machine_crash_shutdown, #endif }; Index: kexec/arch/powerpc/platforms/powermac/setup.c =================================================================== --- kexec.orig/arch/powerpc/platforms/powermac/setup.c +++ kexec/arch/powerpc/platforms/powermac/setup.c @@ -771,6 +771,7 @@ struct machdep_calls __initdata pmac_md #ifdef CONFIG_KEXEC .machine_kexec = default_machine_kexec, .machine_kexec_prepare = default_machine_kexec_prepare, + .machine_crash_shutdown = default_machine_crash_shutdown, #endif #endif /* CONFIG_PPC64 */ #ifdef CONFIG_PPC32 Index: kexec/arch/powerpc/platforms/pseries/setup.c =================================================================== --- kexec.orig/arch/powerpc/platforms/pseries/setup.c +++ kexec/arch/powerpc/platforms/pseries/setup.c @@ -629,5 +629,6 @@ struct machdep_calls __initdata pSeries_ .kexec_cpu_down = pseries_kexec_cpu_down, .machine_kexec = default_machine_kexec, .machine_kexec_prepare = default_machine_kexec_prepare, + .machine_crash_shutdown = default_machine_crash_shutdown, #endif }; Index: kexec/arch/powerpc/kernel/crash_dump.c =================================================================== --- kexec.orig/arch/powerpc/kernel/crash_dump.c +++ kexec/arch/powerpc/kernel/crash_dump.c @@ -11,6 +11,8 @@ #undef DEBUG +#include +#include #include #include #include @@ -51,3 +53,21 @@ void __init kdump_setup(void) DBG(" <- kdump_setup()\n"); } + +static int __init parse_elfcorehdr(char *p) +{ + if (p) + elfcorehdr_addr = memparse(p, &p); + + return 0; +} +__setup("elfcorehdr=", parse_elfcorehdr); 
+ +static int __init parse_savemaxmem(char *p) +{ + if (p) + saved_max_pfn = (memparse(p, &p) >> PAGE_SHIFT) - 1; + + return 0; +} +__setup("savemaxmem=", parse_savemaxmem); Index: kexec/arch/powerpc/kernel/Makefile =================================================================== --- kexec.orig/arch/powerpc/kernel/Makefile +++ kexec/arch/powerpc/kernel/Makefile @@ -66,7 +66,7 @@ pci64-$(CONFIG_PPC64) += pci_64.o pci_d obj-$(CONFIG_PCI) += $(pci64-y) kexec-$(CONFIG_PPC64) := machine_kexec_64.o kexec-$(CONFIG_PPC32) := machine_kexec_32.o -obj-$(CONFIG_KEXEC) += machine_kexec.o $(kexec-y) +obj-$(CONFIG_KEXEC) += machine_kexec.o crash.o $(kexec-y) ifeq ($(CONFIG_PPC_ISERIES),y) $(obj)/head_64.o: $(obj)/lparmap.s From michael at ellerman.id.au Sun Dec 4 18:39:48 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Sun, 04 Dec 2005 18:39:48 Subject: [PATCH 9/11] powerpc: Parse crashkernel= parameter in first kernel In-Reply-To: <1133743149.268607.418162138937.qpush@concordia> Message-ID: <20051205004002.7A01B68889@ozlabs.org> This patch adds code to parse and setup the crash kernel resource in the first kernel. PPC64 ignores the @x part, we always run at 32 MB. 
Signed-off-by: Haren Myneni Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/prom.c | 11 ++++++++ arch/powerpc/kernel/prom_init.c | 53 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) Index: kexec/arch/powerpc/kernel/prom_init.c =================================================================== --- kexec.orig/arch/powerpc/kernel/prom_init.c +++ kexec/arch/powerpc/kernel/prom_init.c @@ -192,6 +192,11 @@ static unsigned long __initdata alloc_bo static unsigned long __initdata rmo_top; static unsigned long __initdata ram_top; +#ifdef CONFIG_KEXEC +static unsigned long __initdata prom_crashk_base; +static unsigned long __initdata prom_crashk_size; +#endif + static struct mem_map_entry __initdata mem_reserve_map[MEM_RESERVE_MAP_SIZE]; static int __initdata mem_reserve_cnt; @@ -590,6 +595,34 @@ static void __init early_cmdline_parse(v RELOC(prom_memory_limit) = ALIGN(RELOC(prom_memory_limit), 0x1000000); #endif } + +#ifdef CONFIG_KEXEC + /* + * crashkernel=size at addr specifies the location to reserve for + * crash kernel. + */ + opt = strstr(RELOC(prom_cmd_line), RELOC("crashkernel=")); + if (opt) { + opt += 12; + RELOC(prom_crashk_size) = prom_memparse(opt, &opt); + + if (ALIGN(RELOC(prom_crashk_size), 0x1000000) != + RELOC(prom_crashk_size)) { + prom_printf("Warning: crashkernel size is not " + "aligned to 16MB\n"); + } + + /* + * At present, the crash kernel always run at 32MB. + * Just ignore whatever user passed. 
+ */ + RELOC(prom_crashk_base) = 0x2000000; + if (*opt == '@') { + prom_printf("Warning: PPC64 kdump kernel always runs " + "at 32 MB\n"); + } + } +#endif } #ifdef CONFIG_PPC_PSERIES @@ -1011,6 +1044,12 @@ static void __init prom_init_mem(void) prom_printf(" alloc_top_hi : %x\n", RELOC(alloc_top_high)); prom_printf(" rmo_top : %x\n", RELOC(rmo_top)); prom_printf(" ram_top : %x\n", RELOC(ram_top)); +#ifdef CONFIG_KEXEC + if (RELOC(prom_crashk_base)) { + prom_printf(" crashk_base : %x\n", RELOC(prom_crashk_base)); + prom_printf(" crashk_size : %x\n", RELOC(prom_crashk_size)); + } +#endif } @@ -2094,6 +2133,10 @@ unsigned long __init prom_init(unsigned */ prom_init_mem(); +#ifdef CONFIG_KEXEC + if (RELOC(prom_crashk_base)) + reserve_mem(RELOC(prom_crashk_base), RELOC(prom_crashk_size)); +#endif /* * Determine which cpu is actually running right _now_ */ @@ -2150,6 +2193,16 @@ unsigned long __init prom_init(unsigned } #endif +#ifdef CONFIG_KEXEC + if (RELOC(prom_crashk_base)) { + prom_setprop(_prom->chosen, "/chosen", "linux,crashkernel-base", + PTRRELOC(&prom_crashk_base), + sizeof(RELOC(prom_crashk_base))); + prom_setprop(_prom->chosen, "/chosen", "linux,crashkernel-size", + PTRRELOC(&prom_crashk_size), + sizeof(RELOC(prom_crashk_size))); + } +#endif /* * Fixup any known bugs in the device-tree */ Index: kexec/arch/powerpc/kernel/prom.c =================================================================== --- kexec.orig/arch/powerpc/kernel/prom.c +++ kexec/arch/powerpc/kernel/prom.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -1198,6 +1199,16 @@ static int __init early_init_dt_scan_cho } #endif /* CONFIG_PPC_RTAS */ +#ifdef CONFIG_KEXEC + lprop = (u64*)of_get_flat_dt_prop(node, "linux,crashkernel-base", NULL); + if (lprop) + crashk_res.start = *lprop; + + lprop = (u64*)of_get_flat_dt_prop(node, "linux,crashkernel-size", NULL); + if (lprop) + crashk_res.end = crashk_res.start + *lprop - 1; +#endif + /* break now */ return 1; } From 
michael at ellerman.id.au Sun Dec 4 18:39:51 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Sun, 04 Dec 2005 18:39:51 Subject: [PATCH 10/11] powerpc: Add arch-dependant copy_oldmem_page In-Reply-To: <1133743149.268607.418162138937.qpush@concordia> Message-ID: <20051205004006.1FEB26887B@ozlabs.org> Add arch-dependant copy_oldmem_page. Signed-off-by: Haren Myneni Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/crash_dump.c | 36 ++++++++++++++++++++++++++++++++++++ include/asm-powerpc/kexec.h | 2 ++ kernel/crash_dump.c | 3 +++ 3 files changed, 41 insertions(+) Index: kexec/arch/powerpc/kernel/crash_dump.c =================================================================== --- kexec.orig/arch/powerpc/kernel/crash_dump.c +++ kexec/arch/powerpc/kernel/crash_dump.c @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef DEBUG #include @@ -71,3 +72,38 @@ static int __init parse_savemaxmem(char return 0; } __setup("savemaxmem=", parse_savemaxmem); + +/* + * copy_oldmem_page - copy one page from "oldmem" + * @pfn: page frame number to be copied + * @buf: target memory address for the copy; this can be in kernel address + * space or user address space (see @userbuf) + * @csize: number of bytes to copy + * @offset: offset in bytes into the page (based on pfn) to begin the copy + * @userbuf: if set, @buf is in user address space, use copy_to_user(), + * otherwise @buf is in kernel address space, use memcpy(). + * + * Copy a page from "oldmem". For this page, there is no pte mapped + * in the current kernel. We stitch up a pte, similar to kmap_atomic. 
+ */ +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, + size_t csize, unsigned long offset, int userbuf) +{ + void *vaddr; + + if (!csize) + return 0; + + vaddr = __ioremap(pfn << PAGE_SHIFT, PAGE_SIZE, 0); + + if (userbuf) { + if (copy_to_user((char __user *)buf, (vaddr + offset), csize)) { + iounmap(vaddr); + return -EFAULT; + } + } else + memcpy(buf, (vaddr + offset), csize); + + iounmap(vaddr); + return csize; +} Index: kexec/include/asm-powerpc/kexec.h =================================================================== --- kexec.orig/include/asm-powerpc/kexec.h +++ kexec/include/asm-powerpc/kexec.h @@ -30,6 +30,8 @@ #define KEXEC_ARCH KEXEC_ARCH_PPC #endif +#define HAVE_ARCH_COPY_OLDMEM_PAGE + #ifndef __ASSEMBLY__ #ifdef CONFIG_KEXEC Index: kexec/kernel/crash_dump.c =================================================================== --- kexec.orig/kernel/crash_dump.c +++ kexec/kernel/crash_dump.c @@ -14,10 +14,12 @@ #include #include +#include /* Stores the physical address of elf header of crash image. */ unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; +#ifndef HAVE_ARCH_COPY_OLDMEM_PAGE /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied @@ -59,3 +61,4 @@ ssize_t copy_oldmem_page(unsigned long p kfree(page); return csize; } +#endif From michael at ellerman.id.au Sun Dec 4 18:39:55 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Sun, 04 Dec 2005 18:39:55 Subject: [PATCH 11/11] powerpc: Add support for "linux, usable-memory" on memory nodes In-Reply-To: <1133743149.268607.418162138937.qpush@concordia> Message-ID: <20051205004009.D5BCE68865@ozlabs.org> Milton has proposed that we should support a "linux,usable-memory" property on memory nodes which describes, in preference to "reg", the regions of memory Linux should use. This facility is required for kdump, to inform the second kernel which memory it should use. 
Signed-off-by: Haren Myneni Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/prom.c | 9 +++++++-- arch/powerpc/mm/numa.c | 7 ++++++- 2 files changed, 13 insertions(+), 3 deletions(-) Index: kexec/arch/powerpc/kernel/prom.c =================================================================== --- kexec.orig/arch/powerpc/kernel/prom.c +++ kexec/arch/powerpc/kernel/prom.c @@ -567,7 +567,10 @@ static int __init interpret_root_props(s unsigned int *rp; int rpsize = (naddrc + nsizec) * sizeof(unsigned int); - rp = (unsigned int *) get_property(np, "reg", &l); + rp = (unsigned int *) get_property(np, "linux,usable-memory", &l); + if (rp == NULL) + rp = (unsigned int *) get_property(np, "reg", &l); + if (rp != 0 && l >= rpsize) { i = 0; adr = (struct address_range *) (*mem_start); @@ -1275,7 +1278,9 @@ static int __init early_init_dt_scan_mem } else if (strcmp(type, "memory") != 0) return 0; - reg = (cell_t *)of_get_flat_dt_prop(node, "reg", &l); + reg = (cell_t *)of_get_flat_dt_prop(node, "linux,usable-memory", &l); + if (reg == NULL) + reg = (cell_t *)of_get_flat_dt_prop(node, "reg", &l); if (reg == NULL) return 0; Index: kexec/arch/powerpc/mm/numa.c =================================================================== --- kexec.orig/arch/powerpc/mm/numa.c +++ kexec/arch/powerpc/mm/numa.c @@ -423,7 +423,12 @@ static int __init parse_numa_properties( unsigned int *memcell_buf; unsigned int len; - memcell_buf = (unsigned int *)get_property(memory, "reg", &len); + memcell_buf = (unsigned int *)get_property(memory, + "linux,usable-memory", &len); + if (!memcell_buf || len <= 0) + memcell_buf = + (unsigned int *)get_property(memory, "reg", + &len); if (!memcell_buf || len <= 0) continue; From paulus at samba.org Mon Dec 5 15:06:13 2005 From: paulus at samba.org (Paul Mackerras) Date: Mon, 5 Dec 2005 15:06:13 +1100 Subject: compilation error for CONFIG_SMP=n In-Reply-To: <4390CC85.8030808@us.ibm.com> References: <4390CC85.8030808@us.ibm.com> Message-ID: 
<17299.48309.788379.53779@cargo.ozlabs.ibm.com> Haren Myneni writes: > Getting undeclared symbol `H_SET_ASR' for CONFIG_SMP=n. There weren't actually any released pSeries machines that had a hypervisor and a segment table, so I will just take out the code that calls H_SET_ASR instead. Paul. From michael at ellerman.id.au Sun Dec 4 23:07:02 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Sun, 04 Dec 2005 23:07:02 Subject: [PATCH] powerpc: Separate usage of KERNELBASE and PAGE_OFFSET In-Reply-To: <20051205003934.643E26887C@ozlabs.org> Message-ID: <20051205050717.ED89768863@ozlabs.org> This patch separates usage of KERNELBASE and PAGE_OFFSET. I haven't looked at any of the PPC code, if we ever want to support Kdump on PPC we'll have to do another audit, ditto for iSeries. This patch makes PAGE_OFFSET the constant, it'll always be 0xC * 1 gazillion. To get a physical address from a virtual one you subtract PAGE_OFFSET, _not_ KERNELBASE. KERNELBASE is the virtual address of the start of the kernel, it's often the same as PAGE_OFFSET, but _might not be_. If you want to know something's offset from the start of the kernel you should subtract KERNELBASE. 
Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/btext.c | 4 ++-- arch/powerpc/kernel/entry_64.S | 4 ++-- arch/powerpc/kernel/lparmap.c | 6 +++--- arch/powerpc/kernel/machine_kexec_64.c | 5 ++--- arch/powerpc/mm/hash_utils_64.c | 6 +++--- arch/powerpc/mm/slb.c | 4 ++-- arch/powerpc/mm/slb_low.S | 6 +++--- arch/powerpc/mm/stab.c | 10 +++++----- include/asm-powerpc/page.h | 2 +- 9 files changed, 23 insertions(+), 24 deletions(-) Index: kexec/arch/powerpc/mm/stab.c =================================================================== --- kexec.orig/arch/powerpc/mm/stab.c +++ kexec/arch/powerpc/mm/stab.c @@ -40,7 +40,7 @@ static int make_ste(unsigned long stab, unsigned long entry, group, old_esid, castout_entry, i; unsigned int global_entry; struct stab_entry *ste, *castout_ste; - unsigned long kernel_segment = (esid << SID_SHIFT) >= KERNELBASE; + unsigned long kernel_segment = (esid << SID_SHIFT) >= PAGE_OFFSET; vsid_data = vsid << STE_VSID_SHIFT; esid_data = esid << SID_SHIFT | STE_ESID_KP | STE_ESID_V; @@ -83,7 +83,7 @@ static int make_ste(unsigned long stab, } /* Dont cast out the first kernel segment */ - if ((castout_ste->esid_data & ESID_MASK) != KERNELBASE) + if ((castout_ste->esid_data & ESID_MASK) != PAGE_OFFSET) break; castout_entry = (castout_entry + 1) & 0xf; @@ -251,7 +251,7 @@ void stabs_alloc(void) panic("Unable to allocate segment table for CPU %d.\n", cpu); - newstab += KERNELBASE; + newstab = (unsigned long)__va(newstab); memset((void *)newstab, 0, HW_PAGE_SIZE); @@ -270,11 +270,11 @@ void stabs_alloc(void) */ void stab_initialize(unsigned long stab) { - unsigned long vsid = get_kernel_vsid(KERNELBASE); + unsigned long vsid = get_kernel_vsid(PAGE_OFFSET); unsigned long stabreal; asm volatile("isync; slbia; isync":::"memory"); - make_ste(stab, GET_ESID(KERNELBASE), vsid); + make_ste(stab, GET_ESID(PAGE_OFFSET), vsid); /* Order update */ asm volatile("sync":::"memory"); Index: kexec/arch/powerpc/kernel/machine_kexec_64.c 
=================================================================== --- kexec.orig/arch/powerpc/kernel/machine_kexec_64.c +++ kexec/arch/powerpc/kernel/machine_kexec_64.c @@ -153,9 +153,8 @@ void kexec_copy_flush(struct kimage *ima * including ones that were in place on the original copy */ for (i = 0; i < nr_segments; i++) - flush_icache_range(ranges[i].mem + KERNELBASE, - ranges[i].mem + KERNELBASE + - ranges[i].memsz); + flush_icache_range((unsigned long)__va(ranges[i].mem), + (unsigned long)__va(ranges[i].mem + ranges[i].memsz)); } #ifdef CONFIG_SMP Index: kexec/arch/powerpc/mm/hash_utils_64.c =================================================================== --- kexec.orig/arch/powerpc/mm/hash_utils_64.c +++ kexec/arch/powerpc/mm/hash_utils_64.c @@ -456,7 +456,7 @@ void __init htab_initialize(void) /* create bolted the linear mapping in the hash table */ for (i=0; i < lmb.memory.cnt; i++) { - base = lmb.memory.region[i].base + KERNELBASE; + base = (unsigned long)__va(lmb.memory.region[i].base); size = lmb.memory.region[i].size; DBG("creating mapping for region: %lx : %lx\n", base, size); @@ -498,8 +498,8 @@ void __init htab_initialize(void) * for either 4K or 16MB pages. 
*/ if (tce_alloc_start) { - tce_alloc_start += KERNELBASE; - tce_alloc_end += KERNELBASE; + tce_alloc_start = (unsigned long)__va(tce_alloc_start); + tce_alloc_end = (unsigned long)__va(tce_alloc_end); if (base + size >= tce_alloc_start) tce_alloc_start = base + size + 1; Index: kexec/arch/powerpc/mm/slb.c =================================================================== --- kexec.orig/arch/powerpc/mm/slb.c +++ kexec/arch/powerpc/mm/slb.c @@ -75,7 +75,7 @@ static void slb_flush_and_rebolt(void) vflags = SLB_VSID_KERNEL | virtual_llp; ksp_esid_data = mk_esid_data(get_paca()->kstack, 2); - if ((ksp_esid_data & ESID_MASK) == KERNELBASE) + if ((ksp_esid_data & ESID_MASK) == PAGE_OFFSET) ksp_esid_data &= ~SLB_ESID_V; /* We need to do this all in asm, so we're sure we don't touch @@ -213,7 +213,7 @@ void slb_initialize(void) asm volatile("isync":::"memory"); asm volatile("slbmte %0,%0"::"r" (0) : "memory"); asm volatile("isync; slbia; isync":::"memory"); - create_slbe(KERNELBASE, lflags, 0); + create_slbe(PAGE_OFFSET, lflags, 0); /* VMALLOC space has 4K pages always for now */ create_slbe(VMALLOCBASE, vflags, 1); Index: kexec/arch/powerpc/kernel/entry_64.S =================================================================== --- kexec.orig/arch/powerpc/kernel/entry_64.S +++ kexec/arch/powerpc/kernel/entry_64.S @@ -690,7 +690,7 @@ _GLOBAL(enter_rtas) /* Setup our real return addr */ SET_REG_TO_LABEL(r4,.rtas_return_loc) - SET_REG_TO_CONST(r9,KERNELBASE) + SET_REG_TO_CONST(r9,PAGE_OFFSET) sub r4,r4,r9 mtlr r4 @@ -718,7 +718,7 @@ _GLOBAL(enter_rtas) _STATIC(rtas_return_loc) /* relocation is off at this point */ mfspr r4,SPRN_SPRG3 /* Get PACA */ - SET_REG_TO_CONST(r5, KERNELBASE) + SET_REG_TO_CONST(r5, PAGE_OFFSET) sub r4,r4,r5 /* RELOC the PACA base pointer */ mfmsr r6 Index: kexec/arch/powerpc/mm/slb_low.S =================================================================== --- kexec.orig/arch/powerpc/mm/slb_low.S +++ kexec/arch/powerpc/mm/slb_low.S @@ -37,9 +37,9 @@ 
_GLOBAL(slb_allocate_realmode) srdi r9,r3,60 /* get region */ srdi r10,r3,28 /* get esid */ - cmpldi cr7,r9,0xc /* cmp KERNELBASE for later use */ + cmpldi cr7,r9,0xc /* cmp PAGE_OFFSET for later use */ - /* r3 = address, r10 = esid, cr7 = <>KERNELBASE */ + /* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */ blt cr7,0f /* user or kernel? */ /* kernel address: proto-VSID = ESID */ @@ -166,7 +166,7 @@ _GLOBAL(slb_allocate_user) /* * Finish loading of an SLB entry and return * - * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9, cr7 = <>KERNELBASE + * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET */ slb_finish_load: ASM_VSID_SCRAMBLE(r10,r9) Index: kexec/arch/powerpc/kernel/lparmap.c =================================================================== --- kexec.orig/arch/powerpc/kernel/lparmap.c +++ kexec/arch/powerpc/kernel/lparmap.c @@ -16,8 +16,8 @@ const struct LparMap __attribute__((__se .xSegmentTableOffs = STAB0_PAGE, .xEsids = { - { .xKernelEsid = GET_ESID(KERNELBASE), - .xKernelVsid = KERNEL_VSID(KERNELBASE), }, + { .xKernelEsid = GET_ESID(PAGE_OFFSET), + .xKernelVsid = KERNEL_VSID(PAGE_OFFSET), }, { .xKernelEsid = GET_ESID(VMALLOCBASE), .xKernelVsid = KERNEL_VSID(VMALLOCBASE), }, }, @@ -25,7 +25,7 @@ const struct LparMap __attribute__((__se .xRanges = { { .xPages = HvPagesToMap, .xOffset = 0, - .xVPN = KERNEL_VSID(KERNELBASE) << (SID_SHIFT - HW_PAGE_SHIFT), + .xVPN = KERNEL_VSID(PAGE_OFFSET) << (SID_SHIFT - HW_PAGE_SHIFT), }, }, }; Index: kexec/include/asm-powerpc/page.h =================================================================== --- kexec.orig/include/asm-powerpc/page.h +++ kexec/include/asm-powerpc/page.h @@ -56,7 +56,7 @@ #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) -#define __va(x) ((void *)((unsigned long)(x) + KERNELBASE)) +#define __va(x) ((void *)((unsigned long)(x) + PAGE_OFFSET)) #define __pa(x) ((unsigned long)(x) - PAGE_OFFSET) /* 
Index: kexec/arch/powerpc/kernel/btext.c =================================================================== --- kexec.orig/arch/powerpc/kernel/btext.c +++ kexec/arch/powerpc/kernel/btext.c @@ -60,7 +60,7 @@ int force_printk_to_btext = 0; * * The display is mapped to virtual address 0xD0000000, rather * than 1:1, because some some CHRP machines put the frame buffer - * in the region starting at 0xC0000000 (KERNELBASE). + * in the region starting at 0xC0000000 (PAGE_OFFSET). * This mapping is temporary and will disappear as soon as the * setup done by MMU_Init() is applied. * @@ -71,7 +71,7 @@ int force_printk_to_btext = 0; */ void __init btext_prepare_BAT(void) { - unsigned long vaddr = KERNELBASE + 0x10000000; + unsigned long vaddr = PAGE_OFFSET + 0x10000000; unsigned long addr; unsigned long lowbits; From prenuka at gmail.com Mon Dec 5 19:35:22 2005 From: prenuka at gmail.com (Renuka Pampana) Date: Mon, 5 Dec 2005 14:05:22 +0530 Subject: Linuxppc64-dev Digest, Vol 16, Issue 11 In-Reply-To: <20051205010004.52D1568876@ozlabs.org> References: <20051205010004.52D1568876@ozlabs.org> Message-ID: <9b23fc710512050035i117c7bd7y75a01f487dc74654@mail.gmail.com> Hi, Where can i get PPC440ep (yosemite) patch for 64 bit kernel. Can you give me some pointers to refer. Thank you in advance Renuka On 12/5/05, linuxppc64-dev-request at ozlabs.org wrote: > Send Linuxppc64-dev mailing list submissions to > linuxppc64-dev at ozlabs.org > > To subscribe or unsubscribe via the World Wide Web, visit > https://ozlabs.org/mailman/listinfo/linuxppc64-dev > or, via email, send a message with subject or body 'help' to > linuxppc64-dev-request at ozlabs.org > > You can reach the person managing the list at > linuxppc64-dev-owner at ozlabs.org > > When replying, please edit your Subject line so it is more specific > than "Re: Contents of Linuxppc64-dev digest..." > > > Today's Topics: > > 1. [PATCH 8/11] powerpc: Add arch dependent basic infrastructure > for Kdump. (Michael Ellerman) > 2. 
[PATCH 9/11] powerpc: Parse crashkernel= parameter in first > kernel (Michael Ellerman) > 3. [PATCH 10/11] powerpc: Add arch-dependant copy_oldmem_page > (Michael Ellerman) > 4. [PATCH 11/11] powerpc: Add support for "linux, usable-memory" > on memory nodes (Michael Ellerman) > > > ---------------------------------------------------------------------- > > Message: 1 > Date: Sun, 04 Dec 2005 18:39:43 > From: Michael Ellerman > Subject: [PATCH 8/11] powerpc: Add arch dependent basic infrastructure > for Kdump. > To: , Paul Mackerras > Message-ID: <20051205003957.4CD9868887 at ozlabs.org> > > Implementing the machine_crash_shutdown which will be called by > crash_kexec (called in case of a panic, sysrq etc.). Disable the > interrupts, shootdown cpus using debugger IPI and collect regs > for all CPUs. > > elfcorehdr= specifies the location of elf core header stored by > the crashed kernel. This command line option will be passed by > the kexec-tools to capture kernel. > > savemaxmem= specifies the actual memory size that the first kernel > has and this value will be used for dumping in the capture kernel. > This command line option will be passed by the kexec-tools to > capture kernel. 
> > Signed-off-by: Haren Myneni > Signed-off-by: Michael Ellerman > --- > > arch/powerpc/kernel/Makefile | 2 > arch/powerpc/kernel/crash.c | 264 ++++++++++++++++++++++++++++++++ > arch/powerpc/kernel/crash_dump.c | 20 ++ > arch/powerpc/kernel/machine_kexec_64.c | 13 + > arch/powerpc/kernel/smp.c | 22 ++ > arch/powerpc/kernel/traps.c | 17 +- > arch/powerpc/platforms/cell/setup.c | 1 > arch/powerpc/platforms/maple/setup.c | 1 > arch/powerpc/platforms/powermac/setup.c | 1 > arch/powerpc/platforms/pseries/setup.c | 1 > arch/powerpc/platforms/pseries/xics.c | 2 > include/asm-powerpc/kexec.h | 10 + > 12 files changed, 345 insertions(+), 9 deletions(-) > > Index: kexec/arch/powerpc/kernel/smp.c > =================================================================== > --- kexec.orig/arch/powerpc/kernel/smp.c > +++ kexec/arch/powerpc/kernel/smp.c > @@ -75,6 +75,8 @@ void smp_call_function_interrupt(void); > > int smt_enabled_at_boot = 1; > > +static void (*crash_ipi_function_ptr)(struct pt_regs *) = NULL; > + > #ifdef CONFIG_MPIC > int __init smp_mpic_probe(void) > { > @@ -123,11 +125,16 @@ void smp_message_recv(int msg, struct pt > /* XXX Do we have to do this? 
*/ > set_need_resched(); > break; > -#ifdef CONFIG_DEBUGGER > case PPC_MSG_DEBUGGER_BREAK: > + if (crash_ipi_function_ptr) { > + crash_ipi_function_ptr(regs); > + break; > + } > +#ifdef CONFIG_DEBUGGER > debugger_ipi(regs); > break; > -#endif > +#endif /* CONFIG_DEBUGGER */ > + /* FALLTHROUGH */ > default: > printk("SMP %d: smp_message_recv(): unknown msg %d\n", > smp_processor_id(), msg); > @@ -147,6 +154,17 @@ void smp_send_debugger_break(int cpu) > } > #endif > > +#ifdef CONFIG_KEXEC > +void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *)) > +{ > + crash_ipi_function_ptr = crash_ipi_callback; > + if (crash_ipi_callback) { > + mb(); > + smp_ops->message_pass(MSG_ALL_BUT_SELF, PPC_MSG_DEBUGGER_BREAK); > + } > +} > +#endif > + > static void stop_this_cpu(void *dummy) > { > local_irq_disable(); > Index: kexec/arch/powerpc/kernel/crash.c > =================================================================== > --- /dev/null > +++ kexec/arch/powerpc/kernel/crash.c > @@ -0,0 +1,264 @@ > +/* > + * Architecture specific (PPC64) functions for kexec based crash dumps. > + * > + * Copyright (C) 2005, IBM Corp. > + * > + * Created by: Haren Myneni > + * > + * This source code is licensed under the GNU General Public License, > + * Version 2. See the file COPYING for more details. > + * > + */ > + > +#undef DEBUG > + > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > + > +#include > +#include > +#include > +#include > +#include > + > +#ifdef DEBUG > +#include > +#define DBG(fmt...) udbg_printf(fmt) > +#else > +#define DBG(fmt...) > +#endif > + > +/* This keeps a track of which one is crashing cpu. 
*/ > +int crashing_cpu = -1; > + > +static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, > + size_t data_len) > +{ > + struct elf_note note; > + > + note.n_namesz = strlen(name) + 1; > + note.n_descsz = data_len; > + note.n_type = type; > + memcpy(buf, &note, sizeof(note)); > + buf += (sizeof(note) +3)/4; > + memcpy(buf, name, note.n_namesz); > + buf += (note.n_namesz + 3)/4; > + memcpy(buf, data, note.n_descsz); > + buf += (note.n_descsz + 3)/4; > + > + return buf; > +} > + > +static void final_note(u32 *buf) > +{ > + struct elf_note note; > + > + note.n_namesz = 0; > + note.n_descsz = 0; > + note.n_type = 0; > + memcpy(buf, &note, sizeof(note)); > +} > + > +static void crash_save_this_cpu(struct pt_regs *regs, int cpu) > +{ > + struct elf_prstatus prstatus; > + u32 *buf; > + > + if ((cpu < 0) || (cpu >= NR_CPUS)) > + return; > + > + /* Using ELF notes here is opportunistic. > + * I need a well defined structure format > + * for the data I pass, and I need tags > + * on the data to indicate what information I have > + * squirrelled away. ELF notes happen to provide > + * all of that that no need to invent something new. > + */ > + buf = &crash_notes[cpu][0]; > + memset(&prstatus, 0, sizeof(prstatus)); > + prstatus.pr_pid = current->pid; > + elf_core_copy_regs(&prstatus.pr_reg, regs); > + buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus, > + sizeof(prstatus)); > + final_note(buf); > +} > + > +/* FIXME Merge this with xmon_save_regs ??
*/ > +static inline void crash_get_current_regs(struct pt_regs *regs) > +{ > + unsigned long tmp1, tmp2; > + > + __asm__ __volatile__ ( > + "std 0,0(%2)\n" > + "std 1,8(%2)\n" > + "std 2,16(%2)\n" > + "std 3,24(%2)\n" > + "std 4,32(%2)\n" > + "std 5,40(%2)\n" > + "std 6,48(%2)\n" > + "std 7,56(%2)\n" > + "std 8,64(%2)\n" > + "std 9,72(%2)\n" > + "std 10,80(%2)\n" > + "std 11,88(%2)\n" > + "std 12,96(%2)\n" > + "std 13,104(%2)\n" > + "std 14,112(%2)\n" > + "std 15,120(%2)\n" > + "std 16,128(%2)\n" > + "std 17,136(%2)\n" > + "std 18,144(%2)\n" > + "std 19,152(%2)\n" > + "std 20,160(%2)\n" > + "std 21,168(%2)\n" > + "std 22,176(%2)\n" > + "std 23,184(%2)\n" > + "std 24,192(%2)\n" > + "std 25,200(%2)\n" > + "std 26,208(%2)\n" > + "std 27,216(%2)\n" > + "std 28,224(%2)\n" > + "std 29,232(%2)\n" > + "std 30,240(%2)\n" > + "std 31,248(%2)\n" > + "mfmsr %0\n" > + "std %0, 264(%2)\n" > + "mfctr %0\n" > + "std %0, 280(%2)\n" > + "mflr %0\n" > + "std %0, 288(%2)\n" > + "bl 1f\n" > + "1: mflr %1\n" > + "std %1, 256(%2)\n" > + "mtlr %0\n" > + "mfxer %0\n" > + "std %0, 296(%2)\n" > + : "=&r" (tmp1), "=&r" (tmp2) > + : "b" (regs)); > +} > + > +/* We may have saved_regs from where the error came from > + * or it is NULL if via a direct panic(). 
> + */ > +static void crash_save_self(struct pt_regs *saved_regs) > +{ > + struct pt_regs regs; > + int cpu; > + > + cpu = smp_processor_id(); > + if (saved_regs) > + memcpy(&regs, saved_regs, sizeof(regs)); > + else > + crash_get_current_regs(&regs); > + crash_save_this_cpu(&regs, cpu); > +} > + > +#ifdef CONFIG_SMP > +static atomic_t waiting_for_crash_ipi; > + > +void crash_ipi_callback(struct pt_regs *regs) > +{ > + int cpu = smp_processor_id(); > + > + if (cpu == crashing_cpu) > + return; > + > + if (!cpu_online(cpu)) > + return; > + > + if (ppc_md.kexec_cpu_down) > + ppc_md.kexec_cpu_down(1, 1); > + > + local_irq_disable(); > + > + crash_save_this_cpu(regs, cpu); > + atomic_dec(&waiting_for_crash_ipi); > + kexec_smp_wait(); > + /* NOTREACHED */ > +} > + > +static void crash_kexec_prepare_cpus(void) > +{ > + unsigned int msecs; > + > + atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); > + > + crash_send_ipi(crash_ipi_callback); > + smp_wmb(); > + > + /* > + * FIXME: Until we will have the way to stop other CPUSs reliabally, > + * the crash CPU will send an IPI and wait for other CPUs to > + * respond. If not, proceed the kexec boot even though we failed to > + * capture other CPU states. > + */ > + msecs = 1000000; > + while ((atomic_read(&waiting_for_crash_ipi) > 0) && (--msecs > 0)) { > + barrier(); > + mdelay(1); > + } > + > + /* Would it be better to replace the trap vector here? */ > + > + /* > + * FIXME: In case if we do not get all CPUs, one possibility: ask the > + * user to do soft reset such that we get all. > + * IPI handler is already set by the panic cpu initially. Therefore, > + * all cpus could invoke this handler from die() and the panic CPU > + * will call machine_kexec() directly from this handler to do > + * kexec boot.
> + */ > + if (atomic_read(&waiting_for_crash_ipi)) > + printk(KERN_ALERT "done waiting: %d cpus not responding\n", > + atomic_read(&waiting_for_crash_ipi)); > + /* Leave the IPI callback set */ > +} > +#else > +static void crash_kexec_prepare_cpus(void) > +{ > + /* > + * move the secondarys to us so that we can copy > + * the new kernel 0-0x100 safely > + * > + * do this if kexec in setup.c ? > + */ > + smp_release_cpus(); > +} > + > +#endif > + > +void default_machine_crash_shutdown(struct pt_regs *regs) > +{ > + /* > + * This function is only called after the system > + * has paniced or is otherwise in a critical state. > + * The minimum amount of code to allow a kexec'd kernel > + * to run successfully needs to happen here. > + * > + * In practice this means stopping other cpus in > + * an SMP system. > + * The kernel is broken so disable interrupts. > + */ > + local_irq_disable(); > + > + if (ppc_md.kexec_cpu_down) > + ppc_md.kexec_cpu_down(1, 0); > + > + /* > + * Make a note of crashing cpu. Will be used in machine_kexec > + * such that another IPI will not be sent. 
> + */ > + crashing_cpu = smp_processor_id(); > + crash_kexec_prepare_cpus(); > + crash_save_self(regs); > +} > Index: kexec/arch/powerpc/kernel/traps.c > =================================================================== > --- kexec.orig/arch/powerpc/kernel/traps.c > +++ kexec/arch/powerpc/kernel/traps.c > @@ -31,6 +31,7 @@ > #include > #include > #include > +#include > > #include > #include > @@ -95,7 +96,7 @@ static DEFINE_SPINLOCK(die_lock); > > int die(const char *str, struct pt_regs *regs, long err) > { > - static int die_counter; > + static int die_counter, crash_dump_start = 0; > int nl = 0; > > if (debugger(regs)) > @@ -156,7 +157,21 @@ int die(const char *str, struct pt_regs > print_modules(); > show_regs(regs); > bust_spinlocks(0); > + > + if (!crash_dump_start && kexec_should_crash(current)) { > + crash_dump_start = 1; > + spin_unlock_irq(&die_lock); > + crash_kexec(regs); > + /* NOTREACHED */ > + } > spin_unlock_irq(&die_lock); > + if (crash_dump_start) > + /* > + * Only for soft-reset: Other CPUs will be responded to an IPI > + * sent by first kexec CPU. > + */ > + for(;;) > + ; > > if (in_interrupt()) > panic("Fatal exception in interrupt"); > Index: kexec/arch/powerpc/kernel/machine_kexec_64.c > =================================================================== > --- kexec.orig/arch/powerpc/kernel/machine_kexec_64.c > +++ kexec/arch/powerpc/kernel/machine_kexec_64.c > @@ -265,11 +265,18 @@ extern NORET_TYPE void kexec_sequence(vo > /* too late to fail here */ > void default_machine_kexec(struct kimage *image) > { > - > /* prepare control code if any */ > > - /* shutdown other cpus into our wait loop and quiesce interrupts */ > - kexec_prepare_cpus(); > + /* > + * If the kexec boot is the normal one, need to shutdown other cpus > + * into our wait loop and quiesce interrupts. > + * Otherwise, in the case of crashed mode (crashing_cpu >= 0), > + * stopping other CPUs and collecting their pt_regs is done before > + * using debugger IPI. 
> + */ > + > + if (crashing_cpu == -1) > + kexec_prepare_cpus(); > > /* switch to a staticly allocated stack. Based on irq stack code. > * XXX: the task struct will likely be invalid once we do the copy! > Index: kexec/include/asm-powerpc/kexec.h > =================================================================== > --- kexec.orig/include/asm-powerpc/kexec.h > +++ kexec/include/asm-powerpc/kexec.h > @@ -32,6 +32,8 @@ > > #ifndef __ASSEMBLY__ > > +#ifdef CONFIG_KEXEC > + > #define MAX_NOTE_BYTES 1024 > typedef u32 note_buf_t[MAX_NOTE_BYTES / sizeof(u32)]; > > @@ -41,11 +43,17 @@ extern note_buf_t crash_notes[]; > extern void kexec_smp_wait(void); /* get and clear naca physid, wait for > master to copy new code to 0 */ > extern void __init kexec_setup(void); > -#endif > +extern int crashing_cpu; > +extern void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *)); > +#endif /* __powerpc64 __ */ > > struct kimage; > +struct pt_regs; > extern void default_machine_kexec(struct kimage *image); > extern int default_machine_kexec_prepare(struct kimage *image); > +extern void default_machine_crash_shutdown(struct pt_regs *regs); > + > +#endif /* !CONFIG_KEXEC */ > > #endif /* ! 
__ASSEMBLY__ */ > #endif /* _ASM_POWERPC_KEXEC_H */ > Index: kexec/arch/powerpc/platforms/pseries/xics.c > =================================================================== > --- kexec.orig/arch/powerpc/platforms/pseries/xics.c > +++ kexec/arch/powerpc/platforms/pseries/xics.c > @@ -417,7 +417,7 @@ irqreturn_t xics_ipi_action(int irq, voi > smp_message_recv(PPC_MSG_MIGRATE_TASK, regs); > } > #endif > -#ifdef CONFIG_DEBUGGER > +#if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC) > if (test_and_clear_bit(PPC_MSG_DEBUGGER_BREAK, > &xics_ipi_message[cpu].value)) { > mb(); > Index: kexec/arch/powerpc/platforms/cell/setup.c > =================================================================== > --- kexec.orig/arch/powerpc/platforms/cell/setup.c > +++ kexec/arch/powerpc/platforms/cell/setup.c > @@ -217,5 +217,6 @@ struct machdep_calls __initdata cell_md > #ifdef CONFIG_KEXEC > .machine_kexec = default_machine_kexec, > .machine_kexec_prepare = default_machine_kexec_prepare, > + .machine_crash_shutdown = default_machine_crash_shutdown, > #endif > }; > Index: kexec/arch/powerpc/platforms/maple/setup.c > =================================================================== > --- kexec.orig/arch/powerpc/platforms/maple/setup.c > +++ kexec/arch/powerpc/platforms/maple/setup.c > @@ -282,5 +282,6 @@ struct machdep_calls __initdata maple_md > #ifdef CONFIG_KEXEC > .machine_kexec = default_machine_kexec, > .machine_kexec_prepare = default_machine_kexec_prepare, > + .machine_crash_shutdown = default_machine_crash_shutdown, > #endif > }; > Index: kexec/arch/powerpc/platforms/powermac/setup.c > =================================================================== > --- kexec.orig/arch/powerpc/platforms/powermac/setup.c > +++ kexec/arch/powerpc/platforms/powermac/setup.c > @@ -771,6 +771,7 @@ struct machdep_calls __initdata pmac_md > #ifdef CONFIG_KEXEC > .machine_kexec = default_machine_kexec, > .machine_kexec_prepare = default_machine_kexec_prepare, > + .machine_crash_shutdown = 
default_machine_crash_shutdown, > #endif > #endif /* CONFIG_PPC64 */ > #ifdef CONFIG_PPC32 > Index: kexec/arch/powerpc/platforms/pseries/setup.c > =================================================================== > --- kexec.orig/arch/powerpc/platforms/pseries/setup.c > +++ kexec/arch/powerpc/platforms/pseries/setup.c > @@ -629,5 +629,6 @@ struct machdep_calls __initdata pSeries_ > .kexec_cpu_down = pseries_kexec_cpu_down, > .machine_kexec = default_machine_kexec, > .machine_kexec_prepare = default_machine_kexec_prepare, > + .machine_crash_shutdown = default_machine_crash_shutdown, > #endif > }; > Index: kexec/arch/powerpc/kernel/crash_dump.c > =================================================================== > --- kexec.orig/arch/powerpc/kernel/crash_dump.c > +++ kexec/arch/powerpc/kernel/crash_dump.c > @@ -11,6 +11,8 @@ > > #undef DEBUG > > +#include > +#include > #include > #include > #include > @@ -51,3 +53,21 @@ void __init kdump_setup(void) > > DBG(" <- kdump_setup()\n"); > } > + > +static int __init parse_elfcorehdr(char *p) > +{ > + if (p) > + elfcorehdr_addr = memparse(p, &p); > + > + return 0; > +} > +__setup("elfcorehdr=", parse_elfcorehdr); > + > +static int __init parse_savemaxmem(char *p) > +{ > + if (p) > + saved_max_pfn = (memparse(p, &p) >> PAGE_SHIFT) - 1; > + > + return 0; > +} > +__setup("savemaxmem=", parse_savemaxmem); > Index: kexec/arch/powerpc/kernel/Makefile > =================================================================== > --- kexec.orig/arch/powerpc/kernel/Makefile > +++ kexec/arch/powerpc/kernel/Makefile > @@ -66,7 +66,7 @@ pci64-$(CONFIG_PPC64) += pci_64.o pci_d > obj-$(CONFIG_PCI) += $(pci64-y) > kexec-$(CONFIG_PPC64) := machine_kexec_64.o > kexec-$(CONFIG_PPC32) := machine_kexec_32.o > -obj-$(CONFIG_KEXEC) += machine_kexec.o $(kexec-y) > +obj-$(CONFIG_KEXEC) += machine_kexec.o crash.o $(kexec-y) > > ifeq ($(CONFIG_PPC_ISERIES),y) > $(obj)/head_64.o: $(obj)/lparmap.s > > > ------------------------------ > > Message: 2 > Date: 
Sun, 04 Dec 2005 18:39:48 > From: Michael Ellerman > Subject: [PATCH 9/11] powerpc: Parse crashkernel= parameter in first > kernel > To: , Paul Mackerras > Message-ID: <20051205004002.7A01B68889 at ozlabs.org> > > This patch adds code to parse and setup the crash kernel resource in the > first kernel. PPC64 ignores the @x part, we always run at 32 MB. > > Signed-off-by: Haren Myneni > Signed-off-by: Michael Ellerman > --- > > arch/powerpc/kernel/prom.c | 11 ++++++++ > arch/powerpc/kernel/prom_init.c | 53 ++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 64 insertions(+) > > Index: kexec/arch/powerpc/kernel/prom_init.c > =================================================================== > --- kexec.orig/arch/powerpc/kernel/prom_init.c > +++ kexec/arch/powerpc/kernel/prom_init.c > @@ -192,6 +192,11 @@ static unsigned long __initdata alloc_bo > static unsigned long __initdata rmo_top; > static unsigned long __initdata ram_top; > > +#ifdef CONFIG_KEXEC > +static unsigned long __initdata prom_crashk_base; > +static unsigned long __initdata prom_crashk_size; > +#endif > + > static struct mem_map_entry __initdata mem_reserve_map[MEM_RESERVE_MAP_SIZE]; > static int __initdata mem_reserve_cnt; > > @@ -590,6 +595,34 @@ static void __init early_cmdline_parse(v > RELOC(prom_memory_limit) = ALIGN(RELOC(prom_memory_limit), 0x1000000); > #endif > } > + > +#ifdef CONFIG_KEXEC > + /* > + * crashkernel=size at addr specifies the location to reserve for > + * crash kernel. > + */ > + opt = strstr(RELOC(prom_cmd_line), RELOC("crashkernel=")); > + if (opt) { > + opt += 12; > + RELOC(prom_crashk_size) = prom_memparse(opt, &opt); > + > + if (ALIGN(RELOC(prom_crashk_size), 0x1000000) != > + RELOC(prom_crashk_size)) { > + prom_printf("Warning: crashkernel size is not " > + "aligned to 16MB\n"); > + } > + > + /* > + * At present, the crash kernel always run at 32MB. > + * Just ignore whatever user passed. 
> + */ > + RELOC(prom_crashk_base) = 0x2000000; > + if (*opt == '@') { > + prom_printf("Warning: PPC64 kdump kernel always runs " > + "at 32 MB\n"); > + } > + } > +#endif > } > > #ifdef CONFIG_PPC_PSERIES > @@ -1011,6 +1044,12 @@ static void __init prom_init_mem(void) > prom_printf(" alloc_top_hi : %x\n", RELOC(alloc_top_high)); > prom_printf(" rmo_top : %x\n", RELOC(rmo_top)); > prom_printf(" ram_top : %x\n", RELOC(ram_top)); > +#ifdef CONFIG_KEXEC > + if (RELOC(prom_crashk_base)) { > + prom_printf(" crashk_base : %x\n", RELOC(prom_crashk_base)); > + prom_printf(" crashk_size : %x\n", RELOC(prom_crashk_size)); > + } > +#endif > } > > > @@ -2094,6 +2133,10 @@ unsigned long __init prom_init(unsigned > */ > prom_init_mem(); > > +#ifdef CONFIG_KEXEC > + if (RELOC(prom_crashk_base)) > + reserve_mem(RELOC(prom_crashk_base), RELOC(prom_crashk_size)); > +#endif > /* > * Determine which cpu is actually running right _now_ > */ > @@ -2150,6 +2193,16 @@ unsigned long __init prom_init(unsigned > } > #endif > > +#ifdef CONFIG_KEXEC > + if (RELOC(prom_crashk_base)) { > + prom_setprop(_prom->chosen, "/chosen", "linux,crashkernel-base", > + PTRRELOC(&prom_crashk_base), > + sizeof(RELOC(prom_crashk_base))); > + prom_setprop(_prom->chosen, "/chosen", "linux,crashkernel-size", > + PTRRELOC(&prom_crashk_size), > + sizeof(RELOC(prom_crashk_size))); > + } > +#endif > /* > * Fixup any known bugs in the device-tree > */ > Index: kexec/arch/powerpc/kernel/prom.c > =================================================================== > --- kexec.orig/arch/powerpc/kernel/prom.c > +++ kexec/arch/powerpc/kernel/prom.c > @@ -29,6 +29,7 @@ > #include > #include > #include > +#include > > #include > #include > @@ -1198,6 +1199,16 @@ static int __init early_init_dt_scan_cho > } > #endif /* CONFIG_PPC_RTAS */ > > +#ifdef CONFIG_KEXEC > + lprop = (u64*)of_get_flat_dt_prop(node, "linux,crashkernel-base", NULL); > + if (lprop) > + crashk_res.start = *lprop; > + > + lprop = 
(u64*)of_get_flat_dt_prop(node, "linux,crashkernel-size", NULL); > + if (lprop) > + crashk_res.end = crashk_res.start + *lprop - 1; > +#endif > + > /* break now */ > return 1; > } > > > ------------------------------ > > Message: 3 > Date: Sun, 04 Dec 2005 18:39:51 > From: Michael Ellerman > Subject: [PATCH 10/11] powerpc: Add arch-dependant copy_oldmem_page > To: , Paul Mackerras > Message-ID: <20051205004006.1FEB26887B at ozlabs.org> > > Add arch-dependant copy_oldmem_page. > > Signed-off-by: Haren Myneni > Signed-off-by: Michael Ellerman > --- > > arch/powerpc/kernel/crash_dump.c | 36 ++++++++++++++++++++++++++++++++++++ > include/asm-powerpc/kexec.h | 2 ++ > kernel/crash_dump.c | 3 +++ > 3 files changed, 41 insertions(+) > > Index: kexec/arch/powerpc/kernel/crash_dump.c > =================================================================== > --- kexec.orig/arch/powerpc/kernel/crash_dump.c > +++ kexec/arch/powerpc/kernel/crash_dump.c > @@ -16,6 +16,7 @@ > #include > #include > #include > +#include > > #ifdef DEBUG > #include > @@ -71,3 +72,38 @@ static int __init parse_savemaxmem(char > return 0; > } > __setup("savemaxmem=", parse_savemaxmem); > + > +/* > + * copy_oldmem_page - copy one page from "oldmem" > + * @pfn: page frame number to be copied > + * @buf: target memory address for the copy; this can be in kernel address > + * space or user address space (see @userbuf) > + * @csize: number of bytes to copy > + * @offset: offset in bytes into the page (based on pfn) to begin the copy > + * @userbuf: if set, @buf is in user address space, use copy_to_user(), > + * otherwise @buf is in kernel address space, use memcpy(). > + * > + * Copy a page from "oldmem". For this page, there is no pte mapped > + * in the current kernel. We stitch up a pte, similar to kmap_atomic. 
> + */ > +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, > + size_t csize, unsigned long offset, int userbuf) > +{ > + void *vaddr; > + > + if (!csize) > + return 0; > + > + vaddr = __ioremap(pfn << PAGE_SHIFT, PAGE_SIZE, 0); > + > + if (userbuf) { > + if (copy_to_user((char __user *)buf, (vaddr + offset), csize)) { > + iounmap(vaddr); > + return -EFAULT; > + } > + } else > + memcpy(buf, (vaddr + offset), csize); > + > + iounmap(vaddr); > + return csize; > +} > Index: kexec/include/asm-powerpc/kexec.h > =================================================================== > --- kexec.orig/include/asm-powerpc/kexec.h > +++ kexec/include/asm-powerpc/kexec.h > @@ -30,6 +30,8 @@ > #define KEXEC_ARCH KEXEC_ARCH_PPC > #endif > > +#define HAVE_ARCH_COPY_OLDMEM_PAGE > + > #ifndef __ASSEMBLY__ > > #ifdef CONFIG_KEXEC > Index: kexec/kernel/crash_dump.c > =================================================================== > --- kexec.orig/kernel/crash_dump.c > +++ kexec/kernel/crash_dump.c > @@ -14,10 +14,12 @@ > > #include > #include > +#include > > /* Stores the physical address of elf header of crash image. */ > unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; > > +#ifndef HAVE_ARCH_COPY_OLDMEM_PAGE > /** > * copy_oldmem_page - copy one page from "oldmem" > * @pfn: page frame number to be copied > @@ -59,3 +61,4 @@ ssize_t copy_oldmem_page(unsigned long p > kfree(page); > return csize; > } > +#endif > > > ------------------------------ > > Message: 4 > Date: Sun, 04 Dec 2005 18:39:55 > From: Michael Ellerman > Subject: [PATCH 11/11] powerpc: Add support for "linux, usable-memory" > on memory nodes > To: , Paul Mackerras > Message-ID: <20051205004009.D5BCE68865 at ozlabs.org> > > Milton has proposed that we should support a "linux,usable-memory" property > on memory nodes which describes, in preference to "reg", the regions of memory > Linux should use. > > This facility is required for kdump, to inform the second kernel which memory > it should use. 
> > Signed-off-by: Haren Myneni > Signed-off-by: Michael Ellerman > --- > > arch/powerpc/kernel/prom.c | 9 +++++++-- > arch/powerpc/mm/numa.c | 7 ++++++- > 2 files changed, 13 insertions(+), 3 deletions(-) > > Index: kexec/arch/powerpc/kernel/prom.c > =================================================================== > --- kexec.orig/arch/powerpc/kernel/prom.c > +++ kexec/arch/powerpc/kernel/prom.c > @@ -567,7 +567,10 @@ static int __init interpret_root_props(s > unsigned int *rp; > int rpsize = (naddrc + nsizec) * sizeof(unsigned int); > > - rp = (unsigned int *) get_property(np, "reg", &l); > + rp = (unsigned int *) get_property(np, "linux,usable-memory", &l); > + if (rp == NULL) > + rp = (unsigned int *) get_property(np, "reg", &l); > + > if (rp != 0 && l >= rpsize) { > i = 0; > adr = (struct address_range *) (*mem_start); > @@ -1275,7 +1278,9 @@ static int __init early_init_dt_scan_mem > } else if (strcmp(type, "memory") != 0) > return 0; > > - reg = (cell_t *)of_get_flat_dt_prop(node, "reg", &l); > + reg = (cell_t *)of_get_flat_dt_prop(node, "linux,usable-memory", &l); > + if (reg == NULL) > + reg = (cell_t *)of_get_flat_dt_prop(node, "reg", &l); > if (reg == NULL) > return 0; > > Index: kexec/arch/powerpc/mm/numa.c > =================================================================== > --- kexec.orig/arch/powerpc/mm/numa.c > +++ kexec/arch/powerpc/mm/numa.c > @@ -423,7 +423,12 @@ static int __init parse_numa_properties( > unsigned int *memcell_buf; > unsigned int len; > > - memcell_buf = (unsigned int *)get_property(memory, "reg", &len); > + memcell_buf = (unsigned int *)get_property(memory, > + "linux,usable-memory", &len); > + if (!memcell_buf || len <= 0) > + memcell_buf = > + (unsigned int *)get_property(memory, "reg", > + &len); > if (!memcell_buf || len <= 0) > continue; > > > > ------------------------------ > > _______________________________________________ > Linuxppc64-dev mailing list > Linuxppc64-dev at ozlabs.org > 
https://ozlabs.org/mailman/listinfo/linuxppc64-dev > > > End of Linuxppc64-dev Digest, Vol 16, Issue 11 > ********************************************** > From ericvanhensbergen at us.ibm.com Tue Dec 6 01:45:24 2005 From: ericvanhensbergen at us.ibm.com (Eric V Van hensbergen) Date: Mon, 5 Dec 2005 08:45:24 -0600 Subject: [RFC PATCH 5/5] CELL bogus_console port to hvc_console backend driver In-Reply-To: Message-ID: rsa at us.ltcfwd.linux.ibm.com wrote on 12/04/2005 03:13:12 PM: > This patch adds the hvc_fss.c driver file. > > Signed-off-by: Ryan S. Arnold > diff -uNr linux-2.6.14-rc5/drivers/char/hvc_fss.c linux-2.6.14-rc5- > cbe-fss/drivers/char/hvc_fss.c > --- linux-2.6.14-rc5/drivers/char/hvc_fss.c 1969-12-31 19:00:00. > 000000000 -0500 > +++ linux-2.6.14-rc5-cbe-fss/drivers/char/hvc_fss.c 2005-12-02 17: > 54:19.243249984 -0500 > @@ -0,0 +1,148 @@ ... > + > +static inline int callthru0(int command) > +{ > + register int c asm ("r3") = command; > + > + asm volatile (".long 0x000EAEB0" : "=r" (c): "r" (c)); > + return((c)); > +} > + > +static inline int callthru3(int command, unsigned long arg1, > unsigned long arg2, unsigned long arg3) > +{ > + register int c asm ("r3") = command; > + register unsigned long a1 asm ("r4") = arg1; > + register unsigned long a2 asm ("r5") = arg2; > + register unsigned long a3 asm ("r6") = arg3; > + > + asm volatile (".long 0x000EAEB0" : "=r" (c): "r" (c), "r" (a1), > "r" (a2), "r" (a3)); > + return((c)); > +} > + It's a relatively small nit-pick, but the callthru functions should probably be kept in a common include. My patch-set has include/asm-powerpc/systemsim.h which includes these definitions. That way we don't have to define the callthru's for every driver which might use them (such as BogusNet or BogusDisk).
-eric From hollis at penguinppc.org Tue Dec 6 02:35:10 2005 From: hollis at penguinppc.org (Hollis Blanchard) Date: Mon, 5 Dec 2005 09:35:10 -0600 Subject: [PATCH] powerpc: Separate usage of KERNELBASE and PAGE_OFFSET In-Reply-To: <20051205050717.ED89768863@ozlabs.org> References: <20051205050717.ED89768863@ozlabs.org> Message-ID: <5d89217ec251646b34138f147e73cad6@penguinppc.org> On Dec 4, 2005, at 5:07 PM, Michael Ellerman wrote: > This patch separates usage of KERNELBASE and PAGE_OFFSET. I haven't > looked at > any of the PPC code, if we ever want to support Kdump on PPC we'll > have to do > another audit, ditto for iSeries. (I guess you're trying to say you haven't tested 32-bit support, but saying "PPC" here is rather confusing...) > To get a physical address from a virtual one you subtract PAGE_OFFSET, > _not_ > KERNELBASE. > > KERNELBASE is the virtual address of the start of the kernel, it's > often the > same as PAGE_OFFSET, but _might not be_. > > If you want to know something's offset from the start of the kernel > you should > subtract KERNELBASE. Could you please add these helpful comments to page.h? You might also mention kdump as an example, to help people understand this subtle distinction. -Hollis From michael at ellerman.id.au Tue Dec 6 03:10:43 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Mon, 5 Dec 2005 10:10:43 -0600 Subject: [PATCH] powerpc: Separate usage of KERNELBASE and PAGE_OFFSET In-Reply-To: <5d89217ec251646b34138f147e73cad6@penguinppc.org> References: <20051205050717.ED89768863@ozlabs.org> <5d89217ec251646b34138f147e73cad6@penguinppc.org> Message-ID: <200512051010.50584.michael@ellerman.id.au> On Mon, 5 Dec 2005 09:35, Hollis Blanchard wrote: > On Dec 4, 2005, at 5:07 PM, Michael Ellerman wrote: > > This patch separates usage of KERNELBASE and PAGE_OFFSET. I haven't > > looked at > > any of the PPC code, if we ever want to support Kdump on PPC we'll > > have to do > > another audit, ditto for iSeries.
> > (I guess you're trying to say you haven't tested 32-bit support, but > saying "PPC" here is rather confusing...) You're right, that's not very clear. What I meant is I haven't audited any of the code in arch/ppc, or any of the 32-bit PPC code in arch/powerpc for usage of KERNELBASE vs PAGE_OFFSET. If we want kdump to work on 32-bit powerpc we'll need to audit that code first. > > To get a physical address from a virtual one you subtract PAGE_OFFSET, > > _not_ > > KERNELBASE. > > > > KERNELBASE is the virtual address of the start of the kernel, it's > > often the > > same as PAGE_OFFSET, but _might not be_. > > > > If you want to know something's offset from the start of the kernel > > you should > > subtract KERNELBASE. > > Could you please add these helpful comments to page.h? You might also > mention kdump as an example, to help people understand this subtle > distinction. Not a bad idea. It doesn't look like paulus has merged them yet (due to my speeling mistakes ;), so I'll just update this patch. cheers -- Michael Ellerman IBM OzLabs email: michael:ellerman.id.au inmsg: mpe:jabber.org wwweb: http://michael.ellerman.id.au phone: +61 2 6212 1183 (tie line 70 21183) We do not inherit the earth from our ancestors, we borrow it from our children. - S.M.A.R.T Person -------------- next part -------------- A non-text attachment was scrubbed... Name: not available Type: application/pgp-signature Size: 189 bytes Desc: not available Url : http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051205/94126c26/attachment.pgp From michael at ellerman.id.au Tue Dec 6 03:24:33 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Mon, 05 Dec 2005 10:24:33 -0600 Subject: [PATCH] powerpc: Separate usage of KERNELBASE and PAGE_OFFSET In-Reply-To: <5d89217ec251646b34138f147e73cad6@penguinppc.org> Message-ID: <20051205162449.0DBE26884E@ozlabs.org> This patch separates usage of KERNELBASE and PAGE_OFFSET.
I haven't looked at any of the PPC code, if we ever want to support Kdump on PPC we'll have to do another audit, ditto for iSeries. This patch makes PAGE_OFFSET the constant, it'll always be 0xC * 1 gazillion. To get a physical address from a virtual one you subtract PAGE_OFFSET, _not_ KERNELBASE. KERNELBASE is the virtual address of the start of the kernel, it's often the same as PAGE_OFFSET, but _might not be_. If you want to know something's offset from the start of the kernel you should subtract KERNELBASE. Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/btext.c | 4 ++-- arch/powerpc/kernel/entry_64.S | 4 ++-- arch/powerpc/kernel/lparmap.c | 6 +++--- arch/powerpc/kernel/machine_kexec_64.c | 5 ++--- arch/powerpc/mm/hash_utils_64.c | 6 +++--- arch/powerpc/mm/slb.c | 4 ++-- arch/powerpc/mm/slb_low.S | 6 +++--- arch/powerpc/mm/stab.c | 10 +++++----- include/asm-powerpc/page.h | 16 +++++++++++++++- 9 files changed, 37 insertions(+), 24 deletions(-) Index: kexec/arch/powerpc/mm/stab.c =================================================================== --- kexec.orig/arch/powerpc/mm/stab.c +++ kexec/arch/powerpc/mm/stab.c @@ -40,7 +40,7 @@ static int make_ste(unsigned long stab, unsigned long entry, group, old_esid, castout_entry, i; unsigned int global_entry; struct stab_entry *ste, *castout_ste; - unsigned long kernel_segment = (esid << SID_SHIFT) >= KERNELBASE; + unsigned long kernel_segment = (esid << SID_SHIFT) >= PAGE_OFFSET; vsid_data = vsid << STE_VSID_SHIFT; esid_data = esid << SID_SHIFT | STE_ESID_KP | STE_ESID_V; @@ -83,7 +83,7 @@ static int make_ste(unsigned long stab, } /* Dont cast out the first kernel segment */ - if ((castout_ste->esid_data & ESID_MASK) != KERNELBASE) + if ((castout_ste->esid_data & ESID_MASK) != PAGE_OFFSET) break; castout_entry = (castout_entry + 1) & 0xf; @@ -251,7 +251,7 @@ void stabs_alloc(void) panic("Unable to allocate segment table for CPU %d.\n", cpu); - newstab += KERNELBASE; + newstab = (unsigned long)__va(newstab); 
memset((void *)newstab, 0, HW_PAGE_SIZE); @@ -270,11 +270,11 @@ void stabs_alloc(void) */ void stab_initialize(unsigned long stab) { - unsigned long vsid = get_kernel_vsid(KERNELBASE); + unsigned long vsid = get_kernel_vsid(PAGE_OFFSET); unsigned long stabreal; asm volatile("isync; slbia; isync":::"memory"); - make_ste(stab, GET_ESID(KERNELBASE), vsid); + make_ste(stab, GET_ESID(PAGE_OFFSET), vsid); /* Order update */ asm volatile("sync":::"memory"); Index: kexec/arch/powerpc/kernel/machine_kexec_64.c =================================================================== --- kexec.orig/arch/powerpc/kernel/machine_kexec_64.c +++ kexec/arch/powerpc/kernel/machine_kexec_64.c @@ -153,9 +153,8 @@ void kexec_copy_flush(struct kimage *ima * including ones that were in place on the original copy */ for (i = 0; i < nr_segments; i++) - flush_icache_range(ranges[i].mem + KERNELBASE, - ranges[i].mem + KERNELBASE + - ranges[i].memsz); + flush_icache_range((unsigned long)__va(ranges[i].mem), + (unsigned long)__va(ranges[i].mem + ranges[i].memsz)); } #ifdef CONFIG_SMP Index: kexec/arch/powerpc/mm/hash_utils_64.c =================================================================== --- kexec.orig/arch/powerpc/mm/hash_utils_64.c +++ kexec/arch/powerpc/mm/hash_utils_64.c @@ -456,7 +456,7 @@ void __init htab_initialize(void) /* create bolted the linear mapping in the hash table */ for (i=0; i < lmb.memory.cnt; i++) { - base = lmb.memory.region[i].base + KERNELBASE; + base = (unsigned long)__va(lmb.memory.region[i].base); size = lmb.memory.region[i].size; DBG("creating mapping for region: %lx : %lx\n", base, size); @@ -498,8 +498,8 @@ void __init htab_initialize(void) * for either 4K or 16MB pages. 
*/ if (tce_alloc_start) { - tce_alloc_start += KERNELBASE; - tce_alloc_end += KERNELBASE; + tce_alloc_start = (unsigned long)__va(tce_alloc_start); + tce_alloc_end = (unsigned long)__va(tce_alloc_end); if (base + size >= tce_alloc_start) tce_alloc_start = base + size + 1; Index: kexec/arch/powerpc/mm/slb.c =================================================================== --- kexec.orig/arch/powerpc/mm/slb.c +++ kexec/arch/powerpc/mm/slb.c @@ -75,7 +75,7 @@ static void slb_flush_and_rebolt(void) vflags = SLB_VSID_KERNEL | virtual_llp; ksp_esid_data = mk_esid_data(get_paca()->kstack, 2); - if ((ksp_esid_data & ESID_MASK) == KERNELBASE) + if ((ksp_esid_data & ESID_MASK) == PAGE_OFFSET) ksp_esid_data &= ~SLB_ESID_V; /* We need to do this all in asm, so we're sure we don't touch @@ -213,7 +213,7 @@ void slb_initialize(void) asm volatile("isync":::"memory"); asm volatile("slbmte %0,%0"::"r" (0) : "memory"); asm volatile("isync; slbia; isync":::"memory"); - create_slbe(KERNELBASE, lflags, 0); + create_slbe(PAGE_OFFSET, lflags, 0); /* VMALLOC space has 4K pages always for now */ create_slbe(VMALLOCBASE, vflags, 1); Index: kexec/arch/powerpc/kernel/entry_64.S =================================================================== --- kexec.orig/arch/powerpc/kernel/entry_64.S +++ kexec/arch/powerpc/kernel/entry_64.S @@ -690,7 +690,7 @@ _GLOBAL(enter_rtas) /* Setup our real return addr */ SET_REG_TO_LABEL(r4,.rtas_return_loc) - SET_REG_TO_CONST(r9,KERNELBASE) + SET_REG_TO_CONST(r9,PAGE_OFFSET) sub r4,r4,r9 mtlr r4 @@ -718,7 +718,7 @@ _GLOBAL(enter_rtas) _STATIC(rtas_return_loc) /* relocation is off at this point */ mfspr r4,SPRN_SPRG3 /* Get PACA */ - SET_REG_TO_CONST(r5, KERNELBASE) + SET_REG_TO_CONST(r5, PAGE_OFFSET) sub r4,r4,r5 /* RELOC the PACA base pointer */ mfmsr r6 Index: kexec/arch/powerpc/mm/slb_low.S =================================================================== --- kexec.orig/arch/powerpc/mm/slb_low.S +++ kexec/arch/powerpc/mm/slb_low.S @@ -37,9 +37,9 @@ 
_GLOBAL(slb_allocate_realmode) srdi r9,r3,60 /* get region */ srdi r10,r3,28 /* get esid */ - cmpldi cr7,r9,0xc /* cmp KERNELBASE for later use */ + cmpldi cr7,r9,0xc /* cmp PAGE_OFFSET for later use */ - /* r3 = address, r10 = esid, cr7 = <>KERNELBASE */ + /* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */ blt cr7,0f /* user or kernel? */ /* kernel address: proto-VSID = ESID */ @@ -166,7 +166,7 @@ _GLOBAL(slb_allocate_user) /* * Finish loading of an SLB entry and return * - * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9, cr7 = <>KERNELBASE + * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET */ slb_finish_load: ASM_VSID_SCRAMBLE(r10,r9) Index: kexec/arch/powerpc/kernel/lparmap.c =================================================================== --- kexec.orig/arch/powerpc/kernel/lparmap.c +++ kexec/arch/powerpc/kernel/lparmap.c @@ -16,8 +16,8 @@ const struct LparMap __attribute__((__se .xSegmentTableOffs = STAB0_PAGE, .xEsids = { - { .xKernelEsid = GET_ESID(KERNELBASE), - .xKernelVsid = KERNEL_VSID(KERNELBASE), }, + { .xKernelEsid = GET_ESID(PAGE_OFFSET), + .xKernelVsid = KERNEL_VSID(PAGE_OFFSET), }, { .xKernelEsid = GET_ESID(VMALLOCBASE), .xKernelVsid = KERNEL_VSID(VMALLOCBASE), }, }, @@ -25,7 +25,7 @@ const struct LparMap __attribute__((__se .xRanges = { { .xPages = HvPagesToMap, .xOffset = 0, - .xVPN = KERNEL_VSID(KERNELBASE) << (SID_SHIFT - HW_PAGE_SHIFT), + .xVPN = KERNEL_VSID(PAGE_OFFSET) << (SID_SHIFT - HW_PAGE_SHIFT), }, }, }; Index: kexec/include/asm-powerpc/page.h =================================================================== --- kexec.orig/include/asm-powerpc/page.h +++ kexec/include/asm-powerpc/page.h @@ -37,6 +37,20 @@ */ #define PAGE_MASK (~((1 << PAGE_SHIFT) - 1)) +/* + * KERNELBASE is the virtual address of the start of the kernel, it's often + * the same as PAGE_OFFSET, but _might not be_. + * + * The kdump dump kernel is one example where KERNELBASE != PAGE_OFFSET. 
+ * + * To get a physical address from a virtual one you subtract PAGE_OFFSET, + * _not_ KERNELBASE. + * + * If you want to know something's offset from the start of the kernel you + * should subtract KERNELBASE. + * + * If you want to test if something's a kernel address, use is_kernel_addr(). + */ #define PAGE_OFFSET ASM_CONST(CONFIG_KERNEL_START) #define KERNELBASE PAGE_OFFSET @@ -56,7 +70,7 @@ #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) -#define __va(x) ((void *)((unsigned long)(x) + KERNELBASE)) +#define __va(x) ((void *)((unsigned long)(x) + PAGE_OFFSET)) #define __pa(x) ((unsigned long)(x) - PAGE_OFFSET) /* Index: kexec/arch/powerpc/kernel/btext.c =================================================================== --- kexec.orig/arch/powerpc/kernel/btext.c +++ kexec/arch/powerpc/kernel/btext.c @@ -60,7 +60,7 @@ int force_printk_to_btext = 0; * * The display is mapped to virtual address 0xD0000000, rather * than 1:1, because some some CHRP machines put the frame buffer - * in the region starting at 0xC0000000 (KERNELBASE). + * in the region starting at 0xC0000000 (PAGE_OFFSET). * This mapping is temporary and will disappear as soon as the * setup done by MMU_Init() is applied. * @@ -71,7 +71,7 @@ int force_printk_to_btext = 0; */ void __init btext_prepare_BAT(void) { - unsigned long vaddr = KERNELBASE + 0x10000000; + unsigned long vaddr = PAGE_OFFSET + 0x10000000; unsigned long addr; unsigned long lowbits; From arnd at arndb.de Tue Dec 6 03:33:34 2005 From: arnd at arndb.de (Arnd Bergmann) Date: Mon, 5 Dec 2005 17:33:34 +0100 Subject: [RFC PATCH 5/5] CELL bogus_console port to hvc_console backend driver In-Reply-To: References: Message-ID: <200512051733.35417.arnd@arndb.de> On Maandag 05 Dezember 2005 15:45, Eric V Van hensbergen wrote: > Its a relatively small knit-pick, but the callthru functions should > probably > be kept in a common include. 
My patch-set has > include/asm-powerpc/systemsim.h > which includes these definitions. That way we don't have to define the > callthru's for every driver which might use them (such as BogusNet or > BogusDisk). > That's right. I have already ported the patches to the powerpc.git tree and used your systemsim.h file for that. What are your plans for bringing your patches upstream? The code in there looks pretty good already, but I guess some day you should split it up into smaller patches and submit those. Arnd <>< From rsa at us.ibm.com Tue Dec 6 04:06:27 2005 From: rsa at us.ibm.com (Ryan Arnold) Date: Mon, 05 Dec 2005 11:06:27 -0600 Subject: [RFC PATCH 5/5] CELL bogus_console port to hvc_console backend driver In-Reply-To: References: Message-ID: <1133802387.10632.8.camel@localhost.localdomain> On Mon, 2005-12-05 at 08:45 -0600, Eric V Van hensbergen wrote: > Its a relatively small knit-pick, but the callthru functions should > probably > be kept in a common include. My patch-set has > include/asm-powerpc/systemsim.h > which includes these definitions. That way we don't have to define the > callthru's for every driver which might use them (such as BogusNet or > BogusDisk). > > -eric Thanks Eric, I did question whether the console driver was the appropriate place for the callthru when I moved the definitions from bogus_console.c. I guess we won't see these definitions moved to an alternate file until Arnd makes his patches available against a more recent kernel?
-- Ryan Arnold IBM Linux Technology Center From ericvanhensbergen at us.ibm.com Tue Dec 6 04:14:04 2005 From: ericvanhensbergen at us.ibm.com (Eric V Van hensbergen) Date: Mon, 5 Dec 2005 11:14:04 -0600 Subject: [RFC PATCH 5/5] CELL bogus_console port to hvc_console backend driver In-Reply-To: Message-ID: rsa at us.ltcfwd.linux.ibm.com wrote on 12/05/2005 11:11:11 AM: > On Mon, 2005-12-05 at 08:45 -0600, Eric V Van hensbergen wrote: > > Its a relatively small knit-pick, but the callthru functions should > > probably > > be kept in a common include. My patch-set has > > include/asm-powerpc/systemsim.h > > which includes these definitions. That way we don't have to define the > > callthru's for every driver which might use them (such as BogusNet or > > BogusDisk). > > > > -eric > > Thanks Eric, > > I did question whether the console driver was the appropriate place for > the callthru when I moved the definitions from bogus_console.c. I guess > we won't see these definitions moved to an alternate file until Arnd > makes his patches available against a more recent kernel? > You can get to my (generic systemsim patches) via kernel.org: /pub/scm/linux/kernel/git/ericvh/systemsim.git You should be able to pull the general definitions from there if you want to update your patch. -eric From ericvanhensbergen at us.ibm.com Tue Dec 6 04:17:22 2005 From: ericvanhensbergen at us.ibm.com (Eric V Van hensbergen) Date: Mon, 5 Dec 2005 11:17:22 -0600 Subject: [RFC PATCH 5/5] CELL bogus_console port to hvc_console backend driver In-Reply-To: Message-ID: Arnd Bergmann wrote on 12/05/2005 10:35:13 AM: > On Maandag 05 Dezember 2005 15:45, Eric V Van hensbergen wrote: > > Its a relatively small knit-pick, but the callthru functions should > > probably > > be kept in a common include. My patch-set has > > include/asm-powerpc/systemsim.h > > which includes these definitions.
That way we don't have to define the > > callthru's for every driver which might use them (such as BogusNet or > > BogusDisk). > > > > That's right. I have already ported the patches to the powerpc.git tree > and used your systemsim.h file for that. > > What are your plans for bringing your patches upstream? The code in there > looks pretty good already, but I guess some day you should split it up > into smaller patches and submit those. > I suppose if there is sufficient pull I could push them at any time -- I haven't gone down this path because I'm not sure how I feel about including simulator drivers in the mainline kernel tree. If the linuxppc64 folks think this is valuable, I'd be happy to clean up the drivers a bit more and submit a patch. -eric From miltonm at bga.com Tue Dec 6 04:27:07 2005 From: miltonm at bga.com (Milton Miller) Date: Mon, 5 Dec 2005 11:27:07 -0600 Subject: [RFC PATCH 2/5] CELL bogus_console port to hvc_console backend driver In-Reply-To: <43935BB0.9050306@us.ibm.com> References: <43935BB0.9050306@us.ibm.com> Message-ID: <1e4db56531f2c11b786669e793c749d1@bga.com> Hi Ryan. On Dec 4, 2005, at 3:12 PM, Ryan S. Arnold wrote: > This patch shuffles around some data-type declarations and moves some > functions out of include/asm-ppc64/hvconsole.h and into a new > drivers/char/hvc_console.h file. > > Signed-off-by: Ryan S. Arnold > > diff -uNr linux-2.6.14-rc5/drivers/char/hvc_console.c > linux-2.6.14-rc5-cbe-fss/drivers/char/hvc_console.c > --- linux-2.6.14-rc5/drivers/char/hvc_console.c 2005-10-20 > 02:23:05.000000000 -0400 > +++ linux-2.6.14-rc5-cbe-fss/drivers/char/hvc_console.c 2005-12-02 > 17:20:58.095207576 -0500 > @@ -40,7 +40,7 @@ > #include > #include > #include > -#include > +#include "hvc_console.h" > > #define HVC_MAJOR 229 > #define HVC_MINOR 0 > @@ -61,11 +61,6 @@ > */ > #define HVC_ALLOC_TTY_ADAPTERS 8 Above should be in .h file, and consistent with console (or more).
> > -#define N_OUTBUF 16 > -#define N_INBUF 16 > - > -#define __ALIGNED__ __attribute__((__aligned__(8))) > - > static struct tty_driver *hvc_driver; > static struct task_struct *hvc_task; > > @@ -76,22 +71,6 @@ > static int sysrq_pressed; > #endif > > -struct hvc_struct { > - spinlock_t lock; > - int index; > - struct tty_struct *tty; > - unsigned int count; > - int do_wakeup; > - char outbuf[N_OUTBUF] __ALIGNED__; > - int n_outbuf; > - uint32_t vtermno; > - struct hv_ops *ops; > - int irq_requested; > - int irq; > - struct list_head next; > - struct kobject kobj; /* ref count & hvc_struct lifetime */ > -}; > - > /* dynamic list of hvc_struct instances */ > static struct list_head hvc_structs = LIST_HEAD_INIT(hvc_structs); > > @@ -136,7 +115,6 @@ > return hp; > } > > - > /* > * Initial console vtermnos for console API usage prior to full > console > * initialization. Any vty adapter outside this range will not have > usable > @@ -154,6 +132,7 @@ > > void hvc_console_print(struct console *co, const char *b, unsigned > count) > { > + /* This [16] should probably use a #define */ N_OUTBUF perhaps? > char c[16] __ALIGNED__; > unsigned i = 0, n = 0; > int r, donecr = 0, index = co->index; > diff -uNr linux-2.6.14-rc5/drivers/char/hvc_console.h > linux-2.6.14-rc5-cbe-fss/drivers/char/hvc_console.h > --- linux-2.6.14-rc5/drivers/char/hvc_console.h 1969-12-31 > 19:00:00.000000000 -0500 > +++ linux-2.6.14-rc5-cbe-fss/drivers/char/hvc_console.h 2005-12-02 > 17:27:07.733180280 -0500 > @@ -0,0 +1,83 @@ > +/* > + * hvc_console.h > + * Copyright (C) 2005 IBM Corporation > + * > + * Author(s): > + * Ryan S. 
Arnold > + * > + * hvc_console header information: > + * moved here from include/asm-ppc64/hvconsole.h > + * and drivers/char/hvc_console.c > + * > + * This program is free software; you can redistribute it and/or > modify > + * it under the terms of the GNU General Public License as published > by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License > + * along with this program; if not, write to the Free Software > + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA > 02111-1307 USA > + */ > + > +#ifndef HVC_CONSOLE_H > +#define HVC_CONSOLE_H > + > +#include > +#include > +#include > + > +/* > + * This is the max number of console adapters that can/will be found > as > + * console devices on first stage console init. Any number beyond > this range > + * can't be used as a console device but is still a valid tty device. > + */ > +#define MAX_NR_HVC_CONSOLES 16 > + > +/* > + * This is a design shortcoming, the number '16' is a vio required > buffer > + * size. This should be changeable per architecture, but hvc_struct > relies > + * upon it and that struct is used by all hvc_console backend > drivers. This > + * needs to be fixed. > + */ > +#define N_OUTBUF 16 > +#define N_INBUF 16 > + > +#define __ALIGNED__ __attribute__((__aligned__(sizeof(long)))) > + A little bit generic for a .h file used by multiple files ... but see next comment. 
> +/* implemented by a low level driver */ > +struct hv_ops { > + int (*get_chars)(uint32_t vtermno, char *buf, int count); > + int (*put_chars)(uint32_t vtermno, const char *buf, int count); > +}; > + > +struct hvc_struct { > + spinlock_t lock; > + int index; > + struct tty_struct *tty; > + unsigned int count; > + int do_wakeup; > + char outbuf[N_OUTBUF] __ALIGNED__; > + int n_outbuf; > + uint32_t vtermno; > + struct hv_ops *ops; > + int irq_requested; > + int irq; > + struct list_head next; > + struct kobject kobj; /* ref count & hvc_struct lifetime */ > +}; Why are you putting the full structure definition in the .h file instead of just declaring the struct? It only encourages clients to dig into the structure instead of treating it as magic cookie. > + > +/* Register a vterm and a slot index for use as a console > (console_init) */ > +extern int hvc_instantiate(uint32_t vtermno, int index, struct hv_ops > *ops); > + > +/* register a vterm for hvc tty operation (module_init or hotplug > add) */ > +extern struct hvc_struct * __devinit hvc_alloc(uint32_t vtermno, int > irq, > + struct hv_ops *ops); > +/* remove a vterm from hvc tty operation (modele_exit or hotplug > remove) */ > +extern int __devexit hvc_remove(struct hvc_struct *hp); > + > +#endif // HVC_CONSOLE_H > diff -uNr linux-2.6.14-rc5/include/asm-ppc64/hvconsole.h > linux-2.6.14-rc5-cbe-fss/include/asm-ppc64/hvconsole.h > --- linux-2.6.14-rc5/include/asm-ppc64/hvconsole.h 2005-10-20 > 02:23:05.000000000 -0400 > +++ linux-2.6.14-rc5-cbe-fss/include/asm-ppc64/hvconsole.h 2005-11-14 > 16:24:02.000000000 -0500 > @@ -22,28 +22,7 @@ > #ifndef _PPC64_HVCONSOLE_H > #define _PPC64_HVCONSOLE_H > > -/* > - * This is the max number of console adapters that can/will be found > as > - * console devices on first stage console init. Any number beyond > this range > - * can't be used as a console device but is still a valid tty device. 
> - */ > -#define MAX_NR_HVC_CONSOLES 16 > - > -/* implemented by a low level driver */ > -struct hv_ops { > - int (*get_chars)(uint32_t vtermno, char *buf, int count); > - int (*put_chars)(uint32_t vtermno, const char *buf, int count); > -}; > extern int hvc_get_chars(uint32_t vtermno, char *buf, int count); > extern int hvc_put_chars(uint32_t vtermno, const char *buf, int > count); > > -struct hvc_struct; > - > -/* Register a vterm and a slot index for use as a console > (console_init) */ > -extern int hvc_instantiate(uint32_t vtermno, int index, struct hv_ops > *ops); > -/* register a vterm for hvc tty operation (module_init or hotplug > add) */ > -extern struct hvc_struct * __devinit hvc_alloc(uint32_t vtermno, int > irq, > - struct hv_ops *ops); > -/* remove a vterm from hvc tty operation (modele_exit or hotplug > remove) */ > -extern int __devexit hvc_remove(struct hvc_struct *hp); > #endif /* _PPC64_HVCONSOLE_H */ Did I miss the addition of hvc_console.h to hvc_vio.c (and hvsi)? I'm ok moving the .h but it does constrain the clients to be in drivers/char. milton From miltonm at bga.com Tue Dec 6 04:27:55 2005 From: miltonm at bga.com (Milton Miller) Date: Mon, 5 Dec 2005 11:27:55 -0600 Subject: [RFC PATCH 3/5] CELL bogus_console port to hvc_console backend driver In-Reply-To: <43935BB5.9030302@us.ibm.com> References: <43935BB5.9030302@us.ibm.com> Message-ID: <2b19bee9bd90cfee311d8076b026add4@bga.com> On Dec 4, 2005, at 3:12 PM, Ryan S. Arnold wrote: > This patch modifies the defconfig file for the CELL simulator and > changes the Makefile and Kconfig to add hvc_fss. > > Signed-off-by: Ryan S. 
Arnold > > > diff -uNr linux-2.6.14-rc5/arch/ppc64/configs/cbesim_defconfig > linux-2.6.14-rc5-cbe-fss/arch/ppc64/configs/cbesim_defconfig > --- linux-2.6.14-rc5/arch/ppc64/configs/cbesim_defconfig 2005-11-14 > 12:26:32.000000000 -0500 > +++ > linux-2.6.14-rc5-cbe-fss/arch/ppc64/configs/cbesim_defconfig 2005-11 > -14 15:59:05.000000000 -0500 > @@ -322,7 +322,7 @@ > CONFIG_UNIX98_PTYS=y > # CONFIG_LEGACY_PTYS is not set > # CONFIG_RTASCONS is not set > -CONFIG_BOGUS_CONSOLE=y > +CONFIG_HVC_FSS=y > > # > # IPMI > diff -uNr linux-2.6.14-rc5/drivers/char/Kconfig > linux-2.6.14-rc5-cbe-fss/drivers/char/Kconfig > --- linux-2.6.14-rc5/drivers/char/Kconfig 2005-11-14 > 12:26:32.000000000 -0500 > +++ linux-2.6.14-rc5-cbe-fss/drivers/char/Kconfig 2005-12-02 > 17:44:04.490273872 -0500 > @@ -552,25 +552,37 @@ > > If unsure, say N. > > +config HVC_DRIVER > + bool "PowerPC virtual console front-end support" > + depends on PPC_PSERIES || PPC_BPA || PPC_RTAS > + help > + Users of pSeries machines that want to utilize the hvc console > front-end > + module for their backend console driver should select this option. > + It will automatically be selected if one of the back-end console > drivers > + is selected. > + Lets just keep this hidden -- so take out depends (its all generic code) and just say bool (without any quoted text). The help text could then be made more generic. > config HVC_CONSOLE > bool "pSeries Hypervisor Virtual Console support" > depends on PPC_PSERIES > + select HVC_DRIVER > help > pSeries machines when partitioned support a hypervisor virtual > console. This driver allows each pSeries partition to have a > console > which is accessed via the HMC. > > -config RTASCONS > - bool "RTAS firmware console support" > - depends on PPC_RTAS > - help > - RTAS console support. 
> - > -config BOGUS_CONSOLE > - bool "Simulator bogus console support" > +config HVC_FSS > + bool "IBM Full System Simulator Console support" > depends on PPC_PSERIES || PPC_BPA > + select HVC_DRIVER > + help > + IBM Full System Simulator Console device driver which makes use of > + the HVC_DRIVER front end. > + > +config RTASCONS > + bool "RTAS firmware console support" > + depends on PPC_RTAS > help > - IBM System Simulator bogus console device driver. > + RTAS console support. > > config HVCS > tristate "IBM Hypervisor Virtual Console Server support" > diff -uNr linux-2.6.14-rc5/drivers/char/Makefile > linux-2.6.14-rc5-cbe-fss/drivers/char/Makefile > --- linux-2.6.14-rc5/drivers/char/Makefile 2005-11-14 > 12:26:32.000000000 -0500 > +++ linux-2.6.14-rc5-cbe-fss/drivers/char/Makefile 2005-12-02 > 17:24:12.583189272 -0500 > @@ -41,12 +41,13 @@ > obj-$(CONFIG_SX) += sx.o generic_serial.o > obj-$(CONFIG_RIO) += rio/ generic_serial.o > obj-$(CONFIG_RTASCONS) += rtascons.o > -obj-$(CONFIG_BOGUS_CONSOLE) +=bogus_console.o > -obj-$(CONFIG_HVC_CONSOLE) += hvc_console.o hvc_vio.o hvsi.o > +obj-$(CONFIG_HVC_DRIVER) += hvc_console.o > +obj-$(CONFIG_HVC_CONSOLE) += hvc_vio.o hvsi.o > +obj-$(CONFIG_HVC_FSS) += hvc_fss.o > obj-$(CONFIG_RAW_DRIVER) += raw.o > obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o > obj-$(CONFIG_MMTIMER) += mmtimer.o > -obj-$(CONFIG_VIOCONS) += viocons.o > +obj-$(CONFIG_VIOCONS) += viocons.o > obj-$(CONFIG_VIOTAPE) += viotape.o > obj-$(CONFIG_HVCS) += hvcs.o > obj-$(CONFIG_SGI_MBCS) += mbcs.o milton From miltonm at bga.com Tue Dec 6 04:27:57 2005 From: miltonm at bga.com (Milton Miller) Date: Mon, 5 Dec 2005 11:27:57 -0600 Subject: [RFC PATCH 5/5] CELL bogus_console port to hvc_console backend driver In-Reply-To: <43935BBF.6080005@us.ibm.com> References: <43935BBF.6080005@us.ibm.com> Message-ID: <5f0ba2ea6429728d231d7baf74a7018d@bga.com> Mostly style and less confusion. On Dec 4, 2005, at 3:12 PM, Ryan S. 
Arnold wrote: > This patch adds the hvc_fss.c driver file. > > Signed-off-by: Ryan S. Arnold > diff -uNr linux-2.6.14-rc5/drivers/char/hvc_fss.c > linux-2.6.14-rc5-cbe-fss/drivers/char/hvc_fss.c > --- linux-2.6.14-rc5/drivers/char/hvc_fss.c 1969-12-31 > 19:00:00.000000000 -0500 > +++ linux-2.6.14-rc5-cbe-fss/drivers/char/hvc_fss.c 2005-12-02 > 17:54:19.243249984 -0500 > @@ -0,0 +1,148 @@ > +/* > + * IBM Full System Simulator driver interface to hvc_console.c > + * > + * (C) Copyright IBM Corporation 2001-2005 > + * Author(s): Maximino Augilar > + * : Ryan S. Arnold > + * > + * inspired by drivers/char/hvc_console.c > + * written by Anton Blanchard and Paul Mackerras > + * > + * Some code is from the IBM Full System Simulator Group in ARL. > + * Author: Patrick Bohrer > + * > + * Much of this code was moved here from the IBM Full System Simulator > + * Bogus console driver in order to reuse the framework provided by > the hvc > + * console driver. Ryan S. Arnold > + * > + * This program is free software; you can redistribute it and/or > modify > + * it under the terms of the GNU General Public License as published > by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License > + * along with this program; if not, write to the Free Software > + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA > 02111-1307 USA > + */ > + > +#include > +#include > +#include > +#include "hvc_console.h" > + > +static uint32_t hvc_fss_vtermno = 0; This might be confusing ... its not a terminal number so much as a cookie. Made me review the order of arguments below. 
It could be #define hvc_fss_cookie 0x (unless you expect more channels later). > +struct hvc_struct *hvc_fss_dev; > + > +static inline int callthru0(int command) > +{ > + register int c asm ("r3") = command; > + > + asm volatile (".long 0x000EAEB0" : "=r" (c): "r" (c)); > + return((c)); > +} > + > +static inline int callthru3(int command, unsigned long arg1, unsigned > long arg2, unsigned long arg3) > +{ > + register int c asm ("r3") = command; > + register unsigned long a1 asm ("r4") = arg1; > + register unsigned long a2 asm ("r5") = arg2; > + register unsigned long a3 asm ("r6") = arg3; > + > + asm volatile (".long 0x000EAEB0" : "=r" (c): "r" (c), "r" (a1), "r" > (a2), "r" (a3)); > + return((c)); > +} nit: =&r ? (not sure but I thought that made something input and output) > + > +static inline int hvc_fss_write_console(uint32_t vtermno, const char > *buf, int count) > +{ > + int ret = 0; assigning =0 when unconditionally setting below is redundant. > + ret = callthru3(0, (unsigned long)buf, > + (unsigned long)count, (unsigned long)1); > + if (ret != 0) { > + return (count - ret); /* is this right? */ > + } > + > + /* the calling routine expects to receive the number of bytes sent */ > + return count; > +} > + > +static inline int hvc_fss_read_console(uint32_t vtermno, char *buf, > int count) > +{ > + unsigned long got; > + int c; > + int i; > + > + for (got = 0, i = 0; i < count; i++) { > + Here I would go the other way, and initialize got above, and only put i=0 in the for statement ... I had to look twice to find the initialization. 
> + if (( c = callthru0(60) ) != -1) { > + buf[i] = c; > + ++got; > + } > + else } and else on same line > + break; > + } > + return got; > +} > + > +static struct hv_ops hvc_fss_get_put_ops = { > + .get_chars = hvc_fss_read_console, > + .put_chars = hvc_fss_write_console, > +}; > + > +static int hvc_fss_init(void) > +{ > + /* Register a single device with the driver */ > + struct hvc_struct *hp; > + > + if(!__onsim()) { > + return -1; > + } > + > + if(hvc_fss_dev) { > + return -1; /* This shouldn't happen */ > + } > + > + /* Allocate an hvc_struct for the console device we instantiated > + * earlier. Save off hp so that we can return it on exit */ > + hp = hvc_alloc(hvc_fss_vtermno, NO_IRQ, &hvc_fss_get_put_ops); > + if (IS_ERR(hp)) > + return PTR_ERR(hp); > + hvc_fss_dev = hp; > + return 0; > +} > +module_init(hvc_fss_init); > + > +/* This will tear down the tty portion of the driver */ > +static void __exit hvc_fss_exit(void) > +{ > + struct hvc_struct *hp_safe; > + /* Hopefully this isn't premature */ > + if (!hvc_fss_dev) > + return; > + > + hp_safe = hvc_fss_dev; > + hvc_fss_dev = NULL; > + > + /* Really the fun isn't over until the worker thread breaks down and > the > + * tty cleans up */ > + hvc_remove(hp_safe); > +} > +module_exit(hvc_fss_exit); /* before drivers/char/hvc_console.c */ > + > +/* This will happen prior to module init. There is no tty at this > time? */ > +static int hvc_fss_console_init(void) > +{ > + /* Don't register if we aren't running on the simulator */ > + if (__onsim()) { > + /* Tell the driver we know of one console device. We > + * shouldn't get a collision on the index as long as no-one > + * else instantiates on hardware they don't have. 
*/ > + hvc_instantiate(hvc_fss_vtermno, 0, &hvc_fss_get_put_ops ); > + } > + return 0; > +} > +console_initcall(hvc_fss_console_init); milton From kravetz at us.ibm.com Tue Dec 6 06:25:52 2005 From: kravetz at us.ibm.com (Mike Kravetz) Date: Mon, 5 Dec 2005 11:25:52 -0800 Subject: [PATCH 3/11] powerpc: Seperate usage of KERNELBASE and PAGE_OFFSET In-Reply-To: <20051205003934.643E26887C@ozlabs.org> References: <1133743149.268607.418162138937.qpush@concordia> <20051205003934.643E26887C@ozlabs.org> Message-ID: <20051205192552.GA5535@w-mikek2.ibm.com> On Sun, Dec 04, 2005 at 06:39:20PM +0000, Michael Ellerman wrote: > Index: kexec/arch/powerpc/mm/hash_utils_64.c > =================================================================== > --- kexec.orig/arch/powerpc/mm/hash_utils_64.c > +++ kexec/arch/powerpc/mm/hash_utils_64.c > @@ -456,7 +456,7 @@ void __init htab_initialize(void) > > /* create bolted the linear mapping in the hash table */ > for (i=0; i < lmb.memory.cnt; i++) { > - base = lmb.memory.region[i].base + KERNELBASE; > + base = (unsigned long)__va(lmb.memory.region[i].base); > size = lmb.memory.region[i].size; I think you will want to make a similar change to the routine add_memory() in powerpc/mm/mem.c. This routine was based on htab_initialize's call to htab_bolt_mapping(). 
int __devinit add_memory(u64 start, u64 size) { struct pglist_data *pgdata = NODE_DATA(0); struct zone *zone; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; start += KERNELBASE; create_section_mapping(start, start + size); /* this should work for most non-highmem platforms */ zone = pgdata->node_zones; return __add_pages(zone, start_pfn, nr_pages); return 0; } -- Mike From kravetz at us.ibm.com Tue Dec 6 07:06:42 2005 From: kravetz at us.ibm.com (Mike Kravetz) Date: Mon, 5 Dec 2005 12:06:42 -0800 Subject: [PATCH] reworked: numa placement for dynamically added memory Message-ID: <20051205200642.GA20613@w-mikek2.ibm.com> Here is a reworked version of the patch with changes suggested by Nathan. Again, this patch depends on: http://ozlabs.org/pipermail/linuxppc64-dev/2005-December/006923.html This patch places dynamically added memory within the appropriate numa node. A new routine hot_add_scn_to_nid() replicates most of the memory scanning code in parse_numa_properties(). Signed-off-by: Mike Kravetz diff -Naupr linux-2.6.15-rc5-git1.dep/arch/powerpc/mm/mem.c linux-2.6.15-rc5-git1.work/arch/powerpc/mm/mem.c --- linux-2.6.15-rc5-git1.dep/arch/powerpc/mm/mem.c 2005-12-04 05:10:42.000000000 +0000 +++ linux-2.6.15-rc5-git1.work/arch/powerpc/mm/mem.c 2005-12-05 19:57:50.000000000 +0000 @@ -114,18 +114,17 @@ void online_page(struct page *page) num_physpages++; } -/* - * This works only for the non-NUMA case. Later, we'll need a lookup - * to convert from real physical addresses to nid, that doesn't use - * pfn_to_nid(). 
- */ int __devinit add_memory(u64 start, u64 size) { - struct pglist_data *pgdata = NODE_DATA(0); + struct pglist_data *pgdata; struct zone *zone; + int nid; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; + nid = hot_add_scn_to_nid(start); + pgdata = NODE_DATA(nid); + start += KERNELBASE; create_section_mapping(start, start + size); diff -Naupr linux-2.6.15-rc5-git1.dep/arch/powerpc/mm/numa.c linux-2.6.15-rc5-git1.work/arch/powerpc/mm/numa.c --- linux-2.6.15-rc5-git1.dep/arch/powerpc/mm/numa.c 2005-12-05 19:54:24.000000000 +0000 +++ linux-2.6.15-rc5-git1.work/arch/powerpc/mm/numa.c 2005-12-05 19:57:50.000000000 +0000 @@ -37,6 +37,7 @@ EXPORT_SYMBOL(node_data); static bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES]; static int min_common_depth; +static int n_mem_addr_cells, n_mem_size_cells; /* * We need somewhere to store start/end/node for each region until we have @@ -267,7 +268,7 @@ static void __init get_n_mem_cells(int * of_node_put(memory); } -static unsigned long __init read_n_cells(int n, unsigned int **buf) +static unsigned long __devinit read_n_cells(int n, unsigned int **buf) { unsigned long result = 0; @@ -374,7 +375,6 @@ static int __init parse_numa_properties( { struct device_node *cpu = NULL; struct device_node *memory = NULL; - int n_addr_cells, n_size_cells; int max_domain; unsigned long i; @@ -413,7 +413,7 @@ static int __init parse_numa_properties( } } - get_n_mem_cells(&n_addr_cells, &n_size_cells); + get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells); memory = NULL; while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { unsigned long start; @@ -430,8 +430,8 @@ static int __init parse_numa_properties( ranges = memory->n_addrs; new_range: /* these are order-sensitive, and modify the buffer pointer */ - start = read_n_cells(n_addr_cells, &memcell_buf); - size = read_n_cells(n_size_cells, &memcell_buf); + start = read_n_cells(n_mem_addr_cells, &memcell_buf); + size = 
read_n_cells(n_mem_size_cells, &memcell_buf); numa_domain = of_node_numa_domain(memory); @@ -717,3 +717,50 @@ static int __init early_numa(char *p) return 0; } early_param("numa", early_numa); + +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * Find the node associated with a hot added memory section. Section + * corresponds to a SPARSEMEM section, not an LMB. It is assumed that + * sections are fully contained within a single LMB. + */ +int hot_add_scn_to_nid(unsigned long scn_addr) +{ + struct device_node *memory = NULL; + + if (!numa_enabled || (min_common_depth < 0)) + return 0; + + while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { + unsigned long start, size; + int numa_domain, ranges; + unsigned int *memcell_buf; + unsigned int len; + + memcell_buf = (unsigned int *)get_property(memory, "reg", &len); + if (!memcell_buf || len <= 0) + continue; + + ranges = memory->n_addrs; /* ranges in cell */ +ha_new_range: + start = read_n_cells(n_mem_addr_cells, &memcell_buf); + size = read_n_cells(n_mem_size_cells, &memcell_buf); + numa_domain = of_node_numa_domain(memory); + + /* Domains not present at boot default to 0 */ + if (!node_online(numa_domain)) + numa_domain = any_online_node(NODE_MASK_ALL); + + if ((scn_addr >= start) && (scn_addr < (start + size))) { + of_node_put(memory); + return numa_domain; + } + + if (--ranges) /* process all ranges in cell */ + goto ha_new_range; + } + + BUG(); /* section address should be found above */ + return 0; +} +#endif /* CONFIG_MEMORY_HOTPLUG */ diff -Naupr linux-2.6.15-rc5-git1.dep/include/asm-powerpc/sparsemem.h linux-2.6.15-rc5-git1.work/include/asm-powerpc/sparsemem.h --- linux-2.6.15-rc5-git1.dep/include/asm-powerpc/sparsemem.h 2005-12-04 05:10:42.000000000 +0000 +++ linux-2.6.15-rc5-git1.work/include/asm-powerpc/sparsemem.h 2005-12-05 19:57:50.000000000 +0000 @@ -13,6 +13,14 @@ #ifdef CONFIG_MEMORY_HOTPLUG extern void create_section_mapping(unsigned long start, unsigned long end); +#ifdef CONFIG_NUMA +extern int 
hot_add_scn_to_nid(unsigned long scn_addr); +#else +static inline int hot_add_scn_to_nid(unsigned long scn_addr) +{ + return 0; +} +#endif /* CONFIG_NUMA */ #endif /* CONFIG_MEMORY_HOTPLUG */ #endif /* CONFIG_SPARSEMEM */ From jdl at freescale.com Tue Dec 6 08:06:48 2005 From: jdl at freescale.com (Jon Loeliger) Date: Mon, 05 Dec 2005 15:06:48 -0600 Subject: RFC: Rev 0.5 Booting the Linux/ppc kernel without Open Firmware Message-ID: <1133816807.8577.50.camel@cashmere.sps.mot.com> Folks, Included below is a proposed Revision 0.5 of the "Booting the Linux/ppc kernel without Open Firmware" document. This modification primarily extends the Revision 0.4 by adding definitions for OF Nodes that cover the System-On-a-Chip features found on PPC parts. It also generalizes some earlier wording that pertained to only PPC64 parts and covers the new, merged PPC 32 and 64 parts together. Finally, minor typos, style consistency and grammar problems were corrected. Please review this document, primarily Chapter VI, so that we may all proceed with the PPC32/64 merge tree development in a consistent, unified direction. While some effort has been made to follow standard OF nomenclature, terminology and standards, I confess that the authors of these additions are not experts in this area and my have missed details or key insights, or allowed for glaring errors. Our goal is collective improvement here, so be gentle when you call us stupid. :-) Thanks, jdl Booting the Linux/ppc kernel without Open Firmware -------------------------------------------------- (c) 2005 Benjamin Herrenschmidt , IBM Corp. (c) 2005 Becky Bruce , Freescale Semiconductor, FSL SOC and 32-bit additions May 18, 2005: Rev 0.1 - Initial draft, no chapter III yet. May 19, 2005: Rev 0.2 - Add chapter III and bits & pieces here or clarifies the fact that a lot of things are optional, the kernel only requires a very small device tree, though it is encouraged to provide an as complete one as possible. 
May 24, 2005: Rev 0.3 - Clarify that DT block has to be in RAM - Misc fixes - Define version 3 and new format version 16 for the DT block (version 16 needs kernel patches, will be fwd separately). String block now has a size, and full path is replaced by unit name for more compactness. linux,phandle is made optional, only nodes that are referenced by other nodes need it. "name" property is now automatically deduced from the unit name June 1, 2005: Rev 0.4 - Correct confusion between OF_DT_END and OF_DT_END_NODE in structure definition. - Change version 16 format to always align property data to 4 bytes. Since tokens are already aligned, that means no specific required alignment between property size and property data. The old style variable alignment would make it impossible to do "simple" insertion of properties using memmove (thanks Milton for noticing). Updated kernel patch as well - Correct a few more alignment constraints - Add a chapter about the device-tree compiler and the textual representation of the tree that can be "compiled" by dtc. November 21, 2005: Rev 0.5 - Additions/generalizations for 32-bit - Changed to reflect the new arch/powerpc structure - Added chapter VI ToDo: - Add some definitions of interrupt tree (simple/complex) - Add some definitions for pci host bridges - Add some common address format examples - Add definitions for standard properties and "compatible" names for cells that are not already defined by the existing OF spec. - Compare FSL SOC use of PCI to standard and make sure no new node definition required. - Add more information about node definitions for SOC devices that currently have no standard, like the FSL CPM.
I - Introduction ================ During the recent development of the Linux/ppc64 kernel, and more specifically, the addition of new platform types outside of the old IBM pSeries/iSeries pair, it was decided to enforce some strict rules regarding the kernel entry and bootloader <-> kernel interfaces, in order to avoid the degeneration that had become the ppc32 kernel entry point and the way a new platform should be added to the kernel. The legacy iSeries platform breaks those rules as it predates this scheme, but no new board support will be accepted in the main tree that doesn't follow them properly. In addition, since the advent of the arch/powerpc merged architecture for ppc32 and ppc64, new 32-bit platforms and 32-bit platforms which move into arch/powerpc will be required to use these rules as well. The main requirement that will be defined in more detail below is the presence of a device-tree whose format is defined after Open Firmware specification. However, in order to make life easier for embedded board vendors, the kernel doesn't require the device-tree to represent every device in the system and only requires some nodes and properties to be present. This will be described in detail in section III, but, for example, the kernel does not require you to create a node for every PCI device in the system. It is a requirement to have a node for PCI host bridges in order to provide interrupt routing information and memory/IO ranges, among others. It is also recommended to define nodes for on chip devices and other busses that don't specifically fit in an existing OF specification. This creates a great flexibility in the way the kernel can then probe those and match drivers to device, without having to hard code all sorts of tables. It also makes it more flexible for board vendors to do minor hardware upgrades without significantly impacting the kernel code or cluttering it with special cases.
1) Entry point for arch/powerpc ------------------------------- There is one and one single entry point to the kernel, at the start of the kernel image. That entry point supports two calling conventions: a) Boot from Open Firmware. If your firmware is compatible with Open Firmware (IEEE 1275) or provides an OF compatible client interface API (support for "interpret" callback of forth words isn't required), you can enter the kernel with: r5 : OF callback pointer as defined by IEEE 1275 bindings to powerpc. Only the 32 bit client interface is currently supported r3, r4 : address & length of an initrd if any or 0 The MMU is either on or off; the kernel will run the trampoline located in arch/powerpc/kernel/prom_init.c to extract the device-tree and other information from open firmware and build a flattened device-tree as described in b). prom_init() will then re-enter the kernel using the second method. This trampoline code runs in the context of the firmware, which is supposed to handle all exceptions during that time. b) Direct entry with a flattened device-tree block. This entry point is called by a) after the OF trampoline and can also be called directly by a bootloader that does not support the Open Firmware client interface. It is also used by "kexec" to implement "hot" booting of a new kernel from a previous running one. This method is what I will describe in more details in this document, as method a) is simply standard Open Firmware, and thus should be implemented according to the various standard documents defining it and its binding to the PowerPC platform. The entry point definition then becomes: r3 : physical pointer to the device-tree block (defined in chapter II) in RAM r4 : physical pointer to the kernel itself. This is used by the assembly code to properly disable the MMU in case you are entering the kernel with MMU enabled and a non-1:1 mapping. 
r5 : NULL (so as to differentiate from method a) Note about SMP entry: Either your firmware puts your other CPUs in some sleep loop or spin loop in ROM where you can get them out via a soft reset or some other means, in which case you don't need to care, or you'll have to enter the kernel with all CPUs. The way to do that with method b) will be described in a later revision of this document. 2) Board support ---------------- 64-bit kernels: Board supports (platforms) are not exclusive config options. An arbitrary set of board supports can be built in a single kernel image. The kernel will "know" what set of functions to use for a given platform based on the content of the device-tree. Thus, you should: a) add your platform support as a _boolean_ option in arch/powerpc/Kconfig, following the example of PPC_PSERIES, PPC_PMAC and PPC_MAPLE. The latter is probably a good example of a board support to start from. b) create your main platform file as "arch/powerpc/platforms/myplatform/myboard_setup.c" and add it to the Makefile under the condition of your CONFIG_ option. This file will define a structure of type "ppc_md" containing the various callbacks that the generic code will use to get to your platform specific code c) Add a reference to your "ppc_md" structure in the "machines" table in arch/powerpc/kernel/setup_64.c if you are a 64-bit platform. d) request and get assigned a platform number (see PLATFORM_* constants in include/asm-powerpc/processor.h) 32-bit embedded kernels: Currently, board support is essentially an exclusive config option. The kernel is configured for a single platform. Part of the reason for this is to keep kernels on embedded systems small and efficient; part of this is due to the fact the code is already that way. In the future, a kernel may support multiple platforms, but only if the platforms feature the same core architecture.
A single kernel build cannot support both configurations with Book E and configurations with classic Powerpc architectures. 32-bit embedded platforms that are moved into arch/powerpc using a flattened device tree should adopt the merged tree practice of setting ppc_md up dynamically, even though the kernel is currently built with support for only a single platform at a time. This allows unification of the setup code, and will make it easier to go to a multiple-platform-support model in the future. NOTE: I believe the above will be true once Ben's done with the merge of the boot sequences.... someone speak up if this is wrong! To add a 32-bit embedded platform support, follow the instructions for 64-bit platforms above, with the exception that the Kconfig option should be set up such that the kernel builds exclusively for the platform selected. The processor type for the platform should enable another config option to select the specific board supported. NOTE: If ben doesn't merge the setup files, may need to change this to point to setup_32.c I will describe later the boot process and various callbacks that your platform should implement. II - The DT block format ======================== This chapter defines the actual format of the flattened device-tree passed to the kernel. The actual content of it and kernel requirements are described later. You can find example of code manipulating that format in various places, including arch/powerpc/kernel/prom_init.c which will generate a flattened device-tree from the Open Firmware representation, or the fs2dt utility which is part of the kexec tools which will generate one from a filesystem representation. It is expected that a bootloader like uboot provides a bit more support, that will be discussed later as well. Note: The block has to be in main memory. It has to be accessible in both real mode and virtual mode with no mapping other than main memory. 
If you are writing a simple flash bootloader, it should copy the block to RAM before passing it to the kernel. 1) Header --------- The kernel is entered with r3 pointing to an area of memory that is roughly described in include/asm-powerpc/prom.h by the structure boot_param_header: struct boot_param_header { u32 magic; /* magic word OF_DT_HEADER */ u32 totalsize; /* total size of DT block */ u32 off_dt_struct; /* offset to structure */ u32 off_dt_strings; /* offset to strings */ u32 off_mem_rsvmap; /* offset to memory reserve map */ u32 version; /* format version */ u32 last_comp_version; /* last compatible version */ /* version 2 fields below */ u32 boot_cpuid_phys; /* Which physical CPU id we're booting on */ /* version 3 fields below */ u32 size_dt_strings; /* size of the strings block */ }; Along with the constants: /* Definitions used by the flattened device tree */ #define OF_DT_HEADER 0xd00dfeed /* 4: version, 4: total size */ #define OF_DT_BEGIN_NODE 0x1 /* Start node: full name */ #define OF_DT_END_NODE 0x2 /* End node */ #define OF_DT_PROP 0x3 /* Property: name off, size, content */ #define OF_DT_END 0x9 All values in this header are in big endian format, the various fields in this header are defined more precisely below. All "offset" values are in bytes from the start of the header; that is from the value of r3. - magic This is a magic value that "marks" the beginning of the device-tree block header. It contains the value 0xd00dfeed and is defined by the constant OF_DT_HEADER - totalsize This is the total size of the DT block including the header. The "DT" block should enclose all data structures defined in this chapter (which are pointed to by offsets in this header). That is, the device-tree structure, strings, and the memory reserve map. - off_dt_struct This is an offset from the beginning of the header to the start of the "structure" part of the device tree.
(see 2) device tree) - off_dt_strings This is an offset from the beginning of the header to the start of the "strings" part of the device-tree - off_mem_rsvmap This is an offset from the beginning of the header to the start of the reserved memory map. This map is a list of pairs of 64 bit integers. Each pair is a physical address and a size. The list is terminated by an entry of size 0. This map provides the kernel with a list of physical memory areas that are "reserved" and thus not to be used for memory allocations, especially during early initialization. The kernel needs to allocate memory during boot for things like un-flattening the device-tree, allocating an MMU hash table, etc... Those allocations must be done in such a way to avoid overwriting critical things like, on Open Firmware capable machines, the RTAS instance, or on some pSeries, the TCE tables used for the iommu. Typically, the reserve map should contain _at least_ this DT block itself (header,total_size). If you are passing an initrd to the kernel, you should reserve it as well. You do not need to reserve the kernel image itself. The map should be 64 bit aligned. - version This is the version of this structure. Version 1 stops here. Version 2 adds an additional field boot_cpuid_phys. Version 3 adds the size of the strings block, allowing the kernel to reallocate it easily at boot and free up the unused flattened structure after expansion. Version 16 introduces a new more "compact" format for the tree itself that is however not backward compatible. You should always generate a structure of the highest version defined at the time of your implementation. Currently that is version 16, unless you explicitly aim at being backward compatible. - last_comp_version Last compatible version. This indicates down to what version of the DT block you are backward compatible.
For example, version 2 is backward compatible with version 1 (that is, a kernel built for version 1 will be able to boot with a version 2 format). You should put a 1 in this field if you generate a device tree of version 1 to 3, or 0x10 if you generate a tree of version 0x10 using the new unit name format. - boot_cpuid_phys This field only exists on version 2 headers. It indicates which physical CPU ID is calling the kernel entry point. This is used, among others, by kexec. If you are on an SMP system, this value should match the content of the "reg" property of the CPU node in the device-tree corresponding to the CPU calling the kernel entry point (see further chapters for more information on the required device-tree contents) So the typical layout of a DT block (though the various parts don't need to be in that order) looks like this (addresses go from top to bottom): ------------------------------ r3 -> | struct boot_param_header | ------------------------------ | (alignment gap) (*) | ------------------------------ | memory reserve map | ------------------------------ | (alignment gap) | ------------------------------ | | | device-tree structure | | | ------------------------------ | (alignment gap) | ------------------------------ | | | device-tree strings | | | -----> ------------------------------ | | --- (r3 + totalsize) (*) The alignment gaps are not necessarily present; their presence and size are dependent on the various alignment requirements of the individual data blocks. 2) Device tree generalities --------------------------- This device-tree itself is separated in two different blocks, a structure block and a strings block. Both need to be aligned to a 4 byte boundary. First, let's quickly describe the device-tree concept before detailing the storage format. This chapter does _not_ describe the detail of the required types of nodes & properties for the kernel, this is done later in chapter III.
The device-tree layout is strongly inherited from the definition of the Open Firmware IEEE 1275 device-tree. It's basically a tree of nodes, each node having two or more named properties. A property can have a value or not. It is a tree, so each node has one and only one parent except for the root node which has no parent. A node has 2 names. The actual node name is generally contained in a property of type "name" in the node property list whose value is a zero terminated string and is mandatory for version 1 to 3 of the format definition (as it is in Open Firmware). Version 0x10 makes it optional as it can generate it from the unit name defined below. There is also a "unit name" that is used to differentiate nodes with the same name at the same level, it is usually made of the node name, the "@" sign, and a "unit address", whose definition is specific to the bus type the node sits on. The unit name doesn't exist as a property per se but is included in the device-tree structure. It is typically used to represent "path" in the device-tree. More details about the actual format of these will be below. The kernel powerpc generic code does not make any formal use of the unit address (though some board support code may do) so the only real requirement here for the unit address is to ensure uniqueness of the node unit name at a given level of the tree. Nodes with no notion of address and no possible sibling of the same name (like /memory or /cpus) may omit the unit address in the context of this specification, or use the "@0" default unit address. The unit name is used to define a node "full path", which is the concatenation of all parent node unit names separated with "/". The root node doesn't have a defined name, and isn't required to have a name property either if you are using version 3 or earlier of the format. It also has no unit address (no @ symbol followed by a unit address). The root node unit name is thus an empty string. The full path to the root node is "/".
Every node which actually represents an actual device (that is, a node which isn't only a virtual "container" for more nodes, like "/cpus" is) is also required to have a "device_type" property indicating the type of node. Finally, every node that can be referenced from a property in another node is required to have a "linux,phandle" property. Real open firmware implementations provide a unique "phandle" value for every node that the "prom_init()" trampoline code turns into "linux,phandle" properties. However, this is made optional if the flattened device tree is used directly. An example of a node referencing another node via "phandle" is when laying out the interrupt tree which will be described in a further version of this document. This "linux,phandle" property is a 32 bit value that uniquely identifies a node. You are free to use whatever values or system of values, internal pointers, or whatever to generate these, the only requirement is that every node for which you provide that property has a unique value for it. Here is an example of a simple device-tree. In this example, an "o" designates a node followed by the node unit name. Properties are presented with their name followed by their content. "content" represents an ASCII string (zero terminated) value, while <content> represents a 32 bit hexadecimal value. The various nodes in this example will be discussed in a later chapter. At this point, it is only meant to give you an idea of what a device-tree looks like. I have purposefully kept the "name" and "linux,phandle" properties which aren't necessary in order to give you a better idea of what the tree looks like in practice.
/ o device-tree |- name = "device-tree" |- model = "MyBoardName" |- compatible = "MyBoardFamilyName" |- #address-cells = <2> |- #size-cells = <2> |- linux,phandle = <0> | o cpus | | - name = "cpus" | | - linux,phandle = <1> | | - #address-cells = <1> | | - #size-cells = <0> | | | o PowerPC,970@0 | |- name = "PowerPC,970" | |- device_type = "cpu" | |- reg = <0> | |- clock-frequency = <5f5e1000> | |- linux,boot-cpu | |- linux,phandle = <2> | o memory@0 | |- name = "memory" | |- device_type = "memory" | |- reg = <00000000 00000000 00000000 20000000> | |- linux,phandle = <3> | o chosen |- name = "chosen" |- bootargs = "root=/dev/sda2" |- linux,platform = <00000600> |- linux,phandle = <4> This tree is almost a minimal tree. It pretty much contains the minimal set of required nodes and properties to boot a linux kernel; that is, some basic model information at the root, the CPUs, and the physical memory layout. It also includes misc information passed through /chosen, like in this example, the platform type (mandatory) and the kernel command line arguments (optional). The /cpus/PowerPC,970@0/linux,boot-cpu property is an example of a property without a value. All other properties have a value. The significance of the #address-cells and #size-cells properties will be explained in chapter III which defines precisely the required nodes and properties and their content. 3) Device tree "structure" block The structure of the device tree is a linearized tree structure. The "OF_DT_BEGIN_NODE" token starts a new node, and the "OF_DT_END_NODE" ends that node definition. Child nodes are simply defined before "OF_DT_END_NODE" (that is nodes within the node). A 'token' is a 32 bit value. The tree has to be "finished" with an OF_DT_END token. Here's the basic structure of a single node: * token OF_DT_BEGIN_NODE (that is 0x00000001) * for version 1 to 3, this is the node full path as a zero terminated string, starting with "/".
For version 16 and later, this is the node unit name only (or an empty string for the root node) * [align gap to next 4 bytes boundary] * for each property: * token OF_DT_PROP (that is 0x00000003) * 32 bit value of property value size in bytes (or 0 if no value) * 32 bit value of offset in string block of property name * property value data if any * [align gap to next 4 bytes boundary] * [child nodes if any] * token OF_DT_END_NODE (that is 0x00000002) So the node content can be summarised as a start token, a full path, a list of properties, a list of child nodes and an end token. Every child node is a full node structure itself as defined above. 4) Device tree "strings" block In order to save space, property names, which are generally redundant, are stored separately in the "strings" block. This block is simply the whole bunch of zero terminated strings for all property names concatenated together. The device-tree property definitions in the structure block will contain offset values from the beginning of the strings block. III - Required content of the device tree ========================================= WARNING: All "linux,*" properties defined in this document apply only to a flattened device-tree. If your platform uses a real implementation of Open Firmware or an implementation compatible with the Open Firmware client interface, those properties will be created by the trampoline code in the kernel's prom_init() file. For example, that's where you'll have to add code to detect your board model and set the platform number. However, when using the flattened device-tree entry point, there is no prom_init() pass, and thus you have to provide those properties yourself. 1) Note about cells and address representation ---------------------------------------------- The general rule is documented in the various Open Firmware documentations. If you choose to describe a bus with the device-tree and there exists an OF bus binding, then you should follow the specification.
However, the kernel does not require every single device or bus to be described by the device tree. In general, the format of an address for a device is defined by the parent bus type, based on the #address-cells and #size-cells property. In the absence of such a property, the parent's parent values are used, etc... The kernel requires the root node to have those properties defining addresses format for devices directly mapped on the processor bus. Those 2 properties define 'cells' for representing an address and a size. A "cell" is a 32 bit number. For example, if both contain 2 like the example tree given above, then an address and a size are both composed of 2 cells, and each is a 64 bit number (cells are concatenated and expected to be in big endian format). Another example is the way Apple firmware defines them, with 2 cells for an address and one cell for a size. Most 32-bit implementations should define #address-cells and #size-cells to 1, which represents a 32-bit value. Some 32-bit processors allow for physical addresses greater than 32 bits; these processors should define #address-cells as 2. "reg" properties are always a tuple of the type "address size" where the number of cells of address and size is specified by the bus #address-cells and #size-cells. When a bus supports various address spaces and other flags relative to a given address allocation (like prefetchable, etc...) those flags are usually added to the top level bits of the physical address. For example, a PCI physical address is made of 3 cells, the bottom two containing the actual address itself while the top cell contains address space indication, flags, and pci bus & device numbers. For busses that support dynamic allocation, it's the accepted practice to then not provide the address in "reg" (keep it 0) though while providing a flag indicating the address is dynamically allocated, and then, to provide a separate "assigned-addresses" property that contains the fully allocated addresses. 
See the PCI OF bindings for details. In general, a simple bus with no address space bits and no dynamic allocation is preferred if it reflects your hardware, as the existing kernel address parsing functions will work out of the box. If you define a bus type with a more complex address format, including things like address space bits, you'll have to add a bus translator to the prom_parse.c file of the recent kernels for your bus type. The "reg" property only defines addresses and sizes (if #size-cells is non-0) within a given bus. In order to translate addresses upward (that is into parent bus addresses, and possibly into cpu physical addresses), all busses must contain a "ranges" property. If the "ranges" property is missing at a given level, it's assumed that translation isn't possible. The format of the "ranges" property for a bus is a list of: bus address, parent bus address, size "bus address" is in the format of the bus this bus node is defining, that is, for a PCI bridge, it would be a PCI address. Thus, (bus address, size) defines a range of addresses for child devices. "parent bus address" is in the format of the parent bus of this bus. For example, for a PCI host controller, that would be a CPU address. For a PCI<->ISA bridge, that would be a PCI address. It defines the base address in the parent bus where the beginning of that range is mapped. For a new 64 bit powerpc board, I recommend either the 2/2 format or Apple's 2/1 format which is slightly more compact since sizes usually fit in a single 32 bit word. New 32 bit powerpc boards should use a 1/1 format, unless the processor supports physical addresses greater than 32-bits, in which case a 2/1 format is recommended. 2) Note about "compatible" properties ------------------------------------- These properties are optional, but recommended in devices and the root node. The format of a "compatible" property is a list of concatenated zero terminated strings.
They allow a device to express its compatibility with a family of similar devices, in some cases, allowing a single driver to match against several devices regardless of their actual names. 3) Note about "name" properties ------------------------------- While earlier users of Open Firmware like OldWorld macintoshes tended to use the actual device name for the "name" property, it's nowadays considered a good practice to use a name that is closer to the device class (often equal to device_type). For example, nowadays, ethernet controllers are named "ethernet", an additional "model" property defining precisely the chip type/model, and "compatible" property defining the family in case a single driver can drive more than one of these chips. However, the kernel doesn't generally put any restriction on the "name" property; it is simply considered good practice to follow the standard and its evolutions as closely as possible. Note also that the new format version 16 makes the "name" property optional. If it's absent for a node, then the node's unit name is then used to reconstruct the name. That is, the part of the unit name before the "@" sign is used (or the entire unit name if no "@" sign is present). 4) Note about node and property names and character set ------------------------------------------------------- While open firmware provides more flexible usage of 8859-1, this specification enforces more strict rules. Nodes and properties should be comprised only of ASCII characters 'a' to 'z', '0' to '9', ',', '.', '_', '+', '#', '?', and '-'. Node names additionally allow uppercase characters 'A' to 'Z' (property names should be lowercase. The fact that vendors like Apple don't respect this rule is irrelevant here). Additionally, node and property names should always begin with a character in the range 'a' to 'z' (or 'A' to 'Z' for node names). The maximum number of characters for both nodes and property names is 31.
In the case of node names, this is only the leftmost part of a unit name (the pure "name" property), it doesn't include the unit address which can extend beyond that limit. 5) Required nodes and properties -------------------------------- These are all that are currently required. However, it is strongly recommended that you expose PCI host bridges as documented in the PCI binding to open firmware, and your interrupt tree as documented in OF interrupt tree specification. a) The root node The root node requires some properties to be present: - model : this is your board name/model - #address-cells : address representation for "root" devices - #size-cells: the size representation for "root" devices Additionally, some recommended properties are: - compatible : the board "family" generally finds its way here, for example, if you have 2 board models with a similar layout, that typically get driven by the same platform code in the kernel, you would use a different "model" property but put a value in "compatible". The kernel doesn't directly use that value (see /chosen/linux,platform for how the kernel chooses a platform type) but it is generally useful. The root node is also generally where you add additional properties specific to your board like the serial number if any, that sort of thing. It is recommended that if you add any "custom" property whose name may clash with standard defined ones, you prefix them with your vendor name and a comma. b) The /cpus node This node is the parent of all individual CPU nodes. It doesn't have any specific requirements, though it's generally good practice to have at least: #address-cells = <00000001> #size-cells = <00000000> This defines that the "address" for a CPU is a single cell, and has no meaningful size. This is not necessary but the kernel will assume that format when reading the "reg" properties of a CPU node, see below c) The /cpus/* nodes So under /cpus, you are supposed to create a node for every CPU on the machine.
There is no specific restriction on the name of the CPU, though it's common practice to call it PowerPC,<name>. For example, Apple uses PowerPC,G5 while IBM uses PowerPC,970FX. Required properties: - device_type : has to be "cpu" - reg : This is the physical cpu number, it's a single 32 bit cell and is also used as-is as the unit number for constructing the unit name in the full path. For example, with 2 CPUs, you would have the full path: /cpus/PowerPC,970FX@0 /cpus/PowerPC,970FX@1 (unit addresses do not require leading zeroes) - d-cache-line-size : one cell, L1 data cache line size in bytes - i-cache-line-size : one cell, L1 instruction cache line size in bytes - d-cache-size : one cell, size of L1 data cache in bytes - i-cache-size : one cell, size of L1 instruction cache in bytes - linux,boot-cpu : Should be defined if this cpu is the boot cpu. Recommended properties: - timebase-frequency : a cell indicating the frequency of the timebase in Hz. This is not directly used by the generic code, but you are welcome to copy/paste the pSeries code for setting the kernel timebase/decrementer calibration based on this value. - clock-frequency : a cell indicating the CPU core clock frequency in Hz. A new property will be defined for 64 bit values, but if your frequency is < 4GHz, one cell is enough. Here as well as for the above, the common code doesn't use that property, but you are welcome to re-use the pSeries or Maple one. A future kernel version might provide a common function for this. You are welcome to add any property you find relevant to your board, like some information about the mechanism used to soft-reset the CPUs. For example, Apple puts the GPIO number for CPU soft reset lines in there as a "soft-reset" property since they start secondary CPUs by soft-resetting them. d) the /memory node(s) To define the physical memory layout of your board, you should create one or more memory node(s).
You can either create a single node with all memory ranges in its reg property, or you can create several nodes, as you wish. The unit address (@ part) used for the full path is the address of the first range of memory defined by a given node. If you use a single memory node, this will typically be @0. Required properties: - device_type : has to be "memory" - reg : This property contains all the physical memory ranges of your board. It's a list of addresses/sizes concatenated together, with the number of cells of each defined by the #address-cells and #size-cells of the root node. For example, with both of these properties being 2 like in the example given earlier, a 970 based machine with 6GB of RAM could typically have a "reg" property here that looks like: 00000000 00000000 00000000 80000000 00000001 00000000 00000001 00000000 That is a range starting at 0 of 0x80000000 bytes and a range starting at 0x100000000 and of 0x100000000 bytes. You can see that there is no memory covering the IO hole between 2GB and 4GB. Some vendors prefer splitting those ranges into smaller segments, but the kernel doesn't care. e) The /chosen node This node is a bit "special". Normally, that's where open firmware puts some variable environment information, like the arguments, or phandle pointers to nodes like the main interrupt controller, or the default input/output devices. This specification makes a few of these mandatory, but also defines some linux-specific properties that would be normally constructed by the prom_init() trampoline when booting with an OF client interface, but that you have to provide yourself when using the flattened format. Required properties: - linux,platform : This is your platform number as assigned by the architecture maintainers Recommended properties: - bootargs : This zero-terminated string is passed as the kernel command line - linux,stdout-path : This is the full path to your standard console device if any.
Typically, if you have serial devices on your board, you may want to put the full path to the one set as the default console in the firmware here, for the kernel to pick it up as its own default console. If you look at the function set_preferred_console() in arch/ppc64/kernel/setup.c, you'll see that the kernel tries to find out the default console and has knowledge of various types like 8250 serial ports. You may want to extend this function to add your own. - interrupt-controller : This is one cell containing a phandle value that matches the "linux,phandle" property of your main interrupt controller node. May be used for interrupt routing. Note that u-boot creates and fills in the chosen node for platforms that use it. f) the /soc node This node is used to represent a system-on-a-chip (SOC) and must be present if the processor is a SOC. The top-level soc node contains information that is global to all devices on the SOC. The node name should contain a unit address for the SOC, which is the base address of the memory-mapped register set for the SOC. The name of an soc node should start with "soc", and the remainder of the name should represent the part number for the soc. For example, the MPC8540's soc node would be called "soc8540". Required properties: - device_type : Should be "soc" - ranges : Should be defined as specified in 1) to describe the translation of SOC addresses for memory mapped SOC registers. Recommended properties: - reg : This property defines the address and size of the memory-mapped registers that are used for the SOC node itself. It does not include the child device registers - these will be defined inside each child node. The address specified in the "reg" property should match the unit address of the SOC node. - #address-cells : Address representation for "soc" devices. The format of this field may vary depending on whether or not the device registers are memory mapped.
For memory mapped registers, this field represents the number of cells needed to represent the address of the registers. For SOCs that do not use MMIO, a special address format should be defined that contains enough cells to represent the required information. See 1) above for more details on defining #address-cells. - #size-cells : Size representation for "soc" devices - #interrupt-cells : Defines the width of cells used to represent interrupts. Typically this value is <2>, which includes a 32-bit number that represents the interrupt number, and a 32-bit number that represents the interrupt sense and level. This field is only needed if the SOC contains an interrupt controller. The SOC node may contain child nodes for each SOC device that the platform uses. Nodes should not be created for devices which exist on the SOC but are not used by a particular platform. See chapter VI for more information on how to specify devices that are part of an SOC. Example SOC node for the MPC8540: soc8540 at e0000000 { #address-cells = <1>; #size-cells = <1>; #interrupt-cells = <2>; device_type = "soc"; ranges = <00000000 e0000000 00100000> reg = ; } IV - "dtc", the device tree compiler ==================================== dtc source code can be found at WARNING: This version is still in early development stage; the resulting device-tree "blobs" have not yet been validated with the kernel. The current generated bloc lacks a useful reserve map (it will be fixed to generate an empty one, it's up to the bootloader to fill it up) among others. The error handling needs work, bugs are lurking, etc... dtc basically takes a device-tree in a given format and outputs a device-tree in another format. The currently supported formats are: Input formats: ------------- - "dtb": "blob" format, that is a flattened device-tree block with header all in a binary blob. - "dts": "source" format. This is a text file containing a "source" for a device-tree. The format is defined later in this chapter. 
- "fs" format. This is a representation equivalent to the output of /proc/device-tree, that is nodes are directories and properties are files Output formats: --------------- - "dtb": "blob" format - "dts": "source" format - "asm": assembly language file. This is a file that can be sourced by gas to generate a device-tree "blob". That file can then simply be added to your Makefile. Additionally, the assembly file exports some symbols that can be used. The syntax of the dtc tool is dtc [-I <input-format>] [-O <output-format>] [-o output-filename] [-V output_version] input_filename The "output_version" defines what version of the "blob" format will be generated. Supported versions are 1,2,3 and 16. The default is currently version 3 but that may change in the future to version 16. Additionally, dtc performs various sanity checks on the tree, like the uniqueness of linux,phandle properties, validity of strings, etc... The format of the .dts "source" file is "C" like, supports C and C++ style comments. / { } The above is the "device-tree" definition. It's the only statement supported currently at the toplevel. / { property1 = "string_value"; /* define a property containing a 0 * terminated string */ property2 = <1234abcd>; /* define a property containing a * numerical 32 bits value (hexadecimal) */ property3 = <12345678 12345678 deadbeef>; /* define a property containing 3 * numerical 32 bits values (cells) in * hexadecimal */ property4 = [0a 0b 0c 0d de ea ad be ef]; /* define a property whose content is * an arbitrary array of bytes */ childnode@address { /* define a child node named "childnode" * whose unit name is "childnode@address" */ childprop = "hello\n"; /* define a property "childprop" of * childnode (in this case, a string) */ }; }; Nodes can contain other nodes etc... thus defining the hierarchical structure of the tree. Strings support common escape sequences from C: "\n", "\t", "\r", "\(octal value)", "\x(hex value)".
It is also suggested that you pipe your source file through cpp (gcc preprocessor) so you can use #include's, #define for constants, etc... Finally, various options are planned but not yet implemented, like automatic generation of phandles, labels (exported to the asm file so you can point to a property content and change it easily from whatever you link the device-tree with), label or path instead of numeric value in some cells to "point" to a node (replaced by a phandle at compile time), export of reserve map address to the asm file, ability to specify reserve map content at compile time, etc... We may provide a .h include file with common definitions if that proves useful for some properties (like building PCI properties or interrupt maps) though it may be better to add a notion of struct definitions to the compiler... V - Recommendations for a bootloader ==================================== Here are some various ideas/recommendations that have been proposed while all this has been defined and implemented. - The bootloader may want to be able to use the device-tree itself and may want to manipulate it (to add/edit some properties, like physical memory size or kernel arguments). At this point, 2 choices can be made. Either the bootloader works directly on the flattened format, or the bootloader has its own internal tree representation with pointers (similar to the kernel one) and re-flattens the tree when booting the kernel. The former is a bit more difficult to edit/modify, the latter requires probably a bit more code to handle the tree structure. Note that the structure format has been designed so it's relatively easy to "insert" properties or nodes or delete them by just memmoving things around. It contains no internal offsets or pointers for this purpose.
- An example of code for iterating nodes & retrieving properties directly from the flattened tree format can be found in the kernel file arch/ppc64/kernel/prom.c, look at the scan_flat_dt() function, its usage in early_init_devtree(), and the corresponding various early_init_dt_scan_*() callbacks. That code can be re-used in a GPL bootloader, and as the author of that code, I would be happy to discuss possible free licencing to any vendor who wishes to integrate all or part of this code into a non-GPL bootloader. VI - System-on-a-chip devices and nodes ======================================= Many companies are now starting to develop system-on-a-chip processors, where the processor core (cpu) and many peripheral devices exist on a single piece of silicon. For these SOCs, an SOC node should be used that defines child nodes for the devices that make up the SOC. While platforms are not required to use this model in order to boot the kernel, it is highly encouraged that all SOC implementations define as complete a flat-device-tree as possible to describe the devices on the SOC. This will allow for the genericization of much of the kernel code. 1) Defining child nodes of an SOC --------------------------------- Each device that is part of an SOC may have its own node entry inside the SOC node. For each device that is included in the SOC, the unit address property represents the address offset for this device's memory-mapped registers in the parent's address space. The parent's address space is defined by the "ranges" property in the top-level soc node. The "reg" property for each node that exists directly under the SOC node should contain the address mapping from the child address space to the parent SOC address space and the size of the device's memory-mapped register file. For many devices that may exist inside an SOC, there are predefined specifications for the format of the device tree node.
All SOC child nodes should follow these specifications, except where noted in this document. See appendix A for an example partial SOC node definition for the MPC8540. 2) Specifying interrupt information for SOC devices --------------------------------------------------- Each device that is part of an SOC and which generates interrupts should have the following properties: - interrupt-parent : contains the phandle of the interrupt controller which handles interrupts for this device - interrupts : a list of tuples representing the interrupt number and the interrupt sense and level for each interrupt for this device. This information is used by the kernel to build the interrupt table for the interrupt controllers in the system. Sense and level information should be encoded as follows: Devices connected to openPIC-compatible controllers should encode sense and polarity as follows: 0 = high to low edge sensitive type enabled 1 = active low level sensitive type enabled 2 = low to high edge sensitive type enabled 3 = active high level sensitive type enabled ISA PIC interrupt controllers should adhere to the ISA PIC encodings listed below: 0 = active low level sensitive type enabled 1 = active high level sensitive type enabled 2 = high to low edge sensitive type enabled 3 = low to high edge sensitive type enabled 3) Representing devices without a current OF specification ---------------------------------------------------------- Currently, there are many devices on SOCs that do not have a standard representation pre-defined as part of the open firmware specifications, mainly because the boards that contain these SOCs are not currently booted using open firmware. This section contains descriptions for the SOC devices for which new nodes have been defined; this list will expand as more and more SOC-containing platforms are moved over to use the flattened-device-tree model. a) MDIO IO device The MDIO is a bus to which the PHY devices are connected.
For each device that exists on this bus, a child node should be created. See the definition of the PHY node below for an example of how to define a PHY. Required properties: - reg : Offset and length of the register set for the device Example: mdio at 24520 { reg = <24520 20>; ethernet-phy at 0 { ...... }; }; b) Gianfar-compatible ethernet nodes Required properties: - device_type : Should be "network" - model : Model of the device. Can be "TSEC" or "FEC" - compatible : Should be "gianfar" - reg : Offset and length of the register set for the device - address : List of bytes representing the ethernet address of this controller - interrupts : <a b> where a is the interrupt number and b is a field that represents an encoding of the sense and level information for the interrupt. This should be encoded based on the information in section 2) depending on the type of interrupt controller you have. - interrupt-parent : the phandle for the interrupt controller that services interrupts for this device. - phy-handle : The phandle for the PHY connected to this ethernet controller. Example: ethernet at 24000 { #size-cells = <0>; device_type = "network"; model = "TSEC"; compatible = "gianfar"; reg = <24000 1000>; address = [ 00 E0 0C 00 73 00 ]; interrupts = ; interrupt-parent = <40000>; phy-handle = <2452000> }; c) PHY nodes Required properties: - device_type : Should be "ethernet-phy" - interrupts : <a b> where a is the interrupt number and b is a field that represents an encoding of the sense and level information for the interrupt. This should be encoded based on the information in section 2) depending on the type of interrupt controller you have. - interrupt-parent : the phandle for the interrupt controller that services interrupts for this device. - reg : The ID number for the phy, usually a small integer - linux,phandle : phandle for this node; likely referenced by an ethernet controller node.
Example: ethernet-phy at 0 { linux,phandle = <2452000> interrupt-parent = <40000>; interrupts = <35 1>; reg = <0>; device_type = "ethernet-phy"; }; d) Interrupt controllers Some SOC devices contain interrupt controllers that are different from the standard Open PIC specification. The SOC device nodes for these types of controllers should be specified just like a standard OpenPIC controller. Sense and level information should be encoded as specified in section 2) of this chapter for each device that specifies an interrupt. Example : pic at 40000 { linux,phandle = <40000>; clock-frequency = <0>; interrupt-controller; #address-cells = <0>; reg = <40000 40000>; built-in; compatible = "chrp,open-pic"; device_type = "open-pic"; big-endian; }; e) I2C Required properties : - device_type : Should be "i2c" - reg : Offset and length of the register set for the device Recommended properties : - compatible : Should be "fsl-i2c" for parts compatible with Freescale I2C specifications. - interrupts : <a b> where a is the interrupt number and b is a field that represents an encoding of the sense and level information for the interrupt. This should be encoded based on the information in section 2) depending on the type of interrupt controller you have. - interrupt-parent : the phandle for the interrupt controller that services interrupts for this device. - dfsrr : boolean; if defined, indicates that this I2C device has a digital filter sampling rate register - fsl5200-clocking : boolean; if defined, indicates that this device uses the FSL 5200 clocking mechanism. Example : i2c at 3000 { interrupt-parent = <40000>; interrupts = <1b 3>; reg = <3000 18>; device_type = "i2c"; compatible = "fsl-i2c"; dfsrr; }; More devices will be defined as this spec matures.
Appendix A - Sample SOC node for MPC8540 ======================================== Note that the #address-cells and #size-cells for the SoC node in this example have been explicitly listed; these are likely not necessary as they are usually the same as the root node. soc8540 at e0000000 { #address-cells = <1>; #size-cells = <1>; #interrupt-cells = <2>; device_type = "soc"; ranges = <00000000 e0000000 00100000> reg = ; mdio at 24520 { reg = <24520 20>; ethernet-phy at 0 { linux,phandle = <2452000> interrupt-parent = <40000>; interrupts = <35 1>; reg = <0>; device_type = "ethernet-phy"; }; ethernet-phy at 1 { linux,phandle = <2452001> interrupt-parent = <40000>; interrupts = <35 1>; reg = <1>; device_type = "ethernet-phy"; }; ethernet-phy at 3 { linux,phandle = <2452002> interrupt-parent = <40000>; interrupts = <35 1>; reg = <3>; device_type = "ethernet-phy"; }; }; ethernet at 24000 { #size-cells = <0>; device_type = "network"; model = "TSEC"; compatible = "gianfar"; reg = <24000 1000>; address = [ 00 E0 0C 00 73 00 ]; interrupts = ; interrupt-parent = <40000>; phy-handle = <2452000>; }; ethernet at 25000 { #address-cells = <1>; #size-cells = <0>; device_type = "network"; model = "TSEC"; compatible = "gianfar"; reg = <25000 1000>; address = [ 00 E0 0C 00 73 01 ]; interrupts = <13 3 14 3 18 3>; interrupt-parent = <40000>; phy-handle = <2452001>; }; ethernet at 26000 { #address-cells = <1>; #size-cells = <0>; device_type = "network"; model = "FEC"; compatible = "gianfar"; reg = <26000 1000>; address = [ 00 E0 0C 00 73 02 ]; interrupts = <19 3>; interrupt-parent = <40000>; phy-handle = <2452002>; }; serial at 4500 { device_type = "serial"; compatible = "ns16550"; reg = <4500 100>; clock-frequency = <0>; interrupts = <1a 3>; interrupt-parent = <40000>; }; pic at 40000 { linux,phandle = <40000>; clock-frequency = <0>; interrupt-controller; #address-cells = <0>; reg = <40000 40000>; built-in; compatible = "chrp,open-pic"; device_type = "open-pic"; big-endian; }; i2c at 3000 
{ interrupt-parent = <40000>; interrupts = <1b 3>; reg = <3000 18>; device_type = "i2c"; compatible = "fsl-i2c"; dfsrr; }; }; From michael at ellerman.id.au Tue Dec 6 08:38:57 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Mon, 5 Dec 2005 15:38:57 -0600 Subject: [PATCH 7/11] powerpc: Fixups for kernel linked at 32 MB In-Reply-To: <20051205003954.6E56168802@ozlabs.org> References: <20051205003954.6E56168802@ozlabs.org> Message-ID: <200512051539.01254.michael@ellerman.id.au> On Sun, 4 Dec 2005 12:39, Michael Ellerman wrote: > +#ifdef CONFIG_CRASH_DUMP > +#define LOAD_HANDLER(reg, label) \ > + oris r12,r12,(label)@h; /* virt addr of handler ... */ \ > + ori r12,r12,(label)@l; /* .. and the rest */ > +#else > +#define LOAD_HANDLER(reg, label) \ > + ori r12,r12,(label)@l; /* virt addr of handler ... */ > +#endif Milton just spotted this buglet, we don't actually use reg, oops :} New patch on the way. -- Michael Ellerman IBM OzLabs email: michael:ellerman.id.au inmsg: mpe:jabber.org wwweb: http://michael.ellerman.id.au phone: +61 2 6212 1183 (tie line 70 21183) We do not inherit the earth from our ancestors, we borrow it from our children. - S.M.A.R.T Person -------------- next part -------------- A non-text attachment was scrubbed... Name: not available Type: application/pgp-signature Size: 189 bytes Desc: not available Url : http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051205/4b52804d/attachment.pgp From michael at ellerman.id.au Tue Dec 6 08:49:00 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Mon, 05 Dec 2005 15:49:00 -0600 Subject: [PATCH] powerpc: Fixups for kernel linked at 32 MB In-Reply-To: <20051205003954.6E56168802@ozlabs.org> Message-ID: <20051205214914.9BEE16887C@ozlabs.org> There's a few places where we need to fix things up for the kernel to work if it's linked at 32MB: - platforms/powermac/smp.c To start secondary cpus on pmac we patch the reset vector, which is fine. 
Except if we're above 32MB we don't have enough bits for an absolute branch, it needs to be relative. - kernel/head_64.s - A few branches in the cpu hold code need to load the full target address and do a bctr. - after_prom_start needs to load PHYSICAL_START as the dest address, not 0. - The exception prolog needs to load the low word of the target address, not just the low halfword. - Fixup handling of the initial stab address. - kernel/setup_64.c smp_release_cpus() needs to write 1 to the spinloop flag near 0, not 32 MB. Signed-off-by: Michael Ellerman arch/powerpc/kernel/head_64.S | 30 ++++++++++++++++++++++++------ arch/powerpc/kernel/setup_64.c | 5 ++++- arch/powerpc/platforms/powermac/smp.c | 16 +++++++--------- include/asm-powerpc/mmu.h | 3 ++- 4 files changed, 37 insertions(+), 17 deletions(-) Index: kexec/arch/powerpc/platforms/powermac/smp.c =================================================================== --- kexec.orig/arch/powerpc/platforms/powermac/smp.c +++ kexec/arch/powerpc/platforms/powermac/smp.c @@ -753,14 +753,15 @@ static int __init smp_core99_probe(void) static void __devinit smp_core99_kick_cpu(int nr) { unsigned int save_vector; - unsigned long new_vector; - unsigned long flags; + unsigned long target, flags; volatile unsigned int *vector = ((volatile unsigned int *)(KERNELBASE+0x100)); if (nr < 0 || nr > 3) return; - if (ppc_md.progress) ppc_md.progress("smp_core99_kick_cpu", 0x346); + + if (ppc_md.progress) + ppc_md.progress("smp_core99_kick_cpu", 0x346); local_irq_save(flags); local_irq_disable(); @@ -768,14 +769,11 @@ static void __devinit smp_core99_kick_cp /* Save reset vector */ save_vector = *vector; - /* Setup fake reset vector that does + /* Setup fake reset vector that does * b __secondary_start_pmac_0 + nr*8 - KERNELBASE */ - new_vector = (unsigned long) __secondary_start_pmac_0 + nr * 8; - *vector = 0x48000002 + new_vector - KERNELBASE; - - /* flush data cache and inval instruction cache */ - flush_icache_range((unsigned long)
vector, (unsigned long) vector + 4); + target = (unsigned long) __secondary_start_pmac_0 + nr * 8; + create_branch((unsigned long)vector, target, BRANCH_SET_LINK); /* Put some life in our friend */ pmac_call_feature(PMAC_FTR_RESET_CPU, NULL, nr, 0); Index: kexec/arch/powerpc/kernel/head_64.S =================================================================== --- kexec.orig/arch/powerpc/kernel/head_64.S +++ kexec/arch/powerpc/kernel/head_64.S @@ -154,11 +154,15 @@ _GLOBAL(__secondary_hold) bne 100b #ifdef CONFIG_HMT - b .hmt_init + LOADADDR(r4, .hmt_init) + mtctr r4 + bctr #else #ifdef CONFIG_SMP + LOADADDR(r4, .pSeries_secondary_smp_init) + mtctr r4 mr r3,r24 - b .pSeries_secondary_smp_init + bctr #else BUG_OPCODE #endif @@ -200,6 +204,20 @@ exception_marker: #define EX_R3 64 #define EX_LR 72 +/* + * We're short on space and time in the exception prolog, so we can't use + * the normal LOADADDR macro. Normally we just need the low halfword of the + * address, but for Kdump we need the whole low word. + */ +#ifdef CONFIG_CRASH_DUMP +#define LOAD_HANDLER(reg, label) \ + oris reg,reg,(label)@h; /* virt addr of handler ... */ \ + ori reg,reg,(label)@l; /* .. and the rest */ +#else +#define LOAD_HANDLER(reg, label) \ + ori reg,reg,(label)@l; /* virt addr of handler ... */ +#endif + #define EXCEPTION_PROLOG_PSERIES(area, label) \ mfspr r13,SPRN_SPRG3; /* get paca address into r13 */ \ std r9,area+EX_R9(r13); /* save r9 - r12 */ \ @@ -212,7 +230,7 @@ exception_marker: clrrdi r12,r13,32; /* get high part of &label */ \ mfmsr r10; \ mfspr r11,SPRN_SRR0; /* save SRR0 */ \ - ori r12,r12,(label)@l; /* virt addr of handler */ \ + LOAD_HANDLER(r12,label) \ ori r10,r10,MSR_IR|MSR_DR|MSR_RI; \ mtspr SPRN_SRR0,r12; \ mfspr r12,SPRN_SRR1; /* and SRR1 */ \ @@ -1348,7 +1366,7 @@ _GLOBAL(do_stab_bolted) * fixed address (the linker can't compute (u64)&initial_stab >> * PAGE_SHIFT). */ - . = STAB0_PHYS_ADDR /* 0x6000 */ + . 
= STAB0_OFFSET /* 0x6000 */ .globl initial_stab initial_stab: .space 4096 @@ -1553,7 +1571,7 @@ _STATIC(__boot_from_prom) _STATIC(__after_prom_start) /* - * We need to run with __start at physical address 0. + * We need to run with __start at physical address PHYSICAL_START. * This will leave some code in the first 256B of * real memory, which are reserved for software use. * The remainder of the first page is loaded with the fixed @@ -1568,7 +1586,7 @@ _STATIC(__after_prom_start) mr r26,r3 SET_REG_TO_CONST(r27,KERNELBASE) - li r3,0 /* target addr */ + LOADADDR(r3, PHYSICAL_START) /* target addr */ // XXX FIXME: Use phys returned by OF (r30) add r4,r27,r26 /* source addr */ Index: kexec/arch/powerpc/kernel/setup_64.c =================================================================== --- kexec.orig/arch/powerpc/kernel/setup_64.c +++ kexec/arch/powerpc/kernel/setup_64.c @@ -314,6 +314,7 @@ void early_setup_secondary(void) void smp_release_cpus(void) { extern unsigned long __secondary_hold_spinloop; + unsigned long *ptr; DBG(" -> smp_release_cpus()\n"); @@ -324,7 +325,9 @@ void smp_release_cpus(void) * This is useless but harmless on iSeries, secondaries are already * waiting on their paca spinloops. 
*/ - __secondary_hold_spinloop = 1; + ptr = (unsigned long *)((unsigned long)&__secondary_hold_spinloop + - PHYSICAL_START); + *ptr = 1; mb(); DBG(" <- smp_release_cpus()\n"); Index: kexec/include/asm-powerpc/mmu.h =================================================================== --- kexec.orig/include/asm-powerpc/mmu.h +++ kexec/include/asm-powerpc/mmu.h @@ -33,7 +33,8 @@ /* Location of cpu0's segment table */ #define STAB0_PAGE 0x6 -#define STAB0_PHYS_ADDR (STAB0_PAGE<<12) +#define STAB0_OFFSET (STAB0_PAGE << 12) +#define STAB0_PHYS_ADDR (STAB0_OFFSET + PHYSICAL_START) #ifndef __ASSEMBLY__ extern char initial_stab[]; From arnd at arndb.de Tue Dec 6 14:52:22 2005 From: arnd at arndb.de (Arnd Bergmann) Date: Mon, 05 Dec 2005 22:52:22 -0500 Subject: [PATCH 02/14] spufs: fix local store page refcounting References: <20051206035220.097737000@localhost> Message-ID: <20051206040643.452349000@localhost> An embedded and charset-unspecified text was scrubbed... Name: spufs-page-refcnt.diff Url: http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051205/7a8743a5/attachment.txt From arnd at arndb.de Tue Dec 6 14:52:21 2005 From: arnd at arndb.de (Arnd Bergmann) Date: Mon, 05 Dec 2005 22:52:21 -0500 Subject: [PATCH 01/14] spufs: Make all exports GPL-only References: <20051206035220.097737000@localhost> Message-ID: <20051206040643.328108000@localhost> An embedded and charset-unspecified text was scrubbed... Name: spufs-export-symbol-gpl.diff Url: http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051205/a085066d/attachment.txt From arnd at arndb.de Tue Dec 6 14:52:20 2005 From: arnd at arndb.de (Arnd Bergmann) Date: Mon, 05 Dec 2005 22:52:20 -0500 Subject: [PATCH 00/14] Cell updates for powerpc.git Message-ID: <20051206035220.097737000@localhost> This is my current set of updates related to the cell platforms. 
It includes some spufs updates, most importantly preemption support for SPUs from Mark Nutter, some platform code updates and a few bug fixes for the spidernet device driver. Paul, please apply to the powerpc.git tree, all patches are based on todays checkout. Arnd <>< -- From arnd at arndb.de Tue Dec 6 14:52:23 2005 From: arnd at arndb.de (Arnd Bergmann) Date: Mon, 05 Dec 2005 22:52:23 -0500 Subject: [PATCH 03/14] spufs: Fix oops when spufs module is not loaded References: <20051206035220.097737000@localhost> Message-ID: <20051206040643.620312000@localhost> An embedded and charset-unspecified text was scrubbed... Name: spufs-syscall-oops.diff Url: http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051205/36b2892e/attachment.txt From arnd at arndb.de Tue Dec 6 14:52:24 2005 From: arnd at arndb.de (Arnd Bergmann) Date: Mon, 05 Dec 2005 22:52:24 -0500 Subject: [PATCH 04/14] spufs: Turn off debugging output References: <20051206035220.097737000@localhost> Message-ID: <20051206040643.792016000@localhost> An embedded and charset-unspecified text was scrubbed... Name: spufs-no-debug.diff Url: http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051205/c8555f2a/attachment.txt From arnd at arndb.de Tue Dec 6 14:52:27 2005 From: arnd at arndb.de (Arnd Bergmann) Date: Mon, 05 Dec 2005 22:52:27 -0500 Subject: [PATCH 07/14] spufs: fix mailbox polling References: <20051206035220.097737000@localhost> Message-ID: <20051206040644.322664000@localhost> An embedded and charset-unspecified text was scrubbed... Name: spufs-mbox-intr.diff Url: http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051205/4e4f049c/attachment.txt From arnd at arndb.de Tue Dec 6 14:52:26 2005 From: arnd at arndb.de (Arnd Bergmann) Date: Mon, 05 Dec 2005 22:52:26 -0500 Subject: [PATCH 06/14] spufs: Improved SPU preemptability [part 2]. References: <20051206035220.097737000@localhost> Message-ID: <20051206040644.145607000@localhost> An embedded and charset-unspecified text was scrubbed... 
Name: spu-preempt-3.diff Url: http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051205/7701d47f/attachment.txt From arnd at arndb.de Tue Dec 6 14:52:33 2005 From: arnd at arndb.de (Arnd Bergmann) Date: Mon, 05 Dec 2005 22:52:33 -0500 Subject: [PATCH 13/14] spidernet: read firmware from the OF device tree References: <20051206035220.097737000@localhost> Message-ID: <20051206040645.368270000@localhost> An embedded and charset-unspecified text was scrubbed... Name: spidernet-fw-from-dt-2.diff Url: http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051205/fd9ba499/attachment.txt From arnd at arndb.de Tue Dec 6 14:52:30 2005 From: arnd at arndb.de (Arnd Bergmann) Date: Mon, 05 Dec 2005 22:52:30 -0500 Subject: [PATCH 10/14] cell: add iommu support for larger memory References: <20051206035220.097737000@localhost> Message-ID: <20051206040644.841367000@localhost> An embedded and charset-unspecified text was scrubbed... Name: iommu-new-firmware-6.diff Url: http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051205/4865f86a/attachment.txt From arnd at arndb.de Tue Dec 6 14:52:28 2005 From: arnd at arndb.de (Arnd Bergmann) Date: Mon, 05 Dec 2005 22:52:28 -0500 Subject: [PATCH 08/14] cell: enable pause(0) in cpu_idle References: <20051206035220.097737000@localhost> Message-ID: <20051206040644.495500000@localhost> An embedded and charset-unspecified text was scrubbed... Name: bpa-pmd-add-2.diff Url: http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051205/69d8875e/attachment.txt From arnd at arndb.de Tue Dec 6 14:52:29 2005 From: arnd at arndb.de (Arnd Bergmann) Date: Mon, 05 Dec 2005 22:52:29 -0500 Subject: [PATCH 09/14] cell: add platform detection code References: <20051206035220.097737000@localhost> Message-ID: <20051206040644.665121000@localhost> An embedded and charset-unspecified text was scrubbed... 
Name: cell-platform-detect.diff Url: http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051205/1328ae8f/attachment.txt From arnd at arndb.de Tue Dec 6 14:52:32 2005 From: arnd at arndb.de (Arnd Bergmann) Date: Mon, 05 Dec 2005 22:52:32 -0500 Subject: [PATCH 12/14] spidernet: check if firmware was loaded correctly References: <20051206035220.097737000@localhost> Message-ID: <20051206040645.193163000@localhost> An embedded and charset-unspecified text was scrubbed... Name: spidernet-programcheck.diff Url: http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051205/f2494609/attachment.txt From arnd at arndb.de Tue Dec 6 14:52:25 2005 From: arnd at arndb.de (Arnd Bergmann) Date: Mon, 05 Dec 2005 22:52:25 -0500 Subject: [PATCH 05/14] spufs: Improved SPU preemptability. References: <20051206035220.097737000@localhost> Message-ID: <20051206040644.014463000@localhost> An embedded and charset-unspecified text was scrubbed... Name: spu-preempt-1.diff Url: http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051205/01b2ace2/attachment.txt From arnd at arndb.de Tue Dec 6 14:52:31 2005 From: arnd at arndb.de (Arnd Bergmann) Date: Mon, 05 Dec 2005 22:52:31 -0500 Subject: [PATCH 11/14] spidernet: fix Kconfig after BPA->CELL rename References: <20051206035220.097737000@localhost> Message-ID: <20051206040645.065973000@localhost> An embedded and charset-unspecified text was scrubbed... Name: spidernet-with-pci-and-cell.diff Url: http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051205/f9fb4bf1/attachment.txt From arnd at arndb.de Tue Dec 6 14:52:34 2005 From: arnd at arndb.de (Arnd Bergmann) Date: Mon, 05 Dec 2005 22:52:34 -0500 Subject: [PATCH 14/14] spidernet: fix HW structures for 64 bit dma_addr_t References: <20051206035220.097737000@localhost> Message-ID: <20051206040645.538783000@localhost> An embedded and charset-unspecified text was scrubbed... 
Name: spidernet-dma_addr_t-fix-2.diff Url: http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051205/34f8d7a2/attachment.txt From rsa at us.ibm.com Mon Dec 5 08:12:26 2005 From: rsa at us.ibm.com (Ryan S. Arnold) Date: Sun, 04 Dec 2005 15:12:26 -0600 Subject: [RFC PATCH 4/5] CELL bogus_console port to hvc_console backend driver Message-ID: <43935BBA.8090009@us.ibm.com> This patch sets the preffered_console when running on the simulator. Signed-off-by: Ryan S. Arnold -------------- next part -------------- A non-text attachment was scrubbed... Name: hvc_fss.4.patch Type: text/x-patch Size: 1008 bytes Desc: not available Url : http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051204/b504f2d3/attachment.bin From rsa at us.ibm.com Mon Dec 5 08:12:31 2005 From: rsa at us.ibm.com (Ryan S. Arnold) Date: Sun, 04 Dec 2005 15:12:31 -0600 Subject: [RFC PATCH 5/5] CELL bogus_console port to hvc_console backend driver Message-ID: <43935BBF.6080005@us.ibm.com> This patch adds the hvc_fss.c driver file. Signed-off-by: Ryan S. Arnold -------------- next part -------------- A non-text attachment was scrubbed... Name: hvc_fss.5.patch Type: text/x-patch Size: 4623 bytes Desc: not available Url : http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051204/24ea2a00/attachment.bin From rsa at us.ibm.com Mon Dec 5 08:12:21 2005 From: rsa at us.ibm.com (Ryan S. Arnold) Date: Sun, 04 Dec 2005 15:12:21 -0600 Subject: [RFC PATCH 3/5] CELL bogus_console port to hvc_console backend driver Message-ID: <43935BB5.9030302@us.ibm.com> This patch modifies the defconfig file for the CELL simulator and changes the Makefile and Kconfig to add hvc_fss. Signed-off-by: Ryan S. Arnold -------------- next part -------------- A non-text attachment was scrubbed... 
Name: hvc_fss.3.patch Type: text/x-patch Size: 3072 bytes Desc: not available Url : http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051204/67c3f007/attachment.bin From paulus at samba.org Tue Dec 6 11:59:11 2005 From: paulus at samba.org (Paul Mackerras) Date: Tue, 6 Dec 2005 11:59:11 +1100 Subject: [PATCH 12/14] spidernet: check if firmware was loaded correctly In-Reply-To: <20051206040645.193163000@localhost> References: <20051206035220.097737000@localhost> <20051206040645.193163000@localhost> Message-ID: <17300.57951.373636.507621@cargo.ozlabs.ibm.com> Arnd Bergmann writes: > Uploading the device firmware may fail if wrong input data > was provided by the user. This checks for the condition. > > From: Jens.Osterkamp at de.ibm.com > Cc: netdev at vger.kernel.org This one should be sent to Jeff Garzik, along with patches 11, 13 and 14. Paul. From paulus at samba.org Tue Dec 6 11:51:39 2005 From: paulus at samba.org (Paul Mackerras) Date: Tue, 6 Dec 2005 11:51:39 +1100 Subject: [PATCH 02/14] spufs: fix local store page refcounting In-Reply-To: <20051206040643.452349000@localhost> References: <20051206035220.097737000@localhost> <20051206040643.452349000@localhost> Message-ID: <17300.57499.400458.387421@cargo.ozlabs.ibm.com> Arnd Bergmann writes: > Index: linux-2.6.15-rc/arch/powerpc/platforms/cell/spufs/file.c Remind me again why spufs is under arch/powerpc/ rather than fs/ ? Regards, Paul. From rsa at us.ibm.com Mon Dec 5 08:11:56 2005 From: rsa at us.ibm.com (Ryan S. Arnold) Date: Sun, 04 Dec 2005 15:11:56 -0600 Subject: [RFC PATCH 0/5] CELL bogus_console port to hvc_console backend driver Message-ID: <43935B9C.5020503@us.ibm.com> The following patch-set was created against the 2.6.14-rc5 CBE (cell broadband environment) patch-set provided by Arnd Bergman. The purpose of this patch-set is to port the CELL IBM Full System Simulator bogus_console.c driver to an hvc_console back-end driver, namely hvc_fss.c. 
Our intention is to support binary-compatibility of all hvc_console back-end drivers such that all drivers can be built into the kernel at configuration time but only one back-end (the one that detects that it is running on the right hardware) actually registers the front-end driver at console init. This is a request-for-comments. Please contribute any suggestions especially in-regards-to the Makefile and Kconfig changes. I'm not very experienced with them. I do realize that the current mainline kernel has some significant differences in the hvc_console driver versus the 2.6.14-rc5 kernel. We'll address this when the cell patches go upstream. I've tested these patches on the CELL IBM Full System Simulator and the console works fine. I've not had a chance to test the hvc_vio back-end on ppc64 hardware but I'll do this once I have a chance. I suspect that there will be some udev magic required to get the /dev/hvc0 device to appear on the CELL simulator since there isn't an actual serial device. Using this driver one can actually execute agetty on the console as well. Thanks, Ryan S. Arnold IBM Linux Technology Center From linas at austin.ibm.com Tue Dec 6 12:37:35 2005 From: linas at austin.ibm.com (linas) Date: Mon, 5 Dec 2005 19:37:35 -0600 Subject: [PATCH] powerpc: minor cleanup of void ptr deref Message-ID: <20051206013735.GJ31651@austin.ibm.com> Paul, Please apply. --linas Minor: use macro to perform void pointer deref; this may someday help avoid pointer typecasting errors. 
Signed-off-by: Linas Vepstas -- Index: linux-2.6.15-rc3-mm1/arch/powerpc/platforms/powermac/pci.c =================================================================== --- linux-2.6.15-rc3-mm1.orig/arch/powerpc/platforms/powermac/pci.c 2005-12-01 15:14:41.000000000 -0600 +++ linux-2.6.15-rc3-mm1/arch/powerpc/platforms/powermac/pci.c 2005-12-05 13:52:03.207941067 -0600 @@ -326,7 +326,7 @@ else busdn = hose->arch_data; for (dn = busdn->child; dn; dn = dn->sibling) - if (dn->data && PCI_DN(dn)->devfn == devfn) + if (PCI_DN(dn) && PCI_DN(dn)->devfn == devfn) break; if (dn == NULL) return -1; Index: linux-2.6.15-rc3-mm1/arch/powerpc/platforms/pseries/iommu.c =================================================================== --- linux-2.6.15-rc3-mm1.orig/arch/powerpc/platforms/pseries/iommu.c 2005-12-01 15:14:41.000000000 -0600 +++ linux-2.6.15-rc3-mm1/arch/powerpc/platforms/pseries/iommu.c 2005-12-05 13:52:03.207941067 -0600 @@ -433,7 +433,7 @@ return; } - ppci = pdn->data; + ppci = PCI_DN(pdn); if (!ppci->iommu_table) { /* Bussubno hasn't been copied yet. * Do it now because iommu_table_setparms_lpar needs it. @@ -480,10 +480,10 @@ * an already allocated iommu table is found and use that. 
*/ - while (dn && dn->data && PCI_DN(dn)->iommu_table == NULL) + while (dn && PCI_DN(dn) && PCI_DN(dn)->iommu_table == NULL) dn = dn->parent; - if (dn && dn->data) { + if (dn && PCI_DN(dn)) { PCI_DN(mydn)->iommu_table = PCI_DN(dn)->iommu_table; } else { DBG("iommu_dev_setup_pSeries, dev %p (%s) has no iommu table\n", dev, pci_name(dev)); @@ -494,7 +494,7 @@ { int err = NOTIFY_OK; struct device_node *np = node; - struct pci_dn *pci = np->data; + struct pci_dn *pci = PCI_DN(np); switch (action) { case PSERIES_RECONFIG_REMOVE: @@ -530,7 +530,7 @@ */ dn = pci_device_to_OF_node(dev); - for (pdn = dn; pdn && pdn->data && !PCI_DN(pdn)->iommu_table; + for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table; pdn = pdn->parent) { dma_window = (unsigned int *) get_property(pdn, "ibm,dma-window", NULL); @@ -549,7 +549,7 @@ DBG("Found DMA window, allocating table\n"); } - pci = pdn->data; + pci = PCI_DN(pdn); if (!pci->iommu_table) { /* iommu_table_setparms_lpar needs bussubno. */ pci->bussubno = pci->phb->bus->number; From hollis at penguinppc.org Tue Dec 6 12:59:03 2005 From: hollis at penguinppc.org (Hollis Blanchard) Date: Mon, 5 Dec 2005 19:59:03 -0600 Subject: Linuxppc64-dev Digest, Vol 16, Issue 11 In-Reply-To: <9b23fc710512050035i117c7bd7y75a01f487dc74654@mail.gmail.com> References: <20051205010004.52D1568876@ozlabs.org> <9b23fc710512050035i117c7bd7y75a01f487dc74654@mail.gmail.com> Message-ID: <84a13e07288333aff98dfd0c36591b00@penguinppc.org> On Dec 5, 2005, at 2:35 AM, Renuka Pampana wrote: > > Where can i get PPC440ep (yosemite) patch for 64 bit kernel. Can you > give me some pointers to refer. Please do not quote an entire email digest, especially since your post had nothing to do with that at all. The 440EP is not a 64-bit processor, so you will never find support for it in the 64-bit kernel. Instead please see http://penguinppc.org/embedded/ and the resources it points you to. 
-Hollis From arnd at arndb.de Tue Dec 6 21:18:17 2005 From: arnd at arndb.de (Arnd Bergmann) Date: Tue, 6 Dec 2005 11:18:17 +0100 Subject: [PATCH 02/14] spufs: fix local store page refcounting In-Reply-To: <17300.57499.400458.387421@cargo.ozlabs.ibm.com> References: <20051206035220.097737000@localhost> <20051206040643.452349000@localhost> <17300.57499.400458.387421@cargo.ozlabs.ibm.com> Message-ID: <200512061118.19633.arnd@arndb.de> On Dinsdag 06 Dezember 2005 01:51, Paul Mackerras wrote: > Arnd Bergmann writes: > > > Index: linux-2.6.15-rc/arch/powerpc/platforms/cell/spufs/file.c > > Remind me again why spufs is under arch/powerpc/ rather than fs/ ? We had a discussion about this in August, after the patch at http://patchwork.ozlabs.org/linuxppc64/patch?id=2140 Nobody had voiced any objections against the arch/powerpc location, and Pekka had good reasons against fs/, so I changed it. Arnd <>< From arnd at arndb.de Tue Dec 6 21:23:39 2005 From: arnd at arndb.de (Arnd Bergmann) Date: Tue, 6 Dec 2005 11:23:39 +0100 Subject: [PATCH 12/14] spidernet: check if firmware was loaded correctly In-Reply-To: <17300.57951.373636.507621@cargo.ozlabs.ibm.com> References: <20051206035220.097737000@localhost> <20051206040645.193163000@localhost> <17300.57951.373636.507621@cargo.ozlabs.ibm.com> Message-ID: <200512061123.40059.arnd@arndb.de> On Dinsdag 06 Dezember 2005 01:59, Paul Mackerras wrote: > Arnd Bergmann writes: > > > Uploading the device firmware may fail if wrong input data > > was provided by the user. This checks for the condition. > > > > From: Jens.Osterkamp at de.ibm.com > > Cc: netdev at vger.kernel.org > > This one should be sent to Jeff Garzik, along with patches 11, 13 and > 14. Ok. Jens, is it ok for you if you send the network driver stuff to jgarzik at pobox.com, Cc: netdev at vger.kernel.org yourself in the future? 
Arnd <>< From dwmw2 at redhat.com Tue Dec 6 22:31:17 2005 From: dwmw2 at redhat.com (David Woodhouse) Date: Tue, 06 Dec 2005 11:31:17 +0000 Subject: [RFC PATCH 5/5] CELL bogus_console port to hvc_console backend driver In-Reply-To: References: Message-ID: <1133868677.4136.80.camel@baythorne.infradead.org> On Mon, 2005-12-05 at 11:17 -0600, Eric V Van hensbergen wrote: > I suppose if there is sufficient pull I could push them at any time -- I > haven't gone down this path because I'm not sure how I feel including > simulator drivers in the mainline kernel tree. If the linuxppc64 folks > think this is valuable, I'd be happy to clean-up the drivers a bit more > and submit a patch. They're fairly self-contained and harmless -- I think it would be good to merge them. The bogusdisk driver wants a proper major number assigned, and it wants to look in the device tree for /mambo/bogusdisc nodes (which we can trivially hack the sim's tcl script to provide). -- dwmw2 From penberg at cs.helsinki.fi Tue Dec 6 22:38:28 2005 From: penberg at cs.helsinki.fi (Pekka Enberg) Date: Tue, 06 Dec 2005 13:38:28 +0200 Subject: [PATCH 02/14] spufs: fix local store page refcounting In-Reply-To: <200512061118.19633.arnd@arndb.de> References: <20051206035220.097737000@localhost> <20051206040643.452349000@localhost> <17300.57499.400458.387421@cargo.ozlabs.ibm.com> <200512061118.19633.arnd@arndb.de> Message-ID: <1133869108.7968.1.camel@localhost> Hi, On Dinsdag 06 Dezember 2005 01:51, Paul Mackerras wrote: > > Remind me again why spufs is under arch/powerpc/ rather than fs/ ? On Tue, 2005-12-06 at 11:18 +0100, Arnd Bergmann wrote: > We had a discussion about this in August, after the patch > at http://patchwork.ozlabs.org/linuxppc64/patch?id=2140 > > Nobody had voiced any objections against the arch/powerpc location, > and Pekka had good reasons against fs/, so I changed it. It had arch specific hooks which IMHO do not belong into fs/. 
Pekka From ntl at pobox.com Wed Dec 7 02:45:58 2005 From: ntl at pobox.com (Nathan Lynch) Date: Tue, 6 Dec 2005 10:45:58 -0500 Subject: [PATCH] reworked: numa placement for dynamically added memory In-Reply-To: <20051205200642.GA20613@w-mikek2.ibm.com> References: <20051205200642.GA20613@w-mikek2.ibm.com> Message-ID: <20051206154557.GA8901@localhost.localdomain> Mike Kravetz wrote: > Here is a reworked version of the patch with changes suggested by > Nathan. Again, this patch depends on: > http://ozlabs.org/pipermail/linuxppc64-dev/2005-December/006923.html > > This patch places dynamically added memory within the appropriate > numa node. A new routine hot_add_scn_to_nid() replicates most of > the memory scanning code in parse_numa_properties(). Changes look good to me, thanks. Nathan From miltonm at bga.com Wed Dec 7 03:40:00 2005 From: miltonm at bga.com (Milton Miller) Date: Tue, 6 Dec 2005 10:40:00 -0600 Subject: Booting OS on PowerPC Message-ID: On Thu Dec 1 19:48:11 EST 2005, veera venkata prasad j wrote: > Can any body tell me how Linux boot on PowerPC machine > when Open Firmware is up. To be more preciese, what is > the "known-environment" that the OS expect from Open > Firmware. It expects to be (1) in 32-bit mode, (2) r3, r4 0, (3) r5 is a pointer to the client interface callback, (4) r1 is a usable stack, (5) image is loaded according to elf-header of zImage wrapper. More information can be found at penguinppc.org, the CHRP (Common Hardware Reference Platform, outdated), relevant Openfirmware specifications, and by reading prom.c in the source. 
If you are looking to skip openfirmware, see the draft document in the current thread on this mailing list RFC: Rev 0.5 Booting the Linux/ppc kernel without Open Firmware milton From arnd at arndb.de Wed Dec 7 05:49:30 2005 From: arnd at arndb.de (Arnd Bergmann) Date: Tue, 6 Dec 2005 19:49:30 +0100 Subject: [PATCH 02/14] spufs: fix local store page refcounting In-Reply-To: <1133869108.7968.1.camel@localhost> References: <20051206035220.097737000@localhost> <200512061118.19633.arnd@arndb.de> <1133869108.7968.1.camel@localhost> Message-ID: <200512061949.33482.arnd@arndb.de> On Dinsdag 06 Dezember 2005 12:38, Pekka Enberg wrote: > On Dinsdag 06 Dezember 2005 01:51, Paul Mackerras wrote: > > > Remind me again why spufs is under arch/powerpc/ rather than fs/ ? > > On Tue, 2005-12-06 at 11:18 +0100, Arnd Bergmann wrote: > > We had a discussion about this in August, after the patch > > at http://patchwork.ozlabs.org/linuxppc64/patch?id=2140 > > > > Nobody had voiced any objections against the arch/powerpc location, > > and Pekka had good reasons against fs/, so I changed it. > > It had arch specific hooks which IMHO do not belong into fs/. Since the discussion came up again in irc, I looked up the existing file systems. outside of fs/, we have the following file systems. 
find -name \*.c | grep -v ^./fs | xargs grep struct.file_system_type.*= ./arch/ia64/kernel/perfmon.c:static struct file_system_type pfm_fs_type = { ./drivers/infiniband/core/uverbs_main.c:static struct file_system_type uverbs_event_fs = { ./drivers/isdn/capi/capifs.c:static struct file_system_type capifs_fs_type = { ./drivers/misc/ibmasm/ibmasmfs.c:static struct file_system_type ibmasmfs_type = { ./drivers/oprofile/oprofilefs.c:static struct file_system_type oprofilefs_type = { ./drivers/usb/core/inode.c:static struct file_system_type usb_fs_type = { ./drivers/usb/gadget/inode.c:static struct file_system_type gadgetfs_type = { ./ipc/mqueue.c:static struct file_system_type mqueue_fs_type = { ./kernel/cpuset.c:static struct file_system_type cpuset_fs_type = { ./kernel/futex.c:static struct file_system_type futex_fs_type = { ./mm/shmem.c:static struct file_system_type tmpfs_fs_type = { ./mm/tiny-shmem.c:static struct file_system_type tmpfs_fs_type = { ./net/socket.c:static struct file_system_type sock_fs_type = { ./net/sunrpc/rpc_pipe.c:static struct file_system_type rpc_pipe_fs_type = { ./security/inode.c:static struct file_system_type fs_type = { ./security/selinux/selinuxfs.c:static struct file_system_type sel_fs_type = { In fs/, most code deals with actual files stored on a disk or similar, with the exception of: ./fs/binfmt_misc.c:static struct file_system_type bm_fs_type = { ./fs/block_dev.c:static struct file_system_type bd_type = { ./fs/debugfs/inode.c:static struct file_system_type debug_fs_type = { ./fs/devfs/base.c:static struct file_system_type devfs_fs_type = { ./fs/devpts/inode.c:static struct file_system_type devpts_fs_type = { ./fs/eventpoll.c:static struct file_system_type eventpoll_fs_type = { ./fs/hugetlbfs/inode.c:static struct file_system_type hugetlbfs_fs_type = { ./fs/inotify.c:static struct file_system_type inotify_fs_type = { ./fs/openpromfs/inode.c:static struct file_system_type openprom_fs_type = { ./fs/pipe.c:static struct file_system_type 
pipe_fs_type = { ./fs/proc/root.c:static struct file_system_type proc_fs_type = { ./fs/relayfs/inode.c:static struct file_system_type relayfs_fs_type = { ./fs/sysfs/mount.c:static struct file_system_type sysfs_fs_type = { I guess there is no strict rule where these file systems go to, e.g. hugetlbs could just as well live near mm/shmem.c or any of those outside of fs/ could be moved in there. I don't really care where I put spufs, but I would prefer to move the files only one more time at most. Initially, they were in fs/spufs, and I moved them to arch/powerpc/platforms/cell/spufs at Pekkas suggestion. Arnd <>< From mikey at neuling.org Wed Dec 7 05:59:10 2005 From: mikey at neuling.org (Michael Neuling) Date: Tue, 6 Dec 2005 12:59:10 -0600 Subject: RFC: Rev 0.5 Booting the Linux/ppc kernel without Open Firmware In-Reply-To: <1133816807.8577.50.camel@cashmere.sps.mot.com> References: <1133816807.8577.50.camel@cashmere.sps.mot.com> Message-ID: <20051206125910.9f83d230.mikey@neuling.org> > dtc source code can be found at > > > WARNING: This version is still in early development stage; the > resulting device-tree "blobs" have not yet been validated with the > kernel. This has been done now. We added an insert blob option to the kexec tools so that a blob generated with dtc could be used by a kernel booted with kexec. See: http://lists.osdl.org/pipermail/fastboot/2005-October/002061.html Mikey From penberg at cs.helsinki.fi Wed Dec 7 06:05:47 2005 From: penberg at cs.helsinki.fi (Pekka Enberg) Date: Tue, 06 Dec 2005 21:05:47 +0200 Subject: [PATCH 02/14] spufs: fix local store page refcounting In-Reply-To: <200512061949.33482.arnd@arndb.de> References: <20051206035220.097737000@localhost> <200512061118.19633.arnd@arndb.de> <1133869108.7968.1.camel@localhost> <200512061949.33482.arnd@arndb.de> Message-ID: <1133895947.3279.4.camel@localhost> Hi, On Tue, 2005-12-06 at 19:49 +0100, Arnd Bergmann wrote: > I guess there is no strict rule where these file systems go to, e.g. 
> hugetlbs could just as well live near mm/shmem.c or any of those outside > of fs/ could be moved in there. hugetlbs does not contain architecture specific code so I don't see it as a problem. On Tue, 2005-12-06 at 19:49 +0100, Arnd Bergmann wrote: > I don't really care where I put spufs, but I would prefer to move > the files only one more time at most. > Initially, they were in fs/spufs, and I moved them to > arch/powerpc/platforms/cell/spufs at Pekkas suggestion. I would prefer them to stay in arch/powerpc/. As far as I understand, spufs will never have any use for platforms other than cell, so I really don't see any point in putting it in fs/. Pekka From arnd at arndb.de Wed Dec 7 06:48:55 2005 From: arnd at arndb.de (Arnd Bergmann) Date: Tue, 6 Dec 2005 20:48:55 +0100 Subject: RFC: Rev 0.5 Booting the Linux/ppc kernel without Open Firmware In-Reply-To: <1133816807.8577.50.camel@cashmere.sps.mot.com> References: <1133816807.8577.50.camel@cashmere.sps.mot.com> Message-ID: <200512062048.56131.arnd@arndb.de> On Maandag 05 Dezember 2005 22:06, Jon Loeliger wrote: > Included below is a proposed Revision 0.5 of the > "Booting the Linux/ppc kernel without Open Firmware" > document. This modification primarily extends the > Revision 0.4 by adding definitions for OF Nodes that > cover the System-On-a-Chip features found on PPC parts. > It also generalizes some earlier wording that pertained > to only PPC64 parts and covers the new, merged PPC 32 > and 64 parts together. Finally, minor typos, style > consistency and grammar problems were corrected. A few points are not clear yet, either because I don't understand the document or one it references correctly or because I might have different requirements: - Do we need a way to identify the type of soc bus? There are different standards for this, e.g. PLB4 on PPC440 or the EIB on the Cell BE. 
My initial idea was to have different device-type properties for these, but I now think that device_type = "soc" makes sense for all of them. Maybe we could add a model or compatible property for them. - It does not really belong into this document, but is related anyway: how do you want to represent this in Linux? Currently, most of these would be of_platform_device, but I think it would be good to have a new bus_type for it. The advantage would be that you can see the devices in /sys/devices/soc at xxx/ even if the driver is not loaded and the driver can even be autoloaded by udev. Also, which properties should show up in sysfs? All of them or just those specified in this document or a subset of them? - What do we do with pci root devices? They are often physically connected to the internal CPU bus, so it would make sense to represent them this way in the device tree. Should we add them to the specification here? Would it even work the expected way in Linux? - For some devices, you mandate a model property, for others you don't. Is this intentional? It might be easier to find the right device driver if the match string always contains a model name. - How would I represent nested interrupt controllers? E.g. suppose I have a Cell internal interrupt controller on one SOC bus and and an external interrupt controller on another SOC bus but have that deliver interrupts to the first one. - Should it mention nested SOC buses, e.g. a PLB4 bus connected to a PLB5 bus? - The title says 'without Open Firmware', but it should also be allowed to use the same SOC bus layout when using SLOF or some other OF implementation, right? - Also not new in this version, but still: Should there be support for specifying CPUs with multiple SMT threads? 
Arnd <>< From jdl at freescale.com Wed Dec 7 07:08:00 2005 From: jdl at freescale.com (Jon Loeliger) Date: Tue, 06 Dec 2005 14:08:00 -0600 Subject: RFC: Rev 0.5 Booting the Linux/ppc kernel without Open Firmwa re In-Reply-To: <20051206125910.9f83d230.mikey@neuling.org> References: <20051206125910.9f83d230.mikey@neuling.org> Message-ID: <1133899679.8577.82.camel@cashmere.sps.mot.com> On Tue, 2005-12-06 at 12:59, Michael Neuling wrote: > > dtc source code can be found at > > And on that note, I should probably make people aware that the current form of this document can now be found as part of the DTC tree! > > WARNING: This version is still in early development stage; the > > resulting device-tree "blobs" have not yet been validated with the > > kernel. > > This has been done now. We added an insert blob option to the kexec > tools so that a blob generated with dtc could be used by a kernel > booted with kexec. See: > > http://lists.osdl.org/pipermail/fastboot/2005-October/002061.html > > Mikey OK. So, do we want to have patches (versus the DTC version) sent to this list for changes to this document now too? jdl From paulus at samba.org Wed Dec 7 08:10:18 2005 From: paulus at samba.org (Paul Mackerras) Date: Wed, 7 Dec 2005 08:10:18 +1100 Subject: [PATCH 02/14] spufs: fix local store page refcounting In-Reply-To: <1133895947.3279.4.camel@localhost> References: <20051206035220.097737000@localhost> <200512061118.19633.arnd@arndb.de> <1133869108.7968.1.camel@localhost> <200512061949.33482.arnd@arndb.de> <1133895947.3279.4.camel@localhost> Message-ID: <17301.65082.251692.675360@cargo.ozlabs.ibm.com> Pekka Enberg writes: > I would prefer them to stay in arch/powerpc/. As far as I understand, > spufs will never have any use for platforms other than cell, so I really > don't see any point in putting it in fs/. 
The point is that people making changes to the filesystem interfaces will be much more likely to notice and fix stuff that is under fs/ than code that is buried deep under arch/ somewhere. Filesystems should go under fs/ for the sake of long-term maintainability. The fact that it's only used on one architecture is irrelevant - you simply make sure (with the appropriate Kconfig bits) that it's only offered on that architecture. Paul. From penberg at cs.helsinki.fi Wed Dec 7 08:41:38 2005 From: penberg at cs.helsinki.fi (Pekka Enberg) Date: Tue, 06 Dec 2005 23:41:38 +0200 Subject: [PATCH 02/14] spufs: fix local store page refcounting In-Reply-To: <17301.65082.251692.675360@cargo.ozlabs.ibm.com> References: <20051206035220.097737000@localhost> <200512061118.19633.arnd@arndb.de> <1133869108.7968.1.camel@localhost> <200512061949.33482.arnd@arndb.de> <1133895947.3279.4.camel@localhost> <17301.65082.251692.675360@cargo.ozlabs.ibm.com> Message-ID: <1133905298.8027.13.camel@localhost> Hi Paul, On Wed, 2005-12-07 at 08:10 +1100, Paul Mackerras wrote: > The point is that people making changes to the filesystem interfaces > will be much more likely to notice and fix stuff that is under fs/ > than code that is buried deep under arch/ somewhere. Filesystems > should go under fs/ for the sake of long-term maintainability. The > fact that it's only used on one architecture is irrelevant - you > simply make sure (with the appropriate Kconfig bits) that it's only > offered on that architecture. I think the fact that it is highly architecture specific is relevant. I have no way of testing spufs changes except on cell, no? And if I am developing on a cell, I probably will notice it in arch/ all the same. So I don't quite buy your the maintenace argument. But as Arnd said, there are no clear rules on what kind of filesystems should go into fs/ so please do whatever you must. 
Pekka From benh at kernel.crashing.org Wed Dec 7 08:41:15 2005 From: benh at kernel.crashing.org (Benjamin Herrenschmidt) Date: Wed, 07 Dec 2005 08:41:15 +1100 Subject: [PATCH 02/14] spufs: fix local store page refcounting In-Reply-To: <1133869108.7968.1.camel@localhost> References: <20051206035220.097737000@localhost> <20051206040643.452349000@localhost> <17300.57499.400458.387421@cargo.ozlabs.ibm.com> <200512061118.19633.arnd@arndb.de> <1133869108.7968.1.camel@localhost> Message-ID: <1133905276.7168.54.camel@gaston> On Tue, 2005-12-06 at 13:38 +0200, Pekka Enberg wrote: > Hi, > > On Dinsdag 06 Dezember 2005 01:51, Paul Mackerras wrote: > > > Remind me again why spufs is under arch/powerpc/ rather than fs/ ? > > On Tue, 2005-12-06 at 11:18 +0100, Arnd Bergmann wrote: > > We had a discussion about this in August, after the patch > > at http://patchwork.ozlabs.org/linuxppc64/patch?id=2140 > > > > Nobody had voiced any objections against the arch/powerpc location, > > and Pekka had good reasons against fs/, so I changed it. > > It had arch specific hooks which IMHO do not belong into fs/. Hrm... but not being into fs/ makes sure people like viro will "miss" it when fixing all filesystems... Ben. From ntl at pobox.com Wed Dec 7 09:14:34 2005 From: ntl at pobox.com (Nathan Lynch) Date: Tue, 6 Dec 2005 17:14:34 -0500 Subject: [PATCH 02/14] spufs: fix local store page refcounting In-Reply-To: <17301.65082.251692.675360@cargo.ozlabs.ibm.com> References: <20051206035220.097737000@localhost> <200512061118.19633.arnd@arndb.de> <1133869108.7968.1.camel@localhost> <200512061949.33482.arnd@arndb.de> <1133895947.3279.4.camel@localhost> <17301.65082.251692.675360@cargo.ozlabs.ibm.com> Message-ID: <20051206221434.GB8901@localhost.localdomain> Paul Mackerras wrote: > Pekka Enberg writes: > > > I would prefer them to stay in arch/powerpc/. 
As far as I understand, > > spufs will never have any use for platforms other than cell, so I really > > don't see any point in putting it in fs/. > > The point is that people making changes to the filesystem interfaces > will be much more likely to notice and fix stuff that is under fs/ > than code that is buried deep under arch/ somewhere. Filesystems > should go under fs/ for the sake of long-term maintainability. The > fact that it's only used on one architecture is irrelevant - you > simply make sure (with the appropriate Kconfig bits) that it's only > offered on that architecture. openpromfs seems to be a precedent here. It makes sense only on sparc and sparc64 but it lives in fs/. From mikey at neuling.org Wed Dec 7 09:18:55 2005 From: mikey at neuling.org (Michael Neuling) Date: Tue, 6 Dec 2005 16:18:55 -0600 Subject: [PATCH 9/11] powerpc: Parse crashkernel= parameter in first kernel In-Reply-To: <20051205004002.7A01B68889@ozlabs.org> References: <1133743149.268607.418162138937.qpush@concordia> <20051205004002.7A01B68889@ozlabs.org> Message-ID: <20051206161855.745bd0be.mikey@neuling.org> > + RELOC(prom_crashk_size) = prom_memparse(opt, &opt); To avoid a compiler warning, this should be: RELOC(prom_crashk_size) = prom_memparse(opt, (const char **)&opt); Mikey From paulus at samba.org Wed Dec 7 09:19:28 2005 From: paulus at samba.org (Paul Mackerras) Date: Wed, 7 Dec 2005 09:19:28 +1100 Subject: [PATCH 02/14] spufs: fix local store page refcounting In-Reply-To: <1133905298.8027.13.camel@localhost> References: <20051206035220.097737000@localhost> <200512061118.19633.arnd@arndb.de> <1133869108.7968.1.camel@localhost> <200512061949.33482.arnd@arndb.de> <1133895947.3279.4.camel@localhost> <17301.65082.251692.675360@cargo.ozlabs.ibm.com> <1133905298.8027.13.camel@localhost> Message-ID: <17302.3696.364669.18755@cargo.ozlabs.ibm.com> Pekka Enberg writes: > I think the fact that it is highly architecture specific is relevant. 
I > have no way of testing spufs changes except on cell, no? And if I am > developing on a cell, I probably will notice it in arch/ all the same. > So I don't quite buy your the maintenace argument. Think about someone changing the VFS layer interface and fixing up all the filesystems to accommodate that change. That person is doing some of your work for you, so you want to make it easy for him/her to find your filesystem. That's the sort of thing I was referring to as maintenance. As for changes on the cell-specific side, the people doing those changes will know where to find it, so it isn't a problem having it in fs/. Having it in fs/ also means that it is more likely that people familiar with VFS internals will look through your code and comment on it. I know that can be painful in the short term, but in the long term it will lead to better code. Paul. From arnd at arndb.de Wed Dec 7 09:27:08 2005 From: arnd at arndb.de (Arnd Bergmann) Date: Tue, 6 Dec 2005 23:27:08 +0100 Subject: [PATCH 02/14] spufs: fix local store page refcounting In-Reply-To: <17302.3696.364669.18755@cargo.ozlabs.ibm.com> References: <20051206035220.097737000@localhost> <1133905298.8027.13.camel@localhost> <17302.3696.364669.18755@cargo.ozlabs.ibm.com> Message-ID: <200512062327.08448.arnd@arndb.de> Am Dienstag 06 Dezember 2005 23:19 schrieb Paul Mackerras: > Having it in fs/ also means that it is more likely that people > familiar with VFS internals will look through your code and comment on > it. I know that can be painful in the short term, but in the long > term it will lead to better code. Yes, that is an excellent point. How should we proceed to get the code there? Do you want to move the files around in your git tree or do you prefer me to send a full set of patches again and kill the existing copy? Obviously, I'd prefer the former, since it would mean less work for me with the same result.
Arnd <>< From michael at ellerman.id.au Wed Dec 7 10:39:03 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Tue, 6 Dec 2005 17:39:03 -0600 Subject: [RFC] Should lmb_alloc() always panic on failure? Message-ID: <200512061739.06783.michael@ellerman.id.au> Hi ya'll, Currently lmb_alloc(_base) returns 0 if it can't allocate memory, but a lot of places don't actually check. I was thinking it might be better if it just panicked. The following functions call lmb_alloc() and don't check the return value: finish_device_tree() rtas_initialize() irqstack_early_init() emergency_stack_init() early_get_page() MMU_init_hw() stabs_alloc() pmac_probe() alloc_u3_dart_table() These functions check and panic() or BUG_ON: unflatten_device_tree() htab_initialize() do_init_bootmem() dart_init() The only other caller is careful_allocation(), which checks and retries the alloc with different parameters - we could accommodate this with an __lmb_alloc() or similar. What do people think? cheers -- Michael Ellerman IBM OzLabs email: michael:ellerman.id.au inmsg: mpe:jabber.org wwweb: http://michael.ellerman.id.au phone: +61 2 6212 1183 (tie line 70 21183) We do not inherit the earth from our ancestors, we borrow it from our children. - S.M.A.R.T Person -------------- next part -------------- A non-text attachment was scrubbed...
Name: not available Type: application/pgp-signature Size: 189 bytes Desc: not available Url : http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051206/8d9d7159/attachment.pgp From david at gibson.dropbear.id.au Wed Dec 7 11:17:20 2005 From: david at gibson.dropbear.id.au (David Gibson) Date: Wed, 7 Dec 2005 11:17:20 +1100 Subject: RFC: Rev 0.5 Booting the Linux/ppc kernel without Open Firmware In-Reply-To: <200512062048.56131.arnd@arndb.de> References: <1133816807.8577.50.camel@cashmere.sps.mot.com> <200512062048.56131.arnd@arndb.de> Message-ID: <20051207001720.GB25533@localhost.localdomain> On Tue, Dec 06, 2005 at 08:48:55PM +0100, Arnd Bergmann wrote: > On Maandag 05 Dezember 2005 22:06, Jon Loeliger wrote: > > Included below is a proposed Revision 0.5 of the > > "Booting the Linux/ppc kernel without Open Firmware" > > document. This modification primarily extends the > > Revision 0.4 by adding definitions for OF Nodes that > > cover the System-On-a-Chip features found on PPC parts. > > It also generalizes some earlier wording that pertained > > to only PPC64 parts and covers the new, merged PPC 32 > > and 64 parts together. Finally, minor typos, style > > consistency and grammar problems were corrected. > > A few points are not clear yet, either because I don't understand the > document or one it references correctly or because I might have > different requirements: All comments below IMHO, and subject to persuasion otherwise. > - Do we need a way to identify the type of soc bus? There are different > standards for this, e.g. PLB4 on PPC440 or the EIB on the Cell BE. > My initial idea was to have different device-type properties for these, > but I now think that device_type = "soc" makes sense for all of them. > Maybe we could add a model or compatible property for them. It think it would be a good idea to have something labelling the specific type of SOC bus, though I'm not immediately sure where. 
"model" perhaps, if it rarely has an effect on how to operate the bus. > - It does not really belong into this document, but is related anyway: > how do you want to represent this in Linux? Currently, most of these > would be of_platform_device, but I think it would be good to have > a new bus_type for it. The advantage would be that you can see the > devices in /sys/devices/soc at xxx/ even if the driver is not loaded > and the driver can even be autoloaded by udev. > Also, which properties should show up in sysfs? All of them or just > those specified in this document or a subset of them? I concur - I believe we already have a bus_type for on-chip devices on 4xx. > - What do we do with pci root devices? They are often physically connected > to the internal CPU bus, so it would make sense to represent them > this way in the device tree. Should we add them to the specification > here? Would it even work the expected way in Linux? The host bridges should sit on the soc bus then, as you suggest (just as the PCI busses hang off HyperTransport on the G5). I think you need to refer to the OF docs for how to represent the PCI host bridge and devices themselves. > - For some devices, you mandate a model property, for others you don't. > Is this intentional? It might be easier to find the right device > driver if the match string always contains a model name. You rarely want to match model name to find a device - generally you want to match either on "compatible" or "device_type", or possibly both. > - How would I represent nested interrupt controllers? E.g. suppose I > have a Cell internal interrupt controller on one SOC bus and > and an external interrupt controller on another SOC bus but have > that deliver interrupts to the first one. Again, I believe this is in the OF docs - interrupt controllers have an interrupt-parent property IIRC, which gives the phandle of the next interrupt controller up the chain. > - Should it mention nested SOC buses, e.g. 
a PLB4 bus connected to a > PLB5 bus? Yes. > - The title says 'without Open Firmware', but it should also be allowed > to use the same SOC bus layout when using SLOF or some other OF > implementation, right? I guess so. > - Also not new in this version, but still: Should there be support for > specifying CPUs with multiple SMT threads? Umm.. maybe. -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson From geoffrey.levand at am.sony.com Wed Dec 7 11:34:00 2005 From: geoffrey.levand at am.sony.com (Geoff Levand) Date: Tue, 06 Dec 2005 16:34:00 -0800 Subject: [RFC] spufs: wrap spu priveleged register access Message-ID: <43962DF8.2040007@am.sony.com> The current spufs implementation accesses privileged (privilege 1) spu registers directly, which may not be allowed by a hypervisor. This patch adds wrapper functions that can be implemented as needed as either platform specific hypervisor calls or direct register accesses. Included is a sample of support for a fictitious hypervisor. This patch is just to give an idea, please re-write it as you like. It may be a good idea to wrap not only irq_mask/stat but also any other regs, and to remove generic functions like spu_priv1_get64/put64() since each access may be mapped to different hypervisor calls. Arnd mentioned it would be best to arrange for runtime configuration possibly using firmware_has_feature(). 
Signed-off-by: Masato Noguchi Signed-off-by: Geoff Levand Index: linux-2.6.15-rc4-cell/arch/powerpc/platforms/cell/spu_base.c =================================================================== --- linux-2.6.15-rc4-cell.orig/arch/powerpc/platforms/cell/spu_base.c 2005-12-02 16:26:20.000000000 -0800 +++ linux-2.6.15-rc4-cell/arch/powerpc/platforms/cell/spu_base.c 2005-12-02 16:27:40.000000000 -0800 @@ -141,8 +141,8 @@ /* atomically disable SPU mailbox interrupts */ spin_lock(&spu->register_lock); - out_be64(&spu->priv1->int_mask_class2_RW, - in_be64(&spu->priv1->int_mask_class2_RW) & ~0x1); + spu_irq_mask_set(spu, 2, + spu_irq_mask_get(spu, 2) & ~0x1); spin_unlock(&spu->register_lock); return 0; } @@ -177,8 +177,8 @@ /* atomically disable SPU mailbox interrupts */ spin_lock(&spu->register_lock); - out_be64(&spu->priv1->int_mask_class2_RW, - in_be64(&spu->priv1->int_mask_class2_RW) & ~0x10); + spu_irq_mask_set(spu, 2, + spu_irq_mask_get(spu, 2) & ~0x10); spin_unlock(&spu->register_lock); return 0; } @@ -202,7 +202,7 @@ spu->class_0_pending = 0; - stat = in_be64(&spu->priv1->int_stat_class0_RW); + stat = spu_irq_stat_get(spu, 0); if (stat & 1) /* invalid MFC DMA */ __spu_trap_invalid_dma(spu); @@ -213,7 +213,7 @@ if (stat & 4) /* error on SPU */ __spu_trap_error(spu); - out_be64(&spu->priv1->int_stat_class0_RW, stat); + spu_irq_stat_clear(spu, 0, stat); return 0; } @@ -227,13 +227,13 @@ /* atomically read & clear class1 status. 
*/ spin_lock(&spu->register_lock); - mask = in_be64(&spu->priv1->int_mask_class1_RW); - stat = in_be64(&spu->priv1->int_stat_class1_RW) & mask; - dar = in_be64(&spu->priv1->mfc_dar_RW); - dsisr = in_be64(&spu->priv1->mfc_dsisr_RW); + mask = spu_irq_mask_get(spu, 1); + stat = spu_irq_stat_get(spu, 1) & mask; + dar = spu_priv1_get64(spu, mfc_dar_RW); + dsisr = spu_priv1_get64(spu, mfc_dsisr_RW); if (stat & 2) /* mapping fault */ - out_be64(&spu->priv1->mfc_dsisr_RW, 0UL); - out_be64(&spu->priv1->int_stat_class1_RW, stat); + spu_priv1_set64(spu, mfc_dsisr_RW, 0UL); + spu_irq_stat_clear(spu, 1, stat); spin_unlock(&spu->register_lock); if (stat & 1) /* segment fault */ @@ -259,10 +259,10 @@ unsigned long stat; spu = data; - stat = in_be64(&spu->priv1->int_stat_class2_RW); + stat = spu_irq_stat_get(spu, 2); pr_debug("class 2 interrupt %d, %lx, %lx\n", irq, stat, - in_be64(&spu->priv1->int_mask_class2_RW)); + spu_irq_mask_get(spu, int_mask_class2_RW)); if (stat & 1) /* PPC core mailbox */ @@ -280,7 +280,7 @@ if (stat & 0x10) /* SPU mailbox threshold */ __spu_trap_spubox(spu); - out_be64(&spu->priv1->int_stat_class2_RW, stat); + spu_irq_stat_set(spu, 2, stat); return stat ? 
IRQ_HANDLED : IRQ_NONE; } @@ -297,21 +297,21 @@ spu_irq_class_0, 0, spu->irq_c0, spu); if (ret) goto out; - out_be64(&spu->priv1->int_mask_class0_RW, 0x7); + spu_irq_mask_set(spu, 0, 0x7); snprintf(spu->irq_c1, sizeof (spu->irq_c1), "spe%02d.1", spu->number); ret = request_irq(irq_base + IIC_CLASS_STRIDE + spu->isrc, spu_irq_class_1, 0, spu->irq_c1, spu); if (ret) goto out1; - out_be64(&spu->priv1->int_mask_class1_RW, 0x3); + spu_irq_mask_set(spu, 1, 0x3); snprintf(spu->irq_c2, sizeof (spu->irq_c2), "spe%02d.2", spu->number); ret = request_irq(irq_base + 2*IIC_CLASS_STRIDE + spu->isrc, spu_irq_class_2, 0, spu->irq_c2, spu); if (ret) goto out2; - out_be64(&spu->priv1->int_mask_class2_RW, 0xe); + spu_irq_mask_set(spu, 2, 0xe); goto out; out2: @@ -373,9 +373,9 @@ static void spu_init_regs(struct spu *spu) { - out_be64(&spu->priv1->int_mask_class0_RW, 0x7); - out_be64(&spu->priv1->int_mask_class1_RW, 0x3); - out_be64(&spu->priv1->int_mask_class2_RW, 0xe); + spu_irq_mask_set(spu, 0, 0x7); + spu_irq_mask_set(spu, 1, 0x3); + spu_irq_mask_set(spu, 2, 0xe); } struct spu *spu_alloc(void) @@ -523,13 +523,11 @@ int spu_run(struct spu *spu) { struct spu_problem __iomem *prob; - struct spu_priv1 __iomem *priv1; struct spu_priv2 __iomem *priv2; u32 status; int ret; prob = spu->problem; - priv1 = spu->priv1; priv2 = spu->priv2; /* Let SPU run. */ @@ -561,7 +559,7 @@ cpu_relax(); out_be64(&priv2->slb_invalidate_all_W, 0); - out_be64(&priv1->tlb_invalidate_entry_W, 0UL); + spu_priv1_set64(spu, tlb_invalidate_entry_W, 0UL); eieio(); /* Check for SPU breakpoint. 
*/ Index: linux-2.6.15-rc4-cell/arch/powerpc/platforms/cell/spufs/hw_ops.c =================================================================== --- linux-2.6.15-rc4-cell.orig/arch/powerpc/platforms/cell/spufs/hw_ops.c 2005-12-02 16:26:20.000000000 -0800 +++ linux-2.6.15-rc4-cell/arch/powerpc/platforms/cell/spufs/hw_ops.c 2005-12-02 16:27:40.000000000 -0800 @@ -62,7 +62,6 @@ { struct spu *spu = ctx->spu; struct spu_problem __iomem *prob = spu->problem; - struct spu_priv1 __iomem *priv1 = spu->priv1; struct spu_priv2 __iomem *priv2 = spu->priv2; int ret; @@ -73,8 +72,8 @@ ret = 4; } else { /* make sure we get woken up by the interrupt */ - out_be64(&priv1->int_mask_class2_RW, - in_be64(&priv1->int_mask_class2_RW) | 0x1); + spu_irq_mask_set(spu, 2, + spu_irq_mask_get(spu, 2) | 0x1); ret = 0; } spin_unlock_irq(&spu->register_lock); @@ -85,7 +84,6 @@ { struct spu *spu = ctx->spu; struct spu_problem __iomem *prob = spu->problem; - struct spu_priv1 __iomem *priv1 = spu->priv1; int ret; spin_lock_irq(&spu->register_lock); @@ -96,8 +94,8 @@ } else { /* make sure we get woken up by the interrupt when space becomes available */ - out_be64(&priv1->int_mask_class2_RW, - in_be64(&priv1->int_mask_class2_RW) | 0x10); + spu_irq_mask_set(spu, 2, + spu_irq_mask_get(spu, 2) | 0x10); ret = 0; } spin_unlock_irq(&spu->register_lock); Index: linux-2.6.15-rc4-cell/arch/powerpc/platforms/cell/spufs/switch.c =================================================================== --- linux-2.6.15-rc4-cell.orig/arch/powerpc/platforms/cell/spufs/switch.c 2005-12-02 16:26:20.000000000 -0800 +++ linux-2.6.15-rc4-cell/arch/powerpc/platforms/cell/spufs/switch.c 2005-12-02 16:27:40.000000000 -0800 @@ -108,8 +108,6 @@ static inline void disable_interrupts(struct spu_state *csa, struct spu *spu) { - struct spu_priv1 __iomem *priv1 = spu->priv1; - /* Save, Step 3: * Restore, Step 2: * Save INT_Mask_class0 in CSA. 
@@ -122,15 +120,15 @@ spin_lock_irq(&spu->register_lock); if (csa) { csa->priv1.int_mask_class0_RW = - in_be64(&priv1->int_mask_class0_RW); + spu_irq_mask_get(spu, 0); csa->priv1.int_mask_class1_RW = - in_be64(&priv1->int_mask_class1_RW); + spu_irq_mask_get(spu, 1); csa->priv1.int_mask_class2_RW = - in_be64(&priv1->int_mask_class2_RW); + spu_irq_mask_get(spu, 2); } - out_be64(&priv1->int_mask_class0_RW, 0UL); - out_be64(&priv1->int_mask_class1_RW, 0UL); - out_be64(&priv1->int_mask_class2_RW, 0UL); + spu_irq_mask_set(spu, 0, 0UL); + spu_irq_mask_set(spu, 1, 0UL); + spu_irq_mask_set(spu, 2, 0UL); eieio(); spin_unlock_irq(&spu->register_lock); } @@ -217,12 +215,10 @@ static inline void save_mfc_sr1(struct spu_state *csa, struct spu *spu) { - struct spu_priv1 __iomem *priv1 = spu->priv1; - /* Save, Step 10: * Save MFC_SR1 in the CSA. */ - csa->priv1.mfc_sr1_RW = in_be64(&priv1->mfc_sr1_RW); + csa->priv1.mfc_sr1_RW = spu_priv1_get64(spu, mfc_sr1_RW); } static inline void save_spu_status(struct spu_state *csa, struct spu *spu) @@ -316,15 +312,13 @@ static inline void issue_mfc_tlbie(struct spu_state *csa, struct spu *spu) { - struct spu_priv1 __iomem *priv1 = spu->priv1; - /* Save, Step 17: * Restore, Step 12. * Restore, Step 48. * Write TLB_Invalidate_Entry[IS,VPN,L,Lp]=0 register. * Then issue a PPE sync instruction. */ - out_be64(&priv1->tlb_invalidate_entry_W, 0UL); + spu_priv1_set64(spu, tlb_invalidate_entry_W, 0UL); mb(); } @@ -434,25 +428,21 @@ static inline void save_mfc_tclass_id(struct spu_state *csa, struct spu *spu) { - struct spu_priv1 __iomem *priv1 = spu->priv1; - /* Save, Step 25: * Save the MFC_TCLASS_ID register in * the CSA. */ - csa->priv1.mfc_tclass_id_RW = in_be64(&priv1->mfc_tclass_id_RW); + csa->priv1.mfc_tclass_id_RW = spu_priv1_get64(spu, mfc_tclass_id_RW); } static inline void set_mfc_tclass_id(struct spu_state *csa, struct spu *spu) { - struct spu_priv1 __iomem *priv1 = spu->priv1; - /* Save, Step 26: * Restore, Step 23. 
* Write the MFC_TCLASS_ID register with * the value 0x10000000. */ - out_be64(&priv1->mfc_tclass_id_RW, 0x10000000); + spu_priv1_set64(spu, mfc_tclass_id_RW, 0x10000000); eieio(); } @@ -482,14 +472,13 @@ static inline void save_mfc_slbs(struct spu_state *csa, struct spu *spu) { - struct spu_priv1 __iomem *priv1 = spu->priv1; struct spu_priv2 __iomem *priv2 = spu->priv2; int i; /* Save, Step 29: * If MFC_SR1[R]='1', save SLBs in CSA. */ - if (in_be64(&priv1->mfc_sr1_RW) & MFC_STATE1_RELOCATE_MASK) { + if (spu_priv1_get64(spu, mfc_sr1_RW) & MFC_STATE1_RELOCATE_MASK) { csa->priv2.slb_index_W = in_be64(&priv2->slb_index_W); for (i = 0; i < 8; i++) { out_be64(&priv2->slb_index_W, i); @@ -503,8 +492,6 @@ static inline void setup_mfc_sr1(struct spu_state *csa, struct spu *spu) { - struct spu_priv1 __iomem *priv1 = spu->priv1; - /* Save, Step 30: * Restore, Step 18: * Write MFC_SR1 with MFC_SR1[D=0,S=1] and @@ -516,9 +503,9 @@ * MFC_SR1[Pr] bit is not set. * */ - out_be64(&priv1->mfc_sr1_RW, (MFC_STATE1_MASTER_RUN_CONTROL_MASK | - MFC_STATE1_RELOCATE_MASK | - MFC_STATE1_BUS_TLBIE_MASK)); + spu_priv1_set64(spu, mfc_sr1_RW, (MFC_STATE1_MASTER_RUN_CONTROL_MASK | + MFC_STATE1_RELOCATE_MASK | + MFC_STATE1_BUS_TLBIE_MASK)); } static inline void save_spu_npc(struct spu_state *csa, struct spu *spu) @@ -595,16 +582,14 @@ static inline void save_mfc_rag(struct spu_state *csa, struct spu *spu) { - struct spu_priv1 __iomem *priv1 = spu->priv1; - /* Save, Step 38: * Save RA_GROUP_ID register and the * RA_ENABLE reigster in the CSA. 
*/ csa->priv1.resource_allocation_groupID_RW = - in_be64(&priv1->resource_allocation_groupID_RW); + spu_priv1_get64(spu, resource_allocation_groupID_RW); csa->priv1.resource_allocation_enable_RW = - in_be64(&priv1->resource_allocation_enable_RW); + spu_priv1_get64(spu, resource_allocation_enable_RW); } static inline void save_ppu_mb_stat(struct spu_state *csa, struct spu *spu) @@ -722,14 +707,13 @@ static inline void invalidate_slbs(struct spu_state *csa, struct spu *spu) { - struct spu_priv1 __iomem *priv1 = spu->priv1; struct spu_priv2 __iomem *priv2 = spu->priv2; /* Save, Step 45: * Restore, Step 19: * If MFC_SR1[R]=1, write 0 to SLB_Invalidate_All. */ - if (in_be64(&priv1->mfc_sr1_RW) & MFC_STATE1_RELOCATE_MASK) { + if (spu_priv1_get64(spu, mfc_sr1_RW) & MFC_STATE1_RELOCATE_MASK) { out_be64(&priv2->slb_invalidate_all_W, 0UL); eieio(); } @@ -798,7 +782,6 @@ static inline void enable_interrupts(struct spu_state *csa, struct spu *spu) { - struct spu_priv1 __iomem *priv1 = spu->priv1; unsigned long class1_mask = CLASS1_ENABLE_SEGMENT_FAULT_INTR | CLASS1_ENABLE_STORAGE_FAULT_INTR; @@ -811,12 +794,12 @@ * (translation) interrupts. 
*/ spin_lock_irq(&spu->register_lock); - out_be64(&priv1->int_stat_class0_RW, ~(0UL)); - out_be64(&priv1->int_stat_class1_RW, ~(0UL)); - out_be64(&priv1->int_stat_class2_RW, ~(0UL)); - out_be64(&priv1->int_mask_class0_RW, 0UL); - out_be64(&priv1->int_mask_class1_RW, class1_mask); - out_be64(&priv1->int_mask_class2_RW, 0UL); + spu_irq_stat_clear(spu, 0, ~(0UL)); + spu_irq_stat_clear(spu, 1, ~(0UL)); + spu_irq_stat_clear(spu, 2, ~(0UL)); + spu_irq_mask_set(spu, 0, 0UL); + spu_irq_mask_set(spu, 1, 0UL); + spu_irq_mask_set(spu, 2, 0UL); spin_unlock_irq(&spu->register_lock); } @@ -954,7 +937,6 @@ static inline void wait_tag_complete(struct spu_state *csa, struct spu *spu) { - struct spu_priv1 __iomem *priv1 = spu->priv1; struct spu_problem __iomem *prob = spu->problem; u32 mask = MFC_TAGID_TO_TAGMASK(0); unsigned long flags; @@ -971,14 +953,13 @@ POLL_WHILE_FALSE(in_be32(&prob->dma_tagstatus_R) & mask); local_irq_save(flags); - out_be64(&priv1->int_stat_class0_RW, ~(0UL)); - out_be64(&priv1->int_stat_class2_RW, ~(0UL)); + spu_irq_stat_clear(spu, 0, ~(0UL)); + spu_irq_stat_clear(spu, 2, ~(0UL)); local_irq_restore(flags); } static inline void wait_spu_stopped(struct spu_state *csa, struct spu *spu) { - struct spu_priv1 __iomem *priv1 = spu->priv1; struct spu_problem __iomem *prob = spu->problem; unsigned long flags; @@ -991,8 +972,8 @@ POLL_WHILE_TRUE(in_be32(&prob->spu_status_R) & SPU_STATUS_RUNNING); local_irq_save(flags); - out_be64(&priv1->int_stat_class0_RW, ~(0UL)); - out_be64(&priv1->int_stat_class2_RW, ~(0UL)); + spu_irq_stat_clear(spu, 0, ~(0UL)); + spu_irq_stat_clear(spu, 2, ~(0UL)); local_irq_restore(flags); } @@ -1091,7 +1072,6 @@ static inline void clear_spu_status(struct spu_state *csa, struct spu *spu) { struct spu_problem __iomem *prob = spu->problem; - struct spu_priv1 __iomem *priv1 = spu->priv1; /* Restore, Step 10: * If SPU_Status[R]=0 and SPU_Status[E,L,IS]=1, @@ -1100,8 +1080,8 @@ if (!(in_be32(&prob->spu_status_R) & SPU_STATUS_RUNNING)) { if 
(in_be32(&prob->spu_status_R) & SPU_STATUS_ISOLATED_EXIT_STAUTUS) { - out_be64(&priv1->mfc_sr1_RW, - MFC_STATE1_MASTER_RUN_CONTROL_MASK); + spu_priv1_set64(spu, mfc_sr1_RW, + MFC_STATE1_MASTER_RUN_CONTROL_MASK); eieio(); out_be32(&prob->spu_runcntl_RW, SPU_RUNCNTL_RUNNABLE); eieio(); @@ -1112,8 +1092,8 @@ SPU_STATUS_ISOLATED_LOAD_STAUTUS) || (in_be32(&prob->spu_status_R) & SPU_STATUS_ISOLATED_STATE)) { - out_be64(&priv1->mfc_sr1_RW, - MFC_STATE1_MASTER_RUN_CONTROL_MASK); + spu_priv1_set64(spu, mfc_sr1_RW, + MFC_STATE1_MASTER_RUN_CONTROL_MASK); eieio(); out_be32(&prob->spu_runcntl_RW, 0x2); eieio(); @@ -1281,16 +1261,14 @@ static inline void restore_mfc_rag(struct spu_state *csa, struct spu *spu) { - struct spu_priv1 __iomem *priv1 = spu->priv1; - /* Restore, Step 29: * Restore RA_GROUP_ID register and the * RA_ENABLE reigster from the CSA. */ - out_be64(&priv1->resource_allocation_groupID_RW, - csa->priv1.resource_allocation_groupID_RW); - out_be64(&priv1->resource_allocation_enable_RW, - csa->priv1.resource_allocation_enable_RW); + spu_priv1_set64(spu, resource_allocation_groupID_RW, + csa->priv1.resource_allocation_groupID_RW); + spu_priv1_set64(spu, resource_allocation_enable_RW, + csa->priv1.resource_allocation_enable_RW); } static inline void send_restore_code(struct spu_state *csa, struct spu *spu) @@ -1433,8 +1411,6 @@ static inline void clear_interrupts(struct spu_state *csa, struct spu *spu) { - struct spu_priv1 __iomem *priv1 = spu->priv1; - /* Restore, Step 49: * Write INT_MASK_class0 with value of 0. * Write INT_MASK_class1 with value of 0. @@ -1444,12 +1420,12 @@ * Write INT_STAT_class2 with value of -1. 
*/ spin_lock_irq(&spu->register_lock); - out_be64(&priv1->int_mask_class0_RW, 0UL); - out_be64(&priv1->int_mask_class1_RW, 0UL); - out_be64(&priv1->int_mask_class2_RW, 0UL); - out_be64(&priv1->int_stat_class0_RW, ~(0UL)); - out_be64(&priv1->int_stat_class1_RW, ~(0UL)); - out_be64(&priv1->int_stat_class2_RW, ~(0UL)); + spu_irq_mask_set(spu, 0, 0UL); + spu_irq_mask_set(spu, 1, 0UL); + spu_irq_mask_set(spu, 2, 0UL); + spu_irq_stat_clear(spu, 0, ~(0UL)); + spu_irq_stat_clear(spu, 1, ~(0UL)); + spu_irq_stat_clear(spu, 2, ~(0UL)); spin_unlock_irq(&spu->register_lock); } @@ -1546,12 +1522,10 @@ static inline void restore_mfc_tclass_id(struct spu_state *csa, struct spu *spu) { - struct spu_priv1 __iomem *priv1 = spu->priv1; - /* Restore, Step 56: * Restore the MFC_TCLASS_ID register from CSA. */ - out_be64(&priv1->mfc_tclass_id_RW, csa->priv1.mfc_tclass_id_RW); + spu_priv1_set64(spu, mfc_tclass_id_RW, csa->priv1.mfc_tclass_id_RW); eieio(); } @@ -1713,7 +1687,6 @@ static inline void check_ppuint_mb_stat(struct spu_state *csa, struct spu *spu) { - struct spu_priv1 __iomem *priv1 = spu->priv1; struct spu_priv2 __iomem *priv2 = spu->priv2; u64 dummy = 0UL; @@ -1724,8 +1697,7 @@ if ((csa->prob.mb_stat_R & 0xFF0000) == 0) { dummy = in_be64(&priv2->puint_mb_R); eieio(); - out_be64(&priv1->int_stat_class2_RW, - CLASS2_ENABLE_MAILBOX_INTR); + spu_irq_stat_clear(spu, 2, CLASS2_ENABLE_MAILBOX_INTR); eieio(); } } @@ -1753,12 +1725,10 @@ static inline void restore_mfc_sr1(struct spu_state *csa, struct spu *spu) { - struct spu_priv1 __iomem *priv1 = spu->priv1; - /* Restore, Step 69: * Restore the MFC_SR1 register from CSA. */ - out_be64(&priv1->mfc_sr1_RW, csa->priv1.mfc_sr1_RW); + spu_priv1_set64(spu, mfc_sr1_RW, csa->priv1.mfc_sr1_RW); eieio(); } @@ -1816,15 +1786,13 @@ static inline void reenable_interrupts(struct spu_state *csa, struct spu *spu) { - struct spu_priv1 __iomem *priv1 = spu->priv1; - /* Restore, Step 75: * Re-enable SPU interrupts. 
*/ spin_lock_irq(&spu->register_lock); - out_be64(&priv1->int_mask_class0_RW, csa->priv1.int_mask_class0_RW); - out_be64(&priv1->int_mask_class1_RW, csa->priv1.int_mask_class1_RW); - out_be64(&priv1->int_mask_class2_RW, csa->priv1.int_mask_class2_RW); + spu_irq_mask_set(spu, 0, csa->priv1.int_mask_class0_RW); + spu_irq_mask_set(spu, 1, csa->priv1.int_mask_class1_RW); + spu_irq_mask_set(spu, 2, csa->priv1.int_mask_class2_RW); spin_unlock_irq(&spu->register_lock); } Index: linux-2.6.15-rc4-cell/include/asm-powerpc/spu.h =================================================================== --- linux-2.6.15-rc4-cell.orig/include/asm-powerpc/spu.h 2005-12-02 16:26:20.000000000 -0800 +++ linux-2.6.15-rc4-cell/include/asm-powerpc/spu.h 2005-12-02 16:27:40.000000000 -0800 @@ -576,4 +576,64 @@ u64 spu_trace_cntl; /* 0x1070 */ } __attribute__ ((aligned(0x2000))); + +/* priv1 access */ + +#ifdef CONFIG_ON_HYPERVISOR_XXXXX + /* examples for a fictitious hypervisor */ + +#include + +inline u64 spu_irq_mask_get(struct spu *spu, int cls) +{ + u64 __val; + hvcall_spu_get_irq_mask(spu->spu_magical_id, + cls, + &__val); + return __val; +} + +#define spu_irq_mask_set(spu, cls, mask) \ + hvcall_spu_get_irq_mask(spu->spu_magical_id, \ + cls, \ + mask); + +#define spu_irq_stat_get(spu, cls) \ + hvcall_spu_get_interrupt_status(spu->spu_magical_id, \ + cls); +#define spu_irq_stat_clear(spu, cls, val) \ + hvcall_spu_clear_interrupt_status(spu->spu_magical_id, \ + cls, val); + +inline u64 spu_priv1_get64(struct spu *spu, int cls) +{ + u64 __val; + hvcall_spu_get_priv1(spu->spu_magical_id, + offsetof(struct spu_priv1, reg), + &__val); + return __val; +} + +#define spu_priv1_set64(spu, cls, val) \ + hvcall_spu_set_priv1(spu->spu_magical_id, \ + offsetof(struct spu_priv1, reg), \ + val); + +#else /* CONFIG_ON_HYPERVISOR_XXXXX */ + +#define spu_irq_mask_get(spu, cls) \ + spu_priv1_get64(spu, int_mask_class ## cls ## _RW) +#define spu_irq_mask_set(spu, cls, mask) \ + spu_priv1_set64(spu, 
int_mask_class ## cls ## _RW, mask) + +#define spu_irq_stat_get(spu, cls) \ + spu_priv1_get64(spu, int_mask_class ## cls ## _RW) +#define spu_irq_stat_clear(spu, cls, stat) \ + spu_priv1_set64(spu, int_mask_class ## cls ## _RW, stat) + +#define spu_priv1_get64(spu, reg) in_be64(&(spu)->priv1->reg) +#define spu_priv1_set64(spu, reg, val) out_be64(&(spu)->priv1->reg, val) + +#endif /* CONFIG_ON_HYPERVISOR_XXXXX */ + #endif From haren at us.ibm.com Wed Dec 7 12:53:37 2005 From: haren at us.ibm.com (Haren Myneni) Date: Tue, 06 Dec 2005 17:53:37 -0800 Subject: [PATCH] Trivial fix in __alloc_bootmem_core() when there is no free page in first node's memory Message-ID: <439640A1.3030300@us.ibm.com> Hi, Hitting BUG_ON() in __alloc_bootmem_core() when there is no free page available in the first node's memory. For the case of kdump on PPC64 (Power 4 machine), the captured kernel is used two memory regions - memory for TCE tables (tce-base and tce-size at top of RAM and reserved) and captured kernel memory region (crashk_base and crashk_size). Since we reserve the memory for the first node, we should be returning from __alloc_bootmem_core() to search for the next node (pg_dat). Currently, find_next_zero_bit() is returning the n^th bit (eidx) when there is no free page. Then, test_bit() is failed since we set 0xff only for the actual size initially (init_bootmem_core()) even though rounded up to one page for bdata->node_bootmem_map. We are hitting the BUG_ON after failing to enter second "for" loop. Please apply. Thanks Haren Signed-off-by: Haren Myneni -------------- next part -------------- A non-text attachment was scrubbed... 
Name: bootmem_bug_on_fix.patch Type: text/x-patch Size: 413 bytes Desc: not available Url : http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051206/8fadadaf/attachment.bin From sfr at canb.auug.org.au Wed Dec 7 13:01:05 2005 From: sfr at canb.auug.org.au (Stephen Rothwell) Date: Wed, 7 Dec 2005 13:01:05 +1100 Subject: [PATCH] powerpc: fix for "Update OF address parsers" Message-ID: <20051207130105.44a488c0.sfr@canb.auug.org.au> This patch allows iSeries to build again. It just moves pci_address_to_pio outside the #ifdef CONFIG_PPC_MULTIPLATFORM. Signed-off-by: Stephen Rothwell --- arch/powerpc/kernel/pci_64.c | 28 ++++++++++++++-------------- 1 files changed, 14 insertions(+), 14 deletions(-) Built on iSeries and pSeries and booted on iSeries. -- Cheers, Stephen Rothwell sfr at canb.auug.org.au http://www.canb.auug.org.au/~sfr/ c3485e24b9b4fbd530f28022e6b3f58b206eec74 diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index 0988222..4eb93fc 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -1181,20 +1181,6 @@ void phbs_remap_io(void) remap_bus_range(hose->bus); } -unsigned int pci_address_to_pio(phys_addr_t address) -{ - struct pci_controller *hose, *tmp; - - list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { - if (address >= hose->io_base_phys && - address < (hose->io_base_phys + hose->pci_io_size)) - return (unsigned int)hose->io_base_virt + - (address - hose->io_base_phys); - } - return (unsigned int)-1; -} -EXPORT_SYMBOL_GPL(pci_address_to_pio); - static void __devinit fixup_resource(struct resource *res, struct pci_dev *dev) { struct pci_controller *hose = pci_bus_to_host(dev->bus); @@ -1337,6 +1323,20 @@ struct pci_controller* pci_find_hose_for #endif /* CONFIG_PPC_MULTIPLATFORM */ +unsigned int pci_address_to_pio(phys_addr_t address) +{ + struct pci_controller *hose, *tmp; + + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + if (address >= hose->io_base_phys && + address < 
(hose->io_base_phys + hose->pci_io_size)) + return (unsigned int)hose->io_base_virt + + (address - hose->io_base_phys); + } + return (unsigned int)-1; +} +EXPORT_SYMBOL_GPL(pci_address_to_pio); + #define IOBASE_BRIDGE_NUMBER 0 #define IOBASE_MEMORY 1 -- 0.99.9l From viro at ftp.linux.org.uk Wed Dec 7 13:26:10 2005 From: viro at ftp.linux.org.uk (Al Viro) Date: Wed, 7 Dec 2005 02:26:10 +0000 Subject: [PATCH 02/14] spufs: fix local store page refcounting In-Reply-To: <17302.3696.364669.18755@cargo.ozlabs.ibm.com> References: <20051206035220.097737000@localhost> <200512061118.19633.arnd@arndb.de> <1133869108.7968.1.camel@localhost> <200512061949.33482.arnd@arndb.de> <1133895947.3279.4.camel@localhost> <17301.65082.251692.675360@cargo.ozlabs.ibm.com> <1133905298.8027.13.camel@localhost> <17302.3696.364669.18755@cargo.ozlabs.ibm.com> Message-ID: <20051207022610.GI27946@ftp.linux.org.uk> On Wed, Dec 07, 2005 at 09:19:28AM +1100, Paul Mackerras wrote: > Think about someone changing the VFS layer interface and fixing up all > the filesystems to accommodate that change. That person is doing some > of your work for you, so you want to make it easy for him/her to find > your filesystem. That's the sort of thing I was referring to as > maintenance. FWIW, I think it's not a serious argument. Interface changes => grep time. And that means grep over the tree anyway. > As for changes on the cell-specific side, the people doing those > changes will know where to find it, so it isn't a problem having it in > fs/. > > Having it in fs/ also means that it is more likely that people > familiar with VFS internals will look through your code and comment on > it. I know that can be painful in the short term, but in the long > term it will lead to better code. That's solved by asking for review... 
As far as I'm concerned, the only thing here that looks like a possible reason to move the entire thing is highly unusual semantics of final close and interesting use of VFS interfaces in spu_create(). I.e. it's not that we have a filesystem there. OTOH, if you go looking for analogs as far as unusual interaction with VFS is concerned... net/unix is unlikely to get moved. From paulus at samba.org Wed Dec 7 13:57:14 2005 From: paulus at samba.org (Paul Mackerras) Date: Wed, 7 Dec 2005 13:57:14 +1100 Subject: [PATCH 7/11] powerpc: Fixups for kernel linked at 32 MB In-Reply-To: <20051205003954.6E56168802@ozlabs.org> References: <1133743149.268607.418162138937.qpush@concordia> <20051205003954.6E56168802@ozlabs.org> Message-ID: <17302.20362.490309.877127@cargo.ozlabs.ibm.com> Michael Ellerman writes: > There's a few places where we need to fix things up for the kernel to work > if it's linked at 32MB: > > - platforms/powermac/smp.c > To start secondary cpus on pmac we patch the reset vector, which is fine. > Except if we're above 32MB we don't have enough bits for an absolute branch, > it needs to be relative. A relative branch at 0x100 is only going to get 0x100 bytes further than an absolute branch, and I don't think that's far enough. Did you consider putting the kdump kernel at 24MB rather than 32MB? That would solve this and other branch issues. Paul. 
From paulus at samba.org Wed Dec 7 14:15:09 2005 From: paulus at samba.org (Paul Mackerras) Date: Wed, 7 Dec 2005 14:15:09 +1100 Subject: [PATCH 02/14] spufs: fix local store page refcounting In-Reply-To: <20051207022610.GI27946@ftp.linux.org.uk> References: <20051206035220.097737000@localhost> <200512061118.19633.arnd@arndb.de> <1133869108.7968.1.camel@localhost> <200512061949.33482.arnd@arndb.de> <1133895947.3279.4.camel@localhost> <17301.65082.251692.675360@cargo.ozlabs.ibm.com> <1133905298.8027.13.camel@localhost> <17302.3696.364669.18755@cargo.ozlabs.ibm.com> <20051207022610.GI27946@ftp.linux.org.uk> Message-ID: <17302.21437.608048.64857@cargo.ozlabs.ibm.com> Al Viro writes: > FWIW, I think it's not a serious argument. Interface changes => grep time. > And that means grep over the tree anyway. OK, well, where would you prefer the spufs code to go? > That's solved by asking for review... Could you review the spufs code (i.e. the patches posted by Arnd recently to linuxppc64-dev at ozlabs.org) please? Thanks, Paul. From michael at ellerman.id.au Wed Dec 7 15:38:00 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Tue, 6 Dec 2005 22:38:00 -0600 Subject: [PATCH 7/11] powerpc: Fixups for kernel linked at 32 MB In-Reply-To: <17302.20362.490309.877127@cargo.ozlabs.ibm.com> References: <1133743149.268607.418162138937.qpush@concordia> <20051205003954.6E56168802@ozlabs.org> <17302.20362.490309.877127@cargo.ozlabs.ibm.com> Message-ID: <200512062238.03741.michael@ellerman.id.au> On Tue, 6 Dec 2005 20:57, Paul Mackerras wrote: > Michael Ellerman writes: > > There's a few places where we need to fix things up for the kernel to > > work if it's linked at 32MB: > > > > - platforms/powermac/smp.c > > To start secondary cpus on pmac we patch the reset vector, which is > > fine. Except if we're above 32MB we don't have enough bits for an > > absolute branch, it needs to be relative. 
> > A relative branch at 0x100 is only going to get 0x100 bytes further > than an absolute branch, and I don't think that's far enough. Except we're patching at KERNELBASE + 0x100, so as long as __secondary_start_pmac_0 - KERNELBASE < 32 MB we should be fine. > Did you consider putting the kdump kernel at 24MB rather than 32MB? > That would solve this and other branch issues. No I didn't, but a few people have mentioned it since. I think it's something we could look at - but I'd rather not change it now. The only other "branch issue" I know of is having to use a no-op in the trampoline, is there anything else? cheers -- Michael Ellerman IBM OzLabs email: michael:ellerman.id.au inmsg: mpe:jabber.org wwweb: http://michael.ellerman.id.au phone: +61 2 6212 1183 (tie line 70 21183) We do not inherit the earth from our ancestors, we borrow it from our children. - S.M.A.R.T Person -------------- next part -------------- A non-text attachment was scrubbed... Name: not available Type: application/pgp-signature Size: 189 bytes Desc: not available Url : http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051206/30952dec/attachment.pgp From penberg at cs.helsinki.fi Wed Dec 7 19:21:50 2005 From: penberg at cs.helsinki.fi (Pekka Enberg) Date: Wed, 7 Dec 2005 10:21:50 +0200 Subject: [PATCH 02/14] spufs: fix local store page refcounting In-Reply-To: <17302.21437.608048.64857@cargo.ozlabs.ibm.com> References: <20051206035220.097737000@localhost> <200512061118.19633.arnd@arndb.de> <1133869108.7968.1.camel@localhost> <200512061949.33482.arnd@arndb.de> <1133895947.3279.4.camel@localhost> <17301.65082.251692.675360@cargo.ozlabs.ibm.com> <1133905298.8027.13.camel@localhost> <17302.3696.364669.18755@cargo.ozlabs.ibm.com> <20051207022610.GI27946@ftp.linux.org.uk> <17302.21437.608048.64857@cargo.ozlabs.ibm.com> Message-ID: <84144f020512070021r38188044x54c0b2491ef4a176@mail.gmail.com> Hi Paul, On 12/7/05, Paul Mackerras wrote: > Could you review the spufs code (i.e. 
the patches posted by Arnd > recently to linuxppc64-dev at ozlabs.org) please? Why not post them to LKML? Pekka From viro at ftp.linux.org.uk Wed Dec 7 21:17:08 2005 From: viro at ftp.linux.org.uk (Al Viro) Date: Wed, 7 Dec 2005 10:17:08 +0000 Subject: [PATCH 02/14] spufs: fix local store page refcounting In-Reply-To: <17302.21437.608048.64857@cargo.ozlabs.ibm.com> References: <20051206035220.097737000@localhost> <200512061118.19633.arnd@arndb.de> <1133869108.7968.1.camel@localhost> <200512061949.33482.arnd@arndb.de> <1133895947.3279.4.camel@localhost> <17301.65082.251692.675360@cargo.ozlabs.ibm.com> <1133905298.8027.13.camel@localhost> <17302.3696.364669.18755@cargo.ozlabs.ibm.com> <20051207022610.GI27946@ftp.linux.org.uk> <17302.21437.608048.64857@cargo.ozlabs.ibm.com> Message-ID: <20051207101708.GJ27946@ftp.linux.org.uk> On Wed, Dec 07, 2005 at 02:15:09PM +1100, Paul Mackerras wrote: > Al Viro writes: > > > FWIW, I think it's not a serious argument. Interface changes => grep time. > > And that means grep over the tree anyway. > > OK, well, where would you prefer the spufs code to go? Up to ppc folks, really - I don't see any serious objections to arch/powerpc/ variants; it could go there, it could go to fs/*. Objections along the lines of "it won't be found" are BS - any interface change is going to start with grep over the entire tree anyway. > > That's solved by asking for review... > > Could you review the spufs code (i.e. the patches posted by Arnd > recently to linuxppc64-dev at ozlabs.org) please? If it's what you have in powerpc.git - see comments on IRC yesterday... 
From Jens.Osterkamp at de.ibm.com Wed Dec 7 20:53:12 2005 From: Jens.Osterkamp at de.ibm.com (Jens Osterkamp) Date: Wed, 7 Dec 2005 10:53:12 +0100 Subject: [PATCH 12/14] spidernet: check if firmware was loaded correctly In-Reply-To: <200512061123.40059.arnd@arndb.de> Message-ID: Arnd Bergmann wrote on 12/06/2005 11:23:39 AM: > On Dinsdag 06 Dezember 2005 01:59, Paul Mackerras wrote: > > Arnd Bergmann writes: > > > > > Uploading the device firmware may fail if wrong input data > > > was provided by the user. This checks for the condition. > > > > > > From: Jens.Osterkamp at de.ibm.com > > > Cc: netdev at vger.kernel.org > > > > This one should be sent to Jeff Garzik, along with patches 11, 13 and > > 14. > > Ok. > > Jens, is it ok for you if you send the network driver stuff to > jgarzik at pobox.com, Cc: netdev at vger.kernel.org yourself in the future? Sure, I will do so for our next updates. Jens From rsa at us.ibm.com Thu Dec 8 02:54:59 2005 From: rsa at us.ibm.com (Ryan Arnold) Date: Wed, 07 Dec 2005 09:54:59 -0600 Subject: [RFC PATCH 3/5] CELL bogus_console port to hvc_console backend driver In-Reply-To: <2b19bee9bd90cfee311d8076b026add4@bga.com> References: <43935BB5.9030302@us.ibm.com> <2b19bee9bd90cfee311d8076b026add4@bga.com> Message-ID: <1133970899.10632.11.camel@localhost.localdomain> On Mon, 2005-12-05 at 11:27 -0600, Milton Miller wrote: > > +config HVC_DRIVER > > + bool "PowerPC virtual console front-end support" > > + depends on PPC_PSERIES || PPC_BPA || PPC_RTAS > > + help > > + Users of pSeries machines that want to utilize the hvc console > > front-end > > + module for their backend console driver should select this option. > > + It will automatically be selected if one of the back-end console > > drivers > > + is selected. > > + > > Lets just keep this hidden -- so take out depends (its all generic code) > and just say bool (without any quoted text). The help text could then > be made more generic. 
Good idea, I wasn't aware that a Kconfig option can remain hidden. -- Ryan Arnold IBM Linux Technology Center From galak at kernel.crashing.org Thu Dec 8 03:54:28 2005 From: galak at kernel.crashing.org (Kumar Gala) Date: Wed, 7 Dec 2005 10:54:28 -0600 Subject: RFC: Rev 0.5 Booting the Linux/ppc kernel without Open Firmware In-Reply-To: <20051207001720.GB25533@localhost.localdomain> References: <1133816807.8577.50.camel@cashmere.sps.mot.com> <200512062048.56131.arnd@arndb.de> <20051207001720.GB25533@localhost.localdomain> Message-ID: On Dec 6, 2005, at 6:17 PM, David Gibson wrote: > On Tue, Dec 06, 2005 at 08:48:55PM +0100, Arnd Bergmann wrote: >> On Maandag 05 Dezember 2005 22:06, Jon Loeliger wrote: >>> Included below is a proposed Revision 0.5 of the >>> "Booting the Linux/ppc kernel without Open Firmware" >>> document. This modification primarily extends the >>> Revision 0.4 by adding definitions for OF Nodes that >>> cover the System-On-a-Chip features found on PPC parts. >>> It also generalizes some earlier wording that pertained >>> to only PPC64 parts and covers the new, merged PPC 32 >>> and 64 parts together. Finally, minor typos, style >>> consistency and grammar problems were corrected. >> >> A few points are not clear yet, either because I don't understand the >> document or one it references correctly or because I might have >> different requirements: > > All comments below IMHO, and subject to persuasion otherwise. > >> - Do we need a way to identify the type of soc bus? There are >> different >> standards for this, e.g. PLB4 on PPC440 or the EIB on the Cell BE. >> My initial idea was to have different device-type properties for >> these, >> but I now think that device_type = "soc" makes sense for all of >> them. >> Maybe we could add a model or compatible property for them. > > I think it would be a good idea to have something labelling the > specific type of SOC bus, though I'm not immediately sure where. 
> "model" perhaps, if it rarely has an effect on how to operate the bus. I think this should be optional since it rarely has an effect on usage. >> - It does not really belong into this document, but is related >> anyway: >> how do you want to represent this in Linux? Currently, most of >> these >> would be of_platform_device, but I think it would be good to have >> a new bus_type for it. The advantage would be that you can see the >> devices in /sys/devices/soc@xxx/ even if the driver is not loaded >> and the driver can even be autoloaded by udev. >> Also, which properties should show up in sysfs? All of them or just >> those specified in this document or a subset of them? > > I concur - I believe we already have a bus_type for on-chip devices on > 4xx. Not sure what the 4xx reference is, but we have been using the platform bus in the kernel for "soc" connected devices. I don't see the need to invent a new bus type unless there is a specific reason to. >> - What do we do with pci root devices? They are often physically >> connected >> to the internal CPU bus, so it would make sense to represent them >> this way in the device tree. Should we add them to the >> specification >> here? Would it even work the expected way in Linux? > > The host bridges should sit on the soc bus then, as you suggest (just > as the PCI busses hang off HyperTransport on the G5). I think you > need to refer to the OF docs for how to represent the PCI host bridge > and devices themselves. We need to provide some details on PCI nodes based on the OF docs. Ben and I have talked a little about this. It's mainly about what parts of the OF spec are truly required. We will probably add some additional information that the OF spec doesn't handle for host bridges setup. >> - For some devices, you mandate a model property, for others you >> don't. >> Is this intentional? It might be easier to find the right device >> driver if the match string always contains a model name. 
> > You rarely want to match model name to find a device - generally you > want to match either on "compatible" or "device_type", or possibly > both. > >> - How would I represent nested interrupt controllers? E.g. suppose I >> have a Cell internal interrupt controller on one SOC bus and >> and an external interrupt controller on another SOC bus but have >> that deliver interrupts to the first one. > > Again, I believe this is in the OF docs - interrupt controllers have > an interrupt-parent property IIRC, which gives the phandle of the next > interrupt controller up the chain. Yep, you need to check out the "Interrupt Mapping" OF spec for details. It handles describing the chaining you speak of. However, you will need to provide some "spec" for any properties of the interrupt controllers that you may need. >> - Should it mention nested SOC buses, e.g. a PLB4 bus connected to a >> PLB5 bus? > > Yes. Is there anything special about this? are these PLB4/5 busses software visible? > >> - The title says 'without Open Firmware', but it should also be >> allowed >> to use the same SOC bus layout when using SLOF or some other OF >> implementation, right? > > I guess so. > >> - Also not new in this version, but still: Should there be support >> for >> specifying CPUs with multiple SMT threads? > > Umm.. maybe. - kumar From miltonm at bga.com Thu Dec 8 04:23:39 2005 From: miltonm at bga.com (Milton Miller) Date: Wed, 7 Dec 2005 11:23:39 -0600 Subject: [PATCH 03/14] spufs: Fix oops when spufs module is not loaded Message-ID: > - if (try_module_get(spufs_calls.owner)) { > + if (owner && try_module_get(spufs_calls.owner)) { > try_module_get(owner) to avoid the race (twice) milton From miltonm at bga.com Thu Dec 8 04:23:43 2005 From: miltonm at bga.com (Milton Miller) Date: Wed, 7 Dec 2005 11:23:43 -0600 Subject: [PATCH 05/14] spufs: Improved SPU preemptability. 
Message-ID: <72ba6abba87023a896c2313797ede940@bga.com> > > This patch makes it easier to preempt an SPU context by > having the scheduler hold ctx->state_sema for much shorter > periods of time. > > As part of this restructuring, the control logic for the "run" > operation is moved from arch/ppc64/kernel/spu_base.c to > fs/spufs/file.c. Of course the base retains "bottom half" file.c moved > handlers for class{0,1} irqs. The new run loop will re-acquire > an SPU if preempted. > > From: Mark Nutter > Signed-off-by: Arnd Bergmann From miltonm at bga.com Thu Dec 8 04:24:21 2005 From: miltonm at bga.com (Milton Miller) Date: Wed, 7 Dec 2005 11:24:21 -0600 Subject: [PATCH 12/14] spidernet: check if firmware was loaded correctly Message-ID: <088b3b2b5b1187c6a716102b0201469f@bga.com> On Tue Dec 6 14:52:32 EST 2005, Arnd Bergmann wrote: > Uploading the device firmware may fail if wrong input data > was provided by the user. This checks for the condition. > > From: Jens.Osterkamp at de.ibm.com > Cc: netdev at vger.kernel.org > Signed-off-by: Arnd Bergmann > > Index: linux-2.6.15-rc/drivers/net/spider_net.c > =================================================================== > --- linux-2.6.15-rc.orig/drivers/net/spider_net.c > +++ linux-2.6.15-rc/drivers/net/spider_net.c > @@ -1836,7 +1836,7 @@ spider_net_setup_phy(struct spider_net_c > * spider_net_download_firmware loads the firmware opened by > * spider_net_init_firmware into the adapter. 
> */ > -static void > +static int > spider_net_download_firmware(struct spider_net_card *card, > const struct firmware *firmware) > { > @@ -1857,8 +1857,13 @@ spider_net_download_firmware(struct spid > } > } > > + if (spider_net_read_reg(card, SPIDER_NET_GSINIT)) > + return -EIO; > + > spider_net_write_reg(card, SPIDER_NET_GSINIT, > SPIDER_NET_RUN_SEQ_VALUE); > + > + return 0; > } > > /** > @@ -1909,9 +1914,8 @@ spider_net_init_firmware(struct spider_n > goto out; > } > > - spider_net_download_firmware(card, firmware); > - > - err = 0; > + if (!spider_net_download_firmware(card, firmware)) > + err = 0; Why not assign err to the return of spider_net_download_firmware? > out: > release_firmware(firmware); > > Index: linux-2.6.15-rc/drivers/net/spider_net.h > =================================================================== > --- linux-2.6.15-rc.orig/drivers/net/spider_net.h > +++ linux-2.6.15-rc/drivers/net/spider_net.h > @@ -155,7 +155,7 @@ extern char spider_net_driver_name[]; > /* set this first, then the FRAMENUM_VALUE */ > #define SPIDER_NET_GFXFRAMES_VALUE 0x00000000 > > -#define SPIDER_NET_STOP_SEQ_VALUE 0x00000000 > +#define SPIDER_NET_STOP_SEQ_VALUE 0x007e0000 > #define SPIDER_NET_RUN_SEQ_VALUE 0x0000007e > > #define SPIDER_NET_PHY_CTRL_VALUE 0x00040040 > milton From miltonm at bga.com Thu Dec 8 04:24:45 2005 From: miltonm at bga.com (Milton Miller) Date: Wed, 7 Dec 2005 11:24:45 -0600 Subject: [PATCH 10/14] cell: add iommu support for larger memory Message-ID: <0ee9d47e9c94a42075ef44e387650089@bga.com> On Tue Dec 6 14:52:30 EST 2005, Arnd Bergmann wrote: > Index: linux-2.6.15-rc/arch/powerpc/platforms/cell/iommu.c > =================================================================== > --- linux-2.6.15-rc.orig/arch/powerpc/platforms/cell/iommu.c > +++ linux-2.6.15-rc/arch/powerpc/platforms/cell/iommu.c ... 
> @@ -40,6 +42,7 @@ > #include > #include > #include > +#include > > #include "iommu.h" > > @@ -221,7 +224,7 @@ set_iopt_cache(void __iomem *base, unsig > unsigned long __iomem *tags = base + IOC_PT_CACHE_DIR; > unsigned long __iomem *p = base + IOC_PT_CACHE_REG; > pr_debug("iopt %02lx was v%016lx/t%016lx, store > v%016lx/t%016lx\n", > - index, get_iopt_cache(base, index, &oldtag), oldtag, > val, tag); > + index, get_iopt_cache(base, index, &tag), tag, val, > tag); Assuming get_iopt_cache takes &tag to fill it in, this code is wrong. The order of function argument evaluation is undefined in C, and the compiler can choose to change its order at any time. > - for (address = 0; address < 0x100000000ul; address += > io_page_size) { > - ioste = get_iost_entry(0x10000000000ul, address, > io_page_size); > - if ((address & 0xfffffff) == 0) /* segment start */ > - set_iost_cache(base, address >> 28, ioste); > - index = get_ioc_hash_1way(ioste, address); > + for (real_address = 0, io_address = 0; > + io_address <= map_start + map_size; > + real_address += io_page_size, io_address += io_page_size) > { > + ioste = get_iost_entry(fake_iopt, io_address, > io_page_size); > + if ((real_address & 0xfffffff) == 0) /* segment start > */ > + set_iost_cache(ioc_mmio_base, > + io_address >> 28, ioste); > + index = get_ioc_hash_1way(ioste, io_address); [comment] more magic numbers remain... milton From miltonm at bga.com Thu Dec 8 04:28:31 2005 From: miltonm at bga.com (Milton Miller) Date: Wed, 7 Dec 2005 11:28:31 -0600 Subject: [PATCH 08/14] cell: enable pause(0) in cpu_idle Message-ID: <592f37d568f304c5bc5fbad0285c8cb8@bga.com> Hi Arnd. Quite a few comments on this one. On Tue Dec 6 14:52:28 EST 2005, Arnd Bergmann wrote: > This patch enables support for pause(0) power management state > for the Cell Broadband Processor, which is import for power efficient > operation. 
The pervasive infrastructure will in the future enable > us to introduce more functionality specific to the Cell's > pervasive unit. > > From: Maximino Aguilar > Signed-off-by: Arnd Bergmann > > Index: linux-2.6.15-rc/arch/powerpc/platforms/cell/Makefile > =================================================================== > --- linux-2.6.15-rc.orig/arch/powerpc/platforms/cell/Makefile > +++ linux-2.6.15-rc/arch/powerpc/platforms/cell/Makefile > @@ -1,4 +1,6 @@ > obj-y += interrupt.o iommu.o setup.o spider-pic.o > +obj-y += pervasive.o > + > obj-$(CONFIG_SMP) += smp.o > obj-$(CONFIG_SPU_FS) += spufs/ spu_base.o > builtin-spufs-$(CONFIG_SPU_FS) += spu_syscalls.o > Index: linux-2.6.15-rc/arch/powerpc/platforms/cell/pervasive.c > =================================================================== > --- /dev/null > +++ linux-2.6.15-rc/arch/powerpc/platforms/cell/pervasive.c > @@ -0,0 +1,147 @@ > +/* > + * CBE Pervasive Monitor and Debug > + * > + * (C) Copyright IBM Corporation 2005 > + * > + * Authors: Maximino Aguilar (maguilar at us.ibm.com) > + * Michael N. Day (mnday at us.ibm.com) > + * > + * This program is free software; you can redistribute it and/or > modify > + * it under the terms of the GNU General Public License as published > by > + * the Free Software Foundation; either version 2, or (at your option) > + * any later version. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License > + * along with this program; if not, write to the Free Software > + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
> + */ > + > +#include > +#include > +#include > +#include > +#include > + > +#include > +#include > +#include > +#include > + > +#include "pervasive.h" > + > +struct pmd { > + struct pmd_regs __iomem *regs; > + int power_management_enable; > +}; This name conflicts with the memory management system used throughout the kernel. Please rename. > + > +static DEFINE_PER_CPU(struct pmd, pmd); > + > +void pause_zero(void) > +{ > + unsigned int multi_threading_control; > + unsigned long long machine_state; > + > + /* Reset Thread Run Latch (latch is set in idle.c) */ > + ppc64_runlatch_off(); > + > + if (__get_cpu_var(pmd).power_management_enable) How do you know __get_cpu_var is safe here? because this is only called in the idle loop which is bound? > + { > + /* Disable EE during check for pause */ > + machine_state=mfmsr(); > + machine_state &= ~MSR_EE; > + mtmsrd(machine_state); local_irq_disable() ? > + /* Pause the PU */ > + HMT_low(); > + multi_threading_control = 0; > + mtspr(SPRN_CTRLT,multi_threading_control); > + > + /* Re-enable EE after resuming */ > + machine_state=mfmsr(); > + machine_state |= MSR_EE; > + mtmsrd(machine_state); local_irq_enable() ? > + } > +} > + > +void enable_pause_zero(void * data) > +{ > + unsigned long thread_switch_control; > + unsigned long temp_register; > + struct pmd *pmd; > + > + pmd = &get_cpu_var(pmd); > + > + if (!pmd->regs) > + return; > + > + pr_debug("Power Management: CPU %d\n", smp_processor_id()); > + > + /* Enable Pause(0) control bit */ > + temp_register = in_be64(&pmd->regs->pm_control); > + > + out_be64(&pmd->regs->pm_control, > temp_register|PMD_PAUSE_ZERO_CONTROL); > + > + /* Enable DEC and EE interrupt request */ > + thread_switch_control = mfspr(SPRN_TSC_CELL); > + thread_switch_control |= TSCR_EE_ENABLE | TSCR_EE_BOOST; > + > + if (smp_processor_id()%2) smp_processor_id is software number, and does not necessarily correspond to the hardware thread id. 
Either use the hw version, or better yet, read the PIR (spr 1023?) directly. > + thread_switch_control |= TSC_DEC_ENABLE_1; > + else > + thread_switch_control |= TSC_DEC_ENABLE_0; > + > + mtspr(SPRN_TSC_CELL, thread_switch_control); > + > + pmd->power_management_enable = 1; > + put_cpu_var(pmd); > +} > + > +static struct pmd_regs __iomem *find_pmd_mmio(int cpu) > +{ > + struct device_node *node; > + int node_number = cpu / 2; hmm... so # threads / node hard coded in here ... > + struct pmd_regs __iomem *pmd_mmio_area; > + unsigned long real_address; > + > + for (node = of_find_node_by_type(NULL, "cpu"); node; > + node = of_find_node_by_type(node, "cpu")) { perhaps for (node = NULL; node = of_find(..) ;) or =NULL then while Somewhat long-winded, but ok the way it is. > + if (node_number == *(int *)get_property(node, > "node-id", NULL)) > + break; > + } > + > + if (!node) { > + printk(KERN_WARNING "PMD: CPU %d not found\n", cpu); > + pmd_mmio_area = NULL; > + } else { > + real_address = *(long *)get_property(node, > "pervasive", NULL); > + pr_debug("PMD for CPU %d at %lx\n", cpu, real_address); > + pmd_mmio_area = __ioremap(real_address, 0x1000, > _PAGE_NO_CACHE); > + } > + return pmd_mmio_area; > +} > + > +void __init cell_pervasive_init(void) > +{ > + struct pmd *pmd; > + int cpu; > + > + if (!cpu_has_feature(CPU_FTR_PAUSE_ZERO)) > + return; > + > + for_each_cpu(cpu) { > + pmd = &per_cpu(pmd, cpu); > + pmd->regs = find_pmd_mmio(cpu); > + } O(n**2) find loop, could combine to get O(n) > +} > + > +int __init enable_pause_zero_init(void) > +{ > + on_each_cpu(enable_pause_zero, NULL, 0, 1); > + return 0; > +} > + > +arch_initcall(enable_pause_zero_init); arch_initcall functions should be static > Index: linux-2.6.15-rc/arch/powerpc/platforms/cell/pervasive.h > =================================================================== > --- /dev/null > +++ linux-2.6.15-rc/arch/powerpc/platforms/cell/pervasive.h > @@ -0,0 +1,64 @@ > +/* > + * Cell Pervasive Monitor and Debug 
interface and HW structures > + * > + * (C) Copyright IBM Corporation 2005 > + * > + * Authors: Maximino Aguilar (maguilar at us.ibm.com) > + * David J. Erb (djerb at us.ibm.com) > + * > + * This program is free software; you can redistribute it and/or > modify > + * it under the terms of the GNU General Public License as published > by > + * the Free Software Foundation; either version 2, or (at your option) > + * any later version. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License > + * along with this program; if not, write to the Free Software > + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. > + */ > + > + > +#ifndef PERVASIVE_H > +#define PERVASIVE_H > + > +struct pmd_regs { > + u8 pad_0x0000_0x0800[0x0800 - 0x0000]; /* > 0x0000 */ > + > + /* Thermal Sensor Registers */ > + u64 ts_ctsr1; /* > 0x0800 */ > + u64 ts_ctsr2; /* > 0x0808 */ > + u64 ts_mtsr1; /* > 0x0810 */ > + u64 ts_mtsr2; /* > 0x0818 */ > + u64 ts_itr1; /* > 0x0820 */ > + u64 ts_itr2; /* > 0x0828 */ > + u64 ts_gitr; /* > 0x0830 */ > + u64 ts_isr; /* > 0x0838 */ > + u64 ts_imr; /* > 0x0840 */ > + u64 tm_cr1; /* > 0x0848 */ > + u64 tm_cr2; /* > 0x0850 */ > + u64 tm_simr; /* > 0x0858 */ > + u64 tm_tpr; /* > 0x0860 */ > + u64 tm_str1; /* > 0x0868 */ > + u64 tm_str2; /* > 0x0870 */ > + u64 tm_tsr; /* > 0x0878 */ > + > + /* Power Management */ > + u64 pm_control; /* > 0x0880 */ > +#define PMD_PAUSE_ZERO_CONTROL 0x10000 > + u64 pm_status; /* > 0x0888 */ > + > + /* Time Base Register */ > + u64 tbr; /* > 0x0890 */ > + > + u8 pad_0x0898_0x1000 [0x1000 - 0x0898]; /* > 0x0898 */ > +}; > + > +void __init cell_pervasive_init(void); > +void enable_pause_zero(void *); > +void _pause_zero(void); what is the 
single_underscore _pause_zero() ? Other functions are either arch_initcall or called by initcall in the same file and can be static. > + > +#endif > Index: linux-2.6.15-rc/arch/powerpc/platforms/cell/setup.c > =================================================================== > --- linux-2.6.15-rc.orig/arch/powerpc/platforms/cell/setup.c > +++ linux-2.6.15-rc/arch/powerpc/platforms/cell/setup.c > @@ -49,6 +49,7 @@ > > #include "interrupt.h" > #include "iommu.h" > +#include "pervasive.h" > > #ifdef DEBUG > #define DBG(fmt...) udbg_printf(fmt) > @@ -165,6 +166,7 @@ static void __init cell_setup_arch(void) > init_pci_config_tokens(); > find_and_init_phbs(); > spider_init_IRQ(); > + cell_pervasive_init(); > #ifdef CONFIG_DUMMY_CONSOLE > conswitchp = &dummy_con; > #endif > Index: linux-2.6.15-rc/arch/powerpc/kernel/head_64.S > =================================================================== > --- linux-2.6.15-rc.orig/arch/powerpc/kernel/head_64.S > +++ linux-2.6.15-rc/arch/powerpc/kernel/head_64.S > @@ -383,7 +383,7 @@ label##_common: > \ > .globl __start_interrupts > __start_interrupts: > > - STD_EXCEPTION_PSERIES(0x100, system_reset) > + STD_EXCEPTION_PSERIES(0x100, system_reset_check) > > . = 0x200 > _machine_check_pSeries: > @@ -860,6 +860,31 @@ unrecov_fer: > bl .unrecoverable_exception > b 1b > > +/* This is a new system reset handler for the BE processor. > + * SRR1 stores wake information that must be decoded to determine why > + * the processor was at the system reset handler. > + */ > + > + .align 7 > + .globl system_reset_check_common > +system_reset_check_common: > +BEGIN_FTR_SECTION > + mr r22,r12 /* r12 has SRR1 saved */ > + srwi r22,r22,16 > + andi. 
r22,r22,MSR_WAKEMASK > + cmpwi r22,MSR_WAKEEE > + beq 40f > + cmpwi r22,MSR_WAKEDEC > + beq 42f > + cmpwi r22,MSR_WAKEMT > + beq 43f > +END_FTR_SECTION_IFSET(CPU_FTR_PAUSE_ZERO) > + b system_reset_common > +40: b hardware_interrupt_common > +42: b decrementer_common > +43: EXCEPTION_PROLOG_COMMON(0x100, PACA_EXGEN); > + b fast_exception_return > + Branches to branches that must be in the same file, within the first 64k, currently within 32k. Just make the conditional branches directly to the other routines. This could go inline with system_reset_common, except that it would mean breaking apart the STD_EXCEPTION_COMMON macro for it. Space optimization would then be to put the test for WAKEMT after PROLOG_COMMON at the expense of breaking up the tests. > /* > * Here r13 points to the paca, r9 contains the saved CR, > * SRR0 and SRR1 are saved in r11 and r12, > Index: linux-2.6.15-rc/arch/powerpc/kernel/idle_64.c > =================================================================== > --- linux-2.6.15-rc.orig/arch/powerpc/kernel/idle_64.c > +++ linux-2.6.15-rc/arch/powerpc/kernel/idle_64.c > @@ -40,7 +40,8 @@ void default_idle(void) > if (!need_resched()) { > while (!need_resched() && > !cpu_is_offline(cpu)) { > ppc64_runlatch_off(); > - > + if > (cpu_has_feature(CPU_FTR_PAUSE_ZERO)) > + pause_zero(); We have multiple idle loops and ppc_md.idle_loop to avoid junk like this. Assign the idle-loop based on the cpu feature. Place it in persavisive.c, then you can make pause_zero static, and it will be inline. All better for power (fewer tests and branches). > /* > * Go into low thread priority and > possibly > * low power mode. 
> Index: linux-2.6.15-rc/include/asm-powerpc/cputable.h > =================================================================== > --- linux-2.6.15-rc.orig/include/asm-powerpc/cputable.h > +++ linux-2.6.15-rc/include/asm-powerpc/cputable.h > @@ -106,6 +106,7 @@ extern void do_cpu_ftr_fixups(unsigned l > #define CPU_FTR_LOCKLESS_TLBIE ASM_CONST(0x0000040000000000) > #define CPU_FTR_MMCRA_SIHV ASM_CONST(0x0000080000000000) > #define CPU_FTR_CI_LARGE_PAGE ASM_CONST(0x0000100000000000) > +#define CPU_FTR_PAUSE_ZERO ASM_CONST(0x0000200000000000) > #else > /* ensure on 32b processors the flags are available for compiling but > * don't do anything */ > @@ -305,7 +306,8 @@ enum { > CPU_FTR_MMCRA_SIHV, > CPU_FTRS_CELL = CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | > CPU_FTR_HPTE_TABLE | CPU_FTR_PPCAS_ARCH_V2 | > - CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT, > + CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | > + CPU_FTR_CTRL | CPU_FTR_PAUSE_ZERO, > CPU_FTRS_COMPATIBLE = CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | > CPU_FTR_HPTE_TABLE | CPU_FTR_PPCAS_ARCH_V2, > #endif > Index: linux-2.6.15-rc/include/asm-powerpc/processor.h > =================================================================== > --- linux-2.6.15-rc.orig/include/asm-powerpc/processor.h > +++ linux-2.6.15-rc/include/asm-powerpc/processor.h > @@ -281,6 +281,14 @@ static inline void prefetchw(const void > #define HAVE_ARCH_PICK_MMAP_LAYOUT > #endif > > +#ifdef CONFIG_PPC_CELL > +extern void pause_zero(void); > +#else > +static inline void pause_zero(void) > +{ > +} > +#endif > + and you can stop polluting processor.h with something you only want called in a pinned cpu context from your idle loop. 
> #endif /* __KERNEL__ */ > #endif /* __ASSEMBLY__ */ > #endif /* _ASM_POWERPC_PROCESSOR_H */ > Index: linux-2.6.15-rc/include/asm-powerpc/reg.h > =================================================================== > --- linux-2.6.15-rc.orig/include/asm-powerpc/reg.h > +++ linux-2.6.15-rc/include/asm-powerpc/reg.h > @@ -92,6 +92,15 @@ > #define MSR_RI __MASK(MSR_RI_LG) /* Recoverable > Exception */ > #define MSR_LE __MASK(MSR_LE_LG) /* Little Endian */ > > +/* Wake Events */ > +#define MSR_WAKEMASK 0x0038 > +#define MSR_WAKERESET 0x0038 > +#define MSR_WAKESYSERR 0x0030 > +#define MSR_WAKEEE 0x0020 > +#define MSR_WAKEMT 0x0028 > +#define MSR_WAKEDEC 0x0018 > +#define MSR_WAKETHERM 0x0010 > + > #ifdef CONFIG_PPC64 > #define MSR_ MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_ISF > #define MSR_KERNEL MSR_ | MSR_SF | MSR_HV > @@ -257,9 +266,10 @@ > #define SPRN_HID6 0x3F9 /* BE HID 6 */ > #define HID6_LB (0x0F<<12) /* Concurrent Large Page > Modes */ > #define HID6_DLP (1<<20) /* Disable all large page > modes (4K only) */ > -#define SPRN_TSCR 0x399 /* Thread switch control on BE > */ > -#define SPRN_TTR 0x39A /* Thread switch timeout on BE > */ > -#define TSCR_DEC_ENABLE 0x200000 /* Decrementer > Interrupt */ > +#define SPRN_TSC_CELL 0x399 /* Thread switch control on > Cell */ > +#define SPRN_TTR 0x39A /* Thread switch timeout on > Cell */ > +#define TSC_DEC_ENABLE_0 0x400000 /* Decrementer > Interrupt */ > +#define TSC_DEC_ENABLE_1 0x200000 /* Decrementer > Interrupt */ The prefix should be the name of the register to which they apply and directly under that register. 
> #define TSCR_EE_ENABLE 0x100000 /* External Interrupt > */ > #define TSCR_EE_BOOST 0x080000 /* External Interrupt > Boost */ > #define SPRN_TSC 0x3FD /* Thread switch control on > others */ > milton From miltonm at bga.com Thu Dec 8 04:28:39 2005 From: miltonm at bga.com (Milton Miller) Date: Wed, 7 Dec 2005 11:28:39 -0600 Subject: [PATCH 13/14] spidernet: read firmware from the OF device tree Message-ID: <24364349f5d62b1f71eebc4cb2f3b76e@bga.com> On Tue Dec 6 14:52:33 EST 2005, Arnd Bergmann wrote: > request_firmware() is sometimes problematic, especially > in initramfs, reading the firmware from Open Firmware > is much preferrable. > > We still try to get the firmware from the file system > first, in order to support old SLOF releases and to allow > updates of the spidernet firmware without reflashing > the system. > > From: Jens.Osterkamp at de.ibm.com > Cc: netdev at vger.kernel.org > Signed-off-by: Arnd Bergmann > > Index: linux-2.6.15-rc/drivers/net/spider_net.c > =================================================================== > --- linux-2.6.15-rc.orig/drivers/net/spider_net.c > +++ linux-2.6.15-rc/drivers/net/spider_net.c > @@ -1895,16 +1895,27 @@ spider_net_download_firmware(struct spid > static int > spider_net_init_firmware(struct spider_net_card *card) > { > - const struct firmware *firmware; > + struct firmware *firmware; > + struct device_node *dn; > + u8 *fw_prop; > int err = -EIO; > > - if (request_firmware(&firmware, > + if (request_firmware((const struct firmware **)&firmware, > SPIDER_NET_FIRMWARE_NAME, > &card->pdev->dev) < 0) { > if (netif_msg_probe(card)) > pr_err("Couldn't read in sequencer data file > %s.\n", > SPIDER_NET_FIRMWARE_NAME); > - firmware = NULL; > - goto out; > + > + dn = pci_device_to_OF_node(card->pdev); > + if (!dn) > + goto out; > + > + fw_prop = (u8 *)get_property(dn, "firmware", NULL); > + if (!fw_prop) > + goto out; > + > + memcpy(firmware->data, fw_prop, 6 * > SPIDER_NET_FIRMWARE_LEN * sizeof(u32)); > + 
firmware->size = 6 * SPIDER_NET_FIRMWARE_LEN * > sizeof(u32); > } > > if (firmware->size != 6 * SPIDER_NET_FIRMWARE_LEN * > sizeof(u32)) { > > A person might think that FIRMWARE_LEN was the desired length. Or at least there would be something defined to that long expression. Also, how about actually getting the size of the property (that third NULL argument is to return that), and checking if it is the desired size? milton From kravetz at us.ibm.com Thu Dec 8 08:07:23 2005 From: kravetz at us.ibm.com (Mike Kravetz) Date: Wed, 7 Dec 2005 13:07:23 -0800 Subject: [PATCH] boot failures on numa if no memory on node Message-ID: <20051207210723.GA8970@monkey.ibm.com> This bug exists in the current code and prevents machines from booting with numa enabled if there is a node that does not contain memory. Workaround is to boot with 'numa=off'. Looks like a simple type. Signed-off-by: Mike Kravetz diff -Naupr linux-2.6.15-rc5-git1/arch/powerpc/mm/numa.c linux-2.6.15-rc5-git1.work/arch/powerpc/mm/numa.c --- linux-2.6.15-rc5-git1/arch/powerpc/mm/numa.c 2005-12-04 05:10:42.000000000 +0000 +++ linux-2.6.15-rc5-git1.work/arch/powerpc/mm/numa.c 2005-12-07 20:49:23.000000000 +0000 @@ -125,7 +125,7 @@ void __init get_region(unsigned int nid, /* We didnt find a matching region, return start/end as 0 */ if (*start_pfn == -1UL) - start_pfn = 0; + *start_pfn = 0; } static inline void map_cpu_to_node(int cpu, int node) From anton at samba.org Thu Dec 8 08:01:31 2005 From: anton at samba.org (Anton Blanchard) Date: Thu, 8 Dec 2005 08:01:31 +1100 Subject: [PATCH] boot failures on numa if no memory on node In-Reply-To: <20051207210723.GA8970@monkey.ibm.com> References: <20051207210723.GA8970@monkey.ibm.com> Message-ID: <20051207210130.GA23641@krispykreme> Hi, Thanks Mike, that stupid bug was my fault. Looks good. Anton > This bug exists in the current code and prevents machines from booting > with numa enabled if there is a node that does not contain memory. 
> Workaround is to boot with 'numa=off'. Looks like a simple type. > > Signed-off-by: Mike Kravetz > > diff -Naupr linux-2.6.15-rc5-git1/arch/powerpc/mm/numa.c linux-2.6.15-rc5-git1.work/arch/powerpc/mm/numa.c > --- linux-2.6.15-rc5-git1/arch/powerpc/mm/numa.c 2005-12-04 05:10:42.000000000 +0000 > +++ linux-2.6.15-rc5-git1.work/arch/powerpc/mm/numa.c 2005-12-07 20:49:23.000000000 +0000 > @@ -125,7 +125,7 @@ void __init get_region(unsigned int nid, > > /* We didnt find a matching region, return start/end as 0 */ > if (*start_pfn == -1UL) > - start_pfn = 0; > + *start_pfn = 0; > } > > static inline void map_cpu_to_node(int cpu, int node) From Jens.Osterkamp at de.ibm.com Thu Dec 8 08:20:52 2005 From: Jens.Osterkamp at de.ibm.com (Jens Osterkamp) Date: Wed, 7 Dec 2005 22:20:52 +0100 Subject: [PATCH 12/14] spidernet: check if firmware was loaded correctly In-Reply-To: <088b3b2b5b1187c6a716102b0201469f@bga.com> Message-ID: Milton Miller wrote on 12/07/2005 06:24:21 PM: > > - spider_net_download_firmware(card, firmware); > > - > > - err = 0; > > + if (!spider_net_download_firmware(card, firmware)) > > + err = 0; > > Why not assign err to the return of spider_net_download_firmware? You are right, I will correct this. Jens From mostrows at watson.ibm.com Thu Dec 8 10:14:43 2005 From: mostrows at watson.ibm.com (Michal Ostrowski) Date: Wed, 07 Dec 2005 18:14:43 -0500 Subject: [PATCH] Fix windfarm model-id table. Message-ID: <1133997283.28136.57.camel@brick.watson.ibm.com> .model_id fields of wf_smu_sys_all_params should match the model ID they are supposed to represent (as commented). Fixes windfarm on iMac 8,1. 
Signed-off-by: Michal Ostrowski --- drivers/macintosh/windfarm_pm81.c | 4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) 645a833c9e40e76b56052af16c7ba96259a68163 diff --git a/drivers/macintosh/windfarm_pm81.c b/drivers/macintosh/windfarm_pm81.c index 322c74b..80ddf97 100644 --- a/drivers/macintosh/windfarm_pm81.c +++ b/drivers/macintosh/windfarm_pm81.c @@ -207,7 +207,7 @@ static struct wf_smu_sys_fans_param wf_s }, /* Model ID 3 */ { - .model_id = 2, + .model_id = 3, .itarget = 0x350000, .gd = 0x08e00000, .gp = 0x00566666, @@ -219,7 +219,7 @@ static struct wf_smu_sys_fans_param wf_s }, /* Model ID 5 */ { - .model_id = 2, + .model_id = 5, .itarget = 0x3a0000, .gd = 0x15400000, .gp = 0x00233333, -- 0.99.9.GIT From canticle400 at gmail.com Thu Dec 8 14:37:18 2005 From: canticle400 at gmail.com (Dennis Chua) Date: Wed, 7 Dec 2005 22:37:18 -0500 Subject: Linux+PPC64, self-modifying code Message-ID: Hello. Can anyone comment on the feasibility of writing self-modifying code on Linux PPC64? Disregarding the motivations behind this, is it possible for an executable program to - access the instruction opcode of one of its functions. - overwrite/alter the function opcode and to do this all during runtime? Any insight is much appreciated! Thank you. -------------- next part -------------- An HTML attachment was scrubbed... URL: http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051207/af36e207/attachment.htm From hollis at penguinppc.org Thu Dec 8 15:10:36 2005 From: hollis at penguinppc.org (Hollis Blanchard) Date: Wed, 7 Dec 2005 22:10:36 -0600 Subject: Linux+PPC64, self-modifying code In-Reply-To: References: Message-ID: <9a87b582891390d1bbf5d00bee5488d0@penguinppc.org> On Dec 7, 2005, at 9:37 PM, Dennis Chua wrote: > > Can anyone comment on the feasibility of writing self-modifying > code on Linux PPC64? Disregarding the motivations behind this, > is it possible for an executable program to > > - access the instruction opcode of one of its functions. 
> - overwrite/alter the function opcode > > and to do this all during runtime? It's quite feasible. Many projects, including the kernel, do this. > Any insight is much appreciated! Thank you. The main trick is that most PowerPC have L1 instruction caches that are incoherent with the L1 data caches. In other words, when you write the new code to memory, it lands in the dcache, and then the icache has stale instructions which it will happily execute. The architected sequence you must execute for self-modifying code is documented, I believe in Book III of the PowerPC Architecture (see http://penguinppc.org/dev/#library). You basically flush the affected memory out of the L1 dcache, sync to make sure all that finished, invalidate the previous icache contents, then isync to discard partially-decoded instructions the processor may have already fetched out of the icache. See the Architecture book for the exact instructions... -Hollis From david at gibson.dropbear.id.au Thu Dec 8 15:25:51 2005 From: david at gibson.dropbear.id.au (David Gibson) Date: Thu, 8 Dec 2005 15:25:51 +1100 Subject: Linux+PPC64, self-modifying code In-Reply-To: <9a87b582891390d1bbf5d00bee5488d0@penguinppc.org> References: <9a87b582891390d1bbf5d00bee5488d0@penguinppc.org> Message-ID: <20051208042551.GA30681@localhost.localdomain> On Wed, Dec 07, 2005 at 10:10:36PM -0600, Hollis Blanchard wrote: > On Dec 7, 2005, at 9:37 PM, Dennis Chua wrote: > > > > Can anyone comment on the feasibility of writing self-modifying > > code on Linux PPC64? Disregarding the motivations behind this, > > is it possible for an executable program to > > > > - access the instruction opcode of one of its functions. > > - overwrite/alter the function opcode > > > > and to do this all during runtime? > > It's quite feasible. Many projects, including the kernel, do this. > > > Any insight is much appreciated! Thank you. 
> > The main trick is that most PowerPC have L1 instruction caches that are > incoherent with the L1 data caches. In other words, when you write the > new code to memory, it lands in the dcache, and then the icache has > stale instructions which it will happily execute. > > The architected sequence you must execute for self-modifying code is > documented, I believe in Book III of the PowerPC Architecture (see > http://penguinppc.org/dev/#library). You basically flush the affected > memory out of the L1 dcache, sync to make sure all that finished, > invalidate the previous icache contents, then isync to discard > partially-decoded instructions the processor may have already fetched > out of the icache. See the Architecture book for the exact > instructions... I believe dcbst sync icbi isync is what you need. -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson From hollis at penguinppc.org Thu Dec 8 16:07:50 2005 From: hollis at penguinppc.org (Hollis Blanchard) Date: Wed, 7 Dec 2005 23:07:50 -0600 Subject: Linux+PPC64, self-modifying code In-Reply-To: <9a87b582891390d1bbf5d00bee5488d0@penguinppc.org> References: <9a87b582891390d1bbf5d00bee5488d0@penguinppc.org> Message-ID: <88e1d310bda948bdf8ab8d1c6efcaf76@penguinppc.org> On Dec 7, 2005, at 10:10 PM, Hollis Blanchard wrote: > > The architected sequence you must execute for self-modifying code is > documented, I believe in Book III of the PowerPC Architecture (see > http://penguinppc.org/dev/#library). Correction: Book II, section 1.8 in the version 2.02 architecture PDFs (which are what's on the web). 
-Hollis From benh at kernel.crashing.org Thu Dec 8 16:51:44 2005 From: benh at kernel.crashing.org (Benjamin Herrenschmidt) Date: Thu, 08 Dec 2005 16:51:44 +1100 Subject: [PATCH] powerpc: Fix a huge page bug Message-ID: <1134021104.7168.106.camel@gaston> The 64k pages patch changed the meaning of one argument passed to the low level hash functions (from "large" it became "psize" or page size index), but one of the call sites wasn't properly updated, causing potential random weird problems with huge pages. This fixes it. Signed-off-by: Benjamin Herrenschmidt --- This is a candidate for 2.6.15 Index: linux-work/arch/powerpc/mm/hugetlbpage.c =================================================================== --- linux-work.orig/arch/powerpc/mm/hugetlbpage.c 2005-11-28 11:04:49.000000000 +1100 +++ linux-work/arch/powerpc/mm/hugetlbpage.c 2005-12-08 16:42:41.000000000 +1100 @@ -703,7 +703,8 @@ int hash_huge_page(struct mm_struct *mm, slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; slot += (old_pte & _PAGE_F_GIX) >> 12; - if (ppc_md.hpte_updatepp(slot, rflags, va, 1, local) == -1) + if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize, + local) == -1) old_pte &= ~_PAGE_HPTEFLAGS; } From benh at kernel.crashing.org Thu Dec 8 16:53:34 2005 From: benh at kernel.crashing.org (Benjamin Herrenschmidt) Date: Thu, 08 Dec 2005 16:53:34 +1100 Subject: [PATCH] powerpc: Remove debug code in hash path Message-ID: <1134021215.7168.109.camel@gaston> Some debug code wasn't properly removed from the initial 64k pages patch, and while it's harmless, it's also significantly slowing down a very hot code path, thus it should really be removed. 
Signed-off-by: Benjamin Herrenschmidt --- This is a candidate for 2.6.15 Index: linux-work/arch/powerpc/platforms/pseries/lpar.c =================================================================== --- linux-work.orig/arch/powerpc/platforms/pseries/lpar.c 2005-11-09 11:49:03.000000000 +1100 +++ linux-work/arch/powerpc/platforms/pseries/lpar.c 2005-12-08 16:43:40.000000000 +1100 @@ -298,18 +298,6 @@ long pSeries_lpar_hpte_insert(unsigned l if (!(vflags & HPTE_V_BOLTED)) DBG_LOW(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r); -#if 1 - { - int i; - for (i=0;i<8;i++) { - unsigned long w0, w1; - plpar_pte_read(0, hpte_group, &w0, &w1); - BUG_ON (HPTE_V_COMPARE(hpte_v, w0) - && (w0 & HPTE_V_VALID)); - } - } -#endif - /* Now fill in the actual HPTE */ /* Set CEC cookie to 0 */ /* Zero page = 0 */ From paulus at samba.org Thu Dec 8 17:18:58 2005 From: paulus at samba.org (Paul Mackerras) Date: Thu, 8 Dec 2005 17:18:58 +1100 Subject: [PATCH 00/14] Cell updates for powerpc.git In-Reply-To: <20051206035220.097737000@localhost> References: <20051206035220.097737000@localhost> Message-ID: <17303.53330.215784.545952@cargo.ozlabs.ibm.com> Arnd Bergmann writes: > This is my current set of updates related to the cell platforms. I have put patches 1..7 and 9 into the powerpc.git tree. Please address Milton's comments on patches 3, 5, 8 and 10. Thanks, Paul. From paulus at samba.org Thu Dec 8 17:22:52 2005 From: paulus at samba.org (Paul Mackerras) Date: Thu, 8 Dec 2005 17:22:52 +1100 Subject: patches in powerpc.git tree Message-ID: <17303.53564.160668.376061@cargo.ozlabs.ibm.com> Here is a list of patches currently in the powerpc.git tree that aren't already in Linus' tree. Paul. 
Adrian Bunk: PPC_PREP: remove unneeded exports Andy Whitcroft: powerpc: powermac adb fix dependency on btext_drawchar powerpc: powermac adb fix udbg_adb_use_btext warning powerpc32: clean up available memory models powerpc32: fix definition of distribute_irqs Arnd Bergmann: spufs: The SPU file system, base spufs: cooperative scheduler support spufs: Make all exports GPL-only spufs: fix local store page refcounting spufs: Fix oops when spufs module is not loaded spufs: Turn off debugging output spufs: Improved SPU preemptability. spufs: Improved SPU preemptability [part 2]. spufs: fix mailbox polling cell: add platform detection code Benjamin Herrenschmidt: powerpc: Merge align.c (#2) powerpc: Add OF address parsing code (#2) powerpc: serial port discovery (#2) powerpc: Unify udbg (#2) powerpc: Add back support for booting from BootX (#2) powerpc: convert macio_asic to use prom_parse powerpc: Fix g5 build with xmon powerpc: More serial probe fixes (#2) powerpc: udbg updates powerpc: Update OF address parsers powerpc: Fix a huge page bug powerpc: Remove debug code in hash path David Gibson: powerpc: Remove imalloc.h powerpc: Make hugepage mappings respect hint addresses is_aligned_hugepage_range() cleanup powerpc: Remove ItLpRegSave area from the paca powerpc: Remove some unneeded fields from the paca David Woodhouse: syscall entry/exit revamp ppc64 syscall_exit_work: call the save_nvgprs function, not its descriptor. 
powerpc: serial port discovery: cope with broken firmware Save NVGPRS in 32-bit signal frame Fix code that saves NVGPRS in 32-bit signal frame ppc: Make ARCH=ppc build again with new syscall path Heiko J Schick: powerpc: IBMEBUS bus support Hugh Dickins: mm: powerpc ptlock comments mm: powerpc init_mm without ptlock Kumar Gala: powerpc: moved ipic code to arch/powerpc powerpc: Add support for building uImages powerpc: Fix suboptimal uImage target linas: powerpc/pseries: dlpar-add crash on null pointer deref powerpc: minor cleanup of void ptr deref Linas Vepstas: powerpc: PCI hotplug common code elimination powerpc: make pcibios_claim_one_bus available to other code powerpc: migrate common PCI hotplug code PCI Error Recovery: header file patch powerpc: PCI Error Recovery: PPC64 core recovery routines powerpc: Split out PCI address cache to its own file powerpc: Add "partitionable endpoint" support powerpc: remove bogus printk powerpc: Remove duplicate code powerpc: bugfix: fill in uninitialized field powerpc: Use PE configuration address consistently powerpc: set up the RTAS token just like the rest of them. powerpc: Don't continue with PCI Error recovery if slot reset failed. 
powerpc: handle multifunction PCI devices properly powerpc: IOMMU: don't ioremap null addresses powerpc: Save device BARs much earlier in the boot sequence powerpc: get rid of per_cpu EEH counters Marcelo Tosatti: ppc32: m8xx watchdog update powerpc/8xx: Fix m8xx_wdt issues Mark Nutter: spufs: switchable spu contexts kernel-side context switch code for spufs spufs: add spu-side context switch code Michael Ellerman: powerpc: Merge kexec powerpc: Propagate regs through to machine_crash_shutdown powerpc: Add a is_kernel_addr() macro powerpc: Separate usage of KERNELBASE and PAGE_OFFSET powerpc: Add CONFIG_CRASH_DUMP powerpc: Create a trampoline for the fwnmi vectors powerpc: Reroute interrupts from 0 + offset to PHYSICAL_START + offset powerpc: Fixups for kernel linked at 32 MB powerpc: Add arch dependent basic infrastructure for Kdump. powerpc: Parse crashkernel= parameter in first kernel powerpc: Add arch-dependent copy_oldmem_page powerpc: Add support for "linux,usable-memory" on memory nodes Michal Ostrowski: powerpc/pseries: Fix TCE building with 64k pagesize Mike Kravetz: Remove SPAN_OTHER_NODES config definition powerpc: Minor numa memory code cleanup powerpc: Minor numa memory code cleanup powerpc: numa placement for dynamically added memory powerpc/pseries: boot failures on numa if no memory on node Olaf Hering: powerpc: correct the NR_CPUS description text Olof Johansson: powerpc: remove redundant code in stab init Otavio Salvador: ppc: removed unused variable i from code. 
Paul Mackerras: powerpc: Update __NR_syscalls to account for SPU syscalls ppc: remove duplicate bseip.h powerpc: Fix up some compile errors in the PCI error recovery code powerpc/pseries: Optimize IOMMU setup ppc: Build in all three of powermac, PREP and CHRP support Revert "powerpc: Minor numa memory code cleanup" powerpc: Fix typo in head_64.S Stephen Rothwell: powerpc: remove arch/powerpc/include hack for 64 bit powerpc: cleanup iseries irq.c powerpc: use end_IRQ for iseries irqs powerpc: partly merge iseries do_IRQ powerpc: reduce include in irq.c powerpc: more iseries irq work powerpc: fix for "Update OF address parsers" From sfr at canb.auug.org.au Thu Dec 8 17:48:20 2005 From: sfr at canb.auug.org.au (Stephen Rothwell) Date: Thu, 8 Dec 2005 17:48:20 +1100 Subject: patches in powerpc.git tree In-Reply-To: <17303.53564.160668.376061@cargo.ozlabs.ibm.com> References: <17303.53564.160668.376061@cargo.ozlabs.ibm.com> Message-ID: <20051208174820.728a4a7b.sfr@canb.auug.org.au> On Thu, 8 Dec 2005 17:22:52 +1100 Paul Mackerras wrote: > > Stephen Rothwell: > powerpc: remove arch/powerpc/include hack for 64 bit That one is in Linus' tree ... -- Cheers, Stephen Rothwell sfr at canb.auug.org.au http://www.canb.auug.org.au/~sfr/ -------------- next part -------------- A non-text attachment was scrubbed... Name: not available Type: application/pgp-signature Size: 189 bytes Desc: not available Url : http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051208/055e16b3/attachment.pgp From dwmw2 at infradead.org Thu Dec 8 19:25:19 2005 From: dwmw2 at infradead.org (David Woodhouse) Date: Thu, 08 Dec 2005 09:25:19 +0100 Subject: instantiate_rtas on Cell sim fails... Message-ID: <1134030320.19711.51.camel@localhost.localdomain> We never used to check for != 0; we used to check for == PROM_ERROR instead. And on mambo we get 1, not 0. This makes it work again, but is it the sim at fault, or the kernel? 
--- linux-2.6.14/arch/powerpc/kernel/prom_init.c~ 2005-12-07 23:33:20.000000000 +0100 +++ linux-2.6.14/arch/powerpc/kernel/prom_init.c 2005-12-07 23:33:38.000000000 +0100 @@ -1051,7 +1051,7 @@ static void __init prom_instantiate_rtas if (call_prom_ret("call-method", 3, 2, &entry, ADDR("instantiate-rtas"), - rtas_inst, base) != 0 + rtas_inst, base) == PROM_ERROR || entry == 0) { prom_printf(" failed\n"); return; -- dwmw2 From dwmw2 at infradead.org Thu Dec 8 21:31:10 2005 From: dwmw2 at infradead.org (David Woodhouse) Date: Thu, 08 Dec 2005 11:31:10 +0100 Subject: [RFC PATCH 6/5] CELL rtas console port to hvc_console backend driver In-Reply-To: <43935B9C.5020503@us.ibm.com> References: <43935B9C.5020503@us.ibm.com> Message-ID: <1134037870.19711.57.camel@localhost.localdomain> --- linux-2.6.14/drivers/char/hvc_rtas.c~ 2005-12-07 18:12:59.000000000 +0100 +++ linux-2.6.14/drivers/char/hvc_rtas.c 2005-12-07 18:15:39.000000000 +0100 @@ -0,0 +1,161 @@ +/* + * IBM RTAS driver interface to hvc_console.c + * + * (C) Copyright IBM Corporation 2001-2005 + * (C) Copyright Red Hat, Inc. 2005 + * + * Author(s): Maximino Augilar + * : Ryan S. Arnold + * : Utz Bacher + * : David Woodhouse + * + * inspired by drivers/char/hvc_console.c + * written by Anton Blanchard and Paul Mackerras + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include "hvc_console.h" + +static uint32_t hvc_rtas_vtermno = 0; +struct hvc_struct *hvc_rtas_dev; + +#define RTASCONS_PUT_ATTEMPTS 16 + +static int rtascons_put_char_token = -1; +static int rtascons_get_char_token = -1; +static int rtascons_put_delay; + +static inline int hvc_rtas_write_console(uint32_t vtermno, const char *buf, int count) +{ + int result = 0; + int attempts = RTASCONS_PUT_ATTEMPTS; + int done = 0; + + /* if there is more than one character to be displayed, wait a bit */ + for (; done < count && attempts; udelay(rtascons_put_delay)) { + attempts--; + result = rtas_call(rtascons_put_char_token, 1, 1, NULL, buf[done]); + + if (!result) { + attempts = RTASCONS_PUT_ATTEMPTS; + done++; + } + } + /* the calling routine expects to receive the number of bytes sent */ + return done?:result; +} + +static inline int rtascons_get_char(void) +{ + int result; + + if (rtas_call(rtascons_get_char_token, 0, 2, &result)) + result = -1; + + return result; +} + +static int hvc_rtas_read_console(uint32_t vtermno, char *buf, int count) +{ + unsigned long got; + int c; + int i; + + for (got = 0, i = 0; i < count; i++) { + + if (( c = rtascons_get_char() ) != -1) { + buf[i] = c; + ++got; + } + else + break; + } + return got; +} + +static struct hv_ops hvc_rtas_get_put_ops = { + .get_chars = hvc_rtas_read_console, + .put_chars = hvc_rtas_write_console, +}; + +static int hvc_rtas_init(void) +{ + struct hvc_struct *hp; + + if (rtascons_put_char_token == -1) + rtascons_put_char_token = rtas_token("put-term-char"); + if (rtascons_put_char_token == -1) + return -EIO; + + if (rtascons_get_char_token == -1) + rtascons_get_char_token = rtas_token("get-term-char"); + if (rtascons_get_char_token == -1) + 
return -EIO; + + if (__onsim()) + rtascons_put_delay = 0; + else + rtascons_put_delay = 100; + + BUG_ON(hvc_rtas_dev); + + /* Allocate an hvc_struct for the console device we instantiated + * earlier. Save off hp so that we can return it on exit */ + hp = hvc_alloc(hvc_rtas_vtermno, NO_IRQ, &hvc_rtas_get_put_ops); + if (IS_ERR(hp)) + return PTR_ERR(hp); + hvc_rtas_dev = hp; + return 0; +} +module_init(hvc_rtas_init); + +/* This will tear down the tty portion of the driver */ +static void __exit hvc_rtas_exit(void) +{ + struct hvc_struct *hp_safe; + /* Hopefully this isn't premature */ + if (!hvc_rtas_dev) + return; + + hp_safe = hvc_rtas_dev; + hvc_rtas_dev = NULL; + + /* Really the fun isn't over until the worker thread breaks down and the + * tty cleans up */ + hvc_remove(hp_safe); +} +module_exit(hvc_rtas_exit); /* before drivers/char/hvc_console.c */ + +/* This will happen prior to module init. There is no tty at this time? */ +static int hvc_rtas_console_init(void) +{ + rtascons_put_char_token = rtas_token("put-term-char"); + if (rtascons_put_char_token == -1) + return -EIO; + rtascons_get_char_token = rtas_token("get-term-char"); + if (rtascons_get_char_token == -1) + return -EIO; + + hvc_instantiate(hvc_rtas_vtermno, 0, &hvc_rtas_get_put_ops ); + return 0; +} +console_initcall(hvc_rtas_console_init); --- linux-2.6.14/drivers/char/Makefile~ 2005-12-07 17:47:05.000000000 +0100 +++ linux-2.6.14/drivers/char/Makefile 2005-12-07 18:12:07.000000000 +0100 @@ -43,6 +43,7 @@ obj-$(CONFIG_RIO) += rio/ generic_seria obj-$(CONFIG_HVC_DRIVER) += hvc_console.o obj-$(CONFIG_HVC_CONSOLE) += hvc_vio.o hvsi.o obj-$(CONFIG_HVC_FSS) += hvc_fss.o +obj-$(CONFIG_HVC_RTAS) += hvc_rtas.o obj-$(CONFIG_RAW_DRIVER) += raw.o obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o obj-$(CONFIG_MMTIMER) += mmtimer.o --- linux-2.6.14/drivers/char/Kconfig~ 2005-12-07 17:47:05.000000000 +0100 +++ linux-2.6.14/drivers/char/Kconfig 2005-12-07 18:17:14.000000000 +0100 @@ -575,6 +575,13 @@ config HVC_FSS 
IBM Full System Simulator Console device driver which makes use of the HVC_DRIVER front end. +config HVC_RTAS + bool "IBM RTAS Console support" + depends on PPC_RTAS + select HVC_DRIVER + help + IBM Console device driver which makes use of RTAS + config HVCS tristate "IBM Hypervisor Virtual Console Server support" depends on PPC_PSERIES -- dwmw2 From paulus at samba.org Thu Dec 8 22:48:45 2005 From: paulus at samba.org (Paul Mackerras) Date: Thu, 8 Dec 2005 22:48:45 +1100 Subject: instantiate_rtas on Cell sim fails... In-Reply-To: <1134030320.19711.51.camel@localhost.localdomain> References: <1134030320.19711.51.camel@localhost.localdomain> Message-ID: <17304.7581.151784.735540@cargo.ozlabs.ibm.com> David Woodhouse writes: > We never used to check for != 0; we used to check for == PROM_ERROR > instead. And on mambo we get 1, not 0. This makes it work again, but is > it the sim at fault, or the kernel? > > --- linux-2.6.14/arch/powerpc/kernel/prom_init.c~ 2005-12-07 23:33:20.000000000 +0100 > +++ linux-2.6.14/arch/powerpc/kernel/prom_init.c 2005-12-07 23:33:38.000000000 +0100 > @@ -1051,7 +1051,7 @@ static void __init prom_instantiate_rtas > > if (call_prom_ret("call-method", 3, 2, &entry, > ADDR("instantiate-rtas"), > - rtas_inst, base) != 0 > + rtas_inst, base) == PROM_ERROR The call-method function is supposed to execute the named method inside a catch. The first return value, which is what call_prom_ret returns, is the result from catch. Catch returns false (i.e. 0) if there was no throw call, or a non-zero error code if an error was signalled with throw. This is from IEEE 1275. So I think that != 0 is correct and thus sim is at fault, unless of course the forth code for instantiate-rtas is in fact calling throw for some reason, in which case we need to find out what error the sim firmware is detecting. Paul. 
From olof at lixom.net Fri Dec 9 12:40:17 2005 From: olof at lixom.net (Olof Johansson) Date: Thu, 8 Dec 2005 19:40:17 -0600 Subject: [PATCH] powerpc: Set cache info defaults Message-ID: <20051209014017.GD1082@pb15.lixom.net> Hi, I would like to see this in 2.6.15, please apply. --- Cache info is setup by walking the device tree in initialize_cache_info(). However, icache_flush_range might be called before that, in slb_initialize()->patch_slb_encoding, which modifies the load immediate instructions used with SLB fault code. Not only that, but depending on memory layout, we might take SLB faults during unflatten_device_tree. So that fault will load an SLB entry that might not contain the right LLP flags for the segment. Either we can walk the flattened device tree to setup cache info, or we can pick the known defaults that are known to work. Doing it in the flattened device tree is hairier since we need to know the machine type to know what property to look for, etc, etc. For now, it's just easier to go with the defaults. Worst thing that happens from it is that we might waste a few cycles doing too small dcbst/icbi increments. Signed-off-by: Olof Johansson Index: 2.6/arch/powerpc/kernel/setup_64.c =================================================================== --- 2.6.orig/arch/powerpc/kernel/setup_64.c 2005-12-08 17:17:59.000000000 -0600 +++ 2.6/arch/powerpc/kernel/setup_64.c 2005-12-08 19:35:15.000000000 -0600 @@ -106,7 +106,15 @@ int boot_cpuid_phys = 0; dev_t boot_dev; u64 ppc64_pft_size; -struct ppc64_caches ppc64_caches; +/* Pick defaults since we might want to patch instructions + * before we've read this from the device tree. 
+ */ +struct ppc64_caches ppc64_caches = { + .dline_size = 0x80, + .log_dline_size = 7, + .iline_size = 0x80, + .log_iline_size = 7 +}; EXPORT_SYMBOL_GPL(ppc64_caches); /* From michael at ellerman.id.au Fri Dec 9 12:57:20 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Thu, 8 Dec 2005 19:57:20 -0600 Subject: [PATCH] powerpc: Set cache info defaults In-Reply-To: <20051209014017.GD1082@pb15.lixom.net> References: <20051209014017.GD1082@pb15.lixom.net> Message-ID: <200512081957.24861.michael@ellerman.id.au> On Thu, 8 Dec 2005 19:40, Olof Johansson wrote: > Cache info is setup by walking the device tree in initialize_cache_info(). > However, icache_flush_range might be called before that, in > slb_initialize()->patch_slb_encoding, which modifies the load immediate > instructions used with SLB fault code. > > Not only that, but depending on memory layout, we might take SLB faults > during unflatten_device_tree. So that fault will load an SLB entry that > might not contain the right LLP flags for the segment. > > Either we can walk the flattened device tree to setup cache info, or > we can pick the known defaults that are known to work. Doing it in the > flattened device tree is hairier since we need to know the machine type > to know what property to look for, etc, etc. > > For now, it's just easier to go with the defaults. Worst thing that > happens from it is that we might waste a few cycles doing too small > dcbst/icbi increments. This is cool. I had to hand-code the sync in one of my kdump patches exactly because it was too early to call flush_icache_range(). And I got it wrong the first time :P cheers -- Michael Ellerman IBM OzLabs email: michael:ellerman.id.au inmsg: mpe:jabber.org wwweb: http://michael.ellerman.id.au phone: +61 2 6212 1183 (tie line 70 21183) We do not inherit the earth from our ancestors, we borrow it from our children. - S.M.A.R.T Person -------------- next part -------------- A non-text attachment was scrubbed... 
Name: not available Type: application/pgp-signature Size: 189 bytes Desc: not available Url : http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20051208/81dd418c/attachment.pgp From tom_gall at vnet.ibm.com Fri Dec 9 13:13:54 2005 From: tom_gall at vnet.ibm.com (Tom Gall) Date: Thu, 8 Dec 2005 20:13:54 -0600 (CST) Subject: [PATCH] vDSO for ppc/ppc64 submission Message-ID: Greetings, Enclosed is the patch for ppc/ppc64 vDSO support in glibc plus changes to use the vDSO implementations of __vdso_get_tbfreq, __vdso_clock_gettime, __vdso_clock_getres and __vdso_gettimeofday found in the 2.6.15 kernel written by Ben Herrenschmidt. Comments/Complaints/Suggestions of course are most welcome. Regards, Tom 2005-12-08 Steven Munroe Tom Gall * elf/rtld.c (dl_main): Initialize l_local_scope for sysinfo_map. * sysdeps/powerpc/elf/libc-start.c: Move this. * sysdeps/unix/sysv/linux/powerpc/libc-start.c: To here. * sysdeps/powerpc/powerpc32/dl-start.S: add _dl_main_dispatch * sysdeps/powerpc/powerpc32/hp-timing.h: New file. * sysdeps/powerpc/Versions: add __vdso_ symbols * sysdeps/unix/sysv/linux/clock_getres.c: add INTERNAL_VSYSCALL defined by default to INTERNAL_SYSCALL and INLINE_VSYSCALL defined by default to INLINE_SYSCALL * sysdeps/unix/sysv/linux/clock_gettime.c: add INTERNAL_VSYSCALL defined by default to INTERNAL_SYSCALL and INLINE_VSYSCALL defined by default to INLINE_SYSCALL * sysdeps/unix/sysv/linux/powerpc/bits/libc-vdso.h: new file * sysdeps/unix/sysv/linux/powerpc/clock_getres.c: New file. * sysdeps/unix/sysv/linux/powerpc/clock_gettime.c: New file. * sysdeps/unix/sysv/linux/powerpc/dl-vdso.c: New file. * sysdeps/unix/sysv/linux/powerpc/dl-vdso.h: New file. * sysdeps/unix/sysv/linux/powerpc/get_clockfreq.c: use vDSO / format * sysdeps/unix/sysv/linux/powerpc/gettimeofday.c: New file. * sysdeps/unix/sysv/linux/powerpc/Makefile: Add routines += dl-vdso. 
* sysdeps/unix/sysv/linux/powerpc/powerpc32/sysdep.h: new INLINE_VDSOCALL, INTERNAL_VDSOCALL_SIMPLE, INLINE_VDSOCALL_NO_SYSCALL_FALLBACK, INLINE_VDSOCALL_SIMPLE and INTERNAL_VDSOCALL macros * sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep.h: new INLINE_VDSOCALL, INTERNAL_VDSOCALL_SIMPLE, INLINE_VDSOCALL_NO_SYSCALL_FALLBACK, INLINE_VDSOCALL_SIMPLE and INTERNAL_VDSOCALL macros diff -uNr libc.orig/elf/rtld.c libc/elf/rtld.c --- libc.orig/elf/rtld.c 2005-12-05 21:18:26.000000000 -0500 +++ libc/elf/rtld.c 2005-12-05 21:20:43.000000000 -0500 @@ -1296,6 +1296,13 @@ elf_get_dynamic_info (l, dyn_temp); _dl_setup_hash (l); l->l_relocated = 1; + /* Initialize l_local_scope to contain just this map. This allows + the use of dl_lookup_symbol_x to resolve symbols within the vdso. + So we create a single entry list pointing to l_real as its only + element */ + + l->l_local_scope[0]->r_nlist = 1; + l->l_local_scope[0]->r_list = &l->l_real; /* Now that we have the info handy, use the DSO image's soname so this object can be looked up by name. Note that we do not diff -uNr libc.orig/sysdeps/powerpc/elf/libc-start.c libc/sysdeps/powerpc/elf/libc-start.c --- libc.orig/sysdeps/powerpc/elf/libc-start.c 2005-12-05 21:18:27.000000000 -0500 +++ libc/sysdeps/powerpc/elf/libc-start.c 1969-12-31 19:00:00.000000000 -0500 @@ -1,99 +0,0 @@ -/* Copyright (C) 1998,2000,2001,2002,2003,2004 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details.
- - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#include -#include -#include -#include -#include - -extern int __cache_line_size; -weak_extern (__cache_line_size) - -/* The main work is done in the generic function. */ -#define LIBC_START_MAIN generic_start_main -#define LIBC_START_DISABLE_INLINE -#define LIBC_START_MAIN_AUXVEC_ARG -#define MAIN_AUXVEC_ARG -#include - - -struct startup_info -{ - void *__unbounded sda_base; - int (*main) (int, char **, char **, void *); - int (*init) (int, char **, char **, void *); - void (*fini) (void); -}; - - -int -/* GKM FIXME: GCC: this should get __BP_ prefix by virtue of the - BPs in the arglist of startup_info.main and startup_info.init. */ -BP_SYM (__libc_start_main) (int argc, char *__unbounded *__unbounded ubp_av, - char *__unbounded *__unbounded ubp_ev, - ElfW(auxv_t) *__unbounded auxvec, - void (*rtld_fini) (void), - struct startup_info *__unbounded stinfo, - char *__unbounded *__unbounded stack_on_entry) -{ -#if __BOUNDED_POINTERS__ - char **argv; -#else -# define argv ubp_av -#endif - - /* the PPC SVR4 ABI says that the top thing on the stack will - be a NULL pointer, so if not we assume that we're being called - as a statically-linked program by Linux... */ - if (*stack_on_entry != NULL) - { - char *__unbounded *__unbounded temp; - /* ...in which case, we have argc as the top thing on the - stack, followed by argv (NULL-terminated), envp (likewise), - and the auxilary vector. */ - /* 32/64-bit agnostic load from stack */ - argc = *(long int *__unbounded) stack_on_entry; - ubp_av = stack_on_entry + 1; - ubp_ev = ubp_av + argc + 1; -#ifdef HAVE_AUX_VECTOR - temp = ubp_ev; - while (*temp != NULL) - ++temp; - auxvec = (ElfW(auxv_t) *)++temp; -#endif - rtld_fini = NULL; - } - - /* Initialize the __cache_line_size variable from the aux vector. 
*/ - for (ElfW(auxv_t) *av = auxvec; av->a_type != AT_NULL; ++av) - switch (av->a_type) - { - case AT_DCACHEBSIZE: - { - int *cls = & __cache_line_size; - if (cls != NULL) - *cls = av->a_un.a_val; - } - break; - } - - return generic_start_main (stinfo->main, argc, ubp_av, auxvec, - stinfo->init, stinfo->fini, rtld_fini, - stack_on_entry); -} diff -uNr libc.orig/sysdeps/powerpc/powerpc32/dl-start.S libc/sysdeps/powerpc/powerpc32/dl-start.S --- libc.orig/sysdeps/powerpc/powerpc32/dl-start.S 2005-12-05 21:18:27.000000000 -0500 +++ libc/sysdeps/powerpc/powerpc32/dl-start.S 2005-12-05 21:20:43.000000000 -0500 @@ -98,6 +98,7 @@ Take the opportunity to clear LR, so anyone who accidentally returns from _start gets SEGV. Also clear the next few words of the stack. */ +ENTRY(_dl_main_dispatch) li r31,0 stw r31,0(r1) mtlr r31 diff -uNr libc.orig/sysdeps/powerpc/powerpc32/hp-timing.h libc/sysdeps/powerpc/powerpc32/hp-timing.h --- libc.orig/sysdeps/powerpc/powerpc32/hp-timing.h 1969-12-31 19:00:00.000000000 -0500 +++ libc/sysdeps/powerpc/powerpc32/hp-timing.h 2005-12-05 21:20:43.000000000 -0500 @@ -0,0 +1,83 @@ +/* High precision, low overhead timing functions. Generic version. + Copyright (C) 1998, 2000 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper , 1998. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#ifndef _HP_TIMING_H +#define _HP_TIMING_H 1 + + +/* There are no generic definitions for the times. We could write something + using the `gettimeofday' system call where available but the overhead of + the system call might be too high. + + In case a platform supports timers in the hardware the following macros + and types must be defined: + + - HP_TIMING_AVAIL: test for availability. + + - HP_TIMING_INLINE: this macro is non-zero if the functionality is not + implemented using function calls but instead uses some inlined code + which might simply consist of a few assembler instructions. We have to + know this since we might want to use the macros here in places where we + cannot make function calls. + + - hp_timing_t: This is the type for variables used to store the time + values. + + - HP_TIMING_ZERO: clear `hp_timing_t' object. + + - HP_TIMING_NOW: place timestamp for current time in variable given as + parameter. + + - HP_TIMING_DIFF_INIT: do whatever is necessary to be able to use the + HP_TIMING_DIFF macro. + + - HP_TIMING_DIFF: compute difference between two times and store it + in a third. Source and destination might overlap. + + - HP_TIMING_ACCUM: add time difference to another variable. This might + be a bit more complicated to implement for some platforms as the + operation should be thread-safe and 64bit arithmetic on 32bit platforms + is not. + + - HP_TIMING_ACCUM_NT: this is the variant for situations where we know + there are no threads involved. + + - HP_TIMING_PRINT: write decimal representation of the timing value into + the given string. This operation need not be inline even though + HP_TIMING_INLINE is specified. + +*/ + +/* Provide dummy definitions. 
*/ +#define HP_TIMING_AVAIL (0) +#define HP_TIMING_INLINE (0) +typedef unsigned long long int hp_timing_t; +#define HP_TIMING_ZERO(Var) +#define HP_TIMING_NOW(var) +#define HP_TIMING_DIFF_INIT() +#define HP_TIMING_DIFF(Diff, Start, End) +#define HP_TIMING_ACCUM(Sum, Diff) +#define HP_TIMING_ACCUM_NT(Sum, Diff) +#define HP_TIMING_PRINT(Buf, Len, Val) + +/* Since this implementation is not available we tell the user about it. */ +#define HP_TIMING_NONAVAIL 1 + +#endif /* hp-timing.h */ diff -uNr libc.orig/sysdeps/powerpc/Versions libc/sysdeps/powerpc/Versions --- libc.orig/sysdeps/powerpc/Versions 2005-12-05 21:18:27.000000000 -0500 +++ libc/sysdeps/powerpc/Versions 2005-12-05 21:20:43.000000000 -0500 @@ -13,5 +13,8 @@ GLIBC_PRIVATE { __novmx__libc_longjmp; __novmx__libc_siglongjmp; __vmx__libc_longjmp; __vmx__libc_siglongjmp; + __vdso_get_tbfreq; + __vdso_clock_gettime; + __vdso_clock_getres; } } diff -uNr libc.orig/sysdeps/unix/sysv/linux/clock_getres.c libc/sysdeps/unix/sysv/linux/clock_getres.c --- libc.orig/sysdeps/unix/sysv/linux/clock_getres.c 2005-12-05 21:18:30.000000000 -0500 +++ libc/sysdeps/unix/sysv/linux/clock_getres.c 2005-12-07 13:54:09.000000000 -0500 @@ -24,9 +24,16 @@ #include "kernel-features.h" +#ifndef INTERNAL_VSYSCALL +#define INTERNAL_VSYSCALL INTERNAL_SYSCALL +#endif + +#ifndef INLINE_VSYSCALL +#define INLINE_VSYSCALL INLINE_SYSCALL +#endif #define SYSCALL_GETRES \ - retval = INLINE_SYSCALL (clock_getres, 2, clock_id, res); \ + retval = INLINE_VSYSCALL (clock_getres, 2, clock_id, res); \ break #ifdef __ASSUME_POSIX_TIMERS @@ -109,7 +116,7 @@ if (!__libc_missing_posix_cpu_timers) { INTERNAL_SYSCALL_DECL (err); - int r = INTERNAL_SYSCALL (clock_getres, err, 2, clock_id, res); + int r = INTERNAL_VSYSCALL (clock_getres, err, 2, clock_id, res); if (!INTERNAL_SYSCALL_ERROR_P (r, err)) return 0; @@ -128,7 +135,7 @@ { /* Check whether the kernel supports CPU clocks at all. If not, record it for the future. 
*/ - r = INTERNAL_SYSCALL (clock_getres, err, 2, + r = INTERNAL_VSYSCALL (clock_getres, err, 2, MAKE_PROCESS_CPUCLOCK (0, CPUCLOCK_SCHED), NULL); if (INTERNAL_SYSCALL_ERROR_P (r, err)) diff -uNr libc.orig/sysdeps/unix/sysv/linux/clock_gettime.c libc/sysdeps/unix/sysv/linux/clock_gettime.c --- libc.orig/sysdeps/unix/sysv/linux/clock_gettime.c 2005-12-05 21:18:30.000000000 -0500 +++ libc/sysdeps/unix/sysv/linux/clock_gettime.c 2005-12-07 13:52:13.000000000 -0500 @@ -23,9 +23,16 @@ #include "kernel-posix-cpu-timers.h" #include "kernel-features.h" +#ifndef INTERNAL_VSYSCALL +#define INTERNAL_VSYSCALL INTERNAL_SYSCALL +#endif + +#ifndef INLINE_VSYSCALL +#define INLINE_VSYSCALL INLINE_SYSCALL +#endif #define SYSCALL_GETTIME \ - retval = INLINE_SYSCALL (clock_gettime, 2, clock_id, tp); \ + retval = INLINE_VSYSCALL (clock_gettime, 2, clock_id, tp); \ break #ifdef __ASSUME_POSIX_TIMERS @@ -108,7 +115,7 @@ if (!__libc_missing_posix_cpu_timers) { INTERNAL_SYSCALL_DECL (err); - int r = INTERNAL_SYSCALL (clock_gettime, err, 2, clock_id, tp); + int r = INTERNAL_VSYSCALL (clock_gettime, err, 2, clock_id, tp); if (!INTERNAL_SYSCALL_ERROR_P (r, err)) return 0; @@ -127,7 +134,7 @@ { /* Check whether the kernel supports CPU clocks at all. If not, record it for the future. */ - r = INTERNAL_SYSCALL (clock_getres, err, 2, + r = INTERNAL_VSYSCALL (clock_getres, err, 2, MAKE_PROCESS_CPUCLOCK (0, CPUCLOCK_SCHED), NULL); if (INTERNAL_SYSCALL_ERROR_P (r, err)) diff -uNr libc.orig/sysdeps/unix/sysv/linux/powerpc/bits/libc-vdso.h libc/sysdeps/unix/sysv/linux/powerpc/bits/libc-vdso.h --- libc.orig/sysdeps/unix/sysv/linux/powerpc/bits/libc-vdso.h 1969-12-31 19:00:00.000000000 -0500 +++ libc/sysdeps/unix/sysv/linux/powerpc/bits/libc-vdso.h 2005-12-05 21:20:43.000000000 -0500 @@ -0,0 +1,36 @@ +/* Resolved function pointers to VDSO functions. + Copyright (C) 2005 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + + +#ifndef _LIBC_VDSO_H +#define _LIBC_VDSO_H + +#ifdef SHARED + +extern void *__vdso_gettimeofday; + +extern void *__vdso_clock_gettime; + +extern void *__vdso_clock_getres; + +extern void *__vdso_get_tbfreq; + +#endif + +#endif /* _LIBC_VDSO_H */ diff -uNr libc.orig/sysdeps/unix/sysv/linux/powerpc/clock_getres.c libc/sysdeps/unix/sysv/linux/powerpc/clock_getres.c --- libc.orig/sysdeps/unix/sysv/linux/powerpc/clock_getres.c 1969-12-31 19:00:00.000000000 -0500 +++ libc/sysdeps/unix/sysv/linux/powerpc/clock_getres.c 2005-12-07 16:44:14.000000000 -0500 @@ -0,0 +1,25 @@ +/* clock_getres -- Get the resolution of a POSIX clockid_t. Linux version. + Copyright (C) 2005 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include + +#define INTERNAL_VSYSCALL INTERNAL_VDSOCALL_SIMPLE +#define INLINE_VSYSCALL INLINE_VDSOCALL + +#include diff -uNr libc.orig/sysdeps/unix/sysv/linux/powerpc/clock_gettime.c libc/sysdeps/unix/sysv/linux/powerpc/clock_gettime.c --- libc.orig/sysdeps/unix/sysv/linux/powerpc/clock_gettime.c 1969-12-31 19:00:00.000000000 -0500 +++ libc/sysdeps/unix/sysv/linux/powerpc/clock_gettime.c 2005-12-07 16:44:35.000000000 -0500 @@ -0,0 +1,25 @@ +/* clock_gettime -- Get current time from a POSIX clockid_t. Linux version. + Copyright (C) 2005 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#include + +#define INTERNAL_VSYSCALL INTERNAL_VDSOCALL_SIMPLE +#define INLINE_VSYSCALL INLINE_VDSOCALL + +#include diff -uNr libc.orig/sysdeps/unix/sysv/linux/powerpc/dl-vdso.c libc/sysdeps/unix/sysv/linux/powerpc/dl-vdso.c --- libc.orig/sysdeps/unix/sysv/linux/powerpc/dl-vdso.c 1969-12-31 19:00:00.000000000 -0500 +++ libc/sysdeps/unix/sysv/linux/powerpc/dl-vdso.c 2005-12-05 21:20:43.000000000 -0500 @@ -0,0 +1,59 @@ +/* ELF symbol resolve functions for VDSO objects. + Copyright (C) 2005 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include "config.h" +#include +#include + +void *internal_function +_dl_vdso_vsym (const char *name, const char *version) +{ + ElfW (Sym) wsym; + const ElfW (Sym) * ref = &wsym; + struct link_map *map = GLRO (dl_sysinfo_map); + void *value = NULL; + struct r_found_version vers; + lookup_t result; + + + if (map != NULL) + { + /* Use a WEAK REF so we don't error out if the symbol is not found. */ + memset (&wsym, 0, sizeof (ElfW (Sym))); + wsym.st_info = (unsigned char) ELFW (ST_INFO (STB_WEAK, STT_NOTYPE)); + /* Compute hash value to the version string. 
*/ + vers.name = version; + vers.hidden = 1; + vers.hash = _dl_elf_hash (version); + /* We don't have a specific file where the symbol can be found. */ + vers.filename = NULL; + + /* Search the scope of the vdso map. */ + result = GLRO (dl_lookup_symbol_x) (name, map, &ref, + map->l_local_scope, + &vers, 0, 0, NULL); + + if (ref != NULL) + { + value = DL_SYMBOL_ADDRESS (result, ref); + + } + } + return value; +} diff -uNr libc.orig/sysdeps/unix/sysv/linux/powerpc/dl-vdso.h libc/sysdeps/unix/sysv/linux/powerpc/dl-vdso.h --- libc.orig/sysdeps/unix/sysv/linux/powerpc/dl-vdso.h 1969-12-31 19:00:00.000000000 -0500 +++ libc/sysdeps/unix/sysv/linux/powerpc/dl-vdso.h 2005-12-05 21:20:43.000000000 -0500 @@ -0,0 +1,29 @@ +/* ELF symbol resolve functions for VDSO objects. + Copyright (C) 2005 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#ifndef _DL_VDSO_H +#define _DL_VDSO_H + +/* Functions for resolving symbols in the VDSO link map. 
*/ + +extern void * +_dl_vdso_vsym (const char *name, const char *version) + internal_function attribute_hidden; + +#endif /* dl-vdso.h */ diff -uNr libc.orig/sysdeps/unix/sysv/linux/powerpc/get_clockfreq.c libc/sysdeps/unix/sysv/linux/powerpc/get_clockfreq.c --- libc.orig/sysdeps/unix/sysv/linux/powerpc/get_clockfreq.c 2005-12-05 21:18:30.000000000 -0500 +++ libc/sysdeps/unix/sysv/linux/powerpc/get_clockfreq.c 2005-12-05 21:20:43.000000000 -0500 @@ -22,14 +22,15 @@ #include #include #include - +#include +#include hp_timing_t __get_clockfreq (void) { /* We read the information from the /proc filesystem. /proc/cpuinfo contains at least one line like: - timebase : 33333333 + timebase : 33333333 We search for this line and convert the number into an integer. */ static hp_timing_t timebase_freq; hp_timing_t result = 0L; @@ -38,68 +39,76 @@ if (timebase_freq != 0) return timebase_freq; - int fd = open ("/proc/cpuinfo", O_RDONLY); - if (__builtin_expect (fd != -1, 1)) + /* if we can use the vDSO to obtain the timebase even better */ +#ifdef SHARED + timebase_freq = INLINE_VDSOCALL_SIMPLE (get_tbfreq, 0); + if (timebase_freq == 0) +#endif { - /* The timebase will be in the 1st 1024 bytes for systems with up - to 8 processors. If the first read returns less then 1024 - bytes read, we have the whole cpuinfo and can start the scan. - Otherwise we will have to read more to insure we have the - timebase value in the scan. */ - char buf[1024]; - ssize_t n; + int fd = open ("/proc/cpuinfo", O_RDONLY); - n = read (fd, buf, sizeof (buf)); - if (n == sizeof (buf)) + if (__builtin_expect (fd != -1, 1)) { - /* We are here because the 1st read returned exactly sizeof - (buf) bytes. This implies that we are not at EOF and may - not have read the timebase value yet. So we need to read - more bytes until we know we have EOF. We copy the lower - half of buf to the upper half and read sizeof (buf)/2 - bytes into the lower half of buf and repeat until we - reach EOF. 
We can assume that the timebase will be in - the last 512 bytes of cpuinfo, so two 512 byte half_bufs - will be sufficient to contain the timebase and will - handle the case where the timebase spans the half_buf - boundry. */ - const ssize_t half_buf = sizeof (buf) / 2; - while (n >= half_buf) + /* The timebase will be in the 1st 1024 bytes for systems with up + to 8 processors. If the first read returns less than 1024 + bytes read, we have the whole cpuinfo and can start the scan. + Otherwise we will have to read more to ensure we have the + timebase value in the scan. */ + char buf[1024]; + ssize_t n; + + n = read (fd, buf, sizeof (buf)); + if (n == sizeof (buf)) { - memcpy (buf, buf + half_buf, half_buf); - n = read (fd, buf + half_buf, half_buf); + /* We are here because the 1st read returned exactly sizeof + (buf) bytes. This implies that we are not at EOF and may + not have read the timebase value yet. So we need to read + more bytes until we know we have EOF. We copy the upper + half of buf to the lower half and read sizeof (buf)/2 + bytes into the upper half of buf and repeat until we + reach EOF. We can assume that the timebase will be in + the last 512 bytes of cpuinfo, so two 512 byte half_bufs + will be sufficient to contain the timebase and will + handle the case where the timebase spans the half_buf + boundary. */ + const ssize_t half_buf = sizeof (buf) / 2; + while (n >= half_buf) + { + memcpy (buf, buf + half_buf, half_buf); + n = read (fd, buf + half_buf, half_buf); + } + if (n >= 0) + n += half_buf; } - if (n >= 0) - n += half_buf; - } - - if (__builtin_expect (n, 1) > 0) - { - char *mhz = memmem (buf, n, "timebase", 7); - if (__builtin_expect (mhz != NULL, 1)) + if (__builtin_expect (n, 1) > 0) { - char *endp = buf + n; + char *mhz = memmem (buf, n, "timebase", 7); - /* Search for the beginning of the string.
*/ - while (mhz < endp && (*mhz < '0' || *mhz > '9') && *mhz != '\n') - ++mhz; - - while (mhz < endp && *mhz != '\n') + if (__builtin_expect (mhz != NULL, 1)) { - if (*mhz >= '0' && *mhz <= '9') + char *endp = buf + n; + + /* Search for the beginning of the string. */ + while (mhz < endp && (*mhz < '0' || *mhz > '9') + && *mhz != '\n') + ++mhz; + + while (mhz < endp && *mhz != '\n') { - result *= 10; - result += *mhz - '0'; - } + if (*mhz >= '0' && *mhz <= '9') + { + result *= 10; + result += *mhz - '0'; + } - ++mhz; + ++mhz; + } } + timebase_freq = result; } - timebase_freq = result; + close (fd); } - close (fd); } - return timebase_freq; } diff -uNr libc.orig/sysdeps/unix/sysv/linux/powerpc/gettimeofday.c libc/sysdeps/unix/sysv/linux/powerpc/gettimeofday.c --- libc.orig/sysdeps/unix/sysv/linux/powerpc/gettimeofday.c 1969-12-31 19:00:00.000000000 -0500 +++ libc/sysdeps/unix/sysv/linux/powerpc/gettimeofday.c 2005-12-07 14:02:19.000000000 -0500 @@ -0,0 +1,41 @@ +/* Copyright (C) 2005 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#include +#include +#include +#include +#include +#include + +#undef __gettimeofday +#include + +/* Get the current time of day and timezone information, + putting it into *TV and *TZ. If TZ is NULL, *TZ is not filled. + Returns 0 on success, -1 on errors. */ + +int +__gettimeofday (tv, tz) + struct timeval *tv; + struct timezone *tz; +{ + return INLINE_VDSOCALL (gettimeofday, 2, CHECK_1 (tv), CHECK_1 (tz)); +} + +INTDEF (__gettimeofday) weak_alias (__gettimeofday, gettimeofday) diff -uNr libc.orig/sysdeps/unix/sysv/linux/powerpc/libc-start.c libc/sysdeps/unix/sysv/linux/powerpc/libc-start.c --- libc.orig/sysdeps/unix/sysv/linux/powerpc/libc-start.c 1969-12-31 19:00:00.000000000 -0500 +++ libc/sysdeps/unix/sysv/linux/powerpc/libc-start.c 2005-12-05 21:20:43.000000000 -0500 @@ -0,0 +1,130 @@ +/* Copyright (C) 1998,2000,2001,2002,2003,2004,2005 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include +#include +#include +#include + +extern int __cache_line_size; +weak_extern (__cache_line_size) +/* The main work is done in the generic function. 
*/ +#define LIBC_START_MAIN generic_start_main +#define LIBC_START_DISABLE_INLINE +#define LIBC_START_MAIN_AUXVEC_ARG +#define MAIN_AUXVEC_ARG +#define INIT_MAIN_ARGS +#include + +struct startup_info + { + void *__unbounded sda_base; + int (*main) (int, char **, char **, void *); + int (*init) (int, char **, char **, void *); + void (*fini) (void); + }; + + +#ifdef SHARED +#include +#include +#undef __gettimeofday +#undef __clock_gettime +#undef __clock_getres +#include + +void *__vdso_gettimeofday; +void *__vdso_clock_gettime; +void *__vdso_clock_getres; +void *__vdso_get_tbfreq; + +static inline void _libc_vdso_platform_setup (void) + { + __vdso_gettimeofday = _dl_vdso_vsym ("__kernel_gettimeofday", + "LINUX_2.6.15"); + + __vdso_clock_gettime = _dl_vdso_vsym ("__kernel_clock_gettime", + "LINUX_2.6.15"); + + __vdso_clock_getres = _dl_vdso_vsym ("__kernel_clock_getres", + "LINUX_2.6.15"); + + __vdso_get_tbfreq = _dl_vdso_vsym ("__kernel_vdso_get_tbfreq", + "LINUX_2.6.15"); + } +#endif + +int +/* GKM FIXME: GCC: this should get __BP_ prefix by virtue of the + BPs in the arglist of startup_info.main and startup_info.init. */ + BP_SYM (__libc_start_main) (int argc, char *__unbounded * __unbounded ubp_av, + char *__unbounded * __unbounded ubp_ev, + ElfW (auxv_t) * __unbounded auxvec, + void (*rtld_fini) (void), + struct startup_info * __unbounded stinfo, + char *__unbounded * __unbounded stack_on_entry) +{ +#if __BOUNDED_POINTERS__ + char **argv; +#else +# define argv ubp_av +#endif + + /* the PPC SVR4 ABI says that the top thing on the stack will + be a NULL pointer, so if not we assume that we're being called + as a statically-linked program by Linux... */ + if (*stack_on_entry != NULL) + { + char *__unbounded * __unbounded temp; + /* ...in which case, we have argc as the top thing on the + stack, followed by argv (NULL-terminated), envp (likewise), + and the auxilary vector. 
*/ + /* 32/64-bit agnostic load from stack */ + argc = *(long int *__unbounded) stack_on_entry; + ubp_av = stack_on_entry + 1; + ubp_ev = ubp_av + argc + 1; +#ifdef HAVE_AUX_VECTOR + temp = ubp_ev; + while (*temp != NULL) + ++temp; + auxvec = (ElfW (auxv_t) *)++ temp; +#endif + rtld_fini = NULL; + } + + /* Initialize the __cache_line_size variable from the aux vector. */ + for (ElfW (auxv_t) * av = auxvec; av->a_type != AT_NULL; ++av) + switch (av->a_type) + { + case AT_DCACHEBSIZE: + { + int *cls = &__cache_line_size; + if (cls != NULL) + *cls = av->a_un.a_val; + } + break; + } +#ifdef SHARED + /* Resolve and initialize function pointers for VDSO functions. */ + _libc_vdso_platform_setup (); +#endif + return generic_start_main (stinfo->main, argc, ubp_av, auxvec, + stinfo->init, stinfo->fini, rtld_fini, + stack_on_entry); +} diff -uNr libc.orig/sysdeps/unix/sysv/linux/powerpc/Makefile libc/sysdeps/unix/sysv/linux/powerpc/Makefile --- libc.orig/sysdeps/unix/sysv/linux/powerpc/Makefile 2005-12-05 21:18:30.000000000 -0500 +++ libc/sysdeps/unix/sysv/linux/powerpc/Makefile 2005-12-05 21:20:43.000000000 -0500 @@ -2,3 +2,8 @@ ifeq ($(subdir),rt) librt-routines += rt-sysdep endif + +ifeq ($(subdir),misc) +routines += dl-vdso +endif + diff -uNr libc.orig/sysdeps/unix/sysv/linux/powerpc/powerpc32/sysdep.h libc/sysdeps/unix/sysv/linux/powerpc/powerpc32/sysdep.h --- libc.orig/sysdeps/unix/sysv/linux/powerpc/powerpc32/sysdep.h 2005-12-05 21:18:30.000000000 -0500 +++ libc/sysdeps/unix/sysv/linux/powerpc/powerpc32/sysdep.h 2005-12-08 16:00:17.300339776 -0500 @@ -54,6 +54,139 @@ # include +# undef INLINE_VDSOCALL +#ifdef SHARED +# define INLINE_VDSOCALL(name, nr, args...) 
\ + ({ \ + INTERNAL_SYSCALL_DECL (sc_err); \ + long int sc_ret = 0; \ + \ + if ( __vdso_ ## name !=NULL) \ + sc_ret = INTERNAL_VDSOCALL (__vdso_ ## name, sc_err, nr, args); \ + if (( __vdso_ ## name == NULL ) || (sc_ret == ENOSYS)) \ + sc_ret = INTERNAL_SYSCALL (name, sc_err, nr, args); \ + if (INTERNAL_SYSCALL_ERROR_P (sc_ret, sc_err)) \ + { \ + __set_errno (INTERNAL_SYSCALL_ERRNO (sc_ret, sc_err)); \ + sc_ret = -1L; \ + } \ + sc_ret; \ + }) +#else +# define INLINE_VDSOCALL(name, nr, args...) \ + ({ \ + INTERNAL_SYSCALL_DECL (sc_err); \ + long int sc_ret; \ + \ + sc_ret = INTERNAL_SYSCALL (name, sc_err, nr, args); \ + if (INTERNAL_SYSCALL_ERROR_P (sc_ret, sc_err)) \ + { \ + __set_errno (INTERNAL_SYSCALL_ERRNO (sc_ret, sc_err)); \ + sc_ret = -1L; \ + } \ + sc_ret; \ + }) +#endif + +# undef INTERNAL_VDSOCALL_SIMPLE +#ifdef SHARED +# define INTERNAL_VDSOCALL_SIMPLE(name, err, nr, args...) \ + ({ \ + long int v_ret = 0; \ + \ + if ( __vdso_ ## name !=NULL) \ + v_ret = INTERNAL_VDSOCALL (__vdso_ ## name, err, nr, args); \ + if (( __vdso_ ## name == NULL ) || (v_ret == ENOSYS)) \ + v_ret = INTERNAL_SYSCALL (name, err, nr, args); \ + v_ret; \ + }) +#else +# define INTERNAL_VDSOCALL_SIMPLE(name, err, nr, args...) \ + ({ \ + long int v_ret; \ + \ + v_ret = INTERNAL_SYSCALL (name, err, nr, args); \ + v_ret; \ + }) +#endif + +# undef INLINE_VDSOCALL_NO_SYSCALL_FALLBACK +# define INLINE_VDSOCALL_NO_SYSCALL_FALLBACK(name, nr, args...) \ + ({ \ + INTERNAL_SYSCALL_DECL (sc_err); \ + long int sc_ret=0; \ + \ + if (__vdso_ ## name !=NULL) \ + { \ + sc_ret = INTERNAL_VDSOCALL (__vdso_ ## name, sc_err, nr, args); \ + } \ + else \ + { \ + sc_ret = ENOSYS; \ + } \ + if (INTERNAL_SYSCALL_ERROR_P (sc_ret, sc_err)) \ + { \ + __set_errno (INTERNAL_SYSCALL_ERRNO (sc_ret, sc_err)); \ + sc_ret = -1L; \ + } \ + sc_ret; \ + }) + +# undef INLINE_VDSOCALL_SIMPLE +# define INLINE_VDSOCALL_SIMPLE(name, nr, args...) 
\ + ({ \ + INTERNAL_SYSCALL_DECL (sc_err); \ + long int sc_ret=0; \ + \ + if (__vdso_ ## name !=NULL) \ + { \ + sc_ret = INTERNAL_VDSOCALL (__vdso_ ## name, sc_err, nr, args); \ + } \ + else \ + { \ + sc_ret = ENOSYS; \ + } \ + sc_ret; \ + }) + +/* Define a macro which expands inline into the wrapper code for a VDSO + call. This use is for internal calls that do not need to handle errors + normally. It will never touch errno. + On powerpc a system call basically clobbers the same registers like a + function call, with the exception of LR (which is needed for the + "sc; bnslr+" sequence) and CR (where only CR0.SO is clobbered to signal + an error return status). */ + +# undef INTERNAL_VDSOCALL +# define INTERNAL_VDSOCALL_NCS(funcptr, err, nr, args...) \ + ({ \ + register void *r0 __asm__ ("r0"); \ + register long int r3 __asm__ ("r3"); \ + register long int r4 __asm__ ("r4"); \ + register long int r5 __asm__ ("r5"); \ + register long int r6 __asm__ ("r6"); \ + register long int r7 __asm__ ("r7"); \ + register long int r8 __asm__ ("r8"); \ + register long int r9 __asm__ ("r9"); \ + register long int r10 __asm__ ("r10"); \ + register long int r11 __asm__ ("r11"); \ + register long int r12 __asm__ ("r12"); \ + LOADARGS_##nr(funcptr, args); \ + __asm__ __volatile__ \ + ("mtctr %0\n\t" \ + "bctrl\n\t" \ + "mfcr %0" \ + : "=&r" (r0), \ + "=&r" (r3), "=&r" (r4), "=&r" (r5), "=&r" (r6), "=&r" (r7), \ + "=&r" (r8), "=&r" (r9), "=&r" (r10), "=&r" (r11), "=&r" (r12) \ + : ASM_INPUT_##nr \ + : "cr0", "ctr", "lr", "memory"); \ + err = (long int)r0; \ + (int) r3; \ + }) +# define INTERNAL_VDSOCALL(name, err, nr, args...) \ + INTERNAL_VDSOCALL_NCS (name, err, nr, ##args) + # undef INLINE_SYSCALL # define INLINE_SYSCALL(name, nr, args...) 
\ ({ \ diff -uNr libc.orig/sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep.h libc/sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep.h --- libc.orig/sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep.h 2005-12-05 21:18:30.000000000 -0500 +++ libc/sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep.h 2005-12-08 16:00:42.708338736 -0500 @@ -66,7 +66,144 @@ #define ASM_TYPE_DIRECTIVE(name,typearg) .type name,typearg; #define ASM_SIZE_DIRECTIVE(name) .size name,.-name -#endif /* __ASSEMBLER__ */ +#endif /* __ASSEMBLER__ */ + +/* This version is for kernels that implement system calls that + behave like function calls as far as register saving is concerned. + It falls back to the syscall when the vDSO function is unavailable + or fails with ENOSYS. */ + +# undef INLINE_VDSOCALL +#ifdef SHARED +# define INLINE_VDSOCALL(name, nr, args...) \ + ({ \ + INTERNAL_SYSCALL_DECL (sc_err); \ + long int sc_ret = 0; \ + \ + if ( __vdso_ ## name !=NULL) \ + sc_ret = INTERNAL_VDSOCALL (__vdso_ ## name, sc_err, nr, args); \ + if (( __vdso_ ## name == NULL ) || (sc_ret == ENOSYS)) \ + sc_ret = INTERNAL_SYSCALL (name, sc_err, nr, args); \ + if (INTERNAL_SYSCALL_ERROR_P (sc_ret, sc_err)) \ + { \ + __set_errno (INTERNAL_SYSCALL_ERRNO (sc_ret, sc_err)); \ + sc_ret = -1L; \ + } \ + sc_ret; \ + }) +#else +# define INLINE_VDSOCALL(name, nr, args...) \ + ({ \ + INTERNAL_SYSCALL_DECL (sc_err); \ + long int sc_ret; \ + \ + sc_ret = INTERNAL_SYSCALL (name, sc_err, nr, args); \ + if (INTERNAL_SYSCALL_ERROR_P (sc_ret, sc_err)) \ + { \ + __set_errno (INTERNAL_SYSCALL_ERRNO (sc_ret, sc_err)); \ + sc_ret = -1L; \ + } \ + sc_ret; \ + }) +#endif + +# undef INTERNAL_VDSOCALL_SIMPLE +#ifdef SHARED +# define INTERNAL_VDSOCALL_SIMPLE(name, err, nr, args...)
\ + ({ \ + long int v_ret = 0; \ + \ + if ( __vdso_ ## name !=NULL) \ + v_ret = INTERNAL_VDSOCALL (__vdso_ ## name, err, nr, args); \ + if (( __vdso_ ## name == NULL ) || (v_ret == ENOSYS)) \ + v_ret = INTERNAL_SYSCALL (name, err, nr, args); \ + v_ret; \ + }) +#else +# define INTERNAL_VDSOCALL_SIMPLE(name, err, nr, args...) \ + ({ \ + long int v_ret; \ + \ + v_ret = INTERNAL_SYSCALL (name, err, nr, args); \ + v_ret; \ + }) +#endif + + +/* This version does not fail back to a syscall as the previous + version does */ +# undef INLINE_VDSOCALL_NO_SYSCALL_FALLBACK +# define INLINE_VDSOCAL