From anton at samba.org Sun May 1 19:26:42 2005 From: anton at samba.org (Anton Blanchard) Date: Sun, 1 May 2005 19:26:42 +1000 Subject: [PATCH] ppc64: remove hidden -fno-omit-frame-pointer for schedule.c Message-ID: <20050501092641.GL19662@krispykreme> Hi, While looking at code generated by gcc4.0 I noticed some functions still had frame pointers, even after we stopped ppc64 from defining CONFIG_FRAME_POINTER. It turns out kernel/Makefile hardwires -fno-omit-frame-pointer on when compiling schedule.c. It was already disabled on ia64, disable it on ppc64 as well. Signed-off-by: Anton Blanchard Index: linux-2.6.12-rc2/kernel/Makefile =================================================================== --- linux-2.6.12-rc2.orig/kernel/Makefile 2005-04-19 13:37:40.599016667 +1000 +++ linux-2.6.12-rc2/kernel/Makefile 2005-05-01 05:48:00.689299680 +1000 @@ -33,6 +33,7 @@ obj-$(CONFIG_SECCOMP) += seccomp.o ifneq ($(CONFIG_IA64),y) +ifneq ($(CONFIG_PPC64),y) # According to Alan Modra , the -fno-omit-frame-pointer is # needed for x86 only. Why this used to be enabled for all architectures is beyond # me. I suspect most platforms don't need this, but until we know that for sure @@ -40,6 +41,7 @@ # to get a correct value for the wait-channel (WCHAN in ps). 
--davidm CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer endif +endif $(obj)/configs.o: $(obj)/config_data.h From akpm at osdl.org Sun May 1 19:37:59 2005 From: akpm at osdl.org (Andrew Morton) Date: Sun, 1 May 2005 02:37:59 -0700 Subject: [PATCH] ppc64: remove hidden -fno-omit-frame-pointer for schedule.c In-Reply-To: <20050501092641.GL19662@krispykreme> References: <20050501092641.GL19662@krispykreme> Message-ID: <20050501023759.15d98aea.akpm@osdl.org> Anton Blanchard wrote: > > --- linux-2.6.12-rc2.orig/kernel/Makefile 2005-04-19 13:37:40.599016667 +1000 > +++ linux-2.6.12-rc2/kernel/Makefile 2005-05-01 05:48:00.689299680 +1000 > @@ -33,6 +33,7 @@ > obj-$(CONFIG_SECCOMP) += seccomp.o > > ifneq ($(CONFIG_IA64),y) > +ifneq ($(CONFIG_PPC64),y) Could we please use a new CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER, define that in the arch's Kconfig? (is that a triple negative I see?) From anton at samba.org Sun May 1 19:56:39 2005 From: anton at samba.org (Anton Blanchard) Date: Sun, 1 May 2005 19:56:39 +1000 Subject: [PATCH] ppc64: remove hidden -fno-omit-frame-pointer for schedule.c In-Reply-To: <20050501023759.15d98aea.akpm@osdl.org> References: <20050501092641.GL19662@krispykreme> <20050501023759.15d98aea.akpm@osdl.org> Message-ID: <20050501095639.GM19662@krispykreme> > Could we please use a new CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER, define > that in the arch's Kconfig? > > (is that a triple negative I see?) I like it. Hopefully someone suitably annoyed by this triple negative will go and work out which damn architectures actually need -fno-omit-frame-pointer and reverse the test. For now ppc32, ppc64, ia64 dont need it. Anton -- While looking at code generated by gcc4.0 I noticed some functions still had frame pointers, even after we stopped ppc64 from defining CONFIG_FRAME_POINTER. It turns out kernel/Makefile hardwires -fno-omit-frame-pointer on when compiling schedule.c. 
Create CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER and define it on architectures that dont require frame pointers in sched.c code. Signed-off-by: Anton Blanchard Index: linux-2.6.12-rc2/kernel/Makefile =================================================================== --- linux-2.6.12-rc2.orig/kernel/Makefile 2005-04-19 13:37:40.599016667 +1000 +++ linux-2.6.12-rc2/kernel/Makefile 2005-05-01 19:48:44.471448005 +1000 @@ -32,7 +32,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_SECCOMP) += seccomp.o -ifneq ($(CONFIG_IA64),y) +ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is # needed for x86 only. Why this used to be enabled for all architectures is beyond # me. I suspect most platforms don't need this, but until we know that for sure Index: linux-2.6.12-rc2/arch/ia64/Kconfig =================================================================== --- linux-2.6.12-rc2.orig/arch/ia64/Kconfig 2005-04-19 13:37:33.173418325 +1000 +++ linux-2.6.12-rc2/arch/ia64/Kconfig 2005-05-01 19:49:35.060202590 +1000 @@ -46,6 +46,10 @@ bool default y +config SCHED_NO_NO_OMIT_FRAME_POINTER + bool + default y + choice prompt "System type" default IA64_GENERIC Index: linux-2.6.12-rc2/arch/ppc/Kconfig =================================================================== --- linux-2.6.12-rc2.orig/arch/ppc/Kconfig 2005-04-19 13:37:33.450396856 +1000 +++ linux-2.6.12-rc2/arch/ppc/Kconfig 2005-05-01 19:49:24.699414050 +1000 @@ -43,6 +43,10 @@ bool default y +config SCHED_NO_NO_OMIT_FRAME_POINTER + bool + default y + source "init/Kconfig" menu "Processor" Index: linux-2.6.12-rc2/arch/ppc64/Kconfig =================================================================== --- linux-2.6.12-rc2.orig/arch/ppc64/Kconfig 2005-05-01 05:39:38.017058150 +1000 +++ linux-2.6.12-rc2/arch/ppc64/Kconfig 2005-05-01 19:50:47.878561880 +1000 @@ -40,6 +40,10 @@ bool default y +config SCHED_NO_NO_OMIT_FRAME_POINTER + bool + default y + # We optimistically 
allocate largepages from the VM, so make the limit # large enough (16MB). This badly named config option is actually # max order + 1 From anton at samba.org Sun May 1 20:30:13 2005 From: anton at samba.org (Anton Blanchard) Date: Sun, 1 May 2005 20:30:13 +1000 Subject: [PATCH] ppc64: add missing Kconfig help text Message-ID: <20050501103013.GN19662@krispykreme> From: Jesper Juhl There's no help text for CONFIG_DEBUG_STACKOVERFLOW - add one. Signed-off-by: Jesper Juhl Signed-off-by: Anton Blanchard Index: linux-2.6.12-rc2/arch/ppc64/Kconfig.debug =================================================================== --- linux-2.6.12-rc2.orig/arch/ppc64/Kconfig.debug 2005-02-04 04:10:36.000000000 +1100 +++ linux-2.6.12-rc2/arch/ppc64/Kconfig.debug 2005-05-01 20:27:18.760365099 +1000 @@ -5,6 +5,9 @@ config DEBUG_STACKOVERFLOW bool "Check for stack overflows" depends on DEBUG_KERNEL + help + This option will cause messages to be printed if free stack space + drops below a certain limit. config KPROBES bool "Kprobes" From peter at chubb.wattle.id.au Mon May 2 10:17:51 2005 From: peter at chubb.wattle.id.au (Peter Chubb) Date: Mon, 2 May 2005 10:17:51 +1000 Subject: [PATCH] ppc64: update to use the new 4L headers In-Reply-To: <4270472E.9050708@yahoo.com.au> References: <1114652039.7112.213.camel@gaston> <42704130.9050005@yahoo.com.au> <427044AA.5030402@nortel.com> <4270472E.9050708@yahoo.com.au> Message-ID: <17013.29103.249971.866326@wombat.chubb.wattle.id.au> >>>>> "Nick" == Nick Piggin writes: Nick> Chris Friesen wrote: >> I needed something like: >> >> pte_t *va_to_ptep_map(struct mm_struct *mm, unsigned int addr) >> >> There was code in follow_page() that did basically what I needed, >> but it was all contained within that function so I had to >> re-implement it. >> Nick> If you can break out exactly what you need, and make that inline Nick> or otherwise available via the correct header, I'm sure it would Nick> have a good chance of being merged. 
We're currently working on this, so as to be able to provide interfaces to alternative page tables. We want to be able to slot in Liedtke's `Guarded Page Tables', or B-trees, or a hash table to see what happens. Except we've called the function: pte_t * lookup_page_table(unsigned long address, struct mm_struct *mm); follow_page() is essentially the same after inline expansion happens; but we're seeing a regression in clear_page_range() that we want to fix before release. If you want to take a look (warning: it's still fairly rough work-in-progress) there's high level design being worked on at http://www.gelato.unsw.edu.au/IA64wiki/PageTableInterface and patches from our CVS repository. The only patch of interest is pti.patch. cvs -d :pserver:anoncvs at gelato.unsw.edu.au:/gelato login Logging in to :pserver:anoncvs at lemon:2401/gelato CVS password:[enter anoncvs] $ cvs -d:pserver:anoncvs at gelato.unsw.edu.au:/gelato co kernel/page_table_interface or from http://www.gelato.unsw.edu.au/cgi-bin/viewcvs.cgi/cvs/kernel/page_table_interface/ Peter C From miltonm at bga.com Mon May 2 16:43:40 2005 From: miltonm at bga.com (Milton Miller) Date: Mon, 2 May 2005 01:43:40 -0500 Subject: [PATCH 1/2] ppc64: fix read/write on large /dev/nvram Message-ID: <7845758a806ed6769cea59a9df344d39@bga.com> On Fri Apr 22 16:49:59 EST 2005, Arnd wrote a patch with the following lines (among several others). - len = ppc_md.nvram_read(tmp_buffer, count, ppos); + ret = ppc_md.nvram_read(tmp, count, ppos); - len = ppc_md.nvram_write(tmp_buffer, count, ppos); + ret = ppc_md.nvram_read(tmp, count, ppos); Even though I am just scanning, I am guessing this is not quite right. milton From david at gibson.dropbear.id.au Tue May 3 10:26:08 2005 From: david at gibson.dropbear.id.au (David Gibson) Date: Tue, 3 May 2005 10:26:08 +1000 Subject: [PPC64] pgtable.h and other header cleanups Message-ID: <20050503002608.GA22453@localhost.localdomain> Andrew, please apply. 
This patch started as simply removing a few never-used macros from asm-ppc64/pgtable.h, then kind of grew. It now makes a bunch of cleanups to the ppc64 low-level header files (with corresponding changes to .c files where necessary) such as: - Abolishing never-used macros - Eliminating multiple #defines with the same purpose - Removing pointless macros (cases where just expanding the macro everywhere turns out clearer and more sensible) - Removing some cases where macros which could be defined in terms of each other weren't - Moving imalloc() related definitions from pgtable.h to their own header file (imalloc.h) - Re-arranging headers to group things more logically - Moving all VSID allocation related things to mmu.h, instead of being split between mmu.h and mmu_context.h - Removing some reserved space for flags from the PMD - we're not using it. Signed-off-by: David Gibson Index: working-2.6/include/asm-ppc64/pgtable.h =================================================================== --- working-2.6.orig/include/asm-ppc64/pgtable.h 2005-05-02 16:21:09.000000000 +1000 +++ working-2.6/include/asm-ppc64/pgtable.h 2005-05-02 17:58:29.000000000 +1000 @@ -17,16 +17,6 @@ #include -/* PMD_SHIFT determines what a second-level page table entry can map */ -#define PMD_SHIFT (PAGE_SHIFT + PAGE_SHIFT - 3) -#define PMD_SIZE (1UL << PMD_SHIFT) -#define PMD_MASK (~(PMD_SIZE-1)) - -/* PGDIR_SHIFT determines what a third-level page table entry can map */ -#define PGDIR_SHIFT (PAGE_SHIFT + (PAGE_SHIFT - 3) + (PAGE_SHIFT - 2)) -#define PGDIR_SIZE (1UL << PGDIR_SHIFT) -#define PGDIR_MASK (~(PGDIR_SIZE-1)) - /* * Entries per page directory level. The PTE level must use a 64b record * for each page table entry. 
The PMD and PGD level use a 32b record for @@ -40,40 +30,30 @@ #define PTRS_PER_PMD (1 << PMD_INDEX_SIZE) #define PTRS_PER_PGD (1 << PGD_INDEX_SIZE) -#define USER_PTRS_PER_PGD (1024) -#define FIRST_USER_ADDRESS 0 +/* PMD_SHIFT determines what a second-level page table entry can map */ +#define PMD_SHIFT (PAGE_SHIFT + PTE_INDEX_SIZE) +#define PMD_SIZE (1UL << PMD_SHIFT) +#define PMD_MASK (~(PMD_SIZE-1)) -#define EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \ - PGD_INDEX_SIZE + PAGE_SHIFT) +/* PGDIR_SHIFT determines what a third-level page table entry can map */ +#define PGDIR_SHIFT (PMD_SHIFT + PMD_INDEX_SIZE) +#define PGDIR_SIZE (1UL << PGDIR_SHIFT) +#define PGDIR_MASK (~(PGDIR_SIZE-1)) + +#define FIRST_USER_ADDRESS 0 /* * Size of EA range mapped by our pagetables. */ -#define PGTABLE_EA_BITS 41 -#define PGTABLE_EA_MASK ((1UL<> PMD_TO_PTEPAGE_SHIFT)) +#define pmd_page_kernel(pmd) (__bpn_to_ba(pmd_val(pmd))) #define pmd_page(pmd) virt_to_page(pmd_page_kernel(pmd)) #define pud_set(pudp, pmdp) (pud_val(*(pudp)) = (__ba_to_bpn(pmdp))) @@ -266,8 +241,6 @@ /* to find an entry in the ioremap page-table-directory */ #define pgd_offset_i(address) (ioremap_pgd + pgd_index(address)) -#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) - /* * The following only work if pte_present() is true. * Undefined behaviour if not.. 
@@ -487,18 +460,13 @@ extern unsigned long ioremap_bot, ioremap_base; -#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) -#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS) - -#define pte_ERROR(e) \ - printk("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e)) #define pmd_ERROR(e) \ printk("%s:%d: bad pmd %08x.\n", __FILE__, __LINE__, pmd_val(e)) #define pgd_ERROR(e) \ printk("%s:%d: bad pgd %08x.\n", __FILE__, __LINE__, pgd_val(e)) -extern pgd_t swapper_pg_dir[1024]; -extern pgd_t ioremap_dir[1024]; +extern pgd_t swapper_pg_dir[]; +extern pgd_t ioremap_dir[]; extern void paging_init(void); @@ -540,43 +508,11 @@ */ #define kern_addr_valid(addr) (1) -#define io_remap_page_range(vma, vaddr, paddr, size, prot) \ - remap_pfn_range(vma, vaddr, (paddr) >> PAGE_SHIFT, size, prot) - #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ remap_pfn_range(vma, vaddr, pfn, size, prot) -#define MK_IOSPACE_PFN(space, pfn) (pfn) -#define GET_IOSPACE(pfn) 0 -#define GET_PFN(pfn) (pfn) - void pgtable_cache_init(void); -extern void hpte_init_native(void); -extern void hpte_init_lpar(void); -extern void hpte_init_iSeries(void); - -/* imalloc region types */ -#define IM_REGION_UNUSED 0x1 -#define IM_REGION_SUBSET 0x2 -#define IM_REGION_EXISTS 0x4 -#define IM_REGION_OVERLAP 0x8 -#define IM_REGION_SUPERSET 0x10 - -extern struct vm_struct * im_get_free_area(unsigned long size); -extern struct vm_struct * im_get_area(unsigned long v_addr, unsigned long size, - int region_type); -unsigned long im_free(void *addr); - -extern long pSeries_lpar_hpte_insert(unsigned long hpte_group, - unsigned long va, unsigned long prpn, - int secondary, unsigned long hpteflags, - int bolted, int large); - -extern long native_hpte_insert(unsigned long hpte_group, unsigned long va, - unsigned long prpn, int secondary, - unsigned long hpteflags, int bolted, int large); - /* * find_linux_pte returns the address of a linux pte for a given * effective address and directory. 
If not found, it returns zero. Index: working-2.6/include/asm-ppc64/page.h =================================================================== --- working-2.6.orig/include/asm-ppc64/page.h 2005-05-02 16:21:09.000000000 +1000 +++ working-2.6/include/asm-ppc64/page.h 2005-05-02 16:21:43.000000000 +1000 @@ -23,7 +23,6 @@ #define PAGE_SHIFT 12 #define PAGE_SIZE (ASM_CONST(1) << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) -#define PAGE_OFFSET_MASK (PAGE_SIZE-1) #define SID_SHIFT 28 #define SID_MASK 0xfffffffffUL @@ -85,9 +84,6 @@ /* align addr on a size boundary - adjust address up if needed */ #define _ALIGN(addr,size) _ALIGN_UP(addr,size) -/* to align the pointer to the (next) double word boundary */ -#define DOUBLEWORD_ALIGN(addr) _ALIGN(addr,sizeof(unsigned long)) - /* to align the pointer to the (next) page boundary */ #define PAGE_ALIGN(addr) _ALIGN(addr, PAGE_SIZE) @@ -100,7 +96,6 @@ #define REGION_SIZE 4UL #define REGION_SHIFT 60UL #define REGION_MASK (((1UL<>REGION_SHIFT) -#define VMALLOC_REGION_ID (VMALLOCBASE>>REGION_SHIFT) -#define KERNEL_REGION_ID (KERNELBASE>>REGION_SHIFT) +#define IO_REGION_ID (IOREGIONBASE >> REGION_SHIFT) +#define VMALLOC_REGION_ID (VMALLOCBASE >> REGION_SHIFT) +#define KERNEL_REGION_ID (KERNELBASE >> REGION_SHIFT) #define USER_REGION_ID (0UL) -#define REGION_ID(X) (((unsigned long)(X))>>REGION_SHIFT) +#define REGION_ID(ea) (((unsigned long)(ea)) >> REGION_SHIFT) -#define __bpn_to_ba(x) ((((unsigned long)(x))<> PAGE_SHIFT) #define __va(x) ((void *)((unsigned long)(x) + KERNELBASE)) Index: working-2.6/arch/ppc64/mm/imalloc.c =================================================================== --- working-2.6.orig/arch/ppc64/mm/imalloc.c 2005-04-26 15:37:55.000000000 +1000 +++ working-2.6/arch/ppc64/mm/imalloc.c 2005-05-02 16:59:40.000000000 +1000 @@ -14,6 +14,7 @@ #include #include #include +#include static DECLARE_MUTEX(imlist_sem); struct vm_struct * imlist = NULL; @@ -23,11 +24,11 @@ unsigned long addr; struct vm_struct **p, *tmp; - 
addr = IMALLOC_START; + addr = ioremap_bot; for (p = &imlist; (tmp = *p) ; p = &tmp->next) { if (size + addr < (unsigned long) tmp->addr) break; - if ((unsigned long)tmp->addr >= IMALLOC_START) + if ((unsigned long)tmp->addr >= ioremap_bot) addr = tmp->size + (unsigned long) tmp->addr; if (addr > IMALLOC_END-size) return 1; Index: working-2.6/arch/ppc64/mm/hash_utils.c =================================================================== --- working-2.6.orig/arch/ppc64/mm/hash_utils.c 2005-04-26 15:37:55.000000000 +1000 +++ working-2.6/arch/ppc64/mm/hash_utils.c 2005-05-02 16:56:17.000000000 +1000 @@ -298,24 +298,23 @@ int local = 0; cpumask_t tmp; + if ((ea & ~REGION_MASK) > EADDR_MASK) + return 1; + switch (REGION_ID(ea)) { case USER_REGION_ID: user_region = 1; mm = current->mm; - if ((ea > USER_END) || (! mm)) + if (! mm) return 1; vsid = get_vsid(mm->context.id, ea); break; case IO_REGION_ID: - if (ea > IMALLOC_END) - return 1; mm = &ioremap_mm; vsid = get_kernel_vsid(ea); break; case VMALLOC_REGION_ID: - if (ea > VMALLOC_END) - return 1; mm = &init_mm; vsid = get_kernel_vsid(ea); break; @@ -362,7 +361,7 @@ unsigned long vsid, vpn, va, hash, secondary, slot; unsigned long huge = pte_huge(pte); - if ((ea >= USER_START) && (ea <= USER_END)) + if (ea < KERNELBASE) vsid = get_vsid(context, ea); else vsid = get_kernel_vsid(ea); Index: working-2.6/arch/ppc64/mm/hash_native.c =================================================================== --- working-2.6.orig/arch/ppc64/mm/hash_native.c 2005-04-26 15:37:55.000000000 +1000 +++ working-2.6/arch/ppc64/mm/hash_native.c 2005-05-02 16:51:44.000000000 +1000 @@ -320,8 +320,7 @@ j = 0; for (i = 0; i < number; i++) { - if ((batch->addr[i] >= USER_START) && - (batch->addr[i] <= USER_END)) + if (batch->addr[i] < KERNELBASE) vsid = get_vsid(context, batch->addr[i]); else vsid = get_kernel_vsid(batch->addr[i]); Index: working-2.6/arch/ppc64/mm/init.c =================================================================== --- 
working-2.6.orig/arch/ppc64/mm/init.c 2005-05-02 08:57:20.000000000 +1000 +++ working-2.6/arch/ppc64/mm/init.c 2005-05-02 16:38:18.000000000 +1000 @@ -64,6 +64,7 @@ #include #include #include +#include int mem_init_done; unsigned long ioremap_bot = IMALLOC_BASE; Index: working-2.6/include/asm-ppc64/mmu.h =================================================================== --- working-2.6.orig/include/asm-ppc64/mmu.h 2005-04-26 15:38:02.000000000 +1000 +++ working-2.6/include/asm-ppc64/mmu.h 2005-05-02 17:45:59.000000000 +1000 @@ -15,19 +15,10 @@ #include #include -#include -#ifndef __ASSEMBLY__ - -/* Time to allow for more things here */ -typedef unsigned long mm_context_id_t; -typedef struct { - mm_context_id_t id; -#ifdef CONFIG_HUGETLB_PAGE - pgd_t *huge_pgdir; - u16 htlb_segs; /* bitmask */ -#endif -} mm_context_t; +/* + * Segment table + */ #define STE_ESID_V 0x80 #define STE_ESID_KS 0x20 @@ -36,15 +27,48 @@ #define STE_VSID_SHIFT 12 -struct stab_entry { - unsigned long esid_data; - unsigned long vsid_data; -}; +/* Location of cpu0's segment table */ +#define STAB0_PAGE 0x9 +#define STAB0_PHYS_ADDR (STAB0_PAGE<> VSID_BITS) + (x & VSID_MODULUS); + return (x + ((x+1) >> VSID_BITS)) & VSID_MODULUS; +#endif /* 1 */ +} + +/* This is only valid for addresses >= KERNELBASE */ +static inline unsigned long get_kernel_vsid(unsigned long ea) +{ + return vsid_scramble(ea >> SID_SHIFT); +} + +/* This is only valid for user addresses (which are below 2^41) */ +static inline unsigned long get_vsid(unsigned long context, unsigned long ea) +{ + return vsid_scramble((context << USER_ESID_BITS) + | (ea >> SID_SHIFT)); +} + +#endif /* __ASSEMBLY */ + #endif /* _PPC64_MMU_H_ */ Index: working-2.6/arch/ppc64/mm/stab.c =================================================================== --- working-2.6.orig/arch/ppc64/mm/stab.c 2005-04-26 15:37:55.000000000 +1000 +++ working-2.6/arch/ppc64/mm/stab.c 2005-05-02 17:29:03.000000000 +1000 @@ -19,6 +19,11 @@ #include #include +struct 
stab_entry { + unsigned long esid_data; + unsigned long vsid_data; +}; + /* Both the segment table and SLB code uses the following cache */ #define NR_STAB_CACHE_ENTRIES 8 DEFINE_PER_CPU(long, stab_cache_ptr); Index: working-2.6/include/asm-ppc64/mmu_context.h =================================================================== --- working-2.6.orig/include/asm-ppc64/mmu_context.h 2005-04-26 15:38:02.000000000 +1000 +++ working-2.6/include/asm-ppc64/mmu_context.h 2005-05-02 17:41:49.000000000 +1000 @@ -84,86 +84,4 @@ local_irq_restore(flags); } -/* VSID allocation - * =============== - * - * We first generate a 36-bit "proto-VSID". For kernel addresses this - * is equal to the ESID, for user addresses it is: - * (context << 15) | (esid & 0x7fff) - * - * The two forms are distinguishable because the top bit is 0 for user - * addresses, whereas the top two bits are 1 for kernel addresses. - * Proto-VSIDs with the top two bits equal to 0b10 are reserved for - * now. - * - * The proto-VSIDs are then scrambled into real VSIDs with the - * multiplicative hash: - * - * VSID = (proto-VSID * VSID_MULTIPLIER) % VSID_MODULUS - * where VSID_MULTIPLIER = 268435399 = 0xFFFFFC7 - * VSID_MODULUS = 2^36-1 = 0xFFFFFFFFF - * - * This scramble is only well defined for proto-VSIDs below - * 0xFFFFFFFFF, so both proto-VSID and actual VSID 0xFFFFFFFFF are - * reserved. VSID_MULTIPLIER is prime, so in particular it is - * co-prime to VSID_MODULUS, making this a 1:1 scrambling function. - * Because the modulus is 2^n-1 we can compute it efficiently without - * a divide or extra multiply (see below). - * - * This scheme has several advantages over older methods: - * - * - We have VSIDs allocated for every kernel address - * (i.e. everything above 0xC000000000000000), except the very top - * segment, which simplifies several things. - * - * - We allow for 15 significant bits of ESID and 20 bits of - * context for user addresses. i.e. 
8T (43 bits) of address space for - * up to 1M contexts (although the page table structure and context - * allocation will need changes to take advantage of this). - * - * - The scramble function gives robust scattering in the hash - * table (at least based on some initial results). The previous - * method was more susceptible to pathological cases giving excessive - * hash collisions. - */ - -/* - * WARNING - If you change these you must make sure the asm - * implementations in slb_allocate(), do_stab_bolted and mmu.h - * (ASM_VSID_SCRAMBLE macro) are changed accordingly. - * - * You'll also need to change the precomputed VSID values in head.S - * which are used by the iSeries firmware. - */ - -static inline unsigned long vsid_scramble(unsigned long protovsid) -{ -#if 0 - /* The code below is equivalent to this function for arguments - * < 2^VSID_BITS, which is all this should ever be called - * with. However gcc is not clever enough to compute the - * modulus (2^n-1) without a second multiply. */ - return ((protovsid * VSID_MULTIPLIER) % VSID_MODULUS); -#else /* 1 */ - unsigned long x; - - x = protovsid * VSID_MULTIPLIER; - x = (x >> VSID_BITS) + (x & VSID_MODULUS); - return (x + ((x+1) >> VSID_BITS)) & VSID_MODULUS; -#endif /* 1 */ -} - -/* This is only valid for addresses >= KERNELBASE */ -static inline unsigned long get_kernel_vsid(unsigned long ea) -{ - return vsid_scramble(ea >> SID_SHIFT); -} - -/* This is only valid for user addresses (which are below 2^41) */ -static inline unsigned long get_vsid(unsigned long context, unsigned long ea) -{ - return vsid_scramble((context << USER_ESID_BITS) - | (ea >> SID_SHIFT)); -} - #endif /* __PPC64_MMU_CONTEXT_H */ -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! 
http://www.ozlabs.org/people/dgibson From omkhar at gentoo.org Tue May 3 10:44:41 2005 From: omkhar at gentoo.org (Omkhar Arasaratnam) Date: Mon, 02 May 2005 20:44:41 -0400 Subject: [BUG] 2.4.30 - Bring up on JS20 Fails Message-ID: <4276C979.3020300@gentoo.org> -----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 At first the kernel wouldn't compile as it was missing a header, which I have resolved : http://dev.gentoo.org/~omkhar/ppc64-autofs_4.patch After including this header I was able to compile, but on bring up i see the following: [boot]0012 Setup Arch pSeries_pci: this system has large bus numbers and the kernel was not built with the patch that fixes include/linux/pci.h struct pci_bus so number, primary, secondary and subordinate are ints. Kernel panic: pSeries_pci: this system has large bus numbers and the kernel was not built with the patch that fixes include/linux/pci.h struct pci_bus so number, primary, secondary and subordinate are ints. In idle task - not syncing Ideas? - -- Omkhar Arasaratnam - Gentoo PPC64 Developer omkhar at gentoo.org - http://dev.gentoo.org/~omkhar Gentoo Linux / PPC64 Linux: http://ppc64.gentoo.org -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.4.0 (MingW32) iD8DBQFCdsl59msUWjh2lHURAm9hAKCdfkUB5p+qx8hlQvzt7PnHgaLKqACeKe8y 6kUP8tOuOF+Zgi1OxkzOXKc= =QkrG -----END PGP SIGNATURE----- From benh at kernel.crashing.org Tue May 3 11:16:53 2005 From: benh at kernel.crashing.org (Benjamin Herrenschmidt) Date: Tue, 03 May 2005 11:16:53 +1000 Subject: [BUG] 2.4.30 - Bring up on JS20 Fails In-Reply-To: <4276C979.3020300@gentoo.org> References: <4276C979.3020300@gentoo.org> Message-ID: <1115083013.6155.37.camel@gaston> On Mon, 2005-05-02 at 20:44 -0400, Omkhar Arasaratnam wrote: > -----BEGIN PGP SIGNED MESSAGE----- > Hash: SHA1 > > At first the kernel wouldn't compile as it was missing a header, which I > have resolved : http://dev.gentoo.org/~omkhar/ppc64-autofs_4.patch > > After including this header I was able to compile, but on bring up i see > 
the following: > > [boot]0012 Setup Arch > pSeries_pci: this system has large bus numbers and the kernel was not > built with the patch that fixes include/linux/pci.h struct pci_bus so > number, primary, secondary and subordinate are ints. > Kernel panic: pSeries_pci: this system has large bus numbers and the > kernel was not > built with the patch that fixes include/linux/pci.h struct pci_bus so > number, primary, secondary and subordinate are ints. > In idle task - not syncing Why 2.4 ? Ben. From david at gibson.dropbear.id.au Tue May 3 11:23:43 2005 From: david at gibson.dropbear.id.au (David Gibson) Date: Tue, 3 May 2005 11:23:43 +1000 Subject: [PPC64] pgtable.h and other header cleanups In-Reply-To: <20050503002608.GA22453@localhost.localdomain> References: <20050503002608.GA22453@localhost.localdomain> Message-ID: <20050503012343.GB22453@localhost.localdomain> On Tue, May 03, 2005 at 10:26:08AM +1000, David Gibson wrote: > Andrew, please apply. > > This patch started as simply removing a few never-used macros from > asm-ppc64/pgtable.h, then kind of grew. It now makes a bunch of > cleanups to the ppc64 low-level header files (with corresponding > changes to .c files where necessary) such as: > - Abolishing never-used macros > - Eliminating multiple #defines with the same purpose > - Removing pointless macros (cases where just expanding the > macro everywhere turns out clearer and more sensible) > - Removing some cases where macros which could be defined in > terms of each other weren't > - Moving imalloc() related definitions from pgtable.h to their > own header file (imalloc.h) > - Re-arranging headers to group things more logically > - Moving all VSID allocation related things to mmu.h, instead > of being split between mmu.h and mmu_context.h > - Removing some reserved space for flags from the PMD - we're > not using it. Aargh! Don't apply, patch is broken (missing imalloc.h). Grr... I could have sworn I'd quilt added it. Fixed version coming shortly. 
-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/people/dgibson From anton at samba.org Tue May 3 13:13:22 2005 From: anton at samba.org (Anton Blanchard) Date: Tue, 3 May 2005 13:13:22 +1000 Subject: [BUG] 2.4.30 - Bring up on JS20 Fails In-Reply-To: <4276C979.3020300@gentoo.org> References: <4276C979.3020300@gentoo.org> Message-ID: <20050503031322.GG12682@krispykreme> Hi, > After including this header I was able to compile, but on bring up i see > the following: > > [boot]0012 Setup Arch > pSeries_pci: this system has large bus numbers and the kernel was not > built with the patch that fixes include/linux/pci.h struct pci_bus so > number, primary, secondary and subordinate are ints. > Kernel panic: pSeries_pci: this system has large bus numbers and the > kernel was not > built with the patch that fixes > include/linux/pci.h struct pci_bus so > number, primary, secondary and subordinate are ints. Do that and it should work :) It's to do with PCI domains and is fixed properly in 2.6. Anton From david at gibson.dropbear.id.au Tue May 3 13:33:32 2005 From: david at gibson.dropbear.id.au (David Gibson) Date: Tue, 3 May 2005 13:33:32 +1000 Subject: [PPC64] pgtable.h and other header cleanups In-Reply-To: <20050503012343.GB22453@localhost.localdomain> References: <20050503002608.GA22453@localhost.localdomain> <20050503012343.GB22453@localhost.localdomain> Message-ID: <20050503033332.GC22453@localhost.localdomain> On Tue, May 03, 2005 at 11:23:43AM +1000, David Gibson wrote: > On Tue, May 03, 2005 at 10:26:08AM +1000, David Gibson wrote: > > Andrew, please apply. > > > > This patch started as simply removing a few never-used macros from > > asm-ppc64/pgtable.h, then kind of grew. 
It now makes a bunch of > > cleanups to the ppc64 low-level header files (with corresponding > > changes to .c files where necessary) such as: > > - Abolishing never-used macros > > - Eliminating multiple #defines with the same purpose > > - Removing pointless macros (cases where just expanding the > > macro everywhere turns out clearer and more sensible) > > - Removing some cases where macros which could be defined in > > terms of each other weren't > > - Moving imalloc() related definitions from pgtable.h to their > > own header file (imalloc.h) > > - Re-arranging headers to group things more logically > > - Moving all VSID allocation related things to mmu.h, instead > > of being split between mmu.h and mmu_context.h > > - Removing some reserved space for flags from the PMD - we're > > not using it. > > Aargh! Don't apply, patch is broken (missing imalloc.h). Grr... I > could have sworn I'd quilt added it. Fixed version coming shortly. Ok, this time for sure. Andrew, please apply: This patch started as simply removing a few never-used macros from asm-ppc64/pgtable.h, then kind of grew. It now makes a bunch of cleanups to the ppc64 low-level header files (with corresponding changes to .c files where necessary) such as: - Abolishing never-used macros - Eliminating multiple #defines with the same purpose - Removing pointless macros (cases where just expanding the macro everywhere turns out clearer and more sensible) - Removing some cases where macros which could be defined in terms of each other weren't - Moving imalloc() related definitions from pgtable.h to their own header file (imalloc.h) - Re-arranging headers to group things more logically - Moving all VSID allocation related things to mmu.h, instead of being split between mmu.h and mmu_context.h - Removing some reserved space for flags from the PMD - we're not using it. - Fix some bugs which broke compile with STRICT_MM_TYPECHECKS. 
Signed-off-by: David Gibson Index: working-2.6/include/asm-ppc64/pgtable.h =================================================================== --- working-2.6.orig/include/asm-ppc64/pgtable.h 2005-05-02 08:57:22.000000000 +1000 +++ working-2.6/include/asm-ppc64/pgtable.h 2005-05-03 12:56:34.000000000 +1000 @@ -17,16 +17,6 @@ #include -/* PMD_SHIFT determines what a second-level page table entry can map */ -#define PMD_SHIFT (PAGE_SHIFT + PAGE_SHIFT - 3) -#define PMD_SIZE (1UL << PMD_SHIFT) -#define PMD_MASK (~(PMD_SIZE-1)) - -/* PGDIR_SHIFT determines what a third-level page table entry can map */ -#define PGDIR_SHIFT (PAGE_SHIFT + (PAGE_SHIFT - 3) + (PAGE_SHIFT - 2)) -#define PGDIR_SIZE (1UL << PGDIR_SHIFT) -#define PGDIR_MASK (~(PGDIR_SIZE-1)) - /* * Entries per page directory level. The PTE level must use a 64b record * for each page table entry. The PMD and PGD level use a 32b record for @@ -40,40 +30,30 @@ #define PTRS_PER_PMD (1 << PMD_INDEX_SIZE) #define PTRS_PER_PGD (1 << PGD_INDEX_SIZE) -#define USER_PTRS_PER_PGD (1024) -#define FIRST_USER_ADDRESS 0 +/* PMD_SHIFT determines what a second-level page table entry can map */ +#define PMD_SHIFT (PAGE_SHIFT + PTE_INDEX_SIZE) +#define PMD_SIZE (1UL << PMD_SHIFT) +#define PMD_MASK (~(PMD_SIZE-1)) -#define EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \ - PGD_INDEX_SIZE + PAGE_SHIFT) +/* PGDIR_SHIFT determines what a third-level page table entry can map */ +#define PGDIR_SHIFT (PMD_SHIFT + PMD_INDEX_SIZE) +#define PGDIR_SIZE (1UL << PGDIR_SHIFT) +#define PGDIR_MASK (~(PGDIR_SIZE-1)) + +#define FIRST_USER_ADDRESS 0 /* * Size of EA range mapped by our pagetables. 
*/ -#define PGTABLE_EA_BITS 41 -#define PGTABLE_EA_MASK ((1UL<> PMD_TO_PTEPAGE_SHIFT)) +#define pmd_page_kernel(pmd) (__bpn_to_ba(pmd_val(pmd))) #define pmd_page(pmd) virt_to_page(pmd_page_kernel(pmd)) #define pud_set(pudp, pmdp) (pud_val(*(pudp)) = (__ba_to_bpn(pmdp))) @@ -266,8 +242,6 @@ /* to find an entry in the ioremap page-table-directory */ #define pgd_offset_i(address) (ioremap_pgd + pgd_index(address)) -#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) - /* * The following only work if pte_present() is true. * Undefined behaviour if not.. @@ -442,7 +416,7 @@ pte_clear(mm, addr, ptep); flush_tlb_pending(); } - *ptep = __pte(pte_val(pte)) & ~_PAGE_HPTEFLAGS; + *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); } /* Set the dirty and/or accessed bits atomically in a linux PTE, this @@ -487,18 +461,13 @@ extern unsigned long ioremap_bot, ioremap_base; -#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) -#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS) - -#define pte_ERROR(e) \ - printk("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e)) #define pmd_ERROR(e) \ printk("%s:%d: bad pmd %08x.\n", __FILE__, __LINE__, pmd_val(e)) #define pgd_ERROR(e) \ printk("%s:%d: bad pgd %08x.\n", __FILE__, __LINE__, pgd_val(e)) -extern pgd_t swapper_pg_dir[1024]; -extern pgd_t ioremap_dir[1024]; +extern pgd_t swapper_pg_dir[]; +extern pgd_t ioremap_dir[]; extern void paging_init(void); @@ -540,43 +509,11 @@ */ #define kern_addr_valid(addr) (1) -#define io_remap_page_range(vma, vaddr, paddr, size, prot) \ - remap_pfn_range(vma, vaddr, (paddr) >> PAGE_SHIFT, size, prot) - #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ remap_pfn_range(vma, vaddr, pfn, size, prot) -#define MK_IOSPACE_PFN(space, pfn) (pfn) -#define GET_IOSPACE(pfn) 0 -#define GET_PFN(pfn) (pfn) - void pgtable_cache_init(void); -extern void hpte_init_native(void); -extern void hpte_init_lpar(void); -extern void hpte_init_iSeries(void); - -/* imalloc region types */ -#define IM_REGION_UNUSED 0x1 
-#define IM_REGION_SUBSET 0x2 -#define IM_REGION_EXISTS 0x4 -#define IM_REGION_OVERLAP 0x8 -#define IM_REGION_SUPERSET 0x10 - -extern struct vm_struct * im_get_free_area(unsigned long size); -extern struct vm_struct * im_get_area(unsigned long v_addr, unsigned long size, - int region_type); -unsigned long im_free(void *addr); - -extern long pSeries_lpar_hpte_insert(unsigned long hpte_group, - unsigned long va, unsigned long prpn, - int secondary, unsigned long hpteflags, - int bolted, int large); - -extern long native_hpte_insert(unsigned long hpte_group, unsigned long va, - unsigned long prpn, int secondary, - unsigned long hpteflags, int bolted, int large); - /* * find_linux_pte returns the address of a linux pte for a given * effective address and directory. If not found, it returns zero. Index: working-2.6/include/asm-ppc64/page.h =================================================================== --- working-2.6.orig/include/asm-ppc64/page.h 2005-05-02 08:57:22.000000000 +1000 +++ working-2.6/include/asm-ppc64/page.h 2005-05-03 13:08:06.000000000 +1000 @@ -23,7 +23,6 @@ #define PAGE_SHIFT 12 #define PAGE_SIZE (ASM_CONST(1) << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) -#define PAGE_OFFSET_MASK (PAGE_SIZE-1) #define SID_SHIFT 28 #define SID_MASK 0xfffffffffUL @@ -85,9 +84,6 @@ /* align addr on a size boundary - adjust address up if needed */ #define _ALIGN(addr,size) _ALIGN_UP(addr,size) -/* to align the pointer to the (next) double word boundary */ -#define DOUBLEWORD_ALIGN(addr) _ALIGN(addr,sizeof(unsigned long)) - /* to align the pointer to the (next) page boundary */ #define PAGE_ALIGN(addr) _ALIGN(addr, PAGE_SIZE) @@ -100,7 +96,6 @@ #define REGION_SIZE 4UL #define REGION_SHIFT 60UL #define REGION_MASK (((1UL<>REGION_SHIFT) -#define VMALLOC_REGION_ID (VMALLOCBASE>>REGION_SHIFT) -#define KERNEL_REGION_ID (KERNELBASE>>REGION_SHIFT) +#define IO_REGION_ID (IOREGIONBASE >> REGION_SHIFT) +#define VMALLOC_REGION_ID (VMALLOCBASE >> REGION_SHIFT) +#define 
KERNEL_REGION_ID (KERNELBASE >> REGION_SHIFT) #define USER_REGION_ID (0UL) -#define REGION_ID(X) (((unsigned long)(X))>>REGION_SHIFT) +#define REGION_ID(ea) (((unsigned long)(ea)) >> REGION_SHIFT) -#define __bpn_to_ba(x) ((((unsigned long)(x))<> PAGE_SHIFT) #define __va(x) ((void *)((unsigned long)(x) + KERNELBASE)) Index: working-2.6/arch/ppc64/mm/imalloc.c =================================================================== --- working-2.6.orig/arch/ppc64/mm/imalloc.c 2005-04-26 15:37:55.000000000 +1000 +++ working-2.6/arch/ppc64/mm/imalloc.c 2005-05-03 12:56:34.000000000 +1000 @@ -14,6 +14,7 @@ #include #include #include +#include static DECLARE_MUTEX(imlist_sem); struct vm_struct * imlist = NULL; @@ -23,11 +24,11 @@ unsigned long addr; struct vm_struct **p, *tmp; - addr = IMALLOC_START; + addr = ioremap_bot; for (p = &imlist; (tmp = *p) ; p = &tmp->next) { if (size + addr < (unsigned long) tmp->addr) break; - if ((unsigned long)tmp->addr >= IMALLOC_START) + if ((unsigned long)tmp->addr >= ioremap_bot) addr = tmp->size + (unsigned long) tmp->addr; if (addr > IMALLOC_END-size) return 1; Index: working-2.6/arch/ppc64/mm/hash_utils.c =================================================================== --- working-2.6.orig/arch/ppc64/mm/hash_utils.c 2005-04-26 15:37:55.000000000 +1000 +++ working-2.6/arch/ppc64/mm/hash_utils.c 2005-05-03 12:56:34.000000000 +1000 @@ -298,24 +298,23 @@ int local = 0; cpumask_t tmp; + if ((ea & ~REGION_MASK) > EADDR_MASK) + return 1; + switch (REGION_ID(ea)) { case USER_REGION_ID: user_region = 1; mm = current->mm; - if ((ea > USER_END) || (! mm)) + if (! 
mm) return 1; vsid = get_vsid(mm->context.id, ea); break; case IO_REGION_ID: - if (ea > IMALLOC_END) - return 1; mm = &ioremap_mm; vsid = get_kernel_vsid(ea); break; case VMALLOC_REGION_ID: - if (ea > VMALLOC_END) - return 1; mm = &init_mm; vsid = get_kernel_vsid(ea); break; @@ -362,7 +361,7 @@ unsigned long vsid, vpn, va, hash, secondary, slot; unsigned long huge = pte_huge(pte); - if ((ea >= USER_START) && (ea <= USER_END)) + if (ea < KERNELBASE) vsid = get_vsid(context, ea); else vsid = get_kernel_vsid(ea); Index: working-2.6/arch/ppc64/mm/hash_native.c =================================================================== --- working-2.6.orig/arch/ppc64/mm/hash_native.c 2005-04-26 15:37:55.000000000 +1000 +++ working-2.6/arch/ppc64/mm/hash_native.c 2005-05-03 12:56:34.000000000 +1000 @@ -320,8 +320,7 @@ j = 0; for (i = 0; i < number; i++) { - if ((batch->addr[i] >= USER_START) && - (batch->addr[i] <= USER_END)) + if (batch->addr[i] < KERNELBASE) vsid = get_vsid(context, batch->addr[i]); else vsid = get_kernel_vsid(batch->addr[i]); Index: working-2.6/arch/ppc64/mm/init.c =================================================================== --- working-2.6.orig/arch/ppc64/mm/init.c 2005-05-02 08:57:20.000000000 +1000 +++ working-2.6/arch/ppc64/mm/init.c 2005-05-03 12:56:34.000000000 +1000 @@ -64,6 +64,7 @@ #include #include #include +#include int mem_init_done; unsigned long ioremap_bot = IMALLOC_BASE; Index: working-2.6/include/asm-ppc64/mmu.h =================================================================== --- working-2.6.orig/include/asm-ppc64/mmu.h 2005-04-26 15:38:02.000000000 +1000 +++ working-2.6/include/asm-ppc64/mmu.h 2005-05-03 12:56:34.000000000 +1000 @@ -15,19 +15,10 @@ #include #include -#include -#ifndef __ASSEMBLY__ - -/* Time to allow for more things here */ -typedef unsigned long mm_context_id_t; -typedef struct { - mm_context_id_t id; -#ifdef CONFIG_HUGETLB_PAGE - pgd_t *huge_pgdir; - u16 htlb_segs; /* bitmask */ -#endif -} mm_context_t; +/* + * 
Segment table + */ #define STE_ESID_V 0x80 #define STE_ESID_KS 0x20 @@ -36,15 +27,48 @@ #define STE_VSID_SHIFT 12 -struct stab_entry { - unsigned long esid_data; - unsigned long vsid_data; -}; +/* Location of cpu0's segment table */ +#define STAB0_PAGE 0x9 +#define STAB0_PHYS_ADDR (STAB0_PAGE<> VSID_BITS) + (x & VSID_MODULUS); + return (x + ((x+1) >> VSID_BITS)) & VSID_MODULUS; +#endif /* 1 */ +} + +/* This is only valid for addresses >= KERNELBASE */ +static inline unsigned long get_kernel_vsid(unsigned long ea) +{ + return vsid_scramble(ea >> SID_SHIFT); +} + +/* This is only valid for user addresses (which are below 2^41) */ +static inline unsigned long get_vsid(unsigned long context, unsigned long ea) +{ + return vsid_scramble((context << USER_ESID_BITS) + | (ea >> SID_SHIFT)); +} + +#endif /* __ASSEMBLY */ + #endif /* _PPC64_MMU_H_ */ Index: working-2.6/arch/ppc64/mm/stab.c =================================================================== --- working-2.6.orig/arch/ppc64/mm/stab.c 2005-04-26 15:37:55.000000000 +1000 +++ working-2.6/arch/ppc64/mm/stab.c 2005-05-03 12:56:34.000000000 +1000 @@ -19,6 +19,11 @@ #include #include +struct stab_entry { + unsigned long esid_data; + unsigned long vsid_data; +}; + /* Both the segment table and SLB code uses the following cache */ #define NR_STAB_CACHE_ENTRIES 8 DEFINE_PER_CPU(long, stab_cache_ptr); Index: working-2.6/include/asm-ppc64/mmu_context.h =================================================================== --- working-2.6.orig/include/asm-ppc64/mmu_context.h 2005-04-26 15:38:02.000000000 +1000 +++ working-2.6/include/asm-ppc64/mmu_context.h 2005-05-03 12:56:34.000000000 +1000 @@ -84,86 +84,4 @@ local_irq_restore(flags); } -/* VSID allocation - * =============== - * - * We first generate a 36-bit "proto-VSID". 
For kernel addresses this - * is equal to the ESID, for user addresses it is: - * (context << 15) | (esid & 0x7fff) - * - * The two forms are distinguishable because the top bit is 0 for user - * addresses, whereas the top two bits are 1 for kernel addresses. - * Proto-VSIDs with the top two bits equal to 0b10 are reserved for - * now. - * - * The proto-VSIDs are then scrambled into real VSIDs with the - * multiplicative hash: - * - * VSID = (proto-VSID * VSID_MULTIPLIER) % VSID_MODULUS - * where VSID_MULTIPLIER = 268435399 = 0xFFFFFC7 - * VSID_MODULUS = 2^36-1 = 0xFFFFFFFFF - * - * This scramble is only well defined for proto-VSIDs below - * 0xFFFFFFFFF, so both proto-VSID and actual VSID 0xFFFFFFFFF are - * reserved. VSID_MULTIPLIER is prime, so in particular it is - * co-prime to VSID_MODULUS, making this a 1:1 scrambling function. - * Because the modulus is 2^n-1 we can compute it efficiently without - * a divide or extra multiply (see below). - * - * This scheme has several advantages over older methods: - * - * - We have VSIDs allocated for every kernel address - * (i.e. everything above 0xC000000000000000), except the very top - * segment, which simplifies several things. - * - * - We allow for 15 significant bits of ESID and 20 bits of - * context for user addresses. i.e. 8T (43 bits) of address space for - * up to 1M contexts (although the page table structure and context - * allocation will need changes to take advantage of this). - * - * - The scramble function gives robust scattering in the hash - * table (at least based on some initial results). The previous - * method was more susceptible to pathological cases giving excessive - * hash collisions. - */ - -/* - * WARNING - If you change these you must make sure the asm - * implementations in slb_allocate(), do_stab_bolted and mmu.h - * (ASM_VSID_SCRAMBLE macro) are changed accordingly. - * - * You'll also need to change the precomputed VSID values in head.S - * which are used by the iSeries firmware. 
- */ - -static inline unsigned long vsid_scramble(unsigned long protovsid) -{ -#if 0 - /* The code below is equivalent to this function for arguments - * < 2^VSID_BITS, which is all this should ever be called - * with. However gcc is not clever enough to compute the - * modulus (2^n-1) without a second multiply. */ - return ((protovsid * VSID_MULTIPLIER) % VSID_MODULUS); -#else /* 1 */ - unsigned long x; - - x = protovsid * VSID_MULTIPLIER; - x = (x >> VSID_BITS) + (x & VSID_MODULUS); - return (x + ((x+1) >> VSID_BITS)) & VSID_MODULUS; -#endif /* 1 */ -} - -/* This is only valid for addresses >= KERNELBASE */ -static inline unsigned long get_kernel_vsid(unsigned long ea) -{ - return vsid_scramble(ea >> SID_SHIFT); -} - -/* This is only valid for user addresses (which are below 2^41) */ -static inline unsigned long get_vsid(unsigned long context, unsigned long ea) -{ - return vsid_scramble((context << USER_ESID_BITS) - | (ea >> SID_SHIFT)); -} - #endif /* __PPC64_MMU_CONTEXT_H */ Index: working-2.6/include/asm-ppc64/imalloc.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ working-2.6/include/asm-ppc64/imalloc.h 2005-05-03 12:56:34.000000000 +1000 @@ -0,0 +1,24 @@ +#ifndef _PPC64_IMALLOC_H +#define _PPC64_IMALLOC_H + +/* + * Define the address range of the imalloc VM area. 
+ */ +#define PHBS_IO_BASE IOREGIONBASE +#define IMALLOC_BASE (IOREGIONBASE + 0x80000000ul) /* Reserve 2 gigs for PHBs */ +#define IMALLOC_END (IOREGIONBASE + EADDR_MASK) + + +/* imalloc region types */ +#define IM_REGION_UNUSED 0x1 +#define IM_REGION_SUBSET 0x2 +#define IM_REGION_EXISTS 0x4 +#define IM_REGION_OVERLAP 0x8 +#define IM_REGION_SUPERSET 0x10 + +extern struct vm_struct * im_get_free_area(unsigned long size); +extern struct vm_struct * im_get_area(unsigned long v_addr, unsigned long size, + int region_type); +unsigned long im_free(void *addr); + +#endif /* _PPC64_IMALLOC_H */ Index: working-2.6/arch/ppc64/kernel/pci.c =================================================================== --- working-2.6.orig/arch/ppc64/kernel/pci.c 2005-04-26 15:37:55.000000000 +1000 +++ working-2.6/arch/ppc64/kernel/pci.c 2005-05-03 12:56:34.000000000 +1000 @@ -438,7 +438,7 @@ int i; if (page_is_ram(offset >> PAGE_SHIFT)) - return prot; + return __pgprot(prot); prot |= _PAGE_NO_CACHE | _PAGE_GUARDED; -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/people/dgibson From paulus at samba.org Tue May 3 14:14:00 2005 From: paulus at samba.org (Paul Mackerras) Date: Tue, 3 May 2005 14:14:00 +1000 Subject: [PPC64] pgtable.h and other header cleanups In-Reply-To: <20050503033332.GC22453@localhost.localdomain> References: <20050503002608.GA22453@localhost.localdomain> <20050503012343.GB22453@localhost.localdomain> <20050503033332.GC22453@localhost.localdomain> Message-ID: <17014.64136.49697.910612@cargo.ozlabs.ibm.com> David Gibson writes: > This patch started as simply removing a few never-used macros from > asm-ppc64/pgtable.h, then kind of grew. 
It now makes a bunch of > cleanups to the ppc64 low-level header files (with corresponding > changes to .c files where necessary) such as: > - Abolishing never-used macros > - Eliminating multiple #defines with the same purpose > - Removing pointless macros (cases where just expanding the > macro everywhere turns out clearer and more sensible) > - Removing some cases where macros which could be defined in > terms of each other weren't > - Moving imalloc() related definitions from pgtable.h to their > own header file (imalloc.h) > - Re-arranging headers to group things more logically > - Moving all VSID allocation related things to mmu.h, instead > of being split between mmu.h and mmu_context.h > - Removing some reserved space for flags from the PMD - we're > not using it. > - Fix some bugs which broke compile with STRICT_MM_TYPECHECKS. > > Signed-off-by: David Gibson Acked-by: Paul Mackerras From sonny at burdell.org Tue May 3 15:02:12 2005 From: sonny at burdell.org (Sonny Rao) Date: Tue, 3 May 2005 01:02:12 -0400 Subject: 2.6.11 e1000 EEH MMIO failure Message-ID: <20050503050212.GA22395@kevlar.burdell.org> I'm guessing this means a bad e1000 card but I wanted to check with the experts. The box is a p690 w/ some expansion drawers attached, and is running a pretty-much stock 2.6.11 kernel, system is booted in SMP mode. Could it be related to e1000 errata "23" mentioned earlier on the mailing list? Here are the messages: Intel(R) PRO/1000 Network Driver - version 5.6.10.1-k2 Copyright (c) 1999-2004 Intel Corporation. 
e1000: eth3: e1000_probe: Intel(R) PRO/1000 Network Connection PCI: Enabling device: (000a:01:01.0), cmd 143 e1000: eth4: e1000_probe: Intel(R) PRO/1000 Network Connection PCI: Enabling device: (000a:01:01.1), cmd 143 e1000: eth5: e1000_probe: Intel(R) PRO/1000 Network Connection PCI: Enabling device: (000e:21:01.0), cmd 143 e1000: eth6: e1000_probe: Intel(R) PRO/1000 Network Connection PCI: Enabling device: (0011:21:01.0), cmd 143 e1000: eth7: e1000_probe: Intel(R) PRO/1000 Network Connection RTAS: event: 15, Type: Retry, Severity: 2 EEH: MMIO failure (2) on device: ethernet /pci at 3ffe7f0a000/pci at 2,2/ethernet at 1 Call Trace: [c00000103873a910] [c000000000631630] 0xc000000000631630 (unreliable) [c00000103873a990] [c000000000036a6c] .eeh_dn_check_failure+0x2e4/0x334 [c00000103873aa70] [c000000000036c20] .eeh_check_failure+0x164/0x1b0 [c00000103873ab10] [d0000000002a6b04] .e1000_check_for_link+0x5ac/0x664 [e1000] [c00000103873abd0] [d00000000029a5e0] .e1000_watchdog+0x48/0x79c [e1000] [c00000103873ac90] [c00000000005f558] .run_timer_softirq+0x15c/0x280 [c00000103873ad60] [c00000000005a3c4] .__do_softirq+0xdc/0x1c8 [c00000103873ae20] [c00000000005a538] .do_softirq+0x88/0x8c [c00000103873aeb0] [c000000000011520] .timer_interrupt+0x294/0x35c [c00000103873afb0] [c00000000000a2b8] decrementer_common+0xb8/0x100 --- Exception: 901 at ._spin_unlock_irqrestore+0x1c/0x28 LR = .rtas_call+0x1a4/0x2b4 [c00000103873b2a0] [c0000000001e8128] .snprintf+0x30/0x44 (unreliable) [c00000103873b2e0] [c00000000003421c] .rtas_call+0x110/0x2b4 [c00000103873b3a0] [c0000000000366ec] .read_slot_reset_state+0x94/0xac [c00000103873b420] [c000000000036890] .eeh_dn_check_failure+0x108/0x334 [c00000103873b500] [c000000000036c20] .eeh_check_failure+0x164/0x1b0 [c00000103873b5a0] [d00000000029f174] .e1000_up+0x404/0x40c [e1000] [c00000103873b650] [d00000000029f5cc] .e1000_open+0x54/0xc0 [e1000] [c00000103873b6e0] [c0000000002fec84] .dev_open+0x118/0x13c [c00000103873b780] [c0000000002fcef8] 
.dev_change_flags+0x19c/0x1d4 [c00000103873b820] [c000000000357878] .devinet_ioctl+0x66c/0x820 [c00000103873b930] [c000000000358794] .inet_ioctl+0x260/0x2e0 [c00000103873b9c0] [c0000000002f03a0] .sock_ioctl+0x28c/0x418 [c00000103873ba70] [c0000000000c7564] .do_ioctl+0x124/0x13c [c00000103873bb10] [c0000000000c777c] .vfs_ioctl+0x200/0x4e0 [c00000103873bbc0] [c0000000000c7ab8] .sys_ioctl+0x5c/0xa4 [c00000103873bc70] [c00000000001e8c0] .dev_ifsioc+0x8c/0x348 [c00000103873bd50] [c0000000000e7d24] .compat_sys_ioctl+0x46c/0x4c4 [c00000103873be30] [c00000000000d500] syscall_exit+0x0/0x18 e1000: eth7: e1000_watchdog: NIC Link is Up 1000 Mbps Full Duplex RTAS: event: 16, Type: Retry, Severity: 2 EEH: MMIO failure (2) on device: ethernet /pci at 3ffe7f0a000/pci at 2,2/ethernet at 1 Call Trace: [c00000103873b3a0] [c000000000631630] 0xc000000000631630 (unreliable) [c00000103873b420] [c000000000036a6c] .eeh_dn_check_failure+0x2e4/0x334 [c00000103873b500] [c000000000036c20] .eeh_check_failure+0x164/0x1b0 [c00000103873b5a0] [d00000000029f174] .e1000_up+0x404/0x40c [e1000] [c00000103873b650] [d00000000029f5cc] .e1000_open+0x54/0xc0 [e1000] [c00000103873b6e0] [c0000000002fec84] .dev_open+0x118/0x13c [c00000103873b780] [c0000000002fcef8] .dev_change_flags+0x19c/0x1d4 [c00000103873b820] [c000000000357878] .devinet_ioctl+0x66c/0x820 [c00000103873b930] [c000000000358794] .inet_ioctl+0x260/0x2e0 [c00000103873b9c0] [c0000000002f03a0] .sock_ioctl+0x28c/0x418 [c00000103873ba70] [c0000000000c7564] .do_ioctl+0x124/0x13c [c00000103873bb10] [c0000000000c777c] .vfs_ioctl+0x200/0x4e0 [c00000103873bbc0] [c0000000000c7ab8] .sys_ioctl+0x5c/0xa4 [c00000103873bc70] [c00000000001e8c0] .dev_ifsioc+0x8c/0x348 [c00000103873bd50] [c0000000000e7d24] .compat_sys_ioctl+0x46c/0x4c4 [c00000103873be30] [c00000000000d500] syscall_exit+0x0/0x18 EEH: MMIO failure (2), notifiying device 0011:21:01.0 EEH: MMIO failure (2), notifiying device 0011:21:01.0 PCI: Enabling device: (0014:01:01.0), cmd 143 e1000: eth8: 
e1000_probe: Intel(R) PRO/1000 Network Connection PCI: Enabling device: (0014:01:01.1), cmd 143 e1000: eth9: e1000_probe: Intel(R) PRO/1000 Network Connection PCI: Enabling device: (0017:01:01.0), cmd 143 e1000: eth10: e1000_probe: Intel(R) PRO/1000 Network Connection Sonny From benh at kernel.crashing.org Tue May 3 15:34:58 2005 From: benh at kernel.crashing.org (Benjamin Herrenschmidt) Date: Tue, 03 May 2005 15:34:58 +1000 Subject: [PATCH] Fix gcc 4.0 vs CONFIG_ALTIVEC Message-ID: <1115098498.6030.82.camel@gaston> Hi ! gcc-4.0 generates altivec code implicitly when -mcpu indicates an altivec capable CPU which is not suitable for the kernel. However, we used to set -mcpu=970 when CONFIG_ALTIVEC was set because a gcc-3.x bug prevented from using -maltivec along with -mcpu=power4, thus prevented building the RAID6 altivec code. This patch fixes all of this by testing for the gcc version. If 4.0 or later, just normally use -mcpu=power4 and let the RAID6 code add -maltivec to the few files it needs to be compiled with altivec support. For 3.x, we still use -mcpu=970 to work around the above problem, which is fine as 3.x will never implicitly generate altivec code. The Makefile hackery may not be the most lovely, I welcome anybody more skilled than me to improve it.
Signed-off-by: Benjamin Herrenschmidt Index: linux-work/arch/ppc64/Makefile =================================================================== --- linux-work.orig/arch/ppc64/Makefile 2005-05-02 10:48:08.000000000 +1000 +++ linux-work/arch/ppc64/Makefile 2005-05-03 14:38:43.000000000 +1000 @@ -56,13 +56,20 @@ CFLAGS += -msoft-float -pipe -mminimal-toc -mtraceback=none \ -mcall-aixdesc +GCC_VERSION := $(call cc-version) +GCC_BROKEN_VEC := $(shell if [ $(GCC_VERSION) -lt 0400 ] ; then echo "y"; fi ;) + ifeq ($(CONFIG_POWER4_ONLY),y) ifeq ($(CONFIG_ALTIVEC),y) +ifeq ($(GCC_BROKEN_VEC),y) CFLAGS += $(call cc-option,-mcpu=970) else CFLAGS += $(call cc-option,-mcpu=power4) endif else + CFLAGS += $(call cc-option,-mcpu=power4) +endif +else CFLAGS += $(call cc-option,-mtune=power4) endif From benh at kernel.crashing.org Tue May 3 16:08:25 2005 From: benh at kernel.crashing.org (Benjamin Herrenschmidt) Date: Tue, 03 May 2005 16:08:25 +1000 Subject: [PATCH] Fix gcc 4.0 vs CONFIG_ALTIVEC In-Reply-To: <1115098498.6030.82.camel@gaston> References: <1115098498.6030.82.camel@gaston> Message-ID: <1115100505.6030.91.camel@gaston> On Tue, 2005-05-03 at 15:35 +1000, Benjamin Herrenschmidt wrote: > Hi ! > > gcc-4.0 generates altivec code implicitely when -mcpu insidates an > altivec capable CPU which is not suitable for the kenrel. Damn ! I should stay away from a keyboard today ! Oh well, at least the patch itself looks ok. Ben. From arnd at arndb.de Tue May 3 19:46:03 2005 From: arnd at arndb.de (Arnd Bergmann) Date: Tue, 3 May 2005 11:46:03 +0200 Subject: [PATCH 1/2] ppc64: fix read/write on large /dev/nvram In-Reply-To: <7845758a806ed6769cea59a9df344d39@bga.com> References: <7845758a806ed6769cea59a9df344d39@bga.com> Message-ID: <200505031146.04824.arnd@arndb.de> On Maandag 02 Mai 2005 08:43, Milton Miller wrote: > On Fri Apr 22 16:49:59 EST 2005, Arnd wrote a patch with the following > lines (among several others). 
> > - len = ppc_md.nvram_read(tmp_buffer, count, ppos); > + ret = ppc_md.nvram_read(tmp, count, ppos); > > - len = ppc_md.nvram_write(tmp_buffer, count, ppos); > + ret = ppc_md.nvram_read(tmp, count, ppos); > > > Even though I am just scanning, I am guessing this is not quite right. Good catch. I only tested the read path because I did not want to mess with the contents of the nvram. I'll do a new patch when I come back to Germany, unless someone else (Utz?) does one first. Arnd <>< From omkhar at gentoo.org Wed May 4 02:27:51 2005 From: omkhar at gentoo.org (Omkhar Arasaratnam) Date: Tue, 03 May 2005 12:27:51 -0400 Subject: [BUG] 2.4.30 - Bring up on JS20 Fails In-Reply-To: <20050503031322.GG12682@krispykreme> References: <4276C979.3020300@gentoo.org> <20050503031322.GG12682@krispykreme> Message-ID: <4277A687.8010806@gentoo.org> -----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 Anton Blanchard wrote: > > Hi, > >> After including this header I was able to compile, but on bring >> up i see the following: >> >> [boot]0012 Setup Arch pSeries_pci: this system has large bus >> numbers and the kernel was not built with the patch that fixes >> include/linux/pci.h struct pci_bus so number, primary, secondary >> and subordinate are ints. Kernel panic: pSeries_pci: this system >> has large bus numbers and the kernel was not built with the patch >> that fixes > > >> include/linux/pci.h struct pci_bus so number, primary, secondary >> and subordinate are ints. > > > Do that and it should work :) Its to do with PCI domains and is > fixed properly in 2.6. 
> > Anton > Long story short - someone asked - I'll try and quantify thier requirements but thats about it for now - -- Omkhar Arasaratnam - Gentoo PPC64 Developer omkhar at gentoo.org - http://dev.gentoo.org/~omkhar Gentoo Linux / PPC64 Linux: http://ppc64.gentoo.org -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.4.0 (MingW32) iD8DBQFCd6aH9msUWjh2lHURArUrAJ0afKK091NgJV/3J9TbmTFreT+gLgCghO9B sh6ijo7Mmkg28spwNR06MvY= =Wq8s -----END PGP SIGNATURE----- From sonny at burdell.org Wed May 4 04:17:47 2005 From: sonny at burdell.org (Sonny Rao) Date: Tue, 3 May 2005 14:17:47 -0400 Subject: 2.6.11 e1000 EEH MMIO failure In-Reply-To: <20050503050212.GA22395@kevlar.burdell.org> References: <20050503050212.GA22395@kevlar.burdell.org> Message-ID: <20050503181747.GB7870@kevlar.burdell.org> On Tue, May 03, 2005 at 01:02:12AM -0400, Sonny Rao wrote: > I'm guessing this means a bad e1000 card but I wanted to check with > the experts. The box is a p690 w/ some expansion drawers attached, > and is running a pretty-much stock 2.6.11 kernel, system is booted in > SMP mode. > > Could it be related to e1000 errata "23" mentioned earlier on the > mailing list? > This little bugger is causing a lot of spew into my logs, is there a way to tell EEH to just offline that PCI device ? Isn't that what it's supposed to do? Is there a PCI hotplug FAQ or README somewhere that I can read (and stop posting this crap to the list :) ) Thanks, Sonny From linas at austin.ibm.com Wed May 4 08:46:32 2005 From: linas at austin.ibm.com (Linas Vepstas) Date: Tue, 3 May 2005 17:46:32 -0500 Subject: 2.6.11 e1000 EEH MMIO failure In-Reply-To: <20050503050212.GA22395@kevlar.burdell.org> References: <20050503050212.GA22395@kevlar.burdell.org> Message-ID: <20050503224632.GF11745@austin.ibm.com> Recent e1000 code has some new kind of whiz-bang watchdog timer code that is causing the device to DMA off into hyperspace, thus triggering the EEH code. It's not clear to me if the 2.6.11 kernel has this code. 
Am cc'ing two people who should know.... --linas On Tue, May 03, 2005 at 01:02:12AM -0400, Sonny Rao was heard to remark: > I'm guessing this means a bad e1000 card but I wanted to check with > the experts. The box is a p690 w/ some expansion drawers attached, > and is running a pretty-much stock 2.6.11 kernel, system is booted in > SMP mode. > > Could it be related to e1000 errata "23" mentioned earlier on the > mailing list? > > Here are the messages: > > Intel(R) PRO/1000 Network Driver - version 5.6.10.1-k2 > Copyright (c) 1999-2004 Intel Corporation. > > > > e1000: eth3: e1000_probe: Intel(R) PRO/1000 Network Connection > PCI: Enabling device: (000a:01:01.0), cmd 143 > e1000: eth4: e1000_probe: Intel(R) PRO/1000 Network Connection > PCI: Enabling device: (000a:01:01.1), cmd 143 > e1000: eth5: e1000_probe: Intel(R) PRO/1000 Network Connection > PCI: Enabling device: (000e:21:01.0), cmd 143 > e1000: eth6: e1000_probe: Intel(R) PRO/1000 Network Connection > PCI: Enabling device: (0011:21:01.0), cmd 143 > e1000: eth7: e1000_probe: Intel(R) PRO/1000 Network Connection > RTAS: event: 15, Type: Retry, Severity: 2 > EEH: MMIO failure (2) on device: ethernet /pci at 3ffe7f0a000/pci at 2,2/ethernet at 1 > Call Trace: > [c00000103873a910] [c000000000631630] 0xc000000000631630 (unreliable) > [c00000103873a990] [c000000000036a6c] .eeh_dn_check_failure+0x2e4/0x334 > [c00000103873aa70] [c000000000036c20] .eeh_check_failure+0x164/0x1b0 > [c00000103873ab10] [d0000000002a6b04] .e1000_check_for_link+0x5ac/0x664 [e1000] > [c00000103873abd0] [d00000000029a5e0] .e1000_watchdog+0x48/0x79c [e1000] > [c00000103873ac90] [c00000000005f558] .run_timer_softirq+0x15c/0x280 > [c00000103873ad60] [c00000000005a3c4] .__do_softirq+0xdc/0x1c8 > [c00000103873ae20] [c00000000005a538] .do_softirq+0x88/0x8c > [c00000103873aeb0] [c000000000011520] .timer_interrupt+0x294/0x35c > [c00000103873afb0] [c00000000000a2b8] decrementer_common+0xb8/0x100 > --- Exception: 901 at 
._spin_unlock_irqrestore+0x1c/0x28 > LR = .rtas_call+0x1a4/0x2b4 > [c00000103873b2a0] [c0000000001e8128] .snprintf+0x30/0x44 (unreliable) > [c00000103873b2e0] [c00000000003421c] .rtas_call+0x110/0x2b4 > [c00000103873b3a0] [c0000000000366ec] .read_slot_reset_state+0x94/0xac > [c00000103873b420] [c000000000036890] .eeh_dn_check_failure+0x108/0x334 > [c00000103873b500] [c000000000036c20] .eeh_check_failure+0x164/0x1b0 > [c00000103873b5a0] [d00000000029f174] .e1000_up+0x404/0x40c [e1000] > [c00000103873b650] [d00000000029f5cc] .e1000_open+0x54/0xc0 [e1000] > [c00000103873b6e0] [c0000000002fec84] .dev_open+0x118/0x13c > [c00000103873b780] [c0000000002fcef8] .dev_change_flags+0x19c/0x1d4 > [c00000103873b820] [c000000000357878] .devinet_ioctl+0x66c/0x820 > [c00000103873b930] [c000000000358794] .inet_ioctl+0x260/0x2e0 > [c00000103873b9c0] [c0000000002f03a0] .sock_ioctl+0x28c/0x418 > [c00000103873ba70] [c0000000000c7564] .do_ioctl+0x124/0x13c > [c00000103873bb10] [c0000000000c777c] .vfs_ioctl+0x200/0x4e0 > [c00000103873bbc0] [c0000000000c7ab8] .sys_ioctl+0x5c/0xa4 > [c00000103873bc70] [c00000000001e8c0] .dev_ifsioc+0x8c/0x348 > [c00000103873bd50] [c0000000000e7d24] .compat_sys_ioctl+0x46c/0x4c4 > [c00000103873be30] [c00000000000d500] syscall_exit+0x0/0x18 > e1000: eth7: e1000_watchdog: NIC Link is Up 1000 Mbps Full Duplex > RTAS: event: 16, Type: Retry, Severity: 2 > EEH: MMIO failure (2) on device: ethernet /pci at 3ffe7f0a000/pci at 2,2/ethernet at 1 > Call Trace: > [c00000103873b3a0] [c000000000631630] 0xc000000000631630 (unreliable) > [c00000103873b420] [c000000000036a6c] .eeh_dn_check_failure+0x2e4/0x334 > [c00000103873b500] [c000000000036c20] .eeh_check_failure+0x164/0x1b0 > [c00000103873b5a0] [d00000000029f174] .e1000_up+0x404/0x40c [e1000] > [c00000103873b650] [d00000000029f5cc] .e1000_open+0x54/0xc0 [e1000] > [c00000103873b6e0] [c0000000002fec84] .dev_open+0x118/0x13c > [c00000103873b780] [c0000000002fcef8] .dev_change_flags+0x19c/0x1d4 > [c00000103873b820] 
[c000000000357878] .devinet_ioctl+0x66c/0x820 > [c00000103873b930] [c000000000358794] .inet_ioctl+0x260/0x2e0 > [c00000103873b9c0] [c0000000002f03a0] .sock_ioctl+0x28c/0x418 > [c00000103873ba70] [c0000000000c7564] .do_ioctl+0x124/0x13c > [c00000103873bb10] [c0000000000c777c] .vfs_ioctl+0x200/0x4e0 > [c00000103873bbc0] [c0000000000c7ab8] .sys_ioctl+0x5c/0xa4 > [c00000103873bc70] [c00000000001e8c0] .dev_ifsioc+0x8c/0x348 > [c00000103873bd50] [c0000000000e7d24] .compat_sys_ioctl+0x46c/0x4c4 > [c00000103873be30] [c00000000000d500] syscall_exit+0x0/0x18 > EEH: MMIO failure (2), notifiying device 0011:21:01.0 > EEH: MMIO failure (2), notifiying device 0011:21:01.0 > PCI: Enabling device: (0014:01:01.0), cmd 143 > e1000: eth8: e1000_probe: Intel(R) PRO/1000 Network Connection > PCI: Enabling device: (0014:01:01.1), cmd 143 > e1000: eth9: e1000_probe: Intel(R) PRO/1000 Network Connection > PCI: Enabling device: (0017:01:01.0), cmd 143 > e1000: eth10: e1000_probe: Intel(R) PRO/1000 Network Connection > > Sonny > _______________________________________________ > Linuxppc64-dev mailing list > Linuxppc64-dev at ozlabs.org > https://ozlabs.org/cgi-bin/mailman/listinfo/linuxppc64-dev > From linas at austin.ibm.com Wed May 4 08:55:08 2005 From: linas at austin.ibm.com (Linas Vepstas) Date: Tue, 3 May 2005 17:55:08 -0500 Subject: 2.6.11 e1000 EEH MMIO failure In-Reply-To: <20050503181747.GB7870@kevlar.burdell.org> References: <20050503050212.GA22395@kevlar.burdell.org> <20050503181747.GB7870@kevlar.burdell.org> Message-ID: <20050503225508.GG11745@austin.ibm.com> On Tue, May 03, 2005 at 02:17:47PM -0400, Sonny Rao was heard to remark: > > This little bugger is causing a lot of spew into my logs, is there a > way to tell EEH to just offline that PCI device ? Isn't that what > it's supposed to do? 
Is there a PCI hotplug FAQ or README somewhere > that I can read (and stop posting this crap to the list :) ) You can prevent it from panicing by setting "panic_on_oops" to 0 echo 0 > /proc/sys/kernel/panic_on_oops Unfortunately, there is no boot-prompt option for this; there should be a __setup(panic_on_oops) added to kernel/panic.c As to actually recovering from that error-- you might try applying one of the earlier posted EEH patches; it should work. These earlier patches aren't in the mainline kernel because they have deficiencies. I'm supposed to be re-writing the code to make an EEH patch that is generally acceptable as a real patch, but am currently snowed under with other activities. --linas From sonny at burdell.org Wed May 4 10:29:03 2005 From: sonny at burdell.org (Sonny Rao) Date: Tue, 3 May 2005 20:29:03 -0400 Subject: 2.6.11 e1000 EEH MMIO failure In-Reply-To: <20050503224632.GF11745@austin.ibm.com> References: <20050503050212.GA22395@kevlar.burdell.org> <20050503224632.GF11745@austin.ibm.com> Message-ID: <20050504002903.GA11855@kevlar.burdell.org> On Tue, May 03, 2005 at 05:46:32PM -0500, Linas Vepstas wrote: > > Recent e1000 code has some new kind of whiz-bang watchdog timer > code that is causing the device to DMA off into hyperspace, > thus triggering the EEH code. It's not clear to me if the > 2.6.11 kernel has this code. > > Am cc'ing two people who should know.... > > --linas > Well that machine has other e1000 cards in it that aren't doing this, so I'm thinking it really is bad hardware in my case. 
If you want this card for testing EEH code or something, I just found it and ripped it out earlier today, and you can have it :-) Sonny From sonny at burdell.org Wed May 4 10:33:05 2005 From: sonny at burdell.org (Sonny Rao) Date: Tue, 3 May 2005 20:33:05 -0400 Subject: 2.6.11 e1000 EEH MMIO failure In-Reply-To: <20050503225508.GG11745@austin.ibm.com> References: <20050503050212.GA22395@kevlar.burdell.org> <20050503181747.GB7870@kevlar.burdell.org> <20050503225508.GG11745@austin.ibm.com> Message-ID: <20050504003305.GB11855@kevlar.burdell.org> On Tue, May 03, 2005 at 05:55:08PM -0500, Linas Vepstas wrote: > On Tue, May 03, 2005 at 02:17:47PM -0400, Sonny Rao was heard to remark: > > > > This little bugger is causing a lot of spew into my logs, is there a > > way to tell EEH to just offline that PCI device ? Isn't that what > > it's supposed to do? Is there a PCI hotplug FAQ or README somewhere > > that I can read (and stop posting this crap to the list :) ) > > You can prevent it from panicing by setting "panic_on_oops" to 0 > echo 0 > /proc/sys/kernel/panic_on_oops > > Unfortunately, there is no boot-prompt option for this; > there should be a __setup(panic_on_oops) added to kernel/panic.c Hmm okay, so it isn't actually causing a panic in my case, which I think is good mind you :) I didn't actually try and use it though, it was just in that machine among other e1000s. > As to actually recovering from that error-- you might try applying > one of the earlier posted EEH patches; it should work. These earlier > patches aren't in the mainline kernel because they have deficiencies. > > I'm supposed to be re-writing the code to make an EEH patch that is > generally acceptable as a real patch, but am currently snowed under > with other activities. Ah okay cool, so in the future Linux will be able to smartly handle it, very nice. 
Unfortunately I can't really test your patch because several other people need to use the machine which is normally partitioned up (and that particular device is left out of any LPAR config) I just happend to boot the full-system partition to do some tests and noticed the problem. Again, if someone wants to do something with that card, let me know, otherwise I'm going to toss it out. Sonny From anton at samba.org Wed May 4 14:37:45 2005 From: anton at samba.org (Anton Blanchard) Date: Wed, 4 May 2005 14:37:45 +1000 Subject: [PATCH] remove io_page_mask Message-ID: <20050504043745.GJ13590@krispykreme> Hi Jake, I found an issue with the io_page_mask code when pci_probe_only is not set (we dont initialise io_page_mask and bad things happen). I was about to fix it up when I wondered if we can remove it now. Ben changed the serial code to check before it goes pounding on addresses. Im not sure if there were other issues with badly behaving drivers but my js20 boots here with the following removal patch. Thoughts? 
Anton Index: foobar2/include/asm-ppc64/io.h =================================================================== --- foobar2.orig/include/asm-ppc64/io.h 2005-05-04 13:51:41.245647479 +1000 +++ foobar2/include/asm-ppc64/io.h 2005-05-04 13:55:14.823405718 +1000 @@ -33,12 +33,6 @@ extern unsigned long isa_io_base; extern unsigned long pci_io_base; -extern unsigned long io_page_mask; - -#define MAX_ISA_PORT 0x10000 - -#define _IO_IS_VALID(port) ((port) >= MAX_ISA_PORT || (1 << (port>>PAGE_SHIFT)) \ - & io_page_mask) #ifdef CONFIG_PPC_ISERIES /* __raw_* accessors aren't supported on iSeries */ Index: foobar2/include/asm-ppc64/eeh.h =================================================================== --- foobar2.orig/include/asm-ppc64/eeh.h 2005-05-04 13:51:41.246647403 +1000 +++ foobar2/include/asm-ppc64/eeh.h 2005-05-04 13:55:14.825405566 +1000 @@ -310,8 +310,6 @@ static inline u8 eeh_inb(unsigned long port) { u8 val; - if (!_IO_IS_VALID(port)) - return ~0; val = in_8((u8 __iomem *)(port+pci_io_base)); if (EEH_POSSIBLE_ERROR(val, u8)) return eeh_check_failure((void __iomem *)(port), val); @@ -320,15 +318,12 @@ static inline void eeh_outb(u8 val, unsigned long port) { - if (_IO_IS_VALID(port)) - out_8((u8 __iomem *)(port+pci_io_base), val); + out_8((u8 __iomem *)(port+pci_io_base), val); } static inline u16 eeh_inw(unsigned long port) { u16 val; - if (!_IO_IS_VALID(port)) - return ~0; val = in_le16((u16 __iomem *)(port+pci_io_base)); if (EEH_POSSIBLE_ERROR(val, u16)) return eeh_check_failure((void __iomem *)(port), val); @@ -337,15 +332,12 @@ static inline void eeh_outw(u16 val, unsigned long port) { - if (_IO_IS_VALID(port)) - out_le16((u16 __iomem *)(port+pci_io_base), val); + out_le16((u16 __iomem *)(port+pci_io_base), val); } static inline u32 eeh_inl(unsigned long port) { u32 val; - if (!_IO_IS_VALID(port)) - return ~0; val = in_le32((u32 __iomem *)(port+pci_io_base)); if (EEH_POSSIBLE_ERROR(val, u32)) return eeh_check_failure((void __iomem *)(port), val); @@ -354,8 
+346,7 @@ static inline void eeh_outl(u32 val, unsigned long port) { - if (_IO_IS_VALID(port)) - out_le32((u32 __iomem *)(port+pci_io_base), val); + out_le32((u32 __iomem *)(port+pci_io_base), val); } /* in-string eeh macros */ Index: foobar2/arch/ppc64/kernel/iSeries_pci.c =================================================================== --- foobar2.orig/arch/ppc64/kernel/iSeries_pci.c 2005-05-04 13:55:12.042223389 +1000 +++ foobar2/arch/ppc64/kernel/iSeries_pci.c 2005-05-04 13:55:52.221213083 +1000 @@ -47,8 +47,6 @@ #include "pci.h" -extern unsigned long io_page_mask; - /* * Forward declares of prototypes. */ @@ -291,7 +289,6 @@ PPCDBG(PPCDBG_BUSWALK, "iSeries_pcibios_init Entry.\n"); iomm_table_initialize(); find_and_init_phbs(); - io_page_mask = -1; PPCDBG(PPCDBG_BUSWALK, "iSeries_pcibios_init Exit.\n"); } Index: foobar2/arch/ppc64/kernel/pci.c =================================================================== --- foobar2.orig/arch/ppc64/kernel/pci.c 2005-05-04 13:55:12.047223007 +1000 +++ foobar2/arch/ppc64/kernel/pci.c 2005-05-04 13:55:52.226212702 +1000 @@ -42,15 +42,6 @@ unsigned long pci_probe_only = 1; unsigned long pci_assign_all_buses = 0; -/* - * legal IO pages under MAX_ISA_PORT. This is to ensure we don't touch - * devices we don't have access to. 
- */ -unsigned long io_page_mask; - -EXPORT_SYMBOL(io_page_mask); - - unsigned int pcibios_assign_all_busses(void) { return pci_assign_all_buses; @@ -674,8 +665,6 @@ pci_process_ISA_OF_ranges(isa_dn, hose->io_base_phys, hose->io_base_virt); of_node_put(isa_dn); - /* Allow all IO */ - io_page_mask = -1; } } @@ -837,24 +826,9 @@ if (dev->resource[i].flags & IORESOURCE_IO) { unsigned long offset = (unsigned long)hose->io_base_virt - pci_io_base; - unsigned long start, end, mask; - - start = dev->resource[i].start += offset; - end = dev->resource[i].end += offset; - /* Need to allow IO access to pages that are in the - ISA range */ - if (start < MAX_ISA_PORT) { - if (end > MAX_ISA_PORT) - end = MAX_ISA_PORT; - - start >>= PAGE_SHIFT; - end >>= PAGE_SHIFT; - - /* get the range of pages for the map */ - mask = ((1 << (end+1))-1) ^ ((1 << start)-1); - io_page_mask |= mask; - } + dev->resource[i].start += offset; + dev->resource[i].end += offset; } else if (dev->resource[i].flags & IORESOURCE_MEM) { dev->resource[i].start += hose->pci_mem_offset; Index: foobar2/arch/ppc64/kernel/maple_pci.c =================================================================== --- foobar2.orig/arch/ppc64/kernel/maple_pci.c 2005-05-04 13:55:12.044223236 +1000 +++ foobar2/arch/ppc64/kernel/maple_pci.c 2005-05-04 13:55:52.223212931 +1000 @@ -454,9 +454,6 @@ /* Tell pci.c to use the common resource allocation mecanism */ pci_probe_only = 0; - - /* Allow all IO */ - io_page_mask = -1; } int maple_pci_get_legacy_ide_irq(struct pci_dev *pdev, int channel) Index: foobar2/arch/ppc64/kernel/pmac_pci.c =================================================================== --- foobar2.orig/arch/ppc64/kernel/pmac_pci.c 2005-05-04 13:55:12.050222778 +1000 +++ foobar2/arch/ppc64/kernel/pmac_pci.c 2005-05-04 13:55:52.228212549 +1000 @@ -755,9 +755,6 @@ /* Tell pci.c to not use the common resource allocation mecanism */ pci_probe_only = 1; - - /* Allow all IO */ - io_page_mask = -1; } /* Index: 
foobar2/arch/ppc64/kernel/iomap.c =================================================================== --- foobar2.orig/arch/ppc64/kernel/iomap.c 2005-05-04 13:16:13.000000000 +1000 +++ foobar2/arch/ppc64/kernel/iomap.c 2005-05-04 14:02:43.887671757 +1000 @@ -88,8 +88,6 @@ void __iomem *ioport_map(unsigned long port, unsigned int len) { - if (!_IO_IS_VALID(port)) - return NULL; return (void __iomem *) (port+pci_io_base); } From anton at samba.org Wed May 4 15:42:58 2005 From: anton at samba.org (Anton Blanchard) Date: Wed, 4 May 2005 15:42:58 +1000 Subject: [PATCH] remove io_page_mask In-Reply-To: <20050504043745.GJ13590@krispykreme> References: <20050504043745.GJ13590@krispykreme> Message-ID: <20050504054258.GK13590@krispykreme> > I found an issue with the io_page_mask code when pci_probe_only is not > set (we dont initialise io_page_mask and bad things happen). I was > about to fix it up when I wondered if we can remove it now. > > Ben changed the serial code to check before it goes pounding on > addresses. Im not sure if there were other issues with badly behaving > drivers but my js20 boots here with the following removal patch. First fallout: parport_pc_probe_port+0x318/0xbd0 parport_pc_init+0x418/0x520 sys_init_module+0x1bc/0x4b0 The parallel port code is going out and touching stuff it shouldnt. Anton From will_schmidt at vnet.ibm.com Wed May 4 23:12:27 2005 From: will_schmidt at vnet.ibm.com (will schmidt) Date: Wed, 04 May 2005 08:12:27 -0500 Subject: (resend) RFC/Patch xmon pte/pgd/ userspace address additions Message-ID: <4278CA3B.6060700@vnet.ibm.com> Hi Folks, This is a resend. I didnt see this on the patch page, so suspect it got lost. (Do updated patches need to be sent to the list under a different thread?) 
-Will -------- Original Message -------- Subject: Re: RFC/Patch more xmon additions Date: Thu, 07 Apr 2005 08:39:36 -0500 From: will schmidt To: will schmidt CC: linuxppc64-dev at ozlabs.org References: <421E3BE3.90301 at vnet.ibm.com> Hi All, here's a revised version of my initial patch. - I've removed the try_spinlock code; - As an alternative to duplicating lots of functions to add mread calls in place of references, I've added setjmp(bus_error_jmp) {} around what seem more likely to be critical areas. - cleaned up spacing - changed most of the function names to be xmon_xxx instead of wm_xxx. These functions show up under a submenu 'w'. Use "w?" at xmon> prompt to get the help blurb. -Will will schmidt wrote: > > Hi Folks, > Am looking for comments on this additional function I've added to xmon > on the side.. > > The bulk of my intent was to make it easier for me to poke at memory > within a particular user process. > > I realize that the spacing is a bit screwed up, and the function names > should eventually change. Because I couldn't decide on letters for the > new functions, I put them under a submenu 'w'. > > wP will dump info on all processes. > > wp 0xabc will make process with pid 0xabc the active pid. <- active > only with respect to xmon poking into memory. > > wd 0xabcd1234 - will call through the pgd/pmd functions and return the > kernel address corresponding to 0xabcd1234 within the process's memory > space location. > > wg will dump gprs of the process/thread. > > -Will > > > ------------------------------------------------------------------------ > > > _______________________________________________ > Linuxppc64-dev mailing list > Linuxppc64-dev at ozlabs.org > https://ozlabs.org/cgi-bin/mailman/listinfo/linuxppc64-dev -------------- next part -------------- An embedded and charset-unspecified text was scrubbed... 
Name: xmon_pxd_code_apr7.diff Url: http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20050504/e50cfe05/attachment.txt From linas at austin.ibm.com Thu May 5 02:28:55 2005 From: linas at austin.ibm.com (Linas Vepstas) Date: Wed, 4 May 2005 11:28:55 -0500 Subject: 2.6.11 e1000 EEH MMIO failure In-Reply-To: <20050504003305.GB11855@kevlar.burdell.org> References: <20050503050212.GA22395@kevlar.burdell.org> <20050503181747.GB7870@kevlar.burdell.org> <20050503225508.GG11745@austin.ibm.com> <20050504003305.GB11855@kevlar.burdell.org> Message-ID: <20050504162855.GH11745@austin.ibm.com> On Tue, May 03, 2005 at 08:33:05PM -0400, Sonny Rao was heard to remark: > > Ah okay cool, so in the future Linux will be able to smartly handle > it, very nice. Unfortunately I can't really test your patch because > several other people need to use the machine which is normally > partitioned up (and that particular device is left out of any LPAR > config) I just happend to boot the full-system partition to do some > tests and noticed the problem. There's supposed to be some code that allows slots to be dynamically added and removed from running partitions, but I've never tried it myself. > Again, if someone wants to do something with that card, let me know, > otherwise I'm going to toss it out. FWIW, field experience shows that nine out of ten failures are due to poorly seated PCI cards. Before you chuck it, you might want to remove it, make sure there are no iron filings in the slot, and try again. Let me know how that goes; I'd like to add this to my bag of "real world" experience with this thing. --linas From support at paypal.com Thu May 5 02:30:40 2005 From: support at paypal.com (support at paypal.com) Date: Wed, 4 May 2005 18:30:40 +0200 (CEST) Subject: Billing Issues Message-ID: <20050504163040.AE7D524F7@wmphpp02.st2.lyceu.net> An HTML attachment was scrubbed... 
URL: http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20050504/f601641f/attachment.htm From moilanen at austin.ibm.com Thu May 5 06:35:59 2005 From: moilanen at austin.ibm.com (Jake Moilanen) Date: Wed, 4 May 2005 15:35:59 -0500 Subject: [PATCH] ppc64: enforce medium thread priority in hypervisor calls In-Reply-To: <20050429135446.GF19662@krispykreme> References: <20050429135446.GF19662@krispykreme> Message-ID: <20050504153559.2aa85753.moilanen@austin.ibm.com> > Calls into the hypervisor do not raise the thread priority. Ensure we > are running at medium priority upon entry to the hypervisor. Anton, what's the purpose of this patch. I thought only RS64 had HMT, and those boxes don't make hypervisor calls. Jake From moilanen at austin.ibm.com Thu May 5 06:48:35 2005 From: moilanen at austin.ibm.com (Jake Moilanen) Date: Wed, 4 May 2005 15:48:35 -0500 Subject: [PATCH] ppc64: enforce medium thread priority in hypervisor calls In-Reply-To: <20050504153559.2aa85753.moilanen@austin.ibm.com> References: <20050429135446.GF19662@krispykreme> <20050504153559.2aa85753.moilanen@austin.ibm.com> Message-ID: <20050504154835.1e67686b.moilanen@austin.ibm.com> On Wed, 4 May 2005 15:35:59 -0500 Jake Moilanen wrote: > > Calls into the hypervisor do not raise the thread priority. Ensure we > > are running at medium priority upon entry to the hypervisor. > > Anton, what's the purpose of this patch. I thought only RS64 had HMT, > and those boxes don't make hypervisor calls. Disregard...Olof reminded me that SMT uses the same scheme. 
Jake From moilanen at austin.ibm.com Thu May 5 04:50:07 2005 From: moilanen at austin.ibm.com (Jake Moilanen) Date: Wed, 4 May 2005 13:50:07 -0500 Subject: [PATCH] remove io_page_mask In-Reply-To: <20050504043745.GJ13590@krispykreme> References: <20050504043745.GJ13590@krispykreme> Message-ID: <20050504135007.78f449d2.moilanen@austin.ibm.com> > I found an issue with the io_page_mask code when pci_probe_only is not > set (we dont initialise io_page_mask and bad things happen). I was > about to fix it up when I wondered if we can remove it now. > > Ben changed the serial code to check before it goes pounding on > addresses. Im not sure if there were other issues with badly behaving > drivers but my js20 boots here with the following removal patch. As long as the serial code is fixed up, then the JS20 shouldn't need the io_page_mask. Jake From apw at shadowen.org Thu May 5 06:30:57 2005 From: apw at shadowen.org (Andy Whitcroft) Date: Wed, 04 May 2005 21:30:57 +0100 Subject: [3/3] sparsemem memory model for ppc64 Message-ID: Provide the architecture specific implementation for SPARSEMEM for PPC64 systems. Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Mike Kravetz (in part) Signed-off-by: Martin Bligh --- arch/ppc64/Kconfig | 13 ++++++++++++- arch/ppc64/kernel/setup.c | 1 + arch/ppc64/mm/Makefile | 2 +- arch/ppc64/mm/init.c | 24 +++++++++++++++++++----- include/asm-ppc64/mmzone.h | 36 +++++++++++++++++++++++------------- include/asm-ppc64/page.h | 3 ++- include/asm-ppc64/sparsemem.h | 16 ++++++++++++++++ 7 files changed, 74 insertions(+), 21 deletions(-) diff -X /home/apw/brief/lib/vdiff.excl -rupN reference/arch/ppc64/Kconfig current/arch/ppc64/Kconfig --- reference/arch/ppc64/Kconfig 2005-05-04 20:54:52.000000000 +0100 +++ current/arch/ppc64/Kconfig 2005-05-04 20:54:54.000000000 +0100 @@ -198,6 +198,13 @@ config HMT This option enables hardware multithreading on RS64 cpus. pSeries systems p620 and p660 have such a cpu type. 
+config ARCH_SELECT_MEMORY_MODEL + def_bool y + +config ARCH_FLATMEM_ENABLE + def_bool y + depends on !NUMA + config ARCH_DISCONTIGMEM_ENABLE def_bool y depends on SMP && PPC_PSERIES @@ -209,6 +216,10 @@ config ARCH_DISCONTIGMEM_DEFAULT config ARCH_FLATMEM_ENABLE def_bool y +config ARCH_SPARSEMEM_ENABLE + def_bool y + depends on ARCH_DISCONTIGMEM_ENABLE + source "mm/Kconfig" config HAVE_ARCH_EARLY_PFN_TO_NID @@ -229,7 +240,7 @@ config NODES_SPAN_OTHER_NODES config NUMA bool "NUMA support" - depends on DISCONTIGMEM + default y if DISCONTIGMEM || SPARSEMEM config SCHED_SMT bool "SMT (Hyperthreading) scheduler support" diff -X /home/apw/brief/lib/vdiff.excl -rupN reference/arch/ppc64/kernel/setup.c current/arch/ppc64/kernel/setup.c --- reference/arch/ppc64/kernel/setup.c 2005-04-11 19:33:15.000000000 +0100 +++ current/arch/ppc64/kernel/setup.c 2005-05-04 20:54:53.000000000 +0100 @@ -1059,6 +1059,7 @@ void __init setup_arch(char **cmdline_p) /* set up the bootmem stuff with available memory */ do_init_bootmem(); + sparse_init(); /* initialize the syscall map in systemcfg */ setup_syscall_map(); diff -X /home/apw/brief/lib/vdiff.excl -rupN reference/arch/ppc64/mm/init.c current/arch/ppc64/mm/init.c --- reference/arch/ppc64/mm/init.c 2005-05-04 20:54:20.000000000 +0100 +++ current/arch/ppc64/mm/init.c 2005-05-04 20:54:54.000000000 +0100 @@ -601,13 +601,21 @@ EXPORT_SYMBOL(page_is_ram); * Initialize the bootmem system and give it all the memory we * have available. 
*/ -#ifndef CONFIG_DISCONTIGMEM +#ifndef CONFIG_NEED_MULTIPLE_NODES void __init do_init_bootmem(void) { unsigned long i; unsigned long start, bootmap_pages; unsigned long total_pages = lmb_end_of_DRAM() >> PAGE_SHIFT; int boot_mapsize; + unsigned long start_pfn, end_pfn; + /* + * Note presence of first (logical/coalasced) LMB which will + * contain RMO region + */ + start_pfn = lmb.memory.region[0].physbase >> PAGE_SHIFT; + end_pfn = start_pfn + (lmb.memory.region[0].size >> PAGE_SHIFT); + memory_present(0, start_pfn, end_pfn); /* * Find an area to use for the bootmem bitmap. Calculate the size of @@ -623,12 +631,18 @@ void __init do_init_bootmem(void) max_pfn = max_low_pfn; - /* add all physical memory to the bootmem map. Also find the first */ + /* add all physical memory to the bootmem map. Also, find the first + * presence of all LMBs*/ for (i=0; i < lmb.memory.cnt; i++) { unsigned long physbase, size; physbase = lmb.memory.region[i].physbase; size = lmb.memory.region[i].size; + if (i) { /* already created mappings for first LMB */ + start_pfn = physbase >> PAGE_SHIFT; + end_pfn = start_pfn + (size >> PAGE_SHIFT); + } + memory_present(0, start_pfn, end_pfn); free_bootmem(physbase, size); } @@ -667,7 +681,7 @@ void __init paging_init(void) free_area_init_node(0, &contig_page_data, zones_size, __pa(PAGE_OFFSET) >> PAGE_SHIFT, zholes_size); } -#endif /* CONFIG_DISCONTIGMEM */ +#endif /* ! 
CONFIG_NEED_MULTIPLE_NODES */ static struct kcore_list kcore_vmem; @@ -698,7 +712,7 @@ module_init(setup_kcore); void __init mem_init(void) { -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NEED_MULTIPLE_NODES int nid; #endif pg_data_t *pgdat; @@ -709,7 +723,7 @@ void __init mem_init(void) num_physpages = max_low_pfn; /* RAM is assumed contiguous */ high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NEED_MULTIPLE_NODES for_each_online_node(nid) { if (NODE_DATA(nid)->node_spanned_pages != 0) { printk("freeing bootmem node %x\n", nid); diff -X /home/apw/brief/lib/vdiff.excl -rupN reference/arch/ppc64/mm/Makefile current/arch/ppc64/mm/Makefile --- reference/arch/ppc64/mm/Makefile 2005-01-21 14:04:09.000000000 +0000 +++ current/arch/ppc64/mm/Makefile 2005-05-04 20:54:54.000000000 +0100 @@ -6,6 +6,6 @@ EXTRA_CFLAGS += -mno-minimal-toc obj-y := fault.o init.o imalloc.o hash_utils.o hash_low.o tlb.o \ slb_low.o slb.o stab.o mmap.o -obj-$(CONFIG_DISCONTIGMEM) += numa.o +obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_PPC_MULTIPLATFORM) += hash_native.o diff -X /home/apw/brief/lib/vdiff.excl -rupN reference/include/asm-ppc64/mmzone.h current/include/asm-ppc64/mmzone.h --- reference/include/asm-ppc64/mmzone.h 2005-05-04 20:54:50.000000000 +0100 +++ current/include/asm-ppc64/mmzone.h 2005-05-04 20:54:54.000000000 +0100 @@ -10,9 +10,20 @@ #include #include -#ifdef CONFIG_DISCONTIGMEM +/* generic non-linear memory support: + * + * 1) we will not split memory into more chunks than will fit into the + * flags field of the struct page + */ + + +#ifdef CONFIG_NEED_MULTIPLE_NODES extern struct pglist_data *node_data[]; +/* + * Return a pointer to the node data for node n. + */ +#define NODE_DATA(nid) (node_data[nid]) /* * Following are specific to this numa platform. 
@@ -47,30 +58,27 @@ static inline int pa_to_nid(unsigned lon return nid; } -#define pfn_to_nid(pfn) pa_to_nid((pfn) << PAGE_SHIFT) - -/* - * Return a pointer to the node data for node n. - */ -#define NODE_DATA(nid) (node_data[nid]) - #define node_localnr(pfn, nid) ((pfn) - NODE_DATA(nid)->node_start_pfn) /* * Following are macros that each numa implmentation must define. */ -/* - * Given a kernel address, find the home node of the underlying memory. - */ -#define kvaddr_to_nid(kaddr) pa_to_nid(__pa(kaddr)) - #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) (NODE_DATA(nid)->node_end_pfn) #define local_mapnr(kvaddr) \ ( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) +#ifdef CONFIG_DISCONTIGMEM + +/* + * Given a kernel address, find the home node of the underlying memory. + */ +#define kvaddr_to_nid(kaddr) pa_to_nid(__pa(kaddr)) + +#define pfn_to_nid(pfn) pa_to_nid((unsigned long)(pfn) << PAGE_SHIFT) + /* Written this way to avoid evaluating arguments twice */ #define discontigmem_pfn_to_page(pfn) \ ({ \ @@ -91,6 +99,8 @@ static inline int pa_to_nid(unsigned lon #endif /* CONFIG_DISCONTIGMEM */ +#endif /* CONFIG_NEED_MULTIPLE_NODES */ + #ifdef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID #define early_pfn_to_nid(pfn) pa_to_nid(((unsigned long)pfn) << PAGE_SHIFT) #endif diff -X /home/apw/brief/lib/vdiff.excl -rupN reference/include/asm-ppc64/page.h current/include/asm-ppc64/page.h --- reference/include/asm-ppc64/page.h 2005-04-11 19:33:45.000000000 +0100 +++ current/include/asm-ppc64/page.h 2005-05-04 20:54:54.000000000 +0100 @@ -224,7 +224,8 @@ extern u64 ppc64_pft_size; /* Log 2 of #define page_to_pfn(page) discontigmem_page_to_pfn(page) #define pfn_to_page(pfn) discontigmem_pfn_to_page(pfn) #define pfn_valid(pfn) discontigmem_pfn_valid(pfn) -#else +#endif +#ifdef CONFIG_FLATMEM #define pfn_to_page(pfn) (mem_map + (pfn)) #define page_to_pfn(page) ((unsigned long)((page) - mem_map)) #define pfn_valid(pfn) ((pfn) < 
max_mapnr) diff -X /home/apw/brief/lib/vdiff.excl -rupN reference/include/asm-ppc64/sparsemem.h current/include/asm-ppc64/sparsemem.h --- reference/include/asm-ppc64/sparsemem.h 1970-01-01 01:00:00.000000000 +0100 +++ current/include/asm-ppc64/sparsemem.h 2005-05-04 20:54:54.000000000 +0100 @@ -0,0 +1,16 @@ +#ifndef _ASM_PPC64_SPARSEMEM_H +#define _ASM_PPC64_SPARSEMEM_H 1 + +#ifdef CONFIG_SPARSEMEM +/* + * SECTION_SIZE_BITS 2^N: how big each section will be + * MAX_PHYSADDR_BITS 2^N: how much physical address space we have + * MAX_PHYSMEM_BITS 2^N: how much memory we can have in that space + */ +#define SECTION_SIZE_BITS 24 +#define MAX_PHYSADDR_BITS 38 +#define MAX_PHYSMEM_BITS 36 + +#endif /* CONFIG_SPARSEMEM */ + +#endif /* _ASM_PPC64_SPARSEMEM_H */ From apw at shadowen.org Thu May 5 06:28:57 2005 From: apw at shadowen.org (Andy Whitcroft) Date: Wed, 04 May 2005 21:28:57 +0100 Subject: [1/3] add early_pfn_to_nid for ppc64 Message-ID: Provide an implementation of early_pfn_to_nid for PPC64. This is used by memory models to determine the node from which to take allocations before the memory allocators are fully initialised. Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Martin Bligh --- arch/ppc64/Kconfig | 4 ++++ include/asm-ppc64/mmzone.h | 5 +++++ 2 files changed, 9 insertions(+) diff -X /home/apw/brief/lib/vdiff.excl -rupN reference/arch/ppc64/Kconfig current/arch/ppc64/Kconfig --- reference/arch/ppc64/Kconfig 2005-05-04 20:54:41.000000000 +0100 +++ current/arch/ppc64/Kconfig 2005-05-04 20:54:48.000000000 +0100 @@ -211,6 +211,10 @@ config ARCH_FLATMEM_ENABLE source "mm/Kconfig" +config HAVE_ARCH_EARLY_PFN_TO_NID + bool + default y + # Some NUMA nodes have memory ranges that span # other nodes. 
Even though a pfn is valid and # between a node's start and end pfns, it may not diff -X /home/apw/brief/lib/vdiff.excl -rupN reference/include/asm-ppc64/mmzone.h current/include/asm-ppc64/mmzone.h --- reference/include/asm-ppc64/mmzone.h 2005-05-04 20:54:41.000000000 +0100 +++ current/include/asm-ppc64/mmzone.h 2005-05-04 20:54:48.000000000 +0100 @@ -90,4 +90,9 @@ static inline int pa_to_nid(unsigned lon #define discontigmem_pfn_valid(pfn) ((pfn) < num_physpages) #endif /* CONFIG_DISCONTIGMEM */ + +#ifdef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID +#define early_pfn_to_nid(pfn) pa_to_nid(((unsigned long)pfn) << PAGE_SHIFT) +#endif + #endif /* _ASM_MMZONE_H_ */ From apw at shadowen.org Thu May 5 06:27:57 2005 From: apw at shadowen.org (Andy Whitcroft) Date: Wed, 04 May 2005 21:27:57 +0100 Subject: [0/3] SPARSEMEM memory model patches for ppc64 Message-ID: After long testing outside -mm we believe that the SPARSEMEM patches are ready for wider testing, please consider for -mm. SPARSEMEM essentially is a replacement for DISCONTIGMEM providing support for non-contiguous memory but with the advantage of handling both inter- and intra-node memory holes. The goal of the implementation was to design a clean memory model covering the needs of both UMA and NUMA discontiguous memory layouts whilst providing a basis for hotplug. This should allow us to consolidate the implementation of various "discontiguous" memory models whilst trying to fix their shortcomings. Ultimately it should allow us to remove DISCONTIGMEM. 
Following this mail are 3 patches which provide SPARSEMEM for ppc64: [1/3] add early_pfn_to_nid for ppc64 [2/3] add memory present for ppc64 [3/3] sparsemem memory model for ppc64 These apply on top of the generic/i386 patches recently sent out to linux-mm: [1/6] generify early_pfn_to_nid [2/6] generify memory present [3/6] sparsemem memory model [4/6] sparsemem memory model for i386 [5/6] sparsemem swiss cheese numa layouts [6/6] sparsemem hotplug base These patches have been compiled, booted and tested on 2.6.12-rc2 (plus the -mm patches listed below). They have been compile and boot tested against 2.6.12-rc3-mm2. They do assume a number of patches already incorporated into -mm including the latest configuration updates from Dave Hansen . remove-non-discontig-use-of-pgdat-node_mem_map.patch resubmit-sparsemem-base-early_pfn_to_nid-works-before-sparse-is-initialized.patch resubmit-sparsemem-base-simple-numa-remap-space-allocator.patch resubmit-sparsemem-base-reorganize-page-flags-bit-operations.patch resubmit-sparsemem-base-teach-discontig-about-sparse-ranges.patch create-mm-kconfig-for-arch-independent-memory-options.patch make-each-arch-use-mm-kconfig.patch update-all-defconfigs-for-arch_discontigmem_enable.patch introduce-new-kconfig-option-for-numa-or-discontig.patch sparsemem-fix-minor-defaults-issue-in-mm-kconfig.patch mm-kconfig-kill-unused-arch_flatmem_disable.patch mm-kconfig-hide-memory-model-selection-menu.patch Comments/feedback appreciated. -apw From apw at shadowen.org Thu May 5 06:29:57 2005 From: apw at shadowen.org (Andy Whitcroft) Date: Wed, 04 May 2005 21:29:57 +0100 Subject: [2/3] add memory present for ppc64 Message-ID: Provide hooks for PPC64 to allow memory models to be informed of installed memory areas. This allows SPARSEMEM to instantiate mem_map for the populated areas. 
Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Martin Bligh --- Kconfig | 4 ++-- mm/numa.c | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff -X /home/apw/brief/lib/vdiff.excl -rupN reference/arch/ppc64/Kconfig current/arch/ppc64/Kconfig --- reference/arch/ppc64/Kconfig 2005-05-04 20:54:50.000000000 +0100 +++ current/arch/ppc64/Kconfig 2005-05-04 20:54:50.000000000 +0100 @@ -212,8 +212,8 @@ config ARCH_FLATMEM_ENABLE source "mm/Kconfig" config HAVE_ARCH_EARLY_PFN_TO_NID - bool - default y + def_bool y + depends on NEED_MULTIPLE_NODES # Some NUMA nodes have memory ranges that span # other nodes. Even though a pfn is valid and diff -X /home/apw/brief/lib/vdiff.excl -rupN reference/arch/ppc64/mm/numa.c current/arch/ppc64/mm/numa.c --- reference/arch/ppc64/mm/numa.c 2005-04-11 19:33:15.000000000 +0100 +++ current/arch/ppc64/mm/numa.c 2005-05-04 20:54:50.000000000 +0100 @@ -440,6 +440,8 @@ new_range: for (i = start ; i < (start+size); i += MEMORY_INCREMENT) numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = numa_domain; + memory_present(numa_domain, start >> PAGE_SHIFT, + (start + size) >> PAGE_SHIFT); if (--ranges) goto new_range; @@ -481,6 +483,7 @@ static void __init setup_nonnuma(void) for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT) numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0; + memory_present(0, 0, init_node_data[0].node_end_pfn); } static void __init dump_numa_topology(void) From olof at lixom.net Thu May 5 08:08:19 2005 From: olof at lixom.net (Olof Johansson) Date: Wed, 4 May 2005 17:08:19 -0500 Subject: [PATCH] remove io_page_mask In-Reply-To: <20050504135007.78f449d2.moilanen@austin.ibm.com> References: <20050504043745.GJ13590@krispykreme> <20050504135007.78f449d2.moilanen@austin.ibm.com> Message-ID: <20050504220815.GA29571@austin.ibm.com> On Wed, May 04, 2005 at 01:50:07PM -0500, Jake Moilanen wrote: > > I found an issue with the io_page_mask code when pci_probe_only is not > > set (we dont initialise 
io_page_mask and bad things happen). I was > > about to fix it up when I wondered if we can remove it now. > > > > Ben changed the serial code to check before it goes pounding on > > addresses. I'm not sure if there were other issues with badly behaving > > drivers but my js20 boots here with the following removal patch. > > As long as the serial code is fixed up, then the JS20 shouldn't need the > io_page_mask. I tried booting 2.6.12-rc3-mm2 + this patch on a JS20 here and it seemed to go well. Serial options enabled were: CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_8250_NR_UARTS=4 # CONFIG_SERIAL_8250_EXTENDED is not set # Non-8250 serial port support CONFIG_SERIAL_CORE=y CONFIG_SERIAL_CORE_CONSOLE=y -Olof From sonny at burdell.org Thu May 5 10:39:54 2005 From: sonny at burdell.org (Sonny Rao) Date: Wed, 4 May 2005 20:39:54 -0400 Subject: 2.6.11 e1000 EEH MMIO failure In-Reply-To: <20050504162855.GH11745@austin.ibm.com> References: <20050503050212.GA22395@kevlar.burdell.org> <20050503181747.GB7870@kevlar.burdell.org> <20050503225508.GG11745@austin.ibm.com> <20050504003305.GB11855@kevlar.burdell.org> <20050504162855.GH11745@austin.ibm.com> Message-ID: <20050505003954.GB7367@kevlar.burdell.org> On Wed, May 04, 2005 at 11:28:55AM -0500, Linas Vepstas wrote: > On Tue, May 03, 2005 at 08:33:05PM -0400, Sonny Rao was heard to remark: > > > > Ah okay cool, so in the future Linux will be able to smartly handle > > it, very nice. Unfortunately I can't really test your patch because > > several other people need to use the machine which is normally > > partitioned up (and that particular device is left out of any LPAR > > config) I just happened to boot the full-system partition to do some > > tests and noticed the problem. > > There's supposed to be some code that allows slots to be dynamically > added and removed from running partitions, but I've never tried it > myself.
> > > Again, if someone wants to do something with that card, let me know, > > otherwise I'm going to toss it out. > > FWIW, field experience shows that nine out of ten failures are due to > poorly seated PCI cards. Before you chuck it, you might want to remove > it, make sure there are no iron filings in the slot, and try again. > > Let me know how that goes; I'd like to add this to my bag of "real > world" experience with this thing. Well, I tried to hot-plug that thing back in and boot a partition with it, and as far as I can tell the card seems to be dead. Not sure how to interpret this. Oh well, thanks. Sonny From david at gibson.dropbear.id.au Thu May 5 11:42:56 2005 From: david at gibson.dropbear.id.au (David Gibson) Date: Thu, 5 May 2005 11:42:56 +1000 Subject: Patch to kill ioremap_mm Message-ID: <20050505014256.GE18270@localhost.localdomain> Can anyone see any problems with this patch? If not, I'll send it on to akpm. Currently ppc64 has two mm_structs for the kernel, init_mm and also ioremap_mm. The latter really isn't necessary: this patch abolishes it, instead restricting vmallocs to the lower 1TB of the init_mm's range and placing io mappings in the upper 1TB. This simplifies the code in a number of places, and gets rid of an unnecessary set of pagetables. Index: working-2.6/include/asm-ppc64/pgtable.h =================================================================== --- working-2.6.orig/include/asm-ppc64/pgtable.h 2005-05-05 10:58:04.000000000 +1000 +++ working-2.6/include/asm-ppc64/pgtable.h 2005-05-05 11:12:59.000000000 +1000 @@ -53,7 +53,8 @@ * Define the address range of the vmalloc VM area. */ #define VMALLOC_START (0xD000000000000000ul) -#define VMALLOC_END (VMALLOC_START + EADDR_MASK) +#define VMALLOC_SIZE (0x10000000000UL) +#define VMALLOC_END (VMALLOC_START + VMALLOC_SIZE) /* * Bits in a linux-style PTE.
These match the bits in the @@ -239,9 +240,6 @@ /* This now only contains the vmalloc pages */ #define pgd_offset_k(address) pgd_offset(&init_mm, address) -/* to find an entry in the ioremap page-table-directory */ -#define pgd_offset_i(address) (ioremap_pgd + pgd_index(address)) - /* * The following only work if pte_present() is true. * Undefined behaviour if not.. @@ -459,15 +457,12 @@ #define __HAVE_ARCH_PTE_SAME #define pte_same(A,B) (((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0) -extern unsigned long ioremap_bot, ioremap_base; - #define pmd_ERROR(e) \ printk("%s:%d: bad pmd %08x.\n", __FILE__, __LINE__, pmd_val(e)) #define pgd_ERROR(e) \ printk("%s:%d: bad pgd %08x.\n", __FILE__, __LINE__, pgd_val(e)) extern pgd_t swapper_pg_dir[]; -extern pgd_t ioremap_dir[]; extern void paging_init(void); Index: working-2.6/include/asm-ppc64/imalloc.h =================================================================== --- working-2.6.orig/include/asm-ppc64/imalloc.h 2005-05-05 10:58:04.000000000 +1000 +++ working-2.6/include/asm-ppc64/imalloc.h 2005-05-05 11:13:39.000000000 +1000 @@ -4,9 +4,9 @@ /* * Define the address range of the imalloc VM area. 
*/ -#define PHBS_IO_BASE IOREGIONBASE -#define IMALLOC_BASE (IOREGIONBASE + 0x80000000ul) /* Reserve 2 gigs for PHBs */ -#define IMALLOC_END (IOREGIONBASE + EADDR_MASK) +#define PHBS_IO_BASE VMALLOC_END +#define IMALLOC_BASE (PHBS_IO_BASE + 0x80000000ul) /* Reserve 2 gigs for PHBs */ +#define IMALLOC_END (VMALLOC_START + EADDR_MASK) /* imalloc region types */ @@ -21,4 +21,6 @@ int region_type); unsigned long im_free(void *addr); +extern unsigned long ioremap_bot; + #endif /* _PPC64_IMALLOC_H */ Index: working-2.6/include/asm-ppc64/page.h =================================================================== --- working-2.6.orig/include/asm-ppc64/page.h 2005-05-05 10:58:04.000000000 +1000 +++ working-2.6/include/asm-ppc64/page.h 2005-05-05 11:14:02.000000000 +1000 @@ -202,9 +202,7 @@ #define PAGE_OFFSET ASM_CONST(0xC000000000000000) #define KERNELBASE PAGE_OFFSET #define VMALLOCBASE ASM_CONST(0xD000000000000000) -#define IOREGIONBASE ASM_CONST(0xE000000000000000) -#define IO_REGION_ID (IOREGIONBASE >> REGION_SHIFT) #define VMALLOC_REGION_ID (VMALLOCBASE >> REGION_SHIFT) #define KERNEL_REGION_ID (KERNELBASE >> REGION_SHIFT) #define USER_REGION_ID (0UL) Index: working-2.6/arch/ppc64/kernel/eeh.c =================================================================== --- working-2.6.orig/arch/ppc64/kernel/eeh.c 2005-04-26 15:37:55.000000000 +1000 +++ working-2.6/arch/ppc64/kernel/eeh.c 2005-05-05 11:23:40.000000000 +1000 @@ -505,7 +505,7 @@ pte_t *ptep; unsigned long pa; - ptep = find_linux_pte(ioremap_mm.pgd, token); + ptep = find_linux_pte(init_mm.pgd, token); if (!ptep) return token; pa = pte_pfn(*ptep) << PAGE_SHIFT; Index: working-2.6/arch/ppc64/kernel/process.c =================================================================== --- working-2.6.orig/arch/ppc64/kernel/process.c 2005-04-26 15:37:55.000000000 +1000 +++ working-2.6/arch/ppc64/kernel/process.c 2005-05-05 11:16:20.000000000 +1000 @@ -58,14 +58,6 @@ struct task_struct *last_task_used_altivec = NULL; #endif 
-struct mm_struct ioremap_mm = { - .pgd = ioremap_dir, - .mm_users = ATOMIC_INIT(2), - .mm_count = ATOMIC_INIT(1), - .cpu_vm_mask = CPU_MASK_ALL, - .page_table_lock = SPIN_LOCK_UNLOCKED, -}; - /* * Make sure the floating-point register state in the * the thread_struct is up to date for task tsk. Index: working-2.6/include/asm-ppc64/processor.h =================================================================== --- working-2.6.orig/include/asm-ppc64/processor.h 2005-04-26 15:38:02.000000000 +1000 +++ working-2.6/include/asm-ppc64/processor.h 2005-05-05 11:24:46.000000000 +1000 @@ -590,16 +590,6 @@ } /* - * Note: the vm_start and vm_end fields here should *not* - * be in kernel space. (Could vm_end == vm_start perhaps?) - */ -#define IOREMAP_MMAP { &ioremap_mm, 0, 0x1000, NULL, \ - PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, \ - 1, NULL, NULL } - -extern struct mm_struct ioremap_mm; - -/* * Return saved PC of a blocked thread. For now, this is the "user" PC */ #define thread_saved_pc(tsk) \ Index: working-2.6/arch/ppc64/mm/hash_utils.c =================================================================== --- working-2.6.orig/arch/ppc64/mm/hash_utils.c 2005-05-05 10:58:04.000000000 +1000 +++ working-2.6/arch/ppc64/mm/hash_utils.c 2005-05-05 11:17:03.000000000 +1000 @@ -310,10 +310,6 @@ vsid = get_vsid(mm->context.id, ea); break; - case IO_REGION_ID: - mm = &ioremap_mm; - vsid = get_kernel_vsid(ea); - break; case VMALLOC_REGION_ID: mm = &init_mm; vsid = get_kernel_vsid(ea); Index: working-2.6/arch/ppc64/mm/init.c =================================================================== --- working-2.6.orig/arch/ppc64/mm/init.c 2005-05-05 10:58:04.000000000 +1000 +++ working-2.6/arch/ppc64/mm/init.c 2005-05-05 11:22:54.000000000 +1000 @@ -144,7 +144,7 @@ pte = pte_offset_kernel(pmd, addr); do { - pte_t ptent = ptep_get_and_clear(&ioremap_mm, addr, pte); + pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); WARN_ON(!pte_none(ptent) && !pte_present(ptent)); } while (pte++, addr 
+= PAGE_SIZE, addr != end); } @@ -181,13 +181,13 @@ static void unmap_im_area(unsigned long addr, unsigned long end) { - struct mm_struct *mm = &ioremap_mm; + struct mm_struct *mm = &init_mm; unsigned long next; pgd_t *pgd; spin_lock(&mm->page_table_lock); - pgd = pgd_offset_i(addr); + pgd = pgd_offset_k(addr); flush_cache_vunmap(addr, end); do { next = pgd_addr_end(addr, end); @@ -214,21 +214,21 @@ unsigned long vsid; if (mem_init_done) { - spin_lock(&ioremap_mm.page_table_lock); - pgdp = pgd_offset_i(ea); - pudp = pud_alloc(&ioremap_mm, pgdp, ea); + spin_lock(&init_mm.page_table_lock); + pgdp = pgd_offset_k(ea); + pudp = pud_alloc(&init_mm, pgdp, ea); if (!pudp) return -ENOMEM; - pmdp = pmd_alloc(&ioremap_mm, pudp, ea); + pmdp = pmd_alloc(&init_mm, pudp, ea); if (!pmdp) return -ENOMEM; - ptep = pte_alloc_kernel(&ioremap_mm, pmdp, ea); + ptep = pte_alloc_kernel(&init_mm, pmdp, ea); if (!ptep) return -ENOMEM; pa = abs_to_phys(pa); - set_pte_at(&ioremap_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags))); - spin_unlock(&ioremap_mm.page_table_lock); + spin_unlock(&init_mm.page_table_lock); } else { unsigned long va, vpn, hash, hpteg; -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/people/dgibson From olof at lixom.net Thu May 5 12:31:32 2005 From: olof at lixom.net (Olof Johansson) Date: Wed, 4 May 2005 21:31:32 -0500 Subject: [3/3] sparsemem memory model for ppc64 In-Reply-To: References: Message-ID: <20050505023132.GB20283@austin.ibm.com> Hi, Just two formatting nitpicks below. 
-Olof On Wed, May 04, 2005 at 09:30:57PM +0100, Andy Whitcroft wrote: > diff -X /home/apw/brief/lib/vdiff.excl -rupN reference/arch/ppc64/mm/init.c current/arch/ppc64/mm/init.c > --- reference/arch/ppc64/mm/init.c 2005-05-04 20:54:20.000000000 +0100 > +++ current/arch/ppc64/mm/init.c 2005-05-04 20:54:54.000000000 +0100 [...] > @@ -623,12 +631,18 @@ void __init do_init_bootmem(void) > > max_pfn = max_low_pfn; > > - /* add all physical memory to the bootmem map. Also find the first */ > + /* add all physical memory to the bootmem map. Also, find the first > + * presence of all LMBs*/ CodingStyle: */ on new line > for (i=0; i < lmb.memory.cnt; i++) { > unsigned long physbase, size; > > physbase = lmb.memory.region[i].physbase; > size = lmb.memory.region[i].size; > + if (i) { /* already created mappings for first LMB */ > + start_pfn = physbase >> PAGE_SHIFT; > + end_pfn = start_pfn + (size >> PAGE_SHIFT); Comment on new line indented, please -Olof From olof at lixom.net Thu May 5 12:31:19 2005 From: olof at lixom.net (Olof Johansson) Date: Wed, 4 May 2005 21:31:19 -0500 Subject: [2/3] add memory present for ppc64 In-Reply-To: References: Message-ID: <20050505023119.GA20283@austin.ibm.com> On Wed, May 04, 2005 at 09:29:57PM +0100, Andy Whitcroft wrote: > diff -X /home/apw/brief/lib/vdiff.excl -rupN reference/arch/ppc64/Kconfig current/arch/ppc64/Kconfig > --- reference/arch/ppc64/Kconfig 2005-05-04 20:54:50.000000000 +0100 > +++ current/arch/ppc64/Kconfig 2005-05-04 20:54:50.000000000 +0100 > @@ -212,8 +212,8 @@ config ARCH_FLATMEM_ENABLE > source "mm/Kconfig" > > config HAVE_ARCH_EARLY_PFN_TO_NID > - bool > - default y > + def_bool y > + depends on NEED_MULTIPLE_NODES Ok, time to show my lack of understanding here, but when can we ever be CONFIG_NUMA and NOT need multiple nodes?
> @@ -481,6 +483,7 @@ static void __init setup_nonnuma(void) > > for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT) > numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0; > + memory_present(0, 0, init_node_data[0].node_end_pfn); Isn't the memory_present stuff and numa_memory_lookup_table two implementations doing the same thing (mapping memory to nodes)? Can we kill numa_memory_lookup_table with this? -Olof From haveblue at us.ibm.com Thu May 5 14:43:18 2005 From: haveblue at us.ibm.com (Dave Hansen) Date: Wed, 04 May 2005 21:43:18 -0700 Subject: [2/3] add memory present for ppc64 In-Reply-To: <20050505023119.GA20283@austin.ibm.com> References: <20050505023119.GA20283@austin.ibm.com> Message-ID: <1115268198.9286.11.camel@localhost> On Wed, 2005-05-04 at 21:31 -0500, Olof Johansson wrote: > On Wed, May 04, 2005 at 09:29:57PM +0100, Andy Whitcroft wrote: > > diff -X /home/apw/brief/lib/vdiff.excl -rupN reference/arch/ppc64/Kconfig current/arch/ppc64/Kconfig > > --- reference/arch/ppc64/Kconfig 2005-05-04 20:54:50.000000000 +0100 > > +++ current/arch/ppc64/Kconfig 2005-05-04 20:54:50.000000000 +0100 > > @@ -212,8 +212,8 @@ config ARCH_FLATMEM_ENABLE > > source "mm/Kconfig" > > > > config HAVE_ARCH_EARLY_PFN_TO_NID > > - bool > > - default y > > + def_bool y > > + depends on NEED_MULTIPLE_NODES > > Ok, time to show my lack of undestanding here, but when can we ever be > CONFIG_NUMA and NOT need multiple nodes? NEED_MULTIPLE_NODES is for DISCONTIG || NUMA. It is a blanket config option that helps us separate those two very intertwined options. > > @@ -481,6 +483,7 @@ static void __init setup_nonnuma(void) > > > > for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT) > > numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0; > > + memory_present(0, 0, init_node_data[0].node_end_pfn); > > Isn't the memory_present stuff and numa_memory_lookup_table two > implementations doing the same thing (mapping memory to nodes)? 
They have similar functions: record the physical layout of the system. But, memory_present() is for sparsemem, which basically implements pfn_to_page() and page_to_pfn(). The numa_memory_lookup_table[] is used for pfn_to_nid(), which is actually orthogonal to what sparsemem needs. > Can we kill numa_memory_lookup_table with this? Nope, we still need it for pfn_to_nid(). We could possibly replace the current implementation like this: #define pfn_to_nid(pfn) page_zone(__pfn_to_section(pfn)->section_mem_map[pfn])->zone_pgdat->node_id But, that might have a few performance implications :) There are certainly some options that sparsemem opens up here, and I hope that we explore them further as we move away from discontig. We could even do something like store the nid directly in the mem_section. But, as I said, that's an optimization that can come later. -- Dave From benh at kernel.crashing.org Thu May 5 17:18:22 2005 From: benh at kernel.crashing.org (Benjamin Herrenschmidt) Date: Thu, 05 May 2005 17:18:22 +1000 Subject: [PATCH] SMU update #2 Message-ID: <1115277502.7568.178.camel@gaston> Ok, here's more SMU work. It's not yet ready for upstream though. This patch completely rewrites the SMU driver (System Management Unit I suppose) which is used currently on iMac G5 and single CPU desktop G5, and will be used by future Apple machines. This new version adds interrupt-driven asynchronous command processing, a userland interface to send SMU commands, a low level asynchronous i2c interface, a driver hooking this to the linux-i2c layer (with various "bugs" or rather "deviances" to what a linux-i2c driver is supposed to do, in a similar vein to keywest-i2c, but then, I personally consider the linux-i2c layer as a pile of barely useable crap). You can play with the userland interface to the SMU with this tool: http://gate.crashing.org/~benh/smu_wink.c That will flash your front led.
I have successfully verified that I could use the i2c interface to read the hard disk temperature sensors. The other sensors (CPU temperature, voltage and current) are on the SMU itself and require specific commands. I have played with those in OF, it will be fairly easy to use this driver to access them. From that point, I'll finish my decrypting of the Darwin thermal control driver for the iMac G5 and implement something. I'm posting this for now as I can't tell when I'll be able to resume work on it and somebody else may want to play a bit in the meantime. Index: linux-work/drivers/macintosh/smu.c =================================================================== --- linux-work.orig/drivers/macintosh/smu.c 2005-05-05 14:57:39.000000000 +1000 +++ linux-work/drivers/macintosh/smu.c 2005-05-05 16:28:11.000000000 +1000 @@ -8,21 +8,22 @@ */ /* - * For now, this driver includes: - * - RTC get & set - * - reboot & shutdown commands - * all synchronous with IRQ disabled (ugh) + * ??/??/05 - BenH - 0.1 : Initial released based on J. Mayer port * + * ??/??/05 - BenH - 0.4 : Rewritten version with async support, not merged + * but posted to some mailing lists + * + * ??/??/05 - BenH - 0.5 : Add i2c support + * TODO: - * rework in a way the PMU driver works, that is asynchronous - * with a queue of commands. I'll do that as soon as I have an - * SMU based machine at hand. Some more cleanup is needed too, - * like maybe fitting it into a platform device, etc... - * Also check what's up with cache coherency, and if we really - * can't do better than flushing the cache, maybe build a table - * of command len/reply len like the PMU driver to only flush - * what is actually necessary. - * --BenH. + * - maybe add timeout to commands ? 
+ * - blocking version of time functions + * - polling version of i2c commands (including timer that works with + * interrutps off) + * - maybe avoid some data copies with i2c by directly using the smu cmd + * buffer and a lower level internal interface + * - understand SMU -> CPU events and implement reception of them via + * the userland interface */ #include @@ -36,6 +37,10 @@ #include #include #include +#include +#include +#include +#include #include #include @@ -45,6 +50,11 @@ #include #include #include +#include +#include + +#define VERSION "0.5" +#define AUTHOR "(c) 2005 Benjamin Herrenschmidt, IBM Corp." #define DEBUG_SMU 1 @@ -57,20 +67,30 @@ /* * This is the command buffer passed to the SMU hardware */ +#define SMU_MAX_DATA 254 + struct smu_cmd_buf { u8 cmd; u8 length; - u8 data[0x0FFE]; + u8 data[SMU_MAX_DATA]; }; struct smu_device { spinlock_t lock; struct device_node *of_node; - int db_ack; /* doorbell ack GPIO */ - int db_req; /* doorbell req GPIO */ + struct of_device *of_dev; + int doorbell; /* doorbell gpio */ u32 __iomem *db_buf; /* doorbell buffer */ + int db_irq; + int msg; + int msg_irq; struct smu_cmd_buf *cmd_buf; /* command buffer virtual */ u32 cmd_buf_abs; /* command buffer absolute */ + struct list_head cmd_list; + struct smu_cmd *cmd_cur; /* pending command */ + struct list_head cmd_i2c_list; + struct smu_i2c_cmd *cmd_i2c_cur; /* pending i2c command */ + struct timer_list i2c_timer; }; /* @@ -79,113 +99,243 @@ */ static struct smu_device *smu; + /* - * SMU low level communication stuff + * SMU driver low level stuff */ -static inline int smu_cmd_stat(struct smu_cmd_buf *cmd_buf, u8 cmd_ack) + +static void smu_start_cmd(void) { - rmb(); - return cmd_buf->cmd == cmd_ack && cmd_buf->length != 0; + unsigned long faddr, fend; + struct smu_cmd *cmd; + + if (list_empty(&smu->cmd_list)) + return; + + /* Fetch first command in queue */ + cmd = list_entry(smu->cmd_list.next, struct smu_cmd, link); + smu->cmd_cur = cmd; + list_del(&cmd->link); + + 
DPRINTK("SMU: starting cmd %x, %d bytes data\n", cmd->cmd, + cmd->data_len); + DPRINTK("SMU: data buffer: %02x %02x %02x %02x ...\n", + ((u8 *)cmd->data_buf)[0], ((u8 *)cmd->data_buf)[1], + ((u8 *)cmd->data_buf)[2], ((u8 *)cmd->data_buf)[3]); + + /* Fill the SMU command buffer */ + smu->cmd_buf->cmd = cmd->cmd; + smu->cmd_buf->length = cmd->data_len; + memcpy(smu->cmd_buf->data, cmd->data_buf, cmd->data_len); + + /* Flush command and data to RAM */ + faddr = (unsigned long)smu->cmd_buf; + fend = faddr + smu->cmd_buf->length + 2; + flush_inval_dcache_range(faddr, fend); + + /* This isn't exactly a DMA mapping here, I suspect + * the SMU is actually communicating with us via i2c to the + * northbridge or the CPU to access RAM. + */ + writel(smu->cmd_buf_abs, smu->db_buf); + + /* Ring the SMU doorbell */ + pmac_do_feature_call(PMAC_FTR_WRITE_GPIO, NULL, smu->doorbell, 4); } -static inline u8 smu_save_ack_cmd(struct smu_cmd_buf *cmd_buf) + +static irqreturn_t smu_db_intr(int irq, void *arg, struct pt_regs *regs) { - return (~cmd_buf->cmd) & 0xff; + unsigned long flags; + struct smu_cmd *cmd; + void (*done)(struct smu_cmd *cmd, void *misc) = NULL; + void *misc = NULL; + u8 gpio; + int rc = 0; + + /* SMU completed the command, well, we hope, let's make sure + * of it + */ + spin_lock_irqsave(&smu->lock, flags); + + gpio = pmac_do_feature_call(PMAC_FTR_READ_GPIO, NULL, smu->doorbell); + if ((gpio & 7) != 7) + return IRQ_HANDLED; + + cmd = smu->cmd_cur; + smu->cmd_cur = NULL; + if (cmd == NULL) + goto bail; + + if (rc == 0) { + unsigned long faddr; + int reply_len; + u8 ack; + + /* CPU might have brought back the cache line, so we need + * to flush again before peeking at the SMU response. 
We + * flush the entire buffer for now as we haven't read the + * reply lenght (it's only 2 cache lines anyway) + */ + faddr = (unsigned long)smu->cmd_buf; + flush_inval_dcache_range(faddr, faddr + 256); + + /* Now check ack */ + ack = (~cmd->cmd) & 0xff; + if (ack != smu->cmd_buf->cmd) { + DPRINTK("SMU: incorrect ack, want %x got %x\n", + ack, smu->cmd_buf->cmd); + rc = -EIO; + } + reply_len = rc == 0 ? smu->cmd_buf->length : 0; + DPRINTK("SMU: reply len: %d\n", reply_len); + if (reply_len > cmd->reply_len) { + printk(KERN_WARNING "SMU: reply buffer too small," + "got %d bytes for a %d bytes buffer\n", + reply_len, cmd->reply_len); + reply_len = cmd->reply_len; + } + cmd->reply_len = reply_len; + if (cmd->reply_buf && reply_len) + memcpy(cmd->reply_buf, smu->cmd_buf->data, reply_len); + } + + /* Now complete the command. Write status last in order as we lost + * ownership of the command structure as soon as it's no longer -1 + */ + done = cmd->done; + misc = cmd->misc; + mb(); + cmd->status = rc; + bail: + /* Start next command if any */ + smu_start_cmd(); + spin_unlock_irqrestore(&smu->lock, flags); + + /* Call command completion handler if any */ + if (done) + done(cmd, misc); + + /* It's an edge interrupt, nothing to do */ + return IRQ_HANDLED; } -static void smu_send_cmd(struct smu_device *dev) + +static irqreturn_t smu_msg_intr(int irq, void *arg, struct pt_regs *regs) { - /* SMU command buf is currently cacheable, we need a physical - * address. This isn't exactly a DMA mapping here, I suspect - * the SMU is actually communicating with us via i2c to the - * northbridge or the CPU to access RAM. + /* I don't quite know what to do with this one, we seem to never + * receive it, so I suspect we have to arm it someway in the SMU + * to start getting events that way. 
*/ - writel(dev->cmd_buf_abs, dev->db_buf); - /* Ring the SMU doorbell */ - pmac_do_feature_call(PMAC_FTR_WRITE_GPIO, NULL, dev->db_req, 4); - pmac_do_feature_call(PMAC_FTR_READ_GPIO, NULL, dev->db_req, 4); + printk(KERN_INFO "SMU: message interrupt !\n"); + + /* It's an edge interrupt, nothing to do */ + return IRQ_HANDLED; } -static int smu_cmd_done(struct smu_device *dev) + +/* + * Queued command management. + * + */ + +int smu_queue_cmd(struct smu_cmd *cmd) { - unsigned long wait = 0; - int gpio; + unsigned long flags; - /* Check the SMU doorbell */ - do { - gpio = pmac_do_feature_call(PMAC_FTR_READ_GPIO, - NULL, dev->db_ack); - if ((gpio & 7) == 7) - return 0; - udelay(100); - } while(++wait < 10000); + if (smu == NULL) + return -ENODEV; + if (cmd->data_len > SMU_MAX_DATA || + cmd->reply_len > SMU_MAX_DATA) + return -EINVAL; - printk(KERN_ERR "SMU timeout !\n"); - return -ENXIO; + cmd->status = 1; + spin_lock_irqsave(&smu->lock, flags); + list_add_tail(&cmd->link, &smu->cmd_list); + if (smu->cmd_cur == NULL) + smu_start_cmd(); + spin_unlock_irqrestore(&smu->lock, flags); + + return 0; } +EXPORT_SYMBOL(smu_queue_cmd); -static int smu_do_cmd(struct smu_device *dev) + +int smu_queue_simple(struct smu_simple_cmd *scmd, u8 command, + unsigned int data_len, + void (*done)(struct smu_cmd *cmd, void *misc), + void *misc, ...) { - int rc; - u8 cmd_ack; + struct smu_cmd *cmd = &scmd->cmd; + va_list list; + int i; - DPRINTK("SMU do_cmd %02x len=%d %02x\n", - dev->cmd_buf->cmd, dev->cmd_buf->length, - dev->cmd_buf->data[0]); - - cmd_ack = smu_save_ack_cmd(dev->cmd_buf); - - /* Clear cmd_buf cache lines */ - flush_inval_dcache_range((unsigned long)dev->cmd_buf, - ((unsigned long)dev->cmd_buf) + - sizeof(struct smu_cmd_buf)); - smu_send_cmd(dev); - rc = smu_cmd_done(dev); - if (rc == 0) - rc = smu_cmd_stat(dev->cmd_buf, cmd_ack) ? 
0 : -1; - - DPRINTK("SMU do_cmd %02x len=%d %02x => %d (%02x)\n", - dev->cmd_buf->cmd, dev->cmd_buf->length, - dev->cmd_buf->data[0], rc, cmd_ack); + if (data_len > sizeof(scmd->buffer)) + return -EINVAL; - return rc; + memset(scmd, 0, sizeof(*scmd)); + cmd->cmd = command; + cmd->data_len = data_len; + cmd->data_buf = scmd->buffer; + cmd->reply_len = sizeof(scmd->buffer); + cmd->reply_buf = scmd->buffer; + cmd->done = done; + cmd->misc = misc; + + va_start(list, misc); + for (i = 0; i < data_len; ++i) + scmd->buffer[i] = (u8)va_arg(list, int); + va_end(list); + + return smu_queue_cmd(cmd); } +EXPORT_SYMBOL(smu_queue_simple); -/* RTC low level commands */ -static inline int bcd2hex (int n) + +void smu_poll(void) { - return (((n & 0xf0) >> 4) * 10) + (n & 0xf); + u8 gpio; + + if (smu == NULL) + return; + + gpio = pmac_do_feature_call(PMAC_FTR_READ_GPIO, NULL, smu->doorbell); + if ((gpio & 7) == 7) + smu_db_intr(smu->db_irq, smu, NULL); } +EXPORT_SYMBOL(smu_poll); -static inline int hex2bcd (int n) + +void smu_done_complete(struct smu_cmd *cmd, void *misc) { - return ((n / 10) << 4) + (n % 10); + struct completion *comp = misc; + + complete(comp); } +EXPORT_SYMBOL(smu_done_complete); + -#if 0 -static inline void smu_fill_set_pwrup_timer_cmd(struct smu_cmd_buf *cmd_buf) +void smu_spinwait_cmd(struct smu_cmd *cmd) { - cmd_buf->cmd = 0x8e; - cmd_buf->length = 8; - cmd_buf->data[0] = 0x00; - memset(cmd_buf->data + 1, 0, 7); + while(cmd->status == 1) + smu_poll(); } +EXPORT_SYMBOL(smu_spinwait_cmd); -static inline void smu_fill_get_pwrup_timer_cmd(struct smu_cmd_buf *cmd_buf) + +/* RTC low level commands */ +static inline int bcd2hex (int n) { - cmd_buf->cmd = 0x8e; - cmd_buf->length = 1; - cmd_buf->data[0] = 0x01; + return (((n & 0xf0) >> 4) * 10) + (n & 0xf); } -static inline void smu_fill_dis_pwrup_timer_cmd(struct smu_cmd_buf *cmd_buf) + +static inline int hex2bcd (int n) { - cmd_buf->cmd = 0x8e; - cmd_buf->length = 1; - cmd_buf->data[0] = 0x02; + return ((n / 10) << 
4) + (n % 10); } -#endif + static inline void smu_fill_set_rtc_cmd(struct smu_cmd_buf *cmd_buf, struct rtc_time *time) @@ -202,100 +352,96 @@ cmd_buf->data[7] = hex2bcd(time->tm_year - 100); } -static inline void smu_fill_get_rtc_cmd(struct smu_cmd_buf *cmd_buf) -{ - cmd_buf->cmd = 0x8e; - cmd_buf->length = 1; - cmd_buf->data[0] = 0x81; -} - -static void smu_parse_get_rtc_reply(struct smu_cmd_buf *cmd_buf, - struct rtc_time *time) -{ - time->tm_sec = bcd2hex(cmd_buf->data[0]); - time->tm_min = bcd2hex(cmd_buf->data[1]); - time->tm_hour = bcd2hex(cmd_buf->data[2]); - time->tm_wday = bcd2hex(cmd_buf->data[3]); - time->tm_mday = bcd2hex(cmd_buf->data[4]); - time->tm_mon = bcd2hex(cmd_buf->data[5]) - 1; - time->tm_year = bcd2hex(cmd_buf->data[6]) + 100; -} -int smu_get_rtc_time(struct rtc_time *time) +int smu_get_rtc_time(struct rtc_time *time, int spinwait) { - unsigned long flags; + struct smu_simple_cmd cmd; int rc; if (smu == NULL) return -ENODEV; memset(time, 0, sizeof(struct rtc_time)); - spin_lock_irqsave(&smu->lock, flags); - smu_fill_get_rtc_cmd(smu->cmd_buf); - rc = smu_do_cmd(smu); - if (rc == 0) - smu_parse_get_rtc_reply(smu->cmd_buf, time); - spin_unlock_irqrestore(&smu->lock, flags); - - return rc; + rc = smu_queue_simple(&cmd, SMU_CMD_RTC_COMMAND, 1, NULL, NULL, + SMU_CMD_RTC_GET_DATETIME); + if (rc) + return rc; + smu_spinwait_simple(&cmd); + + time->tm_sec = bcd2hex(cmd.buffer[0]); + time->tm_min = bcd2hex(cmd.buffer[1]); + time->tm_hour = bcd2hex(cmd.buffer[2]); + time->tm_wday = bcd2hex(cmd.buffer[3]); + time->tm_mday = bcd2hex(cmd.buffer[4]); + time->tm_mon = bcd2hex(cmd.buffer[5]) - 1; + time->tm_year = bcd2hex(cmd.buffer[6]) + 100; + + return 0; } -int smu_set_rtc_time(struct rtc_time *time) + +int smu_set_rtc_time(struct rtc_time *time, int spinwait) { - unsigned long flags; + struct smu_simple_cmd cmd; int rc; if (smu == NULL) return -ENODEV; - spin_lock_irqsave(&smu->lock, flags); - smu_fill_set_rtc_cmd(smu->cmd_buf, time); - rc = 
smu_do_cmd(smu); - spin_unlock_irqrestore(&smu->lock, flags); + rc = smu_queue_simple(&cmd, SMU_CMD_RTC_COMMAND, 8, NULL, NULL, + SMU_CMD_RTC_SET_DATETIME, + hex2bcd(time->tm_sec), + hex2bcd(time->tm_min), + hex2bcd(time->tm_hour), + time->tm_wday, + hex2bcd(time->tm_mday), + hex2bcd(time->tm_mon) + 1, + hex2bcd(time->tm_year - 100)); + if (rc) + return rc; + smu_spinwait_simple(&cmd); - return rc; + return 0; } + void smu_shutdown(void) { - const unsigned char *command = "SHUTDOWN"; - unsigned long flags; + struct smu_simple_cmd cmd; if (smu == NULL) return; - spin_lock_irqsave(&smu->lock, flags); - smu->cmd_buf->cmd = 0xaa; - smu->cmd_buf->length = strlen(command); - strcpy(smu->cmd_buf->data, command); - smu_do_cmd(smu); + if (smu_queue_simple(&cmd, SMU_CMD_POWER_COMMAND, 9, NULL, NULL, + 'S', 'H', 'U', 'T', 'D', 'O', 'W', 'N', 0)) + return; + smu_spinwait_simple(&cmd); for (;;) ; - spin_unlock_irqrestore(&smu->lock, flags); } + void smu_restart(void) { - const unsigned char *command = "RESTART"; - unsigned long flags; + struct smu_simple_cmd cmd; if (smu == NULL) return; - spin_lock_irqsave(&smu->lock, flags); - smu->cmd_buf->cmd = 0xaa; - smu->cmd_buf->length = strlen(command); - strcpy(smu->cmd_buf->data, command); - smu_do_cmd(smu); + if (smu_queue_simple(&cmd, SMU_CMD_POWER_COMMAND, 8, NULL, NULL, + 'R', 'E', 'S', 'T', 'A', 'R', 'T', 0)) + return; + smu_spinwait_simple(&cmd); for (;;) ; - spin_unlock_irqrestore(&smu->lock, flags); } + int smu_present(void) { return smu != NULL; } +EXPORT_SYMBOL(smu_present); int smu_init (void) @@ -307,6 +453,8 @@ if (np == NULL) return -ENODEV; + printk(KERN_INFO "SMU driver %s %s\n", VERSION, AUTHOR); + if (smu_cmdbuf_abs == 0) { printk(KERN_ERR "SMU: Command buffer not allocated !\n"); return -EINVAL; @@ -318,7 +466,13 @@ memset(smu, 0, sizeof(*smu)); spin_lock_init(&smu->lock); + INIT_LIST_HEAD(&smu->cmd_list); + INIT_LIST_HEAD(&smu->cmd_i2c_list); smu->of_node = np; + smu->db_irq = NO_IRQ; + smu->msg_irq = NO_IRQ; + 
init_timer(&smu->i2c_timer); + /* smu_cmdbuf_abs is in the low 2G of RAM, can be converted to a * 32 bits value safely */ @@ -331,8 +485,8 @@ goto fail; } data = (u32 *)get_property(np, "reg", NULL); - of_node_put(np); if (data == NULL) { + of_node_put(np); printk(KERN_ERR "SMU: Can't find doorbell GPIO address !\n"); goto fail; } @@ -341,8 +495,31 @@ * and ack. GPIOs are at 0x50, best would be to find that out * in the device-tree though. */ - smu->db_req = 0x50 + *data; - smu->db_ack = 0x50 + *data; + smu->doorbell = *data; + if (smu->doorbell < 0x50) + smu->doorbell += 0x50; + if (np->n_intrs > 0) + smu->db_irq = np->intrs[0].line; + + of_node_put(np); + + /* Now look for the smu-interrupt GPIO */ + do { + np = of_find_node_by_name(NULL, "smu-interrupt"); + if (np == NULL) + break; + data = (u32 *)get_property(np, "reg", NULL); + if (data == NULL) { + of_node_put(np); + break; + } + smu->msg = *data; + if (smu->msg < 0x50) + smu->msg += 0x50; + if (np->n_intrs > 0) + smu->msg_irq = np->intrs[0].line; + of_node_put(np); + } while(0); /* Doorbell buffer is currently hard-coded, I didn't find a proper * device-tree entry giving the address. 
Best would probably to use @@ -362,3 +539,536 @@ return -ENXIO; } + + +static int smu_late_init(void) +{ + if (!smu) + return 0; + + /* + * Try to request the interrupts + */ + + if (smu->db_irq != NO_IRQ) { + if (request_irq(smu->db_irq, smu_db_intr, + SA_SHIRQ, "SMU doorbell", smu) < 0) { + printk(KERN_WARNING "SMU: can't " + "request interrupt %d\n", + smu->db_irq); + smu->db_irq = NO_IRQ; + } + } + + if (smu->msg_irq != NO_IRQ) { + if (request_irq(smu->msg_irq, smu_msg_intr, + SA_SHIRQ, "SMU message", smu) < 0) { + printk(KERN_WARNING "SMU: can't " + "request interrupt %d\n", + smu->msg_irq); + smu->msg_irq = NO_IRQ; + } + } + + return 0; +} +arch_initcall(smu_late_init); + +/* + * sysfs visibility + */ + +static void smu_expose_childs(void *unused) +{ + struct device_node *np; + + for (np = NULL; (np = of_get_next_child(smu->of_node, np)) != NULL;) { + if (device_is_compatible(np, "smu-i2c")) { + char name[32]; + u32 *reg = (u32 *)get_property(np, "reg", NULL); + + if (reg == NULL) + continue; + sprintf(name, "smu-i2c-%02x", *reg); + of_platform_device_create(np, name, &smu->of_dev->dev); + } + } + +} + +static DECLARE_WORK(smu_expose_childs_work, smu_expose_childs, NULL); + +static int smu_platform_probe(struct of_device* dev, + const struct of_match *match) +{ + if (!smu) + return -ENODEV; + smu->of_dev = dev; + + /* + * Ok, we are matched, now expose all i2c busses. 
We have to defer + * that unfortunately or it would deadlock inside the device model + */ + schedule_work(&smu_expose_childs_work); + + return 0; +} + +static struct of_match smu_platform_match[] = +{ + { + .name = OF_ANY_MATCH, + .type = "smu", + .compatible = OF_ANY_MATCH + }, + {}, +}; + +static struct of_platform_driver smu_of_platform_driver = +{ + .name = "smu", + .match_table = smu_platform_match, + .probe = smu_platform_probe, +}; + +static int __init smu_init_sysfs(void) +{ + int rc; + + /* + * Due to sysfs bogosity, a sysdev is not a real device, so + * we should in fact create both if we want sysdev semantics + * for power management. + * For now, we don't power manage machines with an SMU chip, + * I'm a bit too far from figuring out how that works with those + * new chipsets, but that will come back and bite us + */ + rc = of_register_driver(&smu_of_platform_driver); + return 0; +} + +device_initcall(smu_init_sysfs); + +struct of_device *smu_get_ofdev(void) +{ + if (!smu) + return NULL; + return smu->of_dev; +} + +EXPORT_SYMBOL_GPL(smu_get_ofdev); + +/* + * i2c interface + */ + +static void smu_i2c_complete_command(struct smu_i2c_cmd *cmd, int fail) +{ + void (*done)(struct smu_i2c_cmd *cmd, void *misc) = cmd->done; + void *misc = cmd->misc; + unsigned long flags; + + /* Check for read case */ + if (!fail && cmd->read) { + if (cmd->pdata[0] < 1) + fail = 1; + else + memcpy(cmd->info.data, &cmd->pdata[1], + cmd->info.datalen); + } + + DPRINTK("SMU: completing, success: %d\n", !fail); + + /* Update status and mark no pending i2c command with lock + * held so nobody comes in while we dequeue an eventual + * pending next i2c command + */ + spin_lock_irqsave(&smu->lock, flags); + smu->cmd_i2c_cur = NULL; + wmb(); + cmd->status = fail ? -EIO : 0; + + /* Is there another i2c command waiting ? 
*/ + if (!list_empty(&smu->cmd_i2c_list)) { + struct smu_i2c_cmd *newcmd; + + /* Fetch it, new current, remove from list */ + newcmd = list_entry(smu->cmd_i2c_list.next, + struct smu_i2c_cmd, link); + smu->cmd_i2c_cur = newcmd; + list_del(&cmd->link); + + /* Queue with low level smu */ + list_add_tail(&cmd->scmd.link, &smu->cmd_list); + if (smu->cmd_cur == NULL) + smu_start_cmd(); + } + spin_unlock_irqrestore(&smu->lock, flags); + + /* Call command completion handler if any */ + if (done) + done(cmd, misc); + +} + + +static void smu_i2c_retry(unsigned long data) +{ + struct smu_i2c_cmd *cmd = (struct smu_i2c_cmd *)data; + + DPRINTK("SMU: i2c failure, requeuing...\n"); + + /* requeue command simply by resetting reply_len */ + cmd->pdata[0] = 0xff; + cmd->scmd.reply_len = 0x10; + smu_queue_cmd(&cmd->scmd); +} + + +static void smu_i2c_low_completion(struct smu_cmd *scmd, void *misc) +{ + struct smu_i2c_cmd *cmd = misc; + int fail = 0; + + DPRINTK("SMU: i2c compl. stage=%d status=%x pdata[0]=%x rlen: %x\n", + cmd->stage, scmd->status, cmd->pdata[0], scmd->reply_len); + + /* Check for possible status */ + if (scmd->status < 0) + fail = 1; + else if (cmd->read) { + if (cmd->stage == 0) + fail = cmd->pdata[0] != 0; + else + fail = cmd->pdata[0] >= 0x80; + } else { + fail = cmd->pdata[0] != 0; + } + + /* Handle failures by requeuing command, after 5ms interval + */ + if (fail && --cmd->retries > 0) { + DPRINTK("SMU: i2c failure, starting timer...\n"); + smu->i2c_timer.function = smu_i2c_retry; + smu->i2c_timer.data = (unsigned long)cmd; + smu->i2c_timer.expires = jiffies + msecs_to_jiffies(5); + add_timer(&smu->i2c_timer); + return; + } + + /* If failure or stage 1, command is complete */ + if (fail || cmd->stage != 0) { + smu_i2c_complete_command(cmd, fail); + return; + } + + DPRINTK("SMU: going to stage 1\n"); + + /* Ok, initial command complete, now poll status */ + scmd->reply_buf = cmd->pdata; + scmd->reply_len = 0x10; + scmd->data_buf = cmd->pdata; + scmd->data_len = 
1; + cmd->pdata[0] = 0; + cmd->stage = 1; + cmd->retries = 20; + smu_queue_cmd(scmd); +} + + +int smu_queue_i2c(struct smu_i2c_cmd *cmd) +{ + unsigned long flags; + + if (smu == NULL) + return -ENODEV; + + /* Fill most fields of scmd */ + cmd->scmd.cmd = SMU_CMD_I2C_COMMAND; + cmd->scmd.done = smu_i2c_low_completion; + cmd->scmd.misc = cmd; + cmd->scmd.reply_buf = cmd->pdata; + cmd->scmd.reply_len = 0x10; + cmd->scmd.data_buf = (u8 *)(char *)&cmd->info; + cmd->scmd.status = 1; + cmd->stage = 0; + cmd->pdata[0] = 0xff; + cmd->retries = 20; + cmd->status = 1; + + /* Check transfer type, sanitize some "info" fields + * based on transfer type and do more checking + */ + cmd->info.caddr = cmd->info.devaddr; + cmd->read = cmd->info.devaddr & 0x01; + switch(cmd->info.type) { + case SMU_I2C_TRANSFER_SIMPLE: + memset(&cmd->info.sublen, 0, 4); + break; + case SMU_I2C_TRANSFER_COMBINED: + cmd->info.devaddr &= 0xfe; + case SMU_I2C_TRANSFER_STDSUB: + if (cmd->info.sublen > 3) + return -EINVAL; + break; + default: + return -EINVAL; + } + + /* Finish setting up command based on transfer direction + */ + if (cmd->read) { + if (cmd->info.datalen > SMU_I2C_READ_MAX) + return -EINVAL; + memset(cmd->info.data, 0xff, cmd->info.datalen); + cmd->scmd.data_len = 9; + } else { + if (cmd->info.datalen > SMU_I2C_WRITE_MAX) + return -EINVAL; + cmd->scmd.data_len = 9 + cmd->info.datalen; + } + + DPRINTK("SMU: i2c enqueuing command\n"); + DPRINTK("SMU: %s, len=%d bus=%x addr=%x sub0=%x type=%x\n", + cmd->read ? 
"read" : "write", cmd->info.datalen, + cmd->info.bus, cmd->info.caddr, + cmd->info.subaddr[0], cmd->info.type); + + + /* Enqueue command in i2c list, and if empty, enqueue also in + * main command list + */ + spin_lock_irqsave(&smu->lock, flags); + if (smu->cmd_i2c_cur == NULL) { + smu->cmd_i2c_cur = cmd; + list_add_tail(&cmd->scmd.link, &smu->cmd_list); + if (smu->cmd_cur == NULL) + smu_start_cmd(); + } else + list_add_tail(&cmd->link, &smu->cmd_i2c_list); + spin_unlock_irqrestore(&smu->lock, flags); + + return 0; +} + + + +/* + * Userland driver interface + */ + + +static LIST_HEAD(smu_clist); +static DEFINE_SPINLOCK(smu_clist_lock); + +enum smu_file_mode { + smu_file_commands, + smu_file_events, + smu_file_closing +}; + +struct smu_private +{ + struct list_head list; + enum smu_file_mode mode; + int busy; + struct smu_cmd cmd; + spinlock_t lock; + wait_queue_head_t wait; + u8 buffer[SMU_MAX_DATA]; +}; + + +static int smu_open(struct inode *inode, struct file *file) +{ + struct smu_private *pp; + unsigned long flags; + + pp = kmalloc(sizeof(struct smu_private), GFP_KERNEL); + if (pp == 0) + return -ENOMEM; + memset(pp, 0, sizeof(struct smu_private)); + spin_lock_init(&pp->lock); + pp->mode = smu_file_commands; + init_waitqueue_head(&pp->wait); + + spin_lock_irqsave(&smu_clist_lock, flags); + list_add(&pp->list, &smu_clist); + spin_unlock_irqrestore(&smu_clist_lock, flags); + file->private_data = pp; + + return 0; +} + + +static void smu_user_cmd_done(struct smu_cmd *cmd, void *misc) +{ + struct smu_private *pp = misc; + + wake_up_interruptible(&pp->wait); +} + + +static ssize_t smu_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct smu_private *pp = file->private_data; + unsigned long flags; + struct smu_user_cmd_hdr hdr; + int rc = 0; + + if (pp->busy) + return -EBUSY; + else if (copy_from_user(&hdr, buf, sizeof(hdr))) + return -EFAULT; + else if (hdr.cmdtype == SMU_CMDTYPE_WANTS_EVENTS) { + pp->mode = smu_file_events; + 
return 0; + } else if (hdr.cmdtype != SMU_CMDTYPE_SMU) + return -EINVAL; + else if (pp->mode != smu_file_commands) + return -EBADFD; + else if (hdr.data_len > SMU_MAX_DATA) + return -EINVAL; + + spin_lock_irqsave(&pp->lock, flags); + if (pp->busy) { + spin_unlock_irqrestore(&pp->lock, flags); + return -EBUSY; + } + pp->busy = 1; + pp->cmd.status = 1; + spin_unlock_irqrestore(&pp->lock, flags); + + if (copy_from_user(pp->buffer, buf + sizeof(hdr), hdr.data_len)) { + pp->busy = 0; + return -EFAULT; + } + + pp->cmd.cmd = hdr.cmd; + pp->cmd.data_len = hdr.data_len; + pp->cmd.reply_len = SMU_MAX_DATA; + pp->cmd.data_buf = pp->buffer; + pp->cmd.reply_buf = pp->buffer; + pp->cmd.done = smu_user_cmd_done; + pp->cmd.misc = pp; + rc = smu_queue_cmd(&pp->cmd); + if (rc < 0) + return rc; + return count; +} + + +static ssize_t smu_read_command(struct file *file, struct smu_private *pp, + char __user *buf, size_t count) +{ + DECLARE_WAITQUEUE(wait, current); + struct smu_user_reply_hdr hdr; + int size, rc = 0; + + if (!pp->busy) + return 0; + if (count < sizeof(struct smu_user_reply_hdr)) + return -EOVERFLOW; + if (pp->cmd.status == 1) { + if (file->f_flags & O_NONBLOCK) + return -EAGAIN; + add_wait_queue(&pp->wait, &wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + rc = 0; + if (pp->cmd.status != 1) + break; + rc = -ERESTARTSYS; + if (signal_pending(current)) + break; + schedule(); + } + set_current_state(TASK_RUNNING); + remove_wait_queue(&pp->wait, &wait); + if (rc) + return rc; + } + if (pp->cmd.status != 0) + pp->cmd.reply_len = 0; + size = sizeof(hdr) + pp->cmd.reply_len; + if (count < size) + size = count; + rc = size; + hdr.status = pp->cmd.status; + hdr.reply_len = pp->cmd.reply_len; + if (copy_to_user(buf, &hdr, sizeof(hdr))) + return -EFAULT; + size -= sizeof(hdr); + if (size && copy_to_user(buf + sizeof(hdr), pp->buffer, size)) + return -EFAULT; + pp->busy = 0; + + return rc; +} + + +static ssize_t smu_read_events(struct file *file, struct smu_private 
*pp, + char __user *buf, size_t count) +{ + /* Not implemented */ + msleep_interruptible(1000); + return 0; +} + + +static ssize_t smu_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct smu_private *pp = file->private_data; + + if (pp->mode == smu_file_commands) + return smu_read_command(file, pp, buf, count); + if (pp->mode == smu_file_events) + return smu_read_events(file, pp, buf, count); + + return -EBADFD; +} + + +static int smu_release(struct inode *inode, struct file *file) +{ + struct smu_private *pp = file->private_data; + unsigned long flags; + + if (pp == 0) + return 0; + + file->private_data = NULL; + pp->mode = smu_file_closing; + // XXX wait completion + spin_lock_irqsave(&smu_clist_lock, flags); + list_del(&pp->list); + spin_unlock_irqrestore(&smu_clist_lock, flags); + kfree(pp); + + return 0; +} + + +static struct file_operations smu_device_fops __pmacdata = { + .llseek = no_llseek, + .read = smu_read, + .write = smu_write, + .open = smu_open, + .release = smu_release, +}; + +static struct miscdevice pmu_device __pmacdata = { + MISC_DYNAMIC_MINOR, "smu", &smu_device_fops +}; + +static int smu_device_init(void) +{ + if (!smu) + return -ENODEV; + if (misc_register(&pmu_device) < 0) + printk(KERN_ERR "via-pmu: cannot register misc device.\n"); + return 0; +} +device_initcall(smu_device_init); Index: linux-work/include/asm-ppc64/smu.h =================================================================== --- linux-work.orig/include/asm-ppc64/smu.h 2005-05-05 14:57:39.000000000 +1000 +++ linux-work/include/asm-ppc64/smu.h 2005-05-05 14:58:12.000000000 +1000 @@ -1,22 +1,379 @@ +#ifndef _SMU_H +#define _SMU_H + /* * Definitions for talking to the SMU chip in newer G5 PowerMacs */ #include +#include + +/* + * Known SMU commands + * + * Most of what is below comes from looking at the Open Firmware driver, + * though this is still incomplete and could use better documentation here + * or there... 
+ */ + + +/* + * Partition info commands + * + * I do not know what those are for at this point + */ +#define SMU_CMD_PARTITION_COMMAND 0x3e + /* - * Basic routines for use by architecture. To be extended as - * we understand more of the chip + * Fan control + * + * This is a "mux" for fan control commands, first byte is the + * "sub" command. + */ +#define SMU_CMD_FAN_COMMAND 0x4a + + +/* + * Battery access + * + * Same command number as the PMU, could it be same syntax ? + */ +#define SMU_CMD_BATTERY_COMMAND 0x6f +#define SMU_CMD_GET_BATTERY_INFO 0x00 + +/* + * Real time clock control + * + * This is a "mux", first data byte contains the "sub" command. + * The "RTC" part of the SMU controls the date, time, powerup + * timer, but also a PRAM + * + * Dates are in BCD format on 7 bytes: + * [sec] [min] [hour] [weekday] [month day] [month] [year] + * with month being 1 based and year minus 100 + */ +#define SMU_CMD_RTC_COMMAND 0x8e +#define SMU_CMD_RTC_SET_PWRUP_TIMER 0x00 /* i: 7 bytes date */ +#define SMU_CMD_RTC_GET_PWRUP_TIMER 0x01 /* o: 7 bytes date */ +#define SMU_CMD_RTC_STOP_PWRUP_TIMER 0x02 +#define SMU_CMD_RTC_SET_PRAM_BYTE_ACC 0x20 /* i: 1 byte (address?) */ +#define SMU_CMD_RTC_SET_PRAM_AUTOINC 0x21 /* i: 1 byte (data?) */ +#define SMU_CMD_RTC_SET_PRAM_LO_BYTES 0x22 /* i: 10 bytes */ +#define SMU_CMD_RTC_SET_PRAM_HI_BYTES 0x23 /* i: 10 bytes */ +#define SMU_CMD_RTC_GET_PRAM_BYTE 0x28 /* i: 1 bytes (address?) */ +#define SMU_CMD_RTC_GET_PRAM_LO_BYTES 0x29 /* o: 10 bytes */ +#define SMU_CMD_RTC_GET_PRAM_HI_BYTES 0x2a /* o: 10 bytes */ +#define SMU_CMD_RTC_SET_DATETIME 0x80 /* i: 7 bytes date */ +#define SMU_CMD_RTC_GET_DATETIME 0x81 /* o: 7 bytes date */ + + /* + * i2c commands + * + * To issue an i2c command, first is to send a parameter block to the + * the SMU. This is a command of type 0x9a with 9 bytes of header + * eventually followed by data for a write: + * + * 0: bus number (from device-tree usually, SMU has lots of busses !) 
+ * 1: transfer type/format (see below) + * 2: device address. For combined and combined4 type transfers, this + * is the "write" version of the address (bit 0x01 cleared) + * 3: subaddress length (0..3) + * 4: subaddress byte 0 (or only byte for subaddress length 1) + * 5: subaddress byte 1 + * 6: subaddress byte 2 + * 7: combined address (device address for combined mode data phase) + * 8: data length + * + * The transfer types are the same good old Apple ones it seems, + * that is: + * - 0x00: Simple transfer + * - 0x01: Subaddress transfer (addr write + data tx, no restart) + * - 0x02: Combined transfer (addr write + restart + data tx) + * + * This is then followed by actual data for a write. + * + * At this point, the OF driver seems to have a limitation on transfer + * sizes of 0xd bytes on reads and 0x5 bytes on writes. I do not know + * whether this is just an OF limit due to some temporary buffer size + * or if this is an SMU imposed limit. This driver has the same limitation + * for now as I use a 0x10 bytes temporary buffer as well + * + * Once that is completed, a response is expected from the SMU. This is + * obtained via a command of type 0x9a with a length of 1 byte containing + * 0 as the data byte. OF also fills the rest of the data buffer with 0xff's + * though I can't tell yet if this is actually necessary. Once this command + * is complete, at this point, all I can tell is what OF does. OF tests + * byte 0 of the reply: + * - on read, 0xfe or 0xfc : bus is busy, wait (see below) or nak ? + * - on read, 0x00 or 0x01 : reply is in buffer (after the byte 0) + * - on write, < 0 -> failure (immediate exit) + * - else, OF just exits (without error, weird) + * + * So on read, there is this wait-for-busy thing when getting a 0xfc or + * 0xfe result. OF does a loop of up to 64 retries, waiting 20ms and + * doing the above again until either the retries expire or the result + * is no longer 0xfe or 0xfc + * + * The Darwin I2C driver is less subtle though. 
On any non-success status + * from the response command, it waits 5ms and tries again up to 20 times, + * it doesn't differentiate between fatal errors or "busy" status. + * + * This driver provides an asynchronous paramblock based i2c command + * interface to be used either directly by low level code or by a higher + * level driver interfacing to the linux i2c layer. The current + * implementation of this relies on working timers & timer interrupts + * though, so be careful of calling context for now. This may be "fixed" + * in the future by adding a polling facility. + */ +#define SMU_CMD_I2C_COMMAND 0x9a + /* transfer types */ +#define SMU_I2C_TRANSFER_SIMPLE 0x00 +#define SMU_I2C_TRANSFER_STDSUB 0x01 +#define SMU_I2C_TRANSFER_COMBINED 0x02 + +/* + * Power supply control + * + * The "sub" command is an ASCII string in the data, the + * data length is that of the string. + * + * The VSLEW command can be used to get or set the voltage slewing. + * - length 5 (only "VSLEW") : it returns "DONE" and 3 bytes of + * reply at data offset 6, 7 and 8. + * - length 8 ("VSLEWxyz") has 3 additional bytes appended, and is + * used to set the voltage slewing point. The SMU replies with "DONE" + * I yet have to figure out the exact meaning of those 3 bytes in + * both cases. 
+ * + */ +#define SMU_CMD_POWER_COMMAND 0xaa +#define SMU_CMD_POWER_RESTART "RESTART" +#define SMU_CMD_POWER_SHUTDOWN "SHUTDOWN" +#define SMU_CMD_POWER_VOLTAGE_SLEW "VSLEW" + +/* Misc commands + * + * This command seems to be a grab bag of various things + */ +#define SMU_CMD_MISC_df_COMMAND 0xdf +#define SMU_CMD_MISC_df_SET_DISPLAY_LIT 0x02 /* i: 1 byte */ +#define SMU_CMD_MISC_df_NMI_OPTION 0x04 + +/* + * Version info commands + * + * I haven't quite tried to figure out how these work + */ +#define SMU_CMD_VERSION_COMMAND 0xea + + +/* + * Misc commands + * + * This command seems to be a grab bag of various things + */ +#define SMU_CMD_MISC_ee_COMMAND 0xee +#define SMU_CMD_MISC_ee_GET_DATABLOCK_REC 0x02 +#define SMU_CMD_MISC_ee_LEDS_CTRL 0x04 /* i: 00 (00,01) [00] */ +#define SMU_CMD_MISC_ee_GET_DATA 0x05 /* i: 00 , o: ?? */ + + + +/* + * - Kernel side interface - + */ + +#ifdef __KERNEL__ + +/* + * Asynchronous SMU commands + * + * Fill up this structure and submit it via smu_queue_cmd(), + * and get notified by the optional done() callback, or because + * status becomes != 1 + */ + +struct smu_cmd; + +struct smu_cmd +{ + /* public */ + u8 cmd; /* command */ + int data_len; /* data len */ + int reply_len; /* reply len */ + void *data_buf; /* data buffer */ + void *reply_buf; /* reply buffer */ + int status; /* command status */ + void (*done)(struct smu_cmd *cmd, void *misc); + void *misc; + + /* private */ + struct list_head link; +}; + +/* + * Queues an SMU command, all fields have to be initialized + */ +extern int smu_queue_cmd(struct smu_cmd *cmd); + +/* + * Simple command wrapper. This structure embeds a small buffer + * to ease sending simple SMU commands from the stack + */ +struct smu_simple_cmd +{ + struct smu_cmd cmd; + u8 buffer[16]; +}; + +/* + * Queues a simple command. 
All fields will be initialized by that + * function + */ +extern int smu_queue_simple(struct smu_simple_cmd *scmd, u8 command, + unsigned int data_len, + void (*done)(struct smu_cmd *cmd, void *misc), + void *misc, + ...); + +/* + * Completion helper. Pass it to smu_queue_simple or as 'done' + * member to smu_queue_cmd, it will call complete() on the struct + * completion passed in the "misc" argument + */ +extern void smu_done_complete(struct smu_cmd *cmd, void *misc); + +/* + * Synchronous helpers. Will spin-wait for completion of a command + */ +extern void smu_spinwait_cmd(struct smu_cmd *cmd); + +static inline void smu_spinwait_simple(struct smu_simple_cmd *scmd) +{ + smu_spinwait_cmd(&scmd->cmd); +} + +/* + * Poll routine to call if blocked with irqs off + */ +extern void smu_poll(void); + + +/* + * Init routine, presence check.... */ extern int smu_init(void); extern int smu_present(void); +struct of_device; +extern struct of_device *smu_get_ofdev(void); + + +/* + * Common command wrappers + */ extern void smu_shutdown(void); extern void smu_restart(void); -extern int smu_get_rtc_time(struct rtc_time *time); -extern int smu_set_rtc_time(struct rtc_time *time); +struct rtc_time; +extern int smu_get_rtc_time(struct rtc_time *time, int spinwait); +extern int smu_set_rtc_time(struct rtc_time *time, int spinwait); /* * SMU command buffer absolute address, exported by pmac_setup, * this is allocated very early during boot. 
*/ extern unsigned long smu_cmdbuf_abs; + + +/* + * Kernel asynchronous i2c interface + */ + +/* SMU i2c header, exactly matches i2c header on wire */ +struct smu_i2c_param +{ + u8 bus; /* SMU bus ID (from device tree) */ + u8 type; /* i2c transfer type */ + u8 devaddr; /* device address (includes direction) */ + u8 sublen; /* subaddress length */ + u8 subaddr[3]; /* subaddress */ + u8 caddr; /* combined address, filled by SMU driver */ + u8 datalen; /* length of transfer */ + u8 data[7]; /* data */ +}; + +#define SMU_I2C_READ_MAX 0x0d +#define SMU_I2C_WRITE_MAX 0x05 + +struct smu_i2c_cmd +{ + /* public */ + struct smu_i2c_param info; + void (*done)(struct smu_i2c_cmd *cmd, void *misc); + void *misc; + int status; /* 1 = pending, 0 = ok, <0 = fail */ + + /* private */ + struct smu_cmd scmd; + int read; + int stage; + int retries; + u8 pdata[0x10]; + struct list_head link; +}; + +/* + * Call this to queue an i2c command to the SMU. You must fill info, + * including info.data for a write, done and misc. + * For now, no polling interface is provided so you have to use completion + * callback. + */ +extern int smu_queue_i2c(struct smu_i2c_cmd *cmd); + + +#endif /* __KERNEL__ */ + +/* + * - Userland interface - + */ + +/* + * A given instance of the device can be configured for 2 different + * things at the moment: + * + * - sending SMU commands (default at open() time) + * - receiving SMU events (not yet implemented) + * + * Commands are written with write() of a command block. They can be + * "driver" commands (for example to switch to event reception mode) + * or real SMU commands. They are made of a header followed by command + * data if any. + * + * For SMU commands (not for driver commands), you can then read() back + * a reply. The reader will be blocked or not depending on how the device + * file is opened. poll() isn't implemented yet. The reply will consist + * of a header as well, followed by the reply data if any. 
You should + * always provide a buffer large enough for the maximum reply data, I + * recommend one page. + * + * It is illegal to send SMU commands through a file descriptor configured + * for event reception + * + */ +struct smu_user_cmd_hdr +{ + __u32 cmdtype; +#define SMU_CMDTYPE_SMU 0 /* SMU command */ +#define SMU_CMDTYPE_WANTS_EVENTS 1 /* switch fd to events mode */ + + __u8 cmd; /* SMU command byte */ + __u32 data_len; /* Length of data following */ +}; + +struct smu_user_reply_hdr +{ + __u32 status; /* Command status */ + __u32 reply_len; /* Length of data following */ +}; + +#endif /* _SMU_H */ Index: linux-work/arch/ppc64/kernel/pmac_time.c =================================================================== --- linux-work.orig/arch/ppc64/kernel/pmac_time.c 2005-05-05 14:57:39.000000000 +1000 +++ linux-work/arch/ppc64/kernel/pmac_time.c 2005-05-05 14:58:12.000000000 +1000 @@ -89,7 +89,7 @@ #ifdef CONFIG_PMAC_SMU case SYS_CTRLER_SMU: - smu_get_rtc_time(tm); + smu_get_rtc_time(tm, 1); break; #endif /* CONFIG_PMAC_SMU */ default: @@ -133,7 +133,7 @@ #ifdef CONFIG_PMAC_SMU case SYS_CTRLER_SMU: - return smu_set_rtc_time(tm); + return smu_set_rtc_time(tm, 1); #endif /* CONFIG_PMAC_SMU */ default: return -ENODEV; Index: linux-work/drivers/i2c/busses/i2c-pmac-smu.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-work/drivers/i2c/busses/i2c-pmac-smu.c 2005-05-05 16:23:19.000000000 +1000 @@ -0,0 +1,298 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int probe; + +MODULE_AUTHOR("Benjamin Herrenschmidt "); +MODULE_DESCRIPTION("I2C driver for Apple's SMU"); +MODULE_LICENSE("GPL"); +module_param(probe, bool, 0); + + +/* Physical interface */ +struct smu_iface +{ + struct i2c_adapter adapter; + struct completion complete; + u32 busid; +}; + +static void smu_i2c_done(struct smu_i2c_cmd *cmd, void *misc) +{ + struct smu_iface 
*iface = misc; + complete(&iface->complete); +} + +/* + * SMBUS-type transfer entrypoint + */ +static s32 smu_smbus_xfer( struct i2c_adapter* adap, + u16 addr, + unsigned short flags, + char read_write, + u8 command, + int size, + union i2c_smbus_data* data) +{ + struct smu_iface *iface = i2c_get_adapdata(adap); + struct smu_i2c_cmd cmd; + int rc = 0; + int read = (read_write == I2C_SMBUS_READ); + + cmd.info.bus = iface->busid; + cmd.info.devaddr = (addr << 1) | (read ? 0x01 : 0x00); + + /* Prepare datas & select mode */ + switch (size) { + case I2C_SMBUS_QUICK: + cmd.info.type = SMU_I2C_TRANSFER_SIMPLE; + cmd.info.datalen = 0; + break; + case I2C_SMBUS_BYTE: + cmd.info.type = SMU_I2C_TRANSFER_SIMPLE; + cmd.info.datalen = 1; + if (!read) + cmd.info.data[0] = data->byte; + break; + case I2C_SMBUS_BYTE_DATA: + cmd.info.type = SMU_I2C_TRANSFER_STDSUB; + cmd.info.datalen = 1; + cmd.info.sublen = 1; + cmd.info.subaddr[0] = command; + cmd.info.subaddr[1] = 0; + cmd.info.subaddr[2] = 0; + if (!read) + cmd.info.data[0] = data->byte; + break; + case I2C_SMBUS_WORD_DATA: + cmd.info.type = SMU_I2C_TRANSFER_STDSUB; + cmd.info.datalen = 2; + cmd.info.sublen = 1; + cmd.info.subaddr[0] = command; + cmd.info.subaddr[1] = 0; + cmd.info.subaddr[2] = 0; + if (!read) { + cmd.info.data[0] = data->byte & 0xff; + cmd.info.data[1] = (data->byte >> 8) & 0xff; + } + break; + /* Note that these are broken vs. the expected smbus API where + * on reads, the lenght is actually returned from the function, + * but I think the current API makes no sense and I don't want + * any driver that I haven't verified for correctness to go + * anywhere near a pmac i2c bus anyway ... 
+ */ + case I2C_SMBUS_BLOCK_DATA: + cmd.info.type = SMU_I2C_TRANSFER_STDSUB; + cmd.info.datalen = data->block[0] + 1; + if (cmd.info.datalen > 6) + return -EINVAL; + if (!read) + memcpy(cmd.info.data, data->block, cmd.info.datalen); + cmd.info.sublen = 1; + cmd.info.subaddr[0] = command; + cmd.info.subaddr[1] = 0; + cmd.info.subaddr[2] = 0; + break; + case I2C_SMBUS_I2C_BLOCK_DATA: + cmd.info.type = SMU_I2C_TRANSFER_STDSUB; + cmd.info.datalen = data->block[0]; + if (cmd.info.datalen > 7) + return -EINVAL; + if (!read) + memcpy(cmd.info.data, &data->block[1], + cmd.info.datalen); + cmd.info.sublen = 1; + cmd.info.subaddr[0] = command; + cmd.info.subaddr[1] = 0; + cmd.info.subaddr[2] = 0; + break; + + default: + return -EINVAL; + } + + /* Turn a standardsub read into a combined mode access */ + if (read_write == I2C_SMBUS_READ && + cmd.info.type == SMU_I2C_TRANSFER_STDSUB) + cmd.info.type = SMU_I2C_TRANSFER_COMBINED; + + /* Finish filling command and submit it */ + cmd.done = smu_i2c_done; + cmd.misc = iface; + rc = smu_queue_i2c(&cmd); + if (rc < 0) + return rc; + wait_for_completion(&iface->complete); + rc = cmd.status; + + if (!read || rc < 0) + return rc; + + switch (size) { + case I2C_SMBUS_BYTE: + case I2C_SMBUS_BYTE_DATA: + data->byte = cmd.info.data[0]; + break; + case I2C_SMBUS_WORD_DATA: + data->word = ((u16)cmd.info.data[1]) << 8; + data->word |= cmd.info.data[0]; + break; + /* Note that these are broken vs. the expected smbus API where + * on reads, the lenght is actually returned from the function, + * but I think the current API makes no sense and I don't want + * any driver that I haven't verified for correctness to go + * anywhere near a pmac i2c bus anyway ... 
+ */ + case I2C_SMBUS_BLOCK_DATA: + case I2C_SMBUS_I2C_BLOCK_DATA: + memcpy(&data->block[0], cmd.info.data, cmd.info.datalen); + break; + } + + return rc; +} + +static u32 +smu_smbus_func(struct i2c_adapter * adapter) +{ + return I2C_FUNC_SMBUS_QUICK | I2C_FUNC_SMBUS_BYTE | + I2C_FUNC_SMBUS_BYTE_DATA | I2C_FUNC_SMBUS_WORD_DATA | + I2C_FUNC_SMBUS_BLOCK_DATA; +} + +/* For now, we only handle combined mode (smbus) */ +static struct i2c_algorithm smu_algorithm = { + .name = "SMU i2c", + .id = I2C_ALGO_SMBUS, + .smbus_xfer = smu_smbus_xfer, + .functionality = smu_smbus_func, +}; + +static int create_iface(struct device_node *np, struct device *dev) +{ + struct smu_iface* iface; + u32 *reg, busid; + int rc; + + reg = (u32 *)get_property(np, "reg", NULL); + if (reg == NULL) { + printk(KERN_ERR "i2c-pmac-smu: can't find bus number !\n"); + return -ENXIO; + } + busid = *reg; + + iface = kmalloc(sizeof(struct smu_iface), GFP_KERNEL); + if (iface == NULL) { + printk(KERN_ERR "i2c-pmac-smu: can't allocate inteface !\n"); + return -ENOMEM; + } + memset(iface, 0, sizeof(struct smu_iface)); + init_completion(&iface->complete); + iface->busid = busid; + + dev_set_drvdata(dev, iface); + + sprintf(iface->adapter.name, "smu-i2c-%02x", busid); + iface->adapter.id = I2C_ALGO_SMBUS; + iface->adapter.algo = &smu_algorithm; + iface->adapter.algo_data = NULL; + iface->adapter.client_register = NULL; + iface->adapter.client_unregister = NULL; + i2c_set_adapdata(&iface->adapter, iface); + iface->adapter.dev.parent = dev; + + rc = i2c_add_adapter(&iface->adapter); + if (rc) { + printk(KERN_ERR "i2c-pamc-smu.c: Adapter %s registration " + "failed\n", iface->adapter.name); + i2c_set_adapdata(&iface->adapter, NULL); + } + + if (probe) { + unsigned char addr; + printk("Probe: "); + for (addr = 0x00; addr <= 0x7f; addr++) { + if (i2c_smbus_xfer(&iface->adapter,addr, + 0,0,0,I2C_SMBUS_QUICK,NULL) >= 0) + printk("%02x ", addr); + } + printk("\n"); + } + + printk(KERN_INFO "SMU i2c bus %x 
registered\n", busid); + + return 0; +} + +static int dispose_iface(struct device *dev) +{ + struct smu_iface *iface = dev_get_drvdata(dev); + int rc; + + rc = i2c_del_adapter(&iface->adapter); + i2c_set_adapdata(&iface->adapter, NULL); + /* We aren't that prepared to deal with this... */ + if (rc) + printk("i2c-pmac-smu.c: Failed to remove bus %s !\n", + iface->adapter.name); + dev_set_drvdata(dev, NULL); + kfree(iface); + + return 0; +} + + +static int create_iface_of_platform(struct of_device* dev, + const struct of_match *match) +{ + return create_iface(dev->node, &dev->dev); +} + + +static int dispose_iface_of_platform(struct of_device* dev) +{ + return dispose_iface(&dev->dev); +} + + +static struct of_match i2c_smu_match[] = +{ + { + .name = OF_ANY_MATCH, + .type = OF_ANY_MATCH, + .compatible = "smu-i2c" + }, + {}, +}; +static struct of_platform_driver i2c_smu_of_platform_driver = +{ + .name = "i2c-smu", + .match_table = i2c_smu_match, + .probe = create_iface_of_platform, + .remove = dispose_iface_of_platform +}; + + +static int __init i2c_pmac_smu_init(void) +{ + of_register_driver(&i2c_smu_of_platform_driver); + return 0; +} + + +static void __exit i2c_pmac_smu_cleanup(void) +{ + of_unregister_driver(&i2c_smu_of_platform_driver); +} + +module_init(i2c_pmac_smu_init); +module_exit(i2c_pmac_smu_cleanup); Index: linux-work/drivers/i2c/busses/Kconfig =================================================================== --- linux-work.orig/drivers/i2c/busses/Kconfig 2005-05-05 14:57:39.000000000 +1000 +++ linux-work/drivers/i2c/busses/Kconfig 2005-05-05 14:58:12.000000000 +1000 @@ -235,6 +235,18 @@ This support is also available as a module. If so, the module will be called i2c-keywest. +config I2C_PMAC_SMU + tristate "Powermac SMU I2C interface" + depends on I2C && PPC_PMAC64 + help + This supports the use of the I2C interface in the SMU + chip on recent Apple machines like the iMac G5. It is used + among others by the thermal control driver for those machines. 
+ Say Y if you have such a machine. + + This support is also available as a module. If so, the module + will be called i2c-pmac-smu. + config I2C_MPC tristate "MPC107/824x/85xx/52xx" depends on I2C && PPC32 Index: linux-work/drivers/i2c/busses/Makefile =================================================================== --- linux-work.orig/drivers/i2c/busses/Makefile 2005-05-05 14:57:39.000000000 +1000 +++ linux-work/drivers/i2c/busses/Makefile 2005-05-05 14:58:12.000000000 +1000 @@ -20,6 +20,7 @@ obj-$(CONFIG_I2C_IXP2000) += i2c-ixp2000.o obj-$(CONFIG_I2C_IXP4XX) += i2c-ixp4xx.o obj-$(CONFIG_I2C_KEYWEST) += i2c-keywest.o +obj-$(CONFIG_I2C_PMAC_SMU) += i2c-pmac-smu.o obj-$(CONFIG_I2C_MPC) += i2c-mpc.o obj-$(CONFIG_I2C_MV64XXX) += i2c-mv64xxx.o obj-$(CONFIG_I2C_NFORCE2) += i2c-nforce2.o Index: linux-work/arch/ppc64/kernel/pmac_setup.c =================================================================== --- linux-work.orig/arch/ppc64/kernel/pmac_setup.c 2005-05-05 14:57:39.000000000 +1000 +++ linux-work/arch/ppc64/kernel/pmac_setup.c 2005-05-05 16:14:18.000000000 +1000 @@ -444,15 +444,23 @@ static int __init pmac_declare_of_platform_devices(void) { - struct device_node *np; + struct device_node *np, *npp; - np = find_devices("u3"); - if (np) { - for (np = np->child; np != NULL; np = np->sibling) + npp = of_find_node_by_name(NULL, "u3"); + if (npp) { + for (np = NULL; (np = of_get_next_child(npp, np)) != NULL;) { if (strncmp(np->name, "i2c", 3) == 0) { - of_platform_device_create(np, "u3-i2c"); + of_platform_device_create(np, "u3-i2c", NULL); + of_node_put(np); break; } + } + of_node_put(npp); + } + npp = of_find_node_by_type(NULL, "smu"); + if (npp) { + of_platform_device_create(npp, "smu", NULL); + of_node_put(npp); } return 0; Index: linux-work/drivers/macintosh/therm_adt746x.c =================================================================== --- linux-work.orig/drivers/macintosh/therm_adt746x.c 2005-05-02 10:48:11.000000000 +1000 +++ 
linux-work/drivers/macintosh/therm_adt746x.c 2005-05-05 16:17:03.000000000 +1000 @@ -563,7 +563,7 @@ "limit_adjust: %d, fan_speed: %d\n", therm_bus, therm_address, limit_adjust, fan_speed); - of_dev = of_platform_device_create(np, "temperatures"); + of_dev = of_platform_device_create(np, "temperatures", NULL); if (of_dev == NULL) { printk(KERN_ERR "Can't register temperatures device !\n"); Index: linux-work/drivers/macintosh/therm_pm72.c =================================================================== --- linux-work.orig/drivers/macintosh/therm_pm72.c 2005-05-02 10:48:11.000000000 +1000 +++ linux-work/drivers/macintosh/therm_pm72.c 2005-05-05 16:16:44.000000000 +1000 @@ -2052,7 +2052,7 @@ return -ENODEV; } } - of_dev = of_platform_device_create(np, "temperature"); + of_dev = of_platform_device_create(np, "temperature", NULL); if (of_dev == NULL) { printk(KERN_ERR "Can't register FCU platform device !\n"); return -ENODEV; Index: linux-work/arch/ppc64/kernel/of_device.c =================================================================== --- linux-work.orig/arch/ppc64/kernel/of_device.c 2005-05-02 10:48:08.000000000 +1000 +++ linux-work/arch/ppc64/kernel/of_device.c 2005-05-05 16:21:46.000000000 +1000 @@ -232,7 +232,9 @@ device_unregister(&ofdev->dev); } -struct of_device* of_platform_device_create(struct device_node *np, const char *bus_id) +struct of_device* of_platform_device_create(struct device_node *np, + const char *bus_id, + struct device *parent) { struct of_device *dev; u32 *reg; @@ -245,7 +247,7 @@ dev->node = np; dev->dma_mask = 0xffffffffUL; dev->dev.dma_mask = &dev->dma_mask; - dev->dev.parent = NULL; + dev->dev.parent = parent; dev->dev.bus = &of_platform_bus_type; dev->dev.release = of_release_dev; @@ -260,6 +262,7 @@ return dev; } + EXPORT_SYMBOL(of_match_device); EXPORT_SYMBOL(of_platform_bus_type); EXPORT_SYMBOL(of_register_driver); Index: linux-work/arch/ppc/platforms/pmac_setup.c 
=================================================================== --- linux-work.orig/arch/ppc/platforms/pmac_setup.c 2005-05-02 10:48:08.000000000 +1000 +++ linux-work/arch/ppc/platforms/pmac_setup.c 2005-05-05 16:04:32.000000000 +1000 @@ -719,7 +719,8 @@ if (np) { for (np = np->child; np != NULL; np = np->sibling) if (strncmp(np->name, "i2c", 3) == 0) { - of_platform_device_create(np, "uni-n-i2c"); + of_platform_device_create(np, "uni-n-i2c", + NULL); break; } } @@ -727,17 +728,18 @@ if (np) { for (np = np->child; np != NULL; np = np->sibling) if (strncmp(np->name, "i2c", 3) == 0) { - of_platform_device_create(np, "u3-i2c"); + of_platform_device_create(np, "u3-i2c", + NULL); break; } } np = find_devices("valkyrie"); if (np) - of_platform_device_create(np, "valkyrie"); + of_platform_device_create(np, "valkyrie", NULL); np = find_devices("platinum"); if (np) - of_platform_device_create(np, "platinum"); + of_platform_device_create(np, "platinum", NULL); return 0; } Index: linux-work/include/asm-ppc/of_device.h =================================================================== --- linux-work.orig/include/asm-ppc/of_device.h 2005-05-02 10:49:57.000000000 +1000 +++ linux-work/include/asm-ppc/of_device.h 2005-05-05 16:03:54.000000000 +1000 @@ -67,7 +67,9 @@ extern void of_unregister_driver(struct of_platform_driver *drv); extern int of_device_register(struct of_device *ofdev); extern void of_device_unregister(struct of_device *ofdev); -extern struct of_device *of_platform_device_create(struct device_node *np, const char *bus_id); +extern struct of_device *of_platform_device_create(struct device_node *np, + const char *bus_id, + struct device *parent); extern void of_release_dev(struct device *dev); #endif /* __OF_DEVICE_H__ */ Index: linux-work/arch/ppc/syslib/of_device.c =================================================================== --- linux-work.orig/arch/ppc/syslib/of_device.c 2005-05-02 10:48:08.000000000 +1000 +++ linux-work/arch/ppc/syslib/of_device.c 
2005-05-05 17:14:01.000000000 +1000 @@ -233,7 +233,9 @@ device_unregister(&ofdev->dev); } -struct of_device* of_platform_device_create(struct device_node *np, const char *bus_id) +struct of_device* of_platform_device_create(struct device_node *np, + const char *bus_id, + struct device *parent) { struct of_device *dev; u32 *reg; @@ -246,7 +248,7 @@ dev->node = of_node_get(np); dev->dma_mask = 0xffffffffUL; dev->dev.dma_mask = &dev->dma_mask; - dev->dev.parent = NULL; + dev->dev.parent = parent; dev->dev.bus = &of_platform_bus_type; dev->dev.release = of_release_dev; From apw at shadowen.org Thu May 5 22:04:10 2005 From: apw at shadowen.org (Andy Whitcroft) Date: Thu, 05 May 2005 13:04:10 +0100 Subject: [2/3] add memory present for ppc64 In-Reply-To: <20050505023119.GA20283@austin.ibm.com> References: <20050505023119.GA20283@austin.ibm.com> Message-ID: <427A0BBA.1080803@shadowen.org> Olof Johansson wrote: > On Wed, May 04, 2005 at 09:29:57PM +0100, Andy Whitcroft wrote: > > >>diff -X /home/apw/brief/lib/vdiff.excl -rupN reference/arch/ppc64/Kconfig current/arch/ppc64/Kconfig >>--- reference/arch/ppc64/Kconfig 2005-05-04 20:54:50.000000000 +0100 >>+++ current/arch/ppc64/Kconfig 2005-05-04 20:54:50.000000000 +0100 >>@@ -212,8 +212,8 @@ config ARCH_FLATMEM_ENABLE >> source "mm/Kconfig" >> >> config HAVE_ARCH_EARLY_PFN_TO_NID >>- bool >>- default y >>+ def_bool y >>+ depends on NEED_MULTIPLE_NODES > > > Ok, time to show my lack of undestanding here, but when can we ever be > CONFIG_NUMA and NOT need multiple nodes? > > >>@@ -481,6 +483,7 @@ static void __init setup_nonnuma(void) >> >> for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT) >> numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0; >>+ memory_present(0, 0, init_node_data[0].node_end_pfn); > > > Isn't the memory_present stuff and numa_memory_lookup_table two > implementations doing the same thing (mapping memory to nodes)? > Can we kill numa_memory_lookup_table with this? 
This table basically is part of the DISCONTIGMEM implementation and used lightly by SPARSEMEM. In the i386 port we have already pushed that out into a discontigmem implementation of memory_present. That is a logical next step in this port and I've got some of it already done. That should sit nicely on this lot. I'll work on this one. -apw From jschopp at austin.ibm.com Fri May 6 01:44:42 2005 From: jschopp at austin.ibm.com (Joel Schopp) Date: Thu, 05 May 2005 10:44:42 -0500 Subject: [1/3] add early_pfn_to_nid for ppc64 In-Reply-To: References: Message-ID: <427A3F6A.6060405@austin.ibm.com> > +#ifdef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID > +#define early_pfn_to_nid(pfn) pa_to_nid(((unsigned long)pfn) << PAGE_SHIFT) > +#endif Is there a reason we didn't just use pfn_to_nid() directly here instead of pa_to_nid()? I'm just thinking of having DISCONTIG/NUMA off and pfn_to_nid() being #defined to zero for those cases. From apw at shadowen.org Fri May 6 02:19:19 2005 From: apw at shadowen.org (Andy Whitcroft) Date: Thu, 05 May 2005 17:19:19 +0100 Subject: [1/3] add early_pfn_to_nid for ppc64 In-Reply-To: <427A3F6A.6060405@austin.ibm.com> References: <427A3F6A.6060405@austin.ibm.com> Message-ID: <427A4787.4030802@shadowen.org> Joel Schopp wrote: >> +#ifdef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID >> +#define early_pfn_to_nid(pfn) pa_to_nid(((unsigned long)pfn) << >> PAGE_SHIFT) >> +#endif > > > Is there a reason we didn't just use pfn_to_nid() directly here instead > of pa_to_nid()? I'm just thinking of having DISCONTIG/NUMA off and > pfn_to_nid() being #defined to zero for those cases. The problem is that pfn_to_nid is defined by the memory model. In the SPARSEMEM case it isn't always usable until after we have initialised and allocated the sparse mem_maps. It is allocations during this phase that need this early_pfn_to_nid() form, to guide its allocations of the mem_map to obtain locality with the physical memory blocks.
This is clearer in the i386 port where the early_pfn_to_nid() implementation uses a low-level table to determine the location. As has been mentioned in another thread, we are using what is effectively a DISCONTIGMEM data structure here. I have some work in progress to split that last part and move to a true early implementation on ppc64 too. -apw From johnrose at austin.ibm.com Fri May 6 02:21:30 2005 From: johnrose at austin.ibm.com (John Rose) Date: Thu, 05 May 2005 11:21:30 -0500 Subject: Patch to kill ioremap_mm In-Reply-To: <20050505014256.GE18270@localhost.localdomain> References: <20050505014256.GE18270@localhost.localdomain> Message-ID: <1115310090.6011.21.camel@sinatra.austin.ibm.com> Hi David- Given that we use a separate allocation scheme for imalloc mappings, does it make sense to lump these into the vmalloc mm_struct, and to share the vmalloc address space? This saves lines of code, but is it as clear as the existing (separate) layout? Thanks- John On Wed, 2005-05-04 at 20:42, David Gibson wrote: > Can anyone see any problems with this patch? If not, I'll send it on > to akpm. > > Currently ppc64 has two mm_structs for the kernel, init_mm and also > ioremap_mm. The latter really isn't necessary: this patch abolishes > it, instead restricting vmallocs to the lower 1TB of the init_mm's > range and placing io mappings in the upper 1TB. This simplifies the > code in a number of places, and gets rid of an unnecessary set of > pagetables. > > Index: working-2.6/include/asm-ppc64/pgtable.h > =================================================================== > --- working-2.6.orig/include/asm-ppc64/pgtable.h 2005-05-05 10:58:04.000000000 +1000 > +++ working-2.6/include/asm-ppc64/pgtable.h 2005-05-05 11:12:59.000000000 +1000 > @@ -53,7 +53,8 @@ > * Define the address range of the vmalloc VM area.
> */ > #define VMALLOC_START (0xD000000000000000ul) > -#define VMALLOC_END (VMALLOC_START + EADDR_MASK) > +#define VMALLOC_SIZE (0x10000000000UL) > +#define VMALLOC_END (VMALLOC_START + VMALLOC_SIZE) > > /* > * Bits in a linux-style PTE. These match the bits in the > @@ -239,9 +240,6 @@ > /* This now only contains the vmalloc pages */ > #define pgd_offset_k(address) pgd_offset(&init_mm, address) > > -/* to find an entry in the ioremap page-table-directory */ > -#define pgd_offset_i(address) (ioremap_pgd + pgd_index(address)) > - > /* > * The following only work if pte_present() is true. > * Undefined behaviour if not.. > @@ -459,15 +457,12 @@ > #define __HAVE_ARCH_PTE_SAME > #define pte_same(A,B) (((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0) > > -extern unsigned long ioremap_bot, ioremap_base; > - > #define pmd_ERROR(e) \ > printk("%s:%d: bad pmd %08x.\n", __FILE__, __LINE__, pmd_val(e)) > #define pgd_ERROR(e) \ > printk("%s:%d: bad pgd %08x.\n", __FILE__, __LINE__, pgd_val(e)) > > extern pgd_t swapper_pg_dir[]; > -extern pgd_t ioremap_dir[]; > > extern void paging_init(void); > > Index: working-2.6/include/asm-ppc64/imalloc.h > =================================================================== > --- working-2.6.orig/include/asm-ppc64/imalloc.h 2005-05-05 10:58:04.000000000 +1000 > +++ working-2.6/include/asm-ppc64/imalloc.h 2005-05-05 11:13:39.000000000 +1000 > @@ -4,9 +4,9 @@ > /* > * Define the address range of the imalloc VM area. 
> */ > -#define PHBS_IO_BASE IOREGIONBASE > -#define IMALLOC_BASE (IOREGIONBASE + 0x80000000ul) /* Reserve 2 gigs for PHBs */ > -#define IMALLOC_END (IOREGIONBASE + EADDR_MASK) > +#define PHBS_IO_BASE VMALLOC_END > +#define IMALLOC_BASE (PHBS_IO_BASE + 0x80000000ul) /* Reserve 2 gigs for PHBs */ > +#define IMALLOC_END (VMALLOC_START + EADDR_MASK) > > > /* imalloc region types */ > @@ -21,4 +21,6 @@ > int region_type); > unsigned long im_free(void *addr); > > +extern unsigned long ioremap_bot; > + > #endif /* _PPC64_IMALLOC_H */ > Index: working-2.6/include/asm-ppc64/page.h > =================================================================== > --- working-2.6.orig/include/asm-ppc64/page.h 2005-05-05 10:58:04.000000000 +1000 > +++ working-2.6/include/asm-ppc64/page.h 2005-05-05 11:14:02.000000000 +1000 > @@ -202,9 +202,7 @@ > #define PAGE_OFFSET ASM_CONST(0xC000000000000000) > #define KERNELBASE PAGE_OFFSET > #define VMALLOCBASE ASM_CONST(0xD000000000000000) > -#define IOREGIONBASE ASM_CONST(0xE000000000000000) > > -#define IO_REGION_ID (IOREGIONBASE >> REGION_SHIFT) > #define VMALLOC_REGION_ID (VMALLOCBASE >> REGION_SHIFT) > #define KERNEL_REGION_ID (KERNELBASE >> REGION_SHIFT) > #define USER_REGION_ID (0UL) > Index: working-2.6/arch/ppc64/kernel/eeh.c > =================================================================== > --- working-2.6.orig/arch/ppc64/kernel/eeh.c 2005-04-26 15:37:55.000000000 +1000 > +++ working-2.6/arch/ppc64/kernel/eeh.c 2005-05-05 11:23:40.000000000 +1000 > @@ -505,7 +505,7 @@ > pte_t *ptep; > unsigned long pa; > > - ptep = find_linux_pte(ioremap_mm.pgd, token); > + ptep = find_linux_pte(init_mm.pgd, token); > if (!ptep) > return token; > pa = pte_pfn(*ptep) << PAGE_SHIFT; > Index: working-2.6/arch/ppc64/kernel/process.c > =================================================================== > --- working-2.6.orig/arch/ppc64/kernel/process.c 2005-04-26 15:37:55.000000000 +1000 > +++ working-2.6/arch/ppc64/kernel/process.c 2005-05-05 
11:16:20.000000000 +1000 > @@ -58,14 +58,6 @@ > struct task_struct *last_task_used_altivec = NULL; > #endif > > -struct mm_struct ioremap_mm = { > - .pgd = ioremap_dir, > - .mm_users = ATOMIC_INIT(2), > - .mm_count = ATOMIC_INIT(1), > - .cpu_vm_mask = CPU_MASK_ALL, > - .page_table_lock = SPIN_LOCK_UNLOCKED, > -}; > - > /* > * Make sure the floating-point register state in the > * the thread_struct is up to date for task tsk. > Index: working-2.6/include/asm-ppc64/processor.h > =================================================================== > --- working-2.6.orig/include/asm-ppc64/processor.h 2005-04-26 15:38:02.000000000 +1000 > +++ working-2.6/include/asm-ppc64/processor.h 2005-05-05 11:24:46.000000000 +1000 > @@ -590,16 +590,6 @@ > } > > /* > - * Note: the vm_start and vm_end fields here should *not* > - * be in kernel space. (Could vm_end == vm_start perhaps?) > - */ > -#define IOREMAP_MMAP { &ioremap_mm, 0, 0x1000, NULL, \ > - PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, \ > - 1, NULL, NULL } > - > -extern struct mm_struct ioremap_mm; > - > -/* > * Return saved PC of a blocked thread. 
For now, this is the "user" PC > */ > #define thread_saved_pc(tsk) \ > Index: working-2.6/arch/ppc64/mm/hash_utils.c > =================================================================== > --- working-2.6.orig/arch/ppc64/mm/hash_utils.c 2005-05-05 10:58:04.000000000 +1000 > +++ working-2.6/arch/ppc64/mm/hash_utils.c 2005-05-05 11:17:03.000000000 +1000 > @@ -310,10 +310,6 @@ > > vsid = get_vsid(mm->context.id, ea); > break; > - case IO_REGION_ID: > - mm = &ioremap_mm; > - vsid = get_kernel_vsid(ea); > - break; > case VMALLOC_REGION_ID: > mm = &init_mm; > vsid = get_kernel_vsid(ea); > Index: working-2.6/arch/ppc64/mm/init.c > =================================================================== > --- working-2.6.orig/arch/ppc64/mm/init.c 2005-05-05 10:58:04.000000000 +1000 > +++ working-2.6/arch/ppc64/mm/init.c 2005-05-05 11:22:54.000000000 +1000 > @@ -144,7 +144,7 @@ > > pte = pte_offset_kernel(pmd, addr); > do { > - pte_t ptent = ptep_get_and_clear(&ioremap_mm, addr, pte); > + pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); > WARN_ON(!pte_none(ptent) && !pte_present(ptent)); > } while (pte++, addr += PAGE_SIZE, addr != end); > } > @@ -181,13 +181,13 @@ > > static void unmap_im_area(unsigned long addr, unsigned long end) > { > - struct mm_struct *mm = &ioremap_mm; > + struct mm_struct *mm = &init_mm; > unsigned long next; > pgd_t *pgd; > > spin_lock(&mm->page_table_lock); > > - pgd = pgd_offset_i(addr); > + pgd = pgd_offset_k(addr); > flush_cache_vunmap(addr, end); > do { > next = pgd_addr_end(addr, end); > @@ -214,21 +214,21 @@ > unsigned long vsid; > > if (mem_init_done) { > - spin_lock(&ioremap_mm.page_table_lock); > - pgdp = pgd_offset_i(ea); > - pudp = pud_alloc(&ioremap_mm, pgdp, ea); > + spin_lock(&init_mm.page_table_lock); > + pgdp = pgd_offset_k(ea); > + pudp = pud_alloc(&init_mm, pgdp, ea); > if (!pudp) > return -ENOMEM; > - pmdp = pmd_alloc(&ioremap_mm, pudp, ea); > + pmdp = pmd_alloc(&init_mm, pudp, ea); > if (!pmdp) > return -ENOMEM; > - ptep = 
pte_alloc_kernel(&ioremap_mm, pmdp, ea); > + ptep = pte_alloc_kernel(&init_mm, pmdp, ea); > if (!ptep) > return -ENOMEM; > pa = abs_to_phys(pa); > - set_pte_at(&ioremap_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, > + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, > __pgprot(flags))); > - spin_unlock(&ioremap_mm.page_table_lock); > + spin_unlock(&init_mm.page_table_lock); > } else { > unsigned long va, vpn, hash, hpteg; > > > From apw at shadowen.org Fri May 6 03:37:00 2005 From: apw at shadowen.org (Andy Whitcroft) Date: Thu, 05 May 2005 18:37:00 +0100 Subject: [3/3] sparsemem memory model for ppc64 In-Reply-To: <20050505023132.GB20283@austin.ibm.com> References: <20050505023132.GB20283@austin.ibm.com> Message-ID: <427A59BC.1020208@shadowen.org> Olof Johansson wrote: > Hi, > > Just two formatting nitpicks below. Thanks, this would be better served by rewriting the first comment and removing the second altogether. /* Add all physical memory to the bootmem map, mark each area * present. The first block has already been marked present above. */ I note that the diff in question has sneaked into the wrong patch, that segment represents memory_present. So I'll rediff them with it there. No overall change to the code. -apw From kravetz at us.ibm.com Fri May 6 03:53:20 2005 From: kravetz at us.ibm.com (mike kravetz) Date: Thu, 5 May 2005 10:53:20 -0700 Subject: [3/3] sparsemem memory model for ppc64 In-Reply-To: References: Message-ID: <20050505175320.GC3930@w-mikek2.ibm.com> On Wed, May 04, 2005 at 09:30:57PM +0100, Andy Whitcroft wrote: > + /* > + * Note presence of first (logical/coalasced) LMB which will > + * contain RMO region > + */ > + start_pfn = lmb.memory.region[0].physbase >> PAGE_SHIFT; > + end_pfn = start_pfn + (lmb.memory.region[0].size >> PAGE_SHIFT); > + memory_present(0, start_pfn, end_pfn); I need to take a close look at this again, but I think this special handling for the RMO region is unnecessary.
I added it in the 'early days of SPARSE' when there were some 'bootstrap' issues and we needed to initialize some memory before setting up the bootmem bitmap. I'm pretty sure all those issues have gone away. -- Mike From ntl at pobox.com Fri May 6 04:32:02 2005 From: ntl at pobox.com (Nathan Lynch) Date: Thu, 5 May 2005 13:32:02 -0500 Subject: [PATCH] ppc64: don't create spurious symlinks under node0 sysdev Message-ID: <20050505183202.GA3614@otto> On partitioned systems we can wind up creating spurious symlinks in /sys/devices/system/node/node0 to non-present cpus. The symlinks are not broken; the problem is that we're potentially misinforming userspace that there is a relationship between node0 and cpus which are to be added later. There's no guarantee at all that a cpu which is added later will belong to node 0. sysfs.c | 7 ++++++- 1 files changed, 6 insertions(+), 1 deletion(-) Signed-off-by: Nathan Lynch Index: linux-2.6.12-rc3-mm3/arch/ppc64/kernel/sysfs.c =================================================================== --- linux-2.6.12-rc3-mm3.orig/arch/ppc64/kernel/sysfs.c +++ linux-2.6.12-rc3-mm3/arch/ppc64/kernel/sysfs.c @@ -404,7 +404,12 @@ static int __init topology_init(void) struct cpu *c = &per_cpu(cpu_devices, cpu); #ifdef CONFIG_NUMA - parent = &node_devices[cpu_to_node(cpu)]; + /* The node to which a cpu belongs can't be known + * until the cpu is made present. 
+ */ + parent = NULL; + if (cpu_present(cpu)) + parent = &node_devices[cpu_to_node(cpu)]; #endif /* * For now, we just see if the system supports making From linas at austin.ibm.com Fri May 6 07:57:25 2005 From: linas at austin.ibm.com (Linas Vepstas) Date: Thu, 5 May 2005 16:57:25 -0500 Subject: PATCH [PPC64]: dead processes never reaped In-Reply-To: <1113975850.5515.377.camel@gaston> References: <20050418193833.GW15596@austin.ibm.com> <1113975850.5515.377.camel@gaston> Message-ID: <20050505215725.GJ11745@austin.ibm.com> On Wed, Apr 20, 2005 at 03:44:10PM +1000, Benjamin Herrenschmidt was heard to remark: > On Mon, 2005-04-18 at 14:38 -0500, Linas Vepstas wrote: > > > > The patch below appears to fix a problem where a number of dead processes > > linger on the system. On a highly loaded system, dozens of processes > > were found stuck in do_exit(), calling their very last schedule(), and > > then being lost forever. And this problem seems to be unreproducible. Dang, it was one of the more interesting ones I've seen. --linas From ntl at pobox.com Fri May 6 08:15:20 2005 From: ntl at pobox.com (Nathan Lynch) Date: Thu, 5 May 2005 17:15:20 -0500 Subject: [PATCH 1/2] logical numbering for numa nodes (2nd try) Message-ID: <20050505221520.GB3614@otto> (version 2) This patch fixes the ppc64 numa code to be more consistent with the conversion from numnodes to node_online_mask etc. and removes the dependence on the platform numa numbering by setting up a mapping between the platform ids found in the ibm,associativity properties and logical node numbers. The main reason I want to make this change is that the numbering scheme of the platform ids is unspecified and we really can't rely on the values being below MAX_NUMNODES. I know you weren't really keen on having this mapping but I think in the long term this is what we'll wind up having to do anyway. I've also ripped out DEBUG_NUMA -- the effect is that it's now as if DEBUG_NUMA is always on.
This means that resources have to be explicitly associated with their nodes. As Dave Hansen suggested in response to the original version of the patch, I've made it so that establishing a mapping between the domain and logical node has to be done explicitly instead of implicitly on the first lookup. This patch exposes some latent issues in the interaction of cpu hotplug, numa, and sched domains which are addressed in the next patch. arch/ppc64/mm/numa.c | 208 ++++++++++++++++++++++++++----------------- include/asm-ppc64/mmzone.h | 17 --- include/asm-ppc64/topology.h | 10 -- 3 files changed, 130 insertions(+), 105 deletions(-) Signed-off-by: Nathan Lynch Index: linux-2.6.12-rc3-mm3/arch/ppc64/mm/numa.c =================================================================== --- linux-2.6.12-rc3-mm3.orig/arch/ppc64/mm/numa.c +++ linux-2.6.12-rc3-mm3/arch/ppc64/mm/numa.c @@ -26,11 +26,7 @@ static int numa_enabled = 1; static int numa_debug; #define dbg(args...) if (numa_debug) { printk(KERN_INFO args); } -#ifdef DEBUG_NUMA #define ARRAY_INITIALISER -1 -#else -#define ARRAY_INITIALISER 0 -#endif int numa_cpu_lookup_table[NR_CPUS] = { [ 0 ... (NR_CPUS - 1)] = ARRAY_INITIALISER}; @@ -58,6 +54,64 @@ EXPORT_SYMBOL(numa_memory_lookup_table); EXPORT_SYMBOL(numa_cpumask_lookup_table); EXPORT_SYMBOL(nr_cpus_in_node); +#define INVALID_DOMAIN (-1) +static int nid_to_domain_tbl[MAX_NUMNODES] = { [0 ... MAX_NUMNODES - 1] = INVALID_DOMAIN }; + +static int nid_to_domain(int nid) +{ + BUG_ON(nid >= MAX_NUMNODES); + BUG_ON(nid < 0); + + return nid_to_domain_tbl[nid]; +} + +/* Returns -1 if domain not mapped */ +static int domain_to_nid(int domain) +{ + int nid; + + WARN_ON(domain == INVALID_DOMAIN); + + for (nid = 0; nid < MAX_NUMNODES; nid++) { + int tmp = nid_to_domain(nid); + if (tmp == domain) + return nid; + } + + return -1; +} + +/* Map the given domain to the next available node id if it is not + * already mapped. If this is a new mapping, set the node online. 
+ */ +static int __init establish_domain_mapping(int domain) +{ + int nid; + + WARN_ON(domain == INVALID_DOMAIN); + + for (nid = 0; nid < MAX_NUMNODES; nid++) { + if (nid_to_domain_tbl[nid] == domain) { + WARN_ON(!node_online(nid)); + return nid; + } + else if (nid_to_domain_tbl[nid] != INVALID_DOMAIN) + continue; + + printk(KERN_INFO + "Mapping platform domain %i to logical node %i\n", + domain, nid); + + nid_to_domain_tbl[nid] = domain; + node_set_online(nid); + return nid; + } + printk(KERN_WARNING "nid_to_domain_tbl full; time to increase" + " NODES_SHIFT?\n"); + + return -1; +} + static inline void map_cpu_to_node(int cpu, int node) { numa_cpu_lookup_table[cpu] = node; @@ -126,16 +180,23 @@ static int of_node_numa_domain(struct de unsigned int *tmp; if (min_common_depth == -1) - return 0; + return INVALID_DOMAIN; tmp = of_get_associativity(device); if (tmp && (tmp[0] >= min_common_depth)) { numa_domain = tmp[min_common_depth]; } else { - dbg("WARNING: no NUMA information for %s\n", + dbg("no NUMA information for %s\n", device->full_name); - numa_domain = 0; + numa_domain = INVALID_DOMAIN; } + + /* POWER4 LPAR uses 0xffff for invalid domain; + * fix that up here so callers don't have to worry about it. + */ + if (numa_domain == 0xffff) + numa_domain = INVALID_DOMAIN; + return numa_domain; } @@ -223,12 +284,12 @@ static unsigned long read_n_cells(int n, } /* - * Figure out to which domain a cpu belongs and stick it there. - * Return the id of the domain used. + * Figure out to which node a cpu belongs and stick it there. + * Return the id of the node used. */ static int numa_setup_cpu(unsigned long lcpu) { - int numa_domain = 0; + int nid = 0, numa_domain = INVALID_DOMAIN; struct device_node *cpu = find_cpu_node(lcpu); if (!cpu) { @@ -238,25 +299,17 @@ static int numa_setup_cpu(unsigned long numa_domain = of_node_numa_domain(cpu); - if (numa_domain >= num_online_nodes()) { - /* - * POWER4 LPAR uses 0xffff as invalid node, - * dont warn in this case. 
- */ - if (numa_domain != 0xffff) - printk(KERN_ERR "WARNING: cpu %ld " - "maps to invalid NUMA node %d\n", - lcpu, numa_domain); - numa_domain = 0; - } -out: - node_set_online(numa_domain); + if (numa_domain != INVALID_DOMAIN) + nid = domain_to_nid(numa_domain); - map_cpu_to_node(lcpu, numa_domain); + if (nid < 0) + nid = 0; +out: + map_cpu_to_node(lcpu, nid); of_node_put(cpu); - return numa_domain; + return nid; } static int cpu_numa_callback(struct notifier_block *nfb, @@ -278,8 +331,8 @@ static int cpu_numa_callback(struct noti case CPU_DEAD: case CPU_UP_CANCELED: unmap_cpu_from_node(lcpu); - break; ret = NOTIFY_OK; + break; #endif } return ret; @@ -319,7 +372,6 @@ static int __init parse_numa_properties( struct device_node *cpu = NULL; struct device_node *memory = NULL; int addr_cells, size_cells; - int max_domain = 0; long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT; unsigned long i; @@ -341,37 +393,13 @@ static int __init parse_numa_properties( if (min_common_depth < 0) return min_common_depth; - max_domain = numa_setup_cpu(boot_cpuid); - - /* - * Even though we connect cpus to numa domains later in SMP init, - * we need to know the maximum node id now. This is because each - * node id must have NODE_DATA etc backing it. - * As a result of hotplug we could still have cpus appear later on - * with larger node ids. In that case we force the cpu into node 0. 
- */ - for_each_cpu(i) { - int numa_domain; - - cpu = find_cpu_node(i); - - if (cpu) { - numa_domain = of_node_numa_domain(cpu); - of_node_put(cpu); - - if (numa_domain < MAX_NUMNODES && - max_domain < numa_domain) - max_domain = numa_domain; - } - } - addr_cells = get_mem_addr_cells(); size_cells = get_mem_size_cells(); memory = NULL; while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { unsigned long start; unsigned long size; - int numa_domain; + int numa_domain, nid; int ranges; unsigned int *memcell_buf; unsigned int len; @@ -391,17 +419,16 @@ new_range: numa_domain = of_node_numa_domain(memory); - if (numa_domain >= MAX_NUMNODES) { - if (numa_domain != 0xffff) - printk(KERN_ERR "WARNING: memory at %lx maps " - "to invalid NUMA node %d\n", start, - numa_domain); - numa_domain = 0; + if (numa_domain < 0) + nid = 0; + else { + nid = domain_to_nid(numa_domain); + if (nid < 0) + nid = establish_domain_mapping(numa_domain); + if (nid < 0) + nid = 0; } - if (max_domain < numa_domain) - max_domain = numa_domain; - if (! (size = numa_enforce_memory_limit(start, size))) { if (--ranges) goto new_range; @@ -412,41 +439,53 @@ new_range: /* * Initialize new node struct, or add to an existing one. 
*/ - if (init_node_data[numa_domain].node_end_pfn) { + if (init_node_data[nid].node_end_pfn) { if ((start / PAGE_SIZE) < - init_node_data[numa_domain].node_start_pfn) - init_node_data[numa_domain].node_start_pfn = + init_node_data[nid].node_start_pfn) + init_node_data[nid].node_start_pfn = start / PAGE_SIZE; if (((start / PAGE_SIZE) + (size / PAGE_SIZE)) > - init_node_data[numa_domain].node_end_pfn) - init_node_data[numa_domain].node_end_pfn = + init_node_data[nid].node_end_pfn) + init_node_data[nid].node_end_pfn = (start / PAGE_SIZE) + (size / PAGE_SIZE); - init_node_data[numa_domain].node_present_pages += + init_node_data[nid].node_present_pages += size / PAGE_SIZE; } else { - node_set_online(numa_domain); - - init_node_data[numa_domain].node_start_pfn = + init_node_data[nid].node_start_pfn = start / PAGE_SIZE; - init_node_data[numa_domain].node_end_pfn = - init_node_data[numa_domain].node_start_pfn + + init_node_data[nid].node_end_pfn = + init_node_data[nid].node_start_pfn + size / PAGE_SIZE; - init_node_data[numa_domain].node_present_pages = + init_node_data[nid].node_present_pages = size / PAGE_SIZE; } for (i = start ; i < (start+size); i += MEMORY_INCREMENT) numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = - numa_domain; + nid; if (--ranges) goto new_range; } - for (i = 0; i <= max_domain; i++) - node_set_online(i); + /* We need to establish domain<->nid mappings for any + * cpu nodes in the device tree with domains which were not + * encountered in the memory loop above. + */ + while ((cpu = of_find_node_by_type(cpu, "cpu"))) { + int domain = of_node_numa_domain(cpu); + if (domain < 0) + continue; + if (domain_to_nid(domain) < 0) + establish_domain_mapping(domain); + } + + /* Secondary logical cpus are associated with nids later in + * boot, but we need to explicitly set up the boot cpu. 
+ */ + numa_setup_cpu(boot_cpuid); return 0; } @@ -541,7 +580,7 @@ static unsigned long careful_allocation( * If the memory came from a previously allocated node, we must * retry with the bootmem allocator. */ - if (pa_to_nid(ret) < nid) { + if (pa_to_nid(ret) != nid) { nid = pa_to_nid(ret); ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(nid), size, align, 0); @@ -632,7 +671,7 @@ void __init do_init_bootmem(void) memory = NULL; while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { unsigned long mem_start, mem_size; - int numa_domain, ranges; + int numa_domain, ranges, thisnid; unsigned int *memcell_buf; unsigned int len; @@ -644,9 +683,18 @@ void __init do_init_bootmem(void) new_range: mem_start = read_n_cells(addr_cells, &memcell_buf); mem_size = read_n_cells(size_cells, &memcell_buf); - numa_domain = numa_enabled ? of_node_numa_domain(memory) : 0; - if (numa_domain != nid) + if (numa_enabled) + numa_domain = of_node_numa_domain(memory); + else + numa_domain = -1; + + if (numa_domain < 0) + thisnid = 0; + else + thisnid = domain_to_nid(numa_domain); + + if (thisnid != nid) continue; mem_size = numa_enforce_memory_limit(mem_start, mem_size); Index: linux-2.6.12-rc3-mm3/include/asm-ppc64/mmzone.h =================================================================== --- linux-2.6.12-rc3-mm3.orig/include/asm-ppc64/mmzone.h +++ linux-2.6.12-rc3-mm3/include/asm-ppc64/mmzone.h @@ -27,24 +27,9 @@ extern int nr_cpus_in_node[]; #define MEMORY_INCREMENT_SHIFT 24 #define MEMORY_INCREMENT (1UL << MEMORY_INCREMENT_SHIFT) -/* NUMA debugging, will not work on a DLPAR machine */ -#undef DEBUG_NUMA - static inline int pa_to_nid(unsigned long pa) { - int nid; - - nid = numa_memory_lookup_table[pa >> MEMORY_INCREMENT_SHIFT]; - -#ifdef DEBUG_NUMA - /* the physical address passed in is not in the map for the system */ - if (nid == -1) { - printk("bad address: %lx\n", pa); - BUG(); - } -#endif - - return nid; + return numa_memory_lookup_table[pa >> 
MEMORY_INCREMENT_SHIFT]; } #define pfn_to_nid(pfn) pa_to_nid((pfn) << PAGE_SHIFT) Index: linux-2.6.12-rc3-mm3/include/asm-ppc64/topology.h =================================================================== --- linux-2.6.12-rc3-mm3.orig/include/asm-ppc64/topology.h +++ linux-2.6.12-rc3-mm3/include/asm-ppc64/topology.h @@ -8,15 +8,7 @@ static inline int cpu_to_node(int cpu) { - int node; - - node = numa_cpu_lookup_table[cpu]; - -#ifdef DEBUG_NUMA - BUG_ON(node == -1); -#endif - - return node; + return numa_cpu_lookup_table[cpu]; } #define parent_node(node) (node) From ntl at pobox.com Fri May 6 08:15:39 2005 From: ntl at pobox.com (Nathan Lynch) Date: Thu, 5 May 2005 17:15:39 -0500 Subject: [PATCH 2/2] update cpu-to-node mappings during dlpar Message-ID: <20050505221539.GC3614@otto> This fixes up some fallout from the preceding patch. The sched domains reinitialization code which runs at cpu hotplug time expects the cpu-to-node mappings to have been set up earlier than we were doing. Seems that things such as cpu_to_node() are expected to return sane values regardless of whether the cpu is online. It makes sense, I suppose: we should be updating the cpu<->node mappings when the topology changes, instead of keying on the state of cpus. Map logical cpus to nodes when a processor is added to the system; tear down the mapping(s) when a processor is going away. Get rid of the numa cpu hotplug notifier stuff. 
arch/ppc64/kernel/pSeries_smp.c | 3 + arch/ppc64/mm/numa.c | 61 +++++++++++++--------------------------- include/asm-ppc64/topology.h | 13 ++++++++ 3 files changed, 37 insertions(+), 40 deletions(-) Signed-off-by: Nathan Lynch Index: linux-2.6.12-rc3-mm3/arch/ppc64/kernel/pSeries_smp.c =================================================================== --- linux-2.6.12-rc3-mm3.orig/arch/ppc64/kernel/pSeries_smp.c +++ linux-2.6.12-rc3-mm3/arch/ppc64/kernel/pSeries_smp.c @@ -45,6 +45,7 @@ #include #include #include +#include #include "mpic.h" @@ -187,6 +188,7 @@ static int pSeries_add_processor(struct BUG_ON(cpu_isset(cpu, cpu_present_map)); cpu_set(cpu, cpu_present_map); set_hard_smp_processor_id(cpu, *intserv++); + numa_setup_cpu(cpu, np); } err = 0; out_unlock: @@ -218,6 +220,7 @@ static void pSeries_remove_processor(str continue; BUG_ON(cpu_online(cpu)); cpu_clear(cpu, cpu_present_map); + numa_teardown_cpu(cpu); set_hard_smp_processor_id(cpu, -1); break; } Index: linux-2.6.12-rc3-mm3/arch/ppc64/mm/numa.c =================================================================== --- linux-2.6.12-rc3-mm3.orig/arch/ppc64/mm/numa.c +++ linux-2.6.12-rc3-mm3/arch/ppc64/mm/numa.c @@ -136,6 +136,11 @@ static void unmap_cpu_from_node(unsigned cpu, node); } } +#else +static void unmap_cpu_from_node(unsigned long cpu) +{ + return; +} #endif /* CONFIG_HOTPLUG_CPU */ static struct device_node * __devinit find_cpu_node(unsigned int cpu) @@ -284,19 +289,25 @@ static unsigned long read_n_cells(int n, } /* - * Figure out to which node a cpu belongs and stick it there. - * Return the id of the node used. + * Figure out to which node a cpu belongs and stick it there. Return + * the id of the node used. We allow the caller to optionally pass + * the device_node which corresponds to the logical cpu, since at + * DLPAR time the new node may not have been added to the device tree + * yet. 
*/ -static int numa_setup_cpu(unsigned long lcpu) +int numa_setup_cpu(unsigned long lcpu, struct device_node *np) { int nid = 0, numa_domain = INVALID_DOMAIN; - struct device_node *cpu = find_cpu_node(lcpu); + struct device_node *cpu = np ? of_node_get(np) : find_cpu_node(lcpu); if (!cpu) { WARN_ON(1); goto out; } + if (!numa_enabled) + goto out; + numa_domain = of_node_numa_domain(cpu); if (numa_domain != INVALID_DOMAIN) @@ -312,32 +323,10 @@ out: return nid; } -static int cpu_numa_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - unsigned long lcpu = (unsigned long)hcpu; - int ret = NOTIFY_DONE; - - switch (action) { - case CPU_UP_PREPARE: - if (min_common_depth == -1 || !numa_enabled) - map_cpu_to_node(lcpu, 0); - else - numa_setup_cpu(lcpu); - ret = NOTIFY_OK; - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DEAD: - case CPU_UP_CANCELED: - unmap_cpu_from_node(lcpu); - ret = NOTIFY_OK; - break; -#endif - } - return ret; +void numa_teardown_cpu(unsigned long lcpu) +{ + unmap_cpu_from_node(lcpu); } - /* * Check and possibly modify a memory region to enforce the memory limit. * @@ -373,7 +362,7 @@ static int __init parse_numa_properties( struct device_node *memory = NULL; int addr_cells, size_cells; long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT; - unsigned long i; + unsigned long i, lcpu; if (numa_enabled == 0) { printk(KERN_WARNING "NUMA disabled by user\n"); @@ -482,10 +471,8 @@ new_range: establish_domain_mapping(domain); } - /* Secondary logical cpus are associated with nids later in - * boot, but we need to explicitly set up the boot cpu. - */ - numa_setup_cpu(boot_cpuid); + for_each_present_cpu(lcpu) + numa_setup_cpu(lcpu, NULL); return 0; } @@ -602,10 +589,6 @@ void __init do_init_bootmem(void) int nid; int addr_cells, size_cells; struct device_node *memory = NULL; - static struct notifier_block ppc64_numa_nb = { - .notifier_call = cpu_numa_callback, - .priority = 1 /* Must run before sched domains notifier. 
*/ - }; min_low_pfn = 0; max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT; @@ -616,8 +599,6 @@ void __init do_init_bootmem(void) else dump_numa_topology(); - register_cpu_notifier(&ppc64_numa_nb); - for_each_online_node(nid) { unsigned long start_paddr, end_paddr; int i; Index: linux-2.6.12-rc3-mm3/include/asm-ppc64/topology.h =================================================================== --- linux-2.6.12-rc3-mm3.orig/include/asm-ppc64/topology.h +++ linux-2.6.12-rc3-mm3/include/asm-ppc64/topology.h @@ -4,6 +4,8 @@ #include #include +struct device_node; /* for numa_setup_cpu() */ + #ifdef CONFIG_NUMA static inline int cpu_to_node(int cpu) @@ -51,10 +53,21 @@ static inline int node_to_first_cpu(int .nr_balance_failed = 0, \ } +int numa_setup_cpu(unsigned long lcpu, struct device_node *); +void numa_teardown_cpu(unsigned long lcpu); #else /* !CONFIG_NUMA */ #include +static int inline numa_setup_cpu(unsigned long lcpu, struct device_node *np) +{ + return 0; +} + +static void inline numa_teardown_cpu(unsigned long lcpu) +{ + return; +} #endif /* CONFIG_NUMA */ #endif /* _ASM_PPC64_TOPOLOGY_H */ From paulus at samba.org Fri May 6 15:10:54 2005 From: paulus at samba.org (Paul Mackerras) Date: Fri, 6 May 2005 15:10:54 +1000 Subject: [PATCH 1/4] ppc64: rename arch/ppc64/kernel/pSeries_pci.c In-Reply-To: <200504200152.58965.arnd@arndb.de> References: <200504200149.22063.arnd@arndb.de> <200504200152.58965.arnd@arndb.de> Message-ID: <17018.64606.662481.104228@cargo.ozlabs.ibm.com> Arnd Bergmann writes: > Rename pSeries_pci.c to rtas_pci.c as a preparation to generalize it > for use by BPA. Most of the file can be used by any machine that > implements rtas. Hmmm, you rename pSeries_pci.c to rtas_pci.c and then in the next patch you recreate pSeries_pci.c and move some stuff from rtas_pci.c into it. Could we have one patch that creates rtas_pci.c and just moves stuff from pSeries_pci.c to it? Paul. 
From paulus at samba.org Fri May 6 15:33:21 2005 From: paulus at samba.org (Paul Mackerras) Date: Fri, 6 May 2005 15:33:21 +1000 Subject: [PATCH 2/3] native hash clear In-Reply-To: <20050413140850.GC5081@in.ibm.com> References: <20050413140605.GA5081@in.ibm.com> <20050413140807.GB5081@in.ibm.com> <20050413140850.GC5081@in.ibm.com> Message-ID: <17019.417.816952.686163@cargo.ozlabs.ibm.com> R Sharada writes: > Add code to clear the hash table and invalidate the tlb for native (SMP, > non-LPAR) mode. Supports 16M and 4k pages. > + /* we take the tlbie lock and hold it. Some hardware will > + * deadlock if we try to tlbie from two processors at once. > + */ > + spin_lock(&native_tlbie_lock); ... > + /* > + * we could lock the pte here, but we are the only cpu > + * running, right? and for crash dump, we probably > + * don't want to wait for a maybe bad cpu. > + */ So which is it? Are we locking things, and possibly waiting forever for a bad cpu, or are we not locking things? Or is there a reason why we lock in one case but not the other? Paul. From sharada at in.ibm.com Fri May 6 16:08:15 2005 From: sharada at in.ibm.com (R Sharada) Date: Fri, 6 May 2005 11:38:15 +0530 Subject: 2.6.12-rc3-mm3 pcibus_to_node patch breaks ppc64 CONFIG_NUMA case Message-ID: <20050506060815.GA2282@in.ibm.com> Hello, The patches in 2.6.12-rc3-mm3, that introduce pcibus_to_node, in the ide driver code, break the ppc64 CONFIG_NUMA case, as there is no definition for pcibus_to_node for the CONFIG_NUMA case in ppc64. The asm-generic/topology.h definition for pcibus_to_node gets included for the !CONFIG_NUMA case and works ok. There is a patch in mm3 for x86 that adds this definition on i386 and x86_64 platforms, but none for ppc64. 
The following are the patches that seem to cause this problem: numa-aware-block-device-control-structure-allocation.patch numa-aware-block-device-control-structure-allocation-tidy.patch - The above two patches introduce the pcibus_to_node call Note: x86-x86_64-pcibus_to_node.patch - this patches x86 code for pcibus_to_node alone Thanks and Regards, Sharada From service at paypal.com Fri May 6 18:06:13 2005 From: service at paypal.com (PayPal) Date: Fri, 6 May 2005 10:06:13 +0200 (CEST) Subject: PayPal Flagged Account Message-ID: <20050506080613.97FBA2DC869@server12.kundenserver12hsgbr.de> An HTML attachment was scrubbed... URL: http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20050506/83700ea2/attachment.htm From miltonm at bga.com Fri May 6 18:39:33 2005 From: miltonm at bga.com (Milton Miller) Date: Fri, 6 May 2005 03:39:33 -0500 Subject: [PATCH 2/3] native hash clear In-Reply-To: <17019.417.816952.686163@cargo.ozlabs.ibm.com> References: <20050413140605.GA5081@in.ibm.com> <20050413140807.GB5081@in.ibm.com> <20050413140850.GC5081@in.ibm.com> <17019.417.816952.686163@cargo.ozlabs.ibm.com> Message-ID: <530ae20d9cfe9d8ee5771f6c55964e20@bga.com> On May 6, 2005, at 12:33 AM, Paul Mackerras wrote: > R Sharada writes: > >> Add code to clear the hash table and invalidate the tlb for native >> (SMP, >> non-LPAR) mode. Supports 16M and 4k pages. > >> + /* we take the tlbie lock and hold it. Some hardware will >> + * deadlock if we try to tlbie from two processors at once. >> + */ >> + spin_lock(&native_tlbie_lock); > ... >> + /* >> + * we could lock the pte here, but we are the only cpu >> + * running, right? and for crash dump, we probably >> + * don't want to wait for a maybe bad cpu. >> + */ > > So which is it? Are we locking things, and possibly waiting forever > for a bad cpu, or are we not locking things? Or is there a reason why > we lock in one case but not the other? Right now, a bit of both. 
We don't worry about software interlocks, but we do use the lock over the smaller area that protects us from causing a hardware deadlock. So, the question is, is this the right tradeoff? milton From amavin at redhat.com Sat May 7 00:35:56 2005 From: amavin at redhat.com (Ananth N Mavinakayanahalli) Date: Fri, 06 May 2005 10:35:56 -0400 Subject: [PATCH] Kprobes: don't eat dabr/iabr exceptions Message-ID: <427B80CC.8010602@redhat.com> Hi Anton, Please find patch below that should fix the issue of kprobes eating up IABR/DABR exceptions. Please let me know if this works for you. Thanks, Ananth Signed-off-by: Ananth N Mavinakayanahalli -------------- next part -------------- An embedded and charset-unspecified text was scrubbed... Name: do-not-eat-dabr.patch Url: http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20050506/05d78b05/attachment.txt From christoph at lameter.com Sat May 7 00:11:20 2005 From: christoph at lameter.com (Christoph Lameter) Date: Fri, 6 May 2005 07:11:20 -0700 (PDT) Subject: 2.6.12-rc3-mm3 pcibus_to_node patch breaks ppc64 CONFIG_NUMA case In-Reply-To: <20050506060815.GA2282@in.ibm.com> References: <20050506060815.GA2282@in.ibm.com> Message-ID: On Fri, 6 May 2005, R Sharada wrote: > Hello, > The patches in 2.6.12-rc3-mm3, that introduce pcibus_to_node, in the > ide driver code, break the ppc64 CONFIG_NUMA case, as there is no definition > for pcibus_to_node for the CONFIG_NUMA case in ppc64. The asm-generic/topology.h > definition for pcibus_to_node gets included for the !CONFIG_NUMA case and works > ok. There is a patch in mm3 for x86 that adds this definition on i386 and > x86_64 platforms, but none for ppc64. It seems that the asm-generic/topology.h should always be included so that fallback mechanism can be defined for all platforms. Could you change that for ppc64? 
From johnrose at austin.ibm.com Sat May 7 07:49:11 2005 From: johnrose at austin.ibm.com (John Rose) Date: Fri, 06 May 2005 16:49:11 -0500 Subject: Patch to kill ioremap_mm In-Reply-To: <1115335822.7627.189.camel@gaston> References: <20050505014256.GE18270@localhost.localdomain> <1115306696.6011.6.camel@sinatra.austin.ibm.com> <1115335822.7627.189.camel@gaston> Message-ID: <1115416151.15458.15.camel@sinatra.austin.ibm.com> > We discussed that, it's sort of step #2 :) The idea I have is to fold > normal ioremap into the vmalloc space like other archs, ditch imalloc, > and for now to keep the top 1TB for the explicit mappings (PHB IO > space). David or I will do that after we are finished with other things. I would assume that we'll still have a path for creating mappings before mem_init_done? Ack, I'm already imagining the slot/phb DLPAR problems that vmalloc may not be able to handle. For example splitting existing regions, removing mappings within a given range, etc. John From linas at austin.ibm.com Sat May 7 09:05:06 2005 From: linas at austin.ibm.com (Linas Vepstas) Date: Fri, 6 May 2005 18:05:06 -0500 Subject: [PATCH] FYI/ Re: PCI Error Recovery API Proposal (updated) In-Reply-To: <1112685311.9518.35.camel@gaston> References: <20050223002409.GA10909@austin.ibm.com> <20050223174356.GH13081@kroah.com> <20050224011409.GE2088@austin.ibm.com> <421DDEF7.7080103@jp.fujitsu.com> <20050224231455.GH2088@austin.ibm.com> <421E9D16.3000606@jp.fujitsu.com> <20050312013251.GA2609@austin.ibm.com> <4235847F.3080705@jp.fujitsu.com> <20050314181420.GD498@austin.ibm.com> <1112685311.9518.35.camel@gaston> Message-ID: <20050506230506.GL11745@austin.ibm.com> Hi, This is an "FYI" patch partially implementing the PCI error recovery API previously detailed by BenH in an earlier email. Its an "FYI patch" because this patch has numerous flaws and limitations which I'm hoping to address any day now. 
I've been busy with other things, but have recently been able to carve out a chunk of time to work on this. This patch is almost identical to a previous patch I'd mailed out before, with only minor changes made to bring it into line with BenH's proposed API. Basically, I'm just dusting off the old patch, prior to making more serious changes. I hope to send a more serious patch in a few days/week. Meanwhile, criticism invited. This patch does actually recover from PCI errors on ethernet cards plugged into ppc64 hotplug slots, and from PCI errors on the IPR scsi controller. --linas -------------- next part -------------- --- include/linux/pci.h.linas-orig 2005-04-29 20:27:22.000000000 -0500 +++ include/linux/pci.h 2005-05-06 16:34:02.000000000 -0500 @@ -659,6 +659,80 @@ struct pci_dynids { unsigned int use_driver_data:1; /* pci_driver->driver_data is used */ }; +/* ---------------------------------------------------------------- */ +/** PCI error recovery state. Whenever the PCI bus state changes, + * the io_state_change() callback will be called to notify the + * device driver os state changes. + */ + +enum pci_channel_state { + pci_channel_io_normal = 0, /* I/O channel is in normal state */ + pci_channel_io_frozen = 1, /* I/O to channel is blocked */ + pci_channel_io_perm_failure, /* pci card is dead */ +}; + +enum pcierr_result { + PCIERR_RESULT_CAN_RECOVER=1, + PCIERR_RESULT_NEED_RESET, + PCIERR_RESULT_DISCONNECT, + PCIERR_RESULT_RECOVERED, +}; + +/* PCI bus error event callbacks */ +struct pci_error_handlers +{ + int (*error_detected)(struct pci_dev *dev, enum pci_channel_state error); + int (*error_recover)(struct pci_dev *dev); + int (*error_restart)(struct pci_dev *dev); + int (*link_reset)(struct pci_dev *dev); + int (*slot_reset)(struct pci_dev *dev); +}; + +/** + * PCI Error notifier event flags. + */ +#define PEH_NOTIFY_ERROR 1 + +/** PEH event -- structure holding pci controller data that describes + * a change in the isolation status of a PCI slot. 
A pointer + * to this struct is passed as the data pointer in a notify callback. + */ +struct peh_event { + struct list_head list; + struct pci_dev *dev; /* affected device */ + enum pci_channel_state state; /* PCI bus state for the affected device */ + int time_unavail; /* milliseconds until device might be available */ +}; + +/** + * peh_send_failure_event - generate a PCI error event + * @dev pci device + * + * This routine builds a PCI error event which will be delivered + * to all listeners on the peh_notifier_chain. + * + * This routine can be called within an interrupt context; + * the actual event will be delivered in a normal context + * (from a workqueue). + */ +int peh_send_failure_event (struct pci_dev *dev, + enum pci_channel_state state, + int time_unavail); + +/** + * peh_register_notifier - Register to find out about EEH events. + * @nb: notifier block to callback on events + */ +int peh_register_notifier(struct notifier_block *nb); + +/** + * peh_unregister_notifier - Unregister to an EEH event notifier. + * @nb: notifier block to callback on events + */ +int peh_unregister_notifier(struct notifier_block *nb); + +/* ---------------------------------------------------------------- */ + struct module; struct pci_driver { struct list_head node; @@ -671,6 +745,7 @@ struct pci_driver { int (*resume) (struct pci_dev *dev); /* Device woken up */ int (*enable_wake) (struct pci_dev *dev, u32 state, int enable); /* Enable wake event */ + struct pci_error_handlers err_handler; struct device_driver driver; struct pci_dynids dynids; }; --- Documentation/pci-error-recovery.txt.linas-orig 2005-05-06 17:44:41.000000000 -0500 +++ Documentation/pci-error-recovery.txt 2005-05-06 17:39:19.000000000 -0500 @@ -0,0 +1,192 @@ + + PCI Error Recovery + ------------------ + + +Preliminary sketch of API, cut n pasted from email from BenH. 
+circa 5 april 2005 + +The error recovery API support is exposed by the driver in the form of +a structure of function pointers pointed to by a new field in struct +pci_driver. The absence of this pointer in pci_driver denotes a +"non-aware" driver, behaviour on these is platform dependent. Platforms +like ppc64 can try to simulate hotplug remove/add. + +The definition of "pci_error_token" is not covered here. It is based on +Seto's work on the synchronous error detection. We still need to define +functions for extracting information out of an opaque error token. This is +separate from this API. + +This structure has the form: + +struct pci_error_handlers +{ + int (*error_detected)(struct pci_dev *dev, pci_error_token error); + int (*error_recover)(struct pci_dev *dev); + int (*error_restart)(struct pci_dev *dev); + int (*link_reset)(struct pci_dev *dev); + int (*slot_reset)(struct pci_dev *dev); +}; + +A driver doesn't have to implement all of these callbacks. The only mandatory +one is error_detected. If a callback is not implemented, the corresponding +feature is considered unsupported. For example, if error_recover and +error_restart (they really go together, see description to understand why) +aren't there, then the driver is assumed as not doing any direct recovery and +requires a reset. If link_reset is not implemented, the card is assumed as +not caring about link resets, in which case, if recover is supported, the core +can try recover (but not slot_reset unless it really did reset the slot). If slot +reset is not supported, link reset can be called instead on a slot reset. + +At first, the call will always be : + + 1) error_detected() + + Error detected. This is sent once after an error has been detected. At +this point, the device might not be accessible anymore depending on the +platform (the slot will be isolated on ppc64).
The driver may already +have "noticed" the error because of a failing IO, but this is the proper +"synchronisation point", that is, it gives a chance to the driver to +cleanup, waiting for pending stuffs (timers, whatever, etc...) to +complete, it can take semaphores, schedule, etc... everything but touch +the device. Within this function and after it returns, the driver +shouldn't do any new IOs. Called in task context. This is sort of a +"quiesce" point. See note about interrupts at the end of this doc. + + Result codes: + - PCIERR_RESULT_CAN_RECOVER: + Return this if you think you might be able to recover + the HW by just banging IOs or if you want to be given + a chance to extract some diagnostic information (see + below). + - PCIERR_RESULT_NEED_RESET: + Return this if you think you can't recover unless the + slot is reset. + - PCIERR_RESULT_DISCONNECT: + Return this if you think you won't recover at all, + (this will detach the driver ? or just leave it + dangling ? to be decided) + + +So at this point, we have called error_detected() for all drivers +on the segment that had the error. On ppc64, the slot is isolated. What +happens now typically depends on the result from the drivers. If all +drivers on the segment/slot return PCIERR_RESULT_CAN_RECOVER, we would +re-enable IOs on the slot (or do nothing special if the platform doesn't +isolate slots) and call 2). If not and we can reset slots, we go to 4), +if neither, we have a dead slot. If it's a hotplug slot, we might +"simulate" reset by triggering HW unplug/replug though. + + 2) error_recover() + + This is the "early recovery" call. IOs are allowed again, but DMA is +not (hrm... to be discussed, I prefer not), with some restrictions. This +is NOT a callback for the driver to start operations again, only to +peek/poke at the device, extract diagnostic information if any, and +eventually do things like trigger a device local reset or such things, +but not restart operations.
This is sent if all drivers on a segment +agree that they can try to recover and no automatic link reset was performed +by the HW. If the platform can't just re-enable IOs without a slot reset or a +link reset, it doesn't call this callback and goes directly to 3) or 4). All IOs +should be done _synchronously_ from within this callback, errors triggered by +them will be returned via the normal pci_check_whatever() api, no new +error_detected() callback will be issued due to an error happening here. However, +such an error might cause IOs to be re-blocked for the whole segment, and thus +invalidate the recovery that other devices on the same segment might have done, +forcing the whole segment into one of the next states, that is link reset or +slot reset. + + Result codes: + - PCIERR_RESULT_RECOVERED + Return this if you think your device is fully + functional and think you are ready to start + to do your normal driver job again. There is no + guarantee that because you returned that, you'll be + allowed to actually proceed as another driver on the + same segment might have failed and thus triggered a + slot reset on platforms that support it. + + - PCIERR_RESULT_NEED_RESET + Return this if you think your device is not + recoverable in its current state and you need a slot + reset to proceed. + + - PCIERR_RESULT_DISCONNECT + Same as above. Total failure, no recovery even after + reset driver dead. (To be defined more precisely) + + 3) link_reset() + + This is called after the link has been reset. This is typically a +PCI Express specific state at this point and is done whenever a non fatal error +has been detected that can be "solved" by resetting the link. The driver is +informed here of that reset and should check if the device appears to be in +working condition. This function acts a bit like 2) error_recover(), that is +it is not supposed to restart normal driver IO operations right away, just +"probe" the device to check its recoverability status.
If all is right, then +the core will call error_restart() once all drivers have ack'd link_reset(). + + Result codes: + (identical to error_recover) + + 4) slot_reset() + + This is called after the slot has been hard reset (and PCI BARs +re-configured by the platform). If the platform supports PCI hotplug, +it can implement this by toggling power on the slot off/on. Drivers here +have a chance to re-initialize the hardware (re-download firmware etc...), +but drivers shouldn't restart normal IO processing operations at this point. +(see note about interrupts, they aren't guaranteed to be delivered until the +restart callback has been called). Upon success from this callback, the +platform will call error_restart() to complete the error handling and let +the driver restart normal IO request processing. + +However, a driver can still return a critical failure from here in case +it just can't get its device back from reset. There is just nothing we +can do about it tho. The driver will just be considered "dead" in this case. + + Result codes: + - PCIERR_RESULT_DISCONNECT + Same as above. + + 5) error_restart() + + This is called if all drivers on the segment have returned +PCIERR_RESULT_RECOVERED from one of the 3 previous callbacks. That basically +tells the driver to restart activity, everything is back & running. No result +code is taken into account here. If a new error happens, it will restart +a new error handling process. + +That's it. I think this covers all the possibilities. The way those +callbacks are called is platform policy. A platform with no slot reset +capability for example may want to just "ignore" drivers that can't +recover (disconnect them) and try to let other cards on the same segment +recover. Keep in mind that in most real life cases, though, there will +be only one driver per segment. + +Now, there is a note about interrupts.
If you get an interrupt and your +device is dead or has been isolated, there is a problem :) + +After much thinking, I decided to leave that to the platform. That is, +the recovery API only specifies that: + + - There is no guarantee that interrupt delivery can proceed from any +device on the segment starting from the error detection and until the +restart callback is sent, at which point interrupts are expected to be +fully operational. + + - There is no guarantee that interrupt delivery is stopped, that is, a +driver that gets an interrupt after detecting an error, or that detects +an error within the interrupt handler such that it prevents proper +ack'ing of the interrupt (and thus removal of the source) should just +return IRQ_NOTHANDLED. It's up to the platform to deal with that +condition, typically by masking the irq source during the duration of +the error handling. It is expected that the platform "knows" which +interrupts are routed to error-management capable slots and can deal +with temporarily disabling that irq number during error processing (this +isn't terribly complex). That means some IRQ latency for other devices +sharing the interrupt, but there is simply no other way.
High end +platforms aren't supposed to share interrupts between many devices +anyway :) + + --- drivers/pci/Makefile.linas-orig 2005-04-29 20:31:33.000000000 -0500 +++ drivers/pci/Makefile 2005-05-06 12:28:43.000000000 -0500 @@ -3,7 +3,7 @@ # obj-y += access.o bus.o probe.o remove.o pci.o quirks.o \ - names.o pci-driver.o search.o pci-sysfs.o \ + names.o pci-driver.o pci-error.o search.o pci-sysfs.o \ rom.o obj-$(CONFIG_PROC_FS) += proc.o --- drivers/pci/pci-error.c.linas-orig 2005-05-06 17:44:47.000000000 -0500 +++ drivers/pci/pci-error.c 2005-05-06 16:56:02.000000000 -0500 @@ -0,0 +1,152 @@ +/* + * pci-error.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include + +#undef DEBUG + +/** Overview: + * PEH, or "PCI Error Handling" is a PCI bridge technology for + * dealing with PCI bus errors that can't be dealt with within the + * usual PCI framework, except by check-stopping the CPU. Systems + * that are designed for high-availability/reliability cannot afford + * to crash due to a "mere" PCI error, thus the need for PEH. 
+ * An PEH-capable bridge operates by converting a detected error + * into a "slot freeze", taking the PCI adapter off-line, making + * the slot behave, from the OS'es point of view, as if the slot + * were "empty": all reads return 0xff's and all writes are silently + * ignored. PEH slot isolation events can be triggered by parity + * errors on the address or data busses (e.g. during posted writes), + * which in turn might be caused by low voltage on the bus, dust, + * vibration, humidity, radioactivity or plain-old failed hardware. + * + * Note, however, that one of the leading causes of PEH slot + * freeze events are buggy device drivers, buggy device microcode, + * or buggy device hardware. This is because any attempt by the + * device to bus-master data to a memory address that is not + * assigned to the device will trigger a slot freeze. (The idea + * is to prevent devices-gone-wild from corrupting system memory). + * Buggy hardware/drivers will have a miserable time co-existing + * with PEH. + */ + +/* PEH event workqueue setup. */ +static spinlock_t peh_eventlist_lock = SPIN_LOCK_UNLOCKED; +LIST_HEAD(peh_eventlist); +static void peh_event_handler(void *); +DECLARE_WORK(peh_event_wq, peh_event_handler, NULL); + +static struct notifier_block *peh_notifier_chain; + +/** + * peh_event_handler - dispatch PEH events. The detection of a frozen + * slot can occur inside an interrupt, where it can be hard to do + * anything about it. The goal of this routine is to pull these + * detection events out of the context of the interrupt handler, and + * re-dispatch them for processing at a later time in a normal context. 
+ * + * @dummy - unused + */ +static void peh_event_handler(void *dummy) +{ + unsigned long flags; + struct peh_event *event; + + while (1) { + spin_lock_irqsave(&peh_eventlist_lock, flags); + event = NULL; + if (!list_empty(&peh_eventlist)) { + event = list_entry(peh_eventlist.next, struct peh_event, list); + list_del(&event->list); + } + spin_unlock_irqrestore(&peh_eventlist_lock, flags); + if (event == NULL) + break; + + printk(KERN_INFO "PEH: Detected PCI bus error on device " + "%s %s\n", + pci_name(event->dev), pci_pretty_name(event->dev)); + + notifier_call_chain (&peh_notifier_chain, + PEH_NOTIFY_ERROR, event); + + pci_dev_put(event->dev); + kfree(event); + } +} + + +/** + * peh_send_failure_event - generate a PCI error event + * @dev pci device + * + * This routine builds a PCI error event which will be delivered + * to all listeners on the peh_notifier_chain. + * + * This routine can be called within an interrupt context; + * the actual event will be delivered in a normal context + * (from a workqueue). + */ +int peh_send_failure_event (struct pci_dev *dev, + enum pci_channel_state state, + int time_unavail) +{ + unsigned long flags; + struct peh_event *event; + + event = kmalloc(sizeof(*event), GFP_ATOMIC); + if (event == NULL) { + printk (KERN_ERR "PEH: out of memory, event not handled\n"); + return 1; + } + + event->dev = dev; + event->state = state; + event->time_unavail = time_unavail; + + /* We may or may not be called in an interrupt context */ + spin_lock_irqsave(&peh_eventlist_lock, flags); + list_add(&event->list, &peh_eventlist); + spin_unlock_irqrestore(&peh_eventlist_lock, flags); + + schedule_work(&peh_event_wq); + + return 0; +} + +/** + * peh_register_notifier - Register to find out about EEH events. + * @nb: notifier block to callback on events + */ +int peh_register_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&peh_notifier_chain, nb); +} + +/** + * peh_unregister_notifier - Unregister to an EEH event notifier. 
+ * @nb: notifier block to callback on events + */ +int peh_unregister_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&peh_notifier_chain, nb); +} + + --- drivers/scsi/ipr.c.linas-orig 2005-04-29 20:33:36.000000000 -0500 +++ drivers/scsi/ipr.c 2005-05-06 17:28:15.000000000 -0500 @@ -80,6 +80,11 @@ #include #include #include + +#ifdef CONFIG_PPC64 +#define CONFIG_SCSI_IPR_EEH +#endif /* CONFIG_PPC64 */ + #include "ipr.h" /* @@ -4993,6 +4998,7 @@ static int ipr_reset_start_bist(struct i return rc; } + /** * ipr_reset_allowed - Query whether or not IOA can be reset * @ioa_cfg: ioa config struct @@ -5306,6 +5312,69 @@ static void ipr_initiate_ioa_reset(struc shutdown_type); } +#ifdef CONFIG_SCSI_IPR_EEH + +/** If the PCI slot is frozen, hold off all i/o + * activity; then, as soon as the slot is available again, + * initiate an adapter reset. + */ +static int ipr_reset_freeze(struct ipr_cmnd *ipr_cmd) +{ + list_add_tail(&ipr_cmd->queue, &ipr_cmd->ioa_cfg->pending_q); + ipr_cmd->done = ipr_reset_ioa_job; + return IPR_RC_JOB_RETURN; +} + +static void ipr_eeh_frozen (struct pci_dev *pdev) +{ + unsigned long flags = 0; + struct ipr_ioa_cfg *ioa_cfg = pci_get_drvdata(pdev); + + spin_lock_irqsave(ioa_cfg->host->host_lock, flags); + _ipr_initiate_ioa_reset(ioa_cfg, ipr_reset_freeze, IPR_SHUTDOWN_NONE); + spin_unlock_irqrestore(ioa_cfg->host->host_lock, flags); +} + +static int ipr_eeh_thawed (struct pci_dev *pdev) +{ + unsigned long flags = 0; + struct ipr_ioa_cfg *ioa_cfg = pci_get_drvdata(pdev); + + spin_lock_irqsave(ioa_cfg->host->host_lock, flags); + _ipr_initiate_ioa_reset(ioa_cfg, ipr_reset_restore_cfg_space, + IPR_SHUTDOWN_NONE); + spin_unlock_irqrestore(ioa_cfg->host->host_lock, flags); + + return PCIERR_RESULT_RECOVERED; +} + +static void ipr_eeh_perm_failure (struct pci_dev *pdev) +{ +#if 0 // XXXXXXXXXXXXXXXXXXXXXXX + ipr_cmd->job_step = ipr_reset_shutdown_ioa; + rc = IPR_RC_JOB_CONTINUE; +#endif +} + +static int ipr_eeh_error_detected 
(struct pci_dev *pdev, + enum pci_channel_state state) +{ + switch (state) { + case pci_channel_io_frozen: + ipr_eeh_frozen (pdev); + return PCIERR_RESULT_NEED_RESET; + + case pci_channel_io_perm_failure: + ipr_eeh_perm_failure (pdev); + return PCIERR_RESULT_DISCONNECT; + break; + default: + break; + } + return PCIERR_RESULT_NEED_RESET; +} +#endif + /** * ipr_probe_ioa_part2 - Initializes IOAs found in ipr_probe_ioa(..) * @ioa_cfg: ioa cfg struct @@ -6015,6 +6084,10 @@ static struct pci_driver ipr_driver = { .id_table = ipr_pci_table, .probe = ipr_probe, .remove = ipr_remove, + .err_handler = { + .error_detected = ipr_eeh_error_detected, + .slot_reset = ipr_eeh_thawed, + }, .driver = { .shutdown = ipr_shutdown, }, --- drivers/scsi/sym53c8xx_2/sym_glue.c.linas-orig 2005-04-29 20:33:12.000000000 -0500 +++ drivers/scsi/sym53c8xx_2/sym_glue.c 2005-05-06 16:55:02.000000000 -0500 @@ -49,6 +49,10 @@ #include #include +#ifdef CONFIG_PPC64 +#define CONFIG_SCSI_SYM53C8XX_EEH_RECOVERY +#endif + #include "sym_glue.h" #include "sym_nvram.h" @@ -770,6 +774,10 @@ static irqreturn_t sym53c8xx_intr(int ir struct sym_hcb *np = (struct sym_hcb *)dev_id; if (DEBUG_FLAGS & DEBUG_TINY) printf_debug ("["); +#ifdef CONFIG_SCSI_SYM53C8XX_EEH_RECOVERY + if (np->s.io_state != pci_channel_io_normal) + return IRQ_HANDLED; +#endif /* CONFIG_SCSI_SYM53C8XX_EEH_RECOVERY */ spin_lock_irqsave(np->s.host->host_lock, flags); sym_interrupt(np); @@ -844,6 +852,27 @@ static void sym_eh_done(struct scsi_cmnd */ static void sym_eh_timeout(u_long p) { __sym_eh_done((struct scsi_cmnd *)p, 1); } +#ifdef CONFIG_SCSI_SYM53C8XX_EEH_RECOVERY +static void sym_eeh_timeout(u_long p) +{ + struct sym_eh_wait *ep = (struct sym_eh_wait *) p; + if (!ep) + return; + complete(&ep->done); +} + +static void sym_eeh_done(struct sym_eh_wait *ep) +{ + if (!ep) + return; + ep->timed_out = 0; + if (!del_timer(&ep->timer)) + return; + + complete(&ep->done); +} +#endif /* CONFIG_SCSI_SYM53C8XX_EEH_RECOVERY */ + /* * Generic method 
for our eh processing. * The 'op' argument tells what we have to do. @@ -905,6 +934,35 @@ prepare: sts = 0; break; case SYM_EH_HOST_RESET: +#ifdef CONFIG_SCSI_SYM53C8XX_EEH_RECOVERY +printk("duuuuuude attempting symbios recovery\n"); +dump_stack(); + int rc = eeh_slot_is_isolated (np->s.device); + +printk ("duude symbios is isolated ??=%d\n", rc); +printk ("duuude the current io state is %d\n", np->s.io_state); + if (rc) { + struct sym_eh_wait eeh, *eep = &eeh; + np->s.io_reset_wait = eep; + init_completion(&eep->done); + init_timer(&eep->timer); + eep->to_do = SYM_EH_DO_WAIT; + eep->timer.expires = jiffies + (10*HZ); + eep->timer.function = sym_eeh_timeout; + eep->timer.data = (u_long)eep; + eep->timed_out = 1; /* Be pessimistic for once :) */ + add_timer(&eep->timer); + spin_unlock_irq(np->s.host->host_lock); + wait_for_completion(&eep->done); + spin_lock_irq(np->s.host->host_lock); + if (eep->timed_out) { +printk ("duude symbios timed out\n"); + } else { +printk ("duude symbios waited for completion\n"); + } + np->s.io_reset_wait = NULL; + } +#endif /* CONFIG_SCSI_SYM53C8XX_EEH_RECOVERY */ sym_reset_scsi_bus(np, 0); sym_start_up (np, 1); sts = 0; @@ -1577,6 +1635,30 @@ static int sym_setup_bus_dma_mask(struct return -1; } +#ifdef CONFIG_SCSI_SYM53C8XX_EEH_RECOVERY +int sym2_io_error_detected (struct pci_dev *pdev, enum pci_channel_state state) +{ + struct sym_hcb *np = pci_get_drvdata(pdev); +printk ("duude symbios got this state change %d jiffies=%ld\n", state, jiffies); + + np->s.io_state = state; + // XXX if perm frozen, then ...? + + return 0; +} + +int sym2_io_slot_reset (struct pci_dev *pdev) +{ + struct sym_hcb *np = pci_get_drvdata(pdev); +printk ("duude symbios got slot reset done jiffies=%ld\n", jiffies); + + np->s.io_state = pci_channel_io_normal; + sym_eeh_done (np->s.io_reset_wait); + + return 0; +} +#endif /* CONFIG_SCSI_SYM53C8XX_EEH_RECOVERY */ + /* * Host attach and initialisations. 
* @@ -1625,6 +1707,8 @@ static struct Scsi_Host * __devinit sym_ if (!np) goto attach_failed; np->s.device = dev->pdev; + np->s.io_state = pci_channel_io_normal; + np->s.io_reset_wait = NULL; np->bus_dmat = dev->pdev; /* Result in 1 DMA pool per HBA */ host_data->ncb = np; np->s.host = instance; @@ -2359,6 +2443,10 @@ static struct pci_driver sym2_driver = { .id_table = sym2_id_table, .probe = sym2_probe, .remove = __devexit_p(sym2_remove), + .err_handler = { + .error_detected = sym2_io_error_detected, + .slot_reset = sym2_io_slot_reset, + }, }; static int __init sym2_init(void) --- drivers/scsi/sym53c8xx_2/sym_glue.h.linas-orig 2005-04-29 20:32:45.000000000 -0500 +++ drivers/scsi/sym53c8xx_2/sym_glue.h 2005-05-06 16:29:39.000000000 -0500 @@ -358,6 +358,10 @@ struct sym_shcb { char chip_name[8]; struct pci_dev *device; + /* pci bus i/o state; waiter for clearing of i/o state */ + enum pci_channel_state io_state; + struct sym_eh_wait *io_reset_wait; + struct Scsi_Host *host; void __iomem * mmio_va; /* MMIO kernel virtual address */ --- drivers/scsi/sym53c8xx_2/sym_hipd.c.linas-orig 2005-04-29 20:22:45.000000000 -0500 +++ drivers/scsi/sym53c8xx_2/sym_hipd.c 2005-05-06 12:28:43.000000000 -0500 @@ -2836,6 +2836,7 @@ void sym_interrupt (struct sym_hcb *np) u_char istat, istatc; u_char dstat; u_short sist; + u_int icnt; /* * interrupt on the fly ? 
@@ -2877,6 +2878,7 @@ void sym_interrupt (struct sym_hcb *np) sist = 0; dstat = 0; istatc = istat; + icnt = 0; do { if (istatc & SIP) sist |= INW (nc_sist); @@ -2884,6 +2886,14 @@ void sym_interrupt (struct sym_hcb *np) dstat |= INB (nc_dstat); istatc = INB (nc_istat); istat |= istatc; + icnt ++; + if (100 < icnt) { +#define CONFIG_SCSI_SYM53C8XX_EEH_RECOVERY +#ifdef CONFIG_SCSI_SYM53C8XX_EEH_RECOVERY + if(eeh_slot_is_isolated (np->s.device)) + return; +#endif /* CONFIG_SCSI_SYM53C8XX_EEH_RECOVERY */ + } } while (istatc & (SIP|DIP)); if (DEBUG_FLAGS & DEBUG_TINY) --- include/asm-ppc64/eeh.h.linas-orig 2005-04-29 20:34:03.000000000 -0500 +++ include/asm-ppc64/eeh.h 2005-05-06 12:28:43.000000000 -0500 @@ -23,6 +23,7 @@ #include #include #include +#include #include struct pci_dev; @@ -36,6 +37,11 @@ struct notifier_block; #define EEH_MODE_SUPPORTED (1<<0) #define EEH_MODE_NOCHECK (1<<1) #define EEH_MODE_ISOLATED (1<<2) +#define EEH_MODE_RECOVERING (1<<3) + +/* Max number of EEH freezes allowed before we consider the device + * to be permanently disabled. */ +#define EEH_MAX_ALLOWED_FREEZES 5 void __init eeh_init(void); unsigned long eeh_check_failure(const volatile void __iomem *token, @@ -59,35 +65,82 @@ void eeh_add_device_late(struct pci_dev * eeh_remove_device - undo EEH setup for the indicated pci device * @dev: pci device to be removed * - * This routine should be when a device is removed from a running - * system (e.g. by hotplug or dlpar). + * This routine should be called when a device is removed from + * a running system (e.g. by hotplug or dlpar). It unregisters + * the PCI device from the EEH subsystem. I/O errors affecting + * this device will no longer be detected after this call; thus, + * i/o errors affecting this slot may leave this device unusable. 
*/ void eeh_remove_device(struct pci_dev *); -#define EEH_DISABLE 0 -#define EEH_ENABLE 1 -#define EEH_RELEASE_LOADSTORE 2 -#define EEH_RELEASE_DMA 3 +/** + * eeh_slot_is_isolated -- return non-zero value if slot is frozen + */ +int eeh_slot_is_isolated (struct pci_dev *dev); /** - * Notifier event flags. + * eeh_ioaddr_is_isolated -- return non-zero value if device at + * io address is frozen. */ -#define EEH_NOTIFY_FREEZE 1 +int eeh_ioaddr_is_isolated(const volatile void __iomem *token); -/** EEH event -- structure holding pci slot data that describes - * a change in the isolation status of a PCI slot. A pointer - * to this struct is passed as the data pointer in a notify callback. - */ -struct eeh_event { - struct list_head list; - struct pci_dev *dev; - struct device_node *dn; - int reset_state; -}; - -/** Register to find out about EEH events. */ -int eeh_register_notifier(struct notifier_block *nb); -int eeh_unregister_notifier(struct notifier_block *nb); +/** + * eeh_slot_error_detail -- record an EEH error condition to the log + * @severity: 1 if temporary, 2 if permanent failure. + * + * Obtains the EEH error details from the RTAS subsystem, + * and then logs these details with the RTAS error log system. + */ +void eeh_slot_error_detail (struct device_node *dn, int severity); + +/** + * rtas_set_slot_reset -- unfreeze a frozen slot + * + * Clear the EEH-frozen condition on a slot. This routine + * does this by asserting the PCI #RST line for 1/4 of + * a second; this routine will sleep while the adapter is + * being reset. + */ +void rtas_set_slot_reset (struct device_node *dn); + +/** rtas_pci_slot_reset raises/lowers the pci #RST line + * state: 1/0 to raise/lower the #RST + * + * Clear the EEH-frozen condition on a slot. This routine + * asserts the PCI #RST line if the 'state' argument is '1', + * and drops the #RST line if 'state' is '0'. This routine is + * safe to call in an interrupt context. 
+ * + */ +void rtas_pci_slot_reset(struct device_node *dn, int state); +void eeh_pci_slot_reset(struct pci_dev *dev, int state); + +/** eeh_pci_slot_availability -- Indicates whether a PCI + * slot is ready to be used. After a PCI reset, it may take a while + * for the PCI fabric to fully reset the communications path to the + * given PCI card. This routine can be used to determine how long + * to wait before a PCI slot might become usable. + * + * This routine returns how long to wait (in milliseconds) before + * the slot is expected to be usable. A value of zero means the + * slot is immediately usable. A negative value means that the + * slot is permanently disabled. + */ +int eeh_pci_slot_availability(struct pci_dev *dev); + +/** Restore device configuration info across device resets. + */ +void eeh_restore_bars(struct device_node *); +void eeh_pci_restore_bars(struct pci_dev *dev); + +/** + * rtas_configure_bridge -- firmware initialization of pci bridge + * + * Ask the firmware to configure any PCI bridge devices + * located behind the indicated node. Required after a + * pci device reset. + */ +void rtas_configure_bridge(struct device_node *dn); /** * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure. --- include/asm-ppc64/prom.h.linas-orig 2005-04-29 20:32:46.000000000 -0500 +++ include/asm-ppc64/prom.h 2005-05-06 12:28:43.000000000 -0500 @@ -119,6 +119,7 @@ struct property { */ struct pci_controller; struct iommu_table; +struct eeh_recovery_ops; struct device_node { char *name; @@ -137,8 +138,12 @@ struct device_node { int devfn; /* for pci devices */ int eeh_mode; /* See eeh.h for possible EEH_MODEs */ int eeh_config_addr; + int eeh_check_count; /* number of times device driver ignored error */ + int eeh_freeze_count; /* number of times this device froze up. 
*/ + int eeh_is_bridge; /* device is pci-to-pci bridge */ struct pci_controller *phb; /* for pci devices */ struct iommu_table *iommu_table; /* for phb's or bridges */ + u32 config_space[16]; /* saved PCI config space */ struct property *properties; struct device_node *parent; --- include/asm-ppc64/rtas.h.linas-orig 2005-04-29 20:32:32.000000000 -0500 +++ include/asm-ppc64/rtas.h 2005-05-06 12:28:43.000000000 -0500 @@ -243,4 +243,6 @@ extern unsigned long rtas_rmo_buf; #define GLOBAL_INTERRUPT_QUEUE 9005 +extern int rtas_write_config(struct device_node *dn, int where, int size, u32 val); + #endif /* _PPC64_RTAS_H */ --- arch/ppc64/kernel/eeh.c.linas-orig 2005-04-29 20:29:19.000000000 -0500 +++ arch/ppc64/kernel/eeh.c 2005-05-06 16:52:39.000000000 -0500 @@ -17,16 +17,17 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include +#include #include +#include #include -#include #include #include #include #include #include #include +#include #include #include #include @@ -49,8 +50,8 @@ * were "empty": all reads return 0xff's and all writes are silently * ignored. EEH slot isolation events can be triggered by parity * errors on the address or data busses (e.g. during posted writes), - * which in turn might be caused by dust, vibration, humidity, - * radioactivity or plain-old failed hardware. + * which in turn might be caused by low voltage on the bus, dust, + * vibration, humidity, radioactivity or plain-old failed hardware. * * Note, however, that one of the leading causes of EEH slot * freeze events are buggy device drivers, buggy device microcode, @@ -75,22 +76,13 @@ #define BUID_HI(buid) ((buid) >> 32) #define BUID_LO(buid) ((buid) & 0xffffffff) -/* EEH event workqueue setup. 
*/ -static DEFINE_SPINLOCK(eeh_eventlist_lock); -LIST_HEAD(eeh_eventlist); -static void eeh_event_handler(void *); -DECLARE_WORK(eeh_event_wq, eeh_event_handler, NULL); - -static struct notifier_block *eeh_notifier_chain; - /* * If a device driver keeps reading an MMIO register in an interrupt * handler after a slot isolation event has occurred, we assume it * is broken and panic. This sets the threshold for how many read * attempts we allow before panicking. */ -#define EEH_MAX_FAILS 1000 -static atomic_t eeh_fail_count; +#define EEH_MAX_FAILS 100000 /* RTAS tokens */ static int ibm_set_eeh_option; @@ -107,6 +99,10 @@ static DEFINE_SPINLOCK(slot_errbuf_lock) static int eeh_error_buf_size; /* System monitoring statistics */ +static DEFINE_PER_CPU(unsigned long, no_device); +static DEFINE_PER_CPU(unsigned long, no_dn); +static DEFINE_PER_CPU(unsigned long, no_cfg_addr); +static DEFINE_PER_CPU(unsigned long, ignored_check); static DEFINE_PER_CPU(unsigned long, total_mmio_ffs); static DEFINE_PER_CPU(unsigned long, false_positives); static DEFINE_PER_CPU(unsigned long, ignored_failures); @@ -225,9 +221,9 @@ pci_addr_cache_insert(struct pci_dev *de while (*p) { parent = *p; piar = rb_entry(parent, struct pci_io_addr_range, rb_node); - if (alo < piar->addr_lo) { + if (ahi < piar->addr_lo) { p = &parent->rb_left; - } else if (ahi > piar->addr_hi) { + } else if (alo > piar->addr_hi) { p = &parent->rb_right; } else { if (dev != piar->pcidev || @@ -245,6 +241,11 @@ pci_addr_cache_insert(struct pci_dev *de piar->addr_hi = ahi; piar->pcidev = dev; piar->flags = flags; + +#ifdef DEBUG + printk (KERN_DEBUG "PIAR: insert range=[%lx:%lx] dev=%s\n", + alo, ahi, pci_name (dev)); +#endif rb_link_node(&piar->rb_node, parent, p); rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root); @@ -369,8 +370,12 @@ void pci_addr_cache_remove_device(struct */ void __init pci_addr_cache_build(void) { + struct device_node *dn; struct pci_dev *dev = NULL; + if (!eeh_subsystem_enabled) + 
return; + spin_lock_init(&pci_io_addr_cache_root.piar_lock); while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { @@ -379,6 +384,17 @@ void __init pci_addr_cache_build(void) continue; } pci_addr_cache_insert_device(dev); + + /* Save the BAR's; firmware doesn't restore these after EEH reset */ + dn = pci_device_to_OF_node(dev); + if (dn) { + int i; + for (i = 0; i < 16; i++) + pci_read_config_dword(dev, i * 4, &dn->config_space[i]); + + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) + dn->eeh_is_bridge = 1; + } } #ifdef DEBUG @@ -390,24 +406,32 @@ void __init pci_addr_cache_build(void) /* --------------------------------------------------------------- */ /* Above lies the PCI Address Cache. Below lies the EEH event infrastructure */ -/** - * eeh_register_notifier - Register to find out about EEH events. - * @nb: notifier block to callback on events - */ -int eeh_register_notifier(struct notifier_block *nb) +void eeh_slot_error_detail (struct device_node *dn, int severity) { - return notifier_chain_register(&eeh_notifier_chain, nb); -} + unsigned long flags; + int rc; -/** - * eeh_unregister_notifier - Unregister to an EEH event notifier. 
- * @nb: notifier block to callback on events - */ -int eeh_unregister_notifier(struct notifier_block *nb) -{ - return notifier_chain_unregister(&eeh_notifier_chain, nb); + if (!dn) return; + + /* Log the error with the rtas logger */ + spin_lock_irqsave(&slot_errbuf_lock, flags); + memset(slot_errbuf, 0, eeh_error_buf_size); + + rc = rtas_call(ibm_slot_error_detail, + 8, 1, NULL, dn->eeh_config_addr, + BUID_HI(dn->phb->buid), + BUID_LO(dn->phb->buid), NULL, 0, + virt_to_phys(slot_errbuf), + eeh_error_buf_size, + severity); + + if (rc == 0) + log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0); + spin_unlock_irqrestore(&slot_errbuf_lock, flags); } +EXPORT_SYMBOL(eeh_slot_error_detail); + /** * read_slot_reset_state - Read the reset state of a device node's slot * @dn: device node to read @@ -422,6 +446,7 @@ static int read_slot_reset_state(struct outputs = 4; } else { token = ibm_read_slot_reset_state; + rets[2] = 0; /* fake PE Unavailable info */ outputs = 3; } @@ -430,75 +455,8 @@ static int read_slot_reset_state(struct } /** - * eeh_panic - call panic() for an eeh event that cannot be handled. - * The philosophy of this routine is that it is better to panic and - * halt the OS than it is to risk possible data corruption by - * oblivious device drivers that don't know better. - * - * @dev pci device that had an eeh event - * @reset_state current reset state of the device slot - */ -static void eeh_panic(struct pci_dev *dev, int reset_state) -{ - /* - * XXX We should create a separate sysctl for this. - * - * Since the panic_on_oops sysctl is used to halt the system - * in light of potential corruption, we can use it here. 
- */ - if (panic_on_oops) - panic("EEH: MMIO failure (%d) on device:%s %s\n", reset_state, - pci_name(dev), pci_pretty_name(dev)); - else { - __get_cpu_var(ignored_failures)++; - printk(KERN_INFO "EEH: Ignored MMIO failure (%d) on device:%s %s\n", - reset_state, pci_name(dev), pci_pretty_name(dev)); - } -} - -/** - * eeh_event_handler - dispatch EEH events. The detection of a frozen - * slot can occur inside an interrupt, where it can be hard to do - * anything about it. The goal of this routine is to pull these - * detection events out of the context of the interrupt handler, and - * re-dispatch them for processing at a later time in a normal context. - * - * @dummy - unused - */ -static void eeh_event_handler(void *dummy) -{ - unsigned long flags; - struct eeh_event *event; - - while (1) { - spin_lock_irqsave(&eeh_eventlist_lock, flags); - event = NULL; - if (!list_empty(&eeh_eventlist)) { - event = list_entry(eeh_eventlist.next, struct eeh_event, list); - list_del(&event->list); - } - spin_unlock_irqrestore(&eeh_eventlist_lock, flags); - if (event == NULL) - break; - - printk(KERN_INFO "EEH: MMIO failure (%d), notifiying device " - "%s %s\n", event->reset_state, - pci_name(event->dev), pci_pretty_name(event->dev)); - - atomic_set(&eeh_fail_count, 0); - notifier_call_chain (&eeh_notifier_chain, - EEH_NOTIFY_FREEZE, event); - - __get_cpu_var(slot_resets)++; - - pci_dev_put(event->dev); - kfree(event); - } -} - -/** - * eeh_token_to_phys - convert EEH address token to phys address - * @token i/o token, should be address in the form 0xE.... + * eeh_token_to_phys - convert I/O address to phys address + * @token i/o address, should be address in the form 0xA.... 
*/ static inline unsigned long eeh_token_to_phys(unsigned long token) { @@ -513,6 +471,18 @@ static inline unsigned long eeh_token_to return pa | (token & (PAGE_SIZE-1)); } + +static inline struct pci_dev * eeh_find_pci_dev(struct device_node *dn) +{ + struct pci_dev *dev = NULL; + for_each_pci_dev(dev) { + if (pci_device_to_OF_node(dev) == dn) + return dev; + } + return NULL; +} + + /** * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze * @dn device node @@ -528,29 +498,33 @@ static inline unsigned long eeh_token_to * * It is safe to call this routine in an interrupt context. */ +extern void disable_irq_nosync(unsigned int); + int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) { int ret; int rets[3]; - unsigned long flags; - int rc, reset_state; - struct eeh_event *event; + enum pci_channel_state state; __get_cpu_var(total_mmio_ffs)++; if (!eeh_subsystem_enabled) return 0; - if (!dn) + if (!dn) { + __get_cpu_var(no_dn)++; return 0; + } /* Access to IO BARs might get this far and still not want checking. */ if (!(dn->eeh_mode & EEH_MODE_SUPPORTED) || dn->eeh_mode & EEH_MODE_NOCHECK) { + __get_cpu_var(ignored_check)++; return 0; } if (!dn->eeh_config_addr) { + __get_cpu_var(no_cfg_addr)++; return 0; } @@ -559,12 +533,18 @@ int eeh_dn_check_failure(struct device_n * slot, we know it's bad already, we don't need to check... */ if (dn->eeh_mode & EEH_MODE_ISOLATED) { - atomic_inc(&eeh_fail_count); - if (atomic_read(&eeh_fail_count) >= EEH_MAX_FAILS) { + dn->eeh_check_count ++; + if (dn->eeh_check_count >= EEH_MAX_FAILS) { + printk (KERN_ERR "EEH: Device driver ignored %d bad reads, panicing\n", + dn->eeh_check_count); + dump_stack(); /* re-read the slot reset state */ if (read_slot_reset_state(dn, rets) != 0) rets[0] = -1; /* reset state unknown */ - eeh_panic(dev, rets[0]); + + /* If we are here, then we hit an infinite loop. Stop. 
*/ + panic("EEH: MMIO halt (%d) on device:%s %s\n", rets[0], + pci_name(dev), pci_pretty_name(dev)); } return 0; } @@ -577,53 +557,41 @@ int eeh_dn_check_failure(struct device_n * In any case they must share a common PHB. */ ret = read_slot_reset_state(dn, rets); - if (!(ret == 0 && rets[1] == 1 && (rets[0] == 2 || rets[0] == 4))) { + if (!(ret == 0 && ((rets[1] == 1 && (rets[0] == 2 || rets[0] >= 4)) + || (rets[0] == 5)))) { __get_cpu_var(false_positives)++; return 0; } - /* prevent repeated reports of this failure */ - dn->eeh_mode |= EEH_MODE_ISOLATED; - - reset_state = rets[0]; + /* Note that empty slots will fail; empty slots don't have children... */ + if ((rets[0] == 5) && (dn->child == NULL)) { + __get_cpu_var(false_positives)++; + return 0; + } - spin_lock_irqsave(&slot_errbuf_lock, flags); - memset(slot_errbuf, 0, eeh_error_buf_size); + /* Prevent repeated reports of this failure */ + dn->eeh_mode |= EEH_MODE_ISOLATED; + __get_cpu_var(slot_resets)++; - rc = rtas_call(ibm_slot_error_detail, - 8, 1, NULL, dn->eeh_config_addr, - BUID_HI(dn->phb->buid), - BUID_LO(dn->phb->buid), NULL, 0, - virt_to_phys(slot_errbuf), - eeh_error_buf_size, - 1 /* Temporary Error */); + if (!dev) + dev = eeh_find_pci_dev (dn); - if (rc == 0) - log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0); - spin_unlock_irqrestore(&slot_errbuf_lock, flags); + /* Some devices go crazy if irq's are not ack'ed; disable irq now */ + if (dev) + disable_irq_nosync (dev->irq); + + state = pci_channel_io_normal; + if ((rets[0] == 2) || (rets[0] == 4)) + state = pci_channel_io_frozen; + if (rets[0] == 5) + state = pci_channel_io_perm_failure; - printk(KERN_INFO "EEH: MMIO failure (%d) on device: %s %s\n", - rets[0], dn->name, dn->full_name); - event = kmalloc(sizeof(*event), GFP_ATOMIC); - if (event == NULL) { - eeh_panic(dev, reset_state); - return 1; - } - - event->dev = dev; - event->dn = dn; - event->reset_state = reset_state; - - /* We may or may not be called in an interrupt context */ - 
spin_lock_irqsave(&eeh_eventlist_lock, flags); - list_add(&event->list, &eeh_eventlist); - spin_unlock_irqrestore(&eeh_eventlist_lock, flags); + peh_send_failure_event (dev, state, rets[2]); /* Most EEH events are due to device driver bugs. Having * a stack trace will help the device-driver authors figure * out what happened. So print that out. */ - dump_stack(); - schedule_work(&eeh_event_wq); + if (rets[0] != 5) dump_stack(); return 0; } @@ -635,7 +603,6 @@ EXPORT_SYMBOL(eeh_dn_check_failure); * @token i/o token, should be address in the form 0xA.... * @val value, should be all 1's (XXX why do we need this arg??) * - * Check for an eeh failure at the given token address. * Check for an EEH failure at the given token address. Call this * routine if the result of a read was all 0xff's and you want to * find out if this is due to an EEH slot freeze event. This routine @@ -643,6 +610,7 @@ EXPORT_SYMBOL(eeh_dn_check_failure); * * Note this routine is safe to call in an interrupt context. */ + unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val) { unsigned long addr; @@ -652,8 +620,10 @@ unsigned long eeh_check_failure(const vo /* Finding the phys addr + pci device; this is pretty quick. 
*/ addr = eeh_token_to_phys((unsigned long __force) token); dev = pci_get_device_by_addr(addr); - if (!dev) + if (!dev) { + __get_cpu_var(no_device)++; return val; + } dn = pci_device_to_OF_node(dev); eeh_dn_check_failure (dn, dev); @@ -664,6 +634,249 @@ unsigned long eeh_check_failure(const vo EXPORT_SYMBOL(eeh_check_failure); +/* ------------------------------------------------------------- */ +/* The code below deals with error recovery */ + +int +eeh_slot_is_isolated(struct pci_dev *dev) +{ + struct device_node *dn; + dn = pci_device_to_OF_node(dev); + return (dn->eeh_mode & EEH_MODE_ISOLATED); +} + +int +eeh_ioaddr_is_isolated(const volatile void __iomem *token) +{ + unsigned long addr; + struct pci_dev *dev; + int rc; + + addr = eeh_token_to_phys((unsigned long __force) token); + dev = pci_get_device_by_addr(addr); + if (!dev) + return 0; + rc = eeh_slot_is_isolated(dev); + pci_dev_put(dev); + return rc; +} + +/** eeh_pci_slot_reset -- raises/lowers the pci #RST line + * state: 1/0 to raise/lower the #RST + */ +void +eeh_pci_slot_reset(struct pci_dev *dev, int state) +{ + struct device_node *dn = pci_device_to_OF_node(dev); + rtas_pci_slot_reset (dn, state); +} + +/** Return negative value if a permanent error, else return + * a number of milliseconds to wait until the PCI slot is + * ready to be used. 
+ */ +static int +eeh_slot_availability(struct device_node *dn) +{ + int rc; + int rets[3]; + + rc = read_slot_reset_state(dn, rets); +printk ("duuude dn=%s read slot reset state rc=%d rets=%d--%d--%d\n", dn->full_name, rc, rets[0], rets[1], rets[2]); + + if (rc) return rc; + + if (rets[1] == 0) return -1; /* EEH is not supported */ + if (rets[0] == 0) return 0; /* Oll Korrect */ + if (rets[0] == 5) { + if (rets[2] == 0) return -1; /* permanently unavailable */ + return rets[2]; /* number of millisecs to wait */ + } + return -1; +} + +int +eeh_pci_slot_availability(struct pci_dev *dev) +{ + struct device_node *dn = pci_device_to_OF_node(dev); + if (!dn) return -1; + + BUG_ON (dn->phb==NULL); + if (dn->phb==NULL) { + printk (KERN_ERR "EEH, checking on slot with no phb dn=%s dev=%s:%s\n", + dn->full_name, pci_name(dev), pci_pretty_name (dev)); + return -1; + } + return eeh_slot_availability (dn); +} + +void +rtas_pci_slot_reset(struct device_node *dn, int state) +{ + int rc; + + if (!dn) + return; + if (!dn->phb) { + printk (KERN_WARNING "EEH: in slot reset, device node %s has no phb\n", dn->full_name); + return; + } + + dn->eeh_mode |= EEH_MODE_RECOVERING; + rc = rtas_call(ibm_set_slot_reset,4,1, NULL, + dn->eeh_config_addr, + BUID_HI(dn->phb->buid), + BUID_LO(dn->phb->buid), + state); + if (rc) { + printk (KERN_WARNING "EEH: Unable to reset the failed slot, (%d) #RST=%d\n", rc, state); + return; + } + + if (state == 0) + dn->eeh_mode &= ~(EEH_MODE_RECOVERING|EEH_MODE_ISOLATED); +} + +/** rtas_set_slot_reset -- assert the pci #RST line for 1/4 second + * dn -- device node to be reset. + */ + +void +rtas_set_slot_reset(struct device_node *dn) +{ + int i, rc; + +printk ("duude going to reset device %s\n", dn->full_name); +eeh_slot_availability(dn); + rtas_pci_slot_reset (dn, 1); + + /* The PCI bus requires that the reset be held high for at least + * a 100 milliseconds. We wait a bit longer 'just in case'. 
*/ + +#define PCI_BUS_RST_HOLD_TIME_MSEC 250 + msleep (PCI_BUS_RST_HOLD_TIME_MSEC); + rtas_pci_slot_reset (dn, 0); + + /* After a PCI slot has been reset, the PCI Express spec requires + * a 1.5 second idle time for the bus to stabilize, before starting + * up traffic. */ +#define PCI_BUS_SETTLE_TIME_MSEC 1800 + msleep (PCI_BUS_SETTLE_TIME_MSEC); + + /* Now double check with the firmware to make sure the device is + * ready to be used; if not, wait for recovery. */ + for (i=0; i<10; i++) { + rc = eeh_slot_availability (dn); + if (rc <= 0) return; + + msleep (rc+100); + } +eeh_slot_availability (dn); +printk ("duuude WTFFFFFFFFFFFFFFFFFFFFFFF done reseting %s\n", dn->full_name); +extern int rtas_read_config(struct device_node *dn, int where, int size, u32 *val); +u32 val; +for(i=0;i<16;i++) { +rc = rtas_read_config (dn, i*4,4,&val); +printk ("duude read config %d rc=%d val=%x expect=%x\n", i, rc, val,dn->config_space[i]); +} + +} + +EXPORT_SYMBOL(rtas_set_slot_reset); + +void +rtas_configure_bridge(struct device_node *dn) +{ + int token = rtas_token ("ibm,configure-bridge"); + int rc; + + if (token == RTAS_UNKNOWN_SERVICE) + return; + rc = rtas_call(token,3,1, NULL, + dn->eeh_config_addr, + BUID_HI(dn->phb->buid), + BUID_LO(dn->phb->buid)); + if (rc) { + printk (KERN_WARNING "EEH: Unable to configure device bridge (%d) for %s\n", + rc, dn->full_name); + } +} + +EXPORT_SYMBOL(rtas_configure_bridge); + +/* ------------------------------------------------------- */ +/** Save and restore of PCI BARs + * + * Although firmware will set up BARs during boot, it doesn't + * set up device BAR's after a device reset, although it will, + * if requested, set up bridge configuration. Thus, we need to + * configure the PCI devices ourselves. Config-space setup is + * stored in the PCI structures which are normally deleted during + * device removal. Thus, the "save" routine references the + * structures so that they aren't deleted. 
+ */ + +/** + * __restore_bars - Restore the Base Address Registers + * Loads the PCI configuration space base address registers, + * the expansion ROM base address, the latency timer, and etc. + * from the saved values in the device node. + */ +static inline void __restore_bars (struct device_node *dn) +{ + int i; + + if (NULL==dn->phb) return; + for (i=4; i<10; i++) { + rtas_write_config(dn, i*4, 4, dn->config_space[i]); + } + + /* 12 == Expansion ROM Address */ + rtas_write_config(dn, 12*4, 4, dn->config_space[12]); + +#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF)) +#define SAVED_BYTE(OFF) (((u8 *)(dn->config_space))[BYTE_SWAP(OFF)]) + + rtas_write_config (dn, PCI_CACHE_LINE_SIZE, 1, + SAVED_BYTE(PCI_CACHE_LINE_SIZE)); + + rtas_write_config (dn, PCI_LATENCY_TIMER, 1, + SAVED_BYTE(PCI_LATENCY_TIMER)); + + /* max latency, min grant, interrupt pin and line */ + rtas_write_config(dn, 15*4, 4, dn->config_space[15]); +} + +/** + * eeh_restore_bars - restore the PCI config space info + */ +void eeh_restore_bars(struct device_node *dn) +{ + if (! dn->eeh_is_bridge) + __restore_bars (dn); + + if (dn->child) + eeh_restore_bars (dn->child); +#if DO_SIBLINGS + if (dn->sibling) + eeh_restore_bars (dn->sibling); +#endif +} + +void eeh_pci_restore_bars(struct pci_dev *dev) +{ + struct device_node *dn = pci_device_to_OF_node(dev); + eeh_restore_bars (dn); +} + +/* ------------------------------------------------------------- */ +/* The code below deals with enabling EEH for devices during the + * early boot sequence. EEH must be enabled before any PCI probing + * can be done. 
+ */ + +#define EEH_ENABLE 1 + struct eeh_early_enable_info { unsigned int buid_hi; unsigned int buid_lo; @@ -682,6 +895,8 @@ static void *early_enable_eeh(struct dev int enable; dn->eeh_mode = 0; + dn->eeh_check_count = 0; + dn->eeh_freeze_count = 0; if (status && strcmp(status, "ok") != 0) return NULL; /* ignore devices with bad status */ @@ -743,7 +958,7 @@ static void *early_enable_eeh(struct dev dn->full_name); } - return NULL; + return NULL; } /* @@ -824,11 +1039,13 @@ void eeh_add_device_early(struct device_ struct pci_controller *phb; struct eeh_early_enable_info info; - if (!dn || !eeh_subsystem_enabled) + if (!dn) return; phb = dn->phb; if (NULL == phb || 0 == phb->buid) { - printk(KERN_WARNING "EEH: Expected buid but found none\n"); + printk(KERN_WARNING "EEH: Expected buid but found none for %s\n", + dn->full_name); + dump_stack(); return; } @@ -847,6 +1064,9 @@ EXPORT_SYMBOL(eeh_add_device_early); */ void eeh_add_device_late(struct pci_dev *dev) { + int i; + struct device_node *dn; + if (!dev || !eeh_subsystem_enabled) return; @@ -856,6 +1076,14 @@ void eeh_add_device_late(struct pci_dev #endif pci_addr_cache_insert_device (dev); + + /* Save the BAR's; firmware doesn't restore these after EEH reset */ + dn = pci_device_to_OF_node(dev); + for (i = 0; i < 16; i++) + pci_read_config_dword(dev, i * 4, &dn->config_space[i]); + + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) + dn->eeh_is_bridge = 1; } EXPORT_SYMBOL(eeh_add_device_late); @@ -885,12 +1113,17 @@ static int proc_eeh_show(struct seq_file unsigned int cpu; unsigned long ffs = 0, positives = 0, failures = 0; unsigned long resets = 0; + unsigned long no_dev = 0, no_dn = 0, no_cfg = 0, no_check = 0; for_each_cpu(cpu) { ffs += per_cpu(total_mmio_ffs, cpu); positives += per_cpu(false_positives, cpu); failures += per_cpu(ignored_failures, cpu); resets += per_cpu(slot_resets, cpu); + no_dev += per_cpu(no_device, cpu); + no_dn += per_cpu(no_dn, cpu); + no_cfg += per_cpu(no_cfg_addr, cpu); + no_check += 
per_cpu(ignored_check, cpu); } if (0 == eeh_subsystem_enabled) { @@ -898,13 +1131,17 @@ static int proc_eeh_show(struct seq_file seq_printf(m, "eeh_total_mmio_ffs=%ld\n", ffs); } else { seq_printf(m, "EEH Subsystem is enabled\n"); - seq_printf(m, "eeh_total_mmio_ffs=%ld\n" + seq_printf(m, + "no device=%ld\n" + "no device node=%ld\n" + "no config address=%ld\n" + "check not wanted=%ld\n" + "eeh_total_mmio_ffs=%ld\n" "eeh_false_positives=%ld\n" "eeh_ignored_failures=%ld\n" - "eeh_slot_resets=%ld\n" - "eeh_fail_count=%d\n", - ffs, positives, failures, resets, - eeh_fail_count.counter); + "eeh_slot_resets=%ld\n", + no_dev, no_dn, no_cfg, no_check, + ffs, positives, failures, resets); } return 0; --- arch/ppc64/kernel/pSeries_pci.c.linas-orig 2005-04-29 20:33:03.000000000 -0500 +++ arch/ppc64/kernel/pSeries_pci.c 2005-05-06 12:28:43.000000000 -0500 @@ -52,7 +52,7 @@ static int s7a_workaround; extern struct mpic *pSeries_mpic; -static int rtas_read_config(struct device_node *dn, int where, int size, u32 *val) +int rtas_read_config(struct device_node *dn, int where, int size, u32 *val) { int returnval = -1; unsigned long buid, addr; @@ -101,7 +101,7 @@ static int rtas_pci_read_config(struct p return PCIBIOS_DEVICE_NOT_FOUND; } -static int rtas_write_config(struct device_node *dn, int where, int size, u32 val) +int rtas_write_config(struct device_node *dn, int where, int size, u32 val) { unsigned long buid, addr; int ret; --- drivers/pci/hotplug/rpaphp.h.linas-orig 2005-04-29 20:26:21.000000000 -0500 +++ drivers/pci/hotplug/rpaphp.h 2005-05-06 12:28:43.000000000 -0500 @@ -118,7 +118,8 @@ extern int rpaphp_enable_pci_slot(struct extern int register_pci_slot(struct slot *slot); extern int rpaphp_unconfig_pci_adapter(struct slot *slot); extern int rpaphp_get_pci_adapter_status(struct slot *slot, int is_init, u8 * value); -extern struct hotplug_slot *rpaphp_find_hotplug_slot(struct pci_dev *dev); +extern void init_eeh_handler (void); +extern void exit_eeh_handler (void); /* 
rpaphp_core.c */ extern int rpaphp_add_slot(struct device_node *dn); --- drivers/pci/hotplug/rpaphp_core.c.linas-orig 2005-04-29 20:32:16.000000000 -0500 +++ drivers/pci/hotplug/rpaphp_core.c 2005-05-06 12:28:43.000000000 -0500 @@ -460,12 +460,18 @@ static int __init rpaphp_init(void) { info(DRIVER_DESC " version: " DRIVER_VERSION "\n"); + /* Get set to handle EEH events. */ + init_eeh_handler(); + /* read all the PRA info from the system */ return init_rpa(); } static void __exit rpaphp_exit(void) { + /* Let EEH know we are going away. */ + exit_eeh_handler(); + cleanup_slots(); } --- drivers/pci/hotplug/rpaphp_pci.c.linas-orig 2005-04-29 20:22:38.000000000 -0500 +++ drivers/pci/hotplug/rpaphp_pci.c 2005-05-06 17:19:33.000000000 -0500 @@ -22,8 +22,13 @@ * Send feedback to * */ +#include +#include +#include #include +#include #include +#include #include #include #include "../pci.h" /* for pci_add_new_bus */ @@ -63,6 +68,7 @@ int rpaphp_claim_resource(struct pci_dev root ? "Address space collision on" : "No parent found for", resource, dtype, pci_name(dev), res->start, res->end); + dump_stack(); } return err; } @@ -188,6 +194,19 @@ rpaphp_fixup_new_pci_devices(struct pci_ static int rpaphp_pci_config_bridge(struct pci_dev *dev); +static void rpaphp_eeh_add_bus_device(struct pci_bus *bus) +{ + struct pci_dev *dev; + list_for_each_entry(dev, &bus->devices, bus_list) { + eeh_add_device_late(dev); + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { + struct pci_bus *subbus = dev->subordinate; + if (bus) + rpaphp_eeh_add_bus_device (subbus); + } + } +} + /***************************************************************************** rpaphp_pci_config_slot() will configure all devices under the given slot->dn and return the the first pci_dev. 
@@ -215,6 +234,8 @@ rpaphp_pci_config_slot(struct device_nod } if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) rpaphp_pci_config_bridge(dev); + + rpaphp_eeh_add_bus_device(bus); } return dev; } @@ -223,7 +244,6 @@ static int rpaphp_pci_config_bridge(stru { u8 sec_busno; struct pci_bus *child_bus; - struct pci_dev *child_dev; dbg("Enter %s: BRIDGE dev=%s\n", __FUNCTION__, pci_name(dev)); @@ -240,11 +260,7 @@ static int rpaphp_pci_config_bridge(stru /* do pci_scan_child_bus */ pci_scan_child_bus(child_bus); - list_for_each_entry(child_dev, &child_bus->devices, bus_list) { - eeh_add_device_late(child_dev); - } - - /* fixup new pci devices without touching bus struct */ + /* Fixup new pci devices without touching bus struct */ rpaphp_fixup_new_pci_devices(child_bus, 0); /* Make the discovered devices available */ @@ -282,7 +298,7 @@ static void print_slot_pci_funcs(struct return; } #else -static void print_slot_pci_funcs(struct slot *slot) +static inline void print_slot_pci_funcs(struct slot *slot) { return; } @@ -364,7 +380,6 @@ static void rpaphp_eeh_remove_bus_device if (pdev) rpaphp_eeh_remove_bus_device(pdev); } - } return; } @@ -566,36 +581,280 @@ exit: return retval; } -struct hotplug_slot *rpaphp_find_hotplug_slot(struct pci_dev *dev) +/** + * rpaphp_search_bus_for_dev - return 1 if device is under this bus, else 0 + * @bus: the bus to search for this device. + * @dev: the pci device we are looking for. + */ +static int rpaphp_search_bus_for_dev (struct pci_bus *bus, struct pci_dev *dev) +{ + struct list_head *ln; + + if (!bus) return 0; + + for (ln = bus->devices.next; ln != &bus->devices; ln = ln->next) { + struct pci_dev *pdev = pci_dev_b(ln); + if (pdev == dev) + return 1; + if (pdev->subordinate) { + int rc; + rc = rpaphp_search_bus_for_dev (pdev->subordinate, dev); + if (rc) + return 1; + } + } + return 0; +} + +/** + * rpaphp_find_slot - find and return the slot holding the device + * @dev: pci device for which we want the slot structure. 
+ */ +static struct slot *rpaphp_find_slot(struct pci_dev *dev) { - struct list_head *tmp, *n; - struct slot *slot; + struct list_head *tmp, *n; + struct slot *slot; list_for_each_safe(tmp, n, &rpaphp_slot_head) { struct pci_bus *bus; - struct list_head *ln; slot = list_entry(tmp, struct slot, rpaphp_slot_list); - if (slot->bridge == NULL) { - if (slot->dev_type == PCI_DEV) { - printk(KERN_WARNING "PCI slot missing bridge %s %s \n", - slot->name, slot->location); - } + + /* PHB's don't have bridges. */ + if (slot->bridge == NULL) continue; - } + + /* The PCI device could be the slot itself. */ + if (slot->bridge == dev) + return slot; bus = slot->bridge->subordinate; if (!bus) { + printk (KERN_WARNING "PCI bridge is missing bus: %s %s\n", + pci_name (slot->bridge), pci_pretty_name (slot->bridge)); continue; /* should never happen? */ } - for (ln = bus->devices.next; ln != &bus->devices; ln = ln->next) { - struct pci_dev *pdev = pci_dev_b(ln); - if (pdev == dev) - return slot->hotplug_slot; - } + + if (rpaphp_search_bus_for_dev (bus, dev)) + return slot; } + return NULL; +} + +/** get_phb_of_device -- find the pci controller for the device + * @dev the pci device + * This routine returns a pointer to the device node that + * describes the pci controller for the indicated slot. + */ +static struct device_node * +get_phb_of_device (struct pci_dev *dev) +{ + struct device_node *dn; + struct pci_bus *bus; + + while (1) { + bus = dev->bus; + if (!bus) + break; + dn = pci_bus_to_OF_node(bus); + + if (dn->phb) + return dn; + + dev = bus->self; + BUG_ON (dev==NULL); + if (dev == NULL) + return NULL; + } return NULL; } -EXPORT_SYMBOL_GPL(rpaphp_find_hotplug_slot); +/* ------------------------------------------------------- */ +/** + * handle_eeh_events -- reset a PCI device after hard lockup. 
+ * + * pSeries systems will isolate a PCI slot if the PCI-Host + * bridge detects address or data parity errors, DMA's + * occuring to wild addresses (which usually happen due to + * bugs in device drivers or in PCI adapter firmware). + * Slot isolations also occur if #SERR, #PERR or other misc + * PCI-related errors are detected. + * + * Recovery process consists of unplugging the device driver + * (which generated hotplug events to userspace), then issuing + * a PCI #RST to the device, then reconfiguring the PCI config + * space for all bridges & devices under this slot, and then + * finally restarting the device drivers (which cause a second + * set of hotplug events to go out to userspace). + */ + +int eeh_reset_device (struct pci_dev *dev, struct device_node *dn, int reconfig) +{ + struct slot *frozen_slot= NULL; + + if (!dev) + return 1; + + if (reconfig) + frozen_slot = rpaphp_find_slot(dev); + + if (reconfig && frozen_slot) rpaphp_unconfig_pci_adapter (frozen_slot); + + /* Reset the pci controller. (Asserts RST#; resets config space). + * Reconfigure bridges and devices */ + rtas_set_slot_reset (dn->child); + rtas_configure_bridge(dn); + eeh_restore_bars(dn->child); +printk ("duude, post restore bars, for %s here's the dump\n", dn->full_name); +{ +extern int rtas_read_config(struct device_node *dn, int where, int size, u32 *val); +int i, rc; +u32 val; +struct device_node *xn=dn->child; +for(i=0;i<16;i++) { +rc = rtas_read_config (xn, i*4,4,&val); +printk ("duude read config %d rc=%d val=%x expect=%x\n", i, rc, val,xn->config_space[i]); +}} + + enable_irq (dev->irq); + + /* Give the system 5 seconds to finish running the user-space + * hotplug scripts, e.g. ifdown for ethernet. Yes, this is a hack, + * but if we don't do this, weird things happen. + */ + if (reconfig && frozen_slot) { + ssleep (5); + rpaphp_enable_pci_slot (frozen_slot); + } + return 0; +} + +/* The longest amount of time to wait for a pci device + * to come back on line, in seconds. 
+ */ +#define MAX_WAIT_FOR_RECOVERY 15 + +int handle_eeh_events (struct notifier_block *self, + unsigned long reason, void *ev) +{ + int freeze_count=0; + struct device_node *frozen_device; + struct peh_event *event = ev; + struct pci_dev *dev = event->dev; + int perm_failure = 0; + int rc; + + if (!dev) + { + printk ("EEH: EEH error caught, but no PCI device specified!\n"); + return 1; + } + + frozen_device = get_phb_of_device (dev); + + if (!frozen_device) + { + printk (KERN_ERR "EEH: Cannot find PCI conroller for %s %s\n", + pci_name(dev), pci_pretty_name (dev)); + + return 1; + } + + /* We get "permanent failure" messages on empty slots. + * These are false alarms. Empty slots have no child dn. */ + if ((event->state == pci_channel_io_perm_failure) && (frozen_device == NULL)) + return 0; + + if (frozen_device) + freeze_count = frozen_device->eeh_freeze_count; + freeze_count ++; + if (freeze_count > EEH_MAX_ALLOWED_FREEZES) + perm_failure = 1; + + /* If the reset state is a '5' and the time to reset is 0 (infinity) + * or is more then 15 seconds, then mark this as a permanent failure. + */ + if ((event->state == pci_channel_io_perm_failure) && + ((event->time_unavail <= 0) || + (event->time_unavail > MAX_WAIT_FOR_RECOVERY*1000))) + perm_failure = 1; + + /* Log the error with the rtas logger. */ + if (perm_failure) { + /* + * About 90% of all real-life EEH failures in the field + * are due to poorly seated PCI cards. Only 10% or so are + * due to actual, failed cards. + */ + printk (KERN_ERR + "EEH: device %s:%s has failed %d times \n" + "and has been permanently disabled. Please try reseating\n" + "this device or replacing it.\n", + pci_name (dev), + pci_pretty_name (dev), + freeze_count); + + eeh_slot_error_detail (frozen_device, 2 /* Permanent Error */); + + /* Notify the device that its about to go down. 
*/ + /* XXX this should be a recursive walk to children for + * multi-function devices */ + if (dev->driver->err_handler.error_detected) { + dev->driver->err_handler.error_detected (dev, pci_channel_io_perm_failure); + } + + /* If there's a hotplug slot, unconfigure it */ + struct slot * frozen_slot = rpaphp_find_slot(dev); + rpaphp_unconfig_pci_adapter (frozen_slot); + return 1; + } else { + eeh_slot_error_detail (frozen_device, 1 /* Temporary Error */); + } + + printk (KERN_WARNING + "EEH: This device has failed %d times since last reboot: %s:%s\n", + freeze_count, + pci_name (dev), + pci_pretty_name (dev)); + + /* Walk the various device drivers attached to this slot through + * a reset sequence, giving each an opportunity to do what it needs + * to accomplish the reset */ + /* XXX this should be a recursive walk to children for + * multi-function devices; each child should get to report + * status too, if needed ... if any child can't handle the reset, + * then need to hotplug it. + * XXX This does not follow flow of BenH's last email at all. + * XXX will be fixed later XXX + */ + if (dev->driver->err_handler.error_detected) { + dev->driver->err_handler.error_detected (dev, pci_channel_io_frozen); + rc = eeh_reset_device (dev, frozen_device, 0); + if (dev->driver->err_handler.slot_reset) + dev->driver->err_handler.slot_reset (dev); + } else { + rc = eeh_reset_device (dev, frozen_device, 1); + } + + /* Store the freeze count with the pci adapter, and not the slot. + * This way, if the device is replaced, the count is cleared. 
+ */ + frozen_device->eeh_freeze_count = freeze_count; + + return rc; +} + +static struct notifier_block eeh_block; + +void __init init_eeh_handler (void) +{ + eeh_block.notifier_call = handle_eeh_events; + peh_register_notifier (&eeh_block); +} + +void __exit exit_eeh_handler (void) +{ + peh_unregister_notifier (&eeh_block); +} + --- kernel/printk.c.linas-orig 2005-04-29 20:32:46.000000000 -0500 +++ kernel/printk.c 2005-05-06 12:28:43.000000000 -0500 @@ -383,6 +383,23 @@ asmlinkage long sys_syslog(int type, cha return do_syslog(type, buf, len); } +#ifdef CONFIG_DEBUG_KERNEL +/** + * Its very handy to be able to view the syslog buffer during debug. + * But do_syslog() uses locks and so it will deadlock if called during + * a debugging session. The routine provides the start and end of the + * physical and logical logs, and is equivalent to do_syslog(3). + */ + +void debugger_syslog_data(char *syslog_data[4]) +{ + syslog_data[0] = log_buf; + syslog_data[1] = log_buf + __LOG_BUF_LEN; + syslog_data[2] = log_buf + log_end - (logged_chars < __LOG_BUF_LEN ? 
logged_chars : __LOG_BUF_LEN); + syslog_data[3] = log_buf + log_end; +} +#endif /* CONFIG_DEBUG_KERNEL */ + /* * Call the console drivers on a range of log_buf */ --- arch/ppc64/xmon/xmon.c.linas-orig 2005-04-29 20:31:03.000000000 -0500 +++ arch/ppc64/xmon/xmon.c 2005-05-06 12:28:43.000000000 -0500 @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -100,6 +101,7 @@ static void prdump(unsigned long, long); static int ppc_inst_dump(unsigned long, long, int); void print_address(unsigned long); static void backtrace(struct pt_regs *); +static void xmon_show_stack(unsigned long sp, unsigned long lr, unsigned long pc); static void excprint(struct pt_regs *); static void prregs(struct pt_regs *); static void memops(int); @@ -131,6 +133,7 @@ static void csum(void); static void bootcmds(void); void dump_segments(void); static void symbol_lookup(void); +static void xmon_show_dmesg(void); static void xmon_print_symbol(unsigned long address, const char *mid, const char *after); static const char *getvecname(unsigned long vec); @@ -170,6 +173,7 @@ Commands:\n\ #endif "\ C checksum\n\ + D show dmesg (printk) buffer\n\ d dump bytes\n\ di dump instructions\n\ df dump float values\n\ @@ -186,6 +190,7 @@ Commands:\n\ mz zero a block of memory\n\ mi show information about memory allocation\n\ p show the task list\n\ + P show the task list and stacks\n\ r print registers\n\ s single step\n\ S print special registers\n\ @@ -310,6 +315,7 @@ int xmon_core(struct pt_regs *regs, int #endif msr = get_msr(); + msr |= MSR_SF|MSR_IR|MSR_DR; set_msrd(msr & ~MSR_EE); /* disable interrupts */ bp = in_breakpoint_table(regs->nip, &offset); @@ -323,15 +329,39 @@ int xmon_core(struct pt_regs *regs, int #ifdef CONFIG_SMP cpu = smp_processor_id(); if (cpu_isset(cpu, cpus_in_xmon)) { + int recursive = 1; get_output_lock(); excprint(regs); printf("cpu 0x%x: Exception %lx %s in xmon, " "returning to main loop\n", cpu, regs->trap, getvecname(TRAP(regs))); - 
longjmp(xmon_fault_jmp[cpu], 1); + + /* If crash occured in firmware, then saved stack pointer + * is bad, and we get recursive fault. Switch to using + * emergency stack in this case. + */ + unsigned long *sp = ((unsigned long *) xmon_fault_jmp[cpu]) + 1; + if (*sp < 0xc000000000000000) + { + printf("Bad stack pointer %lx in xmon, using emergency stack\n", *sp); + *sp = (unsigned long ) (get_paca()->emergency_sp); + sp = (unsigned long *) *sp; + *sp = (unsigned long ) (get_paca()->emergency_sp); + recursive = -1; + } + sp = (unsigned long *) *sp; + if (*sp < 0xc000000000000000) + { + printf("Bad stack frame %lx in xmon, using emergency stack\n", *sp); + *sp = (unsigned long ) (get_paca()->emergency_sp); + recursive = -1; + } +printf ("duude planing on returning with setjmp=%p\n", xmon_fault_jmp[cpu]); +printf ("duude planing on returning to %p w/stack=%p or %p\n", xmon_fault_jmp[cpu][0], sp, xmon_fault_jmp[cpu][1]); + longjmp(xmon_fault_jmp[cpu], recursive); } - if (setjmp(recurse_jmp) != 0) { + if (setjmp(recurse_jmp) > 0) { if (!in_xmon || !xmon_gate) { printf("xmon: WARNING: bad recursive fault " "on cpu 0x%x\n", cpu); @@ -353,6 +383,11 @@ int xmon_core(struct pt_regs *regs, int if (!fromipi) { get_output_lock(); excprint(regs); +printf ("duude this was a normal entry\n"); +printf ("duude saved return addr=%p, saves stackp=%p stack=%p\n", recurse_jmp[0], recurse_jmp[1], *((long **)(recurse_jmp[1]))); +printf ("duude my stack really really is %p\n", &msr); +printf ("duude my my setjmp is %p\n", recurse_jmp); + if (bp) { printf("cpu 0x%x stopped at breakpoint 0x%x (", cpu, BP_NUM(bp)); @@ -386,7 +421,7 @@ int xmon_core(struct pt_regs *regs, int smp_send_debugger_break(MSG_ALL_BUT_SELF); /* wait for other cpus to come in */ for (timeout = 100000000; timeout != 0; --timeout) { - if (cpus_weight(cpus_in_xmon) >= ncpus) + if (cpus_weight(*((cpumask_t *) &cpus_in_xmon)) >= ncpus) break; barrier(); } @@ -757,6 +792,64 @@ static void remove_cpu_bpts(void) set_iabr(0); } 
+static inline int +xmon_process_cpu(const task_t *p) +{ + return p->thread_info->cpu; +} + +#define xmon_task_has_cpu(p) (task_curr(p)) + +static void +xmon_show_task(task_t *p) +{ + printf("0x%p %8d %8d %d %4d %c 0x%p %c%s\n", + (void *)p, p->pid, p->parent->pid, + xmon_task_has_cpu(p), xmon_process_cpu(p), + (p->state == 0) ? 'R' : + (p->state < 0) ? 'U' : + (p->state & TASK_UNINTERRUPTIBLE) ? 'D' : + (p->state & TASK_STOPPED || p->ptrace & PT_PTRACED) ? 'T' : + (p->state & EXIT_ZOMBIE) ? 'Z' : + (p->state & EXIT_DEAD) ? 'X' : + (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?', + (void *)(&p->thread), + (p == current) ? '*': ' ', + p->comm); +} + +static task_t *xmon_next_thread(const task_t *p) +{ + return pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID); +} + +static void +xmon_show_state(int prt_stacks) +{ + task_t *g, *p; + + printf("%-*s Pid Parent [*] cpu State %-*s Command\n", + (int)(2*sizeof(void *))+2, "Task Addr", + (int)(2*sizeof(void *))+2, "Thread"); + +#ifdef PER_CPU_RUNQUEUES_NO_LONGER_DECLARED_STATIC_IN_SCHED_C + /* Run the active tasks first */ + for (cpu = 0; cpu < NR_CPUS; ++cpu) + if (cpu_online(cpu)) { + p = cpu_curr(cpu); + xmon_show_task(p); + } +#endif + + /* Now the real tasks */ + do_each_thread(g, p) { + xmon_show_task(p); + if (prt_stacks) + xmon_show_stack(p->thread.ksp, 0, 0); + } while ((p = xmon_next_thread(p)) != g); +} + + /* Command interpreting routine */ static char *last_cmd; @@ -809,6 +902,9 @@ cmds(struct pt_regs *excp) case 'd': dump(); break; + case 'D': + xmon_show_dmesg(); + break; case 'l': symbol_lookup(); break; @@ -839,7 +935,10 @@ cmds(struct pt_regs *excp) printf(help_string); break; case 'p': - show_state(); + xmon_show_state(0); + break; + case 'P': + xmon_show_state(1); break; case 'b': bpt_cmds(); @@ -2400,6 +2499,58 @@ static void xmon_print_symbol(unsigned l printf("%s", after); } +extern void debugger_syslog_data(char *syslog_data[4]); +#define SYSLOG_WRAP(p) if (p < syslog_data[0]) p = 
syslog_data[1]-1; \ + else if (p >= syslog_data[1]) p = syslog_data[0]; + +static void xmon_show_dmesg(void) +{ + char *syslog_data[4], *start, *end, c; + int logsize; + + /* syslog_data[0,1] physical start, end+1. + * syslog_data[2,3] logical start, end+1. + */ + debugger_syslog_data(syslog_data); + if (syslog_data[2] == syslog_data[3]) + return; + logsize = syslog_data[1] - syslog_data[0]; + start = syslog_data[0] + (syslog_data[2] - syslog_data[0]) % logsize; + end = syslog_data[0] + (syslog_data[3] - syslog_data[0]) % logsize; + + /* Do a line at a time (max 200 chars) to reduce overhead */ + c = '\0'; + while(1) { + char *p; + int chars = 0; + if (!*start) { + while (!*start) { + ++start; + SYSLOG_WRAP(start); + if (start == end) + break; + } + if (start == end) + break; + } + p = start; + while (*start && chars < 200) { + c = *start; + ++chars; + ++start; + SYSLOG_WRAP(start); + if (start == end || c == '\n') + break; + } + if (chars) + printf("%.*s", chars, p); + if (start == end) + break; + } + if (c != '\n') + printf("\n"); +} + static void debug_trace(void) { unsigned long val, cmd, on; From michael at ellerman.id.au Sat May 7 12:01:19 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Sat, 7 May 2005 12:01:19 +1000 Subject: [PATCH 1/3] ppc64: iseries_veth: Don't send packets to LPARs which aren't up Message-ID: <200505071201.20123.michael@ellerman.id.au> Hi Everybody, The iseries_veth driver has a logic bug which means it will erroneously send packets to LPARs for which we don't have a connection. This usually isn't a big problem because the Hypervisor call fails gracefully and we return, but if packets are TX'ed during the negotiation of the connection bad things might happen. Regardless, the right thing is to bail early if we know there's no connection. 
Signed-off-by: Michael Ellerman -- iseries_veth.c | 2 +- 1 files changed, 1 insertion(+), 1 deletion(-) Index: veth-fixes/drivers/net/iseries_veth.c =================================================================== --- veth-fixes.orig/drivers/net/iseries_veth.c +++ veth-fixes/drivers/net/iseries_veth.c @@ -924,7 +924,7 @@ static int veth_transmit_to_one(struct s spin_lock_irqsave(&cnx->lock, flags); - if (! cnx->state & VETH_STATE_READY) + if (! (cnx->state & VETH_STATE_READY)) goto drop; if ((skb->len - 14) > VETH_MAX_MTU) From michael at ellerman.id.au Sat May 7 12:01:25 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Sat, 7 May 2005 12:01:25 +1000 Subject: [PATCH 2/3] ppc64: iseries_veth: Set dev->trans_start so watchdog timer works right Message-ID: <200505071201.25357.michael@ellerman.id.au> Hi Everybody, The iseries_veth driver doesn't set dev->trans_start in its TX path. This will cause the net device watchdog timer to fire earlier than we want it to, which causes the driver to needlessly reset its connections to other LPARs. Signed-off-by: Michael Ellerman -- iseries_veth.c | 2 ++ 1 files changed, 2 insertions(+) Index: veth-fixes/drivers/net/iseries_veth.c =================================================================== --- veth-fixes.orig/drivers/net/iseries_veth.c +++ veth-fixes/drivers/net/iseries_veth.c @@ -1023,6 +1023,8 @@ static int veth_start_xmit(struct sk_buf lpmask = veth_transmit_to_many(skb, lpmask, dev); + dev->trans_start = jiffies; + if (! lpmask) { dev_kfree_skb(skb); } else { From michael at ellerman.id.au Sat May 7 12:01:33 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Sat, 7 May 2005 12:01:33 +1000 Subject: [PATCH 3/3] ppc64: iseries_veth: Don't leak skbs in RX path Message-ID: <200505071201.33881.michael@ellerman.id.au> Hi Everybody, In some strange circumstances the iseries_veth driver will leak skbs in its RX path. Fix is simply to call dev_kfree_skb() in the right place.
Fix up the comment as well. Signed-off-by: Michael Ellerman -- iseries_veth.c | 17 +++++++++++------ 1 files changed, 11 insertions(+), 6 deletions(-) Index: veth-fixes/drivers/net/iseries_veth.c =================================================================== --- veth-fixes.orig/drivers/net/iseries_veth.c +++ veth-fixes/drivers/net/iseries_veth.c @@ -1264,13 +1264,18 @@ static void veth_receive(struct veth_lpa vlan = skb->data[9]; dev = veth_dev[vlan]; - if (! dev) - /* Some earlier versions of the driver sent - broadcasts down all connections, even to - lpars that weren't on the relevant vlan. - So ignore packets belonging to a vlan we're - not on. */ + if (! dev) { + /* + * Some earlier versions of the driver sent + * broadcasts down all connections, even to lpars + * that weren't on the relevant vlan. So ignore + * packets belonging to a vlan we're not on. + * We can also be here if we receive packets while + * the driver is going down, because then dev is NULL. + */ + dev_kfree_skb_irq(skb); continue; + } port = (struct veth_port *)dev->priv; dest = *((u64 *) skb->data) & 0xFFFFFFFFFFFF0000; From david at gibson.dropbear.id.au Sat May 7 13:37:13 2005 From: david at gibson.dropbear.id.au (David Gibson) Date: Sat, 7 May 2005 13:37:13 +1000 Subject: Patch to kill ioremap_mm In-Reply-To: <1115310090.6011.21.camel@sinatra.austin.ibm.com> References: <20050505014256.GE18270@localhost.localdomain> <1115310090.6011.21.camel@sinatra.austin.ibm.com> Message-ID: <20050507033712.GC19538@localhost.localdomain> On Thu, May 05, 2005 at 11:21:30AM -0500, John Rose wrote: > > > Hi David- > > Given that we use a separate allocation scheme for imalloc mappings, > does it make sense to lump these into the vmalloc mm_struct, and to > share the vmalloc address space? This saves lines of code, but is it as > clear as the existing (separate) layout? In a word: yes. As Ben's explained this can be a first step to greatly simplifying the imalloc stuff. 
But even without that, there's absolutely no good reason to have two different sets of pagetables for the kernel unlike every other architecture. init_mm provides a perfectly good virtual memory space, let's use it, instead of pointlessly duplicating stuff and complexifying the hash path. -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/people/dgibson From arnd at arndb.de Sat May 7 21:31:52 2005 From: arnd at arndb.de (Arnd Bergmann) Date: Sat, 7 May 2005 13:31:52 +0200 Subject: [PATCH 1/4] ppc64: rename arch/ppc64/kernel/pSeries_pci.c In-Reply-To: <17018.64606.662481.104228@cargo.ozlabs.ibm.com> References: <200504200149.22063.arnd@arndb.de> <200504200152.58965.arnd@arndb.de> <17018.64606.662481.104228@cargo.ozlabs.ibm.com> Message-ID: <200505071331.53944.arnd@arndb.de> On Freedag 06 Mai 2005 07:10, Paul Mackerras wrote: > Hmmm, you rename pSeries_pci.c to rtas_pci.c and then in the next > patch you recreate pSeries_pci.c and move some stuff from rtas_pci.c > into it. Could we have one patch that creates rtas_pci.c and just > moves stuff from pSeries_pci.c to it? Sure. I wanted to make it easier to review as the rename patch is trivial and the second patch is less to read than the combined one. In my next update I will fold the two together.
Arnd <>< From olh at suse.de Sun May 8 03:04:34 2005 From: olh at suse.de (Olaf Hering) Date: Sat, 7 May 2005 19:04:34 +0200 Subject: [PATCH] remove unused arch/ppc64/boot/piggyback.c Message-ID: <20050507170434.GA25407@suse.de> piggyback is not called in arch/ppc64/boot/Makefile Signed-off-by: Olaf Hering Index: linux-2.6.12-rc4-olh/arch/ppc64/boot/Makefile =================================================================== --- linux-2.6.12-rc4-olh.orig/arch/ppc64/boot/Makefile +++ linux-2.6.12-rc4-olh/arch/ppc64/boot/Makefile @@ -52,7 +52,7 @@ obj-sec = $(foreach section, $(1), $(pat src-sec = $(foreach section, $(1), $(patsubst %,$(obj)/kernel-%.c, $(section))) gz-sec = $(foreach section, $(1), $(patsubst %,$(obj)/kernel-%.gz, $(section))) -hostprogs-y := piggy addnote addRamDisk +hostprogs-y := addnote addRamDisk targets += zImage zImage.initrd imagesize.c \ $(patsubst $(obj)/%,%, $(call obj-sec, $(required) $(initrd))) \ $(patsubst $(obj)/%,%, $(call src-sec, $(required) $(initrd))) \ @@ -78,9 +78,6 @@ addsection = $(CROSS32OBJCOPY) $(1) \ quiet_cmd_addnote = ADDNOTE $@ cmd_addnote = $(CROSS32LD) $(BOOTLFLAGS) -o $@ $(obj-boot) && $(obj)/addnote $@ -quiet_cmd_piggy = PIGGY $@ - cmd_piggy = $(obj)/piggyback $(@:.o=) < $< | $(CROSS32AS) -o $@ - $(call gz-sec, $(required)): $(obj)/kernel-%.gz: % FORCE $(call if_changed,gzip) Index: linux-2.6.12-rc4-olh/arch/ppc64/boot/piggyback.c =================================================================== --- linux-2.6.12-rc4-olh.orig/arch/ppc64/boot/piggyback.c +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright 2001 IBM Corp - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ -#include -#include -#include - -extern long ce_exec_config[]; - -int main(int argc, char *argv[]) -{ - int i, cnt, pos, len; - unsigned int cksum, val; - unsigned char *lp; - unsigned char buf[8192]; - char *varname; - if (argc != 2) - { - fprintf(stderr, "usage: %s name out-file\n", - argv[0]); - exit(1); - } - - varname = strrchr(argv[1], '/'); - if (varname) - varname++; - else - varname = argv[1]; - - fprintf(stdout, "#\n"); - fprintf(stdout, "# Miscellaneous data structures:\n"); - fprintf(stdout, "# WARNING - this file is automatically generated!\n"); - fprintf(stdout, "#\n"); - fprintf(stdout, "\n"); - fprintf(stdout, "\t.data\n"); - fprintf(stdout, "\t.globl %s_data\n", varname); - fprintf(stdout, "%s_data:\n", varname); - pos = 0; - cksum = 0; - while ((len = read(0, buf, sizeof(buf))) > 0) - { - cnt = 0; - lp = (unsigned char *)buf; - len = (len + 3) & ~3; /* Round up to longwords */ - for (i = 0; i < len; i += 4) - { - if (cnt == 0) - { - fprintf(stdout, "\t.long\t"); - } - fprintf(stdout, "0x%02X%02X%02X%02X", lp[0], lp[1], lp[2], lp[3]); - val = *(unsigned long *)lp; - cksum ^= val; - lp += 4; - if (++cnt == 4) - { - cnt = 0; - fprintf(stdout, " # %x \n", pos+i-12); - fflush(stdout); - } else - { - fprintf(stdout, ","); - } - } - if (cnt) - { - fprintf(stdout, "0\n"); - } - pos += len; - } - fprintf(stdout, "\t.globl %s_len\n", varname); - fprintf(stdout, "%s_len:\t.long\t0x%x\n", varname, pos); - fflush(stdout); - fclose(stdout); - fprintf(stderr, "cksum = %x\n", cksum); - exit(0); -} - From olh at suse.de Sun May 8 03:05:51 2005 From: olh at suse.de (Olaf Hering) Date: Sat, 7 May 2005 19:05:51 +0200 Subject: [PATCH] remove unused arch/ppc64/boot/mknote.c Message-ID: <20050507170551.GB25407@suse.de> mknote is not called in arch/ppc64/boot/Makefile Signed-off-by: Olaf Hering Index: linux-2.6.12-rc4-olh/arch/ppc64/boot/mknote.c =================================================================== --- 
linux-2.6.12-rc4-olh.orig/arch/ppc64/boot/mknote.c +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (C) Cort Dougan 1999. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Generate a note section as per the CHRP specification. - * - */ - -#include - -#define PL(x) printf("%c%c%c%c", ((x)>>24)&0xff, ((x)>>16)&0xff, ((x)>>8)&0xff, (x)&0xff ); - -int main(void) -{ -/* header */ - /* namesz */ - PL(strlen("PowerPC")+1); - /* descrsz */ - PL(6*4); - /* type */ - PL(0x1275); - /* name */ - printf("PowerPC"); printf("%c", 0); - -/* descriptor */ - /* real-mode */ - PL(0xffffffff); - /* real-base */ - PL(0x00c00000); - /* real-size */ - PL(0xffffffff); - /* virt-base */ - PL(0xffffffff); - /* virt-size */ - PL(0xffffffff); - /* load-base */ - PL(0x4000); - return 0; -} From markus at unixforces.net Sun May 8 03:09:04 2005 From: markus at unixforces.net (Markus Rothe) Date: Sat, 7 May 2005 17:09:04 +0000 Subject: [BUG] linux-2.6.12_rc4: Oops: Kernel access of bad area, sig: 11 [#1] Message-ID: <20050507170904.GA9488@unixforces.net> Hello, I'm running Linux on my G5. 
I just compiled linux-2.6.12_rc4 and got this when loading my sound modules at boot time (linux-2.6.12_rc3 worked just fine): ---- SNIP ---- Oops: Kernel access of bad area, sig: 11 [#1] SMP NR_CPUS=2 POWERMAC Modules linked in: snd_powermac snd_pcm snd_page_alloc snd_timer snd soundcore NIP: C0000000002E4030 XER: 20000000 LR: D0000000001B4AC8 CTR: C0000000002E4004 REGS: c000000237783890 TRAP: 0300 Not tainted (2.6.12-rc4) MSR: 9000000000009032 EE: 1 PR: 0 FP: 0 ME: 1 IR/DR: 11 CR: 22022484 DAR: 0000000000000002 DSISR: 0000000040000000 TASK: c0000002375ba030[7007] 'modprobe' THREAD: c000000237780000 CPU: 0 GPR00: D0000000001B4AC8 C000000237783B10 C0000000005A5E70 0000000000000000 GPR04: 0000000000000001 0000000000000060 0000000000000000 0000000000000001 GPR08: 0000000000000002 0000000000000000 C00000000047AAC0 C0000000002E4004 GPR12: D0000000001B9C58 C00000000047B800 0000000000000000 0000000000000000 GPR16: 0000000000000000 0000000010028790 0000000000000000 000000001002A4A0 GPR20: 0000000000000000 0000000000000000 0000000000000000 00000080001B1010 GPR24: 000000001002AA90 000000001002ABF8 000000001002ABE0 C000000000478958 GPR28: C00000000F70D580 000000000000000A D0000000001CACD8 D0000000001C137C NIP [c0000000002e4030] .i2c_smbus_write_byte_data+0x2c/0x50 LR [d0000000001b4ac8] .send_init_client+0x50/0x110 [snd_powermac] Call Trace: [c000000237783b10] [d0000000001c2030] tumbler_mixers+0x0/0xffffffffffff8b28 [snd_powermac] (unreliable) [c000000237783bc0] [d0000000001b4ac8] .send_init_client+0x50/0x110 [snd_powermac] [c000000237783c60] [d0000000001b9514] .snd_pmac_tumbler_post_init+0x3c/0x94 [snd_powermac] [c000000237783ce0] [d0000000001b74fc] .alsa_card_pmac_init+0x174/0x3cc [snd_powermac] [c000000237783d90] [c000000000063988] .sys_init_module+0x2cc/0x4a8 [c000000237783e30] [c00000000000d980] syscall_exit+0x0/0x18 Instruction dump: 4e800020 7c0802a6 7c691b78 7c872378 38c00000 39000002 f8010010 f821ff51 98a10070 60000000 60000000 60000000 a0890006 e8630008 39210070 
<6>usbcore: registered new driver snd-usb-audio ---- SNIP ---- Could someone please take a look at that? Best regards, Markus -------------- next part -------------- A non-text attachment was scrubbed... Name: not available Type: application/pgp-signature Size: 189 bytes Desc: not available Url : http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20050507/1305e517/attachment.pgp From olh at suse.de Sun May 8 07:24:49 2005 From: olh at suse.de (Olaf Hering) Date: Sat, 7 May 2005 23:24:49 +0200 Subject: missing deps in arch/ppc64/boot Message-ID: <20050507212449.GA26741@suse.de> Sam, touching arch/ppc64/boot/zlib.h will not cause a rebuild of arch/ppc64/boot/zlib.o. Any ideas what is missing? I use 'make ARCH=ppc64 O=../O-2.6.12-rc4-ppc64-defconfig-boot' ../O-2.6.12-rc4-ppc64-defconfig-boot/arch/ppc64/boot/.zlib.o.cmd cmd_arch/ppc64/boot/zlib.o := gcc -m32 -Wp,-MD,arch/ppc64/boot/.zlib.o.d -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer -I/home/olaf/kernel/linux-2.6.12-rc4-olh/arch/ppc64/boot/include -fno-builtin -nostdinc -isystem /usr/lib/gcc-lib/powerpc-suse-linux/3.3.3/include -c -o arch/ppc64/boot/zlib.o /home/olaf/kernel/linux-2.6.12-rc4-olh/arch/ppc64/boot/zlib.c deps_arch/ppc64/boot/zlib.o := \ /home/olaf/kernel/linux-2.6.12-rc4-olh/arch/ppc64/boot/zlib.c \ /home/olaf/kernel/linux-2.6.12-rc4-olh/arch/ppc64/boot/zlib.h \ arch/ppc64/boot/zlib.o: $(deps_arch/ppc64/boot/zlib.o) $(deps_arch/ppc64/boot/zlib.o): From olh at suse.de Sun May 8 07:50:11 2005 From: olh at suse.de (Olaf Hering) Date: Sat, 7 May 2005 23:50:11 +0200 Subject: [PATCH] remove printk usage in arch/ppc64/boot/prom.c Message-ID: <20050507215011.GA26918@suse.de> remove the printk usage in the zImage. we are not there, yet. 
Signed-off-by: Olaf Hering Index: linux-2.6.12-rc4-olh/arch/ppc64/boot/main.c =================================================================== --- linux-2.6.12-rc4-olh.orig/arch/ppc64/boot/main.c +++ linux-2.6.12-rc4-olh/arch/ppc64/boot/main.c @@ -17,7 +17,6 @@ extern void *finddevice(const char *); extern int getprop(void *, const char *, void *, int); -extern void printk(char *fmt, ...); extern void printf(const char *fmt, ...); extern int sprintf(char *buf, const char *fmt, ...); void gunzip(void *, int, unsigned char *, int *); Index: linux-2.6.12-rc4-olh/arch/ppc64/boot/prom.c =================================================================== --- linux-2.6.12-rc4-olh.orig/arch/ppc64/boot/prom.c +++ linux-2.6.12-rc4-olh/arch/ppc64/boot/prom.c @@ -23,7 +23,7 @@ void *finddevice(const char *name); int getprop(void *phandle, const char *name, void *buf, int buflen); void chrpboot(int a1, int a2, void *prom); /* in main.c */ -void printk(char *fmt, ...); +int printf(char *fmt, ...); /* there is no convenient header to get this from... -- paulus */ extern unsigned long strlen(const char *); @@ -203,7 +203,7 @@ readchar(void) case 1: return ch; case -1: - printk("read(stdin) returned -1\r\n"); + printf("read(stdin) returned -1\r\n"); return -1; } } @@ -611,18 +611,6 @@ int sprintf(char * buf, const char *fmt, static char sprint_buf[1024]; -void -printk(char *fmt, ...) -{ - va_list args; - int n; - - va_start(args, fmt); - n = vsprintf(sprint_buf, fmt, args); - va_end(args); - write(stdout, sprint_buf, n); -} - int printf(char *fmt, ...) { From olh at suse.de Sun May 8 07:52:20 2005 From: olh at suse.de (Olaf Hering) Date: Sat, 7 May 2005 23:52:20 +0200 Subject: [PATCH] remove duplicate printf in arch/ppc64/boot/main.c Message-ID: <20050507215220.GB26918@suse.de> initrd size is printed as hex, add a missing 0x remove a duplicate printf when initrd is used. remove use of kernel type to access the first bytes of the initrd memarea. 
Signed-off-by: Olaf Hering Index: linux-2.6.12-rc4-olh/arch/ppc64/boot/main.c =================================================================== --- linux-2.6.12-rc4-olh.orig/arch/ppc64/boot/main.c +++ linux-2.6.12-rc4-olh/arch/ppc64/boot/main.c @@ -146,10 +146,10 @@ void start(unsigned long a1, unsigned lo } a1 = initrd.addr; a2 = initrd.size; - printf("initial ramdisk moving 0x%lx <- 0x%lx (%lx bytes)\n\r", + printf("initial ramdisk moving 0x%lx <- 0x%lx (0x%lx bytes)\n\r", initrd.addr, (unsigned long)_initrd_start, initrd.size); memmove((void *)initrd.addr, (void *)_initrd_start, initrd.size); - printf("initrd head: 0x%lx\n\r", *((u32 *)initrd.addr)); + printf("initrd head: 0x%lx\n\r", *((unsigned long *)initrd.addr)); } /* Eventually gunzip the kernel */ @@ -200,9 +200,6 @@ void start(unsigned long a1, unsigned lo flush_cache((void *)vmlinux.addr, vmlinux.size); - if (a1) - printf("initrd head: 0x%lx\n\r", *((u32 *)initrd.addr)); - kernel_entry = (kernel_entry_t)vmlinux.addr; #ifdef DEBUG printf( "kernel:\n\r" From olh at suse.de Sun May 8 07:58:01 2005 From: olh at suse.de (Olaf Hering) Date: Sat, 7 May 2005 23:58:01 +0200 Subject: [PATCH] make arch/ppc64/boot standalone Message-ID: <20050507215801.GC26918@suse.de> make the bootheader for ppc64 independent from kernel and libc headers add -nostdinc -isystem $gccincludes to not include libc headers declare all functions in header files, also the stuff from string.S declare some functions static use stddef.h to get size_t (hopefully ok) remove ppc32-types.h, only elf.h used the __NN types Signed-off-by: Olaf Hering arch/ppc64/boot/ppc32-types.h | 36 ------ arch/ppc64/boot/Makefile | 4 arch/ppc64/boot/crt0.S | 2 arch/ppc64/boot/div64.S | 2 arch/ppc64/boot/include/ctype.h | 54 +++++++++ arch/ppc64/boot/include/elf.h | 149 ++++++++++++++++++++++++ arch/ppc64/boot/include/page.h | 34 +++++ arch/ppc64/boot/include/ppc_asm.h | 228 ++++++++++++++++++++++++++++++++++++++ arch/ppc64/boot/include/prom.h | 18 +++ 
arch/ppc64/boot/include/stdio.h | 16 ++ arch/ppc64/boot/include/string.h | 20 +++ arch/ppc64/boot/main.c | 55 +++------ arch/ppc64/boot/prom.c | 25 +--- arch/ppc64/boot/string.S | 2 14 files changed, 552 insertions(+), 93 deletions(-) Index: linux-2.6.12-rc4-olh/arch/ppc64/boot/include/prom.h =================================================================== --- /dev/null +++ linux-2.6.12-rc4-olh/arch/ppc64/boot/include/prom.h @@ -0,0 +1,18 @@ +#ifndef _PPC_BOOT_PROM_H_ +#define _PPC_BOOT_PROM_H_ + +extern int (*prom) (void *); +extern void *chosen_handle; + +extern void *stdin; +extern void *stdout; +extern void *stderr; + +extern int write(void *handle, void *ptr, int nb); +extern int read(void *handle, void *ptr, int nb); +extern void exit(void); +extern void pause(void); +extern void *finddevice(const char *); +extern void *claim(unsigned long virt, unsigned long size, unsigned long align); +extern int getprop(void *phandle, const char *name, void *buf, int buflen); +#endif /* _PPC_BOOT_PROM_H_ */ Index: linux-2.6.12-rc4-olh/arch/ppc64/boot/include/stdio.h =================================================================== --- /dev/null +++ linux-2.6.12-rc4-olh/arch/ppc64/boot/include/stdio.h @@ -0,0 +1,16 @@ +#ifndef _PPC_BOOT_STDIO_H_ +#define _PPC_BOOT_STDIO_H_ + +extern int printf(const char *fmt, ...); + +extern int sprintf(char *buf, const char *fmt, ...); + +extern int vsprintf(char *buf, const char *fmt, va_list args); + +extern int putc(int c, void *f); +extern int putchar(int c); +extern int getchar(void); + +extern int fputs(char *str, void *f); + +#endif /* _PPC_BOOT_STDIO_H_ */ Index: linux-2.6.12-rc4-olh/arch/ppc64/boot/include/string.h =================================================================== --- /dev/null +++ linux-2.6.12-rc4-olh/arch/ppc64/boot/include/string.h @@ -0,0 +1,20 @@ +#ifndef _PPC_BOOT_STRING_H_ +#define _PPC_BOOT_STRING_H_ + +extern char *strcpy(char *dest, const char *src); +extern char *strncpy(char *dest, const char 
*src, size_t n); +extern char *strcat(char *dest, const char *src); +extern int strcmp(const char *s1, const char *s2); +extern size_t strlen(const char *s); +extern size_t strnlen(const char *s, size_t count); + +extern unsigned long simple_strtoul(const char *cp, char **endp, + unsigned int base); +extern long simple_strtol(const char *cp, char **endp, unsigned int base); + +extern void *memset(void *s, int c, size_t n); +extern void *memmove(void *dest, const void *src, unsigned long n); +extern void *memcpy(void *dest, const void *src, unsigned long n); +extern int memcmp(const void *s1, const void *s2, size_t n); + +#endif /* _PPC_BOOT_STRING_H_ */ Index: linux-2.6.12-rc4-olh/arch/ppc64/boot/main.c =================================================================== --- linux-2.6.12-rc4-olh.orig/arch/ppc64/boot/main.c +++ linux-2.6.12-rc4-olh/arch/ppc64/boot/main.c @@ -8,36 +8,28 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. 
*/ -#include "ppc32-types.h" +#include +#include +#include +#include +#include +#include +#include #include "zlib.h" -#include -#include -#include -#include - -extern void *finddevice(const char *); -extern int getprop(void *, const char *, void *, int); -extern void printf(const char *fmt, ...); -extern int sprintf(char *buf, const char *fmt, ...); -void gunzip(void *, int, unsigned char *, int *); -void *claim(unsigned int, unsigned int, unsigned int); -void flush_cache(void *, unsigned long); -void pause(void); -extern void exit(void); - -unsigned long strlen(const char *s); -void *memmove(void *dest, const void *src, unsigned long n); -void *memcpy(void *dest, const void *src, unsigned long n); + +static void gunzip(void *, int, unsigned char *, int *); +extern void flush_cache(void *, unsigned long); + /* Value picked to match that used by yaboot */ #define PROG_START 0x01400000 #define RAM_END (256<<20) // Fixme: use OF */ -char *avail_ram; -char *begin_avail, *end_avail; -char *avail_high; -unsigned int heap_use; -unsigned int heap_max; +static char *avail_ram; +static char *begin_avail, *end_avail; +static char *avail_high; +static unsigned int heap_use; +static unsigned int heap_max; extern char _start[]; extern char _vmlinux_start[]; @@ -52,9 +44,9 @@ struct addr_range { unsigned long size; unsigned long memsize; }; -struct addr_range vmlinux = {0, 0, 0}; -struct addr_range vmlinuz = {0, 0, 0}; -struct addr_range initrd = {0, 0, 0}; +static struct addr_range vmlinux = {0, 0, 0}; +static struct addr_range vmlinuz = {0, 0, 0}; +static struct addr_range initrd = {0, 0, 0}; static char scratch[128<<10]; /* 128kB of scratch space for gunzip */ @@ -64,13 +56,6 @@ typedef void (*kernel_entry_t)( unsigned void *); -int (*prom)(void *); - -void *chosen_handle; -void *stdin; -void *stdout; -void *stderr; - #undef DEBUG static unsigned long claim_base = PROG_START; @@ -277,7 +262,7 @@ void zfree(void *x, void *addr, unsigned #define DEFLATED 8 -void gunzip(void 
*dst, int dstlen, unsigned char *src, int *lenp) +static void gunzip(void *dst, int dstlen, unsigned char *src, int *lenp) { z_stream s; int r, i, flags; Index: linux-2.6.12-rc4-olh/arch/ppc64/boot/prom.c =================================================================== --- linux-2.6.12-rc4-olh.orig/arch/ppc64/boot/prom.c +++ linux-2.6.12-rc4-olh/arch/ppc64/boot/prom.c @@ -7,26 +7,20 @@ * 2 of the License, or (at your option) any later version. */ #include -#include -#include -#include +#include +#include +#include +#include +#include int (*prom)(void *); void *chosen_handle; + void *stdin; void *stdout; void *stderr; -void exit(void); -void *finddevice(const char *name); -int getprop(void *phandle, const char *name, void *buf, int buflen); -void chrpboot(int a1, int a2, void *prom); /* in main.c */ - -int printf(char *fmt, ...); - -/* there is no convenient header to get this from... -- paulus */ -extern unsigned long strlen(const char *); int write(void *handle, void *ptr, int nb) @@ -193,7 +187,7 @@ fputs(char *str, void *f) return write(f, str, n) == n? 0: -1; } -int +static int readchar(void) { char ch; @@ -420,9 +414,6 @@ static char * number(char * str, long nu return str; } -/* Forward decl. needed for IP address printing stuff... */ -int sprintf(char * buf, const char *fmt, ...); - int vsprintf(char *buf, const char *fmt, va_list args) { int len; @@ -612,7 +603,7 @@ int sprintf(char * buf, const char *fmt, static char sprint_buf[1024]; int -printf(char *fmt, ...) +printf(const char *fmt, ...) 
{ va_list args; int n; Index: linux-2.6.12-rc4-olh/arch/ppc64/boot/Makefile =================================================================== --- linux-2.6.12-rc4-olh.orig/arch/ppc64/boot/Makefile +++ linux-2.6.12-rc4-olh/arch/ppc64/boot/Makefile @@ -22,8 +22,8 @@ HOSTCC := gcc -BOOTCFLAGS := $(HOSTCFLAGS) $(LINUXINCLUDE) -fno-builtin -BOOTAFLAGS := -D__ASSEMBLY__ $(BOOTCFLAGS) -traditional +BOOTCFLAGS := $(HOSTCFLAGS) -I$(srctree)/$(src)/include -fno-builtin -nostdinc -isystem $(shell $(CROSS32CC) -print-file-name=include) +BOOTAFLAGS := -D__ASSEMBLY__ $(BOOTCFLAGS) -traditional -nostdinc BOOTLFLAGS := -Ttext 0x00400000 -e _start -T $(srctree)/$(src)/zImage.lds OBJCOPYFLAGS := contents,alloc,load,readonly,data Index: linux-2.6.12-rc4-olh/arch/ppc64/boot/ppc32-types.h =================================================================== --- linux-2.6.12-rc4-olh.orig/arch/ppc64/boot/ppc32-types.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef _PPC64_TYPES_H -#define _PPC64_TYPES_H - -typedef __signed__ char __s8; -typedef unsigned char __u8; - -typedef __signed__ short __s16; -typedef unsigned short __u16; - -typedef __signed__ int __s32; -typedef unsigned int __u32; - -typedef __signed__ long long __s64; -typedef unsigned long long __u64; - -typedef signed char s8; -typedef unsigned char u8; - -typedef signed short s16; -typedef unsigned short u16; - -typedef signed int s32; -typedef unsigned int u32; - -typedef signed long long s64; -typedef unsigned long long u64; - -typedef struct { - __u32 u[4]; -} __attribute((aligned(16))) __vector128; - -#define BITS_PER_LONG 32 - -typedef __vector128 vector128; - -#endif /* _PPC64_TYPES_H */ Index: linux-2.6.12-rc4-olh/arch/ppc64/boot/include/ctype.h =================================================================== --- /dev/null +++ linux-2.6.12-rc4-olh/arch/ppc64/boot/include/ctype.h @@ -0,0 +1,54 @@ +#ifndef _PPC_BOOT_CTYPE_H +#define _PPC_BOOT_CTYPE_H + +/* + * NOTE! 
This ctype does not handle EOF like the standard C + * library is required to. + */ + +#define _U 0x01 /* upper */ +#define _L 0x02 /* lower */ +#define _D 0x04 /* digit */ +#define _C 0x08 /* cntrl */ +#define _P 0x10 /* punct */ +#define _S 0x20 /* white space (space/lf/tab) */ +#define _X 0x40 /* hex digit */ +#define _SP 0x80 /* hard space (0x20) */ + +extern unsigned char _ctype[]; + +#define __ismask(x) (_ctype[(int)(unsigned char)(x)]) + +#define isalnum(c) ((__ismask(c)&(_U|_L|_D)) != 0) +#define isalpha(c) ((__ismask(c)&(_U|_L)) != 0) +#define iscntrl(c) ((__ismask(c)&(_C)) != 0) +#define isdigit(c) ((__ismask(c)&(_D)) != 0) +#define isgraph(c) ((__ismask(c)&(_P|_U|_L|_D)) != 0) +#define islower(c) ((__ismask(c)&(_L)) != 0) +#define isprint(c) ((__ismask(c)&(_P|_U|_L|_D|_SP)) != 0) +#define ispunct(c) ((__ismask(c)&(_P)) != 0) +#define isspace(c) ((__ismask(c)&(_S)) != 0) +#define isupper(c) ((__ismask(c)&(_U)) != 0) +#define isxdigit(c) ((__ismask(c)&(_D|_X)) != 0) + +#define isascii(c) (((unsigned char)(c))<=0x7f) +#define toascii(c) (((unsigned char)(c))&0x7f) + +static inline unsigned char __tolower(unsigned char c) +{ + if (isupper(c)) + c -= 'A' - 'a'; + return c; +} + +static inline unsigned char __toupper(unsigned char c) +{ + if (islower(c)) + c -= 'a' - 'A'; + return c; +} + +#define tolower(c) __tolower(c) +#define toupper(c) __toupper(c) + +#endif /* _PPC_BOOT_CTYPE_H */ Index: linux-2.6.12-rc4-olh/arch/ppc64/boot/include/elf.h =================================================================== --- /dev/null +++ linux-2.6.12-rc4-olh/arch/ppc64/boot/include/elf.h @@ -0,0 +1,149 @@ +#ifndef _PPC_BOOT_ELF_H_ +#define _PPC_BOOT_ELF_H_ + +/* 32-bit ELF base types. */ +typedef unsigned int Elf32_Addr; +typedef unsigned short Elf32_Half; +typedef unsigned int Elf32_Off; +typedef signed int Elf32_Sword; +typedef unsigned int Elf32_Word; + +/* 64-bit ELF base types. 
*/ +typedef unsigned long long Elf64_Addr; +typedef unsigned short Elf64_Half; +typedef signed short Elf64_SHalf; +typedef unsigned long long Elf64_Off; +typedef signed int Elf64_Sword; +typedef unsigned int Elf64_Word; +typedef unsigned long long Elf64_Xword; +typedef signed long long Elf64_Sxword; + +/* These constants are for the segment types stored in the image headers */ +#define PT_NULL 0 +#define PT_LOAD 1 +#define PT_DYNAMIC 2 +#define PT_INTERP 3 +#define PT_NOTE 4 +#define PT_SHLIB 5 +#define PT_PHDR 6 +#define PT_TLS 7 /* Thread local storage segment */ +#define PT_LOOS 0x60000000 /* OS-specific */ +#define PT_HIOS 0x6fffffff /* OS-specific */ +#define PT_LOPROC 0x70000000 +#define PT_HIPROC 0x7fffffff +#define PT_GNU_EH_FRAME 0x6474e550 + +#define PT_GNU_STACK (PT_LOOS + 0x474e551) + +/* These constants define the different elf file types */ +#define ET_NONE 0 +#define ET_REL 1 +#define ET_EXEC 2 +#define ET_DYN 3 +#define ET_CORE 4 +#define ET_LOPROC 0xff00 +#define ET_HIPROC 0xffff + +/* These constants define the various ELF target machines */ +#define EM_NONE 0 +#define EM_PPC 20 /* PowerPC */ +#define EM_PPC64 21 /* PowerPC64 */ + +#define EI_NIDENT 16 + +typedef struct elf32_hdr { + unsigned char e_ident[EI_NIDENT]; + Elf32_Half e_type; + Elf32_Half e_machine; + Elf32_Word e_version; + Elf32_Addr e_entry; /* Entry point */ + Elf32_Off e_phoff; + Elf32_Off e_shoff; + Elf32_Word e_flags; + Elf32_Half e_ehsize; + Elf32_Half e_phentsize; + Elf32_Half e_phnum; + Elf32_Half e_shentsize; + Elf32_Half e_shnum; + Elf32_Half e_shstrndx; +} Elf32_Ehdr; + +typedef struct elf64_hdr { + unsigned char e_ident[16]; /* ELF "magic number" */ + Elf64_Half e_type; + Elf64_Half e_machine; + Elf64_Word e_version; + Elf64_Addr e_entry; /* Entry point virtual address */ + Elf64_Off e_phoff; /* Program header table file offset */ + Elf64_Off e_shoff; /* Section header table file offset */ + Elf64_Word e_flags; + Elf64_Half e_ehsize; + Elf64_Half e_phentsize; + Elf64_Half 
e_phnum; + Elf64_Half e_shentsize; + Elf64_Half e_shnum; + Elf64_Half e_shstrndx; +} Elf64_Ehdr; + +/* These constants define the permissions on sections in the program + header, p_flags. */ +#define PF_R 0x4 +#define PF_W 0x2 +#define PF_X 0x1 + +typedef struct elf32_phdr { + Elf32_Word p_type; + Elf32_Off p_offset; + Elf32_Addr p_vaddr; + Elf32_Addr p_paddr; + Elf32_Word p_filesz; + Elf32_Word p_memsz; + Elf32_Word p_flags; + Elf32_Word p_align; +} Elf32_Phdr; + +typedef struct elf64_phdr { + Elf64_Word p_type; + Elf64_Word p_flags; + Elf64_Off p_offset; /* Segment file offset */ + Elf64_Addr p_vaddr; /* Segment virtual address */ + Elf64_Addr p_paddr; /* Segment physical address */ + Elf64_Xword p_filesz; /* Segment size in file */ + Elf64_Xword p_memsz; /* Segment size in memory */ + Elf64_Xword p_align; /* Segment alignment, file & memory */ +} Elf64_Phdr; + +#define EI_MAG0 0 /* e_ident[] indexes */ +#define EI_MAG1 1 +#define EI_MAG2 2 +#define EI_MAG3 3 +#define EI_CLASS 4 +#define EI_DATA 5 +#define EI_VERSION 6 +#define EI_OSABI 7 +#define EI_PAD 8 + +#define ELFMAG0 0x7f /* EI_MAG */ +#define ELFMAG1 'E' +#define ELFMAG2 'L' +#define ELFMAG3 'F' +#define ELFMAG "\177ELF" +#define SELFMAG 4 + +#define ELFCLASSNONE 0 /* EI_CLASS */ +#define ELFCLASS32 1 +#define ELFCLASS64 2 +#define ELFCLASSNUM 3 + +#define ELFDATANONE 0 /* e_ident[EI_DATA] */ +#define ELFDATA2LSB 1 +#define ELFDATA2MSB 2 + +#define EV_NONE 0 /* e_version, EI_VERSION */ +#define EV_CURRENT 1 +#define EV_NUM 2 + +#define ELFOSABI_NONE 0 +#define ELFOSABI_LINUX 3 + +#endif /* _PPC_BOOT_ELF_H_ */ Index: linux-2.6.12-rc4-olh/arch/ppc64/boot/include/page.h =================================================================== --- /dev/null +++ linux-2.6.12-rc4-olh/arch/ppc64/boot/include/page.h @@ -0,0 +1,34 @@ +#ifndef _PPC_BOOT_PAGE_H +#define _PPC_BOOT_PAGE_H +/* + * Copyright (C) 2001 PPC64 Team, IBM Corp + * + * This program is free software; you can redistribute it and/or + * modify it 
under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifdef __ASSEMBLY__ +#define ASM_CONST(x) x +#else +#define __ASM_CONST(x) x##UL +#define ASM_CONST(x) __ASM_CONST(x) +#endif + +/* PAGE_SHIFT determines the page size */ +#define PAGE_SHIFT 12 +#define PAGE_SIZE (ASM_CONST(1) << PAGE_SHIFT) +#define PAGE_MASK (~(PAGE_SIZE-1)) + +/* align addr on a size boundary - adjust address up/down if needed */ +#define _ALIGN_UP(addr,size) (((addr)+((size)-1))&(~((size)-1))) +#define _ALIGN_DOWN(addr,size) ((addr)&(~((size)-1))) + +/* align addr on a size boundary - adjust address up if needed */ +#define _ALIGN(addr,size) _ALIGN_UP(addr,size) + +/* to align the pointer to the (next) page boundary */ +#define PAGE_ALIGN(addr) _ALIGN(addr, PAGE_SIZE) + +#endif /* _PPC_BOOT_PAGE_H */ Index: linux-2.6.12-rc4-olh/arch/ppc64/boot/include/ppc_asm.h =================================================================== --- /dev/null +++ linux-2.6.12-rc4-olh/arch/ppc64/boot/include/ppc_asm.h @@ -0,0 +1,228 @@ +#ifndef _PPC64_PPC_ASM_H +#define _PPC64_PPC_ASM_H +/* + * + * Definitions used by various bits of low-level assembly code on PowerPC. + * + * Copyright (C) 1995-1999 Gary Thomas, Paul Mackerras, Cort Dougan. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Macros for storing registers into and loading registers from + * exception frames. 
+ */ +#define SAVE_GPR(n, base) std n,GPR0+8*(n)(base) +#define SAVE_2GPRS(n, base) SAVE_GPR(n, base); SAVE_GPR(n+1, base) +#define SAVE_4GPRS(n, base) SAVE_2GPRS(n, base); SAVE_2GPRS(n+2, base) +#define SAVE_8GPRS(n, base) SAVE_4GPRS(n, base); SAVE_4GPRS(n+4, base) +#define SAVE_10GPRS(n, base) SAVE_8GPRS(n, base); SAVE_2GPRS(n+8, base) +#define REST_GPR(n, base) ld n,GPR0+8*(n)(base) +#define REST_2GPRS(n, base) REST_GPR(n, base); REST_GPR(n+1, base) +#define REST_4GPRS(n, base) REST_2GPRS(n, base); REST_2GPRS(n+2, base) +#define REST_8GPRS(n, base) REST_4GPRS(n, base); REST_4GPRS(n+4, base) +#define REST_10GPRS(n, base) REST_8GPRS(n, base); REST_2GPRS(n+8, base) + +#define SAVE_NVGPRS(base) SAVE_8GPRS(14, base); SAVE_10GPRS(22, base) +#define REST_NVGPRS(base) REST_8GPRS(14, base); REST_10GPRS(22, base) + +#define SAVE_FPR(n, base) stfd n,THREAD_FPR0+8*(n)(base) +#define SAVE_2FPRS(n, base) SAVE_FPR(n, base); SAVE_FPR(n+1, base) +#define SAVE_4FPRS(n, base) SAVE_2FPRS(n, base); SAVE_2FPRS(n+2, base) +#define SAVE_8FPRS(n, base) SAVE_4FPRS(n, base); SAVE_4FPRS(n+4, base) +#define SAVE_16FPRS(n, base) SAVE_8FPRS(n, base); SAVE_8FPRS(n+8, base) +#define SAVE_32FPRS(n, base) SAVE_16FPRS(n, base); SAVE_16FPRS(n+16, base) +#define REST_FPR(n, base) lfd n,THREAD_FPR0+8*(n)(base) +#define REST_2FPRS(n, base) REST_FPR(n, base); REST_FPR(n+1, base) +#define REST_4FPRS(n, base) REST_2FPRS(n, base); REST_2FPRS(n+2, base) +#define REST_8FPRS(n, base) REST_4FPRS(n, base); REST_4FPRS(n+4, base) +#define REST_16FPRS(n, base) REST_8FPRS(n, base); REST_8FPRS(n+8, base) +#define REST_32FPRS(n, base) REST_16FPRS(n, base); REST_16FPRS(n+16, base) + +#define SAVE_VR(n,b,base) li b,THREAD_VR0+(16*(n)); stvx n,b,base +#define SAVE_2VRS(n,b,base) SAVE_VR(n,b,base); SAVE_VR(n+1,b,base) +#define SAVE_4VRS(n,b,base) SAVE_2VRS(n,b,base); SAVE_2VRS(n+2,b,base) +#define SAVE_8VRS(n,b,base) SAVE_4VRS(n,b,base); SAVE_4VRS(n+4,b,base) +#define SAVE_16VRS(n,b,base) SAVE_8VRS(n,b,base); 
SAVE_8VRS(n+8,b,base) +#define SAVE_32VRS(n,b,base) SAVE_16VRS(n,b,base); SAVE_16VRS(n+16,b,base) +#define REST_VR(n,b,base) li b,THREAD_VR0+(16*(n)); lvx n,b,base +#define REST_2VRS(n,b,base) REST_VR(n,b,base); REST_VR(n+1,b,base) +#define REST_4VRS(n,b,base) REST_2VRS(n,b,base); REST_2VRS(n+2,b,base) +#define REST_8VRS(n,b,base) REST_4VRS(n,b,base); REST_4VRS(n+4,b,base) +#define REST_16VRS(n,b,base) REST_8VRS(n,b,base); REST_8VRS(n+8,b,base) +#define REST_32VRS(n,b,base) REST_16VRS(n,b,base); REST_16VRS(n+16,b,base) + +/* Macros to adjust thread priority for Iseries hardware multithreading */ +#define HMT_LOW or 1,1,1 +#define HMT_MEDIUM or 2,2,2 +#define HMT_HIGH or 3,3,3 + +/* Insert the high 32 bits of the MSR into what will be the new + MSR (via SRR1 and rfid) This preserves the MSR.SF and MSR.ISF + bits. */ + +#define FIX_SRR1(ra, rb) \ + mr rb,ra; \ + mfmsr ra; \ + rldimi ra,rb,0,32 + +#define CLR_TOP32(r) rlwinm (r),(r),0,0,31 /* clear top 32 bits */ + +/* + * LOADADDR( rn, name ) + * loads the address of 'name' into 'rn' + * + * LOADBASE( rn, name ) + * loads the address (less the low 16 bits) of 'name' into 'rn' + * suitable for base+disp addressing + */ +#define LOADADDR(rn,name) \ + lis rn,name##@highest; \ + ori rn,rn,name##@higher; \ + rldicr rn,rn,32,31; \ + oris rn,rn,name##@h; \ + ori rn,rn,name##@l + +#define LOADBASE(rn,name) \ + lis rn,name@highest; \ + ori rn,rn,name@higher; \ + rldicr rn,rn,32,31; \ + oris rn,rn,name@ha + + +#define SET_REG_TO_CONST(reg, value) \ + lis reg,(((value)>>48)&0xFFFF); \ + ori reg,reg,(((value)>>32)&0xFFFF); \ + rldicr reg,reg,32,31; \ + oris reg,reg,(((value)>>16)&0xFFFF); \ + ori reg,reg,((value)&0xFFFF); + +#define SET_REG_TO_LABEL(reg, label) \ + lis reg,(label)@highest; \ + ori reg,reg,(label)@higher; \ + rldicr reg,reg,32,31; \ + oris reg,reg,(label)@h; \ + ori reg,reg,(label)@l; + + +/* Condition Register Bit Fields */ + +#define cr0 0 +#define cr1 1 +#define cr2 2 +#define cr3 3 +#define cr4 4
+#define cr5 5 +#define cr6 6 +#define cr7 7 + + +/* General Purpose Registers (GPRs) */ + +#define r0 0 +#define r1 1 +#define r2 2 +#define r3 3 +#define r4 4 +#define r5 5 +#define r6 6 +#define r7 7 +#define r8 8 +#define r9 9 +#define r10 10 +#define r11 11 +#define r12 12 +#define r13 13 +#define r14 14 +#define r15 15 +#define r16 16 +#define r17 17 +#define r18 18 +#define r19 19 +#define r20 20 +#define r21 21 +#define r22 22 +#define r23 23 +#define r24 24 +#define r25 25 +#define r26 26 +#define r27 27 +#define r28 28 +#define r29 29 +#define r30 30 +#define r31 31 + + +/* Floating Point Registers (FPRs) */ + +#define fr0 0 +#define fr1 1 +#define fr2 2 +#define fr3 3 +#define fr4 4 +#define fr5 5 +#define fr6 6 +#define fr7 7 +#define fr8 8 +#define fr9 9 +#define fr10 10 +#define fr11 11 +#define fr12 12 +#define fr13 13 +#define fr14 14 +#define fr15 15 +#define fr16 16 +#define fr17 17 +#define fr18 18 +#define fr19 19 +#define fr20 20 +#define fr21 21 +#define fr22 22 +#define fr23 23 +#define fr24 24 +#define fr25 25 +#define fr26 26 +#define fr27 27 +#define fr28 28 +#define fr29 29 +#define fr30 30 +#define fr31 31 + +#define vr0 0 +#define vr1 1 +#define vr2 2 +#define vr3 3 +#define vr4 4 +#define vr5 5 +#define vr6 6 +#define vr7 7 +#define vr8 8 +#define vr9 9 +#define vr10 10 +#define vr11 11 +#define vr12 12 +#define vr13 13 +#define vr14 14 +#define vr15 15 +#define vr16 16 +#define vr17 17 +#define vr18 18 +#define vr19 19 +#define vr20 20 +#define vr21 21 +#define vr22 22 +#define vr23 23 +#define vr24 24 +#define vr25 25 +#define vr26 26 +#define vr27 27 +#define vr28 28 +#define vr29 29 +#define vr30 30 +#define vr31 31 + +#endif /* _PPC64_PPC_ASM_H */ Index: linux-2.6.12-rc4-olh/arch/ppc64/boot/crt0.S =================================================================== --- linux-2.6.12-rc4-olh.orig/arch/ppc64/boot/crt0.S +++ linux-2.6.12-rc4-olh/arch/ppc64/boot/crt0.S @@ -9,7 +9,7 @@ * NOTE: this code runs in 32 bit mode and is 
packaged as ELF32. */ -#include +#include .text .globl _start Index: linux-2.6.12-rc4-olh/arch/ppc64/boot/div64.S =================================================================== --- linux-2.6.12-rc4-olh.orig/arch/ppc64/boot/div64.S +++ linux-2.6.12-rc4-olh/arch/ppc64/boot/div64.S @@ -13,7 +13,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ -#include +#include .globl __div64_32 __div64_32: Index: linux-2.6.12-rc4-olh/arch/ppc64/boot/string.S =================================================================== --- linux-2.6.12-rc4-olh.orig/arch/ppc64/boot/string.S +++ linux-2.6.12-rc4-olh/arch/ppc64/boot/string.S @@ -9,7 +9,7 @@ * NOTE: this code runs in 32 bit mode and is packaged as ELF32. */ -#include +#include .text .globl strcpy From benh at kernel.crashing.org Sun May 8 14:49:04 2005 From: benh at kernel.crashing.org (Benjamin Herrenschmidt) Date: Sun, 08 May 2005 14:49:04 +1000 Subject: [BUG] linux-2.6.12_rc4: Oops: Kernel access of bad area, sig: 11 [#1] In-Reply-To: <20050507170904.GA9488@unixforces.net> References: <20050507170904.GA9488@unixforces.net> Message-ID: <1115527744.6304.47.camel@gaston> On Sat, 2005-05-07 at 17:09 +0000, Markus Rothe wrote: > Hello, > > I'm running Linux on my G5. > > I just compiled linux-2.6.12_rc4 and got this when loading my sound > modules at boot time (linux-2.6.12_rc3 worked just fine): Hrm... weird. I'll have a look on monday. It looks like something is blowing up in the i2c layer... Ben. 
> ---- SNIP ---- > Oops: Kernel access of bad area, sig: 11 [#1] > SMP NR_CPUS=2 POWERMAC > Modules linked in: snd_powermac snd_pcm snd_page_alloc snd_timer snd > soundcore > NIP: C0000000002E4030 XER: 20000000 LR: D0000000001B4AC8 CTR: > C0000000002E4004 > REGS: c000000237783890 TRAP: 0300 Not tainted (2.6.12-rc4) > MSR: 9000000000009032 EE: 1 PR: 0 FP: 0 ME: 1 IR/DR: 11 CR: 22022484 > DAR: 0000000000000002 DSISR: 0000000040000000 > TASK: c0000002375ba030[7007] 'modprobe' THREAD: c000000237780000 CPU: 0 > GPR00: D0000000001B4AC8 C000000237783B10 C0000000005A5E70 0000000000000000 > GPR04: 0000000000000001 0000000000000060 0000000000000000 0000000000000001 > GPR08: 0000000000000002 0000000000000000 C00000000047AAC0 C0000000002E4004 > GPR12: D0000000001B9C58 C00000000047B800 0000000000000000 0000000000000000 > GPR16: 0000000000000000 0000000010028790 0000000000000000 000000001002A4A0 > GPR20: 0000000000000000 0000000000000000 0000000000000000 00000080001B1010 > GPR24: 000000001002AA90 000000001002ABF8 000000001002ABE0 C000000000478958 > GPR28: C00000000F70D580 000000000000000A D0000000001CACD8 D0000000001C137C > NIP [c0000000002e4030] .i2c_smbus_write_byte_data+0x2c/0x50 > LR [d0000000001b4ac8] .send_init_client+0x50/0x110 [snd_powermac] > Call Trace: > [c000000237783b10] [d0000000001c2030] > tumbler_mixers+0x0/0xffffffffffff8b28 [snd_powermac] (unreliable) > [c000000237783bc0] [d0000000001b4ac8] .send_init_client+0x50/0x110 > [snd_powermac] > [c000000237783c60] [d0000000001b9514] > .snd_pmac_tumbler_post_init+0x3c/0x94 [snd_powermac] > [c000000237783ce0] [d0000000001b74fc] .alsa_card_pmac_init+0x174/0x3cc > [snd_powermac] > [c000000237783d90] [c000000000063988] .sys_init_module+0x2cc/0x4a8 > [c000000237783e30] [c00000000000d980] syscall_exit+0x0/0x18 > Instruction dump: > 4e800020 7c0802a6 7c691b78 7c872378 38c00000 39000002 f8010010 f821ff51 > 98a10070 60000000 60000000 60000000 a0890006 e8630008 39210070 > <6>usbcore: registered new driver snd-usb-audio > ---- 
SNIP ---- > > Could someone please take a look at that? > > Best regards, > > Markus > _______________________________________________ > Linuxppc64-dev mailing list > Linuxppc64-dev at ozlabs.org > https://ozlabs.org/cgi-bin/mailman/listinfo/linuxppc64-dev -- Benjamin Herrenschmidt From markus at unixforces.net Sun May 8 17:22:00 2005 From: markus at unixforces.net (Markus Rothe) Date: Sun, 8 May 2005 07:22:00 +0000 Subject: [BUG] linux-2.6.12_rc4: Oops: Kernel access of bad area, sig: 11 [#1] In-Reply-To: <20050507170904.GA9488@unixforces.net> References: <20050507170904.GA9488@unixforces.net> Message-ID: <20050508072159.GA10031@unixforces.net> Hi, I've just noticed, that I've changed my config slightly from rc3 to rc4. Here is the according diff part of the configs. ---- SNIP ---- --- /usr/src/linux-2.6.12-rc3/.config 2005-05-08 06:54:38.000000000 +0000 +++ /usr/src/linux-2.6.12-rc4/.config 2005-05-08 07:16:32.000000000 +0000 @@ -602,7 +607,7 @@ # I2C support # CONFIG_I2C=y -# CONFIG_I2C_CHARDEV is not set +CONFIG_I2C_CHARDEV=m # # I2C Algorithms ---- SNIP ---- I've compiled rc3 with CONFIG_I2C_CHARDEV=m and now I get the Oops with this version, too. Markus -------------- next part -------------- A non-text attachment was scrubbed... Name: not available Type: application/pgp-signature Size: 189 bytes Desc: not available Url : http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20050508/70a14b61/attachment.pgp From sam at ravnborg.org Sun May 8 17:32:20 2005 From: sam at ravnborg.org (Sam Ravnborg) Date: Sun, 8 May 2005 09:32:20 +0200 (CEST) Subject: missing deps in arch/ppc64/boot In-Reply-To: <20050507212449.GA26741@suse.de> References: <20050507212449.GA26741@suse.de> Message-ID: <40912.194.237.142.21.1115537540.squirrel@194.237.142.21> > > Sam, > > touching arch/ppc64/boot/zlib.h will not cause a rebuild of > arch/ppc64/boot/zlib.o. Any ideas what is missing? 
Browsing the Makefile it looks like none of the targets derived from src-boot are assigned to "targets". You need to tell kbuild which targets exist in a given directory, this is done by assigning the .o name to targets. Something like this: src-boot := crt0.S string.S prom.c main.c zlib.c imagesize.c div64.S targets += $(addsuffix .o, $(basename $(src-boot))) src-boot := $(addprefix $(obj)/, $(src-boot)) obj-boot := $(addsuffix .o, $(basename $(src-boot))) Let me know if this cures it. Sam From olh at suse.de Sun May 8 18:33:31 2005 From: olh at suse.de (Olaf Hering) Date: Sun, 8 May 2005 10:33:31 +0200 Subject: panic reboot stuck in rtas_os_term Message-ID: <20050508083331.GA30329@suse.de> A panic does not trigger a reboot anymore on JS20, rtas_os_term() is stuck in RTAS. .config is the defconfig. The panic reboot works on a p630, but I miss the 'rebooting in 180 seconds' message. Any ideas how to fix that? VFS: Cannot open root device "" or unknown-block(8,2) Please append a correct "root=" boot option Kernel panic - not syncing: VFS: Unable to mount root fs on unknown-block(8,2) rtas [swapper]: Entering rtas_call rtas [swapper]: token = 0x1a rtas [swapper]: nargs = 1 rtas [swapper]: nret = 1 rtas [swapper]: &outputs = 0x0 rtas [swapper]: narg[0] = 0x72cfa8 rtas [swapper]: entering rtas with 0x72c728 ... zImage starting: loaded at 0x400000 Allocating 0x893000 bytes for kernel ... gunzipping (0x1c00000 <- 0x407000:0x6addcb)...done 0x73da20 bytes 0xdb6c bytes of heap consumed, max in use 0xa1e4 OF stdout device is: /vdevice/vty at 0 Hypertas detected, assuming LPAR ! command line: memory layout at init: memory_limit : 0000000000000000 (16 MB aligned) alloc_bottom : 00000000023a7000 alloc_top : 0000000008000000 alloc_top_hi : 000000001e000000 rmo_top : 0000000008000000 ram_top : 000000001e000000 Looking for displays instantiating rtas at 0x0000000007a70000...rtas_ram_size = 2c8000 fixed_base_addr = 7a70000 code_base_addr = 7afa000 Code Image Load Complete. 
registered vars: name addr size hash align -------------------------------- ---------------- ---- ---- ----- glob_rtas_trace_buf : 0000000007ab9100 65552 7 0 prtas_was_interrupted : 0000000007aca100 4 9 1 callperf : 0000000007aca400 12496 9 1 pglob_os_term_state : 0000000007acd700 4 12 1 hypStopWatch : 0000000007ac9400 1800 14 8 prtas_in_progress : 0000000007ac9e00 4 20 1 last_error_log : 0000000007acdc00 1024 30 0 nmi_work_buffer : 0000000007ace000 4096 31 12 done 0000000000000000 : boot cpu 0000000000000000 0000000000000001 : starting cpu hw idx 0000000000000001... done copying OF device tree ... Building dt strings... Building dt structure... Device tree strings 0x00000000024a8000 -> 0x00000000024a8e13 Device tree struct 0x00000000024a9000 -> 0x00000000024af000 Calling quiesce ... returning from prom_init firmware_features = 0x55f Starting Linux PPC64 2.6.12-rc4 ----------------------------------------------------- ppc64_pft_size = 0x17 ppc64_debug_switch = 0x0 ppc64_interrupt_controller = 0x2 systemcfg = 0xc0000000005a8000 systemcfg->platform = 0x101 systemcfg->processorCount = 0x2 systemcfg->physicalMemorySize = 0x1e000000 ppc64_caches.dcache_line_size = 0x80 ppc64_caches.icache_line_size = 0x80 htab_address = 0x0000000000000000 htab_hash_mask = 0xffff ----------------------------------------------------- [boot]0100 MM Init [boot]0100 MM Init Done Linux version 2.6.12-rc4 (olaf at mac) (gcc version 3.3.3 (SuSE Linux)) #12 SMP Sun May 8 10:08:56 CEST 2005 [boot]0012 Setup Arch Top of RAM: 0x1e000000, Total RAM: 0x1e000000 Memory hole size: 0MB Syscall map setup, 236 32 bits and 212 64 bits syscalls No ramdisk, default root is /dev/sda2 PPC64 nvram contains 16384 bytes Using default idle loop [boot]0015 Setup Done Built 1 zonelists Kernel command line: [boot]0020 XICS Init xics: no ISA interrupt controller [boot]0021 XICS Done PID hash table entries: 2048 (order: 11, 65536 bytes) time_init: decrementer frequency = 199.840527 MHz time_init: processor frequency = 
1600.000000 MHz firmware_features = 0x55f Starting Linux PPC64 2.6.12-rc4 ----------------------------------------------------- ppc64_pft_size = 0x17 ppc64_debug_switch = 0x0 ppc64_interrupt_controller = 0x2 systemcfg = 0xc0000000005a8000 systemcfg->platform = 0x101 systemcfg->processorCount = 0x2 systemcfg->physicalMemorySize = 0x1e000000 ppc64_caches.dcache_line_size = 0x80 ppc64_caches.icache_line_size = 0x80 htab_address = 0x0000000000000000 htab_hash_mask = 0xffff ----------------------------------------------------- [boot]0100 MM Init [boot]0100 MM Init Done Linux version 2.6.12-rc4 (olaf at mac) (gcc version 3.3.3 (SuSE Linux)) #12 SMP Sun May 8 10:08:56 CEST 2005 [boot]0012 Setup Arch Top of RAM: 0x1e000000, Total RAM: 0x1e000000 Memory hole size: 0MB Syscall map setup, 236 32 bits and 212 64 bits syscalls No ramdisk, default root is /dev/sda2 PPC64 nvram contains 16384 bytes Using default idle loop [boot]0015 Setup Done Built 1 zonelists Kernel command line: [boot]0020 XICS Init xics: no ISA interrupt controller [boot]0021 XICS Done PID hash table entries: 2048 (order: 11, 65536 bytes) time_init: decrementer frequency = 199.840527 MHz time_init: processor frequency = 1600.000000 MHz Console: colour dummy device 80x25 Dentry cache hash table entries: 65536 (order: 7, 524288 bytes) Inode-cache hash table entries: 32768 (order: 6, 262144 bytes) freeing bootmem node 0 Memory: 468604k/491520k available (4752k kernel code, 22276k reserved, 1548k data, 430k bss, 356k init) Mount-cache hash table entries: 256 Processor 1 found. Brought up 2 CPUs NET: Registered protocol family 16 PCI: Probing PCI hardware IDE Fixup IRQ: Can't find IO-APIC ! 
IOMMU table initialized, virtual merging enabled mapping IO 100f4000000 -> e000000000000000, size: 400000 PCI: Probing PCI hardware done SCSI subsystem initialized usbcore: registered new driver usbfs usbcore: registered new driver hub i/pSeries Real Time Clock Driver v1.1 RTAS daemon started Total HugeTLB memory allocated, 0 JFS: nTxBlock = 3665, nTxLock = 29327 Initializing Cryptographic API HVSI: registered 0 devices Linux agpgart interface v0.101 (c) Dave Jones Serial: 8250/16550 driver $Revision: 1.90 $ 4 ports, IRQ sharing disabled io scheduler noop registered io scheduler anticipatory registered io scheduler deadline registered io scheduler cfq registered Floppy drive(s): fd0 is 2.88M RAMDISK driver initialized: 16 RAM disks of 65536K size 1024 blocksize loop: loaded (max 8 devices) Intel(R) PRO/1000 Network Driver - version 5.7.6-k2 Copyright (c) 1999-2004 Intel Corporation. pcnet32.c:v1.30i 06.28.2004 tsbogend at alpha.franken.de e100: Intel(R) PRO/100 Network Driver, 3.3.6-k2-NAPI e100: Copyright(c) 1999-2004 Intel Corporation tg3.c:v3.27 (May 5, 2005) eth0: Tigon3 [partno(none) rev 2003 PHY(serdes)] (PCIX:133MHz:64-bit) 10/100/1000BaseT Ethernet 00:0d:60:1e:ff:32 eth0: RXcsums[1] LinkChgREG[0] MIirq[0] ASF[1] Split[0] WireSpeed[1] TSOcap[0] eth1: Tigon3 [partno(none) rev 2003 PHY(serdes)] (PCIX:133MHz:64-bit) 10/100/1000BaseT Ethernet 00:0d:60:1e:ff:33 eth1: RXcsums[1] LinkChgREG[0] MIirq[0] ASF[0] Split[0] WireSpeed[1] TSOcap[1] netconsole: not configured, aborting Warning: no ADB interface detected Uniform Multi-Platform E-IDE driver Revision: 7.00alpha2 ide: Assuming 33MHz system bus speed for PIO modes; override with idebus=xx AMD8111: IDE controller at PCI slot 0000:00:04.1 AMD8111: chipset revision 3 AMD8111: 0000:00:04.1 (rev 03) UDMA133 controller AMD8111: 100% native mode on irq 32 ide0: BM-DMA at 0x7c00-0x7c07, BIOS settings: hda:pio, hdb:pio ide1: BM-DMA at 0x7c08-0x7c0f, BIOS settings: hdc:pio, hdd:pio hda: TOSHIBA MK4019GAXB, ATA DISK drive 
ide0 at 0x7400-0x7407,0x6c02 on irq 32 hda: max request size: 128KiB hda: 78140160 sectors (40007 MB), CHS=65535/16/63, UDMA(33) hda: cache flushes supported hda: hda1 hda2 hda3 ipr: IBM Power RAID SCSI Device Driver version: 2.0.13 (February 21, 2005) st: Version 20050312, fixed bufsize 32768, s/g segs 256 ieee1394: raw1394: /dev/raw1394 device initialized ohci_hcd 0000:21:00.0: OHCI Host Controller ohci_hcd 0000:21:00.0: new USB bus registered, assigned bus number 1 ohci_hcd 0000:21:00.0: irq 35, io mem 0x100e0001000 hub 1-0:1.0: USB hub found hub 1-0:1.0: 3 ports detected ohci_hcd 0000:21:00.1: OHCI Host Controller ohci_hcd 0000:21:00.1: new USB bus registered, assigned bus number 2 ohci_hcd 0000:21:00.1: irq 35, io mem 0x100e0000000 hub 2-0:1.0: USB hub found hub 2-0:1.0: 3 ports detected usbcore: registered new driver hiddev usbcore: registered new driver usbhid /home/olaf/kernel/linux-2.6.12-rc4-olh/drivers/usb/input/hid-core.c: v2.01:USB HID core driver pegasus: v0.6.12 (2005/01/13), Pegasus/Pegasus II USB Ethernet driver usbcore: registered new driver pegasus mice: PS/2 mouse device common for all mice i2c /dev entries driver md: linear personality registered as nr 1 md: raid0 personality registered as nr 2 md: raid1 personality registered as nr 3 md: raid10 personality registered as nr 9 md: raid5 personality registered as nr 4 raid5: measuring checksumming speed 8regs : 3836.000 MB/sec 8regs_prefetch: 3156.000 MB/sec 32regs : 4640.000 MB/sec 32regs_prefetch: 3848.000 MB/sec raid5: using function: 32regs (4640.000 MB/sec) md: md driver 0.90.1 MAX_MD_DEVS=256, MD_SB_DISKS=27 device-mapper: 4.4.0-ioctl (2005-01-12) initialised: dm-devel at redhat.com oprofile: using ppc64/970 performance monitoring. 
NET: Registered protocol family 2 IP: routing cache hash table of 2048 buckets, 32Kbytes TCP established hash table entries: 16384 (order: 6, 262144 bytes) TCP bind hash table entries: 16384 (order: 6, 262144 bytes) TCP: Hash tables configured (established 16384 bind 16384) IPv4 over IPv4 tunneling driver NET: Registered protocol family 1 NET: Registered protocol family 17 md: Autodetecting RAID arrays. md: autorun ... md: ... autorun DONE. VFS: Cannot open root device "" or unknown-block(8,2) Please append a correct "root=" boot option Kernel panic - not syncing: VFS: Unable to mount root fs on unknown-block(8,2) rtas [swapper]: Entering rtas_call rtas [swapper]: token = 0x1a rtas [swapper]: nargs = 1 rtas [swapper]: nret = 1 rtas [swapper]: &outputs = 0x0 rtas [swapper]: narg[0] = 0x72cfa8 rtas [swapper]: entering rtas with 0x72c728 From olh at suse.de Sun May 8 19:15:33 2005 From: olh at suse.de (Olaf Hering) Date: Sun, 8 May 2005 11:15:33 +0200 Subject: panic reboot stuck in rtas_os_term In-Reply-To: <20050508083331.GA30329@suse.de> References: <20050508083331.GA30329@suse.de> Message-ID: <20050508091533.GA30450@suse.de> On Sun, May 08, Olaf Hering wrote: > > A panic does not trigger a reboot anymore on JS20, rtas_os_term() is stuck in > RTAS. .config is the defconfig. > The panic reboot works on a p630, but I miss the 'rebooting in 180 seconds' message. > Any ideas how to fix that? > > VFS: Cannot open root device "" or unknown-block(8,2) > Please append a correct "root=" boot option > Kernel panic - not syncing: VFS: Unable to mount root fs on unknown-block(8,2) > rtas [swapper]: Entering rtas_call > rtas [swapper]: token = 0x1a > rtas [swapper]: nargs = 1 > rtas [swapper]: nret = 1 > rtas [swapper]: &outputs = 0x0 > rtas [swapper]: narg[0] = 0x72cfa8 > rtas [swapper]: entering rtas with 0x72c728 This appears to be a firmware bug, fails now also with sles9 sp1 kernel. ibm,os-term was appearently added with recent firmware updates. 
http://linux.bkbits.net:8080/linux-2.5/gnupatch at 41997abd4TXSpY49vjgObO6N2R2MxA From olh at suse.de Sun May 8 20:07:02 2005 From: olh at suse.de (Olaf Hering) Date: Sun, 8 May 2005 12:07:02 +0200 Subject: missing deps in arch/ppc64/boot In-Reply-To: <40912.194.237.142.21.1115537540.squirrel@194.237.142.21> References: <20050507212449.GA26741@suse.de> <40912.194.237.142.21.1115537540.squirrel@194.237.142.21> Message-ID: <20050508100702.GA30759@suse.de> On Sun, May 08, Sam Ravnborg wrote: > > > > Sam, > > > > touching arch/ppc64/boot/zlib.h will not cause a rebuild of > > arch/ppc64/boot/zlib.o. Any ideas what is missing? > > Browsing the Makefile it looks like none of the targets derived from > src-boot are assingen to "targets". > You need to tell kbuild which targets exits in a given directory, this > is done by assigned the .o name to targets. This does not help. Maybe targets is reset for some reason? But this patch works: Index: linux-2.6.12-rc4-olh/arch/ppc64/boot/Makefile =================================================================== --- linux-2.6.12-rc4-olh.orig/arch/ppc64/boot/Makefile +++ linux-2.6.12-rc4-olh/arch/ppc64/boot/Makefile @@ -28,6 +28,7 @@ BOOTLFLAGS := -Ttext 0x00400000 -e _star OBJCOPYFLAGS := contents,alloc,load,readonly,data src-boot := crt0.S string.S prom.c main.c zlib.c imagesize.c div64.S +targets-boot := $(addsuffix .o, $(basename $(src-boot))) src-boot := $(addprefix $(obj)/, $(src-boot)) obj-boot := $(addsuffix .o, $(basename $(src-boot))) @@ -54,6 +55,7 @@ gz-sec = $(foreach section, $(1), $(pat hostprogs-y := addnote addRamDisk targets += zImage zImage.initrd imagesize.c \ + $(targets-boot) \ $(patsubst $(obj)/%,%, $(call obj-sec, $(required) $(initrd))) \ $(patsubst $(obj)/%,%, $(call src-sec, $(required) $(initrd))) \ $(patsubst $(obj)/%,%, $(call gz-sec, $(required) $(initrd))) \ From olh at suse.de Mon May 9 02:48:51 2005 From: olh at suse.de (Olaf Hering) Date: Sun, 8 May 2005 18:48:51 +0200 Subject: [PATCH] remove 
unused arch/ppc64/boot/div64.S Message-ID: <20050508164851.GA1707@suse.de> remove unused arch/ppc64/boot/div64.S, it is built but not called Signed-off-by: Olaf Hering Index: linux-2.6.12-rc4-olh/arch/ppc64/boot/Makefile =================================================================== --- linux-2.6.12-rc4-olh.orig/arch/ppc64/boot/Makefile +++ linux-2.6.12-rc4-olh/arch/ppc64/boot/Makefile @@ -27,7 +27,7 @@ BOOTAFLAGS := -D__ASSEMBLY__ $(BOOTCFLAG BOOTLFLAGS := -Ttext 0x00400000 -e _start -T $(srctree)/$(src)/zImage.lds OBJCOPYFLAGS := contents,alloc,load,readonly,data -src-boot := crt0.S string.S prom.c main.c zlib.c imagesize.c div64.S +src-boot := crt0.S string.S prom.c main.c zlib.c imagesize.c src-boot := $(addprefix $(obj)/, $(src-boot)) obj-boot := $(addsuffix .o, $(basename $(src-boot))) Index: linux-2.6.12-rc4-olh/arch/ppc64/boot/div64.S =================================================================== --- linux-2.6.12-rc4-olh.orig/arch/ppc64/boot/div64.S +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Divide a 64-bit unsigned number by a 32-bit unsigned number. - * This routine assumes that the top 32 bits of the dividend are - * non-zero to start with. - * On entry, r3 points to the dividend, which get overwritten with - * the 64-bit quotient, and r4 contains the divisor. - * On exit, r3 contains the remainder. - * - * Copyright (C) 2002 Paul Mackerras, IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include - - .globl __div64_32 -__div64_32: - lwz r5,0(r3) # get the dividend into r5/r6 - lwz r6,4(r3) - cmplw r5,r4 - li r7,0 - li r8,0 - blt 1f - divwu r7,r5,r4 # if dividend.hi >= divisor, - mullw r0,r7,r4 # quotient.hi = dividend.hi / divisor - subf. 
r5,r0,r5 # dividend.hi %= divisor - beq 3f -1: mr r11,r5 # here dividend.hi != 0 - andis. r0,r5,0xc000 - bne 2f - cntlzw r0,r5 # we are shifting the dividend right - li r10,-1 # to make it < 2^32, and shifting - srw r10,r10,r0 # the divisor right the same amount, - add r9,r4,r10 # rounding up (so the estimate cannot - andc r11,r6,r10 # ever be too large, only too small) - andc r9,r9,r10 - or r11,r5,r11 - rotlw r9,r9,r0 - rotlw r11,r11,r0 - divwu r11,r11,r9 # then we divide the shifted quantities -2: mullw r10,r11,r4 # to get an estimate of the quotient, - mulhwu r9,r11,r4 # multiply the estimate by the divisor, - subfc r6,r10,r6 # take the product from the divisor, - add r8,r8,r11 # and add the estimate to the accumulated - subfe. r5,r9,r5 # quotient - bne 1b -3: cmplw r6,r4 - blt 4f - divwu r0,r6,r4 # perform the remaining 32-bit division - mullw r10,r0,r4 # and get the remainder - add r8,r8,r0 - subf r6,r10,r6 -4: stw r7,0(r3) # return the quotient in *r3 - stw r8,4(r3) - mr r3,r6 # return the remainder in r3 - blr From benh at kernel.crashing.org Mon May 9 08:15:13 2005 From: benh at kernel.crashing.org (Benjamin Herrenschmidt) Date: Mon, 09 May 2005 08:15:13 +1000 Subject: [BUG] linux-2.6.12_rc4: Oops: Kernel access of bad area, sig: 11 [#1] In-Reply-To: <20050508072159.GA10031@unixforces.net> References: <20050507170904.GA9488@unixforces.net> <20050508072159.GA10031@unixforces.net> Message-ID: <1115590513.6305.52.camel@gaston> On Sun, 2005-05-08 at 07:22 +0000, Markus Rothe wrote: > Hi, > > I've just noticed, that I've changed my config slightly from rc3 to rc4. > Here is the according diff part of the configs. 
> > ---- SNIP ---- > > --- /usr/src/linux-2.6.12-rc3/.config 2005-05-08 06:54:38.000000000 +0000 > +++ /usr/src/linux-2.6.12-rc4/.config 2005-05-08 07:16:32.000000000 +0000 > @@ -602,7 +607,7 @@ > # I2C support > # > CONFIG_I2C=y > -# CONFIG_I2C_CHARDEV is not set > +CONFIG_I2C_CHARDEV=m > > # > # I2C Algorithms > > ---- SNIP ---- > > I've compiled rc3 with CONFIG_I2C_CHARDEV=m and now I get the Oops with > this version, too. So something is screwing with the i2c bus .. interesting... Ben. From benh at kernel.crashing.org Mon May 9 09:09:08 2005 From: benh at kernel.crashing.org (Benjamin Herrenschmidt) Date: Mon, 09 May 2005 09:09:08 +1000 Subject: [Fwd: PATCH - enables software RAID on Linux] Message-ID: <1115593748.6305.61.camel@gaston> -------- Forwarded Message -------- From: Dustin Kirkland Reply-To: dustin.kirkland at us.ibm.com To: yaboot-devel at lists.penguinppc.org Cc: dustin.kirkland at us.ibm.com Subject: PATCH - enables software RAID on Linux Date: Fri, 06 May 2005 10:17:42 -0400 Hi- I've made a few minor changes to yaboot that provides support for software RAID on Linux. Many thanks to Paul Nasrat who provided key assistance. The basic changes that were needed in yaboot: 1) the ability to read partitions marked "Linux RAID" (0xfd) in addition to regular "Linux" filesystem partitions when looking for /etc/yaboot.conf 2) the additional protection to prevent of_open() from reading Linux RAID partitions (such as RAID swap space) 3) new functionality in ybin to automatically write yaboot to multiple available PReP partitions (so that you can boot from multiple discs) -- #1 is accomplished by adding a LINUX_RAID macro in include/fdisk-part.h and allowing such partitions to be added to the partition_t list in partition_fdisk_lookup(). #2 means that the partition_t structure needs a new integer field (sys_ind) to hold the partition type, and the add_new_partition() function needs to pass this information when called. 
Then, the of_open () simply needs to test against part->sys_ind against the LINUX_RAID value before allowing an ok return. #3 is solved via recursion. ybin is augmented such that it will look for any PReP partitions available on the system and will recursively call itself targeting each individual PReP partition. The user will be prompted for each partition in the default interactive mode, and yaboot will be written to all PReP partitions in --force mode. As I've coded it here, it depends on fdisk/awk/xargs. I found uses of grep elsewhere in the script so I figured this should be legit. Please see the attached patch and consider for inclusion in subsequent yaboot releases. -- Benjamin Herrenschmidt From olh at suse.de Mon May 9 11:02:55 2005 From: olh at suse.de (Olaf Hering) Date: Mon, 9 May 2005 03:02:55 +0200 Subject: [PATCH] print negative numbers correctly via vsprintf in arch/ppc64/boot/prom.c In-Reply-To: <20050508164851.GA1707@suse.de> References: <20050508164851.GA1707@suse.de> Message-ID: <20050509010255.GA6759@suse.de> On Sun, May 08, Olaf Hering wrote: > > remove unused arch/ppc64/boot/div64.S, it is built but not called The other way round: if num has a value of -1, accessing the digits[] array will fail and the format string will be printed in funny way, or not at all. This happens if one prints negative numbers. Just change the code to match lib/vsprintf.c asm/div64.h cant be used because u64 maps to u32 for this build. 
uint64_t -> __u64 -> unsigned long Signed-off-by: Olaf Hering Index: linux-2.6.12-rc4-olh/arch/ppc64/boot/prom.c =================================================================== --- linux-2.6.12-rc4-olh.orig/arch/ppc64/boot/prom.c +++ linux-2.6.12-rc4-olh/arch/ppc64/boot/prom.c @@ -11,6 +11,23 @@ #include #include +extern __u32 __div64_32(unsigned long long *dividend, __u32 divisor); + +/* The unnecessary pointer compare is there + * to check for type safety (n must be 64bit) + */ +# define do_div(n,base) ({ \ + __u32 __base = (base); \ + __u32 __rem; \ + (void)(((typeof((n)) *)0) == ((unsigned long long *)0)); \ + if (((n) >> 32) == 0) { \ + __rem = (__u32)(n) % __base; \ + (n) = (__u32)(n) / __base; \ + } else \ + __rem = __div64_32(&(n), __base); \ + __rem; \ + }) + int (*prom)(void *); void *chosen_handle; @@ -352,7 +369,7 @@ static int skip_atoi(const char **s) #define SPECIAL 32 /* 0x */ #define LARGE 64 /* use 'ABCDEF' instead of 'abcdef' */ -static char * number(char * str, long num, int base, int size, int precision, int type) +static char * number(char * str, unsigned long long num, int base, int size, int precision, int type) { char c,sign,tmp[66]; const char *digits="0123456789abcdefghijklmnopqrstuvwxyz"; @@ -367,9 +384,9 @@ static char * number(char * str, long nu c = (type & ZEROPAD) ? 
'0' : ' '; sign = 0; if (type & SIGN) { - if (num < 0) { + if ((signed long long)num < 0) { sign = '-'; - num = -num; + num = - (unsigned long long)num; size--; } else if (type & PLUS) { sign = '+'; @@ -389,8 +406,7 @@ static char * number(char * str, long nu if (num == 0) tmp[i++]='0'; else while (num != 0) { - tmp[i++] = digits[num % base]; - num /= base; + tmp[i++] = digits[do_div(num, base)]; } if (i > precision) precision = i; @@ -426,7 +442,7 @@ int sprintf(char * buf, const char *fmt, int vsprintf(char *buf, const char *fmt, va_list args) { int len; - unsigned long num; + unsigned long long num; int i, base; char * str; const char *s; From schwab at suse.de Mon May 9 19:42:39 2005 From: schwab at suse.de (Andreas Schwab) Date: Mon, 09 May 2005 11:42:39 +0200 Subject: [PATCH] print negative numbers correctly via vsprintf in arch/ppc64/boot/prom.c In-Reply-To: <20050509010255.GA6759@suse.de> (Olaf Hering's message of "Mon, 9 May 2005 03:02:55 +0200") References: <20050508164851.GA1707@suse.de> <20050509010255.GA6759@suse.de> Message-ID: Olaf Hering writes: > @@ -352,7 +369,7 @@ static int skip_atoi(const char **s) > #define SPECIAL 32 /* 0x */ > #define LARGE 64 /* use 'ABCDEF' instead of 'abcdef' */ > > -static char * number(char * str, long num, int base, int size, int precision, int type) > +static char * number(char * str, unsigned long long num, int base, int size, int precision, int type) > { > char c,sign,tmp[66]; > const char *digits="0123456789abcdefghijklmnopqrstuvwxyz"; > @@ -367,9 +384,9 @@ static char * number(char * str, long nu > c = (type & ZEROPAD) ? '0' : ' '; > sign = 0; > if (type & SIGN) { > - if (num < 0) { > + if ((signed long long)num < 0) { > sign = '-'; > - num = -num; > + num = - (unsigned long long)num; I think the latter cast is useless. Andreas. 
-- Andreas Schwab, SuSE Labs, schwab at suse.de SuSE Linux Products GmbH, Maxfeldstraße 5, 90409 Nürnberg, Germany Key fingerprint = 58CA 54C7 6D53 942B 1756 01D3 44D5 214B 8276 4ED5 "And now for something completely different." From johnrose at austin.ibm.com Tue May 10 01:43:42 2005 From: johnrose at austin.ibm.com (John Rose) Date: Mon, 09 May 2005 10:43:42 -0500 Subject: Patch to kill ioremap_mm In-Reply-To: <1115423232.23610.5.camel@gaston> References: <20050505014256.GE18270@localhost.localdomain> <1115306696.6011.6.camel@sinatra.austin.ibm.com> <1115335822.7627.189.camel@gaston> <1115392690.15458.12.camel@sinatra.austin.ibm.com> <1115423232.23610.5.camel@gaston> Message-ID: <1115653422.14937.41.camel@sinatra.austin.ibm.com> Hi Ben- > Do we need that for normal mmio ioremap mappings or only for PHB IO > space ? Just the latter. > As I said, the latter would stay separate, but I feel we don't > need the imalloc infrastructure to handle it. We can probably just > directly set/invalidate PTEs for the ranges when needed. So you won't use vmalloc to manage the explicit mappings for the PHB ranges? How will you keep up with what's been mapped for a PHB? Seems like this would be necessary to know upon removal. Or will you simply invalidate for the entire PHB range, even for subranges that aren't mapped? Sweating the details :) John From apw at shadowen.org Tue May 10 01:49:04 2005 From: apw at shadowen.org (Andy Whitcroft) Date: Mon, 09 May 2005 16:49:04 +0100 Subject: sparsemem ppc64 tidy flat memory comments and fix benign mempresent call In-Reply-To: <427A59BC.1020208@shadowen.org> Message-ID: I was going to rediff the memory present patches, but as -mm has picked these up already here is a simple patch to clean up this errant comment and address a benign call to memory_present(). Applies onto the existing patches. -apw Tidy up the comments for the ppc64 flat memory support and fix a currently benign double call to memory_present() for the first memory block. 
Signed-off-by: Andy Whitcroft --- init.c | 9 +++++---- 1 files changed, 5 insertions(+), 4 deletions(-) diff -upN reference/arch/ppc64/mm/init.c current/arch/ppc64/mm/init.c --- reference/arch/ppc64/mm/init.c +++ current/arch/ppc64/mm/init.c @@ -631,18 +631,19 @@ void __init do_init_bootmem(void) max_pfn = max_low_pfn; - /* add all physical memory to the bootmem map. Also, find the first - * presence of all LMBs*/ + /* Add all physical memory to the bootmem map, mark each area + * present. The first block has already been marked present above. + */ for (i=0; i < lmb.memory.cnt; i++) { unsigned long physbase, size; physbase = lmb.memory.region[i].physbase; size = lmb.memory.region[i].size; - if (i) { /* already created mappings for first LMB */ + if (i) { start_pfn = physbase >> PAGE_SHIFT; end_pfn = start_pfn + (size >> PAGE_SHIFT); + memory_present(0, start_pfn, end_pfn); } - memory_present(0, start_pfn, end_pfn); free_bootmem(physbase, size); } From jschopp at austin.ibm.com Tue May 10 01:56:36 2005 From: jschopp at austin.ibm.com (Joel Schopp) Date: Mon, 09 May 2005 10:56:36 -0500 Subject: [Fwd: PATCH - enables software RAID on Linux] In-Reply-To: <1115593748.6305.61.camel@gaston> References: <1115593748.6305.61.camel@gaston> Message-ID: <427F8834.9060509@austin.ibm.com> > Please see the attached patch and consider for inclusion in subsequent > yaboot releases. In the forwarding no patch was attached. From dhowells at redhat.com Tue May 10 03:27:07 2005 From: dhowells at redhat.com (David Howells) Date: Mon, 09 May 2005 18:27:07 +0100 Subject: [PATCH 1/3] ppc64: iseries_veth: Don't send packets to LPARs which aren't up In-Reply-To: <200505071201.20123.michael@ellerman.id.au> References: <200505071201.20123.michael@ellerman.id.au> Message-ID: <10647.1115659627@redhat.com> Michael Ellerman wrote: > > The iseries_veth driver has a logic bug which means it will erroneously > send packets to LPARs for which we don't have a connection. 
Any particular versions of the kernel? David From linas at austin.ibm.com Tue May 10 04:24:45 2005 From: linas at austin.ibm.com (Linas Vepstas) Date: Mon, 9 May 2005 13:24:45 -0500 Subject: [PATCH 2.6] PPC64: janitorial HVSI use wait_event_timeout() Message-ID: <20050509182445.GP11745@austin.ibm.com> Hi, I seem to be dragging around the following janitorial patch, it was submitted upstream to LKML on March 6 2005, but seems not to have made it in. Hollis, please review; I've been running with it for months. --linas Use wait_event_timeout() in place of custom wait-queue code. The code is not changed in any way (I don't think), but is cleaned up quite a bit (will get expanded to almost identical code). Acked-by: Linas Vepstas Signed-off-by: Nishanth Aravamudan Signed-off-by: Domen Puncer --- linux-2.6.11.8/drivers/char/hvsi.c.linas-orig 2005-04-29 20:29:25.000000000 -0500 +++ linux-2.6.11.8/drivers/char/hvsi.c 2005-05-06 12:28:43.000000000 -0500 @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -631,27 +632,9 @@ static int __init poll_for_state(struct /* wait for irq handler to change our state */ static int wait_for_state(struct hvsi_struct *hp, int state) { - unsigned long end_jiffies = jiffies + HVSI_TIMEOUT; - unsigned long timeout; - int ret = 0; - - DECLARE_WAITQUEUE(myself, current); - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&hp->stateq, &myself); - - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (hp->state == state) - break; - timeout = end_jiffies - jiffies; - if (time_after(jiffies, end_jiffies)) { - ret = -EIO; - break; - } - schedule_timeout(timeout); - } - remove_wait_queue(&hp->stateq, &myself); - set_current_state(TASK_RUNNING); + int ret=0; + if(!wait_event_timeout(hp->stateq, (hp->state == state), jiffies + + HVSI_TIMEOUT)) ret = -EIO; return ret; } @@ -868,24 +851,8 @@ static int hvsi_open(struct tty_struct * /* wait for hvsi_write_worker to empty hp->outbuf */ static void 
hvsi_flush_output(struct hvsi_struct *hp) { - unsigned long end_jiffies = jiffies + HVSI_TIMEOUT; - unsigned long timeout; - - DECLARE_WAITQUEUE(myself, current); - set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&hp->emptyq, &myself); - - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (hp->n_outbuf <= 0) - break; - timeout = end_jiffies - jiffies; - if (time_after(jiffies, end_jiffies)) - break; - schedule_timeout(timeout); - } - remove_wait_queue(&hp->emptyq, &myself); - set_current_state(TASK_RUNNING); + wait_event_timeout(hp->emptyq, (hp->n_outbuf <= 0), jiffies + + HVSI_TIMEOUT); /* 'writer' could still be pending if it didn't see n_outbuf = 0 yet */ cancel_delayed_work(&hp->writer); From olh at suse.de Tue May 10 04:26:29 2005 From: olh at suse.de (Olaf Hering) Date: Mon, 9 May 2005 20:26:29 +0200 Subject: [PATCH] print negative numbers correctly via vsprintf in arch/ppc64/boot/prom.c In-Reply-To: References: <20050508164851.GA1707@suse.de> <20050509010255.GA6759@suse.de> Message-ID: <20050509182629.GA25582@suse.de> On Mon, May 09, Andreas Schwab wrote: > > + num = - (unsigned long long)num; > > I think the latter cast is useless. This was a typo. if num has a value of -1, accessing the digits[] array will fail and the format string will be printed in funny way, or not at all. This happens if one prints negative numbers. Just change the code to match lib/vsprintf.c asm/div64.h cant be used because u64 maps to u32 for this build. 
Signed-off-by: Olaf Hering Index: linux-2.6.12-rc4-olh/arch/ppc64/boot/prom.c =================================================================== --- linux-2.6.12-rc4-olh.orig/arch/ppc64/boot/prom.c +++ linux-2.6.12-rc4-olh/arch/ppc64/boot/prom.c @@ -11,6 +11,23 @@ #include #include +extern __u32 __div64_32(unsigned long long *dividend, __u32 divisor); + +/* The unnecessary pointer compare is there + * to check for type safety (n must be 64bit) + */ +# define do_div(n,base) ({ \ + __u32 __base = (base); \ + __u32 __rem; \ + (void)(((typeof((n)) *)0) == ((unsigned long long *)0)); \ + if (((n) >> 32) == 0) { \ + __rem = (__u32)(n) % __base; \ + (n) = (__u32)(n) / __base; \ + } else \ + __rem = __div64_32(&(n), __base); \ + __rem; \ + }) + int (*prom)(void *); void *chosen_handle; @@ -352,7 +369,7 @@ static int skip_atoi(const char **s) #define SPECIAL 32 /* 0x */ #define LARGE 64 /* use 'ABCDEF' instead of 'abcdef' */ -static char * number(char * str, long num, int base, int size, int precision, int type) +static char * number(char * str, unsigned long long num, int base, int size, int precision, int type) { char c,sign,tmp[66]; const char *digits="0123456789abcdefghijklmnopqrstuvwxyz"; @@ -367,9 +384,9 @@ static char * number(char * str, long nu c = (type & ZEROPAD) ? 
'0' : ' '; sign = 0; if (type & SIGN) { - if (num < 0) { + if ((signed long long)num < 0) { sign = '-'; - num = -num; + num = - (signed long long)num; size--; } else if (type & PLUS) { sign = '+'; @@ -389,8 +406,7 @@ static char * number(char * str, long nu if (num == 0) tmp[i++]='0'; else while (num != 0) { - tmp[i++] = digits[num % base]; - num /= base; + tmp[i++] = digits[do_div(num, base)]; } if (i > precision) precision = i; @@ -426,7 +442,7 @@ int sprintf(char * buf, const char *fmt, int vsprintf(char *buf, const char *fmt, va_list args) { int len; - unsigned long num; + unsigned long long num; int i, base; char * str; const char *s; From schwab at suse.de Tue May 10 05:54:08 2005 From: schwab at suse.de (Andreas Schwab) Date: Mon, 09 May 2005 21:54:08 +0200 Subject: [PATCH] print negative numbers correctly via vsprintf in arch/ppc64/boot/prom.c In-Reply-To: <20050509182629.GA25582@suse.de> (Olaf Hering's message of "Mon, 9 May 2005 20:26:29 +0200") References: <20050508164851.GA1707@suse.de> <20050509010255.GA6759@suse.de> <20050509182629.GA25582@suse.de> Message-ID: Olaf Hering writes: > @@ -367,9 +384,9 @@ static char * number(char * str, long nu > c = (type & ZEROPAD) ? '0' : ' '; > sign = 0; > if (type & SIGN) { > - if (num < 0) { > + if ((signed long long)num < 0) { > sign = '-'; > - num = -num; > + num = - (signed long long)num; The cast is still not needed. In 2's complement representation the negation of a signed number and an unsigned number are the same operation. Andreas. -- Andreas Schwab, SuSE Labs, schwab at suse.de SuSE Linux Products GmbH, Maxfeldstraße 5, 90409 Nürnberg, Germany Key fingerprint = 58CA 54C7 6D53 942B 1756 01D3 44D5 214B 8276 4ED5 "And now for something completely different." 
From olh at suse.de Tue May 10 05:56:42 2005 From: olh at suse.de (Olaf Hering) Date: Mon, 9 May 2005 21:56:42 +0200 Subject: [PATCH] print negative numbers correctly via vsprintf in arch/ppc64/boot/prom.c In-Reply-To: References: <20050508164851.GA1707@suse.de> <20050509010255.GA6759@suse.de> <20050509182629.GA25582@suse.de> Message-ID: <20050509195642.GA28538@suse.de> On Mon, May 09, Andreas Schwab wrote: > The cast is still not needed. In 2's complement representation the > negation of a signed number and an unsigned number are the same operation. Linus wanted it that way a few months/years ago, in lib/vsprintf.c From benh at kernel.crashing.org Tue May 10 08:56:44 2005 From: benh at kernel.crashing.org (Benjamin Herrenschmidt) Date: Tue, 10 May 2005 08:56:44 +1000 Subject: Patch to kill ioremap_mm In-Reply-To: <1115653422.14937.41.camel@sinatra.austin.ibm.com> References: <20050505014256.GE18270@localhost.localdomain> <1115306696.6011.6.camel@sinatra.austin.ibm.com> <1115335822.7627.189.camel@gaston> <1115392690.15458.12.camel@sinatra.austin.ibm.com> <1115423232.23610.5.camel@gaston> <1115653422.14937.41.camel@sinatra.austin.ibm.com> Message-ID: <1115679405.7339.7.camel@gaston> On Mon, 2005-05-09 at 10:43 -0500, John Rose wrote: > Hi Ben- > > > Do we need that for normal mmio ioremap mappings or only for PHB IO > > space ? > > Just the latter. > > > As I said, the later would stay separate, but I feel we don't > > need the imalloc infrastructure to handle it. We can probably just > > directly set/invalidate PTEs for the ranges when needed. > > So you won't use vmalloc to manage the explicit mappings for the PHB > ranges? How will you keep up with what's been mapped for a PHB? Seems > like this would be necessary to know upon removal. Or will you simply > invalidate for the entire PHB range, even for subranges that aren't > mapped? > > Sweating the details :) Well, for one, I'm not 100% sure we actually need to remove the linux PTEs on removal. 
We only need to remove the hash entries, which can be done "preventively" easily enough. When a segment is removed, we kick out hash entries for that segment. If somebody tries to access that later one, it gets a sigbus since HV will fail inserting an entry. Ben. From benh at kernel.crashing.org Tue May 10 08:57:30 2005 From: benh at kernel.crashing.org (Benjamin Herrenschmidt) Date: Tue, 10 May 2005 08:57:30 +1000 Subject: [Fwd: PATCH - enables software RAID on Linux] In-Reply-To: <427F8834.9060509@austin.ibm.com> References: <1115593748.6305.61.camel@gaston> <427F8834.9060509@austin.ibm.com> Message-ID: <1115679451.7339.9.camel@gaston> On Mon, 2005-05-09 at 10:56 -0500, Joel Schopp wrote: > > Please see the attached patch and consider for inclusion in subsequent > > yaboot releases. > > In the forwarding no patch was attached Yup, I noticed. I've asked him to re-post to linuxppc64-dev Ben. From jschopp at austin.ibm.com Tue May 10 09:03:51 2005 From: jschopp at austin.ibm.com (Joel Schopp) Date: Mon, 09 May 2005 18:03:51 -0500 Subject: sparsemem ppc64 tidy flat memory comments and fix benign mempresent call In-Reply-To: References: Message-ID: <427FEC57.8060505@austin.ibm.com> > diff -upN reference/arch/ppc64/mm/init.c current/arch/ppc64/mm/init.c > --- reference/arch/ppc64/mm/init.c > +++ current/arch/ppc64/mm/init.c > @@ -631,18 +631,19 @@ void __init do_init_bootmem(void) > > max_pfn = max_low_pfn; > > - /* add all physical memory to the bootmem map. Also, find the first > - * presence of all LMBs*/ > + /* Add all physical memory to the bootmem map, mark each area > + * present. The first block has already been marked present above. 
> + */ > for (i=0; i < lmb.memory.cnt; i++) { > unsigned long physbase, size; > > physbase = lmb.memory.region[i].physbase; > size = lmb.memory.region[i].size; > - if (i) { /* already created mappings for first LMB */ > + if (i) { > start_pfn = physbase >> PAGE_SHIFT; > end_pfn = start_pfn + (size >> PAGE_SHIFT); > + memory_present(0, start_pfn, end_pfn); > } > - memory_present(0, start_pfn, end_pfn); > free_bootmem(physbase, size); > } Instead of moving all that around why don't we just drop the duplicate and the if altogether? I tested and sent a patch back in March that cleaned up the non-numa case pretty well. http://sourceforge.net/mailarchive/message.php?msg_id=11320001 From michael at ellerman.id.au Tue May 10 09:04:09 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Tue, 10 May 2005 09:04:09 +1000 Subject: [PATCH 1/3] ppc64: iseries_veth: Don't send packets to LPARs which aren't up In-Reply-To: <10647.1115659627@redhat.com> References: <200505071201.20123.michael@ellerman.id.au> <10647.1115659627@redhat.com> Message-ID: <200505100904.09850.michael@ellerman.id.au> On Tue, 10 May 2005 03:27, David Howells wrote: > Michael Ellerman wrote: > > The iseries_veth driver has a logic bug which means it will erroneously > > send packets to LPARs for which we don't have a connection. > > Any particular versions of the kernel? 2.6.* I believe. Certainly RHEL4's version is broken. I haven't got an RHEL3 kernel handy, but the 2.6 driver was based on the 2.4 version I believe, so it's possible the bug is in 2.4 too. cheers -- Michael Ellerman IBM OzLabs email: michael:ellerman.id.au inmsg: mpe:jabber.org wwweb: http://michael.ellerman.id.au phone: +61 2 6212 1183 (tie line 70 21183) We do not inherit the earth from our ancestors, we borrow it from our children. - S.M.A.R.T Person -------------- next part -------------- A non-text attachment was scrubbed... 
Name: not available Type: application/pgp-signature Size: 189 bytes Desc: not available Url : http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20050510/69a1e8dd/attachment.pgp From hollisb at us.ibm.com Tue May 10 09:23:13 2005 From: hollisb at us.ibm.com (Hollis Blanchard) Date: Mon, 9 May 2005 18:23:13 -0500 Subject: [PATCH 2.6] PPC64: janitorial HVSI use wait_event_timeout() In-Reply-To: <20050509182445.GP11745@austin.ibm.com> References: <20050509182445.GP11745@austin.ibm.com> Message-ID: On May 9, 2005, at 1:24 PM, Linas Vepstas wrote: > > I seem to be dragging around the following janitorial patch, it was > submitted upstream to LKML on March 6 2005, but seems not to have made > it in. Hollis, please review; I've been running with it for months. This patch has some formatting issues that need correcting. If it works for you I'm fine with the principle. (Also, please don't email my Lotus Notes account.) Thanks. -- Hollis Blanchard IBM Linux Technology Center From dustin.kirkland at us.ibm.com Tue May 10 09:21:37 2005 From: dustin.kirkland at us.ibm.com (Dustin Kirkland) Date: Mon, 09 May 2005 19:21:37 -0400 Subject: [Fwd: PATCH - enables software RAID on Linux] Message-ID: <1115680898.7274.15.camel@t41p> Hello all- I have been advised to post this patch to this list, as the yaboot-devel list appears to be somewhat stale. You might also be interested in the thread regarding yaboot 1.x ownership spurred by this post to yaboot-devel: http://lists.penguinppc.org/yaboot-devel/2005/yaboot-devel-200505/threads.html Thanks, Dustin -------------- next part -------------- An embedded message was scrubbed... From: Dustin Kirkland Subject: PATCH - enables software RAID on Linux Date: Fri, 06 May 2005 10:17:42 -0400 Size: 9319 Url: http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20050509/a54fd260/attachment.eml -------------- next part -------------- A non-text attachment was scrubbed... 
Name: not available Type: application/pgp-signature Size: 189 bytes Desc: This is a digitally signed message part Url : http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20050509/a54fd260/attachment.pgp From michael at ellerman.id.au Tue May 10 16:00:08 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Tue, 10 May 2005 16:00:08 +1000 Subject: [PATCH 1/3] ppc64: iseries_veth: Don't send packets to LPARs which aren't up In-Reply-To: <200505100904.09850.michael@ellerman.id.au> References: <200505071201.20123.michael@ellerman.id.au> <10647.1115659627@redhat.com> <200505100904.09850.michael@ellerman.id.au> Message-ID: <200505101600.17146.michael@ellerman.id.au> On Tue, 10 May 2005 09:04, Michael Ellerman wrote: > On Tue, 10 May 2005 03:27, David Howells wrote: > > Michael Ellerman wrote: > > > The iseries_veth driver has a logic bug which means it will erroneously > > > send packets to LPARs for which we don't have a connection. > > > > Any particular versions of the kernel? > > 2.6.* I believe. Certainly RHEL4's version is broken. > > I haven't got an RHEL3 kernel handy, but the 2.6 driver was based on the > 2.4 version I believe, so it's possible the bug is in 2.4 too. Well something was based on something, no one's quite sure. But the 2.4 driver looks ok, it doesn't have this code at all. cheers -- Michael Ellerman IBM OzLabs email: michael:ellerman.id.au inmsg: mpe:jabber.org wwweb: http://michael.ellerman.id.au phone: +61 2 6212 1183 (tie line 70 21183) We do not inherit the earth from our ancestors, we borrow it from our children. - S.M.A.R.T Person -------------- next part -------------- A non-text attachment was scrubbed... 
Name: not available Type: application/pgp-signature Size: 189 bytes Desc: not available Url : http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20050510/f5f19f0d/attachment.pgp From apw at shadowen.org Wed May 11 01:45:48 2005 From: apw at shadowen.org (Andy Whitcroft) Date: Tue, 10 May 2005 16:45:48 +0100 Subject: sparsemem ppc64 tidy flat memory comments and fix benign mempresent call In-Reply-To: <427FEC57.8060505@austin.ibm.com> References: <427FEC57.8060505@austin.ibm.com> Message-ID: <4280D72C.4090203@shadowen.org> > Instead of moving all that around why don't we just drop the duplicate > and the if altogether? I tested and sent a patch back in March that > cleaned up the non-numa case pretty well. > > http://sourceforge.net/mailarchive/message.php?msg_id=11320001 Ok, Mike also expressed the feeling that it was no longer necessary to handle the first block separately. I've tested the attached patch on the machines I have to hand and it seems to boot just fine in the flat memory modes with this applied. Joel, Mike, Dave could you test this one on your platforms to confirm it's widely applicable, if so we can push it up to -mm. The patch attached applies to the patches proposed for the next -mm. A full stack on top of 2.6.12-rc3-mm2 can be found at the URL below (see the series file): http://www.shadowen.org/~apw/linux/sparsemem/sparsemem-2.6.12-rc3-mm2-V3/ Cheers. -apw -------------- next part -------------- An embedded and charset-unspecified text was scrubbed... 
Name: sparsemem-ppc64-flat-first-block-is-not-special Url: http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20050510/b0adf4c9/attachment.txt From kravetz at us.ibm.com Wed May 11 05:42:25 2005 From: kravetz at us.ibm.com (mike kravetz) Date: Tue, 10 May 2005 12:42:25 -0700 Subject: sparsemem ppc64 tidy flat memory comments and fix benign mempresent call In-Reply-To: <4280D72C.4090203@shadowen.org> References: <427FEC57.8060505@austin.ibm.com> <4280D72C.4090203@shadowen.org> Message-ID: <20050510194225.GD3915@w-mikek2.ibm.com> On Tue, May 10, 2005 at 04:45:48PM +0100, Andy Whitcroft wrote: > Joel, Mike, Dave could you test this one on your platforms to confirm > its widly applicable, if so we can push it up to -mm. It works on my machine with various config options. -- Mike From david at gibson.dropbear.id.au Wed May 11 11:08:09 2005 From: david at gibson.dropbear.id.au (David Gibson) Date: Wed, 11 May 2005 11:08:09 +1000 Subject: [PATCH 1/3] ppc64: iseries_veth: Don't send packets to LPARs which aren't up In-Reply-To: <200505101600.17146.michael@ellerman.id.au> References: <200505071201.20123.michael@ellerman.id.au> <10647.1115659627@redhat.com> <200505100904.09850.michael@ellerman.id.au> <200505101600.17146.michael@ellerman.id.au> Message-ID: <20050511010809.GB18715@localhost.localdomain> On Tue, May 10, 2005 at 04:00:08PM +1000, Michael Ellerman wrote: > On Tue, 10 May 2005 09:04, Michael Ellerman wrote: > > On Tue, 10 May 2005 03:27, David Howells wrote: > > > Michael Ellerman wrote: > > > > The iseries_veth driver has a logic bug which means it will erroneously > > > > send packets to LPARs for which we don't have a connection. > > > > > > Any particular versions of the kernel? > > > > 2.6.* I believe. Certainly RHEL4's version is broken. > > > > I haven't got an RHEL3 kernel handy, but the 2.6 driver was based on the > > 2.4 version I believe, so it's possible the bug is in 2.4 too. > > Well something was based on something, no one's quite sure. 
But the 2.4 driver > looks ok, it doesn't have this code at all. The 2.6 driver is based on the 2.4 driver in the sense that the 2.4 driver was the only source of information about the virtual ethernet protocol. However, the driver was pretty much completely rewritten in the process. Unfortunately, this seems to have introduced quite a few bugs like these ones. On the other hand, the code is now comprehensible so the bugs are actually fixable.. -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/people/dgibson -------------- next part -------------- A non-text attachment was scrubbed... Name: not available Type: application/pgp-signature Size: 189 bytes Desc: Digital signature Url : http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20050511/7fcaaf61/attachment.pgp From david at gibson.dropbear.id.au Thu May 12 13:39:53 2005 From: david at gibson.dropbear.id.au (David Gibson) Date: Thu, 12 May 2005 13:39:53 +1000 Subject: First cut at four-level pagetables Message-ID: <20050512033953.GA29780@localhost.localdomain> Here's a first shot at a patch which implements true four-level page tables for ppc64. It uses full page tables at the bottom and top levels, and quarter-page tables at the middle two levels. This gives a total usable address space of 44 bits (16T). I've also tweaked the VSID allocation to let us use all that space (thereby halving the number of available contexts) and added some #if and BUILD_BUG sanity checks. Hugepages are presently completely broken, working on that now. This patch applies on top of the patch posted earlier eliminating ioremap_dir. 
Index: working-2.6/include/asm-ppc64/pgtable.h =================================================================== --- working-2.6.orig/include/asm-ppc64/pgtable.h 2005-05-12 12:08:53.000000000 +1000 +++ working-2.6/include/asm-ppc64/pgtable.h 2005-05-12 13:25:23.000000000 +1000 @@ -15,19 +15,24 @@ #include #endif /* __ASSEMBLY__ */ -#include - /* * Entries per page directory level. The PTE level must use a 64b record * for each page table entry. The PMD and PGD level use a 32b record for * each entry by assuming that each entry is page aligned. */ #define PTE_INDEX_SIZE 9 -#define PMD_INDEX_SIZE 10 -#define PGD_INDEX_SIZE 10 +#define PMD_INDEX_SIZE 7 +#define PUD_INDEX_SIZE 7 +#define PGD_INDEX_SIZE 9 + +#define PTE_TABLE_SIZE (sizeof(pte_t) << PTE_INDEX_SIZE) +#define PMD_TABLE_SIZE (sizeof(pmd_t) << PMD_INDEX_SIZE) +#define PUD_TABLE_SIZE (sizeof(pud_t) << PUD_INDEX_SIZE) +#define PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE) #define PTRS_PER_PTE (1 << PTE_INDEX_SIZE) #define PTRS_PER_PMD (1 << PMD_INDEX_SIZE) +#define PTRS_PER_PUD (1 << PMD_INDEX_SIZE) #define PTRS_PER_PGD (1 << PGD_INDEX_SIZE) /* PMD_SHIFT determines what a second-level page table entry can map */ @@ -35,8 +40,13 @@ #define PMD_SIZE (1UL << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) -/* PGDIR_SHIFT determines what a third-level page table entry can map */ -#define PGDIR_SHIFT (PMD_SHIFT + PMD_INDEX_SIZE) +/* PUD_SHIFT determines what a third-level page table entry can map */ +#define PUD_SHIFT (PMD_SHIFT + PMD_INDEX_SIZE) +#define PUD_SIZE (1UL << PUD_SHIFT) +#define PUD_MASK (~(PUD_SIZE-1)) + +/* PGDIR_SHIFT determines what a fourth-level page table entry can map */ +#define PGDIR_SHIFT (PUD_SHIFT + PUD_INDEX_SIZE) #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) @@ -45,15 +55,23 @@ /* * Size of EA range mapped by our pagetables. 
*/ -#define EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \ - PGD_INDEX_SIZE + PAGE_SHIFT) -#define EADDR_MASK ((1UL << EADDR_SIZE) - 1) +#define PGTABLE_EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \ + PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT) +#define PGTABLE_RANGE (1UL << PGTABLE_EADDR_SIZE) + +#if TASK_SIZE_USER64 > PGTABLE_RANGE +#error TASK_SIZE_USER64 exceeds pagetable range +#endif + +#if TASK_SIZE_USER64 > (1UL << (USER_ESID_BITS + SID_SHIFT)) +#error TASK_SIZE_USER64 exceeds user VSID range +#endif /* * Define the address range of the vmalloc VM area. */ #define VMALLOC_START (0xD000000000000000ul) -#define VMALLOC_SIZE (0x10000000000UL) +#define VMALLOC_SIZE (0x40000000000UL) #define VMALLOC_END (VMALLOC_START + VMALLOC_SIZE) /* @@ -151,6 +169,8 @@ #ifdef CONFIG_HUGETLB_PAGE +#error Hugepages broken for now + #ifndef __ASSEMBLY__ int hash_huge_page(struct mm_struct *mm, unsigned long access, unsigned long ea, unsigned long vsid, int local); @@ -197,39 +217,45 @@ #define pte_pfn(x) ((unsigned long)((pte_val(x) >> PTE_SHIFT))) #define pte_page(x) pfn_to_page(pte_pfn(x)) -#define pmd_set(pmdp, ptep) \ - (pmd_val(*(pmdp)) = __ba_to_bpn(ptep)) +#define pmd_set(pmdp, ptep) (pmd_val(*(pmdp)) = (unsigned long)(ptep)) #define pmd_none(pmd) (!pmd_val(pmd)) #define pmd_bad(pmd) (pmd_val(pmd) == 0) #define pmd_present(pmd) (pmd_val(pmd) != 0) #define pmd_clear(pmdp) (pmd_val(*(pmdp)) = 0) -#define pmd_page_kernel(pmd) (__bpn_to_ba(pmd_val(pmd))) +#define pmd_page_kernel(pmd) (pmd_val(pmd)) #define pmd_page(pmd) virt_to_page(pmd_page_kernel(pmd)) -#define pud_set(pudp, pmdp) (pud_val(*(pudp)) = (__ba_to_bpn(pmdp))) +#define pud_set(pudp, pmdp) (pud_val(*(pudp)) = (unsigned long)(pmdp)) #define pud_none(pud) (!pud_val(pud)) -#define pud_bad(pud) ((pud_val(pud)) == 0UL) -#define pud_present(pud) (pud_val(pud) != 0UL) -#define pud_clear(pudp) (pud_val(*(pudp)) = 0UL) -#define pud_page(pud) (__bpn_to_ba(pud_val(pud))) +#define pud_bad(pud) ((pud_val(pud)) == 0) 
+#define pud_present(pud) (pud_val(pud) != 0) +#define pud_clear(pudp) (pud_val(*(pudp)) = 0) +#define pud_page(pud) (pud_val(pud)) + +#define pgd_set(pgdp, pudp) ({pgd_val(*(pgdp)) = (unsigned long)(pudp);}) +#define pgd_none(pgd) (!pgd_val(pgd)) +#define pgd_bad(pgd) (pgd_val(pgd) == 0) +#define pgd_present(pgd) (pgd_val(pgd) != 0) +#define pgd_clear(pgdp) (pgd_val(*(pgdp)) = 0) +#define pgd_page(pgd) (pgd_val(pgd)) /* * Find an entry in a page-table-directory. We combine the address region * (the high order N bits) and the pgd portion of the address. */ /* to avoid overflow in free_pgtables we don't use PTRS_PER_PGD here */ -#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & 0x7ff) +#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & 0x1ff) #define pgd_offset(mm, address) ((mm)->pgd + pgd_index(address)) -/* Find an entry in the second-level page table.. */ +#define pud_offset(pgdp, addr) \ + (((pud_t *) pgd_page(*(pgdp))) + (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))) + #define pmd_offset(pudp,addr) \ - ((pmd_t *) pud_page(*(pudp)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) + (((pmd_t *) pud_page(*(pudp))) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) -/* Find an entry in the third-level page table.. 
*/ #define pte_offset_kernel(dir,addr) \ - ((pte_t *) pmd_page_kernel(*(dir)) \ - + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))) + (((pte_t *) pmd_page_kernel(*(dir))) + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))) #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) #define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr)) @@ -458,9 +484,11 @@ #define pte_same(A,B) (((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0) #define pmd_ERROR(e) \ - printk("%s:%d: bad pmd %08x.\n", __FILE__, __LINE__, pmd_val(e)) + printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e)) +#define pud_ERROR(e) \ + printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pud_val(e)) #define pgd_ERROR(e) \ - printk("%s:%d: bad pgd %08x.\n", __FILE__, __LINE__, pgd_val(e)) + printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) extern pgd_t swapper_pg_dir[]; Index: working-2.6/include/asm-ppc64/page.h =================================================================== --- working-2.6.orig/include/asm-ppc64/page.h 2005-05-12 12:08:53.000000000 +1000 +++ working-2.6/include/asm-ppc64/page.h 2005-05-12 12:08:54.000000000 +1000 @@ -35,6 +35,8 @@ #ifdef CONFIG_HUGETLB_PAGE +#error Hugepages broken for now + #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) /* For 64-bit processes the hugepage range is 1T-1.5T */ @@ -91,7 +93,7 @@ #ifndef __ASSEMBLY__ #include -#undef STRICT_MM_TYPECHECKS +#define STRICT_MM_TYPECHECKS #define REGION_SIZE 4UL #define REGION_SHIFT 60UL @@ -125,27 +127,31 @@ * Entries in the pte table are 64b, while entries in the pgd & pmd are 32b. 
*/ typedef struct { unsigned long pte; } pte_t; -typedef struct { unsigned int pmd; } pmd_t; -typedef struct { unsigned int pgd; } pgd_t; +typedef struct { unsigned long pmd; } pmd_t; +typedef struct { unsigned long pud; } pud_t; +typedef struct { unsigned long pgd; } pgd_t; typedef struct { unsigned long pgprot; } pgprot_t; #define pte_val(x) ((x).pte) #define pmd_val(x) ((x).pmd) +#define pud_val(x) ((x).pud) #define pgd_val(x) ((x).pgd) #define pgprot_val(x) ((x).pgprot) -#define __pte(x) ((pte_t) { (x) } ) -#define __pmd(x) ((pmd_t) { (x) } ) -#define __pgd(x) ((pgd_t) { (x) } ) -#define __pgprot(x) ((pgprot_t) { (x) } ) +#define __pte(x) ((pte_t) { (x) }) +#define __pmd(x) ((pmd_t) { (x) }) +#define __pud(x) ((pud_t) { (x) }) +#define __pgd(x) ((pgd_t) { (x) }) +#define __pgprot(x) ((pgprot_t) { (x) }) #else /* * .. while these make it easier on the compiler */ typedef unsigned long pte_t; -typedef unsigned int pmd_t; -typedef unsigned int pgd_t; +typedef unsigned long pmd_t; +typedef unsigned long pud_t; +typedef unsigned long pgd_t; typedef unsigned long pgprot_t; #define pte_val(x) (x) @@ -208,9 +214,6 @@ #define USER_REGION_ID (0UL) #define REGION_ID(ea) (((unsigned long)(ea)) >> REGION_SHIFT) -#define __bpn_to_ba(x) ((((unsigned long)(x)) << PAGE_SHIFT) + KERNELBASE) -#define __ba_to_bpn(x) ((((unsigned long)(x)) & ~REGION_MASK) >> PAGE_SHIFT) - #define __va(x) ((void *)((unsigned long)(x) + KERNELBASE)) #ifdef CONFIG_DISCONTIGMEM Index: working-2.6/include/asm-ppc64/pgalloc.h =================================================================== --- working-2.6.orig/include/asm-ppc64/pgalloc.h 2005-05-12 12:08:53.000000000 +1000 +++ working-2.6/include/asm-ppc64/pgalloc.h 2005-05-12 12:08:54.000000000 +1000 @@ -6,7 +6,7 @@ #include #include -extern kmem_cache_t *zero_cache; +extern kmem_cache_t *pmd_cache; /* * This program is free software; you can redistribute it and/or @@ -18,13 +18,31 @@ static inline pgd_t * pgd_alloc(struct mm_struct *mm) { - return 
kmem_cache_alloc(zero_cache, GFP_KERNEL); + return (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO); } static inline void pgd_free(pgd_t *pgd) { - kmem_cache_free(zero_cache, pgd); + free_page((unsigned long)pgd); +} + +#define pgd_populate(MM, PGD, PUD) pgd_set(PGD, PUD) + +static inline pud_t * +pud_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + pud_t *pudp; + + pudp = kmem_cache_alloc(pmd_cache, GFP_KERNEL|__GFP_REPEAT); + memset(pudp, 0, PUD_TABLE_SIZE); + return pudp; +} + +static inline void +pud_free(pud_t *pud) +{ + kmem_cache_free(pmd_cache, pud); } #define pud_populate(MM, PUD, PMD) pud_set(PUD, PMD) @@ -32,13 +50,17 @@ static inline pmd_t * pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT); + pmd_t *pmdp; + + pmdp = kmem_cache_alloc(pmd_cache, GFP_KERNEL|__GFP_REPEAT); + memset(pmdp, 0, PMD_TABLE_SIZE); + return pmdp; } static inline void pmd_free(pmd_t *pmd) { - kmem_cache_free(zero_cache, pmd); + kmem_cache_free(pmd_cache, pmd); } #define pmd_populate_kernel(mm, pmd, pte) pmd_set(pmd, pte) @@ -47,44 +69,54 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT); + return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); } static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) { - pte_t *pte = kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT); - if (pte) - return virt_to_page(pte); - return NULL; + return alloc_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); } static inline void pte_free_kernel(pte_t *pte) { - kmem_cache_free(zero_cache, pte); + free_page((unsigned long)pte); } static inline void pte_free(struct page *ptepage) { - kmem_cache_free(zero_cache, page_address(ptepage)); + __free_page(ptepage); } -struct pte_freelist_batch +typedef struct pgtable_free { + unsigned long val; +} pgtable_free_t; + +static inline pgtable_free_t 
pgtable_free_page(struct page *page) { - struct rcu_head rcu; - unsigned int index; - struct page * pages[0]; -}; + return (pgtable_free_t){.val = (unsigned long) page}; +} -#define PTE_FREELIST_SIZE ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) / \ - sizeof(struct page *)) +static inline pgtable_free_t pgtable_free_cache(void *p) +{ + return (pgtable_free_t){.val = ((unsigned long) p) | 1}; +} -extern void pte_free_now(struct page *ptepage); -extern void pte_free_submit(struct pte_freelist_batch *batch); +static inline void pgtable_free(pgtable_free_t pgf) +{ + if (pgf.val & 1) + kmem_cache_free(pmd_cache, (void *)(pgf.val & ~1)); + else + __free_page((struct page *)pgf.val); +} -DECLARE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); +void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf); -void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage); -#define __pmd_free_tlb(tlb, pmd) __pte_free_tlb(tlb, virt_to_page(pmd)) +#define __pte_free_tlb(tlb, ptepage) \ + pgtable_free_tlb(tlb, pgtable_free_page(ptepage)) +#define __pmd_free_tlb(tlb, pmd) \ + pgtable_free_tlb(tlb, pgtable_free_cache(pmd)) +#define __pud_free_tlb(tlb, pmd) \ + pgtable_free_tlb(tlb, pgtable_free_cache(pud)) #define check_pgt_cache() do { } while (0) Index: working-2.6/arch/ppc64/mm/init.c =================================================================== --- working-2.6.orig/arch/ppc64/mm/init.c 2005-05-12 12:08:53.000000000 +1000 +++ working-2.6/arch/ppc64/mm/init.c 2005-05-12 13:28:03.000000000 +1000 @@ -66,6 +66,14 @@ #include #include +#if PGTABLE_RANGE > USER_VSID_RANGE +#warning Limited user VSID range means pagetable space is wasted +#endif + +#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) +#warning TASK_SIZE is smaller than it needs to be. 
+#endif + int mem_init_done; unsigned long ioremap_bot = IMALLOC_BASE; static unsigned long phbs_io_bot = PHBS_IO_BASE; @@ -292,7 +300,7 @@ * Before that, we map using addresses going * up from ioremap_bot. imalloc will use * the addresses from ioremap_bot through - * IMALLOC_END (0xE000001fffffffff) + * IMALLOC_END * */ pa = addr & PAGE_MASK; @@ -896,23 +904,19 @@ return virt_addr; } -kmem_cache_t *zero_cache; - -static void zero_ctor(void *pte, kmem_cache_t *cache, unsigned long flags) -{ - memset(pte, 0, PAGE_SIZE); -} +kmem_cache_t *pmd_cache; void pgtable_cache_init(void) { - zero_cache = kmem_cache_create("zero", - PAGE_SIZE, - 0, - SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, - zero_ctor, - NULL); - if (!zero_cache) - panic("pgtable_cache_init(): could not create zero_cache!\n"); + BUILD_BUG_ON(PTE_TABLE_SIZE != PAGE_SIZE); + BUILD_BUG_ON(PMD_TABLE_SIZE != PUD_TABLE_SIZE); + BUILD_BUG_ON(PGD_TABLE_SIZE != PAGE_SIZE); + + pmd_cache = kmem_cache_create("pmd", PMD_TABLE_SIZE, PMD_TABLE_SIZE, + SLAB_POISON |SLAB_DEBUG_INITIAL, + NULL, NULL); + if (! 
pmd_cache) + panic("pmd_pud_cache_init(): could not create pmd_pud_cache!\n"); } pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr, Index: working-2.6/include/asm-ppc64/processor.h =================================================================== --- working-2.6.orig/include/asm-ppc64/processor.h 2005-05-12 12:08:53.000000000 +1000 +++ working-2.6/include/asm-ppc64/processor.h 2005-05-12 13:29:50.000000000 +1000 @@ -531,7 +531,7 @@ extern struct task_struct *last_task_used_altivec; /* 64-bit user address space is 41-bits (2TBs user VM) */ -#define TASK_SIZE_USER64 (0x0000020000000000UL) +#define TASK_SIZE_USER64 (0x0000100000000000UL) /* * 32-bit user address space is 4GB - 1 page Index: working-2.6/arch/ppc64/kernel/head.S =================================================================== --- working-2.6.orig/arch/ppc64/kernel/head.S 2005-05-12 12:08:53.000000000 +1000 +++ working-2.6/arch/ppc64/kernel/head.S 2005-05-12 12:08:54.000000000 +1000 @@ -38,6 +38,7 @@ #include #include #include +#include #ifdef CONFIG_PPC_ISERIES #define DO_SOFT_DISABLE @@ -2117,17 +2118,17 @@ empty_zero_page: .space 4096 - .globl swapper_pg_dir -swapper_pg_dir: - .space 4096 - #ifdef CONFIG_SMP /* 1 page segment table per cpu (max 48, cpu0 allocated at STAB0_PHYS_ADDR) */ .globl stab_array stab_array: .space 4096 * 48 #endif - + + .globl swapper_pg_dir +swapper_pg_dir: + .space PAGE_SIZE + /* * This space gets a copy of optional info passed to us by the bootstrap * Used to pass parameters into the kernel like root=/dev/sda1, etc. 
Index: working-2.6/arch/ppc64/mm/imalloc.c =================================================================== --- working-2.6.orig/arch/ppc64/mm/imalloc.c 2005-05-11 10:05:49.000000000 +1000 +++ working-2.6/arch/ppc64/mm/imalloc.c 2005-05-12 13:28:17.000000000 +1000 @@ -30,7 +30,7 @@ break; if ((unsigned long)tmp->addr >= ioremap_bot) addr = tmp->size + (unsigned long) tmp->addr; - if (addr > IMALLOC_END-size) + if (addr >= IMALLOC_END-size) return 1; } *im_addr = addr; Index: working-2.6/arch/ppc64/mm/hash_utils.c =================================================================== --- working-2.6.orig/arch/ppc64/mm/hash_utils.c 2005-05-12 11:56:11.000000000 +1000 +++ working-2.6/arch/ppc64/mm/hash_utils.c 2005-05-12 13:26:06.000000000 +1000 @@ -298,7 +298,7 @@ int local = 0; cpumask_t tmp; - if ((ea & ~REGION_MASK) > EADDR_MASK) + if ((ea & ~REGION_MASK) >= PGTABLE_RANGE) return 1; switch (REGION_ID(ea)) { Index: working-2.6/include/asm-ppc64/mmu.h =================================================================== --- working-2.6.orig/include/asm-ppc64/mmu.h 2005-05-11 10:05:51.000000000 +1000 +++ working-2.6/include/asm-ppc64/mmu.h 2005-05-12 13:24:06.000000000 +1000 @@ -259,8 +259,10 @@ #define VSID_BITS 36 #define VSID_MODULUS ((1UL< References: <20050512033953.GA29780@localhost.localdomain> Message-ID: <20050512040804.GB29780@localhost.localdomain> On Thu, May 12, 2005 at 01:39:53PM +1000, David Gibson wrote: > Here's a first shot at patch which implements true four-level page > tables for ppc64. It uses full page tables at the bottom and top > levels, and quarter-page tables at the middle two levels. This gives > a total usable address space of 44 bits (16T). I've also tweaked the > VSID allocation to let us use all that space (thereby halving the > number of available contexts) and added some #if and BUILD_BUG sanity > checks. > > Hugepages are presently completely broken, working on that now. 
This > patch applies on top of the patch posted earlier eliminating > ioremap_dir. Bah, sorry. Patch broken. Changes to arch/ppc64/mm/tlb.c are missing. Fixed version later. -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/people/dgibson From david at gibson.dropbear.id.au Thu May 12 14:45:46 2005 From: david at gibson.dropbear.id.au (David Gibson) Date: Thu, 12 May 2005 14:45:46 +1000 Subject: Revised patch to kill ioremap_mm Message-ID: <20050512044546.GD29780@localhost.localdomain> Here's a new version of the patch which removes unmap_im_area() and its pagetable walking functions, as well as the extra set of tables themselves. arch/ppc64/kernel/eeh.c | 2 arch/ppc64/kernel/head.S | 4 - arch/ppc64/kernel/process.c | 8 --- arch/ppc64/mm/hash_utils.c | 4 - arch/ppc64/mm/imalloc.c | 20 +++++---- arch/ppc64/mm/init.c | 93 ++++-------------------------------------- include/asm-ppc64/imalloc.h | 12 +++-- include/asm-ppc64/page.h | 2 include/asm-ppc64/pgtable.h | 9 ---- include/asm-ppc64/processor.h | 10 ---- 10 files changed, 31 insertions(+), 133 deletions(-) Currently ppc64 has two mm_structs for the kernel, init_mm and also ioremap_mm. The latter really isn't necessary: this patch abolishes it, instead restricting vmallocs to the lower 1TB of the init_mm's range and placing io mappings in the upper 1TB. This simplifies the code in a number of places and eliminates an unnecessary set of pagetables. It also tweaks the unmap/free path a little, allowing us to also remove the im_ set of page table walkers, replacing them with unmap_vm_area(). 
Signed-off-by: David Gibson Index: working-2.6/include/asm-ppc64/pgtable.h =================================================================== --- working-2.6.orig/include/asm-ppc64/pgtable.h 2005-05-11 10:05:51.000000000 +1000 +++ working-2.6/include/asm-ppc64/pgtable.h 2005-05-12 14:08:37.000000000 +1000 @@ -53,7 +53,8 @@ * Define the address range of the vmalloc VM area. */ #define VMALLOC_START (0xD000000000000000ul) -#define VMALLOC_END (VMALLOC_START + EADDR_MASK) +#define VMALLOC_SIZE (0x10000000000UL) +#define VMALLOC_END (VMALLOC_START + VMALLOC_SIZE) /* * Bits in a linux-style PTE. These match the bits in the @@ -239,9 +240,6 @@ /* This now only contains the vmalloc pages */ #define pgd_offset_k(address) pgd_offset(&init_mm, address) -/* to find an entry in the ioremap page-table-directory */ -#define pgd_offset_i(address) (ioremap_pgd + pgd_index(address)) - /* * The following only work if pte_present() is true. * Undefined behaviour if not.. @@ -459,15 +457,12 @@ #define __HAVE_ARCH_PTE_SAME #define pte_same(A,B) (((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0) -extern unsigned long ioremap_bot, ioremap_base; - #define pmd_ERROR(e) \ printk("%s:%d: bad pmd %08x.\n", __FILE__, __LINE__, pmd_val(e)) #define pgd_ERROR(e) \ printk("%s:%d: bad pgd %08x.\n", __FILE__, __LINE__, pgd_val(e)) extern pgd_t swapper_pg_dir[]; -extern pgd_t ioremap_dir[]; extern void paging_init(void); Index: working-2.6/include/asm-ppc64/imalloc.h =================================================================== --- working-2.6.orig/include/asm-ppc64/imalloc.h 2005-05-11 10:05:51.000000000 +1000 +++ working-2.6/include/asm-ppc64/imalloc.h 2005-05-12 14:08:37.000000000 +1000 @@ -4,9 +4,9 @@ /* * Define the address range of the imalloc VM area. 
*/ -#define PHBS_IO_BASE IOREGIONBASE -#define IMALLOC_BASE (IOREGIONBASE + 0x80000000ul) /* Reserve 2 gigs for PHBs */ -#define IMALLOC_END (IOREGIONBASE + EADDR_MASK) +#define PHBS_IO_BASE VMALLOC_END +#define IMALLOC_BASE (PHBS_IO_BASE + 0x80000000ul) /* Reserve 2 gigs for PHBs */ +#define IMALLOC_END (VMALLOC_START + EADDR_MASK) /* imalloc region types */ @@ -18,7 +18,9 @@ extern struct vm_struct * im_get_free_area(unsigned long size); extern struct vm_struct * im_get_area(unsigned long v_addr, unsigned long size, - int region_type); -unsigned long im_free(void *addr); + int region_type); +extern void im_free(void *addr); + +extern unsigned long ioremap_bot; #endif /* _PPC64_IMALLOC_H */ Index: working-2.6/include/asm-ppc64/page.h =================================================================== --- working-2.6.orig/include/asm-ppc64/page.h 2005-05-11 10:05:51.000000000 +1000 +++ working-2.6/include/asm-ppc64/page.h 2005-05-12 14:08:37.000000000 +1000 @@ -202,9 +202,7 @@ #define PAGE_OFFSET ASM_CONST(0xC000000000000000) #define KERNELBASE PAGE_OFFSET #define VMALLOCBASE ASM_CONST(0xD000000000000000) -#define IOREGIONBASE ASM_CONST(0xE000000000000000) -#define IO_REGION_ID (IOREGIONBASE >> REGION_SHIFT) #define VMALLOC_REGION_ID (VMALLOCBASE >> REGION_SHIFT) #define KERNEL_REGION_ID (KERNELBASE >> REGION_SHIFT) #define USER_REGION_ID (0UL) Index: working-2.6/arch/ppc64/kernel/eeh.c =================================================================== --- working-2.6.orig/arch/ppc64/kernel/eeh.c 2005-04-26 15:37:55.000000000 +1000 +++ working-2.6/arch/ppc64/kernel/eeh.c 2005-05-12 14:05:12.000000000 +1000 @@ -505,7 +505,7 @@ pte_t *ptep; unsigned long pa; - ptep = find_linux_pte(ioremap_mm.pgd, token); + ptep = find_linux_pte(init_mm.pgd, token); if (!ptep) return token; pa = pte_pfn(*ptep) << PAGE_SHIFT; Index: working-2.6/arch/ppc64/kernel/process.c =================================================================== --- 
working-2.6.orig/arch/ppc64/kernel/process.c 2005-04-26 15:37:55.000000000 +1000 +++ working-2.6/arch/ppc64/kernel/process.c 2005-05-12 14:05:12.000000000 +1000 @@ -58,14 +58,6 @@ struct task_struct *last_task_used_altivec = NULL; #endif -struct mm_struct ioremap_mm = { - .pgd = ioremap_dir, - .mm_users = ATOMIC_INIT(2), - .mm_count = ATOMIC_INIT(1), - .cpu_vm_mask = CPU_MASK_ALL, - .page_table_lock = SPIN_LOCK_UNLOCKED, -}; - /* * Make sure the floating-point register state in the * the thread_struct is up to date for task tsk. Index: working-2.6/include/asm-ppc64/processor.h =================================================================== --- working-2.6.orig/include/asm-ppc64/processor.h 2005-04-26 15:38:02.000000000 +1000 +++ working-2.6/include/asm-ppc64/processor.h 2005-05-12 14:08:37.000000000 +1000 @@ -590,16 +590,6 @@ } /* - * Note: the vm_start and vm_end fields here should *not* - * be in kernel space. (Could vm_end == vm_start perhaps?) - */ -#define IOREMAP_MMAP { &ioremap_mm, 0, 0x1000, NULL, \ - PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, \ - 1, NULL, NULL } - -extern struct mm_struct ioremap_mm; - -/* * Return saved PC of a blocked thread. 
For now, this is the "user" PC */ #define thread_saved_pc(tsk) \ Index: working-2.6/arch/ppc64/mm/hash_utils.c =================================================================== --- working-2.6.orig/arch/ppc64/mm/hash_utils.c 2005-05-11 10:05:49.000000000 +1000 +++ working-2.6/arch/ppc64/mm/hash_utils.c 2005-05-12 14:08:37.000000000 +1000 @@ -310,10 +310,6 @@ vsid = get_vsid(mm->context.id, ea); break; - case IO_REGION_ID: - mm = &ioremap_mm; - vsid = get_kernel_vsid(ea); - break; case VMALLOC_REGION_ID: mm = &init_mm; vsid = get_kernel_vsid(ea); Index: working-2.6/arch/ppc64/mm/init.c =================================================================== --- working-2.6.orig/arch/ppc64/mm/init.c 2005-05-11 10:05:49.000000000 +1000 +++ working-2.6/arch/ppc64/mm/init.c 2005-05-12 14:11:16.000000000 +1000 @@ -73,9 +73,6 @@ extern pgd_t swapper_pg_dir[]; extern struct task_struct *current_set[NR_CPUS]; -extern pgd_t ioremap_dir[]; -pgd_t * ioremap_pgd = (pgd_t *)&ioremap_dir; - unsigned long klimit = (unsigned long)_end; unsigned long _SDR1=0; @@ -137,69 +134,6 @@ #else -static void unmap_im_area_pte(pmd_t *pmd, unsigned long addr, - unsigned long end) -{ - pte_t *pte; - - pte = pte_offset_kernel(pmd, addr); - do { - pte_t ptent = ptep_get_and_clear(&ioremap_mm, addr, pte); - WARN_ON(!pte_none(ptent) && !pte_present(ptent)); - } while (pte++, addr += PAGE_SIZE, addr != end); -} - -static inline void unmap_im_area_pmd(pud_t *pud, unsigned long addr, - unsigned long end) -{ - pmd_t *pmd; - unsigned long next; - - pmd = pmd_offset(pud, addr); - do { - next = pmd_addr_end(addr, end); - if (pmd_none_or_clear_bad(pmd)) - continue; - unmap_im_area_pte(pmd, addr, next); - } while (pmd++, addr = next, addr != end); -} - -static inline void unmap_im_area_pud(pgd_t *pgd, unsigned long addr, - unsigned long end) -{ - pud_t *pud; - unsigned long next; - - pud = pud_offset(pgd, addr); - do { - next = pud_addr_end(addr, end); - if (pud_none_or_clear_bad(pud)) - continue; - 
unmap_im_area_pmd(pud, addr, next); - } while (pud++, addr = next, addr != end); -} - -static void unmap_im_area(unsigned long addr, unsigned long end) -{ - struct mm_struct *mm = &ioremap_mm; - unsigned long next; - pgd_t *pgd; - - spin_lock(&mm->page_table_lock); - - pgd = pgd_offset_i(addr); - flush_cache_vunmap(addr, end); - do { - next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) - continue; - unmap_im_area_pud(pgd, addr, next); - } while (pgd++, addr = next, addr != end); - flush_tlb_kernel_range(start, end); - - spin_unlock(&mm->page_table_lock); -} - /* * map_io_page currently only called by __ioremap * map_io_page adds an entry to the ioremap page table @@ -214,21 +148,21 @@ unsigned long vsid; if (mem_init_done) { - spin_lock(&ioremap_mm.page_table_lock); - pgdp = pgd_offset_i(ea); - pudp = pud_alloc(&ioremap_mm, pgdp, ea); + spin_lock(&init_mm.page_table_lock); + pgdp = pgd_offset_k(ea); + pudp = pud_alloc(&init_mm, pgdp, ea); if (!pudp) return -ENOMEM; - pmdp = pmd_alloc(&ioremap_mm, pudp, ea); + pmdp = pmd_alloc(&init_mm, pudp, ea); if (!pmdp) return -ENOMEM; - ptep = pte_alloc_kernel(&ioremap_mm, pmdp, ea); + ptep = pte_alloc_kernel(&init_mm, pmdp, ea); if (!ptep) return -ENOMEM; pa = abs_to_phys(pa); - set_pte_at(&ioremap_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags))); - spin_unlock(&ioremap_mm.page_table_lock); + spin_unlock(&init_mm.page_table_lock); } else { unsigned long va, vpn, hash, hpteg; @@ -267,13 +201,9 @@ for (i = 0; i < size; i += PAGE_SIZE) if (map_io_page(ea+i, pa+i, flags)) - goto failure; + return NULL; return (void __iomem *) (ea + (addr & ~PAGE_MASK)); - failure: - if (mem_init_done) - unmap_im_area(ea, ea + size); - return NULL; } @@ -381,19 +311,14 @@ */ void iounmap(volatile void __iomem *token) { - unsigned long address, size; void *addr; if (!mem_init_done) return; addr = (void *) ((unsigned long __force) token & PAGE_MASK); - - if ((size = 
im_free(addr)) == 0) - return; - address = (unsigned long)addr; - unmap_im_area(address, address + size); + im_free(addr); } static int iounmap_subset_regions(unsigned long addr, unsigned long size) Index: working-2.6/arch/ppc64/kernel/head.S =================================================================== --- working-2.6.orig/arch/ppc64/kernel/head.S 2005-04-26 15:37:55.000000000 +1000 +++ working-2.6/arch/ppc64/kernel/head.S 2005-05-12 14:08:37.000000000 +1000 @@ -2121,10 +2121,6 @@ swapper_pg_dir: .space 4096 - .globl ioremap_dir -ioremap_dir: - .space 4096 - #ifdef CONFIG_SMP /* 1 page segment table per cpu (max 48, cpu0 allocated at STAB0_PHYS_ADDR) */ .globl stab_array Index: working-2.6/arch/ppc64/mm/imalloc.c =================================================================== --- working-2.6.orig/arch/ppc64/mm/imalloc.c 2005-05-11 10:05:49.000000000 +1000 +++ working-2.6/arch/ppc64/mm/imalloc.c 2005-05-12 14:20:33.000000000 +1000 @@ -15,6 +15,7 @@ #include #include #include +#include static DECLARE_MUTEX(imlist_sem); struct vm_struct * imlist = NULL; @@ -285,29 +286,32 @@ return area; } -unsigned long im_free(void * addr) +void im_free(void * addr) { struct vm_struct **p, *tmp; - unsigned long ret_size = 0; if (!addr) - return ret_size; - if ((PAGE_SIZE-1) & (unsigned long) addr) { + return; + if ((unsigned long) addr & ~PAGE_MASK) { printk(KERN_ERR "Trying to %s bad address (%p)\n", __FUNCTION__, addr); - return ret_size; + return; } down(&imlist_sem); for (p = &imlist ; (tmp = *p) ; p = &tmp->next) { if (tmp->addr == addr) { - ret_size = tmp->size; *p = tmp->next; + + /* XXX: do we need the lock? 
*/ + spin_lock(&init_mm.page_table_lock); + unmap_vm_area(tmp); + spin_unlock(&init_mm.page_table_lock); + kfree(tmp); up(&imlist_sem); - return ret_size; + return; } } up(&imlist_sem); printk(KERN_ERR "Trying to %s nonexistent area (%p)\n", __FUNCTION__, addr); - return ret_size; } -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/people/dgibson From david at gibson.dropbear.id.au Thu May 12 15:56:10 2005 From: david at gibson.dropbear.id.au (David Gibson) Date: Thu, 12 May 2005 15:56:10 +1000 Subject: First cut at four-level pagetables In-Reply-To: <20050512040804.GB29780@localhost.localdomain> References: <20050512033953.GA29780@localhost.localdomain> <20050512040804.GB29780@localhost.localdomain> Message-ID: <20050512055610.GF29780@localhost.localdomain> On Thu, May 12, 2005 at 02:08:04PM +1000, David Gibson wrote: > On Thu, May 12, 2005 at 01:39:53PM +1000, David Gibson wrote: > > Here's a first shot at patch which implements true four-level page > > tables for ppc64. It uses full page tables at the bottom and top > > levels, and quarter-page tables at the middle two levels. This gives > > a total usable address space of 44 bits (16T). I've also tweaked the > > VSID allocation to let us use all that space (thereby halving the > > number of available contexts) and added some #if and BUILD_BUG sanity > > checks. > > > > Hugepages are presently completely broken, working on that now. This > > patch applies on top of the patch posted earlier eliminating > > ioremap_dir. > > Bah, sorry. Patch broken. Changes to arch/ppc64/mm/tlb.c are > missing. Fixed version later. Ok, better patch below. Hugepages are still broken, but otherwise this seems to work, at least against elementary testing. 
arch/ppc64/kernel/head.S | 11 ++-- arch/ppc64/mm/hash_utils.c | 2 arch/ppc64/mm/imalloc.c | 2 arch/ppc64/mm/init.c | 34 ++++++++------- arch/ppc64/mm/tlb.c | 95 ++++++++++++++++++++++++------------------ include/asm-ppc64/imalloc.h | 2 include/asm-ppc64/mmu.h | 6 +- include/asm-ppc64/page.h | 27 ++++++----- include/asm-ppc64/pgalloc.h | 80 ++++++++++++++++++++++++----------- include/asm-ppc64/pgtable.h | 80 +++++++++++++++++++++++------------ include/asm-ppc64/processor.h | 2 11 files changed, 213 insertions(+), 128 deletions(-) This patch implements full four-level page tables for ppc64. It uses a full page for the tables at the bottom and top level, and a quarter page for the intermediate levels. This gives a total usable address space of 44 bits (16T). This patch also tweaks the VSID allocation to have a matching range for user addresses (thereby halving the number of available contexts) and adds some #if and BUILD_BUG sanity checks. Index: working-2.6/include/asm-ppc64/pgtable.h =================================================================== --- working-2.6.orig/include/asm-ppc64/pgtable.h 2005-05-12 14:24:04.000000000 +1000 +++ working-2.6/include/asm-ppc64/pgtable.h 2005-05-12 14:36:48.000000000 +1000 @@ -15,19 +15,24 @@ #include #endif /* __ASSEMBLY__ */ -#include - /* * Entries per page directory level. The PTE level must use a 64b record * for each page table entry. The PMD and PGD level use a 32b record for * each entry by assuming that each entry is page aligned. 
*/ #define PTE_INDEX_SIZE 9 -#define PMD_INDEX_SIZE 10 -#define PGD_INDEX_SIZE 10 +#define PMD_INDEX_SIZE 7 +#define PUD_INDEX_SIZE 7 +#define PGD_INDEX_SIZE 9 + +#define PTE_TABLE_SIZE (sizeof(pte_t) << PTE_INDEX_SIZE) +#define PMD_TABLE_SIZE (sizeof(pmd_t) << PMD_INDEX_SIZE) +#define PUD_TABLE_SIZE (sizeof(pud_t) << PUD_INDEX_SIZE) +#define PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE) #define PTRS_PER_PTE (1 << PTE_INDEX_SIZE) #define PTRS_PER_PMD (1 << PMD_INDEX_SIZE) +#define PTRS_PER_PUD (1 << PMD_INDEX_SIZE) #define PTRS_PER_PGD (1 << PGD_INDEX_SIZE) /* PMD_SHIFT determines what a second-level page table entry can map */ @@ -35,8 +40,13 @@ #define PMD_SIZE (1UL << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) -/* PGDIR_SHIFT determines what a third-level page table entry can map */ -#define PGDIR_SHIFT (PMD_SHIFT + PMD_INDEX_SIZE) +/* PUD_SHIFT determines what a third-level page table entry can map */ +#define PUD_SHIFT (PMD_SHIFT + PMD_INDEX_SIZE) +#define PUD_SIZE (1UL << PUD_SHIFT) +#define PUD_MASK (~(PUD_SIZE-1)) + +/* PGDIR_SHIFT determines what a fourth-level page table entry can map */ +#define PGDIR_SHIFT (PUD_SHIFT + PUD_INDEX_SIZE) #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) @@ -45,15 +55,23 @@ /* * Size of EA range mapped by our pagetables. */ -#define EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \ - PGD_INDEX_SIZE + PAGE_SHIFT) -#define EADDR_MASK ((1UL << EADDR_SIZE) - 1) +#define PGTABLE_EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \ + PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT) +#define PGTABLE_RANGE (1UL << PGTABLE_EADDR_SIZE) + +#if TASK_SIZE_USER64 > PGTABLE_RANGE +#error TASK_SIZE_USER64 exceeds pagetable range +#endif + +#if TASK_SIZE_USER64 > (1UL << (USER_ESID_BITS + SID_SHIFT)) +#error TASK_SIZE_USER64 exceeds user VSID range +#endif /* * Define the address range of the vmalloc VM area. 
*/ #define VMALLOC_START (0xD000000000000000ul) -#define VMALLOC_SIZE (0x10000000000UL) +#define VMALLOC_SIZE (0x40000000000UL) #define VMALLOC_END (VMALLOC_START + VMALLOC_SIZE) /* @@ -151,6 +169,8 @@ #ifdef CONFIG_HUGETLB_PAGE +#error Hugepages broken for now + #ifndef __ASSEMBLY__ int hash_huge_page(struct mm_struct *mm, unsigned long access, unsigned long ea, unsigned long vsid, int local); @@ -197,39 +217,45 @@ #define pte_pfn(x) ((unsigned long)((pte_val(x) >> PTE_SHIFT))) #define pte_page(x) pfn_to_page(pte_pfn(x)) -#define pmd_set(pmdp, ptep) \ - (pmd_val(*(pmdp)) = __ba_to_bpn(ptep)) +#define pmd_set(pmdp, ptep) (pmd_val(*(pmdp)) = (unsigned long)(ptep)) #define pmd_none(pmd) (!pmd_val(pmd)) #define pmd_bad(pmd) (pmd_val(pmd) == 0) #define pmd_present(pmd) (pmd_val(pmd) != 0) #define pmd_clear(pmdp) (pmd_val(*(pmdp)) = 0) -#define pmd_page_kernel(pmd) (__bpn_to_ba(pmd_val(pmd))) +#define pmd_page_kernel(pmd) (pmd_val(pmd)) #define pmd_page(pmd) virt_to_page(pmd_page_kernel(pmd)) -#define pud_set(pudp, pmdp) (pud_val(*(pudp)) = (__ba_to_bpn(pmdp))) +#define pud_set(pudp, pmdp) (pud_val(*(pudp)) = (unsigned long)(pmdp)) #define pud_none(pud) (!pud_val(pud)) -#define pud_bad(pud) ((pud_val(pud)) == 0UL) -#define pud_present(pud) (pud_val(pud) != 0UL) -#define pud_clear(pudp) (pud_val(*(pudp)) = 0UL) -#define pud_page(pud) (__bpn_to_ba(pud_val(pud))) +#define pud_bad(pud) ((pud_val(pud)) == 0) +#define pud_present(pud) (pud_val(pud) != 0) +#define pud_clear(pudp) (pud_val(*(pudp)) = 0) +#define pud_page(pud) (pud_val(pud)) + +#define pgd_set(pgdp, pudp) ({pgd_val(*(pgdp)) = (unsigned long)(pudp);}) +#define pgd_none(pgd) (!pgd_val(pgd)) +#define pgd_bad(pgd) (pgd_val(pgd) == 0) +#define pgd_present(pgd) (pgd_val(pgd) != 0) +#define pgd_clear(pgdp) (pgd_val(*(pgdp)) = 0) +#define pgd_page(pgd) (pgd_val(pgd)) /* * Find an entry in a page-table-directory. We combine the address region * (the high order N bits) and the pgd portion of the address. 
*/ /* to avoid overflow in free_pgtables we don't use PTRS_PER_PGD here */ -#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & 0x7ff) +#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & 0x1ff) #define pgd_offset(mm, address) ((mm)->pgd + pgd_index(address)) -/* Find an entry in the second-level page table.. */ +#define pud_offset(pgdp, addr) \ + (((pud_t *) pgd_page(*(pgdp))) + (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))) + #define pmd_offset(pudp,addr) \ - ((pmd_t *) pud_page(*(pudp)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) + (((pmd_t *) pud_page(*(pudp))) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) -/* Find an entry in the third-level page table.. */ #define pte_offset_kernel(dir,addr) \ - ((pte_t *) pmd_page_kernel(*(dir)) \ - + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))) + (((pte_t *) pmd_page_kernel(*(dir))) + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))) #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) #define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr)) @@ -458,9 +484,11 @@ #define pte_same(A,B) (((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0) #define pmd_ERROR(e) \ - printk("%s:%d: bad pmd %08x.\n", __FILE__, __LINE__, pmd_val(e)) + printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e)) +#define pud_ERROR(e) \ + printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pud_val(e)) #define pgd_ERROR(e) \ - printk("%s:%d: bad pgd %08x.\n", __FILE__, __LINE__, pgd_val(e)) + printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) extern pgd_t swapper_pg_dir[]; Index: working-2.6/include/asm-ppc64/page.h =================================================================== --- working-2.6.orig/include/asm-ppc64/page.h 2005-05-12 14:24:04.000000000 +1000 +++ working-2.6/include/asm-ppc64/page.h 2005-05-12 14:36:48.000000000 +1000 @@ -35,6 +35,8 @@ #ifdef CONFIG_HUGETLB_PAGE +#error Hugepages broken for now + #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) /* For 64-bit 
processes the hugepage range is 1T-1.5T */ @@ -91,7 +93,7 @@ #ifndef __ASSEMBLY__ #include -#undef STRICT_MM_TYPECHECKS +#define STRICT_MM_TYPECHECKS #define REGION_SIZE 4UL #define REGION_SHIFT 60UL @@ -125,27 +127,31 @@ * Entries in the pte table are 64b, while entries in the pgd & pmd are 32b. */ typedef struct { unsigned long pte; } pte_t; -typedef struct { unsigned int pmd; } pmd_t; -typedef struct { unsigned int pgd; } pgd_t; +typedef struct { unsigned long pmd; } pmd_t; +typedef struct { unsigned long pud; } pud_t; +typedef struct { unsigned long pgd; } pgd_t; typedef struct { unsigned long pgprot; } pgprot_t; #define pte_val(x) ((x).pte) #define pmd_val(x) ((x).pmd) +#define pud_val(x) ((x).pud) #define pgd_val(x) ((x).pgd) #define pgprot_val(x) ((x).pgprot) -#define __pte(x) ((pte_t) { (x) } ) -#define __pmd(x) ((pmd_t) { (x) } ) -#define __pgd(x) ((pgd_t) { (x) } ) -#define __pgprot(x) ((pgprot_t) { (x) } ) +#define __pte(x) ((pte_t) { (x) }) +#define __pmd(x) ((pmd_t) { (x) }) +#define __pud(x) ((pud_t) { (x) }) +#define __pgd(x) ((pgd_t) { (x) }) +#define __pgprot(x) ((pgprot_t) { (x) }) #else /* * .. 
while these make it easier on the compiler */ typedef unsigned long pte_t; -typedef unsigned int pmd_t; -typedef unsigned int pgd_t; +typedef unsigned long pmd_t; +typedef unsigned long pud_t; +typedef unsigned long pgd_t; typedef unsigned long pgprot_t; #define pte_val(x) (x) @@ -208,9 +214,6 @@ #define USER_REGION_ID (0UL) #define REGION_ID(ea) (((unsigned long)(ea)) >> REGION_SHIFT) -#define __bpn_to_ba(x) ((((unsigned long)(x)) << PAGE_SHIFT) + KERNELBASE) -#define __ba_to_bpn(x) ((((unsigned long)(x)) & ~REGION_MASK) >> PAGE_SHIFT) - #define __va(x) ((void *)((unsigned long)(x) + KERNELBASE)) #ifdef CONFIG_DISCONTIGMEM Index: working-2.6/include/asm-ppc64/pgalloc.h =================================================================== --- working-2.6.orig/include/asm-ppc64/pgalloc.h 2005-05-02 08:57:22.000000000 +1000 +++ working-2.6/include/asm-ppc64/pgalloc.h 2005-05-12 14:36:48.000000000 +1000 @@ -6,7 +6,7 @@ #include #include -extern kmem_cache_t *zero_cache; +extern kmem_cache_t *pmd_cache; /* * This program is free software; you can redistribute it and/or @@ -18,13 +18,31 @@ static inline pgd_t * pgd_alloc(struct mm_struct *mm) { - return kmem_cache_alloc(zero_cache, GFP_KERNEL); + return (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO); } static inline void pgd_free(pgd_t *pgd) { - kmem_cache_free(zero_cache, pgd); + free_page((unsigned long)pgd); +} + +#define pgd_populate(MM, PGD, PUD) pgd_set(PGD, PUD) + +static inline pud_t * +pud_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + pud_t *pudp; + + pudp = kmem_cache_alloc(pmd_cache, GFP_KERNEL|__GFP_REPEAT); + memset(pudp, 0, PUD_TABLE_SIZE); + return pudp; +} + +static inline void +pud_free(pud_t *pud) +{ + kmem_cache_free(pmd_cache, pud); } #define pud_populate(MM, PUD, PMD) pud_set(PUD, PMD) @@ -32,13 +50,17 @@ static inline pmd_t * pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT); + pmd_t *pmdp; + + pmdp = 
kmem_cache_alloc(pmd_cache, GFP_KERNEL|__GFP_REPEAT); + memset(pmdp, 0, PMD_TABLE_SIZE); + return pmdp; } static inline void pmd_free(pmd_t *pmd) { - kmem_cache_free(zero_cache, pmd); + kmem_cache_free(pmd_cache, pmd); } #define pmd_populate_kernel(mm, pmd, pte) pmd_set(pmd, pte) @@ -47,44 +69,54 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT); + return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); } static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) { - pte_t *pte = kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT); - if (pte) - return virt_to_page(pte); - return NULL; + return alloc_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); } static inline void pte_free_kernel(pte_t *pte) { - kmem_cache_free(zero_cache, pte); + free_page((unsigned long)pte); } static inline void pte_free(struct page *ptepage) { - kmem_cache_free(zero_cache, page_address(ptepage)); + __free_page(ptepage); } -struct pte_freelist_batch +typedef struct pgtable_free { + unsigned long val; +} pgtable_free_t; + +static inline pgtable_free_t pgtable_free_page(struct page *page) { - struct rcu_head rcu; - unsigned int index; - struct page * pages[0]; -}; + return (pgtable_free_t){.val = (unsigned long) page}; +} -#define PTE_FREELIST_SIZE ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) / \ - sizeof(struct page *)) +static inline pgtable_free_t pgtable_free_cache(void *p) +{ + return (pgtable_free_t){.val = ((unsigned long) p) | 1}; +} -extern void pte_free_now(struct page *ptepage); -extern void pte_free_submit(struct pte_freelist_batch *batch); +static inline void pgtable_free(pgtable_free_t pgf) +{ + if (pgf.val & 1) + kmem_cache_free(pmd_cache, (void *)(pgf.val & ~1)); + else + __free_page((struct page *)pgf.val); +} -DECLARE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); +void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t 
pgf); -void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage); -#define __pmd_free_tlb(tlb, pmd) __pte_free_tlb(tlb, virt_to_page(pmd)) +#define __pte_free_tlb(tlb, ptepage) \ + pgtable_free_tlb(tlb, pgtable_free_page(ptepage)) +#define __pmd_free_tlb(tlb, pmd) \ + pgtable_free_tlb(tlb, pgtable_free_cache(pmd)) +#define __pud_free_tlb(tlb, pmd) \ + pgtable_free_tlb(tlb, pgtable_free_cache(pud)) #define check_pgt_cache() do { } while (0) Index: working-2.6/arch/ppc64/mm/init.c =================================================================== --- working-2.6.orig/arch/ppc64/mm/init.c 2005-05-12 14:24:04.000000000 +1000 +++ working-2.6/arch/ppc64/mm/init.c 2005-05-12 14:36:48.000000000 +1000 @@ -66,6 +66,14 @@ #include #include +#if PGTABLE_RANGE > USER_VSID_RANGE +#warning Limited user VSID range means pagetable space is wasted +#endif + +#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) +#warning TASK_SIZE is smaller than it needs to be. +#endif + int mem_init_done; unsigned long ioremap_bot = IMALLOC_BASE; static unsigned long phbs_io_bot = PHBS_IO_BASE; @@ -225,7 +233,7 @@ * Before that, we map using addresses going * up from ioremap_bot. 
imalloc will use * the addresses from ioremap_bot through - * IMALLOC_END (0xE000001fffffffff) + * IMALLOC_END * */ pa = addr & PAGE_MASK; @@ -824,23 +832,19 @@ return virt_addr; } -kmem_cache_t *zero_cache; - -static void zero_ctor(void *pte, kmem_cache_t *cache, unsigned long flags) -{ - memset(pte, 0, PAGE_SIZE); -} +kmem_cache_t *pmd_cache; void pgtable_cache_init(void) { - zero_cache = kmem_cache_create("zero", - PAGE_SIZE, - 0, - SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, - zero_ctor, - NULL); - if (!zero_cache) - panic("pgtable_cache_init(): could not create zero_cache!\n"); + BUILD_BUG_ON(PTE_TABLE_SIZE != PAGE_SIZE); + BUILD_BUG_ON(PMD_TABLE_SIZE != PUD_TABLE_SIZE); + BUILD_BUG_ON(PGD_TABLE_SIZE != PAGE_SIZE); + + pmd_cache = kmem_cache_create("pmd", PMD_TABLE_SIZE, PMD_TABLE_SIZE, + SLAB_POISON |SLAB_DEBUG_INITIAL, + NULL, NULL); + if (! pmd_cache) + panic("pmd_pud_cache_init(): could not create pmd_pud_cache!\n"); } pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr, Index: working-2.6/include/asm-ppc64/processor.h =================================================================== --- working-2.6.orig/include/asm-ppc64/processor.h 2005-05-12 14:24:04.000000000 +1000 +++ working-2.6/include/asm-ppc64/processor.h 2005-05-12 14:36:48.000000000 +1000 @@ -531,7 +531,7 @@ extern struct task_struct *last_task_used_altivec; /* 64-bit user address space is 41-bits (2TBs user VM) */ -#define TASK_SIZE_USER64 (0x0000020000000000UL) +#define TASK_SIZE_USER64 (0x0000100000000000UL) /* * 32-bit user address space is 4GB - 1 page Index: working-2.6/arch/ppc64/kernel/head.S =================================================================== --- working-2.6.orig/arch/ppc64/kernel/head.S 2005-05-12 14:24:04.000000000 +1000 +++ working-2.6/arch/ppc64/kernel/head.S 2005-05-12 14:36:48.000000000 +1000 @@ -38,6 +38,7 @@ #include #include #include +#include #ifdef CONFIG_PPC_ISERIES #define DO_SOFT_DISABLE @@ -2117,17 +2118,17 @@ empty_zero_page: .space 
4096 - .globl swapper_pg_dir -swapper_pg_dir: - .space 4096 - #ifdef CONFIG_SMP /* 1 page segment table per cpu (max 48, cpu0 allocated at STAB0_PHYS_ADDR) */ .globl stab_array stab_array: .space 4096 * 48 #endif - + + .globl swapper_pg_dir +swapper_pg_dir: + .space PAGE_SIZE + /* * This space gets a copy of optional info passed to us by the bootstrap * Used to pass parameters into the kernel like root=/dev/sda1, etc. Index: working-2.6/arch/ppc64/mm/imalloc.c =================================================================== --- working-2.6.orig/arch/ppc64/mm/imalloc.c 2005-05-12 14:24:04.000000000 +1000 +++ working-2.6/arch/ppc64/mm/imalloc.c 2005-05-12 14:36:48.000000000 +1000 @@ -31,7 +31,7 @@ break; if ((unsigned long)tmp->addr >= ioremap_bot) addr = tmp->size + (unsigned long) tmp->addr; - if (addr > IMALLOC_END-size) + if (addr >= IMALLOC_END-size) return 1; } *im_addr = addr; Index: working-2.6/arch/ppc64/mm/hash_utils.c =================================================================== --- working-2.6.orig/arch/ppc64/mm/hash_utils.c 2005-05-12 14:24:04.000000000 +1000 +++ working-2.6/arch/ppc64/mm/hash_utils.c 2005-05-12 14:36:48.000000000 +1000 @@ -298,7 +298,7 @@ int local = 0; cpumask_t tmp; - if ((ea & ~REGION_MASK) > EADDR_MASK) + if ((ea & ~REGION_MASK) >= PGTABLE_RANGE) return 1; switch (REGION_ID(ea)) { Index: working-2.6/include/asm-ppc64/mmu.h =================================================================== --- working-2.6.orig/include/asm-ppc64/mmu.h 2005-05-11 10:05:51.000000000 +1000 +++ working-2.6/include/asm-ppc64/mmu.h 2005-05-12 14:36:48.000000000 +1000 @@ -259,8 +259,10 @@ #define VSID_BITS 36 #define VSID_MODULUS ((1UL<index; i++) + pgtable_free(batch->tables[i]); + + free_page((unsigned long)batch); +} + +static void pte_free_submit(struct pte_freelist_batch *batch) +{ + INIT_RCU_HEAD(&batch->rcu); + call_rcu(&batch->rcu, pte_free_rcu_callback); +} + +void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf) { /* This is 
safe as we are holding page_table_lock */ cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id()); @@ -49,19 +100,19 @@ if (atomic_read(&tlb->mm->mm_users) < 2 || cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) { - pte_free(ptepage); + pgtable_free(pgf); return; } if (*batchp == NULL) { *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC); if (*batchp == NULL) { - pte_free_now(ptepage); + pgtable_free_now(pgf); return; } (*batchp)->index = 0; } - (*batchp)->pages[(*batchp)->index++] = ptepage; + (*batchp)->tables[(*batchp)->index++] = pgf; if ((*batchp)->index == PTE_FREELIST_SIZE) { pte_free_submit(*batchp); *batchp = NULL; @@ -132,42 +183,6 @@ put_cpu(); } -#ifdef CONFIG_SMP -static void pte_free_smp_sync(void *arg) -{ - /* Do nothing, just ensure we sync with all CPUs */ -} -#endif - -/* This is only called when we are critically out of memory - * (and fail to get a page in pte_free_tlb). - */ -void pte_free_now(struct page *ptepage) -{ - pte_freelist_forced_free++; - - smp_call_function(pte_free_smp_sync, NULL, 0, 1); - - pte_free(ptepage); -} - -static void pte_free_rcu_callback(struct rcu_head *head) -{ - struct pte_freelist_batch *batch = - container_of(head, struct pte_freelist_batch, rcu); - unsigned int i; - - for (i = 0; i < batch->index; i++) - pte_free(batch->pages[i]); - free_page((unsigned long)batch); -} - -void pte_free_submit(struct pte_freelist_batch *batch) -{ - INIT_RCU_HEAD(&batch->rcu); - call_rcu(&batch->rcu, pte_free_rcu_callback); -} - void pte_free_finish(void) { /* This is safe as we are holding page_table_lock */ -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! 
http://www.ozlabs.org/people/dgibson From michael at ellerman.id.au Thu May 12 18:09:45 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Thu, 12 May 2005 18:09:45 +1000 Subject: [PATCH 4/4] iseries_veth: Cleanup skbs to prevent unregister_netdevice() hanging Message-ID: <200505121809.45419.michael@ellerman.id.au> Hi Andrew, Jeff, The iseries_veth driver is badly behaved in that it will keep TX packets hanging around forever if they're not ACK'ed and the queue never fills up. This causes the unregister_netdevice code to wait forever when we try to take the device down, because there's still skbs around with references to our struct net_device. There's already code to cleanup any un-ACK'ed packets in veth_stop_connection() but it's being called after we unregister the net_device, which is too late. The fix is to rearrange the module exit function so that we cleanup any outstanding skbs and then unregister the driver. Signed-off-by: Michael Ellerman -- drivers/net/iseries_veth.c | 11 +++++++++-- 1 files changed, 9 insertions(+), 2 deletions(-) Index: veth-fixes/drivers/net/iseries_veth.c =================================================================== --- veth-fixes.orig/drivers/net/iseries_veth.c 2005-05-12 16:27:32.000000000 +1000 +++ veth-fixes/drivers/net/iseries_veth.c 2005-05-12 16:27:42.000000000 +1000 @@ -1388,18 +1388,25 @@ { int i; - vio_unregister_driver(&veth_driver); + /* Stop the queues first to stop any new packets being sent. */ + for (i = 0; i < HVMAXARCHITECTEDVIRTUALLANS; i++) + if (veth_dev[i]) + netif_stop_queue(veth_dev[i]); + /* Stop the connections before we unregister the driver. This + * ensures there's no skbs lying around holding the device open. */ for (i = 0; i < HVMAXARCHITECTEDLPS; ++i) veth_stop_connection(i); HvLpEvent_unregisterHandler(HvLpEvent_Type_VirtualLan); /* Hypervisor callbacks may have scheduled more work while we - * were destroying connections. Now that we've disconnected from + * were stopping connections. 
Now that we've disconnected from * the hypervisor make sure everything's finished. */ flush_scheduled_work(); + vio_unregister_driver(&veth_driver); + for (i = 0; i < HVMAXARCHITECTEDLPS; ++i) veth_destroy_connection(i); From david at gibson.dropbear.id.au Thu May 12 21:28:57 2005 From: david at gibson.dropbear.id.au (David Gibson) Date: Thu, 12 May 2005 21:28:57 +1000 Subject: [PATCH 4/4] iseries_veth: Cleanup skbs to prevent unregister_netdevice() hanging In-Reply-To: <200505121809.45419.michael@ellerman.id.au> References: <200505121809.45419.michael@ellerman.id.au> Message-ID: <20050512112857.GC32694@localhost.localdomain> On Thu, May 12, 2005 at 06:09:45PM +1000, Michael Ellerman wrote: > Hi Andrew, Jeff, > > The iseries_veth driver is badly behaved in that it will keep TX packets > hanging around forever if they're not ACK'ed and the queue never fills up. > > This causes the unregister_netdevice code to wait forever when we try to take > the device down, because there's still skbs around with references to our > struct net_device. > > There's already code to cleanup any un-ACK'ed packets in veth_stop_connection() > but it's being called after we unregister the net_device, which is too late. > > The fix is to rearrange the module exit function so that we cleanup any > outstanding skbs and then unregister the driver. > > Signed-off-by: Michael Ellerman Nice catch. Acked-by: David Gibson -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/people/dgibson From will_schmidt at vnet.ibm.com Fri May 13 05:59:30 2005 From: will_schmidt at vnet.ibm.com (will schmidt) Date: Thu, 12 May 2005 14:59:30 -0500 Subject: (updated) RFC/Patch xmon pte/pgd/userspace address additions. Message-ID: <4283B5A2.4030706@vnet.ibm.com> Hi Folks, Per Ben's prompting me, :-) this version is updated to handle the additional pud_offset calls (part of the 4L header stuff). 
- I've removed the try_spinlock code; - As an alternative to duplicating lots of functions to add mread calls in place of references, I've added setjmp(bus_error_jmp) {} around what seem more likely to be critical areas. - cleaned up spacing - changed most of the function names to be xmon_xxx instead of wm_xxx. these functions show up under a submenu 'w'. use "w?" at xmon> prompt to get the help blurb. > the bulk of my intent was to make it easier for me to poke at memory within a particular user process. > > I realize that the spacing is a bit screwed up, and the function names should eventually change. Because i couldn't decide on letters for the new functions, i put them under a submenu 'w'. > > wP will dump info on all processes. > > wp 0xabc will make process with pid 0xabc the active pid. <- active only with respect to xmon poking into memory. > > wd 0xabcd1234 - will call through the pgd/pmd functions and return the kernel address corresponding to 0xabcd1234 within the process's memory space location. > > wg will dump gprs of the process/thread. -Will -------------- next part -------------- An embedded and charset-unspecified text was scrubbed... Name: xmon_pgd_may12.diff Url: http://ozlabs.org/pipermail/linuxppc64-dev/attachments/20050512/bf82fdeb/attachment.txt From david at gibson.dropbear.id.au Fri May 13 11:10:38 2005 From: david at gibson.dropbear.id.au (David Gibson) Date: Fri, 13 May 2005 11:10:38 +1000 Subject: [PATCH] ppc64: Abolish ioremap_mm Message-ID: <20050513011038.GC19269@localhost.localdomain> Andrew, please apply: Currently ppc64 has two mm_structs for the kernel, init_mm and also ioremap_mm. The latter really isn't necessary: this patch abolishes it, instead restricting vmallocs to the lower 1TB of the init_mm's range and placing io mappings in the upper 1TB. This simplifies the code in a number of places and eliminates an unnecessary set of pagetables. 
It also tweaks the unmap/free path a little, allowing us to remove the unmap_im_area() set of page table walkers, replacing them with unmap_vm_area(). arch/ppc64/kernel/eeh.c | 2 arch/ppc64/kernel/head.S | 4 - arch/ppc64/kernel/process.c | 8 --- arch/ppc64/mm/hash_utils.c | 4 - arch/ppc64/mm/imalloc.c | 20 +++++---- arch/ppc64/mm/init.c | 93 ++++-------------------------------------- include/asm-ppc64/imalloc.h | 12 +++-- include/asm-ppc64/page.h | 2 include/asm-ppc64/pgtable.h | 9 ---- include/asm-ppc64/processor.h | 10 ---- 10 files changed, 31 insertions(+), 133 deletions(-) Signed-off-by: David Gibson Index: working-2.6/include/asm-ppc64/pgtable.h =================================================================== --- working-2.6.orig/include/asm-ppc64/pgtable.h 2005-05-11 10:05:51.000000000 +1000 +++ working-2.6/include/asm-ppc64/pgtable.h 2005-05-12 14:08:37.000000000 +1000 @@ -53,7 +53,8 @@ * Define the address range of the vmalloc VM area. */ #define VMALLOC_START (0xD000000000000000ul) -#define VMALLOC_END (VMALLOC_START + EADDR_MASK) +#define VMALLOC_SIZE (0x10000000000UL) +#define VMALLOC_END (VMALLOC_START + VMALLOC_SIZE) /* * Bits in a linux-style PTE. These match the bits in the @@ -239,9 +240,6 @@ /* This now only contains the vmalloc pages */ #define pgd_offset_k(address) pgd_offset(&init_mm, address) -/* to find an entry in the ioremap page-table-directory */ -#define pgd_offset_i(address) (ioremap_pgd + pgd_index(address)) - /* * The following only work if pte_present() is true. * Undefined behaviour if not.. 
@@ -459,15 +457,12 @@ #define __HAVE_ARCH_PTE_SAME #define pte_same(A,B) (((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0) -extern unsigned long ioremap_bot, ioremap_base; - #define pmd_ERROR(e) \ printk("%s:%d: bad pmd %08x.\n", __FILE__, __LINE__, pmd_val(e)) #define pgd_ERROR(e) \ printk("%s:%d: bad pgd %08x.\n", __FILE__, __LINE__, pgd_val(e)) extern pgd_t swapper_pg_dir[]; -extern pgd_t ioremap_dir[]; extern void paging_init(void); Index: working-2.6/include/asm-ppc64/imalloc.h =================================================================== --- working-2.6.orig/include/asm-ppc64/imalloc.h 2005-05-11 10:05:51.000000000 +1000 +++ working-2.6/include/asm-ppc64/imalloc.h 2005-05-12 14:08:37.000000000 +1000 @@ -4,9 +4,9 @@ /* * Define the address range of the imalloc VM area. */ -#define PHBS_IO_BASE IOREGIONBASE -#define IMALLOC_BASE (IOREGIONBASE + 0x80000000ul) /* Reserve 2 gigs for PHBs */ -#define IMALLOC_END (IOREGIONBASE + EADDR_MASK) +#define PHBS_IO_BASE VMALLOC_END +#define IMALLOC_BASE (PHBS_IO_BASE + 0x80000000ul) /* Reserve 2 gigs for PHBs */ +#define IMALLOC_END (VMALLOC_START + EADDR_MASK) /* imalloc region types */ @@ -18,7 +18,9 @@ extern struct vm_struct * im_get_free_area(unsigned long size); extern struct vm_struct * im_get_area(unsigned long v_addr, unsigned long size, - int region_type); -unsigned long im_free(void *addr); + int region_type); +extern void im_free(void *addr); + +extern unsigned long ioremap_bot; #endif /* _PPC64_IMALLOC_H */ Index: working-2.6/include/asm-ppc64/page.h =================================================================== --- working-2.6.orig/include/asm-ppc64/page.h 2005-05-11 10:05:51.000000000 +1000 +++ working-2.6/include/asm-ppc64/page.h 2005-05-12 14:08:37.000000000 +1000 @@ -202,9 +202,7 @@ #define PAGE_OFFSET ASM_CONST(0xC000000000000000) #define KERNELBASE PAGE_OFFSET #define VMALLOCBASE ASM_CONST(0xD000000000000000) -#define IOREGIONBASE ASM_CONST(0xE000000000000000) -#define IO_REGION_ID 
(IOREGIONBASE >> REGION_SHIFT) #define VMALLOC_REGION_ID (VMALLOCBASE >> REGION_SHIFT) #define KERNEL_REGION_ID (KERNELBASE >> REGION_SHIFT) #define USER_REGION_ID (0UL) Index: working-2.6/arch/ppc64/kernel/eeh.c =================================================================== --- working-2.6.orig/arch/ppc64/kernel/eeh.c 2005-04-26 15:37:55.000000000 +1000 +++ working-2.6/arch/ppc64/kernel/eeh.c 2005-05-12 14:05:12.000000000 +1000 @@ -505,7 +505,7 @@ pte_t *ptep; unsigned long pa; - ptep = find_linux_pte(ioremap_mm.pgd, token); + ptep = find_linux_pte(init_mm.pgd, token); if (!ptep) return token; pa = pte_pfn(*ptep) << PAGE_SHIFT; Index: working-2.6/arch/ppc64/kernel/process.c =================================================================== --- working-2.6.orig/arch/ppc64/kernel/process.c 2005-04-26 15:37:55.000000000 +1000 +++ working-2.6/arch/ppc64/kernel/process.c 2005-05-12 14:05:12.000000000 +1000 @@ -58,14 +58,6 @@ struct task_struct *last_task_used_altivec = NULL; #endif -struct mm_struct ioremap_mm = { - .pgd = ioremap_dir, - .mm_users = ATOMIC_INIT(2), - .mm_count = ATOMIC_INIT(1), - .cpu_vm_mask = CPU_MASK_ALL, - .page_table_lock = SPIN_LOCK_UNLOCKED, -}; - /* * Make sure the floating-point register state in the * the thread_struct is up to date for task tsk. Index: working-2.6/include/asm-ppc64/processor.h =================================================================== --- working-2.6.orig/include/asm-ppc64/processor.h 2005-04-26 15:38:02.000000000 +1000 +++ working-2.6/include/asm-ppc64/processor.h 2005-05-12 14:08:37.000000000 +1000 @@ -590,16 +590,6 @@ } /* - * Note: the vm_start and vm_end fields here should *not* - * be in kernel space. (Could vm_end == vm_start perhaps?) - */ -#define IOREMAP_MMAP { &ioremap_mm, 0, 0x1000, NULL, \ - PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, \ - 1, NULL, NULL } - -extern struct mm_struct ioremap_mm; - -/* * Return saved PC of a blocked thread. 
For now, this is the "user" PC */ #define thread_saved_pc(tsk) \ Index: working-2.6/arch/ppc64/mm/hash_utils.c =================================================================== --- working-2.6.orig/arch/ppc64/mm/hash_utils.c 2005-05-11 10:05:49.000000000 +1000 +++ working-2.6/arch/ppc64/mm/hash_utils.c 2005-05-12 14:08:37.000000000 +1000 @@ -310,10 +310,6 @@ vsid = get_vsid(mm->context.id, ea); break; - case IO_REGION_ID: - mm = &ioremap_mm; - vsid = get_kernel_vsid(ea); - break; case VMALLOC_REGION_ID: mm = &init_mm; vsid = get_kernel_vsid(ea); Index: working-2.6/arch/ppc64/mm/init.c =================================================================== --- working-2.6.orig/arch/ppc64/mm/init.c 2005-05-11 10:05:49.000000000 +1000 +++ working-2.6/arch/ppc64/mm/init.c 2005-05-12 14:11:16.000000000 +1000 @@ -73,9 +73,6 @@ extern pgd_t swapper_pg_dir[]; extern struct task_struct *current_set[NR_CPUS]; -extern pgd_t ioremap_dir[]; -pgd_t * ioremap_pgd = (pgd_t *)&ioremap_dir; - unsigned long klimit = (unsigned long)_end; unsigned long _SDR1=0; @@ -137,69 +134,6 @@ #else -static void unmap_im_area_pte(pmd_t *pmd, unsigned long addr, - unsigned long end) -{ - pte_t *pte; - - pte = pte_offset_kernel(pmd, addr); - do { - pte_t ptent = ptep_get_and_clear(&ioremap_mm, addr, pte); - WARN_ON(!pte_none(ptent) && !pte_present(ptent)); - } while (pte++, addr += PAGE_SIZE, addr != end); -} - -static inline void unmap_im_area_pmd(pud_t *pud, unsigned long addr, - unsigned long end) -{ - pmd_t *pmd; - unsigned long next; - - pmd = pmd_offset(pud, addr); - do { - next = pmd_addr_end(addr, end); - if (pmd_none_or_clear_bad(pmd)) - continue; - unmap_im_area_pte(pmd, addr, next); - } while (pmd++, addr = next, addr != end); -} - -static inline void unmap_im_area_pud(pgd_t *pgd, unsigned long addr, - unsigned long end) -{ - pud_t *pud; - unsigned long next; - - pud = pud_offset(pgd, addr); - do { - next = pud_addr_end(addr, end); - if (pud_none_or_clear_bad(pud)) - continue; - 
unmap_im_area_pmd(pud, addr, next); - } while (pud++, addr = next, addr != end); -} - -static void unmap_im_area(unsigned long addr, unsigned long end) -{ - struct mm_struct *mm = &ioremap_mm; - unsigned long next; - pgd_t *pgd; - - spin_lock(&mm->page_table_lock); - - pgd = pgd_offset_i(addr); - flush_cache_vunmap(addr, end); - do { - next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) - continue; - unmap_im_area_pud(pgd, addr, next); - } while (pgd++, addr = next, addr != end); - flush_tlb_kernel_range(start, end); - - spin_unlock(&mm->page_table_lock); -} - /* * map_io_page currently only called by __ioremap * map_io_page adds an entry to the ioremap page table @@ -214,21 +148,21 @@ unsigned long vsid; if (mem_init_done) { - spin_lock(&ioremap_mm.page_table_lock); - pgdp = pgd_offset_i(ea); - pudp = pud_alloc(&ioremap_mm, pgdp, ea); + spin_lock(&init_mm.page_table_lock); + pgdp = pgd_offset_k(ea); + pudp = pud_alloc(&init_mm, pgdp, ea); if (!pudp) return -ENOMEM; - pmdp = pmd_alloc(&ioremap_mm, pudp, ea); + pmdp = pmd_alloc(&init_mm, pudp, ea); if (!pmdp) return -ENOMEM; - ptep = pte_alloc_kernel(&ioremap_mm, pmdp, ea); + ptep = pte_alloc_kernel(&init_mm, pmdp, ea); if (!ptep) return -ENOMEM; pa = abs_to_phys(pa); - set_pte_at(&ioremap_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags))); - spin_unlock(&ioremap_mm.page_table_lock); + spin_unlock(&init_mm.page_table_lock); } else { unsigned long va, vpn, hash, hpteg; @@ -267,13 +201,9 @@ for (i = 0; i < size; i += PAGE_SIZE) if (map_io_page(ea+i, pa+i, flags)) - goto failure; + return NULL; return (void __iomem *) (ea + (addr & ~PAGE_MASK)); - failure: - if (mem_init_done) - unmap_im_area(ea, ea + size); - return NULL; } @@ -381,19 +311,14 @@ */ void iounmap(volatile void __iomem *token) { - unsigned long address, size; void *addr; if (!mem_init_done) return; addr = (void *) ((unsigned long __force) token & PAGE_MASK); - - if ((size = 
im_free(addr)) == 0) - return; - address = (unsigned long)addr; - unmap_im_area(address, address + size); + im_free(addr); } static int iounmap_subset_regions(unsigned long addr, unsigned long size) Index: working-2.6/arch/ppc64/kernel/head.S =================================================================== --- working-2.6.orig/arch/ppc64/kernel/head.S 2005-04-26 15:37:55.000000000 +1000 +++ working-2.6/arch/ppc64/kernel/head.S 2005-05-12 14:08:37.000000000 +1000 @@ -2121,10 +2121,6 @@ swapper_pg_dir: .space 4096 - .globl ioremap_dir -ioremap_dir: - .space 4096 - #ifdef CONFIG_SMP /* 1 page segment table per cpu (max 48, cpu0 allocated at STAB0_PHYS_ADDR) */ .globl stab_array Index: working-2.6/arch/ppc64/mm/imalloc.c =================================================================== --- working-2.6.orig/arch/ppc64/mm/imalloc.c 2005-05-11 10:05:49.000000000 +1000 +++ working-2.6/arch/ppc64/mm/imalloc.c 2005-05-12 14:20:33.000000000 +1000 @@ -15,6 +15,7 @@ #include #include #include +#include static DECLARE_MUTEX(imlist_sem); struct vm_struct * imlist = NULL; @@ -285,29 +286,32 @@ return area; } -unsigned long im_free(void * addr) +void im_free(void * addr) { struct vm_struct **p, *tmp; - unsigned long ret_size = 0; if (!addr) - return ret_size; - if ((PAGE_SIZE-1) & (unsigned long) addr) { + return; + if ((unsigned long) addr & ~PAGE_MASK) { printk(KERN_ERR "Trying to %s bad address (%p)\n", __FUNCTION__, addr); - return ret_size; + return; } down(&imlist_sem); for (p = &imlist ; (tmp = *p) ; p = &tmp->next) { if (tmp->addr == addr) { - ret_size = tmp->size; *p = tmp->next; + + /* XXX: do we need the lock? 
*/ + spin_lock(&init_mm.page_table_lock); + unmap_vm_area(tmp); + spin_unlock(&init_mm.page_table_lock); + kfree(tmp); up(&imlist_sem); - return ret_size; + return; } } up(&imlist_sem); printk(KERN_ERR "Trying to %s nonexistent area (%p)\n", __FUNCTION__, addr); - return ret_size; } -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/people/dgibson From michael at ellerman.id.au Fri May 13 17:44:10 2005 From: michael at ellerman.id.au (Michael Ellerman) Date: Fri, 13 May 2005 17:44:10 +1000 Subject: [PATCH] Updated: fix-pci-mmap-on-ppc-and-ppc64.patch Message-ID: <200505131744.11095.michael@ellerman.id.au> Hi Andrew, This is an updated version of Ben's fix-pci-mmap-on-ppc-and-ppc64.patch which is in 2.6.12-rc4-mm1. It fixes the patch to work on PPC iSeries, removes some debug printks at Ben's request, and incorporates your fix-pci-mmap-on-ppc-and-ppc64-fix.patch also. cheers Signed-off-by: Michael Ellerman -- From: Benjamin Herrenschmidt This patch was discussed at length on linux-pci and so far, the last iteration of it didn't raise any comment. It's effect is a nop on architecture that don't define the new pci_resource_to_user() callback anyway. It allows architecture like ppc who put weird things inside of PCI resource structures to convert to some different value for user visible ones. It also fixes mmap'ing of IO space on those archs. 
Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Andrew Morton arch/ppc/kernel/pci.c | 21 +++++++++++++++++++-- arch/ppc64/kernel/pci.c | 22 ++++++++++++++++++++-- drivers/pci/pci-sysfs.c | 26 +++++++++++++++++++++----- drivers/pci/proc.c | 14 ++++++++++---- include/asm-ppc/pci.h | 6 ++++++ include/asm-ppc64/pci.h | 7 +++++++ include/linux/pci.h | 14 ++++++++++++++ 7 files changed, 97 insertions(+), 13 deletions(-) Index: 2.6.12-rc4-mm1/arch/ppc64/kernel/pci.c =================================================================== --- 2.6.12-rc4-mm1.orig/arch/ppc64/kernel/pci.c 2005-05-13 15:39:55.000000000 +1000 +++ 2.6.12-rc4-mm1/arch/ppc64/kernel/pci.c 2005-05-13 15:40:19.000000000 +1000 @@ -351,7 +351,7 @@ *offset += hose->pci_mem_offset; res_bit = IORESOURCE_MEM; } else { - io_offset = (unsigned long)hose->io_base_virt; + io_offset = (unsigned long)hose->io_base_virt - pci_io_base; *offset += io_offset; res_bit = IORESOURCE_IO; } @@ -378,7 +378,7 @@ /* found it! construct the final physical address */ if (mmap_state == pci_mmap_io) - *offset += hose->io_base_phys - io_offset; + *offset += hose->io_base_phys - io_offset; return rp; } @@ -941,4 +941,22 @@ } EXPORT_SYMBOL(pci_read_irq_line); +void pci_resource_to_user(const struct pci_dev *dev, int bar, + const struct resource *rsrc, + u64 *start, u64 *end) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + unsigned long offset = 0; + + if (hose == NULL) + return; + + if (rsrc->flags & IORESOURCE_IO) + offset = pci_io_base - (unsigned long)hose->io_base_virt + + hose->io_base_phys; + + *start = rsrc->start + offset; + *end = rsrc->end + offset; +} + #endif /* CONFIG_PPC_MULTIPLATFORM */ Index: 2.6.12-rc4-mm1/arch/ppc/kernel/pci.c =================================================================== --- 2.6.12-rc4-mm1.orig/arch/ppc/kernel/pci.c 2005-05-13 15:39:55.000000000 +1000 +++ 2.6.12-rc4-mm1/arch/ppc/kernel/pci.c 2005-05-13 15:40:19.000000000 +1000 @@ -1495,7 +1495,7 @@ *offset += 
hose->pci_mem_offset; res_bit = IORESOURCE_MEM; } else { - io_offset = (unsigned long)hose->io_base_virt; + io_offset = hose->io_base_virt - ___IO_BASE; *offset += io_offset; res_bit = IORESOURCE_IO; } @@ -1522,7 +1522,7 @@ /* found it! construct the final physical address */ if (mmap_state == pci_mmap_io) - *offset += hose->io_base_phys - _IO_BASE; + *offset += hose->io_base_phys - io_offset; return rp; } @@ -1739,6 +1739,23 @@ return result; } +void pci_resource_to_user(const struct pci_dev *dev, int bar, + const struct resource *rsrc, + u64 *start, u64 *end) +{ + struct pci_controller *hose = pci_bus_to_hose(dev->bus->number); + unsigned long offset = 0; + + if (hose == NULL) + return; + + if (rsrc->flags & IORESOURCE_IO) + offset = ___IO_BASE - hose->io_base_virt + hose->io_base_phys; + + *start = rsrc->start + offset; + *end = rsrc->end + offset; +} + void __init pci_init_resource(struct resource *res, unsigned long start, unsigned long end, int flags, char *name) Index: 2.6.12-rc4-mm1/drivers/pci/pci-sysfs.c =================================================================== --- 2.6.12-rc4-mm1.orig/drivers/pci/pci-sysfs.c 2005-05-13 15:39:55.000000000 +1000 +++ 2.6.12-rc4-mm1/drivers/pci/pci-sysfs.c 2005-05-13 15:40:19.000000000 +1000 @@ -60,15 +60,18 @@ char * str = buf; int i; int max = 7; + u64 start, end; if (pci_dev->subordinate) max = DEVICE_COUNT_RESOURCE; for (i = 0; i < max; i++) { - str += sprintf(str,"0x%016lx 0x%016lx 0x%016lx\n", - pci_resource_start(pci_dev,i), - pci_resource_end(pci_dev,i), - pci_resource_flags(pci_dev,i)); + struct resource *res = &pci_dev->resource[i]; + pci_resource_to_user(pci_dev, i, res, &start, &end); + str += sprintf(str,"0x%016llx 0x%016llx 0x%016llx\n", + (unsigned long long)start, + (unsigned long long)end, + (unsigned long long)res->flags); } return (str - buf); } @@ -301,8 +304,21 @@ struct device, kobj)); struct resource *res = (struct resource *)attr->attr.private; enum pci_mmap_state mmap_type; + u64 start, end; 
+ int i; - vma->vm_pgoff += res->start >> PAGE_SHIFT; + for (i = 0; i < PCI_ROM_RESOURCE; i++) + if (res == &pdev->resource[i]) + break; + if (i >= PCI_ROM_RESOURCE) + return -ENODEV; + + /* pci_mmap_page_range() expects the same kind of entry as coming + * from /proc/bus/pci/ which is a "user visible" value. If this is + * different from the resource itself, arch will do necessary fixup. + */ + pci_resource_to_user(pdev, i, res, &start, &end); + vma->vm_pgoff += start >> PAGE_SHIFT; mmap_type = res->flags & IORESOURCE_MEM ? pci_mmap_mem : pci_mmap_io; return pci_mmap_page_range(pdev, vma, mmap_type, 0); Index: 2.6.12-rc4-mm1/drivers/pci/proc.c =================================================================== --- 2.6.12-rc4-mm1.orig/drivers/pci/proc.c 2005-05-13 15:39:55.000000000 +1000 +++ 2.6.12-rc4-mm1/drivers/pci/proc.c 2005-05-13 15:40:19.000000000 +1000 @@ -355,14 +355,20 @@ dev->device, dev->irq); /* Here should be 7 and not PCI_NUM_RESOURCES as we need to preserve compatibility */ - for(i=0; i<7; i++) + for(i=0; i<7; i++) { + u64 start, end; + pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); seq_printf(m, LONG_FORMAT, - dev->resource[i].start | + ((unsigned long)start) | (dev->resource[i].flags & PCI_REGION_FLAG_MASK)); - for(i=0; i<7; i++) + } + for(i=0; i<7; i++) { + u64 start, end; + pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); seq_printf(m, LONG_FORMAT, dev->resource[i].start < dev->resource[i].end ? 
- dev->resource[i].end - dev->resource[i].start + 1 : 0); + (unsigned long)(end - start) + 1 : 0); + } seq_putc(m, '\t'); if (drv) seq_printf(m, "%s", drv->name); Index: 2.6.12-rc4-mm1/include/asm-ppc64/pci.h =================================================================== --- 2.6.12-rc4-mm1.orig/include/asm-ppc64/pci.h 2005-05-13 15:39:55.000000000 +1000 +++ 2.6.12-rc4-mm1/include/asm-ppc64/pci.h 2005-05-13 15:41:51.000000000 +1000 @@ -136,6 +136,13 @@ unsigned long size, pgprot_t prot); +#ifdef CONFIG_PPC_MULTIPLATFORM +#define HAVE_ARCH_PCI_RESOURCE_TO_USER +extern void pci_resource_to_user(const struct pci_dev *dev, int bar, + const struct resource *rsrc, + u64 *start, u64 *end); +#endif /* CONFIG_PPC_MULTIPLATFORM */ + #endif /* __KERNEL__ */ Index: 2.6.12-rc4-mm1/include/asm-ppc/pci.h =================================================================== --- 2.6.12-rc4-mm1.orig/include/asm-ppc/pci.h 2005-05-13 15:39:55.000000000 +1000 +++ 2.6.12-rc4-mm1/include/asm-ppc/pci.h 2005-05-13 15:40:19.000000000 +1000 @@ -103,6 +103,12 @@ unsigned long size, pgprot_t prot); +#define HAVE_ARCH_PCI_RESOURCE_TO_USER +extern void pci_resource_to_user(const struct pci_dev *dev, int bar, + const struct resource *rsrc, + u64 *start, u64 *end); + + #endif /* __KERNEL__ */ #endif /* __PPC_PCI_H */ Index: 2.6.12-rc4-mm1/include/linux/pci.h =================================================================== --- 2.6.12-rc4-mm1.orig/include/linux/pci.h 2005-05-13 15:39:55.000000000 +1000 +++ 2.6.12-rc4-mm1/include/linux/pci.h 2005-05-13 15:43:33.000000000 +1000 @@ -1020,6 +1020,20 @@ #define pci_pretty_name(dev) "" #endif + +/* Some archs don't want to expose struct resource to userland as-is + * in sysfs and /proc + */ +#ifndef HAVE_ARCH_PCI_RESOURCE_TO_USER +static inline void pci_resource_to_user(const struct pci_dev *dev, int bar, + const struct resource *rsrc, u64 *start, u64 *end) +{ + *start = rsrc->start; + *end = rsrc->end; +} +#endif /* HAVE_ARCH_PCI_RESOURCE_TO_USER */ 
+ + /* * The world is not perfect and supplies us with broken PCI devices. * For at least a part of these bugs we need a work-around, so both From benh at kernel.crashing.org Fri May 13 17:59:06 2005 From: benh at kernel.crashing.org (Benjamin Herrenschmidt) Date: Fri, 13 May 2005 17:59:06 +1000 Subject: [PATCH] Updated: fix-pci-mmap-on-ppc-and-ppc64.patch In-Reply-To: <200505131744.11095.michael@ellerman.id.au> References: <200505131744.11095.michael@ellerman.id.au> Message-ID: <1115971146.5128.16.camel@gaston> On Fri, 2005-05-13 at 17:44 +1000, Michael Ellerman wrote: > Hi Andrew, > > This is an updated version of Ben's fix-pci-mmap-on-ppc-and-ppc64.patch > which is in 2.6.12-rc4-mm1. > > It fixes the patch to work on PPC iSeries, removes some debug printks > at Ben's request, and incorporates your > fix-pci-mmap-on-ppc-and-ppc64-fix.patch also. > > cheers > > Signed-off-by: Michael Ellerman Acked-by: Benjamin Herrenschmidt From tlnguyen at snoqualmie.dp.intel.com Sat May 14 01:02:28 2005 From: tlnguyen at snoqualmie.dp.intel.com (long) Date: Fri, 13 May 2005 08:02:28 -0700 Subject: [PATCH] FYI/ Re: PCI Error Recovery API Proposal (updated) Message-ID: <200505131502.j4DF2S7F015855@snoqualmie.dp.intel.com> On Fri, 6 May 2005 18:05:06, Linas wrote: >Hi, > >This is an "FYI" patch partially implementing the PCI error recovery API >previously detailed by BenH in an earlier email. > >Its an "FYI patch" because this patch has numerous flaws and limitations >which I'm hoping to address any day now. I've been busy with other >things, but have recently been able to carve out a chunk of time to work >on this. > >This patch is almost identical to a previous patch I'd mailed out >before, with only minor changes made to bring it into line with >BenH's proposed API. Basically, I'm just dusting off the old patch, >prior to making more serious changes. I hope to send a more serious >patch in a few days/week. Meanwhile, criticism invited. 
> >This patch does actually recover from PCI errors on ethernet cards >plugged into ppc64 hotplug slots, and from PCI errors on the IPR scsi >controller. This patch works fine for my AER code. Please post it to LKML. Thanks, Long From johnrose at austin.ibm.com Sat May 14 02:25:56 2005 From: johnrose at austin.ibm.com (John Rose) Date: Fri, 13 May 2005 11:25:56 -0500 Subject: [PATCH] FYI/ Re: PCI Error Recovery API Proposal (updated) In-Reply-To: <20050506230506.GL11745@austin.ibm.com> References: <20050223002409.GA10909@austin.ibm.com> <20050223174356.GH13081@kroah.com> <20050224011409.GE2088@austin.ibm.com> <421DDEF7.7080103@jp.fujitsu.com> <20050224231455.GH2088@austin.ibm.com> <421E9D16.3000606@jp.fujitsu.com> <20050312013251.GA2609@austin.ibm.com> <4235847F.3080705@jp.fujitsu.com> <20050314181420.GD498@austin.ibm.com> <1112685311.9518.35.camel@gaston> <20050506230506.GL11745@austin.ibm.com> Message-ID: <1116001556.31126.28.camel@sinatra.austin.ibm.com> Hi Linas- Sorry for the delay. My first comment is that patches that affect PCI Hotplug drivers should also be submitted to pcihpd-discuss at lists.sourceforge.net. We've discussed this offline, and maybe this will generate some public discussion. I don't think handle_eeh_events() or eeh_reset_device() belong in the RPA PCI hotplug driver. I've pasted these functions at the bottom of this note. These are out of place compared to the rest of the rpaphp code. They use eeh-specific RTAS calls, structures, and functions, which do not otherwise appear in RPA PCI Hotplug. These functions are using PCI Hotplug, rather than implementing it, so they belong elsewhere. Looking at the patch, all EEH needs from PCI Hotplug are enable and disable functions. The rpaphp driver could register disable/enable functions with eeh.c, and we could avoid introducing this unrelated code to the PCI Hotplug driver. 
One other quick comment: +static struct device_node * +get_phb_of_device (struct pci_dev *dev) +{ + struct device_node *dn; + struct pci_bus *bus; + + while (1) { + bus = dev->bus; + if (!bus) + break; + dn = pci_bus_to_OF_node(bus); + + if (dn->phb) + return dn; + + dev = bus->self; + BUG_ON (dev==NULL); + if (dev == NULL) + return NULL; + } Could you just use pci_device_to_OF_node(), then dn->phb? This way, you could avoid the loop. +/* ------------------------------------------------------- */ +/** + * handle_eeh_events -- reset a PCI device after hard lockup. + * + * pSeries systems will isolate a PCI slot if the PCI-Host + * bridge detects address or data parity errors, DMA's + * occuring to wild addresses (which usually happen due to + * bugs in device drivers or in PCI adapter firmware). + * Slot isolations also occur if #SERR, #PERR or other misc + * PCI-related errors are detected. + * + * Recovery process consists of unplugging the device driver + * (which generated hotplug events to userspace), then issuing + * a PCI #RST to the device, then reconfiguring the PCI config + * space for all bridges & devices under this slot, and then + * finally restarting the device drivers (which cause a second + * set of hotplug events to go out to userspace). + */ + +int eeh_reset_device (struct pci_dev *dev, struct device_node *dn, int reconfig) +{ + struct slot *frozen_slot= NULL; + + if (!dev) + return 1; + + if (reconfig) + frozen_slot = rpaphp_find_slot(dev); + + if (reconfig && frozen_slot) rpaphp_unconfig_pci_adapter (frozen_slot); + + /* Reset the pci controller. (Asserts RST#; resets config space). 
+ * Reconfigure bridges and devices */ + rtas_set_slot_reset (dn->child); + rtas_configure_bridge(dn); + eeh_restore_bars(dn->child); +printk ("duude, post restore bars, for %s here's the dump\n", dn->full_name); +{ +extern int rtas_read_config(struct device_node *dn, int where, int size, u32 *val); +int i, rc; +u32 val; +struct device_node *xn=dn->child; +for(i=0;i<16;i++) { +rc = rtas_read_config (xn, i*4,4,&val); +printk ("duude read config %d rc=%d val=%x expect=%x\n", i, rc, val,xn->config_space[i]); +}} + + enable_irq (dev->irq); + + /* Give the system 5 seconds to finish running the user-space + * hotplug scripts, e.g. ifdown for ethernet. Yes, this is a hack, + * but if we don't do this, weird things happen. + */ + if (reconfig && frozen_slot) { + ssleep (5); + rpaphp_enable_pci_slot (frozen_slot); + } + return 0; +} + +/* The longest amount of time to wait for a pci device + * to come back on line, in seconds. + */ +#define MAX_WAIT_FOR_RECOVERY 15 + +int handle_eeh_events (struct notifier_block *self, + unsigned long reason, void *ev) +{ + int freeze_count=0; + struct device_node *frozen_device; + struct peh_event *event = ev; + struct pci_dev *dev = event->dev; + int perm_failure = 0; + int rc; + + if (!dev) + { + printk ("EEH: EEH error caught, but no PCI device specified!\n"); + return 1; + } + + frozen_device = get_phb_of_device (dev); + + if (!frozen_device) + { + printk (KERN_ERR "EEH: Cannot find PCI conroller for %s %s\n", + pci_name(dev), pci_pretty_name (dev)); + + return 1; + } + + /* We get "permanent failure" messages on empty slots. + * These are false alarms. Empty slots have no child dn. 
*/ + if ((event->state == pci_channel_io_perm_failure) && (frozen_device == NULL)) + return 0; + + if (frozen_device) + freeze_count = frozen_device->eeh_freeze_count; + freeze_count ++; + if (freeze_count > EEH_MAX_ALLOWED_FREEZES) + perm_failure = 1; + + /* If the reset state is a '5' and the time to reset is 0 (infinity) + * or is more then 15 seconds, then mark this as a permanent failure. + */ + if ((event->state == pci_channel_io_perm_failure) && + ((event->time_unavail <= 0) || + (event->time_unavail > MAX_WAIT_FOR_RECOVERY*1000))) + perm_failure = 1; + + /* Log the error with the rtas logger. */ + if (perm_failure) { + /* + * About 90% of all real-life EEH failures in the field + * are due to poorly seated PCI cards. Only 10% or so are + * due to actual, failed cards. + */ + printk (KERN_ERR + "EEH: device %s:%s has failed %d times \n" + "and has been permanently disabled. Please try reseating\n" + "this device or replacing it.\n", + pci_name (dev), + pci_pretty_name (dev), + freeze_count); + + eeh_slot_error_detail (frozen_device, 2 /* Permanent Error */); + + /* Notify the device that its about to go down. 
*/ + /* XXX this should be a recursive walk to children for + * multi-function devices */ + if (dev->driver->err_handler.error_detected) { + dev->driver->err_handler.error_detected (dev, pci_channel_io_perm_failure); + } + + /* If there's a hotplug slot, unconfigure it */ + struct slot * frozen_slot = rpaphp_find_slot(dev); + rpaphp_unconfig_pci_adapter (frozen_slot); + return 1; + } else { + eeh_slot_error_detail (frozen_device, 1 /* Temporary Error */); + } + + printk (KERN_WARNING + "EEH: This device has failed %d times since last reboot: %s:%s\n", + freeze_count, + pci_name (dev), + pci_pretty_name (dev)); + + /* Walk the various device drivers attached to this slot through + * a reset sequence, giving each an opportunity to do what it needs + * to accomplish the reset */ + /* XXX this should be a recursive walk to children for + * multi-function devices; each child should get to report + * status too, if needed ... if any child can't handle the reset, + * then need to hotplug it. + * XXX This does not follow flow of BenH's last email at all. + * XXX will be fixed later XXX + */ + if (dev->driver->err_handler.error_detected) { + dev->driver->err_handler.error_detected (dev, pci_channel_io_frozen); + rc = eeh_reset_device (dev, frozen_device, 0); + if (dev->driver->err_handler.slot_reset) + dev->driver->err_handler.slot_reset (dev); + } else { + rc = eeh_reset_device (dev, frozen_device, 1); + } + + /* Store the freeze count with the pci adapter, and not the slot. + * This way, if the device is replaced, the count is cleared. 
+ */ + frozen_device->eeh_freeze_count = freeze_count; + + return rc; +} Thanks- John From greg at kroah.com Sat May 14 02:18:55 2005 From: greg at kroah.com (Greg KH) Date: Fri, 13 May 2005 09:18:55 -0700 Subject: [PATCH] Updated: fix-pci-mmap-on-ppc-and-ppc64.patch In-Reply-To: <200505131744.11095.michael@ellerman.id.au> References: <200505131744.11095.michael@ellerman.id.au> Message-ID: <20050513161855.GC11089@kroah.com> On Fri, May 13, 2005 at 05:44:10PM +1000, Michael Ellerman wrote: > Hi Andrew, Hm, why not send this to the pci maintainer? > This is an updated version of Ben's fix-pci-mmap-on-ppc-and-ppc64.patch > which is in 2.6.12-rc4-mm1. > > It fixes the patch to work on PPC iSeries, removes some debug printks > at Ben's request, and incorporates your > fix-pci-mmap-on-ppc-and-ppc64-fix.patch also. I'll add it to my queue. greg k-h From linas at austin.ibm.com Sat May 14 03:28:09 2005 From: linas at austin.ibm.com (Linas Vepstas) Date: Fri, 13 May 2005 12:28:09 -0500 Subject: [PATCH] FYI/ Re: PCI Error Recovery API Proposal (updated) In-Reply-To: <1116001556.31126.28.camel@sinatra.austin.ibm.com> References: <20050224011409.GE2088@austin.ibm.com> <421DDEF7.7080103@jp.fujitsu.com> <20050224231455.GH2088@austin.ibm.com> <421E9D16.3000606@jp.fujitsu.com> <20050312013251.GA2609@austin.ibm.com> <4235847F.3080705@jp.fujitsu.com> <20050314181420.GD498@austin.ibm.com> <1112685311.9518.35.camel@gaston> <20050506230506.GL11745@austin.ibm.com> <1116001556.31126.28.camel@sinatra.austin.ibm.com> Message-ID: <20050513172809.GA4138@austin.ibm.com> On Fri, May 13, 2005 at 11:25:56AM -0500, John Rose was heard to remark: > > I don't think handle_eeh_events() or eeh_reset_device() > belong in the RPA PCI hotplug driver. Suggestions where these should go? They got moved from arch/ppc64/kernel to drivers/pci/hotplug at the urging of Paulus and GregKH; in part because rpaphp can be built as a module, whereas the the ppc64 bits cannot. 
Would a distinct file in drivers/pci/hotplug work? Or something else? > Looking at the patch, all EEH needs from PCI Hotplug are enable and > disable functions. The rpaphp driver could register disable/enable > functions with eeh.c, and we could avoid introducing this unrelated > code to the PCI Hotplug driver. That's how it used to work; no one liked that idea. But it doesn't hurt to revisit. I think the biggest problem here is module dependencies. Alternately, we might struggle to find a generic pci-error-recovery-on-top-of-pci-hotplug solution, but this may be premature at this point. > One other quick comment: > > +static struct device_node * > +get_phb_of_device (struct pci_dev *dev) > +{ > + struct device_node *dn; > + struct pci_bus *bus; > + > + while (1) { > + bus = dev->bus; > + if (!bus) > + break; > + dn = pci_bus_to_OF_node(bus); > + > + if (dn->phb) > + return dn; > + > + dev = bus->self; > + BUG_ON (dev==NULL); > + if (dev == NULL) > + return NULL; > + } > > Could you just use pci_device_to_OF_node(), then dn->phb? This way, > you could avoid the loop. Yes, except that not all dn's have phb's. The goal here is to walk up the tree, until a phb is found. --linas From linas at austin.ibm.com Sat May 14 03:29:33 2005 From: linas at austin.ibm.com (Linas Vepstas) Date: Fri, 13 May 2005 12:29:33 -0500 Subject: [PATCH] FYI/ Re: PCI Error Recovery API Proposal (updated) In-Reply-To: <200505131502.j4DF2S7F015855@snoqualmie.dp.intel.com> References: <200505131502.j4DF2S7F015855@snoqualmie.dp.intel.com> Message-ID: <20050513172933.GB4138@austin.ibm.com> On Fri, May 13, 2005 at 08:02:28AM -0700, long was heard to remark: > On Fri, 6 May 2005 18:05:06, Linas wrote: > >Hi, > > > >This is an "FYI" patch partially implementing the PCI error recovery API > >previously detailed by BenH in an earlier email. > > This patch works fine for my AER code. Please post it to LKML. I'll try to do that "real soon now". I've got Symbios recovery working at this time.
--linas From johnrose at austin.ibm.com Sat May 14 03:40:28 2005 From: johnrose at austin.ibm.com (John Rose) Date: Fri, 13 May 2005 12:40:28 -0500 Subject: [PATCH] FYI/ Re: PCI Error Recovery API Proposal (updated) In-Reply-To: <20050513172809.GA4138@austin.ibm.com> References: <20050224011409.GE2088@austin.ibm.com> <421DDEF7.7080103@jp.fujitsu.com> <20050224231455.GH2088@austin.ibm.com> <421E9D16.3000606@jp.fujitsu.com> <20050312013251.GA2609@austin.ibm.com> <4235847F.3080705@jp.fujitsu.com> <20050314181420.GD498@austin.ibm.com> <1112685311.9518.35.camel@gaston> <20050506230506.GL11745@austin.ibm.com> <1116001556.31126.28.camel@sinatra.austin.ibm.com> <20050513172809.GA4138@austin.ibm.com> Message-ID: <1116006028.31126.101.camel@sinatra.austin.ibm.com> Hi Linas- > Suggestions where these should go? They got moved from arch/ppc64/kernel > to drivers/pci/hotplug at the urging of Paulus and GregKH; in part because > rpaphp can be built as a module, whereas the ppc64 bits cannot. > Would a distinct file in drivers/pci/hotplug work? Or something else? Paul, Greg, can you explain the motivation behind this? Why doesn't the EEH recovery logic belong in eeh.c, with the rest of EEH code? > > Could you just use pci_device_to_OF_node(), then dn->phb? This way, > > you could avoid the loop. > > Yes, except that not all dn's have phb's. The goal here is to walk up > the tree, until a phb is found. If a PCI node had a null phb pointer, that would be a bug :) This pointer should be set for nodes present at boot as well as dynamically added ones.
Thanks- John From greg at kroah.com Sat May 14 04:19:59 2005 From: greg at kroah.com (Greg KH) Date: Fri, 13 May 2005 11:19:59 -0700 Subject: [PATCH] FYI/ Re: PCI Error Recovery API Proposal (updated) In-Reply-To: <1116006028.31126.101.camel@sinatra.austin.ibm.com> References: <20050224231455.GH2088@austin.ibm.com> <421E9D16.3000606@jp.fujitsu.com> <20050312013251.GA2609@austin.ibm.com> <4235847F.3080705@jp.fujitsu.com> <20050314181420.GD498@austin.ibm.com> <1112685311.9518.35.camel@gaston> <20050506230506.GL11745@austin.ibm.com> <1116001556.31126.28.camel@sinatra.austin.ibm.com> <20050513172809.GA4138@austin.ibm.com> <1116006028.31126.101.camel@sinatra.austin.ibm.com> Message-ID: <20050513181958.GA13102@kroah.com> On Fri, May 13, 2005 at 12:40:28PM -0500, John Rose wrote: > Hi Linas- > > > Suggestions where these should go? They got moved from arch/ppc64/kernel > > to drivers/pci/hotplug at the urging of Paulus and GregKH; in part because > > rpaphp can be built as a module, whereas the ppc64 bits cannot. > > Would a distinct file in drivers/pci/hotplug work? Or something else? > > Paul, Greg, can you explain the motivation behind this? Why doesn't the > EEH recovery logic belong in eeh.c, with the rest of EEH code? So that all platforms can use this logic?
Not all the world is a ppc64 box :) greg k-h From johnrose at austin.ibm.com Sat May 14 04:42:56 2005 From: johnrose at austin.ibm.com (John Rose) Date: Fri, 13 May 2005 13:42:56 -0500 Subject: [PATCH] FYI/ Re: PCI Error Recovery API Proposal (updated) In-Reply-To: <20050513181958.GA13102@kroah.com> References: <20050224231455.GH2088@austin.ibm.com> <421E9D16.3000606@jp.fujitsu.com> <20050312013251.GA2609@austin.ibm.com> <4235847F.3080705@jp.fujitsu.com> <20050314181420.GD498@austin.ibm.com> <1112685311.9518.35.camel@gaston> <20050506230506.GL11745@austin.ibm.com> <1116001556.31126.28.camel@sinatra.austin.ibm.com> <20050513172809.GA4138@austin.ibm.com> <1116006028.31126.101.camel@sinatra.austin.ibm.com> <20050513181958.GA13102@kroah.com> Message-ID: <1116009776.2018.17.camel@sinatra.austin.ibm.com> > So that all platforms can use this logic? Not all the world is a ppc64 > box :) ?? The chunk of code in question is quite PPC64-specific, so it's not of use to other platforms. The code is being placed in the rpaphp driver, which is also PPC64 specific. Why couldn't arch/ppc64/kernel/eeh.c register an error_recover() implementation with the generic layer? 
Thanks- John From greg at kroah.com Sat May 14 04:49:50 2005 From: greg at kroah.com (Greg KH) Date: Fri, 13 May 2005 11:49:50 -0700 Subject: [PATCH] FYI/ Re: PCI Error Recovery API Proposal (updated) In-Reply-To: <1116009776.2018.17.camel@sinatra.austin.ibm.com> References: <20050312013251.GA2609@austin.ibm.com> <4235847F.3080705@jp.fujitsu.com> <20050314181420.GD498@austin.ibm.com> <1112685311.9518.35.camel@gaston> <20050506230506.GL11745@austin.ibm.com> <1116001556.31126.28.camel@sinatra.austin.ibm.com> <20050513172809.GA4138@austin.ibm.com> <1116006028.31126.101.camel@sinatra.austin.ibm.com> <20050513181958.GA13102@kroah.com> <1116009776.2018.17.camel@sinatra.austin.ibm.com> Message-ID: <20050513184950.GA13413@kroah.com> On Fri, May 13, 2005 at 01:42:56PM -0500, John Rose wrote: > > So that all platforms can use this logic? Not all the world is a ppc64 > > box :) > > ?? The chunk of code in question is quite PPC64-specific, so it's not > of use to other platforms. The code is being placed in the rpaphp > driver, which is also PPC64 specific. > > Why couldn't arch/ppc64/kernel/eeh.c register an error_recover() > implementation with the generic layer? To be honest, I don't really remember anymore, this thread has been going on for _months_ now :) greg k-h From arnd at arndb.de Sat May 14 05:23:49 2005 From: arnd at arndb.de (Arnd Bergmann) Date: Fri, 13 May 2005 21:23:49 +0200 Subject: [PATCH 2/8] ppc64: add a minimal nvram driver In-Reply-To: <200505132117.37461.arnd@arndb.de> References: <200505132117.37461.arnd@arndb.de> Message-ID: <200505132123.50284.arnd@arndb.de> The firmware provides the location and size of the nvram in the device tree, so it does not really contain any hardware specific bits and could be used on other machines as well. 
From: Utz Bacher Signed-off-by: Arnd Bergmann Index: linus-2.5/arch/ppc64/kernel/bpa_nvram.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linus-2.5/arch/ppc64/kernel/bpa_nvram.c 2005-04-20 01:55:36.000000000 +0200 @@ -0,0 +1,118 @@ +/* + * NVRAM for CPBW + * + * (C) Copyright IBM Corp. 2005 + * + * Authors : Utz Bacher + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +static void __iomem *bpa_nvram_start; +static long bpa_nvram_len; +static spinlock_t bpa_nvram_lock = SPIN_LOCK_UNLOCKED; + +static ssize_t bpa_nvram_read(char *buf, size_t count, loff_t *index) +{ + unsigned long flags; + + if (*index >= bpa_nvram_len) + return 0; + if (*index + count > bpa_nvram_len) + count = bpa_nvram_len - *index; + + spin_lock_irqsave(&bpa_nvram_lock, flags); + + memcpy_fromio(buf, bpa_nvram_start + *index, count); + + spin_unlock_irqrestore(&bpa_nvram_lock, flags); + + *index += count; + return count; +} + +static ssize_t bpa_nvram_write(char *buf, size_t count, loff_t *index) +{ + unsigned long flags; + + if (*index >= bpa_nvram_len) + return 0; + if (*index + count > bpa_nvram_len) + count = bpa_nvram_len - *index; + + spin_lock_irqsave(&bpa_nvram_lock, flags); + + memcpy_toio(bpa_nvram_start + *index, buf, count); + + spin_unlock_irqrestore(&bpa_nvram_lock, flags); + + *index += count; + return count; +} + +static ssize_t bpa_nvram_get_size(void) +{ + return bpa_nvram_len; +} + +int __init bpa_nvram_init(void) +{ + struct device_node *nvram_node; + unsigned long *buffer; + int proplen; + unsigned long nvram_addr; + int ret; + + ret = -ENODEV; + nvram_node = of_find_node_by_type(NULL, "nvram"); + if (!nvram_node) + goto out; + + ret = -EIO; + buffer = (unsigned long *)get_property(nvram_node, "reg", &proplen); + if (proplen != 2*sizeof(unsigned long)) + goto out; + + ret = -ENODEV; + nvram_addr = buffer[0]; + bpa_nvram_len = buffer[1]; + if ( (!bpa_nvram_len) || (!nvram_addr) ) + goto out; + + bpa_nvram_start = ioremap(nvram_addr, bpa_nvram_len); + if (!bpa_nvram_start) + goto out; + + printk(KERN_INFO "BPA NVRAM, %luk mapped to %p\n", + bpa_nvram_len >> 10, bpa_nvram_start); + + ppc_md.nvram_read = bpa_nvram_read; + ppc_md.nvram_write = bpa_nvram_write; + ppc_md.nvram_size = bpa_nvram_get_size; + +out: + of_node_put(nvram_node); + return ret; +} 
Index: linus-2.5/include/asm-ppc64/nvram.h =================================================================== --- linus-2.5.orig/include/asm-ppc64/nvram.h 2005-04-20 01:54:03.000000000 +0200 +++ linus-2.5/include/asm-ppc64/nvram.h 2005-04-20 01:55:36.000000000 +0200 @@ -70,6 +70,7 @@ extern int pSeries_nvram_init(void); extern int pmac_nvram_init(void); +extern int bpa_nvram_init(void); /* PowerMac specific nvram stuffs */ From arnd at arndb.de Sat May 14 05:21:25 2005 From: arnd at arndb.de (Arnd Bergmann) Date: Fri, 13 May 2005 21:21:25 +0200 Subject: [PATCH 1/8] ppc64: split out generic rtas code from pSeries_pci.c In-Reply-To: <200505132117.37461.arnd@arndb.de> References: <200505132117.37461.arnd@arndb.de> Message-ID: <200505132121.26996.arnd@arndb.de> BPA is using rtas for PCI but should not be confused by pSeries code. This also avoids some #ifdefs. Other platforms that want to use rtas_pci.c could create their own platform_pci.c with platform specific fixups. Signed-off-by: Arnd Bergmann --- linux-cg.orig/arch/ppc64/kernel/Makefile 2005-05-13 14:56:19.016994560 -0400 +++ linux-cg/arch/ppc64/kernel/Makefile 2005-05-13 15:00:05.111971888 -0400 @@ -32,13 +32,14 @@ obj-$(CONFIG_PPC_MULTIPLATFORM) += nvram obj-$(CONFIG_PPC_PSERIES) += pSeries_pci.o pSeries_lpar.o pSeries_hvCall.o \ pSeries_nvram.o rtasd.o ras.o pSeries_reconfig.o \ -