[PATCH] [2.4] show TCE statistics in /proc/ppc64
linas at austin.ibm.com
Wed Nov 26 10:09:21 EST 2003
Actually attaching the actual patch would be appropriate ...
On Tue, Nov 25, 2003 at 04:22:01PM -0600, linas at austin.ibm.com wrote:
>
> The attached patch should apply cleanly against the current
> ameslab-2.4 tree. It provides TCE usage statistics in /proc
> that might be useful to anyone interested in DMA performance
> or in debugging DMA usage in device drivers.
>
> I don't have BK access, so I can't bk push anything at this time.
> Paul, if this patch looks good, can you apply it as needed?
>
> The TCE usage stats are in the /proc/ppc64/tce directory.
> Get stats for all PCI devices by 'cat /proc/ppc64/tce/stats'.
> One can also get more detailed per-device stats with an
> 'echo "show bb:dd" > /proc/ppc64/tce/stats'
> where bb is the hex bus number and dd is the hex device number
> (cat /proc/ppc64/pci to figure these out). One can then
> 'cat /proc/ppc64/tce/detail-bb:dd' to see the detailed stats.
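>
> For example, using the device from the sample output below (hex bus 21,
> device 01), a full session might look like this (a sketch; substitute the
> bus/device numbers reported by your own system):
>
>     cat /proc/ppc64/tce/stats
>     echo "show 21:01" > /proc/ppc64/tce/stats
>     cat /proc/ppc64/tce/detail-21:01
>
> Writing 'echo "reset" > /proc/ppc64/tce/stats' zeroes the counters again
> and tears down any detailed per-block stats.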
>
> Here's a sample of the 'detail' output:
>
> device 21:01
> Fri Nov 21 14:04:07 PST 2003
> total_use_cnt=92042 alloc_cnt=1267 max_alloc_cnt=1267
> Level use_cnt split merge alloc maxaloc actual stale entries
> 0 29552 0 0 59 114 59 1 32768
> 1 1988 60 7 662 675 3 0 16384
> 2 3882 35 5 11 25 11 0 8192
> 3 7147 31 5 30 44 30 0 4096
> 4 12405 44 12 45 69 45 0 2048
> 5 28815 71 26 285 285 285 0 1024
> 6 8253 336 171 175 175 175 0 512
> 7 0 633 463 0 1 0 0 256
> 8 0 314 229 0 1 0 0 128
> 9 0 153 110 0 1 0 0 64
>
> [dump of TCE bitmaps cut out]
>
> --linas
>
>
-------------- next part --------------
Index: arch/ppc64/config.in
===================================================================
RCS file: /home/linas/cvsroot/linux24/arch/ppc64/config.in,v
retrieving revision 1.1.1.4
diff -u -p -u -p -r1.1.1.4 config.in
--- arch/ppc64/config.in 25 Nov 2003 20:04:34 -0000 1.1.1.4
+++ arch/ppc64/config.in 25 Nov 2003 20:43:04 -0000
@@ -50,6 +50,11 @@ bool 'Shared kernel/user space addressin
tristate 'LPAR Configuration Data' CONFIG_LPARCFG
+if [ "$CONFIG_PPC_ISERIES" != "y" ]; then
+ if [ "$CONFIG_PROC_FS" = "y" ]; then
+ bool 'Show realtime tce usage stats in /proc/ppc64/tce' CONFIG_TCE_STATS
+ fi
+fi
endmenu
mainmenu_option next_comment
Index: arch/ppc64/kdb/kdbasupport.c
===================================================================
RCS file: /home/linas/cvsroot/linux24/arch/ppc64/kdb/Attic/kdbasupport.c,v
retrieving revision 1.1.1.1
diff -u -p -u -p -r1.1.1.1 kdbasupport.c
--- arch/ppc64/kdb/kdbasupport.c 25 Nov 2003 20:04:34 -0000 1.1.1.1
+++ arch/ppc64/kdb/kdbasupport.c 25 Nov 2003 20:32:44 -0000
@@ -1617,11 +1617,16 @@ kdba_dump_tce_table(int argc, const char
long tce_table_address;
int nr;
int i,j,k;
- int full,empty;
+ int full,partial,empty;
int fulldump=0;
u64 mapentry;
- int totalpages;
+ int freepages;
int levelpages;
+#ifdef CONFIG_TCE_STATS
+ struct tce_blk_stats *blk_stats;
+ int alloced_blocks, stale_blocks;
+ unsigned long alloc_jiffies;
+#endif /* CONFIG_TCE_STATS */
if (argc == 0) {
kdb_printf("need address\n");
@@ -1634,17 +1639,21 @@ kdba_dump_tce_table(int argc, const char
if (strcmp(argv[2], "full") == 0)
fulldump=1;
- /* with address, read contents of memory and dump tce table. */
- /* possibly making some assumptions on the depth and size of table..*/
+ /* use address to read contents of memory and dump tce table. */
- nr = kdba_readarea_size(tce_table_address+0 ,&kt.busNumber,8);
- nr = kdba_readarea_size(tce_table_address+8 ,&kt.size,8);
- nr = kdba_readarea_size(tce_table_address+16,&kt.startOffset,8);
- nr = kdba_readarea_size(tce_table_address+24,&kt.base,8);
- nr = kdba_readarea_size(tce_table_address+32,&kt.index,8);
- nr = kdba_readarea_size(tce_table_address+40,&kt.tceType,8);
+#define GET_TCE_VAL(X) \
+ nr = kdba_readarea_size( \
+ ((long) &(((struct TceTable *)tce_table_address)->X)), \
+ &(kt.X), sizeof(kt.X));
+
+ GET_TCE_VAL (busNumber);
+ GET_TCE_VAL (size);
+ GET_TCE_VAL (startOffset);
+ GET_TCE_VAL (base);
+ GET_TCE_VAL (index);
+ GET_TCE_VAL (tceType);
#ifdef CONFIG_SMP
- nr = kdba_readarea_size(tce_table_address+48,&kt.lock,8);
+ GET_TCE_VAL (lock);
#endif
kdb_printf("\n");
@@ -1658,43 +1667,94 @@ kdba_dump_tce_table(int argc, const char
#ifdef CONFIG_SMP
kdb_printf("lock: 0x%x \n",(uint)kt.lock.lock);
#endif
- nr = kdba_readarea_size(tce_table_address+56,&kt.mlbm.maxLevel,8);
- kdb_printf(" maxLevel: 0x%x \n",(uint)kt.mlbm.maxLevel);
- totalpages=0;
+ GET_TCE_VAL (mlbm.maxLevel);
+ kdb_printf(" maxLevel: 0x%x \n",(uint)kt.mlbm.maxLevel);
+#ifdef CONFIG_TCE_STATS
+ GET_TCE_VAL (use_cnt);
+ GET_TCE_VAL (alloc_cnt);
+ kdb_printf(" use_cnt: %d \n",(uint)kt.use_cnt);
+ kdb_printf(" alloc_cnt: %d \n",(uint)kt.alloc_cnt);
+#endif
+ freepages=0;
for (i=0;i<NUM_TCE_LEVELS;i++) {
- nr = kdba_readarea_size(tce_table_address+64+i*24,&kt.mlbm.level[i].numBits,8);
- nr = kdba_readarea_size(tce_table_address+72+i*24,&kt.mlbm.level[i].numBytes,8);
- nr = kdba_readarea_size(tce_table_address+80+i*24,&kt.mlbm.level[i].map,8);
+ GET_TCE_VAL (mlbm.level[i].numBits);
+ GET_TCE_VAL (mlbm.level[i].numBytes);
+ GET_TCE_VAL (mlbm.level[i].map);
kdb_printf(" level[%d]\n",i);
kdb_printf(" numBits: 0x%x\n",(uint)kt.mlbm.level[i].numBits);
kdb_printf(" numBytes: 0x%x\n",(uint)kt.mlbm.level[i].numBytes);
kdb_printf(" map*: %p\n",kt.mlbm.level[i].map);
+#ifdef CONFIG_TCE_STATS
+ GET_TCE_VAL (mlbm.level[i].blk_stats);
+ blk_stats = kt.mlbm.level[i].blk_stats;
+ if (blk_stats) {
+ alloced_blocks = 0;
+ stale_blocks = 0;
+ /* alloc_jiffies will be set if the block is allocated and not freed.
+ * stale blocks suggest a leak or a really slow i/o system */
+ for (j=0; j<kt.mlbm.level[i].numBits; j++) {
+ kdba_readarea_size( (long) (&(blk_stats[j].alloc_jiffies)), &alloc_jiffies, 8);
+ if (alloc_jiffies && alloc_jiffies != ((unsigned long) -1)) {
+ alloced_blocks++;
+ /* 'stale' if alloc happened more than 3 seconds ago */
+ if (jiffies - alloc_jiffies > 3*HZ) {
+ stale_blocks ++;
+ }
+ }
+ }
+ kdb_printf(" blk_stats: %p\n", blk_stats);
+ } else {
+ alloced_blocks = -1;
+ stale_blocks = -1;
+ }
+ GET_TCE_VAL (mlbm.level[i].use_cnt);
+ GET_TCE_VAL (mlbm.level[i].split_cnt);
+ GET_TCE_VAL (mlbm.level[i].merge_cnt);
+ GET_TCE_VAL (mlbm.level[i].alloc_cnt);
+ kdb_printf(" use_cnt: %d split: %d merge: %d alloced: %d cnt-alloc: %d stale: %d\n",
+ kt.mlbm.level[i].use_cnt,
+ kt.mlbm.level[i].split_cnt,
+ kt.mlbm.level[i].merge_cnt,
+ kt.mlbm.level[i].alloc_cnt,
+ alloced_blocks,
+ stale_blocks);
+#endif
/* if these dont match, this might not be a valid tce table, so
dont try to iterate the map entries. */
if (kt.mlbm.level[i].numBits == 8*kt.mlbm.level[i].numBytes) {
- full=0;empty=0;levelpages=0;
+ int n=0;
+ full=0;partial=0;empty=0;levelpages=0;
for (j=0;j<kt.mlbm.level[i].numBytes; j++) {
mapentry=0;
nr = kdba_readarea_size((long int)(kt.mlbm.level[i].map+j),&mapentry,1);
- if (mapentry)
+ mapentry >>= 56;
+ if (mapentry == 0xff)
full++;
+ else if (mapentry)
+ partial++;
else
empty++;
if (mapentry && fulldump) {
- kdb_printf("0x%lx\n",mapentry);
+ if (n && (n%32 == 0)) kdb_printf ("\n");
+ kdb_printf("%02lx ",(int) mapentry);
+ n++;
}
- for (k=0;(k<=64) && ((0x1UL<<k) <= mapentry);k++) {
+ for (k=0;(k<8) && ((0x1UL<<k) <= mapentry);k++) {
if ((0x1UL<<k) & mapentry) levelpages++;
}
}
- kdb_printf(" full:0x%x empty:0x%x pages:0x%x\n",full,empty,levelpages);
+ if (fulldump) kdb_printf ("\n");
+ kdb_printf(" full:0x%x partial:0x%x empty:0x%x free:0x%x\n",
+ full,partial,empty,levelpages);
} else {
kdb_printf(" numBits/numBytes mismatch..? \n");
}
- totalpages+=levelpages;
+ freepages += (1UL<<i) * levelpages;
}
- kdb_printf(" Total pages:0x%x\n",totalpages);
+ kdb_printf(" Total pages: 0x%lx free:0x%x used:0x%lx\n",
+ kt.mlbm.level[0].numBits, freepages,
+ kt.mlbm.level[0].numBits - freepages);
kdb_printf("\n");
return 0;
}
Index: arch/ppc64/kernel/Makefile
===================================================================
RCS file: /home/linas/cvsroot/linux24/arch/ppc64/kernel/Makefile,v
retrieving revision 1.1.1.3
diff -u -p -u -p -r1.1.1.3 Makefile
--- arch/ppc64/kernel/Makefile 25 Nov 2003 20:04:35 -0000 1.1.1.3
+++ arch/ppc64/kernel/Makefile 25 Nov 2003 20:32:44 -0000
@@ -29,7 +29,7 @@ obj-y := ppc_ksyms.o setup
iSeries_proc.o HvCall.o flight_recorder.o HvLpConfig.o \
rtc.o perfmon.o cputable.o vio.o
-obj-$(CONFIG_PCI) += pci.o pci_dn.o pci_dma.o pSeries_lpar.o pSeries_hvCall.o
+obj-$(CONFIG_PCI) += pci.o pci_dn.o pci_dma.o proc_tce.o pSeries_lpar.o pSeries_hvCall.o
ifeq ($(CONFIG_PPC_ISERIES),y)
obj-$(CONFIG_PCI) += iSeries_pci.o iSeries_pci_reset.o iSeries_IoMmTable.o iSeries_irq.o iSeries_VpdInfo.o XmPciLpEvent.o
Index: arch/ppc64/kernel/pci_dma.c
===================================================================
RCS file: /home/linas/cvsroot/linux24/arch/ppc64/kernel/pci_dma.c,v
retrieving revision 1.1.1.3
diff -u -p -u -p -r1.1.1.3 pci_dma.c
--- arch/ppc64/kernel/pci_dma.c 25 Nov 2003 20:04:37 -0000 1.1.1.3
+++ arch/ppc64/kernel/pci_dma.c 25 Nov 2003 20:32:44 -0000
@@ -44,7 +44,7 @@
#include "pci.h"
/* #define DEBUG_TCE 1 */
-/* #define MONITOR_TCE 1 */ /* Turn on to sanity check TCE generation. */
+#define MONITOR_TCE 1 /* Turn on to sanity check TCE generation. */
/* Initialize so this guy does not end up in the BSS section.
@@ -212,6 +212,30 @@ static void tce_build_pSeries(struct Tce
}
+#ifdef CONFIG_TCE_STATS
+/*
+ * Initialize tce-table statistics. Typically handy only for device driver
+ * debugging, perf tuning, etc.
+ */
+static void init_tce_stats(struct TceTable * tbl)
+{
+ int i;
+
+ for (i=0; i<NUM_TCE_LEVELS; ++i) {
+ tbl->mlbm.level[i].use_cnt = 0;
+ tbl->mlbm.level[i].split_cnt = 0;
+ tbl->mlbm.level[i].merge_cnt = 0;
+ tbl->mlbm.level[i].alloc_cnt = 0;
+ tbl->mlbm.level[i].max_alloc_cnt = 0;
+ tbl->mlbm.level[i].blk_stats = 0x0;
+ }
+
+ tbl->use_cnt = 0;
+ tbl->alloc_cnt = 0;
+ tbl->max_alloc_cnt = 0;
+}
+
+#endif /* CONFIG_TCE_STATS */
/*
* Build a TceTable structure. This contains a multi-level bit map which
* is used to manage allocation of the tce space.
@@ -276,7 +300,6 @@ struct TceTable *build_tce_table(struct
}
/* For the highest level, turn on all the bits */
-
i = tbl->mlbm.maxLevel;
p = tbl->mlbm.level[i].map;
m = numBits[i];
@@ -301,6 +324,10 @@ struct TceTable *build_tce_table(struct
}
}
+#ifdef CONFIG_TCE_STATS
+ init_tce_stats (tbl);
+#endif /* CONFIG_TCE_STATS */
+
return tbl;
}
@@ -364,6 +391,37 @@ static long alloc_tce_range_nolock( stru
*/
PPCDBG(PPCDBG_TCE, "alloc_tce_range_nolock: allocating block %ld, (byte=%ld, bit=%ld) order %d\n", block, i, bit, order );
tcenum = block << order;
+#ifdef CONFIG_TCE_STATS
+ if (tbl->mlbm.level[order].blk_stats) {
+ tbl->mlbm.level[order].blk_stats[block].use_cnt ++;
+ tbl->mlbm.level[order].blk_stats[block].alloc_jiffies = jiffies;
+ }
+ tbl->mlbm.level[order].use_cnt ++;
+ tbl->use_cnt ++;
+ tbl->mlbm.level[order].alloc_cnt ++;
+ tbl->alloc_cnt ++;
+ if (tbl->mlbm.level[order].max_alloc_cnt <
+ tbl->mlbm.level[order].alloc_cnt) {
+ tbl->mlbm.level[order].max_alloc_cnt =
+ tbl->mlbm.level[order].alloc_cnt;
+ }
+ if (tbl->max_alloc_cnt < tbl->alloc_cnt) {
+ tbl->max_alloc_cnt = tbl->alloc_cnt;
+ }
+
+#define THRESHOLD 1000
+ static int watermark = THRESHOLD;
+ if (tbl->alloc_cnt > watermark) {
+ printk ("alloc_tce_range: more than %d ranges alloced (%d)\n",
+ watermark, tbl->alloc_cnt);
+ watermark += THRESHOLD;
+ }
+ if (((int)tbl->alloc_cnt) < ((int)(watermark - 2*THRESHOLD))) {
+ watermark -= THRESHOLD;
+ printk ("alloc_tce_range: alloc usage dropped below %d (%d)\n",
+ watermark-THRESHOLD, tbl->alloc_cnt);
+ }
+#endif /* CONFIG_TCE_STATS */
return tcenum;
}
++map;
@@ -388,6 +446,25 @@ static long alloc_tce_range_nolock( stru
if((tcenum == -1) && (order < (NUM_TCE_LEVELS - 1))) {
tcenum = alloc_tce_range_nolock( tbl, order+1 );
if ( tcenum != -1 ) {
+#ifdef CONFIG_TCE_STATS
+ /* fix up stats for 'what we actually used' */
+ if (tbl->mlbm.level[order].blk_stats) {
+ tbl->mlbm.level[order].blk_stats[(tcenum>>order)].alloc_jiffies = jiffies;
+ tbl->mlbm.level[order].blk_stats[(tcenum>>order)].use_cnt ++;
+ tbl->mlbm.level[order].blk_stats[(tcenum>>order)+1].alloc_jiffies = jiffies;
+ }
+ if (tbl->mlbm.level[order+1].blk_stats) {
+ tbl->mlbm.level[order+1].blk_stats[(tcenum>>(order+1))].alloc_jiffies = (unsigned long) -1;
+ tbl->mlbm.level[order+1].blk_stats[(tcenum>>(order+1))].use_cnt --;
+ }
+ tbl->mlbm.level[order].use_cnt ++;
+ tbl->mlbm.level[order+1].use_cnt --;
+ tbl->mlbm.level[order+1].split_cnt ++;
+
+ tbl->mlbm.level[order+1].alloc_cnt --; /* uncount higher order */
+ tbl->mlbm.level[order].alloc_cnt +=2; /* count twice, since next free will uncount */
+ tbl->alloc_cnt ++; /* count 'twice' since free will uncount */
+#endif /* CONFIG_TCE_STATS */
free_tce_range_nolock( tbl, tcenum+(1<<order), order );
}
}
@@ -450,6 +527,17 @@ void free_tce_range_nolock(struct TceTab
mask = 0x80 >> bit;
bytep = map + byte;
+#ifdef CONFIG_TCE_STATS
+ tbl->alloc_cnt --;
+ tbl->mlbm.level[order].alloc_cnt --;
+ if (tbl->mlbm.level[order].blk_stats) {
+ if (0 == tbl->mlbm.level[order].blk_stats[block].alloc_jiffies) {
+ printk("PCI_DMA: Freeing tce that wasn't alloced: busno 0x%lx tcenum %lx, order %x\n", tbl->busNumber, tcenum,order);
+ }
+ tbl->mlbm.level[order].blk_stats[block].alloc_jiffies = 0;
+ }
+#endif /* CONFIG_TCE_STATS */
+
#ifdef DEBUG_TCE
PPCDBG(PPCDBG_TCE,"free_tce_range_nolock: freeing block %ld (byte=%d, bit=%d) of order %d\n",
block, byte, bit, order);
@@ -487,6 +575,11 @@ void free_tce_range_nolock(struct TceTab
PPCDBG(PPCDBG_TCE,
"free_tce_range: buddying blocks %ld & %ld\n",
block, block+1);
+#ifdef CONFIG_TCE_STATS
+ tbl->mlbm.level[order+1].merge_cnt ++;
+ tbl->alloc_cnt ++; /* undo excess counting */
+ tbl->mlbm.level[order+1].alloc_cnt ++; /* undo excess counts */
+#endif /* CONFIG_TCE_STATS */
free_tce_range_nolock( tbl, tcenum, order+1 );
}
}
@@ -757,8 +850,8 @@ void create_tce_tables(void) {
void create_pci_bus_tce_table( unsigned long token ) {
struct TceTable * newTceTable;
- PPCDBG(PPCDBG_TCE, "Entering create_pci_bus_tce_table.\n");
- PPCDBG(PPCDBG_TCE, "\ttoken = 0x%lx\n", token);
+ PPCDBG(PPCDBG_TCEINIT, "Entering create_pci_bus_tce_table.\n");
+ PPCDBG(PPCDBG_TCEINIT, "\ttoken = 0x%lx\n", token);
newTceTable = (struct TceTable *)kmalloc( sizeof(struct TceTable), GFP_KERNEL );
@@ -1084,7 +1177,7 @@ dma_addr_t pci_map_single(struct pci_dev
unsigned order, nPages;
PPCDBG(PPCDBG_TCE, "pci_map_single:\n");
- PPCDBG(PPCDBG_TCE, "\thwdev = 0x%16.16lx, size = 0x%16.16lx, direction = 0x%16.16lx, vaddr = 0x%16.16lx\n", hwdev, size, direction, vaddr);
+ PPCDBG(PPCDBG_TCE, "\thwdev = 0x%16.16lx, size = 0x%lx, direction = %ld, vaddr = 0x%16.16lx\n", hwdev, size, direction, vaddr);
if (direction == PCI_DMA_NONE)
BUG();
@@ -1297,7 +1390,7 @@ static dma_addr_t create_tces_sg(struct
/* Client asked for way to much space. This is checked later anyway */
/* It is easier to debug here for the drivers than in the tce tables.*/
if(order >= NUM_TCE_LEVELS) {
- printk("PCI_DMA: create_tces_sg size too large: 0x%llx \n",(numTces << PAGE_SHIFT));
+ printk("PCI_DMA: create_tces_sg size too large: 0x%x \n",(numTces << PAGE_SHIFT));
panic("numTces is off");
return NO_TCE;
}
@@ -1403,7 +1496,7 @@ void pci_unmap_sg( struct pci_dev *hwdev
dma_addr_t dma_end_page, dma_start_page;
PPCDBG(PPCDBG_TCE, "pci_unmap_sg:\n");
- PPCDBG(PPCDBG_TCE, "\thwdev = 0x%16.16lx, sg = 0x%16.16lx, direction = 0x%16.16lx, nelms = 0x%16.16lx\n", hwdev, sg, direction, nelms);
+ PPCDBG(PPCDBG_TCE, "\thwdev = 0x%16.16lx, sg = 0x%16.16lx, direction = %ld, nelms = %ld\n", hwdev, sg, direction, nelms);
if ( direction == PCI_DMA_NONE || nelms == 0 )
BUG();
@@ -1425,7 +1518,7 @@ void pci_unmap_sg( struct pci_dev *hwdev
/* Client asked for way to much space. This is checked later anyway */
/* It is easier to debug here for the drivers than in the tce tables.*/
if(order >= NUM_TCE_LEVELS) {
- printk("PCI_DMA: dma_start_page:0x%lx dma_end_page:0x%lx\n",dma_start_page,dma_end_page);
+ printk("PCI_DMA: dma_start_page:0x%x dma_end_page:0x%x\n",dma_start_page,dma_end_page);
printk("PCI_DMA: pci_unmap_sg size too large: 0x%x \n",(numTces << PAGE_SHIFT));
return;
}
Index: arch/ppc64/kernel/proc_pmc.c
===================================================================
RCS file: /home/linas/cvsroot/linux24/arch/ppc64/kernel/proc_pmc.c,v
retrieving revision 1.1.1.3
diff -u -p -u -p -r1.1.1.3 proc_pmc.c
--- arch/ppc64/kernel/proc_pmc.c 25 Nov 2003 20:04:36 -0000 1.1.1.3
+++ arch/ppc64/kernel/proc_pmc.c 25 Nov 2003 20:32:44 -0000
@@ -47,6 +47,11 @@
/* pci Flight Recorder AHT */
extern void proc_pciFr_init(struct proc_dir_entry *proc_ppc64_root);
+#ifdef CONFIG_TCE_STATS
+/* PCI TCE stats interface */
+extern void proc_tce_init(struct proc_dir_entry *proc_ppc64_root);
+#endif /* CONFIG_TCE_STATS */
+
static int proc_pmc_control_mode = 0;
struct proc_dir_entry *proc_ppc64_root = NULL;
@@ -188,6 +193,11 @@ void proc_ppc64_init(void)
/* Create the /proc/ppc64/pcifr for the Pci Flight Recorder. */
proc_pciFr_init(proc_ppc64_root);
+
+#ifdef CONFIG_TCE_STATS
+ /* Create the /proc/ppc64/tce entry for TCE stats/debugging */
+ proc_tce_init (proc_ppc64_root);
+#endif /* CONFIG_TCE_STATS */
proc_ppc64_pmc_root = proc_mkdir("pmc", proc_ppc64_root);
Index: include/asm-ppc64/pci_dma.h
===================================================================
RCS file: /home/linas/cvsroot/linux24/include/asm-ppc64/pci_dma.h,v
retrieving revision 1.1.1.1
diff -u -p -u -p -r1.1.1.1 pci_dma.h
--- include/asm-ppc64/pci_dma.h 15 Jul 2003 16:54:54 -0000 1.1.1.1
+++ include/asm-ppc64/pci_dma.h 25 Nov 2003 21:27:08 -0000
@@ -53,10 +53,32 @@ union Tce {
} tceBits;
};
+#ifdef CONFIG_TCE_STATS
+struct tce_blk_stats {
+ unsigned long alloc_jiffies; /* time when last allocated, helps find leaks */
+ unsigned int use_cnt; /* how many times this block has been alloced */
+ char direction; /* last i/o direction */
+};
+#endif /* CONFIG_TCE_STATS */
+
struct Bitmap {
unsigned long numBits;
unsigned long numBytes;
unsigned char * map;
+#ifdef CONFIG_TCE_STATS
+ unsigned int use_cnt; /* num of blocks that were ever alloced */
+
+ /* The split/merge counts provide stats about the buddy system,
+ * helping debug fragmentation problems. */
+ unsigned int split_cnt; /* num blocks split to make smaller blocks */
+ unsigned int merge_cnt; /* num blocks buddied back up by free */
+
+ unsigned int alloc_cnt; /* num alloc's currently pending */
+ unsigned int max_alloc_cnt; /* highest num alloc's ever */
+
+ /* Individual block stats should help debug alloc leaks. */
+ struct tce_blk_stats * blk_stats;
+#endif /* CONFIG_TCE_STATS */
};
struct MultiLevelBitmap {
@@ -73,6 +95,11 @@ struct TceTable {
u64 tceType;
spinlock_t lock;
struct MultiLevelBitmap mlbm;
+#ifdef CONFIG_TCE_STATS
+ unsigned int use_cnt; /* num alloc's that were ever made */
+ unsigned int alloc_cnt; /* num alloc's currently pending */
+ unsigned int max_alloc_cnt; /* highest num alloc's ever */
+#endif /* CONFIG_TCE_STATS */
};
struct TceTableManagerCB {
--- arch/ppc64/kernel/proc_tce.c.orig 2003-11-21 18:34:35.000000000 -0600
+++ arch/ppc64/kernel/proc_tce.c 2003-11-24 18:11:10.000000000 -0600
@@ -0,0 +1,484 @@
+/*
+ * proc_tce.c
+ * Copyright (C) 2003 Linas Vepstas, IBM Corporation
+ *
+ * Dynamic DMA mapping statistics support.
+ *
+ * Manages the TCE space assigned to this partition.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/fs.h>
+#include <linux/pci.h>
+#include <linux/proc_fs.h>
+#include <asm/pci_dma.h>
+#include <asm/uaccess.h>
+
+#include "pci.h"
+
+#ifdef CONFIG_TCE_STATS
+
+static struct proc_dir_entry *proc_ppc64_tce_root = NULL;
+
+/* ================================================================= */
+/* Alloc the detail-stats array. */
+
+static inline int
+get_tce_stats_bytes(struct TceTable * tbl)
+{
+ int num_entries, num_bytes;
+
+ num_entries = tbl->mlbm.level[0].numBits;
+ num_entries *= 2; /* room for other levels as well */
+ num_bytes = num_entries * sizeof( struct tce_blk_stats );
+ return num_bytes;
+}
+
+static inline int
+get_tce_stats_order(struct TceTable * tbl)
+{
+ return get_order (get_tce_stats_bytes(tbl));
+}
+
+#define TRUE 1
+#define FALSE 0
+
+static int
+blk_is_alloced (struct TceTable * tbl, int tcenum, int order)
+{
+ unsigned byte, bit;
+ unsigned char mask, *bytep;
+
+ if (order < 0) return TRUE;
+ if (order > tbl->mlbm.maxLevel) return TRUE;
+ bit = tcenum >> order;
+ byte = bit /8;
+ bit = bit%8;
+ mask = 0x80>>bit;
+ bytep = tbl->mlbm.level[order].map + byte;
+ if (mask & *bytep) return FALSE;
+
+ /* check downwards */
+ if (FALSE == blk_is_alloced (tbl, tcenum, order-1)) return FALSE;
+ if (FALSE == blk_is_alloced (tbl, tcenum+1, order-1)) return FALSE;
+
+ return blk_is_alloced (tbl, tcenum, order+1);
+}
+
+static void
+setup_detail_tce_stats(struct TceTable * tbl)
+{
+ int i;
+ struct tce_blk_stats *p;
+
+ /* Alloc per-block stats array */
+ p = (struct tce_blk_stats *)
+ __get_free_pages( GFP_ATOMIC, get_tce_stats_order(tbl));
+
+ /* alloc may fail for large areas; keep driving */
+ if (p) memset( p, 0, get_tce_stats_bytes(tbl) );
+
+ for (i=0; i<=tbl->mlbm.maxLevel; ++i) {
+ tbl->mlbm.level[i].use_cnt = 0;
+ tbl->mlbm.level[i].split_cnt = 0;
+ tbl->mlbm.level[i].merge_cnt = 0;
+ tbl->mlbm.level[i].max_alloc_cnt = 0;
+
+ if (p) {
+ tbl->mlbm.level[i].blk_stats = p;
+ p += tbl->mlbm.level[i].numBits;
+ } else {
+ tbl->mlbm.level[i].blk_stats = 0x0;
+ }
+ }
+
+ tbl->use_cnt = 0;
+ tbl->max_alloc_cnt = 0;
+
+#if 0
+ /* make block stats match current bitmap */
+ for (i=0; i<=tbl->mlbm.maxLevel; ++i) {
+ p = tbl->mlbm.level[i].blk_stats;
+ if (p) {
+ int j;
+ for (j=0; j<tbl->mlbm.level[i].numBits; j++) {
+ int tcenum = j<<i;
+ if (blk_is_alloced(tbl,tcenum,i)) {
+ p[j].use_cnt ++;
+ p[j].alloc_jiffies = jiffies;
+ }
+ }
+ }
+ }
+#endif
+
+}
+
+static void
+teardown_detail_tce_stats(struct TceTable * tbl)
+{
+ struct tce_blk_stats *p;
+ p = tbl->mlbm.level[0].blk_stats;
+ if (!p) return;
+ int i;
+ for (i=0; i<=tbl->mlbm.maxLevel; ++i) {
+ tbl->mlbm.level[i].blk_stats = NULL;
+ }
+ free_pages ((unsigned long)p, get_tce_stats_order(tbl));
+}
+
+/* ================================================================= */
+#define SZ ((0<(count-n))?(count-n):0)
+
+static ssize_t
+proc_tce_detail_read (struct file * file, char * user_buf,
+ size_t count, loff_t *ppos)
+{
+ int n = 0;
+
+ /* Find the tce table */
+ struct inode * inode = file->f_dentry->d_inode;
+ struct proc_dir_entry * dp;
+ dp = (struct proc_dir_entry *) inode->u.generic_ip;
+ struct TceTable *tbl = dp->data;
+
+ char * buf = (char*) __get_free_page(GFP_KERNEL);
+ if (!buf) return -ENOMEM;
+
+ /* start of virtual pci_for_each_dev(pdev_iter) */
+ static int loop_iter;
+ if (*ppos == 0) {
+ loop_iter = 0;
+
+ /* print header, summary stats */
+ n += snprintf (buf+n, SZ, "total_use_cnt=%d", tbl->use_cnt);
+ n += snprintf (buf+n, SZ, " alloc_cnt=%d", tbl->alloc_cnt);
+ n += snprintf (buf+n, SZ, " max_alloc_cnt=%d\n", tbl->max_alloc_cnt);
+ n += snprintf (buf+n, SZ,
+ "\tLevel\tuse_cnt\tsplit\tmerge\talloc\tmaxaloc\tactual\tstale\tentries\n");
+
+ int i;
+ for (i=0; i<= tbl->mlbm.maxLevel; i++) {
+ struct Bitmap *lvl = &tbl->mlbm.level[i];
+
+ struct tce_blk_stats * blk_stats;
+ blk_stats = lvl->blk_stats;
+
+ int alloced_blocks=0, stale_blocks=0;
+ if (blk_stats) {
+
+ /* alloc_jiffies will be set if the block is
+ * allocated and not freed. Stale blocks suggest
+ * a leak or a really slow i/o system */
+ int j;
+ for (j=0; j<lvl->numBits; j++) {
+ unsigned long alloc_jiffies = blk_stats[j].alloc_jiffies;
+ if (alloc_jiffies && alloc_jiffies != ((unsigned long) -1)) {
+ alloced_blocks++;
+ /* 'stale' if alloc happened more than 3 seconds ago */
+ if (jiffies - alloc_jiffies > 3*HZ) {
+ stale_blocks ++;
+ }
+ }
+ }
+ } else {
+ n += snprintf (buf+n, SZ, "\t*** No Block Stats Available ***\n");
+ }
+ n += snprintf (buf+n, SZ,
+ "\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%ld\n", i,
+ lvl->use_cnt,
+ lvl->split_cnt,
+ lvl->merge_cnt,
+ lvl->alloc_cnt,
+ lvl->max_alloc_cnt,
+ alloced_blocks,
+ stale_blocks,
+ lvl->numBits);
+ }
+ n += snprintf (buf+n, SZ, "\n");
+
+ /* we are done printing header */
+ cond_resched();
+ if (n > count) n = count;
+ copy_to_user (user_buf, buf, n);
+ free_page((unsigned long) buf);
+
+ *ppos += n;
+ return n;
+ }
+
+ /* end of iteration over levels */
+ if (loop_iter > tbl->mlbm.maxLevel) {
+ free_page((unsigned long) buf);
+ return 0;
+ }
+
+ struct Bitmap *lvl = &tbl->mlbm.level[loop_iter];
+
+ /* Dump bits for each level */
+ n += snprintf (buf+n, SZ, "\nlevel[%d] num_entries=%ld\n",
+ loop_iter, lvl->numBits);
+
+ struct tce_blk_stats * blk_stats;
+ blk_stats = lvl->blk_stats;
+
+ if (blk_stats) {
+ int i;
+ for (i=0; i<lvl->numBytes; i++) {
+ if (i && 0 == i%4) n += snprintf (buf+n, SZ, " ");
+ if (i && 0 == i%32) n += snprintf (buf+n, SZ, "\n");
+ n += snprintf (buf+n, SZ, "%02x", lvl->map[i]);
+ if (count-n < 10) break;
+ }
+ }
+ if (count-n < 10) { n += snprintf (buf+n, SZ, "..."); }
+ n += snprintf (buf+n, SZ, "\n");
+
+ /* iterate loop one more time. */
+ loop_iter ++;
+
+ cond_resched();
+ if (n > count) n = count;
+ copy_to_user (user_buf, buf, n);
+ free_page((unsigned long) buf);
+
+ *ppos += n;
+ return n;
+}
+
+static ssize_t
+proc_tce_detail_write(struct file * file, const char * buf,
+ size_t count, loff_t *ppos)
+{
+ return count;
+}
+
+static int
+proc_tce_detail_unlink (struct inode *inode, struct dentry *dent)
+{
+ struct proc_dir_entry * dp;
+ dp = (struct proc_dir_entry *) inode->u.generic_ip;
+ struct TceTable *tbl = dp->data;
+
+ printk ("attempt cleanup of tce stats upon file deletion tbl=%p\n", tbl);
+
+ teardown_detail_tce_stats(tbl);
+ remove_proc_entry(dp->name, dp->parent);
+ return 0;
+}
+
+/* ================================================================= */
+
+struct file_operations tce_detail_stats_operations = {
+ .read = proc_tce_detail_read,
+ .write = proc_tce_detail_write
+};
+
+struct inode_operations tce_detail_inode_ops = {
+ .unlink = proc_tce_detail_unlink,
+};
+
+/* ================================================================= */
+
+static ssize_t
+proc_tce_stats_read (struct file * file, char * user_buf,
+ size_t count, loff_t *ppos)
+{
+ int n = 0;
+
+ static struct pci_dev *pdev_iter;
+
+ /* start of virtual pci_for_each_dev(pdev_iter) */
+ if (*ppos == 0) {
+ pdev_iter = pci_dev_g(pci_devices.next);
+ }
+
+ /* while not done virtual pci_for_each_dev(pdev_iter) */
+ if (pdev_iter == pci_dev_g(&pci_devices)) {
+ return 0;
+ }
+
+ char * buf = (char*) __get_free_page(GFP_KERNEL);
+ if (!buf) return -ENOMEM;
+
+ /* Attempt to print just one device per call, so as to not
+ * overflow the user's buffer. If the user gives us too small
+ * a buffer, we'll send garbled data, but who cares. */
+ while (pdev_iter != pci_dev_g(&pci_devices)) {
+ if (PCI_SLOT(pdev_iter->devfn) == 0) goto try_again;
+ if (pdev_iter->sysdata == NULL) goto try_again;
+
+ n += format_device_location (pdev_iter, buf+n, SZ);
+ n += snprintf (buf+n, SZ, "\n");
+ struct device_node *dn = (struct device_node *)pdev_iter->sysdata;
+ if (!dn) goto try_again;
+ struct TceTable *tbl = dn->tce_table;
+ if (!tbl) goto try_again;
+ n += snprintf (buf+n, SZ, "\ttotal_use_cnt=%d", tbl->use_cnt);
+ n += snprintf (buf+n, SZ, " alloc_cnt=%d", tbl->alloc_cnt);
+ n += snprintf (buf+n, SZ, " max_alloc_cnt=%d\n", tbl->max_alloc_cnt);
+
+ n += snprintf (buf+n, SZ,
+ "\tLevel\tuse_cnt\tsplit\tmerge\talloc\tmax_allo\n");
+ int i;
+ for (i=0; i<= tbl->mlbm.maxLevel; i++) {
+ n += snprintf (buf+n, SZ,
+ "\t%d\t%d\t%d\t%d\t%d\t%d\n",
+ i, tbl->mlbm.level[i].use_cnt,
+ tbl->mlbm.level[i].split_cnt,
+ tbl->mlbm.level[i].merge_cnt,
+ tbl->mlbm.level[i].alloc_cnt,
+ tbl->mlbm.level[i].max_alloc_cnt);
+ }
+ break;
+try_again:
+ pdev_iter = pci_dev_g(pdev_iter->global_list.next);
+ }
+ n += snprintf (buf+n, SZ, "\n");
+
+ /* iterate once for next time */
+ pdev_iter = pci_dev_g(pdev_iter->global_list.next);
+
+ cond_resched();
+ if (n > count) n = count;
+ copy_to_user (user_buf, buf, n);
+ free_page((unsigned long) buf);
+
+ *ppos += n;
+ return n;
+}
+
+static ssize_t
+proc_tce_stats_write(struct file * file, const char * buf,
+ size_t count, loff_t *ppos)
+{
+ if (!buf || count == 0) return 0;
+
+ /* the 'reset' keyword zeroes out the stats for all pci devices */
+ if (0 == strncmp (buf, "reset", 5)) {
+ struct pci_dev *pdev;
+
+ pci_for_each_dev(pdev) {
+ if (PCI_SLOT(pdev->devfn) == 0) continue;
+ if (pdev->sysdata == NULL) continue;
+
+ struct device_node *dn = (struct device_node *)pdev->sysdata;
+ if (!dn) continue;
+ struct TceTable *tbl = dn->tce_table;
+ if (!tbl) continue;
+
+ int i;
+ for (i=0; i<= tbl->mlbm.maxLevel; i++) {
+ tbl->mlbm.level[i].use_cnt = 0;
+ tbl->mlbm.level[i].split_cnt = 0;
+ tbl->mlbm.level[i].merge_cnt = 0;
+ tbl->mlbm.level[i].max_alloc_cnt = 0;
+ }
+ tbl->use_cnt = 0;
+ tbl->max_alloc_cnt = 0;
+ teardown_detail_tce_stats (tbl);
+ }
+ *ppos += count;
+ return count;
+ }
+
+ /* The 'show' keyword attempts to enable collection of detailed stats
+ * for the indicated bus:deviceid */
+ if (0 == strncmp (buf, "show", 4)) {
+ char * p = strchr (buf, ':');
+ if (!p) return count;
+ unsigned long busno = simple_strtoul (buf+5, &p , 16);
+ if (!p) return count;
+ unsigned long devno = simple_strtoul (p+1, NULL , 16);
+ // printk ("parsed out bus=0x%lx dev=0x%lx\n", busno, devno);
+
+ /* try to find the matching pci_dev */
+ struct pci_dev *pdev;
+ struct device_node *dn;
+ struct TceTable *tbl;
+
+ pci_for_each_dev(pdev) {
+ if (devno != PCI_SLOT(pdev->devfn)) continue;
+ if (busno != pdev->bus->number) continue;
+ if (pdev->sysdata == NULL) continue;
+ dn = (struct device_node *)pdev->sysdata;
+ if (!dn) continue;
+ tbl = dn->tce_table;
+ if (!tbl) continue;
+ break;
+ }
+ if (pdev == pci_dev_g(&pci_devices)) {
+ printk (KERN_INFO "tce_stats: uanble to find device %lx:%lx\n", busno, devno);
+ return count;
+ }
+ setup_detail_tce_stats(tbl);
+
+ /* Create the corresponding entry in the proc table */
+ char fname[100];
+ snprintf (fname, 100, "detail-%02lx:%02lx",busno, devno);
+ struct proc_dir_entry *ent;
+ ent = create_proc_entry (fname, S_IWUSR|S_IRUGO, proc_ppc64_tce_root);
+ if (!ent) {
+ teardown_detail_tce_stats(tbl);
+ return count;
+ }
+
+ ent->proc_fops = &tce_detail_stats_operations;
+ ent->proc_iops = &tce_detail_inode_ops;
+ // ent->read_proc = proc_tce_page_read;
+ ent->data = tbl;
+
+ return count;
+ }
+
+ *ppos += count;
+ return count;
+}
+
+/* ================================================================= */
+
+struct file_operations tce_stats_operations = {
+ .read = proc_tce_stats_read,
+ .write = proc_tce_stats_write
+};
+
+/* ================================================================= */
+/* Create entry /proc/ppc64/tce */
+
+void proc_tce_init(struct proc_dir_entry *proc_ppc64_root)
+{
+ struct proc_dir_entry *ent = NULL;
+
+ if (!proc_ppc64_root) return;
+
+ printk(KERN_INFO "proc_tce: creating /proc/ppc64/tce\n");
+ ent = proc_mkdir("tce", proc_ppc64_root);
+ if (!ent) {
+ printk (KERN_ERR "Failed to create /proc/ppc64/tce\n");
+ return;
+ }
+ proc_ppc64_tce_root = ent;
+
+ /* create the 'listener' */
+ ent = create_proc_entry ("stats", S_IWUSR|S_IRUGO, proc_ppc64_tce_root);
+ if (!ent) return;
+
+ ent->proc_fops = &tce_stats_operations;
+
+}
+
+#endif /* CONFIG_TCE_STATS */
+/* ============================= END OF FILE ================================ */