[PATCH] [2.4] show TCE statistics in /proc/ppc64

linas at austin.ibm.com linas at austin.ibm.com
Wed Nov 26 10:09:21 EST 2003


Actually attaching the actual patch would be appropriate ...

On Tue, Nov 25, 2003 at 04:22:01PM -0600, linas at austin.ibm.com wrote:
>
> The attached patch should apply cleanly against the current
> ameslab-2.4 tree.  It provides TCE usage statistics in /proc
> that might be useful to anyone interested in DMA performance
> or is debugging DMA usage in device drivers.
>
> I don't have BK access so I can't bk push anything at this time:
> Paul, if this patch looks good, can you apply & etc. as needed?
>
> The TCE usage stats are in the /proc/ppc64/tce directory.
> Get stats for all PCI devices by 'cat /proc/ppc64/tce/stats'.
> One can also get more detailed per-device stats by performing an
> 'echo "show bb:dd" > /proc/ppc64/tce/stats'
> where bb is the hex bus number, dd is the hex device number
> (cat /proc/ppc64/pci to figure these out).  One can then
> 'cat /proc/ppc64/tce/detail-bb:dd' to see the detailed stats.
>
> Here's a sample of the 'detail' output:
>
> device 21:01
> Fri Nov 21 14:04:07 PST 2003
> total_use_cnt=92042 alloc_cnt=1267 max_alloc_cnt=1267
>         Level   use_cnt split   merge   alloc   maxaloc actual  stale   entries
>         0       29552   0       0       59      114     59      1       32768
>         1       1988    60      7       662     675     3       0       16384
>         2       3882    35      5       11      25      11      0       8192
>         3       7147    31      5       30      44      30      0       4096
>         4       12405   44      12      45      69      45      0       2048
>         5       28815   71      26      285     285     285     0       1024
>         6       8253    336     171     175     175     175     0       512
>         7       0       633     463     0       1       0       0       256
>         8       0       314     229     0       1       0       0       128
>         9       0       153     110     0       1       0       0       64
>
> [dump of TCE bitmaps cut out]
>
> --linas
>
>
-------------- next part --------------
Index: arch/ppc64/config.in
===================================================================
RCS file: /home/linas/cvsroot/linux24/arch/ppc64/config.in,v
retrieving revision 1.1.1.4
diff -u -p -u -p -r1.1.1.4 config.in
--- arch/ppc64/config.in	25 Nov 2003 20:04:34 -0000	1.1.1.4
+++ arch/ppc64/config.in	25 Nov 2003 20:43:04 -0000
@@ -50,6 +50,11 @@ bool 'Shared kernel/user space addressin

 tristate 'LPAR Configuration Data' CONFIG_LPARCFG

+if [ "$CONFIG_PPC_ISERIES" != "y" ]; then
+  if [ "$CONFIG_PROC_FS" = "y" ]; then
+    bool 'Show realtime tce usage stats in /proc/ppc64/tce' CONFIG_TCE_STATS
+  fi
+fi
 endmenu

 mainmenu_option next_comment
Index: arch/ppc64/kdb/kdbasupport.c
===================================================================
RCS file: /home/linas/cvsroot/linux24/arch/ppc64/kdb/Attic/kdbasupport.c,v
retrieving revision 1.1.1.1
diff -u -p -u -p -r1.1.1.1 kdbasupport.c
--- arch/ppc64/kdb/kdbasupport.c	25 Nov 2003 20:04:34 -0000	1.1.1.1
+++ arch/ppc64/kdb/kdbasupport.c	25 Nov 2003 20:32:44 -0000
@@ -1617,11 +1617,16 @@ kdba_dump_tce_table(int argc, const char
     long tce_table_address;
     int nr;
     int i,j,k;
-    int full,empty;
+    int full,partial,empty;
     int fulldump=0;
     u64 mapentry;
-    int totalpages;
+    int freepages;
     int levelpages;
+#ifdef CONFIG_TCE_STATS
+    struct tce_blk_stats *blk_stats;
+    int alloced_blocks, stale_blocks;
+    unsigned long alloc_jiffies;
+#endif /* CONFIG_TCE_STATS */

     if (argc == 0) {
 	kdb_printf("need address\n");
@@ -1634,17 +1639,21 @@ kdba_dump_tce_table(int argc, const char
 	if (strcmp(argv[2], "full") == 0)
 	    fulldump=1;

-    /* with address, read contents of memory and dump tce table. */
-    /* possibly making some assumptions on the depth and size of table..*/
+    /* use address to read contents of memory and dump tce table. */

-    nr = kdba_readarea_size(tce_table_address+0 ,&kt.busNumber,8);
-    nr = kdba_readarea_size(tce_table_address+8 ,&kt.size,8);
-    nr = kdba_readarea_size(tce_table_address+16,&kt.startOffset,8);
-    nr = kdba_readarea_size(tce_table_address+24,&kt.base,8);
-    nr = kdba_readarea_size(tce_table_address+32,&kt.index,8);
-    nr = kdba_readarea_size(tce_table_address+40,&kt.tceType,8);
+#define GET_TCE_VAL(X) \
+    nr = kdba_readarea_size( \
+            ((long) &(((struct TceTable *)tce_table_address)->X)), \
+            &(kt.X), sizeof(kt.X));
+
+    GET_TCE_VAL (busNumber);
+    GET_TCE_VAL (size);
+    GET_TCE_VAL (startOffset);
+    GET_TCE_VAL (base);
+    GET_TCE_VAL (index);
+    GET_TCE_VAL (tceType);
 #ifdef CONFIG_SMP
-    nr = kdba_readarea_size(tce_table_address+48,&kt.lock,8);
+    GET_TCE_VAL (lock);
 #endif

     kdb_printf("\n");
@@ -1658,43 +1667,94 @@ kdba_dump_tce_table(int argc, const char
 #ifdef CONFIG_SMP
     kdb_printf("lock:        0x%x \n",(uint)kt.lock.lock);
 #endif
-    nr = kdba_readarea_size(tce_table_address+56,&kt.mlbm.maxLevel,8);
-    kdb_printf(" maxLevel:        0x%x \n",(uint)kt.mlbm.maxLevel);
-    totalpages=0;
+    GET_TCE_VAL (mlbm.maxLevel);
+    kdb_printf(" maxLevel:   0x%x \n",(uint)kt.mlbm.maxLevel);
+#ifdef CONFIG_TCE_STATS
+    GET_TCE_VAL (use_cnt);
+    GET_TCE_VAL (alloc_cnt);
+    kdb_printf(" use_cnt:    %d \n",(uint)kt.use_cnt);
+    kdb_printf(" alloc_cnt:    %d \n",(uint)kt.alloc_cnt);
+#endif
+    freepages=0;
     for (i=0;i<NUM_TCE_LEVELS;i++) {
-	nr = kdba_readarea_size(tce_table_address+64+i*24,&kt.mlbm.level[i].numBits,8);
-	nr = kdba_readarea_size(tce_table_address+72+i*24,&kt.mlbm.level[i].numBytes,8);
-	nr = kdba_readarea_size(tce_table_address+80+i*24,&kt.mlbm.level[i].map,8);
+    	GET_TCE_VAL (mlbm.level[i].numBits);
+    	GET_TCE_VAL (mlbm.level[i].numBytes);
+    	GET_TCE_VAL (mlbm.level[i].map);
 	kdb_printf("   level[%d]\n",i);
 	kdb_printf("   numBits:   0x%x\n",(uint)kt.mlbm.level[i].numBits);
 	kdb_printf("   numBytes:  0x%x\n",(uint)kt.mlbm.level[i].numBytes);
 	kdb_printf("   map*:      %p\n",kt.mlbm.level[i].map);
+#ifdef CONFIG_TCE_STATS
+	GET_TCE_VAL (mlbm.level[i].blk_stats);
+	blk_stats = kt.mlbm.level[i].blk_stats;
+	if (blk_stats) {
+	    alloced_blocks = 0;
+	    stale_blocks = 0;
+            /* alloc_jiffies will be set if the block is allocated and not freed.
+             * stale blocks suggest a leak or a really slow i/o system */
+	    for (j=0; j<kt.mlbm.level[i].numBits; j++) {
+                kdba_readarea_size(  (long) (&(blk_stats[j].alloc_jiffies)), &alloc_jiffies, 8);
+                if (alloc_jiffies && alloc_jiffies != ((unsigned long) -1)) {
+                    alloced_blocks++;
+                    /* 'stale' if alloc happened more than 3 seconds ago */
+            	    if (jiffies - alloc_jiffies > 3*HZ) {
+			    stale_blocks ++;
+		    }
+                }
+	    }
+            kdb_printf("   blk_stats: %p\n", blk_stats);
+        } else {
+	    alloced_blocks = -1;
+	    stale_blocks = -1;
+        }
+        GET_TCE_VAL (mlbm.level[i].use_cnt);
+        GET_TCE_VAL (mlbm.level[i].split_cnt);
+        GET_TCE_VAL (mlbm.level[i].merge_cnt);
+        GET_TCE_VAL (mlbm.level[i].alloc_cnt);
+        kdb_printf("   use_cnt: %d  split: %d merge: %d alloced: %d cnt-alloc: %d stale: %d\n",
+                 kt.mlbm.level[i].use_cnt,
+                 kt.mlbm.level[i].split_cnt,
+                 kt.mlbm.level[i].merge_cnt,
+                 kt.mlbm.level[i].alloc_cnt,
+                 alloced_blocks,
+                 stale_blocks);
+#endif

 	 /* if these dont match, this might not be a valid tce table, so
 	    dont try to iterate the map entries. */
 	if (kt.mlbm.level[i].numBits == 8*kt.mlbm.level[i].numBytes) {
-	    full=0;empty=0;levelpages=0;
+            int n=0;
+	    full=0;partial=0;empty=0;levelpages=0;
 	    for (j=0;j<kt.mlbm.level[i].numBytes; j++) {
 		mapentry=0;
 		nr = kdba_readarea_size((long int)(kt.mlbm.level[i].map+j),&mapentry,1);
-		if (mapentry)
+                mapentry >>= 56;
+		if (mapentry == 0xff)
 		    full++;
+		else if (mapentry)
+		    partial++;
 		else
 		    empty++;
 		if (mapentry && fulldump) {
-		    kdb_printf("0x%lx\n",mapentry);
+                    if (n && (n%32 == 0)) kdb_printf ("\n");
+		    kdb_printf("%02lx ",(int) mapentry);
+                    n++;
 		}
-		for (k=0;(k<=64) && ((0x1UL<<k) <= mapentry);k++) {
+		for (k=0;(k<8) && ((0x1UL<<k) <= mapentry);k++) {
 		    if ((0x1UL<<k) & mapentry) levelpages++;
 		}
 	    }
-	    kdb_printf("      full:0x%x empty:0x%x pages:0x%x\n",full,empty,levelpages);
+	    if (fulldump) kdb_printf ("\n");
+	    kdb_printf("      full:0x%x partial:0x%x empty:0x%x free:0x%x\n",
+                              full,partial,empty,levelpages);
 	} else {
 	    kdb_printf("      numBits/numBytes mismatch..? \n");
 	}
-	totalpages+=levelpages;
+	freepages += (1UL<<i) * levelpages;
     }
-    kdb_printf("      Total pages:0x%x\n",totalpages);
+    kdb_printf("      Total pages: 0x%lx free:0x%x used:0x%lx\n",
+                  kt.mlbm.level[0].numBits, freepages,
+                  kt.mlbm.level[0].numBits - freepages);
     kdb_printf("\n");
     return 0;
 }
Index: arch/ppc64/kernel/Makefile
===================================================================
RCS file: /home/linas/cvsroot/linux24/arch/ppc64/kernel/Makefile,v
retrieving revision 1.1.1.3
diff -u -p -u -p -r1.1.1.3 Makefile
--- arch/ppc64/kernel/Makefile	25 Nov 2003 20:04:35 -0000	1.1.1.3
+++ arch/ppc64/kernel/Makefile	25 Nov 2003 20:32:44 -0000
@@ -29,7 +29,7 @@ obj-y               :=	ppc_ksyms.o setup
 			iSeries_proc.o HvCall.o flight_recorder.o HvLpConfig.o \
 			rtc.o perfmon.o cputable.o vio.o

-obj-$(CONFIG_PCI) +=  pci.o pci_dn.o pci_dma.o pSeries_lpar.o pSeries_hvCall.o
+obj-$(CONFIG_PCI) +=  pci.o pci_dn.o pci_dma.o proc_tce.o pSeries_lpar.o pSeries_hvCall.o

 ifeq ($(CONFIG_PPC_ISERIES),y)
 obj-$(CONFIG_PCI) += iSeries_pci.o iSeries_pci_reset.o iSeries_IoMmTable.o iSeries_irq.o iSeries_VpdInfo.o XmPciLpEvent.o
Index: arch/ppc64/kernel/pci_dma.c
===================================================================
RCS file: /home/linas/cvsroot/linux24/arch/ppc64/kernel/pci_dma.c,v
retrieving revision 1.1.1.3
diff -u -p -u -p -r1.1.1.3 pci_dma.c
--- arch/ppc64/kernel/pci_dma.c	25 Nov 2003 20:04:37 -0000	1.1.1.3
+++ arch/ppc64/kernel/pci_dma.c	25 Nov 2003 20:32:44 -0000
@@ -44,7 +44,7 @@
 #include "pci.h"

 /* #define DEBUG_TCE 1   */
-/* #define MONITOR_TCE 1 */ /* Turn on to sanity check TCE generation. */
+#define MONITOR_TCE 1  /* Turn on to sanity check TCE generation. */


 /* Initialize so this guy does not end up in the BSS section.
@@ -212,6 +212,30 @@ static void tce_build_pSeries(struct Tce

 }

+#ifdef CONFIG_TCE_STATS
+/*
+ * Initialize tce-table statistics.  Handy typically only for device driver
+ * debugging, perf tuning, etc.
+ */
+static void init_tce_stats(struct TceTable * tbl)
+{
+	int i;
+
+	for (i=0; i<NUM_TCE_LEVELS; ++i) {
+		tbl->mlbm.level[i].use_cnt = 0;
+		tbl->mlbm.level[i].split_cnt = 0;
+		tbl->mlbm.level[i].merge_cnt = 0;
+		tbl->mlbm.level[i].alloc_cnt = 0;
+		tbl->mlbm.level[i].max_alloc_cnt = 0;
+		tbl->mlbm.level[i].blk_stats = 0x0;
+	}
+
+	tbl->use_cnt = 0;
+	tbl->alloc_cnt = 0;
+	tbl->max_alloc_cnt = 0;
+}
+
+#endif /* CONFIG_TCE_STATS */
 /*
  * Build a TceTable structure.  This contains a multi-level bit map which
  * is used to manage allocation of the tce space.
@@ -276,7 +300,6 @@ struct TceTable *build_tce_table(struct
 	}

 	/* For the highest level, turn on all the bits */
-
 	i = tbl->mlbm.maxLevel;
 	p = tbl->mlbm.level[i].map;
 	m = numBits[i];
@@ -301,6 +324,10 @@ struct TceTable *build_tce_table(struct
 		}
 	}

+#ifdef CONFIG_TCE_STATS
+	init_tce_stats (tbl);
+#endif /* CONFIG_TCE_STATS */
+
 	return tbl;
 }

@@ -364,6 +391,37 @@ static long alloc_tce_range_nolock( stru
 			 */
 			PPCDBG(PPCDBG_TCE, "alloc_tce_range_nolock: allocating block %ld, (byte=%ld, bit=%ld) order %d\n", block, i, bit, order );
 			tcenum = block << order;
+#ifdef CONFIG_TCE_STATS
+			if (tbl->mlbm.level[order].blk_stats) {
+				tbl->mlbm.level[order].blk_stats[block].use_cnt ++;
+				tbl->mlbm.level[order].blk_stats[block].alloc_jiffies = jiffies;
+			}
+			tbl->mlbm.level[order].use_cnt ++;
+			tbl->use_cnt ++;
+			tbl->mlbm.level[order].alloc_cnt ++;
+			tbl->alloc_cnt ++;
+			if (tbl->mlbm.level[order].max_alloc_cnt <
+			    tbl->mlbm.level[order].alloc_cnt) {
+				tbl->mlbm.level[order].max_alloc_cnt =
+				    tbl->mlbm.level[order].alloc_cnt;
+			}
+			if (tbl->max_alloc_cnt < tbl->alloc_cnt) {
+				tbl->max_alloc_cnt = tbl->alloc_cnt;
+			}
+
+#define THRESHOLD 1000
+			static int watermark = THRESHOLD;
+			if (tbl->alloc_cnt > watermark) {
+				printk ("alloc_tce_range: more than %d ranges alloced (%d)\n",
+					watermark, tbl->alloc_cnt);
+				watermark += THRESHOLD;
+			}
+			if (((int)tbl->alloc_cnt) < ((int)(watermark - 2*THRESHOLD))) {
+				watermark -= THRESHOLD;
+				printk ("alloc_tce_range: alloc usage dropped below %d (%d)\n",
+					watermark-THRESHOLD, tbl->alloc_cnt);
+			}
+#endif /* CONFIG_TCE_STATS */
 			return tcenum;
 		}
 		++map;
@@ -388,6 +446,25 @@ static long alloc_tce_range_nolock( stru
 	if((tcenum == -1) && (order < (NUM_TCE_LEVELS - 1))) {
 		tcenum = alloc_tce_range_nolock( tbl, order+1 );
 		if ( tcenum != -1 ) {
+#ifdef CONFIG_TCE_STATS
+			/* fix up stats for 'what we actually used' */
+			if (tbl->mlbm.level[order].blk_stats) {
+				tbl->mlbm.level[order].blk_stats[(tcenum>>order)].alloc_jiffies = jiffies;
+				tbl->mlbm.level[order].blk_stats[(tcenum>>order)].use_cnt ++;
+				tbl->mlbm.level[order].blk_stats[(tcenum>>order)+1].alloc_jiffies = jiffies;
+			}
+			if (tbl->mlbm.level[order+1].blk_stats) {
+				tbl->mlbm.level[order+1].blk_stats[(tcenum>>(order+1))].alloc_jiffies = (unsigned long) -1;
+				tbl->mlbm.level[order+1].blk_stats[(tcenum>>(order+1))].use_cnt --;
+			}
+			tbl->mlbm.level[order].use_cnt ++;
+			tbl->mlbm.level[order+1].use_cnt --;
+			tbl->mlbm.level[order+1].split_cnt ++;
+
+			tbl->mlbm.level[order+1].alloc_cnt --; /* uncount higher order */
+			tbl->mlbm.level[order].alloc_cnt +=2;  /* count twice, since next free will uncount */
+			tbl->alloc_cnt ++;   /* count 'twice' since free will uncount */
+#endif /* CONFIG_TCE_STATS */
 			free_tce_range_nolock( tbl, tcenum+(1<<order), order );
 		}
 	}
@@ -450,6 +527,17 @@ void free_tce_range_nolock(struct TceTab
 	mask  = 0x80 >> bit;
 	bytep = map + byte;

+#ifdef CONFIG_TCE_STATS
+	tbl->alloc_cnt --;
+	tbl->mlbm.level[order].alloc_cnt --;
+	if (tbl->mlbm.level[order].blk_stats) {
+		if (0 == tbl->mlbm.level[order].blk_stats[block].alloc_jiffies) {
+			printk("PCI_DMA: Freeing tce that wasn't alloced: busno 0x%lx tcenum %lx, order %x\n", tbl->busNumber, tcenum,order);
+		}
+		tbl->mlbm.level[order].blk_stats[block].alloc_jiffies = 0;
+	}
+#endif /* CONFIG_TCE_STATS */
+
 #ifdef DEBUG_TCE
 	PPCDBG(PPCDBG_TCE,"free_tce_range_nolock: freeing block %ld (byte=%d, bit=%d) of order %d\n",
 	       block, byte, bit, order);
@@ -487,6 +575,11 @@ void free_tce_range_nolock(struct TceTab
 			PPCDBG(PPCDBG_TCE,
 			       "free_tce_range: buddying blocks %ld & %ld\n",
 			       block, block+1);
+#ifdef CONFIG_TCE_STATS
+			tbl->mlbm.level[order+1].merge_cnt ++;
+			tbl->alloc_cnt ++;  /* undo excess counting */
+			tbl->mlbm.level[order+1].alloc_cnt ++;  /* undo excess counts */
+#endif /* CONFIG_TCE_STATS */
 			free_tce_range_nolock( tbl, tcenum, order+1 );
 		}
 	}
@@ -757,8 +850,8 @@ void create_tce_tables(void) {
 void create_pci_bus_tce_table( unsigned long token ) {
 	struct TceTable * newTceTable;

-	PPCDBG(PPCDBG_TCE, "Entering create_pci_bus_tce_table.\n");
-	PPCDBG(PPCDBG_TCE, "\ttoken = 0x%lx\n", token);
+	PPCDBG(PPCDBG_TCEINIT, "Entering create_pci_bus_tce_table.\n");
+	PPCDBG(PPCDBG_TCEINIT, "\ttoken = 0x%lx\n", token);

 	newTceTable = (struct TceTable *)kmalloc( sizeof(struct TceTable), GFP_KERNEL );

@@ -1084,7 +1177,7 @@ dma_addr_t pci_map_single(struct pci_dev
 	unsigned order, nPages;

 	PPCDBG(PPCDBG_TCE, "pci_map_single:\n");
-	PPCDBG(PPCDBG_TCE, "\thwdev = 0x%16.16lx, size = 0x%16.16lx, direction = 0x%16.16lx, vaddr = 0x%16.16lx\n", hwdev, size, direction, vaddr);
+	PPCDBG(PPCDBG_TCE, "\thwdev = 0x%16.16lx, size = 0x%lx, direction = %ld, vaddr = 0x%16.16lx\n", hwdev, size, direction, vaddr);
 	if (direction == PCI_DMA_NONE)
 		BUG();

@@ -1297,7 +1390,7 @@ static dma_addr_t create_tces_sg(struct
  	/* Client asked for way to much space.  This is checked later anyway */
 	/* It is easier to debug here for the drivers than in the tce tables.*/
  	if(order >= NUM_TCE_LEVELS) {
-		printk("PCI_DMA: create_tces_sg size too large: 0x%llx \n",(numTces << PAGE_SHIFT));
+		printk("PCI_DMA: create_tces_sg size too large: 0x%x \n",(numTces << PAGE_SHIFT));
 		panic("numTces is off");
  		return NO_TCE;
  	}
@@ -1403,7 +1496,7 @@ void pci_unmap_sg( struct pci_dev *hwdev
 	dma_addr_t dma_end_page, dma_start_page;

 	PPCDBG(PPCDBG_TCE, "pci_unmap_sg:\n");
-	PPCDBG(PPCDBG_TCE, "\thwdev = 0x%16.16lx, sg = 0x%16.16lx, direction = 0x%16.16lx, nelms = 0x%16.16lx\n", hwdev, sg, direction, nelms);
+	PPCDBG(PPCDBG_TCE, "\thwdev = 0x%16.16lx, sg = 0x%16.16lx, direction = %ld, nelms = %ld\n", hwdev, sg, direction, nelms);

 	if ( direction == PCI_DMA_NONE || nelms == 0 )
 		BUG();
@@ -1425,7 +1518,7 @@ void pci_unmap_sg( struct pci_dev *hwdev
  	/* Client asked for way to much space.  This is checked later anyway */
 	/* It is easier to debug here for the drivers than in the tce tables.*/
  	if(order >= NUM_TCE_LEVELS) {
-		printk("PCI_DMA: dma_start_page:0x%lx  dma_end_page:0x%lx\n",dma_start_page,dma_end_page);
+		printk("PCI_DMA: dma_start_page:0x%x  dma_end_page:0x%x\n",dma_start_page,dma_end_page);
 		printk("PCI_DMA: pci_unmap_sg size too large: 0x%x \n",(numTces << PAGE_SHIFT));
  		return;
  	}
Index: arch/ppc64/kernel/proc_pmc.c
===================================================================
RCS file: /home/linas/cvsroot/linux24/arch/ppc64/kernel/proc_pmc.c,v
retrieving revision 1.1.1.3
diff -u -p -u -p -r1.1.1.3 proc_pmc.c
--- arch/ppc64/kernel/proc_pmc.c	25 Nov 2003 20:04:36 -0000	1.1.1.3
+++ arch/ppc64/kernel/proc_pmc.c	25 Nov 2003 20:32:44 -0000
@@ -47,6 +47,11 @@
 /* pci Flight Recorder AHT */
 extern void proc_pciFr_init(struct proc_dir_entry *proc_ppc64_root);

+#ifdef CONFIG_TCE_STATS
+/* PCI TCE stats interface */
+extern void proc_tce_init(struct proc_dir_entry *proc_ppc64_root);
+#endif /* CONFIG_TCE_STATS */
+
 static int proc_pmc_control_mode = 0;

 struct proc_dir_entry *proc_ppc64_root = NULL;
@@ -188,6 +193,11 @@ void proc_ppc64_init(void)

 	/* Create the /proc/ppc64/pcifr for the Pci Flight Recorder.	 */
 	proc_pciFr_init(proc_ppc64_root);
+
+#ifdef CONFIG_TCE_STATS
+	/* Create the /proc/ppc64/tce entry for TCE stats/debugging */
+	proc_tce_init (proc_ppc64_root);
+#endif /* CONFIG_TCE_STATS */

 	proc_ppc64_pmc_root = proc_mkdir("pmc", proc_ppc64_root);

Index: include/asm-ppc64/pci_dma.h
===================================================================
RCS file: /home/linas/cvsroot/linux24/include/asm-ppc64/pci_dma.h,v
retrieving revision 1.1.1.1
diff -u -p -u -p -r1.1.1.1 pci_dma.h
--- include/asm-ppc64/pci_dma.h	15 Jul 2003 16:54:54 -0000	1.1.1.1
+++ include/asm-ppc64/pci_dma.h	25 Nov 2003 21:27:08 -0000
@@ -53,10 +53,32 @@ union Tce {
 	} tceBits;
 };

+#ifdef CONFIG_TCE_STATS
+struct tce_blk_stats {
+	unsigned long alloc_jiffies; /* time when last allocated, helps find leaks */
+	unsigned int use_cnt;        /* how many times this block has been alloced */
+	char direction;              /* last i/o direction */
+};
+#endif /* CONFIG_TCE_STATS */
+
 struct Bitmap {
 	unsigned long	numBits;
 	unsigned long	numBytes;
 	unsigned char * map;
+#ifdef CONFIG_TCE_STATS
+	unsigned int use_cnt;      /* num of blocks that were ever alloced */
+
+	/* The split/merge counts provide stats about the buddy system,
+	 * helping debug fragmentation problems. */
+	unsigned int split_cnt;    /* num blocks split to make smaller blocks */
+	unsigned int merge_cnt;    /* num blocks buddied back up by free */
+
+	unsigned int alloc_cnt;       /* num alloc's currently pending */
+	unsigned int max_alloc_cnt;   /* highest num alloc's ever */
+
+	/* Individual block stats should help debug alloc leaks. */
+	struct tce_blk_stats * blk_stats;
+#endif /* CONFIG_TCE_STATS */
 };

 struct MultiLevelBitmap {
@@ -73,6 +95,11 @@ struct TceTable {
 	u64	tceType;
 	spinlock_t lock;
 	struct MultiLevelBitmap mlbm;
+#ifdef CONFIG_TCE_STATS
+	unsigned int use_cnt;         /* num alloc's there were ever made */
+	unsigned int alloc_cnt;       /* num alloc's currently pending */
+	unsigned int max_alloc_cnt;   /* highest num alloc's ever */
+#endif /* CONFIG_TCE_STATS */
 };

 struct TceTableManagerCB {

--- arch/ppc64/kernel/proc_tce.c.orig	2003-11-21 18:34:35.000000000 -0600
+++ arch/ppc64/kernel/proc_tce.c	2003-11-24 18:11:10.000000000 -0600
@@ -0,0 +1,484 @@
+/*
+ * proc_tce.c
+ * Copyright (C) 2003 Linas Vepstas, IBM Corporation
+ *
+ * Dynamic DMA mapping statistics support.
+ *
+ * Manages the TCE space assigned to this partition.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/fs.h>
+#include <linux/pci.h>
+#include <linux/proc_fs.h>
+#include <asm/pci_dma.h>
+#include <asm/uaccess.h>
+
+#include "pci.h"
+
+#ifdef CONFIG_TCE_STATS
+
+static struct proc_dir_entry *proc_ppc64_tce_root = NULL;
+
+/* ================================================================= */
+/* Alloc the detail-stats array.  */
+
+static inline int
+get_tce_stats_bytes(struct TceTable * tbl)
+{
+	int num_entries, num_bytes;
+
+	num_entries = tbl->mlbm.level[0].numBits;
+	num_entries *= 2;  /* room for other levels as well */
+	num_bytes = num_entries * sizeof( struct tce_blk_stats );
+	return num_bytes;
+}
+
+static inline int
+get_tce_stats_order(struct TceTable * tbl)
+{
+	return get_order (get_tce_stats_bytes(tbl));
+}
+
+#define TRUE 1
+#define FALSE 0
+
+static int
+blk_is_alloced (struct TceTable * tbl, int tcenum, int order)
+{
+	unsigned byte, bit;
+	unsigned char mask, *bytep;
+
+	if (order < 0) return TRUE;
+	if (order > tbl->mlbm.maxLevel) return TRUE;
+	bit = tcenum >> order;
+	byte = bit /8;
+	bit = bit%8;
+	mask = 0x80>>bit;
+	bytep = tbl->mlbm.level[order].map + byte;
+	if (mask & *bytep) return FALSE;
+
+	/* check downwards */
+	if (FALSE == blk_is_alloced (tbl, tcenum, order-1)) return FALSE;
+	if (FALSE == blk_is_alloced (tbl, tcenum+1, order-1)) return FALSE;
+
+	return blk_is_alloced (tbl, tcenum, order+1);
+}
+
+static void
+setup_detail_tce_stats(struct TceTable * tbl)
+{
+	int i;
+	struct tce_blk_stats *p;
+
+	/* Alloc per-block stats array */
+	p = (struct tce_blk_stats *)
+		__get_free_pages( GFP_ATOMIC, get_tce_stats_order(tbl));
+
+	/* alloc may fail for large areas; keep driving */
+	if (p) memset( p, 0, get_tce_stats_bytes(tbl) );
+
+	for (i=0; i<=tbl->mlbm.maxLevel; ++i) {
+		tbl->mlbm.level[i].use_cnt = 0;
+		tbl->mlbm.level[i].split_cnt = 0;
+		tbl->mlbm.level[i].merge_cnt = 0;
+		tbl->mlbm.level[i].max_alloc_cnt = 0;
+
+		if (p) {
+			tbl->mlbm.level[i].blk_stats = p;
+			p += tbl->mlbm.level[i].numBits;
+		} else {
+			tbl->mlbm.level[i].blk_stats = 0x0;
+		}
+	}
+
+	tbl->use_cnt = 0;
+	tbl->max_alloc_cnt = 0;
+
+#if 0
+	/* make block stats match current bitmap */
+	for (i=0; i<=tbl->mlbm.maxLevel; ++i) {
+		p = tbl->mlbm.level[i].blk_stats;
+		if (p) {
+			int j;
+			for (j=0; j<tbl->mlbm.level[i].numBits; j++) {
+				int tcenum = j<<i;
+				if (blk_is_alloced(tbl,tcenum,i)) {
+					p[j].use_cnt ++;
+					p[j].alloc_jiffies = jiffies;
+				}
+			}
+		}
+	}
+#endif
+
+}
+
+static void
+teardown_detail_tce_stats(struct TceTable * tbl)
+{
+	struct tce_blk_stats *p;
+	p = tbl->mlbm.level[0].blk_stats;
+	if (!p) return;
+	int i;
+	for (i=0; i<=tbl->mlbm.maxLevel; ++i) {
+		tbl->mlbm.level[i].blk_stats = NULL;
+	}
+	free_pages ((unsigned long)p, get_tce_stats_order(tbl));
+}
+
+/* ================================================================= */
+#define SZ ((0<(count-n))?(count-n):0)
+
+static ssize_t
+proc_tce_detail_read (struct file * file, char * user_buf,
+		size_t count, loff_t *ppos)
+{
+	int n = 0;
+
+	/* Find the tce table */
+	struct inode * inode = file->f_dentry->d_inode;
+	struct proc_dir_entry * dp;
+	dp = (struct proc_dir_entry *) inode->u.generic_ip;
+	struct TceTable *tbl = dp->data;
+
+	char * buf = (char*) __get_free_page(GFP_KERNEL);
+	if (!buf) return -ENOMEM;
+
+	/* start of virtual pci_for_each_dev(pdev_iter) */
+	static int loop_iter;
+	if (*ppos == 0) {
+		loop_iter = 0;
+
+		/* print header, summary stats */
+		n += snprintf (buf+n, SZ, "total_use_cnt=%d", tbl->use_cnt);
+		n += snprintf (buf+n, SZ, " alloc_cnt=%d", tbl->alloc_cnt);
+		n += snprintf (buf+n, SZ, " max_alloc_cnt=%d\n", tbl->max_alloc_cnt);
+		n += snprintf (buf+n, SZ,
+		    "\tLevel\tuse_cnt\tsplit\tmerge\talloc\tmaxaloc\tactual\tstale\tentries\n");
+
+		int i;
+		for (i=0; i<= tbl->mlbm.maxLevel; i++) {
+			struct Bitmap *lvl = &tbl->mlbm.level[i];
+
+			struct tce_blk_stats * blk_stats;
+			blk_stats = lvl->blk_stats;
+
+			int alloced_blocks=0, stale_blocks=0;
+			if (blk_stats) {
+
+				/* alloc_jiffies will be set if the block is
+				 * allocated and not freed.  Stale blocks suggest
+				 * a leak or a really slow i/o system */
+				int j;
+				for (j=0; j<lvl->numBits; j++) {
+					unsigned long alloc_jiffies = blk_stats[j].alloc_jiffies;
+					if (alloc_jiffies && alloc_jiffies != ((unsigned long) -1)) {
+						alloced_blocks++;
+						/* 'stale' if alloc happened more than 3 seconds ago */
+						if (jiffies - alloc_jiffies > 3*HZ) {
+							stale_blocks ++;
+						}
+					}
+				}
+			} else {
+				n += snprintf (buf+n, SZ, "\t*** No Block Stats Available ***\n");
+			}
+			n += snprintf (buf+n, SZ,
+				"\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%ld\n", i,
+				lvl->use_cnt,
+				lvl->split_cnt,
+				lvl->merge_cnt,
+				lvl->alloc_cnt,
+				lvl->max_alloc_cnt,
+				alloced_blocks,
+				stale_blocks,
+				lvl->numBits);
+		}
+		n += snprintf (buf+n, SZ, "\n");
+
+		/* we are done printing header */
+		cond_resched();
+		if (n > count) n = count;
+		copy_to_user (user_buf, buf, n);
+		free_page((unsigned long) buf);
+
+		*ppos += n;
+		return n;
+	}
+
+	/* end of iteration over levels */
+	if (loop_iter > tbl->mlbm.maxLevel) {
+		free_page((unsigned long) buf);
+		return 0;
+	}
+
+	struct Bitmap *lvl = &tbl->mlbm.level[loop_iter];
+
+	/* Dump bits for each level */
+	n += snprintf (buf+n, SZ, "\nlevel[%d] num_entries=%ld\n",
+		loop_iter, lvl->numBits);
+
+	struct tce_blk_stats * blk_stats;
+	blk_stats = lvl->blk_stats;
+
+	if (blk_stats) {
+		int i;
+		for (i=0; i<lvl->numBytes; i++) {
+			if (i && 0 == i%4) n += snprintf (buf+n, SZ, " ");
+			if (i && 0 == i%32) n += snprintf (buf+n, SZ, "\n");
+			n += snprintf (buf+n, SZ, "%02x", lvl->map[i]);
+			if (count-n < 10) break;
+		}
+	}
+	if (count-n < 10) {  n += snprintf (buf+n, SZ, "..."); }
+	n += snprintf (buf+n, SZ, "\n");
+
+	/* iterate loop one more time. */
+	loop_iter ++;
+
+	cond_resched();
+	if (n > count) n = count;
+	copy_to_user (user_buf, buf, n);
+	free_page((unsigned long) buf);
+
+	*ppos += n;
+	return n;
+}
+
+static ssize_t
+proc_tce_detail_write(struct file * file, const char * buf,
+			     size_t count, loff_t *ppos)
+{
+	return count;
+}
+
+static int
+proc_tce_detail_unlink (struct inode *inode, struct dentry *dent)
+{
+	struct proc_dir_entry * dp;
+	dp = (struct proc_dir_entry *) inode->u.generic_ip;
+	struct TceTable *tbl = dp->data;
+
+	printk ("attempt cleanup of tce stats upon file deletion tbl=%p\n", tbl);
+
+	teardown_detail_tce_stats(tbl);
+	remove_proc_entry(dp->name, dp->parent);
+	return 0;
+}
+
+/* ================================================================= */
+
+struct file_operations tce_detail_stats_operations = {
+	.read = proc_tce_detail_read,
+	.write = proc_tce_detail_write
+};
+
+struct inode_operations tce_detail_inode_ops = {
+	.unlink = proc_tce_detail_unlink,
+};
+
+/* ================================================================= */
+
+static ssize_t
+proc_tce_stats_read (struct file * file, char * user_buf,
+		size_t count, loff_t *ppos)
+{
+	int n = 0;
+
+	static struct pci_dev *pdev_iter;
+
+	/* start of virtual pci_for_each_dev(pdev_iter) */
+	if (*ppos == 0) {
+		pdev_iter = pci_dev_g(pci_devices.next);
+	}
+
+	/* while not done virtual pci_for_each_dev(pdev_iter) */
+	if (pdev_iter == pci_dev_g(&pci_devices)) {
+		return 0;
+	}
+
+	char * buf = (char*) __get_free_page(GFP_KERNEL);
+	if (!buf) return -ENOMEM;
+
+	/* Attempt to print just one device per call, so as to not
+	 * overflow the user's buffer.  If user gives us too small
+	 * a buffer, we'll send the garbled data but who cares. */
+	while (pdev_iter != pci_dev_g(&pci_devices)) {
+		if (PCI_SLOT(pdev_iter->devfn) == 0) goto try_again;
+		if (pdev_iter->sysdata == NULL) goto try_again;
+
+		n += format_device_location (pdev_iter, buf+n, SZ);
+		n += snprintf (buf+n, SZ, "\n");
+		struct device_node *dn = (struct device_node *)pdev_iter->sysdata;
+		if (!dn) goto try_again;
+		struct TceTable *tbl = dn->tce_table;
+		if (!tbl) goto try_again;
+		n += snprintf (buf+n, SZ, "\ttotal_use_cnt=%d", tbl->use_cnt);
+		n += snprintf (buf+n, SZ, " alloc_cnt=%d", tbl->alloc_cnt);
+		n += snprintf (buf+n, SZ, " max_alloc_cnt=%d\n", tbl->max_alloc_cnt);
+
+		n += snprintf (buf+n, SZ,
+				"\tLevel\tuse_cnt\tsplit\tmerge\talloc\tmax_allo\n");
+		int i;
+		for (i=0; i<= tbl->mlbm.maxLevel; i++) {
+			n += snprintf (buf+n, SZ,
+				"\t%d\t%d\t%d\t%d\t%d\t%d\n",
+				i, tbl->mlbm.level[i].use_cnt,
+				tbl->mlbm.level[i].split_cnt,
+				tbl->mlbm.level[i].merge_cnt,
+				tbl->mlbm.level[i].alloc_cnt,
+				tbl->mlbm.level[i].max_alloc_cnt);
+		}
+		break;
+try_again:
+		pdev_iter = pci_dev_g(pdev_iter->global_list.next);
+	}
+	n += snprintf (buf+n, SZ, "\n");
+
+	/* iterate once for next time */
+	pdev_iter = pci_dev_g(pdev_iter->global_list.next);
+
+	cond_resched();
+	if (n > count) n = count;
+	copy_to_user (user_buf, buf, n);
+	free_page((unsigned long) buf);
+
+       	*ppos += n;
+	return n;
+}
+
+static ssize_t
+proc_tce_stats_write(struct file * file, const char * buf,
+			     size_t count, loff_t *ppos)
+{
+	if (!buf || count == 0) return 0;
+
+	/* the 'reset' keyword zero's out the stats for all pci devices */
+	if (0 == strncmp (buf, "reset", 5)) {
+		struct pci_dev *pdev;
+
+		pci_for_each_dev(pdev) {
+			if (PCI_SLOT(pdev->devfn) == 0) continue;
+			if (pdev->sysdata == NULL) continue;
+
+			struct device_node *dn = (struct device_node *)pdev->sysdata;
+			if (!dn) continue;
+			struct TceTable *tbl = dn->tce_table;
+			if (!tbl) continue;
+
+			int i;
+			for (i=0; i<= tbl->mlbm.maxLevel; i++) {
+				tbl->mlbm.level[i].use_cnt = 0;
+				tbl->mlbm.level[i].split_cnt = 0;
+				tbl->mlbm.level[i].merge_cnt = 0;
+				tbl->mlbm.level[i].max_alloc_cnt = 0;
+			}
+			tbl->use_cnt = 0;
+			tbl->max_alloc_cnt = 0;
+			teardown_detail_tce_stats (tbl);
+		}
+		*ppos += count;
+		return count;
+	}
+
+	/* The 'show' keyword attempts to enable collection of detailed stats
+	 * for the indicated bus:deviceid */
+	if (0 == strncmp (buf, "show", 4)) {
+		char * p = strchr (buf, ':');
+		if (!p) return count;
+		unsigned long busno = simple_strtoul (buf+5, &p , 16);
+		if (!p) return count;
+		unsigned long devno = simple_strtoul (p+1, NULL , 16);
+		// printk ("parsed out bus=0x%lx dev=0x%lx\n", busno, devno);
+
+		/* try to find the matching pci_dev */
+		struct pci_dev *pdev;
+		struct device_node *dn;
+		struct TceTable *tbl;
+
+		pci_for_each_dev(pdev) {
+			if (devno != PCI_SLOT(pdev->devfn)) continue;
+			if (busno != pdev->bus->number) continue;
+			if (pdev->sysdata == NULL) continue;
+			dn = (struct device_node *)pdev->sysdata;
+			if (!dn) continue;
+			tbl = dn->tce_table;
+			if (!tbl) continue;
+			break;
+		}
+		if (pdev == pci_dev_g(&pci_devices)) {
+			printk (KERN_INFO "tce_stats: uanble to find device %lx:%lx\n", busno, devno);
+			return count;
+		}
+		setup_detail_tce_stats(tbl);
+
+		/* Create the corresponding entry in the proc table */
+		char fname[100];
+		snprintf (fname, 100, "detail-%02lx:%02lx",busno, devno);
+		struct proc_dir_entry *ent;
+		ent = create_proc_entry (fname, S_IWUSR|S_IRUGO, proc_ppc64_tce_root);
+		if (!ent) {
+			teardown_detail_tce_stats(tbl);
+			return count;
+		}
+
+		ent->proc_fops = &tce_detail_stats_operations;
+		ent->proc_iops = &tce_detail_inode_ops;
+		// ent->read_proc = proc_tce_page_read;
+		ent->data = tbl;
+
+		return count;
+	}
+
+	*ppos += count;
+	return count;
+}
+
+/* ================================================================= */
+
+struct file_operations tce_stats_operations = {
+	.read = proc_tce_stats_read,
+	.write = proc_tce_stats_write
+};
+
+/* ================================================================= */
+/* Create entry /proc/ppc64/tce */
+
+void proc_tce_init(struct proc_dir_entry *proc_ppc64_root)
+{
+	struct proc_dir_entry *ent = NULL;
+
+	if (!proc_ppc64_root) return;
+
+	printk(KERN_INFO "proc_tce: creating /proc/ppc64/tce\n");
+	ent = proc_mkdir("tce", proc_ppc64_root);
+	if (!ent) {
+		printk (KERN_ERR "Failed to create /proc/ppc64/tce\n");
+		return;
+	}
+	proc_ppc64_tce_root = ent;
+
+	/* create the 'listener' */
+	ent = create_proc_entry ("stats", S_IWUSR|S_IRUGO, proc_ppc64_tce_root);
+	if (!ent) return;
+
+	ent->proc_fops = &tce_stats_operations;
+
+}
+
+#endif /* CONFIG_TCE_STATS */
+/* ============================= END OF FILE ================================ */


More information about the Linuxppc64-dev mailing list