[PATCH 11/19] powerpc: iommu enablement for CMO

Robert Jennings rcj at linux.vnet.ibm.com
Fri Jun 13 08:19:36 EST 2008


From: Robert Jennings <rcj at linux.vnet.ibm.com>

To support Cooperative Memory Overcommitment (CMO), we need to check
for failure and busy responses from some of the tce hcalls.

These changes to the pSeries platform touch common powerpc architecture
code; the updates needed for the other affected platforms are included
in this patch as well.

pSeries platform IOMMU code changes:
 * platform TCE functions must retry the hcall when an H_LONG_BUSY_*
   return code is received.
 * platform TCE functions must handle H_NOT_ENOUGH_RESOURCES by backing
   out any entries already inserted and returning an error to the caller
   (see the condensed sketch after this list).
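
For reference, here is a condensed sketch of the retry/back-out pattern
the pSeriesLP tce functions adopt in the pseries/iommu.c hunks below
(names taken from the patch; the multi-page variants follow the same
shape):

	do {
		rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, tce);
		if (unlikely(H_IS_LONG_BUSY(rc))) {
			/* hypervisor asked for a delay; wait, then retry */
			mdelay(plpar_get_longbusy_msecs(rc));
		}
	} while (unlikely(H_IS_LONG_BUSY(rc)));

	if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
		/* back out the entries already inserted, then fail */
		tce_free_pSeriesLP(tbl, tcenum_start, npages_start - (npages + 1));
		return (int)rc;
	}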

Architecture IOMMU code changes:
 * Calls to ppc_md.tce_build() must check the return value and return
   DMA_ERROR_CODE to the caller on failure (sketch below).
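
In iommu_alloc(), for example, the check is essentially (condensed from
the arch/powerpc/kernel/iommu.c hunk below):

	rc = ppc_md.tce_build(tbl, entry, npages,
			      (unsigned long)page & IOMMU_PAGE_MASK, direction);
	if (unlikely(rc)) {
		/* clear the bitmap bits we reserved; do not call
		 * ppc_md.tce_free(), tce_build cleaned up whatever it
		 * managed to insert */
		iommu_undo(tbl, ret, npages);
		spin_unlock_irqrestore(&(tbl->it_lock), flags);
		return DMA_ERROR_CODE;
	}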

Architecture changes:
 * The tce_build member of struct machdep_calls now returns int so that
   the tce_build*_pSeriesLP functions can indicate failure.
 * All other platforms' iommu tce_build functions are updated to match
   the new calling semantics; they always return 0 (success).  The
   default configs for the other platforms have been built, but no
   further testing was performed.

Signed-off-by: Robert Jennings <rcj at linux.vnet.ibm.com>

---
 arch/powerpc/kernel/iommu.c            |   71 +++++++++++++++++++++++++++++--
 arch/powerpc/platforms/cell/iommu.c    |    3 +
 arch/powerpc/platforms/iseries/iommu.c |    3 +
 arch/powerpc/platforms/pasemi/iommu.c  |    3 +
 arch/powerpc/platforms/pseries/iommu.c |   76 ++++++++++++++++++++++++++++-----
 arch/powerpc/sysdev/dart_iommu.c       |    3 +
 include/asm-powerpc/machdep.h          |    2 
 7 files changed, 139 insertions(+), 22 deletions(-)

Index: b/arch/powerpc/kernel/iommu.c
===================================================================
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -183,6 +183,49 @@ static unsigned long iommu_range_alloc(s
 	return n;
 }
 
+/** iommu_undo - Clear iommu_table bits without calling platform tce_free.
+ *
+ * @tbl: struct iommu_table to alter
+ * @dma_addr: DMA address to free entries for
+ * @npages: number of pages to free entries for
+ *
+ * This is the same as __iommu_free without the call to ppc_md.tce_free();
+ *
+ * To clean up after ppc_md.tce_build() errors we need to clear bits
+ * in the table without calling the ppc_md.tce_free() method; calling
+ * ppc_md.tce_free() could alter entries that were not touched due to a
+ * premature failure in ppc_md.tce_build().
+ *
+ * The ppc_md.tce_build() needs to perform its own clean up prior to
+ * returning its error.
+ */
+static void iommu_undo(struct iommu_table *tbl, dma_addr_t dma_addr,
+			 unsigned int npages)
+{
+	unsigned long entry, free_entry;
+
+	entry = dma_addr >> IOMMU_PAGE_SHIFT;
+	free_entry = entry - tbl->it_offset;
+
+	if (((free_entry + npages) > tbl->it_size) ||
+	    (entry < tbl->it_offset)) {
+		if (printk_ratelimit()) {
+			printk(KERN_INFO "iommu_undo: invalid entry\n");
+			printk(KERN_INFO "\tentry    = 0x%lx\n", entry);
+			printk(KERN_INFO "\tdma_addr = 0x%lx\n", (u64)dma_addr);
+			printk(KERN_INFO "\tTable    = 0x%lx\n", (u64)tbl);
+			printk(KERN_INFO "\tbus#     = 0x%lx\n", tbl->it_busno);
+			printk(KERN_INFO "\tsize     = 0x%lx\n", tbl->it_size);
+			printk(KERN_INFO "\tstartOff = 0x%lx\n", tbl->it_offset);
+			printk(KERN_INFO "\tindex    = 0x%lx\n", tbl->it_index);
+			WARN_ON(1);
+		}
+		return;
+	}
+
+	iommu_area_free(tbl->it_map, free_entry, npages);
+}
+
 static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 			      void *page, unsigned int npages,
 			      enum dma_data_direction direction,
@@ -190,6 +233,7 @@ static dma_addr_t iommu_alloc(struct dev
 {
 	unsigned long entry, flags;
 	dma_addr_t ret = DMA_ERROR_CODE;
+	int rc;
 
 	spin_lock_irqsave(&(tbl->it_lock), flags);
 
@@ -204,9 +248,20 @@ static dma_addr_t iommu_alloc(struct dev
 	ret = entry << IOMMU_PAGE_SHIFT;	/* Set the return dma address */
 
 	/* Put the TCEs in the HW table */
-	ppc_md.tce_build(tbl, entry, npages, (unsigned long)page & IOMMU_PAGE_MASK,
-			 direction);
+	rc = ppc_md.tce_build(tbl, entry, npages,
+	                      (unsigned long)page & IOMMU_PAGE_MASK, direction);
 
+	/* ppc_md.tce_build() only returns non-zero for transient errors.
+	 * Clean up the table bitmap in this case and return
+	 * DMA_ERROR_CODE. For all other errors the functionality is
+	 * not altered.
+	 */
+	if (unlikely(rc)) {
+		iommu_undo(tbl, ret, npages);
+
+		spin_unlock_irqrestore(&(tbl->it_lock), flags);
+		return DMA_ERROR_CODE;
+	}
 
 	/* Flush/invalidate TLB caches if necessary */
 	if (ppc_md.tce_flush)
@@ -275,7 +330,7 @@ int iommu_map_sg(struct device *dev, str
 	dma_addr_t dma_next = 0, dma_addr;
 	unsigned long flags;
 	struct scatterlist *s, *outs, *segstart;
-	int outcount, incount, i;
+	int outcount, incount, i, rc = 0;
 	unsigned int align;
 	unsigned long handle;
 	unsigned int max_seg_size;
@@ -336,7 +391,10 @@ int iommu_map_sg(struct device *dev, str
 			    npages, entry, dma_addr);
 
 		/* Insert into HW table */
-		ppc_md.tce_build(tbl, entry, npages, vaddr & IOMMU_PAGE_MASK, direction);
+		rc = ppc_md.tce_build(tbl, entry, npages,
+		                      vaddr & IOMMU_PAGE_MASK, direction);
+		if (unlikely(rc))
+			goto failure;
 
 		/* If we are in an open segment, try merging */
 		if (segstart != s) {
@@ -399,7 +457,10 @@ int iommu_map_sg(struct device *dev, str
 
 			vaddr = s->dma_address & IOMMU_PAGE_MASK;
 			npages = iommu_num_pages(s->dma_address, s->dma_length);
-			__iommu_free(tbl, vaddr, npages);
+			if (!rc)
+				__iommu_free(tbl, vaddr, npages);
+			else
+				iommu_undo(tbl, vaddr, npages);
 			s->dma_address = DMA_ERROR_CODE;
 			s->dma_length = 0;
 		}
Index: b/arch/powerpc/platforms/cell/iommu.c
===================================================================
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -172,7 +172,7 @@ static void invalidate_tce_cache(struct 
 	}
 }
 
-static void tce_build_cell(struct iommu_table *tbl, long index, long npages,
+static int tce_build_cell(struct iommu_table *tbl, long index, long npages,
 		unsigned long uaddr, enum dma_data_direction direction)
 {
 	int i;
@@ -210,6 +210,7 @@ static void tce_build_cell(struct iommu_
 
 	pr_debug("tce_build_cell(index=%lx,n=%lx,dir=%d,base_pte=%lx)\n",
 		 index, npages, direction, base_pte);
+	return 0;
 }
 
 static void tce_free_cell(struct iommu_table *tbl, long index, long npages)
Index: b/arch/powerpc/platforms/iseries/iommu.c
===================================================================
--- a/arch/powerpc/platforms/iseries/iommu.c
+++ b/arch/powerpc/platforms/iseries/iommu.c
@@ -41,7 +41,7 @@
 #include <asm/iseries/hv_call_event.h>
 #include <asm/iseries/iommu.h>
 
-static void tce_build_iSeries(struct iommu_table *tbl, long index, long npages,
+static int tce_build_iSeries(struct iommu_table *tbl, long index, long npages,
 		unsigned long uaddr, enum dma_data_direction direction)
 {
 	u64 rc;
@@ -70,6 +70,7 @@ static void tce_build_iSeries(struct iom
 		index++;
 		uaddr += TCE_PAGE_SIZE;
 	}
+	return 0;
 }
 
 static void tce_free_iSeries(struct iommu_table *tbl, long index, long npages)
Index: b/arch/powerpc/platforms/pasemi/iommu.c
===================================================================
--- a/arch/powerpc/platforms/pasemi/iommu.c
+++ b/arch/powerpc/platforms/pasemi/iommu.c
@@ -83,7 +83,7 @@ static u32 *iob_l2_base;
 static struct iommu_table iommu_table_iobmap;
 static int iommu_table_iobmap_inited;
 
-static void iobmap_build(struct iommu_table *tbl, long index,
+static int iobmap_build(struct iommu_table *tbl, long index,
 			 long npages, unsigned long uaddr,
 			 enum dma_data_direction direction)
 {
@@ -107,6 +107,7 @@ static void iobmap_build(struct iommu_ta
 		uaddr += IOBMAP_PAGE_SIZE;
 		bus_addr += IOBMAP_PAGE_SIZE;
 	}
+	return 0;
 }
 
 
Index: b/arch/powerpc/platforms/pseries/iommu.c
===================================================================
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -25,6 +25,7 @@
  */
 
 #include <linux/init.h>
+#include <linux/delay.h>
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
@@ -48,7 +49,7 @@
 #include "plpar_wrappers.h"
 
 
-static void tce_build_pSeries(struct iommu_table *tbl, long index,
+static int tce_build_pSeries(struct iommu_table *tbl, long index,
 			      long npages, unsigned long uaddr,
 			      enum dma_data_direction direction)
 {
@@ -71,6 +72,7 @@ static void tce_build_pSeries(struct iom
 		uaddr += TCE_PAGE_SIZE;
 		tcep++;
 	}
+	return 0;
 }
 
 
@@ -93,13 +95,18 @@ static unsigned long tce_get_pseries(str
 	return *tcep;
 }
 
-static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
+static void tce_free_pSeriesLP(struct iommu_table*, long, long);
+static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);
+
+static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
 				long npages, unsigned long uaddr,
 				enum dma_data_direction direction)
 {
-	u64 rc;
+	u64 rc = 0;
 	u64 proto_tce, tce;
 	u64 rpn;
+	int sleep_msecs, ret = 0;
+	long tcenum_start = tcenum, npages_start = npages;
 
 	rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
 	proto_tce = TCE_PCI_READ;
@@ -108,7 +115,21 @@ static void tce_build_pSeriesLP(struct i
 
 	while (npages--) {
 		tce = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT;
-		rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, tce);
+		do {
+			rc = plpar_tce_put((u64)tbl->it_index,
+			                   (u64)tcenum << 12, tce);
+			if (unlikely(H_IS_LONG_BUSY(rc))) {
+				sleep_msecs = plpar_get_longbusy_msecs(rc);
+				mdelay(sleep_msecs);
+			}
+		} while (unlikely(H_IS_LONG_BUSY(rc)));
+
+		if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
+			ret = (int)rc;
+			tce_free_pSeriesLP(tbl, tcenum_start,
+			                   (npages_start - (npages + 1)));
+			break;
+		}
 
 		if (rc && printk_ratelimit()) {
 			printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%ld\n", rc);
@@ -121,19 +142,22 @@ static void tce_build_pSeriesLP(struct i
 		tcenum++;
 		rpn++;
 	}
+	return ret;
 }
 
 static DEFINE_PER_CPU(u64 *, tce_page) = NULL;
 
-static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
+static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 				     long npages, unsigned long uaddr,
 				     enum dma_data_direction direction)
 {
-	u64 rc;
+	u64 rc = 0;
 	u64 proto_tce;
 	u64 *tcep;
 	u64 rpn;
 	long l, limit;
+	long tcenum_start = tcenum, npages_start = npages;
+	int sleep_msecs, ret = 0;
 
 	if (npages == 1)
 		return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
@@ -171,15 +195,26 @@ static void tce_buildmulti_pSeriesLP(str
 			rpn++;
 		}
 
-		rc = plpar_tce_put_indirect((u64)tbl->it_index,
-					    (u64)tcenum << 12,
-					    (u64)virt_to_abs(tcep),
-					    limit);
+		do {
+			rc = plpar_tce_put_indirect(tbl->it_index, tcenum << 12,
+						    virt_to_abs(tcep), limit);
+			if (unlikely(H_IS_LONG_BUSY(rc))) {
+				sleep_msecs = plpar_get_longbusy_msecs(rc);
+				mdelay(sleep_msecs);
+			}
+		} while (unlikely(H_IS_LONG_BUSY(rc)));
 
 		npages -= limit;
 		tcenum += limit;
 	} while (npages > 0 && !rc);
 
+	if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
+		ret = (int)rc;
+		tce_freemulti_pSeriesLP(tbl, tcenum_start,
+		                        (npages_start - (npages + limit)));
+		return ret;
+	}
+
 	if (rc && printk_ratelimit()) {
 		printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%ld\n", rc);
 		printk("\tindex   = 0x%lx\n", (u64)tbl->it_index);
@@ -187,14 +222,23 @@ static void tce_buildmulti_pSeriesLP(str
 		printk("\ttce[0] val = 0x%lx\n", tcep[0]);
 		show_stack(current, (unsigned long *)__get_SP());
 	}
+	return ret;
 }
 
 static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
 {
+	int sleep_msecs;
 	u64 rc;
 
 	while (npages--) {
-		rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, 0);
+		do {
+			rc = plpar_tce_put((u64)tbl->it_index,
+			                   (u64)tcenum << 12, 0);
+			if (unlikely(H_IS_LONG_BUSY(rc))) {
+				sleep_msecs = plpar_get_longbusy_msecs(rc);
+				mdelay(sleep_msecs);
+			}
+		} while (unlikely(H_IS_LONG_BUSY(rc)));
 
 		if (rc && printk_ratelimit()) {
 			printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%ld\n", rc);
@@ -210,9 +254,17 @@ static void tce_free_pSeriesLP(struct io
 
 static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
 {
+	int sleep_msecs;
 	u64 rc;
 
-	rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages);
+	do {
+		rc = plpar_tce_stuff((u64)tbl->it_index,
+		                     (u64)tcenum << 12, 0, npages);
+		if (unlikely(H_IS_LONG_BUSY(rc))) {
+			sleep_msecs = plpar_get_longbusy_msecs(rc);
+			mdelay(sleep_msecs);
+		}
+	} while (unlikely(H_IS_LONG_BUSY(rc)));
 
 	if (rc && printk_ratelimit()) {
 		printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n");
Index: b/arch/powerpc/sysdev/dart_iommu.c
===================================================================
--- a/arch/powerpc/sysdev/dart_iommu.c
+++ b/arch/powerpc/sysdev/dart_iommu.c
@@ -147,7 +147,7 @@ static void dart_flush(struct iommu_tabl
 	}
 }
 
-static void dart_build(struct iommu_table *tbl, long index,
+static int dart_build(struct iommu_table *tbl, long index,
 		       long npages, unsigned long uaddr,
 		       enum dma_data_direction direction)
 {
@@ -183,6 +183,7 @@ static void dart_build(struct iommu_tabl
 	} else {
 		dart_dirty = 1;
 	}
+	return 0;
 }
 
 
Index: b/include/asm-powerpc/machdep.h
===================================================================
--- a/include/asm-powerpc/machdep.h
+++ b/include/asm-powerpc/machdep.h
@@ -76,7 +76,7 @@ struct machdep_calls {
 	 * destroyed as well */
 	void		(*hpte_clear_all)(void);
 
-	void		(*tce_build)(struct iommu_table * tbl,
+	int		(*tce_build)(struct iommu_table * tbl,
 				     long index,
 				     long npages,
 				     unsigned long uaddr,


