[RFC PATCH kernel v2] KVM: PPC: Avoid marking DMA-mapped pages dirty in real mode
Alexey Kardashevskiy
aik at ozlabs.ru
Thu Sep 6 19:49:32 AEST 2018
At the moment the real mode handler of H_PUT_TCE calls iommu_tce_xchg_rm()
which in turn reads the old TCE and, if it was a valid entry, marks
the physical page dirty if it was mapped for writing. Since this is
real mode, realmode_pfn_to_page() is used instead of pfn_to_page()
to get the page struct. However, SetPageDirty() itself reads the compound
page head, which is a virtual address of the head page struct, and
setting the dirty bit on that in real mode kills the system.
This adds dirty bit tracking to the MM/IOMMU API (a minimal sketch of
the encoding follows the list):
- this uses the lowest bit of the cached host physical address to carry
the dirty bit;
- this marks pages dirty when they are unpinned, which happens when
the preregistered memory is released, which always happens in virtual
mode;
- this changes mm_iommu_ua_to_hpa{_rm} to take a DMA direction parameter so
the bit is set just before a page is mapped. This might seem rather early
as there is a chance of failure after the translation. The alternative
would be adding a new mm_iommu_ua_mark_dirty() helper, which would have
to do the useraddr->hpa lookup again; that seems suboptimal as we normally:
  - do not see mapping errors;
  - map the entire guest so it is TCE_WRITE;
  - find the consequences of a page being spuriously marked dirty not
    worth the extra lookup.
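
For illustration only, here is a minimal standalone sketch of the encoding
idea: because the cached host physical addresses are at least 4K aligned,
the low bits are free to carry flags. This is not the kernel code itself;
the DEMO_* constants and demo_* helpers are made up for this example and
merely mirror the MM_IOMMU_TABLE_GROUP_PAGE_* scheme introduced below.

    /* Sketch: carry a dirty flag in the low bit of a cached, 4K-aligned
     * host physical address; mask the flag off when handing the hpa out.
     */
    #include <stdint.h>
    #include <stdio.h>

    #define DEMO_PAGE_DIRTY 0x1UL            /* lowest bit carries "dirty" */
    #define DEMO_PAGE_MASK  (0x1000UL - 1)   /* SZ_4K - 1: flag bit space  */

    /* Mark the cached entry dirty when mapping for device writes. */
    static void demo_mark_dirty(uint64_t *cached_hpa)
    {
            *cached_hpa |= DEMO_PAGE_DIRTY;
    }

    /* Return the real hpa with flag bits masked off, plus the ua offset. */
    static uint64_t demo_hpa(uint64_t cached_hpa, uint64_t ua_offset)
    {
            return (cached_hpa & ~DEMO_PAGE_MASK) | ua_offset;
    }

    int main(void)
    {
            uint64_t cached = 0x200000000ULL;  /* 4K-aligned cached hpa */

            demo_mark_dirty(&cached);  /* DMA_FROM_DEVICE/DMA_BIDIRECTIONAL */
            printf("cached=0x%llx hpa=0x%llx dirty=%d\n",
                   (unsigned long long)cached,
                   (unsigned long long)demo_hpa(cached, 0x123),
                   (int)(cached & DEMO_PAGE_DIRTY));
            return 0;
    }

At unpin time (always virtual mode) the flag is tested and, if set,
SetPageDirty() is called on the page, as done in mm_iommu_unpin() below.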
This moves dirty bit setting out of iommu_tce_xchg{_rm} into the callers,
which are KVM and VFIO. KVM requires memory preregistration; VFIO
uses the same mechanism as KVM for VFIO_SPAPR_TCE_v2_IOMMU
and explicitly marks pages dirty for VFIO_SPAPR_TCE_IOMMU.
This removes realmode_pfn_to_page() as it is not used anymore.
Signed-off-by: Alexey Kardashevskiy <aik at ozlabs.ru>
---
v1 was here: https://www.spinics.net/lists/kvm-ppc/msg14218.html
This is based on top of the following series (it does not really depend on it, but a plain rebase does not apply cleanly):
[PATCH kernel 0/4] KVM: PPC: Some error handling rework
KVM: PPC: Validate all tces before updating tables
KVM: PPC: Inform the userspace about TCE update failures
KVM: PPC: Validate TCEs against preregistered memory page sizes
KVM: PPC: Propagate errors to the guest when failed instead of
ignoring
---
arch/powerpc/include/asm/book3s/64/pgtable.h | 1 -
arch/powerpc/include/asm/mmu_context.h | 7 ++--
arch/powerpc/kernel/iommu.c | 16 ---------
arch/powerpc/kvm/book3s_64_vio.c | 5 +--
arch/powerpc/kvm/book3s_64_vio_hv.c | 7 ++--
arch/powerpc/mm/init_64.c | 49 ----------------------------
arch/powerpc/mm/mmu_context_iommu.c | 23 ++++++++++---
drivers/vfio/vfio_iommu_spapr_tce.c | 26 +++++++++++----
8 files changed, 50 insertions(+), 84 deletions(-)
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 13a688f..2fdc865 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1051,7 +1051,6 @@ static inline void vmemmap_remove_mapping(unsigned long start,
return hash__vmemmap_remove_mapping(start, page_size);
}
#endif
-struct page *realmode_pfn_to_page(unsigned long pfn);
static inline pte_t pmd_pte(pmd_t pmd)
{
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index b2f89b6..ebafa43 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -7,6 +7,7 @@
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/spinlock.h>
+#include <linux/dma-direction.h>
#include <asm/mmu.h>
#include <asm/cputable.h>
#include <asm/cputhreads.h>
@@ -35,9 +36,11 @@ extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(
extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
unsigned long ua, unsigned long entries);
extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
- unsigned long ua, unsigned int pageshift, unsigned long *hpa);
+ unsigned long ua, unsigned int pageshift,
+ enum dma_data_direction dir, unsigned long *hpa);
extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
- unsigned long ua, unsigned int pageshift, unsigned long *hpa);
+ unsigned long ua, unsigned int pageshift,
+ enum dma_data_direction dir, unsigned long *hpa);
extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
#endif
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index af7a20d..1540e7f 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1000,10 +1000,6 @@ long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
ret = tbl->it_ops->exchange(tbl, entry, hpa, direction);
- if (!ret && ((*direction == DMA_FROM_DEVICE) ||
- (*direction == DMA_BIDIRECTIONAL)))
- SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT));
-
/* if (unlikely(ret))
pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n",
__func__, hwaddr, entry << tbl->it_page_shift,
@@ -1021,18 +1017,6 @@ long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry,
ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction);
- if (!ret && ((*direction == DMA_FROM_DEVICE) ||
- (*direction == DMA_BIDIRECTIONAL))) {
- struct page *pg = realmode_pfn_to_page(*hpa >> PAGE_SHIFT);
-
- if (likely(pg)) {
- SetPageDirty(pg);
- } else {
- tbl->it_ops->exchange_rm(tbl, entry, hpa, direction);
- ret = -EFAULT;
- }
- }
-
return ret;
}
EXPORT_SYMBOL_GPL(iommu_tce_xchg_rm);
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 5e3151b..174299d 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -391,7 +391,7 @@ static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt,
if (!mem)
return H_TOO_HARD;
- if (mm_iommu_ua_to_hpa(mem, ua, shift, &hpa))
+ if (mm_iommu_ua_to_hpa(mem, ua, shift, DMA_NONE, &hpa))
return H_TOO_HARD;
}
@@ -483,7 +483,8 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
/* This only handles v2 IOMMU type, v1 is handled via ioctl() */
return H_TOO_HARD;
- if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, tbl->it_page_shift, &hpa)))
+ if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, tbl->it_page_shift, dir,
+ &hpa)))
return H_TOO_HARD;
if (mm_iommu_mapped_inc(mem))
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 8d82133..5f810dc 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -123,7 +123,7 @@ static long kvmppc_rm_tce_validate(struct kvmppc_spapr_tce_table *stt,
if (!mem)
return H_TOO_HARD;
- if (mm_iommu_ua_to_hpa_rm(mem, ua, shift, &hpa))
+ if (mm_iommu_ua_to_hpa_rm(mem, ua, shift, DMA_NONE, &hpa))
return H_TOO_HARD;
}
@@ -292,7 +292,7 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
return H_TOO_HARD;
if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, tbl->it_page_shift,
- &hpa)))
+ dir, &hpa)))
return H_TOO_HARD;
if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem)))
@@ -475,7 +475,8 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K);
if (mem)
prereg = mm_iommu_ua_to_hpa_rm(mem, ua,
- IOMMU_PAGE_SHIFT_4K, &tces) == 0;
+ IOMMU_PAGE_SHIFT_4K, DMA_NONE,
+ &tces) == 0;
}
if (!prereg) {
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 51ce091..7a9886f 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -308,55 +308,6 @@ void register_page_bootmem_memmap(unsigned long section_nr,
{
}
-/*
- * We do not have access to the sparsemem vmemmap, so we fallback to
- * walking the list of sparsemem blocks which we already maintain for
- * the sake of crashdump. In the long run, we might want to maintain
- * a tree if performance of that linear walk becomes a problem.
- *
- * realmode_pfn_to_page functions can fail due to:
- * 1) As real sparsemem blocks do not lay in RAM continously (they
- * are in virtual address space which is not available in the real mode),
- * the requested page struct can be split between blocks so get_page/put_page
- * may fail.
- * 2) When huge pages are used, the get_page/put_page API will fail
- * in real mode as the linked addresses in the page struct are virtual
- * too.
- */
-struct page *realmode_pfn_to_page(unsigned long pfn)
-{
- struct vmemmap_backing *vmem_back;
- struct page *page;
- unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
- unsigned long pg_va = (unsigned long) pfn_to_page(pfn);
-
- for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list) {
- if (pg_va < vmem_back->virt_addr)
- continue;
-
- /* After vmemmap_list entry free is possible, need check all */
- if ((pg_va + sizeof(struct page)) <=
- (vmem_back->virt_addr + page_size)) {
- page = (struct page *) (vmem_back->phys + pg_va -
- vmem_back->virt_addr);
- return page;
- }
- }
-
- /* Probably that page struct is split between real pages */
- return NULL;
-}
-EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
-
-#else
-
-struct page *realmode_pfn_to_page(unsigned long pfn)
-{
- struct page *page = pfn_to_page(pfn);
- return page;
-}
-EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
-
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
#ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index c9ee9e2..bc59a73 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -18,11 +18,15 @@
#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/swap.h>
+#include <linux/sizes.h>
#include <asm/mmu_context.h>
#include <asm/pte-walk.h>
static DEFINE_MUTEX(mem_list_mutex);
+#define MM_IOMMU_TABLE_GROUP_PAGE_DIRTY 0x1
+#define MM_IOMMU_TABLE_GROUP_PAGE_MASK (SZ_4K - 1)
+
struct mm_iommu_table_group_mem_t {
struct list_head next;
struct rcu_head rcu;
@@ -263,6 +267,9 @@ static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
if (!page)
continue;
+ if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY)
+ SetPageDirty(page);
+
put_page(page);
mem->hpas[i] = 0;
}
@@ -379,7 +386,8 @@ struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
EXPORT_SYMBOL_GPL(mm_iommu_find);
long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
- unsigned long ua, unsigned int pageshift, unsigned long *hpa)
+ unsigned long ua, unsigned int pageshift,
+ enum dma_data_direction dir, unsigned long *hpa)
{
const long entry = (ua - mem->ua) >> PAGE_SHIFT;
u64 *va = &mem->hpas[entry];
@@ -390,14 +398,18 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
if (pageshift > mem->pageshift)
return -EFAULT;
- *hpa = *va | (ua & ~PAGE_MASK);
+ if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+ *va |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY;
+
+ *hpa = (*va & ~MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK);
return 0;
}
EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
- unsigned long ua, unsigned int pageshift, unsigned long *hpa)
+ unsigned long ua, unsigned int pageshift,
+ enum dma_data_direction dir, unsigned long *hpa)
{
const long entry = (ua - mem->ua) >> PAGE_SHIFT;
void *va = &mem->hpas[entry];
@@ -413,7 +425,10 @@ long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
if (!pa)
return -EFAULT;
- *hpa = *pa | (ua & ~PAGE_MASK);
+ if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+ *pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY;
+
+ *hpa = (*pa & ~MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK);
return 0;
}
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 96721b1..8fea7cd1 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -410,16 +410,21 @@ static void tce_iommu_release(void *iommu_data)
}
static void tce_iommu_unuse_page(struct tce_container *container,
- unsigned long hpa)
+ unsigned long hpa, enum dma_data_direction dir)
{
struct page *page;
page = pfn_to_page(hpa >> PAGE_SHIFT);
+
+ if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+ SetPageDirty(page);
+
put_page(page);
}
static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
unsigned long tce, unsigned long shift,
+ enum dma_data_direction dir,
unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
{
long ret = 0;
@@ -429,7 +434,7 @@ static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
if (!mem)
return -EINVAL;
- ret = mm_iommu_ua_to_hpa(mem, tce, shift, phpa);
+ ret = mm_iommu_ua_to_hpa(mem, tce, shift, dir, phpa);
if (ret)
return -EINVAL;
@@ -450,10 +455,17 @@ static void tce_iommu_unuse_page_v2(struct tce_container *container,
return;
ret = tce_iommu_prereg_ua_to_hpa(container, be64_to_cpu(*pua),
- tbl->it_page_shift, &hpa, &mem);
+ tbl->it_page_shift, DMA_NONE, &hpa, &mem);
if (ret)
pr_debug("%s: tce %llx at #%lx was not cached, ret=%d\n",
__func__, be64_to_cpu(*pua), entry, ret);
+
+ /*
+ * VFIO_SPAPR_TCE_v2_IOMMU requires preregistered memory which
+ * keeps track of dirty pages separately so we do not call
+ * SetPageDirty(page) here unlike tce_iommu_unuse_page() does.
+ */
+
if (mem)
mm_iommu_mapped_dec(mem);
@@ -485,7 +497,7 @@ static int tce_iommu_clear(struct tce_container *container,
continue;
}
- tce_iommu_unuse_page(container, oldhpa);
+ tce_iommu_unuse_page(container, oldhpa, direction);
}
return 0;
@@ -532,7 +544,7 @@ static long tce_iommu_build(struct tce_container *container,
dirtmp = direction;
ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
if (ret) {
- tce_iommu_unuse_page(container, hpa);
+ tce_iommu_unuse_page(container, hpa, dirtmp);
pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
__func__, entry << tbl->it_page_shift,
tce, ret);
@@ -540,7 +552,7 @@ static long tce_iommu_build(struct tce_container *container,
}
if (dirtmp != DMA_NONE)
- tce_iommu_unuse_page(container, hpa);
+ tce_iommu_unuse_page(container, hpa, dirtmp);
tce += IOMMU_PAGE_SIZE(tbl);
}
@@ -566,7 +578,7 @@ static long tce_iommu_build_v2(struct tce_container *container,
__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i);
ret = tce_iommu_prereg_ua_to_hpa(container,
- tce, tbl->it_page_shift, &hpa, &mem);
+ tce, tbl->it_page_shift, direction, &hpa, &mem);
if (ret)
break;
--
2.11.0