[SLOF] [PATCH slof v3 4/4] virtio: Enable IOMMU

Michael Roth mdroth at linux.vnet.ibm.com
Tue Dec 3 13:49:06 AEDT 2019


Quoting Alexey Kardashevskiy (2019-11-27 18:15:03)
> 
> 
> On 23/11/2019 01:42, Michael Roth wrote:
> > Quoting Alexey Kardashevskiy (2019-11-21 21:14:10)
> >>
> >>
> >> On 22/11/2019 12:49, Alexey Kardashevskiy wrote:
> >>>
> >>>
> >>> On 22/11/2019 11:09, Michael Roth wrote:
> >>>> Quoting Alexey Kardashevskiy (2019-11-20 17:58:01)
> >>>>> When QEMU is started with iommu_platform=on, the guest driver must accept
> >>>>> it or the device will fail.
> >>>>>
> >>>>> This enables IOMMU support for virtio-net, -scsi, -block, -serial, -9p
> >>>>> devices. -serial and -9p are only compile tested though.
> >>>>>
> >>>>> For virtio-net we map all RX buffers once and TX when xmit() is called
> >>>>> and unmap older pages when we are about to reuse the VQ descriptor.
> >>>>> As all other devices are synchronous, we unmap IOMMU pages right after
> >>>>> completion of a transaction.
> >>>>>
> >>>>> This depends on QEMU's:
> >>>>> https://patchwork.ozlabs.org/patch/1194067/
> >>>>>
> >>>>> Copied-bits-and-pieces-from: Michael Roth <mdroth at linux.vnet.ibm.com>
> >>>>> Signed-off-by: Alexey Kardashevskiy <aik at ozlabs.ru>
> >>>>> ---
> >>>>> Changes:
> >>>>> v2:
> >>>>> * added Mike's fs/dma-instance-function.fs
> >>>>> * total rework
> >>>>> ---
> >>>>>  board-qemu/slof/Makefile         |  1 +
> >>>>>  lib/libvirtio/virtio.h           |  5 ++
> >>>>>  lib/libvirtio/virtio-9p.c        |  4 ++
> >>>>>  lib/libvirtio/virtio-blk.c       |  4 ++
> >>>>>  lib/libvirtio/virtio-net.c       |  5 +-
> >>>>>  lib/libvirtio/virtio-scsi.c      |  5 ++
> >>>>>  lib/libvirtio/virtio-serial.c    | 12 +++--
> >>>>>  lib/libvirtio/virtio.c           | 82 +++++++++++++++++++++++++++++++-
> >>>>>  board-qemu/slof/OF.fs            |  3 ++
> >>>>>  slof/fs/dma-instance-function.fs | 28 +++++++++++
> >>>>>  10 files changed, 143 insertions(+), 6 deletions(-)
> >>>>>  create mode 100644 slof/fs/dma-instance-function.fs
> >>>>>
> >>>>> diff --git a/board-qemu/slof/Makefile b/board-qemu/slof/Makefile
> >>>>> index 2263e751bde9..d7ed2d7a6f18 100644
> >>>>> --- a/board-qemu/slof/Makefile
> >>>>> +++ b/board-qemu/slof/Makefile
> >>>>> @@ -99,6 +99,7 @@ OF_FFS_FILES = \
> >>>>>         $(SLOFCMNDIR)/fs/graphics.fs \
> >>>>>         $(SLOFCMNDIR)/fs/generic-disk.fs \
> >>>>>         $(SLOFCMNDIR)/fs/dma-function.fs \
> >>>>> +       $(SLOFCMNDIR)/fs/dma-instance-function.fs \
> >>>>>         $(SLOFCMNDIR)/fs/pci-device.fs \
> >>>>>         $(SLOFCMNDIR)/fs/pci-bridge.fs \
> >>>>>         $(SLOFCMNDIR)/fs/pci-properties.fs \
> >>>>> diff --git a/lib/libvirtio/virtio.h b/lib/libvirtio/virtio.h
> >>>>> index 4927a97f9f5f..250120b3112b 100644
> >>>>> --- a/lib/libvirtio/virtio.h
> >>>>> +++ b/lib/libvirtio/virtio.h
> >>>>> @@ -29,6 +29,7 @@
> >>>>>  #define VIRTIO_F_RING_INDIRECT_DESC    BIT(28)
> >>>>>  #define VIRTIO_F_RING_EVENT_IDX                BIT(29)
> >>>>>  #define VIRTIO_F_VERSION_1             BIT(32)
> >>>>> +#define VIRTIO_F_IOMMU_PLATFORM         BIT(33)
> >>>>>
> >>>>>  #define VIRTIO_TIMEOUT                 5000 /* 5 sec timeout */
> >>>>>
> >>>>> @@ -84,6 +85,8 @@ struct vqs {
> >>>>>         struct vring_desc *desc;
> >>>>>         struct vring_avail *avail;
> >>>>>         struct vring_used *used;
> >>>>> +       void **desc_gpas; /* to get gpa from desc->addr (which is ioba) */
> >>>>> +       uint64_t bus_desc;
> >>>>>  };
> >>>>>
> >>>>>  struct virtio_device {
> >>>>> @@ -109,6 +112,8 @@ extern struct vring_used *virtio_get_vring_used(struct virtio_device *dev, int q
> >>>>>  extern void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
> >>>>>                               uint64_t addr, uint32_t len,
> >>>>>                               uint16_t flags, uint16_t next);
> >>>>> +extern void virtio_free_desc(struct vqs *vq, int id, uint64_t features);
> >>>>> +void *virtio_desc_addr(struct virtio_device *vdev, int queue, int id);
> >>>>>  extern struct vqs *virtio_queue_init_vq(struct virtio_device *dev, unsigned int id);
> >>>>>  extern void virtio_queue_term_vq(struct virtio_device *dev, struct vqs *vq, unsigned int id);
> >>>>>
> >>>>> diff --git a/lib/libvirtio/virtio-9p.c b/lib/libvirtio/virtio-9p.c
> >>>>> index 426069fe9509..76078612b06e 100644
> >>>>> --- a/lib/libvirtio/virtio-9p.c
> >>>>> +++ b/lib/libvirtio/virtio-9p.c
> >>>>> @@ -129,6 +129,10 @@ static int virtio_9p_transact(void *opaque, uint8_t *tx, int tx_size, uint8_t *r
> >>>>>                 // do something better
> >>>>>                 mb();
> >>>>>         }
> >>>>> +
> >>>>> +       virtio_free_desc(vq, id, dev->features);
> >>>>> +       virtio_free_desc(vq, id + 1, dev->features);
> >>>>> +
> >>>>>         if (i == 0) {
> >>>>>                 return -1;
> >>>>>         }
> >>>>> diff --git a/lib/libvirtio/virtio-blk.c b/lib/libvirtio/virtio-blk.c
> >>>>> index a0dadbb0d6a8..0363038e559d 100644
> >>>>> --- a/lib/libvirtio/virtio-blk.c
> >>>>> +++ b/lib/libvirtio/virtio-blk.c
> >>>>> @@ -195,6 +195,10 @@ virtioblk_transfer(struct virtio_device *dev, char *buf, uint64_t blocknum,
> >>>>>                         break;
> >>>>>         }
> >>>>>
> >>>>> +       virtio_free_desc(vq, id, dev->features);
> >>>>> +       virtio_free_desc(vq, id + 1, dev->features);
> >>>>> +       virtio_free_desc(vq, id + 2, dev->features);
> >>>>> +
> >>>>>         if (status == 0)
> >>>>>                 return cnt;
> >>>>>
> >>>>> diff --git a/lib/libvirtio/virtio-net.c b/lib/libvirtio/virtio-net.c
> >>>>> index ae67883020ef..5a0d19088527 100644
> >>>>> --- a/lib/libvirtio/virtio-net.c
> >>>>> +++ b/lib/libvirtio/virtio-net.c
> >>>>> @@ -255,6 +255,9 @@ static int virtionet_xmit(struct virtio_net *vnet, char *buf, int len)
> >>>>>         idx = virtio_modern16_to_cpu(vdev, vq_tx->avail->idx);
> >>>>>         id = (idx * 2) % vq_tx->size;
> >>>>>
> >>>>> +       virtio_free_desc(vq_tx, id, vdev->features);
> >>>>> +       virtio_free_desc(vq_tx, id + 1, vdev->features);
> >>>>> +
> >>>>>         /* Set up virtqueue descriptor for header */
> >>>>>         virtio_fill_desc(vq_tx, id, vdev->features, (uint64_t)nethdr,
> >>>>>                          net_hdr_size, VRING_DESC_F_NEXT, id + 1);
> >>>>> @@ -317,7 +320,7 @@ static int virtionet_receive(struct virtio_net *vnet, char *buf, int maxlen)
> >>>>>  #endif
> >>>>>
> >>>>>         /* Copy data to destination buffer */
> >>>>> -       memcpy(buf, (void *)virtio_modern64_to_cpu(vdev, vq_rx->desc[id].addr), len);
> >>>>> +       memcpy(buf, virtio_desc_addr(vdev, VQ_RX, id), len);
> >>>>>
> >>>>>         /* Move indices to next entries */
> >>>>>         last_rx_idx = last_rx_idx + 1;
> >>>>> diff --git a/lib/libvirtio/virtio-scsi.c b/lib/libvirtio/virtio-scsi.c
> >>>>> index 251661e8d466..44573d10e24c 100644
> >>>>> --- a/lib/libvirtio/virtio-scsi.c
> >>>>> +++ b/lib/libvirtio/virtio-scsi.c
> >>>>> @@ -81,6 +81,11 @@ int virtioscsi_send(struct virtio_device *dev,
> >>>>>                         break;
> >>>>>         }
> >>>>>
> >>>>> +       virtio_free_desc(vq, id, dev->features);
> >>>>> +       virtio_free_desc(vq, id + 1, dev->features);
> >>>>> +       if (!(buf == NULL || buf_len == 0))
> >>>>> +               virtio_free_desc(vq, id + 2, dev->features);
> >>>>> +
> >>>>>         return 0;
> >>>>>  }
> >>>>>
> >>>>> diff --git a/lib/libvirtio/virtio-serial.c b/lib/libvirtio/virtio-serial.c
> >>>>> index 78ab0e25d7db..66fbfe8b6cc6 100644
> >>>>> --- a/lib/libvirtio/virtio-serial.c
> >>>>> +++ b/lib/libvirtio/virtio-serial.c
> >>>>> @@ -108,7 +108,7 @@ void virtio_serial_shutdown(struct virtio_device *dev)
> >>>>>
> >>>>>  int virtio_serial_putchar(struct virtio_device *dev, char c)
> >>>>>  {
> >>>>> -       int id;
> >>>>> +       int id, ret;
> >>>>>         uint32_t vq_size, time;
> >>>>>         volatile uint16_t *current_used_idx;
> >>>>>         uint16_t last_used_idx, avail_idx;
> >>>>> @@ -135,17 +135,21 @@ int virtio_serial_putchar(struct virtio_device *dev, char c)
> >>>>>         virtio_queue_notify(dev, TX_Q);
> >>>>>
> >>>>>         /* Wait for host to consume the descriptor */
> >>>>> +       ret = 1;
> >>>>>         time = SLOF_GetTimer() + VIRTIO_TIMEOUT;
> >>>>>         while (*current_used_idx == last_used_idx) {
> >>>>>                 // do something better
> >>>>>                 mb();
> >>>>>                 if (time < SLOF_GetTimer()) {
> >>>>>                         printf("virtio_serial_putchar failed! \n");
> >>>>> -                       return 0;
> >>>>> +                       ret = 0;
> >>>>> +                       break;
> >>>>>                 }
> >>>>>         }
> >>>>>
> >>>>> -       return 1;
> >>>>> +       virtio_free_desc(vq, id, dev->features);
> >>>>> +
> >>>>> +       return ret;
> >>>>>  }
> >>>>>
> >>>>>  char virtio_serial_getchar(struct virtio_device *dev)
> >>>>> @@ -165,7 +169,7 @@ char virtio_serial_getchar(struct virtio_device *dev)
> >>>>>                 % vq_rx->size;
> >>>>>
> >>>>>         /* Copy data to destination buffer */
> >>>>> -       memcpy(buf, (void *)virtio_modern64_to_cpu(dev, vq_rx->desc[id - 1].addr), RX_ELEM_SIZE);
> >>>>> +       memcpy(buf, virtio_desc_addr(dev, RX_Q, id - 1), RX_ELEM_SIZE);
> >>>>>
> >>>>>         /* Move indices to next entries */
> >>>>>         last_rx_idx = last_rx_idx + 1;
> >>>>> diff --git a/lib/libvirtio/virtio.c b/lib/libvirtio/virtio.c
> >>>>> index fff0c7e54c19..d4db5f656888 100644
> >>>>> --- a/lib/libvirtio/virtio.c
> >>>>> +++ b/lib/libvirtio/virtio.c
> >>>>> @@ -273,6 +273,17 @@ void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
> >>>>>         next %= vq->size;
> >>>>>
> >>>>>         if (features & VIRTIO_F_VERSION_1) {
> >>>>> +               if (features & VIRTIO_F_IOMMU_PLATFORM) {
> >>>>> +                       void *gpa = (void *) addr;
> >>>>> +
> >>>>> +                       if (!vq->desc_gpas) {
> >>>>> +                               fprintf(stderr, "IOMMU setup has not been done!\n");
> >>>>> +                               return;
> >>>>> +                       }
> >>>>> +
> >>>>> +                       addr = SLOF_dma_map_in(gpa, len, 0);
> >>>>> +                       vq->desc_gpas[id] = gpa;
> >>>>> +               }
> >>>>>                 desc->addr = cpu_to_le64(addr);
> >>>>>                 desc->len = cpu_to_le32(len);
> >>>>>                 desc->flags = cpu_to_le16(flags);
> >>>>> @@ -285,6 +296,32 @@ void virtio_fill_desc(struct vqs *vq, int id, uint64_t features,
> >>>>>         }
> >>>>>  }
> >>>>>
> >>>>> +void virtio_free_desc(struct vqs *vq, int id, uint64_t features)
> >>>>> +{
> >>>>> +       struct vring_desc *desc;
> >>>>> +
> >>>>> +       id %= vq->size;
> >>>>> +       desc = &vq->desc[id];
> >>>>> +
> >>>>> +       if (features & VIRTIO_F_VERSION_1) {
> >>>>> +               if (features & VIRTIO_F_IOMMU_PLATFORM) {
> >>>>> +                       SLOF_dma_map_out(le64_to_cpu(desc->addr),
> >>>>> +                                        0, le32_to_cpu(desc->len));
> >>>>> +                       vq->desc_gpas[id] = NULL;
> >>>>> +               }
> >>>>> +       }
> >>>>> +}
> >>>>
> >>>> On a 4.18 RHEL8 kernel I was unable to boot from virtio-blk, it complains the
> >>>> disk isn't bootable and the H_PUT_TCE traces show:
> >>>>
> >>>>  qemu-system-ppc-19019 [045] .... 786844.067356: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x1000,0xe46b003,0xdc73350
> >>>>  qemu-system-ppc-19019 [045] .... 786844.067358: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
> >>>>  qemu-system-ppc-19019 [045] .... 786844.067361: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x2000,0xe46c003,0xdc73350
> >>>>  qemu-system-ppc-19019 [045] .... 786844.067362: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
> >>>>  qemu-system-ppc-19019 [041] .... 786844.070532: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x3000,0xe67b003,0xdc73490
> >>>>  qemu-system-ppc-19019 [041] .... 786844.070533: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
> >>>>  qemu-system-ppc-19019 [041] .... 786844.070817: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x4000,0xe469003,0xdc73490
> >>>>  qemu-system-ppc-19019 [041] .... 786844.070818: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
> >>>>  qemu-system-ppc-19019 [041] .... 786844.071102: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x5000,0xe67a003,0xdc73490
> >>>>  qemu-system-ppc-19019 [041] .... 786844.071103: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
> >>>>  qemu-system-ppc-19019 [041] .... 786844.071123: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_SUCCESS
> >>>>  qemu-system-ppc-19019 [041] .... 786844.073532: kvm_hcall_enter: VCPU 0: hcall=H_PUT_TCE GPR4-7=0x80000000,0x3228,0x0,0xdc73488
> >>>>  qemu-system-ppc-19019 [041] .... 786844.073534: kvm_hcall_exit: VCPU 0: ret=RESUME_GUEST hcall_rc=H_PARAMETER
> >>>>
> >>>> If I disable in-kernel TCE acceleration, or use TCG, it seems to work
> >>>> fine. If I add the following hack it seems to fix things:
> >>>>
> >>>>   @@ -305,7 +307,8 @@ void virtio_free_desc(struct vqs *vq, int id, uint64_t features)
> >>>>    
> >>>>           if (features & VIRTIO_F_VERSION_1) {
> >>>>                   if (features & VIRTIO_F_IOMMU_PLATFORM) {
> >>>>   -                       SLOF_dma_map_out(le64_to_cpu(desc->addr),
> >>>>   +                       fprintf(stderr, "mapping out desc->addr: %llx\n", le64_to_cpu(desc->addr));
> >>>>   +                       SLOF_dma_map_out(le64_to_cpu(desc->addr) & ~(0xfff),
> >>>>                                            0, le32_to_cpu(desc->len));
> >>>>                           vq->desc_gpas[id] = NULL;
> >>>>                   }
> >>>>
> >>>> I guess the kernel is a bit stricter about unmapping unaligned TCE entries? If
> >>>> so, maybe we should align automatically in dma-map-out?
> >>>
> >>>
> >>> I thought I did that as well in 1/4, I'll double check. Hmmm.
> >>
> >>
> >> v2 of these patches did not align (or, trunc, really) in dma-map-out but
> >> v3 do, it is "swap dma-trunc swap" in dma-map-out of 1/4. SO I am really
> >> confused now.
> > 
> > Yah sorry I think the issue was on my end. I thought:
> > 
> >   https://github.com/aik/SLOF/commits/virtio-iommu
> 
> 
> My bad, I should have pushed it out since I started pushing them out.
> 
> The question now is if we are good to go and I can push these out and
> send SLOF blob update so people can actually start testing it? Thanks,

System came back after clearing GARD record, re-tested disk boot for
virtio-blk/virtio-scsi, and netboot via virtio-net, with iommu on/off and
everything seems to be working.

Still need to verify in an actual uv-capable system, but in terms of base
IOMMU enablement the series looks good:

Tested-by: Michael Roth <mdroth at linux.vnet.ibm.com>

> 
> 
> >   
> > reflected this series, but it does contain an older version of the
> > dma-map* patch:
> > 
> >   index f136324..be529c9 100644
> >   --- a/board-qemu/slof/pci-phb.fs
> >   +++ b/board-qemu/slof/pci-phb.fs
> >   @@ -146,6 +146,7 @@ setup-puid
> >    \ grub does not align allocated addresses to the size so when mapping,
> >    \ we might ask bm-alloc for an extra IOMMU page
> >    : dma-align ( size virt -- aligned-size ) tce-mask and + tce-ps #aligned ;
> >   +: dma-trunc ( addr -- addr&~fff) tce-mask not and ;
> >   
> >    : dma-map-in  ( virt size cachable? -- devaddr )
> >       phb-debug? IF cr ." dma-map-in called: " .s cr THEN
> >   @@ -163,7 +164,7 @@ setup-puid
> >       swap                             ( size dev-addr virt )
> >       2dup tce-mask and or >r         \ add page offset to the return value
> >   
> >   -   tce-mask not and 3 OR            \ Trunk virt and add read and write perm
> >   +   dma-trunc 3 OR                   \ Truncate and add read and write perm
> >       rot                             ( virt dev-addr size r: dev-addr )
> >       0
> >       ?DO
> >   @@ -179,10 +180,10 @@ setup-puid
> >    : dma-map-out  ( virt devaddr size -- )
> >       phb-debug? IF cr ." dma-map-out called: " .s cr THEN
> >       (init-dma-window-vars)
> >   -   rot drop            ( devaddr size )
> >   +   rot drop                        ( devaddr size )
> >       over dma-align
> >   -   2dup swap tce-mask not and swap bm-handle -rot
> >   -   bm-free
> >   +   swap dma-trunc swap             ( devaddr-trunc size-extended )
> >   +   2dup bm-handle -rot bm-free
> >       0
> >       ?DO
> >           dup 0 dma-window-liobn -rot
> > 
> > 
> > I haven't been able to re-test yet since something seems to have gone
> > awry with the test system last night, but I assume that's what the issue
> > was. Sorry for the confusion.
> > 
> >>
> >> It is true that checking differs between qemu's and kvm's h_put_tce
> >> handlers, I added this (see below) to my qemu and it still works, can
> >> you please try in your tree and capture the spapr_iommu_pci_put traces?
> >> Thanks,
> >>
> >>
> >> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
> >> index 3d3bcc86496a..feae27bf11fa 100644
> >> --- a/hw/ppc/spapr_iommu.c
> >> +++ b/hw/ppc/spapr_iommu.c
> >> @@ -564,6 +564,9 @@ static target_ulong h_put_tce(PowerPCCPU *cpu,
> >> SpaprMachineState *spapr,
> >>      if (tcet) {
> >>          hwaddr page_mask = IOMMU_PAGE_MASK(tcet->page_shift);
> >>
> >> +        if (ioba & ~page_mask)
> >> +            abort();
> >> +
> >>
> >>
> >>
> >>
> >> -- 
> >> Alexey
> 
> -- 
> Alexey


More information about the SLOF mailing list