[PATCH 2/2] uacce: add uacce module
zhangfei
zhangfei.gao at linaro.org
Fri Aug 23 19:21:33 AEST 2019
Hi, Jonathan
Thanks for your careful review and good suggestion.
Sorry for late response, I am checking one detail.
On 2019/8/16 上午12:54, Jonathan Cameron wrote:
> On Wed, 14 Aug 2019 17:34:25 +0800
> Zhangfei Gao <zhangfei.gao at linaro.org> wrote:
>
>> From: Kenneth Lee <liguozhu at hisilicon.com>
>>
>> Uacce is the kernel component to support WarpDrive accelerator
>> framework. It provides register/unregister interface for device drivers
>> to expose their hardware resource to the user space. The resource is
>> taken as "queue" in WarpDrive.
> It's a bit confusing to have both the term UACCE and WarpDrive in here.
> I'd just use the uacce name in all comments etc.
Yes, make sense
>
>> Uacce create a chrdev for every registration, the queue is allocated to
>> the process when the chrdev is opened. Then the process can access the
>> hardware resource by interact with the queue file. By mmap the queue
>> file space to user space, the process can directly put requests to the
>> hardware without syscall to the kernel space.
>>
>> Uacce also manages unify addresses between the hardware and user space
>> of the process. So they can share the same virtual address in the
>> communication.
>>
>> Signed-off-by: Kenneth Lee <liguozhu at hisilicon.com>
>> Signed-off-by: Zaibo Xu <xuzaibo at huawei.com>
>> Signed-off-by: Zhou Wang <wangzhou1 at hisilicon.com>
>> Signed-off-by: Zhangfei Gao <zhangfei.gao at linaro.org>
> I would strip this back to which ever case is of most interest (SVA I guess?)
> and only think about adding support for the others if necessary at a later date.
> (or in later patches).
Do you mean split the patch and send sva part first?
>> +
>> +static int uacce_qfr_alloc_pages(struct uacce_qfile_region *qfr)
>> +{
>> + int gfp_mask = GFP_ATOMIC | __GFP_ZERO;
> More readable to just have this inline.
Yes, all right.
>
>> + int i, j;
>> +
>> + qfr->pages = kcalloc(qfr->nr_pages, sizeof(*qfr->pages), gfp_mask);
> kcalloc is always set to zero anyway.
OK
>
>> +
>> +static struct uacce_qfile_region *
>> +uacce_create_region(struct uacce_queue *q, struct vm_area_struct *vma,
>> + enum uacce_qfrt type, unsigned int flags)
>> +{
>> + struct uacce_qfile_region *qfr;
>> + struct uacce *uacce = q->uacce;
>> + unsigned long vm_pgoff;
>> + int ret = -ENOMEM;
>> +
>> + dev_dbg(uacce->pdev, "create qfr (type=%x, flags=%x)\n", type, flags);
>> + qfr = kzalloc(sizeof(*qfr), GFP_ATOMIC);
>> + if (!qfr)
>> + return ERR_PTR(-ENOMEM);
>> +
>> + qfr->type = type;
>> + qfr->flags = flags;
>> + qfr->iova = vma->vm_start;
>> + qfr->nr_pages = vma_pages(vma);
>> +
>> + if (vma->vm_flags & VM_READ)
>> + qfr->prot |= IOMMU_READ;
>> +
>> + if (vma->vm_flags & VM_WRITE)
>> + qfr->prot |= IOMMU_WRITE;
>> +
>> + if (flags & UACCE_QFRF_SELFMT) {
>> + ret = uacce->ops->mmap(q, vma, qfr);
>> + if (ret)
>> + goto err_with_qfr;
>> + return qfr;
>> + }
>> +
>> + /* allocate memory */
>> + if (flags & UACCE_QFRF_DMA) {
>> + dev_dbg(uacce->pdev, "allocate dma %d pages\n", qfr->nr_pages);
>> + qfr->kaddr = dma_alloc_coherent(uacce->pdev, qfr->nr_pages <<
>> + PAGE_SHIFT, &qfr->dma,
>> + GFP_KERNEL);
>> + if (!qfr->kaddr) {
>> + ret = -ENOMEM;
>> + goto err_with_qfr;
>> + }
>> + } else {
>> + dev_dbg(uacce->pdev, "allocate %d pages\n", qfr->nr_pages);
>> + ret = uacce_qfr_alloc_pages(qfr);
>> + if (ret)
>> + goto err_with_qfr;
>> + }
>> +
>> + /* map to device */
>> + ret = uacce_queue_map_qfr(q, qfr);
>> + if (ret)
>> + goto err_with_pages;
>> +
>> + /* mmap to user space */
>> + if (flags & UACCE_QFRF_MMAP) {
>> + if (flags & UACCE_QFRF_DMA) {
>> +
>> + /* dma_mmap_coherent() requires vm_pgoff as 0
>> + * restore vm_pfoff to initial value for mmap()
>> + */
>> + dev_dbg(uacce->pdev, "mmap dma qfr\n");
>> + vm_pgoff = vma->vm_pgoff;
>> + vma->vm_pgoff = 0;
>> + ret = dma_mmap_coherent(uacce->pdev, vma, qfr->kaddr,
>> + qfr->dma,
>> + qfr->nr_pages << PAGE_SHIFT);
> Does setting vm_pgoff if (ret) make sense?
Since we need restore the environment, so restore vm_pgoff no matter
succeed or not.
>> + vma->vm_pgoff = vm_pgoff;
>> + } else {
>> + ret = uacce_queue_mmap_qfr(q, qfr, vma);
>> + }
>> +
>> + if (ret)
>> + goto err_with_mapped_qfr;
>> + }
>> +
>> + return qfr;
>> +
>> +err_with_mapped_qfr:
>> + uacce_queue_unmap_qfr(q, qfr);
>> +err_with_pages:
>> + if (flags & UACCE_QFRF_DMA)
>> + dma_free_coherent(uacce->pdev, qfr->nr_pages << PAGE_SHIFT,
>> + qfr->kaddr, qfr->dma);
>> + else
>> + uacce_qfr_free_pages(qfr);
>> +err_with_qfr:
>> + kfree(qfr);
>> +
>> + return ERR_PTR(ret);
>> +}
>> +
>> +/* we assume you have uacce_queue_unmap_qfr(q, qfr) from all related queues */
>> +static void uacce_destroy_region(struct uacce_queue *q,
>> + struct uacce_qfile_region *qfr)
>> +{
>> + struct uacce *uacce = q->uacce;
>> +
>> + if (qfr->flags & UACCE_QFRF_DMA) {
>> + dev_dbg(uacce->pdev, "free dma qfr %s (kaddr=%lx, dma=%llx)\n",
>> + uacce_qfrt_str(qfr), (unsigned long)qfr->kaddr,
>> + qfr->dma);
>> + dma_free_coherent(uacce->pdev, qfr->nr_pages << PAGE_SHIFT,
>> + qfr->kaddr, qfr->dma);
>> + } else if (qfr->pages) {
>> + if (qfr->flags & UACCE_QFRF_KMAP && qfr->kaddr) {
>> + dev_dbg(uacce->pdev, "vunmap qfr %s\n",
>> + uacce_qfrt_str(qfr));
>> + vunmap(qfr->kaddr);
>> + qfr->kaddr = NULL;
>> + }
>> +
>> + uacce_qfr_free_pages(qfr);
>> + }
>> + kfree(qfr);
>> +}
>> +
>> +static long uacce_cmd_share_qfr(struct uacce_queue *tgt, int fd)
>> +{
>> + struct file *filep = fget(fd);
> That's not a trivial assignment so I would not have it up here.
>
>> + struct uacce_queue *src;
>> + int ret = -EINVAL;
>> +
> filep = fget(fd);
Yes, make sense.
>> + if (!filep)
>> + return ret;
>> +
>> + if (filep->f_op != &uacce_fops)
>> + goto out_with_fd;
>> +
>> + src = filep->private_data;
>> + if (!src)
>> + goto out_with_fd;
>> +
>> + /* no share sva is needed if the dev can do fault-from-dev */
>> + if (tgt->uacce->flags & UACCE_DEV_FAULT_FROM_DEV)
>> + goto out_with_fd;
>> +
>> + dev_dbg(&src->uacce->dev, "share ss with %s\n",
>> + dev_name(&tgt->uacce->dev));
>> +
>> + uacce_qs_wlock();
>> + if (!src->qfrs[UACCE_QFRT_SS] || tgt->qfrs[UACCE_QFRT_SS])
>> + goto out_with_lock;
>> +
>> + ret = uacce_queue_map_qfr(tgt, src->qfrs[UACCE_QFRT_SS]);
>> + if (ret)
>> + goto out_with_lock;
>> +
>> + tgt->qfrs[UACCE_QFRT_SS] = src->qfrs[UACCE_QFRT_SS];
>> + list_add(&tgt->list, &src->qfrs[UACCE_QFRT_SS]->qs);
>> + ret = 0;
>> +
>> +out_with_lock:
>> + uacce_qs_wunlock();
>> +out_with_fd:
>> + fput(filep);
>> + return ret;
>> +}
>> +
>> +static int uacce_start_queue(struct uacce_queue *q)
>> +{
>> + int ret, i, j;
>> + struct uacce_qfile_region *qfr;
>> + struct device *dev = &q->uacce->dev;
>> +
>> + /*
>> + * map KMAP qfr to kernel
>> + * vmap should be done in non-spinlocked context!
>> + */
>> + for (i = 0; i < UACCE_QFRT_MAX; i++) {
>> + qfr = q->qfrs[i];
>> + if (qfr && (qfr->flags & UACCE_QFRF_KMAP) && !qfr->kaddr) {
>> + qfr->kaddr = vmap(qfr->pages, qfr->nr_pages, VM_MAP,
>> + PAGE_KERNEL);
>> + if (!qfr->kaddr) {
>> + ret = -ENOMEM;
>> + dev_dbg(dev, "fail to kmap %s qfr(%d pages)\n",
>> + uacce_qfrt_str(qfr), qfr->nr_pages);
> If it's useful, dev_err.
OK
>> +static long uacce_fops_unl_ioctl(struct file *filep,
>> + unsigned int cmd, unsigned long arg)
>> +{
>> + struct uacce_queue *q = filep->private_data;
>> + struct uacce *uacce = q->uacce;
>> +
>> + switch (cmd) {
>> + case UACCE_CMD_SHARE_SVAS:
>> + return uacce_cmd_share_qfr(q, arg);
>> +
>> + case UACCE_CMD_START:
>> + return uacce_start_queue(q);
>> +
>> + case UACCE_CMD_GET_SS_DMA:
>> + return uacce_get_ss_dma(q, (void __user *)arg);
>> +
>> + default:
>> + if (uacce->ops->ioctl)
>> + return uacce->ops->ioctl(q, cmd, arg);
>> +
>> + dev_err(&uacce->dev, "ioctl cmd (%d) is not supported!\n", cmd);
>> + return -EINVAL;
> Flip the logic so the error is the indented path.
> if (!uacce->ops->ioctl) ...
>
> Fits better with typical coding model in a kernel reviewers head
> - well mine anyway ;)
OK, make sense.
>> +
>> +static int uacce_fops_open(struct inode *inode, struct file *filep)
>> +{
>> + struct uacce_queue *q;
>> + struct iommu_sva *handle = NULL;
>> + struct uacce *uacce;
>> + int ret;
>> + int pasid = 0;
>> +
>> + uacce = idr_find(&uacce_idr, iminor(inode));
>> + if (!uacce)
>> + return -ENODEV;
>> +
>> + if (atomic_read(&uacce->state) == UACCE_ST_RST)
>> + return -EINVAL;
>> +
>> + if (!uacce->ops->get_queue)
>> + return -EINVAL;
>> +
>> + if (!try_module_get(uacce->pdev->driver->owner))
>> + return -ENODEV;
>> +
>> + ret = uacce_dev_open_check(uacce);
>> + if (ret)
>> + goto open_err;
>> +
>> +#ifdef CONFIG_IOMMU_SVA
>> + if (uacce->flags & UACCE_DEV_PASID) {
>> + handle = iommu_sva_bind_device(uacce->pdev, current->mm, NULL);
>> + if (IS_ERR(handle))
>> + goto open_err;
>> + pasid = iommu_sva_get_pasid(handle);
>> + }
>> +#endif
>> + ret = uacce->ops->get_queue(uacce, pasid, &q);
>> + if (ret < 0)
>> + goto open_err;
>> +
>> + q->pasid = pasid;
>> + q->handle = handle;
>> + q->uacce = uacce;
>> + q->mm = current->mm;
>> + memset(q->qfrs, 0, sizeof(q->qfrs));
>> + INIT_LIST_HEAD(&q->list);
>> + init_waitqueue_head(&q->wait);
>> + filep->private_data = q;
>> + mutex_lock(&uacce->q_lock);
>> + list_add(&q->q_dev, &uacce->qs);
>> + mutex_unlock(&uacce->q_lock);
>> +
>> + return 0;
> blank line
OK.
>> +open_err:
>> + module_put(uacce->pdev->driver->owner);
>> + return ret;
>> +}
>> +
>> +static int uacce_fops_release(struct inode *inode, struct file *filep)
>> +{
>> + struct uacce_queue *q = (struct uacce_queue *)filep->private_data;
>> + struct uacce_qfile_region *qfr;
>> + struct uacce *uacce = q->uacce;
>> + int i;
>> + bool is_to_free_region;
>> + int free_pages = 0;
>> +
>> + mutex_lock(&uacce->q_lock);
>> + list_del(&q->q_dev);
>> + mutex_unlock(&uacce->q_lock);
>> +
>> + if (atomic_read(&uacce->state) == UACCE_ST_STARTED &&
>> + uacce->ops->stop_queue)
>> + uacce->ops->stop_queue(q);
>> +
>> + uacce_qs_wlock();
>> +
>> + for (i = 0; i < UACCE_QFRT_MAX; i++) {
>> + qfr = q->qfrs[i];
>> + if (!qfr)
>> + continue;
>> +
>> + is_to_free_region = false;
>> + uacce_queue_unmap_qfr(q, qfr);
>> + if (i == UACCE_QFRT_SS) {
>> + list_del(&q->list);
>> + if (list_empty(&qfr->qs))
>> + is_to_free_region = true;
>> + } else
>> + is_to_free_region = true;
>> +
>> + if (is_to_free_region) {
>> + free_pages += qfr->nr_pages;
>> + uacce_destroy_region(q, qfr);
>> + }
>> +
>> + qfr = NULL;
>> + }
>> +
>> + uacce_qs_wunlock();
>> +
>> + if (current->mm == q->mm) {
>> + down_write(&q->mm->mmap_sem);
>> + q->mm->data_vm -= free_pages;
>> + up_write(&q->mm->mmap_sem);
>> + }
>> +
>> +#ifdef CONFIG_IOMMU_SVA
>> + if (uacce->flags & UACCE_DEV_PASID)
>> + iommu_sva_unbind_device(q->handle);
>> +#endif
>> +
>> + if (uacce->ops->put_queue)
>> + uacce->ops->put_queue(q);
>> +
>> + dev_dbg(&uacce->dev, "uacce state switch to INIT\n");
>> + atomic_set(&uacce->state, UACCE_ST_INIT);
>> + module_put(uacce->pdev->driver->owner);
> blank line here.
>
>> + return 0;
>> +}
>> +
>> +static enum uacce_qfrt uacce_get_region_type(struct uacce *uacce,
>> + struct vm_area_struct *vma)
>> +{
>> + enum uacce_qfrt type = UACCE_QFRT_MAX;
>> + int i;
>> + size_t next_start = UACCE_QFR_NA;
>> +
>> + for (i = UACCE_QFRT_MAX - 1; i >= 0; i--) {
>> + if (vma->vm_pgoff >= uacce->qf_pg_start[i]) {
>> + type = i;
>> + break;
>> + }
>> + }
>> +
>> + switch (type) {
>> + case UACCE_QFRT_MMIO:
>> + if (!uacce->ops->mmap) {
>> + dev_err(&uacce->dev, "no driver mmap!\n");
>> + return UACCE_QFRT_INVALID;
>> + }
>> + break;
>> +
>> + case UACCE_QFRT_DKO:
>> + if ((uacce->flags & UACCE_DEV_PASID) ||
>> + (uacce->flags & UACCE_DEV_NOIOMMU))
>> + return UACCE_QFRT_INVALID;
>> + break;
>> +
>> + case UACCE_QFRT_DUS:
>> + break;
>> +
>> + case UACCE_QFRT_SS:
>> + /* todo: this can be valid to protect the process space */
>> + if (uacce->flags & UACCE_DEV_FAULT_FROM_DEV)
>> + return UACCE_QFRT_INVALID;
>> + break;
>> +
>> + default:
>> + dev_err(&uacce->dev, "uacce bug (%d)!\n", type);
>> + return UACCE_QFRT_INVALID;
>> + }
>> +
>> + /* make sure the mapping size is exactly the same as the region */
>> + if (type < UACCE_QFRT_SS) {
>> + for (i = type + 1; i < UACCE_QFRT_MAX; i++)
>> + if (uacce->qf_pg_start[i] != UACCE_QFR_NA) {
>> + next_start = uacce->qf_pg_start[i];
>> + break;
>> + }
>> +
>> + if (next_start == UACCE_QFR_NA) {
>> + dev_err(&uacce->dev, "uacce config error: SS offset set improperly\n");
>> + return UACCE_QFRT_INVALID;
>> + }
>> +
>> + if (vma_pages(vma) !=
>> + next_start - uacce->qf_pg_start[type]) {
>> + dev_err(&uacce->dev, "invalid mmap size (%ld vs %ld pages) for region %s.\n",
>> + vma_pages(vma),
>> + next_start - uacce->qf_pg_start[type],
>> + qfrt_str[type]);
>> + return UACCE_QFRT_INVALID;
>> + }
>> + }
>> +
>> + return type;
>> +}
>> +
>> +static int uacce_fops_mmap(struct file *filep, struct vm_area_struct *vma)
>> +{
>> + struct uacce_queue *q = (struct uacce_queue *)filep->private_data;
> As below, cast not needed.
OK, thanks
>
>> + struct uacce *uacce = q->uacce;
>> + enum uacce_qfrt type = uacce_get_region_type(uacce, vma);
>> + struct uacce_qfile_region *qfr;
>> + unsigned int flags = 0;
>> + int ret;
>> +
>> + dev_dbg(&uacce->dev, "mmap q file(t=%s, off=%lx, start=%lx, end=%lx)\n",
>> + qfrt_str[type], vma->vm_pgoff, vma->vm_start, vma->vm_end);
>> +
>> + if (type == UACCE_QFRT_INVALID)
>> + return -EINVAL;
>> +
>> + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
>> +
>> + uacce_qs_wlock();
>> +
>> + /* fixme: if the region need no pages, we don't need to check it */
>> + if (q->mm->data_vm + vma_pages(vma) >
>> + rlimit(RLIMIT_DATA) >> PAGE_SHIFT) {
>> + ret = -ENOMEM;
>> + goto out_with_lock;
>> + }
>> +
>> + if (q->qfrs[type]) {
>> + ret = -EBUSY;
>> + goto out_with_lock;
>> + }
>> +
>> + switch (type) {
>> + case UACCE_QFRT_MMIO:
>> + flags = UACCE_QFRF_SELFMT;
>> + break;
>> +
>> + case UACCE_QFRT_SS:
>> + if (atomic_read(&uacce->state) != UACCE_ST_STARTED) {
>> + ret = -EINVAL;
>> + goto out_with_lock;
>> + }
>> +
>> + flags = UACCE_QFRF_MAP | UACCE_QFRF_MMAP;
>> +
>> + if (uacce->flags & UACCE_DEV_NOIOMMU)
>> + flags |= UACCE_QFRF_DMA;
>> + break;
>> +
>> + case UACCE_QFRT_DKO:
>> + flags = UACCE_QFRF_MAP | UACCE_QFRF_KMAP;
>> +
>> + if (uacce->flags & UACCE_DEV_NOIOMMU)
>> + flags |= UACCE_QFRF_DMA;
>> + break;
>> +
>> + case UACCE_QFRT_DUS:
>> + if ((uacce->flags & UACCE_DEV_NOIOMMU) ||
>> + (uacce->flags & UACCE_DEV_PASID)) {
>> + flags = UACCE_QFRF_SELFMT;
>> + break;
>> + }
>> +
>> + flags = UACCE_QFRF_MAP | UACCE_QFRF_MMAP;
>> + break;
>> +
>> + default:
>> + WARN_ON(&uacce->dev);
>> + break;
>> + }
>> +
>> + qfr = uacce_create_region(q, vma, type, flags);
>> + if (IS_ERR(qfr)) {
>> + ret = PTR_ERR(qfr);
>> + goto out_with_lock;
>> + }
>> + q->qfrs[type] = qfr;
>> +
>> + if (type == UACCE_QFRT_SS) {
>> + INIT_LIST_HEAD(&qfr->qs);
>> + list_add(&q->list, &q->qfrs[type]->qs);
>> + }
>> +
>> + uacce_qs_wunlock();
>> +
>> + if (qfr->pages)
>> + q->mm->data_vm += qfr->nr_pages;
>> +
>> + return 0;
>> +
>> +out_with_lock:
>> + uacce_qs_wunlock();
>> + return ret;
>> +}
>> +
>> +static __poll_t uacce_fops_poll(struct file *file, poll_table *wait)
>> +{
>> + struct uacce_queue *q = (struct uacce_queue *)file->private_data;
> Private data is a void * so no need to cast explicitly.
>
> struct uacce_queue *q = file->private_data;
OK
>> +static ssize_t api_show(struct device *dev,
>> + struct device_attribute *attr, char *buf)
>> +{
>> + struct uacce *uacce = UACCE_FROM_CDEV_ATTR(dev);
>> +
>> + return sprintf(buf, "%s\n", uacce->api_ver);
>> +}
>> +
>> +static ssize_t numa_distance_show(struct device *dev,
>> + struct device_attribute *attr,
>> + char *buf)
>> +{
>> + struct uacce *uacce = UACCE_FROM_CDEV_ATTR(dev);
>> + int distance = 0;
>> +
>> +#ifdef CONFIG_NUMA
>> + distance = cpu_to_node(smp_processor_id()) - uacce->pdev->numa_node;
> What is this function supposed to return?
> It currently returns the absolute difference in node number.
> I suppose if that is 0, then it means local node, but all other values
> have no sensible meaning.
>
> Perhaps use node_distance()? That should give you the SLIT distance
> so 10 for local and bigger for everything else.
Yes, node_distance is better and consider #ifdef CONFIG_NUMA
>> +#endif
>> + return sprintf(buf, "%d\n", abs(distance));
>> +}
>> +
>> +static ssize_t node_id_show(struct device *dev,
>> + struct device_attribute *attr,
>> + char *buf)
>> +{
>> + struct uacce *uacce = UACCE_FROM_CDEV_ATTR(dev);
>> + int node_id = -1;
>> +
>> +#ifdef CONFIG_NUMA
>> + node_id = uacce->pdev->numa_node;
> use dev_to_node(uacce->pdev) which already does this protection for you.
Yes, dev_to_node is better
>> +#endif
>> + return sprintf(buf, "%d\n", node_id);
>> +}
>> +
>> +static ssize_t flags_show(struct device *dev,
>> + struct device_attribute *attr,
>> + char *buf)
>> +{
>> + struct uacce *uacce = UACCE_FROM_CDEV_ATTR(dev);
>> +
>> + return sprintf(buf, "%d\n", uacce->flags);
>> +}
>> +
>> +static ssize_t available_instances_show(struct device *dev,
>> + struct device_attribute *attr,
>> + char *buf)
>> +{
>> + struct uacce *uacce = UACCE_FROM_CDEV_ATTR(dev);
>> +
>> + return sprintf(buf, "%d\n", uacce->ops->get_available_instances(uacce));
>> +}
>> +
>> +static ssize_t algorithms_show(struct device *dev,
>> + struct device_attribute *attr,
>> + char *buf)
>> +{
>> + struct uacce *uacce = UACCE_FROM_CDEV_ATTR(dev);
>> +
>> + return sprintf(buf, "%s", uacce->algs);
>> +}
>> +
>> +static ssize_t qfrs_offset_show(struct device *dev,
>> + struct device_attribute *attr,
>> + char *buf)
>> +{
>> + struct uacce *uacce = UACCE_FROM_CDEV_ATTR(dev);
>> + int i, ret;
>> + unsigned long offset;
>> +
>> + for (i = 0, ret = 0; i < UACCE_QFRT_MAX; i++) {
>> + offset = uacce->qf_pg_start[i];
>> + if (offset != UACCE_QFR_NA)
>> + offset = offset << PAGE_SHIFT;
>> + if (i == UACCE_QFRT_SS)
>> + break;
>> + ret += sprintf(buf + ret, "%lu\t", offset);
>> + }
>> + ret += sprintf(buf + ret, "%lu\n", offset);
>> +
>> + return ret;
>> +}
>> +
>> +static DEVICE_ATTR_RO(id);
>> +static DEVICE_ATTR_RO(api);
>> +static DEVICE_ATTR_RO(numa_distance);
>> +static DEVICE_ATTR_RO(node_id);
>> +static DEVICE_ATTR_RO(flags);
>> +static DEVICE_ATTR_RO(available_instances);
>> +static DEVICE_ATTR_RO(algorithms);
>> +static DEVICE_ATTR_RO(qfrs_offset);
>> +
>> +static struct attribute *uacce_dev_attrs[] = {
> New ABI. All needs documenting in
>
> Documentation/ABI/
ok
>
>> + &dev_attr_id.attr,
>> + &dev_attr_api.attr,
>> + &dev_attr_node_id.attr,
>> + &dev_attr_numa_distance.attr,
>> + &dev_attr_flags.attr,
>> + &dev_attr_available_instances.attr,
>> + &dev_attr_algorithms.attr,
>> + &dev_attr_qfrs_offset.attr,
>> + NULL,
>> +};
>> +
>> +static const struct attribute_group uacce_dev_attr_group = {
>> + .name = UACCE_DEV_ATTRS,
>> + .attrs = uacce_dev_attrs,
>> +};
>> +
>> +static const struct attribute_group *uacce_dev_attr_groups[] = {
>> + &uacce_dev_attr_group,
>> + NULL
>> +};
>> +
>> +static int uacce_create_chrdev(struct uacce *uacce)
>> +{
>> + int ret;
>> +
>> + ret = idr_alloc(&uacce_idr, uacce, 0, 0, GFP_KERNEL);
>> + if (ret < 0)
>> + return ret;
>> +
>> + cdev_init(&uacce->cdev, &uacce_fops);
>> + uacce->dev_id = ret;
>> + uacce->cdev.owner = THIS_MODULE;
>> + device_initialize(&uacce->dev);
>> + uacce->dev.devt = MKDEV(MAJOR(uacce_devt), uacce->dev_id);
>> + uacce->dev.class = uacce_class;
>> + uacce->dev.groups = uacce_dev_attr_groups;
>> + uacce->dev.parent = uacce->pdev;
>> + dev_set_name(&uacce->dev, "%s-%d", uacce->drv_name, uacce->dev_id);
>> + ret = cdev_device_add(&uacce->cdev, &uacce->dev);
>> + if (ret)
>> + goto err_with_idr;
>> +
>> + dev_dbg(&uacce->dev, "create uacce minior=%d\n", uacce->dev_id);
>> + return 0;
>> +
>> +err_with_idr:
>> + idr_remove(&uacce_idr, uacce->dev_id);
>> + return ret;
>> +}
>> +
>> +static void uacce_destroy_chrdev(struct uacce *uacce)
>> +{
>> + cdev_device_del(&uacce->cdev, &uacce->dev);
>> + idr_remove(&uacce_idr, uacce->dev_id);
>> +}
>> +
>> +static int uacce_default_get_available_instances(struct uacce *uacce)
>> +{
> Does this one ever make sense for a real device?
>
>> + return -1;
>> +}
>> +
>> +static int uacce_default_start_queue(struct uacce_queue *q)
>> +{
>> + dev_dbg(&q->uacce->dev, "fake start queue");
> Does this ever make sense on a real device?
Will remove these two default funcs.
>
>> + return 0;
>> +}
>> +
>> +static int uacce_dev_match(struct device *dev, void *data)
>> +{
>> + if (dev->parent == data)
>> + return -EBUSY;
>> +
>> + return 0;
>> +}
>> +
>> +/* Borrowed from VFIO to fix msi translation */
>> +static bool uacce_iommu_has_sw_msi(struct iommu_group *group,
>> + phys_addr_t *base)
>> +{
>> + struct list_head group_resv_regions;
>> + struct iommu_resv_region *region, *next;
>> + bool ret = false;
>> +
>> + INIT_LIST_HEAD(&group_resv_regions);
>> + iommu_get_group_resv_regions(group, &group_resv_regions);
>> + list_for_each_entry(region, &group_resv_regions, list) {
>> + pr_debug("uacce: find a resv region (%d) on %llx\n",
>> + region->type, region->start);
>> +
>> + /*
>> + * The presence of any 'real' MSI regions should take
>> + * precedence over the software-managed one if the
>> + * IOMMU driver happens to advertise both types.
>> + */
>> + if (region->type == IOMMU_RESV_MSI) {
>> + ret = false;
>> + break;
>> + }
>> +
>> + if (region->type == IOMMU_RESV_SW_MSI) {
>> + *base = region->start;
>> + ret = true;
>> + }
>> + }
>> + list_for_each_entry_safe(region, next, &group_resv_regions, list)
>> + kfree(region);
>> + return ret;
>> +}
>> +
>> +static int uacce_set_iommu_domain(struct uacce *uacce)
>> +{
>> + struct iommu_domain *domain;
>> + struct iommu_group *group;
>> + struct device *dev = uacce->pdev;
>> + bool resv_msi;
>> + phys_addr_t resv_msi_base = 0;
>> + int ret;
>> +
>> + if ((uacce->flags & UACCE_DEV_NOIOMMU) ||
>> + (uacce->flags & UACCE_DEV_PASID))
>> + return 0;
>> +
>> + /*
>> + * We don't support multiple register for the same dev in RFC version ,
>> + * will add it in formal version
> So this effectively multiple complete uacce interfaces for one device.
> Is there a known usecase for that?
Here is preventing one device with multiple algorithm and register
multi-times,
and without sva, they can not be distinguished.
>> + */
>> + ret = class_for_each_device(uacce_class, NULL, uacce->pdev,
>> + uacce_dev_match);
>> + if (ret)
>> + return ret;
>> +
>> + /* allocate and attach a unmanged domain */
>> + domain = iommu_domain_alloc(uacce->pdev->bus);
>> + if (!domain) {
>> + dev_dbg(&uacce->dev, "cannot get domain for iommu\n");
>> + return -ENODEV;
>> + }
>> +
>> + ret = iommu_attach_device(domain, uacce->pdev);
>> + if (ret)
>> + goto err_with_domain;
>> +
>> + if (iommu_capable(dev->bus, IOMMU_CAP_CACHE_COHERENCY)) {
>> + uacce->prot |= IOMMU_CACHE;
>> + dev_dbg(dev, "Enable uacce with c-coherent capa\n");
>> + } else
>> + dev_dbg(dev, "Enable uacce without c-coherent capa\n");
> Do those debug statements add anything? I'd also like a comment to explain
> why we care here.
OK,
>> +
>> + group = iommu_group_get(dev);
>> + if (!group) {
>> + ret = -EINVAL;
>> + goto err_with_domain;
>> + }
>> +
>> + resv_msi = uacce_iommu_has_sw_msi(group, &resv_msi_base);
>> + iommu_group_put(group);
>> +
>> + if (resv_msi) {
>> + if (!irq_domain_check_msi_remap() &&
>> + !iommu_capable(dev->bus, IOMMU_CAP_INTR_REMAP)) {
>> + dev_warn(dev, "No interrupt remapping support!");
>> + ret = -EPERM;
>> + goto err_with_domain;
>> + }
>> +
>> + dev_dbg(dev, "Set resv msi %llx on iommu domain\n",
>> + (u64)resv_msi_base);
>> + ret = iommu_get_msi_cookie(domain, resv_msi_base);
>> + if (ret)
>> + goto err_with_domain;
>> + }
>> +
>> + return 0;
>> +
>> +err_with_domain:
>> + iommu_domain_free(domain);
>> + return ret;
>> +}
>> +
>> +static void uacce_unset_iommu_domain(struct uacce *uacce)
>> +{
>> + struct iommu_domain *domain;
>> +
>> + if ((uacce->flags & UACCE_DEV_NOIOMMU) ||
>> + (uacce->flags & UACCE_DEV_PASID))
>> + return;
>> +
>> + domain = iommu_get_domain_for_dev(uacce->pdev);
>> + if (domain) {
>> + iommu_detach_device(domain, uacce->pdev);
>> + iommu_domain_free(domain);
>> + } else
>> + dev_err(&uacce->dev, "bug: no domain attached to device\n");
> Given this is an error path, perhaps the following flow is easier to read (slightly)
>
> if (!domain) {
> dev_err(&uacce->dev, "bug: no domain attached to device\n");
> return;
> }
>
> iommu_detach_device(domain, uacce->pdev);
> iommu_domain_free(domain);
> }
Yes, this is much better.
>> +}
>> +
>> +/**
>> + * uacce_register - register an accelerator
>> + * @uacce: the accelerator structure
>> + */
>> +int uacce_register(struct uacce *uacce)
>> +{
>> + int ret;
>> +
>> + if (!uacce->pdev) {
>> + pr_debug("uacce parent device not set\n");
>> + return -ENODEV;
>> + }
>> +
>> + if (uacce->flags & UACCE_DEV_NOIOMMU) {
>> + add_taint(TAINT_CRAP, LOCKDEP_STILL_OK);
>> + dev_warn(uacce->pdev,
>> + "Register to noiommu mode, which export kernel data to user space and may vulnerable to attack");
> Greg already covered this one ;)
>
>> + }
>> +
>> + /* if dev support fault-from-dev, it should support pasid */
>> + if ((uacce->flags & UACCE_DEV_FAULT_FROM_DEV) &&
>> + !(uacce->flags & UACCE_DEV_PASID)) {
>> + dev_warn(&uacce->dev, "SVM/SAV device should support PASID\n");
>> + return -EINVAL;
>> + }
>> +
>> + if (!uacce->ops->start_queue)
>> + uacce->ops->start_queue = uacce_default_start_queue;
>> +
>> + if (!uacce->ops->get_available_instances)
>> + uacce->ops->get_available_instances =
>> + uacce_default_get_available_instances;
>> +
>> +#ifdef CONFIG_IOMMU_SVA
>> + if (uacce->flags & UACCE_DEV_PASID) {
>> + ret = iommu_dev_enable_feature(uacce->pdev, IOMMU_DEV_FEAT_SVA);
>> + if (ret)
>> + uacce->flags &= ~(UACCE_DEV_FAULT_FROM_DEV |
>> + UACCE_DEV_PASID);
>> + }
>> +#endif
>> +
>> + ret = uacce_set_iommu_domain(uacce);
>> + if (ret)
>> + return ret;
>> +
>> + mutex_lock(&uacce_mutex);
>> +
>> + ret = uacce_create_chrdev(uacce);
>> + if (ret)
>> + goto err_with_lock;
>> +
>> + dev_dbg(&uacce->dev, "uacce state initialized to INIT");
> Not sure that tells us much of interest. Probably clean out most of the
> dev_dbg statements. Useful when developing but once it works they
> add lines of code that don't do anything useful.
Will remove most dev_dbg.
>
>> + atomic_set(&uacce->state, UACCE_ST_INIT);
>> + INIT_LIST_HEAD(&uacce->qs);
>> + mutex_init(&uacce->q_lock);
>> + mutex_unlock(&uacce_mutex);
>> +
> One blank line is almost always enough.
>
>> +
>> + return 0;
>> +
>> +err_with_lock:
>> + mutex_unlock(&uacce_mutex);
>> + return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(uacce_register);
>> +
>> +/**
>> + * uacce_unregister - unregisters a uacce
>> + * @uacce: the accelerator to unregister
>> + *
>> + * Unregister an accelerator that wat previously successully registered with
>> + * uacce_register().
>> + */
>> +void uacce_unregister(struct uacce *uacce)
>> +{
>> + mutex_lock(&uacce_mutex);
>> +
>> +#ifdef CONFIG_IOMMU_SVA
>> + if (uacce->flags & UACCE_DEV_PASID)
>> + iommu_dev_disable_feature(uacce->pdev, IOMMU_DEV_FEAT_SVA);
>> +#endif
>> + uacce_unset_iommu_domain(uacce);
>> +
>> + uacce_destroy_chrdev(uacce);
>> +
>> + mutex_unlock(&uacce_mutex);
>> +}
>> +EXPORT_SYMBOL_GPL(uacce_unregister);
>> +
>> +static int __init uacce_init(void)
>> +{
>> + int ret;
>> +
>> + uacce_class = class_create(THIS_MODULE, UACCE_CLASS_NAME);
>> + if (IS_ERR(uacce_class)) {
>> + ret = PTR_ERR(uacce_class);
>> + goto err;
>> + }
>> +
>> + ret = alloc_chrdev_region(&uacce_devt, 0, MINORMASK, "uacce");
>> + if (ret)
>> + goto err_with_class;
>> +
>> + pr_info("uacce init with major number:%d\n", MAJOR(uacce_devt));
>> +
>> + return 0;
>> +
>> +err_with_class:
>> + class_destroy(uacce_class);
>> +err:
>> + return ret;
>> +}
>> +
>> +static __exit void uacce_exit(void)
>> +{
>> + unregister_chrdev_region(uacce_devt, MINORMASK);
>> + class_destroy(uacce_class);
>> + idr_destroy(&uacce_idr);
>> +}
>> +
>> +subsys_initcall(uacce_init);
>> +module_exit(uacce_exit);
>> +
>> +MODULE_LICENSE("GPL");
>> +MODULE_AUTHOR("Hisilicon Tech. Co., Ltd.");
>> +MODULE_DESCRIPTION("Accelerator interface for Userland applications");
>> diff --git a/include/linux/uacce.h b/include/linux/uacce.h
>> new file mode 100644
>> index 0000000..fe2f6f4
>> --- /dev/null
>> +++ b/include/linux/uacce.h
>> @@ -0,0 +1,109 @@
>> +/* SPDX-License-Identifier: GPL-2.0-or-later */
>> +#ifndef __UACCE_H
>> +#define __UACCE_H
>> +
>> +#include <linux/cdev.h>
>> +#include <linux/device.h>
>> +#include <linux/fs.h>
>> +#include <linux/list.h>
>> +#include <linux/iommu.h>
>> +#include <uapi/misc/uacce.h>
>> +
>> +struct uacce_queue;
>> +struct uacce;
>> +
>> +/* uacce mode of the driver */
>> +#define UACCE_MODE_NOUACCE 0 /* don't use uacce */
>> +#define UACCE_MODE_UACCE 1 /* use uacce exclusively */
>> +#define UACCE_MODE_NOIOMMU 2 /* use uacce noiommu mode */
>> +
>> +#define UACCE_QFRF_MAP BIT(0) /* map to current queue */
>> +#define UACCE_QFRF_MMAP BIT(1) /* map to user space */
>> +#define UACCE_QFRF_KMAP BIT(2) /* map to kernel space */
>> +#define UACCE_QFRF_DMA BIT(3) /* use dma api for the region */
>> +#define UACCE_QFRF_SELFMT BIT(4) /* self maintained qfr */
>> +
>> +struct uacce_qfile_region {
> I'd like to see kernel doc for all the structures in here.
Will add description of each member above this structure in this file.
>> + enum uacce_qfrt type;
>> + unsigned long iova; /* iova share between user and device space */
>> + struct page **pages;
>> + int nr_pages;
>> + int prot;
>> + unsigned int flags;
>> + struct list_head qs; /* qs sharing the same region, for ss */
>> + void *kaddr; /* kernel addr */
>> + dma_addr_t dma; /* dma address, if created by dma api */
>> +};
>> +
>> +/**
>> + * struct uacce_ops - WD device operations
> Make sure you don't miss out elements in the docs.
>
> get_available_instances.
>
> Easy to check, just build the kernel doc.
>
>> + * @get_queue: get a queue from the device according to algorithm
> Not obvious in this case what 'according to algorithm' is referring to as
> the function takes simply "unsigned long arg"
Yes, a bit confusing, will remove it.
>
>> + * @put_queue: free a queue to the device
>> + * @start_queue: make the queue start work after get_queue
>> + * @stop_queue: make the queue stop work before put_queue
>> + * @is_q_updated: check whether the task is finished
>> + * @mask_notify: mask the task irq of queue
>> + * @mmap: mmap addresses of queue to user space
>> + * @reset: reset the WD device
> uacce device?
>
>> + * @reset_queue: reset the queue
>> + * @ioctl: ioctl for user space users of the queue
> Extra spaces after : compared to other entries.
>
>> + */
>> +struct uacce_ops {
>> + int (*get_available_instances)(struct uacce *uacce);
>> + int (*get_queue)(struct uacce *uacce, unsigned long arg,
>> + struct uacce_queue **q);
>> + void (*put_queue)(struct uacce_queue *q);
>> + int (*start_queue)(struct uacce_queue *q);
>> + void (*stop_queue)(struct uacce_queue *q);
>> + int (*is_q_updated)(struct uacce_queue *q);
>> + void (*mask_notify)(struct uacce_queue *q, int event_mask);
>> + int (*mmap)(struct uacce_queue *q, struct vm_area_struct *vma,
>> + struct uacce_qfile_region *qfr);
>> + int (*reset)(struct uacce *uacce);
>> + int (*reset_queue)(struct uacce_queue *q);
>> + long (*ioctl)(struct uacce_queue *q, unsigned int cmd,
>> + unsigned long arg);
>> +};
>> +
>> +struct uacce_queue {
>> + struct uacce *uacce;
>> + void *priv;
>> + wait_queue_head_t wait;
>> + int pasid;
>> + struct iommu_sva *handle;
>> + struct list_head list; /* share list for qfr->qs */
>> + struct mm_struct *mm;
>> + struct uacce_qfile_region *qfrs[UACCE_QFRT_MAX];
>> + struct list_head q_dev;
>> +};
>> +
>> +#define UACCE_ST_INIT 0
>> +#define UACCE_ST_OPENED 1
>> +#define UACCE_ST_STARTED 2
>> +#define UACCE_ST_RST 3
> These seem to be states in a state machine, perhaps an enum
> is more suited as their actual values don't matter (I think!)
Yes, enum is better.
Thanks
More information about the Linux-accelerators
mailing list