X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?a=blobdiff_plain;f=qemu%2Fhw%2Fvfio%2Fcommon.c;fp=qemu%2Fhw%2Fvfio%2Fcommon.c;h=f27db36fb3aee03736621e77cdbb10a8ac4d1a6f;hb=437fd90c0250dee670290f9b714253671a990160;hp=85ee9b005edc7a46c109e91dd69fd3898edf4e62;hpb=5bbd6fe9b8bab2a93e548c5a53b032d1939eec05;p=kvmfornfv.git

diff --git a/qemu/hw/vfio/common.c b/qemu/hw/vfio/common.c
index 85ee9b005..f27db36fb 100644
--- a/qemu/hw/vfio/common.c
+++ b/qemu/hw/vfio/common.c
@@ -18,6 +18,7 @@
  * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
  */
 
+#include "qemu/osdep.h"
 #include
 #include
 #include
@@ -226,7 +227,7 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
     /*
      * Try the mapping, if it fails with EBUSY, unmap the region and try
      * again. This shouldn't be necessary, but we sometimes see it in
-     * the the VGA ROM space.
+     * the VGA ROM space.
      */
     if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
         (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
@@ -312,13 +313,17 @@ out:
     rcu_read_unlock();
 }
 
+static hwaddr vfio_container_granularity(VFIOContainer *container)
+{
+    return (hwaddr)1 << ctz64(container->iova_pgsizes);
+}
+
 static void vfio_listener_region_add(MemoryListener *listener,
                                      MemoryRegionSection *section)
 {
-    VFIOContainer *container = container_of(listener, VFIOContainer,
-                                            iommu_data.type1.listener);
+    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
     hwaddr iova, end;
-    Int128 llend;
+    Int128 llend, llsize;
     void *vaddr;
     int ret;
 
@@ -344,14 +349,22 @@ static void vfio_listener_region_add(MemoryListener *listener,
     if (int128_ge(int128_make64(iova), llend)) {
         return;
     }
+    end = int128_get64(int128_sub(llend, int128_one()));
+
+    if ((iova < container->min_iova) || (end > container->max_iova)) {
+        error_report("vfio: IOMMU container %p can't map guest IOVA region"
+                     " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx,
+                     container, iova, end);
+        ret = -EFAULT;
+        goto fail;
+    }
 
     memory_region_ref(section->mr);
 
     if (memory_region_is_iommu(section->mr)) {
         VFIOGuestIOMMU *giommu;
 
-        trace_vfio_listener_region_add_iommu(iova,
-                   int128_get64(int128_sub(llend, int128_one())));
+        trace_vfio_listener_region_add_iommu(iova, end);
         /*
          * FIXME: We should do some checking to see if the
          * capabilities of the host VFIO IOMMU are adequate to model
@@ -362,65 +375,60 @@ static void vfio_listener_region_add(MemoryListener *listener,
          * would be the right place to wire that up (tell the KVM
          * device emulation the VFIO iommu handles to use).
          */
-        /*
-         * This assumes that the guest IOMMU is empty of
-         * mappings at this point.
-         *
-         * One way of doing this is:
-         * 1. Avoid sharing IOMMUs between emulated devices or different
-         * IOMMU groups.
-         * 2. Implement VFIO_IOMMU_ENABLE in the host kernel to fail if
-         * there are some mappings in IOMMU.
-         *
-         * VFIO on SPAPR does that. Other IOMMU models may do that different,
-         * they must make sure there are no existing mappings or
-         * loop through existing mappings to map them into VFIO.
-         */
         giommu = g_malloc0(sizeof(*giommu));
         giommu->iommu = section->mr;
         giommu->container = container;
         giommu->n.notify = vfio_iommu_map_notify;
         QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
+        memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
+        memory_region_iommu_replay(giommu->iommu, &giommu->n,
+                                   vfio_container_granularity(container),
+                                   false);
 
         return;
     }
 
     /* Here we assume that memory_region_is_ram(section->mr)==true */
 
-    end = int128_get64(llend);
     vaddr = memory_region_get_ram_ptr(section->mr) +
             section->offset_within_region +
             (iova - section->offset_within_address_space);
 
-    trace_vfio_listener_region_add_ram(iova, end - 1, vaddr);
+    trace_vfio_listener_region_add_ram(iova, end, vaddr);
+
+    llsize = int128_sub(llend, int128_make64(iova));
 
-    ret = vfio_dma_map(container, iova, end - iova, vaddr, section->readonly);
+    ret = vfio_dma_map(container, iova, int128_get64(llsize),
+                       vaddr, section->readonly);
     if (ret) {
         error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
                      "0x%"HWADDR_PRIx", %p) = %d (%m)",
-                     container, iova, end - iova, vaddr, ret);
+                     container, iova, int128_get64(llsize), vaddr, ret);
+        goto fail;
+    }
 
-        /*
-         * On the initfn path, store the first error in the container so we
-         * can gracefully fail. Runtime, there's not much we can do other
-         * than throw a hardware error.
-         */
-        if (!container->iommu_data.type1.initialized) {
-            if (!container->iommu_data.type1.error) {
-                container->iommu_data.type1.error = ret;
-            }
-        } else {
-            hw_error("vfio: DMA mapping failed, unable to continue");
+    return;
+
+fail:
+    /*
+     * On the initfn path, store the first error in the container so we
+     * can gracefully fail. Runtime, there's not much we can do other
+     * than throw a hardware error.
+     */
+    if (!container->initialized) {
+        if (!container->error) {
+            container->error = ret;
         }
+    } else {
+        hw_error("vfio: DMA mapping failed, unable to continue");
     }
 }
 
 static void vfio_listener_region_del(MemoryListener *listener,
                                      MemoryRegionSection *section)
 {
-    VFIOContainer *container = container_of(listener, VFIOContainer,
-                                            iommu_data.type1.listener);
+    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
     hwaddr iova, end;
     int ret;
 
@@ -485,49 +493,165 @@ static const MemoryListener vfio_memory_listener = {
 
 static void vfio_listener_release(VFIOContainer *container)
 {
-    memory_listener_unregister(&container->iommu_data.type1.listener);
+    memory_listener_unregister(&container->listener);
 }
 
-int vfio_mmap_region(Object *obj, VFIORegion *region,
-                     MemoryRegion *mem, MemoryRegion *submem,
-                     void **map, size_t size, off_t offset,
-                     const char *name)
+int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
+                      int index, const char *name)
 {
-    int ret = 0;
-    VFIODevice *vbasedev = region->vbasedev;
+    struct vfio_region_info *info;
+    int ret;
+
+    ret = vfio_get_region_info(vbasedev, index, &info);
+    if (ret) {
+        return ret;
+    }
 
-    if (vbasedev->allow_mmap && size && region->flags &
-        VFIO_REGION_INFO_FLAG_MMAP) {
-        int prot = 0;
+    region->vbasedev = vbasedev;
+    region->flags = info->flags;
+    region->size = info->size;
+    region->fd_offset = info->offset;
+    region->nr = index;
 
-        if (region->flags & VFIO_REGION_INFO_FLAG_READ) {
-            prot |= PROT_READ;
+    if (region->size) {
+        region->mem = g_new0(MemoryRegion, 1);
+        memory_region_init_io(region->mem, obj, &vfio_region_ops,
+                              region, name, region->size);
+
+        if (!vbasedev->no_mmap &&
+            region->flags & VFIO_REGION_INFO_FLAG_MMAP &&
+            !(region->size & ~qemu_real_host_page_mask)) {
+
+            region->nr_mmaps = 1;
+            region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
+
+            region->mmaps[0].offset = 0;
+            region->mmaps[0].size = region->size;
         }
+    }
+
+    g_free(info);
+
+    trace_vfio_region_setup(vbasedev->name, index, name,
+                            region->flags, region->fd_offset, region->size);
+    return 0;
+}
+
+int vfio_region_mmap(VFIORegion *region)
+{
+    int i, prot = 0;
+    char *name;
+
+    if (!region->mem) {
+        return 0;
+    }
 
-        if (region->flags & VFIO_REGION_INFO_FLAG_WRITE) {
-            prot |= PROT_WRITE;
+    prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0;
+    prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0;
+
+    for (i = 0; i < region->nr_mmaps; i++) {
+        region->mmaps[i].mmap = mmap(NULL, region->mmaps[i].size, prot,
+                                     MAP_SHARED, region->vbasedev->fd,
+                                     region->fd_offset +
+                                     region->mmaps[i].offset);
+        if (region->mmaps[i].mmap == MAP_FAILED) {
+            int ret = -errno;
+
+            trace_vfio_region_mmap_fault(memory_region_name(region->mem), i,
+                                         region->fd_offset +
+                                         region->mmaps[i].offset,
+                                         region->fd_offset +
+                                         region->mmaps[i].offset +
+                                         region->mmaps[i].size - 1, ret);
+
+            region->mmaps[i].mmap = NULL;
+
+            for (i--; i >= 0; i--) {
+                memory_region_del_subregion(region->mem, &region->mmaps[i].mem);
+                munmap(region->mmaps[i].mmap, region->mmaps[i].size);
+                object_unparent(OBJECT(&region->mmaps[i].mem));
+                region->mmaps[i].mmap = NULL;
+            }
+
+            return ret;
         }
 
-        *map = mmap(NULL, size, prot, MAP_SHARED,
-                    vbasedev->fd,
-                    region->fd_offset + offset);
-        if (*map == MAP_FAILED) {
-            *map = NULL;
-            ret = -errno;
-            goto empty_region;
+        name = g_strdup_printf("%s mmaps[%d]",
+                               memory_region_name(region->mem), i);
+        memory_region_init_ram_ptr(&region->mmaps[i].mem,
+                                   memory_region_owner(region->mem),
+                                   name, region->mmaps[i].size,
+                                   region->mmaps[i].mmap);
+        g_free(name);
+        memory_region_set_skip_dump(&region->mmaps[i].mem);
+        memory_region_add_subregion(region->mem, region->mmaps[i].offset,
+                                    &region->mmaps[i].mem);
+
+        trace_vfio_region_mmap(memory_region_name(&region->mmaps[i].mem),
+                               region->mmaps[i].offset,
+                               region->mmaps[i].offset +
+                               region->mmaps[i].size - 1);
+    }
+
+    return 0;
+}
+
+void vfio_region_exit(VFIORegion *region)
+{
+    int i;
+
+    if (!region->mem) {
+        return;
+    }
+
+    for (i = 0; i < region->nr_mmaps; i++) {
+        if (region->mmaps[i].mmap) {
+            memory_region_del_subregion(region->mem, &region->mmaps[i].mem);
         }
+    }
 
-        memory_region_init_ram_ptr(submem, obj, name, size, *map);
-        memory_region_set_skip_dump(submem);
-    } else {
-empty_region:
-        /* Create a zero sized sub-region to make cleanup easy. */
-        memory_region_init(submem, obj, name, 0);
+    trace_vfio_region_exit(region->vbasedev->name, region->nr);
+}
+
+void vfio_region_finalize(VFIORegion *region)
+{
+    int i;
+
+    if (!region->mem) {
+        return;
     }
 
-    memory_region_add_subregion(mem, offset, submem);
+    for (i = 0; i < region->nr_mmaps; i++) {
+        if (region->mmaps[i].mmap) {
+            munmap(region->mmaps[i].mmap, region->mmaps[i].size);
+            object_unparent(OBJECT(&region->mmaps[i].mem));
+        }
+    }
 
-    return ret;
+    object_unparent(OBJECT(region->mem));
+
+    g_free(region->mem);
+    g_free(region->mmaps);
+
+    trace_vfio_region_finalize(region->vbasedev->name, region->nr);
+}
+
+void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled)
+{
+    int i;
+
+    if (!region->mem) {
+        return;
+    }
+
+    for (i = 0; i < region->nr_mmaps; i++) {
+        if (region->mmaps[i].mmap) {
+            memory_region_set_enabled(&region->mmaps[i].mem, enabled);
+        }
+    }
+
+    trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem),
+                                        enabled);
 }
 
 void vfio_reset_handler(void *opaque)
@@ -668,6 +792,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
     if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU) ||
         ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU)) {
         bool v2 = !!ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU);
+        struct vfio_iommu_type1_info info;
 
         ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
         if (ret) {
@@ -684,21 +809,27 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
             goto free_container_exit;
         }
 
-        container->iommu_data.type1.listener = vfio_memory_listener;
-        container->iommu_data.release = vfio_listener_release;
-
-        memory_listener_register(&container->iommu_data.type1.listener,
-                                 container->space->as);
-
-        if (container->iommu_data.type1.error) {
-            ret = container->iommu_data.type1.error;
-            error_report("vfio: memory listener initialization failed for container");
-            goto listener_release_exit;
+        /*
+         * FIXME: This assumes that a Type1 IOMMU can map any 64-bit
+         * IOVA whatsoever. That's not actually true, but the current
+         * kernel interface doesn't tell us what it can map, and the
+         * existing Type1 IOMMUs generally support any IOVA we're
+         * going to actually try in practice.
+         */
+        container->min_iova = 0;
+        container->max_iova = (hwaddr)-1;
+
+        /* Assume just 4K IOVA page size */
+        container->iova_pgsizes = 0x1000;
+        info.argsz = sizeof(info);
+        ret = ioctl(fd, VFIO_IOMMU_GET_INFO, &info);
+        /* Ignore errors */
+        if ((ret == 0) && (info.flags & VFIO_IOMMU_INFO_PGSIZES)) {
+            container->iova_pgsizes = info.iova_pgsizes;
         }
-
-        container->iommu_data.type1.initialized = true;
-
     } else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) {
+        struct vfio_iommu_spapr_tce_info info;
+
         ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
         if (ret) {
             error_report("vfio: failed to set group container: %m");
@@ -724,18 +855,41 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
             goto free_container_exit;
         }
 
-        container->iommu_data.type1.listener = vfio_memory_listener;
-        container->iommu_data.release = vfio_listener_release;
-
-        memory_listener_register(&container->iommu_data.type1.listener,
-                                 container->space->as);
+        /*
+         * This only considers the host IOMMU's 32-bit window. At
At + * some point we need to add support for the optional 64-bit + * window and dynamic windows + */ + info.argsz = sizeof(info); + ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); + if (ret) { + error_report("vfio: VFIO_IOMMU_SPAPR_TCE_GET_INFO failed: %m"); + ret = -errno; + goto free_container_exit; + } + container->min_iova = info.dma32_window_start; + container->max_iova = container->min_iova + info.dma32_window_size - 1; + /* Assume just 4K IOVA pages for now */ + container->iova_pgsizes = 0x1000; } else { error_report("vfio: No available IOMMU models"); ret = -EINVAL; goto free_container_exit; } + container->listener = vfio_memory_listener; + + memory_listener_register(&container->listener, container->space->as); + + if (container->error) { + ret = container->error; + error_report("vfio: memory listener initialization failed for container"); + goto listener_release_exit; + } + + container->initialized = true; + QLIST_INIT(&container->group_list); QLIST_INSERT_HEAD(&space->containers, container, next); @@ -774,9 +928,7 @@ static void vfio_disconnect_container(VFIOGroup *group) VFIOAddressSpace *space = container->space; VFIOGuestIOMMU *giommu, *tmp; - if (container->iommu_data.release) { - container->iommu_data.release(container); - } + vfio_listener_release(container); QLIST_REMOVE(container, next); QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) { @@ -926,47 +1078,115 @@ void vfio_put_base_device(VFIODevice *vbasedev) close(vbasedev->fd); } -static int vfio_container_do_ioctl(AddressSpace *as, int32_t groupid, - int req, void *param) +int vfio_get_region_info(VFIODevice *vbasedev, int index, + struct vfio_region_info **info) { - VFIOGroup *group; - VFIOContainer *container; - int ret = -1; + size_t argsz = sizeof(struct vfio_region_info); - group = vfio_get_group(groupid, as); - if (!group) { - error_report("vfio: group %d not registered", groupid); - return ret; + *info = g_malloc0(argsz); + + (*info)->index = index; + (*info)->argsz = argsz; + + if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) { + g_free(*info); + return -errno; } - container = group->container; - if (group->container) { - ret = ioctl(container->fd, req, param); - if (ret < 0) { - error_report("vfio: failed to ioctl %d to container: ret=%d, %s", - _IOC_NR(req) - VFIO_BASE, ret, strerror(errno)); - } + return 0; +} + +/* + * Interfaces for IBM EEH (Enhanced Error Handling) + */ +static bool vfio_eeh_container_ok(VFIOContainer *container) +{ + /* + * As of 2016-03-04 (linux-4.5) the host kernel EEH/VFIO + * implementation is broken if there are multiple groups in a + * container. The hardware works in units of Partitionable + * Endpoints (== IOMMU groups) and the EEH operations naively + * iterate across all groups in the container, without any logic + * to make sure the groups have their state synchronized. For + * certain operations (ENABLE) that might be ok, until an error + * occurs, but for others (GET_STATE) it's clearly broken. 
+     */
+
+    /*
+     * XXX Once fixed kernels exist, test for them here
+     */
+
+    if (QLIST_EMPTY(&container->group_list)) {
+        return false;
     }
 
-    vfio_put_group(group);
+    if (QLIST_NEXT(QLIST_FIRST(&container->group_list), container_next)) {
+        return false;
+    }
 
-    return ret;
+    return true;
 }
 
-int vfio_container_ioctl(AddressSpace *as, int32_t groupid,
-                         int req, void *param)
+static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op)
 {
-    /* We allow only certain ioctls to the container */
-    switch (req) {
-    case VFIO_CHECK_EXTENSION:
-    case VFIO_IOMMU_SPAPR_TCE_GET_INFO:
-    case VFIO_EEH_PE_OP:
-        break;
-    default:
-        /* Return an error on unknown requests */
-        error_report("vfio: unsupported ioctl %X", req);
-        return -1;
+    struct vfio_eeh_pe_op pe_op = {
+        .argsz = sizeof(pe_op),
+        .op = op,
+    };
+    int ret;
+
+    if (!vfio_eeh_container_ok(container)) {
+        error_report("vfio/eeh: EEH_PE_OP 0x%x: "
+                     "kernel requires a container with exactly one group", op);
+        return -EPERM;
+    }
+
+    ret = ioctl(container->fd, VFIO_EEH_PE_OP, &pe_op);
+    if (ret < 0) {
+        error_report("vfio/eeh: EEH_PE_OP 0x%x failed: %m", op);
+        return -errno;
+    }
+
+    return 0;
+}
+
+static VFIOContainer *vfio_eeh_as_container(AddressSpace *as)
+{
+    VFIOAddressSpace *space = vfio_get_address_space(as);
+    VFIOContainer *container = NULL;
+
+    if (QLIST_EMPTY(&space->containers)) {
+        /* No containers to act on */
+        goto out;
     }
 
-    return vfio_container_do_ioctl(as, groupid, req, param);
+    container = QLIST_FIRST(&space->containers);
+
+    if (QLIST_NEXT(container, next)) {
+        /* We don't yet have logic to synchronize EEH state across
+         * multiple containers */
+        container = NULL;
+        goto out;
+    }
+
+out:
+    vfio_put_address_space(space);
+    return container;
+}
+
+bool vfio_eeh_as_ok(AddressSpace *as)
+{
+    VFIOContainer *container = vfio_eeh_as_container(as);
+
+    return (container != NULL) && vfio_eeh_container_ok(container);
+}
+
+int vfio_eeh_as_op(AddressSpace *as, uint32_t op)
+{
+    VFIOContainer *container = vfio_eeh_as_container(as);
+
+    if (!container) {
+        return -ENODEV;
+    }
+    return vfio_eeh_container_op(container, op);
 }
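
Editorial note (not part of the patch): the new VFIOContainer fields used above -- min_iova, max_iova and the iova_pgsizes bitmap -- drive two small pieces of logic, the mapping granularity computed by vfio_container_granularity() and the IOVA window check added to vfio_listener_region_add(). The standalone C sketch below illustrates only that arithmetic; the toy_* names, sample values, and use of the GCC/Clang __builtin_ctzll builtin are hypothetical and are not QEMU code.

/*
 * Illustrative sketch of the container bookkeeping introduced by the patch.
 * All names and values here are made up for demonstration purposes.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_container {
    uint64_t min_iova;     /* lowest guest IOVA the host IOMMU can map */
    uint64_t max_iova;     /* highest guest IOVA the host IOMMU can map */
    uint64_t iova_pgsizes; /* bitmap of supported IOVA page sizes */
};

/* Smallest supported page size: lowest set bit of the bitmap,
 * analogous to vfio_container_granularity(). Assumes a non-zero bitmap. */
static uint64_t toy_granularity(const struct toy_container *c)
{
    return (uint64_t)1 << __builtin_ctzll(c->iova_pgsizes);
}

/* Same shape of check as the one added to vfio_listener_region_add():
 * a section [iova, end] must fall inside the container's IOVA window. */
static bool toy_range_ok(const struct toy_container *c,
                         uint64_t iova, uint64_t end)
{
    return iova >= c->min_iova && end <= c->max_iova;
}

int main(void)
{
    /* Type1-like defaults from the patch: full 64-bit window, 4K pages. */
    struct toy_container c = {
        .min_iova = 0,
        .max_iova = UINT64_MAX,
        .iova_pgsizes = 0x1000,
    };

    printf("granularity: 0x%llx\n",
           (unsigned long long)toy_granularity(&c));
    printf("0x100000..0x1fffff mappable: %d\n",
           toy_range_ok(&c, 0x100000, 0x1fffff));
    return 0;
}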