tiagovignatti · July 28, 2015 23:56
diff --git a/- b/-
 commit f13a04be0723afbc8188fdc8d4cc0776590e94b3
 Author: Daniel Thompson <[email protected]>
 Date:   Fri Jun 19 14:52:28 2015 +0100

    drm: prime: Honour O_RDWR during prime-handle-to-fd
    
    Currently DRM_IOCTL_PRIME_HANDLE_TO_FD rejects all flags except
    (DRM|O)_CLOEXEC making it difficult (maybe impossible) for userspace
    to mmap() the resulting dma-buf even when this is supported by the
    DRM driver.
    
    It is trivial to relax the restriction and permit read/write access.
    This is safe because the flags are seldom touched by drm; mostly they
    are passed verbatim to dma_buf calls.
    
    CrOS, kernel v3.14 changes: removed first snip of code from the original patch,
    related to documentation only that was diverging from newer kernels.
    
    Change-Id: Ide8e6b548e3c8b57b18e693b5c4a9d2d800526ff
    Signed-off-by: Daniel Thompson <[email protected]>
    Signed-off-by: Tiago Vignatti <[email protected]>

 diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c
 index cd608bc..aca5374 100644
 --- a/drivers/gpu/drm/drm_prime.c
 +++ b/drivers/gpu/drm/drm_prime.c
 @@ -579,14 +579,11 @@ int drm_prime_handle_to_fd_ioctl(struct drm_device *dev, void *data,
 		return -ENOSYS;
 
 	/* check flags are valid */
 -	if (args->flags & ~DRM_CLOEXEC)
 +	if (args->flags & ~(DRM_CLOEXEC | DRM_RDWR))
 		return -EINVAL;
 
 -	/* we only want to pass DRM_CLOEXEC which is == O_CLOEXEC */
 -	flags = args->flags & DRM_CLOEXEC;
 -
 	return dev->driver->prime_handle_to_fd(dev, file_priv,
 -			args->handle, flags, &args->fd);
 +			args->handle, args->flags, &args->fd);
 }
 
 int drm_prime_fd_to_handle_ioctl(struct drm_device *dev, void *data,
 diff --git a/include/uapi/drm/drm.h b/include/uapi/drm/drm.h
 index 3f1fc13..32191ee 100644
 --- a/include/uapi/drm/drm.h
 +++ b/include/uapi/drm/drm.h
 @@ -643,6 +643,7 @@ struct drm_set_client_cap {
 	__u64 value;
 };
 
 +#define DRM_RDWR O_RDWR
 #define DRM_CLOEXEC O_CLOEXEC
 struct drm_prime_handle {
 	__u32 handle;

 commit 0e98ed9e7077dc6fcee33e4b039316b7aa7c7767
 Author: Tiago Vignatti <[email protected]>
 Date:   Mon Jul 27 18:57:09 2015 -0300

    drm/i915: Use CPU mapping for userspace dma-buf mmap()
    
    This patch changes the behavior of dma-buf mmap to use CPU mapping instead
    of going through GTT fence. This works fine in devices with LLC ("Core" mostly)
    because coherency does not need to be controlled.
    
    Caching mode is therefore changed to WB, avoiding the performance penalty of
    writing through the write-combined buffer, which leads to the best performance
    for CPU accesses. Besides, WB caching mode acquires a linear view of the memory,
    so user-space has to deal with it accordingly.
    
    vgem:
    chronos@localhost /tmp/chrome/out_gbm/Release $ ./content_perftests
    --ozone-platform=gbm --ozone-use-surfaceless --no-sandbox
    --gtest_filter=GpuMemoryBufferPerfTests* | grep RES
    *RESULT gpu_memory_buffer_time_ozone_native: map & unmap = 5.432 us/task
    *RESULT gpu_memory_buffer_time_shared_memory: map & unmap = .385 us/task
    *RESULT gpu_memory_buffer_time_ozone_native: read = 460.648 us/task
    *RESULT gpu_memory_buffer_time_shared_memory: read = 1.371 us/task
    *RESULT gpu_memory_buffer_time_ozone_native: write = 19.43 us/task
    *RESULT gpu_memory_buffer_time_shared_memory: write = 1.824 us/task
    *RESULT gpu_memory_buffer_time_ozone_native: read & write = 455.453 us/task
    *RESULT gpu_memory_buffer_time_shared_memory: read & write = 2.648 us/task
    
    vgem cpu mmap (aka "after"):
    chronos@localhost /tmp/chrome/out_gbm/Release $ ./content_perftests
    --ozone-platform=gbm --ozone-use-surfaceless --no-sandbox
    --gtest_filter=GpuMemoryBufferPerfTests* | grep RES
    *RESULT gpu_memory_buffer_time_ozone_native: map & unmap = 4.765 us/task
    *RESULT gpu_memory_buffer_time_shared_memory: map & unmap = .365 us/task
    *RESULT gpu_memory_buffer_time_ozone_native: read = 18.209 us/task
    *RESULT gpu_memory_buffer_time_shared_memory: read = 3.542 us/task
    *RESULT gpu_memory_buffer_time_ozone_native: write = 19.246 us/task
    *RESULT gpu_memory_buffer_time_shared_memory: write = 3.788 us/task
    *RESULT gpu_memory_buffer_time_ozone_native: read & write = 23.506 us/task
    *RESULT gpu_memory_buffer_time_shared_memory: read & write = 7.141 us/task
    
    Note that in this patch we're removing Atom support because it requires better
    coherency handling. For that we'd need to expose dma-buf invalidate/flushing
    ioctls and we're investigating it (WIP at the moment).
    
    BUG=chromium:487189
    TEST=content_perftests (like shown above) and on amd64-generic_freon,
    vgem_fb_test (which needs to be changed to not use tiling when creating gbm
    BOs)
    
    Change-Id: Ie33dc5d63ad12820b2f11caab2d662a8585ea49c
    Signed-off-by: Tiago Vignatti <[email protected]>

 diff --git a/drivers/gpu/drm/i915/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/i915_gem_dmabuf.c
 index 5d0f65d..f6b9a80 100644
 --- a/drivers/gpu/drm/i915/i915_gem_dmabuf.c
 +++ b/drivers/gpu/drm/i915/i915_gem_dmabuf.c
 @@ -199,19 +199,24 @@ static int i915_gem_dmabuf_mmap(struct dma_buf *dma_buf, struct vm_area_struct *
 {
 	struct drm_i915_gem_object *obj = dma_buf_to_obj(dma_buf);
 	struct drm_device *dev = obj->base.dev;
 +	int ret;
 
 	if (obj->base.size < vma->vm_end - vma->vm_start)
 		return -EINVAL;
 
 -	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
 -	vma->vm_ops = dev->driver->gem_vm_ops;
 -	vma->vm_private_data = &obj->base;
 -	vma->vm_page_prot =
 -		pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
 +	/* On non-LLC machines we'd need to be careful cause CPU and GPU don't
 +	 * share the CPU's L3 cache and coherency may hurt when CPU mapping. */
 +	if (!HAS_LLC(dev))
 +		return -EINVAL;
 +
 +	if (!obj->base.filp)
 +		return -EINVAL;
 
 -	vma->vm_ops->open(vma);
 +	ret = obj->base.filp->f_op->mmap(obj->base.filp, vma);
 +	fput(vma->vm_file);
 +	vma->vm_file = get_file(obj->base.filp);
 
 -	return 0;
 +	return ret;
 }
 
 static int i915_gem_begin_cpu_access(struct dma_buf *dma_buf, size_t start, size_t length, enum dma_data_direction direction)
	commit f13a04be0723afbc8188fdc8d4cc0776590e94b3
	Author: Daniel Thompson <[email protected]>
	Date: Fri Jun 19 14:52:28 2015 +0100

	drm: prime: Honour O_RDWR during prime-handle-to-fd

	Currently DRM_IOCTL_PRIME_HANDLE_TO_FD rejects all flags except
	(DRM|O)_CLOEXEC making it difficult (maybe impossible) for userspace
	to mmap() the resulting dma-buf even when this is supported by the
	DRM driver.

	It is trivial to relax the restriction and permit read/write access.
	This is safe because the flags are seldom touched by drm; mostly they
	are passed verbatim to dma_buf calls.

	CrOS, kernel v3.14 changes: removed first snip of code from the original patch,
	related to documentation only that was diverging from newer kernels.

	Change-Id: Ide8e6b548e3c8b57b18e693b5c4a9d2d800526ff
	Signed-off-by: Daniel Thompson <[email protected]>
	Signed-off-by: Tiago Vignatti <[email protected]>

	diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c
	index cd608bc..aca5374 100644
	--- a/drivers/gpu/drm/drm_prime.c
	+++ b/drivers/gpu/drm/drm_prime.c
	@@ -579,14 +579,11 @@ int drm_prime_handle_to_fd_ioctl(struct drm_device *dev, void *data,
	return -ENOSYS;

	/* check flags are valid */
	- if (args->flags & ~DRM_CLOEXEC)
	+ if (args->flags & ~(DRM_CLOEXEC | DRM_RDWR))
	return -EINVAL;

	- /* we only want to pass DRM_CLOEXEC which is == O_CLOEXEC */
	- flags = args->flags & DRM_CLOEXEC;
	-
	return dev->driver->prime_handle_to_fd(dev, file_priv,
	- args->handle, flags, &args->fd);
	+ args->handle, args->flags, &args->fd);
	}

	int drm_prime_fd_to_handle_ioctl(struct drm_device *dev, void *data,
	diff --git a/include/uapi/drm/drm.h b/include/uapi/drm/drm.h
	index 3f1fc13..32191ee 100644
	--- a/include/uapi/drm/drm.h
	+++ b/include/uapi/drm/drm.h
	@@ -643,6 +643,7 @@ struct drm_set_client_cap {
	__u64 value;
	};

	+#define DRM_RDWR O_RDWR
	#define DRM_CLOEXEC O_CLOEXEC
	struct drm_prime_handle {
	__u32 handle;

	commit 0e98ed9e7077dc6fcee33e4b039316b7aa7c7767
	Author: Tiago Vignatti <[email protected]>
	Date: Mon Jul 27 18:57:09 2015 -0300

	drm/i915: Use CPU mapping for userspace dma-buf mmap()

	This patch changes the behavior of dma-buf mmap to use CPU mapping instead
	of going through GTT fence. This works fine in devices with LLC ("Core" mostly)
	because coherency does not need to be controlled.

	Caching mode is therefore changed to WB, avoiding the performance penalty of
	writing through the write-combined buffer, which leads to the best performance
	for CPU accesses. Besides, WB caching mode acquires a linear view of the memory,
	so user-space has to deal with it accordingly.

	vgem:
	chronos@localhost /tmp/chrome/out_gbm/Release $ ./content_perftests
	--ozone-platform=gbm --ozone-use-surfaceless --no-sandbox
	--gtest_filter=GpuMemoryBufferPerfTests* | grep RES
	*RESULT gpu_memory_buffer_time_ozone_native: map & unmap = 5.432 us/task
	*RESULT gpu_memory_buffer_time_shared_memory: map & unmap = .385 us/task
	*RESULT gpu_memory_buffer_time_ozone_native: read = 460.648 us/task
	*RESULT gpu_memory_buffer_time_shared_memory: read = 1.371 us/task
	*RESULT gpu_memory_buffer_time_ozone_native: write = 19.43 us/task
	*RESULT gpu_memory_buffer_time_shared_memory: write = 1.824 us/task
	*RESULT gpu_memory_buffer_time_ozone_native: read & write = 455.453 us/task
	*RESULT gpu_memory_buffer_time_shared_memory: read & write = 2.648 us/task

	vgem cpu mmap (aka "after"):
	chronos@localhost /tmp/chrome/out_gbm/Release $ ./content_perftests
	--ozone-platform=gbm --ozone-use-surfaceless --no-sandbox
	--gtest_filter=GpuMemoryBufferPerfTests* | grep RES
	*RESULT gpu_memory_buffer_time_ozone_native: map & unmap = 4.765 us/task
	*RESULT gpu_memory_buffer_time_shared_memory: map & unmap = .365 us/task
	*RESULT gpu_memory_buffer_time_ozone_native: read = 18.209 us/task
	*RESULT gpu_memory_buffer_time_shared_memory: read = 3.542 us/task
	*RESULT gpu_memory_buffer_time_ozone_native: write = 19.246 us/task
	*RESULT gpu_memory_buffer_time_shared_memory: write = 3.788 us/task
	*RESULT gpu_memory_buffer_time_ozone_native: read & write = 23.506 us/task
	*RESULT gpu_memory_buffer_time_shared_memory: read & write = 7.141 us/task

	Note that in this patch we're removing Atom support because it requires better
	coherency handling. For that we'd need to expose dma-buf invalidate/flushing
	ioctls and we're investigating it (WIP at the moment).

	BUG=chromium:487189
	TEST=content_perftests (like shown above) and on amd64-generic_freon,
	vgem_fb_test (which needs to be changed to not use tiling when creating gbm
	BOs)

	Change-Id: Ie33dc5d63ad12820b2f11caab2d662a8585ea49c
	Signed-off-by: Tiago Vignatti <[email protected]>

	diff --git a/drivers/gpu/drm/i915/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/i915_gem_dmabuf.c
	index 5d0f65d..f6b9a80 100644
	--- a/drivers/gpu/drm/i915/i915_gem_dmabuf.c
	+++ b/drivers/gpu/drm/i915/i915_gem_dmabuf.c
	@@ -199,19 +199,24 @@ static int i915_gem_dmabuf_mmap(struct dma_buf *dma_buf, struct vm_area_struct *
	{
	struct drm_i915_gem_object *obj = dma_buf_to_obj(dma_buf);
	struct drm_device *dev = obj->base.dev;
	+ int ret;

	if (obj->base.size < vma->vm_end - vma->vm_start)
	return -EINVAL;

	- vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
	- vma->vm_ops = dev->driver->gem_vm_ops;
	- vma->vm_private_data = &obj->base;
	- vma->vm_page_prot =
	- pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
	+ /* On non-LLC machines we'd need to be careful cause CPU and GPU don't
	+ * share the CPU's L3 cache and coherency may hurt when CPU mapping. */
	+ if (!HAS_LLC(dev))
	+ return -EINVAL;
	+
	+ if (!obj->base.filp)
	+ return -EINVAL;

	- vma->vm_ops->open(vma);
	+ ret = obj->base.filp->f_op->mmap(obj->base.filp, vma);
	+ fput(vma->vm_file);
	+ vma->vm_file = get_file(obj->base.filp);

	- return 0;
	+ return ret;
	}

	static int i915_gem_begin_cpu_access(struct dma_buf *dma_buf, size_t start, size_t length, enum dma_data_direction direction)