Created
October 20, 2016 17:51
-
-
Save Alexey-Kamenev/41821acaecad66de6081a4f017a07aef to your computer and use it in GitHub Desktop.
Jetson inference - external USB camera
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// ------------- | |
// in gstCamera.cpp, gstCamera::buildLaunchStr(): | |
ss << "v4l2src device=\"/dev/video1\" ! video/x-raw, width=(int)" << mWidth << ", height=(int)" << mHeight << ", format=(string)YUY2 ! appsink name=mysink"; | |
// ------------- | |
// in gstCamera.cpp, gstCamera::ConvertRGBA: | |
if(CUDA_FAILED(cudaYUYVToRGBAf((uchar2*)input, (float4*)mRGBA, mWidth, mHeight))) | |
return false; | |
// ------------- | |
// in cudaYUV-YUYV.cu: | |
__global__ void yuyvToRgbaf(uchar4* src, int srcAlignedWidth, float4* dst, int dstAlignedWidth, int width, int height ) | |
{ | |
const int x = blockIdx.x * blockDim.x + threadIdx.x; | |
const int y = blockIdx.y * blockDim.y + threadIdx.y; | |
if( x >= srcAlignedWidth || y >= height ) | |
return; | |
const uchar4 macroPx = src[y * srcAlignedWidth + x]; | |
// Y0 is the brightness of pixel 0, Y1 the brightness of pixel 1. | |
// U0 and V0 is the color of both pixels. | |
// YUYV [ Y0 | U0 | Y1 | V0 ] | |
const float y0 = macroPx.x; | |
const float y1 = macroPx.z; | |
const float u = macroPx.y - 128.0f; | |
const float v = macroPx.w - 128.0f; | |
const float4 px0 = make_float4(y0 + 1.4065f * v, | |
y0 - 0.3455f * u - 0.7169f * v, | |
y0 + 1.7790f * u, 255.0f ); | |
const float4 px1 = make_float4(y1 + 1.4065f * v, | |
y1 - 0.3455f * u - 0.7169f * v, | |
y1 + 1.7790f * u, 255.0f); | |
dst[y * dstAlignedWidth + x * 2] = make_float4(clamp(px0.x, 0.0f, 255.0f), | |
clamp(px0.y, 0.0f, 255.0f), | |
clamp(px0.z, 0.0f, 255.0f), | |
clamp(px0.w, 0.0f, 255.0f)); | |
dst[y * dstAlignedWidth + x * 2 + 1] = make_float4(clamp(px1.x, 0.0f, 255.0f), | |
clamp(px1.y, 0.0f, 255.0f), | |
clamp(px1.z, 0.0f, 255.0f), | |
clamp(px1.w, 0.0f, 255.0f)); | |
} | |
cudaError_t launchYUYVf(uchar2* input, size_t inputPitch, float4* output, size_t outputPitch, size_t width, size_t height) | |
{ | |
if( !input || !inputPitch || !output || !outputPitch || !width || !height ) | |
return cudaErrorInvalidValue; | |
const dim3 block(8,8); | |
const dim3 grid(iDivUp(width/2, block.x), iDivUp(height, block.y)); | |
const int srcAlignedWidth = inputPitch / sizeof(uchar4); | |
const int dstAlignedWidth = outputPitch / sizeof(float4); | |
//printf("yuyvToRgba %zu %zu %i %i %i %i\n", width, height, srcAlignedWidth, dstAlignedWidth, grid.x, grid.y); | |
yuyvToRgbaf<<<grid, block>>>((uchar4*)input, srcAlignedWidth, output, dstAlignedWidth, width, height); | |
return CUDA(cudaGetLastError()); | |
} | |
cudaError_t cudaYUYVToRGBAf(uchar2 *input, float4 *output, size_t width, size_t height) | |
{ | |
return launchYUYVf(input, width * sizeof(uchar2), output, width * sizeof(float4), width, height); | |
} | |
__global__ void rgbaToYUYVf(float4* src, uchar2* dst, int width, int height) | |
{ | |
const int x = blockIdx.x * blockDim.x + threadIdx.x; | |
const int y = blockIdx.y * blockDim.y + threadIdx.y; | |
if(x >= width * 2 || y >= height) | |
return; | |
float4 pix1 = src[y * width + 2 * x]; | |
float4 pix2 = src[y * width + 2 * x + 1]; | |
float4 resf; | |
resf.x = clamp(0.299 * pix1.x + 0.587 * pix1.y + 0.114 * pix1.z + 0, 0.0f, 255.0f); | |
resf.z = clamp(0.299 * pix2.x + 0.587 * pix2.y + 0.114 * pix2.z + 0, 0.0f, 255.0f); | |
resf.w = clamp(0.615 * pix1.x - 0.515 * pix1.y - 0.100 * pix1.z + 128, 0.0f, 255.0f); | |
resf.y = clamp(-0.148 * pix1.x - 0.291 * pix1.y + 0.439 * pix1.z + 128, 0.0f, 255.0f); | |
uchar4* pdst = (uchar4*)(dst + y * width + x * 2); | |
*pdst = make_uchar4((unsigned char)resf.x, (unsigned char)resf.y, (unsigned char)resf.z, (unsigned char)resf.w); | |
} | |
cudaError_t launchRGBAf(float4* input, uchar2* output, size_t width, size_t height) | |
{ | |
if( !input || !output || !width || !height ) | |
return cudaErrorInvalidValue; | |
const dim3 block(8,8); | |
const dim3 grid(iDivUp(width/2, block.x), iDivUp(height, block.y)); | |
rgbaToYUYVf<<<grid, block>>>(input, output, width, height); | |
return CUDA(cudaGetLastError()); | |
} | |
cudaError_t cudaRGBAToYUYVf(float4 *input, uchar2 *output, size_t width, size_t height) | |
{ | |
return launchRGBAf(input, output, width, height); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This looks nice!
I don't know if you are aware, but there's since early last year, jetson-inference repo has this pipeline for capturing from v4l cameras:
This ends up calling
v4lconvert_convert
inlibv4lconvert.so
, which turns out to be pretty time consuming.Do you think your kernel approach might improve performance? Are you still using it? Thanks!