@Alexey-Kamenev
Created October 20, 2016 17:51
Jetson inference - external USB camera
// -------------
// in gstCamera.cpp, gstCamera::buildLaunchStr():
ss << "v4l2src device=\"/dev/video1\" ! video/x-raw, width=(int)" << mWidth << ", height=(int)" << mHeight << ", format=(string)YUY2 ! appsink name=mysink";
// -------------
// in gstCamera.cpp, gstCamera::ConvertRGBA:
if(CUDA_FAILED(cudaYUYVToRGBAf((uchar2*)input, (float4*)mRGBA, mWidth, mHeight)))
	return false;
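// -------------
// (sketch, not in the original gist) For the call above to compile, gstCamera.cpp also needs
// prototypes for the new converters. Assuming they are declared next to the other YUV helpers
// (e.g. in cudaYUV.h -- the exact header and location are an assumption):
cudaError_t cudaYUYVToRGBAf(uchar2* input, float4* output, size_t width, size_t height);
cudaError_t cudaRGBAToYUYVf(float4* input, uchar2* output, size_t width, size_t height);
// Note that mRGBA must also be allocated for float4 pixels (mWidth * mHeight * sizeof(float4)),
// since the converted frame is written as one float4 per pixel.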
// -------------
// in cudaYUV-YUYV.cu:
__global__ void yuyvToRgbaf(uchar4* src, int srcAlignedWidth, float4* dst, int dstAlignedWidth, int width, int height)
{
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;

	if( x >= srcAlignedWidth || y >= height )
		return;

	const uchar4 macroPx = src[y * srcAlignedWidth + x];

	// Each uchar4 is one YUYV macropixel covering two horizontal pixels:
	// YUYV [ Y0 | U0 | Y1 | V0 ]
	// Y0 is the brightness of pixel 0, Y1 the brightness of pixel 1;
	// U0 and V0 carry the chroma shared by both pixels.
	const float y0 = macroPx.x;
	const float y1 = macroPx.z;
	const float u  = macroPx.y - 128.0f;
	const float v  = macroPx.w - 128.0f;

	// YUV -> RGB (approximate BT.601 coefficients), alpha fixed at 255
	const float4 px0 = make_float4(y0 + 1.4065f * v,
	                               y0 - 0.3455f * u - 0.7169f * v,
	                               y0 + 1.7790f * u, 255.0f);
	const float4 px1 = make_float4(y1 + 1.4065f * v,
	                               y1 - 0.3455f * u - 0.7169f * v,
	                               y1 + 1.7790f * u, 255.0f);

	dst[y * dstAlignedWidth + x * 2]     = make_float4(clamp(px0.x, 0.0f, 255.0f),
	                                                   clamp(px0.y, 0.0f, 255.0f),
	                                                   clamp(px0.z, 0.0f, 255.0f),
	                                                   clamp(px0.w, 0.0f, 255.0f));
	dst[y * dstAlignedWidth + x * 2 + 1] = make_float4(clamp(px1.x, 0.0f, 255.0f),
	                                                   clamp(px1.y, 0.0f, 255.0f),
	                                                   clamp(px1.z, 0.0f, 255.0f),
	                                                   clamp(px1.w, 0.0f, 255.0f));
}
cudaError_t launchYUYVf(uchar2* input, size_t inputPitch, float4* output, size_t outputPitch, size_t width, size_t height)
{
	if( !input || !inputPitch || !output || !outputPitch || !width || !height )
		return cudaErrorInvalidValue;

	// one thread per YUYV macropixel (two output pixels), hence width / 2 in x
	const dim3 block(8, 8);
	const dim3 grid(iDivUp(width / 2, block.x), iDivUp(height, block.y));

	const int srcAlignedWidth = inputPitch / sizeof(uchar4);
	const int dstAlignedWidth = outputPitch / sizeof(float4);

	//printf("yuyvToRgba %zu %zu %i %i %i %i\n", width, height, srcAlignedWidth, dstAlignedWidth, grid.x, grid.y);

	yuyvToRgbaf<<<grid, block>>>((uchar4*)input, srcAlignedWidth, output, dstAlignedWidth, width, height);

	return CUDA(cudaGetLastError());
}

cudaError_t cudaYUYVToRGBAf(uchar2* input, float4* output, size_t width, size_t height)
{
	return launchYUYVf(input, width * sizeof(uchar2), output, width * sizeof(float4), width, height);
}
__global__ void rgbaToYUYVf(float4* src, uchar2* dst, int width, int height)
{
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;

	// each thread packs two horizontal pixels into one YUYV macropixel,
	// so x ranges over width / 2 (matches the launch grid below)
	if( x >= width / 2 || y >= height )
		return;

	const float4 pix1 = src[y * width + 2 * x];
	const float4 pix2 = src[y * width + 2 * x + 1];

	// RGB -> YUV (approximate BT.601); chroma is taken from the first pixel of the pair
	float4 resf;
	resf.x = clamp( 0.299f * pix1.x + 0.587f * pix1.y + 0.114f * pix1.z,          0.0f, 255.0f);   // Y0
	resf.z = clamp( 0.299f * pix2.x + 0.587f * pix2.y + 0.114f * pix2.z,          0.0f, 255.0f);   // Y1
	resf.y = clamp(-0.148f * pix1.x - 0.291f * pix1.y + 0.439f * pix1.z + 128.0f, 0.0f, 255.0f);   // U
	resf.w = clamp( 0.615f * pix1.x - 0.515f * pix1.y - 0.100f * pix1.z + 128.0f, 0.0f, 255.0f);   // V

	uchar4* pdst = (uchar4*)(dst + y * width + x * 2);
	*pdst = make_uchar4((unsigned char)resf.x, (unsigned char)resf.y, (unsigned char)resf.z, (unsigned char)resf.w);
}

cudaError_t launchRGBAf(float4* input, uchar2* output, size_t width, size_t height)
{
	if( !input || !output || !width || !height )
		return cudaErrorInvalidValue;

	const dim3 block(8, 8);
	const dim3 grid(iDivUp(width / 2, block.x), iDivUp(height, block.y));

	rgbaToYUYVf<<<grid, block>>>(input, output, width, height);

	return CUDA(cudaGetLastError());
}

cudaError_t cudaRGBAToYUYVf(float4* input, uchar2* output, size_t width, size_t height)
{
	return launchRGBAf(input, output, width, height);
}
@mescarra

This looks nice!

I don't know if you're aware, but since early last year the jetson-inference repo has had this pipeline for capturing from v4l cameras:

ss << "v4l2src device=/dev/video" << mV4L2Device << " ! ";
		ss << "video/x-raw, width=(int)" << mWidth << ", height=(int)" << mHeight << ", "; 
		ss << "format=RGB ! videoconvert ! video/x-raw, format=RGB ! videoconvert !";
		ss << "appsink name=mysink";

This ends up calling v4lconvert_convert in libv4lconvert.so, which turns out to be pretty time-consuming.

Do you think your kernel approach might improve performance? Are you still using it? Thanks!
