Created
March 4, 2016 01:38
-
-
Save me-vlad/4388b4a9436cb4fad3c8 to your computer and use it in GitHub Desktop.
ffmpeg nvidia cuda patches
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff -ruN ffmpeg-orig/libavcodec/Makefile ffmpeg/libavcodec/Makefile | |
--- ffmpeg-orig/libavcodec/Makefile 2016-03-04 00:17:21.166198276 +0200 | |
+++ ffmpeg/libavcodec/Makefile 2016-03-04 00:18:17.583433360 +0200 | |
@@ -102,7 +102,7 @@ | |
motion_est.o ratecontrol.o \ | |
mpegvideoencdsp.o | |
OBJS-$(CONFIG_MSS34DSP) += mss34dsp.o | |
-OBJS-$(CONFIG_NVENC) += nvenc.o | |
+OBJS-$(CONFIG_NVENC) += nvenc.o nvenc_ptx.o | |
OBJS-$(CONFIG_PIXBLOCKDSP) += pixblockdsp.o | |
OBJS-$(CONFIG_QPELDSP) += qpeldsp.o | |
OBJS-$(CONFIG_QSV) += qsv.o | |
diff -ruN ffmpeg-orig/libavcodec/nvenc.c ffmpeg/libavcodec/nvenc.c | |
--- ffmpeg-orig/libavcodec/nvenc.c 2016-03-04 00:17:21.113193357 +0200 | |
+++ ffmpeg/libavcodec/nvenc.c 2016-03-04 00:26:51.381033223 +0200 | |
@@ -32,15 +32,11 @@ | |
#include "libavutil/avassert.h" | |
#include "libavutil/opt.h" | |
#include "libavutil/mem.h" | |
+#include "libavutil/cudautils.h" | |
#include "avcodec.h" | |
#include "internal.h" | |
#include "thread.h" | |
-#if defined(_WIN32) | |
-#define CUDAAPI __stdcall | |
-#else | |
-#define CUDAAPI | |
-#endif | |
#if defined(_WIN32) | |
#define LOAD_FUNC(l, s) GetProcAddress(l, s) | |
@@ -50,28 +46,19 @@ | |
#define DL_CLOSE_FUNC(l) dlclose(l) | |
#endif | |
-typedef enum cudaError_enum { | |
- CUDA_SUCCESS = 0 | |
-} CUresult; | |
-typedef int CUdevice; | |
-typedef void* CUcontext; | |
- | |
-typedef CUresult(CUDAAPI *PCUINIT)(unsigned int Flags); | |
-typedef CUresult(CUDAAPI *PCUDEVICEGETCOUNT)(int *count); | |
-typedef CUresult(CUDAAPI *PCUDEVICEGET)(CUdevice *device, int ordinal); | |
-typedef CUresult(CUDAAPI *PCUDEVICEGETNAME)(char *name, int len, CUdevice dev); | |
-typedef CUresult(CUDAAPI *PCUDEVICECOMPUTECAPABILITY)(int *major, int *minor, CUdevice dev); | |
-typedef CUresult(CUDAAPI *PCUCTXCREATE)(CUcontext *pctx, unsigned int flags, CUdevice dev); | |
-typedef CUresult(CUDAAPI *PCUCTXPOPCURRENT)(CUcontext *pctx); | |
-typedef CUresult(CUDAAPI *PCUCTXDESTROY)(CUcontext ctx); | |
+#define BLOCKSX 128 | |
+#define THREADSX 128 | |
typedef NVENCSTATUS (NVENCAPI* PNVENCODEAPICREATEINSTANCE)(NV_ENCODE_API_FUNCTION_LIST *functionList); | |
typedef struct NvencInputSurface | |
{ | |
NV_ENC_INPUT_PTR input_surface; | |
+ CUdeviceptr dptr; | |
+ void* hRes; | |
int width; | |
int height; | |
+ size_t pitch; | |
int lockCount; | |
@@ -107,24 +94,11 @@ | |
typedef struct NvencDynLoadFunctions | |
{ | |
- PCUINIT cu_init; | |
- PCUDEVICEGETCOUNT cu_device_get_count; | |
- PCUDEVICEGET cu_device_get; | |
- PCUDEVICEGETNAME cu_device_get_name; | |
- PCUDEVICECOMPUTECAPABILITY cu_device_compute_capability; | |
- PCUCTXCREATE cu_ctx_create; | |
- PCUCTXPOPCURRENT cu_ctx_pop_current; | |
- PCUCTXDESTROY cu_ctx_destroy; | |
- | |
NV_ENCODE_API_FUNCTION_LIST nvenc_funcs; | |
- int nvenc_device_count; | |
- CUdevice nvenc_devices[16]; | |
#if defined(_WIN32) | |
- HMODULE cuda_lib; | |
HMODULE nvenc_lib; | |
#else | |
- void* cuda_lib; | |
void* nvenc_lib; | |
#endif | |
} NvencDynLoadFunctions; | |
@@ -140,14 +114,18 @@ | |
AVClass *avclass; | |
NvencDynLoadFunctions nvenc_dload_funcs; | |
+ CudaDynLoadFunctions* cuda_dload_funcs; | |
NV_ENC_INITIALIZE_PARAMS init_encode_params; | |
NV_ENC_CONFIG encode_config; | |
CUcontext cu_context; | |
+ CUmodule cu_module; | |
+ CUfunction cu_func_interleaveChroma; | |
int max_surface_count; | |
NvencInputSurface *input_surfaces; | |
NvencOutputSurface *output_surfaces; | |
+ NvencInputSurface transferSurf; | |
NvencDataList output_surface_queue; | |
NvencDataList output_surface_ready_queue; | |
@@ -164,8 +142,10 @@ | |
int twopass; | |
int gpu; | |
int buffer_delay; | |
+ int aq; | |
} NvencContext; | |
+ | |
static const NvencValuePair nvenc_h264_level_pairs[] = { | |
{ "auto", NV_ENC_LEVEL_AUTOSELECT }, | |
{ "1" , NV_ENC_LEVEL_H264_1 }, | |
@@ -330,79 +310,18 @@ | |
return res->u.timestamp; | |
} | |
-#define CHECK_LOAD_FUNC(t, f, s) \ | |
-do { \ | |
- (f) = (t)LOAD_FUNC(dl_fn->cuda_lib, s); \ | |
- if (!(f)) { \ | |
- av_log(avctx, AV_LOG_FATAL, "Failed loading %s from CUDA library\n", s); \ | |
- goto error; \ | |
- } \ | |
-} while (0) | |
- | |
-static av_cold int nvenc_dyload_cuda(AVCodecContext *avctx) | |
-{ | |
- NvencContext *ctx = avctx->priv_data; | |
- NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs; | |
- | |
- if (dl_fn->cuda_lib) | |
- return 1; | |
- | |
-#if defined(_WIN32) | |
- dl_fn->cuda_lib = LoadLibrary(TEXT("nvcuda.dll")); | |
-#else | |
- dl_fn->cuda_lib = dlopen("libcuda.so", RTLD_LAZY); | |
-#endif | |
- | |
- if (!dl_fn->cuda_lib) { | |
- av_log(avctx, AV_LOG_FATAL, "Failed loading CUDA library\n"); | |
- goto error; | |
- } | |
- | |
- CHECK_LOAD_FUNC(PCUINIT, dl_fn->cu_init, "cuInit"); | |
- CHECK_LOAD_FUNC(PCUDEVICEGETCOUNT, dl_fn->cu_device_get_count, "cuDeviceGetCount"); | |
- CHECK_LOAD_FUNC(PCUDEVICEGET, dl_fn->cu_device_get, "cuDeviceGet"); | |
- CHECK_LOAD_FUNC(PCUDEVICEGETNAME, dl_fn->cu_device_get_name, "cuDeviceGetName"); | |
- CHECK_LOAD_FUNC(PCUDEVICECOMPUTECAPABILITY, dl_fn->cu_device_compute_capability, "cuDeviceComputeCapability"); | |
- CHECK_LOAD_FUNC(PCUCTXCREATE, dl_fn->cu_ctx_create, "cuCtxCreate_v2"); | |
- CHECK_LOAD_FUNC(PCUCTXPOPCURRENT, dl_fn->cu_ctx_pop_current, "cuCtxPopCurrent_v2"); | |
- CHECK_LOAD_FUNC(PCUCTXDESTROY, dl_fn->cu_ctx_destroy, "cuCtxDestroy_v2"); | |
- | |
- return 1; | |
- | |
-error: | |
- | |
- if (dl_fn->cuda_lib) | |
- DL_CLOSE_FUNC(dl_fn->cuda_lib); | |
- | |
- dl_fn->cuda_lib = NULL; | |
- | |
- return 0; | |
-} | |
- | |
-static av_cold int check_cuda_errors(AVCodecContext *avctx, CUresult err, const char *func) | |
-{ | |
- if (err != CUDA_SUCCESS) { | |
- av_log(avctx, AV_LOG_FATAL, ">> %s - failed with error code 0x%x\n", func, err); | |
- return 0; | |
- } | |
- return 1; | |
-} | |
-#define check_cuda_errors(f) if (!check_cuda_errors(avctx, f, #f)) goto error | |
- | |
static av_cold int nvenc_check_cuda(AVCodecContext *avctx) | |
{ | |
- int device_count = 0; | |
- CUdevice cu_device = 0; | |
- char gpu_name[128]; | |
- int smminor = 0, smmajor = 0; | |
- int i, smver, target_smver; | |
+ int target_smver; | |
NvencContext *ctx = avctx->priv_data; | |
- NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs; | |
+ | |
+ if (!init_cuda()) | |
+ return 0; | |
switch (avctx->codec->id) { | |
case AV_CODEC_ID_H264: | |
- target_smver = avctx->pix_fmt == AV_PIX_FMT_YUV444P ? 0x52 : 0x30; | |
+ target_smver = 0x30; | |
break; | |
case AV_CODEC_ID_H265: | |
target_smver = 0x52; | |
@@ -412,49 +331,19 @@ | |
goto error; | |
} | |
- if (!nvenc_dyload_cuda(avctx)) | |
- return 0; | |
- | |
- if (dl_fn->nvenc_device_count > 0) | |
- return 1; | |
- | |
- check_cuda_errors(dl_fn->cu_init(0)); | |
- | |
- check_cuda_errors(dl_fn->cu_device_get_count(&device_count)); | |
- | |
- if (!device_count) { | |
- av_log(avctx, AV_LOG_FATAL, "No CUDA capable devices found\n"); | |
+ if (!is_gpu_feature_available(ctx->gpu, target_smver)) | |
+ { | |
+ av_log(avctx, AV_LOG_FATAL, "NVENC with Codec %s Not Available at requested GPU %d \n", (avctx->codec->id == AV_CODEC_ID_H264)? "H264" : "H265", ctx->gpu); | |
goto error; | |
} | |
- | |
- av_log(avctx, AV_LOG_VERBOSE, "%d CUDA capable devices found\n", device_count); | |
- | |
- dl_fn->nvenc_device_count = 0; | |
- | |
- for (i = 0; i < device_count; ++i) { | |
- check_cuda_errors(dl_fn->cu_device_get(&cu_device, i)); | |
- check_cuda_errors(dl_fn->cu_device_get_name(gpu_name, sizeof(gpu_name), cu_device)); | |
- check_cuda_errors(dl_fn->cu_device_compute_capability(&smmajor, &smminor, cu_device)); | |
- | |
- smver = (smmajor << 4) | smminor; | |
- | |
- av_log(avctx, AV_LOG_VERBOSE, "[ GPU #%d - < %s > has Compute SM %d.%d, NVENC %s ]\n", i, gpu_name, smmajor, smminor, (smver >= target_smver) ? "Available" : "Not Available"); | |
- | |
- if (smver >= target_smver) | |
- dl_fn->nvenc_devices[dl_fn->nvenc_device_count++] = cu_device; | |
- } | |
- | |
- if (!dl_fn->nvenc_device_count) { | |
- av_log(avctx, AV_LOG_FATAL, "No NVENC capable devices found\n"); | |
- goto error; | |
+ else | |
+ { | |
+ av_log(avctx, AV_LOG_VERBOSE, "NVENC with Codec %s Available at requested GPU %d \n", (avctx->codec->id == AV_CODEC_ID_H264) ? "H264" : "H265", ctx->gpu); | |
} | |
return 1; | |
error: | |
- | |
- dl_fn->nvenc_device_count = 0; | |
- | |
return 0; | |
} | |
@@ -488,23 +377,18 @@ | |
} | |
nvEncodeAPICreateInstance = (PNVENCODEAPICREATEINSTANCE)LOAD_FUNC(dl_fn->nvenc_lib, "NvEncodeAPICreateInstance"); | |
- | |
if (!nvEncodeAPICreateInstance) { | |
av_log(avctx, AV_LOG_FATAL, "Failed to load nvenc entrypoint\n"); | |
goto error; | |
} | |
dl_fn->nvenc_funcs.version = NV_ENCODE_API_FUNCTION_LIST_VER; | |
- | |
nvstatus = nvEncodeAPICreateInstance(&dl_fn->nvenc_funcs); | |
- | |
if (nvstatus != NV_ENC_SUCCESS) { | |
av_log(avctx, AV_LOG_FATAL, "Failed to create nvenc instance\n"); | |
goto error; | |
} | |
- av_log(avctx, AV_LOG_VERBOSE, "Nvenc initialized successfully\n"); | |
- | |
return 1; | |
error: | |
@@ -512,7 +396,6 @@ | |
DL_CLOSE_FUNC(dl_fn->nvenc_lib); | |
dl_fn->nvenc_lib = NULL; | |
- | |
return 0; | |
} | |
@@ -523,29 +406,16 @@ | |
DL_CLOSE_FUNC(dl_fn->nvenc_lib); | |
dl_fn->nvenc_lib = NULL; | |
- | |
- dl_fn->nvenc_device_count = 0; | |
- | |
- DL_CLOSE_FUNC(dl_fn->cuda_lib); | |
- dl_fn->cuda_lib = NULL; | |
- | |
- dl_fn->cu_init = NULL; | |
- dl_fn->cu_device_get_count = NULL; | |
- dl_fn->cu_device_get = NULL; | |
- dl_fn->cu_device_get_name = NULL; | |
- dl_fn->cu_device_compute_capability = NULL; | |
- dl_fn->cu_ctx_create = NULL; | |
- dl_fn->cu_ctx_pop_current = NULL; | |
- dl_fn->cu_ctx_destroy = NULL; | |
- | |
+ deinit_cuda(); | |
av_log(avctx, AV_LOG_VERBOSE, "Nvenc unloaded\n"); | |
} | |
static av_cold int nvenc_encode_init(AVCodecContext *avctx) | |
{ | |
NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS encode_session_params = { 0 }; | |
+ NV_ENC_REGISTER_RESOURCE registerParams = { 0 }; | |
NV_ENC_PRESET_CONFIG preset_config = { 0 }; | |
- CUcontext cu_context_curr; | |
+ CudaDynLoadFunctions *p_cuda; | |
CUresult cu_res; | |
GUID encoder_preset = NV_ENC_PRESET_HQ_GUID; | |
GUID codec; | |
@@ -558,6 +428,7 @@ | |
int res = 0; | |
int dw, dh; | |
int qp_inter_p; | |
+ extern char color_ptx[]; | |
NvencContext *ctx = avctx->priv_data; | |
NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs; | |
@@ -575,28 +446,18 @@ | |
encode_session_params.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER; | |
encode_session_params.apiVersion = NVENCAPI_VERSION; | |
- if (ctx->gpu >= dl_fn->nvenc_device_count) { | |
- av_log(avctx, AV_LOG_FATAL, "Requested GPU %d, but only %d GPUs are available!\n", ctx->gpu, dl_fn->nvenc_device_count); | |
- res = AVERROR(EINVAL); | |
- goto error; | |
- } | |
- | |
- ctx->cu_context = NULL; | |
- cu_res = dl_fn->cu_ctx_create(&ctx->cu_context, 4, dl_fn->nvenc_devices[ctx->gpu]); // CU_CTX_SCHED_BLOCKING_SYNC=4, avoid CPU spins | |
+ cu_res = get_cuda_context(&ctx->cu_context, ctx->gpu); | |
+ p_cuda = get_cuda_dl_func(); | |
if (cu_res != CUDA_SUCCESS) { | |
av_log(avctx, AV_LOG_FATAL, "Failed creating CUDA context for NVENC: 0x%x\n", (int)cu_res); | |
res = AVERROR_EXTERNAL; | |
goto error; | |
} | |
+ av_log(avctx, AV_LOG_VERBOSE, "NVENC : Cuda Context created 0x%x\n", (int)ctx->cu_context); | |
- cu_res = dl_fn->cu_ctx_pop_current(&cu_context_curr); | |
- | |
- if (cu_res != CUDA_SUCCESS) { | |
- av_log(avctx, AV_LOG_FATAL, "Failed popping CUDA context: 0x%x\n", (int)cu_res); | |
- res = AVERROR_EXTERNAL; | |
- goto error; | |
- } | |
+ __cu(p_cuda->cu_module_load_data(&ctx->cu_module, color_ptx)); | |
+ __cu(p_cuda->cu_module_get_function(&ctx->cu_func_interleaveChroma, ctx->cu_module, "interleaveChroma")); | |
encode_session_params.device = ctx->cu_context; | |
encode_session_params.deviceType = NV_ENC_DEVICE_TYPE_CUDA; | |
@@ -866,6 +727,15 @@ | |
ctx->encode_config.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FRAME; | |
} | |
+ if (ctx->aq) | |
+ { | |
+ ctx->encode_config.rcParams.enableAQ = 1; | |
+ } | |
+ else | |
+ { | |
+ ctx->encode_config.rcParams.enableAQ = 0; | |
+ } | |
+ | |
switch (avctx->codec->id) { | |
case AV_CODEC_ID_H264: | |
ctx->encode_config.encodeCodecConfig.h264Config.h264VUIParameters.colourDescriptionPresentFlag = 1; | |
@@ -885,9 +755,6 @@ | |
if (!ctx->profile) { | |
switch (avctx->profile) { | |
- case FF_PROFILE_H264_HIGH_444_PREDICTIVE: | |
- ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_HIGH_444_GUID; | |
- break; | |
case FF_PROFILE_H264_BASELINE: | |
ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_BASELINE_GUID; | |
break; | |
@@ -913,9 +780,6 @@ | |
} else if (!strcmp(ctx->profile, "baseline")) { | |
ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_BASELINE_GUID; | |
avctx->profile = FF_PROFILE_H264_BASELINE; | |
- } else if (!strcmp(ctx->profile, "high444p")) { | |
- ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_HIGH_444_GUID; | |
- avctx->profile = FF_PROFILE_H264_HIGH_444_PREDICTIVE; | |
} else { | |
av_log(avctx, AV_LOG_FATAL, "Profile \"%s\" is unknown! Supported profiles: high, main, baseline\n", ctx->profile); | |
res = AVERROR(EINVAL); | |
@@ -923,13 +787,7 @@ | |
} | |
} | |
- // force setting profile as high444p if input is AV_PIX_FMT_YUV444P | |
- if (avctx->pix_fmt == AV_PIX_FMT_YUV444P) { | |
- ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_HIGH_444_GUID; | |
- avctx->profile = FF_PROFILE_H264_HIGH_444_PREDICTIVE; | |
- } | |
- | |
- ctx->encode_config.encodeCodecConfig.h264Config.chromaFormatIDC = avctx->profile == FF_PROFILE_H264_HIGH_444_PREDICTIVE ? 3 : 1; | |
+ ctx->encode_config.encodeCodecConfig.h264Config.chromaFormatIDC = 1; | |
if (ctx->level) { | |
res = input_string_to_uint32(avctx, nvenc_h264_level_pairs, ctx->level, &ctx->encode_config.encodeCodecConfig.h264Config.level); | |
@@ -988,6 +846,8 @@ | |
goto error; | |
} | |
+ av_log(avctx, AV_LOG_VERBOSE, "Nvenc initialized successfully\n"); | |
+ | |
ctx->input_surfaces = av_malloc(ctx->max_surface_count * sizeof(*ctx->input_surfaces)); | |
if (!ctx->input_surfaces) { | |
@@ -1002,28 +862,32 @@ | |
goto error; | |
} | |
+ // Allocation for temp surface used for sys mem -> device mem transfer | |
+ if (avctx->pix_fmt == AV_PIX_FMT_YUV420P) | |
+ { | |
+ ctx->transferSurf.width = (avctx->width + 31) & ~31; | |
+ ctx->transferSurf.height = (avctx->height + 31) & ~31; | |
+ p_cuda->cu_mem_alloc_pitch(&ctx->transferSurf.dptr, | |
+ &ctx->transferSurf.pitch, | |
+ ctx->transferSurf.width, | |
+ ctx->transferSurf.height/ 2, 16); | |
+ } | |
+ | |
for (surfaceCount = 0; surfaceCount < ctx->max_surface_count; ++surfaceCount) { | |
- NV_ENC_CREATE_INPUT_BUFFER allocSurf = { 0 }; | |
NV_ENC_CREATE_BITSTREAM_BUFFER allocOut = { 0 }; | |
- allocSurf.version = NV_ENC_CREATE_INPUT_BUFFER_VER; | |
allocOut.version = NV_ENC_CREATE_BITSTREAM_BUFFER_VER; | |
- allocSurf.width = (avctx->width + 31) & ~31; | |
- allocSurf.height = (avctx->height + 31) & ~31; | |
- | |
- allocSurf.memoryHeap = NV_ENC_MEMORY_HEAP_SYSMEM_CACHED; | |
+ ctx->input_surfaces[surfaceCount].width = (avctx->width + 31) & ~31; | |
+ ctx->input_surfaces[surfaceCount].height = (avctx->height + 31) & ~31; | |
switch (avctx->pix_fmt) { | |
case AV_PIX_FMT_YUV420P: | |
- allocSurf.bufferFmt = NV_ENC_BUFFER_FORMAT_YV12_PL; | |
- break; | |
- | |
case AV_PIX_FMT_NV12: | |
- allocSurf.bufferFmt = NV_ENC_BUFFER_FORMAT_NV12_PL; | |
- break; | |
- | |
- case AV_PIX_FMT_YUV444P: | |
- allocSurf.bufferFmt = NV_ENC_BUFFER_FORMAT_YUV444_PL; | |
+ ctx->input_surfaces[surfaceCount].format = NV_ENC_BUFFER_FORMAT_NV12_PL; | |
+ p_cuda->cu_mem_alloc_pitch(&ctx->input_surfaces[surfaceCount].dptr, | |
+ &ctx->input_surfaces[surfaceCount].pitch, | |
+ ctx->input_surfaces[surfaceCount].width, | |
+ ctx->input_surfaces[surfaceCount].height * 3 / 2, 16); | |
break; | |
default: | |
@@ -1032,18 +896,21 @@ | |
goto error; | |
} | |
- nv_status = p_nvenc->nvEncCreateInputBuffer(ctx->nvencoder, &allocSurf); | |
+ registerParams.version = NV_ENC_REGISTER_RESOURCE_VER, | |
+ registerParams.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR, | |
+ registerParams.width = ctx->input_surfaces[surfaceCount].width, | |
+ registerParams.height = ctx->input_surfaces[surfaceCount].height, | |
+ registerParams.pitch = ctx->input_surfaces[surfaceCount].pitch, | |
+ registerParams.bufferFormat = ctx->input_surfaces[surfaceCount].format; | |
+ registerParams.resourceToRegister = (void*)ctx->input_surfaces[surfaceCount].dptr, | |
+ nv_status = p_nvenc->nvEncRegisterResource(ctx->nvencoder, ®isterParams); | |
if (nv_status != NV_ENC_SUCCESS) { | |
- av_log(avctx, AV_LOG_FATAL, "CreateInputBuffer failed\n"); | |
+ av_log(avctx, AV_LOG_FATAL, "RegisterResource failed\n"); | |
res = AVERROR_EXTERNAL; | |
goto error; | |
} | |
- | |
+ ctx->input_surfaces[surfaceCount].hRes = registerParams.registeredResource; | |
ctx->input_surfaces[surfaceCount].lockCount = 0; | |
- ctx->input_surfaces[surfaceCount].input_surface = allocSurf.inputBuffer; | |
- ctx->input_surfaces[surfaceCount].format = allocSurf.bufferFmt; | |
- ctx->input_surfaces[surfaceCount].width = allocSurf.width; | |
- ctx->input_surfaces[surfaceCount].height = allocSurf.height; | |
/* 1MB is large enough to hold most output frames. NVENC increases this automaticaly if it's not enough. */ | |
allocOut.size = 1024 * 1024; | |
@@ -1106,21 +973,22 @@ | |
return 0; | |
error: | |
- | |
for (i = 0; i < surfaceCount; ++i) { | |
- p_nvenc->nvEncDestroyInputBuffer(ctx->nvencoder, ctx->input_surfaces[i].input_surface); | |
+ p_nvenc->nvEncUnregisterResource(ctx->nvencoder, ctx->input_surfaces[i].hRes); | |
+ p_cuda->cu_mem_free(ctx->input_surfaces[i].dptr); | |
+ | |
if (ctx->output_surfaces[i].output_surface) | |
p_nvenc->nvEncDestroyBitstreamBuffer(ctx->nvencoder, ctx->output_surfaces[i].output_surface); | |
} | |
+ p_cuda->cu_mem_free(ctx->transferSurf.dptr); | |
if (ctx->nvencoder) | |
p_nvenc->nvEncDestroyEncoder(ctx->nvencoder); | |
if (ctx->cu_context) | |
- dl_fn->cu_ctx_destroy(ctx->cu_context); | |
+ release_cuda_context(&ctx->cu_context, ctx->gpu); | |
nvenc_unload_nvenc(avctx); | |
- | |
ctx->nvencoder = NULL; | |
ctx->cu_context = NULL; | |
@@ -1132,6 +1000,7 @@ | |
NvencContext *ctx = avctx->priv_data; | |
NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs; | |
NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs; | |
+ CudaDynLoadFunctions *p_cuda = get_cuda_dl_func(); | |
int i; | |
av_freep(&ctx->timestamp_list.data); | |
@@ -1139,16 +1008,19 @@ | |
av_freep(&ctx->output_surface_queue.data); | |
for (i = 0; i < ctx->max_surface_count; ++i) { | |
- p_nvenc->nvEncDestroyInputBuffer(ctx->nvencoder, ctx->input_surfaces[i].input_surface); | |
+ p_nvenc->nvEncUnregisterResource(ctx->nvencoder, ctx->input_surfaces[i].hRes); | |
+ p_cuda->cu_mem_free(ctx->input_surfaces[i].dptr); | |
p_nvenc->nvEncDestroyBitstreamBuffer(ctx->nvencoder, ctx->output_surfaces[i].output_surface); | |
} | |
ctx->max_surface_count = 0; | |
- p_nvenc->nvEncDestroyEncoder(ctx->nvencoder); | |
+ if (ctx->nvencoder) | |
+ p_nvenc->nvEncDestroyEncoder(ctx->nvencoder); | |
+ | |
ctx->nvencoder = NULL; | |
- dl_fn->cu_ctx_destroy(ctx->cu_context); | |
- ctx->cu_context = NULL; | |
+ if (ctx->cu_context) | |
+ release_cuda_context(&ctx->cu_context, ctx->gpu); | |
nvenc_unload_nvenc(avctx); | |
@@ -1163,6 +1035,7 @@ | |
uint32_t slice_mode_data; | |
uint32_t *slice_offsets; | |
+ char picType = 'X'; | |
NV_ENC_LOCK_BITSTREAM lock_params = { 0 }; | |
NVENCSTATUS nv_status; | |
int res = 0; | |
@@ -1215,12 +1088,15 @@ | |
FF_DISABLE_DEPRECATION_WARNINGS | |
case NV_ENC_PIC_TYPE_I: | |
avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I; | |
+ picType = 'I'; | |
break; | |
case NV_ENC_PIC_TYPE_P: | |
avctx->coded_frame->pict_type = AV_PICTURE_TYPE_P; | |
+ picType = 'P'; | |
break; | |
case NV_ENC_PIC_TYPE_B: | |
avctx->coded_frame->pict_type = AV_PICTURE_TYPE_B; | |
+ picType = 'B'; | |
break; | |
case NV_ENC_PIC_TYPE_BI: | |
avctx->coded_frame->pict_type = AV_PICTURE_TYPE_BI; | |
@@ -1234,6 +1110,8 @@ | |
#endif | |
} | |
+ av_log(avctx, AV_LOG_VERBOSE, "FRAME STATISTICS: Frame No. %d PicType %c Frame AvgQP %d SATD Cost %d Size %d bytes\r", lock_params.frameIdx, picType, lock_params.frameAvgQP, lock_params.frameSatd, lock_params.bitstreamSizeInBytes); | |
+ | |
pkt->pts = lock_params.outputTimeStamp; | |
pkt->dts = timestamp_queue_dequeue(&ctx->timestamp_list); | |
@@ -1261,6 +1139,61 @@ | |
return res; | |
} | |
+ | |
+/** | |
+ * Launch the interleaveChroma kernel on separate Cb and Cr planes. | |
+ * | |
+ * The seven kernel arguments are handed over by address, in this order: | |
+ * cb_dptr, cr_dptr, nv12chroma_dptr, width, height, srcStride, dstStride. | |
+ * The launch configuration is fixed (BLOCKSX and THREADSX) and does not | |
+ * depend on width or height. | |
+ * | |
+ * NOTE(review): assumes cu_launch_kernel mirrors the driver API cuLaunchKernel | |
+ * (grid x/y/z, block x/y/z, shared memory bytes, stream, params, extra), | |
+ * which would make this a BLOCKSX x 1 x 1 grid of THREADSX x 1 x 1 blocks on | |
+ * the NULL stream with no dynamic shared memory -- confirm in cudautils.h. | |
+ * No synchronization call is made in this function after the launch. | |
+ * | |
+ * @return 0 when control reaches the end of the function. What __cu does on | |
+ *         a failing call is defined outside this file -- TODO confirm. | |
+ */ | |
+static int call_interleavechroma_kernel(CudaDynLoadFunctions* dl_func, CUfunction func, | |
+ CUdeviceptr cb_dptr, CUdeviceptr cr_dptr, CUdeviceptr nv12chroma_dptr, int width, int height, int srcStride, int dstStride) | |
+{ | |
+ // Pointers to the by-value parameters; order must match the kernel's parameter list. | |
+ void *args_uchar[] = { &cb_dptr, &cr_dptr, &nv12chroma_dptr, &width, &height, &srcStride, &dstStride}; | |
+ __cu(dl_func->cu_launch_kernel(func, BLOCKSX, 1, 1, THREADSX, 1, 1, 0, NULL, args_uchar, NULL)); | |
+ | |
+ return 0; | |
+} | |
+ | |
+/** | |
+ * Copy one frame into an NVENC input surface, producing NV12 on the device. | |
+ * | |
+ * NV12 frames: the luma plane and the interleaved chroma plane are copied as | |
+ * they are. | |
+ * YUV420P frames: the luma plane is copied, then Cb and Cr are merged into the | |
+ * surface's chroma plane by the interleaveChroma kernel. Chroma that lives in | |
+ * host memory is first staged in ctx->transferSurf. | |
+ * | |
+ * When frame->opaque passes check_nvinfo() and carries a non-zero dptr[0], the | |
+ * planes are read from those device pointers instead of frame->data[]. | |
+ * | |
+ * NOTE(review): cuMemCpy2d and __cu are declared outside this file; how a | |
+ * failing copy is reported depends on __cu -- confirm in cudautils.h. | |
+ * | |
+ * @param ctx    encoder context (kernel handle and chroma staging surface) | |
+ * @param frame  source picture | |
+ * @param inSurf destination surface; chroma starts at dptr + pitch * height | |
+ * @return 0 on success, AVERROR(EINVAL) for any other pixel format, | |
+ *         AVERROR_EXTERNAL if the chroma interleave launch reports failure | |
+ */ | |
+static int nvenc_copy_to_inputbuffer(NvencContext *ctx, const AVFrame* frame, NvencInputSurface *inSurf) | |
+{ | |
+    CudaDynLoadFunctions *p_cuda = get_cuda_dl_func(); | |
+    if (frame->format == AV_PIX_FMT_NV12) { | |
+ | |
+        // check opaque field, if there's already a deviceptr | |
+        if (frame->opaque && check_nvinfo(frame->opaque) && | |
+            ((ffnvinfo*)(frame->opaque))->dptr[0]) { | |
+            ffnvinfo* info = (ffnvinfo*)frame->opaque; | |
+ | |
+            // device -> device: luma, then the already interleaved chroma | |
+            __cu(cuMemCpy2d(NULL, info->dptr[0], info->linesize[0], NULL, inSurf->dptr, inSurf->pitch, frame->width, frame->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE)); | |
+            __cu(cuMemCpy2d(NULL, info->dptr[1], info->linesize[1], NULL, inSurf->dptr + inSurf->pitch*inSurf->height, inSurf->pitch, frame->width, frame->height/2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE)); | |
+        } | |
+        else | |
+        { | |
+            // host -> device: luma, then the already interleaved chroma | |
+            __cu(cuMemCpy2d(frame->data[0], (CUdeviceptr)NULL, frame->linesize[0], NULL, inSurf->dptr, inSurf->pitch, frame->width, frame->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+            __cu(cuMemCpy2d(frame->data[1], (CUdeviceptr)NULL, frame->linesize[1], NULL, inSurf->dptr + inSurf->pitch*inSurf->height, inSurf->pitch, frame->width, frame->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+        } | |
+    } | |
+    else if (frame->format == AV_PIX_FMT_YUV420P) { | |
+        // check opaque field, if there's already a deviceptr | |
+        if (frame->opaque && check_nvinfo(frame->opaque) && | |
+            ((ffnvinfo*)(frame->opaque))->dptr[0]) { | |
+            ffnvinfo* info = (ffnvinfo*)frame->opaque; | |
+ | |
+            __cu(cuMemCpy2d(NULL, info->dptr[0], info->linesize[0], NULL, inSurf->dptr, inSurf->pitch, frame->width, frame->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE)); | |
+            // A failed launch must not be reported as a successful upload. | |
+            if (call_interleavechroma_kernel(p_cuda, ctx->cu_func_interleaveChroma, info->dptr[1], info->dptr[2], inSurf->dptr + inSurf->pitch*inSurf->height, (frame->width+31) & ~31 , frame->height, (info->linesize[1]<<1), inSurf->pitch) != 0) | |
+                return AVERROR_EXTERNAL; | |
+        } | |
+        else | |
+        { | |
+            __cu(cuMemCpy2d(frame->data[0], (CUdeviceptr)NULL, frame->linesize[0], NULL, inSurf->dptr, inSurf->pitch, frame->width, frame->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+            // Stage Cb, then Cr, in the transfer surface at half its pitch. | |
+            __cu(cuMemCpy2d(frame->data[1], (CUdeviceptr)NULL, frame->linesize[1], NULL, ctx->transferSurf.dptr, ctx->transferSurf.pitch / 2, ctx->transferSurf.width / 2, frame->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+            __cu(cuMemCpy2d(frame->data[2], (CUdeviceptr)NULL, frame->linesize[2], NULL, ctx->transferSurf.dptr + ctx->transferSurf.pitch*ctx->transferSurf.height / 4, ctx->transferSurf.pitch / 2, ctx->transferSurf.width / 2, frame->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+ | |
+            // A failed launch must not be reported as a successful upload. | |
+            if (call_interleavechroma_kernel(p_cuda, ctx->cu_func_interleaveChroma, ctx->transferSurf.dptr, ctx->transferSurf.dptr + ctx->transferSurf.pitch*ctx->transferSurf.height/4, inSurf->dptr + inSurf->pitch*inSurf->height, (frame->width + 31) & ~31, frame->height, ctx->transferSurf.pitch, inSurf->pitch) != 0) | |
+                return AVERROR_EXTERNAL; | |
+        } | |
+    } | |
+    else { | |
+        av_log(NULL, AV_LOG_FATAL, "Invalid pixel format!\n"); | |
+        return AVERROR(EINVAL); | |
+    } | |
+ | |
+    return 0; | |
+} | |
+ | |
static int nvenc_encode_frame(AVCodecContext *avctx, AVPacket *pkt, | |
const AVFrame *frame, int *got_packet) | |
{ | |
@@ -1276,7 +1209,7 @@ | |
pic_params.version = NV_ENC_PIC_PARAMS_VER; | |
if (frame) { | |
- NV_ENC_LOCK_INPUT_BUFFER lockBufferParams = { 0 }; | |
+ NV_ENC_MAP_INPUT_RESOURCE mapParams = { 0 }; | |
NvencInputSurface *inSurf = NULL; | |
for (i = 0; i < ctx->max_surface_count; ++i) { | |
@@ -1290,69 +1223,27 @@ | |
inSurf->lockCount = 1; | |
- lockBufferParams.version = NV_ENC_LOCK_INPUT_BUFFER_VER; | |
- lockBufferParams.inputBuffer = inSurf->input_surface; | |
- | |
- nv_status = p_nvenc->nvEncLockInputBuffer(ctx->nvencoder, &lockBufferParams); | |
+ mapParams.version = NV_ENC_MAP_INPUT_RESOURCE_VER; | |
+ mapParams.registeredResource = inSurf->hRes; | |
+ nv_status = p_nvenc->nvEncMapInputResource(ctx->nvencoder, &mapParams); | |
if (nv_status != NV_ENC_SUCCESS) { | |
- av_log(avctx, AV_LOG_ERROR, "Failed locking nvenc input buffer\n"); | |
+ av_log(avctx, AV_LOG_ERROR, "Failed mapping nvenc input buffer\n"); | |
return 0; | |
} | |
- if (avctx->pix_fmt == AV_PIX_FMT_YUV420P) { | |
- uint8_t *buf = lockBufferParams.bufferDataPtr; | |
- | |
- av_image_copy_plane(buf, lockBufferParams.pitch, | |
- frame->data[0], frame->linesize[0], | |
- avctx->width, avctx->height); | |
- | |
- buf += inSurf->height * lockBufferParams.pitch; | |
- | |
- av_image_copy_plane(buf, lockBufferParams.pitch >> 1, | |
- frame->data[2], frame->linesize[2], | |
- avctx->width >> 1, avctx->height >> 1); | |
- | |
- buf += (inSurf->height * lockBufferParams.pitch) >> 2; | |
- | |
- av_image_copy_plane(buf, lockBufferParams.pitch >> 1, | |
- frame->data[1], frame->linesize[1], | |
- avctx->width >> 1, avctx->height >> 1); | |
- } else if (avctx->pix_fmt == AV_PIX_FMT_NV12) { | |
- uint8_t *buf = lockBufferParams.bufferDataPtr; | |
- | |
- av_image_copy_plane(buf, lockBufferParams.pitch, | |
- frame->data[0], frame->linesize[0], | |
- avctx->width, avctx->height); | |
- | |
- buf += inSurf->height * lockBufferParams.pitch; | |
- | |
- av_image_copy_plane(buf, lockBufferParams.pitch, | |
- frame->data[1], frame->linesize[1], | |
- avctx->width, avctx->height >> 1); | |
- } else if (avctx->pix_fmt == AV_PIX_FMT_YUV444P) { | |
- uint8_t *buf = lockBufferParams.bufferDataPtr; | |
- | |
- av_image_copy_plane(buf, lockBufferParams.pitch, | |
- frame->data[0], frame->linesize[0], | |
- avctx->width, avctx->height); | |
- | |
- buf += inSurf->height * lockBufferParams.pitch; | |
- | |
- av_image_copy_plane(buf, lockBufferParams.pitch, | |
- frame->data[1], frame->linesize[1], | |
- avctx->width, avctx->height); | |
- | |
- buf += inSurf->height * lockBufferParams.pitch; | |
+ inSurf->input_surface = mapParams.mappedResource; | |
+ if (inSurf->format != mapParams.mappedBufferFmt) { | |
+ av_log(avctx, AV_LOG_ERROR, "Incompatible buffer format!\n"); | |
+ return 0; | |
+ } | |
- av_image_copy_plane(buf, lockBufferParams.pitch, | |
- frame->data[2], frame->linesize[2], | |
- avctx->width, avctx->height); | |
- } else { | |
- av_log(avctx, AV_LOG_FATAL, "Invalid pixel format!\n"); | |
- return AVERROR(EINVAL); | |
+ if (nvenc_copy_to_inputbuffer(ctx, frame, inSurf) != 0) { | |
+ p_nvenc->nvEncUnmapInputResource(ctx->nvencoder, inSurf->input_surface); | |
+ av_log(avctx, AV_LOG_ERROR, "Failed to copy data to NVENC input buffer!\n"); | |
+ return 0; | |
} | |
- nv_status = p_nvenc->nvEncUnlockInputBuffer(ctx->nvencoder, inSurf->input_surface); | |
+ nv_status = p_nvenc->nvEncUnmapInputResource(ctx->nvencoder, inSurf->input_surface); | |
if (nv_status != NV_ENC_SUCCESS) { | |
av_log(avctx, AV_LOG_FATAL, "Failed unlocking input buffer!\n"); | |
return AVERROR_EXTERNAL; | |
@@ -1470,7 +1361,6 @@ | |
static const enum AVPixelFormat pix_fmts_nvenc[] = { | |
AV_PIX_FMT_YUV420P, | |
AV_PIX_FMT_NV12, | |
- AV_PIX_FMT_YUV444P, | |
AV_PIX_FMT_NONE | |
}; | |
@@ -1478,13 +1368,14 @@ | |
#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM | |
static const AVOption options[] = { | |
{ "preset", "Set the encoding preset (one of slow = hq 2pass, medium = hq, fast = hp, hq, hp, bd, ll, llhq, llhp, default)", OFFSET(preset), AV_OPT_TYPE_STRING, { .str = "medium" }, 0, 0, VE }, | |
- { "profile", "Set the encoding profile (high, main, baseline or high444p)", OFFSET(profile), AV_OPT_TYPE_STRING, { .str = "main" }, 0, 0, VE }, | |
+ { "profile", "Set the encoding profile (high, main, baseline)", OFFSET(profile), AV_OPT_TYPE_STRING, { .str = "main" }, 0, 0, VE }, | |
{ "level", "Set the encoding level restriction (auto, 1.0, 1.0b, 1.1, 1.2, ..., 4.2, 5.0, 5.1)", OFFSET(level), AV_OPT_TYPE_STRING, { .str = "auto" }, 0, 0, VE }, | |
{ "tier", "Set the encoding tier (main or high)", OFFSET(tier), AV_OPT_TYPE_STRING, { .str = "main" }, 0, 0, VE }, | |
{ "cbr", "Use cbr encoding mode", OFFSET(cbr), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE }, | |
{ "2pass", "Use 2pass encoding mode", OFFSET(twopass), AV_OPT_TYPE_BOOL, { .i64 = -1 }, -1, 1, VE }, | |
{ "gpu", "Selects which NVENC capable GPU to use. First GPU is 0, second is 1, and so on.", OFFSET(gpu), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE }, | |
{ "delay", "Delays frame output by the given amount of frames.", OFFSET(buffer_delay), AV_OPT_TYPE_INT, { .i64 = INT_MAX }, 0, INT_MAX, VE }, | |
+ { "enableaq", "set to 1 to enable AQ ", OFFSET(aq), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE }, | |
{ NULL } | |
}; | |
diff -ruN ffmpeg-orig/libavcodec/nvenc_ptx.c ffmpeg/libavcodec/nvenc_ptx.c | |
--- ffmpeg-orig/libavcodec/nvenc_ptx.c 1970-01-01 03:00:00.000000000 +0300 | |
+++ ffmpeg/libavcodec/nvenc_ptx.c 2016-03-04 00:18:17.585433546 +0200 | |
@@ -0,0 +1,240 @@ | |
+#if _WIN32 || _WIN64 | |
+#if _WIN64 | |
+#define ENVIRONMENT64 | |
+#else | |
+#define ENVIRONMENT32 | |
+#endif | |
+#endif | |
+// Check GCC | |
+#if __GNUC__ | |
+#if __x86_64__ || __ppc64__ | |
+#define ENVIRONMENT64 | |
+#else | |
+#define ENVIRONMENT32 | |
+#endif | |
+#endif | |
+#ifdef ENVIRONMENT32 | |
+const char color_ptx[] = /* PTX text of the interleaveChroma kernel, 32-bit address variant */ \ | |
+ "//\n" | |
+ "// Generated by NVIDIA NVVM Compiler\n" | |
+ "//\n" | |
+ "// Compiler Build ID: CL-19830389\n" | |
+ "// Cuda compilation tools, release 8.0, V8.0.0\n" | |
+ "// Based on LLVM 3.4svn\n" | |
+ "//\n" | |
+ "\n" | |
+ ".version 4.3\n" | |
+ ".target sm_30\n" | |
+ ".address_size 32\n" /* pointers are 32-bit in this variant */ | |
+ "\n" | |
+ "// .globl interleaveChroma\n" | |
+ "\n" | |
+ ".visible .entry interleaveChroma(\n" | |
+ ".param .u32 interleaveChroma_param_0,\n" /* Cb source pointer (argument order is set by the launcher in nvenc.c) */ | |
+ ".param .u32 interleaveChroma_param_1,\n" /* Cr source pointer */ | |
+ ".param .u32 interleaveChroma_param_2,\n" /* interleaved chroma destination pointer */ | |
+ ".param .u32 interleaveChroma_param_3,\n" /* width */ | |
+ ".param .u32 interleaveChroma_param_4,\n" /* height */ | |
+ ".param .u32 interleaveChroma_param_5,\n" /* srcStride */ | |
+ ".param .u32 interleaveChroma_param_6\n" /* dstStride */ | |
+ ")\n" | |
+ "{\n" | |
+ ".reg .pred %p<5>;\n" | |
+ ".reg .b32 %r<57>;\n" | |
+ "\n" | |
+ "\n" | |
+ "ld.param.u32 %r15, [interleaveChroma_param_0];\n" | |
+ "ld.param.u32 %r16, [interleaveChroma_param_1];\n" | |
+ "ld.param.u32 %r17, [interleaveChroma_param_2];\n" | |
+ "ld.param.u32 %r18, [interleaveChroma_param_3];\n" | |
+ "ld.param.u32 %r21, [interleaveChroma_param_4];\n" | |
+ "ld.param.u32 %r19, [interleaveChroma_param_5];\n" | |
+ "ld.param.u32 %r20, [interleaveChroma_param_6];\n" | |
+ "shr.s32 %r1, %r21, 1;\n" /* row limit = height >> 1 */ | |
+ "mov.u32 %r55, %ctaid.x;\n" /* row starts at the block index */ | |
+ "setp.ge.s32 %p1, %r55, %r1;\n" | |
+ "@%p1 bra BB0_6;\n" | |
+ "\n" | |
+ "cvta.to.global.u32 %r3, %r17;\n" | |
+ "cvta.to.global.u32 %r4, %r16;\n" | |
+ "cvta.to.global.u32 %r5, %r15;\n" | |
+ "mov.u32 %r6, %tid.x;\n" | |
+ "shr.s32 %r7, %r18, 3;\n" /* column limit = width >> 3 */ | |
+ "mov.u32 %r8, %ntid.x;\n" | |
+ "\n" | |
+ "BB0_2:\n" | |
+ "setp.ge.s32 %p2, %r6, %r7;\n" | |
+ "@%p2 bra BB0_5;\n" | |
+ "\n" | |
+ "mul.lo.s32 %r22, %r55, %r19;\n" | |
+ "mul.lo.s32 %r23, %r55, %r20;\n" | |
+ "shr.s32 %r10, %r23, 2;\n" /* destination row offset in 32-bit words (row * dstStride >> 2) */ | |
+ "shr.s32 %r24, %r22, 2;\n" | |
+ "shr.u32 %r11, %r24, 1;\n" /* source row offset in 32-bit words (row * srcStride >> 3) */ | |
+ "mov.u32 %r56, %r6;\n" | |
+ "\n" | |
+ "BB0_4:\n" | |
+ "mov.u32 %r12, %r56;\n" | |
+ "add.s32 %r25, %r12, %r11;\n" | |
+ "shl.b32 %r26, %r25, 2;\n" | |
+ "add.s32 %r27, %r5, %r26;\n" | |
+ "add.s32 %r28, %r4, %r26;\n" | |
+ "ld.global.u32 %r29, [%r28];\n" /* one 32-bit word (4 bytes) of Cr */ | |
+ "and.b32 %r30, %r29, 65280;\n" | |
+ "shl.b32 %r31, %r30, 16;\n" | |
+ "ld.global.u32 %r32, [%r27];\n" /* one 32-bit word (4 bytes) of Cb */ | |
+ "shl.b32 %r33, %r32, 8;\n" | |
+ "and.b32 %r34, %r33, 16711680;\n" | |
+ "shl.b32 %r35, %r29, 8;\n" | |
+ "and.b32 %r36, %r35, 65280;\n" | |
+ "and.b32 %r37, %r32, 255;\n" | |
+ "or.b32 %r38, %r34, %r37;\n" | |
+ "or.b32 %r39, %r38, %r31;\n" | |
+ "or.b32 %r40, %r39, %r36;\n" | |
+ "shl.b32 %r41, %r12, 1;\n" | |
+ "add.s32 %r42, %r41, %r10;\n" | |
+ "shl.b32 %r43, %r42, 2;\n" | |
+ "add.s32 %r44, %r3, %r43;\n" | |
+ "st.global.u32 [%r44], %r40;\n" /* low-to-high bytes: Cb0 Cr0 Cb1 Cr1 */ | |
+ "and.b32 %r45, %r29, -16777216;\n" | |
+ "and.b32 %r46, %r32, -16777216;\n" | |
+ "shr.u32 %r47, %r46, 8;\n" | |
+ "or.b32 %r48, %r45, %r47;\n" | |
+ "and.b32 %r49, %r29, 16711680;\n" | |
+ "shr.u32 %r50, %r49, 8;\n" | |
+ "bfe.u32 %r51, %r32, 16, 8;\n" | |
+ "or.b32 %r52, %r48, %r51;\n" | |
+ "or.b32 %r53, %r52, %r50;\n" | |
+ "st.global.u32 [%r44+4], %r53;\n" /* low-to-high bytes: Cb2 Cr2 Cb3 Cr3 */ | |
+ "add.s32 %r13, %r8, %r12;\n" /* column += threads per block */ | |
+ "setp.lt.s32 %p3, %r13, %r7;\n" | |
+ "mov.u32 %r56, %r13;\n" | |
+ "@%p3 bra BB0_4;\n" | |
+ "\n" | |
+ "BB0_5:\n" | |
+ "mov.u32 %r54, %nctaid.x;\n" | |
+ "add.s32 %r55, %r54, %r55;\n" /* row += number of blocks */ | |
+ "setp.lt.s32 %p4, %r55, %r1;\n" | |
+ "@%p4 bra BB0_2;\n" | |
+ "\n" | |
+ "BB0_6:\n" | |
+ "ret;\n" | |
+ "}\n" | |
+ "\n" | |
+ "\n" | |
+; | |
+#elif defined ENVIRONMENT64 | |
+const char color_ptx[] = /* PTX text of the interleaveChroma kernel, 64-bit address variant */ \ | |
+ "//\n" | |
+ "// Generated by NVIDIA NVVM Compiler\n" | |
+ "//\n" | |
+ "// Compiler Build ID: CL-19830389\n" | |
+ "// Cuda compilation tools, release 8.0, V8.0.0\n" | |
+ "// Based on LLVM 3.4svn\n" | |
+ "//\n" | |
+ "\n" | |
+ ".version 4.3\n" | |
+ ".target sm_30\n" | |
+ ".address_size 64\n" /* pointers are 64-bit in this variant */ | |
+ "\n" | |
+ "// .globl interleaveChroma\n" | |
+ "\n" | |
+ ".visible .entry interleaveChroma(\n" | |
+ ".param .u64 interleaveChroma_param_0,\n" /* Cb source pointer (argument order is set by the launcher in nvenc.c) */ | |
+ ".param .u64 interleaveChroma_param_1,\n" /* Cr source pointer */ | |
+ ".param .u64 interleaveChroma_param_2,\n" /* interleaved chroma destination pointer */ | |
+ ".param .u32 interleaveChroma_param_3,\n" /* width */ | |
+ ".param .u32 interleaveChroma_param_4,\n" /* height */ | |
+ ".param .u32 interleaveChroma_param_5,\n" /* srcStride */ | |
+ ".param .u32 interleaveChroma_param_6\n" /* dstStride */ | |
+ ")\n" | |
+ "{\n" | |
+ ".reg .pred %p<5>;\n" | |
+ ".reg .b32 %r<47>;\n" | |
+ ".reg .b64 %rd<14>;\n" | |
+ "\n" | |
+ "\n" | |
+ "ld.param.u64 %rd4, [interleaveChroma_param_0];\n" | |
+ "ld.param.u64 %rd5, [interleaveChroma_param_1];\n" | |
+ "ld.param.u64 %rd6, [interleaveChroma_param_2];\n" | |
+ "ld.param.u32 %r12, [interleaveChroma_param_3];\n" | |
+ "ld.param.u32 %r15, [interleaveChroma_param_4];\n" | |
+ "ld.param.u32 %r13, [interleaveChroma_param_5];\n" | |
+ "ld.param.u32 %r14, [interleaveChroma_param_6];\n" | |
+ "shr.s32 %r1, %r15, 1;\n" /* row limit = height >> 1 */ | |
+ "mov.u32 %r45, %ctaid.x;\n" /* row starts at the block index */ | |
+ "setp.ge.s32 %p1, %r45, %r1;\n" | |
+ "@%p1 bra BB0_6;\n" | |
+ "\n" | |
+ "cvta.to.global.u64 %rd1, %rd6;\n" | |
+ "cvta.to.global.u64 %rd2, %rd5;\n" | |
+ "cvta.to.global.u64 %rd3, %rd4;\n" | |
+ "mov.u32 %r3, %tid.x;\n" | |
+ "shr.s32 %r4, %r12, 3;\n" /* column limit = width >> 3 */ | |
+ "mov.u32 %r5, %ntid.x;\n" | |
+ "\n" | |
+ "BB0_2:\n" | |
+ "setp.ge.s32 %p2, %r3, %r4;\n" | |
+ "@%p2 bra BB0_5;\n" | |
+ "\n" | |
+ "mul.lo.s32 %r16, %r45, %r13;\n" | |
+ "mul.lo.s32 %r17, %r45, %r14;\n" | |
+ "shr.s32 %r7, %r17, 2;\n" /* destination row offset in 32-bit words (row * dstStride >> 2) */ | |
+ "shr.s32 %r18, %r16, 2;\n" | |
+ "shr.u32 %r8, %r18, 1;\n" /* source row offset in 32-bit words (row * srcStride >> 3) */ | |
+ "mov.u32 %r46, %r3;\n" | |
+ "\n" | |
+ "BB0_4:\n" | |
+ "mov.u32 %r9, %r46;\n" | |
+ "add.s32 %r19, %r9, %r8;\n" | |
+ "mul.wide.u32 %rd7, %r19, 4;\n" | |
+ "add.s64 %rd8, %rd3, %rd7;\n" | |
+ "add.s64 %rd9, %rd2, %rd7;\n" | |
+ "ld.global.u32 %r20, [%rd9];\n" /* one 32-bit word (4 bytes) of Cr */ | |
+ "and.b32 %r21, %r20, 65280;\n" | |
+ "shl.b32 %r22, %r21, 16;\n" | |
+ "ld.global.u32 %r23, [%rd8];\n" /* one 32-bit word (4 bytes) of Cb */ | |
+ "shl.b32 %r24, %r23, 8;\n" | |
+ "and.b32 %r25, %r24, 16711680;\n" | |
+ "shl.b32 %r26, %r20, 8;\n" | |
+ "and.b32 %r27, %r26, 65280;\n" | |
+ "and.b32 %r28, %r23, 255;\n" | |
+ "or.b32 %r29, %r25, %r28;\n" | |
+ "or.b32 %r30, %r29, %r22;\n" | |
+ "or.b32 %r31, %r30, %r27;\n" | |
+ "shl.b32 %r32, %r9, 1;\n" | |
+ "add.s32 %r33, %r32, %r7;\n" | |
+ "mul.wide.u32 %rd10, %r33, 4;\n" | |
+ "add.s64 %rd11, %rd1, %rd10;\n" | |
+ "st.global.u32 [%rd11], %r31;\n" /* low-to-high bytes: Cb0 Cr0 Cb1 Cr1 */ | |
+ "and.b32 %r34, %r20, -16777216;\n" | |
+ "and.b32 %r35, %r23, -16777216;\n" | |
+ "shr.u32 %r36, %r35, 8;\n" | |
+ "or.b32 %r37, %r34, %r36;\n" | |
+ "and.b32 %r38, %r20, 16711680;\n" | |
+ "shr.u32 %r39, %r38, 8;\n" | |
+ "bfe.u32 %r40, %r23, 16, 8;\n" | |
+ "or.b32 %r41, %r37, %r40;\n" | |
+ "or.b32 %r42, %r41, %r39;\n" | |
+ "add.s32 %r43, %r33, 1;\n" | |
+ "mul.wide.u32 %rd12, %r43, 4;\n" | |
+ "add.s64 %rd13, %rd1, %rd12;\n" | |
+ "st.global.u32 [%rd13], %r42;\n" /* low-to-high bytes: Cb2 Cr2 Cb3 Cr3 */ | |
+ "add.s32 %r10, %r5, %r9;\n" /* column += threads per block */ | |
+ "setp.lt.s32 %p3, %r10, %r4;\n" | |
+ "mov.u32 %r46, %r10;\n" | |
+ "@%p3 bra BB0_4;\n" | |
+ "\n" | |
+ "BB0_5:\n" | |
+ "mov.u32 %r44, %nctaid.x;\n" | |
+ "add.s32 %r45, %r44, %r45;\n" /* row += number of blocks */ | |
+ "setp.lt.s32 %p4, %r45, %r1;\n" | |
+ "@%p4 bra BB0_2;\n" | |
+ "\n" | |
+ "BB0_6:\n" | |
+ "ret;\n" | |
+ "}\n" | |
+ "\n" | |
+ "\n" | |
+; | |
+#endif |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff -ruN ffmpeg-orig/configure ffmpeg/configure | |
--- ffmpeg-orig/configure 2016-03-04 00:17:20.652150576 +0200 | |
+++ ffmpeg/configure 2016-03-04 02:36:28.264332984 +0200 | |
@@ -278,6 +278,7 @@ | |
--enable-mmal enable decoding via MMAL [no] | |
--enable-netcdf enable NetCDF, needed for sofalizer filter [no] | |
--enable-nvenc enable NVIDIA NVENC support [no] | |
+ --enable-nvresize enable NVIDIA CUDA accelerated resizer [no] | |
--enable-openal enable OpenAL 1.1 capture support [no] | |
--enable-opencl enable OpenCL code | |
--enable-opengl enable OpenGL rendering [no] | |
@@ -1502,6 +1503,7 @@ | |
mmal | |
netcdf | |
nvenc | |
+ nvresize | |
openal | |
opencl | |
opengl | |
@@ -5325,6 +5327,7 @@ | |
frei0r_src_filter_extralibs='$ldl' | |
ladspa_filter_extralibs='$ldl' | |
nvenc_encoder_extralibs='$ldl' | |
+nvresize_filter_extralibs='$ldl' | |
if ! disabled network; then | |
check_func getaddrinfo $network_extralibs | |
diff -ruN ffmpeg-orig/libavcodec/Makefile ffmpeg/libavcodec/Makefile | |
--- ffmpeg-orig/libavcodec/Makefile 2016-03-04 00:17:21.166198276 +0200 | |
+++ ffmpeg/libavcodec/Makefile 2016-03-04 00:18:17.583433360 +0200 | |
@@ -102,7 +102,7 @@ | |
motion_est.o ratecontrol.o \ | |
mpegvideoencdsp.o | |
OBJS-$(CONFIG_MSS34DSP) += mss34dsp.o | |
-OBJS-$(CONFIG_NVENC) += nvenc.o | |
+OBJS-$(CONFIG_NVENC) += nvenc.o nvenc_ptx.o | |
OBJS-$(CONFIG_PIXBLOCKDSP) += pixblockdsp.o | |
OBJS-$(CONFIG_QPELDSP) += qpeldsp.o | |
OBJS-$(CONFIG_QSV) += qsv.o | |
diff -ruN ffmpeg-orig/libavcodec/nvenc.c ffmpeg/libavcodec/nvenc.c | |
--- ffmpeg-orig/libavcodec/nvenc.c 2016-03-04 00:17:21.113193357 +0200 | |
+++ ffmpeg/libavcodec/nvenc.c 2016-03-04 00:26:51.381033223 +0200 | |
@@ -32,15 +32,11 @@ | |
#include "libavutil/avassert.h" | |
#include "libavutil/opt.h" | |
#include "libavutil/mem.h" | |
+#include "libavutil/cudautils.h" | |
#include "avcodec.h" | |
#include "internal.h" | |
#include "thread.h" | |
-#if defined(_WIN32) | |
-#define CUDAAPI __stdcall | |
-#else | |
-#define CUDAAPI | |
-#endif | |
#if defined(_WIN32) | |
#define LOAD_FUNC(l, s) GetProcAddress(l, s) | |
@@ -50,28 +46,19 @@ | |
#define DL_CLOSE_FUNC(l) dlclose(l) | |
#endif | |
-typedef enum cudaError_enum { | |
- CUDA_SUCCESS = 0 | |
-} CUresult; | |
-typedef int CUdevice; | |
-typedef void* CUcontext; | |
- | |
-typedef CUresult(CUDAAPI *PCUINIT)(unsigned int Flags); | |
-typedef CUresult(CUDAAPI *PCUDEVICEGETCOUNT)(int *count); | |
-typedef CUresult(CUDAAPI *PCUDEVICEGET)(CUdevice *device, int ordinal); | |
-typedef CUresult(CUDAAPI *PCUDEVICEGETNAME)(char *name, int len, CUdevice dev); | |
-typedef CUresult(CUDAAPI *PCUDEVICECOMPUTECAPABILITY)(int *major, int *minor, CUdevice dev); | |
-typedef CUresult(CUDAAPI *PCUCTXCREATE)(CUcontext *pctx, unsigned int flags, CUdevice dev); | |
-typedef CUresult(CUDAAPI *PCUCTXPOPCURRENT)(CUcontext *pctx); | |
-typedef CUresult(CUDAAPI *PCUCTXDESTROY)(CUcontext ctx); | |
+#define BLOCKSX 128 | |
+#define THREADSX 128 | |
typedef NVENCSTATUS (NVENCAPI* PNVENCODEAPICREATEINSTANCE)(NV_ENCODE_API_FUNCTION_LIST *functionList); | |
typedef struct NvencInputSurface | |
{ | |
NV_ENC_INPUT_PTR input_surface; | |
+ CUdeviceptr dptr; | |
+ void* hRes; | |
int width; | |
int height; | |
+ size_t pitch; | |
int lockCount; | |
@@ -107,24 +94,11 @@ | |
typedef struct NvencDynLoadFunctions | |
{ | |
- PCUINIT cu_init; | |
- PCUDEVICEGETCOUNT cu_device_get_count; | |
- PCUDEVICEGET cu_device_get; | |
- PCUDEVICEGETNAME cu_device_get_name; | |
- PCUDEVICECOMPUTECAPABILITY cu_device_compute_capability; | |
- PCUCTXCREATE cu_ctx_create; | |
- PCUCTXPOPCURRENT cu_ctx_pop_current; | |
- PCUCTXDESTROY cu_ctx_destroy; | |
- | |
NV_ENCODE_API_FUNCTION_LIST nvenc_funcs; | |
- int nvenc_device_count; | |
- CUdevice nvenc_devices[16]; | |
#if defined(_WIN32) | |
- HMODULE cuda_lib; | |
HMODULE nvenc_lib; | |
#else | |
- void* cuda_lib; | |
void* nvenc_lib; | |
#endif | |
} NvencDynLoadFunctions; | |
@@ -140,14 +114,18 @@ | |
AVClass *avclass; | |
NvencDynLoadFunctions nvenc_dload_funcs; | |
+ CudaDynLoadFunctions* cuda_dload_funcs; | |
NV_ENC_INITIALIZE_PARAMS init_encode_params; | |
NV_ENC_CONFIG encode_config; | |
CUcontext cu_context; | |
+ CUmodule cu_module; | |
+ CUfunction cu_func_interleaveChroma; | |
int max_surface_count; | |
NvencInputSurface *input_surfaces; | |
NvencOutputSurface *output_surfaces; | |
+ NvencInputSurface transferSurf; | |
NvencDataList output_surface_queue; | |
NvencDataList output_surface_ready_queue; | |
@@ -164,8 +142,10 @@ | |
int twopass; | |
int gpu; | |
int buffer_delay; | |
+ int aq; | |
} NvencContext; | |
+ | |
static const NvencValuePair nvenc_h264_level_pairs[] = { | |
{ "auto", NV_ENC_LEVEL_AUTOSELECT }, | |
{ "1" , NV_ENC_LEVEL_H264_1 }, | |
@@ -330,79 +310,18 @@ | |
return res->u.timestamp; | |
} | |
-#define CHECK_LOAD_FUNC(t, f, s) \ | |
-do { \ | |
- (f) = (t)LOAD_FUNC(dl_fn->cuda_lib, s); \ | |
- if (!(f)) { \ | |
- av_log(avctx, AV_LOG_FATAL, "Failed loading %s from CUDA library\n", s); \ | |
- goto error; \ | |
- } \ | |
-} while (0) | |
- | |
-static av_cold int nvenc_dyload_cuda(AVCodecContext *avctx) | |
-{ | |
- NvencContext *ctx = avctx->priv_data; | |
- NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs; | |
- | |
- if (dl_fn->cuda_lib) | |
- return 1; | |
- | |
-#if defined(_WIN32) | |
- dl_fn->cuda_lib = LoadLibrary(TEXT("nvcuda.dll")); | |
-#else | |
- dl_fn->cuda_lib = dlopen("libcuda.so", RTLD_LAZY); | |
-#endif | |
- | |
- if (!dl_fn->cuda_lib) { | |
- av_log(avctx, AV_LOG_FATAL, "Failed loading CUDA library\n"); | |
- goto error; | |
- } | |
- | |
- CHECK_LOAD_FUNC(PCUINIT, dl_fn->cu_init, "cuInit"); | |
- CHECK_LOAD_FUNC(PCUDEVICEGETCOUNT, dl_fn->cu_device_get_count, "cuDeviceGetCount"); | |
- CHECK_LOAD_FUNC(PCUDEVICEGET, dl_fn->cu_device_get, "cuDeviceGet"); | |
- CHECK_LOAD_FUNC(PCUDEVICEGETNAME, dl_fn->cu_device_get_name, "cuDeviceGetName"); | |
- CHECK_LOAD_FUNC(PCUDEVICECOMPUTECAPABILITY, dl_fn->cu_device_compute_capability, "cuDeviceComputeCapability"); | |
- CHECK_LOAD_FUNC(PCUCTXCREATE, dl_fn->cu_ctx_create, "cuCtxCreate_v2"); | |
- CHECK_LOAD_FUNC(PCUCTXPOPCURRENT, dl_fn->cu_ctx_pop_current, "cuCtxPopCurrent_v2"); | |
- CHECK_LOAD_FUNC(PCUCTXDESTROY, dl_fn->cu_ctx_destroy, "cuCtxDestroy_v2"); | |
- | |
- return 1; | |
- | |
-error: | |
- | |
- if (dl_fn->cuda_lib) | |
- DL_CLOSE_FUNC(dl_fn->cuda_lib); | |
- | |
- dl_fn->cuda_lib = NULL; | |
- | |
- return 0; | |
-} | |
- | |
-static av_cold int check_cuda_errors(AVCodecContext *avctx, CUresult err, const char *func) | |
-{ | |
- if (err != CUDA_SUCCESS) { | |
- av_log(avctx, AV_LOG_FATAL, ">> %s - failed with error code 0x%x\n", func, err); | |
- return 0; | |
- } | |
- return 1; | |
-} | |
-#define check_cuda_errors(f) if (!check_cuda_errors(avctx, f, #f)) goto error | |
- | |
static av_cold int nvenc_check_cuda(AVCodecContext *avctx) | |
{ | |
- int device_count = 0; | |
- CUdevice cu_device = 0; | |
- char gpu_name[128]; | |
- int smminor = 0, smmajor = 0; | |
- int i, smver, target_smver; | |
+ int target_smver; | |
NvencContext *ctx = avctx->priv_data; | |
- NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs; | |
+ | |
+ if (!init_cuda()) | |
+ return 0; | |
switch (avctx->codec->id) { | |
case AV_CODEC_ID_H264: | |
- target_smver = avctx->pix_fmt == AV_PIX_FMT_YUV444P ? 0x52 : 0x30; | |
+ target_smver = 0x30; | |
break; | |
case AV_CODEC_ID_H265: | |
target_smver = 0x52; | |
@@ -412,49 +331,19 @@ | |
goto error; | |
} | |
- if (!nvenc_dyload_cuda(avctx)) | |
- return 0; | |
- | |
- if (dl_fn->nvenc_device_count > 0) | |
- return 1; | |
- | |
- check_cuda_errors(dl_fn->cu_init(0)); | |
- | |
- check_cuda_errors(dl_fn->cu_device_get_count(&device_count)); | |
- | |
- if (!device_count) { | |
- av_log(avctx, AV_LOG_FATAL, "No CUDA capable devices found\n"); | |
+ if (!is_gpu_feature_available(ctx->gpu, target_smver)) | |
+ { | |
+ av_log(avctx, AV_LOG_FATAL, "NVENC with Codec %s Not Available at requested GPU %d \n", (avctx->codec->id == AV_CODEC_ID_H264)? "H264" : "H265", ctx->gpu); | |
goto error; | |
} | |
- | |
- av_log(avctx, AV_LOG_VERBOSE, "%d CUDA capable devices found\n", device_count); | |
- | |
- dl_fn->nvenc_device_count = 0; | |
- | |
- for (i = 0; i < device_count; ++i) { | |
- check_cuda_errors(dl_fn->cu_device_get(&cu_device, i)); | |
- check_cuda_errors(dl_fn->cu_device_get_name(gpu_name, sizeof(gpu_name), cu_device)); | |
- check_cuda_errors(dl_fn->cu_device_compute_capability(&smmajor, &smminor, cu_device)); | |
- | |
- smver = (smmajor << 4) | smminor; | |
- | |
- av_log(avctx, AV_LOG_VERBOSE, "[ GPU #%d - < %s > has Compute SM %d.%d, NVENC %s ]\n", i, gpu_name, smmajor, smminor, (smver >= target_smver) ? "Available" : "Not Available"); | |
- | |
- if (smver >= target_smver) | |
- dl_fn->nvenc_devices[dl_fn->nvenc_device_count++] = cu_device; | |
- } | |
- | |
- if (!dl_fn->nvenc_device_count) { | |
- av_log(avctx, AV_LOG_FATAL, "No NVENC capable devices found\n"); | |
- goto error; | |
+ else | |
+ { | |
+ av_log(avctx, AV_LOG_VERBOSE, "NVENC with Codec %s Available at requested GPU %d \n", (avctx->codec->id == AV_CODEC_ID_H264) ? "H264" : "H265", ctx->gpu); | |
} | |
return 1; | |
error: | |
- | |
- dl_fn->nvenc_device_count = 0; | |
- | |
return 0; | |
} | |
@@ -488,23 +377,18 @@ | |
} | |
nvEncodeAPICreateInstance = (PNVENCODEAPICREATEINSTANCE)LOAD_FUNC(dl_fn->nvenc_lib, "NvEncodeAPICreateInstance"); | |
- | |
if (!nvEncodeAPICreateInstance) { | |
av_log(avctx, AV_LOG_FATAL, "Failed to load nvenc entrypoint\n"); | |
goto error; | |
} | |
dl_fn->nvenc_funcs.version = NV_ENCODE_API_FUNCTION_LIST_VER; | |
- | |
nvstatus = nvEncodeAPICreateInstance(&dl_fn->nvenc_funcs); | |
- | |
if (nvstatus != NV_ENC_SUCCESS) { | |
av_log(avctx, AV_LOG_FATAL, "Failed to create nvenc instance\n"); | |
goto error; | |
} | |
- av_log(avctx, AV_LOG_VERBOSE, "Nvenc initialized successfully\n"); | |
- | |
return 1; | |
error: | |
@@ -512,7 +396,6 @@ | |
DL_CLOSE_FUNC(dl_fn->nvenc_lib); | |
dl_fn->nvenc_lib = NULL; | |
- | |
return 0; | |
} | |
@@ -523,29 +406,16 @@ | |
DL_CLOSE_FUNC(dl_fn->nvenc_lib); | |
dl_fn->nvenc_lib = NULL; | |
- | |
- dl_fn->nvenc_device_count = 0; | |
- | |
- DL_CLOSE_FUNC(dl_fn->cuda_lib); | |
- dl_fn->cuda_lib = NULL; | |
- | |
- dl_fn->cu_init = NULL; | |
- dl_fn->cu_device_get_count = NULL; | |
- dl_fn->cu_device_get = NULL; | |
- dl_fn->cu_device_get_name = NULL; | |
- dl_fn->cu_device_compute_capability = NULL; | |
- dl_fn->cu_ctx_create = NULL; | |
- dl_fn->cu_ctx_pop_current = NULL; | |
- dl_fn->cu_ctx_destroy = NULL; | |
- | |
+ deinit_cuda(); | |
av_log(avctx, AV_LOG_VERBOSE, "Nvenc unloaded\n"); | |
} | |
static av_cold int nvenc_encode_init(AVCodecContext *avctx) | |
{ | |
NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS encode_session_params = { 0 }; | |
+ NV_ENC_REGISTER_RESOURCE registerParams = { 0 }; | |
NV_ENC_PRESET_CONFIG preset_config = { 0 }; | |
- CUcontext cu_context_curr; | |
+ CudaDynLoadFunctions *p_cuda; | |
CUresult cu_res; | |
GUID encoder_preset = NV_ENC_PRESET_HQ_GUID; | |
GUID codec; | |
@@ -558,6 +428,7 @@ | |
int res = 0; | |
int dw, dh; | |
int qp_inter_p; | |
+ extern char color_ptx[]; | |
NvencContext *ctx = avctx->priv_data; | |
NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs; | |
@@ -575,28 +446,18 @@ | |
encode_session_params.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER; | |
encode_session_params.apiVersion = NVENCAPI_VERSION; | |
- if (ctx->gpu >= dl_fn->nvenc_device_count) { | |
- av_log(avctx, AV_LOG_FATAL, "Requested GPU %d, but only %d GPUs are available!\n", ctx->gpu, dl_fn->nvenc_device_count); | |
- res = AVERROR(EINVAL); | |
- goto error; | |
- } | |
- | |
- ctx->cu_context = NULL; | |
- cu_res = dl_fn->cu_ctx_create(&ctx->cu_context, 4, dl_fn->nvenc_devices[ctx->gpu]); // CU_CTX_SCHED_BLOCKING_SYNC=4, avoid CPU spins | |
+ cu_res = get_cuda_context(&ctx->cu_context, ctx->gpu); | |
+ p_cuda = get_cuda_dl_func(); | |
if (cu_res != CUDA_SUCCESS) { | |
av_log(avctx, AV_LOG_FATAL, "Failed creating CUDA context for NVENC: 0x%x\n", (int)cu_res); | |
res = AVERROR_EXTERNAL; | |
goto error; | |
} | |
+ av_log(avctx, AV_LOG_VERBOSE, "NVENC : Cuda Context created 0x%x\n", (int)ctx->cu_context); | |
- cu_res = dl_fn->cu_ctx_pop_current(&cu_context_curr); | |
- | |
- if (cu_res != CUDA_SUCCESS) { | |
- av_log(avctx, AV_LOG_FATAL, "Failed popping CUDA context: 0x%x\n", (int)cu_res); | |
- res = AVERROR_EXTERNAL; | |
- goto error; | |
- } | |
+ __cu(p_cuda->cu_module_load_data(&ctx->cu_module, color_ptx)); | |
+ __cu(p_cuda->cu_module_get_function(&ctx->cu_func_interleaveChroma, ctx->cu_module, "interleaveChroma")); | |
encode_session_params.device = ctx->cu_context; | |
encode_session_params.deviceType = NV_ENC_DEVICE_TYPE_CUDA; | |
@@ -866,6 +727,15 @@ | |
ctx->encode_config.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FRAME; | |
} | |
+ if (ctx->aq) | |
+ { | |
+ ctx->encode_config.rcParams.enableAQ = 1; | |
+ } | |
+ else | |
+ { | |
+ ctx->encode_config.rcParams.enableAQ = 0; | |
+ } | |
+ | |
switch (avctx->codec->id) { | |
case AV_CODEC_ID_H264: | |
ctx->encode_config.encodeCodecConfig.h264Config.h264VUIParameters.colourDescriptionPresentFlag = 1; | |
@@ -885,9 +755,6 @@ | |
if (!ctx->profile) { | |
switch (avctx->profile) { | |
- case FF_PROFILE_H264_HIGH_444_PREDICTIVE: | |
- ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_HIGH_444_GUID; | |
- break; | |
case FF_PROFILE_H264_BASELINE: | |
ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_BASELINE_GUID; | |
break; | |
@@ -913,9 +780,6 @@ | |
} else if (!strcmp(ctx->profile, "baseline")) { | |
ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_BASELINE_GUID; | |
avctx->profile = FF_PROFILE_H264_BASELINE; | |
- } else if (!strcmp(ctx->profile, "high444p")) { | |
- ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_HIGH_444_GUID; | |
- avctx->profile = FF_PROFILE_H264_HIGH_444_PREDICTIVE; | |
} else { | |
av_log(avctx, AV_LOG_FATAL, "Profile \"%s\" is unknown! Supported profiles: high, main, baseline\n", ctx->profile); | |
res = AVERROR(EINVAL); | |
@@ -923,13 +787,7 @@ | |
} | |
} | |
- // force setting profile as high444p if input is AV_PIX_FMT_YUV444P | |
- if (avctx->pix_fmt == AV_PIX_FMT_YUV444P) { | |
- ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_HIGH_444_GUID; | |
- avctx->profile = FF_PROFILE_H264_HIGH_444_PREDICTIVE; | |
- } | |
- | |
- ctx->encode_config.encodeCodecConfig.h264Config.chromaFormatIDC = avctx->profile == FF_PROFILE_H264_HIGH_444_PREDICTIVE ? 3 : 1; | |
+ ctx->encode_config.encodeCodecConfig.h264Config.chromaFormatIDC = 1; | |
if (ctx->level) { | |
res = input_string_to_uint32(avctx, nvenc_h264_level_pairs, ctx->level, &ctx->encode_config.encodeCodecConfig.h264Config.level); | |
@@ -988,6 +846,8 @@ | |
goto error; | |
} | |
+ av_log(avctx, AV_LOG_VERBOSE, "Nvenc initialized successfully\n"); | |
+ | |
ctx->input_surfaces = av_malloc(ctx->max_surface_count * sizeof(*ctx->input_surfaces)); | |
if (!ctx->input_surfaces) { | |
@@ -1002,28 +862,32 @@ | |
goto error; | |
} | |
+ // Allocation for temp surface used for sys mem -> device mem transfer | |
+ if (avctx->pix_fmt == AV_PIX_FMT_YUV420P) | |
+ { | |
+ ctx->transferSurf.width = (avctx->width + 31) & ~31; | |
+ ctx->transferSurf.height = (avctx->height + 31) & ~31; | |
+ p_cuda->cu_mem_alloc_pitch(&ctx->transferSurf.dptr, | |
+ &ctx->transferSurf.pitch, | |
+ ctx->transferSurf.width, | |
+ ctx->transferSurf.height/ 2, 16); | |
+ } | |
+ | |
for (surfaceCount = 0; surfaceCount < ctx->max_surface_count; ++surfaceCount) { | |
- NV_ENC_CREATE_INPUT_BUFFER allocSurf = { 0 }; | |
NV_ENC_CREATE_BITSTREAM_BUFFER allocOut = { 0 }; | |
- allocSurf.version = NV_ENC_CREATE_INPUT_BUFFER_VER; | |
allocOut.version = NV_ENC_CREATE_BITSTREAM_BUFFER_VER; | |
- allocSurf.width = (avctx->width + 31) & ~31; | |
- allocSurf.height = (avctx->height + 31) & ~31; | |
- | |
- allocSurf.memoryHeap = NV_ENC_MEMORY_HEAP_SYSMEM_CACHED; | |
+ ctx->input_surfaces[surfaceCount].width = (avctx->width + 31) & ~31; | |
+ ctx->input_surfaces[surfaceCount].height = (avctx->height + 31) & ~31; | |
switch (avctx->pix_fmt) { | |
case AV_PIX_FMT_YUV420P: | |
- allocSurf.bufferFmt = NV_ENC_BUFFER_FORMAT_YV12_PL; | |
- break; | |
- | |
case AV_PIX_FMT_NV12: | |
- allocSurf.bufferFmt = NV_ENC_BUFFER_FORMAT_NV12_PL; | |
- break; | |
- | |
- case AV_PIX_FMT_YUV444P: | |
- allocSurf.bufferFmt = NV_ENC_BUFFER_FORMAT_YUV444_PL; | |
+ ctx->input_surfaces[surfaceCount].format = NV_ENC_BUFFER_FORMAT_NV12_PL; | |
+ p_cuda->cu_mem_alloc_pitch(&ctx->input_surfaces[surfaceCount].dptr, | |
+ &ctx->input_surfaces[surfaceCount].pitch, | |
+ ctx->input_surfaces[surfaceCount].width, | |
+ ctx->input_surfaces[surfaceCount].height * 3 / 2, 16); | |
break; | |
default: | |
@@ -1032,18 +896,21 @@ | |
goto error; | |
} | |
- nv_status = p_nvenc->nvEncCreateInputBuffer(ctx->nvencoder, &allocSurf); | |
+ registerParams.version = NV_ENC_REGISTER_RESOURCE_VER, | |
+ registerParams.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR, | |
+ registerParams.width = ctx->input_surfaces[surfaceCount].width, | |
+ registerParams.height = ctx->input_surfaces[surfaceCount].height, | |
+ registerParams.pitch = ctx->input_surfaces[surfaceCount].pitch, | |
+ registerParams.bufferFormat = ctx->input_surfaces[surfaceCount].format; | |
+ registerParams.resourceToRegister = (void*)ctx->input_surfaces[surfaceCount].dptr, | |
+ nv_status = p_nvenc->nvEncRegisterResource(ctx->nvencoder, ®isterParams); | |
if (nv_status != NV_ENC_SUCCESS) { | |
- av_log(avctx, AV_LOG_FATAL, "CreateInputBuffer failed\n"); | |
+ av_log(avctx, AV_LOG_FATAL, "RegisterResource failed\n"); | |
res = AVERROR_EXTERNAL; | |
goto error; | |
} | |
- | |
+ ctx->input_surfaces[surfaceCount].hRes = registerParams.registeredResource; | |
ctx->input_surfaces[surfaceCount].lockCount = 0; | |
- ctx->input_surfaces[surfaceCount].input_surface = allocSurf.inputBuffer; | |
- ctx->input_surfaces[surfaceCount].format = allocSurf.bufferFmt; | |
- ctx->input_surfaces[surfaceCount].width = allocSurf.width; | |
- ctx->input_surfaces[surfaceCount].height = allocSurf.height; | |
/* 1MB is large enough to hold most output frames. NVENC increases this automaticaly if it's not enough. */ | |
allocOut.size = 1024 * 1024; | |
@@ -1106,21 +973,22 @@ | |
return 0; | |
error: | |
- | |
for (i = 0; i < surfaceCount; ++i) { | |
- p_nvenc->nvEncDestroyInputBuffer(ctx->nvencoder, ctx->input_surfaces[i].input_surface); | |
+ p_nvenc->nvEncUnregisterResource(ctx->nvencoder, ctx->input_surfaces[i].hRes); | |
+ p_cuda->cu_mem_free(ctx->input_surfaces[i].dptr); | |
+ | |
if (ctx->output_surfaces[i].output_surface) | |
p_nvenc->nvEncDestroyBitstreamBuffer(ctx->nvencoder, ctx->output_surfaces[i].output_surface); | |
} | |
+ p_cuda->cu_mem_free(ctx->transferSurf.dptr); | |
if (ctx->nvencoder) | |
p_nvenc->nvEncDestroyEncoder(ctx->nvencoder); | |
if (ctx->cu_context) | |
- dl_fn->cu_ctx_destroy(ctx->cu_context); | |
+ release_cuda_context(&ctx->cu_context, ctx->gpu); | |
nvenc_unload_nvenc(avctx); | |
- | |
ctx->nvencoder = NULL; | |
ctx->cu_context = NULL; | |
@@ -1132,6 +1000,7 @@ | |
NvencContext *ctx = avctx->priv_data; | |
NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs; | |
NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs; | |
+ CudaDynLoadFunctions *p_cuda = get_cuda_dl_func(); | |
int i; | |
av_freep(&ctx->timestamp_list.data); | |
@@ -1139,16 +1008,19 @@ | |
av_freep(&ctx->output_surface_queue.data); | |
for (i = 0; i < ctx->max_surface_count; ++i) { | |
- p_nvenc->nvEncDestroyInputBuffer(ctx->nvencoder, ctx->input_surfaces[i].input_surface); | |
+ p_nvenc->nvEncUnregisterResource(ctx->nvencoder, ctx->input_surfaces[i].hRes); | |
+ p_cuda->cu_mem_free(ctx->input_surfaces[i].dptr); | |
p_nvenc->nvEncDestroyBitstreamBuffer(ctx->nvencoder, ctx->output_surfaces[i].output_surface); | |
} | |
ctx->max_surface_count = 0; | |
- p_nvenc->nvEncDestroyEncoder(ctx->nvencoder); | |
+ if (ctx->nvencoder) | |
+ p_nvenc->nvEncDestroyEncoder(ctx->nvencoder); | |
+ | |
ctx->nvencoder = NULL; | |
- dl_fn->cu_ctx_destroy(ctx->cu_context); | |
- ctx->cu_context = NULL; | |
+ if (ctx->cu_context) | |
+ release_cuda_context(&ctx->cu_context, ctx->gpu); | |
nvenc_unload_nvenc(avctx); | |
@@ -1163,6 +1035,7 @@ | |
uint32_t slice_mode_data; | |
uint32_t *slice_offsets; | |
+ char picType = 'X'; | |
NV_ENC_LOCK_BITSTREAM lock_params = { 0 }; | |
NVENCSTATUS nv_status; | |
int res = 0; | |
@@ -1215,12 +1088,15 @@ | |
FF_DISABLE_DEPRECATION_WARNINGS | |
case NV_ENC_PIC_TYPE_I: | |
avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I; | |
+ picType = 'I'; | |
break; | |
case NV_ENC_PIC_TYPE_P: | |
avctx->coded_frame->pict_type = AV_PICTURE_TYPE_P; | |
+ picType = 'P'; | |
break; | |
case NV_ENC_PIC_TYPE_B: | |
avctx->coded_frame->pict_type = AV_PICTURE_TYPE_B; | |
+ picType = 'B'; | |
break; | |
case NV_ENC_PIC_TYPE_BI: | |
avctx->coded_frame->pict_type = AV_PICTURE_TYPE_BI; | |
@@ -1234,6 +1110,8 @@ | |
#endif | |
} | |
+ av_log(avctx, AV_LOG_VERBOSE, "FRAME STATISTICS: Frame No. %d PicType %c Frame AvgQP %d SATD Cost %d Size %d bytes\r", lock_params.frameIdx, picType, lock_params.frameAvgQP, lock_params.frameSatd, lock_params.bitstreamSizeInBytes); | |
+ | |
pkt->pts = lock_params.outputTimeStamp; | |
pkt->dts = timestamp_queue_dequeue(&ctx->timestamp_list); | |
@@ -1261,6 +1139,61 @@ | |
return res; | |
} | |
+ | |
+/* Launch the interleaveChroma kernel (BLOCKSX, THREADSX launch dims); parameter order must match the PTX entry. Returns 0. */ | |
+static int call_interleavechroma_kernel(CudaDynLoadFunctions* dl_func, CUfunction func, CUdeviceptr cb_dptr, CUdeviceptr cr_dptr, | |
+    CUdeviceptr nv12chroma_dptr, int width, int height, int srcStride, int dstStride) | |
+{ | |
+    void *kernel_params[] = { &cb_dptr, &cr_dptr, &nv12chroma_dptr, &width, &height, &srcStride, &dstStride }; | |
+    __cu(dl_func->cu_launch_kernel(func, BLOCKSX, 1, 1, THREADSX, 1, 1, 0, NULL, kernel_params, NULL)); | |
+    return 0; | |
+} | |
+ | |
+/* | |
+ * Upload one frame into a CUDA-backed NV12 input surface. | |
+ * Returns 0 on success, AVERROR(EINVAL) for an unsupported pixel format, or the | |
+ * status of the chroma-interleave launch (which used to be discarded). | |
+ */ | |
+static int nvenc_copy_to_inputbuffer(NvencContext *ctx, const AVFrame* frame, NvencInputSurface *inSurf) | |
+{ | |
+    CudaDynLoadFunctions *p_cuda = get_cuda_dl_func(); | |
+    /* Destination chroma starts after inSurf->height rows of luma. */ | |
+    CUdeviceptr dst_chroma = inSurf->dptr + inSurf->pitch*inSurf->height; | |
+    ffnvinfo *info = NULL; | |
+    int ret = 0; | |
+ | |
+    if (frame->format != AV_PIX_FMT_NV12 && frame->format != AV_PIX_FMT_YUV420P) { | |
+        av_log(NULL, AV_LOG_FATAL, "Invalid pixel format!\n"); | |
+        return AVERROR(EINVAL); | |
+    } | |
+    /* A valid ffnvinfo in frame->opaque with a device pointer selects the device-to-device path. */ | |
+    if (frame->opaque && check_nvinfo(frame->opaque) && ((ffnvinfo*)(frame->opaque))->dptr[0]) | |
+        info = (ffnvinfo*)frame->opaque; | |
+ | |
+    if (frame->format == AV_PIX_FMT_NV12) { | |
+        if (info) { | |
+            __cu(cuMemCpy2d(NULL, info->dptr[0], info->linesize[0], NULL, inSurf->dptr, inSurf->pitch, frame->width, frame->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE)); | |
+            __cu(cuMemCpy2d(NULL, info->dptr[1], info->linesize[1], NULL, dst_chroma, inSurf->pitch, frame->width, frame->height/2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE)); | |
+        } else { | |
+            __cu(cuMemCpy2d(frame->data[0], (CUdeviceptr)NULL, frame->linesize[0], NULL, inSurf->dptr, inSurf->pitch, frame->width, frame->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+            __cu(cuMemCpy2d(frame->data[1], (CUdeviceptr)NULL, frame->linesize[1], NULL, dst_chroma, inSurf->pitch, frame->width, frame->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+        } | |
+    } else if (info) { | |
+        /* YUV420P already on the device: interleave Cb/Cr straight from the source planes. */ | |
+        __cu(cuMemCpy2d(NULL, info->dptr[0], info->linesize[0], NULL, inSurf->dptr, inSurf->pitch, frame->width, frame->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE)); | |
+        ret = call_interleavechroma_kernel(p_cuda, ctx->cu_func_interleaveChroma, info->dptr[1], info->dptr[2], dst_chroma, (frame->width+31) & ~31 , frame->height, (info->linesize[1]<<1), inSurf->pitch); | |
+    } else { | |
+        /* YUV420P in system memory: stage Cb, then Cr, in transferSurf at half pitch before interleaving. */ | |
+        CUdeviceptr cr_stage = ctx->transferSurf.dptr + ctx->transferSurf.pitch*ctx->transferSurf.height / 4; | |
+        __cu(cuMemCpy2d(frame->data[0], (CUdeviceptr)NULL, frame->linesize[0], NULL, inSurf->dptr, inSurf->pitch, frame->width, frame->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+        __cu(cuMemCpy2d(frame->data[1], (CUdeviceptr)NULL, frame->linesize[1], NULL, ctx->transferSurf.dptr, ctx->transferSurf.pitch / 2, ctx->transferSurf.width / 2, frame->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+        __cu(cuMemCpy2d(frame->data[2], (CUdeviceptr)NULL, frame->linesize[2], NULL, cr_stage, ctx->transferSurf.pitch / 2, ctx->transferSurf.width / 2, frame->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+        ret = call_interleavechroma_kernel(p_cuda, ctx->cu_func_interleaveChroma, ctx->transferSurf.dptr, cr_stage, dst_chroma, (frame->width + 31) & ~31, frame->height, ctx->transferSurf.pitch, inSurf->pitch); | |
+    } | |
+ | |
+    return ret; | |
+} | |
+ | |
static int nvenc_encode_frame(AVCodecContext *avctx, AVPacket *pkt, | |
const AVFrame *frame, int *got_packet) | |
{ | |
@@ -1276,7 +1209,7 @@ | |
pic_params.version = NV_ENC_PIC_PARAMS_VER; | |
if (frame) { | |
- NV_ENC_LOCK_INPUT_BUFFER lockBufferParams = { 0 }; | |
+ NV_ENC_MAP_INPUT_RESOURCE mapParams = { 0 }; | |
NvencInputSurface *inSurf = NULL; | |
for (i = 0; i < ctx->max_surface_count; ++i) { | |
@@ -1290,69 +1223,27 @@ | |
inSurf->lockCount = 1; | |
- lockBufferParams.version = NV_ENC_LOCK_INPUT_BUFFER_VER; | |
- lockBufferParams.inputBuffer = inSurf->input_surface; | |
- | |
- nv_status = p_nvenc->nvEncLockInputBuffer(ctx->nvencoder, &lockBufferParams); | |
+ mapParams.version = NV_ENC_MAP_INPUT_RESOURCE_VER; | |
+ mapParams.registeredResource = inSurf->hRes; | |
+ nv_status = p_nvenc->nvEncMapInputResource(ctx->nvencoder, &mapParams); | |
if (nv_status != NV_ENC_SUCCESS) { | |
- av_log(avctx, AV_LOG_ERROR, "Failed locking nvenc input buffer\n"); | |
+ av_log(avctx, AV_LOG_ERROR, "Failed mapping nvenc input buffer\n"); | |
return 0; | |
} | |
- if (avctx->pix_fmt == AV_PIX_FMT_YUV420P) { | |
- uint8_t *buf = lockBufferParams.bufferDataPtr; | |
- | |
- av_image_copy_plane(buf, lockBufferParams.pitch, | |
- frame->data[0], frame->linesize[0], | |
- avctx->width, avctx->height); | |
- | |
- buf += inSurf->height * lockBufferParams.pitch; | |
- | |
- av_image_copy_plane(buf, lockBufferParams.pitch >> 1, | |
- frame->data[2], frame->linesize[2], | |
- avctx->width >> 1, avctx->height >> 1); | |
- | |
- buf += (inSurf->height * lockBufferParams.pitch) >> 2; | |
- | |
- av_image_copy_plane(buf, lockBufferParams.pitch >> 1, | |
- frame->data[1], frame->linesize[1], | |
- avctx->width >> 1, avctx->height >> 1); | |
- } else if (avctx->pix_fmt == AV_PIX_FMT_NV12) { | |
- uint8_t *buf = lockBufferParams.bufferDataPtr; | |
- | |
- av_image_copy_plane(buf, lockBufferParams.pitch, | |
- frame->data[0], frame->linesize[0], | |
- avctx->width, avctx->height); | |
- | |
- buf += inSurf->height * lockBufferParams.pitch; | |
- | |
- av_image_copy_plane(buf, lockBufferParams.pitch, | |
- frame->data[1], frame->linesize[1], | |
- avctx->width, avctx->height >> 1); | |
- } else if (avctx->pix_fmt == AV_PIX_FMT_YUV444P) { | |
- uint8_t *buf = lockBufferParams.bufferDataPtr; | |
- | |
- av_image_copy_plane(buf, lockBufferParams.pitch, | |
- frame->data[0], frame->linesize[0], | |
- avctx->width, avctx->height); | |
- | |
- buf += inSurf->height * lockBufferParams.pitch; | |
- | |
- av_image_copy_plane(buf, lockBufferParams.pitch, | |
- frame->data[1], frame->linesize[1], | |
- avctx->width, avctx->height); | |
- | |
- buf += inSurf->height * lockBufferParams.pitch; | |
+ inSurf->input_surface = mapParams.mappedResource; | |
+ if (inSurf->format != mapParams.mappedBufferFmt) { | |
+ av_log(avctx, AV_LOG_ERROR, "Incompatible buffer format!\n"); | |
+ return 0; | |
+ } | |
- av_image_copy_plane(buf, lockBufferParams.pitch, | |
- frame->data[2], frame->linesize[2], | |
- avctx->width, avctx->height); | |
- } else { | |
- av_log(avctx, AV_LOG_FATAL, "Invalid pixel format!\n"); | |
- return AVERROR(EINVAL); | |
+ if (nvenc_copy_to_inputbuffer(ctx, frame, inSurf) != 0) { | |
+ p_nvenc->nvEncUnmapInputResource(ctx->nvencoder, inSurf->input_surface); | |
+ av_log(avctx, AV_LOG_ERROR, "Failed to copy data to NVENC input buffer!\n"); | |
+ return 0; | |
} | |
- nv_status = p_nvenc->nvEncUnlockInputBuffer(ctx->nvencoder, inSurf->input_surface); | |
+ nv_status = p_nvenc->nvEncUnmapInputResource(ctx->nvencoder, inSurf->input_surface); | |
if (nv_status != NV_ENC_SUCCESS) { | |
av_log(avctx, AV_LOG_FATAL, "Failed unlocking input buffer!\n"); | |
return AVERROR_EXTERNAL; | |
@@ -1470,7 +1361,6 @@ | |
static const enum AVPixelFormat pix_fmts_nvenc[] = { | |
AV_PIX_FMT_YUV420P, | |
AV_PIX_FMT_NV12, | |
- AV_PIX_FMT_YUV444P, | |
AV_PIX_FMT_NONE | |
}; | |
@@ -1478,13 +1368,14 @@ | |
#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM | |
static const AVOption options[] = { | |
{ "preset", "Set the encoding preset (one of slow = hq 2pass, medium = hq, fast = hp, hq, hp, bd, ll, llhq, llhp, default)", OFFSET(preset), AV_OPT_TYPE_STRING, { .str = "medium" }, 0, 0, VE }, | |
- { "profile", "Set the encoding profile (high, main, baseline or high444p)", OFFSET(profile), AV_OPT_TYPE_STRING, { .str = "main" }, 0, 0, VE }, | |
+ { "profile", "Set the encoding profile (high, main, baseline)", OFFSET(profile), AV_OPT_TYPE_STRING, { .str = "main" }, 0, 0, VE }, | |
{ "level", "Set the encoding level restriction (auto, 1.0, 1.0b, 1.1, 1.2, ..., 4.2, 5.0, 5.1)", OFFSET(level), AV_OPT_TYPE_STRING, { .str = "auto" }, 0, 0, VE }, | |
{ "tier", "Set the encoding tier (main or high)", OFFSET(tier), AV_OPT_TYPE_STRING, { .str = "main" }, 0, 0, VE }, | |
{ "cbr", "Use cbr encoding mode", OFFSET(cbr), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE }, | |
{ "2pass", "Use 2pass encoding mode", OFFSET(twopass), AV_OPT_TYPE_BOOL, { .i64 = -1 }, -1, 1, VE }, | |
{ "gpu", "Selects which NVENC capable GPU to use. First GPU is 0, second is 1, and so on.", OFFSET(gpu), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE }, | |
{ "delay", "Delays frame output by the given amount of frames.", OFFSET(buffer_delay), AV_OPT_TYPE_INT, { .i64 = INT_MAX }, 0, INT_MAX, VE }, | |
+ { "enableaq", "set to 1 to enable AQ ", OFFSET(aq), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE }, | |
{ NULL } | |
}; | |
diff -ruN ffmpeg-orig/libavcodec/nvenc_ptx.c ffmpeg/libavcodec/nvenc_ptx.c | |
--- ffmpeg-orig/libavcodec/nvenc_ptx.c 1970-01-01 03:00:00.000000000 +0300 | |
+++ ffmpeg/libavcodec/nvenc_ptx.c 2016-03-04 00:18:17.585433546 +0200 | |
@@ -0,0 +1,240 @@ | |
+/* Choose the PTX text matching the host pointer width (the 32-bit variant | |
+ * takes its pointer parameters as .u32, the 64-bit one as .u64). | |
+ * __SIZEOF_POINTER__ is authoritative where the compiler defines it; the | |
+ * named macros are a fallback for compilers that do not (e.g. MSVC). */ | |
+#if defined(__SIZEOF_POINTER__) | |
+#if __SIZEOF_POINTER__ == 8 | |
+#define ENVIRONMENT64 | |
+#else | |
+#define ENVIRONMENT32 | |
+#endif | |
+#elif defined(_WIN64) || defined(__LP64__) || defined(_LP64) || defined(__x86_64__) || defined(__ppc64__) || defined(__aarch64__) | |
+#define ENVIRONMENT64 | |
+#else | |
+#define ENVIRONMENT32 | |
+#endif | |
+#ifdef ENVIRONMENT32 | |
+const char color_ptx[] = \ | |
+ "//\n" | |
+ "// Generated by NVIDIA NVVM Compiler\n" | |
+ "//\n" | |
+ "// Compiler Build ID: CL-19830389\n" | |
+ "// Cuda compilation tools, release 8.0, V8.0.0\n" | |
+ "// Based on LLVM 3.4svn\n" | |
+ "//\n" | |
+ "\n" | |
+ ".version 4.3\n" | |
+ ".target sm_30\n" | |
+ ".address_size 32\n" | |
+ "\n" | |
+ "// .globl interleaveChroma\n" | |
+ "\n" | |
+ ".visible .entry interleaveChroma(\n" | |
+ ".param .u32 interleaveChroma_param_0,\n" | |
+ ".param .u32 interleaveChroma_param_1,\n" | |
+ ".param .u32 interleaveChroma_param_2,\n" | |
+ ".param .u32 interleaveChroma_param_3,\n" | |
+ ".param .u32 interleaveChroma_param_4,\n" | |
+ ".param .u32 interleaveChroma_param_5,\n" | |
+ ".param .u32 interleaveChroma_param_6\n" | |
+ ")\n" | |
+ "{\n" | |
+ ".reg .pred %p<5>;\n" | |
+ ".reg .b32 %r<57>;\n" | |
+ "\n" | |
+ "\n" | |
+ "ld.param.u32 %r15, [interleaveChroma_param_0];\n" | |
+ "ld.param.u32 %r16, [interleaveChroma_param_1];\n" | |
+ "ld.param.u32 %r17, [interleaveChroma_param_2];\n" | |
+ "ld.param.u32 %r18, [interleaveChroma_param_3];\n" | |
+ "ld.param.u32 %r21, [interleaveChroma_param_4];\n" | |
+ "ld.param.u32 %r19, [interleaveChroma_param_5];\n" | |
+ "ld.param.u32 %r20, [interleaveChroma_param_6];\n" | |
+ "shr.s32 %r1, %r21, 1;\n" | |
+ "mov.u32 %r55, %ctaid.x;\n" | |
+ "setp.ge.s32 %p1, %r55, %r1;\n" | |
+ "@%p1 bra BB0_6;\n" | |
+ "\n" | |
+ "cvta.to.global.u32 %r3, %r17;\n" | |
+ "cvta.to.global.u32 %r4, %r16;\n" | |
+ "cvta.to.global.u32 %r5, %r15;\n" | |
+ "mov.u32 %r6, %tid.x;\n" | |
+ "shr.s32 %r7, %r18, 3;\n" | |
+ "mov.u32 %r8, %ntid.x;\n" | |
+ "\n" | |
+ "BB0_2:\n" | |
+ "setp.ge.s32 %p2, %r6, %r7;\n" | |
+ "@%p2 bra BB0_5;\n" | |
+ "\n" | |
+ "mul.lo.s32 %r22, %r55, %r19;\n" | |
+ "mul.lo.s32 %r23, %r55, %r20;\n" | |
+ "shr.s32 %r10, %r23, 2;\n" | |
+ "shr.s32 %r24, %r22, 2;\n" | |
+ "shr.u32 %r11, %r24, 1;\n" | |
+ "mov.u32 %r56, %r6;\n" | |
+ "\n" | |
+ "BB0_4:\n" | |
+ "mov.u32 %r12, %r56;\n" | |
+ "add.s32 %r25, %r12, %r11;\n" | |
+ "shl.b32 %r26, %r25, 2;\n" | |
+ "add.s32 %r27, %r5, %r26;\n" | |
+ "add.s32 %r28, %r4, %r26;\n" | |
+ "ld.global.u32 %r29, [%r28];\n" | |
+ "and.b32 %r30, %r29, 65280;\n" | |
+ "shl.b32 %r31, %r30, 16;\n" | |
+ "ld.global.u32 %r32, [%r27];\n" | |
+ "shl.b32 %r33, %r32, 8;\n" | |
+ "and.b32 %r34, %r33, 16711680;\n" | |
+ "shl.b32 %r35, %r29, 8;\n" | |
+ "and.b32 %r36, %r35, 65280;\n" | |
+ "and.b32 %r37, %r32, 255;\n" | |
+ "or.b32 %r38, %r34, %r37;\n" | |
+ "or.b32 %r39, %r38, %r31;\n" | |
+ "or.b32 %r40, %r39, %r36;\n" | |
+ "shl.b32 %r41, %r12, 1;\n" | |
+ "add.s32 %r42, %r41, %r10;\n" | |
+ "shl.b32 %r43, %r42, 2;\n" | |
+ "add.s32 %r44, %r3, %r43;\n" | |
+ "st.global.u32 [%r44], %r40;\n" | |
+ "and.b32 %r45, %r29, -16777216;\n" | |
+ "and.b32 %r46, %r32, -16777216;\n" | |
+ "shr.u32 %r47, %r46, 8;\n" | |
+ "or.b32 %r48, %r45, %r47;\n" | |
+ "and.b32 %r49, %r29, 16711680;\n" | |
+ "shr.u32 %r50, %r49, 8;\n" | |
+ "bfe.u32 %r51, %r32, 16, 8;\n" | |
+ "or.b32 %r52, %r48, %r51;\n" | |
+ "or.b32 %r53, %r52, %r50;\n" | |
+ "st.global.u32 [%r44+4], %r53;\n" | |
+ "add.s32 %r13, %r8, %r12;\n" | |
+ "setp.lt.s32 %p3, %r13, %r7;\n" | |
+ "mov.u32 %r56, %r13;\n" | |
+ "@%p3 bra BB0_4;\n" | |
+ "\n" | |
+ "BB0_5:\n" | |
+ "mov.u32 %r54, %nctaid.x;\n" | |
+ "add.s32 %r55, %r54, %r55;\n" | |
+ "setp.lt.s32 %p4, %r55, %r1;\n" | |
+ "@%p4 bra BB0_2;\n" | |
+ "\n" | |
+ "BB0_6:\n" | |
+ "ret;\n" | |
+ "}\n" | |
+ "\n" | |
+ "\n" | |
+; | |
+#elif defined ENVIRONMENT64 | |
+const char color_ptx[] = \ | |
+ "//\n" | |
+ "// Generated by NVIDIA NVVM Compiler\n" | |
+ "//\n" | |
+ "// Compiler Build ID: CL-19830389\n" | |
+ "// Cuda compilation tools, release 8.0, V8.0.0\n" | |
+ "// Based on LLVM 3.4svn\n" | |
+ "//\n" | |
+ "\n" | |
+ ".version 4.3\n" | |
+ ".target sm_30\n" | |
+ ".address_size 64\n" | |
+ "\n" | |
+ "// .globl interleaveChroma\n" | |
+ "\n" | |
+ ".visible .entry interleaveChroma(\n" | |
+ ".param .u64 interleaveChroma_param_0,\n" | |
+ ".param .u64 interleaveChroma_param_1,\n" | |
+ ".param .u64 interleaveChroma_param_2,\n" | |
+ ".param .u32 interleaveChroma_param_3,\n" | |
+ ".param .u32 interleaveChroma_param_4,\n" | |
+ ".param .u32 interleaveChroma_param_5,\n" | |
+ ".param .u32 interleaveChroma_param_6\n" | |
+ ")\n" | |
+ "{\n" | |
+ ".reg .pred %p<5>;\n" | |
+ ".reg .b32 %r<47>;\n" | |
+ ".reg .b64 %rd<14>;\n" | |
+ "\n" | |
+ "\n" | |
+ "ld.param.u64 %rd4, [interleaveChroma_param_0];\n" | |
+ "ld.param.u64 %rd5, [interleaveChroma_param_1];\n" | |
+ "ld.param.u64 %rd6, [interleaveChroma_param_2];\n" | |
+ "ld.param.u32 %r12, [interleaveChroma_param_3];\n" | |
+ "ld.param.u32 %r15, [interleaveChroma_param_4];\n" | |
+ "ld.param.u32 %r13, [interleaveChroma_param_5];\n" | |
+ "ld.param.u32 %r14, [interleaveChroma_param_6];\n" | |
+ "shr.s32 %r1, %r15, 1;\n" | |
+ "mov.u32 %r45, %ctaid.x;\n" | |
+ "setp.ge.s32 %p1, %r45, %r1;\n" | |
+ "@%p1 bra BB0_6;\n" | |
+ "\n" | |
+ "cvta.to.global.u64 %rd1, %rd6;\n" | |
+ "cvta.to.global.u64 %rd2, %rd5;\n" | |
+ "cvta.to.global.u64 %rd3, %rd4;\n" | |
+ "mov.u32 %r3, %tid.x;\n" | |
+ "shr.s32 %r4, %r12, 3;\n" | |
+ "mov.u32 %r5, %ntid.x;\n" | |
+ "\n" | |
+ "BB0_2:\n" | |
+ "setp.ge.s32 %p2, %r3, %r4;\n" | |
+ "@%p2 bra BB0_5;\n" | |
+ "\n" | |
+ "mul.lo.s32 %r16, %r45, %r13;\n" | |
+ "mul.lo.s32 %r17, %r45, %r14;\n" | |
+ "shr.s32 %r7, %r17, 2;\n" | |
+ "shr.s32 %r18, %r16, 2;\n" | |
+ "shr.u32 %r8, %r18, 1;\n" | |
+ "mov.u32 %r46, %r3;\n" | |
+ "\n" | |
+ "BB0_4:\n" | |
+ "mov.u32 %r9, %r46;\n" | |
+ "add.s32 %r19, %r9, %r8;\n" | |
+ "mul.wide.u32 %rd7, %r19, 4;\n" | |
+ "add.s64 %rd8, %rd3, %rd7;\n" | |
+ "add.s64 %rd9, %rd2, %rd7;\n" | |
+ "ld.global.u32 %r20, [%rd9];\n" | |
+ "and.b32 %r21, %r20, 65280;\n" | |
+ "shl.b32 %r22, %r21, 16;\n" | |
+ "ld.global.u32 %r23, [%rd8];\n" | |
+ "shl.b32 %r24, %r23, 8;\n" | |
+ "and.b32 %r25, %r24, 16711680;\n" | |
+ "shl.b32 %r26, %r20, 8;\n" | |
+ "and.b32 %r27, %r26, 65280;\n" | |
+ "and.b32 %r28, %r23, 255;\n" | |
+ "or.b32 %r29, %r25, %r28;\n" | |
+ "or.b32 %r30, %r29, %r22;\n" | |
+ "or.b32 %r31, %r30, %r27;\n" | |
+ "shl.b32 %r32, %r9, 1;\n" | |
+ "add.s32 %r33, %r32, %r7;\n" | |
+ "mul.wide.u32 %rd10, %r33, 4;\n" | |
+ "add.s64 %rd11, %rd1, %rd10;\n" | |
+ "st.global.u32 [%rd11], %r31;\n" | |
+ "and.b32 %r34, %r20, -16777216;\n" | |
+ "and.b32 %r35, %r23, -16777216;\n" | |
+ "shr.u32 %r36, %r35, 8;\n" | |
+ "or.b32 %r37, %r34, %r36;\n" | |
+ "and.b32 %r38, %r20, 16711680;\n" | |
+ "shr.u32 %r39, %r38, 8;\n" | |
+ "bfe.u32 %r40, %r23, 16, 8;\n" | |
+ "or.b32 %r41, %r37, %r40;\n" | |
+ "or.b32 %r42, %r41, %r39;\n" | |
+ "add.s32 %r43, %r33, 1;\n" | |
+ "mul.wide.u32 %rd12, %r43, 4;\n" | |
+ "add.s64 %rd13, %rd1, %rd12;\n" | |
+ "st.global.u32 [%rd13], %r42;\n" | |
+ "add.s32 %r10, %r5, %r9;\n" | |
+ "setp.lt.s32 %p3, %r10, %r4;\n" | |
+ "mov.u32 %r46, %r10;\n" | |
+ "@%p3 bra BB0_4;\n" | |
+ "\n" | |
+ "BB0_5:\n" | |
+ "mov.u32 %r44, %nctaid.x;\n" | |
+ "add.s32 %r45, %r44, %r45;\n" | |
+ "setp.lt.s32 %p4, %r45, %r1;\n" | |
+ "@%p4 bra BB0_2;\n" | |
+ "\n" | |
+ "BB0_6:\n" | |
+ "ret;\n" | |
+ "}\n" | |
+ "\n" | |
+ "\n" | |
+; | |
+#endif | |
diff -ruN ffmpeg-orig/libavfilter/allfilters.c ffmpeg/libavfilter/allfilters.c | |
--- ffmpeg-orig/libavfilter/allfilters.c 2016-03-04 00:17:21.071189460 +0200 | |
+++ ffmpeg/libavfilter/allfilters.c 2016-03-04 00:30:07.377997402 +0200 | |
@@ -225,6 +225,7 @@ | |
REGISTER_FILTER(NOFORMAT, noformat, vf); | |
REGISTER_FILTER(NOISE, noise, vf); | |
REGISTER_FILTER(NULL, null, vf); | |
+ REGISTER_FILTER(NVRESIZE, nvresize, vf); | |
REGISTER_FILTER(OCR, ocr, vf); | |
REGISTER_FILTER(OCV, ocv, vf); | |
REGISTER_FILTER(OVERLAY, overlay, vf); | |
diff -ruN ffmpeg-orig/libavfilter/Makefile ffmpeg/libavfilter/Makefile | |
--- ffmpeg-orig/libavfilter/Makefile 2016-03-04 00:17:21.079190202 +0200 | |
+++ ffmpeg/libavfilter/Makefile 2016-03-04 02:41:05.358919913 +0200 | |
@@ -204,6 +204,7 @@ | |
OBJS-$(CONFIG_NOFORMAT_FILTER) += vf_format.o | |
OBJS-$(CONFIG_NOISE_FILTER) += vf_noise.o | |
OBJS-$(CONFIG_NULL_FILTER) += vf_null.o | |
+OBJS-$(CONFIG_NVRESIZE_FILTER) += vf_nvresize.o vf_nvresize_ptx.o | |
OBJS-$(CONFIG_OCR_FILTER) += vf_ocr.o | |
OBJS-$(CONFIG_OCV_FILTER) += vf_libopencv.o | |
OBJS-$(CONFIG_OPENCL) += deshake_opencl.o unsharp_opencl.o | |
diff -ruN ffmpeg-orig/libavfilter/vf_nvresize.c ffmpeg/libavfilter/vf_nvresize.c | |
--- ffmpeg-orig/libavfilter/vf_nvresize.c 1970-01-01 03:00:00.000000000 +0300 | |
+++ ffmpeg/libavfilter/vf_nvresize.c 2016-03-04 02:27:15.262271696 +0200 | |
@@ -0,0 +1,669 @@ | |
+/* | |
+ * Copyright (c) 2011 Roger Pau Monné <[email protected]> | |
+ * Copyright (c) 2011 Stefano Sabatini | |
+ * Copyright (c) 2013 Paul B Mahol | |
+ * | |
+ * This file is part of FFmpeg. | |
+ * | |
+ * FFmpeg is free software; you can redistribute it and/or | |
+ * modify it under the terms of the GNU Lesser General Public | |
+ * License as published by the Free Software Foundation; either | |
+ * version 2.1 of the License, or (at your option) any later version. | |
+ * | |
+ * FFmpeg is distributed in the hope that it will be useful, | |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
+ * Lesser General Public License for more details. | |
+ * | |
+ * You should have received a copy of the GNU Lesser General Public | |
+ * License along with FFmpeg; if not, write to the Free Software | |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
+ */ | |
+ | |
+ | |
+#include "libavutil/avassert.h" | |
+#include "libavutil/avstring.h" | |
+#include "libavutil/eval.h" | |
+#include "libavutil/mathematics.h" | |
+#include "libavutil/opt.h" | |
+#include "libavutil/pixdesc.h" | |
+#include "libavutil/parseutils.h" | |
+#include "libavutil/cudautils.h" | |
+ | |
+#include "avfilter.h" | |
+#include "drawutils.h" | |
+#include "formats.h" | |
+#include "internal.h" | |
+#include "video.h" | |
+ | |
+#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) ) | |
+#define MAX_OUTPUT 16 | |
+#define BLOCKX 32 | |
+#define BLOCKY 16 | |
+ | |
+typedef struct cu_tex { /* geometry and storage of one device surface */ | |
+ int w; /* width in pixels */ | |
+ int h; /* height in pixels; planar formats place further planes after h luma rows */ | |
+ size_t pitch; /* row pitch of the allocation (filled in by cu_mem_alloc_pitch for outputs) */ | |
+ CUdeviceptr dptr; /* base device address; planes are addressed as offsets from it */ | |
+} cu_tex; | |
+ | |
+typedef struct NVResizeContext { | |
+ const AVClass *class; | |
+ | |
+ /** | |
+ * Number of output pads (option "outputs", 1..MAX_OUTPUT). Per-output | |
+ * sizes live in outtex[i].w/h; config_output() reads 0 there as the | |
+ * input width/height, a negative value as "derive from the other | |
+ * dimension keeping the aspect", and -N also as "multiple of N". | |
+ */ | |
+ int nb_outputs; | |
+ | |
+ char *size_str; /* raw "size"/"s" option: one or more sizes separated by '|' */ | |
+ int force_original_aspect_ratio; /* 0 = off, 1 = take the smaller candidate size, other = the larger */ | |
+ int readback_FB; /* "readback" option; its consumer is outside this part of the file */ | |
+ int gpu; /* "gpu" option, passed to get_cuda_context() */ | |
+ | |
+ int cuda_inited; | |
+ | |
+ CUcontext cu_ctx; /* obtained from get_cuda_context() in init() */ | |
+ CudaDynLoadFunctions* cu_dl_func; /* CUDA entry points returned by get_cuda_dl_func() */ | |
+ CUmodule cu_module; /* module loaded from resize_ptx */ | |
+ CUfunction cu_func_uchar; /* "Subsample_Bilinear_uchar" */ | |
+ CUfunction cu_func_uchar2; /* "Subsample_Bilinear_uchar2" */ | |
+ CUfunction cu_func_uchar4; /* "Subsample_Bilinear_uchar4" */ | |
+ CUtexref cu_tex_uchar; /* "uchar_tex" */ | |
+ CUtexref cu_tex_uchar2; /* "uchar2_tex" */ | |
+ CUtexref cu_tex_uchar4; /* "uchar4_tex" */ | |
+ cu_tex intex; /* presumably the input surface -- its use is outside this part of the file */ | |
+ cu_tex outtex[MAX_OUTPUT]; /* one surface per output pad, indexed like the "out<N>" pads */ | |
+ | |
+} NVResizeContext; | |
+ | |
+#define OFFSET(x) offsetof(NVResizeContext, x) | |
+#define FLAGS AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM | |
+ | |
+static const AVOption nvresize_options[] = { | |
+    { "outputs", "set number of outputs", OFFSET(nb_outputs), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, MAX_OUTPUT, FLAGS }, | |
+    { "readback", "read result back to FB", OFFSET(readback_FB), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, FLAGS }, | |
+    { "size", "set video size", OFFSET(size_str), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS }, /* min and max are unused for strings but must both be present so FLAGS lands in .flags */ | |
+    { "s", "set video size", OFFSET(size_str), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS }, /* short alias of "size" */ | |
+    { "gpu", "Selects which NVENC capable GPU to use. First GPU is 0, second is 1, and so on.", OFFSET(gpu), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, FLAGS }, | |
+    { "force_original_aspect_ratio", "decrease or increase w/h if necessary to keep the original AR", OFFSET(force_original_aspect_ratio), AV_OPT_TYPE_INT, { .i64 = 0}, 0, 2, FLAGS, "force_oar" }, | |
+    { NULL } | |
+}; | |
+ | |
+AVFILTER_DEFINE_CLASS(nvresize); | |
+ | |
+static int query_formats(AVFilterContext *ctx) | |
+{ | |
+    /* Formats do_cuda_resize() has a case for; AV_PIX_FMT_NONE ends the list. */ | |
+    static const enum AVPixelFormat supported[] = { | |
+        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV444P, AV_PIX_FMT_NV12, | |
+        AV_PIX_FMT_ARGB, AV_PIX_FMT_RGBA, | |
+        AV_PIX_FMT_ABGR, AV_PIX_FMT_BGRA, | |
+        AV_PIX_FMT_NONE, | |
+    }; | |
+    AVFilterFormats *formats; | |
+ | |
+    /* Build the format list and hand it to the filter's links. */ | |
+    formats = ff_make_format_list((const int*)supported); | |
+    if (formats) | |
+        return ff_set_common_formats(ctx, formats); | |
+ | |
+    return AVERROR(ENOMEM); | |
+} | |
+ | |
+/** | |
+ * Resolve the size of the output pad behind outlink and allocate its | |
+ * device surface. Returns 0, or AVERROR(EINVAL) when a dimension does | |
+ * not fit in an int; CUDA failures are left to the __cu() macro. | |
+ */ | |
+static int config_output(AVFilterLink *outlink) | |
+{ | |
+    AVFilterContext *ctx = outlink->src; | |
+    AVFilterLink *inlink = outlink->src->inputs[0]; | |
+    NVResizeContext *s = ctx->priv; | |
+    int outIdx = atoi(outlink->srcpad->name + 3); /* pads are named "out<N>" by init() */ | |
+    int64_t w = s->outtex[outIdx].w; | |
+    int64_t h = s->outtex[outIdx].h; | |
+    int factor_w = 1, factor_h = 1; | |
+ | |
+    // Check if it is requested that the result has to be divisible by some | |
+    // factor (w or h = -n with n being the factor). | |
+    if (w < -1) | |
+        factor_w = -w; | |
+    if (h < -1) | |
+        factor_h = -h; | |
+ | |
+    if (w < 0 && h < 0) | |
+        s->outtex[outIdx].w = s->outtex[outIdx].h = 0; | |
+ | |
+    if (!(w = s->outtex[outIdx].w)) | |
+        w = inlink->w; | |
+    if (!(h = s->outtex[outIdx].h)) | |
+        h = inlink->h; | |
+ | |
+    // Make sure that the result is divisible by the factor we determined | |
+    // earlier. If no factor was set, nothing will happen as the default | |
+    // factor is 1 | |
+    if (w < 0) | |
+        w = av_rescale(h, inlink->w, inlink->h * factor_w) * factor_w; | |
+    if (h < 0) | |
+        h = av_rescale(w, inlink->h, inlink->w * factor_h) * factor_h; | |
+ | |
+    // Note that force_original_aspect_ratio may overwrite the previous set | |
+    // dimensions so that it is not divisible by the set factors anymore. | |
+    if (s->force_original_aspect_ratio) { | |
+        int tmp_w = av_rescale(h, inlink->w, inlink->h); | |
+        int tmp_h = av_rescale(w, inlink->h, inlink->w); | |
+        if (s->force_original_aspect_ratio == 1) { | |
+            w = FFMIN(tmp_w, w); | |
+            h = FFMIN(tmp_h, h); | |
+        } else { | |
+            w = FFMAX(tmp_w, w); | |
+            h = FFMAX(tmp_h, h); | |
+        } | |
+    } | |
+ | |
+    // w and h are narrowed to int below, so fail instead of truncating | |
+    if (w > INT_MAX || h > INT_MAX || | |
+        (h * inlink->w) > INT_MAX || | |
+        (w * inlink->h) > INT_MAX) { | |
+        av_log(ctx, AV_LOG_ERROR, "Rescaled value for width or height is too big.\n"); | |
+        return AVERROR(EINVAL); | |
+    } | |
+ | |
+    s->outtex[outIdx].w = outlink->w = w; | |
+    s->outtex[outIdx].h = outlink->h = h; | |
+ | |
+    if (inlink->sample_aspect_ratio.num){ | |
+        outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h * inlink->w, outlink->w * inlink->h}, inlink->sample_aspect_ratio); | |
+    } else | |
+        outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; | |
+ | |
+    // create output device memory | |
+    switch(outlink->format) { | |
+    case AV_PIX_FMT_YUV420P: | |
+    case AV_PIX_FMT_NV12: | |
+        __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->outtex[outIdx].dptr, | |
+            &s->outtex[outIdx].pitch, s->outtex[outIdx].w, s->outtex[outIdx].h*3/2, 16)); | |
+        break; | |
+ | |
+    case AV_PIX_FMT_YUV444P: | |
+        __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->outtex[outIdx].dptr, | |
+            &s->outtex[outIdx].pitch, s->outtex[outIdx].w, s->outtex[outIdx].h*3, 16)); | |
+        break; | |
+ | |
+    case AV_PIX_FMT_ARGB: | |
+    case AV_PIX_FMT_RGBA: | |
+    case AV_PIX_FMT_ABGR: | |
+    case AV_PIX_FMT_BGRA: | |
+        __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->outtex[outIdx].dptr, | |
+            &s->outtex[outIdx].pitch, s->outtex[outIdx].w*4, s->outtex[outIdx].h, 16)); | |
+        break; | |
+    } | |
+ | |
+    return 0; | |
+} | |
+ | |
+/** | |
+ * Create the output pads, parse the '|'-separated "size" option into | |
+ * outtex[] (widest first) and load the resize module and its kernels. | |
+ * Returns 0 or a negative AVERROR code; CUDA failures are left to __cu(). | |
+ */ | |
+static av_cold int init(AVFilterContext *ctx) | |
+{ | |
+    extern char resize_ptx[]; | |
+    NVResizeContext *s = ctx->priv; | |
+    int ret, i, j, count = 0; | |
+    for (i = 0; i < s->nb_outputs; i++) { | |
+        char name[32]; | |
+        AVFilterPad pad = { 0 }; | |
+        snprintf(name, sizeof(name), "out%d", i); | |
+        pad.type = ctx->filter->inputs[0].type; | |
+        pad.name = av_strdup(name); | |
+        pad.config_props = config_output; | |
+        if (!pad.name) | |
+            return AVERROR(ENOMEM); | |
+        ff_insert_outpad(ctx, i, &pad); | |
+    } | |
+ | |
+    // parse size parameters here | |
+    if (s->size_str) { | |
+        char *head = s->size_str; | |
+        while (head) { | |
+            char *found = strchr(head, '|'); | |
+            if (found) | |
+                *found = 0; | |
+            // outtex[] has MAX_OUTPUT slots; refuse to write past the last one | |
+            if (count >= MAX_OUTPUT) { | |
+                av_log(ctx, AV_LOG_ERROR, "Too many sizes given, at most %d are supported\n", MAX_OUTPUT); | |
+                return AVERROR(EINVAL); | |
+            } | |
+            if ((ret = av_parse_video_size(&s->outtex[count].w, &s->outtex[count].h, head)) < 0) { | |
+                av_log(ctx, AV_LOG_ERROR, "Invalid size '%s'\n", head); | |
+                return ret; | |
+            } | |
+            count++; | |
+            head = found ? found + 1 : NULL; | |
+        } | |
+    } | |
+ | |
+    // sort the output sizes by descending width | |
+    for (i = 0; i < count; i++) { | |
+        for (j = i+1; j < count; j++) { | |
+            int tempH, tempW; | |
+            if (s->outtex[i].w < s->outtex[j].w) { | |
+                tempW = s->outtex[i].w; tempH = s->outtex[i].h; | |
+                s->outtex[i].w = s->outtex[j].w; s->outtex[i].h = s->outtex[j].h; | |
+                s->outtex[j].w = tempW; s->outtex[j].h = tempH; | |
+            } | |
+        } | |
+    } | |
+ | |
+    // fewer sizes than outputs: move them to the tail, the leading outputs get 0x0 (= input size) | |
+    if (count < s->nb_outputs) { | |
+        int offset = s->nb_outputs - count; | |
+        for (i = s->nb_outputs-1; i >= offset; i--) { | |
+            s->outtex[i].w = s->outtex[i-offset].w; | |
+            s->outtex[i].h = s->outtex[i-offset].h; | |
+        } | |
+        for (i = 0; i < offset; i++) { | |
+            s->outtex[i].w = s->outtex[i].h = 0; | |
+        } | |
+    } | |
+ | |
+    // init cuda_context | |
+    if (!s->cu_ctx) { | |
+        init_cuda(); | |
+        get_cuda_context(&s->cu_ctx, s->gpu); | |
+    } | |
+    s->cu_dl_func = get_cuda_dl_func(); | |
+ | |
+    __cu(s->cu_dl_func->cu_module_load_data(&s->cu_module, resize_ptx)); | |
+    // load functions | |
+    __cu(s->cu_dl_func->cu_module_get_function(&s->cu_func_uchar, s->cu_module, "Subsample_Bilinear_uchar")); | |
+    __cu(s->cu_dl_func->cu_module_get_function(&s->cu_func_uchar2, s->cu_module, "Subsample_Bilinear_uchar2")); | |
+    __cu(s->cu_dl_func->cu_module_get_function(&s->cu_func_uchar4, s->cu_module, "Subsample_Bilinear_uchar4")); | |
+    __cu(s->cu_dl_func->cu_module_get_texref(&s->cu_tex_uchar, s->cu_module, "uchar_tex")); | |
+    __cu(s->cu_dl_func->cu_module_get_texref(&s->cu_tex_uchar2, s->cu_module, "uchar2_tex")); | |
+    __cu(s->cu_dl_func->cu_module_get_texref(&s->cu_tex_uchar4, s->cu_module, "uchar4_tex")); | |
+ | |
+    __cu(s->cu_dl_func->cu_texref_set_flags(s->cu_tex_uchar, CU_TRSF_READ_AS_INTEGER)); | |
+    __cu(s->cu_dl_func->cu_texref_set_flags(s->cu_tex_uchar2, CU_TRSF_READ_AS_INTEGER)); | |
+    __cu(s->cu_dl_func->cu_texref_set_flags(s->cu_tex_uchar4, CU_TRSF_READ_AS_INTEGER)); | |
+    __cu(s->cu_dl_func->cu_texref_set_filtermode(s->cu_tex_uchar, CU_TR_FILTER_MODE_LINEAR)); | |
+    __cu(s->cu_dl_func->cu_texref_set_filtermode(s->cu_tex_uchar2, CU_TR_FILTER_MODE_LINEAR)); | |
+    __cu(s->cu_dl_func->cu_texref_set_filtermode(s->cu_tex_uchar4, CU_TR_FILTER_MODE_LINEAR)); | |
+ | |
+    return 0; | |
+} | |
+/* Copy each plane of the host frame src into the device surface dst; returns 0, or -1 for an unsupported format (CUDA errors are left to __cu()). */ | |
+static int copy_from_avframe(NVResizeContext *s, AVFrame* src, cu_tex* dst) | |
+{ | |
+ av_assert0(src->width == dst->w && src->height == dst->h); | |
+ | |
+ switch (src->format) { | |
+ case AV_PIX_FMT_YUV420P: | |
+ // Y plane: full size, at the start of the surface | |
+ __cu(cuMemCpy2d(src->data[0], (CUdeviceptr)NULL, src->linesize[0], NULL, dst->dptr, dst->pitch, src->width, src->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+ // U plane: starts after the dst->h luma rows, half width and height, stored at half pitch | |
+ __cu(cuMemCpy2d(src->data[1], (CUdeviceptr)NULL, src->linesize[1], NULL, dst->dptr + dst->pitch*dst->h, dst->pitch / 2, src->width / 2, src->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+ // V plane: starts at 5/4 of the luma size, i.e. right after the U plane | |
+ __cu(cuMemCpy2d(src->data[2], (CUdeviceptr)NULL, src->linesize[2], NULL, dst->dptr + dst->pitch*dst->h * 5 / 4, dst->pitch / 2, src->width / 2, src->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+ | |
+ break; | |
+ | |
+ case AV_PIX_FMT_YUV444P: | |
+ // Y plane: first of three full-size planes | |
+ __cu(cuMemCpy2d(src->data[0], (CUdeviceptr)NULL, src->linesize[0], NULL, dst->dptr, dst->pitch, src->width, src->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+ // U plane: second full-size plane, dst->h rows further | |
+ __cu(cuMemCpy2d(src->data[1], (CUdeviceptr)NULL, src->linesize[1], NULL, dst->dptr + dst->pitch*dst->h, dst->pitch, src->width, src->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+ // V plane: third full-size plane, 2 * dst->h rows from the start | |
+ __cu(cuMemCpy2d(src->data[2], (CUdeviceptr)NULL, src->linesize[2], NULL, dst->dptr + dst->pitch*dst->h * 2, dst->pitch, src->width, src->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+ break; | |
+ | |
+ case AV_PIX_FMT_NV12: | |
+ // Y plane: full size, at the start of the surface | |
+ __cu(cuMemCpy2d(src->data[0], (CUdeviceptr)NULL, src->linesize[0], NULL, dst->dptr, dst->pitch, src->width, src->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+ // interleaved UV plane: full width, half height, right after the luma rows | |
+ __cu(cuMemCpy2d(src->data[1], (CUdeviceptr)NULL, src->linesize[1], NULL, dst->dptr + dst->pitch*dst->h, dst->pitch, src->width, src->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+ break; | |
+ | |
+ case AV_PIX_FMT_ARGB: | |
+ case AV_PIX_FMT_RGBA: | |
+ case AV_PIX_FMT_ABGR: | |
+ case AV_PIX_FMT_BGRA: | |
+ // single packed plane, 4 bytes per pixel, hence the width of src->width * 4 | |
+ __cu(cuMemCpy2d(src->data[0], (CUdeviceptr)NULL, src->linesize[0], NULL, dst->dptr, dst->pitch, src->width * 4, src->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+ | |
+ break; | |
+ | |
+ default: | |
+ av_log(NULL, AV_LOG_FATAL, "Unsupported input format: %s!\n", av_get_pix_fmt_name(src->format)); | |
+ return -1; | |
+ } | |
+ return 0; | |
+} | |
+/* Copy each plane of the device surface src into the host frame dst; returns 0, or -1 for an unsupported format (CUDA errors are left to __cu()). */ | |
+static int copy_to_avframe(NVResizeContext* s, cu_tex* src, AVFrame* dst) | |
+{ | |
+ //av_assert0(src->w == dst->width && src->height == dst->height); | |
+ // NOTE(review): with the check above disabled, plane offsets use src->h while the copied extent uses dst->width/height | |
+ switch (dst->format) { | |
+ case AV_PIX_FMT_YUV420P: | |
+ // Y plane: full size, at the start of the surface | |
+ __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, dst->data[0], (CUdeviceptr)NULL, dst->linesize[0], dst->width, dst->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST)); | |
+ // U plane: starts after the src->h luma rows, half width and height, read at half pitch | |
+ __cu(cuMemCpy2d(NULL, src->dptr + src->pitch*src->h, src->pitch / 2, dst->data[1], (CUdeviceptr)NULL, dst->linesize[1], dst->width / 2, dst->height / 2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST)); | |
+ // V plane: starts at 5/4 of the luma size, i.e. right after the U plane | |
+ __cu(cuMemCpy2d(NULL, src->dptr + src->pitch*src->h * 5 / 4, src->pitch / 2, dst->data[2], (CUdeviceptr)NULL, dst->linesize[2], dst->width / 2, dst->height / 2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST)); | |
+ break; | |
+ | |
+ case AV_PIX_FMT_YUV444P: | |
+ // Y plane: first of three full-size planes | |
+ __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, dst->data[0], (CUdeviceptr)NULL, dst->linesize[0], dst->width, dst->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST)); | |
+ // U plane: second full-size plane, src->h rows further | |
+ __cu(cuMemCpy2d(NULL, src->dptr + src->pitch*src->h, src->pitch, dst->data[1], (CUdeviceptr)NULL, dst->linesize[1], dst->width, dst->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST)); | |
+ // V plane: third full-size plane, 2 * src->h rows from the start | |
+ __cu(cuMemCpy2d(NULL, src->dptr + src->pitch*src->h * 2, src->pitch, dst->data[2], (CUdeviceptr)NULL, dst->linesize[2], dst->width, dst->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST)); | |
+ | |
+ break; | |
+ | |
+ case AV_PIX_FMT_NV12: | |
+ // Y plane: full size, at the start of the surface | |
+ __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, dst->data[0], (CUdeviceptr)NULL, dst->linesize[0], dst->width, dst->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST)); | |
+ // interleaved UV plane: full width, half height, right after the luma rows | |
+ __cu(cuMemCpy2d(NULL, src->dptr + src->pitch*src->h, src->pitch, dst->data[1], (CUdeviceptr)NULL, dst->linesize[1], dst->width, dst->height / 2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST)); | |
+ break; | |
+ | |
+ case AV_PIX_FMT_ARGB: | |
+ case AV_PIX_FMT_RGBA: | |
+ case AV_PIX_FMT_ABGR: | |
+ case AV_PIX_FMT_BGRA: | |
+ // single packed plane, 4 bytes per pixel, hence the width of dst->width * 4 | |
+ __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, dst->data[0], (CUdeviceptr)NULL, dst->linesize[0], dst->width * 4, dst->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST)); | |
+ | |
+ break; | |
+ | |
+ default: | |
+ av_log(NULL, AV_LOG_FATAL, "Unsupported output format: %s!\n", av_get_pix_fmt_name(dst->format)); | |
+ return -1; | |
+ } | |
+ return 0; | |
+} | |
+/* Bind the source surface to the texture reference tex and launch func over the destination; returns 0 when it reaches the end (CUDA errors are left to __cu()). */ | |
+static int call_resize_kernel(CudaDynLoadFunctions* dl_func, CUfunction func, CUtexref tex, int channels, | |
+ CUdeviceptr src_dptr, int src_width, int src_height, int src_pitch, | |
+ CUdeviceptr dst_dptr, int dst_width, int dst_height, int dst_pitch) | |
+{ | |
+ void *args_uchar[] = { &dst_dptr, &dst_width, &dst_height, &dst_pitch, &src_width, &src_height }; /* kernel arguments, presumably in the kernel's parameter order -- the kernel source is not in this file */ | |
+ CUDA_ARRAY_DESCRIPTOR desc; | |
+ desc.Width = src_width; | |
+ desc.Height = src_height; | |
+ desc.NumChannels = channels; | |
+ desc.Format = CU_AD_FORMAT_UNSIGNED_INT8; | |
+ __cu(dl_func->cu_texref_set_address_2D(tex, &desc, src_dptr, src_pitch)); | |
+ // BLOCKX x BLOCKY threads per block, block counts rounded up to cover dst_width x dst_height -- assuming cu_launch_kernel mirrors cuLaunchKernel | |
+ __cu(dl_func->cu_launch_kernel(func, DIV_UP(dst_width, BLOCKX), DIV_UP(dst_height, BLOCKY), 1, | |
+ BLOCKX, BLOCKY, 1, 0, NULL, args_uchar, NULL)); | |
+ | |
+ return 0; | |
+} | |
+ | |
+/* | |
+ * Resize (or plainly copy) one frame that lives in device memory. | |
+ * | |
+ * When source and destination have the same dimensions the frame is copied | |
+ * with cuMemCpy2d(); otherwise one resize kernel per plane is launched through | |
+ * call_resize_kernel(). All planes of a frame are addressed as offsets from the | |
+ * single base pointer stored in the cu_tex. | |
+ * | |
+ * s       filter context providing the loaded kernels and texture references | |
+ * src     source surface | |
+ * dst     destination surface | |
+ * format  AVPixelFormat shared by source and destination | |
+ * | |
+ * Returns 0 on success, -1 for an unsupported pixel format, or the non-zero | |
+ * status of the first call_resize_kernel() invocation that failed (previously | |
+ * those statuses were silently discarded). | |
+ */ | |
+static int do_cuda_resize(NVResizeContext *s, cu_tex* src, cu_tex* dst, int format) | |
+{ | |
+    int ret = 0; | |
+ | |
+    switch (format) { | |
+    case AV_PIX_FMT_YUV420P: | |
+        if (src->w == dst->w && src->h == dst->h && src->pitch == dst->pitch) { | |
+            /* Identical layout: copy full-pitch rows covering luma and both chroma planes. */ | |
+            __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, NULL, dst->dptr, dst->pitch, src->pitch, src->h*3/2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE)); | |
+        } | |
+        else { | |
+            /* First plane at the base pointer, full size. */ | |
+            ret = call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1, | |
+                                     src->dptr, src->w, src->h, src->pitch, | |
+                                     dst->dptr, dst->w, dst->h, dst->pitch); | |
+ | |
+            /* Second plane at pitch*h, half width/height/pitch. */ | |
+            if (!ret) | |
+                ret = call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1, | |
+                                         src->dptr+src->pitch*src->h, src->w/2, src->h/2, src->pitch/2, | |
+                                         dst->dptr+dst->pitch*dst->h, dst->w/2, dst->h/2, dst->pitch/2); | |
+ | |
+            /* Third plane at pitch*h*5/4, half width/height/pitch. */ | |
+            if (!ret) | |
+                ret = call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1, | |
+                                         src->dptr+src->pitch*src->h*5/4, src->w/2, src->h/2, src->pitch/2, | |
+                                         dst->dptr+dst->pitch*dst->h*5/4, dst->w/2, dst->h/2, dst->pitch/2); | |
+        } | |
+ | |
+        break; | |
+ | |
+    case AV_PIX_FMT_YUV444P: | |
+        if (src->w == dst->w && src->h == dst->h) { | |
+            /* Three full-size planes stacked vertically. */ | |
+            __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, NULL, dst->dptr, dst->pitch, src->w, src->h*3, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE)); | |
+        } | |
+        else { | |
+            ret = call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1, | |
+                                     src->dptr, src->w, src->h, src->pitch, | |
+                                     dst->dptr, dst->w, dst->h, dst->pitch); | |
+ | |
+            if (!ret) | |
+                ret = call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1, | |
+                                         src->dptr+src->pitch*src->h, src->w, src->h, src->pitch, | |
+                                         dst->dptr+dst->pitch*dst->h, dst->w, dst->h, dst->pitch); | |
+ | |
+            if (!ret) | |
+                ret = call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1, | |
+                                         src->dptr+src->pitch*src->h*2, src->w, src->h, src->pitch, | |
+                                         dst->dptr+dst->pitch*dst->h*2, dst->w, dst->h, dst->pitch); | |
+        } | |
+ | |
+        break; | |
+ | |
+    case AV_PIX_FMT_NV12: | |
+        if (src->w == dst->w && src->h == dst->h) { | |
+            __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, NULL, dst->dptr, dst->pitch, src->w, src->h*3/2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE)); | |
+        } | |
+        else { | |
+            /* First plane: single-channel kernel. */ | |
+            ret = call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1, | |
+                                     src->dptr, src->w, src->h, src->pitch, | |
+                                     dst->dptr, dst->w, dst->h, dst->pitch); | |
+ | |
+            /* | |
+             * Second plane: two-channel kernel on half-size texels. The source | |
+             * pitch is passed unchanged while the destination pitch is halved, | |
+             * i.e. expressed in two-byte elements as the uchar2 kernel indexes it. | |
+             */ | |
+            if (!ret) | |
+                ret = call_resize_kernel(s->cu_dl_func, s->cu_func_uchar2, s->cu_tex_uchar2, 2, | |
+                                         src->dptr+src->pitch*src->h, src->w/2, src->h/2, src->pitch, | |
+                                         dst->dptr+dst->pitch*dst->h, dst->w/2, dst->h/2, dst->pitch/2); | |
+        } | |
+ | |
+        break; | |
+ | |
+    case AV_PIX_FMT_ARGB: | |
+    case AV_PIX_FMT_RGBA: | |
+    case AV_PIX_FMT_ABGR: | |
+    case AV_PIX_FMT_BGRA: | |
+        if (src->w == dst->w && src->h == dst->h) { | |
+            /* Packed four bytes per pixel. */ | |
+            __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, NULL, dst->dptr, dst->pitch, src->w*4, src->h, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE)); | |
+        } | |
+        else { | |
+            /* Destination pitch is divided by four: the uchar4 kernel indexes in four-byte elements. */ | |
+            ret = call_resize_kernel(s->cu_dl_func, s->cu_func_uchar4, s->cu_tex_uchar4, 4, | |
+                                     src->dptr, src->w, src->h, src->pitch, | |
+                                     dst->dptr, dst->w, dst->h, dst->pitch/4); | |
+        } | |
+ | |
+        break; | |
+ | |
+    default: | |
+        av_log(NULL, AV_LOG_FATAL, "Unsupported input format: %s!\n", av_get_pix_fmt_name(format)); | |
+        return -1; | |
+    } | |
+ | |
+    return ret; | |
+} | |
+ | |
+/* | |
+ * Choose the surface that should be used as the source when producing `target`. | |
+ * | |
+ * s       filter context owning the input surface and the output surface array | |
+ * source  source used for the previous output, or NULL for the first output | |
+ * target  output surface about to be produced (an element of s->outtex) | |
+ * | |
+ * The first output always reads from the input surface. Afterwards the previous | |
+ * source is kept while it is less than four times as wide as the target; | |
+ * otherwise the earlier outputs are searched, nearest first, for one that | |
+ * satisfies the same width limit. If none does, a warning is logged and the | |
+ * immediately preceding level is returned. | |
+ */ | |
+static cu_tex* find_resize_src(NVResizeContext* s, cu_tex* source, cu_tex* target) | |
+{ | |
+    cu_tex *fallback; | |
+    int target_idx; | |
+    int k; | |
+ | |
+    if (!source) | |
+        return &s->intex; | |
+ | |
+    if (source->w < target->w * 4) | |
+        return source; | |
+ | |
+    /* Index of target inside s->outtex; only earlier entries are candidates. */ | |
+    target_idx = target - s->outtex; | |
+    k = target_idx; | |
+    while (--k >= 0) { | |
+        if (s->outtex[k].w < target->w * 4) | |
+            return &s->outtex[k]; | |
+    } | |
+ | |
+    fallback = target_idx ? &s->outtex[target_idx - 1] : source; | |
+    av_log(NULL, AV_LOG_WARNING, "Output resolution %dx%d differs too much from the previous level %dx%d, " | |
+           "might cause artificial\n", target->w, target->h, fallback->w, fallback->h); | |
+ | |
+    return fallback; | |
+} | |
+ | |
+/* | |
+ * Input-pad callback: make the incoming frame available in device memory, | |
+ * produce one resized frame per active output and send each downstream. | |
+ * | |
+ * inlink  link the frame arrived on; inlink->dst is this filter instance | |
+ * in      input frame; it is released here before returning | |
+ * | |
+ * Returns 0 on success or a negative error code. | |
+ * | |
+ * Changes relative to the previous revision: | |
+ *  - the staging buffer is now reallocated when the frame size changes (the | |
+ *    old test compared the width against the stored height and only fired | |
+ *    while no buffer existed, passing a NULL pointer to cu_mem_free); | |
+ *  - `in` is released on the unsupported-format path; | |
+ *  - failures of do_cuda_resize(), init_nvinfo() and ff_filter_frame() are | |
+ *    reported to the caller instead of being dropped. | |
+ * | |
+ * NOTE(review): statements wrapped in __cu() may leave the function early, | |
+ * depending on how that macro is defined elsewhere; `in` is not released on | |
+ * such paths. | |
+ */ | |
+static int filter_frame(AVFilterLink *inlink, AVFrame *in) | |
+{ | |
+    AVFilterContext *ctx = inlink->dst; | |
+    NVResizeContext *s = ctx->priv; | |
+    cu_tex* resize_src = NULL; | |
+    ffnvinfo* info; | |
+    int ret = 0; | |
+    int i; | |
+ | |
+    // copy input to gpu | |
+    if (in->opaque && check_nvinfo(in->opaque) && ((ffnvinfo*)(in->opaque))->dptr[0]) { | |
+        /* The frame carries a device pointer: use it directly as the input surface. */ | |
+        ffnvinfo* in_info = (ffnvinfo*)in->opaque; | |
+        s->intex.dptr  = in_info->dptr[0]; | |
+        s->intex.pitch = in_info->linesize[0]; | |
+        s->intex.w     = in->width; | |
+        s->intex.h     = in->height; | |
+    } | |
+    else { | |
+        /* | |
+         * Host frame: stage it in a device buffer sized for the frame. | |
+         * NOTE(review): this assumes s->intex.dptr was allocated below and not | |
+         * taken from a frame in the branch above - confirm the two input kinds | |
+         * are never mixed within one stream. | |
+         */ | |
+        if (in->width != s->intex.w || in->height != s->intex.h) { | |
+            if (s->intex.dptr) { | |
+                __cu(s->cu_dl_func->cu_mem_free(s->intex.dptr)); | |
+            } | |
+            s->intex.w    = in->width; | |
+            s->intex.h    = in->height; | |
+            s->intex.dptr = (CUdeviceptr)NULL; | |
+        } | |
+        if (!s->intex.dptr) { | |
+            switch (in->format) { | |
+            case AV_PIX_FMT_YUV420P: | |
+            case AV_PIX_FMT_NV12: | |
+                __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->intex.dptr, &s->intex.pitch, s->intex.w, s->intex.h*3/2, 16)); | |
+                break; | |
+            case AV_PIX_FMT_YUV444P: | |
+                __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->intex.dptr, &s->intex.pitch, s->intex.w, s->intex.h*3, 16)); | |
+                break; | |
+            case AV_PIX_FMT_ARGB: | |
+            case AV_PIX_FMT_RGBA: | |
+            case AV_PIX_FMT_ABGR: | |
+            case AV_PIX_FMT_BGRA: | |
+                __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->intex.dptr, &s->intex.pitch, s->intex.w*4, s->intex.h, 16)); | |
+                break; | |
+            default: | |
+                av_log(NULL, AV_LOG_FATAL, "Unsupported input format: %s!\n", av_get_pix_fmt_name(in->format)); | |
+                av_frame_free(&in); | |
+                return -1; | |
+            } | |
+        } | |
+        copy_from_avframe(s, in, &s->intex); | |
+    } | |
+ | |
+    for (i = 0; i < ctx->nb_outputs; i++) { | |
+        AVFrame *out; | |
+        if (ctx->outputs[i]->status) | |
+            continue; | |
+ | |
+        out = ff_get_video_buffer(ctx->outputs[i], ctx->outputs[i]->w, ctx->outputs[i]->h); | |
+        if (!out) { | |
+            ret = AVERROR(ENOMEM); | |
+            break; | |
+        } | |
+        av_frame_copy_props(out, in); | |
+ | |
+        // do works here | |
+        resize_src = find_resize_src(s, resize_src, &s->outtex[i]); | |
+        ret = do_cuda_resize(s, resize_src, &s->outtex[i], in->format); | |
+        if (ret < 0) { | |
+            av_frame_free(&out); | |
+            break; | |
+        } | |
+ | |
+        /* | |
+         * Describe the device-side planes of this output for downstream users. | |
+         * NOTE(review): who releases `info` after it is attached to | |
+         * out->opaque is not visible from this function. | |
+         */ | |
+        info = init_nvinfo(); | |
+        if (!info) { | |
+            av_frame_free(&out); | |
+            ret = AVERROR(ENOMEM); | |
+            break; | |
+        } | |
+        switch (out->format) { | |
+        case AV_PIX_FMT_YUV444P: | |
+            info->dptr[0] = s->outtex[i].dptr; | |
+            info->dptr[1] = s->outtex[i].dptr + s->outtex[i].pitch*s->outtex[i].h; | |
+            info->dptr[2] = s->outtex[i].dptr + s->outtex[i].pitch*s->outtex[i].h*2; | |
+            info->linesize[0] = info->linesize[1] = info->linesize[2] = s->outtex[i].pitch; | |
+            break; | |
+ | |
+        case AV_PIX_FMT_YUV420P: | |
+            info->dptr[0] = s->outtex[i].dptr; | |
+            info->dptr[1] = s->outtex[i].dptr + s->outtex[i].pitch*s->outtex[i].h; | |
+            info->dptr[2] = s->outtex[i].dptr + s->outtex[i].pitch*s->outtex[i].h*5/4; | |
+            info->linesize[0] = s->outtex[i].pitch; | |
+            info->linesize[1] = info->linesize[2] = s->outtex[i].pitch/2; | |
+            break; | |
+ | |
+        case AV_PIX_FMT_NV12: | |
+            info->dptr[0] = s->outtex[i].dptr; | |
+            info->dptr[1] = s->outtex[i].dptr + s->outtex[i].pitch*s->outtex[i].h; | |
+            info->linesize[0] = info->linesize[1] = s->outtex[i].pitch; | |
+            break; | |
+ | |
+        case AV_PIX_FMT_ARGB: | |
+        case AV_PIX_FMT_RGBA: | |
+        case AV_PIX_FMT_ABGR: | |
+        case AV_PIX_FMT_BGRA: | |
+            info->dptr[0] = s->outtex[i].dptr; | |
+            info->linesize[0] = s->outtex[i].pitch; | |
+            break; | |
+ | |
+        default: | |
+            break; | |
+        } | |
+ | |
+        out->opaque = (void*)info; | |
+        /* Optionally copy the result back into the host-side frame buffer. */ | |
+        if (s->readback_FB) | |
+            copy_to_avframe(s, &s->outtex[i], out); | |
+ | |
+        /* `out` is handed over to ff_filter_frame() here. */ | |
+        ret = ff_filter_frame(ctx->outputs[i], out); | |
+        if (ret < 0) | |
+            break; | |
+    } | |
+ | |
+    av_frame_free(&in); | |
+    return ret < 0 ? ret : 0; | |
+} | |
+ | |
+ | |
+/* | |
+ * Filter teardown: release each output pad name and each output device | |
+ * buffer, then release the CUDA context if one is held. | |
+ * | |
+ * NOTE(review): s->intex.dptr is not released here - confirm whether it can | |
+ * be owned by this filter at this point. | |
+ */ | |
+static av_cold void uninit(AVFilterContext *ctx) | |
+{ | |
+    NVResizeContext *s = ctx->priv; | |
+    int idx = 0; | |
+ | |
+    while (idx < s->nb_outputs) { | |
+        cu_tex *tex = &s->outtex[idx]; | |
+ | |
+        av_freep(&ctx->output_pads[idx].name); | |
+        if (tex->dptr) | |
+            s->cu_dl_func->cu_mem_free(tex->dptr); | |
+        idx++; | |
+    } | |
+ | |
+    if (s->cu_ctx) | |
+        release_cuda_context(&s->cu_ctx, s->gpu); | |
+ | |
+    av_log(ctx, AV_LOG_INFO, "nvresize::uninit\n"); | |
+} | |
+ | |
+/* Input pad table: a single video pad named "default" whose frames are handled by filter_frame(). */ | |
+static const AVFilterPad nvresize_inputs[] = { | |
+ { | |
+ .name = "default", | |
+ .type = AVMEDIA_TYPE_VIDEO, | |
+ .filter_frame = filter_frame, | |
+ }, | |
+ /* terminating entry */ | |
+ { NULL } | |
+}; | |
+ | |
+/* | |
+ * Filter definition for "nvresize". No static output pads are listed and the | |
+ * filter is flagged AVFILTER_FLAG_DYNAMIC_OUTPUTS; the output pads are | |
+ * presumably created at run time by init() - confirm against init(), which is | |
+ * outside this excerpt. | |
+ */ | |
+AVFilter ff_vf_nvresize = { | |
+ .name = "nvresize", | |
+ .description = NULL_IF_CONFIG_SMALL("GPU accelerated video resizer."), | |
+ .inputs = nvresize_inputs, | |
+ .outputs = NULL, | |
+ .flags = AVFILTER_FLAG_DYNAMIC_OUTPUTS, | |
+ .priv_class = &nvresize_class, | |
+ .init = init, | |
+ .uninit = uninit, | |
+ .query_formats = query_formats, | |
+ .priv_size = sizeof(NVResizeContext), | |
+}; | |
diff -ruN ffmpeg-orig/libavfilter/vf_nvresize_ptx.c ffmpeg/libavfilter/vf_nvresize_ptx.c | |
--- ffmpeg-orig/libavfilter/vf_nvresize_ptx.c 1970-01-01 03:00:00.000000000 +0300 | |
+++ ffmpeg/libavfilter/vf_nvresize_ptx.c 2016-03-04 00:18:17.589433917 +0200 | |
@@ -0,0 +1,659 @@ | |
+/* Pick the PTX image whose .address_size matches the host pointer width. */ | |
+#if _WIN32 || _WIN64 | |
+#if _WIN64 | |
+#define ENVIRONMENT64 | |
+#else | |
+#define ENVIRONMENT32 | |
+#endif | |
+/* Elsewhere: __LP64__ marks any LP64 target (e.g. aarch64), so 64-bit hosts */ | |
+/* other than x86-64/ppc64 no longer fall through to the 32-bit PTX. */ | |
+#elif __LP64__ || __x86_64__ || __ppc64__ | |
+#define ENVIRONMENT64 | |
+/* Any other GCC-compatible target keeps the original 32-bit default. */ | |
+#elif __GNUC__ | |
+#define ENVIRONMENT32 | |
+#endif | |
+#ifdef ENVIRONMENT32 | |
+const char resize_ptx[] = \ | |
+ "//\n" | |
+ "// Generated by NVIDIA NVVM Compiler\n" | |
+ "//\n" | |
+ "// Compiler Build ID: CL-19324607\n" | |
+ "// Cuda compilation tools, release 7.0, V7.0.27\n" | |
+ "// Based on LLVM 3.4svn\n" | |
+ "//\n" | |
+ "\n" | |
+ ".version 4.2\n" | |
+ ".target sm_30\n" | |
+ ".address_size 32\n" | |
+ "\n" | |
+ "// .globl Subsample_Bilinear_uchar\n" | |
+ ".global .texref uchar_tex;\n" | |
+ ".global .texref uchar2_tex;\n" | |
+ ".global .texref uchar4_tex;\n" | |
+ "\n" | |
+ ".visible .entry Subsample_Bilinear_uchar(\n" | |
+ ".param .u32 Subsample_Bilinear_uchar_param_0,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar_param_1,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar_param_2,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar_param_3,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar_param_4,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar_param_5\n" | |
+ ")\n" | |
+ "{\n" | |
+ ".reg .pred %p<4>;\n" | |
+ ".reg .f32 %f<27>;\n" | |
+ ".reg .s32 %r<43>;\n" | |
+ ".reg .s64 %rd<2>;\n" | |
+ "\n" | |
+ "\n" | |
+ "ld.param.u32 %r3, [Subsample_Bilinear_uchar_param_0];\n" | |
+ "ld.param.u32 %r4, [Subsample_Bilinear_uchar_param_1];\n" | |
+ "ld.param.u32 %r5, [Subsample_Bilinear_uchar_param_2];\n" | |
+ "ld.param.u32 %r6, [Subsample_Bilinear_uchar_param_3];\n" | |
+ "ld.param.u32 %r7, [Subsample_Bilinear_uchar_param_4];\n" | |
+ "ld.param.u32 %r8, [Subsample_Bilinear_uchar_param_5];\n" | |
+ "mov.u32 %r9, %ctaid.x;\n" | |
+ "mov.u32 %r10, %ntid.x;\n" | |
+ "mov.u32 %r11, %tid.x;\n" | |
+ "mad.lo.s32 %r1, %r10, %r9, %r11;\n" | |
+ "mov.u32 %r12, %ntid.y;\n" | |
+ "mov.u32 %r13, %ctaid.y;\n" | |
+ "mov.u32 %r14, %tid.y;\n" | |
+ "mad.lo.s32 %r2, %r12, %r13, %r14;\n" | |
+ "setp.lt.s32 %p1, %r2, %r5;\n" | |
+ "setp.lt.s32 %p2, %r1, %r4;\n" | |
+ "and.pred %p3, %p1, %p2;\n" | |
+ "@!%p3 bra BB0_2;\n" | |
+ "bra.uni BB0_1;\n" | |
+ "\n" | |
+ "BB0_1:\n" | |
+ "cvta.to.global.u32 %r15, %r3;\n" | |
+ "cvt.rn.f32.s32 %f1, %r4;\n" | |
+ "cvt.rn.f32.s32 %f2, %r7;\n" | |
+ "div.rn.f32 %f3, %f2, %f1;\n" | |
+ "cvt.rn.f32.s32 %f4, %r5;\n" | |
+ "cvt.rn.f32.s32 %f5, %r8;\n" | |
+ "div.rn.f32 %f6, %f5, %f4;\n" | |
+ "cvt.rn.f32.s32 %f7, %r1;\n" | |
+ "add.f32 %f8, %f7, 0f3F000000;\n" | |
+ "mul.f32 %f9, %f8, %f3;\n" | |
+ "cvt.rn.f32.s32 %f10, %r2;\n" | |
+ "add.f32 %f11, %f10, 0f3F000000;\n" | |
+ "mul.f32 %f12, %f11, %f6;\n" | |
+ "add.f32 %f13, %f3, 0fBF800000;\n" | |
+ "mul.f32 %f14, %f13, 0f3F000000;\n" | |
+ "cvt.sat.f32.f32 %f15, %f14;\n" | |
+ "add.f32 %f16, %f6, 0fBF800000;\n" | |
+ "mul.f32 %f17, %f16, 0f3F000000;\n" | |
+ "cvt.sat.f32.f32 %f18, %f17;\n" | |
+ "add.f32 %f19, %f15, 0f3F000000;\n" | |
+ "div.rn.f32 %f20, %f15, %f19;\n" | |
+ "add.f32 %f21, %f18, 0f3F000000;\n" | |
+ "div.rn.f32 %f22, %f18, %f21;\n" | |
+ "sub.f32 %f23, %f9, %f20;\n" | |
+ "sub.f32 %f24, %f12, %f22;\n" | |
+ "tex.2d.v4.u32.f32 {%r16, %r17, %r18, %r19}, [uchar_tex, {%f23, %f24}];\n" | |
+ "and.b32 %r20, %r16, 255;\n" | |
+ "add.f32 %f25, %f9, %f20;\n" | |
+ "tex.2d.v4.u32.f32 {%r21, %r22, %r23, %r24}, [uchar_tex, {%f25, %f24}];\n" | |
+ "and.b32 %r25, %r21, 255;\n" | |
+ "add.f32 %f26, %f12, %f22;\n" | |
+ "tex.2d.v4.u32.f32 {%r26, %r27, %r28, %r29}, [uchar_tex, {%f23, %f26}];\n" | |
+ "and.b32 %r30, %r26, 255;\n" | |
+ "tex.2d.v4.u32.f32 {%r31, %r32, %r33, %r34}, [uchar_tex, {%f25, %f26}];\n" | |
+ "and.b32 %r35, %r31, 255;\n" | |
+ "add.s32 %r36, %r20, %r25;\n" | |
+ "add.s32 %r37, %r36, %r30;\n" | |
+ "add.s32 %r38, %r37, %r35;\n" | |
+ "add.s32 %r39, %r38, 2;\n" | |
+ "shr.u32 %r40, %r39, 2;\n" | |
+ "mad.lo.s32 %r41, %r2, %r6, %r1;\n" | |
+ "add.s32 %r42, %r15, %r41;\n" | |
+ "st.global.u8 [%r42], %r40;\n" | |
+ "\n" | |
+ "BB0_2:\n" | |
+ "ret;\n" | |
+ "}\n" | |
+ "\n" | |
+ "// .globl Subsample_Bilinear_uchar2\n" | |
+ ".visible .entry Subsample_Bilinear_uchar2(\n" | |
+ ".param .u32 Subsample_Bilinear_uchar2_param_0,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar2_param_1,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar2_param_2,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar2_param_3,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar2_param_4,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar2_param_5\n" | |
+ ")\n" | |
+ "{\n" | |
+ ".reg .pred %p<4>;\n" | |
+ ".reg .s16 %rs<3>;\n" | |
+ ".reg .f32 %f<27>;\n" | |
+ ".reg .s32 %r<53>;\n" | |
+ ".reg .s64 %rd<2>;\n" | |
+ "\n" | |
+ "\n" | |
+ "ld.param.u32 %r3, [Subsample_Bilinear_uchar2_param_0];\n" | |
+ "ld.param.u32 %r4, [Subsample_Bilinear_uchar2_param_1];\n" | |
+ "ld.param.u32 %r5, [Subsample_Bilinear_uchar2_param_2];\n" | |
+ "ld.param.u32 %r6, [Subsample_Bilinear_uchar2_param_3];\n" | |
+ "ld.param.u32 %r7, [Subsample_Bilinear_uchar2_param_4];\n" | |
+ "ld.param.u32 %r8, [Subsample_Bilinear_uchar2_param_5];\n" | |
+ "mov.u32 %r9, %ctaid.x;\n" | |
+ "mov.u32 %r10, %ntid.x;\n" | |
+ "mov.u32 %r11, %tid.x;\n" | |
+ "mad.lo.s32 %r1, %r10, %r9, %r11;\n" | |
+ "mov.u32 %r12, %ntid.y;\n" | |
+ "mov.u32 %r13, %ctaid.y;\n" | |
+ "mov.u32 %r14, %tid.y;\n" | |
+ "mad.lo.s32 %r2, %r12, %r13, %r14;\n" | |
+ "setp.lt.s32 %p1, %r2, %r5;\n" | |
+ "setp.lt.s32 %p2, %r1, %r4;\n" | |
+ "and.pred %p3, %p1, %p2;\n" | |
+ "@!%p3 bra BB1_2;\n" | |
+ "bra.uni BB1_1;\n" | |
+ "\n" | |
+ "BB1_1:\n" | |
+ "cvta.to.global.u32 %r15, %r3;\n" | |
+ "cvt.rn.f32.s32 %f1, %r4;\n" | |
+ "cvt.rn.f32.s32 %f2, %r7;\n" | |
+ "div.rn.f32 %f3, %f2, %f1;\n" | |
+ "cvt.rn.f32.s32 %f4, %r5;\n" | |
+ "cvt.rn.f32.s32 %f5, %r8;\n" | |
+ "div.rn.f32 %f6, %f5, %f4;\n" | |
+ "cvt.rn.f32.s32 %f7, %r1;\n" | |
+ "add.f32 %f8, %f7, 0f3F000000;\n" | |
+ "mul.f32 %f9, %f8, %f3;\n" | |
+ "cvt.rn.f32.s32 %f10, %r2;\n" | |
+ "add.f32 %f11, %f10, 0f3F000000;\n" | |
+ "mul.f32 %f12, %f11, %f6;\n" | |
+ "add.f32 %f13, %f3, 0fBF800000;\n" | |
+ "mul.f32 %f14, %f13, 0f3F000000;\n" | |
+ "cvt.sat.f32.f32 %f15, %f14;\n" | |
+ "add.f32 %f16, %f6, 0fBF800000;\n" | |
+ "mul.f32 %f17, %f16, 0f3F000000;\n" | |
+ "cvt.sat.f32.f32 %f18, %f17;\n" | |
+ "add.f32 %f19, %f15, 0f3F000000;\n" | |
+ "div.rn.f32 %f20, %f15, %f19;\n" | |
+ "add.f32 %f21, %f18, 0f3F000000;\n" | |
+ "div.rn.f32 %f22, %f18, %f21;\n" | |
+ "sub.f32 %f23, %f9, %f20;\n" | |
+ "sub.f32 %f24, %f12, %f22;\n" | |
+ "tex.2d.v4.u32.f32 {%r16, %r17, %r18, %r19}, [uchar2_tex, {%f23, %f24}];\n" | |
+ "add.f32 %f25, %f9, %f20;\n" | |
+ "tex.2d.v4.u32.f32 {%r20, %r21, %r22, %r23}, [uchar2_tex, {%f25, %f24}];\n" | |
+ "add.f32 %f26, %f12, %f22;\n" | |
+ "tex.2d.v4.u32.f32 {%r24, %r25, %r26, %r27}, [uchar2_tex, {%f23, %f26}];\n" | |
+ "tex.2d.v4.u32.f32 {%r28, %r29, %r30, %r31}, [uchar2_tex, {%f25, %f26}];\n" | |
+ "and.b32 %r32, %r16, 255;\n" | |
+ "and.b32 %r33, %r20, 255;\n" | |
+ "and.b32 %r34, %r24, 255;\n" | |
+ "and.b32 %r35, %r28, 255;\n" | |
+ "add.s32 %r36, %r32, %r33;\n" | |
+ "add.s32 %r37, %r36, %r34;\n" | |
+ "add.s32 %r38, %r37, %r35;\n" | |
+ "add.s32 %r39, %r38, 2;\n" | |
+ "shr.u32 %r40, %r39, 2;\n" | |
+ "and.b32 %r41, %r17, 255;\n" | |
+ "and.b32 %r42, %r21, 255;\n" | |
+ "and.b32 %r43, %r25, 255;\n" | |
+ "and.b32 %r44, %r29, 255;\n" | |
+ "add.s32 %r45, %r41, %r42;\n" | |
+ "add.s32 %r46, %r45, %r43;\n" | |
+ "add.s32 %r47, %r46, %r44;\n" | |
+ "add.s32 %r48, %r47, 2;\n" | |
+ "shr.u32 %r49, %r48, 2;\n" | |
+ "mad.lo.s32 %r50, %r2, %r6, %r1;\n" | |
+ "shl.b32 %r51, %r50, 1;\n" | |
+ "add.s32 %r52, %r15, %r51;\n" | |
+ "cvt.u16.u32 %rs1, %r49;\n" | |
+ "cvt.u16.u32 %rs2, %r40;\n" | |
+ "st.global.v2.u8 [%r52], {%rs2, %rs1};\n" | |
+ "\n" | |
+ "BB1_2:\n" | |
+ "ret;\n" | |
+ "}\n" | |
+ "\n" | |
+ "// .globl Subsample_Bilinear_uchar4\n" | |
+ ".visible .entry Subsample_Bilinear_uchar4(\n" | |
+ ".param .u32 Subsample_Bilinear_uchar4_param_0,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar4_param_1,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar4_param_2,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar4_param_3,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar4_param_4,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar4_param_5\n" | |
+ ")\n" | |
+ "{\n" | |
+ ".reg .pred %p<4>;\n" | |
+ ".reg .s16 %rs<5>;\n" | |
+ ".reg .f32 %f<27>;\n" | |
+ ".reg .s32 %r<71>;\n" | |
+ ".reg .s64 %rd<2>;\n" | |
+ "\n" | |
+ "\n" | |
+ "ld.param.u32 %r3, [Subsample_Bilinear_uchar4_param_0];\n" | |
+ "ld.param.u32 %r4, [Subsample_Bilinear_uchar4_param_1];\n" | |
+ "ld.param.u32 %r5, [Subsample_Bilinear_uchar4_param_2];\n" | |
+ "ld.param.u32 %r6, [Subsample_Bilinear_uchar4_param_3];\n" | |
+ "ld.param.u32 %r7, [Subsample_Bilinear_uchar4_param_4];\n" | |
+ "ld.param.u32 %r8, [Subsample_Bilinear_uchar4_param_5];\n" | |
+ "mov.u32 %r9, %ctaid.x;\n" | |
+ "mov.u32 %r10, %ntid.x;\n" | |
+ "mov.u32 %r11, %tid.x;\n" | |
+ "mad.lo.s32 %r1, %r10, %r9, %r11;\n" | |
+ "mov.u32 %r12, %ntid.y;\n" | |
+ "mov.u32 %r13, %ctaid.y;\n" | |
+ "mov.u32 %r14, %tid.y;\n" | |
+ "mad.lo.s32 %r2, %r12, %r13, %r14;\n" | |
+ "setp.lt.s32 %p1, %r2, %r5;\n" | |
+ "setp.lt.s32 %p2, %r1, %r4;\n" | |
+ "and.pred %p3, %p1, %p2;\n" | |
+ "@!%p3 bra BB2_2;\n" | |
+ "bra.uni BB2_1;\n" | |
+ "\n" | |
+ "BB2_1:\n" | |
+ "cvta.to.global.u32 %r15, %r3;\n" | |
+ "cvt.rn.f32.s32 %f1, %r4;\n" | |
+ "cvt.rn.f32.s32 %f2, %r7;\n" | |
+ "div.rn.f32 %f3, %f2, %f1;\n" | |
+ "cvt.rn.f32.s32 %f4, %r5;\n" | |
+ "cvt.rn.f32.s32 %f5, %r8;\n" | |
+ "div.rn.f32 %f6, %f5, %f4;\n" | |
+ "cvt.rn.f32.s32 %f7, %r1;\n" | |
+ "add.f32 %f8, %f7, 0f3F000000;\n" | |
+ "mul.f32 %f9, %f8, %f3;\n" | |
+ "cvt.rn.f32.s32 %f10, %r2;\n" | |
+ "add.f32 %f11, %f10, 0f3F000000;\n" | |
+ "mul.f32 %f12, %f11, %f6;\n" | |
+ "add.f32 %f13, %f3, 0fBF800000;\n" | |
+ "mul.f32 %f14, %f13, 0f3F000000;\n" | |
+ "cvt.sat.f32.f32 %f15, %f14;\n" | |
+ "add.f32 %f16, %f6, 0fBF800000;\n" | |
+ "mul.f32 %f17, %f16, 0f3F000000;\n" | |
+ "cvt.sat.f32.f32 %f18, %f17;\n" | |
+ "add.f32 %f19, %f15, 0f3F000000;\n" | |
+ "div.rn.f32 %f20, %f15, %f19;\n" | |
+ "add.f32 %f21, %f18, 0f3F000000;\n" | |
+ "div.rn.f32 %f22, %f18, %f21;\n" | |
+ "sub.f32 %f23, %f9, %f20;\n" | |
+ "sub.f32 %f24, %f12, %f22;\n" | |
+ "tex.2d.v4.u32.f32 {%r16, %r17, %r18, %r19}, [uchar4_tex, {%f23, %f24}];\n" | |
+ "add.f32 %f25, %f9, %f20;\n" | |
+ "tex.2d.v4.u32.f32 {%r20, %r21, %r22, %r23}, [uchar4_tex, {%f25, %f24}];\n" | |
+ "add.f32 %f26, %f12, %f22;\n" | |
+ "tex.2d.v4.u32.f32 {%r24, %r25, %r26, %r27}, [uchar4_tex, {%f23, %f26}];\n" | |
+ "tex.2d.v4.u32.f32 {%r28, %r29, %r30, %r31}, [uchar4_tex, {%f25, %f26}];\n" | |
+ "and.b32 %r32, %r16, 255;\n" | |
+ "and.b32 %r33, %r20, 255;\n" | |
+ "and.b32 %r34, %r24, 255;\n" | |
+ "and.b32 %r35, %r28, 255;\n" | |
+ "add.s32 %r36, %r32, %r33;\n" | |
+ "add.s32 %r37, %r36, %r34;\n" | |
+ "add.s32 %r38, %r37, %r35;\n" | |
+ "add.s32 %r39, %r38, 2;\n" | |
+ "shr.u32 %r40, %r39, 2;\n" | |
+ "and.b32 %r41, %r17, 255;\n" | |
+ "and.b32 %r42, %r21, 255;\n" | |
+ "and.b32 %r43, %r25, 255;\n" | |
+ "and.b32 %r44, %r29, 255;\n" | |
+ "add.s32 %r45, %r41, %r42;\n" | |
+ "add.s32 %r46, %r45, %r43;\n" | |
+ "add.s32 %r47, %r46, %r44;\n" | |
+ "add.s32 %r48, %r47, 2;\n" | |
+ "shr.u32 %r49, %r48, 2;\n" | |
+ "and.b32 %r50, %r18, 255;\n" | |
+ "and.b32 %r51, %r22, 255;\n" | |
+ "and.b32 %r52, %r26, 255;\n" | |
+ "and.b32 %r53, %r30, 255;\n" | |
+ "add.s32 %r54, %r50, %r51;\n" | |
+ "add.s32 %r55, %r54, %r52;\n" | |
+ "add.s32 %r56, %r55, %r53;\n" | |
+ "add.s32 %r57, %r56, 2;\n" | |
+ "shr.u32 %r58, %r57, 2;\n" | |
+ "and.b32 %r59, %r19, 255;\n" | |
+ "and.b32 %r60, %r23, 255;\n" | |
+ "and.b32 %r61, %r27, 255;\n" | |
+ "and.b32 %r62, %r31, 255;\n" | |
+ "add.s32 %r63, %r59, %r60;\n" | |
+ "add.s32 %r64, %r63, %r61;\n" | |
+ "add.s32 %r65, %r64, %r62;\n" | |
+ "add.s32 %r66, %r65, 2;\n" | |
+ "shr.u32 %r67, %r66, 2;\n" | |
+ "mad.lo.s32 %r68, %r2, %r6, %r1;\n" | |
+ "shl.b32 %r69, %r68, 2;\n" | |
+ "add.s32 %r70, %r15, %r69;\n" | |
+ "cvt.u16.u32 %rs1, %r67;\n" | |
+ "cvt.u16.u32 %rs2, %r58;\n" | |
+ "cvt.u16.u32 %rs3, %r49;\n" | |
+ "cvt.u16.u32 %rs4, %r40;\n" | |
+ "st.global.v4.u8 [%r70], {%rs4, %rs3, %rs2, %rs1};\n" | |
+ "\n" | |
+ "BB2_2:\n" | |
+ "ret;\n" | |
+ "}\n" | |
+ "\n" | |
+ "\n" | |
+; | |
+#elif defined ENVIRONMENT64 | |
+const char resize_ptx[] = \ | |
+ "//\n" | |
+ "// Generated by NVIDIA NVVM Compiler\n" | |
+ "//\n" | |
+ "// Compiler Build ID: CL-19324607\n" | |
+ "// Cuda compilation tools, release 7.0, V7.0.27\n" | |
+ "// Based on LLVM 3.4svn\n" | |
+ "//\n" | |
+ "\n" | |
+ ".version 4.2\n" | |
+ ".target sm_30\n" | |
+ ".address_size 64\n" | |
+ "\n" | |
+ "// .globl Subsample_Bilinear_uchar\n" | |
+ ".global .texref uchar_tex;\n" | |
+ ".global .texref uchar2_tex;\n" | |
+ ".global .texref uchar4_tex;\n" | |
+ "\n" | |
+ ".visible .entry Subsample_Bilinear_uchar(\n" | |
+ ".param .u64 Subsample_Bilinear_uchar_param_0,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar_param_1,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar_param_2,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar_param_3,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar_param_4,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar_param_5\n" | |
+ ")\n" | |
+ "{\n" | |
+ ".reg .pred %p<4>;\n" | |
+ ".reg .f32 %f<27>;\n" | |
+ ".reg .s32 %r<40>;\n" | |
+ ".reg .s64 %rd<6>;\n" | |
+ "\n" | |
+ "\n" | |
+ "ld.param.u64 %rd1, [Subsample_Bilinear_uchar_param_0];\n" | |
+ "ld.param.u32 %r3, [Subsample_Bilinear_uchar_param_1];\n" | |
+ "ld.param.u32 %r4, [Subsample_Bilinear_uchar_param_2];\n" | |
+ "ld.param.u32 %r5, [Subsample_Bilinear_uchar_param_3];\n" | |
+ "ld.param.u32 %r6, [Subsample_Bilinear_uchar_param_4];\n" | |
+ "ld.param.u32 %r7, [Subsample_Bilinear_uchar_param_5];\n" | |
+ "mov.u32 %r8, %ctaid.x;\n" | |
+ "mov.u32 %r9, %ntid.x;\n" | |
+ "mov.u32 %r10, %tid.x;\n" | |
+ "mad.lo.s32 %r1, %r9, %r8, %r10;\n" | |
+ "mov.u32 %r11, %ntid.y;\n" | |
+ "mov.u32 %r12, %ctaid.y;\n" | |
+ "mov.u32 %r13, %tid.y;\n" | |
+ "mad.lo.s32 %r2, %r11, %r12, %r13;\n" | |
+ "setp.lt.s32 %p1, %r2, %r4;\n" | |
+ "setp.lt.s32 %p2, %r1, %r3;\n" | |
+ "and.pred %p3, %p1, %p2;\n" | |
+ "@!%p3 bra BB0_2;\n" | |
+ "bra.uni BB0_1;\n" | |
+ "\n" | |
+ "BB0_1:\n" | |
+ "cvta.to.global.u64 %rd2, %rd1;\n" | |
+ "cvt.rn.f32.s32 %f1, %r3;\n" | |
+ "cvt.rn.f32.s32 %f2, %r6;\n" | |
+ "div.rn.f32 %f3, %f2, %f1;\n" | |
+ "cvt.rn.f32.s32 %f4, %r4;\n" | |
+ "cvt.rn.f32.s32 %f5, %r7;\n" | |
+ "div.rn.f32 %f6, %f5, %f4;\n" | |
+ "cvt.rn.f32.s32 %f7, %r1;\n" | |
+ "add.f32 %f8, %f7, 0f3F000000;\n" | |
+ "mul.f32 %f9, %f8, %f3;\n" | |
+ "cvt.rn.f32.s32 %f10, %r2;\n" | |
+ "add.f32 %f11, %f10, 0f3F000000;\n" | |
+ "mul.f32 %f12, %f11, %f6;\n" | |
+ "add.f32 %f13, %f3, 0fBF800000;\n" | |
+ "mul.f32 %f14, %f13, 0f3F000000;\n" | |
+ "cvt.sat.f32.f32 %f15, %f14;\n" | |
+ "add.f32 %f16, %f6, 0fBF800000;\n" | |
+ "mul.f32 %f17, %f16, 0f3F000000;\n" | |
+ "cvt.sat.f32.f32 %f18, %f17;\n" | |
+ "add.f32 %f19, %f15, 0f3F000000;\n" | |
+ "div.rn.f32 %f20, %f15, %f19;\n" | |
+ "add.f32 %f21, %f18, 0f3F000000;\n" | |
+ "div.rn.f32 %f22, %f18, %f21;\n" | |
+ "sub.f32 %f23, %f9, %f20;\n" | |
+ "sub.f32 %f24, %f12, %f22;\n" | |
+ "tex.2d.v4.u32.f32 {%r14, %r15, %r16, %r17}, [uchar_tex, {%f23, %f24}];\n" | |
+ "and.b32 %r18, %r14, 255;\n" | |
+ "add.f32 %f25, %f9, %f20;\n" | |
+ "tex.2d.v4.u32.f32 {%r19, %r20, %r21, %r22}, [uchar_tex, {%f25, %f24}];\n" | |
+ "and.b32 %r23, %r19, 255;\n" | |
+ "add.f32 %f26, %f12, %f22;\n" | |
+ "tex.2d.v4.u32.f32 {%r24, %r25, %r26, %r27}, [uchar_tex, {%f23, %f26}];\n" | |
+ "and.b32 %r28, %r24, 255;\n" | |
+ "tex.2d.v4.u32.f32 {%r29, %r30, %r31, %r32}, [uchar_tex, {%f25, %f26}];\n" | |
+ "and.b32 %r33, %r29, 255;\n" | |
+ "add.s32 %r34, %r18, %r23;\n" | |
+ "add.s32 %r35, %r34, %r28;\n" | |
+ "add.s32 %r36, %r35, %r33;\n" | |
+ "add.s32 %r37, %r36, 2;\n" | |
+ "shr.u32 %r38, %r37, 2;\n" | |
+ "mad.lo.s32 %r39, %r2, %r5, %r1;\n" | |
+ "cvt.s64.s32 %rd4, %r39;\n" | |
+ "add.s64 %rd5, %rd2, %rd4;\n" | |
+ "st.global.u8 [%rd5], %r38;\n" | |
+ "\n" | |
+ "BB0_2:\n" | |
+ "ret;\n" | |
+ "}\n" | |
+ "\n" | |
+ "// .globl Subsample_Bilinear_uchar2\n" | |
+ ".visible .entry Subsample_Bilinear_uchar2(\n" | |
+ ".param .u64 Subsample_Bilinear_uchar2_param_0,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar2_param_1,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar2_param_2,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar2_param_3,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar2_param_4,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar2_param_5\n" | |
+ ")\n" | |
+ "{\n" | |
+ ".reg .pred %p<4>;\n" | |
+ ".reg .s16 %rs<3>;\n" | |
+ ".reg .f32 %f<27>;\n" | |
+ ".reg .s32 %r<49>;\n" | |
+ ".reg .s64 %rd<6>;\n" | |
+ "\n" | |
+ "\n" | |
+ "ld.param.u64 %rd1, [Subsample_Bilinear_uchar2_param_0];\n" | |
+ "ld.param.u32 %r3, [Subsample_Bilinear_uchar2_param_1];\n" | |
+ "ld.param.u32 %r4, [Subsample_Bilinear_uchar2_param_2];\n" | |
+ "ld.param.u32 %r5, [Subsample_Bilinear_uchar2_param_3];\n" | |
+ "ld.param.u32 %r6, [Subsample_Bilinear_uchar2_param_4];\n" | |
+ "ld.param.u32 %r7, [Subsample_Bilinear_uchar2_param_5];\n" | |
+ "mov.u32 %r8, %ctaid.x;\n" | |
+ "mov.u32 %r9, %ntid.x;\n" | |
+ "mov.u32 %r10, %tid.x;\n" | |
+ "mad.lo.s32 %r1, %r9, %r8, %r10;\n" | |
+ "mov.u32 %r11, %ntid.y;\n" | |
+ "mov.u32 %r12, %ctaid.y;\n" | |
+ "mov.u32 %r13, %tid.y;\n" | |
+ "mad.lo.s32 %r2, %r11, %r12, %r13;\n" | |
+ "setp.lt.s32 %p1, %r2, %r4;\n" | |
+ "setp.lt.s32 %p2, %r1, %r3;\n" | |
+ "and.pred %p3, %p1, %p2;\n" | |
+ "@!%p3 bra BB1_2;\n" | |
+ "bra.uni BB1_1;\n" | |
+ "\n" | |
+ "BB1_1:\n" | |
+ "cvta.to.global.u64 %rd2, %rd1;\n" | |
+ "cvt.rn.f32.s32 %f1, %r3;\n" | |
+ "cvt.rn.f32.s32 %f2, %r6;\n" | |
+ "div.rn.f32 %f3, %f2, %f1;\n" | |
+ "cvt.rn.f32.s32 %f4, %r4;\n" | |
+ "cvt.rn.f32.s32 %f5, %r7;\n" | |
+ "div.rn.f32 %f6, %f5, %f4;\n" | |
+ "cvt.rn.f32.s32 %f7, %r1;\n" | |
+ "add.f32 %f8, %f7, 0f3F000000;\n" | |
+ "mul.f32 %f9, %f8, %f3;\n" | |
+ "cvt.rn.f32.s32 %f10, %r2;\n" | |
+ "add.f32 %f11, %f10, 0f3F000000;\n" | |
+ "mul.f32 %f12, %f11, %f6;\n" | |
+ "add.f32 %f13, %f3, 0fBF800000;\n" | |
+ "mul.f32 %f14, %f13, 0f3F000000;\n" | |
+ "cvt.sat.f32.f32 %f15, %f14;\n" | |
+ "add.f32 %f16, %f6, 0fBF800000;\n" | |
+ "mul.f32 %f17, %f16, 0f3F000000;\n" | |
+ "cvt.sat.f32.f32 %f18, %f17;\n" | |
+ "add.f32 %f19, %f15, 0f3F000000;\n" | |
+ "div.rn.f32 %f20, %f15, %f19;\n" | |
+ "add.f32 %f21, %f18, 0f3F000000;\n" | |
+ "div.rn.f32 %f22, %f18, %f21;\n" | |
+ "sub.f32 %f23, %f9, %f20;\n" | |
+ "sub.f32 %f24, %f12, %f22;\n" | |
+ "tex.2d.v4.u32.f32 {%r14, %r15, %r16, %r17}, [uchar2_tex, {%f23, %f24}];\n" | |
+ "add.f32 %f25, %f9, %f20;\n" | |
+ "tex.2d.v4.u32.f32 {%r18, %r19, %r20, %r21}, [uchar2_tex, {%f25, %f24}];\n" | |
+ "add.f32 %f26, %f12, %f22;\n" | |
+ "tex.2d.v4.u32.f32 {%r22, %r23, %r24, %r25}, [uchar2_tex, {%f23, %f26}];\n" | |
+ "tex.2d.v4.u32.f32 {%r26, %r27, %r28, %r29}, [uchar2_tex, {%f25, %f26}];\n" | |
+ "and.b32 %r30, %r14, 255;\n" | |
+ "and.b32 %r31, %r18, 255;\n" | |
+ "and.b32 %r32, %r22, 255;\n" | |
+ "and.b32 %r33, %r26, 255;\n" | |
+ "add.s32 %r34, %r30, %r31;\n" | |
+ "add.s32 %r35, %r34, %r32;\n" | |
+ "add.s32 %r36, %r35, %r33;\n" | |
+ "add.s32 %r37, %r36, 2;\n" | |
+ "shr.u32 %r38, %r37, 2;\n" | |
+ "and.b32 %r39, %r15, 255;\n" | |
+ "and.b32 %r40, %r19, 255;\n" | |
+ "and.b32 %r41, %r23, 255;\n" | |
+ "and.b32 %r42, %r27, 255;\n" | |
+ "add.s32 %r43, %r39, %r40;\n" | |
+ "add.s32 %r44, %r43, %r41;\n" | |
+ "add.s32 %r45, %r44, %r42;\n" | |
+ "add.s32 %r46, %r45, 2;\n" | |
+ "shr.u32 %r47, %r46, 2;\n" | |
+ "mad.lo.s32 %r48, %r2, %r5, %r1;\n" | |
+ "mul.wide.s32 %rd4, %r48, 2;\n" | |
+ "add.s64 %rd5, %rd2, %rd4;\n" | |
+ "cvt.u16.u32 %rs1, %r47;\n" | |
+ "cvt.u16.u32 %rs2, %r38;\n" | |
+ "st.global.v2.u8 [%rd5], {%rs2, %rs1};\n" | |
+ "\n" | |
+ "BB1_2:\n" | |
+ "ret;\n" | |
+ "}\n" | |
+ "\n" | |
+ "// .globl Subsample_Bilinear_uchar4\n" | |
+ ".visible .entry Subsample_Bilinear_uchar4(\n" | |
+ ".param .u64 Subsample_Bilinear_uchar4_param_0,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar4_param_1,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar4_param_2,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar4_param_3,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar4_param_4,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar4_param_5\n" | |
+ ")\n" | |
+ "{\n" | |
+ ".reg .pred %p<4>;\n" | |
+ ".reg .s16 %rs<5>;\n" | |
+ ".reg .f32 %f<27>;\n" | |
+ ".reg .s32 %r<67>;\n" | |
+ ".reg .s64 %rd<6>;\n" | |
+ "\n" | |
+ "\n" | |
+ "ld.param.u64 %rd1, [Subsample_Bilinear_uchar4_param_0];\n" | |
+ "ld.param.u32 %r3, [Subsample_Bilinear_uchar4_param_1];\n" | |
+ "ld.param.u32 %r4, [Subsample_Bilinear_uchar4_param_2];\n" | |
+ "ld.param.u32 %r5, [Subsample_Bilinear_uchar4_param_3];\n" | |
+ "ld.param.u32 %r6, [Subsample_Bilinear_uchar4_param_4];\n" | |
+ "ld.param.u32 %r7, [Subsample_Bilinear_uchar4_param_5];\n" | |
+ "mov.u32 %r8, %ctaid.x;\n" | |
+ "mov.u32 %r9, %ntid.x;\n" | |
+ "mov.u32 %r10, %tid.x;\n" | |
+ "mad.lo.s32 %r1, %r9, %r8, %r10;\n" | |
+ "mov.u32 %r11, %ntid.y;\n" | |
+ "mov.u32 %r12, %ctaid.y;\n" | |
+ "mov.u32 %r13, %tid.y;\n" | |
+ "mad.lo.s32 %r2, %r11, %r12, %r13;\n" | |
+ "setp.lt.s32 %p1, %r2, %r4;\n" | |
+ "setp.lt.s32 %p2, %r1, %r3;\n" | |
+ "and.pred %p3, %p1, %p2;\n" | |
+ "@!%p3 bra BB2_2;\n" | |
+ "bra.uni BB2_1;\n" | |
+ "\n" | |
+ "BB2_1:\n" | |
+ "cvta.to.global.u64 %rd2, %rd1;\n" | |
+ "cvt.rn.f32.s32 %f1, %r3;\n" | |
+ "cvt.rn.f32.s32 %f2, %r6;\n" | |
+ "div.rn.f32 %f3, %f2, %f1;\n" | |
+ "cvt.rn.f32.s32 %f4, %r4;\n" | |
+ "cvt.rn.f32.s32 %f5, %r7;\n" | |
+ "div.rn.f32 %f6, %f5, %f4;\n" | |
+ "cvt.rn.f32.s32 %f7, %r1;\n" | |
+ "add.f32 %f8, %f7, 0f3F000000;\n" | |
+ "mul.f32 %f9, %f8, %f3;\n" | |
+ "cvt.rn.f32.s32 %f10, %r2;\n" | |
+ "add.f32 %f11, %f10, 0f3F000000;\n" | |
+ "mul.f32 %f12, %f11, %f6;\n" | |
+ "add.f32 %f13, %f3, 0fBF800000;\n" | |
+ "mul.f32 %f14, %f13, 0f3F000000;\n" | |
+ "cvt.sat.f32.f32 %f15, %f14;\n" | |
+ "add.f32 %f16, %f6, 0fBF800000;\n" | |
+ "mul.f32 %f17, %f16, 0f3F000000;\n" | |
+ "cvt.sat.f32.f32 %f18, %f17;\n" | |
+ "add.f32 %f19, %f15, 0f3F000000;\n" | |
+ "div.rn.f32 %f20, %f15, %f19;\n" | |
+ "add.f32 %f21, %f18, 0f3F000000;\n" | |
+ "div.rn.f32 %f22, %f18, %f21;\n" | |
+ "sub.f32 %f23, %f9, %f20;\n" | |
+ "sub.f32 %f24, %f12, %f22;\n" | |
+ "tex.2d.v4.u32.f32 {%r14, %r15, %r16, %r17}, [uchar4_tex, {%f23, %f24}];\n" | |
+ "add.f32 %f25, %f9, %f20;\n" | |
+ "tex.2d.v4.u32.f32 {%r18, %r19, %r20, %r21}, [uchar4_tex, {%f25, %f24}];\n" | |
+ "add.f32 %f26, %f12, %f22;\n" | |
+ "tex.2d.v4.u32.f32 {%r22, %r23, %r24, %r25}, [uchar4_tex, {%f23, %f26}];\n" | |
+ "tex.2d.v4.u32.f32 {%r26, %r27, %r28, %r29}, [uchar4_tex, {%f25, %f26}];\n" | |
+ "and.b32 %r30, %r14, 255;\n" | |
+ "and.b32 %r31, %r18, 255;\n" | |
+ "and.b32 %r32, %r22, 255;\n" | |
+ "and.b32 %r33, %r26, 255;\n" | |
+ "add.s32 %r34, %r30, %r31;\n" | |
+ "add.s32 %r35, %r34, %r32;\n" | |
+ "add.s32 %r36, %r35, %r33;\n" | |
+ "add.s32 %r37, %r36, 2;\n" | |
+ "shr.u32 %r38, %r37, 2;\n" | |
+ "and.b32 %r39, %r15, 255;\n" | |
+ "and.b32 %r40, %r19, 255;\n" | |
+ "and.b32 %r41, %r23, 255;\n" | |
+ "and.b32 %r42, %r27, 255;\n" | |
+ "add.s32 %r43, %r39, %r40;\n" | |
+ "add.s32 %r44, %r43, %r41;\n" | |
+ "add.s32 %r45, %r44, %r42;\n" | |
+ "add.s32 %r46, %r45, 2;\n" | |
+ "shr.u32 %r47, %r46, 2;\n" | |
+ "and.b32 %r48, %r16, 255;\n" | |
+ "and.b32 %r49, %r20, 255;\n" | |
+ "and.b32 %r50, %r24, 255;\n" | |
+ "and.b32 %r51, %r28, 255;\n" | |
+ "add.s32 %r52, %r48, %r49;\n" | |
+ "add.s32 %r53, %r52, %r50;\n" | |
+ "add.s32 %r54, %r53, %r51;\n" | |
+ "add.s32 %r55, %r54, 2;\n" | |
+ "shr.u32 %r56, %r55, 2;\n" | |
+ "and.b32 %r57, %r17, 255;\n" | |
+ "and.b32 %r58, %r21, 255;\n" | |
+ "and.b32 %r59, %r25, 255;\n" | |
+ "and.b32 %r60, %r29, 255;\n" | |
+ "add.s32 %r61, %r57, %r58;\n" | |
+ "add.s32 %r62, %r61, %r59;\n" | |
+ "add.s32 %r63, %r62, %r60;\n" | |
+ "add.s32 %r64, %r63, 2;\n" | |
+ "shr.u32 %r65, %r64, 2;\n" | |
+ "mad.lo.s32 %r66, %r2, %r5, %r1;\n" | |
+ "mul.wide.s32 %rd4, %r66, 4;\n" | |
+ "add.s64 %rd5, %rd2, %rd4;\n" | |
+ "cvt.u16.u32 %rs1, %r65;\n" | |
+ "cvt.u16.u32 %rs2, %r56;\n" | |
+ "cvt.u16.u32 %rs3, %r47;\n" | |
+ "cvt.u16.u32 %rs4, %r38;\n" | |
+ "st.global.v4.u8 [%rd5], {%rs4, %rs3, %rs2, %rs1};\n" | |
+ "\n" | |
+ "BB2_2:\n" | |
+ "ret;\n" | |
+ "}\n" | |
+ "\n" | |
+ "\n" | |
+; | |
+#endif | |
diff -ruN ffmpeg-orig/libavutil/cudautils.c ffmpeg/libavutil/cudautils.c | |
--- ffmpeg-orig/libavutil/cudautils.c 1970-01-01 03:00:00.000000000 +0300 | |
+++ ffmpeg/libavutil/cudautils.c 2016-03-04 00:18:17.589433917 +0200 | |
@@ -0,0 +1,288 @@ | |
+/* | |
+* | |
+* This file is part of FFmpeg. | |
+* | |
+* FFmpeg is free software; you can redistribute it and/or | |
+* modify it under the terms of the GNU Lesser General Public | |
+* License as published by the Free Software Foundation; either | |
+* version 2.1 of the License, or (at your option) any later version. | |
+* | |
+* FFmpeg is distributed in the hope that it will be useful, | |
+* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
+* Lesser General Public License for more details. | |
+* | |
+* You should have received a copy of the GNU Lesser General Public | |
+* License along with FFmpeg; if not, write to the Free Software | |
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
+*/ | |
+ | |
+#include "cudautils.h" | |
+#include "common.h" | |
+#include "log.h" | |
+ | |
+/* Layout version written into every ffnvinfo by init_nvinfo(). */ | |
+#define FF_NVINFO_VERSION 1 | |
+ | |
+/* Marker GUID: init_nvinfo() stores it, check_nvinfo() compares against it. */ | |
+static NVGUID NV_INFO_GUID = { 0x2cab9a64, 0x7095, 0x11e5, { 0xad, 0x1d, 0x94, 0xde, 0x80, 0x65, 0xb7, 0x74 } }; | |
+ | |
+/* | |
+ * File-wide CUDA state shared by every function in this file. | |
+ * The initialisers are positional and follow the member order of CudaContext. | |
+ * cu_devices holds integer device handles, so it is zeroed with 0, not NULL. | |
+ */ | |
+static CudaContext cudaCtx = { | |
+    { NULL },   /* cuda_context_arr   */ | |
+    { 0 },      /* cuda_context_count */ | |
+    { NULL },   /* cuda_dload_funcs   */ | |
+    0,          /* device_count       */ | |
+    { 0 },      /* cu_devices         */ | |
+    { "" },     /* gpu_name           */ | |
+    { 0 }       /* smver              */ | |
+}; | |
+ | |
+/* Forward declarations for helpers defined further down in this file. */ | |
+int check_cuda_errors(CUresult err, const char *func); /* 1 on CUDA_SUCCESS, 0 otherwise */ | |
+int dyload_cuda(void);                                 /* 1 when the driver library is loaded */ | |
+int check_cuda(void);                                  /* 1 when devices were enumerated */ | |
+ | |
+ | |
+/* | |
+ * Resolve symbol `s` from dl_fn->cuda_lib, cast it to type `t` and store it | |
+ * in `f`. On failure a fatal message is logged and control jumps to the | |
+ * enclosing function's `error` label. Requires a local `dl_fn` in scope. | |
+ */ | |
+#define CHECK_LOAD_FUNC(t, f, s) \ | |
+    do { \ | |
+        if (!((f) = (t)LOAD_FUNC(dl_fn->cuda_lib, s))) { \ | |
+            av_log(NULL, AV_LOG_FATAL, "Failed loading %s from CUDA library\n", s); \ | |
+            goto error; \ | |
+        } \ | |
+    } while (0) | |
+ | |
+int check_cuda_errors(CUresult err, const char *func) | |
+{ | |
+ if (err != CUDA_SUCCESS) { | |
+ av_log(NULL, AV_LOG_FATAL, ">> %s - failed with error code 0x%x\n", func, err); | |
+ return 0; | |
+ } | |
+ return 1; | |
+} | |
+#define check_cuda_errors(f) if (!check_cuda_errors(f, #f)) goto error | |
+ | |
+ | |
+/** | |
+ * Load the CUDA driver library and resolve every entry point stored in | |
+ * cudaCtx.cuda_dload_funcs. Does nothing when the library is already loaded. | |
+ * | |
+ * @return 1 on success, 0 on failure; on failure the library is closed and the | |
+ *         whole function table is cleared so no stale pointer can be called | |
+ */ | |
+int dyload_cuda(void) | |
+{ | |
+    CudaDynLoadFunctions *dl_fn = &cudaCtx.cuda_dload_funcs; | |
+ | |
+    if (dl_fn->cuda_lib) | |
+        return 1; | |
+ | |
+#if defined(_WIN32) | |
+    dl_fn->cuda_lib = LoadLibrary(TEXT("nvcuda.dll")); | |
+#else | |
+    dl_fn->cuda_lib = dlopen("libcuda.so", RTLD_LAZY); | |
+    /* The unversioned name may be absent; fall back to the versioned name. */ | |
+    if (!dl_fn->cuda_lib) | |
+        dl_fn->cuda_lib = dlopen("libcuda.so.1", RTLD_LAZY); | |
+#endif | |
+ | |
+    if (!dl_fn->cuda_lib) { | |
+        av_log(NULL, AV_LOG_FATAL, "Failed loading CUDA library\n"); | |
+        goto error; | |
+    } | |
+ | |
+    /* device discovery */ | |
+    CHECK_LOAD_FUNC(PCUINIT, dl_fn->cu_init, "cuInit"); | |
+    CHECK_LOAD_FUNC(PCUDEVICEGETCOUNT, dl_fn->cu_device_get_count, "cuDeviceGetCount"); | |
+    CHECK_LOAD_FUNC(PCUDEVICEGET, dl_fn->cu_device_get, "cuDeviceGet"); | |
+    CHECK_LOAD_FUNC(PCUDEVICEGETNAME, dl_fn->cu_device_get_name, "cuDeviceGetName"); | |
+    CHECK_LOAD_FUNC(PCUDEVICECOMPUTECAPABILITY, dl_fn->cu_device_compute_capability, "cuDeviceComputeCapability"); | |
+    /* context management */ | |
+    CHECK_LOAD_FUNC(PCUCTXCREATE, dl_fn->cu_ctx_create, "cuCtxCreate_v2"); | |
+    CHECK_LOAD_FUNC(PCUCTXPOPCURRENT, dl_fn->cu_ctx_pop_current, "cuCtxPopCurrent_v2"); | |
+    CHECK_LOAD_FUNC(PCUCTXDESTROY, dl_fn->cu_ctx_destroy, "cuCtxDestroy_v2"); | |
+    /* modules and texture references */ | |
+    CHECK_LOAD_FUNC(PCUMODULELOADDATA, dl_fn->cu_module_load_data, "cuModuleLoadData"); | |
+    CHECK_LOAD_FUNC(PCUMODULEGETFUNCTION, dl_fn->cu_module_get_function, "cuModuleGetFunction"); | |
+    CHECK_LOAD_FUNC(PCUMODULEGETTEXREF, dl_fn->cu_module_get_texref, "cuModuleGetTexRef"); | |
+    CHECK_LOAD_FUNC(PCUTEXREFSETFLAGS, dl_fn->cu_texref_set_flags, "cuTexRefSetFlags"); | |
+    CHECK_LOAD_FUNC(PCUTEXREFSETFILTERMODE, dl_fn->cu_texref_set_filtermode, "cuTexRefSetFilterMode"); | |
+    CHECK_LOAD_FUNC(PCUTEXREFSETADDRESS2D, dl_fn->cu_texref_set_address_2D, "cuTexRefSetAddress2D_v3"); | |
+    /* memory and kernel launch */ | |
+    CHECK_LOAD_FUNC(PCUMEMALLOCPITCH, dl_fn->cu_mem_alloc_pitch, "cuMemAllocPitch_v2"); | |
+    CHECK_LOAD_FUNC(PCUMEMCPY2D, dl_fn->cu_mem_cpy_2D, "cuMemcpy2D_v2"); | |
+    CHECK_LOAD_FUNC(PCUMEMCPY2DASYNC, dl_fn->cu_mem_cpy_2D_async, "cuMemcpy2DAsync_v2"); | |
+    CHECK_LOAD_FUNC(PCUMEMFREE, dl_fn->cu_mem_free, "cuMemFree_v2"); | |
+    CHECK_LOAD_FUNC(PCULAUNCHKERNEL, dl_fn->cu_launch_kernel, "cuLaunchKernel"); | |
+ | |
+    av_log(NULL, AV_LOG_VERBOSE, "CUDA Library and Function loaded successfully\n"); | |
+    return 1; | |
+ | |
+error: | |
+    if (dl_fn->cuda_lib) | |
+        DL_CLOSE_FUNC(dl_fn->cuda_lib); | |
+ | |
+    /* Clears cuda_lib and every entry point resolved before the failure. */ | |
+    memset(dl_fn, 0, sizeof(*dl_fn)); | |
+    return 0; | |
+} | |
+ | |
+ | |
+/** | |
+ * Load the driver, initialise it and enumerate the available devices into | |
+ * cudaCtx (device handle, name and compute capability per device). | |
+ * | |
+ * At most MAX_NUM_GPU devices are recorded, because every per-device array in | |
+ * CudaContext has exactly MAX_NUM_GPU entries. | |
+ * | |
+ * @return 1 on success, 0 on failure (cudaCtx.device_count is then 0) | |
+ */ | |
+int check_cuda(void) | |
+{ | |
+    CudaDynLoadFunctions *dl_fn = &cudaCtx.cuda_dload_funcs; | |
+    int device_count = 0; | |
+    int smminor = 0, smmajor = 0; | |
+    int i; | |
+ | |
+    if (!dyload_cuda()) | |
+        return 0; | |
+ | |
+    check_cuda_errors(dl_fn->cu_init(0)); | |
+    check_cuda_errors(dl_fn->cu_device_get_count(&device_count)); | |
+ | |
+    if (device_count <= 0) { | |
+        av_log(NULL, AV_LOG_FATAL, "No CUDA capable devices found\n"); | |
+        goto error; | |
+    } | |
+ | |
+    av_log(NULL, AV_LOG_VERBOSE, "%d CUDA capable devices found\n", device_count); | |
+ | |
+    /* Never index past the fixed-size per-device tables. */ | |
+    if (device_count > MAX_NUM_GPU) { | |
+        av_log(NULL, AV_LOG_WARNING, "Only the first %d CUDA devices will be used\n", MAX_NUM_GPU); | |
+        device_count = MAX_NUM_GPU; | |
+    } | |
+ | |
+    for (i = 0; i < device_count; ++i) { | |
+        check_cuda_errors(dl_fn->cu_device_get(&cudaCtx.cu_devices[i], i)); | |
+        check_cuda_errors(dl_fn->cu_device_get_name(cudaCtx.gpu_name[i], sizeof(cudaCtx.gpu_name[i]), cudaCtx.cu_devices[i])); | |
+        check_cuda_errors(dl_fn->cu_device_compute_capability(&smmajor, &smminor, cudaCtx.cu_devices[i])); | |
+ | |
+        /* Pack major.minor into one int: major in the high nibble, minor in the low. */ | |
+        cudaCtx.smver[i] = (smmajor << 4) | smminor; | |
+        av_log(NULL, AV_LOG_VERBOSE, "[ GPU #%d - < %s > has Compute SM %d.%d]\n", i, cudaCtx.gpu_name[i], smmajor, smminor); | |
+    } | |
+    cudaCtx.device_count = device_count; | |
+    return 1; | |
+ | |
+error: | |
+    cudaCtx.device_count = 0; | |
+    return 0; | |
+} | |
+ | |
+/** | |
+ * Make sure the CUDA devices have been enumerated. | |
+ * Enumeration via check_cuda() runs only while no device is recorded yet. | |
+ * | |
+ * @return 1 when devices are available, 0 when check_cuda() failed | |
+ */ | |
+int init_cuda(void) | |
+{ | |
+    if (cudaCtx.device_count != 0) | |
+        return 1; | |
+ | |
+    return check_cuda() ? 1 : 0; | |
+} | |
+ | |
+/** | |
+ * Unload the CUDA driver library once no context is alive any more. | |
+ * | |
+ * If any recorded device still owns a context this is a no-op. Otherwise the | |
+ * library is closed (when it was loaded), the function table is cleared and | |
+ * the device count is reset, so that a later init_cuda() reloads the library | |
+ * instead of reporting success with NULL entry points. | |
+ */ | |
+void deinit_cuda(void) | |
+{ | |
+    CudaDynLoadFunctions *dl_fn = &cudaCtx.cuda_dload_funcs; | |
+    int i; | |
+ | |
+    for (i = 0; i < cudaCtx.device_count && i < MAX_NUM_GPU; i++) { | |
+        if (cudaCtx.cuda_context_arr[i] != NULL) | |
+            return; /* still in use by somebody */ | |
+    } | |
+ | |
+    if (dl_fn->cuda_lib) { | |
+        DL_CLOSE_FUNC(dl_fn->cuda_lib); | |
+        av_log(NULL, AV_LOG_VERBOSE, "Cuda Library unloaded\n"); | |
+    } | |
+ | |
+    /* Clears cuda_lib together with every resolved entry point. */ | |
+    memset(dl_fn, 0, sizeof(*dl_fn)); | |
+    cudaCtx.device_count = 0; | |
+} | |
+ | |
+/** | |
+ * Test whether a device meets a minimum compute capability. | |
+ * | |
+ * @param gpu          device index as enumerated by check_cuda() | |
+ * @param target_smver required version, packed as (major << 4) | minor | |
+ * @return 1 if the device exists and its version is >= target_smver, else 0 | |
+ */ | |
+int is_gpu_feature_available(int gpu, int target_smver) | |
+{ | |
+    /* Reject indices outside the enumerated range before touching smver[]. */ | |
+    if (gpu < 0 || gpu >= MAX_NUM_GPU || gpu >= cudaCtx.device_count) | |
+        return 0; | |
+ | |
+    return cudaCtx.smver[gpu] >= target_smver ? 1 : 0; | |
+} | |
+ | |
+ | |
+/** | |
+ * Hand out the shared CUDA context of a device, creating it on first use, | |
+ * and increment its reference count. | |
+ * | |
+ * @param ctx receives the context; set to NULL when the call fails | |
+ * @param gpu device index as enumerated by check_cuda() | |
+ * @return CUDA_SUCCESS (0) on success, otherwise a non-zero CUresult value | |
+ */ | |
+int get_cuda_context(CUcontext *ctx, int gpu) | |
+{ | |
+    CudaDynLoadFunctions *dl_fn = &cudaCtx.cuda_dload_funcs; | |
+    CUcontext new_ctx = NULL; | |
+    CUresult cu_res; | |
+ | |
+    if (!ctx) | |
+        return 1;   /* value of CUDA_ERROR_INVALID_VALUE in cuda.h */ | |
+ | |
+    if (gpu < 0 || gpu >= MAX_NUM_GPU || gpu >= cudaCtx.device_count) { | |
+        *ctx = NULL; | |
+        return 101; /* value of CUDA_ERROR_INVALID_DEVICE in cuda.h */ | |
+    } | |
+ | |
+    if (cudaCtx.cuda_context_arr[gpu] == NULL) { | |
+        if (!dl_fn->cu_ctx_create) { | |
+            *ctx = NULL; | |
+            return 3; /* value of CUDA_ERROR_NOT_INITIALIZED in cuda.h */ | |
+        } | |
+ | |
+        /* flag 4 is CU_CTX_SCHED_BLOCKING_SYNC in cuda.h */ | |
+        cu_res = dl_fn->cu_ctx_create(&new_ctx, 4, cudaCtx.cu_devices[gpu]); | |
+        if (cu_res != CUDA_SUCCESS) { | |
+            *ctx = NULL; | |
+            return cu_res; | |
+        } | |
+ | |
+        cudaCtx.cuda_context_arr[gpu] = new_ctx; | |
+    } | |
+ | |
+    *ctx = cudaCtx.cuda_context_arr[gpu]; | |
+    cudaCtx.cuda_context_count[gpu] += 1; | |
+    av_log(NULL, AV_LOG_VERBOSE, "cudalib : Cuda Context created 0x%p\n", *ctx); | |
+ | |
+    return CUDA_SUCCESS; | |
+} | |
+ | |
+/** | |
+ * Drop one reference to the shared context of a device and destroy the | |
+ * context when the last reference goes away. | |
+ * | |
+ * @param ctx caller's copy of the context handle; cleared to NULL | |
+ * @param gpu device index that was passed to get_cuda_context() | |
+ */ | |
+void release_cuda_context(CUcontext *ctx, int gpu) | |
+{ | |
+    CudaDynLoadFunctions *dl_fn = &cudaCtx.cuda_dload_funcs; | |
+ | |
+    /* Clear the caller's handle, not just the local parameter. */ | |
+    if (ctx) | |
+        *ctx = NULL; | |
+ | |
+    if (gpu < 0 || gpu >= MAX_NUM_GPU) | |
+        return; | |
+ | |
+    /* The counter is unsigned: an unbalanced release must not wrap around. */ | |
+    if (cudaCtx.cuda_context_count[gpu] == 0) | |
+        return; | |
+ | |
+    cudaCtx.cuda_context_count[gpu] -= 1; | |
+    if (cudaCtx.cuda_context_count[gpu] == 0) { | |
+        if (dl_fn->cu_ctx_destroy && cudaCtx.cuda_context_arr[gpu]) | |
+            dl_fn->cu_ctx_destroy(cudaCtx.cuda_context_arr[gpu]); | |
+        cudaCtx.cuda_context_arr[gpu] = NULL; | |
+    } | |
+} | |
+ | |
+/** | |
+ * Tell whether a pointer refers to an ffnvinfo block by comparing its leading | |
+ * GUID with NV_INFO_GUID. | |
+ * | |
+ * @param ptr candidate pointer, may be NULL | |
+ * @return 1 when the GUID matches, 0 for NULL or a mismatch | |
+ */ | |
+int check_nvinfo(void* ptr) | |
+{ | |
+    const ffnvinfo *candidate = (ffnvinfo*)ptr; | |
+ | |
+    if (candidate == NULL) | |
+        return 0; | |
+ | |
+    return memcmp(&candidate->guid, &NV_INFO_GUID, sizeof(candidate->guid)) == 0 ? 1 : 0; | |
+} | |
+ | |
+/** | |
+ * Allocate a zeroed ffnvinfo and stamp it with NV_INFO_GUID and the current | |
+ * FF_NVINFO_VERSION. | |
+ * | |
+ * @return the new block, or NULL when the allocation fails | |
+ */ | |
+ffnvinfo* init_nvinfo(void) | |
+{ | |
+    ffnvinfo* info = av_mallocz(sizeof(ffnvinfo)); | |
+ | |
+    if (!info) | |
+        return NULL; | |
+ | |
+    memcpy(&info->guid, &NV_INFO_GUID, sizeof(info->guid)); | |
+    info->version = FF_NVINFO_VERSION; | |
+    return info; | |
+} | |
+ | |
+/** | |
+ * Give access to the table of dynamically loaded CUDA driver entry points. | |
+ * | |
+ * @return pointer to the file-wide table; its entries are NULL until | |
+ *         dyload_cuda() has succeeded | |
+ */ | |
+CudaDynLoadFunctions* get_cuda_dl_func(void) | |
+{ | |
+    return &cudaCtx.cuda_dload_funcs; | |
+} | |
+ | |
+/** | |
+ * Build a CUDA_MEMCPY2D descriptor from the arguments and submit it through | |
+ * the loaded cuMemcpy2DAsync entry point with a NULL stream handle. | |
+ * | |
+ * All descriptor members not listed in the parameters are left at zero. | |
+ * | |
+ * @param width  bytes per row to copy (stored in WidthInBytes) | |
+ * @param height number of rows to copy | |
+ * @return the CUresult of the driver call | |
+ */ | |
+CUresult cuMemCpy2d(const void *srcHost, CUdeviceptr srcDevice, size_t srcPitch, void *dstHost, CUdeviceptr dstDevice, size_t dstPitch, size_t width, size_t height, CUmemorytype srcMemoryType, CUmemorytype dstMemoryType) | |
+{ | |
+    const CUDA_MEMCPY2D desc = { | |
+        .srcMemoryType = srcMemoryType, | |
+        .srcHost       = srcHost, | |
+        .srcDevice     = srcDevice, | |
+        .srcPitch      = srcPitch, | |
+        .dstMemoryType = dstMemoryType, | |
+        .dstHost       = dstHost, | |
+        .dstDevice     = dstDevice, | |
+        .dstPitch      = dstPitch, | |
+        .WidthInBytes  = width, | |
+        .Height        = height, | |
+    }; | |
+ | |
+    return get_cuda_dl_func()->cu_mem_cpy_2D_async(&desc, NULL); | |
+} | |
diff -ruN ffmpeg-orig/libavutil/cudautils.h ffmpeg/libavutil/cudautils.h | |
--- ffmpeg-orig/libavutil/cudautils.h 1970-01-01 03:00:00.000000000 +0300 | |
+++ ffmpeg/libavutil/cudautils.h 2016-03-04 00:18:17.589433917 +0200 | |
@@ -0,0 +1,216 @@ | |
+/* | |
+* | |
+* This file is part of FFmpeg. | |
+* | |
+* FFmpeg is free software; you can redistribute it and/or | |
+* modify it under the terms of the GNU Lesser General Public | |
+* License as published by the Free Software Foundation; either | |
+* version 2.1 of the License, or (at your option) any later version. | |
+* | |
+* FFmpeg is distributed in the hope that it will be useful, | |
+* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
+* Lesser General Public License for more details. | |
+* | |
+* You should have received a copy of the GNU Lesser General Public | |
+* License along with FFmpeg; if not, write to the Free Software | |
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
+*/ | |
+ | |
+#if defined(_WIN32) | |
+#include <windows.h> | |
+#else | |
+#include <dlfcn.h> | |
+#endif | |
+ | |
+#include "common.h" | |
+ | |
+ | |
+/* Platform glue: calling convention and dynamic-loader primitives. */ | |
+#if defined(_WIN32) | |
+#define CUDAAPI __stdcall | |
+#define LOAD_FUNC(l, s) GetProcAddress(l, s) | |
+#define DL_CLOSE_FUNC(l) FreeLibrary(l) | |
+#else | |
+#define CUDAAPI | |
+#define LOAD_FUNC(l, s) dlsym(l, s) | |
+#define DL_CLOSE_FUNC(l) dlclose(l) | |
+#endif | |
+ | |
+/* Size of every per-device array in CudaContext. */ | |
+#define MAX_NUM_GPU 16 | |
+ | |
+/* Flag bits accepted by the cuTexRefSetFlags entry point. */ | |
+#define CU_TRSF_READ_AS_INTEGER 0x01 | |
+#define CU_TRSF_NORMALIZED_COORDINATES 0x02 | |
+#define CU_TRSF_SRGB 0x10 | |
+ | |
+/* | |
+ * Evaluate a CUDA driver call; on any result other than CUDA_SUCCESS log the | |
+ * call text with file and line, then return AVERROR_EXTERNAL from the | |
+ * enclosing function. Usable only in functions returning int. | |
+ */ | |
+#define __cu(a) do { \ | |
+    CUresult ret; \ | |
+    if ((ret = (a)) != CUDA_SUCCESS) { \ | |
+        av_log(NULL, AV_LOG_FATAL, "[%s:%d]%s has returned CUDA error %d\n", __FILE__, __LINE__, #a, ret); \ | |
+        return AVERROR_EXTERNAL; \ | |
+    } \ | |
+} while (0) | |
+ | |
+ | |
+/* Local stand-ins for the driver API handle types. */ | |
+typedef int   CUdevice;   /* device handle */ | |
+typedef void* CUcontext;  /* opaque handles below */ | |
+typedef void* CUmodule; | |
+typedef void* CUfunction; | |
+typedef void* CUtexref; | |
+typedef void* CUstream; | |
+typedef void* CUarray; | |
+ | |
+/* Device pointer: 64 bits wide on 64-bit targets, 32 bits otherwise. */ | |
+#if defined(_WIN64) || defined(__LP64__) | |
+typedef unsigned long long CUdeviceptr; | |
+#else | |
+typedef unsigned int CUdeviceptr; | |
+#endif | |
+ | |
+/* Driver result code; only the success value is named here. */ | |
+typedef enum cudaError_enum { | |
+    CUDA_SUCCESS = 0 | |
+} CUresult; | |
+ | |
+/* Texture reference filtering modes. */ | |
+typedef enum CUfilter_mode_enum { | |
+    CU_TR_FILTER_MODE_POINT  = 0, /**< point filtering */ | |
+    CU_TR_FILTER_MODE_LINEAR = 1  /**< linear filtering */ | |
+} CUfilter_mode; | |
+ | |
+/* Element formats for arrays and texture-bound memory. */ | |
+typedef enum CUarray_format_enum { | |
+    CU_AD_FORMAT_UNSIGNED_INT8  = 0x01, /**< unsigned 8-bit integer */ | |
+    CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< unsigned 16-bit integer */ | |
+    CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< unsigned 32-bit integer */ | |
+    CU_AD_FORMAT_SIGNED_INT8    = 0x08, /**< signed 8-bit integer */ | |
+    CU_AD_FORMAT_SIGNED_INT16   = 0x09, /**< signed 16-bit integer */ | |
+    CU_AD_FORMAT_SIGNED_INT32   = 0x0a, /**< signed 32-bit integer */ | |
+    CU_AD_FORMAT_HALF           = 0x10, /**< 16-bit floating point */ | |
+    CU_AD_FORMAT_FLOAT          = 0x20  /**< 32-bit floating point */ | |
+} CUarray_format; | |
+ | |
+/* Array description handed to the cuTexRefSetAddress2D entry point. */ | |
+typedef struct CUDA_ARRAY_DESCRIPTOR_st | |
+{ | |
+    size_t Width;             /**< array width */ | |
+    size_t Height;            /**< array height */ | |
+ | |
+    CUarray_format Format;    /**< element format */ | |
+    unsigned int NumChannels; /**< channels per element */ | |
+} CUDA_ARRAY_DESCRIPTOR; | |
+ | |
+/* Kind of memory a copy source or destination lives in. */ | |
+typedef enum CUmemorytype_enum { | |
+    CU_MEMORYTYPE_HOST    = 0x01, /**< host memory */ | |
+    CU_MEMORYTYPE_DEVICE  = 0x02, /**< device memory */ | |
+    CU_MEMORYTYPE_ARRAY   = 0x03, /**< array memory */ | |
+    CU_MEMORYTYPE_UNIFIED = 0x04  /**< unified device or host memory */ | |
+} CUmemorytype; | |
+ | |
+/* Parameter block for the 2D copy entry points (cu_mem_cpy_2D / _async). */ | |
+typedef struct CUDA_MEMCPY2D_st { | |
+    /* source */ | |
+    size_t srcXInBytes;         /**< source X offset in bytes */ | |
+    size_t srcY;                /**< source Y offset */ | |
+ | |
+    CUmemorytype srcMemoryType; /**< source memory type (host, device, array) */ | |
+    const void *srcHost;        /**< source host pointer */ | |
+    CUdeviceptr srcDevice;      /**< source device pointer */ | |
+    CUarray srcArray;           /**< source array reference */ | |
+    size_t srcPitch;            /**< source pitch (ignored when src is array) */ | |
+ | |
+    /* destination */ | |
+    size_t dstXInBytes;         /**< destination X offset in bytes */ | |
+    size_t dstY;                /**< destination Y offset */ | |
+ | |
+    CUmemorytype dstMemoryType; /**< destination memory type (host, device, array) */ | |
+    void *dstHost;              /**< destination host pointer */ | |
+    CUdeviceptr dstDevice;      /**< destination device pointer */ | |
+    CUarray dstArray;           /**< destination array reference */ | |
+    size_t dstPitch;            /**< destination pitch (ignored when dst is array) */ | |
+ | |
+    /* extent */ | |
+    size_t WidthInBytes;        /**< width of the copy in bytes */ | |
+    size_t Height;              /**< height of the copy in rows */ | |
+} CUDA_MEMCPY2D; | |
+ | |
+ | |
+/* | |
+ * Function pointer types for the driver entry points resolved at run time. | |
+ */ | |
+ | |
+/* initialisation and device queries */ | |
+typedef CUresult(CUDAAPI *PCUINIT)(unsigned int Flags); | |
+typedef CUresult(CUDAAPI *PCUDEVICEGETCOUNT)(int *count); | |
+typedef CUresult(CUDAAPI *PCUDEVICEGET)(CUdevice *device, int ordinal); | |
+typedef CUresult(CUDAAPI *PCUDEVICEGETNAME)(char *name, int len, CUdevice dev); | |
+typedef CUresult(CUDAAPI *PCUDEVICECOMPUTECAPABILITY)(int *major, int *minor, CUdevice dev); | |
+ | |
+/* contexts */ | |
+typedef CUresult(CUDAAPI *PCUCTXCREATE)(CUcontext *pctx, unsigned int flags, CUdevice dev); | |
+typedef CUresult(CUDAAPI *PCUCTXPOPCURRENT)(CUcontext *pctx); | |
+typedef CUresult(CUDAAPI *PCUCTXDESTROY)(CUcontext ctx); | |
+ | |
+/* modules and texture references */ | |
+typedef CUresult(CUDAAPI *PCUMODULELOADDATA)(CUmodule *module, const void *image); | |
+typedef CUresult(CUDAAPI *PCUMODULEGETFUNCTION)(CUfunction *hfunc, CUmodule hmod, const char *name); | |
+typedef CUresult(CUDAAPI *PCUMODULEGETTEXREF)(CUtexref *pTexRef, CUmodule hmod, const char *name); | |
+typedef CUresult(CUDAAPI *PCUTEXREFSETFLAGS)(CUtexref hTexRef, unsigned int Flags); | |
+typedef CUresult(CUDAAPI *PCUTEXREFSETFILTERMODE)(CUtexref hTexRef, CUfilter_mode fm); | |
+typedef CUresult(CUDAAPI *PCUTEXREFSETADDRESS2D)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); | |
+ | |
+/* memory */ | |
+typedef CUresult(CUDAAPI *PCUMEMALLOCPITCH)(CUdeviceptr* dptr, size_t* pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes); | |
+typedef CUresult(CUDAAPI *PCUMEMCPY2D)(const CUDA_MEMCPY2D *pCopy); | |
+typedef CUresult(CUDAAPI *PCUMEMCPY2DASYNC)(const CUDA_MEMCPY2D *pCopy, CUstream hStream); | |
+typedef CUresult(CUDAAPI *PCUMEMFREE)(CUdeviceptr dptr); | |
+ | |
+/* kernel launch */ | |
+typedef CUresult(CUDAAPI *PCULAUNCHKERNEL)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); | |
+ | |
+ | |
+/* | |
+ * Table of driver entry points plus the handle of the library they were | |
+ * resolved from. Member order is relied upon by positional initialisers. | |
+ */ | |
+typedef struct CudaDynLoadFunctions{ | |
+    /* initialisation and device queries */ | |
+    PCUINIT cu_init; | |
+    PCUDEVICEGETCOUNT cu_device_get_count; | |
+    PCUDEVICEGET cu_device_get; | |
+    PCUDEVICEGETNAME cu_device_get_name; | |
+    PCUDEVICECOMPUTECAPABILITY cu_device_compute_capability; | |
+    /* contexts */ | |
+    PCUCTXCREATE cu_ctx_create; | |
+    PCUCTXPOPCURRENT cu_ctx_pop_current; | |
+    PCUCTXDESTROY cu_ctx_destroy; | |
+    /* modules and texture references */ | |
+    PCUMODULELOADDATA cu_module_load_data; | |
+    PCUMODULEGETFUNCTION cu_module_get_function; | |
+    PCUMODULEGETTEXREF cu_module_get_texref; | |
+    PCUTEXREFSETFLAGS cu_texref_set_flags; | |
+    PCUTEXREFSETFILTERMODE cu_texref_set_filtermode; | |
+    PCUTEXREFSETADDRESS2D cu_texref_set_address_2D; | |
+    /* memory and kernel launch */ | |
+    PCUMEMALLOCPITCH cu_mem_alloc_pitch; | |
+    PCUMEMCPY2D cu_mem_cpy_2D; | |
+    PCUMEMCPY2DASYNC cu_mem_cpy_2D_async; | |
+    PCUMEMFREE cu_mem_free; | |
+    PCULAUNCHKERNEL cu_launch_kernel; | |
+ | |
+    /* handle of the loaded driver library; NULL while not loaded */ | |
+#if defined(_WIN32) | |
+    HMODULE cuda_lib; | |
+#else | |
+    void* cuda_lib; | |
+#endif | |
+} CudaDynLoadFunctions; | |
+ | |
+/* | |
+ * Shared CUDA bookkeeping. Every array is indexed by device number and has | |
+ * MAX_NUM_GPU entries. Member order is relied upon by positional initialisers. | |
+ */ | |
+typedef struct CudaContext{ | |
+ | |
+    CUcontext cuda_context_arr[MAX_NUM_GPU];      /* shared context per device, NULL if none */ | |
+    unsigned int cuda_context_count[MAX_NUM_GPU]; /* reference count of that context */ | |
+    CudaDynLoadFunctions cuda_dload_funcs;        /* loaded driver entry points */ | |
+ | |
+    int device_count;                             /* number of enumerated devices */ | |
+    CUdevice cu_devices[MAX_NUM_GPU];             /* device handles */ | |
+    char gpu_name[MAX_NUM_GPU][128];              /* device names */ | |
+    int smver[MAX_NUM_GPU];                       /* compute capability, (major << 4) | minor */ | |
+} CudaContext; | |
+ | |
+/* 16-byte GUID used to tag ffnvinfo blocks. */ | |
+typedef struct _NVGUID { | |
+    uint32_t Data1; | |
+    uint16_t Data2; | |
+    uint16_t Data3; | |
+    uint8_t Data4[8]; | |
+} NVGUID; | |
+ | |
+/* | |
+ * Tagged block created by init_nvinfo() and recognised by check_nvinfo(). | |
+ * NOTE(review): dptr/linesize look like per-plane device pointers and strides, | |
+ * and vxCtx an external context handle -- confirm against the users. | |
+ */ | |
+typedef struct _ffnvinfo { | |
+    NVGUID guid;          /* must equal the GUID written by init_nvinfo() */ | |
+    uint32_t version;     /* layout version set by init_nvinfo() */ | |
+    //CUcontext cudaCtx; | |
+    void* vxCtx; | |
+    CUdeviceptr dptr[8]; | |
+    uint32_t linesize[8]; | |
+} ffnvinfo; | |
+ | |
+/* Library lifetime: both return/act on the shared, file-wide CUDA state. */ | |
+int init_cuda(void);    /* 1 on success, 0 on failure */ | |
+void deinit_cuda(void); /* unloads the library once no context is alive */ | |
+ | |
+/* Reference-counted per-device context; get returns CUDA_SUCCESS (0) on success. */ | |
+int get_cuda_context(CUcontext *ctx, int gpu); | |
+void release_cuda_context(CUcontext *ctx, int gpu); | |
+ | |
+/* 1 when device `gpu` has compute capability >= target_smver ((major << 4) | minor). */ | |
+int is_gpu_feature_available(int gpu, int target_smver); | |
+ | |
+/* ffnvinfo helpers: GUID check (1 = match) and allocation of a tagged block. */ | |
+int check_nvinfo(void* ptr); | |
+ffnvinfo* init_nvinfo(void); | |
+ | |
+/* Access to the table of loaded driver entry points. */ | |
+CudaDynLoadFunctions* get_cuda_dl_func(void); | |
+ | |
+/* 2D copy wrapper; width is in bytes, height in rows. */ | |
+CUresult cuMemCpy2d(const void *srcHost, CUdeviceptr srcDevice, size_t srcPitch, void *dstHost, CUdeviceptr dstDevice, size_t dstPitch, size_t width, size_t height, CUmemorytype srcMemoryType, CUmemorytype dstMemoryType); | |
diff -ruN ffmpeg-orig/libavutil/Makefile ffmpeg/libavutil/Makefile | |
--- ffmpeg-orig/libavutil/Makefile 2016-03-04 00:17:21.063188717 +0200 | |
+++ ffmpeg/libavutil/Makefile 2016-03-04 00:18:17.589433917 +0200 | |
@@ -21,6 +21,7 @@ | |
common.h \ | |
cpu.h \ | |
crc.h \ | |
+ cudautils.h \ | |
des.h \ | |
display.h \ | |
downmix_info.h \ | |
@@ -98,6 +99,7 @@ | |
color_utils.o \ | |
cpu.o \ | |
crc.o \ | |
+ cudautils.o \ | |
des.o \ | |
display.o \ | |
downmix_info.o \ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff -ruN ffmpeg-orig/configure ffmpeg/configure | |
--- ffmpeg-orig/configure 2016-03-04 00:17:20.652150576 +0200 | |
+++ ffmpeg/configure 2016-03-04 02:36:28.264332984 +0200 | |
@@ -278,6 +278,7 @@ | |
--enable-mmal enable decoding via MMAL [no] | |
--enable-netcdf enable NetCDF, needed for sofalizer filter [no] | |
--enable-nvenc enable NVIDIA NVENC support [no] | |
+ --enable-nvresize enable NVIDIA CUDA accelerated resizer [no] | |
--enable-openal enable OpenAL 1.1 capture support [no] | |
--enable-opencl enable OpenCL code | |
--enable-opengl enable OpenGL rendering [no] | |
@@ -1502,6 +1503,7 @@ | |
mmal | |
netcdf | |
nvenc | |
+ nvresize | |
openal | |
opencl | |
opengl | |
@@ -5325,6 +5327,7 @@ | |
frei0r_src_filter_extralibs='$ldl' | |
ladspa_filter_extralibs='$ldl' | |
nvenc_encoder_extralibs='$ldl' | |
+nvresize_filter_extralibs='$ldl' | |
if ! disabled network; then | |
check_func getaddrinfo $network_extralibs | |
diff -ruN ffmpeg-orig/libavfilter/allfilters.c ffmpeg/libavfilter/allfilters.c | |
--- ffmpeg-orig/libavfilter/allfilters.c 2016-03-04 00:17:21.071189460 +0200 | |
+++ ffmpeg/libavfilter/allfilters.c 2016-03-04 00:30:07.377997402 +0200 | |
@@ -225,6 +225,7 @@ | |
REGISTER_FILTER(NOFORMAT, noformat, vf); | |
REGISTER_FILTER(NOISE, noise, vf); | |
REGISTER_FILTER(NULL, null, vf); | |
+ REGISTER_FILTER(NVRESIZE, nvresize, vf); | |
REGISTER_FILTER(OCR, ocr, vf); | |
REGISTER_FILTER(OCV, ocv, vf); | |
REGISTER_FILTER(OVERLAY, overlay, vf); | |
diff -ruN ffmpeg-orig/libavfilter/Makefile ffmpeg/libavfilter/Makefile | |
--- ffmpeg-orig/libavfilter/Makefile 2016-03-04 00:17:21.079190202 +0200 | |
+++ ffmpeg/libavfilter/Makefile 2016-03-04 02:41:05.358919913 +0200 | |
@@ -204,6 +204,7 @@ | |
OBJS-$(CONFIG_NOFORMAT_FILTER) += vf_format.o | |
OBJS-$(CONFIG_NOISE_FILTER) += vf_noise.o | |
OBJS-$(CONFIG_NULL_FILTER) += vf_null.o | |
+OBJS-$(CONFIG_NVRESIZE_FILTER) += vf_nvresize.o vf_nvresize_ptx.o | |
OBJS-$(CONFIG_OCR_FILTER) += vf_ocr.o | |
OBJS-$(CONFIG_OCV_FILTER) += vf_libopencv.o | |
OBJS-$(CONFIG_OPENCL) += deshake_opencl.o unsharp_opencl.o | |
diff -ruN ffmpeg-orig/libavfilter/vf_nvresize.c ffmpeg/libavfilter/vf_nvresize.c | |
--- ffmpeg-orig/libavfilter/vf_nvresize.c 1970-01-01 03:00:00.000000000 +0300 | |
+++ ffmpeg/libavfilter/vf_nvresize.c 2016-03-04 02:27:15.262271696 +0200 | |
@@ -0,0 +1,669 @@ | |
+/* | |
+ * Copyright (c) 2011 Roger Pau Monné <[email protected]> | |
+ * Copyright (c) 2011 Stefano Sabatini | |
+ * Copyright (c) 2013 Paul B Mahol | |
+ * | |
+ * This file is part of FFmpeg. | |
+ * | |
+ * FFmpeg is free software; you can redistribute it and/or | |
+ * modify it under the terms of the GNU Lesser General Public | |
+ * License as published by the Free Software Foundation; either | |
+ * version 2.1 of the License, or (at your option) any later version. | |
+ * | |
+ * FFmpeg is distributed in the hope that it will be useful, | |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
+ * Lesser General Public License for more details. | |
+ * | |
+ * You should have received a copy of the GNU Lesser General Public | |
+ * License along with FFmpeg; if not, write to the Free Software | |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
+ */ | |
+ | |
+ | |
+#include "libavutil/avassert.h" | |
+#include "libavutil/avstring.h" | |
+#include "libavutil/eval.h" | |
+#include "libavutil/mathematics.h" | |
+#include "libavutil/opt.h" | |
+#include "libavutil/pixdesc.h" | |
+#include "libavutil/parseutils.h" | |
+#include "libavutil/cudautils.h" | |
+ | |
+#include "avfilter.h" | |
+#include "drawutils.h" | |
+#include "formats.h" | |
+#include "internal.h" | |
+#include "video.h" | |
+ | |
+/* Capacity of the outtex[] array and upper bound of the "outputs" option. */ | |
+#define MAX_OUTPUT 16 | |
+/* NOTE(review): presumably the CUDA thread-block dimensions used at launch -- confirm. */ | |
+#define BLOCKX 32 | |
+#define BLOCKY 16 | |
+/* Integer division of a by b, rounded up. */ | |
+#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) ) | |
+ | |
+/* One image buffer in device memory. */ | |
+typedef struct cu_tex { | |
+    int w;            /* width in pixels */ | |
+    int h;            /* height in pixels */ | |
+    size_t pitch;     /* row pitch as reported by the pitched allocation */ | |
+    CUdeviceptr dptr; /* device address of the buffer */ | |
+} cu_tex; | |
+ | |
+/* Private state of the nvresize filter. */ | |
+typedef struct NVResizeContext { | |
+    const AVClass *class; | |
+ | |
+    int nb_outputs;                  /* "outputs" option: number of output pads */ | |
+ | |
+    char *size_str;                  /* "size"/"s" option: '|'-separated list of sizes */ | |
+    int force_original_aspect_ratio; /* "force_original_aspect_ratio" option (0..2) */ | |
+    int readback_FB;                 /* "readback" option */ | |
+    int gpu;                         /* "gpu" option: device index */ | |
+ | |
+    int cuda_inited;                 /* NOTE(review): not touched in the visible code -- confirm use */ | |
+ | |
+    CUcontext cu_ctx;                /* context obtained from get_cuda_context() */ | |
+    CudaDynLoadFunctions* cu_dl_func;/* driver entry points */ | |
+    CUmodule cu_module;              /* module loaded from resize_ptx */ | |
+    /* kernels and texture references looked up in cu_module, one per element type */ | |
+    CUfunction cu_func_uchar; | |
+    CUfunction cu_func_uchar2; | |
+    CUfunction cu_func_uchar4; | |
+    CUtexref cu_tex_uchar; | |
+    CUtexref cu_tex_uchar2; | |
+    CUtexref cu_tex_uchar4; | |
+ | |
+    cu_tex intex;                    /* input image on the device */ | |
+    /** | |
+     * Output images. The w/h members carry the requested dimensions until | |
+     * config_output() resolves them. Special values handled there: | |
+     *   0 = original width/height | |
+     *  -1 = keep original aspect | |
+     *  -N = try to keep aspect but make sure it is divisible by N | |
+     */ | |
+    cu_tex outtex[MAX_OUTPUT]; | |
+ | |
+} NVResizeContext; | |
+ | |
+#define OFFSET(x) offsetof(NVResizeContext, x) | |
+#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM) | |
+ | |
+/* | |
+ * Filter options. Each entry after the default value is: min, max, flags | |
+ * [, unit]. The string options need explicit min and max placeholders, | |
+ * otherwise FLAGS ends up in the max slot and the flags field stays 0. | |
+ */ | |
+static const AVOption nvresize_options[] = { | |
+    { "outputs", "set number of outputs", OFFSET(nb_outputs), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, MAX_OUTPUT, FLAGS }, | |
+    { "readback", "read result back to FB", OFFSET(readback_FB), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, FLAGS }, | |
+    { "size", "set video size", OFFSET(size_str), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS }, | |
+    { "s", "set video size", OFFSET(size_str), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS }, | |
+    { "gpu", "Selects which NVENC capable GPU to use. First GPU is 0, second is 1, and so on.", OFFSET(gpu), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, FLAGS }, | |
+    { "force_original_aspect_ratio", "decrease or increase w/h if necessary to keep the original AR", OFFSET(force_original_aspect_ratio), AV_OPT_TYPE_INT, { .i64 = 0}, 0, 2, FLAGS, "force_oar" }, | |
+    { NULL } | |
+}; | |
+ | |
+AVFILTER_DEFINE_CLASS(nvresize); | |
+ | |
+/** | |
+ * Advertise the pixel formats the filter accepts on all of its links. | |
+ * | |
+ * @return result of ff_set_common_formats(), or AVERROR(ENOMEM) when the | |
+ *         format list cannot be allocated | |
+ */ | |
+static int query_formats(AVFilterContext *ctx) | |
+{ | |
+    static const enum AVPixelFormat supported[] = { | |
+        /* planar / semi-planar YUV */ | |
+        AV_PIX_FMT_YUV420P, | |
+        AV_PIX_FMT_YUV444P, | |
+        AV_PIX_FMT_NV12, | |
+        /* packed 4-byte RGB */ | |
+        AV_PIX_FMT_ARGB, | |
+        AV_PIX_FMT_RGBA, | |
+        AV_PIX_FMT_ABGR, | |
+        AV_PIX_FMT_BGRA, | |
+        AV_PIX_FMT_NONE, | |
+    }; | |
+    AVFilterFormats *formats = ff_make_format_list((const int*)supported); | |
+ | |
+    return formats ? ff_set_common_formats(ctx, formats) : AVERROR(ENOMEM); | |
+} | |
+ | |
+/** | |
+ * Configure one output link: resolve its final dimensions and sample aspect | |
+ * ratio, then allocate the pitched device buffer for its frames. | |
+ * | |
+ * The output index is taken from the pad name ("out<N>", set up in init()). | |
+ * | |
+ * @return 0 on success, AVERROR(EINVAL) when the dimensions overflow an int | |
+ *         or the link format is unsupported, AVERROR_EXTERNAL when the device | |
+ *         allocation fails | |
+ */ | |
+static int config_output(AVFilterLink *outlink) | |
+{ | |
+    AVFilterContext *ctx = outlink->src; | |
+    AVFilterLink *inlink = outlink->src->inputs[0]; | |
+    NVResizeContext *s = ctx->priv; | |
+ | |
+    int outIdx = atoi(outlink->srcpad->name + 3); /* skip the "out" prefix */ | |
+    cu_tex *out = &s->outtex[outIdx]; | |
+    int64_t w, h; | |
+    int factor_w, factor_h; | |
+    size_t alloc_w, alloc_h; | |
+ | |
+    w = out->w; | |
+    h = out->h; | |
+ | |
+    // Check if it is requested that the result has to be divisible by some | |
+    // factor (w or h = -n with n being the factor). | |
+    factor_w = 1; | |
+    factor_h = 1; | |
+    if (w < -1) { | |
+        factor_w = -w; | |
+    } | |
+    if (h < -1) { | |
+        factor_h = -h; | |
+    } | |
+ | |
+    // Both dimensions relative: fall back to the input size. | |
+    if (w < 0 && h < 0) | |
+        out->w = out->h = 0; | |
+ | |
+    if (!(w = out->w)) | |
+        w = inlink->w; | |
+    if (!(h = out->h)) | |
+        h = inlink->h; | |
+ | |
+    // Make sure that the result is divisible by the factor we determined | |
+    // earlier. If no factor was set, nothing will happen as the default | |
+    // factor is 1. | |
+    if (w < 0) | |
+        w = av_rescale(h, inlink->w, inlink->h * factor_w) * factor_w; | |
+    if (h < 0) | |
+        h = av_rescale(w, inlink->h, inlink->w * factor_h) * factor_h; | |
+ | |
+    // Note that force_original_aspect_ratio may overwrite the previously set | |
+    // dimensions so that they are not divisible by the set factors anymore. | |
+    if (s->force_original_aspect_ratio) { | |
+        int tmp_w = av_rescale(h, inlink->w, inlink->h); | |
+        int tmp_h = av_rescale(w, inlink->h, inlink->w); | |
+ | |
+        if (s->force_original_aspect_ratio == 1) { | |
+            w = FFMIN(tmp_w, w); | |
+            h = FFMIN(tmp_h, h); | |
+        } else { | |
+            w = FFMAX(tmp_w, w); | |
+            h = FFMAX(tmp_h, h); | |
+        } | |
+    } | |
+ | |
+    // Refuse sizes that would be truncated when stored in the int fields. | |
+    if (w > INT_MAX || h > INT_MAX || | |
+        (h * inlink->w) > INT_MAX || | |
+        (w * inlink->h) > INT_MAX) { | |
+        av_log(ctx, AV_LOG_ERROR, "Rescaled value for width or height is too big.\n"); | |
+        return AVERROR(EINVAL); | |
+    } | |
+ | |
+    out->w = outlink->w = w; | |
+    out->h = outlink->h = h; | |
+ | |
+    if (inlink->sample_aspect_ratio.num){ | |
+        outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h * inlink->w, outlink->w * inlink->h}, inlink->sample_aspect_ratio); | |
+    } else | |
+        outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; | |
+ | |
+    // Size of the single device allocation holding all planes of one frame. | |
+    switch(outlink->format) { | |
+    case AV_PIX_FMT_YUV420P: | |
+    case AV_PIX_FMT_NV12: | |
+        alloc_w = out->w; | |
+        alloc_h = out->h*3/2; | |
+        break; | |
+ | |
+    case AV_PIX_FMT_YUV444P: | |
+        alloc_w = out->w; | |
+        alloc_h = out->h*3; | |
+        break; | |
+ | |
+    case AV_PIX_FMT_ARGB: | |
+    case AV_PIX_FMT_RGBA: | |
+    case AV_PIX_FMT_ABGR: | |
+    case AV_PIX_FMT_BGRA: | |
+        alloc_w = out->w*4; | |
+        alloc_h = out->h; | |
+        break; | |
+ | |
+    default: | |
+        // Not expected: query_formats() only offers the formats above. | |
+        return AVERROR(EINVAL); | |
+    } | |
+ | |
+    // create output device memory | |
+    __cu(s->cu_dl_func->cu_mem_alloc_pitch(&out->dptr, &out->pitch, alloc_w, alloc_h, 16)); | |
+ | |
+    return 0; | |
+} | |
+ | |
+/** | |
+ * Filter initialisation: create the output pads, parse the '|'-separated | |
+ * size list, obtain the CUDA context and load the resize kernels and their | |
+ * texture references from the embedded PTX module. | |
+ * | |
+ * @return 0 on success, AVERROR(ENOMEM) / AVERROR(EINVAL) for allocation or | |
+ *         option errors, AVERROR_EXTERNAL when any CUDA step fails | |
+ */ | |
+static av_cold int init(AVFilterContext *ctx) | |
+{ | |
+    extern char resize_ptx[]; | |
+    NVResizeContext *s = ctx->priv; | |
+    int ret; | |
+    int i, j; | |
+    int count = 0; | |
+ | |
+    // one output pad per requested output, named "out<N>" | |
+    for (i = 0; i < s->nb_outputs; i++) { | |
+        char name[32]; | |
+        AVFilterPad pad = { 0 }; | |
+ | |
+        snprintf(name, sizeof(name), "out%d", i); | |
+        pad.type = ctx->filter->inputs[0].type; | |
+        pad.name = av_strdup(name); | |
+        pad.config_props = config_output; | |
+        if (!pad.name) | |
+            return AVERROR(ENOMEM); | |
+ | |
+        ff_insert_outpad(ctx, i, &pad); | |
+    } | |
+ | |
+    // parse size parameters here; the option string is split in place | |
+    if (s->size_str) { | |
+        char *head = s->size_str; | |
+ | |
+        for (;;) { | |
+            char *sep = strchr(head, '|'); | |
+ | |
+            // outtex[] has room for MAX_OUTPUT entries only | |
+            if (count >= MAX_OUTPUT) { | |
+                av_log(ctx, AV_LOG_ERROR, "Too many sizes given, at most %d are supported\n", MAX_OUTPUT); | |
+                return AVERROR(EINVAL); | |
+            } | |
+ | |
+            if (sep) | |
+                *sep = 0; | |
+            if ((ret = av_parse_video_size(&s->outtex[count].w, &s->outtex[count].h, head)) < 0) { | |
+                av_log(ctx, AV_LOG_ERROR, "Invalid size '%s'\n", head); | |
+                return ret; | |
+            } | |
+            count++; | |
+ | |
+            if (!sep) | |
+                break; | |
+            head = sep + 1; | |
+        } | |
+    } | |
+ | |
+    // sort the outputs by descending width | |
+    for (i = 0; i < count; i++) { | |
+        for (j = i+1; j < count; j++) { | |
+            int tempH, tempW; | |
+            if (s->outtex[i].w < s->outtex[j].w) { | |
+                tempW = s->outtex[i].w; tempH = s->outtex[i].h; | |
+                s->outtex[i].w = s->outtex[j].w; s->outtex[i].h = s->outtex[j].h; | |
+                s->outtex[j].w = tempW; s->outtex[j].h = tempH; | |
+            } | |
+        } | |
+    } | |
+ | |
+    // fewer sizes than outputs: move the given sizes to the last outputs and | |
+    // leave the leading ones at 0 (= input size, see config_output) | |
+    if (count < s->nb_outputs) { | |
+        int offset = s->nb_outputs - count; | |
+        for (i = s->nb_outputs-1; i >= offset; i--) { | |
+            s->outtex[i].w = s->outtex[i-offset].w; | |
+            s->outtex[i].h = s->outtex[i-offset].h; | |
+        } | |
+        for (i = 0; i < offset; i++) { | |
+            s->outtex[i].w = s->outtex[i].h = 0; | |
+        } | |
+    } | |
+ | |
+    // the per-device tables in cudautils hold MAX_NUM_GPU entries | |
+    if (s->gpu >= MAX_NUM_GPU) { | |
+        av_log(ctx, AV_LOG_ERROR, "Invalid gpu index %d, must be below %d\n", s->gpu, MAX_NUM_GPU); | |
+        return AVERROR(EINVAL); | |
+    } | |
+ | |
+    // init cuda_context | |
+    if (!s->cu_ctx) { | |
+        if (!init_cuda()) { | |
+            av_log(ctx, AV_LOG_ERROR, "Failed to initialize CUDA\n"); | |
+            return AVERROR_EXTERNAL; | |
+        } | |
+        if (get_cuda_context(&s->cu_ctx, s->gpu) != CUDA_SUCCESS || !s->cu_ctx) { | |
+            av_log(ctx, AV_LOG_ERROR, "Failed to get a CUDA context for gpu %d\n", s->gpu); | |
+            return AVERROR_EXTERNAL; | |
+        } | |
+    } | |
+    s->cu_dl_func = get_cuda_dl_func(); | |
+ | |
+    __cu(s->cu_dl_func->cu_module_load_data(&s->cu_module, resize_ptx)); | |
+ | |
+    // load functions | |
+    __cu(s->cu_dl_func->cu_module_get_function(&s->cu_func_uchar, s->cu_module, "Subsample_Bilinear_uchar")); | |
+    __cu(s->cu_dl_func->cu_module_get_function(&s->cu_func_uchar2, s->cu_module, "Subsample_Bilinear_uchar2")); | |
+    __cu(s->cu_dl_func->cu_module_get_function(&s->cu_func_uchar4, s->cu_module, "Subsample_Bilinear_uchar4")); | |
+ | |
+    // look up the texture references used by those functions | |
+    __cu(s->cu_dl_func->cu_module_get_texref(&s->cu_tex_uchar, s->cu_module, "uchar_tex")); | |
+    __cu(s->cu_dl_func->cu_module_get_texref(&s->cu_tex_uchar2, s->cu_module, "uchar2_tex")); | |
+    __cu(s->cu_dl_func->cu_module_get_texref(&s->cu_tex_uchar4, s->cu_module, "uchar4_tex")); | |
+ | |
+    // integer reads with linear filtering on every texture reference | |
+    __cu(s->cu_dl_func->cu_texref_set_flags(s->cu_tex_uchar, CU_TRSF_READ_AS_INTEGER)); | |
+    __cu(s->cu_dl_func->cu_texref_set_flags(s->cu_tex_uchar2, CU_TRSF_READ_AS_INTEGER)); | |
+    __cu(s->cu_dl_func->cu_texref_set_flags(s->cu_tex_uchar4, CU_TRSF_READ_AS_INTEGER)); | |
+    __cu(s->cu_dl_func->cu_texref_set_filtermode(s->cu_tex_uchar, CU_TR_FILTER_MODE_LINEAR)); | |
+    __cu(s->cu_dl_func->cu_texref_set_filtermode(s->cu_tex_uchar2, CU_TR_FILTER_MODE_LINEAR)); | |
+    __cu(s->cu_dl_func->cu_texref_set_filtermode(s->cu_tex_uchar4, CU_TR_FILTER_MODE_LINEAR)); | |
+ | |
+    return 0; | |
+} | |
+ | |
+ /** | |
+  * Upload the planes of a host-memory frame into one device buffer. | |
+  * | |
+  * The planes are laid out back to back behind dst->dptr: the luma plane (or | |
+  * the single packed 32-bit plane) first, chroma starting dst->pitch * dst->h | |
+  * bytes later. The frame must have exactly the size recorded in dst; this is | |
+  * enforced with av_assert0. | |
+  * | |
+  * @param s   filter context (not referenced by this function) | |
+  * @param src frame whose data[]/linesize[] are handed over as host memory | |
+  * @param dst device buffer descriptor (dptr, pitch, w, h) | |
+  * @return 0 on success, -1 when src->format is not handled here | |
+  * | |
+  * NOTE(review): cuMemCpy2d and __cu are defined outside this function. The | |
+  * argument order (source host/device/pitch, destination host/device/pitch, | |
+  * width in bytes, rows, memory types) is inferred from the calls - confirm. | |
+  */ | |
+ static int copy_from_avframe(NVResizeContext *s, AVFrame* src, cu_tex* dst) | |
+ { | |
+     const int fmt = src->format; | |
+ | |
+     av_assert0(src->width == dst->w && src->height == dst->h); | |
+ | |
+     if (fmt == AV_PIX_FMT_YUV420P) { | |
+         // luma plane, one byte per pixel | |
+         __cu(cuMemCpy2d(src->data[0], (CUdeviceptr)NULL, src->linesize[0], | |
+                         NULL, dst->dptr, dst->pitch, | |
+                         src->width, src->height, | |
+                         CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+         // first chroma plane: half width, half height, rows pitch/2 apart | |
+         __cu(cuMemCpy2d(src->data[1], (CUdeviceptr)NULL, src->linesize[1], | |
+                         NULL, dst->dptr + dst->pitch*dst->h, dst->pitch / 2, | |
+                         src->width / 2, src->height / 2, | |
+                         CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+         // second chroma plane, placed at 5/4 of the luma plane size | |
+         __cu(cuMemCpy2d(src->data[2], (CUdeviceptr)NULL, src->linesize[2], | |
+                         NULL, dst->dptr + dst->pitch*dst->h * 5 / 4, dst->pitch / 2, | |
+                         src->width / 2, src->height / 2, | |
+                         CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+     } else if (fmt == AV_PIX_FMT_YUV444P) { | |
+         // three full-resolution planes stacked one after another | |
+         __cu(cuMemCpy2d(src->data[0], (CUdeviceptr)NULL, src->linesize[0], | |
+                         NULL, dst->dptr, dst->pitch, | |
+                         src->width, src->height, | |
+                         CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+         __cu(cuMemCpy2d(src->data[1], (CUdeviceptr)NULL, src->linesize[1], | |
+                         NULL, dst->dptr + dst->pitch*dst->h, dst->pitch, | |
+                         src->width, src->height, | |
+                         CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+         __cu(cuMemCpy2d(src->data[2], (CUdeviceptr)NULL, src->linesize[2], | |
+                         NULL, dst->dptr + dst->pitch*dst->h * 2, dst->pitch, | |
+                         src->width, src->height, | |
+                         CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+     } else if (fmt == AV_PIX_FMT_NV12) { | |
+         // luma plane | |
+         __cu(cuMemCpy2d(src->data[0], (CUdeviceptr)NULL, src->linesize[0], | |
+                         NULL, dst->dptr, dst->pitch, | |
+                         src->width, src->height, | |
+                         CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+         // interleaved chroma plane: full row width in bytes, half the rows | |
+         __cu(cuMemCpy2d(src->data[1], (CUdeviceptr)NULL, src->linesize[1], | |
+                         NULL, dst->dptr + dst->pitch*dst->h, dst->pitch, | |
+                         src->width, src->height / 2, | |
+                         CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+     } else if (fmt == AV_PIX_FMT_ARGB || fmt == AV_PIX_FMT_RGBA || | |
+                fmt == AV_PIX_FMT_ABGR || fmt == AV_PIX_FMT_BGRA) { | |
+         // single packed plane, four bytes per pixel | |
+         __cu(cuMemCpy2d(src->data[0], (CUdeviceptr)NULL, src->linesize[0], | |
+                         NULL, dst->dptr, dst->pitch, | |
+                         src->width * 4, src->height, | |
+                         CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); | |
+     } else { | |
+         av_log(NULL, AV_LOG_FATAL, "Unsupported input format: %s!\n", av_get_pix_fmt_name(fmt)); | |
+         return -1; | |
+     } | |
+ | |
+     return 0; | |
+ } | |
+ | |
+ /** | |
+  * Read a device buffer back into the host planes of a frame. | |
+  * | |
+  * Mirror image of the upload path: plane offsets inside the device buffer are | |
+  * derived from src->pitch and src->h, while the number of bytes and rows | |
+  * copied comes from dst->width and dst->height. No size check is performed | |
+  * (the original assertion comparing src and dst sizes is disabled). | |
+  * | |
+  * @param s   filter context (not referenced by this function) | |
+  * @param src device buffer descriptor (dptr, pitch, h) | |
+  * @param dst frame whose data[]/linesize[] receive the pixels | |
+  * @return 0 on success, -1 when dst->format is not handled here | |
+  * | |
+  * NOTE(review): cuMemCpy2d and __cu are defined outside this function; their | |
+  * exact contract is inferred from the call sites - confirm. | |
+  */ | |
+ static int copy_to_avframe(NVResizeContext* s, cu_tex* src, AVFrame* dst) | |
+ { | |
+     const int fmt = dst->format; | |
+ | |
+     if (fmt == AV_PIX_FMT_YUV420P) { | |
+         // luma plane | |
+         __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, | |
+                         dst->data[0], (CUdeviceptr)NULL, dst->linesize[0], | |
+                         dst->width, dst->height, | |
+                         CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST)); | |
+         // first chroma plane: half width, half height, rows pitch/2 apart | |
+         __cu(cuMemCpy2d(NULL, src->dptr + src->pitch*src->h, src->pitch / 2, | |
+                         dst->data[1], (CUdeviceptr)NULL, dst->linesize[1], | |
+                         dst->width / 2, dst->height / 2, | |
+                         CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST)); | |
+         // second chroma plane, found at 5/4 of the luma plane size | |
+         __cu(cuMemCpy2d(NULL, src->dptr + src->pitch*src->h * 5 / 4, src->pitch / 2, | |
+                         dst->data[2], (CUdeviceptr)NULL, dst->linesize[2], | |
+                         dst->width / 2, dst->height / 2, | |
+                         CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST)); | |
+     } else if (fmt == AV_PIX_FMT_YUV444P) { | |
+         // three full-resolution planes stacked one after another | |
+         __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, | |
+                         dst->data[0], (CUdeviceptr)NULL, dst->linesize[0], | |
+                         dst->width, dst->height, | |
+                         CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST)); | |
+         __cu(cuMemCpy2d(NULL, src->dptr + src->pitch*src->h, src->pitch, | |
+                         dst->data[1], (CUdeviceptr)NULL, dst->linesize[1], | |
+                         dst->width, dst->height, | |
+                         CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST)); | |
+         __cu(cuMemCpy2d(NULL, src->dptr + src->pitch*src->h * 2, src->pitch, | |
+                         dst->data[2], (CUdeviceptr)NULL, dst->linesize[2], | |
+                         dst->width, dst->height, | |
+                         CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST)); | |
+     } else if (fmt == AV_PIX_FMT_NV12) { | |
+         // luma plane | |
+         __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, | |
+                         dst->data[0], (CUdeviceptr)NULL, dst->linesize[0], | |
+                         dst->width, dst->height, | |
+                         CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST)); | |
+         // interleaved chroma plane: full row width in bytes, half the rows | |
+         __cu(cuMemCpy2d(NULL, src->dptr + src->pitch*src->h, src->pitch, | |
+                         dst->data[1], (CUdeviceptr)NULL, dst->linesize[1], | |
+                         dst->width, dst->height / 2, | |
+                         CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST)); | |
+     } else if (fmt == AV_PIX_FMT_ARGB || fmt == AV_PIX_FMT_RGBA || | |
+                fmt == AV_PIX_FMT_ABGR || fmt == AV_PIX_FMT_BGRA) { | |
+         // single packed plane, four bytes per pixel | |
+         __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, | |
+                         dst->data[0], (CUdeviceptr)NULL, dst->linesize[0], | |
+                         dst->width * 4, dst->height, | |
+                         CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST)); | |
+     } else { | |
+         av_log(NULL, AV_LOG_FATAL, "Unsupported output format: %s!\n", av_get_pix_fmt_name(fmt)); | |
+         return -1; | |
+     } | |
+ | |
+     return 0; | |
+ } | |
+ | |
+ /** | |
+  * Bind one source plane to a texture reference and launch a resize kernel | |
+  * that writes the scaled plane to dst_dptr. | |
+  * | |
+  * The kernel receives six parameters, in this order: destination pointer, | |
+  * destination width, destination height, destination pitch, source width, | |
+  * source height. One thread is launched per destination element, in blocks | |
+  * of BLOCKX x BLOCKY threads, with the grid rounded up by DIV_UP. | |
+  * | |
+  * @param dl_func   table of dynamically loaded CUDA entry points | |
+  * @param func      kernel to launch | |
+  * @param tex       texture reference the kernel samples from | |
+  * @param channels  number of 8-bit channels per texture element | |
+  * @param src_dptr, src_width, src_height, src_pitch  source plane | |
+  * @param dst_dptr, dst_width, dst_height, dst_pitch  destination plane | |
+  * @return 0 once both calls have been issued | |
+  * | |
+  * NOTE(review): __cu is defined outside this function; what it does when a | |
+  * call fails cannot be seen from here - confirm. | |
+  */ | |
+ static int call_resize_kernel(CudaDynLoadFunctions* dl_func, CUfunction func, CUtexref tex, int channels, | |
+                               CUdeviceptr src_dptr, int src_width, int src_height, int src_pitch, | |
+                               CUdeviceptr dst_dptr, int dst_width, int dst_height, int dst_pitch) | |
+ { | |
+     CUDA_ARRAY_DESCRIPTOR desc; | |
+     void *args_uchar[6]; | |
+ | |
+     // kernel parameter list: addresses of the by-value arguments | |
+     args_uchar[0] = &dst_dptr; | |
+     args_uchar[1] = &dst_width; | |
+     args_uchar[2] = &dst_height; | |
+     args_uchar[3] = &dst_pitch; | |
+     args_uchar[4] = &src_width; | |
+     args_uchar[5] = &src_height; | |
+ | |
+     // describe the source plane: 8-bit unsigned elements, `channels` per texel | |
+     desc.Format      = CU_AD_FORMAT_UNSIGNED_INT8; | |
+     desc.NumChannels = channels; | |
+     desc.Width       = src_width; | |
+     desc.Height      = src_height; | |
+ | |
+     __cu(dl_func->cu_texref_set_address_2D(tex, &desc, src_dptr, src_pitch)); | |
+ | |
+     __cu(dl_func->cu_launch_kernel(func, DIV_UP(dst_width, BLOCKX), DIV_UP(dst_height, BLOCKY), 1, | |
+                                    BLOCKX, BLOCKY, 1, 0, NULL, args_uchar, NULL)); | |
+ | |
+     return 0; | |
+ } | |
+ | |
+ /** | |
+  * Produce dst from src on the GPU for the given pixel format. | |
+  * | |
+  * When source and destination have the same size the data is copied with a | |
+  * single device-to-device 2D copy; otherwise call_resize_kernel() is run | |
+  * once per plane. Plane offsets follow the packed layout used throughout | |
+  * this file (chroma starts pitch * h bytes after the start of the buffer). | |
+  * | |
+  * Unlike the previous version, the result of every call_resize_kernel() | |
+  * invocation is checked: the first failure stops the remaining planes and | |
+  * is returned to the caller instead of being reported as success. | |
+  * | |
+  * @param s      filter context holding the loaded kernels and texture refs | |
+  * @param src    source buffer descriptor | |
+  * @param dst    destination buffer descriptor | |
+  * @param format AVPixelFormat of both buffers | |
+  * @return 0 on success, -1 for an unsupported format, otherwise the non-zero | |
+  *         value reported by call_resize_kernel() | |
+  */ | |
+ static int do_cuda_resize(NVResizeContext *s, cu_tex* src, cu_tex* dst, int format) | |
+ { | |
+     int ret = 0; | |
+ | |
+     switch (format) { | |
+     case AV_PIX_FMT_YUV420P: | |
+         // the plain copy treats the whole buffer as pitch-wide rows, which is | |
+         // only valid when both buffers share the same pitch | |
+         if (src->w == dst->w && src->h == dst->h && src->pitch == dst->pitch) { | |
+             __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, NULL, dst->dptr, dst->pitch, src->pitch, src->h*3/2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE)); | |
+         } | |
+         else { | |
+             // luma plane | |
+             ret = call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1, | |
+                                      src->dptr, src->w, src->h, src->pitch, | |
+                                      dst->dptr, dst->w, dst->h, dst->pitch); | |
+             // first chroma plane: half resolution, half pitch | |
+             if (!ret) | |
+                 ret = call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1, | |
+                                          src->dptr+src->pitch*src->h, src->w/2, src->h/2, src->pitch/2, | |
+                                          dst->dptr+dst->pitch*dst->h, dst->w/2, dst->h/2, dst->pitch/2); | |
+             // second chroma plane | |
+             if (!ret) | |
+                 ret = call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1, | |
+                                          src->dptr+src->pitch*src->h*5/4, src->w/2, src->h/2, src->pitch/2, | |
+                                          dst->dptr+dst->pitch*dst->h*5/4, dst->w/2, dst->h/2, dst->pitch/2); | |
+         } | |
+         break; | |
+ | |
+     case AV_PIX_FMT_YUV444P: | |
+         if (src->w == dst->w && src->h == dst->h) { | |
+             __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, NULL, dst->dptr, dst->pitch, src->w, src->h*3, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE)); | |
+         } | |
+         else { | |
+             // three full-resolution planes, each pitch * h bytes apart | |
+             ret = call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1, | |
+                                      src->dptr, src->w, src->h, src->pitch, | |
+                                      dst->dptr, dst->w, dst->h, dst->pitch); | |
+             if (!ret) | |
+                 ret = call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1, | |
+                                          src->dptr+src->pitch*src->h, src->w, src->h, src->pitch, | |
+                                          dst->dptr+dst->pitch*dst->h, dst->w, dst->h, dst->pitch); | |
+             if (!ret) | |
+                 ret = call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1, | |
+                                          src->dptr+src->pitch*src->h*2, src->w, src->h, src->pitch, | |
+                                          dst->dptr+dst->pitch*dst->h*2, dst->w, dst->h, dst->pitch); | |
+         } | |
+         break; | |
+ | |
+     case AV_PIX_FMT_NV12: | |
+         if (src->w == dst->w && src->h == dst->h) { | |
+             __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, NULL, dst->dptr, dst->pitch, src->w, src->h*3/2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE)); | |
+         } | |
+         else { | |
+             // luma plane | |
+             ret = call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1, | |
+                                      src->dptr, src->w, src->h, src->pitch, | |
+                                      dst->dptr, dst->w, dst->h, dst->pitch); | |
+             // interleaved chroma: two-channel elements, so the destination | |
+             // pitch is passed in elements (pitch/2) | |
+             if (!ret) | |
+                 ret = call_resize_kernel(s->cu_dl_func, s->cu_func_uchar2, s->cu_tex_uchar2, 2, | |
+                                          src->dptr+src->pitch*src->h, src->w/2, src->h/2, src->pitch, | |
+                                          dst->dptr+dst->pitch*dst->h, dst->w/2, dst->h/2, dst->pitch/2); | |
+         } | |
+         break; | |
+ | |
+     case AV_PIX_FMT_ARGB: | |
+     case AV_PIX_FMT_RGBA: | |
+     case AV_PIX_FMT_ABGR: | |
+     case AV_PIX_FMT_BGRA: | |
+         if (src->w == dst->w && src->h == dst->h) { | |
+             __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, NULL, dst->dptr, dst->pitch, src->w*4, src->h, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE)); | |
+         } | |
+         else { | |
+             // four-channel elements: destination pitch passed in elements | |
+             ret = call_resize_kernel(s->cu_dl_func, s->cu_func_uchar4, s->cu_tex_uchar4, 4, | |
+                                      src->dptr, src->w, src->h, src->pitch, | |
+                                      dst->dptr, dst->w, dst->h, dst->pitch/4); | |
+         } | |
+         break; | |
+ | |
+     default: | |
+         av_log(NULL, AV_LOG_FATAL, "Unsupported input format: %s!\n", av_get_pix_fmt_name(format)); | |
+         return -1; | |
+     } | |
+ | |
+     return ret; | |
+ } | |
+ | |
+ /** | |
+  * Pick the buffer that should be used as the source when scaling to target. | |
+  * | |
+  * A source is considered suitable when it is less than four times as wide | |
+  * as the target. The candidates are tried in this order: | |
+  *  - no previous source at all: the input buffer s->intex; | |
+  *  - the previous source, if suitable; | |
+  *  - the outputs preceding target in s->outtex, nearest first; | |
+  *  - otherwise a warning is logged and the entry right before target is | |
+  *    used (or source itself when target is the first entry). | |
+  * | |
+  * @param s      filter context | |
+  * @param source source used for the previous output, or NULL for the first | |
+  * @param target destination buffer; must point into s->outtex because its | |
+  *               index is obtained by pointer subtraction | |
+  * @return the buffer to resize from (never NULL) | |
+  */ | |
+ static cu_tex* find_resize_src(NVResizeContext* s, cu_tex* source, cu_tex* target) | |
+ { | |
+     int offset; | |
+     int i; | |
+     cu_tex* src; | |
+ | |
+     if (source == NULL) { | |
+         return &s->intex; | |
+     } | |
+ | |
+     if (target->w * 4 > source->w) { | |
+         return source; | |
+     } | |
+ | |
+     // index of target inside s->outtex | |
+     offset = target - s->outtex; | |
+     for (i = offset - 1; i >= 0; i--) { | |
+         if (target->w * 4 > s->outtex[i].w) { | |
+             return &s->outtex[i]; | |
+         } | |
+     } | |
+ | |
+     // nothing within the 4x limit: fall back and tell the user about it | |
+     src = (offset == 0 ? source : &s->outtex[offset-1]); | |
+     av_log(NULL, AV_LOG_WARNING, "Output resolution %dx%d differs too much from the previous level %dx%d, " | |
+            "might cause artifacts\n", target->w, target->h, src->w, src->h); | |
+ | |
+     return src; | |
+ } | |
+ | |
+ /** | |
+  * Input pad callback: get the frame onto the GPU, scale it once per active | |
+  * output and pass each result downstream. | |
+  * | |
+  * If in->opaque carries a valid ffnvinfo with a device pointer, that device | |
+  * memory is used directly as the source. Otherwise the frame is uploaded | |
+  * into s->intex, which is (re)allocated whenever the frame size changes. | |
+  * Every output frame gets a freshly created ffnvinfo describing the device | |
+  * planes of its resized buffer; the pixels are additionally read back into | |
+  * the frame when s->readback_FB is set. | |
+  * | |
+  * Fixes relative to the previous version: | |
+  *  - the frame width was compared with s->intex.h instead of s->intex.w; | |
+  *  - the "buffer exists" test was inverted, so a NULL pointer was freed and | |
+  *    an existing buffer was never replaced on a size change; | |
+  *  - `in` leaked when the pixel format was rejected; | |
+  *  - failures of the helpers and of ff_filter_frame() were swallowed. | |
+  * | |
+  * @param inlink link the frame arrived on | |
+  * @param in     input frame; ownership is taken, it is freed before return | |
+  * @return 0 on success, a negative error code otherwise | |
+  * | |
+  * NOTE(review): __cu is defined elsewhere; if it returns from the function | |
+  * on failure, `in` is not released on those paths - confirm. | |
+  * NOTE(review): after a frame that supplied its own device pointer, | |
+  * s->intex.dptr refers to memory this filter did not allocate; a later | |
+  * frame without one and with a different size would free it - confirm that | |
+  * the two kinds of input are never mixed on one link. | |
+  */ | |
+ static int filter_frame(AVFilterLink *inlink, AVFrame *in) | |
+ { | |
+     AVFilterContext *ctx = inlink->dst; | |
+     NVResizeContext *s = ctx->priv; | |
+     cu_tex* resize_src = NULL; | |
+     ffnvinfo* info; | |
+     int i; | |
+     int ret = 0; | |
+ | |
+     // copy input to gpu | |
+     if (in->opaque && check_nvinfo(in->opaque) && ((ffnvinfo*)(in->opaque))->dptr[0]) { | |
+         // the frame already lives in device memory: reference it in place | |
+         info = (ffnvinfo*)in->opaque; | |
+         s->intex.dptr = info->dptr[0]; | |
+         s->intex.pitch = info->linesize[0]; | |
+         s->intex.w = in->width; | |
+         s->intex.h = in->height; | |
+     } | |
+     else { | |
+         // drop the staging buffer when the frame size no longer matches it | |
+         if (in->width != s->intex.w || in->height != s->intex.h) { | |
+             if (s->intex.dptr) { | |
+                 __cu(s->cu_dl_func->cu_mem_free(s->intex.dptr)); | |
+             } | |
+             s->intex.w = in->width; | |
+             s->intex.h = in->height; | |
+             s->intex.dptr = (CUdeviceptr)NULL; | |
+         } | |
+         if (!s->intex.dptr) { | |
+             // one allocation holds all planes; its height depends on the format | |
+             switch (in->format) { | |
+             case AV_PIX_FMT_YUV420P: | |
+             case AV_PIX_FMT_NV12: | |
+                 __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->intex.dptr, &s->intex.pitch, s->intex.w, s->intex.h*3/2, 16)); | |
+                 break; | |
+             case AV_PIX_FMT_YUV444P: | |
+                 __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->intex.dptr, &s->intex.pitch, s->intex.w, s->intex.h*3, 16)); | |
+                 break; | |
+             case AV_PIX_FMT_ARGB: | |
+             case AV_PIX_FMT_RGBA: | |
+             case AV_PIX_FMT_ABGR: | |
+             case AV_PIX_FMT_BGRA: | |
+                 __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->intex.dptr, &s->intex.pitch, s->intex.w*4, s->intex.h, 16)); | |
+                 break; | |
+             default: | |
+                 av_log(ctx, AV_LOG_FATAL, "Unsupported input format: %s!\n", av_get_pix_fmt_name(in->format)); | |
+                 av_frame_free(&in); | |
+                 return AVERROR(EINVAL); | |
+             } | |
+         } | |
+         ret = copy_from_avframe(s, in, &s->intex); | |
+         if (ret) { | |
+             av_frame_free(&in); | |
+             // helpers may report failure with a non-negative value | |
+             return ret < 0 ? ret : AVERROR_EXTERNAL; | |
+         } | |
+     } | |
+ | |
+     for (i = 0; i < ctx->nb_outputs; i++) { | |
+         AVFrame *out; | |
+ | |
+         // skip outputs whose link already has a status set | |
+         if (ctx->outputs[i]->status) | |
+             continue; | |
+ | |
+         out = ff_get_video_buffer(ctx->outputs[i], ctx->outputs[i]->w, ctx->outputs[i]->h); | |
+         if (!out) { | |
+             ret = AVERROR(ENOMEM); | |
+             break; | |
+         } | |
+         av_frame_copy_props(out, in); | |
+ | |
+         // scale on the GPU, then optionally read the result back; both are | |
+         // done before the ffnvinfo is created so that a failure only has | |
+         // the frame itself to release | |
+         resize_src = find_resize_src(s, resize_src, &s->outtex[i]); | |
+         ret = do_cuda_resize(s, resize_src, &s->outtex[i], in->format); | |
+         if (!ret && s->readback_FB) | |
+             ret = copy_to_avframe(s, &s->outtex[i], out); | |
+         if (ret) { | |
+             av_frame_free(&out); | |
+             ret = ret < 0 ? ret : AVERROR_EXTERNAL; | |
+             break; | |
+         } | |
+ | |
+         // NOTE(review): init_nvinfo() is defined elsewhere; the NULL guard | |
+         // assumes it reports failure that way - confirm | |
+         info = init_nvinfo(); | |
+         if (!info) { | |
+             av_frame_free(&out); | |
+             ret = AVERROR(ENOMEM); | |
+             break; | |
+         } | |
+ | |
+         // publish the device plane pointers of the resized buffer | |
+         switch (out->format) { | |
+         case AV_PIX_FMT_YUV444P: | |
+             info->dptr[0] = s->outtex[i].dptr; | |
+             info->dptr[1] = s->outtex[i].dptr + s->outtex[i].pitch*s->outtex[i].h; | |
+             info->dptr[2] = s->outtex[i].dptr + s->outtex[i].pitch*s->outtex[i].h*2; | |
+             info->linesize[0] = info->linesize[1] = info->linesize[2] = s->outtex[i].pitch; | |
+             break; | |
+ | |
+         case AV_PIX_FMT_YUV420P: | |
+             info->dptr[0] = s->outtex[i].dptr; | |
+             info->dptr[1] = s->outtex[i].dptr + s->outtex[i].pitch*s->outtex[i].h; | |
+             info->dptr[2] = s->outtex[i].dptr + s->outtex[i].pitch*s->outtex[i].h*5/4; | |
+             info->linesize[0] = s->outtex[i].pitch; | |
+             info->linesize[1] = info->linesize[2] = s->outtex[i].pitch/2; | |
+             break; | |
+ | |
+         case AV_PIX_FMT_NV12: | |
+             info->dptr[0] = s->outtex[i].dptr; | |
+             info->dptr[1] = s->outtex[i].dptr + s->outtex[i].pitch*s->outtex[i].h; | |
+             info->linesize[0] = info->linesize[1] = s->outtex[i].pitch; | |
+             break; | |
+ | |
+         case AV_PIX_FMT_ARGB: | |
+         case AV_PIX_FMT_RGBA: | |
+         case AV_PIX_FMT_ABGR: | |
+         case AV_PIX_FMT_BGRA: | |
+             info->dptr[0] = s->outtex[i].dptr; | |
+             info->linesize[0] = s->outtex[i].pitch; | |
+             break; | |
+ | |
+         default: | |
+             break; | |
+         } | |
+         out->opaque = (void*)info; | |
+ | |
+         ret = ff_filter_frame(ctx->outputs[i], out); | |
+         if (ret < 0) | |
+             break; | |
+     } | |
+ | |
+     av_frame_free(&in); | |
+     return ret < 0 ? ret : 0; | |
+ } | |
+ | |
+ | |
+ /** | |
+  * Release what the filter instance owns: the output pad names, the device | |
+  * buffers of the outputs and the CUDA context. | |
+  * | |
+  * The pad names are walked with ctx->nb_outputs, the length of | |
+  * ctx->output_pads, rather than with the requested s->nb_outputs, so that a | |
+  * filter whose initialisation stopped early does not read past the pads | |
+  * that actually exist. Device buffers are only freed when the CUDA function | |
+  * table has been obtained. | |
+  * | |
+  * NOTE(review): s->intex.dptr is not released here. It may refer to memory | |
+  * supplied by the incoming frame rather than allocated by this filter, so | |
+  * ownership has to be clarified before freeing it - confirm. | |
+  * | |
+  * @param ctx filter context being torn down | |
+  */ | |
+ static av_cold void uninit(AVFilterContext *ctx) | |
+ { | |
+     NVResizeContext *s = ctx->priv; | |
+     int i; | |
+ | |
+     for (i = 0; i < ctx->nb_outputs; i++) | |
+         av_freep(&ctx->output_pads[i].name); | |
+ | |
+     if (s->cu_dl_func) { | |
+         for (i = 0; i < s->nb_outputs; i++) { | |
+             if (s->outtex[i].dptr) { | |
+                 s->cu_dl_func->cu_mem_free(s->outtex[i].dptr); | |
+                 // cleared so the pointer cannot be freed a second time | |
+                 s->outtex[i].dptr = (CUdeviceptr)NULL; | |
+             } | |
+         } | |
+     } | |
+ | |
+     if (s->cu_ctx) | |
+         release_cuda_context(&s->cu_ctx, s->gpu); | |
+ | |
+     av_log(ctx, AV_LOG_INFO, "nvresize::uninit\n"); | |
+ } | |
+ | |
+ /* Input pads of the filter: one video pad served by filter_frame(), followed | |
+  * by the terminating entry. */ | |
+ static const AVFilterPad nvresize_inputs[] = { | |
+     { | |
+         .filter_frame = filter_frame, | |
+         .type         = AVMEDIA_TYPE_VIDEO, | |
+         .name         = "default", | |
+     }, | |
+     { NULL } | |
+ }; | |
+ | |
+ /* Filter definition for "nvresize". There is no static output pad table; the | |
+  * filter is flagged AVFILTER_FLAG_DYNAMIC_OUTPUTS. */ | |
+ AVFilter ff_vf_nvresize = { | |
+     .name          = "nvresize", | |
+     .description   = NULL_IF_CONFIG_SMALL("GPU accelerated video resizer."), | |
+     .priv_size     = sizeof(NVResizeContext), | |
+     .priv_class    = &nvresize_class, | |
+     .init          = init, | |
+     .uninit        = uninit, | |
+     .query_formats = query_formats, | |
+     .inputs        = nvresize_inputs, | |
+     .outputs       = NULL, | |
+     .flags         = AVFILTER_FLAG_DYNAMIC_OUTPUTS, | |
+ }; | |
diff -ruN ffmpeg-orig/libavfilter/vf_nvresize_ptx.c ffmpeg/libavfilter/vf_nvresize_ptx.c | |
--- ffmpeg-orig/libavfilter/vf_nvresize_ptx.c 1970-01-01 03:00:00.000000000 +0300 | |
+++ ffmpeg/libavfilter/vf_nvresize_ptx.c 2016-03-04 00:18:17.589433917 +0200 | |
@@ -0,0 +1,659 @@ | |
+ /* | |
+  * Select which of the embedded PTX images below gets compiled in. | |
+  * The ENVIRONMENT32 image declares ".address_size 32" and the ENVIRONMENT64 | |
+  * image ".address_size 64", so the choice has to follow the pointer width | |
+  * of the build target. | |
+  * | |
+  * 64-bit: Win64 and every LP64 GCC/Clang target. __LP64__ also covers | |
+  * targets such as aarch64, which the former x86_64/ppc64-only test | |
+  * classified as 32-bit. Remaining Windows or GNU-compatible builds are | |
+  * 32-bit. Unrecognised toolchains define neither macro, as before. | |
+  */ | |
+ #if defined(_WIN64) || defined(__LP64__) || defined(__x86_64__) || defined(__ppc64__) | |
+ #define ENVIRONMENT64 | |
+ #elif defined(_WIN32) || defined(__GNUC__) | |
+ #define ENVIRONMENT32 | |
+ #endif | |
+#ifdef ENVIRONMENT32 | |
+const char resize_ptx[] = \ | |
+ "//\n" | |
+ "// Generated by NVIDIA NVVM Compiler\n" | |
+ "//\n" | |
+ "// Compiler Build ID: CL-19324607\n" | |
+ "// Cuda compilation tools, release 7.0, V7.0.27\n" | |
+ "// Based on LLVM 3.4svn\n" | |
+ "//\n" | |
+ "\n" | |
+ ".version 4.2\n" | |
+ ".target sm_30\n" | |
+ ".address_size 32\n" | |
+ "\n" | |
+ "// .globl Subsample_Bilinear_uchar\n" | |
+ ".global .texref uchar_tex;\n" | |
+ ".global .texref uchar2_tex;\n" | |
+ ".global .texref uchar4_tex;\n" | |
+ "\n" | |
+ ".visible .entry Subsample_Bilinear_uchar(\n" | |
+ ".param .u32 Subsample_Bilinear_uchar_param_0,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar_param_1,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar_param_2,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar_param_3,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar_param_4,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar_param_5\n" | |
+ ")\n" | |
+ "{\n" | |
+ ".reg .pred %p<4>;\n" | |
+ ".reg .f32 %f<27>;\n" | |
+ ".reg .s32 %r<43>;\n" | |
+ ".reg .s64 %rd<2>;\n" | |
+ "\n" | |
+ "\n" | |
+ "ld.param.u32 %r3, [Subsample_Bilinear_uchar_param_0];\n" | |
+ "ld.param.u32 %r4, [Subsample_Bilinear_uchar_param_1];\n" | |
+ "ld.param.u32 %r5, [Subsample_Bilinear_uchar_param_2];\n" | |
+ "ld.param.u32 %r6, [Subsample_Bilinear_uchar_param_3];\n" | |
+ "ld.param.u32 %r7, [Subsample_Bilinear_uchar_param_4];\n" | |
+ "ld.param.u32 %r8, [Subsample_Bilinear_uchar_param_5];\n" | |
+ "mov.u32 %r9, %ctaid.x;\n" | |
+ "mov.u32 %r10, %ntid.x;\n" | |
+ "mov.u32 %r11, %tid.x;\n" | |
+ "mad.lo.s32 %r1, %r10, %r9, %r11;\n" | |
+ "mov.u32 %r12, %ntid.y;\n" | |
+ "mov.u32 %r13, %ctaid.y;\n" | |
+ "mov.u32 %r14, %tid.y;\n" | |
+ "mad.lo.s32 %r2, %r12, %r13, %r14;\n" | |
+ "setp.lt.s32 %p1, %r2, %r5;\n" | |
+ "setp.lt.s32 %p2, %r1, %r4;\n" | |
+ "and.pred %p3, %p1, %p2;\n" | |
+ "@!%p3 bra BB0_2;\n" | |
+ "bra.uni BB0_1;\n" | |
+ "\n" | |
+ "BB0_1:\n" | |
+ "cvta.to.global.u32 %r15, %r3;\n" | |
+ "cvt.rn.f32.s32 %f1, %r4;\n" | |
+ "cvt.rn.f32.s32 %f2, %r7;\n" | |
+ "div.rn.f32 %f3, %f2, %f1;\n" | |
+ "cvt.rn.f32.s32 %f4, %r5;\n" | |
+ "cvt.rn.f32.s32 %f5, %r8;\n" | |
+ "div.rn.f32 %f6, %f5, %f4;\n" | |
+ "cvt.rn.f32.s32 %f7, %r1;\n" | |
+ "add.f32 %f8, %f7, 0f3F000000;\n" | |
+ "mul.f32 %f9, %f8, %f3;\n" | |
+ "cvt.rn.f32.s32 %f10, %r2;\n" | |
+ "add.f32 %f11, %f10, 0f3F000000;\n" | |
+ "mul.f32 %f12, %f11, %f6;\n" | |
+ "add.f32 %f13, %f3, 0fBF800000;\n" | |
+ "mul.f32 %f14, %f13, 0f3F000000;\n" | |
+ "cvt.sat.f32.f32 %f15, %f14;\n" | |
+ "add.f32 %f16, %f6, 0fBF800000;\n" | |
+ "mul.f32 %f17, %f16, 0f3F000000;\n" | |
+ "cvt.sat.f32.f32 %f18, %f17;\n" | |
+ "add.f32 %f19, %f15, 0f3F000000;\n" | |
+ "div.rn.f32 %f20, %f15, %f19;\n" | |
+ "add.f32 %f21, %f18, 0f3F000000;\n" | |
+ "div.rn.f32 %f22, %f18, %f21;\n" | |
+ "sub.f32 %f23, %f9, %f20;\n" | |
+ "sub.f32 %f24, %f12, %f22;\n" | |
+ "tex.2d.v4.u32.f32 {%r16, %r17, %r18, %r19}, [uchar_tex, {%f23, %f24}];\n" | |
+ "and.b32 %r20, %r16, 255;\n" | |
+ "add.f32 %f25, %f9, %f20;\n" | |
+ "tex.2d.v4.u32.f32 {%r21, %r22, %r23, %r24}, [uchar_tex, {%f25, %f24}];\n" | |
+ "and.b32 %r25, %r21, 255;\n" | |
+ "add.f32 %f26, %f12, %f22;\n" | |
+ "tex.2d.v4.u32.f32 {%r26, %r27, %r28, %r29}, [uchar_tex, {%f23, %f26}];\n" | |
+ "and.b32 %r30, %r26, 255;\n" | |
+ "tex.2d.v4.u32.f32 {%r31, %r32, %r33, %r34}, [uchar_tex, {%f25, %f26}];\n" | |
+ "and.b32 %r35, %r31, 255;\n" | |
+ "add.s32 %r36, %r20, %r25;\n" | |
+ "add.s32 %r37, %r36, %r30;\n" | |
+ "add.s32 %r38, %r37, %r35;\n" | |
+ "add.s32 %r39, %r38, 2;\n" | |
+ "shr.u32 %r40, %r39, 2;\n" | |
+ "mad.lo.s32 %r41, %r2, %r6, %r1;\n" | |
+ "add.s32 %r42, %r15, %r41;\n" | |
+ "st.global.u8 [%r42], %r40;\n" | |
+ "\n" | |
+ "BB0_2:\n" | |
+ "ret;\n" | |
+ "}\n" | |
+ "\n" | |
+ "// .globl Subsample_Bilinear_uchar2\n" | |
+ ".visible .entry Subsample_Bilinear_uchar2(\n" | |
+ ".param .u32 Subsample_Bilinear_uchar2_param_0,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar2_param_1,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar2_param_2,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar2_param_3,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar2_param_4,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar2_param_5\n" | |
+ ")\n" | |
+ "{\n" | |
+ ".reg .pred %p<4>;\n" | |
+ ".reg .s16 %rs<3>;\n" | |
+ ".reg .f32 %f<27>;\n" | |
+ ".reg .s32 %r<53>;\n" | |
+ ".reg .s64 %rd<2>;\n" | |
+ "\n" | |
+ "\n" | |
+ "ld.param.u32 %r3, [Subsample_Bilinear_uchar2_param_0];\n" | |
+ "ld.param.u32 %r4, [Subsample_Bilinear_uchar2_param_1];\n" | |
+ "ld.param.u32 %r5, [Subsample_Bilinear_uchar2_param_2];\n" | |
+ "ld.param.u32 %r6, [Subsample_Bilinear_uchar2_param_3];\n" | |
+ "ld.param.u32 %r7, [Subsample_Bilinear_uchar2_param_4];\n" | |
+ "ld.param.u32 %r8, [Subsample_Bilinear_uchar2_param_5];\n" | |
+ "mov.u32 %r9, %ctaid.x;\n" | |
+ "mov.u32 %r10, %ntid.x;\n" | |
+ "mov.u32 %r11, %tid.x;\n" | |
+ "mad.lo.s32 %r1, %r10, %r9, %r11;\n" | |
+ "mov.u32 %r12, %ntid.y;\n" | |
+ "mov.u32 %r13, %ctaid.y;\n" | |
+ "mov.u32 %r14, %tid.y;\n" | |
+ "mad.lo.s32 %r2, %r12, %r13, %r14;\n" | |
+ "setp.lt.s32 %p1, %r2, %r5;\n" | |
+ "setp.lt.s32 %p2, %r1, %r4;\n" | |
+ "and.pred %p3, %p1, %p2;\n" | |
+ "@!%p3 bra BB1_2;\n" | |
+ "bra.uni BB1_1;\n" | |
+ "\n" | |
+ "BB1_1:\n" | |
+ "cvta.to.global.u32 %r15, %r3;\n" | |
+ "cvt.rn.f32.s32 %f1, %r4;\n" | |
+ "cvt.rn.f32.s32 %f2, %r7;\n" | |
+ "div.rn.f32 %f3, %f2, %f1;\n" | |
+ "cvt.rn.f32.s32 %f4, %r5;\n" | |
+ "cvt.rn.f32.s32 %f5, %r8;\n" | |
+ "div.rn.f32 %f6, %f5, %f4;\n" | |
+ "cvt.rn.f32.s32 %f7, %r1;\n" | |
+ "add.f32 %f8, %f7, 0f3F000000;\n" | |
+ "mul.f32 %f9, %f8, %f3;\n" | |
+ "cvt.rn.f32.s32 %f10, %r2;\n" | |
+ "add.f32 %f11, %f10, 0f3F000000;\n" | |
+ "mul.f32 %f12, %f11, %f6;\n" | |
+ "add.f32 %f13, %f3, 0fBF800000;\n" | |
+ "mul.f32 %f14, %f13, 0f3F000000;\n" | |
+ "cvt.sat.f32.f32 %f15, %f14;\n" | |
+ "add.f32 %f16, %f6, 0fBF800000;\n" | |
+ "mul.f32 %f17, %f16, 0f3F000000;\n" | |
+ "cvt.sat.f32.f32 %f18, %f17;\n" | |
+ "add.f32 %f19, %f15, 0f3F000000;\n" | |
+ "div.rn.f32 %f20, %f15, %f19;\n" | |
+ "add.f32 %f21, %f18, 0f3F000000;\n" | |
+ "div.rn.f32 %f22, %f18, %f21;\n" | |
+ "sub.f32 %f23, %f9, %f20;\n" | |
+ "sub.f32 %f24, %f12, %f22;\n" | |
+ "tex.2d.v4.u32.f32 {%r16, %r17, %r18, %r19}, [uchar2_tex, {%f23, %f24}];\n" | |
+ "add.f32 %f25, %f9, %f20;\n" | |
+ "tex.2d.v4.u32.f32 {%r20, %r21, %r22, %r23}, [uchar2_tex, {%f25, %f24}];\n" | |
+ "add.f32 %f26, %f12, %f22;\n" | |
+ "tex.2d.v4.u32.f32 {%r24, %r25, %r26, %r27}, [uchar2_tex, {%f23, %f26}];\n" | |
+ "tex.2d.v4.u32.f32 {%r28, %r29, %r30, %r31}, [uchar2_tex, {%f25, %f26}];\n" | |
+ "and.b32 %r32, %r16, 255;\n" | |
+ "and.b32 %r33, %r20, 255;\n" | |
+ "and.b32 %r34, %r24, 255;\n" | |
+ "and.b32 %r35, %r28, 255;\n" | |
+ "add.s32 %r36, %r32, %r33;\n" | |
+ "add.s32 %r37, %r36, %r34;\n" | |
+ "add.s32 %r38, %r37, %r35;\n" | |
+ "add.s32 %r39, %r38, 2;\n" | |
+ "shr.u32 %r40, %r39, 2;\n" | |
+ "and.b32 %r41, %r17, 255;\n" | |
+ "and.b32 %r42, %r21, 255;\n" | |
+ "and.b32 %r43, %r25, 255;\n" | |
+ "and.b32 %r44, %r29, 255;\n" | |
+ "add.s32 %r45, %r41, %r42;\n" | |
+ "add.s32 %r46, %r45, %r43;\n" | |
+ "add.s32 %r47, %r46, %r44;\n" | |
+ "add.s32 %r48, %r47, 2;\n" | |
+ "shr.u32 %r49, %r48, 2;\n" | |
+ "mad.lo.s32 %r50, %r2, %r6, %r1;\n" | |
+ "shl.b32 %r51, %r50, 1;\n" | |
+ "add.s32 %r52, %r15, %r51;\n" | |
+ "cvt.u16.u32 %rs1, %r49;\n" | |
+ "cvt.u16.u32 %rs2, %r40;\n" | |
+ "st.global.v2.u8 [%r52], {%rs2, %rs1};\n" | |
+ "\n" | |
+ "BB1_2:\n" | |
+ "ret;\n" | |
+ "}\n" | |
+ "\n" | |
+ "// .globl Subsample_Bilinear_uchar4\n" | |
+ ".visible .entry Subsample_Bilinear_uchar4(\n" | |
+ ".param .u32 Subsample_Bilinear_uchar4_param_0,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar4_param_1,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar4_param_2,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar4_param_3,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar4_param_4,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar4_param_5\n" | |
+ ")\n" | |
+ "{\n" | |
+ ".reg .pred %p<4>;\n" | |
+ ".reg .s16 %rs<5>;\n" | |
+ ".reg .f32 %f<27>;\n" | |
+ ".reg .s32 %r<71>;\n" | |
+ ".reg .s64 %rd<2>;\n" | |
+ "\n" | |
+ "\n" | |
+ "ld.param.u32 %r3, [Subsample_Bilinear_uchar4_param_0];\n" | |
+ "ld.param.u32 %r4, [Subsample_Bilinear_uchar4_param_1];\n" | |
+ "ld.param.u32 %r5, [Subsample_Bilinear_uchar4_param_2];\n" | |
+ "ld.param.u32 %r6, [Subsample_Bilinear_uchar4_param_3];\n" | |
+ "ld.param.u32 %r7, [Subsample_Bilinear_uchar4_param_4];\n" | |
+ "ld.param.u32 %r8, [Subsample_Bilinear_uchar4_param_5];\n" | |
+ "mov.u32 %r9, %ctaid.x;\n" | |
+ "mov.u32 %r10, %ntid.x;\n" | |
+ "mov.u32 %r11, %tid.x;\n" | |
+ "mad.lo.s32 %r1, %r10, %r9, %r11;\n" | |
+ "mov.u32 %r12, %ntid.y;\n" | |
+ "mov.u32 %r13, %ctaid.y;\n" | |
+ "mov.u32 %r14, %tid.y;\n" | |
+ "mad.lo.s32 %r2, %r12, %r13, %r14;\n" | |
+ "setp.lt.s32 %p1, %r2, %r5;\n" | |
+ "setp.lt.s32 %p2, %r1, %r4;\n" | |
+ "and.pred %p3, %p1, %p2;\n" | |
+ "@!%p3 bra BB2_2;\n" | |
+ "bra.uni BB2_1;\n" | |
+ "\n" | |
+ "BB2_1:\n" | |
+ "cvta.to.global.u32 %r15, %r3;\n" | |
+ "cvt.rn.f32.s32 %f1, %r4;\n" | |
+ "cvt.rn.f32.s32 %f2, %r7;\n" | |
+ "div.rn.f32 %f3, %f2, %f1;\n" | |
+ "cvt.rn.f32.s32 %f4, %r5;\n" | |
+ "cvt.rn.f32.s32 %f5, %r8;\n" | |
+ "div.rn.f32 %f6, %f5, %f4;\n" | |
+ "cvt.rn.f32.s32 %f7, %r1;\n" | |
+ "add.f32 %f8, %f7, 0f3F000000;\n" | |
+ "mul.f32 %f9, %f8, %f3;\n" | |
+ "cvt.rn.f32.s32 %f10, %r2;\n" | |
+ "add.f32 %f11, %f10, 0f3F000000;\n" | |
+ "mul.f32 %f12, %f11, %f6;\n" | |
+ "add.f32 %f13, %f3, 0fBF800000;\n" | |
+ "mul.f32 %f14, %f13, 0f3F000000;\n" | |
+ "cvt.sat.f32.f32 %f15, %f14;\n" | |
+ "add.f32 %f16, %f6, 0fBF800000;\n" | |
+ "mul.f32 %f17, %f16, 0f3F000000;\n" | |
+ "cvt.sat.f32.f32 %f18, %f17;\n" | |
+ "add.f32 %f19, %f15, 0f3F000000;\n" | |
+ "div.rn.f32 %f20, %f15, %f19;\n" | |
+ "add.f32 %f21, %f18, 0f3F000000;\n" | |
+ "div.rn.f32 %f22, %f18, %f21;\n" | |
+ "sub.f32 %f23, %f9, %f20;\n" | |
+ "sub.f32 %f24, %f12, %f22;\n" | |
+ "tex.2d.v4.u32.f32 {%r16, %r17, %r18, %r19}, [uchar4_tex, {%f23, %f24}];\n" | |
+ "add.f32 %f25, %f9, %f20;\n" | |
+ "tex.2d.v4.u32.f32 {%r20, %r21, %r22, %r23}, [uchar4_tex, {%f25, %f24}];\n" | |
+ "add.f32 %f26, %f12, %f22;\n" | |
+ "tex.2d.v4.u32.f32 {%r24, %r25, %r26, %r27}, [uchar4_tex, {%f23, %f26}];\n" | |
+ "tex.2d.v4.u32.f32 {%r28, %r29, %r30, %r31}, [uchar4_tex, {%f25, %f26}];\n" | |
+ "and.b32 %r32, %r16, 255;\n" | |
+ "and.b32 %r33, %r20, 255;\n" | |
+ "and.b32 %r34, %r24, 255;\n" | |
+ "and.b32 %r35, %r28, 255;\n" | |
+ "add.s32 %r36, %r32, %r33;\n" | |
+ "add.s32 %r37, %r36, %r34;\n" | |
+ "add.s32 %r38, %r37, %r35;\n" | |
+ "add.s32 %r39, %r38, 2;\n" | |
+ "shr.u32 %r40, %r39, 2;\n" | |
+ "and.b32 %r41, %r17, 255;\n" | |
+ "and.b32 %r42, %r21, 255;\n" | |
+ "and.b32 %r43, %r25, 255;\n" | |
+ "and.b32 %r44, %r29, 255;\n" | |
+ "add.s32 %r45, %r41, %r42;\n" | |
+ "add.s32 %r46, %r45, %r43;\n" | |
+ "add.s32 %r47, %r46, %r44;\n" | |
+ "add.s32 %r48, %r47, 2;\n" | |
+ "shr.u32 %r49, %r48, 2;\n" | |
+ "and.b32 %r50, %r18, 255;\n" | |
+ "and.b32 %r51, %r22, 255;\n" | |
+ "and.b32 %r52, %r26, 255;\n" | |
+ "and.b32 %r53, %r30, 255;\n" | |
+ "add.s32 %r54, %r50, %r51;\n" | |
+ "add.s32 %r55, %r54, %r52;\n" | |
+ "add.s32 %r56, %r55, %r53;\n" | |
+ "add.s32 %r57, %r56, 2;\n" | |
+ "shr.u32 %r58, %r57, 2;\n" | |
+ "and.b32 %r59, %r19, 255;\n" | |
+ "and.b32 %r60, %r23, 255;\n" | |
+ "and.b32 %r61, %r27, 255;\n" | |
+ "and.b32 %r62, %r31, 255;\n" | |
+ "add.s32 %r63, %r59, %r60;\n" | |
+ "add.s32 %r64, %r63, %r61;\n" | |
+ "add.s32 %r65, %r64, %r62;\n" | |
+ "add.s32 %r66, %r65, 2;\n" | |
+ "shr.u32 %r67, %r66, 2;\n" | |
+ "mad.lo.s32 %r68, %r2, %r6, %r1;\n" | |
+ "shl.b32 %r69, %r68, 2;\n" | |
+ "add.s32 %r70, %r15, %r69;\n" | |
+ "cvt.u16.u32 %rs1, %r67;\n" | |
+ "cvt.u16.u32 %rs2, %r58;\n" | |
+ "cvt.u16.u32 %rs3, %r49;\n" | |
+ "cvt.u16.u32 %rs4, %r40;\n" | |
+ "st.global.v4.u8 [%r70], {%rs4, %rs3, %rs2, %rs1};\n" | |
+ "\n" | |
+ "BB2_2:\n" | |
+ "ret;\n" | |
+ "}\n" | |
+ "\n" | |
+ "\n" | |
+; | |
+#elif defined ENVIRONMENT64 | |
+const char resize_ptx[] = \ | |
+ "//\n" | |
+ "// Generated by NVIDIA NVVM Compiler\n" | |
+ "//\n" | |
+ "// Compiler Build ID: CL-19324607\n" | |
+ "// Cuda compilation tools, release 7.0, V7.0.27\n" | |
+ "// Based on LLVM 3.4svn\n" | |
+ "//\n" | |
+ "\n" | |
+ ".version 4.2\n" | |
+ ".target sm_30\n" | |
+ ".address_size 64\n" | |
+ "\n" | |
+ "// .globl Subsample_Bilinear_uchar\n" | |
+ ".global .texref uchar_tex;\n" | |
+ ".global .texref uchar2_tex;\n" | |
+ ".global .texref uchar4_tex;\n" | |
+ "\n" | |
+ ".visible .entry Subsample_Bilinear_uchar(\n" | |
+ ".param .u64 Subsample_Bilinear_uchar_param_0,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar_param_1,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar_param_2,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar_param_3,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar_param_4,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar_param_5\n" | |
+ ")\n" | |
+ "{\n" | |
+ ".reg .pred %p<4>;\n" | |
+ ".reg .f32 %f<27>;\n" | |
+ ".reg .s32 %r<40>;\n" | |
+ ".reg .s64 %rd<6>;\n" | |
+ "\n" | |
+ "\n" | |
+ "ld.param.u64 %rd1, [Subsample_Bilinear_uchar_param_0];\n" | |
+ "ld.param.u32 %r3, [Subsample_Bilinear_uchar_param_1];\n" | |
+ "ld.param.u32 %r4, [Subsample_Bilinear_uchar_param_2];\n" | |
+ "ld.param.u32 %r5, [Subsample_Bilinear_uchar_param_3];\n" | |
+ "ld.param.u32 %r6, [Subsample_Bilinear_uchar_param_4];\n" | |
+ "ld.param.u32 %r7, [Subsample_Bilinear_uchar_param_5];\n" | |
+ "mov.u32 %r8, %ctaid.x;\n" | |
+ "mov.u32 %r9, %ntid.x;\n" | |
+ "mov.u32 %r10, %tid.x;\n" | |
+ "mad.lo.s32 %r1, %r9, %r8, %r10;\n" | |
+ "mov.u32 %r11, %ntid.y;\n" | |
+ "mov.u32 %r12, %ctaid.y;\n" | |
+ "mov.u32 %r13, %tid.y;\n" | |
+ "mad.lo.s32 %r2, %r11, %r12, %r13;\n" | |
+ "setp.lt.s32 %p1, %r2, %r4;\n" | |
+ "setp.lt.s32 %p2, %r1, %r3;\n" | |
+ "and.pred %p3, %p1, %p2;\n" | |
+ "@!%p3 bra BB0_2;\n" | |
+ "bra.uni BB0_1;\n" | |
+ "\n" | |
+ "BB0_1:\n" | |
+ "cvta.to.global.u64 %rd2, %rd1;\n" | |
+ "cvt.rn.f32.s32 %f1, %r3;\n" | |
+ "cvt.rn.f32.s32 %f2, %r6;\n" | |
+ "div.rn.f32 %f3, %f2, %f1;\n" | |
+ "cvt.rn.f32.s32 %f4, %r4;\n" | |
+ "cvt.rn.f32.s32 %f5, %r7;\n" | |
+ "div.rn.f32 %f6, %f5, %f4;\n" | |
+ "cvt.rn.f32.s32 %f7, %r1;\n" | |
+ "add.f32 %f8, %f7, 0f3F000000;\n" | |
+ "mul.f32 %f9, %f8, %f3;\n" | |
+ "cvt.rn.f32.s32 %f10, %r2;\n" | |
+ "add.f32 %f11, %f10, 0f3F000000;\n" | |
+ "mul.f32 %f12, %f11, %f6;\n" | |
+ "add.f32 %f13, %f3, 0fBF800000;\n" | |
+ "mul.f32 %f14, %f13, 0f3F000000;\n" | |
+ "cvt.sat.f32.f32 %f15, %f14;\n" | |
+ "add.f32 %f16, %f6, 0fBF800000;\n" | |
+ "mul.f32 %f17, %f16, 0f3F000000;\n" | |
+ "cvt.sat.f32.f32 %f18, %f17;\n" | |
+ "add.f32 %f19, %f15, 0f3F000000;\n" | |
+ "div.rn.f32 %f20, %f15, %f19;\n" | |
+ "add.f32 %f21, %f18, 0f3F000000;\n" | |
+ "div.rn.f32 %f22, %f18, %f21;\n" | |
+ "sub.f32 %f23, %f9, %f20;\n" | |
+ "sub.f32 %f24, %f12, %f22;\n" | |
+ "tex.2d.v4.u32.f32 {%r14, %r15, %r16, %r17}, [uchar_tex, {%f23, %f24}];\n" | |
+ "and.b32 %r18, %r14, 255;\n" | |
+ "add.f32 %f25, %f9, %f20;\n" | |
+ "tex.2d.v4.u32.f32 {%r19, %r20, %r21, %r22}, [uchar_tex, {%f25, %f24}];\n" | |
+ "and.b32 %r23, %r19, 255;\n" | |
+ "add.f32 %f26, %f12, %f22;\n" | |
+ "tex.2d.v4.u32.f32 {%r24, %r25, %r26, %r27}, [uchar_tex, {%f23, %f26}];\n" | |
+ "and.b32 %r28, %r24, 255;\n" | |
+ "tex.2d.v4.u32.f32 {%r29, %r30, %r31, %r32}, [uchar_tex, {%f25, %f26}];\n" | |
+ "and.b32 %r33, %r29, 255;\n" | |
+ "add.s32 %r34, %r18, %r23;\n" | |
+ "add.s32 %r35, %r34, %r28;\n" | |
+ "add.s32 %r36, %r35, %r33;\n" | |
+ "add.s32 %r37, %r36, 2;\n" | |
+ "shr.u32 %r38, %r37, 2;\n" | |
+ "mad.lo.s32 %r39, %r2, %r5, %r1;\n" | |
+ "cvt.s64.s32 %rd4, %r39;\n" | |
+ "add.s64 %rd5, %rd2, %rd4;\n" | |
+ "st.global.u8 [%rd5], %r38;\n" | |
+ "\n" | |
+ "BB0_2:\n" | |
+ "ret;\n" | |
+ "}\n" | |
+ "\n" | |
+ "// .globl Subsample_Bilinear_uchar2\n" | |
+ ".visible .entry Subsample_Bilinear_uchar2(\n" | |
+ ".param .u64 Subsample_Bilinear_uchar2_param_0,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar2_param_1,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar2_param_2,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar2_param_3,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar2_param_4,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar2_param_5\n" | |
+ ")\n" | |
+ "{\n" | |
+ ".reg .pred %p<4>;\n" | |
+ ".reg .s16 %rs<3>;\n" | |
+ ".reg .f32 %f<27>;\n" | |
+ ".reg .s32 %r<49>;\n" | |
+ ".reg .s64 %rd<6>;\n" | |
+ "\n" | |
+ "\n" | |
+ "ld.param.u64 %rd1, [Subsample_Bilinear_uchar2_param_0];\n" | |
+ "ld.param.u32 %r3, [Subsample_Bilinear_uchar2_param_1];\n" | |
+ "ld.param.u32 %r4, [Subsample_Bilinear_uchar2_param_2];\n" | |
+ "ld.param.u32 %r5, [Subsample_Bilinear_uchar2_param_3];\n" | |
+ "ld.param.u32 %r6, [Subsample_Bilinear_uchar2_param_4];\n" | |
+ "ld.param.u32 %r7, [Subsample_Bilinear_uchar2_param_5];\n" | |
+ "mov.u32 %r8, %ctaid.x;\n" | |
+ "mov.u32 %r9, %ntid.x;\n" | |
+ "mov.u32 %r10, %tid.x;\n" | |
+ "mad.lo.s32 %r1, %r9, %r8, %r10;\n" | |
+ "mov.u32 %r11, %ntid.y;\n" | |
+ "mov.u32 %r12, %ctaid.y;\n" | |
+ "mov.u32 %r13, %tid.y;\n" | |
+ "mad.lo.s32 %r2, %r11, %r12, %r13;\n" | |
+ "setp.lt.s32 %p1, %r2, %r4;\n" | |
+ "setp.lt.s32 %p2, %r1, %r3;\n" | |
+ "and.pred %p3, %p1, %p2;\n" | |
+ "@!%p3 bra BB1_2;\n" | |
+ "bra.uni BB1_1;\n" | |
+ "\n" | |
+ "BB1_1:\n" | |
+ "cvta.to.global.u64 %rd2, %rd1;\n" | |
+ "cvt.rn.f32.s32 %f1, %r3;\n" | |
+ "cvt.rn.f32.s32 %f2, %r6;\n" | |
+ "div.rn.f32 %f3, %f2, %f1;\n" | |
+ "cvt.rn.f32.s32 %f4, %r4;\n" | |
+ "cvt.rn.f32.s32 %f5, %r7;\n" | |
+ "div.rn.f32 %f6, %f5, %f4;\n" | |
+ "cvt.rn.f32.s32 %f7, %r1;\n" | |
+ "add.f32 %f8, %f7, 0f3F000000;\n" | |
+ "mul.f32 %f9, %f8, %f3;\n" | |
+ "cvt.rn.f32.s32 %f10, %r2;\n" | |
+ "add.f32 %f11, %f10, 0f3F000000;\n" | |
+ "mul.f32 %f12, %f11, %f6;\n" | |
+ "add.f32 %f13, %f3, 0fBF800000;\n" | |
+ "mul.f32 %f14, %f13, 0f3F000000;\n" | |
+ "cvt.sat.f32.f32 %f15, %f14;\n" | |
+ "add.f32 %f16, %f6, 0fBF800000;\n" | |
+ "mul.f32 %f17, %f16, 0f3F000000;\n" | |
+ "cvt.sat.f32.f32 %f18, %f17;\n" | |
+ "add.f32 %f19, %f15, 0f3F000000;\n" | |
+ "div.rn.f32 %f20, %f15, %f19;\n" | |
+ "add.f32 %f21, %f18, 0f3F000000;\n" | |
+ "div.rn.f32 %f22, %f18, %f21;\n" | |
+ "sub.f32 %f23, %f9, %f20;\n" | |
+ "sub.f32 %f24, %f12, %f22;\n" | |
+ "tex.2d.v4.u32.f32 {%r14, %r15, %r16, %r17}, [uchar2_tex, {%f23, %f24}];\n" | |
+ "add.f32 %f25, %f9, %f20;\n" | |
+ "tex.2d.v4.u32.f32 {%r18, %r19, %r20, %r21}, [uchar2_tex, {%f25, %f24}];\n" | |
+ "add.f32 %f26, %f12, %f22;\n" | |
+ "tex.2d.v4.u32.f32 {%r22, %r23, %r24, %r25}, [uchar2_tex, {%f23, %f26}];\n" | |
+ "tex.2d.v4.u32.f32 {%r26, %r27, %r28, %r29}, [uchar2_tex, {%f25, %f26}];\n" | |
+ "and.b32 %r30, %r14, 255;\n" | |
+ "and.b32 %r31, %r18, 255;\n" | |
+ "and.b32 %r32, %r22, 255;\n" | |
+ "and.b32 %r33, %r26, 255;\n" | |
+ "add.s32 %r34, %r30, %r31;\n" | |
+ "add.s32 %r35, %r34, %r32;\n" | |
+ "add.s32 %r36, %r35, %r33;\n" | |
+ "add.s32 %r37, %r36, 2;\n" | |
+ "shr.u32 %r38, %r37, 2;\n" | |
+ "and.b32 %r39, %r15, 255;\n" | |
+ "and.b32 %r40, %r19, 255;\n" | |
+ "and.b32 %r41, %r23, 255;\n" | |
+ "and.b32 %r42, %r27, 255;\n" | |
+ "add.s32 %r43, %r39, %r40;\n" | |
+ "add.s32 %r44, %r43, %r41;\n" | |
+ "add.s32 %r45, %r44, %r42;\n" | |
+ "add.s32 %r46, %r45, 2;\n" | |
+ "shr.u32 %r47, %r46, 2;\n" | |
+ "mad.lo.s32 %r48, %r2, %r5, %r1;\n" | |
+ "mul.wide.s32 %rd4, %r48, 2;\n" | |
+ "add.s64 %rd5, %rd2, %rd4;\n" | |
+ "cvt.u16.u32 %rs1, %r47;\n" | |
+ "cvt.u16.u32 %rs2, %r38;\n" | |
+ "st.global.v2.u8 [%rd5], {%rs2, %rs1};\n" | |
+ "\n" | |
+ "BB1_2:\n" | |
+ "ret;\n" | |
+ "}\n" | |
+ "\n" | |
+ "// .globl Subsample_Bilinear_uchar4\n" | |
+ ".visible .entry Subsample_Bilinear_uchar4(\n" | |
+ ".param .u64 Subsample_Bilinear_uchar4_param_0,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar4_param_1,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar4_param_2,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar4_param_3,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar4_param_4,\n" | |
+ ".param .u32 Subsample_Bilinear_uchar4_param_5\n" | |
+ ")\n" | |
+ "{\n" | |
+ ".reg .pred %p<4>;\n" | |
+ ".reg .s16 %rs<5>;\n" | |
+ ".reg .f32 %f<27>;\n" | |
+ ".reg .s32 %r<67>;\n" | |
+ ".reg .s64 %rd<6>;\n" | |
+ "\n" | |
+ "\n" | |
+ "ld.param.u64 %rd1, [Subsample_Bilinear_uchar4_param_0];\n" | |
+ "ld.param.u32 %r3, [Subsample_Bilinear_uchar4_param_1];\n" | |
+ "ld.param.u32 %r4, [Subsample_Bilinear_uchar4_param_2];\n" | |
+ "ld.param.u32 %r5, [Subsample_Bilinear_uchar4_param_3];\n" | |
+ "ld.param.u32 %r6, [Subsample_Bilinear_uchar4_param_4];\n" | |
+ "ld.param.u32 %r7, [Subsample_Bilinear_uchar4_param_5];\n" | |
+ "mov.u32 %r8, %ctaid.x;\n" | |
+ "mov.u32 %r9, %ntid.x;\n" | |
+ "mov.u32 %r10, %tid.x;\n" | |
+ "mad.lo.s32 %r1, %r9, %r8, %r10;\n" | |
+ "mov.u32 %r11, %ntid.y;\n" | |
+ "mov.u32 %r12, %ctaid.y;\n" | |
+ "mov.u32 %r13, %tid.y;\n" | |
+ "mad.lo.s32 %r2, %r11, %r12, %r13;\n" | |
+ "setp.lt.s32 %p1, %r2, %r4;\n" | |
+ "setp.lt.s32 %p2, %r1, %r3;\n" | |
+ "and.pred %p3, %p1, %p2;\n" | |
+ "@!%p3 bra BB2_2;\n" | |
+ "bra.uni BB2_1;\n" | |
+ "\n" | |
+ "BB2_1:\n" | |
+ "cvta.to.global.u64 %rd2, %rd1;\n" | |
+ "cvt.rn.f32.s32 %f1, %r3;\n" | |
+ "cvt.rn.f32.s32 %f2, %r6;\n" | |
+ "div.rn.f32 %f3, %f2, %f1;\n" | |
+ "cvt.rn.f32.s32 %f4, %r4;\n" | |
+ "cvt.rn.f32.s32 %f5, %r7;\n" | |
+ "div.rn.f32 %f6, %f5, %f4;\n" | |
+ "cvt.rn.f32.s32 %f7, %r1;\n" | |
+ "add.f32 %f8, %f7, 0f3F000000;\n" | |
+ "mul.f32 %f9, %f8, %f3;\n" | |
+ "cvt.rn.f32.s32 %f10, %r2;\n" | |
+ "add.f32 %f11, %f10, 0f3F000000;\n" | |
+ "mul.f32 %f12, %f11, %f6;\n" | |
+ "add.f32 %f13, %f3, 0fBF800000;\n" | |
+ "mul.f32 %f14, %f13, 0f3F000000;\n" | |
+ "cvt.sat.f32.f32 %f15, %f14;\n" | |
+ "add.f32 %f16, %f6, 0fBF800000;\n" | |
+ "mul.f32 %f17, %f16, 0f3F000000;\n" | |
+ "cvt.sat.f32.f32 %f18, %f17;\n" | |
+ "add.f32 %f19, %f15, 0f3F000000;\n" | |
+ "div.rn.f32 %f20, %f15, %f19;\n" | |
+ "add.f32 %f21, %f18, 0f3F000000;\n" | |
+ "div.rn.f32 %f22, %f18, %f21;\n" | |
+ "sub.f32 %f23, %f9, %f20;\n" | |
+ "sub.f32 %f24, %f12, %f22;\n" | |
+ "tex.2d.v4.u32.f32 {%r14, %r15, %r16, %r17}, [uchar4_tex, {%f23, %f24}];\n" | |
+ "add.f32 %f25, %f9, %f20;\n" | |
+ "tex.2d.v4.u32.f32 {%r18, %r19, %r20, %r21}, [uchar4_tex, {%f25, %f24}];\n" | |
+ "add.f32 %f26, %f12, %f22;\n" | |
+ "tex.2d.v4.u32.f32 {%r22, %r23, %r24, %r25}, [uchar4_tex, {%f23, %f26}];\n" | |
+ "tex.2d.v4.u32.f32 {%r26, %r27, %r28, %r29}, [uchar4_tex, {%f25, %f26}];\n" | |
+ "and.b32 %r30, %r14, 255;\n" | |
+ "and.b32 %r31, %r18, 255;\n" | |
+ "and.b32 %r32, %r22, 255;\n" | |
+ "and.b32 %r33, %r26, 255;\n" | |
+ "add.s32 %r34, %r30, %r31;\n" | |
+ "add.s32 %r35, %r34, %r32;\n" | |
+ "add.s32 %r36, %r35, %r33;\n" | |
+ "add.s32 %r37, %r36, 2;\n" | |
+ "shr.u32 %r38, %r37, 2;\n" | |
+ "and.b32 %r39, %r15, 255;\n" | |
+ "and.b32 %r40, %r19, 255;\n" | |
+ "and.b32 %r41, %r23, 255;\n" | |
+ "and.b32 %r42, %r27, 255;\n" | |
+ "add.s32 %r43, %r39, %r40;\n" | |
+ "add.s32 %r44, %r43, %r41;\n" | |
+ "add.s32 %r45, %r44, %r42;\n" | |
+ "add.s32 %r46, %r45, 2;\n" | |
+ "shr.u32 %r47, %r46, 2;\n" | |
+ "and.b32 %r48, %r16, 255;\n" | |
+ "and.b32 %r49, %r20, 255;\n" | |
+ "and.b32 %r50, %r24, 255;\n" | |
+ "and.b32 %r51, %r28, 255;\n" | |
+ "add.s32 %r52, %r48, %r49;\n" | |
+ "add.s32 %r53, %r52, %r50;\n" | |
+ "add.s32 %r54, %r53, %r51;\n" | |
+ "add.s32 %r55, %r54, 2;\n" | |
+ "shr.u32 %r56, %r55, 2;\n" | |
+ "and.b32 %r57, %r17, 255;\n" | |
+ "and.b32 %r58, %r21, 255;\n" | |
+ "and.b32 %r59, %r25, 255;\n" | |
+ "and.b32 %r60, %r29, 255;\n" | |
+ "add.s32 %r61, %r57, %r58;\n" | |
+ "add.s32 %r62, %r61, %r59;\n" | |
+ "add.s32 %r63, %r62, %r60;\n" | |
+ "add.s32 %r64, %r63, 2;\n" | |
+ "shr.u32 %r65, %r64, 2;\n" | |
+ "mad.lo.s32 %r66, %r2, %r5, %r1;\n" | |
+ "mul.wide.s32 %rd4, %r66, 4;\n" | |
+ "add.s64 %rd5, %rd2, %rd4;\n" | |
+ "cvt.u16.u32 %rs1, %r65;\n" | |
+ "cvt.u16.u32 %rs2, %r56;\n" | |
+ "cvt.u16.u32 %rs3, %r47;\n" | |
+ "cvt.u16.u32 %rs4, %r38;\n" | |
+ "st.global.v4.u8 [%rd5], {%rs4, %rs3, %rs2, %rs1};\n" | |
+ "\n" | |
+ "BB2_2:\n" | |
+ "ret;\n" | |
+ "}\n" | |
+ "\n" | |
+ "\n" | |
+; | |
+#endif | |
diff -ruN ffmpeg-orig/libavutil/cudautils.c ffmpeg/libavutil/cudautils.c | |
--- ffmpeg-orig/libavutil/cudautils.c 1970-01-01 03:00:00.000000000 +0300 | |
+++ ffmpeg/libavutil/cudautils.c 2016-03-04 00:18:17.589433917 +0200 | |
@@ -0,0 +1,288 @@ | |
+/* | |
+* | |
+* This file is part of FFmpeg. | |
+* | |
+* FFmpeg is free software; you can redistribute it and/or | |
+* modify it under the terms of the GNU Lesser General Public | |
+* License as published by the Free Software Foundation; either | |
+* version 2.1 of the License, or (at your option) any later version. | |
+* | |
+* FFmpeg is distributed in the hope that it will be useful, | |
+* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
+* Lesser General Public License for more details. | |
+* | |
+* You should have received a copy of the GNU Lesser General Public | |
+* License along with FFmpeg; if not, write to the Free Software | |
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
+*/ | |
+ | |
+#include "cudautils.h" | |
+#include "common.h" | |
+#include "log.h" | |
+ | |
+/* Value written to ffnvinfo.version by init_nvinfo(). */ | |
+#define FF_NVINFO_VERSION 1 | |
+/* GUID copied into every ffnvinfo by init_nvinfo() and compared by check_nvinfo(). */ | |
+static NVGUID NV_INFO_GUID = { 0x2cab9a64, 0x7095, 0x11e5, { 0xad, 0x1d, 0x94, 0xde, 0x80, 0x65, 0xb7, 0x74 } }; | |
+/* | |
+ * File-wide CUDA state. The initializers follow the member order of | |
+ * CudaContext. cu_devices is an array of CUdevice (int), so it is initialized | |
+ * with 0; NULL there is a pointer-to-integer conversion that compilers | |
+ * diagnose. | |
+ */ | |
+static CudaContext cudaCtx = { { NULL }, { 0 }, { NULL }, 0, { 0 }, { "" }, { 0 } }; | |
+ | |
+/* Helpers defined further down in this file. */ | |
+int dyload_cuda(void); | |
+int check_cuda(void); | |
+int check_cuda_errors(CUresult err, const char *func); | |
+ | |
+ | |
+/* | |
+ * Look up symbol s in the loaded CUDA library, cast the result to the | |
+ * function-pointer type t and store it in f. When the lookup yields NULL the | |
+ * failure is logged and control jumps to the "error" label. | |
+ * The expansion site must provide a local "dl_fn" pointer (its cuda_lib member | |
+ * is the library handle) and an "error" label. | |
+ */ | |
+#define CHECK_LOAD_FUNC(t, f, s) \ | |
+do { \ | |
+ (f) = (t)LOAD_FUNC(dl_fn->cuda_lib, s); \ | |
+ if (!(f)) { \ | |
+ av_log(NULL, AV_LOG_FATAL, "Failed loading %s from CUDA library\n", s); \ | |
+ goto error; \ | |
+ } \ | |
+} while (0) | |
+ | |
+/** | |
+ * Log a failed CUDA driver call. | |
+ * | |
+ * @param err  result code returned by the driver call | |
+ * @param func text identifying the call, printed in the log message | |
+ * @return 1 if err is CUDA_SUCCESS, 0 otherwise (after logging) | |
+ */ | |
+int check_cuda_errors(CUresult err, const char *func) | |
+{ | |
+    if (err != CUDA_SUCCESS) { | |
+        av_log(NULL, AV_LOG_FATAL, ">> %s - failed with error code 0x%x\n", func, err); | |
+        return 0; | |
+    } | |
+    return 1; | |
+} | |
+/* | |
+ * Statement-style wrapper used below: evaluates f once, passes its source | |
+ * text as the name, and jumps to the local "error" label on failure. | |
+ * It is wrapped in do { } while (0) so that it expands to exactly one | |
+ * statement and cannot pair with a following "else". The macro shares the | |
+ * function's name; the name is not re-expanded inside its own expansion, so | |
+ * the call reaches the function above. | |
+ */ | |
+#define check_cuda_errors(f) do { if (!check_cuda_errors(f, #f)) goto error; } while (0) | |
+ | |
+ | |
+/** | |
+ * Load the CUDA driver library and resolve every entry point stored in | |
+ * cudaCtx.cuda_dload_funcs. | |
+ * | |
+ * Returns immediately when a library handle is already present. | |
+ * | |
+ * @return 1 on success; 0 if the library or any symbol could not be loaded, | |
+ *         in which case the handle is closed and reset to NULL | |
+ */ | |
+int dyload_cuda(void) | |
+{ | |
+    CudaDynLoadFunctions *dl_fn = &cudaCtx.cuda_dload_funcs; | |
+ | |
+    if (dl_fn->cuda_lib) | |
+        return 1; | |
+ | |
+#if defined(_WIN32) | |
+    dl_fn->cuda_lib = LoadLibrary(TEXT("nvcuda.dll")); | |
+#else | |
+    /* The unversioned libcuda.so link is typically only installed with | |
+     * development packages, so try the versioned soname first and keep the | |
+     * previous name as a fallback. */ | |
+    dl_fn->cuda_lib = dlopen("libcuda.so.1", RTLD_LAZY); | |
+    if (!dl_fn->cuda_lib) | |
+        dl_fn->cuda_lib = dlopen("libcuda.so", RTLD_LAZY); | |
+#endif | |
+ | |
+    if (!dl_fn->cuda_lib) { | |
+        av_log(NULL, AV_LOG_FATAL, "Failed loading CUDA library\n"); | |
+        goto error; | |
+    } | |
+ | |
+    /* Each lookup jumps to "error" if the symbol is missing. */ | |
+    CHECK_LOAD_FUNC(PCUINIT, dl_fn->cu_init, "cuInit"); | |
+    CHECK_LOAD_FUNC(PCUDEVICEGETCOUNT, dl_fn->cu_device_get_count, "cuDeviceGetCount"); | |
+    CHECK_LOAD_FUNC(PCUDEVICEGET, dl_fn->cu_device_get, "cuDeviceGet"); | |
+    CHECK_LOAD_FUNC(PCUDEVICEGETNAME, dl_fn->cu_device_get_name, "cuDeviceGetName"); | |
+    CHECK_LOAD_FUNC(PCUDEVICECOMPUTECAPABILITY, dl_fn->cu_device_compute_capability, "cuDeviceComputeCapability"); | |
+    CHECK_LOAD_FUNC(PCUCTXCREATE, dl_fn->cu_ctx_create, "cuCtxCreate_v2"); | |
+    CHECK_LOAD_FUNC(PCUCTXPOPCURRENT, dl_fn->cu_ctx_pop_current, "cuCtxPopCurrent_v2"); | |
+    CHECK_LOAD_FUNC(PCUCTXDESTROY, dl_fn->cu_ctx_destroy, "cuCtxDestroy_v2"); | |
+    CHECK_LOAD_FUNC(PCUMODULELOADDATA, dl_fn->cu_module_load_data, "cuModuleLoadData"); | |
+    CHECK_LOAD_FUNC(PCUMODULEGETFUNCTION, dl_fn->cu_module_get_function, "cuModuleGetFunction"); | |
+    CHECK_LOAD_FUNC(PCUMODULEGETTEXREF, dl_fn->cu_module_get_texref, "cuModuleGetTexRef"); | |
+    CHECK_LOAD_FUNC(PCUTEXREFSETFLAGS, dl_fn->cu_texref_set_flags, "cuTexRefSetFlags"); | |
+    CHECK_LOAD_FUNC(PCUTEXREFSETFILTERMODE, dl_fn->cu_texref_set_filtermode, "cuTexRefSetFilterMode"); | |
+    CHECK_LOAD_FUNC(PCUTEXREFSETADDRESS2D, dl_fn->cu_texref_set_address_2D, "cuTexRefSetAddress2D_v3"); | |
+    CHECK_LOAD_FUNC(PCUMEMALLOCPITCH, dl_fn->cu_mem_alloc_pitch, "cuMemAllocPitch_v2"); | |
+    CHECK_LOAD_FUNC(PCUMEMCPY2D, dl_fn->cu_mem_cpy_2D, "cuMemcpy2D_v2"); | |
+    CHECK_LOAD_FUNC(PCUMEMCPY2DASYNC, dl_fn->cu_mem_cpy_2D_async, "cuMemcpy2DAsync_v2"); | |
+    CHECK_LOAD_FUNC(PCUMEMFREE, dl_fn->cu_mem_free, "cuMemFree_v2"); | |
+    CHECK_LOAD_FUNC(PCULAUNCHKERNEL, dl_fn->cu_launch_kernel, "cuLaunchKernel"); | |
+ | |
+    av_log(NULL, AV_LOG_VERBOSE, "CUDA Library and Function loaded successfully\n"); | |
+    return 1; | |
+ | |
+error: | |
+    if (dl_fn->cuda_lib) | |
+        DL_CLOSE_FUNC(dl_fn->cuda_lib); | |
+ | |
+    dl_fn->cuda_lib = NULL; | |
+    return 0; | |
+} | |
+ | |
+ | |
+/** | |
+ * Load the CUDA library, initialize the driver and record every device's | |
+ * handle, name and compute capability in cudaCtx. | |
+ * | |
+ * The SM version of device i is stored as (major << 4) | minor in | |
+ * cudaCtx.smver[i]. At most MAX_NUM_GPU devices are recorded because the | |
+ * per-device tables have that many entries. | |
+ * | |
+ * @return 1 on success; 0 on any failure, with cudaCtx.device_count left at 0 | |
+ *         if a driver call failed or no device was found | |
+ */ | |
+int check_cuda(void) | |
+{ | |
+    int device_count = 0; | |
+    int smminor = 0, smmajor = 0; | |
+    int i; | |
+ | |
+    CudaDynLoadFunctions *dl_fn = &cudaCtx.cuda_dload_funcs; | |
+ | |
+    if (!dyload_cuda()) | |
+        return 0; | |
+ | |
+    check_cuda_errors(dl_fn->cu_init(0)); | |
+    check_cuda_errors(dl_fn->cu_device_get_count(&device_count)); | |
+ | |
+    if (device_count <= 0) { | |
+        av_log(NULL, AV_LOG_FATAL, "No CUDA capable devices found\n"); | |
+        goto error; | |
+    } | |
+ | |
+    av_log(NULL, AV_LOG_VERBOSE, "%d CUDA capable devices found\n", device_count); | |
+ | |
+    /* Never index past the fixed-size per-device arrays. */ | |
+    if (device_count > MAX_NUM_GPU) { | |
+        av_log(NULL, AV_LOG_WARNING, "Only the first %d CUDA devices will be used\n", MAX_NUM_GPU); | |
+        device_count = MAX_NUM_GPU; | |
+    } | |
+ | |
+    for (i = 0; i < device_count; ++i) { | |
+        check_cuda_errors(dl_fn->cu_device_get(&cudaCtx.cu_devices[i], i)); | |
+        check_cuda_errors(dl_fn->cu_device_get_name(cudaCtx.gpu_name[i], sizeof(cudaCtx.gpu_name[i]), cudaCtx.cu_devices[i])); | |
+        check_cuda_errors(dl_fn->cu_device_compute_capability(&smmajor, &smminor, cudaCtx.cu_devices[i])); | |
+ | |
+        cudaCtx.smver[i] = (smmajor << 4) | smminor; | |
+        av_log(NULL, AV_LOG_VERBOSE, "[ GPU #%d - < %s > has Compute SM %d.%d]\n", i, cudaCtx.gpu_name[i], smmajor, smminor); | |
+    } | |
+    cudaCtx.device_count = device_count; | |
+    return 1; | |
+ | |
+error: | |
+    cudaCtx.device_count = 0; | |
+    return 0; | |
+} | |
+ | |
+/** | |
+ * Make sure the CUDA devices have been enumerated. | |
+ * | |
+ * check_cuda() is only run while cudaCtx.device_count is 0; otherwise the | |
+ * previously recorded state is reused. | |
+ * | |
+ * @return 1 if devices are already recorded or check_cuda() succeeds, | |
+ *         0 otherwise | |
+ */ | |
+int init_cuda(void) | |
+{ | |
+    if (cudaCtx.device_count != 0) | |
+        return 1; | |
+ | |
+    return check_cuda() ? 1 : 0; | |
+} | |
+ | |
+/** | |
+ * Unload the CUDA driver library once no context is outstanding. | |
+ * | |
+ * Does nothing while any of the first device_count entries of | |
+ * cudaCtx.cuda_context_arr is still set, or when no library is loaded. | |
+ * After unloading, every resolved entry point is cleared and the device | |
+ * count is reset to 0, so the next init_cuda() loads the library and | |
+ * enumerates the devices again instead of reporting success while the | |
+ * function pointers are NULL. | |
+ */ | |
+void deinit_cuda(void) | |
+{ | |
+    CudaDynLoadFunctions *dl_fn = &cudaCtx.cuda_dload_funcs; | |
+    int i; | |
+ | |
+    /* Keep everything loaded while a context is still handed out. */ | |
+    for (i = 0; i < cudaCtx.device_count; i++) { | |
+        if (cudaCtx.cuda_context_arr[i] != NULL) | |
+            return; | |
+    } | |
+ | |
+    /* Nothing to unload; do not pass a NULL handle to dlclose()/FreeLibrary(). */ | |
+    if (!dl_fn->cuda_lib) | |
+        return; | |
+ | |
+    DL_CLOSE_FUNC(dl_fn->cuda_lib); | |
+    dl_fn->cuda_lib = NULL; | |
+ | |
+    dl_fn->cu_init = NULL; | |
+    dl_fn->cu_device_get_count = NULL; | |
+    dl_fn->cu_device_get = NULL; | |
+    dl_fn->cu_device_get_name = NULL; | |
+    dl_fn->cu_device_compute_capability = NULL; | |
+    dl_fn->cu_ctx_create = NULL; | |
+    dl_fn->cu_ctx_pop_current = NULL; | |
+    dl_fn->cu_ctx_destroy = NULL; | |
+    dl_fn->cu_module_load_data = NULL; | |
+    dl_fn->cu_module_get_function = NULL; | |
+    dl_fn->cu_module_get_texref = NULL; | |
+    dl_fn->cu_texref_set_flags = NULL; | |
+    dl_fn->cu_texref_set_filtermode = NULL; | |
+    dl_fn->cu_texref_set_address_2D = NULL; | |
+    dl_fn->cu_mem_alloc_pitch = NULL; | |
+    dl_fn->cu_mem_cpy_2D = NULL; | |
+    dl_fn->cu_mem_cpy_2D_async = NULL; | |
+    dl_fn->cu_mem_free = NULL; | |
+    dl_fn->cu_launch_kernel = NULL; | |
+ | |
+    /* init_cuda() keys off device_count; 0 forces a fresh load next time. */ | |
+    cudaCtx.device_count = 0; | |
+    av_log(NULL, AV_LOG_VERBOSE, "Cuda Library unloaded\n"); | |
+} | |
+ | |
+/** | |
+ * Tell whether a device meets a minimum SM version. | |
+ * | |
+ * @param gpu          index into the device tables filled by check_cuda() | |
+ * @param target_smver required version, encoded like cudaCtx.smver | |
+ *                     ((major << 4) | minor) | |
+ * @return 1 if gpu is a recorded device whose SM version is at least | |
+ *         target_smver; 0 otherwise, including when gpu is out of range or | |
+ *         no device has been enumerated | |
+ */ | |
+int is_gpu_feature_available(int gpu, int target_smver) | |
+{ | |
+    /* Also covers device_count == 0: no index is valid then. */ | |
+    if (gpu < 0 || gpu >= cudaCtx.device_count) | |
+        return 0; | |
+ | |
+    return cudaCtx.smver[gpu] >= target_smver; | |
+} | |
+ | |
+ | |
+/** | |
+ * Hand out the shared CUDA context of a device, creating it on first use. | |
+ * | |
+ * Each successful call increments the per-device use count; pair it with | |
+ * release_cuda_context(). | |
+ * | |
+ * @param ctx receives the context handle; set to NULL on failure | |
+ * @param gpu index into the device tables filled by check_cuda() | |
+ * @return CUDA_SUCCESS (0) on success, otherwise a non-zero CUresult value: | |
+ *         the code returned by the context-creation call, or 1 | |
+ *         (CUDA_ERROR_INVALID_VALUE in the driver API; not declared in the | |
+ *         local CUresult enum) for a NULL ctx or an out-of-range gpu | |
+ */ | |
+int get_cuda_context(CUcontext *ctx, int gpu) | |
+{ | |
+    CudaDynLoadFunctions *dl_fn = &cudaCtx.cuda_dload_funcs; | |
+ | |
+    if (!ctx) | |
+        return 1; | |
+ | |
+    /* Also rejects calls made before any device has been enumerated. */ | |
+    if (gpu < 0 || gpu >= cudaCtx.device_count) { | |
+        *ctx = NULL; | |
+        return 1; | |
+    } | |
+ | |
+    if (cudaCtx.cuda_context_arr[gpu] == NULL) { | |
+        CUcontext new_ctx = NULL; | |
+        /* Flag 4 is CU_CTX_SCHED_BLOCKING_SYNC in the CUDA driver API. */ | |
+        CUresult cu_res = dl_fn->cu_ctx_create(&new_ctx, 4, cudaCtx.cu_devices[gpu]); | |
+ | |
+        if (cu_res != CUDA_SUCCESS) { | |
+            /* Clear the caller's handle, not the local parameter. */ | |
+            *ctx = NULL; | |
+            return cu_res; | |
+        } | |
+ | |
+        cudaCtx.cuda_context_arr[gpu] = new_ctx; | |
+    } | |
+ | |
+    *ctx = cudaCtx.cuda_context_arr[gpu]; | |
+    cudaCtx.cuda_context_count[gpu] += 1; | |
+    av_log(NULL, AV_LOG_VERBOSE, "cudalib : Cuda Context created 0x%p\n", *ctx); | |
+ | |
+    return CUDA_SUCCESS; | |
+} | |
+ | |
+/** | |
+ * Drop one reference to a device's shared CUDA context. | |
+ * | |
+ * The context is destroyed when the per-device use count reaches zero. | |
+ * Calls with an out-of-range gpu, or without a matching | |
+ * get_cuda_context(), are ignored so the unsigned use count cannot wrap. | |
+ * | |
+ * @param ctx caller's handle; reset to NULL after a valid release (may be NULL) | |
+ * @param gpu index that was passed to get_cuda_context() | |
+ */ | |
+void release_cuda_context(CUcontext *ctx, int gpu) | |
+{ | |
+    CudaDynLoadFunctions *dl_fn = &cudaCtx.cuda_dload_funcs; | |
+ | |
+    if (gpu < 0 || gpu >= MAX_NUM_GPU) | |
+        return; | |
+ | |
+    /* Unbalanced release: decrementing would wrap the unsigned counter. */ | |
+    if (cudaCtx.cuda_context_count[gpu] == 0) | |
+        return; | |
+ | |
+    /* Invalidate the caller's copy, not the local parameter. */ | |
+    if (ctx) | |
+        *ctx = NULL; | |
+ | |
+    cudaCtx.cuda_context_count[gpu] -= 1; | |
+    if (cudaCtx.cuda_context_count[gpu] == 0) { | |
+        dl_fn->cu_ctx_destroy(cudaCtx.cuda_context_arr[gpu]); | |
+        cudaCtx.cuda_context_arr[gpu] = NULL; | |
+    } | |
+} | |
+ | |
+/** | |
+ * Test whether a pointer refers to an ffnvinfo produced by init_nvinfo(). | |
+ * | |
+ * The check compares the guid member at the start of the pointed-to data | |
+ * with NV_INFO_GUID; the caller must make sure that many bytes are readable. | |
+ * | |
+ * @param ptr candidate pointer, may be NULL | |
+ * @return 1 if ptr is non-NULL and its GUID matches, 0 otherwise | |
+ */ | |
+int check_nvinfo(void* ptr) | |
+{ | |
+    ffnvinfo *candidate = (ffnvinfo*)ptr; | |
+ | |
+    if (candidate == NULL) | |
+        return 0; | |
+ | |
+    return memcmp(&candidate->guid, &NV_INFO_GUID, sizeof(candidate->guid)) == 0; | |
+} | |
+ | |
+/** | |
+ * Allocate a zeroed ffnvinfo and stamp it with NV_INFO_GUID and | |
+ * FF_NVINFO_VERSION. | |
+ * | |
+ * @return the new structure (allocated with av_mallocz()), or NULL if the | |
+ *         allocation failed | |
+ */ | |
+ffnvinfo* init_nvinfo(void) | |
+{ | |
+    ffnvinfo *info = av_mallocz(sizeof(*info)); | |
+ | |
+    /* Do not write through a failed allocation. */ | |
+    if (!info) | |
+        return NULL; | |
+ | |
+    memcpy(&info->guid, &NV_INFO_GUID, sizeof(info->guid)); | |
+    info->version = FF_NVINFO_VERSION; | |
+    return info; | |
+} | |
+ | |
+/** | |
+ * Access the table of dynamically resolved CUDA entry points. | |
+ * | |
+ * @return pointer to cudaCtx.cuda_dload_funcs (never NULL); its members are | |
+ *         only set after dyload_cuda() has succeeded | |
+ */ | |
+CudaDynLoadFunctions* get_cuda_dl_func(void) | |
+{ | |
+    return &cudaCtx.cuda_dload_funcs; | |
+} | |
+ | |
+/** | |
+ * Build a CUDA_MEMCPY2D descriptor from the arguments and submit it. | |
+ * | |
+ * All descriptor members not listed below stay zero. The copy is issued | |
+ * through the cu_mem_cpy_2D_async entry point with a NULL stream. | |
+ * NOTE(review): despite the synchronous-looking name the async entry point | |
+ * is used - confirm callers do not rely on completion at return. | |
+ * | |
+ * @param width  copy width, stored in WidthInBytes | |
+ * @param height number of rows to copy | |
+ * @return the CUresult of the driver call | |
+ */ | |
+CUresult cuMemCpy2d(const void *srcHost, CUdeviceptr srcDevice, size_t srcPitch, void *dstHost, CUdeviceptr dstDevice, size_t dstPitch, size_t width, size_t height, CUmemorytype srcMemoryType, CUmemorytype dstMemoryType) | |
+{ | |
+    CUDA_MEMCPY2D copy_desc = { | |
+        .srcMemoryType = srcMemoryType, | |
+        .srcHost       = srcHost, | |
+        .srcDevice     = srcDevice, | |
+        .srcPitch      = srcPitch, | |
+        .dstMemoryType = dstMemoryType, | |
+        .dstHost       = dstHost, | |
+        .dstDevice     = dstDevice, | |
+        .dstPitch      = dstPitch, | |
+        .WidthInBytes  = width, | |
+        .Height        = height, | |
+    }; | |
+ | |
+    return get_cuda_dl_func()->cu_mem_cpy_2D_async(&copy_desc, NULL); | |
+} | |
diff -ruN ffmpeg-orig/libavutil/cudautils.h ffmpeg/libavutil/cudautils.h | |
--- ffmpeg-orig/libavutil/cudautils.h 1970-01-01 03:00:00.000000000 +0300 | |
+++ ffmpeg/libavutil/cudautils.h 2016-03-04 00:18:17.589433917 +0200 | |
@@ -0,0 +1,216 @@ | |
+/* | |
+* | |
+* This file is part of FFmpeg. | |
+* | |
+* FFmpeg is free software; you can redistribute it and/or | |
+* modify it under the terms of the GNU Lesser General Public | |
+* License as published by the Free Software Foundation; either | |
+* version 2.1 of the License, or (at your option) any later version. | |
+* | |
+* FFmpeg is distributed in the hope that it will be useful, | |
+* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
+* Lesser General Public License for more details. | |
+* | |
+* You should have received a copy of the GNU Lesser General Public | |
+* License along with FFmpeg; if not, write to the Free Software | |
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
+*/ | |
+ | |
+#if defined(_WIN32) | |
+#include <windows.h> | |
+#else | |
+#include <dlfcn.h> | |
+#endif | |
+ | |
+#include "common.h" | |
+ | |
+ | |
+/* Calling convention applied to the CUDA function-pointer typedefs below. */ | |
+#if defined(_WIN32) | |
+#define CUDAAPI __stdcall | |
+#else | |
+#define CUDAAPI | |
+#endif | |
+ | |
+/* | |
+ * Platform wrappers for dynamic loading: LOAD_FUNC(l, s) looks up symbol s in | |
+ * library handle l, DL_CLOSE_FUNC(l) releases the handle. | |
+ */ | |
+#if defined(_WIN32) | |
+#define LOAD_FUNC(l, s) GetProcAddress(l, s) | |
+#define DL_CLOSE_FUNC(l) FreeLibrary(l) | |
+#else | |
+#define LOAD_FUNC(l, s) dlsym(l, s) | |
+#define DL_CLOSE_FUNC(l) dlclose(l) | |
+#endif | |
+ | |
+/* Number of entries in each per-GPU array of CudaContext. */ | |
+#define MAX_NUM_GPU 16 | |
+ | |
+/* | |
+ * Texture-reference flag bits. | |
+ * NOTE(review): presumably the values accepted by the cu_texref_set_flags | |
+ * entry point (cuTexRefSetFlags) - confirm against the CUDA driver headers. | |
+ */ | |
+#define CU_TRSF_READ_AS_INTEGER 0x01 | |
+#define CU_TRSF_NORMALIZED_COORDINATES 0x02 | |
+#define CU_TRSF_SRGB 0x10 | |
+ | |
+/* | |
+ * Evaluate the CUDA driver call a once. If it does not return CUDA_SUCCESS, | |
+ * log the call text with file and line and make the enclosing function | |
+ * return AVERROR_EXTERNAL. Requires av_log() and AVERROR_EXTERNAL to be | |
+ * visible at the expansion site. | |
+ * The temporary has a deliberately unusual name: a plain "ret" would shadow | |
+ * the "ret" variable most callers have, and an argument expression that | |
+ * mentions the caller's "ret" would silently bind to the macro's own | |
+ * uninitialized variable instead. | |
+ */ | |
+#define __cu(a) do { \ | |
+    CUresult cu_check_res_ = (a); \ | |
+    if (cu_check_res_ != CUDA_SUCCESS) { \ | |
+        av_log(NULL, AV_LOG_FATAL, "[%s:%d]%s has returned CUDA error %d\n", __FILE__, __LINE__, #a, cu_check_res_); \ | |
+        return AVERROR_EXTERNAL;\ | |
+    }} while (0) | |
+ | |
+ | |
+/* | |
+ * Local declarations of the CUDA driver handle types used by this module. | |
+ * CUdevice is an int; the remaining handles are declared as opaque void | |
+ * pointers. | |
+ */ | |
+typedef int CUdevice; | |
+typedef void* CUcontext; | |
+typedef void* CUmodule; | |
+typedef void* CUfunction; | |
+typedef void* CUtexref; | |
+typedef void* CUstream; | |
+typedef void* CUarray; | |
+/* Device address type: unsigned long long on _WIN64/__LP64__ builds, unsigned int otherwise. */ | |
+#if defined(_WIN64) || defined(__LP64__) | |
+typedef unsigned long long CUdeviceptr; | |
+#else | |
+typedef unsigned int CUdeviceptr; | |
+#endif | |
+ | |
+/* | |
+ * Result type of the CUDA entry points declared below. Only the success | |
+ * value is named here; callers treat every other value as a failure. | |
+ */ | |
+typedef enum cudaError_enum { | |
+ CUDA_SUCCESS = 0 | |
+} CUresult; | |
+ | |
+/* Texture filter mode; argument type of the PCUTEXREFSETFILTERMODE entry point. */ | |
+typedef enum CUfilter_mode_enum { | |
+ CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */ | |
+ CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */ | |
+} CUfilter_mode; | |
+ | |
+/* Element formats; type of the Format member of CUDA_ARRAY_DESCRIPTOR. */ | |
+typedef enum CUarray_format_enum { | |
+ CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */ | |
+ CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */ | |
+ CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */ | |
+ CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */ | |
+ CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */ | |
+ CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */ | |
+ CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */ | |
+ CU_AD_FORMAT_FLOAT = 0x20 /**< 32-bit floating point */ | |
+} CUarray_format; | |
+ | |
+/* | |
+ * 2D array description; passed by pointer to the PCUTEXREFSETADDRESS2D | |
+ * entry point. | |
+ * NOTE(review): this is a local re-declaration, so its layout presumably has | |
+ * to match the driver's own structure - verify against cuda.h. | |
+ */ | |
+typedef struct CUDA_ARRAY_DESCRIPTOR_st | |
+{ | |
+ size_t Width; /**< Width of array */ | |
+ size_t Height; /**< Height of array */ | |
+ | |
+ CUarray_format Format; /**< Array format */ | |
+ unsigned int NumChannels; /**< Channels per array element */ | |
+} CUDA_ARRAY_DESCRIPTOR; | |
+ | |
+/* Memory kinds; type of the srcMemoryType/dstMemoryType members of CUDA_MEMCPY2D. */ | |
+typedef enum CUmemorytype_enum { | |
+ CU_MEMORYTYPE_HOST = 0x01, /**< Host memory */ | |
+ CU_MEMORYTYPE_DEVICE = 0x02, /**< Device memory */ | |
+ CU_MEMORYTYPE_ARRAY = 0x03, /**< Array memory */ | |
+ CU_MEMORYTYPE_UNIFIED = 0x04 /**< Unified device or host memory */ | |
+} CUmemorytype; | |
+ | |
+/* | |
+ * Parameter block for a 2D copy; passed by pointer to the PCUMEMCPY2D and | |
+ * PCUMEMCPY2DASYNC entry points. | |
+ * NOTE(review): this is a local re-declaration, so member order and sizes | |
+ * presumably have to match the driver's own structure - verify against cuda.h. | |
+ */ | |
+typedef struct CUDA_MEMCPY2D_st { | |
+ size_t srcXInBytes; /**< Source X in bytes */ | |
+ size_t srcY; /**< Source Y */ | |
+ | |
+ CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ | |
+ const void *srcHost; /**< Source host pointer */ | |
+ CUdeviceptr srcDevice; /**< Source device pointer */ | |
+ CUarray srcArray; /**< Source array reference */ | |
+ size_t srcPitch; /**< Source pitch (ignored when src is array) */ | |
+ | |
+ size_t dstXInBytes; /**< Destination X in bytes */ | |
+ size_t dstY; /**< Destination Y */ | |
+ | |
+ CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ | |
+ void *dstHost; /**< Destination host pointer */ | |
+ CUdeviceptr dstDevice; /**< Destination device pointer */ | |
+ CUarray dstArray; /**< Destination array reference */ | |
+ size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ | |
+ | |
+ size_t WidthInBytes; /**< Width of 2D memory copy in bytes */ | |
+ size_t Height; /**< Height of 2D memory copy */ | |
+} CUDA_MEMCPY2D; | |
+ | |
+ | |
+/* | |
+ * Function-pointer types for the CUDA driver entry points. One member of | |
+ * each type is held in CudaDynLoadFunctions; all of them return CUresult and | |
+ * use the CUDAAPI calling convention. | |
+ */ | |
+typedef CUresult(CUDAAPI *PCUINIT)(unsigned int Flags); | |
+typedef CUresult(CUDAAPI *PCUDEVICEGETCOUNT)(int *count); | |
+typedef CUresult(CUDAAPI *PCUDEVICEGET)(CUdevice *device, int ordinal); | |
+typedef CUresult(CUDAAPI *PCUDEVICEGETNAME)(char *name, int len, CUdevice dev); | |
+typedef CUresult(CUDAAPI *PCUDEVICECOMPUTECAPABILITY)(int *major, int *minor, CUdevice dev); | |
+typedef CUresult(CUDAAPI *PCUCTXCREATE)(CUcontext *pctx, unsigned int flags, CUdevice dev); | |
+typedef CUresult(CUDAAPI *PCUCTXPOPCURRENT)(CUcontext *pctx); | |
+typedef CUresult(CUDAAPI *PCUCTXDESTROY)(CUcontext ctx); | |
+typedef CUresult(CUDAAPI *PCUMODULELOADDATA)(CUmodule *module, const void *image); | |
+typedef CUresult(CUDAAPI *PCUMODULEGETFUNCTION)(CUfunction *hfunc, CUmodule hmod, const char *name); | |
+typedef CUresult(CUDAAPI *PCUMODULEGETTEXREF)(CUtexref *pTexRef, CUmodule hmod, const char *name); | |
+typedef CUresult(CUDAAPI *PCUTEXREFSETFLAGS)(CUtexref hTexRef, unsigned int Flags); | |
+typedef CUresult(CUDAAPI *PCUTEXREFSETFILTERMODE)(CUtexref hTexRef, CUfilter_mode fm); | |
+typedef CUresult(CUDAAPI *PCUTEXREFSETADDRESS2D)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); | |
+typedef CUresult(CUDAAPI *PCUMEMALLOCPITCH)(CUdeviceptr* dptr, size_t* pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes); | |
+typedef CUresult(CUDAAPI *PCUMEMCPY2D)(const CUDA_MEMCPY2D *pCopy); | |
+typedef CUresult(CUDAAPI *PCUMEMCPY2DASYNC)(const CUDA_MEMCPY2D *pCopy, CUstream hStream); | |
+typedef CUresult(CUDAAPI *PCUMEMFREE)(CUdeviceptr dptr); | |
+typedef CUresult(CUDAAPI *PCULAUNCHKERNEL)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); | |
+ | |
+ | |
+/* | |
+ * Table of CUDA driver entry points plus the handle of the library they | |
+ * belong to. One instance is embedded in CudaContext as cuda_dload_funcs. | |
+ */ | |
+typedef struct CudaDynLoadFunctions{ | |
+ PCUINIT cu_init; | |
+ PCUDEVICEGETCOUNT cu_device_get_count; | |
+ PCUDEVICEGET cu_device_get; | |
+ PCUDEVICEGETNAME cu_device_get_name; | |
+ PCUDEVICECOMPUTECAPABILITY cu_device_compute_capability; | |
+ PCUCTXCREATE cu_ctx_create; | |
+ PCUCTXPOPCURRENT cu_ctx_pop_current; | |
+ PCUCTXDESTROY cu_ctx_destroy; | |
+ PCUMODULELOADDATA cu_module_load_data; | |
+ PCUMODULEGETFUNCTION cu_module_get_function; | |
+ PCUMODULEGETTEXREF cu_module_get_texref; | |
+ PCUTEXREFSETFLAGS cu_texref_set_flags; | |
+ PCUTEXREFSETFILTERMODE cu_texref_set_filtermode; | |
+ PCUTEXREFSETADDRESS2D cu_texref_set_address_2D; | |
+ PCUMEMALLOCPITCH cu_mem_alloc_pitch; | |
+ PCUMEMCPY2D cu_mem_cpy_2D; | |
+ PCUMEMCPY2DASYNC cu_mem_cpy_2D_async; | |
+ PCUMEMFREE cu_mem_free; | |
+ PCULAUNCHKERNEL cu_launch_kernel; | |
+ | |
+ /* Library handle: HMODULE on _WIN32, void* otherwise. */ | |
+#if defined(_WIN32) | |
+ HMODULE cuda_lib; | |
+#else | |
+ void* cuda_lib; | |
+#endif | |
+} CudaDynLoadFunctions; | |
+ | |
+/* | |
+ * CUDA bookkeeping: per-GPU context handles and counters, the entry-point | |
+ * table, and per-GPU device information. Every array has MAX_NUM_GPU | |
+ * entries. | |
+ */ | |
+typedef struct CudaContext{ | |
+ | |
+ CUcontext cuda_context_arr[MAX_NUM_GPU]; | |
+ unsigned int cuda_context_count[MAX_NUM_GPU]; | |
+ CudaDynLoadFunctions cuda_dload_funcs; | |
+ | |
+ int device_count; | |
+ CUdevice cu_devices[MAX_NUM_GPU]; | |
+ char gpu_name[MAX_NUM_GPU][128]; | |
+ int smver[MAX_NUM_GPU]; | |
+} CudaContext; | |
+ | |
+/* 16-byte identifier split into 32-, 16-, 16-bit fields and 8 trailing bytes; type of ffnvinfo.guid. */ | |
+typedef struct _NVGUID { | |
+ uint32_t Data1; | |
+ uint16_t Data2; | |
+ uint16_t Data3; | |
+ uint8_t Data4[8]; | |
+} NVGUID; | |
+ | |
+/* | |
+ * Tagged side structure. guid is its first member, followed by a version | |
+ * number. | |
+ * NOTE(review): dptr/linesize look like per-plane device pointers and | |
+ * pitches, and vxCtx an opaque user context - confirm against the users of | |
+ * this struct. | |
+ */ | |
+typedef struct _ffnvinfo { | |
+ NVGUID guid; | |
+ uint32_t version; | |
+ //CUcontext cudaCtx; | |
+ void* vxCtx; | |
+ CUdeviceptr dptr[8]; | |
+ uint32_t linesize[8]; | |
+} ffnvinfo; | |
+ | |
+/* Public entry points of this header. */ | |
+/* Returns 1 on success, 0 on failure. */ | |
+int init_cuda(void); | |
+/* Unloads the CUDA library when no context is outstanding. */ | |
+void deinit_cuda(void); | |
+/* Stores the context of device index gpu in *ctx; returns CUDA_SUCCESS (0) or a non-zero error code. */ | |
+int get_cuda_context(CUcontext *ctx, int gpu); | |
+/* Counterpart of get_cuda_context() for the same gpu index. */ | |
+void release_cuda_context(CUcontext *ctx, int gpu); | |
+/* Returns 1 if the SM version recorded for gpu is at least target_smver, else 0. */ | |
+int is_gpu_feature_available(int gpu, int target_smver); | |
+/* Returns 1 if ptr is non-NULL and carries the ffnvinfo GUID, else 0. */ | |
+int check_nvinfo(void* ptr); | |
+/* Returns a newly allocated ffnvinfo. */ | |
+ffnvinfo* init_nvinfo(void); | |
+/* Returns the table of dynamically loaded CUDA entry points. */ | |
+CudaDynLoadFunctions* get_cuda_dl_func(void); | |
+ | |
+/* Fills a CUDA_MEMCPY2D from the arguments and submits the copy; returns the driver result. */ | |
+CUresult cuMemCpy2d(const void *srcHost, CUdeviceptr srcDevice, size_t srcPitch, void *dstHost, CUdeviceptr dstDevice, size_t dstPitch, size_t width, size_t height, CUmemorytype srcMemoryType, CUmemorytype dstMemoryType); | |
diff -ruN ffmpeg-orig/libavutil/Makefile ffmpeg/libavutil/Makefile | |
--- ffmpeg-orig/libavutil/Makefile 2016-03-04 00:17:21.063188717 +0200 | |
+++ ffmpeg/libavutil/Makefile 2016-03-04 00:18:17.589433917 +0200 | |
@@ -21,6 +21,7 @@ | |
common.h \ | |
cpu.h \ | |
crc.h \ | |
+ cudautils.h \ | |
des.h \ | |
display.h \ | |
downmix_info.h \ | |
@@ -98,6 +99,7 @@ | |
color_utils.o \ | |
cpu.o \ | |
crc.o \ | |
+ cudautils.o \ | |
des.o \ | |
display.o \ | |
downmix_info.o \ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment