Hi,
Currently, in the case of a 1 -> N transcode (1 SW decode -> N NVENC encodes) without the HW upload filter, we end up allocating multiple CUDA contexts for the N transcode sessions on the same underlying GPU device. This comes with CUDA context initialization overhead (~100 ms per context creation on a 4th-gen i5 with a GTX 1080 under Ubuntu 16.04). Also, in the case of M * (1 -> N) fully HW-accelerated transcodes, we face the same issue: the CUDA context is not shared between the M transcode sessions. Sharing the context would greatly reduce the initialization time, which matters for short-clip transcodes. I currently have a global array in libavutil/hwcontext_cuda.c which keeps track of the CUDA contexts created and reuses an existing context when a request to create a hwdevice ctx occurs. This is shared in the attached patch. Please check the approach and let me know if there is a better/cleaner way to do this. Thanks, Regards, Ganapathy ----------------------------------------------------------------------------------- This email message is for the sole use of the intended recipient(s) and may contain confidential information. Any unauthorized review, use, disclosure or distribution is prohibited. If you are not the intended recipient, please contact the sender by reply email and destroy all copies of the original message. -----------------------------------------------------------------------------------
From 9e828c7cd943b964ccf4cc8d1059fcef014b24a3 Mon Sep 17 00:00:00 2001 From: Ganapathy Kasi <gk...@nvidia.com> Date: Mon, 12 Jun 2017 13:14:36 -0700 Subject: [PATCH] Share cuda context across multiple transcode sessions for the same gpu Cuda context is allocated per decode/scale/encode session. If there are multiple transcodes in same process, many cuda contexts are allocated for the underlying same gpu device which has a initialization perf overhead. Sharing the cuda context per device fixes the issue. Also nvenc is directly using the cuda interface to create the cuda context instead of using the av_hwdevice interface. --- libavcodec/nvenc.c | 33 ++++++++++++++++++--------------- libavcodec/nvenc.h | 3 ++- libavutil/hwcontext_cuda.c | 40 ++++++++++++++++++++++++++-------------- 3 files changed, 46 insertions(+), 30 deletions(-) diff --git a/libavcodec/nvenc.c b/libavcodec/nvenc.c index f79b9a5..d5b6978 100644 --- a/libavcodec/nvenc.c +++ b/libavcodec/nvenc.c @@ -326,10 +326,14 @@ static av_cold int nvenc_check_device(AVCodecContext *avctx, int idx) NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs; NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs; char name[128] = { 0}; + char device_str[20]; int major, minor, ret; CUresult cu_res; CUdevice cu_device; CUcontext dummy; + AVHWDeviceContext *device_ctx; + AVCUDADeviceContext *device_hwctx; + int loglevel = AV_LOG_VERBOSE; if (ctx->device == LIST_DEVICES) @@ -364,19 +368,19 @@ static av_cold int nvenc_check_device(AVCodecContext *avctx, int idx) if (ctx->device != idx && ctx->device != ANY_DEVICE) return -1; - cu_res = dl_fn->cuda_dl->cuCtxCreate(&ctx->cu_context_internal, 0, cu_device); - if (cu_res != CUDA_SUCCESS) { - av_log(avctx, AV_LOG_FATAL, "Failed creating CUDA context for NVENC: 0x%x\n", (int)cu_res); + if (ctx->device == ANY_DEVICE) + ctx->device = 0; + + sprintf(device_str, "%d", ctx->device); + + ret = av_hwdevice_ctx_create(&ctx->hwdevice, AV_HWDEVICE_TYPE_CUDA, device_str, NULL, 0); + if 
(ret < 0) goto fail; - } - ctx->cu_context = ctx->cu_context_internal; + device_ctx = (AVHWDeviceContext *)ctx->hwdevice->data; + device_hwctx = device_ctx->hwctx; - cu_res = dl_fn->cuda_dl->cuCtxPopCurrent(&dummy); - if (cu_res != CUDA_SUCCESS) { - av_log(avctx, AV_LOG_FATAL, "Failed popping CUDA context: 0x%x\n", (int)cu_res); - goto fail2; - } + ctx->cu_context = device_hwctx->cuda_ctx; if ((ret = nvenc_open_session(avctx)) < 0) goto fail2; @@ -408,8 +412,8 @@ fail3: } fail2: - dl_fn->cuda_dl->cuCtxDestroy(ctx->cu_context_internal); - ctx->cu_context_internal = NULL; + av_buffer_unref(&ctx->hwdevice); + ctx->cu_context = NULL; fail: return AVERROR(ENOSYS); @@ -1374,9 +1378,8 @@ av_cold int ff_nvenc_encode_close(AVCodecContext *avctx) return AVERROR_EXTERNAL; } - if (ctx->cu_context_internal) - dl_fn->cuda_dl->cuCtxDestroy(ctx->cu_context_internal); - ctx->cu_context = ctx->cu_context_internal = NULL; + av_buffer_unref(&ctx->hwdevice); + ctx->cu_context = NULL; nvenc_free_functions(&dl_fn->nvenc_dl); cuda_free_functions(&dl_fn->cuda_dl); diff --git a/libavcodec/nvenc.h b/libavcodec/nvenc.h index 2e24604..327c914 100644 --- a/libavcodec/nvenc.h +++ b/libavcodec/nvenc.h @@ -106,7 +106,6 @@ typedef struct NvencContext NV_ENC_INITIALIZE_PARAMS init_encode_params; NV_ENC_CONFIG encode_config; CUcontext cu_context; - CUcontext cu_context_internal; int nb_surfaces; NvencSurface *surfaces; @@ -116,6 +115,8 @@ typedef struct NvencContext AVFifoBuffer *output_surface_ready_queue; AVFifoBuffer *timestamp_list; + AVBufferRef *hwdevice; + struct { CUdeviceptr ptr; NV_ENC_REGISTERED_PTR regptr; diff --git a/libavutil/hwcontext_cuda.c b/libavutil/hwcontext_cuda.c index ed595c3..16d2812 100644 --- a/libavutil/hwcontext_cuda.c +++ b/libavutil/hwcontext_cuda.c @@ -24,8 +24,12 @@ #include "mem.h" #include "pixdesc.h" #include "pixfmt.h" +#include <time.h> #define CUDA_FRAME_ALIGNMENT 256 +#define NUM_DEVICES 8 + +CUcontext cudaCtx[NUM_DEVICES] = { NULL }; typedef struct 
CUDAFramesContext { int shift_width, shift_height; @@ -363,27 +367,35 @@ static int cuda_device_create(AVHWDeviceContext *ctx, const char *device, cu = hwctx->internal->cuda_dl; err = cu->cuInit(0); - if (err != CUDA_SUCCESS) { - av_log(ctx, AV_LOG_ERROR, "Could not initialize the CUDA driver API\n"); - goto error; - } - err = cu->cuDeviceGet(&cu_device, device_idx); if (err != CUDA_SUCCESS) { - av_log(ctx, AV_LOG_ERROR, "Could not get the device number %d\n", device_idx); - goto error; - } - - err = cu->cuCtxCreate(&hwctx->cuda_ctx, CU_CTX_SCHED_BLOCKING_SYNC, cu_device); - if (err != CUDA_SUCCESS) { - av_log(ctx, AV_LOG_ERROR, "Error creating a CUDA context\n"); + av_log(ctx, AV_LOG_ERROR, "Could not initialize the CUDA driver API\n"); goto error; } - cu->cuCtxPopCurrent(&dummy); + if (!cudaCtx[device_idx]) + { + err = cu->cuDeviceGet(&cu_device, device_idx); + if (err != CUDA_SUCCESS) { + av_log(ctx, AV_LOG_ERROR, "Could not get the device number %d\n", device_idx); + goto error; + } - hwctx->internal->is_allocated = 1; + err = cu->cuCtxCreate(&hwctx->cuda_ctx, 0, cu_device); + if (err != CUDA_SUCCESS) { + av_log(ctx, AV_LOG_ERROR, "Error creating a CUDA context\n"); + goto error; + } + cu->cuCtxPopCurrent(&dummy); + cudaCtx[device_idx] = hwctx->cuda_ctx; + hwctx->internal->is_allocated = 1; + } + else + { + hwctx->cuda_ctx = cudaCtx[device_idx]; + hwctx->internal->is_allocated = 0; + } return 0; error: -- 2.7.4
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel