[FFmpeg-devel] Sharing cuda context between transcode sessions to reduce initialization overhead

Ganapathy Raman Kasi Mon, 12 Jun 2017 13:39:06 -0700

Hi,


Currently, in the case of a 1 -> N transcode (1 SW decode -> N NVENC encodes)
without the HW upload filter, we end up allocating multiple CUDA contexts for
the N transcode sessions on the same underlying GPU device. Each of them incurs
the CUDA context initialization overhead (~100 ms per context creation on a
4th-gen i5 with a GTX 1080 under Ubuntu 16.04). The same issue arises with
M * (1 -> N) fully HW-accelerated transcodes, where the CUDA context is not
shared between the M transcode sessions. Sharing the context would greatly
reduce the initialization time, which matters for short-clip transcodes.


I currently have a global array in libavutil/hwcontext_cuda.c which keeps track
of the CUDA contexts created and reuses an existing context when a hwdevice ctx
create request occurs for the same device. This is included in the attached
patch. Please check the approach and let me know if there is a better/cleaner
way to do this. Thanks.


Regards

Ganapathy


-----------------------------------------------------------------------------------
This email message is for the sole use of the intended recipient(s) and may
contain confidential information.  Any unauthorized review, use, disclosure or
distribution is prohibited.  If you are not the intended recipient, please
contact the sender by reply email and destroy all copies of the original
message.
-----------------------------------------------------------------------------------

From 9e828c7cd943b964ccf4cc8d1059fcef014b24a3 Mon Sep 17 00:00:00 2001
From: Ganapathy Kasi <gk...@nvidia.com>
Date: Mon, 12 Jun 2017 13:14:36 -0700
Subject: [PATCH] Share cuda context across multiple transcode sessions for the
 same gpu

A CUDA context is allocated per decode/scale/encode session. If there are
multiple transcodes in the same process, many CUDA contexts are allocated for
the same underlying GPU device, which incurs an initialization performance
overhead. Sharing one CUDA context per device fixes the issue. Also, nvenc
currently creates its CUDA context directly through the CUDA interface instead
of the av_hwdevice interface; switch it to av_hwdevice_ctx_create() so that it
uses the shared context as well.
---
 libavcodec/nvenc.c         | 33 ++++++++++++++++++---------------
 libavcodec/nvenc.h         |  3 ++-
 libavutil/hwcontext_cuda.c | 40 ++++++++++++++++++++++++++--------------
 3 files changed, 46 insertions(+), 30 deletions(-)

diff --git a/libavcodec/nvenc.c b/libavcodec/nvenc.c
index f79b9a5..d5b6978 100644
--- a/libavcodec/nvenc.c
+++ b/libavcodec/nvenc.c
@@ -326,10 +326,14 @@ static av_cold int nvenc_check_device(AVCodecContext *avctx, int idx)
     NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
     NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
     char name[128] = { 0};
+    char device_str[20];
     int major, minor, ret;
     CUresult cu_res;
     CUdevice cu_device;
     CUcontext dummy;
+    AVHWDeviceContext *device_ctx;
+    AVCUDADeviceContext *device_hwctx;
+
     int loglevel = AV_LOG_VERBOSE;
 
     if (ctx->device == LIST_DEVICES)
@@ -364,19 +368,19 @@ static av_cold int nvenc_check_device(AVCodecContext *avctx, int idx)
     if (ctx->device != idx && ctx->device != ANY_DEVICE)
         return -1;
 
-    cu_res = dl_fn->cuda_dl->cuCtxCreate(&ctx->cu_context_internal, 0, cu_device);
-    if (cu_res != CUDA_SUCCESS) {
-        av_log(avctx, AV_LOG_FATAL, "Failed creating CUDA context for NVENC: 0x%x\n", (int)cu_res);
+    if (ctx->device == ANY_DEVICE)
+        ctx->device = 0;
+
+    sprintf(device_str, "%d", ctx->device);
+
+    ret = av_hwdevice_ctx_create(&ctx->hwdevice, AV_HWDEVICE_TYPE_CUDA, device_str, NULL, 0);
+    if (ret < 0)
         goto fail;
-    }
 
-    ctx->cu_context = ctx->cu_context_internal;
+    device_ctx = (AVHWDeviceContext *)ctx->hwdevice->data;
+    device_hwctx = device_ctx->hwctx;
 
-    cu_res = dl_fn->cuda_dl->cuCtxPopCurrent(&dummy);
-    if (cu_res != CUDA_SUCCESS) {
-        av_log(avctx, AV_LOG_FATAL, "Failed popping CUDA context: 0x%x\n", (int)cu_res);
-        goto fail2;
-    }
+    ctx->cu_context = device_hwctx->cuda_ctx;
 
     if ((ret = nvenc_open_session(avctx)) < 0)
         goto fail2;
@@ -408,8 +412,8 @@ fail3:
     }
 
 fail2:
-    dl_fn->cuda_dl->cuCtxDestroy(ctx->cu_context_internal);
-    ctx->cu_context_internal = NULL;
+    av_buffer_unref(&ctx->hwdevice);
+    ctx->cu_context = NULL;
 
 fail:
     return AVERROR(ENOSYS);
@@ -1374,9 +1378,8 @@ av_cold int ff_nvenc_encode_close(AVCodecContext *avctx)
         return AVERROR_EXTERNAL;
     }
 
-    if (ctx->cu_context_internal)
-        dl_fn->cuda_dl->cuCtxDestroy(ctx->cu_context_internal);
-    ctx->cu_context = ctx->cu_context_internal = NULL;
+    av_buffer_unref(&ctx->hwdevice);
+    ctx->cu_context = NULL;
 
     nvenc_free_functions(&dl_fn->nvenc_dl);
     cuda_free_functions(&dl_fn->cuda_dl);
diff --git a/libavcodec/nvenc.h b/libavcodec/nvenc.h
index 2e24604..327c914 100644
--- a/libavcodec/nvenc.h
+++ b/libavcodec/nvenc.h
@@ -106,7 +106,6 @@ typedef struct NvencContext
     NV_ENC_INITIALIZE_PARAMS init_encode_params;
     NV_ENC_CONFIG encode_config;
     CUcontext cu_context;
-    CUcontext cu_context_internal;
 
     int nb_surfaces;
     NvencSurface *surfaces;
@@ -116,6 +115,8 @@ typedef struct NvencContext
     AVFifoBuffer *output_surface_ready_queue;
     AVFifoBuffer *timestamp_list;
 
+    AVBufferRef *hwdevice;
+
     struct {
         CUdeviceptr ptr;
         NV_ENC_REGISTERED_PTR regptr;
diff --git a/libavutil/hwcontext_cuda.c b/libavutil/hwcontext_cuda.c
index ed595c3..16d2812 100644
--- a/libavutil/hwcontext_cuda.c
+++ b/libavutil/hwcontext_cuda.c
@@ -24,8 +24,12 @@
 #include "mem.h"
 #include "pixdesc.h"
 #include "pixfmt.h"
+#include <time.h>
 
 #define CUDA_FRAME_ALIGNMENT 256
+#define NUM_DEVICES 8
+
+CUcontext cudaCtx[NUM_DEVICES] = { NULL };
 
 typedef struct CUDAFramesContext {
     int shift_width, shift_height;
@@ -363,27 +367,35 @@ static int cuda_device_create(AVHWDeviceContext *ctx, const char *device,
     cu = hwctx->internal->cuda_dl;
 
     err = cu->cuInit(0);
-    if (err != CUDA_SUCCESS) {
-        av_log(ctx, AV_LOG_ERROR, "Could not initialize the CUDA driver API\n");
-        goto error;
-    }
 
-    err = cu->cuDeviceGet(&cu_device, device_idx);
     if (err != CUDA_SUCCESS) {
-        av_log(ctx, AV_LOG_ERROR, "Could not get the device number %d\n", device_idx);
-        goto error;
-    }
-
-    err = cu->cuCtxCreate(&hwctx->cuda_ctx, CU_CTX_SCHED_BLOCKING_SYNC, cu_device);
-    if (err != CUDA_SUCCESS) {
-        av_log(ctx, AV_LOG_ERROR, "Error creating a CUDA context\n");
+        av_log(ctx, AV_LOG_ERROR, "Could not initialize the CUDA driver API\n");
         goto error;
     }
 
-    cu->cuCtxPopCurrent(&dummy);
+    if (!cudaCtx[device_idx])
+    {
+        err = cu->cuDeviceGet(&cu_device, device_idx);
+        if (err != CUDA_SUCCESS) {
+            av_log(ctx, AV_LOG_ERROR, "Could not get the device number %d\n", device_idx);
+            goto error;
+        }
 
-    hwctx->internal->is_allocated = 1;
+        err = cu->cuCtxCreate(&hwctx->cuda_ctx, 0, cu_device);
+        if (err != CUDA_SUCCESS) {
+            av_log(ctx, AV_LOG_ERROR, "Error creating a CUDA context\n");
+            goto error;
+        }
 
+        cu->cuCtxPopCurrent(&dummy);
+        cudaCtx[device_idx] = hwctx->cuda_ctx;
+        hwctx->internal->is_allocated = 1;
+    }
+    else
+    {
+        hwctx->cuda_ctx = cudaCtx[device_idx];
+        hwctx->internal->is_allocated = 0;
+    }
     return 0;
 
 error:
-- 
2.7.4

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] Sharing cuda context between transcode sessions to reduce initialization overhead

Reply via email to