Hi,

Recently NVIDIA did some work on improving nvenc performance. It includes a lot of changes, so I am attaching the patch instead of sending it inline.

Here are the explanations:
1) The first main change is adding an nvresize filter (1:N, one input, multiple outputs) to do hardware resizing, because during our internal 1:N encoding tests we found that swscale becomes the bottleneck. So we use a CUDA kernel instead.

2) We use the AVFrame::opaque field to store a custom ffnvinfo structure to avoid expensive CPU<->GPU transfers. Without it, the workflow would be: CPU AVFrame input-->copy to GPU-->do CUDA resizing-->copy to CPU AVFrame-->copy to GPU-->NVENC encoding. And now it becomes:
CPU AVFrame input-->copy to GPU-->do CUDA resizing-->NVENC encoding.
Our strategy is to check whether AVFrame::opaque is not null AND its first 128 bytes match a particular GUID. If so, AVFrame::opaque is a valid ffnvinfo structure and we read the GPU address directly from it instead of copying data from the AVFrame. The nvresize filter has a -readback parameter: if it is set to 0, the resized result is not copied back to the CPU, which is mainly intended for the case where the filter is connected to an NVENC encoder. If it is set to 1, the resized result is still copied back to the AVFrame so that it remains compatible with other components.

3) Because we are using CUDA addresses now, the input buffer becomes CUDA external memory. We replaced NvEncCreateInputBuffer with cuMemAllocPitch + NvEncRegisterResource, and NvEncLock/UnlockInputBuffer with NvEncMapInputResource/NvEncUnmapInputResource.

4) Using CUDA input exposed some driver bugs, e.g. nvenc generates corrupted chroma plane data if the buffer format is YUV420P. A driver with the fix will be released soon, but for backwards compatibility we decided to convert YUV420P to NV12 explicitly with a CUDA kernel in nvenc.c. Even with the fixed driver there is still a YUV420P->NV12 conversion kernel; the only difference is that the kernel is provided along with the driver, whereas here we do it within nvenc.c. For the same reason, YUV444P is removed temporarily: there is a bug with CUDA input. Once the fix is released, we should enable the support again. We chose to keep supporting YUV420P because it is much more popular than YUV444P.

5) Lastly, we moved most of the CUDA typedefs/functions/helpers to cudautils.h/c.

A typical use case is:
ffmpeg -y -i $1 $2 $3 -filter_complex \


nvresize=5:s=hd1080\|hd720\|hd480\|wvga\|cif:readback=0[out0][out1][out2][out3][out4] \

-map [out0] -an -vcodec nvenc_h264 -preset slow -profile:v main -async 1 -b:v 200M -bufsize 200M -maxrate 200M -refs 1 -bf 2 $1_1080p.mp4 \

-map [out1] -an -vcodec nvenc_h264 -preset slow -profile:v main -async 1 -b:v 100M -bufsize 100M -maxrate 100M -refs 1 -bf 2 $1_720p.mp4 \

-map [out2] -an -vcodec nvenc_h264 -preset slow -profile:v main -async 1 -b:v 50M -bufsize 50M -maxrate 50M -refs 1 -bf 2 $1_480p.mp4 \

-map [out3] -an -vcodec nvenc_h264 -preset slow -profile:v main -async 1 -b:v 25M -bufsize 25M -maxrate 25M -refs 1 -bf 2 $1_wvga.mp4 \

-map [out4] -an -vcodec nvenc_h264 -preset slow -profile:v main -async 1 -b:v 10M -bufsize 10M -maxrate 10M -refs 1 -bf 2 $1_cif.mp4


Thanks
Agatha Hu
>From 4bb843a47cbcef9c0383efb7e573f0f8eadb65d6 Mon Sep 17 00:00:00 2001
From: Ganapathy Kasi <gk...@nvidia.com>
Date: Wed, 4 Nov 2015 22:22:35 -0800
Subject: [PATCH] combined: cuda resize,yuv420 fix,remove yuv444,add AQ

---
 libavcodec/Makefile           |   2 +-
 libavcodec/nvenc.c            | 435 ++++++++++-----------------
 libavcodec/nvenc_ptx.c        | 240 +++++++++++++++
 libavfilter/Makefile          |   2 +
 libavfilter/allfilters.c      |   1 +
 libavfilter/vf_nvresize.c     | 669 ++++++++++++++++++++++++++++++++++++++++++
 libavfilter/vf_nvresize_ptx.c | 659 +++++++++++++++++++++++++++++++++++++++++
 libavutil/Makefile            |   2 +
 libavutil/cudautils.c         | 288 ++++++++++++++++++
 libavutil/cudautils.h         | 216 ++++++++++++++
 10 files changed, 2241 insertions(+), 273 deletions(-)
 create mode 100644 libavcodec/nvenc_ptx.c
 create mode 100644 libavfilter/vf_nvresize.c
 create mode 100644 libavfilter/vf_nvresize_ptx.c
 create mode 100644 libavutil/cudautils.c
 create mode 100644 libavutil/cudautils.h

diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 67fb72a..45ac476 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -98,7 +98,7 @@ OBJS-$(CONFIG_MPEGVIDEOENC)            += mpegvideo_enc.o mpeg12data.o  \
                                           motion_est.o ratecontrol.o    \
                                           mpegvideoencdsp.o
 OBJS-$(CONFIG_MSS34DSP)                += mss34dsp.o
-OBJS-$(CONFIG_NVENC)                   += nvenc.o
+OBJS-$(CONFIG_NVENC)                   += nvenc.o nvenc_ptx.o
 OBJS-$(CONFIG_PIXBLOCKDSP)             += pixblockdsp.o
 OBJS-$(CONFIG_QPELDSP)                 += qpeldsp.o
 OBJS-$(CONFIG_QSV)                     += qsv.o
diff --git a/libavcodec/nvenc.c b/libavcodec/nvenc.c
index 812b0b4..fa6778c 100644
--- a/libavcodec/nvenc.c
+++ b/libavcodec/nvenc.c
@@ -32,15 +32,11 @@
 #include "libavutil/avassert.h"
 #include "libavutil/opt.h"
 #include "libavutil/mem.h"
+#include "libavutil/cudautils.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "thread.h"
 
-#if defined(_WIN32)
-#define CUDAAPI __stdcall
-#else
-#define CUDAAPI
-#endif
 
 #if defined(_WIN32)
 #define LOAD_FUNC(l, s) GetProcAddress(l, s)
@@ -50,28 +46,19 @@
 #define DL_CLOSE_FUNC(l) dlclose(l)
 #endif
 
-typedef enum cudaError_enum {
-    CUDA_SUCCESS = 0
-} CUresult;
-typedef int CUdevice;
-typedef void* CUcontext;
-
-typedef CUresult(CUDAAPI *PCUINIT)(unsigned int Flags);
-typedef CUresult(CUDAAPI *PCUDEVICEGETCOUNT)(int *count);
-typedef CUresult(CUDAAPI *PCUDEVICEGET)(CUdevice *device, int ordinal);
-typedef CUresult(CUDAAPI *PCUDEVICEGETNAME)(char *name, int len, CUdevice dev);
-typedef CUresult(CUDAAPI *PCUDEVICECOMPUTECAPABILITY)(int *major, int *minor, CUdevice dev);
-typedef CUresult(CUDAAPI *PCUCTXCREATE)(CUcontext *pctx, unsigned int flags, CUdevice dev);
-typedef CUresult(CUDAAPI *PCUCTXPOPCURRENT)(CUcontext *pctx);
-typedef CUresult(CUDAAPI *PCUCTXDESTROY)(CUcontext ctx);
+#define BLOCKSX  128
+#define THREADSX 128
 
 typedef NVENCSTATUS (NVENCAPI* PNVENCODEAPICREATEINSTANCE)(NV_ENCODE_API_FUNCTION_LIST *functionList);
 
 typedef struct NvencInputSurface
 {
     NV_ENC_INPUT_PTR input_surface;
+    CUdeviceptr      dptr;
+    void*            hRes;
     int width;
     int height;
+    size_t pitch;
 
     int lockCount;
 
@@ -107,24 +94,11 @@ typedef struct NvencDataList
 
 typedef struct NvencDynLoadFunctions
 {
-    PCUINIT cu_init;
-    PCUDEVICEGETCOUNT cu_device_get_count;
-    PCUDEVICEGET cu_device_get;
-    PCUDEVICEGETNAME cu_device_get_name;
-    PCUDEVICECOMPUTECAPABILITY cu_device_compute_capability;
-    PCUCTXCREATE cu_ctx_create;
-    PCUCTXPOPCURRENT cu_ctx_pop_current;
-    PCUCTXDESTROY cu_ctx_destroy;
-
     NV_ENCODE_API_FUNCTION_LIST nvenc_funcs;
-    int nvenc_device_count;
-    CUdevice nvenc_devices[16];
 
 #if defined(_WIN32)
-    HMODULE cuda_lib;
     HMODULE nvenc_lib;
 #else
-    void* cuda_lib;
     void* nvenc_lib;
 #endif
 } NvencDynLoadFunctions;
@@ -140,14 +114,18 @@ typedef struct NvencContext
     AVClass *avclass;
 
     NvencDynLoadFunctions nvenc_dload_funcs;
+    CudaDynLoadFunctions* cuda_dload_funcs;
 
     NV_ENC_INITIALIZE_PARAMS init_encode_params;
     NV_ENC_CONFIG encode_config;
     CUcontext cu_context;
+    CUmodule  cu_module;
+    CUfunction cu_func_interleaveChroma;
 
     int max_surface_count;
     NvencInputSurface *input_surfaces;
     NvencOutputSurface *output_surfaces;
+    NvencInputSurface transferSurf;
 
     NvencDataList output_surface_queue;
     NvencDataList output_surface_ready_queue;
@@ -164,8 +142,10 @@ typedef struct NvencContext
     int twopass;
     int gpu;
     int buffer_delay;
+    int aq;
 } NvencContext;
 
+
 static const NvencValuePair nvenc_h264_level_pairs[] = {
     { "auto", NV_ENC_LEVEL_AUTOSELECT },
     { "1"   , NV_ENC_LEVEL_H264_1     },
@@ -330,79 +310,18 @@ static int64_t timestamp_queue_dequeue(NvencDataList* queue)
     return res->u.timestamp;
 }
 
-#define CHECK_LOAD_FUNC(t, f, s) \
-do { \
-    (f) = (t)LOAD_FUNC(dl_fn->cuda_lib, s); \
-    if (!(f)) { \
-        av_log(avctx, AV_LOG_FATAL, "Failed loading %s from CUDA library\n", s); \
-        goto error; \
-    } \
-} while (0)
-
-static av_cold int nvenc_dyload_cuda(AVCodecContext *avctx)
-{
-    NvencContext *ctx = avctx->priv_data;
-    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
-
-    if (dl_fn->cuda_lib)
-        return 1;
-
-#if defined(_WIN32)
-    dl_fn->cuda_lib = LoadLibrary(TEXT("nvcuda.dll"));
-#else
-    dl_fn->cuda_lib = dlopen("libcuda.so", RTLD_LAZY);
-#endif
-
-    if (!dl_fn->cuda_lib) {
-        av_log(avctx, AV_LOG_FATAL, "Failed loading CUDA library\n");
-        goto error;
-    }
-
-    CHECK_LOAD_FUNC(PCUINIT, dl_fn->cu_init, "cuInit");
-    CHECK_LOAD_FUNC(PCUDEVICEGETCOUNT, dl_fn->cu_device_get_count, "cuDeviceGetCount");
-    CHECK_LOAD_FUNC(PCUDEVICEGET, dl_fn->cu_device_get, "cuDeviceGet");
-    CHECK_LOAD_FUNC(PCUDEVICEGETNAME, dl_fn->cu_device_get_name, "cuDeviceGetName");
-    CHECK_LOAD_FUNC(PCUDEVICECOMPUTECAPABILITY, dl_fn->cu_device_compute_capability, "cuDeviceComputeCapability");
-    CHECK_LOAD_FUNC(PCUCTXCREATE, dl_fn->cu_ctx_create, "cuCtxCreate_v2");
-    CHECK_LOAD_FUNC(PCUCTXPOPCURRENT, dl_fn->cu_ctx_pop_current, "cuCtxPopCurrent_v2");
-    CHECK_LOAD_FUNC(PCUCTXDESTROY, dl_fn->cu_ctx_destroy, "cuCtxDestroy_v2");
-
-    return 1;
-
-error:
-
-    if (dl_fn->cuda_lib)
-        DL_CLOSE_FUNC(dl_fn->cuda_lib);
-
-    dl_fn->cuda_lib = NULL;
-
-    return 0;
-}
-
-static av_cold int check_cuda_errors(AVCodecContext *avctx, CUresult err, const char *func)
-{
-    if (err != CUDA_SUCCESS) {
-        av_log(avctx, AV_LOG_FATAL, ">> %s - failed with error code 0x%x\n", func, err);
-        return 0;
-    }
-    return 1;
-}
-#define check_cuda_errors(f) if (!check_cuda_errors(avctx, f, #f)) goto error
-
 static av_cold int nvenc_check_cuda(AVCodecContext *avctx)
 {
-    int device_count = 0;
-    CUdevice cu_device = 0;
-    char gpu_name[128];
-    int smminor = 0, smmajor = 0;
-    int i, smver, target_smver;
 
+    int  target_smver;
     NvencContext *ctx = avctx->priv_data;
-    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+
+    if (!init_cuda())
+        return 0;
 
     switch (avctx->codec->id) {
     case AV_CODEC_ID_H264:
-        target_smver = avctx->pix_fmt == AV_PIX_FMT_YUV444P ? 0x52 : 0x30;
+        target_smver = 0x30;
         break;
     case AV_CODEC_ID_H265:
         target_smver = 0x52;
@@ -412,49 +331,19 @@ static av_cold int nvenc_check_cuda(AVCodecContext *avctx)
         goto error;
     }
 
-    if (!nvenc_dyload_cuda(avctx))
-        return 0;
-
-    if (dl_fn->nvenc_device_count > 0)
-        return 1;
-
-    check_cuda_errors(dl_fn->cu_init(0));
-
-    check_cuda_errors(dl_fn->cu_device_get_count(&device_count));
-
-    if (!device_count) {
-        av_log(avctx, AV_LOG_FATAL, "No CUDA capable devices found\n");
+    if (!is_gpu_feature_available(ctx->gpu, target_smver))
+    {
+        av_log(avctx, AV_LOG_FATAL, "NVENC with Codec %s Not Available at requested GPU %d \n", (avctx->codec->id == AV_CODEC_ID_H264)? "H264" : "H265", ctx->gpu);
         goto error;
     }
-
-    av_log(avctx, AV_LOG_VERBOSE, "%d CUDA capable devices found\n", device_count);
-
-    dl_fn->nvenc_device_count = 0;
-
-    for (i = 0; i < device_count; ++i) {
-        check_cuda_errors(dl_fn->cu_device_get(&cu_device, i));
-        check_cuda_errors(dl_fn->cu_device_get_name(gpu_name, sizeof(gpu_name), cu_device));
-        check_cuda_errors(dl_fn->cu_device_compute_capability(&smmajor, &smminor, cu_device));
-
-        smver = (smmajor << 4) | smminor;
-
-        av_log(avctx, AV_LOG_VERBOSE, "[ GPU #%d - < %s > has Compute SM %d.%d, NVENC %s ]\n", i, gpu_name, smmajor, smminor, (smver >= target_smver) ? "Available" : "Not Available");
-
-        if (smver >= target_smver)
-            dl_fn->nvenc_devices[dl_fn->nvenc_device_count++] = cu_device;
-    }
-
-    if (!dl_fn->nvenc_device_count) {
-        av_log(avctx, AV_LOG_FATAL, "No NVENC capable devices found\n");
-        goto error;
+    else
+    {
+        av_log(avctx, AV_LOG_VERBOSE, "NVENC with Codec %s Available at requested GPU %d \n", (avctx->codec->id == AV_CODEC_ID_H264) ? "H264" : "H265", ctx->gpu);
     }
 
     return 1;
 
 error:
-
-    dl_fn->nvenc_device_count = 0;
-
     return 0;
 }
 
@@ -488,23 +377,18 @@ static av_cold int nvenc_dyload_nvenc(AVCodecContext *avctx)
     }
 
     nvEncodeAPICreateInstance = (PNVENCODEAPICREATEINSTANCE)LOAD_FUNC(dl_fn->nvenc_lib, "NvEncodeAPICreateInstance");
-
     if (!nvEncodeAPICreateInstance) {
         av_log(avctx, AV_LOG_FATAL, "Failed to load nvenc entrypoint\n");
         goto error;
     }
 
     dl_fn->nvenc_funcs.version = NV_ENCODE_API_FUNCTION_LIST_VER;
-
     nvstatus = nvEncodeAPICreateInstance(&dl_fn->nvenc_funcs);
-
     if (nvstatus != NV_ENC_SUCCESS) {
         av_log(avctx, AV_LOG_FATAL, "Failed to create nvenc instance\n");
         goto error;
     }
 
-    av_log(avctx, AV_LOG_VERBOSE, "Nvenc initialized successfully\n");
-
     return 1;
 
 error:
@@ -512,7 +396,6 @@ error:
         DL_CLOSE_FUNC(dl_fn->nvenc_lib);
 
     dl_fn->nvenc_lib = NULL;
-
     return 0;
 }
 
@@ -523,29 +406,16 @@ static av_cold void nvenc_unload_nvenc(AVCodecContext *avctx)
 
     DL_CLOSE_FUNC(dl_fn->nvenc_lib);
     dl_fn->nvenc_lib = NULL;
-
-    dl_fn->nvenc_device_count = 0;
-
-    DL_CLOSE_FUNC(dl_fn->cuda_lib);
-    dl_fn->cuda_lib = NULL;
-
-    dl_fn->cu_init = NULL;
-    dl_fn->cu_device_get_count = NULL;
-    dl_fn->cu_device_get = NULL;
-    dl_fn->cu_device_get_name = NULL;
-    dl_fn->cu_device_compute_capability = NULL;
-    dl_fn->cu_ctx_create = NULL;
-    dl_fn->cu_ctx_pop_current = NULL;
-    dl_fn->cu_ctx_destroy = NULL;
-
+    deinit_cuda();
     av_log(avctx, AV_LOG_VERBOSE, "Nvenc unloaded\n");
 }
 
 static av_cold int nvenc_encode_init(AVCodecContext *avctx)
 {
     NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS encode_session_params = { 0 };
+    NV_ENC_REGISTER_RESOURCE registerParams = { 0 };
     NV_ENC_PRESET_CONFIG preset_config = { 0 };
-    CUcontext cu_context_curr;
+    CudaDynLoadFunctions *p_cuda;
     CUresult cu_res;
     GUID encoder_preset = NV_ENC_PRESET_HQ_GUID;
     GUID codec;
@@ -557,6 +427,7 @@ static av_cold int nvenc_encode_init(AVCodecContext *avctx)
     int res = 0;
     int dw, dh;
     int qp_inter_p;
+    extern char color_ptx[];
 
     NvencContext *ctx = avctx->priv_data;
     NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
@@ -574,28 +445,18 @@ static av_cold int nvenc_encode_init(AVCodecContext *avctx)
     encode_session_params.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER;
     encode_session_params.apiVersion = NVENCAPI_VERSION;
 
-    if (ctx->gpu >= dl_fn->nvenc_device_count) {
-        av_log(avctx, AV_LOG_FATAL, "Requested GPU %d, but only %d GPUs are available!\n", ctx->gpu, dl_fn->nvenc_device_count);
-        res = AVERROR(EINVAL);
-        goto error;
-    }
-
-    ctx->cu_context = NULL;
-    cu_res = dl_fn->cu_ctx_create(&ctx->cu_context, 4, dl_fn->nvenc_devices[ctx->gpu]); // CU_CTX_SCHED_BLOCKING_SYNC=4, avoid CPU spins
+    cu_res = get_cuda_context(&ctx->cu_context, ctx->gpu);
+    p_cuda = get_cuda_dl_func();
 
     if (cu_res != CUDA_SUCCESS) {
         av_log(avctx, AV_LOG_FATAL, "Failed creating CUDA context for NVENC: 0x%x\n", (int)cu_res);
         res = AVERROR_EXTERNAL;
         goto error;
     }
+	av_log(avctx, AV_LOG_VERBOSE, "NVENC : Cuda Context created 0x%x\n", (int)ctx->cu_context);
 
-    cu_res = dl_fn->cu_ctx_pop_current(&cu_context_curr);
-
-    if (cu_res != CUDA_SUCCESS) {
-        av_log(avctx, AV_LOG_FATAL, "Failed popping CUDA context: 0x%x\n", (int)cu_res);
-        res = AVERROR_EXTERNAL;
-        goto error;
-    }
+    __cu(p_cuda->cu_module_load_data(&ctx->cu_module, color_ptx));
+    __cu(p_cuda->cu_module_get_function(&ctx->cu_func_interleaveChroma, ctx->cu_module, "interleaveChroma"));
 
     encode_session_params.device = ctx->cu_context;
     encode_session_params.deviceType = NV_ENC_DEVICE_TYPE_CUDA;
@@ -859,6 +720,15 @@ static av_cold int nvenc_encode_init(AVCodecContext *avctx)
         ctx->encode_config.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FRAME;
     }
 
+    if (ctx->aq)
+    {
+        ctx->encode_config.rcParams.enableAQ = 1;
+    }
+    else
+    {
+        ctx->encode_config.rcParams.enableAQ = 0;
+    }
+
     switch (avctx->codec->id) {
     case AV_CODEC_ID_H264:
         ctx->encode_config.encodeCodecConfig.h264Config.h264VUIParameters.colourDescriptionPresentFlag = 1;
@@ -875,9 +745,6 @@ static av_cold int nvenc_encode_init(AVCodecContext *avctx)
 
         if (!ctx->profile) {
             switch (avctx->profile) {
-            case FF_PROFILE_H264_HIGH_444_PREDICTIVE:
-                ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_HIGH_444_GUID;
-                break;
             case FF_PROFILE_H264_BASELINE:
                 ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_BASELINE_GUID;
                 break;
@@ -903,9 +770,6 @@ static av_cold int nvenc_encode_init(AVCodecContext *avctx)
             } else if (!strcmp(ctx->profile, "baseline")) {
                 ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_BASELINE_GUID;
                 avctx->profile = FF_PROFILE_H264_BASELINE;
-            } else if (!strcmp(ctx->profile, "high444p")) {
-                ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_HIGH_444_GUID;
-                avctx->profile = FF_PROFILE_H264_HIGH_444_PREDICTIVE;
             } else {
                 av_log(avctx, AV_LOG_FATAL, "Profile \"%s\" is unknown! Supported profiles: high, main, baseline\n", ctx->profile);
                 res = AVERROR(EINVAL);
@@ -913,13 +777,7 @@ static av_cold int nvenc_encode_init(AVCodecContext *avctx)
             }
         }
 
-        // force setting profile as high444p if input is AV_PIX_FMT_YUV444P
-        if (avctx->pix_fmt == AV_PIX_FMT_YUV444P) {
-            ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_HIGH_444_GUID;
-            avctx->profile = FF_PROFILE_H264_HIGH_444_PREDICTIVE;
-        }
-
-        ctx->encode_config.encodeCodecConfig.h264Config.chromaFormatIDC = avctx->profile == FF_PROFILE_H264_HIGH_444_PREDICTIVE ? 3 : 1;
+        ctx->encode_config.encodeCodecConfig.h264Config.chromaFormatIDC = 1;
 
         if (ctx->level) {
             res = input_string_to_uint32(avctx, nvenc_h264_level_pairs, ctx->level, &ctx->encode_config.encodeCodecConfig.h264Config.level);
@@ -975,6 +833,8 @@ static av_cold int nvenc_encode_init(AVCodecContext *avctx)
         goto error;
     }
 
+    av_log(avctx, AV_LOG_VERBOSE, "Nvenc initialized successfully\n");
+
     ctx->input_surfaces = av_malloc(ctx->max_surface_count * sizeof(*ctx->input_surfaces));
 
     if (!ctx->input_surfaces) {
@@ -989,28 +849,32 @@ static av_cold int nvenc_encode_init(AVCodecContext *avctx)
         goto error;
     }
 
+    // Allocation for temp surface used for sys mem -> device mem transfer
+    if (avctx->pix_fmt == AV_PIX_FMT_YUV420P)
+    {
+		ctx->transferSurf.width = (avctx->width + 31) & ~31;
+        ctx->transferSurf.height = (avctx->height + 31) & ~31;
+        p_cuda->cu_mem_alloc_pitch(&ctx->transferSurf.dptr,
+                &ctx->transferSurf.pitch,
+                ctx->transferSurf.width,
+                ctx->transferSurf.height/ 2, 16);
+    }
+
     for (surfaceCount = 0; surfaceCount < ctx->max_surface_count; ++surfaceCount) {
-        NV_ENC_CREATE_INPUT_BUFFER allocSurf = { 0 };
         NV_ENC_CREATE_BITSTREAM_BUFFER allocOut = { 0 };
-        allocSurf.version = NV_ENC_CREATE_INPUT_BUFFER_VER;
         allocOut.version = NV_ENC_CREATE_BITSTREAM_BUFFER_VER;
 
-        allocSurf.width = (avctx->width + 31) & ~31;
-        allocSurf.height = (avctx->height + 31) & ~31;
-
-        allocSurf.memoryHeap = NV_ENC_MEMORY_HEAP_SYSMEM_CACHED;
+        ctx->input_surfaces[surfaceCount].width = (avctx->width + 31) & ~31;
+        ctx->input_surfaces[surfaceCount].height = (avctx->height + 31) & ~31;
 
         switch (avctx->pix_fmt) {
         case AV_PIX_FMT_YUV420P:
-            allocSurf.bufferFmt = NV_ENC_BUFFER_FORMAT_YV12_PL;
-            break;
-
         case AV_PIX_FMT_NV12:
-            allocSurf.bufferFmt = NV_ENC_BUFFER_FORMAT_NV12_PL;
-            break;
-
-        case AV_PIX_FMT_YUV444P:
-            allocSurf.bufferFmt = NV_ENC_BUFFER_FORMAT_YUV444_PL;
+            ctx->input_surfaces[surfaceCount].format = NV_ENC_BUFFER_FORMAT_NV12_PL;
+            p_cuda->cu_mem_alloc_pitch(&ctx->input_surfaces[surfaceCount].dptr,
+                    &ctx->input_surfaces[surfaceCount].pitch,
+                    ctx->input_surfaces[surfaceCount].width,
+                    ctx->input_surfaces[surfaceCount].height * 3 / 2, 16);
             break;
 
         default:
@@ -1019,18 +883,21 @@ static av_cold int nvenc_encode_init(AVCodecContext *avctx)
             goto error;
         }
 
-        nv_status = p_nvenc->nvEncCreateInputBuffer(ctx->nvencoder, &allocSurf);
+        registerParams.version = NV_ENC_REGISTER_RESOURCE_VER,
+        registerParams.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR,
+        registerParams.width = ctx->input_surfaces[surfaceCount].width,
+        registerParams.height = ctx->input_surfaces[surfaceCount].height,
+        registerParams.pitch = ctx->input_surfaces[surfaceCount].pitch,
+        registerParams.bufferFormat = ctx->input_surfaces[surfaceCount].format;
+        registerParams.resourceToRegister = (void*)ctx->input_surfaces[surfaceCount].dptr,
+        nv_status = p_nvenc->nvEncRegisterResource(ctx->nvencoder, &registerParams);
         if (nv_status != NV_ENC_SUCCESS) {
-            av_log(avctx, AV_LOG_FATAL, "CreateInputBuffer failed\n");
+            av_log(avctx, AV_LOG_FATAL, "RegisterResource failed\n");
             res = AVERROR_EXTERNAL;
             goto error;
         }
-
+        ctx->input_surfaces[surfaceCount].hRes = registerParams.registeredResource;
         ctx->input_surfaces[surfaceCount].lockCount = 0;
-        ctx->input_surfaces[surfaceCount].input_surface = allocSurf.inputBuffer;
-        ctx->input_surfaces[surfaceCount].format = allocSurf.bufferFmt;
-        ctx->input_surfaces[surfaceCount].width = allocSurf.width;
-        ctx->input_surfaces[surfaceCount].height = allocSurf.height;
 
         /* 1MB is large enough to hold most output frames. NVENC increases this automaticaly if it's not enough. */
         allocOut.size = 1024 * 1024;
@@ -1086,21 +953,22 @@ static av_cold int nvenc_encode_init(AVCodecContext *avctx)
     return 0;
 
 error:
-
     for (i = 0; i < surfaceCount; ++i) {
-        p_nvenc->nvEncDestroyInputBuffer(ctx->nvencoder, ctx->input_surfaces[i].input_surface);
+        p_nvenc->nvEncUnregisterResource(ctx->nvencoder, ctx->input_surfaces[i].hRes);
+        p_cuda->cu_mem_free(ctx->input_surfaces[i].dptr);
+
         if (ctx->output_surfaces[i].output_surface)
             p_nvenc->nvEncDestroyBitstreamBuffer(ctx->nvencoder, ctx->output_surfaces[i].output_surface);
     }
 
+	p_cuda->cu_mem_free(ctx->transferSurf.dptr);
     if (ctx->nvencoder)
         p_nvenc->nvEncDestroyEncoder(ctx->nvencoder);
 
     if (ctx->cu_context)
-        dl_fn->cu_ctx_destroy(ctx->cu_context);
+        release_cuda_context(&ctx->cu_context, ctx->gpu);
 
     nvenc_unload_nvenc(avctx);
-
     ctx->nvencoder = NULL;
     ctx->cu_context = NULL;
 
@@ -1112,6 +980,7 @@ static av_cold int nvenc_encode_close(AVCodecContext *avctx)
     NvencContext *ctx = avctx->priv_data;
     NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
     NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
+    CudaDynLoadFunctions *p_cuda = get_cuda_dl_func();
     int i;
 
     av_freep(&ctx->timestamp_list.data);
@@ -1119,16 +988,19 @@ static av_cold int nvenc_encode_close(AVCodecContext *avctx)
     av_freep(&ctx->output_surface_queue.data);
 
     for (i = 0; i < ctx->max_surface_count; ++i) {
-        p_nvenc->nvEncDestroyInputBuffer(ctx->nvencoder, ctx->input_surfaces[i].input_surface);
+        p_nvenc->nvEncUnregisterResource(ctx->nvencoder, ctx->input_surfaces[i].hRes);
+        p_cuda->cu_mem_free(ctx->input_surfaces[i].dptr);
         p_nvenc->nvEncDestroyBitstreamBuffer(ctx->nvencoder, ctx->output_surfaces[i].output_surface);
     }
     ctx->max_surface_count = 0;
 
-    p_nvenc->nvEncDestroyEncoder(ctx->nvencoder);
+    if (ctx->nvencoder)
+        p_nvenc->nvEncDestroyEncoder(ctx->nvencoder);
+
     ctx->nvencoder = NULL;
 
-    dl_fn->cu_ctx_destroy(ctx->cu_context);
-    ctx->cu_context = NULL;
+    if (ctx->cu_context)
+        release_cuda_context(&ctx->cu_context, ctx->gpu);
 
     nvenc_unload_nvenc(avctx);
 
@@ -1143,6 +1015,7 @@ static int process_output_surface(AVCodecContext *avctx, AVPacket *pkt, NvencOut
 
     uint32_t slice_mode_data;
     uint32_t *slice_offsets;
+    char picType = 'X';
     NV_ENC_LOCK_BITSTREAM lock_params = { 0 };
     NVENCSTATUS nv_status;
     int res = 0;
@@ -1195,12 +1068,15 @@ static int process_output_surface(AVCodecContext *avctx, AVPacket *pkt, NvencOut
 FF_DISABLE_DEPRECATION_WARNINGS
     case NV_ENC_PIC_TYPE_I:
         avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
+        picType = 'I';
         break;
     case NV_ENC_PIC_TYPE_P:
         avctx->coded_frame->pict_type = AV_PICTURE_TYPE_P;
+        picType = 'P';
         break;
     case NV_ENC_PIC_TYPE_B:
         avctx->coded_frame->pict_type = AV_PICTURE_TYPE_B;
+        picType = 'B';
         break;
     case NV_ENC_PIC_TYPE_BI:
         avctx->coded_frame->pict_type = AV_PICTURE_TYPE_BI;
@@ -1214,6 +1090,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
 #endif
     }
 
+    av_log(avctx, AV_LOG_VERBOSE, "FRAME STATISTICS: Frame No. %d  PicType %c Frame AvgQP %d  SATD Cost %d  Size %d bytes\r", lock_params.frameIdx, picType, lock_params.frameAvgQP, lock_params.frameSatd, lock_params.bitstreamSizeInBytes);
+
     pkt->pts = lock_params.outputTimeStamp;
     pkt->dts = timestamp_queue_dequeue(&ctx->timestamp_list);
 
@@ -1241,6 +1119,61 @@ error:
     return res;
 }
 
+
+static int call_interleavechroma_kernel(CudaDynLoadFunctions* dl_func, CUfunction func,
+    CUdeviceptr cb_dptr, CUdeviceptr cr_dptr, CUdeviceptr nv12chroma_dptr, int width, int height, int srcStride, int dstStride)
+{
+    void *args_uchar[] = { &cb_dptr, &cr_dptr, &nv12chroma_dptr, &width, &height, &srcStride, &dstStride};
+    __cu(dl_func->cu_launch_kernel(func, BLOCKSX, 1, 1, THREADSX, 1, 1, 0, NULL, args_uchar, NULL));
+
+    return 0;
+}
+
+static int nvenc_copy_to_inputbuffer(NvencContext *ctx, const AVFrame* frame, NvencInputSurface *inSurf)
+{
+    CudaDynLoadFunctions *p_cuda = get_cuda_dl_func();
+    if (frame->format == AV_PIX_FMT_NV12) {
+
+        // check opaque field, if there's already a deviceptr
+        if (frame->opaque && check_nvinfo(frame->opaque) &&
+            ((ffnvinfo*)(frame->opaque))->dptr[0]) {
+            ffnvinfo* info = (ffnvinfo*)frame->opaque;
+
+            __cu(cuMemCpy2d(NULL, info->dptr[0], info->linesize[0], NULL, inSurf->dptr, inSurf->pitch, frame->width, frame->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+            __cu(cuMemCpy2d(NULL, info->dptr[1], info->linesize[1], NULL, inSurf->dptr + inSurf->pitch*inSurf->height, inSurf->pitch, frame->width, frame->height/2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+        }
+        else
+        {
+            __cu(cuMemCpy2d(frame->data[0], (CUdeviceptr)NULL, frame->linesize[0], NULL, inSurf->dptr, inSurf->pitch, frame->width, frame->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+            __cu(cuMemCpy2d(frame->data[1], (CUdeviceptr)NULL, frame->linesize[1], NULL, inSurf->dptr + inSurf->pitch*inSurf->height, inSurf->pitch, frame->width, frame->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+        }
+    }
+    else if (frame->format == AV_PIX_FMT_YUV420P) {
+        // check opaque field, if there's already a deviceptr
+        if (frame->opaque && check_nvinfo(frame->opaque) &&
+            ((ffnvinfo*)(frame->opaque))->dptr[0]) {
+            ffnvinfo* info = (ffnvinfo*)frame->opaque;
+
+            __cu(cuMemCpy2d(NULL, info->dptr[0], info->linesize[0], NULL, inSurf->dptr, inSurf->pitch, frame->width, frame->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+            call_interleavechroma_kernel(p_cuda, ctx->cu_func_interleaveChroma, info->dptr[1], info->dptr[2], inSurf->dptr + inSurf->pitch*inSurf->height, (frame->width+31) & ~31 , frame->height, (info->linesize[1]<<1), inSurf->pitch);
+        }
+        else
+        {
+            __cu(cuMemCpy2d(frame->data[0], (CUdeviceptr)NULL, frame->linesize[0], NULL, inSurf->dptr, inSurf->pitch, frame->width, frame->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+            __cu(cuMemCpy2d(frame->data[1], (CUdeviceptr)NULL, frame->linesize[1], NULL, ctx->transferSurf.dptr, ctx->transferSurf.pitch / 2, ctx->transferSurf.width / 2, frame->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+            __cu(cuMemCpy2d(frame->data[2], (CUdeviceptr)NULL, frame->linesize[2], NULL, ctx->transferSurf.dptr + ctx->transferSurf.pitch*ctx->transferSurf.height / 4, ctx->transferSurf.pitch / 2, ctx->transferSurf.width / 2, frame->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+
+            call_interleavechroma_kernel(p_cuda, ctx->cu_func_interleaveChroma, ctx->transferSurf.dptr, ctx->transferSurf.dptr + ctx->transferSurf.pitch*ctx->transferSurf.height/4, inSurf->dptr + inSurf->pitch*inSurf->height, (frame->width + 31) & ~31, frame->height, ctx->transferSurf.pitch, inSurf->pitch);
+        }
+    }
+    else {
+        av_log(NULL, AV_LOG_FATAL, "Invalid pixel format!\n");
+        return AVERROR(EINVAL);
+    }
+
+    return 0;
+}
+
 static int nvenc_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     const AVFrame *frame, int *got_packet)
 {
@@ -1256,7 +1189,7 @@ static int nvenc_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     pic_params.version = NV_ENC_PIC_PARAMS_VER;
 
     if (frame) {
-        NV_ENC_LOCK_INPUT_BUFFER lockBufferParams = { 0 };
+        NV_ENC_MAP_INPUT_RESOURCE mapParams = { 0 };
         NvencInputSurface *inSurf = NULL;
 
         for (i = 0; i < ctx->max_surface_count; ++i) {
@@ -1270,69 +1203,27 @@ static int nvenc_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 
         inSurf->lockCount = 1;
 
-        lockBufferParams.version = NV_ENC_LOCK_INPUT_BUFFER_VER;
-        lockBufferParams.inputBuffer = inSurf->input_surface;
-
-        nv_status = p_nvenc->nvEncLockInputBuffer(ctx->nvencoder, &lockBufferParams);
+        mapParams.version = NV_ENC_MAP_INPUT_RESOURCE_VER;
+        mapParams.registeredResource = inSurf->hRes;
+        nv_status = p_nvenc->nvEncMapInputResource(ctx->nvencoder, &mapParams);
         if (nv_status != NV_ENC_SUCCESS) {
-            av_log(avctx, AV_LOG_ERROR, "Failed locking nvenc input buffer\n");
+            av_log(avctx, AV_LOG_ERROR, "Failed mapping nvenc input buffer\n");
             return 0;
         }
 
-        if (avctx->pix_fmt == AV_PIX_FMT_YUV420P) {
-            uint8_t *buf = lockBufferParams.bufferDataPtr;
-
-            av_image_copy_plane(buf, lockBufferParams.pitch,
-                frame->data[0], frame->linesize[0],
-                avctx->width, avctx->height);
-
-            buf += inSurf->height * lockBufferParams.pitch;
-
-            av_image_copy_plane(buf, lockBufferParams.pitch >> 1,
-                frame->data[2], frame->linesize[2],
-                avctx->width >> 1, avctx->height >> 1);
-
-            buf += (inSurf->height * lockBufferParams.pitch) >> 2;
-
-            av_image_copy_plane(buf, lockBufferParams.pitch >> 1,
-                frame->data[1], frame->linesize[1],
-                avctx->width >> 1, avctx->height >> 1);
-        } else if (avctx->pix_fmt == AV_PIX_FMT_NV12) {
-            uint8_t *buf = lockBufferParams.bufferDataPtr;
-
-            av_image_copy_plane(buf, lockBufferParams.pitch,
-                frame->data[0], frame->linesize[0],
-                avctx->width, avctx->height);
-
-            buf += inSurf->height * lockBufferParams.pitch;
-
-            av_image_copy_plane(buf, lockBufferParams.pitch,
-                frame->data[1], frame->linesize[1],
-                avctx->width, avctx->height >> 1);
-        } else if (avctx->pix_fmt == AV_PIX_FMT_YUV444P) {
-            uint8_t *buf = lockBufferParams.bufferDataPtr;
-
-            av_image_copy_plane(buf, lockBufferParams.pitch,
-                frame->data[0], frame->linesize[0],
-                avctx->width, avctx->height);
-
-            buf += inSurf->height * lockBufferParams.pitch;
-
-            av_image_copy_plane(buf, lockBufferParams.pitch,
-                frame->data[1], frame->linesize[1],
-                avctx->width, avctx->height);
-
-            buf += inSurf->height * lockBufferParams.pitch;
+        inSurf->input_surface = mapParams.mappedResource;
+        if (inSurf->format != mapParams.mappedBufferFmt) {
+            av_log(avctx, AV_LOG_ERROR, "Incompatible buffer format!\n");
+            return 0;
+        }
 
-            av_image_copy_plane(buf, lockBufferParams.pitch,
-                frame->data[2], frame->linesize[2],
-                avctx->width, avctx->height);
-        } else {
-            av_log(avctx, AV_LOG_FATAL, "Invalid pixel format!\n");
-            return AVERROR(EINVAL);
+        if (nvenc_copy_to_inputbuffer(ctx, frame, inSurf) != 0) {
+            p_nvenc->nvEncUnmapInputResource(ctx->nvencoder, inSurf->input_surface);
+            av_log(avctx, AV_LOG_ERROR, "Failed to copy data to NVENC input buffer!\n");
+            return 0;
         }
 
-        nv_status = p_nvenc->nvEncUnlockInputBuffer(ctx->nvencoder, inSurf->input_surface);
+        nv_status = p_nvenc->nvEncUnmapInputResource(ctx->nvencoder, inSurf->input_surface);
         if (nv_status != NV_ENC_SUCCESS) {
             av_log(avctx, AV_LOG_FATAL, "Failed unlocking input buffer!\n");
             return AVERROR_EXTERNAL;
@@ -1450,7 +1341,6 @@ static int nvenc_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 static const enum AVPixelFormat pix_fmts_nvenc[] = {
     AV_PIX_FMT_YUV420P,
     AV_PIX_FMT_NV12,
-    AV_PIX_FMT_YUV444P,
     AV_PIX_FMT_NONE
 };
 
@@ -1458,13 +1348,14 @@ static const enum AVPixelFormat pix_fmts_nvenc[] = {
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
     { "preset", "Set the encoding preset (one of slow = hq 2pass, medium = hq, fast = hp, hq, hp, bd, ll, llhq, llhp, default)", OFFSET(preset), AV_OPT_TYPE_STRING, { .str = "hq" }, 0, 0, VE },
-    { "profile", "Set the encoding profile (high, main, baseline or high444p)", OFFSET(profile), AV_OPT_TYPE_STRING, { 0 }, 0, 0, VE },
+    { "profile", "Set the encoding profile (high, main, baseline)", OFFSET(profile), AV_OPT_TYPE_STRING, { 0 }, 0, 0, VE },
     { "level", "Set the encoding level restriction (auto, 1.0, 1.0b, 1.1, 1.2, ..., 4.2, 5.0, 5.1)", OFFSET(level), AV_OPT_TYPE_STRING, { 0 }, 0, 0, VE },
     { "tier", "Set the encoding tier (main or high)", OFFSET(tier), AV_OPT_TYPE_STRING, { 0 }, 0, 0, VE },
     { "cbr", "Use cbr encoding mode", OFFSET(cbr), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
     { "2pass", "Use 2pass encoding mode", OFFSET(twopass), AV_OPT_TYPE_BOOL, { .i64 = -1 }, -1, 1, VE },
     { "gpu", "Selects which NVENC capable GPU to use. First GPU is 0, second is 1, and so on.", OFFSET(gpu), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },
     { "delay", "Delays frame output by the given amount of frames.", OFFSET(buffer_delay), AV_OPT_TYPE_INT, { .i64 = INT_MAX }, 0, INT_MAX, VE },
+    { "enableaq", "set to 1 to enable AQ ", OFFSET(aq), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
     { NULL }
 };
 
diff --git a/libavcodec/nvenc_ptx.c b/libavcodec/nvenc_ptx.c
new file mode 100644
index 0000000..df9b6fb
--- /dev/null
+++ b/libavcodec/nvenc_ptx.c
@@ -0,0 +1,240 @@
+#if _WIN32 || _WIN64 // Windows toolchains: _WIN64 is only defined for 64-bit builds
+#if _WIN64
+#define ENVIRONMENT64
+#else
+#define ENVIRONMENT32
+#endif
+#endif
+// Check GCC-compatible compilers; __LP64__ covers every LP64 target (e.g. aarch64), not only x86_64/ppc64
+#if __GNUC__
+#if __x86_64__ || __ppc64__ || __LP64__
+#define ENVIRONMENT64
+#else
+#define ENVIRONMENT32
+#endif
+#endif
+#ifdef ENVIRONMENT32
+const char color_ptx[] = \
+	"//\n"
+	"// Generated by NVIDIA NVVM Compiler\n"
+	"//\n"
+	"// Compiler Build ID: CL-19830389\n"
+	"// Cuda compilation tools, release 8.0, V8.0.0\n"
+	"// Based on LLVM 3.4svn\n"
+	"//\n"
+	"\n"
+	".version 4.3\n"
+	".target sm_30\n"
+	".address_size 32\n"
+	"\n"
+	"// .globl	interleaveChroma\n"
+	"\n"
+	".visible .entry interleaveChroma(\n"
+	".param .u32 interleaveChroma_param_0,\n"
+	".param .u32 interleaveChroma_param_1,\n"
+	".param .u32 interleaveChroma_param_2,\n"
+	".param .u32 interleaveChroma_param_3,\n"
+	".param .u32 interleaveChroma_param_4,\n"
+	".param .u32 interleaveChroma_param_5,\n"
+	".param .u32 interleaveChroma_param_6\n"
+	")\n"
+	"{\n"
+	".reg .pred 	%p<5>;\n"
+	".reg .b32 	%r<57>;\n"
+	"\n"
+	"\n"
+	"ld.param.u32 	%r15, [interleaveChroma_param_0];\n"
+	"ld.param.u32 	%r16, [interleaveChroma_param_1];\n"
+	"ld.param.u32 	%r17, [interleaveChroma_param_2];\n"
+	"ld.param.u32 	%r18, [interleaveChroma_param_3];\n"
+	"ld.param.u32 	%r21, [interleaveChroma_param_4];\n"
+	"ld.param.u32 	%r19, [interleaveChroma_param_5];\n"
+	"ld.param.u32 	%r20, [interleaveChroma_param_6];\n"
+	"shr.s32 	%r1, %r21, 1;\n"
+	"mov.u32 	%r55, %ctaid.x;\n"
+	"setp.ge.s32	%p1, %r55, %r1;\n"
+	"@%p1 bra 	BB0_6;\n"
+	"\n"
+	"cvta.to.global.u32 	%r3, %r17;\n"
+	"cvta.to.global.u32 	%r4, %r16;\n"
+	"cvta.to.global.u32 	%r5, %r15;\n"
+	"mov.u32 	%r6, %tid.x;\n"
+	"shr.s32 	%r7, %r18, 3;\n"
+	"mov.u32 	%r8, %ntid.x;\n"
+	"\n"
+	"BB0_2:\n"
+	"setp.ge.s32	%p2, %r6, %r7;\n"
+	"@%p2 bra 	BB0_5;\n"
+	"\n"
+	"mul.lo.s32 	%r22, %r55, %r19;\n"
+	"mul.lo.s32 	%r23, %r55, %r20;\n"
+	"shr.s32 	%r10, %r23, 2;\n"
+	"shr.s32 	%r24, %r22, 2;\n"
+	"shr.u32 	%r11, %r24, 1;\n"
+	"mov.u32 	%r56, %r6;\n"
+	"\n"
+	"BB0_4:\n"
+	"mov.u32 	%r12, %r56;\n"
+	"add.s32 	%r25, %r12, %r11;\n"
+	"shl.b32 	%r26, %r25, 2;\n"
+	"add.s32 	%r27, %r5, %r26;\n"
+	"add.s32 	%r28, %r4, %r26;\n"
+	"ld.global.u32 	%r29, [%r28];\n"
+	"and.b32  	%r30, %r29, 65280;\n"
+	"shl.b32 	%r31, %r30, 16;\n"
+	"ld.global.u32 	%r32, [%r27];\n"
+	"shl.b32 	%r33, %r32, 8;\n"
+	"and.b32  	%r34, %r33, 16711680;\n"
+	"shl.b32 	%r35, %r29, 8;\n"
+	"and.b32  	%r36, %r35, 65280;\n"
+	"and.b32  	%r37, %r32, 255;\n"
+	"or.b32  	%r38, %r34, %r37;\n"
+	"or.b32  	%r39, %r38, %r31;\n"
+	"or.b32  	%r40, %r39, %r36;\n"
+	"shl.b32 	%r41, %r12, 1;\n"
+	"add.s32 	%r42, %r41, %r10;\n"
+	"shl.b32 	%r43, %r42, 2;\n"
+	"add.s32 	%r44, %r3, %r43;\n"
+	"st.global.u32 	[%r44], %r40;\n"
+	"and.b32  	%r45, %r29, -16777216;\n"
+	"and.b32  	%r46, %r32, -16777216;\n"
+	"shr.u32 	%r47, %r46, 8;\n"
+	"or.b32  	%r48, %r45, %r47;\n"
+	"and.b32  	%r49, %r29, 16711680;\n"
+	"shr.u32 	%r50, %r49, 8;\n"
+	"bfe.u32 	%r51, %r32, 16, 8;\n"
+	"or.b32  	%r52, %r48, %r51;\n"
+	"or.b32  	%r53, %r52, %r50;\n"
+	"st.global.u32 	[%r44+4], %r53;\n"
+	"add.s32 	%r13, %r8, %r12;\n"
+	"setp.lt.s32	%p3, %r13, %r7;\n"
+	"mov.u32 	%r56, %r13;\n"
+	"@%p3 bra 	BB0_4;\n"
+	"\n"
+	"BB0_5:\n"
+	"mov.u32 	%r54, %nctaid.x;\n"
+	"add.s32 	%r55, %r54, %r55;\n"
+	"setp.lt.s32	%p4, %r55, %r1;\n"
+	"@%p4 bra 	BB0_2;\n"
+	"\n"
+	"BB0_6:\n"
+	"ret;\n"
+	"}\n"
+	"\n"
+	"\n"
+;
+#elif defined ENVIRONMENT64
+const char color_ptx[] = \
+	"//\n"
+	"// Generated by NVIDIA NVVM Compiler\n"
+	"//\n"
+	"// Compiler Build ID: CL-19830389\n"
+	"// Cuda compilation tools, release 8.0, V8.0.0\n"
+	"// Based on LLVM 3.4svn\n"
+	"//\n"
+	"\n"
+	".version 4.3\n"
+	".target sm_30\n"
+	".address_size 64\n"
+	"\n"
+	"// .globl	interleaveChroma\n"
+	"\n"
+	".visible .entry interleaveChroma(\n"
+	".param .u64 interleaveChroma_param_0,\n"
+	".param .u64 interleaveChroma_param_1,\n"
+	".param .u64 interleaveChroma_param_2,\n"
+	".param .u32 interleaveChroma_param_3,\n"
+	".param .u32 interleaveChroma_param_4,\n"
+	".param .u32 interleaveChroma_param_5,\n"
+	".param .u32 interleaveChroma_param_6\n"
+	")\n"
+	"{\n"
+	".reg .pred 	%p<5>;\n"
+	".reg .b32 	%r<47>;\n"
+	".reg .b64 	%rd<14>;\n"
+	"\n"
+	"\n"
+	"ld.param.u64 	%rd4, [interleaveChroma_param_0];\n"
+	"ld.param.u64 	%rd5, [interleaveChroma_param_1];\n"
+	"ld.param.u64 	%rd6, [interleaveChroma_param_2];\n"
+	"ld.param.u32 	%r12, [interleaveChroma_param_3];\n"
+	"ld.param.u32 	%r15, [interleaveChroma_param_4];\n"
+	"ld.param.u32 	%r13, [interleaveChroma_param_5];\n"
+	"ld.param.u32 	%r14, [interleaveChroma_param_6];\n"
+	"shr.s32 	%r1, %r15, 1;\n"
+	"mov.u32 	%r45, %ctaid.x;\n"
+	"setp.ge.s32	%p1, %r45, %r1;\n"
+	"@%p1 bra 	BB0_6;\n"
+	"\n"
+	"cvta.to.global.u64 	%rd1, %rd6;\n"
+	"cvta.to.global.u64 	%rd2, %rd5;\n"
+	"cvta.to.global.u64 	%rd3, %rd4;\n"
+	"mov.u32 	%r3, %tid.x;\n"
+	"shr.s32 	%r4, %r12, 3;\n"
+	"mov.u32 	%r5, %ntid.x;\n"
+	"\n"
+	"BB0_2:\n"
+	"setp.ge.s32	%p2, %r3, %r4;\n"
+	"@%p2 bra 	BB0_5;\n"
+	"\n"
+	"mul.lo.s32 	%r16, %r45, %r13;\n"
+	"mul.lo.s32 	%r17, %r45, %r14;\n"
+	"shr.s32 	%r7, %r17, 2;\n"
+	"shr.s32 	%r18, %r16, 2;\n"
+	"shr.u32 	%r8, %r18, 1;\n"
+	"mov.u32 	%r46, %r3;\n"
+	"\n"
+	"BB0_4:\n"
+	"mov.u32 	%r9, %r46;\n"
+	"add.s32 	%r19, %r9, %r8;\n"
+	"mul.wide.u32 	%rd7, %r19, 4;\n"
+	"add.s64 	%rd8, %rd3, %rd7;\n"
+	"add.s64 	%rd9, %rd2, %rd7;\n"
+	"ld.global.u32 	%r20, [%rd9];\n"
+	"and.b32  	%r21, %r20, 65280;\n"
+	"shl.b32 	%r22, %r21, 16;\n"
+	"ld.global.u32 	%r23, [%rd8];\n"
+	"shl.b32 	%r24, %r23, 8;\n"
+	"and.b32  	%r25, %r24, 16711680;\n"
+	"shl.b32 	%r26, %r20, 8;\n"
+	"and.b32  	%r27, %r26, 65280;\n"
+	"and.b32  	%r28, %r23, 255;\n"
+	"or.b32  	%r29, %r25, %r28;\n"
+	"or.b32  	%r30, %r29, %r22;\n"
+	"or.b32  	%r31, %r30, %r27;\n"
+	"shl.b32 	%r32, %r9, 1;\n"
+	"add.s32 	%r33, %r32, %r7;\n"
+	"mul.wide.u32 	%rd10, %r33, 4;\n"
+	"add.s64 	%rd11, %rd1, %rd10;\n"
+	"st.global.u32 	[%rd11], %r31;\n"
+	"and.b32  	%r34, %r20, -16777216;\n"
+	"and.b32  	%r35, %r23, -16777216;\n"
+	"shr.u32 	%r36, %r35, 8;\n"
+	"or.b32  	%r37, %r34, %r36;\n"
+	"and.b32  	%r38, %r20, 16711680;\n"
+	"shr.u32 	%r39, %r38, 8;\n"
+	"bfe.u32 	%r40, %r23, 16, 8;\n"
+	"or.b32  	%r41, %r37, %r40;\n"
+	"or.b32  	%r42, %r41, %r39;\n"
+	"add.s32 	%r43, %r33, 1;\n"
+	"mul.wide.u32 	%rd12, %r43, 4;\n"
+	"add.s64 	%rd13, %rd1, %rd12;\n"
+	"st.global.u32 	[%rd13], %r42;\n"
+	"add.s32 	%r10, %r5, %r9;\n"
+	"setp.lt.s32	%p3, %r10, %r4;\n"
+	"mov.u32 	%r46, %r10;\n"
+	"@%p3 bra 	BB0_4;\n"
+	"\n"
+	"BB0_5:\n"
+	"mov.u32 	%r44, %nctaid.x;\n"
+	"add.s32 	%r45, %r44, %r45;\n"
+	"setp.lt.s32	%p4, %r45, %r1;\n"
+	"@%p4 bra 	BB0_2;\n"
+	"\n"
+	"BB0_6:\n"
+	"ret;\n"
+	"}\n"
+	"\n"
+	"\n"
+;
+#endif
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 1b23085..3f16fe8 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -22,6 +22,8 @@ OBJS = allfilters.o                                                     \
        opencl_allkernels.o                                              \
        transform.o                                                      \
        video.o                                                          \
+       vf_nvresize.o                                                    \
+       vf_nvresize_ptx.o                                                \
 
 OBJS-$(CONFIG_ACROSSFADE_FILTER)             += af_afade.o
 OBJS-$(CONFIG_ADELAY_FILTER)                 += af_adelay.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index a538b81..593d2fb 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -270,6 +270,7 @@ void avfilter_register_all(void)
     REGISTER_FILTER(ZMQ,            zmq,            vf);
     REGISTER_FILTER(ZOOMPAN,        zoompan,        vf);
     REGISTER_FILTER(ZSCALE,         zscale,         vf);
+    REGISTER_FILTER(NVRESIZE,       nvresize,        vf);
 
     REGISTER_FILTER(ALLRGB,         allrgb,         vsrc);
     REGISTER_FILTER(ALLYUV,         allyuv,         vsrc);
diff --git a/libavfilter/vf_nvresize.c b/libavfilter/vf_nvresize.c
new file mode 100644
index 0000000..c0aaa9f
--- /dev/null
+++ b/libavfilter/vf_nvresize.c
@@ -0,0 +1,669 @@
+/*
+ * Copyright (c) 2011 Roger Pau Monné <roger....@entel.upc.edu>
+ * Copyright (c) 2011 Stefano Sabatini
+ * Copyright (c) 2013 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "libavutil/avassert.h"
+#include "libavutil/avstring.h"
+#include "libavutil/eval.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/parseutils.h"
+#include "libavutil/cudautils.h"
+
+#include "avfilter.h"
+#include "drawutils.h"
+#include "formats.h"
+#include "internal.h"
+#include "video.h"
+
+#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
+#define MAX_OUTPUT 16
+#define BLOCKX 32
+#define BLOCKY 16
+
+typedef struct cu_tex {    /* describes one pitched 2D device surface used by the filter */
+    int w;                 /* surface width; checked against the AVFrame width in copy_from_avframe() */
+    int h;                 /* surface height; also used to locate planes (offset = pitch * h) */
+    size_t pitch;          /* row pitch; for outputs it is filled in by cu_mem_alloc_pitch() in config_output() */
+    CUdeviceptr dptr;      /* device address of the first plane */
+} cu_tex;
+
+typedef struct NVResizeContext {
+    const AVClass *class;
+
+    /**
+    * Number of output pads ("outputs" option). Per-output dimensions live in
+    * outtex[i].w/h, where config_output() treats 0 as "use the input size",
+    * -1 as "keep the input aspect" and -N as "keep the aspect, but make the
+    * result divisible by N".
+    */
+    int nb_outputs;
+
+    char *size_str;                   /* "size"/"s" option; init() splits it on '|' into one WxH per output */
+    int force_original_aspect_ratio;  /* 0 = off, 1 = shrink to fit (FFMIN), other = grow to fit (FFMAX), see config_output() */
+    int readback_FB;                  /* "readback" option (0/1); its consumer is outside this part of the file */
+    int gpu;                          /* "gpu" option; passed to get_cuda_context() in init() */
+
+    int cuda_inited;                  /* NOTE(review): not referenced in the visible code - confirm it is still needed */
+
+    CUcontext   cu_ctx;               /* obtained through get_cuda_context() in init() when still unset */
+    CudaDynLoadFunctions* cu_dl_func; /* function table returned by get_cuda_dl_func() */
+    CUmodule    cu_module;            /* module loaded from the embedded resize_ptx image */
+    CUfunction  cu_func_uchar;        /* kernel "Subsample_Bilinear_uchar" */
+    CUfunction  cu_func_uchar2;       /* kernel "Subsample_Bilinear_uchar2" */
+    CUfunction  cu_func_uchar4;       /* kernel "Subsample_Bilinear_uchar4" */
+    CUtexref    cu_tex_uchar;         /* texture reference "uchar_tex" */
+    CUtexref    cu_tex_uchar2;        /* texture reference "uchar2_tex" */
+    CUtexref    cu_tex_uchar4;        /* texture reference "uchar4_tex" */
+    cu_tex      intex;                /* presumably the uploaded input frame - set up outside this part of the file */
+    cu_tex      outtex[MAX_OUTPUT];   /* one surface per output; w/h parsed in init(), memory allocated in config_output() */
+
+} NVResizeContext;
+
+#define OFFSET(x) offsetof(NVResizeContext, x)
+#define FLAGS AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
+
+static const AVOption nvresize_options[] = { /* entry layout: name, help, offset, type, default, min, max, flags[, unit] */
+    { "outputs",  "set number of outputs",  OFFSET(nb_outputs),  AV_OPT_TYPE_INT, { .i64 = 1 }, 1, MAX_OUTPUT, FLAGS },
+    { "readback", "read result back to FB", OFFSET(readback_FB), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, FLAGS },
+    { "size",     "set video size",         OFFSET(size_str),    AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS }, /* min and max both given so FLAGS lands in .flags */
+    { "s",        "set video size",         OFFSET(size_str),    AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS }, /* short alias of "size" */
+    { "gpu", "Selects which NVENC capable GPU to use. First GPU is 0, second is 1, and so on.", OFFSET(gpu), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, FLAGS },
+    { "force_original_aspect_ratio", "decrease or increase w/h if necessary to keep the original AR", OFFSET(force_original_aspect_ratio), AV_OPT_TYPE_INT, { .i64 = 0}, 0, 2, FLAGS, "force_oar" },
+    { NULL }
+};
+
+AVFILTER_DEFINE_CLASS(nvresize);
+
+static int query_formats(AVFilterContext *ctx) /* registers the pixel formats accepted by nvresize; returns 0 or a negative AVERROR */
+{
+    static const enum AVPixelFormat pix_fmts[] = {
+        AV_PIX_FMT_YUV420P,
+        AV_PIX_FMT_YUV444P,
+        AV_PIX_FMT_NV12,
+        AV_PIX_FMT_ARGB,
+        AV_PIX_FMT_RGBA,
+        AV_PIX_FMT_ABGR,
+        AV_PIX_FMT_BGRA,
+        AV_PIX_FMT_NONE,    /* list terminator */
+    };
+
+    AVFilterFormats *fmts_list = ff_make_format_list((const int*)pix_fmts);
+    if (!fmts_list)
+        return AVERROR(ENOMEM);    /* building the format list failed */
+    return ff_set_common_formats(ctx, fmts_list);    /* hand the list to the filter's links */
+}
+
+static int config_output(AVFilterLink *outlink) /* config_props of every "outN" pad: resolves the final size and allocates the device surface */
+{
+    AVFilterContext *ctx = outlink->src;
+    AVFilterLink *inlink = outlink->src->inputs[0];
+    NVResizeContext *s = ctx->priv;
+
+    int outIdx = atoi(outlink->srcpad->name + 3);  /* pads are named "out%d" in init(); skip the "out" prefix */
+    int64_t w, h;
+    int factor_w, factor_h;
+
+    w = s->outtex[outIdx].w;
+    h = s->outtex[outIdx].h;
+
+    // Check if it is requested that the result has to be divisible by some
+    // factor (w or h = -n with n being the factor).
+    factor_w = 1;
+    factor_h = 1;
+    if (w < -1) {
+        factor_w = -w;
+    }
+    if (h < -1) {
+        factor_h = -h;
+    }
+
+    if (w < 0 && h < 0)  /* both dimensions "automatic": fall back to the input size */
+        s->outtex[outIdx].w = s->outtex[outIdx].h = 0;
+
+    if (!(w = s->outtex[outIdx].w))
+        w = inlink->w;
+    if (!(h = s->outtex[outIdx].h))
+        h = inlink->h;
+
+    // Make sure that the result is divisible by the factor we determined
+    // earlier. If no factor was set, nothing will happen as the default
+    // factor is 1
+    if (w < 0)
+        w = av_rescale(h, inlink->w, inlink->h * factor_w) * factor_w;
+    if (h < 0)
+        h = av_rescale(w, inlink->h, inlink->w * factor_h) * factor_h;
+
+    // Note that force_original_aspect_ratio may overwrite the previous set
+    // dimensions so that it is not divisible by the set factors anymore.
+    if (s->force_original_aspect_ratio) {
+        int tmp_w = av_rescale(h, inlink->w, inlink->h);
+        int tmp_h = av_rescale(w, inlink->h, inlink->w);
+
+        if (s->force_original_aspect_ratio == 1) {
+             w = FFMIN(tmp_w, w);
+             h = FFMIN(tmp_h, h);
+        } else {
+             w = FFMAX(tmp_w, w);
+             h = FFMAX(tmp_h, h);
+        }
+    }
+
+    if (w > INT_MAX || h > INT_MAX ||
+        (h * inlink->w) > INT_MAX  || (w * inlink->h) > INT_MAX) {
+        av_log(ctx, AV_LOG_ERROR, "Rescaled value for width or height is too big.\n");
+        return AVERROR(EINVAL);  /* do not truncate to int and allocate with a bogus size */
+    }
+    s->outtex[outIdx].w = outlink->w = w;
+    s->outtex[outIdx].h = outlink->h = h;
+
+    if (inlink->sample_aspect_ratio.num){
+        outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h * inlink->w, outlink->w * inlink->h}, inlink->sample_aspect_ratio);
+    } else
+        outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
+
+    // create output device memory; the allocation height covers all planes of the format
+    switch(outlink->format) {
+    case AV_PIX_FMT_YUV420P:
+    case AV_PIX_FMT_NV12:
+        __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->outtex[outIdx].dptr,
+                &s->outtex[outIdx].pitch, s->outtex[outIdx].w, s->outtex[outIdx].h*3/2, 16));
+        break;
+
+    case AV_PIX_FMT_YUV444P:
+        __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->outtex[outIdx].dptr,
+                &s->outtex[outIdx].pitch, s->outtex[outIdx].w, s->outtex[outIdx].h*3, 16));
+        break;
+
+    case AV_PIX_FMT_ARGB:
+    case AV_PIX_FMT_RGBA:
+    case AV_PIX_FMT_ABGR:
+    case AV_PIX_FMT_BGRA:
+        __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->outtex[outIdx].dptr,
+                &s->outtex[outIdx].pitch, s->outtex[outIdx].w*4, s->outtex[outIdx].h, 16));
+        break;
+    }
+
+    return 0;
+}
+
+static av_cold int init(AVFilterContext *ctx) /* creates the output pads, parses "size", then loads the CUDA module; returns 0 or a negative AVERROR */
+{
+    extern char resize_ptx[];
+    NVResizeContext *s = ctx->priv;
+    int ret;
+    int i, j;
+    int count = 0;
+    for (i = 0; i < s->nb_outputs; i++) {
+        char name[32];
+        AVFilterPad pad = { 0 };
+
+        snprintf(name, sizeof(name), "out%d", i);  /* config_output() recovers the index from this name */
+        pad.type = ctx->filter->inputs[0].type;
+        pad.name = av_strdup(name);
+        pad.config_props = config_output;
+        if (!pad.name)
+            return AVERROR(ENOMEM);
+
+        ff_insert_outpad(ctx, i, &pad);
+    }
+
+    // parse size parameters here ('|'-separated list, modified in place)
+    if (s->size_str) {
+        char split = '|';
+        char* found = NULL;
+        char* head = s->size_str;
+        while (count < MAX_OUTPUT - 1 && (found = strchr(head, split)) != NULL) { /* never index past outtex[MAX_OUTPUT-1]; surplus entries fail the parse below */
+            *found = 0;
+            if ((ret = av_parse_video_size(&s->outtex[count].w, &s->outtex[count].h, head)) < 0) {
+                av_log(ctx, AV_LOG_ERROR, "Invalid size '%s'\n", head);
+                return ret;
+            }
+            head = found+1;
+            count++;
+        }
+
+        if ((ret = av_parse_video_size(&s->outtex[count].w, &s->outtex[count].h, head)) < 0) {
+            av_log(ctx, AV_LOG_ERROR, "Invalid size '%s'\n", head);
+            return ret;
+        }
+        count++;
+    }
+
+    // sort the output sizes by descending width
+    for (i = 0; i < count; i++) {
+        for (j = i+1; j < count; j++) {
+            int tempH, tempW;
+            if (s->outtex[i].w < s->outtex[j].w) {
+                tempW = s->outtex[i].w;          tempH = s->outtex[i].h;
+                s->outtex[i].w = s->outtex[j].w; s->outtex[i].h = s->outtex[j].h;
+                s->outtex[j].w = tempW;          s->outtex[j].h = tempH;
+            }
+        }
+    }
+
+    if (count < s->nb_outputs) {  /* fewer sizes than outputs: move the sizes to the last pads, leading pads get 0x0 (= input size) */
+        int offset = s->nb_outputs - count;
+        for (i = s->nb_outputs-1; i >= offset; i--) {
+            s->outtex[i].w = s->outtex[i-offset].w;
+            s->outtex[i].h = s->outtex[i-offset].h;
+        }
+        for (i = 0; i < offset; i++) {
+            s->outtex[i].w = s->outtex[i].h = 0;
+        }
+    }
+
+    // init cuda_context
+    if (!s->cu_ctx) {
+        init_cuda();
+        get_cuda_context(&s->cu_ctx, s->gpu);
+    }
+    s->cu_dl_func = get_cuda_dl_func();
+
+    __cu(s->cu_dl_func->cu_module_load_data(&s->cu_module, resize_ptx));
+
+    // load functions
+    __cu(s->cu_dl_func->cu_module_get_function(&s->cu_func_uchar,   s->cu_module, "Subsample_Bilinear_uchar"));
+    __cu(s->cu_dl_func->cu_module_get_function(&s->cu_func_uchar2,  s->cu_module, "Subsample_Bilinear_uchar2"));
+    __cu(s->cu_dl_func->cu_module_get_function(&s->cu_func_uchar4,  s->cu_module, "Subsample_Bilinear_uchar4"));
+    __cu(s->cu_dl_func->cu_module_get_texref(&s->cu_tex_uchar,  s->cu_module, "uchar_tex"));
+    __cu(s->cu_dl_func->cu_module_get_texref(&s->cu_tex_uchar2, s->cu_module, "uchar2_tex"));
+    __cu(s->cu_dl_func->cu_module_get_texref(&s->cu_tex_uchar4, s->cu_module, "uchar4_tex"));
+
+    __cu(s->cu_dl_func->cu_texref_set_flags(s->cu_tex_uchar,  CU_TRSF_READ_AS_INTEGER));
+    __cu(s->cu_dl_func->cu_texref_set_flags(s->cu_tex_uchar2, CU_TRSF_READ_AS_INTEGER));
+    __cu(s->cu_dl_func->cu_texref_set_flags(s->cu_tex_uchar4, CU_TRSF_READ_AS_INTEGER));
+    __cu(s->cu_dl_func->cu_texref_set_filtermode(s->cu_tex_uchar,  CU_TR_FILTER_MODE_LINEAR));
+    __cu(s->cu_dl_func->cu_texref_set_filtermode(s->cu_tex_uchar2, CU_TR_FILTER_MODE_LINEAR));
+    __cu(s->cu_dl_func->cu_texref_set_filtermode(s->cu_tex_uchar4, CU_TR_FILTER_MODE_LINEAR));
+
+    return 0;
+}
+
+static int copy_from_avframe(NVResizeContext *s, AVFrame* src, cu_tex* dst) /* uploads the planes of host frame src into device surface dst; 0 on success, -1 on unsupported format */
+{
+    av_assert0(src->width == dst->w && src->height == dst->h); // the surface must match the frame size exactly
+
+    switch (src->format) {
+    case AV_PIX_FMT_YUV420P:
+        // Y plane, host -> device (cuMemCpy2d is a project helper; argument order presumably src host/dev/pitch, dst host/dev/pitch, width, height, src type, dst type - confirm in cudautils.h)
+        __cu(cuMemCpy2d(src->data[0], (CUdeviceptr)NULL, src->linesize[0], NULL, dst->dptr, dst->pitch, src->width, src->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+        // U plane: stored right after Y (offset pitch*h), half width/height, half pitch
+        __cu(cuMemCpy2d(src->data[1], (CUdeviceptr)NULL, src->linesize[1], NULL, dst->dptr + dst->pitch*dst->h, dst->pitch / 2, src->width / 2, src->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+        // V plane: stored after U (offset pitch*h*5/4), half width/height, half pitch
+        __cu(cuMemCpy2d(src->data[2], (CUdeviceptr)NULL, src->linesize[2], NULL, dst->dptr + dst->pitch*dst->h * 5 / 4, dst->pitch / 2, src->width / 2, src->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+
+        break;
+
+    case AV_PIX_FMT_YUV444P:
+        // Y plane at offset 0
+        __cu(cuMemCpy2d(src->data[0], (CUdeviceptr)NULL, src->linesize[0], NULL, dst->dptr, dst->pitch, src->width, src->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+        // U plane at offset pitch*h, full resolution
+        __cu(cuMemCpy2d(src->data[1], (CUdeviceptr)NULL, src->linesize[1], NULL, dst->dptr + dst->pitch*dst->h, dst->pitch, src->width, src->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+        // V plane at offset pitch*h*2, full resolution
+        __cu(cuMemCpy2d(src->data[2], (CUdeviceptr)NULL, src->linesize[2], NULL, dst->dptr + dst->pitch*dst->h * 2, dst->pitch, src->width, src->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+        break;
+
+    case AV_PIX_FMT_NV12:
+        // Y plane at offset 0
+        __cu(cuMemCpy2d(src->data[0], (CUdeviceptr)NULL, src->linesize[0], NULL, dst->dptr, dst->pitch, src->width, src->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+        // interleaved UV plane at offset pitch*h: full width in bytes, half height
+        __cu(cuMemCpy2d(src->data[1], (CUdeviceptr)NULL, src->linesize[1], NULL, dst->dptr + dst->pitch*dst->h, dst->pitch, src->width, src->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+        break;
+
+    case AV_PIX_FMT_ARGB:
+    case AV_PIX_FMT_RGBA:
+    case AV_PIX_FMT_ABGR:
+    case AV_PIX_FMT_BGRA:
+        // single packed plane, 4 bytes per pixel, hence width * 4
+        __cu(cuMemCpy2d(src->data[0], (CUdeviceptr)NULL, src->linesize[0], NULL, dst->dptr, dst->pitch, src->width * 4, src->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+
+        break;
+
+    default:
+        av_log(NULL, AV_LOG_FATAL, "Unsupported input format: %s!\n", av_get_pix_fmt_name(src->format));
+        return -1;
+    }
+    return 0;
+}
+
+static int copy_to_avframe(NVResizeContext* s, cu_tex* src, AVFrame* dst) /* downloads device surface src into the planes of host frame dst; 0 on success, -1 on unsupported format */
+{
+    //av_assert0(src->w == dst->width && src->h == dst->height);
+
+    switch (dst->format) {
+    case AV_PIX_FMT_YUV420P:
+        // Y plane, device -> host (cuMemCpy2d is a project helper; argument order presumably src host/dev/pitch, dst host/dev/pitch, width, height, src type, dst type - confirm in cudautils.h)
+        __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, dst->data[0], (CUdeviceptr)NULL, dst->linesize[0], dst->width, dst->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+        // U plane: read from offset pitch*h with half pitch, half width/height
+        __cu(cuMemCpy2d(NULL, src->dptr + src->pitch*src->h, src->pitch / 2, dst->data[1], (CUdeviceptr)NULL, dst->linesize[1], dst->width / 2, dst->height / 2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+        // V plane: read from offset pitch*h*5/4 with half pitch, half width/height
+        __cu(cuMemCpy2d(NULL, src->dptr + src->pitch*src->h * 5 / 4, src->pitch / 2, dst->data[2], (CUdeviceptr)NULL, dst->linesize[2], dst->width / 2, dst->height / 2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+        break;
+
+    case AV_PIX_FMT_YUV444P:
+        // Y plane at offset 0
+        __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, dst->data[0], (CUdeviceptr)NULL, dst->linesize[0], dst->width, dst->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+        // U plane at offset pitch*h, full resolution
+        __cu(cuMemCpy2d(NULL, src->dptr + src->pitch*src->h, src->pitch, dst->data[1], (CUdeviceptr)NULL, dst->linesize[1], dst->width, dst->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+        // V plane at offset pitch*h*2, full resolution
+        __cu(cuMemCpy2d(NULL, src->dptr + src->pitch*src->h * 2, src->pitch, dst->data[2], (CUdeviceptr)NULL, dst->linesize[2], dst->width, dst->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+
+        break;
+
+    case AV_PIX_FMT_NV12:
+        // Y plane at offset 0
+        __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, dst->data[0], (CUdeviceptr)NULL, dst->linesize[0], dst->width, dst->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+        // interleaved UV plane at offset pitch*h: full width in bytes, half height
+        __cu(cuMemCpy2d(NULL, src->dptr + src->pitch*src->h, src->pitch, dst->data[1], (CUdeviceptr)NULL, dst->linesize[1], dst->width, dst->height / 2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+        break;
+
+    case AV_PIX_FMT_ARGB:
+    case AV_PIX_FMT_RGBA:
+    case AV_PIX_FMT_ABGR:
+    case AV_PIX_FMT_BGRA:
+        // single packed plane, 4 bytes per pixel, hence width * 4
+        __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, dst->data[0], (CUdeviceptr)NULL, dst->linesize[0], dst->width * 4, dst->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+
+        break;
+
+    default:
+        av_log(NULL, AV_LOG_FATAL, "Unsupported output format: %s!\n", av_get_pix_fmt_name(dst->format));
+        return -1;
+    }
+    return 0;
+}
+
+static int call_resize_kernel(CudaDynLoadFunctions* dl_func, CUfunction func, CUtexref tex, int channels, /* binds src to tex, then launches func over the dst rectangle */
+                             CUdeviceptr src_dptr, int src_width, int src_height, int src_pitch,
+                             CUdeviceptr dst_dptr, int dst_width, int dst_height, int dst_pitch)
+{
+    void *args_uchar[] = { &dst_dptr, &dst_width, &dst_height, &dst_pitch, &src_width, &src_height }; /* kernel parameters, passed by address */
+    CUDA_ARRAY_DESCRIPTOR desc;
+    desc.Width  = src_width;
+    desc.Height = src_height;
+    desc.NumChannels = channels; /* 1, 2 or 4 at the call sites in do_cuda_resize() */
+    desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
+    __cu(dl_func->cu_texref_set_address_2D(tex, &desc, src_dptr, src_pitch)); /* the source is read through the texture reference, not passed as a kernel argument */
+
+    __cu(dl_func->cu_launch_kernel(func, DIV_UP(dst_width, BLOCKX), DIV_UP(dst_height, BLOCKY), 1, /* grid rounded up so partial tiles are covered */
+        BLOCKX, BLOCKY, 1, 0, NULL, args_uchar, NULL)); /* presumably mirrors cuLaunchKernel: block dims, shared bytes = 0, stream = NULL - confirm in cudautils.h */
+
+    return 0; /* NOTE(review): error handling depends entirely on what __cu() does on failure - confirm */
+}
+
+/* Scale src into dst on the GPU; equal-sized surfaces are copied instead.
+ * Returns 0 on success, the first non-zero call_resize_kernel() status if a
+ * plane fails (the remaining planes are then skipped), or -1 for an unhandled
+ * pixel format. */
+static int do_cuda_resize(NVResizeContext *s, cu_tex* src, cu_tex* dst, int format)
+{
+    int status = 0;
+
+    switch (format) {
+    case AV_PIX_FMT_YUV420P:
+        if (src->w == dst->w && src->h == dst->h && src->pitch == dst->pitch) {
+            __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, NULL, dst->dptr, dst->pitch, src->pitch, src->h*3/2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+        } else {
+            /* Y plane first, then the two half-size chroma planes at pitch/2. */
+            status = call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+                    src->dptr, src->w, src->h, src->pitch,
+                    dst->dptr, dst->w, dst->h, dst->pitch);
+            if (!status)
+                status = call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+                        src->dptr+src->pitch*src->h, src->w/2, src->h/2, src->pitch/2,
+                        dst->dptr+dst->pitch*dst->h, dst->w/2, dst->h/2, dst->pitch/2);
+            if (!status)
+                status = call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+                        src->dptr+src->pitch*src->h*5/4, src->w/2, src->h/2, src->pitch/2,
+                        dst->dptr+dst->pitch*dst->h*5/4, dst->w/2, dst->h/2, dst->pitch/2);
+        }
+        break;
+
+    case AV_PIX_FMT_YUV444P:
+        if (src->w == dst->w && src->h == dst->h) {
+            __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, NULL, dst->dptr, dst->pitch, src->w, src->h*3, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+        } else {
+            /* Three full-resolution planes stored one after another. */
+            status = call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+                    src->dptr, src->w, src->h, src->pitch,
+                    dst->dptr, dst->w, dst->h, dst->pitch);
+            if (!status)
+                status = call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+                        src->dptr+src->pitch*src->h, src->w, src->h, src->pitch,
+                        dst->dptr+dst->pitch*dst->h, dst->w, dst->h, dst->pitch);
+            if (!status)
+                status = call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+                        src->dptr+src->pitch*src->h*2, src->w, src->h, src->pitch,
+                        dst->dptr+dst->pitch*dst->h*2, dst->w, dst->h, dst->pitch);
+        }
+        break;
+
+    case AV_PIX_FMT_NV12:
+        if (src->w == dst->w && src->h == dst->h) {
+            __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, NULL, dst->dptr, dst->pitch, src->w, src->h*3/2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+        } else {
+            /* Y uses the 1-channel kernel, the second plane the 2-channel one. */
+            status = call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+                    src->dptr, src->w, src->h, src->pitch,
+                    dst->dptr, dst->w, dst->h, dst->pitch);
+            if (!status)
+                status = call_resize_kernel(s->cu_dl_func, s->cu_func_uchar2, s->cu_tex_uchar2, 2,
+                        src->dptr+src->pitch*src->h, src->w/2, src->h/2, src->pitch,
+                        dst->dptr+dst->pitch*dst->h, dst->w/2, dst->h/2, dst->pitch/2);
+        }
+        break;
+
+    case AV_PIX_FMT_ARGB:
+    case AV_PIX_FMT_RGBA:
+    case AV_PIX_FMT_ABGR:
+    case AV_PIX_FMT_BGRA:
+        if (src->w == dst->w && src->h == dst->h) {
+            __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, NULL, dst->dptr, dst->pitch, src->w*4, src->h, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+        } else {
+            /* Single packed plane handled by the 4-channel kernel. */
+            status = call_resize_kernel(s->cu_dl_func, s->cu_func_uchar4, s->cu_tex_uchar4, 4,
+                    src->dptr, src->w, src->h, src->pitch,
+                    dst->dptr, dst->w, dst->h, dst->pitch/4);
+        }
+        break;
+
+    default:
+        av_log(NULL, AV_LOG_FATAL, "Unsupported input format: %s!\n", av_get_pix_fmt_name(format));
+        return -1;
+    }
+
+    return status;
+}
+
+/* Pick the surface `target` is scaled from: the input, the source used so
+ * far, or an earlier output that is less than 4x wider than the target. */
+static cu_tex* find_resize_src(NVResizeContext* s, cu_tex* source, cu_tex* target)
+{
+    cu_tex *fallback;
+    int idx, k;
+
+    /* No source chosen yet: start from the input surface. */
+    if (!source)
+        return &s->intex;
+    /* Keep the current source while it is less than 4x wider than target. */
+    if (source->w < target->w * 4)
+        return source;
+
+    /* Otherwise scan the earlier outputs, nearest first, for a closer one. */
+    idx = target - s->outtex;
+    for (k = idx; k-- > 0; )
+        if (s->outtex[k].w < target->w * 4)
+            return &s->outtex[k];
+
+    fallback = idx ? &s->outtex[idx - 1] : source;
+    av_log(NULL, AV_LOG_WARNING, "Output resolution %dx%d differs too much from the previous level %dx%d, "
+            "might cause artificial\n", target->w, target->h, fallback->w, fallback->h);
+    return fallback;
+}
+
+/* Upload (or adopt) the input frame on the GPU and send one resized frame to
+ * every open output. Owns `in`; returns 0 on success or a negative error. */
+static int filter_frame(AVFilterLink *inlink, AVFrame *in)
+{
+    AVFilterContext *ctx  = inlink->dst;
+    NVResizeContext *s = ctx->priv;
+    cu_tex* resize_src = NULL;
+    ffnvinfo* info;
+    int i, ret = 0;
+
+    if (in->opaque && check_nvinfo(in->opaque) && ((ffnvinfo*)(in->opaque))->dptr[0]) {
+        /* GPU-resident frame: borrow its device pointer instead of uploading. */
+        info = (ffnvinfo*)in->opaque;
+        s->intex.dptr = info->dptr[0];
+        s->intex.pitch = info->linesize[0];
+        s->intex.w = in->width;
+        s->intex.h = in->height;
+    } else {
+        /* Frame size changed: drop the staging surface so it is reallocated. */
+        if (in->width != s->intex.w || in->height != s->intex.h) {
+            if (s->intex.dptr) {
+                __cu(s->cu_dl_func->cu_mem_free(s->intex.dptr));
+            }
+            s->intex.w = in->width;
+            s->intex.h = in->height;
+            s->intex.dptr = (CUdeviceptr)NULL;
+        }
+        if (!s->intex.dptr) {
+            switch (in->format) {
+            case AV_PIX_FMT_YUV420P:
+            case AV_PIX_FMT_NV12:
+                __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->intex.dptr, &s->intex.pitch, s->intex.w, s->intex.h*3/2, 16));
+                break;
+            case AV_PIX_FMT_YUV444P:
+                __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->intex.dptr, &s->intex.pitch, s->intex.w, s->intex.h*3, 16));
+                break;
+            case AV_PIX_FMT_ARGB:
+            case AV_PIX_FMT_RGBA:
+            case AV_PIX_FMT_ABGR:
+            case AV_PIX_FMT_BGRA:
+                __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->intex.dptr, &s->intex.pitch, s->intex.w*4, s->intex.h, 16));
+                break;
+            default:
+                av_log(NULL, AV_LOG_FATAL, "Unsupported input format: %s!\n", av_get_pix_fmt_name(in->format));
+                av_frame_free(&in); /* the frame is ours to release, even on error */
+                return -1;
+            }
+        }
+        copy_from_avframe(s, in, &s->intex);
+    }
+
+    for (i = 0; i < ctx->nb_outputs; i++) {
+        AVFrame *out;
+        if (ctx->outputs[i]->closed)
+            continue;
+
+        out = ff_get_video_buffer(ctx->outputs[i], ctx->outputs[i]->w, ctx->outputs[i]->h);
+        if (!out) {
+            ret = AVERROR(ENOMEM);
+            break;
+        }
+        av_frame_copy_props(out, in);
+
+        /* Resize on the GPU, then describe the result planes in an ffnvinfo. */
+        resize_src = find_resize_src(s, resize_src, &s->outtex[i]);
+        if (do_cuda_resize(s, resize_src, &s->outtex[i], in->format) || !(info = init_nvinfo())) {
+            av_frame_free(&out);
+            ret = AVERROR_EXTERNAL;
+            break;
+        }
+        switch (out->format) {
+        case AV_PIX_FMT_YUV444P:
+            info->dptr[0] = s->outtex[i].dptr;
+            info->dptr[1] = s->outtex[i].dptr + s->outtex[i].pitch*s->outtex[i].h;
+            info->dptr[2] = s->outtex[i].dptr + s->outtex[i].pitch*s->outtex[i].h*2;
+            info->linesize[0] = info->linesize[1] = info->linesize[2] = s->outtex[i].pitch;
+            break;
+        case AV_PIX_FMT_YUV420P:
+            info->dptr[0] = s->outtex[i].dptr;
+            info->dptr[1] = s->outtex[i].dptr + s->outtex[i].pitch*s->outtex[i].h;
+            info->dptr[2] = s->outtex[i].dptr + s->outtex[i].pitch*s->outtex[i].h*5/4;
+            info->linesize[0] = s->outtex[i].pitch;
+            info->linesize[1] = info->linesize[2] = s->outtex[i].pitch/2;
+            break;
+        case AV_PIX_FMT_NV12:
+            info->dptr[0] = s->outtex[i].dptr;
+            info->dptr[1] = s->outtex[i].dptr + s->outtex[i].pitch*s->outtex[i].h;
+            info->linesize[0] = info->linesize[1] = s->outtex[i].pitch;
+            break;
+        case AV_PIX_FMT_ARGB:
+        case AV_PIX_FMT_RGBA:
+        case AV_PIX_FMT_ABGR:
+        case AV_PIX_FMT_BGRA:
+            info->dptr[0] = s->outtex[i].dptr;
+            info->linesize[0] = s->outtex[i].pitch;
+            break;
+        }
+
+        out->opaque = (void*)info;
+        if (s->readback_FB)
+            copy_to_avframe(s, &s->outtex[i], out);
+        /* Stop at the first output that rejects its frame and report it. */
+        if ((ret = ff_filter_frame(ctx->outputs[i], out)) < 0)
+            break;
+    }
+
+    av_frame_free(&in);
+    return ret < 0 ? ret : 0;
+}
+
+/* Teardown: free pad names and output surfaces, then release the CUDA context. */
+static av_cold void uninit(AVFilterContext *ctx)
+{
+    NVResizeContext *s = ctx->priv;
+
+    for (int i = 0; i < s->nb_outputs; i++) {
+        av_freep(&ctx->output_pads[i].name);
+        if (s->outtex[i].dptr)
+            s->cu_dl_func->cu_mem_free(s->outtex[i].dptr);
+    }
+    if (s->cu_ctx)
+        release_cuda_context(&s->cu_ctx, s->gpu);
+    av_log(ctx, AV_LOG_INFO, "nvresize::uninit\n");
+}
+
+static const AVFilterPad nvresize_inputs[] = {
+    {   /* sole input pad; each frame is handed to filter_frame() */
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .name         = "default",
+        .filter_frame = filter_frame,
+    },
+    { NULL }    /* list terminator */
+};
+
+AVFilter ff_vf_nvresize = {
+    .name          = "nvresize",
+    .description   = NULL_IF_CONFIG_SMALL("GPU accelerated video resizer."),
+    .priv_size     = sizeof(NVResizeContext),
+    .priv_class    = &nvresize_class,
+    .init          = init,
+    .uninit        = uninit,
+    .query_formats = query_formats,
+    .inputs        = nvresize_inputs,
+    .outputs       = NULL,  /* no static pads, see AVFILTER_FLAG_DYNAMIC_OUTPUTS */
+    .flags         = AVFILTER_FLAG_DYNAMIC_OUTPUTS,
+};
diff --git a/libavfilter/vf_nvresize_ptx.c b/libavfilter/vf_nvresize_ptx.c
new file mode 100644
index 0000000..6aef2e3
--- /dev/null
+++ b/libavfilter/vf_nvresize_ptx.c
@@ -0,0 +1,659 @@
+#if _WIN32 || _WIN64
+#if _WIN64
+#define ENVIRONMENT64
+#else
+#define ENVIRONMENT32
+#endif
+#endif
+// Check GCC
+#if __GNUC__
+#if __x86_64__ || __ppc64__
+#define ENVIRONMENT64
+#else
+#define ENVIRONMENT32
+#endif
+#endif
+#ifdef ENVIRONMENT32
+const char resize_ptx[] = \
+	"//\n"
+	"// Generated by NVIDIA NVVM Compiler\n"
+	"//\n"
+	"// Compiler Build ID: CL-19324607\n"
+	"// Cuda compilation tools, release 7.0, V7.0.27\n"
+	"// Based on LLVM 3.4svn\n"
+	"//\n"
+	"\n"
+	".version 4.2\n"
+	".target sm_30\n"
+	".address_size 32\n"
+	"\n"
+	"// .globl	Subsample_Bilinear_uchar\n"
+	".global .texref uchar_tex;\n"
+	".global .texref uchar2_tex;\n"
+	".global .texref uchar4_tex;\n"
+	"\n"
+	".visible .entry Subsample_Bilinear_uchar(\n"
+	".param .u32 Subsample_Bilinear_uchar_param_0,\n"
+	".param .u32 Subsample_Bilinear_uchar_param_1,\n"
+	".param .u32 Subsample_Bilinear_uchar_param_2,\n"
+	".param .u32 Subsample_Bilinear_uchar_param_3,\n"
+	".param .u32 Subsample_Bilinear_uchar_param_4,\n"
+	".param .u32 Subsample_Bilinear_uchar_param_5\n"
+	")\n"
+	"{\n"
+	".reg .pred 	%p<4>;\n"
+	".reg .f32 	%f<27>;\n"
+	".reg .s32 	%r<43>;\n"
+	".reg .s64 	%rd<2>;\n"
+	"\n"
+	"\n"
+	"ld.param.u32 	%r3, [Subsample_Bilinear_uchar_param_0];\n"
+	"ld.param.u32 	%r4, [Subsample_Bilinear_uchar_param_1];\n"
+	"ld.param.u32 	%r5, [Subsample_Bilinear_uchar_param_2];\n"
+	"ld.param.u32 	%r6, [Subsample_Bilinear_uchar_param_3];\n"
+	"ld.param.u32 	%r7, [Subsample_Bilinear_uchar_param_4];\n"
+	"ld.param.u32 	%r8, [Subsample_Bilinear_uchar_param_5];\n"
+	"mov.u32 	%r9, %ctaid.x;\n"
+	"mov.u32 	%r10, %ntid.x;\n"
+	"mov.u32 	%r11, %tid.x;\n"
+	"mad.lo.s32 	%r1, %r10, %r9, %r11;\n"
+	"mov.u32 	%r12, %ntid.y;\n"
+	"mov.u32 	%r13, %ctaid.y;\n"
+	"mov.u32 	%r14, %tid.y;\n"
+	"mad.lo.s32 	%r2, %r12, %r13, %r14;\n"
+	"setp.lt.s32	%p1, %r2, %r5;\n"
+	"setp.lt.s32	%p2, %r1, %r4;\n"
+	"and.pred  	%p3, %p1, %p2;\n"
+	"@!%p3 bra 	BB0_2;\n"
+	"bra.uni 	BB0_1;\n"
+	"\n"
+	"BB0_1:\n"
+	"cvta.to.global.u32 	%r15, %r3;\n"
+	"cvt.rn.f32.s32	%f1, %r4;\n"
+	"cvt.rn.f32.s32	%f2, %r7;\n"
+	"div.rn.f32 	%f3, %f2, %f1;\n"
+	"cvt.rn.f32.s32	%f4, %r5;\n"
+	"cvt.rn.f32.s32	%f5, %r8;\n"
+	"div.rn.f32 	%f6, %f5, %f4;\n"
+	"cvt.rn.f32.s32	%f7, %r1;\n"
+	"add.f32 	%f8, %f7, 0f3F000000;\n"
+	"mul.f32 	%f9, %f8, %f3;\n"
+	"cvt.rn.f32.s32	%f10, %r2;\n"
+	"add.f32 	%f11, %f10, 0f3F000000;\n"
+	"mul.f32 	%f12, %f11, %f6;\n"
+	"add.f32 	%f13, %f3, 0fBF800000;\n"
+	"mul.f32 	%f14, %f13, 0f3F000000;\n"
+	"cvt.sat.f32.f32	%f15, %f14;\n"
+	"add.f32 	%f16, %f6, 0fBF800000;\n"
+	"mul.f32 	%f17, %f16, 0f3F000000;\n"
+	"cvt.sat.f32.f32	%f18, %f17;\n"
+	"add.f32 	%f19, %f15, 0f3F000000;\n"
+	"div.rn.f32 	%f20, %f15, %f19;\n"
+	"add.f32 	%f21, %f18, 0f3F000000;\n"
+	"div.rn.f32 	%f22, %f18, %f21;\n"
+	"sub.f32 	%f23, %f9, %f20;\n"
+	"sub.f32 	%f24, %f12, %f22;\n"
+	"tex.2d.v4.u32.f32	{%r16, %r17, %r18, %r19}, [uchar_tex, {%f23, %f24}];\n"
+	"and.b32  	%r20, %r16, 255;\n"
+	"add.f32 	%f25, %f9, %f20;\n"
+	"tex.2d.v4.u32.f32	{%r21, %r22, %r23, %r24}, [uchar_tex, {%f25, %f24}];\n"
+	"and.b32  	%r25, %r21, 255;\n"
+	"add.f32 	%f26, %f12, %f22;\n"
+	"tex.2d.v4.u32.f32	{%r26, %r27, %r28, %r29}, [uchar_tex, {%f23, %f26}];\n"
+	"and.b32  	%r30, %r26, 255;\n"
+	"tex.2d.v4.u32.f32	{%r31, %r32, %r33, %r34}, [uchar_tex, {%f25, %f26}];\n"
+	"and.b32  	%r35, %r31, 255;\n"
+	"add.s32 	%r36, %r20, %r25;\n"
+	"add.s32 	%r37, %r36, %r30;\n"
+	"add.s32 	%r38, %r37, %r35;\n"
+	"add.s32 	%r39, %r38, 2;\n"
+	"shr.u32 	%r40, %r39, 2;\n"
+	"mad.lo.s32 	%r41, %r2, %r6, %r1;\n"
+	"add.s32 	%r42, %r15, %r41;\n"
+	"st.global.u8 	[%r42], %r40;\n"
+	"\n"
+	"BB0_2:\n"
+	"ret;\n"
+	"}\n"
+	"\n"
+	"// .globl	Subsample_Bilinear_uchar2\n"
+	".visible .entry Subsample_Bilinear_uchar2(\n"
+	".param .u32 Subsample_Bilinear_uchar2_param_0,\n"
+	".param .u32 Subsample_Bilinear_uchar2_param_1,\n"
+	".param .u32 Subsample_Bilinear_uchar2_param_2,\n"
+	".param .u32 Subsample_Bilinear_uchar2_param_3,\n"
+	".param .u32 Subsample_Bilinear_uchar2_param_4,\n"
+	".param .u32 Subsample_Bilinear_uchar2_param_5\n"
+	")\n"
+	"{\n"
+	".reg .pred 	%p<4>;\n"
+	".reg .s16 	%rs<3>;\n"
+	".reg .f32 	%f<27>;\n"
+	".reg .s32 	%r<53>;\n"
+	".reg .s64 	%rd<2>;\n"
+	"\n"
+	"\n"
+	"ld.param.u32 	%r3, [Subsample_Bilinear_uchar2_param_0];\n"
+	"ld.param.u32 	%r4, [Subsample_Bilinear_uchar2_param_1];\n"
+	"ld.param.u32 	%r5, [Subsample_Bilinear_uchar2_param_2];\n"
+	"ld.param.u32 	%r6, [Subsample_Bilinear_uchar2_param_3];\n"
+	"ld.param.u32 	%r7, [Subsample_Bilinear_uchar2_param_4];\n"
+	"ld.param.u32 	%r8, [Subsample_Bilinear_uchar2_param_5];\n"
+	"mov.u32 	%r9, %ctaid.x;\n"
+	"mov.u32 	%r10, %ntid.x;\n"
+	"mov.u32 	%r11, %tid.x;\n"
+	"mad.lo.s32 	%r1, %r10, %r9, %r11;\n"
+	"mov.u32 	%r12, %ntid.y;\n"
+	"mov.u32 	%r13, %ctaid.y;\n"
+	"mov.u32 	%r14, %tid.y;\n"
+	"mad.lo.s32 	%r2, %r12, %r13, %r14;\n"
+	"setp.lt.s32	%p1, %r2, %r5;\n"
+	"setp.lt.s32	%p2, %r1, %r4;\n"
+	"and.pred  	%p3, %p1, %p2;\n"
+	"@!%p3 bra 	BB1_2;\n"
+	"bra.uni 	BB1_1;\n"
+	"\n"
+	"BB1_1:\n"
+	"cvta.to.global.u32 	%r15, %r3;\n"
+	"cvt.rn.f32.s32	%f1, %r4;\n"
+	"cvt.rn.f32.s32	%f2, %r7;\n"
+	"div.rn.f32 	%f3, %f2, %f1;\n"
+	"cvt.rn.f32.s32	%f4, %r5;\n"
+	"cvt.rn.f32.s32	%f5, %r8;\n"
+	"div.rn.f32 	%f6, %f5, %f4;\n"
+	"cvt.rn.f32.s32	%f7, %r1;\n"
+	"add.f32 	%f8, %f7, 0f3F000000;\n"
+	"mul.f32 	%f9, %f8, %f3;\n"
+	"cvt.rn.f32.s32	%f10, %r2;\n"
+	"add.f32 	%f11, %f10, 0f3F000000;\n"
+	"mul.f32 	%f12, %f11, %f6;\n"
+	"add.f32 	%f13, %f3, 0fBF800000;\n"
+	"mul.f32 	%f14, %f13, 0f3F000000;\n"
+	"cvt.sat.f32.f32	%f15, %f14;\n"
+	"add.f32 	%f16, %f6, 0fBF800000;\n"
+	"mul.f32 	%f17, %f16, 0f3F000000;\n"
+	"cvt.sat.f32.f32	%f18, %f17;\n"
+	"add.f32 	%f19, %f15, 0f3F000000;\n"
+	"div.rn.f32 	%f20, %f15, %f19;\n"
+	"add.f32 	%f21, %f18, 0f3F000000;\n"
+	"div.rn.f32 	%f22, %f18, %f21;\n"
+	"sub.f32 	%f23, %f9, %f20;\n"
+	"sub.f32 	%f24, %f12, %f22;\n"
+	"tex.2d.v4.u32.f32	{%r16, %r17, %r18, %r19}, [uchar2_tex, {%f23, %f24}];\n"
+	"add.f32 	%f25, %f9, %f20;\n"
+	"tex.2d.v4.u32.f32	{%r20, %r21, %r22, %r23}, [uchar2_tex, {%f25, %f24}];\n"
+	"add.f32 	%f26, %f12, %f22;\n"
+	"tex.2d.v4.u32.f32	{%r24, %r25, %r26, %r27}, [uchar2_tex, {%f23, %f26}];\n"
+	"tex.2d.v4.u32.f32	{%r28, %r29, %r30, %r31}, [uchar2_tex, {%f25, %f26}];\n"
+	"and.b32  	%r32, %r16, 255;\n"
+	"and.b32  	%r33, %r20, 255;\n"
+	"and.b32  	%r34, %r24, 255;\n"
+	"and.b32  	%r35, %r28, 255;\n"
+	"add.s32 	%r36, %r32, %r33;\n"
+	"add.s32 	%r37, %r36, %r34;\n"
+	"add.s32 	%r38, %r37, %r35;\n"
+	"add.s32 	%r39, %r38, 2;\n"
+	"shr.u32 	%r40, %r39, 2;\n"
+	"and.b32  	%r41, %r17, 255;\n"
+	"and.b32  	%r42, %r21, 255;\n"
+	"and.b32  	%r43, %r25, 255;\n"
+	"and.b32  	%r44, %r29, 255;\n"
+	"add.s32 	%r45, %r41, %r42;\n"
+	"add.s32 	%r46, %r45, %r43;\n"
+	"add.s32 	%r47, %r46, %r44;\n"
+	"add.s32 	%r48, %r47, 2;\n"
+	"shr.u32 	%r49, %r48, 2;\n"
+	"mad.lo.s32 	%r50, %r2, %r6, %r1;\n"
+	"shl.b32 	%r51, %r50, 1;\n"
+	"add.s32 	%r52, %r15, %r51;\n"
+	"cvt.u16.u32	%rs1, %r49;\n"
+	"cvt.u16.u32	%rs2, %r40;\n"
+	"st.global.v2.u8 	[%r52], {%rs2, %rs1};\n"
+	"\n"
+	"BB1_2:\n"
+	"ret;\n"
+	"}\n"
+	"\n"
+	"// .globl	Subsample_Bilinear_uchar4\n"
+	".visible .entry Subsample_Bilinear_uchar4(\n"
+	".param .u32 Subsample_Bilinear_uchar4_param_0,\n"
+	".param .u32 Subsample_Bilinear_uchar4_param_1,\n"
+	".param .u32 Subsample_Bilinear_uchar4_param_2,\n"
+	".param .u32 Subsample_Bilinear_uchar4_param_3,\n"
+	".param .u32 Subsample_Bilinear_uchar4_param_4,\n"
+	".param .u32 Subsample_Bilinear_uchar4_param_5\n"
+	")\n"
+	"{\n"
+	".reg .pred 	%p<4>;\n"
+	".reg .s16 	%rs<5>;\n"
+	".reg .f32 	%f<27>;\n"
+	".reg .s32 	%r<71>;\n"
+	".reg .s64 	%rd<2>;\n"
+	"\n"
+	"\n"
+	"ld.param.u32 	%r3, [Subsample_Bilinear_uchar4_param_0];\n"
+	"ld.param.u32 	%r4, [Subsample_Bilinear_uchar4_param_1];\n"
+	"ld.param.u32 	%r5, [Subsample_Bilinear_uchar4_param_2];\n"
+	"ld.param.u32 	%r6, [Subsample_Bilinear_uchar4_param_3];\n"
+	"ld.param.u32 	%r7, [Subsample_Bilinear_uchar4_param_4];\n"
+	"ld.param.u32 	%r8, [Subsample_Bilinear_uchar4_param_5];\n"
+	"mov.u32 	%r9, %ctaid.x;\n"
+	"mov.u32 	%r10, %ntid.x;\n"
+	"mov.u32 	%r11, %tid.x;\n"
+	"mad.lo.s32 	%r1, %r10, %r9, %r11;\n"
+	"mov.u32 	%r12, %ntid.y;\n"
+	"mov.u32 	%r13, %ctaid.y;\n"
+	"mov.u32 	%r14, %tid.y;\n"
+	"mad.lo.s32 	%r2, %r12, %r13, %r14;\n"
+	"setp.lt.s32	%p1, %r2, %r5;\n"
+	"setp.lt.s32	%p2, %r1, %r4;\n"
+	"and.pred  	%p3, %p1, %p2;\n"
+	"@!%p3 bra 	BB2_2;\n"
+	"bra.uni 	BB2_1;\n"
+	"\n"
+	"BB2_1:\n"
+	"cvta.to.global.u32 	%r15, %r3;\n"
+	"cvt.rn.f32.s32	%f1, %r4;\n"
+	"cvt.rn.f32.s32	%f2, %r7;\n"
+	"div.rn.f32 	%f3, %f2, %f1;\n"
+	"cvt.rn.f32.s32	%f4, %r5;\n"
+	"cvt.rn.f32.s32	%f5, %r8;\n"
+	"div.rn.f32 	%f6, %f5, %f4;\n"
+	"cvt.rn.f32.s32	%f7, %r1;\n"
+	"add.f32 	%f8, %f7, 0f3F000000;\n"
+	"mul.f32 	%f9, %f8, %f3;\n"
+	"cvt.rn.f32.s32	%f10, %r2;\n"
+	"add.f32 	%f11, %f10, 0f3F000000;\n"
+	"mul.f32 	%f12, %f11, %f6;\n"
+	"add.f32 	%f13, %f3, 0fBF800000;\n"
+	"mul.f32 	%f14, %f13, 0f3F000000;\n"
+	"cvt.sat.f32.f32	%f15, %f14;\n"
+	"add.f32 	%f16, %f6, 0fBF800000;\n"
+	"mul.f32 	%f17, %f16, 0f3F000000;\n"
+	"cvt.sat.f32.f32	%f18, %f17;\n"
+	"add.f32 	%f19, %f15, 0f3F000000;\n"
+	"div.rn.f32 	%f20, %f15, %f19;\n"
+	"add.f32 	%f21, %f18, 0f3F000000;\n"
+	"div.rn.f32 	%f22, %f18, %f21;\n"
+	"sub.f32 	%f23, %f9, %f20;\n"
+	"sub.f32 	%f24, %f12, %f22;\n"
+	"tex.2d.v4.u32.f32	{%r16, %r17, %r18, %r19}, [uchar4_tex, {%f23, %f24}];\n"
+	"add.f32 	%f25, %f9, %f20;\n"
+	"tex.2d.v4.u32.f32	{%r20, %r21, %r22, %r23}, [uchar4_tex, {%f25, %f24}];\n"
+	"add.f32 	%f26, %f12, %f22;\n"
+	"tex.2d.v4.u32.f32	{%r24, %r25, %r26, %r27}, [uchar4_tex, {%f23, %f26}];\n"
+	"tex.2d.v4.u32.f32	{%r28, %r29, %r30, %r31}, [uchar4_tex, {%f25, %f26}];\n"
+	"and.b32  	%r32, %r16, 255;\n"
+	"and.b32  	%r33, %r20, 255;\n"
+	"and.b32  	%r34, %r24, 255;\n"
+	"and.b32  	%r35, %r28, 255;\n"
+	"add.s32 	%r36, %r32, %r33;\n"
+	"add.s32 	%r37, %r36, %r34;\n"
+	"add.s32 	%r38, %r37, %r35;\n"
+	"add.s32 	%r39, %r38, 2;\n"
+	"shr.u32 	%r40, %r39, 2;\n"
+	"and.b32  	%r41, %r17, 255;\n"
+	"and.b32  	%r42, %r21, 255;\n"
+	"and.b32  	%r43, %r25, 255;\n"
+	"and.b32  	%r44, %r29, 255;\n"
+	"add.s32 	%r45, %r41, %r42;\n"
+	"add.s32 	%r46, %r45, %r43;\n"
+	"add.s32 	%r47, %r46, %r44;\n"
+	"add.s32 	%r48, %r47, 2;\n"
+	"shr.u32 	%r49, %r48, 2;\n"
+	"and.b32  	%r50, %r18, 255;\n"
+	"and.b32  	%r51, %r22, 255;\n"
+	"and.b32  	%r52, %r26, 255;\n"
+	"and.b32  	%r53, %r30, 255;\n"
+	"add.s32 	%r54, %r50, %r51;\n"
+	"add.s32 	%r55, %r54, %r52;\n"
+	"add.s32 	%r56, %r55, %r53;\n"
+	"add.s32 	%r57, %r56, 2;\n"
+	"shr.u32 	%r58, %r57, 2;\n"
+	"and.b32  	%r59, %r19, 255;\n"
+	"and.b32  	%r60, %r23, 255;\n"
+	"and.b32  	%r61, %r27, 255;\n"
+	"and.b32  	%r62, %r31, 255;\n"
+	"add.s32 	%r63, %r59, %r60;\n"
+	"add.s32 	%r64, %r63, %r61;\n"
+	"add.s32 	%r65, %r64, %r62;\n"
+	"add.s32 	%r66, %r65, 2;\n"
+	"shr.u32 	%r67, %r66, 2;\n"
+	"mad.lo.s32 	%r68, %r2, %r6, %r1;\n"
+	"shl.b32 	%r69, %r68, 2;\n"
+	"add.s32 	%r70, %r15, %r69;\n"
+	"cvt.u16.u32	%rs1, %r67;\n"
+	"cvt.u16.u32	%rs2, %r58;\n"
+	"cvt.u16.u32	%rs3, %r49;\n"
+	"cvt.u16.u32	%rs4, %r40;\n"
+	"st.global.v4.u8 	[%r70], {%rs4, %rs3, %rs2, %rs1};\n"
+	"\n"
+	"BB2_2:\n"
+	"ret;\n"
+	"}\n"
+	"\n"
+	"\n"
+;
+#elif defined ENVIRONMENT64
+const char resize_ptx[] = \
+	"//\n"
+	"// Generated by NVIDIA NVVM Compiler\n"
+	"//\n"
+	"// Compiler Build ID: CL-19324607\n"
+	"// Cuda compilation tools, release 7.0, V7.0.27\n"
+	"// Based on LLVM 3.4svn\n"
+	"//\n"
+	"\n"
+	".version 4.2\n"
+	".target sm_30\n"
+	".address_size 64\n"
+	"\n"
+	"// .globl	Subsample_Bilinear_uchar\n"
+	".global .texref uchar_tex;\n"
+	".global .texref uchar2_tex;\n"
+	".global .texref uchar4_tex;\n"
+	"\n"
+	".visible .entry Subsample_Bilinear_uchar(\n"
+	".param .u64 Subsample_Bilinear_uchar_param_0,\n"
+	".param .u32 Subsample_Bilinear_uchar_param_1,\n"
+	".param .u32 Subsample_Bilinear_uchar_param_2,\n"
+	".param .u32 Subsample_Bilinear_uchar_param_3,\n"
+	".param .u32 Subsample_Bilinear_uchar_param_4,\n"
+	".param .u32 Subsample_Bilinear_uchar_param_5\n"
+	")\n"
+	"{\n"
+	".reg .pred 	%p<4>;\n"
+	".reg .f32 	%f<27>;\n"
+	".reg .s32 	%r<40>;\n"
+	".reg .s64 	%rd<6>;\n"
+	"\n"
+	"\n"
+	"ld.param.u64 	%rd1, [Subsample_Bilinear_uchar_param_0];\n"
+	"ld.param.u32 	%r3, [Subsample_Bilinear_uchar_param_1];\n"
+	"ld.param.u32 	%r4, [Subsample_Bilinear_uchar_param_2];\n"
+	"ld.param.u32 	%r5, [Subsample_Bilinear_uchar_param_3];\n"
+	"ld.param.u32 	%r6, [Subsample_Bilinear_uchar_param_4];\n"
+	"ld.param.u32 	%r7, [Subsample_Bilinear_uchar_param_5];\n"
+	"mov.u32 	%r8, %ctaid.x;\n"
+	"mov.u32 	%r9, %ntid.x;\n"
+	"mov.u32 	%r10, %tid.x;\n"
+	"mad.lo.s32 	%r1, %r9, %r8, %r10;\n"
+	"mov.u32 	%r11, %ntid.y;\n"
+	"mov.u32 	%r12, %ctaid.y;\n"
+	"mov.u32 	%r13, %tid.y;\n"
+	"mad.lo.s32 	%r2, %r11, %r12, %r13;\n"
+	"setp.lt.s32	%p1, %r2, %r4;\n"
+	"setp.lt.s32	%p2, %r1, %r3;\n"
+	"and.pred  	%p3, %p1, %p2;\n"
+	"@!%p3 bra 	BB0_2;\n"
+	"bra.uni 	BB0_1;\n"
+	"\n"
+	"BB0_1:\n"
+	"cvta.to.global.u64 	%rd2, %rd1;\n"
+	"cvt.rn.f32.s32	%f1, %r3;\n"
+	"cvt.rn.f32.s32	%f2, %r6;\n"
+	"div.rn.f32 	%f3, %f2, %f1;\n"
+	"cvt.rn.f32.s32	%f4, %r4;\n"
+	"cvt.rn.f32.s32	%f5, %r7;\n"
+	"div.rn.f32 	%f6, %f5, %f4;\n"
+	"cvt.rn.f32.s32	%f7, %r1;\n"
+	"add.f32 	%f8, %f7, 0f3F000000;\n"
+	"mul.f32 	%f9, %f8, %f3;\n"
+	"cvt.rn.f32.s32	%f10, %r2;\n"
+	"add.f32 	%f11, %f10, 0f3F000000;\n"
+	"mul.f32 	%f12, %f11, %f6;\n"
+	"add.f32 	%f13, %f3, 0fBF800000;\n"
+	"mul.f32 	%f14, %f13, 0f3F000000;\n"
+	"cvt.sat.f32.f32	%f15, %f14;\n"
+	"add.f32 	%f16, %f6, 0fBF800000;\n"
+	"mul.f32 	%f17, %f16, 0f3F000000;\n"
+	"cvt.sat.f32.f32	%f18, %f17;\n"
+	"add.f32 	%f19, %f15, 0f3F000000;\n"
+	"div.rn.f32 	%f20, %f15, %f19;\n"
+	"add.f32 	%f21, %f18, 0f3F000000;\n"
+	"div.rn.f32 	%f22, %f18, %f21;\n"
+	"sub.f32 	%f23, %f9, %f20;\n"
+	"sub.f32 	%f24, %f12, %f22;\n"
+	"tex.2d.v4.u32.f32	{%r14, %r15, %r16, %r17}, [uchar_tex, {%f23, %f24}];\n"
+	"and.b32  	%r18, %r14, 255;\n"
+	"add.f32 	%f25, %f9, %f20;\n"
+	"tex.2d.v4.u32.f32	{%r19, %r20, %r21, %r22}, [uchar_tex, {%f25, %f24}];\n"
+	"and.b32  	%r23, %r19, 255;\n"
+	"add.f32 	%f26, %f12, %f22;\n"
+	"tex.2d.v4.u32.f32	{%r24, %r25, %r26, %r27}, [uchar_tex, {%f23, %f26}];\n"
+	"and.b32  	%r28, %r24, 255;\n"
+	"tex.2d.v4.u32.f32	{%r29, %r30, %r31, %r32}, [uchar_tex, {%f25, %f26}];\n"
+	"and.b32  	%r33, %r29, 255;\n"
+	"add.s32 	%r34, %r18, %r23;\n"
+	"add.s32 	%r35, %r34, %r28;\n"
+	"add.s32 	%r36, %r35, %r33;\n"
+	"add.s32 	%r37, %r36, 2;\n"
+	"shr.u32 	%r38, %r37, 2;\n"
+	"mad.lo.s32 	%r39, %r2, %r5, %r1;\n"
+	"cvt.s64.s32	%rd4, %r39;\n"
+	"add.s64 	%rd5, %rd2, %rd4;\n"
+	"st.global.u8 	[%rd5], %r38;\n"
+	"\n"
+	"BB0_2:\n"
+	"ret;\n"
+	"}\n"
+	"\n"
+	"// .globl	Subsample_Bilinear_uchar2\n"
+	".visible .entry Subsample_Bilinear_uchar2(\n"
+	".param .u64 Subsample_Bilinear_uchar2_param_0,\n"
+	".param .u32 Subsample_Bilinear_uchar2_param_1,\n"
+	".param .u32 Subsample_Bilinear_uchar2_param_2,\n"
+	".param .u32 Subsample_Bilinear_uchar2_param_3,\n"
+	".param .u32 Subsample_Bilinear_uchar2_param_4,\n"
+	".param .u32 Subsample_Bilinear_uchar2_param_5\n"
+	")\n"
+	"{\n"
+	".reg .pred 	%p<4>;\n"
+	".reg .s16 	%rs<3>;\n"
+	".reg .f32 	%f<27>;\n"
+	".reg .s32 	%r<49>;\n"
+	".reg .s64 	%rd<6>;\n"
+	"\n"
+	"\n"
+	"ld.param.u64 	%rd1, [Subsample_Bilinear_uchar2_param_0];\n"
+	"ld.param.u32 	%r3, [Subsample_Bilinear_uchar2_param_1];\n"
+	"ld.param.u32 	%r4, [Subsample_Bilinear_uchar2_param_2];\n"
+	"ld.param.u32 	%r5, [Subsample_Bilinear_uchar2_param_3];\n"
+	"ld.param.u32 	%r6, [Subsample_Bilinear_uchar2_param_4];\n"
+	"ld.param.u32 	%r7, [Subsample_Bilinear_uchar2_param_5];\n"
+	"mov.u32 	%r8, %ctaid.x;\n"
+	"mov.u32 	%r9, %ntid.x;\n"
+	"mov.u32 	%r10, %tid.x;\n"
+	"mad.lo.s32 	%r1, %r9, %r8, %r10;\n"
+	"mov.u32 	%r11, %ntid.y;\n"
+	"mov.u32 	%r12, %ctaid.y;\n"
+	"mov.u32 	%r13, %tid.y;\n"
+	"mad.lo.s32 	%r2, %r11, %r12, %r13;\n"
+	"setp.lt.s32	%p1, %r2, %r4;\n"
+	"setp.lt.s32	%p2, %r1, %r3;\n"
+	"and.pred  	%p3, %p1, %p2;\n"
+	"@!%p3 bra 	BB1_2;\n"
+	"bra.uni 	BB1_1;\n"
+	"\n"
+	"BB1_1:\n"
+	"cvta.to.global.u64 	%rd2, %rd1;\n"
+	"cvt.rn.f32.s32	%f1, %r3;\n"
+	"cvt.rn.f32.s32	%f2, %r6;\n"
+	"div.rn.f32 	%f3, %f2, %f1;\n"
+	"cvt.rn.f32.s32	%f4, %r4;\n"
+	"cvt.rn.f32.s32	%f5, %r7;\n"
+	"div.rn.f32 	%f6, %f5, %f4;\n"
+	"cvt.rn.f32.s32	%f7, %r1;\n"
+	"add.f32 	%f8, %f7, 0f3F000000;\n"
+	"mul.f32 	%f9, %f8, %f3;\n"
+	"cvt.rn.f32.s32	%f10, %r2;\n"
+	"add.f32 	%f11, %f10, 0f3F000000;\n"
+	"mul.f32 	%f12, %f11, %f6;\n"
+	"add.f32 	%f13, %f3, 0fBF800000;\n"
+	"mul.f32 	%f14, %f13, 0f3F000000;\n"
+	"cvt.sat.f32.f32	%f15, %f14;\n"
+	"add.f32 	%f16, %f6, 0fBF800000;\n"
+	"mul.f32 	%f17, %f16, 0f3F000000;\n"
+	"cvt.sat.f32.f32	%f18, %f17;\n"
+	"add.f32 	%f19, %f15, 0f3F000000;\n"
+	"div.rn.f32 	%f20, %f15, %f19;\n"
+	"add.f32 	%f21, %f18, 0f3F000000;\n"
+	"div.rn.f32 	%f22, %f18, %f21;\n"
+	"sub.f32 	%f23, %f9, %f20;\n"
+	"sub.f32 	%f24, %f12, %f22;\n"
+	"tex.2d.v4.u32.f32	{%r14, %r15, %r16, %r17}, [uchar2_tex, {%f23, %f24}];\n"
+	"add.f32 	%f25, %f9, %f20;\n"
+	"tex.2d.v4.u32.f32	{%r18, %r19, %r20, %r21}, [uchar2_tex, {%f25, %f24}];\n"
+	"add.f32 	%f26, %f12, %f22;\n"
+	"tex.2d.v4.u32.f32	{%r22, %r23, %r24, %r25}, [uchar2_tex, {%f23, %f26}];\n"
+	"tex.2d.v4.u32.f32	{%r26, %r27, %r28, %r29}, [uchar2_tex, {%f25, %f26}];\n"
+	"and.b32  	%r30, %r14, 255;\n"
+	"and.b32  	%r31, %r18, 255;\n"
+	"and.b32  	%r32, %r22, 255;\n"
+	"and.b32  	%r33, %r26, 255;\n"
+	"add.s32 	%r34, %r30, %r31;\n"
+	"add.s32 	%r35, %r34, %r32;\n"
+	"add.s32 	%r36, %r35, %r33;\n"
+	"add.s32 	%r37, %r36, 2;\n"
+	"shr.u32 	%r38, %r37, 2;\n"
+	"and.b32  	%r39, %r15, 255;\n"
+	"and.b32  	%r40, %r19, 255;\n"
+	"and.b32  	%r41, %r23, 255;\n"
+	"and.b32  	%r42, %r27, 255;\n"
+	"add.s32 	%r43, %r39, %r40;\n"
+	"add.s32 	%r44, %r43, %r41;\n"
+	"add.s32 	%r45, %r44, %r42;\n"
+	"add.s32 	%r46, %r45, 2;\n"
+	"shr.u32 	%r47, %r46, 2;\n"
+	"mad.lo.s32 	%r48, %r2, %r5, %r1;\n"
+	"mul.wide.s32 	%rd4, %r48, 2;\n"
+	"add.s64 	%rd5, %rd2, %rd4;\n"
+	"cvt.u16.u32	%rs1, %r47;\n"
+	"cvt.u16.u32	%rs2, %r38;\n"
+	"st.global.v2.u8 	[%rd5], {%rs2, %rs1};\n"
+	"\n"
+	"BB1_2:\n"
+	"ret;\n"
+	"}\n"
+	"\n"
+	"// .globl	Subsample_Bilinear_uchar4\n"
+	".visible .entry Subsample_Bilinear_uchar4(\n"
+	".param .u64 Subsample_Bilinear_uchar4_param_0,\n"
+	".param .u32 Subsample_Bilinear_uchar4_param_1,\n"
+	".param .u32 Subsample_Bilinear_uchar4_param_2,\n"
+	".param .u32 Subsample_Bilinear_uchar4_param_3,\n"
+	".param .u32 Subsample_Bilinear_uchar4_param_4,\n"
+	".param .u32 Subsample_Bilinear_uchar4_param_5\n"
+	")\n"
+	"{\n"
+	".reg .pred 	%p<4>;\n"
+	".reg .s16 	%rs<5>;\n"
+	".reg .f32 	%f<27>;\n"
+	".reg .s32 	%r<67>;\n"
+	".reg .s64 	%rd<6>;\n"
+	"\n"
+	"\n"
+	"ld.param.u64 	%rd1, [Subsample_Bilinear_uchar4_param_0];\n"
+	"ld.param.u32 	%r3, [Subsample_Bilinear_uchar4_param_1];\n"
+	"ld.param.u32 	%r4, [Subsample_Bilinear_uchar4_param_2];\n"
+	"ld.param.u32 	%r5, [Subsample_Bilinear_uchar4_param_3];\n"
+	"ld.param.u32 	%r6, [Subsample_Bilinear_uchar4_param_4];\n"
+	"ld.param.u32 	%r7, [Subsample_Bilinear_uchar4_param_5];\n"
+	"mov.u32 	%r8, %ctaid.x;\n"
+	"mov.u32 	%r9, %ntid.x;\n"
+	"mov.u32 	%r10, %tid.x;\n"
+	"mad.lo.s32 	%r1, %r9, %r8, %r10;\n"
+	"mov.u32 	%r11, %ntid.y;\n"
+	"mov.u32 	%r12, %ctaid.y;\n"
+	"mov.u32 	%r13, %tid.y;\n"
+	"mad.lo.s32 	%r2, %r11, %r12, %r13;\n"
+	"setp.lt.s32	%p1, %r2, %r4;\n"
+	"setp.lt.s32	%p2, %r1, %r3;\n"
+	"and.pred  	%p3, %p1, %p2;\n"
+	"@!%p3 bra 	BB2_2;\n"
+	"bra.uni 	BB2_1;\n"
+	"\n"
+	"BB2_1:\n"
+	"cvta.to.global.u64 	%rd2, %rd1;\n"
+	"cvt.rn.f32.s32	%f1, %r3;\n"
+	"cvt.rn.f32.s32	%f2, %r6;\n"
+	"div.rn.f32 	%f3, %f2, %f1;\n"
+	"cvt.rn.f32.s32	%f4, %r4;\n"
+	"cvt.rn.f32.s32	%f5, %r7;\n"
+	"div.rn.f32 	%f6, %f5, %f4;\n"
+	"cvt.rn.f32.s32	%f7, %r1;\n"
+	"add.f32 	%f8, %f7, 0f3F000000;\n"
+	"mul.f32 	%f9, %f8, %f3;\n"
+	"cvt.rn.f32.s32	%f10, %r2;\n"
+	"add.f32 	%f11, %f10, 0f3F000000;\n"
+	"mul.f32 	%f12, %f11, %f6;\n"
+	"add.f32 	%f13, %f3, 0fBF800000;\n"
+	"mul.f32 	%f14, %f13, 0f3F000000;\n"
+	"cvt.sat.f32.f32	%f15, %f14;\n"
+	"add.f32 	%f16, %f6, 0fBF800000;\n"
+	"mul.f32 	%f17, %f16, 0f3F000000;\n"
+	"cvt.sat.f32.f32	%f18, %f17;\n"
+	"add.f32 	%f19, %f15, 0f3F000000;\n"
+	"div.rn.f32 	%f20, %f15, %f19;\n"
+	"add.f32 	%f21, %f18, 0f3F000000;\n"
+	"div.rn.f32 	%f22, %f18, %f21;\n"
+	"sub.f32 	%f23, %f9, %f20;\n"
+	"sub.f32 	%f24, %f12, %f22;\n"
+	"tex.2d.v4.u32.f32	{%r14, %r15, %r16, %r17}, [uchar4_tex, {%f23, %f24}];\n"
+	"add.f32 	%f25, %f9, %f20;\n"
+	"tex.2d.v4.u32.f32	{%r18, %r19, %r20, %r21}, [uchar4_tex, {%f25, %f24}];\n"
+	"add.f32 	%f26, %f12, %f22;\n"
+	"tex.2d.v4.u32.f32	{%r22, %r23, %r24, %r25}, [uchar4_tex, {%f23, %f26}];\n"
+	"tex.2d.v4.u32.f32	{%r26, %r27, %r28, %r29}, [uchar4_tex, {%f25, %f26}];\n"
+	"and.b32  	%r30, %r14, 255;\n"
+	"and.b32  	%r31, %r18, 255;\n"
+	"and.b32  	%r32, %r22, 255;\n"
+	"and.b32  	%r33, %r26, 255;\n"
+	"add.s32 	%r34, %r30, %r31;\n"
+	"add.s32 	%r35, %r34, %r32;\n"
+	"add.s32 	%r36, %r35, %r33;\n"
+	"add.s32 	%r37, %r36, 2;\n"
+	"shr.u32 	%r38, %r37, 2;\n"
+	"and.b32  	%r39, %r15, 255;\n"
+	"and.b32  	%r40, %r19, 255;\n"
+	"and.b32  	%r41, %r23, 255;\n"
+	"and.b32  	%r42, %r27, 255;\n"
+	"add.s32 	%r43, %r39, %r40;\n"
+	"add.s32 	%r44, %r43, %r41;\n"
+	"add.s32 	%r45, %r44, %r42;\n"
+	"add.s32 	%r46, %r45, 2;\n"
+	"shr.u32 	%r47, %r46, 2;\n"
+	"and.b32  	%r48, %r16, 255;\n"
+	"and.b32  	%r49, %r20, 255;\n"
+	"and.b32  	%r50, %r24, 255;\n"
+	"and.b32  	%r51, %r28, 255;\n"
+	"add.s32 	%r52, %r48, %r49;\n"
+	"add.s32 	%r53, %r52, %r50;\n"
+	"add.s32 	%r54, %r53, %r51;\n"
+	"add.s32 	%r55, %r54, 2;\n"
+	"shr.u32 	%r56, %r55, 2;\n"
+	"and.b32  	%r57, %r17, 255;\n"
+	"and.b32  	%r58, %r21, 255;\n"
+	"and.b32  	%r59, %r25, 255;\n"
+	"and.b32  	%r60, %r29, 255;\n"
+	"add.s32 	%r61, %r57, %r58;\n"
+	"add.s32 	%r62, %r61, %r59;\n"
+	"add.s32 	%r63, %r62, %r60;\n"
+	"add.s32 	%r64, %r63, 2;\n"
+	"shr.u32 	%r65, %r64, 2;\n"
+	"mad.lo.s32 	%r66, %r2, %r5, %r1;\n"
+	"mul.wide.s32 	%rd4, %r66, 4;\n"
+	"add.s64 	%rd5, %rd2, %rd4;\n"
+	"cvt.u16.u32	%rs1, %r65;\n"
+	"cvt.u16.u32	%rs2, %r56;\n"
+	"cvt.u16.u32	%rs3, %r47;\n"
+	"cvt.u16.u32	%rs4, %r38;\n"
+	"st.global.v4.u8 	[%rd5], {%rs4, %rs3, %rs2, %rs1};\n"
+	"\n"
+	"BB2_2:\n"
+	"ret;\n"
+	"}\n"
+	"\n"
+	"\n"
+;
+#endif
diff --git a/libavutil/Makefile b/libavutil/Makefile
index 1bac2b9..eae0371 100644
--- a/libavutil/Makefile
+++ b/libavutil/Makefile
@@ -20,6 +20,7 @@ HEADERS = adler32.h                                                     \
           common.h                                                      \
           cpu.h                                                         \
           crc.h                                                         \
+          cudautils.h                                                   \
           des.h                                                         \
           display.h                                                     \
           downmix_info.h                                                \
@@ -92,6 +93,7 @@ OBJS = adler32.o                                                        \
        color_utils.o                                                    \
        cpu.o                                                            \
        crc.o                                                            \
+       cudautils.o                                                      \
        des.o                                                            \
        display.o                                                        \
        downmix_info.o                                                   \
diff --git a/libavutil/cudautils.c b/libavutil/cudautils.c
new file mode 100644
index 0000000..63b0864
--- /dev/null
+++ b/libavutil/cudautils.c
@@ -0,0 +1,288 @@
+/*
+*
+* This file is part of FFmpeg.
+*
+* FFmpeg is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License as published by the Free Software Foundation; either
+* version 2.1 of the License, or (at your option) any later version.
+*
+* FFmpeg is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+* Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public
+* License along with FFmpeg; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include "cudautils.h"
+#include "common.h"
+#include "log.h"
+
+#define FF_NVINFO_VERSION 1
+static NVGUID NV_INFO_GUID = { 0x2cab9a64, 0x7095, 0x11e5, { 0xad, 0x1d, 0x94, 0xde, 0x80, 0x65, 0xb7, 0x74 } };
+static CudaContext cudaCtx = { { NULL }, { 0 }, { NULL },  0 , { NULL }, { "" }, { 0 } };
+
+int dyload_cuda(void);
+int check_cuda(void);
+int check_cuda_errors(CUresult err, const char *func);
+
+/* Resolve symbol s from dl_fn->cuda_lib into f (cast to t); on failure log and jump to the caller's error: label. */
+#define CHECK_LOAD_FUNC(t, f, s) \
+do { \
+    (f) = (t)LOAD_FUNC(dl_fn->cuda_lib, s); \
+    if (!(f)) { \
+        av_log(NULL, AV_LOG_FATAL, "Failed loading %s from CUDA library\n", s); \
+        goto error; \
+        } \
+} while (0)
+
+/* Report a CUDA driver result: returns 1 for CUDA_SUCCESS, otherwise logs func and the code at FATAL and returns 0. */
+int check_cuda_errors(CUresult err, const char *func)
+{
+    if (err == CUDA_SUCCESS)
+        return 1;
+    av_log(NULL, AV_LOG_FATAL, ">> %s - failed with error code 0x%x\n", func, err);
+    return 0;
+}
+/* Statement form: evaluates f once and jumps to the enclosing function's error: label when it fails. */
+#define check_cuda_errors(f) do { if (!check_cuda_errors(f, #f)) goto error; } while (0)
+
+/* Load the CUDA driver library (once) and resolve every entry point of the function table.
+ * Returns 1 on success; on failure the library is closed, the table is cleared and 0 is returned. */
+int dyload_cuda(void)
+{
+    CudaDynLoadFunctions *dl_fn = &cudaCtx.cuda_dload_funcs;
+    if (dl_fn->cuda_lib) /* a previous call already loaded it */
+        return 1;
+#if defined(_WIN32)
+    dl_fn->cuda_lib = LoadLibrary(TEXT("nvcuda.dll"));
+#else
+    /* Prefer the versioned soname; the unversioned link is not installed on every system. */
+    dl_fn->cuda_lib = dlopen("libcuda.so.1", RTLD_LAZY);
+    if (!dl_fn->cuda_lib)
+        dl_fn->cuda_lib = dlopen("libcuda.so", RTLD_LAZY);
+#endif
+    if (!dl_fn->cuda_lib) {
+        av_log(NULL, AV_LOG_FATAL, "Failed loading CUDA library\n");
+        goto error;
+    }
+
+    CHECK_LOAD_FUNC(PCUINIT, dl_fn->cu_init, "cuInit");
+    CHECK_LOAD_FUNC(PCUDEVICEGETCOUNT, dl_fn->cu_device_get_count, "cuDeviceGetCount");
+    CHECK_LOAD_FUNC(PCUDEVICEGET, dl_fn->cu_device_get, "cuDeviceGet");
+    CHECK_LOAD_FUNC(PCUDEVICEGETNAME, dl_fn->cu_device_get_name, "cuDeviceGetName");
+    CHECK_LOAD_FUNC(PCUDEVICECOMPUTECAPABILITY, dl_fn->cu_device_compute_capability, "cuDeviceComputeCapability");
+    CHECK_LOAD_FUNC(PCUCTXCREATE, dl_fn->cu_ctx_create, "cuCtxCreate_v2");
+    CHECK_LOAD_FUNC(PCUCTXPOPCURRENT, dl_fn->cu_ctx_pop_current, "cuCtxPopCurrent_v2");
+    CHECK_LOAD_FUNC(PCUCTXDESTROY, dl_fn->cu_ctx_destroy, "cuCtxDestroy_v2");
+    CHECK_LOAD_FUNC(PCUMODULELOADDATA, dl_fn->cu_module_load_data, "cuModuleLoadData");
+    CHECK_LOAD_FUNC(PCUMODULEGETFUNCTION, dl_fn->cu_module_get_function, "cuModuleGetFunction");
+    CHECK_LOAD_FUNC(PCUMODULEGETTEXREF, dl_fn->cu_module_get_texref, "cuModuleGetTexRef");
+    CHECK_LOAD_FUNC(PCUTEXREFSETFLAGS, dl_fn->cu_texref_set_flags, "cuTexRefSetFlags");
+    CHECK_LOAD_FUNC(PCUTEXREFSETFILTERMODE, dl_fn->cu_texref_set_filtermode, "cuTexRefSetFilterMode");
+    CHECK_LOAD_FUNC(PCUTEXREFSETADDRESS2D, dl_fn->cu_texref_set_address_2D, "cuTexRefSetAddress2D_v3");
+    CHECK_LOAD_FUNC(PCUMEMALLOCPITCH, dl_fn->cu_mem_alloc_pitch, "cuMemAllocPitch_v2");
+    CHECK_LOAD_FUNC(PCUMEMCPY2D, dl_fn->cu_mem_cpy_2D, "cuMemcpy2D_v2");
+    CHECK_LOAD_FUNC(PCUMEMCPY2DASYNC, dl_fn->cu_mem_cpy_2D_async, "cuMemcpy2DAsync_v2");
+    CHECK_LOAD_FUNC(PCUMEMFREE, dl_fn->cu_mem_free, "cuMemFree_v2");
+    CHECK_LOAD_FUNC(PCULAUNCHKERNEL, dl_fn->cu_launch_kernel, "cuLaunchKernel");
+    av_log(NULL, AV_LOG_VERBOSE, "CUDA Library and Function loaded successfully\n");
+    return 1;
+
+error:
+    if (dl_fn->cuda_lib)
+        DL_CLOSE_FUNC(dl_fn->cuda_lib);
+    memset(dl_fn, 0, sizeof(*dl_fn)); /* also drops pointers into the library that was just closed */
+    return 0;
+}
+
+
+/* Load CUDA, initialise the driver and record handle, name and SM version of each device
+ * (at most MAX_NUM_GPU). Returns 1 on success; on failure device_count is reset to 0 and 0 is returned. */
+int check_cuda(void)
+{
+    int device_count = 0;
+    int smminor = 0, smmajor = 0;
+    int i;
+    CudaDynLoadFunctions *dl_fn = &cudaCtx.cuda_dload_funcs;
+
+    if (!dyload_cuda())
+        return 0;
+    check_cuda_errors(dl_fn->cu_init(0));
+    check_cuda_errors(dl_fn->cu_device_get_count(&device_count));
+
+    if (device_count <= 0) {
+        av_log(NULL, AV_LOG_FATAL, "No CUDA capable devices found\n");
+        goto error;
+    }
+    av_log(NULL, AV_LOG_VERBOSE, "%d CUDA capable devices found\n", device_count);
+    /* The per-device tables hold MAX_NUM_GPU entries; devices beyond that are ignored. */
+    if (device_count > MAX_NUM_GPU)
+        device_count = MAX_NUM_GPU;
+
+    for (i = 0; i < device_count; ++i) {
+        check_cuda_errors(dl_fn->cu_device_get(&cudaCtx.cu_devices[i], i));
+        check_cuda_errors(dl_fn->cu_device_get_name(cudaCtx.gpu_name[i], sizeof(cudaCtx.gpu_name[i]), cudaCtx.cu_devices[i]));
+        check_cuda_errors(dl_fn->cu_device_compute_capability(&smmajor, &smminor, cudaCtx.cu_devices[i]));
+        /* Packed as (major << 4) | minor, e.g. SM 3.5 is stored as 0x35. */
+        cudaCtx.smver[i] = (smmajor << 4) | smminor;
+        av_log(NULL, AV_LOG_VERBOSE, "[ GPU #%d - < %s > has Compute SM %d.%d]\n", i, cudaCtx.gpu_name[i], smmajor, smminor);
+    }
+    cudaCtx.device_count = device_count;
+    return 1;
+
+error:
+    cudaCtx.device_count = 0;
+    return 0;
+}
+
+/* Probe CUDA on first use: check_cuda() runs only while no devices are recorded.
+ * Returns 1 when devices are known (now or from an earlier call), 0 when the probe fails. */
+int init_cuda(void)
+{
+    /* A non-zero device_count means an earlier probe already succeeded. */
+    if (cudaCtx.device_count != 0)
+        return 1;
+
+    return check_cuda() ? 1 : 0;
+}
+
+/* Unload the CUDA library and forget the probed devices once no shared context is left.
+ * Does nothing while any context is still cached or when the library is not loaded. */
+void deinit_cuda(void)
+{
+    CudaDynLoadFunctions *dl_fn = &cudaCtx.cuda_dload_funcs;
+    int i;
+
+    for (i = 0; i < cudaCtx.device_count; i++) {
+        if (cudaCtx.cuda_context_arr[i] != NULL)
+            return; /* a context is still in use */
+    }
+
+    /* Nothing to close if loading never succeeded; the close call must not be given NULL. */
+    if (!dl_fn->cuda_lib)
+        return;
+
+    DL_CLOSE_FUNC(dl_fn->cuda_lib);
+    dl_fn->cuda_lib = NULL;
+
+    dl_fn->cu_init = NULL;
+    dl_fn->cu_device_get_count = NULL;
+    dl_fn->cu_device_get = NULL;
+    dl_fn->cu_device_get_name = NULL;
+    dl_fn->cu_device_compute_capability = NULL;
+    dl_fn->cu_ctx_create = NULL;
+    dl_fn->cu_ctx_pop_current = NULL;
+    dl_fn->cu_ctx_destroy = NULL;
+    dl_fn->cu_module_load_data = NULL;
+    dl_fn->cu_module_get_function = NULL;
+    dl_fn->cu_module_get_texref = NULL;
+    dl_fn->cu_texref_set_flags = NULL;
+    dl_fn->cu_texref_set_filtermode = NULL;
+    dl_fn->cu_texref_set_address_2D = NULL;
+    dl_fn->cu_mem_alloc_pitch = NULL;
+    dl_fn->cu_mem_cpy_2D = NULL;
+    dl_fn->cu_mem_cpy_2D_async = NULL;
+    dl_fn->cu_mem_free = NULL;
+    dl_fn->cu_launch_kernel = NULL;
+    /* Without this reset init_cuda() would skip reloading and callers would hit the NULL table. */
+    cudaCtx.device_count = 0;
+    av_log(NULL, AV_LOG_VERBOSE, "Cuda Library unloaded\n");
+}
+
+/* Return 1 when device index gpu was detected and its packed SM version,
+ * (major << 4) | minor, is at least target_smver; return 0 otherwise
+ * (including for an index that does not name a detected device). */
+int is_gpu_feature_available(int gpu, int target_smver)
+{
+    /* device_count stays 0 until the devices were probed, so this test also covers
+     * "not initialised" and keeps the index inside the filled part of smver[]. */
+    if (gpu < 0 || gpu >= MAX_NUM_GPU || gpu >= cudaCtx.device_count)
+        return 0;
+
+    return cudaCtx.smver[gpu] >= target_smver;
+}
+
+/* Hand out the shared context of device gpu, creating it on first use; every successful
+ * call adds one reference, to be given back with release_cuda_context().
+ * Returns CUDA_SUCCESS (0), or a non-zero CUresult code with *ctx set to NULL. */
+int get_cuda_context(CUcontext *ctx, int gpu)
+{
+    CudaDynLoadFunctions *dl_fn = &cudaCtx.cuda_dload_funcs;
+    CUcontext new_ctx = NULL;
+    CUresult cu_res;
+
+    *ctx = NULL; /* callers see NULL on every failure path */
+    if (gpu < 0 || gpu >= MAX_NUM_GPU || gpu >= cudaCtx.device_count)
+        return 1; /* CUDA_ERROR_INVALID_VALUE in the CUDA driver API */
+    if (cudaCtx.cuda_context_arr[gpu] == NULL) {
+        /* Flag value 4 is CU_CTX_SCHED_BLOCKING_SYNC in the CUDA driver API. */
+        cu_res = dl_fn->cu_ctx_create(&new_ctx, 4, cudaCtx.cu_devices[gpu]);
+        if (cu_res != CUDA_SUCCESS)
+            return cu_res;
+        cudaCtx.cuda_context_arr[gpu] = new_ctx;
+    }
+
+    *ctx = cudaCtx.cuda_context_arr[gpu];
+    cudaCtx.cuda_context_count[gpu] += 1;
+    av_log(NULL, AV_LOG_VERBOSE, "cudalib : Cuda Context created 0x%p\n", *ctx);
+    return CUDA_SUCCESS;
+}
+
+/* Give back one reference to device gpu's shared context and clear the caller's handle;
+ * the context is destroyed when the last reference is released. */
+void release_cuda_context(CUcontext *ctx, int gpu)
+{
+    CudaDynLoadFunctions *dl_fn = &cudaCtx.cuda_dload_funcs;
+    if (ctx) *ctx = NULL;
+    /* The count is unsigned: an unmatched release must not wrap it around. */
+    if (gpu < 0 || gpu >= MAX_NUM_GPU || cudaCtx.cuda_context_count[gpu] == 0)
+        return;
+    if (--cudaCtx.cuda_context_count[gpu] == 0) {
+        dl_fn->cu_ctx_destroy(cudaCtx.cuda_context_arr[gpu]);
+        cudaCtx.cuda_context_arr[gpu] = NULL;
+    }
+}
+
+/* Return 1 when ptr is non-NULL and starts with the NV_INFO_GUID tag, otherwise 0. */
+int check_nvinfo(void* ptr)
+{
+    return ptr && memcmp(&((ffnvinfo*)ptr)->guid, &NV_INFO_GUID, sizeof(NVGUID)) == 0;
+}
+
+/* Allocate a zeroed ffnvinfo tagged with NV_INFO_GUID and FF_NVINFO_VERSION; returns NULL when allocation fails. */
+ffnvinfo* init_nvinfo(void)
+{
+    ffnvinfo* info = av_mallocz(sizeof(ffnvinfo));
+    if (!info)
+        return NULL;
+    memcpy(&info->guid, &NV_INFO_GUID, sizeof(info->guid));
+    info->version = FF_NVINFO_VERSION;
+    return info;
+}
+
+CudaDynLoadFunctions* get_cuda_dl_func(void) /* accessor for the file-static CUDA function table */
+{
+    return &cudaCtx.cuda_dload_funcs; /* never NULL; its entries are only set once dyload_cuda() has succeeded */
+}
+
+/* Describe one 2D copy (memory types, host/device pointers, pitches, width in bytes and
+ * number of rows) in a CUDA_MEMCPY2D and submit it through the loaded cuMemcpy2DAsync_v2
+ * entry point with a NULL stream. Returns that call's CUresult. */
+CUresult cuMemCpy2d(const void *srcHost, CUdeviceptr srcDevice, size_t srcPitch, void *dstHost, CUdeviceptr dstDevice, size_t dstPitch, size_t width, size_t height, CUmemorytype srcMemoryType, CUmemorytype dstMemoryType)
+{
+    /* Members not listed (byte/row offsets and array handles) are zero-initialised. */
+    CUDA_MEMCPY2D copy_desc = {
+        .srcMemoryType = srcMemoryType, .dstMemoryType = dstMemoryType,
+        .srcHost       = srcHost,       .dstHost       = dstHost,
+        .srcDevice     = srcDevice,     .dstDevice     = dstDevice,
+        .srcPitch      = srcPitch,      .dstPitch      = dstPitch,
+        .WidthInBytes  = width,
+        .Height        = height,
+    };
+
+    return get_cuda_dl_func()->cu_mem_cpy_2D_async(&copy_desc, NULL);
+}
diff --git a/libavutil/cudautils.h b/libavutil/cudautils.h
new file mode 100644
index 0000000..98e12f0
--- /dev/null
+++ b/libavutil/cudautils.h
@@ -0,0 +1,216 @@
+/*
+*
+* This file is part of FFmpeg.
+*
+* FFmpeg is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License as published by the Free Software Foundation; either
+* version 2.1 of the License, or (at your option) any later version.
+*
+* FFmpeg is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+* Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public
+* License along with FFmpeg; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#if defined(_WIN32)
+#include <windows.h>
+#else
+#include <dlfcn.h>
+#endif
+
+#include "common.h"
+
+
+#if defined(_WIN32)
+#define CUDAAPI __stdcall
+#else
+#define CUDAAPI
+#endif
+
+#if defined(_WIN32)
+#define LOAD_FUNC(l, s) GetProcAddress(l, s)
+#define DL_CLOSE_FUNC(l) FreeLibrary(l)
+#else
+#define LOAD_FUNC(l, s) dlsym(l, s)
+#define DL_CLOSE_FUNC(l) dlclose(l)
+#endif
+
+#define MAX_NUM_GPU 16
+
+#define CU_TRSF_READ_AS_INTEGER         0x01
+#define CU_TRSF_NORMALIZED_COORDINATES  0x02
+#define CU_TRSF_SRGB  0x10
+/* Evaluate CUDA call a once; on failure log it and return AVERROR_EXTERNAL from the enclosing (int-returning) function. */
+#define __cu(a) do { \
+    CUresult cu_status_ = (a); \
+    if (cu_status_ != CUDA_SUCCESS) { \
+        av_log(NULL, AV_LOG_FATAL, "[%s:%d]%s has returned CUDA error %d\n", __FILE__, __LINE__, #a, (int)cu_status_); \
+        return AVERROR_EXTERNAL; \
+    } } while (0)
+
+
+typedef int CUdevice;
+typedef void* CUcontext;
+typedef void* CUmodule;
+typedef void* CUfunction;
+typedef void* CUtexref;
+typedef void* CUstream;
+typedef void* CUarray;
+#if defined(_WIN64) || defined(__LP64__)
+typedef unsigned long long CUdeviceptr;
+#else
+typedef unsigned int CUdeviceptr;
+#endif
+
+typedef enum cudaError_enum {
+    CUDA_SUCCESS = 0
+} CUresult;
+
+typedef enum CUfilter_mode_enum {
+    CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */
+    CU_TR_FILTER_MODE_LINEAR = 1  /**< Linear filter mode */
+} CUfilter_mode;
+
+typedef enum CUarray_format_enum {
+    CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */
+    CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */
+    CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */
+    CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */
+    CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */
+    CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */
+    CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */
+    CU_AD_FORMAT_FLOAT = 0x20  /**< 32-bit floating point */
+} CUarray_format;
+
+typedef struct CUDA_ARRAY_DESCRIPTOR_st
+{
+    size_t Width;             /**< Width of array */
+    size_t Height;            /**< Height of array */
+
+    CUarray_format Format;    /**< Array format */
+    unsigned int NumChannels; /**< Channels per array element */
+} CUDA_ARRAY_DESCRIPTOR;
+
+typedef enum CUmemorytype_enum {
+    CU_MEMORYTYPE_HOST = 0x01,    /**< Host memory */
+    CU_MEMORYTYPE_DEVICE = 0x02,  /**< Device memory */
+    CU_MEMORYTYPE_ARRAY = 0x03,   /**< Array memory */
+    CU_MEMORYTYPE_UNIFIED = 0x04  /**< Unified device or host memory */
+} CUmemorytype;
+
+typedef struct CUDA_MEMCPY2D_st {
+    size_t srcXInBytes;         /**< Source X in bytes */
+    size_t srcY;                /**< Source Y */
+
+    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+    const void *srcHost;        /**< Source host pointer */
+    CUdeviceptr srcDevice;      /**< Source device pointer */
+    CUarray srcArray;           /**< Source array reference */
+    size_t srcPitch;            /**< Source pitch (ignored when src is array) */
+
+    size_t dstXInBytes;         /**< Destination X in bytes */
+    size_t dstY;                /**< Destination Y */
+
+    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+    void *dstHost;              /**< Destination host pointer */
+    CUdeviceptr dstDevice;      /**< Destination device pointer */
+    CUarray dstArray;           /**< Destination array reference */
+    size_t dstPitch;            /**< Destination pitch (ignored when dst is array) */
+
+    size_t WidthInBytes;        /**< Width of 2D memory copy in bytes */
+    size_t Height;              /**< Height of 2D memory copy */
+} CUDA_MEMCPY2D;
+
+
+typedef CUresult(CUDAAPI *PCUINIT)(unsigned int Flags);
+typedef CUresult(CUDAAPI *PCUDEVICEGETCOUNT)(int *count);
+typedef CUresult(CUDAAPI *PCUDEVICEGET)(CUdevice *device, int ordinal);
+typedef CUresult(CUDAAPI *PCUDEVICEGETNAME)(char *name, int len, CUdevice dev);
+typedef CUresult(CUDAAPI *PCUDEVICECOMPUTECAPABILITY)(int *major, int *minor, CUdevice dev);
+typedef CUresult(CUDAAPI *PCUCTXCREATE)(CUcontext *pctx, unsigned int flags, CUdevice dev);
+typedef CUresult(CUDAAPI *PCUCTXPOPCURRENT)(CUcontext *pctx);
+typedef CUresult(CUDAAPI *PCUCTXDESTROY)(CUcontext ctx);
+typedef CUresult(CUDAAPI *PCUMODULELOADDATA)(CUmodule *module, const void *image);
+typedef CUresult(CUDAAPI *PCUMODULEGETFUNCTION)(CUfunction *hfunc, CUmodule hmod, const char *name);
+typedef CUresult(CUDAAPI *PCUMODULEGETTEXREF)(CUtexref *pTexRef, CUmodule hmod, const char *name);
+typedef CUresult(CUDAAPI *PCUTEXREFSETFLAGS)(CUtexref hTexRef, unsigned int Flags);
+typedef CUresult(CUDAAPI *PCUTEXREFSETFILTERMODE)(CUtexref hTexRef, CUfilter_mode fm);
+typedef CUresult(CUDAAPI *PCUTEXREFSETADDRESS2D)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
+typedef CUresult(CUDAAPI *PCUMEMALLOCPITCH)(CUdeviceptr* dptr, size_t* pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
+typedef CUresult(CUDAAPI *PCUMEMCPY2D)(const CUDA_MEMCPY2D *pCopy);
+typedef CUresult(CUDAAPI *PCUMEMCPY2DASYNC)(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
+typedef CUresult(CUDAAPI *PCUMEMFREE)(CUdeviceptr dptr);
+typedef CUresult(CUDAAPI *PCULAUNCHKERNEL)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
+
+
+typedef struct CudaDynLoadFunctions{
+    PCUINIT cu_init;
+    PCUDEVICEGETCOUNT cu_device_get_count;
+    PCUDEVICEGET cu_device_get;
+    PCUDEVICEGETNAME cu_device_get_name;
+    PCUDEVICECOMPUTECAPABILITY cu_device_compute_capability;
+    PCUCTXCREATE cu_ctx_create;
+    PCUCTXPOPCURRENT cu_ctx_pop_current;
+    PCUCTXDESTROY cu_ctx_destroy;
+    PCUMODULELOADDATA cu_module_load_data;
+    PCUMODULEGETFUNCTION cu_module_get_function;
+    PCUMODULEGETTEXREF cu_module_get_texref;
+    PCUTEXREFSETFLAGS cu_texref_set_flags;
+    PCUTEXREFSETFILTERMODE cu_texref_set_filtermode;
+    PCUTEXREFSETADDRESS2D cu_texref_set_address_2D;
+    PCUMEMALLOCPITCH cu_mem_alloc_pitch;
+    PCUMEMCPY2D cu_mem_cpy_2D;
+    PCUMEMCPY2DASYNC cu_mem_cpy_2D_async;
+    PCUMEMFREE cu_mem_free;
+    PCULAUNCHKERNEL cu_launch_kernel;
+
+#if defined(_WIN32)
+    HMODULE cuda_lib;
+#else
+    void* cuda_lib;
+#endif
+} CudaDynLoadFunctions;
+
+typedef struct CudaContext{
+    /* Process-wide CUDA state kept by cudautils.c; the per-device arrays are indexed by device ordinal. */
+    CUcontext cuda_context_arr[MAX_NUM_GPU];      /* shared context per device, created by get_cuda_context() */
+    unsigned int cuda_context_count[MAX_NUM_GPU]; /* references handed out for that context and not yet released */
+    CudaDynLoadFunctions cuda_dload_funcs;        /* library handle and entry points filled by dyload_cuda() */
+
+    int device_count;                             /* devices recorded by check_cuda(); 0 while not probed */
+    CUdevice cu_devices[MAX_NUM_GPU];             /* handles returned by cuDeviceGet */
+    char gpu_name[MAX_NUM_GPU][128];              /* names returned by cuDeviceGetName */
+    int smver[MAX_NUM_GPU];                       /* compute capability packed as (major << 4) | minor */
+} CudaContext;
+
+typedef struct _NVGUID { /* GUID-style identifier split into 32 + 16 + 16 + 8x8 bits */
+  uint32_t Data1;
+  uint16_t Data2;
+  uint16_t Data3;
+  uint8_t  Data4[8];
+} NVGUID; /* used as the type of the tag at the start of ffnvinfo */
+
+typedef struct _ffnvinfo {
+    NVGUID    guid; /* tag that check_nvinfo() compares against NV_INFO_GUID */
+    uint32_t  version; /* set to FF_NVINFO_VERSION by init_nvinfo() */
+    //CUcontext cudaCtx;
+    void*     vxCtx; /* not touched by cudautils.c; purpose not visible here -- TODO document */
+    CUdeviceptr dptr[8]; /* presumably per-plane device addresses -- TODO confirm against the users */
+    uint32_t   linesize[8]; /* presumably the matching per-plane pitches in bytes -- TODO confirm */
+} ffnvinfo;
+
+int init_cuda(void);
+void deinit_cuda(void);
+int get_cuda_context(CUcontext *ctx, int gpu);
+void release_cuda_context(CUcontext *ctx, int gpu);
+int is_gpu_feature_available(int gpu, int target_smver);
+int check_nvinfo(void* ptr);
+ffnvinfo* init_nvinfo(void);
+CudaDynLoadFunctions* get_cuda_dl_func(void);
+
+CUresult cuMemCpy2d(const void *srcHost, CUdeviceptr srcDevice, size_t srcPitch, void *dstHost, CUdeviceptr dstDevice, size_t dstPitch, size_t width, size_t height, CUmemorytype srcMemoryType, CUmemorytype dstMemoryType);
-- 
1.8.1.msysgit.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Reply via email to