Signed-off-by: UsingtcNower <nowe...@gmail.com> --- Changelog | 1 + configure | 1 + doc/filters.texi | 31 +++ libavfilter/Makefile | 1 + libavfilter/allfilters.c | 1 + libavfilter/version.h | 2 +- libavfilter/vf_crop_cuda.c | 638 ++++++++++++++++++++++++++++++++++++++++++++ libavfilter/vf_crop_cuda.cu | 109 ++++++++ 8 files changed, 783 insertions(+), 1 deletion(-) create mode 100644 libavfilter/vf_crop_cuda.c create mode 100644 libavfilter/vf_crop_cuda.cu
diff --git a/Changelog b/Changelog index ad7e82f..f224fc8 100644 --- a/Changelog +++ b/Changelog @@ -20,6 +20,7 @@ version <next>: - libaribb24 based ARIB STD-B24 caption support (profiles A and C) - Support decoding of HEVC 4:4:4 content in nvdec and cuviddec - removed libndi-newtek +- crop_cuda GPU accelerated video crop filter version 4.1: diff --git a/configure b/configure index 331393f..3f3ac2f 100755 --- a/configure +++ b/configure @@ -2973,6 +2973,7 @@ qsvvpp_select="qsv" vaapi_encode_deps="vaapi" v4l2_m2m_deps="linux_videodev2_h sem_timedwait" +crop_cuda_filter_deps="ffnvcodec cuda_nvcc" hwupload_cuda_filter_deps="ffnvcodec" scale_npp_filter_deps="ffnvcodec libnpp" scale_cuda_filter_deps="ffnvcodec cuda_nvcc" diff --git a/doc/filters.texi b/doc/filters.texi index 4ffb392..ee16a2d 100644 --- a/doc/filters.texi +++ b/doc/filters.texi @@ -7415,6 +7415,37 @@ If the specified expression is not valid, it is kept at its current value. @end table +@section crop_cuda + +Crop the input video to given dimensions, implemented in CUDA. + +It accepts the following parameters: + +@table @option + +@item w +The width of the output video. It defaults to @code{iw}. +This expression is evaluated only once during the filter +configuration. + +@item h +The height of the output video. It defaults to @code{ih}. +This expression is evaluated only once during the filter +configuration. + +@item x +The horizontal position, in the input video, of the left edge of the output +video. It defaults to @code{(in_w-out_w)/2}. +This expression is evaluated only once during the filter +configuration. + +@item y +The vertical position, in the input video, of the top edge of the output video. +It defaults to @code{(in_h-out_h)/2}. +This expression is evaluated only once during the filter +configuration. +@end table + @section cropdetect Auto-detect the crop size. diff --git a/libavfilter/Makefile b/libavfilter/Makefile index fef6ec5..84df037 100644 --- a/libavfilter/Makefile +++ b/libavfilter/Makefile @@ -187,6 +187,7 @@ OBJS-$(CONFIG_COPY_FILTER) += vf_copy.o OBJS-$(CONFIG_COREIMAGE_FILTER) += vf_coreimage.o OBJS-$(CONFIG_COVER_RECT_FILTER) += vf_cover_rect.o lavfutils.o OBJS-$(CONFIG_CROP_FILTER) += vf_crop.o +OBJS-$(CONFIG_CROP_CUDA_FILTER) += vf_crop_cuda.o vf_crop_cuda.ptx.o OBJS-$(CONFIG_CROPDETECT_FILTER) += vf_cropdetect.o OBJS-$(CONFIG_CUE_FILTER) += f_cue.o OBJS-$(CONFIG_CURVES_FILTER) += vf_curves.o diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c index c51ae0f..550e545 100644 --- a/libavfilter/allfilters.c +++ b/libavfilter/allfilters.c @@ -175,6 +175,7 @@ extern AVFilter ff_vf_copy; extern AVFilter ff_vf_coreimage; extern AVFilter ff_vf_cover_rect; extern AVFilter ff_vf_crop; +extern AVFilter ff_vf_crop_cuda; extern AVFilter ff_vf_cropdetect; extern AVFilter ff_vf_cue; extern AVFilter ff_vf_curves; diff --git a/libavfilter/version.h b/libavfilter/version.h index c71282c..5aa95f4 100644 --- a/libavfilter/version.h +++ b/libavfilter/version.h @@ -31,7 +31,7 @@ #define LIBAVFILTER_VERSION_MAJOR 7 #define LIBAVFILTER_VERSION_MINOR 48 -#define LIBAVFILTER_VERSION_MICRO 100 +#define LIBAVFILTER_VERSION_MICRO 101 #define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \ LIBAVFILTER_VERSION_MINOR, \ diff --git a/libavfilter/vf_crop_cuda.c b/libavfilter/vf_crop_cuda.c new file mode 100644 index 0000000..fc6a2a6 --- /dev/null +++ b/libavfilter/vf_crop_cuda.c @@ -0,0 +1,638 @@ +/* +* Copyright (c) 2019, iQIYI CORPORATION. All rights reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +*/ + +#include <stdio.h> +#include <string.h> + +#include "libavutil/avstring.h" +#include "libavutil/common.h" +#include "libavutil/hwcontext.h" +#include "libavutil/hwcontext_cuda_internal.h" +#include "libavutil/cuda_check.h" +#include "libavutil/internal.h" +#include "libavutil/opt.h" +#include "libavutil/pixdesc.h" +#include "libavutil/eval.h" + +#include "avfilter.h" +#include "formats.h" +#include "internal.h" +#include "video.h" + +static const char *const var_names[] = { + "in_w", "iw", ///< width of the input video + "in_h", "ih", ///< height of the input video + "out_w", "ow", ///< width of the cropped video + "out_h", "oh", ///< height of the cropped video + "x", + "y", + NULL +}; + +enum var_name { + VAR_IN_W, VAR_IW, + VAR_IN_H, VAR_IH, + VAR_OUT_W, VAR_OW, + VAR_OUT_H, VAR_OH, + VAR_X, + VAR_Y, + VAR_VARS_NB +}; + +static const enum AVPixelFormat supported_formats[] = { + AV_PIX_FMT_YUV420P, + AV_PIX_FMT_NV12, + AV_PIX_FMT_YUV444P, + AV_PIX_FMT_P010, + AV_PIX_FMT_P016 +}; + +#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) ) +#define ALIGN_UP(a, b) (((a) + (b) - 1) & ~((b) - 1)) +#define NUM_BUFFERS 2 +#define BLOCKX 32 +#define BLOCKY 16 + +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x) + +typedef struct CUDACropContext { + const AVClass *class; + AVCUDADeviceContext *hwctx; + enum AVPixelFormat in_fmt; + enum AVPixelFormat out_fmt; + + struct { + int width; + int height; + int left; + int top; + } planes_in[3], planes_out[3]; + + AVBufferRef *frames_ctx; + AVFrame *frame; + + AVFrame *tmp_frame; + int passthrough; + + /** + * Output sw format. AV_PIX_FMT_NONE for no conversion. + */ + enum AVPixelFormat format; + + int w,h,x,y; + char *w_expr, *h_expr, *x_expr, *y_expr; + double var_values[VAR_VARS_NB]; + + CUcontext cu_ctx; + CUmodule cu_module; + CUfunction cu_func_uchar; + CUfunction cu_func_uchar2; + CUfunction cu_func_uchar4; + CUfunction cu_func_ushort; + CUfunction cu_func_ushort2; + CUfunction cu_func_ushort4; + CUstream cu_stream; + + CUdeviceptr srcBuffer; + CUdeviceptr dstBuffer; + int tex_alignment; +} CUDACropContext; + +static av_cold int cudacrop_init(AVFilterContext *ctx) +{ + CUDACropContext *s = ctx->priv; + + s->format = AV_PIX_FMT_NONE; + s->frame = av_frame_alloc(); + if (!s->frame) + return AVERROR(ENOMEM); + + s->tmp_frame = av_frame_alloc(); + if (!s->tmp_frame) + return AVERROR(ENOMEM); + + return 0; +} + +static av_cold void cudacrop_uninit(AVFilterContext *ctx) +{ + CUDACropContext *s = ctx->priv; + + av_frame_free(&s->frame); + av_buffer_unref(&s->frames_ctx); + av_frame_free(&s->tmp_frame); +} + +static int cudacrop_query_formats(AVFilterContext *ctx) +{ + static const enum AVPixelFormat pixel_formats[] = { + AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE, + }; + AVFilterFormats *pix_fmts = ff_make_format_list(pixel_formats); + + return ff_set_common_formats(ctx, pix_fmts); +} + +static av_cold int init_stage(CUDACropContext *s, AVBufferRef *device_ctx) +{ + AVBufferRef *out_ref = NULL; + AVHWFramesContext *out_ctx; + int in_sw, in_sh, out_sw, out_sh; + int ret, i; + + av_pix_fmt_get_chroma_sub_sample(s->in_fmt, &in_sw, &in_sh); + av_pix_fmt_get_chroma_sub_sample(s->out_fmt, &out_sw, &out_sh); + if (!s->planes_out[0].width) { + s->planes_out[0].width = s->planes_in[0].width; + s->planes_out[0].height = s->planes_in[0].height; + s->planes_out[0].left = s->planes_in[0].left; + s->planes_out[0].top = s->planes_in[0].top; + } + + for (i = 1; i < FF_ARRAY_ELEMS(s->planes_in); i++) { + s->planes_in[i].width = s->planes_in[0].width >> in_sw; + s->planes_in[i].height = s->planes_in[0].height >> in_sh; + s->planes_in[i].left = s->planes_in[0].left >> in_sw; + s->planes_in[i].top = s->planes_in[0].top >> in_sh; + s->planes_out[i].width = s->planes_out[0].width >> out_sw; + s->planes_out[i].height = s->planes_out[0].height >> out_sh; + s->planes_out[i].left = 0; + s->planes_out[i].top = 0; + + } + + out_ref = av_hwframe_ctx_alloc(device_ctx); + if (!out_ref) + return AVERROR(ENOMEM); + out_ctx = (AVHWFramesContext*)out_ref->data; + + out_ctx->format = AV_PIX_FMT_CUDA; + out_ctx->sw_format = s->out_fmt; + out_ctx->width = FFALIGN(s->planes_out[0].width, 32); + out_ctx->height = FFALIGN(s->planes_out[0].height, 32); + + ret = av_hwframe_ctx_init(out_ref); + if (ret < 0) + goto fail; + + av_frame_unref(s->frame); + ret = av_hwframe_get_buffer(out_ref, s->frame, 0); + if (ret < 0) + goto fail; + + s->frame->width = s->planes_out[0].width; + s->frame->height = s->planes_out[0].height; + + av_buffer_unref(&s->frames_ctx); + s->frames_ctx = out_ref; + + return 0; +fail: + av_buffer_unref(&out_ref); + return ret; +} + +static int format_is_supported(enum AVPixelFormat fmt) +{ + int i; + + for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) + if (supported_formats[i] == fmt) + return 1; + return 0; +} + +static av_cold int init_processing_chain(AVFilterContext *ctx, int in_width, int in_height, + int out_width, int out_height, + int left, int top) +{ + CUDACropContext *s = ctx->priv; + + AVHWFramesContext *in_frames_ctx; + + enum AVPixelFormat in_format; + enum AVPixelFormat out_format; + int ret; + + /* check that we have a hw context */ + if (!ctx->inputs[0]->hw_frames_ctx) { + av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n"); + return AVERROR(EINVAL); + } + in_frames_ctx = (AVHWFramesContext*)ctx->inputs[0]->hw_frames_ctx->data; + in_format = in_frames_ctx->sw_format; + out_format = (s->format == AV_PIX_FMT_NONE) ? in_format : s->format; + + if (!format_is_supported(in_format)) { + av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n", + av_get_pix_fmt_name(in_format)); + return AVERROR(ENOSYS); + } + if (!format_is_supported(out_format)) { + av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n", + av_get_pix_fmt_name(out_format)); + return AVERROR(ENOSYS); + } + + if (in_width == out_width && in_height == out_height) + s->passthrough = 1; + + s->in_fmt = in_format; + s->out_fmt = out_format; + + s->planes_in[0].width = in_width; + s->planes_in[0].height = in_height; + s->planes_in[0].left = left; + s->planes_in[0].top = top; + s->planes_out[0].width = out_width; + s->planes_out[0].height = out_height; + s->planes_out[0].left = 0; + s->planes_out[0].top = 0; + + ret = init_stage(s, in_frames_ctx->device_ref); + if (ret < 0) + return ret; + + ctx->outputs[0]->hw_frames_ctx = av_buffer_ref(s->frames_ctx); + if (!ctx->outputs[0]->hw_frames_ctx) + return AVERROR(ENOMEM); + + return 0; +} + +static inline int normalize_double(int *n, double d) +{ + int ret = 0; + + if (isnan(d)) + ret = AVERROR(EINVAL); + else if (d > INT_MAX || d < INT_MIN) { + *n = d > INT_MAX ? INT_MAX : INT_MIN; + ret = AVERROR(EINVAL); + } else + *n = lrint(d); + + return ret; +} + +static av_cold int cudacrop_config_input(AVFilterLink *inlink) +{ + AVFilterContext *ctx = inlink->dst; + CUDACropContext *s = ctx->priv; + double res; + int ret; + + s->var_values[VAR_IN_W] = s->var_values[VAR_IW] = inlink->w; + s->var_values[VAR_IN_H] = s->var_values[VAR_IH] = inlink->h; + s->var_values[VAR_OUT_W] = s->var_values[VAR_OW] = NAN; + s->var_values[VAR_OUT_H] = s->var_values[VAR_OH] = NAN; + s->var_values[VAR_X] = NAN; + s->var_values[VAR_Y] = NAN; + if ((ret = av_expr_parse_and_eval(&res, s->w_expr, + var_names, s->var_values, + NULL, NULL, NULL, NULL, NULL, 0, ctx)) < 0) + goto fail; + s->var_values[VAR_OUT_W] = s->var_values[VAR_OW] = res; + if ((ret = av_expr_parse_and_eval(&res, s->h_expr, + var_names, s->var_values, + NULL, NULL, NULL, NULL, NULL, 0, ctx)) < 0) + goto fail; + s->var_values[VAR_OUT_H] = s->var_values[VAR_OH] = res; + if ((ret = av_expr_parse_and_eval(&res, s->x_expr, + var_names, s->var_values, + NULL, NULL, NULL, NULL, NULL, 0, ctx)) < 0) + goto fail; + s->var_values[VAR_X] = res; + if ((ret = av_expr_parse_and_eval(&res, s->y_expr, + var_names, s->var_values, + NULL, NULL, NULL, NULL, NULL, 0, ctx)) < 0) + goto fail; + s->var_values[VAR_Y] = res; + if (normalize_double(&s->w, s->var_values[VAR_OW]) < 0 || + normalize_double(&s->h, s->var_values[VAR_OH]) < 0 || + normalize_double(&s->x, s->var_values[VAR_X]) < 0 || + normalize_double(&s->y, s->var_values[VAR_Y]) < 0) { + av_log(ctx, AV_LOG_ERROR, + "Too big value or invalid expression for out_w/ow or out_h/oh or x or y"); + return AVERROR(EINVAL); + } + +fail: + return ret; +} + +static av_cold int cudacrop_config_output(AVFilterLink *outlink) +{ + AVFilterContext *ctx = outlink->src; + AVFilterLink *inlink = outlink->src->inputs[0]; + CUDACropContext *s = ctx->priv; + AVHWFramesContext *frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data; + AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx; + CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx; + CudaFunctions *cu = device_hwctx->internal->cuda_dl; + int ret; + + extern char vf_crop_cuda_ptx[]; + + s->hwctx = device_hwctx; + s->cu_stream = s->hwctx->stream; + + ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); + if (ret < 0) + goto fail; + + ret = CHECK_CU(cu->cuModuleLoadData(&s->cu_module, vf_crop_cuda_ptx)); + if (ret < 0) + goto fail; + + CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar, s->cu_module, "Crop_uchar")); + if (ret < 0) + goto fail; + CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar2, s->cu_module, "Crop_uchar2")); + if (ret < 0) + goto fail; + CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar4, s->cu_module, "Crop_uchar4")); + if (ret < 0) + goto fail; + CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort, s->cu_module, "Crop_ushort")); + if (ret < 0) + goto fail; + CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort2, s->cu_module, "Crop_ushort2")); + if (ret < 0) + goto fail; + CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort4, s->cu_module, "Crop_ushort4")); + if (ret < 0) + goto fail; + + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + + outlink->w = s->w; + outlink->h = s->h; + + ret = init_processing_chain(ctx, inlink->w, inlink->h, s->w, s->h, s->x, s->y); + if (ret < 0) + return ret; + + if (inlink->sample_aspect_ratio.num) { + outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h*inlink->w, + outlink->w*inlink->h}, + inlink->sample_aspect_ratio); + } else { + outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; + } + + return 0; + +fail: + return ret; +} + +static int call_crop_kernel(AVFilterContext *ctx, CUfunction func, int channels, + uint8_t *src_dptr, int src_width, int src_height, int src_pitch, + uint8_t *dst_dptr, int dst_width, int dst_height, int dst_pitch, + int left, int top, int pixel_size) +{ + CUDACropContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + CUdeviceptr dst_devptr = (CUdeviceptr)dst_dptr; + CUtexObject tex = 0; + void *args_uchar[] = { &tex, &dst_devptr, &dst_width, &dst_height, &dst_pitch, &left, &top }; + int ret; + + CUDA_TEXTURE_DESC tex_desc = { + .filterMode = CU_TR_FILTER_MODE_LINEAR, + .flags = CU_TRSF_READ_AS_INTEGER, + }; + + CUDA_RESOURCE_DESC res_desc = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = pixel_size == 1 ? + CU_AD_FORMAT_UNSIGNED_INT8 : + CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = channels, + .res.pitch2D.width = src_width, + .res.pitch2D.height = src_height, + .res.pitch2D.pitchInBytes = src_pitch, + .res.pitch2D.devPtr = (CUdeviceptr)src_dptr, + }; + + ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL)); + if (ret < 0) + goto exit; + + ret = CHECK_CU(cu->cuLaunchKernel(func, DIV_UP(dst_width, BLOCKX), DIV_UP(dst_height, BLOCKY), 1, BLOCKX, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL)); + +exit: + if (tex) + CHECK_CU(cu->cuTexObjectDestroy(tex)); + return ret; +} + +static int cropcuda_crop_internal(AVFilterContext *ctx, + AVFrame *out, AVFrame *in) +{ + AVHWFramesContext *in_frames_ctx = (AVHWFramesContext*)in->hw_frames_ctx->data; + CUDACropContext *s = ctx->priv; + + switch (in_frames_ctx->sw_format) { + case AV_PIX_FMT_YUV420P: + call_crop_kernel(ctx, s->cu_func_uchar, 1, + in->data[0], in->width, in->height, in->linesize[0], + out->data[0], out->width, out->height, out->linesize[0], + s->planes_in[0].left, s->planes_in[0].top, 1); + call_crop_kernel(ctx, s->cu_func_uchar, 1, + in->data[0]+in->linesize[0]*in->height, in->width/2, in->height/2, in->linesize[0]/2, + out->data[0]+out->linesize[0]*out->height, out->width/2, out->height/2, out->linesize[0]/2, + s->planes_in[1].left, s->planes_in[1].top, 1); + call_crop_kernel(ctx, s->cu_func_uchar, 1, + in->data[0]+ ALIGN_UP((in->linesize[0]*in->height*5)/4, s->tex_alignment), in->width/2, in->height/2, in->linesize[0]/2, + out->data[0]+(out->linesize[0]*out->height*5)/4, out->width/2, out->height/2, out->linesize[0]/2, + s->planes_in[2].left, s->planes_in[2].top, 1); + break; + case AV_PIX_FMT_YUV444P: + call_crop_kernel(ctx, s->cu_func_uchar, 1, + in->data[0], in->width, in->height, in->linesize[0], + out->data[0], out->width, out->height, out->linesize[0], + s->planes_in[0].left, s->planes_in[0].top, 1); + call_crop_kernel(ctx, s->cu_func_uchar, 1, + in->data[0]+in->linesize[0]*in->height, in->width, in->height, in->linesize[0], + out->data[0]+out->linesize[0]*out->height, out->width, out->height, out->linesize[0], + s->planes_in[1].left, s->planes_in[1].top, 1); + call_crop_kernel(ctx, s->cu_func_uchar, 1, + in->data[0]+in->linesize[0]*in->height*2, in->width, in->height, in->linesize[0], + out->data[0]+out->linesize[0]*out->height*2, out->width, out->height, out->linesize[0], + s->planes_in[2].left, s->planes_in[2].top, 1); + break; + case AV_PIX_FMT_NV12: + call_crop_kernel(ctx, s->cu_func_uchar, 1, + in->data[0], in->width, in->height, in->linesize[0], + out->data[0], out->width, out->height, out->linesize[0], + s->planes_in[0].left, s->planes_in[0].top, 1); + call_crop_kernel(ctx, s->cu_func_uchar2, 2, + in->data[1], in->width/2, in->height/2, in->linesize[1], + out->data[0] + out->linesize[0] * ((out->height + 31) & ~0x1f), out->width/2, out->height/2, out->linesize[1]/2, + s->planes_in[1].left, s->planes_in[1].top, 1); + break; + case AV_PIX_FMT_P010LE: + call_crop_kernel(ctx, s->cu_func_ushort, 1, + in->data[0], in->width, in->height, in->linesize[0]/2, + out->data[0], out->width, out->height, out->linesize[0]/2, + s->planes_in[0].left, s->planes_in[0].top, 2); + call_crop_kernel(ctx, s->cu_func_ushort2, 2, + in->data[1], in->width / 2, in->height / 2, in->linesize[1]/2, + out->data[0] + out->linesize[0] * ((out->height + 31) & ~0x1f), out->width / 2, out->height / 2, out->linesize[1] / 4, + s->planes_in[1].left, s->planes_in[1].top, 2); + break; + case AV_PIX_FMT_P016LE: + call_crop_kernel(ctx, s->cu_func_ushort, 1, + in->data[0], in->width, in->height, in->linesize[0] / 2, + out->data[0], out->width, out->height, out->linesize[0] / 2, + s->planes_in[0].left, s->planes_in[0].top, 2); + call_crop_kernel(ctx, s->cu_func_ushort2, 2, + in->data[1], in->width / 2, in->height / 2, in->linesize[1] / 2, + out->data[0] + out->linesize[0] * ((out->height + 31) & ~0x1f), out->width / 2, out->height / 2, out->linesize[1] / 4, + s->planes_in[1].left, s->planes_in[1].top, 2); + break; + default: + return AVERROR_BUG; + } + + return 0; +} + +static int cudacrop_crop(AVFilterContext *ctx, AVFrame *out, AVFrame *in) +{ + CUDACropContext *s = ctx->priv; + AVFrame *src = in; + int ret; + + ret = cropcuda_crop_internal(ctx, s->frame, src); + if (ret < 0) + return ret; + + src = s->frame; + ret = av_hwframe_get_buffer(src->hw_frames_ctx, s->tmp_frame, 0); + if (ret < 0) + return ret; + + av_frame_move_ref(out, s->frame); + av_frame_move_ref(s->frame, s->tmp_frame); + + ret = av_frame_copy_props(out, in); + if (ret < 0) + return ret; + + return 0; +} + +static int cudacrop_filter_frame(AVFilterLink *link, AVFrame *in) +{ + AVFilterContext *ctx = link->dst; + CUDACropContext *s = ctx->priv; + AVFilterLink *outlink = ctx->outputs[0]; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + + AVFrame *out = NULL; + CUcontext dummy; + int ret = 0; + + out = av_frame_alloc(); + if (!out) { + ret = AVERROR(ENOMEM); + goto fail; + } + + ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + if (ret < 0) + goto fail; + + ret = cudacrop_crop(ctx, out, in); + + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + if (ret < 0) + goto fail; + + av_reduce(&out->sample_aspect_ratio.num, &out->sample_aspect_ratio.den, + (int64_t)in->sample_aspect_ratio.num * outlink->h * link->w, + (int64_t)in->sample_aspect_ratio.den * outlink->w * link->h, + INT_MAX); + + av_frame_free(&in); + return ff_filter_frame(outlink, out); +fail: + av_frame_free(&in); + av_frame_free(&out); + return ret; +} + +#define OFFSET(x) offsetof(CUDACropContext, x) +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM) +static const AVOption options[] = { + { "w", "set the width crop area expression", OFFSET(w_expr), AV_OPT_TYPE_STRING, { .str = "iw" }, .flags = FLAGS }, + { "h", "set the height crop area expression", OFFSET(h_expr), AV_OPT_TYPE_STRING, { .str = "ih" }, .flags = FLAGS }, + { "x", "set the x crop area expression", OFFSET(x_expr), AV_OPT_TYPE_STRING, { .str = "(in_w-out_w)/2"}, .flags = FLAGS }, + { "y", "set the y crop area expression", OFFSET(y_expr), AV_OPT_TYPE_STRING, { .str = "(in_h-out_h)/2"}, .flags = FLAGS }, + { NULL }, +}; + +static const AVClass cudacrop_class = { + .class_name = "cudacrop", + .item_name = av_default_item_name, + .option = options, + .version = LIBAVUTIL_VERSION_INT, +}; + +static const AVFilterPad cudacrop_inputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .filter_frame = cudacrop_filter_frame, + .config_props = cudacrop_config_input, + }, + { NULL } +}; + +static const AVFilterPad cudacrop_outputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .config_props = cudacrop_config_output, + }, + { NULL } +}; + +AVFilter ff_vf_crop_cuda = { + .name = "crop_cuda", + .description = NULL_IF_CONFIG_SMALL("GPU accelerated video crop"), + + .init = cudacrop_init, + .uninit = cudacrop_uninit, + .query_formats = cudacrop_query_formats, + + .priv_size = sizeof(CUDACropContext), + .priv_class = &cudacrop_class, + + .inputs = cudacrop_inputs, + .outputs = cudacrop_outputs, + + .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE, +}; diff --git a/libavfilter/vf_crop_cuda.cu b/libavfilter/vf_crop_cuda.cu new file mode 100644 index 0000000..4b94b73 --- /dev/null +++ b/libavfilter/vf_crop_cuda.cu @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2019, iQIYI CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +extern "C" { + +__global__ void Crop_uchar(cudaTextureObject_t uchar_tex, + unsigned char *dst, + int dst_width, int dst_height, int dst_pitch, + int left, int top) +{ + int xo = blockIdx.x * blockDim.x + threadIdx.x; + int yo = blockIdx.y * blockDim.y + threadIdx.y; + int xi = xo + left; + int yi = yo + top; + + if (yo < dst_height && xo < dst_width) + dst[yo*dst_pitch+xo] = (unsigned char) tex2D<unsigned char>(uchar_tex, xi, yi); +} + +__global__ void Crop_uchar2(cudaTextureObject_t uchar2_tex, + uchar2 *dst, + int dst_width, int dst_height, int dst_pitch, + int left, int top) +{ + int xo = blockIdx.x * blockDim.x + threadIdx.x; + int yo = blockIdx.y * blockDim.y + threadIdx.y; + int xi = xo + left; + int yi = yo + top; + + if (yo < dst_height && xo < dst_width) + dst[yo*dst_pitch+xo] = (uchar2) tex2D<uchar2>(uchar2_tex, xi, yi); +} + +__global__ void Crop_uchar4(cudaTextureObject_t uchar4_tex, + uchar4 *dst, + int dst_width, int dst_height, int dst_pitch, + int left, int top) +{ + int xo = blockIdx.x * blockDim.x + threadIdx.x; + int yo = blockIdx.y * blockDim.y + threadIdx.y; + int xi = xo + left; + int yi = yo + top; + + if (yo < dst_height && xo < dst_width) + dst[yo*dst_pitch+xo] = (uchar4) tex2D<uchar4>(uchar4_tex, xi, yi); +} + +__global__ void Crop_ushort(cudaTextureObject_t ushort_tex, + unsigned short *dst, + int dst_width, int dst_height, int dst_pitch, + int left, int top) +{ + int xo = blockIdx.x * blockDim.x + threadIdx.x; + int yo = blockIdx.y * blockDim.y + threadIdx.y; + int xi = xo + left; + int yi = yo + top; + + if (yo < dst_height && xo < dst_width) + dst[yo*dst_pitch+xo] = (unsigned short) tex2D<unsigned short>(ushort_tex, xi, yi); +} + +__global__ void Crop_ushort2(cudaTextureObject_t ushort2_tex, + ushort2 *dst, + int dst_width, int dst_height, int dst_pitch, + int left, int top) +{ + int xo = blockIdx.x * blockDim.x + threadIdx.x; + int yo = blockIdx.y * blockDim.y + threadIdx.y; + int xi = xo + left; + int yi = yo + top; + + if (yo < dst_height && xo < dst_width) + dst[yo*dst_pitch+xo] = (ushort2) tex2D<ushort2>(ushort2_tex, xi, yi); +} + +__global__ void Crop_ushort4(cudaTextureObject_t ushort4_tex, + ushort4 *dst, + int dst_width, int dst_height, int dst_pitch, + int left, int top) +{ + int xo = blockIdx.x * blockDim.x + threadIdx.x; + int yo = blockIdx.y * blockDim.y + threadIdx.y; + int xi = xo + left; + int yi = yo + top; + + if (yo < dst_height && xo < dst_width) + dst[yo*dst_pitch+xo] = (ushort4) tex2D<ushort4>(ushort4_tex, xi, yi); +} + +} -- 1.8.3.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".