opencv · asmorkalov · Apr 17, 2023 · Mar 8, 2023 · asmorkalov · Apr 13, 2023
diff --git a/modules/cudacodec/include/opencv2/cudacodec.hpp b/modules/cudacodec/include/opencv2/cudacodec.hpp
@@ -310,7 +310,7 @@ enum DeinterlaceMode
 struct CV_EXPORTS_W_SIMPLE FormatInfo
 {
     CV_WRAP FormatInfo() : nBitDepthMinus8(-1), ulWidth(0), ulHeight(0), width(0), height(0), ulMaxWidth(0), ulMaxHeight(0), valid(false),
-        fps(0), ulNumDecodeSurfaces(0) {};
+        fps(0), ulNumDecodeSurfaces(0), videoFullRangeFlag(false) {};
 
     CV_PROP_RW Codec codec;
     CV_PROP_RW ChromaFormat chromaFormat;
@@ -329,6 +329,7 @@ struct CV_EXPORTS_W_SIMPLE FormatInfo
     CV_PROP_RW cv::Size targetSz;//!< Post-processed size of the output frame.
     CV_PROP_RW cv::Rect srcRoi;//!< Region of interest decoded from video source.
     CV_PROP_RW cv::Rect targetRoi;//!< Region of interest in the output frame containing the decoded frame.
+    CV_PROP_RW bool videoFullRangeFlag;//!< Output value indicating if the black level, luma and chroma of the source are represented using the full or limited range (AKA TV or "analogue" range) of values as defined in Annex E of the ITU-T Specification.  Internally the conversion from NV12 to BGR obeys ITU 709.
 };
 
 /** @brief cv::cudacodec::VideoReader generic properties identifier.

diff --git a/modules/cudacodec/src/cuda/nv12_to_rgb.cu b/modules/cudacodec/src/cuda/nv12_to_rgb.cu
@@ -66,14 +66,20 @@ namespace
 {
     __constant__ float constHueColorSpaceMat[9] = {1.1644f, 0.0f, 1.596f, 1.1644f, -0.3918f, -0.813f, 1.1644f, 2.0172f, 0.0f};
 
+    template<bool fullRange>
     __device__ static void YUV2RGB(const uint* yuvi, float* red, float* green, float* blue)
     {
         float luma, chromaCb, chromaCr;
-
-        // Prepare for hue adjustment
-        luma     = (float)yuvi[0];
-        chromaCb = (float)((int)yuvi[1] - 512.0f);
-        chromaCr = (float)((int)yuvi[2] - 512.0f);
+        if (fullRange) {
+            luma = (float)(((int)yuvi[0] * 219.0f / 255.0f));
+            chromaCb = (float)(((int)yuvi[1] - 512.0f) * 224.0f / 255.0f);
+            chromaCr = (float)(((int)yuvi[2] - 512.0f) * 224.0f / 255.0f);
+        }
+        else {
+            luma = (float)((int)yuvi[0] - 64.0f);
+            chromaCb = (float)((int)yuvi[1] - 512.0f);
+            chromaCr = (float)((int)yuvi[2] - 512.0f);
+        }
 
        // Convert YUV To RGB with hue adjustment
        *red   = (luma     * constHueColorSpaceMat[0]) +
@@ -112,6 +118,7 @@ namespace
     #define COLOR_COMPONENT_BIT_SIZE 10
     #define COLOR_COMPONENT_MASK     0x3FF
 
+    template<bool fullRange>
     __global__ void NV12_to_BGRA(const uchar* srcImage, size_t nSourcePitch,
                                   uint* dstImage, size_t nDestPitch,
                                   uint width, uint height)
@@ -135,31 +142,11 @@ namespace
 
         const int y_chroma = y >> 1;
 
-        if (y & 1)  // odd scanline ?
-        {
-            uint chromaCb = srcImage[chromaOffset + y_chroma * nSourcePitch + x    ];
-            uint chromaCr = srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1];
-
-            if (y_chroma < ((height >> 1) - 1)) // interpolate chroma vertically
-            {
-                chromaCb = (chromaCb + srcImage[chromaOffset + (y_chroma + 1) * nSourcePitch + x    ] + 1) >> 1;
-                chromaCr = (chromaCr + srcImage[chromaOffset + (y_chroma + 1) * nSourcePitch + x + 1] + 1) >> 1;
-            }
+        yuv101010Pel[0] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x    ] << ( COLOR_COMPONENT_BIT_SIZE       + 2));
+        yuv101010Pel[0] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
 
-            yuv101010Pel[0] |= (chromaCb << ( COLOR_COMPONENT_BIT_SIZE       + 2));
-            yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
-
-            yuv101010Pel[1] |= (chromaCb << ( COLOR_COMPONENT_BIT_SIZE       + 2));
-            yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
-        }
-        else
-        {
-            yuv101010Pel[0] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x    ] << ( COLOR_COMPONENT_BIT_SIZE       + 2));
-            yuv101010Pel[0] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
-
-            yuv101010Pel[1] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x    ] << ( COLOR_COMPONENT_BIT_SIZE       + 2));
-            yuv101010Pel[1] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
-        }
+        yuv101010Pel[1] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x    ] << ( COLOR_COMPONENT_BIT_SIZE       + 2));
+        yuv101010Pel[1] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
 
         // this steps performs the color conversion
         uint yuvi[6];
@@ -174,8 +161,8 @@ namespace
         yuvi[5] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);
 
         // YUV to RGB Transformation conversion
-        YUV2RGB(&yuvi[0], &red[0], &green[0], &blue[0]);
-        YUV2RGB(&yuvi[3], &red[1], &green[1], &blue[1]);
+        YUV2RGB<fullRange>(&yuvi[0], &red[0], &green[0], &blue[0]);
+        YUV2RGB<fullRange>(&yuvi[3], &red[1], &green[1], &blue[1]);
 
         // Clamp the results to RGBA
 
@@ -186,13 +173,15 @@ namespace
     }
 }
 
-void nv12ToBgra(const GpuMat& decodedFrame, GpuMat& outFrame, int width, int height, cudaStream_t stream)
+void nv12ToBgra(const GpuMat& decodedFrame, GpuMat& outFrame, int width, int height, const bool videoFullRangeFlag, cudaStream_t stream)
 {
     outFrame.create(height, width, CV_8UC4);
     dim3 block(32, 8);
     dim3 grid(divUp(width, 2 * block.x), divUp(height, block.y));
-    NV12_to_BGRA<< <grid, block, 0, stream >> > (decodedFrame.ptr<uchar>(), decodedFrame.step,
-        outFrame.ptr<uint>(), outFrame.step, width, height);
+    if (videoFullRangeFlag)
+        NV12_to_BGRA<true> << <grid, block, 0, stream >> > (decodedFrame.ptr<uchar>(), decodedFrame.step, outFrame.ptr<uint>(), outFrame.step, width, height);
+    else
+        NV12_to_BGRA<false> << <grid, block, 0, stream >> > (decodedFrame.ptr<uchar>(), decodedFrame.step, outFrame.ptr<uint>(), outFrame.step, width, height);
     CV_CUDEV_SAFE_CALL(cudaGetLastError());
     if (stream == 0)
         CV_CUDEV_SAFE_CALL(cudaDeviceSynchronize());

diff --git a/modules/cudacodec/src/precomp.hpp b/modules/cudacodec/src/precomp.hpp
@@ -82,6 +82,7 @@
         #include "frame_queue.hpp"
         #include "video_decoder.hpp"
         #include "video_parser.hpp"
+        #include <opencv2/cudaarithm.hpp>
     #endif
     #if defined(HAVE_NVCUVENC)
         #include <fstream>

diff --git a/modules/cudacodec/src/video_parser.cpp b/modules/cudacodec/src/video_parser.cpp
@@ -115,6 +115,7 @@ int CUDAAPI cv::cudacodec::detail::VideoParser::HandleVideoSequence(void* userDa
         format->min_num_decode_surfaces != thiz->videoDecoder_->nDecodeSurfaces())
     {
         FormatInfo newFormat;
+        newFormat.videoFullRangeFlag = format->video_signal_description.video_full_range_flag;
         newFormat.codec = static_cast<Codec>(format->codec);
         newFormat.chromaFormat = static_cast<ChromaFormat>(format->chroma_format);
         newFormat.nBitDepthMinus8 = format->bit_depth_luma_minus8;

diff --git a/modules/cudacodec/src/video_reader.cpp b/modules/cudacodec/src/video_reader.cpp
@@ -53,27 +53,54 @@ Ptr<VideoReader> cv::cudacodec::createVideoReader(const Ptr<RawVideoSource>&, co
 
 #else // HAVE_NVCUVID
 
-void nv12ToBgra(const GpuMat& decodedFrame, GpuMat& outFrame, int width, int height, cudaStream_t stream);
+void nv12ToBgra(const GpuMat& decodedFrame, GpuMat& outFrame, int width, int height, const bool videoFullRangeFlag, cudaStream_t stream);
 bool ValidColorFormat(const ColorFormat colorFormat);
 
-void videoDecPostProcessFrame(const GpuMat& decodedFrame, GpuMat& outFrame, int width, int height, const ColorFormat colorFormat,
+void cvtFromNv12(const GpuMat& decodedFrame, GpuMat& outFrame, int width, int height, const ColorFormat colorFormat, const bool videoFullRangeFlag,
     Stream stream)
 {
+    CV_Assert(decodedFrame.cols == width && decodedFrame.rows == height * 1.5f);
     if (colorFormat == ColorFormat::BGRA) {
-        nv12ToBgra(decodedFrame, outFrame, width, height, StreamAccessor::getStream(stream));
+        nv12ToBgra(decodedFrame, outFrame, width, height, videoFullRangeFlag, StreamAccessor::getStream(stream));
     }
     else if (colorFormat == ColorFormat::BGR) {
         outFrame.create(height, width, CV_8UC3);
         Npp8u* pSrc[2] = { decodedFrame.data, &decodedFrame.data[decodedFrame.step * height] };
         NppiSize oSizeROI = { width,height };
+#if (CUDART_VERSION < 9200)
+        CV_Error(Error::StsUnsupportedFormat, "ColorFormat::BGR is not supported until CUDA 9.2, use default ColorFormat::BGRA.");
+#elif (CUDART_VERSION < 10100)
+        cv::cuda::NppStreamHandler h(stream);
+        if (videoFullRangeFlag)
+            nppSafeCall(nppiNV12ToBGR_709HDTV_8u_P2C3R(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI));
+        else {
+            CV_LOG_DEBUG(NULL, "Color reproduction may be inaccurate due CUDA version <= 11.0, for better results upgrade CUDA runtime or try ColorFormat::BGRA.");
+            nppSafeCall(nppiNV12ToBGR_8u_P2C3R(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI));
+        }
+#elif (CUDART_VERSION >= 10100)
         NppStreamContext nppStreamCtx;
         nppSafeCall(nppGetStreamContext(&nppStreamCtx));
         nppStreamCtx.hStream = StreamAccessor::getStream(stream);
-        nppSafeCall(nppiNV12ToBGR_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, nppStreamCtx));
+        if (videoFullRangeFlag)
+            nppSafeCall(nppiNV12ToBGR_709HDTV_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, nppStreamCtx));
+        else {
+#if (CUDART_VERSION < 11000)
+            CV_LOG_DEBUG(NULL, "Color reproduction may be inaccurate due CUDA version <= 11.0, for better results upgrade CUDA runtime or try ColorFormat::BGRA.");
+            nppSafeCall(nppiNV12ToBGR_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, nppStreamCtx));
+#else
+            nppSafeCall(nppiNV12ToBGR_709CSC_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, nppStreamCtx));
+#endif
+        }
+#endif
     }
     else if (colorFormat == ColorFormat::GRAY) {
         outFrame.create(height, width, CV_8UC1);
-        cudaMemcpy2DAsync(outFrame.ptr(), outFrame.step, decodedFrame.ptr(), decodedFrame.step, width, height, cudaMemcpyDeviceToDevice, StreamAccessor::getStream(stream));
+        if(videoFullRangeFlag)
+            cudaSafeCall(cudaMemcpy2DAsync(outFrame.ptr(), outFrame.step, decodedFrame.ptr(), decodedFrame.step, width, height, cudaMemcpyDeviceToDevice, StreamAccessor::getStream(stream)));
+        else {
+            cv::cuda::subtract(decodedFrame(Rect(0,0,width,height)), 16, outFrame, noArray(), CV_8U, stream);
+            cv::cuda::multiply(outFrame, 255.0f / 219.0f, outFrame, 1.0, CV_8U, stream);
+        }
     }
     else if (colorFormat == ColorFormat::NV_NV12) {
         decodedFrame.copyTo(outFrame, stream);
@@ -222,9 +249,7 @@ namespace
             // map decoded video frame to CUDA surface
             GpuMat decodedFrame = videoDecoder_->mapFrame(frameInfo.first.picture_index, frameInfo.second);
 
-            // perform post processing on the CUDA surface (performs colors space conversion and post processing)
-            // comment this out if we include the line of code seen above
-            videoDecPostProcessFrame(decodedFrame, frame, videoDecoder_->targetWidth(), videoDecoder_->targetHeight(), colorFormat, stream);
+            cvtFromNv12(decodedFrame, frame, videoDecoder_->targetWidth(), videoDecoder_->targetHeight(), colorFormat, videoDecoder_->format().videoFullRangeFlag, stream);
 
             // unmap video frame
             // unmapFrame() synchronizes with the VideoDecode API (ensures the frame has finished decoding)

diff --git a/modules/cudacodec/test/test_video.cpp b/modules/cudacodec/test/test_video.cpp
@@ -62,6 +62,11 @@ PARAM_TEST_CASE(Video, cv::cuda::DeviceInfo, std::string)
 {
 };
 
+typedef tuple<std::string, bool> color_conversion_params_t;
+PARAM_TEST_CASE(ColorConversion, cv::cuda::DeviceInfo, cv::cudacodec::ColorFormat, color_conversion_params_t)
+{
+};
+
 PARAM_TEST_CASE(VideoReadRaw, cv::cuda::DeviceInfo, std::string)
 {
 };
@@ -220,7 +225,7 @@ CUDA_TEST_P(Scaling, Reader)
     cv::cuda::resize(frameOr(srcRoiOut), frameGs, targetRoiOut.size(), 0, 0, INTER_AREA);
     // assert on mean absolute error due to different resize algorithms
     const double mae = cv::cuda::norm(frameGs, frame(targetRoiOut), NORM_L1)/frameGs.size().area();
-    ASSERT_LT(mae, 2.35);
+    ASSERT_LT(mae, 2.75);
 }
 
 CUDA_TEST_P(Video, Reader)
@@ -265,6 +270,33 @@ CUDA_TEST_P(Video, Reader)
     }
 }
 
+CUDA_TEST_P(ColorConversion, Reader)
+{
+    cv::cuda::setDevice(GET_PARAM(0).deviceID());
+    const cv::cudacodec::ColorFormat colorFormat = GET_PARAM(1);
+    const std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../" + get<0>(GET_PARAM(2));
+    const bool videoFullRangeFlag = get<1>(GET_PARAM(2));
+    cv::Ptr<cv::cudacodec::VideoReader> reader = cv::cudacodec::createVideoReader(inputFile);
+    reader->set(colorFormat);
+    cv::VideoCapture cap(inputFile);
+
+    cv::cuda::GpuMat frame;
+    Mat frameHost, frameHostGs, frameFromDevice;
+    for (int i = 0; i < 10; i++)
+    {
+        reader->nextFrame(frame);
+        frame.download(frameFromDevice);
+        cap.read(frameHost);
+        const cv::cudacodec::FormatInfo fmt = reader->format();
+        ASSERT_TRUE(fmt.valid && fmt.videoFullRangeFlag == videoFullRangeFlag);
+        if (colorFormat == cv::cudacodec::ColorFormat::BGRA)
+            cv::cvtColor(frameHost, frameHostGs, COLOR_BGR2BGRA);
+        else
+            frameHostGs = frameHost;
+        EXPECT_MAT_NEAR(frameHostGs, frameFromDevice, 2.0);
+    }
+}
+
 CUDA_TEST_P(VideoReadRaw, Reader)
 {
     cv::cuda::setDevice(GET_PARAM(0).deviceID());
@@ -672,6 +704,18 @@ INSTANTIATE_TEST_CASE_P(CUDA_Codec, Video, testing::Combine(
     ALL_DEVICES,
     testing::Values(VIDEO_SRC_R)));
 
+const color_conversion_params_t color_conversion_params[] =
+{
+    color_conversion_params_t("highgui/video/big_buck_bunny.h264", false),
+    color_conversion_params_t("highgui/video/big_buck_bunny_full_color_range.h264", true),
+};
+
+#define VIDEO_COLOR_OUTPUTS cv::cudacodec::ColorFormat::BGRA, cv::cudacodec::ColorFormat::BGRA
+INSTANTIATE_TEST_CASE_P(CUDA_Codec, ColorConversion, testing::Combine(
+    ALL_DEVICES,
+    testing::Values(VIDEO_COLOR_OUTPUTS),
+    testing::ValuesIn(color_conversion_params)));
+
 #define VIDEO_SRC_RW "highgui/video/big_buck_bunny.h264", "highgui/video/big_buck_bunny.h265"
 INSTANTIATE_TEST_CASE_P(CUDA_Codec, VideoReadRaw, testing::Combine(
     ALL_DEVICES,