Skip to content

cudacodec::VideoReader: fix nv12 to bgr/bgra/grey conversion #3468

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion modules/cudacodec/include/opencv2/cudacodec.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ enum DeinterlaceMode
struct CV_EXPORTS_W_SIMPLE FormatInfo
{
CV_WRAP FormatInfo() : nBitDepthMinus8(-1), ulWidth(0), ulHeight(0), width(0), height(0), ulMaxWidth(0), ulMaxHeight(0), valid(false),
fps(0), ulNumDecodeSurfaces(0) {};
fps(0), ulNumDecodeSurfaces(0), videoFullRangeFlag(false) {};

CV_PROP_RW Codec codec;
CV_PROP_RW ChromaFormat chromaFormat;
Expand All @@ -329,6 +329,7 @@ struct CV_EXPORTS_W_SIMPLE FormatInfo
CV_PROP_RW cv::Size targetSz;//!< Post-processed size of the output frame.
CV_PROP_RW cv::Rect srcRoi;//!< Region of interest decoded from video source.
CV_PROP_RW cv::Rect targetRoi;//!< Region of interest in the output frame containing the decoded frame.
CV_PROP_RW bool videoFullRangeFlag;//!< Output value indicating if the black level, luma and chroma of the source are represented using the full or limited range (AKA TV or "analogue" range) of values as defined in Annex E of the ITU-T Specification. Internally the conversion from NV12 to BGR obeys ITU 709.
};

/** @brief cv::cudacodec::VideoReader generic properties identifier.
Expand Down
57 changes: 23 additions & 34 deletions modules/cudacodec/src/cuda/nv12_to_rgb.cu
Original file line number Diff line number Diff line change
Expand Up @@ -66,14 +66,20 @@ namespace
{
__constant__ float constHueColorSpaceMat[9] = {1.1644f, 0.0f, 1.596f, 1.1644f, -0.3918f, -0.813f, 1.1644f, 2.0172f, 0.0f};

template<bool fullRange>
__device__ static void YUV2RGB(const uint* yuvi, float* red, float* green, float* blue)
{
float luma, chromaCb, chromaCr;

// Prepare for hue adjustment
luma = (float)yuvi[0];
chromaCb = (float)((int)yuvi[1] - 512.0f);
chromaCr = (float)((int)yuvi[2] - 512.0f);
if (fullRange) {
luma = (float)(((int)yuvi[0] * 219.0f / 255.0f));
chromaCb = (float)(((int)yuvi[1] - 512.0f) * 224.0f / 255.0f);
chromaCr = (float)(((int)yuvi[2] - 512.0f) * 224.0f / 255.0f);
}
else {
luma = (float)((int)yuvi[0] - 64.0f);
chromaCb = (float)((int)yuvi[1] - 512.0f);
chromaCr = (float)((int)yuvi[2] - 512.0f);
}

// Convert YUV To RGB with hue adjustment
*red = (luma * constHueColorSpaceMat[0]) +
Expand Down Expand Up @@ -112,6 +118,7 @@ namespace
#define COLOR_COMPONENT_BIT_SIZE 10
#define COLOR_COMPONENT_MASK 0x3FF

template<bool fullRange>
__global__ void NV12_to_BGRA(const uchar* srcImage, size_t nSourcePitch,
uint* dstImage, size_t nDestPitch,
uint width, uint height)
Expand All @@ -135,31 +142,11 @@ namespace

const int y_chroma = y >> 1;

if (y & 1) // odd scanline ?
{
uint chromaCb = srcImage[chromaOffset + y_chroma * nSourcePitch + x ];
uint chromaCr = srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1];

if (y_chroma < ((height >> 1) - 1)) // interpolate chroma vertically
{
chromaCb = (chromaCb + srcImage[chromaOffset + (y_chroma + 1) * nSourcePitch + x ] + 1) >> 1;
chromaCr = (chromaCr + srcImage[chromaOffset + (y_chroma + 1) * nSourcePitch + x + 1] + 1) >> 1;
}
yuv101010Pel[0] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x ] << ( COLOR_COMPONENT_BIT_SIZE + 2));
yuv101010Pel[0] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));

yuv101010Pel[0] |= (chromaCb << ( COLOR_COMPONENT_BIT_SIZE + 2));
yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));

yuv101010Pel[1] |= (chromaCb << ( COLOR_COMPONENT_BIT_SIZE + 2));
yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
}
else
{
yuv101010Pel[0] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x ] << ( COLOR_COMPONENT_BIT_SIZE + 2));
yuv101010Pel[0] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));

yuv101010Pel[1] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x ] << ( COLOR_COMPONENT_BIT_SIZE + 2));
yuv101010Pel[1] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
}
yuv101010Pel[1] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x ] << ( COLOR_COMPONENT_BIT_SIZE + 2));
yuv101010Pel[1] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));

// this steps performs the color conversion
uint yuvi[6];
Expand All @@ -174,8 +161,8 @@ namespace
yuvi[5] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);

// YUV to RGB Transformation conversion
YUV2RGB(&yuvi[0], &red[0], &green[0], &blue[0]);
YUV2RGB(&yuvi[3], &red[1], &green[1], &blue[1]);
YUV2RGB<fullRange>(&yuvi[0], &red[0], &green[0], &blue[0]);
YUV2RGB<fullRange>(&yuvi[3], &red[1], &green[1], &blue[1]);

// Clamp the results to RGBA

Expand All @@ -186,13 +173,15 @@ namespace
}
}

void nv12ToBgra(const GpuMat& decodedFrame, GpuMat& outFrame, int width, int height, cudaStream_t stream)
void nv12ToBgra(const GpuMat& decodedFrame, GpuMat& outFrame, int width, int height, const bool videoFullRangeFlag, cudaStream_t stream)
{
outFrame.create(height, width, CV_8UC4);
dim3 block(32, 8);
dim3 grid(divUp(width, 2 * block.x), divUp(height, block.y));
NV12_to_BGRA<< <grid, block, 0, stream >> > (decodedFrame.ptr<uchar>(), decodedFrame.step,
outFrame.ptr<uint>(), outFrame.step, width, height);
if (videoFullRangeFlag)
NV12_to_BGRA<true> << <grid, block, 0, stream >> > (decodedFrame.ptr<uchar>(), decodedFrame.step, outFrame.ptr<uint>(), outFrame.step, width, height);
else
NV12_to_BGRA<false> << <grid, block, 0, stream >> > (decodedFrame.ptr<uchar>(), decodedFrame.step, outFrame.ptr<uint>(), outFrame.step, width, height);
CV_CUDEV_SAFE_CALL(cudaGetLastError());
if (stream == 0)
CV_CUDEV_SAFE_CALL(cudaDeviceSynchronize());
Expand Down
1 change: 1 addition & 0 deletions modules/cudacodec/src/precomp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@
#include "frame_queue.hpp"
#include "video_decoder.hpp"
#include "video_parser.hpp"
#include <opencv2/cudaarithm.hpp>
#endif
#if defined(HAVE_NVCUVENC)
#include <fstream>
Expand Down
1 change: 1 addition & 0 deletions modules/cudacodec/src/video_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ int CUDAAPI cv::cudacodec::detail::VideoParser::HandleVideoSequence(void* userDa
format->min_num_decode_surfaces != thiz->videoDecoder_->nDecodeSurfaces())
{
FormatInfo newFormat;
newFormat.videoFullRangeFlag = format->video_signal_description.video_full_range_flag;
newFormat.codec = static_cast<Codec>(format->codec);
newFormat.chromaFormat = static_cast<ChromaFormat>(format->chroma_format);
newFormat.nBitDepthMinus8 = format->bit_depth_luma_minus8;
Expand Down
41 changes: 33 additions & 8 deletions modules/cudacodec/src/video_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,27 +53,54 @@ Ptr<VideoReader> cv::cudacodec::createVideoReader(const Ptr<RawVideoSource>&, co

#else // HAVE_NVCUVID

void nv12ToBgra(const GpuMat& decodedFrame, GpuMat& outFrame, int width, int height, cudaStream_t stream);
void nv12ToBgra(const GpuMat& decodedFrame, GpuMat& outFrame, int width, int height, const bool videoFullRangeFlag, cudaStream_t stream);
bool ValidColorFormat(const ColorFormat colorFormat);

void videoDecPostProcessFrame(const GpuMat& decodedFrame, GpuMat& outFrame, int width, int height, const ColorFormat colorFormat,
void cvtFromNv12(const GpuMat& decodedFrame, GpuMat& outFrame, int width, int height, const ColorFormat colorFormat, const bool videoFullRangeFlag,
Stream stream)
{
CV_Assert(decodedFrame.cols == width && decodedFrame.rows == height * 1.5f);
if (colorFormat == ColorFormat::BGRA) {
nv12ToBgra(decodedFrame, outFrame, width, height, StreamAccessor::getStream(stream));
nv12ToBgra(decodedFrame, outFrame, width, height, videoFullRangeFlag, StreamAccessor::getStream(stream));
}
else if (colorFormat == ColorFormat::BGR) {
outFrame.create(height, width, CV_8UC3);
Npp8u* pSrc[2] = { decodedFrame.data, &decodedFrame.data[decodedFrame.step * height] };
NppiSize oSizeROI = { width,height };
#if (CUDART_VERSION < 9200)
CV_Error(Error::StsUnsupportedFormat, "ColorFormat::BGR is not supported until CUDA 9.2, use default ColorFormat::BGRA.");
#elif (CUDART_VERSION < 10100)
cv::cuda::NppStreamHandler h(stream);
if (videoFullRangeFlag)
nppSafeCall(nppiNV12ToBGR_709HDTV_8u_P2C3R(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI));
else {
CV_LOG_DEBUG(NULL, "Color reproduction may be inaccurate due CUDA version <= 11.0, for better results upgrade CUDA runtime or try ColorFormat::BGRA.");
nppSafeCall(nppiNV12ToBGR_8u_P2C3R(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI));
}
#elif (CUDART_VERSION >= 10100)
NppStreamContext nppStreamCtx;
nppSafeCall(nppGetStreamContext(&nppStreamCtx));
nppStreamCtx.hStream = StreamAccessor::getStream(stream);
nppSafeCall(nppiNV12ToBGR_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, nppStreamCtx));
if (videoFullRangeFlag)
nppSafeCall(nppiNV12ToBGR_709HDTV_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, nppStreamCtx));
else {
#if (CUDART_VERSION < 11000)
CV_LOG_DEBUG(NULL, "Color reproduction may be inaccurate due CUDA version <= 11.0, for better results upgrade CUDA runtime or try ColorFormat::BGRA.");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I propose to make the check in constructor, and use LOG_INFO to not generate spam. M.b. could you also add own conversion kernel for BGR in the same way as BGRA?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I propose to make the check in constructor, and use LOG_INFO to not generate spam.

Unfortunately that information isn't available until a frame has been run through the decoder. This is something I've wanted to change for a while and was going to add in another PR. Should I add that change to this PR or simply remove the debug output which is not really necessary?

M.b. could you also add own conversion kernel for BGR in the same way as BGRA?

Actually I would prefer to remove the BGRA kernel. Much as I am not a fan of the NPP libs (buggy, performance oscillates depending on the CUDA version etc.), they should be the best option for performance on future hardware + they should get more user testing than the OpenCV CUDA modules.

nppSafeCall(nppiNV12ToBGR_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, nppStreamCtx));
#else
nppSafeCall(nppiNV12ToBGR_709CSC_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, nppStreamCtx));
#endif
}
#endif
}
else if (colorFormat == ColorFormat::GRAY) {
outFrame.create(height, width, CV_8UC1);
cudaMemcpy2DAsync(outFrame.ptr(), outFrame.step, decodedFrame.ptr(), decodedFrame.step, width, height, cudaMemcpyDeviceToDevice, StreamAccessor::getStream(stream));
if(videoFullRangeFlag)
cudaSafeCall(cudaMemcpy2DAsync(outFrame.ptr(), outFrame.step, decodedFrame.ptr(), decodedFrame.step, width, height, cudaMemcpyDeviceToDevice, StreamAccessor::getStream(stream)));
else {
cv::cuda::subtract(decodedFrame(Rect(0,0,width,height)), 16, outFrame, noArray(), CV_8U, stream);
cv::cuda::multiply(outFrame, 255.0f / 219.0f, outFrame, 1.0, CV_8U, stream);
}
}
else if (colorFormat == ColorFormat::NV_NV12) {
decodedFrame.copyTo(outFrame, stream);
Expand Down Expand Up @@ -222,9 +249,7 @@ namespace
// map decoded video frame to CUDA surface
GpuMat decodedFrame = videoDecoder_->mapFrame(frameInfo.first.picture_index, frameInfo.second);

// perform post processing on the CUDA surface (performs colors space conversion and post processing)
// comment this out if we include the line of code seen above
videoDecPostProcessFrame(decodedFrame, frame, videoDecoder_->targetWidth(), videoDecoder_->targetHeight(), colorFormat, stream);
cvtFromNv12(decodedFrame, frame, videoDecoder_->targetWidth(), videoDecoder_->targetHeight(), colorFormat, videoDecoder_->format().videoFullRangeFlag, stream);

// unmap video frame
// unmapFrame() synchronizes with the VideoDecode API (ensures the frame has finished decoding)
Expand Down
46 changes: 45 additions & 1 deletion modules/cudacodec/test/test_video.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,11 @@ PARAM_TEST_CASE(Video, cv::cuda::DeviceInfo, std::string)
{
};

typedef tuple<std::string, bool> color_conversion_params_t;
PARAM_TEST_CASE(ColorConversion, cv::cuda::DeviceInfo, cv::cudacodec::ColorFormat, color_conversion_params_t)
{
};

PARAM_TEST_CASE(VideoReadRaw, cv::cuda::DeviceInfo, std::string)
{
};
Expand Down Expand Up @@ -220,7 +225,7 @@ CUDA_TEST_P(Scaling, Reader)
cv::cuda::resize(frameOr(srcRoiOut), frameGs, targetRoiOut.size(), 0, 0, INTER_AREA);
// assert on mean absolute error due to different resize algorithms
const double mae = cv::cuda::norm(frameGs, frame(targetRoiOut), NORM_L1)/frameGs.size().area();
ASSERT_LT(mae, 2.35);
ASSERT_LT(mae, 2.75);
}

CUDA_TEST_P(Video, Reader)
Expand Down Expand Up @@ -265,6 +270,33 @@ CUDA_TEST_P(Video, Reader)
}
}

CUDA_TEST_P(ColorConversion, Reader)
{
cv::cuda::setDevice(GET_PARAM(0).deviceID());
const cv::cudacodec::ColorFormat colorFormat = GET_PARAM(1);
const std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../" + get<0>(GET_PARAM(2));
const bool videoFullRangeFlag = get<1>(GET_PARAM(2));
cv::Ptr<cv::cudacodec::VideoReader> reader = cv::cudacodec::createVideoReader(inputFile);
reader->set(colorFormat);
cv::VideoCapture cap(inputFile);

cv::cuda::GpuMat frame;
Mat frameHost, frameHostGs, frameFromDevice;
for (int i = 0; i < 10; i++)
{
reader->nextFrame(frame);
frame.download(frameFromDevice);
cap.read(frameHost);
const cv::cudacodec::FormatInfo fmt = reader->format();
ASSERT_TRUE(fmt.valid && fmt.videoFullRangeFlag == videoFullRangeFlag);
if (colorFormat == cv::cudacodec::ColorFormat::BGRA)
cv::cvtColor(frameHost, frameHostGs, COLOR_BGR2BGRA);
else
frameHostGs = frameHost;
EXPECT_MAT_NEAR(frameHostGs, frameFromDevice, 2.0);
}
}

CUDA_TEST_P(VideoReadRaw, Reader)
{
cv::cuda::setDevice(GET_PARAM(0).deviceID());
Expand Down Expand Up @@ -672,6 +704,18 @@ INSTANTIATE_TEST_CASE_P(CUDA_Codec, Video, testing::Combine(
ALL_DEVICES,
testing::Values(VIDEO_SRC_R)));

const color_conversion_params_t color_conversion_params[] =
{
color_conversion_params_t("highgui/video/big_buck_bunny.h264", false),
color_conversion_params_t("highgui/video/big_buck_bunny_full_color_range.h264", true),
};

#define VIDEO_COLOR_OUTPUTS cv::cudacodec::ColorFormat::BGRA, cv::cudacodec::ColorFormat::BGRA
INSTANTIATE_TEST_CASE_P(CUDA_Codec, ColorConversion, testing::Combine(
ALL_DEVICES,
testing::Values(VIDEO_COLOR_OUTPUTS),
testing::ValuesIn(color_conversion_params)));

#define VIDEO_SRC_RW "highgui/video/big_buck_bunny.h264", "highgui/video/big_buck_bunny.h265"
INSTANTIATE_TEST_CASE_P(CUDA_Codec, VideoReadRaw, testing::Combine(
ALL_DEVICES,
Expand Down