Added CUDA DXT compression

This commit is contained in:
Martin Pulec
2013-08-08 14:15:51 +02:00
parent ac04546855
commit 4762ae60ba
9 changed files with 476 additions and 2 deletions

View File

@@ -602,6 +602,10 @@ libavcodec: @LIBAVCODEC_DECOMPRESS_LIB_TARGET@ @LIBAVCODEC_COMPRESS_LIB_TARGET@
mkdir -p lib/ultragrid
$(LINKER) $(LDFLAGS) -shared -Wl,-soname,vcompress_jpeg.so.@video_compress_abi_version@ $^ @JPEG_LIB@ -o $@
@CUDA_DXT_COMPRESS_LIB_TARGET@: @CUDA_DXT_OBJ@
mkdir -p lib/ultragrid
$(LINKER) $(LDFLAGS) -shared -Wl,-soname,vcompress_cuda_dxt.so.@video_compress_abi_version@ $^ @CUDA_DXT_LIB@ -o $@
@RTDXT_DECOMPRESS_LIB_TARGET@: @GL_COMMON_OBJ@ @X_OBJ@ @RTDXT_COMMON_OBJ@ @RTDXT_DECOMPRESS_OBJ@ @RTDXT_COMMON_HEADERS@
mkdir -p lib/ultragrid
$(LINKER) $(LDFLAGS) -shared -Wl,-soname,vdecompress_rtdxt.so.@video_decompress_abi_version@ @GL_COMMON_OBJ@ @X_OBJ@ @RTDXT_COMMON_OBJ@ @RTDXT_DECOMPRESS_OBJ@ @RTDXT_LIB@ -o $@

View File

@@ -1962,6 +1962,50 @@ AC_SUBST(JPEG_DECOMPRESS_OBJ)
AC_SUBST(JPEG_INC)
AC_SUBST(JPEG_LIB)
# -------------------------------------------------------------------------------------------------
# CUDA DXT
# -------------------------------------------------------------------------------------------------
CUDA_DXT_OBJ=
CUDA_DXT_INC=
CUDA_DXT_LIB=
cuda_dxt=no
AC_ARG_ENABLE(cuda-dxt,
AS_HELP_STRING([--disable-cuda-dxt], [disable CUDA DXT compression (auto)]),
[cuda_dxt_req=$enableval],
[cuda_dxt_req=auto])
LIBS=$SAVED_LIBS
if test "$jpeg_req" != no -a $HAVE_CUDA = yes
then
cuda_dxt=yes
CUDA_DXT_LIB=" $CUDA_LIB"
CUDA_DXT_INC=$CUDA_INC
CUDA_DXT_OBJ="src/video_compress/cuda_dxt.o cuda_dxt/cuda_dxt.cu.o"
AC_DEFINE([HAVE_CUDA_DXT], [1], [Build with CUDA DXT support])
AC_SUBST(CUDA_DXT_COMPRESS_LIB_TARGET, "lib/ultragrid/vcompress_cuda_dxt.so.$video_compress_abi_version")
LIB_TARGETS="$LIB_TARGETS $CUDA_DXT_COMPRESS_LIB_TARGET"
LIB_OBJS="$LIB_OBJS src/video_compress/cuda_dxt.o"
if test -z "$included_shared_cuda_dxt_cu"; then
LIB_OBJS="$LIB_OBJS cuda_dxt/cuda_dxt.cu.o"
included_shared_cuda_dxt_cu=yes
fi
DEFINE_CUDA
fi
if test $cuda_dxt_req = yes -a $cuda_dxt = no; then
AC_MSG_ERROR([CUDA DXT not found]);
fi
LIB_MODULES="$LIB_MODULES $CUDA_DXT_LIB"
AC_SUBST(CUDA_DXT_OBJ)
AC_SUBST(CUDA_DXT_INC)
AC_SUBST(CUDA_DXT_LIB)
# -------------------------------------------------------------------------------------------------
# JPEG to DXT plugin
# -------------------------------------------------------------------------------------------------
@@ -1973,14 +2017,19 @@ AC_ARG_ENABLE(jpeg_to_dxt,
if test $jpeg_env_ok = yes -a $jpeg_to_dxt_req != no
then
jpeg_to_dxt=yes
JPEG_TO_DXT_INC=" $CUDA_INC"
JPEG_TO_DXT_LIB=" $CUDA_LIB -lgpujpeg"
JPEG_TO_DXT_OBJ="src/video_decompress/jpeg_to_dxt.o cuda_dxt/cuda_dxt.cu.o"
AC_SUBST(JPEG_TO_DXT_DECOMPRESS_LIB_TARGET, "lib/ultragrid/vdecompress_jpeg_to_dxt.so.$video_decompress_abi_version")
LIB_TARGETS="$LIB_TARGETS $JPEG_TO_DXT_DECOMPRESS_LIB_TARGET"
LIB_OBJS="$LIB_OBJS $JPEG_TO_DXT_OBJ"
LIB_OBJS="$LIB_OBJS src/video_decompress/jpeg_to_dxt.o"
AC_DEFINE([HAVE_JPEG_TO_DXT], [1], [Build with JPEG to DXT transcode support])
jpeg_to_dxt=yes
if test -z "$included_shared_cuda_dxt_cu"; then
LIB_OBJS="$LIB_OBJS cuda_dxt/cuda_dxt.cu.o"
included_shared_cuda_dxt_cu=yes
fi
DEFINE_CUDA
fi
if test $jpeg_to_dxt = no -a $jpeg_to_dxt_req = yes

View File

@@ -693,6 +693,42 @@ __global__ static void dxt_kernel(const void * src, void * out, int size_x, int
dxt_encode<DXT_TYPE>(out, block_idx, r, g, b);
}
__global__ static void yuv422_to_yuv444_kernel(const void * src, void * out, int pix_count) {
// coordinates of this thread
const int block_idx_x = threadIdx.x + blockIdx.x * blockDim.x;
// skip if out of bounds
if(block_idx_x >= pix_count / 2) {
return;
}
uchar4 *this_src = ((uchar4 *) src) + block_idx_x * 2;
uchar4 *this_out = ((uchar4 *) out) + block_idx_x * 3;
uchar4 pix12 = this_src[0];
uchar4 pix34 = this_src[1];
uchar4 out_pix[3];
out_pix[0].x = pix12.y;
out_pix[0].y = pix12.x;
out_pix[0].z = pix12.z;
out_pix[0].w = pix12.w;
out_pix[1].x = pix12.x;
out_pix[1].y = pix12.z;
out_pix[1].z = pix34.y;
out_pix[1].w = pix34.x;
out_pix[2].x = pix34.z;
out_pix[2].z = pix34.w;
out_pix[2].w = pix34.x;
out_pix[2].x = pix34.z;
this_out[0] = out_pix[0];
this_out[1] = out_pix[1];
this_out[2] = out_pix[2];
}
/// Compute grid size and launch DXT kernel.
template <bool YUV_TO_RGB, int DXT_TYPE>
@@ -722,6 +758,14 @@ static int dxt_launch(const void * src, void * out, int sx, int sy, cudaStream_t
return cudaSuccess != cudaStreamSynchronize(str) ? -3 : 0;
}
int cuda_yuv422_to_yuv444(const void * src, void * out, int pix_count, cudaStream_t str) {
// grid and threadblock sizes
const dim3 tsiz(64, 1);
int thread_count = pix_count / 4; // we process block of 4 pixels
const dim3 gsiz((thread_count + tsiz.x - 1) / tsiz.x, 1);
yuv422_to_yuv444_kernel<<<gsiz, tsiz, 0, str>>>(src, out, pix_count);
return cudaSuccess != cudaStreamSynchronize(str) ? -3 : 0;
}
/// CUDA DXT1 compression (only RGB without alpha).
/// @param src Pointer to top-left source pixel in device-memory buffer.
@@ -768,3 +812,8 @@ int cuda_yuv_to_dxt1(const void * src, void * out, int size_x, int size_y, cudaS
int cuda_rgb_to_dxt6(const void * src, void * out, int size_x, int size_y, cudaStream_t stream) {
return dxt_launch<false, 6>(src, out, size_x, size_y, stream);
}
int cuda_yuv_to_dxt6(const void * src, void * out, int size_x, int size_y, cudaStream_t stream) {
return dxt_launch<true, 6>(src, out, size_x, size_y, stream);
}

View File

@@ -84,6 +84,8 @@ int cuda_rgb_to_dxt6
cudaStream_t stream
);
int cuda_yuv_to_dxt6(const void * src, void * out, int size_x, int size_y, cudaStream_t stream);
int cuda_yuv422_to_yuv444(const void * src, void * out, int pix_count, cudaStream_t str);
#ifdef __cplusplus
} /* end of extern "C" */

View File

@@ -1100,3 +1100,45 @@ int codec_is_a_rgb(codec_t codec)
}
return 0;
}
/**
* Returns line decoder for specifiedn input and output codec.
*/
bool get_decoder_from_to(codec_t in, codec_t out, decoder_t *decoder)
{
struct item {
decoder_t decoder;
codec_t in;
codec_t out;
};
struct item decoders[] = {
{ (decoder_t) vc_copylineDVS10, DVS10, UYVY },
{ (decoder_t) vc_copylinev210, v210, UYVY },
{ (decoder_t) vc_copylineYUYV, YUYV, UYVY },
{ (decoder_t) vc_copyliner10k, R10k, RGBA },
//{ vc_copylineRGBA, RGBA, RGBA },
{ (decoder_t) vc_copylineDVS10toV210, DVS10, v210 },
{ (decoder_t) vc_copylineRGBAtoRGB, RGBA, RGB },
{ (decoder_t) vc_copylineRGBtoRGBA, RGB, RGBA },
// following is disabled - shouldn't be senected automatically
//{ (decoder_t) vc_copylineRGBtoUYVY, RGB, UYVY },
//{ (decoder_t) vc_copylineUYVYtoRGB, UYVY, RGB },
//{ vc_copylineBGRtoUYVY, BGR, UYVY },
//{ vc_copylineRGBAtoUYVY, RGBA, UYVY },
{ (decoder_t) vc_copylineBGRtoRGB, BGR, RGB },
{ (decoder_t) vc_copylineDPX10toRGBA, DPX10, RGBA },
{ (decoder_t) vc_copylineDPX10toRGB, DPX10, RGB },
//{ vc_copylineRGB, RGB, RGB }.
};
for (unsigned int i = 0; i < sizeof(decoders)/sizeof(struct item); ++i) {
if (decoders[i].in == in && decoders[i].out == out) {
*decoder = decoders[i].decoder;
return true;
}
}
return false;
}

View File

@@ -139,6 +139,8 @@ void vc_copylineDPX10toRGB(unsigned char *dst, const unsigned char *src, int dst
void vc_copylineRGB(unsigned char *dst, const unsigned char *src, int dst_len,
int rshift, int gshift, int bshift);
bool get_decoder_from_to(codec_t in, codec_t out, decoder_t *decoder);
int codec_is_a_rgb(codec_t codec);
#ifdef __cplusplus

View File

@@ -49,6 +49,7 @@
#include "module.h"
#include "video.h"
#include "video_compress.h"
#include "video_compress/cuda_dxt.h"
#include "video_compress/dxt_glsl.h"
#include "video_compress/fastdxt.h"
#include "video_compress/libavcodec.h"
@@ -184,6 +185,16 @@ struct compress_t compress_modules[] = {
MK_NAME(libavcodec_compress_tile),
NULL
},
#endif
#if defined HAVE_CUDA_DXT || defined BUILD_LIBRARIES
{
"cuda_dxt",
"cuda_dxt",
MK_NAME(cuda_dxt_compress_init),
MK_NAME(NULL),
MK_NAME(cuda_dxt_compress_tile),
NULL
},
#endif
{
"none",

View File

@@ -0,0 +1,260 @@
/**
* @file video_compress/cuda_dxt.cpp
* @author Martin Pulec <pulec@cesnet.cz>
*/
/*
* Copyright (c) 2012-2013 CESNET z.s.p.o.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, is permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the name of CESNET nor the names of its contributors may be
* used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING,
* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
* EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#include "config_unix.h"
#include "config_win32.h"
#endif // HAVE_CONFIG_H
#include "video_compress/cuda_dxt.h"
#include "cuda_dxt/cuda_dxt.h"
#include "host.h"
#include "module.h"
#include "video.h"
struct state_video_compress_cuda_dxt {
state_video_compress_cuda_dxt() {
memset(&saved_desc, 0, sizeof(saved_desc));
out[0] = out[1] = NULL;
in_buffer = NULL;
cuda_in_buffer = NULL;
cuda_uyvy_buffer = NULL;
}
struct module module_data;
struct video_desc saved_desc;
char *in_buffer; ///< for decoded data
char *cuda_uyvy_buffer; ///< same as in_buffer but in device memory
char *cuda_in_buffer; ///< same as in_buffer but in device memory
struct tile *out[2];
codec_t in_codec;
codec_t out_codec;
decoder_t decoder;
};
static void cuda_dxt_compress_done(struct module *mod);
struct module *cuda_dxt_compress_init(struct module *parent, char *fmt)
{
state_video_compress_cuda_dxt *s =
new state_video_compress_cuda_dxt;
s->out_codec = DXT1;
if (fmt && fmt[0] != '\0') {
if (strcasecmp(fmt, "DXT5") == 0) {
s->out_codec = DXT5;
} else if (strcasecmp(fmt, "DXT1") == 0) {
s->out_codec = DXT1;
} else {
printf("usage:\n"
"\t-c cuda_dxt[:DXT1|:DXT5]\n");
return NULL;
}
}
module_init_default(&s->module_data);
s->module_data.cls = MODULE_CLASS_DATA;
s->module_data.priv_data = s;
s->module_data.deleter = cuda_dxt_compress_done;
module_register(&s->module_data, parent);
return &s->module_data;
}
static void cleanup(struct state_video_compress_cuda_dxt *s)
{
if (s->in_buffer) {
free(s->in_buffer);
s->in_buffer = NULL;
}
if (s->cuda_uyvy_buffer) {
cudaFree(s->cuda_uyvy_buffer);
s->cuda_uyvy_buffer = NULL;
}
if (s->cuda_in_buffer) {
cudaFree(s->cuda_in_buffer);
s->cuda_in_buffer = NULL;
}
for (int i = 0; i < 2; ++i) {
if (s->out[i] != NULL) {
cudaFree(s->out[i]->data);
s->out[i]->data = NULL;
}
}
}
static bool configure_with(struct state_video_compress_cuda_dxt *s, struct video_desc desc)
{
cleanup(s);
if (desc.color_spec == RGB || desc.color_spec == UYVY) {
s->in_codec = desc.color_spec;
} else if (get_decoder_from_to(desc.color_spec, RGB, &s->decoder)) {
s->in_codec = RGB;
} else if (get_decoder_from_to(desc.color_spec, UYVY, &s->decoder)) {
s->in_codec = UYVY;
} else {
fprintf(stderr, "Unsupported codec: %s\n", get_codec_name(desc.color_spec));
return false;
}
if (s->in_codec == UYVY) {
if (cudaSuccess != cudaMalloc((void **) &s->cuda_uyvy_buffer,
desc.width * desc.height * 2)) {
fprintf(stderr, "Could not allocate CUDA UYVY buffer.\n");
return false;
}
}
s->in_buffer = (char *) malloc(desc.width * desc.height * 3);
if (cudaSuccess != cudaMalloc((void **) &s->cuda_in_buffer,
desc.width * desc.height * 3)) {
fprintf(stderr, "Could not allocate CUDA output buffer.\n");
return false;
}
for (int i = 0; i < 2; ++i) {
struct video_desc compressed_desc = desc;
compressed_desc.color_spec = s->out_codec;
s->out[i] = tile_alloc_desc(compressed_desc);
s->out[i]->data_len = desc.width * desc.height / (s->out_codec == DXT1 ? 2 : 1);
if (cudaSuccess != cudaMallocHost((void **) &s->out[i]->data,
s->out[i]->data_len)) {
fprintf(stderr, "Could not allocate CUDA output buffer.\n");
return false;
}
}
return true;
}
struct tile *cuda_dxt_compress_tile(struct module *mod, struct tile *tx, struct video_desc *desc,
int buffer)
{
struct state_video_compress_cuda_dxt *s =
(struct state_video_compress_cuda_dxt *) mod->priv_data;
cudaSetDevice(cuda_devices[0]);
if (!video_desc_eq(*desc, s->saved_desc)) {
if(configure_with(s, *desc)) {
s->saved_desc = *desc;
} else {
fprintf(stderr, "[CUDA DXT] Reconfiguration failed!\n");
return NULL;
}
}
char *in_buffer;
if (desc->color_spec == s->in_codec) {
in_buffer = tx->data;
} else {
unsigned char *line1 = (unsigned char *) tx->data;
unsigned char *line2 = (unsigned char *) s->in_buffer;
for (int i = 0; i < (int) tx->height; ++i) {
s->decoder(line2, line1, vc_get_linesize(tx->width, s->in_codec),
0, 8, 16);
line1 += vc_get_linesize(tx->width, desc->color_spec);
line2 += vc_get_linesize(tx->width, s->in_codec);
}
in_buffer = s->in_buffer;
}
if (s->in_codec == UYVY) {
if (cudaMemcpy(s->cuda_uyvy_buffer, in_buffer, desc->width * desc->height * 2,
cudaMemcpyHostToDevice) != cudaSuccess) {
fprintf(stderr, "Memcpy failed: %s\n",
cudaGetErrorString(cudaGetLastError()));
return NULL;
}
if (cuda_yuv422_to_yuv444(s->cuda_uyvy_buffer, s->cuda_in_buffer,
desc->width * desc->height, 0) != 0) {
fprintf(stderr, "UYVY kernel failed: %s\n",
cudaGetErrorString(cudaGetLastError()));
}
} else {
if (cudaMemcpy(s->cuda_in_buffer, in_buffer, desc->width * desc->height * 3,
cudaMemcpyHostToDevice) != cudaSuccess) {
fprintf(stderr, "Memcpy failed: %s\n",
cudaGetErrorString(cudaGetLastError()));
return NULL;
}
}
int (*cuda_dxt_enc_func)(const void * src, void * out, int size_x, int size_y, cudaStream_t stream);
if (s->out_codec == DXT1) {
if (s->in_codec == RGB) {
cuda_dxt_enc_func = cuda_rgb_to_dxt1;
} else {
cuda_dxt_enc_func = cuda_yuv_to_dxt1;
}
} else {
if (s->in_codec == RGB) {
cuda_dxt_enc_func = cuda_rgb_to_dxt6;
} else {
cuda_dxt_enc_func = cuda_yuv_to_dxt6;
}
}
int ret = cuda_dxt_enc_func(s->cuda_in_buffer, s->out[buffer]->data, s->saved_desc.width,
s->saved_desc.height, 0);
if (ret != 0) {
fprintf(stderr, "Encoding failed: %s\n",
cudaGetErrorString(cudaGetLastError()));
return NULL;
}
desc->color_spec = s->out_codec;
return s->out[buffer];
}
static void cuda_dxt_compress_done(struct module *mod)
{
struct state_video_compress_cuda_dxt *s =
(struct state_video_compress_cuda_dxt *) mod->priv_data;
cleanup(s);
delete s;
}

View File

@@ -0,0 +1,55 @@
/**
* @file video_compress/cuda_dxt.h
* @author Martin Pulec <pulec@cesnet.cz>
*/
/*
* Copyright (c) 2013 CESNET z.s.p.o.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, is permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the name of CESNET nor the names of its contributors may be
* used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING,
* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
* EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "types.h"
#define JPEG_TO_DXT_MAGIC 0x20BF0088
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
struct module;
struct module *cuda_dxt_compress_init(struct module *parent, char * opts);
struct tile *cuda_dxt_compress_tile(struct module *mod, struct tile *tx, struct video_desc *desc,
int buffer);
#ifdef __cplusplus
}
#endif // __cplusplus