diff --git a/Makefile.in b/Makefile.in index f4ad73ba6..d50540bf7 100644 --- a/Makefile.in +++ b/Makefile.in @@ -602,6 +602,10 @@ libavcodec: @LIBAVCODEC_DECOMPRESS_LIB_TARGET@ @LIBAVCODEC_COMPRESS_LIB_TARGET@ mkdir -p lib/ultragrid $(LINKER) $(LDFLAGS) -shared -Wl,-soname,vcompress_jpeg.so.@video_compress_abi_version@ $^ @JPEG_LIB@ -o $@ +@CUDA_DXT_COMPRESS_LIB_TARGET@: @CUDA_DXT_OBJ@ + mkdir -p lib/ultragrid + $(LINKER) $(LDFLAGS) -shared -Wl,-soname,vcompress_cuda_dxt.so.@video_compress_abi_version@ $^ @CUDA_DXT_LIB@ -o $@ + @RTDXT_DECOMPRESS_LIB_TARGET@: @GL_COMMON_OBJ@ @X_OBJ@ @RTDXT_COMMON_OBJ@ @RTDXT_DECOMPRESS_OBJ@ @RTDXT_COMMON_HEADERS@ mkdir -p lib/ultragrid $(LINKER) $(LDFLAGS) -shared -Wl,-soname,vdecompress_rtdxt.so.@video_decompress_abi_version@ @GL_COMMON_OBJ@ @X_OBJ@ @RTDXT_COMMON_OBJ@ @RTDXT_DECOMPRESS_OBJ@ @RTDXT_LIB@ -o $@ diff --git a/configure.ac b/configure.ac index 112c19867..2e7a94a38 100644 --- a/configure.ac +++ b/configure.ac @@ -1962,6 +1962,50 @@ AC_SUBST(JPEG_DECOMPRESS_OBJ) AC_SUBST(JPEG_INC) AC_SUBST(JPEG_LIB) +# ------------------------------------------------------------------------------------------------- +# CUDA DXT +# ------------------------------------------------------------------------------------------------- +CUDA_DXT_OBJ= +CUDA_DXT_INC= +CUDA_DXT_LIB= + +cuda_dxt=no + +AC_ARG_ENABLE(cuda-dxt, + AS_HELP_STRING([--disable-cuda-dxt], [disable CUDA DXT compression (auto)]), + [cuda_dxt_req=$enableval], + [cuda_dxt_req=auto]) + +LIBS=$SAVED_LIBS + +if test "$jpeg_req" != no -a $HAVE_CUDA = yes +then + cuda_dxt=yes + + CUDA_DXT_LIB=" $CUDA_LIB" + CUDA_DXT_INC=$CUDA_INC + CUDA_DXT_OBJ="src/video_compress/cuda_dxt.o cuda_dxt/cuda_dxt.cu.o" + AC_DEFINE([HAVE_CUDA_DXT], [1], [Build with CUDA DXT support]) + AC_SUBST(CUDA_DXT_COMPRESS_LIB_TARGET, "lib/ultragrid/vcompress_cuda_dxt.so.$video_compress_abi_version") + LIB_TARGETS="$LIB_TARGETS $CUDA_DXT_COMPRESS_LIB_TARGET" + LIB_OBJS="$LIB_OBJS src/video_compress/cuda_dxt.o" + if test -z "$included_shared_cuda_dxt_cu"; then + LIB_OBJS="$LIB_OBJS cuda_dxt/cuda_dxt.cu.o" + included_shared_cuda_dxt_cu=yes + fi + DEFINE_CUDA +fi + +if test $cuda_dxt_req = yes -a $cuda_dxt = no; then + AC_MSG_ERROR([CUDA DXT not found]); +fi + +LIB_MODULES="$LIB_MODULES $CUDA_DXT_LIB" + +AC_SUBST(CUDA_DXT_OBJ) +AC_SUBST(CUDA_DXT_INC) +AC_SUBST(CUDA_DXT_LIB) + # ------------------------------------------------------------------------------------------------- # JPEG to DXT plugin # ------------------------------------------------------------------------------------------------- @@ -1973,14 +2017,19 @@ AC_ARG_ENABLE(jpeg_to_dxt, if test $jpeg_env_ok = yes -a $jpeg_to_dxt_req != no then + jpeg_to_dxt=yes JPEG_TO_DXT_INC=" $CUDA_INC" JPEG_TO_DXT_LIB=" $CUDA_LIB -lgpujpeg" JPEG_TO_DXT_OBJ="src/video_decompress/jpeg_to_dxt.o cuda_dxt/cuda_dxt.cu.o" AC_SUBST(JPEG_TO_DXT_DECOMPRESS_LIB_TARGET, "lib/ultragrid/vdecompress_jpeg_to_dxt.so.$video_decompress_abi_version") LIB_TARGETS="$LIB_TARGETS $JPEG_TO_DXT_DECOMPRESS_LIB_TARGET" - LIB_OBJS="$LIB_OBJS $JPEG_TO_DXT_OBJ" + LIB_OBJS="$LIB_OBJS src/video_decompress/jpeg_to_dxt.o" AC_DEFINE([HAVE_JPEG_TO_DXT], [1], [Build with JPEG to DXT transcode support]) - jpeg_to_dxt=yes + if test -z "$included_shared_cuda_dxt_cu"; then + LIB_OBJS="$LIB_OBJS cuda_dxt/cuda_dxt.cu.o" + included_shared_cuda_dxt_cu=yes + fi + DEFINE_CUDA fi if test $jpeg_to_dxt = no -a $jpeg_to_dxt_req = yes diff --git a/cuda_dxt/cuda_dxt.cu b/cuda_dxt/cuda_dxt.cu index 109aad231..a4d5d7c97 100644 --- a/cuda_dxt/cuda_dxt.cu +++ b/cuda_dxt/cuda_dxt.cu @@ -693,6 +693,42 @@ __global__ static void dxt_kernel(const void * src, void * out, int size_x, int dxt_encode(out, block_idx, r, g, b); } +__global__ static void yuv422_to_yuv444_kernel(const void * src, void * out, int pix_count) { + // coordinates of this thread + const int block_idx_x = threadIdx.x + blockIdx.x * blockDim.x; + + // skip if out of bounds + if(block_idx_x >= pix_count / 2) { + return; + } + + uchar4 *this_src = ((uchar4 *) src) + block_idx_x * 2; + uchar4 *this_out = ((uchar4 *) out) + block_idx_x * 3; + + uchar4 pix12 = this_src[0]; + uchar4 pix34 = this_src[1]; + + uchar4 out_pix[3]; + out_pix[0].x = pix12.y; + out_pix[0].y = pix12.x; + out_pix[0].z = pix12.z; + + out_pix[0].w = pix12.w; + out_pix[1].x = pix12.x; + out_pix[1].y = pix12.z; + + out_pix[1].z = pix34.y; + out_pix[1].w = pix34.x; + out_pix[2].x = pix34.z; + + out_pix[2].z = pix34.w; + out_pix[2].w = pix34.x; + out_pix[2].x = pix34.z; + + this_out[0] = out_pix[0]; + this_out[1] = out_pix[1]; + this_out[2] = out_pix[2]; +} /// Compute grid size and launch DXT kernel. template @@ -722,6 +758,14 @@ static int dxt_launch(const void * src, void * out, int sx, int sy, cudaStream_t return cudaSuccess != cudaStreamSynchronize(str) ? -3 : 0; } +int cuda_yuv422_to_yuv444(const void * src, void * out, int pix_count, cudaStream_t str) { + // grid and threadblock sizes + const dim3 tsiz(64, 1); + int thread_count = pix_count / 4; // we process block of 4 pixels + const dim3 gsiz((thread_count + tsiz.x - 1) / tsiz.x, 1); + yuv422_to_yuv444_kernel<<>>(src, out, pix_count); + return cudaSuccess != cudaStreamSynchronize(str) ? -3 : 0; +} /// CUDA DXT1 compression (only RGB without alpha). /// @param src Pointer to top-left source pixel in device-memory buffer. @@ -768,3 +812,8 @@ int cuda_yuv_to_dxt1(const void * src, void * out, int size_x, int size_y, cudaS int cuda_rgb_to_dxt6(const void * src, void * out, int size_x, int size_y, cudaStream_t stream) { return dxt_launch(src, out, size_x, size_y, stream); } + +int cuda_yuv_to_dxt6(const void * src, void * out, int size_x, int size_y, cudaStream_t stream) { + return dxt_launch(src, out, size_x, size_y, stream); +} + diff --git a/cuda_dxt/cuda_dxt.h b/cuda_dxt/cuda_dxt.h index c83baed45..348dfba6f 100644 --- a/cuda_dxt/cuda_dxt.h +++ b/cuda_dxt/cuda_dxt.h @@ -84,6 +84,8 @@ int cuda_rgb_to_dxt6 cudaStream_t stream ); +int cuda_yuv_to_dxt6(const void * src, void * out, int size_x, int size_y, cudaStream_t stream); +int cuda_yuv422_to_yuv444(const void * src, void * out, int pix_count, cudaStream_t str); #ifdef __cplusplus } /* end of extern "C" */ diff --git a/src/video_codec.c b/src/video_codec.c index 8126a639b..87e832dc7 100644 --- a/src/video_codec.c +++ b/src/video_codec.c @@ -1100,3 +1100,45 @@ int codec_is_a_rgb(codec_t codec) } return 0; } + +/** + * Returns line decoder for specifiedn input and output codec. + */ +bool get_decoder_from_to(codec_t in, codec_t out, decoder_t *decoder) +{ + struct item { + decoder_t decoder; + codec_t in; + codec_t out; + }; + + struct item decoders[] = { + { (decoder_t) vc_copylineDVS10, DVS10, UYVY }, + { (decoder_t) vc_copylinev210, v210, UYVY }, + { (decoder_t) vc_copylineYUYV, YUYV, UYVY }, + { (decoder_t) vc_copyliner10k, R10k, RGBA }, + //{ vc_copylineRGBA, RGBA, RGBA }, + { (decoder_t) vc_copylineDVS10toV210, DVS10, v210 }, + { (decoder_t) vc_copylineRGBAtoRGB, RGBA, RGB }, + { (decoder_t) vc_copylineRGBtoRGBA, RGB, RGBA }, + // following is disabled - shouldn't be senected automatically + //{ (decoder_t) vc_copylineRGBtoUYVY, RGB, UYVY }, + //{ (decoder_t) vc_copylineUYVYtoRGB, UYVY, RGB }, + //{ vc_copylineBGRtoUYVY, BGR, UYVY }, + //{ vc_copylineRGBAtoUYVY, RGBA, UYVY }, + { (decoder_t) vc_copylineBGRtoRGB, BGR, RGB }, + { (decoder_t) vc_copylineDPX10toRGBA, DPX10, RGBA }, + { (decoder_t) vc_copylineDPX10toRGB, DPX10, RGB }, + //{ vc_copylineRGB, RGB, RGB }. + }; + + for (unsigned int i = 0; i < sizeof(decoders)/sizeof(struct item); ++i) { + if (decoders[i].in == in && decoders[i].out == out) { + *decoder = decoders[i].decoder; + return true; + } + } + + return false; +} + diff --git a/src/video_codec.h b/src/video_codec.h index dca52cf09..e9870bcc4 100644 --- a/src/video_codec.h +++ b/src/video_codec.h @@ -139,6 +139,8 @@ void vc_copylineDPX10toRGB(unsigned char *dst, const unsigned char *src, int dst void vc_copylineRGB(unsigned char *dst, const unsigned char *src, int dst_len, int rshift, int gshift, int bshift); +bool get_decoder_from_to(codec_t in, codec_t out, decoder_t *decoder); + int codec_is_a_rgb(codec_t codec); #ifdef __cplusplus diff --git a/src/video_compress.c b/src/video_compress.c index c26b39ea6..9911fc2a5 100644 --- a/src/video_compress.c +++ b/src/video_compress.c @@ -49,6 +49,7 @@ #include "module.h" #include "video.h" #include "video_compress.h" +#include "video_compress/cuda_dxt.h" #include "video_compress/dxt_glsl.h" #include "video_compress/fastdxt.h" #include "video_compress/libavcodec.h" @@ -184,6 +185,16 @@ struct compress_t compress_modules[] = { MK_NAME(libavcodec_compress_tile), NULL }, +#endif +#if defined HAVE_CUDA_DXT || defined BUILD_LIBRARIES + { + "cuda_dxt", + "cuda_dxt", + MK_NAME(cuda_dxt_compress_init), + MK_NAME(NULL), + MK_NAME(cuda_dxt_compress_tile), + NULL + }, #endif { "none", diff --git a/src/video_compress/cuda_dxt.cpp b/src/video_compress/cuda_dxt.cpp new file mode 100644 index 000000000..20bf2bf77 --- /dev/null +++ b/src/video_compress/cuda_dxt.cpp @@ -0,0 +1,260 @@ +/** + * @file video_compress/cuda_dxt.cpp + * @author Martin Pulec + */ +/* + * Copyright (c) 2012-2013 CESNET z.s.p.o. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, is permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of CESNET nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, + * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#include "config_unix.h" +#include "config_win32.h" +#endif // HAVE_CONFIG_H + +#include "video_compress/cuda_dxt.h" + +#include "cuda_dxt/cuda_dxt.h" + +#include "host.h" +#include "module.h" +#include "video.h" + +struct state_video_compress_cuda_dxt { + state_video_compress_cuda_dxt() { + memset(&saved_desc, 0, sizeof(saved_desc)); + out[0] = out[1] = NULL; + in_buffer = NULL; + cuda_in_buffer = NULL; + cuda_uyvy_buffer = NULL; + } + struct module module_data; + struct video_desc saved_desc; + char *in_buffer; ///< for decoded data + char *cuda_uyvy_buffer; ///< same as in_buffer but in device memory + char *cuda_in_buffer; ///< same as in_buffer but in device memory + struct tile *out[2]; + codec_t in_codec; + codec_t out_codec; + decoder_t decoder; +}; + +static void cuda_dxt_compress_done(struct module *mod); + +struct module *cuda_dxt_compress_init(struct module *parent, char *fmt) +{ + state_video_compress_cuda_dxt *s = + new state_video_compress_cuda_dxt; + + s->out_codec = DXT1; + + if (fmt && fmt[0] != '\0') { + if (strcasecmp(fmt, "DXT5") == 0) { + s->out_codec = DXT5; + } else if (strcasecmp(fmt, "DXT1") == 0) { + s->out_codec = DXT1; + } else { + printf("usage:\n" + "\t-c cuda_dxt[:DXT1|:DXT5]\n"); + return NULL; + } + } + + module_init_default(&s->module_data); + s->module_data.cls = MODULE_CLASS_DATA; + s->module_data.priv_data = s; + s->module_data.deleter = cuda_dxt_compress_done; + module_register(&s->module_data, parent); + + return &s->module_data; +} + +static void cleanup(struct state_video_compress_cuda_dxt *s) +{ + if (s->in_buffer) { + free(s->in_buffer); + s->in_buffer = NULL; + } + if (s->cuda_uyvy_buffer) { + cudaFree(s->cuda_uyvy_buffer); + s->cuda_uyvy_buffer = NULL; + } + if (s->cuda_in_buffer) { + cudaFree(s->cuda_in_buffer); + s->cuda_in_buffer = NULL; + } + for (int i = 0; i < 2; ++i) { + if (s->out[i] != NULL) { + cudaFree(s->out[i]->data); + s->out[i]->data = NULL; + } + } +} + +static bool configure_with(struct state_video_compress_cuda_dxt *s, struct video_desc desc) +{ + cleanup(s); + + if (desc.color_spec == RGB || desc.color_spec == UYVY) { + s->in_codec = desc.color_spec; + } else if (get_decoder_from_to(desc.color_spec, RGB, &s->decoder)) { + s->in_codec = RGB; + } else if (get_decoder_from_to(desc.color_spec, UYVY, &s->decoder)) { + s->in_codec = UYVY; + } else { + fprintf(stderr, "Unsupported codec: %s\n", get_codec_name(desc.color_spec)); + return false; + } + + if (s->in_codec == UYVY) { + if (cudaSuccess != cudaMalloc((void **) &s->cuda_uyvy_buffer, + desc.width * desc.height * 2)) { + fprintf(stderr, "Could not allocate CUDA UYVY buffer.\n"); + return false; + } + } + + s->in_buffer = (char *) malloc(desc.width * desc.height * 3); + + if (cudaSuccess != cudaMalloc((void **) &s->cuda_in_buffer, + desc.width * desc.height * 3)) { + fprintf(stderr, "Could not allocate CUDA output buffer.\n"); + return false; + } + + for (int i = 0; i < 2; ++i) { + struct video_desc compressed_desc = desc; + compressed_desc.color_spec = s->out_codec; + + s->out[i] = tile_alloc_desc(compressed_desc); + s->out[i]->data_len = desc.width * desc.height / (s->out_codec == DXT1 ? 2 : 1); + if (cudaSuccess != cudaMallocHost((void **) &s->out[i]->data, + s->out[i]->data_len)) { + fprintf(stderr, "Could not allocate CUDA output buffer.\n"); + return false; + } + } + + return true; +} + +struct tile *cuda_dxt_compress_tile(struct module *mod, struct tile *tx, struct video_desc *desc, + int buffer) +{ + struct state_video_compress_cuda_dxt *s = + (struct state_video_compress_cuda_dxt *) mod->priv_data; + + cudaSetDevice(cuda_devices[0]); + + if (!video_desc_eq(*desc, s->saved_desc)) { + if(configure_with(s, *desc)) { + s->saved_desc = *desc; + } else { + fprintf(stderr, "[CUDA DXT] Reconfiguration failed!\n"); + return NULL; + } + } + + char *in_buffer; + if (desc->color_spec == s->in_codec) { + in_buffer = tx->data; + } else { + unsigned char *line1 = (unsigned char *) tx->data; + unsigned char *line2 = (unsigned char *) s->in_buffer; + + for (int i = 0; i < (int) tx->height; ++i) { + s->decoder(line2, line1, vc_get_linesize(tx->width, s->in_codec), + 0, 8, 16); + line1 += vc_get_linesize(tx->width, desc->color_spec); + line2 += vc_get_linesize(tx->width, s->in_codec); + } + in_buffer = s->in_buffer; + } + + if (s->in_codec == UYVY) { + if (cudaMemcpy(s->cuda_uyvy_buffer, in_buffer, desc->width * desc->height * 2, + cudaMemcpyHostToDevice) != cudaSuccess) { + fprintf(stderr, "Memcpy failed: %s\n", + cudaGetErrorString(cudaGetLastError())); + return NULL; + } + if (cuda_yuv422_to_yuv444(s->cuda_uyvy_buffer, s->cuda_in_buffer, + desc->width * desc->height, 0) != 0) { + fprintf(stderr, "UYVY kernel failed: %s\n", + cudaGetErrorString(cudaGetLastError())); + } + } else { + if (cudaMemcpy(s->cuda_in_buffer, in_buffer, desc->width * desc->height * 3, + cudaMemcpyHostToDevice) != cudaSuccess) { + fprintf(stderr, "Memcpy failed: %s\n", + cudaGetErrorString(cudaGetLastError())); + return NULL; + } + } + + int (*cuda_dxt_enc_func)(const void * src, void * out, int size_x, int size_y, cudaStream_t stream); + + if (s->out_codec == DXT1) { + if (s->in_codec == RGB) { + cuda_dxt_enc_func = cuda_rgb_to_dxt1; + } else { + cuda_dxt_enc_func = cuda_yuv_to_dxt1; + } + } else { + if (s->in_codec == RGB) { + cuda_dxt_enc_func = cuda_rgb_to_dxt6; + } else { + cuda_dxt_enc_func = cuda_yuv_to_dxt6; + } + } + int ret = cuda_dxt_enc_func(s->cuda_in_buffer, s->out[buffer]->data, s->saved_desc.width, + s->saved_desc.height, 0); + if (ret != 0) { + fprintf(stderr, "Encoding failed: %s\n", + cudaGetErrorString(cudaGetLastError())); + return NULL; + } + + desc->color_spec = s->out_codec; + return s->out[buffer]; +} + +static void cuda_dxt_compress_done(struct module *mod) +{ + struct state_video_compress_cuda_dxt *s = + (struct state_video_compress_cuda_dxt *) mod->priv_data; + + cleanup(s); + + delete s; +} + diff --git a/src/video_compress/cuda_dxt.h b/src/video_compress/cuda_dxt.h new file mode 100644 index 000000000..d8826748d --- /dev/null +++ b/src/video_compress/cuda_dxt.h @@ -0,0 +1,55 @@ +/** + * @file video_compress/cuda_dxt.h + * @author Martin Pulec + */ +/* + * Copyright (c) 2013 CESNET z.s.p.o. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, is permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of CESNET nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, + * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "types.h" + +#define JPEG_TO_DXT_MAGIC 0x20BF0088 + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +struct module; + +struct module *cuda_dxt_compress_init(struct module *parent, char * opts); +struct tile *cuda_dxt_compress_tile(struct module *mod, struct tile *tx, struct video_desc *desc, + int buffer); + +#ifdef __cplusplus +} +#endif // __cplusplus +