/*
 * Mirror of https://github.com/outbackdingo/UltraGrid.git
 * Synced 2026-03-20 17:40:12 +00:00
 * Merge conflicts noted in: libgpujpeg/gpujpeg_huffman_cpu_decoder.c,
 * libgpujpeg/gpujpeg_huffman_cpu_encoder.c, libgpujpeg/gpujpeg_preprocessor.cu
 */
/**
|
|
* Copyright (c) 2011, CESNET z.s.p.o
|
|
* Copyright (c) 2011, Silicon Genome, LLC.
|
|
*
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
*
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#include "gpujpeg_dct_gpu.h"
|
|
#include "gpujpeg_util.h"
|
|
|
|
/*
|
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
|
*
|
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
|
* with this source code for terms and conditions that govern your use of
|
|
* this software. Any use, reproduction, disclosure, or distribution of
|
|
* this software and related documentation outside the terms of the EULA
|
|
* is strictly prohibited.
|
|
*
|
|
*/
|
|
|
|
/** Fast integer multiplication (24-bit multiply; inputs here fit in 24 bits) */
#define FMUL(x,y) (__mul24(x,y))
//#define FMUL(x,y) ((x)*(y))

// X block count which will be processed by one thread block
#define GPUJPEG_DCT_BLOCK_COUNT_X 4
// Y block count which will be processed by one thread block
#define GPUJPEG_DCT_BLOCK_COUNT_Y 4

// Thread block width
#define GPUJPEG_DCT_THREAD_BLOCK_WIDTH (GPUJPEG_BLOCK_SIZE * GPUJPEG_DCT_BLOCK_COUNT_X)
// Thread block height
#define GPUJPEG_DCT_THREAD_BLOCK_HEIGHT (GPUJPEG_BLOCK_SIZE * GPUJPEG_DCT_BLOCK_COUNT_Y)

// Stride of shared memory buffer (short kernel); the +4 shorts of per-row
// padding presumably avoids shared-memory bank conflicts on column access
// (used together with the thread_x_permutated remapping in the kernels)
#define GPUJPEG_DCT_THREAD_BLOCK_STRIDE (GPUJPEG_DCT_THREAD_BLOCK_WIDTH + 4)

// Integer multiply-add / multiply helpers
#define IMAD(a, b, c) ( ((a) * (b)) + (c) )
#define IMUL(a, b) ((a) * (b))

// Fixed-point trigonometric constants for the integer (I)DCT, scaled by 2^15:
// e.g. SIN_1_4 = sin(pi/4) * 32768 = 0x5A82 (unfixh() removes the scaling).
#define SIN_1_4 0x5A82
#define COS_1_4 0x5A82
#define SIN_1_8 0x30FC
#define COS_1_8 0x7642

// "O" variants are the same sin/cos values pre-divided by 4 (i.e. scaled by
// 2^13), compensating for the "<< 2" applied to the odd butterfly terms:
// OSIN_1_16 = sin(pi/16) * 32768 / 4 = 0x063E, etc.
#define OSIN_1_16 0x063E
#define OSIN_3_16 0x11C7
#define OSIN_5_16 0x1A9B
#define OSIN_7_16 0x1F63

#define OCOS_1_16 0x1F63
#define OCOS_3_16 0x1A9B
#define OCOS_5_16 0x11C7
#define OCOS_7_16 0x063E
|
|
|
|
/**
 * Package of 2 shorts into 1 int - designed to perform i/o by integers to avoid bank conflicts
 */
union PackedInteger
{
    // Two adjacent 16-bit coefficients, individually addressable
    struct __align__(8)
    {
        int16_t hShort1;  // low half (first coefficient of the pair)
        int16_t hShort2;  // high half (second coefficient of the pair)
    };
    // The same pair viewed as one 32-bit word for integer-wide loads/stores
    int32_t hInt;
};
|
|
|
|
/**
 * Converts a 16.16 fixed point value to a short, rounding to nearest.
 *
 * @param x  Fixed point value (16 fractional bits)
 * @return Rounded integer part as int16_t
 */
__device__ inline int16_t
unfixh(int x)
{
    // Add one half (0x8000 = 2^15) so the arithmetic shift rounds to
    // nearest instead of truncating toward negative infinity.
    const int rounded = x + 0x8000;
    return (int16_t)(rounded >> 16);
}
|
|
|
|
/**
 * Converts a fixed point value with 13 fractional bits to an integer,
 * rounding to nearest (used for the intermediate odd-part rotations).
 *
 * @param x  Fixed point value (13 fractional bits)
 * @return Rounded value with the 13 fractional bits removed
 */
__device__ inline int
unfixo(int x)
{
    // 0x1000 = 2^12 is one half of the fractional unit -> round to nearest.
    const int rounded = x + 0x1000;
    return rounded >> 13;
}
|
|
|
|
/**
 * Performs in-place DCT of vector of 8 elements (used to access columns in shared memory).
 *
 * Integer implementation: even/odd butterfly with Q15 fixed-point rotations
 * (SIN_*/COS_* constants); unfixh()/unfixo() remove the fixed-point scaling.
 *
 * @param SrcDst [IN/OUT] - Pointer to the first element of vector
 * @param Stride [IN] - Value to add to ptr to access other elements
 * @return None
 */
__device__ void
gpujpeg_dct_gpu_kernel_inplace(int16_t* SrcDst, int Stride)
{
    int in0, in1, in2, in3, in4, in5, in6, in7;
    int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int tmp10, tmp11, tmp12, tmp13;
    int tmp14, tmp15, tmp16, tmp17;
    int tmp25, tmp26;

    int DoubleStride = Stride << 1;

    // Gather the 8 input samples (stride apart in shared memory)
    int16_t* DstPtr = SrcDst;
    in0 = *DstPtr;
    DstPtr += Stride;
    in1 = *DstPtr;
    DstPtr += Stride;
    in2 = *DstPtr;
    DstPtr += Stride;
    in3 = *DstPtr;
    DstPtr += Stride;
    in4 = *DstPtr;
    DstPtr += Stride;
    in5 = *DstPtr;
    DstPtr += Stride;
    in6 = *DstPtr;
    DstPtr += Stride;
    in7 = *DstPtr;

    // Stage 1: mirror-pair sums (even part) and differences (odd part)
    tmp0 = in7 + in0;
    tmp1 = in6 + in1;
    tmp2 = in5 + in2;
    tmp3 = in4 + in3;
    tmp4 = in3 - in4;
    tmp5 = in2 - in5;
    tmp6 = in1 - in6;
    tmp7 = in0 - in7;

    // Stage 2 (even part): sums/differences of the stage-1 sums
    tmp10 = tmp3 + tmp0;
    tmp11 = tmp2 + tmp1;
    tmp12 = tmp1 - tmp2;
    tmp13 = tmp0 - tmp3;

    // Odd part: pi/4 rotation, result kept with 2 extra fractional bits (unfixo)
    tmp16 = unfixo(FMUL(tmp6 + tmp5, SIN_1_4));
    tmp15 = unfixo(FMUL(tmp6 - tmp5, COS_1_4));

    // Match the 2-bit up-scaling of tmp15/tmp16 (compensated later by OCOS/OSIN)
    tmp4 <<= 2;
    tmp7 <<= 2;

    tmp14 = tmp4 + tmp15;
    tmp25 = tmp4 - tmp15;
    tmp26 = tmp7 - tmp16;
    tmp17 = tmp7 + tmp16;

    // Even output coefficients -> indices 0, 2, 4, 6
    DstPtr = SrcDst;
    *DstPtr = unfixh(FMUL(tmp10 + tmp11, SIN_1_4));
    DstPtr += DoubleStride;
    *DstPtr = unfixh(FMUL(tmp13, COS_1_8) + FMUL(tmp12, SIN_1_8));
    DstPtr += DoubleStride;
    *DstPtr = unfixh(FMUL(tmp10 - tmp11, COS_1_4));
    DstPtr += DoubleStride;
    *DstPtr = unfixh(FMUL(tmp13, SIN_1_8) - FMUL(tmp12, COS_1_8));

    // Odd output coefficients -> indices 1, 3, 5, 7
    DstPtr = SrcDst + Stride;
    *DstPtr = unfixh(FMUL(tmp17, OCOS_1_16) + FMUL(tmp14, OSIN_1_16));
    DstPtr += DoubleStride;
    *DstPtr = unfixh(FMUL(tmp26, OCOS_3_16) - FMUL(tmp25, OSIN_3_16));
    DstPtr += DoubleStride;
    *DstPtr = unfixh(FMUL(tmp26, OCOS_5_16) + FMUL(tmp25, OSIN_5_16));
    DstPtr += DoubleStride;
    *DstPtr = unfixh(FMUL(tmp17, OCOS_7_16) - FMUL(tmp14, OSIN_7_16));
}
|
|
|
|
/**
 * Performs in-place DCT of vector of 8 elements (used to access rows in shared memory).
 *
 * The row is read/written as 4 packed 32-bit words (2 coefficients each, see
 * PackedInteger) so shared-memory I/O is integer-wide. Same butterfly as the
 * strided (column) variant.
 *
 * @param V8 [IN/OUT] - Pointer to the first two elements of vector
 * @return None
 */
__device__ void
gpujpeg_dct_gpu_kernel_inplace(uint32_t* V8)
{
    int in0, in1, in2, in3, in4, in5, in6, in7;
    int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int tmp10, tmp11, tmp12, tmp13;
    int tmp14, tmp15, tmp16, tmp17;
    int tmp25, tmp26;
    PackedInteger sh0, sh1, sh2, sh3;

    // Unpack 8 coefficients from 4 packed words
    sh0.hInt = V8[0];
    sh1.hInt = V8[1];
    sh2.hInt = V8[2];
    sh3.hInt = V8[3];
    in0 = sh0.hShort1;
    in1 = sh0.hShort2;
    in2 = sh1.hShort1;
    in3 = sh1.hShort2;
    in4 = sh2.hShort1;
    in5 = sh2.hShort2;
    in6 = sh3.hShort1;
    in7 = sh3.hShort2;

    // Stage 1: mirror-pair sums (even part) and differences (odd part)
    tmp0 = in7 + in0;
    tmp1 = in6 + in1;
    tmp2 = in5 + in2;
    tmp3 = in4 + in3;
    tmp4 = in3 - in4;
    tmp5 = in2 - in5;
    tmp6 = in1 - in6;
    tmp7 = in0 - in7;

    // Stage 2 (even part)
    tmp10 = tmp3 + tmp0;
    tmp11 = tmp2 + tmp1;
    tmp12 = tmp1 - tmp2;
    tmp13 = tmp0 - tmp3;

    // Even coefficients land in the first short of each packed word
    // (output order per word pair: 0,1 / 2,3 / 4,5 / 6,7)
    sh0.hShort1 = unfixh(FMUL(tmp10 + tmp11, SIN_1_4));
    sh2.hShort1 = unfixh(FMUL(tmp10 - tmp11, COS_1_4));

    sh1.hShort1 = unfixh(FMUL(tmp13, COS_1_8) + FMUL(tmp12, SIN_1_8));
    sh3.hShort1 = unfixh(FMUL(tmp13, SIN_1_8) - FMUL(tmp12, COS_1_8));

    // Odd part: pi/4 rotation with 2 extra fractional bits (unfixo)
    tmp16 = unfixo(FMUL(tmp6 + tmp5, SIN_1_4));
    tmp15 = unfixo(FMUL(tmp6 - tmp5, COS_1_4));

    // Match the 2-bit up-scaling of tmp15/tmp16 (compensated by OCOS/OSIN)
    tmp4 <<= 2;
    tmp7 <<= 2;

    tmp14 = tmp4 + tmp15;
    tmp25 = tmp4 - tmp15;
    tmp26 = tmp7 - tmp16;
    tmp17 = tmp7 + tmp16;

    // Odd coefficients land in the second short of each packed word
    sh0.hShort2 = unfixh(FMUL(tmp17, OCOS_1_16) + FMUL(tmp14, OSIN_1_16));
    sh3.hShort2 = unfixh(FMUL(tmp17, OCOS_7_16) - FMUL(tmp14, OSIN_7_16));
    sh2.hShort2 = unfixh(FMUL(tmp26, OCOS_5_16) + FMUL(tmp25, OSIN_5_16));
    sh1.hShort2 = unfixh(FMUL(tmp26, OCOS_3_16) - FMUL(tmp25, OSIN_3_16));

    // Store the 4 packed words back
    V8[0] = sh0.hInt;
    V8[1] = sh1.hInt;
    V8[2] = sh2.hInt;
    V8[3] = sh3.hInt;
}
|
|
|
|
/**
 * Performs in-place IDCT of vector of 8 elements (used to access columns in shared memory).
 *
 * Inverse of the forward fixed-point butterfly: reconstructs 8 samples from
 * 8 DCT coefficients using the same Q15/Q13 constants.
 *
 * @param SrcDst [IN/OUT] - Pointer to the first element of vector
 * @param Stride [IN] - Value to add to ptr to access other elements
 * @return None
 */
__device__ void
gpujpeg_idct_gpu_kernel_inplace(int16_t* SrcDst, int Stride)
{
    int in0, in1, in2, in3, in4, in5, in6, in7;
    int tmp10, tmp11, tmp12, tmp13;
    int tmp20, tmp21, tmp22, tmp23;
    int tmp30, tmp31;
    int tmp40, tmp41, tmp42, tmp43;
    int tmp50, tmp51, tmp52, tmp53;

    // Gather the 8 input coefficients (stride apart in shared memory)
    int16_t *DstPtr = SrcDst;
    in0 = *DstPtr;
    DstPtr += Stride;
    in1 = *DstPtr;
    DstPtr += Stride;
    in2 = *DstPtr;
    DstPtr += Stride;
    in3 = *DstPtr;
    DstPtr += Stride;
    in4 = *DstPtr;
    DstPtr += Stride;
    in5 = *DstPtr;
    DstPtr += Stride;
    in6 = *DstPtr;
    DstPtr += Stride;
    in7 = *DstPtr;

    // Even part: coefficients 0, 2, 4, 6
    tmp10 = FMUL(in0 + in4, COS_1_4);
    tmp11 = FMUL(in0 - in4, COS_1_4);
    tmp12 = FMUL(in2, SIN_1_8) - FMUL(in6, COS_1_8);
    tmp13 = FMUL(in6, SIN_1_8) + FMUL(in2, COS_1_8);

    tmp20 = tmp10 + tmp13;
    tmp21 = tmp11 + tmp12;
    tmp22 = tmp11 - tmp12;
    tmp23 = tmp10 - tmp13;

    // Odd part: pi/4 rotation on coefficients 3, 5 (2 extra fractional bits)
    tmp30 = unfixo(FMUL(in3 + in5, COS_1_4));
    tmp31 = unfixo(FMUL(in3 - in5, COS_1_4));

    // Match the 2-bit up-scaling (compensated by the OCOS/OSIN constants)
    in1 <<= 2;
    in7 <<= 2;

    tmp40 = in1 + tmp30;
    tmp41 = in7 + tmp31;
    tmp42 = in1 - tmp30;
    tmp43 = in7 - tmp31;

    // Odd-part rotations by pi/16 and 5*pi/16
    tmp50 = FMUL(tmp40, OCOS_1_16) + FMUL(tmp41, OSIN_1_16);
    tmp51 = FMUL(tmp40, OSIN_1_16) - FMUL(tmp41, OCOS_1_16);
    tmp52 = FMUL(tmp42, OCOS_5_16) + FMUL(tmp43, OSIN_5_16);
    tmp53 = FMUL(tmp42, OSIN_5_16) - FMUL(tmp43, OCOS_5_16);

    // Final butterfly: outputs 0..3 are sums, 4..7 the mirrored differences
    DstPtr = SrcDst;
    *DstPtr = unfixh(tmp20 + tmp50);
    DstPtr += Stride;
    *DstPtr = unfixh(tmp21 + tmp53);
    DstPtr += Stride;
    *DstPtr = unfixh(tmp22 + tmp52);
    DstPtr += Stride;
    *DstPtr = unfixh(tmp23 + tmp51);
    DstPtr += Stride;
    *DstPtr = unfixh(tmp23 - tmp51);
    DstPtr += Stride;
    *DstPtr = unfixh(tmp22 - tmp52);
    DstPtr += Stride;
    *DstPtr = unfixh(tmp21 - tmp53);
    DstPtr += Stride;
    *DstPtr = unfixh(tmp20 - tmp50);
}
|
|
|
|
/**
 * Performs in-place IDCT of vector of 8 elements (used to access rows in shared memory).
 *
 * Same computation as the strided (column) variant, but the row is read and
 * written as 4 packed 32-bit words (2 coefficients each, see PackedInteger).
 *
 * @param V8 [IN/OUT] - Pointer to the first two elements of vector
 * @return None
 */
__device__ void
gpujpeg_idct_gpu_kernel_inplace(uint32_t* V8)
{
    int in0, in1, in2, in3, in4, in5, in6, in7;
    int tmp10, tmp11, tmp12, tmp13;
    int tmp20, tmp21, tmp22, tmp23;
    int tmp30, tmp31;
    int tmp40, tmp41, tmp42, tmp43;
    int tmp50, tmp51, tmp52, tmp53;
    PackedInteger sh0, sh1, sh2, sh3;

    // Unpack 8 coefficients from 4 packed words
    sh0.hInt = V8[0];
    sh1.hInt = V8[1];
    sh2.hInt = V8[2];
    sh3.hInt = V8[3];
    in0 = sh0.hShort1;
    in1 = sh0.hShort2;
    in2 = sh1.hShort1;
    in3 = sh1.hShort2;
    in4 = sh2.hShort1;
    in5 = sh2.hShort2;
    in6 = sh3.hShort1;
    in7 = sh3.hShort2;

    // Even part: coefficients 0, 2, 4, 6
    tmp10 = FMUL(in0 + in4, COS_1_4);
    tmp11 = FMUL(in0 - in4, COS_1_4);
    tmp12 = FMUL(in2, SIN_1_8) - FMUL(in6, COS_1_8);
    tmp13 = FMUL(in6, SIN_1_8) + FMUL(in2, COS_1_8);

    tmp20 = tmp10 + tmp13;
    tmp21 = tmp11 + tmp12;
    tmp22 = tmp11 - tmp12;
    tmp23 = tmp10 - tmp13;

    // Odd part: pi/4 rotation on coefficients 3, 5 (2 extra fractional bits)
    tmp30 = unfixo(FMUL(in3 + in5, COS_1_4));
    tmp31 = unfixo(FMUL(in3 - in5, COS_1_4));

    // Match the 2-bit up-scaling (compensated by the OCOS/OSIN constants)
    in1 <<= 2;
    in7 <<= 2;

    tmp40 = in1 + tmp30;
    tmp41 = in7 + tmp31;
    tmp42 = in1 - tmp30;
    tmp43 = in7 - tmp31;

    // Odd-part rotations by pi/16 and 5*pi/16
    tmp50 = FMUL(tmp40, OCOS_1_16) + FMUL(tmp41, OSIN_1_16);
    tmp51 = FMUL(tmp40, OSIN_1_16) - FMUL(tmp41, OCOS_1_16);
    tmp52 = FMUL(tmp42, OCOS_5_16) + FMUL(tmp43, OSIN_5_16);
    tmp53 = FMUL(tmp42, OSIN_5_16) - FMUL(tmp43, OCOS_5_16);

    // Final butterfly; repack: samples 0..3 are sums, 4..7 mirrored differences
    sh0.hShort1 = unfixh(tmp20 + tmp50);
    sh0.hShort2 = unfixh(tmp21 + tmp53);
    sh1.hShort1 = unfixh(tmp22 + tmp52);
    sh1.hShort2 = unfixh(tmp23 + tmp51);
    sh2.hShort1 = unfixh(tmp23 - tmp51);
    sh2.hShort2 = unfixh(tmp22 - tmp52);
    sh3.hShort1 = unfixh(tmp21 - tmp53);
    sh3.hShort2 = unfixh(tmp20 - tmp50);

    // Store the 4 packed words back
    V8[0] = sh0.hInt;
    V8[1] = sh1.hInt;
    V8[2] = sh2.hInt;
    V8[3] = sh3.hInt;
}
|
|
|
|
/** Quantization table (constant-memory copy, used by pre-Fermi path) */
__constant__ uint16_t gpujpeg_dct_gpu_quantization_table[64];

/**
 * Performs 8x8 block-wise Forward Discrete Cosine Transform of the given
 * image plane and outputs result to the array of coefficients. Short implementation.
 * This kernel is designed to process image by blocks of blocks8x8 that
 * utilize maximum warps capacity, assuming that it is enough of 8 threads
 * per block8x8.
 *
 * Launch configuration (see gpujpeg_dct_gpu below):
 *   blockDim = (GPUJPEG_BLOCK_SIZE, GPUJPEG_DCT_BLOCK_COUNT_X, GPUJPEG_DCT_BLOCK_COUNT_Y)
 *   i.e. threadIdx.x selects the column/row within one 8x8 block,
 *   threadIdx.y/.z select which 8x8 block of the 4x4 tile this thread works on.
 *
 * @param block_count_x [IN] - Number of 8x8 blocks horizontally
 * @param block_count_y [IN] - Number of 8x8 blocks vertically
 * @param source [IN] - Source image samples (8-bit)
 * @param source_stride [IN] - Stride of source in bytes/samples
 * @param output [OUT] - Output quantized DCT coefficients
 * @param output_stride [IN] - Stride of output (in coefficients)
 * @param quantization_table [IN] - Quantization table (device memory)
 * @return None
 */
__global__ void
gpujpeg_dct_gpu_kernel(int block_count_x, int block_count_y, uint8_t* source, int source_stride,
                       int16_t* output, int output_stride, uint16_t* quantization_table)
{
    // For pre-fermi GPUs, quantization table in constant memory is faster
#if __CUDA_ARCH__ < 200
    quantization_table = gpujpeg_dct_gpu_quantization_table;
#endif

    // Shared data (padded stride, see GPUJPEG_DCT_THREAD_BLOCK_STRIDE)
    __shared__ int16_t block[GPUJPEG_DCT_THREAD_BLOCK_HEIGHT * GPUJPEG_DCT_THREAD_BLOCK_STRIDE];

    // Block position (index of the 8x8 block this thread belongs to)
    int block_x = IMAD(blockIdx.x, GPUJPEG_DCT_BLOCK_COUNT_X, threadIdx.y);
    int block_y = IMAD(blockIdx.y, GPUJPEG_DCT_BLOCK_COUNT_Y, threadIdx.z);

    // Thread position in thread block
    int thread_x = IMAD(threadIdx.y, GPUJPEG_BLOCK_SIZE, threadIdx.x);
    int thread_y = IMUL(threadIdx.z, GPUJPEG_BLOCK_SIZE);
    // Bit-permuted x within each 32-thread group — presumably spreads column
    // accesses across shared-memory banks (TODO confirm the conflict pattern)
    int thread_x_permutated = (thread_x & 0xFFFFFFE0) | (((thread_x << 1) | ((thread_x >> 4) & 0x1)) & 0x1F);

    // Determine position into shared buffer
    int16_t* block_ptr = block + IMAD(thread_y, GPUJPEG_DCT_THREAD_BLOCK_STRIDE, thread_x);

    // Determine position in source buffer and apply it
    int source_x = IMAD(block_x, GPUJPEG_BLOCK_SIZE, threadIdx.x);
    int source_y = IMUL(block_y, GPUJPEG_BLOCK_SIZE);
    source += IMAD(source_y, source_stride, source_x);

    // Load data to shared memory (only for in-range 8x8 blocks)
    if ( block_x < block_count_x && block_y < block_count_y ) {

        // For pre-fermi GPUs, loading from global memory by 4 bytes is faster
#if __CUDA_ARCH__ < 200
        __shared__ uint8_t block_byte[GPUJPEG_DCT_THREAD_BLOCK_HEIGHT * GPUJPEG_DCT_THREAD_BLOCK_STRIDE];
        uint8_t* block_byte_ptr = block_byte + IMAD(thread_y, GPUJPEG_DCT_THREAD_BLOCK_STRIDE, thread_x);
        // Every 4th thread copies one 32-bit word per row
        if ( threadIdx.x % 4 == 0 ) {
            #pragma unroll
            for(int i = 0; i < GPUJPEG_BLOCK_SIZE; i++)
                ((uint32_t*)block_byte_ptr)[i * (GPUJPEG_DCT_THREAD_BLOCK_STRIDE / 4)] = ((uint32_t*)source)[i * (source_stride / 4)];
        }
        // Redirect the per-sample load below to the staged shared copy
        source = block_byte_ptr;
        source_stride = GPUJPEG_DCT_THREAD_BLOCK_STRIDE;
#endif

        // Level-shift samples by -128 (JPEG convention) into shared memory
        #pragma unroll
        for(int i = 0; i < GPUJPEG_BLOCK_SIZE; i++) {
            int16_t coefficient = (int16_t)(source[i * source_stride]);
            coefficient -= 128;
            block_ptr[i * GPUJPEG_DCT_THREAD_BLOCK_STRIDE] = coefficient;
        }
    }

    // Perform DCT: first pass over columns (strided), second pass over rows
    // (packed 32-bit access); barriers are unconditional so all threads reach them
    __syncthreads();
    gpujpeg_dct_gpu_kernel_inplace(block + thread_y * GPUJPEG_DCT_THREAD_BLOCK_STRIDE + thread_x_permutated, GPUJPEG_DCT_THREAD_BLOCK_STRIDE);
    __syncthreads();
    gpujpeg_dct_gpu_kernel_inplace((uint32_t*)(block + (thread_y + threadIdx.x) * GPUJPEG_DCT_THREAD_BLOCK_STRIDE + threadIdx.y * GPUJPEG_BLOCK_SIZE));
    __syncthreads();

    // Quantization: the table appears to store fixed-point reciprocals of the
    // quantization steps — result is ~ coefficient * q / 32767 with rounding
    // (+16384); applied to |coefficient| so rounding is symmetric around zero
    for(int i = 0; i < GPUJPEG_BLOCK_SIZE; i++) {
        uint16_t quantization = quantization_table[i * GPUJPEG_BLOCK_SIZE + threadIdx.x];
        int coefficient = block_ptr[i * GPUJPEG_DCT_THREAD_BLOCK_STRIDE];

        if ( coefficient < 0 ) {
            coefficient = -coefficient;
            coefficient = (coefficient * quantization + 16384) / 32767;
            coefficient = -coefficient;
        } else {
            coefficient = (coefficient * quantization + 16384) / 32767;
        }

        block_ptr[i * GPUJPEG_DCT_THREAD_BLOCK_STRIDE] = coefficient;
    }
    __syncthreads();

    // Determine position in output buffer (blocks are stored linearized,
    // GPUJPEG_BLOCK_SQUARED_SIZE coefficients per 8x8 block) and apply it
    int output_x = IMAD(IMAD(blockIdx.x, GPUJPEG_DCT_BLOCK_COUNT_X, threadIdx.y), GPUJPEG_BLOCK_SQUARED_SIZE, threadIdx.x * 2);
    int output_y = IMAD(blockIdx.y, GPUJPEG_DCT_BLOCK_COUNT_Y, threadIdx.z);
    output += IMAD(output_y, output_stride, output_x);

    // Store data to global memory, only half of threads in each cell performs data moving (each thread moves 2 shorts)
    int16_t* block_store_ptr = block_ptr + threadIdx.x; // Shortcut for "IMAD(..., threadIdx.x * 2)"
    if ( threadIdx.x < (GPUJPEG_BLOCK_SIZE / 2) && block_x < block_count_x && block_y < block_count_y ) {
        #pragma unroll
        for(int i = 0; i < GPUJPEG_BLOCK_SIZE; i++)
            ((int*)output)[i * (GPUJPEG_BLOCK_SIZE / 2)] = ((int*)block_store_ptr)[i * (GPUJPEG_DCT_THREAD_BLOCK_STRIDE / 2)];
    }
}
|
|
|
|
/** Quantization table (constant-memory copy, used by pre-Fermi path) */
__constant__ uint16_t gpujpeg_idct_gpu_quantization_table[64];

/**
 * Performs 8x8 block-wise Inverse Discrete Cosine Transform of the given
 * image plane and outputs result to the array of coefficients. Short implementation.
 * This kernel is designed to process image by blocks of blocks8x8 that
 * utilize maximum warps capacity, assuming that it is enough of 8 threads
 * per block8x8.
 *
 * Launch configuration (see gpujpeg_idct_gpu below):
 *   blockDim = (GPUJPEG_BLOCK_SIZE, GPUJPEG_DCT_BLOCK_COUNT_X, GPUJPEG_DCT_BLOCK_COUNT_Y)
 *
 * @param block_count_x [IN] - Number of 8x8 blocks horizontally
 * @param block_count_y [IN] - Number of 8x8 blocks vertically
 * @param source [IN] - Source quantized DCT coefficients
 * @param source_stride [IN] - Stride of source (in coefficients)
 * @param output [OUT] - Output image samples (8-bit, clamped to [0,255])
 * @param output_stride [IN] - Stride of output
 * @param quantization_table [IN] - Quantization table (device memory)
 * @return None
 */
__global__ void
gpujpeg_idct_gpu_kernel(int block_count_x, int block_count_y, int16_t* source, int source_stride,
                        uint8_t* output, int output_stride, uint16_t* quantization_table)
{
    // For pre-fermi GPUs, quantization table in constant memory is faster
#if __CUDA_ARCH__ < 200
    quantization_table = gpujpeg_idct_gpu_quantization_table;
#endif

    // Shared data (padded stride, see GPUJPEG_DCT_THREAD_BLOCK_STRIDE)
    __shared__ int16_t block[GPUJPEG_DCT_THREAD_BLOCK_HEIGHT * GPUJPEG_DCT_THREAD_BLOCK_STRIDE];

    // Block position (index of the 8x8 block this thread belongs to)
    int block_x = IMAD(blockIdx.x, GPUJPEG_DCT_BLOCK_COUNT_X, threadIdx.y);
    int block_y = IMAD(blockIdx.y, GPUJPEG_DCT_BLOCK_COUNT_Y, threadIdx.z);

    // Thread position in thread block
    int thread_x = IMAD(threadIdx.y, GPUJPEG_BLOCK_SIZE, threadIdx.x);
    int thread_y = IMUL(threadIdx.z, GPUJPEG_BLOCK_SIZE);
    // Bit-permuted x within each 32-thread group — presumably spreads column
    // accesses across shared-memory banks (TODO confirm the conflict pattern)
    int thread_x_permutated = (thread_x & 0xFFFFFFE0) | (((thread_x << 1) | ((thread_x >> 4) & 0x1)) & 0x1F);

    // Determine position into shared buffer
    int16_t* block_ptr = block + IMAD(thread_y, GPUJPEG_DCT_THREAD_BLOCK_STRIDE, thread_x);

    // Determine position in source buffer (coefficients are stored linearized,
    // GPUJPEG_BLOCK_SQUARED_SIZE per 8x8 block) and apply it
    int source_x = IMAD(block_x, GPUJPEG_BLOCK_SQUARED_SIZE, threadIdx.x * 2);
    int source_y = block_y;
    source += IMAD(source_y, source_stride, source_x);

    // Load data to shared memory, only half of threads in each cell performs data moving (each thread moves 2 shorts)
    if ( block_x < block_count_x && block_y < block_count_y ) {
        int16_t* block_load_ptr = block_ptr + threadIdx.x; // Shortcut for "IMAD(..., threadIdx.x * 2)"
        if ( threadIdx.x < (GPUJPEG_BLOCK_SIZE / 2) ) {
            #pragma unroll
            for(int i = 0; i < GPUJPEG_BLOCK_SIZE; i++)
                ((int*)block_load_ptr)[i * (GPUJPEG_DCT_THREAD_BLOCK_STRIDE / 2)] = ((int*)source)[i * (GPUJPEG_BLOCK_SIZE / 2)];
        }
    }
    __syncthreads();

    // Dequantization: multiply each coefficient by its quantization step.
    // NOTE(review): the product is truncated to int16_t — looks safe only if
    // coefficient*step always fits in 16 bits; confirm against encoder ranges.
    for(int i = 0; i < GPUJPEG_BLOCK_SIZE; i++) {
        int16_t quantization = quantization_table[i * GPUJPEG_BLOCK_SIZE + threadIdx.x];
        int16_t coefficient = block_ptr[i * GPUJPEG_DCT_THREAD_BLOCK_STRIDE];

        coefficient = coefficient * quantization;

        block_ptr[i * GPUJPEG_DCT_THREAD_BLOCK_STRIDE] = coefficient;
    }

    // Perform IDCT: first pass over columns (strided), second pass over rows
    // (packed 32-bit access); barriers are unconditional so all threads reach them
    __syncthreads();
    gpujpeg_idct_gpu_kernel_inplace(block + thread_y * GPUJPEG_DCT_THREAD_BLOCK_STRIDE + thread_x_permutated, GPUJPEG_DCT_THREAD_BLOCK_STRIDE);
    __syncthreads();
    gpujpeg_idct_gpu_kernel_inplace((uint32_t*)(block + (thread_y + threadIdx.x) * GPUJPEG_DCT_THREAD_BLOCK_STRIDE + threadIdx.y * GPUJPEG_BLOCK_SIZE));
    __syncthreads();

    // Determine position in output buffer and apply it
    int output_x = IMAD(blockIdx.x, GPUJPEG_DCT_THREAD_BLOCK_WIDTH, thread_x);
    int output_y = IMAD(blockIdx.y, GPUJPEG_DCT_THREAD_BLOCK_HEIGHT, thread_y);
    output += IMAD(output_y, output_stride, output_x);

    // For pre-fermi GPUs, storing to global memory by 4 bytes is faster:
    // stage the bytes in shared memory first, then copy word-wise below
#if __CUDA_ARCH__ < 200
    __shared__ uint8_t block_byte[GPUJPEG_DCT_THREAD_BLOCK_HEIGHT * GPUJPEG_DCT_THREAD_BLOCK_STRIDE];
    uint8_t* block_byte_ptr = block_byte + IMAD(thread_y, GPUJPEG_DCT_THREAD_BLOCK_STRIDE, thread_x);
    uint8_t* __output = output;
    int __output_stride = output_stride;
    output = block_byte_ptr;
    output_stride = GPUJPEG_DCT_THREAD_BLOCK_STRIDE;
#endif

    // Store data to global memory: undo the -128 level shift and clamp to [0,255]
    if ( block_x < block_count_x && block_y < block_count_y ) {
        #pragma unroll
        for(int i = 0; i < GPUJPEG_BLOCK_SIZE; i++) {
            int16_t coefficient = block_ptr[i * GPUJPEG_DCT_THREAD_BLOCK_STRIDE];
            coefficient += 128;
            if ( coefficient > 255 )
                coefficient = 255;
            if ( coefficient < 0 )
                coefficient = 0;
            output[i * output_stride] = (uint8_t)coefficient;
        }

        // For pre-fermi GPUs, storing to global memory by 4 bytes is faster
#if __CUDA_ARCH__ < 200
        if ( threadIdx.x % 4 == 0 ) {
            #pragma unroll
            for(int i = 0; i < GPUJPEG_BLOCK_SIZE; i++)
                ((uint32_t*)__output)[i * (__output_stride / 4)] = ((uint32_t*)block_byte_ptr)[i * (GPUJPEG_DCT_THREAD_BLOCK_STRIDE / 4)];
        }
#endif
    }
}
|
|
|
|
/**
 * Documented at declaration.
 *
 * Runs the forward integer DCT + quantization for every component of the
 * encoder's current image: for each component it uploads the matching
 * quantization table to constant memory (used by the pre-Fermi kernel path)
 * and launches gpujpeg_dct_gpu_kernel over all 8x8 blocks.
 *
 * @param encoder  Encoder whose coder/components are processed in-place
 */
void
gpujpeg_dct_gpu(struct gpujpeg_encoder* encoder)
{
    // Get coder
    struct gpujpeg_coder* coder = &encoder->coder;

    // Encode each component
    for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
        // Get component
        struct gpujpeg_component* component = &coder->component[comp];

        // Determine table type (component 0 is luminance, others chrominance)
        enum gpujpeg_component_type type = (comp == 0) ? GPUJPEG_COMPONENT_LUMINANCE : GPUJPEG_COMPONENT_CHROMINANCE;

        int roi_width = component->data_width;
        int roi_height = component->data_height;
        // Kernel indexing assumes 8x8 JPEG blocks
        assert(GPUJPEG_BLOCK_SIZE == 8);

        int block_count_x = roi_width / GPUJPEG_BLOCK_SIZE;
        int block_count_y = roi_height / GPUJPEG_BLOCK_SIZE;

        // Get quantization table
        uint16_t* d_quantization_table = encoder->table_quantization[type].d_table;

        // Copy quantization table to constant memory. Pass the symbol itself:
        // the string-name (char*) overload of cudaMemcpyToSymbol was removed
        // in CUDA 5.0, and the cast only obscured the address-based call.
        cudaMemcpyToSymbol(
            gpujpeg_dct_gpu_quantization_table,
            d_quantization_table,
            64 * sizeof(uint16_t),
            0,
            cudaMemcpyDeviceToDevice
        );
        gpujpeg_cuda_check_error("Copy DCT quantization table to constant memory");

        // Perform block-wise DCT processing: each thread block covers a
        // 4x4 tile of 8x8 blocks, 8 threads (x-dimension) per 8x8 block
        dim3 dct_grid(
            gpujpeg_div_and_round_up(block_count_x, GPUJPEG_DCT_BLOCK_COUNT_X),
            gpujpeg_div_and_round_up(block_count_y, GPUJPEG_DCT_BLOCK_COUNT_Y),
            1
        );
        dim3 dct_block(
            GPUJPEG_BLOCK_SIZE,
            GPUJPEG_DCT_BLOCK_COUNT_X,
            GPUJPEG_DCT_BLOCK_COUNT_Y
        );
        gpujpeg_dct_gpu_kernel<<<dct_grid, dct_block>>>(
            block_count_x,
            block_count_y,
            component->d_data,
            component->data_width,
            component->d_data_quantized,
            component->data_width * GPUJPEG_BLOCK_SIZE,
            d_quantization_table
        );
        // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is
        // its direct replacement and also surfaces async kernel errors here
        cudaDeviceSynchronize();
        gpujpeg_cuda_check_error("Forward Integer DCT failed");
    }
}
|
|
|
|
/**
 * Documented at declaration.
 *
 * Runs dequantization + inverse integer DCT for every component of the
 * decoder's current image: for each component it uploads the matching
 * quantization table to constant memory (used by the pre-Fermi kernel path)
 * and launches gpujpeg_idct_gpu_kernel over all 8x8 blocks.
 *
 * @param decoder  Decoder whose coder/components are processed in-place
 */
void
gpujpeg_idct_gpu(struct gpujpeg_decoder* decoder)
{
    // Get coder
    struct gpujpeg_coder* coder = &decoder->coder;

    // Decode each component
    for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
        // Get component
        struct gpujpeg_component* component = &coder->component[comp];

        // Determine table type (component 0 is luminance, others chrominance)
        enum gpujpeg_component_type type = (comp == 0) ? GPUJPEG_COMPONENT_LUMINANCE : GPUJPEG_COMPONENT_CHROMINANCE;

        int roi_width = component->data_width;
        int roi_height = component->data_height;
        // Kernel indexing assumes 8x8 JPEG blocks
        assert(GPUJPEG_BLOCK_SIZE == 8);

        int block_count_x = roi_width / GPUJPEG_BLOCK_SIZE;
        int block_count_y = roi_height / GPUJPEG_BLOCK_SIZE;

        // Get quantization table
        uint16_t* d_quantization_table = decoder->table_quantization[type].d_table;

        // Copy quantization table to constant memory. Pass the symbol itself:
        // the string-name (char*) overload of cudaMemcpyToSymbol was removed
        // in CUDA 5.0, and the cast only obscured the address-based call.
        cudaMemcpyToSymbol(
            gpujpeg_idct_gpu_quantization_table,
            d_quantization_table,
            64 * sizeof(uint16_t),
            0,
            cudaMemcpyDeviceToDevice
        );
        gpujpeg_cuda_check_error("Copy IDCT quantization table to constant memory");

        // Perform block-wise IDCT processing: each thread block covers a
        // 4x4 tile of 8x8 blocks, 8 threads (x-dimension) per 8x8 block
        dim3 dct_grid(
            gpujpeg_div_and_round_up(block_count_x, GPUJPEG_DCT_BLOCK_COUNT_X),
            gpujpeg_div_and_round_up(block_count_y, GPUJPEG_DCT_BLOCK_COUNT_Y),
            1
        );
        dim3 dct_block(
            GPUJPEG_BLOCK_SIZE,
            GPUJPEG_DCT_BLOCK_COUNT_X,
            GPUJPEG_DCT_BLOCK_COUNT_Y
        );
        gpujpeg_idct_gpu_kernel<<<dct_grid, dct_block>>>(
            block_count_x,
            block_count_y,
            component->d_data_quantized,
            component->data_width * GPUJPEG_BLOCK_SIZE,
            component->d_data,
            component->data_width,
            d_quantization_table
        );
        // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is
        // its direct replacement and also surfaces async kernel errors here
        cudaDeviceSynchronize();
        gpujpeg_cuda_check_error("Inverse Integer DCT failed");
    }
}
|