/**
* Copyright (c) 2011, CESNET z.s.p.o
* Copyright (c) 2011, Silicon Genome, LLC.
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "gpujpeg_dct_gpu.h"
#include "gpujpeg_util.h"
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/**
 * Fast integer multiplication - __mul24 multiplies the low 24 bits of its
 * operands, which is faster than a full 32-bit multiply on pre-Fermi GPUs
 * (the fixed point operands used here fit into 24 bits)
 */
#define FMUL(x,y) (__mul24(x,y))
//#define FMUL(x,y) ((x)*(y))
// Number of 8x8 blocks processed along X by one thread block
#define GPUJPEG_DCT_BLOCK_COUNT_X 4
// Number of 8x8 blocks processed along Y by one thread block
#define GPUJPEG_DCT_BLOCK_COUNT_Y 4
// Width of the image tile processed by one thread block (in samples)
#define GPUJPEG_DCT_THREAD_BLOCK_WIDTH (GPUJPEG_BLOCK_SIZE * GPUJPEG_DCT_BLOCK_COUNT_X)
// Height of the image tile processed by one thread block (in samples)
#define GPUJPEG_DCT_THREAD_BLOCK_HEIGHT (GPUJPEG_BLOCK_SIZE * GPUJPEG_DCT_BLOCK_COUNT_Y)
// Stride of shared memory buffer (short kernel); the +4 shorts of padding
// help keep the column-wise passes free of shared memory bank conflicts
#define GPUJPEG_DCT_THREAD_BLOCK_STRIDE (GPUJPEG_DCT_THREAD_BLOCK_WIDTH + 4)
#define IMAD(a, b, c) ( ((a) * (b)) + (c) )
#define IMUL(a, b) ((a) * (b))
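/*
 * Fixed point sine/cosine constants for multiples of pi/16: the SIN_ and
 * COS_ values are scaled by 2^15 (e.g. 0x5A82 = 23170 ~ sin(pi/4) * 32768),
 * while the OSIN_ and OCOS_ values are scaled by 2^13 (e.g. 0x1F63 = 8035
 * ~ cos(pi/16) * 8192) and pair with the unfixo()/unfixh() helpers below.
 */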
#define SIN_1_4 0x5A82
#define COS_1_4 0x5A82
#define SIN_1_8 0x30FC
#define COS_1_8 0x7642
#define OSIN_1_16 0x063E
#define OSIN_3_16 0x11C7
#define OSIN_5_16 0x1A9B
#define OSIN_7_16 0x1F63
#define OCOS_1_16 0x1F63
#define OCOS_3_16 0x1A9B
#define OCOS_5_16 0x11C7
#define OCOS_7_16 0x063E
/**
 * Packs two 16-bit values into one 32-bit integer so that shared memory I/O
 * can be done with whole integers, avoiding bank conflicts
 */
union PackedInteger
{
struct __align__(8)
{
int16_t hShort1;
int16_t hShort2;
};
int32_t hInt;
};
/**
 * Rounds a fixed point value and converts it to a short value
 * (drops 16 fractional bits)
 */
__device__ inline int16_t
unfixh(int x)
{
return (int16_t)((x + 0x8000) >> 16);
}
/**
 * Rounds a fixed point value to an intermediate fixed point representation
 * (drops 13 fractional bits; used for intermediate results)
 */
__device__ inline int
unfixo(int x)
{
return (x + 0x1000) >> 13;
}
/**
* Performs in-place DCT of vector of 8 elements (used to access columns in shared memory).
*
* @param SrcDst [IN/OUT] - Pointer to the first element of vector
* @param Stride [IN] - Value to add to ptr to access other elements
* @return None
*/
__device__ void
gpujpeg_dct_gpu_kernel_inplace(int16_t* SrcDst, int Stride)
{
int in0, in1, in2, in3, in4, in5, in6, in7;
int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int tmp10, tmp11, tmp12, tmp13;
int tmp14, tmp15, tmp16, tmp17;
int tmp25, tmp26;
int DoubleStride = Stride << 1;
int16_t* DstPtr = SrcDst;
in0 = *DstPtr;
DstPtr += Stride;
in1 = *DstPtr;
DstPtr += Stride;
in2 = *DstPtr;
DstPtr += Stride;
in3 = *DstPtr;
DstPtr += Stride;
in4 = *DstPtr;
DstPtr += Stride;
in5 = *DstPtr;
DstPtr += Stride;
in6 = *DstPtr;
DstPtr += Stride;
in7 = *DstPtr;
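// Stage 1: butterflies on mirrored input pairs (sums form the even part,
// differences the odd part)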
tmp0 = in7 + in0;
tmp1 = in6 + in1;
tmp2 = in5 + in2;
tmp3 = in4 + in3;
tmp4 = in3 - in4;
tmp5 = in2 - in5;
tmp6 = in1 - in6;
tmp7 = in0 - in7;
tmp10 = tmp3 + tmp0;
tmp11 = tmp2 + tmp1;
tmp12 = tmp1 - tmp2;
tmp13 = tmp0 - tmp3;
tmp16 = unfixo(FMUL(tmp6 + tmp5, SIN_1_4));
tmp15 = unfixo(FMUL(tmp6 - tmp5, COS_1_4));
tmp4 <<= 2;
tmp7 <<= 2;
tmp14 = tmp4 + tmp15;
tmp25 = tmp4 - tmp15;
tmp26 = tmp7 - tmp16;
tmp17 = tmp7 + tmp16;
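// Even-index outputs come from the even part, odd-index outputs from the
// rotated odd part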
DstPtr = SrcDst;
*DstPtr = unfixh(FMUL(tmp10 + tmp11, SIN_1_4));
DstPtr += DoubleStride;
*DstPtr = unfixh(FMUL(tmp13, COS_1_8) + FMUL(tmp12, SIN_1_8));
DstPtr += DoubleStride;
*DstPtr = unfixh(FMUL(tmp10 - tmp11, COS_1_4));
DstPtr += DoubleStride;
*DstPtr = unfixh(FMUL(tmp13, SIN_1_8) - FMUL(tmp12, COS_1_8));
DstPtr = SrcDst + Stride;
*DstPtr = unfixh(FMUL(tmp17, OCOS_1_16) + FMUL(tmp14, OSIN_1_16));
DstPtr += DoubleStride;
*DstPtr = unfixh(FMUL(tmp26, OCOS_3_16) - FMUL(tmp25, OSIN_3_16));
DstPtr += DoubleStride;
*DstPtr = unfixh(FMUL(tmp26, OCOS_5_16) + FMUL(tmp25, OSIN_5_16));
DstPtr += DoubleStride;
*DstPtr = unfixh(FMUL(tmp17, OCOS_7_16) - FMUL(tmp14, OSIN_7_16));
}
/**
* Performs in-place DCT of vector of 8 elements (used to access rows in shared memory).
*
* @param V8 [IN/OUT] - Pointer to the vector of 8 elements, packed as four 32-bit words
* @return None
*/
__device__ void
gpujpeg_dct_gpu_kernel_inplace(uint32_t* V8)
{
int in0, in1, in2, in3, in4, in5, in6, in7;
int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int tmp10, tmp11, tmp12, tmp13;
int tmp14, tmp15, tmp16, tmp17;
int tmp25, tmp26;
PackedInteger sh0, sh1, sh2, sh3;
sh0.hInt = V8[0];
sh1.hInt = V8[1];
sh2.hInt = V8[2];
sh3.hInt = V8[3];
in0 = sh0.hShort1;
in1 = sh0.hShort2;
in2 = sh1.hShort1;
in3 = sh1.hShort2;
in4 = sh2.hShort1;
in5 = sh2.hShort2;
in6 = sh3.hShort1;
in7 = sh3.hShort2;
tmp0 = in7 + in0;
tmp1 = in6 + in1;
tmp2 = in5 + in2;
tmp3 = in4 + in3;
tmp4 = in3 - in4;
tmp5 = in2 - in5;
tmp6 = in1 - in6;
tmp7 = in0 - in7;
tmp10 = tmp3 + tmp0;
tmp11 = tmp2 + tmp1;
tmp12 = tmp1 - tmp2;
tmp13 = tmp0 - tmp3;
sh0.hShort1 = unfixh(FMUL(tmp10 + tmp11, SIN_1_4));
sh2.hShort1 = unfixh(FMUL(tmp10 - tmp11, COS_1_4));
sh1.hShort1 = unfixh(FMUL(tmp13, COS_1_8) + FMUL(tmp12, SIN_1_8));
sh3.hShort1 = unfixh(FMUL(tmp13, SIN_1_8) - FMUL(tmp12, COS_1_8));
tmp16 = unfixo(FMUL(tmp6 + tmp5, SIN_1_4));
tmp15 = unfixo(FMUL(tmp6 - tmp5, COS_1_4));
tmp4 <<= 2;
tmp7 <<= 2;
tmp14 = tmp4 + tmp15;
tmp25 = tmp4 - tmp15;
tmp26 = tmp7 - tmp16;
tmp17 = tmp7 + tmp16;
sh0.hShort2 = unfixh(FMUL(tmp17, OCOS_1_16) + FMUL(tmp14, OSIN_1_16));
sh3.hShort2 = unfixh(FMUL(tmp17, OCOS_7_16) - FMUL(tmp14, OSIN_7_16));
sh2.hShort2 = unfixh(FMUL(tmp26, OCOS_5_16) + FMUL(tmp25, OSIN_5_16));
sh1.hShort2 = unfixh(FMUL(tmp26, OCOS_3_16) - FMUL(tmp25, OSIN_3_16));
V8[0] = sh0.hInt;
V8[1] = sh1.hInt;
V8[2] = sh2.hInt;
V8[3] = sh3.hInt;
}
/**
* Performs in-place IDCT of vector of 8 elements (used to access columns in shared memory).
*
* @param SrcDst [IN/OUT] - Pointer to the first element of vector
* @param Stride [IN] - Value to add to ptr to access other elements
* @return None
*/
__device__ void
gpujpeg_idct_gpu_kernel_inplace(int16_t* SrcDst, int Stride)
{
int in0, in1, in2, in3, in4, in5, in6, in7;
int tmp10, tmp11, tmp12, tmp13;
int tmp20, tmp21, tmp22, tmp23;
int tmp30, tmp31;
int tmp40, tmp41, tmp42, tmp43;
int tmp50, tmp51, tmp52, tmp53;
int16_t *DstPtr = SrcDst;
in0 = *DstPtr;
DstPtr += Stride;
in1 = *DstPtr;
DstPtr += Stride;
in2 = *DstPtr;
DstPtr += Stride;
in3 = *DstPtr;
DstPtr += Stride;
in4 = *DstPtr;
DstPtr += Stride;
in5 = *DstPtr;
DstPtr += Stride;
in6 = *DstPtr;
DstPtr += Stride;
in7 = *DstPtr;
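// Even part from inputs 0,2,4,6; odd part reconstructed from inputs 1,3,5,7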
tmp10 = FMUL(in0 + in4, COS_1_4);
tmp11 = FMUL(in0 - in4, COS_1_4);
tmp12 = FMUL(in2, SIN_1_8) - FMUL(in6, COS_1_8);
tmp13 = FMUL(in6, SIN_1_8) + FMUL(in2, COS_1_8);
tmp20 = tmp10 + tmp13;
tmp21 = tmp11 + tmp12;
tmp22 = tmp11 - tmp12;
tmp23 = tmp10 - tmp13;
tmp30 = unfixo(FMUL(in3 + in5, COS_1_4));
tmp31 = unfixo(FMUL(in3 - in5, COS_1_4));
in1 <<= 2;
in7 <<= 2;
tmp40 = in1 + tmp30;
tmp41 = in7 + tmp31;
tmp42 = in1 - tmp30;
tmp43 = in7 - tmp31;
tmp50 = FMUL(tmp40, OCOS_1_16) + FMUL(tmp41, OSIN_1_16);
tmp51 = FMUL(tmp40, OSIN_1_16) - FMUL(tmp41, OCOS_1_16);
tmp52 = FMUL(tmp42, OCOS_5_16) + FMUL(tmp43, OSIN_5_16);
tmp53 = FMUL(tmp42, OSIN_5_16) - FMUL(tmp43, OCOS_5_16);
DstPtr = SrcDst;
*DstPtr = unfixh(tmp20 + tmp50);
DstPtr += Stride;
*DstPtr = unfixh(tmp21 + tmp53);
DstPtr += Stride;
*DstPtr = unfixh(tmp22 + tmp52);
DstPtr += Stride;
*DstPtr = unfixh(tmp23 + tmp51);
DstPtr += Stride;
*DstPtr = unfixh(tmp23 - tmp51);
DstPtr += Stride;
*DstPtr = unfixh(tmp22 - tmp52);
DstPtr += Stride;
*DstPtr = unfixh(tmp21 - tmp53);
DstPtr += Stride;
*DstPtr = unfixh(tmp20 - tmp50);
}
/**
* Performs in-place IDCT of vector of 8 elements (used to access rows in shared memory).
*
* @param V8 [IN/OUT] - Pointer to the vector of 8 elements, packed as four 32-bit words
* @return None
*/
__device__ void
gpujpeg_idct_gpu_kernel_inplace(uint32_t* V8)
{
int in0, in1, in2, in3, in4, in5, in6, in7;
int tmp10, tmp11, tmp12, tmp13;
int tmp20, tmp21, tmp22, tmp23;
int tmp30, tmp31;
int tmp40, tmp41, tmp42, tmp43;
int tmp50, tmp51, tmp52, tmp53;
PackedInteger sh0, sh1, sh2, sh3;
sh0.hInt = V8[0];
sh1.hInt = V8[1];
sh2.hInt = V8[2];
sh3.hInt = V8[3];
in0 = sh0.hShort1;
in1 = sh0.hShort2;
in2 = sh1.hShort1;
in3 = sh1.hShort2;
in4 = sh2.hShort1;
in5 = sh2.hShort2;
in6 = sh3.hShort1;
in7 = sh3.hShort2;
tmp10 = FMUL(in0 + in4, COS_1_4);
tmp11 = FMUL(in0 - in4, COS_1_4);
tmp12 = FMUL(in2, SIN_1_8) - FMUL(in6, COS_1_8);
tmp13 = FMUL(in6, SIN_1_8) + FMUL(in2, COS_1_8);
tmp20 = tmp10 + tmp13;
tmp21 = tmp11 + tmp12;
tmp22 = tmp11 - tmp12;
tmp23 = tmp10 - tmp13;
tmp30 = unfixo(FMUL(in3 + in5, COS_1_4));
tmp31 = unfixo(FMUL(in3 - in5, COS_1_4));
in1 <<= 2;
in7 <<= 2;
tmp40 = in1 + tmp30;
tmp41 = in7 + tmp31;
tmp42 = in1 - tmp30;
tmp43 = in7 - tmp31;
tmp50 = FMUL(tmp40, OCOS_1_16) + FMUL(tmp41, OSIN_1_16);
tmp51 = FMUL(tmp40, OSIN_1_16) - FMUL(tmp41, OCOS_1_16);
tmp52 = FMUL(tmp42, OCOS_5_16) + FMUL(tmp43, OSIN_5_16);
tmp53 = FMUL(tmp42, OSIN_5_16) - FMUL(tmp43, OCOS_5_16);
sh0.hShort1 = unfixh(tmp20 + tmp50);
sh0.hShort2 = unfixh(tmp21 + tmp53);
sh1.hShort1 = unfixh(tmp22 + tmp52);
sh1.hShort2 = unfixh(tmp23 + tmp51);
sh2.hShort1 = unfixh(tmp23 - tmp51);
sh2.hShort2 = unfixh(tmp22 - tmp52);
sh3.hShort1 = unfixh(tmp21 - tmp53);
sh3.hShort2 = unfixh(tmp20 - tmp50);
V8[0] = sh0.hInt;
V8[1] = sh1.hInt;
V8[2] = sh2.hInt;
V8[3] = sh3.hInt;
}
/** Quantization table in GPU constant memory (used on pre-Fermi devices) */
__constant__ uint16_t gpujpeg_dct_gpu_quantization_table[64];
/**
 * Performs 8x8 block-wise Forward Discrete Cosine Transform of the given
 * image plane and outputs the result to the array of coefficients. Short
 * implementation. This kernel processes the image in groups of 8x8 blocks
 * sized to utilize full warp capacity, using 8 threads per 8x8 block.
 *
 * @param block_count_x [IN] - Number of 8x8 blocks along X
 * @param block_count_y [IN] - Number of 8x8 blocks along Y
 * @param source [IN] - Source image samples
 * @param source_stride [IN] - Stride of source
 * @param output [OUT] - Output coefficients
 * @param output_stride [IN] - Stride of output
 * @param quantization_table [IN] - Quantization table
 * @return None
 */
__global__ void
gpujpeg_dct_gpu_kernel(int block_count_x, int block_count_y, uint8_t* source, int source_stride,
int16_t* output, int output_stride, uint16_t* quantization_table)
{
// For pre-Fermi GPUs, reading the quantization table from constant memory is faster
#if __CUDA_ARCH__ < 200
quantization_table = gpujpeg_dct_gpu_quantization_table;
#endif
// Shared data
__shared__ int16_t block[GPUJPEG_DCT_THREAD_BLOCK_HEIGHT * GPUJPEG_DCT_THREAD_BLOCK_STRIDE];
// Block position
int block_x = IMAD(blockIdx.x, GPUJPEG_DCT_BLOCK_COUNT_X, threadIdx.y);
int block_y = IMAD(blockIdx.y, GPUJPEG_DCT_BLOCK_COUNT_Y, threadIdx.z);
// Thread position in thread block
int thread_x = IMAD(threadIdx.y, GPUJPEG_BLOCK_SIZE, threadIdx.x);
int thread_y = IMUL(threadIdx.z, GPUJPEG_BLOCK_SIZE);
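// Rotate the low 5 bits of the x index left by one so that threads hit
// different shared memory banks during the column-wise pass below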
int thread_x_permutated = (thread_x & 0xFFFFFFE0) | (((thread_x << 1) | ((thread_x >> 4) & 0x1)) & 0x1F);
// Determine position into shared buffer
int16_t* block_ptr = block + IMAD(thread_y, GPUJPEG_DCT_THREAD_BLOCK_STRIDE, thread_x);
// Determine position in source buffer and apply it
int source_x = IMAD(block_x, GPUJPEG_BLOCK_SIZE, threadIdx.x);
int source_y = IMUL(block_y, GPUJPEG_BLOCK_SIZE);
source += IMAD(source_y, source_stride, source_x);
// Load data to shared memory
if ( block_x < block_count_x && block_y < block_count_y ) {
// For pre-Fermi GPUs, loading from global memory in 4-byte words is faster
#if __CUDA_ARCH__ < 200
__shared__ uint8_t block_byte[GPUJPEG_DCT_THREAD_BLOCK_HEIGHT * GPUJPEG_DCT_THREAD_BLOCK_STRIDE];
uint8_t* block_byte_ptr = block_byte + IMAD(thread_y, GPUJPEG_DCT_THREAD_BLOCK_STRIDE, thread_x);
if ( threadIdx.x % 4 == 0 ) {
#pragma unroll
for(int i = 0; i < GPUJPEG_BLOCK_SIZE; i++)
((uint32_t*)block_byte_ptr)[i * (GPUJPEG_DCT_THREAD_BLOCK_STRIDE / 4)] = ((uint32_t*)source)[i * (source_stride / 4)];
}
source = block_byte_ptr;
source_stride = GPUJPEG_DCT_THREAD_BLOCK_STRIDE;
#endif
#pragma unroll
for(int i = 0; i < GPUJPEG_BLOCK_SIZE; i++) {
int16_t coefficient = (int16_t)(source[i * source_stride]);
coefficient -= 128;
block_ptr[i * GPUJPEG_DCT_THREAD_BLOCK_STRIDE] = coefficient;
}
}
// Perform DCT
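// (first pass: 1D DCT of each column, accessed through the permuted x
// offsets; second pass: 1D DCT of each row via packed 32-bit accesses)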
__syncthreads();
gpujpeg_dct_gpu_kernel_inplace(block + thread_y * GPUJPEG_DCT_THREAD_BLOCK_STRIDE + thread_x_permutated, GPUJPEG_DCT_THREAD_BLOCK_STRIDE);
__syncthreads();
gpujpeg_dct_gpu_kernel_inplace((uint32_t*)(block + (thread_y + threadIdx.x) * GPUJPEG_DCT_THREAD_BLOCK_STRIDE + threadIdx.y * GPUJPEG_BLOCK_SIZE));
__syncthreads();
// Quantization
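// The device table is expected to hold quantization-step reciprocals
// pre-scaled by 2^15, so (c * q + 16384) / 32767 rounds c / step to the
// nearest integer; the sign is split off to keep rounding symmetric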
for(int i = 0; i < GPUJPEG_BLOCK_SIZE; i++) {
uint16_t quantization = quantization_table[i * GPUJPEG_BLOCK_SIZE + threadIdx.x];
int coefficient = block_ptr[i * GPUJPEG_DCT_THREAD_BLOCK_STRIDE];
if ( coefficient < 0 ) {
coefficient = -coefficient;
coefficient = (coefficient * quantization + 16384) / 32767;
coefficient = -coefficient;
} else {
coefficient = (coefficient * quantization + 16384) / 32767;
}
block_ptr[i * GPUJPEG_DCT_THREAD_BLOCK_STRIDE] = coefficient;
}
__syncthreads();
// Determine position in output buffer and apply it
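// (each 8x8 block is stored linearized: 64 consecutive coefficients per
// block, one block-row of the image per row of the output buffer)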
int output_x = IMAD(IMAD(blockIdx.x, GPUJPEG_DCT_BLOCK_COUNT_X, threadIdx.y), GPUJPEG_BLOCK_SQUARED_SIZE, threadIdx.x * 2);
int output_y = IMAD(blockIdx.y, GPUJPEG_DCT_BLOCK_COUNT_Y, threadIdx.z);
output += IMAD(output_y, output_stride, output_x);
// Store data to global memory; only half of the threads of each 8x8 block move data (each thread moves 2 shorts)
int16_t* block_store_ptr = block_ptr + threadIdx.x; // Shortcut for "IMAD(..., threadIdx.x * 2)"
if ( threadIdx.x < (GPUJPEG_BLOCK_SIZE / 2) && block_x < block_count_x && block_y < block_count_y ) {
#pragma unroll
for(int i = 0; i < GPUJPEG_BLOCK_SIZE; i++)
((int*)output)[i * (GPUJPEG_BLOCK_SIZE / 2)] = ((int*)block_store_ptr)[i * (GPUJPEG_DCT_THREAD_BLOCK_STRIDE / 2)];
}
}
/** Quantization table in GPU constant memory (used on pre-Fermi devices) */
__constant__ uint16_t gpujpeg_idct_gpu_quantization_table[64];
/**
 * Performs 8x8 block-wise Inverse Discrete Cosine Transform of the given
 * coefficients and outputs the result to the image plane. Short
 * implementation. This kernel processes the image in groups of 8x8 blocks
 * sized to utilize full warp capacity, using 8 threads per 8x8 block.
 *
 * @param block_count_x [IN] - Number of 8x8 blocks along X
 * @param block_count_y [IN] - Number of 8x8 blocks along Y
 * @param source [IN] - Source coefficients
 * @param source_stride [IN] - Stride of source
 * @param output [OUT] - Output image samples
 * @param output_stride [IN] - Stride of output
 * @param quantization_table [IN] - Quantization table
 * @return None
 */
__global__ void
gpujpeg_idct_gpu_kernel(int block_count_x, int block_count_y, int16_t* source, int source_stride,
uint8_t* output, int output_stride, uint16_t* quantization_table)
{
// For pre-Fermi GPUs, reading the quantization table from constant memory is faster
#if __CUDA_ARCH__ < 200
quantization_table = gpujpeg_idct_gpu_quantization_table;
#endif
// Shared data
__shared__ int16_t block[GPUJPEG_DCT_THREAD_BLOCK_HEIGHT * GPUJPEG_DCT_THREAD_BLOCK_STRIDE];
// Block position
int block_x = IMAD(blockIdx.x, GPUJPEG_DCT_BLOCK_COUNT_X, threadIdx.y);
int block_y = IMAD(blockIdx.y, GPUJPEG_DCT_BLOCK_COUNT_Y, threadIdx.z);
// Thread position in thread block
int thread_x = IMAD(threadIdx.y, GPUJPEG_BLOCK_SIZE, threadIdx.x);
int thread_y = IMUL(threadIdx.z, GPUJPEG_BLOCK_SIZE);
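// Same 5-bit rotation of the x index as in the forward kernel, avoiding
// shared memory bank conflicts in the column-wise pass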
int thread_x_permutated = (thread_x & 0xFFFFFFE0) | (((thread_x << 1) | ((thread_x >> 4) & 0x1)) & 0x1F);
// Determine position into shared buffer
int16_t* block_ptr = block + IMAD(thread_y, GPUJPEG_DCT_THREAD_BLOCK_STRIDE, thread_x);
// Determine position in source buffer and apply it
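// (coefficients are stored linearized: 64 consecutive shorts per 8x8 block,
// one block-row of the image per row of the source buffer)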
int source_x = IMAD(block_x, GPUJPEG_BLOCK_SQUARED_SIZE, threadIdx.x * 2);
int source_y = block_y;
source += IMAD(source_y, source_stride, source_x);
// Load data to shared memory; only half of the threads of each 8x8 block move data (each thread moves 2 shorts)
if ( block_x < block_count_x && block_y < block_count_y ) {
int16_t* block_load_ptr = block_ptr + threadIdx.x; // Shortcut for "IMAD(..., threadIdx.x * 2)"
if ( threadIdx.x < (GPUJPEG_BLOCK_SIZE / 2) ) {
#pragma unroll
for(int i = 0; i < GPUJPEG_BLOCK_SIZE; i++)
((int*)block_load_ptr)[i * (GPUJPEG_DCT_THREAD_BLOCK_STRIDE / 2)] = ((int*)source)[i * (GPUJPEG_BLOCK_SIZE / 2)];
}
}
__syncthreads();
// Dequantization: multiply each coefficient by its quantization step
for(int i = 0; i < GPUJPEG_BLOCK_SIZE; i++) {
int16_t quantization = quantization_table[i * GPUJPEG_BLOCK_SIZE + threadIdx.x];
int16_t coefficient = block_ptr[i * GPUJPEG_DCT_THREAD_BLOCK_STRIDE];
coefficient = coefficient * quantization;
block_ptr[i * GPUJPEG_DCT_THREAD_BLOCK_STRIDE] = coefficient;
}
// Perform IDCT
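// (first pass: 1D IDCT of each column through the permuted x offsets;
// second pass: 1D IDCT of each row via packed 32-bit accesses)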
__syncthreads();
gpujpeg_idct_gpu_kernel_inplace(block + thread_y * GPUJPEG_DCT_THREAD_BLOCK_STRIDE + thread_x_permutated, GPUJPEG_DCT_THREAD_BLOCK_STRIDE);
__syncthreads();
gpujpeg_idct_gpu_kernel_inplace((uint32_t*)(block + (thread_y + threadIdx.x) * GPUJPEG_DCT_THREAD_BLOCK_STRIDE + threadIdx.y * GPUJPEG_BLOCK_SIZE));
__syncthreads();
// Determine position in output buffer and apply it
int output_x = IMAD(blockIdx.x, GPUJPEG_DCT_THREAD_BLOCK_WIDTH, thread_x);
int output_y = IMAD(blockIdx.y, GPUJPEG_DCT_THREAD_BLOCK_HEIGHT, thread_y);
output += IMAD(output_y, output_stride, output_x);
// For pre-Fermi GPUs, storing to global memory in 4-byte words is faster
#if __CUDA_ARCH__ < 200
__shared__ uint8_t block_byte[GPUJPEG_DCT_THREAD_BLOCK_HEIGHT * GPUJPEG_DCT_THREAD_BLOCK_STRIDE];
uint8_t* block_byte_ptr = block_byte + IMAD(thread_y, GPUJPEG_DCT_THREAD_BLOCK_STRIDE, thread_x);
uint8_t* __output = output;
int __output_stride = output_stride;
output = block_byte_ptr;
output_stride = GPUJPEG_DCT_THREAD_BLOCK_STRIDE;
#endif
// Store data to global memory
if ( block_x < block_count_x && block_y < block_count_y ) {
#pragma unroll
for(int i = 0; i < GPUJPEG_BLOCK_SIZE; i++) {
int16_t coefficient = block_ptr[i * GPUJPEG_DCT_THREAD_BLOCK_STRIDE];
coefficient += 128;
if ( coefficient > 255 )
coefficient = 255;
if ( coefficient < 0 )
coefficient = 0;
output[i * output_stride] = (uint8_t)coefficient;
}
// For pre-Fermi GPUs, storing to global memory in 4-byte words is faster
#if __CUDA_ARCH__ < 200
if ( threadIdx.x % 4 == 0 ) {
#pragma unroll
for(int i = 0; i < GPUJPEG_BLOCK_SIZE; i++)
((uint32_t*)__output)[i * (__output_stride / 4)] = ((uint32_t*)block_byte_ptr)[i * (GPUJPEG_DCT_THREAD_BLOCK_STRIDE / 4)];
}
#endif
}
}
/** Documented at declaration */
void
gpujpeg_dct_gpu(struct gpujpeg_encoder* encoder)
{
// Get coder
struct gpujpeg_coder* coder = &encoder->coder;
// Encode each component
for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
// Get component
struct gpujpeg_component* component = &coder->component[comp];
// Determine table type
enum gpujpeg_component_type type = (comp == 0) ? GPUJPEG_COMPONENT_LUMINANCE : GPUJPEG_COMPONENT_CHROMINANCE;
int roi_width = component->data_width;
int roi_height = component->data_height;
assert(GPUJPEG_BLOCK_SIZE == 8);
int block_count_x = roi_width / GPUJPEG_BLOCK_SIZE;
int block_count_y = roi_height / GPUJPEG_BLOCK_SIZE;
// Get quantization table
uint16_t* d_quantization_table = encoder->table_quantization[type].d_table;
// Copy quantization table to constant memory
cudaMemcpyToSymbol(
(const char*)gpujpeg_dct_gpu_quantization_table,
d_quantization_table,
64 * sizeof(uint16_t),
0,
cudaMemcpyDeviceToDevice
);
gpujpeg_cuda_check_error("Copy DCT quantization table to constant memory");
// Perform block-wise DCT processing
dim3 dct_grid(
gpujpeg_div_and_round_up(block_count_x, GPUJPEG_DCT_BLOCK_COUNT_X),
gpujpeg_div_and_round_up(block_count_y, GPUJPEG_DCT_BLOCK_COUNT_Y),
1
);
dim3 dct_block(
GPUJPEG_BLOCK_SIZE,
GPUJPEG_DCT_BLOCK_COUNT_X,
GPUJPEG_DCT_BLOCK_COUNT_Y
);
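// 8x4x4 = 128 threads per block: threadIdx.x addresses a column within an
// 8x8 block, threadIdx.y/threadIdx.z select the block inside the 4x4 tile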
gpujpeg_dct_gpu_kernel<<<dct_grid, dct_block>>>(
block_count_x,
block_count_y,
component->d_data,
component->data_width,
component->d_data_quantized,
component->data_width * GPUJPEG_BLOCK_SIZE,
d_quantization_table
);
cudaThreadSynchronize();
gpujpeg_cuda_check_error("Forward Integer DCT failed");
}
}
/** Documented at declaration */
void
gpujpeg_idct_gpu(struct gpujpeg_decoder* decoder)
{
// Get coder
struct gpujpeg_coder* coder = &decoder->coder;
// Decode each component
for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
// Get component
struct gpujpeg_component* component = &coder->component[comp];
// Determine table type
enum gpujpeg_component_type type = (comp == 0) ? GPUJPEG_COMPONENT_LUMINANCE : GPUJPEG_COMPONENT_CHROMINANCE;
int roi_width = component->data_width;
int roi_height = component->data_height;
assert(GPUJPEG_BLOCK_SIZE == 8);
int block_count_x = roi_width / GPUJPEG_BLOCK_SIZE;
int block_count_y = roi_height / GPUJPEG_BLOCK_SIZE;
// Get quantization table
uint16_t* d_quantization_table = decoder->table_quantization[type].d_table;
// Copy quantization table to constant memory
cudaMemcpyToSymbol(
(const char*)gpujpeg_idct_gpu_quantization_table,
d_quantization_table,
64 * sizeof(uint16_t),
0,
cudaMemcpyDeviceToDevice
);
gpujpeg_cuda_check_error("Copy IDCT quantization table to constant memory");
// Perform block-wise IDCT processing
dim3 dct_grid(
gpujpeg_div_and_round_up(block_count_x, GPUJPEG_DCT_BLOCK_COUNT_X),
gpujpeg_div_and_round_up(block_count_y, GPUJPEG_DCT_BLOCK_COUNT_Y),
1
);
dim3 dct_block(
GPUJPEG_BLOCK_SIZE,
GPUJPEG_DCT_BLOCK_COUNT_X,
GPUJPEG_DCT_BLOCK_COUNT_Y
);
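// Same 8x4x4 thread geometry as the forward DCT kernel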
gpujpeg_idct_gpu_kernel<<<dct_grid, dct_block>>>(
block_count_x,
block_count_y,
component->d_data_quantized,
component->data_width * GPUJPEG_BLOCK_SIZE,
component->d_data,
component->data_width,
d_quantization_table
);
cudaThreadSynchronize();
gpujpeg_cuda_check_error("Inverse Integer DCT failed");
}
}