/**
 * Copyright (c) 2011, CESNET z.s.p.o
 * Copyright (c) 2011, Silicon Genome, LLC.
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpujpeg_dct_gpu.h"
#include "gpujpeg_util.h"

/*
 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 */

/** Fast integer multiplication */
#define FMUL(x, y)   (__mul24(x, y))
//#define FMUL(x, y) ((x) * (y))

// X block count processed by one thread block
#define GPUJPEG_DCT_BLOCK_COUNT_X       4
// Y block count processed by one thread block
#define GPUJPEG_DCT_BLOCK_COUNT_Y       4

// Thread block width
#define GPUJPEG_DCT_THREAD_BLOCK_WIDTH  (GPUJPEG_BLOCK_SIZE * GPUJPEG_DCT_BLOCK_COUNT_X)
// Thread block height
#define GPUJPEG_DCT_THREAD_BLOCK_HEIGHT (GPUJPEG_BLOCK_SIZE * GPUJPEG_DCT_BLOCK_COUNT_Y)

// Stride of shared memory buffer (short kernel)
#define GPUJPEG_DCT_THREAD_BLOCK_STRIDE (GPUJPEG_DCT_THREAD_BLOCK_WIDTH + 4)

#define IMAD(a, b, c) (((a) * (b)) + (c))
#define IMUL(a, b)    ((a) * (b))

#define SIN_1_4     0x5A82
#define COS_1_4     0x5A82
#define SIN_1_8     0x30FC
#define COS_1_8     0x7642

#define OSIN_1_16   0x063E
#define OSIN_3_16   0x11C7
#define OSIN_5_16   0x1A9B
#define OSIN_7_16   0x1F63

#define OCOS_1_16   0x1F63
#define OCOS_3_16   0x1A9B
#define OCOS_5_16   0x11C7
#define OCOS_7_16   0x063E

/**
 * Package of 2 shorts in 1 int - designed to perform I/O by integers to avoid bank conflicts
 */
union PackedInteger
{
    struct __align__(8)
    {
        int16_t hShort1;
        int16_t hShort2;
    };
    int32_t hInt;
};

/**
 * Converts fixed-point value to short value
 */
__device__ inline int16_t
unfixh(int x)
{
    return (int16_t)((x + 0x8000) >> 16);
}

/**
 * Converts fixed-point value to intermediate format with rounding
 */
__device__ inline int
unfixo(int x)
{
    return (x + 0x1000) >> 13;
}
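/*
 * Editorial note (not part of the original source): the constants above are
 * Q15 fixed-point encodings of sine/cosine values, e.g.
 * SIN_1_4 = 0x5A82 = 23170 ~= sin(pi/4) * 2^15. A minimal host-side sketch of
 * how such values could be derived (hypothetical helper, for illustration):
 *
 *     #include <math.h>
 *     static int to_q15(double v)
 *     {
 *         return (int)(v * 32768.0 + 0.5);  // round to nearest Q15 value
 *     }
 *     // to_q15(sin(M_PI / 4.0)) == 0x5A82  -> SIN_1_4 / COS_1_4
 *     // to_q15(sin(M_PI / 8.0)) == 0x30FC  -> SIN_1_8
 *
 * The O-prefixed constants are the same values in Q13 (scaled by 2^13 instead
 * of 2^15), matching the ">> 13" in unfixo() and the "<<= 2" pre-scaling in
 * the butterflies below.
 */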
/**
 * Performs in-place DCT of vector of 8 elements (used to access columns in shared memory).
 *
 * @param SrcDst [IN/OUT] - Pointer to the first element of vector
 * @param Stride [IN] - Value to add to ptr to access other elements
 * @return None
 */
__device__ void
gpujpeg_dct_gpu_kernel_inplace(int16_t* SrcDst, int Stride)
{
    int in0, in1, in2, in3, in4, in5, in6, in7;
    int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int tmp10, tmp11, tmp12, tmp13;
    int tmp14, tmp15, tmp16, tmp17;
    int tmp25, tmp26;

    int DoubleStride = Stride << 1;

    // Load the eight column elements
    int16_t* DstPtr = SrcDst;
    in0 = *DstPtr;
    DstPtr += Stride;
    in1 = *DstPtr;
    DstPtr += Stride;
    in2 = *DstPtr;
    DstPtr += Stride;
    in3 = *DstPtr;
    DstPtr += Stride;
    in4 = *DstPtr;
    DstPtr += Stride;
    in5 = *DstPtr;
    DstPtr += Stride;
    in6 = *DstPtr;
    DstPtr += Stride;
    in7 = *DstPtr;

    tmp0 = in7 + in0;
    tmp1 = in6 + in1;
    tmp2 = in5 + in2;
    tmp3 = in4 + in3;
    tmp4 = in3 - in4;
    tmp5 = in2 - in5;
    tmp6 = in1 - in6;
    tmp7 = in0 - in7;

    tmp10 = tmp3 + tmp0;
    tmp11 = tmp2 + tmp1;
    tmp12 = tmp1 - tmp2;
    tmp13 = tmp0 - tmp3;

    tmp16 = unfixo(FMUL(tmp6 + tmp5, SIN_1_4));
    tmp15 = unfixo(FMUL(tmp6 - tmp5, COS_1_4));

    tmp4 <<= 2;
    tmp7 <<= 2;

    tmp14 = tmp4 + tmp15;
    tmp25 = tmp4 - tmp15;
    tmp26 = tmp7 - tmp16;
    tmp17 = tmp7 + tmp16;

    // Store even coefficients (0, 2, 4, 6)
    DstPtr = SrcDst;
    *DstPtr = unfixh(FMUL(tmp10 + tmp11, SIN_1_4));
    DstPtr += DoubleStride;
    *DstPtr = unfixh(FMUL(tmp13, COS_1_8) + FMUL(tmp12, SIN_1_8));
    DstPtr += DoubleStride;
    *DstPtr = unfixh(FMUL(tmp10 - tmp11, COS_1_4));
    DstPtr += DoubleStride;
    *DstPtr = unfixh(FMUL(tmp13, SIN_1_8) - FMUL(tmp12, COS_1_8));

    // Store odd coefficients (1, 3, 5, 7)
    DstPtr = SrcDst + Stride;
    *DstPtr = unfixh(FMUL(tmp17, OCOS_1_16) + FMUL(tmp14, OSIN_1_16));
    DstPtr += DoubleStride;
    *DstPtr = unfixh(FMUL(tmp26, OCOS_3_16) - FMUL(tmp25, OSIN_3_16));
    DstPtr += DoubleStride;
    *DstPtr = unfixh(FMUL(tmp26, OCOS_5_16) + FMUL(tmp25, OSIN_5_16));
    DstPtr += DoubleStride;
    *DstPtr = unfixh(FMUL(tmp17, OCOS_7_16) - FMUL(tmp14, OSIN_7_16));
}
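/*
 * Editorial note (not part of the original source): the in-place routines
 * compute the standard 8-point DCT-II used by JPEG,
 *
 *     F(k) = (c(k) / 2) * sum_{n=0..7} f(n) * cos((2n + 1) * k * pi / 16),
 *     c(0) = 1/sqrt(2), c(k) = 1 otherwise,
 *
 * factored into a butterfly network so each output needs only a few Q15/Q13
 * multiplications instead of the naive eight per coefficient.
 */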
/**
 * Performs in-place DCT of vector of 8 elements (used to access rows in shared memory).
 *
 * @param V8 [IN/OUT] - Pointer to the first two elements of vector
 * @return None
 */
__device__ void
gpujpeg_dct_gpu_kernel_inplace(uint32_t* V8)
{
    int in0, in1, in2, in3, in4, in5, in6, in7;
    int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int tmp10, tmp11, tmp12, tmp13;
    int tmp14, tmp15, tmp16, tmp17;
    int tmp25, tmp26;

    PackedInteger sh0, sh1, sh2, sh3;

    // Load the row as four packed integers (two elements each)
    sh0.hInt = V8[0];
    sh1.hInt = V8[1];
    sh2.hInt = V8[2];
    sh3.hInt = V8[3];
    in0 = sh0.hShort1;
    in1 = sh0.hShort2;
    in2 = sh1.hShort1;
    in3 = sh1.hShort2;
    in4 = sh2.hShort1;
    in5 = sh2.hShort2;
    in6 = sh3.hShort1;
    in7 = sh3.hShort2;

    tmp0 = in7 + in0;
    tmp1 = in6 + in1;
    tmp2 = in5 + in2;
    tmp3 = in4 + in3;
    tmp4 = in3 - in4;
    tmp5 = in2 - in5;
    tmp6 = in1 - in6;
    tmp7 = in0 - in7;

    tmp10 = tmp3 + tmp0;
    tmp11 = tmp2 + tmp1;
    tmp12 = tmp1 - tmp2;
    tmp13 = tmp0 - tmp3;

    sh0.hShort1 = unfixh(FMUL(tmp10 + tmp11, SIN_1_4));
    sh2.hShort1 = unfixh(FMUL(tmp10 - tmp11, COS_1_4));
    sh1.hShort1 = unfixh(FMUL(tmp13, COS_1_8) + FMUL(tmp12, SIN_1_8));
    sh3.hShort1 = unfixh(FMUL(tmp13, SIN_1_8) - FMUL(tmp12, COS_1_8));

    tmp16 = unfixo(FMUL(tmp6 + tmp5, SIN_1_4));
    tmp15 = unfixo(FMUL(tmp6 - tmp5, COS_1_4));

    tmp4 <<= 2;
    tmp7 <<= 2;

    tmp14 = tmp4 + tmp15;
    tmp25 = tmp4 - tmp15;
    tmp26 = tmp7 - tmp16;
    tmp17 = tmp7 + tmp16;

    sh0.hShort2 = unfixh(FMUL(tmp17, OCOS_1_16) + FMUL(tmp14, OSIN_1_16));
    sh3.hShort2 = unfixh(FMUL(tmp17, OCOS_7_16) - FMUL(tmp14, OSIN_7_16));
    sh2.hShort2 = unfixh(FMUL(tmp26, OCOS_5_16) + FMUL(tmp25, OSIN_5_16));
    sh1.hShort2 = unfixh(FMUL(tmp26, OCOS_3_16) - FMUL(tmp25, OSIN_3_16));

    V8[0] = sh0.hInt;
    V8[1] = sh1.hInt;
    V8[2] = sh2.hInt;
    V8[3] = sh3.hInt;
}

/**
 * Performs in-place IDCT of vector of 8 elements (used to access columns in shared memory).
 *
 * @param SrcDst [IN/OUT] - Pointer to the first element of vector
 * @param Stride [IN] - Value to add to ptr to access other elements
 * @return None
 */
__device__ void
gpujpeg_idct_gpu_kernel_inplace(int16_t* SrcDst, int Stride)
{
    int in0, in1, in2, in3, in4, in5, in6, in7;
    int tmp10, tmp11, tmp12, tmp13;
    int tmp20, tmp21, tmp22, tmp23;
    int tmp30, tmp31;
    int tmp40, tmp41, tmp42, tmp43;
    int tmp50, tmp51, tmp52, tmp53;

    // Load the eight column elements
    int16_t* DstPtr = SrcDst;
    in0 = *DstPtr;
    DstPtr += Stride;
    in1 = *DstPtr;
    DstPtr += Stride;
    in2 = *DstPtr;
    DstPtr += Stride;
    in3 = *DstPtr;
    DstPtr += Stride;
    in4 = *DstPtr;
    DstPtr += Stride;
    in5 = *DstPtr;
    DstPtr += Stride;
    in6 = *DstPtr;
    DstPtr += Stride;
    in7 = *DstPtr;

    tmp10 = FMUL(in0 + in4, COS_1_4);
    tmp11 = FMUL(in0 - in4, COS_1_4);
    tmp12 = FMUL(in2, SIN_1_8) - FMUL(in6, COS_1_8);
    tmp13 = FMUL(in6, SIN_1_8) + FMUL(in2, COS_1_8);

    tmp20 = tmp10 + tmp13;
    tmp21 = tmp11 + tmp12;
    tmp22 = tmp11 - tmp12;
    tmp23 = tmp10 - tmp13;

    tmp30 = unfixo(FMUL(in3 + in5, COS_1_4));
    tmp31 = unfixo(FMUL(in3 - in5, COS_1_4));

    in1 <<= 2;
    in7 <<= 2;

    tmp40 = in1 + tmp30;
    tmp41 = in7 + tmp31;
    tmp42 = in1 - tmp30;
    tmp43 = in7 - tmp31;

    tmp50 = FMUL(tmp40, OCOS_1_16) + FMUL(tmp41, OSIN_1_16);
    tmp51 = FMUL(tmp40, OSIN_1_16) - FMUL(tmp41, OCOS_1_16);
    tmp52 = FMUL(tmp42, OCOS_5_16) + FMUL(tmp43, OSIN_5_16);
    tmp53 = FMUL(tmp42, OSIN_5_16) - FMUL(tmp43, OCOS_5_16);

    DstPtr = SrcDst;
    *DstPtr = unfixh(tmp20 + tmp50);
    DstPtr += Stride;
    *DstPtr = unfixh(tmp21 + tmp53);
    DstPtr += Stride;
    *DstPtr = unfixh(tmp22 + tmp52);
    DstPtr += Stride;
    *DstPtr = unfixh(tmp23 + tmp51);
    DstPtr += Stride;
    *DstPtr = unfixh(tmp23 - tmp51);
    DstPtr += Stride;
    *DstPtr = unfixh(tmp22 - tmp52);
    DstPtr += Stride;
    *DstPtr = unfixh(tmp21 - tmp53);
    DstPtr += Stride;
    *DstPtr = unfixh(tmp20 - tmp50);
}
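/*
 * Editorial note (not part of the original source): the uint32_t row variants
 * of these transforms exist because shared memory is organized in 32-bit
 * banks; reading a row as four packed 32-bit words (via PackedInteger)
 * instead of eight 16-bit halfwords keeps neighbouring threads in separate
 * banks. Conceptually (illustrative only):
 *
 *     PackedInteger p;
 *     p.hInt = row32[0];       // one 32-bit shared-memory transaction
 *     int a = p.hShort1;       // low halfword  (element 0)
 *     int b = p.hShort2;       // high halfword (element 1)
 */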
/**
 * Performs in-place IDCT of vector of 8 elements (used to access rows in shared memory).
 *
 * @param V8 [IN/OUT] - Pointer to the first two elements of vector
 * @return None
 */
__device__ void
gpujpeg_idct_gpu_kernel_inplace(uint32_t* V8)
{
    int in0, in1, in2, in3, in4, in5, in6, in7;
    int tmp10, tmp11, tmp12, tmp13;
    int tmp20, tmp21, tmp22, tmp23;
    int tmp30, tmp31;
    int tmp40, tmp41, tmp42, tmp43;
    int tmp50, tmp51, tmp52, tmp53;

    PackedInteger sh0, sh1, sh2, sh3;

    // Load the row as four packed integers (two elements each)
    sh0.hInt = V8[0];
    sh1.hInt = V8[1];
    sh2.hInt = V8[2];
    sh3.hInt = V8[3];
    in0 = sh0.hShort1;
    in1 = sh0.hShort2;
    in2 = sh1.hShort1;
    in3 = sh1.hShort2;
    in4 = sh2.hShort1;
    in5 = sh2.hShort2;
    in6 = sh3.hShort1;
    in7 = sh3.hShort2;

    tmp10 = FMUL(in0 + in4, COS_1_4);
    tmp11 = FMUL(in0 - in4, COS_1_4);
    tmp12 = FMUL(in2, SIN_1_8) - FMUL(in6, COS_1_8);
    tmp13 = FMUL(in6, SIN_1_8) + FMUL(in2, COS_1_8);

    tmp20 = tmp10 + tmp13;
    tmp21 = tmp11 + tmp12;
    tmp22 = tmp11 - tmp12;
    tmp23 = tmp10 - tmp13;

    tmp30 = unfixo(FMUL(in3 + in5, COS_1_4));
    tmp31 = unfixo(FMUL(in3 - in5, COS_1_4));

    in1 <<= 2;
    in7 <<= 2;

    tmp40 = in1 + tmp30;
    tmp41 = in7 + tmp31;
    tmp42 = in1 - tmp30;
    tmp43 = in7 - tmp31;

    tmp50 = FMUL(tmp40, OCOS_1_16) + FMUL(tmp41, OSIN_1_16);
    tmp51 = FMUL(tmp40, OSIN_1_16) - FMUL(tmp41, OCOS_1_16);
    tmp52 = FMUL(tmp42, OCOS_5_16) + FMUL(tmp43, OSIN_5_16);
    tmp53 = FMUL(tmp42, OSIN_5_16) - FMUL(tmp43, OCOS_5_16);

    sh0.hShort1 = unfixh(tmp20 + tmp50);
    sh0.hShort2 = unfixh(tmp21 + tmp53);
    sh1.hShort1 = unfixh(tmp22 + tmp52);
    sh1.hShort2 = unfixh(tmp23 + tmp51);
    sh2.hShort1 = unfixh(tmp23 - tmp51);
    sh2.hShort2 = unfixh(tmp22 - tmp52);
    sh3.hShort1 = unfixh(tmp21 - tmp53);
    sh3.hShort2 = unfixh(tmp20 - tmp50);

    V8[0] = sh0.hInt;
    V8[1] = sh1.hInt;
    V8[2] = sh2.hInt;
    V8[3] = sh3.hInt;
}

/** Quantization table */
__constant__ uint16_t gpujpeg_dct_gpu_quantization_table[64];
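/*
 * Editorial note (not part of the original source): both kernels below are
 * launched with dim3(GPUJPEG_BLOCK_SIZE, GPUJPEG_DCT_BLOCK_COUNT_X,
 * GPUJPEG_DCT_BLOCK_COUNT_Y) = dim3(8, 4, 4), i.e. 128 threads per thread
 * block. Each (threadIdx.y, threadIdx.z) pair owns one 8x8 block and
 * threadIdx.x selects a column (or a packed pair of row elements) within it,
 * so one thread block covers a 32x32 pixel tile.
 */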
/**
 * Performs 8x8 block-wise forward discrete cosine transform of the given
 * image plane and outputs the result to the array of coefficients. Short
 * implementation. This kernel is designed to process the image in tiles of
 * 8x8 blocks that utilize full warp capacity, assuming that 8 threads per
 * 8x8 block are sufficient.
 *
 * @param block_count_x [IN] - Block count in X axis
 * @param block_count_y [IN] - Block count in Y axis
 * @param source [IN] - Source image data
 * @param source_stride [IN] - Stride of source
 * @param output [OUT] - Output coefficients
 * @param output_stride [IN] - Stride of output
 * @param quantization_table [IN] - Quantization table
 * @return None
 */
__global__ void
gpujpeg_dct_gpu_kernel(int block_count_x, int block_count_y, uint8_t* source, int source_stride,
                       int16_t* output, int output_stride, uint16_t* quantization_table)
{
    // For pre-Fermi GPUs, quantization table in constant memory is faster
#if __CUDA_ARCH__ < 200
    quantization_table = gpujpeg_dct_gpu_quantization_table;
#endif

    // Shared data
    __shared__ int16_t block[GPUJPEG_DCT_THREAD_BLOCK_HEIGHT * GPUJPEG_DCT_THREAD_BLOCK_STRIDE];

    // Block position
    int block_x = IMAD(blockIdx.x, GPUJPEG_DCT_BLOCK_COUNT_X, threadIdx.y);
    int block_y = IMAD(blockIdx.y, GPUJPEG_DCT_BLOCK_COUNT_Y, threadIdx.z);

    // Thread position in thread block
    int thread_x = IMAD(threadIdx.y, GPUJPEG_BLOCK_SIZE, threadIdx.x);
    int thread_y = IMUL(threadIdx.z, GPUJPEG_BLOCK_SIZE);
    int thread_x_permutated = (thread_x & 0xFFFFFFE0) | (((thread_x << 1) | ((thread_x >> 4) & 0x1)) & 0x1F);

    // Determine position in shared buffer
    int16_t* block_ptr = block + IMAD(thread_y, GPUJPEG_DCT_THREAD_BLOCK_STRIDE, thread_x);

    // Determine position in source buffer and apply it
    int source_x = IMAD(block_x, GPUJPEG_BLOCK_SIZE, threadIdx.x);
    int source_y = IMUL(block_y, GPUJPEG_BLOCK_SIZE);
    source += IMAD(source_y, source_stride, source_x);

    // Load data to shared memory
    if ( block_x < block_count_x && block_y < block_count_y ) {
        // For pre-Fermi GPUs, loading from global memory by 4 bytes is faster
#if __CUDA_ARCH__ < 200
        __shared__ uint8_t block_byte[GPUJPEG_DCT_THREAD_BLOCK_HEIGHT * GPUJPEG_DCT_THREAD_BLOCK_STRIDE];
        uint8_t* block_byte_ptr = block_byte + IMAD(thread_y, GPUJPEG_DCT_THREAD_BLOCK_STRIDE, thread_x);
        if ( threadIdx.x % 4 == 0 ) {
            #pragma unroll
            for ( int i = 0; i < GPUJPEG_BLOCK_SIZE; i++ )
                ((uint32_t*)block_byte_ptr)[i * (GPUJPEG_DCT_THREAD_BLOCK_STRIDE / 4)] = ((uint32_t*)source)[i * (source_stride / 4)];
        }
        source = block_byte_ptr;
        source_stride = GPUJPEG_DCT_THREAD_BLOCK_STRIDE;
#endif

        #pragma unroll
        for ( int i = 0; i < GPUJPEG_BLOCK_SIZE; i++ ) {
            // Level shift from [0, 255] to [-128, 127] as required by the JPEG FDCT
            int16_t coefficient = (int16_t)(source[i * source_stride]);
            coefficient -= 128;
            block_ptr[i * GPUJPEG_DCT_THREAD_BLOCK_STRIDE] = coefficient;
        }
    }

    // Perform DCT: first on columns, then on rows
    __syncthreads();
    gpujpeg_dct_gpu_kernel_inplace(block + thread_y * GPUJPEG_DCT_THREAD_BLOCK_STRIDE + thread_x_permutated, GPUJPEG_DCT_THREAD_BLOCK_STRIDE);
    __syncthreads();
    gpujpeg_dct_gpu_kernel_inplace((uint32_t*)(block + (thread_y + threadIdx.x) * GPUJPEG_DCT_THREAD_BLOCK_STRIDE + threadIdx.y * GPUJPEG_BLOCK_SIZE));
    __syncthreads();

    // Quantization
    for ( int i = 0; i < GPUJPEG_BLOCK_SIZE; i++ ) {
        uint16_t quantization = quantization_table[i * GPUJPEG_BLOCK_SIZE + threadIdx.x];
        int coefficient = block_ptr[i * GPUJPEG_DCT_THREAD_BLOCK_STRIDE];
        if ( coefficient < 0 ) {
            coefficient = -coefficient;
            coefficient = (coefficient * quantization + 16384) / 32767;
            coefficient = -coefficient;
        } else {
            coefficient = (coefficient * quantization + 16384) / 32767;
        }
        block_ptr[i * GPUJPEG_DCT_THREAD_BLOCK_STRIDE] = coefficient;
    }
    __syncthreads();
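    /*
     * Editorial note (not part of the original source): the device-side table
     * appears to hold fixed-point reciprocals of the JPEG quantization steps,
     * scaled so that "c * table / 32767" approximates "c / step" with
     * rounding. Worked example, assuming step 16 and thus a table entry of
     * about 32767 / 16 = 2048: for c = 100,
     *
     *     (100 * 2048 + 16384) / 32767 = 6    (vs. exact 100 / 16 = 6.25)
     */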
    // Determine position in output buffer and apply it
    int output_x = IMAD(IMAD(blockIdx.x, GPUJPEG_DCT_BLOCK_COUNT_X, threadIdx.y), GPUJPEG_BLOCK_SQUARED_SIZE, threadIdx.x * 2);
    int output_y = IMAD(blockIdx.y, GPUJPEG_DCT_BLOCK_COUNT_Y, threadIdx.z);
    output += IMAD(output_y, output_stride, output_x);

    // Store data to global memory; only half of the threads in each 8x8 block
    // perform the data moving (each thread moves 2 shorts)
    int16_t* block_store_ptr = block_ptr + threadIdx.x; // Shortcut for "IMAD(..., threadIdx.x * 2)"
    if ( threadIdx.x < (GPUJPEG_BLOCK_SIZE / 2) && block_x < block_count_x && block_y < block_count_y ) {
        #pragma unroll
        for ( int i = 0; i < GPUJPEG_BLOCK_SIZE; i++ )
            ((int*)output)[i * (GPUJPEG_BLOCK_SIZE / 2)] = ((int*)block_store_ptr)[i * (GPUJPEG_DCT_THREAD_BLOCK_STRIDE / 2)];
    }
}

/** Quantization table */
__constant__ uint16_t gpujpeg_idct_gpu_quantization_table[64];

/**
 * Performs 8x8 block-wise inverse discrete cosine transform of the given
 * array of coefficients and outputs the result to the image plane. Short
 * implementation. This kernel is designed to process the image in tiles of
 * 8x8 blocks that utilize full warp capacity, assuming that 8 threads per
 * 8x8 block are sufficient.
 *
 * @param block_count_x [IN] - Block count in X axis
 * @param block_count_y [IN] - Block count in Y axis
 * @param source [IN] - Source coefficients
 * @param source_stride [IN] - Stride of source
 * @param output [OUT] - Output image data
 * @param output_stride [IN] - Stride of output
 * @param quantization_table [IN] - Quantization table
 * @return None
 */
__global__ void
gpujpeg_idct_gpu_kernel(int block_count_x, int block_count_y, int16_t* source, int source_stride,
                        uint8_t* output, int output_stride, uint16_t* quantization_table)
{
    // For pre-Fermi GPUs, quantization table in constant memory is faster
#if __CUDA_ARCH__ < 200
    quantization_table = gpujpeg_idct_gpu_quantization_table;
#endif

    // Shared data
    __shared__ int16_t block[GPUJPEG_DCT_THREAD_BLOCK_HEIGHT * GPUJPEG_DCT_THREAD_BLOCK_STRIDE];

    // Block position
    int block_x = IMAD(blockIdx.x, GPUJPEG_DCT_BLOCK_COUNT_X, threadIdx.y);
    int block_y = IMAD(blockIdx.y, GPUJPEG_DCT_BLOCK_COUNT_Y, threadIdx.z);

    // Thread position in thread block
    int thread_x = IMAD(threadIdx.y, GPUJPEG_BLOCK_SIZE, threadIdx.x);
    int thread_y = IMUL(threadIdx.z, GPUJPEG_BLOCK_SIZE);
    int thread_x_permutated = (thread_x & 0xFFFFFFE0) | (((thread_x << 1) | ((thread_x >> 4) & 0x1)) & 0x1F);

    // Determine position in shared buffer
    int16_t* block_ptr = block + IMAD(thread_y, GPUJPEG_DCT_THREAD_BLOCK_STRIDE, thread_x);

    // Determine position in source buffer and apply it
    int source_x = IMAD(block_x, GPUJPEG_BLOCK_SQUARED_SIZE, threadIdx.x * 2);
    int source_y = block_y;
    source += IMAD(source_y, source_stride, source_x);

    // Load data to shared memory; only half of the threads in each 8x8 block
    // perform the data moving (each thread moves 2 shorts)
    if ( block_x < block_count_x && block_y < block_count_y ) {
        int16_t* block_load_ptr = block_ptr + threadIdx.x; // Shortcut for "IMAD(..., threadIdx.x * 2)"
        if ( threadIdx.x < (GPUJPEG_BLOCK_SIZE / 2) ) {
            #pragma unroll
            for ( int i = 0; i < GPUJPEG_BLOCK_SIZE; i++ )
                ((int*)block_load_ptr)[i * (GPUJPEG_DCT_THREAD_BLOCK_STRIDE / 2)] = ((int*)source)[i * (GPUJPEG_BLOCK_SIZE / 2)];
        }
    }
    __syncthreads();

    // Dequantization
    for ( int i = 0; i < GPUJPEG_BLOCK_SIZE; i++ ) {
        int16_t quantization = quantization_table[i * GPUJPEG_BLOCK_SIZE + threadIdx.x];
        int16_t coefficient = block_ptr[i * GPUJPEG_DCT_THREAD_BLOCK_STRIDE];
        coefficient = coefficient * quantization;
        block_ptr[i * GPUJPEG_DCT_THREAD_BLOCK_STRIDE] = coefficient;
    }
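    /*
     * Editorial note (not part of the original source): unlike the forward
     * kernel, the table here appears to hold the plain quantization steps, so
     * dequantization is a single int16 multiply whose result is expected to
     * fit in 16 bits because the forward pass divided by the same steps.
     */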
    // Perform IDCT: first on columns, then on rows
    __syncthreads();
    gpujpeg_idct_gpu_kernel_inplace(block + thread_y * GPUJPEG_DCT_THREAD_BLOCK_STRIDE + thread_x_permutated, GPUJPEG_DCT_THREAD_BLOCK_STRIDE);
    __syncthreads();
    gpujpeg_idct_gpu_kernel_inplace((uint32_t*)(block + (thread_y + threadIdx.x) * GPUJPEG_DCT_THREAD_BLOCK_STRIDE + threadIdx.y * GPUJPEG_BLOCK_SIZE));
    __syncthreads();

    // Determine position in output buffer and apply it
    int output_x = IMAD(blockIdx.x, GPUJPEG_DCT_THREAD_BLOCK_WIDTH, thread_x);
    int output_y = IMAD(blockIdx.y, GPUJPEG_DCT_THREAD_BLOCK_HEIGHT, thread_y);
    output += IMAD(output_y, output_stride, output_x);

    // For pre-Fermi GPUs, storing to global memory by 4 bytes is faster
#if __CUDA_ARCH__ < 200
    __shared__ uint8_t block_byte[GPUJPEG_DCT_THREAD_BLOCK_HEIGHT * GPUJPEG_DCT_THREAD_BLOCK_STRIDE];
    uint8_t* block_byte_ptr = block_byte + IMAD(thread_y, GPUJPEG_DCT_THREAD_BLOCK_STRIDE, thread_x);
    uint8_t* __output = output;
    int __output_stride = output_stride;
    output = block_byte_ptr;
    output_stride = GPUJPEG_DCT_THREAD_BLOCK_STRIDE;
#endif

    // Store data to global memory
    if ( block_x < block_count_x && block_y < block_count_y ) {
        #pragma unroll
        for ( int i = 0; i < GPUJPEG_BLOCK_SIZE; i++ ) {
            // Level shift back to [0, 255] and clamp
            int16_t coefficient = block_ptr[i * GPUJPEG_DCT_THREAD_BLOCK_STRIDE];
            coefficient += 128;
            if ( coefficient > 255 )
                coefficient = 255;
            if ( coefficient < 0 )
                coefficient = 0;
            output[i * output_stride] = (uint8_t)coefficient;
        }

        // For pre-Fermi GPUs, storing to global memory by 4 bytes is faster
#if __CUDA_ARCH__ < 200
        if ( threadIdx.x % 4 == 0 ) {
            #pragma unroll
            for ( int i = 0; i < GPUJPEG_BLOCK_SIZE; i++ )
                ((uint32_t*)__output)[i * (__output_stride / 4)] = ((uint32_t*)block_byte_ptr)[i * (GPUJPEG_DCT_THREAD_BLOCK_STRIDE / 4)];
        }
#endif
    }
}

/** Documented at declaration */
void
gpujpeg_dct_gpu(struct gpujpeg_encoder* encoder)
{
    // Get coder
    struct gpujpeg_coder* coder = &encoder->coder;

    // Encode each component
    for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
        // Get component
        struct gpujpeg_component* component = &coder->component[comp];

        // Determine table type
        enum gpujpeg_component_type type = (comp == 0) ? GPUJPEG_COMPONENT_LUMINANCE : GPUJPEG_COMPONENT_CHROMINANCE;

        int roi_width = component->data_width;
        int roi_height = component->data_height;
        assert(GPUJPEG_BLOCK_SIZE == 8);

        int block_count_x = roi_width / GPUJPEG_BLOCK_SIZE;
        int block_count_y = roi_height / GPUJPEG_BLOCK_SIZE;

        // Get quantization table
        uint16_t* d_quantization_table = encoder->table_quantization[type].d_table;

        // Copy quantization table to constant memory
        cudaMemcpyToSymbol(
            (const char*)gpujpeg_dct_gpu_quantization_table,
            d_quantization_table,
            64 * sizeof(uint16_t),
            0,
            cudaMemcpyDeviceToDevice
        );
        gpujpeg_cuda_check_error("Copy DCT quantization table to constant memory");

        // Perform block-wise DCT processing
        dim3 dct_grid(
            gpujpeg_div_and_round_up(block_count_x, GPUJPEG_DCT_BLOCK_COUNT_X),
            gpujpeg_div_and_round_up(block_count_y, GPUJPEG_DCT_BLOCK_COUNT_Y),
            1
        );
        dim3 dct_block(
            GPUJPEG_BLOCK_SIZE,
            GPUJPEG_DCT_BLOCK_COUNT_X,
            GPUJPEG_DCT_BLOCK_COUNT_Y
        );
        gpujpeg_dct_gpu_kernel<<<dct_grid, dct_block>>>(
            block_count_x,
            block_count_y,
            component->d_data,
            component->data_width,
            component->d_data_quantized,
            component->data_width * GPUJPEG_BLOCK_SIZE,
            d_quantization_table
        );
        cudaThreadSynchronize();
        gpujpeg_cuda_check_error("Forward Integer DCT failed");
    }
}
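/*
 * Editorial note (not part of the original source): a worked sizing example.
 * For a 1920x1080 luminance plane, block_count_x = 1920 / 8 = 240 and
 * block_count_y = 1080 / 8 = 135, so the grid is
 * (ceil(240 / 4), ceil(135 / 4)) = (60, 34) thread blocks of 128 threads
 * each; the guard "block_x < block_count_x && block_y < block_count_y" in the
 * kernels masks off the partially filled last row of the grid.
 */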
/** Documented at declaration */
void
gpujpeg_idct_gpu(struct gpujpeg_decoder* decoder)
{
    // Get coder
    struct gpujpeg_coder* coder = &decoder->coder;

    // Decode each component
    for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
        // Get component
        struct gpujpeg_component* component = &coder->component[comp];

        // Determine table type
        enum gpujpeg_component_type type = (comp == 0) ? GPUJPEG_COMPONENT_LUMINANCE : GPUJPEG_COMPONENT_CHROMINANCE;

        int roi_width = component->data_width;
        int roi_height = component->data_height;
        assert(GPUJPEG_BLOCK_SIZE == 8);

        int block_count_x = roi_width / GPUJPEG_BLOCK_SIZE;
        int block_count_y = roi_height / GPUJPEG_BLOCK_SIZE;

        // Get quantization table
        uint16_t* d_quantization_table = decoder->table_quantization[type].d_table;

        // Copy quantization table to constant memory
        cudaMemcpyToSymbol(
            (const char*)gpujpeg_idct_gpu_quantization_table,
            d_quantization_table,
            64 * sizeof(uint16_t),
            0,
            cudaMemcpyDeviceToDevice
        );
        gpujpeg_cuda_check_error("Copy IDCT quantization table to constant memory");

        // Perform block-wise IDCT processing
        dim3 dct_grid(
            gpujpeg_div_and_round_up(block_count_x, GPUJPEG_DCT_BLOCK_COUNT_X),
            gpujpeg_div_and_round_up(block_count_y, GPUJPEG_DCT_BLOCK_COUNT_Y),
            1
        );
        dim3 dct_block(
            GPUJPEG_BLOCK_SIZE,
            GPUJPEG_DCT_BLOCK_COUNT_X,
            GPUJPEG_DCT_BLOCK_COUNT_Y
        );
        gpujpeg_idct_gpu_kernel<<<dct_grid, dct_block>>>(
            block_count_x,
            block_count_y,
            component->d_data_quantized,
            component->data_width * GPUJPEG_BLOCK_SIZE,
            component->d_data,
            component->data_width,
            d_quantization_table
        );
        cudaThreadSynchronize();
        gpujpeg_cuda_check_error("Inverse Integer DCT failed");
    }
}
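/*
 * Editorial note (not part of the original source): the "(const char*)" casts
 * in the cudaMemcpyToSymbol() calls above match the pre-CUDA-5.0 API, which
 * took the symbol as a const char*; cudaThreadSynchronize() is likewise the
 * CUDA-4.x-era spelling of today's cudaDeviceSynchronize().
 */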