/** * Copyright (c) 2011, CESNET z.s.p.o * Copyright (c) 2011, Silicon Genome, LLC. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/

#include "gpujpeg_huffman_gpu_encoder.h"
#include "gpujpeg_util.h"

/** Threads per block; one thread encodes one segment. Must match the
 *  shared-memory sizing in gpujpeg_huffman_gpu_encoder_encode_block. */
#define THREAD_BLOCK_SIZE 48

#ifdef GPUJPEG_HUFFMAN_CODER_TABLES_IN_CONSTANT
/** Allocate huffman tables in constant memory */
__constant__ struct gpujpeg_table_huffman_encoder gpujpeg_huffman_gpu_encoder_table_huffman[GPUJPEG_COMPONENT_TYPE_COUNT][GPUJPEG_HUFFMAN_TYPE_COUNT];
/** Pass huffman tables to encoder */
extern struct gpujpeg_table_huffman_encoder (*gpujpeg_encoder_table_huffman)[GPUJPEG_COMPONENT_TYPE_COUNT][GPUJPEG_HUFFMAN_TYPE_COUNT] = &gpujpeg_huffman_gpu_encoder_table_huffman;
#endif

/** Natural order in constant memory */
__constant__ int gpujpeg_huffman_gpu_encoder_order_natural[GPUJPEG_ORDER_NATURAL_SIZE];

/**
 * Write one byte to compressed data
 *
 * @param data_compressed  Data compressed
 * @param value  Byte value to write
 * @return void
 */
#define gpujpeg_huffman_gpu_encoder_emit_byte(data_compressed, value) { \
    *data_compressed = (uint8_t)(value); \
    data_compressed++; }

/**
 * Write two bytes to compressed data
 *
 * @param data_compressed  Data compressed
 * @param value  Two-byte value to write
 * @return void
 */
#define gpujpeg_huffman_gpu_encoder_emit_2byte(data_compressed, value) { \
    *data_compressed = (uint8_t)(((value) >> 8) & 0xFF); \
    data_compressed++; \
    *data_compressed = (uint8_t)((value) & 0xFF); \
    data_compressed++; }

/**
 * Write marker to compressed data
 *
 * @param data_compressed  Data compressed
 * @param marker  Marker to write (JPEG_MARKER_...)
 * @return void
 */
#define gpujpeg_huffman_gpu_encoder_marker(data_compressed, marker) { \
    *data_compressed = 0xFF; \
    data_compressed++; \
    *data_compressed = (uint8_t)(marker); \
    data_compressed++; }

/**
 * Output bits to the file. Only the right 24 bits of put_buffer are used;
 * the valid bits are left-justified in this part. At most 16 bits can be
 * passed to EmitBits in one call, and we never retain more than 7 bits
 * in put_buffer between calls, so 24 bits are sufficient.
 *
 * @param code  Huffman code
 * @param size  Size in bits of the Huffman code
 * @param put_value  [in/out] Bit accumulator, left-justified in the low 24 bits
 * @param put_bits  [in/out] Number of valid bits currently held in put_value
 * @param data_compressed  [in/out] Output pointer, advanced as bytes are emitted
 * @return 0 on success, -1 when size == 0 (invalid Huffman table entry)
 */
__device__ inline int
gpujpeg_huffman_gpu_encoder_emit_bits(unsigned int code, int size, int & put_value, int & put_bits, uint8_t* & data_compressed)
{
    // This routine is heavily used, so it's worth coding tightly
    int _put_buffer = (int)code;
    int _put_bits = put_bits;
    // If size is 0, caller used an invalid Huffman table entry
    if ( size == 0 )
        return -1;
    // Mask off any extra bits in code
    _put_buffer &= (((int)1) << size) - 1;
    // New number of bits in buffer
    _put_bits += size;
    // Align incoming bits
    _put_buffer <<= 24 - _put_bits;
    // And merge with old buffer contents
    _put_buffer |= put_value;
    // If there are more than 8 bits, write it out
    unsigned char uc;
    while ( _put_bits >= 8 ) {
        // Write one byte out
        uc = (unsigned char) ((_put_buffer >> 16) & 0xFF);
        gpujpeg_huffman_gpu_encoder_emit_byte(data_compressed, uc);
        // JPEG byte stuffing: a literal 0xFF data byte must be followed by
        // a zero byte so it is not mistaken for a marker
        if ( uc == 0xFF ) {
            // Write zero byte out
            gpujpeg_huffman_gpu_encoder_emit_byte(data_compressed, 0);
        }
        _put_buffer <<= 8;
        _put_bits -= 8;
    }
    // update state variables
    put_value = _put_buffer;
    put_bits = _put_bits;
    return 0;
}

/**
 * Emit left bits, i.e. pad the remaining (< 8) buffered bits with ones and
 * flush them as a final byte, then reset the bit accumulator state.
 *
 * @param put_value  [in/out] Bit accumulator, reset to 0 on return
 * @param put_bits  [in/out] Bit count, reset to 0 on return
 * @param data_compressed  [in/out] Output pointer
 * @return void
 */
__device__ inline void
gpujpeg_huffman_gpu_encoder_emit_left_bits(int & put_value, int & put_bits, uint8_t* & data_compressed)
{
    // Fill 7 bits with ones
    if ( gpujpeg_huffman_gpu_encoder_emit_bits(0x7F, 7, put_value, put_bits, data_compressed) != 0 )
        return;

    //unsigned char uc = (unsigned char) ((put_value >> 16) & 0xFF);
    // Write one byte out
    //gpujpeg_huffman_gpu_encoder_emit_byte(data_compressed, uc);

    put_value = 0;
    put_bits = 0;
}

/**
 * Encode one 8x8 block (ITU-T T.81 section F.1.2: DC difference coding,
 * then AC run-length/magnitude coding in zig-zag order).
 *
 * The block is first staged into shared memory; each thread uses only its own
 * 64-coefficient slice, so no barrier is needed. NOTE(review): the staging
 * buffer is sized for THREAD_BLOCK_SIZE threads and indexed by threadIdx.x,
 * so this assumes blockDim.x == THREAD_BLOCK_SIZE — confirm at call sites.
 *
 * @param put_value  [in/out] Bit accumulator (see emit_bits)
 * @param put_bits  [in/out] Bit count (see emit_bits)
 * @param dc  [in/out] Previous DC value of the component, updated to this block's DC
 * @param data  Pointer to the 64 quantized coefficients of this block (global memory)
 * @param data_compressed  [in/out] Output pointer
 * @param d_table_dc  Huffman table for DC coefficients
 * @param d_table_ac  Huffman table for AC coefficients
 * @return 0 if succeeds, otherwise nonzero
 */
__device__ int
gpujpeg_huffman_gpu_encoder_encode_block(int & put_value, int & put_bits, int & dc, int16_t* data, uint8_t* & data_compressed,
                struct gpujpeg_table_huffman_encoder* d_table_dc, struct gpujpeg_table_huffman_encoder* d_table_ac)
{
    // Load the 128-byte block with wide (64-bit) loads
    typedef uint64_t loading_t;
    const int loading_iteration_count = 64 * 2 / sizeof(loading_t);

    // Load block to shared memory
    __shared__ int16_t s_data[64 * THREAD_BLOCK_SIZE];
    for ( int i = 0; i < loading_iteration_count; i++ ) {
        ((loading_t*)s_data)[loading_iteration_count * threadIdx.x + i] = ((loading_t*)data)[i];
    }
    int data_start = 64 * threadIdx.x;

    // Encode the DC coefficient difference per section F.1.2.1
    int temp = s_data[data_start + 0] - dc;
    dc = s_data[data_start + 0];

    int temp2 = temp;
    if ( temp < 0 ) {
        // Temp is abs value of input
        temp = -temp;
        // For a negative input, want temp2 = bitwise complement of abs(input)
        // This code assumes we are on a two's complement machine
        temp2--;
    }

    // Find the number of bits needed for the magnitude of the coefficient
    int nbits = 0;
    while ( temp ) {
        nbits++;
        temp >>= 1;
    }

    // Write category number
    if ( gpujpeg_huffman_gpu_encoder_emit_bits(d_table_dc->code[nbits], d_table_dc->size[nbits], put_value, put_bits, data_compressed) != 0 ) {
        return -1;
    }

    // Write category offset (EmitBits rejects calls with size 0)
    if ( nbits ) {
        if ( gpujpeg_huffman_gpu_encoder_emit_bits((unsigned int) temp2, nbits, put_value, put_bits, data_compressed) != 0 )
            return -1;
    }

    // Encode the AC coefficients per section F.1.2.2 (r = run length of zeros)
    int r = 0;
    for ( int k = 1; k < 64; k++ ) {
        temp = s_data[data_start + gpujpeg_huffman_gpu_encoder_order_natural[k]];
        if ( temp == 0 ) {
            r++;
        }
        else {
            // If run length > 15, must emit special run-length-16 codes (0xF0)
            while ( r > 15 ) {
                if ( gpujpeg_huffman_gpu_encoder_emit_bits(d_table_ac->code[0xF0], d_table_ac->size[0xF0], put_value, put_bits, data_compressed) != 0 )
                    return -1;
                r -= 16;
            }

            temp2 = temp;
            if ( temp < 0 ) {
                // temp is abs value of input
                temp = -temp;
                // This code assumes we are on a two's complement machine
                temp2--;
            }

            // Find the number of bits needed for the magnitude of the coefficient
            // there must be at least one 1 bit
            nbits = 1;
            while ( (temp >>= 1) )
                nbits++;

            // Emit Huffman symbol for run length / number of bits
            int i = (r << 4) + nbits;
            if ( gpujpeg_huffman_gpu_encoder_emit_bits(d_table_ac->code[i], d_table_ac->size[i], put_value, put_bits, data_compressed) != 0 )
                return -1;

            // Write Category offset
            if ( gpujpeg_huffman_gpu_encoder_emit_bits((unsigned int) temp2, nbits, put_value, put_bits, data_compressed) != 0 )
                return -1;

            r = 0;
        }
    }

    // If all the left coefs were zero, emit an end-of-block code
    if ( r > 0 ) {
        if ( gpujpeg_huffman_gpu_encoder_emit_bits(d_table_ac->code[0], d_table_ac->size[0], put_value, put_bits, data_compressed) != 0 )
            return -1;
    }
    return 0;
}

/**
 * Huffman encoder kernel
 *
 * Launch layout: 1D grid, blockDim.x = THREAD_BLOCK_SIZE; each thread encodes
 * one whole segment (out-of-range threads return early). Static shared memory
 * usage comes from the staging buffer in encode_block.
 *
 * @return void
 */
__global__ void
gpujpeg_huffman_encoder_encode_kernel(
    struct gpujpeg_component* d_component,
    struct gpujpeg_segment* d_segment,
    int comp_count,
    int segment_count,
    uint8_t* d_data_compressed
#ifndef GPUJPEG_HUFFMAN_CODER_TABLES_IN_CONSTANT
    ,struct gpujpeg_table_huffman_encoder* d_table_y_dc
    ,struct gpujpeg_table_huffman_encoder* d_table_y_ac
    ,struct gpujpeg_table_huffman_encoder* d_table_cbcr_dc
    ,struct gpujpeg_table_huffman_encoder* d_table_cbcr_ac
#endif
)
{
#ifdef GPUJPEG_HUFFMAN_CODER_TABLES_IN_CONSTANT
    // Get huffman tables from constant memory
    struct gpujpeg_table_huffman_encoder* d_table_y_dc = &gpujpeg_huffman_gpu_encoder_table_huffman[GPUJPEG_COMPONENT_LUMINANCE][GPUJPEG_HUFFMAN_DC];
    struct gpujpeg_table_huffman_encoder* d_table_y_ac = &gpujpeg_huffman_gpu_encoder_table_huffman[GPUJPEG_COMPONENT_LUMINANCE][GPUJPEG_HUFFMAN_AC];
    struct gpujpeg_table_huffman_encoder* d_table_cbcr_dc = &gpujpeg_huffman_gpu_encoder_table_huffman[GPUJPEG_COMPONENT_CHROMINANCE][GPUJPEG_HUFFMAN_DC];
    struct gpujpeg_table_huffman_encoder* d_table_cbcr_ac = &gpujpeg_huffman_gpu_encoder_table_huffman[GPUJPEG_COMPONENT_CHROMINANCE][GPUJPEG_HUFFMAN_AC];
#endif

    int segment_index = blockIdx.x * blockDim.x + threadIdx.x;
    if ( segment_index >= segment_count )
        return;

    struct gpujpeg_segment* segment = &d_segment[segment_index];

    // Initialize huffman coder
    int put_value = 0;
    int put_bits = 0;
    int dc[GPUJPEG_MAX_COMPONENT_COUNT];
    for ( int comp = 0; comp < GPUJPEG_MAX_COMPONENT_COUNT; comp++ )
        dc[comp] = 0;

    // Prepare data pointers
    uint8_t* data_compressed = &d_data_compressed[segment->data_compressed_index];
    uint8_t* data_compressed_start = data_compressed;

    // Non-interleaving mode
    if ( comp_count == 1 ) {
        int segment_index = segment->scan_segment_index;
        // Encode MCUs in segment
        for ( int mcu_index = 0; mcu_index < segment->mcu_count; mcu_index++ ) {
            // Get component for current scan
            struct gpujpeg_component* component = &d_component[segment->scan_index];
            // Get component data for MCU
            int16_t* block = &component->d_data_quantized[(segment_index * component->segment_mcu_count + mcu_index) * component->mcu_size];
            // Get coder parameters
            int & component_dc = dc[segment->scan_index];
            // Get huffman tables
            struct gpujpeg_table_huffman_encoder* d_table_dc = NULL;
            struct gpujpeg_table_huffman_encoder* d_table_ac = NULL;
            if ( component->type == GPUJPEG_COMPONENT_LUMINANCE ) {
                d_table_dc = d_table_y_dc;
                d_table_ac = d_table_y_ac;
            } else {
                d_table_dc = d_table_cbcr_dc;
                d_table_ac = d_table_cbcr_ac;
            }
            // Encode 8x8 block
            if ( gpujpeg_huffman_gpu_encoder_encode_block(put_value, put_bits, component_dc, block, data_compressed, d_table_dc, d_table_ac) != 0 )
                break;
        }
    }
    // Interleaving mode
    else {
        int segment_index = segment->scan_segment_index;
        // Encode MCUs in segment
        for ( int mcu_index = 0; mcu_index < segment->mcu_count; mcu_index++ ) {
            //assert(segment->scan_index == 0);
            for ( int comp = 0; comp < comp_count; comp++ ) {
                struct gpujpeg_component* component = &d_component[comp];

                // Prepare mcu indexes
                int mcu_index_x = (segment_index * component->segment_mcu_count + mcu_index) % component->mcu_count_x;
                int mcu_index_y = (segment_index * component->segment_mcu_count + mcu_index) / component->mcu_count_x;
                // Compute base data index
                int data_index_base = mcu_index_y * (component->mcu_size * component->mcu_count_x) + mcu_index_x * (component->mcu_size_x * GPUJPEG_BLOCK_SIZE);

                // For all vertical 8x8 blocks
                for ( int y = 0; y < component->sampling_factor.vertical; y++ ) {
                    // Compute base row data index
                    int data_index_row = data_index_base + y * (component->mcu_count_x * component->mcu_size_x * GPUJPEG_BLOCK_SIZE);
                    // For all horizontal 8x8 blocks
                    for ( int x = 0; x < component->sampling_factor.horizontal; x++ ) {
                        // Compute 8x8 block data index
                        int data_index = data_index_row + x * GPUJPEG_BLOCK_SIZE * GPUJPEG_BLOCK_SIZE;

                        // Get component data for MCU
                        int16_t* block = &component->d_data_quantized[data_index];
                        // Get coder parameters
                        int & component_dc = dc[comp];
                        // Get huffman tables
                        struct gpujpeg_table_huffman_encoder* d_table_dc = NULL;
                        struct gpujpeg_table_huffman_encoder* d_table_ac = NULL;
                        if ( component->type == GPUJPEG_COMPONENT_LUMINANCE ) {
                            d_table_dc = d_table_y_dc;
                            d_table_ac = d_table_y_ac;
                        } else {
                            d_table_dc = d_table_cbcr_dc;
                            d_table_ac = d_table_cbcr_ac;
                        }
                        // Encode 8x8 block
                        gpujpeg_huffman_gpu_encoder_encode_block(put_value, put_bits, component_dc, block, data_compressed, d_table_dc, d_table_ac);
                    }
                }
            }
        }
    }

    // Emit left bits
    if ( put_bits > 0 )
        gpujpeg_huffman_gpu_encoder_emit_left_bits(put_value, put_bits, data_compressed);

    // Output restart marker
    int restart_marker = GPUJPEG_MARKER_RST0 + (segment->scan_segment_index % 8);
    gpujpeg_huffman_gpu_encoder_marker(data_compressed, restart_marker);

    // Set compressed size
    segment->data_compressed_size = data_compressed - data_compressed_start;
}

/** Documented at declaration */
int
gpujpeg_huffman_gpu_encoder_init()
{
    // Copy natural order to constant device memory
    cudaMemcpyToSymbol(
        (const char*)gpujpeg_huffman_gpu_encoder_order_natural,
        gpujpeg_order_natural,
        GPUJPEG_ORDER_NATURAL_SIZE * sizeof(int),
        0,
        cudaMemcpyHostToDevice
    );
    gpujpeg_cuda_check_error("Huffman encoder init");

    return 0;
}

/** Documented at declaration */
int
gpujpeg_huffman_gpu_encoder_encode(struct gpujpeg_encoder* encoder)
{
    // Get coder
    struct gpujpeg_coder* coder = &encoder->coder;

    assert(coder->param.restart_interval > 0);
    int comp_count = 1;
    if ( coder->param.interleaved == 1 )
        comp_count = coder->param_image.comp_count;
    assert(comp_count >= 1 && comp_count <= GPUJPEG_MAX_COMPONENT_COUNT);

    // Run kernel: one thread per segment, THREAD_BLOCK_SIZE threads per block.
    // BUGFIX: the execution configuration was missing (the launch read "<<>>",
    // which does not compile) — restored <<<grid, thread>>>.
    dim3 thread(THREAD_BLOCK_SIZE);
    dim3 grid(gpujpeg_div_and_round_up(coder->segment_count, thread.x));
    gpujpeg_huffman_encoder_encode_kernel<<<grid, thread>>>(
        coder->d_component,
        coder->d_segment,
        comp_count,
        coder->segment_count,
        coder->d_data_compressed
#ifndef GPUJPEG_HUFFMAN_CODER_TABLES_IN_CONSTANT
        ,encoder->d_table_huffman[GPUJPEG_COMPONENT_LUMINANCE][GPUJPEG_HUFFMAN_DC]
        ,encoder->d_table_huffman[GPUJPEG_COMPONENT_LUMINANCE][GPUJPEG_HUFFMAN_AC]
        ,encoder->d_table_huffman[GPUJPEG_COMPONENT_CHROMINANCE][GPUJPEG_HUFFMAN_DC]
        ,encoder->d_table_huffman[GPUJPEG_COMPONENT_CHROMINANCE][GPUJPEG_HUFFMAN_AC]
#endif
    );
    // NOTE(review): cudaThreadSynchronize() is deprecated in favor of
    // cudaDeviceSynchronize(); kept as-is for old-toolkit compatibility.
    cudaThreadSynchronize();
    gpujpeg_cuda_check_error("Huffman encoding failed");

    return 0;
}