mirror of
https://github.com/outbackdingo/UltraGrid.git
synced 2026-03-20 21:40:15 +00:00
428 lines
17 KiB
C
428 lines
17 KiB
C
/**
|
|
* Copyright (c) 2011, CESNET z.s.p.o
|
|
* Copyright (c) 2011, Silicon Genome, LLC.
|
|
*
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
*
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#include "gpujpeg_encoder.h"
|
|
#include "gpujpeg_preprocessor.h"
|
|
#include "gpujpeg_dct_cpu.h"
|
|
#include "gpujpeg_dct_gpu.h"
|
|
#include "gpujpeg_huffman_cpu_encoder.h"
|
|
#include "gpujpeg_huffman_gpu_encoder.h"
|
|
#include "gpujpeg_util.h"
|
|
#include <npp.h>
|
|
|
|
#ifdef GPUJPEG_HUFFMAN_CODER_TABLES_IN_CONSTANT
|
|
/** Huffman tables in constant memory */
|
|
struct gpujpeg_table_huffman_encoder (*gpujpeg_encoder_table_huffman)[GPUJPEG_COMPONENT_TYPE_COUNT][GPUJPEG_HUFFMAN_TYPE_COUNT];
|
|
#endif
|
|
|
|
/** Documented at declaration */
|
|
void
|
|
gpujpeg_encoder_input_set_image(struct gpujpeg_encoder_input* input, uint8_t* image)
|
|
{
|
|
input->type = GPUJPEG_ENCODER_INPUT_IMAGE;
|
|
input->image = image;
|
|
input->texture = NULL;
|
|
}
|
|
|
|
/** Documented at declaration */
|
|
void
|
|
gpujpeg_encoder_input_set_texture(struct gpujpeg_encoder_input* input, struct gpujpeg_opengl_texture* texture)
|
|
{
|
|
input->type = GPUJPEG_ENCODER_INPUT_OPENGL_TEXTURE;
|
|
input->image = NULL;
|
|
input->texture = texture;
|
|
}
|
|
|
|
/** Documented at declaration */
|
|
struct gpujpeg_encoder*
|
|
gpujpeg_encoder_create(struct gpujpeg_parameters* param, struct gpujpeg_image_parameters* param_image)
|
|
{
|
|
assert(param_image->comp_count == 3);
|
|
assert(param_image->comp_count <= GPUJPEG_MAX_COMPONENT_COUNT);
|
|
assert(param->quality >= 0 && param->quality <= 100);
|
|
assert(param->restart_interval >= 0);
|
|
assert(param->interleaved == 0 || param->interleaved == 1);
|
|
|
|
struct gpujpeg_encoder* encoder = malloc(sizeof(struct gpujpeg_encoder));
|
|
if ( encoder == NULL )
|
|
return NULL;
|
|
|
|
// Get coder
|
|
struct gpujpeg_coder* coder = &encoder->coder;
|
|
|
|
// Set parameters
|
|
memset(encoder, 0, sizeof(struct gpujpeg_encoder));
|
|
coder->param_image = *param_image;
|
|
coder->param = *param;
|
|
|
|
int result = 1;
|
|
|
|
// Create writer
|
|
encoder->writer = gpujpeg_writer_create(encoder);
|
|
if ( encoder->writer == NULL )
|
|
result = 0;
|
|
|
|
// Initialize coder
|
|
if ( gpujpeg_coder_init(coder) != 0 )
|
|
result = 0;
|
|
|
|
// Init preprocessor
|
|
if ( gpujpeg_preprocessor_encoder_init(&encoder->coder) != 0 ) {
|
|
fprintf(stderr, "Failed to init preprocessor!");
|
|
result = 0;
|
|
}
|
|
|
|
// Allocate quantization tables in device memory
|
|
for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
|
|
if ( cudaSuccess != cudaMalloc((void**)&encoder->table_quantization[comp_type].d_table, 64 * sizeof(uint16_t)) )
|
|
result = 0;
|
|
}
|
|
// Allocate huffman tables in device memory
|
|
for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
|
|
for ( int huff_type = 0; huff_type < GPUJPEG_HUFFMAN_TYPE_COUNT; huff_type++ ) {
|
|
if ( cudaSuccess != cudaMalloc((void**)&encoder->d_table_huffman[comp_type][huff_type], sizeof(struct gpujpeg_table_huffman_encoder)) )
|
|
result = 0;
|
|
}
|
|
}
|
|
gpujpeg_cuda_check_error("Encoder table allocation");
|
|
|
|
// Init quantization tables for encoder
|
|
for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
|
|
if ( gpujpeg_table_quantization_encoder_init(&encoder->table_quantization[comp_type], (enum gpujpeg_component_type)comp_type, coder->param.quality) != 0 )
|
|
result = 0;
|
|
}
|
|
// Init huffman tables for encoder
|
|
for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
|
|
for ( int huff_type = 0; huff_type < GPUJPEG_HUFFMAN_TYPE_COUNT; huff_type++ ) {
|
|
if ( gpujpeg_table_huffman_encoder_init(&encoder->table_huffman[comp_type][huff_type], encoder->d_table_huffman[comp_type][huff_type], (enum gpujpeg_component_type)comp_type, (enum gpujpeg_huffman_type)huff_type) != 0 )
|
|
result = 0;
|
|
}
|
|
}
|
|
gpujpeg_cuda_check_error("Encoder table init");
|
|
|
|
#ifdef GPUJPEG_HUFFMAN_CODER_TABLES_IN_CONSTANT
|
|
// Copy huffman tables to constant memory
|
|
for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
|
|
for ( int huff_type = 0; huff_type < GPUJPEG_HUFFMAN_TYPE_COUNT; huff_type++ ) {
|
|
int index = (comp_type * GPUJPEG_HUFFMAN_TYPE_COUNT + huff_type);
|
|
cudaMemcpyToSymbol(
|
|
(char*)gpujpeg_encoder_table_huffman,
|
|
&encoder->table_huffman[comp_type][huff_type],
|
|
sizeof(struct gpujpeg_table_huffman_encoder),
|
|
index * sizeof(struct gpujpeg_table_huffman_encoder),
|
|
cudaMemcpyHostToDevice
|
|
);
|
|
}
|
|
}
|
|
gpujpeg_cuda_check_error("Encoder copy huffman tables to constant memory");
|
|
#endif
|
|
|
|
// Init huffman encoder
|
|
if ( gpujpeg_huffman_gpu_encoder_init() != 0 )
|
|
result = 0;
|
|
|
|
if ( result == 0 ) {
|
|
gpujpeg_encoder_destroy(encoder);
|
|
return NULL;
|
|
}
|
|
|
|
// Timers
|
|
GPUJPEG_CUSTOM_TIMER_CREATE(encoder->def);
|
|
GPUJPEG_CUSTOM_TIMER_CREATE(encoder->in_gpu);
|
|
|
|
return encoder;
|
|
}
|
|
|
|
/** Documented at declaration */
|
|
int
|
|
gpujpeg_encoder_encode(struct gpujpeg_encoder* encoder, struct gpujpeg_encoder_input* input, uint8_t** image_compressed, int* image_compressed_size)
|
|
{
|
|
// Get coder
|
|
struct gpujpeg_coder* coder = &encoder->coder;
|
|
|
|
// Reset durations
|
|
coder->duration_memory_to = 0.0;
|
|
coder->duration_memory_from = 0.0;
|
|
coder->duration_memory_map = 0.0;
|
|
coder->duration_memory_unmap = 0.0;
|
|
coder->duration_preprocessor = 0.0;
|
|
coder->duration_dct_quantization = 0.0;
|
|
coder->duration_huffman_coder = 0.0;
|
|
coder->duration_stream = 0.0;
|
|
coder->duration_in_gpu = 0.0;
|
|
|
|
// Load input image
|
|
if ( input->type == GPUJPEG_ENCODER_INPUT_IMAGE ) {
|
|
GPUJPEG_CUSTOM_TIMER_START(encoder->def);
|
|
|
|
// Copy image to device memory
|
|
if ( cudaSuccess != cudaMemcpy(coder->d_data_raw, input->image, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyHostToDevice) )
|
|
return -1;
|
|
|
|
GPUJPEG_CUSTOM_TIMER_STOP(encoder->def);
|
|
coder->duration_memory_to = GPUJPEG_CUSTOM_TIMER_DURATION(encoder->def);
|
|
} else
|
|
if ( input->type == GPUJPEG_ENCODER_INPUT_OPENGL_TEXTURE ) {
|
|
assert(input->texture != NULL);
|
|
|
|
GPUJPEG_CUSTOM_TIMER_START(encoder->def);
|
|
|
|
// Map texture to CUDA
|
|
int data_size = 0;
|
|
uint8_t* d_data = gpujpeg_opengl_texture_map(input->texture, &data_size);
|
|
assert(data_size == (coder->data_raw_size));
|
|
|
|
GPUJPEG_CUSTOM_TIMER_STOP(encoder->def);
|
|
coder->duration_memory_map = GPUJPEG_CUSTOM_TIMER_DURATION(encoder->def);
|
|
|
|
GPUJPEG_CUSTOM_TIMER_START(encoder->def);
|
|
|
|
// Copy image data from texture pixel buffer object to device data
|
|
cudaMemcpy(coder->d_data_raw, d_data, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToDevice);
|
|
|
|
GPUJPEG_CUSTOM_TIMER_STOP(encoder->def);
|
|
coder->duration_memory_to = GPUJPEG_CUSTOM_TIMER_DURATION(encoder->def);
|
|
|
|
GPUJPEG_CUSTOM_TIMER_START(encoder->def);
|
|
|
|
// Unmap texture from CUDA
|
|
gpujpeg_opengl_texture_unmap(input->texture);
|
|
|
|
GPUJPEG_CUSTOM_TIMER_STOP(encoder->def);
|
|
coder->duration_memory_unmap = GPUJPEG_CUSTOM_TIMER_DURATION(encoder->def);
|
|
} else {
|
|
// Unknown output type
|
|
assert(0);
|
|
}
|
|
|
|
//gpujpeg_table_print(encoder->table[JPEG_COMPONENT_LUMINANCE]);
|
|
//gpujpeg_table_print(encoder->table[JPEG_COMPONENT_CHROMINANCE]);
|
|
|
|
GPUJPEG_CUSTOM_TIMER_START(encoder->in_gpu);
|
|
GPUJPEG_CUSTOM_TIMER_START(encoder->def);
|
|
|
|
// Preprocessing
|
|
if ( gpujpeg_preprocessor_encode(&encoder->coder) != 0 )
|
|
return -1;
|
|
|
|
GPUJPEG_CUSTOM_TIMER_STOP(encoder->def);
|
|
coder->duration_preprocessor = GPUJPEG_CUSTOM_TIMER_DURATION(encoder->def);
|
|
GPUJPEG_CUSTOM_TIMER_START(encoder->def);
|
|
|
|
#ifdef GPUJPEG_DCT_FROM_NPP
|
|
// Perform DCT and quantization (implementation from NPP)
|
|
for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
|
|
// Get component
|
|
struct gpujpeg_component* component = &coder->component[comp];
|
|
|
|
// Determine table type
|
|
enum gpujpeg_component_type type = (comp == 0) ? GPUJPEG_COMPONENT_LUMINANCE : GPUJPEG_COMPONENT_CHROMINANCE;
|
|
|
|
//gpujpeg_component_print8(&coder->component[comp], coder->component[comp].d_data);
|
|
|
|
//Perform forward DCT
|
|
NppiSize fwd_roi;
|
|
fwd_roi.width = component->data_width;
|
|
fwd_roi.height = component->data_height;
|
|
NppStatus status = nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R(
|
|
component->d_data,
|
|
component->data_width * sizeof(uint8_t),
|
|
component->d_data_quantized,
|
|
component->data_width * GPUJPEG_BLOCK_SIZE * sizeof(int16_t),
|
|
encoder->table_quantization[type].d_table,
|
|
fwd_roi
|
|
);
|
|
if ( status != 0 ) {
|
|
fprintf(stderr, "[GPUJPEG] [Error] Forward DCT failed for component at index %d [error %d]!\n", comp, status);
|
|
return -1;
|
|
}
|
|
|
|
//gpujpeg_component_print16(&coder->component[comp], coder->component[comp].d_data_quantized);
|
|
}
|
|
#else
|
|
// Perform DCT and quantization (own CUDA implementation)
|
|
gpujpeg_dct_gpu(encoder);
|
|
#endif
|
|
|
|
// If restart interval is 0 then the GPU processing is in the end (even huffman coder will be performed on CPU)
|
|
if ( coder->param.restart_interval == 0 ) {
|
|
GPUJPEG_CUSTOM_TIMER_STOP(encoder->in_gpu);
|
|
coder->duration_in_gpu = GPUJPEG_CUSTOM_TIMER_DURATION(encoder->in_gpu);
|
|
}
|
|
|
|
// Initialize writer output buffer current position
|
|
encoder->writer->buffer_current = encoder->writer->buffer;
|
|
|
|
// Write header
|
|
gpujpeg_writer_write_header(encoder);
|
|
|
|
GPUJPEG_CUSTOM_TIMER_STOP(encoder->def);
|
|
coder->duration_dct_quantization = GPUJPEG_CUSTOM_TIMER_DURATION(encoder->def);
|
|
GPUJPEG_CUSTOM_TIMER_START(encoder->def);
|
|
|
|
// Perform huffman coding on CPU (when restart interval is not set)
|
|
if ( coder->param.restart_interval == 0 ) {
|
|
// Copy quantized data from device memory to cpu memory
|
|
cudaMemcpy(coder->data_quantized, coder->d_data_quantized, coder->data_size * sizeof(int16_t), cudaMemcpyDeviceToHost);
|
|
|
|
GPUJPEG_CUSTOM_TIMER_STOP(encoder->def);
|
|
coder->duration_memory_from = GPUJPEG_CUSTOM_TIMER_DURATION(encoder->def);
|
|
GPUJPEG_CUSTOM_TIMER_START(encoder->def);
|
|
|
|
// Perform huffman coding
|
|
if ( gpujpeg_huffman_cpu_encoder_encode(encoder) != 0 ) {
|
|
fprintf(stderr, "[GPUJPEG] [Error] Huffman encoder on CPU failed!\n");
|
|
return -1;
|
|
}
|
|
|
|
GPUJPEG_CUSTOM_TIMER_STOP(encoder->def);
|
|
coder->duration_huffman_coder = GPUJPEG_CUSTOM_TIMER_DURATION(encoder->def);
|
|
}
|
|
// Perform huffman coding on GPU (when restart interval is set)
|
|
else {
|
|
// Perform huffman coding
|
|
if ( gpujpeg_huffman_gpu_encoder_encode(encoder) != 0 ) {
|
|
fprintf(stderr, "[GPUJPEG] [Error] Huffman encoder on GPU failed!\n");
|
|
return -1;
|
|
}
|
|
|
|
GPUJPEG_CUSTOM_TIMER_STOP(encoder->in_gpu);
|
|
coder->duration_in_gpu = GPUJPEG_CUSTOM_TIMER_DURATION(encoder->in_gpu);
|
|
|
|
GPUJPEG_CUSTOM_TIMER_STOP(encoder->def);
|
|
coder->duration_huffman_coder = GPUJPEG_CUSTOM_TIMER_DURATION(encoder->def);
|
|
GPUJPEG_CUSTOM_TIMER_START(encoder->def);
|
|
|
|
// Copy compressed data from device memory to cpu memory
|
|
if ( cudaSuccess != cudaMemcpy(coder->data_compressed, coder->d_data_compressed, coder->data_compressed_size * sizeof(uint8_t), cudaMemcpyDeviceToHost) != 0 )
|
|
return -1;
|
|
// Copy segments from device memory
|
|
if ( cudaSuccess != cudaMemcpy(coder->segment, coder->d_segment, coder->segment_count * sizeof(struct gpujpeg_segment), cudaMemcpyDeviceToHost) )
|
|
return -1;
|
|
|
|
GPUJPEG_CUSTOM_TIMER_STOP(encoder->def);
|
|
coder->duration_memory_from = GPUJPEG_CUSTOM_TIMER_DURATION(encoder->def);
|
|
GPUJPEG_CUSTOM_TIMER_START(encoder->def);
|
|
|
|
if ( coder->param.interleaved == 1 ) {
|
|
// Write scan header (only one scan is written, that contains all color components data)
|
|
gpujpeg_writer_write_scan_header(encoder, 0);
|
|
|
|
// Write scan data
|
|
for ( int segment_index = 0; segment_index < coder->segment_count; segment_index++ ) {
|
|
struct gpujpeg_segment* segment = &coder->segment[segment_index];
|
|
|
|
gpujpeg_writer_write_segment_info(encoder);
|
|
|
|
// Copy compressed data to writer
|
|
memcpy(
|
|
encoder->writer->buffer_current,
|
|
&coder->data_compressed[segment->data_compressed_index],
|
|
segment->data_compressed_size
|
|
);
|
|
encoder->writer->buffer_current += segment->data_compressed_size;
|
|
//printf("Compressed data %d bytes\n", segment->data_compressed_size);
|
|
}
|
|
// Remove last restart marker in scan (is not needed)
|
|
encoder->writer->buffer_current -= 2;
|
|
|
|
gpujpeg_writer_write_segment_info(encoder);
|
|
} else {
|
|
// Write huffman coder results as one scan for each color component
|
|
int segment_index = 0;
|
|
for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
|
|
// Write scan header
|
|
gpujpeg_writer_write_scan_header(encoder, comp);
|
|
// Write scan data
|
|
for ( int index = 0; index < coder->component[comp].segment_count; index++ ) {
|
|
struct gpujpeg_segment* segment = &coder->segment[segment_index];
|
|
|
|
gpujpeg_writer_write_segment_info(encoder);
|
|
|
|
// Copy compressed data to writer
|
|
memcpy(
|
|
encoder->writer->buffer_current,
|
|
&coder->data_compressed[segment->data_compressed_index],
|
|
segment->data_compressed_size
|
|
);
|
|
encoder->writer->buffer_current += segment->data_compressed_size;
|
|
//printf("Compressed data %d bytes\n", segment->data_compressed_size);
|
|
|
|
segment_index++;
|
|
}
|
|
// Remove last restart marker in scan (is not needed)
|
|
encoder->writer->buffer_current -= 2;
|
|
|
|
gpujpeg_writer_write_segment_info(encoder);
|
|
}
|
|
}
|
|
|
|
GPUJPEG_CUSTOM_TIMER_STOP(encoder->def);
|
|
coder->duration_stream = GPUJPEG_CUSTOM_TIMER_DURATION(encoder->def);
|
|
}
|
|
gpujpeg_writer_emit_marker(encoder->writer, GPUJPEG_MARKER_EOI);
|
|
|
|
// Set compressed image
|
|
*image_compressed = encoder->writer->buffer;
|
|
*image_compressed_size = encoder->writer->buffer_current - encoder->writer->buffer;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/** Documented at declaration */
|
|
int
|
|
gpujpeg_encoder_destroy(struct gpujpeg_encoder* encoder)
|
|
{
|
|
assert(encoder != NULL);
|
|
|
|
GPUJPEG_CUSTOM_TIMER_DESTROY(encoder->def);
|
|
GPUJPEG_CUSTOM_TIMER_DESTROY(encoder->in_gpu);
|
|
|
|
if ( gpujpeg_coder_deinit(&encoder->coder) != 0 )
|
|
return -1;
|
|
|
|
for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
|
|
if ( encoder->table_quantization[comp_type].d_table != NULL )
|
|
cudaFree(encoder->table_quantization[comp_type].d_table);
|
|
}
|
|
for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
|
|
for ( int huff_type = 0; huff_type < GPUJPEG_HUFFMAN_TYPE_COUNT; huff_type++ ) {
|
|
if ( encoder->d_table_huffman[comp_type][huff_type] != NULL )
|
|
cudaFree(encoder->d_table_huffman[comp_type][huff_type]);
|
|
}
|
|
}
|
|
|
|
if ( encoder->writer != NULL )
|
|
gpujpeg_writer_destroy(encoder->writer);
|
|
|
|
free(encoder);
|
|
|
|
return 0;
|
|
}
|