Files
UltraGrid/libgpujpeg/gpujpeg_common.c
2012-06-06 14:05:35 +02:00

999 lines
39 KiB
C

/**
* Copyright (c) 2011, CESNET z.s.p.o
* Copyright (c) 2011, Silicon Genome, LLC.
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "gpujpeg_common.h"
#include "gpujpeg_util.h"
#include "gpujpeg_preprocessor.h"
#include <npp.h>
#include <cuda_gl_interop.h>
#include <math.h>
#ifdef GPUJPEG_USE_OPENGL
# ifdef HAVE_MACOSX
# include <OpenGL/GL.h>
# else
# include <GL/gl.h>
# endif
#endif
/** Documented at declaration */
struct gpujpeg_devices_info
gpujpeg_get_devices_info()
{
struct gpujpeg_devices_info devices_info;
if ( cudaGetDeviceCount(&devices_info.device_count) != cudaSuccess ) {
fprintf(stderr, "[GPUJPEG] [Error] CUDA Driver and Runtime version may be mismatched.\n");
exit(-1);
}
if ( devices_info.device_count > GPUJPEG_MAX_DEVICE_COUNT ) {
fprintf(stderr, "[GPUJPEG] [Warning] There are available more CUDA devices (%d) than maximum count (%d).\n",
devices_info.device_count, GPUJPEG_MAX_DEVICE_COUNT);
fprintf(stderr, "[GPUJPEG] [Warning] Using maximum count (%d).\n", GPUJPEG_MAX_DEVICE_COUNT);
devices_info.device_count = GPUJPEG_MAX_DEVICE_COUNT;
}
for ( int device_id = 0; device_id < devices_info.device_count; device_id++ ) {
struct cudaDeviceProp device_properties;
cudaGetDeviceProperties(&device_properties, device_id);
struct gpujpeg_device_info* device_info = &devices_info.device[device_id];
device_info->id = device_id;
strncpy(device_info->name, device_properties.name, 255);
device_info->cc_major = device_properties.major;
device_info->cc_minor = device_properties.minor;
device_info->global_memory = device_properties.totalGlobalMem;
device_info->constant_memory = device_properties.totalConstMem;
device_info->shared_memory = device_properties.sharedMemPerBlock;
device_info->register_count = device_properties.regsPerBlock;
#if CUDART_VERSION >= 2000
device_info->multiprocessor_count = device_properties.multiProcessorCount;
#endif
}
return devices_info;
}
/** Documented at declaration */
void
gpujpeg_print_devices_info()
{
struct gpujpeg_devices_info devices_info = gpujpeg_get_devices_info();
if ( devices_info.device_count == 0 ) {
printf("There is no device supporting CUDA.\n");
return;
} else if ( devices_info.device_count == 1 ) {
printf("There is 1 device supporting CUDA:\n");
} else {
printf("There are %d devices supporting CUDA:\n", devices_info.device_count);
}
for ( int device_id = 0; device_id < devices_info.device_count; device_id++ ) {
struct gpujpeg_device_info* device_info = &devices_info.device[device_id];
printf("\nDevice %d: \"%s\"\n", device_info->id, device_info->name);
printf(" Compute capability: %d.%d\n", device_info->cc_major, device_info->cc_minor);
printf(" Total amount of global memory: %ld kB\n", device_info->global_memory / 1024);
printf(" Total amount of constant memory: %ld kB\n", device_info->constant_memory / 1024);
printf(" Total amount of shared memory per block: %ld kB\n", device_info->shared_memory / 1024);
printf(" Total number of registers available per block: %d\n", device_info->register_count);
printf(" Multiprocessors: %d\n", device_info->multiprocessor_count);
}
}
/**
 * Select and initialize the CUDA device that is going to be used.
 * (The public API documentation lives at the declaration.)
 *
 * Steps: validate the device index, read the device properties, optionally
 * enable OpenGL interoperability, optionally print version information,
 * select the device and finally verify it with a one byte host-to-device copy.
 *
 * @param device_id  Index of the CUDA device, must be in [0, device count)
 * @param flags      Bit mask; GPUJPEG_OPENGL_INTEROPERABILITY and GPUJPEG_VERBOSE
 *                   are evaluated here
 * @return 0 on success, -1 when the device cannot be used
 */
int
gpujpeg_init_device(int device_id, int flags)
{
    // The count stays 0 when the query fails, so it is never read uninitialized
    int dev_count = 0;
    if ( cudaSuccess != cudaGetDeviceCount(&dev_count) ) {
        fprintf(stderr, "[GPUJPEG] [Error] Can't get CUDA device count!\n");
        return -1;
    }
    if ( dev_count == 0 ) {
        fprintf(stderr, "[GPUJPEG] [Error] No CUDA enabled device\n");
        return -1;
    }

    if ( device_id < 0 || device_id >= dev_count ) {
        fprintf(stderr, "[GPUJPEG] [Error] Selected device %d is out of bound. Devices on your system are in range %d - %d\n",
            device_id, 0, dev_count - 1);
        return -1;
    }

    struct cudaDeviceProp devProp;
    if ( cudaSuccess != cudaGetDeviceProperties(&devProp, device_id) ) {
        fprintf(stderr,
            "[GPUJPEG] [Error] Can't get CUDA device properties!\n"
            "[GPUJPEG] [Error] Do you have proper driver for CUDA installed?\n"
        );
        return -1;
    }

    if ( devProp.major < 1 ) {
        fprintf(stderr, "[GPUJPEG] [Error] Device %d does not support CUDA\n", device_id);
        return -1;
    }

    if ( flags & GPUJPEG_OPENGL_INTEROPERABILITY ) {
        cudaGLSetGLDevice(device_id);
        gpujpeg_cuda_check_error("Enabling OpenGL interoperability");
    }

    if ( flags & GPUJPEG_VERBOSE ) {
        // CUDA encodes versions as 1000 * major + 10 * minor, hence the minor
        // number is taken from the value modulo 1000
        int cuda_driver_version = 0;
        cudaDriverGetVersion(&cuda_driver_version);
        printf("CUDA driver version: %d.%d\n", cuda_driver_version / 1000, (cuda_driver_version % 1000) / 10);

        int cuda_runtime_version = 0;
        cudaRuntimeGetVersion(&cuda_runtime_version);
        printf("CUDA runtime version: %d.%d\n", cuda_runtime_version / 1000, (cuda_runtime_version % 1000) / 10);

        const NppLibraryVersion* npp_version = nppGetLibVersion();
        printf("NPP version: %d.%d\n", npp_version->major, npp_version->minor);

        printf("Using Device #%d: %s (c.c. %d.%d)\n", device_id, devProp.name, devProp.major, devProp.minor);
    }

    cudaSetDevice(device_id);
    gpujpeg_cuda_check_error("Set CUDA device");

    // Test by simple copying that the device is ready
    uint8_t data[] = {8};
    uint8_t* d_data = NULL;
    cudaMalloc((void**)&d_data, 1);
    cudaMemcpy(d_data, data, 1, cudaMemcpyHostToDevice);
    cudaFree(d_data);
    // Any failure of the three calls above is picked up here
    cudaError_t error = cudaGetLastError();
    if ( cudaSuccess != error ) {
        fprintf(stderr, "[GPUJPEG] [Error] Failed to initialize CUDA device.\n");
        if ( flags & GPUJPEG_OPENGL_INTEROPERABILITY )
            fprintf(stderr, "[GPUJPEG] [Info] OpenGL interoperability is used, is OpenGL context available?\n");
        return -1;
    }

    return 0;
}
/**
 * Fill coder parameters with their default values.
 * (The public API documentation lives at the declaration.)
 *
 * Defaults: quality 75, restart interval 8, non-interleaved, no segment info,
 * not verbose, 1x1 sampling for every component and
 * GPUJPEG_YCBCR_BT601_256LVLS as internal color space.
 *
 * @param param  Parameters to be overwritten
 */
void
gpujpeg_set_default_parameters(struct gpujpeg_parameters* param)
{
    // Every component is sampled 1x1 by default
    int comp = 0;
    while ( comp < GPUJPEG_MAX_COMPONENT_COUNT ) {
        param->sampling_factor[comp].horizontal = 1;
        param->sampling_factor[comp].vertical = 1;
        comp++;
    }

    // Scalar options
    param->color_space_internal = GPUJPEG_YCBCR_BT601_256LVLS;
    param->segment_info = 0;
    param->interleaved = 0;
    param->restart_interval = 8;
    param->quality = 75;
    param->verbose = 0;
}
/**
 * Configure sampling factors for chroma subsampling.
 * (The public API documentation lives at the declaration.)
 *
 * The first component gets factors 2x2, every other component gets 1x1.
 *
 * @param param  Parameters whose sampling factors are overwritten
 */
void
gpujpeg_parameters_chroma_subsampling(struct gpujpeg_parameters* param)
{
    for ( int comp = 0; comp < GPUJPEG_MAX_COMPONENT_COUNT; comp++ ) {
        // Component 0 is sampled twice as densely in both directions
        const int factor = (comp == 0) ? 2 : 1;
        param->sampling_factor[comp].horizontal = factor;
        param->sampling_factor[comp].vertical = factor;
    }
}
/**
 * Fill image parameters with their default values.
 * (The public API documentation lives at the declaration.)
 *
 * Defaults: zero width and height, 3 components, GPUJPEG_RGB color space and
 * GPUJPEG_4_4_4 sampling.
 *
 * @param param  Image parameters to be overwritten
 */
void
gpujpeg_image_set_default_parameters(struct gpujpeg_image_parameters* param)
{
    // Pixel format
    param->sampling_factor = GPUJPEG_4_4_4;
    param->color_space = GPUJPEG_RGB;
    param->comp_count = 3;

    // Dimensions have to be supplied by the caller
    param->height = 0;
    param->width = 0;
}
/**
 * Guess the image file format from the extension of a file name.
 * (The public API documentation lives at the declaration.)
 *
 * The text after the last '.' is compared case-insensitively with the known
 * extensions "raw", "rgb", "yuv" and "jpg". Only the first three characters
 * are compared.
 *
 * @param filename  File name or path
 * @return Matching format, or GPUJPEG_IMAGE_FILE_UNKNOWN when the name has no
 *         extension or the extension is not recognized
 */
enum gpujpeg_image_file_format
gpujpeg_image_get_file_format(const char* filename)
{
    static const char* const extension[] = { "raw", "rgb", "yuv", "jpg" };
    static const enum gpujpeg_image_file_format format[] = { GPUJPEG_IMAGE_FILE_RAW, GPUJPEG_IMAGE_FILE_RGB, GPUJPEG_IMAGE_FILE_YUV, GPUJPEG_IMAGE_FILE_JPEG };

    if ( filename == NULL )
        return GPUJPEG_IMAGE_FILE_UNKNOWN;

    // A name without any dot has no extension; report it as unknown instead of
    // returning a value that is not a member of the enumeration
    const char* ext = strrchr(filename, '.');
    if ( ext == NULL )
        return GPUJPEG_IMAGE_FILE_UNKNOWN;
    ext++;

    for ( size_t i = 0; i < sizeof(format) / sizeof(*format); i++ ) {
        if ( strncasecmp(ext, extension[i], 3) == 0 ) {
            return format[i];
        }
    }
    return GPUJPEG_IMAGE_FILE_UNKNOWN;
}
/** Documented at declaration */
void
gpujpeg_component_print8(struct gpujpeg_component* component, uint8_t* d_data)
{
int data_size = component->data_width * component->data_height;
uint8_t* data = NULL;
cudaMallocHost((void**)&data, data_size * sizeof(uint8_t));
cudaMemcpy(data, d_data, data_size * sizeof(uint8_t), cudaMemcpyDeviceToHost);
printf("Print Data\n");
for ( int y = 0; y < component->data_height; y++ ) {
for ( int x = 0; x < component->data_width; x++ ) {
printf("%3u ", data[y * component->data_width + x]);
}
printf("\n");
}
cudaFreeHost(data);
}
/** Documented at declaration */
void
gpujpeg_component_print16(struct gpujpeg_component* component, int16_t* d_data)
{
int data_size = component->data_width * component->data_height;
int16_t* data = NULL;
cudaMallocHost((void**)&data, data_size * sizeof(int16_t));
cudaMemcpy(data, d_data, data_size * sizeof(int16_t), cudaMemcpyDeviceToHost);
printf("Print Data\n");
for ( int y = 0; y < component->data_height; y++ ) {
for ( int x = 0; x < component->data_width; x++ ) {
printf("%3d ", data[y * component->data_width + x]);
}
printf("\n");
}
cudaFreeHost(data);
}
/**
 * Initialize a coder from coder->param and coder->param_image.
 * (The public API documentation lives at the declaration.)
 *
 * The function
 *  1. allocates the color component array (host and device),
 *  2. derives per-component sizes, MCU layout and segment counts,
 *  3. derives coder-wide MCU/segment totals for (non-)interleaved mode,
 *  4. allocates and prepares the segment array (host and device),
 *  5. allocates the raw, preprocessed, quantized and compressed data buffers,
 *  6. assigns each component its slice of the shared data buffers.
 *
 * All buffer pointers of the coder are reset to NULL first, so that after a
 * failure they are either NULL or valid.
 *
 * NOTE(review): gpujpeg_cuda_check_error is a macro defined elsewhere; its
 * behavior on error is not visible here.
 *
 * @param coder  Coder with param and param_image already filled in
 * @return 0 on success, -1 when any allocation or copy failed
 */
int
gpujpeg_coder_init(struct gpujpeg_coder* coder)
{
    int result = 1;

    coder->preprocessor = NULL;

    // Reset every buffer pointer that is allocated below
    coder->component = NULL;
    coder->d_component = NULL;
    coder->segment = NULL;
    coder->d_segment = NULL;
    coder->data_raw = NULL;
    coder->d_data_raw = NULL;
    coder->d_data = NULL;
    coder->data_quantized = NULL;
    coder->d_data_quantized = NULL;
    coder->data_compressed = NULL;
    coder->d_data_compressed = NULL;

    // Allocate color components; the array is written right below, so a failed
    // allocation has to stop the initialization immediately
    if ( cudaSuccess != cudaMallocHost((void**)&coder->component, coder->param_image.comp_count * sizeof(struct gpujpeg_component))
            || coder->component == NULL ) {
        fprintf(stderr, "[GPUJPEG] [Error] Failed to allocate coder color components.\n");
        return -1;
    }
    // Allocate color components in device memory
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_component, coder->param_image.comp_count * sizeof(struct gpujpeg_component)) )
        result = 0;
    gpujpeg_cuda_check_error("Coder color component allocation");

    // Initialize sampling factors and compute maximum sampling factor to coder->sampling_factor
    coder->sampling_factor.horizontal = 0;
    coder->sampling_factor.vertical = 0;
    for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
        assert(coder->param.sampling_factor[comp].horizontal >= 1 && coder->param.sampling_factor[comp].horizontal <= 15);
        assert(coder->param.sampling_factor[comp].vertical >= 1 && coder->param.sampling_factor[comp].vertical <= 15);
        coder->component[comp].sampling_factor = coder->param.sampling_factor[comp];
        if ( coder->component[comp].sampling_factor.horizontal > coder->sampling_factor.horizontal )
            coder->sampling_factor.horizontal = coder->component[comp].sampling_factor.horizontal;
        if ( coder->component[comp].sampling_factor.vertical > coder->sampling_factor.vertical )
            coder->sampling_factor.vertical = coder->component[comp].sampling_factor.vertical;
    }

    // Calculate data size
    coder->data_raw_size = gpujpeg_image_calculate_size(&coder->param_image);
    coder->data_size = 0;

    // Initialize color components
    for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
        // Get component
        struct gpujpeg_component* component = &coder->component[comp];

        // Set type (the first component is luminance, the others chrominance)
        component->type = (comp == 0) ? GPUJPEG_COMPONENT_LUMINANCE : GPUJPEG_COMPONENT_CHROMINANCE;

        // Set proper color component sizes in pixels based on sampling factors
        int samp_factor_h = component->sampling_factor.horizontal;
        int samp_factor_v = component->sampling_factor.vertical;
        component->width = (coder->param_image.width * samp_factor_h) / coder->sampling_factor.horizontal;
        component->height = (coder->param_image.height * samp_factor_v) / coder->sampling_factor.vertical;

        // Compute component MCU size
        component->mcu_size_x = GPUJPEG_BLOCK_SIZE;
        component->mcu_size_y = GPUJPEG_BLOCK_SIZE;
        if ( coder->param.interleaved == 1 ) {
            // In interleaved mode one MCU of the component covers
            // samp_factor_h x samp_factor_v blocks
            component->mcu_compressed_size = GPUJPEG_MAX_BLOCK_COMPRESSED_SIZE * samp_factor_h * samp_factor_v;
            component->mcu_size_x *= samp_factor_h;
            component->mcu_size_y *= samp_factor_v;
        } else {
            component->mcu_compressed_size = GPUJPEG_MAX_BLOCK_COMPRESSED_SIZE;
        }
        component->mcu_size = component->mcu_size_x * component->mcu_size_y;

        // Compute allocated data size (dimensions rounded up to whole MCUs)
        component->data_width = gpujpeg_div_and_round_up(component->width, component->mcu_size_x) * component->mcu_size_x;
        component->data_height = gpujpeg_div_and_round_up(component->height, component->mcu_size_y) * component->mcu_size_y;
        component->data_size = component->data_width * component->data_height;
        // Increase total data size
        coder->data_size += component->data_size;

        // Compute component MCU count
        component->mcu_count_x = gpujpeg_div_and_round_up(component->data_width, component->mcu_size_x);
        component->mcu_count_y = gpujpeg_div_and_round_up(component->data_height, component->mcu_size_y);
        component->mcu_count = component->mcu_count_x * component->mcu_count_y;

        // Compute MCU count per segment
        component->segment_mcu_count = coder->param.restart_interval;
        if ( component->segment_mcu_count == 0 ) {
            // If restart interval is disabled, restart interval is equal MCU count
            component->segment_mcu_count = component->mcu_count;
        }

        // Calculate segment count
        component->segment_count = gpujpeg_div_and_round_up(component->mcu_count, component->segment_mcu_count);

        //printf("Subsampling %dx%d, Resolution %d, %d, mcu size %d, mcu count %d\n",
        //    coder->param.sampling_factor[comp].horizontal, coder->param.sampling_factor[comp].vertical,
        //    component->data_width, component->data_height,
        //    component->mcu_compressed_size, component->mcu_count
        //);
    }

    // Maximum component data size for allocated buffers
    coder->data_width = gpujpeg_div_and_round_up(coder->param_image.width, GPUJPEG_BLOCK_SIZE) * GPUJPEG_BLOCK_SIZE;
    coder->data_height = gpujpeg_div_and_round_up(coder->param_image.height, GPUJPEG_BLOCK_SIZE) * GPUJPEG_BLOCK_SIZE;

    // Compute MCU size, MCU count, segment count and compressed data allocation size
    coder->mcu_count = 0;
    coder->mcu_size = 0;
    coder->mcu_compressed_size = 0;
    coder->segment_count = 0;
    coder->data_compressed_size = 0;
    if ( coder->param.interleaved == 1 ) {
        // Interleaved: all components share MCU and segment counts, the MCU
        // sizes of the components add up
        assert(coder->param_image.comp_count > 0);
        coder->mcu_count = coder->component[0].mcu_count;
        coder->segment_count = coder->component[0].segment_count;
        coder->segment_mcu_count = coder->component[0].segment_mcu_count;
        for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
            struct gpujpeg_component* component = &coder->component[comp];
            assert(coder->mcu_count == component->mcu_count);
            assert(coder->segment_mcu_count == component->segment_mcu_count);
            coder->mcu_size += component->mcu_size;
            coder->mcu_compressed_size += component->mcu_compressed_size;
        }
    } else {
        // Non-interleaved: all components share the MCU size, the MCU and
        // segment counts of the components add up
        assert(coder->param_image.comp_count > 0);
        coder->mcu_size = coder->component[0].mcu_size;
        coder->mcu_compressed_size = coder->component[0].mcu_compressed_size;
        coder->segment_mcu_count = 0;
        for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
            struct gpujpeg_component* component = &coder->component[comp];
            assert(coder->mcu_size == component->mcu_size);
            assert(coder->mcu_compressed_size == component->mcu_compressed_size);
            coder->mcu_count += component->mcu_count;
            coder->segment_count += component->segment_count;
        }
    }
    //printf("mcu size %d -> %d, mcu count %d, segment mcu count %d\n", coder->mcu_size, coder->mcu_compressed_size, coder->mcu_count, coder->segment_mcu_count);

    // Allocate segments (the status code is checked, because the pointer alone
    // does not tell whether the allocation succeeded)
    if ( cudaSuccess != cudaMallocHost((void**)&coder->segment, coder->segment_count * sizeof(struct gpujpeg_segment))
            || coder->segment == NULL )
        result = 0;
    // Allocate segments in device memory
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_segment, coder->segment_count * sizeof(struct gpujpeg_segment)) )
        result = 0;
    gpujpeg_cuda_check_error("Coder segment allocation");

    // Prepare segments
    if ( result == 1 ) {
        // While preparing segments compute input size and compressed size
        int data_index = 0;
        int data_compressed_index = 0;

        // Prepare segments based on (non-)interleaved mode
        if ( coder->param.interleaved == 1 ) {
            // Prepare segments for encoding (only one scan for all color components)
            int mcu_index = 0;
            for ( int index = 0; index < coder->segment_count; index++ ) {
                // Prepare segment MCU count (the last segment may be shorter)
                int mcu_count = coder->segment_mcu_count;
                if ( (mcu_index + mcu_count) >= coder->mcu_count )
                    mcu_count = coder->mcu_count - mcu_index;
                // Set parameters for segment
                coder->segment[index].scan_index = 0;
                coder->segment[index].scan_segment_index = index;
                coder->segment[index].mcu_count = mcu_count;
                coder->segment[index].data_compressed_index = data_compressed_index;
                coder->segment[index].data_compressed_size = 0;
                // Increase parameters for next segment
                data_index += mcu_count * coder->mcu_size;
                data_compressed_index += mcu_count * coder->mcu_compressed_size;
                mcu_index += mcu_count;
            }
        } else {
            // Prepare segments for encoding (one scan for each color component)
            int index = 0;
            for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
                // Get component
                struct gpujpeg_component* component = &coder->component[comp];
                // Prepare component segments
                int mcu_index = 0;
                for ( int segment = 0; segment < component->segment_count; segment++ ) {
                    // Prepare segment MCU count (the last segment may be shorter)
                    int mcu_count = component->segment_mcu_count;
                    if ( (mcu_index + mcu_count) >= component->mcu_count )
                        mcu_count = component->mcu_count - mcu_index;
                    // Set parameters for segment
                    coder->segment[index].scan_index = comp;
                    coder->segment[index].scan_segment_index = segment;
                    coder->segment[index].mcu_count = mcu_count;
                    coder->segment[index].data_compressed_index = data_compressed_index;
                    coder->segment[index].data_compressed_size = 0;
                    // Increase parameters for next segment
                    data_index += mcu_count * component->mcu_size;
                    data_compressed_index += mcu_count * component->mcu_compressed_size;
                    mcu_index += mcu_count;
                    index++;
                }
            }
        }

        // Check data size
        //printf("%d == %d\n", coder->data_size, data_index);
        assert(coder->data_size == data_index);

        // Set compressed size
        coder->data_compressed_size = data_compressed_index;
    }
    //printf("Compressed size %d (segments %d)\n", coder->data_compressed_size, coder->segment_count);

    // Copy segments to device memory (only when both arrays exist)
    if ( result == 1 ) {
        if ( cudaSuccess != cudaMemcpy(coder->d_segment, coder->segment, coder->segment_count * sizeof(struct gpujpeg_segment), cudaMemcpyHostToDevice) )
            result = 0;
    }

    // Print allocation info
    if ( coder->param.verbose ) {
        int structures_size = 0;
        structures_size += coder->segment_count * sizeof(struct gpujpeg_segment);
        structures_size += coder->param_image.comp_count * sizeof(struct gpujpeg_component);
        int total_size = 0;
        total_size += structures_size;
        total_size += coder->data_raw_size;
        total_size += coder->data_size;
        total_size += coder->data_size * 2;
        total_size += coder->data_compressed_size;

        printf("\nAllocation Info:\n");
        printf(" Segment Count: %d\n", coder->segment_count);
        printf(" Allocated Data Size: %dx%d\n", coder->data_width, coder->data_height);
        printf(" Raw Buffer Size: %0.1f MB\n", (double)coder->data_raw_size / (1024.0 * 1024.0));
        printf(" Preprocessor Buffer Size: %0.1f MB\n", (double)coder->data_size / (1024.0 * 1024.0));
        printf(" DCT Buffer Size: %0.1f MB\n", (double)2 * coder->data_size / (1024.0 * 1024.0));
        printf(" Compressed Buffer Size: %0.1f MB\n", (double)coder->data_compressed_size / (1024.0 * 1024.0));
        printf(" Structures Size: %0.1f kB\n", (double)structures_size / (1024.0));
        printf(" Total GPU Memory Size: %0.1f MB\n", (double)total_size / (1024.0 * 1024.0));
    }

    // Allocate data buffers for all color components
    if ( cudaSuccess != cudaMallocHost((void**)&coder->data_raw, coder->data_raw_size * sizeof(uint8_t)) )
        return -1;
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t)) )
        result = 0;
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_data, coder->data_size * sizeof(uint8_t)) )
        result = 0;
    if ( cudaSuccess != cudaMallocHost((void**)&coder->data_quantized, coder->data_size * sizeof(int16_t)) )
        result = 0;
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_data_quantized, coder->data_size * sizeof(int16_t)) )
        result = 0;
    gpujpeg_cuda_check_error("Coder data allocation");

    // Set data buffer to color components (each component gets a consecutive
    // slice of data_width * data_height samples)
    uint8_t* d_comp_data = coder->d_data;
    int16_t* d_comp_data_quantized = coder->d_data_quantized;
    int16_t* comp_data_quantized = coder->data_quantized;
    for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
        struct gpujpeg_component* component = &coder->component[comp];
        component->d_data = d_comp_data;
        component->d_data_quantized = d_comp_data_quantized;
        component->data_quantized = comp_data_quantized;
        d_comp_data += component->data_width * component->data_height;
        d_comp_data_quantized += component->data_width * component->data_height;
        comp_data_quantized += component->data_width * component->data_height;
    }

    // Copy components to device memory
    if ( cudaSuccess != cudaMemcpy(coder->d_component, coder->component, coder->param_image.comp_count * sizeof(struct gpujpeg_component), cudaMemcpyHostToDevice) )
        result = 0;
    gpujpeg_cuda_check_error("Coder component copy");

    // Allocate compressed data
    int max_compressed_data_size = coder->data_compressed_size;
    max_compressed_data_size += GPUJPEG_BLOCK_SIZE * GPUJPEG_BLOCK_SIZE;
    max_compressed_data_size *= 2;
    if ( cudaSuccess != cudaMallocHost((void**)&coder->data_compressed, max_compressed_data_size * sizeof(uint8_t)) )
        result = 0;
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_data_compressed, max_compressed_data_size * sizeof(uint8_t)) )
        result = 0;
    gpujpeg_cuda_check_error("Coder data compressed allocation");

    // Report every failure recorded above instead of unconditionally succeeding
    return (result == 1) ? 0 : -1;
}
/**
 * Release all buffers that gpujpeg_coder_init() allocated for a coder.
 * (The public API documentation lives at the declaration.)
 *
 * Host buffers are released with cudaFreeHost, device buffers with cudaFree.
 * This includes the color component arrays, which gpujpeg_coder_init()
 * allocates as well. Every pointer is set to NULL after its release, so the
 * coder does not keep dangling pointers.
 *
 * NOTE(review): coder->preprocessor is not released here; its ownership is
 * not visible from this function.
 *
 * @param coder  Coder to be deinitialized
 * @return 0 always
 */
int
gpujpeg_coder_deinit(struct gpujpeg_coder* coder)
{
    // Color component arrays
    if ( coder->component != NULL )
        cudaFreeHost(coder->component);
    coder->component = NULL;
    if ( coder->d_component != NULL )
        cudaFree(coder->d_component);
    coder->d_component = NULL;

    // Raw image data
    if ( coder->data_raw != NULL )
        cudaFreeHost(coder->data_raw);
    coder->data_raw = NULL;
    if ( coder->d_data_raw != NULL )
        cudaFree(coder->d_data_raw);
    coder->d_data_raw = NULL;

    // Preprocessed and quantized data
    if ( coder->d_data != NULL )
        cudaFree(coder->d_data);
    coder->d_data = NULL;
    if ( coder->data_quantized != NULL )
        cudaFreeHost(coder->data_quantized);
    coder->data_quantized = NULL;
    if ( coder->d_data_quantized != NULL )
        cudaFree(coder->d_data_quantized);
    coder->d_data_quantized = NULL;

    // Compressed data
    if ( coder->data_compressed != NULL )
        cudaFreeHost(coder->data_compressed);
    coder->data_compressed = NULL;
    if ( coder->d_data_compressed != NULL )
        cudaFree(coder->d_data_compressed);
    coder->d_data_compressed = NULL;

    // Segment arrays
    if ( coder->segment != NULL )
        cudaFreeHost(coder->segment);
    coder->segment = NULL;
    if ( coder->d_segment != NULL )
        cudaFree(coder->d_segment);
    coder->d_segment = NULL;

    return 0;
}
/**
 * Compute the size in bytes of a raw image with the given parameters.
 * (The public API documentation lives at the declaration.)
 *
 * Only three-component images are accepted (asserted). For GPUJPEG_4_4_4 the
 * size is width * height * comp_count; for GPUJPEG_4_2_2 the width is rounded
 * up to an even number and two bytes per pixel are counted. Any other
 * sampling factor triggers an assertion.
 *
 * @param param  Image parameters
 * @return Image size in bytes (0 for an unsupported sampling factor when
 *         assertions are compiled out)
 */
int
gpujpeg_image_calculate_size(struct gpujpeg_image_parameters* param)
{
    assert(param->comp_count == 3);

    switch ( param->sampling_factor ) {
    case GPUJPEG_4_4_4:
        return param->width * param->height * param->comp_count;
    case GPUJPEG_4_2_2: {
        // Row width rounded up to a multiple of two pixels
        int even_width = gpujpeg_div_and_round_up(param->width, 2) * 2;
        return (even_width * param->height) * 2;
    }
    default:
        assert(0);
        return 0;
    }
}
/** Documented at declaration */
int
gpujpeg_image_load_from_file(const char* filename, uint8_t** image, int* image_size)
{
FILE* file;
file = fopen(filename, "rb");
if ( !file ) {
fprintf(stderr, "[GPUJPEG] [Error] Failed open %s for reading!\n", filename);
return -1;
}
if ( *image_size == 0 ) {
fseek(file, 0, SEEK_END);
*image_size = ftell(file);
rewind(file);
}
uint8_t* data = NULL;
cudaMallocHost((void**)&data, *image_size * sizeof(uint8_t));
if ( *image_size != fread(data, sizeof(uint8_t), *image_size, file) ) {
fprintf(stderr, "[GPUJPEG] [Error] Failed to load image data [%d bytes] from file %s!\n", *image_size, filename);
return -1;
}
fclose(file);
*image = data;
return 0;
}
/**
 * Write image data to a file, replacing any previous content.
 * (The public API documentation lives at the declaration.)
 *
 * The file is always closed, also when writing fails. A failure to flush the
 * data while closing is reported as a write failure.
 *
 * @param filename    Path of the file to write
 * @param image       Data to be written
 * @param image_size  Number of bytes to write, must not be negative
 * @return 0 on success, -1 on failure
 */
int
gpujpeg_image_save_to_file(const char* filename, uint8_t* image, int image_size)
{
    // A negative size would be converted to a huge unsigned byte count
    if ( image_size < 0 ) {
        fprintf(stderr, "[GPUJPEG] [Error] Failed to write image data [%d bytes] to file %s!\n", image_size, filename);
        return -1;
    }

    FILE* file = fopen(filename, "wb");
    if ( !file ) {
        fprintf(stderr, "[GPUJPEG] [Error] Failed open %s for writing!\n", filename);
        return -1;
    }

    size_t written = fwrite(image, sizeof(uint8_t), (size_t)image_size, file);
    // Close unconditionally; buffered data is flushed here, so the result matters
    int close_failed = (fclose(file) != 0);

    if ( written != (size_t)image_size || close_failed ) {
        fprintf(stderr, "[GPUJPEG] [Error] Failed to write image data [%d bytes] to file %s!\n", image_size, filename);
        return -1;
    }

    return 0;
}
/**
 * Release image data, e.g. data loaded by gpujpeg_image_load_from_file().
 * (The public API documentation lives at the declaration.)
 *
 * @param image  Buffer to be released with cudaFreeHost
 * @return 0 always; the status of the release is not propagated
 */
int
gpujpeg_image_destroy(uint8_t* image)
{
    // The buffer is pinned host memory, hence cudaFreeHost
    (void)cudaFreeHost(image);

    return 0;
}
/**
 * Print the minimum and maximum sample value of each component of a raw image.
 * (The public API documentation lives at the declaration.)
 *
 * The whole file is loaded. For GPUJPEG_4_4_4 three bytes per pixel are read,
 * one per component. For GPUJPEG_4_2_2 two bytes per pixel are read: the
 * second byte belongs to component 1, the first byte alternates between
 * component 3 (even pixels) and component 2 (odd pixels). Nothing is analyzed
 * when the file holds fewer bytes than width * height pixels require.
 *
 * @param filename         Raw image file
 * @param width            Image width in pixels
 * @param height           Image height in pixels
 * @param sampling_factor  GPUJPEG_4_4_4 or GPUJPEG_4_2_2
 */
void
gpujpeg_image_range_info(const char* filename, int width, int height, enum gpujpeg_sampling_factor sampling_factor)
{
    // Load image
    int image_size = 0;
    uint8_t* image = NULL;
    if ( gpujpeg_image_load_from_file(filename, &image, &image_size) != 0 ) {
        fprintf(stderr, "[GPUJPEG] [Error] Failed to load image [%s]!\n", filename);
        return;
    }

    // Bytes consumed per pixel by the loops below
    int bytes_per_pixel = 0;
    if ( sampling_factor == GPUJPEG_4_4_4 ) {
        bytes_per_pixel = 3;
    } else if ( sampling_factor == GPUJPEG_4_2_2 ) {
        bytes_per_pixel = 2;
    } else {
        assert(0);
    }

    // Never read past the end of the loaded data
    long long pixel_count = (long long)width * height;
    if ( pixel_count > 0 && pixel_count * bytes_per_pixel > image_size ) {
        fprintf(stderr, "[GPUJPEG] [Error] Image [%s] is too small [%d bytes] for %dx%d pixels!\n",
            filename, image_size, width, height);
        gpujpeg_image_destroy(image);
        return;
    }

    int c_min[3] = {256, 256, 256};
    int c_max[3] = {0, 0, 0};

    if ( sampling_factor == GPUJPEG_4_4_4 ) {
        uint8_t* in_ptr = image;
        for ( long long i = 0; i < pixel_count; i++ ) {
            for ( int c = 0; c < 3; c++ ) {
                if ( in_ptr[c] < c_min[c] )
                    c_min[c] = in_ptr[c];
                if ( in_ptr[c] > c_max[c] )
                    c_max[c] = in_ptr[c];
            }
            in_ptr += 3;
        }
    } else if ( sampling_factor == GPUJPEG_4_2_2 ) {
        uint8_t* in_ptr = image;
        for ( long long i = 0; i < pixel_count; i++ ) {
            // Second byte of every pixel
            if ( in_ptr[1] < c_min[0] )
                c_min[0] = in_ptr[1];
            if ( in_ptr[1] > c_max[0] )
                c_max[0] = in_ptr[1];
            // First byte alternates between the two remaining components
            if ( i % 2 == 1 ) {
                if ( in_ptr[0] < c_min[1] )
                    c_min[1] = in_ptr[0];
                if ( in_ptr[0] > c_max[1] )
                    c_max[1] = in_ptr[0];
            } else {
                if ( in_ptr[0] < c_min[2] )
                    c_min[2] = in_ptr[0];
                if ( in_ptr[0] > c_max[2] )
                    c_max[2] = in_ptr[0];
            }
            in_ptr += 2;
        }
    }

    printf("Image Samples Range:\n");
    for ( int c = 0; c < 3; c++ ) {
        printf("Component %d: %d - %d\n", c + 1, c_min[c], c_max[c]);
    }

    // Destroy image
    gpujpeg_image_destroy(image);
}
/**
 * Convert a raw image file from one image format to another.
 * (The public API documentation lives at the declaration.)
 *
 * The input is loaded, run through the encoder preprocessor configured for
 * param_image_from, the intermediate result is kept in a host buffer, and then
 * run through the decoder preprocessor configured for param_image_to. The
 * result is saved to the output file. Both formats must agree in width,
 * height and component count (asserted).
 *
 * Every step is executed unconditionally (never inside assert(), which would
 * be removed by NDEBUG). On failure an error is printed, all resources
 * acquired so far are released and the function returns.
 *
 * @param input             Input file name
 * @param output            Output file name
 * @param param_image_from  Format of the input image
 * @param param_image_to    Format of the output image
 */
void
gpujpeg_image_convert(const char* input, const char* output, struct gpujpeg_image_parameters param_image_from,
        struct gpujpeg_image_parameters param_image_to)
{
    assert(param_image_from.width == param_image_to.width);
    assert(param_image_from.height == param_image_to.height);
    assert(param_image_from.comp_count == param_image_to.comp_count);

    // All locals are declared before the first jump to the cleanup label
    struct gpujpeg_coder coder;
    uint8_t* image = NULL;
    uint8_t* buffer = NULL;
    int coder_ready = 0;             // 1 while coder holds initialized buffers
    const char* failed_step = NULL;  // Description of the step that failed

    // Load image
    int image_size = gpujpeg_image_calculate_size(&param_image_from);
    if ( gpujpeg_image_load_from_file(input, &image, &image_size) != 0 ) {
        fprintf(stderr, "[GPUJPEG] [Error] Failed to load image [%s]!\n", input);
        return;
    }

    gpujpeg_set_default_parameters(&coder.param);
    coder.param.color_space_internal = GPUJPEG_RGB;

    // Initialize coder and preprocessor
    coder.param_image = param_image_from;
    if ( gpujpeg_coder_init(&coder) != 0 ) {
        // The coder is not deinitialized after a failed initialization,
        // because the state of its buffers is not known
        failed_step = "initialize coder for input format";
        goto cleanup;
    }
    coder_ready = 1;
    if ( gpujpeg_preprocessor_encoder_init(&coder) != 0 ) {
        failed_step = "initialize encoder preprocessor";
        goto cleanup;
    }

    // Perform preprocessor
    if ( cudaMemcpy(coder.d_data_raw, image, coder.data_raw_size * sizeof(uint8_t), cudaMemcpyHostToDevice) != cudaSuccess ) {
        failed_step = "copy input image to device";
        goto cleanup;
    }
    if ( gpujpeg_preprocessor_encode(&coder) != 0 ) {
        failed_step = "run encoder preprocessor";
        goto cleanup;
    }

    // Save preprocessor result
    if ( cudaMallocHost((void**)&buffer, coder.data_size * sizeof(uint8_t)) != cudaSuccess || buffer == NULL ) {
        buffer = NULL;
        failed_step = "allocate intermediate buffer";
        goto cleanup;
    }
    if ( cudaMemcpy(buffer, coder.d_data, coder.data_size * sizeof(uint8_t), cudaMemcpyDeviceToHost) != cudaSuccess ) {
        failed_step = "copy preprocessed data from device";
        goto cleanup;
    }

    // Deinitialize coder
    gpujpeg_coder_deinit(&coder);
    coder_ready = 0;

    // Initialize coder and postprocessor
    coder.param_image = param_image_to;
    if ( gpujpeg_coder_init(&coder) != 0 ) {
        failed_step = "initialize coder for output format";
        goto cleanup;
    }
    coder_ready = 1;
    if ( gpujpeg_preprocessor_decoder_init(&coder) != 0 ) {
        failed_step = "initialize decoder preprocessor";
        goto cleanup;
    }

    // Perform postprocessor
    if ( cudaMemcpy(coder.d_data, buffer, coder.data_size * sizeof(uint8_t), cudaMemcpyHostToDevice) != cudaSuccess ) {
        failed_step = "copy preprocessed data to device";
        goto cleanup;
    }
    if ( gpujpeg_preprocessor_decode(&coder) != 0 ) {
        failed_step = "run decoder preprocessor";
        goto cleanup;
    }

    // Save postprocessor result
    if ( cudaMemcpy(coder.data_raw, coder.d_data_raw, coder.data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToHost) != cudaSuccess ) {
        failed_step = "copy output image from device";
        goto cleanup;
    }
    if ( gpujpeg_image_save_to_file(output, coder.data_raw, coder.data_raw_size) != 0 ) {
        fprintf(stderr, "[GPUJPEG] [Error] Failed to save image [%s]!\n", output);
        goto cleanup;
    }

cleanup:
    if ( failed_step != NULL )
        fprintf(stderr, "[GPUJPEG] [Error] Failed to %s while converting image [%s]!\n", failed_step, input);
    // Release everything that was acquired, on success and on failure alike
    if ( coder_ready )
        gpujpeg_coder_deinit(&coder);
    if ( buffer != NULL )
        cudaFreeHost(buffer);
    if ( image != NULL )
        gpujpeg_image_destroy(image);
}
/**
 * OpenGL initialization entry point.
 * (The public API documentation lives at the declaration.)
 *
 * This implementation is a stub: it calls abort() unconditionally and
 * therefore never returns a value to the caller.
 */
int
gpujpeg_opengl_init()
{
// Not implemented here; terminate the process
abort();
}
/**
 * Create an OpenGL RGB texture of the given size.
 * (The public API documentation lives at the declaration.)
 *
 * The texture uses linear filtering and clamp-to-edge wrapping and is
 * initialized from `data` (GL_RGB, GL_UNSIGNED_BYTE). Without
 * GPUJPEG_USE_OPENGL the macro GPUJPEG_EXIT_MISSING_OPENGL is invoked instead.
 *
 * @param width   Texture width in pixels
 * @param height  Texture height in pixels
 * @param data    Pixel data passed to glTexImage2D
 * @return Identifier of the created texture (0 when nothing was created)
 */
int
gpujpeg_opengl_texture_create(int width, int height, uint8_t* data)
{
    int result = 0;

#ifdef GPUJPEG_USE_OPENGL
    // OpenGL expects a pointer to GLuint for the generated name, not to int
    GLuint texture_id = 0;
    glGenTextures(1, &texture_id);
    glBindTexture(GL_TEXTURE_2D, texture_id);

    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);

    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, width, height, 0, GL_RGB, GL_UNSIGNED_BYTE, data);

    glBindTexture(GL_TEXTURE_2D, 0);

    result = (int)texture_id;
#else
    GPUJPEG_EXIT_MISSING_OPENGL();
#endif

    return result;
}
/**
 * Upload new RGB pixel data into an existing OpenGL texture.
 * (The public API documentation lives at the declaration.)
 *
 * The texture size is queried from OpenGL (level 0) and must be non-zero
 * (asserted). Without GPUJPEG_USE_OPENGL the macro
 * GPUJPEG_EXIT_MISSING_OPENGL is invoked instead.
 *
 * @param texture_id  Identifier of the texture
 * @param data        Pixel data (GL_RGB, GL_UNSIGNED_BYTE)
 * @return 0 always
 */
int
gpujpeg_opengl_texture_set_data(int texture_id, uint8_t* data)
{
#ifdef GPUJPEG_USE_OPENGL
    int width = 0;
    int height = 0;

    glBindTexture(GL_TEXTURE_2D, texture_id);

    // Ask OpenGL for the dimensions the texture was created with
    glGetTexLevelParameteriv(GL_TEXTURE_2D, 0, GL_TEXTURE_WIDTH, &width);
    glGetTexLevelParameteriv(GL_TEXTURE_2D, 0, GL_TEXTURE_HEIGHT, &height);
    assert(width != 0 && height != 0);

    // Replace the texture content
    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, width, height, 0, GL_RGB, GL_UNSIGNED_BYTE, data);

    glBindTexture(GL_TEXTURE_2D, 0);
#else
    GPUJPEG_EXIT_MISSING_OPENGL();
#endif

    return 0;
}
/**
 * Read the RGB pixel data of an OpenGL texture into a host buffer.
 * (The public API documentation lives at the declaration.)
 *
 * The texture size is queried from OpenGL (level 0) and must be non-zero
 * (asserted). Without GPUJPEG_USE_OPENGL the macro
 * GPUJPEG_EXIT_MISSING_OPENGL is invoked instead.
 *
 * @param texture_id  Identifier of the texture
 * @param data        Destination buffer for width * height * 3 bytes
 * @param data_size   When not NULL, receives width * height * 3
 * @return 0 always
 */
int
gpujpeg_opengl_texture_get_data(int texture_id, uint8_t* data, int* data_size)
{
#ifdef GPUJPEG_USE_OPENGL
    int width = 0;
    int height = 0;

    glBindTexture(GL_TEXTURE_2D, texture_id);

    // Ask OpenGL for the dimensions of the texture
    glGetTexLevelParameteriv(GL_TEXTURE_2D, 0, GL_TEXTURE_WIDTH, &width);
    glGetTexLevelParameteriv(GL_TEXTURE_2D, 0, GL_TEXTURE_HEIGHT, &height);
    assert(width != 0 && height != 0);

    // Download the texture content
    glGetTexImage(GL_TEXTURE_2D, 0, GL_RGB, GL_UNSIGNED_BYTE, data);
    if ( data_size != NULL ) {
        // Three bytes per pixel
        *data_size = width * height * 3;
    }

    glBindTexture(GL_TEXTURE_2D, 0);
#else
    GPUJPEG_EXIT_MISSING_OPENGL();
#endif

    return 0;
}
/**
 * Delete an OpenGL texture.
 * (The public API documentation lives at the declaration.)
 *
 * Without GPUJPEG_USE_OPENGL the macro GPUJPEG_EXIT_MISSING_OPENGL is invoked
 * instead.
 *
 * @param texture_id  Identifier of the texture to delete
 */
void
gpujpeg_opengl_texture_destroy(int texture_id)
{
#ifdef GPUJPEG_USE_OPENGL
    // OpenGL expects a pointer to GLuint for the texture name, not to int
    GLuint gl_texture_id = (GLuint)texture_id;
    glDeleteTextures(1, &gl_texture_id);
#else
    GPUJPEG_EXIT_MISSING_OPENGL();
#endif
}
/**
 * Register an OpenGL texture for use with CUDA.
 * (The public API documentation lives at the declaration.)
 *
 * A descriptor is allocated in pinned host memory and filled in. With
 * GPUJPEG_USE_OPENGL the texture size is queried, a pixel buffer object of
 * width * height * 3 bytes is created (pack buffer for reading, unpack buffer
 * for writing) and registered as a CUDA graphics resource. Without
 * GPUJPEG_USE_OPENGL the macro GPUJPEG_EXIT_MISSING_OPENGL is invoked instead.
 *
 * @param texture_id    Identifier of the OpenGL texture
 * @param texture_type  GPUJPEG_OPENGL_TEXTURE_READ or GPUJPEG_OPENGL_TEXTURE_WRITE
 * @return Descriptor of the registered texture
 */
struct gpujpeg_opengl_texture*
gpujpeg_opengl_texture_register(int texture_id, enum gpujpeg_opengl_texture_type texture_type)
{
    struct gpujpeg_opengl_texture* texture = NULL;
    cudaMallocHost((void**)&texture, sizeof(struct gpujpeg_opengl_texture));
    assert(texture != NULL);

    // Identity supplied by the caller
    texture->texture_id = texture_id;
    texture->texture_type = texture_type;
    // Everything else starts empty
    texture->texture_callback_detach_opengl = NULL;
    texture->texture_callback_attach_opengl = NULL;
    texture->texture_callback_param = NULL;
    texture->texture_pbo_resource = 0;
    texture->texture_pbo_type = 0;
    texture->texture_pbo_id = 0;
    texture->texture_height = 0;
    texture->texture_width = 0;

#ifdef GPUJPEG_USE_OPENGL
    // Query the texture dimensions
    glBindTexture(GL_TEXTURE_2D, texture->texture_id);
    glGetTexLevelParameteriv(GL_TEXTURE_2D, 0, GL_TEXTURE_WIDTH, &texture->texture_width);
    glGetTexLevelParameteriv(GL_TEXTURE_2D, 0, GL_TEXTURE_HEIGHT, &texture->texture_height);
    glBindTexture(GL_TEXTURE_2D, 0);
    assert(texture->texture_width != 0 && texture->texture_height != 0);

    // Select PBO type according to the transfer direction
    switch ( texture->texture_type ) {
    case GPUJPEG_OPENGL_TEXTURE_READ:
        texture->texture_pbo_type = GL_PIXEL_PACK_BUFFER;
        break;
    case GPUJPEG_OPENGL_TEXTURE_WRITE:
        texture->texture_pbo_type = GL_PIXEL_UNPACK_BUFFER;
        break;
    default:
        assert(0);
        break;
    }

    // Create PBO large enough for three bytes per pixel
    glGenBuffers(1, &texture->texture_pbo_id);
    glBindBuffer(texture->texture_pbo_type, texture->texture_pbo_id);
    glBufferData(texture->texture_pbo_type, texture->texture_width * texture->texture_height * 3 * sizeof(uint8_t), NULL, GL_DYNAMIC_DRAW);
    glBindBuffer(texture->texture_pbo_type, 0);

    // Create CUDA PBO Resource
    cudaGraphicsGLRegisterBuffer(&texture->texture_pbo_resource, texture->texture_pbo_id, cudaGraphicsMapFlagsNone);
    gpujpeg_cuda_check_error("Register OpenGL buffer");
#else
    GPUJPEG_EXIT_MISSING_OPENGL();
#endif

    return texture;
}
/**
 * Unregister a texture registered by gpujpeg_opengl_texture_register() and
 * release its descriptor.
 * (The public API documentation lives at the declaration.)
 *
 * With GPUJPEG_USE_OPENGL the CUDA graphics resource is unregistered first
 * and the pixel buffer object it refers to is deleted afterwards. Without
 * GPUJPEG_USE_OPENGL the macro GPUJPEG_EXIT_MISSING_OPENGL is invoked instead.
 *
 * @param texture  Descriptor to release, must not be NULL
 */
void
gpujpeg_opengl_texture_unregister(struct gpujpeg_opengl_texture* texture)
{
    // Validate the descriptor before it is dereferenced
    assert(texture != NULL);

#ifdef GPUJPEG_USE_OPENGL
    // Detach CUDA from the buffer while the buffer still exists
    if ( texture->texture_pbo_resource != NULL ) {
        cudaGraphicsUnregisterResource(texture->texture_pbo_resource);
    }
    if ( texture->texture_pbo_id != 0 ) {
        glDeleteBuffers(1, &texture->texture_pbo_id);
    }
#else
    GPUJPEG_EXIT_MISSING_OPENGL();
#endif

    cudaFreeHost(texture);
}
/**
 * Map a registered OpenGL texture into CUDA and return its device pointer.
 * (The public API documentation lives at the declaration.)
 *
 * The attach callback (when set) is called first. For a texture registered
 * for reading, the texture content is copied into its pixel buffer object.
 * The pixel buffer object is then mapped as a CUDA resource. The detach
 * callback is not called here; gpujpeg_opengl_texture_unmap() calls it.
 *
 * @param texture    Registered texture with a valid PBO resource
 * @param data_size  When not NULL, receives the size of the mapped buffer
 * @return Device pointer to the mapped pixel buffer object data
 */
uint8_t*
gpujpeg_opengl_texture_map(struct gpujpeg_opengl_texture* texture, int* data_size)
{
    assert(texture->texture_pbo_resource != NULL);
    // The attach and detach callbacks must be either both set or both unset
    assert((texture->texture_callback_attach_opengl == NULL && texture->texture_callback_detach_opengl == NULL) ||
           (texture->texture_callback_attach_opengl != NULL && texture->texture_callback_detach_opengl != NULL));

    // Attach OpenGL context by callback
    if ( texture->texture_callback_attach_opengl != NULL ) {
        texture->texture_callback_attach_opengl(texture->texture_callback_param);
    }

#ifdef GPUJPEG_USE_OPENGL
    if ( texture->texture_type == GPUJPEG_OPENGL_TEXTURE_READ ) {
        assert(texture->texture_pbo_type == GL_PIXEL_PACK_BUFFER);

        // With a pack buffer bound, glGetTexImage writes into the buffer at offset 0
        glBindTexture(GL_TEXTURE_2D, texture->texture_id);
        glBindBuffer(GL_PIXEL_PACK_BUFFER, texture->texture_pbo_id);
        glGetTexImage(GL_TEXTURE_2D, 0, GL_RGB, GL_UNSIGNED_BYTE, 0);
        glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
        glBindTexture(GL_TEXTURE_2D, 0);
    }
#else
    GPUJPEG_EXIT_MISSING_OPENGL();
#endif

    // Map pixel buffer object to cuda
    cudaGraphicsMapResources(1, &texture->texture_pbo_resource, 0);
    gpujpeg_cuda_check_error("Encoder map texture PBO resource");

    // Get device data pointer to pixel buffer object data
    uint8_t* d_mapped = NULL;
    size_t d_mapped_size;
    cudaGraphicsResourceGetMappedPointer((void **)&d_mapped, &d_mapped_size, texture->texture_pbo_resource);
    gpujpeg_cuda_check_error("Encoder get device pointer for texture PBO resource");

    if ( data_size != NULL ) {
        *data_size = d_mapped_size;
    }

    return d_mapped;
}
/**
 * Unmap a texture mapped by gpujpeg_opengl_texture_map().
 * (The public API documentation lives at the declaration.)
 *
 * The CUDA mapping of the pixel buffer object is released. For a texture
 * registered for writing, the pixel buffer object content is then uploaded
 * into the texture. Finally the detach callback (when set) is called.
 *
 * @param texture  Mapped texture
 */
void
gpujpeg_opengl_texture_unmap(struct gpujpeg_opengl_texture* texture)
{
    // Unmap pbo
    cudaGraphicsUnmapResources(1, &texture->texture_pbo_resource, 0);
    gpujpeg_cuda_check_error("Encoder unmap texture PBO resource");

#ifdef GPUJPEG_USE_OPENGL
    if ( texture->texture_type == GPUJPEG_OPENGL_TEXTURE_WRITE ) {
        assert(texture->texture_pbo_type == GL_PIXEL_UNPACK_BUFFER);

        // With an unpack buffer bound, glTexImage2D reads from the buffer at offset 0
        glBindTexture(GL_TEXTURE_2D, texture->texture_id);
        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture->texture_pbo_id);
        glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, texture->texture_width, texture->texture_height, 0, GL_RGB, GL_UNSIGNED_BYTE, NULL);
        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
        glBindTexture(GL_TEXTURE_2D, 0);

        // Wait until OpenGL has finished the upload
        glFinish();
    }
#else
    GPUJPEG_EXIT_MISSING_OPENGL();
#endif

    // Detach OpenGL context by callback
    if ( texture->texture_callback_detach_opengl != NULL ) {
        texture->texture_callback_detach_opengl(texture->texture_callback_param);
    }
}