/**
 * Copyright (c) 2011, CESNET z.s.p.o
 * Copyright (c) 2011, Silicon Genome, LLC.
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 
#include "gpujpeg_common.h"
#include "gpujpeg_util.h"
#include "gpujpeg_preprocessor.h"
#include <npp.h>
#include <cuda_gl_interop.h>
#include <limits.h>
#include <math.h>
#include <string.h>

#ifdef GPUJPEG_USE_OPENGL
#       ifdef HAVE_MACOSX
#               include <OpenGL/GL.h>
#       else
#               include <GL/gl.h>
#       endif
#endif


/** Documented at declaration */
struct gpujpeg_devices_info
gpujpeg_get_devices_info()
{
    struct gpujpeg_devices_info devices_info;
    
    if ( cudaGetDeviceCount(&devices_info.device_count) != cudaSuccess ) {
        fprintf(stderr, "[GPUJPEG] [Error] CUDA Driver and Runtime version may be mismatched.\n");
        exit(-1);
    }
    
    if ( devices_info.device_count > GPUJPEG_MAX_DEVICE_COUNT ) {
        fprintf(stderr, "[GPUJPEG] [Warning] There are available more CUDA devices (%d) than maximum count (%d).\n",
            devices_info.device_count, GPUJPEG_MAX_DEVICE_COUNT);
        fprintf(stderr, "[GPUJPEG] [Warning] Using maximum count (%d).\n", GPUJPEG_MAX_DEVICE_COUNT);
        devices_info.device_count = GPUJPEG_MAX_DEVICE_COUNT;
    }

    for ( int device_id = 0; device_id < devices_info.device_count; device_id++ ) {
        struct cudaDeviceProp device_properties;
        cudaGetDeviceProperties(&device_properties, device_id);
        
        struct gpujpeg_device_info* device_info = &devices_info.device[device_id];
        
        device_info->id = device_id;
        strncpy(device_info->name, device_properties.name, 255);
        device_info->cc_major = device_properties.major;
        device_info->cc_minor = device_properties.minor;
        device_info->global_memory = device_properties.totalGlobalMem;
        device_info->constant_memory = device_properties.totalConstMem;
        device_info->shared_memory = device_properties.sharedMemPerBlock;
        device_info->register_count = device_properties.regsPerBlock;
#if CUDART_VERSION >= 2000
        device_info->multiprocessor_count = device_properties.multiProcessorCount;
#endif
    }
    
    return devices_info;
}

/** Documented at declaration */
void
gpujpeg_print_devices_info()
{
    struct gpujpeg_devices_info devices_info = gpujpeg_get_devices_info();
    if ( devices_info.device_count == 0 ) {
        printf("There is no device supporting CUDA.\n");
        return;
    } else if ( devices_info.device_count == 1 ) {
        printf("There is 1 device supporting CUDA:\n");
    } else {
        printf("There are %d devices supporting CUDA:\n", devices_info.device_count);
    }
    
    for ( int device_id = 0; device_id < devices_info.device_count; device_id++ ) {
        struct gpujpeg_device_info* device_info = &devices_info.device[device_id];
        printf("\nDevice %d: \"%s\"\n", device_info->id, device_info->name);
        printf("  Compute capability: %d.%d\n", device_info->cc_major, device_info->cc_minor);
        printf("  Total amount of global memory: %ld kB\n", device_info->global_memory / 1024);
        printf("  Total amount of constant memory: %ld kB\n", device_info->constant_memory / 1024); 
        printf("  Total amount of shared memory per block: %ld kB\n", device_info->shared_memory / 1024);
        printf("  Total number of registers available per block: %d\n", device_info->register_count);
        printf("  Multiprocessors: %d\n", device_info->multiprocessor_count);
    }
}

/** Documented at declaration */
int
gpujpeg_init_device(int device_id, int flags)
{
    int dev_count;
    cudaGetDeviceCount(&dev_count);
    if ( dev_count == 0 ) {
        fprintf(stderr, "[GPUJPEG] [Error] No CUDA enabled device\n");
        return -1;
    }

    if ( device_id < 0 || device_id >= dev_count ) {
        fprintf(stderr, "[GPUJPEG] [Error] Selected device %d is out of bound. Devices on your system are in range %d - %d\n",
            device_id, 0, dev_count - 1);
        return -1;
    }

    struct cudaDeviceProp devProp;
    if ( cudaSuccess != cudaGetDeviceProperties(&devProp, device_id) ) {
        fprintf(stderr,
            "[GPUJPEG] [Error] Can't get CUDA device properties!\n"
            "[GPUJPEG] [Error] Do you have proper driver for CUDA installed?\n"
        );
        return -1;
    }

    if ( devProp.major < 1 ) {
        fprintf(stderr, "[GPUJPEG] [Error] Device %d does not support CUDA\n", device_id);
        return -1;
    }
    
    if ( flags & GPUJPEG_OPENGL_INTEROPERABILITY ) {
        cudaGLSetGLDevice(device_id);
        gpujpeg_cuda_check_error("Enabling OpenGL interoperability");
    }

    if ( flags & GPUJPEG_VERBOSE ) {
        int cuda_driver_version = 0;
        cudaDriverGetVersion(&cuda_driver_version);
        printf("CUDA driver version:   %d.%d\n", cuda_driver_version / 1000, (cuda_driver_version % 100) / 10);
        
        int cuda_runtime_version = 0;
        cudaRuntimeGetVersion(&cuda_runtime_version);
        printf("CUDA runtime version:  %d.%d\n", cuda_runtime_version / 1000, (cuda_runtime_version % 100) / 10);
        
        const NppLibraryVersion* npp_version = nppGetLibVersion();
        printf("NPP version:           %d.%d\n", npp_version->major, npp_version->minor);
        
        printf("Using Device #%d:       %s (c.c. %d.%d)\n", device_id, devProp.name, devProp.major, devProp.minor);
    }
    
    cudaSetDevice(device_id);
    gpujpeg_cuda_check_error("Set CUDA device");

    // Test by simple copying that the device is ready
    uint8_t data[] = {8};
    uint8_t* d_data = NULL;
    cudaMalloc((void**)&d_data, 1);
    cudaMemcpy(d_data, data, 1, cudaMemcpyHostToDevice);
    cudaFree(d_data);
    cudaError_t error = cudaGetLastError();
    if ( cudaSuccess != error ) {
        fprintf(stderr, "[GPUJPEG] [Error] Failed to initialize CUDA device.\n");
        if ( flags & GPUJPEG_OPENGL_INTEROPERABILITY )
            fprintf(stderr, "[GPUJPEG] [Info]  OpenGL interoperability is used, is OpenGL context available?\n");
        return -1;
    }

    return 0;
}

/**
 * Fill coder parameters with their default values.
 *
 * Documented at declaration.
 *
 * Defaults: not verbose, quality 75, restart interval 8, non-interleaved,
 * no segment info, sampling factor 1x1 for every component and
 * GPUJPEG_YCBCR_BT601_256LVLS as internal color space.
 *
 * @param param  Parameters to be filled
 */
void
gpujpeg_set_default_parameters(struct gpujpeg_parameters* param)
{
    // No subsampling for any component
    int index = 0;
    while ( index < GPUJPEG_MAX_COMPONENT_COUNT ) {
        param->sampling_factor[index].horizontal = 1;
        param->sampling_factor[index].vertical = 1;
        index++;
    }

    // Scalar options
    param->color_space_internal = GPUJPEG_YCBCR_BT601_256LVLS;
    param->segment_info = 0;
    param->interleaved = 0;
    param->restart_interval = 8;
    param->quality = 75;
    param->verbose = 0;
}

/**
 * Configure sampling factors for chroma subsampling.
 *
 * Documented at declaration.
 *
 * The first component gets sampling factor 2x2, every other component 1x1.
 *
 * @param param  Parameters whose sampling factors are overwritten
 */
void
gpujpeg_parameters_chroma_subsampling(struct gpujpeg_parameters* param)
{
    int index = 0;
    while ( index < GPUJPEG_MAX_COMPONENT_COUNT ) {
        // Only component 0 is sampled at the doubled rate
        const int factor = (index == 0) ? 2 : 1;
        param->sampling_factor[index].horizontal = factor;
        param->sampling_factor[index].vertical = factor;
        index++;
    }
}

/**
 * Fill image parameters with their default values.
 *
 * Documented at declaration.
 *
 * Defaults: zero width and height, 3 components, GPUJPEG_RGB color space
 * and GPUJPEG_4_4_4 sampling.
 *
 * @param param  Image parameters to be filled
 */
void
gpujpeg_image_set_default_parameters(struct gpujpeg_image_parameters* param)
{
    // Pixel format
    param->sampling_factor = GPUJPEG_4_4_4;
    param->color_space = GPUJPEG_RGB;
    param->comp_count = 3;

    // Dimensions are unknown until the caller sets them
    param->height = 0;
    param->width = 0;
}

/**
 * Guess the image file format from the extension of a file name.
 *
 * Documented at declaration.
 *
 * The text after the last '.' is compared case-insensitively, and only in
 * its first three characters, with "raw", "rgb", "yuv" and "jpg".
 *
 * @param filename  File name to inspect
 * @return Matching format, or GPUJPEG_IMAGE_FILE_UNKNOWN when the name is NULL,
 *         has no extension or the extension is not recognized
 */
enum gpujpeg_image_file_format
gpujpeg_image_get_file_format(const char* filename)
{
    static const char *extension[] = { "raw", "rgb", "yuv", "jpg" };
    static const enum gpujpeg_image_file_format format[] = { GPUJPEG_IMAGE_FILE_RAW, GPUJPEG_IMAGE_FILE_RGB, GPUJPEG_IMAGE_FILE_YUV, GPUJPEG_IMAGE_FILE_JPEG };

    if ( filename == NULL )
        return GPUJPEG_IMAGE_FILE_UNKNOWN;

    // A name without extension is reported as unknown format rather than as a
    // value outside of the enumeration
    const char* ext = strrchr(filename, '.');
    if ( ext == NULL )
        return GPUJPEG_IMAGE_FILE_UNKNOWN;
    ext++;

    for ( size_t i = 0; i < sizeof(format) / sizeof(*format); i++ ) {
        if ( strncasecmp(ext, extension[i], 3) == 0 ) {
            return format[i];
        }
    }
    return GPUJPEG_IMAGE_FILE_UNKNOWN;
}

/** Documented at declaration */
void
gpujpeg_component_print8(struct gpujpeg_component* component, uint8_t* d_data)
{
    int data_size = component->data_width * component->data_height;
    uint8_t* data = NULL;
    cudaMallocHost((void**)&data, data_size * sizeof(uint8_t)); 
    cudaMemcpy(data, d_data, data_size * sizeof(uint8_t), cudaMemcpyDeviceToHost);
    
    printf("Print Data\n");
    for ( int y = 0; y < component->data_height; y++ ) {
        for ( int x = 0; x < component->data_width; x++ ) {
            printf("%3u ", data[y * component->data_width + x]);
        }
        printf("\n");
    }
    cudaFreeHost(data);
}

/** Documented at declaration */
void
gpujpeg_component_print16(struct gpujpeg_component* component, int16_t* d_data)
{
    int data_size = component->data_width * component->data_height;
    int16_t* data = NULL;
    cudaMallocHost((void**)&data, data_size * sizeof(int16_t)); 
    cudaMemcpy(data, d_data, data_size * sizeof(int16_t), cudaMemcpyDeviceToHost);
    
    printf("Print Data\n");
    for ( int y = 0; y < component->data_height; y++ ) {
        for ( int x = 0; x < component->data_width; x++ ) {
            printf("%3d ", data[y * component->data_width + x]);
        }
        printf("\n");
    }
    cudaFreeHost(data);
}

/** Documented at declaration */
int
gpujpeg_coder_init(struct gpujpeg_coder* coder)
{
    int result = 1;
    
    coder->preprocessor = NULL;
    
    // Allocate color components
    cudaMallocHost((void**)&coder->component, coder->param_image.comp_count * sizeof(struct gpujpeg_component));
    if ( coder->component == NULL )
        result = 0;
    // Allocate color components in device memory
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_component, coder->param_image.comp_count * sizeof(struct gpujpeg_component)) )
        result = 0;
    gpujpeg_cuda_check_error("Coder color component allocation");
        
    // Initialize sampling factors and compute maximum sampling factor to coder->sampling_factor
    coder->sampling_factor.horizontal = 0;
    coder->sampling_factor.vertical = 0;
    for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
        assert(coder->param.sampling_factor[comp].horizontal >= 1 && coder->param.sampling_factor[comp].horizontal <= 15);
        assert(coder->param.sampling_factor[comp].vertical >= 1 && coder->param.sampling_factor[comp].vertical <= 15);
        coder->component[comp].sampling_factor = coder->param.sampling_factor[comp];
        if ( coder->component[comp].sampling_factor.horizontal > coder->sampling_factor.horizontal )
            coder->sampling_factor.horizontal = coder->component[comp].sampling_factor.horizontal;
        if ( coder->component[comp].sampling_factor.vertical > coder->sampling_factor.vertical )
            coder->sampling_factor.vertical = coder->component[comp].sampling_factor.vertical;
    }
    
    // Calculate data size
    coder->data_raw_size = gpujpeg_image_calculate_size(&coder->param_image);
    coder->data_size = 0;
    
    // Initialize color components
    for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
        // Get component
        struct gpujpeg_component* component = &coder->component[comp];
        
        // Set type
        component->type = (comp == 0) ? GPUJPEG_COMPONENT_LUMINANCE : GPUJPEG_COMPONENT_CHROMINANCE;
        
        // Set proper color component sizes in pixels based on sampling factors
        int samp_factor_h = component->sampling_factor.horizontal;
        int samp_factor_v = component->sampling_factor.vertical;
        component->width = (coder->param_image.width * samp_factor_h) / coder->sampling_factor.horizontal;
        component->height = (coder->param_image.height * samp_factor_v) / coder->sampling_factor.vertical;
        
        // Compute component MCU size
        component->mcu_size_x = GPUJPEG_BLOCK_SIZE;
        component->mcu_size_y = GPUJPEG_BLOCK_SIZE;
        if ( coder->param.interleaved == 1 ) {
            component->mcu_compressed_size = GPUJPEG_MAX_BLOCK_COMPRESSED_SIZE * samp_factor_h * samp_factor_v;
            component->mcu_size_x *= samp_factor_h;
            component->mcu_size_y *= samp_factor_v;
        } else {
            component->mcu_compressed_size = GPUJPEG_MAX_BLOCK_COMPRESSED_SIZE;
        }
        component->mcu_size = component->mcu_size_x * component->mcu_size_y;
        
        // Compute allocated data size
        component->data_width = gpujpeg_div_and_round_up(component->width, component->mcu_size_x) * component->mcu_size_x;
        component->data_height = gpujpeg_div_and_round_up(component->height, component->mcu_size_y) * component->mcu_size_y;
        component->data_size = component->data_width * component->data_height;
        // Increase total data size
        coder->data_size += component->data_size;
        
        // Compute component MCU count
        component->mcu_count_x = gpujpeg_div_and_round_up(component->data_width, component->mcu_size_x);
        component->mcu_count_y = gpujpeg_div_and_round_up(component->data_height, component->mcu_size_y);
        component->mcu_count = component->mcu_count_x * component->mcu_count_y;
        
        // Compute MCU count per segment
        component->segment_mcu_count = coder->param.restart_interval;
        if ( component->segment_mcu_count == 0 ) {
            // If restart interval is disabled, restart interval is equal MCU count
            component->segment_mcu_count = component->mcu_count;
        }
        
        // Calculate segment count
        component->segment_count = gpujpeg_div_and_round_up(component->mcu_count, component->segment_mcu_count);
        
        //printf("Subsampling %dx%d, Resolution %d, %d, mcu size %d, mcu count %d\n",
        //    coder->param.sampling_factor[comp].horizontal, coder->param.sampling_factor[comp].vertical,
        //    component->data_width, component->data_height,
        //    component->mcu_compressed_size, component->mcu_count
        //);
    }
    
    // Maximum component data size for allocated buffers
    coder->data_width = gpujpeg_div_and_round_up(coder->param_image.width, GPUJPEG_BLOCK_SIZE) * GPUJPEG_BLOCK_SIZE;
    coder->data_height = gpujpeg_div_and_round_up(coder->param_image.height, GPUJPEG_BLOCK_SIZE) * GPUJPEG_BLOCK_SIZE;
    
    // Compute MCU size, MCU count, segment count and compressed data allocation size
    coder->mcu_count = 0;
    coder->mcu_size = 0;
    coder->mcu_compressed_size = 0;
    coder->segment_count = 0;
    coder->data_compressed_size = 0;
    if ( coder->param.interleaved == 1 ) {
        assert(coder->param_image.comp_count > 0);
        coder->mcu_count = coder->component[0].mcu_count;
        coder->segment_count = coder->component[0].segment_count;
        coder->segment_mcu_count = coder->component[0].segment_mcu_count;
        for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
            struct gpujpeg_component* component = &coder->component[comp];
            assert(coder->mcu_count == component->mcu_count);
            assert(coder->segment_mcu_count == component->segment_mcu_count);
            coder->mcu_size += component->mcu_size;
            coder->mcu_compressed_size += component->mcu_compressed_size;
        }
    } else {
        assert(coder->param_image.comp_count > 0);
        coder->mcu_size = coder->component[0].mcu_size;
        coder->mcu_compressed_size = coder->component[0].mcu_compressed_size;
        coder->segment_mcu_count = 0;
        for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
            struct gpujpeg_component* component = &coder->component[comp];
            assert(coder->mcu_size == component->mcu_size);
            assert(coder->mcu_compressed_size == component->mcu_compressed_size);
            coder->mcu_count += component->mcu_count;
            coder->segment_count += component->segment_count;
        }
    }
    //printf("mcu size %d -> %d, mcu count %d, segment mcu count %d\n", coder->mcu_size, coder->mcu_compressed_size, coder->mcu_count, coder->segment_mcu_count);

    // Allocate segments
    cudaMallocHost((void**)&coder->segment, coder->segment_count * sizeof(struct gpujpeg_segment));
    if ( coder->segment == NULL )
        result = 0;
    // Allocate segments in device memory
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_segment, coder->segment_count * sizeof(struct gpujpeg_segment)) )
        result = 0;
    gpujpeg_cuda_check_error("Coder segment allocation");
    
    // Prepare segments
    if ( result == 1 ) {            
        // While preparing segments compute input size and compressed size
        int data_index = 0;
        int data_compressed_index = 0;
        
        // Prepare segments based on (non-)interleaved mode
        if ( coder->param.interleaved == 1 ) {
            // Prepare segments for encoding (only one scan for all color components)
            int mcu_index = 0;
            for ( int index = 0; index < coder->segment_count; index++ ) {
                // Prepare segment MCU count
                int mcu_count = coder->segment_mcu_count;
                if ( (mcu_index + mcu_count) >= coder->mcu_count )
                    mcu_count = coder->mcu_count - mcu_index;
                // Set parameters for segment
                coder->segment[index].scan_index = 0;
                coder->segment[index].scan_segment_index = index;
                coder->segment[index].mcu_count = mcu_count;
                coder->segment[index].data_compressed_index = data_compressed_index;
                coder->segment[index].data_compressed_size = 0;
                // Increase parameters for next segment
                data_index += mcu_count * coder->mcu_size;
                data_compressed_index += mcu_count * coder->mcu_compressed_size;
                mcu_index += mcu_count;
            }
        } else {
            // Prepare segments for encoding (one scan for each color component)
            int index = 0;
            for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
                // Get component
                struct gpujpeg_component* component = &coder->component[comp];
                // Prepare component segments
                int mcu_index = 0;
                for ( int segment = 0; segment < component->segment_count; segment++ ) {
                    // Prepare segment MCU count
                    int mcu_count = component->segment_mcu_count;
                    if ( (mcu_index + mcu_count) >= component->mcu_count )
                        mcu_count = component->mcu_count - mcu_index;
                    // Set parameters for segment
                    coder->segment[index].scan_index = comp;
                    coder->segment[index].scan_segment_index = segment;
                    coder->segment[index].mcu_count = mcu_count;
                    coder->segment[index].data_compressed_index = data_compressed_index;
                    coder->segment[index].data_compressed_size = 0;
                    // Increase parameters for next segment
                    data_index += mcu_count * component->mcu_size;
                    data_compressed_index += mcu_count * component->mcu_compressed_size;
                    mcu_index += mcu_count;
                    index++;
                }
            }
        }
        
        // Check data size
        //printf("%d == %d\n", coder->data_size, data_index);
        assert(coder->data_size == data_index);
            
        // Set compressed size
        coder->data_compressed_size = data_compressed_index;
    }
    //printf("Compressed size %d (segments %d)\n", coder->data_compressed_size, coder->segment_count);
        
    // Copy segments to device memory
    if ( cudaSuccess != cudaMemcpy(coder->d_segment, coder->segment, coder->segment_count * sizeof(struct gpujpeg_segment), cudaMemcpyHostToDevice) )
        result = 0;

    // Print allocation info
    if ( coder->param.verbose ) {
        int structures_size = 0;
        structures_size += coder->segment_count * sizeof(struct gpujpeg_segment);
        structures_size += coder->param_image.comp_count * sizeof(struct gpujpeg_component);
        int total_size = 0;
        total_size += structures_size;
        total_size += coder->data_raw_size;
        total_size += coder->data_size;
        total_size += coder->data_size * 2;
        total_size += coder->data_compressed_size;

        printf("\nAllocation Info:\n");
        printf("    Segment Count:            %d\n", coder->segment_count);
        printf("    Allocated Data Size:      %dx%d\n", coder->data_width, coder->data_height);
        printf("    Raw Buffer Size:          %d MB\n", coder->data_raw_size / (1024 * 1024));
        printf("    Preprocessor Buffer Size: %d MB\n", coder->data_size / (1024 * 1024));
        printf("    DCT Buffer Size:          %d MB\n", 2 * coder->data_size / (1024 * 1024));
        printf("    Compressed Buffer Size:   %d MB\n", coder->data_compressed_size / (1024 * 1024));
        printf("    Structures Size:          %d kB\n", structures_size / (1024));
        printf("    Total GPU Memory Size:    %d MB\n", total_size / (1024 * 1024));
        printf("");
    }

    // Allocate data buffers for all color components
    if ( cudaSuccess != cudaMallocHost((void**)&coder->data_raw, coder->data_raw_size * sizeof(uint8_t)) )
        return -1;
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t)) )
        result = 0;
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_data, coder->data_size * sizeof(uint8_t)) )
        result = 0;
    if ( cudaSuccess != cudaMallocHost((void**)&coder->data_quantized, coder->data_size * sizeof(int16_t)) )
        result = 0;
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_data_quantized, coder->data_size * sizeof(int16_t)) )
         result = 0;
    gpujpeg_cuda_check_error("Coder data allocation");

    // Set data buffer to color components
    uint8_t* d_comp_data = coder->d_data;
    int16_t* d_comp_data_quantized = coder->d_data_quantized;
    int16_t* comp_data_quantized = coder->data_quantized;
    for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
        struct gpujpeg_component* component = &coder->component[comp];
        component->d_data = d_comp_data;
        component->d_data_quantized = d_comp_data_quantized;
        component->data_quantized = comp_data_quantized;
        d_comp_data += component->data_width * component->data_height;
        d_comp_data_quantized += component->data_width * component->data_height;
        comp_data_quantized += component->data_width * component->data_height;
     }

    // Copy components to device memory
    if ( cudaSuccess != cudaMemcpy(coder->d_component, coder->component, coder->param_image.comp_count * sizeof(struct gpujpeg_component), cudaMemcpyHostToDevice) )
        result = 0;
    gpujpeg_cuda_check_error("Coder component copy");
        
    // Allocate compressed data
    int max_compressed_data_size = coder->data_compressed_size;
    max_compressed_data_size += GPUJPEG_BLOCK_SIZE * GPUJPEG_BLOCK_SIZE;
    max_compressed_data_size *= 2;
    if ( cudaSuccess != cudaMallocHost((void**)&coder->data_compressed, max_compressed_data_size * sizeof(uint8_t)) ) 
        result = 0;   
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_data_compressed, max_compressed_data_size * sizeof(uint8_t)) ) 
        result = 0;   
    gpujpeg_cuda_check_error("Coder data compressed allocation");
     
    return 0;
}

/**
 * Release the buffers owned by a coder.
 *
 * Documented at declaration.
 *
 * Frees every non-NULL host and device buffer allocated by
 * gpujpeg_coder_init(), including the component arrays, and resets the
 * pointers to NULL so a repeated call does not free them twice.
 *
 * @param coder  Coder whose buffers are released
 * @return Always 0
 */
int
gpujpeg_coder_deinit(struct gpujpeg_coder* coder)
{
    // Component arrays are allocated by gpujpeg_coder_init() as well
    if ( coder->component != NULL ) {
        cudaFreeHost(coder->component);
        coder->component = NULL;
    }
    if ( coder->d_component != NULL ) {
        cudaFree(coder->d_component);
        coder->d_component = NULL;
    }
    if ( coder->data_raw != NULL ) {
        cudaFreeHost(coder->data_raw);
        coder->data_raw = NULL;
    }
    if ( coder->d_data_raw != NULL ) {
        cudaFree(coder->d_data_raw);
        coder->d_data_raw = NULL;
    }
    if ( coder->d_data != NULL ) {
        cudaFree(coder->d_data);
        coder->d_data = NULL;
    }
    if ( coder->data_quantized != NULL ) {
        cudaFreeHost(coder->data_quantized);
        coder->data_quantized = NULL;
    }
    if ( coder->d_data_quantized != NULL ) {
        cudaFree(coder->d_data_quantized);
        coder->d_data_quantized = NULL;
    }
    if ( coder->data_compressed != NULL ) {
        cudaFreeHost(coder->data_compressed);
        coder->data_compressed = NULL;
    }
    if ( coder->d_data_compressed != NULL ) {
        cudaFree(coder->d_data_compressed);
        coder->d_data_compressed = NULL;
    }
    if ( coder->segment != NULL ) {
        cudaFreeHost(coder->segment);
        coder->segment = NULL;
    }
    if ( coder->d_segment != NULL ) {
        cudaFree(coder->d_segment);
        coder->d_segment = NULL;
    }
    return 0;
}

/**
 * Compute the size in bytes of a raw image with the given parameters.
 *
 * Documented at declaration.
 *
 * GPUJPEG_4_4_4 images take width * height * comp_count bytes. GPUJPEG_4_2_2
 * images take two bytes per pixel with the width rounded up to an even
 * number. Exactly three components are expected (asserted).
 *
 * @param param  Image parameters
 * @return Image size in bytes; 0 for an unsupported sampling factor when
 *         assertions are disabled
 */
int
gpujpeg_image_calculate_size(struct gpujpeg_image_parameters* param)
{
    assert(param->comp_count == 3);

    switch ( param->sampling_factor ) {
    case GPUJPEG_4_4_4:
        return param->width * param->height * param->comp_count;
    case GPUJPEG_4_2_2: {
        // Pixels are stored in pairs, so an odd width is padded by one pixel
        const int even_width = gpujpeg_div_and_round_up(param->width, 2) * 2;
        return (even_width * param->height) * 2;
    }
    default:
        assert(0);
        return 0;
    }
}

/** Documented at declaration */
int
gpujpeg_image_load_from_file(const char* filename, uint8_t** image, int* image_size)
{
    FILE* file;
    file = fopen(filename, "rb");
    if ( !file ) {
        fprintf(stderr, "[GPUJPEG] [Error] Failed open %s for reading!\n", filename);
        return -1;
    }

    if ( *image_size == 0 ) {
        fseek(file, 0, SEEK_END);
        *image_size = ftell(file);
        rewind(file);
    }
    
    uint8_t* data = NULL;
    cudaMallocHost((void**)&data, *image_size * sizeof(uint8_t));
    if ( *image_size != fread(data, sizeof(uint8_t), *image_size, file) ) {
        fprintf(stderr, "[GPUJPEG] [Error] Failed to load image data [%d bytes] from file %s!\n", *image_size, filename);
        return -1;
    }
    fclose(file);
    
    *image = data;
    
    return 0;
}

/**
 * Write raw image data to a file, replacing previous content of the file.
 *
 * Documented at declaration.
 *
 * @param filename    File to write
 * @param image       Data to be written
 * @param image_size  Number of bytes to write, must not be negative
 * @return 0 when all bytes were written and the file was closed successfully,
 *         otherwise -1
 */
int
gpujpeg_image_save_to_file(const char* filename, uint8_t* image, int image_size)
{
    if ( image_size < 0 ) {
        fprintf(stderr, "[GPUJPEG] [Error] Failed to write image data [%d bytes] to file %s!\n", image_size, filename);
        return -1;
    }

    FILE* file;
    file = fopen(filename, "wb");
    if ( !file ) {
        fprintf(stderr, "[GPUJPEG] [Error] Failed open %s for writing!\n", filename);
        return -1;
    }
    
    const size_t byte_count = (size_t)image_size;
    const size_t written_count = fwrite(image, sizeof(uint8_t), byte_count, file);
    // Close in every case; buffered data is flushed here, so a failing close
    // means that the data did not reach the file
    const int close_result = fclose(file);
    if ( written_count != byte_count || close_result != 0 ) {
        fprintf(stderr, "[GPUJPEG] [Error] Failed to write image data [%d bytes] to file %s!\n", image_size, filename);
        return -1;
    }
    
    return 0;
}

/**
 * Release an image buffer by passing it to cudaFreeHost().
 *
 * Documented at declaration.
 *
 * @param image  Buffer to release
 * @return Always 0; the result of cudaFreeHost() is not evaluated
 */
int
gpujpeg_image_destroy(uint8_t* image)
{
    (void) cudaFreeHost(image);
    return 0;
}

/**
 * Print minimum and maximum sample value of each of the three components of a
 * raw image file.
 *
 * Documented at declaration.
 *
 * For GPUJPEG_4_4_4 three bytes per pixel are read, one per component. For
 * GPUJPEG_4_2_2 two bytes per pixel are read: the second byte goes to the
 * first component and the first byte alternates between the third component
 * (even pixels) and the second component (odd pixels).
 *
 * @param filename         Raw image file
 * @param width            Image width in pixels
 * @param height           Image height in pixels
 * @param sampling_factor  GPUJPEG_4_4_4 or GPUJPEG_4_2_2
 */
void
gpujpeg_image_range_info(const char* filename, int width, int height, enum gpujpeg_sampling_factor sampling_factor)
{
    // Load image
    int image_size = 0;
    uint8_t* image = NULL;
    if ( gpujpeg_image_load_from_file(filename, &image, &image_size) != 0 ) {
        fprintf(stderr, "[GPUJPEG] [Error] Failed to load image [%s]!\n", filename);
        return;
    }

    // Bytes consumed per pixel by the scan below
    int bytes_per_pixel = 0;
    if ( sampling_factor == GPUJPEG_4_4_4 ) {
        bytes_per_pixel = 3;
    } else if ( sampling_factor == GPUJPEG_4_2_2 ) {
        bytes_per_pixel = 2;
    } else {
        assert(0);
    }

    // Never read behind the data that was really loaded from the file
    const long long pixel_count = (width > 0 && height > 0) ? (long long)width * height : 0;
    if ( pixel_count * bytes_per_pixel > (long long)image_size ) {
        fprintf(stderr, "[GPUJPEG] [Error] Image [%s] contains only %d bytes, which is not enough for size %dx%d!\n",
            filename, image_size, width, height);
        gpujpeg_image_destroy(image);
        return;
    }

    int c_min[3] = {256, 256, 256};
    int c_max[3] = {0, 0, 0};

    if ( sampling_factor == GPUJPEG_4_4_4 ) {
        uint8_t* in_ptr = image;
        for ( long long i = 0; i < pixel_count; i++ ) {
            for ( int c = 0; c < 3; c++ ) {
                if ( in_ptr[c] < c_min[c] )
                    c_min[c] = in_ptr[c];
                if ( in_ptr[c] > c_max[c] )
                    c_max[c] = in_ptr[c];
            }
            in_ptr += 3;
        }
    } else if ( sampling_factor == GPUJPEG_4_2_2 ) {
        uint8_t* in_ptr = image;
        for ( long long i = 0; i < pixel_count; i++ ) {
            if ( in_ptr[1] < c_min[0] )
                c_min[0] = in_ptr[1];
            if ( in_ptr[1] > c_max[0] )
                c_max[0] = in_ptr[1];
            if ( i % 2 == 1 ) {
                if ( in_ptr[0] < c_min[1] )
                    c_min[1] = in_ptr[0];
                if ( in_ptr[0] > c_max[1] )
                    c_max[1] = in_ptr[0];
            } else {
                if ( in_ptr[0] < c_min[2] )
                    c_min[2] = in_ptr[0];
                if ( in_ptr[0] > c_max[2] )
                    c_max[2] = in_ptr[0];
            }

            in_ptr += 2;
        }
    }

    printf("Image Samples Range:\n");
    for ( int c = 0; c < 3; c++ ) {
        printf("Component %d: %d - %d\n", c + 1, c_min[c], c_max[c]);
    }

    // Destroy image
    gpujpeg_image_destroy(image);
}

/**
 * Convert a raw image file from one image format to another.
 *
 * Documented at declaration.
 *
 * The input is run through the encoder preprocessor with GPUJPEG_RGB as
 * internal color space and the result through the decoder preprocessor
 * configured for the target parameters; the output of that is saved. Source
 * and target must agree in width, height and component count (asserted).
 *
 * All processing calls are made outside of assert(), so the conversion is
 * also performed when assertions are compiled out. A failing step is reported
 * on stderr and the function returns after releasing what it allocated.
 *
 * @param input             Input raw image file
 * @param output            Output raw image file
 * @param param_image_from  Parameters describing the input image
 * @param param_image_to    Parameters describing the output image
 */
void
gpujpeg_image_convert(const char* input, const char* output, struct gpujpeg_image_parameters param_image_from,
        struct gpujpeg_image_parameters param_image_to)
{
    assert(param_image_from.width == param_image_to.width);
    assert(param_image_from.height == param_image_to.height);
    assert(param_image_from.comp_count == param_image_to.comp_count);

    // Load image
    int image_size = gpujpeg_image_calculate_size(&param_image_from);
    uint8_t* image = NULL;
    if ( gpujpeg_image_load_from_file(input, &image, &image_size) != 0 ) {
        fprintf(stderr, "[GPUJPEG] [Error] Failed to load image [%s]!\n", input);
        return;
    }

    const char* failed_step = NULL;
    uint8_t* buffer = NULL;
    int buffer_size = 0;

    // Start from a zeroed coder so that deinitialization is safe even when
    // the initialization stops half way
    struct gpujpeg_coder coder;
    memset(&coder, 0, sizeof(coder));
    gpujpeg_set_default_parameters(&coder.param);
    coder.param.color_space_internal = GPUJPEG_RGB;

    // Initialize coder and preprocessor, perform preprocessor and save its result
    coder.param_image = param_image_from;
    if ( gpujpeg_coder_init(&coder) != 0 )
        failed_step = "coder initialization";
    else if ( gpujpeg_preprocessor_encoder_init(&coder) != 0 )
        failed_step = "encoder preprocessor initialization";
    else if ( cudaMemcpy(coder.d_data_raw, image, coder.data_raw_size * sizeof(uint8_t), cudaMemcpyHostToDevice) != cudaSuccess )
        failed_step = "copy of input image to device";
    else if ( gpujpeg_preprocessor_encode(&coder) != 0 )
        failed_step = "encoder preprocessor";
    else if ( cudaMallocHost((void**)&buffer, coder.data_size * sizeof(uint8_t)) != cudaSuccess || buffer == NULL )
        failed_step = "intermediate buffer allocation";
    else if ( cudaMemcpy(buffer, coder.d_data, coder.data_size * sizeof(uint8_t), cudaMemcpyDeviceToHost) != cudaSuccess )
        failed_step = "copy of preprocessed data to host";
    else
        buffer_size = coder.data_size;
    // Deinitialize coder, the input image is not needed anymore
    gpujpeg_coder_deinit(&coder);
    gpujpeg_image_destroy(image);

    if ( failed_step != NULL ) {
        fprintf(stderr, "[GPUJPEG] [Error] Failed to convert image [%s]: %s failed!\n", input, failed_step);
        if ( buffer != NULL )
            cudaFreeHost(buffer);
        return;
    }

    // Initialize coder and postprocessor from a clean state again
    memset(&coder, 0, sizeof(coder));
    gpujpeg_set_default_parameters(&coder.param);
    coder.param.color_space_internal = GPUJPEG_RGB;
    coder.param_image = param_image_to;
    if ( gpujpeg_coder_init(&coder) != 0 )
        failed_step = "coder initialization";
    else if ( coder.data_size != buffer_size )
        // The intermediate data must fit the buffers of the second coder exactly
        failed_step = "intermediate data size check";
    else if ( gpujpeg_preprocessor_decoder_init(&coder) != 0 )
        failed_step = "decoder preprocessor initialization";
    else if ( cudaMemcpy(coder.d_data, buffer, coder.data_size * sizeof(uint8_t), cudaMemcpyHostToDevice) != cudaSuccess )
        failed_step = "copy of preprocessed data to device";
    else if ( gpujpeg_preprocessor_decode(&coder) != 0 )
        failed_step = "decoder preprocessor";
    else if ( cudaMemcpy(coder.data_raw, coder.d_data_raw, coder.data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToHost) != cudaSuccess )
        failed_step = "copy of output image to host";

    if ( failed_step != NULL ) {
        fprintf(stderr, "[GPUJPEG] [Error] Failed to convert image [%s]: %s failed!\n", input, failed_step);
    } else if ( gpujpeg_image_save_to_file(output, coder.data_raw, coder.data_raw_size) != 0 ) {
        fprintf(stderr, "[GPUJPEG] [Error] Failed to save image [%s]!\n", output);
    }

    // Deinitialize coder and release intermediate buffer in every case
    gpujpeg_coder_deinit(&coder);
    cudaFreeHost(buffer);
}

/**
 * Documented at declaration.
 *
 * This implementation does no initialization: it unconditionally terminates
 * the process through abort() and therefore never returns a value.
 */
int
gpujpeg_opengl_init()
{
    // Terminates the process, nothing is executed after this call
    abort();
}

/**
 * Create an OpenGL RGB texture and optionally fill it with data.
 *
 * Documented at declaration.
 *
 * The texture uses linear filtering and clamps to edge. Pixel rows of data
 * are read as tightly packed (width * 3 bytes per row, no padding), which is
 * the layout whose size gpujpeg_opengl_texture_get_data() reports. Without
 * GPUJPEG_USE_OPENGL only GPUJPEG_EXIT_MISSING_OPENGL() is invoked.
 *
 * @param width   Texture width in pixels
 * @param height  Texture height in pixels
 * @param data    RGB data with 8 bits per channel, passed to glTexImage2D()
 * @return Id of the created texture
 */
int
gpujpeg_opengl_texture_create(int width, int height, uint8_t* data)
{
    int texture_id = 0;

#ifdef GPUJPEG_USE_OPENGL
    // glGenTextures() writes GLuint, so do not hand it a pointer to int
    GLuint texture = 0;
    glGenTextures(1, &texture);
    glBindTexture(GL_TEXTURE_2D, texture);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);

    // The default unpack alignment is 4 bytes, which does not match packed
    // RGB rows unless width * 3 is a multiple of 4
    GLint unpack_alignment = 4;
    glGetIntegerv(GL_UNPACK_ALIGNMENT, &unpack_alignment);
    glPixelStorei(GL_UNPACK_ALIGNMENT, 1);

    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, width, height, 0, GL_RGB, GL_UNSIGNED_BYTE, data);

    // Restore the state found on entry
    glPixelStorei(GL_UNPACK_ALIGNMENT, unpack_alignment);

    glBindTexture(GL_TEXTURE_2D, 0);

    texture_id = (int)texture;
#else
    GPUJPEG_EXIT_MISSING_OPENGL();
#endif

    return texture_id;
}

/**
 * Documented at declaration.
 *
 * Re-specifies level 0 of the given texture from `data`, using the width and
 * height queried from the texture itself. The texture binding is reset to 0
 * before returning.
 *
 * @param texture_id  Identifier of an existing GL_TEXTURE_2D texture
 * @param data        Source pixels handed to glTexImage2D (read as tightly
 *                    packed RGB rows, i.e. width * 3 bytes per row)
 * @return 0 always
 */
int
gpujpeg_opengl_texture_set_data(int texture_id, uint8_t* data)
{
#ifdef GPUJPEG_USE_OPENGL
    glBindTexture(GL_TEXTURE_2D, texture_id);

    // The texture's current dimensions define the size of the upload
    int width = 0;
    int height = 0;
    glGetTexLevelParameteriv(GL_TEXTURE_2D, 0, GL_TEXTURE_WIDTH, &width);
    glGetTexLevelParameteriv(GL_TEXTURE_2D, 0, GL_TEXTURE_HEIGHT, &height);
    assert(width != 0 && height != 0);

    // The default unpack alignment (4) assumes rows padded to 4 bytes, which
    // misreads tightly packed RGB data whenever width * 3 is not a multiple
    // of 4. Use byte alignment for the upload, then restore the prior value.
    GLint saved_unpack_alignment = 4;
    glGetIntegerv(GL_UNPACK_ALIGNMENT, &saved_unpack_alignment);
    glPixelStorei(GL_UNPACK_ALIGNMENT, 1);

    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, width, height, 0, GL_RGB, GL_UNSIGNED_BYTE, data);

    glPixelStorei(GL_UNPACK_ALIGNMENT, saved_unpack_alignment);

    glBindTexture(GL_TEXTURE_2D, 0);
#else
    GPUJPEG_EXIT_MISSING_OPENGL();
#endif
    return 0;
}

/**
 * Documented at declaration.
 *
 * Reads level 0 of the given texture back into `data` as GL_RGB /
 * GL_UNSIGNED_BYTE. The texture binding is reset to 0 before returning.
 *
 * @param texture_id  Identifier of an existing GL_TEXTURE_2D texture
 * @param data        Destination buffer; receives width * height * 3 bytes
 * @param data_size   Optional (may be NULL); set to width * height * 3
 * @return 0 always
 */
int
gpujpeg_opengl_texture_get_data(int texture_id, uint8_t* data, int* data_size)
{
#ifdef GPUJPEG_USE_OPENGL
    glBindTexture(GL_TEXTURE_2D, texture_id);

    int width = 0;
    int height = 0;
    glGetTexLevelParameteriv(GL_TEXTURE_2D, 0, GL_TEXTURE_WIDTH, &width);
    glGetTexLevelParameteriv(GL_TEXTURE_2D, 0, GL_TEXTURE_HEIGHT, &height);
    assert(width != 0 && height != 0);

    // The reported size is width * height * 3, i.e. rows without padding.
    // With the default pack alignment (4) OpenGL pads every row to a multiple
    // of 4 bytes and would write past that size whenever width * 3 is not a
    // multiple of 4. Use byte alignment for the read-back, then restore.
    GLint saved_pack_alignment = 4;
    glGetIntegerv(GL_PACK_ALIGNMENT, &saved_pack_alignment);
    glPixelStorei(GL_PACK_ALIGNMENT, 1);

    glGetTexImage(GL_TEXTURE_2D, 0, GL_RGB, GL_UNSIGNED_BYTE, data);

    glPixelStorei(GL_PACK_ALIGNMENT, saved_pack_alignment);

    if ( data_size != NULL )
        *data_size = width * height * 3;

    glBindTexture(GL_TEXTURE_2D, 0);
#else
    GPUJPEG_EXIT_MISSING_OPENGL();
#endif
    return 0;
}

/**
 * Documented at declaration.
 *
 * Deletes the OpenGL texture object identified by `texture_id`.
 *
 * @param texture_id  Identifier of the texture to delete
 */
void
gpujpeg_opengl_texture_destroy(int texture_id)
{
#ifdef GPUJPEG_USE_OPENGL
    // glDeleteTextures() takes a pointer to GLuint; convert the identifier
    // into a correctly typed local instead of passing a pointer to int.
    GLuint gl_texture_id = (GLuint)texture_id;
    glDeleteTextures(1, &gl_texture_id);
#else
    GPUJPEG_EXIT_MISSING_OPENGL();
#endif
}

/**
 * Documented at declaration.
 *
 * Allocates a texture descriptor with cudaMallocHost(), queries the texture
 * dimensions, creates a pixel buffer object (PBO) of
 * width * height * 3 bytes and registers that PBO with CUDA.
 *
 * @param texture_id    Identifier of an existing GL_TEXTURE_2D texture
 * @param texture_type  GPUJPEG_OPENGL_TEXTURE_READ selects a pixel-pack PBO,
 *                      GPUJPEG_OPENGL_TEXTURE_WRITE a pixel-unpack PBO
 * @return descriptor to be released with gpujpeg_opengl_texture_unregister(),
 *         or NULL when the descriptor cannot be allocated or (in builds
 *         where assert() is compiled out) the texture type is not recognized
 */
struct gpujpeg_opengl_texture*
gpujpeg_opengl_texture_register(int texture_id, enum gpujpeg_opengl_texture_type texture_type)
{
    struct gpujpeg_opengl_texture* texture = NULL;
    cudaError_t alloc_status = cudaMallocHost((void**)&texture, sizeof(struct gpujpeg_opengl_texture));
    assert(alloc_status == cudaSuccess && texture != NULL);
    // The assert above disappears under NDEBUG; never dereference a failed
    // allocation in that configuration.
    if ( alloc_status != cudaSuccess || texture == NULL ) {
        fprintf(stderr, "[GPUJPEG] [Error] Failed to allocate OpenGL texture descriptor!\n");
        return NULL;
    }

    texture->texture_id = texture_id;
    texture->texture_type = texture_type;
    texture->texture_width = 0;
    texture->texture_height = 0;
    texture->texture_pbo_id = 0;
    texture->texture_pbo_type = 0;
    texture->texture_pbo_resource = 0;
    texture->texture_callback_param = NULL;
    texture->texture_callback_attach_opengl = NULL;
    texture->texture_callback_detach_opengl = NULL;

#ifdef GPUJPEG_USE_OPENGL
    // Query texture dimensions
    glBindTexture(GL_TEXTURE_2D, texture->texture_id);
    glGetTexLevelParameteriv(GL_TEXTURE_2D, 0, GL_TEXTURE_WIDTH, &texture->texture_width);
    glGetTexLevelParameteriv(GL_TEXTURE_2D, 0, GL_TEXTURE_HEIGHT, &texture->texture_height);
    glBindTexture(GL_TEXTURE_2D, 0);
    assert(texture->texture_width != 0 && texture->texture_height != 0);

    // Select PBO type
    if ( texture->texture_type == GPUJPEG_OPENGL_TEXTURE_READ ) {
        texture->texture_pbo_type = GL_PIXEL_PACK_BUFFER;
    } else if ( texture->texture_type == GPUJPEG_OPENGL_TEXTURE_WRITE ) {
        texture->texture_pbo_type = GL_PIXEL_UNPACK_BUFFER;
    } else {
        assert(0);
        // Reached only when assert() is compiled out: do not go on to create
        // a buffer with an unset (0) target, and do not leak the descriptor.
        fprintf(stderr, "[GPUJPEG] [Error] Unknown OpenGL texture type!\n");
        cudaFreeHost(texture);
        return NULL;
    }

    // Create PBO; size is computed in size_t so the product of the
    // dimensions is not evaluated in (overflow-prone) int arithmetic
    size_t pbo_size = (size_t)texture->texture_width * (size_t)texture->texture_height * 3 * sizeof(uint8_t);
    glGenBuffers(1, &texture->texture_pbo_id);
    glBindBuffer(texture->texture_pbo_type, texture->texture_pbo_id);
    glBufferData(texture->texture_pbo_type, pbo_size, NULL, GL_DYNAMIC_DRAW);
    glBindBuffer(texture->texture_pbo_type, 0);

    // Create CUDA PBO Resource
    cudaGraphicsGLRegisterBuffer(&texture->texture_pbo_resource, texture->texture_pbo_id, cudaGraphicsMapFlagsNone);
    gpujpeg_cuda_check_error("Register OpenGL buffer");
#else
    GPUJPEG_EXIT_MISSING_OPENGL();
#endif

    return texture;
}

/**
 * Documented at declaration.
 *
 * Releases the CUDA registration and the pixel buffer object held by the
 * descriptor (when present) and frees the descriptor with cudaFreeHost().
 *
 * @param texture  Descriptor returned by gpujpeg_opengl_texture_register();
 *                 must not be NULL (asserted; ignored when assert() is
 *                 compiled out)
 */
void
gpujpeg_opengl_texture_unregister(struct gpujpeg_opengl_texture* texture)
{
    // Validate before the first dereference below, not after it
    assert(texture != NULL);
    if ( texture == NULL )
        return;

#ifdef GPUJPEG_USE_OPENGL
    // Drop the CUDA registration first, while the OpenGL buffer it refers to
    // still exists, and only then delete the buffer itself.
    if ( texture->texture_pbo_resource != NULL ) {
        cudaGraphicsUnregisterResource(texture->texture_pbo_resource);
    }
    if ( texture->texture_pbo_id != 0 ) {
        glDeleteBuffers(1, &texture->texture_pbo_id);
    }
#else
    GPUJPEG_EXIT_MISSING_OPENGL();
#endif

    cudaFreeHost(texture);
}

/**
 * Documented at declaration.
 *
 * Invokes the attach callback (when set), copies the texture into its pixel
 * buffer object for GPUJPEG_OPENGL_TEXTURE_READ textures, then maps the PBO
 * into CUDA and returns the mapped device pointer. The mapping stays active
 * until gpujpeg_opengl_texture_unmap() is called.
 *
 * @param texture    Registered texture descriptor; attach and detach
 *                   callbacks must be either both set or both NULL
 * @param data_size  Optional (may be NULL); receives the mapped size in bytes
 * @return device pointer to the mapped PBO data
 */
uint8_t*
gpujpeg_opengl_texture_map(struct gpujpeg_opengl_texture* texture, int* data_size)
{
    assert(texture->texture_pbo_resource != NULL);
    assert((texture->texture_callback_attach_opengl == NULL && texture->texture_callback_detach_opengl == NULL) ||
           (texture->texture_callback_attach_opengl != NULL && texture->texture_callback_detach_opengl != NULL));

    // Attach OpenGL context by callback
    if ( texture->texture_callback_attach_opengl != NULL )
        texture->texture_callback_attach_opengl(texture->texture_callback_param);

    uint8_t* d_data = NULL;

#ifdef GPUJPEG_USE_OPENGL
    if ( texture->texture_type == GPUJPEG_OPENGL_TEXTURE_READ ) {
        assert(texture->texture_pbo_type == GL_PIXEL_PACK_BUFFER);

        glBindTexture(GL_TEXTURE_2D, texture->texture_id);
        glBindBuffer(GL_PIXEL_PACK_BUFFER, texture->texture_pbo_id);

        // The PBO is allocated as width * height * 3 bytes (unpadded rows).
        // The default pack alignment (4) pads rows to 4 bytes and would need
        // more space than that whenever width * 3 is not a multiple of 4, so
        // pack with byte alignment and restore the previous setting after.
        GLint saved_pack_alignment = 4;
        glGetIntegerv(GL_PACK_ALIGNMENT, &saved_pack_alignment);
        glPixelStorei(GL_PACK_ALIGNMENT, 1);

        // With a pack buffer bound, the last argument is an offset into it
        glGetTexImage(GL_TEXTURE_2D, 0, GL_RGB, GL_UNSIGNED_BYTE, 0);

        glPixelStorei(GL_PACK_ALIGNMENT, saved_pack_alignment);

        glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
        glBindTexture(GL_TEXTURE_2D, 0);
    }
#else
    GPUJPEG_EXIT_MISSING_OPENGL();
#endif

    // Map pixel buffer object to cuda
    cudaGraphicsMapResources(1, &texture->texture_pbo_resource, 0);
    gpujpeg_cuda_check_error("Encoder map texture PBO resource");

    // Get device data pointer to pixel buffer object data; the size starts
    // at 0 so a failed query never reports an indeterminate value
    size_t d_data_size = 0;
    cudaGraphicsResourceGetMappedPointer((void **)&d_data, &d_data_size, texture->texture_pbo_resource);
    gpujpeg_cuda_check_error("Encoder get device pointer for texture PBO resource");
    if ( data_size != NULL )
        *data_size = (int)d_data_size;

    return d_data;
}

/**
 * Documented at declaration.
 *
 * Unmaps the pixel buffer object from CUDA; for GPUJPEG_OPENGL_TEXTURE_WRITE
 * textures the PBO content is then uploaded into the texture and glFinish()
 * is issued. Finally the detach callback is invoked (when set).
 *
 * @param texture  Registered texture descriptor previously mapped with
 *                 gpujpeg_opengl_texture_map()
 */
void
gpujpeg_opengl_texture_unmap(struct gpujpeg_opengl_texture* texture)
{
    // Unmap pbo
    cudaGraphicsUnmapResources(1, &texture->texture_pbo_resource, 0);
    gpujpeg_cuda_check_error("Encoder unmap texture PBO resource");

#ifdef GPUJPEG_USE_OPENGL
    if ( texture->texture_type == GPUJPEG_OPENGL_TEXTURE_WRITE ) {
        assert(texture->texture_pbo_type == GL_PIXEL_UNPACK_BUFFER);

        glBindTexture(GL_TEXTURE_2D, texture->texture_id);
        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture->texture_pbo_id);

        // The PBO holds width * height * 3 bytes (unpadded rows). The default
        // unpack alignment (4) assumes rows padded to 4 bytes and would read
        // beyond that whenever width * 3 is not a multiple of 4, so unpack
        // with byte alignment and restore the previous setting afterwards.
        GLint saved_unpack_alignment = 4;
        glGetIntegerv(GL_UNPACK_ALIGNMENT, &saved_unpack_alignment);
        glPixelStorei(GL_UNPACK_ALIGNMENT, 1);

        // With an unpack buffer bound, NULL is offset 0 into that buffer
        glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, texture->texture_width, texture->texture_height, 0, GL_RGB, GL_UNSIGNED_BYTE, NULL);

        glPixelStorei(GL_UNPACK_ALIGNMENT, saved_unpack_alignment);

        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
        glBindTexture(GL_TEXTURE_2D, 0);
        glFinish();
    }
#else
    GPUJPEG_EXIT_MISSING_OPENGL();
#endif

    // Detach OpenGL context by callback
    if ( texture->texture_callback_detach_opengl != NULL )
        texture->texture_callback_detach_opengl(texture->texture_callback_param);
}