// UltraGrid/libgpujpeg/gpujpeg_table.c

/**
 * Copyright (c) 2011, CESNET z.s.p.o
 * Copyright (c) 2011, Silicon Genome, LLC.
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpujpeg_table.h"
#include "gpujpeg_util.h"
#include <npp.h>

/**
 * Default quantization table for the Y (luminance) component.
 *
 * Holds 64 coefficients (one 8x8 block) stored in zig-zag order. It is copied
 * verbatim by gpujpeg_table_quantization_set_default() and scaled by the
 * requested quality afterwards.
 */
static uint8_t gpujpeg_table_default_quantization_luminance[] = {
  16,  11,  12,  14,  12,  10,  16,  14,
  13,  14,  18,  17,  16,  19,  24,  40,
  26,  24,  22,  22,  24,  49,  35,  37,
  29,  40,  58,  51,  61,  60,  57,  51,
  56,  55,  64,  72,  92,  78,  64,  68,
  87,  69,  55,  56,  80, 109,  81,  87,
  95,  98, 103, 104, 103,  62,  77, 113,
 121, 112, 100, 120,  92, 101, 103,  99
};
/**
 * Default quantization table for the Cb and Cr (chrominance) components.
 *
 * Holds 64 coefficients (one 8x8 block) stored in zig-zag order. It is copied
 * verbatim by gpujpeg_table_quantization_set_default() and scaled by the
 * requested quality afterwards.
 */
static uint8_t gpujpeg_table_default_quantization_chrominance[] = {
  17,  18,  18,  24,  21,  24,  47,  26,
  26,  47,  99,  66,  56,  66,  99,  99,
  99,  99,  99,  99,  99,  99,  99,  99,
  99,  99,  99,  99,  99,  99,  99,  99,
  99,  99,  99,  99,  99,  99,  99,  99,
  99,  99,  99,  99,  99,  99,  99,  99,
  99,  99,  99,  99,  99,  99,  99,  99,
  99,  99,  99,  99,  99,  99,  99,  99
};

/**
 * Set default quantization table
 *
 * Copies the 64-entry default table (zig-zag order) that belongs to the given
 * component type into the caller's buffer. For an unknown component type the
 * function asserts in debug builds and leaves the buffer untouched otherwise.
 *
 * @param table_raw  Table buffer (must have room for 64 bytes)
 * @param type  Quantization table type
 */
void
gpujpeg_table_quantization_set_default(uint8_t* table_raw, enum gpujpeg_component_type type)
{
    const uint8_t* table_default = NULL;
    if ( type == GPUJPEG_COMPONENT_LUMINANCE )
        table_default = gpujpeg_table_default_quantization_luminance;
    else if ( type == GPUJPEG_COMPONENT_CHROMINANCE )
        table_default = gpujpeg_table_default_quantization_chrominance;
    else
        assert(0);

    // When assertions are compiled out (NDEBUG) an unknown type would
    // otherwise reach memcpy with a NULL source, which is undefined behaviour.
    if ( table_default == NULL )
        return;

    memcpy(table_raw, table_default, 64 * sizeof(uint8_t));
}

/**
 * Documented at declaration
 *
 * Builds the forward (encoder) quantization table: loads the default raw table
 * for the component type, scales it by quality, derives the forward table and
 * copies it to table->d_table.
 *
 * @return 0 on success, -1 if an NPP call or the host-to-device copy fails
 */
int
gpujpeg_table_quantization_encoder_init(struct gpujpeg_table_quantization* table, enum gpujpeg_component_type type, int quality)
{
    // Load raw table in zig-zag order
    gpujpeg_table_quantization_set_default(table->table_raw, type);

    // Update raw table by quality (NPP reports errors as negative status codes)
    if ( nppiQuantFwdRawTableInit_JPEG_8u(table->table_raw, quality) < 0 )
        return -1;

    // Fix NPP bug before version 4.1 [http://forums.nvidia.com/index.php?showtopic=191896]
    const NppLibraryVersion* npp_version = nppGetLibVersion();
    if ( npp_version->major < 4 || (npp_version->major == 4 && npp_version->minor == 0) ) {
        // Compute the forward table by hand: rounded (2^15 / q), stored in natural order
        for ( int i = 0; i < 64; i++ ) {
            table->table[gpujpeg_order_natural[i]] = ((1 << 15) / (double)table->table_raw[i]) + 0.5;
        }
    } else {
        // Load forward table from raw table
        if ( nppiQuantFwdTableInit_JPEG_8u16u(table->table_raw, table->table) < 0 )
            return -1;
    }

    // Copy tables to device memory
    if ( cudaSuccess != cudaMemcpy(table->d_table, table->table, 64 * sizeof(uint16_t), cudaMemcpyHostToDevice) )
        return -1;

    return 0;
}

/**
 * Documented at declaration
 *
 * Builds the inverse (decoder) quantization table: loads the default raw table
 * for the component type, scales it by quality, derives the inverse table and
 * copies it to table->d_table.
 *
 * @return 0 on success, -1 if an NPP call or the host-to-device copy fails
 */
int
gpujpeg_table_quantization_decoder_init(struct gpujpeg_table_quantization* table, enum gpujpeg_component_type type, int quality)
{
    // Load raw table in zig-zag order
    gpujpeg_table_quantization_set_default(table->table_raw, type);

    // Update raw table by quality (NPP reports errors as negative status codes)
    if ( nppiQuantFwdRawTableInit_JPEG_8u(table->table_raw, quality) < 0 )
        return -1;

    // Fix NPP bug before version 4.1 [http://forums.nvidia.com/index.php?showtopic=191896]
    const NppLibraryVersion* npp_version = nppGetLibVersion();
    if ( npp_version->major < 4 || (npp_version->major == 4 && npp_version->minor == 0) ) {
        // Inverse table is the raw table reordered from zig-zag to natural order
        for ( int i = 0; i < 64; i++ ) {
            table->table[gpujpeg_order_natural[i]] = table->table_raw[i];
        }
    } else {
        // Load inverse table from raw table
        if ( nppiQuantInvTableInit_JPEG_8u16u(table->table_raw, table->table) < 0 )
            return -1;
    }

    // Copy tables to device memory
    if ( cudaSuccess != cudaMemcpy(table->d_table, table->table, 64 * sizeof(uint16_t), cudaMemcpyHostToDevice) )
        return -1;

    return 0;
}

/**
 * Compute the inverse (decoder) quantization table from the raw table that is
 * already stored in table->table_raw and copy the result to table->d_table.
 *
 * @param table  Table structure with table_raw filled in (zig-zag order)
 * @return 0 on success, -1 if an NPP call or the host-to-device copy fails
 */
int
gpujpeg_table_quantization_decoder_compute(struct gpujpeg_table_quantization* table)
{
    // Fix NPP bug before version 4.1 [http://forums.nvidia.com/index.php?showtopic=191896]
    const NppLibraryVersion* npp_version = nppGetLibVersion();
    if ( npp_version->major < 4 || (npp_version->major == 4 && npp_version->minor == 0) ) {
        // Inverse table is the raw table reordered from zig-zag to natural order
        for ( int i = 0; i < 64; i++ ) {
            table->table[gpujpeg_order_natural[i]] = table->table_raw[i];
        }
    } else {
        // Load inverse table from raw table (NPP reports errors as negative status codes)
        if ( nppiQuantInvTableInit_JPEG_8u16u(table->table_raw, table->table) < 0 )
            return -1;
    }

    // Copy tables to device memory
    if ( cudaSuccess != cudaMemcpy(table->d_table, table->table, 64 * sizeof(uint16_t), cudaMemcpyHostToDevice) )
        return -1;

    return 0;
}

/**
 * Documented at declaration
 *
 * Dumps the raw table and the derived forward/inverse table to stdout as two
 * 8x8 grids, one row of eight values per line.
 */
void
gpujpeg_table_quantization_print(struct gpujpeg_table_quantization* table)
{
    puts("Raw Table (with quality):");
    for ( int index = 0; index < 64; index++ ) {
        printf("%4u", table->table_raw[index]);
        // Terminate the line after every eighth value
        if ( index % 8 == 7 )
            puts("");
    }

    puts("Forward/Inverse Table:");
    for ( int index = 0; index < 64; index++ ) {
        printf("%6u", table->table[index]);
        // Terminate the line after every eighth value
        if ( index % 8 == 7 )
            puts("");
    }
}

/**
 * Huffman Table DC for Y component
 *
 * bits[l] is the number of codes of length l for l = 1..16; bits[0] is unused.
 * The counts below add up to 12, matching the 12 symbols in the value array.
 */
static unsigned char gpujpeg_table_huffman_y_dc_bits[17] = {
    0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0
};
/** Symbols of the Y DC table, listed in order of increasing code length */
static unsigned char gpujpeg_table_huffman_y_dc_value[] = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
};
/**
 * Huffman Table DC for Cb or Cr component
 *
 * bits[l] is the number of codes of length l for l = 1..16; bits[0] is unused.
 * The counts below add up to 12, matching the 12 symbols in the value array.
 */
static unsigned char gpujpeg_table_huffman_cbcr_dc_bits[17] = {
    0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
};
/** Symbols of the Cb/Cr DC table, listed in order of increasing code length */
static unsigned char gpujpeg_table_huffman_cbcr_dc_value[] = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
};
/**
 * Huffman Table AC for Y component
 *
 * bits[l] is the number of codes of length l for l = 1..16; bits[0] is unused.
 * The counts below add up to 162, matching the 162 symbols in the value array.
 */
static unsigned char gpujpeg_table_huffman_y_ac_bits[17] = {
    0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0x7d
};
/** Symbols of the Y AC table, listed in order of increasing code length */
static unsigned char gpujpeg_table_huffman_y_ac_value[] = {
    0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
    0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
    0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
    0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
    0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
    0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
    0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
    0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
    0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
    0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
    0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
    0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
    0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
    0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
    0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
    0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
    0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
    0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
    0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
    0xf9, 0xfa
};
/**
 * Huffman Table AC for Cb or Cr component
 *
 * bits[l] is the number of codes of length l for l = 1..16; bits[0] is unused.
 * The counts below add up to 162, matching the 162 symbols in the value array.
 */
static unsigned char gpujpeg_table_huffman_cbcr_ac_bits[17] = {
    0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0x77
};
/** Symbols of the Cb/Cr AC table, listed in order of increasing code length */
static unsigned char gpujpeg_table_huffman_cbcr_ac_value[] = {
    0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
    0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
    0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
    0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
    0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
    0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
    0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
    0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
    0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
    0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
    0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
    0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
    0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
    0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
    0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
    0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
    0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
    0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
    0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
    0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
    0xf9, 0xfa
};

/**
 * Derive the per-symbol encoder lookup (code word and code length) from the
 * bits and huffval arrays that are already stored in the table.
 *
 * @param table  Table structure; bits and huffval are read, code and size are written
 * @return void
 */
void
gpujpeg_table_huffman_encoder_compute(struct gpujpeg_table_huffman_encoder* table)
{
    char code_length[257];
    unsigned int code_word[257];

    // Figure C.1: list the code length of every symbol, shortest codes first
    int count = 0;
    for ( int length = 1; length <= 16; length++ ) {
        int remaining = (int) table->bits[length];
        while ( remaining-- > 0 )
            code_length[count++] = (char) length;
    }
    // Zero length terminates the list
    code_length[count] = 0;

    // Figure C.2: assign consecutive code words within each length and
    // append a zero bit whenever the length grows by one
    unsigned int next_code = 0;
    int current_length = code_length[0];
    int index = 0;
    while ( code_length[index] != 0 ) {
        for ( ; ((int) code_length[index]) == current_length; index++ )
            code_word[index] = next_code++;
        next_code <<= 1;
        current_length++;
    }

    // Figure C.3: re-index codes and lengths by symbol value.
    // Symbols without a code keep length 0, which allows EmitBits to detect
    // any attempt to emit such symbols.
    memset(table->size, 0, sizeof(table->size));
    for ( index = 0; index < count; index++ ) {
        table->code[table->huffval[index]] = code_word[index];
        table->size[table->huffval[index]] = code_length[index];
    }
}

/** Documented at declaration */
int
gpujpeg_table_huffman_encoder_init(struct gpujpeg_table_huffman_encoder* table, struct gpujpeg_table_huffman_encoder* d_table, enum gpujpeg_component_type comp_type, enum gpujpeg_huffman_type huff_type)
{
    assert(comp_type == GPUJPEG_COMPONENT_LUMINANCE || comp_type == GPUJPEG_COMPONENT_CHROMINANCE);
    assert(huff_type == GPUJPEG_HUFFMAN_DC || huff_type == GPUJPEG_HUFFMAN_AC);
    if ( comp_type == GPUJPEG_COMPONENT_LUMINANCE ) {
        if ( huff_type == GPUJPEG_HUFFMAN_DC ) {
            memcpy(table->bits, gpujpeg_table_huffman_y_dc_bits, sizeof(table->bits));
            memcpy(table->huffval, gpujpeg_table_huffman_y_dc_value, sizeof(table->huffval));
        } else {
            memcpy(table->bits, gpujpeg_table_huffman_y_ac_bits, sizeof(table->bits));
            memcpy(table->huffval, gpujpeg_table_huffman_y_ac_value, sizeof(table->huffval));
        }
    } else if ( comp_type == GPUJPEG_COMPONENT_CHROMINANCE ) {
        if ( huff_type == GPUJPEG_HUFFMAN_DC ) {
            memcpy(table->bits, gpujpeg_table_huffman_cbcr_dc_bits, sizeof(table->bits));
            memcpy(table->huffval, gpujpeg_table_huffman_cbcr_dc_value, sizeof(table->huffval));
        } else {
            memcpy(table->bits, gpujpeg_table_huffman_cbcr_ac_bits, sizeof(table->bits));
            memcpy(table->huffval, gpujpeg_table_huffman_cbcr_ac_value, sizeof(table->huffval));
        }
    }
    gpujpeg_table_huffman_encoder_compute(table);

#ifndef GPUJPEG_HUFFMAN_CODER_TABLES_IN_CONSTANT
    // Copy table to device memory
    if ( cudaSuccess != cudaMemcpy(d_table, table, sizeof(struct gpujpeg_table_huffman_encoder), cudaMemcpyHostToDevice) )
        return -1;
#endif

    return 0;
}

/** Documented at declaration */
int
gpujpeg_table_huffman_decoder_init(struct gpujpeg_table_huffman_decoder* table, struct gpujpeg_table_huffman_decoder* d_table, enum gpujpeg_component_type comp_type, enum gpujpeg_huffman_type huff_type)
{
    assert(comp_type == GPUJPEG_COMPONENT_LUMINANCE || comp_type == GPUJPEG_COMPONENT_CHROMINANCE);
    assert(huff_type == GPUJPEG_HUFFMAN_DC || huff_type == GPUJPEG_HUFFMAN_AC);
    if ( comp_type == GPUJPEG_COMPONENT_LUMINANCE ) {
        if ( huff_type == GPUJPEG_HUFFMAN_DC ) {
            memcpy(table->bits, gpujpeg_table_huffman_y_dc_bits, sizeof(table->bits));
            memcpy(table->huffval, gpujpeg_table_huffman_y_dc_value, sizeof(table->huffval));
        } else {
            memcpy(table->bits, gpujpeg_table_huffman_y_ac_bits, sizeof(table->bits));
            memcpy(table->huffval, gpujpeg_table_huffman_y_ac_value, sizeof(table->huffval));
        }
    } else if ( comp_type == GPUJPEG_COMPONENT_CHROMINANCE ) {
        if ( huff_type == GPUJPEG_HUFFMAN_DC ) {
            memcpy(table->bits, gpujpeg_table_huffman_cbcr_dc_bits, sizeof(table->bits));
            memcpy(table->huffval, gpujpeg_table_huffman_cbcr_dc_value, sizeof(table->huffval));
        } else {
            memcpy(table->bits, gpujpeg_table_huffman_cbcr_ac_bits, sizeof(table->bits));
            memcpy(table->huffval, gpujpeg_table_huffman_cbcr_ac_value, sizeof(table->huffval));
        }
    }
    gpujpeg_table_huffman_decoder_compute(table, d_table);

    return 0;
}

/**
 * Documented at declaration
 *
 * Derives the decoding tables (valptr, mincode, maxcode and the 8-bit
 * lookahead tables) from the bits and huffval arrays already stored in the
 * table and, unless GPUJPEG_HUFFMAN_CODER_TABLES_IN_CONSTANT is defined,
 * copies the finished table to d_table.
 *
 * The bits array is validated first. If it declares more than 256 codes, or
 * codes that do not fit into their length, a message is written to stderr and
 * the function returns with the table and d_table left unchanged.
 */
void
gpujpeg_table_huffman_decoder_compute(struct gpujpeg_table_huffman_decoder* table, struct gpujpeg_table_huffman_decoder* d_table)
{
    // The work buffers below hold at most 256 codes plus a terminator, so a
    // table that declares more codes must be rejected before they are filled
    int code_count = 0;
    for ( int l = 1; l <= 16; l++ )
        code_count += (int) table->bits[l];
    if ( code_count > 256 ) {
        fprintf(stderr, "Invalid Huffman table: %d codes declared (maximum is 256)!\n", code_count);
        return;
    }

    // Figure C.1: make table of Huffman code length for each symbol
    // Note that this is in code-length order.
    char huffsize[257];
    int p = 0;
    for ( int l = 1; l <= 16; l++ ) {
        for ( int i = 1; i <= (int) table->bits[l]; i++ )
            huffsize[p++] = (char) l;
    }
    huffsize[p] = 0;

    // Figure C.2: generate the codes themselves
    // Note that this is in code-length order.
    unsigned int huffcode[257];
    unsigned int code = 0;
    int si = huffsize[0];
    p = 0;
    while ( huffsize[p] ) {
        while ( ((int) huffsize[p]) == si ) {
            huffcode[p++] = code;
            code++;
        }
        // code is now one past the last code of length si. If that last code
        // needs more than si bits, the lookahead indices computed below would
        // run past the end of the 256-entry lookahead tables.
        if ( code > (1u << si) ) {
            fprintf(stderr, "Invalid Huffman table: too many codes of length %d!\n", si);
            return;
        }
        code <<= 1;
        si++;
    }

    // Figure F.15: generate decoding tables for bit-sequential decoding
    p = 0;
    for ( int l = 1; l <= 16; l++ ) {
        if ( table->bits[l] ) {
            table->valptr[l] = p; // huffval[] index of 1st symbol of code length l
            table->mincode[l] = huffcode[p]; // minimum code of length l
            p += table->bits[l];
            table->maxcode[l] = huffcode[p-1]; // maximum code of length l
        } else {
            table->maxcode[l] = -1;    // -1 if no codes of this length
        }
    }
    // Ensures gpujpeg_huff_decode terminates
    table->maxcode[17] = 0xFFFFFL;

    // Compute lookahead tables to speed up decoding.
    // First we set all the table entries to 0, indicating "too long";
    // then we iterate through the Huffman codes that are short enough and
    // fill in all the entries that correspond to bit sequences starting
    // with that code.
    memset(table->look_nbits, 0, sizeof(int) * 256);

    int HUFF_LOOKAHEAD = 8;
    p = 0;
    for ( int l = 1; l <= HUFF_LOOKAHEAD; l++ ) {
        for ( int i = 1; i <= (int) table->bits[l]; i++, p++ ) {
            // l = current code's length,
            // p = its index in huffcode[] & huffval[]. Generate left-justified
            // code followed by all possible bit sequences
            int lookbits = huffcode[p] << (HUFF_LOOKAHEAD - l);
            for ( int ctr = 1 << (HUFF_LOOKAHEAD - l); ctr > 0; ctr-- )
            {
                table->look_nbits[lookbits] = l;
                table->look_sym[lookbits] = table->huffval[p];
                lookbits++;
            }
        }
    }

#ifndef GPUJPEG_HUFFMAN_CODER_TABLES_IN_CONSTANT
    // Copy table to device memory
    cudaMemcpy(d_table, table, sizeof(struct gpujpeg_table_huffman_decoder), cudaMemcpyHostToDevice);
#endif
}