UltraGrid/cuda_dxt/cuda_dxt.cu

///
///  @file    cuda_dxt.cu
///  @author  Martin Jirman <jirman@cesnet.cz>
///  @brief   CUDA implementation of DXT compression
///

#include <cuda_runtime_api.h>
#include <stdio.h>
#include "cuda_dxt.h"


typedef unsigned int u32;
typedef u32 UINT;

struct vec3;
struct uvec2;

struct uvec3 {
    u32 r, g, b;
    __device__ uvec3(u32 r, u32 g, u32 b) : r(r), g(g), b(b) {}
    __device__ uvec3(u32 n = 0) : r(n), g(n), b(n) {}
    __device__ uvec3(const vec3 & v);
};

struct vec2 {
    float r, g;
    __device__ vec2(float x, float y) : r(x), g(y) {}
    __device__ vec2(float n = 0.0f) : r(n), g(n) {}
    __device__ vec2(const vec3 & v);
    __device__ vec2(const uvec2 & v);
    __device__ vec2 operator+(const vec2 & o) const {
        return vec2(r + o.r, g + o.g);
    }
    __device__ vec2 operator*(const vec2 & v) const {
        return vec2(r * v.r, g * v.g);
    }
    __device__ vec2 operator*(const float n) const {
        return *this * vec2(n);
    }
    __device__ vec2 operator-(const vec2 & o) const {
        return *this + (o * -1.0f);
    }
    __device__ vec2 operator/(const float n) const {
        return *this * (1.0f / n);
    }
};

struct uvec2 {
    u32 r, g;
    __device__ uvec2(u32 x, u32 y) : r(x), g(y) {}
    __device__ uvec2(u32 n = 0) : r(n), g(n) {}
    __device__ uvec2(const vec2 & v);
};

struct vec3 : public vec2 {
    float b;
    __device__ vec3(float x, float y, float z) : vec2(x, y), b(z) {}
    __device__ vec3(float n = 0.0f) : vec2(n), b(n) {}
    __device__ vec3(const vec2 & v) : vec2(v), b(0.0f) {}
    __device__ vec3(const uvec3 & v) : vec2(v.r, v.g), b(v.b) {}
    __device__ vec3 operator+(const vec3 & o) const {
        return vec3(r + o.r, g + o.g, b + o.b);
    }
    __device__ vec3 operator*(const vec3 & v) const {
        return vec3(r * v.r, g * v.g, b * v.b);
    }
    __device__ vec3 operator*(const float n) const {
        return *this * vec3(n);
    }
    __device__ vec3 operator-(const vec3 & o) const {
        return *this + (o * -1.0f);
    }
    __device__ vec3 operator/(const float n) const {
        return *this * (1.0f / n);
    }
//     __device__ vec2 & yz() {
//         return *this;
//     }
    __device__ vec2 gb() const {
        return vec2(g, b);
    }
};

__device__ uvec3::uvec3(const vec3 & v) : r(v.r), g(v.g), b(v.b) {}
__device__ uvec2::uvec2(const vec2 & v) : r(v.r), g(v.g) {}
__device__ vec2::vec2(const vec3 & v) : r(v.r), g(v.g) {}
__device__ vec2::vec2(const uvec2 & v) : r(v.r), g(v.g) {}


__device__ static vec3 min(const vec3 & a, const vec3 & b) {
    return vec3(min(a.r, b.r), min(a.g, b.g), min(a.b, b.b));
}

__device__ static vec3 max(const vec3 & a, const vec3 & b) {
    return vec3(max(a.r, b.r), max(a.g, b.g), max(a.b, b.b));
}

__device__ static vec2 min(const vec2 & a, const vec2 & b) {
    return vec2(min(a.r, b.r), min(a.g, b.g));
}

__device__ static vec2 max(const vec2 & a, const vec2 & b) {
    return vec2(max(a.r, b.r), max(a.g, b.g));
}

__device__ static float dot(const vec3 & a, const vec3 & b) {
    return a.r * b.r + a.g * b.g + a.b * b.b;
}

__device__ static vec2 clamp(const vec2 & v, float min_val, float max_val) {
    return min(vec2(max_val), max(vec2(min_val), v));
}

__device__ static float clamp(const float & v, float min_val, float max_val) {
    return min(max_val, max(min_val, v));
}

__device__ static vec2 abs(const vec2 & v) {
    return vec2(fabsf(v.r), fabsf(v.g));
}

__device__ static vec3 round(const vec3 & v) {
    return vec3(roundf(v.r), roundf(v.g), roundf(v.b));
}

__device__ static vec3 lerp(const vec3 & a, const vec3 & b, const float q) {
    return a * (1.0f - q) + b * q;
}


///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////


__device__ static const float offset = 128.0 / 255.0;

__device__ static vec3 ConvertRGBToYCoCg(vec3 color)
{
    float Y = (color.r + 2.0 * color.g + color.b) * 0.25;
    float Co = ( ( 2.0 * color.r - 2.0 * color.b      ) * 0.25 + offset );
    float Cg = ( (    -color.r + 2.0 * color.g - color.b) * 0.25 + offset );

    return vec3(Y, Co, Cg);
}

// __device__ static float colorDistance(vec3 c0, vec3 c1)
// {
//     return dot(c0-c1, c0-c1);
// }
__device__ static float colorDistance(vec2 c0, vec2 c1)
{
    return dot(c0-c1, c0-c1);
}

__device__ static void FindMinMaxColorsBox(vec3 block[16], vec3 & mincol, vec3 & maxcol)
{
    mincol = block[0];
    maxcol = block[0];

    for ( int i = 1; i < 16; i++ ) {
        mincol = min(mincol, block[i]);
        maxcol = max(maxcol, block[i]);
    }
}

// __device__ static void InsetBBox(vec3 & mincol, vec3 & maxcol)
// {
//     vec3 inset = (maxcol - mincol) / 16.0 - (8.0 / 255.0) / 16.0;
//     mincol = clamp(mincol + inset, 0.0, 1.0);
//     maxcol = clamp(maxcol - inset, 0.0, 1.0);
// }
__device__ static void InsetYBBox(float & mincol, float & maxcol)
{
    float inset = (maxcol - mincol) / 32.0 - (16.0 / 255.0) / 32.0;
    mincol = clamp(mincol + inset, 0.0, 1.0);
    maxcol = clamp(maxcol - inset, 0.0, 1.0);
}
__device__ static void InsetCoCgBBox(vec2 & mincol, vec2 & maxcol)
{
    vec2 inset = (maxcol - mincol) / 16.0 - (8.0 / 255.0) / 16.0;
    mincol = clamp(mincol + inset, 0.0, 1.0);
    maxcol = clamp(maxcol - inset, 0.0, 1.0);
}

// __device__ static void SelectDiagonal(vec3 block[16], vec3 & mincol, vec3 & maxcol)
// {
//     vec3 center = (mincol + maxcol) * 0.5;
//
//     vec2 cov = vec2(0, 0);
//     for (int i = 0; i < 16; i++) {
//         vec3 t = block[i] - center;
//         cov.r += t.r * t.b;
//         cov.g += t.g * t.b;
//     }
//
//     if (cov.r < 0.0) {
//         float temp = maxcol.r;
//         maxcol.r = mincol.r;
//         mincol.r = temp;
//     }
//     if (cov.g < 0.0) {
//         float temp = maxcol.g;
//         maxcol.g = mincol.g;
//         mincol.g = temp;
//     }
// }

// __device__ static vec3 RoundAndExpand(vec3 v, UINT & w)
// {
//     uvec3 c = uvec3(round(v * vec3(31, 63, 31)));
//     w = (c.r << 11u) | (c.g << 5u) | c.b;
//
//     c.r = (c.r << 3u) | (c.r >> 2u);
//     c.b = (c.b << 3u) | (c.b >> 2u);
//     c.g = (c.g << 2u) | (c.g >> 4u);
//
//     return vec3(c) * (1.0 / 255.0);
// }

// __device__ static UINT EmitEndPointsDXT1(vec3 & mincol, vec3 & maxcol)
// {
//     uvec2 outp;
//     maxcol = RoundAndExpand(maxcol, outp.r);
//     mincol = RoundAndExpand(mincol, outp.g);
//
//     // We have to do this in case we select an alternate diagonal.
//     if (outp.r < outp.g) {
//         vec3 tmp = mincol;
//         mincol = maxcol;
//         maxcol = tmp;
//         return outp.g | (outp.r << 16u);
//     }
//
//     return outp.r | (outp.g << 16u);
// }

__device__ static UINT ScaleYCoCg(vec2 minColor, vec2 maxColor)
{
    vec2 m0 = abs(minColor - offset);
    vec2 m1 = abs(maxColor - offset);

    float m = max(max(m0.r, m0.g), max(m1.r, m1.g));

    const float s0 = 64.0 / 255.0;
    const float s1 = 32.0 / 255.0;

    UINT scale = 1u;
    if ( m < s0 )
        scale = 2u;
    if ( m < s1 )
        scale = 4u;

    return scale;
}

__device__ static bool SelectYCoCgDiagonal(const vec3 block[16], vec2 minColor, vec2 maxColor)
{
    vec2 mid = (maxColor + minColor) * 0.5;

    float cov = 0.0;
    for ( int i = 0; i < 16; i++ ) {
        vec2 t = block[i].gb() - mid;
        cov += t.r * t.g;
    }
    return cov < 0.0;
}

__device__ static UINT EmitEndPointsYCoCgDXT5(float & mincol_r, float & mincol_g,
                                              float & maxcol_r, float & maxcol_g,
                                              UINT scale)
{
    vec2 mincol = vec2(mincol_r, mincol_g);
    vec2 maxcol = vec2(maxcol_r, maxcol_g);

    maxcol = (maxcol - offset) * float(scale) + float(offset);
    mincol = (mincol - offset) * float(scale) + float(offset);

    InsetCoCgBBox(mincol, maxcol);

    maxcol = round(maxcol * vec2(31, 63));
    mincol = round(mincol * vec2(31, 63));

    uvec2 imaxcol = uvec2(maxcol);
    uvec2 imincol = uvec2(mincol);

    uvec2 outp;
    outp.r = (imaxcol.r << 11u) | (imaxcol.g << 5u) | (scale - UINT(1));
    outp.g = (imincol.r << 11u) | (imincol.g << 5u) | (scale - UINT(1));

    imaxcol.r = (imaxcol.r << 3u) | (imaxcol.r >> 2u);
    imaxcol.g = (imaxcol.g << 2u) | (imaxcol.g >> 4u);
    imincol.r = (imincol.r << 3u) | (imincol.r >> 2u);
    imincol.g = (imincol.g << 2u) | (imincol.g >> 4u);

    maxcol = vec2(imaxcol) * (1.0 / 255.0);
    mincol = vec2(imincol) * (1.0 / 255.0);

    // Undo rescale.
    maxcol = (maxcol - offset) / float(scale) + float(offset);
    mincol = (mincol - offset) / float(scale) + float(offset);

    // distribute back
    mincol_r = mincol.r;
    mincol_g = mincol.g;
    maxcol_r = maxcol_r;
    maxcol_g = maxcol.g;

    return outp.r | (outp.g << 16u);
}

__device__ static UINT EmitIndicesYCoCgDXT5(vec3 block[16], vec2 mincol, vec2 maxcol)
{
    // Compute palette
    vec2 c[4];
    c[0] = maxcol;
    c[1] = mincol;
    c[2] = lerp(c[0], c[1], 1.0/3.0);
    c[3] = lerp(c[0], c[1], 2.0/3.0);

    // Compute indices
    UINT indices = 0u;
    for ( int i = 0; i < 16; i++ )
    {
        // find index of closest color
        float4 dist;
        dist.x = colorDistance(block[i].gb(), c[0]);
        dist.y = colorDistance(block[i].gb(), c[1]);
        dist.z = colorDistance(block[i].gb(), c[2]);
        dist.w = colorDistance(block[i].gb(), c[3]);

        uint4 b;
        b.x = dist.x > dist.w ? 1u : 0u;
        b.y = dist.y > dist.z ? 1u : 0u;
        b.z = dist.x > dist.z ? 1u : 0u;
        b.w = dist.y > dist.w ? 1u : 0u;
        UINT b4 = dist.z > dist.w ? 1u : 0u;

        UINT index = (b.x & b4) | (((b.y & b.z) | (b.x & b.w)) << 1u);
        indices |= index << (UINT(i) * 2u);
    }

    // Output indices
    return indices;
}

__device__ static UINT EmitAlphaEndPointsYCoCgDXT5(float mincol, float maxcol)
{
    uvec2 tmp = uvec2(roundf(mincol * 255.0), roundf(maxcol * 255.0));
    UINT c0 = tmp.r;
    UINT c1 = tmp.g;

    return (c0 << 8u) | c1;
}

// Version shown in the YCoCg-DXT article.
__device__ static uvec2 EmitAlphaIndicesYCoCgDXT5(vec3 block[16], float minAlpha, float maxAlpha)
{
    const float ALPHA_RANGE = 7.0;

    float mid = (maxAlpha - minAlpha) / (2.0 * ALPHA_RANGE);

    float ab1 = minAlpha + mid;
    float ab2 = (6.0 * maxAlpha + 1.0 * minAlpha) * (1.0 / ALPHA_RANGE) + mid;
    float ab3 = (5.0 * maxAlpha + 2.0 * minAlpha) * (1.0 / ALPHA_RANGE) + mid;
    float ab4 = (4.0 * maxAlpha + 3.0 * minAlpha) * (1.0 / ALPHA_RANGE) + mid;
    float ab5 = (3.0 * maxAlpha + 4.0 * minAlpha) * (1.0 / ALPHA_RANGE) + mid;
    float ab6 = (2.0 * maxAlpha + 5.0 * minAlpha) * (1.0 / ALPHA_RANGE) + mid;
    float ab7 = (1.0 * maxAlpha + 6.0 * minAlpha) * (1.0 / ALPHA_RANGE) + mid;

    uvec2 indices = uvec2(0, 0);

    UINT index = 1u;
    for ( int i = 0; i < 6; i++ ) {
        float a = block[i].r;
        index = 1u;
        index += (a <= ab1) ? 1u : 0u;
        index += (a <= ab2) ? 1u : 0u;
        index += (a <= ab3) ? 1u : 0u;
        index += (a <= ab4) ? 1u : 0u;
        index += (a <= ab5) ? 1u : 0u;
        index += (a <= ab6) ? 1u : 0u;
        index += (a <= ab7) ? 1u : 0u;
        index &= 7u;
        index ^= (2u > index) ? 1u : 0u;
        indices.r |= index << (3u * UINT(i) + 16u);
    }

    indices.g = index >> 1u;

    for ( int i = 6; i < 16; i++ ) {
        float a = block[i].r;
        index = 1u;
        index += (a <= ab1) ? 1u : 0u;
        index += (a <= ab2) ? 1u : 0u;
        index += (a <= ab3) ? 1u : 0u;
        index += (a <= ab4) ? 1u : 0u;
        index += (a <= ab5) ? 1u : 0u;
        index += (a <= ab6) ? 1u : 0u;
        index += (a <= ab7) ? 1u : 0u;
        index &= 7u;
        index ^= (2u > index) ? 1u : 0u;
        indices.g |= index << (3u * UINT(i) - 16u);
    }

    return indices;
}


///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////


/// Encodes color palette endpoint into 565 code and adjusts input values.
__device__ static u32 encode_endpoint(float & r, float & g, float & b) {
    // clamp to range [0,1] and use full output range for each component
    r = rintf(__saturatef(r) * 31.0f);
    g = rintf(__saturatef(g) * 63.0f);  // 6 bits for green sample
    b = rintf(__saturatef(b) * 31.0f);

    // compose output 16bit code representing the endpoint color
    const u32 code = ((u32)r << 11) + ((u32)g << 5) + (u32)b;

    // convert all 3 endpoint component samples back to unit range
    r *= 0.0322580645161f;  // divide by 31
    g *= 0.015873015873f;   // divide by 63
    b *= 0.0322580645161f;  // divide by 31

    // return output 16bit code for the endpoint
    return code;
}


/// Transform YUV to RGB.
__device__ static void yuv_to_rgb(float & r, float & g, float & b) {
    const float y = 1.1643f * (r - 0.0625f);  // TODO: convert to FFMA
    const float u = g - 0.5f;
    const float v = b - 0.5f;
    r = y + 1.7926f * v;
    g = y - 0.2132f * u - 0.5328f * v;
    b = y + 2.1124f * u;
}


/// Swaps two referenced values.
template <typename T>
__device__ static void swap(T & a, T & b) {
    const T temp = a;
    a = b;
    b = temp;
}


/// Encodes and saves the block.
template <int DXT_TYPE>
__device__ void dxt_encode(void * out, const int block_idx,
                                  float r[16], float g[16], float b[16]);


/// Encodes the block into DXT6 format (DXT5-YcOcG) and saves it into output
/// buffer
template <>
__device__ void dxt_encode<6>(void * out, const int block_idx,
                                     float r[16], float g[16], float b[16]) {
     // Read block of data
    vec3 block[16];

    for(int i = 0; i < 16; i++) {
        block[i] = ConvertRGBToYCoCg(vec3(r[i], g[i], b[i]));
    }

    // Find min and max colors
    vec3 mincol, maxcol;
    FindMinMaxColorsBox(block, mincol, maxcol);

    if(SelectYCoCgDiagonal(block, mincol.gb(), maxcol.gb()))  {
        float tmp = maxcol.b;
        maxcol.b = mincol.b;
        mincol.b = tmp;
    }

    u32 scale = ScaleYCoCg(mincol.gb(), maxcol.gb());
//     printf("Scale: %u.\n", scale);
    // Output CoCg in DXT1 block.
    uint4 outp;
    outp.z = EmitEndPointsYCoCgDXT5(mincol.g, mincol.b, maxcol.g, maxcol.b, scale);
    outp.w = EmitIndicesYCoCgDXT5(block, mincol.gb(), maxcol.gb());

    InsetYBBox(mincol.r, maxcol.r);

    // Output Y in DXT5 alpha block.
    outp.x = EmitAlphaEndPointsYCoCgDXT5(mincol.r, maxcol.r);

    uvec2 indices = EmitAlphaIndicesYCoCgDXT5(block, mincol.r, maxcol.r);
    outp.x |= indices.r;
    outp.y = indices.g;

    ((uint4*)out)[block_idx] = outp;

}

/// Encodes the block into DXT1 format and saves it into output buffer
template <>
__device__ void dxt_encode<1>(void * out, const int block_idx,
                                     float r[16], float g[16], float b[16]) {
    // find min and max sample values for each component
    float mincol_r = r[0];
    float mincol_g = g[0];
    float mincol_b = b[0];
    float maxcol_r = r[0];
    float maxcol_g = g[0];
    float maxcol_b = b[0];
    for(int i = 1; i < 16; i++) {
        mincol_r = min(mincol_r, r[i]);
        mincol_g = min(mincol_g, g[i]);
        mincol_b = min(mincol_b, b[i]);
        maxcol_r = max(maxcol_r, r[i]);
        maxcol_g = max(maxcol_g, g[i]);
        maxcol_b = max(maxcol_b, b[i]);
    }

    // inset the bounding box
    const float inset_r = (maxcol_r - mincol_r) * 0.0625f;
    const float inset_g = (maxcol_g - mincol_g) * 0.0625f;
    const float inset_b = (maxcol_b - mincol_b) * 0.0625f;
    mincol_r += inset_r;
    mincol_g += inset_g;
    mincol_b += inset_b;
    maxcol_r -= inset_r;
    maxcol_g -= inset_g;
    maxcol_b -= inset_b;

    // select diagonal
    const float center_r = (mincol_r + maxcol_r) * 0.5f;
    const float center_g = (mincol_g + maxcol_g) * 0.5f;
    const float center_b = (mincol_b + maxcol_b) * 0.5f;
    float cov_x = 0.0f;
    float cov_y = 0.0f;
    for(int i = 0; i < 16; i++) {
        const float dir_r = r[i] - center_r;
        const float dir_g = g[i] - center_g;
        const float dir_b = b[i] - center_b;
        cov_x += dir_r * dir_b;
        cov_y += dir_g * dir_b;
    }
    if(cov_x < 0.0f) {
        swap(maxcol_r, mincol_r);
    }
    if(cov_y < 0.0f) {
        swap(maxcol_g, mincol_g);
    }

    // encode both endpoints into 565 color format
    const u32 max_code = encode_endpoint(maxcol_r, maxcol_g, maxcol_b);
    const u32 min_code = encode_endpoint(mincol_r, mincol_g, mincol_b);

    // swap palette end colors if 'max' code is less than 'min' color code
    // (Palette color #3 would otherwise be interpreted as 'transparent'.)
    const bool swap_end_colors = max_code < min_code;

    // encode the palette into 32 bits (Only 2 end colors are stored.)
    const u32 palette_code = swap_end_colors ?
            min_code + (max_code << 16): max_code + (min_code << 16);

    // pack palette color indices (if both endpoint colors are not equal)
    u32 indices = 0;
    if(max_code != min_code) {
        // project each color to line maxcol-mincol, represent it as
        // "mincol + t * (maxcol - mincol)" and then use 't' to find closest
        // palette color index.
        const float dir_r = mincol_r - maxcol_r;
        const float dir_g = mincol_g - maxcol_g;
        const float dir_b = mincol_b - maxcol_b;
        const float dir_sqr_len = dir_r * dir_r + dir_g * dir_g + dir_b * dir_b;
        const float dir_inv_sqr_len = __fdividef(1.0f, dir_sqr_len);
        const float t_r = dir_r * dir_inv_sqr_len;
        const float t_g = dir_g * dir_inv_sqr_len;
        const float t_b = dir_b * dir_inv_sqr_len;
        const float t_bias = t_r * maxcol_r + t_g * maxcol_g + t_b * maxcol_b;

        // for each pixel color:
        for(int i = 0; i < 16; i++) {
            // get 't' for the color
            const float col_t = r[i] * t_r + g[i] * t_g + b[i] * t_b - t_bias;

            // scale the range of the 't' to [0..3] and convert to integer
            // to get the index of palette color
            const u32 col_idx = (u32)(3.0f * __saturatef(col_t) + 0.5f);

            // pack the color palette index with others
            indices += col_idx << (i * 2);
        }
    }

    // possibly invert indices if end colors must be swapped
    if(swap_end_colors) {
        indices = ~indices;
    }

    // substitute all packed indices (each index is packed into two bits)
    // 00 -> 00, 01 -> 10, 10 -> 11 and 11 -> 01
    const u32 lsbs = indices & 0x55555555;
    const u32 msbs = indices & 0xaaaaaaaa;
    indices = msbs ^ (2 * lsbs + (msbs >> 1));

    // compose and save output
    ((uint2*)out)[block_idx] = make_uint2(palette_code, indices);
}


/// DXT compression - each thread compresses one 4x4 DXT block.
/// Alpha-color palette mode is not used (always emmits 4color palette code).
template <bool YUV_TO_RGB, bool VERTICAL_MIRRORING, int DXT_TYPE>
__global__ static void dxt_kernel(const void * src, void * out, int size_x, int size_y) {
    // coordinates of this thread's 4x4 block
    const int block_idx_x = threadIdx.x + blockIdx.x * blockDim.x;
    const int block_idx_y = threadIdx.y + blockIdx.y * blockDim.y;

    // coordinates of block's top-left pixel
    const int block_x = block_idx_x * 4;
    const int block_y = block_idx_y * 4;

    // raster order index of the block
    const int block_idx = block_idx_x + (size_x >> 2) * block_idx_y;

    // skip if out of bounds
    if(block_y >= size_y || block_x >= size_x) {
        return;
    }

    // samples of 16 pixels
    float r[16];
    float g[16];
    float b[16];

    // load RGB samples for all 16 input pixels
    const int src_stride = (size_x >> 2) * 3;
    for(int y = 0; y < 4; y++) {
        // offset of loaded pixels in the buffer
        const int load_offset = y * 4;

        // pointer to source of this input row
        int row_idx = block_y + y;
        if(VERTICAL_MIRRORING) {
            row_idx = size_y - 1 - row_idx;
        }
        const uchar4 * const row_src = (uchar4*)src
                                     + src_stride * row_idx
                                     + block_idx_x * 3;

        // load all 4 3component pixels of the row
        const uchar4 p0 = row_src[0];
        const uchar4 p1 = row_src[1];
        const uchar4 p2 = row_src[2];

        // pixel #0
        r[load_offset + 0] = p0.x * 0.00392156862745f;
        g[load_offset + 0] = p0.y * 0.00392156862745f;
        b[load_offset + 0] = p0.z * 0.00392156862745f;

        // pixel #1
        r[load_offset + 1] = p0.w * 0.00392156862745f;
        g[load_offset + 1] = p1.x * 0.00392156862745f;
        b[load_offset + 1] = p1.y * 0.00392156862745f;

        // pixel #2
        r[load_offset + 2] = p1.z * 0.00392156862745f;
        g[load_offset + 2] = p1.w * 0.00392156862745f;
        b[load_offset + 2] = p2.x * 0.00392156862745f;

        // pixel #3
        r[load_offset + 3] = p2.y * 0.00392156862745f;
        g[load_offset + 3] = p2.z * 0.00392156862745f;
        b[load_offset + 3] = p2.w * 0.00392156862745f;
    }

    // transform colors from YUV to RGB if required
    if(YUV_TO_RGB) {
        for(int i = 0; i < 16; i++) {
            yuv_to_rgb(r[i], g[i], b[i]);
        }
    }

    // Select the right DXT type transform
    dxt_encode<DXT_TYPE>(out, block_idx, r, g, b);
}

__global__ static void yuv422_to_yuv444_kernel(const void * src, void * out, int pix_count) {
    // coordinates of this thread
    const int block_idx_x = threadIdx.x + blockIdx.x * blockDim.x;

    // skip if out of bounds
    if(block_idx_x >= pix_count / 2) {
        return;
    }

    uchar4 *this_src = ((uchar4 *) src) + block_idx_x * 2;
    uchar4 *this_out = ((uchar4 *) out) + block_idx_x * 3;

    uchar4 pix12 = this_src[0];
    uchar4 pix34 = this_src[1];

    uchar4 out_pix[3];
    out_pix[0].x = pix12.y;
    out_pix[0].y = pix12.x;
    out_pix[0].z = pix12.z;

    out_pix[0].w = pix12.w;
    out_pix[1].x = pix12.x;
    out_pix[1].y = pix12.z;

    out_pix[1].z = pix34.y;
    out_pix[1].w = pix34.x;
    out_pix[2].x = pix34.z;

    out_pix[2].y = pix34.w;
    out_pix[2].z = pix34.x;
    out_pix[2].w = pix34.z;

    this_out[0] = out_pix[0];
    this_out[1] = out_pix[1];
    this_out[2] = out_pix[2];
}

/// Compute grid size and launch DXT kernel.
template <bool YUV_TO_RGB, int DXT_TYPE>
static int dxt_launch(const void * src, void * out, int sx, int sy, cudaStream_t str) {
    // vertical mirroring?
    bool mirrored = false;
    if(sy < 0) {
        mirrored = true;
        sy = -sy;
    }

    // check image size and alignment
    if((sx & 3) || (sy & 3) || (15 & (size_t)src) || (7 & (size_t)out)) {
        return -1;
    }

    // grid and threadblock sizes
    const dim3 tsiz(16, 16);
    const dim3 gsiz((sx + tsiz.x - 1) / tsiz.x, (sy + tsiz.y - 1) / tsiz.y);

    // launch kernel, sync and check the result
    if(mirrored) {
        dxt_kernel<YUV_TO_RGB, true, DXT_TYPE><<<gsiz, tsiz, 0, str>>>(src, out, sx, sy);
    } else {
        dxt_kernel<YUV_TO_RGB, false, DXT_TYPE><<<gsiz, tsiz, 0, str>>>(src, out, sx, sy);
    }
    return cudaSuccess != cudaStreamSynchronize(str) ? -3 : 0;
}

CUDA_DLL_API int cuda_yuv422_to_yuv444(const void * src, void * out,
                int pix_count, cuda_wrapper_stream_t str) {
    // grid and threadblock sizes
    const dim3 tsiz(64, 1);
    int thread_count = pix_count / 4; // we process block of 4 pixels
    const dim3 gsiz((thread_count + tsiz.x - 1) / tsiz.x, 1);
    yuv422_to_yuv444_kernel<<<gsiz, tsiz, 0, (cudaStream_t) str>>>(src, out, pix_count);
    return cudaSuccess != cudaStreamSynchronize((cudaStream_t) str) ? -3 : 0;
}

/// CUDA DXT1 compression (only RGB without alpha).
/// @param src  Pointer to top-left source pixel in device-memory buffer.
///             8bit RGB samples are expected (no alpha and no padding).
///             (Pointer must be aligned to multiples of 16 bytes.)
/// @param out  Pointer to output buffer in device memory.
///             (Must be aligned to multiples of 8 bytes.)
/// @param size_x  Width of the input image (must be divisible by 4).
/// @param size_y  Height of the input image (must be divisible by 4).
/// @param stream  CUDA stream to run in, or 0 for default stream.
/// @return 0 if OK, nonzero if failed.
CUDA_DLL_API int cuda_rgb_to_dxt1(const void * src, void * out,
                int size_x, int size_y, cuda_wrapper_stream_t stream) {
    return dxt_launch<false, 1>(src, out, size_x, size_y, (cudaStream_t) stream);
}


/// CUDA DXT1 compression (only RGB without alpha).
/// Converts input from YUV to RGB color space.
/// @param src  Pointer to top-left source pixel in device-memory buffer.
///             8bit RGB samples are expected (no alpha and no padding).
///             (Pointer must be aligned to multiples of 16 bytes.)
/// @param out  Pointer to output buffer in device memory.
///             (Must be aligned to multiples of 8 bytes.)
/// @param size_x  Width of the input image (must be divisible by 4).
/// @param size_y  Height of the input image (must be divisible by 4).
/// @param stream  CUDA stream to run in, or 0 for default stream.
/// @return 0 if OK, nonzero if failed.
CUDA_DLL_API int cuda_yuv_to_dxt1(const void * src, void * out,
                int size_x, int size_y, cuda_wrapper_stream_t stream) {
    return dxt_launch<true, 1>(src, out, size_x, size_y, (cudaStream_t) stream);
}


/// CUDA DXT6 (DXT5-YcOcG) compression (only RGB without alpha).
/// @param src  Pointer to top-left source pixel in device-memory buffer.
///             8bit RGB samples are expected (no alpha and no padding).
///             (Pointer must be aligned to multiples of 16 bytes.)
/// @param out  Pointer to output buffer in device memory.
///             (Must be aligned to multiples of 8 bytes.)
/// @param size_x  Width of the input image (must be divisible by 4).
/// @param size_y  Height of the input image (must be divisible by 4).
///                (Input is read bottom up if negative)
/// @param stream  CUDA stream to run in, or 0 for default stream.
/// @return 0 if OK, nonzero if failed.
CUDA_DLL_API int cuda_rgb_to_dxt6(const void * src, void * out,
                int size_x, int size_y, cuda_wrapper_stream_t stream) {
    return dxt_launch<false, 6>(src, out, size_x, size_y, (cudaStream_t) stream);
}

CUDA_DLL_API int cuda_yuv_to_dxt6(const void * src, void * out,
                int size_x, int size_y, cuda_wrapper_stream_t stream) {
    return dxt_launch<true, 6>(src, out, size_x, size_y, (cudaStream_t) stream);
}