UltraGrid/dxt_compress/dxt.cg

/*
    Real-time DXT1 & YCoCg DXT5 compression (Cg 2.0)
    Copyright (c) NVIDIA Corporation.
    Written by: Ignacio Castano <icastano@nvidia.com>

    Thanks to JMP van Waveren, Simon Green, Eric Werness, Simon Brown

    Permission is hereby granted, free of charge, to any person
    obtaining a copy of this software and associated documentation
    files (the "Software"), to deal in the Software without
    restriction, including without limitation the rights to use,
    copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the
    Software is furnished to do so, subject to the following
    conditions:

    The above copyright notice and this permission notice shall be
    included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
    OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
    HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
    WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
    OTHER DEALINGS IN THE SOFTWARE.
*/

// Image formats
// Values for the imageFormat uniform. The compress entry points only test for
// FORMAT_YUV; any other value leaves the sampled texels unconverted.
const int FORMAT_RGB = 0;
const int FORMAT_YUV = 1;

// Convert YUV to RGB
// color: (Y, U, V) triple as sampled from the source image.
// Returns the (R, G, B) triple; the result is not clamped.
vec3 ConvertYUVToRGB(vec3 color)
{
    // Remove the luma offset and apply the luma gain.
    float luma = 1.1643 * (color[0] - 0.0625);

    // Centre both chroma channels on zero.
    float u = color[1] - 0.5;
    float v = color[2] - 0.5;

    return vec3(luma + 1.5958 * v,
                luma - 0.39173 * u - 0.81290 * v,
                luma + 2.017 * u);
}
// Chroma bias: added to Co and Cg when encoding (ConvertRGBToYCoCg) and
// subtracted again when decoding (display_YCoCgDXT5_fp).
const float offset = 128.0 / 255.0;

// Converts an RGB triple to (Y, Co, Cg).
// Co and Cg are biased by `offset` so that a zero chroma value maps to 128/255.
vec3 ConvertRGBToYCoCg(vec3 color)
{
    float r = color.r;
    float g = color.g;
    float b = color.b;

    float luma       = (r + 2 * g + b) * 0.25;
    float chromaCo   = ( ( 2 * r - 2 * b ) * 0.25 + offset );
    float chromaCg   = ( ( -r + 2 * g - b ) * 0.25 + offset );

    return vec3(luma, chromaCo, chromaCg);
}

// Short names for the unsigned integer scalar and vector types used below.
typedef unsigned int uint;
typedef unsigned int2 uint2;
typedef unsigned int3 uint3;
typedef unsigned int4 uint4;

// Use the dot product (squared distance) to minimize RMS error, instead of the absolute distance used in the CPU compressor.
// Squared Euclidean distance between two 3-component colors.
float colorDistance(float3 c0, float3 c1)
{
    float3 delta = c0 - c1;
    return dot(delta, delta);
}
// Squared Euclidean distance between two 2-component colors.
float colorDistance(float2 c0, float2 c1)
{
    float2 delta = c0 - c1;
    return dot(delta, delta);
}

// Samples a 4x4 grid of texels into col[] in row-major order (col[row*4 + column]).
// The grid starts two texels before texcoord on both axes and advances one
// texel per step, so it covers offsets -2 .. +1 texels around texcoord.
// imageSize: its reciprocal is used as the size of one texel in texture coordinates.
void ExtractColorBlock(out float3 col[16], sampler2D image, float2 texcoord, float2 imageSize)
{
    float2 texelSize = (1.0f / imageSize);
    float2 origin = texcoord - texelSize * 2;

    for (int row = 0; row < 4; row++) {
        for (int column = 0; column < 4; column++) {
            float2 samplePos = origin + float2(column, row) * texelSize;
            col[row*4+column] = tex2D(image, samplePos).rgb;
        }
    }
}

// Computes the axis-aligned bounding box of the block in color space:
// mincol receives the per-component minimum of all 16 texels, maxcol the
// per-component maximum.
void FindMinMaxColorsBox(float3 block[16], out float3 mincol, out float3 maxcol)
{
    float3 lo = block[0];
    float3 hi = block[0];

    for (int k = 1; k < 16; k++) {
        float3 texel = block[k];
        lo = min(lo, texel);
        hi = max(hi, texel);
    }

    mincol = lo;
    maxcol = hi;
}

// Shrinks the RGB bounding box in place: each bound moves toward the other by
// 1/16 of the box extent minus (8/255)/16, and the result is clamped to [0, 1].
void InsetBBox(in out float3 mincol, in out float3 maxcol)
{
    float3 extent = maxcol - mincol;
    float3 inset = extent / 16.0 - (8.0 / 255.0) / 16;

    mincol = saturate(mincol + inset);
    maxcol = saturate(maxcol - inset);
}
// Scalar variant used for the luma range: each bound moves toward the other by
// 1/32 of the extent minus (16/255)/32, and the result is clamped to [0, 1].
void InsetYBBox(in out float mincol, in out float maxcol)
{
    float extent = maxcol - mincol;
    float inset = extent / 32.0 - (16.0 / 255.0) / 32.0;

    mincol = saturate(mincol + inset);
    maxcol = saturate(maxcol - inset);
}
// Two-component variant used for the CoCg box: each bound moves toward the
// other by 1/16 of the extent minus (8/255)/16, clamped to [0, 1].
void InsetCoCgBBox(in out float2 mincol, in out float2 maxcol)
{
    float2 extent = maxcol - mincol;
    float2 inset = extent / 16.0 - (8.0 / 255.0) / 16;

    mincol = saturate(mincol + inset);
    maxcol = saturate(maxcol - inset);
}

// Chooses which diagonal of the bounding box to use as the end-point pair.
// Accumulates the x*z and y*z products of the texels relative to the box
// centre; when a sum is negative, the corresponding component (x or y) of
// mincol and maxcol is exchanged.
void SelectDiagonal(float3 block[16], in out float3 mincol, in out float3 maxcol)
{
    float3 center = (mincol + maxcol) * 0.5;

    float covXZ = 0;
    float covYZ = 0;
    for (int k = 0; k < 16; k++)
    {
        float3 d = block[k] - center;
        covXZ += d.x * d.z;
        covYZ += d.y * d.z;
    }

    if (covXZ < 0) {
        float swapX = mincol.x;
        mincol.x = maxcol.x;
        maxcol.x = swapX;
    }
    if (covYZ < 0) {
        float swapY = mincol.y;
        mincol.y = maxcol.y;
        maxcol.y = swapY;
    }
}

// Quantises v to 5:6:5 bits and reports both forms of the result.
// w:       receives the packed value (r in bits 11-15, g in bits 5-10, b in bits 0-4).
// Returns: the quantised color expanded back to 8 bits per channel (high bits
//          replicated into the low bits) and divided by 255.
float3 RoundAndExpand(float3 v, out uint w)
{
    int3 q = round(v * float3(31, 63, 31));
    w = (q.r << 11) | (q.g << 5) | q.b;

    int3 expanded;
    expanded.rb = (q.rb << 3) | (q.rb >> 2);
    expanded.g = (q.g << 2) | (q.g >> 4);

    return (float3)expanded * (1.0 / 255.0);
}

// Quantises both end points (replacing mincol/maxcol with their quantised,
// re-expanded values) and returns them packed into one 32-bit word.
// The numerically larger 16-bit value always lands in the low half; when that
// turns out to be the quantised mincol, mincol and maxcol are exchanged so
// that maxcol still matches the low half.
uint EmitEndPointsDXT1(in out float3 mincol, in out float3 maxcol)
{
    uint packedMax;
    uint packedMin;
    maxcol = RoundAndExpand(maxcol, packedMax);
    mincol = RoundAndExpand(mincol, packedMin);

    // The order can be inverted when an alternate diagonal was selected.
    if (packedMax < packedMin)
    {
        float3 previousMin = mincol;
        mincol = maxcol;
        maxcol = previousMin;
        return packedMin | (packedMax << 16);
    }

    return packedMax | (packedMin << 16);
}

#if 0

// Disabled variant (inside #if 0): derives each texel's index from its
// position along the maxcol -> mincol axis instead of comparing distances to
// the four palette entries.
// Returns 16 two-bit indices, texel i in bits 2*i and 2*i+1.
uint EmitIndicesDXT1(float3 block[16], float3 mincol, float3 maxcol)
{
    const float RGB_RANGE = 3;

    float3 dir = (maxcol - mincol);
    // Origin sits half a step beyond maxcol; presumably so that the
    // float-to-uint conversion below acts as round-to-nearest.
    float3 origin = maxcol + dir / (2.0 * RGB_RANGE);
    // NOTE(review): dot(dir, dir) is zero when maxcol == mincol.
    dir /= dot(dir, dir);

    // Compute indices
    uint indices = 0;
    for (int i = 0; i < 16; i++)
    {
        // Linear position along the axis: 0 at maxcol, RGB_RANGE at mincol.
        uint index = saturate(dot(origin - block[i], dir)) * RGB_RANGE;
        indices |= index << (i * 2);
    }

    // Remap all 16 indices at once: linear positions 0, 1, 2, 3 become
    // index values 0, 2, 3, 1 (i0 = low bits, i1 = high bits).
    uint i0 = (indices & 0x55555555);
    uint i1 = (indices & 0xAAAAAAAA) >> 1;
    indices = ((i0 ^ i1) << 1) | i1;

    // Output indices
    return indices;
}

#else

// Selects, for every texel, the palette entry with the smallest squared
// distance and packs the 16 two-bit indices (texel i in bits 2*i and 2*i+1).
// Palette: 0 = maxcol, 1 = mincol, 2 = one third of the way from maxcol to
// mincol, 3 = two thirds of the way.
uint EmitIndicesDXT1(float3 col[16], float3 mincol, float3 maxcol)
{
    float3 palette[4];
    palette[0] = maxcol;
    palette[1] = mincol;
    palette[2] = lerp(maxcol, mincol, 1.0/3.0);
    palette[3] = lerp(maxcol, mincol, 2.0/3.0);

    uint indexBits = 0;
    for (int texel = 0; texel < 16; texel++) {
        float3 current = col[texel];

        float d0 = colorDistance(current, palette[0]);
        float d1 = colorDistance(current, palette[1]);
        float d2 = colorDistance(current, palette[2]);
        float d3 = colorDistance(current, palette[3]);

        // Pairwise "is farther than" flags, each 0 or 1.
        uint gt03 = d0 > d3;
        uint gt12 = d1 > d2;
        uint gt02 = d0 > d2;
        uint gt13 = d1 > d3;
        uint gt23 = d2 > d3;

        // Combine the flags into the two-bit index without branching.
        uint lowBit = gt03 & gt23;
        uint highBit = (gt12 & gt02) | (gt03 & gt13);

        indexBits |= (lowBit | (highBit << 1)) << (texel*2);
    }

    return indexBits;
}

#endif

// Picks the chroma scale factor for a block from the largest distance of any
// CoCg bound to the neutral `offset`:
//   below 32/255 -> 4, below 64/255 -> 2, otherwise 1.
int ScaleYCoCg(float2 minColor, float2 maxColor)
{
    const float halfRange    = 64.0 / 255.0;
    const float quarterRange = 32.0 / 255.0;

    float2 devMin = abs(minColor - offset);
    float2 devMax = abs(maxColor - offset);
    float peak = max(max(devMin.x, devMin.y), max(devMax.x, devMax.y));

    int scale = (peak < quarterRange) ? 4 : ((peak < halfRange) ? 2 : 1);
    return scale;
}

// Chooses the diagonal of the CoCg bounding box. Sums the product of the two
// chroma components (block[i].yz) relative to the box centre; when the sum is
// negative, the second component of minColor and maxColor is exchanged.
void SelectYCoCgDiagonal(const float3 block[16], in out float2 minColor, in out float2 maxColor)
{
    float2 center = (maxColor + minColor) * 0.5;

    float covariance = 0;
    for (int k = 0; k < 16; k++)
    {
        float2 d = block[k].yz - center;
        covariance += d.x * d.y;
    }

    if (covariance < 0) {
        float previousMin = minColor.y;
        minColor.y = maxColor.y;
        maxColor.y = previousMin;
    }
}


// Quantises the CoCg end points and packs them as two 5:6:5 words.
// The chroma values are first stretched by `scale` around `offset` and inset;
// Co goes into the 5-bit red field, Cg into the 6-bit green field, and
// (scale - 1) into the blue field of both words.
// On return mincol/maxcol hold the quantised values mapped back to the
// unscaled range. The result has the maxcol word in the low 16 bits and the
// mincol word in the high 16 bits.
uint EmitEndPointsYCoCgDXT5(in out float2 mincol, in out float2 maxcol, int scale)
{
    // Apply the scale around the neutral chroma value, then shrink the box.
    float2 hiCoCg = (maxcol - offset) * scale + offset;
    float2 loCoCg = (mincol - offset) * scale + offset;
    InsetCoCgBBox(loCoCg, hiCoCg);

    // Quantise to 5 and 6 bits.
    int2 qHi = round(hiCoCg * float2(31, 63));
    int2 qLo = round(loCoCg * float2(31, 63));

    uint wordHi = (qHi.r << 11) | (qHi.g << 5) | (scale - 1);
    uint wordLo = (qLo.r << 11) | (qLo.g << 5) | (scale - 1);

    // Expand back to 8 bits by replicating the high bits into the low bits.
    int2 expandedHi;
    expandedHi.r = (qHi.r << 3) | (qHi.r >> 2);
    expandedHi.g = (qHi.g << 2) | (qHi.g >> 4);

    int2 expandedLo;
    expandedLo.r = (qLo.r << 3) | (qLo.r >> 2);
    expandedLo.g = (qLo.g << 2) | (qLo.g >> 4);

    hiCoCg = (float2)expandedHi * (1.0 / 255.0);
    loCoCg = (float2)expandedLo * (1.0 / 255.0);

    // Undo rescale.
    maxcol = (hiCoCg - offset) / scale + offset;
    mincol = (loCoCg - offset) / scale + offset;

    return wordHi | (wordLo << 16);
}

#if 0

// Disabled variant (inside #if 0): derives each texel's CoCg index from the
// position of block[i].yz along the maxcol -> mincol axis instead of
// comparing distances to the four palette entries.
// Returns 16 two-bit indices, texel i in bits 2*i and 2*i+1.
uint EmitIndicesYCoCgDXT5(float3 block[16], float2 mincol, float2 maxcol)
{
    const float COCG_RANGE = 3;

    float2 dir = (maxcol - mincol);
    // Origin sits half a step beyond maxcol; presumably so that the
    // float-to-uint conversion below acts as round-to-nearest.
    float2 origin = maxcol + dir / (2.0 * COCG_RANGE);
    // NOTE(review): dot(dir, dir) is zero when maxcol == mincol.
    dir /= dot(dir, dir);

    // Compute indices
    uint indices = 0;
    for (int i = 0; i < 16; i++)
    {
        // Linear position along the axis: 0 at maxcol, COCG_RANGE at mincol.
        uint index = saturate(dot(origin - block[i].yz, dir)) * COCG_RANGE;
        indices |= index << (i * 2);
    }

    // Remap all 16 indices at once: linear positions 0, 1, 2, 3 become
    // index values 0, 2, 3, 1 (i0 = low bits, i1 = high bits).
    uint i0 = (indices & 0x55555555);
    uint i1 = (indices & 0xAAAAAAAA) >> 1;
    indices = ((i0 ^ i1) << 1) | i1;

    // Output indices
    return indices;
}

#else

// Selects, for every texel, the CoCg palette entry with the smallest squared
// distance to block[i].yz and packs the 16 two-bit indices (texel i in bits
// 2*i and 2*i+1).
// Palette: 0 = maxcol, 1 = mincol, 2 = one third of the way from maxcol to
// mincol, 3 = two thirds of the way.
uint EmitIndicesYCoCgDXT5(float3 block[16], float2 mincol, float2 maxcol)
{
    float2 palette[4];
    palette[0] = maxcol;
    palette[1] = mincol;
    palette[2] = lerp(maxcol, mincol, 1.0/3.0);
    palette[3] = lerp(maxcol, mincol, 2.0/3.0);

    uint indexBits = 0;
    for (int texel = 0; texel < 16; texel++)
    {
        float2 chroma = block[texel].yz;

        float d0 = colorDistance(chroma, palette[0]);
        float d1 = colorDistance(chroma, palette[1]);
        float d2 = colorDistance(chroma, palette[2]);
        float d3 = colorDistance(chroma, palette[3]);

        // Pairwise "is farther than" flags, each 0 or 1.
        uint gt03 = d0 > d3;
        uint gt12 = d1 > d2;
        uint gt02 = d0 > d2;
        uint gt13 = d1 > d3;
        uint gt23 = d2 > d3;

        // Combine the flags into the two-bit index without branching.
        uint lowBit = gt03 & gt23;
        uint highBit = (gt12 & gt02) | (gt03 & gt13);

        indexBits |= (lowBit | (highBit << 1)) << (texel*2);
    }

    return indexBits;
}

#endif

// Quantises the two luma end points to 8 bits each and packs them into one
// word: maxcol in bits 0-7, mincol in bits 8-15.
uint EmitAlphaEndPointsYCoCgDXT5(float mincol, float maxcol)
{
    uint quantMin = round(mincol * 255);
    uint quantMax = round(maxcol * 255);

    return (quantMin << 8) | quantMax;
}

#if 0

// Optimized index selection.
// Disabled variant (inside #if 0). Computes each texel's 3-bit index from the
// linear position of its luma (block[i].x) between maxAlpha and minAlpha, then
// remaps all indices at once with bit-plane operations.
// Returns the 48 index bits: the first 16 in the upper half of .x (the lower
// half is left zero), the remaining 32 in .y.
uint2 EmitAlphaIndicesYCoCgDXT5(float3 block[16], float minAlpha, float maxAlpha)
{
    const int ALPHA_RANGE = 7;

    // Bias sits half a step above maxAlpha; presumably so that the
    // float-to-uint conversion below acts as round-to-nearest.
    float bias = maxAlpha + (maxAlpha - minAlpha) / (2.0 * ALPHA_RANGE);
    // NOTE(review): divides by zero when maxAlpha == minAlpha.
    float scale = 1.0f / (maxAlpha - minAlpha);

    uint2 indices = 0;

    // Texels 0-5: 18 bits collected in indices.x starting at bit 0.
    for (int i = 0; i < 6; i++)
    {
        // Linear position: 0 at maxAlpha, ALPHA_RANGE at minAlpha.
        uint index = saturate((bias - block[i].x) * scale) * ALPHA_RANGE;
        indices.x |= index << (3 * i);
    }

    // Texels 6-15: 30 bits collected in indices.y starting at bit 0.
    for (int i = 6; i < 16; i++)
    {
        uint index = saturate((bias - block[i].x) * scale) * ALPHA_RANGE;
        indices.y |= index << (3 * i - 18);
    }

    // Split every 3-bit index into its three bit planes (mask keeps every third bit).
    uint2 i0 = (indices >> 0) & 0x09249249;
    uint2 i1 = (indices >> 1) & 0x09249249;
    uint2 i2 = (indices >> 2) & 0x09249249;

    // Remap linear position to index value: 0 stays 0, 7 becomes 1, and
    // positions 1-6 become 2-7.
    i2 ^= i0 & i1;
    i1 ^= i0;
    i0 ^= (i1 | i2);

    // Reassemble, then shift so that the 18 bits of .x start at bit 16: the
    // two bits that no longer fit move into the bottom of .y.
    indices.x = (i2.x << 2) | (i1.x << 1) | i0.x;
    indices.y = (((i2.y << 2) | (i1.y << 1) | i0.y) << 2) | (indices.x >> 16);
    indices.x <<= 16;

    return indices;
}

#else

// Version shown in the YCoCg-DXT article.

// Helper: maps one luma value to its 3-bit index by counting how many of the
// seven thresholds it does not exceed. A value above every threshold yields 0,
// a value at or below every threshold yields 1, and 1-6 matches yield 2-7.
uint SelectAlphaIndexYCoCgDXT5(float a, float cut[7])
{
    uint index = 1;
    for (int k = 0; k < 7; k++)
    {
        index += (a <= cut[k]);
    }
    index &= 7;
    index ^= (2 > index);
    return index;
}

// Computes the sixteen 3-bit luma indices from block[i].x.
// Returns the 48 index bits: the first 16 in the upper half of .x (the lower
// half is left zero so the caller can OR in the end points), the remaining 32
// in .y.
uint2 EmitAlphaIndicesYCoCgDXT5(float3 block[16], float minAlpha, float maxAlpha)
{
    const int ALPHA_RANGE = 7;

    // Half of one interpolation step.
    float mid = (maxAlpha - minAlpha) / (2.0 * ALPHA_RANGE);

    // Thresholds between neighbouring interpolated values.
    float cut[7];
    cut[0] = minAlpha + mid;
    cut[1] = (6 * maxAlpha + 1 * minAlpha) * (1.0 / ALPHA_RANGE) + mid;
    cut[2] = (5 * maxAlpha + 2 * minAlpha) * (1.0 / ALPHA_RANGE) + mid;
    cut[3] = (4 * maxAlpha + 3 * minAlpha) * (1.0 / ALPHA_RANGE) + mid;
    cut[4] = (3 * maxAlpha + 4 * minAlpha) * (1.0 / ALPHA_RANGE) + mid;
    cut[5] = (2 * maxAlpha + 5 * minAlpha) * (1.0 / ALPHA_RANGE) + mid;
    cut[6] = (1 * maxAlpha + 6 * minAlpha) * (1.0 / ALPHA_RANGE) + mid;

    uint2 indices = 0;
    uint index;

    // Texels 0-5 start at bit 16 of .x; only the lowest bit of texel 5 fits.
    for (int head = 0; head < 6; head++)
    {
        index = SelectAlphaIndexYCoCgDXT5(block[head].x, cut);
        indices.x |= index << (3 * head + 16);
    }

    // The two upper bits of texel 5 continue at the bottom of .y.
    indices.y = index >> 1;

    // Texels 6-15 follow from bit 2 of .y.
    for (int tail = 6; tail < 16; tail++)
    {
        index = SelectAlphaIndexYCoCgDXT5(block[tail].x, cut);
        indices.y |= index << (3 * tail - 16);
    }

    return indices;
}

#endif

// Integer output written by compress_DXT1_fp and compress_YCoCgDXT5_fp.
varying out uvec4 colorInt;
// Source pixel format; compared against FORMAT_YUV by the compress entry points.
uniform int imageFormat = 0;

// compress a 4x4 block to DXT1 format
// integer version, renders to 2 x int32 buffer
//
// texcoord:  texture coordinate handed to ExtractColorBlock to locate the block.
// image:     source texture.
// imageSize: passed to ExtractColorBlock, which uses its reciprocal as the
//            texel size.
// Output (colorInt): x = packed end points, w = packed 2-bit indices,
//            y and z = 0.
void compress_DXT1_fp(
    float2 texcoord : TEXCOORD0,
    uniform sampler2D image,
    uniform float2 imageSize)
{
    // Read block
    float3 block[16];
    ExtractColorBlock(block, image, texcoord, imageSize);

    // Convert to RGB
    if ( int(imageFormat) == FORMAT_YUV ) {
        for ( int index = 0; index < 16; index++ )
            block[index] = ConvertYUVToRGB(block[index]);
    }

    // Find min and max colors
    float3 mincol, maxcol;
    FindMinMaxColorsBox(block, mincol, maxcol);

    SelectDiagonal(block, mincol, maxcol);

    InsetBBox(mincol, maxcol);

    // Only x and w carry data. Zero the vector first so that y and z are
    // defined when the whole vector is copied to colorInt (previously they
    // were left uninitialised).
    uint4 output = 0;
    output.x = EmitEndPointsDXT1(mincol, maxcol);
    output.w = EmitIndicesDXT1(block, mincol, maxcol);

    colorInt = output;
}

// compress a 4x4 block to YCoCg DXT5 format
// integer version, renders to 4 x int32 buffer
//
// Output (colorInt): x = luma end points (low 16 bits) plus the first 16 luma
// index bits, y = remaining luma index bits, z = CoCg end points,
// w = CoCg indices.
void compress_YCoCgDXT5_fp(
    float2 texcoord : TEXCOORD0,
    uniform sampler2D image,
    uniform float2 imageSize)
{
    // Read block
    float3 block[16];
    ExtractColorBlock(block, image, texcoord, imageSize);

    // Bring every texel to YCoCg, going through RGB first for YUV input.
    for ( int index = 0; index < 16; index++ ) {
        float3 texel = block[index];
        if ( int(imageFormat) == FORMAT_YUV ) {
            texel = ConvertYUVToRGB(texel);
        }
        block[index] = ConvertRGBToYCoCg(texel);
    }

    // Find min and max colors
    float3 boxMin, boxMax;
    FindMinMaxColorsBox(block, boxMin, boxMax);

    // Chroma: pick the diagonal and the scale, then emit the DXT1-style block.
    float2 minCoCg = boxMin.yz;
    float2 maxCoCg = boxMax.yz;
    SelectYCoCgDiagonal(block, minCoCg, maxCoCg);

    int scale = ScaleYCoCg(minCoCg, maxCoCg);

    uint chromaEndPoints = EmitEndPointsYCoCgDXT5(minCoCg, maxCoCg, scale);
    uint chromaIndices = EmitIndicesYCoCgDXT5(block, minCoCg, maxCoCg);

    // Luma: inset the range, then emit the DXT5 alpha block.
    float minY = boxMin.x;
    float maxY = boxMax.x;
    InsetYBBox(minY, maxY);

    uint lumaEndPoints = EmitAlphaEndPointsYCoCgDXT5(minY, maxY);
    uint2 lumaIndices = EmitAlphaIndicesYCoCgDXT5(block, minY, maxY);

    uint4 output;
    output.x = lumaEndPoints | lumaIndices.x;
    output.y = lumaIndices.y;
    output.z = chromaEndPoints;
    output.w = chromaIndices;

    colorInt = output;
}

// Texture sampled by display_fp and display_YCoCgDXT5_fp (TEXUNIT0 semantic).
uniform sampler2D image : TEXUNIT0;

// Pass-through display: returns the texel of `image` at texcoord unchanged.
float4 display_fp(float2 texcoord : TEXCOORD0) : COLOR
{
    return tex2D(image, texcoord);
}

// Decodes a YCoCg texel for display: luma is read from alpha, Co from red,
// Cg from green, and the blue channel carries the chroma scale.
// Returns the reconstructed RGB with alpha set to 1.
float4 display_YCoCgDXT5_fp(float2 texcoord : TEXCOORD0) : COLOR
{
    float4 texel = tex2D(image, texcoord);

    float luma = texel.a;

    // Reciprocal of the chroma scale stored in blue.
    float invScale = 1.0 / ((255.0 / 8.0) * texel.b + 1);
    float co = (texel.r - offset) * invScale;
    float cg = (texel.g - offset) * invScale;

    return float4(luma + co - cg,
                  luma + cg,
                  luma - co - cg,
                  1);
}