Added jpeg_compress library

2026-04-07 02:05:11 +00:00 · 2011-11-23 16:22:50 +01:00
parent b06bd54d34
commit eca5ebedeb
29 changed files with 5677 additions and 0 deletions
--- a/jpeg_compress/.gitignore
+++ b/jpeg_compress/.gitignore
@@ -0,0 +1,3 @@
+jpeg_compress
+*.o
+*.rgb
--- a/jpeg_compress/Makefile
+++ b/jpeg_compress/Makefile
@@ -0,0 +1,111 @@
+# Copyright (c) 2011, Martin Srom
+# All rights reserved.
+# 
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# 
+#    # Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#    # Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+# 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+# Name of the executable produced by the build
+TARGET := jpeg_compress
+
+# C sources, listed by base name and given the .c suffix
+CFILES := $(addsuffix .c, \
+    main \
+    jpeg_common \
+    jpeg_encoder \
+    jpeg_decoder \
+    jpeg_table \
+    jpeg_huffman_cpu_encoder \
+    jpeg_huffman_cpu_decoder \
+    jpeg_writer \
+    jpeg_reader)
+
+# CUDA sources, listed by base name and given the .cu suffix
+CUFILES := $(addsuffix .cu, \
+    jpeg_preprocessor \
+    jpeg_huffman_gpu_encoder \
+    jpeg_huffman_gpu_decoder)
+
+# CUDA install path (can be overridden from the environment or command line)
+CUDA_INSTALL_PATH ?= /usr/local/cuda
+
+# Compilers
+CC := gcc
+LINK := g++ -fPIC
+NVCC := $(CUDA_INSTALL_PATH)/bin/nvcc
+
+# Common flags
+COMMONFLAGS += -I. -I$(CUDA_INSTALL_PATH)/include -O2
+# C flags
+CFLAGS += $(COMMONFLAGS) -std=c99
+# CUDA flags
+NVCCFLAGS += $(COMMONFLAGS) \
+	-gencode arch=compute_20,code=sm_20 \
+	-gencode arch=compute_11,code=sm_11
+# Linker flags
+LDFLAGS +=
+
+# Do 32bit vs. 64bit setup: only the CUDA library directory differs.
+# The sources call NPP (nppiDCTQuantInv8x8LS_JPEG_16s8u_C1R), so libnpp has
+# to be linked for both word sizes, not only on 64bit hosts.
+LBITS := $(shell getconf LONG_BIT)
+ifeq ($(LBITS),64)
+    # 64bit
+    LDFLAGS += -L$(CUDA_INSTALL_PATH)/lib64 -lcudart -lnpp
+else
+    # 32bit
+    LDFLAGS += -L$(CUDA_INSTALL_PATH)/lib -lcudart -lnpp
+endif
+
+# build and clean are commands, not files: declare them phony so that a
+# stray file named "build" or "clean" cannot make them look up to date
+.PHONY: build clean
+
+# Build (first real target in the file, therefore the default goal)
+build: $(TARGET)
+
+# Clean: remove the object files and the executable
+clean:
+	rm -f *.o $(TARGET)
+
+# Object files: every source keeps its full name and gains a .o suffix
+# (foo.c -> foo.c.o, bar.cu -> bar.cu.o)
+COBJS = $(patsubst %.c,%.c.o,$(CFILES))
+CUOBJS = $(patsubst %.cu,%.cu.o,$(CUFILES))
+
+# Link all object files into the executable
+$(TARGET): $(COBJS) $(CUOBJS)
+	$(LINK) $^ $(LDFLAGS) -o $@
+
+# Register .cu as a known suffix
+.SUFFIXES: .cu
+
+# Compile a C source into its object file
+%.c.o: %.c
+	$(CC) $(CFLAGS) -c $< -o $@
+
+# Compile a CUDA source into its object file
+%.cu.o: %.cu
+	$(NVCC) $(NVCCFLAGS) -c $< -o $@
+
+# Set file dependencies.
+# An object must be rebuilt when any header its source includes changes.
+# The lists for jpeg_common and jpeg_decoder name every project header those
+# sources include (directly or through jpeg_common.h / jpeg_decoder.h); the
+# remaining lists only name the matching header and should be extended the
+# same way (or replaced by compiler-generated dependency files).
+main.c.o: main.c
+jpeg_common.c.o: jpeg_common.c jpeg_common.h jpeg_type.h jpeg_util.h
+jpeg_encoder.c.o: jpeg_encoder.c jpeg_encoder.h
+jpeg_decoder.c.o: jpeg_decoder.c jpeg_decoder.h jpeg_common.h jpeg_type.h \
+    jpeg_table.h jpeg_reader.h jpeg_preprocessor.h \
+    jpeg_huffman_cpu_decoder.h jpeg_huffman_gpu_decoder.h jpeg_util.h
+jpeg_table.c.o: jpeg_table.c jpeg_table.h
+jpeg_preprocessor.cu.o: jpeg_preprocessor.cu jpeg_preprocessor.h
+jpeg_huffman_cpu_encoder.c.o: jpeg_huffman_cpu_encoder.c jpeg_huffman_cpu_encoder.h
+jpeg_huffman_gpu_encoder.cu.o: jpeg_huffman_gpu_encoder.cu jpeg_huffman_gpu_encoder.h
+jpeg_huffman_cpu_decoder.c.o: jpeg_huffman_cpu_decoder.c jpeg_huffman_cpu_decoder.h
+jpeg_huffman_gpu_decoder.cu.o: jpeg_huffman_gpu_decoder.cu jpeg_huffman_gpu_decoder.h
+jpeg_writer.c.o: jpeg_writer.c jpeg_writer.h
+jpeg_reader.c.o: jpeg_reader.c jpeg_reader.h
--- a/jpeg_compress/README
+++ b/jpeg_compress/README
@@ -0,0 +1,82 @@
+jpeg-codec - JPEG encoder and decoder library and console application
+
+AUTHOR:
+Martin Srom
+
+DESCRIPTION:
+  The first test implementation of the JPEG image compression standard for NVIDIA GPUs 
+used for real-time transmission of high-definition video.
+
+OVERVIEW:
+-It uses NVIDIA CUDA platform.
+-Not optimized yet (it is only the first test implementation).
+-Encoder and Decoder use Huffman coder for entropy encoding/decoding.
+-Encoder produces baseline JPEG codestream which consists of proper codestream headers and one scan 
+ for each color component without subsampling, and it uses restart flags that allow fast parallel encoding. 
+ The quality of encoded images can be specified by a value 0-100 as the standard defines.
+-Decoder can decompress only JPEG codestreams that contain a separate scan for each color component 
+ and whose components aren't subsampled. If a scan contains restart flags, the decoder can use parallelism for fast decoding.
+-Encoding/Decoding is divided into following phases:
+   Encoding:                       Decoding
+   1) Input data loading           1) Input data loading
+   2) Preprocessing                2) Parsing codestream 
+   3) Forward DCT                  3) Huffman decoder
+   4) Huffman encoder              4) Inverse DCT
+   5) Formatting codestream        5) Postprocessing
+ and they are implemented on CPU and/or GPU:
+   -CPU: 
+      -Input data loading
+      -Parsing codestream
+      -Huffman encoder/decoder (when restart flags are disabled)
+      -Output data formatting
+   -GPU: 
+      -Preprocessing/Postprocessing (color component parsing, color transformation RGB <-> YCbCr)
+      -Forward/Inverse DCT (Discrete cosine transform) implementation from NVIDIA
+       npp library (NVIDIA Performance Primitives)
+      -Huffman encoder/decoder (when restart flags are enabled)  
+
+PERFORMANCE:
+  The following tables summarize encoding/decoding performance using GTX 580 for different quality 
+settings (time, PSNR and size values are averages of encoding several images, each of them 
+multiple times):
+
+Encoding:
+         |           4k (4096x2160)         |         HD (1920x1080)
+ --------+----------------------------------+---------------------------------
+ quality | duration |     psnr |       size | duration |     psnr |       size
+ --------+----------+----------+------------+----------+----------+-----------
+      10 | 26.57 ms | 30.08 dB |  497.00 kB |  6.62 ms | 28.15 dB |  135.50 kB 
+      20 | 26.68 ms | 33.32 dB |  607.50 kB |  6.68 ms | 31.14 dB |  174.00 kB 
+      30 | 26.76 ms | 35.83 dB |  732.00 kB |  6.73 ms | 32.68 dB |  209.50 kB 
+      40 | 26.83 ms | 36.86 dB |  823.50 kB |  6.78 ms | 33.70 dB |  239.50 kB 
+      50 | 26.96 ms | 37.70 dB |  914.50 kB |  6.81 ms | 34.58 dB |  269.50 kB 
+      60 | 27.00 ms | 38.41 dB | 1018.00 kB |  6.88 ms | 35.44 dB |  303.00 kB 
+      70 | 27.13 ms | 39.57 dB | 1176.00 kB |  6.94 ms | 36.62 dB |  355.00 kB 
+      80 | 27.29 ms | 40.71 dB | 1442.50 kB |  7.00 ms | 38.01 dB |  440.50 kB 
+      90 | 27.78 ms | 42.42 dB | 2069.50 kB |  7.12 ms | 40.25 dB |  650.50 kB 
+     100 | 34.69 ms | 46.49 dB | 6650.00 kB |  9.20 ms | 46.34 dB | 2126.50 kB
+     
+Decoding:
+         |           4k (4096x2160)         |         HD (1920x1080)
+ --------+----------------------------------+---------------------------------
+ quality | duration |     psnr |       size | duration |     psnr |       size
+ --------+----------+----------+------------+----------+----------+-----------
+      10 | 
+      20 | 
+      30 | 
+      40 | 
+      50 | 
+      60 | 
+      70 | 
+      80 | 
+      90 | 
+     100 | 
+     
+USAGE:
+  ...
+
+jpeg-codec library:
+  ...
+
+jpeg-codec application:
+  ...
--- a/jpeg_compress/jpeg_common.c
+++ b/jpeg_compress/jpeg_common.c
@@ -0,0 +1,148 @@
+/**
+ * Copyright (c) 2011, Martin Srom
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+ 
+#include "jpeg_common.h"
+#include "jpeg_util.h"
+
+/** Documented at declaration */
+void
+jpeg_image_set_default_parameters(struct jpeg_image_parameters* param)
+{
+    // Pixel format defaults: three components, RGB, JPEG_4_4_4 sampling factor
+    param->sampling_factor = JPEG_4_4_4;
+    param->color_space = JPEG_RGB;
+    param->comp_count = 3;
+
+    // Dimensions default to zero
+    param->height = 0;
+    param->width = 0;
+}
+
+/** Documented at declaration */
+int
+jpeg_init_device(int device_id, int verbose)
+{
+    // Start from zero so that a failed query is handled like "no devices"
+    // instead of reading an uninitialized count
+    int dev_count = 0;
+
+    if ( cudaGetDeviceCount(&dev_count) != cudaSuccess || dev_count == 0 ) {
+        printf("No CUDA enabled device\n");
+        return -1;
+    }
+
+    if ( device_id < 0 || device_id >= dev_count ) {
+        printf("Selected device %d is out of bound. Devices on your system are in range %d - %d\n",
+               device_id, 0, dev_count - 1);
+        return -1;
+    }
+
+    // Query the properties; they are only valid when the call succeeds
+    struct cudaDeviceProp devProp;
+    if ( cudaGetDeviceProperties(&devProp, device_id) != cudaSuccess ) {
+        printf("Failed to get properties of device %d\n", device_id);
+        return -1;
+    }
+
+    if ( devProp.major < 1 ) {
+        printf("Device %d does not support CUDA\n", device_id);
+        return -1;
+    }
+
+    if ( verbose == 1 )
+        printf("Setting device %d: %s (c.c. %d.%d)\n", device_id, devProp.name, devProp.major, devProp.minor);
+
+    // Report failure to the caller when the device cannot be selected
+    if ( cudaSetDevice(device_id) != cudaSuccess ) {
+        printf("Failed to set device %d\n", device_id);
+        return -1;
+    }
+
+    return 0;
+}
+
+/** Documented at declaration */
+enum jpeg_image_file_format
+jpeg_image_get_file_format(const char* filename)
+{
+    // Parallel tables: extension[i] maps to format[i]
+    static const char *extension[] = { "raw", "rgb", "yuv", "jpg" };
+    static const enum jpeg_image_file_format format[] = { IMAGE_FILE_RAW, IMAGE_FILE_RGB, IMAGE_FILE_YUV, IMAGE_FILE_JPEG };
+    const int format_count = (int)(sizeof(format) / sizeof(*format));
+
+    // The extension is whatever follows the last '.' in the filename;
+    // a name without any '.' has no recognizable format
+    const char* ext = strrchr(filename, '.');
+    if ( ext == NULL )
+        return IMAGE_FILE_UNKNOWN;
+    ext++;
+
+    // Case-insensitive comparison of the first three characters
+    for ( int i = 0; i < format_count; i++ ) {
+        if ( strncasecmp(ext, extension[i], 3) == 0 ) {
+            return format[i];
+        }
+    }
+    return IMAGE_FILE_UNKNOWN;
+}
+
+/** Documented at declaration */
+int
+jpeg_image_load_from_file(const char* filename, uint8_t** image, int* image_size)
+{
+    if ( *image_size < 0 ) {
+        fprintf(stderr, "Failed to load image data [%d bytes] from file %s!\n", *image_size, filename);
+        return -1;
+    }
+
+    FILE* file = fopen(filename, "rb");
+    if ( !file ) {
+        fprintf(stderr, "Failed open %s for reading!\n", filename);
+        return -1;
+    }
+
+    // Size 0 means "load the whole file": measure it by seeking to the end
+    if ( *image_size == 0 ) {
+        long file_size = -1;
+        if ( fseek(file, 0, SEEK_END) == 0 )
+            file_size = ftell(file);
+        // Reject a failed measurement and sizes that do not fit into int
+        if ( file_size < 0 || (long)(int)file_size != file_size ) {
+            fprintf(stderr, "Failed to determine size of file %s!\n", filename);
+            fclose(file);
+            return -1;
+        }
+        *image_size = (int)file_size;
+        rewind(file);
+    }
+
+    // Allocate the buffer (released by jpeg_image_destroy) and read the data;
+    // an empty file is loaded successfully without reading anything
+    size_t size = (size_t)*image_size;
+    uint8_t* data = (uint8_t*)malloc(size * sizeof(uint8_t));
+    if ( data == NULL && size > 0 ) {
+        fprintf(stderr, "Failed to allocate image data [%d bytes] for file %s!\n", *image_size, filename);
+        fclose(file);
+        return -1;
+    }
+    size_t read_count = 0;
+    if ( size > 0 )
+        read_count = fread(data, sizeof(uint8_t), size, file);
+    fclose(file);
+
+    if ( read_count != size ) {
+        fprintf(stderr, "Failed to load image data [%d bytes] from file %s!\n", *image_size, filename);
+        free(data);
+        return -1;
+    }
+
+    *image = data;
+
+    return 0;
+}
+
+/** Documented at declaration */
+int
+jpeg_image_save_to_file(const char* filename, uint8_t* image, int image_size)
+{
+    // Validate the size before the target file gets created or truncated
+    if ( image_size < 0 ) {
+        fprintf(stderr, "Failed to write image data [%d bytes] to file %s!\n", image_size, filename);
+        return -1;
+    }
+
+    FILE* file = fopen(filename, "wb");
+    if ( !file ) {
+        fprintf(stderr, "Failed open %s for writing!\n", filename);
+        return -1;
+    }
+
+    size_t written = fwrite(image, sizeof(uint8_t), (size_t)image_size, file);
+    // Always close the file; buffered data is flushed here, so a failing
+    // fclose means the data did not reach the file either
+    int close_result = fclose(file);
+
+    if ( written != (size_t)image_size || close_result != 0 ) {
+        fprintf(stderr, "Failed to write image data [%d bytes] to file %s!\n", image_size, filename);
+        return -1;
+    }
+
+    return 0;
+}
+
+/** Documented at declaration */
+int
+jpeg_image_destroy(uint8_t* image)
+{
+    // free() accepts NULL, so no check is needed
+    free(image);
+    // The function is declared to return int; report success explicitly
+    return 0;
+}
--- a/jpeg_compress/jpeg_common.h
+++ b/jpeg_compress/jpeg_common.h
@@ -0,0 +1,121 @@
+/**
+ * Copyright (c) 2011, Martin Srom
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+ 
+#ifndef JPEG_COMMON_H
+#define JPEG_COMMON_H
+
+#include <stdint.h>
+
+#include "jpeg_type.h"
+
+/**
+ * Image file formats
+ *
+ * The RGB and YUV values are IMAGE_FILE_RAW (1) combined with an extra bit
+ * (4 resp. 8), so both of them have the IMAGE_FILE_RAW bit set.
+ */
+enum jpeg_image_file_format {
+    // Unknown image file format
+    IMAGE_FILE_UNKNOWN = 0,
+    // Raw file format
+    IMAGE_FILE_RAW = 1,
+    // JPEG file format
+    IMAGE_FILE_JPEG = 2,
+    // RGB file format, simple data format without header [R G B] [R G B] ...
+    IMAGE_FILE_RGB = 1 | 4,
+    // YUV file format, simple data format without header [Y U V] [Y U V] ...
+    IMAGE_FILE_YUV = 1 | 8
+};
+
+/**
+ * Image parameters
+ *
+ * Describes the image data: its dimensions, number of components, color
+ * space and sampling factor.
+ */
+struct jpeg_image_parameters {
+    // Image data width
+    int width;
+    // Image data height
+    int height;
+    // Image data component count
+    int comp_count;
+    // Image data color space
+    enum jpeg_color_space color_space;
+    // Image data sampling factor
+    enum jpeg_sampling_factor sampling_factor;
+};
+
+/**
+ * Set default parameters for JPEG image
+ *
+ * Every field of the structure is overwritten.
+ *
+ * @param param  Parameters for image
+ * @return void
+ */
+void
+jpeg_image_set_default_parameters(struct jpeg_image_parameters* param);
+
+/**
+ * Init CUDA device
+ *
+ * Checks that the requested device exists and selects it as the current
+ * CUDA device.
+ *
+ * @param device_id  CUDA device id (starting at 0)
+ * @param verbose  Flag if device info should be printed out (0 or 1)
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_init_device(int device_id, int verbose);
+
+/**
+ * Get image file format from filename
+ *
+ * The format is chosen by a case-insensitive comparison of the first three
+ * characters after the last '.' in the filename with "raw", "rgb", "yuv"
+ * and "jpg".
+ *
+ * @param filename  Filename of image file
+ * @return image_file_format or IMAGE_FILE_UNKNOWN if type cannot be determined
+ */
+enum jpeg_image_file_format
+jpeg_image_get_file_format(const char* filename);
+
+/**
+ * Load image data from file
+ *
+ * The file content is read as plain bytes into a newly allocated buffer,
+ * which should be released by jpeg_image_destroy.
+ *
+ * @param filename  Image filename
+ * @param image  Output, receives pointer to the allocated image data buffer
+ * @param image_size  Image data buffer size in bytes (can be specified for
+ *                    verification, or 0 to load the whole file and retrieve its size)
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_image_load_from_file(const char* filename, uint8_t** image, int* image_size);
+
+/**
+ * Save image data to file
+ *
+ * The buffer is written as plain bytes; the file is opened for writing in
+ * binary mode ("wb").
+ *
+ * @param filename  Image filename
+ * @param image  Image data buffer
+ * @param image_size  Image data buffer size in bytes
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_image_save_to_file(const char* filename, uint8_t* image, int image_size);
+
+/**
+ * Destroy image data buffer (as allocated by jpeg_image_load_from_file)
+ *
+ * @param image  Image data buffer
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_image_destroy(uint8_t* image);
+
+#endif // JPEG_COMMON_H
--- a/jpeg_compress/jpeg_decoder.c
+++ b/jpeg_compress/jpeg_decoder.c
@@ -0,0 +1,325 @@
+/**
+ * Copyright (c) 2011, Martin Srom
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+ 
+#include "jpeg_decoder.h"
+#include "jpeg_preprocessor.h"
+#include "jpeg_huffman_cpu_decoder.h"
+#include "jpeg_huffman_gpu_decoder.h"
+#include "jpeg_util.h"
+
+/** Documented at declaration */
+struct jpeg_decoder*
+jpeg_decoder_create(struct jpeg_image_parameters* param_image)
+{
+    // Zero-initialized allocation: every pointer that jpeg_decoder_destroy
+    // inspects has a defined value even when creation fails half-way or
+    // when jpeg_decoder_init is not called below
+    struct jpeg_decoder* decoder = calloc(1, sizeof(struct jpeg_decoder));
+    if ( decoder == NULL )
+        return NULL;
+
+    // Set parameters; size and component count start at zero, which marks
+    // the decoder as "not initialized yet" for jpeg_decoder_init
+    decoder->param_image = *param_image;
+    decoder->param_image.width = 0;
+    decoder->param_image.height = 0;
+    decoder->param_image.comp_count = 0;
+    decoder->restart_interval = 0;
+    decoder->reader = NULL;
+    decoder->data_scan = NULL;
+    decoder->d_data_scan = NULL;
+    decoder->data_scan_index = NULL;
+    decoder->d_data_scan_index = NULL;
+    decoder->data_quantized = NULL;
+    decoder->d_data_quantized = NULL;
+    decoder->d_data = NULL;
+    decoder->data_target = NULL;
+    decoder->d_data_target = NULL;
+
+    // 1 while everything succeeded, 0 after the first failure
+    int result = 1;
+
+    // Create reader
+    decoder->reader = jpeg_reader_create();
+    if ( decoder->reader == NULL )
+        result = 0;
+
+    // Allocate quantization tables in device memory
+    for ( int comp_type = 0; comp_type < JPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
+        if ( cudaSuccess != cudaMalloc((void**)&decoder->table_quantization[comp_type].d_table, 64 * sizeof(uint16_t)) )
+            result = 0;
+    }
+    // Allocate huffman tables in device memory
+    for ( int comp_type = 0; comp_type < JPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
+        for ( int huff_type = 0; huff_type < JPEG_HUFFMAN_TYPE_COUNT; huff_type++ ) {
+            if ( cudaSuccess != cudaMalloc((void**)&decoder->d_table_huffman[comp_type][huff_type], sizeof(struct jpeg_table_huffman_decoder)) )
+                result = 0;
+        }
+    }
+    cudaCheckError("Decoder table allocation");
+
+    // Init decoder right away when the image size is already known
+    if ( param_image->width != 0 && param_image->height != 0 ) {
+        if ( jpeg_decoder_init(decoder, param_image->width, param_image->height, param_image->comp_count) != 0 )
+            result = 0;
+    }
+
+    // Init huffman decoder
+    if ( jpeg_huffman_gpu_decoder_init() != 0 )
+        result = 0;
+
+    // Release everything that was allocated so far when any step failed
+    if ( result == 0 ) {
+        jpeg_decoder_destroy(decoder);
+        return NULL;
+    }
+
+    return decoder;
+}
+
+/** Documented at declaration */
+int
+jpeg_decoder_init(struct jpeg_decoder* decoder, int width, int height, int comp_count)
+{
+    assert(comp_count == 3);
+
+    // No reinitialization needed
+    if ( decoder->param_image.width == width && decoder->param_image.height == height && decoder->param_image.comp_count == comp_count ) {
+        return 0;
+    }
+
+    // For now we can't reinitialize decoder, we can only do first initialization
+    if ( decoder->param_image.width != 0 || decoder->param_image.height != 0 || decoder->param_image.comp_count != 0 ) {
+        fprintf(stderr, "Can't reinitialize decoder, implement if needed!\n");
+        return -1;
+    }
+
+    decoder->param_image.width = width;
+    decoder->param_image.height = height;
+    decoder->param_image.comp_count = comp_count;
+
+    // Allocate scan data (we need more data ie twice, restart_interval could be 1 so a lot of data)
+    // and indexes to data for each segment
+    int data_scan_size = decoder->param_image.comp_count * decoder->param_image.width * decoder->param_image.height * 2;
+    int max_segment_count = decoder->param_image.comp_count * ((decoder->param_image.width + JPEG_BLOCK_SIZE - 1) / JPEG_BLOCK_SIZE) * ((decoder->param_image.height + JPEG_BLOCK_SIZE - 1) / JPEG_BLOCK_SIZE);
+    if ( cudaSuccess != cudaMallocHost((void**)&decoder->data_scan, data_scan_size * sizeof(uint8_t)) )
+        return -1;
+    if ( cudaSuccess != cudaMalloc((void**)&decoder->d_data_scan, data_scan_size * sizeof(uint8_t)) )
+        return -1;
+    if ( cudaSuccess != cudaMallocHost((void**)&decoder->data_scan_index, max_segment_count * sizeof(int)) )
+        return -1;
+    if ( cudaSuccess != cudaMalloc((void**)&decoder->d_data_scan_index, max_segment_count * sizeof(int)) )
+        return -1;
+    cudaCheckError("Decoder scan allocation");
+
+    // Allocate buffers: one sample per pixel and component, i.e.
+    // width * height * comp_count samples (the same size jpeg_decoder_decode
+    // copies into and out of these buffers)
+    int data_size = decoder->param_image.width * decoder->param_image.height * decoder->param_image.comp_count;
+    if ( cudaSuccess != cudaMallocHost((void**)&decoder->data_quantized, data_size * sizeof(int16_t)) )
+        return -1;
+    if ( cudaSuccess != cudaMalloc((void**)&decoder->d_data_quantized, data_size * sizeof(int16_t)) )
+        return -1;
+    if ( cudaSuccess != cudaMalloc((void**)&decoder->d_data, data_size * sizeof(uint8_t)) )
+        return -1;
+    if ( cudaSuccess != cudaMallocHost((void**)&decoder->data_target, data_size * sizeof(uint8_t)) )
+        return -1;
+    if ( cudaSuccess != cudaMalloc((void**)&decoder->d_data_target, data_size * sizeof(uint8_t)) )
+        return -1;
+    cudaCheckError("Decoder data allocation");
+
+    return 0;
+}
+
+/**
+ * Debug helper: copy one width x height plane of 8-bit samples from device
+ * memory to the host and print it to stdout, one image row per line
+ */
+void
+jpeg_decoder_print8(struct jpeg_decoder* decoder, uint8_t* d_data)
+{
+    const int width = decoder->param_image.width;
+    const int height = decoder->param_image.height;
+    const int sample_count = width * height;
+
+    // Stage the device data in a host buffer allocated by cudaMallocHost
+    uint8_t* host_data = NULL;
+    cudaMallocHost((void**)&host_data, sample_count * sizeof(uint8_t));
+    cudaMemcpy(host_data, d_data, sample_count * sizeof(uint8_t), cudaMemcpyDeviceToHost);
+
+    printf("Print Data\n");
+    int offset = 0;
+    for ( int row = 0; row < height; row++ ) {
+        for ( int col = 0; col < width; col++, offset++ )
+            printf("%3u ", host_data[offset]);
+        printf("\n");
+    }
+
+    cudaFreeHost(host_data);
+}
+
+/**
+ * Debug helper: copy one width x height plane of 16-bit values from device
+ * memory to the host and print it to stdout, one image row per line
+ */
+void
+jpeg_decoder_print16(struct jpeg_decoder* decoder, int16_t* d_data)
+{
+    const int width = decoder->param_image.width;
+    const int height = decoder->param_image.height;
+    const int value_count = width * height;
+
+    // Stage the device data in a host buffer allocated by cudaMallocHost
+    int16_t* host_data = NULL;
+    cudaMallocHost((void**)&host_data, value_count * sizeof(int16_t));
+    cudaMemcpy(host_data, d_data, value_count * sizeof(int16_t), cudaMemcpyDeviceToHost);
+
+    printf("Print Data\n");
+    int offset = 0;
+    for ( int row = 0; row < height; row++ ) {
+        for ( int col = 0; col < width; col++, offset++ )
+            printf("%3d ", host_data[offset]);
+        printf("\n");
+    }
+
+    cudaFreeHost(host_data);
+}
+
+/**
+ * Documented at declaration
+ *
+ * On success *image_decompressed points to decoder->data_target, a buffer
+ * owned by the decoder (it is released by jpeg_decoder_destroy).
+ */
+int
+jpeg_decoder_decode(struct jpeg_decoder* decoder, uint8_t* image, int image_size, uint8_t** image_decompressed, int* image_decompressed_size)
+{
+    //TIMER_INIT();
+    //TIMER_START();
+
+    // Read JPEG image data
+    if ( jpeg_reader_read_image(decoder, image, image_size) != 0 ) {
+        fprintf(stderr, "Decoder failed when decoding image data!\n");
+        return -1;
+    }
+
+    // Sample count of the whole image and of a single component plane,
+    // computed after reading the codestream
+    // NOTE(review): presumably the reader fills decoder->param_image - confirm
+    int comp_size = decoder->param_image.width * decoder->param_image.height;
+    int data_size = comp_size * decoder->param_image.comp_count;
+
+    //TIMER_STOP_PRINT("-Stream Reader:     ");
+    //TIMER_START();
+
+    // Perform huffman decoding on CPU (when restart interval is not set)
+    if ( decoder->restart_interval == 0 ) {
+        // Perform huffman decoding for all components
+        for ( int index = 0; index < decoder->scan_count; index++ ) {
+            // Get scan and data buffer
+            struct jpeg_decoder_scan* scan = &decoder->scan[index];
+            int16_t* data_quantized_comp = &decoder->data_quantized[index * comp_size];
+            // Determine table type
+            enum jpeg_component_type type = (index == 0) ? JPEG_COMPONENT_LUMINANCE : JPEG_COMPONENT_CHROMINANCE;
+            // Huffman decode
+            if ( jpeg_huffman_cpu_decoder_decode(decoder, type, scan, data_quantized_comp) != 0 ) {
+                fprintf(stderr, "Huffman decoder failed for scan at index %d!\n", index);
+                return -1;
+            }
+        }
+
+        // Copy quantized data to device memory from cpu memory
+        cudaMemcpy(decoder->d_data_quantized, decoder->data_quantized, data_size * sizeof(int16_t), cudaMemcpyHostToDevice);
+    }
+    // Perform huffman decoding on GPU (when restart interval is set)
+    else {
+        // Copy scan data to device memory
+        cudaMemcpy(decoder->d_data_scan, decoder->data_scan, decoder->data_scan_size * sizeof(uint8_t), cudaMemcpyHostToDevice);
+        cudaCheckError("Decoder copy scan data");
+        // Copy segment indexes of scan data to device memory
+        cudaMemcpy(decoder->d_data_scan_index, decoder->data_scan_index, decoder->segment_count * sizeof(int), cudaMemcpyHostToDevice);
+        cudaCheckError("Decoder copy scan data index");
+
+        // Zero output memory (once is enough; nothing writes to it before
+        // the huffman decoder below)
+        cudaMemset(decoder->d_data_quantized, 0, data_size * sizeof(int16_t));
+
+        // Perform huffman decoding
+        if ( jpeg_huffman_gpu_decoder_decode(decoder) != 0 ) {
+            fprintf(stderr, "Huffman decoder on GPU failed!\n");
+            return -1;
+        }
+    }
+
+    //TIMER_STOP_PRINT("-Huffman Decoder:   ");
+    //TIMER_START();
+
+    // Perform IDCT and dequantization
+    for ( int comp = 0; comp < decoder->param_image.comp_count; comp++ ) {
+        uint8_t* d_data_comp = &decoder->d_data[comp * comp_size];
+        int16_t* d_data_quantized_comp = &decoder->d_data_quantized[comp * comp_size];
+
+        // Determine table type
+        enum jpeg_component_type type = (comp == 0) ? JPEG_COMPONENT_LUMINANCE : JPEG_COMPONENT_CHROMINANCE;
+
+        //jpeg_decoder_print16(decoder, d_data_quantized_comp);
+
+        cudaMemset(d_data_comp, 0, comp_size * sizeof(uint8_t));
+
+        // Perform inverse DCT; the ROI and source step are given in the
+        // units the original call used (width * block size, height / block size)
+        NppiSize inv_roi;
+        inv_roi.width = decoder->param_image.width * JPEG_BLOCK_SIZE;
+        inv_roi.height = decoder->param_image.height / JPEG_BLOCK_SIZE;
+        assert(JPEG_BLOCK_SIZE == 8);
+        NppStatus status = nppiDCTQuantInv8x8LS_JPEG_16s8u_C1R(
+            d_data_quantized_comp,
+            decoder->param_image.width * JPEG_BLOCK_SIZE * sizeof(int16_t),
+            d_data_comp,
+            decoder->param_image.width * sizeof(uint8_t),
+            decoder->table_quantization[type].d_table,
+            inv_roi
+        );
+        if ( status != 0 ) {
+            fprintf(stderr, "Inverse DCT for component %d returned NPP status %d!\n", comp, (int)status);
+            // NPP reports errors as negative status codes and warnings as
+            // positive ones; only a real error invalidates the output
+            if ( status < 0 )
+                return -1;
+        }
+        //jpeg_decoder_print8(decoder, d_data_comp);
+    }
+
+    //TIMER_STOP_PRINT("-DCT & Quantization:");
+    //TIMER_START();
+
+    // Postprocessing
+    if ( jpeg_preprocessor_decode(decoder) != 0 )
+        return -1;
+
+    //TIMER_STOP_PRINT("-Postprocessing:    ");
+
+    // Copy the decoded image from device memory to the host buffer
+    cudaMemcpy(decoder->data_target, decoder->d_data_target, data_size * sizeof(uint8_t), cudaMemcpyDeviceToHost);
+
+    // Set decompressed image
+    *image_decompressed = decoder->data_target;
+    *image_decompressed_size = data_size * sizeof(uint8_t);
+
+    return 0;
+}
+
+/** Documented at declaration */
+int
+jpeg_decoder_destroy(struct jpeg_decoder* decoder)
+{
+    assert(decoder != NULL);
+
+    // Release quantization and huffman tables in device memory
+    // (both are allocated by jpeg_decoder_create)
+    for ( int comp_type = 0; comp_type < JPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
+        if ( decoder->table_quantization[comp_type].d_table != NULL )
+            cudaFree(decoder->table_quantization[comp_type].d_table);
+        for ( int huff_type = 0; huff_type < JPEG_HUFFMAN_TYPE_COUNT; huff_type++ ) {
+            if ( decoder->d_table_huffman[comp_type][huff_type] != NULL )
+                cudaFree(decoder->d_table_huffman[comp_type][huff_type]);
+        }
+    }
+
+    if ( decoder->reader != NULL )
+        jpeg_reader_destroy(decoder->reader);
+
+    // Each buffer is checked through its own pointer; buffers allocated by
+    // cudaMallocHost are released by cudaFreeHost, those allocated by
+    // cudaMalloc by cudaFree
+    if ( decoder->data_scan != NULL )
+        cudaFreeHost(decoder->data_scan);
+    if ( decoder->d_data_scan != NULL )
+        cudaFree(decoder->d_data_scan);
+    if ( decoder->data_scan_index != NULL )
+        cudaFreeHost(decoder->data_scan_index);
+    if ( decoder->d_data_scan_index != NULL )
+        cudaFree(decoder->d_data_scan_index);
+    if ( decoder->data_quantized != NULL )
+        cudaFreeHost(decoder->data_quantized);
+    if ( decoder->d_data_quantized != NULL )
+        cudaFree(decoder->d_data_quantized);
+    if ( decoder->d_data != NULL )
+        cudaFree(decoder->d_data);
+    if ( decoder->data_target != NULL )
+        cudaFreeHost(decoder->data_target);
+    if ( decoder->d_data_target != NULL )
+        cudaFree(decoder->d_data_target);
+
+    free(decoder);
+
+    return 0;
+}
--- a/jpeg_compress/jpeg_decoder.h
+++ b/jpeg_compress/jpeg_decoder.h
@@ -0,0 +1,149 @@
+/**
+ * Copyright (c) 2011, Martin Srom
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef JPEG_DECODER
+#define JPEG_DECODER
+
+#include "jpeg_common.h"
+#include "jpeg_table.h"
+#include "jpeg_reader.h"
+
+#define JPEG_MAX_COMPONENT_COUNT 3
+
+/** JPEG decoder scan structure (locates the segments of one scan) */
+struct jpeg_decoder_scan
+{
+    // Index into array of segment indexes [decoder->data_scan_index] for the first byte of scan
+    int segment_index;
+    // Number of segments that belong to this scan
+    int segment_count;
+};
+
+/**
+ * JPEG decoder structure
+ *
+ * Holds image parameters, coding tables and working buffers of one decoder
+ * instance. Members prefixed with d_ are released with cudaFree by
+ * jpeg_decoder_destroy, the other buffers with cudaFreeHost.
+ */
+struct jpeg_decoder
+{  
+    // Parameters for image data (width, height, comp_count, etc.)
+    struct jpeg_image_parameters param_image;
+    
+    // Quantization tables
+    struct jpeg_table_quantization table_quantization[JPEG_COMPONENT_TYPE_COUNT];
+    
+    // Huffman coder tables
+    struct jpeg_table_huffman_decoder table_huffman[JPEG_COMPONENT_TYPE_COUNT][JPEG_HUFFMAN_TYPE_COUNT];
+    // Huffman coder tables in device memory
+    struct jpeg_table_huffman_decoder* d_table_huffman[JPEG_COMPONENT_TYPE_COUNT][JPEG_HUFFMAN_TYPE_COUNT];
+    
+    // JPEG reader structure
+    struct jpeg_reader* reader;
+    
+    // Scan definitions
+    struct jpeg_decoder_scan scan[JPEG_MAX_COMPONENT_COUNT];
+    
+    // Number of used scans in current decoding image
+    int scan_count;
+    
+    // Restart interval for all scans (number of MCUs that can be coded independently,
+    // 0 means sequential coding, 1 means every MCU can be coded independently)
+    int restart_interval;
+    
+    // Data buffer for all scans
+    uint8_t* data_scan;
+    // Data buffer for all scans in device memory
+    uint8_t* d_data_scan;
+    
+    // Size of data buffer for all scans
+    int data_scan_size;
+    
+    // Indexes into scan data buffer for all segments (each index points to the start of segment data in the buffer)
+    int* data_scan_index;
+    // Indexes into scan data buffer for all segments in device memory (each index points to the start of segment data in the buffer)
+    int* d_data_scan_index;
+    
+    // Total segment count for all scans
+    int segment_count;
+    
+    // Data quantized (output from huffman coder)
+    int16_t* data_quantized;
+    // Data quantized in device memory (output from huffman coder)
+    int16_t* d_data_quantized;
+    
+    // Data in device memory (output from inverse DCT and quantization)
+    uint8_t* d_data;
+    
+    // Data target (copied from d_data_target at the end of jpeg_decoder_decode
+    // and handed to the caller as the decompressed image)
+    uint8_t* data_target;
+    // Data target in device memory (output from preprocessing)
+    uint8_t* d_data_target;
+};
+
+/**
+ * Create JPEG decoder
+ * 
+ * @param param_image  Parameters for image data (width, height, comp_count, etc.)
+ * @return decoder structure if succeeds, otherwise NULL
+ */
+struct jpeg_decoder*
+jpeg_decoder_create(struct jpeg_image_parameters* param_image);
+
+/**
+ * Init JPEG decoder for specific image size
+ * 
+ * @param decoder  Decoder structure
+ * @param width  Width of decodable images
+ * @param height  Height of decodable images
+ * @param comp_count  Component count of decodable images
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_decoder_init(struct jpeg_decoder* decoder, int width, int height, int comp_count);
+
+/**
+ * Decompress image by decoder
+ * 
+ * The returned buffer is the decoder's own data_target buffer, which is
+ * released by jpeg_decoder_destroy, so the caller must not free it.
+ * 
+ * @param decoder  Decoder structure
+ * @param image  Source image data
+ * @param image_size  Source image data size
+ * @param image_decompressed  Pointer to variable where decompressed image data buffer will be placed
+ * @param image_decompressed_size  Pointer to variable where decompressed image size (in bytes) will be placed
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_decoder_decode(struct jpeg_decoder* decoder, uint8_t* image, int image_size, uint8_t** image_decompressed, int* image_decompressed_size);
+
+/**
+ * Destroy JPEG decoder (releases its buffers and the structure itself)
+ * 
+ * @param decoder  Decoder structure (must not be NULL)
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_decoder_destroy(struct jpeg_decoder* decoder);
+
+#endif // JPEG_DECODER
--- a/jpeg_compress/jpeg_encoder.c
+++ b/jpeg_compress/jpeg_encoder.c
@@ -0,0 +1,357 @@
+/**
+ * Copyright (c) 2011, Martin Srom
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+ 
+#include "jpeg_encoder.h"
+#include "jpeg_preprocessor.h"
+#include "jpeg_huffman_cpu_encoder.h"
+#include "jpeg_huffman_gpu_encoder.h"
+#include "jpeg_format_type.h"
+#include "jpeg_util.h"
+
+#define JPEG_ENCODER_MAX_BLOCK_COMPRESSED_SIZE (JPEG_BLOCK_SIZE * JPEG_BLOCK_SIZE * 2)
+
+/** Documented at declaration */
+void
+jpeg_encoder_set_default_parameters(struct jpeg_encoder_parameters* param)
+{
+    // Default values applied to every field of the parameter structure
+    const int default_quality = 75;
+    const int default_restart_interval = 8;
+    
+    param->restart_interval = default_restart_interval;
+    param->quality = default_quality;
+}
+
+/** Documented at declaration */
+struct jpeg_encoder*
+jpeg_encoder_create(struct jpeg_image_parameters* param_image, struct jpeg_encoder_parameters* param)
+{
+    assert(param_image->comp_count == 3);
+    
+    struct jpeg_encoder* encoder = malloc(sizeof(struct jpeg_encoder));
+    if ( encoder == NULL )
+        return NULL;
+        
+    // Zero the structure first so that jpeg_encoder_destroy can tell which
+    // members were really allocated when creation fails part-way
+    memset(encoder, 0, sizeof(struct jpeg_encoder));
+    encoder->param_image = *param_image;
+    encoder->param = *param;
+    
+    // Stays 1 while every step succeeds; any failure clears it and the
+    // partially built encoder is destroyed before returning NULL
+    int result = 1;
+    
+    // Create writer
+    encoder->writer = jpeg_writer_create(encoder);
+    if ( encoder->writer == NULL )
+        result = 0;
+    
+    // Allocate data buffers. The size must be width * height * comp_count,
+    // which is the amount jpeg_encoder_encode copies into d_data_source.
+    int data_size = encoder->param_image.width * encoder->param_image.height * encoder->param_image.comp_count;
+    if ( cudaSuccess != cudaMalloc((void**)&encoder->d_data_source, data_size * sizeof(uint8_t)) ) 
+        result = 0;
+    if ( cudaSuccess != cudaMalloc((void**)&encoder->d_data, data_size * sizeof(uint8_t)) ) 
+        result = 0;
+    if ( cudaSuccess != cudaMallocHost((void**)&encoder->data_quantized, data_size * sizeof(int16_t)) ) 
+        result = 0;
+    if ( cudaSuccess != cudaMalloc((void**)&encoder->d_data_quantized, data_size * sizeof(int16_t)) ) 
+        result = 0;
+    cudaCheckError("Encoder data allocation");
+
+    // Segments are needed only when a restart interval is set (GPU huffman coder)
+    if ( encoder->param.restart_interval != 0 ) {
+        // Calculate segments count
+        int block_count = ((encoder->param_image.width + JPEG_BLOCK_SIZE - 1) / JPEG_BLOCK_SIZE) * ((encoder->param_image.height + JPEG_BLOCK_SIZE - 1) / JPEG_BLOCK_SIZE);
+        encoder->segment_count_per_comp = (block_count / encoder->param.restart_interval + 1);
+        encoder->segment_count = encoder->param_image.comp_count * encoder->segment_count_per_comp;
+        
+        // Allocate segments
+        if ( cudaSuccess != cudaMallocHost((void**)&encoder->segments, encoder->segment_count * sizeof(struct jpeg_encoder_segment)) )
+            result = 0;
+        if ( encoder->segments == NULL )
+            result = 0;
+        // Allocate segments in device memory
+        if ( cudaSuccess != cudaMalloc((void**)&encoder->d_segments, encoder->segment_count * sizeof(struct jpeg_encoder_segment)) )
+            result = 0;
+        if ( result == 1 ) {
+            // Prepare segments for encoding: every segment gets a fixed-size
+            // slot in the compressed data buffer
+            for ( int index = 0; index < encoder->segment_count; index++ ) {
+                encoder->segments[index].data_compressed_index = index * encoder->param.restart_interval * JPEG_ENCODER_MAX_BLOCK_COMPRESSED_SIZE;
+                encoder->segments[index].data_compressed_size = 0;
+            }
+            // Copy segments to device memory
+            if ( cudaSuccess != cudaMemcpy(encoder->d_segments, encoder->segments, encoder->segment_count * sizeof(struct jpeg_encoder_segment), cudaMemcpyHostToDevice) )
+                result = 0;
+        } 
+        
+        // Allocate compressed data (one slot per segment, in host and device memory)
+        int data_compressed_size = encoder->segment_count * encoder->param.restart_interval * JPEG_ENCODER_MAX_BLOCK_COMPRESSED_SIZE;
+        if ( cudaSuccess != cudaMallocHost((void**)&encoder->data_compressed, data_compressed_size * sizeof(uint8_t)) ) 
+            result = 0;   
+        if ( cudaSuccess != cudaMalloc((void**)&encoder->d_data_compressed, data_compressed_size * sizeof(uint8_t)) ) 
+            result = 0;   
+    }
+    cudaCheckError("Encoder segment allocation");
+     
+    // Allocate quantization tables in device memory
+    for ( int comp_type = 0; comp_type < JPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
+        if ( cudaSuccess != cudaMalloc((void**)&encoder->table_quantization[comp_type].d_table, 64 * sizeof(uint16_t)) ) 
+            result = 0;
+    }
+    // Allocate huffman tables in device memory
+    for ( int comp_type = 0; comp_type < JPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
+        for ( int huff_type = 0; huff_type < JPEG_HUFFMAN_TYPE_COUNT; huff_type++ ) {
+            if ( cudaSuccess != cudaMalloc((void**)&encoder->d_table_huffman[comp_type][huff_type], sizeof(struct jpeg_table_huffman_encoder)) )
+                result = 0;
+        }
+    }
+    cudaCheckError("Encoder table allocation");
+    
+    // Init quantization tables for encoder
+    for ( int comp_type = 0; comp_type < JPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
+        if ( jpeg_table_quantization_encoder_init(&encoder->table_quantization[comp_type], comp_type, encoder->param.quality) != 0 )
+            result = 0;
+    }
+    // Init huffman tables for encoder
+    for ( int comp_type = 0; comp_type < JPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
+        for ( int huff_type = 0; huff_type < JPEG_HUFFMAN_TYPE_COUNT; huff_type++ ) {
+            if ( jpeg_table_huffman_encoder_init(&encoder->table_huffman[comp_type][huff_type], encoder->d_table_huffman[comp_type][huff_type], comp_type, huff_type) != 0 )
+                result = 0;
+        }
+    }
+    cudaCheckError("Encoder table init");
+    
+    // Init huffman encoder
+    if ( jpeg_huffman_gpu_encoder_init() != 0 )
+        result = 0;
+    
+    // Release everything that was allocated when any step failed
+    if ( result == 0 ) {
+        jpeg_encoder_destroy(encoder);
+        return NULL;
+    }
+    
+    return encoder;
+}
+
+/**
+ * Debug helper: print one component plane of 8-bit samples stored in device memory
+ * 
+ * Copies width * height samples from d_data into a temporary host buffer and
+ * prints them row by row to standard output.
+ * 
+ * @param encoder  Encoder structure (supplies image width and height)
+ * @param d_data  Data in device memory
+ * @return void
+ */
+void
+jpeg_encoder_print8(struct jpeg_encoder* encoder, uint8_t* d_data)
+{
+    int data_size = encoder->param_image.width * encoder->param_image.height;
+    uint8_t* data = NULL;
+    // Without a host buffer there is nothing to print (and nothing to free)
+    if ( cudaSuccess != cudaMallocHost((void**)&data, data_size * sizeof(uint8_t)) || data == NULL ) {
+        fprintf(stderr, "Failed to allocate host buffer for printing data!\n");
+        return;
+    }
+    // Do not print the buffer when the copy from device memory failed
+    if ( cudaSuccess != cudaMemcpy(data, d_data, data_size * sizeof(uint8_t), cudaMemcpyDeviceToHost) ) {
+        fprintf(stderr, "Failed to copy data from device memory for printing!\n");
+        cudaFreeHost(data);
+        return;
+    }
+    
+    printf("Print Data\n");
+    for ( int y = 0; y < encoder->param_image.height; y++ ) {
+        for ( int x = 0; x < encoder->param_image.width; x++ ) {
+            printf("%3u ", data[y * encoder->param_image.width + x]);
+        }
+        printf("\n");
+    }
+    cudaFreeHost(data);
+}
+
+/**
+ * Debug helper: print one component plane of 16-bit values stored in device memory
+ * 
+ * Copies width * height values from d_data into a temporary host buffer and
+ * prints them row by row to standard output.
+ * 
+ * @param encoder  Encoder structure (supplies image width and height)
+ * @param d_data  Data in device memory
+ * @return void
+ */
+void
+jpeg_encoder_print16(struct jpeg_encoder* encoder, int16_t* d_data)
+{
+    int data_size = encoder->param_image.width * encoder->param_image.height;
+    int16_t* data = NULL;
+    // Without a host buffer there is nothing to print (and nothing to free)
+    if ( cudaSuccess != cudaMallocHost((void**)&data, data_size * sizeof(int16_t)) || data == NULL ) {
+        fprintf(stderr, "Failed to allocate host buffer for printing data!\n");
+        return;
+    }
+    // Do not print the buffer when the copy from device memory failed
+    if ( cudaSuccess != cudaMemcpy(data, d_data, data_size * sizeof(int16_t), cudaMemcpyDeviceToHost) ) {
+        fprintf(stderr, "Failed to copy data from device memory for printing!\n");
+        cudaFreeHost(data);
+        return;
+    }
+    
+    printf("Print Data\n");
+    for ( int y = 0; y < encoder->param_image.height; y++ ) {
+        for ( int x = 0; x < encoder->param_image.width; x++ ) {
+            printf("%3d ", data[y * encoder->param_image.width + x]);
+        }
+        printf("\n");
+    }
+    cudaFreeHost(data);
+}
+
+/** Documented at declaration */
+int
+jpeg_encoder_encode(struct jpeg_encoder* encoder, uint8_t* image, uint8_t** image_compressed, int* image_compressed_size)
+{
+    int data_size = encoder->param_image.width * encoder->param_image.height * encoder->param_image.comp_count;
+    
+    //TIMER_INIT();
+    //TIMER_START();
+    
+    // Copy image to device memory
+    if ( cudaSuccess != cudaMemcpy(encoder->d_data_source, image, data_size * sizeof(uint8_t), cudaMemcpyHostToDevice) )
+        return -1;
+    
+    //jpeg_table_print(encoder->table[JPEG_COMPONENT_LUMINANCE]);
+    //jpeg_table_print(encoder->table[JPEG_COMPONENT_CHROMINANCE]);
+    
+    // Preprocessing
+    if ( jpeg_preprocessor_encode(encoder) != 0 )
+        return -1;
+        
+    //TIMER_STOP_PRINT("-Preprocessing:     ");
+    //TIMER_START();
+        
+    // Perform DCT and quantization, one component plane at a time
+    for ( int comp = 0; comp < encoder->param_image.comp_count; comp++ ) {
+        uint8_t* d_data_comp = &encoder->d_data[comp * encoder->param_image.width * encoder->param_image.height];
+        int16_t* d_data_quantized_comp = &encoder->d_data_quantized[comp * encoder->param_image.width * encoder->param_image.height];
+        
+        // Determine table type (first component uses luminance table, the others chrominance)
+        enum jpeg_component_type type = (comp == 0) ? JPEG_COMPONENT_LUMINANCE : JPEG_COMPONENT_CHROMINANCE;
+        
+        //jpeg_encoder_print8(encoder, d_data_comp);
+        
+        //Perform forward DCT
+        NppiSize fwd_roi;
+        fwd_roi.width = encoder->param_image.width;
+        fwd_roi.height = encoder->param_image.height;
+        NppStatus status = nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R(
+            d_data_comp, 
+            encoder->param_image.width * sizeof(uint8_t), 
+            d_data_quantized_comp, 
+            encoder->param_image.width * JPEG_BLOCK_SIZE * sizeof(int16_t), 
+            encoder->table_quantization[type].d_table, 
+            fwd_roi
+        );
+        if ( status != 0 ) {
+            fprintf(stderr, "Forward DCT failed for component at index %d [error %d]!\n", comp, status);
+            return -1;
+        }
+        
+        //jpeg_encoder_print16(encoder, d_data_quantized_comp);
+    }
+    
+    // Initialize writer output buffer current position
+    encoder->writer->buffer_current = encoder->writer->buffer;
+    
+    // Write header
+    jpeg_writer_write_header(encoder);
+    
+    //TIMER_STOP_PRINT("-DCT & Quantization:");
+    //TIMER_START();
+    
+    // Perform huffman coding on CPU (when restart interval is not set)
+    if ( encoder->param.restart_interval == 0 ) {
+        // Copy quantized data from device memory to cpu memory; coding stale
+        // host data after a failed copy would produce a corrupt image
+        if ( cudaSuccess != cudaMemcpy(encoder->data_quantized, encoder->d_data_quantized, data_size * sizeof(int16_t), cudaMemcpyDeviceToHost) )
+            return -1;
+        
+        // Perform huffman coding for all components
+        for ( int comp = 0; comp < encoder->param_image.comp_count; comp++ ) {
+            // Get data buffer for component
+            int16_t* data_comp = &encoder->data_quantized[comp * encoder->param_image.width * encoder->param_image.height];
+            // Determine table type
+            enum jpeg_component_type type = (comp == 0) ? JPEG_COMPONENT_LUMINANCE : JPEG_COMPONENT_CHROMINANCE;
+            // Write scan header
+            jpeg_writer_write_scan_header(encoder, comp, type);
+            // Perform huffman coding
+            if ( jpeg_huffman_cpu_encoder_encode(encoder, type, data_comp) != 0 ) {
+                fprintf(stderr, "Huffman encoder on CPU failed for component at index %d!\n", comp);
+                return -1;
+            }
+        }
+    }
+    // Perform huffman coding on GPU (when restart interval is set)
+    else {
+        // Perform huffman coding
+        if ( jpeg_huffman_gpu_encoder_encode(encoder) != 0 ) {
+            fprintf(stderr, "Huffman encoder on GPU failed!\n");
+            return -1;
+        }
+        
+        // Copy compressed data from device memory to cpu memory
+        if ( cudaSuccess != cudaMemcpy(encoder->data_compressed, encoder->d_data_compressed, encoder->segment_count * encoder->param.restart_interval * JPEG_ENCODER_MAX_BLOCK_COMPRESSED_SIZE * sizeof(uint8_t), cudaMemcpyDeviceToHost) )
+            return -1;
+        // Copy segments from device memory to cpu memory (they carry the compressed size of every segment)
+        if ( cudaSuccess != cudaMemcpy(encoder->segments, encoder->d_segments, encoder->segment_count * sizeof(struct jpeg_encoder_segment), cudaMemcpyDeviceToHost) )
+            return -1;
+        
+        // Write huffman coder results
+        for ( int comp = 0; comp < encoder->param_image.comp_count; comp++ ) {
+            // Determine table type
+            enum jpeg_component_type type = (comp == 0) ? JPEG_COMPONENT_LUMINANCE : JPEG_COMPONENT_CHROMINANCE;
+            // Write scan header
+            jpeg_writer_write_scan_header(encoder, comp, type);
+            // Write scan data (segments of one component are stored consecutively)
+            for ( int index = 0; index < encoder->segment_count_per_comp; index++ ) {
+                int segment_index = (comp * encoder->segment_count_per_comp + index);
+                struct jpeg_encoder_segment* segment = &encoder->segments[segment_index];
+                
+                // Copy compressed data to writer
+                memcpy(
+                    encoder->writer->buffer_current, 
+                    &encoder->data_compressed[segment->data_compressed_index],
+                    segment->data_compressed_size
+                );
+                encoder->writer->buffer_current += segment->data_compressed_size;
+                //printf("Compressed data %d bytes\n", segment->data_compressed_size);
+            }
+        }
+    }
+    jpeg_writer_emit_marker(encoder->writer, JPEG_MARKER_EOI);
+    
+    //TIMER_STOP_PRINT("-Huffman Encoder:   ");
+    
+    // Set compressed image (the buffer belongs to the writer, only its used part is reported)
+    *image_compressed = encoder->writer->buffer;
+    *image_compressed_size = encoder->writer->buffer_current - encoder->writer->buffer;
+    
+    return 0;
+}
+
+/** Documented at declaration */
+int
+jpeg_encoder_destroy(struct jpeg_encoder* encoder)
+{
+    assert(encoder != NULL);
+    
+    // Quantization tables in device memory
+    for ( int comp_type = 0; comp_type < JPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
+        uint16_t* d_table = encoder->table_quantization[comp_type].d_table;
+        if ( d_table != NULL ) {
+            cudaFree(d_table);
+        }
+    }
+    // Huffman tables in device memory, visited in [comp_type][huff_type] order
+    for ( int table = 0; table < JPEG_COMPONENT_TYPE_COUNT * JPEG_HUFFMAN_TYPE_COUNT; table++ ) {
+        int comp_type = table / JPEG_HUFFMAN_TYPE_COUNT;
+        int huff_type = table % JPEG_HUFFMAN_TYPE_COUNT;
+        if ( encoder->d_table_huffman[comp_type][huff_type] != NULL ) {
+            cudaFree(encoder->d_table_huffman[comp_type][huff_type]);
+        }
+    }
+    
+    // Writer
+    if ( encoder->writer != NULL ) {
+        jpeg_writer_destroy(encoder->writer);
+    }
+    
+    // Working buffers: d_ members go to cudaFree, the others to cudaFreeHost
+    if ( encoder->d_data_source != NULL ) {
+        cudaFree(encoder->d_data_source);
+    }
+    if ( encoder->d_data != NULL ) {
+        cudaFree(encoder->d_data);
+    }
+    if ( encoder->data_quantized != NULL ) {
+        cudaFreeHost(encoder->data_quantized);
+    }
+    if ( encoder->d_data_quantized != NULL ) {
+        cudaFree(encoder->d_data_quantized);
+    }
+    if ( encoder->data_compressed != NULL ) {
+        cudaFreeHost(encoder->data_compressed);
+    }
+    if ( encoder->d_data_compressed != NULL ) {
+        cudaFree(encoder->d_data_compressed);
+    }
+    if ( encoder->segments != NULL ) {
+        cudaFreeHost(encoder->segments);
+    }
+    if ( encoder->d_segments != NULL ) {
+        cudaFree(encoder->d_segments);
+    }
+    
+    // Finally the structure itself
+    free(encoder);
+    
+    return 0;
+}
--- a/jpeg_compress/jpeg_encoder.h
+++ b/jpeg_compress/jpeg_encoder.h
@@ -0,0 +1,142 @@
+/**
+ * Copyright (c) 2011, Martin Srom
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef JPEG_ENCODER
+#define JPEG_ENCODER
+
+#include "jpeg_common.h"
+#include "jpeg_table.h"
+#include "jpeg_writer.h"
+
+/** JPEG segment (describes where the compressed data of one segment is stored) */
+struct jpeg_encoder_segment {
+    // Index of the first byte of segment data in the compressed data buffer
+    int data_compressed_index;
+    // Size of compressed segment data in bytes (set to 0 when encoder is created)
+    int data_compressed_size;
+};
+
+/**
+ * JPEG encoder parameters
+ *
+ * Default values are filled in by jpeg_encoder_set_default_parameters.
+ */
+struct jpeg_encoder_parameters
+{
+    // Quality level (0-100)
+    int quality;
+    
+    // Restart interval (0 selects huffman coding on CPU,
+    // nonzero value selects huffman coding on GPU)
+    int restart_interval;
+};
+
+/**
+ * JPEG encoder structure
+ *
+ * Holds parameters, coding tables and working buffers of one encoder
+ * instance. Members prefixed with d_ are allocated in device memory.
+ */
+struct jpeg_encoder
+{  
+    // Parameters (quality, restart_interval, etc.)
+    struct jpeg_encoder_parameters param;
+    
+    // Parameters for image data (width, height, comp_count, etc.)
+    struct jpeg_image_parameters param_image;
+    
+    // Source image data in device memory (copied from the image passed to jpeg_encoder_encode)
+    uint8_t* d_data_source;
+    
+    // Preprocessed data in device memory (output from preprocessor)
+    uint8_t* d_data;
+    
+    // Data after DCT and quantization (output from DCT and quantization)
+    int16_t* data_quantized;
+    // Data after DCT and quantization in device memory (output from DCT and quantization)
+    int16_t* d_data_quantized;
+    
+    // Data after huffman coder (output from huffman coder),
+    // allocated only when restart interval is set
+    uint8_t* data_compressed;
+    // Data after huffman coder in device memory (output from huffman coder),
+    // allocated only when restart interval is set
+    uint8_t* d_data_compressed;
+    
+    // Segments for all components (allocated only when restart interval is set)
+    struct jpeg_encoder_segment* segments;
+    // Segments in device memory for all components
+    struct jpeg_encoder_segment* d_segments;
+    // Segment count per component
+    int segment_count_per_comp;
+    // Segment total count for all components
+    int segment_count;
+    
+    // Quantization tables
+    struct jpeg_table_quantization table_quantization[JPEG_COMPONENT_TYPE_COUNT];
+    
+    // Huffman coder tables
+    struct jpeg_table_huffman_encoder table_huffman[JPEG_COMPONENT_TYPE_COUNT][JPEG_HUFFMAN_TYPE_COUNT];
+    // Huffman coder tables in device memory
+    struct jpeg_table_huffman_encoder* d_table_huffman[JPEG_COMPONENT_TYPE_COUNT][JPEG_HUFFMAN_TYPE_COUNT];
+    
+    // JPEG writer structure
+    struct jpeg_writer* writer;
+};
+
+/**
+ * Set default parameters for JPEG encoder
+ * (quality 75 and restart interval 8)
+ * 
+ * @param param  Parameters for encoder
+ * @return void
+ */
+void
+jpeg_encoder_set_default_parameters(struct jpeg_encoder_parameters* param);
+
+/**
+ * Create JPEG encoder
+ * 
+ * Both parameter structures are copied into the encoder.
+ * 
+ * @param param_image  Parameters for image data (comp_count must be 3)
+ * @param param  Parameters for encoder
+ * @return encoder structure if succeeds, otherwise NULL
+ */
+struct jpeg_encoder*
+jpeg_encoder_create(struct jpeg_image_parameters* param_image, struct jpeg_encoder_parameters* param);
+
+/**
+ * Compress image by encoder
+ * 
+ * The returned buffer is the output buffer of the encoder's writer, so the
+ * caller must not free it.
+ * 
+ * @param encoder  Encoder structure
+ * @param image  Source image data
+ * @param image_compressed  Pointer to variable where compressed image data buffer will be placed
+ * @param image_compressed_size  Pointer to variable where compressed image size (in bytes) will be placed
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_encoder_encode(struct jpeg_encoder* encoder, uint8_t* image, uint8_t** image_compressed, int* image_compressed_size);
+
+/**
+ * Destroy JPEG encoder (releases its buffers, its writer and the structure itself)
+ * 
+ * @param encoder  Encoder structure (must not be NULL)
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_encoder_destroy(struct jpeg_encoder* encoder);
+
+#endif // JPEG_ENCODER
--- a/jpeg_compress/jpeg_format_type.h
+++ b/jpeg_compress/jpeg_format_type.h
@@ -0,0 +1,166 @@
+/**
+ * Copyright (c) 2011, Martin Srom
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+ 
+#ifndef JPEG_FORMAT_TYPE
+#define JPEG_FORMAT_TYPE
+
+#include <stdio.h>
+
+/**
+ * JPEG marker codes
+ *
+ * Abbreviations follow the marker table of the JPEG standard (ITU-T T.81).
+ */
+enum jpeg_marker_code {		
+    // Start of frame markers
+    JPEG_MARKER_SOF0  = 0xc0,
+    JPEG_MARKER_SOF1  = 0xc1,
+    JPEG_MARKER_SOF2  = 0xc2,
+    JPEG_MARKER_SOF3  = 0xc3,
+  
+    JPEG_MARKER_SOF5  = 0xc5,
+    JPEG_MARKER_SOF6  = 0xc6,
+    JPEG_MARKER_SOF7  = 0xc7,
+  
+    // Reserved for JPEG extensions
+    JPEG_MARKER_JPG   = 0xc8,
+    JPEG_MARKER_SOF9  = 0xc9,
+    JPEG_MARKER_SOF10 = 0xca,
+    JPEG_MARKER_SOF11 = 0xcb,
+  
+    JPEG_MARKER_SOF13 = 0xcd,
+    JPEG_MARKER_SOF14 = 0xce,
+    JPEG_MARKER_SOF15 = 0xcf,
+  
+    // Define huffman table(s)
+    JPEG_MARKER_DHT   = 0xc4,
+  
+    // Define arithmetic coding conditioning(s)
+    JPEG_MARKER_DAC   = 0xcc,
+  
+    // Restart markers
+    JPEG_MARKER_RST0  = 0xd0,
+    JPEG_MARKER_RST1  = 0xd1,
+    JPEG_MARKER_RST2  = 0xd2,
+    JPEG_MARKER_RST3  = 0xd3,
+    JPEG_MARKER_RST4  = 0xd4,
+    JPEG_MARKER_RST5  = 0xd5,
+    JPEG_MARKER_RST6  = 0xd6,
+    JPEG_MARKER_RST7  = 0xd7,
+  
+    // Start of image, end of image, start of scan and table/parameter definitions
+    JPEG_MARKER_SOI   = 0xd8,
+    JPEG_MARKER_EOI   = 0xd9,
+    JPEG_MARKER_SOS   = 0xda,
+    JPEG_MARKER_DQT   = 0xdb,
+    JPEG_MARKER_DNL   = 0xdc,
+    JPEG_MARKER_DRI   = 0xdd,
+    JPEG_MARKER_DHP   = 0xde,
+    JPEG_MARKER_EXP   = 0xdf,
+  
+    // Application segments
+    JPEG_MARKER_APP0  = 0xe0,
+    JPEG_MARKER_APP1  = 0xe1,
+    JPEG_MARKER_APP2  = 0xe2,
+    JPEG_MARKER_APP3  = 0xe3,
+    JPEG_MARKER_APP4  = 0xe4,
+    JPEG_MARKER_APP5  = 0xe5,
+    JPEG_MARKER_APP6  = 0xe6,
+    JPEG_MARKER_APP7  = 0xe7,
+    JPEG_MARKER_APP8  = 0xe8,
+    JPEG_MARKER_APP9  = 0xe9,
+    JPEG_MARKER_APP10 = 0xea,
+    JPEG_MARKER_APP11 = 0xeb,
+    JPEG_MARKER_APP12 = 0xec,
+    JPEG_MARKER_APP13 = 0xed,
+    JPEG_MARKER_APP14 = 0xee,
+    JPEG_MARKER_APP15 = 0xef,
+  
+    // Reserved for JPEG extensions (first and last of the range) and comment
+    JPEG_MARKER_JPG0  = 0xf0,
+    JPEG_MARKER_JPG13 = 0xfd,
+    JPEG_MARKER_COM   = 0xfe,
+  
+    // Temporary private use in arithmetic coding
+    JPEG_MARKER_TEM   = 0x01,
+  
+    // Value that does not fit into one byte, so it cannot collide with a real
+    // marker code (presumably reported when a marker cannot be read; verify
+    // against the reader)
+    JPEG_MARKER_ERROR = 0x100
+};
+
+/**
+ * Get printable name of JPEG marker code
+ * 
+ * For a code that has no name the text "Unknown (0x..)" is formatted into a
+ * static buffer, so that result is overwritten by the next such call.
+ * 
+ * @param code  Marker code
+ * @return marker name
+ */
+static const char* 
+jpeg_marker_name(enum jpeg_marker_code code)
+{
+    // Known marker codes paired with their printable names
+    static const struct {
+        enum jpeg_marker_code code;
+        const char* name;
+    } known_markers[] = {
+        { JPEG_MARKER_SOF0,  "SOF0"  },
+        { JPEG_MARKER_SOF1,  "SOF1"  },
+        { JPEG_MARKER_SOF2,  "SOF2"  },
+        { JPEG_MARKER_SOF3,  "SOF3"  },
+        { JPEG_MARKER_SOF5,  "SOF5"  },
+        { JPEG_MARKER_SOF6,  "SOF6"  },
+        { JPEG_MARKER_SOF7,  "SOF7"  },
+        { JPEG_MARKER_JPG,   "JPG"   },
+        { JPEG_MARKER_SOF9,  "SOF9"  },
+        { JPEG_MARKER_SOF10, "SOF10" },
+        { JPEG_MARKER_SOF11, "SOF11" },
+        { JPEG_MARKER_SOF13, "SOF13" },
+        { JPEG_MARKER_SOF14, "SOF14" },
+        { JPEG_MARKER_SOF15, "SOF15" },
+        { JPEG_MARKER_DHT,   "DHT"   },
+        { JPEG_MARKER_DAC,   "DAC"   },
+        { JPEG_MARKER_RST0,  "RST0"  },
+        { JPEG_MARKER_RST1,  "RST1"  },
+        { JPEG_MARKER_RST2,  "RST2"  },
+        { JPEG_MARKER_RST3,  "RST3"  },
+        { JPEG_MARKER_RST4,  "RST4"  },
+        { JPEG_MARKER_RST5,  "RST5"  },
+        { JPEG_MARKER_RST6,  "RST6"  },
+        { JPEG_MARKER_RST7,  "RST7"  },
+        { JPEG_MARKER_SOI,   "SOI"   },
+        { JPEG_MARKER_EOI,   "EOI"   },
+        { JPEG_MARKER_SOS,   "SOS"   },
+        { JPEG_MARKER_DQT,   "DQT"   },
+        { JPEG_MARKER_DNL,   "DNL"   },
+        { JPEG_MARKER_DRI,   "DRI"   },
+        { JPEG_MARKER_DHP,   "DHP"   },
+        { JPEG_MARKER_EXP,   "EXP"   },
+        { JPEG_MARKER_APP0,  "APP0"  },
+        { JPEG_MARKER_APP1,  "APP1"  },
+        { JPEG_MARKER_APP2,  "APP2"  },
+        { JPEG_MARKER_APP3,  "APP3"  },
+        { JPEG_MARKER_APP4,  "APP4"  },
+        { JPEG_MARKER_APP5,  "APP5"  },
+        { JPEG_MARKER_APP6,  "APP6"  },
+        { JPEG_MARKER_APP7,  "APP7"  },
+        { JPEG_MARKER_APP8,  "APP8"  },
+        { JPEG_MARKER_APP9,  "APP9"  },
+        { JPEG_MARKER_APP10, "APP10" },
+        { JPEG_MARKER_APP11, "APP11" },
+        { JPEG_MARKER_APP12, "APP12" },
+        { JPEG_MARKER_APP13, "APP13" },
+        { JPEG_MARKER_APP14, "APP14" },
+        { JPEG_MARKER_APP15, "APP15" },
+        { JPEG_MARKER_JPG0,  "JPG0"  },
+        { JPEG_MARKER_JPG13, "JPG13" },
+        { JPEG_MARKER_COM,   "COM"   },
+        { JPEG_MARKER_TEM,   "TEM"   },
+        { JPEG_MARKER_ERROR, "ERROR" }
+    };
+    // Buffer for the name of a code that is not listed above
+    static char buffer[255];
+    
+    const int known_marker_count = (int)(sizeof(known_markers) / sizeof(known_markers[0]));
+    for ( int index = 0; index < known_marker_count; index++ ) {
+        if ( known_markers[index].code == code )
+            return known_markers[index].name;
+    }
+    
+    sprintf(buffer, "Unknown (0x%X)", code);
+    return buffer;
+}
+
+#endif // JPEG_FORMAT_TYPE
--- a/jpeg_compress/jpeg_huffman_cpu_decoder.c
+++ b/jpeg_compress/jpeg_huffman_cpu_decoder.c
@@ -0,0 +1,358 @@
+/**
+ * Copyright (c) 2011, Martin Srom
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+ 
+#include "jpeg_huffman_cpu_decoder.h"
+#include "jpeg_util.h"
+
+#ifdef _DEBUG
+#define inline
+#endif
+
+/**
+ * Huffman decoder state.
+ *
+ * jpeg_huffman_cpu_decoder_decode creates one instance on its stack and
+ * passes it to the per-block helpers in this file.
+ */
+struct jpeg_huffman_cpu_decoder
+{
+    // Scan data for all scans
+    uint8_t* data_scan;
+    // Size (in bytes) of the scan data buffer for all scans
+    int data_scan_size;
+    // Offsets into data_scan at which the individual segments start
+    int* data_scan_index;
+    // Total segment count for all scans
+    int segment_count;
+    // Huffman table DC
+    struct jpeg_table_huffman_decoder* table_dc;
+    // Huffman table AC
+    struct jpeg_table_huffman_decoder* table_ac;
+    // DC value of the previously decoded block (each decoded DC difference is added to it)
+    int dc;
+    // Number of not yet consumed bits held in get_buff
+    int get_bits;
+    // Bit buffer; new bytes are shifted in at the low end
+    int get_buff;
+    // Read position inside the compressed data of the current segment
+    uint8_t* data;
+    // Number of bytes left in the current segment
+    int data_size;
+    // Restart interval in blocks (restart handling is active only when > 0)
+    int restart_interval;
+    // Blocks left until the coder is reset and the next segment is started
+    int restart_position;
+    // Current segment index
+    int segment_index;
+};
+
+/**
+ * Fill more bits into the current get buffer
+ *
+ * Bytes of the current segment are appended to the low end of
+ * coder->get_buff until at least 25 bits are buffered, the segment data is
+ * exhausted or a marker is reached. A stuffed FF/00 pair is stored as the
+ * single data byte 0xFF.
+ *
+ * @param coder  Decoder structure
+ * @return void
+ */
+void
+jpeg_huffman_cpu_decoder_decode_fill_bit_buffer(struct jpeg_huffman_cpu_decoder* coder)
+{
+    while ( coder->get_bits < 25 ) {
+        // Are there some data?
+        if ( coder->data_size <= 0 )
+            break;
+
+        // Attempt to read a byte
+        unsigned char uc = *coder->data++;
+        coder->data_size--;
+
+        // If it's 0xFF, check and discard stuffed zero byte
+        if ( uc == 0xFF ) {
+            // Skip the run of 0xFF bytes, but never read behind the end of
+            // the segment (a truncated stream may end right after 0xFF)
+            do {
+                if ( coder->data_size <= 0 )
+                    return;
+                uc = *coder->data++;
+                coder->data_size--;
+            } while ( uc == 0xFF );
+
+            if ( uc == 0 ) {
+                // Found FF/00, which represents an FF data byte
+                uc = 0xFF;
+            } else if ( coder->get_bits >= 0 ) {
+                // Found a marker; the bits already buffered have to suffice,
+                // so stop refilling
+                break;
+            }
+        }
+
+        // Shift as unsigned so that set high bits do not overflow a signed int
+        coder->get_buff = (int) (((unsigned int) coder->get_buff << 8) | uc);
+        coder->get_bits += 8;
+    }
+}
+
+/**
+ * Get bits
+ *
+ * Consumes nbits bits from the bit buffer, refilling it from the compressed
+ * data when it holds fewer bits than requested. If the data is exhausted,
+ * the missing bits are read as zeros.
+ *
+ * @param coder  Decoder structure
+ * @param nbits  Number of bits to get
+ * @return bits (in the low nbits bits of the result)
+ */
+inline int
+jpeg_huffman_cpu_decoder_get_bits(struct jpeg_huffman_cpu_decoder* coder, int nbits) 
+{
+    // We should read nbits bits to get next data
+    if ( coder->get_bits < nbits ) {
+        jpeg_huffman_cpu_decoder_decode_fill_bit_buffer(coder);
+
+        // Still not enough bits (truncated or corrupted data): pad with zero
+        // bits instead of letting get_bits become negative, which would make
+        // the shift below undefined
+        if ( coder->get_bits < nbits ) {
+            const int missing = nbits - coder->get_bits;
+            coder->get_buff = (int) ((unsigned int) coder->get_buff << missing);
+            coder->get_bits = nbits;
+        }
+    }
+    coder->get_bits -= nbits;
+    return (int)(coder->get_buff >> coder->get_bits) & ((1 << nbits) - 1);
+}
+
+
+/**
+ * Slow path of Huffman symbol decoding, used
+ * (1) for codes that are longer than 8 bits
+ * (2) when fewer than 8 bits are left in the bit buffer
+ *
+ * @param coder  Decoder structure
+ * @param table  Huffman table used for decoding
+ * @param min_bits  Number of bits the code is known to have at least
+ * @return decoded symbol, or 0 when no code of up to 16 bits matches
+ */
+int
+jpeg_huffman_cpu_decoder_decode_special_decode(struct jpeg_huffman_cpu_decoder* coder, struct jpeg_table_huffman_decoder* table, int min_bits)
+{
+    // The code has at least min_bits bits, so read them all at once
+    int length = min_bits;
+    int code = jpeg_huffman_cpu_decoder_get_bits(coder, length);
+
+    // Extend the code bit by bit until it fits into the range of codes of
+    // the current length (Figure F.16 in the JPEG spec)
+    for ( ; code > table->maxcode[length]; length++ ) {
+        code = (code << 1) | jpeg_huffman_cpu_decoder_get_bits(coder, 1);
+    }
+
+    if ( length <= 16 )
+        return table->huffval[table->valptr[length] + (int)(code - table->mincode[length])];
+
+    // Garbage input ran up to the sentinel length 17, zero is the safest result
+    return 0;
+}
+
+/**
+ * To find dc or ac value according to category and category offset
+ *
+ * Offsets in the lower half of a category, i.e. below 2^(category - 1),
+ * encode negative values; the remaining offsets are the value itself.
+ *
+ * @param category  Category (bit count) of the value, 0 to 15
+ * @param offset  Offset inside the category as read from the bit stream
+ * @return decoded value; offset itself when category is outside 1 to 15
+ */
+inline int
+jpeg_huffman_cpu_decoder_value_from_category(int category, int offset)
+{
+    // Category 0 carries no extra bits. Categories above 15 can only come
+    // from a corrupted Huffman table and must not be used as a shift count.
+    if ( category < 1 || category > 15 )
+        return offset;
+
+    // Smallest offset that stands for a positive value: 2^(category - 1)
+    const int half = 1 << (category - 1);
+    // Negative values start at -(2^category - 1). This is computed from
+    // positive operands only, because left shifting -1 is undefined in C.
+    const int start = 1 - (1 << category);
+
+    return (offset < half) ? (offset + start) : offset;
+}
+
+/**
+ * Decode the next Huffman symbol: the category number for dc, or
+ * (zero run length, ac category) for ac.
+ *
+ * Codes of up to 8 bits are resolved by a single lookup of the next 8
+ * buffered bits in the table's look_nbits / look_sym arrays. Longer codes,
+ * and the case when fewer than 8 bits can be buffered, are handled by
+ * jpeg_huffman_cpu_decoder_decode_special_decode.
+ *
+ * @param coder  Decoder structure
+ * @param table  Huffman table used for decoding
+ * @return decoded symbol
+ */
+inline int
+jpeg_huffman_cpu_decoder_get_category(struct jpeg_huffman_cpu_decoder* coder, struct jpeg_table_huffman_decoder* table)
+{
+    if ( coder->get_bits < 8 ) {
+        // Fewer than 8 bits buffered, try to get more data
+        jpeg_huffman_cpu_decoder_decode_fill_bit_buffer(coder);
+
+        // Data finished: decode bit by bit, starting with a single bit
+        if ( coder->get_bits < 8 )
+            return jpeg_huffman_cpu_decoder_decode_special_decode(coder, table, 1);
+    }
+
+    // Look at the next 8 bits without consuming them
+    const int peek = (coder->get_buff >> (coder->get_bits - 8)) & 0xFF;
+    const int length = table->look_nbits[peek];
+
+    // No entry in the lookup table, the code has at least 9 bits
+    if ( length == 0 )
+        return jpeg_huffman_cpu_decoder_decode_special_decode(coder, table, 9);
+
+    // Consume only the bits that belong to the code
+    coder->get_bits -= length;
+    return table->look_sym[peek];
+}
+
+/**
+ * Decode one 8x8 block
+ *
+ * When a restart interval is used and the current interval is finished,
+ * the coder state is reset and decoding continues with the next segment.
+ *
+ * @param coder  Decoder structure
+ * @param data  Output block of JPEG_BLOCK_SIZE * JPEG_BLOCK_SIZE coefficients
+ * @return 0 if succeeds, otherwise nonzero (no further segment is available
+ *         for a restart, or an AC run leads behind the end of the block)
+ */
+int
+jpeg_huffman_cpu_decoder_decode_block(struct jpeg_huffman_cpu_decoder* coder, int16_t* data)
+{    
+    // Restart coder
+    if ( coder->restart_interval > 0 && coder->restart_position == 0 ) {
+        // The restart needs a further segment; without this check
+        // data_scan_index would be read behind its last segment
+        if ( (coder->segment_index + 1) >= coder->segment_count )
+            return -1;
+
+        coder->get_buff = 0;
+        coder->get_bits = 0;
+        coder->dc = 0;
+        coder->restart_position = coder->restart_interval;
+        coder->segment_index++;
+        
+        // Set coder data for next segment
+        int data_index = coder->data_scan_index[coder->segment_index];
+        coder->data = &coder->data_scan[data_index];
+        if ( (coder->segment_index + 1) >= coder->segment_count )
+            coder->data_size = coder->data_scan_size - data_index;
+        else
+            coder->data_size = coder->data_scan_index[coder->segment_index + 1] - data_index;
+    }
+    
+    // Zero block output
+    memset(data, 0, sizeof(int16_t) * JPEG_BLOCK_SIZE * JPEG_BLOCK_SIZE);
+
+    // Section F.2.2.1: decode the DC coefficient difference
+    // get dc category number
+    int dc_value = jpeg_huffman_cpu_decoder_get_category(coder, coder->table_dc);
+    if ( dc_value ) {
+        // Get offset in this dc category
+        int offset = jpeg_huffman_cpu_decoder_get_bits(coder, dc_value);
+        // Get dc difference value
+        dc_value = jpeg_huffman_cpu_decoder_value_from_category(dc_value, offset);
+    }
+
+    // Convert DC difference to actual value and remember it for next block
+    dc_value += coder->dc;
+    coder->dc = dc_value;
+
+    // Output the DC coefficient (assumes jpeg_order_natural[0] = 0)
+    data[0] = dc_value;
+    
+    // Section F.2.2.2: decode the AC coefficients
+    // Since zeroes are skipped, output area must be cleared beforehand
+    for ( int k = 1; k < 64; k++ ) {
+        // symbol: (run, category)
+        int symbol = jpeg_huffman_cpu_decoder_get_category(coder, coder->table_ac);
+        // run: count of zero coefficients before this one, 0 <= run < 16
+        int run = symbol >> 4;
+        // category: category of this non-zero ac
+        int category = symbol & 15;
+        if ( category ) {
+            // k: position of this non-zero ac
+            k += run;
+            // Corrupted data may point behind the block, which would index
+            // jpeg_order_natural out of its bounds
+            if ( k > 63 )
+                return -1;
+            // Offset in this ac category
+            int offset = jpeg_huffman_cpu_decoder_get_bits(coder, category);
+
+            data[jpeg_order_natural[k]] = jpeg_huffman_cpu_decoder_value_from_category(category, offset);
+        } else {
+            // Category 0 with run 15 stands for 16 zero coefficients,
+            // with any other run all the remaining coefficients are zero
+            if ( run != 15 )
+                break;
+            k += 15;
+        }
+    }
+    
+    coder->restart_position--;
+    
+    return 0;
+}
+
+/** Documented at declaration */
+int
+jpeg_huffman_cpu_decoder_decode(struct jpeg_decoder* decoder, enum jpeg_component_type type, struct jpeg_decoder_scan* scan, int16_t* data_decompressed)
+{
+    // Image size in blocks, rounded up
+    const int blocks_x = (decoder->param_image.width + JPEG_BLOCK_SIZE - 1) / JPEG_BLOCK_SIZE;
+    const int blocks_y = (decoder->param_image.height + JPEG_BLOCK_SIZE - 1) / JPEG_BLOCK_SIZE;
+
+    // Set up the coder: fresh bit buffer and DC predictor, tables of the component
+    struct jpeg_huffman_cpu_decoder coder;
+    coder.get_buff = 0;
+    coder.get_bits = 0;
+    coder.dc = 0;
+    coder.table_dc = &decoder->table_huffman[type][JPEG_HUFFMAN_DC];
+    coder.table_ac = &decoder->table_huffman[type][JPEG_HUFFMAN_AC];
+    coder.restart_interval = decoder->restart_interval;
+    coder.restart_position = decoder->restart_interval;
+    coder.data_scan = decoder->data_scan;
+    coder.data_scan_size = decoder->data_scan_size;
+    coder.data_scan_index = decoder->data_scan_index;
+    coder.segment_count = decoder->segment_count;
+    coder.segment_index = scan->segment_index;
+
+    // The first segment of the scan ends where the next segment begins, or
+    // at the end of the scan data if it is the last segment
+    const int segment_begin = coder.data_scan_index[coder.segment_index];
+    const int segment_is_last = (coder.segment_index + 1) >= coder.segment_count;
+    const int segment_end = segment_is_last ? coder.data_scan_size : coder.data_scan_index[coder.segment_index + 1];
+    coder.data = &coder.data_scan[segment_begin];
+    coder.data_size = segment_end - segment_begin;
+
+    // Decode the blocks row by row, each block follows the previous one in the output
+    int16_t* block = data_decompressed;
+    for ( int y = 0; y < blocks_y; y++ ) {
+        for ( int x = 0; x < blocks_x; x++ ) {
+            if ( jpeg_huffman_cpu_decoder_decode_block(&coder, block) != 0 ) {
+                fprintf(stderr, "Huffman decoder failed at block [%d, %d]!\n", y, x);
+                return -1;
+            }
+            block += JPEG_BLOCK_SIZE * JPEG_BLOCK_SIZE;
+        }
+    }
+
+    return 0;
+}
--- a/jpeg_compress/jpeg_huffman_cpu_decoder.h
+++ b/jpeg_compress/jpeg_huffman_cpu_decoder.h
@@ -0,0 +1,40 @@
+/**
+ * Copyright (c) 2011, Martin Srom
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef JPEG_HUFFMAN_CPU_DECODER
+#define JPEG_HUFFMAN_CPU_DECODER
+
+#include "jpeg_decoder.h"
+
+/**
+ * Perform huffman decoding
+ *
+ * Decodes all blocks of one component; the block count is derived from the
+ * image width and height in decoder->param_image.
+ *
+ * @param decoder  Decoder structure (provides scan data, segment indexes,
+ *                 huffman tables, image size and restart interval)
+ * @param type  Component type, selects the DC and AC huffman tables
+ * @param scan  Scan structure, its segment_index is the first segment to decode
+ * @param data_decompressed  Output buffer for the decoded coefficients, blocks
+ *                           are stored one after another in row-major order
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_huffman_cpu_decoder_decode(struct jpeg_decoder* decoder, enum jpeg_component_type type, struct jpeg_decoder_scan* scan, int16_t* data_decompressed);
+
+#endif // JPEG_HUFFMAN_CPU_DECODER
--- a/jpeg_compress/jpeg_huffman_cpu_encoder.c
+++ b/jpeg_compress/jpeg_huffman_cpu_encoder.c
@@ -0,0 +1,285 @@
+/**
+ * Copyright (c) 2011, Martin Srom
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+ 
+#include "jpeg_huffman_cpu_encoder.h"
+#include "jpeg_format_type.h"
+#include "jpeg_util.h"
+
+#ifdef _DEBUG
+#define inline
+#endif
+
+/**
+ * Huffman encoder state.
+ *
+ * jpeg_huffman_cpu_encoder_encode creates one instance on its stack and
+ * passes it to the per-block helpers in this file.
+ */
+struct jpeg_huffman_cpu_encoder
+{
+    // Huffman table DC
+    struct jpeg_table_huffman_encoder* table_dc;
+    // Huffman table AC
+    struct jpeg_table_huffman_encoder* table_ac;
+    // Bits waiting to be written out (left-justified in the low 24 bits)
+    int put_value;
+    // Number of bits waiting in put_value
+    int put_bits;
+    // JPEG writer structure
+    struct jpeg_writer* writer;
+    // DC value of the previously encoded block (used to compute the DC difference)
+    int dc;
+    // Restart interval in blocks (0 means no restart markers are emitted)
+    int restart_interval;
+    // Number of blocks encoded so far
+    int block_count;
+};
+
+/**
+ * Append bits to the output. Pending bits are kept left-justified in the
+ * low 24 bits of coder->put_value. Every complete byte is passed to the
+ * writer, and a zero byte is stuffed after each emitted 0xFF byte.
+ * 
+ * @param coder  Huffman coder structure
+ * @param code  Huffman code
+ * @param size  Size in bits of the Huffman code
+ * @return 0 if succeeds, -1 if size is 0 (invalid Huffman table entry)
+ */
+inline int
+jpeg_huffman_cpu_encoder_emit_bits(struct jpeg_huffman_cpu_encoder* coder, unsigned int code, int size)
+{
+    // A zero size means the caller used an invalid Huffman table entry
+    if ( size == 0 )
+        return -1;
+
+    // Bit count including the new code
+    int pending_bits = coder->put_bits + size;
+    // Keep only the low size bits of the code, place them right behind the
+    // bits that are already pending and merge both together
+    int pending = ((int)code) & ((((int)1) << size) - 1);
+    pending <<= 24 - pending_bits;
+    pending |= coder->put_value;
+
+    // Write out all complete bytes
+    for ( ; pending_bits >= 8; pending_bits -= 8, pending <<= 8 ) {
+        unsigned char out = (unsigned char) ((pending >> 16) & 0xFF);
+        jpeg_writer_emit_byte(coder->writer, out);
+        // 0xFF must be followed by a stuffed zero byte
+        if ( out == 0xFF ) {
+            jpeg_writer_emit_byte(coder->writer, 0);
+        }
+    }
+
+    // Remember the bits that do not fill a whole byte yet
+    coder->put_bits = pending_bits;
+    coder->put_value = pending;
+    return 0;
+}
+
+/**
+ * Flush pending bits: seven one bits are appended, which pushes every
+ * pending bit out within a complete byte, and the coder's bit buffer is
+ * cleared afterwards.
+ * 
+ * @param coder  Huffman coder structure
+ * @return void
+ */
+inline void
+jpeg_huffman_cpu_encoder_emit_left_bits(struct jpeg_huffman_cpu_encoder* coder)
+{
+    // The buffer is cleared only when the padding bits have been emitted
+    if ( jpeg_huffman_cpu_encoder_emit_bits(coder, 0x7F, 7) == 0 ) {
+        coder->put_value = 0;
+        coder->put_bits = 0;
+    }
+}
+
+/**
+ * Encode one 8x8 block
+ *
+ * The DC coefficient is coded as the difference to the DC coefficient of
+ * the previous block, the AC coefficients as (zero run, category) symbols
+ * in jpeg_order_natural order, each symbol followed by the category offset.
+ *
+ * @param coder  Huffman coder structure
+ * @param data  Block of coefficients to encode
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_huffman_cpu_encoder_encode_block(struct jpeg_huffman_cpu_encoder* coder, int16_t* data)
+{
+    // Section F.1.2.1: DC coefficient difference to the previous block
+    const int dc_diff = data[0] - coder->dc;
+    coder->dc = data[0];
+
+    // Magnitude, and the bits to emit; for a negative difference the bits
+    // are (difference - 1), which relies on two's complement
+    int dc_magnitude = dc_diff;
+    int dc_offset = dc_diff;
+    if ( dc_diff < 0 ) {
+        dc_magnitude = -dc_diff;
+        dc_offset = dc_diff - 1;
+    }
+
+    // The category is the number of bits needed for the magnitude
+    int dc_category = 0;
+    for ( ; dc_magnitude != 0; dc_magnitude >>= 1 )
+        dc_category++;
+
+    // Emit the category symbol
+    if ( jpeg_huffman_cpu_encoder_emit_bits(coder, coder->table_dc->code[dc_category], coder->table_dc->size[dc_category]) != 0 ) {
+        fprintf(stderr, "Fail emit bits %d [code: %d, size: %d]!\n", dc_category, coder->table_dc->code[dc_category], coder->table_dc->size[dc_category]);
+        return -1;
+    }
+
+    // Emit the category offset (emit_bits rejects calls with size 0)
+    if ( dc_category != 0 && jpeg_huffman_cpu_encoder_emit_bits(coder, (unsigned int) dc_offset, dc_category) != 0 )
+        return -1;
+
+    // Section F.1.2.2: AC coefficients, zero_run counts the zeros that
+    // precede a non-zero coefficient
+    int zero_run = 0;
+    for ( int k = 1; k < 64; k++ ) {
+        const int coefficient = data[jpeg_order_natural[k]];
+        if ( coefficient == 0 ) {
+            zero_run++;
+            continue;
+        }
+
+        // A symbol holds a run of at most 15, longer runs are reduced by
+        // the special run-length-16 code (0xF0)
+        for ( ; zero_run > 15; zero_run -= 16 ) {
+            if ( jpeg_huffman_cpu_encoder_emit_bits(coder, coder->table_ac->code[0xF0], coder->table_ac->size[0xF0]) != 0 )
+                return -1;
+        }
+
+        // Magnitude and bits to emit, the same way as for the DC difference
+        int ac_magnitude = coefficient;
+        int ac_offset = coefficient;
+        if ( coefficient < 0 ) {
+            ac_magnitude = -coefficient;
+            ac_offset = coefficient - 1;
+        }
+
+        // The coefficient is not zero, so the category is at least 1
+        int ac_category = 1;
+        while ( (ac_magnitude >>= 1) )
+            ac_category++;
+
+        // Emit Huffman symbol for run length / category
+        const int symbol = (zero_run << 4) + ac_category;
+        if ( jpeg_huffman_cpu_encoder_emit_bits(coder, coder->table_ac->code[symbol], coder->table_ac->size[symbol]) != 0 )
+            return -1;
+
+        // Emit the category offset
+        if ( jpeg_huffman_cpu_encoder_emit_bits(coder, (unsigned int) ac_offset, ac_category) != 0 )
+            return -1;
+
+        zero_run = 0;
+    }
+
+    // The block ends with zeros, emit an end-of-block code
+    if ( zero_run > 0 && jpeg_huffman_cpu_encoder_emit_bits(coder, coder->table_ac->code[0], coder->table_ac->size[0]) != 0 )
+        return -1;
+
+    return 0;
+}
+
+/** Documented at declaration */
+int
+jpeg_huffman_cpu_encoder_encode(struct jpeg_encoder* encoder, enum jpeg_component_type type, int16_t* data)
+{    
+    // Image size in blocks, rounded up
+    int block_cx = (encoder->param_image.width + JPEG_BLOCK_SIZE - 1) / JPEG_BLOCK_SIZE;
+    int block_cy = (encoder->param_image.height + JPEG_BLOCK_SIZE - 1) / JPEG_BLOCK_SIZE;
+    
+    // Initialize huffman coder
+    struct jpeg_huffman_cpu_encoder coder;
+    coder.table_dc = &encoder->table_huffman[type][JPEG_HUFFMAN_DC];
+    coder.table_ac = &encoder->table_huffman[type][JPEG_HUFFMAN_AC];
+    coder.put_value = 0;
+    coder.put_bits = 0;
+    coder.dc = 0;
+    coder.writer = encoder->writer;
+    coder.restart_interval = encoder->param.restart_interval;
+    coder.block_count = 0;
+    
+    // Encode all blocks
+    for ( int block_y = 0; block_y < block_cy; block_y++ ) {
+        for ( int block_x = 0; block_x < block_cx; block_x++ ) {
+            // Process restart interval: a restart marker is placed in front
+            // of every block that starts a new interval (except the first one)
+            if ( coder.restart_interval != 0 ) {
+                if ( coder.block_count > 0 && (coder.block_count % coder.restart_interval) == 0 ) {
+                    // Emit left bits
+                    if ( coder.put_bits > 0 )
+                        jpeg_huffman_cpu_encoder_emit_left_bits(&coder);
+                    // Restart huffman coder
+                    coder.put_value = 0;
+                    coder.put_bits = 0;
+                    coder.dc = 0;
+                    // Output restart marker, the markers RST0 to RST7 are used cyclically
+                    int restart_marker = JPEG_MARKER_RST0 + (((coder.block_count - coder.restart_interval) / coder.restart_interval) & 0x7);
+                    jpeg_writer_emit_marker(encoder->writer, restart_marker);
+                }
+            }
+            
+            // Encode block
+            int data_index = (block_y * block_cx + block_x) * JPEG_BLOCK_SIZE * JPEG_BLOCK_SIZE;
+            if ( jpeg_huffman_cpu_encoder_encode_block(&coder, &data[data_index]) != 0 ) {
+                fprintf(stderr, "Huffman encoder failed at block [%d, %d]!\n", block_y, block_x);
+                return -1;
+            }
+            coder.block_count++;
+        }
+    }
+    
+    // Emit left bits
+    if ( coder.put_bits > 0 )
+        jpeg_huffman_cpu_encoder_emit_left_bits(&coder);
+    
+    return 0;
+}
--- a/jpeg_compress/jpeg_huffman_cpu_encoder.h
+++ b/jpeg_compress/jpeg_huffman_cpu_encoder.h
@@ -0,0 +1,43 @@
+/**
+ * Copyright (c) 2011, Martin Srom
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef JPEG_HUFFMAN_CPU_ENCODER
+#define JPEG_HUFFMAN_CPU_ENCODER
+
+#include "jpeg_encoder.h"
+
+/**
+ * Perform huffman encoding
+ * 
+ * @param encoder  Encoder structure
+ * @param type  Component type
+ * @param data  Data buffer
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_huffman_cpu_encoder_encode(struct jpeg_encoder* encoder, enum jpeg_component_type type, int16_t* data);
+
+#endif // JPEG_HUFFMAN_CPU_ENCODER
--- a/jpeg_compress/jpeg_huffman_gpu_decoder.cu
+++ b/jpeg_compress/jpeg_huffman_gpu_decoder.cu
@@ -0,0 +1,407 @@
+/**
+ * Copyright (c) 2011, Martin Srom
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+ 
+#include "jpeg_huffman_gpu_decoder.h"
+#include "jpeg_format_type.h"
+#include "jpeg_util.h"
+
+/**
+ * Natural order table in constant memory
+ *
+ * Filled from the host table jpeg_order_natural by jpeg_huffman_gpu_decoder_init()
+ * and used by the block decoder to translate the position k of a decoded
+ * coefficient into its index inside the output block.
+ */
+__constant__ int jpeg_huffman_gpu_decoder_order_natural[64];
+
+/**
+ * Fill more bits into the current get buffer
+ *
+ * Appends whole bytes from the scan data to the low end of get_buff until at
+ * least 25 bits are buffered, the input is exhausted, or a marker is found.
+ * A stuffed FF/00 pair is folded into a single 0xFF data byte.
+ *
+ * @param get_bits  Number of valid bits in get_buff (updated)
+ * @param get_buff  Bit buffer (updated)
+ * @param data  Current read position in the scan data (advanced)
+ * @param data_size  Number of bytes left at data (decreased)
+ * @return void
+ */
+__device__ inline void
+jpeg_huffman_gpu_decoder_decode_fill_bit_buffer(int & get_bits, int & get_buff, uint8_t* & data, int & data_size)
+{
+    while ( get_bits < 25 ) {
+        // Stop as soon as there are no more data
+        if ( data_size <= 0 )
+            break;
+
+        // Read a byte
+        unsigned char uc = *data++;
+        data_size--;
+
+        // If it's 0xFF, check and discard stuffed zero byte
+        if ( uc == 0xFF ) {
+            do {
+                // The input ends right after a 0xFF byte (truncated or
+                // corrupt data): never read past the end of the buffer
+                if ( data_size <= 0 )
+                    return;
+                uc = *data++;
+                data_size--;
+            } while ( uc == 0xFF );
+
+            if ( uc == 0 ) {
+                // Found FF/00, which represents an FF data byte
+                uc = 0xFF;
+            } else {
+                // A marker was found. There should be enough bits still left
+                // in the data segment; if so, just break out of the outer
+                // while loop without buffering the marker byte.
+                if ( get_bits >= 0 )
+                    break;
+            }
+        }
+
+        get_buff = (get_buff << 8) | ((int) uc);
+        get_bits += 8;
+    }
+}
+
+/**
+ * Get bits
+ *
+ * Removes the next nbits bits from the bit buffer, refilling it from the scan
+ * data first when it does not hold enough bits. If the input is exhausted
+ * (or a marker was hit) and the bits are still missing, the buffer is padded
+ * with zero bits, so the shift below never uses a negative count.
+ *
+ * @param nbits  Number of bits to get
+ * @param get_bits  Number of valid bits in get_buff (updated)
+ * @param get_buff  Bit buffer (updated)
+ * @param data  Current read position in the scan data (advanced)
+ * @param data_size  Number of bytes left at data (decreased)
+ * @return bits
+ */
+__device__ inline int
+jpeg_huffman_gpu_decoder_get_bits(int nbits, int & get_bits, int & get_buff, uint8_t* & data, int & data_size)
+{
+    // We need nbits bits to get the next data
+    if ( get_bits < nbits ) {
+        jpeg_huffman_gpu_decoder_decode_fill_bit_buffer(get_bits, get_buff, data, data_size);
+        // Still not enough bits: the data are truncated or corrupt, so supply
+        // zero bits instead of letting get_bits become negative
+        if ( get_bits < nbits ) {
+            get_buff <<= (nbits - get_bits);
+            get_bits = nbits;
+        }
+    }
+    get_bits -= nbits;
+    return (int)(get_buff >> get_bits) & ((1 << nbits) - 1);
+}
+
+
+/**
+ * Slow-path Huffman decode, used
+ * (1) for codes longer than 8 bits
+ * (2) for shorter codes when fewer than 8 bits are left in the buffer
+ *
+ * @param table  Huffman decoder table
+ * @param min_bits  Number of bits the code is known to have at least
+ * @param get_bits  Number of valid bits in get_buff (updated)
+ * @param get_buff  Bit buffer (updated)
+ * @param data  Current read position in the scan data (advanced)
+ * @param data_size  Number of bytes left at data (decreased)
+ * @return decoded symbol, or 0 when no code of at most 16 bits matches
+ */
+__device__ inline int
+jpeg_huffman_gpu_decoder_decode_special_decode(struct jpeg_table_huffman_decoder* table, int min_bits, int & get_bits, int & get_buff, uint8_t* & data, int & data_size)
+{
+    // The caller knows the code has at least min_bits bits, fetch them at once
+    int length = min_bits;
+    int code = jpeg_huffman_gpu_decoder_get_bits(length, get_bits, get_buff, data, data_size);
+
+    // Extend the code bit by bit until it fits a code length of the table
+    // (Figure F.16 in the JPEG spec)
+    for ( ; code > table->maxcode[length]; length++ ) {
+        code = (code << 1) | jpeg_huffman_gpu_decoder_get_bits(1, get_bits, get_buff, data, data_size);
+    }
+
+    if ( length <= 16 ) {
+        const int index = table->valptr[length] + (int)(code - table->mincode[length]);
+        return table->huffval[index];
+    }
+
+    // Garbage input ran up to the sentinel length 17: zero is the safest result
+    return 0;
+}
+
+/**
+ * Find the dc or ac value that belongs to a category and an offset in it
+ *
+ * A category n covers the magnitudes 2**(n-1) .. 2**n - 1. Offsets in the
+ * lower half of the category stand for the negative values, the upper half
+ * stands for the positive values which are equal to the offset itself.
+ *
+ * @param category  Category number (0 to 15)
+ * @param offset  Offset inside the category
+ * @return int
+ */
+__device__ inline int
+jpeg_huffman_gpu_decoder_value_from_category(int category, int offset)
+{
+    // Category 0 carries no extra bits, the offset is passed through
+    if ( category == 0 )
+        return offset;
+
+    // First offset which already is a non-negative value: 2**(category - 1)
+    const int half = 1 << (category - 1);
+    if ( offset >= half )
+        return offset;
+
+    // Negative values start at -(2**category) + 1
+    return offset + 1 - (1 << category);
+}
+
+/**
+ * Decode the next Huffman symbol: the category number for dc, or the pair
+ * (zero run length, ac category) for ac.
+ *
+ * Codes of up to 8 bits are resolved with one lookup indexed by the next
+ * 8 bits of the buffer; everything else goes through the special decode.
+ *
+ * @param get_bits  Number of valid bits in get_buff (updated)
+ * @param get_buff  Bit buffer (updated)
+ * @param data  Current read position in the scan data (advanced)
+ * @param data_size  Number of bytes left at data (decreased)
+ * @param table  Huffman decoder table
+ * @return int
+ */
+__device__ inline int
+jpeg_huffman_gpu_decoder_get_category(int & get_bits, int & get_buff, uint8_t* & data, int & data_size, struct jpeg_table_huffman_decoder* table)
+{
+    if ( get_bits < 8 ) {
+        // Not enough bits for the lookup, try to get more data
+        jpeg_huffman_gpu_decoder_decode_fill_bit_buffer(get_bits, get_buff, data, data_size);
+
+        // Data finished: decode bit by bit starting with a single bit
+        if ( get_bits < 8 )
+            return jpeg_huffman_gpu_decoder_decode_special_decode(table, 1, get_bits, get_buff, data, data_size);
+    }
+
+    // Peek at the next 8 bits without consuming them
+    const int look = (get_buff >> (get_bits - 8)) & 0xFF;
+    const int nb = table->look_nbits[look];
+
+    // No entry in the lookup table: the code has 9 or more bits
+    if ( nb == 0 )
+        return jpeg_huffman_gpu_decoder_decode_special_decode(table, 9, get_bits, get_buff, data, data_size);
+
+    // Consume only the bits of the matched code
+    get_bits -= nb;
+    return table->look_sym[look];
+}
+
+/**
+ * Decode one 8x8 block
+ *
+ * Reads the DC difference and the run-length coded AC coefficients of one
+ * block from the bit stream. Only non-zero AC coefficients are stored, so
+ * the output area must be cleared by the caller beforehand.
+ *
+ * @param dc  DC predictor: value of the previous block on entry, value of
+ *            this block on return
+ * @param get_bits  Number of valid bits in get_buff (updated)
+ * @param get_buff  Bit buffer (updated)
+ * @param data  Current read position in the scan data (advanced)
+ * @param data_size  Number of bytes left at data (decreased)
+ * @param data_output  Output buffer for the 64 coefficients of the block
+ * @param table_dc  Huffman decoder table for the DC coefficient
+ * @param table_ac  Huffman decoder table for the AC coefficients
+ * @return 0 (no failure is reported at present)
+ */
+__device__ inline int
+jpeg_huffman_gpu_decoder_decode_block(int & dc, int & get_bits, int & get_buff, uint8_t* & data, int & data_size, int16_t* data_output, 
+                                      struct jpeg_table_huffman_decoder* table_dc, struct jpeg_table_huffman_decoder* table_ac)
+{
+    // Section F.2.2.1: decode the DC coefficient difference
+    // get dc category number, s
+    int s = jpeg_huffman_gpu_decoder_get_category(get_bits, get_buff, data, data_size, table_dc);
+    if ( s ) {
+        // Get offset in this dc category
+        int r = jpeg_huffman_gpu_decoder_get_bits(s, get_bits, get_buff, data, data_size);
+        // Get dc difference value
+        s = jpeg_huffman_gpu_decoder_value_from_category(s, r);
+    }
+
+    // Convert DC difference to actual value, update last_dc_val
+    s += dc;
+    dc = s;
+
+    // Output the DC coefficient (assumes jpeg_natural_order[0] = 0)
+    data_output[0] = s;
+
+    // Section F.2.2.2: decode the AC coefficients
+    for ( int k = 1; k < 64; k++ ) {
+        // Symbol holds (run, category)
+        int symbol = jpeg_huffman_gpu_decoder_get_category(get_bits, get_buff, data, data_size, table_ac);
+        // Run length of zero coefficients, 0 <= run < 16
+        int run = symbol >> 4;
+        // Category of the non-zero coefficient which follows the run
+        int category = symbol & 15;
+        if ( category ) {
+            // Position of the non-zero coefficient
+            k += run;
+            // Offset in the category; always consumed to keep the bit stream in step
+            int offset = jpeg_huffman_gpu_decoder_get_bits(category, get_bits, get_buff, data, data_size);
+            // Corrupt data may push the position past the end of the block;
+            // the order table has only 64 entries, so such a coefficient is
+            // dropped instead of being written outside of the block
+            if ( k < 64 )
+                data_output[jpeg_huffman_gpu_decoder_order_natural[k]] = jpeg_huffman_gpu_decoder_value_from_category(category, offset);
+        } else {
+            // Category 0 with run 15 is a run of 16 zeros (15 here plus the
+            // loop increment); any other run means end of block
+            if ( run != 15 )
+                break;
+            k += 15;
+        }
+    }
+
+    return 0;
+}
+
+/**
+ * Huffman decoder kernel
+ *
+ * Each thread decodes one restart segment of one color component; the
+ * component is selected by blockIdx.y, the segment inside the component by
+ * the x index of the thread.
+ *
+ * @param restart_interval  Number of blocks in one restart segment
+ * @param comp_block_count  Number of 8x8 blocks in one component
+ * @param comp_segment_count  Number of segments in one component
+ * @param segment_count  Number of segments in all components
+ * @param d_data_scan  Scan data
+ * @param data_scan_size  Size of the scan data in bytes
+ * @param d_data_scan_index  Start offset in d_data_scan for every segment
+ * @param d_data_decompressed  Output coefficients, one 8x8 block after another
+ * @param d_table_y_dc  Table for component 0 DC
+ * @param d_table_y_ac  Table for component 0 AC
+ * @param d_table_cbcr_dc  Table for the other components DC
+ * @param d_table_cbcr_ac  Table for the other components AC
+ * @return void
+ */
+__global__ void
+jpeg_huffman_decoder_decode_kernel(
+    int restart_interval,
+    int comp_block_count,
+    int comp_segment_count,
+    int segment_count,    
+    uint8_t* d_data_scan,
+    int data_scan_size,
+    int* d_data_scan_index,
+    int16_t* d_data_decompressed,
+    struct jpeg_table_huffman_decoder* d_table_y_dc,
+    struct jpeg_table_huffman_decoder* d_table_y_ac,
+    struct jpeg_table_huffman_decoder* d_table_cbcr_dc,
+    struct jpeg_table_huffman_decoder* d_table_cbcr_ac
+)
+{
+    const int comp_index = blockIdx.y;
+    const int comp_segment_index = blockIdx.x * blockDim.x + threadIdx.x;
+    const int segment_index = comp_index * comp_segment_count + comp_segment_index;
+    // Threads without a segment have nothing to do
+    if ( comp_segment_index >= comp_segment_count || segment_index >= segment_count )
+        return;
+
+    // Component 0 uses the luminance tables, the others the chrominance ones
+    const bool is_first_component = (comp_index == 0);
+    struct jpeg_table_huffman_decoder* d_table_dc = is_first_component ? d_table_y_dc : d_table_cbcr_dc;
+    struct jpeg_table_huffman_decoder* d_table_ac = is_first_component ? d_table_y_ac : d_table_cbcr_ac;
+
+    // Decoder state, reset for every segment
+    int get_bits = 0;
+    int get_buff = 0;
+    int dc = 0;
+
+    // The segment spans from its own start offset to the start offset of the
+    // next segment, the last one to the end of the scan data
+    const int segment_begin = d_data_scan_index[segment_index];
+    const bool is_last_segment = (segment_index + 1) >= segment_count;
+    const int segment_end = is_last_segment ? data_scan_size : d_data_scan_index[segment_index + 1];
+    uint8_t* data = &d_data_scan[segment_begin];
+    int data_size = segment_end - segment_begin;
+
+    // Decode blocks in restart segment, stopping at the last block of the component
+    const int block_begin = comp_segment_index * restart_interval;
+    int block_end = block_begin + restart_interval;
+    if ( block_end > comp_block_count )
+        block_end = comp_block_count;
+    for ( int comp_block_index = block_begin; comp_block_index < block_end; comp_block_index++ ) {
+        int16_t* data_output = &d_data_decompressed[(comp_block_count * comp_index + comp_block_index) * JPEG_BLOCK_SIZE * JPEG_BLOCK_SIZE];
+        jpeg_huffman_gpu_decoder_decode_block(dc, get_bits, get_buff, data, data_size, data_output, d_table_dc, d_table_ac);
+    }
+}
+
+/**
+ * Documented at declaration
+ *
+ * Uploads the host table jpeg_order_natural into the constant memory table
+ * used by the decoder kernel.
+ */
+int
+jpeg_huffman_gpu_decoder_init()
+{
+    // Copy natural order to constant device memory. The symbol itself is
+    // passed, not its name as a string: lookup of symbols by name was
+    // deprecated in CUDA 4.1 and removed in CUDA 5.0.
+    cudaMemcpyToSymbol(
+        jpeg_huffman_gpu_decoder_order_natural,
+        jpeg_order_natural, 
+        64 * sizeof(int),
+        0,
+        cudaMemcpyHostToDevice
+    );
+    cudaCheckError("Huffman decoder init");
+    
+    return 0;
+}
+
+/**
+ * Documented at declaration
+ *
+ * Launches the decoder kernel with one thread per restart segment and one
+ * grid row per color component, then waits for it to finish.
+ */
+int
+jpeg_huffman_gpu_decoder_decode(struct jpeg_decoder* decoder)
+{    
+    assert(decoder->restart_interval > 0);
+
+    // Count of 8x8 blocks in one component, partial blocks at the edges included
+    const int blocks_x = (decoder->param_image.width + JPEG_BLOCK_SIZE - 1) / JPEG_BLOCK_SIZE;
+    const int blocks_y = (decoder->param_image.height + JPEG_BLOCK_SIZE - 1) / JPEG_BLOCK_SIZE;
+    const int comp_block_count = blocks_x * blocks_y;
+    // Count of restart segments in one component
+    const int comp_segment_count = divAndRoundUp(comp_block_count, decoder->restart_interval);
+
+    // Launch configuration: x covers the segments, y the components
+    dim3 block_dim(32);
+    dim3 grid_dim(divAndRoundUp(comp_segment_count, block_dim.x), decoder->param_image.comp_count);
+    jpeg_huffman_decoder_decode_kernel<<<grid_dim, block_dim>>>(
+        decoder->restart_interval,
+        comp_block_count,
+        comp_segment_count,
+        decoder->segment_count,
+        decoder->d_data_scan,
+        decoder->data_scan_size,
+        decoder->d_data_scan_index,
+        decoder->d_data_quantized,
+        decoder->d_table_huffman[JPEG_COMPONENT_LUMINANCE][JPEG_HUFFMAN_DC],
+        decoder->d_table_huffman[JPEG_COMPONENT_LUMINANCE][JPEG_HUFFMAN_AC],
+        decoder->d_table_huffman[JPEG_COMPONENT_CHROMINANCE][JPEG_HUFFMAN_DC],
+        decoder->d_table_huffman[JPEG_COMPONENT_CHROMINANCE][JPEG_HUFFMAN_AC]
+    );
+
+    // Wait for the kernel and pick up its status
+    cudaError cuerr = cudaThreadSynchronize();
+    if ( cuerr == cudaSuccess )
+        return 0;
+
+    fprintf(stderr, "Huffman decoding failed: %s!\n", cudaGetErrorString(cuerr));
+    return -1;
+}
--- a/jpeg_compress/jpeg_huffman_gpu_decoder.h
+++ b/jpeg_compress/jpeg_huffman_gpu_decoder.h
@@ -0,0 +1,57 @@
+/**
+ * Copyright (c) 2011, Martin Srom
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef JPEG_HUFFMAN_GPU_DECODER
+#define JPEG_HUFFMAN_GPU_DECODER
+
+#include "jpeg_decoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Init huffman decoder
+ *
+ * Must be called before jpeg_huffman_gpu_decoder_decode(): it copies the
+ * natural order table to the constant device memory read by the decoder.
+ *
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_huffman_gpu_decoder_init();
+
+/**
+ * Perform huffman decoding
+ *
+ * Runs the decoding on the GPU and returns after it has finished. The
+ * restart interval of the decoder must be greater than zero.
+ *
+ * @param decoder  Decoder structure
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_huffman_gpu_decoder_decode(struct jpeg_decoder* decoder);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // JPEG_HUFFMAN_GPU_DECODER
--- a/jpeg_compress/jpeg_huffman_gpu_encoder.cu
+++ b/jpeg_compress/jpeg_huffman_gpu_encoder.cu
@@ -0,0 +1,362 @@
+/**
+ * Copyright (c) 2011, Martin Srom
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+ 
+#include "jpeg_huffman_gpu_encoder.h"
+#include "jpeg_format_type.h"
+#include "jpeg_util.h"
+
+/**
+ * Natural order table in constant memory
+ *
+ * Filled from the host table jpeg_order_natural by jpeg_huffman_gpu_encoder_init()
+ * and used by the block encoder to pick the AC coefficients of a block in
+ * the order in which they are encoded.
+ */
+__constant__ int jpeg_huffman_gpu_encoder_order_natural[64];
+
+/**
+ * Write one byte to compressed data
+ *
+ * Expands to a single statement (do/while wrapper), so it is safe to use
+ * inside an if/else without braces.
+ *
+ * @param data_compressed  Data compressed (write pointer, moved by 1 byte)
+ * @param value  Byte value to write
+ * @return void
+ */
+#define jpeg_huffman_gpu_encoder_emit_byte(data_compressed, value) do { \
+    *(data_compressed) = (uint8_t)(value); \
+    (data_compressed)++; } while ( 0 )
+    
+/**
+ * Write two bytes to compressed data, the high byte first
+ *
+ * Expands to a single statement (do/while wrapper), so it is safe to use
+ * inside an if/else without braces. The value argument is evaluated twice.
+ *
+ * @param data_compressed  Data compressed (write pointer, moved by 2 bytes)
+ * @param value  Two-byte value to write
+ * @return void
+ */
+#define jpeg_huffman_gpu_encoder_emit_2byte(data_compressed, value) do { \
+    *(data_compressed) = (uint8_t)(((value) >> 8) & 0xFF); \
+    (data_compressed)++; \
+    *(data_compressed) = (uint8_t)((value) & 0xFF); \
+    (data_compressed)++; } while ( 0 )
+    
+/**
+ * Write marker to compressed data: the byte 0xFF followed by the marker code
+ *
+ * Expands to a single statement (do/while wrapper), so it is safe to use
+ * inside an if/else without braces.
+ *
+ * @param data_compressed  Data compressed (write pointer, moved by 2 bytes)
+ * @param marker  Marker to write (JPEG_MARKER_...)
+ * @return void
+ */
+#define jpeg_huffman_gpu_encoder_marker(data_compressed, marker) do { \
+    *(data_compressed) = 0xFF; \
+    (data_compressed)++; \
+    *(data_compressed) = (uint8_t)(marker); \
+    (data_compressed)++; } while ( 0 )
+
+/**
+ * Append a code to the output bit stream
+ *
+ * Only the low 24 bits of put_value are used and the pending bits are kept
+ * left-justified in them. Every complete byte is written out immediately
+ * and a zero byte is stuffed after each 0xFF byte.
+ *
+ * @param code  Huffman code (bits above size are ignored)
+ * @param size  Size in bits of the Huffman code
+ * @param put_value  Pending bits not written yet (updated)
+ * @param put_bits  Number of pending bits in put_value (updated)
+ * @param data_compressed  Write position in the compressed data (advanced)
+ * @return 0 if succeeds, -1 if size is 0
+ */
+__device__ inline int
+jpeg_huffman_gpu_encoder_emit_bits(unsigned int code, int size, int & put_value, int & put_bits, uint8_t* & data_compressed)
+{
+    // A code of zero size means the caller used an invalid Huffman table entry
+    if ( size == 0 )
+        return -1;
+
+    // Number of pending bits once the new code is appended
+    int pending_bits = put_bits + size;
+    // Drop any extra bits of the code and align the rest right below the
+    // bits which are already pending
+    int pending = ((int)code & ((((int)1) << size) - 1)) << (24 - pending_bits);
+    pending |= put_value;
+
+    // Write out every complete byte
+    for ( ; pending_bits >= 8; pending_bits -= 8 ) {
+        const unsigned char byte = (unsigned char) ((pending >> 16) & 0xFF);
+        jpeg_huffman_gpu_encoder_emit_byte(data_compressed, byte);
+        // A 0xFF data byte has to be followed by a stuffed zero byte
+        if ( byte == 0xFF ) {
+            jpeg_huffman_gpu_encoder_emit_byte(data_compressed, 0);
+        }
+        pending <<= 8;
+    }
+
+    // Store the state for the next call
+    put_value = pending;
+    put_bits = pending_bits;
+    return 0;
+}
+
+/**
+ * Flush the pending bits
+ *
+ * Appends seven one-bits, which completes a partly filled byte and forces
+ * it out, and then resets the bit buffer.
+ *
+ * @param put_value  Pending bits not written yet (reset to 0)
+ * @param put_bits  Number of pending bits in put_value (reset to 0)
+ * @param data_compressed  Write position in the compressed data (advanced)
+ * @return void
+ */
+__device__ inline void
+jpeg_huffman_gpu_encoder_emit_left_bits(int & put_value, int & put_bits, uint8_t* & data_compressed)
+{
+    // Pad with 7 bits of ones
+    const int status = jpeg_huffman_gpu_encoder_emit_bits(0x7F, 7, put_value, put_bits, data_compressed);
+
+    // The padding bits which did not fit into the written byte are dropped
+    if ( status == 0 ) {
+        put_value = 0;
+        put_bits = 0;
+    }
+}
+
+/**
+ * Encode one 8x8 block
+ *
+ * @param put_value  Pending bits not written yet (updated)
+ * @param put_bits  Number of pending bits in put_value (updated)
+ * @param dc  DC predictor: value of the previous block on entry, value of
+ *            this block on return
+ * @param data  The 64 coefficients of the block
+ * @param data_compressed  Write position in the compressed data (advanced)
+ * @param d_table_dc  Huffman encoder table for the DC coefficient
+ * @param d_table_ac  Huffman encoder table for the AC coefficients
+ * @return 0 if succeeds, otherwise nonzero
+ */
+__device__ int
+jpeg_huffman_gpu_encoder_encode_block(int & put_value, int & put_bits, int & dc, int16_t* data, uint8_t* & data_compressed, struct jpeg_table_huffman_encoder* d_table_dc, struct jpeg_table_huffman_encoder* d_table_ac)
+{
+    // Section F.1.2.1: the DC coefficient is encoded as a difference
+    int value = data[0] - dc;
+    dc = data[0];
+
+    // 'value' becomes the magnitude; 'extra' holds the bits emitted after
+    // the category: the value itself, or value - 1 for a negative value
+    // (relies on two's complement)
+    int extra = value;
+    if ( value < 0 ) {
+        value = -value;
+        extra--;
+    }
+
+    // Category = number of bits needed for the magnitude
+    int nbits = 0;
+    for ( ; value != 0; value >>= 1 )
+        nbits++;
+
+    // Category number first
+    if ( jpeg_huffman_gpu_encoder_emit_bits(d_table_dc->code[nbits], d_table_dc->size[nbits], put_value, put_bits, data_compressed) != 0 )
+        return -1;
+
+    // Then the offset in the category (emit_bits rejects calls with size 0)
+    if ( nbits != 0 && jpeg_huffman_gpu_encoder_emit_bits((unsigned int) extra, nbits, put_value, put_bits, data_compressed) != 0 )
+        return -1;
+
+    // Section F.1.2.2: AC coefficients, 'run' counts the zeros seen since
+    // the last non-zero coefficient
+    int run = 0;
+    for ( int k = 1; k < 64; k++ ) {
+        value = data[jpeg_huffman_gpu_encoder_order_natural[k]];
+        if ( value == 0 ) {
+            run++;
+            continue;
+        }
+
+        // A run longer than 15 is split by the special run-length-16 code (0xF0)
+        for ( ; run > 15; run -= 16 ) {
+            if ( jpeg_huffman_gpu_encoder_emit_bits(d_table_ac->code[0xF0], d_table_ac->size[0xF0], put_value, put_bits, data_compressed) != 0 )
+                return -1;
+        }
+
+        // Magnitude and extra bits, the same way as for the DC coefficient
+        extra = value;
+        if ( value < 0 ) {
+            value = -value;
+            extra--;
+        }
+
+        // The value is non-zero, so at least one bit is needed
+        nbits = 1;
+        while ( (value >>= 1) )
+            nbits++;
+
+        // Huffman symbol combining run length and number of bits
+        const int symbol = (run << 4) + nbits;
+        if ( jpeg_huffman_gpu_encoder_emit_bits(d_table_ac->code[symbol], d_table_ac->size[symbol], put_value, put_bits, data_compressed) != 0 )
+            return -1;
+
+        // Offset in the category
+        if ( jpeg_huffman_gpu_encoder_emit_bits((unsigned int) extra, nbits, put_value, put_bits, data_compressed) != 0 )
+            return -1;
+
+        run = 0;
+    }
+
+    // The block ends with zeros: emit an end-of-block code
+    if ( run > 0 ) {
+        if ( jpeg_huffman_gpu_encoder_emit_bits(d_table_ac->code[0], d_table_ac->size[0], put_value, put_bits, data_compressed) != 0 )
+            return -1;
+    }
+
+    return 0;
+}
+
+/**
+ * Huffman encoder kernel
+ *
+ * Each thread encodes one restart segment of one color component; the
+ * component is selected by blockIdx.y, the segment inside the component by
+ * the x index of the thread. The number of bytes produced is stored to the
+ * segment structure.
+ *
+ * @param restart_interval  Number of blocks in one restart segment
+ * @param block_count  Number of 8x8 blocks in one component
+ * @param d_segments  Segments, segment_count items for every component
+ * @param segment_count  Number of segments in one component
+ * @param d_data  Input coefficients, one 8x8 block after another
+ * @param d_data_compressed  Output buffer for the compressed data
+ * @param d_table_y_dc  Table for component 0 DC
+ * @param d_table_y_ac  Table for component 0 AC
+ * @param d_table_cbcr_dc  Table for the other components DC
+ * @param d_table_cbcr_ac  Table for the other components AC
+ * @return void
+ */
+__global__ void
+jpeg_huffman_encoder_encode_kernel(
+    int restart_interval,
+    int block_count,
+    struct jpeg_encoder_segment* d_segments,
+    int segment_count,    
+    int16_t* d_data,
+    uint8_t* d_data_compressed,
+    struct jpeg_table_huffman_encoder* d_table_y_dc,
+    struct jpeg_table_huffman_encoder* d_table_y_ac,
+    struct jpeg_table_huffman_encoder* d_table_cbcr_dc,
+    struct jpeg_table_huffman_encoder* d_table_cbcr_ac
+)
+{
+    const int comp_index = blockIdx.y;
+    const int segment_index = blockIdx.x * blockDim.x + threadIdx.x;
+    // Threads without a segment have nothing to do
+    if ( segment_index >= segment_count )
+        return;
+
+    struct jpeg_encoder_segment* segment = &d_segments[comp_index * segment_count + segment_index];
+
+    // Component 0 uses the luminance tables, the others the chrominance ones
+    const bool is_first_component = (comp_index == 0);
+    struct jpeg_table_huffman_encoder* d_table_dc = is_first_component ? d_table_y_dc : d_table_cbcr_dc;
+    struct jpeg_table_huffman_encoder* d_table_ac = is_first_component ? d_table_y_ac : d_table_cbcr_ac;
+
+    // Coder state, reset for every segment
+    int put_value = 0;
+    int put_bits = 0;
+    int dc = 0;
+
+    // The output of the segment starts at the position stored in the segment
+    uint8_t* const data_compressed_start = &d_data_compressed[segment->data_compressed_index];
+    uint8_t* data_compressed = data_compressed_start;
+
+    // Encode blocks in restart segment, stopping at the last block of the component
+    int block_index = segment_index * restart_interval;
+    for ( int remaining = restart_interval; remaining > 0 && block_index < block_count; remaining--, block_index++ ) {
+        int16_t* block_data = &d_data[(block_count * comp_index + block_index) * JPEG_BLOCK_SIZE * JPEG_BLOCK_SIZE];
+        jpeg_huffman_gpu_encoder_encode_block(put_value, put_bits, dc, block_data, data_compressed, d_table_dc, d_table_ac);
+    }
+
+    // Flush the bits which do not fill a whole byte yet
+    if ( put_bits > 0 )
+        jpeg_huffman_gpu_encoder_emit_left_bits(put_value, put_bits, data_compressed);
+
+    // A restart marker follows every segment except the one which reaches
+    // the end of the component; the marker number cycles through 0 to 7
+    if ( block_index < block_count ) {
+        int restart_marker = JPEG_MARKER_RST0 + (((block_index - restart_interval) / restart_interval) & 0x7);
+        jpeg_huffman_gpu_encoder_marker(data_compressed, restart_marker);
+    }
+
+    // Number of bytes written by this segment
+    segment->data_compressed_size = data_compressed - data_compressed_start;
+}
+
+/**
+ * Documented at declaration
+ *
+ * Uploads the host table jpeg_order_natural into the constant memory table
+ * used by the encoder kernel.
+ */
+int
+jpeg_huffman_gpu_encoder_init()
+{
+    // Copy natural order to constant device memory. The symbol itself is
+    // passed, not its name as a string: lookup of symbols by name was
+    // deprecated in CUDA 4.1 and removed in CUDA 5.0.
+    cudaMemcpyToSymbol(
+        jpeg_huffman_gpu_encoder_order_natural,
+        jpeg_order_natural, 
+        64 * sizeof(int),
+        0,
+        cudaMemcpyHostToDevice
+    );
+    cudaCheckError("Huffman encoder init");
+    
+    return 0;
+}
+
+/**
+ * Documented at declaration
+ *
+ * Launches the encoder kernel with one thread per restart segment and one
+ * grid row per color component, then waits for it to finish.
+ */
+int
+jpeg_huffman_gpu_encoder_encode(struct jpeg_encoder* encoder)
+{    
+    assert(encoder->param.restart_interval > 0);
+
+    // Count of 8x8 blocks in one component, partial blocks at the edges included
+    const int blocks_x = (encoder->param_image.width + JPEG_BLOCK_SIZE - 1) / JPEG_BLOCK_SIZE;
+    const int blocks_y = (encoder->param_image.height + JPEG_BLOCK_SIZE - 1) / JPEG_BLOCK_SIZE;
+    const int block_count = blocks_x * blocks_y;
+    // Count of restart segments in one component: the quotient plus one
+    const int segment_count = (block_count / encoder->param.restart_interval + 1);
+
+    // Launch configuration: x covers the segments, y the components
+    dim3 block_dim(32);
+    dim3 grid_dim(segment_count / block_dim.x + 1, encoder->param_image.comp_count);
+    jpeg_huffman_encoder_encode_kernel<<<grid_dim, block_dim>>>(
+        encoder->param.restart_interval,
+        block_count,
+        encoder->d_segments,
+        segment_count,
+        encoder->d_data_quantized,
+        encoder->d_data_compressed,
+        encoder->d_table_huffman[JPEG_COMPONENT_LUMINANCE][JPEG_HUFFMAN_DC],
+        encoder->d_table_huffman[JPEG_COMPONENT_LUMINANCE][JPEG_HUFFMAN_AC],
+        encoder->d_table_huffman[JPEG_COMPONENT_CHROMINANCE][JPEG_HUFFMAN_DC],
+        encoder->d_table_huffman[JPEG_COMPONENT_CHROMINANCE][JPEG_HUFFMAN_AC]
+    );
+
+    // Wait for the kernel and pick up its status
+    cudaError cuerr = cudaThreadSynchronize();
+    if ( cuerr == cudaSuccess )
+        return 0;
+
+    fprintf(stderr, "Huffman encoding failed: %s!\n", cudaGetErrorString(cuerr));
+    return -1;
+}
--- a/jpeg_compress/jpeg_huffman_gpu_encoder.h
+++ b/jpeg_compress/jpeg_huffman_gpu_encoder.h
@@ -0,0 +1,57 @@
+/**
+ * Copyright (c) 2011, Martin Srom
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef JPEG_HUFFMAN_GPU_ENCODER
+#define JPEG_HUFFMAN_GPU_ENCODER
+
+#include "jpeg_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Init huffman encoder
+ *
+ * Must be called before jpeg_huffman_gpu_encoder_encode(): it copies the
+ * natural order table to the constant device memory read by the encoder.
+ *
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_huffman_gpu_encoder_init();
+
+/**
+ * Perform huffman encoding
+ *
+ * Runs the encoding on the GPU and returns after it has finished. The
+ * restart interval of the encoder must be greater than zero.
+ *
+ * @param encoder  Encoder structure
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_huffman_gpu_encoder_encode(struct jpeg_encoder* encoder);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // JPEG_HUFFMAN_GPU_ENCODER
--- a/jpeg_compress/jpeg_preprocessor.cu
+++ b/jpeg_compress/jpeg_preprocessor.cu
@@ -0,0 +1,351 @@
+/**
+ * Copyright (c) 2011, Martin Srom
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+ 
+#include "jpeg_preprocessor.h"
+#include "jpeg_util.h"
+
+/**
+ * Color space transformation between two color spaces chosen at compile time
+ *
+ * This primary template is the fallback for every pair that has no
+ * specialization below; its perform() only trips an assertion.
+ *
+ * @param color_space_from  Color space of the incoming components
+ * @param color_space_to  Color space the components are converted to
+ */
+template<enum jpeg_color_space color_space_from, enum jpeg_color_space color_space_to>
+struct jpeg_color_transform
+{
+    /** Unsupported conversion: fails the assertion and leaves c1..c3 untouched */
+    static __device__ void
+    perform(float & c1, float & c2, float & c3) {
+        assert(false);
+    }
+};
+
+/**
+ * Specialization [color_space_from = color_space_to]
+ *
+ * Chosen whenever both template arguments name the same color space.
+ */
+template<enum jpeg_color_space color_space>
+struct jpeg_color_transform<color_space, color_space> {
+    /** Identity transform: the three components are left unchanged */
+    static __device__ void 
+    perform(float & c1, float & c2, float & c3) {
+        // Source and target color space are identical, nothing to convert
+    }
+};
+
+/** Specialization [color_space_from = JPEG_RGB, color_space_to = JPEG_YCBCR] */
+template<>
+struct jpeg_color_transform<JPEG_RGB, JPEG_YCBCR> {
+    /**
+     * RGB -> YCbCr transform (8 bit), performed in place
+     *
+     * @param c1  R on entry, Y on return
+     * @param c2  G on entry, Cb on return
+     * @param c3  B on entry, Cr on return
+     */
+    static __device__ void
+    perform(float & c1, float & c2, float & c3) {
+        // Every result is computed from the untouched inputs before anything is written back
+        const float luma = 0.299f * c1 + 0.587f * c2 + 0.114f * c3;
+        const float chroma_blue = -0.1687f * c1 - 0.3313f * c2 + 0.5f * c3 + 128.0f;
+        const float chroma_red = 0.5f * c1 - 0.4187f * c2 - 0.0813f * c3 + 128.0f;
+        c1 = luma;
+        c2 = chroma_blue;
+        c3 = chroma_red;
+    }
+};
+
+/** Specialization [color_space_from = JPEG_YUV, color_space_to = JPEG_YCBCR] */
+template<>
+struct jpeg_color_transform<JPEG_YUV, JPEG_YCBCR> {
+    /**
+     * YUV -> YCbCr transform (8 bit)
+     *
+     * Implemented as a no-op: the components are passed through unchanged.
+     */
+    static __device__ void 
+    perform(float & c1, float & c2, float & c3) {
+        // Do nothing
+    }
+};
+
+/** Specialization [color_space_from = JPEG_YCBCR, color_space_to = JPEG_RGB] */
+template<>
+struct jpeg_color_transform<JPEG_YCBCR, JPEG_RGB> {
+    /**
+     * YCbCr -> RGB transform (8 bit), performed in place
+     *
+     * The results are clamped to the range [0, 255].
+     *
+     * @param c1  Y on entry, R on return
+     * @param c2  Cb on entry, G on return
+     * @param c3  Cr on entry, B on return
+     */
+    static __device__ void 
+    perform(float & c1, float & c2, float & c3) {
+        // Remove the +128 offset from the chroma components (luma has none)
+        float r1 = c1 - 0.0f;
+        float r2 = c2 - 128.0f;
+        float r3 = c3 - 128.0f;
+        // Perform YCbCr -> RGB conversion
+        c1 = (1.0f * r1 + 0.0f * r2 + 1.402f * r3);
+        c2 = (1.0f * r1 - 0.344136f * r2 - 0.714136f * r3);
+        c3 = (1.0f * r1 + 1.772f * r2 + 0.0f * r3);
+        // Check minimum value 0
+        c1 = (c1 >= 0.0f) ? c1 : 0.0f;
+        c2 = (c2 >= 0.0f) ? c2 : 0.0f;
+        c3 = (c3 >= 0.0f) ? c3 : 0.0f;
+        // Check maximum value 255 (float literal keeps the comparison in
+        // single precision; the result is the same as comparing with 255.0)
+        c1 = (c1 <= 255.0f) ? c1 : 255.0f;
+        c2 = (c2 <= 255.0f) ? c2 : 255.0f;
+        c3 = (c3 <= 255.0f) ? c3 : 255.0f;
+    }
+};
+
+/** Specialization [color_space_from = JPEG_YCBCR, color_space_to = JPEG_YUV] */
+template<>
+struct jpeg_color_transform<JPEG_YCBCR, JPEG_YUV> {
+    /**
+     * YCbCr -> YUV transform (8 bit)
+     *
+     * Implemented as a no-op: the components are passed through unchanged.
+     */
+    static __device__ void 
+    perform(float & c1, float & c2, float & c3) {
+        // Do nothing
+    }
+};
+
+/** Threads per block used when launching the preprocessor kernels; the encode kernels also size their shared staging buffers from it */
+#define RGB_8BIT_THREADS 256
+
+/**
+ * Pointer type for the encode kernels, which copy raw image source data
+ * into three separated component buffers
+ *
+ * @param d_c1  First component buffer
+ * @param d_c2  Second component buffer
+ * @param d_c3  Third component buffer
+ * @param d_source  Image source data
+ * @param pixel_count  Number of pixels to copy
+ * @return void
+ */
+typedef void (*jpeg_preprocessor_encode_kernel)(uint8_t* d_c1, uint8_t* d_c2, uint8_t* d_c3, const uint8_t* d_source, int pixel_count);
+ 
+/**
+ * Specialization [sampling factor is 4:4:4]
+ *
+ * Each thread handles the pixel with index blockDim.x * blockIdx.x + threadIdx.x:
+ * it takes three consecutive source bytes, converts them from color_space
+ * to YCbCr and writes one byte into each component buffer.
+ *
+ * The shared buffer is sized from RGB_8BIT_THREADS, so the kernel expects to
+ * be launched with blockDim.x == RGB_8BIT_THREADS.
+ *
+ * NOTE(review): the shared-memory load is not guarded by pixel_count, so a
+ * block covering the image tail reads a full RGB_8BIT_THREADS * 3 bytes of
+ * d_source - confirm the source buffer is padded accordingly.
+ * NOTE(review): both buffers are accessed through int pointers, which
+ * presumably requires 4-byte alignment of d_source and s_data - confirm.
+ */
+template<enum jpeg_color_space color_space>
+__global__ void 
+jpeg_preprocessor_raw_to_comp_kernel_4_4_4(uint8_t* d_c1, uint8_t* d_c2, uint8_t* d_c3, const uint8_t* d_source, int pixel_count)
+{
+    int x  = threadIdx.x;
+    int gX = blockDim.x * blockIdx.x;
+        
+    // Load to shared: the block's source bytes are staged as 4-byte words,
+    // one word per thread, by the threads for which x * 4 is inside the buffer
+    __shared__ unsigned char s_data[RGB_8BIT_THREADS * 3];
+    if ( (x * 4) < RGB_8BIT_THREADS * 3 ) {
+        int* s = (int*)d_source;
+        int* d = (int*)s_data;
+        // (gX * 3) >> 2 is the word index of the block's first source byte
+        d[x] = s[((gX * 3) >> 2) + x];
+    }
+    // All staged words must be visible before any thread reads its pixel
+    __syncthreads();
+
+    // Load the three bytes of this thread's pixel
+    int offset = x * 3;
+    float r1 = (float)(s_data[offset]);
+    float r2 = (float)(s_data[offset + 1]);
+    float r3 = (float)(s_data[offset + 2]);
+    // Color transform
+    jpeg_color_transform<color_space, JPEG_YCBCR>::perform(r1, r2, r3);
+    // Store (threads past the end of the image write nothing)
+    int globalOutputPosition = gX + x;
+    if ( globalOutputPosition < pixel_count ) {
+        d_c1[globalOutputPosition] = (uint8_t)r1;
+        d_c2[globalOutputPosition] = (uint8_t)r2;
+        d_c3[globalOutputPosition] = (uint8_t)r3;
+    }
+}
+
+/**
+ * Specialization [sampling factor is 4:2:2]
+ *
+ * Each thread handles the pixel with index blockDim.x * blockIdx.x + threadIdx.x.
+ * The source holds two bytes per pixel: the second byte of a pixel is its
+ * first component, while the second and third components are shared by an
+ * even pixel and the odd pixel that follows it (this matches a
+ * "c2 c1 c3 c1" byte order per pixel pair - presumably UYVY, confirm).
+ *
+ * The shared buffer is sized from RGB_8BIT_THREADS, so the kernel expects to
+ * be launched with blockDim.x == RGB_8BIT_THREADS.
+ *
+ * NOTE(review): the shared-memory load is not guarded by pixel_count -
+ * confirm the source buffer is padded to a whole block.
+ */
+template<enum jpeg_color_space color_space>
+__global__ void 
+jpeg_preprocessor_raw_to_comp_kernel_4_2_2(uint8_t* d_c1, uint8_t* d_c2, uint8_t* d_c3, const uint8_t* d_source, int pixel_count)
+{
+    int x  = threadIdx.x;
+    int gX = blockDim.x * blockIdx.x;
+        
+    // Load to shared: the block's source bytes are staged as 4-byte words,
+    // one word per thread, by the threads for which x * 4 is inside the buffer
+    __shared__ unsigned char s_data[RGB_8BIT_THREADS * 2];
+    if ( (x * 4) < RGB_8BIT_THREADS * 2 ) {
+        int* s = (int*)d_source;
+        int* d = (int*)s_data;
+        d[x] = s[((gX * 2) >> 2) + x];
+    }
+    // All staged words must be visible before any thread reads its pixel
+    __syncthreads();
+
+    // Load
+    int offset = x * 2;
+    float r1 = (float)(s_data[offset + 1]);
+    float r2;
+    float r3;
+    if ( (gX + x) % 2 == 0 ) {
+        // Even pixel: shared components sit at this pixel's first byte and
+        // at the first byte of the following pixel
+        r2 = (float)(s_data[offset]);
+        r3 = (float)(s_data[offset + 2]);
+    } else {
+        // Odd pixel: reuse the components stored with the preceding even pixel
+        r2 = (float)(s_data[offset - 2]);
+        r3 = (float)(s_data[offset]);
+    }
+    // Color transform
+    jpeg_color_transform<color_space, JPEG_YCBCR>::perform(r1, r2, r3);
+    // Store (threads past the end of the image write nothing)
+    int globalOutputPosition = gX + x;
+    if ( globalOutputPosition < pixel_count ) {
+        d_c1[globalOutputPosition] = (uint8_t)r1;
+        d_c2[globalOutputPosition] = (uint8_t)r2;
+        d_c3[globalOutputPosition] = (uint8_t)r3;
+    }
+}
+
+/**
+ * Pick the encode kernel instantiation matching the encoder's image parameters
+ *
+ * RGB input is only supported with 4:4:4 sampling; YUV input is supported
+ * with 4:4:4 and 4:2:2 sampling. Anything else trips an assertion.
+ *
+ * @param encoder  Encoder structure whose param_image is inspected
+ * @return kernel, or NULL when no kernel matches and assertions are disabled
+ */
+jpeg_preprocessor_encode_kernel
+jpeg_preprocessor_select_encode_kernel(struct jpeg_encoder* encoder)
+{
+    switch ( encoder->param_image.color_space ) {
+    case JPEG_RGB:
+        assert(encoder->param_image.sampling_factor == JPEG_4_4_4);
+        return &jpeg_preprocessor_raw_to_comp_kernel_4_4_4<JPEG_RGB>;
+
+    case JPEG_YUV:
+        switch ( encoder->param_image.sampling_factor ) {
+        case JPEG_4_4_4:
+            return &jpeg_preprocessor_raw_to_comp_kernel_4_4_4<JPEG_YUV>;
+        case JPEG_4_2_2:
+            return &jpeg_preprocessor_raw_to_comp_kernel_4_2_2<JPEG_YUV>;
+        default:
+            // Unknown sampling factor
+            assert(false);
+            break;
+        }
+        break;
+
+    default:
+        // Unknown color space
+        assert(false);
+        break;
+    }
+    return NULL;
+}
+
+/**
+ * Documented at declaration
+ *
+ * Launches the selected encode kernel, which reads encoder->d_data_source and
+ * fills three consecutive planes of pixel_count bytes each in encoder->d_data,
+ * then waits for the device to finish.
+ *
+ * @param encoder  Encoder structure
+ * @return 0 if succeeds, -1 if no kernel supports the image parameters or
+ *         CUDA reports an error
+ */
+int
+jpeg_preprocessor_encode(struct jpeg_encoder* encoder)
+{
+    int pixel_count = encoder->param_image.width * encoder->param_image.height;
+    // One block per RGB_8BIT_THREADS pixels, plus one block for the remainder
+    int alignedSize = (pixel_count / RGB_8BIT_THREADS + 1) * RGB_8BIT_THREADS * 3;
+
+    // Select kernel
+    jpeg_preprocessor_encode_kernel kernel = jpeg_preprocessor_select_encode_kernel(encoder);
+    // The selector returns NULL for unsupported parameters when assertions
+    // are compiled out; never launch through a NULL pointer
+    if ( kernel == NULL ) {
+        fprintf(stderr, "Preprocessor encoding failed: unsupported color space or sampling factor!\n");
+        return -1;
+    }
+
+    // Prepare kernel
+    dim3 threads (RGB_8BIT_THREADS);
+    dim3 grid (alignedSize / (RGB_8BIT_THREADS * 3));
+    assert(alignedSize % (RGB_8BIT_THREADS * 3) == 0);
+
+    // Run kernel
+    uint8_t* d_c1 = &encoder->d_data[0 * pixel_count];
+    uint8_t* d_c2 = &encoder->d_data[1 * pixel_count];
+    uint8_t* d_c3 = &encoder->d_data[2 * pixel_count];
+    kernel<<<grid, threads>>>(d_c1, d_c2, d_c3, encoder->d_data_source, pixel_count);
+    cudaError cuerr = cudaThreadSynchronize();
+    if ( cuerr != cudaSuccess ) {
+        fprintf(stderr, "Preprocessor encoding failed: %s!\n", cudaGetErrorString(cuerr));
+        return -1;
+    }
+
+    return 0;
+}
+
+/**
+ * Pointer type for the decode kernels, which copy three separated component
+ * buffers into target image data
+ *
+ * @param d_c1  First component buffer
+ * @param d_c2  Second component buffer
+ * @param d_c3  Third component buffer
+ * @param d_target  Image target data
+ * @param pixel_count  Number of pixels to copy
+ * @return void
+ */
+typedef void (*jpeg_preprocessor_decode_kernel)(const uint8_t* d_c1, const uint8_t* d_c2, const uint8_t* d_c3, uint8_t* d_target, int pixel_count);
+
+/**
+ * Specialization [sampling factor is 4:4:4]
+ *
+ * Each thread handles one pixel: it reads one byte from each component
+ * buffer, converts the triple from YCbCr to color_space and writes three
+ * consecutive bytes into the target. Threads past pixel_count do nothing.
+ */
+template<enum jpeg_color_space color_space>
+__global__ void
+jpeg_preprocessor_comp_to_raw_kernel_4_4_4(const uint8_t* d_c1, const uint8_t* d_c2, const uint8_t* d_c3, uint8_t* d_target, int pixel_count)
+{
+    const int pixel = blockDim.x * blockIdx.x + threadIdx.x;
+    if ( pixel < pixel_count ) {
+        // Fetch the three components of this pixel
+        float first = (float)(d_c1[pixel]);
+        float second = (float)(d_c2[pixel]);
+        float third = (float)(d_c3[pixel]);
+
+        // Color transform
+        jpeg_color_transform<JPEG_YCBCR, color_space>::perform(first, second, third);
+
+        // Write the interleaved result, three bytes per pixel
+        uint8_t* output = &d_target[pixel * 3];
+        output[0] = (uint8_t)first;
+        output[1] = (uint8_t)second;
+        output[2] = (uint8_t)third;
+    }
+}
+
+/**
+ * Pick the decode kernel instantiation matching the decoder's image parameters
+ *
+ * Both RGB and YUV output are only supported with 4:4:4 sampling; any other
+ * color space trips an assertion.
+ *
+ * @param decoder  Decoder structure whose param_image is inspected
+ * @return kernel, or NULL when the color space is unknown and assertions are disabled
+ */
+jpeg_preprocessor_decode_kernel
+jpeg_preprocessor_select_decode_kernel(struct jpeg_decoder* decoder)
+{
+    switch ( decoder->param_image.color_space ) {
+    case JPEG_RGB:
+        assert(decoder->param_image.sampling_factor == JPEG_4_4_4);
+        return &jpeg_preprocessor_comp_to_raw_kernel_4_4_4<JPEG_RGB>;
+
+    case JPEG_YUV:
+        assert(decoder->param_image.sampling_factor == JPEG_4_4_4);
+        return &jpeg_preprocessor_comp_to_raw_kernel_4_4_4<JPEG_YUV>;
+
+    default:
+        // Unknown color space
+        assert(false);
+        break;
+    }
+    return NULL;
+}
+
+/**
+ * Documented at declaration
+ *
+ * Launches the selected decode kernel, which reads three consecutive planes
+ * of pixel_count bytes each from decoder->d_data and writes the interleaved
+ * result to decoder->d_data_target, then waits for the device to finish.
+ *
+ * @param decoder  Decoder structure
+ * @return 0 if succeeds, -1 if no kernel supports the image parameters or
+ *         CUDA reports an error
+ */
+int
+jpeg_preprocessor_decode(struct jpeg_decoder* decoder)
+{
+    int pixel_count = decoder->param_image.width * decoder->param_image.height;
+    // One block per RGB_8BIT_THREADS pixels, plus one block for the remainder
+    int alignedSize = (pixel_count / RGB_8BIT_THREADS + 1) * RGB_8BIT_THREADS * 3;
+
+    // Select kernel
+    jpeg_preprocessor_decode_kernel kernel = jpeg_preprocessor_select_decode_kernel(decoder);
+    // The selector returns NULL for unsupported parameters when assertions
+    // are compiled out; never launch through a NULL pointer
+    if ( kernel == NULL ) {
+        fprintf(stderr, "Preprocessing decoding failed: unsupported color space or sampling factor!\n");
+        return -1;
+    }
+
+    // Prepare kernel
+    dim3 threads (RGB_8BIT_THREADS);
+    dim3 grid (alignedSize / (RGB_8BIT_THREADS * 3));
+    assert(alignedSize % (RGB_8BIT_THREADS * 3) == 0);
+
+    // Run kernel
+    uint8_t* d_c1 = &decoder->d_data[0 * pixel_count];
+    uint8_t* d_c2 = &decoder->d_data[1 * pixel_count];
+    uint8_t* d_c3 = &decoder->d_data[2 * pixel_count];
+    kernel<<<grid, threads>>>(d_c1, d_c2, d_c3, decoder->d_data_target, pixel_count);
+    cudaError cuerr = cudaThreadSynchronize();
+    if ( cuerr != cudaSuccess ) {
+        fprintf(stderr, "Preprocessing decoding failed: %s!\n", cudaGetErrorString(cuerr));
+        return -1;
+    }
+
+    return 0;
+}
--- a/jpeg_compress/jpeg_preprocessor.h
+++ b/jpeg_compress/jpeg_preprocessor.h
@@ -0,0 +1,61 @@
+/**
+ * Copyright (c) 2011, Martin Srom
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef JPEG_PREPROCESSOR
+#define JPEG_PREPROCESSOR
+
+#include "jpeg_encoder.h"
+#include "jpeg_decoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Preprocessor encode
+ *
+ * Splits the raw source image held by the encoder into three separated
+ * component buffers, converting the components to YCbCr.
+ *
+ * @param encoder  Encoder structure
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_preprocessor_encode(struct jpeg_encoder* encoder);
+
+/**
+ * Preprocessor decode
+ *
+ * Merges the three separated component buffers held by the decoder into
+ * the target image, converting the components from YCbCr.
+ *
+ * @param decoder  Decoder structure
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_preprocessor_decode(struct jpeg_decoder* decoder);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // JPEG_PREPROCESSOR
--- a/jpeg_compress/jpeg_reader.c
+++ b/jpeg_compress/jpeg_reader.c
@@ -0,0 +1,594 @@
+/**
+ * Copyright (c) 2011, Martin Srom
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+ 
+#include "jpeg_reader.h"
+#include "jpeg_decoder.h"
+#include "jpeg_format_type.h"
+#include "jpeg_util.h"
+
+/**
+ * Documented at declaration
+ *
+ * Allocates the reader with malloc(); nothing is initialized.
+ */
+struct jpeg_reader*
+jpeg_reader_create()
+{
+    // malloc() yields NULL on failure, which is exactly what the caller is handed
+    return (struct jpeg_reader*)malloc(sizeof(struct jpeg_reader));
+}
+
+/**
+ * Documented at declaration
+ *
+ * Releases the reader with free(). Passing NULL trips the assertion.
+ * The function always returns 0.
+ */
+int
+jpeg_reader_destroy(struct jpeg_reader* reader)
+{
+    assert(reader != NULL);
+    free(reader);
+    return 0;
+}
+
+/**
+ * Read byte from image data and advance the pointer by one
+ *
+ * The argument is modified, so it must be an lvalue of pointer type
+ * (callers in this file pass *image where image is uint8_t**).
+ * No bounds checking is performed.
+ *
+ * @param image  Pointer to the current read position (advanced by 1)
+ * @return byte
+ */
+#define jpeg_reader_read_byte(image) \
+    (uint8_t)(*(image)++)
+
+/**
+ * Read two-bytes (big-endian) from image data and advance the pointer by two
+ *
+ * The macro is a single expression: the pointer is advanced first and the
+ * two bytes just skipped are then combined, so it can be used safely inside
+ * larger expressions, initializers and unbraced if/else bodies.
+ * The argument is evaluated more than once and modified, so it must be a
+ * side-effect-free lvalue of pointer type (callers pass *image).
+ * No bounds checking is performed.
+ *
+ * @param image  Pointer to the current read position (advanced by 2)
+ * @return 2 bytes
+ */
+#define jpeg_reader_read_2byte(image) \
+    ((image) += 2, (uint16_t)((((image)[-2]) << 8) + ((image)[-1])))
+
+/**
+ * Read marker from image data
+ *
+ * Consumes one byte and expects it to be 0xFF; only then is the second
+ * byte, the marker code, consumed as well.
+ *
+ * @param image  Address of the current read position (advanced by the bytes consumed)
+ * @return marker code, or -1 if the first byte is not 0xFF
+ */
+int
+jpeg_reader_read_marker(uint8_t** image)
+{
+    const uint8_t prefix = jpeg_reader_read_byte(*image);
+    if ( prefix != 0xFF ) {
+        return -1;
+    }
+    return (int)jpeg_reader_read_byte(*image);
+}
+
+/**
+ * Skip marker content (read the length field and step over the rest of the segment)
+ *
+ * @param image  Address of the current read position, located at the length field
+ * @return void
+ */
+void
+jpeg_reader_skip_marker_content(uint8_t** image)
+{
+    // The length field counts its own two bytes, which the read below already consumed
+    int segment_length = (int)jpeg_reader_read_2byte(*image);
+    int remaining = segment_length - 2;
+    *image += remaining;
+}
+
+/**
+ * Read application info block (APP0 / JFIF) from image
+ *
+ * Accepts only a 16 byte segment with the identifier "JFIF\0" and
+ * version 1.1. The remaining fields are skipped without being interpreted.
+ *
+ * @param image  Address of the current read position, located at the length field
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_reader_read_app0(uint8_t** image)
+{
+    int length = (int)jpeg_reader_read_2byte(*image);
+    if ( length != 16 ) {
+        fprintf(stderr, "Error: APP0 marker length should be 16 but %d was presented!\n", length);
+        return -1;
+    }
+
+    // The identifier is five bytes long: 'J' 'F' 'I' 'F' and a terminating zero
+    char jfif[5];
+    for ( int i = 0; i < 5; i++ ) {
+        jfif[i] = (char)jpeg_reader_read_byte(*image);
+    }
+    // Check the terminator first so strcmp never runs over an unterminated buffer;
+    // the message prints at most the four identifier characters for the same reason
+    if ( jfif[4] != '\0' || strcmp(jfif, "JFIF") != 0 ) {
+        fprintf(stderr, "Error: APP0 marker identifier should be 'JFIF' but '%.4s' was presented!\n", jfif);
+        return -1;
+    }
+
+    int version_major = jpeg_reader_read_byte(*image);
+    int version_minor = jpeg_reader_read_byte(*image);
+    if ( version_major != 1 || version_minor != 1 ) {
+        fprintf(stderr, "Error: APP0 marker version should be 1.1 but %d.%d was presented!\n", version_major, version_minor);
+        return -1;
+    }
+
+    // Skip the fields that are not used: pixel units (1), x density (2),
+    // y density (2), thumbnail width (1) and thumbnail height (1)
+    *image += 7;
+
+    return 0;
+}
+
+/**
+ * Read quantization table definition block (DQT) from image
+ *
+ * Exactly one table per segment is accepted: one identifier byte (0 selects
+ * the luminance table, 1 the chrominance table) followed by 64 values that
+ * are stored through jpeg_order_natural. The derived decoder table is then
+ * computed.
+ *
+ * @param decoder  Decoder structure receiving the table
+ * @param image  Address of the current read position, located at the length field
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_reader_read_dqt(struct jpeg_decoder* decoder, uint8_t** image)
+{
+    int segment_length = (int)jpeg_reader_read_2byte(*image);
+    // Payload size without the two bytes of the length field itself
+    const int payload_length = segment_length - 2;
+    if ( payload_length != 65 ) {
+        fprintf(stderr, "Error: DQT marker length should be 65 but %d was presented!\n", payload_length);
+        return -1;
+    }
+
+    // Pick the destination table by its identifier
+    struct jpeg_table_quantization* table = NULL;
+    const int table_id = jpeg_reader_read_byte(*image);
+    switch ( table_id ) {
+    case 0:
+        table = &decoder->table_quantization[JPEG_COMPONENT_LUMINANCE];
+        break;
+    case 1:
+        table = &decoder->table_quantization[JPEG_COMPONENT_CHROMINANCE];
+        break;
+    default:
+        fprintf(stderr, "Error: DQT marker index should be 0 or 1 but %d was presented!\n", table_id);
+        return -1;
+    }
+
+    // Store the 64 values at the positions given by jpeg_order_natural
+    int position = 0;
+    while ( position < 64 ) {
+        table->table_raw[jpeg_order_natural[position]] = jpeg_reader_read_byte(*image);
+        position++;
+    }
+
+    // Prepare quantization table for read raw table
+    jpeg_table_quantization_decoder_compute(table);
+
+    return 0;
+}
+
+/**
+ * Read start of frame block from image
+ *
+ * Reads the segment length, sample precision (must be 8), image height,
+ * image width and component count, initializes the decoder with them and
+ * then validates each component: ids must be 1, 2, 3 in order, sampling
+ * must be 1x1, component 0 must use quantization table 0 and components
+ * 1 and 2 must use quantization table 1.
+ *
+ * @param decoder  Decoder structure, initialized via jpeg_decoder_init()
+ * @param image  Address of the current read position, located at the length field
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_reader_read_sof0(struct jpeg_decoder* decoder, uint8_t** image)
+{    
+    int length = (int)jpeg_reader_read_2byte(*image);
+    // NOTE(review): the check accepts a length of 6 although the message says
+    // "greater than 6", and the fixed fields read below already occupy 8
+    // bytes including the length field - confirm the intended minimum
+    if ( length < 6 ) {
+        fprintf(stderr, "Error: SOF0 marker length should be greater than 6 but %d was presented!\n", length);
+        return -1;
+    }
+    // From here on length tracks the bytes of the segment that are still unread
+    length -= 2;
+
+    int precision = (int)jpeg_reader_read_byte(*image);
+    if ( precision != 8 ) {
+        fprintf(stderr, "Error: SOF0 marker precision should be 8 but %d was presented!\n", precision);
+        return -1;
+    }
+    int height = (int)jpeg_reader_read_2byte(*image);
+    int width = (int)jpeg_reader_read_2byte(*image);
+    int comp_count = (int)jpeg_reader_read_byte(*image);
+    // NOTE(review): any result returned by jpeg_decoder_init() is not examined
+    // here; the two checks below compare the decoder parameters with the
+    // values that were just passed in
+    jpeg_decoder_init(decoder, width, height, comp_count);
+    if ( width != decoder->param_image.width || height != decoder->param_image.height ) {
+        fprintf(stderr, "Error: SOF0 marker image size should be %dx%d but %dx%d was presented!\n", decoder->param_image.width, decoder->param_image.height, width, height);
+        return -1;
+    }
+    if ( comp_count != decoder->param_image.comp_count ) {
+        fprintf(stderr, "Error: SOF0 marker component count should be %d but %d was presented!\n", decoder->param_image.comp_count, comp_count);
+        return -1;
+    }
+    // Precision (1) + height (2) + width (2) + component count (1)
+    length -= 6;
+
+    for ( int comp = 0; comp < comp_count; comp++ ) {
+        int index = (int)jpeg_reader_read_byte(*image);
+        if ( index != (comp + 1) ) {
+            fprintf(stderr, "Error: SOF0 marker component %d id should be %d but %d was presented!\n", comp, comp + 1, index);
+            return -1;
+        }
+        // Horizontal factor is in the high nibble, vertical factor in the low nibble
+        int sampling = (int)jpeg_reader_read_byte(*image);
+        int sampling_h = (sampling >> 4) & 15;
+        int sampling_v = sampling & 15;
+        if ( sampling_h != 1 || sampling_v != 1 ) {
+            fprintf(stderr, "Error: SOF0 marker component %d sampling factor %dx%d is not supported!\n", comp, sampling_h, sampling_v);
+            return -1;
+        }
+        int table_index = (int)jpeg_reader_read_byte(*image);
+        if ( comp == 0 && table_index != 0 ) {
+            fprintf(stderr, "Error: SOF0 marker component Y should have quantization table index 0 but %d was presented!\n", table_index);
+            return -1;
+        }
+        if ( (comp == 1 || comp == 2) && table_index != 1 ) {
+            fprintf(stderr, "Error: SOF0 marker component Cb or Cr should have quantization table index 1 but %d was presented!\n", table_index);
+            return -1;
+        }
+        // Component id (1) + sampling factors (1) + quantization table index (1)
+        length -= 3;
+    }
+    
+    // Check length: skip whatever the segment declares beyond the parsed fields
+    if ( length > 0 ) {
+        fprintf(stderr, "Warning: SOF0 marker contains %d more bytes than needed!\n", length);
+        *image += length;
+    }
+    
+    return 0;
+}
+
+/**
+ * Read huffman table definition block (DHT) from image
+ *
+ * One table per segment is parsed. The identifier byte selects the table:
+ * 0 and 16 are the luminance DC and AC tables, 1 and 17 the chrominance DC
+ * and AC tables. Sixteen code-length counts follow, then as many values as
+ * the counts add up to. Bytes left over in the segment are skipped with a
+ * warning. Finally the derived decoder table is computed.
+ *
+ * @param decoder  Decoder structure receiving the table
+ * @param image  Address of the current read position, located at the length field
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_reader_read_dht(struct jpeg_decoder* decoder, uint8_t** image)
+{
+    int length = (int)jpeg_reader_read_2byte(*image);
+    // From here on length tracks the bytes of the segment that are still unread
+    length -= 2;
+
+    int index = jpeg_reader_read_byte(*image);
+    struct jpeg_table_huffman_decoder* table = NULL;
+    struct jpeg_table_huffman_decoder* d_table = NULL;
+    switch(index) {
+    case 0:
+        table = &decoder->table_huffman[JPEG_COMPONENT_LUMINANCE][JPEG_HUFFMAN_DC];
+        d_table = decoder->d_table_huffman[JPEG_COMPONENT_LUMINANCE][JPEG_HUFFMAN_DC];
+        break;
+    case 16:
+        table = &decoder->table_huffman[JPEG_COMPONENT_LUMINANCE][JPEG_HUFFMAN_AC];
+        d_table = decoder->d_table_huffman[JPEG_COMPONENT_LUMINANCE][JPEG_HUFFMAN_AC];
+        break;
+    case 1:
+        table = &decoder->table_huffman[JPEG_COMPONENT_CHROMINANCE][JPEG_HUFFMAN_DC];
+        d_table = decoder->d_table_huffman[JPEG_COMPONENT_CHROMINANCE][JPEG_HUFFMAN_DC];
+        break;
+    case 17:
+        table = &decoder->table_huffman[JPEG_COMPONENT_CHROMINANCE][JPEG_HUFFMAN_AC];
+        d_table = decoder->d_table_huffman[JPEG_COMPONENT_CHROMINANCE][JPEG_HUFFMAN_AC];
+        break;
+    default:
+        fprintf(stderr, "Error: DHT marker index should be 0, 1, 16 or 17 but %d was presented!\n", index);
+        return -1;
+    }
+    length -= 1;
+
+    // Read in bits[]; the remaining length is checked before every read so
+    // that a truncated segment is never read past its declared end
+    table->bits[0] = 0;
+    int count = 0;
+    for ( int i = 1; i <= 16; i++ ) {
+        if ( length <= 0 ) {
+            fprintf(stderr, "Error: DHT marker unexpected end when reading bit counts!\n");
+            return -1;
+        }
+        table->bits[i] = jpeg_reader_read_byte(*image);
+        count += table->bits[i];
+        length--;
+    }
+
+    // Huffman values are 8-bit symbols, so a valid table defines at most 256
+    // of them; rejecting larger counts keeps the writes below bounded
+    if ( count > 256 ) {
+        fprintf(stderr, "Error: DHT marker defines %d huffman values but at most 256 are allowed!\n", count);
+        return -1;
+    }
+
+    // Read in huffval
+    for ( int i = 0; i < count; i++ ){
+        if ( length <= 0 ) {
+            fprintf(stderr, "Error: DHT marker unexpected end when reading huffman values!\n");
+            return -1;
+        }
+        table->huffval[i] = jpeg_reader_read_byte(*image);
+        length--;
+    }
+
+    // Check length: skip whatever the segment declares beyond the parsed table
+    if ( length > 0 ) {
+        fprintf(stderr, "Warning: DHT marker contains %d more bytes than needed!\n", length);
+        *image += length;
+    }
+
+    // Compute huffman table for read values
+    jpeg_table_huffman_decoder_compute(table, d_table);
+
+    return 0;
+}
+
+/**
+ * Read restart interval block (DRI) from image
+ *
+ * The segment must be exactly 4 bytes long. A restart interval that is
+ * already nonzero in the decoder is never overwritten; that case is
+ * reported as an error.
+ *
+ * @param decoder  Decoder structure receiving the restart interval
+ * @param image  Address of the current read position, located at the length field
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_reader_read_dri(struct jpeg_decoder* decoder, uint8_t** image)
+{
+    int length = (int)jpeg_reader_read_2byte(*image);
+    if ( length != 4 ) {
+        fprintf(stderr, "Error: DRI marker length should be 4 but %d was presented!\n", length);
+        return -1;
+    }
+
+    if ( decoder->restart_interval != 0 ) {
+        // The two parts form a single line, so they need a separating space
+        fprintf(stderr, "Error: DRI marker can't redefine restart interval! ");
+        fprintf(stderr, "This may be caused when more DRI markers are presented which is not supported!\n");
+        return -1;
+    }
+
+    decoder->restart_interval = jpeg_reader_read_2byte(*image);
+
+    return 0;
+}
+
+/**
+ * Read start of scan block from image
+ *
+ * Parses the scan header (only scans with a single component are accepted),
+ * registers a new scan in the decoder and then copies the bytes that follow
+ * into decoder->data_scan until an EOI or SOS marker is found. Restart
+ * markers (RST0..RST7) are removed from the copied data and start a new
+ * segment whose start offset is recorded in decoder->data_scan_index.
+ * On success the read position is left at the terminating marker.
+ *
+ * NOTE(review): the header bytes are read without comparing against
+ * image_end, and no capacity check for data_scan or data_scan_index is
+ * visible here - confirm the buffers are sized by the caller.
+ *
+ * @param decoder  Decoder structure receiving the scan data
+ * @param image  Address of the current read position, located at the length field
+ * @param image_end  End of the image data (one past the last byte)
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_reader_read_sos(struct jpeg_decoder* decoder, uint8_t** image, uint8_t* image_end)
+{    
+    // The length is read to advance the position; its value is not used afterwards
+    int length = (int)jpeg_reader_read_2byte(*image);
+    length -= 2;
+    
+    int comp_count = (int)jpeg_reader_read_byte(*image);
+    if ( comp_count != 1 ) {
+        fprintf(stderr, "Error: SOS marker component count %d is not supported!\n", comp_count);
+        return -1;
+    }
+    
+    // Collect the component-spec parameters
+    for ( int comp = 0; comp < comp_count; comp++ ) 
+    {
+        int index = (int)jpeg_reader_read_byte(*image);
+        // DC table selector is in the high nibble, AC table selector in the low nibble
+        int table = (int)jpeg_reader_read_byte(*image);
+        int table_dc = (table >> 4) & 15;
+        int table_ac = table & 15;
+        
+        if ( index == 1 && (table_ac != 0 || table_dc != 0) ) {
+            fprintf(stderr, "Error: SOS marker for Y should have huffman tables 0,0 but %d,%d was presented!\n", table_dc, table_ac);
+            return -1;
+        }
+        if ( (index == 2 || index == 3) && (table_ac != 1 || table_dc != 1) ) {
+            fprintf(stderr, "Error: SOS marker for Cb or Cr should have huffman tables 1,1 but %d,%d was presented!\n", table_dc, table_ac);
+            return -1;
+        }
+    }
+
+    // Collect the additional scan parameters Ss, Se, Ah/Al.
+    // They are consumed from the stream but not used afterwards.
+    int Ss = (int)jpeg_reader_read_byte(*image);
+    int Se = (int)jpeg_reader_read_byte(*image);
+    int Ax = (int)jpeg_reader_read_byte(*image);
+    int Ah = (Ax >> 4) & 15;
+    int Al = (Ax) & 15;
+    
+    // Check maximum scan count
+    if ( decoder->scan_count >= 3 ) {
+        fprintf(stderr, "Error: SOS marker reached maximum number of scans (3)!\n");
+        return -1;
+    }
+    // Get scan structure
+    struct jpeg_decoder_scan* scan = &decoder->scan[decoder->scan_count];
+    decoder->scan_count++;
+    
+    // Scan segments begin at the end of previous scan segments or from zero index
+    scan->segment_index = decoder->segment_count;
+    // Every scan has first segment
+    scan->segment_count = 0;
+    decoder->data_scan_index[scan->segment_index + scan->segment_count] = decoder->data_scan_size;
+    scan->segment_count++;
+    // Read scan data: every byte is copied first and markers are detected
+    // afterwards by looking at the previous byte
+    uint8_t byte = 0;
+    uint8_t byte_previous = 0;
+    do {
+        byte_previous = byte;
+        byte = jpeg_reader_read_byte(*image);
+        decoder->data_scan[decoder->data_scan_size] = byte;
+        //printf("set byte %d = 0x%X\n", &decoder->data_scan[decoder->data_scan_size], (unsigned char)byte);
+        decoder->data_scan_size++;        
+        
+        // Check markers
+        if ( byte_previous == 0xFF ) {
+            // Check restart marker
+            if ( byte >= JPEG_MARKER_RST0 && byte <= JPEG_MARKER_RST7 ) {
+                // Drop the two marker bytes that were already copied
+                decoder->data_scan_size -= 2;
+                
+                // Set data start index for next scan segment
+                decoder->data_scan_index[scan->segment_index + scan->segment_count] = decoder->data_scan_size;
+                scan->segment_count++;
+                //printf("restart marker 0x%X (revert to %d)\n", (unsigned char)byte, &decoder->data_scan[decoder->data_scan_size]);
+            }
+            // Check scan end
+            else if ( byte == JPEG_MARKER_EOI || byte == JPEG_MARKER_SOS ) {
+                // Rewind so that the caller reads the marker itself, and
+                // drop the two marker bytes that were already copied
+                *image -= 2;
+                decoder->data_scan_size -= 2;
+                
+                // Add scan segment count to decoder segment count
+                decoder->segment_count += scan->segment_count;
+                
+                //printf("end marker 0x%X (revert to %d)\n", (unsigned char)byte, &decoder->data_scan[decoder->data_scan_size]);
+                return 0;
+            }
+        }
+    } while( *image < image_end );
+    
+    // The data ran out before a terminating marker was found
+    fprintf(stderr, "Error: JPEG data unexpected ended while reading SOS marker!\n");
+    
+    return -1;
+}
+
+/**
+ * Documented at declaration
+ *
+ * Resets the scan state of the decoder, checks that the data begins with an
+ * SOI marker and then dispatches on every following marker until EOI is
+ * found. Unsupported but skippable segments (APP1..APP15, COM, DAC, DNL)
+ * only produce a warning; unsupported frame types and unknown markers are
+ * errors. Reading stops with an error when the data ends before EOI.
+ *
+ * @param decoder  Decoder structure filled from the image data
+ * @param image  Image data
+ * @param image_size  Image data size in bytes
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_reader_read_image(struct jpeg_decoder* decoder, uint8_t* image, int image_size)
+{
+    // Setup decoder
+    decoder->scan_count = 0;
+    decoder->data_scan_size = 0;
+    decoder->segment_count = 0; // Total segment count for all scans
+    decoder->restart_interval = 0;
+
+    // A marker takes two bytes, so anything shorter cannot even hold SOI
+    if ( image == NULL || image_size < 2 ) {
+        fprintf(stderr, "Error: JPEG data is too short to contain SOI marker!\n");
+        return -1;
+    }
+
+    // Get image end
+    uint8_t* image_end = image + image_size;
+
+    // Check first SOI marker
+    int marker_soi = jpeg_reader_read_marker(&image);
+    if ( marker_soi != JPEG_MARKER_SOI ) {
+        fprintf(stderr, "Error: JPEG data should begin with SOI marker, but marker %s was found!\n", jpeg_marker_name(marker_soi));
+        return -1;
+    }
+
+    // Keep reading markers while EOI is missing and at least one whole
+    // marker (two bytes) is left in the data
+    int eoi_presented = 0;
+    while ( eoi_presented == 0 && image_end - image >= 2 ) {
+        // Read marker
+        int marker = jpeg_reader_read_marker(&image);
+
+        // Read more info according to the marker
+        // the order of cases is in jpg file made by ms paint
+        switch (marker)
+        {
+        case JPEG_MARKER_APP0:
+            if ( jpeg_reader_read_app0(&image) != 0 )
+                return -1;
+            break;
+        case JPEG_MARKER_APP1:
+        case JPEG_MARKER_APP2:
+        case JPEG_MARKER_APP3:
+        case JPEG_MARKER_APP4:
+        case JPEG_MARKER_APP5:
+        case JPEG_MARKER_APP6:
+        case JPEG_MARKER_APP7:
+        case JPEG_MARKER_APP8:
+        case JPEG_MARKER_APP9:
+        case JPEG_MARKER_APP10:
+        case JPEG_MARKER_APP11:
+        case JPEG_MARKER_APP12:
+        case JPEG_MARKER_APP13:
+        case JPEG_MARKER_APP14:
+        case JPEG_MARKER_APP15:
+            fprintf(stderr, "Warning: JPEG data contains not supported %s marker\n", jpeg_marker_name(marker));
+            jpeg_reader_skip_marker_content(&image);
+            break;
+
+        case JPEG_MARKER_DQT:
+            if ( jpeg_reader_read_dqt(decoder, &image) != 0 )
+                return -1;
+            break;
+
+        case JPEG_MARKER_SOF0:
+            // Baseline
+            if ( jpeg_reader_read_sof0(decoder, &image) != 0 )
+                return -1;
+            break;
+        case JPEG_MARKER_SOF1:
+            // Extended sequential with Huffman coder
+            fprintf(stderr, "Warning: Reading SOF1 as it was SOF0 marker (should work but verify it)!\n");
+            if ( jpeg_reader_read_sof0(decoder, &image) != 0 )
+                return -1;
+            break;
+        case JPEG_MARKER_SOF2:
+            fprintf(stderr, "Error: Marker SOF2 (Progressive with Huffman coding) is not supported!\n");
+            return -1;
+        case JPEG_MARKER_SOF3:
+            fprintf(stderr, "Error: Marker SOF3 (Lossless with Huffman coding) is not supported!\n");
+            return -1;
+        case JPEG_MARKER_SOF5:
+            fprintf(stderr, "Error: Marker SOF5 (Differential sequential with Huffman coding) is not supported!\n");
+            return -1;
+        case JPEG_MARKER_SOF6:
+            fprintf(stderr, "Error: Marker SOF6 (Differential progressive with Huffman coding) is not supported!\n");
+            return -1;
+        case JPEG_MARKER_SOF7:
+            fprintf(stderr, "Error: Marker SOF7 (Extended lossless with Arithmetic coding) is not supported!\n");
+            return -1;
+        case JPEG_MARKER_JPG:
+            fprintf(stderr, "Error: Marker JPG (Reserved for JPEG extensions ) is not supported!\n");
+            return -1;
+        case JPEG_MARKER_SOF10:
+            fprintf(stderr, "Error: Marker SOF10 (Progressive with Arithmetic coding) is not supported!\n");
+            return -1;
+        case JPEG_MARKER_SOF11:
+            fprintf(stderr, "Error: Marker SOF11 (Lossless with Arithmetic coding) is not supported!\n");
+            return -1;
+        case JPEG_MARKER_SOF13:
+            fprintf(stderr, "Error: Marker SOF13 (Differential sequential with Arithmetic coding) is not supported!\n");
+            return -1;
+        case JPEG_MARKER_SOF14:
+            fprintf(stderr, "Error: Marker SOF14 (Differential progressive with Arithmetic coding) is not supported!\n");
+            return -1;
+        case JPEG_MARKER_SOF15:
+            fprintf(stderr, "Error: Marker SOF15 (Differential lossless with Arithmetic coding) is not supported!\n");
+            return -1;
+
+        case JPEG_MARKER_DHT:
+            if ( jpeg_reader_read_dht(decoder, &image) != 0 )
+                return -1;
+            break;
+
+        case JPEG_MARKER_DRI:
+            if ( jpeg_reader_read_dri(decoder, &image) != 0 )
+                return -1;
+            break;
+
+        case JPEG_MARKER_SOS:
+            if ( jpeg_reader_read_sos(decoder, &image, image_end) != 0 )
+                return -1;
+            break;
+
+        case JPEG_MARKER_EOI:
+            eoi_presented = 1;
+            break;
+
+        case JPEG_MARKER_COM:
+        case JPEG_MARKER_DAC:
+        case JPEG_MARKER_DNL:
+            fprintf(stderr, "Warning: JPEG data contains not supported %s marker\n", jpeg_marker_name(marker));
+            jpeg_reader_skip_marker_content(&image);
+            break;
+
+        default:
+            // Reading stops here, so there is no point in skipping the
+            // content of a segment whose layout is unknown
+            fprintf(stderr, "Error: JPEG data contains not supported %s marker!\n", jpeg_marker_name(marker));
+            return -1;
+        }
+    }
+
+    // Check EOI marker (reached without EOI when the data ran out)
+    if ( eoi_presented == 0 ) {
+        fprintf(stderr, "Error: JPEG data should end with EOI marker!\n");
+        return -1;
+    }
+
+    return 0;
+}
--- a/jpeg_compress/jpeg_reader.h
+++ b/jpeg_compress/jpeg_reader.h
@@ -0,0 +1,67 @@
+/**
+ * Copyright (c) 2011, Martin Srom
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef JPEG_READER
+#define JPEG_READER
+
+#include <stdint.h>
+
+/** JPEG decoder structure predeclaration */
+struct jpeg_decoder;
+
+/**
+ * JPEG reader structure
+ *
+ * Currently has no members; it only serves as the handle type returned by
+ * jpeg_reader_create() and released by jpeg_reader_destroy().
+ * NOTE(review): a struct without members is a GNU C extension rather than
+ * standard C - confirm that every supported compiler accepts it.
+ */
+struct jpeg_reader
+{
+};
+
+/**
+ * Create JPEG reader
+ * 
+ * @return reader structure if succeeds, otherwise NULL
+ */
+struct jpeg_reader*
+jpeg_reader_create();
+
+/**
+ * Destroy JPEG reader
+ * 
+ * @param reader  Reader structure
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_reader_destroy(struct jpeg_reader* reader);
+
+/**
+ * Read JPEG image from data buffer
+ * 
+ * @param decoder  Decoder structure
+ * @param image  Image data
+ * @param image_size  Image data size
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_reader_read_image(struct jpeg_decoder* decoder, uint8_t* image, int image_size);
+
+#endif // JPEG_READER
--- a/jpeg_compress/jpeg_table.c
+++ b/jpeg_compress/jpeg_table.c
@@ -0,0 +1,352 @@
+/**
+ * Copyright (c) 2011, Martin Srom
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+ 
+#include "jpeg_table.h"
+#include "jpeg_util.h"
+
+/** Documented at declaration */
+int
+jpeg_table_quantization_encoder_init(struct jpeg_table_quantization* table, enum jpeg_component_type type, int quality)
+{
+    // Load the default raw table for the component type, then let NPP
+    // initialize it for the requested quality
+    nppiSetDefaultQuantTable(table->table_raw, (int)type);
+    nppiQuantFwdRawTableInit_JPEG_8u(table->table_raw, quality);
+
+    // Derive the forward (encoder) table from the raw table
+    nppiQuantFwdTableInit_JPEG_8u16u(table->table_raw, table->table);
+
+    // Upload the 64 forward table entries to device memory
+    cudaError_t status = cudaMemcpy(table->d_table, table->table, 64 * sizeof(uint16_t), cudaMemcpyHostToDevice);
+    return ( status == cudaSuccess ) ? 0 : -1;
+}
+
+/** Documented at declaration */
+int
+jpeg_table_quantization_decoder_init(struct jpeg_table_quantization* table, enum jpeg_component_type type, int quality)
+{
+    // Load the default raw table for the component type, then let NPP
+    // initialize it for the requested quality
+    nppiSetDefaultQuantTable(table->table_raw, (int)type);
+    nppiQuantFwdRawTableInit_JPEG_8u(table->table_raw, quality);
+
+    // Derive the inverse (decoder) table from the raw table
+    nppiQuantInvTableInit_JPEG_8u16u(table->table_raw, table->table);
+
+    // Upload the 64 inverse table entries to device memory
+    cudaError_t status = cudaMemcpy(table->d_table, table->table, 64 * sizeof(uint16_t), cudaMemcpyHostToDevice);
+    return ( status == cudaSuccess ) ? 0 : -1;
+}
+
+/** Documented at declaration */
+int
+jpeg_table_quantization_decoder_compute(struct jpeg_table_quantization* table)
+{
+    // The raw table is expected to be filled in already; only the inverse
+    // table is derived from it here
+    nppiQuantInvTableInit_JPEG_8u16u(table->table_raw, table->table);
+
+    // Upload the 64 inverse table entries to device memory
+    cudaError_t status = cudaMemcpy(table->d_table, table->table, 64 * sizeof(uint16_t), cudaMemcpyHostToDevice);
+    return ( status == cudaSuccess ) ? 0 : -1;
+}
+
+/** Documented at declaration */
+void
+jpeg_table_quantization_print(struct jpeg_table_quantization* table)
+{
+    // Both tables hold 64 entries and are printed as 8 rows of 8 values;
+    // the line is ended after every eighth entry
+    puts("Raw Table (with quality):");
+    for ( int index = 0; index < 64; index++ ) {
+        printf("%4u", table->table_raw[index]);
+        if ( index % 8 == 7 )
+            puts("");
+    }
+
+    puts("Forward/Inverse Table:");
+    for ( int index = 0; index < 64; index++ ) {
+        printf("%6u", table->table[index]);
+        if ( index % 8 == 7 )
+            puts("");
+    }
+}
+
+/**
+ * Huffman Table  DC for Y component
+ *
+ * bits[k] is the number of codes that are k bits long (bits[0] is unused);
+ * the entries add up to 12, which is the number of symbols in the value array.
+ * The values appear to be the typical tables of ITU-T T.81 Annex K.3.
+ */
+static unsigned char jpeg_table_huffman_y_dc_bits[17] = {
+    0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 
+};
+// Symbols in order of increasing code length. The array holds only 12 bytes,
+// so copy it with sizeof of this array, not of a 256-byte destination.
+static unsigned char jpeg_table_huffman_y_dc_value[] = { 
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 
+};
+/**
+ * Huffman Table  DC for Cb or Cr component
+ *
+ * Same layout as the Y table above: 17 code-length counts (index 0 unused)
+ * that add up to the 12 symbols of the value array.
+ */
+static unsigned char jpeg_table_huffman_cbcr_dc_bits[17] = { 
+    0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 
+};
+// Symbols in order of increasing code length (12 bytes only, see above)
+static unsigned char jpeg_table_huffman_cbcr_dc_value[] = { 
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 
+};
+/**
+ * Huffman Table  AC for Y component
+ *
+ * bits[k] is the number of codes that are k bits long (bits[0] is unused);
+ * the entries add up to 162 (0x7d = 125 of them are 16 bits long), which is
+ * the number of symbols in the value array.
+ * The values appear to be the typical tables of ITU-T T.81 Annex K.3.
+ */
+static unsigned char jpeg_table_huffman_y_ac_bits[17] = { 
+    0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0x7d 
+};
+// Symbols in order of increasing code length. The array holds 162 bytes,
+// so copy it with sizeof of this array, not of a 256-byte destination.
+static unsigned char jpeg_table_huffman_y_ac_value[] = { 
+    0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
+    0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
+    0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
+    0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
+    0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
+    0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
+    0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
+    0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
+    0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
+    0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+    0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+    0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
+    0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
+    0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+    0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
+    0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
+    0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
+    0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
+    0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
+    0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+    0xf9, 0xfa 
+};
+/**
+ * Huffman Table AC for Cb or Cr component
+ *
+ * bits[k] is the number of codes that are k bits long (bits[0] is unused);
+ * the entries add up to 162 (0x77 = 119 of them are 16 bits long), which is
+ * the number of symbols in the value array.
+ * The values appear to be the typical tables of ITU-T T.81 Annex K.3.
+ */
+static unsigned char jpeg_table_huffman_cbcr_ac_bits[17] = { 
+    0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0x77 
+};
+// Symbols in order of increasing code length. The array holds 162 bytes,
+// so copy it with sizeof of this array, not of a 256-byte destination.
+static unsigned char jpeg_table_huffman_cbcr_ac_value[] = { 
+    0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
+    0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
+    0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
+    0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
+    0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
+    0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
+    0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
+    0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
+    0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
+    0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
+    0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
+    0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+    0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
+    0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
+    0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
+    0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
+    0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
+    0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
+    0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
+    0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+    0xf9, 0xfa 
+};
+
+/** 
+ * Build the per-symbol code and size lookup of an encoder huffman table from
+ * its bits and huffval arrays (both must already be set in table)
+ * 
+ * @param table  Table structure
+ * @return void
+ */
+void
+jpeg_table_huffman_encoder_compute(struct jpeg_table_huffman_encoder* table)
+{
+    char code_size[257];
+    unsigned int code_value[257];
+
+    // Figure C.1: list the code length of every symbol, shortest codes first
+    int count = 0;
+    for ( int length = 1; length <= 16; length++ ) {
+        int symbols = (int) table->bits[length];
+        for ( int n = 0; n < symbols; n++ )
+            code_size[count++] = (char) length;
+    }
+    // A zero length terminates the list
+    code_size[count] = 0;
+
+    // Figure C.2: hand out consecutive codes within each length and append
+    // a zero bit every time the length grows by one
+    unsigned int next_code = 0;
+    int current_size = code_size[0];
+    int index = 0;
+    while ( code_size[index] != 0 ) {
+        for ( ; ((int) code_size[index]) == current_size; index++ )
+            code_value[index] = next_code++;
+        next_code <<= 1;
+        current_size++;
+    }
+
+    // Figure C.3: re-index code and size by symbol value.
+    // Symbols without a code keep size 0, which allows EmitBits to detect
+    // any attempt to emit such a symbol.
+    memset(table->size, 0, sizeof(table->size));
+    for ( index = 0; index < count; index++ ) {
+        unsigned char symbol = table->huffval[index];
+        table->code[symbol] = code_value[index];
+        table->size[symbol] = code_size[index];
+    }
+}
+
+/** Documented at declaration */
+int
+jpeg_table_huffman_encoder_init(struct jpeg_table_huffman_encoder* table, struct jpeg_table_huffman_encoder* d_table, enum jpeg_component_type comp_type, enum jpeg_huffman_type huff_type)
+{
+    assert(comp_type == JPEG_COMPONENT_LUMINANCE || comp_type == JPEG_COMPONENT_CHROMINANCE);
+    assert(huff_type == JPEG_HUFFMAN_DC || huff_type == JPEG_HUFFMAN_AC);
+
+    // Select the default bits/values arrays for the requested table.
+    // The value arrays are shorter than table->huffval (12 or 162 bytes
+    // versus 256), so their real size is recorded alongside the pointer.
+    const unsigned char* bits = NULL;
+    const unsigned char* values = NULL;
+    size_t values_size = 0;
+    if ( comp_type == JPEG_COMPONENT_LUMINANCE ) {
+        if ( huff_type == JPEG_HUFFMAN_DC ) {
+            bits = jpeg_table_huffman_y_dc_bits;
+            values = jpeg_table_huffman_y_dc_value;
+            values_size = sizeof(jpeg_table_huffman_y_dc_value);
+        } else {
+            bits = jpeg_table_huffman_y_ac_bits;
+            values = jpeg_table_huffman_y_ac_value;
+            values_size = sizeof(jpeg_table_huffman_y_ac_value);
+        }
+    } else if ( comp_type == JPEG_COMPONENT_CHROMINANCE ) {
+        if ( huff_type == JPEG_HUFFMAN_DC ) {
+            bits = jpeg_table_huffman_cbcr_dc_bits;
+            values = jpeg_table_huffman_cbcr_dc_value;
+            values_size = sizeof(jpeg_table_huffman_cbcr_dc_value);
+        } else {
+            bits = jpeg_table_huffman_cbcr_ac_bits;
+            values = jpeg_table_huffman_cbcr_ac_value;
+            values_size = sizeof(jpeg_table_huffman_cbcr_ac_value);
+        }
+    } else {
+        // Only reachable when assertions are compiled out
+        return -1;
+    }
+
+    // Every bits array has exactly the 17 entries of table->bits
+    memcpy(table->bits, bits, sizeof(table->bits));
+    // Copy only the bytes the source array really has (copying
+    // sizeof(table->huffval) would read past its end) and zero the rest
+    memset(table->huffval, 0, sizeof(table->huffval));
+    memcpy(table->huffval, values, values_size);
+
+    jpeg_table_huffman_encoder_compute(table);
+    
+    // Copy table to device memory
+    if ( cudaSuccess != cudaMemcpy(d_table, table, sizeof(struct jpeg_table_huffman_encoder), cudaMemcpyHostToDevice) )
+        return -1;
+        
+    return 0;
+}
+
+/** Documented at declaration */
+int
+jpeg_table_huffman_decoder_init(struct jpeg_table_huffman_decoder* table, struct jpeg_table_huffman_decoder* d_table, enum jpeg_component_type comp_type, enum jpeg_huffman_type huff_type)
+{
+    assert(comp_type == JPEG_COMPONENT_LUMINANCE || comp_type == JPEG_COMPONENT_CHROMINANCE);
+    assert(huff_type == JPEG_HUFFMAN_DC || huff_type == JPEG_HUFFMAN_AC);
+
+    // Select the default bits/values arrays for the requested table.
+    // The value arrays are shorter than table->huffval (12 or 162 bytes
+    // versus 256), so their real size is recorded alongside the pointer.
+    const unsigned char* bits = NULL;
+    const unsigned char* values = NULL;
+    size_t values_size = 0;
+    if ( comp_type == JPEG_COMPONENT_LUMINANCE ) {
+        if ( huff_type == JPEG_HUFFMAN_DC ) {
+            bits = jpeg_table_huffman_y_dc_bits;
+            values = jpeg_table_huffman_y_dc_value;
+            values_size = sizeof(jpeg_table_huffman_y_dc_value);
+        } else {
+            bits = jpeg_table_huffman_y_ac_bits;
+            values = jpeg_table_huffman_y_ac_value;
+            values_size = sizeof(jpeg_table_huffman_y_ac_value);
+        }
+    } else if ( comp_type == JPEG_COMPONENT_CHROMINANCE ) {
+        if ( huff_type == JPEG_HUFFMAN_DC ) {
+            bits = jpeg_table_huffman_cbcr_dc_bits;
+            values = jpeg_table_huffman_cbcr_dc_value;
+            values_size = sizeof(jpeg_table_huffman_cbcr_dc_value);
+        } else {
+            bits = jpeg_table_huffman_cbcr_ac_bits;
+            values = jpeg_table_huffman_cbcr_ac_value;
+            values_size = sizeof(jpeg_table_huffman_cbcr_ac_value);
+        }
+    } else {
+        // Only reachable when assertions are compiled out
+        return -1;
+    }
+
+    // Every bits array has exactly the 17 entries of table->bits
+    memcpy(table->bits, bits, sizeof(table->bits));
+    // Copy only the bytes the source array really has (copying
+    // sizeof(table->huffval) would read past its end) and zero the rest
+    memset(table->huffval, 0, sizeof(table->huffval));
+    memcpy(table->huffval, values, values_size);
+
+    // Builds the decoding tables and copies them to d_table; the compute
+    // routine returns void, so a failed device copy is not visible here
+    jpeg_table_huffman_decoder_compute(table, d_table);
+        
+    return 0;
+}
+
+/** Documented at declaration */
+void
+jpeg_table_huffman_decoder_compute(struct jpeg_table_huffman_decoder* table, struct jpeg_table_huffman_decoder* d_table)
+{
+    // Figure C.1: make table of Huffman code length for each symbol
+    // Note that this is in code-length order.
+    char huffsize[257];
+    int p = 0;
+    for ( int l = 1; l <= 16; l++ ) {
+        // bits[] presumably comes from a DHT segment of the input file, so
+        // it cannot be trusted: more than 256 codes would overflow
+        // huffsize/huffcode on the stack
+        if ( p + (int) table->bits[l] > 256 ) {
+            fprintf(stderr, "Error: Huffman table contains more than 256 codes!\n");
+            return;
+        }
+        for ( int i = 1; i <= (int) table->bits[l]; i++ )
+            huffsize[p++] = (char) l;
+    }
+    huffsize[p] = 0;
+
+    // Figure C.2: generate the codes themselves
+    // Note that this is in code-length order.
+    unsigned int huffcode[257];
+    unsigned int code = 0;
+    int si = huffsize[0];
+    p = 0;
+    while ( huffsize[p] ) {
+        while ( ((int) huffsize[p]) == si ) {
+            huffcode[p++] = code;
+            code++;
+        }
+        // code is now one more than the last code of length si, so it must
+        // not exceed 2^si; otherwise the table is over-subscribed and the
+        // lookahead fill below would write outside look_nbits/look_sym
+        if ( code > (1u << si) ) {
+            fprintf(stderr, "Error: Huffman table contains codes that do not fit their length!\n");
+            return;
+        }
+        code <<= 1;
+        si++;
+    }
+
+    // Figure F.15: generate decoding tables for bit-sequential decoding
+    p = 0;
+    for ( int l = 1; l <= 16; l++ ) {
+        if ( table->bits[l] ) {
+            table->valptr[l] = p; // huffval[] index of 1st symbol of code length l
+            table->mincode[l] = huffcode[p]; // minimum code of length l
+            p += table->bits[l];
+            table->maxcode[l] = huffcode[p-1]; // maximum code of length l
+        } else {
+            table->maxcode[l] = -1; // -1 if no codes of this length
+        }
+    }
+    // Ensures jpeg_huff_decode terminates
+    table->maxcode[17] = 0xFFFFFL;
+
+    // Compute lookahead tables to speed up decoding.
+    // First we set all the table entries to 0, indicating "too long";
+    // then we iterate through the Huffman codes that are short enough and
+    // fill in all the entries that correspond to bit sequences starting
+    // with that code.
+    memset(table->look_nbits, 0, sizeof(table->look_nbits));
+
+    const int HUFF_LOOKAHEAD = 8;
+    p = 0;
+    for ( int l = 1; l <= HUFF_LOOKAHEAD; l++ ) {
+        for ( int i = 1; i <= (int) table->bits[l]; i++, p++ ) {
+            // l = current code's length, 
+            // p = its index in huffcode[] & huffval[]. Generate left-justified
+            // code followed by all possible bit sequences
+            int lookbits = huffcode[p] << (HUFF_LOOKAHEAD - l);
+            for ( int ctr = 1 << (HUFF_LOOKAHEAD - l); ctr > 0; ctr-- ) {
+                table->look_nbits[lookbits] = l;
+                table->look_sym[lookbits] = table->huffval[p];
+                lookbits++;
+            }
+        }
+    }
+    
+    // Copy table to device memory. The function returns void, so a failure
+    // can only be reported, not propagated.
+    if ( cudaSuccess != cudaMemcpy(d_table, table, sizeof(struct jpeg_table_huffman_decoder), cudaMemcpyHostToDevice) )
+        fprintf(stderr, "Error: Failed to copy huffman decoder table to device memory!\n");
+}
--- a/jpeg_compress/jpeg_table.h
+++ b/jpeg_compress/jpeg_table.h
@@ -0,0 +1,164 @@
+/**
+ * Copyright (c) 2011, Martin Srom
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef JPEG_TABLE
+#define JPEG_TABLE
+
+#include "jpeg_type.h"
+
+/**
+ * JPEG natural order from zigzag order
+ *
+ * jpeg_order_natural[i] is the natural (row-major) index inside the 8x8 block
+ * of the i-th coefficient in zigzag order, e.g. entry 2 is 8 (row 1, column 0).
+ * Because the array is static and defined in a header, every source file that
+ * includes this header gets its own copy.
+ */
+static const int jpeg_order_natural[64] = {
+     0,  1,  8, 16,  9,  2,  3, 10,
+    17, 24, 32, 25, 18, 11,  4,  5,
+    12, 19, 26, 33, 40, 48, 41, 34,
+    27, 20, 13,  6,  7, 14, 21, 28,
+    35, 42, 49, 56, 57, 50, 43, 36,
+    29, 22, 15, 23, 30, 37, 44, 51,
+    58, 59, 52, 45, 38, 31, 39, 46,
+    53, 60, 61, 54, 47, 55, 62, 63
+};
+
+/**
+ * JPEG quantization table structure
+ *
+ * Holds one 8x8 quantization table (64 entries) in three forms.
+ */
+struct jpeg_table_quantization
+{
+    // Quantization raw table (8-bit values; this is what the DQT writer emits)
+    uint8_t table_raw[64];
+    // Quantization forward/inverse table derived from table_raw
+    // (forward for the encoder, inverse for the decoder)
+    uint16_t table[64];
+    // Quantization forward/inverse table in device memory; the init/compute
+    // functions in jpeg_table.c copy 64 uint16_t values into it but do not
+    // allocate it
+    uint16_t* d_table;
+};
+
+/**
+ * JPEG table for huffman encoding
+ *
+ * code and size are indexed by symbol value and are derived from bits and
+ * huffval by jpeg_table_huffman_encoder_compute().
+ */
+struct jpeg_table_huffman_encoder {
+    // Code for each symbol 
+    unsigned int code[256];	
+    // Length of code for each symbol 
+	char size[256];
+    // If no code has been allocated for a symbol S, size[S] is 0 
+
+	// These two fields directly represent the contents of a JPEG DHT marker
+    // bits[k] = # of symbols with codes of length k bits; bits[0] is unused
+    unsigned char bits[17];
+    // The symbols, in order of incr code length
+    // (only the first sum(bits[1..16]) entries are meaningful)
+    unsigned char huffval[256];
+};
+
+/**
+ * JPEG table for huffman decoding
+ *
+ * Everything except bits and huffval is derived from those two fields by
+ * jpeg_table_huffman_decoder_compute().
+ */
+struct jpeg_table_huffman_decoder {
+    // Smallest code of length k (only set for lengths that have codes)
+    int mincode[17]; 
+    // Largest code of length k (-1 if none);
+    // entry 17 is a sentinel that ensures the decoding loop terminates
+	int maxcode[18];
+    // Huffval[] index of 1st symbol of length k (only set for lengths that have codes)
+	int valptr[17];
+    // Lookahead table indexed by 8 bits: # bits of the code they start with,
+    // or 0 if too long
+    int look_nbits[256];
+    // Symbol for the same 8-bit index, or unused
+	unsigned char look_sym[256];
+    
+    // These two fields directly represent the contents of a JPEG DHT marker
+    // bits[k] = # of symbols with codes of length k bits; bits[0] is unused
+	unsigned char bits[17];
+    // The symbols, in order of incr code length 
+	unsigned char huffval[256];
+};
+
+/**
+ * Init JPEG quantization table for encoder
+ * 
+ * @param table  Table structure
+ * @param type  Type of component for table
+ * @param quality  Quality (0-100)
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_table_quantization_encoder_init(struct jpeg_table_quantization* table, enum jpeg_component_type type, int quality);
+
+/**
+ * Init JPEG quantization table for decoder
+ * 
+ * @param table  Table structure
+ * @param type  Type of component for table
+ * @param quality  Quality (0-100)
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_table_quantization_decoder_init(struct jpeg_table_quantization* table, enum jpeg_component_type type, int quality);
+
+/**
+ * Compute JPEG quantization table for decoder
+ * 
+ * @param table  Table structure
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_table_quantization_decoder_compute(struct jpeg_table_quantization* table);
+
+/**
+ * Print JPEG quantization table
+ * 
+ * @param table  Table structure
+ * @return void
+ */
+void
+jpeg_table_quantization_print(struct jpeg_table_quantization* table);
+
+/**
+ * Initialize encoder huffman DC and AC table for component type
+ * 
+ * @param table  Table structure
+ * @param d_table  Table structure in device memory
+ * @param comp_type  Component type (luminance/chrominance)
+ * @param huff_type  Huffman type (DC/AC)
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_table_huffman_encoder_init(struct jpeg_table_huffman_encoder* table, struct jpeg_table_huffman_encoder* d_table, enum jpeg_component_type comp_type, enum jpeg_huffman_type huff_type);
+
+/**
+ * Initialize decoder huffman DC and AC table for component type. It copies bit and values arrays to table and call compute routine.
+ * 
+ * @param table  Table structure
+ * @param d_table  Table structure in device memory
+ * @param comp_type  Component type (luminance/chrominance)
+ * @param huff_type  Huffman type (DC/AC)
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_table_huffman_decoder_init(struct jpeg_table_huffman_decoder* table, struct jpeg_table_huffman_decoder* d_table, enum jpeg_component_type comp_type, enum jpeg_huffman_type huff_type);
+
+/** 
+ * Compute decoder huffman table from bits and values arrays (that are already set in table)
+ * 
+ * @param table  Table structure
+ * @param d_table  Table structure in device memory
+ * @return void
+ */
+void
+jpeg_table_huffman_decoder_compute(struct jpeg_table_huffman_decoder* table, struct jpeg_table_huffman_decoder* d_table);
+
+#endif // JPEG_TABLE
--- a/jpeg_compress/jpeg_type.h
+++ b/jpeg_compress/jpeg_type.h
@@ -0,0 +1,69 @@
+/**
+ * Copyright (c) 2011, Martin Srom
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef JPEG_TYPE
+#define JPEG_TYPE
+
+#include <stdint.h>
+
+/**
+ * Block size used by the codec (presumably the edge length in samples of the
+ * 8x8 blocks - confirm against the users of this constant).
+ * Note that in C a static const int is not an integer constant expression,
+ * so it cannot size a file-scope array or label a switch case.
+ */
+static const int JPEG_BLOCK_SIZE = 8;
+
+/**
+ * Color spaces for JPEG codec
+ *
+ * Values start at 1, so a zero-initialized variable does not name any of
+ * the color spaces.
+ */
+enum jpeg_color_space {
+    JPEG_RGB = 1,
+    JPEG_YUV = 2,
+    JPEG_YCBCR = 3,
+};
+
+/**
+ * Sampling factor for JPEG codec
+ *
+ * Each value packs the three numbers of its name into one integer, one byte
+ * per number with the first number in bits 16-23 (e.g. JPEG_4_2_2 is
+ * 0x040202).
+ */
+enum jpeg_sampling_factor {
+    JPEG_4_4_4 = ((4 << 16) | (4 << 8) | 4),
+    JPEG_4_2_2 = ((4 << 16) | (2 << 8) | 2),
+};
+
+/**
+ * JPEG component type
+ *
+ * The values double as array indices (quantization and huffman tables are
+ * stored per component type) and as the table index written into the DQT
+ * segment, so they must stay 0 and 1.
+ */
+enum jpeg_component_type {
+    JPEG_COMPONENT_LUMINANCE = 0,
+    JPEG_COMPONENT_CHROMINANCE = 1,
+    // Number of component types, not a component type itself
+    JPEG_COMPONENT_TYPE_COUNT = 2
+};
+
+/** 
+ * JPEG huffman type 
+ *
+ * The values are used as the second index of the encoder's
+ * table_huffman[component type][huffman type] array.
+ */
+enum jpeg_huffman_type {
+    JPEG_HUFFMAN_DC = 0,
+    JPEG_HUFFMAN_AC = 1,
+    // Number of huffman types, not a huffman type itself
+    JPEG_HUFFMAN_TYPE_COUNT = 2
+};
+
+#endif // JPEG_TYPE
--- a/jpeg_compress/jpeg_util.h
+++ b/jpeg_compress/jpeg_util.h
@@ -0,0 +1,71 @@
+/**
+ * Copyright (c) 2011, Martin Srom
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+ 
+#ifndef JPEG_UTIL_H
+#define JPEG_UTIL_H
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <assert.h>
+#include <cuda_runtime.h>
+#include <npp.h>
+
+// Timer based on CUDA events.
+// All macros share the variables declared by TIMER_INIT(), so TIMER_INIT()
+// must appear once in a scope that encloses the other macros. The macros
+// expand to several bare statements and are therefore not safe as the body
+// of an unbraced if/else/for.
+// NOTE(review): no macro in this header destroys the created events, and
+// identifiers starting with two underscores are reserved for the
+// implementation.
+
+// Declare the timer variables and create the start/stop events
+#define TIMER_INIT() \
+    cudaEvent_t __start, __stop; \
+    cudaEventCreate(&__start); \
+    cudaEventCreate(&__stop); \
+    float __elapsedTime;
+// Record the start event on stream 0
+#define TIMER_START() \
+    cudaEventRecord(__start,0)
+// Record the stop event on stream 0, wait for it and store the elapsed time
+#define TIMER_STOP() \
+    cudaEventRecord(__stop,0); \
+    cudaEventSynchronize(__stop); \
+    cudaEventElapsedTime(&__elapsedTime, __start, __stop)
+// Elapsed time stored by the last TIMER_STOP() (printed as milliseconds below)
+#define TIMER_DURATION() __elapsedTime
+// Stop the timer and print "<text> <elapsed> ms" to stdout
+#define TIMER_STOP_PRINT(text) \
+    TIMER_STOP(); \
+    printf("%s %f ms\n", text, __elapsedTime)
+	
+// CUDA check error
+// Reads the last CUDA error with cudaGetLastError(); if it is not
+// cudaSuccess, prints file, line, msg and the CUDA error string to stderr
+// and terminates the process with exit(-1).
+// The expansion is a bare { } block, so "if (x) cudaCheckError(...); else"
+// does not compile. The last line of the macro ends with a continuation
+// backslash, so the line following the macro must stay blank.
+#define cudaCheckError(msg) \
+    { \
+        cudaError_t err = cudaGetLastError(); \
+        if( cudaSuccess != err) { \
+            fprintf(stderr, "%s (line %i): %s: %s.\n", \
+                __FILE__, __LINE__, msg, cudaGetErrorString( err) ); \
+            exit(-1); \
+        } \
+    } \
+    
+// Divide and round up
+// Both arguments are parenthesized so that expressions such as
+// divAndRoundUp(a + b, c) group as intended. Each argument is still
+// evaluated more than once, so do not pass expressions with side effects.
+#define divAndRoundUp(value, div) \
+    ((((value) % (div)) != 0) ? ((value) / (div) + 1) : ((value) / (div)))
+
+#endif // JPEG_UTIL_H
--- a/jpeg_compress/jpeg_writer.c
+++ b/jpeg_compress/jpeg_writer.c
@@ -0,0 +1,298 @@
+/**
+ * Copyright (c) 2011, Martin Srom
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+ 
+#include "jpeg_writer.h"
+#include "jpeg_format_type.h"
+#include "jpeg_encoder.h"
+#include "jpeg_util.h"
+
+/** Documented at declaration */
+struct jpeg_writer*
+jpeg_writer_create(struct jpeg_encoder* encoder)
+{
+    struct jpeg_writer* writer = malloc(sizeof(struct jpeg_writer));
+    if ( writer == NULL )
+        return NULL;
+    
+    // Allocate output buffer: 1000 bytes of headroom plus two bytes per
+    // sample of the image. The size is computed in size_t so that large
+    // images cannot overflow int arithmetic.
+    size_t buffer_size = 1000;
+    buffer_size += (size_t)encoder->param_image.width * (size_t)encoder->param_image.height * (size_t)encoder->param_image.comp_count * 2;
+    writer->buffer = malloc(buffer_size * sizeof(uint8_t));
+    if ( writer->buffer == NULL ) {
+        // Do not leak the writer structure when the buffer cannot be allocated
+        free(writer);
+        return NULL;
+    }
+    
+    return writer;
+}
+
+/** Documented at declaration */
+int
+jpeg_writer_destroy(struct jpeg_writer* writer)
+{
+    // Both pointers are validated by assert() only, so a build with NDEBUG
+    // performs no checks before dereferencing writer
+    assert(writer != NULL);
+    assert(writer->buffer != NULL);
+    // Release the output buffer before the structure that owns it
+    free(writer->buffer);
+    free(writer);
+    // Always reports success
+    return 0;
+}
+
+/**
+ * Write SOI
+ * 
+ * Emits only the JPEG_MARKER_SOI marker; the marker has no payload.
+ * 
+ * @param writer  Writer structure
+ * @return void
+ */
+void
+jpeg_writer_write_soi(struct jpeg_writer* writer)
+{
+	jpeg_writer_emit_marker(writer, JPEG_MARKER_SOI);
+}
+
+/**
+ * Write APP0 block (JFIF header)
+ * 
+ * Layout of the segment that follows the marker:
+ *   length                  2 bytes (counts itself and all fields below)
+ *   identifier              5 bytes (ASCII "JFIF" and a terminating zero)
+ *   version major, minor    2 bytes
+ *   units                   1 byte  (0x00 = none, 0x01 = inch, 0x02 = cm)
+ *   Xdpu, Ydpu              2 + 2 bytes (dots per unit)
+ *   thumbnail X, Y size     1 + 1 bytes
+ * 
+ * @param writer  Writer structure
+ * @return void
+ */
+void
+jpeg_writer_write_app0(struct jpeg_writer* writer)
+{
+    // ASCII "JFIF" followed by the zero byte that terminates the identifier
+    static const unsigned char identifier[5] = { 0x4A, 0x46, 0x49, 0x46, 0x00 };
+    // Sum of the field sizes listed in the header comment
+    const int segment_length = 2 + 4 + 1 + 2 + 1 + 2 + 2 + 1 + 1;
+
+    jpeg_writer_emit_marker(writer, JPEG_MARKER_APP0);
+    jpeg_writer_emit_2byte(writer, segment_length);
+
+    for ( int i = 0; i < 5; i++ ) {
+        jpeg_writer_emit_byte(writer, identifier[i]);
+    }
+
+    // Version 1.01 is emitted because no 1.02 features are used;
+    // this may avoid complaints from some older decoders
+    const int version_major = 1;
+    const int version_minor = 1;
+    jpeg_writer_emit_byte(writer, version_major);
+    jpeg_writer_emit_byte(writer, version_minor);
+
+    // Pixel density: units = 1 (inch), 300 dots per unit in both directions
+    const int density_units = 1;
+    const int density = 300;
+    jpeg_writer_emit_byte(writer, density_units);
+    jpeg_writer_emit_2byte(writer, density);
+    jpeg_writer_emit_2byte(writer, density);
+
+    // Thumbnail size 0 x 0, i.e. no thumbnail image
+    const int thumbnail_size = 0;
+    jpeg_writer_emit_byte(writer, thumbnail_size);
+    jpeg_writer_emit_byte(writer, thumbnail_size);
+}
+
+/**
+ * Write DQT block
+ * 
+ * Emits the raw (quality adjusted) quantization table of the given component
+ * type in zigzag order.
+ * 
+ * @param encoder  Encoder structure
+ * @param type  Component type for table retrieve
+ * @return void
+ */
+void
+jpeg_writer_write_dqt(struct jpeg_encoder* encoder, enum jpeg_component_type type)
+{
+    jpeg_writer_emit_marker(encoder->writer, JPEG_MARKER_DQT);
+    
+    // Length: 2 (length field) + 1 (table index) + 64 (table values)
+    jpeg_writer_emit_2byte(encoder->writer, 67);
+    
+    // Index: Y component = 0, Cb or Cr component = 1
+    jpeg_writer_emit_byte(encoder->writer, (int)type); 
+
+    // Table changed from default with quality
+    const uint8_t* dqt = encoder->table_quantization[type].table_raw;
+    
+    // Emit table: table_raw is stored in natural order, jpeg_order_natural
+    // maps each zigzag position to its natural index. The values are
+    // already 8-bit, so they are emitted without any intermediate cast.
+    for ( int i = 0; i < 64; i++ ) {
+        jpeg_writer_emit_byte(encoder->writer, dqt[jpeg_order_natural[i]]);
+    }
+}
+
+/**
+ * Write SOF0 block (baseline), the only frame type currently supported
+ * 
+ * Always describes three components (Y, Cb, Cr) with 8-bit precision and
+ * 1x1 sampling; only the image dimensions are taken from the encoder.
+ * 
+ * @param encoder  Encoder structure 
+ * @return void
+ */
+void
+jpeg_writer_write_sof0(struct jpeg_encoder* encoder)
+{
+    // Quantization table index per component: Y uses 0, Cb and Cr use 1
+    static const int quantization_index[3] = { 0, 1, 1 };
+    // Sampling byte 17: horizontal factor 1 in the high nibble, vertical 1 in the low
+    const int sampling = (1 << 4) + 1;
+
+    jpeg_writer_emit_marker(encoder->writer, JPEG_MARKER_SOF0);
+
+    // Length: 2 + 1 (precision) + 2 + 2 (dimensions) + 1 (count) + 3 * 3 (components)
+    jpeg_writer_emit_2byte(encoder->writer, 17);
+
+    // Precision (bit depth)
+    jpeg_writer_emit_byte(encoder->writer, 8);
+    // Dimensions, height first
+    jpeg_writer_emit_2byte(encoder->writer, encoder->param_image.height);
+    jpeg_writer_emit_2byte(encoder->writer, encoder->param_image.width);
+    // Number of components
+    jpeg_writer_emit_byte(encoder->writer, 3);
+
+    // Components Y, Cb, Cr carry the indices 1, 2, 3
+    for ( int comp = 0; comp < 3; comp++ ) {
+        const int component_index = comp + 1;
+        jpeg_writer_emit_byte(encoder->writer, component_index);
+        jpeg_writer_emit_byte(encoder->writer, sampling);
+        jpeg_writer_emit_byte(encoder->writer, quantization_index[comp]);
+    }
+}
+
+/**
+ * Write DHT block
+ *
+ * Emits the DHT marker, the segment length, the table index byte, the 16
+ * code-length counts (table->bits[1..16]) and finally as many values from
+ * table->huffval as the counts add up to.
+ *
+ * @param encoder  Encoder structure
+ * @param comp_type  Component type for table retrieve
+ * @param huff_type  Huffman table type (AC or DC) that should be written
+ * @return void
+ */
+void
+jpeg_writer_write_dht(struct jpeg_encoder* encoder, enum jpeg_component_type comp_type, enum jpeg_huffman_type huff_type)
+{
+    // The table lookup is the same for every component/type combination
+    struct jpeg_table_huffman_encoder* table = &encoder->table_huffman[comp_type][huff_type];
+
+    // Index byte: low nibble is 0 for luminance and 1 otherwise,
+    // high nibble is 1 for an AC table and 0 for a DC table
+    // (yields 0, 16, 1 and 17 for Y-DC, Y-AC, C-DC and C-AC)
+    int index = (comp_type == JPEG_COMPONENT_LUMINANCE) ? 0 : 1;
+    if ( huff_type == JPEG_HUFFMAN_AC )
+        index |= 0x10;
+
+    jpeg_writer_emit_marker(encoder->writer, JPEG_MARKER_DHT);
+
+    // Total number of values = sum of the 16 per-length counts
+    int length = 0;
+    for ( int i = 1; i <= 16; i++ )
+        length += table->bits[i];
+
+    // Segment length: values + 2 (length field) + 1 (index) + 16 (counts)
+    jpeg_writer_emit_2byte(encoder->writer, length + 2 + 1 + 16);
+
+    jpeg_writer_emit_byte(encoder->writer, index);
+
+    for ( int i = 1; i <= 16; i++ )
+        jpeg_writer_emit_byte(encoder->writer, table->bits[i]);
+
+    // Variable-length part
+    for ( int i = 0; i < length; i++ )
+        jpeg_writer_emit_byte(encoder->writer, table->huffval[i]);
+}
+
+/**
+ * Write DRI block carrying the encoder's restart interval
+ *
+ * @param encoder  Encoder structure
+ * @return void
+ */
+void
+jpeg_writer_write_dri(struct jpeg_encoder* encoder)
+{
+    struct jpeg_writer* out = encoder->writer;
+
+    jpeg_writer_emit_marker(out, JPEG_MARKER_DRI);
+
+    // Segment length: 2 (length field) + 2 (restart interval)
+    jpeg_writer_emit_2byte(out, 2 + 2);
+
+    // Interval value taken from the encoder parameters
+    jpeg_writer_emit_2byte(out, encoder->param.restart_interval);
+}
+
+/** Documented at declaration */
+void
+jpeg_writer_write_header(struct jpeg_encoder* encoder)
+{
+    // Emission order of the tables: luminance first, then chrominance;
+    // for each component the DC Huffman table precedes the AC one
+    static const enum jpeg_component_type comp_types[2] = {
+        JPEG_COMPONENT_LUMINANCE,   // Y component
+        JPEG_COMPONENT_CHROMINANCE  // Cb or Cr component
+    };
+    static const enum jpeg_huffman_type huff_types[2] = {
+        JPEG_HUFFMAN_DC,
+        JPEG_HUFFMAN_AC
+    };
+
+    // Start of image and application segment
+    jpeg_writer_write_soi(encoder->writer);
+    jpeg_writer_write_app0(encoder->writer);
+
+    // Quantization tables
+    for ( int comp = 0; comp < 2; comp++ )
+        jpeg_writer_write_dqt(encoder, comp_types[comp]);
+
+    // Frame header
+    jpeg_writer_write_sof0(encoder);
+
+    // Huffman tables
+    for ( int comp = 0; comp < 2; comp++ ) {
+        for ( int huff = 0; huff < 2; huff++ )
+            jpeg_writer_write_dht(encoder, comp_types[comp], huff_types[huff]);
+    }
+
+    // Restart interval
+    jpeg_writer_write_dri(encoder);
+}
+
+/** Documented at declaration */
+void
+jpeg_writer_write_scan_header(struct jpeg_encoder* encoder, int index, enum jpeg_component_type type)
+{
+    struct jpeg_writer* out = encoder->writer;
+
+    // Table selector byte: (0 << 4) | 0 for component Y,
+    // (1 << 4) | 1 for component Cb or Cr
+    const int table_selector = (type == JPEG_COMPONENT_LUMINANCE) ? 0 : 0x11;
+
+    jpeg_writer_emit_marker(out, JPEG_MARKER_SOS);
+
+    // Segment length
+    jpeg_writer_emit_2byte(out, 2 + 1 + 2 + 3);
+
+    // The scan always holds exactly one component
+    jpeg_writer_emit_byte(out, 1);
+
+    // Component descriptor: index is stored one-based
+    jpeg_writer_emit_byte(out, index + 1);
+    jpeg_writer_emit_byte(out, table_selector);
+
+    jpeg_writer_emit_byte(out, 0);    // Ss
+    jpeg_writer_emit_byte(out, 0x3F); // Se
+    jpeg_writer_emit_byte(out, 0);    // Ah/Al
+}
--- a/jpeg_compress/jpeg_writer.h
+++ b/jpeg_compress/jpeg_writer.h
@@ -0,0 +1,117 @@
+/**
+ * Copyright (c) 2011, Martin Srom
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef JPEG_WRITER
+#define JPEG_WRITER
+
+#include "jpeg_type.h"
+
+/** JPEG encoder structure predeclaration */
+struct jpeg_encoder;
+
+/**
+ * JPEG writer structure
+ *
+ * Holds the output buffer and the write position that the
+ * jpeg_writer_emit_* macros dereference and advance.
+ */
+struct jpeg_writer 
+{
+    // Output buffer
+    uint8_t* buffer;
+    // Output buffer current position (where the next emitted byte is stored)
+    uint8_t* buffer_current;
+};
+
+/**
+ * Create JPEG writer
+ * 
+ * @param encoder  Encoder structure
+ *                 (NOTE(review): how the encoder is used during creation is
+ *                 not visible from this header - confirm in jpeg_writer.c)
+ * @return writer structure if succeeds, otherwise NULL
+ */
+struct jpeg_writer*
+jpeg_writer_create(struct jpeg_encoder* encoder);
+
+/**
+ * Destroy JPEG writer
+ * 
+ * @param writer  Writer structure, as returned by jpeg_writer_create
+ *                (NOTE(review): whether the output buffer is released too
+ *                is not visible from this header - confirm in jpeg_writer.c)
+ * @return 0 if succeeds, otherwise nonzero
+ */
+int
+jpeg_writer_destroy(struct jpeg_writer* writer);
+
+/**
+ * Write one byte to the writer's output buffer
+ *
+ * Stores the byte at buffer_current and advances buffer_current by one.
+ * No bounds checking is done. The macro expands to a brace-enclosed block
+ * and its writer argument is evaluated more than once, so it must not
+ * have side effects.
+ * 
+ * @param writer  Writer structure (pointer expression)
+ * @param value  Byte value to write
+ * @return void
+ */
+#define jpeg_writer_emit_byte(writer, value) { \
+    *(writer)->buffer_current = (uint8_t)(value); \
+    (writer)->buffer_current++; }
+    
+/**
+ * Write two bytes to the writer's output buffer, high byte first
+ *
+ * Stores bits 15..8 of the value and then bits 7..0, advancing
+ * buffer_current by two. No bounds checking is done. The macro expands to
+ * a brace-enclosed block; both arguments are evaluated more than once, so
+ * they must not have side effects.
+ * 
+ * @param writer  Writer structure (pointer expression)
+ * @param value  Two-byte value to write
+ * @return void
+ */
+#define jpeg_writer_emit_2byte(writer, value) { \
+    *(writer)->buffer_current = (uint8_t)(((value) >> 8) & 0xFF); \
+    (writer)->buffer_current++; \
+    *(writer)->buffer_current = (uint8_t)((value) & 0xFF); \
+    (writer)->buffer_current++; }
+    
+/**
+ * Write marker to the writer's output buffer
+ *
+ * Stores the byte 0xFF followed by the marker code, advancing
+ * buffer_current by two. No bounds checking is done. The macro expands to
+ * a brace-enclosed block and its writer argument is evaluated more than
+ * once, so it must not have side effects.
+ * 
+ * @param writer  Writer structure (pointer expression)
+ * @param marker  Marker to write (JPEG_MARKER_...)
+ * @return void
+ */
+#define jpeg_writer_emit_marker(writer, marker) { \
+    *(writer)->buffer_current = 0xFF;\
+    (writer)->buffer_current++; \
+    *(writer)->buffer_current = (uint8_t)(marker); \
+    (writer)->buffer_current++; }
+    
+/**
+ * Write JPEG header (write soi, app0, Y_dqt, CbCr_dqt, sof, 4 * dht blocks
+ * and the dri block) into the buffer of the encoder's writer
+ * 
+ * @param encoder  Encoder structure
+ * @return void
+ */
+void
+jpeg_writer_write_header(struct jpeg_encoder* encoder);
+
+/**
+ * Write scan header for one component
+ * 
+ * @param encoder  Encoder structure
+ * @param index  Zero-based component index (written to the header as index + 1)
+ * @param type  Component scan type (selects the table selector byte:
+ *              0 for luminance, 0x11 otherwise)
+ * @return void
+ */
+void
+jpeg_writer_write_scan_header(struct jpeg_encoder* encoder, int index, enum jpeg_component_type type);
+
+#endif // JPEG_WRITER
--- a/jpeg_compress/main.c
+++ b/jpeg_compress/main.c
@@ -0,0 +1,320 @@
+/* 
+ * Copyright (c) 2011, Martin Srom
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "jpeg_encoder.h"
+#include "jpeg_decoder.h"
+#include "jpeg_util.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <string.h>
+#include <strings.h>
+
+/**
+ * Print command line usage to standard output
+ *
+ * @return void
+ */
+void
+print_help() 
+{
+    // One entry per output line, printed in order
+    static const char* const usage_lines[] = {
+        "jpeg_compress [options] input.rgb output.jpg [input2.rgb output2.jpg...]\n",
+        "   -h, --help\t\tprint help\n",
+        "   -s, --size\t\tset image size in pixels, e.g. 1920x1080\n",
+        "   -f, --sampling-factor\t\tset image sampling factor, e.g. 4:2:2\n",
+        "   -q, --quality\tset quality level 1-100 (default 75)\n",
+        "   -r, --restart\tset restart interval (default 8)\n",
+        "   -e, --encode\t\tencode images\n",
+        "   -d, --decode\t\tdecode images\n",
+        "   -D, --device\t\tcuda device id (default 0)\n",
+    };
+    const int line_count = (int)(sizeof(usage_lines) / sizeof(usage_lines[0]));
+
+    for ( int line = 0; line < line_count; line++ )
+        fputs(usage_lines[line], stdout);
+}
+
+/**
+ * Command line entry point
+ *
+ * Parses the options, then treats the remaining arguments as
+ * (input, output) filename pairs. Every pair is encoded and/or decoded;
+ * when both actions are requested, the freshly encoded output is decoded
+ * again into "<output>.decoded.rgb".
+ *
+ * @param argc  Argument count
+ * @param argv  Argument values
+ * @return 0 if succeeds, otherwise -1
+ */
+int
+main(int argc, char *argv[])
+{
+    // getopt_long requires the option array to end with an all-zero entry
+    struct option longopts[] = {
+        {"help",            no_argument,       0, 'h'},
+        {"size",            required_argument, 0, 's'},
+        {"sampling-factor", required_argument, 0, 'f'},
+        {"quality",         required_argument, 0, 'q'},
+        {"restart",         required_argument, 0, 'r'},
+        {"encode",          no_argument,       0, 'e'},
+        {"decode",          no_argument,       0, 'd'},
+        {"device",          required_argument, 0, 'D'},
+        {0,                 0,                 0, 0}
+    };
+
+    // Default image parameters
+    struct jpeg_image_parameters param_image;
+    jpeg_image_set_default_parameters(&param_image);
+
+    // Default encoder parameters
+    struct jpeg_encoder_parameters param_encoder;
+    jpeg_encoder_set_default_parameters(&param_encoder);
+
+    // Other parameters
+    int encode = 0;
+    int decode = 0;
+    int device_id = 0;
+
+    // Parse command line
+    // getopt_long returns int; storing it in a char breaks the -1 test
+    // on platforms where char is unsigned
+    int ch = 0;
+    int optindex = 0;
+    char* pos = 0;
+    // The short option string lists every short option shown by print_help
+    while ( (ch = getopt_long(argc, argv, "hs:f:q:r:edD:", longopts, &optindex)) != -1 ) {
+        switch (ch) {
+        case 'h':
+            print_help();
+            return 0;
+        case 's':
+            // Expected form is WIDTHxHEIGHT with both values positive
+            param_image.width = atoi(optarg);
+            pos = strstr(optarg, "x");
+            if ( pos == NULL || param_image.width <= 0 || (strlen(pos) >= strlen(optarg)) ) {
+                print_help();
+                return -1;
+            }
+            param_image.height = atoi(pos + 1);
+            if ( param_image.height <= 0 ) {
+                print_help();
+                return -1;
+            }
+            break;
+        case 'f':
+            if ( strcmp(optarg, "4:4:4") == 0 )
+                param_image.sampling_factor = JPEG_4_4_4;
+            else if ( strcmp(optarg, "4:2:2") == 0 )
+                param_image.sampling_factor = JPEG_4_2_2;
+            else
+                fprintf(stderr, "Sampling factor '%s' is not available!\n", optarg);
+            break;
+        case 'q':
+            // Clamp quality into 1-100
+            param_encoder.quality = atoi(optarg);
+            if ( param_encoder.quality <= 0 )
+                param_encoder.quality = 1;
+            if ( param_encoder.quality > 100 )
+                param_encoder.quality = 100;
+            break;
+        case 'r':
+            param_encoder.restart_interval = atoi(optarg);
+            if ( param_encoder.restart_interval < 0 )
+                param_encoder.restart_interval = 0;
+            break;
+        case 'e':
+            encode = 1;
+            break;
+        case 'd':
+            decode = 1;
+            break;
+        case 'D':
+            device_id = atoi(optarg);
+            break;
+        case '?':
+            return -1;
+        default:
+            print_help();
+            return -1;
+        }
+    }
+    argc -= optind;
+    argv += optind;
+
+    // Source image and target image must be presented
+    if ( argc < 2 ) {
+        fprintf(stderr, "Please supply source and destination image filename!\n");
+        print_help();
+        return -1;
+    }
+
+    // Filenames are consumed in pairs; an odd count would leave the last
+    // input without an output (argv[argc] is NULL)
+    if ( (argc % 2) != 0 ) {
+        fprintf(stderr, "Please supply destination image filename for every source image!\n");
+        print_help();
+        return -1;
+    }
+
+    // Detect action if none is specified
+    if ( encode == 0 && decode == 0 ) {
+        enum jpeg_image_file_format input_format = jpeg_image_get_file_format(argv[0]);
+        enum jpeg_image_file_format output_format = jpeg_image_get_file_format(argv[1]);
+        if ( input_format & IMAGE_FILE_RAW && output_format == IMAGE_FILE_JPEG ) {
+            encode = 1;
+        } else if ( input_format == IMAGE_FILE_JPEG && output_format & IMAGE_FILE_RAW ) {
+            decode = 1;
+        } else {
+            fprintf(stderr, "Action can't be recognized for specified images!\n");
+            fprintf(stderr, "You must specify --encode or --decode option!\n");
+            return -1;
+        }
+    }
+
+    // Init device
+    jpeg_init_device(device_id, 1);
+
+    // Detect color space
+    if ( jpeg_image_get_file_format(argv[0]) == IMAGE_FILE_YUV )
+        param_image.color_space = JPEG_YUV;
+
+    if ( encode == 1 ) {
+        // Create encoder
+        struct jpeg_encoder* encoder = jpeg_encoder_create(&param_image, &param_encoder);
+        if ( encoder == NULL ) {
+            fprintf(stderr, "Failed to create encoder!\n");
+            return -1;
+        }
+
+        // Encode images
+        for ( int index = 0; index < argc; index += 2 ) {
+            // Get and check input and output image
+            const char* input = argv[index];
+            const char* output = argv[index + 1];
+            enum jpeg_image_file_format input_format = jpeg_image_get_file_format(input);
+            enum jpeg_image_file_format output_format = jpeg_image_get_file_format(output);
+            if ( (input_format & IMAGE_FILE_RAW) == 0 ) {
+                fprintf(stderr, "Encoder input file [%s] should be RGB image (*.rgb)!\n", input);
+                return -1;
+            }
+            if ( output_format != IMAGE_FILE_JPEG ) {
+                fprintf(stderr, "Encoder output file [%s] should be JPEG image (*.jpg)!\n", output);
+                return -1;
+            }
+
+            // Encode image
+            TIMER_INIT();
+            TIMER_START();
+
+            printf("\nEncoding Image [%s]\n", input);
+
+            // Load image; with 4:2:2 sampling the input holds two thirds
+            // of the bytes of the full three-component image
+            int image_size = param_image.width * param_image.height * param_image.comp_count;
+            if ( param_image.sampling_factor == JPEG_4_2_2 ) {
+                assert(param_image.comp_count == 3);
+                image_size = image_size / 3 * 2;
+            }
+            uint8_t* image = NULL;
+            if ( jpeg_image_load_from_file(input, &image, &image_size) != 0 ) {
+                fprintf(stderr, "Failed to load image [%s]!\n", input);
+                return -1;
+            }
+
+            TIMER_STOP_PRINT("Load Image:         ");
+            TIMER_START();
+
+            // Encode image
+            uint8_t* image_compressed = NULL;
+            int image_compressed_size = 0;
+            if ( jpeg_encoder_encode(encoder, image, &image_compressed, &image_compressed_size) != 0 ) {
+                fprintf(stderr, "Failed to encode image [%s]!\n", input);
+                return -1;
+            }
+
+            TIMER_STOP_PRINT("Encode Image:       ");
+            TIMER_START();
+
+            // Save image
+            if ( jpeg_image_save_to_file(output, image_compressed, image_compressed_size) != 0 ) {
+                fprintf(stderr, "Failed to save image [%s]!\n", output);
+                return -1;
+            }
+
+            TIMER_STOP_PRINT("Save Image:         ");
+
+            printf("Compressed Size:     %d bytes [%s]\n", image_compressed_size, output);
+
+            // Destroy image
+            jpeg_image_destroy(image);
+        }
+
+        // Destroy encoder
+        jpeg_encoder_destroy(encoder);
+    }
+
+    // Output sampling factor is always 4:4:4
+    param_image.sampling_factor = JPEG_4_4_4;
+
+    if ( decode == 1 ) {
+        // Create decoder
+        struct jpeg_decoder* decoder = jpeg_decoder_create(&param_image);
+        if ( decoder == NULL ) {
+            fprintf(stderr, "Failed to create decoder!\n");
+            return -1;
+        }
+
+        // Decode images
+        for ( int index = 0; index < argc; index += 2 ) {
+            // Get and check input and output image
+            const char* input = argv[index];
+            const char* output = argv[index + 1];
+            if ( encode == 1 ) {
+                // Both actions requested: decode what was just encoded
+                static char buffer_output[255];
+                int written = snprintf(buffer_output, sizeof(buffer_output), "%s.decoded.rgb", output);
+                if ( written < 0 || (size_t)written >= sizeof(buffer_output) ) {
+                    fprintf(stderr, "Output filename [%s] is too long!\n", output);
+                    return -1;
+                }
+                input = output;
+                output = buffer_output;
+            }
+            enum jpeg_image_file_format input_format = jpeg_image_get_file_format(input);
+            enum jpeg_image_file_format output_format = jpeg_image_get_file_format(output);
+            if ( input_format != IMAGE_FILE_JPEG ) {
+                fprintf(stderr, "Decoder input file [%s] should be JPEG image (*.jpg)!\n", input);
+                return -1;
+            }
+            if ( (output_format & IMAGE_FILE_RAW) == 0 ) {
+                fprintf(stderr, "Decoder output file [%s] should be RGB image (*.rgb)!\n", output);
+                return -1;
+            }
+
+            // Decode image
+            TIMER_INIT();
+            TIMER_START();
+
+            printf("\nDecoding Image [%s]\n", input);
+
+            // Load image
+            int image_size = 0;
+            uint8_t* image = NULL;
+            if ( jpeg_image_load_from_file(input, &image, &image_size) != 0 ) {
+                fprintf(stderr, "Failed to load image [%s]!\n", input);
+                return -1;
+            }
+
+            TIMER_STOP_PRINT("Load Image:         ");
+            TIMER_START();
+
+            // Decode image
+            uint8_t* image_decompressed = NULL;
+            int image_decompressed_size = 0;
+            if ( jpeg_decoder_decode(decoder, image, image_size, &image_decompressed, &image_decompressed_size) != 0 ) {
+                fprintf(stderr, "Failed to decode image [%s]!\n", input);
+                return -1;
+            }
+
+            TIMER_STOP_PRINT("Decode Image:       ");
+            TIMER_START();
+
+            // Save image
+            if ( jpeg_image_save_to_file(output, image_decompressed, image_decompressed_size) != 0 ) {
+                fprintf(stderr, "Failed to save image [%s]!\n", output);
+                return -1;
+            }
+
+            TIMER_STOP_PRINT("Save Image:         ");
+
+            printf("Decompressed Size:   %d bytes [%s]\n", image_decompressed_size, output);
+
+            // Destroy image
+            jpeg_image_destroy(image);
+        }
+
+        // Destroy decoder
+        jpeg_decoder_destroy(decoder);
+    }
+
+    return 0;
+}