refactor image decoders

2019-12-19 01:46:47 +09:00
parent e683b3c91a
commit 39ae15d9aa
16 changed files with 992 additions and 979 deletions
@@ -1,106 +1,81 @@
 #include "astc.h"
-#include "fp16.h"
 #include <math.h>
 #include <ruby.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
+#include "color.h"
+#include "fp16.h"

 static const int BitReverseTable[] = {
-    0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0,
-    0x30, 0xB0, 0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8,
-    0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4,
-    0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4,
-    0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC,
-    0x3C, 0xBC, 0x7C, 0xFC, 0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2,
-    0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA,
-    0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,
-    0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6,
-    0x36, 0xB6, 0x76, 0xF6, 0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE,
-    0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE, 0x01, 0x81, 0x41, 0xC1,
-    0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,
-    0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9,
-    0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5,
-    0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD,
-    0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,
-    0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3,
-    0x33, 0xB3, 0x73, 0xF3, 0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB,
-    0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB, 0x07, 0x87, 0x47, 0xC7,
-    0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7,
-    0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF,
-    0x3F, 0xBF, 0x7F, 0xFF
-};
+  0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0, 0x08, 0x88, 0x48,
+  0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4,
+  0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C,
+  0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2,
+  0x32, 0xB2, 0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A,
+  0xFA, 0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, 0x0E, 0x8E,
+  0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE, 0x01, 0x81, 0x41, 0xC1, 0x21,
+  0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1, 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9,
+  0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55,
+  0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD,
+  0x7D, 0xFD, 0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, 0x0B,
+  0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB, 0x07, 0x87, 0x47, 0xC7,
+  0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7, 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F,
+  0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF};  // BitReverseTable[i] is the byte i with its 8 bits in reverse order (e.g. index 0x01 holds 0x80).

-static const int WeightPrecTableA[] = { 0, 0, 0, 3, 0, 5, 3, 0, 0, 0, 5, 3, 0, 5, 3, 0 };
-static const int WeightPrecTableB[] = { 0, 0, 1, 0, 2, 0, 1, 3, 0, 0, 1, 2, 4, 2, 3, 5 };
+static const int WeightPrecTableA[] = {0, 0, 0, 3, 0, 5, 3, 0, 0, 0, 5, 3, 0, 5, 3, 0};  // Indexed by weight_range; decode_block_params switches on it: 3 adds (n*8+4)/5 bits, 5 adds (n*7+2)/3 bits, anything else adds none (presumably trit / quint / plain-bit encodings).
+static const int WeightPrecTableB[] = {0, 0, 1, 0, 2, 0, 1, 3, 0, 0, 1, 2, 4, 2, 3, 5};  // Indexed by weight_range; per-weight bit count that decode_block_params multiplies by weight_num.

-static const int CemTableA[] = { 0, 3, 5, 0, 3, 5, 0, 3, 5, 0, 3, 5, 0, 3, 5, 0, 3, 0, 0 };
-static const int CemTableB[] = { 8, 6, 5, 7, 5, 4, 6, 4, 3, 5, 3, 2, 4, 2, 1, 3, 1, 2, 1 };
+static const int CemTableA[] = {0, 3, 5, 0, 3, 5, 0, 3, 5, 0, 3, 5, 0, 3, 5, 0, 3, 0, 0};  // Indexed by cem_range; same 3 / 5 / other meaning as WeightPrecTableA, passed as `a` to decode_intseq by decode_endpoints.
+static const int CemTableB[] = {8, 6, 5, 7, 5, 4, 6, 4, 3, 5, 3, 2, 4, 2, 1, 3, 1, 2, 1};  // Indexed by cem_range; per-endpoint-value bit count, passed as `b` to decode_intseq by decode_endpoints.

-static inline uint_fast32_t color(uint_fast8_t r, uint_fast8_t g, uint_fast8_t b, uint_fast8_t a)
-{
-#if BYTE_ORDER == LITTLE_ENDIAN
-    return r | g << 8 | b << 16 | a << 24;
-#else
-    return a | b << 8 | g << 16 | r << 24;
-#endif
-}
-
-static inline uint_fast8_t bit_reverse_u8(const uint_fast8_t c, const int bits)
-{
+static inline uint_fast8_t bit_reverse_u8(const uint_fast8_t c, const int bits) {  // Reverse the low `bits` bits of c: the table mirrors all 8 bits, the shift drops the unused ones. Assumes c < 256 (table size) -- TODO confirm for callers.
    return BitReverseTable[c] >> (8 - bits);
 }

-static inline uint_fast64_t bit_reverse_u64(const uint_fast64_t d, const int bits)
-{
-    uint_fast64_t ret = (uint_fast64_t)BitReverseTable[d & 0xff] << 56 | (uint_fast64_t)BitReverseTable[d >> 8 & 0xff] << 48 | (uint_fast64_t)BitReverseTable[d >> 16 & 0xff] << 40 | (uint_fast64_t)BitReverseTable[d >> 24 & 0xff] << 32 | (uint_fast32_t)BitReverseTable[d >> 32 & 0xff] << 24 | (uint_fast32_t)BitReverseTable[d >> 40 & 0xff] << 16 | (uint_fast16_t)BitReverseTable[d >> 48 & 0xff] << 8 | BitReverseTable[d >> 56 & 0xff];
+static inline uint_fast64_t bit_reverse_u64(const uint_fast64_t d, const int bits) {  // Reverse the low `bits` bits of d. NOTE(review): bits == 0 makes the final shift 64 wide, which is undefined -- confirm callers never pass 0.
+    uint_fast64_t ret = (uint_fast64_t)BitReverseTable[d & 0xff] << 56 |  // byte 0, bit-mirrored, becomes byte 7 ...
+      (uint_fast64_t)BitReverseTable[d >> 8 & 0xff] << 48 | (uint_fast64_t)BitReverseTable[d >> 16 & 0xff] << 40 |
+      (uint_fast64_t)BitReverseTable[d >> 24 & 0xff] << 32 | (uint_fast32_t)BitReverseTable[d >> 32 & 0xff] << 24 |
+      (uint_fast32_t)BitReverseTable[d >> 40 & 0xff] << 16 | (uint_fast16_t)BitReverseTable[d >> 48 & 0xff] << 8 |
+      BitReverseTable[d >> 56 & 0xff];  // ... and byte 7 becomes byte 0: together a full 64-bit reversal, trimmed to `bits` by the shift below
    return ret >> (64 - bits);
 }

-static inline int getbits(const uint8_t* buf, const int bit, const int len)
-{
-    return (*(int*)(buf + bit / 8) >> (bit % 8)) & ((1 << len) - 1);
+static inline int getbits(const uint8_t *buf, const int bit, const int len) {  // Return `len` bits of buf starting at bit offset `bit` (least-significant bit first). Reads the 4 bytes at buf + bit / 8, so len + bit % 8 must not exceed 32.
+    return (int)(((buf[bit / 8] | buf[bit / 8 + 1] << 8 | buf[bit / 8 + 2] << 16 | (uint32_t)buf[bit / 8 + 3] << 24) >> (bit % 8)) & ((1u << len) - 1));  // bytes are assembled little-endian by hand: no misaligned or type-punned int load, same result on any host byte order
 }

-static inline uint_fast64_t getbits64(const uint8_t* buf, const int bit, const int len)
-{
+static inline uint_fast64_t getbits64(const uint8_t *buf, const int bit, const int len) {  // Extract `len` bits at bit offset `bit` from buf viewed as two 64-bit words (buf and buf + 8); returns 0 when len < 1. NOTE(review): the loads cast a byte pointer to uint_fast64_t* -- alignment and host byte order assumptions to confirm.
    uint_fast64_t mask = len == 64 ? 0xffffffffffffffff : (1ull << len) - 1;
    if (len < 1)
        return 0;
    else if (bit >= 64)
-        return (*(uint_fast64_t*)(buf + 8)) >> (bit - 64) & mask;
+        return (*(uint_fast64_t *)(buf + 8)) >> (bit - 64) & mask;  // field starts in the second word
    else if (bit <= 0)
-        return (*(uint_fast64_t*)buf) << -bit & mask;
+        return (*(uint_fast64_t *)buf) << -bit & mask;  // zero or negative offset: first word shifted up by -bit, low result bits are zero
    else if (bit + len <= 64)
-        return (*(uint_fast64_t*)buf) >> bit & mask;
+        return (*(uint_fast64_t *)buf) >> bit & mask;  // field lies entirely in the first word
    else
-        return ((*(uint_fast64_t*)buf) >> bit | *(uint_fast64_t*)(buf + 8) << (64 - bit)) & mask;
+        return ((*(uint_fast64_t *)buf) >> bit | *(uint_fast64_t *)(buf + 8) << (64 - bit)) & mask;  // field straddles both words
 }

-static inline uint16_t u8ptr_to_u16(const uint8_t* ptr)
-{
-#if BYTE_ORDER == LITTLE_ENDIAN
-    return *(uint16_t*)ptr;
-#else
-    return ptr[0] | ptr[1] << 8;
-#endif
+static inline uint16_t u8ptr_to_u16(const uint8_t *ptr) {  // Read the little-endian 16-bit value stored at ptr[0..1]; ptr needs no particular alignment.
+    return (uint16_t)(ptr[0] | ptr[1] << 8);  // byte-wise assembly gives the same value on any host byte order and avoids a misaligned uint16_t load
 }

-static inline uint_fast8_t clamp(const int n)
-{
+static inline uint_fast8_t clamp(const int n) {  // Saturate n to the range [0, 255].
    return n < 0 ? 0 : n > 255 ? 255 : n;
 }

-static inline void bit_transfer_signed(int* a, int* b)
-{
+static inline void bit_transfer_signed(int *a, int *b) {  // Move bit 7 of *a into bit 7 of *b (after shifting *b right by one), then turn *a into bits 1..6 of its old value, sign-extended from bit 5 (range -32..31).
    *b = (*b >> 1) | (*a & 0x80);
    *a = (*a >> 1) & 0x3f;
    if (*a & 0x20)
        *a -= 0x40;
 }

-static inline void set_endpoint(int endpoint[8], int r1, int g1, int b1, int a1, int r2, int g2, int b2, int a2)
-{
+static inline void set_endpoint(int endpoint[8], int r1, int g1, int b1, int a1, int r2, int g2, int b2, int a2) {
    endpoint[0] = r1;
    endpoint[1] = g1;
    endpoint[2] = b1;
@@ -111,8 +86,7 @@ static inline void set_endpoint(int endpoint[8], int r1, int g1, int b1, int a1,
    endpoint[7] = a2;
 }

-static inline void set_endpoint_clamp(int endpoint[8], int r1, int g1, int b1, int a1, int r2, int g2, int b2, int a2)
-{
+static inline void set_endpoint_clamp(int endpoint[8], int r1, int g1, int b1, int a1, int r2, int g2, int b2, int a2) {
    endpoint[0] = clamp(r1);
    endpoint[1] = clamp(g1);
    endpoint[2] = clamp(b1);
@@ -123,8 +97,7 @@ static inline void set_endpoint_clamp(int endpoint[8], int r1, int g1, int b1, i
    endpoint[7] = clamp(a2);
 }

-static inline void set_endpoint_blue(int endpoint[8], int r1, int g1, int b1, int a1, int r2, int g2, int b2, int a2)
-{
+static inline void set_endpoint_blue(int endpoint[8], int r1, int g1, int b1, int a1, int r2, int g2, int b2, int a2) {
    endpoint[0] = (r1 + b1) >> 1;
    endpoint[1] = (g1 + b1) >> 1;
    endpoint[2] = b1;
@@ -135,8 +108,8 @@ static inline void set_endpoint_blue(int endpoint[8], int r1, int g1, int b1, in
    endpoint[7] = a2;
 }

-static inline void set_endpoint_blue_clamp(int endpoint[8], int r1, int g1, int b1, int a1, int r2, int g2, int b2, int a2)
-{
+static inline void set_endpoint_blue_clamp(int endpoint[8], int r1, int g1, int b1, int a1, int r2, int g2, int b2,
+                                           int a2) {
    endpoint[0] = clamp((r1 + b1) >> 1);
    endpoint[1] = clamp((g1 + b1) >> 1);
    endpoint[2] = clamp(b1);
@@ -147,13 +120,11 @@ static inline void set_endpoint_blue_clamp(int endpoint[8], int r1, int g1, int
    endpoint[7] = clamp(a2);
 }

-static inline uint_fast16_t clamp_hdr(const int n)
-{
+static inline uint_fast16_t clamp_hdr(const int n) {  // Saturate n to the range [0, 0xfff].
    return n < 0 ? 0 : n > 0xfff ? 0xfff : n;
 }

-static inline void set_endpoint_hdr(int endpoint[8], int r1, int g1, int b1, int a1, int r2, int g2, int b2, int a2)
-{
+static inline void set_endpoint_hdr(int endpoint[8], int r1, int g1, int b1, int a1, int r2, int g2, int b2, int a2) {
    endpoint[0] = r1;
    endpoint[1] = g1;
    endpoint[2] = b1;
@@ -164,8 +135,8 @@ static inline void set_endpoint_hdr(int endpoint[8], int r1, int g1, int b1, int
    endpoint[7] = a2;
 }

-static inline void set_endpoint_hdr_clamp(int endpoint[8], int r1, int g1, int b1, int a1, int r2, int g2, int b2, int a2)
-{
+static inline void set_endpoint_hdr_clamp(int endpoint[8], int r1, int g1, int b1, int a1, int r2, int g2, int b2,
+                                          int a2) {
    endpoint[0] = clamp_hdr(r1);
    endpoint[1] = clamp_hdr(g1);
    endpoint[2] = clamp_hdr(b1);
@@ -178,13 +149,11 @@ static inline void set_endpoint_hdr_clamp(int endpoint[8], int r1, int g1, int b

 typedef uint_fast8_t (*t_select_folor_func_ptr)(int, int, int);

-static uint_fast8_t select_color(int v0, int v1, int weight)
-{
+static uint_fast8_t select_color(int v0, int v1, int weight) {  // Blend v0 and v1 by weight / 64: each value is widened to 16 bits by byte replication, interpolated with rounding, then rescaled from 16 to 8 bits. Presumably v0, v1 are 0..255 and weight is 0..64 -- confirm against callers.
    return ((((v0 << 8 | v0) * (64 - weight) + (v1 << 8 | v1) * weight + 32) >> 6) * 255 + 32768) / 65536;
 }

-static uint_fast8_t select_color_hdr(int v0, int v1, int weight)
-{
+static uint_fast8_t select_color_hdr(int v0, int v1, int weight) {
    uint16_t c = ((v0 << 4) * (64 - weight) + (v1 << 4) * weight + 32) >> 6;
    uint16_t m = c & 0x7ff;
    if (m < 512)
@@ -197,8 +166,7 @@ static uint_fast8_t select_color_hdr(int v0, int v1, int weight)
    return isfinite(f) ? clamp(roundf(f * 255)) : 255;
 }

-static inline uint8_t f32_to_u8(const float f)
-{
+static inline uint8_t f32_to_u8(const float f) {
    float c = roundf(f * 255);
    if (c < 0)
        return 0;
@@ -208,16 +176,8 @@ static inline uint8_t f32_to_u8(const float f)
        return c;
 }

-static inline uint8_t f16ptr_to_u8(const uint8_t* ptr)
-{
-    const uint16_t c =
-#if BYTE_ORDER == LITTLE_ENDIAN
-        *(uint16_t*)ptr
-#else
-        ptr[0] | ptr[1] << 8
-#endif
-        ;
-    return f32_to_u8(fp16_ieee_to_fp32_value(c));
+static inline uint8_t f16ptr_to_u8(const uint8_t *ptr) {  // Convert the little-endian half-precision value stored at ptr[0..1] to an 8-bit value via fp16_ieee_to_fp32_value and f32_to_u8.
+    return f32_to_u8(fp16_ieee_to_fp32_value((uint16_t)(ptr[0] | ptr[1] << 8)));  // byte-wise read: host byte order does not matter and no misaligned uint16_t load is made
 }

 typedef struct {
@@ -229,10 +189,10 @@ typedef struct {
    int dual_plane;
    int plane_selector;
    int weight_range;
-    int weight_num; // max: 120
+    int weight_num;  // max: 120
    int cem[4];
    int cem_range;
-    int endpoint_value_num; // max: 32
+    int endpoint_value_num;  // max: 32
    int endpoints[4][8];
    int weights[144][2];
    int partition[144];
@@ -243,87 +203,59 @@ typedef struct {
    int nonbits;
 } IntSeqData;

-void decode_intseq(const uint8_t* buf, int offset, const int a, const int b, const int count, const int reverse, IntSeqData* out)
-{
-    static int mt[] = { 0, 2, 4, 5, 7 };
-    static int mq[] = { 0, 3, 5 };
+void decode_intseq(const uint8_t *buf, int offset, const int a, const int b, const int count, const int reverse,
+                   IntSeqData *out) {
+    static int mt[] = {0, 2, 4, 5, 7};
+    static int mq[] = {0, 3, 5};
    static int TritsTable[5][256] = {
-        { 0, 1, 2, 0, 0, 1, 2, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 0, 0, 1, 2, 1,
-            0, 1, 2, 2, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 1, 0, 1, 2, 2, 0, 1, 2, 2,
-            0, 1, 2, 0, 0, 1, 2, 1, 0, 1, 2, 2, 0, 1, 2, 1, 0, 1, 2, 0, 0, 1, 2, 1,
-            0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 0, 0, 1, 2, 1, 0, 1, 2, 2, 0, 1, 2, 2,
-            0, 1, 2, 0, 0, 1, 2, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 0, 0, 1, 2, 1,
-            0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 0, 0, 1, 2, 1, 0, 1, 2, 2, 0, 1, 2, 2,
-            0, 1, 2, 0, 0, 1, 2, 1, 0, 1, 2, 2, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 1,
-            0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 0, 0, 1, 2, 1, 0, 1, 2, 2, 0, 1, 2, 1,
-            0, 1, 2, 0, 0, 1, 2, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 0, 0, 1, 2, 1,
-            0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 0, 0, 1, 2, 1, 0, 1, 2, 2, 0, 1, 2, 2,
-            0, 1, 2, 0, 0, 1, 2, 1, 0, 1, 2, 2, 0, 1, 2, 2 },
-        { 0, 0, 0, 0, 1, 1, 1, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 0, 0, 1, 1, 1, 1, 1,
-            2, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 2, 2, 2, 0, 2, 2, 2, 0,
-            0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0,
-            2, 2, 2, 0, 2, 2, 2, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 0,
-            0, 0, 0, 0, 1, 1, 1, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 0, 0, 1, 1, 1, 1, 1,
-            2, 2, 2, 1, 2, 2, 2, 0, 0, 0, 0, 0, 1, 1, 1, 0, 2, 2, 2, 0, 2, 2, 2, 0,
-            0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
-            2, 2, 2, 0, 2, 2, 2, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1,
-            0, 0, 0, 0, 1, 1, 1, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 0, 0, 1, 1, 1, 1, 1,
-            2, 2, 2, 1, 2, 2, 2, 1, 0, 0, 0, 0, 1, 1, 1, 0, 2, 2, 2, 0, 2, 2, 2, 0,
-            0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1 },
-        { 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2,
-            1, 1, 1, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 2, 2, 2,
-            1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2,
-            0, 0, 0, 2, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 0, 0, 0, 2,
-            0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2,
-            1, 1, 1, 2, 2, 2, 2, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 2, 2, 2,
-            1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 0, 0, 0, 2, 0, 0, 0, 2,
-            0, 0, 0, 2, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2,
-            0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2,
-            1, 1, 1, 2, 1, 1, 1, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 2, 2, 2,
-            1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2 },
-        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,
-            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
-            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-            2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2 },
-        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2,
-            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-            2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,
-            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
-            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-            1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }
-    };
+      {0, 1, 2, 0, 0, 1, 2, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 0, 0, 1, 2, 1, 0, 1, 2, 2, 0, 1, 2, 0, 0, 1, 2, 0, 0,
+       1, 2, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 0, 0, 1, 2, 1, 0, 1, 2, 2, 0, 1, 2, 1, 0, 1, 2, 0, 0, 1, 2, 1, 0, 1,
+       2, 2, 0, 1, 2, 2, 0, 1, 2, 0, 0, 1, 2, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 0, 0, 1, 2, 1, 0, 1, 2, 2, 0, 1, 2,
+       2, 0, 1, 2, 0, 0, 1, 2, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 0, 0, 1, 2, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 0,
+       0, 1, 2, 1, 0, 1, 2, 2, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 0, 0, 1, 2, 1, 0,
+       1, 2, 2, 0, 1, 2, 1, 0, 1, 2, 0, 0, 1, 2, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 0, 0, 1, 2, 1, 0, 1, 2, 2, 0, 1,
+       2, 2, 0, 1, 2, 0, 0, 1, 2, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 0, 0, 1, 2, 1, 0, 1, 2, 2, 0, 1, 2, 2},
+      {0, 0, 0, 0, 1, 1, 1, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+       1, 1, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 2, 2,
+       2, 0, 2, 2, 2, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 0, 0, 0, 0, 0, 1, 1, 1, 0, 2, 2, 2, 0, 2, 2, 2,
+       0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 0, 0, 0, 0, 0, 1, 1, 1, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 0, 0, 1,
+       1, 1, 1, 1, 2, 2, 2, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2,
+       2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2,
+       2, 1, 0, 0, 0, 0, 1, 1, 1, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1},
+      {0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0,
+       0, 0, 2, 0, 0, 0, 2, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0,
+       0, 2, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 2, 2,
+       2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 2, 2, 2, 1, 1, 1, 2,
+       1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1,
+       1, 1, 2, 1, 1, 1, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1,
+       1, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2},
+      {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+       2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2},
+      {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
+       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}};
    static int QuintsTable[3][128] = {
-        { 0, 1, 2, 3, 4, 0, 4, 4, 0, 1, 2, 3, 4, 1, 4, 4, 0, 1, 2, 3, 4, 2,
-            4, 4, 0, 1, 2, 3, 4, 3, 4, 4, 0, 1, 2, 3, 4, 0, 4, 0, 0, 1, 2, 3,
-            4, 1, 4, 1, 0, 1, 2, 3, 4, 2, 4, 2, 0, 1, 2, 3, 4, 3, 4, 3, 0, 1,
-            2, 3, 4, 0, 2, 3, 0, 1, 2, 3, 4, 1, 2, 3, 0, 1, 2, 3, 4, 2, 2, 3,
-            0, 1, 2, 3, 4, 3, 2, 3, 0, 1, 2, 3, 4, 0, 0, 1, 0, 1, 2, 3, 4, 1,
-            0, 1, 0, 1, 2, 3, 4, 2, 0, 1, 0, 1, 2, 3, 4, 3, 0, 1 },
-        { 0, 0, 0, 0, 0, 4, 4, 4, 1, 1, 1, 1, 1, 4, 4, 4, 2, 2, 2, 2, 2, 4,
-            4, 4, 3, 3, 3, 3, 3, 4, 4, 4, 0, 0, 0, 0, 0, 4, 0, 4, 1, 1, 1, 1,
-            1, 4, 1, 4, 2, 2, 2, 2, 2, 4, 2, 4, 3, 3, 3, 3, 3, 4, 3, 4, 0, 0,
-            0, 0, 0, 4, 0, 0, 1, 1, 1, 1, 1, 4, 1, 1, 2, 2, 2, 2, 2, 4, 2, 2,
-            3, 3, 3, 3, 3, 4, 3, 3, 0, 0, 0, 0, 0, 4, 0, 0, 1, 1, 1, 1, 1, 4,
-            1, 1, 2, 2, 2, 2, 2, 4, 2, 2, 3, 3, 3, 3, 3, 4, 3, 3 },
-        { 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 0, 0,
-            2, 4, 0, 0, 0, 0, 0, 0, 3, 4, 1, 1, 1, 1, 1, 1, 4, 4, 1, 1, 1, 1,
-            1, 1, 4, 4, 1, 1, 1, 1, 1, 1, 4, 4, 1, 1, 1, 1, 1, 1, 4, 4, 2, 2,
-            2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 2, 4, 4,
-            2, 2, 2, 2, 2, 2, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3, 3, 3, 3,
-            4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4 }
-    };
+      {0, 1, 2, 3, 4, 0, 4, 4, 0, 1, 2, 3, 4, 1, 4, 4, 0, 1, 2, 3, 4, 2, 4, 4, 0, 1, 2, 3, 4, 3, 4, 4,
+       0, 1, 2, 3, 4, 0, 4, 0, 0, 1, 2, 3, 4, 1, 4, 1, 0, 1, 2, 3, 4, 2, 4, 2, 0, 1, 2, 3, 4, 3, 4, 3,
+       0, 1, 2, 3, 4, 0, 2, 3, 0, 1, 2, 3, 4, 1, 2, 3, 0, 1, 2, 3, 4, 2, 2, 3, 0, 1, 2, 3, 4, 3, 2, 3,
+       0, 1, 2, 3, 4, 0, 0, 1, 0, 1, 2, 3, 4, 1, 0, 1, 0, 1, 2, 3, 4, 2, 0, 1, 0, 1, 2, 3, 4, 3, 0, 1},
+      {0, 0, 0, 0, 0, 4, 4, 4, 1, 1, 1, 1, 1, 4, 4, 4, 2, 2, 2, 2, 2, 4, 4, 4, 3, 3, 3, 3, 3, 4, 4, 4,
+       0, 0, 0, 0, 0, 4, 0, 4, 1, 1, 1, 1, 1, 4, 1, 4, 2, 2, 2, 2, 2, 4, 2, 4, 3, 3, 3, 3, 3, 4, 3, 4,
+       0, 0, 0, 0, 0, 4, 0, 0, 1, 1, 1, 1, 1, 4, 1, 1, 2, 2, 2, 2, 2, 4, 2, 2, 3, 3, 3, 3, 3, 4, 3, 3,
+       0, 0, 0, 0, 0, 4, 0, 0, 1, 1, 1, 1, 1, 4, 1, 1, 2, 2, 2, 2, 2, 4, 2, 2, 3, 3, 3, 3, 3, 4, 3, 3},
+      {0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 0, 3, 4,
+       1, 1, 1, 1, 1, 1, 4, 4, 1, 1, 1, 1, 1, 1, 4, 4, 1, 1, 1, 1, 1, 1, 4, 4, 1, 1, 1, 1, 1, 1, 4, 4,
+       2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 2, 4, 4,
+       3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4}};

    if (count <= 0)
        return;
@@ -341,17 +273,18 @@ void decode_intseq(const uint8_t* buf, int offset, const int a, const int b, con
            for (int i = 0, p = offset; i < block_count; i++, p -= block_size) {
                int now_size = (i < block_count - 1) ? block_size : last_block_size;
                uint_fast64_t d = bit_reverse_u64(getbits64(buf, p - now_size, now_size), now_size);
-                int x = (d >> b & 3) | (d >> b * 2 & 0xc) | (d >> b * 3 & 0x10) | (d >> b * 4 & 0x60) | (d >> b * 5 & 0x80);
+                int x =
+                  (d >> b & 3) | (d >> b * 2 & 0xc) | (d >> b * 3 & 0x10) | (d >> b * 4 & 0x60) | (d >> b * 5 & 0x80);
                for (int j = 0; j < 5 && n < count; j++, n++)
-                    out[n] = (IntSeqData){ d >> (mt[j] + b * j) & mask, TritsTable[j][x] };
+                    out[n] = (IntSeqData){d >> (mt[j] + b * j) & mask, TritsTable[j][x]};
            }
        } else {
            for (int i = 0, p = offset; i < block_count; i++, p += block_size) {
-                uint_fast64_t d = getbits64(
-                    buf, p, (i < block_count - 1) ? block_size : last_block_size);
-                int x = (d >> b & 3) | (d >> b * 2 & 0xc) | (d >> b * 3 & 0x10) | (d >> b * 4 & 0x60) | (d >> b * 5 & 0x80);
+                uint_fast64_t d = getbits64(buf, p, (i < block_count - 1) ? block_size : last_block_size);
+                int x =
+                  (d >> b & 3) | (d >> b * 2 & 0xc) | (d >> b * 3 & 0x10) | (d >> b * 4 & 0x60) | (d >> b * 5 & 0x80);
                for (int j = 0; j < 5 && n < count; j++, n++)
-                    out[n] = (IntSeqData){ d >> (mt[j] + b * j) & mask, TritsTable[j][x] };
+                    out[n] = (IntSeqData){d >> (mt[j] + b * j) & mask, TritsTable[j][x]};
            }
        }
    } else if (a == 5) {
@@ -367,29 +300,27 @@ void decode_intseq(const uint8_t* buf, int offset, const int a, const int b, con
                uint_fast64_t d = bit_reverse_u64(getbits64(buf, p - now_size, now_size), now_size);
                int x = (d >> b & 7) | (d >> b * 2 & 0x18) | (d >> b * 3 & 0x60);
                for (int j = 0; j < 3 && n < count; j++, n++)
-                    out[n] = (IntSeqData){ d >> (mq[j] + b * j) & mask, QuintsTable[j][x] };
+                    out[n] = (IntSeqData){d >> (mq[j] + b * j) & mask, QuintsTable[j][x]};
            }
        } else {
            for (int i = 0, p = offset; i < block_count; i++, p += block_size) {
-                uint_fast64_t d = getbits64(
-                    buf, p, (i < block_count - 1) ? block_size : last_block_size);
+                uint_fast64_t d = getbits64(buf, p, (i < block_count - 1) ? block_size : last_block_size);
                int x = (d >> b & 7) | (d >> b * 2 & 0x18) | (d >> b * 3 & 0x60);
                for (int j = 0; j < 3 && n < count; j++, n++)
-                    out[n] = (IntSeqData){ d >> (mq[j] + b * j) & mask, QuintsTable[j][x] };
+                    out[n] = (IntSeqData){d >> (mq[j] + b * j) & mask, QuintsTable[j][x]};
            }
        }
    } else {
        if (reverse)
            for (int p = offset - b; n < count; n++, p -= b)
-                out[n] = (IntSeqData){ bit_reverse_u8(getbits(buf, p, b), b), 0 };
+                out[n] = (IntSeqData){bit_reverse_u8(getbits(buf, p, b), b), 0};
        else
            for (int p = offset; n < count; n++, p += b)
-                out[n] = (IntSeqData){ getbits(buf, p, b), 0 };
+                out[n] = (IntSeqData){getbits(buf, p, b), 0};
    }
 }

-void decode_block_params(const uint8_t* buf, BlockData* block_data)
-{
+void decode_block_params(const uint8_t *buf, BlockData *block_data) {
    block_data->dual_plane = !!(buf[1] & 4);
    block_data->weight_range = (buf[0] >> 4 & 1) | (buf[1] << 2 & 8);

@@ -454,10 +385,12 @@ void decode_block_params(const uint8_t* buf, BlockData* block_data)

    switch (WeightPrecTableA[block_data->weight_range]) {
    case 3:
-        weight_bits = block_data->weight_num * WeightPrecTableB[block_data->weight_range] + (block_data->weight_num * 8 + 4) / 5;
+        weight_bits =
+          block_data->weight_num * WeightPrecTableB[block_data->weight_range] + (block_data->weight_num * 8 + 4) / 5;
        break;
    case 5:
-        weight_bits = block_data->weight_num * WeightPrecTableB[block_data->weight_range] + (block_data->weight_num * 7 + 2) / 3;
+        weight_bits =
+          block_data->weight_num * WeightPrecTableB[block_data->weight_range] + (block_data->weight_num * 7 + 2) / 3;
        break;
    default:
        weight_bits = block_data->weight_num * WeightPrecTableB[block_data->weight_range];
@@ -498,7 +431,8 @@ void decode_block_params(const uint8_t* buf, BlockData* block_data)

    if (block_data->dual_plane) {
        config_bits += 2;
-        block_data->plane_selector = getbits(buf, cem_base ? 130 - weight_bits - block_data->part_num * 3 : 126 - weight_bits, 2);
+        block_data->plane_selector =
+          getbits(buf, cem_base ? 130 - weight_bits - block_data->part_num * 3 : 126 - weight_bits, 2);
    }

    int remain_bits = 128 - config_bits - weight_bits;
@@ -510,10 +444,12 @@ void decode_block_params(const uint8_t* buf, BlockData* block_data)
    for (int i = 0, endpoint_bits; i < (int)(sizeof(CemTableA) / sizeof(int)); i++) {
        switch (CemTableA[i]) {
        case 3:
-            endpoint_bits = block_data->endpoint_value_num * CemTableB[i] + (block_data->endpoint_value_num * 8 + 4) / 5;
+            endpoint_bits =
+              block_data->endpoint_value_num * CemTableB[i] + (block_data->endpoint_value_num * 8 + 4) / 5;
            break;
        case 5:
-            endpoint_bits = block_data->endpoint_value_num * CemTableB[i] + (block_data->endpoint_value_num * 7 + 2) / 3;
+            endpoint_bits =
+              block_data->endpoint_value_num * CemTableB[i] + (block_data->endpoint_value_num * 7 + 2) / 3;
            break;
        default:
            endpoint_bits = block_data->endpoint_value_num * CemTableB[i];
@@ -526,8 +462,7 @@ void decode_block_params(const uint8_t* buf, BlockData* block_data)
    }
 }

-void decode_endpoints_hdr7(int* endpoints, int* v)
-{
+void decode_endpoints_hdr7(int *endpoints, int *v) {
    int modeval = (v[2] >> 4 & 0x8) | (v[1] >> 5 & 0x4) | (v[0] >> 6);
    int major_component, mode;
    if ((modeval & 0xc) != 0xc) {
@@ -540,7 +475,7 @@ void decode_endpoints_hdr7(int* endpoints, int* v)
        major_component = 0;
        mode = 5;
    }
-    int c[] = { v[0] & 0x3f, v[1] & 0x1f, v[2] & 0x1f, v[3] & 0x1f };
+    int c[] = {v[0] & 0x3f, v[1] & 0x1f, v[2] & 0x1f, v[3] & 0x1f};

    switch (mode) {
    case 0:
@@ -621,11 +556,11 @@ void decode_endpoints_hdr7(int* endpoints, int* v)
        set_endpoint_hdr_clamp(endpoints, c[0] - c[3], c[1] - c[3], c[2] - c[3], 0x780, c[0], c[1], c[2], 0x780);
 }

-void decode_endpoints_hdr11(int* endpoints, int* v, int alpha1, int alpha2)
-{
+void decode_endpoints_hdr11(int *endpoints, int *v, int alpha1, int alpha2) {
    int major_component = (v[4] >> 7) | (v[5] >> 6 & 2);
    if (major_component == 3) {
-        set_endpoint_hdr(endpoints, v[0] << 4, v[2] << 4, v[4] << 5 & 0xfe0, alpha1, v[1] << 4, v[3] << 4, v[5] << 5 & 0xfe0, alpha2);
+        set_endpoint_hdr(endpoints, v[0] << 4, v[2] << 4, v[4] << 5 & 0xfe0, alpha1, v[1] << 4, v[3] << 4,
+                         v[5] << 5 & 0xfe0, alpha2);
        return;
    }
    int mode = (v[1] >> 7) | (v[2] >> 6 & 2) | (v[3] >> 5 & 4);
@@ -726,20 +661,23 @@ void decode_endpoints_hdr11(int* endpoints, int* v, int alpha1, int alpha2)
    vd1 *= mult;

    if (major_component == 1)
-        set_endpoint_hdr_clamp(endpoints, va - vb0 - vc - vd0, va - vc, va - vb1 - vc - vd1, alpha1, va - vb0, va, va - vb1, alpha2);
+        set_endpoint_hdr_clamp(endpoints, va - vb0 - vc - vd0, va - vc, va - vb1 - vc - vd1, alpha1, va - vb0, va,
+                               va - vb1, alpha2);
    else if (major_component == 2)
-        set_endpoint_hdr_clamp(endpoints, va - vb1 - vc - vd1, va - vb0 - vc - vd0, va - vc, alpha1, va - vb1, va - vb0, va, alpha2);
+        set_endpoint_hdr_clamp(endpoints, va - vb1 - vc - vd1, va - vb0 - vc - vd0, va - vc, alpha1, va - vb1, va - vb0,
+                               va, alpha2);
    else
-        set_endpoint_hdr_clamp(endpoints, va - vc, va - vb0 - vc - vd0, va - vb1 - vc - vd1, alpha1, va, va - vb0, va - vb1, alpha2);
+        set_endpoint_hdr_clamp(endpoints, va - vc, va - vb0 - vc - vd0, va - vb1 - vc - vd1, alpha1, va, va - vb0,
+                               va - vb1, alpha2);
 }

-void decode_endpoints(const uint8_t* buf, BlockData* data)
-{
-    static const int TritsTable[] = { 0, 204, 93, 44, 22, 11, 5 };
-    static const int QuintsTable[] = { 0, 113, 54, 26, 13, 6 };
+void decode_endpoints(const uint8_t *buf, BlockData *data) {
+    static const int TritsTable[] = {0, 204, 93, 44, 22, 11, 5};
+    static const int QuintsTable[] = {0, 113, 54, 26, 13, 6};
    IntSeqData seq[32];
    int ev[32];
-    decode_intseq(buf, data->part_num == 1 ? 17 : 29, CemTableA[data->cem_range], CemTableB[data->cem_range], data->endpoint_value_num, 0, seq);
+    decode_intseq(buf, data->part_num == 1 ? 17 : 29, CemTableA[data->cem_range], CemTableB[data->cem_range],
+                  data->endpoint_value_num, 0, seq);

    switch (CemTableA[data->cem_range]) {
    case 3:
@@ -830,7 +768,7 @@ void decode_endpoints(const uint8_t* buf, BlockData* data)
        }
    }

-    int* v = ev;
+    int *v = ev;
    for (int cem = 0; cem < data->part_num; v += (data->cem[cem] / 4 + 1) * 2, cem++) {
        switch (data->cem[cem]) {
        case 0:
@@ -874,7 +812,8 @@ void decode_endpoints(const uint8_t* buf, BlockData* data)
            set_endpoint_clamp(data->endpoints[cem], v[0], v[0], v[0], v[2], v[1], v[1], v[1], v[2] + v[3]);
            break;
        case 6:
-            set_endpoint(data->endpoints[cem], v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8, 255, v[0], v[1], v[2], 255);
+            set_endpoint(data->endpoints[cem], v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8, 255, v[0], v[1],
+                         v[2], 255);
            break;
        case 7:
            decode_endpoints_hdr7(data->endpoints[cem], v);
@@ -890,12 +829,15 @@ void decode_endpoints(const uint8_t* buf, BlockData* data)
            bit_transfer_signed(&v[3], &v[2]);
            bit_transfer_signed(&v[5], &v[4]);
            if (v[1] + v[3] + v[5] >= 0)
-                set_endpoint_clamp(data->endpoints[cem], v[0], v[2], v[4], 255, v[0] + v[1], v[2] + v[3], v[4] + v[5], 255);
+                set_endpoint_clamp(data->endpoints[cem], v[0], v[2], v[4], 255, v[0] + v[1], v[2] + v[3], v[4] + v[5],
+                                   255);
            else
-                set_endpoint_blue_clamp(data->endpoints[cem], v[0] + v[1], v[2] + v[3], v[4] + v[5], 255, v[0], v[2], v[4], 255);
+                set_endpoint_blue_clamp(data->endpoints[cem], v[0] + v[1], v[2] + v[3], v[4] + v[5], 255, v[0], v[2],
+                                        v[4], 255);
            break;
        case 10:
-            set_endpoint(data->endpoints[cem], v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8, v[4], v[0], v[1], v[2], v[5]);
+            set_endpoint(data->endpoints[cem], v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8, v[4], v[0], v[1],
+                         v[2], v[5]);
            break;
        case 11:
            decode_endpoints_hdr11(data->endpoints[cem], v, 0x780, 0x780);
@@ -912,9 +854,11 @@ void decode_endpoints(const uint8_t* buf, BlockData* data)
            bit_transfer_signed(&v[5], &v[4]);
            bit_transfer_signed(&v[7], &v[6]);
            if (v[1] + v[3] + v[5] >= 0)
-                set_endpoint_clamp(data->endpoints[cem], v[0], v[2], v[4], v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5], v[6] + v[7]);
+                set_endpoint_clamp(data->endpoints[cem], v[0], v[2], v[4], v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5],
+                                   v[6] + v[7]);
            else
-                set_endpoint_blue_clamp(data->endpoints[cem], v[0] + v[1], v[2] + v[3], v[4] + v[5], v[6] + v[7], v[0], v[2], v[4], v[6]);
+                set_endpoint_blue_clamp(data->endpoints[cem], v[0] + v[1], v[2] + v[3], v[4] + v[5], v[6] + v[7], v[0],
+                                        v[2], v[4], v[6]);
            break;
        case 14:
            decode_endpoints_hdr11(data->endpoints[cem], v, v[6], v[7]);
@@ -939,12 +883,11 @@ void decode_endpoints(const uint8_t* buf, BlockData* data)
    }
 }

-void decode_weights(const uint8_t* buf, BlockData* data)
-{
+void decode_weights(const uint8_t *buf, BlockData *data) {
    IntSeqData seq[128];
    int wv[128] = {};
-    decode_intseq(buf, 128, WeightPrecTableA[data->weight_range],
-        WeightPrecTableB[data->weight_range], data->weight_num, 1, seq);
+    decode_intseq(buf, 128, WeightPrecTableA[data->weight_range], WeightPrecTableB[data->weight_range],
+                  data->weight_num, 1, seq);

    if (WeightPrecTableA[data->weight_range] == 0) {
        switch (WeightPrecTableB[data->weight_range]) {
@@ -1045,10 +988,9 @@ void decode_weights(const uint8_t* buf, BlockData* data)
    }
 }

-void select_partition(const uint8_t* buf, BlockData* data)
-{
+void select_partition(const uint8_t *buf, BlockData *data) {
    int small_block = data->bw * data->bh < 31;
-    int seed = (*(int*)buf >> 13 & 0x3ff) | (data->part_num - 1) << 10;
+    int seed = (*(int *)buf >> 13 & 0x3ff) | (data->part_num - 1) << 10;

    uint32_t rnum = seed;
    rnum ^= rnum >> 15;
@@ -1068,7 +1010,7 @@ void select_partition(const uint8_t* buf, BlockData* data)
        seeds[i] *= seeds[i];
    }

-    int sh[2] = { seed & 2 ? 4 : 5, data->part_num == 3 ? 6 : 5 };
+    int sh[2] = {seed & 2 ? 4 : 5, data->part_num == 3 ? 6 : 5};

    if (seed & 1)
        for (int i = 0; i < 8; i++)
@@ -1102,63 +1044,73 @@ void select_partition(const uint8_t* buf, BlockData* data)
    }
 }

-void applicate_color(const BlockData* data, uint32_t* outbuf)
-{
+void applicate_color(const BlockData *data, uint32_t *outbuf) {
    static const t_select_folor_func_ptr FuncTableC[] = {
-        select_color, select_color, select_color_hdr, select_color_hdr,
-        select_color, select_color, select_color, select_color_hdr,
-        select_color, select_color, select_color, select_color_hdr,
-        select_color, select_color, select_color_hdr, select_color_hdr
-    };
+      select_color, select_color,     select_color_hdr, select_color_hdr, select_color, select_color,
+      select_color, select_color_hdr, select_color,     select_color,     select_color, select_color_hdr,
+      select_color, select_color,     select_color_hdr, select_color_hdr};
    static const t_select_folor_func_ptr FuncTableA[] = {
-        select_color, select_color, select_color_hdr, select_color_hdr,
-        select_color, select_color, select_color, select_color_hdr,
-        select_color, select_color, select_color, select_color_hdr,
-        select_color, select_color, select_color, select_color_hdr
-    };
+      select_color, select_color,     select_color_hdr, select_color_hdr, select_color, select_color,
+      select_color, select_color_hdr, select_color,     select_color,     select_color, select_color_hdr,
+      select_color, select_color,     select_color,     select_color_hdr};
    if (data->dual_plane) {
-        int ps[] = { 0, 0, 0, 0 };
+        int ps[] = {0, 0, 0, 0};
        ps[data->plane_selector] = 1;
        if (data->part_num > 1) {
            for (int i = 0; i < data->bw * data->bh; i++) {
                int p = data->partition[i];
-                uint_fast8_t r = FuncTableC[data->cem[p]](data->endpoints[p][0], data->endpoints[p][4], data->weights[i][ps[0]]);
-                uint_fast8_t g = FuncTableC[data->cem[p]](data->endpoints[p][1], data->endpoints[p][5], data->weights[i][ps[1]]);
-                uint_fast8_t b = FuncTableC[data->cem[p]](data->endpoints[p][2], data->endpoints[p][6], data->weights[i][ps[2]]);
-                uint_fast8_t a = FuncTableA[data->cem[p]](data->endpoints[p][3], data->endpoints[p][7], data->weights[i][ps[3]]);
+                uint_fast8_t r =
+                  FuncTableC[data->cem[p]](data->endpoints[p][0], data->endpoints[p][4], data->weights[i][ps[0]]);
+                uint_fast8_t g =
+                  FuncTableC[data->cem[p]](data->endpoints[p][1], data->endpoints[p][5], data->weights[i][ps[1]]);
+                uint_fast8_t b =
+                  FuncTableC[data->cem[p]](data->endpoints[p][2], data->endpoints[p][6], data->weights[i][ps[2]]);
+                uint_fast8_t a =
+                  FuncTableA[data->cem[p]](data->endpoints[p][3], data->endpoints[p][7], data->weights[i][ps[3]]);
                outbuf[i] = color(r, g, b, a);
            }
        } else {
            for (int i = 0; i < data->bw * data->bh; i++) {
-                uint_fast8_t r = FuncTableC[data->cem[0]](data->endpoints[0][0], data->endpoints[0][4], data->weights[i][ps[0]]);
-                uint_fast8_t g = FuncTableC[data->cem[0]](data->endpoints[0][1], data->endpoints[0][5], data->weights[i][ps[1]]);
-                uint_fast8_t b = FuncTableC[data->cem[0]](data->endpoints[0][2], data->endpoints[0][6], data->weights[i][ps[2]]);
-                uint_fast8_t a = FuncTableA[data->cem[0]](data->endpoints[0][3], data->endpoints[0][7], data->weights[i][ps[3]]);
+                uint_fast8_t r =
+                  FuncTableC[data->cem[0]](data->endpoints[0][0], data->endpoints[0][4], data->weights[i][ps[0]]);
+                uint_fast8_t g =
+                  FuncTableC[data->cem[0]](data->endpoints[0][1], data->endpoints[0][5], data->weights[i][ps[1]]);
+                uint_fast8_t b =
+                  FuncTableC[data->cem[0]](data->endpoints[0][2], data->endpoints[0][6], data->weights[i][ps[2]]);
+                uint_fast8_t a =
+                  FuncTableA[data->cem[0]](data->endpoints[0][3], data->endpoints[0][7], data->weights[i][ps[3]]);
                outbuf[i] = color(r, g, b, a);
            }
        }
    } else if (data->part_num > 1) {
        for (int i = 0; i < data->bw * data->bh; i++) {
            int p = data->partition[i];
-            uint_fast8_t r = FuncTableC[data->cem[p]](data->endpoints[p][0], data->endpoints[p][4], data->weights[i][0]);
-            uint_fast8_t g = FuncTableC[data->cem[p]](data->endpoints[p][1], data->endpoints[p][5], data->weights[i][0]);
-            uint_fast8_t b = FuncTableC[data->cem[p]](data->endpoints[p][2], data->endpoints[p][6], data->weights[i][0]);
-            uint_fast8_t a = FuncTableA[data->cem[p]](data->endpoints[p][3], data->endpoints[p][7], data->weights[i][0]);
+            uint_fast8_t r =
+              FuncTableC[data->cem[p]](data->endpoints[p][0], data->endpoints[p][4], data->weights[i][0]);
+            uint_fast8_t g =
+              FuncTableC[data->cem[p]](data->endpoints[p][1], data->endpoints[p][5], data->weights[i][0]);
+            uint_fast8_t b =
+              FuncTableC[data->cem[p]](data->endpoints[p][2], data->endpoints[p][6], data->weights[i][0]);
+            uint_fast8_t a =
+              FuncTableA[data->cem[p]](data->endpoints[p][3], data->endpoints[p][7], data->weights[i][0]);
            outbuf[i] = color(r, g, b, a);
        }
    } else {
        for (int i = 0; i < data->bw * data->bh; i++) {
-            uint_fast8_t r = FuncTableC[data->cem[0]](data->endpoints[0][0], data->endpoints[0][4], data->weights[i][0]);
-            uint_fast8_t g = FuncTableC[data->cem[0]](data->endpoints[0][1], data->endpoints[0][5], data->weights[i][0]);
-            uint_fast8_t b = FuncTableC[data->cem[0]](data->endpoints[0][2], data->endpoints[0][6], data->weights[i][0]);
-            uint_fast8_t a = FuncTableA[data->cem[0]](data->endpoints[0][3], data->endpoints[0][7], data->weights[i][0]);
+            uint_fast8_t r =
+              FuncTableC[data->cem[0]](data->endpoints[0][0], data->endpoints[0][4], data->weights[i][0]);
+            uint_fast8_t g =
+              FuncTableC[data->cem[0]](data->endpoints[0][1], data->endpoints[0][5], data->weights[i][0]);
+            uint_fast8_t b =
+              FuncTableC[data->cem[0]](data->endpoints[0][2], data->endpoints[0][6], data->weights[i][0]);
+            uint_fast8_t a =
+              FuncTableA[data->cem[0]](data->endpoints[0][3], data->endpoints[0][7], data->weights[i][0]);
            outbuf[i] = color(r, g, b, a);
        }
    }
 }

-void decode_block(const uint8_t* buf, const int bw, const int bh, uint32_t* outbuf)
-{
+void decode_block(const uint8_t *buf, const int bw, const int bh, uint32_t *outbuf) {
    if (buf[0] == 0xfc && (buf[1] & 1) == 1) {
        // void-extent
        uint_fast32_t c;
@@ -1186,21 +1138,16 @@ void decode_block(const uint8_t* buf, const int bw, const int bh, uint32_t* outb
    }
 }

-void decode_astc(const uint8_t* data, const int w, const int h, const int bw, const int bh, uint32_t* image)
-{
-    const int num_blocks_x = (w + bw - 1) / bw;
-    const int num_blocks_y = (h + bh - 1) / bh;
-    const int copy_length_last = (w + bw - 1) % bw + 1;
-    uint32_t buf[144];
-    uint32_t* buf_end = buf + bw * bh;
-    const uint8_t* ptr = data;
-    for (int by = 0; by < num_blocks_y; by++) {
-        for (int bx = 0, x = 0; bx < num_blocks_x; bx++, ptr += 16, x += bw) {
-            decode_block(ptr, bw, bh, buf);
-            int copy_length = (bx < num_blocks_x - 1 ? bw : copy_length_last) * 4;
-            uint32_t* b = buf;
-            for (int y = h - by * bh - 1; b < buf_end && y >= 0; y--, b += bw)
-                memcpy(image + y * w + x, b, copy_length);
+int decode_astc(const uint8_t *data, const long w, const long h, const int bw, const int bh, uint32_t *image) {
+    const long num_blocks_x = (w + bw - 1) / bw;
+    const long num_blocks_y = (h + bh - 1) / bh;
+    uint32_t buffer[144];
+    const uint8_t *d = data;
+    for (long by = 0; by < num_blocks_y; by++) {
+        for (long bx = 0; bx < num_blocks_x; bx++, d += 16) {
+            decode_block(d, bw, bh, buffer);
+            copy_block_buffer(bx, by, w, h, bw, bh, buffer, image);
        }
    }
+    return 1;
 }
@@ -3,6 +3,6 @@

 #include <stdint.h>

-void decode_astc(const uint8_t*, const int, const int, const int, const int, uint32_t*);
+int decode_astc(const uint8_t *, const long, const long, const int, const int, uint32_t *);

 #endif /* end of include guard: ASTC_H */
@@ -0,0 +1,87 @@
+#ifndef COLOR_H
+#define COLOR_H
+
+#include <stdint.h>
+#include <string.h>
+#include "endianness.h"
+
+#ifdef __LITTLE_ENDIAN__
+static const uint_fast32_t TRANSPARENT_MASK = 0x00ffffff;
+#else
+static const uint_fast32_t TRANSPARENT_MASK = 0xffffff00;
+#endif
+
+/**
+ * Pack four 8-bit channels into a single pixel value.
+ *
+ * When __LITTLE_ENDIAN__ is defined, r occupies the least significant byte and a the most
+ * significant one; otherwise the order is reversed (a lowest, r highest).
+ *
+ * Every channel is widened to uint_fast32_t before it is shifted. A uint8_t operand is promoted
+ * to signed int, so shifting a value >= 0x80 left by 24 would overflow int (undefined behavior)
+ * and could sign-extend into the upper bits when uint_fast32_t is wider than 32 bits.
+ */
+static inline uint_fast32_t color(uint8_t r, uint8_t g, uint8_t b, uint8_t a) {
+#ifdef __LITTLE_ENDIAN__
+    return (uint_fast32_t)r | (uint_fast32_t)g << 8 | (uint_fast32_t)b << 16 | (uint_fast32_t)a << 24;
+#else
+    return (uint_fast32_t)a | (uint_fast32_t)b << 8 | (uint_fast32_t)g << 16 | (uint_fast32_t)r << 24;
+#endif
+}
+
+/**
+ * Build an AND-mask that keeps every bit covered by TRANSPARENT_MASK and sets the remaining
+ * (alpha) byte to a.
+ *
+ * The alpha value is widened to uint_fast32_t before shifting: as a uint8_t it would be promoted
+ * to signed int, and shifting a value >= 0x80 left by 24 overflows int (undefined behavior).
+ */
+static inline uint_fast32_t alpha_mask(uint8_t a) {
+#ifdef __LITTLE_ENDIAN__
+    return TRANSPARENT_MASK | (uint_fast32_t)a << 24;
+#else
+    return TRANSPARENT_MASK | a;
+#endif
+}
+
+static inline void rgb565_le(const uint16_t d, uint8_t *r, uint8_t *g, uint8_t *b) {
+#ifdef __LITTLE_ENDIAN__
+    *r = (d >> 8 & 0xf8) | (d >> 13);
+    *g = (d >> 3 & 0xfc) | (d >> 9 & 3);
+    *b = (d << 3) | (d >> 2 & 7);
+#else
+    *r = (d & 0xf8) | (d >> 5 & 7);
+    *g = (d << 5 & 0xe0) | (d >> 11 & 0x1c) | (d >> 1 & 3);
+    *b = (d >> 5 & 0xf8) | (d >> 10 & 0x7);
+#endif
+}
+
+static inline void rgb565_be(const uint16_t d, uint8_t *r, uint8_t *g, uint8_t *b) {
+#ifdef __BIG_ENDIAN__
+    *r = (d >> 8 & 0xf8) | (d >> 13);
+    *g = (d >> 3 & 0xfc) | (d >> 9 & 3);
+    *b = (d << 3) | (d >> 2 & 7);
+#else
+    *r = (d & 0xf8) | (d >> 5 & 7);
+    *g = (d << 5 & 0xe0) | (d >> 11 & 0x1c) | (d >> 1 & 3);
+    *b = (d >> 5 & 0xf8) | (d >> 10 & 0x7);
+#endif
+}
+
+static inline void rgb565_lep(const uint16_t d, uint8_t *c) {
+#ifdef __LITTLE_ENDIAN__
+    *(c++) = (d >> 8 & 0xf8) | (d >> 13);
+    *(c++) = (d >> 3 & 0xfc) | (d >> 9 & 3);
+    *(c++) = (d << 3) | (d >> 2 & 7);
+#else
+    *(c++) = (d & 0xf8) | (d >> 5 & 7);
+    *(c++) = (d << 5 & 0xe0) | (d >> 11 & 0x1c) | (d >> 1 & 3);
+    *(c++) = (d >> 5 & 0xf8) | (d >> 10 & 0x7);
+#endif
+}
+
+static inline void rgb565_bep(const uint16_t d, uint8_t *c) {
+#ifdef __BIG_ENDIAN__
+    *(c++) = (d >> 8 & 0xf8) | (d >> 13);
+    *(c++) = (d >> 3 & 0xfc) | (d >> 9 & 3);
+    *(c++) = (d << 3) | (d >> 2 & 7);
+#else
+    *(c++) = (d & 0xf8) | (d >> 5 & 7);
+    *(c++) = (d << 5 & 0xe0) | (d >> 11 & 0x1c) | (d >> 1 & 3);
+    *(c++) = (d >> 5 & 0xf8) | (d >> 10 & 0x7);
+#endif
+}
+
+static inline void copy_block_buffer(const long bx, const long by, const long w, const long h, const long bw,
+                                     const long bh, const uint32_t *buffer, uint32_t *image) {
+    long x = bw * bx;
+    long xl = (bw * (bx + 1) > w ? w - bw * bx : bw) * 4;
+    const uint32_t *buffer_end = buffer + bw * bh;
+    for (long y = h - by * bh; buffer < buffer_end && y-- > 0; buffer += bw)
+        memcpy(image + y * w + x, buffer, xl);
+}
+
+#endif /* end of include guard: COLOR_H */
@@ -1,44 +0,0 @@
-#include <ruby.h>
-
-/* https://github.com/ruby/ruby/blob/master/siphash.c */
-
-#ifdef _WIN32
-#define BYTE_ORDER __LITTLE_ENDIAN
-#elif !defined BYTE_ORDER
-#include <endian.h>
-#endif
-
-#ifndef BYTE_ORDER
-#if defined(__BYTE_ORDER__)
-#define BYTE_ORDER __BYTE_ORDER__
-#elif defined(__BYTE_ORDER)
-#define BYTE_ORDER __BYTE_ORDER
-#else
-#error "Neither BYTE_ORDER nor __BYTE_ORDER__ is defined."
-#endif
-#endif
-
-#ifndef LITTLE_ENDIAN
-#if defined(__LITTLE_ENDIAN)
-#define LITTLE_ENDIAN __LITTLE_ENDIAN
-#define BIG_ENDIAN __BIG_ENDIAN
-#elif defined(__LITTLE_ENDIAN__)
-#define LITTLE_ENDIAN __LITTLE_ENDIAN__
-#define BIG_ENDIAN __BIG_ENDIAN__
-#elif defined(__ORDER_LITTLE_ENDIAN__)
-#define LITTLE_ENDIAN __ORDER_LITTLE_ENDIAN__
-#define BIG_ENDIAN __ORDER_BIG_ENDIAN__
-#else
-#error "Neither LITTLE_ENDIAN, __LITTLE_ENDIAN, nor __ORDER_LITTLE_ENDIAN__ is defined."
-#endif
-#endif
-
-#if BYTE_ORDER == LITTLE_ENDIAN
-#define IS_LITTLE_ENDIAN 1
-#define IS_BIG_ENDIAN 0
-#elif BYTE_ORDER == BIG_ENDIAN
-#define IS_LITTLE_ENDIAN 0
-#define IS_BIG_ENDIAN 1
-#else
-#error "Only strictly little or big endian supported"
-#endif
@@ -1,38 +1,16 @@
 #include "dxtc.h"
-#include "common.h"
 #include <stdint.h>
 #include <string.h>
+#include "color.h"
+#include "endianness.h"

-static inline uint_fast32_t color(uint_fast8_t r, uint_fast8_t g, uint_fast8_t b, uint_fast8_t a)
-{
-#if BYTE_ORDER == LITTLE_ENDIAN
-    return r | g << 8 | b << 16 | a << 24;
-#else
-    return a | b << 8 | g << 16 | r << 24;
-#endif
-}
-
-static inline void rgb565(const uint16_t d, uint8_t* r, uint8_t* g, uint8_t* b)
-{
-#if BYTE_ORDER == LITTLE_ENDIAN
-    *r = (d >> 8 & 0xf8) | (d >> 13);
-    *g = (d >> 3 & 0xfc) | (d >> 9 & 3);
-    *b = (d << 3) | (d >> 2 & 7);
-#else
-    *r = (d & 0xf8) | (d >> 5 & 7);
-    *g = (d << 5 & 0xe0) | (d >> 11 & 0x1c) | (d >> 1 & 3);
-    *b = (d >> 5 & 0xf8) | (d >> 10 & 0x7);
-#endif
-}
-
-static inline void decode_dxt1_block(const uint8_t* data, uint32_t* outbuf)
-{
+static inline void decode_dxt1_block(const uint8_t *data, uint32_t *outbuf) {
    uint8_t r0, g0, b0, r1, g1, b1;
-    int q0 = *(uint16_t*)(data);
-    int q1 = *(uint16_t*)(data + 2);
-    rgb565(q0, &r0, &g0, &b0);
-    rgb565(q1, &r1, &g1, &b1);
-    uint_fast32_t c[4] = { color(r0, g0, b0, 255), color(r1, g1, b1, 255) };
+    int q0 = *(uint16_t *)(data);
+    int q1 = *(uint16_t *)(data + 2);
+    rgb565_le(q0, &r0, &g0, &b0);
+    rgb565_le(q1, &r1, &g1, &b1);
+    uint_fast32_t c[4] = {color(r0, g0, b0, 255), color(r1, g1, b1, 255)};
    if (q0 > q1) {
        c[2] = color((r0 * 2 + r1) / 3, (g0 * 2 + g1) / 3, (b0 * 2 + b1) / 3, 255);
        c[3] = color((r0 + r1 * 2) / 3, (g0 + g1 * 2) / 3, (b0 + b1 * 2) / 3, 255);
@@ -40,37 +18,27 @@ static inline void decode_dxt1_block(const uint8_t* data, uint32_t* outbuf)
        c[2] = color((r0 + r1) / 2, (g0 + g1) / 2, (b0 + b1) / 2, 255);
        c[3] = color(0, 0, 0, 255);
    }
-#if BYTE_ORDER == LITTLE_ENDIAN
-    uint_fast32_t d = *(uint32_t*)(data + 4);
-#else
-    uint_fast32_t d = data[4] | data[5] << 8 | data[6] << 16 | data[7] << 24;
-#endif
+    uint_fast32_t d = lton32(*(uint32_t *)(data + 4));
    for (int i = 0; i < 16; i++, d >>= 2)
        outbuf[i] = c[d & 3];
 }

-void decode_dxt1(const uint8_t* data, const int w, const int h, uint32_t* image)
-{
-    int num_blocks_x = (w + 3) / 4;
-    int num_blocks_y = (h + 3) / 4;
-    int copy_length_last = (w + 3) % 4 + 1;
-    uint32_t buf[16];
-    uint32_t* buf_end = buf + 16;
-    const uint8_t* d = data;
-    for (int t = 0; t < num_blocks_y; t++) {
-        for (int s = 0; s < num_blocks_x; s++, d += 8) {
-            decode_dxt1_block(d, buf);
-            int copy_length = (s < num_blocks_x - 1 ? 4 : copy_length_last) * 4;
-            uint32_t* b = buf;
-            for (int y = h - t * 4 - 1; b < buf_end && y >= 0; b += 4, y--)
-                memcpy(image + y * w + s * 4, b, copy_length);
+int decode_dxt1(const uint8_t *data, const long w, const long h, uint32_t *image) {
+    long num_blocks_x = (w + 3) / 4;
+    long num_blocks_y = (h + 3) / 4;
+    uint32_t buffer[16];
+    const uint8_t *d = data;
+    for (long by = 0; by < num_blocks_y; by++) {
+        for (long bx = 0; bx < num_blocks_x; bx++, d += 8) {
+            decode_dxt1_block(d, buffer);
+            copy_block_buffer(bx, by, w, h, 4, 4, buffer, image);
        }
    }
+    return 1;
 }

-static inline void decode_dxt5_block(const uint8_t* data, uint32_t* outbuf)
-{
-    uint_fast32_t a[8] = { data[0], data[1] };
+static inline void decode_dxt5_block(const uint8_t *data, uint32_t *outbuf) {
+    uint_fast32_t a[8] = {data[0], data[1]};
    if (a[0] > a[1]) {
        a[2] = (a[0] * 6 + a[1]) / 7;
        a[3] = (a[0] * 5 + a[1] * 2) / 7;
@@ -87,32 +55,23 @@ static inline void decode_dxt5_block(const uint8_t* data, uint32_t* outbuf)
        a[7] = 255;
    }
    for (int i = 0; i < 8; i++)
-        a[i] = color(255, 255, 255, a[i]);
+        a[i] = alpha_mask(a[i]);
    decode_dxt1_block(data + 8, outbuf);
-#if BYTE_ORDER == LITTLE_ENDIAN
-    uint_fast64_t d = *(uint64_t*)data >> 16;
-#else
-    uint_fast64_t d = data[2] | data[3] << 8 | data[4] << 16 | data[5] << 24 | data[6] << 32 | data[7] << 40;
-#endif
+    uint_fast64_t d = lton64(*(uint64_t *)data) >> 16;
    for (int i = 0; i < 16; i++, d >>= 3)
        outbuf[i] &= a[d & 7];
 }

-void decode_dxt5(const uint8_t* data, const int w, const int h, uint32_t* image)
-{
-    int num_blocks_x = (w + 3) / 4;
-    int num_blocks_y = (h + 3) / 4;
-    int copy_length_last = (w + 3) % 4 + 1;
-    uint32_t buf[16];
-    uint32_t *buf_end = buf + 16;
-    const uint8_t* d = data;
-    for (int t = 0; t < num_blocks_y; t++) {
-        for (int s = 0; s < num_blocks_x; s++, d += 16) {
-            decode_dxt5_block(d, buf);
-            int copy_length = (s < num_blocks_x - 1 ? 4 : copy_length_last) * 4;
-            uint32_t *b = buf;
-            for (int y = h - t * 4 - 1; b < buf_end && y >= 0; b += 4, y--)
-                memcpy(image + y * w + s * 4, b, copy_length);
+int decode_dxt5(const uint8_t *data, const long w, const long h, uint32_t *image) {
+    long num_blocks_x = (w + 3) / 4;
+    long num_blocks_y = (h + 3) / 4;
+    uint32_t buffer[16];
+    const uint8_t *d = data;
+    for (long by = 0; by < num_blocks_y; by++) {
+        for (long bx = 0; bx < num_blocks_x; bx++, d += 16) {
+            decode_dxt5_block(d, buffer);
+            copy_block_buffer(bx, by, w, h, 4, 4, buffer, image);
        }
    }
+    return 1;
 }
@@ -3,7 +3,7 @@

 #include <stdint.h>

-void decode_dxt1(const uint8_t*, const int, const int, uint32_t*);
-void decode_dxt5(const uint8_t*, const int, const int, uint32_t*);
+int decode_dxt1(const uint8_t *, const long, const long, uint32_t *);
+int decode_dxt5(const uint8_t *, const long, const long, uint32_t *);

 #endif /* end of include guard: DXTC_H */
@@ -0,0 +1,180 @@
+/*
+ *
+ * License Information
+ *
+ * endianness.h is derived from https://gist.github.com/jtbr/7a43e6281e6cca353b33ee501421860c
+ * The file is licensed under the MIT License shown below.
+ *
+ *
+ * The MIT License (MIT)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+ * documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+ * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef _ENDIANNESS_H
+#define _ENDIANNESS_H
+
+#include <stdlib.h>
+#include <stdint.h>
+
+/* Detect platform endianness at compile time */
+
+// If boost were available on all platforms, could use this instead to detect endianness
+// #include <boost/predef/endian.h>
+
+// When available, these headers can improve platform endianness detection
+#ifdef __has_include  // C++17, supported as extension to C++11 in clang, GCC 5+, vs2015
+#if __has_include(<endian.h>)
+#include <endian.h>  // gnu libc normally provides, linux
+#elif __has_include(<machine/endian.h>)
+#include <machine/endian.h>  //open bsd, macos
+#elif __has_include(<sys/param.h>)
+#include <sys/param.h>  // mingw, some bsd (not open/macos)
+#elif __has_include(<sys/isadefs.h>)
+#include <sys/isadefs.h>  // solaris
+#endif
+#endif
+
+/*
+ * Define exactly one of __BIG_ENDIAN__ / __LITTLE_ENDIAN__ unless the compiler already did.
+ * The MIPS big-endian macros are spelled _MIPSEB / __MIPSEB / __MIPSEB__ (mirroring the
+ * _MIPSEL family tested in the little-endian branch below).
+ */
+#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)
+#if (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) ||                                            \
+  (defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN) || (defined(_BYTE_ORDER) && _BYTE_ORDER == _BIG_ENDIAN) ||  \
+  (defined(BYTE_ORDER) && BYTE_ORDER == BIG_ENDIAN) || (defined(__sun) && defined(__SVR4) && defined(_BIG_ENDIAN)) || \
+  defined(__ARMEB__) || defined(__THUMBEB__) || defined(__AARCH64EB__) || defined(_MIPSEB) || defined(__MIPSEB) ||    \
+  defined(__MIPSEB__) || defined(_M_PPC)
+#define __BIG_ENDIAN__
+#elif (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || /* gcc */                           \
+  (defined(__BYTE_ORDER) && __BYTE_ORDER == __LITTLE_ENDIAN) /* linux header */ ||                                  \
+  (defined(_BYTE_ORDER) && _BYTE_ORDER == _LITTLE_ENDIAN) ||                                                        \
+  (defined(BYTE_ORDER) && BYTE_ORDER == LITTLE_ENDIAN) /* mingw header */ ||                                        \
+  (defined(__sun) && defined(__SVR4) && defined(_LITTLE_ENDIAN)) || /* solaris */                                   \
+  defined(__ARMEL__) || defined(__THUMBEL__) || defined(__AARCH64EL__) || defined(_MIPSEL) || defined(__MIPSEL) ||  \
+  defined(__MIPSEL__) || defined(_M_IX86) || defined(_M_X64) || defined(_M_IA64) || /* msvc for intel processors */ \
+  defined(_M_ARM) /* msvc code on arm executes in little endian mode */
+#define __LITTLE_ENDIAN__
+#endif
+#endif
+
+#ifdef bswap16
+#undef bswap16
+#endif
+#ifdef bswap32
+#undef bswap32
+#endif
+#ifdef bswap64
+#undef bswap64
+#endif
+
+/* Define byte-swap functions, using fast processor-native built-ins where possible */
+// needs to be first because msvc doesn't short-circuit after failing defined(__has_builtin)
+#if defined(_MSC_VER)
+#define bswap16(x) _byteswap_ushort((x))
+#define bswap32(x) _byteswap_ulong((x))
+#define bswap64(x) _byteswap_uint64((x))
+#elif (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+#define bswap16(x) __builtin_bswap16((x))
+#define bswap32(x) __builtin_bswap32((x))
+#define bswap64(x) __builtin_bswap64((x))
+#elif defined(__has_builtin) && __has_builtin(__builtin_bswap64)
+/* for clang; gcc 5 fails on this and && shortcircuit fails; must be after GCC check */
+#define bswap16(x) __builtin_bswap16((x))
+#define bswap32(x) __builtin_bswap32((x))
+#define bswap64(x) __builtin_bswap64((x))
+#else
+/* even in this case, compilers often optimize by using native instructions */
+static inline uint16_t bswap16(uint16_t x) {
+    return (((x >> 8) & 0xffu) | ((x & 0xffu) << 8));
+}
+static inline uint32_t bswap32(uint32_t x) {
+    return (((x & 0xff000000u) >> 24) | ((x & 0x00ff0000u) >> 8) | ((x & 0x0000ff00u) << 8) |
+            ((x & 0x000000ffu) << 24));
+}
+static inline uint64_t bswap64(uint64_t x) {
+    return (((x & 0xff00000000000000ull) >> 56) | ((x & 0x00ff000000000000ull) >> 40) |
+            ((x & 0x0000ff0000000000ull) >> 24) | ((x & 0x000000ff00000000ull) >> 8) |
+            ((x & 0x00000000ff000000ull) << 8) | ((x & 0x0000000000ff0000ull) << 24) |
+            ((x & 0x000000000000ff00ull) << 40) | ((x & 0x00000000000000ffull) << 56));
+}
+#endif
+
+
+/* Defines network - host byte swaps as needed depending upon platform endianness */
+// (note that network order is big endian)
+
+#if defined(__LITTLE_ENDIAN__)
+#define ntoh16(x) bswap16((x))
+#define hton16(x) bswap16((x))
+#define ntoh32(x) bswap32((x))
+#define hton32(x) bswap32((x))
+#define ntoh64(x) bswap64((x))
+#define hton64(x) bswap64((x))
+#define lton16(x) (x)
+#define lton32(x) (x)
+#define lton64(x) (x)
+#define ltonf(x) (x)
+#define ltond(x) (x)
+#define bton16(x) bswap16((x))
+#define bton32(x) bswap32((x))
+#define bton64(x) bswap64((x))
+#define btonf(x) htonf((x))
+#define btond(x) htond((x))
+#elif defined(__BIG_ENDIAN__)
+#define ntoh16(x) (x)
+#define hton16(x) (x)
+#define ntoh32(x) (x)
+#define hton32(x) (x)
+#define ntoh64(x) (x)
+#define hton64(x) (x)
+#define bton16(x) (x)
+#define bton32(x) (x)
+#define bton64(x) (x)
+#define btonf(x) (x)
+#define btond(x) (x)
+#define lton16(x) bswap16((x))
+#define lton32(x) bswap32((x))
+#define lton64(x) bswap64((x))
+#define ltonf(x) htonf((x))
+#define ltond(x) htond((x))
+#else
+#warning "UNKNOWN Platform / endianness; network / host byte swaps not defined."
+#endif
+
+
+//! Convert 32-bit float from host to network byte order
+//! The float's bit pattern is passed through hton32() and reinterpreted as a float again.
+static inline float htonf(float f) {
+#ifdef __cplusplus
+    static_assert(sizeof(float) == sizeof(uint32_t), "Unexpected float format");
+    uint32_t val = hton32(*(reinterpret_cast<const uint32_t *>(&f)));
+    return *(reinterpret_cast<float *>(&val));
+#else
+    /* Pun through a union: reading a float object via a uint32_t pointer (and back) breaks the
+     * strict-aliasing rule, whereas reading a different union member is well-defined in C. */
+    union {
+        float f;
+        uint32_t u;
+    } bits;
+    bits.f = f;
+    bits.u = hton32(bits.u);
+    return bits.f;
+#endif
+}
+#define ntohf(x) htonf((x))
+
+//! Convert 64-bit double from host to network byte order
+//! The double's bit pattern is passed through hton64() and reinterpreted as a double again.
+static inline double htond(double f) {
+#ifdef __cplusplus
+    static_assert(sizeof(double) == sizeof(uint64_t), "Unexpected double format");
+    uint64_t val = hton64(*(reinterpret_cast<const uint64_t *>(&f)));
+    return *(reinterpret_cast<double *>(&val));
+#else
+    /* Pun through a union: reading a double object via a uint64_t pointer (and back) breaks the
+     * strict-aliasing rule, whereas reading a different union member is well-defined in C. */
+    union {
+        double d;
+        uint64_t u;
+    } bits;
+    bits.d = f;
+    bits.u = hton64(bits.u);
+    return bits.d;
+#endif
+}
+#define ntohd(x) htond((x))
+
+#endif  //_ENDIANNESS_H
@@ -1,75 +1,45 @@
 #include "etc.h"
-#include "common.h"
 #include <stdint.h>
 #include <string.h>
+#include "color.h"

-const uint_fast8_t WriteOrderTable[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
-const uint_fast8_t WriteOrderTableRev[16] = { 15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0 };
-const uint_fast8_t Etc1ModifierTable[8][2] = { { 2, 8 }, { 5, 17 }, { 9, 29 }, { 13, 42 }, { 18, 60 }, { 24, 80 }, { 33, 106 }, { 47, 183 } };
+const uint_fast8_t WriteOrderTable[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15};
+const uint_fast8_t WriteOrderTableRev[16] = {15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0};
+const uint_fast8_t Etc1ModifierTable[8][2] = {{2, 8},   {5, 17},  {9, 29},   {13, 42},
+                                              {18, 60}, {24, 80}, {33, 106}, {47, 183}};
 const uint_fast8_t Etc2aModifierTable[2][8][2] = {
-    { { 0, 8 }, { 0, 17 }, { 0, 29 }, { 0, 42 }, { 0, 60 }, { 0, 80 }, { 0, 106 }, { 0, 183 } },
-    { { 2, 8 }, { 5, 17 }, { 9, 29 }, { 13, 42 }, { 18, 60 }, { 24, 80 }, { 33, 106 }, { 47, 183 } }
-};
-const uint_fast8_t Etc1SubblockTable[2][16] = { { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }, { 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1 } };
-const uint_fast8_t Etc2DistanceTable[8] = { 3, 6, 11, 16, 23, 32, 41, 64 };
+  {{0, 8}, {0, 17}, {0, 29}, {0, 42}, {0, 60}, {0, 80}, {0, 106}, {0, 183}},
+  {{2, 8}, {5, 17}, {9, 29}, {13, 42}, {18, 60}, {24, 80}, {33, 106}, {47, 183}}};
+const uint_fast8_t Etc1SubblockTable[2][16] = {{0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1},
+                                               {0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1}};
+const uint_fast8_t Etc2DistanceTable[8] = {3, 6, 11, 16, 23, 32, 41, 64};
 const int_fast8_t Etc2AlphaModTable[16][8] = {
-    { -3, -6, -9, -15, 2, 5, 8, 14 },
-    { -3, -7, -10, -13, 2, 6, 9, 12 },
-    { -2, -5, -8, -13, 1, 4, 7, 12 },
-    { -2, -4, -6, -13, 1, 3, 5, 12 },
-    { -3, -6, -8, -12, 2, 5, 7, 11 },
-    { -3, -7, -9, -11, 2, 6, 8, 10 },
-    { -4, -7, -8, -11, 3, 6, 7, 10 },
-    { -3, -5, -8, -11, 2, 4, 7, 10 },
-    { -2, -6, -8, -10, 1, 5, 7, 9 },
-    { -2, -5, -8, -10, 1, 4, 7, 9 },
-    { -2, -4, -8, -10, 1, 3, 7, 9 },
-    { -2, -5, -7, -10, 1, 4, 6, 9 },
-    { -3, -4, -7, -10, 2, 3, 6, 9 },
-    { -1, -2, -3, -10, 0, 1, 2, 9 },
-    { -4, -6, -8, -9, 3, 5, 7, 8 },
-    { -3, -5, -7, -9, 2, 4, 6, 8 }
-};
+  {-3, -6, -9, -15, 2, 5, 8, 14}, {-3, -7, -10, -13, 2, 6, 9, 12}, {-2, -5, -8, -13, 1, 4, 7, 12},
+  {-2, -4, -6, -13, 1, 3, 5, 12}, {-3, -6, -8, -12, 2, 5, 7, 11},  {-3, -7, -9, -11, 2, 6, 8, 10},
+  {-4, -7, -8, -11, 3, 6, 7, 10}, {-3, -5, -8, -11, 2, 4, 7, 10},  {-2, -6, -8, -10, 1, 5, 7, 9},
+  {-2, -5, -8, -10, 1, 4, 7, 9},  {-2, -4, -8, -10, 1, 3, 7, 9},   {-2, -5, -7, -10, 1, 4, 6, 9},
+  {-3, -4, -7, -10, 2, 3, 6, 9},  {-1, -2, -3, -10, 0, 1, 2, 9},   {-4, -6, -8, -9, 3, 5, 7, 8},
+  {-3, -5, -7, -9, 2, 4, 6, 8}};

-#if BYTE_ORDER == LITTLE_ENDIAN
-static const uint_fast32_t TRANSPARENT_MASK = 0x00ffffff;
-#else
-static const uint_fast32_t TRANSPARENT_MASK = 0xffffff00;
-#endif
-
-static inline uint_fast32_t color(uint_fast8_t r, uint_fast8_t g, uint_fast8_t b, uint_fast8_t a)
-{
-#if BYTE_ORDER == LITTLE_ENDIAN
-    return r | g << 8 | b << 16 | a << 24;
-#else
-    return a | b << 8 | g << 16 | r << 24;
-#endif
-}
-
-static inline uint_fast8_t clamp(const int n)
-{
+static inline uint_fast8_t clamp(const int n) {
    return n < 0 ? 0 : n > 255 ? 255 : n;
 }

-static inline uint32_t applicate_color(uint_fast8_t c[3], int_fast16_t m)
-{
+static inline uint32_t applicate_color(uint_fast8_t c[3], int_fast16_t m) {
    return color(clamp(c[0] + m), clamp(c[1] + m), clamp(c[2] + m), 255);
 }

-static inline uint32_t applicate_color_alpha(uint_fast8_t c[3], int_fast16_t m, int transparent)
-{
+static inline uint32_t applicate_color_alpha(uint_fast8_t c[3], int_fast16_t m, int transparent) {
    return color(clamp(c[0] + m), clamp(c[1] + m), clamp(c[2] + m), transparent ? 0 : 255);
 }

-static inline uint32_t applicate_color_raw(uint_fast8_t c[3])
-{
+static inline uint32_t applicate_color_raw(uint_fast8_t c[3]) {
    return color(c[0], c[1], c[2], 255);
 }

-static inline void decode_etc1_block(const uint8_t* data, uint32_t* outbuf)
-{
-    const uint_fast8_t code[2] = { data[3] >> 5, data[3] >> 2 & 7 }; // Table codewords
-    const uint_fast8_t* table = Etc1SubblockTable[data[3] & 1];
+static void decode_etc1_block(const uint8_t *data, uint32_t *outbuf) {
+    const uint_fast8_t code[2] = {data[3] >> 5, data[3] >> 2 & 7};  // Table codewords
+    const uint_fast8_t *table = Etc1SubblockTable[data[3] & 1];
    uint_fast8_t c[2][3];
    if (data[3] & 2) {
        // diff bit == 1
@@ -95,8 +65,8 @@ static inline void decode_etc1_block(const uint8_t* data, uint32_t* outbuf)
        c[1][2] = (data[2] & 0x0f) | data[2] << 4;
    }

-    uint_fast16_t j = data[6] << 8 | data[7]; // less significant pixel index bits
-    uint_fast16_t k = data[4] << 8 | data[5]; // more significant pixel index bits
+    uint_fast16_t j = data[6] << 8 | data[7];  // less significant pixel index bits
+    uint_fast16_t k = data[4] << 8 | data[5];  // more significant pixel index bits
    for (int i = 0; i < 16; i++, j >>= 1, k >>= 1) {
        uint_fast8_t s = table[i];
        uint_fast8_t m = Etc1ModifierTable[code[s]][j & 1];
@@ -104,29 +74,9 @@ static inline void decode_etc1_block(const uint8_t* data, uint32_t* outbuf)
    }
 }

-void decode_etc1(const void* data, const int w, const int h, uint32_t* image)
-{
-    int num_blocks_x = (w + 3) / 4;
-    int num_blocks_y = (h + 3) / 4;
-    int copy_length_last = (w + 3) % 4 + 1;
-    uint32_t buf[16];
-    uint32_t* buf_end = buf + 16;
-    const uint8_t* d = (uint8_t*)data;
-    for (int by = 0; by < num_blocks_y; by++) {
-        for (int bx = 0, x = 0; bx < num_blocks_x; bx++, d += 8, x += 4) {
-            decode_etc1_block(d, buf);
-            int copy_length = (bx < num_blocks_x - 1 ? 4 : copy_length_last) * 4;
-            uint32_t* b = buf;
-            for (int y = h - 1 - by * 4; b < buf_end && y >= 0; y--, b += 4)
-                memcpy(image + y * w + x, b, copy_length);
-        }
-    }
-}
-
-static inline void decode_etc2_block(const uint8_t* data, uint32_t* outbuf)
-{
-    uint_fast16_t j = data[6] << 8 | data[7]; // 15 -> 0
-    uint_fast32_t k = data[4] << 8 | data[5]; // 31 -> 16
+static void decode_etc2_block(const uint8_t *data, uint32_t *outbuf) {
+    uint_fast16_t j = data[6] << 8 | data[7];  // 15 -> 0
+    uint_fast32_t k = data[4] << 8 | data[5];  // 31 -> 16
    uint_fast8_t c[3][3] = {};

    if (data[3] & 2) {
@@ -146,7 +96,8 @@ static inline void decode_etc2_block(const uint8_t* data, uint32_t* outbuf)
            c[1][1] = (data[2] & 0x0f) | data[2] << 4;
            c[1][2] = (data[3] & 0xf0) | data[3] >> 4;
            const uint_fast8_t d = Etc2DistanceTable[(data[3] >> 1 & 6) | (data[3] & 1)];
-            uint_fast32_t color_set[4] = { applicate_color_raw(c[0]), applicate_color(c[1], d), applicate_color_raw(c[1]), applicate_color(c[1], -d) };
+            uint_fast32_t color_set[4] = {applicate_color_raw(c[0]), applicate_color(c[1], d),
+                                          applicate_color_raw(c[1]), applicate_color(c[1], -d)};
            k <<= 1;
            for (int i = 0; i < 16; i++, j >>= 1, k >>= 1)
                outbuf[WriteOrderTable[i]] = color_set[(k & 2) | (j & 1)];
@@ -162,10 +113,12 @@ static inline void decode_etc2_block(const uint8_t* data, uint32_t* outbuf)
            c[1][1] |= c[1][1] >> 4;
            c[1][2] = (data[3] << 1 & 0xf0) | (data[3] >> 3 & 0xf);
            uint_fast8_t d = (data[3] & 4) | (data[3] << 1 & 2);
-            if (c[0][0] > c[1][0] || (c[0][0] == c[1][0] && (c[0][1] > c[1][1] || (c[0][1] == c[1][1] && c[0][2] >= c[1][2]))))
+            if (c[0][0] > c[1][0] ||
+                (c[0][0] == c[1][0] && (c[0][1] > c[1][1] || (c[0][1] == c[1][1] && c[0][2] >= c[1][2]))))
                ++d;
            d = Etc2DistanceTable[d];
-            uint_fast32_t color_set[4] = { applicate_color(c[0], d), applicate_color(c[0], -d), applicate_color(c[1], d), applicate_color(c[1], -d) };
+            uint_fast32_t color_set[4] = {applicate_color(c[0], d), applicate_color(c[0], -d), applicate_color(c[1], d),
+                                          applicate_color(c[1], -d)};
            k <<= 1;
            for (int i = 0; i < 16; i++, j >>= 1, k >>= 1)
                outbuf[WriteOrderTable[i]] = color_set[(k & 2) | (j & 1)];
@@ -192,8 +145,8 @@ static inline void decode_etc2_block(const uint8_t* data, uint32_t* outbuf)
            }
        } else {
            // differential
-            const uint_fast8_t code[2] = { data[3] >> 5, data[3] >> 2 & 7 };
-            const uint_fast8_t* table = Etc1SubblockTable[data[3] & 1];
+            const uint_fast8_t code[2] = {data[3] >> 5, data[3] >> 2 & 7};
+            const uint_fast8_t *table = Etc1SubblockTable[data[3] & 1];
            c[0][0] = r | r >> 5;
            c[0][1] = g | g >> 5;
            c[0][2] = b | b >> 5;
@@ -211,8 +164,8 @@ static inline void decode_etc2_block(const uint8_t* data, uint32_t* outbuf)
        }
    } else {
        // individual (diff bit == 0)
-        const uint_fast8_t code[2] = { data[3] >> 5, data[3] >> 2 & 7 };
-        const uint_fast8_t* table = Etc1SubblockTable[data[3] & 1];
+        const uint_fast8_t code[2] = {data[3] >> 5, data[3] >> 2 & 7};
+        const uint_fast8_t *table = Etc1SubblockTable[data[3] & 1];
        c[0][0] = (data[0] & 0xf0) | data[0] >> 4;
        c[1][0] = (data[0] & 0x0f) | data[0] << 4;
        c[0][1] = (data[1] & 0xf0) | data[1] >> 4;
@@ -227,10 +180,9 @@ static inline void decode_etc2_block(const uint8_t* data, uint32_t* outbuf)
    }
 }

-static inline void decode_etc2a1_block(const uint8_t* data, uint32_t* outbuf)
-{
-    uint_fast16_t j = data[6] << 8 | data[7]; // 15 -> 0
-    uint_fast32_t k = data[4] << 8 | data[5]; // 31 -> 16
+static void decode_etc2a1_block(const uint8_t *data, uint32_t *outbuf) {
+    uint_fast16_t j = data[6] << 8 | data[7];  // 15 -> 0
+    uint_fast32_t k = data[4] << 8 | data[5];  // 31 -> 16
    uint_fast8_t c[3][3] = {};

    int obaq = data[3] >> 1 & 1;
@@ -251,7 +203,8 @@ static inline void decode_etc2a1_block(const uint8_t* data, uint32_t* outbuf)
        c[1][1] = (data[2] & 0x0f) | data[2] << 4;
        c[1][2] = (data[3] & 0xf0) | data[3] >> 4;
        const uint_fast8_t d = Etc2DistanceTable[(data[3] >> 1 & 6) | (data[3] & 1)];
-        uint_fast32_t color_set[4] = { applicate_color_raw(c[0]), applicate_color(c[1], d), applicate_color_raw(c[1]), applicate_color(c[1], -d) };
+        uint_fast32_t color_set[4] = {applicate_color_raw(c[0]), applicate_color(c[1], d), applicate_color_raw(c[1]),
+                                      applicate_color(c[1], -d)};
        k <<= 1;
        for (int i = 0; i < 16; i++, j >>= 1, k >>= 1) {
            int index = (k & 2) | (j & 1);
@@ -271,10 +224,12 @@ static inline void decode_etc2a1_block(const uint8_t* data, uint32_t* outbuf)
        c[1][1] |= c[1][1] >> 4;
        c[1][2] = (data[3] << 1 & 0xf0) | (data[3] >> 3 & 0xf);
        uint_fast8_t d = (data[3] & 4) | (data[3] << 1 & 2);
-        if (c[0][0] > c[1][0] || (c[0][0] == c[1][0] && (c[0][1] > c[1][1] || (c[0][1] == c[1][1] && c[0][2] >= c[1][2]))))
+        if (c[0][0] > c[1][0] ||
+            (c[0][0] == c[1][0] && (c[0][1] > c[1][1] || (c[0][1] == c[1][1] && c[0][2] >= c[1][2]))))
            ++d;
        d = Etc2DistanceTable[d];
-        uint_fast32_t color_set[4] = { applicate_color(c[0], d), applicate_color(c[0], -d), applicate_color(c[1], d), applicate_color(c[1], -d) };
+        uint_fast32_t color_set[4] = {applicate_color(c[0], d), applicate_color(c[0], -d), applicate_color(c[1], d),
+                                      applicate_color(c[1], -d)};
        k <<= 1;
        for (int i = 0; i < 16; i++, j >>= 1, k >>= 1) {
            int index = (k & 2) | (j & 1);
@@ -305,8 +260,8 @@ static inline void decode_etc2a1_block(const uint8_t* data, uint32_t* outbuf)
        }
    } else {
        // differential
-        const uint_fast8_t code[2] = { data[3] >> 5, data[3] >> 2 & 7 };
-        const uint_fast8_t* table = Etc1SubblockTable[data[3] & 1];
+        const uint_fast8_t code[2] = {data[3] >> 5, data[3] >> 2 & 7};
+        const uint_fast8_t *table = Etc1SubblockTable[data[3] & 1];
        c[0][0] = r | r >> 5;
        c[0][1] = g | g >> 5;
        c[0][2] = b | b >> 5;
@@ -324,76 +279,71 @@ static inline void decode_etc2a1_block(const uint8_t* data, uint32_t* outbuf)
    }
 }

-static inline void decode_etc2a8_block(const uint8_t* data, uint32_t* outbuf)
-{
+static void decode_etc2a8_block(const uint8_t *data, uint32_t *outbuf) {
    if (data[1] & 0xf0) {
        // multiplier != 0
        const uint_fast8_t multiplier = data[1] >> 4;
-        const int_fast8_t* table = Etc2AlphaModTable[data[1] & 0xf];
-        uint_fast64_t l = data[7] | (uint_fast16_t)data[6] << 8 | (uint_fast32_t)data[5] << 16 | (uint_fast32_t)data[4] << 24 | (uint_fast64_t)data[3] << 32 | (uint_fast64_t)data[2] << 40;
+        const int_fast8_t *table = Etc2AlphaModTable[data[1] & 0xf];
+        uint_fast64_t l = data[7] | (uint_fast16_t)data[6] << 8 | (uint_fast32_t)data[5] << 16 |
+          (uint_fast32_t)data[4] << 24 | (uint_fast64_t)data[3] << 32 | (uint_fast64_t)data[2] << 40;
        for (int i = 0; i < 16; i++, l >>= 3)
-            ((uint8_t*)(outbuf + WriteOrderTableRev[i]))[3] = clamp(data[0] + multiplier * table[l & 7]);
+            ((uint8_t *)(outbuf + WriteOrderTableRev[i]))[3] = clamp(data[0] + multiplier * table[l & 7]);
    } else {
        // multiplier == 0 (always same as base codeword)
        for (int i = 0; i < 16; i++, outbuf++)
-            ((uint8_t*)outbuf)[3] = data[0];
+            ((uint8_t *)outbuf)[3] = data[0];
    }
 }

-void decode_etc2(const void* data, const int w, const int h, uint32_t* image)
-{
-    int num_blocks_x = (w + 3) / 4;
-    int num_blocks_y = (h + 3) / 4;
-    int copy_length_last = (w + 3) % 4 + 1;
-    uint32_t buf[16];
-    uint32_t* buf_end = buf + 16;
-    const uint8_t* d = (uint8_t*)data;
-    for (int by = 0; by < num_blocks_y; by++) {
-        for (int bx = 0, x = 0; bx < num_blocks_x; bx++, d += 8, x += 4) {
-            decode_etc2_block(d, buf);
-            int copy_length = (bx < num_blocks_x - 1 ? 4 : copy_length_last) * 4;
-            uint32_t* b = buf;
-            for (int y = h - by * 4 - 1; b < buf_end && y >= 0; y--, b += 4)
-                memcpy(image + y * w + x, b, copy_length);
+int decode_etc1(const uint8_t *data, const long w, const long h, uint32_t *image) {
+    long num_blocks_x = (w + 3) / 4;
+    long num_blocks_y = (h + 3) / 4;
+    uint32_t buffer[16];
+    for (long by = 0; by < num_blocks_y; by++) {
+        for (long bx = 0; bx < num_blocks_x; bx++, data += 8) {
+            decode_etc1_block(data, buffer);
+            copy_block_buffer(bx, by, w, h, 4, 4, buffer, image);
        }
    }
+    return 1;
 }

-void decode_etc2a1(const void* data, const int w, const int h, uint32_t* image)
-{
-    int num_blocks_x = (w + 3) / 4;
-    int num_blocks_y = (h + 3) / 4;
-    int copy_length_last = (w + 3) % 4 + 1;
-    uint32_t buf[16];
-    uint32_t* buf_end = buf + 16;
-    const uint8_t* d = (uint8_t*)data;
-    for (int by = 0; by < num_blocks_y; by++) {
-        for (int bx = 0, x = 0; bx < num_blocks_x; bx++, d += 8, x += 4) {
-            decode_etc2a1_block(d, buf);
-            int copy_length = (bx < num_blocks_x - 1 ? 4 : copy_length_last) * 4;
-            uint32_t* b = buf;
-            for (int y = h - by * 4 - 1; b < buf_end && y >= 0; y--, b += 4)
-                memcpy(image + y * w + x, b, copy_length);
+int decode_etc2(const uint8_t *data, const long w, const long h, uint32_t *image) {
+    long num_blocks_x = (w + 3) / 4;
+    long num_blocks_y = (h + 3) / 4;
+    uint32_t buffer[16];
+    for (long by = 0; by < num_blocks_y; by++) {
+        for (long bx = 0; bx < num_blocks_x; bx++, data += 8) {
+            decode_etc2_block(data, buffer);
+            copy_block_buffer(bx, by, w, h, 4, 4, buffer, image);
        }
    }
+    return 1;
 }

-void decode_etc2a8(const void* data, const int w, const int h, uint32_t* image)
-{
-    int num_blocks_x = (w + 3) / 4;
-    int num_blocks_y = (h + 3) / 4;
-    int copy_length_last = (w + 3) % 4 + 1;
-    uint32_t buf[16];
-    uint32_t* buf_end = buf + 16;
-    const uint8_t* d = (uint8_t*)data;
-    for (int by = 0; by < num_blocks_y; by++) {
-        for (int bx = 0, x = 0; bx < num_blocks_x; bx++, d += 16, x += 4) {
-            decode_etc2_block(d + 8, buf);
-            decode_etc2a8_block(d, buf);
-            int copy_length = (bx < num_blocks_x - 1 ? 4 : copy_length_last) * 4;
-            uint32_t* b = buf;
-            for (int y = h - by * 4 - 1; b < buf_end && y >= 0; y--, b += 4)
-                memcpy(image + y * w + x, b, copy_length);
+int decode_etc2a1(const uint8_t *data, const long w, const long h, uint32_t *image) {
+    long num_blocks_x = (w + 3) / 4;
+    long num_blocks_y = (h + 3) / 4;
+    uint32_t buffer[16];
+    for (long by = 0; by < num_blocks_y; by++) {
+        for (long bx = 0; bx < num_blocks_x; bx++, data += 8) {
+            decode_etc2a1_block(data, buffer);
+            copy_block_buffer(bx, by, w, h, 4, 4, buffer, image);
        }
    }
+    return 1;
+}
+
+int decode_etc2a8(const uint8_t *data, const long w, const long h, uint32_t *image) {
+    long num_blocks_x = (w + 3) / 4;
+    long num_blocks_y = (h + 3) / 4;
+    uint32_t buffer[16];
+    for (long by = 0; by < num_blocks_y; by++) {
+        for (long bx = 0; bx < num_blocks_x; bx++, data += 16) {
+            decode_etc2_block(data + 8, buffer);
+            decode_etc2a8_block(data, buffer);
+            copy_block_buffer(bx, by, w, h, 4, 4, buffer, image);
+        }
+    }
+    return 1;
 }
@@ -3,9 +3,9 @@

 #include <stdint.h>

-void decode_etc1(const void*, const int, const int, uint32_t*);
-void decode_etc2(const void*, const int, const int, uint32_t*);
-void decode_etc2a1(const void*, const int, const int, uint32_t*);
-void decode_etc2a8(const void*, const int, const int, uint32_t*);
+int decode_etc1(const uint8_t *, const long, const long, uint32_t *);
+int decode_etc2(const uint8_t *, const long, const long, uint32_t *);
+int decode_etc2a1(const uint8_t *, const long, const long, uint32_t *);
+int decode_etc2a8(const uint8_t *, const long, const long, uint32_t *);

 #endif /* end of include guard: ETC_H */
@@ -6,35 +6,31 @@

 #endif /* FP16_H */

-/**
+/*
 *
 * License Information
 *
 * FP16 library is derived from https://github.com/Maratyszcza/FP16.
 * The library is licensed under the MIT License shown below.
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Facebook Inc.
-Copyright (c) 2017 Georgia Institute of Technology
-Copyright 2019 Google LLC
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files (the "Software"), to deal in
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
-of the Software, and to permit persons to whom the Software is furnished to do
-so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
- **/
+ *
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2017 Facebook Inc.
+ * Copyright (c) 2017 Georgia Institute of Technology
+ * Copyright 2019 Google LLC
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+ * documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+ * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
@@ -1,79 +1,174 @@
+#include <ruby.h>
+#include <stdint.h>
+#include <stdlib.h>
 #include "astc.h"
 #include "dxtc.h"
 #include "etc.h"
 #include "pvrtc.h"
 #include "rgb.h"
-#include <ruby.h>
-#include <stdint.h>
-#include <stdlib.h>
+
+/*
+ * Text used by DECODE_CHECK as the exception message; NULL means "nothing recorded".
+ * NOTE(review): presumably assigned by a decoder before it reports failure --
+ * the writers are not visible in this part of the file.
+ */
+const char *error_msg = NULL;
+
+/*
+ * Evaluate `call` once; if it yields a false value, raise a Ruby RuntimeError
+ * whose message is error_msg (or "unknown internal error" when error_msg is NULL).
+ *
+ * rb_raise() does not return to its caller, so the message pointer is copied
+ * to a local and error_msg is reset *before* raising; resetting it afterwards
+ * would never execute and would leave a stale message for the next failure.
+ * The trailing `return Qnil;` is unreachable in practice and only keeps the
+ * expansion well-formed for compilers that do not know rb_raise() is noreturn;
+ * it also means the macro can only be used inside functions returning VALUE.
+ * The argument is parenthesized so that compound expressions negate as a whole.
+ */
+#define DECODE_CHECK(call)                                                                \
+    if (!(call)) {                                                                        \
+        const char *decode_check_msg = error_msg ? error_msg : "unknown internal error";  \
+        error_msg = NULL;                                                                 \
+        rb_raise(rb_eRuntimeError, "%s", decode_check_msg);                               \
+        return Qnil;                                                                      \
+    }
+
+/*
+ * Verify that the Ruby string `data` holds at least `len * unit` bytes.
+ *
+ * Raises StandardError ("Data size is not enough.") when it does not. Because
+ * rb_raise() does not return, callers in practice only ever see the value 1;
+ * the `return 0` is kept as a fallback.
+ *
+ * The test is `RSTRING_LEN(data) < len * unit` rewritten as a division, so a
+ * very large caller-supplied `len` cannot overflow `long` (undefined behavior
+ * for signed arithmetic). For len <= 0 the product cannot exceed the string
+ * length when unit is non-negative, so the check passes, as it did before.
+ */
+static int check_str_len(VALUE data, long len, long unit) {
+    if (len > 0 && RSTRING_LEN(data) / len < unit) {
+        rb_raise(rb_eStandardError, "Data size is not enough.");
+        return 0;
+    }
+    return 1;
+}
+
+/*
+ * Verify that `data` is large enough for a w x h image stored as bw x bh
+ * blocks of `unit` bytes each (partial blocks at the edges count as whole).
+ *
+ * Raises StandardError when the block size is not positive (which would
+ * otherwise divide by zero) or when the string is too short; returns 1
+ * otherwise. The visible callers pass unit = 8 or 16; the early size test
+ * below relies on unit being at least 1.
+ */
+static int check_str_len_block(VALUE data, long w, long h, long bw, long bh, long unit) {
+    if (bw <= 0 || bh <= 0) {
+        rb_raise(rb_eStandardError, "Block size must be positive.");
+        return 0;
+    }
+    long blocks_x = (w + bw - 1) / bw;
+    long blocks_y = (h + bh - 1) / bh;
+    /*
+     * If there are more blocks than bytes the data is certainly too short.
+     * Testing this with a division first guarantees that blocks_x * blocks_y
+     * below fits in a long instead of overflowing.
+     */
+    if (blocks_x > 0 && blocks_y > 0 && RSTRING_LEN(data) / blocks_x < blocks_y) {
+        rb_raise(rb_eStandardError, "Data size is not enough.");
+        return 0;
+    }
+    return check_str_len(data, blocks_x * blocks_y, unit);
+}
+
+/*
+ * Create a Ruby string whose length is set to n * 3 bytes (3 bytes per pixel).
+ * Nothing is written into the buffer here; the caller is expected to fill it.
+ */
+static VALUE rb_alloc_rgb(long n) {
+    const long byte_len = n * 3;
+    VALUE str = rb_str_buf_new(byte_len);
+    rb_str_set_len(str, byte_len);
+    return str;
+}
+
+/*
+ * Create a Ruby string whose length is set to n * 4 bytes (4 bytes per pixel).
+ * Nothing is written into the buffer here; the caller is expected to fill it.
+ */
+static VALUE rb_alloc_rgba(long n) {
+    const long byte_len = n * 4;
+    VALUE str = rb_str_buf_new(byte_len);
+    rb_str_set_len(str, byte_len);
+    return str;
+}

 /*
 * Decode image from A8 binary
+ * Returned image is not flipped
 *
 * @param [String] rb_data binary to decode
- * @param [Integer] size width * height
+ * @param [Integer] rb_size width * height
 * @return [String] decoded rgb binary
 */
-static VALUE rb_decode_a8(VALUE self, VALUE rb_data, VALUE size)
-{
-    if (RSTRING_LEN(rb_data) < FIX2LONG(size))
-        rb_raise(rb_eStandardError, "Data size is not enough.");
-    VALUE ret = rb_str_buf_new(FIX2LONG(size) * 3);
-    decode_a8((uint8_t*)RSTRING_PTR(rb_data), FIX2INT(size), (uint8_t*)RSTRING_PTR(ret));
-    rb_str_set_len(ret, FIX2LONG(size) * 3);
+static VALUE rb_decode_a8(VALUE self, VALUE rb_data, VALUE rb_size) {
+    long size = FIX2LONG(rb_size);
+    if (!check_str_len(rb_data, size, 1))
+        return Qnil;
+    VALUE ret = rb_alloc_rgb(size);
+    if (!decode_a8((uint8_t *)RSTRING_PTR(rb_data), size, (uint8_t *)RSTRING_PTR(ret)))
+        return Qnil;
    return ret;
 }

 /*
 * Decode image from R8 binary
+ * Returned image is not flipped
 *
 * @param [String] rb_data binary to decode
- * @param [Integer] size width * height
+ * @param [Integer] rb_size width * height
 * @return [String] decoded rgb binary
 */
-static VALUE rb_decode_r8(VALUE self, VALUE rb_data, VALUE size)
-{
-    if (RSTRING_LEN(rb_data) < FIX2LONG(size))
-        rb_raise(rb_eStandardError, "Data size is not enough.");
-    VALUE ret = rb_str_buf_new(FIX2LONG(size) * 3);
-    decode_r8((uint8_t*)RSTRING_PTR(rb_data), FIX2INT(size), (uint8_t*)RSTRING_PTR(ret));
-    rb_str_set_len(ret, FIX2LONG(size) * 3);
+static VALUE rb_decode_r8(VALUE self, VALUE rb_data, VALUE rb_size) {
+    long size = FIX2LONG(rb_size);
+    if (!check_str_len(rb_data, size, 1))
+        return Qnil;
+    VALUE ret = rb_alloc_rgb(size);
+    if (!decode_r8((uint8_t *)RSTRING_PTR(rb_data), size, (uint8_t *)RSTRING_PTR(ret)))
+        return Qnil;
    return ret;
 }

 /*
 * Decode image from R16 binary
+ * Returned image is not flipped
 *
 * @param [String] rb_data binary to decode
- * @param [Integer] size width * height
- * @param [Boolean] big whether input data are big endian
+ * @param [Integer] rb_size width * height
+ * @param [Boolean] rb_big whether input data are big endian
 * @return [String] decoded rgb binary
 */
-static VALUE rb_decode_r16(VALUE self, VALUE rb_data, VALUE size, VALUE big)
-{
-    if (RSTRING_LEN(rb_data) < FIX2LONG(size) * 2)
-        rb_raise(rb_eStandardError, "Data size is not enough.");
-    VALUE ret = rb_str_buf_new(FIX2LONG(size) * 3);
-    decode_r16((uint16_t*)RSTRING_PTR(rb_data), FIX2INT(size), RTEST(big), (uint8_t*)RSTRING_PTR(ret));
-    rb_str_set_len(ret, FIX2LONG(size) * 3);
+static VALUE rb_decode_r16(VALUE self, VALUE rb_data, VALUE rb_size, VALUE rb_big) {
+    long size = FIX2LONG(rb_size);
+    if (!check_str_len(rb_data, size, 2))
+        return Qnil;
+    VALUE ret = rb_alloc_rgb(size);
+    if (!decode_r16((uint8_t *)RSTRING_PTR(rb_data), size, RTEST(rb_big), (uint8_t *)RSTRING_PTR(ret)))
+        return Qnil;
    return ret;
 }

 /*
 * Decode image from RGB565 binary
+ * Returned image is not flipped
 *
 * @param [String] rb_data binary to decode
- * @param [Integer] size width * height
- * @param [Boolean] big whether input data are big endian
+ * @param [Integer] rb_size width * height
+ * @param [Boolean] rb_big whether input data are big endian
 * @return [String] decoded rgb binary
 */
-static VALUE rb_decode_rgb565(VALUE self, VALUE rb_data, VALUE size, VALUE big)
-{
-    if (RSTRING_LEN(rb_data) < FIX2LONG(size) * 2)
-        rb_raise(rb_eStandardError, "Data size is not enough.");
-    VALUE ret = rb_str_buf_new(FIX2LONG(size) * 3);
-    decode_rgb565((uint16_t*)RSTRING_PTR(rb_data), FIX2INT(size), RTEST(big), (uint8_t*)RSTRING_PTR(ret));
-    rb_str_set_len(ret, FIX2LONG(size) * 3);
+static VALUE rb_decode_rgb565(VALUE self, VALUE rb_data, VALUE rb_size, VALUE rb_big) {
+    const long pixel_count = FIX2LONG(rb_size);
+    /* Input must provide 2 bytes per pixel. */
+    if (check_str_len(rb_data, pixel_count, 2)) {
+        VALUE rgb = rb_alloc_rgb(pixel_count);
+        uint16_t *src = (uint16_t *)RSTRING_PTR(rb_data);
+        uint8_t *dst = (uint8_t *)RSTRING_PTR(rgb);
+        /* Hand the result back only when the decoder reports success. */
+        if (decode_rgb565(src, pixel_count, RTEST(rb_big), dst))
+            return rgb;
+    }
+    return Qnil;
+}
+
+/*
+ * Decode image from RHalf binary
+ * Returned image is not flipped
+ *
+ * @param [String] rb_data binary to decode (at least 2 bytes per pixel)
+ * @param [Integer] rb_size width * height
+ * @param [Boolean] rb_big whether input data are big endian
+ * @return [String, nil] decoded rgb binary, or nil if the decoder reports failure
+ */
+static VALUE rb_decode_rhalf(VALUE self, VALUE rb_data, VALUE rb_size, VALUE rb_big) {
+    const long pixel_count = FIX2LONG(rb_size);
+    if (check_str_len(rb_data, pixel_count, 2)) {
+        VALUE rgb = rb_alloc_rgb(pixel_count);
+        uint16_t *src = (uint16_t *)RSTRING_PTR(rb_data);
+        uint8_t *dst = (uint8_t *)RSTRING_PTR(rgb);
+        /* Hand the result back only when the decoder reports success. */
+        if (decode_rhalf(src, pixel_count, RTEST(rb_big), dst))
+            return rgb;
+    }
+    return Qnil;
+}
+
+/*
+ * Decode image from RGHalf binary
+ * Returned image is not flipped
+ *
+ * @param [String] rb_data binary to decode (at least 4 bytes per pixel)
+ * @param [Integer] rb_size width * height
+ * @param [Boolean] rb_big whether input data are big endian
+ * @return [String, nil] decoded rgb binary, or nil if the decoder reports failure
+ */
+static VALUE rb_decode_rghalf(VALUE self, VALUE rb_data, VALUE rb_size, VALUE rb_big) {
+    const long pixel_count = FIX2LONG(rb_size);
+    if (check_str_len(rb_data, pixel_count, 4)) {
+        VALUE rgb = rb_alloc_rgb(pixel_count);
+        uint16_t *src = (uint16_t *)RSTRING_PTR(rb_data);
+        uint8_t *dst = (uint8_t *)RSTRING_PTR(rgb);
+        /* Hand the result back only when the decoder reports success. */
+        if (decode_rghalf(src, pixel_count, RTEST(rb_big), dst))
+            return rgb;
+    }
+    return Qnil;
+}
+
+/*
+ * Decode image from RGBAHalf binary
+ * Returned image is not flipped
+ *
+ * @param [String] rb_data binary to decode
+ * @param [Integer] rb_size width * height
+ * @param [Boolean] rb_big whether input data are big endian
+ * @return [String] decoded rgba binary
+ */
+static VALUE rb_decode_rgbahalf(VALUE self, VALUE rb_data, VALUE rb_size, VALUE rb_big) {
+    long size = FIX2LONG(rb_size);
+    if (!check_str_len(rb_data, size, 8))
+        return Qnil;
+    VALUE ret = rb_alloc_rgba(size);
+    if (!decode_rgbahalf((uint16_t *)RSTRING_PTR(rb_data), size, RTEST(rb_big), (uint8_t *)RSTRING_PTR(ret)))
+        return Qnil;
    return ret;
 }

@@ -81,18 +176,17 @@ static VALUE rb_decode_rgb565(VALUE self, VALUE rb_data, VALUE size, VALUE big)
 * Decode image from ETC1 compressed binary
 *
 * @param [String] rb_data binary to decode
- * @param [Integer] w image width
- * @param [Integer] h image height
+ * @param [Integer] rb_w image width
+ * @param [Integer] rb_h image height
 * @return [String] decoded rgba binary
 */
-static VALUE rb_decode_etc1(VALUE self, VALUE rb_data, VALUE w, VALUE h)
-{
-    if (RSTRING_LEN(rb_data) < ((FIX2LONG(w) + 3) / 4) * ((FIX2LONG(h) + 3) / 4) * 8)
-        rb_raise(rb_eStandardError, "Data size is not enough.");
-    uint32_t* image = (uint32_t*)calloc(FIX2LONG(w) * FIX2LONG(h), sizeof(uint32_t));
-    decode_etc1((uint64_t*)RSTRING_PTR(rb_data), FIX2INT(w), FIX2INT(h), image);
-    VALUE ret = rb_str_new((char*)image, FIX2LONG(w) * FIX2LONG(h) * sizeof(uint32_t));
-    free(image);
+static VALUE rb_decode_etc1(VALUE self, VALUE rb_data, VALUE rb_w, VALUE rb_h) {
+    long w = FIX2LONG(rb_w), h = FIX2LONG(rb_h);
+    if (!check_str_len_block(rb_data, w, h, 4, 4, 8))
+        return Qnil;
+    VALUE ret = rb_alloc_rgba(w * h);
+    if (!decode_etc1((uint8_t *)RSTRING_PTR(rb_data), w, h, (uint32_t *)RSTRING_PTR(ret)))
+        return Qnil;
    return ret;
 }

@@ -104,14 +198,13 @@ static VALUE rb_decode_etc1(VALUE self, VALUE rb_data, VALUE w, VALUE h)
 * @param [Integer] h image height
 * @return [String] decoded rgba binary
 */
-static VALUE rb_decode_etc2(VALUE self, VALUE rb_data, VALUE w, VALUE h)
-{
-    if (RSTRING_LEN(rb_data) < ((FIX2LONG(w) + 3) / 4) * ((FIX2LONG(h) + 3) / 4) * 8)
-        rb_raise(rb_eStandardError, "Data size is not enough.");
-    uint32_t* image = (uint32_t*)calloc(FIX2LONG(w) * FIX2LONG(h), sizeof(uint32_t));
-    decode_etc2((uint64_t*)RSTRING_PTR(rb_data), FIX2INT(w), FIX2INT(h), image);
-    VALUE ret = rb_str_new((char*)image, FIX2LONG(w) * FIX2LONG(h) * sizeof(uint32_t));
-    free(image);
+static VALUE rb_decode_etc2(VALUE self, VALUE rb_data, VALUE rb_w, VALUE rb_h) {
+    long w = FIX2LONG(rb_w), h = FIX2LONG(rb_h);
+    if (!check_str_len_block(rb_data, w, h, 4, 4, 8))
+        return Qnil;
+    VALUE ret = rb_alloc_rgba(w * h);
+    if (!decode_etc2((uint8_t *)RSTRING_PTR(rb_data), w, h, (uint32_t *)RSTRING_PTR(ret)))
+        return Qnil;
    return ret;
 }

@@ -123,14 +216,13 @@ static VALUE rb_decode_etc2(VALUE self, VALUE rb_data, VALUE w, VALUE h)
 * @param [Integer] h image height
 * @return [String] decoded rgba binary
 */
-static VALUE rb_decode_etc2a1(VALUE self, VALUE rb_data, VALUE w, VALUE h)
-{
-    if (RSTRING_LEN(rb_data) < ((FIX2LONG(w) + 3) / 4) * ((FIX2LONG(h) + 3) / 4) * 8)
-        rb_raise(rb_eStandardError, "Data size is not enough.");
-    uint32_t* image = (uint32_t*)calloc(FIX2LONG(w) * FIX2LONG(h), sizeof(uint32_t));
-    decode_etc2a1((uint64_t*)RSTRING_PTR(rb_data), FIX2INT(w), FIX2INT(h), image);
-    VALUE ret = rb_str_new((char*)image, FIX2LONG(w) * FIX2LONG(h) * sizeof(uint32_t));
-    free(image);
+static VALUE rb_decode_etc2a1(VALUE self, VALUE rb_data, VALUE rb_w, VALUE rb_h) {
+    long w = FIX2LONG(rb_w), h = FIX2LONG(rb_h);
+    if (!check_str_len_block(rb_data, w, h, 4, 4, 8))
+        return Qnil;
+    VALUE ret = rb_alloc_rgba(w * h);
+    if (!decode_etc2a1((uint8_t *)RSTRING_PTR(rb_data), w, h, (uint32_t *)RSTRING_PTR(ret)))
+        return Qnil;
    return ret;
 }

@@ -142,14 +234,13 @@ static VALUE rb_decode_etc2a1(VALUE self, VALUE rb_data, VALUE w, VALUE h)
 * @param [Integer] h image height
 * @return [String] decoded rgba binary
 */
-static VALUE rb_decode_etc2a8(VALUE self, VALUE rb_data, VALUE w, VALUE h)
-{
-    if (RSTRING_LEN(rb_data) < ((FIX2LONG(w) + 3) / 4) * ((FIX2LONG(h) + 3) / 4) * 16)
-        rb_raise(rb_eStandardError, "Data size is not enough.");
-    uint32_t* image = (uint32_t*)calloc(FIX2LONG(w) * FIX2LONG(h), sizeof(uint32_t));
-    decode_etc2a8((uint64_t*)RSTRING_PTR(rb_data), FIX2INT(w), FIX2INT(h), image);
-    VALUE ret = rb_str_new((char*)image, FIX2LONG(w) * FIX2LONG(h) * sizeof(uint32_t));
-    free(image);
+static VALUE rb_decode_etc2a8(VALUE self, VALUE rb_data, VALUE rb_w, VALUE rb_h) {
+    long w = FIX2LONG(rb_w), h = FIX2LONG(rb_h);
+    if (!check_str_len_block(rb_data, w, h, 4, 4, 16))
+        return Qnil;
+    VALUE ret = rb_alloc_rgba(w * h);
+    if (!decode_etc2a8((uint8_t *)RSTRING_PTR(rb_data), w, h, (uint32_t *)RSTRING_PTR(ret)))
+        return Qnil;
    return ret;
 }

@@ -157,21 +248,21 @@ static VALUE rb_decode_etc2a8(VALUE self, VALUE rb_data, VALUE w, VALUE h)
 * Decode image from ASTC compressed binary
 *
 * @param [String] rb_data binary to decode
- * @param [Integer] w image width
- * @param [Integer] h image height
- * @param [Integer] bw block width
- * @param [Integer] bh block height
+ * @param [Integer] rb_w image width
+ * @param [Integer] rb_h image height
+ * @param [Integer] rb_bw block width
+ * @param [Integer] rb_bh block height
 * @return [String] decoded rgba binary
 */
-static VALUE rb_decode_astc(VALUE self, VALUE rb_data, VALUE w, VALUE h,
-    VALUE bw, VALUE bh)
-{
-    if (RSTRING_LEN(rb_data) < ((FIX2LONG(w) + FIX2LONG(bw) - 1) / FIX2LONG(bw)) * ((FIX2LONG(h) + FIX2LONG(bh) - 1) / FIX2LONG(bh)) * 16)
-        rb_raise(rb_eStandardError, "Data size is not enough.");
-    uint32_t* image = (uint32_t*)calloc(FIX2LONG(w) * FIX2LONG(h), sizeof(uint32_t));
-    decode_astc((uint8_t*)RSTRING_PTR(rb_data), FIX2INT(w), FIX2INT(h), FIX2INT(bw), FIX2INT(bh), image);
-    VALUE ret = rb_str_new((char*)image, FIX2LONG(w) * FIX2LONG(h) * sizeof(uint32_t));
-    free(image);
+static VALUE rb_decode_astc(VALUE self, VALUE rb_data, VALUE rb_w, VALUE rb_h, VALUE rb_bw, VALUE rb_bh) {
+    long w = FIX2LONG(rb_w);
+    long h = FIX2LONG(rb_h);
+    int bw = FIX2INT(rb_bw);
+    int bh = FIX2INT(rb_bh);
+    if (!check_str_len_block(rb_data, w, h, bw, bh, 16))
+        return Qnil;
+    VALUE ret = rb_alloc_rgba(w * h);
+    DECODE_CHECK(decode_astc((uint8_t *)RSTRING_PTR(rb_data), w, h, bw, bh, (uint32_t *)RSTRING_PTR(ret)));
    return ret;
 }

@@ -179,18 +270,17 @@ static VALUE rb_decode_astc(VALUE self, VALUE rb_data, VALUE w, VALUE h,
 * Decode image from DXT1 compressed binary
 *
 * @param [String] rb_data binary to decode
- * @param [Integer] w image width
- * @param [Integer] h image height
+ * @param [Integer] rb_w image width
+ * @param [Integer] rb_h image height
 * @return [String] decoded rgba binary
 */
-static VALUE rb_decode_dxt1(VALUE self, VALUE rb_data, VALUE w, VALUE h)
-{
-    if (RSTRING_LEN(rb_data) < ((FIX2LONG(w) + 3) / 4) * ((FIX2LONG(h) + 3) / 4) * 8)
-        rb_raise(rb_eStandardError, "Data size is not enough.");
-    uint32_t* image = (uint32_t*)calloc(FIX2LONG(w) * FIX2LONG(h), sizeof(uint32_t));
-    decode_dxt1((uint8_t*)RSTRING_PTR(rb_data), FIX2INT(w), FIX2INT(h), image);
-    VALUE ret = rb_str_new((char*)image, FIX2LONG(w) * FIX2LONG(h) * sizeof(uint32_t));
-    free(image);
+static VALUE rb_decode_dxt1(VALUE self, VALUE rb_data, VALUE rb_w, VALUE rb_h) {
+    long w = FIX2LONG(rb_w);
+    long h = FIX2LONG(rb_h);
+    if (!check_str_len_block(rb_data, w, h, 4, 4, 8))
+        return Qnil;
+    VALUE ret = rb_alloc_rgba(w * h);
+    DECODE_CHECK(decode_dxt1((uint8_t *)RSTRING_PTR(rb_data), w, h, (uint32_t *)RSTRING_PTR(ret)));
    return ret;
 }

@@ -198,77 +288,50 @@ static VALUE rb_decode_dxt1(VALUE self, VALUE rb_data, VALUE w, VALUE h)
 * Decode image from DXT5 compressed binary
 *
 * @param [String] rb_data binary to decode
- * @param [Integer] w image width
- * @param [Integer] h image height
+ * @param [Integer] rb_w image width
+ * @param [Integer] rb_h image height
 * @return [String] decoded rgba binary
 */
-static VALUE rb_decode_dxt5(VALUE self, VALUE rb_data, VALUE w, VALUE h)
-{
-    if (RSTRING_LEN(rb_data) < ((FIX2LONG(w) + 3) / 4) * ((FIX2LONG(h) + 3) / 4) * 16)
-        rb_raise(rb_eStandardError, "Data size is not enough.");
-    uint32_t* image = (uint32_t*)calloc(FIX2LONG(w) * FIX2LONG(h), sizeof(uint32_t));
-    decode_dxt5((uint8_t*)RSTRING_PTR(rb_data), FIX2INT(w), FIX2INT(h), image);
-    VALUE ret = rb_str_new((char*)image, FIX2LONG(w) * FIX2LONG(h) * sizeof(uint32_t));
-    free(image);
+static VALUE rb_decode_dxt5(VALUE self, VALUE rb_data, VALUE rb_w, VALUE rb_h) {
+    long w = FIX2LONG(rb_w);
+    long h = FIX2LONG(rb_h);
+    if (!check_str_len_block(rb_data, w, h, 4, 4, 16))
+        return Qnil;
+    VALUE ret = rb_alloc_rgba(w * h);
+    DECODE_CHECK(decode_dxt5((uint8_t *)RSTRING_PTR(rb_data), w, h, (uint32_t *)RSTRING_PTR(ret)));
    return ret;
 }

 /*
- * Decode image from PVRTC1 4bpp compressed binary
+ * Decode image from PVRTC1 compressed binary
 *
 * @param [String] rb_data binary to decode
- * @param [Integer] w image width
- * @param [Integer] h image height
+ * @param [Integer] rb_w image width
+ * @param [Integer] rb_h image height
+ * @param [Boolean] rb_is2bpp whether 2bpp or not (4bpp)
 * @return [String] decoded rgba binary
 */
-static VALUE rb_decode_pvrtc1_4bpp(VALUE self, VALUE rb_data, VALUE w, VALUE h)
-{
-    if (RSTRING_LEN(rb_data) < ((FIX2LONG(w) + 3) / 4) * ((FIX2LONG(h) + 3) / 4) * 8) {
-        rb_raise(rb_eStandardError, "Data size is not enough.");
+static VALUE rb_decode_pvrtc1(VALUE self, VALUE rb_data, VALUE rb_w, VALUE rb_h, VALUE rb_is2bpp) {
+    int is2bpp = RTEST(rb_is2bpp);
+    long w = FIX2LONG(rb_w);
+    long h = FIX2LONG(rb_h);
+    if (!check_str_len_block(rb_data, w, h, is2bpp ? 8 : 4, 4, 8))
        return Qnil;
-    }
-    size_t buffer_length = FIX2LONG(w) * FIX2LONG(h) * 8;
-    VALUE ret = rb_str_buf_new(buffer_length);
-    if (!decode_pvrtc_4bpp((uint8_t*)RSTRING_PTR(rb_data), FIX2INT(w), FIX2INT(h), (uint32_t*)RSTRING_PTR(ret))) {
-        rb_raise(rb_eStandardError, "internal error");
-        return Qnil;
-    }
-    rb_str_set_len(ret, buffer_length);
+    VALUE ret = rb_alloc_rgba(w * h);
+    DECODE_CHECK(decode_pvrtc((uint8_t *)RSTRING_PTR(rb_data), w, h, (uint32_t *)RSTRING_PTR(ret), is2bpp));
    return ret;
 }

-/*
- * Decode image from PVRTC1 2bpp compressed binary
- *
- * @param [String] rb_data binary to decode
- * @param [Integer] w image width
- * @param [Integer] h image height
- * @return [String] decoded rgba binary
- */
-static VALUE rb_decode_pvrtc1_2bpp(VALUE self, VALUE rb_data, VALUE w, VALUE h)
-{
-    if (RSTRING_LEN(rb_data) < ((FIX2LONG(w) + 7) / 8) * ((FIX2LONG(h) + 3) / 4) * 8) {
-        rb_raise(rb_eStandardError, "Data size is not enough.");
-        return Qnil;
-    }
-    size_t buffer_length = FIX2LONG(w) * FIX2LONG(h) * 8;
-    VALUE ret = rb_str_buf_new(buffer_length);
-    if (!decode_pvrtc_2bpp((uint8_t*)RSTRING_PTR(rb_data), FIX2INT(w), FIX2INT(h), (uint32_t*)RSTRING_PTR(ret))) {
-        rb_raise(rb_eStandardError, "internal error");
-        return Qnil;
-    }
-    rb_str_set_len(ret, buffer_length);
-    return ret;
-}
-
-void Init_native()
-{
+void Init_native() {
    VALUE mMikunyan = rb_define_module("Mikunyan");
    VALUE mDecodeHelper = rb_define_module_under(mMikunyan, "DecodeHelper");
    rb_define_module_function(mDecodeHelper, "decode_a8", rb_decode_a8, 2);
    rb_define_module_function(mDecodeHelper, "decode_r8", rb_decode_r8, 2);
    rb_define_module_function(mDecodeHelper, "decode_r16", rb_decode_r16, 3);
    rb_define_module_function(mDecodeHelper, "decode_rgb565", rb_decode_rgb565, 3);
+    rb_define_module_function(mDecodeHelper, "decode_rhalf", rb_decode_rhalf, 3);
+    rb_define_module_function(mDecodeHelper, "decode_rghalf", rb_decode_rghalf, 3);
+    rb_define_module_function(mDecodeHelper, "decode_rgbahalf", rb_decode_rgbahalf, 3);
    rb_define_module_function(mDecodeHelper, "decode_etc1", rb_decode_etc1, 3);
    rb_define_module_function(mDecodeHelper, "decode_etc2", rb_decode_etc2, 3);
    rb_define_module_function(mDecodeHelper, "decode_etc2a1", rb_decode_etc2a1, 3);
@@ -276,6 +339,5 @@ void Init_native()
    rb_define_module_function(mDecodeHelper, "decode_astc", rb_decode_astc, 5);
    rb_define_module_function(mDecodeHelper, "decode_dxt1", rb_decode_dxt1, 3);
    rb_define_module_function(mDecodeHelper, "decode_dxt5", rb_decode_dxt5, 3);
-    rb_define_module_function(mDecodeHelper, "decode_pvrtc1_4bpp", rb_decode_pvrtc1_4bpp, 3);
-    rb_define_module_function(mDecodeHelper, "decode_pvrtc1_2bpp", rb_decode_pvrtc1_2bpp, 3);
+    rb_define_module_function(mDecodeHelper, "decode_pvrtc1", rb_decode_pvrtc1, 4);
 }
@@ -1,36 +1,23 @@
 #include "pvrtc.h"
-#include "common.h"
 #include <stdint.h>
 #include <string.h>
+#include "color.h"
+#include "endianness.h"

-#define MORTON_POS(x, y) (morton_table[num_blocks_x * (y) + (x)])
+static const int PVRTC1_STANDARD_WEIGHT[] = {0, 3, 5, 8};
+static const int PVRTC1_PUNCHTHROUGH_WEIGHT[] = {0, 4, 4, 8};

-static inline uint32_t color(uint8_t r, uint8_t g, uint8_t b, uint8_t a) {
-#if BYTE_ORDER == LITTLE_ENDIAN
-    return r | g << 8 | b << 16 | a << 24;
-#else
-    return a | b << 8 | g << 16 | r << 24;
-#endif
-}
-
-static inline int morton_index(const int x, const int y, const int numblocks_x, const int numblocks_y) {
-    const int min_dim = numblocks_x <= numblocks_y ? numblocks_x : numblocks_y;
-    int offset = 0, shift = 0;
-    for (int mask = 1; mask < min_dim; mask <<= 1, shift++) {
+static inline long morton_index(const long x, const long y, const long min_dim) {
+    long offset = 0, shift = 0;
+    for (long mask = 1; mask < min_dim; mask <<= 1, shift++)
        offset |= (((y & mask) | ((x & mask) << 1))) << shift;
-    }
    offset |= ((x | y) >> shift) << (shift * 2);
    return offset;
 }

 static void get_texel_colors(const uint8_t *data, PVRTCTexelInfo *info) {
-#if BYTE_ORDER == LITTLE_ENDIAN
-    uint16_t ca = *(uint16_t *)(data + 4);
-    uint16_t cb = *(uint16_t *)(data + 6);
-#else
-    uint16_t ca = data[4] | data[5] << 8;
-    uint16_t cb = data[6] | data[7] << 8;
-#endif
+    uint16_t ca = lton16(*(uint16_t *)(data + 4));
+    uint16_t cb = lton16(*(uint16_t *)(data + 6));
    if (ca & 0x8000) {
        info->a.r = ca >> 10 & 0x1f;
        info->a.g = ca >> 5 & 0x1f;
@@ -59,47 +46,19 @@ static void get_texel_weights_4bpp(const uint8_t *data, PVRTCTexelInfo *info) {
    info->punch_through_flag = 0;

    int mod_mode = data[4] & 1;
-#if BYTE_ORDER == LITTLE_ENDIAN
-    uint32_t mod_bits = *(uint32_t *)data;
-#else
-    uint32_t mod_bits = data[0] | data[1] << 8 | data[2] << 16 | data[3] << 24;
-#endif
+    uint32_t mod_bits = lton32(*(uint32_t *)data);

    if (mod_mode) {
        // punch-through
        for (int i = 0; i < 16; i++, mod_bits >>= 2) {
-            switch (mod_bits & 3) {
-            case 0:
-                info->weight[i] = 0;
-                break;
-            case 3:
-                info->weight[i] = 8;
-                break;
-            case 2:
+            info->weight[i] = PVRTC1_PUNCHTHROUGH_WEIGHT[mod_bits & 3];
+            if ((mod_bits & 3) == 2)
                info->punch_through_flag |= 1 << i;
-                // fall through
-            default:
-                info->weight[i] = 4;
-            }
        }
    } else {
        // standard
-        for (int i = 0; i < 16; i++, mod_bits >>= 2) {
-            switch (mod_bits & 3) {
-            case 0:
-                info->weight[i] = 0;
-                break;
-            case 1:
-                info->weight[i] = 3;
-                break;
-            case 2:
-                info->weight[i] = 5;
-                break;
-            case 3:
-                info->weight[i] = 8;
-                break;
-            }
-        }
+        for (int i = 0; i < 16; i++, mod_bits >>= 2)
+            info->weight[i] = PVRTC1_STANDARD_WEIGHT[mod_bits & 3];
    }
 }

@@ -107,11 +66,7 @@ static void get_texel_weights_2bpp(const uint8_t *data, PVRTCTexelInfo *info) {
    info->punch_through_flag = 0;

    int mod_mode = data[4] & 1;
-#if BYTE_ORDER == LITTLE_ENDIAN
-    uint32_t mod_bits = *(uint32_t *)data;
-#else
-    uint32_t mod_bits = data[0] | data[1] << 8 | data[2] << 16 | data[3] << 24;
-#endif
+    uint32_t mod_bits = lton32(*(uint32_t *)data);

    if (mod_mode) {
        // interporated modulation
@@ -123,24 +78,9 @@ static void get_texel_weights_2bpp(const uint8_t *data, PVRTCTexelInfo *info) {
        for (int y = 0, i = 1; y < 4; ++y & 1 ? --i : ++i)
            for (int x = 0; x < 4; x++, i += 2)
                info->weight[i] = fillflag;
-        for (int y = 0, i = 0; y < 4; ++y & 1 ? ++i : --i) {
-            for (int x = 0; x < 4; x++, i += 2, mod_bits >>= 2) {
-                switch (mod_bits & 3) {
-                case 0:
-                    info->weight[i] = 0;
-                    break;
-                case 1:
-                    info->weight[i] = 3;
-                    break;
-                case 2:
-                    info->weight[i] = 5;
-                    break;
-                case 3:
-                    info->weight[i] = 8;
-                    break;
-                }
-            }
-        }
+        for (int y = 0, i = 0; y < 4; ++y & 1 ? ++i : --i)
+            for (int x = 0; x < 4; x++, i += 2, mod_bits >>= 2)
+                info->weight[i] = PVRTC1_STANDARD_WEIGHT[mod_bits & 3];
        // 0 は常に 1bpp
        info->weight[0] = (info->weight[0] + 3) & 8;
        if (data[0] & 1)
@@ -153,7 +93,7 @@ static void get_texel_weights_2bpp(const uint8_t *data, PVRTCTexelInfo *info) {
    }
 }

-static void applicate_color_4bpp(const uint8_t *data, PVRTCTexelInfo *const info[9], uint32_t buf[16]) {
+static void applicate_color_4bpp(const uint8_t *data, PVRTCTexelInfo *const info[9], uint32_t buf[32]) {
    static const int INTERP_WEIGHT[4][3] = {{2, 2, 0}, {1, 3, 0}, {0, 4, 0}, {0, 3, 1}};
    PVRTCTexelColorInt clr_a[16] = {}, clr_b[16] = {};

@@ -195,7 +135,7 @@ static void applicate_color_4bpp(const uint8_t *data, PVRTCTexelInfo *const info
    }
 }

-static void applicate_color_2bpp(const uint8_t *data, PVRTCTexelInfo *info[9], uint32_t buf[32]) {
+static void applicate_color_2bpp(const uint8_t *data, PVRTCTexelInfo *const info[9], uint32_t buf[32]) {
    static const int INTERP_WEIGHT_X[8][3] = {{4, 4, 0}, {3, 5, 0}, {2, 6, 0}, {1, 7, 0},
                                              {0, 8, 0}, {0, 7, 1}, {0, 6, 2}, {0, 5, 3}};
    static const int INTERP_WEIGHT_Y[4][3] = {{2, 2, 0}, {1, 3, 0}, {0, 4, 0}, {0, 3, 1}};
@@ -262,108 +202,57 @@ static void applicate_color_2bpp(const uint8_t *data, PVRTCTexelInfo *info[9], u
    }
 }

-int decode_pvrtc_4bpp(const uint8_t *data, const int w, const int h, uint32_t *image) {
-    int num_blocks_x = (w + 3) / 4;
-    int num_blocks_y = (h + 3) / 4;
-    int num_blocks = num_blocks_x * num_blocks_y;
-    int copy_length_last = (w + 3) % 4 + 1;
+int decode_pvrtc(const uint8_t *data, const long w, const long h, uint32_t *image, const int is2bpp) {
+    long bw = is2bpp ? 8 : 4;
+    long num_blocks_x = is2bpp ? (w + 7) / 8 : (w + 3) / 4;
+    long num_blocks_y = (h + 3) / 4;
+    long num_blocks = num_blocks_x * num_blocks_y;
+    long min_num_blocks = num_blocks_x <= num_blocks_y ? num_blocks_x : num_blocks_y;

-    int *morton_table = (int *)malloc(sizeof(int) * num_blocks);
-    if (morton_table == NULL)
+    if ((num_blocks_x & (num_blocks_x - 1)) || (num_blocks_y & (num_blocks_y - 1))) {
+        extern const char *error_msg;
+        error_msg = "the number of blocks of each side must be a power of 2";
        return 0;
+    }
+
    PVRTCTexelInfo *texel_info = (PVRTCTexelInfo *)malloc(sizeof(PVRTCTexelInfo) * num_blocks);
    if (texel_info == NULL) {
-        free(morton_table);
+        extern const char *error_msg;
+        error_msg = "memory allocation failed";
        return 0;
    }

-    for (int y = 0; y < num_blocks_y; y++)
-        for (int x = 0; x < num_blocks_x; x++)
-            MORTON_POS(x, y) = morton_index(x, y, num_blocks_x, num_blocks_y);
+    void (*get_texel_weights_func)(const uint8_t *, PVRTCTexelInfo *) =
+      is2bpp ? get_texel_weights_2bpp : get_texel_weights_4bpp;
+    void (*applicate_color_func)(const uint8_t *, PVRTCTexelInfo *const[9], uint32_t[32]) =
+      is2bpp ? applicate_color_2bpp : applicate_color_4bpp;

    const uint8_t *d = data;
-    for (int i = 0; i < num_blocks; i++, d += 8) {
+    for (long i = 0; i < num_blocks; i++, d += 8) {
        get_texel_colors(d, &texel_info[i]);
-        get_texel_weights_4bpp(d, &texel_info[i]);
-    }
-
-    uint32_t buffer[16];
-    uint32_t *buffer_end = buffer + 16;
-    PVRTCTexelInfo *local_info[9];
-    int pos_x[3], pos_y[3];
-    for (int by = 0; by < num_blocks_y; by++) {
-        pos_y[0] = by == 0 ? num_blocks_y - 1 : by - 1;
-        pos_y[1] = by;
-        pos_y[2] = by == num_blocks_y - 1 ? 0 : by + 1;
-        for (int bx = 0, x = 0; bx < num_blocks_x; bx++, x += 4) {
-            pos_x[0] = bx == 0 ? num_blocks_x - 1 : bx - 1;
-            pos_x[1] = bx;
-            pos_x[2] = bx == num_blocks_x - 1 ? 0 : bx + 1;
-            for (int cy = 0, c = 0; cy < 3; cy++)
-                for (int cx = 0; cx < 3; cx++, c++)
-                    local_info[c] = &texel_info[MORTON_POS(pos_x[cx], pos_y[cy])];
-            applicate_color_4bpp(data + MORTON_POS(bx, by) * 8, local_info, buffer);
-            int copy_length = (bx < num_blocks_x - 1 ? 4 : copy_length_last) * 4;
-            uint32_t *b = buffer;
-            for (int y = h - by * 4 - 1; b < buffer_end && y >= 0; y--, b += 4)
-                memcpy(image + y * w + x, b, copy_length);
-        }
-    }
-
-    free(morton_table);
-    free(texel_info);
-    return 1;
-}
-
-int decode_pvrtc_2bpp(const uint8_t *data, const int w, const int h, uint32_t *image) {
-    int num_blocks_x = (w + 7) / 8;
-    int num_blocks_y = (h + 3) / 4;
-    int num_blocks = num_blocks_x * num_blocks_y;
-    int copy_length_last = (w + 7) % 8 + 1;
-
-    int *morton_table = (int *)malloc(sizeof(int) * num_blocks);
-    if (morton_table == NULL)
-        return 0;
-    PVRTCTexelInfo *texel_info = (PVRTCTexelInfo *)malloc(sizeof(PVRTCTexelInfo) * num_blocks);
-    if (texel_info == NULL) {
-        free(morton_table);
-        return 0;
-    }
-
-    for (int y = 0; y < num_blocks_y; y++)
-        for (int x = 0; x < num_blocks_x; x++)
-            MORTON_POS(x, y) = morton_index(x, y, num_blocks_x, num_blocks_y);
-
-    const uint8_t *d = data;
-    for (int i = 0; i < num_blocks; i++, d += 8) {
-        get_texel_colors(d, &texel_info[i]);
-        get_texel_weights_2bpp(d, &texel_info[i]);
+        get_texel_weights_func(d, &texel_info[i]);
    }

    uint32_t buffer[32];
-    uint32_t *buffer_end = buffer + 32;
    PVRTCTexelInfo *local_info[9];
-    int pos_x[3], pos_y[3];
-    for (int by = 0; by < num_blocks_y; by++) {
+    long pos_x[3], pos_y[3];
+
+    for (long by = 0; by < num_blocks_y; by++) {
        pos_y[0] = by == 0 ? num_blocks_y - 1 : by - 1;
        pos_y[1] = by;
        pos_y[2] = by == num_blocks_y - 1 ? 0 : by + 1;
-        for (int bx = 0, x = 0; bx < num_blocks_x; bx++, x += 8) {
+        for (long bx = 0, x = 0; bx < num_blocks_x; bx++, x += 4) {
            pos_x[0] = bx == 0 ? num_blocks_x - 1 : bx - 1;
            pos_x[1] = bx;
            pos_x[2] = bx == num_blocks_x - 1 ? 0 : bx + 1;
-            for (int cy = 0, c = 0; cy < 3; cy++)
-                for (int cx = 0; cx < 3; cx++, c++)
-                    local_info[c] = &texel_info[MORTON_POS(pos_x[cx], pos_y[cy])];
-            applicate_color_2bpp(data + MORTON_POS(bx, by) * 8, local_info, buffer);
-            int copy_length = (bx < num_blocks_x - 1 ? 8 : copy_length_last) * 4;
-            uint32_t *b = buffer;
-            for (int y = h - by * 4 - 1; b < buffer_end && y >= 0; y--, b += 8)
-                memcpy(image + y * w + x, b, copy_length);
+            for (long cy = 0, c = 0; cy < 3; cy++)
+                for (long cx = 0; cx < 3; cx++, c++)
+                    local_info[c] = &texel_info[morton_index(pos_x[cx], pos_y[cy], min_num_blocks)];
+            applicate_color_func(data + morton_index(bx, by, min_num_blocks) * 8, local_info, buffer);
+            copy_block_buffer(bx, by, w, h, bw, 4, buffer, image);
        }
    }

-    free(morton_table);
    free(texel_info);
    return 1;
 }
@@ -24,7 +24,6 @@ typedef struct {
    uint32_t punch_through_flag;
 } PVRTCTexelInfo;

-int decode_pvrtc_4bpp(const uint8_t*, const int, const int, uint32_t*);
-int decode_pvrtc_2bpp(const uint8_t*, const int, const int, uint32_t*);
+int decode_pvrtc(const uint8_t *, const long, const long, uint32_t *, const int);

 #endif /* end of include guard: PVRTC_H */
@@ -1,66 +1,102 @@
 #include "rgb.h"
-#include "common.h"
+#include <math.h>
 #include <stdint.h>
+#include "color.h"
+#include "fp16.h"

-void decode_a8(const uint8_t* data, const int size, uint8_t* image)
-{
+int decode_a8(const uint8_t *const data, const long size, uint8_t *image) {
    const uint8_t *d = data, *d_end = data + size;
    for (int i = 0; d < d_end; d++) {
        image[i++] = *d;
        image[i++] = *d;
        image[i++] = *d;
    }
+    return 1;
 }

-void decode_r8(const uint8_t* data, const int size, uint8_t* image)
-{
+int decode_r8(const uint8_t *const data, const long size, uint8_t *image) {
    const uint8_t *d = data, *d_end = data + size;
    for (int i = 0; d < d_end; d++) {
        image[i++] = *d;
        image[i++] = 0;
        image[i++] = 0;
    }
+    return 1;
 }

-void decode_r16(const uint16_t* data, const int size, const int endian_big, uint8_t* image)
-{
-    const uint16_t *d = data, *d_end = data + size;
-    if (IS_LITTLE_ENDIAN == !endian_big) {
-        // Same endian
-        for (int i = 0; d < d_end; d++) {
-            uint8_t c = *d >> 8;
-            image[i++] = c;
-            image[i++] = 0;
-            image[i++] = 0;
-        }
-    } else {
-        // Different endian
-        for (int i = 0; d < d_end; d++) {
-            image[i++] = *d;
-            image[i++] = 0;
-            image[i++] = 0;
-        }
+int decode_r16(const uint8_t *const data, const long size, const int endian_big, uint8_t *image) {
+    const uint8_t *d = endian_big ? data : data + 1;
+    const uint8_t *d_end = data + size * 2;
+    for (int i = 0; d < d_end; d += 2) {
+        image[i++] = *d;
+        image[i++] = 0;
+        image[i++] = 0;
    }
+    return 1;
 }

-void decode_rgb565(const uint16_t* data, const int size, const int endian_big, uint8_t* image)
-{
+int decode_rgb565(const uint16_t *const data, const long size, const int endian_big, uint8_t *image) {
    const uint16_t *d = data, *d_end = data + size;
-    if (IS_LITTLE_ENDIAN == !endian_big) {
-        // Same endian
-        // RRRRR GGG | GGG BBBBB
-        for (int i = 0; d < d_end; d++) {
-            image[i++] = (*d >> 8 & 0xf8) | (*d >> 13);
-            image[i++] = (*d >> 3 & 0xfc) | (*d >> 9 & 3);
-            image[i++] = (*d << 3) | (*d >> 2 & 7);
+    if (endian_big)
+        for (; d < d_end; d++, image += 3)
+            rgb565_bep(*d, image);
+    else
+        for (; d < d_end; d++, image += 3)
+            rgb565_lep(*d, image);
+    return 1;
+}
+
+static inline uint8_t u16_f16_u8(const uint16_t val) {
+    float f = fp16_ieee_to_fp32_value(val);
+    if (!isfinite(f) || f < 0)
+        return 0;
+    else if (f > 1)
+        return 255;
+    else
+        return roundf(f * 255);
+}
+
+int decode_rhalf(const uint16_t *data, const long size, const int endian_big, uint8_t *image) {
+    if (endian_big) {
+        for (long i = 0; i < size; i++, data++) {
+            *image++ = u16_f16_u8(bton16(*data));
+            *image++ = 0;
+            *image++ = 0;
        }
    } else {
-        // Different endian
-        // GGG BBBBB | RRRRR GGG
-        for (int i = 0; d < d_end; d++) {
-            image[i++] = (*d & 0xf8) | (*d >> 5 & 7);
-            image[i++] = (*d << 5 & 0xe0) | (*d >> 11 & 0x1c) | (*d >> 1 & 3);
-            image[i++] = (*d >> 5 & 0xf8) | (*d >> 10 & 0x7);
+        for (long i = 0; i < size; i++, data++) {
+            *image++ = u16_f16_u8(lton16(*data));
+            *image++ = 0;
+            *image++ = 0;
        }
    }
+    return 1;
+}
+
+int decode_rghalf(const uint16_t *data, const long size, const int endian_big, uint8_t *image) {
+    if (endian_big) {  // per pixel: two half floats (R, G) in -> three bytes (R, G, 0) out
+        for (long i = 0; i < size; i++) {  // the body alone advances data by 2 and image by 3
+            *image++ = u16_f16_u8(bton16(*data++));
+            *image++ = u16_f16_u8(bton16(*data++));
+            *image++ = 0;  // RG data carries no blue channel
+        }
+    } else {
+        for (long i = 0; i < size; i++) {  // same pixel layout, little-endian input
+            *image++ = u16_f16_u8(lton16(*data++));
+            *image++ = u16_f16_u8(lton16(*data++));
+            *image++ = 0;  // RG data carries no blue channel
+        }
+    }
+    return 1;  // always succeeds; 0 is the failure value used by the other decoders
+}
+
+int decode_rgbahalf(const uint16_t *data, const long size, const int endian_big, uint8_t *image) {
+    long lsize = size * 4;
+    if (endian_big)
+        for (long i = 0; i < lsize; i++, data++, image++)
+            *image = u16_f16_u8(bton16(*data));
+    else
+        for (long i = 0; i < lsize; i++, data++, image++)
+            *image = u16_f16_u8(lton16(*data));
+    return 1;
 }
@@ -3,9 +3,12 @@

 #include <stdint.h>

-void decode_a8(const uint8_t*, const int, uint8_t*);
-void decode_r8(const uint8_t*, const int, uint8_t*);
-void decode_r16(const uint16_t*, const int, const int, uint8_t*);
-void decode_rgb565(const uint16_t*, const int, const int, uint8_t*);
+int decode_a8(const uint8_t *const, const long, uint8_t *);
+int decode_r8(const uint8_t *const, const long, uint8_t *);
+int decode_r16(const uint8_t *const, const long, const int, uint8_t *);
+int decode_rgb565(const uint16_t *const, const long, const int, uint8_t *);
+int decode_rhalf(const uint16_t *const, const long, const int, uint8_t *);
+int decode_rghalf(const uint16_t *const, const long, const int, uint8_t *);
+int decode_rgbahalf(const uint16_t *const, const long, const int, uint8_t *);

 #endif /* end of include guard: RGB_H */