Optimize DXT endpoints refinement

This change improves the compression speed for DXT encoding. Explanation: When creating the array of trial alpha endpoints, there is no need to use a bit array for tracking duplicate entries. Instead, the uniqueness of the endpoint pair can be determined using simple comparison operations. Moreover, it is not necessary to go through all the source pixels on every iteration in order to calculate the total squared error for a specific trial endpoint. Considering that selector values are not modified during the refinement step, each selector has a fixed set of pixels associated with it during optimization. This means that the calculation of the total squared error can be optimized in the following way: sum((x - p(i)) * (x - p(i))) = sum(x * x) - sum(2 * x * p(i)) + sum(p(i) * p(i)) = N * x * x - sum(2 * p(i)) * x + sum(p(i) * p(i)). As the set of pixels associated with a specific selector is fixed, the sum(2 * p(i)) and sum(p(i) * p(i)) values can be precalculated. This means that error computation for each component now requires only (3 * S) multiplications instead of N (where N is the number of pixels in the processed cluster, and S is the number of selectors, equal to 4 for color components and 8 for alpha components). DXT Testing: The modified algorithm has been tested on the Kodak test set using a 64-bit build with default settings (running on Windows 10, i7-4790, 3.6GHz). All the decompressed test images are identical to the images being compressed and decompressed using the original version of Crunch (revision ea9b8d8).
[Compressing Kodak set without mipmaps using DXT1 encoding] Original: 1582222 bytes / 28.864 sec Modified: 1468204 bytes / 10.794 sec Improvement: 7.21% (compression ratio) / 62.60% (compression time) [Compressing Kodak set with mipmaps using DXT1 encoding] Original: 2065243 bytes / 36.912 sec Modified: 1914805 bytes / 14.244 sec Improvement: 7.28% (compression ratio) / 61.41% (compression time) ETC Testing: The modified algorithm has been tested on the Kodak test set using a 64-bit build with default settings (running on Windows 10, i7-4790, 3.6GHz). The ETC1 quantization parameters have been selected in such a way that ETC1 compression gives approximately the same average Luma PSNR as the corresponding DXT1 compression (which is equal to 34.044 dB for the Kodak test set compressed without mipmaps using DXT1 encoding and default quality settings). [Compressing Kodak set without mipmaps using ETC1 encoding] Total size: 1607858 bytes Total time: 17.125 sec Average bitrate: 1.363 bpp Average Luma PSNR: 34.050 dB
2017-09-12 16:50:03 +02:00
parent 3053c9dd93
commit 335f0ee056
2 changed files with 88 additions and 207 deletions
@@ -91,237 +91,118 @@ bool dxt_endpoint_refiner::refine(const params& p, results& r) {
  else
    optimize_dxt5(l, h);

-  //if (r.m_low_color < r.m_high_color)
-  //   utils::swap(r.m_low_color, r.m_high_color);
-
  return r.m_error < p.m_error_to_beat;
 }

 void dxt_endpoint_refiner::optimize_dxt5(vec3F low_color, vec3F high_color) {
-  float nl = low_color[0];
-  float nh = high_color[0];
+  uint8 L0 = math::clamp<int>(low_color[0] * 256.0f, 0, 255);
+  uint8 H0 = math::clamp<int>(high_color[0] * 256.0f, 0, 255);

-#if CRNLIB_DXT_ALT_ROUNDING
-  nl = math::clamp(nl, 0.0f, .999f);
-  nh = math::clamp(nh, 0.0f, .999f);
-  uint il = (int)floor(nl * 256.0f);
-  uint ih = (int)floor(nh * 256.0f);
-#else
-  uint il = (int)floor(.5f + math::clamp(nl, 0.0f, 1.0f) * 255.0f);
-  uint ih = (int)floor(.5f + math::clamp(nh, 0.0f, 1.0f) * 255.0f);
-#endif
+  uint64 hist[8] = {}, D2[8] = {}, DD[8] = {};
+  for (uint c = m_pParams->m_alpha_comp_index, i = 0; i < m_pParams->m_num_pixels; i++) {
+    uint8 a = m_pParams->m_pPixels[i][c];
+    uint8 s = m_pParams->m_pSelectors[i];
+    hist[s]++;
+    D2[s] += a * 2;
+    DD[s] += a * a;
+  }

-  crnlib::vector<uint> trial_solutions;
-  trial_solutions.reserve(256);
-  trial_solutions.push_back(il | (ih << 8));
-
-  sparse_bit_array flags;
-  flags.resize(256 * 256);
-
-  flags.set_bit((il * 256) + ih);
-
-  const int cProbeAmount = 11;
-
-  for (int l_delta = -cProbeAmount; l_delta <= cProbeAmount; l_delta++) {
-    const int l = il + l_delta;
-    if (l < 0)
-      continue;
-    else if (l > 255)
-      break;
-
-    const uint bit_index = l * 256;
-
-    for (int h_delta = -cProbeAmount; h_delta <= cProbeAmount; h_delta++) {
-      const int h = ih + h_delta;
-      if (h < 0)
-        continue;
-      else if (h > 255)
-        break;
-
-      if ((flags.get_bit(bit_index + h)) || (flags.get_bit(h * 256 + l)))
-        continue;
-
-      flags.set_bit(bit_index + h);
-
-      trial_solutions.push_back(l | (h << 8));
+  uint16 solutions[529];
+  uint solutions_count = 0;
+  solutions[solutions_count++] = L0 == H0 ? H0 ? H0 - 1 << 8 | L0 : 1 : L0 > H0 ? H0 << 8 | L0 : L0 << 8 | H0;
+  uint8 minL = L0 <= 11 ? 0 : L0 - 11, maxL = L0 >= 244 ? 255 : L0 + 11;
+  uint8 minH = H0 <= 11 ? 0 : H0 - 11, maxH = H0 >= 244 ? 255 : H0 + 11;
+  for (uint16 L = minL; L <= maxL; L++) {
+    for (uint16 H = minH; H <= maxH; H++) {
+      if ((maxH < L || L <= H || H < minL) && (L != L0 || H != H0) && (L != H0 || H != L0))
+        solutions[solutions_count++] = L == H ? H ? H - 1 << 8 | L : 1 : L > H ? H << 8 | L : L << 8 | H;
    }
  }

-  for (uint trial = 0; trial < trial_solutions.size(); trial++) {
-    uint l = trial_solutions[trial] & 0xFF;
-    uint h = trial_solutions[trial] >> 8;
-
-    if (l == h) {
-      if (h)
-        h--;
-      else
-        l++;
-    } else if (l < h) {
-      utils::swap(l, h);
-    }
-
-    CRNLIB_ASSERT(l > h);
-
-    uint values[cDXT5SelectorValues];
-    dxt5_block::get_block_values8(values, l, h);
-
-    uint total_error = 0;
-
-    for (uint j = 0; j < m_pParams->m_num_pixels; j++) {
-      int p = m_pParams->m_pPixels[j][m_pParams->m_alpha_comp_index];
-      int c = values[m_pParams->m_pSelectors[j]];
-
-      int error = p - c;
-      error *= error;
-
-      total_error += error;
-
-      if (total_error > m_pResults->m_error)
-        break;
-    }
-
-    if (total_error < m_pResults->m_error) {
-      m_pResults->m_error = total_error;
-      m_pResults->m_low_color = static_cast<uint16>(l);
-      m_pResults->m_high_color = static_cast<uint16>(h);
-
-      if (m_pResults->m_error == 0)
+  for (uint i = 0; i < solutions_count; i++) {
+    uint8 L = solutions[i] & 0xFF;
+    uint8 H = solutions[i] >> 8;
+    uint values[8];
+    dxt5_block::get_block_values8(values, L, H);
+    uint64 error = 0;
+    for (uint64 s = 0; s < 8; s++)
+      error += hist[s] * values[s] * values[s] - D2[s] * values[s] + DD[s];
+    if (error < m_pResults->m_error) {
+      m_pResults->m_low_color = L;
+      m_pResults->m_high_color = H;
+      m_pResults->m_error = error;
+      if (!m_pResults->m_error)
        return;
    }
  }
 }

 void dxt_endpoint_refiner::optimize_dxt1(vec3F low_color, vec3F high_color) {
-  uint selector_hist[4];
-  utils::zero_object(selector_hist);
-  for (uint i = 0; i < m_pParams->m_num_pixels; i++)
-    selector_hist[m_pParams->m_pSelectors[i]]++;
+  uint16 L0 = math::clamp<int>(low_color[0] * 32.0f, 0, 31) << 11 | math::clamp<int>(low_color[1] * 64.0f, 0, 63) << 5 | math::clamp<int>(low_color[2] * 32.0f, 0, 31);
+  uint16 H0 = math::clamp<int>(high_color[0] * 32.0f, 0, 31) << 11 | math::clamp<int>(high_color[1] * 64.0f, 0, 63) << 5 | math::clamp<int>(high_color[2] * 32.0f, 0, 31);

-  dxt1_solution_coordinates c(low_color, high_color);
-
-  for (uint pass = 0; pass < 8; pass++) {
-    const uint64 initial_error = m_pResults->m_error;
-
-    dxt1_solution_coordinates_vec coords_to_try;
-
-    coords_to_try.resize(0);
-
-    color_quad_u8 lc(dxt1_block::unpack_color(c.m_low_color, false));
-    color_quad_u8 hc(dxt1_block::unpack_color(c.m_high_color, false));
-
-    for (int i = 0; i < 27; i++) {
-      if (13 == i)
-        continue;
-
-      const int ir = (i % 3) - 1;
-      const int ig = ((i / 3) % 3) - 1;
-      const int ib = ((i / 9) % 3) - 1;
-
-      int r = lc.r + ir;
-      int g = lc.g + ig;
-      int b = lc.b + ib;
-      if ((r < 0) || (r > 31) || (g < 0) || (g > 63) || (b < 0) || (b > 31))
-        continue;
-
-      coords_to_try.push_back(
-          dxt1_solution_coordinates(dxt1_block::pack_color(r, g, b, false), c.m_high_color));
+  uint64 hist[4] = {}, D2[4][3] = {}, DD[4][3] = {};
+  for (uint i = 0; i < m_pParams->m_num_pixels; i++) {
+    const color_quad_u8& pixel = m_pParams->m_pPixels[i];
+    uint8 s = m_pParams->m_pSelectors[i];
+    hist[s]++;
+    for (uint c = 0; c < 3; c++) {
+      D2[s][c] += pixel[c] * 2;
+      DD[s][c] += pixel[c] * pixel[c];
    }
+  }
+  crnlib::vector<uint> solutions(54);
+  bool preserveL = hist[0] + hist[2] > hist[1] + hist[3];
+  bool improved = true;

-    for (int i = 0; i < 27; i++) {
-      if (13 == i)
-        continue;
-
-      const int ir = (i % 3) - 1;
-      const int ig = ((i / 3) % 3) - 1;
-      const int ib = ((i / 9) % 3) - 1;
-
-      int r = hc.r + ir;
-      int g = hc.g + ig;
-      int b = hc.b + ib;
-      if ((r < 0) || (r > 31) || (g < 0) || (g > 63) || (b < 0) || (b > 31))
-        continue;
-
-      coords_to_try.push_back(dxt1_solution_coordinates(c.m_low_color, dxt1_block::pack_color(r, g, b, false)));
+  for (uint iterations = 8; improved && iterations; iterations--) {
+    improved = false;
+    uint solutions_count = 0;
+    for (uint16 b0 = L0 & 31, g0 = L0 >> 5 & 63, r0 = L0 >> 11 & 31, b = b0 ? b0 - 1 : b0; b <= b0 + 1 && b <= 31; b++) {
+      for (uint16 g = g0 ? g0 - 1 : g0; g <= g0 + 1 && g <= 63; g++) {
+        for (uint16 r = r0 ? r0 - 1 : r0; r <= r0 + 1 && r <= 31; r++) {
+          uint16 L = r << 11 | g << 5 | b;
+          if (L != L0)
+            solutions[solutions_count++] = L > H0 ? L | H0 << 16 : H0 | L << 16;
+        }
+      }
    }
-
-    std::sort(coords_to_try.begin(), coords_to_try.end());
-
-    dxt1_solution_coordinates_vec::const_iterator p_last = std::unique(coords_to_try.begin(), coords_to_try.end());
-    uint num_coords_to_try = (uint)(p_last - coords_to_try.begin());
-
-    for (uint i = 0; i < num_coords_to_try; i++) {
+    for (uint16 b0 = H0 & 31, g0 = H0 >> 5 & 63, r0 = H0 >> 11 & 31, b = b0 ? b0 - 1 : b0; b <= b0 + 1 && b <= 31; b++) {
+      for (uint16 g = g0 ? g0 - 1 : g0; g <= g0 + 1 && g <= 63; g++) {
+        for (uint16 r = r0 ? r0 - 1 : r0; r <= r0 + 1 && r <= 31; r++) {
+          uint16 H = r << 11 | g << 5 | b;
+          if (H != H0)
+            solutions[solutions_count++] = H > L0 ? H | L0 << 16 : L0 | H << 16;
+        }
+      }
+    }
+    std::sort(solutions.begin(), solutions.begin() + solutions_count);
+    for (uint i = 0; i < solutions_count; i++) {
+      if (i && solutions[i] == solutions[i - 1])
+        continue;
+      uint16 L = solutions[i] & 0xFFFF;
+      uint16 H = solutions[i] >> 16;
+      if (L == H) {
+        L += !preserveL ? ~L & 0x1F ? 0x1 : ~L & 0xF800 ? 0x800 : ~L & 0x7E0 ? 0x20 : 0 : !L ? 0x1 : 0;
+        H -= preserveL ? H & 0x1F ? 0x1 : H & 0xF800 ? 0x800 : H & 0x7E0 ? 0x20 : 0 : H == 0xFFFF ? 0x1 : 0;
+      }
      color_quad_u8 block_colors[4];
-      uint16 l = coords_to_try[i].m_low_color;
-      uint16 h = coords_to_try[i].m_high_color;
-      if (l < h)
-        utils::swap(l, h);
-      else if (l == h) {
-        color_quad_u8 lc(dxt1_block::unpack_color(l, false));
-        color_quad_u8 hc(dxt1_block::unpack_color(h, false));
-
-        bool retry = false;
-        if ((selector_hist[0] + selector_hist[2]) > (selector_hist[1] + selector_hist[3])) {
-          // l affects the output more than h, so muck with h
-          if (hc[2] != 0)
-            hc[2]--;
-          else if (hc[0] != 0)
-            hc[0]--;
-          else if (hc[1] != 0)
-            hc[1]--;
-          else
-            retry = true;
-        } else {
-          // h affects the output more than l, so muck with l
-          if (lc[2] != 31)
-            lc[2]++;
-          else if (lc[0] != 31)
-            lc[0]++;
-          else if (lc[1] != 63)
-            lc[1]++;
-          else
-            retry = true;
-        }
-
-        if (retry) {
-          if (l == 0)
-            l++;
-          else
-            h--;
-        } else {
-          l = dxt1_block::pack_color(lc, false);
-          h = dxt1_block::pack_color(hc, false);
-        }
-
-        CRNLIB_ASSERT(l > h);
+      dxt1_block::get_block_colors4(block_colors, L, H);
+      uint64 error = 0;
+      for (uint64 s = 0, d[3]; s < 4; s++) {
+        for (uint c = 0; c < 3; c++)
+          d[c] = hist[s] * block_colors[s][c] * block_colors[s][c] - D2[s][c] * block_colors[s][c] + DD[s][c];
+        error += m_pParams->m_perceptual ? d[0] * 8 + d[1] * 25 + d[2] : d[0] + d[1] + d[2];
      }
-
-      dxt1_block::get_block_colors4(block_colors, l, h);
-
-      uint total_error = 0;
-
-      for (uint j = 0; j < m_pParams->m_num_pixels; j++) {
-        const color_quad_u8& c = block_colors[m_pParams->m_pSelectors[j]];
-        total_error += color::color_distance(m_pParams->m_perceptual, c, m_pParams->m_pPixels[j], false);
-
-        if (total_error > m_pResults->m_error)
-          break;
-      }
-
-      if (total_error < m_pResults->m_error) {
-        m_pResults->m_error = total_error;
-        m_pResults->m_low_color = l;
-        m_pResults->m_high_color = h;
-        CRNLIB_ASSERT(l > h);
-        if (m_pResults->m_error == 0)
+      if (error < m_pResults->m_error) {
+        m_pResults->m_low_color = L0 = L;
+        m_pResults->m_high_color = H0 = H;
+        m_pResults->m_error = error;
+        if (!m_pResults->m_error)
          return;
+        improved = true;
      }
    }
-
-    if (m_pResults->m_error == initial_error)
-      break;
-
-    c.m_low_color = m_pResults->m_low_color;
-    c.m_high_color = m_pResults->m_high_color;
  }
 }