Optimize computation of the endpoint cluster indices

This change improves the compression speed for both DXT and ETC encodings. Explanation: The vectors which are processed in the cluster indices computation step are the very same vectors which were used in the vector quantization step. This means that every processed vector already has a specific centroid associated with it. Even though the associated centroid is not necessarily the closest one to the processed vector, the distance to the associated centroid can be used as an upper bound on the distance to the closest centroid. This makes it possible to perform an efficient early-out while computing the distances to the other centroids. Note: The modified algorithm is supposed to generate a decompression result identical to that of the original version of Crunch. For this reason the centroid associated with a specific training vector is not used as an initial best solution, because it could potentially change the decompression result in cases when the processed training vector is equidistant from multiple centroids (selection of the closest centroid in such cases depends on the processing order). DXT Testing: The modified algorithm has been tested on the Kodak test set using a 64-bit build with default settings (running on Windows 10, i7-4790, 3.6GHz). All the decompressed test images are identical to the images compressed and decompressed using the original version of Crunch (revision ea9b8d8). [Compressing Kodak set without mipmaps using DXT1 encoding] Original: 1582222 bytes / 28.847 sec Modified: 1468204 bytes / 8.929 sec Improvement: 7.21% (compression ratio) / 69.05% (compression time) [Compressing Kodak set with mipmaps using DXT1 encoding] Original: 2065243 bytes / 36.953 sec Modified: 1914805 bytes / 11.651 sec Improvement: 7.28% (compression ratio) / 68.47% (compression time) ETC Testing: The modified algorithm has been tested on the Kodak test set using a 64-bit build with default settings (running on Windows 10, i7-4790, 3.6GHz).
The ETC1 quantization parameters have been selected so that ETC1 compression gives approximately the same average Luma PSNR as the corresponding DXT1 compression (which is equal to 34.044 dB for the Kodak test set compressed without mipmaps using DXT1 encoding and default quality settings). [Compressing Kodak set without mipmaps using ETC1 encoding] Total size: 1607858 bytes Total time: 15.695 sec Average bitrate: 1.363 bpp Average Luma PSNR: 34.050 dB
2017-10-10 17:13:41 +02:00
parent 51f73fdfed
commit 65f44319c0
4 changed files with 61 additions and 23 deletions
@@ -629,10 +629,36 @@ void dxt_hc::determine_color_endpoint_codebook_task_etc(uint64 data, void*) {

 void dxt_hc::determine_color_endpoint_clusters_task(uint64 data, void* pData_ptr) {
  tree_clusterizer<vec6F>* vq = (tree_clusterizer<vec6F>*)pData_ptr;
+  // Task worker: for the slice of m_tiles selected by the task number `data`
+  // (out of get_num_threads() + 1 slices), stores in cluster_indices[cColor]
+  // the index of the codebook entry nearest to each non-empty tile's
+  // color_endpoint. pData_ptr is the tree_clusterizer<vec6F> that owns the
+  // codebook and the vector -> node index map.
+  const crnlib::vector<vec6F>& codebook = vq->get_codebook();
  uint num_tasks = m_pTask_pool->get_num_threads() + 1;
  for (uint t = m_tiles.size() * data / num_tasks, tEnd = m_tiles.size() * (data + 1) / num_tasks; t < tEnd; t++) {
-    if (m_tiles[t].pixels.size())
-      m_tiles[t].cluster_indices[cColor] = vq->find_best_codebook_entry_fs(m_tiles[t].color_endpoint);
+    if (m_tiles[t].pixels.size()) {
+      const vec6F& v = m_tiles[t].color_endpoint;
+      // node_dist is the distance from v to one particular codebook entry (the
+      // one recorded for v by the clusterizer), so it is an upper bound on the
+      // distance to the nearest entry and can be used to reject candidates early.
+      const uint node_index = vq->get_node_index(v);
+      float node_dist = codebook[node_index].squared_distance(v);
+      float best_dist = math::cNearlyInfinite;
+      // Start from the associated entry rather than entry 0: best_dist stays
+      // "infinite", so the first surviving candidate still replaces it and the
+      // tie-breaking order is unchanged. The seed only matters if every
+      // candidate is rejected by the early-outs below, which could happen if the
+      // unrolled sum rounds differently from squared_distance() (assumption:
+      // depends on how squared_distance() accumulates - TODO confirm).
+      uint best_index = node_index;
+      for (uint i = 0; i < codebook.size(); i++) {
+        const vec6F& c = codebook[i];
+        // The squared distance is accumulated two components at a time; the
+        // partial sum only grows, so once it exceeds node_dist this entry
+        // cannot be the nearest one.
+        float dist = 0;
+        dist += (c[0] - v[0]) * (c[0] - v[0]);
+        dist += (c[1] - v[1]) * (c[1] - v[1]);
+        if (dist > node_dist)
+          continue;
+        dist += (c[2] - v[2]) * (c[2] - v[2]);
+        dist += (c[3] - v[3]) * (c[3] - v[3]);
+        if (dist > node_dist)
+          continue;
+        dist += (c[4] - v[4]) * (c[4] - v[4]);
+        dist += (c[5] - v[5]) * (c[5] - v[5]);
+        // Strict '<' keeps the lowest index among equidistant entries.
+        if (dist < best_dist) {
+          best_dist = dist;
+          best_index = i;
+          // An exact match cannot be beaten; stop searching.
+          if (best_dist == 0.0f)
+            break;
+        }
+      }
+      m_tiles[t].cluster_indices[cColor] = best_index;
+    }
  }
 }

@@ -643,7 +669,7 @@ void dxt_hc::determine_color_endpoints() {
      vq.add_training_vec(m_tiles[t].color_endpoint, (uint)(m_tiles[t].pixels.size() * m_tiles[t].weight));
  }

-  vq.generate_codebook(math::minimum<uint>(m_num_tiles, m_params.m_color_endpoint_codebook_size));
+  vq.generate_codebook(math::minimum<uint>(m_num_tiles, m_params.m_color_endpoint_codebook_size), true);
  m_color_clusters.resize(vq.get_codebook_size());

  for (uint i = 0; i <= m_pTask_pool->get_num_threads(); i++)
@@ -773,11 +799,25 @@ void dxt_hc::determine_alpha_endpoint_codebook_task(uint64 data, void*) {

 void dxt_hc::determine_alpha_endpoint_clusters_task(uint64 data, void* pData_ptr) {
  tree_clusterizer<vec2F>* vq = (tree_clusterizer<vec2F>*)pData_ptr;
+  const crnlib::vector<vec2F>& codebook = vq->get_codebook();  // Task worker: exhaustive nearest-codebook-entry search for the alpha endpoints of the tiles in slice `data`.
  uint num_tasks = m_pTask_pool->get_num_threads() + 1;
  for (uint t = m_tiles.size() * data / num_tasks, tEnd = m_tiles.size() * (data + 1) / num_tasks; t < tEnd; t++) {
    if (m_tiles[t].pixels.size()) {
-      for (uint a = 0; a < m_num_alpha_blocks; a++)
-        m_tiles[t].cluster_indices[cAlpha0 + a] = vq->find_best_codebook_entry_fs(m_tiles[t].alpha_endpoints[a]);
+      for (uint a = 0; a < m_num_alpha_blocks; a++) {  // One lookup per alpha block of the tile.
+        const vec2F& v = m_tiles[t].alpha_endpoints[a];
+        float best_dist = math::cNearlyInfinite;
+        uint best_index = 0;  // Result if no entry beats the initial best_dist (e.g. empty codebook).
+        for (uint i = 0; i < codebook.size(); i++) {
+          float dist = (codebook[i][0] - v[0]) * (codebook[i][0] - v[0]) + (codebook[i][1] - v[1]) * (codebook[i][1] - v[1]);  // Squared Euclidean distance over both components.
+          if (dist < best_dist) {  // Strict '<': the lowest index wins among equidistant entries.
+            best_dist = dist;
+            best_index = i;
+            if (best_dist == 0.0f)  // Exact match cannot be improved on.
+              break;
+          }
+        }
+        m_tiles[t].cluster_indices[cAlpha0 + a] = best_index;
+      }
    }
  }
 }
@@ -20,13 +20,14 @@ class tree_clusterizer {
    m_vectors.clear();
    m_codebook.clear();
    m_nodes.clear();
+    m_node_index_map.clear();
  }

  // Appends the pair (v, weight) to the m_hist list of training vectors.
  void add_training_vec(const VectorType& v, uint weight) {
    m_hist.push_back(std::make_pair(v, weight));
  }

-  bool generate_codebook(uint max_size) {
+  bool generate_codebook(uint max_size, bool generate_node_index_map = false) {
    if (m_hist.empty())
      return false;

@@ -115,11 +116,20 @@ class tree_clusterizer {

      node.m_codebook_index = m_codebook.size();
      m_codebook.push_back(node.m_centroid);
+
+      if (generate_node_index_map) {
+        for (uint j = 0; j < node.m_vectors.size(); j++)
+          m_node_index_map.insert(std::make_pair(m_vectors[node.m_vectors[j].index], node.m_codebook_index));
+      }
    }

    return true;
  }

+  // Returns the codebook index that generate_codebook() recorded for training
+  // vector v in m_node_index_map (the map is only filled when generate_codebook
+  // is called with generate_node_index_map == true).
+  // If v has no entry in the map, returns 0 instead of dereferencing the end
+  // iterator. Callers that only use the result as "some codebook entry" remain
+  // correct as long as the codebook is non-empty.
+  inline uint get_node_index(const VectorType& v) {
+    typename crnlib::hash_map<VectorType, uint>::iterator it = m_node_index_map.find(v);
+    if (it == m_node_index_map.end())
+      return 0;
+    return it->second;
+  }
+
  // Returns the number of entries currently stored in m_codebook.
  inline uint get_codebook_size() const {
    return m_codebook.size();
  }
@@ -133,23 +143,6 @@ class tree_clusterizer {
    return m_codebook;
  }

-  uint find_best_codebook_entry_fs(const VectorType& v) const {  // Exhaustive search: returns the index of the m_codebook entry with the smallest squared_distance to v (0 if none beats the initial best_dist).
-    float best_dist = math::cNearlyInfinite;
-    uint best_index = 0;
-
-    for (uint i = 0; i < m_codebook.size(); i++) {
-      float dist = m_codebook[i].squared_distance(v);
-      if (dist < best_dist) {  // Strict '<': the lowest index wins among equidistant entries.
-        best_dist = dist;
-        best_index = i;
-        if (best_dist == 0.0f)  // Exact match; stop early.
-          break;
-      }
-    }
-
-    return best_index;
-  }
-
 private:

  crnlib::vector<std::pair<VectorType, uint> > m_hist;
@@ -157,6 +150,7 @@ class tree_clusterizer {
  crnlib::vector<VectorType> m_weightedVectors;
  crnlib::vector<uint> m_left_children_indices;
  crnlib::vector<uint> m_right_children_indices;
+  crnlib::hash_map<VectorType, uint> m_node_index_map;

  struct vq_node {
    vq_node()
@@ -206,6 +206,10 @@ class vec : public helpers::rel_ops<vec<N, T> > {
    return m_s[i];
  }

+  inline operator size_t() const {  // Conversion to size_t yielding a hash of this object's bytes; presumably what lets vec serve as a crnlib::hash_map key - confirm against the hasher.
+    return (size_t)fast_hash(this, sizeof(*this));  // NOTE(review): hashes the raw object representation, so values that compare equal but differ bitwise (e.g. +0.0f vs -0.0f) would hash differently; the conversion is also implicit (not 'explicit').
+  }
+
  // Returns component 0 of the vector.
  inline T get_x(void) const { return m_s[0]; }
  inline T get_y(void) const {
    CRNLIB_ASSUME(N >= 2);