Optimize DXT endpoints computation

This change improves the compression speed for DXT encoding.

Explanation:
The main ideas used to optimize the DXT endpoint computation:
- Instead of using a map in the tree clusterizer, the source vectors can be stored in an array and sorted before quantization. This might increase the amount of memory used, but it is much more efficient in terms of memory reallocation.
- Endpoint caching can be used throughout the color endpoint computation, not just within the optimize_endpoints function. The only place where endpoint caching cannot be used is the final step of the try_combinatorial_encoding function, where alternate rounding is used.
- When computing endpoint codebooks, the endpoint optimizer and the endpoint refiner can be reused, which eliminates unnecessary memory reallocations.

DXT Testing:
The modified algorithm has been tested on the Kodak test set using a 64-bit build with default settings (running on Windows 10, i7-4790, 3.6GHz). All the decompressed test images are identical to the images compressed and decompressed using the original version of Crunch (revision ea9b8d8).

[Compressing Kodak set without mipmaps using DXT1 encoding]
Original: 1582222 bytes / 28.879 sec
Modified: 1468204 bytes / 11.099 sec
Improvement: 7.21% (compression ratio) / 61.57% (compression time)

[Compressing Kodak set with mipmaps using DXT1 encoding]
Original: 2065243 bytes / 36.919 sec
Modified: 1914805 bytes / 14.621 sec
Improvement: 7.28% (compression ratio) / 60.40% (compression time)

ETC Testing:
The modified algorithm has been tested on the Kodak test set using a 64-bit build with default settings (running on Windows 10, i7-4790, 3.6GHz). The ETC1 quantization parameters have been selected so that ETC1 compression gives approximately the same average Luma PSNR as the corresponding DXT1 compression (which is equal to 34.044 dB for the Kodak test set compressed without mipmaps using DXT1 encoding and default quality settings).
[Compressing Kodak set without mipmaps using ETC1 encoding]
Total size: 1607858 bytes
Total time: 17.108 sec
Average bitrate: 1.363 bpp
Average Luma PSNR: 34.050 dB
2017-09-12 13:03:56 +02:00
parent 3e12aff909
commit 3053c9dd93
4 changed files with 42 additions and 92 deletions
@@ -541,10 +541,7 @@ bool dxt1_endpoint_optimizer::refine_solution(int refinement_level) {
            nc.m_high_color = dxt1_block::pack_color(c[1], false);

            nc.canonicalize();
-
-            if ((nc.m_low_color != m_best_solution.m_coords.m_low_color) || (nc.m_high_color != m_best_solution.m_coords.m_high_color)) {
-              improved |= evaluate_solution(nc);
-            }
+            improved |= evaluate_solution(nc);
          }
        }
      }
@@ -644,20 +641,11 @@ void dxt1_endpoint_optimizer::optimize_endpoints(vec3F& low_color, vec3F& high_c
      break;
  }

-  m_solutions_tried.reset();
-
  if (m_pParams->m_endpoint_caching) {
    // Try the previous X winning endpoints. This may not give us optimal results, but it may increase the probability of early outs while evaluating potential solutions.
    const uint num_prev_results = math::minimum<uint>(cMaxPrevResults, m_num_prev_results);
-    for (uint i = 0; i < num_prev_results; i++) {
-      const dxt1_solution_coordinates& coords = m_prev_results[i];
-
-      solution_hash_map::insert_result solution_res(m_solutions_tried.insert(coords.m_low_color | (coords.m_high_color << 16U)));
-      if (!solution_res.second)
-        continue;
-
-      evaluate_solution(coords);
-    }
+    for (uint i = 0; i < num_prev_results; i++)
+      evaluate_solution(m_prev_results[i]);

    if (!m_best_solution.m_error) {
      // Got lucky - one of the previous endpoints is optimal.
@@ -769,11 +757,6 @@ void dxt1_endpoint_optimizer::optimize_endpoints(vec3F& low_color, vec3F& high_c
      for (uint j = 0; j < num_high_trials; j++) {
        dxt1_solution_coordinates coords((uint16)probe_low[i], (uint16)probe_high[j]);
        coords.canonicalize();
-
-        solution_hash_map::insert_result solution_res(m_solutions_tried.insert(coords.m_low_color | (coords.m_high_color << 16U)));
-        if (!solution_res.second)
-          continue;
-
        evaluate_solution(coords);
      }
    }
@@ -797,10 +780,7 @@ void dxt1_endpoint_optimizer::optimize_endpoints(vec3F& low_color, vec3F& high_c

        dxt1_solution_coordinates coords(dxt1_block::pack_color(r, g, b, false), m_best_solution.m_coords.m_high_color);
        coords.canonicalize();
-
-        solution_hash_map::insert_result solution_res(m_solutions_tried.insert(coords.m_low_color | (coords.m_high_color << 16U)));
-        if (solution_res.second)
-          evaluate_solution(coords);
+        evaluate_solution(coords);
      }

      if (m_pParams->m_quality == cCRNDXTQualityUber) {
@@ -820,10 +800,7 @@ void dxt1_endpoint_optimizer::optimize_endpoints(vec3F& low_color, vec3F& high_c

            dxt1_solution_coordinates coords(dxt1_block::pack_color(c, false), m_best_solution.m_coords.m_high_color);
            coords.canonicalize();
-
-            solution_hash_map::insert_result solution_res(m_solutions_tried.insert(coords.m_low_color | (coords.m_high_color << 16U)));
-            if (solution_res.second)
-              evaluate_solution(coords);
+            evaluate_solution(coords);
          }
        }
      }
@@ -846,10 +823,7 @@ void dxt1_endpoint_optimizer::optimize_endpoints(vec3F& low_color, vec3F& high_c

        dxt1_solution_coordinates coords(m_best_solution.m_coords.m_low_color, dxt1_block::pack_color(r, g, b, false));
        coords.canonicalize();
-
-        solution_hash_map::insert_result solution_res(m_solutions_tried.insert(coords.m_low_color | (coords.m_high_color << 16U)));
-        if (solution_res.second)
-          evaluate_solution(coords);
+        evaluate_solution(coords);
      }

      if (m_pParams->m_quality == cCRNDXTQualityUber) {
@@ -869,10 +843,7 @@ void dxt1_endpoint_optimizer::optimize_endpoints(vec3F& low_color, vec3F& high_c

            dxt1_solution_coordinates coords(m_best_solution.m_coords.m_low_color, dxt1_block::pack_color(c, false));
            coords.canonicalize();
-
-            solution_hash_map::insert_result solution_res(m_solutions_tried.insert(coords.m_low_color | (coords.m_high_color << 16U)));
-            if (solution_res.second)
-              evaluate_solution(coords);
+            evaluate_solution(coords);
          }
        }
      }
@@ -1103,7 +1074,6 @@ bool dxt1_endpoint_optimizer::try_median4(const vec3F& low_color, const vec3F& h
          color_quad_u8((int)floor(.5f + v1[0] * 31.0f), (int)floor(.5f + v1[1] * 63.0f), (int)floor(.5f + v1[2] * 31.0f), 255), false);

      sc.canonicalize();
-
      improved |= evaluate_solution(sc);
    }
  }
@@ -1116,6 +1086,11 @@ bool dxt1_endpoint_optimizer::try_median4(const vec3F& low_color, const vec3F& h
 // Given candidate low/high endpoints, find the optimal selectors for 3 and 4 color blocks, compute the resulting error,
 // and use the candidate if it results in less error than the best found result so far.
 bool dxt1_endpoint_optimizer::evaluate_solution(const dxt1_solution_coordinates& coords, bool alternate_rounding) {
+  if (!alternate_rounding) {
+    solution_hash_map::insert_result solution_res(m_solutions_tried.insert(coords.m_low_color | coords.m_high_color << 16));
+    if (!solution_res.second)
+      return false;
+  }
  if (m_evaluate_hc)
    return evaluate_solution_hc(coords, alternate_rounding);
  if (m_pParams->m_quality >= cCRNDXTQualityBetter)
@@ -1698,6 +1673,8 @@ void dxt1_endpoint_optimizer::compute_internal(const params& p, results& r) {
    m_unique_color_hash_map.reset();
  if (m_solutions_tried.get_table_size() > 8192)
    m_solutions_tried.clear();
+  else
+    m_solutions_tried.reset();
  m_unique_colors.clear();
  m_norm_unique_colors.clear();
  m_mean_norm_color.clear();
@@ -484,31 +484,16 @@ void dxt_hc::determine_tiles_task_etc(uint64 data, void*) {
 }

 void dxt_hc::determine_color_endpoint_codebook_task(uint64 data, void*) {
-  const uint thread_index = static_cast<uint>(data);
-
-  if (!m_has_color_blocks)
-    return;
-
-  for (uint cluster_index = 0; cluster_index < m_color_clusters.size(); cluster_index++) {
-    if (m_canceled)
-      return;
-
-    if ((crn_get_current_thread_id() == m_main_thread_id) && ((cluster_index & 63) == 0)) {
-      if (!update_progress(3, cluster_index, m_color_clusters.size()))
-        return;
-    }
-
-    if (m_pTask_pool->get_num_threads()) {
-      if ((cluster_index % (m_pTask_pool->get_num_threads() + 1)) != thread_index)
-        continue;
-    }
+  const uint num_tasks = m_pTask_pool->get_num_threads() + 1;
+  dxt1_endpoint_optimizer optimizer;
+  dxt_endpoint_refiner refiner;
+  crnlib::vector<uint8> selectors;

+  for (uint cluster_index = (uint)data; cluster_index < m_color_clusters.size(); cluster_index += num_tasks) {
    color_cluster& cluster = m_color_clusters[cluster_index];
    if (cluster.pixels.empty())
      continue;

-    crnlib::vector<uint8> selectors(cluster.pixels.size());
-
    dxt1_endpoint_optimizer::params params;
    params.m_block_index = cluster_index;
    params.m_pPixels = cluster.pixels.get_ptr();
@@ -520,9 +505,9 @@ void dxt_hc::determine_color_endpoint_codebook_task(uint64 data, void*) {
    params.m_endpoint_caching = false;

    dxt1_endpoint_optimizer::results results;
+    selectors.resize(params.m_num_pixels);
    results.m_pSelectors = selectors.get_ptr();

-    dxt1_endpoint_optimizer optimizer;
    optimizer.compute(params, results);
    cluster.first_endpoint = results.m_low_color;
    cluster.second_endpoint = results.m_high_color;
@@ -561,7 +546,6 @@ void dxt_hc::determine_color_endpoint_codebook_task(uint64 data, void*) {
      m_block_selectors[cColor][b] = selector | (uint64)weight << 32;
    }

-    dxt_endpoint_refiner refiner;
    dxt_endpoint_refiner::params refinerParams;
    dxt_endpoint_refiner::results refinerResults;
    refinerParams.m_perceptual = m_params.m_perceptual;
@@ -692,28 +676,16 @@ void dxt_hc::determine_color_endpoints() {
 }

 void dxt_hc::determine_alpha_endpoint_codebook_task(uint64 data, void*) {
-  const uint thread_index = static_cast<uint>(data);
-
-  for (uint cluster_index = 0; cluster_index < m_alpha_clusters.size(); cluster_index++) {
-    if (m_canceled)
-      return;
-
-    if ((crn_get_current_thread_id() == m_main_thread_id) && ((cluster_index & 63) == 0)) {
-      if (!update_progress(8, cluster_index, m_alpha_clusters.size()))
-        return;
-    }
-
-    if (m_pTask_pool->get_num_threads()) {
-      if ((cluster_index % (m_pTask_pool->get_num_threads() + 1)) != thread_index)
-        continue;
-    }
+  const uint num_tasks = m_pTask_pool->get_num_threads() + 1;
+  dxt5_endpoint_optimizer optimizer;
+  dxt_endpoint_refiner refiner;
+  crnlib::vector<uint8> selectors;

+  for (uint cluster_index = (uint)data; cluster_index < m_alpha_clusters.size(); cluster_index += num_tasks) {
    alpha_cluster& cluster = m_alpha_clusters[cluster_index];
    if (cluster.pixels.empty())
      continue;

-    crnlib::vector<uint8> selectors(cluster.pixels.size());
-
    dxt5_endpoint_optimizer::params params;
    params.m_pPixels = cluster.pixels.get_ptr();
    params.m_num_pixels = cluster.pixels.size();
@@ -722,9 +694,9 @@ void dxt_hc::determine_alpha_endpoint_codebook_task(uint64 data, void*) {
    params.m_use_both_block_types = false;

    dxt5_endpoint_optimizer::results results;
+    selectors.resize(params.m_num_pixels);
    results.m_pSelectors = selectors.get_ptr();

-    dxt5_endpoint_optimizer optimizer;
    optimizer.compute(params, results);
    cluster.first_endpoint = results.m_first_endpoint;
    cluster.second_endpoint = results.m_second_endpoint;
@@ -777,7 +749,6 @@ void dxt_hc::determine_alpha_endpoint_codebook_task(uint64 data, void*) {
      }
    }

-    dxt_endpoint_refiner refiner;
    dxt_endpoint_refiner::params refinerParams;
    dxt_endpoint_refiner::results refinerResults;
    refinerParams.m_perceptual = m_params.m_perceptual;
@@ -19,15 +19,7 @@ class tree_clusterizer {
  }

  void add_training_vec(const VectorType& v, uint weight) {
-    const std::pair<typename vector_map_type::iterator, bool> insert_result(m_hist.insert(std::make_pair(v, 0U)));
-
-    typename vector_map_type::iterator it(insert_result.first);
-
-    uint max_weight = UINT_MAX - weight;
-    if (weight > max_weight)
-      it->second = UINT_MAX;
-    else
-      it->second = it->second + weight;
+    m_hist.push_back(std::make_pair(v, weight));
  }

  bool generate_codebook(uint max_size) {
@@ -39,13 +31,23 @@ class tree_clusterizer {
    vq_node root;
    root.m_vectors.reserve(static_cast<uint>(m_hist.size()));

-    for (typename vector_map_type::const_iterator it = m_hist.begin(); it != m_hist.end(); ++it) {
-      const VectorType& v = it->first;
-      const uint weight = it->second;
+    std::sort(m_hist.begin(), m_hist.end());
+    for (uint i = 0; i < m_hist.size(); i++) {
+      if (!root.m_vectors.size() || m_hist[i].first != root.m_vectors.back().first) {
+        root.m_vectors.push_back(m_hist[i]);
+      } else if (root.m_vectors.back().second > UINT_MAX - m_hist[i].second) {
+        root.m_vectors.back().second = UINT_MAX;
+      } else {
+        root.m_vectors.back().second += m_hist[i].second;
+      }
+    }    
+
+    for (uint i = 0; i < root.m_vectors.size(); i++) {
+      const VectorType& v = root.m_vectors[i].first;
+      const uint weight = root.m_vectors[i].second;

      root.m_centroid += (v * (float)weight);
      root.m_total_weight += weight;
-      root.m_vectors.push_back(std::make_pair(v, weight));

      ttsum += v.dot(v) * weight;
    }
@@ -166,7 +168,7 @@ class tree_clusterizer {
 private:
  typedef std::map<VectorType, uint> vector_map_type;

-  vector_map_type m_hist;
+  crnlib::vector<std::pair<VectorType, uint> > m_hist;

  struct vq_node {
    vq_node()