Initial checkin of v1.04 - KTX file format support, basic ETC1 compression/decompression, Linux makefile with proper gcc options, lots of high-level improvements to get crnlib into a state where I can more easily add additional formats.

2012-11-25 08:41:25 +00:00
parent a8011e9d7f
commit f71b49be60
92 changed files with 20362 additions and 781 deletions
@@ -1,5 +1,13 @@
 // File: crn_dxt1.cpp
 // See Copyright Notice and license at the end of inc/crnlib.h
+//
+// Notes:
+// This class is not optimized for performance on small blocks, unlike typical DXT1 compressors. It's optimized for scalability and quality:
+// - Very high quality in terms of avg. RMSE or Luma RMSE. Goal is to always match or beat every other known offline DXTc compressor: ATI_Compress, squish, NVidia texture tools, nvdxt.exe, etc.
+// - Reasonable scalability and stability with hundreds to many thousands of input colors (including inputs with many thousands of equal/nearly equal colors).
+// - Any quality optimization which results in even a tiny improvement is worth it -- as long as it's either a constant or linear slowdown.
+//   Tiny quality improvements can be extremely valuable in large clusters.
+// - Quality should scale well vs. CPU time cost, i.e. the more time you spend the higher the quality.
 #include "crn_core.h"
 #include "crn_dxt1.h"
 #include "crn_ryg_dxt.hpp"
@@ -9,6 +17,22 @@

 namespace crnlib
 {
+   //-----------------------------------------------------------------------------------------------------------------------------------------
+
+   static const int16 g_fast_probe_table[] = { 0, 1, 2, 3 };
+   static const uint cFastProbeTableSize = sizeof(g_fast_probe_table) / sizeof(g_fast_probe_table[0]);
+
+   static const int16 g_normal_probe_table[] = { 0, 1, 3, 5, 7 };
+   static const uint cNormalProbeTableSize = sizeof(g_normal_probe_table) / sizeof(g_normal_probe_table[0]);
+
+   static const int16 g_better_probe_table[] = { 0, 1, 2, 3, 5, 9, 15, 19, 27, 43 };
+   static const uint cBetterProbeTableSize = sizeof(g_better_probe_table) / sizeof(g_better_probe_table[0]);
+
+   static const int16 g_uber_probe_table[] = { 0, 1, 2, 3, 5, 7, 9, 10, 13, 15, 19, 27, 43, 59, 91 };
+   static const uint cUberProbeTableSize = sizeof(g_uber_probe_table) / sizeof(g_uber_probe_table[0]);
+
+   //-----------------------------------------------------------------------------------------------------------------------------------------
+
   dxt1_endpoint_optimizer::dxt1_endpoint_optimizer() :
      m_pParams(NULL),
      m_pResults(NULL),
@@ -75,6 +99,7 @@ namespace crnlib
      return true;
   }

+   // All selectors are equal. Try compressing as if it was solid, using the block's average color, using ryg's optimal single color compression tables.
   bool dxt1_endpoint_optimizer::try_average_block_as_solid()
   {
      uint64 tot_r = 0;
@@ -110,6 +135,7 @@ namespace crnlib

      if (m_pParams->m_quality == cCRNDXTQualityUber)
      {
+         // Try compressing as all-solid using the other (non-average) colors in the block in uber.
         for (uint i = 0; i < m_unique_colors.size(); i++)
         {
            uint r = m_unique_colors[i].m_color[0];
@@ -134,6 +160,7 @@ namespace crnlib
      return improved;
   }

+   // Block is solid, try using ryg's optimal single color tables.
   bool dxt1_endpoint_optimizer::handle_solid_block()
   {
      int r = m_unique_colors[0].m_color.r;
@@ -195,6 +222,7 @@ namespace crnlib
      }
   }

+   // Compute PCA (principal axis, i.e. direction of largest variance) of input vectors.
   void dxt1_endpoint_optimizer::compute_pca(vec3F& axis, const vec3F_array& norm_colors, const vec3F& def)
   {
 #if 0
@@ -202,6 +230,7 @@ namespace crnlib

      CRNLIB_ASSERT(m_unique_colors.size() == norm_colors.size());

+      // Incremental PCA
      bool first = true;
      for (uint i = 0; i < norm_colors.size(); i++)
      {
@@ -272,6 +301,7 @@ namespace crnlib
      //vfr = hi[0] - lo[0];
      //vfg = hi[1] - lo[1];
      //vfb = hi[2] - lo[2];
+      // This is more stable.
      vfr = .9f;
      vfg = 1.0f;
      vfb = .7f;
@@ -325,6 +355,7 @@ namespace crnlib
   static const uint8 g_invTableAlpha[4] = { 1, 0, 2, 3 };
   static const uint8 g_invTableColor[4] = { 1, 0, 3, 2 };

+   // Computes a valid (encodable) DXT1 solution (low/high colors, swizzled selectors) from input.
   void dxt1_endpoint_optimizer::return_solution(results& res, const potential_solution& solution)
   {
      bool invert_selectors;
@@ -433,6 +464,7 @@ namespace crnlib
      return vec3F(c.r, c.g, c.b);
   }

+   // Per-component 1D endpoint optimization.
   void dxt1_endpoint_optimizer::optimize_endpoint_comps()
   {
      if ((m_best_solution.m_alpha_block) || (!m_best_solution.m_error))
@@ -583,6 +615,7 @@ namespace crnlib
      } // comp_index
   }

+   // Voxel adjacency delta coordinates.
   static const struct adjacent_coords
   {
      int8 x, y, z;
@@ -618,6 +651,7 @@ namespace crnlib
      {1, 1, 1}
   };

+   // Attempt to refine current solution's endpoints given the current selectors using least squares.
   bool dxt1_endpoint_optimizer::refine_solution(int refinement_level)
   {
      CRNLIB_ASSERT(m_best_solution.m_valid);
@@ -694,11 +728,15 @@ namespace crnlib
      }
      else if (refinement_level == 1)
      {
+         // Try exploring the local lattice neighbors of the least squares optimized result.
         color_quad_u8 e[2];
+
+         e[0].clear();
         e[0][0] = (uint8)math::clamp<int>(static_cast<int>((At1_r*yy - At2_r*xy)*frb+0.5f),0,31);
         e[0][1] = (uint8)math::clamp<int>(static_cast<int>((At1_g*yy - At2_g*xy)*fg +0.5f),0,63);
         e[0][2] = (uint8)math::clamp<int>(static_cast<int>((At1_b*yy - At2_b*xy)*frb+0.5f),0,31);

+         e[1].clear();
         e[1][0] = (uint8)math::clamp<int>(static_cast<int>((At2_r*xx - At1_r*xy)*frb+0.5f),0,31);
         e[1][1] = (uint8)math::clamp<int>(static_cast<int>((At2_g*xx - At1_g*xy)*fg +0.5f),0,63);
         e[1][2] = (uint8)math::clamp<int>(static_cast<int>((At2_b*xx - At1_b*xy)*frb+0.5f),0,31);
@@ -737,11 +775,14 @@ namespace crnlib
      }
      else
      {
+         // Try even harder to explore the local lattice neighbors of the least squares optimized result.
         color_quad_u8 e[2];
+         e[0].clear();
         e[0][0] = (uint8)math::clamp<int>(static_cast<int>((At1_r*yy - At2_r*xy)*frb+0.5f),0,31);
         e[0][1] = (uint8)math::clamp<int>(static_cast<int>((At1_g*yy - At2_g*xy)*fg +0.5f),0,63);
         e[0][2] = (uint8)math::clamp<int>(static_cast<int>((At1_b*yy - At2_b*xy)*frb+0.5f),0,31);

+         e[1].clear();
         e[1][0] = (uint8)math::clamp<int>(static_cast<int>((At2_r*xx - At1_r*xy)*frb+0.5f),0,31);
         e[1][1] = (uint8)math::clamp<int>(static_cast<int>((At2_g*xx - At1_g*xy)*fg +0.5f),0,63);
         e[1][2] = (uint8)math::clamp<int>(static_cast<int>((At2_b*xx - At1_b*xy)*frb+0.5f),0,31);
@@ -790,63 +831,7 @@ namespace crnlib

   //-----------------------------------------------------------------------------------------------------------------------------------------

-   static int16 g_fast_probe_table[] =
-   {
-      0,
-      1,
-      2,
-      3
-   };
-   const uint cFastProbeTableSize = sizeof(g_fast_probe_table) / sizeof(g_fast_probe_table[0]);
-
-   static int16 g_normal_probe_table[] =
-   {
-      0,
-      1,
-      3,
-      5,
-      7
-   };
-   const uint cNormalProbeTableSize = sizeof(g_normal_probe_table) / sizeof(g_normal_probe_table[0]);
-
-   static int16 g_better_probe_table[] =
-   {
-      0,
-      1,
-      2,
-      3,
-
-      5,
-      9,
-      15,
-      19,
-
-      27,
-      43
-   };
-   const uint cBetterProbeTableSize = sizeof(g_better_probe_table) / sizeof(g_better_probe_table[0]);
-
-   static int16 g_uber_probe_table[] =
-   {
-      0,
-      1,
-      2,
-      3,
-      5,
-      7,
-      9,
-      10,
-      13,
-      15,
-      19,
-      27,
-      43,
-      59,
-      91
-   };
-
-   const uint cUberProbeTableSize = sizeof(g_uber_probe_table) / sizeof(g_uber_probe_table[0]);
-
+   // Primary endpoint optimization entrypoint.
   bool dxt1_endpoint_optimizer::optimize_endpoints(vec3F& low_color, vec3F& high_color)
   {
      vec3F orig_low_color(low_color);
@@ -855,10 +840,11 @@ namespace crnlib
      m_trial_solution.clear();

      uint num_passes;
-      int16* pProbe_table = g_uber_probe_table;
+      const int16* pProbe_table = g_uber_probe_table;
      uint probe_range;
      float dist_per_trial = .015625f;

+      // How many probes, and the distance between each probe depends on the quality level.
      switch (m_pParams->m_quality)
      {
         case cCRNDXTQualitySuperFast:
@@ -895,6 +881,7 @@ namespace crnlib

      if (m_pParams->m_endpoint_caching)
      {
+         // Try the previous X winning endpoints. This may not give us optimal results, but it may increase the probability of early outs while evaluating potential solutions.
         const uint num_prev_results = math::minimum<uint>(cMaxPrevResults, m_num_prev_results);
         for (uint i = 0; i < num_prev_results; i++)
         {
@@ -909,6 +896,7 @@ namespace crnlib

         if (!m_best_solution.m_error)
         {
+            // Got lucky - one of the previous endpoints is optimal.
            return_solution(*m_pResults, m_best_solution);
            return true;
         }
@@ -949,6 +937,12 @@ namespace crnlib

      for (uint pass = 0; pass < num_passes; pass++)
      {
+         // Now separately sweep or probe the low and high colors along the principal axis, both positively and negatively.
+         // This results in two arrays of candidate low/high endpoints. Every unique combination of candidate endpoints is tried as a potential solution.
+         // In higher quality modes, the various nearby lattice neighbors of each candidate endpoint are also explored, which allows the current solution to "wobble" or "migrate"
+         // to areas with lower error.
+         // This entire process can be repeated up to X times (depending on the quality level) until a local minimum is established.
+         // This method is very stable and scalable. It could be implemented more elegantly, but I'm now very cautious of touching this code.
         if (pass)
         {
            low_color = unpack_to_vec3F_raw(m_best_solution.m_coords.m_low_color);
@@ -959,6 +953,7 @@ namespace crnlib
         if (!prev_best_error)
            break;

+         // Sweep low endpoint along principal axis, record positions
         int prev_packed_color[2] = { -1, -1 };
         uint num_low_trials = 0;
         vec3F initial_probe_low_color(low_color + vec3F(.5f));
@@ -987,6 +982,7 @@ namespace crnlib
         prev_packed_color[0] = -1;
         prev_packed_color[1] = -1;

+         // Sweep high endpoint along principal axis, record positions
         uint num_high_trials = 0;
         vec3F initial_probe_high_color(high_color + vec3F(.5f));
         for (uint i = 0; i < probe_range; i++)
@@ -1011,6 +1007,7 @@ namespace crnlib
            }
         }

+         // Now try all unique combinations.
         for (uint i = 0; i < num_low_trials; i++)
         {
            for (uint j = 0; j < num_high_trials; j++)
@@ -1028,6 +1025,7 @@ namespace crnlib

         if (m_pParams->m_quality >= cCRNDXTQualityNormal)
         {
+            // Generate new candidates by exploring the low color's direct lattice neighbors
            color_quad_u8 lc(dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, false));

            for (int i = 0; i < 26; i++)
@@ -1051,6 +1049,7 @@ namespace crnlib

            if (m_pParams->m_quality == cCRNDXTQualityUber)
            {
+               // Generate new candidates by exploring the low color's direct lattice neighbors - this time, explore much further separately on each axis.
               lc = dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, false);

               for (int a = 0; a < 3; a++)
@@ -1075,6 +1074,7 @@ namespace crnlib
               }
            }

+            // Generate new candidates by exploring the high color's direct lattice neighbors
            color_quad_u8 hc(dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, false));

            for (int i = 0; i < 26; i++)
@@ -1098,6 +1098,7 @@ namespace crnlib

            if (m_pParams->m_quality == cCRNDXTQualityUber)
            {
+               // Generate new candidates by exploring the high color's direct lattice neighbors - this time, explore much further separately on each axis.
               hc = dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, false);

               for (int a = 0; a < 3; a++)
@@ -1127,7 +1128,10 @@ namespace crnlib
            break;

         if (m_pParams->m_quality >= cCRNDXTQualityUber)
+         {
+            // Attempt to refine current solution's endpoints given the current selectors using least squares.
            refine_solution(1);
+         }
      }

      if (m_pParams->m_quality >= cCRNDXTQualityNormal)
@@ -1136,16 +1140,26 @@ namespace crnlib
         {
            bool choose_solid_block = false;
            if (m_best_solution.are_selectors_all_equal())
+            {
+               // All selectors equal - try various solid-block optimizations
               choose_solid_block = try_average_block_as_solid();
+            }

            if ((!choose_solid_block) && (m_pParams->m_quality == cCRNDXTQualityUber))
+            {
+               // Per-component 1D endpoint optimization.
               optimize_endpoint_comps();
+            }
         }

         if (m_pParams->m_quality == cCRNDXTQualityUber)
         {
            if (m_best_solution.m_error)
+            {
+               // The pixels may have already been DXTc compressed by another compressor.
+               // It's usually possible to recover the endpoints used to previously pack the block.
               try_combinatorial_encoding();
+            }
         }
      }

@@ -1153,6 +1167,7 @@ namespace crnlib

      if (m_pParams->m_endpoint_caching)
      {
+         // Remember result for later reuse.
         m_prev_results[m_num_prev_results & (cMaxPrevResults - 1)] = m_best_solution.m_coords;
         m_num_prev_results++;
      }
@@ -1173,6 +1188,8 @@ namespace crnlib

      if (m_perceptual)
      {
+         // Compute RGB weighting for use in perceptual mode.
+         // The more saturated the block, the more the weights deviate from (1,1,1).
         float ave_redness = 0;
         float ave_blueness = 0;
         float ave_l = 0;
@@ -1224,6 +1241,8 @@ namespace crnlib
         im.transpose_in_place();
         m_principle_axis = m_principle_axis * im;
 #else
+         // Purposely scale the components of the principal axis by the perceptual weighting.
+         // There's probably a cleaner way to go about this, but it works (more competitive in perceptual mode against nvdxt.exe or ATI_Compress).
         m_principle_axis[0] /= perceptual_weights[0];
         m_principle_axis[1] /= perceptual_weights[1];
         m_principle_axis[2] /= perceptual_weights[2];
@@ -1232,6 +1251,7 @@ namespace crnlib

         if (num_passes > 1)
         {
+            // Check for obviously wild principal axes and try to compensate by backing off the component weightings.
            if (fabs(m_principle_axis[0]) >= .795f)
               perceptual_weights.set(.424f, .6f, .072f);
            else if (fabs(m_principle_axis[2]) >= .795f)
@@ -1241,6 +1261,7 @@ namespace crnlib
         }
      }

+      // Find bounds of projection onto (potentially skewed) principal axis.
      float l = 1e+9;
      float h = -1e+9;

@@ -1256,6 +1277,7 @@ namespace crnlib

      if (!low_color.is_within_bounds(0.0f, 1.0f))
      {
+         // Low color is outside the lattice, so bring it back in by casting a ray.
         vec3F coord;
         float t;
         aabb3F bounds(vec3F(0.0f), vec3F(1.0f));
@@ -1266,6 +1288,7 @@ namespace crnlib

      if (!high_color.is_within_bounds(0.0f, 1.0f))
      {
+         // High color is outside the lattice, so bring it back in by casting a ray.
         vec3F coord;
         float t;
         aabb3F bounds(vec3F(0.0f), vec3F(1.0f));
@@ -1274,6 +1297,7 @@ namespace crnlib
            high_color = coord;
      }

+      // Now optimize the endpoints using the projection bounds on the (potentially skewed) principal axis as a starting point.
      if (!optimize_endpoints(low_color, high_color))
         return false;

@@ -1286,6 +1310,7 @@ namespace crnlib
      return true;
   }

+   // Tries quantizing the block to 4 colors using vanilla LBG. It tries all combinations of the quantized results as potential endpoints.
   bool dxt1_endpoint_optimizer::try_median4(const vec3F& low_color, const vec3F& high_color)
   {
      vec3F means[4];
@@ -1408,6 +1433,8 @@ namespace crnlib
      return improved;
   }

+   // Given candidate low/high endpoints, find the optimal selectors for 3 and 4 color blocks, compute the resulting error,
+   // and use the candidate if it results in less error than the best found result so far.
   bool dxt1_endpoint_optimizer::evaluate_solution(
      const dxt1_solution_coordinates& coords,
      bool early_out,
@@ -1428,6 +1455,7 @@ namespace crnlib

      CRNLIB_ASSERT(m_trial_solution.m_valid);

+      // Caller has requested all considered candidate solutions for later analysis.
      m_pSolutions->resize(m_pSolutions->size() + 1);
      solution& new_solution = m_pSolutions->back();
      new_solution.m_selectors.resize(m_pParams->m_num_pixels);
@@ -1843,6 +1871,8 @@ namespace crnlib
      return unique_color(res, 1);
   }

+   // The block may have been already compressed using another DXTc compressor, such as squish, ATI_Compress, ryg_dxt, etc.
+   // Attempt to recover the endpoints used by that block compressor.
   void dxt1_endpoint_optimizer::try_combinatorial_encoding()
   {
      if ((m_unique_colors.size() < 2) || (m_unique_colors.size() > 4))
@@ -1954,6 +1984,8 @@ namespace crnlib
      return;
   }

+   // The fourth (transparent) color in 3 color "transparent" blocks is black, which can be optionally exploited for small gains in DXT1 mode if the caller
+   // doesn't actually use alpha. (But not in DXT5 mode, because 3-color blocks aren't permitted by GPUs for DXT5.)
   bool dxt1_endpoint_optimizer::try_alpha_as_black_optimization()
   {
      const params*  pOrig_params = m_pParams;
@@ -2077,6 +2109,7 @@ namespace crnlib
      return true;
   }

+   // Build array of unique colors and their weights.
   void dxt1_endpoint_optimizer::find_unique_colors()
   {
      m_has_transparent_pixels = false;