diff --git a/bin/crunch_x64.exe b/bin/crunch_x64.exe index 311f8b6..768e239 100644 Binary files a/bin/crunch_x64.exe and b/bin/crunch_x64.exe differ diff --git a/crnlib/crn_dxt1.cpp b/crnlib/crn_dxt1.cpp index e0b84f8..ff80799 100644 --- a/crnlib/crn_dxt1.cpp +++ b/crnlib/crn_dxt1.cpp @@ -295,125 +295,87 @@ void dxt1_endpoint_optimizer::return_solution() { } // Per-component 1D endpoint optimization. + +void dxt1_endpoint_optimizer::compute_endpoint_component_errors(uint comp_index, uint64 (&error)[4][256], uint64 (&best_remaining_error)[4]) { + uint64 W[4] = {}, WP2[4] = {}, WPP[4] = {}; + for (uint i = 0; i < m_unique_colors.size(); i++) { + uint p = m_unique_colors[i].m_color[comp_index]; + uint w = m_unique_colors[i].m_weight; + uint8 s = m_best_solution.m_selectors[i]; + W[s] += (int64)w; + WP2[s] += (int64)w * p * 2; + WPP[s] += (int64)w * p * p; + } + const uint comp_limit = comp_index == 1 ? 64 : 32; + for (uint8 s = 0; s < 2; s++) { + uint64 best_error = error[s][0] = WPP[s]; + for (uint8 c = 1; c < comp_limit; c++) { + uint8 p = comp_index == 1 ? c << 2 | c >> 4 : c << 3 | c >> 2; + error[s][c] = W[s] * p * p - WP2[s] * p + WPP[s]; + if (error[s][c] < best_error) + best_error = error[s][c]; + } + best_remaining_error[s] = best_error; + } + for (uint8 s = 2; s < 4; s++) { + uint64 best_error = error[s][0] = WPP[s], d = W[s] - WP2[s], dd = W[s] << 1, e = WPP[s] + d; + for (uint p = 1; p < 256; p++, d += dd, e += d) { + error[s][p] = e; + if (e < best_error) + best_error = e; + } + best_remaining_error[s] = best_error; + } + for (uint8 s = 3; s; s--) + best_remaining_error[s - 1] += best_remaining_error[s]; +} + void dxt1_endpoint_optimizer::optimize_endpoint_comps() { compute_selectors(); - if ((m_best_solution.m_alpha_block) || (!m_best_solution.m_error)) + if (m_best_solution.m_alpha_block || !m_best_solution.m_error) return; - - color_quad_u8 orig_l_scaled(dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, true)); - color_quad_u8 orig_h_scaled(dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, true)); - - color_quad_u8 min_color(0xFF, 0xFF, 0xFF, 0xFF); - color_quad_u8 max_color(0, 0, 0, 0); - for (uint i = 0; i < m_unique_colors.size(); i++) { - min_color = color_quad_u8::component_min(min_color, m_unique_colors[i].m_color); - max_color = color_quad_u8::component_max(max_color, m_unique_colors[i].m_color); - } - - // Try to separately optimize each component. This is a 1D problem so it's easy to compute accurate per-component error bounds. - uint64 W[4] = {}, WD2[4] = {}, WDD[4] = {}; + color_quad_u8 source_low(dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, true)); + color_quad_u8 source_high(dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, true)); + uint64 error[4][256], best_remaining_error[4]; for (uint comp_index = 0; comp_index < 3; comp_index++) { - uint min_color_weight = 0; - uint max_color_weight = 0; - for (uint s = 0; s < 4; s++) - W[s] = WD2[s] = WDD[s] = 0; - for (uint i = 0; i < m_unique_colors.size(); i++) { - uint c = m_unique_colors[i].m_color[comp_index]; - uint w = m_unique_colors[i].m_weight; - uint8 s = m_best_solution.m_selectors[i]; - W[s] += (int64)w; - WD2[s] += (int64)w * c * 2; - WDD[s] += (int64)w * c * c; - if (c == min_color[comp_index]) - min_color_weight += w; - if (c == max_color[comp_index]) - max_color_weight += w; - } - - uint ll[4]; - ll[0] = orig_l_scaled[comp_index]; - ll[1] = orig_h_scaled[comp_index]; - ll[2] = (ll[0] * 2 + ll[1]) / 3; - ll[3] = (ll[0] + ll[1] * 2) / 3; - - uint64 error_to_beat = 0; - for (int s = 0; s < 4; s++) - error_to_beat += W[s] * ll[s] * ll[s] - WD2[s] * ll[s] + WDD[s]; - - if (!error_to_beat) + uint8 p0 = source_low[comp_index]; + uint8 p1 = source_high[comp_index]; + color_quad_u8 low(dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, false)); + color_quad_u8 high(dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, false)); + compute_endpoint_component_errors(comp_index, error, best_remaining_error); + uint64 best_error = error[0][low[comp_index]] + error[1][high[comp_index]] + error[2][(p0 * 2 + p1) / 3] + error[3][(p0 + p1 * 2) / 3]; + if (best_remaining_error[0] >= best_error) continue; - - CRNLIB_ASSERT((min_color_weight > 0) && (max_color_weight > 0)); - const uint error_to_beat_div_min_color_weight = min_color_weight ? ((error_to_beat + min_color_weight - 1) / min_color_weight) : 0; - const uint error_to_beat_div_max_color_weight = max_color_weight ? ((error_to_beat + max_color_weight - 1) / max_color_weight) : 0; - - const uint m = (comp_index == 1) ? 63 : 31; - const uint m_shift = (comp_index == 1) ? 3 : 2; - - for (uint o = 0; o <= m; o++) { - uint tl[4]; - - tl[0] = (comp_index == 1) ? ((o << 2) | (o >> 4)) : ((o << 3) | (o >> 2)); - - for (uint h = 0; h < 8; h++) { - const uint pl = h << m_shift; - const uint ph = ((h + 1) << m_shift) - 1; - - uint tl_l = (comp_index == 1) ? ((pl << 2) | (pl >> 4)) : ((pl << 3) | (pl >> 2)); - uint tl_h = (comp_index == 1) ? ((ph << 2) | (ph >> 4)) : ((ph << 3) | (ph >> 2)); - - tl_l = math::minimum(tl_l, tl[0]); - tl_h = math::maximum(tl_h, tl[0]); - - uint c_l = min_color[comp_index]; - uint c_h = max_color[comp_index]; - - if (c_h < tl_l) { - uint min_possible_error = math::square(tl_l - c_l); - if (min_possible_error > error_to_beat_div_min_color_weight) - continue; - } else if (c_l > tl_h) { - uint min_possible_error = math::square(c_h - tl_h); - if (min_possible_error > error_to_beat_div_max_color_weight) - continue; - } - - for (uint p = pl; p <= ph; p++) { - tl[1] = (comp_index == 1) ? ((p << 2) | (p >> 4)) : ((p << 3) | (p >> 2)); - - tl[2] = (tl[0] * 2 + tl[1]) / 3; - tl[3] = (tl[0] + tl[1] * 2) / 3; - - uint64 trial_error = 0; - for (int s = 0; s < 4; s++) - trial_error += W[s] * tl[s] * tl[s] - WD2[s] * tl[s] + WDD[s]; - - if (trial_error < error_to_beat) { - color_quad_u8 l(dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, false)); - color_quad_u8 h(dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, false)); - l[comp_index] = static_cast(o); - h[comp_index] = static_cast(p); - - if (evaluate_solution(dxt1_solution_coordinates(dxt1_block::pack_color(l, false), dxt1_block::pack_color(h, false)))) { - if (!m_best_solution.m_error) - return; - compute_selectors(); - for (uint s = 0; s < 4; s++) - W[s] = WD2[s] = WDD[s] = 0; - for (uint i = 0; i < m_unique_colors.size(); i++) { - uint c = m_unique_colors[i].m_color[comp_index]; - uint w = m_unique_colors[i].m_weight; - uint8 s = m_best_solution.m_selectors[i]; - W[s] += (int64)w; - WD2[s] += (int64)w * c * 2; - WDD[s] += (int64)w * c * c; - } - error_to_beat = 0; - for (int s = 0; s < 4; s++) - error_to_beat += W[s] * tl[s] * tl[s] - WD2[s] * tl[s] + WDD[s]; - } - } - } + const uint comp_limit = comp_index == 1 ? 64 : 32; + for (uint8 c0 = 0; c0 < comp_limit; c0++) { + uint64 e0 = error[0][c0]; + if (e0 + best_remaining_error[1] >= best_error) + continue; + low[comp_index] = c0; + uint16 packed_low = dxt1_block::pack_color(low, false); + p0 = comp_index == 1 ? c0 << 2 | c0 >> 4 : c0 << 3 | c0 >> 2; + for (uint8 c1 = 0; c1 < comp_limit; c1++) { + uint64 e = e0 + error[1][c1]; + if (e + best_remaining_error[2] >= best_error) + continue; + p1 = comp_index == 1 ? c1 << 2 | c1 >> 4 : c1 << 3 | c1 >> 2; + e += error[2][(p0 * 2 + p1) / 3]; + if (e + best_remaining_error[3] >= best_error) + continue; + e += error[3][(p0 + p1 * 2) / 3]; + if (e >= best_error) + continue; + high[comp_index] = c1; + if (!evaluate_solution(dxt1_solution_coordinates(packed_low, dxt1_block::pack_color(high, false)))) + continue; + if (!m_best_solution.m_error) + return; + compute_selectors(); + compute_endpoint_component_errors(comp_index, error, best_remaining_error); + best_error = error[0][c0] + error[1][c1] + error[2][(p0 * 2 + p1) / 3] + error[3][(p0 + p1 * 2) / 3]; + e0 = error[0][c0]; + if (e0 + best_remaining_error[1] >= best_error) + break; } } } diff --git a/crnlib/crn_dxt1.h b/crnlib/crn_dxt1.h index 2637d1b..7551fc0 100644 --- a/crnlib/crn_dxt1.h +++ b/crnlib/crn_dxt1.h @@ -259,6 +259,7 @@ class dxt1_endpoint_optimizer { void compute_vectors(const vec3F& perceptual_weights); void return_solution(); void try_combinatorial_encoding(); + void compute_endpoint_component_errors(uint comp_index, uint64 (&error)[4][256], uint64 (&best_remaining_error)[4]); void optimize_endpoint_comps(); void optimize_endpoints(vec3F& low_color, vec3F& high_color); bool try_alpha_as_black_optimization();