Optimize DXT endpoints computation

This change improves the compression speed for DXT encoding.

Explanation:

When performing per-component endpoint optimization, trial solutions are generated from all possible combinations of the component values. An error bound is then computed for each block color of each trial solution in order to check whether an early out is possible. The important observation here is that the same component values are present in several trial solutions and are therefore processed multiple times. The overall performance can therefore be improved by computing and caching the errors for all possible component values in advance.

DXT Testing:

The modified algorithm has been tested on the Kodak test set using a 64-bit build with default settings (running on Windows 10, i7-4790, 3.6GHz). All the decompressed test images are identical to the images produced by compressing and decompressing with the original version of Crunch (revision ea9b8d8).

[Compressing Kodak set without mipmaps using DXT1 encoding]
Original: 1582222 bytes / 28.843 sec
Modified: 1468204 bytes / 6.067 sec
Improvement: 7.21% (compression ratio) / 78.97% (compression time)

[Compressing Kodak set with mipmaps using DXT1 encoding]
Original: 2065243 bytes / 36.983 sec
Modified: 1914805 bytes / 8.080 sec
Improvement: 7.28% (compression ratio) / 78.15% (compression time)

ETC Testing:

The modified algorithm has been tested on the Kodak test set using a 64-bit build with default settings (running on Windows 10, i7-4790, 3.6GHz). The ETC1 quantization parameters have been selected so that ETC1 compression gives approximately the same average Luma PSNR as the corresponding DXT1 compression (which is equal to 34.044 dB for the Kodak test set compressed without mipmaps using DXT1 encoding and default quality settings).

[Compressing Kodak set without mipmaps using ETC1 encoding]
Total size: 1607858 bytes
Total time: 13.421 sec
Average bitrate: 1.363 bpp
Average Luma PSNR: 34.050 dB
This commit is contained in:
Alexander Suvorov
2017-10-24 19:48:37 +02:00
parent dbbef6a21f
commit 7143913032
3 changed files with 77 additions and 114 deletions
Binary file not shown.
+76 -114
View File
@@ -295,125 +295,87 @@ void dxt1_endpoint_optimizer::return_solution() {
}
// Per-component 1D endpoint optimization.
// Precomputes, for one color component, the weighted squared error that every
// candidate value would produce for each of the four selectors of the current
// best solution, so that callers can look errors up instead of recomputing them.
//
// comp_index            Index of the color component to process. Component 1 is
//                       treated as having 64 quantized levels, any other
//                       component as having 32.
// error                 Output table. error[0] and error[1] are indexed by the
//                       quantized endpoint level (only the first 64 or 32
//                       entries are written); error[2] and error[3] are indexed
//                       by an 8-bit component value (all 256 entries written).
// best_remaining_error  Output. On return, entry s holds the sum of the minimum
//                       table errors of selectors s..3, i.e. a lower bound on
//                       the error still to be added from selector s onwards.
void dxt1_endpoint_optimizer::compute_endpoint_component_errors(uint comp_index, uint64 (&error)[4][256], uint64 (&best_remaining_error)[4]) {
  // Per-selector moments of the source values v with weights w:
  //   weight_sum = sum(w), weighted_value_x2 = 2 * sum(w * v), weighted_square = sum(w * v * v).
  // With these, sum(w * (v - p)^2) == weight_sum * p * p - weighted_value_x2 * p + weighted_square.
  uint64 weight_sum[4] = {}, weighted_value_x2[4] = {}, weighted_square[4] = {};
  for (uint color_index = 0; color_index < m_unique_colors.size(); color_index++) {
    const uint value = m_unique_colors[color_index].m_color[comp_index];
    const uint weight = m_unique_colors[color_index].m_weight;
    const uint8 selector = m_best_solution.m_selectors[color_index];
    weight_sum[selector] += (int64)weight;
    weighted_value_x2[selector] += (int64)weight * value * 2;
    weighted_square[selector] += (int64)weight * value * value;
  }

  // Selectors 0 and 1: one table entry per quantized level, evaluated at the
  // level expanded to 8 bits by bit replication.
  const bool wide_component = comp_index == 1;
  const uint level_count = wide_component ? 64 : 32;
  for (uint selector = 0; selector < 2; selector++) {
    uint64* const table = error[selector];
    // Level 0 expands to 0, where the quadratic reduces to its constant term.
    table[0] = weighted_square[selector];
    uint64 lowest = table[0];
    for (uint level = 1; level < level_count; level++) {
      const uint8 expanded = wide_component ? level << 2 | level >> 4 : level << 3 | level >> 2;
      // The unsigned intermediate may wrap; the final sum is the non-negative error.
      const uint64 level_error = weight_sum[selector] * expanded * expanded - weighted_value_x2[selector] * expanded + weighted_square[selector];
      table[level] = level_error;
      if (level_error < lowest)
        lowest = level_error;
    }
    best_remaining_error[selector] = lowest;
  }

  // Selectors 2 and 3: one table entry per 8-bit value. The same quadratic is
  // evaluated incrementally using its first and (constant) second differences.
  for (uint selector = 2; selector < 4; selector++) {
    uint64* const table = error[selector];
    const uint64 second_difference = weight_sum[selector] << 1;
    uint64 first_difference = weight_sum[selector] - weighted_value_x2[selector];
    uint64 running_error = weighted_square[selector];
    uint64 lowest = running_error;
    table[0] = running_error;
    for (uint value = 1; value < 256; value++) {
      running_error += first_difference;
      first_difference += second_difference;
      table[value] = running_error;
      if (running_error < lowest)
        lowest = running_error;
    }
    best_remaining_error[selector] = lowest;
  }

  // Turn the per-selector minima into suffix sums (selector s .. selector 3).
  best_remaining_error[2] += best_remaining_error[3];
  best_remaining_error[1] += best_remaining_error[2];
  best_remaining_error[0] += best_remaining_error[1];
}
void dxt1_endpoint_optimizer::optimize_endpoint_comps() {
compute_selectors();
if ((m_best_solution.m_alpha_block) || (!m_best_solution.m_error))
if (m_best_solution.m_alpha_block || !m_best_solution.m_error)
return;
color_quad_u8 orig_l_scaled(dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, true));
color_quad_u8 orig_h_scaled(dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, true));
color_quad_u8 min_color(0xFF, 0xFF, 0xFF, 0xFF);
color_quad_u8 max_color(0, 0, 0, 0);
for (uint i = 0; i < m_unique_colors.size(); i++) {
min_color = color_quad_u8::component_min(min_color, m_unique_colors[i].m_color);
max_color = color_quad_u8::component_max(max_color, m_unique_colors[i].m_color);
}
// Try to separately optimize each component. This is a 1D problem so it's easy to compute accurate per-component error bounds.
uint64 W[4] = {}, WD2[4] = {}, WDD[4] = {};
color_quad_u8 source_low(dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, true));
color_quad_u8 source_high(dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, true));
uint64 error[4][256], best_remaining_error[4];
for (uint comp_index = 0; comp_index < 3; comp_index++) {
uint min_color_weight = 0;
uint max_color_weight = 0;
for (uint s = 0; s < 4; s++)
W[s] = WD2[s] = WDD[s] = 0;
for (uint i = 0; i < m_unique_colors.size(); i++) {
uint c = m_unique_colors[i].m_color[comp_index];
uint w = m_unique_colors[i].m_weight;
uint8 s = m_best_solution.m_selectors[i];
W[s] += (int64)w;
WD2[s] += (int64)w * c * 2;
WDD[s] += (int64)w * c * c;
if (c == min_color[comp_index])
min_color_weight += w;
if (c == max_color[comp_index])
max_color_weight += w;
}
uint ll[4];
ll[0] = orig_l_scaled[comp_index];
ll[1] = orig_h_scaled[comp_index];
ll[2] = (ll[0] * 2 + ll[1]) / 3;
ll[3] = (ll[0] + ll[1] * 2) / 3;
uint64 error_to_beat = 0;
for (int s = 0; s < 4; s++)
error_to_beat += W[s] * ll[s] * ll[s] - WD2[s] * ll[s] + WDD[s];
if (!error_to_beat)
uint8 p0 = source_low[comp_index];
uint8 p1 = source_high[comp_index];
color_quad_u8 low(dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, false));
color_quad_u8 high(dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, false));
compute_endpoint_component_errors(comp_index, error, best_remaining_error);
uint64 best_error = error[0][low[comp_index]] + error[1][high[comp_index]] + error[2][(p0 * 2 + p1) / 3] + error[3][(p0 + p1 * 2) / 3];
if (best_remaining_error[0] >= best_error)
continue;
CRNLIB_ASSERT((min_color_weight > 0) && (max_color_weight > 0));
const uint error_to_beat_div_min_color_weight = min_color_weight ? ((error_to_beat + min_color_weight - 1) / min_color_weight) : 0;
const uint error_to_beat_div_max_color_weight = max_color_weight ? ((error_to_beat + max_color_weight - 1) / max_color_weight) : 0;
const uint m = (comp_index == 1) ? 63 : 31;
const uint m_shift = (comp_index == 1) ? 3 : 2;
for (uint o = 0; o <= m; o++) {
uint tl[4];
tl[0] = (comp_index == 1) ? ((o << 2) | (o >> 4)) : ((o << 3) | (o >> 2));
for (uint h = 0; h < 8; h++) {
const uint pl = h << m_shift;
const uint ph = ((h + 1) << m_shift) - 1;
uint tl_l = (comp_index == 1) ? ((pl << 2) | (pl >> 4)) : ((pl << 3) | (pl >> 2));
uint tl_h = (comp_index == 1) ? ((ph << 2) | (ph >> 4)) : ((ph << 3) | (ph >> 2));
tl_l = math::minimum(tl_l, tl[0]);
tl_h = math::maximum(tl_h, tl[0]);
uint c_l = min_color[comp_index];
uint c_h = max_color[comp_index];
if (c_h < tl_l) {
uint min_possible_error = math::square<int>(tl_l - c_l);
if (min_possible_error > error_to_beat_div_min_color_weight)
continue;
} else if (c_l > tl_h) {
uint min_possible_error = math::square<int>(c_h - tl_h);
if (min_possible_error > error_to_beat_div_max_color_weight)
continue;
}
for (uint p = pl; p <= ph; p++) {
tl[1] = (comp_index == 1) ? ((p << 2) | (p >> 4)) : ((p << 3) | (p >> 2));
tl[2] = (tl[0] * 2 + tl[1]) / 3;
tl[3] = (tl[0] + tl[1] * 2) / 3;
uint64 trial_error = 0;
for (int s = 0; s < 4; s++)
trial_error += W[s] * tl[s] * tl[s] - WD2[s] * tl[s] + WDD[s];
if (trial_error < error_to_beat) {
color_quad_u8 l(dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, false));
color_quad_u8 h(dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, false));
l[comp_index] = static_cast<uint8>(o);
h[comp_index] = static_cast<uint8>(p);
if (evaluate_solution(dxt1_solution_coordinates(dxt1_block::pack_color(l, false), dxt1_block::pack_color(h, false)))) {
if (!m_best_solution.m_error)
return;
compute_selectors();
for (uint s = 0; s < 4; s++)
W[s] = WD2[s] = WDD[s] = 0;
for (uint i = 0; i < m_unique_colors.size(); i++) {
uint c = m_unique_colors[i].m_color[comp_index];
uint w = m_unique_colors[i].m_weight;
uint8 s = m_best_solution.m_selectors[i];
W[s] += (int64)w;
WD2[s] += (int64)w * c * 2;
WDD[s] += (int64)w * c * c;
}
error_to_beat = 0;
for (int s = 0; s < 4; s++)
error_to_beat += W[s] * tl[s] * tl[s] - WD2[s] * tl[s] + WDD[s];
}
}
}
const uint comp_limit = comp_index == 1 ? 64 : 32;
for (uint8 c0 = 0; c0 < comp_limit; c0++) {
uint64 e0 = error[0][c0];
if (e0 + best_remaining_error[1] >= best_error)
continue;
low[comp_index] = c0;
uint16 packed_low = dxt1_block::pack_color(low, false);
p0 = comp_index == 1 ? c0 << 2 | c0 >> 4 : c0 << 3 | c0 >> 2;
for (uint8 c1 = 0; c1 < comp_limit; c1++) {
uint64 e = e0 + error[1][c1];
if (e + best_remaining_error[2] >= best_error)
continue;
p1 = comp_index == 1 ? c1 << 2 | c1 >> 4 : c1 << 3 | c1 >> 2;
e += error[2][(p0 * 2 + p1) / 3];
if (e + best_remaining_error[3] >= best_error)
continue;
e += error[3][(p0 + p1 * 2) / 3];
if (e >= best_error)
continue;
high[comp_index] = c1;
if (!evaluate_solution(dxt1_solution_coordinates(packed_low, dxt1_block::pack_color(high, false))))
continue;
if (!m_best_solution.m_error)
return;
compute_selectors();
compute_endpoint_component_errors(comp_index, error, best_remaining_error);
best_error = error[0][c0] + error[1][c1] + error[2][(p0 * 2 + p1) / 3] + error[3][(p0 + p1 * 2) / 3];
e0 = error[0][c0];
if (e0 + best_remaining_error[1] >= best_error)
break;
}
}
}
+1
View File
@@ -259,6 +259,7 @@ class dxt1_endpoint_optimizer {
void compute_vectors(const vec3F& perceptual_weights);
void return_solution();
void try_combinatorial_encoding();
void compute_endpoint_component_errors(uint comp_index, uint64 (&error)[4][256], uint64 (&best_remaining_error)[4]);
void optimize_endpoint_comps();
void optimize_endpoints(vec3F& low_color, vec3F& high_color);
bool try_alpha_as_black_optimization();