Optimize selector codebook creation algorithm

This change significantly improves compression speed.

Explanation:
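When generating the selector codebook, pixel selectors can be processed in groups, and the intermediate error results for those groups can be precalculated. For example, the per-pixel color errors are combined into lookup tables that each cover a group of four pixels, so evaluating a candidate color selector takes 4 table lookups instead of 16.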

Testing:
The modified algorithm has been tested on the Kodak test set using a 64-bit build with default settings (running on Windows 10, i7-4790, 3.6 GHz). All decompressed test images are identical to those produced by compressing and decompressing with the original version of Crunch.

[Compressing Kodak set without mipmaps]
Original: 1582222 bytes / 28.865 sec
Modified: 1482780 bytes / 13.340 sec
Improvement: 6.28% (compression ratio) / 53.78% (compression time)

[Compressing Kodak set with mipmaps]
Original: 2065243 bytes / 36.988 sec
Modified: 1931586 bytes / 18.087 sec
Improvement: 6.47% (compression ratio) / 51.10% (compression time)
This commit is contained in:
Alexander Suvorov
2017-06-16 14:55:32 +02:00
parent eee6b26e5d
commit 39b85b74c2
2 changed files with 31 additions and 39 deletions
Binary file not shown.
+31 -39
View File
@@ -663,33 +663,28 @@ struct color_selector_details {
void dxt_hc::create_color_selector_codebook_task(uint64 data, void* pData_ptr) {
crnlib::vector<color_selector_details>& selector_details = *static_cast<crnlib::vector<color_selector_details>*>(pData_ptr);
uint num_tasks = m_pTask_pool->get_num_threads() + 1;
uint errors[16][4];
uint E2[16][4];
uint E4[8][16];
uint E8[4][256];
for (uint b = m_num_blocks * data / num_tasks, bEnd = m_num_blocks * (data + 1) / num_tasks; b < bEnd; b++) {
color_cluster& cluster = m_color_clusters[m_endpoint_indices[b].color];
color_quad_u8* endpoint_colors = cluster.color_values;
for (uint p = 0; p < 16; p++) {
for (uint s = 0; s < 4; s++)
errors[p][s] = color::color_distance(m_params.m_perceptual, m_blocks[b][p], endpoint_colors[s], false);
E2[p][s] = color::color_distance(m_params.m_perceptual, m_blocks[b][p], endpoint_colors[s], false);
}
for (uint p = 0; p < 8; p++) {
for (uint s = 0; s < 16; s++)
E4[p][s] = E2[p << 1][s & 3] + E2[p << 1 | 1][s >> 2];
}
for (uint p = 0; p < 4; p++) {
for (uint s = 0; s < 256; s++)
E8[p][s] = E4[p << 1][s & 15] + E4[p << 1 | 1][s >> 4];
}
uint best_index = 0;
for (uint best_error = cUINT32_MAX, s = 0; s < m_color_selectors.size(); s++) {
uint32 selector = m_color_selectors[s];
uint error = errors[0][selector & 3];
error += errors[ 1][(selector >> 2) & 3];
error += errors[ 2][(selector >> 4) & 3];
error += errors[ 3][(selector >> 6) & 3];
error += errors[ 4][(selector >> 8) & 3];
error += errors[ 5][(selector >> 10) & 3];
error += errors[ 6][(selector >> 12) & 3];
error += errors[ 7][(selector >> 14) & 3];
error += errors[ 8][(selector >> 16) & 3];
error += errors[ 9][(selector >> 18) & 3];
error += errors[10][(selector >> 20) & 3];
error += errors[11][(selector >> 22) & 3];
error += errors[12][(selector >> 24) & 3];
error += errors[13][(selector >> 26) & 3];
error += errors[14][(selector >> 28) & 3];
error += errors[15][(selector >> 30) & 3];
uint error = E8[0][selector & 255] + E8[1][selector >> 8 & 255] + E8[2][selector >> 16 & 255] + E8[3][selector >> 24 & 255];
if (error < best_error) {
best_error = error;
best_index = s;
@@ -698,7 +693,7 @@ void dxt_hc::create_color_selector_codebook_task(uint64 data, void* pData_ptr) {
uint (&total_errors)[16][4] = selector_details[best_index].error;
for (uint p = 0; p < 16; p++) {
for (uint s = 0; s < 4; s++)
total_errors[p][s] += errors[p][s];
total_errors[p][s] += E2[p][s];
}
selector_details[best_index].used = true;
m_selector_indices[b].color = best_index;
@@ -764,7 +759,8 @@ struct alpha_selector_details {
void dxt_hc::create_alpha_selector_codebook_task(uint64 data, void* pData_ptr) {
crnlib::vector<alpha_selector_details>& selector_details = *static_cast<crnlib::vector<alpha_selector_details>*>(pData_ptr);
uint num_tasks = m_pTask_pool->get_num_threads() + 1;
uint errors[16][8];
uint E3[16][8];
uint E6[8][64];
for (uint b = m_num_blocks * data / num_tasks, bEnd = m_num_blocks * (data + 1) / num_tasks; b < bEnd; b++) {
for (uint c = cAlpha0; c < cAlpha0 + m_num_alpha_blocks; c++) {
const uint alpha_pixel_comp = m_params.m_alpha_component_indices[c - cAlpha0];
@@ -773,28 +769,24 @@ void dxt_hc::create_alpha_selector_codebook_task(uint64 data, void* pData_ptr) {
for (uint p = 0; p < 16; p++) {
for (uint s = 0; s < 8; s++) {
int delta = m_blocks[b][p][alpha_pixel_comp] - block_values[s];
errors[p][s] = delta * delta;
E3[p][s] = delta * delta;
}
}
for (uint p = 0; p < 8; p++) {
for (uint s = 0; s < 64; s++)
E6[p][s] = E3[p << 1][s & 7] + E3[p << 1 | 1][s >> 3];
}
uint best_index = 0;
for (uint best_error = cUINT32_MAX, s = 0; s < m_alpha_selectors.size(); s++) {
uint64 selector = m_alpha_selectors[s];
uint error = errors[0][selector & 7];
error += errors[ 1][(selector >> 3) & 7];
error += errors[ 2][(selector >> 6) & 7];
error += errors[ 3][(selector >> 9) & 7];
error += errors[ 4][(selector >> 12) & 7];
error += errors[ 5][(selector >> 15) & 7];
error += errors[ 6][(selector >> 18) & 7];
error += errors[ 7][(selector >> 21) & 7];
error += errors[ 8][(selector >> 24) & 7];
error += errors[ 9][(selector >> 27) & 7];
error += errors[10][(selector >> 30) & 7];
error += errors[11][(selector >> 33) & 7];
error += errors[12][(selector >> 36) & 7];
error += errors[13][(selector >> 39) & 7];
error += errors[14][(selector >> 42) & 7];
error += errors[15][(selector >> 45) & 7];
uint error = E6[0][selector & 63];
error += E6[1][selector >> 6 & 63];
error += E6[2][selector >> 12 & 63];
error += E6[3][selector >> 18 & 63];
error += E6[4][selector >> 24 & 63];
error += E6[5][selector >> 30 & 63];
error += E6[6][selector >> 36 & 63];
error += E6[7][selector >> 42 & 63];
if (error < best_error) {
best_error = error;
best_index = s;
@@ -805,14 +797,14 @@ void dxt_hc::create_alpha_selector_codebook_task(uint64 data, void* pData_ptr) {
for (uint p = 0; p < 16; p++) {
for (uint s = 0; s < 8; s++) {
int delta = m_blocks[b][p][alpha_pixel_comp] - block_values[s];
errors[p][s] = delta * delta;
E3[p][s] = delta * delta;
}
}
}
uint (&total_errors)[16][8] = selector_details[best_index].error;
for (uint p = 0; p < 16; p++) {
for (uint s = 0; s < 8; s++)
total_errors[p][s] += errors[p][s];
total_errors[p][s] += E3[p][s];
}
selector_details[best_index].used = true;
m_selector_indices[b].component[c] = best_index;