diff --git a/bin/crunch_x64.exe b/bin/crunch_x64.exe index 6a3e27d..8b7053d 100644 Binary files a/bin/crunch_x64.exe and b/bin/crunch_x64.exe differ diff --git a/crnlib/crn_dxt_hc.cpp b/crnlib/crn_dxt_hc.cpp index 62a974d..aea1f6a 100644 --- a/crnlib/crn_dxt_hc.cpp +++ b/crnlib/crn_dxt_hc.cpp @@ -530,7 +530,7 @@ void dxt_hc::determine_color_endpoint_codebook_task(uint64 data, void*) { uint b = blocks[i]; uint weight = (uint)(math::clamp(endpoint_weight * m_block_weights[b], 1, 2048) * encoding_weight[m_block_encodings[b]]); uint32 selector = 0; - for (uint sh = 0, p = 0; p < 16; p++, sh += 2) { + for (uint p = 0; p < 16; p++) { uint error_best = cUINT32_MAX; uint8 s_best = 0; for (uint8 t = 0; t < 4; t++) { @@ -541,9 +541,9 @@ void dxt_hc::determine_color_endpoint_codebook_task(uint64 data, void*) { error_best = error; } } - selector |= s_best << sh; + selector = selector << 2 | s_best; } - m_block_selectors[cColor][b] = selector | (uint64)weight << 32; + m_block_selectors[cColor][b] = (uint64)selector << 32 | weight; } dxt_endpoint_refiner::params refinerParams; @@ -609,7 +609,7 @@ void dxt_hc::determine_color_endpoint_codebook_task_etc(uint64 data, void*) { uint b = blocks[i]; uint weight = (uint)(math::clamp(0x8000 * endpoint_weight * m_block_weights[b] * (m_block_encodings[b] ? 0.972f : 1.0f), 1, 0xFFFF)); uint32 selector = 0; - for (uint sh = 0, p = 0; p < 8; p++, sh += 2) { + for (uint p = 0; p < 8; p++) { uint error_best = cUINT32_MAX; uint8 s_best = 0; for (uint8 s = 0; s < 4; s++) { @@ -619,9 +619,9 @@ void dxt_hc::determine_color_endpoint_codebook_task_etc(uint64 data, void*) { error_best = error; } } - selector |= s_best << sh; + selector = selector << 2 | s_best; } - m_block_selectors[cColor][b] = selector | (uint64)weight << 32; + m_block_selectors[cColor][b] = (uint64)selector << ((b & 1) ? 32 : 48) | weight; } } } @@ -663,13 +663,59 @@ void dxt_hc::determine_color_endpoint_clusters_task(uint64 data, void* pData_ptr } void dxt_hc::determine_color_endpoints() { - tree_clusterizer vq; + uint num_tasks = m_pTask_pool->get_num_threads() + 1; + crnlib::vector > endpoints; for (uint t = 0; t < m_tiles.size(); t++) { if (m_tiles[t].pixels.size()) - vq.add_training_vec(m_tiles[t].color_endpoint, (uint)(m_tiles[t].pixels.size() * m_tiles[t].weight)); + endpoints.push_back(std::make_pair(m_tiles[t].color_endpoint, (uint)(m_tiles[t].pixels.size() * m_tiles[t].weight))); } - vq.generate_codebook(math::minimum(m_num_tiles, m_params.m_color_endpoint_codebook_size), true, m_pTask_pool); + struct Node { + std::pair *p, *pEnd; + Node (std::pair* begin, std::pair* end) : p(begin), pEnd(end) {} + bool operator<(const Node& other) const { return *p > *other.p; } + static void sort_task(uint64 data, void* ptr) { std::sort(((Node*)ptr)->p, ((Node*)ptr)->pEnd); } + }; + + crnlib::vector nodes; + Node node(0, endpoints.get_ptr()); + for (uint i = 0; i < num_tasks; i++) { + node.p = node.pEnd; + node.pEnd = endpoints.get_ptr() + endpoints.size() * (i + 1) / num_tasks; + if (node.p != node.pEnd) + nodes.push_back(node); + } + + for (uint i = 0; i < nodes.size(); i++) + m_pTask_pool->queue_task(&Node::sort_task, i, &nodes[i]); + m_pTask_pool->join(); + + std::priority_queue queue; + for (uint i = 0; i < nodes.size(); i++) + queue.push(nodes[i]); + + crnlib::vector vectors; + crnlib::vector weights; + vectors.reserve(endpoints.size()); + weights.reserve(endpoints.size()); + while (queue.size()) { + Node node = queue.top(); + std::pair* endpoint = node.p++; + queue.pop(); + if (node.p != node.pEnd) + queue.push(node); + if (!vectors.size() || endpoint->first != vectors.back()) { + vectors.push_back(endpoint->first); + weights.push_back(endpoint->second); + } else if (weights.back() > UINT_MAX - endpoint->second) { + weights.back() = UINT_MAX; + } else { + weights.back() += endpoint->second; + } + } + + tree_clusterizer vq; + vq.generate_codebook(vectors.get_ptr(), weights.get_ptr(), vectors.size(), math::minimum(m_num_tiles, m_params.m_color_endpoint_codebook_size), true, m_pTask_pool); m_color_clusters.resize(vq.get_codebook_size()); for (uint i = 0; i <= m_pTask_pool->get_num_threads(); i++) @@ -757,7 +803,7 @@ void dxt_hc::determine_alpha_endpoint_codebook_task(uint64 data, void*) { uint b = blocks[i]; uint weight = encoding_weight[m_block_encodings[b]]; uint64 selector = 0; - for (uint sh = 0, p = 0; p < 16; p++, sh += 3) { + for (uint p = 0; p < 16; p++) { uint error_best = cUINT32_MAX; uint8 s_best = 0; for (uint8 t = 0; t < 8; t++) { @@ -769,9 +815,9 @@ void dxt_hc::determine_alpha_endpoint_codebook_task(uint64 data, void*) { error_best = error; } } - selector |= (uint64)s_best << sh; + selector = selector << 3 | s_best; } - m_block_selectors[cAlpha0 + a][b] = selector | (uint64)weight << 48; + m_block_selectors[cAlpha0 + a][b] = selector << 16 | weight; } } @@ -823,18 +869,64 @@ void dxt_hc::determine_alpha_endpoint_clusters_task(uint64 data, void* pData_ptr } void dxt_hc::determine_alpha_endpoints() { - tree_clusterizer vq; + uint num_tasks = m_pTask_pool->get_num_threads() + 1; + crnlib::vector > endpoints; for (uint a = 0; a < m_num_alpha_blocks; a++) { for (uint t = 0; t < m_tiles.size(); t++) { if (m_tiles[t].pixels.size()) - vq.add_training_vec(m_tiles[t].alpha_endpoints[a], m_tiles[t].pixels.size()); + endpoints.push_back(std::make_pair(m_tiles[t].alpha_endpoints[a], m_tiles[t].pixels.size())); } } - vq.generate_codebook(math::minimum(m_num_tiles, m_params.m_alpha_endpoint_codebook_size), false, m_pTask_pool); + struct Node { + std::pair *p, *pEnd; + Node (std::pair* begin, std::pair* end) : p(begin), pEnd(end) {} + bool operator<(const Node& other) const { return *p > *other.p; } + static void sort_task(uint64 data, void* ptr) { std::sort(((Node*)ptr)->p, ((Node*)ptr)->pEnd); } + }; + + crnlib::vector nodes; + Node node(0, endpoints.get_ptr()); + for (uint i = 0; i < num_tasks; i++) { + node.p = node.pEnd; + node.pEnd = endpoints.get_ptr() + endpoints.size() * (i + 1) / num_tasks; + if (node.p != node.pEnd) + nodes.push_back(node); + } + + for (uint i = 0; i < nodes.size(); i++) + m_pTask_pool->queue_task(&Node::sort_task, i, &nodes[i]); + m_pTask_pool->join(); + + std::priority_queue queue; + for (uint i = 0; i < nodes.size(); i++) + queue.push(nodes[i]); + + crnlib::vector vectors; + crnlib::vector weights; + vectors.reserve(endpoints.size()); + weights.reserve(endpoints.size()); + while (queue.size()) { + Node node = queue.top(); + std::pair* endpoint = node.p++; + queue.pop(); + if (node.p != node.pEnd) + queue.push(node); + if (!vectors.size() || endpoint->first != vectors.back()) { + vectors.push_back(endpoint->first); + weights.push_back(endpoint->second); + } else if (weights.back() > UINT_MAX - endpoint->second) { + weights.back() = UINT_MAX; + } else { + weights.back() += endpoint->second; + } + } + + tree_clusterizer vq; + vq.generate_codebook(vectors.get_ptr(), weights.get_ptr(), vectors.size(), math::minimum(m_num_tiles, m_params.m_alpha_endpoint_codebook_size), false, m_pTask_pool); m_alpha_clusters.resize(vq.get_codebook_size()); - for (uint i = 0; i <= m_pTask_pool->get_num_threads(); i++) + for (uint i = 0; i < num_tasks; i++) m_pTask_pool->queue_object_task(this, &dxt_hc::determine_alpha_endpoint_clusters_task, i, &vq); m_pTask_pool->join(); @@ -859,7 +951,7 @@ void dxt_hc::determine_alpha_endpoints() { } } - for (uint i = 0; i <= m_pTask_pool->get_num_threads(); i++) + for (uint i = 0; i < num_tasks; i++) m_pTask_pool->queue_object_task(this, &dxt_hc::determine_alpha_endpoint_codebook_task, i, NULL); m_pTask_pool->join(); } @@ -911,16 +1003,68 @@ void dxt_hc::create_color_selector_codebook_task(uint64 data, void* pData_ptr) { } } +struct SelectorNode { + uint64 *p, *pEnd; + SelectorNode (uint64* begin, uint64* end) : p(begin), pEnd(end) {} + bool operator<(const SelectorNode& other) const { return *p > *other.p; } + static void sort_task(uint64 data, void* ptr) { std::sort(((SelectorNode*)ptr)->p, ((SelectorNode*)ptr)->pEnd); } +}; + void dxt_hc::create_color_selector_codebook() { - tree_clusterizer selector_vq; - vec16F v; - for (uint n = m_has_etc_color_blocks ? m_num_blocks >> 1 : m_num_blocks, b = 0; b < n; b++) { - uint64 selector = m_has_etc_color_blocks ? m_block_selectors[cColor][b << 1] | m_block_selectors[cColor][b << 1 | 1] << 16 : m_block_selectors[cColor][b]; - for (uint8 p = 0; p < 16; p++, selector >>= 2) - v[p] = ((selector & 3) + 0.5f) * 0.25f; - selector_vq.add_training_vec(v, m_has_etc_color_blocks ? (selector & 0xFFFF) + (selector >> 16) : selector); + uint num_tasks = m_pTask_pool->get_num_threads() + 1; + crnlib::vector selectors(m_has_etc_color_blocks ? m_num_blocks >> 1 : m_num_blocks); + for (uint i = 0, b = 0, step = m_has_etc_color_blocks ? 2 : 1; b < m_num_blocks; b += step) + selectors[i++] = m_block_selectors[cColor][b] + (m_has_etc_color_blocks ? m_block_selectors[cColor][b + 1] : 0); + + crnlib::vector nodes; + SelectorNode node(0, selectors.get_ptr()); + for (uint i = 0; i < num_tasks; i++) { + node.p = node.pEnd; + node.pEnd = selectors.get_ptr() + selectors.size() * (i + 1) / num_tasks; + if (node.p != node.pEnd) + nodes.push_back(node); } - selector_vq.generate_codebook(m_params.m_color_selector_codebook_size, false, m_pTask_pool); + + for (uint i = 0; i < nodes.size(); i++) + m_pTask_pool->queue_task(&SelectorNode::sort_task, i, &nodes[i]); + m_pTask_pool->join(); + + std::priority_queue queue; + for (uint i = 0; i < nodes.size(); i++) + queue.push(nodes[i]); + + float v[4]; + for (uint s = 0; s < 4; s++) + v[s] = (s + 0.5f) * 0.25f; + + crnlib::vector vectors; + crnlib::vector weights; + vectors.reserve(selectors.size()); + weights.reserve(selectors.size()); + for (uint64 prev_selector = 0; queue.size();) { + SelectorNode node = queue.top(); + uint64 selector = *node.p++; + queue.pop(); + if (node.p != node.pEnd) + queue.push(node); + uint weight = (uint)selector; + selector >>= 32; + if (!vectors.size() || selector != prev_selector) { + prev_selector = selector; + vec16F vector; + for (uint p = 0; p < 16; p++, selector >>= 2) + vector[15 - p] = v[selector & 3]; + vectors.push_back(vector); + weights.push_back(weight); + } else if (weights.back() > UINT_MAX - weight) { + weights.back() = UINT_MAX; + } else { + weights.back() += weight; + } + } + + tree_clusterizer selector_vq; + selector_vq.generate_codebook(vectors.get_ptr(), weights.get_ptr(), vectors.size(), m_params.m_color_selector_codebook_size, false, m_pTask_pool); m_color_selectors.resize(selector_vq.get_codebook_size()); m_color_selectors_used.resize(selector_vq.get_codebook_size()); for (uint i = 0; i < selector_vq.get_codebook_size(); i++) { @@ -930,7 +1074,6 @@ void dxt_hc::create_color_selector_codebook() { m_color_selectors[i] |= (uint)(v[j] * 4.0f) << sh; } - uint num_tasks = m_pTask_pool->get_num_threads() + 1; crnlib::vector > selector_details(num_tasks); for (uint t = 0; t < num_tasks; t++) { selector_details[t].resize(m_color_selectors.size()); @@ -1024,17 +1167,62 @@ void dxt_hc::create_alpha_selector_codebook_task(uint64 data, void* pData_ptr) { } void dxt_hc::create_alpha_selector_codebook() { - tree_clusterizer selector_vq; - vec16F v; - for (uint c = cAlpha0; c < cAlpha0 + m_num_alpha_blocks; c++) { - for (uint b = 0; b < m_num_blocks; b += m_has_etc_color_blocks ? 2 : 1) { - uint64 selector = m_block_selectors[c][b]; - for (uint8 p = 0; p < 16; p++, selector >>= 3) - v[p] = ((selector & 7) + 0.5f) * 0.125f; - selector_vq.add_training_vec(v, selector); + uint num_tasks = m_pTask_pool->get_num_threads() + 1; + crnlib::vector selectors(m_num_alpha_blocks * (m_has_etc_color_blocks ? m_num_blocks >> 1 : m_num_blocks)); + for (uint i = 0, c = cAlpha0; c < cAlpha0 + m_num_alpha_blocks; c++) { + for (uint b = 0, step = m_has_etc_color_blocks ? 2 : 1; b < m_num_blocks; b += step) + selectors[i++] = m_block_selectors[c][b]; + } + + crnlib::vector nodes; + SelectorNode node(0, selectors.get_ptr()); + for (uint i = 0; i < num_tasks; i++) { + node.p = node.pEnd; + node.pEnd = selectors.get_ptr() + selectors.size() * (i + 1) / num_tasks; + if (node.p != node.pEnd) + nodes.push_back(node); + } + + for (uint i = 0; i < nodes.size(); i++) + m_pTask_pool->queue_task(&SelectorNode::sort_task, i, &nodes[i]); + m_pTask_pool->join(); + + std::priority_queue queue; + for (uint i = 0; i < nodes.size(); i++) + queue.push(nodes[i]); + + float v[8]; + for (uint s = 0; s < 8; s++) + v[s] = (s + 0.5f) * 0.125f; + + crnlib::vector vectors; + crnlib::vector weights; + vectors.reserve(selectors.size()); + weights.reserve(selectors.size()); + for (uint64 prev_selector = 0; queue.size();) { + SelectorNode node = queue.top(); + uint64 selector = *node.p++; + queue.pop(); + if (node.p != node.pEnd) + queue.push(node); + uint weight = (uint16)selector; + selector >>= 16; + if (!vectors.size() || selector != prev_selector) { + prev_selector = selector; + vec16F vector; + for (uint p = 0; p < 16; p++, selector >>= 3) + vector[15 - p] = v[selector & 7]; + vectors.push_back(vector); + weights.push_back(weight); + } else if (weights.back() > UINT_MAX - weight) { + weights.back() = UINT_MAX; + } else { + weights.back() += weight; } } - selector_vq.generate_codebook(m_params.m_alpha_selector_codebook_size, false, m_pTask_pool); + + tree_clusterizer selector_vq; + selector_vq.generate_codebook(vectors.get_ptr(), weights.get_ptr(), vectors.size(), m_params.m_alpha_selector_codebook_size, false, m_pTask_pool); m_alpha_selectors.resize(selector_vq.get_codebook_size()); m_alpha_selectors_used.resize(selector_vq.get_codebook_size()); for (uint i = 0; i < selector_vq.get_codebook_size(); i++) { @@ -1044,7 +1232,6 @@ void dxt_hc::create_alpha_selector_codebook() { m_alpha_selectors[i] |= (uint64)(v[j] * 8.0f) << sh; } - uint num_tasks = m_pTask_pool->get_num_threads() + 1; crnlib::vector > selector_details(num_tasks); for (uint t = 0; t < num_tasks; t++) { selector_details[t].resize(m_alpha_selectors.size()); diff --git a/crnlib/crn_tree_clusterizer.h b/crnlib/crn_tree_clusterizer.h index 113b52c..d6785c3 100644 --- a/crnlib/crn_tree_clusterizer.h +++ b/crnlib/crn_tree_clusterizer.h @@ -25,14 +25,10 @@ class tree_clusterizer { } }; - void add_training_vec(const VectorType& v, uint weight) { - m_hist.push_back(std::make_pair(v, weight)); - } - struct split_alternative_node_task_params { uint main_node; uint alternative_node; - uint max_size; + uint max_splits; }; void split_alternative_node_task(uint64, void* pData_ptr) { @@ -45,7 +41,7 @@ class tree_clusterizer { end_node++; splits++; - while (splits < pParams->max_size && split_node(node_queue, end_node)) + while (splits < pParams->max_splits && split_node(node_queue, end_node)) splits++; m_nodes[pParams->main_node] = m_nodes[pParams->alternative_node]; @@ -53,105 +49,73 @@ class tree_clusterizer { } - bool generate_codebook(uint max_size, bool generate_node_index_map = false, task_pool* pTask_pool = 0) { - if (m_hist.empty()) - return false; - - double ttsum = 0.0f; - - m_vectors.reserve(m_hist.size()); - m_vectorsInfo.reserve(m_hist.size()); - - std::sort(m_hist.begin(), m_hist.end()); - for (uint i = 0; i < m_hist.size(); i++) { - if (!i || m_hist[i].first != m_hist[i - 1].first) { - VectorInfo vectorInfo; - vectorInfo.index = m_vectors.size(); - vectorInfo.weight = m_hist[i].second; - m_vectorsInfo.push_back(vectorInfo); - m_vectors.push_back(m_hist[i].first); - } else if (m_vectorsInfo.back().weight > UINT_MAX - m_hist[i].second) { - m_vectorsInfo.back().weight = UINT_MAX; - } else { - m_vectorsInfo.back().weight += m_hist[i].second; - } - } - - m_weightedVectors.resize(m_vectors.size()); - m_weightedDotProducts.resize(m_vectors.size()); - m_vectorsInfoLeft.resize(m_vectors.size()); - m_vectorsInfoRight.resize(m_vectors.size()); - m_vectorComparison.resize(m_vectors.size()); + void generate_codebook(VectorType* vectors, uint* weights, uint size, uint max_splits, bool generate_node_index_map = false, task_pool* pTask_pool = 0) { + m_vectors = vectors; + m_vectorsInfo.resize(size); + m_weightedVectors.resize(size); + m_weightedDotProducts.resize(size); + m_vectorsInfoLeft.resize(size); + m_vectorsInfoRight.resize(size); + m_vectorComparison.resize(size); + m_nodes.resize(max_splits << 2); + m_codebook.clear(); + uint num_tasks = pTask_pool ? pTask_pool->get_num_threads() + 1 : 1; vq_node root; root.m_begin = 0; - root.m_end = m_vectorsInfo.size(); - - for (uint i = 0; i < m_vectors.size(); i++) { - const VectorType& v = m_vectors[i]; - const uint weight = m_vectorsInfo[i].weight; + root.m_end = size; + double ttsum = 0.0f; + for (uint i = 0; i < m_vectorsInfo.size(); i++) { + const VectorType& v = vectors[i]; + m_vectorsInfo[i].index = i; + const uint weight = m_vectorsInfo[i].weight = weights[i]; m_weightedVectors[i] = v * (float)weight; root.m_centroid += m_weightedVectors[i]; root.m_total_weight += weight; m_weightedDotProducts[i] = v.dot(v) * weight; ttsum += m_weightedDotProducts[i]; } - root.m_variance = (float)(ttsum - (root.m_centroid.dot(root.m_centroid) / root.m_total_weight)); - root.m_centroid *= (1.0f / root.m_total_weight); - m_nodes.resize(max_size << 2); - std::priority_queue node_queue; uint begin_node = 0, end_node = begin_node, splits = 0; - m_nodes[end_node] = root; node_queue.push(NodeInfo(end_node, root.m_variance)); end_node++; splits++; - uint num_tasks = pTask_pool ? pTask_pool->get_num_threads() + 1 : 1; - if (num_tasks > 1) { - while (splits < max_size && node_queue.size() != num_tasks && split_node(node_queue, end_node, pTask_pool)) + while (splits < max_splits && node_queue.size() != num_tasks && split_node(node_queue, end_node, pTask_pool)) splits++; if (node_queue.size() == num_tasks) { std::priority_queue alternative_node_queue = node_queue; - uint alternative_node = max_size << 1, alternative_max_size = max_size / num_tasks; + uint alternative_node = max_splits << 1, alternative_max_splits = max_splits / num_tasks; crnlib::vector params(num_tasks); - for (uint task = 0; !alternative_node_queue.empty(); alternative_node_queue.pop(), alternative_node += alternative_max_size << 1, task++) { + for (uint task = 0; !alternative_node_queue.empty(); alternative_node_queue.pop(), alternative_node += alternative_max_splits << 1, task++) { params[task].main_node = alternative_node_queue.top().m_index; params[task].alternative_node = alternative_node; - params[task].max_size = alternative_max_size; + params[task].max_splits = alternative_max_splits; pTask_pool->queue_object_task(this, &tree_clusterizer::split_alternative_node_task, task, ¶ms[task]); } pTask_pool->join(); } } - while (splits < max_size && split_node(node_queue, end_node, pTask_pool)) + while (splits < max_splits && split_node(node_queue, end_node, pTask_pool)) splits++; - m_codebook.clear(); - for (uint i = begin_node; i < end_node; i++) { vq_node& node = m_nodes[i]; - if (!node.m_alternative && node.m_left != -1) { - CRNLIB_ASSERT(node.m_right != -1); + if (!node.m_alternative && node.m_left != -1) continue; - } - node.m_codebook_index = m_codebook.size(); m_codebook.push_back(node.m_centroid); - if (generate_node_index_map) { for (uint j = node.m_begin; j < node.m_end; j++) m_node_index_map.insert(std::make_pair(m_vectors[m_vectorsInfo[j].index], node.m_codebook_index)); } } - - return true; } inline uint get_node_index(const VectorType& v) { @@ -172,9 +136,7 @@ class tree_clusterizer { } private: - - crnlib::vector > m_hist; - crnlib::vector m_vectors; + VectorType* m_vectors; crnlib::vector m_weightedVectors; crnlib::vector m_weightedDotProducts; crnlib::vector m_vectorsInfo, m_vectorsInfoLeft, m_vectorsInfoRight;