From 3d6ca050a8163e8dca2796892261b02f918beef7 Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Sat, 14 Dec 2024 14:33:27 +0100 Subject: [PATCH 01/12] Use templated is_less instead of runtime comparator This changes serves both immediate purposes and lays the groundwork for further changes. Previously the qsort implementation used a comparator construct that dispatched the comparison function through a tagged union, this incurs an additional branch on each comparison to check the tag `comp_type`. There are also structural changes to the quick_sort implementation to lay the groundwork for future changes to the partition and equal element handling logic. The comparator also did an equal idx comparison for each comparison. While this may sound like a nice optimization for expensive to compare types, a well constructed implementation will never compare an element to itself. In effect this change reduces the required comparison for integer sorting from 4 to 2 per comparison operation in the implementation. Performance impact, the tested types i32, u64, string and f128 improve by ~1.5x, and 1k by ~2.4x for the random pattern. Equal element patterns see an algorithmic degradation, but this will be addressed by a future change. Nice to have, the `[[clang::no_sanitize("function")]]` is no longer needed. --- libc/src/stdlib/heap_sort.h | 11 +- libc/src/stdlib/qsort.cpp | 8 +- libc/src/stdlib/qsort_data.h | 86 +++------ libc/src/stdlib/qsort_r.cpp | 9 +- libc/src/stdlib/qsort_util.h | 6 +- libc/src/stdlib/quick_sort.h | 98 ++++++----- libc/test/src/stdlib/CMakeLists.txt | 14 +- libc/test/src/stdlib/SortingTest.h | 163 +++++++----------- libc/test/src/stdlib/heap_sort_test.cpp | 19 +- libc/test/src/stdlib/qsort_r_test.cpp | 4 +- libc/test/src/stdlib/qsort_test.cpp | 19 +- libc/test/src/stdlib/quick_sort_test.cpp | 16 -- .../libc/test/src/stdlib/BUILD.bazel | 12 +- 13 files changed, 188 insertions(+), 277 deletions(-) delete mode 100644 libc/test/src/stdlib/quick_sort_test.cpp diff --git a/libc/src/stdlib/heap_sort.h b/libc/src/stdlib/heap_sort.h index ccb9ec5f82149..543c834c0197c 100644 --- a/libc/src/stdlib/heap_sort.h +++ b/libc/src/stdlib/heap_sort.h @@ -18,11 +18,11 @@ namespace internal { // A simple in-place heapsort implementation. // Follow the implementation in https://en.wikipedia.org/wiki/Heapsort. -LIBC_INLINE void heap_sort(const Array &array) { - size_t end = array.size(); +template void heap_sort(const Array &array, const F &is_less) { + size_t end = array.len(); size_t start = end / 2; - auto left_child = [](size_t i) -> size_t { return 2 * i + 1; }; + const auto left_child = [](size_t i) -> size_t { return 2 * i + 1; }; while (end > 1) { if (start > 0) { @@ -40,12 +40,11 @@ LIBC_INLINE void heap_sort(const Array &array) { while (left_child(root) < end) { size_t child = left_child(root); // If there are two children, set child to the greater. - if (child + 1 < end && - array.elem_compare(child, array.get(child + 1)) < 0) + if ((child + 1 < end) && is_less(array.get(child), array.get(child + 1))) ++child; // If the root is less than the greater child - if (array.elem_compare(root, array.get(child)) >= 0) + if (!is_less(array.get(root), array.get(child))) break; // Swap the root with the greater child and continue sifting down. diff --git a/libc/src/stdlib/qsort.cpp b/libc/src/stdlib/qsort.cpp index 65a63c239f5c0..ff55b1bd4455e 100644 --- a/libc/src/stdlib/qsort.cpp +++ b/libc/src/stdlib/qsort.cpp @@ -20,12 +20,14 @@ LLVM_LIBC_FUNCTION(void, qsort, int (*compare)(const void *, const void *))) { if (array == nullptr || array_size == 0 || elem_size == 0) return; - internal::Comparator c(compare); auto arr = internal::Array(reinterpret_cast(array), array_size, - elem_size, c); + elem_size); - internal::sort(arr); + internal::unstable_sort( + arr, [compare](const void *a, const void *b) noexcept -> bool { + return compare(a, b) < 0; + }); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/qsort_data.h b/libc/src/stdlib/qsort_data.h index c529d55ca46ff..f296c26db4fd6 100644 --- a/libc/src/stdlib/qsort_data.h +++ b/libc/src/stdlib/qsort_data.h @@ -17,60 +17,25 @@ namespace LIBC_NAMESPACE_DECL { namespace internal { -using Compare = int(const void *, const void *); -using CompareWithState = int(const void *, const void *, void *); - -enum class CompType { COMPARE, COMPARE_WITH_STATE }; - -struct Comparator { - union { - Compare *comp_func; - CompareWithState *comp_func_r; - }; - const CompType comp_type; - - void *arg; - - Comparator(Compare *func) - : comp_func(func), comp_type(CompType::COMPARE), arg(nullptr) {} - - Comparator(CompareWithState *func, void *arg_val) - : comp_func_r(func), comp_type(CompType::COMPARE_WITH_STATE), - arg(arg_val) {} - -#if defined(__clang__) - // Recent upstream changes to -fsanitize=function find more instances of - // function type mismatches. One case is with the comparator passed to this - // class. Libraries will tend to pass comparators that take pointers to - // varying types while this comparator expects to accept const void pointers. - // Ideally those tools would pass a function that strictly accepts const - // void*s to avoid UB, or would use qsort_r to pass their own comparator. - [[clang::no_sanitize("function")]] -#endif - int comp_vals(const void *a, const void *b) const { - if (comp_type == CompType::COMPARE) { - return comp_func(a, b); - } else { - return comp_func_r(a, b, arg); - } - } -}; - class Array { - uint8_t *array; - size_t array_size; + uint8_t *array_base; + size_t array_len; size_t elem_size; - Comparator compare; + + uint8_t *get_internal(size_t i) const { return array_base + (i * elem_size); } public: - Array(uint8_t *a, size_t s, size_t e, Comparator c) - : array(a), array_size(s), elem_size(e), compare(c) {} + Array(uint8_t *a, size_t s, size_t e) + : array_base(a), array_len(s), elem_size(e) {} - uint8_t *get(size_t i) const { return array + i * elem_size; } + inline void *get(size_t i) const { + return reinterpret_cast(get_internal(i)); + } void swap(size_t i, size_t j) const { - uint8_t *elem_i = get(i); - uint8_t *elem_j = get(j); + uint8_t *elem_i = get_internal(i); + uint8_t *elem_j = get_internal(j); + for (size_t b = 0; b < elem_size; ++b) { uint8_t temp = elem_i[b]; elem_i[b] = elem_j[b]; @@ -78,30 +43,21 @@ class Array { } } - int elem_compare(size_t i, const uint8_t *other) const { - // An element must compare equal to itself so we don't need to consult the - // user provided comparator. - if (get(i) == other) - return 0; - return compare.comp_vals(get(i), other); - } - - size_t size() const { return array_size; } + size_t len() const { return array_len; } - // Make an Array starting at index |i| and size |s|. - LIBC_INLINE Array make_array(size_t i, size_t s) const { - return Array(get(i), s, elem_size, compare); + // Make an Array starting at index |i| and length |s|. + inline Array make_array(size_t i, size_t s) const { + return Array(get_internal(i), s, elem_size); } - // Reset this Array to point at a different interval of the same items. - LIBC_INLINE void reset_bounds(uint8_t *a, size_t s) { - array = a; - array_size = s; + // Reset this Array to point at a different interval of the same + // items starting at index |i|. + inline void reset_bounds(size_t i, size_t s) { + array_base = get_internal(i); + array_len = s; } }; -using SortingRoutine = void(const Array &); - } // namespace internal } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/qsort_r.cpp b/libc/src/stdlib/qsort_r.cpp index bf61a40e84734..530e3f5f58e19 100644 --- a/libc/src/stdlib/qsort_r.cpp +++ b/libc/src/stdlib/qsort_r.cpp @@ -21,11 +21,14 @@ LLVM_LIBC_FUNCTION(void, qsort_r, void *arg)) { if (array == nullptr || array_size == 0 || elem_size == 0) return; - internal::Comparator c(compare, arg); + auto arr = internal::Array(reinterpret_cast(array), array_size, - elem_size, c); + elem_size); - internal::sort(arr); + internal::unstable_sort( + arr, [compare, arg](const void *a, const void *b) noexcept -> bool { + return compare(a, b, arg) < 0; + }); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/qsort_util.h b/libc/src/stdlib/qsort_util.h index d42adde06d976..0faf6e1afdec0 100644 --- a/libc/src/stdlib/qsort_util.h +++ b/libc/src/stdlib/qsort_util.h @@ -27,11 +27,13 @@ namespace LIBC_NAMESPACE_DECL { namespace internal { +template void unstable_sort(Array array, const F &is_less) { #if LIBC_QSORT_IMPL == LIBC_QSORT_QUICK_SORT -constexpr auto sort = quick_sort; + quick_sort(array, is_less); #elif LIBC_QSORT_IMPL == LIBC_QSORT_HEAP_SORT -constexpr auto sort = heap_sort; + heap_sort(array, is_less); #endif +} } // namespace internal } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/quick_sort.h b/libc/src/stdlib/quick_sort.h index 82b90a7d511d9..6972c233dde25 100644 --- a/libc/src/stdlib/quick_sort.h +++ b/libc/src/stdlib/quick_sort.h @@ -19,70 +19,78 @@ namespace LIBC_NAMESPACE_DECL { namespace internal { // A simple quicksort implementation using the Hoare partition scheme. -LIBC_INLINE size_t partition(const Array &array) { - const size_t array_size = array.size(); - size_t pivot_index = array_size / 2; - uint8_t *pivot = array.get(pivot_index); - size_t i = 0; - size_t j = array_size - 1; +template +size_t partition_hoare(const Array &array, const void *pivot, + const F &is_less) { + const size_t array_len = array.len(); + + size_t left = 0; + size_t right = array_len; while (true) { - int compare_i, compare_j; - - while ((compare_i = array.elem_compare(i, pivot)) < 0) - ++i; - while ((compare_j = array.elem_compare(j, pivot)) > 0) - --j; - - // At some point i will crossover j so we will definitely break out of - // this while loop. - if (i >= j) - return j + 1; - - array.swap(i, j); - - // The pivot itself might have got swapped so we will update the pivot. - if (i == pivot_index) { - pivot = array.get(j); - pivot_index = j; - } else if (j == pivot_index) { - pivot = array.get(i); - pivot_index = i; + while (left < right && is_less(array.get(left), pivot)) + ++left; + + while (true) { + --right; + if (left >= right || is_less(array.get(right), pivot)) { + break; + } } - if (compare_i == 0 && compare_j == 0) { - // If we do not move the pointers, we will end up with an - // infinite loop as i and j will be stuck without advancing. - ++i; - --j; - } + if (left >= right) + break; + + array.swap(left, right); + ++left; } + + return left; } -LIBC_INLINE void quick_sort(Array array) { +template +size_t partition(const Array &array, size_t pivot_index, const F &is_less) { + // Place the pivot at the beginning of the array. + array.swap(0, pivot_index); + + const Array array_without_pivot = array.make_array(1, array.len() - 1); + const void *pivot = array.get(0); + const size_t num_lt = partition_hoare(array_without_pivot, pivot, is_less); + + // Place the pivot between the two partitions. + array.swap(0, num_lt); + + return num_lt; +} + +template void quick_sort(Array &array, const F &is_less) { while (true) { - const size_t array_size = array.size(); - if (array_size <= 1) + const size_t array_len = array.len(); + if (array_len <= 1) return; - size_t split_index = partition(array); - if (array_size == 2) + + const size_t pivot_index = array_len / 2; + size_t split_index = partition(array, pivot_index, is_less); + + if (array_len == 2) // The partition operation sorts the two element array. return; - // Make Arrays describing the two sublists that still need sorting. + // Split the array into `left`, `pivot`, and `right`. Array left = array.make_array(0, split_index); - Array right = array.make_array(split_index, array.size() - split_index); + const size_t right_start = split_index + 1; + Array right = array.make_array(right_start, array.len() - right_start); // Recurse to sort the smaller of the two, and then loop round within this // function to sort the larger. This way, recursive call depth is bounded // by log2 of the total array size, because every recursive call is sorting // a list at most half the length of the one in its caller. - if (left.size() < right.size()) { - quick_sort(left); - array.reset_bounds(right.get(0), right.size()); + if (left.len() < right.len()) { + quick_sort(left, is_less); + array.reset_bounds(right_start, right.len()); } else { - quick_sort(right); - array.reset_bounds(left.get(0), left.size()); + quick_sort(right, is_less); + array.reset_bounds(0, left.len()); } } } diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt index ff6034e43e9f6..0044afed1d12c 100644 --- a/libc/test/src/stdlib/CMakeLists.txt +++ b/libc/test/src/stdlib/CMakeLists.txt @@ -300,18 +300,6 @@ add_libc_test( libc.src.stdlib.bsearch ) -add_libc_test( - quick_sort_test - SUITE - libc-stdlib-tests - SRCS - quick_sort_test.cpp - HDRS - SortingTest.h - DEPENDS - libc.src.stdlib.qsort_util -) - add_libc_test( heap_sort_test SUITE @@ -321,7 +309,7 @@ add_libc_test( HDRS SortingTest.h DEPENDS - libc.src.stdlib.qsort_util + libc.src.stdlib.qsort ) add_libc_test( diff --git a/libc/test/src/stdlib/SortingTest.h b/libc/test/src/stdlib/SortingTest.h index d34584e5addf0..cba16566d89e7 100644 --- a/libc/test/src/stdlib/SortingTest.h +++ b/libc/test/src/stdlib/SortingTest.h @@ -7,19 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/__support/macros/config.h" -#include "src/stdlib/qsort_data.h" +#include "src/stdlib/qsort.h" #include "test/UnitTest/Test.h" class SortingTest : public LIBC_NAMESPACE::testing::Test { - using Array = LIBC_NAMESPACE::internal::Array; - using Comparator = LIBC_NAMESPACE::internal::Comparator; - using SortingRoutine = LIBC_NAMESPACE::internal::SortingRoutine; + using SortFn = void (*)(void *array, size_t array_size, size_t elem_size, + int (*compare)(const void *, const void *)); -public: static int int_compare(const void *l, const void *r) { int li = *reinterpret_cast(l); int ri = *reinterpret_cast(r); + if (li == ri) return 0; else if (li > ri) @@ -28,16 +27,19 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { return -1; } - void test_sorted_array(SortingRoutine sort_func) { + static void int_sort(SortFn sort_fn, int *array, size_t array_len) { + sort_fn(reinterpret_cast(array), array_len, sizeof(int), + int_compare); + } + +public: + void test_sorted_array(SortFn sort_fn) { int array[25] = {10, 23, 33, 35, 55, 70, 71, 100, 110, 123, 133, 135, 155, 170, 171, 1100, 1110, 1123, 1133, 1135, 1155, 1170, 1171, 11100, 12310}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_fn, array, ARRAY_LEN); ASSERT_LE(array[0], 10); ASSERT_LE(array[1], 23); @@ -66,45 +68,36 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { ASSERT_LE(array[24], 12310); } - void test_reversed_sorted_array(SortingRoutine sort_func) { + void test_reversed_sorted_array(SortFn sort_fn) { int array[] = {25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + int_sort(sort_fn, array, ARRAY_LEN); - sort_func(arr); - - for (int i = 0; i < int(ARRAY_SIZE - 1); ++i) + for (int i = 0; i < int(ARRAY_LEN - 1); ++i) ASSERT_EQ(array[i], i + 1); } - void test_all_equal_elements(SortingRoutine sort_func) { + void test_all_equal_elements(SortFn sort_fn) { int array[] = {100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_fn, array, ARRAY_LEN); - for (size_t i = 0; i < ARRAY_SIZE; ++i) + for (size_t i = 0; i < ARRAY_LEN; ++i) ASSERT_EQ(array[i], 100); } - void test_unsorted_array_1(SortingRoutine sort_func) { + void test_unsorted_array_1(SortFn sort_fn) { int array[25] = {10, 23, 8, 35, 55, 45, 40, 100, 110, 123, 90, 80, 70, 60, 171, 11, 1, -1, -5, -10, 1155, 1170, 1171, 12, -100}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_fn, array, ARRAY_LEN); ASSERT_EQ(array[0], -100); ASSERT_EQ(array[1], -10); @@ -133,14 +126,11 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { ASSERT_EQ(array[24], 1171); } - void test_unsorted_array_2(SortingRoutine sort_func) { + void test_unsorted_array_2(SortFn sort_fn) { int array[7] = {10, 40, 45, 55, 35, 23, 60}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); - - sort_func(arr); + int_sort(sort_fn, array, ARRAY_LEN); ASSERT_EQ(array[0], 10); ASSERT_EQ(array[1], 23); @@ -151,14 +141,11 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { ASSERT_EQ(array[6], 60); } - void test_unsorted_array_duplicated_1(SortingRoutine sort_func) { + void test_unsorted_array_duplicated_1(SortFn sort_fn) { int array[6] = {10, 10, 20, 20, 5, 5}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_fn, array, ARRAY_LEN); ASSERT_EQ(array[0], 5); ASSERT_EQ(array[1], 5); @@ -168,14 +155,11 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { ASSERT_EQ(array[5], 20); } - void test_unsorted_array_duplicated_2(SortingRoutine sort_func) { + void test_unsorted_array_duplicated_2(SortFn sort_fn) { int array[10] = {20, 10, 10, 10, 10, 20, 21, 21, 21, 21}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); - - sort_func(arr); + int_sort(sort_fn, array, ARRAY_LEN); ASSERT_EQ(array[0], 10); ASSERT_EQ(array[1], 10); @@ -189,14 +173,11 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { ASSERT_EQ(array[9], 21); } - void test_unsorted_array_duplicated_3(SortingRoutine sort_func) { + void test_unsorted_array_duplicated_3(SortFn sort_fn) { int array[10] = {20, 30, 30, 30, 30, 20, 21, 21, 21, 21}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_fn, array, ARRAY_LEN); ASSERT_EQ(array[0], 20); ASSERT_EQ(array[1], 20); @@ -210,117 +191,93 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { ASSERT_EQ(array[9], 30); } - void test_unsorted_three_element_1(SortingRoutine sort_func) { + void test_unsorted_three_element_1(SortFn sort_fn) { int array[3] = {14999024, 0, 3}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_fn, array, ARRAY_LEN); ASSERT_EQ(array[0], 0); ASSERT_EQ(array[1], 3); ASSERT_EQ(array[2], 14999024); } - void test_unsorted_three_element_2(SortingRoutine sort_func) { + void test_unsorted_three_element_2(SortFn sort_fn) { int array[3] = {3, 14999024, 0}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); - - sort_func(arr); + int_sort(sort_fn, array, ARRAY_LEN); ASSERT_EQ(array[0], 0); ASSERT_EQ(array[1], 3); ASSERT_EQ(array[2], 14999024); } - void test_unsorted_three_element_3(SortingRoutine sort_func) { + void test_unsorted_three_element_3(SortFn sort_fn) { int array[3] = {3, 0, 14999024}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_fn, array, ARRAY_LEN); ASSERT_EQ(array[0], 0); ASSERT_EQ(array[1], 3); ASSERT_EQ(array[2], 14999024); } - void test_same_three_element(SortingRoutine sort_func) { + void test_same_three_element(SortFn sort_fn) { int array[3] = {12345, 12345, 12345}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); - - sort_func(arr); + int_sort(sort_fn, array, ARRAY_LEN); ASSERT_EQ(array[0], 12345); ASSERT_EQ(array[1], 12345); ASSERT_EQ(array[2], 12345); } - void test_unsorted_two_element_1(SortingRoutine sort_func) { + void test_unsorted_two_element_1(SortFn sort_fn) { int array[] = {14999024, 0}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_fn, array, ARRAY_LEN); ASSERT_EQ(array[0], 0); ASSERT_EQ(array[1], 14999024); } - void test_unsorted_two_element_2(SortingRoutine sort_func) { + void test_unsorted_two_element_2(SortFn sort_fn) { int array[] = {0, 14999024}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_fn, array, ARRAY_LEN); ASSERT_EQ(array[0], 0); ASSERT_EQ(array[1], 14999024); } - void test_same_two_element(SortingRoutine sort_func) { + void test_same_two_element(SortFn sort_fn) { int array[] = {12345, 12345}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); - - sort_func(arr); + int_sort(sort_fn, array, ARRAY_LEN); ASSERT_EQ(array[0], 12345); ASSERT_EQ(array[1], 12345); } - void test_single_element(SortingRoutine sort_func) { + void test_single_element(SortFn sort_fn) { int array[] = {12345}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_fn, array, ARRAY_LEN); ASSERT_EQ(array[0], 12345); } diff --git a/libc/test/src/stdlib/heap_sort_test.cpp b/libc/test/src/stdlib/heap_sort_test.cpp index d70e3dc2272be..7223d9b23f98a 100644 --- a/libc/test/src/stdlib/heap_sort_test.cpp +++ b/libc/test/src/stdlib/heap_sort_test.cpp @@ -7,10 +7,21 @@ //===----------------------------------------------------------------------===// #include "SortingTest.h" -#include "src/stdlib/heap_sort.h" +#include "src/stdlib/qsort_util.h" -void sort(const LIBC_NAMESPACE::internal::Array &array) { - LIBC_NAMESPACE::internal::heap_sort(array); +void heap_sort(void *array, size_t array_size, size_t elem_size, + int (*compare)(const void *, const void *)) { + + if (array == nullptr || array_size == 0 || elem_size == 0) + return; + + auto arr = LIBC_NAMESPACE::internal::Array(reinterpret_cast(array), + array_size, elem_size); + + LIBC_NAMESPACE::internal::heap_sort( + arr, [compare](const void *a, const void *b) noexcept -> bool { + return compare(a, b) < 0; + }); } -LIST_SORTING_TESTS(HeapSort, sort); +LIST_SORTING_TESTS(HeapSort, heap_sort); diff --git a/libc/test/src/stdlib/qsort_r_test.cpp b/libc/test/src/stdlib/qsort_r_test.cpp index 6893fdc7b74c8..f18923618ed5e 100644 --- a/libc/test/src/stdlib/qsort_r_test.cpp +++ b/libc/test/src/stdlib/qsort_r_test.cpp @@ -62,9 +62,9 @@ TEST(LlvmLibcQsortRTest, SortedArray) { ASSERT_LE(array[23], 11100); ASSERT_LE(array[24], 12310); - // This is a sorted list, but there still have to have been at least N + // This is a sorted list, but there still have to have been at least N - 1 // comparisons made. - ASSERT_GE(count, ARRAY_SIZE); + ASSERT_GE(count, ARRAY_SIZE - 1); } TEST(LlvmLibcQsortRTest, ReverseSortedArray) { diff --git a/libc/test/src/stdlib/qsort_test.cpp b/libc/test/src/stdlib/qsort_test.cpp index 1e921a86fd1fd..e89a092d37ccc 100644 --- a/libc/test/src/stdlib/qsort_test.cpp +++ b/libc/test/src/stdlib/qsort_test.cpp @@ -7,11 +7,20 @@ //===----------------------------------------------------------------------===// #include "SortingTest.h" -#include "src/stdlib/qsort.h" +#include "src/stdlib/qsort_util.h" -void sort(const LIBC_NAMESPACE::internal::Array &array) { - LIBC_NAMESPACE::qsort(reinterpret_cast(array.get(0)), array.size(), - sizeof(int), SortingTest::int_compare); +void quick_sort(void *array, size_t array_size, size_t elem_size, + int (*compare)(const void *, const void *)) { + if (array == nullptr || array_size == 0 || elem_size == 0) + return; + + auto arr = LIBC_NAMESPACE::internal::Array(reinterpret_cast(array), + array_size, elem_size); + + LIBC_NAMESPACE::internal::quick_sort( + arr, [compare](const void *a, const void *b) noexcept -> bool { + return compare(a, b) < 0; + }); } -LIST_SORTING_TESTS(Qsort, sort); +LIST_SORTING_TESTS(Qsort, quick_sort); diff --git a/libc/test/src/stdlib/quick_sort_test.cpp b/libc/test/src/stdlib/quick_sort_test.cpp deleted file mode 100644 index d6bf77ebfd40d..0000000000000 --- a/libc/test/src/stdlib/quick_sort_test.cpp +++ /dev/null @@ -1,16 +0,0 @@ -//===-- Unittests for quick sort ------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "SortingTest.h" -#include "src/stdlib/quick_sort.h" - -void sort(const LIBC_NAMESPACE::internal::Array &array) { - LIBC_NAMESPACE::internal::quick_sort(array); -} - -LIST_SORTING_TESTS(QuickSort, sort); diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel index e4b4b075705e8..a6dac1f82b9f9 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel @@ -130,21 +130,13 @@ libc_test( ], ) -libc_test( - name = "quick_sort_test", - srcs = ["quick_sort_test.cpp"], - deps = [ - ":qsort_test_helper", - "//libc:qsort_util", - ], -) - libc_test( name = "heap_sort_test", srcs = ["heap_sort_test.cpp"], + libc_function_deps = ["//libc:qsort"], deps = [ ":qsort_test_helper", - "//libc:qsort_util", + "//libc:types_size_t", ], ) From 5dfe9bb0af16b960d6ee5e4990a6fa86d4ee4cf2 Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Sat, 14 Dec 2024 15:33:11 +0100 Subject: [PATCH 02/12] Use higher quality pivot selection This commit ports the glidesort recursive median pivot selection. Without a dedicated small-sort this pivot selection is worse for cheap to compare types, but it is a precursor to optimally leverage pdqsort style equal element filtering. --- libc/src/stdlib/qsort_data.h | 17 ++++--- libc/src/stdlib/qsort_pivot.h | 93 +++++++++++++++++++++++++++++++++++ libc/src/stdlib/quick_sort.h | 6 +-- 3 files changed, 105 insertions(+), 11 deletions(-) create mode 100644 libc/src/stdlib/qsort_pivot.h diff --git a/libc/src/stdlib/qsort_data.h b/libc/src/stdlib/qsort_data.h index f296c26db4fd6..affff3da25cda 100644 --- a/libc/src/stdlib/qsort_data.h +++ b/libc/src/stdlib/qsort_data.h @@ -16,23 +16,24 @@ namespace LIBC_NAMESPACE_DECL { namespace internal { - class Array { uint8_t *array_base; size_t array_len; size_t elem_size; - uint8_t *get_internal(size_t i) const { return array_base + (i * elem_size); } + uint8_t *get_internal(size_t i) const noexcept { + return array_base + (i * elem_size); + } public: - Array(uint8_t *a, size_t s, size_t e) + Array(uint8_t *a, size_t s, size_t e) noexcept : array_base(a), array_len(s), elem_size(e) {} - inline void *get(size_t i) const { + inline void *get(size_t i) const noexcept { return reinterpret_cast(get_internal(i)); } - void swap(size_t i, size_t j) const { + void swap(size_t i, size_t j) const noexcept { uint8_t *elem_i = get_internal(i); uint8_t *elem_j = get_internal(j); @@ -43,16 +44,16 @@ class Array { } } - size_t len() const { return array_len; } + size_t len() const noexcept { return array_len; } // Make an Array starting at index |i| and length |s|. - inline Array make_array(size_t i, size_t s) const { + inline Array make_array(size_t i, size_t s) const noexcept { return Array(get_internal(i), s, elem_size); } // Reset this Array to point at a different interval of the same // items starting at index |i|. - inline void reset_bounds(size_t i, size_t s) { + inline void reset_bounds(size_t i, size_t s) noexcept { array_base = get_internal(i); array_len = s; } diff --git a/libc/src/stdlib/qsort_pivot.h b/libc/src/stdlib/qsort_pivot.h new file mode 100644 index 0000000000000..8411a324f737e --- /dev/null +++ b/libc/src/stdlib/qsort_pivot.h @@ -0,0 +1,93 @@ +//===-- Implementation header for qsort utilities ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H +#define LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H + +#include "qsort_data.h" + +#include + +namespace LIBC_NAMESPACE_DECL { +namespace internal { + +// Recursively select a pseudomedian if above this threshold. +constexpr size_t PSEUDO_MEDIAN_REC_THRESHOLD = 64; + +// Selects a pivot from `array`. Algorithm taken from glidesort by Orson Peters. +// +// This chooses a pivot by sampling an adaptive amount of points, approximating +// the quality of a median of sqrt(n) elements. +template +size_t choose_pivot(const Array &array, const F &is_less) { + const size_t len = array.len(); + + if (len < 8) { + return 0; + } + + const size_t len_div_8 = len / 8; + + const size_t a = 0; // [0, floor(n/8)) + const size_t b = len_div_8 * 4; // [4*floor(n/8), 5*floor(n/8)) + const size_t c = len_div_8 * 7; // [7*floor(n/8), 8*floor(n/8)) + + if (len < PSEUDO_MEDIAN_REC_THRESHOLD) { + return median3(array, a, b, c, is_less); + } else { + return median3_rec(array, a, b, c, len_div_8, is_less); + } +} + +// Calculates an approximate median of 3 elements from sections a, b, c, or +// recursively from an approximation of each, if they're large enough. By +// dividing the size of each section by 8 when recursing we have logarithmic +// recursion depth and overall sample from f(n) = 3*f(n/8) -> f(n) = +// O(n^(log(3)/log(8))) ~= O(n^0.528) elements. +template +size_t median3_rec(const Array &array, size_t a, size_t b, size_t c, size_t n, + const F &is_less) { + if (n * 8 >= PSEUDO_MEDIAN_REC_THRESHOLD) { + const size_t n8 = n / 8; + a = median3_rec(array, a, a + (n8 * 4), a + (n8 * 7), n8, is_less); + b = median3_rec(array, b, b + (n8 * 4), b + (n8 * 7), n8, is_less); + c = median3_rec(array, c, c + (n8 * 4), c + (n8 * 7), n8, is_less); + } + return median3(array, a, b, c, is_less); +} + +/// Calculates the median of 3 elements. +template +size_t median3(const Array &array, size_t a, size_t b, size_t c, + const F &is_less) { + const void *a_ptr = array.get(a); + const void *b_ptr = array.get(b); + const void *c_ptr = array.get(c); + + const bool x = is_less(a_ptr, b_ptr); + const bool y = is_less(a_ptr, c_ptr); + if (x == y) { + // If x=y=0 then b, c <= a. In this case we want to return max(b, c). + // If x=y=1 then a < b, c. In this case we want to return min(b, c). + // By toggling the outcome of b < c using XOR x we get this behavior. + const bool z = is_less(b_ptr, c_ptr); + if (z ^ x) { + return c; + } else { + return b; + } + } else { + // Either c <= a < b or b <= a < c, thus a is our median. + return a; + } +} + +} // namespace internal +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H diff --git a/libc/src/stdlib/quick_sort.h b/libc/src/stdlib/quick_sort.h index 6972c233dde25..a21826681ff2c 100644 --- a/libc/src/stdlib/quick_sort.h +++ b/libc/src/stdlib/quick_sort.h @@ -9,9 +9,9 @@ #ifndef LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H #define LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H -#include "src/__support/macros/attributes.h" +#include "src/__support/CPP/cstddef.h" #include "src/__support/macros/config.h" -#include "src/stdlib/qsort_data.h" +#include "src/stdlib/qsort_pivot.h" #include @@ -69,7 +69,7 @@ template void quick_sort(Array &array, const F &is_less) { if (array_len <= 1) return; - const size_t pivot_index = array_len / 2; + const size_t pivot_index = choose_pivot(array, is_less); size_t split_index = partition(array, pivot_index, is_less); if (array_len == 2) From ddf07964b9770513e58ddf5f5f20adc46774fa50 Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Sat, 14 Dec 2024 15:52:46 +0100 Subject: [PATCH 03/12] Use pdqsort style equal element filtering This commits the technique introduced by Orson Peters in pdqsort to efficiently handle inputs with repeated elements for example as found in zipfian or low cardinality distributions. The random_z1 pattern sees a ~3.5x improvement in performance for input length 10k across the different types. This change implies O(N x log(K)) for inputs with K distinct elements. --- libc/src/stdlib/quick_sort.h | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/libc/src/stdlib/quick_sort.h b/libc/src/stdlib/quick_sort.h index a21826681ff2c..91a8ba8c524c1 100644 --- a/libc/src/stdlib/quick_sort.h +++ b/libc/src/stdlib/quick_sort.h @@ -63,13 +63,36 @@ size_t partition(const Array &array, size_t pivot_index, const F &is_less) { return num_lt; } -template void quick_sort(Array &array, const F &is_less) { +template +void quick_sort_impl(Array &array, const void *ancestor_pivot, + const F &is_less) { while (true) { const size_t array_len = array.len(); if (array_len <= 1) return; const size_t pivot_index = choose_pivot(array, is_less); + + // If the chosen pivot is equal to the predecessor, then it's the smallest + // element in the slice. Partition the slice into elements equal to and + // elements greater than the pivot. This case is usually hit when the slice + // contains many duplicate elements. + if (ancestor_pivot) { + if (!is_less(ancestor_pivot, array.get(pivot_index))) { + const size_t num_lt = + partition(array, pivot_index, + [is_less](const void *a, const void *b) noexcept -> bool { + return !is_less(b, a); + }); + + // Continue sorting elements greater than the pivot. We know that + // `num_lt` cont + array.reset_bounds(num_lt + 1, array.len() - (num_lt + 1)); + ancestor_pivot = nullptr; + continue; + } + } + size_t split_index = partition(array, pivot_index, is_less); if (array_len == 2) @@ -78,6 +101,7 @@ template void quick_sort(Array &array, const F &is_less) { // Split the array into `left`, `pivot`, and `right`. Array left = array.make_array(0, split_index); + const void *pivot = array.get(split_index); const size_t right_start = split_index + 1; Array right = array.make_array(right_start, array.len() - right_start); @@ -86,15 +110,21 @@ template void quick_sort(Array &array, const F &is_less) { // by log2 of the total array size, because every recursive call is sorting // a list at most half the length of the one in its caller. if (left.len() < right.len()) { - quick_sort(left, is_less); + quick_sort_impl(left, ancestor_pivot, is_less); array.reset_bounds(right_start, right.len()); + ancestor_pivot = pivot; } else { - quick_sort(right, is_less); + quick_sort_impl(right, pivot, is_less); array.reset_bounds(0, left.len()); } } } +template void quick_sort(Array &array, const F &is_less) { + const void *ancestor_pivot = nullptr; + quick_sort_impl(array, ancestor_pivot, is_less); +} + } // namespace internal } // namespace LIBC_NAMESPACE_DECL From 9e05a1f959570f6bf0c1f023f0a8ca157e4af8ee Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Mon, 16 Dec 2024 17:57:53 +0100 Subject: [PATCH 04/12] Use heapsort fallback to guarantee O(N x log(N)) worst case performance The combination of a high quality pivot selection function with equal element filtering as added in the previous commit makes for quicksort that in practice hardly ever hits quadratic run-time scaling, but malicious inputs and accidents are possible. By limiting the recursion depth and switching to heapsort - introsort the second i in idisort - we can guarantee a worst case expected time to sort the data of O(N x log(N)). --- libc/src/stdlib/quick_sort.h | 39 +++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/libc/src/stdlib/quick_sort.h b/libc/src/stdlib/quick_sort.h index 91a8ba8c524c1..3442fd7aa0f76 100644 --- a/libc/src/stdlib/quick_sort.h +++ b/libc/src/stdlib/quick_sort.h @@ -10,6 +10,7 @@ #define LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H #include "src/__support/CPP/cstddef.h" +#include "src/__support/big_int.h" #include "src/__support/macros/config.h" #include "src/stdlib/qsort_pivot.h" @@ -64,13 +65,22 @@ size_t partition(const Array &array, size_t pivot_index, const F &is_less) { } template -void quick_sort_impl(Array &array, const void *ancestor_pivot, +void quick_sort_impl(Array &array, const void *ancestor_pivot, size_t limit, const F &is_less) { while (true) { const size_t array_len = array.len(); if (array_len <= 1) return; + // If too many bad pivot choices were made, simply fall back to + // heapsort in order to guarantee `O(N x log(N))` worst-case. + if (limit == 0) { + heap_sort(array, is_less); + return; + } + + limit -= 1; + const size_t pivot_index = choose_pivot(array, is_less); // If the chosen pivot is equal to the predecessor, then it's the smallest @@ -105,24 +115,25 @@ void quick_sort_impl(Array &array, const void *ancestor_pivot, const size_t right_start = split_index + 1; Array right = array.make_array(right_start, array.len() - right_start); - // Recurse to sort the smaller of the two, and then loop round within this - // function to sort the larger. This way, recursive call depth is bounded - // by log2 of the total array size, because every recursive call is sorting - // a list at most half the length of the one in its caller. - if (left.len() < right.len()) { - quick_sort_impl(left, ancestor_pivot, is_less); - array.reset_bounds(right_start, right.len()); - ancestor_pivot = pivot; - } else { - quick_sort_impl(right, pivot, is_less); - array.reset_bounds(0, left.len()); - } + // Recurse into the left side. We have a fixed recursion limit, + // testing shows no real benefit for recursing into the shorter + // side. + quick_sort_impl(left, ancestor_pivot, limit, is_less); + + // Continue with the right side. + array = right; + ancestor_pivot = pivot; } } +constexpr size_t ilog2(size_t n) { return cpp::bit_width(n) - 1; } + template void quick_sort(Array &array, const F &is_less) { const void *ancestor_pivot = nullptr; - quick_sort_impl(array, ancestor_pivot, is_less); + // Limit the number of imbalanced partitions to `2 * floor(log2(len))`. + // The binary OR by one is used to eliminate the zero-check in the logarithm. + const size_t limit = 2 * ilog2((array.len() | 1)); + quick_sort_impl(array, ancestor_pivot, limit, is_less); } } // namespace internal From 064678d7a6bec493671576c1e22a204bdbe6db9b Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Mon, 16 Dec 2024 20:51:18 +0100 Subject: [PATCH 05/12] Use fixed size array specialization By itself this optimization only nets a ~1.2x improvement for types like i32, u64 and f128, but it unlocks ~2x improvements for partitioning. The cost is a modest increase in binary-size of libc, `unstable_sort` is only instantiated twice, which raises the total from 2 to 8 instantiations in the library. But since the fixed size instantiations have much cheaper swap routines they only add ~1.5k each vs the ~4.5k generic instantiation. In effect binary size for qsort and qsort_r goes from ~9k to ~18k. --- libc/src/stdlib/heap_sort.h | 3 +- libc/src/stdlib/qsort.cpp | 9 +-- libc/src/stdlib/qsort_data.h | 98 +++++++++++++++++++++++-- libc/src/stdlib/qsort_pivot.h | 15 ++-- libc/src/stdlib/qsort_r.cpp | 8 +- libc/src/stdlib/qsort_util.h | 35 ++++++++- libc/src/stdlib/quick_sort.h | 21 +++--- libc/test/src/stdlib/heap_sort_test.cpp | 4 +- libc/test/src/stdlib/qsort_test.cpp | 11 +-- 9 files changed, 152 insertions(+), 52 deletions(-) diff --git a/libc/src/stdlib/heap_sort.h b/libc/src/stdlib/heap_sort.h index 543c834c0197c..de08984497782 100644 --- a/libc/src/stdlib/heap_sort.h +++ b/libc/src/stdlib/heap_sort.h @@ -18,7 +18,8 @@ namespace internal { // A simple in-place heapsort implementation. // Follow the implementation in https://en.wikipedia.org/wiki/Heapsort. -template void heap_sort(const Array &array, const F &is_less) { +template +void heap_sort(const A &array, const F &is_less) { size_t end = array.len(); size_t start = end / 2; diff --git a/libc/src/stdlib/qsort.cpp b/libc/src/stdlib/qsort.cpp index ff55b1bd4455e..1402f7963c389 100644 --- a/libc/src/stdlib/qsort.cpp +++ b/libc/src/stdlib/qsort.cpp @@ -18,14 +18,9 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(void, qsort, (void *array, size_t array_size, size_t elem_size, int (*compare)(const void *, const void *))) { - if (array == nullptr || array_size == 0 || elem_size == 0) - return; - - auto arr = internal::Array(reinterpret_cast(array), array_size, - elem_size); - internal::unstable_sort( - arr, [compare](const void *a, const void *b) noexcept -> bool { + array, array_size, elem_size, + [compare](const void *a, const void *b) noexcept -> bool { return compare(a, b) < 0; }); } diff --git a/libc/src/stdlib/qsort_data.h b/libc/src/stdlib/qsort_data.h index affff3da25cda..d06418050c186 100644 --- a/libc/src/stdlib/qsort_data.h +++ b/libc/src/stdlib/qsort_data.h @@ -11,12 +11,33 @@ #include "src/__support/CPP/cstddef.h" #include "src/__support/macros/config.h" +#include "src/string/memory_utils/inline_memcpy.h" +#include "src/string/memory_utils/inline_memmove.h" #include namespace LIBC_NAMESPACE_DECL { namespace internal { -class Array { +// Returns the max amount of bytes deemed reasonable - based on the target +// settings - for use in local stack arrays. +constexpr size_t max_stack_array_size() { + constexpr size_t ptr_diff_size = sizeof(ptrdiff_t); + + if constexpr (ptr_diff_size >= 8) { + return 4096; + } + + if constexpr (ptr_diff_size == 4) { + return 512; + } + + // 8-bit platforms are just not gonna work well with libc, qsort + // won't be the problem. + // 16-bit platforms ought to be able to store 64 bytes on the stack. + return 64; +} + +class ArrayGenericSize { uint8_t *array_base; size_t array_len; size_t elem_size; @@ -26,7 +47,7 @@ class Array { } public: - Array(uint8_t *a, size_t s, size_t e) noexcept + ArrayGenericSize(uint8_t *a, size_t s, size_t e) noexcept : array_base(a), array_len(s), elem_size(e) {} inline void *get(size_t i) const noexcept { @@ -34,21 +55,82 @@ class Array { } void swap(size_t i, size_t j) const noexcept { + // For sizes below this doing the extra function call is not + // worth it. + constexpr size_t MIN_MEMCPY_SIZE = 32; + + constexpr size_t STACK_ARRAY_SIZE = max_stack_array_size(); + alignas(32) uint8_t tmp[STACK_ARRAY_SIZE]; + uint8_t *elem_i = get_internal(i); uint8_t *elem_j = get_internal(j); - for (size_t b = 0; b < elem_size; ++b) { - uint8_t temp = elem_i[b]; - elem_i[b] = elem_j[b]; - elem_j[b] = temp; + if (elem_size >= MIN_MEMCPY_SIZE && elem_size <= STACK_ARRAY_SIZE) { + // Block copies are much more efficient, even if `elem_size` + // is unknown once `elem_size` passes a certain CPU specific + // threshold. + inline_memcpy(tmp, elem_i, elem_size); + inline_memmove(elem_i, elem_j, elem_size); + inline_memcpy(elem_j, tmp, elem_size); + } else { + for (size_t b = 0; b < elem_size; ++b) { + uint8_t temp = elem_i[b]; + elem_i[b] = elem_j[b]; + elem_j[b] = temp; + } } } size_t len() const noexcept { return array_len; } // Make an Array starting at index |i| and length |s|. - inline Array make_array(size_t i, size_t s) const noexcept { - return Array(get_internal(i), s, elem_size); + inline ArrayGenericSize make_array(size_t i, size_t s) const noexcept { + return ArrayGenericSize(get_internal(i), s, elem_size); + } + + // Reset this Array to point at a different interval of the same + // items starting at index |i|. + inline void reset_bounds(size_t i, size_t s) noexcept { + array_base = get_internal(i); + array_len = s; + } +}; + +// Having a specialized Array type for sorting that knowns at +// compile-time what the size of the element is, allows for much more +// efficient swapping and for cheaper offset calculations. +template class ArrayFixedSize { + uint8_t *array_base; + size_t array_len; + + uint8_t *get_internal(size_t i) const noexcept { + return array_base + (i * ELEM_SIZE); + } + +public: + ArrayFixedSize(uint8_t *a, size_t s) noexcept : array_base(a), array_len(s) {} + + inline void *get(size_t i) const noexcept { + return reinterpret_cast(get_internal(i)); + } + + void swap(size_t i, size_t j) const noexcept { + alignas(32) uint8_t tmp[ELEM_SIZE]; + + uint8_t *elem_i = get_internal(i); + uint8_t *elem_j = get_internal(j); + + inline_memcpy(tmp, elem_i, ELEM_SIZE); + inline_memmove(elem_i, elem_j, ELEM_SIZE); + inline_memcpy(elem_j, tmp, ELEM_SIZE); + } + + size_t len() const noexcept { return array_len; } + + // Make an Array starting at index |i| and length |s|. + inline ArrayFixedSize make_array(size_t i, + size_t s) const noexcept { + return ArrayFixedSize(get_internal(i), s); } // Reset this Array to point at a different interval of the same diff --git a/libc/src/stdlib/qsort_pivot.h b/libc/src/stdlib/qsort_pivot.h index 8411a324f737e..68716d01c47a0 100644 --- a/libc/src/stdlib/qsort_pivot.h +++ b/libc/src/stdlib/qsort_pivot.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H #define LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H -#include "qsort_data.h" +#include "src/stdlib/qsort_pivot.h" #include @@ -23,8 +23,8 @@ constexpr size_t PSEUDO_MEDIAN_REC_THRESHOLD = 64; // // This chooses a pivot by sampling an adaptive amount of points, approximating // the quality of a median of sqrt(n) elements. -template -size_t choose_pivot(const Array &array, const F &is_less) { +template +size_t choose_pivot(const A &array, const F &is_less) { const size_t len = array.len(); if (len < 8) { @@ -49,8 +49,8 @@ size_t choose_pivot(const Array &array, const F &is_less) { // dividing the size of each section by 8 when recursing we have logarithmic // recursion depth and overall sample from f(n) = 3*f(n/8) -> f(n) = // O(n^(log(3)/log(8))) ~= O(n^0.528) elements. -template -size_t median3_rec(const Array &array, size_t a, size_t b, size_t c, size_t n, +template +size_t median3_rec(const A &array, size_t a, size_t b, size_t c, size_t n, const F &is_less) { if (n * 8 >= PSEUDO_MEDIAN_REC_THRESHOLD) { const size_t n8 = n / 8; @@ -62,9 +62,8 @@ size_t median3_rec(const Array &array, size_t a, size_t b, size_t c, size_t n, } /// Calculates the median of 3 elements. -template -size_t median3(const Array &array, size_t a, size_t b, size_t c, - const F &is_less) { +template +size_t median3(const A &array, size_t a, size_t b, size_t c, const F &is_less) { const void *a_ptr = array.get(a); const void *b_ptr = array.get(b); const void *c_ptr = array.get(c); diff --git a/libc/src/stdlib/qsort_r.cpp b/libc/src/stdlib/qsort_r.cpp index 530e3f5f58e19..1d340560bcbe1 100644 --- a/libc/src/stdlib/qsort_r.cpp +++ b/libc/src/stdlib/qsort_r.cpp @@ -19,14 +19,10 @@ LLVM_LIBC_FUNCTION(void, qsort_r, (void *array, size_t array_size, size_t elem_size, int (*compare)(const void *, const void *, void *), void *arg)) { - if (array == nullptr || array_size == 0 || elem_size == 0) - return; - - auto arr = internal::Array(reinterpret_cast(array), array_size, - elem_size); internal::unstable_sort( - arr, [compare, arg](const void *a, const void *b) noexcept -> bool { + array, array_size, elem_size, + [compare, arg](const void *a, const void *b) noexcept -> bool { return compare(a, b, arg) < 0; }); } diff --git a/libc/src/stdlib/qsort_util.h b/libc/src/stdlib/qsort_util.h index 0faf6e1afdec0..ee3f737926f53 100644 --- a/libc/src/stdlib/qsort_util.h +++ b/libc/src/stdlib/qsort_util.h @@ -27,7 +27,7 @@ namespace LIBC_NAMESPACE_DECL { namespace internal { -template void unstable_sort(Array array, const F &is_less) { +template void sort_inst(A &array, const F &is_less) { #if LIBC_QSORT_IMPL == LIBC_QSORT_QUICK_SORT quick_sort(array, is_less); #elif LIBC_QSORT_IMPL == LIBC_QSORT_HEAP_SORT @@ -35,6 +35,39 @@ template void unstable_sort(Array array, const F &is_less) { #endif } +template +void unstable_sort(void *array, size_t array_len, size_t elem_size, + const F &is_less) { + if (array == nullptr || array_len == 0 || elem_size == 0) { + return; + } + + uint8_t *array_base = reinterpret_cast(array); + + switch (elem_size) { + case 4: { + auto arr_fixed_size = internal::ArrayFixedSize<4>(array_base, array_len); + sort_inst(arr_fixed_size, is_less); + return; + } + case 8: { + auto arr_fixed_size = internal::ArrayFixedSize<8>(array_base, array_len); + sort_inst(arr_fixed_size, is_less); + return; + } + case 16: { + auto arr_fixed_size = internal::ArrayFixedSize<16>(array_base, array_len); + sort_inst(arr_fixed_size, is_less); + return; + } + default: + auto arr_generic_size = + internal::ArrayGenericSize(array_base, array_len, elem_size); + sort_inst(arr_generic_size, is_less); + return; + } +} + } // namespace internal } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/quick_sort.h b/libc/src/stdlib/quick_sort.h index 3442fd7aa0f76..ace450d6cf805 100644 --- a/libc/src/stdlib/quick_sort.h +++ b/libc/src/stdlib/quick_sort.h @@ -20,9 +20,8 @@ namespace LIBC_NAMESPACE_DECL { namespace internal { // A simple quicksort implementation using the Hoare partition scheme. -template -size_t partition_hoare(const Array &array, const void *pivot, - const F &is_less) { +template +size_t partition_hoare(const A &array, const void *pivot, const F &is_less) { const size_t array_len = array.len(); size_t left = 0; @@ -49,12 +48,12 @@ size_t partition_hoare(const Array &array, const void *pivot, return left; } -template -size_t partition(const Array &array, size_t pivot_index, const F &is_less) { +template +size_t partition(const A &array, size_t pivot_index, const F &is_less) { // Place the pivot at the beginning of the array. array.swap(0, pivot_index); - const Array array_without_pivot = array.make_array(1, array.len() - 1); + const A array_without_pivot = array.make_array(1, array.len() - 1); const void *pivot = array.get(0); const size_t num_lt = partition_hoare(array_without_pivot, pivot, is_less); @@ -64,8 +63,8 @@ size_t partition(const Array &array, size_t pivot_index, const F &is_less) { return num_lt; } -template -void quick_sort_impl(Array &array, const void *ancestor_pivot, size_t limit, +template +void quick_sort_impl(A &array, const void *ancestor_pivot, size_t limit, const F &is_less) { while (true) { const size_t array_len = array.len(); @@ -110,10 +109,10 @@ void quick_sort_impl(Array &array, const void *ancestor_pivot, size_t limit, return; // Split the array into `left`, `pivot`, and `right`. - Array left = array.make_array(0, split_index); + A left = array.make_array(0, split_index); const void *pivot = array.get(split_index); const size_t right_start = split_index + 1; - Array right = array.make_array(right_start, array.len() - right_start); + A right = array.make_array(right_start, array.len() - right_start); // Recurse into the left side. We have a fixed recursion limit, // testing shows no real benefit for recursing into the shorter @@ -128,7 +127,7 @@ void quick_sort_impl(Array &array, const void *ancestor_pivot, size_t limit, constexpr size_t ilog2(size_t n) { return cpp::bit_width(n) - 1; } -template void quick_sort(Array &array, const F &is_less) { +template void quick_sort(A &array, const F &is_less) { const void *ancestor_pivot = nullptr; // Limit the number of imbalanced partitions to `2 * floor(log2(len))`. // The binary OR by one is used to eliminate the zero-check in the logarithm. diff --git a/libc/test/src/stdlib/heap_sort_test.cpp b/libc/test/src/stdlib/heap_sort_test.cpp index 7223d9b23f98a..43c8c69739615 100644 --- a/libc/test/src/stdlib/heap_sort_test.cpp +++ b/libc/test/src/stdlib/heap_sort_test.cpp @@ -15,8 +15,8 @@ void heap_sort(void *array, size_t array_size, size_t elem_size, if (array == nullptr || array_size == 0 || elem_size == 0) return; - auto arr = LIBC_NAMESPACE::internal::Array(reinterpret_cast(array), - array_size, elem_size); + auto arr = LIBC_NAMESPACE::internal::ArrayGenericSize( + reinterpret_cast(array), array_size, elem_size); LIBC_NAMESPACE::internal::heap_sort( arr, [compare](const void *a, const void *b) noexcept -> bool { diff --git a/libc/test/src/stdlib/qsort_test.cpp b/libc/test/src/stdlib/qsort_test.cpp index e89a092d37ccc..63cbb205c4e17 100644 --- a/libc/test/src/stdlib/qsort_test.cpp +++ b/libc/test/src/stdlib/qsort_test.cpp @@ -11,14 +11,9 @@ void quick_sort(void *array, size_t array_size, size_t elem_size, int (*compare)(const void *, const void *)) { - if (array == nullptr || array_size == 0 || elem_size == 0) - return; - - auto arr = LIBC_NAMESPACE::internal::Array(reinterpret_cast(array), - array_size, elem_size); - - LIBC_NAMESPACE::internal::quick_sort( - arr, [compare](const void *a, const void *b) noexcept -> bool { + LIBC_NAMESPACE::internal::unstable_sort( + array, array_size, elem_size, + [compare](const void *a, const void *b) noexcept -> bool { return compare(a, b) < 0; }); } From 702fa77112a620c45c795454fd1d756eb47aaf7e Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Mon, 16 Dec 2024 22:17:04 +0100 Subject: [PATCH 06/12] Use branchless lomuto partition when elem size is known at compile-time Uses a simplified version of the branchless partition scheme used in ipnsort. This achieves a ~2.5x perf improvement for i32, u64 and f128 for random patterns. This is mainly achieved by avoiding branch misprediction penalties. --- libc/src/stdlib/qsort_data.h | 12 +++++++++- libc/src/stdlib/quick_sort.h | 44 +++++++++++++++++++++++++++++++++--- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/libc/src/stdlib/qsort_data.h b/libc/src/stdlib/qsort_data.h index d06418050c186..d2487e36333a5 100644 --- a/libc/src/stdlib/qsort_data.h +++ b/libc/src/stdlib/qsort_data.h @@ -18,9 +18,12 @@ namespace LIBC_NAMESPACE_DECL { namespace internal { + // Returns the max amount of bytes deemed reasonable - based on the target -// settings - for use in local stack arrays. +// properties - for use in local stack arrays. constexpr size_t max_stack_array_size() { + // Uses target pointer size as heuristic how much memory is available and + // unlikely to run into stack overflow and perf problems. constexpr size_t ptr_diff_size = sizeof(ptrdiff_t); if constexpr (ptr_diff_size >= 8) { @@ -50,6 +53,8 @@ class ArrayGenericSize { ArrayGenericSize(uint8_t *a, size_t s, size_t e) noexcept : array_base(a), array_len(s), elem_size(e) {} + static constexpr bool has_fixed_size() { return false; } + inline void *get(size_t i) const noexcept { return reinterpret_cast(get_internal(i)); } @@ -110,6 +115,11 @@ template class ArrayFixedSize { public: ArrayFixedSize(uint8_t *a, size_t s) noexcept : array_base(a), array_len(s) {} + // Beware this function is used a heuristic for cheap to swap types, so + // instantiating `ArrayFixedSize` with `ELEM_SIZE > 100` is probably a bad + // idea perf wise. + static constexpr bool has_fixed_size() { return true; } + inline void *get(size_t i) const noexcept { return reinterpret_cast(get_internal(i)); } diff --git a/libc/src/stdlib/quick_sort.h b/libc/src/stdlib/quick_sort.h index ace450d6cf805..fddcd1a18282b 100644 --- a/libc/src/stdlib/quick_sort.h +++ b/libc/src/stdlib/quick_sort.h @@ -19,9 +19,38 @@ namespace LIBC_NAMESPACE_DECL { namespace internal { -// A simple quicksort implementation using the Hoare partition scheme. +// Branchless Lomuto partition based on the implementation by Lukas +// Bergdoll and Orson Peters +// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/lomcyc_partition/text.md. +// Simplified to avoid having to stack allocate. template -size_t partition_hoare(const A &array, const void *pivot, const F &is_less) { +size_t partition_lomuto_branchless(const A &array, const void *pivot, + const F &is_less) { + const size_t array_len = array.len(); + + size_t left = 0; + size_t right = 0; + + while (right < array_len) { + const bool right_is_lt = is_less(array.get(right), pivot); + array.swap(left, right); + left += static_cast(right_is_lt); + right += 1; + } + + return left; +} + +// Optimized for large types that are expensive to move. Not optimized +// for integers. It's possible to use a cyclic permutation here for +// large types as done in ipnsort but the advantages of this are limited +// as `is_less` is a small wrapper around a call to a function pointer +// and won't incur much binary-size overhead. The other reason to use +// cyclic permutation is to have more efficient swapping, but we don't +// know the element size so this isn't applicable here either. +template +size_t partition_hoare_branchy(const A &array, const void *pivot, + const F &is_less) { const size_t array_len = array.len(); size_t left = 0; @@ -55,7 +84,16 @@ size_t partition(const A &array, size_t pivot_index, const F &is_less) { const A array_without_pivot = array.make_array(1, array.len() - 1); const void *pivot = array.get(0); - const size_t num_lt = partition_hoare(array_without_pivot, pivot, is_less); + + size_t num_lt; + if constexpr (A::has_fixed_size()) { + // Branchless Lomuto avoid branch misprediction penalties, but + // it also swaps more often which only is faster if the swap a + // constant operation. + num_lt = partition_lomuto_branchless(array_without_pivot, pivot, is_less); + } else { + num_lt = partition_hoare_branchy(array_without_pivot, pivot, is_less); + } // Place the pivot between the two partitions. array.swap(0, num_lt); From 2b1660affad3ada309b2e99499cf888cab5e65b8 Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Tue, 17 Dec 2024 11:13:23 +0100 Subject: [PATCH 07/12] Use more generic dynamic element size swap impl The new impl is better for types < 100 bytes and worse for very large types like 1k, but still not bad, plus it has smaller binary-size footprint and avoids a surprising performance cliff past `STACK_ARRAY_SIZE` as well as heuristics. Also remove inline "hints" it's the wrong way to do it and the situation is simple enough that the compiler should now best and it's the wrong way to hint it anyway. --- libc/src/stdlib/qsort_data.h | 62 +++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 26 deletions(-) diff --git a/libc/src/stdlib/qsort_data.h b/libc/src/stdlib/qsort_data.h index d2487e36333a5..f7446ec2bb637 100644 --- a/libc/src/stdlib/qsort_data.h +++ b/libc/src/stdlib/qsort_data.h @@ -55,47 +55,58 @@ class ArrayGenericSize { static constexpr bool has_fixed_size() { return false; } - inline void *get(size_t i) const noexcept { + void *get(size_t i) const noexcept { return reinterpret_cast(get_internal(i)); } void swap(size_t i, size_t j) const noexcept { - // For sizes below this doing the extra function call is not - // worth it. - constexpr size_t MIN_MEMCPY_SIZE = 32; - - constexpr size_t STACK_ARRAY_SIZE = max_stack_array_size(); - alignas(32) uint8_t tmp[STACK_ARRAY_SIZE]; + // It's possible to use 8 byte blocks with `uint64_t`, but that + // generates more machine code as the remainder loop gets + // unrolled, plus 4 byte operations are more likely to be + // efficient on a wider variety of hardware. On x86 LLVM tends + // to unroll the block loop again into 2 16 byte swaps per + // iteration which is another reason that 4 byte blocks yields + // good performance even for big types. + using block_t = uint32_t; + constexpr size_t BLOCK_SIZE = sizeof(block_t); uint8_t *elem_i = get_internal(i); uint8_t *elem_j = get_internal(j); - if (elem_size >= MIN_MEMCPY_SIZE && elem_size <= STACK_ARRAY_SIZE) { - // Block copies are much more efficient, even if `elem_size` - // is unknown once `elem_size` passes a certain CPU specific - // threshold. - inline_memcpy(tmp, elem_i, elem_size); - inline_memmove(elem_i, elem_j, elem_size); - inline_memcpy(elem_j, tmp, elem_size); - } else { - for (size_t b = 0; b < elem_size; ++b) { - uint8_t temp = elem_i[b]; - elem_i[b] = elem_j[b]; - elem_j[b] = temp; - } + const size_t elem_size_rem = elem_size % BLOCK_SIZE; + const block_t *elem_i_block_end = + reinterpret_cast(elem_i + (elem_size - elem_size_rem)); + + block_t *elem_i_block = reinterpret_cast(elem_i); + block_t *elem_j_block = reinterpret_cast(elem_j); + + while (elem_i_block != elem_i_block_end) { + block_t tmp = *elem_i_block; + *elem_i_block = *elem_j_block; + *elem_j_block = tmp; + elem_i_block += 1; + elem_j_block += 1; + } + + elem_i = reinterpret_cast(elem_i_block); + elem_j = reinterpret_cast(elem_j_block); + for (size_t n = 0; n < elem_size_rem; ++n) { + uint8_t tmp = elem_i[n]; + elem_i[n] = elem_j[n]; + elem_j[n] = tmp; } } size_t len() const noexcept { return array_len; } // Make an Array starting at index |i| and length |s|. - inline ArrayGenericSize make_array(size_t i, size_t s) const noexcept { + ArrayGenericSize make_array(size_t i, size_t s) const noexcept { return ArrayGenericSize(get_internal(i), s, elem_size); } // Reset this Array to point at a different interval of the same // items starting at index |i|. - inline void reset_bounds(size_t i, size_t s) noexcept { + void reset_bounds(size_t i, size_t s) noexcept { array_base = get_internal(i); array_len = s; } @@ -120,7 +131,7 @@ template class ArrayFixedSize { // idea perf wise. static constexpr bool has_fixed_size() { return true; } - inline void *get(size_t i) const noexcept { + void *get(size_t i) const noexcept { return reinterpret_cast(get_internal(i)); } @@ -138,14 +149,13 @@ template class ArrayFixedSize { size_t len() const noexcept { return array_len; } // Make an Array starting at index |i| and length |s|. - inline ArrayFixedSize make_array(size_t i, - size_t s) const noexcept { + ArrayFixedSize make_array(size_t i, size_t s) const noexcept { return ArrayFixedSize(get_internal(i), s); } // Reset this Array to point at a different interval of the same // items starting at index |i|. - inline void reset_bounds(size_t i, size_t s) noexcept { + void reset_bounds(size_t i, size_t s) noexcept { array_base = get_internal(i); array_len = s; } From efb98579a7f56c721402d526086fa24a9cbc3110 Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Tue, 17 Dec 2024 12:04:17 +0100 Subject: [PATCH 08/12] Avoid unnecessary pivot swap for common recursion case The pivot selection will pick 0 as the pivot index if the array length is < 8, which is a common recursion leaf case. This isn't a big perf win more on the order 1-2% but's it's trivial and more or less free. --- libc/src/stdlib/quick_sort.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libc/src/stdlib/quick_sort.h b/libc/src/stdlib/quick_sort.h index fddcd1a18282b..13cce7ee5d86c 100644 --- a/libc/src/stdlib/quick_sort.h +++ b/libc/src/stdlib/quick_sort.h @@ -80,7 +80,9 @@ size_t partition_hoare_branchy(const A &array, const void *pivot, template size_t partition(const A &array, size_t pivot_index, const F &is_less) { // Place the pivot at the beginning of the array. - array.swap(0, pivot_index); + if (pivot_index != 0) { + array.swap(0, pivot_index); + } const A array_without_pivot = array.make_array(1, array.len() - 1); const void *pivot = array.get(0); From aecca523598ef05189835ab4dddf8aa000870c3a Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Thu, 19 Dec 2024 17:35:20 +0100 Subject: [PATCH 09/12] Apply review comments --- libc/src/stdlib/qsort.cpp | 9 +- libc/src/stdlib/qsort_data.h | 79 ++++------ libc/src/stdlib/qsort_pivot.h | 7 +- libc/src/stdlib/qsort_r.cpp | 9 +- libc/src/stdlib/qsort_util.h | 77 +++++----- libc/src/stdlib/quick_sort.h | 6 +- libc/test/src/stdlib/CMakeLists.txt | 4 +- libc/test/src/stdlib/SortingTest.h | 144 +++++++++++++----- libc/test/src/stdlib/heap_sort_test.cpp | 11 +- .../{qsort_test.cpp => quick_sort_test.cpp} | 4 +- .../libc/test/src/stdlib/BUILD.bazel | 4 +- 11 files changed, 202 insertions(+), 152 deletions(-) rename libc/test/src/stdlib/{qsort_test.cpp => quick_sort_test.cpp} (87%) diff --git a/libc/src/stdlib/qsort.cpp b/libc/src/stdlib/qsort.cpp index 1402f7963c389..914297a0ddba9 100644 --- a/libc/src/stdlib/qsort.cpp +++ b/libc/src/stdlib/qsort.cpp @@ -18,11 +18,10 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(void, qsort, (void *array, size_t array_size, size_t elem_size, int (*compare)(const void *, const void *))) { - internal::unstable_sort( - array, array_size, elem_size, - [compare](const void *a, const void *b) noexcept -> bool { - return compare(a, b) < 0; - }); + internal::unstable_sort(array, array_size, elem_size, + [compare](const void *a, const void *b) -> bool { + return compare(a, b) < 0; + }); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/qsort_data.h b/libc/src/stdlib/qsort_data.h index f7446ec2bb637..647239878add6 100644 --- a/libc/src/stdlib/qsort_data.h +++ b/libc/src/stdlib/qsort_data.h @@ -19,47 +19,25 @@ namespace LIBC_NAMESPACE_DECL { namespace internal { -// Returns the max amount of bytes deemed reasonable - based on the target -// properties - for use in local stack arrays. -constexpr size_t max_stack_array_size() { - // Uses target pointer size as heuristic how much memory is available and - // unlikely to run into stack overflow and perf problems. - constexpr size_t ptr_diff_size = sizeof(ptrdiff_t); - - if constexpr (ptr_diff_size >= 8) { - return 4096; - } - - if constexpr (ptr_diff_size == 4) { - return 512; - } - - // 8-bit platforms are just not gonna work well with libc, qsort - // won't be the problem. - // 16-bit platforms ought to be able to store 64 bytes on the stack. - return 64; -} - class ArrayGenericSize { - uint8_t *array_base; + cpp::byte *array_base; size_t array_len; size_t elem_size; - uint8_t *get_internal(size_t i) const noexcept { + LIBC_INLINE cpp::byte *get_internal(size_t i) const { return array_base + (i * elem_size); } public: - ArrayGenericSize(uint8_t *a, size_t s, size_t e) noexcept - : array_base(a), array_len(s), elem_size(e) {} + ArrayGenericSize(void *a, size_t s, size_t e) + : array_base(reinterpret_cast(a)), array_len(s), + elem_size(e) {} static constexpr bool has_fixed_size() { return false; } - void *get(size_t i) const noexcept { - return reinterpret_cast(get_internal(i)); - } + LIBC_INLINE void *get(size_t i) const { return get_internal(i); } - void swap(size_t i, size_t j) const noexcept { + void swap(size_t i, size_t j) const { // It's possible to use 8 byte blocks with `uint64_t`, but that // generates more machine code as the remainder loop gets // unrolled, plus 4 byte operations are more likely to be @@ -70,8 +48,8 @@ class ArrayGenericSize { using block_t = uint32_t; constexpr size_t BLOCK_SIZE = sizeof(block_t); - uint8_t *elem_i = get_internal(i); - uint8_t *elem_j = get_internal(j); + cpp::byte *elem_i = get_internal(i); + cpp::byte *elem_j = get_internal(j); const size_t elem_size_rem = elem_size % BLOCK_SIZE; const block_t *elem_i_block_end = @@ -88,74 +66,73 @@ class ArrayGenericSize { elem_j_block += 1; } - elem_i = reinterpret_cast(elem_i_block); - elem_j = reinterpret_cast(elem_j_block); + elem_i = reinterpret_cast(elem_i_block); + elem_j = reinterpret_cast(elem_j_block); for (size_t n = 0; n < elem_size_rem; ++n) { - uint8_t tmp = elem_i[n]; + cpp::byte tmp = elem_i[n]; elem_i[n] = elem_j[n]; elem_j[n] = tmp; } } - size_t len() const noexcept { return array_len; } + LIBC_INLINE size_t len() const { return array_len; } // Make an Array starting at index |i| and length |s|. - ArrayGenericSize make_array(size_t i, size_t s) const noexcept { + LIBC_INLINE ArrayGenericSize make_array(size_t i, size_t s) const { return ArrayGenericSize(get_internal(i), s, elem_size); } // Reset this Array to point at a different interval of the same // items starting at index |i|. - void reset_bounds(size_t i, size_t s) noexcept { + LIBC_INLINE void reset_bounds(size_t i, size_t s) { array_base = get_internal(i); array_len = s; } }; -// Having a specialized Array type for sorting that knowns at +// Having a specialized Array type for sorting that knows at // compile-time what the size of the element is, allows for much more // efficient swapping and for cheaper offset calculations. template class ArrayFixedSize { - uint8_t *array_base; + cpp::byte *array_base; size_t array_len; - uint8_t *get_internal(size_t i) const noexcept { + LIBC_INLINE cpp::byte *get_internal(size_t i) const { return array_base + (i * ELEM_SIZE); } public: - ArrayFixedSize(uint8_t *a, size_t s) noexcept : array_base(a), array_len(s) {} + ArrayFixedSize(void *a, size_t s) + : array_base(reinterpret_cast(a)), array_len(s) {} // Beware this function is used a heuristic for cheap to swap types, so // instantiating `ArrayFixedSize` with `ELEM_SIZE > 100` is probably a bad // idea perf wise. static constexpr bool has_fixed_size() { return true; } - void *get(size_t i) const noexcept { - return reinterpret_cast(get_internal(i)); - } + LIBC_INLINE void *get(size_t i) const { return get_internal(i); } - void swap(size_t i, size_t j) const noexcept { - alignas(32) uint8_t tmp[ELEM_SIZE]; + LIBC_INLINE void swap(size_t i, size_t j) const { + alignas(32) cpp::byte tmp[ELEM_SIZE]; - uint8_t *elem_i = get_internal(i); - uint8_t *elem_j = get_internal(j); + cpp::byte *elem_i = get_internal(i); + cpp::byte *elem_j = get_internal(j); inline_memcpy(tmp, elem_i, ELEM_SIZE); inline_memmove(elem_i, elem_j, ELEM_SIZE); inline_memcpy(elem_j, tmp, ELEM_SIZE); } - size_t len() const noexcept { return array_len; } + LIBC_INLINE size_t len() const { return array_len; } // Make an Array starting at index |i| and length |s|. - ArrayFixedSize make_array(size_t i, size_t s) const noexcept { + LIBC_INLINE ArrayFixedSize make_array(size_t i, size_t s) const { return ArrayFixedSize(get_internal(i), s); } // Reset this Array to point at a different interval of the same // items starting at index |i|. - void reset_bounds(size_t i, size_t s) noexcept { + LIBC_INLINE void reset_bounds(size_t i, size_t s) { array_base = get_internal(i); array_len = s; } diff --git a/libc/src/stdlib/qsort_pivot.h b/libc/src/stdlib/qsort_pivot.h index 68716d01c47a0..616f3ed09676e 100644 --- a/libc/src/stdlib/qsort_pivot.h +++ b/libc/src/stdlib/qsort_pivot.h @@ -9,8 +9,6 @@ #ifndef LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H #define LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H -#include "src/stdlib/qsort_pivot.h" - #include namespace LIBC_NAMESPACE_DECL { @@ -75,11 +73,10 @@ size_t median3(const A &array, size_t a, size_t b, size_t c, const F &is_less) { // If x=y=1 then a < b, c. In this case we want to return min(b, c). // By toggling the outcome of b < c using XOR x we get this behavior. const bool z = is_less(b_ptr, c_ptr); - if (z ^ x) { + if (z ^ x) return c; - } else { + else return b; - } } else { // Either c <= a < b or b <= a < c, thus a is our median. return a; diff --git a/libc/src/stdlib/qsort_r.cpp b/libc/src/stdlib/qsort_r.cpp index 1d340560bcbe1..dfd2d28f483d6 100644 --- a/libc/src/stdlib/qsort_r.cpp +++ b/libc/src/stdlib/qsort_r.cpp @@ -20,11 +20,10 @@ LLVM_LIBC_FUNCTION(void, qsort_r, int (*compare)(const void *, const void *, void *), void *arg)) { - internal::unstable_sort( - array, array_size, elem_size, - [compare, arg](const void *a, const void *b) noexcept -> bool { - return compare(a, b, arg) < 0; - }); + internal::unstable_sort(array, array_size, elem_size, + [compare, arg](const void *a, const void *b) -> bool { + return compare(a, b, arg) < 0; + }); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/qsort_util.h b/libc/src/stdlib/qsort_util.h index ee3f737926f53..630a668bcf0e0 100644 --- a/libc/src/stdlib/qsort_util.h +++ b/libc/src/stdlib/qsort_util.h @@ -15,9 +15,13 @@ #define LIBC_QSORT_QUICK_SORT 1 #define LIBC_QSORT_HEAP_SORT 2 +#ifdef LIBC_OPTIMIZE_FOR_SIZE +#define LIBC_QSORT_IMPL LIBC_QSORT_HEAP_SORT +#else #ifndef LIBC_QSORT_IMPL #define LIBC_QSORT_IMPL LIBC_QSORT_QUICK_SORT #endif // LIBC_QSORT_IMPL +#endif // LIBC_OPTIMIZE_FOR_SIZE #if (LIBC_QSORT_IMPL != LIBC_QSORT_QUICK_SORT && \ LIBC_QSORT_IMPL != LIBC_QSORT_HEAP_SORT) @@ -27,45 +31,50 @@ namespace LIBC_NAMESPACE_DECL { namespace internal { -template void sort_inst(A &array, const F &is_less) { -#if LIBC_QSORT_IMPL == LIBC_QSORT_QUICK_SORT - quick_sort(array, is_less); -#elif LIBC_QSORT_IMPL == LIBC_QSORT_HEAP_SORT - heap_sort(array, is_less); -#endif +template +void unstable_sort_impl(void *array, size_t array_len, size_t elem_size, + const F &is_less) { + if (array == nullptr || array_len == 0 || elem_size == 0) + return; + + if constexpr (USE_QUICKSORT) { + switch (elem_size) { + case 4: { + auto arr_fixed_size = internal::ArrayFixedSize<4>(array, array_len); + quick_sort(arr_fixed_size, is_less); + return; + } + case 8: { + auto arr_fixed_size = internal::ArrayFixedSize<8>(array, array_len); + quick_sort(arr_fixed_size, is_less); + return; + } + case 16: { + auto arr_fixed_size = internal::ArrayFixedSize<16>(array, array_len); + quick_sort(arr_fixed_size, is_less); + return; + } + default: + auto arr_generic_size = + internal::ArrayGenericSize(array, array_len, elem_size); + quick_sort(arr_generic_size, is_less); + return; + } + } else { + auto arr_generic_size = + internal::ArrayGenericSize(array, array_len, elem_size); + heap_sort(arr_generic_size, is_less); + } } template void unstable_sort(void *array, size_t array_len, size_t elem_size, const F &is_less) { - if (array == nullptr || array_len == 0 || elem_size == 0) { - return; - } - - uint8_t *array_base = reinterpret_cast(array); - - switch (elem_size) { - case 4: { - auto arr_fixed_size = internal::ArrayFixedSize<4>(array_base, array_len); - sort_inst(arr_fixed_size, is_less); - return; - } - case 8: { - auto arr_fixed_size = internal::ArrayFixedSize<8>(array_base, array_len); - sort_inst(arr_fixed_size, is_less); - return; - } - case 16: { - auto arr_fixed_size = internal::ArrayFixedSize<16>(array_base, array_len); - sort_inst(arr_fixed_size, is_less); - return; - } - default: - auto arr_generic_size = - internal::ArrayGenericSize(array_base, array_len, elem_size); - sort_inst(arr_generic_size, is_less); - return; - } +#if LIBC_QSORT_IMPL == LIBC_QSORT_QUICK_SORT + unstable_sort_impl(array, array_len, elem_size, is_less); +#elif LIBC_QSORT_IMPL == LIBC_QSORT_HEAP_SORT + unstable_sort_impl(array, array_len, elem_size, is_less); +#endif } } // namespace internal diff --git a/libc/src/stdlib/quick_sort.h b/libc/src/stdlib/quick_sort.h index 13cce7ee5d86c..6eb8e00bd7891 100644 --- a/libc/src/stdlib/quick_sort.h +++ b/libc/src/stdlib/quick_sort.h @@ -9,8 +9,8 @@ #ifndef LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H #define LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H +#include "src/__support/CPP/bit.h" #include "src/__support/CPP/cstddef.h" -#include "src/__support/big_int.h" #include "src/__support/macros/config.h" #include "src/stdlib/qsort_pivot.h" @@ -90,7 +90,7 @@ size_t partition(const A &array, size_t pivot_index, const F &is_less) { size_t num_lt; if constexpr (A::has_fixed_size()) { // Branchless Lomuto avoid branch misprediction penalties, but - // it also swaps more often which only is faster if the swap a + // it also swaps more often which is only faster if the swap is a fast // constant operation. num_lt = partition_lomuto_branchless(array_without_pivot, pivot, is_less); } else { @@ -130,7 +130,7 @@ void quick_sort_impl(A &array, const void *ancestor_pivot, size_t limit, if (!is_less(ancestor_pivot, array.get(pivot_index))) { const size_t num_lt = partition(array, pivot_index, - [is_less](const void *a, const void *b) noexcept -> bool { + [is_less](const void *a, const void *b) -> bool { return !is_less(b, a); }); diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt index 0044afed1d12c..aa596178f21fd 100644 --- a/libc/test/src/stdlib/CMakeLists.txt +++ b/libc/test/src/stdlib/CMakeLists.txt @@ -313,11 +313,11 @@ add_libc_test( ) add_libc_test( - qsort_test + quick_sort_test SUITE libc-stdlib-tests SRCS - qsort_test.cpp + quick_sort_test.cpp HDRS SortingTest.h DEPENDS diff --git a/libc/test/src/stdlib/SortingTest.h b/libc/test/src/stdlib/SortingTest.h index cba16566d89e7..ecdfdb76d0113 100644 --- a/libc/test/src/stdlib/SortingTest.h +++ b/libc/test/src/stdlib/SortingTest.h @@ -10,10 +10,13 @@ #include "src/stdlib/qsort.h" #include "test/UnitTest/Test.h" +#include + class SortingTest : public LIBC_NAMESPACE::testing::Test { - using SortFn = void (*)(void *array, size_t array_size, size_t elem_size, - int (*compare)(const void *, const void *)); + using SortingRoutine = void (*)(void *array, size_t array_len, + size_t elem_size, + int (*compare)(const void *, const void *)); static int int_compare(const void *l, const void *r) { int li = *reinterpret_cast(l); @@ -27,19 +30,19 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { return -1; } - static void int_sort(SortFn sort_fn, int *array, size_t array_len) { - sort_fn(reinterpret_cast(array), array_len, sizeof(int), - int_compare); + static void int_sort(SortingRoutine sort_func, int *array, size_t array_len) { + sort_func(reinterpret_cast(array), array_len, sizeof(int), + int_compare); } public: - void test_sorted_array(SortFn sort_fn) { + void test_sorted_array(SortingRoutine sort_func) { int array[25] = {10, 23, 33, 35, 55, 70, 71, 100, 110, 123, 133, 135, 155, 170, 171, 1100, 1110, 1123, 1133, 1135, 1155, 1170, 1171, 11100, 12310}; constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - int_sort(sort_fn, array, ARRAY_LEN); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_LE(array[0], 10); ASSERT_LE(array[1], 23); @@ -68,36 +71,36 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { ASSERT_LE(array[24], 12310); } - void test_reversed_sorted_array(SortFn sort_fn) { + void test_reversed_sorted_array(SortingRoutine sort_func) { int array[] = {25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}; constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - int_sort(sort_fn, array, ARRAY_LEN); + int_sort(sort_func, array, ARRAY_LEN); for (int i = 0; i < int(ARRAY_LEN - 1); ++i) ASSERT_EQ(array[i], i + 1); } - void test_all_equal_elements(SortFn sort_fn) { + void test_all_equal_elements(SortingRoutine sort_func) { int array[] = {100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100}; constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - int_sort(sort_fn, array, ARRAY_LEN); + int_sort(sort_func, array, ARRAY_LEN); for (size_t i = 0; i < ARRAY_LEN; ++i) ASSERT_EQ(array[i], 100); } - void test_unsorted_array_1(SortFn sort_fn) { + void test_unsorted_array_1(SortingRoutine sort_func) { int array[25] = {10, 23, 8, 35, 55, 45, 40, 100, 110, 123, 90, 80, 70, 60, 171, 11, 1, -1, -5, -10, 1155, 1170, 1171, 12, -100}; constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - int_sort(sort_fn, array, ARRAY_LEN); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], -100); ASSERT_EQ(array[1], -10); @@ -126,11 +129,11 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { ASSERT_EQ(array[24], 1171); } - void test_unsorted_array_2(SortFn sort_fn) { + void test_unsorted_array_2(SortingRoutine sort_func) { int array[7] = {10, 40, 45, 55, 35, 23, 60}; constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - int_sort(sort_fn, array, ARRAY_LEN); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 10); ASSERT_EQ(array[1], 23); @@ -141,11 +144,11 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { ASSERT_EQ(array[6], 60); } - void test_unsorted_array_duplicated_1(SortFn sort_fn) { + void test_unsorted_array_duplicated_1(SortingRoutine sort_func) { int array[6] = {10, 10, 20, 20, 5, 5}; constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - int_sort(sort_fn, array, ARRAY_LEN); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 5); ASSERT_EQ(array[1], 5); @@ -155,11 +158,11 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { ASSERT_EQ(array[5], 20); } - void test_unsorted_array_duplicated_2(SortFn sort_fn) { + void test_unsorted_array_duplicated_2(SortingRoutine sort_func) { int array[10] = {20, 10, 10, 10, 10, 20, 21, 21, 21, 21}; constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - int_sort(sort_fn, array, ARRAY_LEN); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 10); ASSERT_EQ(array[1], 10); @@ -173,11 +176,11 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { ASSERT_EQ(array[9], 21); } - void test_unsorted_array_duplicated_3(SortFn sort_fn) { + void test_unsorted_array_duplicated_3(SortingRoutine sort_func) { int array[10] = {20, 30, 30, 30, 30, 20, 21, 21, 21, 21}; constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - int_sort(sort_fn, array, ARRAY_LEN); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 20); ASSERT_EQ(array[1], 20); @@ -191,96 +194,160 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { ASSERT_EQ(array[9], 30); } - void test_unsorted_three_element_1(SortFn sort_fn) { + void test_unsorted_three_element_1(SortingRoutine sort_func) { int array[3] = {14999024, 0, 3}; constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - int_sort(sort_fn, array, ARRAY_LEN); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 0); ASSERT_EQ(array[1], 3); ASSERT_EQ(array[2], 14999024); } - void test_unsorted_three_element_2(SortFn sort_fn) { + void test_unsorted_three_element_2(SortingRoutine sort_func) { int array[3] = {3, 14999024, 0}; constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - int_sort(sort_fn, array, ARRAY_LEN); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 0); ASSERT_EQ(array[1], 3); ASSERT_EQ(array[2], 14999024); } - void test_unsorted_three_element_3(SortFn sort_fn) { + void test_unsorted_three_element_3(SortingRoutine sort_func) { int array[3] = {3, 0, 14999024}; constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - int_sort(sort_fn, array, ARRAY_LEN); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 0); ASSERT_EQ(array[1], 3); ASSERT_EQ(array[2], 14999024); } - void test_same_three_element(SortFn sort_fn) { + void test_same_three_element(SortingRoutine sort_func) { int array[3] = {12345, 12345, 12345}; constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - int_sort(sort_fn, array, ARRAY_LEN); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 12345); ASSERT_EQ(array[1], 12345); ASSERT_EQ(array[2], 12345); } - void test_unsorted_two_element_1(SortFn sort_fn) { + void test_unsorted_two_element_1(SortingRoutine sort_func) { int array[] = {14999024, 0}; constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - int_sort(sort_fn, array, ARRAY_LEN); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 0); ASSERT_EQ(array[1], 14999024); } - void test_unsorted_two_element_2(SortFn sort_fn) { + void test_unsorted_two_element_2(SortingRoutine sort_func) { int array[] = {0, 14999024}; constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - int_sort(sort_fn, array, ARRAY_LEN); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 0); ASSERT_EQ(array[1], 14999024); } - void test_same_two_element(SortFn sort_fn) { + void test_same_two_element(SortingRoutine sort_func) { int array[] = {12345, 12345}; constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - int_sort(sort_fn, array, ARRAY_LEN); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 12345); ASSERT_EQ(array[1], 12345); } - void test_single_element(SortFn sort_fn) { + void test_single_element(SortingRoutine sort_func) { int array[] = {12345}; constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - int_sort(sort_fn, array, ARRAY_LEN); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 12345); } + + void test_different_elem_size(SortingRoutine sort_func) { + // Random order of values [0,50) to avoid only testing pre-sorted handling. + // Long enough to reach interesting code. + constexpr uint8_t ARRAY_INITIAL_VALS[] = { + 42, 13, 8, 4, 17, 28, 20, 32, 22, 29, 7, 2, 46, 37, 26, 49, 24, + 38, 10, 18, 40, 36, 47, 15, 11, 48, 44, 33, 1, 5, 16, 35, 39, 41, + 14, 23, 3, 9, 6, 27, 21, 25, 31, 45, 12, 43, 34, 30, 19, 0}; + + constexpr size_t ARRAY_LEN = sizeof(ARRAY_INITIAL_VALS); + constexpr size_t MAX_ELEM_SIZE = 150; + constexpr size_t BUF_SIZE = ARRAY_LEN * MAX_ELEM_SIZE; + + static_assert(ARRAY_LEN < 256); // so we can encode the values. + + // Minimum alignment to test implementation for bugs related to assuming + // incorrect association between alignment and element size. + alignas(1) uint8_t buf[BUF_SIZE]; + + const auto fill_buf = [&buf](size_t elem_size) { + for (size_t i = 0; i < BUF_SIZE; ++i) { + buf[i] = 0; + } + + for (size_t elem_i = 0, buf_i = 0; elem_i < ARRAY_LEN; ++elem_i) { + const uint8_t elem_val = ARRAY_INITIAL_VALS[elem_i]; + for (size_t elem_byte_i = 0; elem_byte_i < elem_size; ++elem_byte_i) { + buf[buf_i] = elem_val; + buf_i += 1; + } + } + }; + + for (size_t elem_size = 0; elem_size <= MAX_ELEM_SIZE; ++elem_size) { + // Fill all bytes with data to ensure mistakes in elem swap are noticed. + fill_buf(elem_size); + + sort_func(reinterpret_cast(buf), ARRAY_LEN, elem_size, + [](const void *a, const void *b) -> int { + const uint8_t a_val = *reinterpret_cast(a); + const uint8_t b_val = *reinterpret_cast(b); + + if (a_val < b_val) { + return -1; + } else if (a_val > b_val) { + return 1; + } else { + return 0; + } + }); + + for (size_t elem_i = 0, buf_i = 0; elem_i < ARRAY_LEN; ++elem_i) { + const uint8_t expected_elem_val = static_cast(elem_i); + + for (size_t elem_byte_i = 0; elem_byte_i < elem_size; ++elem_byte_i) { + const uint8_t buf_val = buf[buf_i]; + // Check that every byte in the element has the expected value. + ASSERT_EQ(buf_val, expected_elem_val) + << "elem_size: " << elem_size << " buf_i: " << buf_i << '\n'; + buf_i += 1; + } + } + } + } }; #define LIST_SORTING_TESTS(Name, Func) \ @@ -331,4 +398,7 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { TEST_F(LlvmLibc##Name##Test, SingleElementArray) { \ test_single_element(Func); \ } \ + TEST_F(LlvmLibc##Name##Test, DifferentElemSizeArray) { \ + test_different_elem_size(Func); \ + } \ static_assert(true) diff --git a/libc/test/src/stdlib/heap_sort_test.cpp b/libc/test/src/stdlib/heap_sort_test.cpp index 43c8c69739615..cd802ff2099e3 100644 --- a/libc/test/src/stdlib/heap_sort_test.cpp +++ b/libc/test/src/stdlib/heap_sort_test.cpp @@ -12,14 +12,11 @@ void heap_sort(void *array, size_t array_size, size_t elem_size, int (*compare)(const void *, const void *)) { - if (array == nullptr || array_size == 0 || elem_size == 0) - return; + constexpr bool USE_QUICKSORT = false; - auto arr = LIBC_NAMESPACE::internal::ArrayGenericSize( - reinterpret_cast(array), array_size, elem_size); - - LIBC_NAMESPACE::internal::heap_sort( - arr, [compare](const void *a, const void *b) noexcept -> bool { + LIBC_NAMESPACE::internal::unstable_sort_impl( + array, array_size, elem_size, + [compare](const void *a, const void *b) noexcept -> bool { return compare(a, b) < 0; }); } diff --git a/libc/test/src/stdlib/qsort_test.cpp b/libc/test/src/stdlib/quick_sort_test.cpp similarity index 87% rename from libc/test/src/stdlib/qsort_test.cpp rename to libc/test/src/stdlib/quick_sort_test.cpp index 63cbb205c4e17..0b3bcad94a20a 100644 --- a/libc/test/src/stdlib/qsort_test.cpp +++ b/libc/test/src/stdlib/quick_sort_test.cpp @@ -11,7 +11,9 @@ void quick_sort(void *array, size_t array_size, size_t elem_size, int (*compare)(const void *, const void *)) { - LIBC_NAMESPACE::internal::unstable_sort( + constexpr bool USE_QUICKSORT = true; + + LIBC_NAMESPACE::internal::unstable_sort_impl( array, array_size, elem_size, [compare](const void *a, const void *b) noexcept -> bool { return compare(a, b) < 0; diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel index a6dac1f82b9f9..c0f1546912662 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel @@ -121,8 +121,8 @@ libc_support_library( ) libc_test( - name = "qsort_test", - srcs = ["qsort_test.cpp"], + name = "quick_sort_test", + srcs = ["quick_sort_test.cpp"], libc_function_deps = ["//libc:qsort"], deps = [ ":qsort_test_helper", From 850ca5aa2176680b2caeef26d718e024a43b081d Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Thu, 19 Dec 2024 18:28:28 +0100 Subject: [PATCH 10/12] Fix strict aliasing violation in `swap` function There might be platform where addressing the user provided array of unknown alignment as `uint32_t` would be incorrect. Generally the previous code violates strict aliasing and thus invokes UB. By using the "magic" properties of `memcpy` we can sidestep the issue by letting it do the load and store for us. In practice this means on platforms where addressing the memory as 4 byte regions is safe and efficient to do - which is most widely used platforms - the compiler can insert the appropriate `mov` instructions inline and if a platform does not support this operation it will likely insert a call to the `memcpy` function. --- libc/src/stdlib/qsort_data.h | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/libc/src/stdlib/qsort_data.h b/libc/src/stdlib/qsort_data.h index 647239878add6..f85960e085471 100644 --- a/libc/src/stdlib/qsort_data.h +++ b/libc/src/stdlib/qsort_data.h @@ -11,8 +11,6 @@ #include "src/__support/CPP/cstddef.h" #include "src/__support/macros/config.h" -#include "src/string/memory_utils/inline_memcpy.h" -#include "src/string/memory_utils/inline_memmove.h" #include @@ -48,26 +46,23 @@ class ArrayGenericSize { using block_t = uint32_t; constexpr size_t BLOCK_SIZE = sizeof(block_t); + alignas(block_t) cpp::byte tmp_block[BLOCK_SIZE]; + cpp::byte *elem_i = get_internal(i); cpp::byte *elem_j = get_internal(j); const size_t elem_size_rem = elem_size % BLOCK_SIZE; - const block_t *elem_i_block_end = - reinterpret_cast(elem_i + (elem_size - elem_size_rem)); - - block_t *elem_i_block = reinterpret_cast(elem_i); - block_t *elem_j_block = reinterpret_cast(elem_j); - - while (elem_i_block != elem_i_block_end) { - block_t tmp = *elem_i_block; - *elem_i_block = *elem_j_block; - *elem_j_block = tmp; - elem_i_block += 1; - elem_j_block += 1; + const cpp::byte *elem_i_block_end = elem_i + (elem_size - elem_size_rem); + + while (elem_i != elem_i_block_end) { + __builtin_memcpy(tmp_block, elem_i, BLOCK_SIZE); + __builtin_memcpy(elem_i, elem_j, BLOCK_SIZE); + __builtin_memcpy(elem_j, tmp_block, BLOCK_SIZE); + + elem_i += BLOCK_SIZE; + elem_j += BLOCK_SIZE; } - elem_i = reinterpret_cast(elem_i_block); - elem_j = reinterpret_cast(elem_j_block); for (size_t n = 0; n < elem_size_rem; ++n) { cpp::byte tmp = elem_i[n]; elem_i[n] = elem_j[n]; @@ -118,9 +113,9 @@ template class ArrayFixedSize { cpp::byte *elem_i = get_internal(i); cpp::byte *elem_j = get_internal(j); - inline_memcpy(tmp, elem_i, ELEM_SIZE); - inline_memmove(elem_i, elem_j, ELEM_SIZE); - inline_memcpy(elem_j, tmp, ELEM_SIZE); + __builtin_memcpy(tmp, elem_i, ELEM_SIZE); + __builtin_memmove(elem_i, elem_j, ELEM_SIZE); + __builtin_memcpy(elem_j, tmp, ELEM_SIZE); } LIBC_INLINE size_t len() const { return array_len; } From f4b20ba219d199d6323008b6baf7df21c57df8b7 Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Fri, 20 Dec 2024 11:04:24 +0100 Subject: [PATCH 11/12] Apply review comments --- libc/src/stdlib/heap_sort.h | 2 +- libc/src/stdlib/qsort.cpp | 10 ++++++---- libc/src/stdlib/qsort_data.h | 6 +++--- libc/src/stdlib/qsort_pivot.h | 5 +---- libc/src/stdlib/qsort_r.cpp | 9 +++++---- libc/src/stdlib/qsort_util.h | 15 ++++++--------- libc/src/stdlib/quick_sort.h | 19 +++++++++++-------- libc/test/src/stdlib/SortingTest.h | 2 -- libc/test/src/stdlib/heap_sort_test.cpp | 10 ++++++---- libc/test/src/stdlib/quick_sort_test.cpp | 10 ++++++---- 10 files changed, 45 insertions(+), 43 deletions(-) diff --git a/libc/src/stdlib/heap_sort.h b/libc/src/stdlib/heap_sort.h index de08984497782..b9699776df89c 100644 --- a/libc/src/stdlib/heap_sort.h +++ b/libc/src/stdlib/heap_sort.h @@ -19,7 +19,7 @@ namespace internal { // Follow the implementation in https://en.wikipedia.org/wiki/Heapsort. template -void heap_sort(const A &array, const F &is_less) { +LIBC_INLINE void heap_sort(const A &array, const F &is_less) { size_t end = array.len(); size_t start = end / 2; diff --git a/libc/src/stdlib/qsort.cpp b/libc/src/stdlib/qsort.cpp index 914297a0ddba9..0bf5fc7980527 100644 --- a/libc/src/stdlib/qsort.cpp +++ b/libc/src/stdlib/qsort.cpp @@ -18,10 +18,12 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(void, qsort, (void *array, size_t array_size, size_t elem_size, int (*compare)(const void *, const void *))) { - internal::unstable_sort(array, array_size, elem_size, - [compare](const void *a, const void *b) -> bool { - return compare(a, b) < 0; - }); + + const auto is_less = [compare](const void *a, const void *b) -> bool { + return compare(a, b) < 0; + }; + + internal::unstable_sort(array, array_size, elem_size, is_less); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/qsort_data.h b/libc/src/stdlib/qsort_data.h index f85960e085471..aa6d9bbc123de 100644 --- a/libc/src/stdlib/qsort_data.h +++ b/libc/src/stdlib/qsort_data.h @@ -27,7 +27,7 @@ class ArrayGenericSize { } public: - ArrayGenericSize(void *a, size_t s, size_t e) + LIBC_INLINE ArrayGenericSize(void *a, size_t s, size_t e) : array_base(reinterpret_cast(a)), array_len(s), elem_size(e) {} @@ -35,7 +35,7 @@ class ArrayGenericSize { LIBC_INLINE void *get(size_t i) const { return get_internal(i); } - void swap(size_t i, size_t j) const { + LIBC_INLINE void swap(size_t i, size_t j) const { // It's possible to use 8 byte blocks with `uint64_t`, but that // generates more machine code as the remainder loop gets // unrolled, plus 4 byte operations are more likely to be @@ -97,7 +97,7 @@ template class ArrayFixedSize { } public: - ArrayFixedSize(void *a, size_t s) + LIBC_INLINE ArrayFixedSize(void *a, size_t s) : array_base(reinterpret_cast(a)), array_len(s) {} // Beware this function is used a heuristic for cheap to swap types, so diff --git a/libc/src/stdlib/qsort_pivot.h b/libc/src/stdlib/qsort_pivot.h index 616f3ed09676e..f5522813a00ee 100644 --- a/libc/src/stdlib/qsort_pivot.h +++ b/libc/src/stdlib/qsort_pivot.h @@ -73,10 +73,7 @@ size_t median3(const A &array, size_t a, size_t b, size_t c, const F &is_less) { // If x=y=1 then a < b, c. In this case we want to return min(b, c). // By toggling the outcome of b < c using XOR x we get this behavior. const bool z = is_less(b_ptr, c_ptr); - if (z ^ x) - return c; - else - return b; + return z ^ x ? c : b; } else { // Either c <= a < b or b <= a < c, thus a is our median. return a; diff --git a/libc/src/stdlib/qsort_r.cpp b/libc/src/stdlib/qsort_r.cpp index dfd2d28f483d6..4e60998b6a6df 100644 --- a/libc/src/stdlib/qsort_r.cpp +++ b/libc/src/stdlib/qsort_r.cpp @@ -20,10 +20,11 @@ LLVM_LIBC_FUNCTION(void, qsort_r, int (*compare)(const void *, const void *, void *), void *arg)) { - internal::unstable_sort(array, array_size, elem_size, - [compare, arg](const void *a, const void *b) -> bool { - return compare(a, b, arg) < 0; - }); + const auto is_less = [compare, arg](const void *a, const void *b) -> bool { + return compare(a, b, arg) < 0; + }; + + internal::unstable_sort(array, array_size, elem_size, is_less); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/qsort_util.h b/libc/src/stdlib/qsort_util.h index 630a668bcf0e0..b99c4a8701073 100644 --- a/libc/src/stdlib/qsort_util.h +++ b/libc/src/stdlib/qsort_util.h @@ -32,8 +32,8 @@ namespace LIBC_NAMESPACE_DECL { namespace internal { template -void unstable_sort_impl(void *array, size_t array_len, size_t elem_size, - const F &is_less) { +LIBC_INLINE void unstable_sort_impl(void *array, size_t array_len, + size_t elem_size, const F &is_less) { if (array == nullptr || array_len == 0 || elem_size == 0) return; @@ -68,13 +68,10 @@ void unstable_sort_impl(void *array, size_t array_len, size_t elem_size, } template -void unstable_sort(void *array, size_t array_len, size_t elem_size, - const F &is_less) { -#if LIBC_QSORT_IMPL == LIBC_QSORT_QUICK_SORT - unstable_sort_impl(array, array_len, elem_size, is_less); -#elif LIBC_QSORT_IMPL == LIBC_QSORT_HEAP_SORT - unstable_sort_impl(array, array_len, elem_size, is_less); -#endif +LIBC_INLINE void unstable_sort(void *array, size_t array_len, size_t elem_size, + const F &is_less) { +#define USE_QUICK_SORT ((LIBC_QSORT_IMPL) == (LIBC_QSORT_QUICK_SORT)) + unstable_sort_impl(array, array_len, elem_size, is_less); } } // namespace internal diff --git a/libc/src/stdlib/quick_sort.h b/libc/src/stdlib/quick_sort.h index 6eb8e00bd7891..9ab2830250018 100644 --- a/libc/src/stdlib/quick_sort.h +++ b/libc/src/stdlib/quick_sort.h @@ -24,8 +24,9 @@ namespace internal { // https://github.com/Voultapher/sort-research-rs/blob/main/writeup/lomcyc_partition/text.md. // Simplified to avoid having to stack allocate. template -size_t partition_lomuto_branchless(const A &array, const void *pivot, - const F &is_less) { +LIBC_INLINE size_t partition_lomuto_branchless(const A &array, + const void *pivot, + const F &is_less) { const size_t array_len = array.len(); size_t left = 0; @@ -49,8 +50,8 @@ size_t partition_lomuto_branchless(const A &array, const void *pivot, // cyclic permutation is to have more efficient swapping, but we don't // know the element size so this isn't applicable here either. template -size_t partition_hoare_branchy(const A &array, const void *pivot, - const F &is_less) { +LIBC_INLINE size_t partition_hoare_branchy(const A &array, const void *pivot, + const F &is_less) { const size_t array_len = array.len(); size_t left = 0; @@ -78,7 +79,8 @@ size_t partition_hoare_branchy(const A &array, const void *pivot, } template -size_t partition(const A &array, size_t pivot_index, const F &is_less) { +LIBC_INLINE size_t partition(const A &array, size_t pivot_index, + const F &is_less) { // Place the pivot at the beginning of the array. if (pivot_index != 0) { array.swap(0, pivot_index); @@ -104,8 +106,8 @@ size_t partition(const A &array, size_t pivot_index, const F &is_less) { } template -void quick_sort_impl(A &array, const void *ancestor_pivot, size_t limit, - const F &is_less) { +LIBC_INLINE void quick_sort_impl(A &array, const void *ancestor_pivot, + size_t limit, const F &is_less) { while (true) { const size_t array_len = array.len(); if (array_len <= 1) @@ -167,7 +169,8 @@ void quick_sort_impl(A &array, const void *ancestor_pivot, size_t limit, constexpr size_t ilog2(size_t n) { return cpp::bit_width(n) - 1; } -template void quick_sort(A &array, const F &is_less) { +template +LIBC_INLINE void quick_sort(A &array, const F &is_less) { const void *ancestor_pivot = nullptr; // Limit the number of imbalanced partitions to `2 * floor(log2(len))`. // The binary OR by one is used to eliminate the zero-check in the logarithm. diff --git a/libc/test/src/stdlib/SortingTest.h b/libc/test/src/stdlib/SortingTest.h index ecdfdb76d0113..034c0e4f1fd01 100644 --- a/libc/test/src/stdlib/SortingTest.h +++ b/libc/test/src/stdlib/SortingTest.h @@ -10,8 +10,6 @@ #include "src/stdlib/qsort.h" #include "test/UnitTest/Test.h" -#include - class SortingTest : public LIBC_NAMESPACE::testing::Test { using SortingRoutine = void (*)(void *array, size_t array_len, diff --git a/libc/test/src/stdlib/heap_sort_test.cpp b/libc/test/src/stdlib/heap_sort_test.cpp index cd802ff2099e3..18d4244506ec2 100644 --- a/libc/test/src/stdlib/heap_sort_test.cpp +++ b/libc/test/src/stdlib/heap_sort_test.cpp @@ -14,11 +14,13 @@ void heap_sort(void *array, size_t array_size, size_t elem_size, constexpr bool USE_QUICKSORT = false; + const auto is_less = [compare](const void *a, + const void *b) noexcept -> bool { + return compare(a, b) < 0; + }; + LIBC_NAMESPACE::internal::unstable_sort_impl( - array, array_size, elem_size, - [compare](const void *a, const void *b) noexcept -> bool { - return compare(a, b) < 0; - }); + array, array_size, elem_size, is_less); } LIST_SORTING_TESTS(HeapSort, heap_sort); diff --git a/libc/test/src/stdlib/quick_sort_test.cpp b/libc/test/src/stdlib/quick_sort_test.cpp index 0b3bcad94a20a..2832c855370bc 100644 --- a/libc/test/src/stdlib/quick_sort_test.cpp +++ b/libc/test/src/stdlib/quick_sort_test.cpp @@ -13,11 +13,13 @@ void quick_sort(void *array, size_t array_size, size_t elem_size, int (*compare)(const void *, const void *)) { constexpr bool USE_QUICKSORT = true; + const auto is_less = [compare](const void *a, + const void *b) noexcept -> bool { + return compare(a, b) < 0; + }; + LIBC_NAMESPACE::internal::unstable_sort_impl( - array, array_size, elem_size, - [compare](const void *a, const void *b) noexcept -> bool { - return compare(a, b) < 0; - }); + array, array_size, elem_size, is_less); } LIST_SORTING_TESTS(Qsort, quick_sort); From 0e62c9040ef32dec27f10729b3a8fc1194c4de4c Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Sun, 29 Dec 2024 15:10:28 +0100 Subject: [PATCH 12/12] Apply review comments --- libc/src/stdlib/qsort_pivot.h | 5 ++--- libc/src/stdlib/qsort_util.h | 4 ---- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/libc/src/stdlib/qsort_pivot.h b/libc/src/stdlib/qsort_pivot.h index f5522813a00ee..b7e1b4294f6d6 100644 --- a/libc/src/stdlib/qsort_pivot.h +++ b/libc/src/stdlib/qsort_pivot.h @@ -35,11 +35,10 @@ size_t choose_pivot(const A &array, const F &is_less) { const size_t b = len_div_8 * 4; // [4*floor(n/8), 5*floor(n/8)) const size_t c = len_div_8 * 7; // [7*floor(n/8), 8*floor(n/8)) - if (len < PSEUDO_MEDIAN_REC_THRESHOLD) { + if (len < PSEUDO_MEDIAN_REC_THRESHOLD) return median3(array, a, b, c, is_less); - } else { + else return median3_rec(array, a, b, c, len_div_8, is_less); - } } // Calculates an approximate median of 3 elements from sections a, b, c, or diff --git a/libc/src/stdlib/qsort_util.h b/libc/src/stdlib/qsort_util.h index b99c4a8701073..7882b829d3274 100644 --- a/libc/src/stdlib/qsort_util.h +++ b/libc/src/stdlib/qsort_util.h @@ -15,13 +15,9 @@ #define LIBC_QSORT_QUICK_SORT 1 #define LIBC_QSORT_HEAP_SORT 2 -#ifdef LIBC_OPTIMIZE_FOR_SIZE -#define LIBC_QSORT_IMPL LIBC_QSORT_HEAP_SORT -#else #ifndef LIBC_QSORT_IMPL #define LIBC_QSORT_IMPL LIBC_QSORT_QUICK_SORT #endif // LIBC_QSORT_IMPL -#endif // LIBC_OPTIMIZE_FOR_SIZE #if (LIBC_QSORT_IMPL != LIBC_QSORT_QUICK_SORT && \ LIBC_QSORT_IMPL != LIBC_QSORT_HEAP_SORT)