diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp index 5ab0447bb08b2..1f81de4248ff0 100644 --- a/libc/src/__support/wchar/character_converter.cpp +++ b/libc/src/__support/wchar/character_converter.cpp @@ -30,18 +30,20 @@ CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; } void CharacterConverter::clear() { state->partial = 0; - state->bytes_processed = 0; + state->bytes_stored = 0; state->total_bytes = 0; } -bool CharacterConverter::isComplete() { - return state->bytes_processed == state->total_bytes; +bool CharacterConverter::isFull() { + return state->bytes_stored == state->total_bytes && state->total_bytes != 0; } +bool CharacterConverter::isEmpty() { return state->bytes_stored == 0; } + int CharacterConverter::push(char8_t utf8_byte) { uint8_t num_ones = static_cast(cpp::countl_one(utf8_byte)); // Checking the first byte if first push - if (state->bytes_processed == 0) { + if (isEmpty()) { // UTF-8 char has 1 byte total if (num_ones == 0) { state->total_bytes = 1; @@ -58,21 +60,21 @@ int CharacterConverter::push(char8_t utf8_byte) { } // Invalid first byte else { - // bytes_processed and total_bytes will always be 0 here + // bytes_stored and total_bytes will always be 0 here state->partial = static_cast(0); return -1; } state->partial = static_cast(utf8_byte); - state->bytes_processed++; + state->bytes_stored++; return 0; } // Any subsequent push // Adding 6 more bits so need to left shift - if (num_ones == 1 && !isComplete()) { + if (num_ones == 1 && !isFull()) { char32_t byte = utf8_byte & MASK_ENCODED_BITS; state->partial = state->partial << ENCODED_BITS_PER_UTF8; state->partial |= byte; - state->bytes_processed++; + state->bytes_stored++; return 0; } // Invalid byte -> reset the state @@ -82,11 +84,10 @@ int CharacterConverter::push(char8_t utf8_byte) { int CharacterConverter::push(char32_t utf32) { // we can't be partially through a conversion when pushing a utf32 value - if (!isComplete()) + if (!isEmpty()) return -1; state->partial = utf32; - state->bytes_processed = 0; // determine number of utf-8 bytes needed to represent this utf32 value constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff}; @@ -94,6 +95,7 @@ int CharacterConverter::push(char32_t utf32) { for (uint8_t i = 0; i < NUM_RANGES; i++) { if (state->partial <= MAX_VALUE_PER_UTF8_LEN[i]) { state->total_bytes = i + 1; + state->bytes_stored = i + 1; return 0; } } @@ -107,7 +109,7 @@ int CharacterConverter::push(char32_t utf32) { ErrorOr CharacterConverter::pop_utf32() { // If pop is called too early, do not reset the state, use error to determine // whether enough bytes have been pushed - if (!isComplete() || state->bytes_processed == 0) + if (!isFull()) return Error(-1); char32_t utf32 = state->partial; // reset if successful pop @@ -116,7 +118,7 @@ ErrorOr CharacterConverter::pop_utf32() { } ErrorOr CharacterConverter::pop_utf8() { - if (isComplete()) + if (isEmpty()) return Error(-1); constexpr char8_t FIRST_BYTE_HEADERS[] = {0, 0xC0, 0xE0, 0xF0}; @@ -125,9 +127,8 @@ ErrorOr CharacterConverter::pop_utf8() { char32_t output; // Shift to get the next 6 bits from the utf32 encoding - const size_t shift_amount = - (state->total_bytes - state->bytes_processed - 1) * ENCODED_BITS_PER_UTF8; - if (state->bytes_processed == 0) { + const size_t shift_amount = (state->bytes_stored - 1) * ENCODED_BITS_PER_UTF8; + if (isFull()) { /* Choose the correct set of most significant bits to encode the length of the utf8 sequence. The remaining bits contain the most significant @@ -141,7 +142,7 @@ ErrorOr CharacterConverter::pop_utf8() { ((state->partial >> shift_amount) & MASK_ENCODED_BITS); } - state->bytes_processed++; + state->bytes_stored--; return static_cast(output); } diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h index c4ba7cf6b689f..be0e6129df236 100644 --- a/libc/src/__support/wchar/character_converter.h +++ b/libc/src/__support/wchar/character_converter.h @@ -26,7 +26,8 @@ class CharacterConverter { CharacterConverter(mbstate *mbstate); void clear(); - bool isComplete(); + bool isFull(); + bool isEmpty(); int push(char8_t utf8_byte); int push(char32_t utf32); diff --git a/libc/src/__support/wchar/mbstate.h b/libc/src/__support/wchar/mbstate.h index fb08fb4eaa188..fea693f73c3b5 100644 --- a/libc/src/__support/wchar/mbstate.h +++ b/libc/src/__support/wchar/mbstate.h @@ -22,10 +22,10 @@ struct mbstate { /* Progress towards a conversion - For utf8 -> utf32, increases with each CharacterConverter::push(utf8_byte) - For utf32 -> utf8, increases with each CharacterConverter::pop_utf8() + Increases with each push(...) until it reaches total_bytes + Decreases with each pop(...) until it reaches 0 */ - uint8_t bytes_processed; + uint8_t bytes_stored; // Total number of bytes that will be needed to represent this character uint8_t total_bytes; diff --git a/libc/test/src/__support/wchar/utf32_to_8_test.cpp b/libc/test/src/__support/wchar/utf32_to_8_test.cpp index f4c5cb863ff38..a6a7bc4aa6f4c 100644 --- a/libc/test/src/__support/wchar/utf32_to_8_test.cpp +++ b/libc/test/src/__support/wchar/utf32_to_8_test.cpp @@ -20,17 +20,19 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) { // utf8 1-byte encodings are identical to their utf32 representations char32_t utf32_A = 0x41; // 'A' cr.push(utf32_A); + ASSERT_TRUE(cr.isFull()); auto popped = cr.pop_utf8(); ASSERT_TRUE(popped.has_value()); ASSERT_EQ(static_cast(popped.value()), 'A'); - ASSERT_TRUE(cr.isComplete()); + ASSERT_TRUE(cr.isEmpty()); char32_t utf32_B = 0x42; // 'B' cr.push(utf32_B); + ASSERT_TRUE(cr.isFull()); popped = cr.pop_utf8(); ASSERT_TRUE(popped.has_value()); ASSERT_EQ(static_cast(popped.value()), 'B'); - ASSERT_TRUE(cr.isComplete()); + ASSERT_TRUE(cr.isEmpty()); // should error if we try to pop another utf8 byte out popped = cr.pop_utf8(); @@ -45,26 +47,28 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) { // testing utf32: 0xff -> utf8: 0xc3 0xbf char32_t utf32 = 0xff; cr.push(utf32); + ASSERT_TRUE(cr.isFull()); auto popped = cr.pop_utf8(); ASSERT_TRUE(popped.has_value()); ASSERT_EQ(static_cast(popped.value()), 0xc3); - ASSERT_TRUE(!cr.isComplete()); + ASSERT_TRUE(!cr.isEmpty()); popped = cr.pop_utf8(); ASSERT_TRUE(popped.has_value()); ASSERT_EQ(static_cast(popped.value()), 0xbf); - ASSERT_TRUE(cr.isComplete()); + ASSERT_TRUE(cr.isEmpty()); // testing utf32: 0x58e -> utf8: 0xd6 0x8e utf32 = 0x58e; cr.push(utf32); + ASSERT_TRUE(cr.isFull()); popped = cr.pop_utf8(); ASSERT_TRUE(popped.has_value()); ASSERT_EQ(static_cast(popped.value()), 0xd6); - ASSERT_TRUE(!cr.isComplete()); + ASSERT_TRUE(!cr.isEmpty()); popped = cr.pop_utf8(); ASSERT_TRUE(popped.has_value()); ASSERT_EQ(static_cast(popped.value()), 0x8e); - ASSERT_TRUE(cr.isComplete()); + ASSERT_TRUE(cr.isEmpty()); // should error if we try to pop another utf8 byte out popped = cr.pop_utf8(); @@ -79,34 +83,36 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) { // testing utf32: 0xac15 -> utf8: 0xea 0xb0 0x95 char32_t utf32 = 0xac15; cr.push(utf32); + ASSERT_TRUE(cr.isFull()); auto popped = cr.pop_utf8(); ASSERT_TRUE(popped.has_value()); ASSERT_EQ(static_cast(popped.value()), 0xea); - ASSERT_TRUE(!cr.isComplete()); + ASSERT_TRUE(!cr.isEmpty()); popped = cr.pop_utf8(); ASSERT_TRUE(popped.has_value()); ASSERT_EQ(static_cast(popped.value()), 0xb0); - ASSERT_TRUE(!cr.isComplete()); + ASSERT_TRUE(!cr.isEmpty()); popped = cr.pop_utf8(); ASSERT_TRUE(popped.has_value()); ASSERT_EQ(static_cast(popped.value()), 0x95); - ASSERT_TRUE(cr.isComplete()); + ASSERT_TRUE(cr.isEmpty()); // testing utf32: 0x267b -> utf8: 0xe2 0x99 0xbb utf32 = 0x267b; cr.push(utf32); + ASSERT_TRUE(cr.isFull()); popped = cr.pop_utf8(); ASSERT_TRUE(popped.has_value()); ASSERT_EQ(static_cast(popped.value()), 0xe2); - ASSERT_TRUE(!cr.isComplete()); + ASSERT_TRUE(!cr.isEmpty()); popped = cr.pop_utf8(); ASSERT_TRUE(popped.has_value()); ASSERT_EQ(static_cast(popped.value()), 0x99); - ASSERT_TRUE(!cr.isComplete()); + ASSERT_TRUE(!cr.isEmpty()); popped = cr.pop_utf8(); ASSERT_TRUE(popped.has_value()); ASSERT_EQ(static_cast(popped.value()), 0xbb); - ASSERT_TRUE(cr.isComplete()); + ASSERT_TRUE(cr.isEmpty()); // should error if we try to pop another utf8 byte out popped = cr.pop_utf8(); @@ -121,42 +127,44 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, FourByte) { // testing utf32: 0x1f921 -> utf8: 0xf0 0x9f 0xa4 0xa1 char32_t utf32 = 0x1f921; cr.push(utf32); + ASSERT_TRUE(cr.isFull()); auto popped = cr.pop_utf8(); ASSERT_TRUE(popped.has_value()); ASSERT_EQ(static_cast(popped.value()), 0xf0); - ASSERT_TRUE(!cr.isComplete()); + ASSERT_TRUE(!cr.isEmpty()); popped = cr.pop_utf8(); ASSERT_TRUE(popped.has_value()); ASSERT_EQ(static_cast(popped.value()), 0x9f); - ASSERT_TRUE(!cr.isComplete()); + ASSERT_TRUE(!cr.isEmpty()); popped = cr.pop_utf8(); ASSERT_TRUE(popped.has_value()); ASSERT_EQ(static_cast(popped.value()), 0xa4); - ASSERT_TRUE(!cr.isComplete()); + ASSERT_TRUE(!cr.isEmpty()); popped = cr.pop_utf8(); ASSERT_TRUE(popped.has_value()); ASSERT_EQ(static_cast(popped.value()), 0xa1); - ASSERT_TRUE(cr.isComplete()); + ASSERT_TRUE(cr.isEmpty()); // testing utf32: 0x12121 -> utf8: 0xf0 0x92 0x84 0xa1 utf32 = 0x12121; cr.push(utf32); + ASSERT_TRUE(cr.isFull()); popped = cr.pop_utf8(); ASSERT_TRUE(popped.has_value()); ASSERT_EQ(static_cast(popped.value()), 0xf0); - ASSERT_TRUE(!cr.isComplete()); + ASSERT_TRUE(!cr.isEmpty()); popped = cr.pop_utf8(); ASSERT_TRUE(popped.has_value()); ASSERT_EQ(static_cast(popped.value()), 0x92); - ASSERT_TRUE(!cr.isComplete()); + ASSERT_TRUE(!cr.isEmpty()); popped = cr.pop_utf8(); ASSERT_TRUE(popped.has_value()); ASSERT_EQ(static_cast(popped.value()), 0x84); - ASSERT_TRUE(!cr.isComplete()); + ASSERT_TRUE(!cr.isEmpty()); popped = cr.pop_utf8(); ASSERT_TRUE(popped.has_value()); ASSERT_EQ(static_cast(popped.value()), 0xa1); - ASSERT_TRUE(cr.isComplete()); + ASSERT_TRUE(cr.isEmpty()); // should error if we try to pop another utf8 byte out popped = cr.pop_utf8(); diff --git a/libc/test/src/__support/wchar/utf8_to_32_test.cpp b/libc/test/src/__support/wchar/utf8_to_32_test.cpp index 9cb059faa9374..36ae7d689cc00 100644 --- a/libc/test/src/__support/wchar/utf8_to_32_test.cpp +++ b/libc/test/src/__support/wchar/utf8_to_32_test.cpp @@ -13,7 +13,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) { LIBC_NAMESPACE::internal::mbstate state; - state.bytes_processed = 0; + state.bytes_stored = 0; state.total_bytes = 0; char ch = 'A'; @@ -28,7 +28,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) { TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) { LIBC_NAMESPACE::internal::mbstate state; - state.bytes_processed = 0; + state.bytes_stored = 0; state.total_bytes = 0; const char ch[2] = {static_cast(0xC2), static_cast(0x8E)}; // Ž car symbol @@ -44,7 +44,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) { TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) { LIBC_NAMESPACE::internal::mbstate state; - state.bytes_processed = 0; + state.bytes_stored = 0; state.total_bytes = 0; const char ch[3] = {static_cast(0xE2), static_cast(0x88), static_cast(0x91)}; // ∑ sigma symbol @@ -61,7 +61,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) { TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) { LIBC_NAMESPACE::internal::mbstate state; - state.bytes_processed = 0; + state.bytes_stored = 0; state.total_bytes = 0; const char ch[4] = {static_cast(0xF0), static_cast(0x9F), static_cast(0xA4), @@ -80,7 +80,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) { TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) { LIBC_NAMESPACE::internal::mbstate state; - state.bytes_processed = 0; + state.bytes_stored = 0; state.total_bytes = 0; const char ch = static_cast(0x80); // invalid starting bit sequence @@ -92,7 +92,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) { TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) { LIBC_NAMESPACE::internal::mbstate state; - state.bytes_processed = 0; + state.bytes_stored = 0; state.total_bytes = 0; const char ch[4] = { static_cast(0x80), static_cast(0x00), static_cast(0x80), @@ -112,7 +112,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) { TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) { LIBC_NAMESPACE::internal::mbstate state; - state.bytes_processed = 0; + state.bytes_stored = 0; state.total_bytes = 0; // Last byte is invalid since it does not have correct starting sequence. // 0xC0 --> 11000000 starting sequence should be 10xxxxxx @@ -132,7 +132,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) { TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) { LIBC_NAMESPACE::internal::mbstate state; - state.bytes_processed = 0; + state.bytes_stored = 0; state.total_bytes = 0; const char ch[3] = {static_cast(0xC2), static_cast(0x8E), static_cast(0x80)}; @@ -153,7 +153,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) { TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) { LIBC_NAMESPACE::internal::mbstate state; - state.bytes_processed = 0; + state.bytes_stored = 0; state.total_bytes = 0; const char ch[4] = {static_cast(0xC2), static_cast(0x8E), static_cast(0xC7), static_cast(0x8C)}; @@ -179,7 +179,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) { TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidPop) { LIBC_NAMESPACE::internal::mbstate state; - state.bytes_processed = 0; + state.bytes_stored = 0; state.total_bytes = 0; LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); const char ch[2] = {static_cast(0xC2), static_cast(0x8E)};