Skip to content

Commit 8d6e29d

Browse files
authored
[libc] Reworked CharacterConverter isComplete into isFull and isEmpty (#144799)
isComplete previously meant different things for different conversion directions. Refactored bytes_processed to bytes_stored which now consistently increments on every push and decrements on pop making both directions more consistent with each other
1 parent 7157f33 commit 8d6e29d

File tree

5 files changed

+60
-50
lines changed

5 files changed

+60
-50
lines changed

libc/src/__support/wchar/character_converter.cpp

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -30,18 +30,20 @@ CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; }
3030

3131
void CharacterConverter::clear() {
3232
state->partial = 0;
33-
state->bytes_processed = 0;
33+
state->bytes_stored = 0;
3434
state->total_bytes = 0;
3535
}
3636

37-
bool CharacterConverter::isComplete() {
38-
return state->bytes_processed == state->total_bytes;
37+
bool CharacterConverter::isFull() {
38+
return state->bytes_stored == state->total_bytes && state->total_bytes != 0;
3939
}
4040

41+
bool CharacterConverter::isEmpty() { return state->bytes_stored == 0; }
42+
4143
int CharacterConverter::push(char8_t utf8_byte) {
4244
uint8_t num_ones = static_cast<uint8_t>(cpp::countl_one(utf8_byte));
4345
// Checking the first byte if first push
44-
if (state->bytes_processed == 0) {
46+
if (isEmpty()) {
4547
// UTF-8 char has 1 byte total
4648
if (num_ones == 0) {
4749
state->total_bytes = 1;
@@ -58,21 +60,21 @@ int CharacterConverter::push(char8_t utf8_byte) {
5860
}
5961
// Invalid first byte
6062
else {
61-
// bytes_processed and total_bytes will always be 0 here
63+
// bytes_stored and total_bytes will always be 0 here
6264
state->partial = static_cast<char32_t>(0);
6365
return -1;
6466
}
6567
state->partial = static_cast<char32_t>(utf8_byte);
66-
state->bytes_processed++;
68+
state->bytes_stored++;
6769
return 0;
6870
}
6971
// Any subsequent push
7072
// Adding 6 more bits so need to left shift
71-
if (num_ones == 1 && !isComplete()) {
73+
if (num_ones == 1 && !isFull()) {
7274
char32_t byte = utf8_byte & MASK_ENCODED_BITS;
7375
state->partial = state->partial << ENCODED_BITS_PER_UTF8;
7476
state->partial |= byte;
75-
state->bytes_processed++;
77+
state->bytes_stored++;
7678
return 0;
7779
}
7880
// Invalid byte -> reset the state
@@ -82,18 +84,18 @@ int CharacterConverter::push(char8_t utf8_byte) {
8284

8385
int CharacterConverter::push(char32_t utf32) {
8486
// we can't be partially through a conversion when pushing a utf32 value
85-
if (!isComplete())
87+
if (!isEmpty())
8688
return -1;
8789

8890
state->partial = utf32;
89-
state->bytes_processed = 0;
9091

9192
// determine number of utf-8 bytes needed to represent this utf32 value
9293
constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};
9394
constexpr int NUM_RANGES = 4;
9495
for (uint8_t i = 0; i < NUM_RANGES; i++) {
9596
if (state->partial <= MAX_VALUE_PER_UTF8_LEN[i]) {
9697
state->total_bytes = i + 1;
98+
state->bytes_stored = i + 1;
9799
return 0;
98100
}
99101
}
@@ -107,7 +109,7 @@ int CharacterConverter::push(char32_t utf32) {
107109
ErrorOr<char32_t> CharacterConverter::pop_utf32() {
108110
// If pop is called too early, do not reset the state, use error to determine
109111
// whether enough bytes have been pushed
110-
if (!isComplete() || state->bytes_processed == 0)
112+
if (!isFull())
111113
return Error(-1);
112114
char32_t utf32 = state->partial;
113115
// reset if successful pop
@@ -116,7 +118,7 @@ ErrorOr<char32_t> CharacterConverter::pop_utf32() {
116118
}
117119

118120
ErrorOr<char8_t> CharacterConverter::pop_utf8() {
119-
if (isComplete())
121+
if (isEmpty())
120122
return Error(-1);
121123

122124
constexpr char8_t FIRST_BYTE_HEADERS[] = {0, 0xC0, 0xE0, 0xF0};
@@ -125,9 +127,8 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
125127
char32_t output;
126128

127129
// Shift to get the next 6 bits from the utf32 encoding
128-
const size_t shift_amount =
129-
(state->total_bytes - state->bytes_processed - 1) * ENCODED_BITS_PER_UTF8;
130-
if (state->bytes_processed == 0) {
130+
const size_t shift_amount = (state->bytes_stored - 1) * ENCODED_BITS_PER_UTF8;
131+
if (isFull()) {
131132
/*
132133
Choose the correct set of most significant bits to encode the length
133134
of the utf8 sequence. The remaining bits contain the most significant
@@ -141,7 +142,7 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
141142
((state->partial >> shift_amount) & MASK_ENCODED_BITS);
142143
}
143144

144-
state->bytes_processed++;
145+
state->bytes_stored--;
145146
return static_cast<char8_t>(output);
146147
}
147148

libc/src/__support/wchar/character_converter.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ class CharacterConverter {
2626
CharacterConverter(mbstate *mbstate);
2727

2828
void clear();
29-
bool isComplete();
29+
bool isFull();
30+
bool isEmpty();
3031

3132
int push(char8_t utf8_byte);
3233
int push(char32_t utf32);

libc/src/__support/wchar/mbstate.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,10 @@ struct mbstate {
2222

2323
/*
2424
Progress towards a conversion
25-
For utf8 -> utf32, increases with each CharacterConverter::push(utf8_byte)
26-
For utf32 -> utf8, increases with each CharacterConverter::pop_utf8()
25+
Increases with each push(...) until it reaches total_bytes
26+
Decreases with each pop(...) until it reaches 0
2727
*/
28-
uint8_t bytes_processed;
28+
uint8_t bytes_stored;
2929

3030
// Total number of bytes that will be needed to represent this character
3131
uint8_t total_bytes;

libc/test/src/__support/wchar/utf32_to_8_test.cpp

Lines changed: 28 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -20,17 +20,19 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {
2020
// utf8 1-byte encodings are identical to their utf32 representations
2121
char32_t utf32_A = 0x41; // 'A'
2222
cr.push(utf32_A);
23+
ASSERT_TRUE(cr.isFull());
2324
auto popped = cr.pop_utf8();
2425
ASSERT_TRUE(popped.has_value());
2526
ASSERT_EQ(static_cast<char>(popped.value()), 'A');
26-
ASSERT_TRUE(cr.isComplete());
27+
ASSERT_TRUE(cr.isEmpty());
2728

2829
char32_t utf32_B = 0x42; // 'B'
2930
cr.push(utf32_B);
31+
ASSERT_TRUE(cr.isFull());
3032
popped = cr.pop_utf8();
3133
ASSERT_TRUE(popped.has_value());
3234
ASSERT_EQ(static_cast<char>(popped.value()), 'B');
33-
ASSERT_TRUE(cr.isComplete());
35+
ASSERT_TRUE(cr.isEmpty());
3436

3537
// should error if we try to pop another utf8 byte out
3638
popped = cr.pop_utf8();
@@ -45,26 +47,28 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
4547
// testing utf32: 0xff -> utf8: 0xc3 0xbf
4648
char32_t utf32 = 0xff;
4749
cr.push(utf32);
50+
ASSERT_TRUE(cr.isFull());
4851
auto popped = cr.pop_utf8();
4952
ASSERT_TRUE(popped.has_value());
5053
ASSERT_EQ(static_cast<int>(popped.value()), 0xc3);
51-
ASSERT_TRUE(!cr.isComplete());
54+
ASSERT_TRUE(!cr.isEmpty());
5255
popped = cr.pop_utf8();
5356
ASSERT_TRUE(popped.has_value());
5457
ASSERT_EQ(static_cast<int>(popped.value()), 0xbf);
55-
ASSERT_TRUE(cr.isComplete());
58+
ASSERT_TRUE(cr.isEmpty());
5659

5760
// testing utf32: 0x58e -> utf8: 0xd6 0x8e
5861
utf32 = 0x58e;
5962
cr.push(utf32);
63+
ASSERT_TRUE(cr.isFull());
6064
popped = cr.pop_utf8();
6165
ASSERT_TRUE(popped.has_value());
6266
ASSERT_EQ(static_cast<int>(popped.value()), 0xd6);
63-
ASSERT_TRUE(!cr.isComplete());
67+
ASSERT_TRUE(!cr.isEmpty());
6468
popped = cr.pop_utf8();
6569
ASSERT_TRUE(popped.has_value());
6670
ASSERT_EQ(static_cast<int>(popped.value()), 0x8e);
67-
ASSERT_TRUE(cr.isComplete());
71+
ASSERT_TRUE(cr.isEmpty());
6872

6973
// should error if we try to pop another utf8 byte out
7074
popped = cr.pop_utf8();
@@ -79,34 +83,36 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
7983
// testing utf32: 0xac15 -> utf8: 0xea 0xb0 0x95
8084
char32_t utf32 = 0xac15;
8185
cr.push(utf32);
86+
ASSERT_TRUE(cr.isFull());
8287
auto popped = cr.pop_utf8();
8388
ASSERT_TRUE(popped.has_value());
8489
ASSERT_EQ(static_cast<int>(popped.value()), 0xea);
85-
ASSERT_TRUE(!cr.isComplete());
90+
ASSERT_TRUE(!cr.isEmpty());
8691
popped = cr.pop_utf8();
8792
ASSERT_TRUE(popped.has_value());
8893
ASSERT_EQ(static_cast<int>(popped.value()), 0xb0);
89-
ASSERT_TRUE(!cr.isComplete());
94+
ASSERT_TRUE(!cr.isEmpty());
9095
popped = cr.pop_utf8();
9196
ASSERT_TRUE(popped.has_value());
9297
ASSERT_EQ(static_cast<int>(popped.value()), 0x95);
93-
ASSERT_TRUE(cr.isComplete());
98+
ASSERT_TRUE(cr.isEmpty());
9499

95100
// testing utf32: 0x267b -> utf8: 0xe2 0x99 0xbb
96101
utf32 = 0x267b;
97102
cr.push(utf32);
103+
ASSERT_TRUE(cr.isFull());
98104
popped = cr.pop_utf8();
99105
ASSERT_TRUE(popped.has_value());
100106
ASSERT_EQ(static_cast<int>(popped.value()), 0xe2);
101-
ASSERT_TRUE(!cr.isComplete());
107+
ASSERT_TRUE(!cr.isEmpty());
102108
popped = cr.pop_utf8();
103109
ASSERT_TRUE(popped.has_value());
104110
ASSERT_EQ(static_cast<int>(popped.value()), 0x99);
105-
ASSERT_TRUE(!cr.isComplete());
111+
ASSERT_TRUE(!cr.isEmpty());
106112
popped = cr.pop_utf8();
107113
ASSERT_TRUE(popped.has_value());
108114
ASSERT_EQ(static_cast<int>(popped.value()), 0xbb);
109-
ASSERT_TRUE(cr.isComplete());
115+
ASSERT_TRUE(cr.isEmpty());
110116

111117
// should error if we try to pop another utf8 byte out
112118
popped = cr.pop_utf8();
@@ -121,42 +127,44 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, FourByte) {
121127
// testing utf32: 0x1f921 -> utf8: 0xf0 0x9f 0xa4 0xa1
122128
char32_t utf32 = 0x1f921;
123129
cr.push(utf32);
130+
ASSERT_TRUE(cr.isFull());
124131
auto popped = cr.pop_utf8();
125132
ASSERT_TRUE(popped.has_value());
126133
ASSERT_EQ(static_cast<int>(popped.value()), 0xf0);
127-
ASSERT_TRUE(!cr.isComplete());
134+
ASSERT_TRUE(!cr.isEmpty());
128135
popped = cr.pop_utf8();
129136
ASSERT_TRUE(popped.has_value());
130137
ASSERT_EQ(static_cast<int>(popped.value()), 0x9f);
131-
ASSERT_TRUE(!cr.isComplete());
138+
ASSERT_TRUE(!cr.isEmpty());
132139
popped = cr.pop_utf8();
133140
ASSERT_TRUE(popped.has_value());
134141
ASSERT_EQ(static_cast<int>(popped.value()), 0xa4);
135-
ASSERT_TRUE(!cr.isComplete());
142+
ASSERT_TRUE(!cr.isEmpty());
136143
popped = cr.pop_utf8();
137144
ASSERT_TRUE(popped.has_value());
138145
ASSERT_EQ(static_cast<int>(popped.value()), 0xa1);
139-
ASSERT_TRUE(cr.isComplete());
146+
ASSERT_TRUE(cr.isEmpty());
140147

141148
// testing utf32: 0x12121 -> utf8: 0xf0 0x92 0x84 0xa1
142149
utf32 = 0x12121;
143150
cr.push(utf32);
151+
ASSERT_TRUE(cr.isFull());
144152
popped = cr.pop_utf8();
145153
ASSERT_TRUE(popped.has_value());
146154
ASSERT_EQ(static_cast<int>(popped.value()), 0xf0);
147-
ASSERT_TRUE(!cr.isComplete());
155+
ASSERT_TRUE(!cr.isEmpty());
148156
popped = cr.pop_utf8();
149157
ASSERT_TRUE(popped.has_value());
150158
ASSERT_EQ(static_cast<int>(popped.value()), 0x92);
151-
ASSERT_TRUE(!cr.isComplete());
159+
ASSERT_TRUE(!cr.isEmpty());
152160
popped = cr.pop_utf8();
153161
ASSERT_TRUE(popped.has_value());
154162
ASSERT_EQ(static_cast<int>(popped.value()), 0x84);
155-
ASSERT_TRUE(!cr.isComplete());
163+
ASSERT_TRUE(!cr.isEmpty());
156164
popped = cr.pop_utf8();
157165
ASSERT_TRUE(popped.has_value());
158166
ASSERT_EQ(static_cast<int>(popped.value()), 0xa1);
159-
ASSERT_TRUE(cr.isComplete());
167+
ASSERT_TRUE(cr.isEmpty());
160168

161169
// should error if we try to pop another utf8 byte out
162170
popped = cr.pop_utf8();

libc/test/src/__support/wchar/utf8_to_32_test.cpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
1515
LIBC_NAMESPACE::internal::mbstate state;
16-
state.bytes_processed = 0;
16+
state.bytes_stored = 0;
1717
state.total_bytes = 0;
1818
char ch = 'A';
1919

@@ -28,7 +28,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
2828

2929
TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
3030
LIBC_NAMESPACE::internal::mbstate state;
31-
state.bytes_processed = 0;
31+
state.bytes_stored = 0;
3232
state.total_bytes = 0;
3333
const char ch[2] = {static_cast<char>(0xC2),
3434
static_cast<char>(0x8E)}; // Ž car symbol
@@ -44,7 +44,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
4444

4545
TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
4646
LIBC_NAMESPACE::internal::mbstate state;
47-
state.bytes_processed = 0;
47+
state.bytes_stored = 0;
4848
state.total_bytes = 0;
4949
const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
5050
static_cast<char>(0x91)}; // ∑ sigma symbol
@@ -61,7 +61,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
6161

6262
TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
6363
LIBC_NAMESPACE::internal::mbstate state;
64-
state.bytes_processed = 0;
64+
state.bytes_stored = 0;
6565
state.total_bytes = 0;
6666
const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
6767
static_cast<char>(0xA4),
@@ -80,7 +80,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
8080

8181
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
8282
LIBC_NAMESPACE::internal::mbstate state;
83-
state.bytes_processed = 0;
83+
state.bytes_stored = 0;
8484
state.total_bytes = 0;
8585
const char ch = static_cast<char>(0x80); // invalid starting bit sequence
8686

@@ -92,7 +92,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
9292

9393
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
9494
LIBC_NAMESPACE::internal::mbstate state;
95-
state.bytes_processed = 0;
95+
state.bytes_stored = 0;
9696
state.total_bytes = 0;
9797
const char ch[4] = {
9898
static_cast<char>(0x80), static_cast<char>(0x00), static_cast<char>(0x80),
@@ -112,7 +112,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
112112

113113
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {
114114
LIBC_NAMESPACE::internal::mbstate state;
115-
state.bytes_processed = 0;
115+
state.bytes_stored = 0;
116116
state.total_bytes = 0;
117117
// Last byte is invalid since it does not have correct starting sequence.
118118
// 0xC0 --> 11000000 starting sequence should be 10xxxxxx
@@ -132,7 +132,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {
132132

133133
TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
134134
LIBC_NAMESPACE::internal::mbstate state;
135-
state.bytes_processed = 0;
135+
state.bytes_stored = 0;
136136
state.total_bytes = 0;
137137
const char ch[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
138138
static_cast<char>(0x80)};
@@ -153,7 +153,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
153153

154154
TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
155155
LIBC_NAMESPACE::internal::mbstate state;
156-
state.bytes_processed = 0;
156+
state.bytes_stored = 0;
157157
state.total_bytes = 0;
158158
const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
159159
static_cast<char>(0xC7), static_cast<char>(0x8C)};
@@ -179,7 +179,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
179179

180180
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidPop) {
181181
LIBC_NAMESPACE::internal::mbstate state;
182-
state.bytes_processed = 0;
182+
state.bytes_stored = 0;
183183
state.total_bytes = 0;
184184
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
185185
const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)};

0 commit comments

Comments
 (0)