py/objstr: Add check for valid UTF-8 when making a str from bytes.
This patch adds a function utf8_check() to check for a valid UTF-8 encoded string, and calls it when constructing a str from raw bytes. The feature is selectable at compile time via MICROPY_PY_BUILTINS_STR_UNICODE_CHECK and is enabled if unicode is enabled. It costs about 110 bytes on Thumb-2, 150 bytes on Xtensa and 170 bytes on x86-64.
This commit is contained in:
28
py/unicode.c
28
py/unicode.c
@@ -182,3 +182,31 @@ mp_uint_t unichar_xdigit_value(unichar c) {
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
// Check whether the len bytes starting at p form a structurally valid UTF-8
// sequence: every lead byte must be followed by the number of continuation
// bytes it announces, and no continuation byte may appear on its own.
//
// Returns true if the buffer is well-structured (an empty buffer is valid),
// false otherwise.
//
// Note: only the byte structure is verified.  Overlong encodings, UTF-16
// surrogate code points and values above U+10FFFF are not rejected.
bool utf8_check(const byte *p, size_t len) {
    uint8_t need = 0; // continuation bytes still expected for the current character
    const byte *end = p + len;
    for (; p < end; p++) {
        byte c = *p;
        if (need) {
            // A continuation byte must have the form 10xxxxxx.  Testing only
            // c >= 0x80 would also accept lead bytes and 0xf8..0xff here.
            if ((c & 0xc0) == 0x80) {
                need--;
            } else {
                // mismatch
                return false;
            }
        } else {
            if (c >= 0xc0) {
                if (c >= 0xf8) {
                    // mismatch: 5- and 6-byte forms are not valid UTF-8
                    return false;
                }
                // 0xe5 is a packed table of 2-bit entries (1, 1, 2, 3) indexed
                // by bits 4..5 of the lead byte: 0xc0..0xdf need 1 continuation
                // byte, 0xe0..0xef need 2 and 0xf0..0xf7 need 3.
                need = (0xe5 >> ((c >> 3) & 0x6)) & 3;
            } else if (c >= 0x80) {
                // mismatch: continuation byte without a preceding lead byte
                return false;
            }
        }
    }
    return need == 0; // no pending fragments allowed
}
|
||||
|
||||
Reference in New Issue
Block a user