py/objstr: Add check for valid UTF-8 when making a str from bytes.

This patch adds a function utf8_check() to check for a valid UTF-8 encoded
string, and calls it when constructing a str from raw bytes.  The feature
is selectable at compile time via MICROPY_PY_BUILTINS_STR_UNICODE_CHECK and
is enabled if unicode is enabled.  It costs about 110 bytes on Thumb-2, 150
bytes on Xtensa and 170 bytes on x86-64.
This commit is contained in:
tll
2017-06-24 08:38:32 +08:00
committed by Damien George
parent 069fc48bf6
commit 68c28174d0
5 changed files with 58 additions and 0 deletions

View File

@@ -182,3 +182,31 @@ mp_uint_t unichar_xdigit_value(unichar c) {
}
return n;
}
// Check whether the len bytes starting at p form a structurally valid UTF-8
// sequence: every lead byte (0xc0..0xf7) must be followed by exactly the
// number of continuation bytes (0x80..0xbf) that it announces, continuation
// bytes may not appear on their own, and the data may not end in the middle
// of a character.  An empty input (len == 0) is valid.
//
// Returns true if the input passes these checks, false otherwise.
//
// Note: this is a structural check only.  Overlong encodings (e.g. c0 80),
// encoded surrogates (ed a0 80 .. ed bf bf) and lead bytes f5..f7 (code
// points above U+10FFFF) are not rejected.
bool utf8_check(const byte *p, size_t len) {
    uint8_t need = 0; // continuation bytes still expected for the current character
    const byte *end = p + len;
    for (; p < end; p++) {
        byte c = *p;
        if (need) {
            // A continuation byte must have the form 10xxxxxx (0x80..0xbf).
            // Testing only c >= 0x80 would also accept lead bytes and
            // 0xf8..0xff in the middle of a character.
            if ((c & 0xc0) != 0x80) {
                return false;
            }
            need--;
        } else if (c >= 0xc0) {
            if (c >= 0xf8) {
                // 0xf8..0xff never start a character
                return false;
            }
            // 0xe5 (binary 11 10 01 01) packs the continuation counts as
            // 2-bit fields, selected by bits 4-5 of the lead byte:
            //   0xc0..0xdf -> 1, 0xe0..0xef -> 2, 0xf0..0xf7 -> 3
            need = (0xe5 >> ((c >> 3) & 0x6)) & 3;
        } else if (c >= 0x80) {
            // continuation byte without a preceding lead byte
            return false;
        }
    }
    // a character still waiting for continuation bytes means truncated input
    return need == 0;
}