py/objstr: Add check for valid UTF-8 when making a str from bytes.

This patch adds a function utf8_check() to check for a valid UTF-8 encoded string, and calls it when constructing a str from raw bytes. The feature is selectable at compile time via MICROPY_PY_BUILTINS_STR_UNICODE_CHECK and is enabled if unicode is enabled. It costs about 110 bytes on Thumb-2, 150 bytes on Xtensa and 170 bytes on x86-64.
2017-06-24 08:38:32 +08:00
parent 069fc48bf6
commit 68c28174d0
5 changed files with 58 additions and 0 deletions
--- a/py/unicode.c
+++ b/py/unicode.c
@@ -182,3 +182,31 @@ mp_uint_t unichar_xdigit_value(unichar c) {
    }
    return n;
 }
+
+// Check whether the len bytes starting at p form structurally valid UTF-8.
+//
+// p   - pointer to the first byte to examine
+// len - number of bytes to examine
+//
+// Returns true if every lead byte (0xc0..0xf7) is followed by the number of
+// continuation bytes (0x80..0xbf) it announces and no continuation byte
+// appears on its own; an empty buffer is valid.  Returns false otherwise,
+// including when the data ends in the middle of a sequence.
+//
+// Only the byte structure is checked: overlong encodings, surrogate code
+// points and values above U+10FFFF are not rejected.
+bool utf8_check(const byte *p, size_t len) {
+    uint8_t need = 0; // continuation bytes still owed to the current lead byte
+    const byte *end = p + len;
+    for (; p < end; p++) {
+        byte c = *p;
+        if (need) {
+            // A continuation byte must be 10xxxxxx; a plain ">= 0x80" test
+            // would also accept lead bytes and 0xf8..0xff here.
+            if ((c & 0xc0) != 0x80) {
+                return false;
+            }
+            need--;
+        } else if (c >= 0xc0) {
+            if (c >= 0xf8) {
+                // 11111xxx never starts a sequence
+                return false;
+            }
+            // Bits 4 and 5 of the lead byte select a 2-bit field of 0xe5:
+            // 110xxxxx -> 1, 1110xxxx -> 2, 11110xxx -> 3.
+            need = (0xe5 >> ((c >> 3) & 0x6)) & 3;
+        } else if (c >= 0x80) {
+            // continuation byte with no lead byte before it
+            return false;
+        }
+    }
+    return need == 0; // no pending fragments allowed
+}