Reimplement the utf8 string check (#389)

The previous implementation did not reject overlong encodings.

Signed-off-by: Xiaokang Qin <xiaokang.qxk@antgroup.com>
This commit is contained in:
Xiaokang Qin 2020-09-18 18:06:13 +08:00 committed by GitHub
parent 0226dbbb3d
commit 2d06567cd1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -250,36 +250,72 @@ loader_malloc(uint64 size, char *error_buf, uint32 error_buf_size)
/**
 * Check whether a byte buffer is a well-formed UTF-8 string.
 *
 * Only the lead/continuation byte ranges listed in the Unicode
 * "Well-Formed UTF-8 Byte Sequences" table are accepted, therefore
 * overlong encodings (lead bytes 0xC0/0xC1, 0xE0 followed by a byte
 * below 0xA0, 0xF0 followed by a byte below 0x90), UTF-16 surrogates
 * (0xED followed by a byte above 0x9F), lead bytes above 0xF4, stray
 * continuation bytes and truncated sequences are all rejected.
 *
 * @param str the bytes to check; a terminating NUL is not required
 * @param len the number of bytes to check
 *
 * @return true if all len bytes form well-formed UTF-8 (an empty
 *         buffer is accepted), false otherwise
 */
static bool
check_utf8_str(const uint8* str, uint32 len)
{
    /* The valid ranges are taken from page 125, below link
       https://www.unicode.org/versions/Unicode9.0.0/ch03.pdf */
    const uint8 *p = str, *p_end = str + len;
    uint8 chr, second_min, second_max;
    uint32 remain, n_bytes, i;

    while (p < p_end) {
        chr = *p;

        /* Single byte (ASCII) */
        if (chr < 0x80) {
            p++;
            continue;
        }

        /* Select the sequence length and the allowed range of the
           second byte from the lead byte; by default the second byte
           is an ordinary continuation byte 0x80..0xBF */
        second_min = 0x80;
        second_max = 0xBF;

        if (chr >= 0xC2 && chr <= 0xDF) {
            n_bytes = 2;
        }
        else if (chr >= 0xE0 && chr <= 0xEF) {
            n_bytes = 3;
            if (chr == 0xE0) {
                /* Below 0xA0 would be an overlong encoding */
                second_min = 0xA0;
            }
            else if (chr == 0xED) {
                /* Above 0x9F would encode a surrogate U+D800..U+DFFF */
                second_max = 0x9F;
            }
        }
        else if (chr >= 0xF0 && chr <= 0xF4) {
            n_bytes = 4;
            if (chr == 0xF0) {
                /* Below 0x90 would be an overlong encoding */
                second_min = 0x90;
            }
            else if (chr == 0xF4) {
                /* Above 0x8F would exceed U+10FFFF */
                second_max = 0x8F;
            }
        }
        else {
            /* 0x80..0xC1 and 0xF5..0xFF can never start a sequence */
            return false;
        }

        /* Compare against the remaining length instead of computing
           p + n_bytes, so that no pointer beyond one-past-the-end of
           the buffer is ever formed */
        remain = (uint32)(p_end - p);
        if (remain < n_bytes) {
            /* Truncated sequence */
            return false;
        }

        if (p[1] < second_min || p[1] > second_max) {
            return false;
        }

        /* All further bytes must be 10XXXXXX */
        for (i = 2; i < n_bytes; i++) {
            if (p[i] < 0x80 || p[i] > 0xBF) {
                return false;
            }
        }

        p += n_bytes;
    }

    return (p == p_end);
}
static char*