mirror of
https://github.com/Telecominfraproject/OpenCellular.git
synced 2025-11-27 11:44:02 +00:00
The right implementation of CGPT label conversion between UTF8 and UTF16.
For security quick fix, the original UTF8/UTF16 conversion only supports ASCII area. This CL extends the library to support multiple code units conversion between UTF8 and UTF16. The UTF8/UTF16 encoded byte(s) would be decoded to code point first, then be encoded to UTF16/UTF8 correspondingly. Bill, please kindly review the UTF8/UTF16 conversion. Peter, please kindly comment if any security concern. Thanks. Change-Id: I99c558ff27556e0b8635ba2b8d9925d042e75cb2 BUG=chromium-os:7542 TEST=RUNTESTS=1 emerge-x86-generic vboot_reference Manually tested the following commands (intentionally mix Chinese and ASCII): export C=.../cgpt export D=/tmp/hda $C add $D -i 1 -l 批P踢T踢T許C夕C餐 $C find $D -l 批P踢T踢T許C夕C餐 $C show $D $C add $D -i 1 -l 批P踢T踢T許C夕C餐 $C find $D -l 批P踢T踢T許C夕C餐 $C add $D -i 1 -l abc012 $C add $D -i 1 -l 是否看過坊間常見的許茹芸淚海慶功宴吃蓋飯第四集 $C add $D -i 1 -l 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ # ok and truncated $C add $D -i 1 -l `printf "\xf4\x91\x81\x81"` # (EXPECT: failed) $C add $D -i 1 -l `printf "\xf4\x8f\xbf\xbf"` $C add $D -i 1 -l `printf "\xf4\x8f\x44\x44"` # (EXPECT: failed) $C add $D -i 1 -l `printf "\xf4\x8f\xbf"` # (EXPECT: failed) $C add $D -i 1 -l `printf "\xf0\xbf\xbf\xbf"` $C add $D -i 1 -l `printf "\xf0\xbf\xbf\x44"` # (EXPECT: failed) $C add $D -i 1 -l `printf "\xf0\x80\x80\x80"` # (EXPECT: failed) $C add $D -i 1 -l `printf "\xf0\x80\x84\x80"` # (EXPECT: failed) $C add $D -i 1 -l `printf "\xf0\x80\x90\x80"` # (EXPECT: failed) $C add $D -i 1 -l `printf "\xf0\x88\x80\x80"` # (EXPECT: failed) $C add $D -i 1 -l `printf "\xed\x80\x80"` $C add $D -i 1 -l `printf "\xed\xa0\x80"` # (EXPECT: failed) $C add $D -i 1 -l `printf "\xe0\xbf\xbf"` $C add $D -i 1 -l `printf "\xe0\xbf\x44"` # (EXPECT: failed) $C add $D -i 1 -l `printf "\xe0\x80\x80"` # (EXPECT: failed) $C add $D -i 1 -l `printf "\xe0\x90\x80"` # (EXPECT: failed) $C add $D -i 1 -l `printf "\xe0\xbf"` # (EXPECT: failed) $C add $D -i 1 -l `printf "\xd0\x80"` $C add $D -i 1 -l `printf "\xd0\x11"` # (EXPECT: 
failed) $C add $D -i 1 -l `printf "\xd0"` # (EXPECT: failed) $C add $D -i 1 -l `printf "\xc0\xaf"` # (EXPECT: failed) $C add $D -i 1 -l `printf "\x80"` # (EXPECT: failed) Review URL: http://codereview.chromium.org/5025003
This commit is contained in:
16
cgpt/cgpt.h
16
cgpt/cgpt.h
@@ -81,14 +81,22 @@ int WritePMBR(struct drive *drive);
|
|||||||
|
|
||||||
/* Convert possibly unterminated UTF16 string to UTF8.
|
/* Convert possibly unterminated UTF16 string to UTF8.
|
||||||
* Caller must prepare enough space for UTF8, which could be up to
|
* Caller must prepare enough space for UTF8, which could be up to
|
||||||
* twice the number of UTF16 chars plus the terminating '\0'.
|
* twice the byte length of UTF16 string plus the terminating '\0'.
|
||||||
|
*
|
||||||
|
* Return: CGPT_OK --- all characters are converted successfully.
|
||||||
|
* CGPT_FAILED --- conversion error, e.g. output buffer is too short.
|
||||||
*/
|
*/
|
||||||
void UTF16ToUTF8(const uint16_t *utf16, unsigned int maxinput,
|
int UTF16ToUTF8(const uint16_t *utf16, unsigned int maxinput,
|
||||||
uint8_t *utf8, unsigned int maxoutput);
|
uint8_t *utf8, unsigned int maxoutput);
|
||||||
|
|
||||||
/* Convert null-terminated UTF8 string to UTF16.
|
/* Convert null-terminated UTF8 string to UTF16.
|
||||||
* Caller must prepare enough space for UTF16, including a terminating 0x0000
|
* Caller must prepare enough space for UTF16, which is the byte length of UTF8
|
||||||
|
* plus the terminating 0x0000.
|
||||||
|
*
|
||||||
|
* Return: CGPT_OK --- all characters are converted successfully.
|
||||||
|
* CGPT_FAILED --- conversion error, e.g. output buffer is too short.
|
||||||
*/
|
*/
|
||||||
void UTF8ToUTF16(const uint8_t *utf8, uint16_t *utf16, unsigned int maxoutput);
|
int UTF8ToUTF16(const uint8_t *utf8, uint16_t *utf16, unsigned int maxoutput);
|
||||||
|
|
||||||
/* Helper functions for supported GPT types. */
|
/* Helper functions for supported GPT types. */
|
||||||
int ResolveType(const Guid *type, char *buf);
|
int ResolveType(const Guid *type, char *buf);
|
||||||
|
|||||||
@@ -350,56 +350,209 @@ void GuidToStr(const Guid *guid, char *str, unsigned int buflen) {
|
|||||||
|
|
||||||
/* Convert possibly unterminated UTF16 string to UTF8.
|
/* Convert possibly unterminated UTF16 string to UTF8.
|
||||||
* Caller must prepare enough space for UTF8, which could be up to
|
* Caller must prepare enough space for UTF8, which could be up to
|
||||||
* twice the number of UTF16 chars plus the terminating '\0'.
|
* twice the byte length of UTF16 string plus the terminating '\0'.
|
||||||
* FIXME(wfrichar): The original implementation had security issues. As a
|
* See the following table for encoding lengths.
|
||||||
* temporary fix, I'm making this ONLY support ASCII codepoints. Bug 7542
|
*
|
||||||
* (http://code.google.com/p/chromium-os/issues/detail?id=7542) is filed to fix
|
* Code point UTF16 UTF8
|
||||||
* this.
|
* 0x0000-0x007F 2 bytes 1 byte
|
||||||
|
* 0x0080-0x07FF 2 bytes 2 bytes
|
||||||
|
* 0x0800-0xFFFF 2 bytes 3 bytes
|
||||||
|
* 0x10000-0x10FFFF 4 bytes 4 bytes
|
||||||
|
*
|
||||||
|
* This function uses a simple state meachine to convert UTF-16 char(s) to
|
||||||
|
* a code point. Once a code point is parsed out, the state machine throws
|
||||||
|
* out sequencial UTF-8 chars in one time.
|
||||||
|
*
|
||||||
|
* Return: CGPT_OK --- all character are converted successfully.
|
||||||
|
* CGPT_FAILED --- convert error, i.e. output buffer is too short.
|
||||||
*/
|
*/
|
||||||
void UTF16ToUTF8(const uint16_t *utf16, unsigned int maxinput,
|
int UTF16ToUTF8(const uint16_t *utf16, unsigned int maxinput,
|
||||||
uint8_t *utf8, unsigned int maxoutput)
|
uint8_t *utf8, unsigned int maxoutput)
|
||||||
{
|
{
|
||||||
size_t s16idx, s8idx;
|
size_t s16idx, s8idx;
|
||||||
uint32_t utfchar;
|
uint32_t code_point;
|
||||||
|
int code_point_ready = 1; // code point is ready to output.
|
||||||
|
int retval = CGPT_OK;
|
||||||
|
|
||||||
if (!utf16 || !maxinput || !utf8 || !maxoutput)
|
if (!utf16 || !maxinput || !utf8 || !maxoutput)
|
||||||
return;
|
return CGPT_FAILED;
|
||||||
|
|
||||||
maxoutput--; /* plan for termination now */
|
maxoutput--; /* plan for termination now */
|
||||||
|
|
||||||
for (s16idx = s8idx = 0;
|
for (s16idx = s8idx = 0;
|
||||||
s16idx < maxinput && utf16[s16idx] && maxoutput;
|
s16idx < maxinput && utf16[s16idx] && maxoutput;
|
||||||
s16idx++, maxoutput--) {
|
s16idx++) {
|
||||||
utfchar = le16toh(utf16[s16idx]);
|
uint16_t codeunit = le16toh(utf16[s16idx]);
|
||||||
utf8[s8idx++] = utfchar & 0x7F;
|
|
||||||
|
if (code_point_ready) {
|
||||||
|
if (codeunit >= 0xD800 && codeunit <= 0xDBFF) {
|
||||||
|
/* high surrogate, need the low surrogate. */
|
||||||
|
code_point_ready = 0;
|
||||||
|
code_point = (codeunit & 0x03FF) + 0x0040;
|
||||||
|
} else {
|
||||||
|
/* BMP char, output it. */
|
||||||
|
code_point = codeunit;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
/* expect the low surrogate */
|
||||||
|
if (codeunit >= 0xDC00 && codeunit <= 0xDFFF) {
|
||||||
|
code_point = (code_point << 10) | (codeunit & 0x03FF);
|
||||||
|
code_point_ready = 1;
|
||||||
|
} else {
|
||||||
|
/* the second code unit is NOT the low surrogate. Unexpected. */
|
||||||
|
retval = CGPT_FAILED;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If UTF code point is ready, output it. */
|
||||||
|
if (code_point_ready) {
|
||||||
|
require(code_point <= 0x10FFFF);
|
||||||
|
if (code_point <= 0x7F && maxoutput >= 1) {
|
||||||
|
maxoutput -= 1;
|
||||||
|
utf8[s8idx++] = code_point & 0x7F;
|
||||||
|
} else if (code_point <= 0x7FF && maxoutput >= 2) {
|
||||||
|
maxoutput -= 2;
|
||||||
|
utf8[s8idx++] = 0xC0 | (code_point >> 6);
|
||||||
|
utf8[s8idx++] = 0x80 | (code_point & 0x3F);
|
||||||
|
} else if (code_point <= 0xFFFF && maxoutput >= 3) {
|
||||||
|
maxoutput -= 3;
|
||||||
|
utf8[s8idx++] = 0xE0 | (code_point >> 12);
|
||||||
|
utf8[s8idx++] = 0x80 | ((code_point >> 6) & 0x3F);
|
||||||
|
utf8[s8idx++] = 0x80 | (code_point & 0x3F);
|
||||||
|
} else if (code_point <= 0x10FFFF && maxoutput >= 4) {
|
||||||
|
maxoutput -= 4;
|
||||||
|
utf8[s8idx++] = 0xF0 | (code_point >> 18);
|
||||||
|
utf8[s8idx++] = 0x80 | ((code_point >> 12) & 0x3F);
|
||||||
|
utf8[s8idx++] = 0x80 | ((code_point >> 6) & 0x3F);
|
||||||
|
utf8[s8idx++] = 0x80 | (code_point & 0x3F);
|
||||||
|
} else {
|
||||||
|
/* buffer underrun */
|
||||||
|
retval = CGPT_FAILED;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
utf8[s8idx++] = 0;
|
utf8[s8idx++] = 0;
|
||||||
|
return retval;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Convert UTF8 string to UTF16. The UTF8 string must be null-terminated.
|
/* Convert UTF8 string to UTF16. The UTF8 string must be null-terminated.
|
||||||
* Caller must prepare enough space for UTF16, including a terminating 0x0000.
|
* Caller must prepare enough space for UTF16, including a terminating 0x0000.
|
||||||
* FIXME(wfrichar): The original implementation had security issues. As a
|
* See the following table for encoding lengths. In any case, the caller
|
||||||
* temporary fix, I'm making this ONLY support ASCII codepoints. Bug 7542
|
* just needs to prepare the byte length of UTF8 plus the terminating 0x0000.
|
||||||
* (http://code.google.com/p/chromium-os/issues/detail?id=7542) is filed to fix
|
*
|
||||||
* this.
|
* Code point UTF16 UTF8
|
||||||
|
* 0x0000-0x007F 2 bytes 1 byte
|
||||||
|
* 0x0080-0x07FF 2 bytes 2 bytes
|
||||||
|
* 0x0800-0xFFFF 2 bytes 3 bytes
|
||||||
|
* 0x10000-0x10FFFF 4 bytes 4 bytes
|
||||||
|
*
|
||||||
|
* This function converts UTF8 chars to a code point first. Then, convrts it
|
||||||
|
* to UTF16 code unit(s).
|
||||||
|
*
|
||||||
|
* Return: CGPT_OK --- all character are converted successfully.
|
||||||
|
* CGPT_FAILED --- convert error, i.e. output buffer is too short.
|
||||||
*/
|
*/
|
||||||
void UTF8ToUTF16(const uint8_t *utf8, uint16_t *utf16, unsigned int maxoutput)
|
int UTF8ToUTF16(const uint8_t *utf8, uint16_t *utf16, unsigned int maxoutput)
|
||||||
{
|
{
|
||||||
size_t s16idx, s8idx;
|
size_t s16idx, s8idx;
|
||||||
uint32_t utfchar;
|
uint32_t code_point = 0;
|
||||||
|
unsigned int expected_units = 1;
|
||||||
|
unsigned int decoded_units = 1;
|
||||||
|
int retval = CGPT_OK;
|
||||||
|
|
||||||
if (!utf8 || !utf16 || !maxoutput)
|
if (!utf8 || !utf16 || !maxoutput)
|
||||||
return;
|
return CGPT_FAILED;
|
||||||
|
|
||||||
maxoutput--; /* plan for termination */
|
maxoutput--; /* plan for termination */
|
||||||
|
|
||||||
for (s8idx = s16idx = 0;
|
for (s8idx = s16idx = 0;
|
||||||
utf8[s8idx] && maxoutput;
|
utf8[s8idx] && maxoutput;
|
||||||
s8idx++, maxoutput--) {
|
s8idx++) {
|
||||||
utfchar = utf8[s8idx];
|
uint8_t code_unit;
|
||||||
utf16[s16idx++] = utfchar & 0x7F;
|
code_unit = utf8[s8idx];
|
||||||
|
|
||||||
|
if (expected_units != decoded_units) {
|
||||||
|
/* Trailing bytes of multi-byte character */
|
||||||
|
if ((code_unit & 0xC0) == 0x80) {
|
||||||
|
code_point = (code_point << 6) | (code_unit & 0x3F);
|
||||||
|
++decoded_units;
|
||||||
|
} else {
|
||||||
|
/* Unexpected code unit. */
|
||||||
|
retval = CGPT_FAILED;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
/* parsing a new code point. */
|
||||||
|
decoded_units = 1;
|
||||||
|
if (code_unit <= 0x7F) {
|
||||||
|
code_point = code_unit;
|
||||||
|
expected_units = 1;
|
||||||
|
} else if (code_unit <= 0xBF) {
|
||||||
|
/* 0x80-0xBF must NOT be the heading byte unit of a new code point. */
|
||||||
|
retval = CGPT_FAILED;
|
||||||
|
break;
|
||||||
|
} else if (code_unit >= 0xC2 && code_unit <= 0xDF) {
|
||||||
|
code_point = code_unit & 0x1F;
|
||||||
|
expected_units = 2;
|
||||||
|
} else if (code_unit >= 0xE0 && code_unit <= 0xEF) {
|
||||||
|
code_point = code_unit & 0x0F;
|
||||||
|
expected_units = 3;
|
||||||
|
} else if (code_unit >= 0xF0 && code_unit <= 0xF4) {
|
||||||
|
code_point = code_unit & 0x07;
|
||||||
|
expected_units = 4;
|
||||||
|
} else {
|
||||||
|
/* illegal code unit: 0xC0-0xC1, 0xF5-0xFF */
|
||||||
|
retval = CGPT_FAILED;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If no more unit is needed, output the UTF16 unit(s). */
|
||||||
|
if (expected_units == decoded_units) {
|
||||||
|
/* Check if the encoding is the shortest possible UTF-8 sequence. */
|
||||||
|
switch (expected_units) {
|
||||||
|
case 2:
|
||||||
|
if (code_point <= 0x7F) retval = CGPT_FAILED;
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
if (code_point <= 0x7FF) retval = CGPT_FAILED;
|
||||||
|
break;
|
||||||
|
case 4:
|
||||||
|
if (code_point <= 0xFFFF) retval = CGPT_FAILED;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (retval == CGPT_FAILED) break; /* leave immediately */
|
||||||
|
|
||||||
|
if ((code_point <= 0xD7FF) ||
|
||||||
|
(code_point >= 0xE000 && code_point <= 0xFFFF)) {
|
||||||
|
utf16[s16idx++] = code_point;
|
||||||
|
maxoutput -= 1;
|
||||||
|
} else if (code_point >= 0x10000 && code_point <= 0x10FFFF &&
|
||||||
|
maxoutput >= 2) {
|
||||||
|
utf16[s16idx++] = 0xD800 | ((code_point >> 10) - 0x0040);
|
||||||
|
utf16[s16idx++] = 0xDC00 | (code_point & 0x03FF);
|
||||||
|
maxoutput -= 2;
|
||||||
|
} else {
|
||||||
|
/* Three possibilities fall into here. Both are failure cases.
|
||||||
|
* a. surrogate pair (non-BMP characters; 0xD800~0xDFFF)
|
||||||
|
* b. invalid code point > 0x10FFFF
|
||||||
|
* c. buffer underrun
|
||||||
|
*/
|
||||||
|
retval = CGPT_FAILED;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* A null-terminator shows up before the UTF8 sequence ends. */
|
||||||
|
if (expected_units != decoded_units) {
|
||||||
|
retval = CGPT_FAILED;
|
||||||
|
}
|
||||||
|
|
||||||
utf16[s16idx++] = 0;
|
utf16[s16idx++] = 0;
|
||||||
|
return retval;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct {
|
struct {
|
||||||
|
|||||||
@@ -251,8 +251,11 @@ int cmd_add(int argc, char *argv[]) {
|
|||||||
if (set_unique)
|
if (set_unique)
|
||||||
memcpy(&entry->unique, &unique_guid, sizeof(Guid));
|
memcpy(&entry->unique, &unique_guid, sizeof(Guid));
|
||||||
if (label) {
|
if (label) {
|
||||||
UTF8ToUTF16((uint8_t *)label, entry->name,
|
if (CGPT_OK != UTF8ToUTF16((uint8_t *)label, entry->name,
|
||||||
sizeof(entry->name) / sizeof(entry->name[0]));
|
sizeof(entry->name) / sizeof(entry->name[0]))) {
|
||||||
|
Error("The label cannot be converted to UTF16.\n");
|
||||||
|
goto bad;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (set_raw) {
|
if (set_raw) {
|
||||||
entry->attrs.fields.gpt_att = raw_value;
|
entry->attrs.fields.gpt_att = raw_value;
|
||||||
|
|||||||
@@ -181,8 +181,12 @@ static int do_search(char *filename) {
|
|||||||
(set_type && !memcmp(&type_guid, &entry->type, sizeof(Guid)))) {
|
(set_type && !memcmp(&type_guid, &entry->type, sizeof(Guid)))) {
|
||||||
found = 1;
|
found = 1;
|
||||||
} else if (set_label) {
|
} else if (set_label) {
|
||||||
UTF16ToUTF8(entry->name, sizeof(entry->name) / sizeof(entry->name[0]),
|
if (CGPT_OK != UTF16ToUTF8(entry->name,
|
||||||
(uint8_t *)partlabel, sizeof(partlabel));
|
sizeof(entry->name) / sizeof(entry->name[0]),
|
||||||
|
(uint8_t *)partlabel, sizeof(partlabel))) {
|
||||||
|
Error("The label cannot be converted to UTF16, so abort.\n");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
if (!strncmp(label, partlabel, sizeof(partlabel))) {
|
if (!strncmp(label, partlabel, sizeof(partlabel))) {
|
||||||
found = 1;
|
found = 1;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user