Это моё решение на чистом ANSI C, включая модульный тест для граничных случаев.
Учтите, что ширина типа int должна быть не менее 32 бит; в противном случае вам придётся изменить определение типа codepoint.
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
/* One raw octet of UTF-8 input. */
typedef unsigned char byte;
/* A decoded code point; the type must be able to hold 21-bit values. */
typedef unsigned int codepoint;
/**
 * Reads the next UTF-8-encoded character from the byte array ranging
 * from {@code *pstart} up to, but not including, {@code end}.
 *
 * On success the {@code *pstart} iterator is advanced past the decoded
 * sequence, the codepoint is stored into {@code *pcp}, and the function
 * returns 0.
 *
 * On failure {@code errno} is set to {@code EILSEQ}, the function
 * returns -1, and neither {@code *pstart} nor {@code *pcp} is modified.
 * The conversion fails when the range is empty, the sequence is
 * truncated, the lead or a continuation byte is malformed, the encoding
 * is overlong, or the result is a surrogate (U+D800..U+DFFF) or lies
 * beyond U+10FFFF.
 */
int
from_utf8(const byte **pstart, const byte *end, codepoint *pcp) {
    size_t len, i;
    codepoint cp, min;
    const byte *buf;

    buf = *pstart;
    /* Empty range; also rejects a start iterator that is already past end. */
    if (buf >= end) {
        goto error;
    }

    /*
     * Classify the lead byte: len is the total sequence length, min is the
     * smallest codepoint that legitimately needs that many bytes (anything
     * below it is an overlong encoding), cp starts with the payload bits of
     * the lead byte.
     */
    if (buf[0] < 0x80) {
        len = 1;
        min = 0;
        cp = buf[0];
    } else if (buf[0] < 0xC0) {
        /* A continuation byte cannot start a sequence. */
        goto error;
    } else if (buf[0] < 0xE0) {
        len = 2;
        min = (codepoint)1 << 7;
        cp = buf[0] & 0x1F;
    } else if (buf[0] < 0xF0) {
        len = 3;
        min = (codepoint)1 << (5 + 6);
        cp = buf[0] & 0x0F;
    } else if (buf[0] < 0xF8) {
        len = 4;
        min = (codepoint)1 << (4 + 6 + 6);
        cp = buf[0] & 0x07;
    } else {
        goto error;
    }

    /*
     * Truncated sequence. Compare lengths instead of forming buf + len,
     * which is undefined behavior once it points more than one element
     * past the end of the array.
     */
    if ((size_t)(end - buf) < len) {
        goto error;
    }

    /* Every trailing byte must look like 10xxxxxx and adds six bits. */
    for (i = 1; i < len; i++) {
        if ((buf[i] & 0xC0) != 0x80) {
            goto error;
        }
        cp = (cp << 6) | (buf[i] & 0x3F);
    }

    if (cp < min) {
        goto error; /* overlong encoding */
    }
    if (0xD800 <= cp && cp <= 0xDFFF) {
        goto error; /* UTF-16 surrogate */
    }
    if (0x110000 <= cp) {
        goto error; /* beyond the last Unicode codepoint */
    }

    *pstart += len;
    *pcp = cp;
    return 0;

error:
    errno = EILSEQ;
    return -1;
}
/*
 * Test helper: decodes one character from [*buf, end) with from_utf8 and
 * checks that it equals expected. On success *buf has been advanced past
 * the decoded sequence. If decoding fails or the value differs, a
 * diagnostic is written to stderr and the process exits with EXIT_FAILURE.
 */
static void
assert_valid(const byte **buf, const byte *end, codepoint expected) {
    codepoint decoded;
    int status;

    status = from_utf8(buf, end, &decoded);
    if (status == -1) {
        fprintf(stderr, "invalid unicode sequence for codepoint %u\n", expected);
        exit(EXIT_FAILURE);
    }

    if (decoded == expected) {
        return;
    }

    fprintf(stderr, "expected %u, got %u\n", expected, decoded);
    exit(EXIT_FAILURE);
}
/*
 * Test helper: checks that from_utf8 rejects one length-prefixed record.
 *
 * *buf points at a record of the form { n, b1, ..., bn } located inside
 * the table that stops at end. Exactly the n bytes of the record are
 * offered to the decoder, so a truncated sequence is tested as truncated
 * rather than being continued with bytes of the following record.
 *
 * name is a human-readable spelling of the sequence used in diagnostics.
 * On return *buf points at the next record. If the record does not fit
 * in the table, or if the sequence unexpectedly decodes, a diagnostic is
 * written to stderr and the process exits with EXIT_FAILURE.
 */
static void
assert_invalid(const char *name, const byte **buf, const byte *end) {
    const byte *record, *p, *seq_end;
    size_t seq_len;
    codepoint cp;

    record = *buf;
    /* The length byte and the bytes it announces must lie inside the table. */
    if (record >= end || (size_t)(end - record) - 1 < record[0]) {
        fprintf(stderr, "test record \"%s\" overruns the table.\n", name);
        exit(EXIT_FAILURE);
    }

    seq_len = record[0];
    p = record + 1;
    seq_end = p + seq_len;

    if (from_utf8(&p, seq_end, &cp) == 0) {
        fprintf(stderr, "unicode sequence \"%s\" unexpectedly converts to %#x.\n", name, cp);
        exit(EXIT_FAILURE);
    }

    /* Skip the length byte and the sequence, whatever the decoder did. */
    *buf = seq_end;
}
/*
 * Well-formed UTF-8 sequences stored back to back, covering the boundary
 * values of each encoding length. main() decodes them in this order and
 * compares each result with the codepoint noted on the right.
 */
static const byte valid[] = {
0x00, /* first ASCII: U+0000 */
0x7F, /* last ASCII: U+007F */
0xC2, 0x80, /* first two-byte: U+0080 */
0xDF, 0xBF, /* last two-byte: U+07FF */
0xE0, 0xA0, 0x80, /* first three-byte: U+0800 */
0xED, 0x9F, 0xBF, /* last before surrogates: U+D7FF */
0xEE, 0x80, 0x80, /* first after surrogates: U+E000 */
0xEF, 0xBF, 0xBF, /* last three-byte: U+FFFF */
0xF0, 0x90, 0x80, 0x80, /* first four-byte: U+10000 */
0xF4, 0x8F, 0xBF, 0xBF /* last codepoint: U+10FFFF */
};
/*
 * Ill-formed UTF-8 sequences. Each record is a length byte followed by
 * that many sequence bytes; assert_invalid() starts decoding right after
 * the length byte and uses the length to step to the next record.
 */
static const byte invalid[] = {
1, 0x80, /* continuation byte used as lead byte */
1, 0xC0, /* two-byte lead without its continuation byte */
1, 0xC1, /* two-byte lead without its continuation byte */
2, 0xC0, 0x80, /* overlong two-byte encoding of U+0000 */
2, 0xC2, 0x00, /* second byte is not a continuation byte */
2, 0xC2, 0x7F, /* second byte just below the continuation range */
2, 0xC2, 0xC0, /* second byte just above the continuation range */
3, 0xE0, 0x80, 0x80, /* overlong three-byte encoding of U+0000 */
3, 0xE0, 0x9F, 0xBF, /* overlong three-byte encoding of U+07FF */
3, 0xED, 0xA0, 0x80, /* first surrogate, U+D800 */
3, 0xED, 0xBF, 0xBF, /* last surrogate, U+DFFF */
4, 0xF0, 0x80, 0x80, 0x80, /* overlong four-byte encoding of U+0000 */
4, 0xF0, 0x8F, 0xBF, 0xBF, /* overlong four-byte encoding of U+FFFF */
4, 0xF4, 0x90, 0x80, 0x80 /* 0x110000, first value beyond U+10FFFF */
};
int
main() {
const byte *p, *end;
p = valid;
end = valid + sizeof valid;
assert_valid(&p, end, 0x000000);
assert_valid(&p, end, 0x00007F);
assert_valid(&p, end, 0x000080);
assert_valid(&p, end, 0x0007FF);
assert_valid(&p, end, 0x000800);
assert_valid(&p, end, 0x00D7FF);
assert_valid(&p, end, 0x00E000);
assert_valid(&p, end, 0x00FFFF);
assert_valid(&p, end, 0x010000);
assert_valid(&p, end, 0x10FFFF);
p = invalid;
end = invalid + sizeof invalid;
assert_invalid("80", &p, end);
assert_invalid("C0", &p, end);
assert_invalid("C1", &p, end);
assert_invalid("C0 80", &p, end);
assert_invalid("C2 00", &p, end);
assert_invalid("C2 7F", &p, end);
assert_invalid("C2 C0", &p, end);
assert_invalid("E0 80 80", &p, end);
assert_invalid("E0 9F BF", &p, end);
assert_invalid("ED A0 80", &p, end);
assert_invalid("ED BF BF", &p, end);
assert_invalid("F0 80 80 80", &p, end);
assert_invalid("F0 8F BF BF", &p, end);
assert_invalid("F4 90 80 80", &p, end);
return 0;
}