Решение с использованием mbstowcs()
для преобразования символа в кодировке UTF-8 из многобайтовой символьной строки в широкий символ.
#include <stdio.h>
#include <stdlib.h>
#include <locale.h>
/*
 * Convert the first multibyte character of argv[1] to a wide character using
 * the locale taken from the environment, and print its numeric value in
 * decimal followed by a newline.
 *
 * On implementations where wchar_t holds Unicode code points and the active
 * locale uses UTF-8, the printed number is the character's code point.
 *
 * Exits with EXIT_FAILURE when no argument is given, when the locale cannot
 * be set, or when argv[1] does not start with a valid multibyte character.
 */
int main(int argc, char *argv[])
{
	wchar_t u = 0;

	/* Without this guard argv[1] is NULL and mbstowcs() would dereference it. */
	if (argc < 2) {
		fprintf(stderr, "usage: %s <character>\n",
		        argc > 0 ? argv[0] : "mbs2wc");
		exit(EXIT_FAILURE);
	}

	/* Set locale according to the environment variables */
	if (setlocale(LC_ALL, "") == NULL) {
		/* setlocale() is not specified to set errno, so perror() would
		   print an unrelated message here. */
		fputs("setlocale: cannot apply the locale from the environment\n",
		      stderr);
		exit(EXIT_FAILURE);
	}

	/* Convert the multibyte character string in argv[1] to a
	   wide character */
	if (mbstowcs(&u, argv[1], 1) == (size_t) -1) {
		perror("mbstowcs");
		exit(EXIT_FAILURE);
	}

	/* wchar_t has an implementation-defined integer type; cast so that the
	   argument matches the conversion specifier exactly. */
	printf("%lu\n", (unsigned long) u);
	return EXIT_SUCCESS;
}
Другое решение декодирует символ UTF-8 вручную. Код заимствован из st (эмулятора терминала от suckless).
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
/* Value stored in the output rune when decoding or validation fails
   (U+FFFD, the Unicode replacement character). */
#define UTF_INVALID 0xFFFD
/* Longest byte sequence the decoder accepts; the tables below have
   UTF_SIZ + 1 entries. */
#define UTF_SIZ 4
typedef unsigned char uchar;
/* A decoded code point. */
typedef uint_least32_t Rune;
/* Number of elements in the array a (must be a real array, not a pointer). */
#define LEN(a) (sizeof(a) / sizeof(a)[0])
/* True when a <= x <= b; x is evaluated twice. */
#define BETWEEN(x, a, b) ((a) <= (x) && (x) <= (b))
/*
 * Byte classification tables used by utf8decodebyte(): a byte belongs to
 * class k when (byte & utfmask[k]) == utfbyte[k]. Class 0 is a continuation
 * byte (10xxxxxx); classes 1..4 are lead bytes of sequences of that many
 * bytes.
 */
static uchar utfbyte[UTF_SIZ + 1] = {0x80, 0, 0xC0, 0xE0, 0xF0};
static uchar utfmask[UTF_SIZ + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF8};
/*
 * Smallest and largest rune accepted by utf8validate() for each table index.
 * utf8validate() also scans utfmax from index 1 upward to compute its
 * return value.
 */
static Rune utfmin[UTF_SIZ + 1] = { 0, 0, 0x80, 0x800, 0x10000};
static Rune utfmax[UTF_SIZ + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};
Rune
utf8decodebyte(char c, size_t *i)
{
for (*i = 0; *i < LEN(utfmask); ++(*i))
if (((uchar)c & utfmask[*i]) == utfbyte[*i])
return (uchar)c & ~utfmask[*i];
return 0;
}
/*
 * Check that *u is acceptable for table index i and replace it with
 * UTF_INVALID when it is not.
 *
 * u: rune to validate; overwritten with UTF_INVALID when it lies outside
 *    [utfmin[i], utfmax[i]] or inside 0xD800..0xDFFF.
 * i: index into utfmin/utfmax; the caller must pass a valid table index.
 *
 * Returns the smallest index k >= 1 for which the (possibly replaced) rune
 * is <= utfmax[k].
 */
size_t
utf8validate(Rune *u, size_t i)
{
	const Rune cp = *u;
	const int in_range = BETWEEN(cp, utfmin[i], utfmax[i]);
	const int is_surrogate = BETWEEN(cp, 0xD800, 0xDFFF);
	size_t need = 1;

	if (!in_range || is_surrogate)
		*u = UTF_INVALID;

	/* Scan with the value now stored in *u, which may be the replacement. */
	while (*u > utfmax[need])
		need++;

	return need;
}
/*
 * Decode one UTF-8 sequence from the start of c.
 *
 * c:    input bytes.
 * u:    receives the decoded rune; set to UTF_INVALID on any failure.
 * clen: number of bytes available in c.
 *
 * Returns:
 *   0    when clen is 0 or the input ends before the sequence is complete;
 *   1    when the first byte is not a valid lead byte;
 *   k    (the offset of the offending byte) when the byte at offset k is not
 *        a continuation byte;
 *   len  (the sequence length announced by the lead byte) otherwise, with *u
 *        holding the rune after utf8validate() has checked it.
 */
size_t
utf8decode(const char *c, Rune *u, size_t clen)
{
	size_t len, type;
	size_t pos;
	Rune acc;

	*u = UTF_INVALID;
	if (clen == 0)
		return 0;

	acc = utf8decodebyte(c[0], &len);
	if (len < 1 || len > UTF_SIZ)
		return 1;

	/* Fold in one continuation byte per step, six payload bits each. */
	for (pos = 1; pos < clen && pos < len; pos++) {
		acc = (acc << 6) | utf8decodebyte(c[pos], &type);
		if (type != 0)
			return pos;
	}

	/* The input ran out before all announced bytes were seen. */
	if (pos < len)
		return 0;

	*u = acc;
	utf8validate(u, len);
	return len;
}
/*
 * Decode the first UTF-8 sequence of argv[1] with utf8decode() and print the
 * resulting rune in decimal followed by a newline. Input that cannot be
 * decoded prints UTF_INVALID (65533).
 *
 * Exits with EXIT_FAILURE when no argument is given.
 */
int main(int argc, char *argv[])
{
	Rune u;

	/* Without this guard argv[1] is NULL and utf8decode() would read it. */
	if (argc < 2) {
		fprintf(stderr, "usage: %s <character>\n",
		        argc > 0 ? argv[0] : "utf8decode");
		exit(EXIT_FAILURE);
	}

	/*
	 * UTF_SIZ is passed as the available length even if argv[1] is shorter.
	 * This stays inside the string: the terminating NUL is classified as a
	 * one-byte lead byte rather than a continuation byte, so utf8decode()
	 * returns as soon as it reaches it.
	 */
	utf8decode(argv[1], &u, UTF_SIZ);

	/* Rune is uint_least32_t, which need not be unsigned int; cast so that
	   the argument matches the conversion specifier exactly. */
	printf("%lu\n", (unsigned long)u);
	return EXIT_SUCCESS;
}