본문 바로가기

Native/C

UTF8token (unicode)

출처: http://jinsuk.pe.kr/Unicode/public/UTF8token.c


/* by Jinsuk Kim, http://www.jinsuk.pe.kr */

[code]
#include <stdio.h>
#include <UniCharts.h>

int UTF8token(char **strPtr, char *Token, int *len)
{
    int charType = 0;
    unsigned short unicode = 0x0000;
    unsigned char c;
    int charOffset; /* how many bytes for this UTF-8 char */

    *len = 0;
    while (**strPtr) {
c = (**strPtr)&0xe0;
if (c < 0x80) {
    Token[*len] = **strPtr;
    (*len)++;
    unicode = (unsigned short) **strPtr;
}
else if (c < 0xe0) {
    Token[*len] = **strPtr;
    Token[*len+1] = *(*strPtr+1);
    (*len) += 2;
    unicode = (unsigned short) **strPtr & 0x1f;
    ++*strPtr;
    unicode = unicode << 6;
    unicode = unicode | ((unsigned short) **strPtr & 0x3f);
}
else if (c < 0xf0) {
    Token[*len] = **strPtr;
    Token[*len+1] = *(*strPtr+1);
    Token[*len+2] = *(*strPtr+2);
    (*len) += 3;
    unicode = (unsigned short) **strPtr & 0x0f;
    ++*strPtr;
    unicode = unicode << 6;
    unicode = unicode | ((unsigned short) **strPtr & 0x3f);
    ++*strPtr;
    unicode = unicode << 6;
    unicode = unicode | ((unsigned short) **strPtr & 0x3f);
}

charType = cType2[unicode];

if (unicode < 0x80) charOffset = 1;
else if (unicode < 0x0800) charOffset = 2;
else charOffset = 3;

switch(charType) {
    case T_BLN: /* blank characters */
    case T_SPC: /* special characters */
    case T_SYM: /* symbolic characters */
    case T_CTK: /* control characters */
switch( (char)unicode ) {
    case '+':
//     case '.':
//     case ',':
break;
    default:
// *strPtr -= charOffset-1; /* no need to backward ? */
*len -= charOffset;
++*strPtr;
goto escape_loop;
}
break;
    default:
break;
}
++*strPtr;
    }
escape_loop:
    Token[*len] = '\0';

    return charType;
}
[/code]

[출처] UTF8token.c|작성자 형기

'Native > C' 카테고리의 다른 글

UCS2UTF8 (unicode)  (0) 2013.10.02
IconvString (unicode)  (0) 2013.10.02
유니코드영역 header (unicode)  (0) 2013.10.02
유니코드(Unicode) 관련 C 언어 source (unicode)  (0) 2013.10.02
UTF8 기반 한자한글 변환기(Hanja2Hangul) (unicode)  (0) 2013.10.02