출처: http://jinsuk.pe.kr/Unicode/public/UTF8token.c
/* by Jinsuk Kim, http://www.jinsuk.pe.kr */
[code]
#include <stdio.h>
#include <UniCharts.h>
/*
 * UTF8token - copy the next token of a UTF-8 string into Token.
 *
 * Walks the NUL-terminated UTF-8 text at *strPtr one character at a time and
 * appends each character's bytes to Token.  Every character that fits in
 * 16 bits is classified through the cType2[] table; the scan stops at the
 * first character whose class is T_BLN, T_SPC, T_SYM or T_CTK.  The one
 * exception is '+', which is kept as part of the token.
 *
 * strPtr  in/out: cursor into the input.  On return it points just past the
 *         delimiter that ended the token, or at the terminating NUL when the
 *         input ran out.
 * Token   out: receives the token bytes followed by '\0'.  No size is passed
 *         in, so the caller must provide room for strlen(*strPtr) + 1 bytes.
 * len     out: number of bytes stored in Token, not counting the '\0'.
 *
 * Returns the cType2[] class of the last character that was classified (the
 * delimiter when one ended the token), or 0 if no character was classified.
 */
int UTF8token(char **strPtr, char *Token, int *len)
{
    int charType = 0;

    *len = 0;
    while (**strPtr) {
        /* Read bytes as unsigned so values >= 0x80 compare as expected. */
        const unsigned char *src = (const unsigned char *) *strPtr;
        unsigned long unicode;
        int charOffset; /* how many bytes for this UTF-8 char */
        int i;

        /* Sequence length and payload bits come from the lead byte. */
        if (src[0] < 0x80) {
            charOffset = 1;
            unicode = src[0];
        } else if (src[0] < 0xe0) {
            charOffset = 2;
            unicode = src[0] & 0x1f;
        } else if (src[0] < 0xf0) {
            charOffset = 3;
            unicode = src[0] & 0x0f;
        } else {
            /* 4-byte form: consume it whole so the scan stays aligned. */
            charOffset = 4;
            unicode = src[0] & 0x07;
        }

        /*
         * A sequence cut short by the end of the string is dropped and the
         * cursor is parked on the NUL, so nothing is ever read past it.
         */
        for (i = 1; i < charOffset && src[i] != '\0'; i++)
            ;
        if (i < charOffset) {
            *strPtr += i;
            break;
        }

        for (i = 0; i < charOffset; i++)
            Token[*len + i] = (char) src[i];
        for (i = 1; i < charOffset; i++)
            unicode = (unicode << 6) | (unsigned long) (src[i] & 0x3f);
        *len += charOffset;
        *strPtr += charOffset;

        /*
         * cType2[] is only ever indexed with 16-bit values here, so code
         * points above U+FFFF are kept in the token without classification.
         * NOTE(review): confirm cType2 really has 0x10000 entries.
         */
        if (unicode > 0xffffUL)
            continue;

        charType = cType2[unicode];
        if ((charType == T_BLN ||       /* blank characters */
             charType == T_SPC ||       /* special characters */
             charType == T_SYM ||       /* symbolic characters */
             charType == T_CTK) &&      /* control characters */
            unicode != '+') {
            /*
             * Delimiter: un-append exactly the bytes just copied.  The
             * cursor already sits past the delimiter.  The '+' test uses the
             * whole code point so U+222B and similar do not alias to '+'.
             */
            *len -= charOffset;
            break;
        }
    }

    Token[*len] = '\0';
    return charType;
}
[/code]
[출처] UTF8token.c|작성자 형기
/* by Jinsuk Kim, http://www.jinsuk.pe.kr */
[code]
#include <stdio.h>
#include <UniCharts.h>
/*
 * UTF8token - copy the next token of a UTF-8 string into Token.
 *
 * Walks the NUL-terminated UTF-8 text at *strPtr one character at a time and
 * appends each character's bytes to Token.  Every character that fits in
 * 16 bits is classified through the cType2[] table; the scan stops at the
 * first character whose class is T_BLN, T_SPC, T_SYM or T_CTK.  The one
 * exception is '+', which is kept as part of the token.
 *
 * strPtr  in/out: cursor into the input.  On return it points just past the
 *         delimiter that ended the token, or at the terminating NUL when the
 *         input ran out.
 * Token   out: receives the token bytes followed by '\0'.  No size is passed
 *         in, so the caller must provide room for strlen(*strPtr) + 1 bytes.
 * len     out: number of bytes stored in Token, not counting the '\0'.
 *
 * Returns the cType2[] class of the last character that was classified (the
 * delimiter when one ended the token), or 0 if no character was classified.
 */
int UTF8token(char **strPtr, char *Token, int *len)
{
    int charType = 0;

    *len = 0;
    while (**strPtr) {
        /* Read bytes as unsigned so values >= 0x80 compare as expected. */
        const unsigned char *src = (const unsigned char *) *strPtr;
        unsigned long unicode;
        int charOffset; /* how many bytes for this UTF-8 char */
        int i;

        /* Sequence length and payload bits come from the lead byte. */
        if (src[0] < 0x80) {
            charOffset = 1;
            unicode = src[0];
        } else if (src[0] < 0xe0) {
            charOffset = 2;
            unicode = src[0] & 0x1f;
        } else if (src[0] < 0xf0) {
            charOffset = 3;
            unicode = src[0] & 0x0f;
        } else {
            /* 4-byte form: consume it whole so the scan stays aligned. */
            charOffset = 4;
            unicode = src[0] & 0x07;
        }

        /*
         * A sequence cut short by the end of the string is dropped and the
         * cursor is parked on the NUL, so nothing is ever read past it.
         */
        for (i = 1; i < charOffset && src[i] != '\0'; i++)
            ;
        if (i < charOffset) {
            *strPtr += i;
            break;
        }

        for (i = 0; i < charOffset; i++)
            Token[*len + i] = (char) src[i];
        for (i = 1; i < charOffset; i++)
            unicode = (unicode << 6) | (unsigned long) (src[i] & 0x3f);
        *len += charOffset;
        *strPtr += charOffset;

        /*
         * cType2[] is only ever indexed with 16-bit values here, so code
         * points above U+FFFF are kept in the token without classification.
         * NOTE(review): confirm cType2 really has 0x10000 entries.
         */
        if (unicode > 0xffffUL)
            continue;

        charType = cType2[unicode];
        if ((charType == T_BLN ||       /* blank characters */
             charType == T_SPC ||       /* special characters */
             charType == T_SYM ||       /* symbolic characters */
             charType == T_CTK) &&      /* control characters */
            unicode != '+') {
            /*
             * Delimiter: un-append exactly the bytes just copied.  The
             * cursor already sits past the delimiter.  The '+' test uses the
             * whole code point so U+222B and similar do not alias to '+'.
             */
            *len -= charOffset;
            break;
        }
    }

    Token[*len] = '\0';
    return charType;
}
[/code]
[출처] UTF8token.c|작성자 형기
'Native > C' 카테고리의 다른 글
UCS2UTF8 (unicode) (0) | 2013.10.02 |
---|---|
IconvString (unicode) (0) | 2013.10.02 |
유니코드영역 header (unicode) (0) | 2013.10.02 |
유니코드(Unicode) 관련 C 언어 source (unicode) (0) | 2013.10.02 |
UTF8 기반 한자한글 변환기(Hanja2Hangul) (unicode) (0) | 2013.10.02 |