Browse Source

utf8_is_cjk() function

Serj Kalichev 10 years ago
parent
commit
d01c8ea291
1 changed file with 132 additions and 0 deletions
  1. 132 0
      tinyrl/tinyrl.c

+ 132 - 0
tinyrl/tinyrl.c

@@ -20,6 +20,138 @@
 
 #include "private.h"
 
+/*--------------------------------------------------------
+ * Decode one UTF-8 sequence starting at sp.
+ * The whole function is currently compiled out by the #if 0 guard.
+ *
+ * sp      - byte string to decode; it is dereferenced unconditionally,
+ *           so it must not be NULL. Reading stops at a NUL byte.
+ * sym_out - optional; when non-NULL it receives the decoded value, or
+ *           the raw first byte when no multi-byte sequence is decoded.
+ *
+ * Returns 0 when the first byte is NUL, 1 when the first byte is taken
+ * on its own (no multi-byte lead pattern, or a malformed sequence),
+ * otherwise the length of the decoded sequence in bytes (2..6).
+ *
+ * NOTE(review): overlong encodings and 5/6-byte forms (values above
+ * U+10FFFF) are accepted without any check.
+ */
+#if 0
+int get_utf8(const char *sp, unsigned long *sym_out)
+{
+	int i = 0, l = 0; /* l counts the continuation bytes expected after the lead byte */
+	unsigned long sym = 0;
+	const unsigned char *p = (const unsigned char*)sp; /* unsigned, so the bit masks below are sign-safe */
+
+	if (sym_out)
+		*sym_out = *p; /* default result: the raw first byte */
+
+	if (!*p)
+		return 0; /* NUL byte: nothing to decode */
+
+	if (!(*p & 0xc0)) /* both top bits clear: bytes 0x01..0x3F */
+		return 1;
+
+	if ((*p & 0xe0) == 0xc0) { /* 110xxxxx: 2-byte sequence */
+		l = 1;
+		sym = (*p & 0x1f);
+	} else if ((*p & 0xf0) == 0xe0) { /* 1110xxxx: 3-byte sequence */
+		l = 2;
+		sym = (*p & 0xf);
+	} else if ((*p & 0xf8) == 0xf0) { /* 11110xxx: 4-byte sequence */
+		l = 3;
+		sym = (*p & 7);
+	} else if ((*p & 0xfc) == 0xf8) { /* 111110xx: 5-byte sequence */
+		l = 4;
+		sym = (*p & 3);
+	} else if ((*p & 0xfe) == 0xfc) { /* 1111110x: 6-byte sequence */
+		l = 5;
+		sym = (*p & 1);
+	} else { /* bytes 0x40..0x7F, stray continuation bytes 0x80..0xBF, 0xFE and 0xFF */
+		return 1;
+	}
+	p ++;
+	for (i = 0; i < l; i ++) {
+		sym <<= 6; /* make room for the next 6 payload bits */
+		if ((*p & 0xc0) != 0x80) { /* not 10xxxxxx (a NUL byte also lands here) */
+			return 1; /* malformed: *sym_out keeps the raw first byte */
+		}
+		sym |= (*p++ & 0x3f);
+	}
+	if (sym_out)
+		*sym_out = sym;
+	return l + 1; /* lead byte plus l continuation bytes */
+}
+#endif
+
+/*--------------------------------------------------------
+ * Tell whether the code point sym lies in one of the CJK-related
+ * ranges tested below.
+ *
+ * sym - the code point value to classify.
+ *
+ * Returns 1 when sym matches one of the ranges, 0 otherwise.
+ *
+ * NOTE(review): presumably used to pick out double-width characters
+ * when positioning the cursor - confirm against the callers.
+ */
+static int utf8_is_cjk(unsigned long sym)
+{
+	if (sym < 0x1100) /* Speed up for non-CJK chars */
+		return 0;
+
+	if (sym >= 0x1100 && sym <= 0x11FF) /* Hangul Jamo (lower bound is already guaranteed by the check above) */
+		return 1;
+#if 0
+	if (sym >=0x2E80 && sym <= 0x2EFF) /* CJK Radicals Supplement */
+		return 1;
+	if (sym >=0x2F00 && sym <= 0x2FDF) /* Kangxi Radicals */
+		return 1;
+	if (sym >= 0x2FF0 && sym <= 0x2FFF) /* Ideographic Description Characters */
+		return 1;
+	if (sym >= 0x3000 && sym < 0x303F) /* CJK Symbols and Punctuation. The U+303f is half space */
+		return 1;
+	if (sym >= 0x3040 && sym <= 0x309F) /* Hiragana */
+		return 1;
+	if (sym >= 0x30A0 && sym <=0x30FF) /* Katakana */
+		return 1;
+	if (sym >= 0x3100 && sym <=0x312F) /* Bopomofo */
+		return 1;
+	if (sym >= 0x3130 && sym <= 0x318F) /* Hangul Compatibility Jamo */
+		return 1;
+	if (sym >= 0x3190 && sym <= 0x319F) /* Kanbun */
+		return 1;
+	if (sym >= 0x31A0 && sym <= 0x31BF) /* Bopomofo Extended */
+		return 1;
+	if (sym >= 0x31C0 && sym <= 0x31EF) /* CJK strokes */
+		return 1;
+	if (sym >= 0x31F0 && sym <= 0x31FF) /* Katakana Phonetic Extensions */
+		return 1;
+	if (sym >= 0x3200 && sym <= 0x32FF) /* Enclosed CJK Letters and Months */
+		return 1;
+	if (sym >= 0x3300 && sym <= 0x33FF) /* CJK Compatibility */
+		return 1;
+	if (sym >= 0x3400 && sym <= 0x4DBF) /* CJK Unified Ideographs Extension A */
+		return 1;
+	if (sym >= 0x4DC0 && sym <= 0x4DFF) /* Yijing Hexagram Symbols */
+		return 1;
+	if (sym >= 0x4E00 && sym <= 0x9FFF) /* CJK Unified Ideographs */
+		return 1;
+	if (sym >= 0xA000 && sym <= 0xA48F) /* Yi Syllables */
+		return 1;
+	if (sym >= 0xA490 && sym <= 0xA4CF) /* Yi Radicals */
+		return 1;
+#endif
+	/* Speed up previous block: a single merged range replaces the
+	 * disabled list above. U+303F stays excluded, as in the list.
+	 * NOTE(review): the merged range also accepts 0x2FE0..0x2FEF,
+	 * a gap that the itemized list does not cover. */
+	if (sym >= 0x2E80 && sym <= 0xA4CF && sym != 0x303F)
+		return 1;
+
+	if (sym >= 0xAC00 && sym <= 0xD7AF) /* Hangul Syllables */
+		return 1;
+	if (sym >= 0xF900 && sym <= 0xFAFF) /* CJK Compatibility Ideographs */
+		return 1;
+	if (sym >= 0xFE10 && sym <= 0xFE1F) /* Vertical Forms */
+		return 1;
+
+#if 0
+	if (sym >= 0xFE30 && sym <= 0xFE4F) /* CJK Compatibility Forms */
+		return 1;
+	if (sym >= 0xFE50 && sym <= 0xFE6F) /* Small Form Variants */
+		return 1;
+#endif
+	/* Speed up previous block: the two disabled ranges above are
+	 * adjacent, so one comparison pair covers exactly the same values. */
+	if (sym >= 0xFE30 && sym <= 0xFE6F)
+		return 1;
+
+	if ((sym >= 0xFF00 && sym <= 0xFF60) ||
+		(sym >= 0xFFE0 && sym <= 0xFFE6)) /* Fullwidth Forms */
+		return 1;
+
+	if (sym >= 0x1D300 && sym <= 0x1D35F) /* Tai Xuan Jing Symbols */
+		return 1;
+	if (sym >= 0x20000 && sym <= 0x2B81F) /* CJK Unified Ideographs Extensions B, C, D */
+		return 1;
+	if (sym >= 0x2F800 && sym <= 0x2FA1F) /* CJK Compatibility Ideographs Supplement */
+		return 1;
+
+	return 0; /* no range matched */
+}
+
 /*-------------------------------------------------------- */
 static void utf8_point_left(tinyrl_t * this)
 {