utf8.c 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  1. /*
  2. * tinyrl.c
  3. */
  4. #include <assert.h>
  5. #include <stdio.h>
  6. #include <stdlib.h>
  7. #include <string.h>
  8. #include <ctype.h>
  9. #include <errno.h>
  10. #include <unistd.h>
  11. #include <faux/str.h>
  12. #include "private.h"
  13. /** @brief Converts UTF-8 char to unsigned long wchar
  14. *
  15. * @param [in] sp Pointer to UTF-8 string to get char from.
  16. * @param [out] sym_out Resulting wchar.
  17. * @return Number of bytes for current UTF-8 symbol.
  18. */
  19. ssize_t utf8_to_wchar(const char *sp, unsigned long *sym_out)
  20. {
  21. int i = 0;
  22. int octets = 0; // Number of 0x10xxxxxx UTF-8 sequence bytes
  23. unsigned long sym = 0;
  24. const unsigned char *p = (const unsigned char *)sp;
  25. if (sym_out)
  26. *sym_out = *p;
  27. if (!*p)
  28. return 0;
  29. // Check for first byte of UTF-8
  30. if ((*p & 0x80) == 0) // 0xxxxxxx
  31. return 1;
  32. // Analyze first byte to get number of UTF-8 octets
  33. if ((*p & 0xe0) == 0xc0) { // 110xxxxx 10xxxxxx
  34. octets = 1;
  35. sym = (*p & 0x1f);
  36. } else if ((*p & 0xf0) == 0xe0) { // 1110xxxx 10xxxxxx 10xxxxxx
  37. octets = 2;
  38. sym = (*p & 0xf);
  39. } else if ((*p & 0xf8) == 0xf0) { // 11110xxx 10xxxxxx 10xxxxxx
  40. octets = 3;
  41. sym = (*p & 7);
  42. } else if ((*p & 0xfc) == 0xf8) { // depricated
  43. octets = 4;
  44. sym = (*p & 3);
  45. } else if ((*p & 0xfe) == 0xfc) { // depricated
  46. octets = 5;
  47. sym = (*p & 1);
  48. } else {
  49. return 1; // Error but be robust and skip one byte
  50. }
  51. p++;
  52. // Analyze next UTF-8 bytes 10xxxxxx
  53. for (i = 0; i < octets; i++) {
  54. sym <<= 6;
  55. // Check if it's really UTF-8 bytes
  56. if ((*p & 0xc0) != 0x80)
  57. return 1; // Skip one byte if broken UTF-8 symbol
  58. sym |= (*p & 0x3f);
  59. p++;
  60. }
  61. if (sym_out)
  62. *sym_out = sym;
  63. return (octets + 1);
  64. }
  65. /** @brief Checks is wchar CJK
  66. *
  67. * @param [in] sym Widechar symbol to analyze
  68. * @return BOOL_TRUE if CJK and BOOL_FALSE else
  69. */
  70. bool_t utf8_wchar_is_cjk(unsigned long sym)
  71. {
  72. if (sym < 0x1100) /* Speed up for non-CJK chars */
  73. return BOOL_FALSE;
  74. if (sym >= 0x1100 && sym <= 0x11FF) /* Hangul Jamo */
  75. return BOOL_TRUE;
  76. #if 0
  77. if (sym >=0x2E80 && sym <= 0x2EFF) /* CJK Radicals Supplement */
  78. return BOOL_TRUE;
  79. if (sym >=0x2F00 && sym <= 0x2FDF) /* Kangxi Radicals */
  80. return BOOL_TRUE;
  81. if (sym >= 0x2FF0 && sym <= 0x2FFF) /* Ideographic Description Characters */
  82. return BOOL_TRUE;
  83. if (sym >= 0x3000 && sym < 0x303F) /* CJK Symbols and Punctuation. The U+303f is half space */
  84. return BOOL_TRUE;
  85. if (sym >= 0x3040 && sym <= 0x309F) /* Hiragana */
  86. return BOOL_TRUE;
  87. if (sym >= 0x30A0 && sym <=0x30FF) /* Katakana */
  88. return BOOL_TRUE;
  89. if (sym >= 0x3100 && sym <=0x312F) /* Bopomofo */
  90. return BOOL_TRUE;
  91. if (sym >= 0x3130 && sym <= 0x318F) /* Hangul Compatibility Jamo */
  92. return BOOL_TRUE;
  93. if (sym >= 0x3190 && sym <= 0x319F) /* Kanbun */
  94. return BOOL_TRUE;
  95. if (sym >= 0x31A0 && sym <= 0x31BF) /* Bopomofo Extended */
  96. return BOOL_TRUE;
  97. if (sym >= 0x31C0 && sym <= 0x31EF) /* CJK strokes */
  98. return BOOL_TRUE;
  99. if (sym >= 0x31F0 && sym <= 0x31FF) /* Katakana Phonetic Extensions */
  100. return BOOL_TRUE;
  101. if (sym >= 0x3200 && sym <= 0x32FF) /* Enclosed CJK Letters and Months */
  102. return BOOL_TRUE;
  103. if (sym >= 0x3300 && sym <= 0x33FF) /* CJK Compatibility */
  104. return BOOL_TRUE;
  105. if (sym >= 0x3400 && sym <= 0x4DBF) /* CJK Unified Ideographs Extension A */
  106. return BOOL_TRUE;
  107. if (sym >= 0x4DC0 && sym <= 0x4DFF) /* Yijing Hexagram Symbols */
  108. return BOOL_TRUE;
  109. if (sym >= 0x4E00 && sym <= 0x9FFF) /* CJK Unified Ideographs */
  110. return BOOL_TRUE;
  111. if (sym >= 0xA000 && sym <= 0xA48F) /* Yi Syllables */
  112. return BOOL_TRUE;
  113. if (sym >= 0xA490 && sym <= 0xA4CF) /* Yi Radicals */
  114. return BOOL_TRUE;
  115. #endif
  116. /* Speed up previous block */
  117. if (sym >= 0x2E80 && sym <= 0xA4CF && sym != 0x303F)
  118. return BOOL_TRUE;
  119. if (sym >= 0xAC00 && sym <= 0xD7AF) /* Hangul Syllables */
  120. return BOOL_TRUE;
  121. if (sym >= 0xF900 && sym <= 0xFAFF) /* CJK Compatibility Ideographs */
  122. return BOOL_TRUE;
  123. if (sym >= 0xFE10 && sym <= 0xFE1F) /* Vertical Forms */
  124. return BOOL_TRUE;
  125. #if 0
  126. if (sym >= 0xFE30 && sym <= 0xFE4F) /* CJK Compatibility Forms */
  127. return BOOL_TRUE;
  128. if (sym >= 0xFE50 && sym <= 0xFE6F) /* Small Form Variants */
  129. return BOOL_TRUE;
  130. #endif
  131. /* Speed up previous block */
  132. if (sym >= 0xFE30 && sym <= 0xFE6F)
  133. return BOOL_TRUE;
  134. if ((sym >= 0xFF00 && sym <= 0xFF60) ||
  135. (sym >= 0xFFE0 && sym <= 0xFFE6)) /* Fullwidth Forms */
  136. return BOOL_TRUE;
  137. if (sym >= 0x1D300 && sym <= 0x1D35F) /* Tai Xuan Jing Symbols */
  138. return BOOL_TRUE;
  139. if (sym >= 0x20000 && sym <= 0x2B81F) /* CJK Unified Ideographs Extensions B, C, D */
  140. return BOOL_TRUE;
  141. if (sym >= 0x2F800 && sym <= 0x2FA1F) /* CJK Compatibility Ideographs Supplement */
  142. return BOOL_TRUE;
  143. return BOOL_FALSE;
  144. }
  145. /** @brief Get position of previous UTF-8 char within string
  146. *
  147. * @param [in] line UTF-8 string.
  148. * @param [in] cur_pos Current position within UTF-8 string.
  149. * @return Position of previous UTF-8 character or NULL on error.
  150. */
  151. size_t utf8_move_left(const char *line, size_t cur_pos)
  152. {
  153. const char *pos = line + cur_pos;
  154. if (!line)
  155. return 0;
  156. if (pos == line)
  157. return cur_pos; // It's already leftmost position
  158. do {
  159. pos--;
  160. } while ((pos > line) && (UTF8_10 == (*pos & UTF8_MASK)));
  161. return pos - line;
  162. }
  163. /** @brief Get position of next UTF-8 char within string
  164. *
  165. * @param [in] line UTF-8 string.
  166. * @param [in] cur_pos Current position within UTF-8 string.
  167. * @return Position of next UTF-8 character or NULL on error.
  168. */
  169. size_t utf8_move_right(const char *line, size_t cur_pos)
  170. {
  171. const char *pos = line + cur_pos;
  172. if (!line)
  173. return 0;
  174. if (*pos == '\0')
  175. return cur_pos; // It's already rightmost position
  176. do {
  177. pos++;
  178. } while ((*pos != '\0') && (UTF8_10 == (*pos & UTF8_MASK)));
  179. return pos - line;
  180. }
  181. /** @brief Counts number of printable symbols within UTF-8 string
  182. *
  183. * One printable symbol can consist of several UTF-8 bytes.
  184. * CJK UTF-8 character can occupy 2 printable positions.
  185. *
  186. * @param [in] str UTF-8 string.
  187. * @param [in] end End of line position (pointer). Can be NULL - no limit.
  188. * @param Number of printable symbols or < 0 on error.
  189. */
  190. ssize_t utf8_nsyms(const char *str, size_t len)
  191. {
  192. const char *pos = str;
  193. ssize_t nsym = 0;
  194. if (!str)
  195. return -1;
  196. while ((pos < (str + len)) && (*pos != '\0')) {
  197. unsigned long sym = 0;
  198. // ASCII char
  199. if ((UTF8_7BIT_MASK & *pos) == 0) {
  200. pos++;
  201. nsym++;
  202. continue;
  203. }
  204. // Multibyte UTF-8
  205. pos += utf8_to_wchar(pos, &sym);
  206. if (utf8_wchar_is_cjk(sym)) // CJK chars have double-width
  207. nsym += 2;
  208. else
  209. nsym += 1;
  210. }
  211. return nsym;
  212. }