pkun
/
klish


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
							/*
 * tinyrl.c
 */

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
#include <unistd.h>

#include <faux/str.h>

#include "private.h"


/** @brief Decodes one UTF-8 encoded character into an unsigned long code value
 *
 * Only the structure of the sequence is checked (lead byte pattern followed
 * by the required number of 10xxxxxx continuation bytes). Overlong forms and
 * out-of-range values are not rejected.
 *
 * On a malformed sequence (unexpected lead byte or a broken/truncated
 * continuation) the function consumes exactly one byte and reports the raw
 * value of that byte in sym_out, so the caller can keep scanning.
 *
 * @param [in] sp Pointer to UTF-8 string to get char from. NULL is treated
 * like an empty string.
 * @param [out] sym_out Resulting code value. Can be NULL if not needed.
 * @return Number of bytes occupied by the current UTF-8 symbol, or 0 if the
 * string is NULL or positioned at the terminating '\0'.
 */
ssize_t utf8_to_wchar(const char *sp, unsigned long *sym_out)
{
	const unsigned char *p = (const unsigned char *)sp;
	unsigned long sym = 0;
	int octets = 0; // Number of 10xxxxxx continuation bytes
	int i = 0;

	// Be consistent with the other helpers: NULL input is not fatal
	if (!sp) {
		if (sym_out)
			*sym_out = 0;
		return 0;
	}

	// Default result is the raw first byte. It's the final answer for
	// ASCII, for end-of-string and for any malformed sequence.
	if (sym_out)
		*sym_out = *p;

	if (!*p)
		return 0;

	// Single-byte (ASCII) symbol
	if ((*p & 0x80) == 0) // 0xxxxxxx
		return 1;

	// Analyze first byte to get number of continuation octets
	if ((*p & 0xe0) == 0xc0) { // 110xxxxx 10xxxxxx
		octets = 1;
		sym = (*p & 0x1f);
	} else if ((*p & 0xf0) == 0xe0) { // 1110xxxx 10xxxxxx 10xxxxxx
		octets = 2;
		sym = (*p & 0xf);
	} else if ((*p & 0xf8) == 0xf0) { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		octets = 3;
		sym = (*p & 7);
	} else if ((*p & 0xfc) == 0xf8) { // 5-byte form (deprecated)
		octets = 4;
		sym = (*p & 3);
	} else if ((*p & 0xfe) == 0xfc) { // 6-byte form (deprecated)
		octets = 5;
		sym = (*p & 1);
	} else {
		return 1; // Error but be robust and skip one byte
	}
	p++;

	// Accumulate payload bits of the following 10xxxxxx bytes.
	// The terminating '\0' fails the check too, so a truncated
	// sequence never makes the loop read past the end of string.
	for (i = 0; i < octets; i++) {
		if ((*p & 0xc0) != 0x80)
			return 1; // Skip one byte if broken UTF-8 symbol
		sym = (sym << 6) | (*p & 0x3f);
		p++;
	}

	if (sym_out)
		*sym_out = sym;

	return (octets + 1);
}


/** @brief Checks whether a wide character belongs to one of the CJK ranges
 *
 * The decision is a lookup in a fixed table of inclusive code point ranges.
 *
 * @param [in] sym Widechar symbol to analyze
 * @return BOOL_TRUE if CJK and BOOL_FALSE else
 */
bool_t utf8_wchar_is_cjk(unsigned long sym)
{
	/* Inclusive [first, last] ranges. Must stay sorted in ascending
	 * order because the lookup stops at the first range above sym. */
	static const struct {
		unsigned long first;
		unsigned long last;
	} cjk_ranges[] = {
		{ 0x1100, 0x11FF }, /* Hangul Jamo */
		/* One merged run covering: CJK Radicals Supplement, Kangxi
		 * Radicals, Ideographic Description Characters, CJK Symbols
		 * and Punctuation, Hiragana, Katakana, Bopomofo, Hangul
		 * Compatibility Jamo, Kanbun, Bopomofo Extended, CJK strokes,
		 * Katakana Phonetic Extensions, Enclosed CJK Letters and
		 * Months, CJK Compatibility, CJK Unified Ideographs
		 * Extension A, Yijing Hexagram Symbols, CJK Unified
		 * Ideographs, Yi Syllables, Yi Radicals.
		 * The run is split in two to leave out U+303F (half space). */
		{ 0x2E80, 0x303E },
		{ 0x3040, 0xA4CF },
		{ 0xAC00, 0xD7AF }, /* Hangul Syllables */
		{ 0xF900, 0xFAFF }, /* CJK Compatibility Ideographs */
		{ 0xFE10, 0xFE1F }, /* Vertical Forms */
		{ 0xFE30, 0xFE6F }, /* CJK Compatibility Forms + Small Form Variants */
		{ 0xFF00, 0xFF60 }, /* Fullwidth Forms (first part) */
		{ 0xFFE0, 0xFFE6 }, /* Fullwidth Forms (second part) */
		{ 0x1D300, 0x1D35F }, /* Tai Xuan Jing Symbols */
		{ 0x20000, 0x2B81F }, /* CJK Unified Ideographs Extensions B, C, D */
		{ 0x2F800, 0x2FA1F }, /* CJK Compatibility Ideographs Supplement */
	};
	const unsigned int range_num = sizeof(cjk_ranges) / sizeof(cjk_ranges[0]);
	unsigned int idx = 0;

	/* Speed up for non-CJK chars */
	if (sym < 0x1100)
		return BOOL_FALSE;

	for (idx = 0; idx < range_num; idx++) {
		/* Table is sorted so no further range can contain sym */
		if (sym < cjk_ranges[idx].first)
			break;
		if (sym <= cjk_ranges[idx].last)
			return BOOL_TRUE;
	}

	return BOOL_FALSE;
}


/** @brief Get position of previous UTF-8 char within string
 *
 * Steps back at least one byte and then keeps stepping back while the
 * byte under the cursor matches UTF8_10 under UTF8_MASK (both defined
 * outside this file section; presumably the 10xxxxxx continuation-byte
 * pattern) and the start of the line is not reached.
 *
 * @param [in] line UTF-8 string.
 * @param [in] cur_pos Current position (byte offset) within UTF-8 string.
 * @return Byte offset of previous UTF-8 character. Returns 0 if line is
 * NULL or if current position is already the leftmost one.
 */
off_t utf8_move_left(const char *line, off_t cur_pos)
{
	const char *pos = NULL;

	if (!line)
		return 0;
	// Leftmost position already. A negative offset is clamped to the
	// start of line rather than walking before the buffer.
	if (cur_pos <= 0)
		return 0;

	// Compute the pointer only after line is known to be non-NULL
	pos = line + cur_pos;
	do {
		pos--;
	} while ((pos > line) && (UTF8_10 == (*pos & UTF8_MASK)));

	return pos - line;
}


/** @brief Get position of next UTF-8 char within string
 *
 * Steps forward at least one byte and then keeps stepping forward while
 * the byte under the cursor matches UTF8_10 under UTF8_MASK (both defined
 * outside this file section; presumably the 10xxxxxx continuation-byte
 * pattern) and the terminating '\0' is not reached.
 *
 * The caller must guarantee that cur_pos is within the string (including
 * its terminating '\0'); the function has no way to verify that.
 *
 * @param [in] line UTF-8 string.
 * @param [in] cur_pos Current position (byte offset) within UTF-8 string.
 * @return Byte offset of next UTF-8 character. Returns 0 if line is NULL
 * and cur_pos itself if it already points to the end of string.
 */
off_t utf8_move_right(const char *line, off_t cur_pos)
{
	const char *pos = NULL;

	if (!line)
		return 0;

	// Compute the pointer only after line is known to be non-NULL
	pos = line + cur_pos;
	if (*pos == '\0')
		return cur_pos; // It's already rightmost position
	do {
		pos++;
	} while ((*pos != '\0') && (UTF8_10 == (*pos & UTF8_MASK)));

	return pos - line;
}


/** @brief Counts number of printable positions occupied by UTF-8 string
 *
 * One printable symbol can consist of several UTF-8 bytes.
 * CJK UTF-8 character occupies 2 printable positions, any other
 * character occupies 1.
 *
 * Scanning stops at the terminating '\0' or once the cursor reaches
 * len bytes, whichever comes first. A multibyte character that starts
 * before the limit is decoded as a whole.
 *
 * @param [in] str UTF-8 string.
 * @param [in] len Maximum number of bytes of str to examine.
 * @return Number of printable positions or < 0 on error (str is NULL).
 */
ssize_t utf8_nsyms(const char *str, size_t len)
{
	const char *cursor = NULL;
	const char *limit = NULL;
	ssize_t width = 0;

	if (!str)
		return -1;

	cursor = str;
	limit = str + len;
	while (cursor < limit) {
		unsigned long wc = 0;

		if ('\0' == *cursor)
			break;

		// ASCII char: always one byte and one position
		if (0 == (UTF8_7BIT_MASK & *cursor)) {
			cursor++;
			width++;
			continue;
		}

		// Multibyte UTF-8. The decoder never returns 0 here because
		// the end-of-string case is filtered out above.
		cursor += utf8_to_wchar(cursor, &wc);
		width += utf8_wchar_is_cjk(wc) ? 2 : 1; // CJK chars have double-width
	}

	return width;
}