simple library for fancy terminal io based on termbox
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

272 lines
8.6 KiB

/* mini_utf8.h
*
* Gunnar Zötl <gz@tset.de> 2014
*
* a tiny library to deal with utf8 encoded strings. Tries to fault
* invalid unicode codepoints and invalid utf8 sequences.
*
* Stuff starting with _mini_utf8_* is reserved and private. Don't name your
* identifiers like that, and don't use stuff named like that.
*
* Needed #includes:
* -----------------
* -
*
* Functions:
* ----------
*
* int mini_utf8_check_encoding(const char* str)
* test all characters in a string for valid utf8 encoding. Returns
* 0 if the string is valid utf8, 1 if it is pure ASCII, or -1, if
* the string is not valid utf8. We do a somewhat relaxed test in
* that all chars in the range [0x01-0x1F] are considered valid.
*
* int mini_utf8_decode(const char **str)
* returns the next valid utf8 character from *str, updating *str
* to point behind that char. If *str points to a 0 byte, 0 is
* returned and *str is not updated. If *str does not point to a
* valid utf8 encoded char, -1 is returned and *str is not updated.
*
* int mini_utf8_encode(int cp, const char* str, int len)
* encodes the codepoint cp into an utf8 byte sequence and stores
* that into str, where len bytes are available. If that went without
* errors, the length of the encoded sequence is returned. If cp is
* not a valid code point, -1 is returned, for all other problems,
* 0 is returned. If cp is 0, it is stored as a single byte 0, even
* if that is not really valid utf8. Also, all chars in the range
* [0x01-0x1F] are considered valid.
*
* int mini_utf8_strlen(const char *str)
* returns the number of utf8 codepoints in the string str, or -1 if
* the string contains invalid utf8 sequences.
*
* int mini_utf8_byteoffset(const char *str, int cpno)
* returns the number of bytes from the start of the string to the
* start of codepoint number cpno. Returns >=0 for the offset, or
* -1 if the string had less than cpno codepoints, or contained an
* invalid utf8 sequence.
*
* Example:
* --------
*
#include <stdio.h>
#include <stdlib.h>
#include "mini_utf8.h"
int main(int argc, char **argv)
{
int size = 0x11FFFF;
int l = size * 4 + 1, i = 0, ok = 1, cp = 0;
int *ibuf = calloc(size, sizeof(int));
char *cbuf = calloc(l, sizeof(char));
char *str = cbuf;
while (cp < size) {
cp = cp + 1;
int n = mini_utf8_encode(cp, str, l);
if (n > 0) {
l -= n;
str += n;
ibuf[i++] = cp;
}
}
*str = 0;
size = i;
str = cbuf;
for (i = 0; ok && (i < size); ++i) {
cp = mini_utf8_decode((const char**)&str);
ok = (cp == ibuf[i]);
}
ok = ok && (mini_utf8_strlen(cbuf) == size);
printf("Roundtrip test %s.\n", ok ? "succeeded" : "failed");
ok = mini_utf8_check_encoding(cbuf);
printf("utf8 check %s.\n", ok >= 0 ? "succeeded" : "failed");
return ok < 0;
}
*
* License:
* --------
*
* Copyright (c) 2014 Gunnar Zötl <gz@tset.de>
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _mini_utf8
#define _mini_utf8
#define _mini_utf8_in_range(c, s, e) ((s) <= (c) && (c) <= (e))
/* The patterns for the encoding check are taken from
* http://www.w3.org/International/questions/qa-forms-utf-8
*/
static inline int mini_utf8_check_encoding(const char *str)
{
const unsigned char *s = (const unsigned char*) str;
int isu = 1;
int isa = 1;
while (*s && isu) {
if (*s <= 0x7F) {
s += 1;
continue; /* [\x09\x0A\x0D\x20-\x7E] # ASCII (somewhat relaxed) */
}
isa = 0; /* if we get here, the file is not pure ASCII */
if (_mini_utf8_in_range(*s, 0xC2, 0xDF) && _mini_utf8_in_range(s[1], 0x80, 0xBF)) {
s += 2; /* [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte */
} else if (*s == 0xE0 && _mini_utf8_in_range(s[1], 0xA0, 0xBF) && _mini_utf8_in_range(s[2], 0x80, 0xBF)) {
s += 3; /* \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs */
} else if ((*s <= 0xEC || *s == 0xEE || *s == 0xEF) && _mini_utf8_in_range(s[1], 0x80, 0xBF) && _mini_utf8_in_range(s[2], 0x80, 0xBF)) {
s += 3; /* [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte */
} else if (*s == 0xED && _mini_utf8_in_range(s[1], 0x80, 0x9F) && _mini_utf8_in_range(s[2], 0x80, 0xBF)) {
s += 3; /* \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates */
} else if (*s == 0xF0 && _mini_utf8_in_range(s[1], 0x90, 0xBF) && _mini_utf8_in_range(s[2], 0x80, 0xBF) && _mini_utf8_in_range(s[3], 0x80, 0xBF)) {
s += 4; /* \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3 */
} else if (*s <= 0xF3 && _mini_utf8_in_range(s[1], 0x80, 0xBF) && _mini_utf8_in_range(s[2], 0x80, 0xBF) && _mini_utf8_in_range(s[3], 0x80, 0xBF)) {
s += 4; /* [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15 */
} else if (*s == 0xF4 && _mini_utf8_in_range(s[1], 0x80, 0x8F) && _mini_utf8_in_range(s[2], 0x80, 0xBF) && _mini_utf8_in_range(s[3], 0x80, 0xBF)) {
s += 4; /* \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 */
} else
isu = 0;
}
if (isa && isu)
return 1;
else if (isu)
return 0;
return -1;
}
/* bits start end bytes encoding
* 7 U+0000 U+007F 1 0xxxxxxx
* 11 U+0080 U+07FF 2 110xxxxx 10xxxxxx
* 16 U+0800 U+FFFF 3 1110xxxx 10xxxxxx 10xxxxxx
* 21 U+10000 U+1FFFFF 4 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
*
* validity checking derived from above patterns
*/
static inline int mini_utf8_decode(const char **str)
{
const unsigned char *s = (const unsigned char*) *str;
int ret = -1;
if (!*s) return 0;
if (*s <= 0x7F) {
ret = s[0]; /* ASCII */
*str = (char*) s+1;
return ret;
} else if (*s < 0xC2) {
return -1;
} else if (*s<= 0xDF) {
if ((s[1] & 0xC0) != 0x80) return -1;
ret = ((s[0] & 0x1F) << 6) | (s[1] & 0x3F);
*str = (char*) s+2;
return ret;
} else if (*s <= 0xEF) {
if ((s[1] & 0xC0) != 0x80) return -1;
if (*s == 0xE0 && s[1] < 0xA0) return -1;
if (*s == 0xED && s[1] > 0x9F) return -1;
if ((s[2] & 0xC0) != 0x80) return -1;
ret = ((s[0] & 0x0F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
*str = (char*) s+3;
return ret;
} else if (*s <= 0xF4) {
if ((s[1] & 0xC0) != 0x80) return -1;
if (*s == 0xF0 && s[1] < 0x90) return -1;
if (*s == 0xF4 && s[1] > 0x8F) return -1;
if ((s[2] & 0xC0) != 0x80) return -1;
if ((s[3] & 0xC0) != 0x80) return -1;
ret = ((s[0] & 0x0F) << 18) | ((s[1] & 0x3F) << 12) | ((s[2] & 0x3F) << 6) | (s[3] & 0x3F);
*str = (char*) s+4;
return ret;
}
return ret;
}
/* only utf16 surrogate pairs (0xD800-0xDFFF) are invalid unicode
* codepoints
*/
static inline int mini_utf8_encode(int cp, char *str, int len)
{
unsigned char *s = (unsigned char*) str;
if (cp <= 0x7F) {
if (len < 1) return 0;
*s = (cp & 0x7F);
return 1;
} else if (cp <= 0x7FF) {
if (len < 2) return 0;
*s++ = (cp >> 6) | 0xC0;
*s = (cp & 0x3F) | 0x80;
return 2;
} else if (cp <= 0xFFFF) {
if (0xD800 <= cp && cp <= 0xDFFF) return -1;
if (len < 3) return 0;
*s++ = (cp >> 12) | 0xE0;
*s++ = ((cp >> 6) & 0x3F) | 0x80;
*s = (cp & 0x3F) | 0x80;
return 3;
} else if (cp <= 0x10FFFF) {
if (len < 4) return 0;
*s++ =(cp >> 18) | 0xF0;
*s++ =((cp >> 12) & 0x3F) | 0x80;
*s++ =((cp >> 6) & 0x3F) | 0x80;
*s =(cp & 0x3F) | 0x80;
return 4;
}
return -1;
}
static inline int mini_utf8_strlen(const char *str)
{
const char *s = str;
int len = 0;
int ok = mini_utf8_decode(&s);
while (ok > 0) {
++len;
ok = mini_utf8_decode(&s);
}
if (ok == 0)
return len;
return -1;
}
static inline int mini_utf8_byteoffset(const char *str, int cpno)
{
const char *s = str;
int cnt = 0;
int ok = 1;
for (cnt = 0; (cnt < cpno) && (ok > 0); ++cnt) {
ok = mini_utf8_decode(&s);
}
if (ok > 0)
return (int)(s - str);
return -1;
}
#endif /* _mini_utf8 */