tiny header-only utf-8 handling lib
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

441 lines
11 KiB

/* gcc -o test_check_utf8 test_check_utf8.c
*/
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "mini_utf8.h"
/* Render the lowest `bl` bits of `l` as a string of '0'/'1' characters,
 * most significant bit first.
 *
 * The result lives in a static buffer that is overwritten by the next call,
 * so it has to be consumed (e.g. printed) before to_bin() is called again.
 * `bl` is clamped to the range [0, number of bits in a long] so the static
 * buffer can never be overrun.
 */
static const char* to_bin(unsigned long l, int bl)
{
    static char binstr[sizeof(long) * 8 + 1];
    const int max_bits = (int)(sizeof(long) * 8);
    int i;

    if (bl < 0) bl = 0;
    if (bl > max_bits) bl = max_bits;

    for (i = 0; i < bl; ++i) {
        /* The mask must be an unsigned long: shifting a plain int `1` by
         * 31 or more positions is undefined behaviour. */
        binstr[i] = (l & (1UL << (bl - 1 - i))) ? '1' : '0';
    }
    binstr[bl] = 0;
    return binstr;
}
/* Round-trip test: encode every integer from 1 up to 0x11FFFF with the given
 * flags (surrogate values 0xD800..0xDFFF are skipped, values the encoder
 * rejects are left out), then decode the buffer again and compare against
 * the list of values that were encoded. The encoded buffer is afterwards
 * used to exercise strlen, byteoffset and check_encoding, the latter also on
 * a deliberately truncated copy of the data.
 *
 * Returns 0 if allocation fails or if the decode, strlen or byteoffset check
 * fails. Otherwise returns non-zero unless the final check_encoding call on
 * the truncated buffer yields a positive value.
 */
int check_roundtrip(int flags)
{
    printf("\n--- roundtrip test with flags %s...\n", to_bin(flags, 8));
    int size = 0x11FFFF;
    int l = size * 4 + 1, i = 0, ok = 1, cp = 0, bo = 0;
    int result = 0;
    int *ibuf = calloc(size, sizeof *ibuf);   /* code points that were encoded */
    char *cbuf = calloc(l, sizeof *cbuf);     /* encoded bytes plus terminator */
    char *str = cbuf;

    if (!ibuf || !cbuf) {
        printf("roundtrip test failed: out of memory\n");
        goto done;
    }

    /* call on the still-empty buffer; the result is not used */
    mini_utf8_check_encoding_f(str, flags);

    /* generate */
    while (cp < size) {
        cp = cp + 1;
        if (cp >= 0xD800 && cp <= 0xDFFF) continue;
        int n = mini_utf8_encode_f(cp, str, l, flags);
        if (n > 0) {
            l -= n;
            str += n;
            ibuf[i++] = cp;
        }
    }
    *str = 0;
    size = i;   /* from here on: number of code points actually encoded */
    printf("encoding range from 1 to 0x11FFFF (%d) yielded 0x%X (%d) valid code points.\n", 0x11FFFF, size, size);

    /* check */
    str = cbuf;
    for (i = 0; ok && (i < size); ++i) {
        cp = mini_utf8_decode_f((const char**)&str, flags);
        ok = (cp == ibuf[i]);
    }
    /* decoding must have consumed the buffer exactly up to the terminator */
    ok = ok && (*str == 0);
    if (!ok) {
        printf("roundtrip test failed at codepoint %d (%06X)\n", i, i);
        goto done;
    }

    ok = ok && (mini_utf8_strlen_f(cbuf, flags) == size);
    if (!ok) {
        printf("roundtrip test failed at strlen (%d != %d)\n", mini_utf8_strlen_f(cbuf, flags), size);
        goto done;
    }

    /* the character found at the reported byte offset must be the expected one */
    bo = mini_utf8_byteoffset_f(cbuf, size / 3, flags);
    str = cbuf + bo;
    cp = mini_utf8_decode_f((const char**)&str, flags);
    ok = cp == ibuf[size / 3];
    if (!ok) {
        printf("roundtrip test failed at byteoffset for %d: %d\n", size / 3, bo);
        goto done;
    }
    printf("roundtrip test succeeded.\n");

    ok = mini_utf8_check_encoding_f(cbuf, flags);
    if (ok == 0)
        printf("test data is utf8 encoded.\n");
    else if (ok == 1)
        printf("test data is ascii encoded.\n");
    else if (ok == -1)
        printf("test data is not utf8 encoded.\n");

    /* cut the string off one byte after the start of the character at index
     * size / 2 (presumably inside a multi-byte sequence) */
    bo = mini_utf8_byteoffset_f(cbuf, size / 2, flags);
    cbuf[bo + 1] = 0; /* definitive error! */
    ok = mini_utf8_check_encoding_f(cbuf, flags);
    if (ok == 0)
        printf("garbled test data is utf8 encoded.\n");
    else if (ok == 1)
        printf("garbled test data is ascii encoded.\n");
    else if (ok == -1)
        printf("garbled test data is not utf8 encoded.\n");
    result = ok <= 0;

done:
    /* single exit point so both buffers are released on every path */
    free(cbuf);
    free(ibuf);
    return result;
}
/* Write `cp` to `str` using the UTF-8 bit layout without any validity checks
 * (surrogates and values up to 0x1FFFFF are encoded as well).
 *
 * Returns the number of bytes written (1..4), 0 if `len` is too small for
 * the sequence, or -1 if `cp` is larger than 0x1FFFFF. No terminator is
 * appended.
 */
int force_encode(int cp, char *str, int len)
{
    unsigned char *out = (unsigned char*) str;
    int need;

    /* pick the sequence length from the magnitude of the value */
    if (cp <= 0x7F)
        need = 1;
    else if (cp <= 0x7FF)
        need = 2;
    else if (cp <= 0xFFFF)
        need = 3;
    else if (cp <= 0x1FFFFF)
        need = 4;
    else
        return -1;

    if (len < need)
        return 0;

    switch (need) {
    case 1:
        out[0] = (unsigned char)(cp & 0x7F);
        break;
    case 2:
        out[0] = (unsigned char)((cp >> 6) | 0xC0);
        out[1] = (unsigned char)((cp & 0x3F) | 0x80);
        break;
    case 3:
        out[0] = (unsigned char)((cp >> 12) | 0xE0);
        out[1] = (unsigned char)(((cp >> 6) & 0x3F) | 0x80);
        out[2] = (unsigned char)((cp & 0x3F) | 0x80);
        break;
    default: /* 4 */
        out[0] = (unsigned char)((cp >> 18) | 0xF0);
        out[1] = (unsigned char)(((cp >> 12) & 0x3F) | 0x80);
        out[2] = (unsigned char)(((cp >> 6) & 0x3F) | 0x80);
        out[3] = (unsigned char)((cp & 0x3F) | 0x80);
        break;
    }
    return need;
}
/* Consistency test between encoder, checker and decoder for every value from
 * 1 to 0x1FFFFF: either all three accept a value (and decoding yields the
 * value again) or all three reject it. Values the encoder refuses are written
 * with force_encode() so that checker and decoder still get input to reject.
 *
 * Returns 1 if all values behaved consistently, 0 otherwise.
 */
int check_decoding(int flags)
{
    const int last_cp = 0x1FFFFF;
    int all_agree = 1;
    int cp;

    printf("\n--- decoding test with flags %s...\n", to_bin(flags, 8));

    for (cp = 1; cp <= last_cp; ++cp) {
        char bytes[8] = {0};          /* fresh, zeroed buffer for every value */
        const char *cursor = bytes;

        int enc = mini_utf8_encode_f(cp, bytes, 8, flags);
        if (enc <= 0) {
            force_encode(cp, bytes, 8);
        }
        int chk = mini_utf8_check_encoding_f(bytes, flags);
        int dec = mini_utf8_decode_f(&cursor, flags);
        /* only compare the decoded value when the decoder reported one */
        int same = (dec > 0) ? (dec == cp) : 1;

        /* no proper surrogate pairs, so this should fail. */
        if (cp >= 0xD800 && cp <= 0xDFFF && enc) {
            enc = -1;
        }

        int accepted = (enc > 0 && chk >= 0 && dec >= 0 && same);
        int rejected = (enc < 0 && chk < 0 && dec < 0);
        if (!accepted && !rejected) {
            printf("encoding, checking and decoding disagree on cp %0X: %d, %d, %d, %d\n", cp, enc, chk, dec, same);
            all_agree = 0;
        }
    }
    return all_agree;
}
/* Encode `cp` into `str`; values of 0x10000 and above are split into two
 * halves that are offset into the surrogate ranges (0xD800 / 0xDC00) and
 * encoded one after the other with MINI_UTF8_ENC_SURROGATES. Smaller values
 * are passed straight to the strict encoder.
 *
 * Returns 0 if `size` is below 6, the encoder's result for values below
 * 0x10000, the combined byte count of both halves on success, or -1 if
 * either half could not be encoded.
 *
 * NOTE(review): standard UTF-16 subtracts 0x10000 from the code point before
 * splitting it into 10-bit halves; this helper does not. Confirm against the
 * decoder in mini_utf8.h before changing it.
 */
int encode_as_surrogates(int cp, char* str, int size)
{
    if (size < 6)
        return 0;

    if (cp < 0x10000L)
        return mini_utf8_encode_f(cp, str, size, MINI_UTF8_STRICT);

    const int lead = ((cp >> 10) & 0x7FF) + 0xD800;
    const int trail = (cp & 0x3FF) + 0xDC00;

    int lead_len = mini_utf8_encode_f(lead, str, size, MINI_UTF8_ENC_SURROGATES);
    if (lead_len <= 0)
        return -1;

    int trail_len = mini_utf8_encode_f(trail, str + lead_len, size - lead_len, MINI_UTF8_ENC_SURROGATES);
    if (trail_len <= 0)
        return -1;

    return lead_len + trail_len;
}
/* Surrogate handling test.
 *
 * Part 1: every value from 0x10000 to 0x10FFFF is written as a surrogate
 * pair via encode_as_surrogates() and must decode back to the same value
 * with MINI_UTF8_DEC_SURROGATES; check_encoding with the same flag must not
 * report an error for it.
 *
 * Part 2: every single value from 0xD800 to 0xDFFF is encoded on its own
 * with MINI_UTF8_ENC_SURROGATES. Decoding and checking it is expected to
 * fail (negative result) in STRICT and LIBERAL mode and to succeed in
 * EXTRALIBERAL mode.
 *
 * Returns 1 if everything behaved as expected, 0 otherwise.
 */
int check_surrogates(void)
{
    printf("\n--- surrogate decoding test\n");
    char buf[8] = {0};
    /* one byte is held back from the encoders so the terminator written
     * after the encoded data always fits */
    const int cap = (int)sizeof(buf) - 1;
    const char *s;
    int min = 0x10000, max = 0x10FFFF;
    int cp, ok1, ok2, ok3, ok4, ok = 1;

    for (cp = min; cp <= max; ++cp) {
        ok2 = 0;
        ok1 = encode_as_surrogates(cp, &buf[0], cap);
        if (ok1 > 0) {
            /* terminate before decoding so the decoder never sees stale bytes */
            buf[ok1] = 0;
            s = buf;
            ok2 = mini_utf8_decode_f(&s, MINI_UTF8_DEC_SURROGATES);
        }
        if (ok1 <= 0 || ok2 != cp) {
            printf("utf8 surrogate decoding test failed for cp %06X\n", cp);
            ok = 0;
        } else {
            ok2 = mini_utf8_check_encoding_f(buf, MINI_UTF8_DEC_SURROGATES);
            if (ok2 < 0) {
                printf("utf8 surrogate checking test failed for cp %06X\n", cp);
                ok = 0;
            }
        }
    }

    for (cp = 0xD800; cp <= 0xDFFF; ++cp) {
        ok1 = mini_utf8_encode_f(cp, &buf[0], cap, MINI_UTF8_ENC_SURROGATES);
        if (ok1 > 0) {
            buf[ok1] = 0;
            /* The decoder receives &s and may move it, so rewind to the start
             * of the buffer before each mode is tried; otherwise the later
             * calls might not look at the encoded surrogate at all. */
            s = buf;
            ok2 = mini_utf8_decode_f(&s, MINI_UTF8_STRICT);
            s = buf;
            ok3 = mini_utf8_decode_f(&s, MINI_UTF8_LIBERAL);
            s = buf;
            ok4 = mini_utf8_decode_f(&s, MINI_UTF8_EXTRALIBERAL);
            if (ok2 >= 0 || ok3 >= 0 || ok4 < 0) {
                printf("utf8 unpaired surrogate decoding test failed for cp %06X\n", cp);
                ok = 0;
            } else {
                ok2 = mini_utf8_check_encoding_f(buf, MINI_UTF8_STRICT);
                ok3 = mini_utf8_check_encoding_f(buf, MINI_UTF8_LIBERAL);
                ok4 = mini_utf8_check_encoding_f(buf, MINI_UTF8_EXTRALIBERAL);
                if (ok2 >= 0 || ok3 >= 0 || ok4 < 0) {
                    printf("utf8 unpaired surrogate checking test failed for cp %06X\n", cp);
                    ok = 0;
                }
            }
        }
    }
    return ok;
}
/* Iteration test: fill a buffer with the encodings of 1..0x10FFFF (surrogate
 * values skipped, rejected values left out), then walk over it
 *   - forwards with nextchar,
 *   - backwards with prevchar,
 *   - backwards with charstart,
 * each time for exactly as many steps as characters were encoded, and verify
 * that the walk ends at the terminator / at the start of the buffer.
 *
 * If MINI_UTF8_ENC_SURROGATES is set in `flags`, characters are written with
 * encode_as_surrogates() instead of the plain encoder.
 *
 * Returns 1 on success, 0 on failure (including allocation failure).
 */
int check_iter(int flags)
{
    printf("\n--- iteration test with flags %s...\n", to_bin(flags, 8));
    int size = 0x10ffff;
    int i, cp, nb = size * 4, nc = 0;
    /* nb is the capacity offered to the encoders; one extra byte is
     * allocated so the terminator fits even if they use all of it */
    char *buf = malloc((size_t)nb + 1);
    char *p = buf, *max;
    int ok = 1;

    if (!buf) {
        printf("iteration test failed: out of memory\n");
        return 0;
    }

    for (cp = 1; cp <= size; ++cp) {
        if (cp < 0xD800 || cp > 0xDFFF) {
            if (flags & MINI_UTF8_ENC_SURROGATES) {
                i = encode_as_surrogates(cp, p, nb);
            } else {
                i = mini_utf8_encode_f(cp, p, nb, flags);
            }
            if (i > 0) {
                nb -= i;
                p += i;
                nc++;
            }
        }
    }
    *p = 0;
    max = p - 1;   /* last byte of the encoded data */

    /* forward: nc steps must land exactly on the terminator */
    p = buf;
    for (i = 0; i < nc; ++i) {
        cp = mini_utf8_nextchar_f((const char**) &p, flags);
        if (cp < 0) {
            printf("forward iterating failed at pos %d\n", i);
            ok = 0;
            break;
        }
    }
    if (*p) {
        printf("forward iterating failed.\n");
        ok = 0;
    }

    /* backward: nc steps from the last byte must land on the buffer start */
    p = max;
    for (i = 0; i < nc; ++i) {
        cp = mini_utf8_prevchar_f((const char**) &p, buf, flags);
        if (cp < 0) {
            printf("backward iterating failed at pos %d\n", nc - i);
            ok = 0;
            break;
        }
    }
    if (p != buf) {
        printf("backward iterating failed.\n");
        ok = 0;
    }

    /* charstart: find the start of the character containing p, then step to
     * the byte just before it and repeat */
    p = max;
    for (i = 0; i < nc; ++i) {
        const char* np = mini_utf8_charstart_f(p, buf, flags);
        if (!np) {
            printf("charstart iterating failed at pos %d\n", nc - i);
            ok = 0;
            break;
        }
        p = (char*)np - 1;
    }
    if (p+1 != buf) {
        printf("charstart iterating failed.\n");
        ok = 0;
    }

    free(buf);
    return ok;
}
/* Timing run: encode 1..0x10FFFF (surrogate values skipped) 100 times into
 * one large buffer and print how long encoding, checking, decoding, forward
 * iteration and backward iteration over that buffer take.
 *
 * The buffer is several hundred megabytes large; if it cannot be allocated
 * the test reports that and returns 0. Otherwise it always returns 1, since
 * only timings are printed and no results are verified.
 */
int check_speed(int flags)
{
    printf("\n--- speed test with flags %s...\n", to_bin(flags, 8));
    int size = 0x10ffff;
    int factor = 100;
    int i, cp, nb = size * factor * 4, nc = 0;
    /* one extra byte so the terminator always has room */
    char *buf = malloc((size_t)nb + 1);
    char *p = buf, *max;

    if (!buf) {
        printf("speed test failed: out of memory\n");
        return 0;
    }

    double t0 = clock();
    for (i = 0; i < factor; ++i) {
        for (cp = 1; cp <= size; ++cp) {
            if (cp < 0xD800 || cp > 0xDFFF) {
                int n = mini_utf8_encode_f(cp, p, nb, flags);
                if (n > 0) {
                    p += n;
                    nb -= n;
                    ++nc;
                }
            }
        }
    }
    double t1 = clock();
    printf("encoding: %d chars in %f seconds\n", nc, (t1 - t0) / CLOCKS_PER_SEC);

    nb = p - buf;
    *p = 0;
    max = p - 1;   /* last byte of the encoded data */

    t0 = clock();
    int ok = mini_utf8_check_encoding_f(buf, flags);
    t1 = clock();
    printf("checking: %f seconds\n", (t1 - t0) / CLOCKS_PER_SEC);

    /* decode until the decoder returns 0 */
    t0 = clock();
    p = buf;
    do {
        ok = mini_utf8_decode_f((const char**)&p, flags);
    } while (ok);
    t1 = clock();
    printf("decoding: %f seconds\n", (t1 - t0) / CLOCKS_PER_SEC);

    t0 = clock();
    p = buf;
    do {
        ok = mini_utf8_nextchar_f((const char**)&p, flags);
    } while (ok);
    t1 = clock();
    printf("nextchar: %f seconds\n", (t1 - t0) / CLOCKS_PER_SEC);

    /* walk backwards from the last byte while prevchar returns a positive value */
    t0 = clock();
    p = max;
    do {
        ok = mini_utf8_prevchar_f((const char**)&p, buf, flags);
    } while (ok > 0);
    t1 = clock();
    printf("prevchar: %f seconds\n", (t1 - t0) / CLOCKS_PER_SEC);

    free(buf);
    return 1;
}
/* Print the one-line verdict for a finished test, including the CPU time
 * elapsed since `t0` (a clock() value). Returns 1 if `ok` is non-zero,
 * 0 otherwise, so the result can be added to a success counter. */
static int report_result(const char *name, int ok, double t0)
{
    double t1 = clock();
    printf("%s %s in %g seconds\n", name, ok ? "succeeded" : "failed", (t1 - t0) / CLOCKS_PER_SEC);
    return ok != 0;
}

/* Run all tests in sequence, print a summary and exit with EXIT_SUCCESS only
 * if every single test succeeded. */
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;
    int all = 0, succeeded = 0;
    double t0;

    t0 = clock();
    succeeded += report_result("check_roundtrip", check_roundtrip(MINI_UTF8_DEFAULT), t0);
    ++all;

    t0 = clock();
    succeeded += report_result("check_roundtrip", check_roundtrip(MINI_UTF8_STRICT), t0);
    ++all;

    t0 = clock();
    succeeded += report_result("check_roundtrip", check_roundtrip(MINI_UTF8_LIBERAL), t0);
    ++all;

    t0 = clock();
    succeeded += report_result("check_decoding", check_decoding(MINI_UTF8_STRICT), t0);
    ++all;

    t0 = clock();
    succeeded += report_result("check_decoding", check_decoding(MINI_UTF8_LIBERAL), t0);
    ++all;

    t0 = clock();
    succeeded += report_result("check_surrogates", check_surrogates(), t0);
    ++all;

    t0 = clock();
    succeeded += report_result("check_iter", check_iter(MINI_UTF8_LIBERAL), t0);
    ++all;

    t0 = clock();
    succeeded += report_result("check_iter", check_iter(MINI_UTF8_STRICT), t0);
    ++all;

    t0 = clock();
    succeeded += report_result("check_speed", check_speed(MINI_UTF8_STRICT), t0);
    ++all;

    t0 = clock();
    succeeded += report_result("check_speed", check_speed(MINI_UTF8_LIBERAL), t0);
    ++all;

    printf("\n%d out of %d tests succeeded.\n", succeeded, all);

    /* the exit status reflects the whole run, not just the last test */
    return (succeeded == all) ? EXIT_SUCCESS : EXIT_FAILURE;
}