Re: [AD] utf16, file functions |
[ Thread Index |
Date Index
| More lists.liballeg.org/allegro-developers Archives
]
Ok, updated patch attached. Changing from the _dup version to the user
buffer version I also realized that the first patch didn't deal with
terminating 0 characters properly, should be fixed now. Prototypes now
are:
ALLEGRO_USTR *al_ustr_new_from_utf16(uint16_t const *s)
size_t al_ustr_size_utf16(const ALLEGRO_USTR *us)
size_t al_ustr_encode_utf16(const ALLEGRO_USTR *us, uint16_t *s, size_t n)
Not sure if the following two should be exposed to the public API, in
the patch they are:
size_t al_utf16_width(int c)
size_t al_utf16_encode(uint16_t s[], int32_t c)
Using size_t is a bit annoying as you can't use "int" loop variables
without triggering a warning about comparing signed and unsigned - but
if we want to change that it should be throughout the API.
--
Elias Pschernig <elias@xxxxxxxxxx>
Index: include/allegro5/utf8.h
===================================================================
--- include/allegro5/utf8.h (revision 11879)
+++ include/allegro5/utf8.h (working copy)
@@ -129,6 +129,13 @@
AL_FUNC(size_t, al_utf8_width, (int32_t c));
AL_FUNC(size_t, al_utf8_encode, (char s[], int32_t c));
+/* UTF-16 conversion; sizes are counted in bytes, words use the native byte order. */
+AL_FUNC(ALLEGRO_USTR *, al_ustr_new_from_utf16, (uint16_t const *s));
+AL_FUNC(size_t, al_ustr_size_utf16, (const ALLEGRO_USTR *us));
+AL_FUNC(size_t, al_ustr_encode_utf16, (const ALLEGRO_USTR *us, uint16_t *s, size_t n));
+AL_FUNC(size_t, al_utf16_width, (int c));
+AL_FUNC(size_t, al_utf16_encode, (uint16_t s[], int32_t c));
+
#ifdef __cplusplus
}
#endif
Index: src/utf8.c
===================================================================
--- src/utf8.c (revision 11879)
+++ src/utf8.c (working copy)
@@ -1005,4 +1005,129 @@
return 0;
}
+
+/* Function: al_utf16_width
+ *
+ * Number of bytes `c` occupies when encoded as UTF-16: 2 or 4, or 0 when
+ * `c` lies outside the encodable range.
+ */
+size_t al_utf16_width(int c)
+{
+   /* Viewed as unsigned, a negative input becomes a huge value, so the
+    * single upper-bound test below rejects it as well and the interface
+    * can stay signed.
+    */
+   const uint32_t code = c;
+
+   /* No attempt is made to weed out invalid code points within range. */
+   if (code > 0x10ffff)
+      return 0;
+
+   return (code > 0xffff) ? 4 : 2;
+}
+
+
+/* Function: al_utf16_encode
+ *
+ * Store the UTF-16 encoding of `c` in `s` and return the number of bytes
+ * written (2 or 4), or 0 without touching `s` when `c` cannot be encoded.
+ * Words are always written in the native byte order.
+ */
+size_t al_utf16_encode(uint16_t s[], int32_t c)
+{
+   const uint32_t code = c;
+
+   /* Negative inputs wrap to large unsigned values and end up here too. */
+   if (code > 0x10ffff)
+      return 0;
+
+   if (code > 0xffff) {
+      /* Supplementary plane: split the 20-bit offset into a surrogate pair. */
+      const uint32_t offset = code - 0x10000;
+      s[0] = 0xd800 | (offset >> 10);
+      s[1] = 0xdc00 | (offset & 0x3ff);
+      return 4;
+   }
+
+   /* Basic plane: the code point is its own single word. */
+   s[0] = code;
+   return 2;
+}
+
+
+/* Decode one code point from the UTF-16 words at `s` (native byte order).
+ *
+ * `n` is the number of words readable at `s` and must be at least 1.
+ * The decoded code point is stored in `*c` and the number of words
+ * consumed (1 or 2) is returned. An unpaired surrogate decodes to U+FFFD
+ * and consumes a single word, so the word following it (possibly the
+ * string terminator) is never swallowed. Returns 0 and leaves `*c`
+ * untouched when a lead surrogate is the last available word.
+ */
+static size_t _al_utf16_get(uint16_t const *s, int n, int *c)
+{
+   /* Anything outside the surrogate range is a complete code point. */
+   if (s[0] < 0xd800 || s[0] > 0xdfff) {
+      *c = s[0];
+      return 1;
+   }
+
+   /* A trail surrogate cannot start a pair. */
+   if (s[0] >= 0xdc00) {
+      *c = 0xfffd;
+      return 1;
+   }
+
+   /* Lead surrogate, but the second word is not available. */
+   if (n < 2) return 0;
+
+   /* Lead surrogate not followed by a trail surrogate: replace only the
+    * lead and let the next call look at s[1] on its own.
+    */
+   if (s[1] < 0xdc00 || s[1] > 0xdfff) {
+      *c = 0xfffd;
+      return 1;
+   }
+
+   /* The 20 payload bits are an offset from 0x10000. It has to be added,
+    * not or-ed: the offset itself can have bit 16 set (U+20000 and up).
+    */
+   *c = 0x10000 + (((s[0] & 0x3ff) << 10) | (s[1] & 0x3ff));
+   return 2;
+}
+
+
+/* Function: al_ustr_new_from_utf16
+ *
+ * Build a new string from the 0-terminated UTF-16 string `s`.
+ */
+ALLEGRO_USTR *al_ustr_new_from_utf16(uint16_t const *s)
+{
+   ALLEGRO_USTR *result = al_ustr_new("");
+   unsigned int offset = 0;
+   int code;
+
+   /* The input is expected to be 0 terminated, so two words can be
+    * offered to the decoder at every position we visit.
+    */
+   for (;;) {
+      size_t used = _al_utf16_get(s + offset, 2, &code);
+
+      /* Stop at the terminator; the new string already carries its own. */
+      if (code == 0)
+         break;
+
+      al_ustr_append_chr(result, code);
+      offset += used;
+   }
+
+   return result;
+}
+
+
+/* Function: al_ustr_size_utf16
+ *
+ * Number of bytes needed to hold `us` encoded as UTF-16, terminating 0
+ * included.
+ */
+size_t al_ustr_size_utf16(const ALLEGRO_USTR *us)
+{
+   /* Start from the two bytes of the terminating 0, which
+    * al_ustr_get_next will not hand out.
+    */
+   size_t total = 2;
+   int offset = 0;
+   int32_t code;
+
+   /* A negative result ends the walk over the string. */
+   while ((code = al_ustr_get_next(us, &offset)) >= 0)
+      total += al_utf16_width(code);
+
+   return total;
+}
+
+
+/* Function: al_ustr_encode_utf16
+ *
+ * Encode `us` as UTF-16 (native byte order) into `s`, a buffer of `n`
+ * bytes. Never writes more than `n` bytes. The output is truncated at a
+ * character boundary if it does not fit, and is 0 terminated whenever
+ * `n` is at least 2. With `n` below 2 nothing is written at all.
+ * Returns the number of bytes written, terminator included.
+ */
+size_t al_ustr_encode_utf16(const ALLEGRO_USTR *us, uint16_t *s,
+   size_t n)
+{
+   int pos = 0;
+   size_t i = 0;   /* Number of 16-bit words written so far. */
+   while (1) {
+      /* Used to hold one encoded UTF-16 character. */
+      uint16_t encoded[2];
+      size_t sz;
+      int32_t c = al_ustr_get_next(us, &pos);
+      if (c < 0) break;
+      sz = al_utf16_encode(encoded, c);
+      /* Nothing was encoded, so `encoded` holds no data to copy. Skipping
+       * matches al_utf16_width, which reports 0 bytes for such values.
+       */
+      if (sz == 0) continue;
+      /* Keep two bytes back for the terminating 0. Written as an addition
+       * because `n - 2` would wrap around for n < 2 and disable the check.
+       */
+      if (i * 2 + sz + 2 > n) break;
+      s[i++] = encoded[0];
+      if (sz == 4) s[i++] = encoded[1];
+   }
+   /* Append terminating 0 - al_ustr_get_next withheld it. */
+   if (i * 2 + 2 <= n)
+      s[i++] = 0;
+
+   return i * 2;
+}
+
+
/* vim: set sts=3 sw=3 et: */
Index: docs/src/refman/utf8.txt
===================================================================
--- docs/src/refman/utf8.txt (revision 11879)
+++ docs/src/refman/utf8.txt (working copy)
@@ -473,6 +473,29 @@
Returns true iff `us1` ends with `s2`.
+# UTF-16 conversion
+
+## API: al_ustr_new_from_utf16
+
+Create a new string containing a copy of the 0-terminated string `s`
+which must be encoded as UTF-16.
+The string must eventually be freed with [al_ustr_free].
+
+## API: al_ustr_size_utf16
+
+Returns the number of bytes required to encode the string in UTF-16
+(including the terminating 0). Usually called before
+[al_ustr_encode_utf16] to determine the size of the buffer to allocate.
+
+## API: al_ustr_encode_utf16
+
+Encode the string into the given buffer, in UTF-16. Returns the number
+of bytes written. There are never more than `n` bytes written. The
+minimum size to encode the complete string can be queried with
+[al_ustr_size_utf16]. If the `n` parameter is smaller than that, the
+string will be truncated but still always 0 terminated.
+
+
# Low-level UTF-8 routines
## API: al_utf8_width
@@ -487,7 +510,25 @@
must have enough space to hold the encoding, which takes between 1 and 4
bytes. This routine will refuse to encode code points above 0x10FFFF.
-Returns the number of bytes written, which is the as that returned by
-[al_utf8_width].
+Returns the number of bytes written, which is the same as that returned
+by [al_utf8_width].
+# Low-level UTF-16 routines
+
+## API: al_utf16_width
+
+Returns the number of bytes that would be occupied by the specified code
+point when encoded in UTF-16. This is either 2 or 4 bytes for legal code
+point values. Otherwise returns 0.
+
+## API: al_utf16_encode
+
+Encode the specified code point to UTF-16 into the buffer `s`. The buffer
+must have enough space to hold the encoding, which takes either 2 or 4
+bytes. This routine will refuse to encode code points above 0x10FFFF.
+
+Returns the number of bytes written, which is the same as that returned
+by [al_utf16_width].
+
+
Index: examples/ex_utf8.c
===================================================================
--- examples/ex_utf8.c (revision 11879)
+++ examples/ex_utf8.c (working copy)
@@ -1104,6 +1104,46 @@
al_ustr_free(us);
}
+/* Test UTF-16 conversion: decode a reference UTF-16 string, encode the
+ * matching UTF-8 string back, then encode into a buffer that is too small.
+ */
+void t50(void)
+{
+ ALLEGRO_USTR *us;
+ char utf8[] = "⅛-note: 𝅘𝅥𝅮, domino: 🁡";
+ uint16_t *utf16;
+ size_t s;
+ uint16_t small[8];
+ /* Only native byte order supported right now, so have to specify
+ * elements as uint16_t and not as char. The note and the domino tile
+ * each take a surrogate pair (0xd834 0xdd60 and 0xd83c 0xdc61).
+ */
+ uint16_t utf16_ref[] = {
+ 0x215b, 0x002d, 0x006e, 0x006f, 0x0074,
+ 0x0065, 0x003a, 0x0020, 0xd834, 0xdd60,
+ 0x002c, 0x0020, 0x0064, 0x006f, 0x006d,
+ 0x0069, 0x006e, 0x006f, 0x003a, 0x0020,
+ 0xd83c, 0xdc61, 0x0000};
+ uint16_t truncated[] = {
+ 0x215b, 0x002d, 0x006e, 0x006f, 0x0074,
+ 0x0065, 0x003a, 0x0000};
+
+ us = al_ustr_new_from_utf16(utf16_ref);
+ CHECK(20 == al_ustr_length(us)); /* 22 words, two of the characters are surrogate pairs */
+ CHECK(0 == strcmp(utf8, al_cstr(us)));
+ al_ustr_free(us);
+
+ us = al_ustr_new(utf8);
+ s = al_ustr_size_utf16(us);
+ CHECK(46 == s); /* 23 words of 2 bytes, terminating 0 included */
+ utf16 = malloc(s);
+ al_ustr_encode_utf16(us, utf16, s);
+ CHECK(0 == memcmp(utf16, utf16_ref, s));
+ free(utf16);
+
+ s = al_ustr_encode_utf16(us, small, sizeof small);
+ CHECK(16 == s); /* small[] holds 8 words: 7 characters plus the terminating 0 */
+ CHECK(0 == memcmp(truncated, small, s));
+ al_ustr_free(us);
+}
+
/*---------------------------------------------------------------------------*/
const test_t all_tests[] =
@@ -1112,7 +1152,8 @@
t10, t11, t12, t13, t14, t15, t16, t17, t18, t19,
t20, t21, t22, t23, t24, t25, t26, t27, t28, t29,
t30, t31, t32, t33, t34, t35, t36, t37, t38, t39,
- t40, t41, t42, t43, t44, t45, t46, t47, t48, t49
+ t40, t41, t42, t43, t44, t45, t46, t47, t48, t49,
+ t50
};
#define NUM_TESTS (int)(sizeof(all_tests) / sizeof(all_tests[0]))