[AD] utf16, file functions

[ Thread Index | Date Index | More lists.liballeg.org/allegro-developers Archives ]


I made a patch to add utf16 conversion as requested on the bug tracker.
Is the API like you expected? Basically two functions:

ALLEGRO_USTR *al_ustr_new_from_utf16(const char *s, size_t size)

to go along with

ALLEGRO_USTR *al_ustr_new_from_buffer(const char *s, size_t size)

and

char *al_utf16_dup(const ALLEGRO_USTR *us)

to go along with

char *al_cstr_dup(const ALLEGRO_USTR *us)

There is no explicit support for UTF-16BE or UTF-16LE; only the native
byte order is handled. I assume filenames will always be returned in
native order already - and that's probably the only use for these
functions. Otherwise it would be trivial to add 4 more functions with
explicit endianness...


I also wanted to replace the remaining uses of uconvert with this, but
found that I'm confused about the state of our files API. Specifically:

- What are src/win/wfile.c and src/unix/ufile.c? Are they superseded by
src/fshook_stdio.c and can be removed?

- What is the addons/packfile directory for? At first glance it looks
like it was accidentally created during the fshook merge and contains a
copy of the old A4 API, so should it be removed?

-- 
Elias Pschernig <elias@xxxxxxxxxx>
Index: include/allegro5/utf8.h
===================================================================
--- include/allegro5/utf8.h	(revision 11879)
+++ include/allegro5/utf8.h	(working copy)
@@ -129,6 +129,10 @@
 AL_FUNC(size_t, al_utf8_width, (int32_t c));
 AL_FUNC(size_t, al_utf8_encode, (char s[], int32_t c));
 
+/* UTF-16 conversion; only the native byte order is handled. */
+AL_FUNC(ALLEGRO_USTR *, al_ustr_new_from_utf16, (const char *s, size_t size));
+AL_FUNC(char *, al_utf16_dup, (const ALLEGRO_USTR *us));
+
 #ifdef __cplusplus
    }
 #endif
Index: src/utf8.c
===================================================================
--- src/utf8.c	(revision 11879)
+++ src/utf8.c	(working copy)
@@ -1005,4 +1005,142 @@
    return 0;
 }
 
+
+/* Function: al_utf16_width
+ *  Return the number of bytes code point c occupies when encoded as UTF-16:
+ *  2 for the Basic Multilingual Plane, 4 for the supplementary planes, and
+ *  0 if c is negative or greater than 0x10ffff.
+ */
+size_t al_utf16_width(int32_t c)
+{
+   /* Converting to unsigned turns negative values into huge ones, so they
+    * fall through to the illegal case without a separate sign check.
+    */
+   uint32_t uc = c;
+
+   /* We do not check for invalid code points (e.g. lone surrogates). */
+   if (uc <= 0xffff)
+      return 2;
+   if (uc <= 0x10ffff)
+      return 4;
+
+   /* The rest are illegal. */
+   return 0;
+}
+
+
+/* Function: al_utf16_encode
+ *  Encode code point c as UTF-16 in native byte order and store it at s,
+ *  which must have room for up to 4 bytes. Returns the number of bytes
+ *  written (2 or 4), or 0 without writing anything if c is negative or
+ *  greater than 0x10ffff.
+ */
+size_t al_utf16_encode(char s[], int32_t c)
+{
+   uint32_t uc = c;
+   uint16_t units[2];
+   const char *bytes = (const char *)units;
+   size_t size, i;
+
+   if (uc <= 0xffff) {
+      units[0] = uc;
+      size = 2;
+   }
+   else if (uc <= 0x10ffff) {
+      uint32_t u_ = uc - 0x10000;
+      units[0] = 0xd800 | (u_ >> 10);
+      units[1] = 0xdc00 | (u_ & 0x3ff);
+      size = 4;
+   }
+   else {
+      /* Otherwise is illegal. */
+      return 0;
+   }
+
+   /* Note: We always assume the native endianness here. The code units are
+    * copied out bytewise because s is a char buffer and need not be
+    * suitably aligned for a direct uint16_t store.
+    */
+   for (i = 0; i < size; i++)
+      s[i] = bytes[i];
+   return size;
+}
+
+
+/* Decode one code point from the native-endian UTF-16 code units at s, of
+ * which n are available (n must be at least 1). Stores the code point in
+ * *c and returns the number of code units consumed (1 or 2), or 0 if s[0]
+ * is a surrogate and no second unit is available. Surrogates are not
+ * checked for correct pairing.
+ */
+static size_t _al_utf16_get(uint16_t const *s, size_t n, int *c)
+{
+   if (s[0] < 0xd800 || s[0] > 0xdfff) {
+      *c = s[0];
+      return 1;
+   }
+   if (n < 2) return 0;
+   /* The 20 payload bits are an offset from 0x10000 and have to be added:
+    * OR-ing them in loses the carry for every code point from 0x20000 up.
+    */
+   *c = 0x10000 + (((s[0] & 0x3ff) << 10) | (s[1] & 0x3ff));
+   return 2;
+}
+
+
+/* Function: al_ustr_new_from_utf16
+ *  Create a new string from size bytes of UTF-16 data in native byte order
+ *  starting at s. If size is odd the last byte is ignored, and conversion
+ *  stops early at a trailing surrogate that has no following code unit.
+ *  NOTE(review): s is read through a uint16_t pointer, so it presumably
+ *  has to be suitably aligned - confirm against callers.
+ */
+ALLEGRO_USTR *al_ustr_new_from_utf16(const char *s, size_t size)
+{
+   size_t i;
+   uint16_t const *us = (const void *)s;
+   ALLEGRO_USTR *ustr = al_ustr_new("");
+   /* If size is odd, this rounds down and we ignore the last byte. */
+   size_t size2 = size / 2;
+   for (i = 0; i < size2;) {
+      int c;
+      size_t n = _al_utf16_get(us + i, size2 - i, &c);
+      if (!n) break;
+      al_ustr_append_chr(ustr, c);
+      i += n;
+   }
+   return ustr;
+}
+
+
+/* Function: al_utf16_dup
+ *  Return a newly allocated copy of us converted to UTF-16 in native byte
+ *  order and terminated by a zero code unit. The buffer is the one
+ *  al_cstr_dup hands back, and the caller owns it.
+ */
+char *al_utf16_dup(const ALLEGRO_USTR *us)
+{
+   int pos = 0, sz;
+   char *ret;
+   char encoded[4];
+   ALLEGRO_USTR_INFO info;
+   // TODO: Can ALLEGRO_USTR be used to hold UTF-16?
+   ALLEGRO_USTR *utf16 = al_ustr_new("");
+   while (1) {
+      int32_t c = al_ustr_get_next(us, &pos);
+      /* Negative: end of string or invalid data (see al_ustr_get_next). */
+      if (c < 0) break;
+      sz = al_utf16_encode(encoded, c);
+      al_ustr_append(utf16, al_ref_buffer(&info, encoded, sz));
+   }
+   /* al_cstr_dup is documented to add a single NUL byte, which is only
+    * half of a UTF-16 terminator, so append a full zero code unit first.
+    */
+   sz = al_utf16_encode(encoded, 0);
+   al_ustr_append(utf16, al_ref_buffer(&info, encoded, sz));
+   ret = al_cstr_dup(utf16);
+   al_ustr_free(utf16);
+   return ret;
+}
+
 /* vim: set sts=3 sw=3 et: */
Index: examples/ex_utf8.c
===================================================================
--- examples/ex_utf8.c	(revision 11879)
+++ examples/ex_utf8.c	(working copy)
@@ -1104,6 +1104,33 @@
    al_ustr_free(us);
 }
 
+/* Test UTF-16 conversion. */
+void t50(void)
+{
+   ALLEGRO_USTR *us;
+   char utf8[] = "\xe2\x85\x9b-note: \xf0\x9d\x85\xa0, domino: \xf0\x9f\x81\xa1";
+   uint16_t *utf16;
+   /* utf8 spells U+215B, U+1D160 and U+1F061 with escapes so re-encoding
+    * this file cannot alter them. Only native byte order is supported right
+    * now, so the reference has to be uint16_t elements and not char. */
+   uint16_t utf16_ref[] = {
+      0x215b, 0x002d, 0x006e, 0x006f, 0x0074,
+      0x0065, 0x003a, 0x0020, 0xd834, 0xdd60,
+      0x002c, 0x0020, 0x0064, 0x006f, 0x006d,
+      0x0069, 0x006e, 0x006f, 0x003a, 0x0020,
+      0xd83c, 0xdc61};
+
+   us = al_ustr_new_from_utf16((char *)utf16_ref, sizeof utf16_ref);
+   CHECK(0 == strcmp(utf8, al_cstr(us)));
+   al_ustr_free(us);
+
+   us = al_ustr_new(utf8);
+   utf16 = (uint16_t *)al_utf16_dup(us);
+   CHECK(0 == memcmp(utf16, utf16_ref, sizeof utf16_ref));
+   al_ustr_free(us);
+   free(utf16); // FIXME: al_free
+}
+
 /*---------------------------------------------------------------------------*/
 
 const test_t all_tests[] =
@@ -1112,7 +1139,8 @@
    t10, t11, t12, t13, t14, t15, t16, t17, t18, t19,
    t20, t21, t22, t23, t24, t25, t26, t27, t28, t29,
    t30, t31, t32, t33, t34, t35, t36, t37, t38, t39,
-   t40, t41, t42, t43, t44, t45, t46, t47, t48, t49
+   t40, t41, t42, t43, t44, t45, t46, t47, t48, t49,
+   t50
 };
 
 #define NUM_TESTS (int)(sizeof(all_tests) / sizeof(all_tests[0]))


Mail converted by MHonArc 2.6.19+ http://listengine.tuxfamily.org/