Re: [AD] utf16, file functions

[ Thread Index | Date Index | More lists.liballeg.org/allegro-developers Archives ]

To: Coordination of admins/developers of the game programming library Allegro <alleg-developers@xxxxxxxxxx>
Subject: Re: [AD] utf16, file functions
From: Elias Pschernig <elias.pschernig@xxxxxxxxxx>
Date: Wed, 18 Mar 2009 18:20:33 +0100

OK, updated patch attached. While changing from the _dup version to the
user-buffer version, I also realized that the first patch didn't handle
terminating 0 characters properly; that should be fixed now. The
prototypes are now:

ALLEGRO_USTR *al_ustr_new_from_utf16(uint16_t const *s)
size_t al_ustr_size_utf16(const ALLEGRO_USTR *us)
size_t al_ustr_encode_utf16(const ALLEGRO_USTR *us, uint16_t *s, size_t n)

I'm not sure whether the following two should be exposed in the public
API; in the patch they are:

size_t al_utf16_width(int c)
size_t al_utf16_encode(uint16_t s[], int32_t c)

Using size_t is a bit annoying as you can't use "int" loop variables
without triggering a warning about comparing signed and unsigned - but
if we want to change that it should be throughout the API.

-- 
Elias Pschernig <elias@xxxxxxxxxx>

Index: include/allegro5/utf8.h
===================================================================
--- include/allegro5/utf8.h	(revision 11879)
+++ include/allegro5/utf8.h	(working copy)
@@ -129,6 +129,13 @@
 AL_FUNC(size_t, al_utf8_width, (int32_t c));
 AL_FUNC(size_t, al_utf8_encode, (char s[], int32_t c));
 
+/* UTF-16 */
+AL_FUNC(ALLEGRO_USTR *, al_ustr_new_from_utf16, (uint16_t const *s));
+AL_FUNC(size_t, al_ustr_size_utf16, (const ALLEGRO_USTR *us));
+AL_FUNC(size_t, al_ustr_encode_utf16, (const ALLEGRO_USTR *us, uint16_t *s, size_t n));
+AL_FUNC(size_t, al_utf16_width, (int c));
+AL_FUNC(size_t, al_utf16_encode, (uint16_t s[], int32_t c));
+
 #ifdef __cplusplus
    }
 #endif
Index: src/utf8.c
===================================================================
--- src/utf8.c	(revision 11879)
+++ src/utf8.c	(working copy)
@@ -1005,4 +1005,129 @@
    return 0;
 }
 
+
+/* Function: al_utf16_width
+ */
+size_t al_utf16_width(int c)
+{
+   /* So we don't need to check for negative values nor use unsigned ints
+    * in the interface, which are a pain.
+    */
+   uint32_t uc = c;
+
+   /* We do not check for invalid code points. */
+   if (uc <= 0xffff)
+      return 2;
+   if (uc <= 0x10ffff)
+      return 4;
+
+   /* The rest are illegal. */
+   return 0;
+}
+
+
+/* Function: al_utf16_encode
+ */
+size_t al_utf16_encode(uint16_t s[], int32_t c)
+{
+   uint32_t uc = c;
+
+   if (uc <= 0xffff) {
+      /* Note: We always assume the native endianness here. */
+      s[0] = uc;
+      return 2;
+   }
+
+   if (uc <= 0x10ffff) {
+      uint32_t u_ = uc - 0x10000;
+      /* Note: We always assume the native endianness here. */
+      s[0] = 0xd800 | (u_ >> 10);
+      s[1] = 0xdc00 | (u_ & 0x3ff);
+      return 4;
+   }
+
+   /* Otherwise is illegal. */
+   return 0;
+}
+
+
+static size_t _al_utf16_get(uint16_t const *s, int n, int *c)
+{
+   if (s[0] < 0xd800 || s[0] > 0xdfff) {
+      *c = s[0];
+      return 1;
+   }
+   if (n < 2) return 0;
+   *c = 0x10000 | ((s[0] & 0x3ff) << 10) | (s[1] & 0x3ff);
+   return 2;
+}
+
+
+/* Function: al_ustr_new_from_utf16
+ */
+ALLEGRO_USTR *al_ustr_new_from_utf16(uint16_t const *s)
+{
+   unsigned int i = 0;
+   ALLEGRO_USTR *ustr = al_ustr_new("");
+   while (1) {
+      int c;
+      /* We expect the passed string to be 0 terminated, so there are
+       * always 2 words available.
+       */
+      size_t n = _al_utf16_get(s + i, 2, &c);
+      /* Note: The string already is 0 terminated. */
+      if (c == 0) break;
+      al_ustr_append_chr(ustr, c);
+      i += n;
+   }
+   return ustr;
+}
+
+
+/* Function: al_ustr_size_utf16
+ */
+size_t al_ustr_size_utf16(const ALLEGRO_USTR *us)
+{
+   int pos = 0;
+   size_t sz = 0;
+   while (1) {
+      int32_t c = al_ustr_get_next(us, &pos);
+      if (c < 0) break;
+      sz += al_utf16_width(c);
+   }
+   /* Size of terminating 0 character - al_ustr_get_next will not
+    * return it.
+    */
+   sz += 2;
+   return sz;
+}
+
+
+/* Function: al_ustr_encode_utf16
+ */
+size_t al_ustr_encode_utf16(const ALLEGRO_USTR *us, uint16_t *s,
+   size_t n)
+{
+   int pos = 0;
+   size_t i = 0;
+   while (1) {
+      /* Used to hold one encoded UTF-16 character. */
+      uint16_t encoded[2];
+      size_t sz;
+      int32_t c = al_ustr_get_next(us, &pos);
+      if (c < 0) break;
+      sz = al_utf16_encode(encoded, c);
+      /* Need two bytes for terminating 0. */
+      if (i * 2 + sz > n - 2) break;
+      s[i++] = encoded[0];
+      if (sz == 4) s[i++] = encoded[1];
+   }
+   /* Append terminating 0 - al_ustr_get_next withheld it. */
+   if (i * 2 + 1 < n)
+      s[i++] = 0;
+
+   return i * 2;
+}
+
+
 /* vim: set sts=3 sw=3 et: */
Index: docs/src/refman/utf8.txt
===================================================================
--- docs/src/refman/utf8.txt	(revision 11879)
+++ docs/src/refman/utf8.txt	(working copy)
@@ -473,6 +473,29 @@
 Returns true iff `us1` ends with `s2`.
 
 
+# UTF-16 conversion
+
+## API: al_ustr_new_from_utf16
+
+Create a new string containing a copy of the 0-terminated string `s`
+which must be encoded as UTF-16.
+The string must eventually be freed with [al_ustr_free].
+
+## API: al_ustr_size_utf16
+
+Returns the number of bytes required to encode the string in UTF-16
+(including the terminating 0). Usually called before
+[al_ustr_encode_utf16] to determine the size of the buffer to allocate.
+
+## API: al_ustr_encode_utf16
+
+Encode the string into the given buffer, in UTF-16. Returns the number
+of bytes written. There are never more than `n` bytes written. The
+minimum size to encode the complete string can be queried with
+[al_ustr_size_utf16]. If the `n` parameter is smaller than that, the
+string will be truncated but still always 0 terminated.
+
+
 # Low-level UTF-8 routines
 
 ## API: al_utf8_width
@@ -487,7 +510,25 @@
 must have enough space to hold the encoding, which takes between 1 and 4
 bytes.  This routine will refuse to encode code points above 0x10FFFF.
 
-Returns the number of bytes written, which is the as that returned by
-[al_utf8_width].
+Returns the number of bytes written, which is the same as that returned
+by [al_utf8_width].
 
 
+# Low-level UTF-16 routines
+
+## API: al_utf16_width
+
+Returns the number of bytes that would be occupied by the specified code
+point when encoded in UTF-16. This is either 2 or 4 bytes for legal code
+point values. Otherwise returns 0.
+
+## API: al_utf16_encode
+
+Encode the specified code point to UTF-16 into the buffer `s`. The buffer
+must have enough space to hold the encoding, which takes either 2 or 4
+bytes. This routine will refuse to encode code points above 0x10FFFF.
+
+Returns the number of bytes written, which is the same as that returned
+by [al_utf16_width].
+
+
Index: examples/ex_utf8.c
===================================================================
--- examples/ex_utf8.c	(revision 11879)
+++ examples/ex_utf8.c	(working copy)
@@ -1104,6 +1104,46 @@
    al_ustr_free(us);
 }
 
+/* Test UTF-16 conversion. */
+void t50(void)
+{
+   ALLEGRO_USTR *us;
+   char utf8[] = "⅛-note: 𝅘𝅥𝅮, domino: 🁡";
+   uint16_t *utf16;
+   size_t s;
+   uint16_t small[8];
+   /* Only native byte order supported right now, so have to specify
+    * elements as uint16_t and not as char.
+    */
+   uint16_t utf16_ref[] = {
+      0x215b, 0x002d, 0x006e, 0x006f, 0x0074,
+      0x0065, 0x003a, 0x0020, 0xd834, 0xdd60,
+      0x002c, 0x0020, 0x0064, 0x006f, 0x006d,
+      0x0069, 0x006e, 0x006f, 0x003a, 0x0020,
+      0xd83c, 0xdc61, 0x0000};
+   uint16_t truncated[] = {
+      0x215b, 0x002d, 0x006e, 0x006f, 0x0074,
+      0x0065, 0x003a, 0x0000};
+
+   us = al_ustr_new_from_utf16(utf16_ref);
+   CHECK(20 == al_ustr_length(us));
+   CHECK(0 == strcmp(utf8, al_cstr(us)));
+   al_ustr_free(us);
+
+   us = al_ustr_new(utf8);
+   s = al_ustr_size_utf16(us);
+   CHECK(46 == s);
+   utf16 = malloc(s);
+   al_ustr_encode_utf16(us, utf16, s);
+   CHECK(0 == memcmp(utf16, utf16_ref, s));
+   free(utf16);
+   
+   s = al_ustr_encode_utf16(us, small, sizeof small);
+   CHECK(16 == s);
+   CHECK(0 == memcmp(truncated, small, s));
+   al_ustr_free(us);
+}
+
 /*---------------------------------------------------------------------------*/
 
 const test_t all_tests[] =
@@ -1112,7 +1152,8 @@
    t10, t11, t12, t13, t14, t15, t16, t17, t18, t19,
    t20, t21, t22, t23, t24, t25, t26, t27, t28, t29,
    t30, t31, t32, t33, t34, t35, t36, t37, t38, t39,
-   t40, t41, t42, t43, t44, t45, t46, t47, t48, t49
+   t40, t41, t42, t43, t44, t45, t46, t47, t48, t49,
+   t50
 };
 
 #define NUM_TESTS (int)(sizeof(all_tests) / sizeof(all_tests[0]))

Follow-Ups:
- Re: [AD] utf16, file functions
  - From: Evert Glebbeek
- Re: [AD] utf16, file functions
  - From: Peter Wang

References:
- [AD] utf16, file functions
  - From: Elias Pschernig
- Re: [AD] utf16, file functions
  - From: Peter Wang

Messages sorted by: [ date | thread ]
Prev by Date: Re: [AD] utf16, file functions
Next by Date: Re: [AD] utf16, file functions
Previous by thread: Re: [AD] utf16, file functions
Next by thread: Re: [AD] utf16, file functions

Mail converted by MHonArc 2.6.19+

http://listengine.tuxfamily.org/