Re: [AD] UTF-8 patch

[ Thread Index | Date Index | More lists.liballeg.org/allegro-developers Archives ]


On Sunday 18 June 2006 07:00, Jon Rafkind wrote:
> Could I request that you add some comments? Bit manipulation is hard
> to follow.

How's this? This version also includes checks to protect against illegal
1-, 7-, and 8-byte sequence encodings.
Index: src/unicode.c
===================================================================
--- src/unicode.c	(revision 5814)
+++ src/unicode.c	(working copy)
@@ -319,14 +319,31 @@
 
    if (c & 0x80) {
       n = 1;
+      /* For multi-byte characters, the most-significant bits of the first
+       * byte are a run of 1's whose length is the number of bytes encoded.
+       */
       while (c & (0x80>>n))
 	 n++;
 
+      /* Multi-byte sequences > 6 are illegal and should be ignored. Actually,
+       * sequences > 4 are illegal, but early versions of the standard allowed
+       * 5 and 6-byte sequences and we need to accept those. A length value of
+       * 1 is also illegal.
+       */
+      if ((n > 6) || (n == 1))
+	 return '^';
+
+      /* Clear out the length encoding and leave the MSBs of the soon-to-be-
+       * decoded value
+       */
       c &= (1<<(8-n))-1;
 
       while (--n > 0) {
 	 t = *((unsigned char *)(s++));
 
+	 /* Subsequent bytes in a multi-byte sequence will have the bit format
+	  * 10xxxxxx. If this is not the case, this is a badly encoded character.
+	  */
 	 if ((!(t&0x80)) || (t&0x40))
 	    return '^';
 
@@ -352,6 +369,9 @@
       while (c & (0x80>>n))
 	 n++;
 
+      if ((n > 6) || (n == 1))
+	 return '^';
+
       c &= (1<<(8-n))-1;
 
       while (--n > 0) {
@@ -376,37 +396,31 @@
  */
 static int utf8_setc(char *s, int c)
 {
-   int size, bits, b, i;
-
-   if (c < 128) {
-      *s = c;
-      return 1;
+   if (c >= 0) {
+      if (c <= 0x7F) {
+	 s[0] = c;
+	 return 1;
+      }
+      if (c <= 0x7FF) {
+	 s[0] = 0xC0 | (c>>6);
+	 s[1] = 0x80 | (c&0x3F);
+	 return 2;
+      }
+      if (c <= 0xFFFF) {
+	 s[0] = 0xE0 | (c>>12);
+	 s[1] = 0x80 | ((c>>6)&0x3F);
+	 s[2] = 0x80 | (c&0x3F);
+	 return 3;
+      }
+      if (c <= 0x10FFFF) {
+	 s[0] = 0xF0 | (c>>18);
+	 s[1] = 0x80 | ((c>>12)&0x3F);
+	 s[2] = 0x80 | ((c>>6)&0x3F);
+	 s[3] = 0x80 | (c&0x3F);
+	 return 4;
+      }
    }
-
-   bits = 7;
-   while (c >= (1<<bits))
-      bits++;
-
-   size = 2;
-   b = 11;
-
-   while (b < bits) {
-      size++;
-      b += 5;
-   }
-
-   b -= (7-size);
-   s[0] = c>>b;
-
-   for (i=0; i<size; i++)
-      s[0] |= (0x80>>i);
-
-   for (i=1; i<size; i++) {
-      b -= 6;
-      s[i] = 0x80 | ((c>>b)&0x3F);
-   }
-
-   return size;
+   return 0;
 }
 
 
@@ -417,14 +431,16 @@
 static int utf8_width(AL_CONST char *s)
 {
    int c = *((unsigned char *)s);
-   int n = 1;
 
    if (c & 0x80) {
+      int n = 1;
       while (c & (0x80>>n))
 	 n++;
+
+      return ((n < 7) && (n > 1)) ? n : 0;
    }
 
-   return n;
+   return 1;
 }
 
 
@@ -434,24 +450,17 @@
  */
 static int utf8_cwidth(int c)
 {
-   int size, bits, b;
-
-   if (c < 128)
-      return 1;
-
-   bits = 7;
-   while (c >= (1<<bits))
-      bits++;
-
-   size = 2;
-   b = 11;
-
-   while (b < bits) {
-      size++;
-      b += 5;
+   if (c >= 0) {
+      if (c <= 0x7F)
+	 return 1;
+      if (c <= 0x7FF)
+	 return 2;
+      if (c <= 0xFFFF)
+	 return 3;
+      if (c <= 0x10FFFF)
+	 return 4;
    }
-
-   return size;
+   return 0;
 }
 
 


Mail converted by MHonArc 2.6.19+ http://listengine.tuxfamily.org/