[ Thread Index |
Date Index
| More lists.liballeg.org/allegro-developers Archives
]
On Sunday 18 June 2006 07:00, Jon Rafkind wrote:
> Could I request that you add some comments? Bit manipulation is hard
> to follow.
How's this? This version also includes checks to protect against illegal 1, 7,
and 8-byte sequence encodings.
Index: src/unicode.c
===================================================================
--- src/unicode.c (revision 5814)
+++ src/unicode.c (working copy)
@@ -319,14 +319,31 @@
if (c & 0x80) {
n = 1;
+ /* For multi-byte characters, the most-significant bits of the first byte
+ * are a run of 1's whose length denotes how many bytes are encoded here.
+ */
while (c & (0x80>>n))
n++;
+ /* Sequences longer than 6 bytes are illegal and are rejected. Strictly,
+ * anything over 4 bytes is illegal, but early versions of the standard
+ * allowed 5 and 6-byte sequences and we need to accept those. A length of
+ * 1 (a stray 10xxxxxx continuation byte as lead byte) is also illegal.
+ */
+ if ((n > 6) || (n == 1))
+ return '^';
+
+ /* Clear out the length-encoding bits, leaving only the most-significant
+ * payload bits of the value being decoded.
+ */
c &= (1<<(8-n))-1;
while (--n > 0) {
t = *((unsigned char *)(s++));
+ /* Subsequent bytes in a multi-byte sequence will have the bit format
+ * 10xxxxxx. If this is not the case, this is a badly encoded character.
+ */
if ((!(t&0x80)) || (t&0x40))
return '^';
@@ -352,6 +369,9 @@
while (c & (0x80>>n))
n++;
+ if ((n > 6) || (n == 1))
+ return '^';
+
c &= (1<<(8-n))-1;
while (--n > 0) {
@@ -376,37 +396,31 @@
*/
static int utf8_setc(char *s, int c)
{
- int size, bits, b, i;
-
- if (c < 128) {
- *s = c;
- return 1;
+ if (c >= 0) {
+ if (c <= 0x7F) {
+ s[0] = c;
+ return 1;
+ }
+ if (c <= 0x7FF) {
+ s[0] = 0xC0 | (c>>6);
+ s[1] = 0x80 | (c&0x3F);
+ return 2;
+ }
+ if (c <= 0xFFFF) {
+ s[0] = 0xE0 | (c>>12);
+ s[1] = 0x80 | ((c>>6)&0x3F);
+ s[2] = 0x80 | (c&0x3F);
+ return 3;
+ }
+ if (c <= 0x10FFFF) {
+ s[0] = 0xF0 | (c>>18);
+ s[1] = 0x80 | ((c>>12)&0x3F);
+ s[2] = 0x80 | ((c>>6)&0x3F);
+ s[3] = 0x80 | (c&0x3F);
+ return 4;
+ }
}
-
- bits = 7;
- while (c >= (1<<bits))
- bits++;
-
- size = 2;
- b = 11;
-
- while (b < bits) {
- size++;
- b += 5;
- }
-
- b -= (7-size);
- s[0] = c>>b;
-
- for (i=0; i<size; i++)
- s[0] |= (0x80>>i);
-
- for (i=1; i<size; i++) {
- b -= 6;
- s[i] = 0x80 | ((c>>b)&0x3F);
- }
-
- return size;
+ return 0;
}
@@ -417,14 +431,16 @@
static int utf8_width(AL_CONST char *s)
{
int c = *((unsigned char *)s);
- int n = 1;
if (c & 0x80) {
+ int n = 1;
while (c & (0x80>>n))
n++;
+
+ return ((n < 7) && (n > 1)) ? n : 0;
}
- return n;
+ return 1;
}
@@ -434,24 +450,17 @@
*/
static int utf8_cwidth(int c)
{
- int size, bits, b;
-
- if (c < 128)
- return 1;
-
- bits = 7;
- while (c >= (1<<bits))
- bits++;
-
- size = 2;
- b = 11;
-
- while (b < bits) {
- size++;
- b += 5;
+ if (c >= 0) {
+ if (c <= 0x7F)
+ return 1;
+ if (c <= 0x7FF)
+ return 2;
+ if (c <= 0xFFFF)
+ return 3;
+ if (c <= 0x10FFFF)
+ return 4;
}
-
- return size;
+ return 0;
}