dotfiles

personal configuration files and scripts
git clone https://tongong.net/git/dotfiles.git
Log | Files | Refs | README

utf8.h (2336B)


      1 /* Branchless UTF-8 decoder
      2  *
      3  * This is free and unencumbered software released into the public domain.
      4  */
      5 #ifndef UTF8_H
      6 #define UTF8_H
      7 
      8 #include <stdint.h>
      9 
     10 /* Decode the next character, C, from BUF, reporting errors in E.
     11  *
     12  * Since this is a branchless decoder, four bytes will be read from the
     13  * buffer regardless of the actual length of the next character. This
     14  * means the buffer _must_ have at least three bytes of zero padding
     15  * following the end of the data stream.
     16  *
     17  * Errors are reported in E, which will be non-zero if the parsed
     18  * character was somehow invalid: invalid byte sequence, non-canonical
     19  * encoding, or a surrogate half.
     20  *
     21  * The function returns a pointer to the next character. When an error
     22  * occurs, this pointer will be a guess that depends on the particular
     23  * error, but it will always advance at least one byte.
     24  */
     25 static void *
     26 utf8_decode(void *buf, uint32_t *c, int *e)
     27 {
     28     static const char lengths[] = {
     29         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     30         0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0
     31     };
     32     static const int masks[]  = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
     33     static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
     34     static const int shiftc[] = {0, 18, 12, 6, 0};
     35     static const int shifte[] = {0, 6, 4, 2, 0};
     36 
     37     unsigned char *s = buf;
     38     int len = lengths[s[0] >> 3];
     39 
     40     /* Compute the pointer to the next character early so that the next
     41      * iteration can start working on the next character. Neither Clang
     42      * nor GCC figure out this reordering on their own.
     43      */
     44     unsigned char *next = s + len + !len;
     45 
     46     /* Assume a four-byte character and load four bytes. Unused bits are
     47      * shifted out.
     48      */
     49     *c  = (uint32_t)(s[0] & masks[len]) << 18;
     50     *c |= (uint32_t)(s[1] & 0x3f) << 12;
     51     *c |= (uint32_t)(s[2] & 0x3f) <<  6;
     52     *c |= (uint32_t)(s[3] & 0x3f) <<  0;
     53     *c >>= shiftc[len];
     54 
     55     /* Accumulate the various error conditions. */
     56     *e  = (*c < mins[len]) << 6; // non-canonical encoding
     57     *e |= ((*c >> 11) == 0x1b) << 7;  // surrogate half?
     58     *e |= (*c > 0x10FFFF) << 8;  // out of range?
     59     *e |= (s[1] & 0xc0) >> 2;
     60     *e |= (s[2] & 0xc0) >> 4;
     61     *e |= (s[3]       ) >> 6;
     62     *e ^= 0x2a; // top two bits of each tail byte correct?
     63     *e >>= shifte[len];
     64 
     65     return next;
     66 }
     67 
     68 #endif