src/ceph/src/common/utf8.c

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3 /*
   4  * Ceph - scalable distributed file system
   5  *
   6  * Copyright (C) 2011 New Dream Network
   7  *
   8  * This is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License version 2.1, as published by the Free Software
  11  * Foundation.  See file COPYING.
  12  *
  13  */
  14 #include "common/utf8.h"
  15
  16 #include <string.h>
  17
  18 static int high_bits_set(int c)
  19 {
  20         int ret = 0;
  21         while (1) {
  22                 if ((c & 0x80) != 0x080)
  23                         break;
  24                 c <<= 1;
  25                 ++ret;
  26         }
  27         return ret;
  28 }
  29
  30 /* Encode a 31-bit UTF8 code point to 'buf'.
  31  * Assumes buf is of size MAX_UTF8_SZ
  32  * Returns -1 on failure; number of bytes in the encoded value otherwise.
  33  */
  34 int encode_utf8(unsigned long u, unsigned char *buf)
  35 {
  36         int i;
  37         unsigned long max_val[MAX_UTF8_SZ] = {
  38                 0x0000007ful, 0x000007fful, 0x0000fffful,
  39                 0x001ffffful, 0x03fffffful, 0x7ffffffful
  40         };
  41         static const int MAX_VAL_SZ = sizeof(max_val) / sizeof(max_val[0]);
  42
  43         for (i = 0; i < MAX_VAL_SZ; ++i) {
  44                 if (u <= max_val[i])
  45                         break;
  46         }
  47         if (i == MAX_VAL_SZ) {
  48                 // This code point is too big to encode.
  49                 return -1;
  50         }
  51
  52         if (i == 0) {
  53                 buf[0] = u;
  54         }
  55         else {
  56                 signed int j;
  57                 for (j = i; j > 0; --j) {
  58                         buf[j] = 0x80 | (u & 0x3f);
  59                         u >>= 6;
  60                 }
  61
  62                 unsigned char mask = ~(0xFF >> (i + 1));
  63                 buf[0] = mask | u;
  64         }
  65
  66         return i + 1;
  67 }
  68
  69 /*
  70  * Decode a UTF8 character from an array of bytes. Return character code.
  71  * Upon error, return INVALID_UTF8_CHAR.
  72  */
  73 unsigned long decode_utf8(unsigned char *buf, int nbytes)
  74 {
  75         unsigned long code;
  76         int i, j;
  77
  78         if (nbytes <= 0)
  79                 return INVALID_UTF8_CHAR;
  80
  81         if (nbytes == 1) {
  82                 if (buf[0] >= 0x80)
  83                         return INVALID_UTF8_CHAR;
  84                 return buf[0];
  85         }
  86
  87         i = high_bits_set(buf[0]);
  88         if (i != nbytes)
  89                 return INVALID_UTF8_CHAR;
  90         code = buf[0] & (0xff >> i);
  91         for (j = 1; j < nbytes; ++j) {
  92                 if ((buf[j] & 0xc0) != 0x80)
  93                             return INVALID_UTF8_CHAR;
  94                 code = (code << 6) | (buf[j] & 0x3f);
  95         }
  96
  97         // Check for invalid code points
  98         if (code == 0xFFFE)
  99             return INVALID_UTF8_CHAR;
 100         if (code == 0xFFFF)
 101             return INVALID_UTF8_CHAR;
 102         if (code >= 0xD800 && code <= 0xDFFF)
 103             return INVALID_UTF8_CHAR;
 104
 105         return code;
 106 }
 107
 108 int check_utf8(const char *buf, int len)
 109 {
 110         unsigned char u[MAX_UTF8_SZ];
 111         int enc_len = 0;
 112         int i = 0;
 113         while (1) {
 114                 unsigned int c = buf[i];
 115                 if (i >= len || c < 0x80 || (c & 0xC0) != 0x80) {
 116                         // the start of a new character. Process what we have
 117                         // in the buffer.
 118                         if (enc_len > 0) {
 119                                 int re_encoded_len;
 120                                 unsigned char re_encoded[MAX_UTF8_SZ];
 121                                 unsigned long code = decode_utf8(u, enc_len);
 122                                 if (code == INVALID_UTF8_CHAR) {
 123                                         //printf("decoded to invalid utf8");
 124                                         return i + 1;
 125                                 }
 126                                 re_encoded_len = encode_utf8(code, re_encoded);
 127                                 if (enc_len != re_encoded_len) {
 128                                         //printf("originally encoded as %d bytes, "
 129                                         //      "but was re-encoded to %d!\n",
 130                                         //      enc_len, re_encoded_len);
 131                                         return i + 1;
 132                                 }
 133                                 if (memcmp(u, re_encoded, enc_len) != 0) {
 134                                         //printf("re-encoded to a different "
 135                                         //      "byte stream!");
 136                                         return i + 1;
 137                                 }
 138                                 //printf("code_point %lu\n", code);
 139                         }
 140                         enc_len = 0;
 141                         if (i >= len)
 142                                 break;
 143                         // start collecting again?
 144                         if (c >= 0x80)
 145                                 u[enc_len++] = c;
 146                 } else {
 147                         if (enc_len == MAX_UTF8_SZ) {
 148                                 //printf("too many enc_len in utf character!\n");
 149                                 return i + 1;
 150                         }
 151                         //printf("continuation byte...\n");
 152                         u[enc_len++] = c;
 153                 }
 154                 ++i;
 155         }
 156         return 0;
 157 }
 158
 159 int check_utf8_cstr(const char *buf)
 160 {
 161         return check_utf8(buf, strlen(buf));
 162 }
 163
 164 int is_control_character(int c)
 165 {
 166         return (((c != 0) && (c < 0x20)) || (c == 0x7f));
 167 }
 168
 169 int check_for_control_characters(const char *buf, int len)
 170 {
 171         int i;
 172         for (i = 0; i < len; ++i) {
 173                 if (is_control_character((int)(unsigned char)buf[i])) {
 174                         return i + 1;
 175                 }
 176         }
 177         return 0;
 178 }
 179
 180 int check_for_control_characters_cstr(const char *buf)
 181 {
 182         return check_for_control_characters(buf, strlen(buf));
 183 }