Fix some bugs when testing opensds ansible
[stor4nfv.git] / src / ceph / src / common / utf8.c
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4  * Ceph - scalable distributed file system
5  *
6  * Copyright (C) 2011 New Dream Network
7  *
8  * This is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License version 2.1, as published by the Free Software
11  * Foundation.  See file COPYING.
12  *
13  */
14 #include "common/utf8.h"
15
16 #include <string.h>
17
/*
 * Count the run of consecutive 1 bits in the low byte of 'c', starting at
 * bit 7 and walking down towards bit 0.
 *
 * Examples: 0x41 -> 0, 0x80 -> 1, 0xC3 -> 2, 0xE2 -> 3, 0xFF -> 8.
 */
static int high_bits_set(int c)
{
        int count = 0;
        int probe;

        /* 'probe' becomes 0 after bit 0, which ends the loop for 0xFF. */
        for (probe = 0x80; (c & probe) != 0; probe >>= 1)
                ++count;

        return count;
}
29
/*
 * Encode the code point 'u' (at most 31 bits) as UTF-8 into 'buf'.
 *
 * 'buf' is assumed to have room for MAX_UTF8_SZ bytes; up to six bytes are
 * written and no terminator is appended.
 *
 * Returns the number of bytes stored in 'buf', or -1 when 'u' is larger
 * than 0x7fffffff and therefore cannot be encoded.
 */
int encode_utf8(unsigned long u, unsigned char *buf)
{
        /*
         * max_val[n] is the largest code point representable in n + 1
         * bytes. The table is constant, so keep it out of the stack frame
         * instead of rebuilding it on every call.
         */
        static const unsigned long max_val[] = {
                0x0000007ful, 0x000007fful, 0x0000fffful,
                0x001ffffful, 0x03fffffful, 0x7ffffffful
        };
        enum { MAX_VAL_SZ = sizeof(max_val) / sizeof(max_val[0]) };
        unsigned char mask;
        int i, j;

        /* Find the shortest encoding able to hold 'u'. */
        for (i = 0; i < MAX_VAL_SZ; ++i) {
                if (u <= max_val[i])
                        break;
        }
        if (i == MAX_VAL_SZ) {
                /* This code point is too big to encode. */
                return -1;
        }

        if (i == 0) {
                /* Single byte: the value is stored unchanged. */
                buf[0] = (unsigned char)u;
                return 1;
        }

        /* Fill the 'i' continuation bytes from last to first, 6 bits each. */
        for (j = i; j > 0; --j) {
                buf[j] = (unsigned char)(0x80 | (u & 0x3f));
                u >>= 6;
        }

        /*
         * The lead byte carries i + 1 leading one bits (the sequence
         * length) followed by whatever payload bits are left in 'u'.
         */
        mask = (unsigned char)~(0xFF >> (i + 1));
        buf[0] = (unsigned char)(mask | u);

        return i + 1;
}
68
69 /*
70  * Decode a UTF8 character from an array of bytes. Return character code.
71  * Upon error, return INVALID_UTF8_CHAR.
72  */
73 unsigned long decode_utf8(unsigned char *buf, int nbytes)
74 {
75         unsigned long code;
76         int i, j;
77
78         if (nbytes <= 0)
79                 return INVALID_UTF8_CHAR;
80
81         if (nbytes == 1) {
82                 if (buf[0] >= 0x80)
83                         return INVALID_UTF8_CHAR;
84                 return buf[0];
85         }
86
87         i = high_bits_set(buf[0]);
88         if (i != nbytes)
89                 return INVALID_UTF8_CHAR;
90         code = buf[0] & (0xff >> i);
91         for (j = 1; j < nbytes; ++j) {
92                 if ((buf[j] & 0xc0) != 0x80)
93                             return INVALID_UTF8_CHAR;
94                 code = (code << 6) | (buf[j] & 0x3f);
95         }
96
97         // Check for invalid code points
98         if (code == 0xFFFE)
99             return INVALID_UTF8_CHAR;
100         if (code == 0xFFFF)
101             return INVALID_UTF8_CHAR;
102         if (code >= 0xD800 && code <= 0xDFFF)
103             return INVALID_UTF8_CHAR;
104
105         return code;
106 }
107
108 int check_utf8(const char *buf, int len)
109 {
110         unsigned char u[MAX_UTF8_SZ];
111         int enc_len = 0;
112         int i = 0;
113         while (1) {
114                 unsigned int c = buf[i];
115                 if (i >= len || c < 0x80 || (c & 0xC0) != 0x80) {
116                         // the start of a new character. Process what we have
117                         // in the buffer.
118                         if (enc_len > 0) {
119                                 int re_encoded_len;
120                                 unsigned char re_encoded[MAX_UTF8_SZ];
121                                 unsigned long code = decode_utf8(u, enc_len);
122                                 if (code == INVALID_UTF8_CHAR) {
123                                         //printf("decoded to invalid utf8");
124                                         return i + 1;
125                                 }
126                                 re_encoded_len = encode_utf8(code, re_encoded);
127                                 if (enc_len != re_encoded_len) {
128                                         //printf("originally encoded as %d bytes, "
129                                         //      "but was re-encoded to %d!\n",
130                                         //      enc_len, re_encoded_len);
131                                         return i + 1;
132                                 }
133                                 if (memcmp(u, re_encoded, enc_len) != 0) {
134                                         //printf("re-encoded to a different "
135                                         //      "byte stream!");
136                                         return i + 1;
137                                 }
138                                 //printf("code_point %lu\n", code);
139                         }
140                         enc_len = 0;
141                         if (i >= len)
142                                 break;
143                         // start collecting again?
144                         if (c >= 0x80)
145                                 u[enc_len++] = c;
146                 } else {
147                         if (enc_len == MAX_UTF8_SZ) {
148                                 //printf("too many enc_len in utf character!\n");
149                                 return i + 1;
150                         }
151                         //printf("continuation byte...\n");
152                         u[enc_len++] = c;
153                 }
154                 ++i;
155         }
156         return 0;
157 }
158
/*
 * Run check_utf8() over a NUL-terminated string; the terminator itself is
 * not part of the checked range. Returns whatever check_utf8() returns.
 */
int check_utf8_cstr(const char *buf)
{
        const int len = (int)strlen(buf);

        return check_utf8(buf, len);
}
163
/*
 * Return 1 when 'c' counts as a control character, 0 otherwise.
 *
 * That is the case for 0x7f and for every non-zero value below 0x20.
 * Zero is deliberately excluded. Note that negative values also satisfy
 * "below 0x20" and yield 1, so callers holding a plain char should convert
 * it to unsigned char first.
 */
int is_control_character(int c)
{
        if (c == 0x7f)
                return 1;
        if (c == 0)
                return 0;
        return c < 0x20;
}
168
/*
 * Scan the first 'len' bytes of 'buf' for a control character as defined
 * by is_control_character().
 *
 * Returns 0 when none is found (including when 'len' is not positive),
 * otherwise the 1-based position of the first one.
 */
int check_for_control_characters(const char *buf, int len)
{
        int pos = 0;

        while (pos < len) {
                /* Convert via unsigned char so bytes >= 0x80 stay positive. */
                const int byte = (int)(unsigned char)buf[pos];

                ++pos;
                if (is_control_character(byte))
                        return pos;
        }
        return 0;
}
179
/*
 * Run check_for_control_characters() over a NUL-terminated string and
 * return its result unchanged.
 */
int check_for_control_characters_cstr(const char *buf)
{
        const int len = (int)strlen(buf);

        return check_for_control_characters(buf, len);
}