1 /* Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
18 #include "apr_private.h"
19 #include "apr_errno.h"
20 #include "apr_arch_utf8.h"
22 /* Implement the design principle specified by RFC 2718 2.2.5
23 * Guidelines for new URL Schemes - within the APR.
25 * Since many architectures support unicode, and UCS2 is the most
26 * efficient storage used by those architectures, these functions
27 * exist to validate a UCS string. It is up to the operating system
28 * to determine the validity of the string in the context of its
29 * native language support. File systems that support filename
30 * characters of 0x80-0xff but have no support of Unicode will find
31 * this function useful only for validating the character sequences
32 * and rejecting poorly encoded strings, if RFC 2718 2.2.5 naming is
35 * from RFC 2279 UTF-8, a transformation format of ISO 10646
37 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
38 * 1:2 0000 0000-0000 007F 0xxxxxxx
39 * 2:2 0000 0080-0000 07FF 110XXXXx 10xxxxxx
40 * 3:2 0000 0800-0000 FFFF 1110XXXX 10Xxxxxx 10xxxxxx
41 * 4:4 0001 0000-001F FFFF 11110zXX 10XXxxxx 10xxxxxx 10xxxxxx
42 * inv 0020 0000-03FF FFFF 111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx
43 * inv 0400 0000-7FFF FFFF 1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
45 * One of the X values must be one for the encoding length to be legit.
46 * Neither the z bit, nor the final two forms, are used for ucs-2
48 * "Pairs of UCS-2 values between D800 and DFFF (surrogate pairs in
49 * Unicode parlance), being actually UCS-4 characters transformed
50 * through UTF-16, need special treatment: the UTF-16 transformation
51 * must be undone, yielding a UCS-4 character that is then transformed
54 * from RFC2781 UTF-16: the compressed ISO 10646 encoding bitmask
57 * U' = 000000000000yyyyyyyyyyxxxxxxxxxx
58 * W1 = 110110yyyyyyyyyy
59 * W2 = 110111xxxxxxxxxx
61 * apr_conv_utf8_to_ucs2 out bytes:sizeof(in) * 1 <= Req <= sizeof(in) * 2
63 * apr_conv_ucs2_to_utf8 out words:sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2
66 APR_DECLARE(apr_status_t) apr_conv_utf8_to_ucs2(const char *in,
71 apr_int64_t newch, mask;
72 apr_size_t expect, eating;
75 while (*inbytes && *outwords)
77 ch = (unsigned char)(*in++);
79 /* US-ASCII-7 plain text
87 if ((ch & 0300) != 0300) {
88 /* Multibyte Continuation is out of place
94 /* Multibyte Sequence Lead Character
96 * Compute the expected bytes while adjusting
97 * our lead byte and leading zeros mask.
101 while ((ch & mask) == mask) {
103 if (++expect > 3) /* (truly 5 for ucs-4) */
108 if (*inbytes <= expect)
109 return APR_INCOMPLETE;
110 /* Reject values of excessive leading 0 bits
111 * utf-8 _demands_ the shortest possible byte length
118 /* Reject values of excessive leading 0 bits
120 if (!newch && !((unsigned char)*in & 0077 & (mask << 1)))
123 /* Reject values D800-DFFF when not utf16 encoded
124 * (may not be an appropriate restriction for ucs-4)
126 if (newch == 0015 && ((unsigned char)*in & 0040))
129 else if (expect == 3) {
130 /* Short circuit values > 110000
134 if (newch == 4 && ((unsigned char)*in & 0060))
138 /* Where the boolean (expect > 2) is true, we will need
139 * an extra word for the output.
141 if (*outwords < (apr_size_t)(expect > 2) + 1)
142 break; /* buffer full */
145 /* Multibyte Continuation must be legal */
146 if (((ch = (unsigned char)*(in++)) & 0300) != 0200)
149 newch |= (ch & 0077);
152 /* newch is now a true ucs-4 character
154 * now we need to fold to ucs-2
159 *(out++) = (apr_wchar_t) newch;
165 *(out++) = (apr_wchar_t) (0xD800 | (newch >> 10));
166 *(out++) = (apr_wchar_t) (0xDC00 | (newch & 0x03FF));
171 /* Buffer full 'errors' aren't errors, the client must inspect both
172 * the inbytes and outwords values
177 APR_DECLARE(apr_status_t) apr_conv_ucs2_to_utf8(const apr_wchar_t *in,
180 apr_size_t *outbytes)
182 apr_int64_t newch, require;
187 while (*inwords && *outbytes)
189 ch = (unsigned short)(*in++);
194 *(out++) = (unsigned char) ch;
198 if ((ch & 0xFC00) == 0xDC00) {
199 /* Invalid Leading ucs-2 Multiword Continuation Character
203 if ((ch & 0xFC00) == 0xD800) {
204 /* Leading ucs-2 Multiword Character
207 /* Missing ucs-2 Multiword Continuation Character
209 return APR_INCOMPLETE;
211 if (((unsigned short)(*in) & 0xFC00) != 0xDC00) {
212 /* Invalid ucs-2 Multiword Continuation Character
216 newch = (ch & 0x03FF) << 10 | ((unsigned short)(*in++) & 0x03FF);
220 /* ucs-2 Single Word Character
224 /* Determine the absolute minimum utf-8 bytes required
226 require = newch >> 11;
229 require >>= 5, ++need;
230 if (need >= *outbytes)
231 break; /* Insufficient buffer */
232 *inwords -= (need > 2) + 1;
233 *outbytes -= need + 1;
234 /* Compute the utf-8 characters in last to first order,
235 * calculating the lead character length bits along the way.
242 *(--invout) = (unsigned char)(0200 | (newch & 0077));
245 /* Compute the lead utf-8 character and move the dest offset
247 *(--invout) = (unsigned char)(ch | newch);
250 /* Buffer full 'errors' aren't errors, the client must inspect both
251 * the inwords and outbytes values