rubbos/app/httpd-2.0.64/srclib/apr/misc/win32/utf8.c

   1 /* Licensed to the Apache Software Foundation (ASF) under one or more
   2  * contributor license agreements.  See the NOTICE file distributed with
   3  * this work for additional information regarding copyright ownership.
   4  * The ASF licenses this file to You under the Apache License, Version 2.0
   5  * (the "License"); you may not use this file except in compliance with
   6  * the License.  You may obtain a copy of the License at
   7  *
   8  *     http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include "apr.h"
  18 #include "apr_private.h"
  19 #include "apr_errno.h"
  20 #include "apr_arch_utf8.h"
  21
   22 /* Implement the design principle specified by RFC 2718 2.2.5
  23  * Guidelines for new URL Schemes - within the APR.
  24  *
  25  * Since many architectures support unicode, and UCS2 is the most
   26  * efficient storage used by those architectures, these functions
   27  * exist to validate a UCS string.  It is up to the operating system
   28  * to determine the validity of the string in the context of its
  29  * native language support.  File systems that support filename
  30  * characters of 0x80-0xff but have no support of Unicode will find
  31  * this function useful only for validating the character sequences
  32  * and rejecting poorly encoded strings, if RFC 2718 2.2.5 naming is
  33  * desired.
  34  *
  35  * from RFC 2279 UTF-8, a transformation format of ISO 10646
  36  *
  37  *     UCS-4 range (hex.)    UTF-8 octet sequence (binary)
  38  * 1:2 0000 0000-0000 007F   0xxxxxxx
  39  * 2:2 0000 0080-0000 07FF   110XXXXx 10xxxxxx
  40  * 3:2 0000 0800-0000 FFFF   1110XXXX 10Xxxxxx 10xxxxxx
  41  * 4:4 0001 0000-001F FFFF   11110zXX 10XXxxxx 10xxxxxx 10xxxxxx
  42  * inv 0020 0000-03FF FFFF   111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx
  43  * inv 0400 0000-7FFF FFFF   1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  44  *
  45  * One of the X values must be one for the encoding length to be legit.
  46  * Neither the z bit, nor the final two forms, are used for ucs-2
  47  *
  48  *   "Pairs of UCS-2 values between D800 and DFFF (surrogate pairs in
  49  *   Unicode parlance), being actually UCS-4 characters transformed
  50  *   through UTF-16, need special treatment: the UTF-16 transformation
  51  *   must be undone, yielding a UCS-4 character that is then transformed
  52  *   as above."
  53  *
  54  * from RFC2781 UTF-16: the compressed ISO 10646 encoding bitmask
  55  *
  56  *  U' = U - 0x10000
  57  *  U' = 000000000000yyyyyyyyyyxxxxxxxxxx
  58  *                  W1 = 110110yyyyyyyyyy
  59  *                  W2 = 110111xxxxxxxxxx
  60  *
  61  * apr_conv_utf8_to_ucs2 out bytes:sizeof(in) * 1 <= Req <= sizeof(in) * 2
  62  *
  63  * apr_conv_ucs2_to_utf8 out words:sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2
  64  */
  65
  66 APR_DECLARE(apr_status_t) apr_conv_utf8_to_ucs2(const char *in,
  67                                                 apr_size_t *inbytes,
  68                                                 apr_wchar_t *out,
  69                                                 apr_size_t *outwords)
  70 {
  71     apr_int64_t newch, mask;
  72     apr_size_t expect, eating;
  73     int ch;
  74
  75     while (*inbytes && *outwords)
  76     {
  77         ch = (unsigned char)(*in++);
  78         if (!(ch & 0200)) {
  79             /* US-ASCII-7 plain text
  80              */
  81             --*inbytes;
  82             --*outwords;
  83             *(out++) = ch;
  84         }
  85         else
  86         {
  87             if ((ch & 0300) != 0300) {
  88                 /* Multibyte Continuation is out of place
  89                  */
  90                 return APR_EINVAL;
  91             }
  92             else
  93             {
  94                 /* Multibyte Sequence Lead Character
  95                  *
  96                  * Compute the expected bytes while adjusting
  97                  * or lead byte and leading zeros mask.
  98                  */
  99                 mask = 0340;
 100                 expect = 1;
 101                 while ((ch & mask) == mask) {
 102                     mask |= mask >> 1;
 103                     if (++expect > 3) /* (truly 5 for ucs-4) */
 104                         return APR_EINVAL;
 105                 }
 106                 newch = ch & ~mask;
 107                 eating = expect + 1;
 108                 if (*inbytes <= expect)
 109                     return APR_INCOMPLETE;
 110                 /* Reject values of excessive leading 0 bits
 111                  * utf-8 _demands_ the shortest possible byte length
 112                  */
 113                 if (expect == 1) {
 114                     if (!(newch & 0036))
 115                         return APR_EINVAL;
 116                 }
 117                 else {
 118                     /* Reject values of excessive leading 0 bits
 119                      */
 120                     if (!newch && !((unsigned char)*in & 0077 & (mask << 1)))
 121                         return APR_EINVAL;
 122                     if (expect == 2) {
 123                         /* Reject values D800-DFFF when not utf16 encoded
 124                          * (may not be an appropriate restriction for ucs-4)
 125                          */
 126                         if (newch == 0015 && ((unsigned char)*in & 0040))
 127                             return APR_EINVAL;
 128                     }
 129                     else if (expect == 3) {
 130                         /* Short circuit values > 110000
 131                          */
 132                         if (newch > 4)
 133                             return APR_EINVAL;
 134                         if (newch == 4 && ((unsigned char)*in & 0060))
 135                             return APR_EINVAL;
 136                     }
 137                 }
 138                 /* Where the boolean (expect > 2) is true, we will need
 139                  * an extra word for the output.
 140                  */
 141                 if (*outwords < (apr_size_t)(expect > 2) + 1)
 142                     break; /* buffer full */
 143                 while (expect--)
 144                 {
 145                     /* Multibyte Continuation must be legal */
 146                     if (((ch = (unsigned char)*(in++)) & 0300) != 0200)
 147                         return APR_EINVAL;
 148                     newch <<= 6;
 149                     newch |= (ch & 0077);
 150                 }
 151                 *inbytes -= eating;
 152                 /* newch is now a true ucs-4 character
 153                  *
 154                  * now we need to fold to ucs-2
 155                  */
 156                 if (newch < 0x10000)
 157                 {
 158                     --*outwords;
 159                     *(out++) = (apr_wchar_t) newch;
 160                 }
 161                 else
 162                 {
 163                     *outwords -= 2;
 164                     newch -= 0x10000;
 165                     *(out++) = (apr_wchar_t) (0xD800 | (newch >> 10));
 166                     *(out++) = (apr_wchar_t) (0xDC00 | (newch & 0x03FF));
 167                 }
 168             }
 169         }
 170     }
 171     /* Buffer full 'errors' aren't errors, the client must inspect both
 172      * the inbytes and outwords values
 173      */
 174     return APR_SUCCESS;
 175 }
 176
 177 APR_DECLARE(apr_status_t) apr_conv_ucs2_to_utf8(const apr_wchar_t *in,
 178                                                 apr_size_t *inwords,
 179                                                 char *out,
 180                                                 apr_size_t *outbytes)
 181 {
 182     apr_int64_t newch, require;
 183     apr_size_t need;
 184     char *invout;
 185     int ch;
 186
 187     while (*inwords && *outbytes)
 188     {
 189         ch = (unsigned short)(*in++);
 190         if (ch < 0x80)
 191         {
 192             --*inwords;
 193             --*outbytes;
 194             *(out++) = (unsigned char) ch;
 195         }
 196         else
 197         {
 198             if ((ch & 0xFC00) == 0xDC00) {
 199                 /* Invalid Leading ucs-2 Multiword Continuation Character
 200                  */
 201                 return APR_EINVAL;
 202             }
 203             if ((ch & 0xFC00) == 0xD800) {
 204                 /* Leading ucs-2 Multiword Character
 205                  */
 206                 if (*inwords < 2) {
 207                     /* Missing ucs-2 Multiword Continuation Character
 208                      */
 209                     return APR_INCOMPLETE;
 210                 }
 211                 if (((unsigned short)(*in) & 0xFC00) != 0xDC00) {
 212                     /* Invalid ucs-2 Multiword Continuation Character
 213                      */
 214                     return APR_EINVAL;
 215                 }
 216                 newch = (ch & 0x03FF) << 10 | ((unsigned short)(*in++) & 0x03FF);
 217                 newch += 0x10000;
 218             }
 219             else {
 220                 /* ucs-2 Single Word Character
 221                  */
 222                 newch = ch;
 223             }
 224             /* Determine the absolute minimum utf-8 bytes required
 225              */
 226             require = newch >> 11;
 227             need = 1;
 228             while (require)
 229                 require >>= 5, ++need;
 230             if (need >= *outbytes)
 231                 break; /* Insufficient buffer */
 232             *inwords -= (need > 2) + 1;
 233             *outbytes -= need + 1;
 234             /* Compute the utf-8 characters in last to first order,
 235              * calculating the lead character length bits along the way.
 236              */
 237             ch = 0200;
 238             out += need + 1;
 239             invout = out;
 240             while (need--) {
 241                 ch |= ch >> 1;
 242                 *(--invout) = (unsigned char)(0200 | (newch & 0077));
 243                 newch >>= 6;
 244             }
 245             /* Compute the lead utf-8 character and move the dest offset
 246              */
 247             *(--invout) = (unsigned char)(ch | newch);
 248         }
 249     }
 250     /* Buffer full 'errors' aren't errors, the client must inspect both
 251      * the inwords and outbytes values
 252      */
 253     return APR_SUCCESS;
 254 }