src/dma/vendor/golang.org/x/text/internal/language/parse.go

   1 // Copyright 2013 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package language
   6
   7 import (
   8         "bytes"
   9         "errors"
  10         "fmt"
  11         "sort"
  12
  13         "golang.org/x/text/internal/tag"
  14 )
  15
  16 // isAlpha returns true if the byte is not a digit.
  17 // b must be an ASCII letter or digit.
  18 func isAlpha(b byte) bool {
  19         return b > '9'
  20 }
  21
  22 // isAlphaNum returns true if the string contains only ASCII letters or digits.
  23 func isAlphaNum(s []byte) bool {
  24         for _, c := range s {
  25                 if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') {
  26                         return false
  27                 }
  28         }
  29         return true
  30 }
  31
  32 // ErrSyntax is returned by any of the parsing functions when the
  33 // input is not well-formed, according to BCP 47.
  34 // TODO: return the position at which the syntax error occurred?
  35 var ErrSyntax = errors.New("language: tag is not well-formed")
  36
  37 // ErrDuplicateKey is returned when a tag contains the same key twice with
  38 // different values in the -u section.
  39 var ErrDuplicateKey = errors.New("language: different values for same key in -u extension")
  40
  41 // ValueError is returned by any of the parsing functions when the
  42 // input is well-formed but the respective subtag is not recognized
  43 // as a valid value.
  44 type ValueError struct {
  45         v [8]byte
  46 }
  47
  48 // NewValueError creates a new ValueError.
  49 func NewValueError(tag []byte) ValueError {
  50         var e ValueError
  51         copy(e.v[:], tag)
  52         return e
  53 }
  54
  55 func (e ValueError) tag() []byte {
  56         n := bytes.IndexByte(e.v[:], 0)
  57         if n == -1 {
  58                 n = 8
  59         }
  60         return e.v[:n]
  61 }
  62
  63 // Error implements the error interface.
  64 func (e ValueError) Error() string {
  65         return fmt.Sprintf("language: subtag %q is well-formed but unknown", e.tag())
  66 }
  67
  68 // Subtag returns the subtag for which the error occurred.
  69 func (e ValueError) Subtag() string {
  70         return string(e.tag())
  71 }
  72
  73 // scanner is used to scan BCP 47 tokens, which are separated by _ or -.
  74 type scanner struct {
  75         b     []byte
  76         bytes [max99thPercentileSize]byte
  77         token []byte
  78         start int // start position of the current token
  79         end   int // end position of the current token
  80         next  int // next point for scan
  81         err   error
  82         done  bool
  83 }
  84
  85 func makeScannerString(s string) scanner {
  86         scan := scanner{}
  87         if len(s) <= len(scan.bytes) {
  88                 scan.b = scan.bytes[:copy(scan.bytes[:], s)]
  89         } else {
  90                 scan.b = []byte(s)
  91         }
  92         scan.init()
  93         return scan
  94 }
  95
  96 // makeScanner returns a scanner using b as the input buffer.
  97 // b is not copied and may be modified by the scanner routines.
  98 func makeScanner(b []byte) scanner {
  99         scan := scanner{b: b}
 100         scan.init()
 101         return scan
 102 }
 103
 104 func (s *scanner) init() {
 105         for i, c := range s.b {
 106                 if c == '_' {
 107                         s.b[i] = '-'
 108                 }
 109         }
 110         s.scan()
 111 }
 112
 113 // restToLower converts the string between start and end to lower case.
 114 func (s *scanner) toLower(start, end int) {
 115         for i := start; i < end; i++ {
 116                 c := s.b[i]
 117                 if 'A' <= c && c <= 'Z' {
 118                         s.b[i] += 'a' - 'A'
 119                 }
 120         }
 121 }
 122
 123 func (s *scanner) setError(e error) {
 124         if s.err == nil || (e == ErrSyntax && s.err != ErrSyntax) {
 125                 s.err = e
 126         }
 127 }
 128
 129 // resizeRange shrinks or grows the array at position oldStart such that
 130 // a new string of size newSize can fit between oldStart and oldEnd.
 131 // Sets the scan point to after the resized range.
 132 func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) {
 133         s.start = oldStart
 134         if end := oldStart + newSize; end != oldEnd {
 135                 diff := end - oldEnd
 136                 if end < cap(s.b) {
 137                         b := make([]byte, len(s.b)+diff)
 138                         copy(b, s.b[:oldStart])
 139                         copy(b[end:], s.b[oldEnd:])
 140                         s.b = b
 141                 } else {
 142                         s.b = append(s.b[end:], s.b[oldEnd:]...)
 143                 }
 144                 s.next = end + (s.next - s.end)
 145                 s.end = end
 146         }
 147 }
 148
 149 // replace replaces the current token with repl.
 150 func (s *scanner) replace(repl string) {
 151         s.resizeRange(s.start, s.end, len(repl))
 152         copy(s.b[s.start:], repl)
 153 }
 154
 155 // gobble removes the current token from the input.
 156 // Caller must call scan after calling gobble.
 157 func (s *scanner) gobble(e error) {
 158         s.setError(e)
 159         if s.start == 0 {
 160                 s.b = s.b[:+copy(s.b, s.b[s.next:])]
 161                 s.end = 0
 162         } else {
 163                 s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])]
 164                 s.end = s.start - 1
 165         }
 166         s.next = s.start
 167 }
 168
 169 // deleteRange removes the given range from s.b before the current token.
 170 func (s *scanner) deleteRange(start, end int) {
 171         s.b = s.b[:start+copy(s.b[start:], s.b[end:])]
 172         diff := end - start
 173         s.next -= diff
 174         s.start -= diff
 175         s.end -= diff
 176 }
 177
 178 // scan parses the next token of a BCP 47 string.  Tokens that are larger
 179 // than 8 characters or include non-alphanumeric characters result in an error
 180 // and are gobbled and removed from the output.
 181 // It returns the end position of the last token consumed.
 182 func (s *scanner) scan() (end int) {
 183         end = s.end
 184         s.token = nil
 185         for s.start = s.next; s.next < len(s.b); {
 186                 i := bytes.IndexByte(s.b[s.next:], '-')
 187                 if i == -1 {
 188                         s.end = len(s.b)
 189                         s.next = len(s.b)
 190                         i = s.end - s.start
 191                 } else {
 192                         s.end = s.next + i
 193                         s.next = s.end + 1
 194                 }
 195                 token := s.b[s.start:s.end]
 196                 if i < 1 || i > 8 || !isAlphaNum(token) {
 197                         s.gobble(ErrSyntax)
 198                         continue
 199                 }
 200                 s.token = token
 201                 return end
 202         }
 203         if n := len(s.b); n > 0 && s.b[n-1] == '-' {
 204                 s.setError(ErrSyntax)
 205                 s.b = s.b[:len(s.b)-1]
 206         }
 207         s.done = true
 208         return end
 209 }
 210
 211 // acceptMinSize parses multiple tokens of the given size or greater.
 212 // It returns the end position of the last token consumed.
 213 func (s *scanner) acceptMinSize(min int) (end int) {
 214         end = s.end
 215         s.scan()
 216         for ; len(s.token) >= min; s.scan() {
 217                 end = s.end
 218         }
 219         return end
 220 }
 221
 222 // Parse parses the given BCP 47 string and returns a valid Tag. If parsing
 223 // failed it returns an error and any part of the tag that could be parsed.
 224 // If parsing succeeded but an unknown value was found, it returns
 225 // ValueError. The Tag returned in this case is just stripped of the unknown
 226 // value. All other values are preserved. It accepts tags in the BCP 47 format
 227 // and extensions to this standard defined in
 228 // https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
 229 func Parse(s string) (t Tag, err error) {
 230         // TODO: consider supporting old-style locale key-value pairs.
 231         if s == "" {
 232                 return Und, ErrSyntax
 233         }
 234         if len(s) <= maxAltTaglen {
 235                 b := [maxAltTaglen]byte{}
 236                 for i, c := range s {
 237                         // Generating invalid UTF-8 is okay as it won't match.
 238                         if 'A' <= c && c <= 'Z' {
 239                                 c += 'a' - 'A'
 240                         } else if c == '_' {
 241                                 c = '-'
 242                         }
 243                         b[i] = byte(c)
 244                 }
 245                 if t, ok := grandfathered(b); ok {
 246                         return t, nil
 247                 }
 248         }
 249         scan := makeScannerString(s)
 250         return parse(&scan, s)
 251 }
 252
 253 func parse(scan *scanner, s string) (t Tag, err error) {
 254         t = Und
 255         var end int
 256         if n := len(scan.token); n <= 1 {
 257                 scan.toLower(0, len(scan.b))
 258                 if n == 0 || scan.token[0] != 'x' {
 259                         return t, ErrSyntax
 260                 }
 261                 end = parseExtensions(scan)
 262         } else if n >= 4 {
 263                 return Und, ErrSyntax
 264         } else { // the usual case
 265                 t, end = parseTag(scan)
 266                 if n := len(scan.token); n == 1 {
 267                         t.pExt = uint16(end)
 268                         end = parseExtensions(scan)
 269                 } else if end < len(scan.b) {
 270                         scan.setError(ErrSyntax)
 271                         scan.b = scan.b[:end]
 272                 }
 273         }
 274         if int(t.pVariant) < len(scan.b) {
 275                 if end < len(s) {
 276                         s = s[:end]
 277                 }
 278                 if len(s) > 0 && tag.Compare(s, scan.b) == 0 {
 279                         t.str = s
 280                 } else {
 281                         t.str = string(scan.b)
 282                 }
 283         } else {
 284                 t.pVariant, t.pExt = 0, 0
 285         }
 286         return t, scan.err
 287 }
 288
 289 // parseTag parses language, script, region and variants.
 290 // It returns a Tag and the end position in the input that was parsed.
 291 func parseTag(scan *scanner) (t Tag, end int) {
 292         var e error
 293         // TODO: set an error if an unknown lang, script or region is encountered.
 294         t.LangID, e = getLangID(scan.token)
 295         scan.setError(e)
 296         scan.replace(t.LangID.String())
 297         langStart := scan.start
 298         end = scan.scan()
 299         for len(scan.token) == 3 && isAlpha(scan.token[0]) {
 300                 // From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent
 301                 // to a tag of the form <extlang>.
 302                 lang, e := getLangID(scan.token)
 303                 if lang != 0 {
 304                         t.LangID = lang
 305                         copy(scan.b[langStart:], lang.String())
 306                         scan.b[langStart+3] = '-'
 307                         scan.start = langStart + 4
 308                 }
 309                 scan.gobble(e)
 310                 end = scan.scan()
 311         }
 312         if len(scan.token) == 4 && isAlpha(scan.token[0]) {
 313                 t.ScriptID, e = getScriptID(script, scan.token)
 314                 if t.ScriptID == 0 {
 315                         scan.gobble(e)
 316                 }
 317                 end = scan.scan()
 318         }
 319         if n := len(scan.token); n >= 2 && n <= 3 {
 320                 t.RegionID, e = getRegionID(scan.token)
 321                 if t.RegionID == 0 {
 322                         scan.gobble(e)
 323                 } else {
 324                         scan.replace(t.RegionID.String())
 325                 }
 326                 end = scan.scan()
 327         }
 328         scan.toLower(scan.start, len(scan.b))
 329         t.pVariant = byte(end)
 330         end = parseVariants(scan, end, t)
 331         t.pExt = uint16(end)
 332         return t, end
 333 }
 334
 335 var separator = []byte{'-'}
 336
 337 // parseVariants scans tokens as long as each token is a valid variant string.
 338 // Duplicate variants are removed.
 339 func parseVariants(scan *scanner, end int, t Tag) int {
 340         start := scan.start
 341         varIDBuf := [4]uint8{}
 342         variantBuf := [4][]byte{}
 343         varID := varIDBuf[:0]
 344         variant := variantBuf[:0]
 345         last := -1
 346         needSort := false
 347         for ; len(scan.token) >= 4; scan.scan() {
 348                 // TODO: measure the impact of needing this conversion and redesign
 349                 // the data structure if there is an issue.
 350                 v, ok := variantIndex[string(scan.token)]
 351                 if !ok {
 352                         // unknown variant
 353                         // TODO: allow user-defined variants?
 354                         scan.gobble(NewValueError(scan.token))
 355                         continue
 356                 }
 357                 varID = append(varID, v)
 358                 variant = append(variant, scan.token)
 359                 if !needSort {
 360                         if last < int(v) {
 361                                 last = int(v)
 362                         } else {
 363                                 needSort = true
 364                                 // There is no legal combinations of more than 7 variants
 365                                 // (and this is by no means a useful sequence).
 366                                 const maxVariants = 8
 367                                 if len(varID) > maxVariants {
 368                                         break
 369                                 }
 370                         }
 371                 }
 372                 end = scan.end
 373         }
 374         if needSort {
 375                 sort.Sort(variantsSort{varID, variant})
 376                 k, l := 0, -1
 377                 for i, v := range varID {
 378                         w := int(v)
 379                         if l == w {
 380                                 // Remove duplicates.
 381                                 continue
 382                         }
 383                         varID[k] = varID[i]
 384                         variant[k] = variant[i]
 385                         k++
 386                         l = w
 387                 }
 388                 if str := bytes.Join(variant[:k], separator); len(str) == 0 {
 389                         end = start - 1
 390                 } else {
 391                         scan.resizeRange(start, end, len(str))
 392                         copy(scan.b[scan.start:], str)
 393                         end = scan.end
 394                 }
 395         }
 396         return end
 397 }
 398
 399 type variantsSort struct {
 400         i []uint8
 401         v [][]byte
 402 }
 403
 404 func (s variantsSort) Len() int {
 405         return len(s.i)
 406 }
 407
 408 func (s variantsSort) Swap(i, j int) {
 409         s.i[i], s.i[j] = s.i[j], s.i[i]
 410         s.v[i], s.v[j] = s.v[j], s.v[i]
 411 }
 412
 413 func (s variantsSort) Less(i, j int) bool {
 414         return s.i[i] < s.i[j]
 415 }
 416
 417 type bytesSort struct {
 418         b [][]byte
 419         n int // first n bytes to compare
 420 }
 421
 422 func (b bytesSort) Len() int {
 423         return len(b.b)
 424 }
 425
 426 func (b bytesSort) Swap(i, j int) {
 427         b.b[i], b.b[j] = b.b[j], b.b[i]
 428 }
 429
 430 func (b bytesSort) Less(i, j int) bool {
 431         for k := 0; k < b.n; k++ {
 432                 if b.b[i][k] == b.b[j][k] {
 433                         continue
 434                 }
 435                 return b.b[i][k] < b.b[j][k]
 436         }
 437         return false
 438 }
 439
 440 // parseExtensions parses and normalizes the extensions in the buffer.
 441 // It returns the last position of scan.b that is part of any extension.
 442 // It also trims scan.b to remove excess parts accordingly.
 443 func parseExtensions(scan *scanner) int {
 444         start := scan.start
 445         exts := [][]byte{}
 446         private := []byte{}
 447         end := scan.end
 448         for len(scan.token) == 1 {
 449                 extStart := scan.start
 450                 ext := scan.token[0]
 451                 end = parseExtension(scan)
 452                 extension := scan.b[extStart:end]
 453                 if len(extension) < 3 || (ext != 'x' && len(extension) < 4) {
 454                         scan.setError(ErrSyntax)
 455                         end = extStart
 456                         continue
 457                 } else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) {
 458                         scan.b = scan.b[:end]
 459                         return end
 460                 } else if ext == 'x' {
 461                         private = extension
 462                         break
 463                 }
 464                 exts = append(exts, extension)
 465         }
 466         sort.Sort(bytesSort{exts, 1})
 467         if len(private) > 0 {
 468                 exts = append(exts, private)
 469         }
 470         scan.b = scan.b[:start]
 471         if len(exts) > 0 {
 472                 scan.b = append(scan.b, bytes.Join(exts, separator)...)
 473         } else if start > 0 {
 474                 // Strip trailing '-'.
 475                 scan.b = scan.b[:start-1]
 476         }
 477         return end
 478 }
 479
 480 // parseExtension parses a single extension and returns the position of
 481 // the extension end.
 482 func parseExtension(scan *scanner) int {
 483         start, end := scan.start, scan.end
 484         switch scan.token[0] {
 485         case 'u':
 486                 attrStart := end
 487                 scan.scan()
 488                 for last := []byte{}; len(scan.token) > 2; scan.scan() {
 489                         if bytes.Compare(scan.token, last) != -1 {
 490                                 // Attributes are unsorted. Start over from scratch.
 491                                 p := attrStart + 1
 492                                 scan.next = p
 493                                 attrs := [][]byte{}
 494                                 for scan.scan(); len(scan.token) > 2; scan.scan() {
 495                                         attrs = append(attrs, scan.token)
 496                                         end = scan.end
 497                                 }
 498                                 sort.Sort(bytesSort{attrs, 3})
 499                                 copy(scan.b[p:], bytes.Join(attrs, separator))
 500                                 break
 501                         }
 502                         last = scan.token
 503                         end = scan.end
 504                 }
 505                 var last, key []byte
 506                 for attrEnd := end; len(scan.token) == 2; last = key {
 507                         key = scan.token
 508                         keyEnd := scan.end
 509                         end = scan.acceptMinSize(3)
 510                         // TODO: check key value validity
 511                         if keyEnd == end || bytes.Compare(key, last) != 1 {
 512                                 // We have an invalid key or the keys are not sorted.
 513                                 // Start scanning keys from scratch and reorder.
 514                                 p := attrEnd + 1
 515                                 scan.next = p
 516                                 keys := [][]byte{}
 517                                 for scan.scan(); len(scan.token) == 2; {
 518                                         keyStart, keyEnd := scan.start, scan.end
 519                                         end = scan.acceptMinSize(3)
 520                                         if keyEnd != end {
 521                                                 keys = append(keys, scan.b[keyStart:end])
 522                                         } else {
 523                                                 scan.setError(ErrSyntax)
 524                                                 end = keyStart
 525                                         }
 526                                 }
 527                                 sort.Stable(bytesSort{keys, 2})
 528                                 if n := len(keys); n > 0 {
 529                                         k := 0
 530                                         for i := 1; i < n; i++ {
 531                                                 if !bytes.Equal(keys[k][:2], keys[i][:2]) {
 532                                                         k++
 533                                                         keys[k] = keys[i]
 534                                                 } else if !bytes.Equal(keys[k], keys[i]) {
 535                                                         scan.setError(ErrDuplicateKey)
 536                                                 }
 537                                         }
 538                                         keys = keys[:k+1]
 539                                 }
 540                                 reordered := bytes.Join(keys, separator)
 541                                 if e := p + len(reordered); e < end {
 542                                         scan.deleteRange(e, end)
 543                                         end = e
 544                                 }
 545                                 copy(scan.b[p:], reordered)
 546                                 break
 547                         }
 548                 }
 549         case 't':
 550                 scan.scan()
 551                 if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
 552                         _, end = parseTag(scan)
 553                         scan.toLower(start, end)
 554                 }
 555                 for len(scan.token) == 2 && !isAlpha(scan.token[1]) {
 556                         end = scan.acceptMinSize(3)
 557                 }
 558         case 'x':
 559                 end = scan.acceptMinSize(1)
 560         default:
 561                 end = scan.acceptMinSize(2)
 562         }
 563         return end
 564 }
 565
 566 // getExtension returns the name, body and end position of the extension.
 567 func getExtension(s string, p int) (end int, ext string) {
 568         if s[p] == '-' {
 569                 p++
 570         }
 571         if s[p] == 'x' {
 572                 return len(s), s[p:]
 573         }
 574         end = nextExtension(s, p)
 575         return end, s[p:end]
 576 }
 577
 578 // nextExtension finds the next extension within the string, searching
 579 // for the -<char>- pattern from position p.
 580 // In the fast majority of cases, language tags will have at most
 581 // one extension and extensions tend to be small.
 582 func nextExtension(s string, p int) int {
 583         for n := len(s) - 3; p < n; {
 584                 if s[p] == '-' {
 585                         if s[p+2] == '-' {
 586                                 return p
 587                         }
 588                         p += 3
 589                 } else {
 590                         p++
 591                 }
 592         }
 593         return len(s)
 594 }