1 // Copyright 2013 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
13 "golang.org/x/text/internal/tag"
16 // isAlpha returns true if the byte is not a digit.
17 // b must be an ASCII letter or digit.
18 func isAlpha(b byte) bool {
22 // isAlphaNum returns true if the string contains only ASCII letters or digits.
23 func isAlphaNum(s []byte) bool {
25 if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') {
32 // ErrSyntax is returned by any of the parsing functions when the
33 // input is not well-formed, according to BCP 47.
34 // TODO: return the position at which the syntax error occurred?
35 var ErrSyntax = errors.New("language: tag is not well-formed")
37 // ErrDuplicateKey is returned when a tag contains the same key twice with
38 // different values in the -u section.
39 var ErrDuplicateKey = errors.New("language: different values for same key in -u extension")
41 // ValueError is returned by any of the parsing functions when the
42 // input is well-formed but the respective subtag is not recognized
44 type ValueError struct {
48 // NewValueError creates a new ValueError.
49 func NewValueError(tag []byte) ValueError {
55 func (e ValueError) tag() []byte {
56 n := bytes.IndexByte(e.v[:], 0)
63 // Error implements the error interface.
64 func (e ValueError) Error() string {
65 return fmt.Sprintf("language: subtag %q is well-formed but unknown", e.tag())
68 // Subtag returns the subtag for which the error occurred.
69 func (e ValueError) Subtag() string {
70 return string(e.tag())
73 // scanner is used to scan BCP 47 tokens, which are separated by _ or -.
76 bytes [max99thPercentileSize]byte
78 start int // start position of the current token
79 end int // end position of the current token
80 next int // next point for scan
85 func makeScannerString(s string) scanner {
87 if len(s) <= len(scan.bytes) {
88 scan.b = scan.bytes[:copy(scan.bytes[:], s)]
96 // makeScanner returns a scanner using b as the input buffer.
97 // b is not copied and may be modified by the scanner routines.
98 func makeScanner(b []byte) scanner {
104 func (s *scanner) init() {
105 for i, c := range s.b {
113 // toLower converts the string between start and end to lower case.
114 func (s *scanner) toLower(start, end int) {
115 for i := start; i < end; i++ {
117 if 'A' <= c && c <= 'Z' {
123 func (s *scanner) setError(e error) {
124 if s.err == nil || (e == ErrSyntax && s.err != ErrSyntax) {
129 // resizeRange shrinks or grows the array at position oldStart such that
130 // a new string of size newSize can fit between oldStart and oldEnd.
131 // Sets the scan point to after the resized range.
132 func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) {
134 if end := oldStart + newSize; end != oldEnd {
137 b := make([]byte, len(s.b)+diff)
138 copy(b, s.b[:oldStart])
139 copy(b[end:], s.b[oldEnd:])
142 s.b = append(s.b[end:], s.b[oldEnd:]...)
144 s.next = end + (s.next - s.end)
149 // replace replaces the current token with repl.
150 func (s *scanner) replace(repl string) {
151 s.resizeRange(s.start, s.end, len(repl))
152 copy(s.b[s.start:], repl)
155 // gobble removes the current token from the input.
156 // Caller must call scan after calling gobble.
157 func (s *scanner) gobble(e error) {
160 s.b = s.b[:+copy(s.b, s.b[s.next:])]
163 s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])]
169 // deleteRange removes the given range from s.b before the current token.
170 func (s *scanner) deleteRange(start, end int) {
171 s.b = s.b[:start+copy(s.b[start:], s.b[end:])]
178 // scan parses the next token of a BCP 47 string. Tokens that are larger
179 // than 8 characters or include non-alphanumeric characters result in an error
180 // and are gobbled and removed from the output.
181 // It returns the end position of the last token consumed.
182 func (s *scanner) scan() (end int) {
185 for s.start = s.next; s.next < len(s.b); {
186 i := bytes.IndexByte(s.b[s.next:], '-')
195 token := s.b[s.start:s.end]
196 if i < 1 || i > 8 || !isAlphaNum(token) {
203 if n := len(s.b); n > 0 && s.b[n-1] == '-' {
204 s.setError(ErrSyntax)
205 s.b = s.b[:len(s.b)-1]
211 // acceptMinSize parses multiple tokens of the given size or greater.
212 // It returns the end position of the last token consumed.
213 func (s *scanner) acceptMinSize(min int) (end int) {
216 for ; len(s.token) >= min; s.scan() {
222 // Parse parses the given BCP 47 string and returns a valid Tag. If parsing
223 // failed it returns an error and any part of the tag that could be parsed.
224 // If parsing succeeded but an unknown value was found, it returns
225 // ValueError. The Tag returned in this case is just stripped of the unknown
226 // value. All other values are preserved. It accepts tags in the BCP 47 format
227 // and extensions to this standard defined in
228 // https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
229 func Parse(s string) (t Tag, err error) {
230 // TODO: consider supporting old-style locale key-value pairs.
232 return Und, ErrSyntax
234 if len(s) <= maxAltTaglen {
235 b := [maxAltTaglen]byte{}
236 for i, c := range s {
237 // Generating invalid UTF-8 is okay as it won't match.
238 if 'A' <= c && c <= 'Z' {
245 if t, ok := grandfathered(b); ok {
249 scan := makeScannerString(s)
250 return parse(&scan, s)
253 func parse(scan *scanner, s string) (t Tag, err error) {
256 if n := len(scan.token); n <= 1 {
257 scan.toLower(0, len(scan.b))
258 if n == 0 || scan.token[0] != 'x' {
261 end = parseExtensions(scan)
263 return Und, ErrSyntax
264 } else { // the usual case
265 t, end = parseTag(scan)
266 if n := len(scan.token); n == 1 {
268 end = parseExtensions(scan)
269 } else if end < len(scan.b) {
270 scan.setError(ErrSyntax)
271 scan.b = scan.b[:end]
274 if int(t.pVariant) < len(scan.b) {
278 if len(s) > 0 && tag.Compare(s, scan.b) == 0 {
281 t.str = string(scan.b)
284 t.pVariant, t.pExt = 0, 0
289 // parseTag parses language, script, region and variants.
290 // It returns a Tag and the end position in the input that was parsed.
291 func parseTag(scan *scanner) (t Tag, end int) {
293 // TODO: set an error if an unknown lang, script or region is encountered.
294 t.LangID, e = getLangID(scan.token)
296 scan.replace(t.LangID.String())
297 langStart := scan.start
299 for len(scan.token) == 3 && isAlpha(scan.token[0]) {
300 // From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent
301 // to a tag of the form <extlang>.
302 lang, e := getLangID(scan.token)
305 copy(scan.b[langStart:], lang.String())
306 scan.b[langStart+3] = '-'
307 scan.start = langStart + 4
312 if len(scan.token) == 4 && isAlpha(scan.token[0]) {
313 t.ScriptID, e = getScriptID(script, scan.token)
319 if n := len(scan.token); n >= 2 && n <= 3 {
320 t.RegionID, e = getRegionID(scan.token)
324 scan.replace(t.RegionID.String())
328 scan.toLower(scan.start, len(scan.b))
329 t.pVariant = byte(end)
330 end = parseVariants(scan, end, t)
335 var separator = []byte{'-'}
337 // parseVariants scans tokens as long as each token is a valid variant string.
338 // Duplicate variants are removed.
339 func parseVariants(scan *scanner, end int, t Tag) int {
341 varIDBuf := [4]uint8{}
342 variantBuf := [4][]byte{}
343 varID := varIDBuf[:0]
344 variant := variantBuf[:0]
347 for ; len(scan.token) >= 4; scan.scan() {
348 // TODO: measure the impact of needing this conversion and redesign
349 // the data structure if there is an issue.
350 v, ok := variantIndex[string(scan.token)]
353 // TODO: allow user-defined variants?
354 scan.gobble(NewValueError(scan.token))
357 varID = append(varID, v)
358 variant = append(variant, scan.token)
364 // There are no legal combinations of more than 7 variants
365 // (and this is by no means a useful sequence).
366 const maxVariants = 8
367 if len(varID) > maxVariants {
375 sort.Sort(variantsSort{varID, variant})
377 for i, v := range varID {
380 // Remove duplicates.
384 variant[k] = variant[i]
388 if str := bytes.Join(variant[:k], separator); len(str) == 0 {
391 scan.resizeRange(start, end, len(str))
392 copy(scan.b[scan.start:], str)
399 type variantsSort struct {
404 func (s variantsSort) Len() int {
408 func (s variantsSort) Swap(i, j int) {
409 s.i[i], s.i[j] = s.i[j], s.i[i]
410 s.v[i], s.v[j] = s.v[j], s.v[i]
413 func (s variantsSort) Less(i, j int) bool {
414 return s.i[i] < s.i[j]
417 type bytesSort struct {
419 n int // first n bytes to compare
422 func (b bytesSort) Len() int {
426 func (b bytesSort) Swap(i, j int) {
427 b.b[i], b.b[j] = b.b[j], b.b[i]
430 func (b bytesSort) Less(i, j int) bool {
431 for k := 0; k < b.n; k++ {
432 if b.b[i][k] == b.b[j][k] {
435 return b.b[i][k] < b.b[j][k]
440 // parseExtensions parses and normalizes the extensions in the buffer.
441 // It returns the last position of scan.b that is part of any extension.
442 // It also trims scan.b to remove excess parts accordingly.
443 func parseExtensions(scan *scanner) int {
448 for len(scan.token) == 1 {
449 extStart := scan.start
451 end = parseExtension(scan)
452 extension := scan.b[extStart:end]
453 if len(extension) < 3 || (ext != 'x' && len(extension) < 4) {
454 scan.setError(ErrSyntax)
457 } else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) {
458 scan.b = scan.b[:end]
460 } else if ext == 'x' {
464 exts = append(exts, extension)
466 sort.Sort(bytesSort{exts, 1})
467 if len(private) > 0 {
468 exts = append(exts, private)
470 scan.b = scan.b[:start]
472 scan.b = append(scan.b, bytes.Join(exts, separator)...)
473 } else if start > 0 {
474 // Strip trailing '-'.
475 scan.b = scan.b[:start-1]
480 // parseExtension parses a single extension and returns the position of
481 // the extension end.
482 func parseExtension(scan *scanner) int {
483 start, end := scan.start, scan.end
484 switch scan.token[0] {
488 for last := []byte{}; len(scan.token) > 2; scan.scan() {
489 if bytes.Compare(scan.token, last) != -1 {
490 // Attributes are unsorted. Start over from scratch.
494 for scan.scan(); len(scan.token) > 2; scan.scan() {
495 attrs = append(attrs, scan.token)
498 sort.Sort(bytesSort{attrs, 3})
499 copy(scan.b[p:], bytes.Join(attrs, separator))
506 for attrEnd := end; len(scan.token) == 2; last = key {
509 end = scan.acceptMinSize(3)
510 // TODO: check key value validity
511 if keyEnd == end || bytes.Compare(key, last) != 1 {
512 // We have an invalid key or the keys are not sorted.
513 // Start scanning keys from scratch and reorder.
517 for scan.scan(); len(scan.token) == 2; {
518 keyStart, keyEnd := scan.start, scan.end
519 end = scan.acceptMinSize(3)
521 keys = append(keys, scan.b[keyStart:end])
523 scan.setError(ErrSyntax)
527 sort.Stable(bytesSort{keys, 2})
528 if n := len(keys); n > 0 {
530 for i := 1; i < n; i++ {
531 if !bytes.Equal(keys[k][:2], keys[i][:2]) {
534 } else if !bytes.Equal(keys[k], keys[i]) {
535 scan.setError(ErrDuplicateKey)
540 reordered := bytes.Join(keys, separator)
541 if e := p + len(reordered); e < end {
542 scan.deleteRange(e, end)
545 copy(scan.b[p:], reordered)
551 if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
552 _, end = parseTag(scan)
553 scan.toLower(start, end)
555 for len(scan.token) == 2 && !isAlpha(scan.token[1]) {
556 end = scan.acceptMinSize(3)
559 end = scan.acceptMinSize(1)
561 end = scan.acceptMinSize(2)
566 // getExtension returns the end position of the extension and the extension string (its name and body).
567 func getExtension(s string, p int) (end int, ext string) {
574 end = nextExtension(s, p)
578 // nextExtension finds the next extension within the string, searching
579 // for the -<char>- pattern from position p.
580 // In the vast majority of cases, language tags will have at most
581 // one extension and extensions tend to be small.
582 func nextExtension(s string, p int) int {
583 for n := len(s) - 3; p < n; {