1 // Copyright 2013 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 //go:generate go run gen.go gen_common.go -output tables.go
7 package language // import "golang.org/x/text/internal/language"
9 // TODO: Remove above NOTE after:
10 // - verifying that tables are dropped correctly (most notably matcher tables).
19 // maxCoreSize is the maximum size of a BCP 47 tag without variants and
20 // extensions. Equals max lang (3) + script (4) + max reg (3) + 2 dashes.
23 // max99thPercentileSize is a somewhat arbitrary buffer size that presumably
24 // is large enough to hold at least 99% of the BCP 47 tags.
25 max99thPercentileSize = 32
27 // maxSimpleUExtensionSize is the maximum size of a -u extension with one
28 // key-type pair. Equals len("-u-") + key (2) + dash + max value (8).
29 maxSimpleUExtensionSize = 14
32 // Tag represents a BCP 47 language tag. It is used to specify an instance of a
33 // specific language or locale. All language tag values are guaranteed to be
34 // well-formed. The zero value of Tag is Und.
36 // TODO: the following fields have the form TagTypeID. This name is chosen
37 // to allow refactoring the public package without conflicting with its
38 // Base, Script, and Region methods. Once the transition is fully completed
39 // the ID can be stripped from the name.
43 // TODO: we will soon run out of positions for ScriptID. Idea: instead of
44 // storing lang, region, and ScriptID codes, store only the compact index and
45 // have a lookup table from this code to its expansion. This greatly speeds
46 // up table lookup and speeds up common variant cases.
47 // This will also immediately free up 3 extra bytes. Also, the pVariant
48 // field can now be moved to the lookup table, as the compact index uniquely
49 // determines the offset of a possible variant.
51 pVariant byte // offset in str, includes preceding '-'
52 pExt uint16 // offset of first extension, includes preceding '-'
54 // str is the string representation of the Tag. It will only be used if the
55 // tag has variants or extensions.
59 // Make is a convenience wrapper for Parse that omits the error.
60 // In case of an error, a sensible default is returned.
61 func Make(s string) Tag {
66 // Raw returns the raw base language, script and region, without making an
67 // attempt to infer their values.
68 // TODO: consider removing
69 func (t Tag) Raw() (b Language, s Script, r Region) {
70 return t.LangID, t.ScriptID, t.RegionID
73 // equalTags compares language, script and region subtags only.
74 func (t Tag) equalTags(a Tag) bool {
75 return t.LangID == a.LangID && t.ScriptID == a.ScriptID && t.RegionID == a.RegionID
78 // IsRoot returns true if t is equal to language "und".
79 func (t Tag) IsRoot() bool {
80 if int(t.pVariant) < len(t.str) {
83 return t.equalTags(Und)
86 // IsPrivateUse reports whether the Tag consists solely of a private use
88 func (t Tag) IsPrivateUse() bool {
89 return t.str != "" && t.pVariant == 0
92 // RemakeString is used to update t.str in case lang, script or region changed.
93 // It is assumed that pExt and pVariant still point to the start of the
95 func (t *Tag) RemakeString() {
99 extra := t.str[t.pVariant:]
103 if t.equalTags(Und) && strings.HasPrefix(extra, "x-") {
109 var buf [max99thPercentileSize]byte // avoid extra memory allocation in most cases.
110 b := buf[:t.genCoreBytes(buf[:])]
112 diff := len(b) - int(t.pVariant)
114 b = append(b, extra...)
115 t.pVariant = uint8(int(t.pVariant) + diff)
116 t.pExt = uint16(int(t.pExt) + diff)
118 t.pVariant = uint8(len(b))
119 t.pExt = uint16(len(b))
124 // genCoreBytes writes a string for the base language, script and region tags
125 // to the given buffer and returns the number of bytes written. It will never
126 // write more than maxCoreSize bytes.
127 func (t *Tag) genCoreBytes(buf []byte) int {
128 n := t.LangID.StringToBuf(buf[:])
130 n += copy(buf[n:], "-")
131 n += copy(buf[n:], t.ScriptID.String())
134 n += copy(buf[n:], "-")
135 n += copy(buf[n:], t.RegionID.String())
140 // String returns the canonical string representation of the language tag.
141 func (t Tag) String() string {
145 if t.ScriptID == 0 && t.RegionID == 0 {
146 return t.LangID.String()
148 buf := [maxCoreSize]byte{}
149 return string(buf[:t.genCoreBytes(buf[:])])
152 // MarshalText implements encoding.TextMarshaler.
153 func (t Tag) MarshalText() (text []byte, err error) {
155 text = append(text, t.str...)
156 } else if t.ScriptID == 0 && t.RegionID == 0 {
157 text = append(text, t.LangID.String()...)
159 buf := [maxCoreSize]byte{}
160 text = buf[:t.genCoreBytes(buf[:])]
165 // UnmarshalText implements encoding.TextUnmarshaler.
166 func (t *Tag) UnmarshalText(text []byte) error {
167 tag, err := Parse(string(text))
172 // Variants returns the part of the tag holding all variants or the empty string
173 // if there are no variants defined.
174 func (t Tag) Variants() string {
178 return t.str[t.pVariant:t.pExt]
181 // VariantOrPrivateUseTags returns variants or private use tags.
182 func (t Tag) VariantOrPrivateUseTags() string {
184 return t.str[t.pVariant:t.pExt]
186 return t.str[t.pVariant:]
189 // HasString reports whether this tag defines more than just the raw
191 func (t Tag) HasString() bool {
195 // Parent returns the CLDR parent of t. In CLDR, missing fields in data for a
196 // specific language are substituted with fields from the parent language.
197 // The parent for a language may change for newer versions of CLDR.
198 func (t Tag) Parent() Tag {
200 // Strip the variants and extensions.
202 t = Tag{LangID: b, ScriptID: s, RegionID: r}
203 if t.RegionID == 0 && t.ScriptID != 0 && t.LangID != 0 {
204 base, _ := addTags(Tag{LangID: t.LangID})
205 if base.ScriptID == t.ScriptID {
206 return Tag{LangID: t.LangID}
213 maxScript := t.ScriptID
216 maxScript = max.ScriptID
219 for i := range parents {
220 if Language(parents[i].lang) == t.LangID && Script(parents[i].maxScript) == maxScript {
221 for _, r := range parents[i].fromRegion {
222 if Region(r) == t.RegionID {
225 ScriptID: Script(parents[i].script),
226 RegionID: Region(parents[i].toRegion),
233 // Strip the script if it is the default one.
234 base, _ := addTags(Tag{LangID: t.LangID})
235 if base.ScriptID != maxScript {
236 return Tag{LangID: t.LangID, ScriptID: maxScript}
238 return Tag{LangID: t.LangID}
239 } else if t.ScriptID != 0 {
240 // The parent for a base-script pair with a non-default script is
241 // "und" instead of the base language.
242 base, _ := addTags(Tag{LangID: t.LangID})
243 if base.ScriptID != t.ScriptID {
246 return Tag{LangID: t.LangID}
252 // ParseExtension parses s as an extension and returns it on success.
253 func ParseExtension(s string) (ext string, err error) {
254 scan := makeScannerString(s)
256 if n := len(scan.token); n != 1 {
259 scan.toLower(0, len(scan.b))
260 end = parseExtension(&scan)
264 return string(scan.b), nil
267 // HasVariants reports whether t has variants.
268 func (t Tag) HasVariants() bool {
269 return uint16(t.pVariant) < t.pExt
272 // HasExtensions reports whether t has extensions.
273 func (t Tag) HasExtensions() bool {
274 return int(t.pExt) < len(t.str)
277 // Extension returns the extension of type x for tag t. It will return
278 // false for ok if t does not have the requested extension. The returned
279 // extension will be invalid in this case.
280 func (t Tag) Extension(x byte) (ext string, ok bool) {
281 for i := int(t.pExt); i < len(t.str)-1; {
283 i, ext = getExtension(t.str, i)
291 // Extensions returns all extensions of t.
292 func (t Tag) Extensions() []string {
294 for i := int(t.pExt); i < len(t.str)-1; {
296 i, ext = getExtension(t.str, i)
302 // TypeForKey returns the type associated with the given key, where key and type
303 // are of the allowed values defined for the Unicode locale extension ('u') in
304 // https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
305 // TypeForKey will traverse the inheritance chain to get the correct value.
306 func (t Tag) TypeForKey(key string) string {
307 if start, end, _ := t.findTypeForKey(key); end != start {
308 return t.str[start:end]
314 errPrivateUse = errors.New("cannot set a key on a private use tag")
315 errInvalidArguments = errors.New("invalid key or type")
318 // SetTypeForKey returns a new Tag with the key set to type, where key and type
319 // are of the allowed values defined for the Unicode locale extension ('u') in
320 // https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
321 // An empty value removes an existing pair with the same key.
322 func (t Tag) SetTypeForKey(key, value string) (Tag, error) {
323 if t.IsPrivateUse() {
324 return t, errPrivateUse
327 return t, errInvalidArguments
330 // Remove the setting if value is "".
332 start, end, _ := t.findTypeForKey(key)
334 // Remove key tag and leading '-'.
337 // Remove a possible empty extension.
338 if (end == len(t.str) || t.str[end+2] == '-') && t.str[start-2] == '-' {
341 if start == int(t.pVariant) && end == len(t.str) {
343 t.pVariant, t.pExt = 0, 0
345 t.str = fmt.Sprintf("%s%s", t.str[:start], t.str[end:])
351 if len(value) < 3 || len(value) > 8 {
352 return t, errInvalidArguments
356 buf [maxCoreSize + maxSimpleUExtensionSize]byte
357 uStart int // start of the -u extension.
360 // Generate the tag string if needed.
362 uStart = t.genCoreBytes(buf[:])
367 // Create new key-type pair and parse it to verify.
372 b = b[:5+copy(b[5:], value)]
373 scan := makeScanner(b)
374 if parseExtensions(&scan); scan.err != nil {
378 // Assemble the replacement string.
380 t.pVariant, t.pExt = byte(uStart-1), uint16(uStart-1)
381 t.str = string(buf[:uStart+len(b)])
384 start, end, hasExt := t.findTypeForKey(key)
389 t.str = fmt.Sprintf("%s-%s%s", s[:start], b, s[end:])
391 t.str = fmt.Sprintf("%s%s%s", s[:start], value, s[end:])
397 // findTypeForKey returns the start and end position for the type corresponding
398 // to key or the point at which to insert the key-value pair if the type
399 // wasn't found. The hasExt return value reports whether an -u extension was present.
400 // Note: the extensions are typically very small and are likely to contain
401 // only one key-type pair.
402 func (t Tag) findTypeForKey(key string) (start, end int, hasExt bool) {
404 if len(key) != 2 || p == len(t.str) || p == 0 {
409 // Find the correct extension.
410 for p++; s[p] != 'u'; p++ {
415 if p = nextExtension(s, p); p == len(s) {
416 return len(s), len(s), false
419 // Proceed to the hyphen following the extension name.
422 // curKey is the key currently being processed.
425 // Iterate over keys until we get the end of a section.
427 // p points to the hyphen preceding the current token.
428 if p3 := p + 3; s[p3] == '-' {
430 // Check whether we just processed the key that was requested.
432 return start, p, true
434 // Set to the next key and continue scanning type tokens.
439 // Start of the type token sequence.
441 // A type is at least 3 characters long.
444 // Attribute or type, which is at least 3 characters long.
447 // p points past the third character of a type or attribute.
448 max := p + 5 // maximum length of token plus hyphen.
452 for ; p < max && s[p] != '-'; p++ {
454 // Bail if we have exhausted all tokens or if the next token starts
456 if p == len(s) || s[p+2] == '-' {
458 return start, p, true
465 // ParseBase parses a 2- or 3-letter ISO 639 code.
466 // It returns a ValueError if s is a well-formed but unknown language identifier
467 // or another error if another error occurred.
468 func ParseBase(s string) (Language, error) {
469 if n := len(s); n < 2 || 3 < n {
473 return getLangID(buf[:copy(buf[:], s)])
476 // ParseScript parses a 4-letter ISO 15924 code.
477 // It returns a ValueError if s is a well-formed but unknown script identifier
478 // or another error if another error occurred.
479 func ParseScript(s string) (Script, error) {
484 return getScriptID(script, buf[:copy(buf[:], s)])
487 // EncodeM49 returns the Region for the given UN M.49 code.
488 // It returns an error if r is not a valid code.
489 func EncodeM49(r int) (Region, error) {
490 return getRegionM49(r)
493 // ParseRegion parses a 2- or 3-letter ISO 3166-1 or a UN M.49 code.
494 // It returns a ValueError if s is a well-formed but unknown region identifier
495 // or another error if another error occurred.
496 func ParseRegion(s string) (Region, error) {
497 if n := len(s); n < 2 || 3 < n {
501 return getRegionID(buf[:copy(buf[:], s)])
504 // IsCountry returns whether this region is a country or autonomous area. This
505 // includes non-standard definitions from CLDR.
506 func (r Region) IsCountry() bool {
507 if r == 0 || r.IsGroup() || r.IsPrivateUse() && r != _XK {
513 // IsGroup returns whether this region defines a collection of regions. This
514 // includes non-standard definitions from CLDR.
515 func (r Region) IsGroup() bool {
519 return int(regionInclusion[r]) < len(regionContainment)
522 // Contains returns whether Region c is contained by Region r. It returns true
524 func (r Region) Contains(c Region) bool {
528 g := regionInclusion[r]
529 if g >= nRegionGroups {
532 m := regionContainment[g]
534 d := regionInclusion[c]
535 b := regionInclusionBits[d]
537 // A contained country may belong to multiple disjoint groups. Matching any
538 // of these indicates containment. If the contained region is a group, it
539 // must strictly be a subset.
540 if d >= nRegionGroups {
546 var errNoTLD = errors.New("language: region is not a valid ccTLD")
548 // TLD returns the country code top-level domain (ccTLD). UK is returned for GB.
549 // In all other cases it returns either the region itself or an error.
551 // This method may return an error for a region for which there exists a
552 // canonical form with a ccTLD. To get that ccTLD canonicalize r first. The
553 // region will already be canonicalized if it was obtained from a Tag that was
554 // obtained using any of the default methods.
555 func (r Region) TLD() (Region, error) {
556 // See http://en.wikipedia.org/wiki/Country_code_top-level_domain for the
557 // difference between ISO 3166-1 and IANA ccTLD.
561 if (r.typ() & ccTLD) == 0 {
567 // Canonicalize returns the region or a possible replacement if the region is
568 // deprecated. It will not return a replacement for deprecated regions that
569 // are split into multiple regions.
570 func (r Region) Canonicalize() Region {
571 if cr := normRegion(r); cr != 0 {
577 // Variant represents a registered variant of a language as defined by BCP 47.
578 type Variant struct {
583 // ParseVariant parses and returns a Variant. An error is returned if s is not
585 func ParseVariant(s string) (Variant, error) {
586 s = strings.ToLower(s)
587 if id, ok := variantIndex[s]; ok {
588 return Variant{id, s}, nil
590 return Variant{}, NewValueError([]byte(s))
593 // String returns the string representation of the variant.
594 func (v Variant) String() string {