1 // Copyright 2012 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // TODO: remove hard-coded versions when we have implemented fractional weights.
6 // The current implementation is incompatible with later CLDR versions.
7 //go:generate go run maketables.go -cldr=23 -unicode=6.2.0
9 // Package collate contains types for comparing and sorting Unicode strings
10 // according to a given collation order.
11 package collate // import "golang.org/x/text/collate"
17 "golang.org/x/text/internal/colltab"
18 "golang.org/x/text/language"
21 // Collator provides functionality for comparing strings for a given
23 type Collator struct {
31 func (c *Collator) iter(i int) *iter {
32 // TODO: evaluate performance for making the second iterator optional.
36 // Supported returns the list of languages for which collating differs from its parent.
37 func Supported() []language.Tag {
38 // TODO: use language.Coverage instead.
40 t := make([]language.Tag, len(tags))
46 ids := strings.Split(availableLocales, ",")
47 tags = make([]language.Tag, len(ids))
48 for i, s := range ids {
49 tags[i] = language.Raw.MustParse(s)
53 var tags []language.Tag
55 // New returns a new Collator initialized for the given locale.
56 func New(t language.Tag, o ...Option) *Collator {
57 index := colltab.MatchLang(t, tags)
58 c := newCollator(getTable(locales[index]))
60 // Set options from the user-supplied tag.
63 // Set the user-supplied options.
70 // NewFromTable returns a new Collator for the given Weighter.
71 func NewFromTable(w colltab.Weighter, o ...Option) *Collator {
78 func (c *Collator) init() {
80 c.t = colltab.NewNumericWeighter(c.t)
86 // Buffer holds keys generated by Key and KeyString.
92 func (b *Buffer) init() {
98 // Reset clears the buffer from previous results generated by Key and KeyString.
99 func (b *Buffer) Reset() {
103 // Compare returns an integer comparing the two byte slices.
104 // The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
105 func (c *Collator) Compare(a, b []byte) int {
106 // TODO: skip identical prefixes once we have a fast way to detect if a rune is
107 // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
108 c.iter(0).SetInput(a)
109 c.iter(1).SetInput(b)
110 if res := c.compare(); res != 0 {
113 if !c.ignore[colltab.Identity] {
114 return bytes.Compare(a, b)
119 // CompareString returns an integer comparing the two strings.
120 // The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
121 func (c *Collator) CompareString(a, b string) int {
122 // TODO: skip identical prefixes once we have a fast way to detect if a rune is
123 // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
124 c.iter(0).SetInputString(a)
125 c.iter(1).SetInputString(b)
126 if res := c.compare(); res != 0 {
129 if !c.ignore[colltab.Identity] {
139 func compareLevel(f func(i *iter) int, a, b *iter) int {
157 func (c *Collator) compare() int {
158 ia, ib := c.iter(0), c.iter(1)
159 // Process primary level
160 if c.alternate != altShifted {
161 // TODO: implement script reordering
162 if res := compareLevel((*iter).nextPrimary, ia, ib); res != 0 {
166 // TODO: handle shifted
168 if !c.ignore[colltab.Secondary] {
169 f := (*iter).nextSecondary
171 f = (*iter).prevSecondary
173 if res := compareLevel(f, ia, ib); res != 0 {
177 // TODO: special case handling (Danish?)
178 if !c.ignore[colltab.Tertiary] || c.caseLevel {
179 if res := compareLevel((*iter).nextTertiary, ia, ib); res != 0 {
182 if !c.ignore[colltab.Quaternary] {
183 if res := compareLevel((*iter).nextQuaternary, ia, ib); res != 0 {
191 // Key returns the collation key for str.
192 // Passing the buffer buf may avoid memory allocations.
193 // The returned slice will point to an allocation in Buffer and will remain
194 // valid until the next call to buf.Reset().
195 func (c *Collator) Key(buf *Buffer, str []byte) []byte {
196 // See https://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
198 return c.key(buf, c.getColElems(str))
201 // KeyFromString returns the collation key for str.
202 // Passing the buffer buf may avoid memory allocations.
203 // The returned slice will point to an allocation in Buffer and will retain
204 // valid until the next call to buf.ResetKeys().
205 func (c *Collator) KeyFromString(buf *Buffer, str string) []byte {
206 // See https://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
208 return c.key(buf, c.getColElemsString(str))
211 func (c *Collator) key(buf *Buffer, w []colltab.Elem) []byte {
212 processWeights(c.alternate, c.t.Top(), w)
214 c.keyFromElems(buf, w)
218 func (c *Collator) getColElems(str []byte) []colltab.Elem {
226 func (c *Collator) getColElemsString(str string) []colltab.Elem {
228 i.SetInputString(str)
241 func (i *iter) init(c *Collator) {
246 func (i *iter) nextPrimary() int {
248 for ; i.pce < i.N; i.pce++ {
249 if v := i.Elems[i.pce].Primary(); v != 0 {
258 panic("should not reach here")
261 func (i *iter) nextSecondary() int {
262 for ; i.pce < len(i.Elems); i.pce++ {
263 if v := i.Elems[i.pce].Secondary(); v != 0 {
271 func (i *iter) prevSecondary() int {
272 for ; i.pce < len(i.Elems); i.pce++ {
273 if v := i.Elems[len(i.Elems)-i.pce-1].Secondary(); v != 0 {
281 func (i *iter) nextTertiary() int {
282 for ; i.pce < len(i.Elems); i.pce++ {
283 if v := i.Elems[i.pce].Tertiary(); v != 0 {
291 func (i *iter) nextQuaternary() int {
292 for ; i.pce < len(i.Elems); i.pce++ {
293 if v := i.Elems[i.pce].Quaternary(); v != 0 {
301 func appendPrimary(key []byte, p int) []byte {
302 // Convert to variable length encoding; supports up to 23 bits.
304 key = append(key, uint8(p>>8), uint8(p))
306 key = append(key, uint8(p>>16)|0x80, uint8(p>>8), uint8(p))
311 // keyFromElems converts the weights ws to a compact sequence of bytes.
312 // The result will be appended to the byte buffer in buf.
313 func (c *Collator) keyFromElems(buf *Buffer, ws []colltab.Elem) {
314 for _, v := range ws {
315 if w := v.Primary(); w > 0 {
316 buf.key = appendPrimary(buf.key, w)
319 if !c.ignore[colltab.Secondary] {
320 buf.key = append(buf.key, 0, 0)
321 // TODO: we can use one 0 if we can guarantee that all non-zero weights are > 0xFF.
323 for _, v := range ws {
324 if w := v.Secondary(); w > 0 {
325 buf.key = append(buf.key, uint8(w>>8), uint8(w))
329 for i := len(ws) - 1; i >= 0; i-- {
330 if w := ws[i].Secondary(); w > 0 {
331 buf.key = append(buf.key, uint8(w>>8), uint8(w))
335 } else if c.caseLevel {
336 buf.key = append(buf.key, 0, 0)
338 if !c.ignore[colltab.Tertiary] || c.caseLevel {
339 buf.key = append(buf.key, 0, 0)
340 for _, v := range ws {
341 if w := v.Tertiary(); w > 0 {
342 buf.key = append(buf.key, uint8(w))
345 // Derive the quaternary weights from the options and other levels.
346 // Note that we represent MaxQuaternary as 0xFF. The first byte of the
347 // representation of a primary weight is always smaller than 0xFF,
348 // so using this single byte value will compare correctly.
349 if !c.ignore[colltab.Quaternary] && c.alternate >= altShifted {
350 if c.alternate == altShiftTrimmed {
351 lastNonFFFF := len(buf.key)
352 buf.key = append(buf.key, 0)
353 for _, v := range ws {
354 if w := v.Quaternary(); w == colltab.MaxQuaternary {
355 buf.key = append(buf.key, 0xFF)
357 buf.key = appendPrimary(buf.key, w)
358 lastNonFFFF = len(buf.key)
361 buf.key = buf.key[:lastNonFFFF]
363 buf.key = append(buf.key, 0)
364 for _, v := range ws {
365 if w := v.Quaternary(); w == colltab.MaxQuaternary {
366 buf.key = append(buf.key, 0xFF)
368 buf.key = appendPrimary(buf.key, w)
376 func processWeights(vw alternateHandling, top uint32, wa []colltab.Elem) {
380 case altShifted, altShiftTrimmed:
382 if p := wa[i].Primary(); p <= vtop && p != 0 {
383 wa[i] = colltab.MakeQuaternary(p)
387 wa[i] = colltab.Ignore
395 if p := wa[i].Primary(); p <= vtop && (ignore || p != 0) {
396 wa[i] = colltab.Ignore