barometer: update DMA's vendoring packages
[barometer.git] / src / dma / vendor / golang.org / x / text / unicode / norm / iter.go
1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 package norm
6
7 import (
8         "fmt"
9         "unicode/utf8"
10 )
11
// MaxSegmentSize is the maximum size of a byte buffer needed to consider any
// sequence of starter and non-starter runes for the purpose of normalization.
// It is an exported alias for the package-internal maxByteBufferSize, which is
// also the size of the scratch buffer embedded in Iter.
const MaxSegmentSize = maxByteBufferSize
15
// An Iter iterates over a string or byte slice, while normalizing it
// to a given Form.
//
// An Iter must be prepared with Init or InitString before Next is called:
// those are the only places in this file where the next function is first
// assigned.
type Iter struct {
	// rb is the reorder buffer. In this file it is also the holder of the
	// input source (rb.src), the input length (rb.nsrc), the form
	// information (rb.f) and the stream-safe state (rb.ss).
	rb reorderBuffer
	// buf is scratch space for output; slices returned by Next may alias it.
	buf    [maxByteBufferSize]byte
	info   Properties // first character saved from previous iteration
	next   iterFunc   // implementation of next depends on form
	// asciiF is the single-byte fast path matching the input kind:
	// nextASCIIBytes after Init, nextASCIIString after InitString.
	asciiF iterFunc

	p        int    // current position in input source
	multiSeg []byte // remainder of multi-segment decomposition
}
28
// iterFunc is the signature shared by all implementations of Next. Each
// implementation returns the next segment and may replace i.next to switch
// the iterator to a different implementation for the following call.
type iterFunc func(*Iter) []byte
30
31 // Init initializes i to iterate over src after normalizing it to Form f.
32 func (i *Iter) Init(f Form, src []byte) {
33         i.p = 0
34         if len(src) == 0 {
35                 i.setDone()
36                 i.rb.nsrc = 0
37                 return
38         }
39         i.multiSeg = nil
40         i.rb.init(f, src)
41         i.next = i.rb.f.nextMain
42         i.asciiF = nextASCIIBytes
43         i.info = i.rb.f.info(i.rb.src, i.p)
44         i.rb.ss.first(i.info)
45 }
46
47 // InitString initializes i to iterate over src after normalizing it to Form f.
48 func (i *Iter) InitString(f Form, src string) {
49         i.p = 0
50         if len(src) == 0 {
51                 i.setDone()
52                 i.rb.nsrc = 0
53                 return
54         }
55         i.multiSeg = nil
56         i.rb.initString(f, src)
57         i.next = i.rb.f.nextMain
58         i.asciiF = nextASCIIString
59         i.info = i.rb.f.info(i.rb.src, i.p)
60         i.rb.ss.first(i.info)
61 }
62
63 // Seek sets the segment to be returned by the next call to Next to start
64 // at position p.  It is the responsibility of the caller to set p to the
65 // start of a segment.
66 func (i *Iter) Seek(offset int64, whence int) (int64, error) {
67         var abs int64
68         switch whence {
69         case 0:
70                 abs = offset
71         case 1:
72                 abs = int64(i.p) + offset
73         case 2:
74                 abs = int64(i.rb.nsrc) + offset
75         default:
76                 return 0, fmt.Errorf("norm: invalid whence")
77         }
78         if abs < 0 {
79                 return 0, fmt.Errorf("norm: negative position")
80         }
81         if int(abs) >= i.rb.nsrc {
82                 i.setDone()
83                 return int64(i.p), nil
84         }
85         i.p = int(abs)
86         i.multiSeg = nil
87         i.next = i.rb.f.nextMain
88         i.info = i.rb.f.info(i.rb.src, i.p)
89         i.rb.ss.first(i.info)
90         return abs, nil
91 }
92
93 // returnSlice returns a slice of the underlying input type as a byte slice.
94 // If the underlying is of type []byte, it will simply return a slice.
95 // If the underlying is of type string, it will copy the slice to the buffer
96 // and return that.
97 func (i *Iter) returnSlice(a, b int) []byte {
98         if i.rb.src.bytes == nil {
99                 return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])]
100         }
101         return i.rb.src.bytes[a:b]
102 }
103
// Pos returns the byte position at which the next call to Next will commence processing.
// Once the iterator has been marked done by setDone, this is the input length.
func (i *Iter) Pos() int {
	return i.p
}
108
// setDone marks the iteration as finished: subsequent calls to Next go to
// nextDone (which returns nil) and the position is moved to the recorded
// input length i.rb.nsrc, so that Done reports true.
func (i *Iter) setDone() {
	i.next = nextDone
	i.p = i.rb.nsrc
}
113
// Done returns true if there is no more input to process, that is, when the
// current position has reached or passed the recorded input length.
func (i *Iter) Done() bool {
	return i.p >= i.rb.nsrc
}
118
// Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
// For any input a and b for which f(a) == f(b), subsequent calls
// to Next will return the same segments.
// Modifying runes are grouped together with the preceding starter, if such a starter exists.
// Although not guaranteed, n will typically be the smallest possible n.
//
// The returned slice may alias either the input passed to Init or the
// iterator's internal buffer, so it should be treated as valid only until the
// next call to Next. Next dispatches to the current iterFunc in i.next, which
// is first assigned by Init or InitString.
func (i *Iter) Next() []byte {
	return i.next(i)
}
127
128 func nextASCIIBytes(i *Iter) []byte {
129         p := i.p + 1
130         if p >= i.rb.nsrc {
131                 p0 := i.p
132                 i.setDone()
133                 return i.rb.src.bytes[p0:p]
134         }
135         if i.rb.src.bytes[p] < utf8.RuneSelf {
136                 p0 := i.p
137                 i.p = p
138                 return i.rb.src.bytes[p0:p]
139         }
140         i.info = i.rb.f.info(i.rb.src, i.p)
141         i.next = i.rb.f.nextMain
142         return i.next(i)
143 }
144
145 func nextASCIIString(i *Iter) []byte {
146         p := i.p + 1
147         if p >= i.rb.nsrc {
148                 i.buf[0] = i.rb.src.str[i.p]
149                 i.setDone()
150                 return i.buf[:1]
151         }
152         if i.rb.src.str[p] < utf8.RuneSelf {
153                 i.buf[0] = i.rb.src.str[i.p]
154                 i.p = p
155                 return i.buf[:1]
156         }
157         i.info = i.rb.f.info(i.rb.src, i.p)
158         i.next = i.rb.f.nextMain
159         return i.next(i)
160 }
161
162 func nextHangul(i *Iter) []byte {
163         p := i.p
164         next := p + hangulUTF8Size
165         if next >= i.rb.nsrc {
166                 i.setDone()
167         } else if i.rb.src.hangul(next) == 0 {
168                 i.rb.ss.next(i.info)
169                 i.info = i.rb.f.info(i.rb.src, i.p)
170                 i.next = i.rb.f.nextMain
171                 return i.next(i)
172         }
173         i.p = next
174         return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))]
175 }
176
// nextDone is the iterFunc installed by setDone once the input is exhausted.
// It ignores the iterator and always returns nil.
func nextDone(i *Iter) []byte {
	return nil
}
180
181 // nextMulti is used for iterating over multi-segment decompositions
182 // for decomposing normal forms.
183 func nextMulti(i *Iter) []byte {
184         j := 0
185         d := i.multiSeg
186         // skip first rune
187         for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
188         }
189         for j < len(d) {
190                 info := i.rb.f.info(input{bytes: d}, j)
191                 if info.BoundaryBefore() {
192                         i.multiSeg = d[j:]
193                         return d[:j]
194                 }
195                 j += int(info.size)
196         }
197         // treat last segment as normal decomposition
198         i.next = i.rb.f.nextMain
199         return i.next(i)
200 }
201
202 // nextMultiNorm is used for iterating over multi-segment decompositions
203 // for composing normal forms.
204 func nextMultiNorm(i *Iter) []byte {
205         j := 0
206         d := i.multiSeg
207         for j < len(d) {
208                 info := i.rb.f.info(input{bytes: d}, j)
209                 if info.BoundaryBefore() {
210                         i.rb.compose()
211                         seg := i.buf[:i.rb.flushCopy(i.buf[:])]
212                         i.rb.insertUnsafe(input{bytes: d}, j, info)
213                         i.multiSeg = d[j+int(info.size):]
214                         return seg
215                 }
216                 i.rb.insertUnsafe(input{bytes: d}, j, info)
217                 j += int(info.size)
218         }
219         i.multiSeg = nil
220         i.next = nextComposed
221         return doNormComposed(i)
222 }
223
// nextDecomposed is the implementation of Next for forms NFD and NFKD.
//
// It walks the input rune by rune starting at i.p, building one output
// segment. While no rune needs rewriting, the segment is returned straight
// from the input through returnSlice. Once a decomposition or a Hangul
// syllable has been written, the output is assembled in i.buf. If a rune is
// found whose ccc is lower than the trailing ccc of the preceding one, the
// work so far is moved into the reorder buffer and doNormDecomposed finishes
// the segment.
func nextDecomposed(i *Iter) (next []byte) {
	// outp is the length of the output produced so far.
	// Input bytes in [inCopyStart, i.p) are pending: they belong in i.buf at
	// offset outCopyStart but have not been copied there yet.
	outp := 0
	inCopyStart, outCopyStart := i.p, 0
	for {
		if sz := int(i.info.size); sz <= 1 {
			i.rb.ss = 0
			p := i.p
			i.p++ // ASCII or illegal byte.  Either way, advance by 1.
			if i.p >= i.rb.nsrc {
				i.setDone()
				return i.returnSlice(p, i.p)
			} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
				// Another ASCII byte follows: switch to the single-byte
				// fast path for the following calls.
				i.next = i.asciiF
				return i.returnSlice(p, i.p)
			}
			outp++
		} else if d := i.info.Decomposition(); d != nil {
			// Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
			// Case 1: there is a leftover to copy.  In this case the decomposition
			// must begin with a modifier and should always be appended.
			// Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
			p := outp + len(d)
			if outp > 0 {
				i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
				// TODO: this condition should not be possible, but we leave it
				// in for defensive purposes.
				if p > len(i.buf) {
					return i.buf[:outp]
				}
			} else if i.info.multiSegment() {
				// outp must be 0 as multi-segment decompositions always
				// start a new segment.
				if i.multiSeg == nil {
					i.multiSeg = d
					i.next = nextMulti
					return nextMulti(i)
				}
				// We are in the last segment.  Treat as normal decomposition.
				d = i.multiSeg
				i.multiSeg = nil
				p = len(d)
			}
			prevCC := i.info.tccc
			if i.p += sz; i.p >= i.rb.nsrc {
				i.setDone()
				i.info = Properties{} // Force BoundaryBefore to succeed.
			} else {
				i.info = i.rb.f.info(i.rb.src, i.p)
			}
			switch i.rb.ss.next(i.info) {
			case ssOverflow:
				i.next = nextCGJDecompose
				fallthrough
			case ssStarter:
				// The segment ends here. With nothing accumulated before d,
				// d itself is returned without copying.
				if outp > 0 {
					copy(i.buf[outp:], d)
					return i.buf[:p]
				}
				return d
			}
			// The segment continues: append d and restart the pending input
			// range after the rune that was just decomposed.
			copy(i.buf[outp:], d)
			outp = p
			inCopyStart, outCopyStart = i.p, outp
			if i.info.ccc < prevCC {
				goto doNorm
			}
			continue
		} else if r := i.rb.src.hangul(i.p); r != 0 {
			outp = decomposeHangul(i.buf[:], r)
			i.p += hangulUTF8Size
			inCopyStart, outCopyStart = i.p, outp
			if i.p >= i.rb.nsrc {
				i.setDone()
				break
			} else if i.rb.src.hangul(i.p) != 0 {
				// More Hangul follows: let nextHangul handle the next calls.
				i.next = nextHangul
				return i.buf[:outp]
			}
		} else {
			// No decomposition: the rune is passed through unchanged. It is
			// only accounted for here; the bytes stay pending until copied.
			p := outp + sz
			if p > len(i.buf) {
				break
			}
			outp = p
			i.p += sz
		}
		if i.p >= i.rb.nsrc {
			i.setDone()
			break
		}
		prevCC := i.info.tccc
		i.info = i.rb.f.info(i.rb.src, i.p)
		if v := i.rb.ss.next(i.info); v == ssStarter {
			break
		} else if v == ssOverflow {
			i.next = nextCGJDecompose
			break
		}
		if i.info.ccc < prevCC {
			goto doNorm
		}
	}
	// outCopyStart is only moved off 0 by the branches above that write into
	// i.buf, so 0 here means the segment can be served from the input itself.
	if outCopyStart == 0 {
		return i.returnSlice(inCopyStart, i.p)
	} else if inCopyStart < i.p {
		i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	}
	return i.buf[:outp]
doNorm:
	// Insert what we have decomposed so far in the reorderBuffer.
	// As we will only reorder, there will always be enough room.
	i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	i.rb.insertDecomposed(i.buf[0:outp])
	return doNormDecomposed(i)
}
340
341 func doNormDecomposed(i *Iter) []byte {
342         for {
343                 i.rb.insertUnsafe(i.rb.src, i.p, i.info)
344                 if i.p += int(i.info.size); i.p >= i.rb.nsrc {
345                         i.setDone()
346                         break
347                 }
348                 i.info = i.rb.f.info(i.rb.src, i.p)
349                 if i.info.ccc == 0 {
350                         break
351                 }
352                 if s := i.rb.ss.next(i.info); s == ssOverflow {
353                         i.next = nextCGJDecompose
354                         break
355                 }
356         }
357         // new segment or too many combining characters: exit normalization
358         return i.buf[:i.rb.flushCopy(i.buf[:])]
359 }
360
361 func nextCGJDecompose(i *Iter) []byte {
362         i.rb.ss = 0
363         i.rb.insertCGJ()
364         i.next = nextDecomposed
365         i.rb.ss.first(i.info)
366         buf := doNormDecomposed(i)
367         return buf
368 }
369
// nextComposed is the implementation of Next for forms NFC and NFKC.
//
// The fast path scans forward while every rune satisfies isYesC and the ccc
// values do not go down, and then returns that stretch of input unchanged
// through returnSlice. As soon as a rune fails isYesC or the ccc order is
// violated, the scan is abandoned and the segment is rebuilt from its start
// position using the reorder buffer (label doNorm).
func nextComposed(i *Iter) []byte {
	// outp only tracks the segment length so that it can be checked against
	// the size of i.buf; startp remembers where the segment began.
	outp, startp := 0, i.p
	var prevCC uint8
	for {
		if !i.info.isYesC() {
			goto doNorm
		}
		prevCC = i.info.tccc
		sz := int(i.info.size)
		if sz == 0 {
			sz = 1 // illegal rune: copy byte-by-byte
		}
		p := outp + sz
		if p > len(i.buf) {
			break
		}
		outp = p
		i.p += sz
		if i.p >= i.rb.nsrc {
			i.setDone()
			break
		} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
			// An ASCII byte follows: end the segment here and use the
			// single-byte fast path for the following call.
			i.rb.ss = 0
			i.next = i.asciiF
			break
		}
		i.info = i.rb.f.info(i.rb.src, i.p)
		if v := i.rb.ss.next(i.info); v == ssStarter {
			break
		} else if v == ssOverflow {
			i.next = nextCGJCompose
			break
		}
		if i.info.ccc < prevCC {
			goto doNorm
		}
	}
	return i.returnSlice(startp, i.p)
doNorm:
	// reset to start position
	i.p = startp
	i.info = i.rb.f.info(i.rb.src, i.p)
	i.rb.ss.first(i.info)
	if i.info.multiSegment() {
		// Feed the decomposition of the first rune through nextMultiNorm:
		// its first rune goes into the reorder buffer now, the remainder is
		// kept in i.multiSeg.
		d := i.info.Decomposition()
		info := i.rb.f.info(input{bytes: d}, 0)
		i.rb.insertUnsafe(input{bytes: d}, 0, info)
		i.multiSeg = d[int(info.size):]
		i.next = nextMultiNorm
		return nextMultiNorm(i)
	}
	// NOTE(review): ss.first was already called with the same i.info a few
	// lines above; this second call looks redundant - confirm against the
	// definition of first before removing it.
	i.rb.ss.first(i.info)
	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
	return doNormComposed(i)
}
426
427 func doNormComposed(i *Iter) []byte {
428         // First rune should already be inserted.
429         for {
430                 if i.p += int(i.info.size); i.p >= i.rb.nsrc {
431                         i.setDone()
432                         break
433                 }
434                 i.info = i.rb.f.info(i.rb.src, i.p)
435                 if s := i.rb.ss.next(i.info); s == ssStarter {
436                         break
437                 } else if s == ssOverflow {
438                         i.next = nextCGJCompose
439                         break
440                 }
441                 i.rb.insertUnsafe(i.rb.src, i.p, i.info)
442         }
443         i.rb.compose()
444         seg := i.buf[:i.rb.flushCopy(i.buf[:])]
445         return seg
446 }
447
// nextCGJCompose is the iterFunc scheduled after the stream-safe state
// reported an overflow while composing. It resets that state, inserts a CGJ
// into the reorder buffer, makes nextComposed the function for later calls,
// inserts the pending rune (i.info at i.p) and lets doNormComposed produce
// the segment.
func nextCGJCompose(i *Iter) []byte {
	i.rb.ss = 0 // instead of first
	i.rb.insertCGJ()
	i.next = nextComposed
	// Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter,
	// even if they are not. This is particularly dubious for U+FF9E and UFF9A.
	// If we ever change that, insert a check here.
	i.rb.ss.first(i.info)
	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
	return doNormComposed(i)
}