1 // Copyright 2016 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // Package bidirule implements the Bidi Rule defined by RFC 5893.
7 // This package is under development. The API may change without notice and
8 // without preserving backward compatibility.
15 "golang.org/x/text/transform"
16 "golang.org/x/text/unicode/bidi"
19 // This file contains an implementation of RFC 5893: Right-to-Left Scripts for
20 // Internationalized Domain Names for Applications (IDNA)
22 // A label is an individual component of a domain name. Labels are usually
23 // shown separated by dots; for example, the domain name "www.example.com" is
24 // composed of three labels: "www", "example", and "com".
26 // An RTL label is a label that contains at least one character of class R, AL,
27 // or AN. An LTR label is any label that is not an RTL label.
29 // A "Bidi domain name" is a domain name that contains at least one RTL label.
31 // The following guarantees can be made based on the above:
33 // o In a domain name consisting of only labels that satisfy the rule,
34 // the requirements of Section 3 are satisfied. Note that even LTR
35 // labels and pure ASCII labels have to be tested.
37 // o In a domain name consisting of only LDH labels (as defined in the
38 // Definitions document [RFC5890]) and labels that satisfy the rule,
39 // the requirements of Section 3 are satisfied as long as a label
40 // that starts with an ASCII digit does not come after a
41 // right-to-left label.
43 // No guarantee is given for other combinations.
45 // ErrInvalid indicates a label is invalid according to the Bidi Rule.
// It is a package-level sentinel created with errors.New, so it can be
// compared by identity (== or errors.Is).
46 var ErrInvalid = errors.New("bidirule: failed Bidi Rule")
// ruleInitial is a ruleState constant defined with iota.
// NOTE(review): presumably the first entry of its const block, which would
// make it the zero ruleState — confirm against the full declaration.
51 ruleInitial ruleState = iota
// ruleTransition pairs a rule state with a bit mask of Bidi classes. The
// transitions table builds values from exactly those two items, and advance
// and advanceString test the mask through the field named mask.
59 type ruleTransition struct {
// transitions is the state table of the Bidi Rule state machine. It is indexed
// by the current state (transitions[t.state]) and each row holds two candidate
// transitions; advance and advanceString test the mask of entry 0 first and
// the mask of entry 1 second against the class bit of the current character.
// The [2.x] tags below refer to the numbered conditions of the Bidi Rule in
// RFC 5893.
64 var transitions = [...][2]ruleTransition{
65 // [2.1] The first character must be a character with Bidi property L, R, or
66 // AL. If it has the R or AL property, it is an RTL label; if it has the L
67 // property, it is an LTR label.
69 {ruleLTRFinal, 1 << bidi.L},
70 {ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL},
73 // [2.3] In an RTL label, the end of the label must be a character with
74 // Bidi property R, AL, EN, or AN, followed by zero or more characters
75 // with Bidi property NSM.
76 {ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN},
78 // [2.2] In an RTL label, only characters with the Bidi properties R,
79 // AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
80 // We exclude the entries from [2.3].
81 {ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
84 // [2.3] In an RTL label, the end of the label must be a character with
85 // Bidi property R, AL, EN, or AN, followed by zero or more characters
86 // with Bidi property NSM.
// Here NSM is part of the mask that leads to ruleRTLFinal.
87 {ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN | 1<<bidi.NSM},
89 // [2.2] In an RTL label, only characters with the Bidi properties R,
90 // AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
91 // We exclude the entries from [2.3] and NSM.
92 {ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
95 // [2.6] In an LTR label, the end of the label must be a character with
96 // Bidi property L or EN, followed by zero or more characters with Bidi
// property NSM.
98 {ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN},
100 // [2.5] In an LTR label, only characters with the Bidi properties L,
101 // EN, ES, CS, ET, ON, BN, or NSM are allowed.
102 // We exclude the entries from [2.6].
103 {ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
106 // [2.6] In an LTR label, the end of the label must be a character with
107 // Bidi property L or EN, followed by zero or more characters with Bidi
// property NSM.
// Here NSM is part of the mask that leads to ruleLTRFinal.
109 {ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN | 1<<bidi.NSM},
111 // [2.5] In an LTR label, only characters with the Bidi properties L,
112 // EN, ES, CS, ET, ON, BN, or NSM are allowed.
113 // We exclude the entries from [2.6] and NSM.
114 {ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
122 // [2.4] In an RTL label, if an EN is present, no AN may be present, and
// vice versa.
//
// exclusiveRTL holds the EN and AN class bits. advance and advanceString set
// t.state to ruleInvalid when both bits are present in t.seen.
124 const exclusiveRTL = uint16(1<<bidi.EN | 1<<bidi.AN)
127 // An RTL label is a label that contains at least one character of type R, AL, or AN.
130 // An LTR label is any label that is not an RTL label.
132 // Direction reports the direction of the given label as defined by RFC 5893.
133 // The Bidi Rule does not have to be applied to labels of the category
// LeftToRight.
//
// b is scanned from the start; the result is bidi.RightToLeft as soon as a
// class R, AL, or AN is encountered and bidi.LeftToRight otherwise.
135 func Direction(b []byte) bidi.Direction {
// The loop has no post statement; i is advanced inside the body.
136 for i := 0; i < len(b); {
// Look up the Bidi properties of the character starting at b[i]; sz is its
// encoded size in bytes.
137 e, sz := bidi.Lookup(b[i:])
// NOTE(review): c is presumably the Bidi class taken from e — confirm.
142 if c == bidi.R || c == bidi.AL || c == bidi.AN {
143 return bidi.RightToLeft
147 return bidi.LeftToRight
150 // DirectionString reports the direction of the given label as defined by RFC
151 // 5893. The Bidi Rule does not have to be applied to labels of the category
// LeftToRight.
//
// It is the string counterpart of Direction: s is scanned from the start and
// the result is bidi.RightToLeft as soon as a class R, AL, or AN is
// encountered and bidi.LeftToRight otherwise.
153 func DirectionString(s string) bidi.Direction {
// The loop has no post statement; i is advanced inside the body.
154 for i := 0; i < len(s); {
// Look up the Bidi properties of the character starting at s[i]; sz is its
// encoded size in bytes.
155 e, sz := bidi.LookupString(s[i:])
// NOTE(review): c is presumably the Bidi class taken from e — confirm.
161 if c == bidi.R || c == bidi.AL || c == bidi.AN {
162 return bidi.RightToLeft
166 return bidi.LeftToRight
169 // Valid reports whether b conforms to the Bidi Rule.
170 func Valid(b []byte) bool {
// The check trips when advance reports a failure (!ok) or stops before it has
// consumed all of b (n < len(b)).
172 if n, ok := t.advance(b); !ok || n < len(b) {
178 // ValidString reports whether s conforms to the Bidi Rule.
179 func ValidString(s string) bool {
// The check trips when advanceString reports a failure (!ok) or stops before
// it has consumed all of s (n < len(s)).
181 if n, ok := t.advanceString(s); !ok || n < len(s) {
187 // New returns a Transformer that verifies that input adheres to the Bidi Rule.
// The result is a pointer to a freshly allocated, zero-valued Transformer.
188 func New() *Transformer {
189 return &Transformer{}
192 // Transformer implements transform.Transformer.
//
// Its methods read and update two fields: state, the current state of the
// rule state machine, and seen, a set of class bits that isRTL inspects and
// that is checked against exclusiveRTL.
193 type Transformer struct {
199 // A rule can only be violated for "Bidi Domain names", meaning if one of the
200 // following categories has been observed.
//
// isRTL reports whether any of the R, AL, or AN class bits is set in t.seen.
201 func (t *Transformer) isRTL() bool {
// The local constant shares its name with the method; it is the mask of the
// three classes that make a label an RTL label.
202 const isRTL = 1<<bidi.R | 1<<bidi.AL | 1<<bidi.AN
203 return t.seen&isRTL != 0
206 // Reset implements transform.Transformer.
// It overwrites *t with the zero Transformer, discarding everything recorded
// so far.
207 func (t *Transformer) Reset() { *t = Transformer{} }
209 // Transform implements transform.Transformer. This Transformer has state and
210 // needs to be reset between uses.
//
// The input is checked by delegating to Span.
211 func (t *Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
// dst cannot hold all of src.
212 if len(dst) < len(src) {
// NOTE(review): presumably still inside the short-destination branch — confirm.
215 err = transform.ErrShortDst
217 n, err1 := t.Span(src, atEOF)
// && binds tighter than ||, so this reads: no error has been recorded so far,
// or Span failed with something other than transform.ErrShortSrc.
219 if err == nil || err1 != nil && err1 != transform.ErrShortSrc {
225 // Span returns the first n bytes of src that conform to the Bidi rule.
226 func (t *Transformer) Span(src []byte, atEOF bool) (n int, err error) {
// The state machine is already in ruleInvalid and an R, AL, or AN character
// has been seen (see isRTL).
227 if t.state == ruleInvalid && t.isRTL() {
// Run the state machine over src.
230 n, ok := t.advance(src)
// NOTE(review): the condition guarding this assignment is not shown here;
// presumably it covers input that stopped short while atEOF is false — confirm.
236 err = transform.ErrShortSrc
246 // Precomputing the ASCII values decreases running time for the ASCII fast path.
//
// asciiTable holds the bidi.Properties of each of the 128 ASCII code points;
// advance and advanceString index it directly for bytes below utf8.RuneSelf.
248 var asciiTable [128]bidi.Properties
// Visit every table index and look up the properties of the rune with that
// value; the second result of LookupRune is discarded.
251 for i := range asciiTable {
252 p, _ := bidi.LookupRune(rune(i))
// advance runs the rule state machine over s, updating t.state. It returns
// the number of bytes n it processed and an ok flag; callers such as Valid
// treat !ok or n < len(s) as a failure.
257 func (t *Transformer) advance(s []byte) (n int, ok bool) {
258 var e bidi.Properties
// ASCII fast path: a byte below utf8.RuneSelf is looked up in asciiTable and
// has size 1.
261 if s[n] < utf8.RuneSelf {
262 e, sz = asciiTable[s[n]], 1
// Everything else goes through the general bidi lookup.
264 e, sz = bidi.Lookup(s[n:])
267 // We always consider invalid UTF-8 to be invalid, even if
268 // the string has not yet been determined to be RTL.
269 // TODO: is this correct?
272 return n, true // incomplete UTF-8 encoding
275 // TODO: using CompactClass would result in noticeable speedup.
276 // See unicode/bidi/prop.go:Properties.CompactClass.
// c has exactly one bit set, at the position given by the class of the
// current character, so it can be tested against the transition masks.
277 c := uint16(1 << e.Class())
// [2.4] Both EN and AN are present in t.seen: the label is invalid.
279 if t.seen&exclusiveRTL == exclusiveRTL {
280 t.state = ruleInvalid
// Fetch the two candidate transitions for the current state; the mask of
// tr[0] is tested before the mask of tr[1].
283 switch tr := transitions[t.state]; {
284 case tr[0].mask&c != 0:
286 case tr[1].mask&c != 0:
// NOTE(review): presumably the fall-through case where neither mask matched —
// confirm.
289 t.state = ruleInvalid
// advanceString is the string counterpart of advance: it runs the rule state
// machine over s, updating t.state, and returns the number of bytes n it
// processed and an ok flag. ValidString treats !ok or n < len(s) as a failure.
299 func (t *Transformer) advanceString(s string) (n int, ok bool) {
300 var e bidi.Properties
// ASCII fast path: a byte below utf8.RuneSelf is looked up in asciiTable and
// has size 1.
303 if s[n] < utf8.RuneSelf {
304 e, sz = asciiTable[s[n]], 1
// Everything else goes through the general bidi lookup.
306 e, sz = bidi.LookupString(s[n:])
// Invalid UTF-8 is reported as a failure (ok == false); an incomplete
// encoding is reported with ok == true.
309 return n, false // invalid UTF-8
311 return n, true // incomplete UTF-8 encoding
314 // TODO: using CompactClass would result in noticeable speedup.
315 // See unicode/bidi/prop.go:Properties.CompactClass.
// c has exactly one bit set, at the position given by the class of the
// current character, so it can be tested against the transition masks.
316 c := uint16(1 << e.Class())
// [2.4] Both EN and AN are present in t.seen: the label is invalid.
318 if t.seen&exclusiveRTL == exclusiveRTL {
319 t.state = ruleInvalid
// Fetch the two candidate transitions for the current state; the mask of
// tr[0] is tested before the mask of tr[1].
322 switch tr := transitions[t.state]; {
323 case tr[0].mask&c != 0:
325 case tr[1].mask&c != 0:
// NOTE(review): presumably the fall-through case where neither mask matched —
// confirm.
328 t.state = ruleInvalid