// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package language

import (
	"bytes"
	"errors"
	"fmt"
	"sort"

	"golang.org/x/text/internal/tag"
)

// isAlpha returns true if the byte is not a digit.
// b must be an ASCII letter or digit.
func isAlpha(b byte) bool {
	return b > '9'
}

// isAlphaNum returns true if the string contains only ASCII letters or digits.
func isAlphaNum(s []byte) bool {
	for _, c := range s {
		if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') {
			return false
		}
	}
	return true
}

// ErrSyntax is returned by any of the parsing functions when the
// input is not well-formed, according to BCP 47.
// TODO: return the position at which the syntax error occurred?
var ErrSyntax = errors.New("language: tag is not well-formed")

// ErrDuplicateKey is returned when a tag contains the same key twice with
// different values in the -u section.
var ErrDuplicateKey = errors.New("language: different values for same key in -u extension")

// ValueError is returned by any of the parsing functions when the
// input is well-formed but the respective subtag is not recognized
// as a valid value.
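//
// Illustrative sketch (not part of the original file): a caller can separate
// well-formed-but-unknown subtags from syntax errors with errors.As, where
// input is any tag string:
//
//	t, err := Parse(input)
//	var verr ValueError
//	if errors.As(err, &verr) {
//		// The tag was well-formed; verr.Subtag() names the unknown subtag
//		// and t has that subtag stripped.
//	}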
type ValueError struct {
	v [8]byte
}

// NewValueError creates a new ValueError.
func NewValueError(tag []byte) ValueError {
	var e ValueError
	copy(e.v[:], tag)
	return e
}

func (e ValueError) tag() []byte {
	n := bytes.IndexByte(e.v[:], 0)
	if n == -1 {
		n = 8
	}
	return e.v[:n]
}

// Error implements the error interface.
func (e ValueError) Error() string {
	return fmt.Sprintf("language: subtag %q is well-formed but unknown", e.tag())
}

// Subtag returns the subtag for which the error occurred.
func (e ValueError) Subtag() string {
	return string(e.tag())
}

// scanner is used to scan BCP 47 tokens, which are separated by _ or -.
type scanner struct {
	b     []byte
	bytes [max99thPercentileSize]byte
	token []byte
	start int // start position of the current token
	end   int // end position of the current token
	next  int // next point for scan
	err   error
	done  bool
}

func makeScannerString(s string) scanner {
	scan := scanner{}
	if len(s) <= len(scan.bytes) {
		scan.b = scan.bytes[:copy(scan.bytes[:], s)]
	} else {
		scan.b = []byte(s)
	}
	scan.init()
	return scan
}

// makeScanner returns a scanner using b as the input buffer.
// b is not copied and may be modified by the scanner routines.
func makeScanner(b []byte) scanner {
	scan := scanner{b: b}
	scan.init()
	return scan
}

func (s *scanner) init() {
	for i, c := range s.b {
		if c == '_' {
			s.b[i] = '-'
		}
	}
	s.scan()
}

// toLower converts the string between start and end to lower case.
func (s *scanner) toLower(start, end int) {
	for i := start; i < end; i++ {
		c := s.b[i]
		if 'A' <= c && c <= 'Z' {
			s.b[i] += 'a' - 'A'
		}
	}
}

func (s *scanner) setError(e error) {
	if s.err == nil || (e == ErrSyntax && s.err != ErrSyntax) {
		s.err = e
	}
}

// resizeRange shrinks or grows the array at position oldStart such that
// a new string of size newSize can fit between oldStart and oldEnd.
// Sets the scan point to after the resized range.
func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) {
	s.start = oldStart
	if end := oldStart + newSize; end != oldEnd {
		diff := end - oldEnd
		var b []byte
		if n := len(s.b) + diff; n > cap(s.b) {
			b = make([]byte, n)
			copy(b, s.b[:oldStart])
		} else {
			b = s.b[:n]
		}
		copy(b[end:], s.b[oldEnd:])
		s.b = b
		s.next = end + (s.next - s.end)
		s.end = end
	}
}

// replace replaces the current token with repl.
func (s *scanner) replace(repl string) {
	s.resizeRange(s.start, s.end, len(repl))
	copy(s.b[s.start:], repl)
}

// gobble removes the current token from the input.
// Caller must call scan after calling gobble.
func (s *scanner) gobble(e error) {
	s.setError(e)
	if s.start == 0 {
		s.b = s.b[:+copy(s.b, s.b[s.next:])]
		s.end = 0
	} else {
		s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])]
		s.end = s.start - 1
	}
	s.next = s.start
}

// deleteRange removes the given range from s.b before the current token.
func (s *scanner) deleteRange(start, end int) {
	s.b = s.b[:start+copy(s.b[start:], s.b[end:])]
	diff := end - start
	s.next -= diff
	s.start -= diff
	s.end -= diff
}

// scan parses the next token of a BCP 47 string. Tokens that are larger
// than 8 characters or include non-alphanumeric characters result in an error
// and are gobbled and removed from the output.
// It returns the end position of the last token consumed.
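//
// Illustrative sketch (not part of the original file): a caller typically
// drives the scanner token by token; "en-Latn-US" is just a sample input:
//
//	scan := makeScannerString("en-Latn-US")
//	for len(scan.token) > 0 {
//		// scan.token holds the current subtag: "en", then "Latn", then "US".
//		scan.scan()
//	}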
func (s *scanner) scan() (end int) {
	end = s.end
	s.token = nil
	for s.start = s.next; s.next < len(s.b); {
		i := bytes.IndexByte(s.b[s.next:], '-')
		if i == -1 {
			s.end = len(s.b)
			s.next = len(s.b)
			i = s.end - s.start
		} else {
			s.end = s.next + i
			s.next = s.end + 1
		}
		token := s.b[s.start:s.end]
		if i < 1 || i > 8 || !isAlphaNum(token) {
			s.gobble(ErrSyntax)
			continue
		}
		s.token = token
		return end
	}
	if n := len(s.b); n > 0 && s.b[n-1] == '-' {
		s.setError(ErrSyntax)
		s.b = s.b[:len(s.b)-1]
	}
	s.done = true
	return end
}

// acceptMinSize parses multiple tokens of the given size or greater.
// It returns the end position of the last token consumed.
func (s *scanner) acceptMinSize(min int) (end int) {
	end = s.end
	s.scan()
	for ; len(s.token) >= min; s.scan() {
		end = s.end
	}
	return end
}

// Parse parses the given BCP 47 string and returns a valid Tag. If parsing
// failed it returns an error and any part of the tag that could be parsed.
// If parsing succeeded but an unknown value was found, it returns
// ValueError. The Tag returned in this case is just stripped of the unknown
// value. All other values are preserved. It accepts tags in the BCP 47 format
// and extensions to this standard defined in
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
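//
// Illustrative sketch (not part of the original file); the exact canonical
// form of the result depends on the package's generated tables:
//
//	t, err := Parse("en_us")
//	if err != nil {
//		// err is ErrSyntax for malformed input, or a ValueError for a
//		// well-formed but unknown subtag.
//	}
//	_ = t // typically the tag en-US: '_' is mapped to '-' and case is normalized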
func Parse(s string) (t Tag, err error) {
	// TODO: consider supporting old-style locale key-value pairs.
	if s == "" {
		return Und, ErrSyntax
	}
	defer func() {
		if recover() != nil {
			t = Und
			err = ErrSyntax
			return
		}
	}()
	if len(s) <= maxAltTaglen {
		b := [maxAltTaglen]byte{}
		for i, c := range s {
			// Generating invalid UTF-8 is okay as it won't match.
			if 'A' <= c && c <= 'Z' {
				c += 'a' - 'A'
			} else if c == '_' {
				c = '-'
			}
			b[i] = byte(c)
		}
		if t, ok := grandfathered(b); ok {
			return t, nil
		}
	}
	scan := makeScannerString(s)
	return parse(&scan, s)
}

func parse(scan *scanner, s string) (t Tag, err error) {
	t = Und
	var end int
	if n := len(scan.token); n <= 1 {
		scan.toLower(0, len(scan.b))
		if n == 0 || scan.token[0] != 'x' {
			return t, ErrSyntax
		}
		end = parseExtensions(scan)
	} else if n >= 4 {
		return Und, ErrSyntax
	} else { // the usual case
		t, end = parseTag(scan, true)
		if n := len(scan.token); n == 1 {
			t.pExt = uint16(end)
			end = parseExtensions(scan)
		} else if end < len(scan.b) {
			scan.setError(ErrSyntax)
			scan.b = scan.b[:end]
		}
	}
	if int(t.pVariant) < len(scan.b) {
		if end < len(s) {
			s = s[:end]
		}
		if len(s) > 0 && tag.Compare(s, scan.b) == 0 {
			t.str = s
		} else {
			t.str = string(scan.b)
		}
	} else {
		t.pVariant, t.pExt = 0, 0
	}
	return t, scan.err
}

// parseTag parses language, script, region and variants.
// It returns a Tag and the end position in the input that was parsed.
// If doNorm is true, then <lang>-<extlang> will be normalized to <extlang>.
func parseTag(scan *scanner, doNorm bool) (t Tag, end int) {
	var e error
	// TODO: set an error if an unknown lang, script or region is encountered.
	t.LangID, e = getLangID(scan.token)
	scan.setError(e)
	scan.replace(t.LangID.String())
	langStart := scan.start
	end = scan.scan()
	for len(scan.token) == 3 && isAlpha(scan.token[0]) {
		// From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent
		// to a tag of the form <extlang>.
		if doNorm {
			lang, e := getLangID(scan.token)
			if lang != 0 {
				t.LangID = lang
				langStr := lang.String()
				copy(scan.b[langStart:], langStr)
				scan.b[langStart+len(langStr)] = '-'
				scan.start = langStart + len(langStr) + 1
			}
			scan.gobble(e)
		}
		end = scan.scan()
	}
	if len(scan.token) == 4 && isAlpha(scan.token[0]) {
		t.ScriptID, e = getScriptID(script, scan.token)
		if t.ScriptID == 0 {
			scan.gobble(e)
		}
		end = scan.scan()
	}
	if n := len(scan.token); n >= 2 && n <= 3 {
		t.RegionID, e = getRegionID(scan.token)
		if t.RegionID == 0 {
			scan.gobble(e)
		} else {
			scan.replace(t.RegionID.String())
		}
		end = scan.scan()
	}
	scan.toLower(scan.start, len(scan.b))
	t.pVariant = byte(end)
	end = parseVariants(scan, end, t)
	t.pExt = uint16(end)
	return t, end
}

var separator = []byte{'-'}

// parseVariants scans tokens as long as each token is a valid variant string.
// Duplicate variants are removed.
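//
// Illustrative sketch (not part of the original file): for an input such as
// "sl-rozaj-rozaj" the repeated "rozaj" is expected to be collapsed to a
// single variant, while a subtag that is not in variantIndex is gobbled with
// a ValueError.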
func parseVariants(scan *scanner, end int, t Tag) int {
	start := scan.start
	varIDBuf := [4]uint8{}
	variantBuf := [4][]byte{}
	varID := varIDBuf[:0]
	variant := variantBuf[:0]
	last := -1
	needSort := false
	for ; len(scan.token) >= 4; scan.scan() {
		// TODO: measure the impact of needing this conversion and redesign
		// the data structure if there is an issue.
		v, ok := variantIndex[string(scan.token)]
		if !ok {
			// unknown variant
			// TODO: allow user-defined variants?
			scan.gobble(NewValueError(scan.token))
			continue
		}
		varID = append(varID, v)
		variant = append(variant, scan.token)
		if !needSort {
			if last < int(v) {
				last = int(v)
			} else {
				needSort = true
				// There are no legal combinations of more than 7 variants
				// (and this is by no means a useful sequence).
				const maxVariants = 8
				if len(varID) > maxVariants {
					break
				}
			}
		}
		end = scan.end
	}
	if needSort {
		sort.Sort(variantsSort{varID, variant})
		k, l := 0, -1
		for i, v := range varID {
			w := int(v)
			if l == w {
				// Remove duplicates.
				continue
			}
			varID[k] = varID[i]
			variant[k] = variant[i]
			k++
			l = w
		}
		if str := bytes.Join(variant[:k], separator); len(str) == 0 {
			end = start - 1
		} else {
			scan.resizeRange(start, end, len(str))
			copy(scan.b[scan.start:], str)
			end = scan.end
		}
	}
	return end
}

type variantsSort struct {
	i []uint8
	v [][]byte
}

func (s variantsSort) Len() int {
	return len(s.i)
}

func (s variantsSort) Swap(i, j int) {
	s.i[i], s.i[j] = s.i[j], s.i[i]
	s.v[i], s.v[j] = s.v[j], s.v[i]
}

func (s variantsSort) Less(i, j int) bool {
	return s.i[i] < s.i[j]
}

type bytesSort struct {
	b [][]byte
	n int // first n bytes to compare
}

func (b bytesSort) Len() int {
	return len(b.b)
}

func (b bytesSort) Swap(i, j int) {
	b.b[i], b.b[j] = b.b[j], b.b[i]
}

func (b bytesSort) Less(i, j int) bool {
	for k := 0; k < b.n; k++ {
		if b.b[i][k] == b.b[j][k] {
			continue
		}
		return b.b[i][k] < b.b[j][k]
	}
	return false
}

// parseExtensions parses and normalizes the extensions in the buffer.
// It returns the last position of scan.b that is part of any extension.
// It also trims scan.b to remove excess parts accordingly.
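//
// Illustrative sketch (not part of the original file): extensions are sorted
// by their singleton and private use is kept last, so a tag such as
// "en-b-ccc-a-bbb-x-prv" is expected to come out with its extensions
// reordered to "a-bbb-b-ccc-x-prv".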
func parseExtensions(scan *scanner) int {
	start := scan.start
	exts := [][]byte{}
	private := []byte{}
	end := scan.end
	for len(scan.token) == 1 {
		extStart := scan.start
		ext := scan.token[0]
		end = parseExtension(scan)
		extension := scan.b[extStart:end]
		if len(extension) < 3 || (ext != 'x' && len(extension) < 4) {
			scan.setError(ErrSyntax)
			end = extStart
			continue
		} else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) {
			scan.b = scan.b[:end]
			return end
		} else if ext == 'x' {
			private = extension
			break
		}
		exts = append(exts, extension)
	}
	sort.Sort(bytesSort{exts, 1})
	if len(private) > 0 {
		exts = append(exts, private)
	}
	scan.b = scan.b[:start]
	if len(exts) > 0 {
		scan.b = append(scan.b, bytes.Join(exts, separator)...)
	} else if start > 0 {
		// Strip trailing '-'.
		scan.b = scan.b[:start-1]
	}
	return end
}

// parseExtension parses a single extension and returns the position of
// the extension end.
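//
// Illustrative sketch (not part of the original file): within a -u extension
// key-type pairs are kept in sorted key order, so an input such as
// "en-u-nu-latn-ca-gregory" is expected to have its -u keys reordered to
// "ca-gregory-nu-latn".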
func parseExtension(scan *scanner) int {
	start, end := scan.start, scan.end
	switch scan.token[0] {
	case 'u': // https://www.ietf.org/rfc/rfc6067.txt
		attrStart := end
		scan.scan()
		for last := []byte{}; len(scan.token) > 2; scan.scan() {
			if bytes.Compare(scan.token, last) != -1 {
				// Attributes are unsorted. Start over from scratch.
				p := attrStart + 1
				scan.next = p
				attrs := [][]byte{}
				for scan.scan(); len(scan.token) > 2; scan.scan() {
					attrs = append(attrs, scan.token)
					end = scan.end
				}
				sort.Sort(bytesSort{attrs, 3})
				copy(scan.b[p:], bytes.Join(attrs, separator))
				break
			}
			last = scan.token
			end = scan.end
		}
		// Scan key-type sequences. A key is of length 2 and may be followed
		// by 0 or more "type" subtags from 3 to the maximum of 8 letters.
		var last, key []byte
		for attrEnd := end; len(scan.token) == 2; last = key {
			key = scan.token
			end = scan.end
			for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {
				end = scan.end
			}
			// TODO: check key value validity
			if bytes.Compare(key, last) != 1 || scan.err != nil {
				// We have an invalid key or the keys are not sorted.
				// Start scanning keys from scratch and reorder.
				p := attrEnd + 1
				scan.next = p
				keys := [][]byte{}
				for scan.scan(); len(scan.token) == 2; {
					keyStart := scan.start
					end = scan.end
					for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {
						end = scan.end
					}
					keys = append(keys, scan.b[keyStart:end])
				}
				sort.Stable(bytesSort{keys, 2})
				if n := len(keys); n > 0 {
					k := 0
					for i := 1; i < n; i++ {
						if !bytes.Equal(keys[k][:2], keys[i][:2]) {
							k++
							keys[k] = keys[i]
						} else if !bytes.Equal(keys[k], keys[i]) {
							scan.setError(ErrDuplicateKey)
						}
					}
					keys = keys[:k+1]
				}
				reordered := bytes.Join(keys, separator)
				if e := p + len(reordered); e < end {
					scan.deleteRange(e, end)
					end = e
				}
				copy(scan.b[p:], reordered)
				break
			}
		}
	case 't': // https://www.ietf.org/rfc/rfc6497.txt
		scan.scan()
		if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
			_, end = parseTag(scan, false)
			scan.toLower(start, end)
		}
		for len(scan.token) == 2 && !isAlpha(scan.token[1]) {
			end = scan.acceptMinSize(3)
		}
	case 'x':
		end = scan.acceptMinSize(1)
	default:
		end = scan.acceptMinSize(2)
	}
	return end
}

// getExtension returns the name, body and end position of the extension.
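//
// For example (illustrative, not part of the original file), with the sample
// input below the call returns the end of the extension and its text,
// including the singleton:
//
//	end, ext := getExtension("en-u-co-phonebk", 2)
//	// end == len("en-u-co-phonebk"), ext == "u-co-phonebk"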
func getExtension(s string, p int) (end int, ext string) {
	if s[p] == '-' {
		p++
	}
	if s[p] == 'x' {
		return len(s), s[p:]
	}
	end = nextExtension(s, p)
	return end, s[p:end]
}

// nextExtension finds the next extension within the string, searching
// for the -<char>- pattern from position p.
// In the vast majority of cases, language tags will have at most
// one extension and extensions tend to be small.
func nextExtension(s string, p int) int {
	for n := len(s) - 3; p < n; {
		if s[p] == '-' {
			if s[p+2] == '-' {
				return p
			}
			p += 3
		} else {
			p++
		}
	}
	return len(s)
}