string.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483
  1. // This file's string processing code is inspired by https://github.com/segmentio/encoding.
  2. // The license notation is as follows.
  3. //
  4. // # MIT License
  5. //
  6. // Copyright (c) 2019 Segment.io, Inc.
  7. //
  8. // Permission is hereby granted, free of charge, to any person obtaining a copy
  9. // of this software and associated documentation files (the "Software"), to deal
  10. // in the Software without restriction, including without limitation the rights
  11. // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  12. // copies of the Software, and to permit persons to whom the Software is
  13. // furnished to do so, subject to the following conditions:
  14. //
  15. // The above copyright notice and this permission notice shall be included in all
  16. // copies or substantial portions of the Software.
  17. //
  18. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  23. // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  24. // SOFTWARE.
  25. package encoder
  26. import (
  27. "math/bits"
  28. "reflect"
  29. "unsafe"
  30. )
const (
	// lsb has the lowest bit of each of its eight bytes set. Multiplying it
	// by a byte value replicates that value into every byte of a 64-bit word.
	lsb = 0x0101010101010101
	// msb has the highest bit of each of its eight bytes set. It is used to
	// pick the per-byte flag bits out of a combined mask.
	msb = 0x8080808080808080
)

// hex maps a 4-bit value to its lowercase hexadecimal digit.
var hex = "0123456789abcdef"
  36. //nolint:govet
  37. func stringToUint64Slice(s string) []uint64 {
  38. return *(*[]uint64)(unsafe.Pointer(&reflect.SliceHeader{
  39. Data: ((*reflect.StringHeader)(unsafe.Pointer(&s))).Data,
  40. Len: len(s) / 8,
  41. Cap: len(s) / 8,
  42. }))
  43. }
  44. func AppendString(ctx *RuntimeContext, buf []byte, s string) []byte {
  45. if ctx.Option.Flag&HTMLEscapeOption != 0 {
  46. if ctx.Option.Flag&NormalizeUTF8Option != 0 {
  47. return appendNormalizedHTMLString(buf, s)
  48. }
  49. return appendHTMLString(buf, s)
  50. }
  51. if ctx.Option.Flag&NormalizeUTF8Option != 0 {
  52. return appendNormalizedString(buf, s)
  53. }
  54. return appendString(buf, s)
  55. }
  56. func appendNormalizedHTMLString(buf []byte, s string) []byte {
  57. valLen := len(s)
  58. if valLen == 0 {
  59. return append(buf, `""`...)
  60. }
  61. buf = append(buf, '"')
  62. var (
  63. i, j int
  64. )
  65. if valLen >= 8 {
  66. chunks := stringToUint64Slice(s)
  67. for _, n := range chunks {
  68. // combine masks before checking for the MSB of each byte. We include
  69. // `n` in the mask to check whether any of the *input* byte MSBs were
  70. // set (i.e. the byte was outside the ASCII range).
  71. mask := n | (n - (lsb * 0x20)) |
  72. ((n ^ (lsb * '"')) - lsb) |
  73. ((n ^ (lsb * '\\')) - lsb) |
  74. ((n ^ (lsb * '<')) - lsb) |
  75. ((n ^ (lsb * '>')) - lsb) |
  76. ((n ^ (lsb * '&')) - lsb)
  77. if (mask & msb) != 0 {
  78. j = bits.TrailingZeros64(mask&msb) / 8
  79. goto ESCAPE_END
  80. }
  81. }
  82. for i := len(chunks) * 8; i < valLen; i++ {
  83. if needEscapeHTMLNormalizeUTF8[s[i]] {
  84. j = i
  85. goto ESCAPE_END
  86. }
  87. }
  88. // no found any escape characters.
  89. return append(append(buf, s...), '"')
  90. }
  91. ESCAPE_END:
  92. for j < valLen {
  93. c := s[j]
  94. if !needEscapeHTMLNormalizeUTF8[c] {
  95. // fast path: most of the time, printable ascii characters are used
  96. j++
  97. continue
  98. }
  99. switch c {
  100. case '\\', '"':
  101. buf = append(buf, s[i:j]...)
  102. buf = append(buf, '\\', c)
  103. i = j + 1
  104. j = j + 1
  105. continue
  106. case '\n':
  107. buf = append(buf, s[i:j]...)
  108. buf = append(buf, '\\', 'n')
  109. i = j + 1
  110. j = j + 1
  111. continue
  112. case '\r':
  113. buf = append(buf, s[i:j]...)
  114. buf = append(buf, '\\', 'r')
  115. i = j + 1
  116. j = j + 1
  117. continue
  118. case '\t':
  119. buf = append(buf, s[i:j]...)
  120. buf = append(buf, '\\', 't')
  121. i = j + 1
  122. j = j + 1
  123. continue
  124. case '<', '>', '&':
  125. buf = append(buf, s[i:j]...)
  126. buf = append(buf, `\u00`...)
  127. buf = append(buf, hex[c>>4], hex[c&0xF])
  128. i = j + 1
  129. j = j + 1
  130. continue
  131. case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F
  132. 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F
  133. buf = append(buf, s[i:j]...)
  134. buf = append(buf, `\u00`...)
  135. buf = append(buf, hex[c>>4], hex[c&0xF])
  136. i = j + 1
  137. j = j + 1
  138. continue
  139. }
  140. state, size := decodeRuneInString(s[j:])
  141. switch state {
  142. case runeErrorState:
  143. buf = append(buf, s[i:j]...)
  144. buf = append(buf, `\ufffd`...)
  145. i = j + 1
  146. j = j + 1
  147. continue
  148. // U+2028 is LINE SEPARATOR.
  149. // U+2029 is PARAGRAPH SEPARATOR.
  150. // They are both technically valid characters in JSON strings,
  151. // but don't work in JSONP, which has to be evaluated as JavaScript,
  152. // and can lead to security holes there. It is valid JSON to
  153. // escape them, so we do so unconditionally.
  154. // See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion.
  155. case lineSepState:
  156. buf = append(buf, s[i:j]...)
  157. buf = append(buf, `\u2028`...)
  158. i = j + 3
  159. j = j + 3
  160. continue
  161. case paragraphSepState:
  162. buf = append(buf, s[i:j]...)
  163. buf = append(buf, `\u2029`...)
  164. i = j + 3
  165. j = j + 3
  166. continue
  167. }
  168. j += size
  169. }
  170. return append(append(buf, s[i:]...), '"')
  171. }
  172. func appendHTMLString(buf []byte, s string) []byte {
  173. valLen := len(s)
  174. if valLen == 0 {
  175. return append(buf, `""`...)
  176. }
  177. buf = append(buf, '"')
  178. var (
  179. i, j int
  180. )
  181. if valLen >= 8 {
  182. chunks := stringToUint64Slice(s)
  183. for _, n := range chunks {
  184. // combine masks before checking for the MSB of each byte. We include
  185. // `n` in the mask to check whether any of the *input* byte MSBs were
  186. // set (i.e. the byte was outside the ASCII range).
  187. mask := n | (n - (lsb * 0x20)) |
  188. ((n ^ (lsb * '"')) - lsb) |
  189. ((n ^ (lsb * '\\')) - lsb) |
  190. ((n ^ (lsb * '<')) - lsb) |
  191. ((n ^ (lsb * '>')) - lsb) |
  192. ((n ^ (lsb * '&')) - lsb)
  193. if (mask & msb) != 0 {
  194. j = bits.TrailingZeros64(mask&msb) / 8
  195. goto ESCAPE_END
  196. }
  197. }
  198. for i := len(chunks) * 8; i < valLen; i++ {
  199. if needEscapeHTML[s[i]] {
  200. j = i
  201. goto ESCAPE_END
  202. }
  203. }
  204. // no found any escape characters.
  205. return append(append(buf, s...), '"')
  206. }
  207. ESCAPE_END:
  208. for j < valLen {
  209. c := s[j]
  210. if !needEscapeHTML[c] {
  211. // fast path: most of the time, printable ascii characters are used
  212. j++
  213. continue
  214. }
  215. switch c {
  216. case '\\', '"':
  217. buf = append(buf, s[i:j]...)
  218. buf = append(buf, '\\', c)
  219. i = j + 1
  220. j = j + 1
  221. continue
  222. case '\n':
  223. buf = append(buf, s[i:j]...)
  224. buf = append(buf, '\\', 'n')
  225. i = j + 1
  226. j = j + 1
  227. continue
  228. case '\r':
  229. buf = append(buf, s[i:j]...)
  230. buf = append(buf, '\\', 'r')
  231. i = j + 1
  232. j = j + 1
  233. continue
  234. case '\t':
  235. buf = append(buf, s[i:j]...)
  236. buf = append(buf, '\\', 't')
  237. i = j + 1
  238. j = j + 1
  239. continue
  240. case '<', '>', '&':
  241. buf = append(buf, s[i:j]...)
  242. buf = append(buf, `\u00`...)
  243. buf = append(buf, hex[c>>4], hex[c&0xF])
  244. i = j + 1
  245. j = j + 1
  246. continue
  247. case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F
  248. 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F
  249. buf = append(buf, s[i:j]...)
  250. buf = append(buf, `\u00`...)
  251. buf = append(buf, hex[c>>4], hex[c&0xF])
  252. i = j + 1
  253. j = j + 1
  254. continue
  255. }
  256. j++
  257. }
  258. return append(append(buf, s[i:]...), '"')
  259. }
  260. func appendNormalizedString(buf []byte, s string) []byte {
  261. valLen := len(s)
  262. if valLen == 0 {
  263. return append(buf, `""`...)
  264. }
  265. buf = append(buf, '"')
  266. var (
  267. i, j int
  268. )
  269. if valLen >= 8 {
  270. chunks := stringToUint64Slice(s)
  271. for _, n := range chunks {
  272. // combine masks before checking for the MSB of each byte. We include
  273. // `n` in the mask to check whether any of the *input* byte MSBs were
  274. // set (i.e. the byte was outside the ASCII range).
  275. mask := n | (n - (lsb * 0x20)) |
  276. ((n ^ (lsb * '"')) - lsb) |
  277. ((n ^ (lsb * '\\')) - lsb)
  278. if (mask & msb) != 0 {
  279. j = bits.TrailingZeros64(mask&msb) / 8
  280. goto ESCAPE_END
  281. }
  282. }
  283. valLen := len(s)
  284. for i := len(chunks) * 8; i < valLen; i++ {
  285. if needEscapeNormalizeUTF8[s[i]] {
  286. j = i
  287. goto ESCAPE_END
  288. }
  289. }
  290. return append(append(buf, s...), '"')
  291. }
  292. ESCAPE_END:
  293. for j < valLen {
  294. c := s[j]
  295. if !needEscapeNormalizeUTF8[c] {
  296. // fast path: most of the time, printable ascii characters are used
  297. j++
  298. continue
  299. }
  300. switch c {
  301. case '\\', '"':
  302. buf = append(buf, s[i:j]...)
  303. buf = append(buf, '\\', c)
  304. i = j + 1
  305. j = j + 1
  306. continue
  307. case '\n':
  308. buf = append(buf, s[i:j]...)
  309. buf = append(buf, '\\', 'n')
  310. i = j + 1
  311. j = j + 1
  312. continue
  313. case '\r':
  314. buf = append(buf, s[i:j]...)
  315. buf = append(buf, '\\', 'r')
  316. i = j + 1
  317. j = j + 1
  318. continue
  319. case '\t':
  320. buf = append(buf, s[i:j]...)
  321. buf = append(buf, '\\', 't')
  322. i = j + 1
  323. j = j + 1
  324. continue
  325. case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F
  326. 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F
  327. buf = append(buf, s[i:j]...)
  328. buf = append(buf, `\u00`...)
  329. buf = append(buf, hex[c>>4], hex[c&0xF])
  330. i = j + 1
  331. j = j + 1
  332. continue
  333. }
  334. state, size := decodeRuneInString(s[j:])
  335. switch state {
  336. case runeErrorState:
  337. buf = append(buf, s[i:j]...)
  338. buf = append(buf, `\ufffd`...)
  339. i = j + 1
  340. j = j + 1
  341. continue
  342. // U+2028 is LINE SEPARATOR.
  343. // U+2029 is PARAGRAPH SEPARATOR.
  344. // They are both technically valid characters in JSON strings,
  345. // but don't work in JSONP, which has to be evaluated as JavaScript,
  346. // and can lead to security holes there. It is valid JSON to
  347. // escape them, so we do so unconditionally.
  348. // See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion.
  349. case lineSepState:
  350. buf = append(buf, s[i:j]...)
  351. buf = append(buf, `\u2028`...)
  352. i = j + 3
  353. j = j + 3
  354. continue
  355. case paragraphSepState:
  356. buf = append(buf, s[i:j]...)
  357. buf = append(buf, `\u2029`...)
  358. i = j + 3
  359. j = j + 3
  360. continue
  361. }
  362. j += size
  363. }
  364. return append(append(buf, s[i:]...), '"')
  365. }
  366. func appendString(buf []byte, s string) []byte {
  367. valLen := len(s)
  368. if valLen == 0 {
  369. return append(buf, `""`...)
  370. }
  371. buf = append(buf, '"')
  372. var (
  373. i, j int
  374. )
  375. if valLen >= 8 {
  376. chunks := stringToUint64Slice(s)
  377. for _, n := range chunks {
  378. // combine masks before checking for the MSB of each byte. We include
  379. // `n` in the mask to check whether any of the *input* byte MSBs were
  380. // set (i.e. the byte was outside the ASCII range).
  381. mask := n | (n - (lsb * 0x20)) |
  382. ((n ^ (lsb * '"')) - lsb) |
  383. ((n ^ (lsb * '\\')) - lsb)
  384. if (mask & msb) != 0 {
  385. j = bits.TrailingZeros64(mask&msb) / 8
  386. goto ESCAPE_END
  387. }
  388. }
  389. valLen := len(s)
  390. for i := len(chunks) * 8; i < valLen; i++ {
  391. if needEscape[s[i]] {
  392. j = i
  393. goto ESCAPE_END
  394. }
  395. }
  396. return append(append(buf, s...), '"')
  397. }
  398. ESCAPE_END:
  399. for j < valLen {
  400. c := s[j]
  401. if !needEscape[c] {
  402. // fast path: most of the time, printable ascii characters are used
  403. j++
  404. continue
  405. }
  406. switch c {
  407. case '\\', '"':
  408. buf = append(buf, s[i:j]...)
  409. buf = append(buf, '\\', c)
  410. i = j + 1
  411. j = j + 1
  412. continue
  413. case '\n':
  414. buf = append(buf, s[i:j]...)
  415. buf = append(buf, '\\', 'n')
  416. i = j + 1
  417. j = j + 1
  418. continue
  419. case '\r':
  420. buf = append(buf, s[i:j]...)
  421. buf = append(buf, '\\', 'r')
  422. i = j + 1
  423. j = j + 1
  424. continue
  425. case '\t':
  426. buf = append(buf, s[i:j]...)
  427. buf = append(buf, '\\', 't')
  428. i = j + 1
  429. j = j + 1
  430. continue
  431. case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F
  432. 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F
  433. buf = append(buf, s[i:j]...)
  434. buf = append(buf, `\u00`...)
  435. buf = append(buf, hex[c>>4], hex[c&0xF])
  436. i = j + 1
  437. j = j + 1
  438. continue
  439. }
  440. j++
  441. }
  442. return append(append(buf, s[i:]...), '"')
  443. }