utf8.go 2.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
  1. /*
  2. * Copyright 2022 ByteDance Inc.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. package utf8
  17. import (
  18. `runtime`
  19. `github.com/bytedance/sonic/internal/rt`
  20. `github.com/bytedance/sonic/internal/native/types`
  21. `github.com/bytedance/sonic/internal/native`
  22. )
  23. // CorrectWith corrects the invalid utf8 byte with repl string.
  24. func CorrectWith(dst []byte, src []byte, repl string) []byte {
  25. sstr := rt.Mem2Str(src)
  26. sidx := 0
  27. /* state machine records the invalid positions */
  28. m := types.NewStateMachine()
  29. m.Sp = 0 // invalid utf8 numbers
  30. for sidx < len(sstr) {
  31. scur := sidx
  32. ecode := native.ValidateUTF8(&sstr, &sidx, m)
  33. if m.Sp != 0 {
  34. if m.Sp > len(sstr) {
  35. panic("numbers of invalid utf8 exceed the string len!")
  36. }
  37. }
  38. for i := 0; i < m.Sp; i++ {
  39. ipos := m.Vt[i] // invalid utf8 position
  40. dst = append(dst, sstr[scur:ipos]...)
  41. dst = append(dst, repl...)
  42. scur = m.Vt[i] + 1
  43. }
  44. /* append the remained valid utf8 bytes */
  45. dst = append(dst, sstr[scur:sidx]...)
  46. /* not enough space, reset and continue */
  47. if ecode != 0 {
  48. m.Sp = 0
  49. }
  50. }
  51. types.FreeStateMachine(m)
  52. return dst
  53. }
  54. // Validate is a simd-accelereated drop-in replacement for the standard library's utf8.Valid.
  55. func Validate(src []byte) bool {
  56. if src == nil {
  57. return true
  58. }
  59. return ValidateString(rt.Mem2Str(src))
  60. }
  61. // ValidateString as Validate, but for string.
  62. func ValidateString(src string) bool {
  63. if src == "" {
  64. return true
  65. }
  66. ret := native.ValidateUTF8Fast(&src) == 0
  67. runtime.KeepAlive(src)
  68. return ret
  69. }