Skip to content

Commit babf881

Browse files
committed
Fix UTF-8 text diagnostic encoding.
1 parent 13fc53c commit babf881

File tree

2 files changed

+134
-18
lines changed

2 files changed

+134
-18
lines changed

diagnose.go

Lines changed: 48 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ func Diag(data []byte, opts *DiagOptions) ([]byte, error) {
6666
return di.diag()
6767
}
6868

69+
// diagnoseDecMode uses the loosest decode options, for diagnostic purposes.
6970
var diagnoseDecMode, _ = DecOptions{
7071
MaxNestedLevels: 256,
7172
UTF8: UTF8DecodeInvalid,
@@ -446,43 +447,72 @@ func (di *diagnose) encodeByteString(val []byte) error {
446447
// utf16SurrSelf is the first code point that encodeTextString splits into a
// UTF-16 surrogate pair; runes below it are passed to writeU16 as one unit.
var utf16SurrSelf = rune(0x10000)
447448

448449
// quote should be either `'` or `"`
449-
func (di *diagnose) encodeTextString(val string, quote rune) error {
450-
if err := di.writeByte(byte(quote)); err != nil {
450+
func (di *diagnose) encodeTextString(val string, quote byte) error {
451+
if err := di.writeByte(quote); err != nil {
451452
return err
452453
}
453454

454-
for _, r := range val {
455-
switch {
456-
case r == '\t', r == '\n', r == '\r', r == '\\', r == quote:
457-
if err := di.writeByte('\\'); err != nil {
458-
return err
459-
}
460-
if err := di.writeByte(byte(r)); err != nil {
461-
return err
455+
for i := 0; i < len(val); {
456+
if b := val[i]; b < utf8.RuneSelf {
457+
switch {
458+
case b == '\t', b == '\n', b == '\r', b == '\\', b == quote:
459+
if err := di.writeByte('\\'); err != nil {
460+
return err
461+
}
462+
463+
switch b {
464+
case '\t':
465+
b = 't'
466+
case '\n':
467+
b = 'n'
468+
case '\r':
469+
b = 'r'
470+
}
471+
if err := di.writeByte(b); err != nil {
472+
return err
473+
}
474+
475+
case b >= ' ' && b <= '~':
476+
if err := di.writeByte(b); err != nil {
477+
return err
478+
}
479+
480+
default:
481+
if err := di.writeU16(rune(b)); err != nil {
482+
return err
483+
}
462484
}
463485

464-
case r >= ' ' && r <= '~':
465-
if err := di.writeByte(byte(r)); err != nil {
486+
i++
487+
continue
488+
}
489+
490+
c, size := utf8.DecodeRuneInString(val[i:])
491+
switch {
492+
case c == utf8.RuneError:
493+
if err := di.writeU16(rune(val[i])); err != nil {
466494
return err
467495
}
468496

469-
case r < utf16SurrSelf:
470-
if err := di.writeU16(r); err != nil {
497+
case c < utf16SurrSelf:
498+
if err := di.writeU16(c); err != nil {
471499
return err
472500
}
473501

474502
default:
475-
r1, r2 := utf16.EncodeRune(r)
476-
if err := di.writeU16(r1); err != nil {
503+
c1, c2 := utf16.EncodeRune(c)
504+
if err := di.writeU16(c1); err != nil {
477505
return err
478506
}
479-
if err := di.writeU16(r2); err != nil {
507+
if err := di.writeU16(c2); err != nil {
480508
return err
481509
}
482510
}
511+
512+
i += size
483513
}
484514

485-
return di.writeByte(byte(quote))
515+
return di.writeByte(quote)
486516
}
487517

488518
func (di *diagnose) encodeFloat(ai byte, val uint64) error {

diagnose_test.go

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -549,6 +549,92 @@ func TestDiagnoseByteString(t *testing.T) {
549549
})
550550
}
551551

552+
func TestDiagnoseTextString(t *testing.T) {
553+
testCases := []struct {
554+
title string
555+
cbor []byte
556+
diag string
557+
opts *DiagOptions
558+
}{
559+
{
560+
"valid UTF-8 text in byte string",
561+
hexDecode("4d68656c6c6f2c20e4bda0e5a5bd"),
562+
`'hello, \u4f60\u597d'`,
563+
&DiagOptions{
564+
ByteStringText: true,
565+
},
566+
},
567+
{
568+
"valid UTF-8 text in text string",
569+
hexDecode("6d68656c6c6f2c20e4bda0e5a5bd"),
570+
`"hello, \u4f60\u597d"`, // "hello, 你好"
571+
&DiagOptions{
572+
ByteStringText: true,
573+
},
574+
},
575+
{
576+
"invalid UTF-8 text in byte string",
577+
hexDecode("4d68656c6c6fffeee4bda0e5a5bd"),
578+
`h'68656c6c6fffeee4bda0e5a5bd'`,
579+
&DiagOptions{
580+
ByteStringText: true,
581+
},
582+
},
583+
{
584+
"invalid UTF-8 text in text string",
585+
hexDecode("6d68656c6c6fffeee4bda0e5a5bd"),
586+
`"hello\u00ff\u00ee\u4f60\u597d"`,
587+
&DiagOptions{
588+
ByteStringText: true,
589+
},
590+
},
591+
{
592+
"valid grapheme cluster text in byte string",
593+
hexDecode("583448656c6c6f2c2027e29da4efb88fe2808df09f94a5270ae4bda0e5a5bdefbc8c22f09fa791e2808df09fa49de2808df09fa79122"),
594+
`'Hello, \'\u2764\ufe0f\u200d\ud83d\udd25\'\n\u4f60\u597d\uff0c"\ud83e\uddd1\u200d\ud83e\udd1d\u200d\ud83e\uddd1"'`,
595+
&DiagOptions{
596+
ByteStringText: true,
597+
},
598+
},
599+
{
600+
"valid grapheme cluster text in text string",
601+
hexDecode("783448656c6c6f2c2027e29da4efb88fe2808df09f94a5270ae4bda0e5a5bdefbc8c22f09fa791e2808df09fa49de2808df09fa79122"),
602+
`"Hello, '\u2764\ufe0f\u200d\ud83d\udd25'\n\u4f60\u597d\uff0c\"\ud83e\uddd1\u200d\ud83e\udd1d\u200d\ud83e\uddd1\""`, // "Hello, '❤️‍🔥'\n你好,\"🧑‍🤝‍🧑\""
603+
&DiagOptions{
604+
ByteStringText: true,
605+
},
606+
},
607+
{
608+
"invalid grapheme cluster text in byte string",
609+
hexDecode("583448656c6c6feeff27e29da4efb88fe2808df09f94a5270de4bda0e5a5bdefbc8c22f09fa791e2808df09fa49de2808df09fa79122"),
610+
`h'48656c6c6feeff27e29da4efb88fe2808df09f94a5270de4bda0e5a5bdefbc8c22f09fa791e2808df09fa49de2808df09fa79122'`,
611+
&DiagOptions{
612+
ByteStringText: true,
613+
},
614+
},
615+
{
616+
"invalid grapheme cluster text in text string",
617+
hexDecode("783448656c6c6feeff27e29da4efb88fe2808df09f94a5270de4bda0e5a5bdefbc8c22f09fa791e2808df09fa49de2808df09fa79122"),
618+
`"Hello\u00ee\u00ff'\u2764\ufe0f\u200d\ud83d\udd25'\r\u4f60\u597d\uff0c\"\ud83e\uddd1\u200d\ud83e\udd1d\u200d\ud83e\uddd1\""`,
619+
&DiagOptions{
620+
ByteStringText: true,
621+
},
622+
},
623+
}
624+
625+
for _, tc := range testCases {
626+
t.Run(tc.title, func(t *testing.T) {
627+
628+
data, err := Diag(tc.cbor, tc.opts)
629+
if err != nil {
630+
t.Errorf("Diag(0x%x) returned error %q", tc.cbor, err)
631+
} else if string(data) != tc.diag {
632+
t.Errorf("Diag(0x%x) returned `%s`, want %s", tc.cbor, string(data), tc.diag)
633+
}
634+
})
635+
}
636+
}
637+
552638
func TestDiagnoseFloatingPointNumber(t *testing.T) {
553639
testCases := []struct {
554640
title string

0 commit comments

Comments
 (0)