From 9344b3a06b3586df58dad00e637dfb3f28f9d805 Mon Sep 17 00:00:00 2001 From: Anton Nekipelov <226657+anton-107@users.noreply.github.com> Date: Wed, 17 Sep 2025 15:49:21 +0200 Subject: [PATCH 1/4] extract text transform library from libs/tags to libs/textutil --- libs/tags/aws.go | 18 +++++++++------- libs/tags/azure.go | 12 ++++++----- libs/tags/gcp.go | 22 ++++++++++--------- libs/tags/tag.go | 10 +++++---- libs/tags/transform_test.go | 25 ---------------------- libs/{tags => textutil}/transform.go | 32 ++++++++++++++-------------- libs/textutil/transform_test.go | 25 ++++++++++++++++++++++ 7 files changed, 76 insertions(+), 68 deletions(-) delete mode 100644 libs/tags/transform_test.go rename libs/{tags => textutil}/transform.go (61%) create mode 100644 libs/textutil/transform_test.go diff --git a/libs/tags/aws.go b/libs/tags/aws.go index 44d69c683e..7c8096e3a1 100644 --- a/libs/tags/aws.go +++ b/libs/tags/aws.go @@ -4,6 +4,8 @@ import ( "regexp" "unicode" + "github.com/databricks/cli/libs/textutil" + "golang.org/x/text/unicode/rangetable" ) @@ -20,17 +22,17 @@ var awsChars = rangetable.Merge( var awsTag = &tag{ keyLength: 127, keyPattern: regexp.MustCompile(`^[\d \w\+\-=\.:\/@]*$`), - keyNormalize: chain( - normalizeMarks(), - replaceNotIn(latin1, '_'), - replaceNotIn(awsChars, '_'), + keyNormalize: textutil.Chain( + textutil.NormalizeMarks(), + textutil.ReplaceNotIn(latin1, '_'), + textutil.ReplaceNotIn(awsChars, '_'), ), valueLength: 255, valuePattern: regexp.MustCompile(`^[\d \w\+\-=\.:/@]*$`), - valueNormalize: chain( - normalizeMarks(), - replaceNotIn(latin1, '_'), - replaceNotIn(awsChars, '_'), + valueNormalize: textutil.Chain( + textutil.NormalizeMarks(), + textutil.ReplaceNotIn(latin1, '_'), + textutil.ReplaceNotIn(awsChars, '_'), ), } diff --git a/libs/tags/azure.go b/libs/tags/azure.go index e98a5eb2d4..caf06399d5 100644 --- a/libs/tags/azure.go +++ b/libs/tags/azure.go @@ -3,6 +3,8 @@ package tags import ( "regexp" + "github.com/databricks/cli/libs/textutil" + "golang.org/x/text/unicode/rangetable" ) @@ -12,14 +14,14 @@ var azureForbiddenChars = rangetable.New('<', '>', '*', '&', '%', ';', '\\', '/' var azureTag = &tag{ keyLength: 512, keyPattern: regexp.MustCompile(`^[^<>\*&%;\\\/\+\?]*$`), - keyNormalize: chain( - replaceNotIn(latin1, '_'), - replaceIn(azureForbiddenChars, '_'), + keyNormalize: textutil.Chain( + textutil.ReplaceNotIn(latin1, '_'), + textutil.ReplaceIn(azureForbiddenChars, '_'), ), valueLength: 256, valuePattern: regexp.MustCompile(`^.*$`), - valueNormalize: chain( - replaceNotIn(latin1, '_'), + valueNormalize: textutil.Chain( + textutil.ReplaceNotIn(latin1, '_'), ), } diff --git a/libs/tags/gcp.go b/libs/tags/gcp.go index f30ca4cae0..93584c6264 100644 --- a/libs/tags/gcp.go +++ b/libs/tags/gcp.go @@ -3,6 +3,8 @@ package tags import ( "regexp" "unicode" + + "github.com/databricks/cli/libs/textutil" ) // Tag keys and values on GCP are limited to 63 characters and must match the @@ -45,19 +47,19 @@ var gcpInner = &unicode.RangeTable{ var gcpTag = &tag{ keyLength: 63, keyPattern: regexp.MustCompile(`^([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$`), - keyNormalize: chain( - normalizeMarks(), - replaceNotIn(latin1, '_'), - replaceNotIn(gcpInner, '_'), - trimIfNotIn(gcpOuter), + keyNormalize: textutil.Chain( + textutil.NormalizeMarks(), + textutil.ReplaceNotIn(latin1, '_'), + textutil.ReplaceNotIn(gcpInner, '_'), + textutil.TrimIfNotIn(gcpOuter), ), valueLength: 63, valuePattern: regexp.MustCompile(`^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$`), - valueNormalize: chain( - normalizeMarks(), - replaceNotIn(latin1, '_'), - replaceNotIn(gcpInner, '_'), - trimIfNotIn(gcpOuter), + valueNormalize: textutil.Chain( + textutil.NormalizeMarks(), + textutil.ReplaceNotIn(latin1, '_'), + textutil.ReplaceNotIn(gcpInner, '_'), + textutil.TrimIfNotIn(gcpOuter), ), } diff --git a/libs/tags/tag.go b/libs/tags/tag.go index 64eab947e2..4ebec04c21 100644 --- a/libs/tags/tag.go +++ b/libs/tags/tag.go @@ -6,6 +6,8 @@ import ( "regexp" "strings" "unicode" + + "github.com/databricks/cli/libs/textutil" ) // The tag type holds the validation and normalization rules for @@ -13,11 +15,11 @@ import ( type tag struct { keyLength int keyPattern *regexp.Regexp - keyNormalize transformer + keyNormalize textutil.Transformer valueLength int valuePattern *regexp.Regexp - valueNormalize transformer + valueNormalize textutil.Transformer } func (t *tag) ValidateKey(s string) error { @@ -50,9 +52,9 @@ func (t *tag) ValidateValue(s string) error { } func (t *tag) NormalizeKey(s string) string { - return t.keyNormalize.transform(s) + return t.keyNormalize.TransformString(s) } func (t *tag) NormalizeValue(s string) string { - return t.valueNormalize.transform(s) + return t.valueNormalize.TransformString(s) } diff --git a/libs/tags/transform_test.go b/libs/tags/transform_test.go deleted file mode 100644 index 6481b6d9bc..0000000000 --- a/libs/tags/transform_test.go +++ /dev/null @@ -1,25 +0,0 @@ -package tags - -import ( - "testing" - "unicode" - - "github.com/stretchr/testify/assert" -) - -func TestNormalizeMarks(t *testing.T) { - x := normalizeMarks() - assert.Equal(t, "cafe", x.transform("café")) - assert.Equal(t, "cafe 🍎", x.transform("café 🍎")) - assert.Equal(t, "Foo Bar", x.transform("Foo Bar")) -} - -func TestReplace(t *testing.T) { - assert.Equal(t, "___abc___", replaceIn(unicode.Digit, '_').transform("000abc999")) - assert.Equal(t, "___000___", replaceNotIn(unicode.Digit, '_').transform("abc000abc")) -} - -func TestTrim(t *testing.T) { - assert.Equal(t, "abc", trimIfIn(unicode.Digit).transform("000abc999")) - assert.Equal(t, "000", trimIfNotIn(unicode.Digit).transform("abc000abc")) -} diff --git a/libs/tags/transform.go b/libs/textutil/transform.go similarity index 61% rename from libs/tags/transform.go rename to libs/textutil/transform.go index 71d01b3563..e525bc250c 100644 --- a/libs/tags/transform.go +++ b/libs/textutil/transform.go @@ -1,4 +1,4 @@ -package tags +package textutil import ( "strings" @@ -9,34 +9,34 @@ import ( "golang.org/x/text/unicode/norm" ) -type transformer interface { - transform(string) string +type Transformer interface { + TransformString(string) string } -type chainTransformer []transformer +type chainTransformer []Transformer -func (c chainTransformer) transform(s string) string { +func (c chainTransformer) TransformString(s string) string { for _, t := range c { - s = t.transform(s) + s = t.TransformString(s) } return s } -func chain(t ...transformer) transformer { +func Chain(t ...Transformer) Transformer { return chainTransformer(t) } -// Implement [transformer] interface with text/transform package. +// Implement [Transformer] interface with text/transform package. type textTransformer struct { transform.Transformer } -func (t textTransformer) transform(s string) string { +func (t textTransformer) TransformString(s string) string { s, _, _ = transform.String(t, s) return s } -func normalizeMarks() transformer { +func NormalizeMarks() Transformer { // Decompose unicode characters, then remove all non-spacing marks, then recompose. // This turns 'é' into 'e' and 'ü' into 'u'. return textTransformer{ @@ -50,7 +50,7 @@ type replaceTransformer struct { replacement rune } -func (t replaceTransformer) transform(s string) string { +func (t replaceTransformer) TransformString(s string) string { return strings.Map(func(r rune) rune { if t.set.Contains(r) { return t.replacement @@ -59,11 +59,11 @@ func (t replaceTransformer) transform(s string) string { }, s) } -func replaceIn(table *unicode.RangeTable, replacement rune) transformer { +func ReplaceIn(table *unicode.RangeTable, replacement rune) Transformer { return replaceTransformer{runes.In(table), replacement} } -func replaceNotIn(table *unicode.RangeTable, replacement rune) transformer { +func ReplaceNotIn(table *unicode.RangeTable, replacement rune) Transformer { return replaceTransformer{runes.NotIn(table), replacement} } @@ -72,16 +72,16 @@ type trimTransformer struct { set runes.Set } -func (t trimTransformer) transform(s string) string { +func (t trimTransformer) TransformString(s string) string { return strings.TrimFunc(s, func(r rune) bool { return t.set.Contains(r) }) } -func trimIfIn(table *unicode.RangeTable) transformer { +func trimIfIn(table *unicode.RangeTable) Transformer { return trimTransformer{runes.In(table)} } -func trimIfNotIn(table *unicode.RangeTable) transformer { +func TrimIfNotIn(table *unicode.RangeTable) Transformer { return trimTransformer{runes.NotIn(table)} } diff --git a/libs/textutil/transform_test.go b/libs/textutil/transform_test.go new file mode 100644 index 0000000000..80759ed856 --- /dev/null +++ b/libs/textutil/transform_test.go @@ -0,0 +1,25 @@ +package textutil + +import ( + "testing" + "unicode" + + "github.com/stretchr/testify/assert" +) + +func TestNormalizeMarks(t *testing.T) { + x := NormalizeMarks() + assert.Equal(t, "cafe", x.TransformString("café")) + assert.Equal(t, "cafe 🍎", x.TransformString("café 🍎")) + assert.Equal(t, "Foo Bar", x.TransformString("Foo Bar")) +} + +func TestReplace(t *testing.T) { + assert.Equal(t, "___abc___", ReplaceIn(unicode.Digit, '_').TransformString("000abc999")) + assert.Equal(t, "___000___", ReplaceNotIn(unicode.Digit, '_').TransformString("abc000abc")) +} + +func TestTrim(t *testing.T) { + assert.Equal(t, "abc", trimIfIn(unicode.Digit).TransformString("000abc999")) + assert.Equal(t, "000", TrimIfNotIn(unicode.Digit).TransformString("abc000abc")) +} From bd6b3f21180244c2e6c0455f8173e107cbdde620 Mon Sep 17 00:00:00 2001 From: Anton Nekipelov <226657+anton-107@users.noreply.github.com> Date: Wed, 17 Sep 2025 16:11:16 +0200 Subject: [PATCH 2/4] extract latin1 to textutil --- libs/tags/aws.go | 4 ++-- libs/tags/azure.go | 4 ++-- libs/tags/gcp.go | 4 ++-- libs/tags/latin_test.go | 16 ---------------- libs/tags/tag.go | 4 ++-- libs/{tags => textutil}/latin.go | 4 ++-- libs/textutil/latin_test.go | 16 ++++++++++++++++ 7 files changed, 26 insertions(+), 26 deletions(-) delete mode 100644 libs/tags/latin_test.go rename libs/{tags => textutil}/latin.go (75%) create mode 100644 libs/textutil/latin_test.go diff --git a/libs/tags/aws.go b/libs/tags/aws.go index 7c8096e3a1..3272ec99c6 100644 --- a/libs/tags/aws.go +++ b/libs/tags/aws.go @@ -24,7 +24,7 @@ var awsTag = &tag{ keyPattern: regexp.MustCompile(`^[\d \w\+\-=\.:\/@]*$`), keyNormalize: textutil.Chain( textutil.NormalizeMarks(), - textutil.ReplaceNotIn(latin1, '_'), + textutil.ReplaceNotIn(textutil.Latin1, '_'), textutil.ReplaceNotIn(awsChars, '_'), ), @@ -32,7 +32,7 @@ var awsTag = &tag{ valuePattern: regexp.MustCompile(`^[\d \w\+\-=\.:/@]*$`), valueNormalize: textutil.Chain( textutil.NormalizeMarks(), - textutil.ReplaceNotIn(latin1, '_'), + textutil.ReplaceNotIn(textutil.Latin1, '_'), textutil.ReplaceNotIn(awsChars, '_'), ), } diff --git a/libs/tags/azure.go b/libs/tags/azure.go index caf06399d5..4b58a5b9e6 100644 --- a/libs/tags/azure.go +++ b/libs/tags/azure.go @@ -15,13 +15,13 @@ var azureTag = &tag{ keyLength: 512, keyPattern: regexp.MustCompile(`^[^<>\*&%;\\\/\+\?]*$`), keyNormalize: textutil.Chain( - textutil.ReplaceNotIn(latin1, '_'), + textutil.ReplaceNotIn(textutil.Latin1, '_'), textutil.ReplaceIn(azureForbiddenChars, '_'), ), valueLength: 256, valuePattern: regexp.MustCompile(`^.*$`), valueNormalize: textutil.Chain( - textutil.ReplaceNotIn(latin1, '_'), + textutil.ReplaceNotIn(textutil.Latin1, '_'), ), } diff --git a/libs/tags/gcp.go b/libs/tags/gcp.go index 93584c6264..9fed16d4bd 100644 --- a/libs/tags/gcp.go +++ b/libs/tags/gcp.go @@ -49,7 +49,7 @@ var gcpTag = &tag{ keyPattern: regexp.MustCompile(`^([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$`), keyNormalize: textutil.Chain( textutil.NormalizeMarks(), - textutil.ReplaceNotIn(latin1, '_'), + textutil.ReplaceNotIn(textutil.Latin1, '_'), textutil.ReplaceNotIn(gcpInner, '_'), textutil.TrimIfNotIn(gcpOuter), ), @@ -58,7 +58,7 @@ var gcpTag = &tag{ valuePattern: regexp.MustCompile(`^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$`), valueNormalize: textutil.Chain( textutil.NormalizeMarks(), - textutil.ReplaceNotIn(latin1, '_'), + textutil.ReplaceNotIn(textutil.Latin1, '_'), textutil.ReplaceNotIn(gcpInner, '_'), textutil.TrimIfNotIn(gcpOuter), ), diff --git a/libs/tags/latin_test.go b/libs/tags/latin_test.go deleted file mode 100644 index c3234a4435..0000000000 --- a/libs/tags/latin_test.go +++ /dev/null @@ -1,16 +0,0 @@ -package tags - -import ( - "testing" - "unicode" - - "github.com/stretchr/testify/assert" -) - -func TestLatinTable(t *testing.T) { - assert.True(t, unicode.In('\u0000', latin1)) - assert.True(t, unicode.In('A', latin1)) - assert.True(t, unicode.In('Z', latin1)) - assert.True(t, unicode.In('\u00ff', latin1)) - assert.False(t, unicode.In('\u0100', latin1)) -} diff --git a/libs/tags/tag.go b/libs/tags/tag.go index 4ebec04c21..dc2c479d2d 100644 --- a/libs/tags/tag.go +++ b/libs/tags/tag.go @@ -29,7 +29,7 @@ func (t *tag) ValidateKey(s string) error { if len(s) > t.keyLength { return fmt.Errorf("key length %d exceeds maximum of %d", len(s), t.keyLength) } - if strings.ContainsFunc(s, func(r rune) bool { return !unicode.Is(latin1, r) }) { + if strings.ContainsFunc(s, func(r rune) bool { return !unicode.Is(textutil.Latin1, r) }) { return errors.New("key contains non-latin1 characters") } if !t.keyPattern.MatchString(s) { @@ -42,7 +42,7 @@ func (t *tag) ValidateValue(s string) error { if len(s) > t.valueLength { return fmt.Errorf("value length %d exceeds maximum of %d", len(s), t.valueLength) } - if strings.ContainsFunc(s, func(r rune) bool { return !unicode.Is(latin1, r) }) { + if strings.ContainsFunc(s, func(r rune) bool { return !unicode.Is(textutil.Latin1, r) }) { return errors.New("value contains non-latin1 characters") } if !t.valuePattern.MatchString(s) { diff --git a/libs/tags/latin.go b/libs/textutil/latin.go similarity index 75% rename from libs/tags/latin.go rename to libs/textutil/latin.go index df9ad403e7..87a111793c 100644 --- a/libs/tags/latin.go +++ b/libs/textutil/latin.go @@ -1,9 +1,9 @@ -package tags +package textutil import "unicode" // Range table for all characters in the Latin1 character set. -var latin1 = &unicode.RangeTable{ +var Latin1 = &unicode.RangeTable{ R16: []unicode.Range16{ {0x0000, 0x00ff, 1}, }, diff --git a/libs/textutil/latin_test.go b/libs/textutil/latin_test.go new file mode 100644 index 0000000000..3d5734ddde --- /dev/null +++ b/libs/textutil/latin_test.go @@ -0,0 +1,16 @@ +package textutil + +import ( + "testing" + "unicode" + + "github.com/stretchr/testify/assert" +) + +func TestLatinTable(t *testing.T) { + assert.True(t, unicode.In('\u0000', Latin1)) + assert.True(t, unicode.In('A', Latin1)) + assert.True(t, unicode.In('Z', Latin1)) + assert.True(t, unicode.In('\u00ff', Latin1)) + assert.False(t, unicode.In('\u0100', Latin1)) +} From 483161e0b5ec948ed5809db0c4efd8c0c671733d Mon Sep 17 00:00:00 2001 From: Anton Nekipelov <226657+anton-107@users.noreply.github.com> Date: Thu, 18 Sep 2025 10:34:29 +0200 Subject: [PATCH 3/4] export TrimIfIn --- libs/textutil/transform.go | 2 +- libs/textutil/transform_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/textutil/transform.go b/libs/textutil/transform.go index e525bc250c..222e7521e2 100644 --- a/libs/textutil/transform.go +++ b/libs/textutil/transform.go @@ -78,7 +78,7 @@ func (t trimTransformer) TransformString(s string) string { }) } -func trimIfIn(table *unicode.RangeTable) Transformer { +func TrimIfIn(table *unicode.RangeTable) Transformer { return trimTransformer{runes.In(table)} } diff --git a/libs/textutil/transform_test.go b/libs/textutil/transform_test.go index 80759ed856..9f39139c0a 100644 --- a/libs/textutil/transform_test.go +++ b/libs/textutil/transform_test.go @@ -20,6 +20,6 @@ func TestReplace(t *testing.T) { } func TestTrim(t *testing.T) { - assert.Equal(t, "abc", trimIfIn(unicode.Digit).TransformString("000abc999")) + assert.Equal(t, "abc", TrimIfIn(unicode.Digit).TransformString("000abc999")) assert.Equal(t, "000", TrimIfNotIn(unicode.Digit).TransformString("abc000abc")) } From 338560cb874367c5bf97ec9c22ce5f57b35f9aec Mon Sep 17 00:00:00 2001 From: Anton Nekipelov <226657+anton-107@users.noreply.github.com> Date: Thu, 18 Sep 2025 10:44:03 +0200 Subject: [PATCH 4/4] add short doc strings to transform structs/methods --- libs/textutil/transform.go | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/libs/textutil/transform.go b/libs/textutil/transform.go index 222e7521e2..a37906b9dd 100644 --- a/libs/textutil/transform.go +++ b/libs/textutil/transform.go @@ -9,6 +9,7 @@ import ( "golang.org/x/text/unicode/norm" ) +// Transformer represents a text transformation operation. type Transformer interface { TransformString(string) string } @@ -22,6 +23,7 @@ func (c chainTransformer) TransformString(s string) string { return s } +// Chain creates a transformer that applies multiple transformers in sequence. func Chain(t ...Transformer) Transformer { return chainTransformer(t) } @@ -36,9 +38,10 @@ func (t textTransformer) TransformString(s string) string { return s } +// NormalizeMarks creates a transformer that removes diacritical marks from characters. +// This turns 'é' into 'e' and 'ü' into 'u'. func NormalizeMarks() Transformer { - // Decompose unicode characters, then remove all non-spacing marks, then recompose. - // This turns 'é' into 'e' and 'ü' into 'u'. + // Decompose unicode characters, then remove all non-spacing marks, then recompose return textTransformer{ transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC), } @@ -59,10 +62,12 @@ func (t replaceTransformer) TransformString(s string) string { }, s) } +// ReplaceIn creates a transformer that replaces characters within the given Unicode range table with the replacement rune. func ReplaceIn(table *unicode.RangeTable, replacement rune) Transformer { return replaceTransformer{runes.In(table), replacement} } +// ReplaceNotIn creates a transformer that replaces characters NOT within the given Unicode range table with the replacement rune. func ReplaceNotIn(table *unicode.RangeTable, replacement rune) Transformer { return replaceTransformer{runes.NotIn(table), replacement} } @@ -78,10 +83,12 @@ func (t trimTransformer) TransformString(s string) string { }) } +// TrimIfIn creates a transformer that trims characters from the beginning and end of strings if they are within the given Unicode range table. func TrimIfIn(table *unicode.RangeTable) Transformer { return trimTransformer{runes.In(table)} } +// TrimIfNotIn creates a transformer that trims characters from the beginning and end of strings if they are NOT within the given Unicode range table. func TrimIfNotIn(table *unicode.RangeTable) Transformer { return trimTransformer{runes.NotIn(table)} }