* Add optional encoding for string.length and string.sub, default to

`"utf8"` * Add `string.chars` with encoding. * Fix default string escaping to properly fallback to `"ascii"` when utf8 escaping failed. Fixes: #4109
savonet · toots · Aug 25, 2024 · Aug 25, 2024 · Aug 25, 2024 · Aug 25, 2024
commit 887793ce62d59dba0d435952c2063f62aa053e78
diff --git a/CHANGES.md b/CHANGES.md
@@ -53,6 +53,7 @@ New:
 - Add `string.of_int` and `string.spaces`.
 - Add `list.assoc.nullable`.
 - Add `source.cue` (#3620).
+- Add `string.chars` (#4111)
 - Added atomic file write operations.
 
 Changed:
@@ -64,6 +65,7 @@ Changed:
 - Changed internal metadata format to be immutable (#3297).
 - Allow a getter for the offset of `on_offset` and dropped the metadata
   mechanism for updating it (#3355).
+- `string.length` and `string.sub` now default to `utf8` encoding (#4109)
 - Disable output paging when `TERM` environment variable is not set.
 - Allow running as `root` user inside `docker` container by default (#3406).
 - Run `check_next` before playlist's requests resolutions (#3625)

diff --git a/doc/content/migrating.md b/doc/content/migrating.md
@@ -92,6 +92,14 @@ end
   However, EBU R128 data is now extracted directly from metadata when available.
   So `replaygain` cannot control the gain type via this parameter anymore.
 
+### String functions
+
+Some string functions have been updated to account for string encoding. In particular, `string.length` and `string.sub` now assume that their
+given string is in `utf8` by default.
+
+While this is what most user expect, this can lead to backward incompatibilities and new exceptions. You can change back to the old default by
+passing `encoding="ascii"` to these functions or using the `settings.string.default_encoding` settings.
+
 ### `check_next`
 
 `check_next` in playlist operators is now called _before_ the request is resolved, to make it possible to cut out

diff --git a/src/core/builtins/builtins_string_extra.ml b/src/core/builtins/builtins_string_extra.ml
@@ -20,6 +20,32 @@
 
  *****************************************************************************)
 
+let log = Log.make ["lang"; "string"]
+
+let conf_string =
+  Dtools.Conf.void ~p:(Configure.conf#plug "string") "String settings"
+
+let () =
+  let conf_default_encoding =
+    Dtools.Conf.string
+      ~p:(conf_string#plug "default_encoding")
+      ~d:"utf8"
+      "Default encoding for `string.length`, `string.chars` and `string.sub`"
+  in
+  conf_default_encoding#on_change (fun v ->
+      let enc =
+        match v with
+          | "ascii" -> `Ascii
+          | "utf8" -> `Utf8
+          | _ ->
+              log#important
+                "Invalid value %s for `settings.string.default_encoding`! \
+                 Should be one of: \"ascii\" or \"utf8\"."
+                v;
+              `Utf8
+      in
+      Liquidsoap_lang.Builtins_string.default_encoding := enc)
+
 let string = Liquidsoap_lang.Builtins_string.string
 let string_annotate = Lang.add_module ~base:string "annotate"
 

diff --git a/src/lang/builtins_regexp.ml b/src/lang/builtins_regexp.ml
@@ -49,7 +49,7 @@ let escape_regex_descr =
         else Lang_string.utf8_special_char s pos len)
       ~escape_char:(fun s pos len ->
         if s.[pos] = '/' && len = 1 then "\\/"
-        else Lang_string.escape_utf8_char s pos len)
+        else Lang_string.escape_utf8_char ~strict:false s pos len)
       ~next:Lang_string.utf8_next
   in
   Lang_string.escape_string escape_regex_formatter

diff --git a/src/lang/builtins_string.ml b/src/lang/builtins_string.ml
@@ -52,6 +52,86 @@ let _ =
       let l = List.map Lang.to_string l in
       Lang.string (String.concat sep l))
 
+let split ~encoding s =
+  let get =
+    match encoding with
+      | `Ascii ->
+          fun pos ->
+            let buf = Buffer.create 1 in
+            Buffer.add_char buf (String.get s pos);
+            (Buffer.contents buf, 1)
+      | `Utf8 ->
+          fun pos ->
+            let d = String.get_utf_8_uchar s pos in
+            if not (Uchar.utf_decode_is_valid d) then
+              failwith "Decoding failed!";
+            let c = Uchar.utf_decode_uchar d in
+            let buf = Buffer.create 1 in
+            Buffer.add_utf_8_uchar buf c;
+            (Buffer.contents buf, Uchar.utf_decode_length d)
+  in
+  let len = String.length s in
+  let rec f chars pos =
+    if pos = len then List.rev chars
+    else (
+      let char, len = get pos in
+      f (char :: chars) (pos + len))
+  in
+  f [] 0
+
+let default_encoding = ref `Utf8
+
+let encoding_option =
+  ( "encoding",
+    Lang.nullable_t Lang.string_t,
+    Some Lang.null,
+    Some
+      "Encoding used to split characters. Should be one of: `\"utf8\"` or \
+       `\"ascii\"`" )
+
+let get_encoding p =
+  match Lang.to_valued_option Lang.to_string (List.assoc "encoding" p) with
+    | None -> ("utf8", !default_encoding)
+    | Some "utf8" -> ("utf8", `Utf8)
+    | Some "ascii" -> ("ascii", `Ascii)
+    | _ ->
+        Runtime_error.raise ~pos:(Lang.pos p) ~message:"Invalid encoding!"
+          "invalid"
+
+let _ =
+  Lang.add_builtin ~base:string "chars" ~category:`String
+    ~descr:"Split string into characters. Raises `error.invalid` on errors."
+    [encoding_option; ("", Lang.string_t, None, None)]
+    (Lang.list_t Lang.string_t)
+    (fun p ->
+      let enc, encoding = get_encoding p in
+      let s = Lang.to_string (List.assoc "" p) in
+      try Lang.list (List.map Lang.string (split ~encoding s))
+      with _ ->
+        Runtime_error.raise ~pos:(Lang.pos p)
+          ~message:
+            (Printf.sprintf "String cannot be split using encoding `\"%s\"`!"
+               enc)
+          "invalid")
+
+let _ =
+  Lang.add_builtin ~base:string "length" ~category:`String
+    ~descr:
+      "Return the string's length using the given encoding. Raises \
+       `error.invalid` on errors."
+    [encoding_option; ("", Lang.string_t, None, None)]
+    Lang.int_t
+    (fun p ->
+      let enc, encoding = get_encoding p in
+      let s = Lang.to_string (List.assoc "" p) in
+      try Lang.int (List.length (split ~encoding s))
+      with _ ->
+        Runtime_error.raise ~pos:(Lang.pos p)
+          ~message:
+            (Printf.sprintf "String cannot be split using encoding `\"%s\"`!"
+               enc)
+          "invalid")
+
 let _ =
   Lang.add_builtin ~base:string "nth" ~category:`String
     ~descr:
@@ -165,7 +245,7 @@ let string_escape =
                          ("", Lang.string (String.sub s ofs len));
                        ])
             | None, `Ascii -> Lang_string.escape_hex_char
-            | None, `Utf8 -> Lang_string.escape_utf8_char
+            | None, `Utf8 -> Lang_string.escape_utf8_char ~strict:false
         in
         let next =
           match encoding with
@@ -213,7 +293,8 @@ let _ =
         match Lang.to_string format with
           | "octal" -> (Lang_string.escape_octal_char, Lang_string.ascii_next)
           | "hex" -> (Lang_string.escape_hex_char, Lang_string.ascii_next)
-          | "utf8" -> (Lang_string.escape_utf8_char, Lang_string.utf8_next)
+          | "utf8" ->
+              (Lang_string.escape_utf8_char ~strict:false, Lang_string.utf8_next)
           | _ ->
               raise
                 (Error.Invalid_value
@@ -264,15 +345,6 @@ let _ =
       let s = Lang.to_string (List.assoc "" p) in
       Lang.string (Lang_string.unescape_string s))
 
-let _ =
-  Lang.add_builtin ~base:string "length" ~category:`String
-    ~descr:"Get the length of a string."
-    [("", Lang.string_t, None, None)]
-    Lang.int_t
-    (fun p ->
-      let string = Lang.to_string (List.assoc "" p) in
-      Lang.int (String.length string))
-
 let _ =
   Lang.add_builtin ~base:string "sub" ~category:`String
     ~descr:
@@ -285,6 +357,7 @@ let _ =
         Some
           "Return a sub string starting at this position. First position is 0."
       );
+      encoding_option;
       ( "length",
         Lang.int_t,
         None,
@@ -294,9 +367,24 @@ let _ =
     (fun p ->
       let start = Lang.to_int (List.assoc "start" p) in
       let len = Lang.to_int (List.assoc "length" p) in
+      let _, encoding = get_encoding p in
       let string = Lang.to_string (List.assoc "" p) in
-      Lang.string
-        (try String.sub string start len with Invalid_argument _ -> ""))
+      let s =
+        match encoding with
+          | `Ascii -> (
+              try String.sub string start len with Invalid_argument _ -> "")
+          | `Utf8 -> (
+              try
+                let chars = split ~encoding string in
+                if List.length chars < len + start then ""
+                else
+                  String.concat ""
+                    (List.filteri
+                       (fun pos _ -> start <= pos && pos < start + len)
+                       chars)
+              with _ -> "")
+      in
+      Lang.string s)
 
 let _ =
   Lang.add_builtin ~base:string "index" ~category:`String

diff --git a/src/lang/lang_string.ml b/src/lang/lang_string.ml
@@ -105,15 +105,17 @@ let escape_char ~escape_fun s pos len =
     | '\'', 1 -> "\\'"
     | _ -> escape_fun s pos len
 
-let escape_utf8_char =
+let escape_utf8_char ~strict =
   let utf8_char_code s pos len =
-    try utf8_char_code s pos len with _ -> Uchar.to_int Uchar.rep
+    try utf8_char_code s pos len
+    with _ when not strict -> Uchar.to_int Uchar.rep
   in
   escape_char ~escape_fun:(fun s pos len ->
       Printf.sprintf "\\u%04X" (utf8_char_code s pos len))
 
-let escape_utf8_formatter ?(special_char = utf8_special_char) =
-  escape ~special_char ~escape_char:escape_utf8_char ~next:utf8_next
+let escape_utf8_formatter ?(strict = false) ?(special_char = utf8_special_char)
+    =
+  escape ~special_char ~escape_char:(escape_utf8_char ~strict) ~next:utf8_next
 
 let escape_hex_char =
   escape_char ~escape_fun:(fun s pos len ->
@@ -153,15 +155,15 @@ let escape_string escape s =
          len segments);
     Bytes.unsafe_to_string b)
 
-let escape_utf8_string ?special_char =
-  escape_string (escape_utf8_formatter ?special_char)
+let escape_utf8_string ?strict ?special_char =
+  escape_string (escape_utf8_formatter ?strict ?special_char)
 
 let escape_ascii_string ?special_char =
   escape_string (escape_ascii_formatter ?special_char)
 
-let quote_utf8_string s =
+let quote_utf8_string ?strict s =
   Printf.sprintf "\"%s\""
-    (escape_utf8_string
+    (escape_utf8_string ?strict
        ~special_char:(fun s pos len ->
          if s.[pos] = '\'' && len = 1 then false
          else utf8_special_char s pos len)
@@ -175,7 +177,9 @@ let quote_ascii_string s =
          else ascii_special_char s pos len)
        s)
 
-let quote_string s = try quote_utf8_string s with _ -> quote_ascii_string s
+let quote_string s =
+  try quote_utf8_string ~strict:true s with _ -> quote_ascii_string s
+
 let unescape_utf8_pattern = "\\\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]"
 let unescape_hex_pattern = "\\\\x[0-9a-fA-F][0-9a-fA-F]"
 let unescape_octal_pattern = "\\\\[0-9][0-9][0-9]"

diff --git a/src/lang/lang_string.mli b/src/lang/lang_string.mli
@@ -16,9 +16,10 @@ val ascii_next : 'a -> int -> int
 val escape_char :
   escape_fun:(string -> int -> int -> string) -> string -> int -> int -> string
 
-val escape_utf8_char : string -> int -> int -> string
+val escape_utf8_char : strict:bool -> string -> int -> int -> string
 
 val escape_utf8_formatter :
+  ?strict:bool ->
   ?special_char:(string -> int -> int -> bool) ->
   string ->
   [> `Orig of int * int | `Subst of string * int ] list * int
@@ -39,12 +40,15 @@ val escape_string :
   string
 
 val escape_utf8_string :
-  ?special_char:(string -> int -> int -> bool) -> string -> string
+  ?strict:bool ->
+  ?special_char:(string -> int -> int -> bool) ->
+  string ->
+  string
 
 val escape_ascii_string :
   ?special_char:(string -> int -> int -> bool) -> string -> string
 
-val quote_utf8_string : string -> string
+val quote_utf8_string : ?strict:bool -> string -> string
 val quote_ascii_string : string -> string
 val quote_string : string -> string
 val unescape_utf8_pattern : string

diff --git a/tests/language/string.liq b/tests/language/string.liq
@@ -143,6 +143,37 @@ def f() =
     "blo#{(1, 2, 3)}",
     "blo(1, 2, 3)"
   )
+
+  s = "王^小東="
+  test.equal(string.length(s), 5)
+  test.equal(string.chars(s), ["王", "^", "小", "東", "="])
+  test.equal(string.sub(start=1, length=2, s), "^小")
+  test.equal(string.length(encoding="ascii", s), 11)
+  test.equal(
+    string.chars(encoding="ascii", s),
+    [
+      "\xE7",
+      "\x8E",
+      "\x8B",
+      "^",
+      "\xE5",
+      "\xB0",
+      "\x8F",
+      "\xE6",
+      "\x9D",
+      "\xB1",
+      "="
+    ]
+  )
+  test.equal(string.sub(encoding="ascii", start=1, length=2, s), "\x8E\x8B")
+
+  try
+    string.chars(encoding="utf16le", s)
+    test.fail()
+  catch e : [error.invalid] do
+    ()
+  end
+
   test.pass()
 end
 
@@ -171,6 +202,7 @@ def test_escape_html() =
   test.equal(string.escape.html("\\"), "\\")
   test.equal(string.escape.html("/"), "/")
   test.equal(string.escape.html("`"), "`")
+
   test.pass()
 end