Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
* Add optional encoding for string.length and string.sub, default to
  `"utf8"`
* Add `string.chars` with encoding.
* Fix default string escaping to properly fallback to `"ascii"` when
  utf8 escaping failed.

Fixes: #4109
  • Loading branch information
toots committed Aug 25, 2024
commit 887793ce62d59dba0d435952c2063f62aa053e78
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ New:
- Add `string.of_int` and `string.spaces`.
- Add `list.assoc.nullable`.
- Add `source.cue` (#3620).
- Add `string.chars` (#4111)
- Added atomic file write operations.

Changed:
Expand All @@ -64,6 +65,7 @@ Changed:
- Changed internal metadata format to be immutable (#3297).
- Allow a getter for the offset of `on_offset` and dropped the metadata
mechanism for updating it (#3355).
- `string.length` and `string.sub` now default to `utf8` encoding (#4109)
- Disable output paging when `TERM` environment variable is not set.
- Allow running as `root` user inside `docker` container by default (#3406).
- Run `check_next` before playlist's requests resolutions (#3625)
Expand Down
8 changes: 8 additions & 0 deletions doc/content/migrating.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,14 @@ end
However, EBU R128 data is now extracted directly from metadata when available.
So `replaygain` cannot control the gain type via this parameter anymore.

### String functions

Some string functions have been updated to account for string encoding. In particular, `string.length` and `string.sub` now assume that their
given string is in `utf8` by default.

While this is what most user expect, this can lead to backward incompatibilities and new exceptions. You can change back to the old default by
passing `encoding="ascii"` to these functions or using the `settings.string.default_encoding` settings.

### `check_next`

`check_next` in playlist operators is now called _before_ the request is resolved, to make it possible to cut out
Expand Down
26 changes: 26 additions & 0 deletions src/core/builtins/builtins_string_extra.ml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,32 @@

*****************************************************************************)

let log = Log.make ["lang"; "string"]

let conf_string =
Dtools.Conf.void ~p:(Configure.conf#plug "string") "String settings"

let () =
let conf_default_encoding =
Dtools.Conf.string
~p:(conf_string#plug "default_encoding")
~d:"utf8"
"Default encoding for `string.length`, `string.chars` and `string.sub`"
in
conf_default_encoding#on_change (fun v ->
let enc =
match v with
| "ascii" -> `Ascii
| "utf8" -> `Utf8
| _ ->
log#important
"Invalid value %s for `settings.string.default_encoding`! \
Should be one of: \"ascii\" or \"utf8\"."
v;
`Utf8
in
Liquidsoap_lang.Builtins_string.default_encoding := enc)

let string = Liquidsoap_lang.Builtins_string.string
let string_annotate = Lang.add_module ~base:string "annotate"

Expand Down
2 changes: 1 addition & 1 deletion src/lang/builtins_regexp.ml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ let escape_regex_descr =
else Lang_string.utf8_special_char s pos len)
~escape_char:(fun s pos len ->
if s.[pos] = '/' && len = 1 then "\\/"
else Lang_string.escape_utf8_char s pos len)
else Lang_string.escape_utf8_char ~strict:false s pos len)
~next:Lang_string.utf8_next
in
Lang_string.escape_string escape_regex_formatter
Expand Down
114 changes: 101 additions & 13 deletions src/lang/builtins_string.ml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,86 @@ let _ =
let l = List.map Lang.to_string l in
Lang.string (String.concat sep l))

let split ~encoding s =
let get =
match encoding with
| `Ascii ->
fun pos ->
let buf = Buffer.create 1 in
Buffer.add_char buf (String.get s pos);
(Buffer.contents buf, 1)
| `Utf8 ->
fun pos ->
let d = String.get_utf_8_uchar s pos in
if not (Uchar.utf_decode_is_valid d) then
failwith "Decoding failed!";
let c = Uchar.utf_decode_uchar d in
let buf = Buffer.create 1 in
Buffer.add_utf_8_uchar buf c;
(Buffer.contents buf, Uchar.utf_decode_length d)
in
let len = String.length s in
let rec f chars pos =
if pos = len then List.rev chars
else (
let char, len = get pos in
f (char :: chars) (pos + len))
in
f [] 0

let default_encoding = ref `Utf8

let encoding_option =
( "encoding",
Lang.nullable_t Lang.string_t,
Some Lang.null,
Some
"Encoding used to split characters. Should be one of: `\"utf8\"` or \
`\"ascii\"`" )

let get_encoding p =
match Lang.to_valued_option Lang.to_string (List.assoc "encoding" p) with
| None -> ("utf8", !default_encoding)
| Some "utf8" -> ("utf8", `Utf8)
| Some "ascii" -> ("ascii", `Ascii)
| _ ->
Runtime_error.raise ~pos:(Lang.pos p) ~message:"Invalid encoding!"
"invalid"

let _ =
Lang.add_builtin ~base:string "chars" ~category:`String
~descr:"Split string into characters. Raises `error.invalid` on errors."
[encoding_option; ("", Lang.string_t, None, None)]
(Lang.list_t Lang.string_t)
(fun p ->
let enc, encoding = get_encoding p in
let s = Lang.to_string (List.assoc "" p) in
try Lang.list (List.map Lang.string (split ~encoding s))
with _ ->
Runtime_error.raise ~pos:(Lang.pos p)
~message:
(Printf.sprintf "String cannot be split using encoding `\"%s\"`!"
enc)
"invalid")

let _ =
Lang.add_builtin ~base:string "length" ~category:`String
~descr:
"Return the string's length using the given encoding. Raises \
`error.invalid` on errors."
[encoding_option; ("", Lang.string_t, None, None)]
Lang.int_t
(fun p ->
let enc, encoding = get_encoding p in
let s = Lang.to_string (List.assoc "" p) in
try Lang.int (List.length (split ~encoding s))
with _ ->
Runtime_error.raise ~pos:(Lang.pos p)
~message:
(Printf.sprintf "String cannot be split using encoding `\"%s\"`!"
enc)
"invalid")

let _ =
Lang.add_builtin ~base:string "nth" ~category:`String
~descr:
Expand Down Expand Up @@ -165,7 +245,7 @@ let string_escape =
("", Lang.string (String.sub s ofs len));
])
| None, `Ascii -> Lang_string.escape_hex_char
| None, `Utf8 -> Lang_string.escape_utf8_char
| None, `Utf8 -> Lang_string.escape_utf8_char ~strict:false
in
let next =
match encoding with
Expand Down Expand Up @@ -213,7 +293,8 @@ let _ =
match Lang.to_string format with
| "octal" -> (Lang_string.escape_octal_char, Lang_string.ascii_next)
| "hex" -> (Lang_string.escape_hex_char, Lang_string.ascii_next)
| "utf8" -> (Lang_string.escape_utf8_char, Lang_string.utf8_next)
| "utf8" ->
(Lang_string.escape_utf8_char ~strict:false, Lang_string.utf8_next)
| _ ->
raise
(Error.Invalid_value
Expand Down Expand Up @@ -264,15 +345,6 @@ let _ =
let s = Lang.to_string (List.assoc "" p) in
Lang.string (Lang_string.unescape_string s))

let _ =
Lang.add_builtin ~base:string "length" ~category:`String
~descr:"Get the length of a string."
[("", Lang.string_t, None, None)]
Lang.int_t
(fun p ->
let string = Lang.to_string (List.assoc "" p) in
Lang.int (String.length string))

let _ =
Lang.add_builtin ~base:string "sub" ~category:`String
~descr:
Expand All @@ -285,6 +357,7 @@ let _ =
Some
"Return a sub string starting at this position. First position is 0."
);
encoding_option;
( "length",
Lang.int_t,
None,
Expand All @@ -294,9 +367,24 @@ let _ =
(fun p ->
let start = Lang.to_int (List.assoc "start" p) in
let len = Lang.to_int (List.assoc "length" p) in
let _, encoding = get_encoding p in
let string = Lang.to_string (List.assoc "" p) in
Lang.string
(try String.sub string start len with Invalid_argument _ -> ""))
let s =
match encoding with
| `Ascii -> (
try String.sub string start len with Invalid_argument _ -> "")
| `Utf8 -> (
try
let chars = split ~encoding string in
if List.length chars < len + start then ""
else
String.concat ""
(List.filteri
(fun pos _ -> start <= pos && pos < start + len)
chars)
with _ -> "")
in
Lang.string s)

let _ =
Lang.add_builtin ~base:string "index" ~category:`String
Expand Down
22 changes: 13 additions & 9 deletions src/lang/lang_string.ml
Original file line number Diff line number Diff line change
Expand Up @@ -105,15 +105,17 @@ let escape_char ~escape_fun s pos len =
| '\'', 1 -> "\\'"
| _ -> escape_fun s pos len

let escape_utf8_char =
let escape_utf8_char ~strict =
let utf8_char_code s pos len =
try utf8_char_code s pos len with _ -> Uchar.to_int Uchar.rep
try utf8_char_code s pos len
with _ when not strict -> Uchar.to_int Uchar.rep
in
escape_char ~escape_fun:(fun s pos len ->
Printf.sprintf "\\u%04X" (utf8_char_code s pos len))

let escape_utf8_formatter ?(special_char = utf8_special_char) =
escape ~special_char ~escape_char:escape_utf8_char ~next:utf8_next
let escape_utf8_formatter ?(strict = false) ?(special_char = utf8_special_char)
=
escape ~special_char ~escape_char:(escape_utf8_char ~strict) ~next:utf8_next

let escape_hex_char =
escape_char ~escape_fun:(fun s pos len ->
Expand Down Expand Up @@ -153,15 +155,15 @@ let escape_string escape s =
len segments);
Bytes.unsafe_to_string b)

let escape_utf8_string ?special_char =
escape_string (escape_utf8_formatter ?special_char)
let escape_utf8_string ?strict ?special_char =
escape_string (escape_utf8_formatter ?strict ?special_char)

let escape_ascii_string ?special_char =
escape_string (escape_ascii_formatter ?special_char)

let quote_utf8_string s =
let quote_utf8_string ?strict s =
Printf.sprintf "\"%s\""
(escape_utf8_string
(escape_utf8_string ?strict
~special_char:(fun s pos len ->
if s.[pos] = '\'' && len = 1 then false
else utf8_special_char s pos len)
Expand All @@ -175,7 +177,9 @@ let quote_ascii_string s =
else ascii_special_char s pos len)
s)

let quote_string s = try quote_utf8_string s with _ -> quote_ascii_string s
let quote_string s =
try quote_utf8_string ~strict:true s with _ -> quote_ascii_string s

let unescape_utf8_pattern = "\\\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]"
let unescape_hex_pattern = "\\\\x[0-9a-fA-F][0-9a-fA-F]"
let unescape_octal_pattern = "\\\\[0-9][0-9][0-9]"
Expand Down
10 changes: 7 additions & 3 deletions src/lang/lang_string.mli
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@ val ascii_next : 'a -> int -> int
val escape_char :
escape_fun:(string -> int -> int -> string) -> string -> int -> int -> string

val escape_utf8_char : string -> int -> int -> string
val escape_utf8_char : strict:bool -> string -> int -> int -> string

val escape_utf8_formatter :
?strict:bool ->
?special_char:(string -> int -> int -> bool) ->
string ->
[> `Orig of int * int | `Subst of string * int ] list * int
Expand All @@ -39,12 +40,15 @@ val escape_string :
string

val escape_utf8_string :
?special_char:(string -> int -> int -> bool) -> string -> string
?strict:bool ->
?special_char:(string -> int -> int -> bool) ->
string ->
string

val escape_ascii_string :
?special_char:(string -> int -> int -> bool) -> string -> string

val quote_utf8_string : string -> string
val quote_utf8_string : ?strict:bool -> string -> string
val quote_ascii_string : string -> string
val quote_string : string -> string
val unescape_utf8_pattern : string
Expand Down
32 changes: 32 additions & 0 deletions tests/language/string.liq
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,37 @@ def f() =
"blo#{(1, 2, 3)}",
"blo(1, 2, 3)"
)

s = "王^小東="
test.equal(string.length(s), 5)
test.equal(string.chars(s), ["王", "^", "小", "東", "="])
test.equal(string.sub(start=1, length=2, s), "^小")
test.equal(string.length(encoding="ascii", s), 11)
test.equal(
string.chars(encoding="ascii", s),
[
"\xE7",
"\x8E",
"\x8B",
"^",
"\xE5",
"\xB0",
"\x8F",
"\xE6",
"\x9D",
"\xB1",
"="
]
)
test.equal(string.sub(encoding="ascii", start=1, length=2, s), "\x8E\x8B")

try
string.chars(encoding="utf16le", s)
test.fail()
catch e : [error.invalid] do
()
end

test.pass()
end

Expand Down Expand Up @@ -171,6 +202,7 @@ def test_escape_html() =
test.equal(string.escape.html("\\"), "\\")
test.equal(string.escape.html("/"), "/")
test.equal(string.escape.html("`"), "`")

test.pass()
end

Expand Down