Skip to content

Commit 3b4b9a7

Browse files
authored
Take encoding into account in string functions. (#4111)
1 parent 893690d commit 3b4b9a7

File tree

15 files changed

+268
-62
lines changed

15 files changed

+268
-62
lines changed

CHANGES.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ New:
5353
- Add `string.of_int` and `string.spaces`.
5454
- Add `list.assoc.nullable`.
5555
- Add `source.cue` (#3620).
56+
- Add `string.chars` (#4111)
5657
- Added atomic file write operations.
5758

5859
Changed:
@@ -64,6 +65,7 @@ Changed:
6465
- Changed internal metadata format to be immutable (#3297).
6566
- Allow a getter for the offset of `on_offset` and dropped the metadata
6667
mechanism for updating it (#3355).
68+
- `string.length` and `string.sub` now default to `utf8` encoding (#4109)
6769
- Disable output paging when `TERM` environment variable is not set.
6870
- Allow running as `root` user inside `docker` container by default (#3406).
6971
- Run `check_next` before playlist's requests resolutions (#3625)

doc/content/migrating.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,14 @@ end
9292
However, EBU R128 data is now extracted directly from metadata when available.
9393
So `replaygain` cannot control the gain type via this parameter anymore.
9494

95+
### String functions
96+
97+
Some string functions have been updated to account for string encoding. In particular, `string.length` and `string.sub` now assume that their
98+
given string is in `utf8` by default.
99+
100+
While this is what most users expect, this can lead to backward incompatibilities and new exceptions. You can change back to the old default by
101+
passing `encoding="ascii"` to these functions or by setting `settings.string.default_encoding` to `"ascii"`.
102+
95103
### `check_next`
96104

97105
`check_next` in playlist operators is now called _before_ the request is resolved, to make it possible to cut out

src/core/builtins/builtins_string_extra.ml

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,32 @@
2020
2121
*****************************************************************************)
2222

23+
(* Logger created with the ["lang"; "string"] label path. *)
let log = Log.make ["lang"; "string"]

(* Configuration node plugged under the "string" key of [Configure.conf]. *)
let conf_string =
  Dtools.Conf.void ~p:(Configure.conf#plug "string") "String settings"

(* Declare the "default_encoding" string setting (default "utf8") below
   [conf_string] and mirror every change of its value into
   [Liquidsoap_lang.Builtins_string.default_encoding]. *)
let () =
  (* Translate the textual setting into an encoding tag. Unknown values are
     reported through [log] and fall back to [`Utf8]. *)
  let encoding_of_setting = function
    | "ascii" -> `Ascii
    | "utf8" -> `Utf8
    | other ->
        log#important
          "Invalid value %s for `settings.string.default_encoding`! \
           Should be one of: \"ascii\" or \"utf8\"."
          other;
        `Utf8
  in
  let conf_default_encoding =
    Dtools.Conf.string
      ~p:(conf_string#plug "default_encoding")
      ~d:"utf8"
      "Default encoding for `string.length`, `string.chars` and `string.sub`"
  in
  conf_default_encoding#on_change (fun value ->
      Liquidsoap_lang.Builtins_string.default_encoding
      := encoding_of_setting value)
48+
2349
let string = Liquidsoap_lang.Builtins_string.string
2450
let string_annotate = Lang.add_module ~base:string "annotate"
2551

src/lang/builtins_regexp.ml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ let escape_regex_descr =
4949
else Lang_string.utf8_special_char s pos len)
5050
~escape_char:(fun s pos len ->
5151
if s.[pos] = '/' && len = 1 then "\\/"
52-
else Lang_string.escape_utf8_char s pos len)
52+
else Lang_string.escape_utf8_char ~strict:false s pos len)
5353
~next:Lang_string.utf8_next
5454
in
5555
Lang_string.escape_string escape_regex_formatter

src/lang/builtins_string.ml

Lines changed: 101 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,86 @@ let _ =
5252
let l = List.map Lang.to_string l in
5353
Lang.string (String.concat sep l))
5454

55+
(** [split ~encoding s] cuts [s] into a list of one-character strings, in
    order.

    - With [`Ascii], every byte of [s] is its own character.
    - With [`Utf8], each element is the UTF-8 encoding of one decoded
      Unicode scalar value.

    @raise Failure if [encoding] is [`Utf8] and [s] contains a byte sequence
    that does not decode as valid UTF-8. *)
let split ~encoding s =
  let total = String.length s in
  (* Re-encode a decoded scalar value as a standalone UTF-8 string. *)
  let utf8_of_uchar u =
    let b = Buffer.create 4 in
    Buffer.add_utf_8_uchar b u;
    Buffer.contents b
  in
  (* [decode_at i] returns the character starting at byte offset [i] together
     with the number of bytes it occupies in [s]. *)
  let decode_at =
    match encoding with
    | `Ascii -> fun i -> (String.make 1 s.[i], 1)
    | `Utf8 ->
        fun i ->
          let decoded = String.get_utf_8_uchar s i in
          if Uchar.utf_decode_is_valid decoded then
            ( utf8_of_uchar (Uchar.utf_decode_uchar decoded),
              Uchar.utf_decode_length decoded )
          else failwith "Decoding failed!"
  in
  (* Tail-recursive walk over [s]; characters are accumulated in reverse. *)
  let rec walk acc i =
    if i = total then List.rev acc
    else (
      let piece, width = decode_at i in
      walk (piece :: acc) (i + width))
  in
  walk [] 0
81+
82+
(* Encoding used by the string builtins when the caller passes no explicit
   [encoding] argument. The type is pinned so that the cell cannot be left
   with an unresolved (weak) polymorphic variant type. *)
let default_encoding : [ `Ascii | `Utf8 ] ref = ref `Utf8

(* Shared declaration of the optional, nullable [encoding] argument
   (label, type, default value, documentation) for builtins that take one. *)
let encoding_option =
  ( "encoding",
    Lang.nullable_t Lang.string_t,
    Some Lang.null,
    Some
      "Encoding used to split characters. Should be one of: `\"utf8\"` or \
       `\"ascii\"`" )

(* [get_encoding p] reads the "encoding" entry of the argument list [p] and
   returns [(name, tag)]: the encoding's textual name, for error messages,
   and the matching variant tag.

   When the argument is null, the tag comes from [default_encoding] and the
   name is derived from that tag, so both always agree even when the default
   has been switched to ASCII.

   Any string other than "utf8" or "ascii" raises a runtime error of kind
   "invalid" at the position of [p]. *)
let get_encoding p =
  let name_of = function `Ascii -> "ascii" | `Utf8 -> "utf8" in
  match Lang.to_valued_option Lang.to_string (List.assoc "encoding" p) with
  | None ->
      let encoding = !default_encoding in
      (name_of encoding, encoding)
  | Some "utf8" -> ("utf8", `Utf8)
  | Some "ascii" -> ("ascii", `Ascii)
  | Some _ ->
      Runtime_error.raise ~pos:(Lang.pos p) ~message:"Invalid encoding!"
        "invalid"
100+
101+
(* Register the `string.chars` builtin: split the unlabeled string argument
   into a list of one-character strings using the requested (or default)
   encoding.

   Only the [Failure] raised by [split] on undecodable input is turned into
   a runtime error of kind "invalid"; any other exception propagates
   unchanged instead of being reported as an encoding problem. *)
let _ =
  Lang.add_builtin ~base:string "chars" ~category:`String
    ~descr:"Split string into characters. Raises `error.invalid` on errors."
    [encoding_option; ("", Lang.string_t, None, None)]
    (Lang.list_t Lang.string_t)
    (fun p ->
      let enc, encoding = get_encoding p in
      let s = Lang.to_string (List.assoc "" p) in
      match split ~encoding s with
        | chars -> Lang.list (List.map Lang.string chars)
        | exception Failure _ ->
            Runtime_error.raise ~pos:(Lang.pos p)
              ~message:
                (Printf.sprintf
                   "String cannot be split using encoding `\"%s\"`!" enc)
              "invalid")
116+
117+
(* Register the `string.length` builtin: count the characters of the
   unlabeled string argument according to the requested (or default)
   encoding, i.e. bytes for "ascii" and decoded characters for "utf8".

   Only the [Failure] raised by [split] on undecodable input is turned into
   a runtime error of kind "invalid"; any other exception propagates
   unchanged instead of being reported as an encoding problem. *)
let _ =
  Lang.add_builtin ~base:string "length" ~category:`String
    ~descr:
      "Return the string's length using the given encoding. Raises \
       `error.invalid` on errors."
    [encoding_option; ("", Lang.string_t, None, None)]
    Lang.int_t
    (fun p ->
      let enc, encoding = get_encoding p in
      let s = Lang.to_string (List.assoc "" p) in
      match split ~encoding s with
        | chars -> Lang.int (List.length chars)
        | exception Failure _ ->
            Runtime_error.raise ~pos:(Lang.pos p)
              ~message:
                (Printf.sprintf
                   "String cannot be split using encoding `\"%s\"`!" enc)
              "invalid")
134+
55135
let _ =
56136
Lang.add_builtin ~base:string "nth" ~category:`String
57137
~descr:
@@ -165,7 +245,7 @@ let string_escape =
165245
("", Lang.string (String.sub s ofs len));
166246
])
167247
| None, `Ascii -> Lang_string.escape_hex_char
168-
| None, `Utf8 -> Lang_string.escape_utf8_char
248+
| None, `Utf8 -> Lang_string.escape_utf8_char ~strict:false
169249
in
170250
let next =
171251
match encoding with
@@ -213,7 +293,8 @@ let _ =
213293
match Lang.to_string format with
214294
| "octal" -> (Lang_string.escape_octal_char, Lang_string.ascii_next)
215295
| "hex" -> (Lang_string.escape_hex_char, Lang_string.ascii_next)
216-
| "utf8" -> (Lang_string.escape_utf8_char, Lang_string.utf8_next)
296+
| "utf8" ->
297+
(Lang_string.escape_utf8_char ~strict:false, Lang_string.utf8_next)
217298
| _ ->
218299
raise
219300
(Error.Invalid_value
@@ -264,15 +345,6 @@ let _ =
264345
let s = Lang.to_string (List.assoc "" p) in
265346
Lang.string (Lang_string.unescape_string s))
266347

267-
let _ =
268-
Lang.add_builtin ~base:string "length" ~category:`String
269-
~descr:"Get the length of a string."
270-
[("", Lang.string_t, None, None)]
271-
Lang.int_t
272-
(fun p ->
273-
let string = Lang.to_string (List.assoc "" p) in
274-
Lang.int (String.length string))
275-
276348
let _ =
277349
Lang.add_builtin ~base:string "sub" ~category:`String
278350
~descr:
@@ -285,6 +357,7 @@ let _ =
285357
Some
286358
"Return a sub string starting at this position. First position is 0."
287359
);
360+
encoding_option;
288361
( "length",
289362
Lang.int_t,
290363
None,
@@ -294,9 +367,24 @@ let _ =
294367
(fun p ->
295368
let start = Lang.to_int (List.assoc "start" p) in
296369
let len = Lang.to_int (List.assoc "length" p) in
370+
let _, encoding = get_encoding p in
297371
let string = Lang.to_string (List.assoc "" p) in
298-
Lang.string
299-
(try String.sub string start len with Invalid_argument _ -> ""))
372+
let s =
373+
match encoding with
374+
| `Ascii -> (
375+
try String.sub string start len with Invalid_argument _ -> "")
376+
| `Utf8 -> (
377+
try
378+
let chars = split ~encoding string in
379+
if List.length chars < len + start then ""
380+
else
381+
String.concat ""
382+
(List.filteri
383+
(fun pos _ -> start <= pos && pos < start + len)
384+
chars)
385+
with _ -> "")
386+
in
387+
Lang.string s)
300388

301389
let _ =
302390
Lang.add_builtin ~base:string "index" ~category:`String

src/lang/lang_string.ml

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -105,15 +105,17 @@ let escape_char ~escape_fun s pos len =
105105
| '\'', 1 -> "\\'"
106106
| _ -> escape_fun s pos len
107107

108-
let escape_utf8_char =
108+
let escape_utf8_char ~strict =
109109
let utf8_char_code s pos len =
110-
try utf8_char_code s pos len with _ -> Uchar.to_int Uchar.rep
110+
try utf8_char_code s pos len
111+
with _ when not strict -> Uchar.to_int Uchar.rep
111112
in
112113
escape_char ~escape_fun:(fun s pos len ->
113114
Printf.sprintf "\\u%04X" (utf8_char_code s pos len))
114115

115-
let escape_utf8_formatter ?(special_char = utf8_special_char) =
116-
escape ~special_char ~escape_char:escape_utf8_char ~next:utf8_next
116+
let escape_utf8_formatter ?(strict = false) ?(special_char = utf8_special_char)
117+
=
118+
escape ~special_char ~escape_char:(escape_utf8_char ~strict) ~next:utf8_next
117119

118120
let escape_hex_char =
119121
escape_char ~escape_fun:(fun s pos len ->
@@ -153,15 +155,15 @@ let escape_string escape s =
153155
len segments);
154156
Bytes.unsafe_to_string b)
155157

156-
let escape_utf8_string ?special_char =
157-
escape_string (escape_utf8_formatter ?special_char)
158+
let escape_utf8_string ?strict ?special_char =
159+
escape_string (escape_utf8_formatter ?strict ?special_char)
158160

159161
let escape_ascii_string ?special_char =
160162
escape_string (escape_ascii_formatter ?special_char)
161163

162-
let quote_utf8_string s =
164+
let quote_utf8_string ?strict s =
163165
Printf.sprintf "\"%s\""
164-
(escape_utf8_string
166+
(escape_utf8_string ?strict
165167
~special_char:(fun s pos len ->
166168
if s.[pos] = '\'' && len = 1 then false
167169
else utf8_special_char s pos len)
@@ -175,7 +177,9 @@ let quote_ascii_string s =
175177
else ascii_special_char s pos len)
176178
s)
177179

178-
let quote_string s = try quote_utf8_string s with _ -> quote_ascii_string s
180+
let quote_string s =
181+
try quote_utf8_string ~strict:true s with _ -> quote_ascii_string s
182+
179183
let unescape_utf8_pattern = "\\\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]"
180184
let unescape_hex_pattern = "\\\\x[0-9a-fA-F][0-9a-fA-F]"
181185
let unescape_octal_pattern = "\\\\[0-9][0-9][0-9]"

src/lang/lang_string.mli

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,10 @@ val ascii_next : 'a -> int -> int
1616
val escape_char :
1717
escape_fun:(string -> int -> int -> string) -> string -> int -> int -> string
1818

19-
val escape_utf8_char : string -> int -> int -> string
19+
val escape_utf8_char : strict:bool -> string -> int -> int -> string
2020

2121
val escape_utf8_formatter :
22+
?strict:bool ->
2223
?special_char:(string -> int -> int -> bool) ->
2324
string ->
2425
[> `Orig of int * int | `Subst of string * int ] list * int
@@ -39,12 +40,15 @@ val escape_string :
3940
string
4041

4142
val escape_utf8_string :
42-
?special_char:(string -> int -> int -> bool) -> string -> string
43+
?strict:bool ->
44+
?special_char:(string -> int -> int -> bool) ->
45+
string ->
46+
string
4347

4448
val escape_ascii_string :
4549
?special_char:(string -> int -> int -> bool) -> string -> string
4650

47-
val quote_utf8_string : string -> string
51+
val quote_utf8_string : ?strict:bool -> string -> string
4852
val quote_ascii_string : string -> string
4953
val quote_string : string -> string
5054
val unescape_utf8_pattern : string

src/libs/extra/audio.liq

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -303,9 +303,9 @@ end
303303
def replaces dtmf(~duration=0.1, ~delay=0.05, dtmf) =
304304
l = ref([])
305305
for i = 0 to
306-
string.length(dtmf) - 1
306+
string.length(encoding="ascii", dtmf) - 1
307307
do
308-
c = string.sub(dtmf, start=i, length=1)
308+
c = string.sub(encoding="ascii", dtmf, start=i, length=1)
309309
let (row, col) =
310310
if
311311
c == "1"

src/libs/file.liq

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -260,15 +260,16 @@ def file.metadata.flac.cover.decode(s) =
260260
def read_int() =
261261
ret =
262262
string.binary.to_int(
263-
little_endian=false, string.sub(s, start=i(), length=4)
263+
little_endian=false,
264+
string.sub(encoding="ascii", s, start=i(), length=4)
264265
)
265266

266267
i := i() + 4
267268
ret
268269
end
269270

270271
def read_string(len) =
271-
ret = string.sub(s, start=i(), length=len)
272+
ret = string.sub(encoding="ascii", s, start=i(), length=len)
272273
i := i() + len
273274
(ret : string)
274275
end
@@ -284,7 +285,7 @@ def file.metadata.flac.cover.decode(s) =
284285
number_of_colors = read_int()
285286
number_of_colors = number_of_colors > 0 ? null(number_of_colors) : null()
286287
data_len = read_int()
287-
data = string.sub(s, start=i(), length=data_len)
288+
data = string.sub(encoding="ascii", s, start=i(), length=data_len)
288289
if
289290
data == ""
290291
then
@@ -320,18 +321,18 @@ def file.metadata.flac.cover.encode(
320321
data
321322
) =
322323
def encode_string(s) =
323-
len = 1 + (string.length(s) / 8)
324+
len = 1 + (string.length(encoding="ascii", s) / 8)
324325
str_len = string.binary.of_int(little_endian=false, pad=4, len)
325326
if
326-
string.length(str_len) > 4
327+
string.length(encoding="ascii", str_len) > 4
327328
then
328329
error.raise(
329330
error.invalid,
330331
"Data length too long for APIC format!"
331332
)
332333
end
333334

334-
pad = string.make(char_code=0, len * 8 - string.length(s))
335+
pad = string.make(char_code=0, len * 8 - string.length(encoding="ascii", s))
335336
(str_len, "#{s}#{pad}")
336337
end
337338

0 commit comments

Comments
 (0)