@@ -52,6 +52,86 @@ let _ =
5252 let l = List. map Lang. to_string l in
5353 Lang. string (String. concat sep l))
5454
55+ let split ~encoding s =
56+ let buf = Buffer. create 1 in
57+ let to_string add c =
58+ Buffer. clear buf;
59+ add buf c;
60+ Buffer. contents buf
61+ in
62+ let get =
63+ match encoding with
64+ | `Ascii -> fun pos -> (to_string Buffer. add_char (String. get s pos), 1 )
65+ | `Utf8 ->
66+ fun pos ->
67+ let d = String. get_utf_8_uchar s pos in
68+ if not (Uchar. utf_decode_is_valid d) then
69+ failwith " Decoding failed!" ;
70+ ( to_string Buffer. add_utf_8_uchar (Uchar. utf_decode_uchar d),
71+ Uchar. utf_decode_length d )
72+ in
73+ let len = String. length s in
74+ let rec f chars pos =
75+ if pos = len then List. rev chars
76+ else (
77+ let char , len = get pos in
78+ f (char :: chars) (pos + len))
79+ in
80+ f [] 0
81+
82+ let default_encoding = ref `Utf8
83+
84+ let encoding_option =
85+ ( " encoding" ,
86+ Lang. nullable_t Lang. string_t,
87+ Some Lang. null,
88+ Some
89+ " Encoding used to split characters. Should be one of: `\" utf8\" ` or \
90+ `\" ascii\" `" )
91+
92+ let get_encoding p =
93+ match Lang. to_valued_option Lang. to_string (List. assoc " encoding" p) with
94+ | None -> (" utf8" , ! default_encoding)
95+ | Some "utf8" -> (" utf8" , `Utf8 )
96+ | Some "ascii" -> (" ascii" , `Ascii )
97+ | _ ->
98+ Runtime_error. raise ~pos: (Lang. pos p) ~message: " Invalid encoding!"
99+ " invalid"
100+
101+ let _ =
102+ Lang. add_builtin ~base: string " chars" ~category: `String
103+ ~descr: " Split string into characters. Raises `error.invalid` on errors."
104+ [encoding_option; (" " , Lang. string_t, None , None )]
105+ (Lang. list_t Lang. string_t)
106+ (fun p ->
107+ let enc, encoding = get_encoding p in
108+ let s = Lang. to_string (List. assoc " " p) in
109+ try Lang. list (List. map Lang. string (split ~encoding s))
110+ with _ ->
111+ Runtime_error. raise ~pos: (Lang. pos p)
112+ ~message:
113+ (Printf. sprintf " String cannot be split using encoding `\" %s\" `!"
114+ enc)
115+ " invalid" )
116+
117+ let _ =
118+ Lang. add_builtin ~base: string " length" ~category: `String
119+ ~descr:
120+ " Return the string's length using the given encoding. Raises \
121+ `error.invalid` on errors."
122+ [encoding_option; (" " , Lang. string_t, None , None )]
123+ Lang. int_t
124+ (fun p ->
125+ let enc, encoding = get_encoding p in
126+ let s = Lang. to_string (List. assoc " " p) in
127+ try Lang. int (List. length (split ~encoding s))
128+ with _ ->
129+ Runtime_error. raise ~pos: (Lang. pos p)
130+ ~message:
131+ (Printf. sprintf " String cannot be split using encoding `\" %s\" `!"
132+ enc)
133+ " invalid" )
134+
55135let _ =
56136 Lang. add_builtin ~base: string " nth" ~category: `String
57137 ~descr:
@@ -165,7 +245,7 @@ let string_escape =
165245 (" " , Lang. string (String. sub s ofs len));
166246 ])
167247 | None , `Ascii -> Lang_string. escape_hex_char
168- | None , `Utf8 -> Lang_string. escape_utf8_char
248+ | None , `Utf8 -> Lang_string. escape_utf8_char ~strict: false
169249 in
170250 let next =
171251 match encoding with
@@ -213,7 +293,8 @@ let _ =
213293 match Lang. to_string format with
214294 | "octal" -> (Lang_string. escape_octal_char, Lang_string. ascii_next)
215295 | "hex" -> (Lang_string. escape_hex_char, Lang_string. ascii_next)
216- | "utf8" -> (Lang_string. escape_utf8_char, Lang_string. utf8_next)
296+ | "utf8" ->
297+ (Lang_string. escape_utf8_char ~strict: false , Lang_string. utf8_next)
217298 | _ ->
218299 raise
219300 (Error. Invalid_value
@@ -264,15 +345,6 @@ let _ =
264345 let s = Lang. to_string (List. assoc " " p) in
265346 Lang. string (Lang_string. unescape_string s))
266347
267- let _ =
268- Lang. add_builtin ~base: string " length" ~category: `String
269- ~descr: " Get the length of a string."
270- [(" " , Lang. string_t, None , None )]
271- Lang. int_t
272- (fun p ->
273- let string = Lang. to_string (List. assoc " " p) in
274- Lang. int (String. length string ))
275-
276348let _ =
277349 Lang. add_builtin ~base: string " sub" ~category: `String
278350 ~descr:
@@ -285,6 +357,7 @@ let _ =
285357 Some
286358 " Return a sub string starting at this position. First position is 0."
287359 );
360+ encoding_option;
288361 ( " length" ,
289362 Lang. int_t,
290363 None ,
@@ -294,9 +367,24 @@ let _ =
294367 (fun p ->
295368 let start = Lang. to_int (List. assoc " start" p) in
296369 let len = Lang. to_int (List. assoc " length" p) in
370+ let _, encoding = get_encoding p in
297371 let string = Lang. to_string (List. assoc " " p) in
298- Lang. string
299- (try String. sub string start len with Invalid_argument _ -> " " ))
372+ let s =
373+ match encoding with
374+ | `Ascii -> (
375+ try String. sub string start len with Invalid_argument _ -> " " )
376+ | `Utf8 -> (
377+ try
378+ let chars = split ~encoding string in
379+ if List. length chars < len + start then " "
380+ else
381+ String. concat " "
382+ (List. filteri
383+ (fun pos _ -> start < = pos && pos < start + len)
384+ chars)
385+ with _ -> " " )
386+ in
387+ Lang. string s)
300388
301389let _ =
302390 Lang. add_builtin ~base: string " index" ~category: `String
0 commit comments