Skip to content

Commit 5d045df

Browse files
authored
export and document transcode (#17323)
* export and document transcode from #16974, add transcode(String, x) and transcode(T, ::String) convenience methods
* docs
* support UTF-32 in transcode
* don't use splatting for UTF-32 to String conversion
* typo
* eliminate method ambiguities
* re-run genstdlib
* doc clarification
* typo
1 parent 426f202 commit 5d045df

File tree

11 files changed

+63
-22
lines changed

11 files changed

+63
-22
lines changed

NEWS.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,9 @@ Library improvements
131131
`String(s)`, `unsafe_string(ptr)` (formerly `bytestring(ptr)`), and
132132
`unsafe_wrap(String, ptr)` (formerly `pointer_to_string`) ([#16731]).
133133

134+
* A `transcode(T, src)` function is now exported for converting data
135+
between UTF-xx Unicode encodings ([#17323]).
136+
134137
* Most of the combinatorics functions have been moved from `Base`
135138
to the [Combinatorics.jl package](https://github.com/JuliaLang/Combinatorics.jl) ([#13897]).
136139

@@ -321,4 +324,5 @@ Deprecated or removed
321324
[#17075]: https://github.com/JuliaLang/julia/issues/17075
322325
[#17266]: https://github.com/JuliaLang/julia/issues/17266
323326
[#17300]: https://github.com/JuliaLang/julia/issues/17300
327+
[#17323]: https://github.com/JuliaLang/julia/issues/17323
324328
[#17374]: https://github.com/JuliaLang/julia/issues/17374

base/c.jl

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -128,20 +128,39 @@ function cwstring(s::AbstractString)
128128
end
129129
end
130130

131-
# transcoding between data in UTF-8 and UTF-16 for Windows APIs
131+
# transcoding between data in UTF-8 and UTF-16 for Windows APIs,
132+
# and also UTF-32 for APIs using Cwchar_t on other platforms.
133+
132134
"""
133-
Base.transcode(T,src::Vector{U})
135+
transcode(T, src)
136+
137+
Convert string data between Unicode encodings. `src` is either a
138+
`String` or a `Vector{UIntXX}` of UTF-XX code units, where
139+
`XX` is 8, 16, or 32. `T` indicates the encoding of the return value:
140+
`String` to return a (UTF-8 encoded) `String` or `UIntXX`
141+
to return a `Vector{UIntXX}` of UTF-`XX` data. (The alias `Cwchar_t`
142+
can also be used as the integer type, for converting `wchar_t*` strings
143+
used by external C libraries.)
134144
135-
Transcodes unicode data `src` to a different encoding, where `U` and `T` are the integers
136-
denoting the input and output code units. Currently supported are UTF-8 and UTF-16, which
137-
are denoted by integers `UInt8` and `UInt16`, respectively.
145+
The `transcode` function succeeds as long as the input data can be
146+
reasonably represented in the target encoding; it always succeeds for
147+
conversions between UTF-XX encodings, even for invalid Unicode data.
138148
139-
NULs are handled like any other character (i.e. the output will be NUL-terminated if and
140-
only if the `src` is).
149+
Only conversions in which the source or the target is UTF-8 are currently supported.
141150
"""
142151
function transcode end
143-
transcode{T<:Union{UInt8,UInt16}}(::Type{T}, src::Vector{T}) = src
144-
transcode(::Type{Int32}, src::Vector{UInt32}) = reinterpret(Int32, src)
152+
153+
transcode{T<:Union{UInt8,UInt16,UInt32,Int32}}(::Type{T}, src::Vector{T}) = src
154+
transcode{T<:Union{Int32,UInt32}}(::Type{T}, src::String) = T[T(c) for c in src]
155+
transcode{T<:Union{Int32,UInt32}}(::Type{T}, src::Vector{UInt8}) = transcode(T, String(src))
156+
function transcode{S<:Union{Int32,UInt32}}(::Type{UInt8}, src::Vector{S})
157+
buf = IOBuffer()
158+
for c in src; print(buf, Char(c)); end
159+
takebuf_array(buf)
160+
end
161+
transcode(::Type{String}, src::String) = src
162+
transcode(T, src::String) = transcode(T, src.data)
163+
transcode(::Type{String}, src) = String(transcode(UInt8, src))
145164

146165
function transcode(::Type{UInt16}, src::Vector{UInt8})
147166
dst = UInt16[]

base/env.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ function access_env(onError::Function, str::AbstractString)
1919
error(string("getenv: ", str, ' ', len, "-1 != ", ret, ": ", Libc.FormatMessage()))
2020
end
2121
pop!(val) # NUL
22-
return String(transcode(UInt8, val))
22+
return transcode(String, val)
2323
end
2424

2525
function _setenv(svar::AbstractString, sval::AbstractString, overwrite::Bool=true)
@@ -97,7 +97,7 @@ function next(hash::EnvHash, block::Tuple{Ptr{UInt16},Ptr{UInt16}})
9797
len = ccall(:wcslen, UInt, (Ptr{UInt16},), pos)
9898
buf = Array{UInt16}(len)
9999
unsafe_copy!(pointer(buf), pos, len)
100-
env = String(transcode(UInt8, buf))
100+
env = transcode(String, buf)
101101
m = match(r"^(=?[^=]+)=(.*)$"s, env)
102102
if m === nothing
103103
error("malformed environment entry: $env")

base/exports.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -874,6 +874,7 @@ export
874874
strip,
875875
strwidth,
876876
summary,
877+
transcode,
877878
ucfirst,
878879
unescape_string,
879880
uppercase,

base/file.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,7 @@ function tempdir()
203203
error("GetTempPath failed: $(Libc.FormatMessage())")
204204
end
205205
resize!(temppath,lentemppath)
206-
return String(transcode(UInt8, temppath))
206+
return transcode(String, temppath)
207207
end
208208
tempname(uunique::UInt32=UInt32(0)) = tempname(tempdir(), uunique)
209209
const temp_prefix = cwstring("jl_")
@@ -216,7 +216,7 @@ function tempname(temppath::AbstractString,uunique::UInt32)
216216
error("GetTempFileName failed: $(Libc.FormatMessage())")
217217
end
218218
resize!(tname,lentname)
219-
return String(transcode(UInt8, tname))
219+
return transcode(String, tname)
220220
end
221221
function mktemp(parent=tempdir())
222222
filename = tempname(parent, UInt32(0))

base/interactiveutil.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ elseif is_windows()
150150
len = 0
151151
while unsafe_load(plock, len+1) != 0; len += 1; end
152152
# get Vector{UInt16}, transcode data to UTF-8, make a String of it
153-
s = String(transcode(UInt8, unsafe_wrap(Array, plock, len)))
153+
s = transcode(String, unsafe_wrap(Array, plock, len))
154154
systemerror(:GlobalUnlock, 0==ccall((:GlobalUnlock, "kernel32"), stdcall, Cint, (Ptr{UInt16},), plock))
155155
return s
156156
end

base/libc.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,7 @@ if is_windows()
277277
buf = Array{UInt16}(len)
278278
unsafe_copy!(pointer(buf), p, len)
279279
ccall(:LocalFree,stdcall,Ptr{Void},(Ptr{Void},),p)
280-
return String(transcode(UInt8, buf))
280+
return transcode(String, buf)
281281
end
282282
end
283283

base/path.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ function realpath(path::AbstractString)
136136
systemerror(:realpath, n == 0)
137137
x = n < length(buf) # is the buffer big enough?
138138
resize!(buf, n) # shrink if x, grow if !x
139-
x && return String(transcode(UInt8, buf))
139+
x && return transcode(String, buf)
140140
end
141141
end
142142

@@ -150,7 +150,7 @@ function longpath(path::AbstractString)
150150
systemerror(:longpath, n == 0)
151151
x = n < length(buf) # is the buffer big enough?
152152
resize!(buf, n) # shrink if x, grow if !x
153-
x && return String(transcode(UInt8, buf))
153+
x && return transcode(String, buf)
154154
end
155155
end
156156

doc/manual/strings.rst

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -352,14 +352,16 @@ exception handling required:
352352
<BLANKLINE>
353353
y
354354

355-
Julia uses UTF-8 encoding by default, and support for new encodings can
355+
Julia uses the UTF-8 encoding by default, and support for new encodings can
356356
be added by packages. For example, the `LegacyStrings.jl
357357
<https://github.com/JuliaArchive/LegacyStrings.jl>`_ package implements
358358
``UTF16String`` and ``UTF32String`` types. Additional discussion of other
359359
encodings and how to implement support for them is beyond the scope of this
360360
document for the time being. For further discussion of UTF-8 encoding issues,
361-
see the section below on `byte array literals <#Byte+Array+Literals>`_,
362-
which goes into some greater detail.
361+
see the section below on `byte array literals <#Byte+Array+Literals>`_.
362+
The :func:`transcode` function is provided to convert data between
363+
the various UTF-xx encodings, primarily for working with external
364+
data and libraries.
363365

364366
.. _man-string-interpolation:
365367

doc/stdlib/strings.rst

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,16 @@
5656
5757
Convert a string to a contiguous byte array representation encoded as UTF-8 bytes. This representation is often appropriate for passing strings to C.
5858

59+
.. function:: transcode(T, src)
60+
61+
.. Docstring generated from Julia source
62+
63+
Convert string data between Unicode encodings. ``src`` is either a ``String`` or a ``Vector{UIntXX}`` of UTF-XX code units, where ``XX`` is 8, 16, or 32. ``T`` indicates the encoding of the return value: ``String`` to return a (UTF-8 encoded) ``String`` or ``UIntXX`` to return a ``Vector{UIntXX}`` of UTF-``XX`` data. (The alias ``Cwchar_t`` can also be used as the integer type, for converting ``wchar_t*`` strings used by external C libraries.)
64+
65+
The ``transcode`` function succeeds as long as the input data can be reasonably represented in the target encoding; it always succeeds for conversions between UTF-XX encodings, even for invalid Unicode data.
66+
67+
Only conversions in which the source or the target is UTF-8 are currently supported.
68+
5969
.. function:: unsafe_string(p::Ptr{UInt8}, [length::Integer])
6070

6171
.. Docstring generated from Julia source
@@ -472,4 +482,3 @@
472482
.. Docstring generated from Julia source
473483
474484
General unescaping of traditional C and Unicode escape sequences. Reverse of :func:`escape_string`\ . See also :func:`unescape_string`\ .
475-

0 commit comments

Comments
 (0)