Truncate string in Julia
In the naive ASCII world:
"""
    truncate_ascii(s, n)

Return a copy of the first `n` bytes of `s`, or all of `s` when it holds fewer than
`n` bytes. The cut is made by byte position, so this is only meant for ASCII text.
"""
function truncate_ascii(s, n)
    nbytes = min(n, sizeof(s))
    return s[1:nbytes]
end
would do. If it's preferable to share memory with the original string and avoid copying, SubString can be used:
"""
    truncate_ascii(s, n)

Return a `SubString` view of the first `n` bytes of `s` (all of `s` when it holds
fewer than `n` bytes). The result references `s` instead of copying it. The cut is
made by byte position, so this is only meant for ASCII text.
"""
function truncate_ascii(s, n)
    stop = min(n, sizeof(s))
    return SubString(s, 1, stop)
end
But in a Unicode world (and it is a Unicode world) this is better:
"""
    truncate_utf8(s, n)

Return a `SubString` holding the first `n` characters (code points) of `s`, or all of
`s` when it has fewer than `n` characters. Nothing is copied and the length of `s` is
never computed; the scan stops as soon as the end of the string is reached.
"""
function truncate_utf8(s, n)
    nbytes = sizeof(s)
    stop = 0                      # index of the last character kept; 0 means "none yet"
    for _ in 1:n
        candidate = nextind(s, stop)
        # Past the final byte: the string has fewer than n characters, keep all of it.
        candidate > nbytes && break
        stop = candidate
    end
    return SubString(s, 1, stop)
end
Finally, @IsmaelVenegasCastelló reminded us of grapheme complexity (arrrgh), and then this is what's needed:
"""
    truncate_grapheme(s, n)

Return a `SubString` holding the first `n` grapheme clusters of `s`, or all of `s`
when it has fewer than `n`. A cluster made of several code points (for example a
letter followed by a combining accent) is always kept or dropped as a whole.

The string is scanned only as far as needed and no characters are copied.
On Julia 0.7 and later `graphemes` is provided by the `Unicode` standard library
(`using Unicode`).
"""
function truncate_grapheme(s, n)
    nbytes = 0                    # total size in bytes of the clusters kept so far
    taken = 0                     # number of clusters kept so far
    for cluster in graphemes(s)
        taken < n || break
        nbytes += sizeof(cluster)
        taken += 1
    end
    # nbytes + 1 is where the first dropped cluster starts (or one past the end of s);
    # stepping back one character gives the index of the last character to keep.
    # With nothing kept this is prevind(s, 1) == 0, which yields an empty SubString.
    return SubString(s, 1, prevind(s, nbytes + 1))
end
These last two implementations try to avoid calculating the length (which can be slow),
allocating/copying, or even looping n times when the string is shorter than n.
This answer draws on contributions of @MichaelOhlrogge, @FengyangWang, @Oxinabox and @IsmaelVenegasCastelló
I would do:
strtruncate(str, n) = join(take(str, n))
Example:
julia> strtruncate("αβγδ", 3)
"αβγ"
julia> strtruncate("αβγδ", 5)
"αβγδ"
Note that your code is not fully valid for Unicode strings.
If the string is ASCII, this is pretty efficient:
# Shorten the string's byte vector to n bytes and wrap the result in a String.
# NOTE: resize! works on str.data itself, so `str` is modified by this call as well.
String(resize!(str.data, n))
Or in-place:
# Shorten the byte vector behind `str` to n bytes; `str` itself is modified.
resize!(str.data, n)
For Unicode, @Fengyang Wang's method is very fast, but converting to a Char
array can be slightly faster if you only truncate the very end of the string:
"""
    trunc1(str::String, n)

Return the first `n` characters of `str` as a new `String` (all of `str` when it has
fewer than `n` characters), by collecting them into a character vector first.
"""
function trunc1(str::String, n)
    chars = collect(take(str, n))
    return String(chars)
end
"""
    trunc2(str::String, n)

Return the first `n` characters of `str` as a new `String`, by converting the whole
string to a character vector and slicing it. When `str` has fewer than `n` characters
the whole string is returned.
"""
function trunc2(str::String, n)
    chars = collect(str)
    # Clamp so that asking for more characters than exist does not index out of bounds.
    return String(chars[1:min(n, length(chars))])
end
"""
    trunc3(str::String, n)

Return the first `n` characters of `str` as a new `String`, by converting the whole
string to a character vector and shrinking that vector in place. When `str` has fewer
than `n` characters the whole string is returned.
"""
function trunc3(str::String, n)
    chars = collect(str)
    # Clamp: resize! to a larger size would append uninitialised characters.
    return String(resize!(chars, min(n, length(chars))))
end
"""
    trunc4(str::String, n::Int)

Return the first `n` grapheme clusters of `str` joined into a new `String`. When `str`
has fewer than `n` clusters the whole string is returned.
"""
function trunc4(str::String, n::Int)::String
    clusters = collect(graphemes(str))
    # Clamp so that asking for more clusters than exist does not index out of bounds.
    return join(clusters[1:min(n, length(clusters))])
end
# Truncate `str` to `n` characters, taking a byte-level shortcut for ASCII input and
# falling back to trunc1 otherwise.
# NOTE: the ASCII path calls resize! on str.data itself, so the input string is
# modified as well.
function trunc5(str::String, n)
    isascii(str) || return trunc1(str, n)
    return String(resize!(str.data, n))
end
Timing:
julia> time_trunc(100, 100000, 25)
0.112851 seconds (700.00 k allocations: 42.725 MB, 7.75% gc time)
0.165806 seconds (700.00 k allocations: 91.553 MB, 11.84% gc time)
0.160116 seconds (600.00 k allocations: 73.242 MB, 11.58% gc time)
1.167706 seconds (31.60 M allocations: 1.049 GB, 11.12% gc time)
0.017833 seconds (100.00 k allocations: 1.526 MB)
true
julia> time_trunc(100, 100000, 98)
0.367191 seconds (700.00 k allocations: 83.923 MB, 5.23% gc time)
0.318507 seconds (700.00 k allocations: 132.751 MB, 9.08% gc time)
0.301685 seconds (600.00 k allocations: 80.872 MB, 6.19% gc time)
1.561337 seconds (31.80 M allocations: 1.122 GB, 9.86% gc time)
0.061827 seconds (100.00 k allocations: 1.526 MB)
true
Edit: Whoops... I just realized that I'm actually destroying the original string in trunc5.
This should be correct, but with somewhat worse performance:
# Truncate `str` to `n` characters without modifying `str`.
# ASCII input is cut by byte range, which copies the selected bytes into a new String;
# any other input is handed to trunc1. When `str` is shorter than `n` the whole string
# is returned.
function trunc5(str::String, n)
    if isascii(str)
        # One byte per character here, so the byte count doubles as the length;
        # clamping keeps the range inside the string.
        return str[1:min(n, sizeof(str))]
    else
        return trunc1(str, n)
    end
end
New timings:
julia> time_trunc(100, 100000, 25)
0.123629 seconds (700.00 k allocations: 42.725 MB, 7.70% gc time)
0.162332 seconds (700.00 k allocations: 91.553 MB, 11.41% gc time)
0.152473 seconds (600.00 k allocations: 73.242 MB, 9.19% gc time)
1.152640 seconds (31.60 M allocations: 1.049 GB, 11.54% gc time)
0.066662 seconds (200.00 k allocations: 12.207 MB)
true
julia> time_trunc(100, 100000, 98)
0.369576 seconds (700.00 k allocations: 83.923 MB, 5.10% gc time)
0.312237 seconds (700.00 k allocations: 132.751 MB, 9.42% gc time)
0.297736 seconds (600.00 k allocations: 80.872 MB, 5.95% gc time)
1.545329 seconds (31.80 M allocations: 1.122 GB, 10.02% gc time)
0.080399 seconds (200.00 k allocations: 19.836 MB, 5.07% gc time)
true
Aaand new edit: Aargh, forgot the timing function. I'm inputting an ascii string:
# Benchmark trunc1 … trunc5 on a random string of length `m`: each one is called `n`
# times with truncation length `m_` under @time, and the function finally returns
# whether all five produce the same result.
function time_trunc(m, n, m_)
    str = randstring(m)
    @time for _ in 1:n
        trunc1(str, m_)
    end
    @time for _ in 1:n
        trunc2(str, m_)
    end
    @time for _ in 1:n
        trunc3(str, m_)
    end
    @time for _ in 1:n
        trunc4(str, m_)
    end
    @time for _ in 1:n
        trunc5(str, m_)
    end
    return trunc1(str, m_) == trunc2(str, m_) == trunc3(str, m_) ==
        trunc4(str, m_) == trunc5(str, m_)
end
Final edit (I hope):
Trying out @Dan Getz's truncate_grapheme
and using unicode strings:
# Benchmark trunc1, trunc2, trunc3, trunc5 and truncate_grapheme on a fixed non-ASCII
# string: each one is called `n` times with truncation length `m_` under @time, and
# the function finally returns whether all of them produce the same result.
function time_trunc(m, n, m_)
    # The randstring(m) input of the ASCII benchmark is replaced by this string.
    str = repeat("αβγπϕ1t_Ω₃!", 100)
    @time for _ in 1:n
        trunc1(str, m_)
    end
    @time for _ in 1:n
        trunc2(str, m_)
    end
    @time for _ in 1:n
        trunc3(str, m_)
    end
    # trunc4 is left out of this run: too slow.
    @time for _ in 1:n
        trunc5(str, m_)
    end
    @time for _ in 1:n
        truncate_grapheme(str, m_)
    end
    return trunc1(str, m_) == trunc2(str, m_) == trunc3(str, m_) ==
        trunc5(str, m_) == truncate_grapheme(str, m_)
end
Timing:
julia> time_trunc(100, 100000, 98)
0.690399 seconds (800.00 k allocations: 103.760 MB, 3.69% gc time)
1.828437 seconds (800.00 k allocations: 534.058 MB, 3.66% gc time)
1.795005 seconds (700.00 k allocations: 482.178 MB, 3.19% gc time)
0.667831 seconds (800.00 k allocations: 103.760 MB, 3.17% gc time)
0.347953 seconds (100.00 k allocations: 3.052 MB)
true
julia> time_trunc(100, 100000, 25)
0.282922 seconds (800.00 k allocations: 48.828 MB, 4.01% gc time)
1.576374 seconds (800.00 k allocations: 479.126 MB, 3.98% gc time)
1.643700 seconds (700.00 k allocations: 460.815 MB, 3.70% gc time)
0.276586 seconds (800.00 k allocations: 48.828 MB, 4.59% gc time)
0.091773 seconds (100.00 k allocations: 3.052 MB)
true
So the last one seems clearly the best (and this post is now way too long.)