Lua: how to make os.rename & os.remove work with filenames containing unicode characters?
As others noted, you won't be able to do much with the stock version of Lua, as it's using CreateFileA and not the Unicode version of this function (CreateFileW). If you can load external modules, you can use winapi, as it supports retrieval of "short" file names:
-- winapi is an optional third-party module; loading it through pcall keeps the
-- script running on systems where the module cannot be required.
local winapi_loaded, winapi = pcall(require, "winapi")
if winapi_loaded then
  -- NOTE(review): presumably makes winapi treat the strings it exchanges as UTF-8 — confirm in the winapi docs.
  winapi.set_encoding(winapi.CP_UTF8)
  -- `filepath` must be defined by the surrounding code (UTF-8 encoded path).
  local shortpath = winapi.short_path(filepath)
  if shortpath ~= filepath then
    -- have the short path
  end
end
This code should work on all platforms (as it will fail to load winapi on macOS and Linux, where this conversion is not needed). The conversion may still fail if short filenames are not available; their generation can be configured (per drive) in Windows using the fsutil 8dot3name set DRIVE: 0 command.
The renaming will work if you run both the source and the destination file names through the conversion (and delete the destination file, as it may be created by the short_path call).
As lhf pointed out, your code works fine on MacOS.
All you need is a correction for Windows.
The following code is written in pure Lua; it redefines the standard os/io functions so that they work in Windows with UTF-8 filenames.
Please note that your Windows locale must be Greek and all your filenames must contain only symbols from Windows Greek codepage. On Windows in pure Lua, you can't open a file containing arbitrary UTF-8 symbols in its name.
if (os.getenv"os" or ""):match"^Windows" then
-- Lookup table: Unicode code point -> single byte value in the Windows Greek
-- code page (1253). Only non-ASCII characters are listed; utf8_to_1253 passes
-- code points below 128 through unchanged and substitutes '?' for any code
-- point that has no entry here.
local map_unicode_to_1253 = {
-- Bytes 0x80..0x9B: euro sign, typographic quotes, dashes and similar symbols.
[0x20AC] = 0x80,
[0x201A] = 0x82,
[0x0192] = 0x83,
[0x201E] = 0x84,
[0x2026] = 0x85,
[0x2020] = 0x86,
[0x2021] = 0x87,
[0x2030] = 0x89,
[0x2039] = 0x8B,
[0x2018] = 0x91,
[0x2019] = 0x92,
[0x201C] = 0x93,
[0x201D] = 0x94,
[0x2022] = 0x95,
[0x2013] = 0x96,
[0x2014] = 0x97,
[0x2122] = 0x99,
[0x203A] = 0x9B,
-- Bytes 0xA0..0xBF: code points U+00A0..U+00BD that keep their own value,
-- interleaved with U+2015 and code points from the U+0384..U+038F range.
[0x00A0] = 0xA0,
[0x0385] = 0xA1,
[0x0386] = 0xA2,
[0x00A3] = 0xA3,
[0x00A4] = 0xA4,
[0x00A5] = 0xA5,
[0x00A6] = 0xA6,
[0x00A7] = 0xA7,
[0x00A8] = 0xA8,
[0x00A9] = 0xA9,
[0x00AB] = 0xAB,
[0x00AC] = 0xAC,
[0x00AD] = 0xAD,
[0x00AE] = 0xAE,
[0x2015] = 0xAF,
[0x00B0] = 0xB0,
[0x00B1] = 0xB1,
[0x00B2] = 0xB2,
[0x00B3] = 0xB3,
[0x0384] = 0xB4,
[0x00B5] = 0xB5,
[0x00B6] = 0xB6,
[0x00B7] = 0xB7,
[0x0388] = 0xB8,
[0x0389] = 0xB9,
[0x038A] = 0xBA,
[0x00BB] = 0xBB,
[0x038C] = 0xBC,
[0x00BD] = 0xBD,
[0x038E] = 0xBE,
[0x038F] = 0xBF,
-- Bytes 0xC0..0xFE: code points U+0390..U+03CE in order (byte = code point - 0x2D0).
-- There is no entry for U+03A2, so byte 0xD2 is never produced.
[0x0390] = 0xC0,
[0x0391] = 0xC1,
[0x0392] = 0xC2,
[0x0393] = 0xC3,
[0x0394] = 0xC4,
[0x0395] = 0xC5,
[0x0396] = 0xC6,
[0x0397] = 0xC7,
[0x0398] = 0xC8,
[0x0399] = 0xC9,
[0x039A] = 0xCA,
[0x039B] = 0xCB,
[0x039C] = 0xCC,
[0x039D] = 0xCD,
[0x039E] = 0xCE,
[0x039F] = 0xCF,
[0x03A0] = 0xD0,
[0x03A1] = 0xD1,
[0x03A3] = 0xD3,
[0x03A4] = 0xD4,
[0x03A5] = 0xD5,
[0x03A6] = 0xD6,
[0x03A7] = 0xD7,
[0x03A8] = 0xD8,
[0x03A9] = 0xD9,
[0x03AA] = 0xDA,
[0x03AB] = 0xDB,
[0x03AC] = 0xDC,
[0x03AD] = 0xDD,
[0x03AE] = 0xDE,
[0x03AF] = 0xDF,
[0x03B0] = 0xE0,
[0x03B1] = 0xE1,
[0x03B2] = 0xE2,
[0x03B3] = 0xE3,
[0x03B4] = 0xE4,
[0x03B5] = 0xE5,
[0x03B6] = 0xE6,
[0x03B7] = 0xE7,
[0x03B8] = 0xE8,
[0x03B9] = 0xE9,
[0x03BA] = 0xEA,
[0x03BB] = 0xEB,
[0x03BC] = 0xEC,
[0x03BD] = 0xED,
[0x03BE] = 0xEE,
[0x03BF] = 0xEF,
[0x03C0] = 0xF0,
[0x03C1] = 0xF1,
[0x03C2] = 0xF2,
[0x03C3] = 0xF3,
[0x03C4] = 0xF4,
[0x03C5] = 0xF5,
[0x03C6] = 0xF6,
[0x03C7] = 0xF7,
[0x03C8] = 0xF8,
[0x03C9] = 0xF9,
[0x03CA] = 0xFA,
[0x03CB] = 0xFB,
[0x03CC] = 0xFC,
[0x03CD] = 0xFD,
[0x03CE] = 0xFE,
}
-- Local aliases for the standard-library functions used by the conversion helpers.
local char, byte = string.char, string.byte
local table_insert, table_concat = table.insert, table.concat

--- Decode the UTF-8 character that starts at a given byte position.
-- @param utf8str string holding UTF-8 encoded text
-- @param pos     starting byte position inside utf8str (default 1)
-- @return code point of the character, number of bytes it occupies
-- When the bytes at `pos` are not a complete multi-byte sequence (missing or
-- invalid continuation byte), the raw byte at `pos` is returned with length 1.
local function utf8_to_unicode(utf8str, pos)
  local start = pos or 1
  local lead = byte(utf8str, start)

  -- Bytes outside 0xC0..0xFD never start a multi-byte sequence here:
  -- they are handed back unchanged as a one-byte result.
  if not (lead >= 0xC0 and lead < 0xFE) then
    return lead, 1
  end

  -- `value` still carries the unused length-marker bits of the lead byte;
  -- each round strips one of them (limit + 2 accounts for that marker bit
  -- together with the 0x80 tag of the continuation byte being added).
  local value, length, limit = lead - 128, 1, 64
  while true do
    local trail = byte(utf8str, start + length) or 0
    if trail < 0x80 or trail >= 0xC0 then
      -- Truncated or malformed sequence: fall back to the lead byte alone.
      return lead, 1
    end
    value = (value - limit - 2) * 64 + trail
    length = length + 1
    limit = limit * 32
    -- Once the value fits under the limit, every marker bit has been consumed.
    if value < limit then
      return value, length
    end
  end
end
--- Convert a UTF-8 string to its single-byte Windows-1253 representation.
-- ASCII code points are copied as-is, other code points are looked up in
-- map_unicode_to_1253, and anything without a mapping becomes '?'.
-- @param utf8str string holding UTF-8 encoded text
-- @return string with one byte per decoded character
local function utf8_to_1253(utf8str)
  local pieces = {}
  local cursor, total = 1, #utf8str
  while cursor <= total do
    local codepoint, width = utf8_to_unicode(utf8str, cursor)
    cursor = cursor + width
    local out_byte
    if codepoint < 128 then
      out_byte = codepoint
    else
      out_byte = map_unicode_to_1253[codepoint] or byte('?')
    end
    pieces[#pieces + 1] = char(out_byte)
  end
  return table_concat(pieces)
end
-- Replace os.rename with a wrapper that converts both UTF-8 names with
-- utf8_to_1253 before delegating to the stock implementation.
local native_rename = os.rename
os.rename = function(old, new)
  local old_1253 = utf8_to_1253(old)
  local new_1253 = utf8_to_1253(new)
  return native_rename(old_1253, new_1253)
end
-- Replace os.remove with a wrapper that converts the UTF-8 file name with
-- utf8_to_1253 before delegating to the stock implementation.
local native_remove = os.remove
os.remove = function(filename)
  local filename_1253 = utf8_to_1253(filename)
  return native_remove(filename_1253)
end
-- Replace os.execute with a wrapper that converts the UTF-8 command line with
-- utf8_to_1253. A missing (nil/false) command is forwarded untouched.
local native_execute = os.execute
os.execute = function(command)
  if not command then
    return native_execute(command)
  end
  return native_execute(utf8_to_1253(command))
end
-- Replace io.open with a wrapper that converts the UTF-8 file name with
-- utf8_to_1253; all remaining arguments are forwarded unchanged.
local native_open = io.open
io.open = function(filename, ...)
  local filename_1253 = utf8_to_1253(filename)
  return native_open(filename_1253, ...)
end
-- Replace io.popen with a wrapper that converts the UTF-8 command line with
-- utf8_to_1253; all remaining arguments are forwarded unchanged.
local native_popen = io.popen
io.popen = function(prog, ...)
  local prog_1253 = utf8_to_1253(prog)
  return native_popen(prog_1253, ...)
end
-- Replace io.lines with a wrapper that converts the UTF-8 file name with
-- utf8_to_1253. When no file name is given, the call is forwarded untouched.
local native_lines = io.lines
io.lines = function(filename, ...)
  if not filename then
    return native_lines(filename, ...)
  end
  return native_lines(utf8_to_1253(filename), ...)
end
end
UPDATE:
How to determine Windows codepage:
--- Read the ACP value from the Windows registry by running `reg query`.
-- Raises an error (via assert) if the process cannot be started.
-- @return the captured ACP value as a string, e.g. "1253"; nil when the
--         command output does not contain the expected ACP / REG_SZ line
local function get_windows_ansi_codepage()
  local reg_query = [[reg query HKLM\SYSTEM\CurrentControlSet\Control\Nls\CodePage /v ACP]]
  local pipe = assert(io.popen(reg_query))
  local output = pipe:read"*a"
  -- Capture whatever follows "ACP  REG_SZ" up to the trailing whitespace.
  local codepage = output:match"%sACP%s+REG_SZ%s+(.-)%s*$"
  pipe:close()
  return codepage
end
I also tried to use non-ASCII file names in unmodified Lua 5.3 in Windows, and it didn't work. I think it requires a modified version of Lua. My understanding is that Lua uses basic C functions for filenames, commands, and environment variables, but Windows uses the UTF-16 encoding and requires you to use the wide-string (which means UTF-16 on Windows) functions for non-ASCII filenames, commands, and environment variables.
There is a modified version of Lua that I compiled and tried out, and it handled a non-ASCII filename just fine: lua-u8w. It uses wide-string versions of various functions dealing with files and so on, and converts from UTF-8 to UTF-16 and back so that you can use UTF-8 in Lua while UTF-16 is used in dealing with the Windows operating system.