Create a mapping for transliteration from cyrillic to latin in LuaLaTeX
Edit 10/2017
Here is a new version of the Lua script. It generates the code for Luaotfload, instead of the font feature file, which is not supported anymore. It has also new option --back
, which creates mapping in the opposite direction, like from Cyrillic to Latin in our example.
The script is named maptolua.lua
:
kpse.set_program_name "luatex"
local lapp = require "lapp-mk4"
local uchar = unicode.utf8.char
local args
local function load_glyph_list(filename)
local t = {}
for line in io.lines(filename) do
local glyph, code = line:match("([^;]+);([A-Fa-f0-9]+)")
if glyph then
code = string.upper(code)
-- print(code, glyph)
t[code] = glyph
end
end
return t
end
local function load_map_file(mapfile, glyph_list)
local glyph_list = glyph_list or {}
local parse_codepoints = function(s)
local t = {}
local s = string.upper(s)
for x in s:gmatch("U%+([0-9A-F]+)") do
t[#t+1] = glyph_list[x] or "undefined"
end
return t
end
local get_chars = function(s)
local t = {}
for x in s:gmatch("U%+([0-9A-F]+)") do
t[#t+1] = string.format('"%s"', uchar(tonumber(x, 16)))
end
return t
end
local t = {liga = {}, gsub = {}, ccmp= {}}
for line in io.lines(mapfile) do
-- search for
local lookup, replace = line:match("([^%<]+)<>([^%;]+)")
if args.back then -- we can create reverse mapping from the map file
lookup, replace = replace, lookup
end
-- process lines which define mappings
if lookup then
-- convert strings with unicode codepoints to tables with glyph names
-- local lookups = parse_codepoints(lookup)
local lookups = get_chars(lookup)
-- local replaces = parse_codepoints(replace)
local replaces = get_chars(replace)
-- print(table.concat(lookups, ";"), "+", table.concat(replaces, ";"))
local newt = {lookups = lookups, replaces = replaces}
if #lookups > 1 then
table.insert(t.liga, newt)
elseif #replaces > 1 then
table.insert(t.ccmp, newt)
else
table.insert(t.gsub, newt)
end
end
end
return t
end
local function print_fea_file(script, language, maptable)
local function print_feature(feature, name, typ, key, value)
-- the field must be only one character long
print("fonts.handlers.otf.addfeature {")
print(string.format("\tname='%s',", name))
print(string.format("\ttype='%s',", typ))
print("\tdata={")
for _, entry in ipairs(maptable[feature]) do
local field = entry[key][1]
local result = entry[value]
if #result > 1 then
local t = {}
for _, s in ipairs(result) do
t[#t+1] = string.format("%s", s)
end
result = "{" .. table.concat(t, ",") .. "}"
else
result = string.format("%s", result[1])
end
print(string.format("[%s] = %s,", field, result))
end
print("}}")
end
-- print(string.format("languagesystem %s %s;", script, language))
print("\\directlua{")
print_feature("liga", "liga", "ligature", "replaces", "lookups")
print_feature("ccmp", "ccmp", "multiple", "lookups", "replaces")
print_feature("gsub", "gsub", "substitution","lookups", "replaces")
print "}"
end
args = lapp [[
maptolua.lua Convert teckit map files to Luaotfload feature tables
Usage:
texlua maptolua.lua [options] <map file> [glyph list file]
-l,--language (default dflt) language name in OpenType format
-s,--script (default LATN) script name in OpenType format
-b,--back create back mapping
<map_file> (string) file to be converted
[glyph_list] (defualt glyphlist.txt) file in Adobe glyh list format with unicode to glyph names mapping
]]
-- if not arg[1] then
-- print "Usage:"
-- print "texlua maptofea.lua mapfile [glyph list] > featurefile.fea"
-- os.exit()
-- end
-- map files use Unicode values, we need to transform them to the glyph names
-- table with glyph list can be either passes as second argument, or one shipped in TL is used
local glyphfile = args.glyph_list or kpse.find_file("glyphlist.txt", "map")
local glyphtable = load_glyph_list(glyphfile)
-- load the map file, search for unicode values and replace them with glyph names
local maptable = load_map_file(args.map_file, glyphtable)
print_fea_file(args.script, args.language, maptable)
It can be executed like:
texlua maptolua.lua cyr.map > newfeat.tex
Which produces a following TeX file:
\directlua{
fonts.handlers.otf.addfeature {
name='liga',
type='ligature',
data={
["–"] = {"-","-"},
["—"] = {"-","-","-"},
["”"] = {"'","'"},
["“"] = {"`","`"},
["¡"] = {"!","`"},
["¿"] = {"?","`"},
["„"] = {",",","},
["«"] = {"<","<"},
["»"] = {">",">"},
}}
fonts.handlers.otf.addfeature {
name='ccmp',
type='multiple',
data={
["Ю"] = {"J","u"},
["Я"] = {"J","a"},
["ю"] = {"j","u"},
["я"] = {"j","a"},
["є"] = {"j","e"},
["Ѩ"] = {"J","e"},
["Щ"] = {"Š","Č"},
["щ"] = {"š","č"},
}}
fonts.handlers.otf.addfeature {
name='gsub',
type='substitution',
data={
["'"] = "’",
["`"] = "‘",
["А"] = "A",
["Б"] = "B",
["В"] = "V",
["Г"] = "G",
["Д"] = "D",
["Е"] = "E",
["Ж"] = "Ž",
["З"] = "Z",
["И"] = "J",
["К"] = "K",
["Л"] = "L",
["М"] = "M",
["Н"] = "N",
["О"] = "O",
["П"] = "P",
["Р"] = "R",
["С"] = "S",
["Т"] = "T",
["У"] = "U",
["Ф"] = "F",
["Ц"] = "C",
["Ч"] = "Č",
["Ш"] = "Š",
["Э"] = "Ė",
["Ё"] = "Ë",
["а"] = "a",
["б"] = "b",
["в"] = "v",
["г"] = "g",
["д"] = "d",
["е"] = "e",
["ж"] = "ž",
["з"] = "z",
["и"] = "i",
["й"] = "j",
["к"] = "k",
["л"] = "l",
["м"] = "m",
["н"] = "n",
["о"] = "o",
["п"] = "p",
["р"] = "r",
["с"] = "s",
["т"] = "t",
["у"] = "u",
["ф"] = "f",
["ц"] = "c",
["ч"] = "č",
["ш"] = "š",
["э"] = "ė",
["ё"] = "ë",
["і"] = "i",
["І"] = "I",
["Х"] = "X",
["х"] = "x",
["И"] = "I",
["Ъ"] = "'",
["Ы"] = "Y",
["Ь"] = "’",
["ъ"] = "’",
["ы"] = "y",
["ь"] = "’",
}}
}
It can be used in the following way:
\documentclass{article}
\usepackage{fontspec}
\usepackage{ifluatex,ifxetex}
\input{newfeat.tex}
\setmainfont[RawFeature=+gsub;]{Linux Libertine O}
\begin{document}
\ifxetex
This is XeTeX
\else\ifluatex
This is LuaTeX
\fi\fi
Hello,, -- --- world я щ
Здравствуй, Мир
\end{document}
(Note that it is necessary to use RawFeature=+gsub;
in the font declaration)
And this is the result:
@edit added support for replacing one glyph with multiple new ones
LuaTeX doesn't support mapping files, but on the other hand it supports OpenType feature files. There is a major difference between the two, the first one works on character level and with unicode values, the other with glyph names.
I've created simple script for converting the map files to .fea
files, maptofea.lua
:
kpse.set_program_name "luatex"
local lapp = require "lapp-mk4"
local function load_glyph_list(filename)
local t = {}
for line in io.lines(filename) do
local glyph, code = line:match("([^;]+);([A-Fa-f0-9]+)")
if glyph then
code = string.upper(code)
-- print(code, glyph)
t[code] = glyph
end
end
return t
end
local function load_map_file(mapfile, glyph_list)
local glyph_list = glyph_list or {}
local parse_codepoints = function(s)
local t = {}
local s = string.upper(s)
for x in s:gmatch("U%+([0-9A-F]+)") do
t[#t+1] = glyph_list[x] or "undefined"
end
return t
end
local t = {liga = {}, gsub = {}, ccmp= {}}
for line in io.lines(mapfile) do
-- search for
local lookup, replace = line:match("([^%<]+)<>([^%;]+)")
-- process lines which define mappings
if lookup then
-- convert strings with unicode codepoints to tables with glyph names
local lookups = parse_codepoints(lookup)
local replaces = parse_codepoints(replace)
-- print(table.concat(lookups, ";"), "+", table.concat(replaces, ";"))
local newt = {lookups = lookups, replaces = replaces}
if #lookups > 1 then
table.insert(t.liga, newt)
elseif #replaces > 1 then
table.insert(t.ccmp, newt)
else
table.insert(t.gsub, newt)
end
end
end
return t
end
local function print_fea_file(script, language, maptable)
local function print_feature(feature)
print("feature " .. feature .. " {")
for _, entry in ipairs(maptable[feature]) do
print(string.format(" sub %s by %s;", table.concat(entry.lookups, " "), table.concat(entry.replaces, " ")))
end
print("} ".. feature .. ";")
end
print(string.format("languagesystem %s %s;", script, language))
print_feature "liga"
print_feature "ccmp"
print_feature "gsub"
end
local args = lapp [[
maptofea.lua Convert teckit map files to OpenType feature files
Usage:
texlua maptofea.lua [options] <map file> [glyph list file]
-l,--language (default dflt) language name in OpenType format
-s,--script (default LATN) script name in OpenType format
<map_file> (string) file to be converted
[glyph_list] (defualt glyphlist.txt) file in Adobe glyh list format with unicode to glyph names mapping
]]
-- if not arg[1] then
-- print "Usage:"
-- print "texlua maptofea.lua mapfile [glyph list] > featurefile.fea"
-- os.exit()
-- end
-- map files use Unicode values, we need to transform them to the glyph names
-- table with glyph list can be either passes as second argument, or one shipped in TL is used
local glyphfile = arg[2] or kpse.find_file("glyphlist.txt", "map")
local glyphtable = load_glyph_list(glyphfile)
-- load the map file, search for unicode values and replace them with glyph names
local maptable = load_map_file(arg[1], glyphtable)
print_fea_file(args.script, args.language, maptable)
It's help message:
maptofea.lua Convert teckit map files to OpenType feature files
Usage:
texlua maptofea.lua [options] <map file> [glyph list file]
-l,--language (default dflt) language name in OpenType format
-s,--script (default LATN) script name in OpenType format
<map_file> (string) file to be converted
[glyph_list] (defualt glyphlist.txt) file in Adobe glyh list format with unicode to glyph names mapping
you can just simply use it without any options on a map file:
texlua maptofea.lua cyrillic-to-latin.map > cyrtolatn2.fea
the converted file cyrtolatn2.fea
:
languagesystem LATN dflt;
feature liga {
sub hyphen hyphen by endash;
sub hyphen hyphen hyphen by emdash;
sub quotesingle quotesingle by quotedblright;
sub grave grave by quotedblleft;
sub exclam grave by exclamdown;
sub question grave by questiondown;
sub comma comma by quotedblbase;
sub less less by guillemotleft;
sub greater greater by guillemotright;
} liga;
feature ccmp {
sub afii10048 by J u;
sub afii10049 by J a;
sub iucyrillic by j u;
sub iacyrillic by j a;
sub ecyrillic by j e;
sub Yuslittleiotifiedcyrillic by J e;
sub afii10043 by Scaron Ccaron;
sub shchacyrillic by scaron ccaron;
} ccmp;
feature gsub {
sub quotesingle by quoteright;
sub grave by quoteleft;
sub afii10017 by A;
sub afii10018 by B;
sub afii10019 by V;
sub afii10020 by G;
sub afii10021 by D;
sub afii10022 by E;
sub afii10024 by Zcaron;
sub afii10025 by Z;
sub afii10026 by J;
sub afii10028 by K;
sub afii10029 by L;
sub afii10030 by M;
sub afii10031 by N;
sub afii10032 by O;
sub afii10033 by P;
sub afii10034 by R;
sub afii10035 by S;
sub afii10036 by T;
sub afii10037 by U;
sub afii10038 by F;
sub afii10040 by C;
sub afii10041 by Ccaron;
sub afii10042 by Scaron;
sub afii10047 by Edotaccent;
sub afii10023 by Edieresis;
sub afii10065 by a;
sub becyrillic by b;
sub vecyrillic by v;
sub gecyrillic by g;
sub decyrillic by d;
sub iecyrillic by e;
sub zhecyrillic by zcaron;
sub zecyrillic by z;
sub iicyrillic by i;
sub iishortcyrillic by j;
sub kacyrillic by k;
sub elcyrillic by l;
sub emcyrillic by m;
sub encyrillic by n;
sub ocyrillic by o;
sub pecyrillic by p;
sub ercyrillic by r;
sub escyrillic by s;
sub tecyrillic by t;
sub ucyrillic by u;
sub efcyrillic by f;
sub tsecyrillic by c;
sub checyrillic by ccaron;
sub shacyrillic by scaron;
sub ereversedcyrillic by edotaccent;
sub iocyrillic by edieresis;
sub icyrillic by i;
sub afii10055 by I;
sub afii10039 by X;
sub khacyrillic by x;
sub afii10026 by I;
sub afii10044 by quotesingle;
sub afii10045 by Y;
sub afii10046 by quoteright;
sub hardsigncyrillic by quoteright;
sub yericyrillic by y;
sub softsigncyrillic by quoteright;
} gsub;
You have to request the feature file and also gsub
opentype feature in the document:
\documentclass{article}
\usepackage{fontspec}
\usepackage{ifluatex,ifxetex}
\setmainfont[Mapping=cyrillic-to-latin,FeatureFile=cyrtolatn2.fea, RawFeature={+gsub;+liga;}]{Linux Libertine O}
\begin{document}
\ifxetex
This is XeTeX
\else\ifluatex
This is LuaTeX
\fi\fi
Hello,, -- --- world я щ
Здравствуй, Мир
\end{document}
and this is the result:
Upgrade for the 2016 TeXLive distribution.
The new release does not support the inclusion of a .fea file, making obsolete this method. A workaround can be made by the use of \directlua
as follows:
\directlua{
fonts.handlers.otf.addfeature {
name = "myliga",
type = "ligature",
data = {
['Aacute'] = { "А", 0x0301},
['Eacute'] = { "Е", 0x0301},
['Iacute'] = { "И", 0x0301},
['iacute'] = { "и", 0x0301},
['Oacute'] = { "О", 0x0301},
['Uacute'] = { "У", 0x0301},
['Yacute'] = { "Ы", 0x0301},
['Egrave'] = { "Э", 0x0301},
['egrave'] = { "э", 0x0301},
},
}
}
\directlua{
fonts.handlers.otf.addfeature {
name = "mycomp",
type = "multiple",
data = {
afii10039 = { "C", "h" },
afii10087 = { "c", "h" },
afii10048 = { "J", "u" },
afii10049 = { "J", "a" },
afii10096 = { "j", "u" },
afii10097 = { "j", "a" },
Yuslittleiotifiedcyrillic = { "J", "e" },
afii10043 = { "Scaron", "ccaron" },
afii10091 = { "scaron", "ccaron" },
},
}
}
\directlua{
fonts.handlers.otf.addfeature {
name = "mysub",
type = "substitution",
data = {
["quotesingle"] = "quoteright",
["grave"] = "quoteleft",
["afii10017"] = "A",
["afii10018"] = "B",
["afii10019"] = "V",
["afii10020"] = "G",
["afii10021"] = "D",
["afii10022"] = "E",
["afii10024"] = "Zcaron",
["afii10025"] = "Z",
["afii10026"] = "I",
["afii10027"] = "J",
["afii10028"] = "K",
["afii10029"] = "L",
["afii10030"] = "M",
["afii10031"] = "N",
["afii10032"] = "O",
["afii10033"] = "P",
["afii10034"] = "R",
["afii10035"] = "S",
["afii10036"] = "T",
["afii10037"] = "U",
["afii10038"] = "F",
["afii10040"] = "C",
["afii10041"] = "Ccaron",
["afii10042"] = "Scaron",
["afii10047"] = "Edotaccent",
["afii10023"] = "Edieresis",
["afii10065"] = "a",
["afii10066"] = "b",
["afii10067"] = "v",
["afii10068"] = "g",
["afii10069"] = "d",
["afii10070"] = "e",
["afii10072"] = "zcaron",
["afii10073"] = "z",
["afii10074"] = "i",
["afii10075"] = "j",
["afii10076"] = "k",
["afii10077"] = "l",
["afii10078"] = "m",
["afii10079"] = "n",
["afii10080"] = "o",
["afii10081"] = "p",
["afii10082"] = "r",
["afii10083"] = "s",
["afii10084"] = "t",
["afii10085"] = "u",
["afii10086"] = "f",
["afii10088"] = "c",
["afii10089"] = "ccaron",
["afii10090"] = "scaron",
["afii10095"] = "edotaccent",
["afii10071"] = "edieresis",
["afii10103"] = "i",
["afii10055"] = "I",
["afii10026"] = "I",
["afii10044"] = "quoteright",
["afii10045"] = "Y",
["afii10046"] = "quoteright",
["afii10092"] = "quoteright",
["afii10093"] = "y",
["afii10094"] = "quoteright",
},
}
}
The features defined above must be called in the usual way.
\setmainfont{Linux Libertine O}[RawFeature={+mysub;+mycomp;+myliga}]
this was typeset with luatex:
\documentclass{article}
\usepackage{fontspec}
\usepackage{ifluatex,ifxetex}
\setmainfont[Mapping=cyrillic-to-latin]{texgyreadventor-regular.otf}
\begin{document}
\ifxetex
This is XeTeX
\else\ifluatex
This is LuaTeX
\directlua{require("cyrtr")}
\fi
Hello, world
Здравствуй, Мир
\end{document}
using a cyrtr.lua more or less generated from the mapping file in the linked question. (I messed up a bit on some of the quote characters, you'll need to add them back)
function cyrtr (s)
return
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
string.gsub(
s,
'А','A'),
'Б','B'),
'В','V'),
'Г','G'),
'Д','D'),
'Е','E'),
'Ж','Ž'),
'З','Z'),
'И','I'),
'К','K'),
'Л','L'),
'М','M'),
'Н','N'),
'О','O'),
'П','P'),
'Р','R'),
'С','S'),
'Т','T'),
'У','U'),
'Ф','F'),
'Ц','C'),
'Ч','Č'),
'Ш','Š'),
'Э','Ė'),
'Ю','Ju'),
'Я','Ja'),
'Ё','Ë'),
'а','a'),
'б','b'),
'в','v'),
'г','g'),
'д','d'),
'е','e'),
'ж','ž'),
'з','z'),
'и','i'),
'й','j'),
'к','k'),
'л','l'),
'м','m'),
'н','n'),
'о','o'),
'п','p'),
'р','r'),
'с','s'),
'т','t'),
'у','u'),
'ф','f'),
'ц','c'),
'ч','č'),
'ш','š'),
'э','ė'),
'ю','ju'),
'я','ja'),
'ё','ë'),
'і','i'),
'І','I'),
'є','je'),
'Ѩ','Je'),
'Х','X'),
'х','x'),
'И','I'),
'Щ','Šč'),
'Ы','Y'),
'щ','šč'),
'ы','y')
end
luatexbase.add_to_callback(
"process_input_buffer",
cyrtr,
"cyrillic transliteration")
Anyway, this transliteration is one of the possible. Different language have different transliteration systems.