Create a mapping for transliteration from cyrillic to latin in LuaLaTeX

Edit 10/2017

Here is a new version of the Lua script. It generates code for Luaotfload instead of a font feature file, which is not supported anymore. It also has a new option, --back, which creates the mapping in the direction opposite to the one defined in the map file (for example, a Cyrillic-to-Latin mapping from a Latin-to-Cyrillic map file).

The script is named maptolua.lua:

kpse.set_program_name "luatex"
local lapp = require "lapp-mk4"
local uchar = unicode.utf8.char
local args

--- Load an Adobe-glyph-list style file into a code point -> glyph name table.
-- Every data line has the form `glyphname;XXXX`. Comment lines (starting
-- with `#`) and entries that list several code points (`name;05D3 05B2`)
-- are skipped, so they cannot overwrite the name of a single code point.
-- When several glyph names share one code point, the last one in the file wins.
-- @param filename path of the glyph list file
-- @return table mapping upper-case hex code strings to glyph names
local function load_glyph_list(filename)
  if type(filename) ~= "string" then
    -- io.lines(nil) would silently read standard input instead of failing
    error("glyph list file not found", 2)
  end
  local t = {}
  for line in io.lines(filename) do
    -- anchored on both ends: exactly one name and exactly one hex code;
    -- the trailing %s* tolerates CR from Windows line endings
    local glyph, code = line:match("^([^;#]+);(%x+)%s*$")
    if glyph then
      t[code:upper()] = glyph
    end
  end
  return t
end

--- Parse a TECkit map file into tables of quoted UTF-8 characters.
-- Only rules of the form `U+XXXX ... <> U+YYYY ...` are recognised.
-- Each rule is stored as `{lookups = {...}, replaces = {...}}`, where every
-- element is a character wrapped in double quotes, and is filed under:
--   liga: more than one character on the lookup side
--   ccmp: one lookup character, more than one replacement character
--   gsub: one character on each side
-- When `args.back` is set the two sides of every rule are swapped.
-- @param mapfile path of the TECkit map file
-- @param glyph_list unused here (characters are emitted instead of glyph
--   names); kept so that existing callers need not change
-- @return table with the fields liga, ccmp and gsub
local function load_map_file(mapfile, glyph_list)
  -- Turn "U+0410 U+0411" into { '"А"', '"Б"' }.
  local function get_chars(s)
    local chars = {}
    -- upper-case first so that lower-case hex digits (U+002d) are read in
    -- full instead of being cut off at the first lower-case letter
    for hex in s:upper():gmatch("U%+(%x+)") do
      chars[#chars+1] = string.format('"%s"', uchar(tonumber(hex, 16)))
    end
    return chars
  end

  local t = {liga = {}, gsub = {}, ccmp= {}}
  for line in io.lines(mapfile) do
    -- `;` starts a comment in TECkit; drop it first so that commented-out
    -- rules are not picked up as mappings
    local rule = line:gsub(";.*$", "")
    local lookup, replace = rule:match("([^%<]+)<>([^%;]+)")
    if lookup then
      if args.back then -- reverse mapping requested on the command line
        lookup, replace = replace, lookup
      end
      local lookups = get_chars(lookup)
      local replaces = get_chars(replace)
      -- rules without code points on either side (e.g. string literals)
      -- cannot be expressed here and would yield nil keys in the output
      if #lookups > 0 and #replaces > 0 then
        local newt = {lookups = lookups, replaces = replaces}
        if #lookups > 1 then
          table.insert(t.liga, newt)
        elseif #replaces > 1 then
          table.insert(t.ccmp, newt)
        else
          table.insert(t.gsub, newt)
        end
      end
    end
  end
  return t
end

--- Print a \directlua{...} block with one fonts.handlers.otf.addfeature
-- call per rule group (liga, ccmp, gsub) of maptable.
-- @param script accepted but not used by this version
-- @param language accepted but not used by this version
-- @param maptable table with the fields liga, ccmp and gsub, each a list of
--   {lookups = {...}, replaces = {...}} entries
local function print_fea_file(script, language,  maptable)
  -- Print one addfeature call. `key` names the entry field whose first
  -- element becomes the table key, `value` the field written as its value.
  local function print_feature(feature, name, typ, key, value)
    print("fonts.handlers.otf.addfeature {")
    print(string.format("\tname='%s',", name))
    print(string.format("\ttype='%s',", typ))
    print("\tdata={")
    for _, entry in ipairs(maptable[feature]) do
      -- only the first element of the key side is used
      local field = entry[key][1]
      local items = entry[value]
      local rhs
      if #items > 1 then
        -- several items are written as a Lua table constructor
        rhs = "{" .. table.concat(items, ",") .. "}"
      else
        rhs = string.format("%s", items[1])
      end
      print(string.format("[%s] = %s,", field, rhs))
    end
    print("}}")
  end

  -- feature, name, type, key field, value field
  local calls = {
    {"liga", "liga", "ligature", "replaces", "lookups"},
    {"ccmp", "ccmp", "multiple", "lookups", "replaces"},
    {"gsub", "gsub", "substitution", "lookups", "replaces"},
  }
  print("\\directlua{")
  for _, c in ipairs(calls) do
    print_feature(c[1], c[2], c[3], c[4], c[5])
  end
  print("}")
end

-- Parse the command line with lapp. The specification string below is also
-- the text shown as the help message. The result is stored in the
-- file-level local `args`, which load_map_file reads (args.back).
args = lapp [[
maptolua.lua Convert teckit map files to Luaotfload feature tables
Usage:
texlua maptolua.lua [options] <map file> [glyph list file]
-l,--language  (default dflt) language name in OpenType format
-s,--script  (default LATN) script name in OpenType format
-b,--back  create back mapping
<map_file> (string) file to be converted
[glyph_list] (defualt glyphlist.txt) file in Adobe glyh list format with unicode to glyph names mapping
]]

-- if not arg[1] then
--   print "Usage:"
--   print "texlua maptofea.lua mapfile [glyph list] > featurefile.fea"
--   os.exit()
-- end

-- Map files use Unicode values; a glyph list maps those values to glyph names.
-- The glyph list path is args.glyph_list when that is set; otherwise
-- glyphlist.txt is looked up through kpathsea (format "map").
local glyphfile = args.glyph_list or kpse.find_file("glyphlist.txt", "map")

local glyphtable = load_glyph_list(glyphfile)

-- Load the map file and group its rules into the liga/ccmp/gsub tables.
-- load_map_file emits the characters themselves, so glyphtable is passed
-- along but the generated output does not contain glyph names.
local maptable = load_map_file(args.map_file, glyphtable)

-- Print the \directlua{...} block with the addfeature calls.
print_fea_file(args.script, args.language, maptable)

It can be executed like:

texlua maptolua.lua  cyr.map  > newfeat.tex

This produces the following TeX file:

\directlua{
fonts.handlers.otf.addfeature {
    name='liga',
    type='ligature',
    data={
["–"] = {"-","-"},
["—"] = {"-","-","-"},
["”"] = {"'","'"},
["“"] = {"`","`"},
["¡"] = {"!","`"},
["¿"] = {"?","`"},
["„"] = {",",","},
["«"] = {"<","<"},
["»"] = {">",">"},
}}
fonts.handlers.otf.addfeature {
    name='ccmp',
    type='multiple',
    data={
["Ю"] = {"J","u"},
["Я"] = {"J","a"},
["ю"] = {"j","u"},
["я"] = {"j","a"},
["є"] = {"j","e"},
["Ѩ"] = {"J","e"},
["Щ"] = {"Š","Č"},
["щ"] = {"š","č"},
}}
fonts.handlers.otf.addfeature {
    name='gsub',
    type='substitution',
    data={
["'"] = "’",
["`"] = "‘",
["А"] = "A",
["Б"] = "B",
["В"] = "V",
["Г"] = "G",
["Д"] = "D",
["Е"] = "E",
["Ж"] = "Ž",
["З"] = "Z",
["И"] = "J",
["К"] = "K",
["Л"] = "L",
["М"] = "M",
["Н"] = "N",
["О"] = "O",
["П"] = "P",
["Р"] = "R",
["С"] = "S",
["Т"] = "T",
["У"] = "U",
["Ф"] = "F",
["Ц"] = "C",
["Ч"] = "Č",
["Ш"] = "Š",
["Э"] = "Ė",
["Ё"] = "Ë",
["а"] = "a",
["б"] = "b",
["в"] = "v",
["г"] = "g",
["д"] = "d",
["е"] = "e",
["ж"] = "ž",
["з"] = "z",
["и"] = "i",
["й"] = "j",
["к"] = "k",
["л"] = "l",
["м"] = "m",
["н"] = "n",
["о"] = "o",
["п"] = "p",
["р"] = "r",
["с"] = "s",
["т"] = "t",
["у"] = "u",
["ф"] = "f",
["ц"] = "c",
["ч"] = "č",
["ш"] = "š",
["э"] = "ė",
["ё"] = "ë",
["і"] = "i",
["І"] = "I",
["Х"] = "X",
["х"] = "x",
["И"] = "I",
["Ъ"] = "'",
["Ы"] = "Y",
["Ь"] = "’",
["ъ"] = "’",
["ы"] = "y",
["ь"] = "’",
}}
}

It can be used in the following way:

\documentclass{article}
\usepackage{fontspec}
\usepackage{ifluatex,ifxetex}
\input{newfeat.tex}
\setmainfont[RawFeature=+gsub;]{Linux Libertine O}


\begin{document}
\ifxetex
This is XeTeX
\else\ifluatex
This is LuaTeX
\fi\fi


Hello,, -- --- world я щ 
Здравствуй, Мир

\end{document}

(Note that it is necessary to use RawFeature=+gsub; in the font declaration)

And this is the result:

enter image description here


@edit added support for replacing one glyph with multiple new ones

LuaTeX doesn't support mapping files, but on the other hand it supports OpenType feature files. There is a major difference between the two, the first one works on character level and with unicode values, the other with glyph names.

I've created a simple script for converting map files to .fea files, maptofea.lua:

kpse.set_program_name "luatex"
local lapp = require "lapp-mk4"

--- Load an Adobe-glyph-list style file into a code point -> glyph name table.
-- Every data line has the form `glyphname;XXXX`. Comment lines (starting
-- with `#`) and entries that list several code points (`name;05D3 05B2`)
-- are skipped, so they cannot overwrite the name of a single code point.
-- When several glyph names share one code point, the last one in the file wins.
-- @param filename path of the glyph list file
-- @return table mapping upper-case hex code strings to glyph names
local function load_glyph_list(filename)
  if type(filename) ~= "string" then
    -- io.lines(nil) would silently read standard input instead of failing
    error("glyph list file not found", 2)
  end
  local t = {}
  for line in io.lines(filename) do
    -- anchored on both ends: exactly one name and exactly one hex code;
    -- the trailing %s* tolerates CR from Windows line endings
    local glyph, code = line:match("^([^;#]+);(%x+)%s*$")
    if glyph then
      t[code:upper()] = glyph
    end
  end
  return t
end

--- Parse a TECkit map file into tables of glyph names.
-- Only rules of the form `U+XXXX ... <> U+YYYY ...` are recognised.
-- Each rule is stored as `{lookups = {...}, replaces = {...}}` holding glyph
-- names, and is filed under:
--   liga: more than one glyph on the lookup side
--   ccmp: one lookup glyph, more than one replacement glyph
--   gsub: one glyph on each side
-- Code points missing from glyph_list are written as "undefined".
-- @param mapfile path of the TECkit map file
-- @param glyph_list table mapping upper-case hex code strings to glyph
--   names (optional; defaults to an empty table)
-- @return table with the fields liga, ccmp and gsub
local function load_map_file(mapfile, glyph_list)
  local glyph_list = glyph_list or {}

  -- Turn "U+0410 U+0411" into a list of glyph names.
  local function parse_codepoints(s)
    local names = {}
    for hex in s:upper():gmatch("U%+(%x+)") do
      -- try the code as written, then zero-padded to four digits so that
      -- short forms such as U+41 still find their entry
      local padded = string.format("%04X", tonumber(hex, 16))
      names[#names+1] = glyph_list[hex] or glyph_list[padded] or "undefined"
    end
    return names
  end

  local t = {liga = {}, gsub = {}, ccmp= {}}
  for line in io.lines(mapfile) do
    -- `;` starts a comment in TECkit; drop it first so that commented-out
    -- rules are not picked up as mappings
    local rule = line:gsub(";.*$", "")
    local lookup, replace = rule:match("([^%<]+)<>([^%;]+)")
    if lookup then
      local lookups = parse_codepoints(lookup)
      local replaces = parse_codepoints(replace)
      -- rules without code points on either side (e.g. string literals)
      -- would otherwise be printed as an invalid `sub  by ;` statement
      if #lookups > 0 and #replaces > 0 then
        local newt = {lookups = lookups, replaces = replaces}
        if #lookups > 1 then
          table.insert(t.liga, newt)
        elseif #replaces > 1 then
          table.insert(t.ccmp, newt)
        else
          table.insert(t.gsub, newt)
        end
      end
    end
  end
  return t
end

--- Print an OpenType feature file for the rules in maptable.
-- The output is a `languagesystem` statement followed by one
-- `feature <tag> { ... } <tag>;` block for each of liga, ccmp and gsub.
-- @param script script tag written into the languagesystem statement
-- @param language language tag written into the languagesystem statement
-- @param maptable table with the fields liga, ccmp and gsub, each a list of
--   {lookups = {...}, replaces = {...}} entries holding glyph names
local function print_fea_file(script, language,  maptable)
  -- Print the block for one feature tag, one `sub ... by ...;` per rule.
  local function print_feature(feature)
    print(string.format("feature %s {", feature))
    for _, rule in ipairs(maptable[feature]) do
      local from = table.concat(rule.lookups, " ")
      local to = table.concat(rule.replaces, " ")
      print("  sub " .. from .. " by " .. to .. ";")
    end
    print(string.format("} %s;", feature))
  end

  print(string.format("languagesystem %s %s;", script, language))
  for _, tag in ipairs({"liga", "ccmp", "gsub"}) do
    print_feature(tag)
  end
end

-- Parse the command line with lapp. The specification string below is also
-- the text shown as the help message; args.script and args.language are
-- later passed to print_fea_file.
local args = lapp [[
maptofea.lua Convert teckit map files to OpenType feature files
Usage:
texlua maptofea.lua [options] <map file> [glyph list file]
-l,--language  (default dflt) language name in OpenType format
-s,--script  (default LATN) script name in OpenType format
<map_file> (string) file to be converted
[glyph_list] (defualt glyphlist.txt) file in Adobe glyh list format with unicode to glyph names mapping
]]

-- if not arg[1] then
--   print "Usage:"
--   print "texlua maptofea.lua mapfile [glyph list] > featurefile.fea"
--   os.exit()
-- end

-- Map files use Unicode values, which have to be translated to glyph names.
-- The glyph list can be passed as the second positional argument; otherwise
-- the glyphlist.txt shipped with TeX Live is located through kpathsea.
--
-- The parsed `args` table is used instead of the raw `arg` vector: with raw
-- indices any option (e.g. `-s CYRL`) shifts the positions, so the option
-- itself would be opened as the map file.
-- NOTE(review): args[1] relies on lapp storing extra positional arguments
-- at numeric indices (as Penlight's lapp does) -- confirm for lapp-mk4.
local glyphfile = args.glyph_list or args[1] or kpse.find_file("glyphlist.txt", "map")
if not glyphfile then
  io.stderr:write("maptofea.lua: glyphlist.txt not found, pass a glyph list file explicitly\n")
  os.exit(1)
end

local glyphtable = load_glyph_list(glyphfile)

-- load the map file, search for unicode values and replace them with glyph names
local maptable = load_map_file(args.map_file, glyphtable)

print_fea_file(args.script, args.language, maptable)

Its help message:

maptofea.lua Convert teckit map files to OpenType feature files
Usage:
texlua maptofea.lua [options] <map file> [glyph list file]
-l,--language  (default dflt) language name in OpenType format
-s,--script  (default LATN) script name in OpenType format
<map_file> (string) file to be converted
[glyph_list] (defualt glyphlist.txt) file in Adobe glyh list format with unicode to glyph names mapping

you can just simply use it without any options on a map file:

texlua maptofea.lua cyrillic-to-latin.map > cyrtolatn2.fea

the converted file cyrtolatn2.fea:

languagesystem LATN dflt;
feature liga {
  sub hyphen hyphen by endash;
  sub hyphen hyphen hyphen by emdash;
  sub quotesingle quotesingle by quotedblright;
  sub grave grave by quotedblleft;
  sub exclam grave by exclamdown;
  sub question grave by questiondown;
  sub comma comma by quotedblbase;
  sub less less by guillemotleft;
  sub greater greater by guillemotright;
} liga;
feature ccmp {
  sub afii10048 by J u;
  sub afii10049 by J a;
  sub iucyrillic by j u;
  sub iacyrillic by j a;
  sub ecyrillic by j e;
  sub Yuslittleiotifiedcyrillic by J e;
  sub afii10043 by Scaron Ccaron;
  sub shchacyrillic by scaron ccaron;
} ccmp;
feature gsub {
  sub quotesingle by quoteright;
  sub grave by quoteleft;
  sub afii10017 by A;
  sub afii10018 by B;
  sub afii10019 by V;
  sub afii10020 by G;
  sub afii10021 by D;
  sub afii10022 by E;
  sub afii10024 by Zcaron;
  sub afii10025 by Z;
  sub afii10026 by J;
  sub afii10028 by K;
  sub afii10029 by L;
  sub afii10030 by M;
  sub afii10031 by N;
  sub afii10032 by O;
  sub afii10033 by P;
  sub afii10034 by R;
  sub afii10035 by S;
  sub afii10036 by T;
  sub afii10037 by U;
  sub afii10038 by F;
  sub afii10040 by C;
  sub afii10041 by Ccaron;
  sub afii10042 by Scaron;
  sub afii10047 by Edotaccent;
  sub afii10023 by Edieresis;
  sub afii10065 by a;
  sub becyrillic by b;
  sub vecyrillic by v;
  sub gecyrillic by g;
  sub decyrillic by d;
  sub iecyrillic by e;
  sub zhecyrillic by zcaron;
  sub zecyrillic by z;
  sub iicyrillic by i;
  sub iishortcyrillic by j;
  sub kacyrillic by k;
  sub elcyrillic by l;
  sub emcyrillic by m;
  sub encyrillic by n;
  sub ocyrillic by o;
  sub pecyrillic by p;
  sub ercyrillic by r;
  sub escyrillic by s;
  sub tecyrillic by t;
  sub ucyrillic by u;
  sub efcyrillic by f;
  sub tsecyrillic by c;
  sub checyrillic by ccaron;
  sub shacyrillic by scaron;
  sub ereversedcyrillic by edotaccent;
  sub iocyrillic by edieresis;
  sub icyrillic by i;
  sub afii10055 by I;
  sub afii10039 by X;
  sub khacyrillic by x;
  sub afii10026 by I;
  sub afii10044 by quotesingle;
  sub afii10045 by Y;
  sub afii10046 by quoteright;
  sub hardsigncyrillic by quoteright;
  sub yericyrillic by y;
  sub softsigncyrillic by quoteright;
} gsub;

You have to request the feature file and also gsub opentype feature in the document:

\documentclass{article}
\usepackage{fontspec}
\usepackage{ifluatex,ifxetex}

\setmainfont[Mapping=cyrillic-to-latin,FeatureFile=cyrtolatn2.fea, RawFeature={+gsub;+liga;}]{Linux Libertine O}

\begin{document}
\ifxetex
    This is XeTeX
\else\ifluatex
    This is LuaTeX
\fi\fi

Hello,, -- ---  world я щ 
Здравствуй, Мир

\end{document}

and this is the result:

Upgrade for the 2016 TeXLive distribution.

The new release does not support the inclusion of a .fea file, which makes this method obsolete. As a workaround, the features can be defined with \directlua as follows:

\directlua{
    fonts.handlers.otf.addfeature {
        name = "myliga",
         type = "ligature",
         data = {
             ['Aacute'] = { "А", 0x0301},
             ['Eacute'] = { "Е", 0x0301},
             ['Iacute'] = { "И", 0x0301},
             ['iacute'] = { "и", 0x0301},
             ['Oacute'] = { "О", 0x0301},
             ['Uacute'] = { "У", 0x0301},
             ['Yacute'] = { "Ы", 0x0301},         
             ['Egrave'] = { "Э", 0x0301},         
             ['egrave'] = { "э", 0x0301},         
        },
    }
}

\directlua{
    fonts.handlers.otf.addfeature {
        name = "mycomp",
         type = "multiple",
         data = {
             afii10039 = { "C", "h" },
             afii10087 = { "c", "h" },
             afii10048 = { "J", "u" },
             afii10049 = { "J", "a" },
             afii10096 = { "j", "u" },
             afii10097 = { "j", "a" },
             Yuslittleiotifiedcyrillic = { "J", "e" },
             afii10043 = { "Scaron", "ccaron" },
             afii10091 = { "scaron", "ccaron" },
        },
    }
}

\directlua{
    fonts.handlers.otf.addfeature {
        name = "mysub",
            type = "substitution",
            data = {
                ["quotesingle"] = "quoteright",
                ["grave"] = "quoteleft",
                ["afii10017"] = "A",
                ["afii10018"] = "B",
                ["afii10019"] = "V",
                ["afii10020"] = "G",
                ["afii10021"] = "D",
                ["afii10022"] = "E",
                ["afii10024"] = "Zcaron",
                ["afii10025"] = "Z",
                ["afii10026"] = "I",
                ["afii10027"] = "J",
                ["afii10028"] = "K",
                ["afii10029"] = "L",
                ["afii10030"] = "M",
                ["afii10031"] = "N",
                ["afii10032"] = "O",
                ["afii10033"] = "P",
                ["afii10034"] = "R",
                ["afii10035"] = "S",
                ["afii10036"] = "T",
                ["afii10037"] = "U",
                ["afii10038"] = "F",
                ["afii10040"] = "C",
                ["afii10041"] = "Ccaron",
                ["afii10042"] = "Scaron",
                ["afii10047"] = "Edotaccent",
                ["afii10023"] = "Edieresis",
                ["afii10065"] = "a",
                ["afii10066"] = "b",
                ["afii10067"] = "v",
                ["afii10068"] = "g",
                ["afii10069"] = "d",
                ["afii10070"] = "e",
                ["afii10072"] = "zcaron",
                ["afii10073"] = "z",
                ["afii10074"] = "i",
                ["afii10075"] = "j",
                ["afii10076"] = "k",
                ["afii10077"] = "l",
                ["afii10078"] = "m",
                ["afii10079"] = "n",
                ["afii10080"] = "o",
                ["afii10081"] = "p",
                ["afii10082"] = "r",
                ["afii10083"] = "s",
                ["afii10084"] = "t",
                ["afii10085"] = "u",
                ["afii10086"] = "f",
                ["afii10088"] = "c",
                ["afii10089"] = "ccaron",
                ["afii10090"] = "scaron",
                ["afii10095"] = "edotaccent",
                ["afii10071"] = "edieresis",
                ["afii10103"] = "i",
                ["afii10055"] = "I",
                ["afii10026"] = "I",
                ["afii10044"] = "quoteright",
                ["afii10045"] = "Y",
                ["afii10046"] = "quoteright",
                ["afii10092"] = "quoteright",
                ["afii10093"] = "y",
                ["afii10094"] = "quoteright",
        },
    }
}

The features defined above must be called in the usual way.

\setmainfont{Linux Libertine O}[RawFeature={+mysub;+mycomp;+myliga}]

enter image description here

this was typeset with luatex:

\documentclass{article}
\usepackage{fontspec}
\usepackage{ifluatex,ifxetex}

\setmainfont[Mapping=cyrillic-to-latin]{texgyreadventor-regular.otf}

\begin{document}
\ifxetex
    This is XeTeX
\else\ifluatex
    This is LuaTeX

% Load cyrtr.lua (LuaTeX branch only).
\directlua{require("cyrtr")}

% Two conditionals were opened (\ifxetex and \ifluatex), so two \fi are
% needed; with a single \fi XeTeX would skip the rest of the document.
\fi\fi

Hello, world

Здравствуй, Мир

\end{document}

using a cyrtr.lua more or less generated from the mapping file in the linked question. (I messed up a bit on some of the quote characters, you'll need to add them back)

-- Cyrillic -> Latin transliteration table used by cyrtr.
local cyrtr_map = {
  ['А'] = 'A', ['Б'] = 'B', ['В'] = 'V', ['Г'] = 'G', ['Д'] = 'D',
  ['Е'] = 'E', ['Ж'] = 'Ž', ['З'] = 'Z', ['И'] = 'I', ['К'] = 'K',
  ['Л'] = 'L', ['М'] = 'M', ['Н'] = 'N', ['О'] = 'O', ['П'] = 'P',
  ['Р'] = 'R', ['С'] = 'S', ['Т'] = 'T', ['У'] = 'U', ['Ф'] = 'F',
  ['Ц'] = 'C', ['Ч'] = 'Č', ['Ш'] = 'Š', ['Э'] = 'Ė', ['Ю'] = 'Ju',
  ['Я'] = 'Ja', ['Ё'] = 'Ë',
  ['а'] = 'a', ['б'] = 'b', ['в'] = 'v', ['г'] = 'g', ['д'] = 'd',
  ['е'] = 'e', ['ж'] = 'ž', ['з'] = 'z', ['и'] = 'i', ['й'] = 'j',
  ['к'] = 'k', ['л'] = 'l', ['м'] = 'm', ['н'] = 'n', ['о'] = 'o',
  ['п'] = 'p', ['р'] = 'r', ['с'] = 's', ['т'] = 't', ['у'] = 'u',
  ['ф'] = 'f', ['ц'] = 'c', ['ч'] = 'č', ['ш'] = 'š', ['э'] = 'ė',
  ['ю'] = 'ju', ['я'] = 'ja', ['ё'] = 'ë',
  ['і'] = 'i', ['І'] = 'I', ['є'] = 'je', ['Ѩ'] = 'Je',
  ['Х'] = 'X', ['х'] = 'x',
  ['Щ'] = 'Šč', ['Ы'] = 'Y', ['щ'] = 'šč', ['ы'] = 'y',
}

--- Transliterate the Cyrillic characters of a UTF-8 string to Latin.
-- Characters without an entry in cyrtr_map are left unchanged.
-- @param s UTF-8 encoded input string
-- @return the transliterated string (a single value)
function cyrtr (s)
  -- The pattern matches one multi-byte UTF-8 sequence (lead byte followed by
  -- continuation bytes). With a table as replacement, gsub keeps the
  -- original text whenever the lookup yields nil, so one pass suffices.
  -- The parentheses drop gsub's second result (the match count).
  return (string.gsub(s, "[\194-\244][\128-\191]*", cyrtr_map))
end


-- Register cyrtr on the "process_input_buffer" callback under the
-- description "cyrillic transliteration".
luatexbase.add_to_callback(
"process_input_buffer",
cyrtr,
"cyrillic transliteration")

Anyway, this transliteration is only one of several possible ones. Different languages have different transliteration systems.