Javascript - normalize accented greek characters

You could also use the npm library greek-utils, which provides methods for exactly this, such as replacing accented characters and other diacritics with their plain equivalents.

For modern Greek:

// Load the greek-utils package (a third-party npm library, not part of Node itself).
var greekUtils = require('greek-utils');

// Replace the accented letters of a modern Greek phrase with their plain forms.
var sanitized = greekUtils.sanitizeDiacritics('Αρνάκι άσπρο και παχύ');
console.log(sanitized); //Αρνακι ασπρο και παχυ

and ancient Greek:

// Load the greek-utils package (a third-party npm library, not part of Node itself).
var greekUtils = require('greek-utils');

// Same call on an ancient Greek sentence: per the expected output below, the
// breathing marks and accents are removed and only the plain letters remain.
var sanitized = greekUtils.sanitizeDiacritics('Ἐξ οὗ καὶ δῆλον ὅτι οὐδεμία τῶν ἠθικῶν ἀρετῶν φύσει ἡμῖν ἐγγίνεται');
console.log(sanitized); //Εξ ου και δηλον οτι ουδεμια των ηθικων αρετων φυσει ημιν εγγινεται

I don't think you can do it in another way than by checking for each letter, but that doesn't make it any worse.

Simply chain your .replace functions like so:

// Each .replace() pass maps one vowel's accented and capital forms onto the
// plain lowercase letter. `string` is assumed to hold the input text and must
// be defined before this snippet runs.
result = string.replace(/Ά|Α|ά/g,'α')
  .replace(/Έ|Ε|έ/g,'ε')
  .replace(/Ή|Η|ή/g,'η');
// ...and so on for the remaining letters.

Or, if you would rather loop over it — which you presumably would if you have a larger number of characters to check against, and which is also better for code maintainability — store the character matches in an array of objects or arrays. E.g. with an array of objects:

/* Conversion table: every character listed in `from` is replaced by `to`.
   Entries must be comma-separated, otherwise the array literal is a SyntaxError. */
var cvtValues = [
  {from: ['Ά','Α','ά'], to: 'α'},
  {from: ['Έ','Ε','έ'], to: 'ε'},
  {from: ['Ή','Η','ή'], to: 'η'}
];
/* loop over all from-to containers */
for ( var i = 0; i < cvtValues.length; i++ ) {
  /* loop over all characters in the 'from' array & replace them with the 'to' value */
  for ( var x = 0; x < cvtValues[i].from.length; x++ ) {
    /* `string` is the text being converted and is assumed to be defined before
       this snippet. The 'g' flag replaces every occurrence, not just the first.
       You could assign the output to another variable, e.g. `result`, if you
       wanted to keep the original text. */
    string = string.replace(new RegExp(cvtValues[i].from[x],'g'), cvtValues[i].to);
  }
}

I have also found the following solution which makes use of: String.prototype.normalize()

// Sample inputs: a modern (single-accent) text and a polytonic one.
// Declared with `const` so the snippet also runs in strict mode / ES modules,
// where assigning to an undeclared name throws a ReferenceError.
const normal = 'Αντίθετα με αυτό που θεωρεί η πλειοψηφία, το Lorem Ipsum δεν είναι απλά ένα τυχαίο κείμενο. Οι ρίζες του βρίσκονται σε ένα κείμενο Λατινικής λογοτεχνίας του 45 π.Χ., φτάνοντας την ηλικία του πάνω από 2000 έτη.';

const pol = 'Μήγαρις ἔχω ἄλλο στὸ νοῦ μου πάρεξ ἐλευθερία καὶ γλώσσα;';

// The functions below are declarations, so they are hoisted and can be called here.
console.log(normalizeGreek(normal));
console.log(normalizePolytonicGreek(pol));

/**
 * Removes accents from text by decomposing it and dropping the combining marks.
 *
 * The text is first converted to Unicode NFD, which splits each accented
 * character into its base letter followed by combining marks; every code unit
 * in the Combining Diacritical Marks range (U+0300–U+036F) is then deleted.
 * This is not Greek-specific: accents on other scripts are removed as well,
 * and whatever is not removed is returned in its decomposed (NFD) form.
 *
 * @param {string} text - The text to strip accents from.
 * @returns {string} The text without combining diacritical marks.
 */
function normalizeGreek(text) {
    return text.normalize('NFD').replace(/[\u0300-\u036f]/g, "");
}


/**
 * Removes accents, breathing marks and other combining diacritics from
 * polytonic Greek text.
 *
 * Polytonic marks decompose into the same U+0300–U+036F range under NFD, so
 * the work is identical to normalizeGreek and is delegated to it.
 *
 * @param {string} text - The polytonic text to strip diacritics from.
 * @returns {string} The text without combining diacritical marks.
 */
function normalizePolytonicGreek(text) {
    return normalizeGreek(text);
}

How it works + example:

Inside .normalize('NFD'), accented characters are decomposed to:

  • the character itself
  • followed by the equivalent Combining Diacritical Mark (see: range [0300-036f])

Removing these marks is easy by using: .replace(/[\u0300-\u036f]/g, "")

// Step-by-step demonstration. Each value is declared with `const` so the
// snippet also runs in strict mode / ES modules, where assigning to an
// undeclared name throws a ReferenceError.

// A single precomposed character: alpha with a breathing mark and an acute accent.
const a = "ἄ";
console.log(a);             // prints: ἄ
console.log(Array.from(a)); // prints: [ "ἄ" ]

// NFD splits it into the base letter followed by its two combining marks.
const b = a.normalize('NFD');
console.log(b);             // prints: ἄ
console.log(Array.from(b)); // prints: [ "α", "̓", "́" ]

// Deleting everything in U+0300–U+036F leaves only the base letter.
const c = a.normalize('NFD').replace(/[\u0300-\u036f]/g, "");
console.log(c);             // prints: α
console.log(Array.from(c)); // prints: [ "α" ]

Interesting links:

  • Unicode® Standard Annex #15 - Unicode Normalization Forms
  • Normalization charts
  • Remove accents/diacritics in a string in JavaScript