-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathgreekEnglish.js
88 lines (76 loc) · 3.8 KB
/
greekEnglish.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
var natural = require('natural');
var XRegExp = require('xregexp');
var async = require('async');
var phraseTable = require('./src/phraseTable.js');
var correctionsTable = require('./src/correctionsTable.js');
var wordAligner = require('./src/wordAligner.js');
var nonUnicodeLetter = XRegExp('\\PL');
var tokenizer = new natural.RegexpTokenizer({pattern: nonUnicodeLetter});
var cli = require('cli');
/**
 * Synchronously reads a file and returns its content split into lines.
 *
 * Lines are separated on LF or CRLF. Whatever follows the last line break
 * (an empty string when the file ends with a newline) is kept as the final
 * element.
 *
 * @param {string} filename - Path of the file to read.
 * @returns {string[]} The lines of the file, in order, without terminators.
 */
function lineArray(filename) {
  var raw = require('fs').readFileSync(filename).toString();
  // split() already produces a fresh array, so no element-by-element copy is needed.
  return raw.split(/\r?\n/);
}
/**
 * Reduces polytonic (and monotonic) Greek text to bare lowercase vowels.
 *
 * The text is first brought to Unicode NFKC form so that decomposed
 * sequences are composed before matching. Then every listed vowel variant
 * (upper or lower case, with any combination of accent, breathing,
 * dialytika, length mark or iota subscript) is replaced by its plain
 * lowercase base vowel. Rho with a breathing mark becomes plain rho, and
 * both capital sigma and final sigma become medial sigma.
 *
 * Characters that are not listed (for example uppercase consonants other
 * than sigma, or non-Greek text) are left as NFKC produced them.
 *
 * @param {string} text - Text to normalize.
 * @returns {string} The normalized text.
 */
function normalizePolytonicGreek(text) {
  return text.normalize('NFKC')
    .replace(/Ά|Α|ά|ἀ|ἁ|ἂ|ἃ|ἄ|ἅ|ἆ|ἇ|ὰ|ά|ᾀ|ᾁ|ᾂ|ᾃ|ᾄ|ᾅ|ᾆ|ᾇ|ᾰ|ᾱ|ᾲ|ᾳ|ᾴ|ᾶ|ᾷ|Ἀ|Ἁ|Ἂ|Ἃ|Ἄ|Ἅ|Ἆ|Ἇ|ᾈ|ᾉ|ᾊ|ᾋ|ᾌ|ᾍ|ᾎ|ᾏ|Ᾰ|Ᾱ|Ὰ|Ά|ᾼ/g,'α')
    .replace(/Έ|Ε|έ|ἐ|ἑ|ἒ|ἓ|ἔ|ἕ|ὲ|έ|Ἐ|Ἑ|Ἒ|Ἓ|Ἔ|Ἕ|Ὲ|Έ/g,'ε')
    .replace(/Ή|Η|ή|ἠ|ἡ|ἢ|ἣ|ἤ|ἥ|ἦ|ἧ|ὴ|ή|ᾐ|ᾑ|ᾒ|ᾓ|ᾔ|ᾕ|ᾖ|ᾗ|ῂ|ῃ|ῄ|ῆ|ῇ|Ἠ|Ἡ|Ἢ|Ἣ|Ἤ|Ἥ|Ἦ|Ἧ|ᾘ|ᾙ|ᾚ|ᾛ|ᾜ|ᾝ|ᾞ|ᾟ|Ὴ|Ή|ῌ/g,'η')
    // \u03CA is lowercase iota with dialytika; it was missing from this list,
    // so words such as "προϊόν" kept their diacritic. Written as an escape
    // so the code point is unambiguous.
    .replace(/Ί|Ϊ|Ι|ί|ΐ|\u03CA|ἰ|ἱ|ἲ|ἳ|ἴ|ἵ|ἶ|ἷ|ὶ|ί|ῐ|ῑ|ῒ|ΐ|ῖ|ῗ|Ἰ|Ἱ|Ἲ|Ἳ|Ἴ|Ἵ|Ἶ|Ἷ|Ῐ|Ῑ|Ὶ|Ί/g,'ι')
    .replace(/Ό|Ο|ό|ὀ|ὁ|ὂ|ὃ|ὄ|ὅ|ὸ|ό|Ὀ|Ὁ|Ὂ|Ὃ|Ὄ|Ὅ|Ὸ|Ό/g,'ο')
    .replace(/Ύ|Ϋ|Υ|ΰ|ϋ|ύ|ὐ|ὑ|ὒ|ὓ|ὔ|ὕ|ὖ|ὗ|ὺ|ύ|ῠ|ῡ|ῢ|ΰ|ῦ|ῧ|Ὑ|Ὓ|Ὕ|Ὗ|Ῠ|Ῡ|Ὺ|Ύ/g,'υ')
    .replace(/Ώ|Ω|ώ|ὠ|ὡ|ὢ|ὣ|ὤ|ὥ|ὦ|ὧ|ὼ|ώ|ᾠ|ᾡ|ᾢ|ᾣ|ᾤ|ᾥ|ᾦ|ᾧ|ῲ|ῳ|ῴ|ῶ|ῷ|Ὠ|Ὡ|Ὢ|Ὣ|Ὤ|Ὥ|Ὦ|Ὧ|ᾨ|ᾩ|ᾪ|ᾫ|ᾬ|ᾭ|ᾮ|ᾯ|Ὼ|Ώ|ῼ/g,'ω')
    .replace(/ῤ|ῥ|Ῥ/g,'ρ')
    .replace(/Σ|ς/g,'σ');
}
/**
 * Builds a parallel corpus from two line-aligned text files.
 *
 * Line i of the source file is paired with line i of the target file. The
 * source line is passed through normalizePolytonicGreek and lowercased; the
 * target line is only lowercased. The result has one entry per source line,
 * so the target file must have at least as many lines as the source file.
 *
 * @param {string} sourceFile - Path of the Greek (source language) file.
 * @param {string} targetFile - Path of the English (target language) file.
 * @returns {Array<[string, string]>} Pairs of [source, target] sentences.
 */
function parseCorpusFiles(sourceFile, targetFile) {
  var sourceLines = lineArray(sourceFile);
  var targetLines = lineArray(targetFile);
  return sourceLines.map(function(sourceLine, lineIndex) {
    var normalizedSource = normalizePolytonicGreek(sourceLine).toLowerCase();
    var loweredTarget = targetLines[lineIndex].toLowerCase();
    return [normalizedSource, loweredTarget];
  });
}
// Driver script: loads the statistical and correctional corpora, builds the
// corrections table and then the phrase table, and finally word-aligns a
// sample of sentence pairs, printing each alignment as it is produced and the
// full result as JSON at the end. Each phase is timed with console.time.
console.time('corpus');
// statistical corpus
var sourceFileCorpus = './tests/fixtures/greekToEnglish/corpus/greek.txt';
var targetFileCorpus = './tests/fixtures/greekToEnglish/corpus/english.txt';
var corpus = parseCorpusFiles(sourceFileCorpus, targetFileCorpus);
// correctional corpus
var sourceFileCorrections = './tests/fixtures/greekToEnglish/corrections/greek.txt';
var targetFileCorrections = './tests/fixtures/greekToEnglish/corrections/english.txt';
// Declared with var: a bare assignment would create an implicit global and
// throws a ReferenceError under strict mode.
var corrections = parseCorpusFiles(sourceFileCorrections, targetFileCorrections);
console.timeEnd('corpus');
// Drop the pair at index 0 and keep at most the 10000 pairs that follow it.
corpus = corpus.slice(1, 10001);
// Only the first 100 retained pairs are aligned.
// Alternative sample kept from the original script: corpus.slice(1, 18).
var alignmentPairs = corpus.slice(0, 100);
/** Forwards a table generator's completion percentage to the CLI progress bar. */
function reportProgress(percent) {
  cli.progress(percent);
}
console.time('table');
// The phrase table is generated only after the corrections table has finished.
correctionsTable.generate(corrections, reportProgress, function() {
  console.log('correctionsTable done.');
  phraseTable.generate(corpus, reportProgress, function() {
    console.timeEnd('table');
    console.time('alignment');
    // Align the pairs one at a time, in order, collecting the results.
    async.mapSeries(alignmentPairs,
      function(alignmentPair, callback) {
        wordAligner.align(alignmentPair, function(alignments) {
          console.log("\n\n\tSource: ", alignmentPair[0], "\n\tTarget: ", alignmentPair[1], "\n\tAlignments:\n", alignments);
          callback(null, alignments);
        });
      },
      function(err, allAlignments) {
        console.timeEnd('alignment');
        if (err) {
          // Surface the failure instead of printing partial results.
          throw err;
        }
        console.log(JSON.stringify(allAlignments, null, 2));
      }
    );
  });
});