-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmakeEnJaLSD.pl
executable file
·102 lines (91 loc) · 2.57 KB
/
makeEnJaLSD.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/perl
use warnings;
use strict;
use Fatal qw/open/;
use Getopt::Std;
use open 'utf8';
use constant LSD_Version => '2019';
# Standard input and output carry UTF-8 text.
binmode STDIN, ":encoding(utf8)";
binmode STDOUT, ":encoding(utf8)";

# Command line:
#   -s NAME   (required) which mapping to process: LSD, MP or MD
#   -u        also print the ">>>H" (unmatched) lines of the mapping file
# getopts() is used rather than getopt(): the "s:" specification is getopts
# syntax, and -u has to be declared as a plain flag for it to be accepted.
our(%opts);
my $usage = "Usage: $0 -s {LSD|MP|MD} [-u]\n";
getopts('s:u', \%opts) or die $usage;
die $usage unless($opts{"s"});

# Label files read into the dictionary, depending on -s.
my $file1 = "/data/yayamamo/LSD/".LSD_Version."/03Eng-Jap_utf8.txt";
my $file2 = "/data/yayamamo/LSD/".LSD_Version."/16Desc_utf8.txt";
my $file3 = "/data/yayamamo/hpo/MP_eav0904_tsv.txt";
my $file4 = "/data/yayamamo/hpo/Medical-Dictionary.tsv";

# Per-source settings:
#   source   - mapping file that is read and printed
#   strip    - matches the trailing ":code" of an item (removed to get the word)
#   capture  - same suffix, capturing the code itself in $1
my %config_for = (
    LSD => {
        source  => "/data/yayamamo/hpo/LSD2HPO.txt",
        strip   => qr/:[A-Z]\d+$/,
        capture => qr/:([A-Z]\d+)$/,
    },
    MP => {
        source  => "/data/yayamamo/hpo/MP_eav0904_TO_HPO.txt",
        strip   => qr/:MP:\d+$/,
        capture => qr/:(MP:\d+)$/,
    },
    MD => {
        source  => "/data/yayamamo/hpo/MD2HPO.txt",
        strip   => qr/:\d+$/,
        capture => qr/:(\d+)$/,
    },
);

# An unrecognised -s value used to leave $source and both patterns undefined
# for the rest of the script; reject it here with an explicit message.
my $config = $config_for{$opts{"s"}}
    or die "Unknown source '$opts{s}' given to -s\n", $usage;

my $source = $config->{source};
my ($pattern1, $pattern2) = @{$config}{qw(strip capture)};
# %dictionary maps an ID (taken from a label file) to the list of labels
# collected for it -- the $jlabel column, presumably the Japanese label.
my %dictionary;
# File handle shared by every open() in this script.
my $fh;

# Read one tab-separated file into %dictionary.
#   $path - file to read, decoded as UTF-8
#   $pick - code ref that receives the columns of one line and returns
#           (ID, label), or an empty list when the line is to be skipped
my $read_labels = sub {
    my ($path, $pick) = @_;
    open($fh, "<:encoding(utf8)", $path);
    while(defined(my $row = <$fh>)){
        chomp $row;
        # Limit -1 keeps trailing empty columns, so a picked column that is
        # present but empty stays '' rather than becoming undef.
        my @pair = $pick->(split(/\t/, $row, -1));
        next unless @pair;
        push @{$dictionary{$pair[0]}}, $pair[1];
    }
    close($fh);
};

# Column pickers: ID is always the first column; the label column varies.
my $id_and_second = sub { @_[0, 1] };
my $id_and_fourth = sub { @_[0, 3] };
# Medical-Dictionary rows: skip the header row (ID column equal to 'id') and
# every row whose third column is 'true'.
my $md_row = sub {
    my ($id, $jlabel, $ejflag) = @_;
    return if $id eq 'id';
    return if $ejflag eq 'true';
    return ($id, $jlabel);
};

# Which files feed the dictionary for each -s value, in reading order.
my %label_files_for = (
    LSD => [ [$file1, $id_and_fourth], [$file2, $id_and_second] ],
    MP  => [ [$file3, $id_and_second] ],
    MD  => [ [$file4, $md_row] ],
);

# Any other -s value loads nothing.
for my $spec (@{ $label_files_for{$opts{"s"}} || [] }){
    $read_labels->(@{$spec});
}
# Walk the mapping file selected by -s and print one tab-separated line per
# record.  Only lines whose first character is ">" are considered:
#   >H...    exact match:   id, label, code
#   >>H...   partial match: id, label, list of "word:code" items
#   >>>H...  unmatched:     echoed without the ">>>" prefix, only when -u is set
open($fh, "<:encoding(utf8)", $source);
while(defined(my $line = <$fh>)){
    chomp $line;
    next if index($line, ">") != 0;
    if($line =~ /^>H/){ # Exact match
        my ($hpid, $label, $ecode) = split /\t/, $line;
        $hpid = substr($hpid, 1); # drop the leading ">"
        # A code with no dictionary entry (or a missing code column) gives an
        # empty translation.  Dereferencing the undefined entry directly is a
        # fatal error under "use strict" and would abort the run mid-file.
        my $jlabels = defined $ecode ? $dictionary{$ecode} : undef;
        print join("\t", ($hpid, $label, join(" OR ", @{$jlabels || []}))), "\n";
    }elsif($line =~ /^>>H/){ # Partial match
        my ($hpid, $label, $ecodewords) = split /\t/, $line;
        $hpid = substr($hpid, 2); # drop the leading ">>"
        my (@ewordset, @ecodeset);
        # Items are separated by $; (Perl's subscript separator, "\034" unless
        # it has been reassigned).  A missing third column means no items.
        for my $item (defined $ecodewords ? split(/$;/, $ecodewords) : ()){
            # Record the code only when the item really ends in one, so $1 is
            # never read after a failed match.
            push @ecodeset, $1 if $item =~ /${pattern2}/;
            # The word is the item with its trailing ":code" removed.
            (my $eword = $item) =~ s/${pattern1}//;
            push @ewordset, $eword;
        }
        # Codes without a dictionary entry are dropped by the grep, so the last
        # column can hold fewer "|"-separated fields than the word column.
        print join("\t",
                   ($hpid, $label,
                    join("|", @ewordset),
                    join("|", map{ join(" OR ", @{$dictionary{$_}} )} grep {$dictionary{$_}} @ecodeset),
                   )), "\n";
    }elsif($opts{"u"} && $line =~ /^>>>H/){ # Unmatch
        print substr($line, 3), "\n";
    }
}
close($fh);
__END__