-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgetParentTaxa.pl
97 lines (84 loc) · 2.28 KB
/
getParentTaxa.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/perl -w
use lib '/home/josh/perl5/lib';
use lib '/home/josh/perl5/lib/perl5';
use strict;
use Fcntl ':flock';
my $qgiFile = shift(@ARGV);
my $hitDir = shift(@ARGV);
my $taxaFile = shift(@ARGV);
my $sourceFilesDir = shift(@ARGV);
my $parentTaxaDataPath = $sourceFilesDir . "/parentTaxaData.txt";
#1: build gi_lookup_hash{$gi} = $taxon; load @taxaArr
#get taxa list
my $cmd="touch $taxaFile";
system($cmd);
open (taxa, $taxaFile);
my @taxaArr=<taxa>;
my $count=0;
foreach my $taxon (@taxaArr)
{
chomp($taxon); #removes any newline characters at the end of the string
$taxaArr[$count] = $taxon;
$count++;
}
#use the hitfiles for a taxon blasted against itself to make a hash of every gi with its parent taxon
my %gi_lookup_hash=();
my @giArr=();
foreach my $taxon (@taxaArr)
{
my $targetName = $taxon . "_" . $taxon;
open (hitFile, "$hitDir/$targetName");
@giArr = <hitFile>;
close hitFile;
$count=0;
foreach my $gi (@giArr) #we just want the query gis, not the hits
{
chomp($gi);
$gi =~ s/^(.+?)\s+.+?\s+\d+.*$/$1/;
$giArr[$count] = $gi;
#print $giArr[$count] . "\n";
$count++;
}
my @nrGiArr=(); my $tempGi=-1; #filter out consecutive duplicate GIs
while ($#giArr>=0)
{
$tempGi=shift(@giArr);
#print "comparing $giArr[0] to $tempGi\n";
if (defined $giArr[0]) {
if ($giArr[0] ne $tempGi)
{
push @nrGiArr, $tempGi;
#print "added $tempGi to nrGiArr\n";
}
}
else {
push @nrGiArr, $tempGi;
#print "added $tempGi to nrGiArr\n";
}
}
foreach my $gi (@nrGiArr) #add gi & its taxon to gi_lookup_hash
{
$gi_lookup_hash{$gi} = $taxon; # this should add (key, value) to gi_lookup_hash where key is gi & value is parent taxon
#print "adding $gi of taxon $taxon\n";
}
}
#2: assign query gis to their parent taxa
open (BATCH_QUERY, "$qgiFile");
my @queryArr_unparsed = <BATCH_QUERY>; my %tempHash=();
close BATCH_QUERY;
foreach my $line (@queryArr_unparsed) {
chomp($line);
if (!($line =~ m/^h\s.+$/)) {
if ($line =~ m/^(.+?)\s(\d+)$/) {
#tempHash{<gi>} = <parent taxon>
$tempHash{$1} = $gi_lookup_hash{$1};
#print "tempHash\{" . $1 . "\} = " . $gi_lookup_hash{$1} . "\n";
}
}
}
#3: output tempHash to file
open out, ">$parentTaxaDataPath";
foreach my $gi (keys %tempHash) {
print out $gi . " " . $tempHash{$gi} . "\n";
}
close out;