Commit 6342e605765fd2f5b9171c1ced4e92a19438c5e3
1 parent
f03ad4a854
Exists in
master
Added the option to give an input file for the full proteins to locateFragment.pl
Showing 1 changed file with 21 additions and 3 deletions Inline Diff
locateFragment.pl
View file @
6342e60
#!/usr/bin/env perl -w | 1 | 1 | #!/usr/bin/env perl -w | |
2 | 2 | |||
use strict; | 3 | 3 | use strict; | |
use warnings; | 4 | 4 | use warnings; | |
use Data::Dumper; | 5 | 5 | use Data::Dumper; | |
6 | 6 | |||
$Data::Dumper::Deepcopy = 1; | 7 | 7 | $Data::Dumper::Deepcopy = 1; | |
8 | 8 | |||
use Getopt::Long; | 9 | 9 | use Getopt::Long; | |
use Bio::SearchIO; | 10 | 10 | use Bio::SearchIO; | |
use Bio::SeqIO; | 11 | 11 | use Bio::SeqIO; | |
12 | 12 | |||
13 | 13 | |||
#Local libraries | 14 | 14 | #Local libraries | |
use TCDB::CheckDependencies; | 15 | 15 | use TCDB::CheckDependencies; | |
use TCDB::Assorted; | 16 | 16 | use TCDB::Assorted; | |
17 | 17 | |||
18 | 18 | |||
########################################################################### | 19 | 19 | ########################################################################### | |
# | 20 | 20 | # | |
# Parse GBLAST output file and return the sequence of the proteins that | 21 | 21 | # Parse GBLAST output file and return the sequence of the proteins that | |
# meet the user's criteria. | 22 | 22 | # meet the user's criteria. | |
# | 23 | 23 | # | |
########################################################################### | 24 | 24 | ########################################################################### | |
25 | 25 | |||
26 | 26 | |||
#This will prevent quod and alnquod from going into interactive mode | 27 | 27 | #This will prevent quod and alnquod from going into interactive mode | |
$ENV{"MPLBACKEND"} = "agg"; | 28 | 28 | $ENV{"MPLBACKEND"} = "agg"; | |
29 | 29 | |||
30 | 30 | |||
#========================================================================== | 31 | 31 | #========================================================================== | |
#Check dependencies | 32 | 32 | #Check dependencies | |
33 | 33 | |||
my @dependencies = ("glsearch36", "blastdbcmd"); | 34 | 34 | my @dependencies = ("glsearch36", "blastdbcmd"); | |
my $CheckDep_obj = new TCDB::CheckDependencies(); | 35 | 35 | my $CheckDep_obj = new TCDB::CheckDependencies(); | |
$CheckDep_obj -> dependencies_list(\@dependencies); | 36 | 36 | $CheckDep_obj -> dependencies_list(\@dependencies); | |
$CheckDep_obj -> checkDependencies; | 37 | 37 | $CheckDep_obj -> checkDependencies; | |
38 | 38 | |||
39 | 39 | |||
40 | 40 | |||
41 | 41 | |||
#========================================================================== | 42 | 42 | #========================================================================== | |
#Read command line arguments | 43 | 43 | #Read command line arguments | |
44 | 44 | |||
my $fragment = undef; | 45 | 45 | my $fragment = undef; | |
my $accession = undef; | 46 | 46 | my $accession = undef; | |
my $accFile = undef | 47 | 47 | my $accFile = undef | |
my $outdir = undef; | 48 | 48 | my $outdir = undef; | |
my $blastdb = undef; | 49 | 49 | my $blastdb = undef; | |
my $evalue = 1e-2; | 50 | 50 | my $evalue = 1e-2; | |
my $subMatrix = 'BL50'; | 51 | 51 | my $subMatrix = 'BL50'; | |
my $quiet = 0; | 52 | 52 | my $quiet = 0; | |
my $interactive = 0; | 53 | 53 | my $interactive = 0; | |
54 | 54 | |||
read_command_line(); | 55 | 55 | read_command_line(); | |
56 | 56 | |||
#print Data::Dumper->Dump([$fragment, $accession, $accFile, $outdir, $blastdb, $evalue, $quiet], | 57 | 57 | #print Data::Dumper->Dump([$fragment, $accession, $accFile, $outdir, $blastdb, $evalue, $quiet], | |
# [qw(*fragment *accession *accFile *outdir *blastdb *evalue *quiet)]); | 58 | 58 | # [qw(*fragment *accession *accFile *outdir *blastdb *evalue *quiet)]); | |
#exit; | 59 | 59 | #exit; | |
60 | 60 | |||
61 | 61 | |||
62 | 62 | |||
63 | 63 | |||
#========================================================================== | 64 | 64 | #========================================================================== | |
#Get the sequences for the fragment and full proteins in files | 65 | 65 | #Get the sequences for the fragment and full proteins in files | |
66 | 66 | |||
67 | 67 | |||
my ($fragFile, $protFile) = @{ getSequences($fragment, $accession) }; | 68 | 68 | my ($fragFile, $protFile) = @{ getSequences($fragment, $accession) }; | |
#print Data::Dumper->Dump([$fragFile, $protFile ], [qw(*fragFile *protFile)]); | 69 | 69 | #print Data::Dumper->Dump([$fragFile, $protFile ], [qw(*fragFile *protFile)]); | |
70 | 70 | |||
die "Both files must exist:\n $fragFile\n $protFile" unless (-f $fragFile && -f $protFile); | 71 | 71 | die "Both files must exist:\n $fragFile\n $protFile" unless (-f $fragFile && -f $protFile); | |
72 | 72 | |||
#========================================================================== | 73 | 73 | #========================================================================== | |
#align fragment to full protein | 74 | 74 | #align fragment to full protein | |
75 | 75 | |||
my $alignFile = "$outdir/${accession}_glsearch.out"; | 76 | 76 | my $alignFile = "$outdir/${accession}_glsearch.out"; | |
my $cmd = qq(glsearch36 -s BL62 -z 21 -k 10000 -E $evalue -s $subMatrix $fragFile $protFile > $alignFile); | 77 | 77 | my $cmd = qq(glsearch36 -s BL62 -z 21 -k 10000 -E $evalue -s $subMatrix $fragFile $protFile > $alignFile); | |
system $cmd unless (-f $alignFile); | 78 | 78 | system $cmd unless (-f $alignFile); | |
79 | 79 | |||
80 | 80 | |||
#========================================================================== | 81 | 81 | #========================================================================== | |
#Parse glsearch output and run quod | 82 | 82 | #Parse glsearch output and run quod | |
83 | 83 | |||
84 | 84 | |||
run_quod ($alignFile, $protFile); | 85 | 85 | run_quod ($alignFile, $protFile); | |
86 | 86 | |||
87 | 87 | |||
88 | 88 | |||
89 | 89 | |||
90 | 90 | |||
#========================================================================== | 91 | 91 | #========================================================================== | |
################ Subroutines definition beyond ths point ############## | 92 | 92 | ################ Subroutines definition beyond ths point ############## | |
#========================================================================== | 93 | 93 | #========================================================================== | |
94 | 94 | |||
95 | 95 | |||
sub run_quod { | 96 | 96 | sub run_quod { | |
97 | 97 | |||
my ($alignment, $sequence) = @_; | 98 | 98 | my ($alignment, $sequence) = @_; | |
99 | 99 | |||
100 | 100 | |||
my $parser = new Bio::SearchIO (-format => 'fasta', -file => $alignment); | 101 | 101 | my $parser = new Bio::SearchIO (-format => 'fasta', -file => $alignment); | |
102 | 102 | |||
my @res = (); | 103 | 103 | my @res = (); | |
104 | 104 | |||
#---------------------------------------------------------------------- | 105 | 105 | #---------------------------------------------------------------------- | |
#Parse glsearch output | 106 | 106 | #Parse glsearch output | |
107 | 107 | |||
my $hsp_cnt = 1; | 108 | 108 | my $hsp_cnt = 1; | |
while (my $result = $parser->next_result) { | 109 | 109 | while (my $result = $parser->next_result) { | |
110 | 110 | |||
HIT:while (my $hit = $result->next_hit) { | 111 | 111 | HIT:while (my $hit = $result->next_hit) { | |
HSP:while(my $hsp = $hit->next_hsp) { | 112 | 112 | HSP:while(my $hsp = $hit->next_hsp) { | |
113 | 113 | |||
my %data = (); | 114 | 114 | my %data = (); | |
my $key = "hsp_$hsp_cnt"; | 115 | 115 | my $key = "hsp_$hsp_cnt"; | |
my $hId = $hsp->frac_identical('total'); #identity in the alignment | 116 | 116 | my $hId = $hsp->frac_identical('total'); #identity in the alignment | |
my $hSim = $hsp->frac_conserved('total'); #similarity in the alignment | 117 | 117 | my $hSim = $hsp->frac_conserved('total'); #similarity in the alignment | |
118 | 118 | |||
#Alignment parameters | 119 | 119 | #Alignment parameters | |
$data{hsp} = $key; | 120 | 120 | $data{hsp} = $key; | |
$data{evalue} = $hsp->evalue; | 121 | 121 | $data{evalue} = $hsp->evalue; | |
$data{id} = $hId; | 122 | 122 | $data{id} = $hId; | |
$data{sim} = $hSim; | 123 | 123 | $data{sim} = $hSim; | |
124 | 124 | |||
#coordinates in the alignment to plot bars | 125 | 125 | #coordinates in the alignment to plot bars | |
$data{qstart} = $hsp->start('query'); | 126 | 126 | $data{qstart} = $hsp->start('query'); | |
$data{qend} = $hsp->end('query'); | 127 | 127 | $data{qend} = $hsp->end('query'); | |
$data{sstart} = $hsp->start('subject'); | 128 | 128 | $data{sstart} = $hsp->start('subject'); | |
$data{send} = $hsp->end('subject'); | 129 | 129 | $data{send} = $hsp->end('subject'); | |
130 | 130 | |||
push (@res, \%data); | 131 | 131 | push (@res, \%data); | |
$hsp_cnt++; | 132 | 132 | $hsp_cnt++; | |
} | 133 | 133 | } | |
} | 134 | 134 | } | |
} | 135 | 135 | } | |
136 | 136 | |||
die "Error: no match found between fragment and sequence" unless (@res); | 137 | 137 | die "Error: no match found between fragment and sequence" unless (@res); | |
# print Data::Dumper->Dump([\@res ], [qw(*res )]); | 138 | 138 | # print Data::Dumper->Dump([\@res ], [qw(*res )]); | |
139 | 139 | |||
140 | 140 | |||
#---------------------------------------------------------------------- | 141 | 141 | #---------------------------------------------------------------------- | |
#Generate quod plot | 142 | 142 | #Generate quod plot | |
143 | 143 | |||
#Format string for the regions | 144 | 144 | #Format string for the regions | |
my $regions = "-at "; | 145 | 145 | my $regions = "-at "; | |
my $coords = ""; | 146 | 146 | my $coords = ""; | |
foreach my $hit (@res) { | 147 | 147 | foreach my $hit (@res) { | |
148 | 148 | |||
$coords = $hit->{sstart} . "-" . $hit->{send}; | 149 | 149 | $coords = $hit->{sstart} . "-" . $hit->{send}; | |
$regions .= "${coords}:green"; | 150 | 150 | $regions .= "${coords}:green"; | |
151 | 151 | |||
#only the best HSP is required to be plotted | 152 | 152 | #only the best HSP is required to be plotted | |
last; | 153 | 153 | last; | |
} | 154 | 154 | } | |
155 | 155 | |||
my $outPlot = "$outdir/${accession}_map_frag.png"; | 156 | 156 | my $outPlot = "$outdir/${accession}_map_frag.png"; | |
my $qstring = ($quiet)? "-q -o $outPlot" : "-o $outPlot"; | 157 | 157 | my $qstring = ($quiet)? "-q -o $outPlot" : "-o $outPlot"; | |
my $iString = ($interactive)? "-o $outPlot --show" : ""; | 158 | 158 | my $iString = ($interactive)? "-o $outPlot --show" : ""; | |
159 | 159 | |||
my $cmd = qq(quod.py $qstring $iString -l "$accession ($coords)" --xticks 25 --grid $regions -- $sequence); | 160 | 160 | my $cmd = qq(quod.py $qstring $iString -l "$accession ($coords)" --xticks 25 --grid $regions -- $sequence); | |
print "$cmd\n"; | 161 | 161 | print "$cmd\n"; | |
system $cmd; | 162 | 162 | system $cmd; | |
} | 163 | 163 | } | |
164 | 164 | |||
165 | 165 | |||
166 | 166 | |||
167 | 167 | |||
sub getSequences { | 168 | 168 | sub getSequences { | |
169 | 169 | |||
my ($frag, $acc) = @_; | 170 | 170 | my ($frag, $acc) = @_; | |
171 | 171 | |||
172 | ||||
173 | # print Data::Dumper->Dump([$frag, $acc, $accFile ], [qw(*frag *acc *accFile)]); | |||
174 | # exit; | |||
175 | ||||
176 | ||||
#Sequence for full protein | 172 | 177 | #Sequence for full protein | |
my $accSeq = "$outdir/${acc}.faa"; | 173 | 178 | my $accSeq = (-f $accFile)? $accFile : "$outdir/${acc}.faa"; | |
174 | 179 | |||
175 | 180 | |||
#Save fragment to file | 176 | 181 | #Save fragment to file | |
my $tmpFile = "$outdir/${acc}_frag.faa"; | 177 | 182 | my $tmpFile = "$outdir/${acc}_frag.faa"; | |
178 | 183 | |||
179 | 184 | |||
if (-f $frag) { | 180 | 185 | if (-f $frag) { | |
system "mv $frag $tmpFile" unless (-f $tmpFile); | 181 | 186 | system "mv $frag $tmpFile" unless (-f $tmpFile); | |
} | 182 | 187 | } | |
else { | 183 | 188 | else { | |
unless (-f $tmpFile) { | 184 | 189 | unless (-f $tmpFile) { | |
open (my $fh, ">", $tmpFile) || die $!; | 185 | 190 | open (my $fh, ">", $tmpFile) || die $!; | |
print $fh ">${acc} fragment\n$frag\n"; | 186 | 191 | print $fh ">${acc} fragment\n$frag\n"; | |
close $fh; | 187 | 192 | close $fh; | |
} | 188 | 193 | } | |
} | 189 | 194 | } | |
190 | 195 | |||
unless (-f $accSeq) { | 191 | 196 | unless (-f $accSeq) { | |
#Blast DB to be used | 192 | 197 | #Blast DB to be used | |
my $db = ($blastdb)? $blastdb : 'nr'; | 193 | 198 | my $db = ($blastdb)? $blastdb : 'nr'; | |
194 | 199 | |||
my $cmd = qq(blastdbcmd -db $db -entry $acc -target_only -outfmt '\%f' -out $accSeq); | 195 | 200 | my $cmd = qq(blastdbcmd -db $db -entry $acc -target_only -outfmt '\%f' -out $accSeq); | |
system $cmd; | 196 | 201 | system $cmd; | |
197 | 202 | |||
#Remove the version and annotations from the sequence file | 198 | 203 | #Remove the version and annotations from the sequence file | |
my $cmd2 = qq(perl -i.bkp -pe 's/^\\>(\\w+)\.\*/\\>\$1/;' $accSeq); | 199 | 204 | my $cmd2 = qq(perl -i.bkp -pe 's/^\\>(\\w+)\.\*/\\>\$1/;' $accSeq); | |
system $cmd2 unless (-f "${accSeq}.pkp"); | 200 | 205 | system $cmd2 unless (-f "${accSeq}.pkp"); | |
} | 201 | 206 | } | |
202 | 207 | |||
#Return files to be aligned | 203 | 208 | #Return files to be aligned | |
return [$tmpFile, $accSeq]; | 204 | 209 | return [$tmpFile, $accSeq]; | |
} | 205 | 210 | } | |
206 | 211 | |||
207 | 212 | |||
208 | 213 | |||
209 | 214 | |||
#=========================================================================== | 210 | 215 | #=========================================================================== | |
#Read command line and print help | 211 | 216 | #Read command line and print help | |
212 | 217 | |||
213 | 218 | |||
sub read_command_line { | 214 | 219 | sub read_command_line { | |
215 | 220 | |||
print_help() unless (@ARGV); | 216 | 221 | print_help() unless (@ARGV); | |
217 | 222 | |||
my $status = GetOptions( | 218 | 223 | my $status = GetOptions( | |
"i|acc-file=s" => \&read_accFile, | 219 | 224 | "i|acc-file=s" => \&read_accFile, | |
"a|accession=s" => \&read_accession, | 220 | 225 | "a|accession=s" => \&read_accession, | |
"f|fragment=s" => \&read_fragment, | 221 | 226 | "f|fragment=s" => \&read_fragment, | |
"o|outdir=s" => \$outdir, | 222 | 227 | "o|outdir=s" => \$outdir, | |
"bdb|blastdb=s" => \&read_blastdb, | 223 | 228 | "bdb|blastdb=s" => \&read_blastdb, | |
"e|evalue=f" => \$evalue, | 224 | 229 | "e|evalue=f" => \$evalue, | |
"m|sub-matrix=s" => \&read_subMatrix, | 225 | 230 | "m|sub-matrix=s" => \&read_subMatrix, | |
"t|interactive!" => \$interactive, | 226 | 231 | "t|interactive!" => \$interactive, | |
"q|quiet!" => \$quiet, | 227 | 232 | "q|quiet!" => \$quiet, | |
"h|help" => sub { print_help(); }, | 228 | 233 | "h|help" => sub { print_help(); }, | |
"<>" => sub { die "Error: Unknown argument: $_[0]\n"; }); | 229 | 234 | "<>" => sub { die "Error: Unknown argument: $_[0]\n"; }); | |
exit unless ($status); | 230 | 235 | exit unless ($status); | |
231 | 236 | |||
232 | 237 | |||
die "Error: option -f is mandatory." unless ($fragment); | 233 | 238 | die "Error: option -f is mandatory." unless ($fragment); | |
die "Error: options -i or -a are mandatory." unless ($accession || $accFile); | 234 | 239 | die "Error: options -i or -a are mandatory." unless ($accession || $accFile); | |
die "Error: flags -t and -q cannot be set at the same time!" if ($quiet && $interactive); | 235 | 240 | die "Error: flags -t and -q cannot be set at the same time!" if ($quiet && $interactive); | |
236 | 241 | |||
#Default value for output directory | 237 | 242 | #Default value for output directory | |
$outdir = "." unless ($outdir); | 238 | 243 | $outdir = "." unless ($outdir); | |
system "mkdir -p $outdir" unless (-d $outdir); | 239 | 244 | system "mkdir -p $outdir" unless (-d $outdir); | |
245 | ||||
246 | #When accession is not given, take the first part of the input file name as accession | |||
247 | unless ($accession) { | |||
248 | my @pathComp = split(/\//, $accFile); | |||
249 | $accession = $pathComp[-1]; | |||
250 | $accession =~ s/(\.faa|\.fasta)$//g; | |||
251 | } | |||
252 | ||||
} | 240 | 253 | } | |
241 | 254 |