Commit 6342e605765fd2f5b9171c1ced4e92a19438c5e3

Authored by Luis Arturo Medrano-Soto
1 parent f03ad4a854
Exists in master

Added the option to give an input file for the full proteins to locateFragment.pl

Showing 1 changed file with 21 additions and 3 deletions Inline Diff

locateFragment.pl View file @ 6342e60
#!/usr/bin/env perl -w 1 1 #!/usr/bin/env perl -w
2 2
use strict; 3 3 use strict;
use warnings; 4 4 use warnings;
use Data::Dumper; 5 5 use Data::Dumper;
6 6
$Data::Dumper::Deepcopy = 1; 7 7 $Data::Dumper::Deepcopy = 1;
8 8
use Getopt::Long; 9 9 use Getopt::Long;
use Bio::SearchIO; 10 10 use Bio::SearchIO;
use Bio::SeqIO; 11 11 use Bio::SeqIO;
12 12
13 13
#Local libraries 14 14 #Local libraries
use TCDB::CheckDependencies; 15 15 use TCDB::CheckDependencies;
use TCDB::Assorted; 16 16 use TCDB::Assorted;
17 17
18 18
########################################################################### 19 19 ###########################################################################
# 20 20 #
# Parse GBLAST output file and return the sequence of the proteins that 21 21 # Parse GBLAST output file and return the sequence of the proteins that
# meet the user's criteria. 22 22 # meet the user's criteria.
# 23 23 #
########################################################################### 24 24 ###########################################################################
25 25
26 26
#This will prevent quod and alnquod from going into interactive mode 27 27 #This will prevent quod and alnquod from going into interactive mode
$ENV{"MPLBACKEND"} = "agg"; 28 28 $ENV{"MPLBACKEND"} = "agg";
29 29
30 30
#========================================================================== 31 31 #==========================================================================
#Check dependencies 32 32 #Check dependencies
33 33
my @dependencies = ("glsearch36", "blastdbcmd"); 34 34 my @dependencies = ("glsearch36", "blastdbcmd");
my $CheckDep_obj = new TCDB::CheckDependencies(); 35 35 my $CheckDep_obj = new TCDB::CheckDependencies();
$CheckDep_obj -> dependencies_list(\@dependencies); 36 36 $CheckDep_obj -> dependencies_list(\@dependencies);
$CheckDep_obj -> checkDependencies; 37 37 $CheckDep_obj -> checkDependencies;
38 38
39 39
40 40
41 41
#========================================================================== 42 42 #==========================================================================
#Read command line arguments 43 43 #Read command line arguments
44 44
my $fragment = undef; 45 45 my $fragment = undef;
my $accession = undef; 46 46 my $accession = undef;
my $accFile = undef 47 47 my $accFile = undef
my $outdir = undef; 48 48 my $outdir = undef;
my $blastdb = undef; 49 49 my $blastdb = undef;
my $evalue = 1e-2; 50 50 my $evalue = 1e-2;
my $subMatrix = 'BL50'; 51 51 my $subMatrix = 'BL50';
my $quiet = 0; 52 52 my $quiet = 0;
my $interactive = 0; 53 53 my $interactive = 0;
54 54
read_command_line(); 55 55 read_command_line();
56 56
#print Data::Dumper->Dump([$fragment, $accession, $accFile, $outdir, $blastdb, $evalue, $quiet], 57 57 #print Data::Dumper->Dump([$fragment, $accession, $accFile, $outdir, $blastdb, $evalue, $quiet],
# [qw(*fragment *accession *accFile *outdir *blastdb *evalue *quiet)]); 58 58 # [qw(*fragment *accession *accFile *outdir *blastdb *evalue *quiet)]);
#exit; 59 59 #exit;
60 60
61 61
62 62
63 63
#========================================================================== 64 64 #==========================================================================
#Get the sequences for the fragment and full proteins in files 65 65 #Get the sequences for the fragment and full proteins in files
66 66
67 67
my ($fragFile, $protFile) = @{ getSequences($fragment, $accession) }; 68 68 my ($fragFile, $protFile) = @{ getSequences($fragment, $accession) };
#print Data::Dumper->Dump([$fragFile, $protFile ], [qw(*fragFile *protFile)]); 69 69 #print Data::Dumper->Dump([$fragFile, $protFile ], [qw(*fragFile *protFile)]);
70 70
die "Both files must exist:\n $fragFile\n $protFile" unless (-f $fragFile && -f $protFile); 71 71 die "Both files must exist:\n $fragFile\n $protFile" unless (-f $fragFile && -f $protFile);
72 72
#========================================================================== 73 73 #==========================================================================
#align fragment to full protein 74 74 #align fragment to full protein
75 75
my $alignFile = "$outdir/${accession}_glsearch.out"; 76 76 my $alignFile = "$outdir/${accession}_glsearch.out";
my $cmd = qq(glsearch36 -s BL62 -z 21 -k 10000 -E $evalue -s $subMatrix $fragFile $protFile > $alignFile); 77 77 my $cmd = qq(glsearch36 -s BL62 -z 21 -k 10000 -E $evalue -s $subMatrix $fragFile $protFile > $alignFile);
system $cmd unless (-f $alignFile); 78 78 system $cmd unless (-f $alignFile);
79 79
80 80
#========================================================================== 81 81 #==========================================================================
#Parse glsearch output and run quod 82 82 #Parse glsearch output and run quod
83 83
84 84
run_quod ($alignFile, $protFile); 85 85 run_quod ($alignFile, $protFile);
86 86
87 87
88 88
89 89
90 90
#========================================================================== 91 91 #==========================================================================
################ Subroutines definition beyond ths point ############## 92 92 ################ Subroutines definition beyond ths point ##############
#========================================================================== 93 93 #==========================================================================
94 94
95 95
sub run_quod { 96 96 sub run_quod {
97 97
my ($alignment, $sequence) = @_; 98 98 my ($alignment, $sequence) = @_;
99 99
100 100
my $parser = new Bio::SearchIO (-format => 'fasta', -file => $alignment); 101 101 my $parser = new Bio::SearchIO (-format => 'fasta', -file => $alignment);
102 102
my @res = (); 103 103 my @res = ();
104 104
#---------------------------------------------------------------------- 105 105 #----------------------------------------------------------------------
#Parse glsearch output 106 106 #Parse glsearch output
107 107
my $hsp_cnt = 1; 108 108 my $hsp_cnt = 1;
while (my $result = $parser->next_result) { 109 109 while (my $result = $parser->next_result) {
110 110
HIT:while (my $hit = $result->next_hit) { 111 111 HIT:while (my $hit = $result->next_hit) {
HSP:while(my $hsp = $hit->next_hsp) { 112 112 HSP:while(my $hsp = $hit->next_hsp) {
113 113
my %data = (); 114 114 my %data = ();
my $key = "hsp_$hsp_cnt"; 115 115 my $key = "hsp_$hsp_cnt";
my $hId = $hsp->frac_identical('total'); #identity in the alignment 116 116 my $hId = $hsp->frac_identical('total'); #identity in the alignment
my $hSim = $hsp->frac_conserved('total'); #similarity in the alignment 117 117 my $hSim = $hsp->frac_conserved('total'); #similarity in the alignment
118 118
#Alignment parameters 119 119 #Alignment parameters
$data{hsp} = $key; 120 120 $data{hsp} = $key;
$data{evalue} = $hsp->evalue; 121 121 $data{evalue} = $hsp->evalue;
$data{id} = $hId; 122 122 $data{id} = $hId;
$data{sim} = $hSim; 123 123 $data{sim} = $hSim;
124 124
#coordinates in the alignment to plot bars 125 125 #coordinates in the alignment to plot bars
$data{qstart} = $hsp->start('query'); 126 126 $data{qstart} = $hsp->start('query');
$data{qend} = $hsp->end('query'); 127 127 $data{qend} = $hsp->end('query');
$data{sstart} = $hsp->start('subject'); 128 128 $data{sstart} = $hsp->start('subject');
$data{send} = $hsp->end('subject'); 129 129 $data{send} = $hsp->end('subject');
130 130
push (@res, \%data); 131 131 push (@res, \%data);
$hsp_cnt++; 132 132 $hsp_cnt++;
} 133 133 }
} 134 134 }
} 135 135 }
136 136
die "Error: no match found between fragment and sequence" unless (@res); 137 137 die "Error: no match found between fragment and sequence" unless (@res);
# print Data::Dumper->Dump([\@res ], [qw(*res )]); 138 138 # print Data::Dumper->Dump([\@res ], [qw(*res )]);
139 139
140 140
#---------------------------------------------------------------------- 141 141 #----------------------------------------------------------------------
#Generate quod plot 142 142 #Generate quod plot
143 143
#Format string for the regions 144 144 #Format string for the regions
my $regions = "-at "; 145 145 my $regions = "-at ";
my $coords = ""; 146 146 my $coords = "";
foreach my $hit (@res) { 147 147 foreach my $hit (@res) {
148 148
$coords = $hit->{sstart} . "-" . $hit->{send}; 149 149 $coords = $hit->{sstart} . "-" . $hit->{send};
$regions .= "${coords}:green"; 150 150 $regions .= "${coords}:green";
151 151
#only the best HSP is required to be plotted 152 152 #only the best HSP is required to be plotted
last; 153 153 last;
} 154 154 }
155 155
my $outPlot = "$outdir/${accession}_map_frag.png"; 156 156 my $outPlot = "$outdir/${accession}_map_frag.png";
my $qstring = ($quiet)? "-q -o $outPlot" : "-o $outPlot"; 157 157 my $qstring = ($quiet)? "-q -o $outPlot" : "-o $outPlot";
my $iString = ($interactive)? "-o $outPlot --show" : ""; 158 158 my $iString = ($interactive)? "-o $outPlot --show" : "";
159 159
my $cmd = qq(quod.py $qstring $iString -l "$accession ($coords)" --xticks 25 --grid $regions -- $sequence); 160 160 my $cmd = qq(quod.py $qstring $iString -l "$accession ($coords)" --xticks 25 --grid $regions -- $sequence);
print "$cmd\n"; 161 161 print "$cmd\n";
system $cmd; 162 162 system $cmd;
} 163 163 }
164 164
165 165
166 166
167 167
sub getSequences { 168 168 sub getSequences {
169 169
my ($frag, $acc) = @_; 170 170 my ($frag, $acc) = @_;
171 171
172
173 # print Data::Dumper->Dump([$frag, $acc, $accFile ], [qw(*frag *acc *accFile)]);
174 # exit;
175
176
#Sequence for full protein 172 177 #Sequence for full protein
my $accSeq = "$outdir/${acc}.faa"; 173 178 my $accSeq = (-f $accFile)? $accFile : "$outdir/${acc}.faa";
174 179
175 180
#Save fragment to file 176 181 #Save fragment to file
my $tmpFile = "$outdir/${acc}_frag.faa"; 177 182 my $tmpFile = "$outdir/${acc}_frag.faa";
178 183
179 184
if (-f $frag) { 180 185 if (-f $frag) {
system "mv $frag $tmpFile" unless (-f $tmpFile); 181 186 system "mv $frag $tmpFile" unless (-f $tmpFile);
} 182 187 }
else { 183 188 else {
unless (-f $tmpFile) { 184 189 unless (-f $tmpFile) {
open (my $fh, ">", $tmpFile) || die $!; 185 190 open (my $fh, ">", $tmpFile) || die $!;
print $fh ">${acc} fragment\n$frag\n"; 186 191 print $fh ">${acc} fragment\n$frag\n";
close $fh; 187 192 close $fh;
} 188 193 }
} 189 194 }
190 195
unless (-f $accSeq) { 191 196 unless (-f $accSeq) {
#Blast DB to be used 192 197 #Blast DB to be used
my $db = ($blastdb)? $blastdb : 'nr'; 193 198 my $db = ($blastdb)? $blastdb : 'nr';
194 199
my $cmd = qq(blastdbcmd -db $db -entry $acc -target_only -outfmt '\%f' -out $accSeq); 195 200 my $cmd = qq(blastdbcmd -db $db -entry $acc -target_only -outfmt '\%f' -out $accSeq);
system $cmd; 196 201 system $cmd;
197 202
#Remove the version and annotations from the sequence file 198 203 #Remove the version and annotations from the sequence file
my $cmd2 = qq(perl -i.bkp -pe 's/^\\>(\\w+)\.\*/\\>\$1/;' $accSeq); 199 204 my $cmd2 = qq(perl -i.bkp -pe 's/^\\>(\\w+)\.\*/\\>\$1/;' $accSeq);
system $cmd2 unless (-f "${accSeq}.pkp"); 200 205 system $cmd2 unless (-f "${accSeq}.pkp");
} 201 206 }
202 207
#Return files to be aligned 203 208 #Return files to be aligned
return [$tmpFile, $accSeq]; 204 209 return [$tmpFile, $accSeq];
} 205 210 }
206 211
207 212
208 213
209 214
#=========================================================================== 210 215 #===========================================================================
#Read command line and print help 211 216 #Read command line and print help
212 217
213 218
sub read_command_line { 214 219 sub read_command_line {
215 220
print_help() unless (@ARGV); 216 221 print_help() unless (@ARGV);
217 222
my $status = GetOptions( 218 223 my $status = GetOptions(
"i|acc-file=s" => \&read_accFile, 219 224 "i|acc-file=s" => \&read_accFile,
"a|accession=s" => \&read_accession, 220 225 "a|accession=s" => \&read_accession,
"f|fragment=s" => \&read_fragment, 221 226 "f|fragment=s" => \&read_fragment,
"o|outdir=s" => \$outdir, 222 227 "o|outdir=s" => \$outdir,
"bdb|blastdb=s" => \&read_blastdb, 223 228 "bdb|blastdb=s" => \&read_blastdb,
"e|evalue=f" => \$evalue, 224 229 "e|evalue=f" => \$evalue,
"m|sub-matrix=s" => \&read_subMatrix, 225 230 "m|sub-matrix=s" => \&read_subMatrix,
"t|interactive!" => \$interactive, 226 231 "t|interactive!" => \$interactive,
"q|quiet!" => \$quiet, 227 232 "q|quiet!" => \$quiet,
"h|help" => sub { print_help(); }, 228 233 "h|help" => sub { print_help(); },
"<>" => sub { die "Error: Unknown argument: $_[0]\n"; }); 229 234 "<>" => sub { die "Error: Unknown argument: $_[0]\n"; });
exit unless ($status); 230 235 exit unless ($status);
231 236
232 237
die "Error: option -f is mandatory." unless ($fragment); 233 238 die "Error: option -f is mandatory." unless ($fragment);
die "Error: options -i or -a are mandatory." unless ($accession || $accFile); 234 239 die "Error: options -i or -a are mandatory." unless ($accession || $accFile);
die "Error: flags -t and -q cannot be set at the same time!" if ($quiet && $interactive); 235 240 die "Error: flags -t and -q cannot be set at the same time!" if ($quiet && $interactive);
236 241
#Default value for output directory 237 242 #Default value for output directory
$outdir = "." unless ($outdir); 238 243 $outdir = "." unless ($outdir);
system "mkdir -p $outdir" unless (-d $outdir); 239 244 system "mkdir -p $outdir" unless (-d $outdir);
245
246 #When accession is not given, take the first part of the input file name as accession
247 unless ($accession) {
248 my @pathComp = split(/\//, $accFile);
249 $accession = $pathComp[-1];
250 $accession =~ s/(\.faa|\.fasta)$//g;
251 }
252
} 240 253 }
241 254