Commit 538dec87bcf488605c35aa8fd2e872326adff65d

Authored by Luis Arturo Medrano-Soto
1 parent bbf0ef5765
Exists in master

updated scripts to run with new versions of quod and extractTCDB

Showing 3 changed files with 56 additions and 28 deletions Inline Diff

alignSeqsFiles.pl View file @ 538dec8
#!/usr/bin/env perl -w 1 1 #!/usr/bin/env perl -w
2 2
use strict; 3 3 use strict;
use warnings; 4 4 use warnings;
use Data::Dumper; 5 5 use Data::Dumper;
6 6
$Data::Dumper::Deepcopy = 1; 7 7 $Data::Dumper::Deepcopy = 1;
8 8
use Getopt::Long; 9 9 use Getopt::Long;
use LWP; 10 10 use LWP;
use Bio::SeqIO; 11 11 use Bio::SeqIO;
use Bio::SearchIO; 12 12 use Bio::SearchIO;
13 13
#Local libraries 14 14 #Local libraries
use TCDB::CheckDependencies; 15 15 use TCDB::CheckDependencies;
use TCDB::Domain::PfamParser; 16 16 use TCDB::Domain::PfamParser;
use TCDB::Domain::Characterize; 17 17 use TCDB::Domain::Characterize;
use TCDB::Assorted; 18 18 use TCDB::Assorted;
19 19
20 20
########################################################################### 21 21 ###########################################################################
# 22 22 #
# Comapre two files with fasta sequences and report the alignment parameters 23 23 # Comapre two files with fasta sequences and report the alignment parameters
# Along with hydropathy plots and PFAM domains. 24 24 # Along with hydropathy plots and PFAM domains.
# 25 25 #
########################################################################### 26 26 ###########################################################################
27 27
#========================================================================== 28 28 #==========================================================================
#Check dependencies 29 29 #Check dependencies
30 30
my @dependencies = ("zgrep", "blastp", "ssearch36", "hmmtop", "blastdbcmd", 31 31 my @dependencies = ("zgrep", "blastp", "ssearch36", "hmmtop", "blastdbcmd",
"hmmscan"); 32 32 "hmmscan");
my $CheckDep_obj = new TCDB::CheckDependencies(); 33 33 my $CheckDep_obj = new TCDB::CheckDependencies();
$CheckDep_obj -> dependencies_list(\@dependencies); 34 34 $CheckDep_obj -> dependencies_list(\@dependencies);
$CheckDep_obj -> checkDependencies; 35 35 $CheckDep_obj -> checkDependencies;
36 36
37 37
#This will prevent quod and alnquod from going into interactive mode 38 38 #This will prevent quod and alnquod from going into interactive mode
$ENV{"MPLBACKEND"} = "agg"; 39 39 $ENV{"MPLBACKEND"} = "agg";
40 40
41 41
#========================================================================== 42 42 #==========================================================================
#Read command line arguments 43 43 #Read command line arguments
44 44
my $qfile = ""; 45 45 my $qfile = "";
my $qProt = ""; 46 46 my $qProt = "";
my $sfile = ""; 47 47 my $sfile = "";
my $sProt = ""; 48 48 my $sProt = "";
my $qlabel = "Query"; 49 49 my $qlabel = "Query";
my $slabel = "Subject"; 50 50 my $slabel = "Subject";
my $outdir = ""; 51 51 my $outdir = "";
my $prog = 'ssearch36'; #'blastp'; 52 52 my $prog = 'ssearch36'; #'blastp';
my $evalue = 1e-4; 53 53 my $evalue = 1e-4;
my $identity = 20.0; 54 54 my $identity = 20.0;
my $coverage = 40.0; 55 55 my $coverage = 40.0;
my $covControl = "X"; 56 56 my $covControl = "X";
my $blastComp = "F"; #2; 57 57 my $blastComp = "F"; #2;
my $segFilter = 'no'; 58 58 my $segFilter = 'no';
my $minLength = 30; #Min legnth of proteins to analyze (without gaps) 59 59 my $minLength = 30; #Min legnth of proteins to analyze (without gaps)
my $subMatrix = 'BL50'; 60 60 my $subMatrix = 'BL50';
my $hyd_qylim = undef; #Y-axis limits for query hydropathy plot [low, high] 61 61 my $hyd_qylim = undef; #Y-axis limits for query hydropathy plot [low, high]
my $hyd_sylim = undef; #Y-axis limits for subject hydropathy plot [low, high] 62 62 my $hyd_sylim = undef; #Y-axis limits for subject hydropathy plot [low, high]
63 63
#this can be used to remove long sequences from results 64 64 #this can be used to remove long sequences from results
my $maxProtLength = 100000; #default threshold to allow any length 65 65 my $maxProtLength = 100000; #default threshold to allow any length
my $LengthControl = "N"; #same meaning as $covControl 66 66 my $LengthControl = "N"; #same meaning as $covControl
67 67
#internal directories 68 68 #internal directories
my $filesDir = ""; 69 69 my $filesDir = "";
my $plotsDir = ""; 70 70 my $plotsDir = "";
my $seqDir = ""; 71 71 my $seqDir = "";
my $blastDir = ""; 72 72 my $blastDir = "";
73 73
74 74
read_command_line(); 75 75 read_command_line();
76 76
#print Data::Dumper->Dump([$qfile, $qProt, $sfile, $sProt, $qlabel, $slabel, $outdir, $prog, 77 77 #print Data::Dumper->Dump([$qfile, $qProt, $sfile, $sProt, $qlabel, $slabel, $outdir, $prog,
# $evalue, $coverage, $covControl, $blastComp, $segFilter, $maxProtLength, 78 78 # $evalue, $coverage, $covControl, $blastComp, $segFilter, $maxProtLength,
# $LengthControl], 79 79 # $LengthControl],
# [qw(*qfile *qProt *sfile *sProt *qlabel *slabel *outdir *prog 80 80 # [qw(*qfile *qProt *sfile *sProt *qlabel *slabel *outdir *prog
# *evalue *coverage *covControl *blastComp *segFilter *maxProtLength 81 81 # *evalue *coverage *covControl *blastComp *segFilter *maxProtLength
# *LengthControl)]); 82 82 # *LengthControl)]);
#exit; 83 83 #exit;
84 84
85 85
86 86
#========================================================================== 87 87 #==========================================================================
#Output files 88 88 #Output files
89 89
#The alignment file by blastp or ssearch36 90 90 #The alignment file by blastp or ssearch36
my $alnFile = "$filesDir/${prog}.out"; 91 91 my $alnFile = "$filesDir/${prog}.out";
92 92
#The results of running hmmscan 93 93 #The results of running hmmscan
my $pfamFile = "$filesDir/hmmscan.out"; 94 94 my $pfamFile = "$filesDir/hmmscan.out";
95 95
#The results from running hmmtop 96 96 #The results from running hmmtop
my $hmmtopFile = "$filesDir/hmmtop.out"; 97 97 my $hmmtopFile = "$filesDir/hmmtop.out";
98 98
#The blast database to retrieve sequences for ploting 99 99 #The blast database to retrieve sequences for ploting
my $blastdb = "$blastDir/sequences"; 100 100 my $blastdb = "$blastDir/sequences";
101 101
102 102
103 103
#========================================================================== 104 104 #==========================================================================
#Run the alignment first 105 105 #Run the alignment first
106 106
print "Running $prog and parsing output....\n"; 107 107 print "Running $prog and parsing output....\n";
run_alignment(); 108 108 run_alignment();
109 109
110 110
my @alnHits = (); 111 111 my @alnHits = ();
if ($prog eq 'blastp') { parse_blast(\@alnHits); } 112 112 if ($prog eq 'blastp') { parse_blast(\@alnHits); }
elsif ($prog eq 'ssearch36') { parse_ssearch(\@alnHits)} 113 113 elsif ($prog eq 'ssearch36') { parse_ssearch(\@alnHits)}
114 114
#print Data::Dumper->Dump([\@alnHits ], [qw($alnHits )]); 115 115 #print Data::Dumper->Dump([\@alnHits ], [qw($alnHits )]);
#exit; 116 116 #exit;
117 117
118 118
die "No significant blastHits found!\n" unless (@alnHits); 119 119 die "No significant blastHits found!\n" unless (@alnHits);
120 120
121 121
122 122
#========================================================================== 123 123 #==========================================================================
#Run pfam (get clans, hmmtop, and parse results 124 124 #Run pfam (get clans, hmmtop, and parse results
125 125
my %pfamHits = (); 126 126 my %pfamHits = ();
my %clans = (); 127 127 my %clans = ();
my %hmmtopHits = (); 128 128 my %hmmtopHits = ();
run_pfam_hmmtop(\%pfamHits, \%hmmtopHits,\%clans ); 129 129 run_pfam_hmmtop(\%pfamHits, \%hmmtopHits,\%clans );
130 130
#print Data::Dumper->Dump([\%clans], [qw(*clans)]); 131 131 #print Data::Dumper->Dump([\%clans], [qw(*clans)]);
#exit; 132 132 #exit;
133 133
134 134
#========================================================================== 135 135 #==========================================================================
#Parse the alignment results to make sure there are signficant results, 136 136 #Parse the alignment results to make sure there are signficant results,
#get domains for significant hits and plot the corresponding hydropathies. 137 137 #get domains for significant hits and plot the corresponding hydropathies.
138 138
139 139
140 140
141 141
print "Geneating report...\n"; 142 142 print "Geneating report...\n";
generate_report(); 143 143 generate_report();
144 144
145 145
146 146
#========================================================================== 147 147 #==========================================================================
################ Subroutines definition beyond ths point ############## 148 148 ################ Subroutines definition beyond ths point ##############
#========================================================================== 149 149 #==========================================================================
150 150
151 151
#========================================================================== 152 152 #==========================================================================
#Generate output for significant hits 153 153 #Generate output for significant hits
154 154
sub generate_report { 155 155 sub generate_report {
156 156
157 157
#Prepare output files 158 158 #Prepare output files
my $htmlFile = "$outdir/report.html"; 159 159 my $htmlFile = "$outdir/report.html";
my $tsvFile = "$outdir/report.tsv"; 160 160 my $tsvFile = "$outdir/report.tsv";
my $plotsFile = "$outdir/plots.html"; 161 161 my $plotsFile = "$outdir/plots.html";
162 162
my $tsvHeader = "#Query\tSubject\tQlen\tSlen\tE-value\tIdentity\tQstart\tQend\tQcov\tSstart\tSend\tScov\tQseq\tSseq\n"; 163 163 my $tsvHeader = "#Query\tSubject\tQlen\tSlen\tE-value\tIdentity\tQstart\tQend\tQcov\tSstart\tSend\tScov\tQseq\tSseq\n";
164 164
my $htmlHeader = <<HEADER; 165 165 my $htmlHeader = <<HEADER;
<!DOCTYPE html> 166 166 <!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml"> 167 167 <html xmlns="http://www.w3.org/1999/xhtml">
<head> 168 168 <head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> 169 169 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
170 170
<style type="text/css"> 171 171 <style type="text/css">
172 172
.label { 173 173 .label {
text-align: right; 174 174 text-align: right;
width: 50px; 175 175 width: 50px;
} 176 176 }
177 177
.data { 178 178 .data {
text-align: left; 179 179 text-align: left;
padding-left: 8px; 180 180 padding-left: 8px;
width: 100px; 181 181 width: 100px;
} 182 182 }
183 183
.uline { 184 184 .uline {
text-decoration: underline; 185 185 text-decoration: underline;
} 186 186 }
187 187
.pfam { 188 188 .pfam {
text-align: center; 189 189 text-align: center;
vertical-align: middle; 190 190 vertical-align: middle;
} 191 191 }
192 192
.seq { 193 193 .seq {
border: 2px solid black; 194 194 border: 2px solid black;
height: 70px; 195 195 height: 70px;
width: 100%; 196 196 width: 100%;
overflow-x: auto; 197 197 overflow-x: auto;
overflow-y: hidden; 198 198 overflow-y: hidden;
margin: 1em 0; 199 199 margin: 1em 0;
background: gray; 200 200 background: gray;
color: white; 201 201 color: white;
} 202 202 }
203 203
204 204
.dom { 205 205 .dom {
border: 2px solid black; 206 206 border: 2px solid black;
height: 100px; 207 207 height: 100px;
width: 100%; 208 208 width: 100%;
overflow-x: auto; 209 209 overflow-x: auto;
overflow-y: auto; 210 210 overflow-y: auto;
margin: 1em 0; 211 211 margin: 1em 0;
background: gray; 212 212 background: gray;
color: white; 213 213 color: white;
} 214 214 }
215 215
img { 216 216 img {
display: block; 217 217 display: block;
margin-left: auto; 218 218 margin-left: auto;
margin-right: auto; 219 219 margin-right: auto;
height: 250px; 220 220 height: 250px;
width: auto; 221 221 width: auto;
max-width: 1500px; 222 222 max-width: 1500px;
max-height: 300px; 223 223 max-height: 300px;
} 224 224 }
225 225
</style> 226 226 </style>
<title>$qlabel vs $slabel</title> 227 227 <title>$qlabel vs $slabel</title>
</head> 228 228 </head>
<br /> 229 229 <br />
<h1 style='text-align:center'>$qlabel vs $slabel</h1> 230 230 <h1 style='text-align:center'>$qlabel vs $slabel</h1>
<body> 231 231 <body>
232 232
HEADER 233 233 HEADER
234 234
235 235
open (my $tsvh, ">", $tsvFile) || die $!; 236 236 open (my $tsvh, ">", $tsvFile) || die $!;
print $tsvh $tsvHeader; 237 237 print $tsvh $tsvHeader;
238 238
open (my $outh, ">", $htmlFile) || die $!; 239 239 open (my $outh, ">", $htmlFile) || die $!;
print $outh $htmlHeader; 240 240 print $outh $htmlHeader;
241 241
242 242
foreach my $hit (sort by_evalue @alnHits) { 243 243 foreach my $hit (sort by_evalue @alnHits) {
244 244
245 245
my $qacc = $hit->{qacc}; 246 246 my $qacc = $hit->{qacc};
my $qlen = $hit->{qlen}; 247 247 my $qlen = $hit->{qlen};
my $qseq = $hit->{qseq}; 248 248 my $qseq = $hit->{qseq};
my $qcov = sprintf("%.1f", $hit->{qcov}); 249 249 my $qcov = sprintf("%.1f", $hit->{qcov});
my $qstart = $hit->{qstart}; 250 250 my $qstart = $hit->{qstart};
my $qend = $hit->{qend}; 251 251 my $qend = $hit->{qend};
252 252
my $sacc = $hit->{sacc}; 253 253 my $sacc = $hit->{sacc};
my $slen = $hit->{slen}; 254 254 my $slen = $hit->{slen};
my $sseq = $hit->{sseq}; 255 255 my $sseq = $hit->{sseq};
my $scov = sprintf("%.1f", $hit->{scov}); 256 256 my $scov = sprintf("%.1f", $hit->{scov});
my $sstart = $hit->{sstart}; 257 257 my $sstart = $hit->{sstart};
my $send = $hit->{send}; 258 258 my $send = $hit->{send};
259 259
my $eval = sprintf ("%.1e", $hit->{evalue}); 260 260 my $eval = sprintf ("%.1e", $hit->{evalue});
my $id = sprintf ("%.1f", $hit->{id}); 261 261 my $id = sprintf ("%.1f", $hit->{id});
my $hstr = $hit->{hstr}; 262 262 my $hstr = $hit->{hstr};
263 263
my $alnHit = <<HIT; 264 264 my $alnHit = <<HIT;
265 265
<br /><hr style=\"border-style:solid; border-width:5px; color:black;\"/> 266 266 <br /><hr style=\"border-style:solid; border-width:5px; color:black;\"/>
267 267
<p><b>$qacc ($qlen) vs $sacc ($slen)</b></p> 268 268 <p><b>$qacc ($qlen) vs $sacc ($slen)</b></p>
269 269
<table width="600px" border="0" cellspacing="0" cellpadding="2"> 270 270 <table width="600px" border="0" cellspacing="0" cellpadding="2">
<tr> 271 271 <tr>
<td class='label'><b>E-value:</b></td> 272 272 <td class='label'><b>E-value:</b></td>
<td class='data'>$eval</td> 273 273 <td class='data'>$eval</td>
<td class='label'><b>Identity:</b></td> 274 274 <td class='label'><b>Identity:</b></td>
<td class='data'>${id}%</td> 275 275 <td class='data'>${id}%</td>
<td class='label'><b>Q_coverage:</b></td> 276 276 <td class='label'><b>Q_coverage:</b></td>
<td class='data'>${qcov}%</td> 277 277 <td class='data'>${qcov}%</td>
<td class='label'><b>S_coverage:</b></td> 278 278 <td class='label'><b>S_coverage:</b></td>
<td class='data'>${scov}%</td> 279 279 <td class='data'>${scov}%</td>
</tr> 280 280 </tr>
<tr> 281 281 <tr>
<td class='label'><b>Q_aln:</b></td> 282 282 <td class='label'><b>Q_aln:</b></td>
<td class='data'>${qstart}-$qend</td> 283 283 <td class='data'>${qstart}-$qend</td>
<td class='label'><b>S_aln:</b></td> 284 284 <td class='label'><b>S_aln:</b></td>
<td class='data'>${sstart}-$send</td> 285 285 <td class='data'>${sstart}-$send</td>
<td class='label'></td> 286 286 <td class='label'></td>
<td class='data'></td> 287 287 <td class='data'></td>
<td class='label'></td> 288 288 <td class='label'></td>
<td class='data'></td> 289 289 <td class='data'></td>
</tr> 290 290 </tr>
</table> 291 291 </table>
<br /> 292 292 <br />
293 293
<p><b>Alignment:</b></p> 294 294 <p><b>Alignment:</b></p>
<div class='seq'> 295 295 <div class='seq'>
<pre> 296 296 <pre>
$qseq 297 297 $qseq
$hstr 298 298 $hstr
$sseq 299 299 $sseq
</pre> 300 300 </pre>
</div> 301 301 </div>
302 302
HIT 303 303 HIT
304 304
print $outh $alnHit; 305 305 print $outh $alnHit;
306 306
print $tsvh "$qacc\t$sacc\t$qlen\t$slen\t$eval\t$id\t$qstart\t$qend\t$qcov\t$sstart\t$send\t$scov\t$qseq\t$sseq\n"; 307 307 print $tsvh "$qacc\t$sacc\t$qlen\t$slen\t$eval\t$id\t$qstart\t$qend\t$qcov\t$sstart\t$send\t$scov\t$qseq\t$sseq\n";
308 308
#Generate the hydropathy plots 309 309 #Generate the hydropathy plots
my $good = run_quod($qacc, $sacc, $qstart, $qend, $sstart, $send, $qseq, $sseq); 310 310 my $good = run_quod($qacc, $sacc, $qstart, $qend, $sstart, $send, $qseq, $sseq);
die "Could not generate plots for hit: $qacc vs $sacc" unless ($good); 311 311 die "Could not generate plots for hit: $qacc vs $sacc" unless ($good);
312 312
my $domData = generate_domain_data($qacc, $sacc); 313 313 my $domData = generate_domain_data($qacc, $sacc);
my $domHTML = ""; 314 314 my $domHTML = "";
if ($domData) { 315 315 if ($domData) {
$domHTML =<<DATA; 316 316 $domHTML =<<DATA;
<br /> 317 317 <br />
<hr /> 318 318 <hr />
<p><b>Pfam info:</b></p> 319 319 <p><b>Pfam info:</b></p>
<div class='dom'> 320 320 <div class='dom'>
321 321
$domData 322 322 $domData
323 323
</div> 324 324 </div>
DATA 325 325 DATA
} 326 326 }
327 327
my $plot_aln = "plots/${qacc}_vs_${sacc}_qs${qstart}_qe${qend}_ss${sstart}_se${send}.png"; 328 328 my $plot_aln = "plots/${qacc}_vs_${sacc}_qs${qstart}_qe${qend}_ss${sstart}_se${send}.png";
my $qplot = "plots/${qacc}_vs_${sacc}_qaln_qs${qstart}_qe${qend}.png"; 329 329 my $qplot = "plots/${qacc}_vs_${sacc}_qaln_qs${qstart}_qe${qend}.png";
my $splot = "plots/${qacc}_vs_${sacc}_saln_ss${sstart}_se${send}.png"; 330 330 my $splot = "plots/${qacc}_vs_${sacc}_saln_ss${sstart}_se${send}.png";
331 331
#now include the plots 332 332 #now include the plots
my $prtPlots =<<PLOTS; 333 333 my $prtPlots =<<PLOTS;
334 334
<br /> 335 335 <br />
<table style="width:100%"> 336 336 <table style="width:100%">
<tr> 337 337 <tr>
<td><a href="$qplot" target="_blank"><img src="$qplot" alt="$qacc"></a></td> 338 338 <td><a href="$qplot" target="_blank"><img src="$qplot" alt="$qacc"></a></td>
<td><a href="$splot" target="_blank"><img src="$splot" alt="$sacc"></a></td> 339 339 <td><a href="$splot" target="_blank"><img src="$splot" alt="$sacc"></a></td>
</tr> 340 340 </tr>
<tr> 341 341 <tr>
<td colspan="2" style="text-align: center;"> 342 342 <td colspan="2" style="text-align: center;">
<a href="$plot_aln" target="_blank"><img src="$plot_aln" alt="$qacc vs $sacc alignment"></a> 343 343 <a href="$plot_aln" target="_blank"><img src="$plot_aln" alt="$qacc vs $sacc alignment"></a>
</td> 344 344 </td>
</tr> 345 345 </tr>
</table> 346 346 </table>
347 347
$domHTML 348 348 $domHTML
349 349
PLOTS 350 350 PLOTS
351 351
print $outh $prtPlots; 352 352 print $outh $prtPlots;
} 353 353 }
354 354
#Close HTML report 355 355 #Close HTML report
my $closeRep = <<CLOSE; 356 356 my $closeRep = <<CLOSE;
</body> 357 357 </body>
</html> 358 358 </html>
CLOSE 359 359 CLOSE
360 360
print $outh $closeRep; 361 361 print $outh $closeRep;
362 362
close $outh; 363 363 close $outh;
close $tsvh; 364 364 close $tsvh;
} 365 365 }
366 366
367 367
#========================================================================== 368 368 #==========================================================================
#Generate domain data for the html report 369 369 #Generate domain data for the html report
370 370
371 371
sub generate_domain_data { 372 372 sub generate_domain_data {
373 373
my ($q, $s) = @_; 374 374 my ($q, $s) = @_;
375 375
#Format of PFAM hash: 376 376 #Format of PFAM hash:
# push (@{ $out->{$qacc}->{$pfamID} }, 377 377 # push (@{ $out->{$qacc}->{$pfamID} },
# {pfamid=> $pfamID, dlen=>$pfamLen, dstart=>$dstart, dend=>$dend, evalue=>$eval, 378 378 # {pfamid=> $pfamID, dlen=>$pfamLen, dstart=>$dstart, dend=>$dend, evalue=>$eval,
# dname=>$pfamName, def=>$def, qlen=>$qlen, qstart=>$qstart, qend=>$qend }); 379 379 # dname=>$pfamName, def=>$def, qlen=>$qlen, qstart=>$qstart, qend=>$qend });
380 380
381 381
382 382
383 383
my @cols = qw(Query Domain Clan Dom_length E-value Dom_start Dom_end Q_Start Q_end Dom_Name Dom_Info); 384 384 my @cols = qw(Query Domain Clan Dom_length E-value Dom_start Dom_end Q_Start Q_end Dom_Name Dom_Info);
my $colStr = " <th>" . join ("</th>\n <th>", @cols) . 385 385 my $colStr = " <th>" . join ("</th>\n <th>", @cols) .
"</th>\n"; 386 386 "</th>\n";
387 387
my $header =<<HEADER; 388 388 my $header =<<HEADER;
<table border='1', style='width:100%'> 389 389 <table border='1', style='width:100%'>
<tr> 390 390 <tr>
$colStr 391 391 $colStr
</tr> 392 392 </tr>
HEADER 393 393 HEADER
394 394
395 395
my $res = ""; 396 396 my $res = "";
foreach my $prot ($q, $s) { 397 397 foreach my $prot ($q, $s) {
398 398
if (exists $pfamHits{$prot}) { 399 399 if (exists $pfamHits{$prot}) {
my @Doms = keys %{ $pfamHits{$prot} }; 400 400 my @Doms = keys %{ $pfamHits{$prot} };
401 401
foreach my $d (@Doms) { 402 402 foreach my $d (@Doms) {
403 403
my $clan = ($clans{$d})? $clans{$d} : "N/A"; 404 404 my $clan = ($clans{$d})? $clans{$d} : "N/A";
405 405
my @hits = @{ $pfamHits{$prot}{$d} }; 406 406 my @hits = @{ $pfamHits{$prot}{$d} };
foreach my $hit (@hits) { 407 407 foreach my $hit (@hits) {
my $dlen = $hit->{dlen}; 408 408 my $dlen = $hit->{dlen};
my $eval = $hit->{evalue}; 409 409 my $eval = $hit->{evalue};
my $qstart = $hit->{qstart}; 410 410 my $qstart = $hit->{qstart};
my $qend = $hit->{qend}; 411 411 my $qend = $hit->{qend};
my $dstart = $hit->{dstart}; 412 412 my $dstart = $hit->{dstart};
my $dend = $hit->{dend}; 413 413 my $dend = $hit->{dend};
my $name = $hit->{dname}; 414 414 my $name = $hit->{dname};
my $def = $hit->{def}; 415 415 my $def = $hit->{def};
416 416
$res .=<<ROW; 417 417 $res .=<<ROW;
<tr> 418 418 <tr>
<td class="pfam">$prot</td> 419 419 <td class="pfam">$prot</td>
<td class="pfam">$d</td> 420 420 <td class="pfam">$d</td>
<td class="pfam">$clan</td> 421 421 <td class="pfam">$clan</td>
<td class="pfam">$dlen</td> 422 422 <td class="pfam">$dlen</td>
<td class="pfam">$eval</td> 423 423 <td class="pfam">$eval</td>
<td class="pfam">$dstart</td> 424 424 <td class="pfam">$dstart</td>
<td class="pfam">$dend</td> 425 425 <td class="pfam">$dend</td>
<td class="pfam">$qstart</td> 426 426 <td class="pfam">$qstart</td>
<td class="pfam">$qend</td> 427 427 <td class="pfam">$qend</td>
<td class="pfam">$name</td> 428 428 <td class="pfam">$name</td>
<td class="pfam">$def</td> 429 429 <td class="pfam">$def</td>
</tr> 430 430 </tr>
ROW 431 431 ROW
} 432 432 }
} 433 433 }
} 434 434 }
} 435 435 }
436 436
#Return final result 437 437 #Return final result
if ($res) { 438 438 if ($res) {
$header .= $res; 439 439 $header .= $res;
$header .= " </table>\n"; 440 440 $header .= " </table>\n";
return $header; 441 441 return $header;
} 442 442 }
else { 443 443 else {
return $res; 444 444 return $res;
} 445 445 }
} 446 446 }
447 447
448 448
449 449
450 450
#========================================================================== 451 451 #==========================================================================
#Run quod on the query, subject and the alignment. 452 452 #Run quod on the query, subject and the alignment.
453 453
454 454
#quod.py -q -l "HEB99829" -o plot.png --width 15 --edgecolor red --xticks 25 --no-tms +0 --add-tms 9-32 43-67 98-121 132-151 164-181 192-215 224-241:orange -w 17-245:+2.7:+:Alignment --region-font 12 --add-region 20-245:'PF07556':-2.8,-2.6:red,black:tc --mark +0:K,R,H:black --xlim 0 400 -- HEB99829.faa 455 455 #quod.py -q -l "HEB99829" -o plot.png --width 15 --edgecolor red --xticks 25 --no-tms +0 --add-tms 9-32 43-67 98-121 132-151 164-181 192-215 224-241:orange -w 17-245:+2.7:+:Alignment --region-font 12 --add-region 20-245:'PF07556':-2.8,-2.6:red,black:tc --mark +0:K,R,H:black --xlim 0 400 -- HEB99829.faa
456 456
457 457
sub run_quod { 458 458 sub run_quod {
459 459
my ($q, $s, $qs, $qe, $ss, $se, $qseq, $sseq) = @_; 460 460 my ($q, $s, $qs, $qe, $ss, $se, $qseq, $sseq) = @_;
461 461
462 462
#extract sequences for query and subject 463 463 #extract sequences for query and subject
extract_full_sequences($q,$s); 464 464 extract_full_sequences($q,$s);
465 465
466 466
#----------------------------------------------------------------- 467 467 #-----------------------------------------------------------------
#Run quod for the alignment 468 468 #Run quod for the alignment
469 469
#First save aligned segments to files 470 470 #First save aligned segments to files
my $qalnFile ="$seqDir/${q}_aln.faa"; 471 471 my $qalnFile ="$seqDir/${q}_aln.faa";
open(my $qfh, '>', $qalnFile) || die $!; 472 472 open(my $qfh, '>', $qalnFile) || die $!;
print $qfh ">$q alignment\n$qseq\n"; 473 473 print $qfh ">$q alignment\n$qseq\n";
close $qfh; 474 474 close $qfh;
475 475
my $salnFile ="$seqDir/${s}_aln.faa"; 476 476 my $salnFile ="$seqDir/${s}_aln.faa";
open(my $sfh, '>', $salnFile) || die $!; 477 477 open(my $sfh, '>', $salnFile) || die $!;
print $sfh ">$s alignment\n$sseq\n"; 478 478 print $sfh ">$s alignment\n$sseq\n";
close $sfh; 479 479 close $sfh;
480 480
my $labelsStr = "--axis-font 18.0 --tick-font 15.0 --title-font 20.0 --xlabel Residues --ylabel Hydropathy --width 15 --xticks 25"; 481 481 my $labelsStr = "--axis-font 18.0 --tick-font 15.0 --title-font 20.0 --xlabel Residues --ylabel Hydropathy --width 15 --xticks 25";
my $ylimStr = "--ylim -3 3"; 482 482 my $ylimStr = "--ylim -3 3";
483 483
484 484
#Note alnquod requires to add the extension to the image name 485 485 #Plot alignment
my $alnFig = "$plotsDir/${q}_vs_${s}_qs${qs}_qe${qe}_ss${ss}_se${se}.png"; 486 486 my $alnFig = "$plotsDir/${q}_vs_${s}_qs${qs}_qe${qe}_ss${ss}_se${se}.png";
my $cmd1 = qq(quod.py -q $labelsStr -l "$q (red) and $s (blue)" -o $alnFig $ylimStr --edgecolor +0:red +1:blue --facecolor +0:orange +1:cyan --multi frag -- $qalnFile $seqDir/${q}.faa $salnFile $seqDir/${s}.faa); 487 487 my $cmd1 = qq(quod.py -q $labelsStr -l "$q (red) and $s (blue)" -o $alnFig $ylimStr --edgecolor +0:red +1:blue --facecolor +0:orange +1:cyan --multi frag -- $qalnFile $seqDir/${q}.faa $salnFile $seqDir/${s}.faa);
# print "$cmd1\n\n"; 488 488 # print "$cmd1\n\n";
# exit; 489 489 # exit;
system $cmd1 unless (-f "${alnFig}"); 490 490 system $cmd1 unless (-f "${alnFig}");
return undef unless (-f "${alnFig}"); 491 491 return undef unless (-f "${alnFig}");
492 492
493 493
#----------------------------------------------------------------- 494 494 #-----------------------------------------------------------------
#Run quod for the full sequencess of the query and subject proteins 495 495 #Run quod for the full sequencess of the query and subject proteins
496 496
497 497
#Extract TMS coordinates for query 498 498 #Extract TMS coordinates for query
die "Error: no hmmtop results for: $q" unless (exists $hmmtopHits{$q}); 499 499 die "Error: no hmmtop results for: $q" unless (exists $hmmtopHits{$q});
my $qTMS = ""; 500 500 my $qTMS = "";
if (scalar @{ $hmmtopHits{$q}{coords} } > 0) { 501 501 if (scalar @{ $hmmtopHits{$q}{coords} } > 0) {
$qTMS = "--add-tms " . join(",", @{ $hmmtopHits{$q}{coords} }) . ":orange"; 502 502 $qTMS = "--add-tms " . join(",", @{ $hmmtopHits{$q}{coords} }) . ":orange";
} 503 503 }
504 504
505 505
#Plot query hydropathy 506 506 #Plot query hydropathy
my $qPfam = get_pfam_coords_for_quod($q, "red"); 507 507 my $qPfam = get_pfam_coords_for_quod($q, "red");
my $qName = "$plotsDir/${q}_vs_${s}_qaln_qs${qs}_qe${qe}.png"; 508 508 my $qName = "$plotsDir/${q}_vs_${s}_qaln_qs${qs}_qe${qe}.png";
my $cmd2 = qq(quod.py -q $labelsStr -l "$q" -o $qName --edgecolor red $ylimStr -w ${qs}-${qe}:+2.7:+:Alignment --no-tms +0 $qTMS $qPfam -- $seqDir/${q}.faa); 509 509 my $cmd2 = qq(quod.py -q $labelsStr -l "$q" -o $qName --edgecolor red $ylimStr -w ${qs}-${qe}:+2.7:+:Alignment --no-tms +0 $qTMS $qPfam -- $seqDir/${q}.faa);
# print "$cmd2\n\n"; 510 510 # print "$cmd2\n\n";
# exit; 511 511 # exit;
system $cmd2 unless (-f $qName); 512 512 system $cmd2 unless (-f $qName);
return undef unless (-f $qName); 513 513 return undef unless (-f $qName);
514 514
515 515
516 516
#TMS coords for the subject 517 517 #TMS coords for the subject
die "Error: no hmmtop results for: $s" unless (exists $hmmtopHits{$s}); 518 518 die "Error: no hmmtop results for: $s" unless (exists $hmmtopHits{$s});
my $sTMS = ""; 519 519 my $sTMS = "";
if (scalar @{ $hmmtopHits{$s}{coords} } > 0) { 520 520 if (scalar @{ $hmmtopHits{$s}{coords} } > 0) {
$sTMS = "--add-tms " . join(",", @{ $hmmtopHits{$s}{coords} }) . ":cyan"; 521 521 $sTMS = "--add-tms " . join(",", @{ $hmmtopHits{$s}{coords} }) . ":cyan";
} 522 522 }
523 523
#Plot Subject hydropaty 524 524 #Plot Subject hydropaty
my $sPfam = get_pfam_coords_for_quod($s, "blue"); 525 525 my $sPfam = get_pfam_coords_for_quod($s, "blue");
my $sName = "$plotsDir/${q}_vs_${s}_saln_ss${ss}_se${se}.png"; 526 526 my $sName = "$plotsDir/${q}_vs_${s}_saln_ss${ss}_se${se}.png";
my $cmd3 = qq(quod.py -q $labelsStr -l "$s" -o $sName --edgecolor blue $ylimStr -w ${ss}-${se}:+2.7:+:Alignment --no-tms +0 $sTMS $sPfam -- $seqDir/${s}.faa); 527 527 my $cmd3 = qq(quod.py -q $labelsStr -l "$s" -o $sName --edgecolor blue $ylimStr -w ${ss}-${se}:+2.7:+:Alignment --no-tms +0 $sTMS $sPfam -- $seqDir/${s}.faa);
# print "$cmd3\n\n"; 528 528 # print "$cmd3\n\n";
# exit; 529 529 # exit;
system $cmd3 unless (-f $sName); 530 530 system $cmd3 unless (-f $sName);
return undef unless (-f $sName); 531 531 return undef unless (-f $sName);
532 532
533 533
return 1; 534 534 return 1;
} 535 535 }
536 536
#========================================================================== 537 537 #==========================================================================
#Get the string for quod that will plot the PFAM domains 538 538 #Get the string for quod that will plot the PFAM domains
539 539
sub get_pfam_coords_for_quod { 540 540 sub get_pfam_coords_for_quod {
541 541
my ($prot, $color) = @_; 542 542 my ($prot, $color) = @_;
543 543
#Format of PFAM hash: 544 544 #Format of PFAM hash:
# push (@{ $out->{$qacc}->{$pfamID} }, 545 545 # push (@{ $out->{$qacc}->{$pfamID} },
# {pfamid=> $pfamID, dlen=>$pfamLen, dstart=>$dstart, dend=>$dend, 546 546 # {pfamid=> $pfamID, dlen=>$pfamLen, dstart=>$dstart, dend=>$dend,
# def=>$def, qlen=>$qlen, qstart=>$qstart, qend=>$qend }); 547 547 # def=>$def, qlen=>$qlen, qstart=>$qstart, qend=>$qend });
548 548
549 549
550 550
my $str = ""; 551 551 my $str = "";
552 552
if (exists $pfamHits{$prot}) { 553 553 if (exists $pfamHits{$prot}) {
my @Doms = keys %{ $pfamHits{$prot} }; 554 554 my @Doms = keys %{ $pfamHits{$prot} };
my $dcnt = 0; 555 555 my $dcnt = 0;
$str = "--region-font 12 --add-region "; 556 556 $str = "--region-font 12 --add-region ";
foreach my $d (@Doms) { 557 557 foreach my $d (@Doms) {
558 558
my @hits = @{ $pfamHits{$prot}{$d} }; 559 559 my @hits = @{ $pfamHits{$prot}{$d} };
foreach my $hit (@hits) { 560 560 foreach my $hit (@hits) {
my $left = $hit->{qstart}; 561 561 my $left = $hit->{qstart};
my $right = $hit->{qend}; 562 562 my $right = $hit->{qend};
563 563
my $yposl = -2.8 + $dcnt * 0.4; #domain bottom coord 564 564 my $yposl = -2.8 + $dcnt * 0.4; #domain bottom coord
my $yposh = $yposl + 0.15; #domain height coord 565 565 my $yposh = $yposl + 0.15; #domain height coord
566 566
$str .= "${left}-${right}:'${d}':${yposl},${yposh}:$color,black:tc "; 567 567 $str .= "${left}-${right}:'${d}':${yposl},${yposh}:$color,black:tc ";
$dcnt++; 568 568 $dcnt++;
} 569 569 }
} 570 570 }
} 571 571 }
572 572
return $str; 573 573 return $str;
574 574
} 575 575 }
576 576
577 577
#========================================================================== 578 578 #==========================================================================
#Extract the full sequences of the query and subject proteins 579 579 #Extract the full sequences of the query and subject proteins
#Examples: AKM80767.1 580 580 #Examples: AKM80767.1
581 581
582 582
583 583
sub extract_full_sequences { 584 584 sub extract_full_sequences {
585 585
my ($q, $s) = @_; 586 586 my ($q, $s) = @_;
587 587
588 588
my $q_seq = "$seqDir/${q}.faa"; 589 589 my $q_seq = "$seqDir/${q}.faa";
my $s_seq = "$seqDir/${s}.faa"; 590 590 my $s_seq = "$seqDir/${s}.faa";
591 591
#extract the query secuence from tcdb and the subject from the custom blastdb 592 592 #extract the query secuence from tcdb and the subject from the custom blastdb
my $cmd1 = qq(blastdbcmd -db $blastdb -entry $q -target_only -out $q_seq); 593 593 my $cmd1 = qq(blastdbcmd -db $blastdb -entry $q -target_only -out $q_seq);
system "$cmd1" unless (-f $q_seq && !(-z $q_seq)); 594 594 system "$cmd1" unless (-f $q_seq && !(-z $q_seq));
die "Could not extract sequence for $q" unless (-f $q_seq && !(-z $q_seq)); 595 595 die "Could not extract sequence for $q" unless (-f $q_seq && !(-z $q_seq));
596 596
597 597
my $cmd2 = qq(blastdbcmd -db $blastdb -entry $s -target_only -out $s_seq); 598 598 my $cmd2 = qq(blastdbcmd -db $blastdb -entry $s -target_only -out $s_seq);
system "$cmd2" unless (-f $s_seq && !(-z $s_seq)); 599 599 system "$cmd2" unless (-f $s_seq && !(-z $s_seq));
die "Could not extract sequence for $s" unless (-f $s_seq && !(-z $s_seq)); 600 600 die "Could not extract sequence for $s" unless (-f $s_seq && !(-z $s_seq));
} 601 601 }
602 602
603 603
604 604
605 605
606 606
607 607
608 608
#========================================================================== 609 609 #==========================================================================
#Sort alignmnet results by E-value 610 610 #Sort alignmnet results by E-value
611 611
sub by_evalue { 612 612 sub by_evalue {
$a->{evalue} <=> $b->{evalue}; 613 613 $a->{evalue} <=> $b->{evalue};
} 614 614 }
615 615
616 616
617 617
#========================================================================== 618 618 #==========================================================================
#Run PFAM, hmmtop and parse results 619 619 #Run PFAM, hmmtop and parse results
620 620
621 621
sub run_pfam_hmmtop { 622 622 sub run_pfam_hmmtop {
my ($pfamOut, $hmmtopOut, $pfamClans) = @_; 623 623 my ($pfamOut, $hmmtopOut, $pfamClans) = @_;
624 624
625 625
#---------------------------------------------------------------------- 626 626 #----------------------------------------------------------------------
#Generate blast DB for easy sequence retrieval 627 627 #Generate blast DB for easy sequence retrieval
628 628
print "Generate Blast DB with sequences for fast sequence retrieval...\n"; 629 629 print "Generate Blast DB with sequences for fast sequence retrieval...\n";
630 630
#Get the sequences for which hmmscan will run 631 631 #Get the sequences for which hmmscan will run
my $allSeqsFile = "$seqDir/all_seqs.faa"; 632 632 my $allSeqsFile = "$seqDir/all_seqs.faa";
system qq(cat $qfile $sfile > $allSeqsFile) unless (-f $allSeqsFile && !(-z $allSeqsFile)); 633 633 system qq(cat $qfile $sfile > $allSeqsFile) unless (-f $allSeqsFile && !(-z $allSeqsFile));
die "Could not generate file: $allSeqsFile" unless (-f $allSeqsFile && !(-z $allSeqsFile)); 634 634 die "Could not generate file: $allSeqsFile" unless (-f $allSeqsFile && !(-z $allSeqsFile));
635 635
636 636
#Generate blastdb ...assuming there are no duplicate sequences. 637 637 #Generate blastdb ...assuming there are no duplicate sequences.
my $cmd1 = qq(makeblastdb -dbtype prot -in $allSeqsFile -title '$qlabel plus $slabel' -parse_seqids -hash_index -out $blastdb); 638 638 my $cmd1 = qq(makeblastdb -dbtype prot -in $allSeqsFile -title '$qlabel plus $slabel' -parse_seqids -hash_index -out $blastdb);
print "$cmd1\n"; 639 639 print "$cmd1\n";
system $cmd1 unless (-f "${blastdb}.pin"); 640 640 system $cmd1 unless (-f "${blastdb}.pin");
system "rm $allSeqsFile" if (-f $allSeqsFile); 641 641 system "rm $allSeqsFile" if (-f $allSeqsFile);
642 642
643 643
#---------------------------------------------------------------------- 644 644 #----------------------------------------------------------------------
#Get the accessions of the top hits in the alignments 645 645 #Get the accessions of the top hits in the alignments
646 646
#get the accessions with significant hits 647 647 #get the accessions with significant hits
my %accList = (); 648 648 my %accList = ();
foreach my $hit (@alnHits) { 649 649 foreach my $hit (@alnHits) {
$accList{$hit->{qacc}} = 1; 650 650 $accList{$hit->{qacc}} = 1;
$accList{$hit->{sacc}} = 1; 651 651 $accList{$hit->{sacc}} = 1;
} 652 652 }
653 653
654 654
#Save accessions to a file 655 655 #Save accessions to a file
my $idFile = "$seqDir/top_hits_accs.txt"; 656 656 my $idFile = "$seqDir/top_hits_accs.txt";
unless (-f $idFile) { 657 657 unless (-f $idFile) {
open (my $afh, ">", $idFile) || die $!; 658 658 open (my $afh, ">", $idFile) || die $!;
print $afh join("\n", keys %accList), "\n"; 659 659 print $afh join("\n", keys %accList), "\n";
close $afh; 660 660 close $afh;
} 661 661 }
662 662
#---------------------------------------------------------------------- 663 663 #----------------------------------------------------------------------
#Extract full sequences for top hits. 664 664 #Extract full sequences for top hits.
665 665
my $topHitsSeqs = "$seqDir/topHits.faa"; 666 666 my $topHitsSeqs = "$seqDir/topHits.faa";
my $cmdTopHits = qq(blastdbcmd -db $blastdb -entry_batch $idFile -target_only -out $topHitsSeqs); 667 667 my $cmdTopHits = qq(blastdbcmd -db $blastdb -entry_batch $idFile -target_only -out $topHitsSeqs);
system $cmdTopHits unless (-f $topHitsSeqs && !(-z $topHitsSeqs)); 668 668 system $cmdTopHits unless (-f $topHitsSeqs && !(-z $topHitsSeqs));
669 669
670 670
#---------------------------------------------------------------------- 671 671 #----------------------------------------------------------------------
#run hmmscan on all the sequences for both files 672 672 #run hmmscan on all the sequences for both files
673 673
print "\nRunning hmmscan and parsing output....\n"; 674 674 print "\nRunning hmmscan and parsing output....\n";
675 675
my $pfamDB = ($ENV{PFAMDB})? $ENV{PFAMDB} : "$ENV{RESEARCH_DATA}/pfam/pfamdb/Pfam-A.hmm"; 676 676 my $pfamDB = ($ENV{PFAMDB})? $ENV{PFAMDB} : "$ENV{RESEARCH_DATA}/DB/domainDBs/xfamDB/Pfam-A.hmm";
my $cmd2 = qq(hmmscan --cpu 4 --noali --cut_ga -o /dev/null --domtblout $pfamFile $pfamDB $topHitsSeqs); 677 677 my $cmd2 = qq(hmmscan --cpu 4 --noali --cut_ga -o /dev/null --domtblout $pfamFile $pfamDB $topHitsSeqs);
system $cmd2 unless (-f $pfamFile && !(-z $pfamFile)); 678 678 system $cmd2 unless (-f $pfamFile && !(-z $pfamFile));
679 679
680 680
#parse Pfam output 681 681 #parse Pfam output
TCDB::Assorted::parse_pfam($pfamFile, $pfamOut, $pfamClans); 682 682 TCDB::Assorted::parse_pfam($pfamFile, $pfamOut, $pfamClans);
# print Data::Dumper->Dump([$pfamOut, $pfamClans ], [qw(*pfamOut *pfamClans )]); 683 683 # print Data::Dumper->Dump([$pfamOut, $pfamClans ], [qw(*pfamOut *pfamClans )]);
# exit; 684 684 # exit;
685 685
#---------------------------------------------------------------------- 686 686 #----------------------------------------------------------------------
#Extract clans 687 687 #Extract clans
688 688
TCDB::Assorted::get_clans($pfamClans, $filesDir); 689 689 TCDB::Assorted::get_clans($pfamClans, $filesDir);
# print Data::Dumper->Dump([$pfamClans ], [qw(*clans )]); 690 690 # print Data::Dumper->Dump([$pfamClans ], [qw(*clans )]);
# exit; 691 691 # exit;
692 692
#-------------------------------------------------------------------------- 693 693 #--------------------------------------------------------------------------
#Run hmmtop on top hits for later hydropathy plots. 694 694 #Run hmmtop on top hits for later hydropathy plots.
695 695
print "Runnign HMMTOP and parsing output...\n"; 696 696 print "Runnign HMMTOP and parsing output...\n";
697 697
my $cmd3 = qq(hmmtop -if=$topHitsSeqs -of=$hmmtopFile -sf=FAS -pi=spred -is=pseudo); 698 698 my $cmd3 = qq(hmmtop -if=$topHitsSeqs -of=$hmmtopFile -sf=FAS -pi=spred -is=pseudo);
system $cmd3 unless (-f $hmmtopFile); 699 699 system $cmd3 unless (-f $hmmtopFile);
system "rm $topHitsSeqs" if (-f $topHitsSeqs); 700 700 system "rm $topHitsSeqs" if (-f $topHitsSeqs);
701 701
#Parse hmmtop output 702 702 #Parse hmmtop output
TCDB::Assorted::parse_hmmtop($hmmtopOut, $hmmtopFile); 703 703 TCDB::Assorted::parse_hmmtop($hmmtopOut, $hmmtopFile);
704 704
} 705 705 }
706 706
707 707
#========================================================================== 708 708 #==========================================================================
#Parse ssearch36 output 709 709 #Parse ssearch36 output
710 710
sub parse_ssearch { 711 711 sub parse_ssearch {
712 712
my $out = shift; 713 713 my $out = shift;
714 714
my $parser = new Bio::SearchIO (-format => 'fasta', -file => $alnFile); 715 715 my $parser = new Bio::SearchIO (-format => 'fasta', -file => $alnFile);
716 716
my $formatTmp = $parser->format(); 717 717 my $formatTmp = $parser->format();
# print Data::Dumper->Dump([$formatTmp ], [qw(*fileFormat )]); 718 718 # print Data::Dumper->Dump([$formatTmp ], [qw(*fileFormat )]);
# exit; 719 719 # exit;
720 720
while (my $result = $parser->next_result) { 721 721 while (my $result = $parser->next_result) {
722 722
my $qacc = $result->query_name; 723 723 my $qacc = $result->query_name;
my $qlen = $result->query_length; 724 724 my $qlen = $result->query_length;
725 725
726 726
HIT:while (my $hit = $result->next_hit) { 727 727 HIT:while (my $hit = $result->next_hit) {
HSP:while(my $hsp = $hit->next_hsp) { 728 728 HSP:while(my $hsp = $hit->next_hsp) {
729 729
#Alignment parameters 730 730 #Alignment parameters
my $sacc = $hit->name; 731 731 my $sacc = $hit->name;
my $slen = $hit->length; 732 732 my $slen = $hit->length;
my $eval = $hsp->evalue; 733 733 my $eval = $hsp->evalue;
my $id = $hsp->frac_identical('total') * 100; 734 734 my $id = $hsp->frac_identical('total') * 100;
735 735
#coordinates and sequence 736 736 #coordinates and sequence
my $qstart = $hsp->start('query'); 737 737 my $qstart = $hsp->start('query');
my $qend = $hsp->end('query'); 738 738 my $qend = $hsp->end('query');
my $sstart = $hsp->start('subject'); 739 739 my $sstart = $hsp->start('subject');
my $send = $hsp->end('subject'); 740 740 my $send = $hsp->end('subject');
my $qseq = $hsp->query_string; 741 741 my $qseq = $hsp->query_string;
my $sseq = $hsp->hit_string; 742 742 my $sseq = $hsp->hit_string;
my $hstr = $hsp->homology_string; 743 743 my $hstr = $hsp->homology_string;
744 744
745 745
#Check first that both proteins have the right length 746 746 #Check first that both proteins have the right length
next HSP if (max_length_violation($qlen, $slen, $maxProtLength, $LengthControl)); 747 747 next HSP if (max_length_violation($qlen, $slen, $maxProtLength, $LengthControl));
748 748
#If the alignment has less than $minLength aas, ignore it 749 749 #If the alignment has less than $minLength aas, ignore it
my $qtmp = $qseq; $qtmp =~ s/-//g; 750 750 my $qtmp = $qseq; $qtmp =~ s/-//g;
my $stmp = $sseq; $stmp =~ s/-//g; 751 751 my $stmp = $sseq; $stmp =~ s/-//g;
next HSP if (length($qtmp) < $minLength || length($stmp) < $minLength); 752 752 next HSP if (length($qtmp) < $minLength || length($stmp) < $minLength);
753 753
#Calculate coverages properly (do not use alignment length as it includes gaps 754 754 #Calculate coverages properly (do not use alignment length as it includes gaps
my $qCov_tmp = ($qend - $qstart + 1) / $qlen * 100; 755 755 my $qCov_tmp = ($qend - $qstart + 1) / $qlen * 100;
my $qcov = ($qCov_tmp > 100.0)? 100 : $qCov_tmp; 756 756 my $qcov = ($qCov_tmp > 100.0)? 100 : $qCov_tmp;
757 757
my $sCov_tmp = ($send - $sstart + 1) / $slen * 100; 758 758 my $sCov_tmp = ($send - $sstart + 1) / $slen * 100;
my $scov = ($sCov_tmp > 100.0)? 100 : $sCov_tmp; 759 759 my $scov = ($sCov_tmp > 100.0)? 100 : $sCov_tmp;
760 760
761 761
if ($eval <= $evalue && TCDB::Assorted::coverage_ok($qcov, $scov, $coverage, $covControl)) { 762 762 if ($eval <= $evalue && TCDB::Assorted::coverage_ok($qcov, $scov, $coverage, $covControl)) {
763 763
push(@{ $out }, {qacc=>$qacc, sacc=>$sacc, qlen=>$qlen, slen=>$slen, qcov=>$qcov, 764 764 push(@{ $out }, {qacc=>$qacc, sacc=>$sacc, qlen=>$qlen, slen=>$slen, qcov=>$qcov,
scov=>$scov, evalue=>$eval, id=>$id, qstart=>$qstart, qend=>$qend, 765 765 scov=>$scov, evalue=>$eval, id=>$id, qstart=>$qstart, qend=>$qend,
sstart=>$sstart, send=>$send, qseq=>$qseq, sseq=>$sseq, hstr=>$hstr}); 766 766 sstart=>$sstart, send=>$send, qseq=>$qseq, sseq=>$sseq, hstr=>$hstr});
} 767 767 }
} # hsp 768 768 } # hsp
} # hit 769 769 } # hit
} # result 770 770 } # result
} 771 771 }
772 772
773 773
774 774
#========================================================================== 775 775 #==========================================================================
#Test whether the lengths of two proteins are withing a predefined 776 776 #Test whether the lengths of two proteins are withing a predefined
#legnth specified by the user. This are the options for control: 777 777 #legnth specified by the user. This are the options for control:
# X: Either protein is larger than the cutoff 778 778 # X: Either protein is larger than the cutoff
# B: Both proteins are larger than the cutoff 779 779 # B: Both proteins are larger than the cutoff
# Q: Only the query protein is larger than the cutoff 780 780 # Q: Only the query protein is larger than the cutoff
# S: Only the subject protein is larger than the cutoff 781 781 # S: Only the subject protein is larger than the cutoff
# N: No control. Any length is ok. 782 782 # N: No control. Any length is ok.
783 783
sub max_length_violation { 784 784 sub max_length_violation {
785 785
my ($qlen, $slen, $maxLen, $control) = @_; 786 786 my ($qlen, $slen, $maxLen, $control) = @_;
787 787
if ($control eq "X") { 788 788 if ($control eq "X") {
(($qlen >= $maxLen) || ($slen >= $maxLen))? return 1 : return 0; 789 789 (($qlen >= $maxLen) || ($slen >= $maxLen))? return 1 : return 0;
} 790 790 }
791 791
if ($control eq "B") { 792 792 if ($control eq "B") {
(($qlen >= $maxLen) && ($slen >= $maxLen))? return 1 : return 0; 793 793 (($qlen >= $maxLen) && ($slen >= $maxLen))? return 1 : return 0;
} 794 794 }
795 795
if ($control eq "Q") { 796 796 if ($control eq "Q") {
($qlen >= $maxLen)? return 1 : return 0; 797 797 ($qlen >= $maxLen)? return 1 : return 0;
} 798 798 }
799 799
if ($control eq "S") { 800 800 if ($control eq "S") {
($slen >= $maxLen)? return 1 : return 0; 801 801 ($slen >= $maxLen)? return 1 : return 0;
} 802 802 }
803 803
if ($control eq "N") { 804 804 if ($control eq "N") {
return 0; 805 805 return 0;
} 806 806 }
807 807
die "Unknown control mode: $control"; 808 808 die "Unknown control mode: $control";
809 809
} 810 810 }
811 811
812 812
813 813
814 814
#========================================================================== 815 815 #==========================================================================
#Parse blast output 816 816 #Parse blast output
817 817
818 818
sub parse_blast { 819 819 sub parse_blast {
my $out = shift; 820 820 my $out = shift;
821 821
open (my $fh, "<", $alnFile) || die $!; 822 822 open (my $fh, "<", $alnFile) || die $!;
LINE:while (<$fh>) { 823 823 LINE:while (<$fh>) {
chomp; 824 824 chomp;
next unless ($_); 825 825 next unless ($_);
next if (/^#/); 826 826 next if (/^#/);
827 827
#Blast columns: qacc sacc qlen slen evalue pident qstart qend sstart send qseq sseq 828 828 #Blast columns: qacc sacc qlen slen evalue pident qstart qend sstart send qseq sseq
my ($qacc, $sacc, $qlen, $slen, $eval, $id, $qstart, $qend, $sstart, $send, $qseq, $sseq) = split (/\t/, $_); 829 829 my ($qacc, $sacc, $qlen, $slen, $eval, $id, $qstart, $qend, $sstart, $send, $qseq, $sseq) = split (/\t/, $_);
830 830
831 831
if ($eval <= $evalue) { 832 832 if ($eval <= $evalue) {
833 833
my $qcov = ($qend - $qstart + 1) / $qlen * 100; 834 834 my $qcov = ($qend - $qstart + 1) / $qlen * 100;
my $scov = ($send - $sstart + 1) / $slen * 100; 835 835 my $scov = ($send - $sstart + 1) / $slen * 100;
836 836
if (TCDB::Assorted::coverage_ok($qcov, $scov, $coverage, $covControl)) { 837 837 if (TCDB::Assorted::coverage_ok($qcov, $scov, $coverage, $covControl)) {
838 838
push(@{ $out }, {qacc=>$qacc, sacc=>$sacc, qlen=>$qlen, slen=>$slen, qcov=>$qcov, 839 839 push(@{ $out }, {qacc=>$qacc, sacc=>$sacc, qlen=>$qlen, slen=>$slen, qcov=>$qcov,
scov=>$scov, evalue=>$eval, id=>$id, qstart=>$qstart, qend=>$qend, 840 840 scov=>$scov, evalue=>$eval, id=>$id, qstart=>$qstart, qend=>$qend,
sstart=>$sstart, send=>$send, qseq=>$qseq, sseq=>$sseq}); 841 841 sstart=>$sstart, send=>$send, qseq=>$qseq, sseq=>$sseq});
} 842 842 }
} 843 843 }
} 844 844 }
close $fh; 845 845 close $fh;
} 846 846 }
847 847
848 848
849 849
850 850
851 851
#========================================================================== 852 852 #==========================================================================
#Run the alignemnt between the two files depending on the program 853 853 #Run the alignemnt between the two files depending on the program
#Selected by the user. 854 854 #Selected by the user.
855 855
locateFragment.pl View file @ 538dec8
#!/usr/bin/env perl -w 1 1 #!/usr/bin/env perl -w
2 2
use strict; 3 3 use strict;
use warnings; 4 4 use warnings;
use Data::Dumper; 5 5 use Data::Dumper;
6 6
$Data::Dumper::Deepcopy = 1; 7 7 $Data::Dumper::Deepcopy = 1;
8 8
use Getopt::Long; 9 9 use Getopt::Long;
use Bio::SearchIO; 10 10 use Bio::SearchIO;
use Bio::SeqIO; 11 11 use Bio::SeqIO;
12 12
13 13
#Local libraries 14 14 #Local libraries
use TCDB::CheckDependencies; 15 15 use TCDB::CheckDependencies;
use TCDB::Assorted; 16 16 use TCDB::Assorted;
17 17
18 18
########################################################################### 19 19 ###########################################################################
# 20 20 #
# Parse GBLAST output file and return the sequence of the proteins that 21 21 # Parse GBLAST output file and return the sequence of the proteins that
# meet the user's criteria. 22 22 # meet the user's criteria.
# 23 23 #
########################################################################### 24 24 ###########################################################################
25 25
26 26
#This will prevent quod and alnquod from going into interactive mode 27 27 #This will prevent quod and alnquod from going into interactive mode
$ENV{"MPLBACKEND"} = "agg"; 28 28 $ENV{"MPLBACKEND"} = "agg";
29 29
30 30
#========================================================================== 31 31 #==========================================================================
#Check dependencies 32 32 #Check dependencies
33 33
my @dependencies = ("glsearch36", "blastdbcmd"); 34 34 my @dependencies = ("glsearch36", "blastdbcmd");
my $CheckDep_obj = new TCDB::CheckDependencies(); 35 35 my $CheckDep_obj = new TCDB::CheckDependencies();
$CheckDep_obj -> dependencies_list(\@dependencies); 36 36 $CheckDep_obj -> dependencies_list(\@dependencies);
$CheckDep_obj -> checkDependencies; 37 37 $CheckDep_obj -> checkDependencies;
38 38
39 39
40 40
41 41
#========================================================================== 42 42 #==========================================================================
#Read command line arguments 43 43 #Read command line arguments
44 44
my $fragment = undef; 45 45 my $fragment = undef;
my $accession = undef; 46 46 my $accession = undef;
my $accFile = undef 47 47 my $accFile = undef;
my $outdir = undef; 48 48 my $outdir = undef;
my $blastdb = undef; 49 49 my $blastdb = "uniref90";
my $evalue = 1e-2; 50 50 my $evalue = 1e-2;
my $subMatrix = 'BL50'; 51 51 my $subMatrix = 'BL50';
my $quiet = 0; 52 52 my $quiet = 0;
my $interactive = 0; 53 53 my $interactive = 0;
54 54
read_command_line(); 55 55 read_command_line();
56 56
#print Data::Dumper->Dump([$fragment, $accession, $accFile, $outdir, $blastdb, $evalue, $quiet], 57 57 #print Data::Dumper->Dump([$fragment, $accession, $accFile, $outdir, $blastdb, $evalue, $quiet],
# [qw(*fragment *accession *accFile *outdir *blastdb *evalue *quiet)]); 58 58 # [qw(*fragment *accession *accFile *outdir *blastdb *evalue *quiet)]);
#exit; 59 59 #exit;
60 60
61 61
62 62
63 63
#========================================================================== 64 64 #==========================================================================
#Get the sequences for the fragment and full proteins in files 65 65 #Get the sequences for the fragment and full proteins in files
66 66
67 67
my ($fragFile, $protFile) = @{ getSequences($fragment, $accession) }; 68 68 my ($fragFile, $protFile) = @{ getSequences($fragment, $accession) };
#print Data::Dumper->Dump([$fragFile, $protFile ], [qw(*fragFile *protFile)]); 69 69 #print Data::Dumper->Dump([$fragFile, $protFile ], [qw(*fragFile *protFile)]);
70 70
die "Both files must exist:\n $fragFile\n $protFile" unless (-f $fragFile && -f $protFile); 71 71 die "Both files must exist:\n $fragFile\n $protFile" unless (-f $fragFile && -f $protFile);
72 72
#========================================================================== 73 73 #==========================================================================
#align fragment to full protein 74 74 #align fragment to full protein
75 75
my $alignFile = "$outdir/${accession}_glsearch.out"; 76 76 my $alignFile = "$outdir/${accession}_glsearch.out";
my $cmd = qq(glsearch36 -s BL62 -z 21 -k 10000 -E $evalue -s $subMatrix $fragFile $protFile > $alignFile); 77 77 my $cmd = qq(glsearch36 -s BL62 -z 21 -k 10000 -E $evalue -s $subMatrix $fragFile $protFile > $alignFile);
system $cmd unless (-f $alignFile); 78 78 system $cmd unless (-f $alignFile);
79 79
80 80
#========================================================================== 81 81 #==========================================================================
#Parse glsearch output and run quod 82 82 #Parse glsearch output and run quod
83 83
84 84
run_quod ($alignFile, $protFile); 85 85 run_quod ($alignFile, $protFile);
86 86
87 87
88 88
89 89
90 90
#========================================================================== 91 91 #==========================================================================
################ Subroutines definition beyond ths point ############## 92 92 ################ Subroutines definition beyond ths point ##############
#========================================================================== 93 93 #==========================================================================
94 94
95 95
sub run_quod { 96 96 sub run_quod {
97 97
my ($alignment, $sequence) = @_; 98 98 my ($alignment, $sequence) = @_;
99 99
100 100
my $parser = new Bio::SearchIO (-format => 'fasta', -file => $alignment); 101 101 my $parser = new Bio::SearchIO (-format => 'fasta', -file => $alignment);
102 102
my @res = (); 103 103 my @res = ();
104 104
#---------------------------------------------------------------------- 105 105 #----------------------------------------------------------------------
#Parse glsearch output 106 106 #Parse glsearch output
107 107
my $hsp_cnt = 1; 108 108 my $hsp_cnt = 1;
while (my $result = $parser->next_result) { 109 109 while (my $result = $parser->next_result) {
110 110
HIT:while (my $hit = $result->next_hit) { 111 111 HIT:while (my $hit = $result->next_hit) {
HSP:while(my $hsp = $hit->next_hsp) { 112 112 HSP:while(my $hsp = $hit->next_hsp) {
113 113
my %data = (); 114 114 my %data = ();
my $key = "hsp_$hsp_cnt"; 115 115 my $key = "hsp_$hsp_cnt";
my $hId = $hsp->frac_identical('total'); #identity in the alignment 116 116 my $hId = $hsp->frac_identical('total'); #identity in the alignment
my $hSim = $hsp->frac_conserved('total'); #similarity in the alignment 117 117 my $hSim = $hsp->frac_conserved('total'); #similarity in the alignment
118 118
#Alignment parameters 119 119 #Alignment parameters
$data{hsp} = $key; 120 120 $data{hsp} = $key;
$data{evalue} = $hsp->evalue; 121 121 $data{evalue} = $hsp->evalue;
$data{id} = $hId; 122 122 $data{id} = $hId;
$data{sim} = $hSim; 123 123 $data{sim} = $hSim;
124 124
#coordinates in the alignment to plot bars 125 125 #coordinates in the alignment to plot bars
$data{qstart} = $hsp->start('query'); 126 126 $data{qstart} = $hsp->start('query');
$data{qend} = $hsp->end('query'); 127 127 $data{qend} = $hsp->end('query');
$data{sstart} = $hsp->start('subject'); 128 128 $data{sstart} = $hsp->start('subject');
$data{send} = $hsp->end('subject'); 129 129 $data{send} = $hsp->end('subject');
130 130
push (@res, \%data); 131 131 push (@res, \%data);
$hsp_cnt++; 132 132 $hsp_cnt++;
} 133 133 }
} 134 134 }
} 135 135 }
136 136
die "Error: no match found between fragment and sequence" unless (@res); 137 137 die "Error: no match found between fragment and sequence" unless (@res);
# print Data::Dumper->Dump([\@res ], [qw(*res )]); 138 138 # print Data::Dumper->Dump([\@res ], [qw(*res )]);
139 139
140 140
#---------------------------------------------------------------------- 141 141 #----------------------------------------------------------------------
#Generate quod plot 142 142 #Generate quod plot
143 143
#Format string for the regions 144 144 #Format string for the regions
my $regions = "--add-tms "; 145 145 my $regions = "--add-tms ";
my $coords = ""; 146 146 my $coords = "";
foreach my $hit (@res) { 147 147 foreach my $hit (@res) {
148 148
$coords = $hit->{sstart} . "-" . $hit->{send}; 149 149 $coords = $hit->{sstart} . "-" . $hit->{send};
$regions .= "${coords}:green"; 150 150 $regions .= "${coords}:green";
151 151
#only the best HSP is required to be plotted 152 152 #only the best HSP is required to be plotted
last; 153 153 last;
} 154 154 }
155 155
my $outPlot = "$outdir/${accession}_map_frag.png"; 156 156 my $outPlot = "$outdir/${accession}_map_frag.png";
my $qstring = ($quiet)? "-q -o $outPlot" : "-o $outPlot"; 157 157 my $qstring = ($quiet)? "-q" : "";
my $iString = ($interactive)? "-o $outPlot --show" : ""; 158 158 my $iString = "-o $outPlot";
159 159
my $cmd = qq(quod.py $qstring $iString -l "$accession ($coords)" --xticks 25 --grid $regions -- $sequence); 160 160 my $cmd = qq(quod.py $qstring $iString -l "$accession ($coords)" --xticks 25 --grid $regions -- $sequence 2>/dev/null);
print "$cmd\n"; 161 161 #print "$cmd\n";
system $cmd; 162 162 system $cmd;
} 163 163 }
164 164
165 165
166 166
167 167
sub getSequences { 168 168 sub getSequences {
169 169
my ($frag, $acc) = @_; 170 170 my ($frag, $acc) = @_;
171 171
172 172
# print Data::Dumper->Dump([$frag, $acc, $accFile ], [qw(*frag *acc *accFile)]); 173 173 # print Data::Dumper->Dump([$frag, $acc, $accFile ], [qw(*frag *acc *accFile)]);
# exit; 174 174 # exit;
175 175
176 176
#Sequence for full protein 177 177 #Sequence for full protein
my $accSeq = (-f $accFile)? $accFile : "$outdir/${acc}.faa"; 178 178 my $accSeq = ($accFile && -f $accFile)? $accFile : "$outdir/${acc}.faa";
179 179
180 180
#Save fragment to file 181 181 #Save fragment to file
my $tmpFile = "$outdir/${acc}_frag.faa"; 182 182 my $tmpFile = "$outdir/${acc}_frag.faa";
183 183
184 184
if (-f $frag) { 185 185 if (-f $frag) {
system "mv $frag $tmpFile" unless (-f $tmpFile); 186 186 system "mv $frag $tmpFile" unless (-f $tmpFile);
} 187 187 }
else { 188 188 else {
unless (-f $tmpFile) { 189 189 unless (-f $tmpFile) {
open (my $fh, ">", $tmpFile) || die $!; 190 190 open (my $fh, ">", $tmpFile) || die $!;
print $fh ">${acc} fragment\n$frag\n"; 191 191 print $fh ">${acc} fragment\n$frag\n";
close $fh; 192 192 close $fh;
} 193 193 }
} 194 194 }
195 195
unless (-f $accSeq) { 196 196 unless (-f $accSeq) {
#Blast DB to be used 197 197 #Blast DB to be used
my $db = ($blastdb)? $blastdb : 'nr'; 198 198 my $db = ($blastdb)? $blastdb : 'nr';
199 199
my $cmd = qq(blastdbcmd -db $db -entry $acc -target_only -outfmt '\%f' -out $accSeq); 200 200 my $cmd = qq(blastdbcmd -db $db -entry $acc -target_only -outfmt '\%f' -out $accSeq);
system $cmd; 201 201 system $cmd;
202 202
#Remove the version and annotations from the sequence file 203 203 #Remove the version and annotations from the sequence file
my $cmd2 = qq(perl -i.bkp -pe 's/^\\>(\\w+)\.\*/\\>\$1/;' $accSeq); 204 204 my $cmd2 = qq(perl -i -pe 's/^\\>(\\w+)\.\*/\\>\$1/;' $accSeq);
system $cmd2 unless (-f "${accSeq}.pkp"); 205 205 system $cmd2 unless (-f "${accSeq}.pkp");
} 206 206 }
207 207
#Return files to be aligned 208 208 #Return files to be aligned
return [$tmpFile, $accSeq]; 209 209 return [$tmpFile, $accSeq];
} 210 210 }
211 211
212 212
213 213
214 214
#=========================================================================== 215 215 #===========================================================================
#Read command line and print help 216 216 #Read command line and print help
217 217
218 218
sub read_command_line { 219 219 sub read_command_line {
220 220
print_help() unless (@ARGV); 221 221 print_help() unless (@ARGV);
222 222
my $status = GetOptions( 223 223 my $status = GetOptions(
"i|acc-file=s" => \&read_accFile, 224 224 "i|acc-file=s" => \&read_accFile,
"a|accession=s" => \&read_accession, 225 225 "a|accession=s" => \&read_accession,
"f|fragment=s" => \&read_fragment, 226 226 "f|fragment=s" => \&read_fragment,
"o|outdir=s" => \$outdir, 227 227 "o|outdir=s" => \$outdir,
"bdb|blastdb=s" => \&read_blastdb, 228 228 "bdb|blastdb=s" => \&read_blastdb,
"e|evalue=f" => \$evalue, 229 229 "e|evalue=f" => \$evalue,
"m|sub-matrix=s" => \&read_subMatrix, 230 230 "m|sub-matrix=s" => \&read_subMatrix,
"t|interactive!" => \$interactive, 231 231 "t|interactive!" => \$interactive,
"q|quiet!" => \$quiet, 232 232 "q|quiet!" => \$quiet,
"h|help" => sub { print_help(); }, 233 233 "h|help" => sub { print_help(); },
"<>" => sub { die "Error: Unknown argument: $_[0]\n"; }); 234 234 "<>" => sub { die "Error: Unknown argument: $_[0]\n"; });
exit unless ($status); 235 235 exit unless ($status);
236 236
237 237
die "Error: option -f is mandatory." unless ($fragment); 238 238 die "Error: option -f is mandatory." unless ($fragment);
die "Error: options -i or -a are mandatory." unless ($accession || $accFile); 239 239 die "Error: options -i or -a are mandatory." unless ($accession || $accFile);
die "Error: flags -t and -q cannot be set at the same time!" if ($quiet && $interactive); 240 240 die "Error: flags -t and -q cannot be set at the same time!" if ($quiet && $interactive);
241 241
#Default value for output directory 242 242 #Default value for output directory
$outdir = "." unless ($outdir); 243 243 $outdir = "." unless ($outdir);
system "mkdir -p $outdir" unless (-d $outdir); 244 244 system "mkdir -p $outdir" unless (-d $outdir);
245 245
#When accession is not given, take the first part of the input file name as accession 246 246 #When accession is not given, take the first part of the input file name as accession
unless ($accession) { 247 247 unless ($accession) {
my @pathComp = split(/\//, $accFile); 248 248 my @pathComp = split(/\//, $accFile);
$accession = $pathComp[-1]; 249 249 $accession = $pathComp[-1];
$accession =~ s/(\.faa|\.fasta)$//g; 250 250 $accession =~ s/(\.faa|\.fasta)$//g;
} 251 251 }
252 252
} 253 253 }
254 254
255 255
#========================================================================== 256 256 #==========================================================================
#Option -i 257 257 #Option -i
258 258
sub read_accFile { 259 259 sub read_accFile {
my ($opt, $value) = @_; 260 260 my ($opt, $value) = @_;
261 261
unless (-f $value && !(-z $value)) { 262 262 unless (-f $value && !(-z $value)) {
die "Error in option -i: File with sequences does not exist or is empty!\n"; 263 263 die "Error in option -i: File with sequences does not exist or is empty!\n";
} 264 264 }
265 265
$accFile = $value; 266 266 $accFile = $value;
} 267 267 }
268 268
269 269
#========================================================================== 270 270 #==========================================================================
#Option -a 271 271 #Option -a
272 272
sub read_accession { 273 273 sub read_accession {
my ($opt, $value) = @_; 274 274 my ($opt, $value) = @_;
275 275
#Remove version number if any 276 276 #Remove version number if any
$value =~ s/\.\d+$//; 277 277 #$value =~ s/\.\d+$//;
1 1
#Help 2 2 #Help
if [ "$1" = "-h" ] 3 3 if [ "$1" = "-h" ]
then 4 4 then
echo "Locate aligned fragments within 2 full proteins" 5 5 echo "Locate aligned fragments within 2 full proteins"
echo "Arguments:" 6 6 echo "Arguments:"
echo " 1. NCBI Accession of first protein 1" 7 7 echo " 1. Accession of protein 1"
echo " 2. Aligned fragment of protein 1" 8 8 echo " 2. Aligned fragment of protein 1"
echo " 3. NCBI Accession of first protein 2" 9 9 echo " 3. Accession of protein 2"
echo " 4. Aligned fragment of protein 2" 10 10 echo " 4. Aligned fragment of protein 2"
echo " 5. (Optional) substitution matrix to use. (Defaul: BL50)" 11 11 echo " 5. (optional) blast DB to extract sequences from (Default: uniref90)"
echo " 6. (Optional) Indicate whether plots will be shown (Values: show/quiet; Default: show)" 12 12 echo " 6. (Optional) substitution matrix to use. (Defaul: BL50)"
13 echo " 7. (Optional) Indicate whether plots will be shown (Values: show/quiet; Default: show)"
exit 1 13 14 exit 1
fi 14 15 fi
15 16
16 17
17 18
#Define the substitution matrix to work with 18 19 #Define the substitution matrix to work with
mat="BL50" 19 20 mat="BL50"
mode="" 20 21 mode=""
22 db="uniref90"
21 23
24 #Identify the blast DB to extract sequences from
25 if [[ ! -z "$5" ]] && ([[ "$5" != "quiet" ]] && [[ "$5" != "show" ]] && [[ "$5" != "uniref90" ]])
26 then
27 db="$5"
28 fi
22 29
#Identify the type of substitution matrix, if given 23 30 #Identify the type of substitution matrix, if given
if [[ ! -z "$5" ]] && ([[ "$5" != "quiet" ]] && [[ "$5" != "show" ]]) 24 31 if [[ ! -z "$6" ]] && ([[ "$6" != "quiet" ]] && [[ "$6" != "show" ]] && [[ "$6" != "uniref90" ]])
then 25 32 then
mat=$5 26 33 mat="$6"
fi 27 34 fi
28 35
29
#Check the mode of operation: quiet/show 30 36 #Check the mode of operation: quiet/show
if [[ "$5" == "quiet" ]] || [[ "$6" == "quiet" ]] 31 37 if [[ "$6" == "quiet" ]] || [[ "$7" == "quiet" ]]
then 32 38 then
mode="-q" 33 39 mode="-q"
fi 34 40 fi
35 41
locateFragment.pl -a $1 -f $2 $mode 36 42
locateFragment.pl -a $3 -f $4 $mode 37 43
44 #localizing fragments
45 locateFragment.pl -a $1 -f $2 $mode -bdb $db
46 locateFragment.pl -a $3 -f $4 $mode -bdb $db
47
48 #Aligning fragments and full sequences
alignSeqsFiles.pl -q $1_frag.faa -ql $1_frag -s $3_frag.faa -sl $3_frag -e 0.1 -c 20 -cc X -m $mat 38 49 alignSeqsFiles.pl -q $1_frag.faa -ql $1_frag -s $3_frag.faa -sl $3_frag -e 0.1 -c 20 -cc X -m $mat
alignSeqsFiles.pl -q $1.faa -ql $1 -s $3.faa -sl $3 -e 0.1 -c 5 -cc X -m $mat 39 50 alignSeqsFiles.pl -q $1.faa -ql $1 -s $3.faa -sl $3 -e 0.1 -c 5 -cc X -m $mat
40 51
41 52
#Open html reports if apropriate 42 53 #Open html reports if apropriate
if [[ $mode != "-q" ]] 43 54 if [[ $mode != "-q" ]]
then 44 55 then