Commit bbf0ef5765e6cc205108b61dd2dfbf0d0a0e6137

Authored by Luis Arturo Medrano-Soto
1 parent 68b41f16a3
Exists in master

Added the ability to generate a tsv file as output for easy parsing

Showing 1 changed file with 13 additions and 3 deletions Inline Diff

alignSeqsFiles.pl View file @ bbf0ef5
#!/usr/bin/env perl -w 1 1 #!/usr/bin/env perl -w
2 2
use strict; 3 3 use strict;
use warnings; 4 4 use warnings;
use Data::Dumper; 5 5 use Data::Dumper;
6 6
$Data::Dumper::Deepcopy = 1; 7 7 $Data::Dumper::Deepcopy = 1;
8 8
use Getopt::Long; 9 9 use Getopt::Long;
use LWP; 10 10 use LWP;
use Bio::SeqIO; 11 11 use Bio::SeqIO;
use Bio::SearchIO; 12 12 use Bio::SearchIO;
13 13
#Local libraries 14 14 #Local libraries
use TCDB::CheckDependencies; 15 15 use TCDB::CheckDependencies;
use TCDB::Domain::PfamParser; 16 16 use TCDB::Domain::PfamParser;
use TCDB::Domain::Characterize; 17 17 use TCDB::Domain::Characterize;
use TCDB::Assorted; 18 18 use TCDB::Assorted;
19 19
20 20
########################################################################### 21 21 ###########################################################################
# 22 22 #
# Comapre two files with fasta sequences and report the alignment parameters 23 23 # Comapre two files with fasta sequences and report the alignment parameters
# Along with hydropathy plots and PFAM domains. 24 24 # Along with hydropathy plots and PFAM domains.
# 25 25 #
########################################################################### 26 26 ###########################################################################
27 27
#========================================================================== 28 28 #==========================================================================
#Check dependencies 29 29 #Check dependencies
30 30
my @dependencies = ("zgrep", "blastp", "ssearch36", "hmmtop", "blastdbcmd", 31 31 my @dependencies = ("zgrep", "blastp", "ssearch36", "hmmtop", "blastdbcmd",
"hmmscan"); 32 32 "hmmscan");
my $CheckDep_obj = new TCDB::CheckDependencies(); 33 33 my $CheckDep_obj = new TCDB::CheckDependencies();
$CheckDep_obj -> dependencies_list(\@dependencies); 34 34 $CheckDep_obj -> dependencies_list(\@dependencies);
$CheckDep_obj -> checkDependencies; 35 35 $CheckDep_obj -> checkDependencies;
36 36
37 37
#This will prevent quod and alnquod from going into interactive mode 38 38 #This will prevent quod and alnquod from going into interactive mode
$ENV{"MPLBACKEND"} = "agg"; 39 39 $ENV{"MPLBACKEND"} = "agg";
40 40
41 41
#========================================================================== 42 42 #==========================================================================
#Read command line arguments 43 43 #Read command line arguments
44 44
my $qfile = ""; 45 45 my $qfile = "";
my $qProt = ""; 46 46 my $qProt = "";
my $sfile = ""; 47 47 my $sfile = "";
my $sProt = ""; 48 48 my $sProt = "";
my $qlabel = "Query"; 49 49 my $qlabel = "Query";
my $slabel = "Subject"; 50 50 my $slabel = "Subject";
my $outdir = ""; 51 51 my $outdir = "";
my $prog = 'ssearch36'; #'blastp'; 52 52 my $prog = 'ssearch36'; #'blastp';
my $evalue = 1e-4; 53 53 my $evalue = 1e-4;
my $identity = 20.0; 54 54 my $identity = 20.0;
my $coverage = 40.0; 55 55 my $coverage = 40.0;
my $covControl = "X"; 56 56 my $covControl = "X";
my $blastComp = "F"; #2; 57 57 my $blastComp = "F"; #2;
my $segFilter = 'no'; 58 58 my $segFilter = 'no';
my $minLength = 30; #Min legnth of proteins to analyze (without gaps) 59 59 my $minLength = 30; #Min legnth of proteins to analyze (without gaps)
my $subMatrix = 'BL50'; 60 60 my $subMatrix = 'BL50';
my $hyd_qylim = undef; #Y-axis limits for query hydropathy plot [low, high] 61 61 my $hyd_qylim = undef; #Y-axis limits for query hydropathy plot [low, high]
my $hyd_sylim = undef; #Y-axis limits for subject hydropathy plot [low, high] 62 62 my $hyd_sylim = undef; #Y-axis limits for subject hydropathy plot [low, high]
63 63
#this can be used to remove long sequences from results 64 64 #this can be used to remove long sequences from results
my $maxProtLength = 100000; #default threshold to allow any length 65 65 my $maxProtLength = 100000; #default threshold to allow any length
my $LengthControl = "N"; #same meaning as $covControl 66 66 my $LengthControl = "N"; #same meaning as $covControl
67 67
#internal directories 68 68 #internal directories
my $filesDir = ""; 69 69 my $filesDir = "";
my $plotsDir = ""; 70 70 my $plotsDir = "";
my $seqDir = ""; 71 71 my $seqDir = "";
my $blastDir = ""; 72 72 my $blastDir = "";
73 73
74 74
read_command_line(); 75 75 read_command_line();
76 76
#print Data::Dumper->Dump([$qfile, $qProt, $sfile, $sProt, $qlabel, $slabel, $outdir, $prog, 77 77 #print Data::Dumper->Dump([$qfile, $qProt, $sfile, $sProt, $qlabel, $slabel, $outdir, $prog,
# $evalue, $coverage, $covControl, $blastComp, $segFilter, $maxProtLength, 78 78 # $evalue, $coverage, $covControl, $blastComp, $segFilter, $maxProtLength,
# $LengthControl], 79 79 # $LengthControl],
# [qw(*qfile *qProt *sfile *sProt *qlabel *slabel *outdir *prog 80 80 # [qw(*qfile *qProt *sfile *sProt *qlabel *slabel *outdir *prog
# *evalue *coverage *covControl *blastComp *segFilter *maxProtLength 81 81 # *evalue *coverage *covControl *blastComp *segFilter *maxProtLength
# *LengthControl)]); 82 82 # *LengthControl)]);
#exit; 83 83 #exit;
84 84
85 85
86 86
#========================================================================== 87 87 #==========================================================================
#Output files 88 88 #Output files
89 89
#The alignment file by blastp or ssearch36 90 90 #The alignment file by blastp or ssearch36
my $alnFile = "$filesDir/${prog}.out"; 91 91 my $alnFile = "$filesDir/${prog}.out";
92 92
#The results of running hmmscan 93 93 #The results of running hmmscan
my $pfamFile = "$filesDir/hmmscan.out"; 94 94 my $pfamFile = "$filesDir/hmmscan.out";
95 95
#The results from running hmmtop 96 96 #The results from running hmmtop
my $hmmtopFile = "$filesDir/hmmtop.out"; 97 97 my $hmmtopFile = "$filesDir/hmmtop.out";
98 98
#The blast database to retrieve sequences for ploting 99 99 #The blast database to retrieve sequences for ploting
my $blastdb = "$blastDir/sequences"; 100 100 my $blastdb = "$blastDir/sequences";
101 101
102 102
103 103
#========================================================================== 104 104 #==========================================================================
#Run the alignment first 105 105 #Run the alignment first
106 106
print "Running $prog and parsing output....\n"; 107 107 print "Running $prog and parsing output....\n";
run_alignment(); 108 108 run_alignment();
109 109
110 110
my @alnHits = (); 111 111 my @alnHits = ();
if ($prog eq 'blastp') { parse_blast(\@alnHits); } 112 112 if ($prog eq 'blastp') { parse_blast(\@alnHits); }
elsif ($prog eq 'ssearch36') { parse_ssearch(\@alnHits)} 113 113 elsif ($prog eq 'ssearch36') { parse_ssearch(\@alnHits)}
114 114
#print Data::Dumper->Dump([\@alnHits ], [qw($alnHits )]); 115 115 #print Data::Dumper->Dump([\@alnHits ], [qw($alnHits )]);
#exit; 116 116 #exit;
117 117
118 118
die "No significant blastHits found!\n" unless (@alnHits); 119 119 die "No significant blastHits found!\n" unless (@alnHits);
120 120
121 121
122 122
#========================================================================== 123 123 #==========================================================================
#Run pfam (get clans, hmmtop, and parse results 124 124 #Run pfam (get clans, hmmtop, and parse results
125 125
my %pfamHits = (); 126 126 my %pfamHits = ();
my %clans = (); 127 127 my %clans = ();
my %hmmtopHits = (); 128 128 my %hmmtopHits = ();
run_pfam_hmmtop(\%pfamHits, \%hmmtopHits,\%clans ); 129 129 run_pfam_hmmtop(\%pfamHits, \%hmmtopHits,\%clans );
130 130
#print Data::Dumper->Dump([\%clans], [qw(*clans)]); 131 131 #print Data::Dumper->Dump([\%clans], [qw(*clans)]);
#exit; 132 132 #exit;
133 133
134 134
#========================================================================== 135 135 #==========================================================================
#Parse the alignment results to make sure there are signficant results, 136 136 #Parse the alignment results to make sure there are signficant results,
#get domains for significant hits and plot the corresponding hydropathies. 137 137 #get domains for significant hits and plot the corresponding hydropathies.
138 138
139 139
140 140
141 141
print "Geneating report...\n"; 142 142 print "Geneating report...\n";
generate_report(); 143 143 generate_report();
144 144
145 145
146 146
#========================================================================== 147 147 #==========================================================================
################ Subroutines definition beyond ths point ############## 148 148 ################ Subroutines definition beyond ths point ##############
#========================================================================== 149 149 #==========================================================================
150 150
151 151
#========================================================================== 152 152 #==========================================================================
#Generate output for significant hits 153 153 #Generate output for significant hits
154 154
sub generate_report { 155 155 sub generate_report {
156 156
157 157
#Prepare output files 158 158 #Prepare output files
my $htmlFile = "$outdir/report.html"; 159 159 my $htmlFile = "$outdir/report.html";
160 my $tsvFile = "$outdir/report.tsv";
my $plotsFile = "$outdir/plots.html"; 160 161 my $plotsFile = "$outdir/plots.html";
161 162
163 my $tsvHeader = "#Query\tSubject\tQlen\tSlen\tE-value\tIdentity\tQstart\tQend\tQcov\tSstart\tSend\tScov\tQseq\tSseq\n";
162 164
my $htmlHeader = <<HEADER; 163 165 my $htmlHeader = <<HEADER;
<!DOCTYPE html> 164 166 <!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml"> 165 167 <html xmlns="http://www.w3.org/1999/xhtml">
<head> 166 168 <head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> 167 169 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
168 170
<style type="text/css"> 169 171 <style type="text/css">
170 172
.label { 171 173 .label {
text-align: right; 172 174 text-align: right;
width: 50px; 173 175 width: 50px;
} 174 176 }
175 177
.data { 176 178 .data {
text-align: left; 177 179 text-align: left;
padding-left: 8px; 178 180 padding-left: 8px;
width: 100px; 179 181 width: 100px;
} 180 182 }
181 183
.uline { 182 184 .uline {
text-decoration: underline; 183 185 text-decoration: underline;
} 184 186 }
185 187
.pfam { 186 188 .pfam {
text-align: center; 187 189 text-align: center;
vertical-align: middle; 188 190 vertical-align: middle;
} 189 191 }
190 192
.seq { 191 193 .seq {
border: 2px solid black; 192 194 border: 2px solid black;
height: 70px; 193 195 height: 70px;
width: 100%; 194 196 width: 100%;
overflow-x: auto; 195 197 overflow-x: auto;
overflow-y: hidden; 196 198 overflow-y: hidden;
margin: 1em 0; 197 199 margin: 1em 0;
background: gray; 198 200 background: gray;
color: white; 199 201 color: white;
} 200 202 }
201 203
202 204
.dom { 203 205 .dom {
border: 2px solid black; 204 206 border: 2px solid black;
height: 100px; 205 207 height: 100px;
width: 100%; 206 208 width: 100%;
overflow-x: auto; 207 209 overflow-x: auto;
overflow-y: auto; 208 210 overflow-y: auto;
margin: 1em 0; 209 211 margin: 1em 0;
background: gray; 210 212 background: gray;
color: white; 211 213 color: white;
} 212 214 }
213 215
img { 214 216 img {
display: block; 215 217 display: block;
margin-left: auto; 216 218 margin-left: auto;
margin-right: auto; 217 219 margin-right: auto;
height: 250px; 218 220 height: 250px;
width: auto; 219 221 width: auto;
max-width: 1500px; 220 222 max-width: 1500px;
max-height: 300px; 221 223 max-height: 300px;
} 222 224 }
223 225
</style> 224 226 </style>
<title>$qlabel vs $slabel</title> 225 227 <title>$qlabel vs $slabel</title>
</head> 226 228 </head>
<br /> 227 229 <br />
<h1 style='text-align:center'>$qlabel vs $slabel</h1> 228 230 <h1 style='text-align:center'>$qlabel vs $slabel</h1>
<body> 229 231 <body>
230 232
HEADER 231 233 HEADER
232 234
233 235
236 open (my $tsvh, ">", $tsvFile) || die $!;
237 print $tsvh $tsvHeader;
238
open (my $outh, ">", $htmlFile) || die $!; 234 239 open (my $outh, ">", $htmlFile) || die $!;
print $outh $htmlHeader; 235 240 print $outh $htmlHeader;
236 241
237 242
foreach my $hit (sort by_evalue @alnHits) { 238 243 foreach my $hit (sort by_evalue @alnHits) {
239 244
240 245
my $qacc = $hit->{qacc}; 241 246 my $qacc = $hit->{qacc};
my $qlen = $hit->{qlen}; 242 247 my $qlen = $hit->{qlen};
my $qseq = $hit->{qseq}; 243 248 my $qseq = $hit->{qseq};
my $qcov = sprintf("%.1f", $hit->{qcov}); 244 249 my $qcov = sprintf("%.1f", $hit->{qcov});
my $qstart = $hit->{qstart}; 245 250 my $qstart = $hit->{qstart};
my $qend = $hit->{qend}; 246 251 my $qend = $hit->{qend};
247 252
my $sacc = $hit->{sacc}; 248 253 my $sacc = $hit->{sacc};
my $slen = $hit->{slen}; 249 254 my $slen = $hit->{slen};
my $sseq = $hit->{sseq}; 250 255 my $sseq = $hit->{sseq};
my $scov = sprintf("%.1f", $hit->{scov}); 251 256 my $scov = sprintf("%.1f", $hit->{scov});
my $sstart = $hit->{sstart}; 252 257 my $sstart = $hit->{sstart};
my $send = $hit->{send}; 253 258 my $send = $hit->{send};
254 259
my $eval = sprintf ("%.1e", $hit->{evalue}); 255 260 my $eval = sprintf ("%.1e", $hit->{evalue});
my $id = sprintf ("%.1f", $hit->{id}); 256 261 my $id = sprintf ("%.1f", $hit->{id});
my $hstr = $hit->{hstr}; 257 262 my $hstr = $hit->{hstr};
258 263
my $alnHit = <<HIT; 259 264 my $alnHit = <<HIT;
260 265
<br /><hr style=\"border-style:solid; border-width:5px; color:black;\"/> 261 266 <br /><hr style=\"border-style:solid; border-width:5px; color:black;\"/>
262 267
<p><b>$qacc ($qlen) vs $sacc ($slen)</b></p> 263 268 <p><b>$qacc ($qlen) vs $sacc ($slen)</b></p>
264 269
<table width="600px" border="0" cellspacing="0" cellpadding="2"> 265 270 <table width="600px" border="0" cellspacing="0" cellpadding="2">
<tr> 266 271 <tr>
<td class='label'><b>E-value:</b></td> 267 272 <td class='label'><b>E-value:</b></td>
<td class='data'>$eval</td> 268 273 <td class='data'>$eval</td>
<td class='label'><b>Identity:</b></td> 269 274 <td class='label'><b>Identity:</b></td>
<td class='data'>${id}%</td> 270 275 <td class='data'>${id}%</td>
<td class='label'><b>Q_coverage:</b></td> 271 276 <td class='label'><b>Q_coverage:</b></td>
<td class='data'>${qcov}%</td> 272 277 <td class='data'>${qcov}%</td>
<td class='label'><b>S_coverage:</b></td> 273 278 <td class='label'><b>S_coverage:</b></td>
<td class='data'>${scov}%</td> 274 279 <td class='data'>${scov}%</td>
</tr> 275 280 </tr>
<tr> 276 281 <tr>
<td class='label'><b>Q_aln:</b></td> 277 282 <td class='label'><b>Q_aln:</b></td>
<td class='data'>${qstart}-$qend</td> 278 283 <td class='data'>${qstart}-$qend</td>
<td class='label'><b>S_aln:</b></td> 279 284 <td class='label'><b>S_aln:</b></td>
<td class='data'>${sstart}-$send</td> 280 285 <td class='data'>${sstart}-$send</td>
<td class='label'></td> 281 286 <td class='label'></td>
<td class='data'></td> 282 287 <td class='data'></td>
<td class='label'></td> 283 288 <td class='label'></td>
<td class='data'></td> 284 289 <td class='data'></td>
</tr> 285 290 </tr>
</table> 286 291 </table>
<br /> 287 292 <br />
288 293
<p><b>Alignment:</b></p> 289 294 <p><b>Alignment:</b></p>
<div class='seq'> 290 295 <div class='seq'>
<pre> 291 296 <pre>
$qseq 292 297 $qseq
$hstr 293 298 $hstr
$sseq 294 299 $sseq
</pre> 295 300 </pre>
</div> 296 301 </div>
297 302
HIT 298 303 HIT
299 304
print $outh $alnHit; 300 305 print $outh $alnHit;
301 306
307 print $tsvh "$qacc\t$sacc\t$qlen\t$slen\t$eval\t$id\t$qstart\t$qend\t$qcov\t$sstart\t$send\t$scov\t$qseq\t$sseq\n";
302 308
#Generate the hydropathy plots 303 309 #Generate the hydropathy plots
my $good = run_quod($qacc, $sacc, $qstart, $qend, $sstart, $send, $qseq, $sseq); 304 310 my $good = run_quod($qacc, $sacc, $qstart, $qend, $sstart, $send, $qseq, $sseq);
die "Could not generate plots for hit: $qacc vs $sacc" unless ($good); 305 311 die "Could not generate plots for hit: $qacc vs $sacc" unless ($good);
306 312
my $domData = generate_domain_data($qacc, $sacc); 307 313 my $domData = generate_domain_data($qacc, $sacc);
my $domHTML = ""; 308 314 my $domHTML = "";
if ($domData) { 309 315 if ($domData) {
$domHTML =<<DATA; 310 316 $domHTML =<<DATA;
<br /> 311 317 <br />
<hr /> 312 318 <hr />
<p><b>Pfam info:</b></p> 313 319 <p><b>Pfam info:</b></p>
<div class='dom'> 314 320 <div class='dom'>
315 321
$domData 316 322 $domData
317 323
</div> 318 324 </div>
DATA 319 325 DATA
} 320 326 }
321 327
my $plot_aln = "plots/${qacc}_vs_${sacc}_qs${qstart}_qe${qend}_ss${sstart}_se${send}.png"; 322 328 my $plot_aln = "plots/${qacc}_vs_${sacc}_qs${qstart}_qe${qend}_ss${sstart}_se${send}.png";
my $qplot = "plots/${qacc}_vs_${sacc}_qaln_qs${qstart}_qe${qend}.png"; 323 329 my $qplot = "plots/${qacc}_vs_${sacc}_qaln_qs${qstart}_qe${qend}.png";
my $splot = "plots/${qacc}_vs_${sacc}_saln_ss${sstart}_se${send}.png"; 324 330 my $splot = "plots/${qacc}_vs_${sacc}_saln_ss${sstart}_se${send}.png";
325 331
#now include the plots 326 332 #now include the plots
my $prtPlots =<<PLOTS; 327 333 my $prtPlots =<<PLOTS;
328 334
<br /> 329 335 <br />
<table style="width:100%"> 330 336 <table style="width:100%">
<tr> 331 337 <tr>
<td><a href="$qplot" target="_blank"><img src="$qplot" alt="$qacc"></a></td> 332 338 <td><a href="$qplot" target="_blank"><img src="$qplot" alt="$qacc"></a></td>
<td><a href="$splot" target="_blank"><img src="$splot" alt="$sacc"></a></td> 333 339 <td><a href="$splot" target="_blank"><img src="$splot" alt="$sacc"></a></td>
</tr> 334 340 </tr>
<tr> 335 341 <tr>
<td colspan="2" style="text-align: center;"> 336 342 <td colspan="2" style="text-align: center;">
<a href="$plot_aln" target="_blank"><img src="$plot_aln" alt="$qacc vs $sacc alignment"></a> 337 343 <a href="$plot_aln" target="_blank"><img src="$plot_aln" alt="$qacc vs $sacc alignment"></a>
</td> 338 344 </td>
</tr> 339 345 </tr>
</table> 340 346 </table>
341 347
$domHTML 342 348 $domHTML
343 349
PLOTS 344 350 PLOTS
345 351
print $outh $prtPlots; 346 352 print $outh $prtPlots;
} 347 353 }
348 354
#Close HTML report 349 355 #Close HTML report
my $closeRep = <<CLOSE; 350 356 my $closeRep = <<CLOSE;
</body> 351 357 </body>
</html> 352 358 </html>
CLOSE 353 359 CLOSE
354 360
print $outh $closeRep; 355 361 print $outh $closeRep;
356 362
close $outh; 357 363 close $outh;
364 close $tsvh;
} 358 365 }
359 366
360 367
#========================================================================== 361 368 #==========================================================================
#Generate domain data for the html report 362 369 #Generate domain data for the html report
363 370
364 371
sub generate_domain_data { 365 372 sub generate_domain_data {
366 373
my ($q, $s) = @_; 367 374 my ($q, $s) = @_;
368 375
#Format of PFAM hash: 369 376 #Format of PFAM hash:
# push (@{ $out->{$qacc}->{$pfamID} }, 370 377 # push (@{ $out->{$qacc}->{$pfamID} },
# {pfamid=> $pfamID, dlen=>$pfamLen, dstart=>$dstart, dend=>$dend, evalue=>$eval, 371 378 # {pfamid=> $pfamID, dlen=>$pfamLen, dstart=>$dstart, dend=>$dend, evalue=>$eval,
# dname=>$pfamName, def=>$def, qlen=>$qlen, qstart=>$qstart, qend=>$qend }); 372 379 # dname=>$pfamName, def=>$def, qlen=>$qlen, qstart=>$qstart, qend=>$qend });
373 380
374 381
375 382
376 383
my @cols = qw(Query Domain Clan Dom_length E-value Dom_start Dom_end Q_Start Q_end Dom_Name Dom_Info); 377 384 my @cols = qw(Query Domain Clan Dom_length E-value Dom_start Dom_end Q_Start Q_end Dom_Name Dom_Info);
my $colStr = " <th>" . join ("</th>\n <th>", @cols) . 378 385 my $colStr = " <th>" . join ("</th>\n <th>", @cols) .
"</th>\n"; 379 386 "</th>\n";
380 387
my $header =<<HEADER; 381 388 my $header =<<HEADER;
<table border='1', style='width:100%'> 382 389 <table border='1', style='width:100%'>
<tr> 383 390 <tr>
$colStr 384 391 $colStr
</tr> 385 392 </tr>
HEADER 386 393 HEADER
387 394
388 395
my $res = ""; 389 396 my $res = "";
foreach my $prot ($q, $s) { 390 397 foreach my $prot ($q, $s) {
391 398
if (exists $pfamHits{$prot}) { 392 399 if (exists $pfamHits{$prot}) {
my @Doms = keys %{ $pfamHits{$prot} }; 393 400 my @Doms = keys %{ $pfamHits{$prot} };
394 401
foreach my $d (@Doms) { 395 402 foreach my $d (@Doms) {
396 403
my $clan = ($clans{$d})? $clans{$d} : "N/A"; 397 404 my $clan = ($clans{$d})? $clans{$d} : "N/A";
398 405
my @hits = @{ $pfamHits{$prot}{$d} }; 399 406 my @hits = @{ $pfamHits{$prot}{$d} };
foreach my $hit (@hits) { 400 407 foreach my $hit (@hits) {
my $dlen = $hit->{dlen}; 401 408 my $dlen = $hit->{dlen};
my $eval = $hit->{evalue}; 402 409 my $eval = $hit->{evalue};
my $qstart = $hit->{qstart}; 403 410 my $qstart = $hit->{qstart};
my $qend = $hit->{qend}; 404 411 my $qend = $hit->{qend};
my $dstart = $hit->{dstart}; 405 412 my $dstart = $hit->{dstart};
my $dend = $hit->{dend}; 406 413 my $dend = $hit->{dend};
my $name = $hit->{dname}; 407 414 my $name = $hit->{dname};
my $def = $hit->{def}; 408 415 my $def = $hit->{def};
409 416
$res .=<<ROW; 410 417 $res .=<<ROW;
<tr> 411 418 <tr>
<td class="pfam">$prot</td> 412 419 <td class="pfam">$prot</td>
<td class="pfam">$d</td> 413 420 <td class="pfam">$d</td>
<td class="pfam">$clan</td> 414 421 <td class="pfam">$clan</td>
<td class="pfam">$dlen</td> 415 422 <td class="pfam">$dlen</td>
<td class="pfam">$eval</td> 416 423 <td class="pfam">$eval</td>
<td class="pfam">$dstart</td> 417 424 <td class="pfam">$dstart</td>
<td class="pfam">$dend</td> 418 425 <td class="pfam">$dend</td>
<td class="pfam">$qstart</td> 419 426 <td class="pfam">$qstart</td>
<td class="pfam">$qend</td> 420 427 <td class="pfam">$qend</td>
<td class="pfam">$name</td> 421 428 <td class="pfam">$name</td>
<td class="pfam">$def</td> 422 429 <td class="pfam">$def</td>
</tr> 423 430 </tr>
ROW 424 431 ROW
} 425 432 }
} 426 433 }
} 427 434 }
} 428 435 }
429 436
#Return final result 430 437 #Return final result
if ($res) { 431 438 if ($res) {
$header .= $res; 432 439 $header .= $res;
$header .= " </table>\n"; 433 440 $header .= " </table>\n";
return $header; 434 441 return $header;
} 435 442 }
else { 436 443 else {
return $res; 437 444 return $res;
} 438 445 }
} 439 446 }
440 447
441 448
442 449
443 450
#========================================================================== 444 451 #==========================================================================
#Run quod on the query, subject and the alignment. 445 452 #Run quod on the query, subject and the alignment.
446 453
447 454
#quod.py -q -l "HEB99829" -o plot.png --width 15 --edgecolor red --xticks 25 --no-tms +0 --add-tms 9-32 43-67 98-121 132-151 164-181 192-215 224-241:orange -w 17-245:+2.7:+:Alignment --region-font 12 --add-region 20-245:'PF07556':-2.8,-2.6:red,black:tc --mark +0:K,R,H:black --xlim 0 400 -- HEB99829.faa 448 455 #quod.py -q -l "HEB99829" -o plot.png --width 15 --edgecolor red --xticks 25 --no-tms +0 --add-tms 9-32 43-67 98-121 132-151 164-181 192-215 224-241:orange -w 17-245:+2.7:+:Alignment --region-font 12 --add-region 20-245:'PF07556':-2.8,-2.6:red,black:tc --mark +0:K,R,H:black --xlim 0 400 -- HEB99829.faa
449 456
450 457
sub run_quod { 451 458 sub run_quod {
452 459
my ($q, $s, $qs, $qe, $ss, $se, $qseq, $sseq) = @_; 453 460 my ($q, $s, $qs, $qe, $ss, $se, $qseq, $sseq) = @_;
454 461
455 462
#extract sequences for query and subject 456 463 #extract sequences for query and subject
extract_full_sequences($q,$s); 457 464 extract_full_sequences($q,$s);
458 465
459 466
#----------------------------------------------------------------- 460 467 #-----------------------------------------------------------------
#Run quod for the alignment 461 468 #Run quod for the alignment
462 469
#First save aligned segments to files 463 470 #First save aligned segments to files
my $qalnFile ="$seqDir/${q}_aln.faa"; 464 471 my $qalnFile ="$seqDir/${q}_aln.faa";
open(my $qfh, '>', $qalnFile) || die $!; 465 472 open(my $qfh, '>', $qalnFile) || die $!;
print $qfh ">$q alignment\n$qseq\n"; 466 473 print $qfh ">$q alignment\n$qseq\n";
close $qfh; 467 474 close $qfh;
468 475
my $salnFile ="$seqDir/${s}_aln.faa"; 469 476 my $salnFile ="$seqDir/${s}_aln.faa";
open(my $sfh, '>', $salnFile) || die $!; 470 477 open(my $sfh, '>', $salnFile) || die $!;
print $sfh ">$s alignment\n$sseq\n"; 471 478 print $sfh ">$s alignment\n$sseq\n";
close $sfh; 472 479 close $sfh;
473 480
481 my $labelsStr = "--axis-font 18.0 --tick-font 15.0 --title-font 20.0 --xlabel Residues --ylabel Hydropathy --width 15 --xticks 25";
482 my $ylimStr = "--ylim -3 3";
474 483
484
#Note alnquod requires to add the extension to the image name 475 485 #Note alnquod requires to add the extension to the image name
my $alnFig = "$plotsDir/${q}_vs_${s}_qs${qs}_qe${qe}_ss${ss}_se${se}.png"; 476 486 my $alnFig = "$plotsDir/${q}_vs_${s}_qs${qs}_qe${qe}_ss${ss}_se${se}.png";
my $cmd1 = qq(quod.py -q -l "$q (red) and $s (blue)" -o $alnFig --xticks 25 --width 15 --edgecolor +0:red +1:blue --facecolor +0:orange +1:cyan --multi frag -- $qalnFile $seqDir/${q}.faa $salnFile $seqDir/${s}.faa); 477 487 my $cmd1 = qq(quod.py -q $labelsStr -l "$q (red) and $s (blue)" -o $alnFig $ylimStr --edgecolor +0:red +1:blue --facecolor +0:orange +1:cyan --multi frag -- $qalnFile $seqDir/${q}.faa $salnFile $seqDir/${s}.faa);
# print "$cmd1\n\n"; 478 488 # print "$cmd1\n\n";
# exit; 479 489 # exit;
system $cmd1 unless (-f "${alnFig}"); 480 490 system $cmd1 unless (-f "${alnFig}");
return undef unless (-f "${alnFig}"); 481 491 return undef unless (-f "${alnFig}");
482 492
483 493
#----------------------------------------------------------------- 484 494 #-----------------------------------------------------------------
#Run quod for the full sequencess of the query and subject proteins 485 495 #Run quod for the full sequencess of the query and subject proteins
486 496
487 497
#Extract TMS coordinates for query 488 498 #Extract TMS coordinates for query
die "Error: no hmmtop results for: $q" unless (exists $hmmtopHits{$q}); 489 499 die "Error: no hmmtop results for: $q" unless (exists $hmmtopHits{$q});
my $qTMS = ""; 490 500 my $qTMS = "";
if (scalar @{ $hmmtopHits{$q}{coords} } > 0) { 491 501 if (scalar @{ $hmmtopHits{$q}{coords} } > 0) {
$qTMS = "--add-tms " . join(",", @{ $hmmtopHits{$q}{coords} }) . ":orange"; 492 502 $qTMS = "--add-tms " . join(",", @{ $hmmtopHits{$q}{coords} }) . ":orange";
} 493 503 }
494 504
495 505
#Plot query hydropathy 496 506 #Plot query hydropathy
my $qPfam = get_pfam_coords_for_quod($q, "red"); 497 507 my $qPfam = get_pfam_coords_for_quod($q, "red");
my $qName = "$plotsDir/${q}_vs_${s}_qaln_qs${qs}_qe${qe}.png"; 498 508 my $qName = "$plotsDir/${q}_vs_${s}_qaln_qs${qs}_qe${qe}.png";
my $cmd2 = qq(quod.py -q -l "$q" -o $qName --width 15 --edgecolor red --xticks 25 -w ${qs}-${qe}:+2.7:+:Alignment --no-tms +0 $qTMS $qPfam -- $seqDir/${q}.faa); 499 509 my $cmd2 = qq(quod.py -q $labelsStr -l "$q" -o $qName --edgecolor red $ylimStr -w ${qs}-${qe}:+2.7:+:Alignment --no-tms +0 $qTMS $qPfam -- $seqDir/${q}.faa);
# print "$cmd2\n\n"; 500 510 # print "$cmd2\n\n";
# exit; 501 511 # exit;
system $cmd2 unless (-f $qName); 502 512 system $cmd2 unless (-f $qName);
return undef unless (-f $qName); 503 513 return undef unless (-f $qName);
504 514
505 515
506 516
#TMS coords for the subject 507 517 #TMS coords for the subject
die "Error: no hmmtop results for: $s" unless (exists $hmmtopHits{$s}); 508 518 die "Error: no hmmtop results for: $s" unless (exists $hmmtopHits{$s});
my $sTMS = ""; 509 519 my $sTMS = "";
if (scalar @{ $hmmtopHits{$s}{coords} } > 0) { 510 520 if (scalar @{ $hmmtopHits{$s}{coords} } > 0) {
$sTMS = "--add-tms " . join(",", @{ $hmmtopHits{$s}{coords} }) . ":cyan"; 511 521 $sTMS = "--add-tms " . join(",", @{ $hmmtopHits{$s}{coords} }) . ":cyan";
} 512 522 }
513 523
#Plot Subject hydropaty 514 524 #Plot Subject hydropaty
my $sPfam = get_pfam_coords_for_quod($s, "blue"); 515 525 my $sPfam = get_pfam_coords_for_quod($s, "blue");
my $sName = "$plotsDir/${q}_vs_${s}_saln_ss${ss}_se${se}.png"; 516 526 my $sName = "$plotsDir/${q}_vs_${s}_saln_ss${ss}_se${se}.png";
my $cmd3 = qq(quod.py -q -l "$s" -o $sName --width 15 --edgecolor blue --xticks 25 -w ${ss}-${se}:+2.7:+:Alignment --no-tms +0 $sTMS $sPfam -- $seqDir/${s}.faa); 517 527 my $cmd3 = qq(quod.py -q $labelsStr -l "$s" -o $sName --edgecolor blue $ylimStr -w ${ss}-${se}:+2.7:+:Alignment --no-tms +0 $sTMS $sPfam -- $seqDir/${s}.faa);
# print "$cmd3\n\n"; 518 528 # print "$cmd3\n\n";
# exit; 519 529 # exit;
system $cmd3 unless (-f $sName); 520 530 system $cmd3 unless (-f $sName);
return undef unless (-f $sName); 521 531 return undef unless (-f $sName);
522 532
523 533
return 1; 524 534 return 1;
} 525 535 }
526 536
#========================================================================== 527 537 #==========================================================================
#Get the string for quod that will plot the PFAM domains 528 538 #Get the string for quod that will plot the PFAM domains
529 539
sub get_pfam_coords_for_quod { 530 540 sub get_pfam_coords_for_quod {
531 541
my ($prot, $color) = @_; 532 542 my ($prot, $color) = @_;
533 543
#Format of PFAM hash: 534 544 #Format of PFAM hash:
# push (@{ $out->{$qacc}->{$pfamID} }, 535 545 # push (@{ $out->{$qacc}->{$pfamID} },
# {pfamid=> $pfamID, dlen=>$pfamLen, dstart=>$dstart, dend=>$dend, 536 546 # {pfamid=> $pfamID, dlen=>$pfamLen, dstart=>$dstart, dend=>$dend,
# def=>$def, qlen=>$qlen, qstart=>$qstart, qend=>$qend }); 537 547 # def=>$def, qlen=>$qlen, qstart=>$qstart, qend=>$qend });
538 548
539 549
540 550
my $str = ""; 541 551 my $str = "";
542 552
if (exists $pfamHits{$prot}) { 543 553 if (exists $pfamHits{$prot}) {
my @Doms = keys %{ $pfamHits{$prot} }; 544 554 my @Doms = keys %{ $pfamHits{$prot} };
my $dcnt = 0; 545 555 my $dcnt = 0;
$str = "--region-font 12 --add-region "; 546 556 $str = "--region-font 12 --add-region ";
foreach my $d (@Doms) { 547 557 foreach my $d (@Doms) {
548 558
my @hits = @{ $pfamHits{$prot}{$d} }; 549 559 my @hits = @{ $pfamHits{$prot}{$d} };
foreach my $hit (@hits) { 550 560 foreach my $hit (@hits) {
my $left = $hit->{qstart}; 551 561 my $left = $hit->{qstart};
my $right = $hit->{qend}; 552 562 my $right = $hit->{qend};
553 563
my $yposl = -2.8 + $dcnt * 0.4; #domain bottom coord 554 564 my $yposl = -2.8 + $dcnt * 0.4; #domain bottom coord
my $yposh = $yposl + 0.15; #domain height coord 555 565 my $yposh = $yposl + 0.15; #domain height coord
556 566
$str .= "${left}-${right}:'${d}':${yposl},${yposh}:$color,black:tc "; 557 567 $str .= "${left}-${right}:'${d}':${yposl},${yposh}:$color,black:tc ";
$dcnt++; 558 568 $dcnt++;
} 559 569 }
} 560 570 }
} 561 571 }
562 572
return $str; 563 573 return $str;
564 574
} 565 575 }
566 576
567 577
#========================================================================== 568 578 #==========================================================================
#Extract the full sequences of the query and subject proteins 569 579 #Extract the full sequences of the query and subject proteins
#Examples: AKM80767.1 570 580 #Examples: AKM80767.1
571 581
572 582
573 583
sub extract_full_sequences { 574 584 sub extract_full_sequences {
575 585
my ($q, $s) = @_; 576 586 my ($q, $s) = @_;
577 587
578 588
my $q_seq = "$seqDir/${q}.faa"; 579 589 my $q_seq = "$seqDir/${q}.faa";
my $s_seq = "$seqDir/${s}.faa"; 580 590 my $s_seq = "$seqDir/${s}.faa";
581 591
#extract the query secuence from tcdb and the subject from the custom blastdb 582 592 #extract the query secuence from tcdb and the subject from the custom blastdb
my $cmd1 = qq(blastdbcmd -db $blastdb -entry $q -target_only -out $q_seq); 583 593 my $cmd1 = qq(blastdbcmd -db $blastdb -entry $q -target_only -out $q_seq);
system "$cmd1" unless (-f $q_seq && !(-z $q_seq)); 584 594 system "$cmd1" unless (-f $q_seq && !(-z $q_seq));
die "Could not extract sequence for $q" unless (-f $q_seq && !(-z $q_seq)); 585 595 die "Could not extract sequence for $q" unless (-f $q_seq && !(-z $q_seq));
586 596
587 597
my $cmd2 = qq(blastdbcmd -db $blastdb -entry $s -target_only -out $s_seq); 588 598 my $cmd2 = qq(blastdbcmd -db $blastdb -entry $s -target_only -out $s_seq);
system "$cmd2" unless (-f $s_seq && !(-z $s_seq)); 589 599 system "$cmd2" unless (-f $s_seq && !(-z $s_seq));
die "Could not extract sequence for $s" unless (-f $s_seq && !(-z $s_seq)); 590 600 die "Could not extract sequence for $s" unless (-f $s_seq && !(-z $s_seq));
} 591 601 }
592 602
593 603
594 604
595 605
596 606
597 607
598 608
#========================================================================== 599 609 #==========================================================================
#Sort alignmnet results by E-value 600 610 #Sort alignmnet results by E-value
601 611
sub by_evalue { 602 612 sub by_evalue {
$a->{evalue} <=> $b->{evalue}; 603 613 $a->{evalue} <=> $b->{evalue};
} 604 614 }
605 615
606 616
607 617
#========================================================================== 608 618 #==========================================================================
#Run PFAM, hmmtop and parse results 609 619 #Run PFAM, hmmtop and parse results
610 620
611 621
sub run_pfam_hmmtop { 612 622 sub run_pfam_hmmtop {
my ($pfamOut, $hmmtopOut, $pfamClans) = @_; 613 623 my ($pfamOut, $hmmtopOut, $pfamClans) = @_;
614 624
615 625
#---------------------------------------------------------------------- 616 626 #----------------------------------------------------------------------
#Generate blast DB for easy sequence retrieval 617 627 #Generate blast DB for easy sequence retrieval
618 628
print "Generate Blast DB with sequences for fast sequence retrieval...\n"; 619 629 print "Generate Blast DB with sequences for fast sequence retrieval...\n";
620 630
#Get the sequences for which hmmscan will run 621 631 #Get the sequences for which hmmscan will run
my $allSeqsFile = "$seqDir/all_seqs.faa"; 622 632 my $allSeqsFile = "$seqDir/all_seqs.faa";
system qq(cat $qfile $sfile > $allSeqsFile) unless (-f $allSeqsFile && !(-z $allSeqsFile)); 623 633 system qq(cat $qfile $sfile > $allSeqsFile) unless (-f $allSeqsFile && !(-z $allSeqsFile));
die "Could not generate file: $allSeqsFile" unless (-f $allSeqsFile && !(-z $allSeqsFile)); 624 634 die "Could not generate file: $allSeqsFile" unless (-f $allSeqsFile && !(-z $allSeqsFile));
625 635
626 636
#Generate blastdb ...assuming there are no duplicate sequences. 627 637 #Generate blastdb ...assuming there are no duplicate sequences.
my $cmd1 = qq(makeblastdb -dbtype prot -in $allSeqsFile -title '$qlabel plus $slabel' -parse_seqids -hash_index -out $blastdb); 628 638 my $cmd1 = qq(makeblastdb -dbtype prot -in $allSeqsFile -title '$qlabel plus $slabel' -parse_seqids -hash_index -out $blastdb);
print "$cmd1\n"; 629 639 print "$cmd1\n";
system $cmd1 unless (-f "${blastdb}.pin"); 630 640 system $cmd1 unless (-f "${blastdb}.pin");
system "rm $allSeqsFile" if (-f $allSeqsFile); 631 641 system "rm $allSeqsFile" if (-f $allSeqsFile);
632 642
633 643
#---------------------------------------------------------------------- 634 644 #----------------------------------------------------------------------
#Get the accessions of the top hits in the alignments 635 645 #Get the accessions of the top hits in the alignments
636 646
#get the accessions with significant hits 637 647 #get the accessions with significant hits
my %accList = (); 638 648 my %accList = ();
foreach my $hit (@alnHits) { 639 649 foreach my $hit (@alnHits) {
$accList{$hit->{qacc}} = 1; 640 650 $accList{$hit->{qacc}} = 1;
$accList{$hit->{sacc}} = 1; 641 651 $accList{$hit->{sacc}} = 1;
} 642 652 }
643 653
644 654
#Save accessions to a file 645 655 #Save accessions to a file
my $idFile = "$seqDir/top_hits_accs.txt"; 646 656 my $idFile = "$seqDir/top_hits_accs.txt";
unless (-f $idFile) { 647 657 unless (-f $idFile) {
open (my $afh, ">", $idFile) || die $!; 648 658 open (my $afh, ">", $idFile) || die $!;
print $afh join("\n", keys %accList), "\n"; 649 659 print $afh join("\n", keys %accList), "\n";
close $afh; 650 660 close $afh;
} 651 661 }
652 662
#---------------------------------------------------------------------- 653 663 #----------------------------------------------------------------------
#Extract full sequences for top hits. 654 664 #Extract full sequences for top hits.
655 665
my $topHitsSeqs = "$seqDir/topHits.faa"; 656 666 my $topHitsSeqs = "$seqDir/topHits.faa";
my $cmdTopHits = qq(blastdbcmd -db $blastdb -entry_batch $idFile -target_only -out $topHitsSeqs); 657 667 my $cmdTopHits = qq(blastdbcmd -db $blastdb -entry_batch $idFile -target_only -out $topHitsSeqs);
system $cmdTopHits unless (-f $topHitsSeqs && !(-z $topHitsSeqs)); 658 668 system $cmdTopHits unless (-f $topHitsSeqs && !(-z $topHitsSeqs));
659 669
660 670
#---------------------------------------------------------------------- 661 671 #----------------------------------------------------------------------
#run hmmscan on all the sequences for both files 662 672 #run hmmscan on all the sequences for both files
663 673
print "\nRunning hmmscan and parsing output....\n"; 664 674 print "\nRunning hmmscan and parsing output....\n";
665 675
my $pfamDB = ($ENV{PFAMDB})? $ENV{PFAMDB} : "$ENV{RESEARCH_DATA}/pfam/pfamdb/Pfam-A.hmm"; 666 676 my $pfamDB = ($ENV{PFAMDB})? $ENV{PFAMDB} : "$ENV{RESEARCH_DATA}/pfam/pfamdb/Pfam-A.hmm";
my $cmd2 = qq(hmmscan --cpu 4 --noali --cut_ga -o /dev/null --domtblout $pfamFile $pfamDB $topHitsSeqs); 667 677 my $cmd2 = qq(hmmscan --cpu 4 --noali --cut_ga -o /dev/null --domtblout $pfamFile $pfamDB $topHitsSeqs);
system $cmd2 unless (-f $pfamFile && !(-z $pfamFile)); 668 678 system $cmd2 unless (-f $pfamFile && !(-z $pfamFile));
669 679
670 680
#parse Pfam output 671 681 #parse Pfam output
TCDB::Assorted::parse_pfam($pfamFile, $pfamOut, $pfamClans); 672 682 TCDB::Assorted::parse_pfam($pfamFile, $pfamOut, $pfamClans);
# print Data::Dumper->Dump([$pfamOut, $pfamClans ], [qw(*pfamOut *pfamClans )]); 673 683 # print Data::Dumper->Dump([$pfamOut, $pfamClans ], [qw(*pfamOut *pfamClans )]);
# exit; 674 684 # exit;
675 685
#---------------------------------------------------------------------- 676 686 #----------------------------------------------------------------------
#Extract clans 677 687 #Extract clans
678 688
TCDB::Assorted::get_clans($pfamClans, $filesDir); 679 689 TCDB::Assorted::get_clans($pfamClans, $filesDir);
# print Data::Dumper->Dump([$pfamClans ], [qw(*clans )]); 680 690 # print Data::Dumper->Dump([$pfamClans ], [qw(*clans )]);
# exit; 681 691 # exit;
682 692
#-------------------------------------------------------------------------- 683 693 #--------------------------------------------------------------------------
#Run hmmtop on top hits for later hydropathy plots. 684 694 #Run hmmtop on top hits for later hydropathy plots.
685 695
print "Runnign HMMTOP and parsing output...\n"; 686 696 print "Runnign HMMTOP and parsing output...\n";
687 697
my $cmd3 = qq(hmmtop -if=$topHitsSeqs -of=$hmmtopFile -sf=FAS -pi=spred -is=pseudo); 688 698 my $cmd3 = qq(hmmtop -if=$topHitsSeqs -of=$hmmtopFile -sf=FAS -pi=spred -is=pseudo);
system $cmd3 unless (-f $hmmtopFile); 689 699 system $cmd3 unless (-f $hmmtopFile);
system "rm $topHitsSeqs" if (-f $topHitsSeqs); 690 700 system "rm $topHitsSeqs" if (-f $topHitsSeqs);
691 701
#Parse hmmtop output 692 702 #Parse hmmtop output
TCDB::Assorted::parse_hmmtop($hmmtopOut, $hmmtopFile); 693 703 TCDB::Assorted::parse_hmmtop($hmmtopOut, $hmmtopFile);
694 704
} 695 705 }
696 706
697 707
#========================================================================== 698 708 #==========================================================================
#Parse ssearch36 output 699 709 #Parse ssearch36 output
700 710
sub parse_ssearch { 701 711 sub parse_ssearch {
702 712
my $out = shift; 703 713 my $out = shift;
704 714
my $parser = new Bio::SearchIO (-format => 'fasta', -file => $alnFile); 705 715 my $parser = new Bio::SearchIO (-format => 'fasta', -file => $alnFile);
706 716
my $formatTmp = $parser->format(); 707 717 my $formatTmp = $parser->format();
# print Data::Dumper->Dump([$formatTmp ], [qw(*fileFormat )]); 708 718 # print Data::Dumper->Dump([$formatTmp ], [qw(*fileFormat )]);
# exit; 709 719 # exit;
710 720
while (my $result = $parser->next_result) { 711 721 while (my $result = $parser->next_result) {
712 722
my $qacc = $result->query_name; 713 723 my $qacc = $result->query_name;
my $qlen = $result->query_length; 714 724 my $qlen = $result->query_length;
715 725
716 726
HIT:while (my $hit = $result->next_hit) { 717 727 HIT:while (my $hit = $result->next_hit) {
HSP:while(my $hsp = $hit->next_hsp) { 718 728 HSP:while(my $hsp = $hit->next_hsp) {
719 729
#Alignment parameters 720 730 #Alignment parameters
my $sacc = $hit->name; 721 731 my $sacc = $hit->name;
my $slen = $hit->length; 722 732 my $slen = $hit->length;
my $eval = $hsp->evalue; 723 733 my $eval = $hsp->evalue;
my $id = $hsp->frac_identical('total') * 100; 724 734 my $id = $hsp->frac_identical('total') * 100;
725 735
#coordinates and sequence 726 736 #coordinates and sequence
my $qstart = $hsp->start('query'); 727 737 my $qstart = $hsp->start('query');
my $qend = $hsp->end('query'); 728 738 my $qend = $hsp->end('query');
my $sstart = $hsp->start('subject'); 729 739 my $sstart = $hsp->start('subject');
my $send = $hsp->end('subject'); 730 740 my $send = $hsp->end('subject');
my $qseq = $hsp->query_string; 731 741 my $qseq = $hsp->query_string;
my $sseq = $hsp->hit_string; 732 742 my $sseq = $hsp->hit_string;
my $hstr = $hsp->homology_string; 733 743 my $hstr = $hsp->homology_string;
734 744
735 745
#Check first that both proteins have the right length 736 746 #Check first that both proteins have the right length
next HSP if (max_length_violation($qlen, $slen, $maxProtLength, $LengthControl)); 737 747 next HSP if (max_length_violation($qlen, $slen, $maxProtLength, $LengthControl));
738 748
#If the alignment has less than $minLength aas, ignore it 739 749 #If the alignment has less than $minLength aas, ignore it
my $qtmp = $qseq; $qtmp =~ s/-//g; 740 750 my $qtmp = $qseq; $qtmp =~ s/-//g;
my $stmp = $sseq; $stmp =~ s/-//g; 741 751 my $stmp = $sseq; $stmp =~ s/-//g;
next HSP if (length($qtmp) < $minLength || length($stmp) < $minLength); 742 752 next HSP if (length($qtmp) < $minLength || length($stmp) < $minLength);
743 753
#Calculate coverages properly (do not use alignment length as it includes gaps 744 754 #Calculate coverages properly (do not use alignment length as it includes gaps
my $qCov_tmp = ($qend - $qstart + 1) / $qlen * 100; 745 755 my $qCov_tmp = ($qend - $qstart + 1) / $qlen * 100;
my $qcov = ($qCov_tmp > 100.0)? 100 : $qCov_tmp; 746 756 my $qcov = ($qCov_tmp > 100.0)? 100 : $qCov_tmp;
747 757
my $sCov_tmp = ($send - $sstart + 1) / $slen * 100; 748 758 my $sCov_tmp = ($send - $sstart + 1) / $slen * 100;
my $scov = ($sCov_tmp > 100.0)? 100 : $sCov_tmp; 749 759 my $scov = ($sCov_tmp > 100.0)? 100 : $sCov_tmp;
750 760
751 761
if ($eval <= $evalue && TCDB::Assorted::coverage_ok($qcov, $scov, $coverage, $covControl)) { 752 762 if ($eval <= $evalue && TCDB::Assorted::coverage_ok($qcov, $scov, $coverage, $covControl)) {
753 763
push(@{ $out }, {qacc=>$qacc, sacc=>$sacc, qlen=>$qlen, slen=>$slen, qcov=>$qcov, 754 764 push(@{ $out }, {qacc=>$qacc, sacc=>$sacc, qlen=>$qlen, slen=>$slen, qcov=>$qcov,
scov=>$scov, evalue=>$eval, id=>$id, qstart=>$qstart, qend=>$qend, 755 765 scov=>$scov, evalue=>$eval, id=>$id, qstart=>$qstart, qend=>$qend,
sstart=>$sstart, send=>$send, qseq=>$qseq, sseq=>$sseq, hstr=>$hstr}); 756 766 sstart=>$sstart, send=>$send, qseq=>$qseq, sseq=>$sseq, hstr=>$hstr});
} 757 767 }
} # hsp 758 768 } # hsp
} # hit 759 769 } # hit
} # result 760 770 } # result
} 761 771 }
762 772
763 773
764 774
#========================================================================== 765 775 #==========================================================================
#Test whether the lengths of two proteins are withing a predefined 766 776 #Test whether the lengths of two proteins are withing a predefined
#legnth specified by the user. This are the options for control: 767 777 #legnth specified by the user. This are the options for control:
# X: Either protein is larger than the cutoff 768 778 # X: Either protein is larger than the cutoff
# B: Both proteins are larger than the cutoff 769 779 # B: Both proteins are larger than the cutoff
# Q: Only the query protein is larger than the cutoff 770 780 # Q: Only the query protein is larger than the cutoff
# S: Only the subject protein is larger than the cutoff 771 781 # S: Only the subject protein is larger than the cutoff
# N: No control. Any length is ok. 772 782 # N: No control. Any length is ok.
773 783
sub max_length_violation { 774 784 sub max_length_violation {
775 785
my ($qlen, $slen, $maxLen, $control) = @_; 776 786 my ($qlen, $slen, $maxLen, $control) = @_;
777 787
if ($control eq "X") { 778 788 if ($control eq "X") {
(($qlen >= $maxLen) || ($slen >= $maxLen))? return 1 : return 0; 779 789 (($qlen >= $maxLen) || ($slen >= $maxLen))? return 1 : return 0;
} 780 790 }
781 791
if ($control eq "B") { 782 792 if ($control eq "B") {
(($qlen >= $maxLen) && ($slen >= $maxLen))? return 1 : return 0; 783 793 (($qlen >= $maxLen) && ($slen >= $maxLen))? return 1 : return 0;
} 784 794 }
785 795
if ($control eq "Q") { 786 796 if ($control eq "Q") {
($qlen >= $maxLen)? return 1 : return 0; 787 797 ($qlen >= $maxLen)? return 1 : return 0;
} 788 798 }
789 799
if ($control eq "S") { 790 800 if ($control eq "S") {
($slen >= $maxLen)? return 1 : return 0; 791 801 ($slen >= $maxLen)? return 1 : return 0;
} 792 802 }
793 803
if ($control eq "N") { 794 804 if ($control eq "N") {
return 0; 795 805 return 0;
} 796 806 }
797 807
die "Unknown control mode: $control"; 798 808 die "Unknown control mode: $control";
799 809
} 800 810 }
801 811
802 812
803 813
804 814
#========================================================================== 805 815 #==========================================================================
#Parse blast output 806 816 #Parse blast output
807 817
808 818
sub parse_blast { 809 819 sub parse_blast {
my $out = shift; 810 820 my $out = shift;
811 821
open (my $fh, "<", $alnFile) || die $!; 812 822 open (my $fh, "<", $alnFile) || die $!;
LINE:while (<$fh>) { 813 823 LINE:while (<$fh>) {
chomp; 814 824 chomp;
next unless ($_); 815 825 next unless ($_);
next if (/^#/); 816 826 next if (/^#/);
817 827
#Blast columns: qacc sacc qlen slen evalue pident qstart qend sstart send qseq sseq 818 828 #Blast columns: qacc sacc qlen slen evalue pident qstart qend sstart send qseq sseq
my ($qacc, $sacc, $qlen, $slen, $eval, $id, $qstart, $qend, $sstart, $send, $qseq, $sseq) = split (/\t/, $_); 819 829 my ($qacc, $sacc, $qlen, $slen, $eval, $id, $qstart, $qend, $sstart, $send, $qseq, $sseq) = split (/\t/, $_);
820 830
821 831
if ($eval <= $evalue) { 822 832 if ($eval <= $evalue) {
823 833
my $qcov = ($qend - $qstart + 1) / $qlen * 100; 824 834 my $qcov = ($qend - $qstart + 1) / $qlen * 100;
my $scov = ($send - $sstart + 1) / $slen * 100; 825 835 my $scov = ($send - $sstart + 1) / $slen * 100;
826 836
if (TCDB::Assorted::coverage_ok($qcov, $scov, $coverage, $covControl)) { 827 837 if (TCDB::Assorted::coverage_ok($qcov, $scov, $coverage, $covControl)) {
828 838
push(@{ $out }, {qacc=>$qacc, sacc=>$sacc, qlen=>$qlen, slen=>$slen, qcov=>$qcov, 829 839 push(@{ $out }, {qacc=>$qacc, sacc=>$sacc, qlen=>$qlen, slen=>$slen, qcov=>$qcov,
scov=>$scov, evalue=>$eval, id=>$id, qstart=>$qstart, qend=>$qend, 830 840 scov=>$scov, evalue=>$eval, id=>$id, qstart=>$qstart, qend=>$qend,
sstart=>$sstart, send=>$send, qseq=>$qseq, sseq=>$sseq}); 831 841 sstart=>$sstart, send=>$send, qseq=>$qseq, sseq=>$sseq});
} 832 842 }
} 833 843 }
} 834 844 }
close $fh; 835 845 close $fh;
} 836 846 }
837 847
838 848
839 849
840 850
841 851
#========================================================================== 842 852 #==========================================================================
#Run the alignemnt between the two files depending on the program 843 853 #Run the alignemnt between the two files depending on the program
#Selected by the user. 844 854 #Selected by the user.
845 855
sub run_alignment { 846 856 sub run_alignment {
847 857
my $cmd = ""; 848 858 my $cmd = "";