Commit 68b41f16a35757f498478ba2822e6978013608f2

Authored by Luis Arturo Medrano-Soto
1 parent 6342e60576
Exists in master

update tmsRepeat.pl, alignSeqsFiles.pl and locateFragment.pl to work with python3 scripts

Showing 3 changed files with 128 additions and 1080 deletions Inline Diff

alignSeqsFiles.pl View file @ 68b41f1
#!/usr/bin/env perl -w 1 1 #!/usr/bin/env perl -w
2 2
use strict; 3 3 use strict;
use warnings; 4 4 use warnings;
use Data::Dumper; 5 5 use Data::Dumper;
6 6
$Data::Dumper::Deepcopy = 1; 7 7 $Data::Dumper::Deepcopy = 1;
8 8
use Getopt::Long; 9 9 use Getopt::Long;
use LWP; 10 10 use LWP;
use Bio::SeqIO; 11 11 use Bio::SeqIO;
use Bio::SearchIO; 12 12 use Bio::SearchIO;
13 13
#Local libraries 14 14 #Local libraries
use TCDB::CheckDependencies; 15 15 use TCDB::CheckDependencies;
use TCDB::Domain::PfamParser; 16 16 use TCDB::Domain::PfamParser;
use TCDB::Domain::Characterize; 17 17 use TCDB::Domain::Characterize;
use TCDB::Assorted; 18 18 use TCDB::Assorted;
19 19
20 20
########################################################################### 21 21 ###########################################################################
# 22 22 #
# Comapre two files with fasta sequences and report the alignment parameters 23 23 # Comapre two files with fasta sequences and report the alignment parameters
# Along with hydropathy plots and PFAM domains. 24 24 # Along with hydropathy plots and PFAM domains.
# 25 25 #
########################################################################### 26 26 ###########################################################################
27 27
#========================================================================== 28 28 #==========================================================================
#Check dependencies 29 29 #Check dependencies
30 30
my @dependencies = ("zgrep", "blastp", "ssearch36", "hmmtop", "blastdbcmd", 31 31 my @dependencies = ("zgrep", "blastp", "ssearch36", "hmmtop", "blastdbcmd",
"hmmscan"); 32 32 "hmmscan");
my $CheckDep_obj = new TCDB::CheckDependencies(); 33 33 my $CheckDep_obj = new TCDB::CheckDependencies();
$CheckDep_obj -> dependencies_list(\@dependencies); 34 34 $CheckDep_obj -> dependencies_list(\@dependencies);
$CheckDep_obj -> checkDependencies; 35 35 $CheckDep_obj -> checkDependencies;
36 36
37 37
#This will prevent quod and alnquod from going into interactive mode 38 38 #This will prevent quod and alnquod from going into interactive mode
$ENV{"MPLBACKEND"} = "agg"; 39 39 $ENV{"MPLBACKEND"} = "agg";
40 40
41 41
#========================================================================== 42 42 #==========================================================================
#Read command line arguments 43 43 #Read command line arguments
44 44
my $qfile = ""; 45 45 my $qfile = "";
my $qProt = ""; 46 46 my $qProt = "";
my $sfile = ""; 47 47 my $sfile = "";
my $sProt = ""; 48 48 my $sProt = "";
my $qlabel = "Query"; 49 49 my $qlabel = "Query";
my $slabel = "Subject"; 50 50 my $slabel = "Subject";
my $outdir = ""; 51 51 my $outdir = "";
my $prog = 'ssearch36'; #'blastp'; 52 52 my $prog = 'ssearch36'; #'blastp';
my $evalue = 1e-4; 53 53 my $evalue = 1e-4;
my $identity = 20.0; 54 54 my $identity = 20.0;
my $coverage = 40.0; 55 55 my $coverage = 40.0;
my $covControl = "X"; 56 56 my $covControl = "X";
my $blastComp = "F"; #2; 57 57 my $blastComp = "F"; #2;
my $segFilter = 'no'; 58 58 my $segFilter = 'no';
my $minLength = 30; #Min legnth of proteins to analyze (without gaps) 59 59 my $minLength = 30; #Min legnth of proteins to analyze (without gaps)
my $subMatrix = 'BL50'; 60 60 my $subMatrix = 'BL50';
61 my $hyd_qylim = undef; #Y-axis limits for query hydropathy plot [low, high]
62 my $hyd_sylim = undef; #Y-axis limits for subject hydropathy plot [low, high]
61 63
#this can be used to remove long sequences from results 62 64 #this can be used to remove long sequences from results
my $maxProtLength = 100000; #default threshold to allow any length 63 65 my $maxProtLength = 100000; #default threshold to allow any length
my $LengthControl = "N"; #same meaning as $covControl 64 66 my $LengthControl = "N"; #same meaning as $covControl
65 67
#internal directories 66 68 #internal directories
my $filesDir = ""; 67 69 my $filesDir = "";
my $plotsDir = ""; 68 70 my $plotsDir = "";
my $seqDir = ""; 69 71 my $seqDir = "";
my $blastDir = ""; 70 72 my $blastDir = "";
71 73
72 74
read_command_line(); 73 75 read_command_line();
74 76
#print Data::Dumper->Dump([$qfile, $qProt, $sfile, $sProt, $qlabel, $slabel, $outdir, $prog, 75 77 #print Data::Dumper->Dump([$qfile, $qProt, $sfile, $sProt, $qlabel, $slabel, $outdir, $prog,
# $evalue, $coverage, $covControl, $blastComp, $segFilter, $maxProtLength, 76 78 # $evalue, $coverage, $covControl, $blastComp, $segFilter, $maxProtLength,
# $LengthControl], 77 79 # $LengthControl],
# [qw(*qfile *qProt *sfile *sProt *qlabel *slabel *outdir *prog 78 80 # [qw(*qfile *qProt *sfile *sProt *qlabel *slabel *outdir *prog
# *evalue *coverage *covControl *blastComp *segFilter *maxProtLength 79 81 # *evalue *coverage *covControl *blastComp *segFilter *maxProtLength
# *LengthControl)]); 80 82 # *LengthControl)]);
#exit; 81 83 #exit;
82 84
83 85
84 86
#========================================================================== 85 87 #==========================================================================
#Output files 86 88 #Output files
87 89
#The alignment file by blastp or ssearch36 88 90 #The alignment file by blastp or ssearch36
my $alnFile = "$filesDir/${prog}.out"; 89 91 my $alnFile = "$filesDir/${prog}.out";
90 92
#The results of running hmmscan 91 93 #The results of running hmmscan
my $pfamFile = "$filesDir/hmmscan.out"; 92 94 my $pfamFile = "$filesDir/hmmscan.out";
93 95
#The results from running hmmtop 94 96 #The results from running hmmtop
my $hmmtopFile = "$filesDir/hmmtop.out"; 95 97 my $hmmtopFile = "$filesDir/hmmtop.out";
96 98
#The blast database to retrieve sequences for ploting 97 99 #The blast database to retrieve sequences for ploting
my $blastdb = "$blastDir/sequences"; 98 100 my $blastdb = "$blastDir/sequences";
99 101
100 102
101 103
#========================================================================== 102 104 #==========================================================================
#Run the alignment first 103 105 #Run the alignment first
104 106
print "Running $prog and parsing output....\n"; 105 107 print "Running $prog and parsing output....\n";
run_alignment(); 106 108 run_alignment();
107 109
108 110
my @alnHits = (); 109 111 my @alnHits = ();
if ($prog eq 'blastp') { parse_blast(\@alnHits); } 110 112 if ($prog eq 'blastp') { parse_blast(\@alnHits); }
elsif ($prog eq 'ssearch36') { parse_ssearch(\@alnHits)} 111 113 elsif ($prog eq 'ssearch36') { parse_ssearch(\@alnHits)}
112 114
#print Data::Dumper->Dump([\@alnHits ], [qw($alnHits )]); 113 115 #print Data::Dumper->Dump([\@alnHits ], [qw($alnHits )]);
#exit; 114 116 #exit;
115 117
116 118
die "No significant blastHits found!\n" unless (@alnHits); 117 119 die "No significant blastHits found!\n" unless (@alnHits);
118 120
119 121
120 122
#========================================================================== 121 123 #==========================================================================
#Run pfam (get clans, hmmtop, and parse results 122 124 #Run pfam (get clans, hmmtop, and parse results
123 125
my %pfamHits = (); 124 126 my %pfamHits = ();
my %clans = (); 125 127 my %clans = ();
my %hmmtopHits = (); 126 128 my %hmmtopHits = ();
run_pfam_hmmtop(\%pfamHits, \%hmmtopHits,\%clans ); 127 129 run_pfam_hmmtop(\%pfamHits, \%hmmtopHits,\%clans );
128 130
#print Data::Dumper->Dump([\%clans], [qw(*clans)]); 129 131 #print Data::Dumper->Dump([\%clans], [qw(*clans)]);
#exit; 130 132 #exit;
131 133
132 134
#========================================================================== 133 135 #==========================================================================
#Parse the alignment results to make sure there are signficant results, 134 136 #Parse the alignment results to make sure there are signficant results,
#get domains for significant hits and plot the corresponding hydropathies. 135 137 #get domains for significant hits and plot the corresponding hydropathies.
136 138
137 139
138 140
139 141
print "Geneating report...\n"; 140 142 print "Geneating report...\n";
generate_report(); 141 143 generate_report();
142 144
143 145
144 146
#========================================================================== 145 147 #==========================================================================
################ Subroutines definition beyond ths point ############## 146 148 ################ Subroutines definition beyond ths point ##############
#========================================================================== 147 149 #==========================================================================
148 150
149 151
#========================================================================== 150 152 #==========================================================================
#Generate output for significant hits 151 153 #Generate output for significant hits
152 154
sub generate_report { 153 155 sub generate_report {
154 156
155 157
#Prepare output files 156 158 #Prepare output files
my $htmlFile = "$outdir/report.html"; 157 159 my $htmlFile = "$outdir/report.html";
my $plotsFile = "$outdir/plots.html"; 158 160 my $plotsFile = "$outdir/plots.html";
159 161
160 162
my $htmlHeader = <<HEADER; 161 163 my $htmlHeader = <<HEADER;
<!DOCTYPE html> 162 164 <!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml"> 163 165 <html xmlns="http://www.w3.org/1999/xhtml">
<head> 164 166 <head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> 165 167 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
166 168
<style type="text/css"> 167 169 <style type="text/css">
168 170
.label { 169 171 .label {
text-align: right; 170 172 text-align: right;
width: 50px; 171 173 width: 50px;
} 172 174 }
173 175
.data { 174 176 .data {
text-align: left; 175 177 text-align: left;
padding-left: 8px; 176 178 padding-left: 8px;
width: 100px; 177 179 width: 100px;
} 178 180 }
179 181
.uline { 180 182 .uline {
text-decoration: underline; 181 183 text-decoration: underline;
} 182 184 }
183 185
.pfam { 184 186 .pfam {
text-align: center; 185 187 text-align: center;
vertical-align: middle; 186 188 vertical-align: middle;
} 187 189 }
188 190
.seq { 189 191 .seq {
border: 2px solid black; 190 192 border: 2px solid black;
height: 70px; 191 193 height: 70px;
width: 100%; 192 194 width: 100%;
overflow-x: auto; 193 195 overflow-x: auto;
overflow-y: hidden; 194 196 overflow-y: hidden;
margin: 1em 0; 195 197 margin: 1em 0;
background: gray; 196 198 background: gray;
color: white; 197 199 color: white;
} 198 200 }
199 201
200 202
.dom { 201 203 .dom {
border: 2px solid black; 202 204 border: 2px solid black;
height: 100px; 203 205 height: 100px;
width: 100%; 204 206 width: 100%;
overflow-x: auto; 205 207 overflow-x: auto;
overflow-y: auto; 206 208 overflow-y: auto;
margin: 1em 0; 207 209 margin: 1em 0;
background: gray; 208 210 background: gray;
color: white; 209 211 color: white;
} 210 212 }
211 213
img { 212 214 img {
display: block; 213 215 display: block;
margin-left: auto; 214 216 margin-left: auto;
margin-right: auto; 215 217 margin-right: auto;
height: 250px; 216 218 height: 250px;
width: auto; 217 219 width: auto;
max-width: 1500px; 218 220 max-width: 1500px;
max-height: 300px; 219 221 max-height: 300px;
} 220 222 }
221 223
</style> 222 224 </style>
<title>$qlabel vs $slabel</title> 223 225 <title>$qlabel vs $slabel</title>
</head> 224 226 </head>
<br /> 225 227 <br />
<h1 style='text-align:center'>$qlabel vs $slabel</h1> 226 228 <h1 style='text-align:center'>$qlabel vs $slabel</h1>
<body> 227 229 <body>
228 230
HEADER 229 231 HEADER
230 232
231 233
open (my $outh, ">", $htmlFile) || die $!; 232 234 open (my $outh, ">", $htmlFile) || die $!;
print $outh $htmlHeader; 233 235 print $outh $htmlHeader;
234 236
235 237
foreach my $hit (sort by_evalue @alnHits) { 236 238 foreach my $hit (sort by_evalue @alnHits) {
237 239
238 240
my $qacc = $hit->{qacc}; 239 241 my $qacc = $hit->{qacc};
my $qlen = $hit->{qlen}; 240 242 my $qlen = $hit->{qlen};
my $qseq = $hit->{qseq}; 241 243 my $qseq = $hit->{qseq};
my $qcov = sprintf("%.1f", $hit->{qcov}); 242 244 my $qcov = sprintf("%.1f", $hit->{qcov});
my $qstart = $hit->{qstart}; 243 245 my $qstart = $hit->{qstart};
my $qend = $hit->{qend}; 244 246 my $qend = $hit->{qend};
245 247
my $sacc = $hit->{sacc}; 246 248 my $sacc = $hit->{sacc};
my $slen = $hit->{slen}; 247 249 my $slen = $hit->{slen};
my $sseq = $hit->{sseq}; 248 250 my $sseq = $hit->{sseq};
my $scov = sprintf("%.1f", $hit->{scov}); 249 251 my $scov = sprintf("%.1f", $hit->{scov});
my $sstart = $hit->{sstart}; 250 252 my $sstart = $hit->{sstart};
my $send = $hit->{send}; 251 253 my $send = $hit->{send};
252 254
my $eval = sprintf ("%.1e", $hit->{evalue}); 253 255 my $eval = sprintf ("%.1e", $hit->{evalue});
my $id = sprintf ("%.1f", $hit->{id}); 254 256 my $id = sprintf ("%.1f", $hit->{id});
my $hstr = $hit->{hstr}; 255 257 my $hstr = $hit->{hstr};
256 258
my $alnHit = <<HIT; 257 259 my $alnHit = <<HIT;
258 260
<br /><hr style=\"border-style:solid; border-width:5px; color:black;\"/> 259 261 <br /><hr style=\"border-style:solid; border-width:5px; color:black;\"/>
260 262
<p><b>$qacc ($qlen) vs $sacc ($slen)</b></p> 261 263 <p><b>$qacc ($qlen) vs $sacc ($slen)</b></p>
262 264
<table width="600px" border="0" cellspacing="0" cellpadding="2"> 263 265 <table width="600px" border="0" cellspacing="0" cellpadding="2">
<tr> 264 266 <tr>
<td class='label'><b>E-value:</b></td> 265 267 <td class='label'><b>E-value:</b></td>
<td class='data'>$eval</td> 266 268 <td class='data'>$eval</td>
<td class='label'><b>Identity:</b></td> 267 269 <td class='label'><b>Identity:</b></td>
<td class='data'>${id}%</td> 268 270 <td class='data'>${id}%</td>
<td class='label'><b>Q_coverage:</b></td> 269 271 <td class='label'><b>Q_coverage:</b></td>
<td class='data'>${qcov}%</td> 270 272 <td class='data'>${qcov}%</td>
<td class='label'><b>S_coverage:</b></td> 271 273 <td class='label'><b>S_coverage:</b></td>
<td class='data'>${scov}%</td> 272 274 <td class='data'>${scov}%</td>
</tr> 273 275 </tr>
<tr> 274 276 <tr>
<td class='label'><b>Q_aln:</b></td> 275 277 <td class='label'><b>Q_aln:</b></td>
<td class='data'>${qstart}-$qend</td> 276 278 <td class='data'>${qstart}-$qend</td>
<td class='label'><b>S_aln:</b></td> 277 279 <td class='label'><b>S_aln:</b></td>
<td class='data'>${sstart}-$send</td> 278 280 <td class='data'>${sstart}-$send</td>
<td class='label'></td> 279 281 <td class='label'></td>
<td class='data'></td> 280 282 <td class='data'></td>
<td class='label'></td> 281 283 <td class='label'></td>
<td class='data'></td> 282 284 <td class='data'></td>
</tr> 283 285 </tr>
</table> 284 286 </table>
<br /> 285 287 <br />
286 288
<p><b>Alignment:</b></p> 287 289 <p><b>Alignment:</b></p>
<div class='seq'> 288 290 <div class='seq'>
<pre> 289 291 <pre>
$qseq 290 292 $qseq
$hstr 291 293 $hstr
$sseq 292 294 $sseq
</pre> 293 295 </pre>
</div> 294 296 </div>
295 297
HIT 296 298 HIT
297 299
print $outh $alnHit; 298 300 print $outh $alnHit;
299 301
300 302
#Generate the hydropathy plots 301 303 #Generate the hydropathy plots
my $good = run_quod($qacc, $sacc, $qstart, $qend, $sstart, $send, $qseq, $sseq); 302 304 my $good = run_quod($qacc, $sacc, $qstart, $qend, $sstart, $send, $qseq, $sseq);
die "Could not generate plots for hit: $qacc vs $sacc" unless ($good); 303 305 die "Could not generate plots for hit: $qacc vs $sacc" unless ($good);
304 306
my $domData = generate_domain_data($qacc, $sacc); 305 307 my $domData = generate_domain_data($qacc, $sacc);
my $domHTML = ""; 306 308 my $domHTML = "";
if ($domData) { 307 309 if ($domData) {
$domHTML =<<DATA; 308 310 $domHTML =<<DATA;
<br /> 309 311 <br />
<hr /> 310 312 <hr />
<p><b>Pfam info:</b></p> 311 313 <p><b>Pfam info:</b></p>
<div class='dom'> 312 314 <div class='dom'>
313 315
$domData 314 316 $domData
315 317
</div> 316 318 </div>
DATA 317 319 DATA
} 318 320 }
319 321
my $plot_aln = "plots/${qacc}_vs_${sacc}_qs${qstart}_qe${qend}_ss${sstart}_se${send}.png"; 320 322 my $plot_aln = "plots/${qacc}_vs_${sacc}_qs${qstart}_qe${qend}_ss${sstart}_se${send}.png";
my $qplot = "plots/${qacc}_vs_${sacc}_qaln_qs${qstart}_qe${qend}.png"; 321 323 my $qplot = "plots/${qacc}_vs_${sacc}_qaln_qs${qstart}_qe${qend}.png";
my $splot = "plots/${qacc}_vs_${sacc}_saln_ss${sstart}_se${send}.png"; 322 324 my $splot = "plots/${qacc}_vs_${sacc}_saln_ss${sstart}_se${send}.png";
323 325
#now include the plots 324 326 #now include the plots
my $prtPlots =<<PLOTS; 325 327 my $prtPlots =<<PLOTS;
326 328
<br /> 327 329 <br />
<table style="width:100%"> 328 330 <table style="width:100%">
<tr> 329 331 <tr>
<td><a href="$qplot" target="_blank"><img src="$qplot" alt="$qacc"></a></td> 330 332 <td><a href="$qplot" target="_blank"><img src="$qplot" alt="$qacc"></a></td>
<td><a href="$splot" target="_blank"><img src="$splot" alt="$sacc"></a></td> 331 333 <td><a href="$splot" target="_blank"><img src="$splot" alt="$sacc"></a></td>
</tr> 332 334 </tr>
<tr> 333 335 <tr>
<td colspan="2" style="text-align: center;"> 334 336 <td colspan="2" style="text-align: center;">
<a href="$plot_aln" target="_blank"><img src="$plot_aln" alt="$qacc vs $sacc alignment"></a> 335 337 <a href="$plot_aln" target="_blank"><img src="$plot_aln" alt="$qacc vs $sacc alignment"></a>
</td> 336 338 </td>
</tr> 337 339 </tr>
</table> 338 340 </table>
339 341
$domHTML 340 342 $domHTML
341 343
PLOTS 342 344 PLOTS
343 345
print $outh $prtPlots; 344 346 print $outh $prtPlots;
} 345 347 }
346 348
#Close HTML report 347 349 #Close HTML report
my $closeRep = <<CLOSE; 348 350 my $closeRep = <<CLOSE;
</body> 349 351 </body>
</html> 350 352 </html>
CLOSE 351 353 CLOSE
352 354
print $outh $closeRep; 353 355 print $outh $closeRep;
354 356
close $outh; 355 357 close $outh;
} 356 358 }
357 359
358 360
#========================================================================== 359 361 #==========================================================================
#Generate domain data for the html report 360 362 #Generate domain data for the html report
361 363
362 364
sub generate_domain_data { 363 365 sub generate_domain_data {
364 366
my ($q, $s) = @_; 365 367 my ($q, $s) = @_;
366 368
#Format of PFAM hash: 367 369 #Format of PFAM hash:
# push (@{ $out->{$qacc}->{$pfamID} }, 368 370 # push (@{ $out->{$qacc}->{$pfamID} },
# {pfamid=> $pfamID, dlen=>$pfamLen, dstart=>$dstart, dend=>$dend, evalue=>$eval, 369 371 # {pfamid=> $pfamID, dlen=>$pfamLen, dstart=>$dstart, dend=>$dend, evalue=>$eval,
# dname=>$pfamName, def=>$def, qlen=>$qlen, qstart=>$qstart, qend=>$qend }); 370 372 # dname=>$pfamName, def=>$def, qlen=>$qlen, qstart=>$qstart, qend=>$qend });
371 373
372 374
373 375
374 376
my @cols = qw(Query Domain Clan Dom_length E-value Dom_start Dom_end Q_Start Q_end Dom_Name Dom_Info); 375 377 my @cols = qw(Query Domain Clan Dom_length E-value Dom_start Dom_end Q_Start Q_end Dom_Name Dom_Info);
my $colStr = " <th>" . join ("</th>\n <th>", @cols) . 376 378 my $colStr = " <th>" . join ("</th>\n <th>", @cols) .
"</th>\n"; 377 379 "</th>\n";
378 380
my $header =<<HEADER; 379 381 my $header =<<HEADER;
<table border='1', style='width:100%'> 380 382 <table border='1', style='width:100%'>
<tr> 381 383 <tr>
$colStr 382 384 $colStr
</tr> 383 385 </tr>
HEADER 384 386 HEADER
385 387
386 388
my $res = ""; 387 389 my $res = "";
foreach my $prot ($q, $s) { 388 390 foreach my $prot ($q, $s) {
389 391
if (exists $pfamHits{$prot}) { 390 392 if (exists $pfamHits{$prot}) {
my @Doms = keys %{ $pfamHits{$prot} }; 391 393 my @Doms = keys %{ $pfamHits{$prot} };
392 394
foreach my $d (@Doms) { 393 395 foreach my $d (@Doms) {
394 396
my $clan = ($clans{$d})? $clans{$d} : "N/A"; 395 397 my $clan = ($clans{$d})? $clans{$d} : "N/A";
396 398
my @hits = @{ $pfamHits{$prot}{$d} }; 397 399 my @hits = @{ $pfamHits{$prot}{$d} };
foreach my $hit (@hits) { 398 400 foreach my $hit (@hits) {
my $dlen = $hit->{dlen}; 399 401 my $dlen = $hit->{dlen};
my $eval = $hit->{evalue}; 400 402 my $eval = $hit->{evalue};
my $qstart = $hit->{qstart}; 401 403 my $qstart = $hit->{qstart};
my $qend = $hit->{qend}; 402 404 my $qend = $hit->{qend};
my $dstart = $hit->{dstart}; 403 405 my $dstart = $hit->{dstart};
my $dend = $hit->{dend}; 404 406 my $dend = $hit->{dend};
my $name = $hit->{dname}; 405 407 my $name = $hit->{dname};
my $def = $hit->{def}; 406 408 my $def = $hit->{def};
407 409
$res .=<<ROW; 408 410 $res .=<<ROW;
<tr> 409 411 <tr>
<td class="pfam">$prot</td> 410 412 <td class="pfam">$prot</td>
<td class="pfam">$d</td> 411 413 <td class="pfam">$d</td>
<td class="pfam">$clan</td> 412 414 <td class="pfam">$clan</td>
<td class="pfam">$dlen</td> 413 415 <td class="pfam">$dlen</td>
<td class="pfam">$eval</td> 414 416 <td class="pfam">$eval</td>
<td class="pfam">$dstart</td> 415 417 <td class="pfam">$dstart</td>
<td class="pfam">$dend</td> 416 418 <td class="pfam">$dend</td>
<td class="pfam">$qstart</td> 417 419 <td class="pfam">$qstart</td>
<td class="pfam">$qend</td> 418 420 <td class="pfam">$qend</td>
<td class="pfam">$name</td> 419 421 <td class="pfam">$name</td>
<td class="pfam">$def</td> 420 422 <td class="pfam">$def</td>
</tr> 421 423 </tr>
ROW 422 424 ROW
} 423 425 }
} 424 426 }
} 425 427 }
} 426 428 }
427 429
#Return final result 428 430 #Return final result
if ($res) { 429 431 if ($res) {
$header .= $res; 430 432 $header .= $res;
$header .= " </table>\n"; 431 433 $header .= " </table>\n";
return $header; 432 434 return $header;
} 433 435 }
else { 434 436 else {
return $res; 435 437 return $res;
} 436 438 }
} 437 439 }
438 440
439 441
440 442
441 443
#========================================================================== 442 444 #==========================================================================
#Run quod on the query, subject and the alignment. 443 445 #Run quod on the query, subject and the alignment.
444 446
445 447
448 #quod.py -q -l "HEB99829" -o plot.png --width 15 --edgecolor red --xticks 25 --no-tms +0 --add-tms 9-32 43-67 98-121 132-151 164-181 192-215 224-241:orange -w 17-245:+2.7:+:Alignment --region-font 12 --add-region 20-245:'PF07556':-2.8,-2.6:red,black:tc --mark +0:K,R,H:black --xlim 0 400 -- HEB99829.faa
449
450
sub run_quod { 446 451 sub run_quod {
447 452
my ($q, $s, $qs, $qe, $ss, $se, $qseq, $sseq) = @_; 448 453 my ($q, $s, $qs, $qe, $ss, $se, $qseq, $sseq) = @_;
449 454
450 455
#extract sequences for query and subject 451 456 #extract sequences for query and subject
extract_full_sequences($q,$s); 452 457 extract_full_sequences($q,$s);
453 458
454 459
#----------------------------------------------------------------- 455 460 #-----------------------------------------------------------------
#Run quod for the alignment 456 461 #Run quod for the alignment
457 462
#First save aligned segments to files 458 463 #First save aligned segments to files
my $qalnFile ="$seqDir/${q}_aln.faa"; 459 464 my $qalnFile ="$seqDir/${q}_aln.faa";
open(my $qfh, '>', $qalnFile) || die $!; 460 465 open(my $qfh, '>', $qalnFile) || die $!;
print $qfh ">$q alignment\n$qseq\n"; 461 466 print $qfh ">$q alignment\n$qseq\n";
close $qfh; 462 467 close $qfh;
463 468
my $salnFile ="$seqDir/${s}_aln.faa"; 464 469 my $salnFile ="$seqDir/${s}_aln.faa";
open(my $sfh, '>', $salnFile) || die $!; 465 470 open(my $sfh, '>', $salnFile) || die $!;
print $sfh ">$s alignment\n$sseq\n"; 466 471 print $sfh ">$s alignment\n$sseq\n";
close $sfh; 467 472 close $sfh;
468 473
469 474
#Note alnquod requires to add the extension to the image name 470 475 #Note alnquod requires to add the extension to the image name
my $alnFig = "$plotsDir/${q}_vs_${s}_qs${qs}_qe${qe}_ss${ss}_se${se}.png"; 471 476 my $alnFig = "$plotsDir/${q}_vs_${s}_qs${qs}_qe${qe}_ss${ss}_se${se}.png";
my $cmd1 = qq(alnquod.py --grid -q -l "$q (red) and $s (blue)" -o $alnFig --xticks 25 --width 15 -- $qalnFile $seqDir/${q}.faa $salnFile $seqDir/${s}.faa); 472 477 my $cmd1 = qq(quod.py -q -l "$q (red) and $s (blue)" -o $alnFig --xticks 25 --width 15 --edgecolor +0:red +1:blue --facecolor +0:orange +1:cyan --multi frag -- $qalnFile $seqDir/${q}.faa $salnFile $seqDir/${s}.faa);
#print "$cmd1\n\n"; 473 478 # print "$cmd1\n\n";
479 # exit;
system $cmd1 unless (-f "${alnFig}"); 474 480 system $cmd1 unless (-f "${alnFig}");
return undef unless (-f "${alnFig}"); 475 481 return undef unless (-f "${alnFig}");
476 482
477 483
#----------------------------------------------------------------- 478 484 #-----------------------------------------------------------------
#Run quod for the full sequencess of the query and subject proteins 479 485 #Run quod for the full sequencess of the query and subject proteins
480 486
481 487
#Extract TMS coordinates for query 482 488 #Extract TMS coordinates for query
die "Error: no hmmtop results for: $q" unless (exists $hmmtopHits{$q}); 483 489 die "Error: no hmmtop results for: $q" unless (exists $hmmtopHits{$q});
my $qTMS = ""; 484 490 my $qTMS = "";
if (scalar @{ $hmmtopHits{$q}{coords} } > 0) { 485 491 if (scalar @{ $hmmtopHits{$q}{coords} } > 0) {
$qTMS = "-at " . join(",", @{ $hmmtopHits{$q}{coords} }) . ":orange"; 486 492 $qTMS = "--add-tms " . join(",", @{ $hmmtopHits{$q}{coords} }) . ":orange";
} 487 493 }
488 494
489 495
#Plot query hydropathy 490 496 #Plot query hydropathy
my $qPfam = get_pfam_coords_for_quod($q, "red"); 491 497 my $qPfam = get_pfam_coords_for_quod($q, "red");
my $qName = "$plotsDir/${q}_vs_${s}_qaln_qs${qs}_qe${qe}"; 492 498 my $qName = "$plotsDir/${q}_vs_${s}_qaln_qs${qs}_qe${qe}.png";
my $cmd2 = qq(quod.py --grid -q -l "$q" -o $qName --width 15 --color red --xticks 25 -w ${qs}-${qe}::1 -t png -nt +0 $qTMS $qPfam -- $seqDir/${q}.faa); 493 499 my $cmd2 = qq(quod.py -q -l "$q" -o $qName --width 15 --edgecolor red --xticks 25 -w ${qs}-${qe}:+2.7:+:Alignment --no-tms +0 $qTMS $qPfam -- $seqDir/${q}.faa);
#print "$cmd2\n\n"; 494 500 # print "$cmd2\n\n";
system $cmd2 unless (-f "${qName}.png"); 495 501 # exit;
return undef unless (-f "${qName}.png"); 496 502 system $cmd2 unless (-f $qName);
503 return undef unless (-f $qName);
497 504
498 505
499 506
#TMS coords for the subject 500 507 #TMS coords for the subject
die "Error: no hmmtop results for: $s" unless (exists $hmmtopHits{$s}); 501 508 die "Error: no hmmtop results for: $s" unless (exists $hmmtopHits{$s});
my $sTMS = ""; 502 509 my $sTMS = "";
if (scalar @{ $hmmtopHits{$s}{coords} } > 0) { 503 510 if (scalar @{ $hmmtopHits{$s}{coords} } > 0) {
$sTMS = "-at " . join(",", @{ $hmmtopHits{$s}{coords} }) . ":cyan"; 504 511 $sTMS = "--add-tms " . join(",", @{ $hmmtopHits{$s}{coords} }) . ":cyan";
} 505 512 }
506 513
#Plot Subject hydropaty 507 514 #Plot Subject hydropaty
my $sPfam = get_pfam_coords_for_quod($s, "blue"); 508 515 my $sPfam = get_pfam_coords_for_quod($s, "blue");
my $sName = "$plotsDir/${q}_vs_${s}_saln_ss${ss}_se${se}"; 509 516 my $sName = "$plotsDir/${q}_vs_${s}_saln_ss${ss}_se${se}.png";
my $cmd3 = qq(quod.py --grid -q -l "$s" -o $sName --width 15 --color blue --xticks 25 -w ${ss}-${se}::1 -t png -nt +0 $sTMS $sPfam -- $seqDir/${s}.faa); 510 517 my $cmd3 = qq(quod.py -q -l "$s" -o $sName --width 15 --edgecolor blue --xticks 25 -w ${ss}-${se}:+2.7:+:Alignment --no-tms +0 $sTMS $sPfam -- $seqDir/${s}.faa);
#print "$cmd3\n\n"; 511 518 # print "$cmd3\n\n";
system $cmd3 unless (-f "${sName}.png"); 512 519 # exit;
return undef unless (-f "${sName}.png"); 513 520 system $cmd3 unless (-f $sName);
521 return undef unless (-f $sName);
514 522
515 523
return 1; 516 524 return 1;
} 517 525 }
518 526
#========================================================================== 519 527 #==========================================================================
#Get the string for quod that will plot the PFAM domains 520 528 #Get the string for quod that will plot the PFAM domains
521 529
sub get_pfam_coords_for_quod { 522 530 sub get_pfam_coords_for_quod {
523 531
my ($prot, $color) = @_; 524 532 my ($prot, $color) = @_;
525 533
#Format of PFAM hash: 526 534 #Format of PFAM hash:
# push (@{ $out->{$qacc}->{$pfamID} }, 527 535 # push (@{ $out->{$qacc}->{$pfamID} },
# {pfamid=> $pfamID, dlen=>$pfamLen, dstart=>$dstart, dend=>$dend, 528 536 # {pfamid=> $pfamID, dlen=>$pfamLen, dstart=>$dstart, dend=>$dend,
# def=>$def, qlen=>$qlen, qstart=>$qstart, qend=>$qend }); 529 537 # def=>$def, qlen=>$qlen, qstart=>$qstart, qend=>$qend });
530 538
531 539
532 540
my $str = ""; 533 541 my $str = "";
534 542
if (exists $pfamHits{$prot}) { 535 543 if (exists $pfamHits{$prot}) {
my @Doms = keys %{ $pfamHits{$prot} }; 536 544 my @Doms = keys %{ $pfamHits{$prot} };
my $dcnt = 0; 537 545 my $dcnt = 0;
$str = "--region-font 12 -ar "; 538 546 $str = "--region-font 12 --add-region ";
foreach my $d (@Doms) { 539 547 foreach my $d (@Doms) {
540 548
my @hits = @{ $pfamHits{$prot}{$d} }; 541 549 my @hits = @{ $pfamHits{$prot}{$d} };
foreach my $hit (@hits) { 542 550 foreach my $hit (@hits) {
my $left = $hit->{qstart}; 543 551 my $left = $hit->{qstart};
my $right = $hit->{qend}; 544 552 my $right = $hit->{qend};
545 553
my $ypos = -2.8 + $dcnt * 0.4; 546 554 my $yposl = -2.8 + $dcnt * 0.4; #domain bottom coord
$str .= "${left}-${right}:'${d}':${ypos}:$color "; 547 555 my $yposh = $yposl + 0.15; #domain height coord
556
557 $str .= "${left}-${right}:'${d}':${yposl},${yposh}:$color,black:tc ";
$dcnt++; 548 558 $dcnt++;
} 549 559 }
} 550 560 }
} 551 561 }
552 562
return $str; 553 563 return $str;
554 564
} 555 565 }
556 566
557 567
#========================================================================== 558 568 #==========================================================================
#Extract the full sequences of the query and subject proteins 559 569 #Extract the full sequences of the query and subject proteins
#Examples: AKM80767.1 560 570 #Examples: AKM80767.1
561 571
562 572
563 573
sub extract_full_sequences { 564 574 sub extract_full_sequences {
565 575
my ($q, $s) = @_; 566 576 my ($q, $s) = @_;
567 577
568 578
my $q_seq = "$seqDir/${q}.faa"; 569 579 my $q_seq = "$seqDir/${q}.faa";
my $s_seq = "$seqDir/${s}.faa"; 570 580 my $s_seq = "$seqDir/${s}.faa";
571 581
#extract the query secuence from tcdb and the subject from the custom blastdb 572 582 #extract the query secuence from tcdb and the subject from the custom blastdb
my $cmd1 = qq(blastdbcmd -db $blastdb -entry $q -target_only -out $q_seq); 573 583 my $cmd1 = qq(blastdbcmd -db $blastdb -entry $q -target_only -out $q_seq);
system "$cmd1" unless (-f $q_seq && !(-z $q_seq)); 574 584 system "$cmd1" unless (-f $q_seq && !(-z $q_seq));
die "Could not extract sequence for $q" unless (-f $q_seq && !(-z $q_seq)); 575 585 die "Could not extract sequence for $q" unless (-f $q_seq && !(-z $q_seq));
576 586
577 587
my $cmd2 = qq(blastdbcmd -db $blastdb -entry $s -target_only -out $s_seq); 578 588 my $cmd2 = qq(blastdbcmd -db $blastdb -entry $s -target_only -out $s_seq);
system "$cmd2" unless (-f $s_seq && !(-z $s_seq)); 579 589 system "$cmd2" unless (-f $s_seq && !(-z $s_seq));
die "Could not extract sequence for $s" unless (-f $s_seq && !(-z $s_seq)); 580 590 die "Could not extract sequence for $s" unless (-f $s_seq && !(-z $s_seq));
} 581 591 }
582 592
583 593
584 594
585 595
586 596
587 597
588 598
#========================================================================== 589 599 #==========================================================================
#Sort alignmnet results by E-value 590 600 #Sort alignmnet results by E-value
591 601
sub by_evalue { 592 602 sub by_evalue {
$a->{evalue} <=> $b->{evalue}; 593 603 $a->{evalue} <=> $b->{evalue};
} 594 604 }
595 605
596 606
597 607
#========================================================================== 598 608 #==========================================================================
#Run PFAM, hmmtop and parse results 599 609 #Run PFAM, hmmtop and parse results
600 610
601 611
sub run_pfam_hmmtop { 602 612 sub run_pfam_hmmtop {
my ($pfamOut, $hmmtopOut, $pfamClans) = @_; 603 613 my ($pfamOut, $hmmtopOut, $pfamClans) = @_;
604 614
605 615
#---------------------------------------------------------------------- 606 616 #----------------------------------------------------------------------
#Generate blast DB for easy sequence retrieval 607 617 #Generate blast DB for easy sequence retrieval
608 618
print "Generate Blast DB with sequences for fast sequence retrieval...\n"; 609 619 print "Generate Blast DB with sequences for fast sequence retrieval...\n";
610 620
#Get the sequences for which hmmscan will run 611 621 #Get the sequences for which hmmscan will run
my $allSeqsFile = "$seqDir/all_seqs.faa"; 612 622 my $allSeqsFile = "$seqDir/all_seqs.faa";
system qq(cat $qfile $sfile > $allSeqsFile) unless (-f $allSeqsFile && !(-z $allSeqsFile)); 613 623 system qq(cat $qfile $sfile > $allSeqsFile) unless (-f $allSeqsFile && !(-z $allSeqsFile));
die "Could not generate file: $allSeqsFile" unless (-f $allSeqsFile && !(-z $allSeqsFile)); 614 624 die "Could not generate file: $allSeqsFile" unless (-f $allSeqsFile && !(-z $allSeqsFile));
615 625
616 626
#Generate blastdb ...assuming there are no duplicate sequences. 617 627 #Generate blastdb ...assuming there are no duplicate sequences.
my $cmd1 = qq(makeblastdb -dbtype prot -in $allSeqsFile -title '$qlabel plus $slabel' -parse_seqids -hash_index -out $blastdb); 618 628 my $cmd1 = qq(makeblastdb -dbtype prot -in $allSeqsFile -title '$qlabel plus $slabel' -parse_seqids -hash_index -out $blastdb);
print "$cmd1\n"; 619 629 print "$cmd1\n";
system $cmd1 unless (-f "${blastdb}.pin"); 620 630 system $cmd1 unless (-f "${blastdb}.pin");
system "rm $allSeqsFile" if (-f $allSeqsFile); 621 631 system "rm $allSeqsFile" if (-f $allSeqsFile);
622 632
623 633
#---------------------------------------------------------------------- 624 634 #----------------------------------------------------------------------
#Get the accessions of the top hits in the alignments 625 635 #Get the accessions of the top hits in the alignments
626 636
#get the accessions with significant hits 627 637 #get the accessions with significant hits
my %accList = (); 628 638 my %accList = ();
foreach my $hit (@alnHits) { 629 639 foreach my $hit (@alnHits) {
$accList{$hit->{qacc}} = 1; 630 640 $accList{$hit->{qacc}} = 1;
$accList{$hit->{sacc}} = 1; 631 641 $accList{$hit->{sacc}} = 1;
} 632 642 }
633 643
634 644
#Save accessions to a file 635 645 #Save accessions to a file
my $idFile = "$seqDir/top_hits_accs.txt"; 636 646 my $idFile = "$seqDir/top_hits_accs.txt";
unless (-f $idFile) { 637 647 unless (-f $idFile) {
open (my $afh, ">", $idFile) || die $!; 638 648 open (my $afh, ">", $idFile) || die $!;
print $afh join("\n", keys %accList), "\n"; 639 649 print $afh join("\n", keys %accList), "\n";
close $afh; 640 650 close $afh;
} 641 651 }
642 652
#---------------------------------------------------------------------- 643 653 #----------------------------------------------------------------------
#Extract full sequences for top hits. 644 654 #Extract full sequences for top hits.
645 655
my $topHitsSeqs = "$seqDir/topHits.faa"; 646 656 my $topHitsSeqs = "$seqDir/topHits.faa";
my $cmdTopHits = qq(blastdbcmd -db $blastdb -entry_batch $idFile -target_only -out $topHitsSeqs); 647 657 my $cmdTopHits = qq(blastdbcmd -db $blastdb -entry_batch $idFile -target_only -out $topHitsSeqs);
system $cmdTopHits unless (-f $topHitsSeqs && !(-z $topHitsSeqs)); 648 658 system $cmdTopHits unless (-f $topHitsSeqs && !(-z $topHitsSeqs));
649 659
650 660
#---------------------------------------------------------------------- 651 661 #----------------------------------------------------------------------
#run hmmscan on all the sequences for both files 652 662 #run hmmscan on all the sequences for both files
653 663
print "\nRunning hmmscan and parsing output....\n"; 654 664 print "\nRunning hmmscan and parsing output....\n";
655 665
my $pfamDB = ($ENV{PFAMDB})? $ENV{PFAMDB} : "$ENV{RESEARCH_DATA}/pfam/pfamdb/Pfam-A.hmm"; 656 666 my $pfamDB = ($ENV{PFAMDB})? $ENV{PFAMDB} : "$ENV{RESEARCH_DATA}/pfam/pfamdb/Pfam-A.hmm";
my $cmd2 = qq(hmmscan --cpu 4 --noali --cut_ga -o /dev/null --domtblout $pfamFile $pfamDB $topHitsSeqs); 657 667 my $cmd2 = qq(hmmscan --cpu 4 --noali --cut_ga -o /dev/null --domtblout $pfamFile $pfamDB $topHitsSeqs);
system $cmd2 unless (-f $pfamFile && !(-z $pfamFile)); 658 668 system $cmd2 unless (-f $pfamFile && !(-z $pfamFile));
659 669
660 670
#parse Pfam output 661 671 #parse Pfam output
TCDB::Assorted::parse_pfam($pfamFile, $pfamOut, $pfamClans); 662 672 TCDB::Assorted::parse_pfam($pfamFile, $pfamOut, $pfamClans);
# print Data::Dumper->Dump([$pfamOut, $pfamClans ], [qw(*pfamOut *pfamClans )]); 663 673 # print Data::Dumper->Dump([$pfamOut, $pfamClans ], [qw(*pfamOut *pfamClans )]);
# exit; 664 674 # exit;
665 675
#---------------------------------------------------------------------- 666 676 #----------------------------------------------------------------------
#Extract clans 667 677 #Extract clans
668 678
TCDB::Assorted::get_clans($pfamClans, $filesDir); 669 679 TCDB::Assorted::get_clans($pfamClans, $filesDir);
# print Data::Dumper->Dump([$pfamClans ], [qw(*clans )]); 670 680 # print Data::Dumper->Dump([$pfamClans ], [qw(*clans )]);
# exit; 671 681 # exit;
672 682
#-------------------------------------------------------------------------- 673 683 #--------------------------------------------------------------------------
#Run hmmtop on top hits for later hydropathy plots. 674 684 #Run hmmtop on top hits for later hydropathy plots.
675 685
print "Runnign HMMTOP and parsing output...\n"; 676 686 print "Runnign HMMTOP and parsing output...\n";
677 687
my $cmd3 = qq(hmmtop -if=$topHitsSeqs -of=$hmmtopFile -sf=FAS -pi=spred -is=pseudo); 678 688 my $cmd3 = qq(hmmtop -if=$topHitsSeqs -of=$hmmtopFile -sf=FAS -pi=spred -is=pseudo);
system $cmd3 unless (-f $hmmtopFile); 679 689 system $cmd3 unless (-f $hmmtopFile);
system "rm $topHitsSeqs" if (-f $topHitsSeqs); 680 690 system "rm $topHitsSeqs" if (-f $topHitsSeqs);
681 691
#Parse hmmtop output 682 692 #Parse hmmtop output
TCDB::Assorted::parse_hmmtop($hmmtopOut, $hmmtopFile); 683 693 TCDB::Assorted::parse_hmmtop($hmmtopOut, $hmmtopFile);
684 694
} 685 695 }
686 696
687 697
#========================================================================== 688 698 #==========================================================================
#Parse ssearch36 output 689 699 #Parse ssearch36 output
690 700
sub parse_ssearch { 691 701 sub parse_ssearch {
692 702
my $out = shift; 693 703 my $out = shift;
694 704
my $parser = new Bio::SearchIO (-format => 'fasta', -file => $alnFile); 695 705 my $parser = new Bio::SearchIO (-format => 'fasta', -file => $alnFile);
696 706
my $formatTmp = $parser->format(); 697 707 my $formatTmp = $parser->format();
# print Data::Dumper->Dump([$formatTmp ], [qw(*fileFormat )]); 698 708 # print Data::Dumper->Dump([$formatTmp ], [qw(*fileFormat )]);
# exit; 699 709 # exit;
700 710
while (my $result = $parser->next_result) { 701 711 while (my $result = $parser->next_result) {
702 712
my $qacc = $result->query_name; 703 713 my $qacc = $result->query_name;
my $qlen = $result->query_length; 704 714 my $qlen = $result->query_length;
705 715
706 716
HIT:while (my $hit = $result->next_hit) { 707 717 HIT:while (my $hit = $result->next_hit) {
HSP:while(my $hsp = $hit->next_hsp) { 708 718 HSP:while(my $hsp = $hit->next_hsp) {
709 719
#Alignment parameters 710 720 #Alignment parameters
my $sacc = $hit->name; 711 721 my $sacc = $hit->name;
my $slen = $hit->length; 712 722 my $slen = $hit->length;
my $eval = $hsp->evalue; 713 723 my $eval = $hsp->evalue;
my $id = $hsp->frac_identical('total') * 100; 714 724 my $id = $hsp->frac_identical('total') * 100;
715 725
#coordinates and sequence 716 726 #coordinates and sequence
my $qstart = $hsp->start('query'); 717 727 my $qstart = $hsp->start('query');
my $qend = $hsp->end('query'); 718 728 my $qend = $hsp->end('query');
my $sstart = $hsp->start('subject'); 719 729 my $sstart = $hsp->start('subject');
my $send = $hsp->end('subject'); 720 730 my $send = $hsp->end('subject');
my $qseq = $hsp->query_string; 721 731 my $qseq = $hsp->query_string;
my $sseq = $hsp->hit_string; 722 732 my $sseq = $hsp->hit_string;
my $hstr = $hsp->homology_string; 723 733 my $hstr = $hsp->homology_string;
724 734
725 735
#Check first that both proteins have the right length 726 736 #Check first that both proteins have the right length
next HSP if (max_length_violation($qlen, $slen, $maxProtLength, $LengthControl)); 727 737 next HSP if (max_length_violation($qlen, $slen, $maxProtLength, $LengthControl));
728 738
#If the alignment has less than $minLength aas, ignore it 729 739 #If the alignment has less than $minLength aas, ignore it
my $qtmp = $qseq; $qtmp =~ s/-//g; 730 740 my $qtmp = $qseq; $qtmp =~ s/-//g;
my $stmp = $sseq; $stmp =~ s/-//g; 731 741 my $stmp = $sseq; $stmp =~ s/-//g;
next HSP if (length($qtmp) < $minLength || length($stmp) < $minLength); 732 742 next HSP if (length($qtmp) < $minLength || length($stmp) < $minLength);
733 743
#Calculate coverages properly (do not use alignment length as it includes gaps 734 744 #Calculate coverages properly (do not use alignment length as it includes gaps
my $qCov_tmp = ($qend - $qstart + 1) / $qlen * 100; 735 745 my $qCov_tmp = ($qend - $qstart + 1) / $qlen * 100;
my $qcov = ($qCov_tmp > 100.0)? 100 : $qCov_tmp; 736 746 my $qcov = ($qCov_tmp > 100.0)? 100 : $qCov_tmp;
737 747
my $sCov_tmp = ($send - $sstart + 1) / $slen * 100; 738 748 my $sCov_tmp = ($send - $sstart + 1) / $slen * 100;
my $scov = ($sCov_tmp > 100.0)? 100 : $sCov_tmp; 739 749 my $scov = ($sCov_tmp > 100.0)? 100 : $sCov_tmp;
740 750
741 751
if ($eval <= $evalue && TCDB::Assorted::coverage_ok($qcov, $scov, $coverage, $covControl)) { 742 752 if ($eval <= $evalue && TCDB::Assorted::coverage_ok($qcov, $scov, $coverage, $covControl)) {
743 753
push(@{ $out }, {qacc=>$qacc, sacc=>$sacc, qlen=>$qlen, slen=>$slen, qcov=>$qcov, 744 754 push(@{ $out }, {qacc=>$qacc, sacc=>$sacc, qlen=>$qlen, slen=>$slen, qcov=>$qcov,
scov=>$scov, evalue=>$eval, id=>$id, qstart=>$qstart, qend=>$qend, 745 755 scov=>$scov, evalue=>$eval, id=>$id, qstart=>$qstart, qend=>$qend,
sstart=>$sstart, send=>$send, qseq=>$qseq, sseq=>$sseq, hstr=>$hstr}); 746 756 sstart=>$sstart, send=>$send, qseq=>$qseq, sseq=>$sseq, hstr=>$hstr});
} 747 757 }
} # hsp 748 758 } # hsp
} # hit 749 759 } # hit
} # result 750 760 } # result
} 751 761 }
752 762
753 763
754 764
#========================================================================== 755 765 #==========================================================================
#Test whether the lengths of two proteins are withing a predefined 756 766 #Test whether the lengths of two proteins are withing a predefined
#legnth specified by the user. This are the options for control: 757 767 #legnth specified by the user. This are the options for control:
# X: Either protein is larger than the cutoff 758 768 # X: Either protein is larger than the cutoff
# B: Both proteins are larger than the cutoff 759 769 # B: Both proteins are larger than the cutoff
# Q: Only the query protein is larger than the cutoff 760 770 # Q: Only the query protein is larger than the cutoff
# S: Only the subject protein is larger than the cutoff 761 771 # S: Only the subject protein is larger than the cutoff
# N: No control. Any length is ok. 762 772 # N: No control. Any length is ok.
763 773
sub max_length_violation { 764 774 sub max_length_violation {
765 775
my ($qlen, $slen, $maxLen, $control) = @_; 766 776 my ($qlen, $slen, $maxLen, $control) = @_;
767 777
if ($control eq "X") { 768 778 if ($control eq "X") {
(($qlen >= $maxLen) || ($slen >= $maxLen))? return 1 : return 0; 769 779 (($qlen >= $maxLen) || ($slen >= $maxLen))? return 1 : return 0;
} 770 780 }
771 781
if ($control eq "B") { 772 782 if ($control eq "B") {
(($qlen >= $maxLen) && ($slen >= $maxLen))? return 1 : return 0; 773 783 (($qlen >= $maxLen) && ($slen >= $maxLen))? return 1 : return 0;
} 774 784 }
775 785
if ($control eq "Q") { 776 786 if ($control eq "Q") {
($qlen >= $maxLen)? return 1 : return 0; 777 787 ($qlen >= $maxLen)? return 1 : return 0;
} 778 788 }
779 789
if ($control eq "S") { 780 790 if ($control eq "S") {
($slen >= $maxLen)? return 1 : return 0; 781 791 ($slen >= $maxLen)? return 1 : return 0;
} 782 792 }
783 793
if ($control eq "N") { 784 794 if ($control eq "N") {
return 0; 785 795 return 0;
} 786 796 }
787 797
die "Unknown control mode: $control"; 788 798 die "Unknown control mode: $control";
789 799
} 790 800 }
791 801
792 802
793 803
794 804
#========================================================================== 795 805 #==========================================================================
#Parse blast output 796 806 #Parse blast output
797 807
798 808
sub parse_blast { 799 809 sub parse_blast {
my $out = shift; 800 810 my $out = shift;
801 811
open (my $fh, "<", $alnFile) || die $!; 802 812 open (my $fh, "<", $alnFile) || die $!;
LINE:while (<$fh>) { 803 813 LINE:while (<$fh>) {
chomp; 804 814 chomp;
next unless ($_); 805 815 next unless ($_);
next if (/^#/); 806 816 next if (/^#/);
807 817
#Blast columns: qacc sacc qlen slen evalue pident qstart qend sstart send qseq sseq 808 818 #Blast columns: qacc sacc qlen slen evalue pident qstart qend sstart send qseq sseq
my ($qacc, $sacc, $qlen, $slen, $eval, $id, $qstart, $qend, $sstart, $send, $qseq, $sseq) = split (/\t/, $_); 809 819 my ($qacc, $sacc, $qlen, $slen, $eval, $id, $qstart, $qend, $sstart, $send, $qseq, $sseq) = split (/\t/, $_);
810 820
811 821
if ($eval <= $evalue) { 812 822 if ($eval <= $evalue) {
813 823
my $qcov = ($qend - $qstart + 1) / $qlen * 100; 814 824 my $qcov = ($qend - $qstart + 1) / $qlen * 100;
my $scov = ($send - $sstart + 1) / $slen * 100; 815 825 my $scov = ($send - $sstart + 1) / $slen * 100;
816 826
if (TCDB::Assorted::coverage_ok($qcov, $scov, $coverage, $covControl)) { 817 827 if (TCDB::Assorted::coverage_ok($qcov, $scov, $coverage, $covControl)) {
818 828
push(@{ $out }, {qacc=>$qacc, sacc=>$sacc, qlen=>$qlen, slen=>$slen, qcov=>$qcov, 819 829 push(@{ $out }, {qacc=>$qacc, sacc=>$sacc, qlen=>$qlen, slen=>$slen, qcov=>$qcov,
scov=>$scov, evalue=>$eval, id=>$id, qstart=>$qstart, qend=>$qend, 820 830 scov=>$scov, evalue=>$eval, id=>$id, qstart=>$qstart, qend=>$qend,
sstart=>$sstart, send=>$send, qseq=>$qseq, sseq=>$sseq}); 821 831 sstart=>$sstart, send=>$send, qseq=>$qseq, sseq=>$sseq});
} 822 832 }
} 823 833 }
} 824 834 }
close $fh; 825 835 close $fh;
} 826 836 }
827 837
828 838
829 839
830 840
831 841
#========================================================================== 832 842 #==========================================================================
#Run the alignemnt between the two files depending on the program 833 843 #Run the alignemnt between the two files depending on the program
#Selected by the user. 834 844 #Selected by the user.
835 845
sub run_alignment { 836 846 sub run_alignment {
837 847
my $cmd = ""; 838 848 my $cmd = "";
839 849
840 850
if ($prog eq 'blastp') { 841 851 if ($prog eq 'blastp') {
842 852
my $compStr = "-comp_based_stats $blastComp"; 843 853 my $compStr = "-comp_based_stats $blastComp";
my $segStr = "-seg $segFilter"; 844 854 my $segStr = "-seg $segFilter";
my $outFmt = qq(-outfmt '7 qacc sacc qlen slen evalue pident qstart qend sstart send qseq sseq'); 845 855 my $outFmt = qq(-outfmt '7 qacc sacc qlen slen evalue pident qstart qend sstart send qseq sseq');
846 856
#Run blast 847 857 #Run blast
$cmd = qq(blastp -query $qfile -subject $sfile -matrix BLOSUM62 -out $alnFile $outFmt -evalue $evalue -use_sw_tback $compStr $segStr); 848 858 $cmd = qq(blastp -query $qfile -subject $sfile -matrix BLOSUM62 -out $alnFile $outFmt -evalue $evalue -use_sw_tback $compStr $segStr);
print "$cmd\n"; 849 859 print "$cmd\n";
system $cmd unless (-f $alnFile && !(-z $alnFile)); 850 860 system $cmd unless (-f $alnFile && !(-z $alnFile));
851 861
#Append command line to the end of results file 852 862 #Append command line to the end of results file
open (my $fh, ">>", $alnFile) || die $!; 853 863 open (my $fh, ">>", $alnFile) || die $!;
print $fh "\n# $cmd\n"; 854 864 print $fh "\n# $cmd\n";
close $fh; 855 865 close $fh;
locateFragment.pl View file @ 68b41f1
#!/usr/bin/env perl -w 1 1 #!/usr/bin/env perl -w
2 2
use strict; 3 3 use strict;
use warnings; 4 4 use warnings;
use Data::Dumper; 5 5 use Data::Dumper;
6 6
$Data::Dumper::Deepcopy = 1; 7 7 $Data::Dumper::Deepcopy = 1;
8 8
use Getopt::Long; 9 9 use Getopt::Long;
use Bio::SearchIO; 10 10 use Bio::SearchIO;
use Bio::SeqIO; 11 11 use Bio::SeqIO;
12 12
13 13
#Local libraries 14 14 #Local libraries
use TCDB::CheckDependencies; 15 15 use TCDB::CheckDependencies;
use TCDB::Assorted; 16 16 use TCDB::Assorted;
17 17
18 18
########################################################################### 19 19 ###########################################################################
# 20 20 #
# Parse GBLAST output file and return the sequence of the proteins that 21 21 # Parse GBLAST output file and return the sequence of the proteins that
# meet the user's criteria. 22 22 # meet the user's criteria.
# 23 23 #
########################################################################### 24 24 ###########################################################################
25 25
26 26
#This will prevent quod and alnquod from going into interactive mode 27 27 #This will prevent quod and alnquod from going into interactive mode
$ENV{"MPLBACKEND"} = "agg"; 28 28 $ENV{"MPLBACKEND"} = "agg";
29 29
30 30
#========================================================================== 31 31 #==========================================================================
#Check dependencies 32 32 #Check dependencies
33 33
my @dependencies = ("glsearch36", "blastdbcmd"); 34 34 my @dependencies = ("glsearch36", "blastdbcmd");
my $CheckDep_obj = new TCDB::CheckDependencies(); 35 35 my $CheckDep_obj = new TCDB::CheckDependencies();
$CheckDep_obj -> dependencies_list(\@dependencies); 36 36 $CheckDep_obj -> dependencies_list(\@dependencies);
$CheckDep_obj -> checkDependencies; 37 37 $CheckDep_obj -> checkDependencies;
38 38
39 39
40 40
41 41
#========================================================================== 42 42 #==========================================================================
#Read command line arguments 43 43 #Read command line arguments
44 44
my $fragment = undef; 45 45 my $fragment = undef;
my $accession = undef; 46 46 my $accession = undef;
my $accFile = undef 47 47 my $accFile = undef
my $outdir = undef; 48 48 my $outdir = undef;
my $blastdb = undef; 49 49 my $blastdb = undef;
my $evalue = 1e-2; 50 50 my $evalue = 1e-2;
my $subMatrix = 'BL50'; 51 51 my $subMatrix = 'BL50';
my $quiet = 0; 52 52 my $quiet = 0;
my $interactive = 0; 53 53 my $interactive = 0;
54 54
read_command_line(); 55 55 read_command_line();
56 56
#print Data::Dumper->Dump([$fragment, $accession, $accFile, $outdir, $blastdb, $evalue, $quiet], 57 57 #print Data::Dumper->Dump([$fragment, $accession, $accFile, $outdir, $blastdb, $evalue, $quiet],
# [qw(*fragment *accession *accFile *outdir *blastdb *evalue *quiet)]); 58 58 # [qw(*fragment *accession *accFile *outdir *blastdb *evalue *quiet)]);
#exit; 59 59 #exit;
60 60
61 61
62 62
63 63
#========================================================================== 64 64 #==========================================================================
#Get the sequences for the fragment and full proteins in files 65 65 #Get the sequences for the fragment and full proteins in files
66 66
67 67
my ($fragFile, $protFile) = @{ getSequences($fragment, $accession) }; 68 68 my ($fragFile, $protFile) = @{ getSequences($fragment, $accession) };
#print Data::Dumper->Dump([$fragFile, $protFile ], [qw(*fragFile *protFile)]); 69 69 #print Data::Dumper->Dump([$fragFile, $protFile ], [qw(*fragFile *protFile)]);
70 70
die "Both files must exist:\n $fragFile\n $protFile" unless (-f $fragFile && -f $protFile); 71 71 die "Both files must exist:\n $fragFile\n $protFile" unless (-f $fragFile && -f $protFile);
72 72
#========================================================================== 73 73 #==========================================================================
#align fragment to full protein 74 74 #align fragment to full protein
75 75
my $alignFile = "$outdir/${accession}_glsearch.out"; 76 76 my $alignFile = "$outdir/${accession}_glsearch.out";
my $cmd = qq(glsearch36 -s BL62 -z 21 -k 10000 -E $evalue -s $subMatrix $fragFile $protFile > $alignFile); 77 77 my $cmd = qq(glsearch36 -s BL62 -z 21 -k 10000 -E $evalue -s $subMatrix $fragFile $protFile > $alignFile);
system $cmd unless (-f $alignFile); 78 78 system $cmd unless (-f $alignFile);
79 79
80 80
#========================================================================== 81 81 #==========================================================================
#Parse glsearch output and run quod 82 82 #Parse glsearch output and run quod
83 83
84 84
run_quod ($alignFile, $protFile); 85 85 run_quod ($alignFile, $protFile);
86 86
87 87
88 88
89 89
90 90
#========================================================================== 91 91 #==========================================================================
################ Subroutines definition beyond ths point ############## 92 92 ################ Subroutines definition beyond ths point ##############
#========================================================================== 93 93 #==========================================================================
94 94
95 95
sub run_quod { 96 96 sub run_quod {
97 97
my ($alignment, $sequence) = @_; 98 98 my ($alignment, $sequence) = @_;
99 99
100 100
my $parser = new Bio::SearchIO (-format => 'fasta', -file => $alignment); 101 101 my $parser = new Bio::SearchIO (-format => 'fasta', -file => $alignment);
102 102
my @res = (); 103 103 my @res = ();
104 104
#---------------------------------------------------------------------- 105 105 #----------------------------------------------------------------------
#Parse glsearch output 106 106 #Parse glsearch output
107 107
my $hsp_cnt = 1; 108 108 my $hsp_cnt = 1;
while (my $result = $parser->next_result) { 109 109 while (my $result = $parser->next_result) {
110 110
HIT:while (my $hit = $result->next_hit) { 111 111 HIT:while (my $hit = $result->next_hit) {
HSP:while(my $hsp = $hit->next_hsp) { 112 112 HSP:while(my $hsp = $hit->next_hsp) {
113 113
my %data = (); 114 114 my %data = ();
my $key = "hsp_$hsp_cnt"; 115 115 my $key = "hsp_$hsp_cnt";
my $hId = $hsp->frac_identical('total'); #identity in the alignment 116 116 my $hId = $hsp->frac_identical('total'); #identity in the alignment
my $hSim = $hsp->frac_conserved('total'); #similarity in the alignment 117 117 my $hSim = $hsp->frac_conserved('total'); #similarity in the alignment
118 118
#Alignment parameters 119 119 #Alignment parameters
$data{hsp} = $key; 120 120 $data{hsp} = $key;
$data{evalue} = $hsp->evalue; 121 121 $data{evalue} = $hsp->evalue;
$data{id} = $hId; 122 122 $data{id} = $hId;
$data{sim} = $hSim; 123 123 $data{sim} = $hSim;
124 124
#coordinates in the alignment to plot bars 125 125 #coordinates in the alignment to plot bars
$data{qstart} = $hsp->start('query'); 126 126 $data{qstart} = $hsp->start('query');
$data{qend} = $hsp->end('query'); 127 127 $data{qend} = $hsp->end('query');
$data{sstart} = $hsp->start('subject'); 128 128 $data{sstart} = $hsp->start('subject');
$data{send} = $hsp->end('subject'); 129 129 $data{send} = $hsp->end('subject');
130 130
push (@res, \%data); 131 131 push (@res, \%data);
$hsp_cnt++; 132 132 $hsp_cnt++;
} 133 133 }
} 134 134 }
} 135 135 }
136 136
die "Error: no match found between fragment and sequence" unless (@res); 137 137 die "Error: no match found between fragment and sequence" unless (@res);
# print Data::Dumper->Dump([\@res ], [qw(*res )]); 138 138 # print Data::Dumper->Dump([\@res ], [qw(*res )]);
139 139
140 140
#---------------------------------------------------------------------- 141 141 #----------------------------------------------------------------------
#Generate quod plot 142 142 #Generate quod plot
143 143
#Format string for the regions 144 144 #Format string for the regions
my $regions = "-at "; 145 145 my $regions = "--add-tms ";
my $coords = ""; 146 146 my $coords = "";
foreach my $hit (@res) { 147 147 foreach my $hit (@res) {
148 148
$coords = $hit->{sstart} . "-" . $hit->{send}; 149 149 $coords = $hit->{sstart} . "-" . $hit->{send};
$regions .= "${coords}:green"; 150 150 $regions .= "${coords}:green";
151 151
#only the best HSP is required to be plotted 152 152 #only the best HSP is required to be plotted
last; 153 153 last;
} 154 154 }
155 155
my $outPlot = "$outdir/${accession}_map_frag.png"; 156 156 my $outPlot = "$outdir/${accession}_map_frag.png";
my $qstring = ($quiet)? "-q -o $outPlot" : "-o $outPlot"; 157 157 my $qstring = ($quiet)? "-q -o $outPlot" : "-o $outPlot";
my $iString = ($interactive)? "-o $outPlot --show" : ""; 158 158 my $iString = ($interactive)? "-o $outPlot --show" : "";
159 159
my $cmd = qq(quod.py $qstring $iString -l "$accession ($coords)" --xticks 25 --grid $regions -- $sequence); 160 160 my $cmd = qq(quod.py $qstring $iString -l "$accession ($coords)" --xticks 25 --grid $regions -- $sequence);
print "$cmd\n"; 161 161 print "$cmd\n";
system $cmd; 162 162 system $cmd;
} 163 163 }
164 164
165 165
166 166
167 167
sub getSequences { 168 168 sub getSequences {
169 169
my ($frag, $acc) = @_; 170 170 my ($frag, $acc) = @_;
171 171
172 172
# print Data::Dumper->Dump([$frag, $acc, $accFile ], [qw(*frag *acc *accFile)]); 173 173 # print Data::Dumper->Dump([$frag, $acc, $accFile ], [qw(*frag *acc *accFile)]);
# exit; 174 174 # exit;
175 175
176 176
#Sequence for full protein 177 177 #Sequence for full protein
my $accSeq = (-f $accFile)? $accFile : "$outdir/${acc}.faa"; 178 178 my $accSeq = (-f $accFile)? $accFile : "$outdir/${acc}.faa";
179 179
180 180
#Save fragment to file 181 181 #Save fragment to file
my $tmpFile = "$outdir/${acc}_frag.faa"; 182 182 my $tmpFile = "$outdir/${acc}_frag.faa";
183 183
184 184
if (-f $frag) { 185 185 if (-f $frag) {
system "mv $frag $tmpFile" unless (-f $tmpFile); 186 186 system "mv $frag $tmpFile" unless (-f $tmpFile);
} 187 187 }
else { 188 188 else {
unless (-f $tmpFile) { 189 189 unless (-f $tmpFile) {
open (my $fh, ">", $tmpFile) || die $!; 190 190 open (my $fh, ">", $tmpFile) || die $!;
print $fh ">${acc} fragment\n$frag\n"; 191 191 print $fh ">${acc} fragment\n$frag\n";
close $fh; 192 192 close $fh;
} 193 193 }
} 194 194 }
195 195
unless (-f $accSeq) { 196 196 unless (-f $accSeq) {
#Blast DB to be used 197 197 #Blast DB to be used
my $db = ($blastdb)? $blastdb : 'nr'; 198 198 my $db = ($blastdb)? $blastdb : 'nr';
199 199
my $cmd = qq(blastdbcmd -db $db -entry $acc -target_only -outfmt '\%f' -out $accSeq); 200 200 my $cmd = qq(blastdbcmd -db $db -entry $acc -target_only -outfmt '\%f' -out $accSeq);
system $cmd; 201 201 system $cmd;
202 202
#Remove the version and annotations from the sequence file 203 203 #Remove the version and annotations from the sequence file
my $cmd2 = qq(perl -i.bkp -pe 's/^\\>(\\w+)\.\*/\\>\$1/;' $accSeq); 204 204 my $cmd2 = qq(perl -i.bkp -pe 's/^\\>(\\w+)\.\*/\\>\$1/;' $accSeq);
system $cmd2 unless (-f "${accSeq}.pkp"); 205 205 system $cmd2 unless (-f "${accSeq}.pkp");
} 206 206 }
207 207
#Return files to be aligned 208 208 #Return files to be aligned
return [$tmpFile, $accSeq]; 209 209 return [$tmpFile, $accSeq];
} 210 210 }
211 211
212 212
213 213
214 214
#=========================================================================== 215 215 #===========================================================================
#Read command line and print help 216 216 #Read command line and print help
217 217
218 218
sub read_command_line { 219 219 sub read_command_line {
220 220
print_help() unless (@ARGV); 221 221 print_help() unless (@ARGV);
222 222
my $status = GetOptions( 223 223 my $status = GetOptions(
"i|acc-file=s" => \&read_accFile, 224 224 "i|acc-file=s" => \&read_accFile,
"a|accession=s" => \&read_accession, 225 225 "a|accession=s" => \&read_accession,
"f|fragment=s" => \&read_fragment, 226 226 "f|fragment=s" => \&read_fragment,
"o|outdir=s" => \$outdir, 227 227 "o|outdir=s" => \$outdir,
"bdb|blastdb=s" => \&read_blastdb, 228 228 "bdb|blastdb=s" => \&read_blastdb,
"e|evalue=f" => \$evalue, 229 229 "e|evalue=f" => \$evalue,
"m|sub-matrix=s" => \&read_subMatrix, 230 230 "m|sub-matrix=s" => \&read_subMatrix,
"t|interactive!" => \$interactive, 231 231 "t|interactive!" => \$interactive,
"q|quiet!" => \$quiet, 232 232 "q|quiet!" => \$quiet,
"h|help" => sub { print_help(); }, 233 233 "h|help" => sub { print_help(); },
"<>" => sub { die "Error: Unknown argument: $_[0]\n"; }); 234 234 "<>" => sub { die "Error: Unknown argument: $_[0]\n"; });
exit unless ($status); 235 235 exit unless ($status);
236 236
237 237
die "Error: option -f is mandatory." unless ($fragment); 238 238 die "Error: option -f is mandatory." unless ($fragment);
die "Error: options -i or -a are mandatory." unless ($accession || $accFile); 239 239 die "Error: options -i or -a are mandatory." unless ($accession || $accFile);
die "Error: flags -t and -q cannot be set at the same time!" if ($quiet && $interactive); 240 240 die "Error: flags -t and -q cannot be set at the same time!" if ($quiet && $interactive);
241 241
#Default value for output directory 242 242 #Default value for output directory
$outdir = "." unless ($outdir); 243 243 $outdir = "." unless ($outdir);
system "mkdir -p $outdir" unless (-d $outdir); 244 244 system "mkdir -p $outdir" unless (-d $outdir);
245 245
tmsRepeat.pl View file @ 68b41f1
#!/usr/bin/env perl -w 1 1 #!/usr/bin/env perl
2 2
use warnings; 3 3 no warnings;
use strict; 4 4 use strict;
use Data::Dumper; 5 5 use Data::Dumper;
6 6
$Data::Dumper::Deepcopy = 1; 7 7 use TCDB::Repeats;
$Data::Dumper::Indent = 1; 8
#$Data::Dumper::Purity = 0; 9
$Data::Dumper::Sortkeys = 1; 10
11
use Getopt::Long; 12 8 use Getopt::Long;
use Bio::SearchIO; 13
use Bio::SeqIO; 14
15 9
16 10
use TCDB::CheckDependencies; 17 11 my $seqsDir = undef; #'/Users/amedrano/Desktop/Mai_tmsRepeat/sequences';
use TCDB::Assorted; 18 12 my $seqsFile = undef;
13 my $tmsFile = undef; #'/Users/amedrano/Desktop/Mai_tmsRepeat/tms.hmmtop';
14 my $outDir = "Repeats"; #'/Users/amedrano/Desktop/Mai_tmsRepeat/RepeatUnits/ResultsOOP';
19 15
16 my $evalue = 1e-2;
17 my $coverage = 0.85;
18 my $identity = 0.2;
20 19
20 my @tmsRanges = ();
21 21
#========================================================================== 22 22 read_command_line();
#Check dependencies 23
24 23
my @dependencies = ("water", "ssearch36", "extractFamily.pl", "tmsplit", "quod.py"); 25
my $CheckDep_obj = new TCDB::CheckDependencies(); 26
$CheckDep_obj -> dependencies_list(\@dependencies); 27
$CheckDep_obj -> checkDependencies; 28
29 24
30 25 #print Data::Dumper->Dump([$seqsDir, $seqsFile, $tmsFile, $outDir],
31 26 # [qw(*seqsDir *seqsFile *tmsFile *outDir )]);
#========================================================================== 32
#Read command line options 33
34
my $gs_infile = ""; 35
my $infileFmt = "hmmtop"; #The other option is 'tms' which is the ID and TMS 36
my $gs_idFormat = ""; 37
my $gs_repUnit = 0; 38
my $gs_seqDir = ""; 39
my $gs_tail = 5; 40
my $gs_evalue = 0.1; 41
my $gs_coverage = 0.8; 42
my $gs_identity = 0.25; 43
my $gsatShuffles = 1000; 44
my $min_gsat_score = 4.0; 45
46
my $compStatsFlag = 1; 47
my $compStats = ""; 48
my $outdir = "repeats"; 49
my $repDir = "reports"; 50
my $seqDir = "sequences"; 51
my $alignDir = "alignments"; 52
my $plotsDir = "plots"; 53
my $goodHitsOnly = 1; #print only significant results, ignore everything else 54
55
56
#all (all sequences in output file) 57
#each (generate one directory per sequence.. for better organization) 58
#debug (it will print the contents of the hash table one sequences at a time) 59
my $mode = "all"; 60
61
read_command_line_arguments(); 62
63
#print Data::Dumper->Dump([$gs_infile, $gs_idFormat, $gs_repUnit, $gs_seqDir, 64
# $gs_tail, $gs_evalue, $gs_coverage, $gs_identity, $gsatShuffles, $compStatsFlag, $compStats], 65
# [qw(*infile *idFormat *repUnit *seqDir *tail *evalue 66
# *coverage *identity $gsatShuffles *compStatFlag *compStats)]); 67
#exit; 68 27 #exit;
69 28
# ssearch36 -p -k 1000 -z 11 -E 1.0 -s BL62 -W 0 4.B.1_4tms_all/sequences/4.B.1.1.2-Q4QLL1_bundle1.faa 4.B.1_4tms_all/sequences/lib_4.B.1.1.2-Q4QLL1_bundle1.faa 70
71 29
30 #my $repObj = TCDB::Repeat->new('seqsDir' => $seqsDir,
31 # 'tmsFile' => $tmsFile,
32 # 'outDir' => $outDir,
33 # 'ranges2searchTMS' => \@TMSranges);
72 34
#========================================================================== 73
#Read file with coordinates of TMSs and verify that the sequences are 74
#available 75
76 35
my %gh_tms = (); 77 36 my @TMSranges = ([1, 3], [4, 6]);
78 37
read_tms_coordinates_file($gs_infile, \%gh_tms); 79 38 my $repObj = TCDB::Repeat->new();
80 39
#print Data::Dumper->Dump([ \%gh_tms], [qw(*tms )]); 81 40 #$repObj->tmsFile($tmsFile);
#exit; 82 41 #$repObj->seqsDir($seqsDir);
42 $repObj->seqsFile($seqsFile);
43 $repObj->outDir($outDir);
44 $repObj->evalueCutoff($evalue);
45 $repObj->identityCutoff($identity);
46 $repObj->coverageCutoff($coverage);
47 $repObj->TMSranges2search(\@TMSranges);
83 48
49 $repObj-> findRepeatsTMSranges();
84 50
#=========================================================================== 85 51 #print Data::Dumper->Dump([$repObj ], [qw(*repObj)]);
#Main Output directory 86
87 52
#Root directory for all results 88
system "mkdir -p $outdir" unless (-d $outdir); 89
die "Could not generate output directory: $outdir" unless (-d $outdir); 90
91 53
92 54
93 55
#========================================================================== 94
#Search for repeats inside query sequences 95
96 56
my %results = (); 97
my %origSeqLength = (); #To calculate x-ticks spacing in hydropathy plots 98
99
foreach my $ls_sid (keys %gh_tms) { 100
101
my %gh_bundleSeqs = (); 102
my %gh_topHits = (); 103
104
105
print "Processing: $ls_sid\n"; 106
107
108
#Clean results if one output directory is generated per input sequence 109
%results = () if ($mode eq 'each'); 110
111
112
#Cut sequences in non overlaping regions with as many TMS as the 113
#repeat unit we want to find. 114
cut_seq_in_tms_regions ($ls_sid, $gs_repUnit, \%gh_tms, \%gh_bundleSeqs); 115
116
117
# print Data::Dumper->Dump([\%gh_bundleSeqs ], [qw(*bundleSeqs)]); 118
# <STDIN>; 119
120
121
#run ssearch to find potential repeats. 122
align_bundles($ls_sid,\%gh_bundleSeqs, \%gh_topHits); 123
124
125
# print Data::Dumper->Dump([\%gh_topHits ], [qw(*topHits )]); 126
# <STDIN>; 127
128
129
#Collect results for final table 130
$results{$ls_sid} = \%gh_topHits; 131
132
#present results per input sequence to verify everything looks fine. 133
if ($mode eq 'debug') { 134
print Data::Dumper->Dump([\%gh_topHits], [qw(*topHits)]); 135
<STDIN>; 136
} 137
138
print_reports(\%results) if ($mode eq 'each'); 139
} 140
141
142
143
144
145
#=========================================================================== 146 57 #===========================================================================
#Print final results in summarized or detailed format 147 58 #Read command line and print help
148 59
#print Data::Dumper->Dump([\%results ], [qw(*results )]); 149
#<STDIN>; 150
151 60
print_reports(\%results) if ($mode eq 'all'); 152 61 sub read_command_line {
153 62
63 print_help() unless (@ARGV);
154 64
65 my $status = GetOptions(
66 "s|seqs-file=s" => \&read_seqsFile,
67 "d|seqs-dir=s" => \&read_seqsDir,
68 "o|outdir=s" => \&read_outdir,
69 "t|tms=s" => \&read_tmsFile,
70 "e|evalue=f" => \$evalue,
71 "i|identity=f" => \$identity,
72 "c|coverage=f" => \$coverage,
73 "h|help" => sub { print_help(); },
74 "<>" => sub { die "Error: Unknown argument: $_[0]\n"; });
75 exit unless ($status);
155 76
156 77
157 78 #Validadte input file option
########################################################################### 158 79 die "Error: no sequence file detected!" unless ($seqsFile);
# # 159
# Subroutine definition # 160
# # 161
########################################################################### 162
163
164
#print final_report 165
166
sub print_reports { 167
168
my $res = shift; 169
170
171
#Get the directory where reports will be saved 172
my $reportDir = undef; 173
if ($mode eq 'all') { 174
$reportDir = getReportsDir(); 175
} 176
else { 177
178
#one id per report 179
my @ids = keys %$res; 180
my $seqId = $ids[0]; 181
182
$reportDir = getReportsDir($seqId); 183
} 184
die "Error: invalid report dir" unless ($reportDir); 185
186
187
my $sumFile = "$reportDir/repeats_summary_report.txt"; 188
my $detailsFile = "$reportDir/repeats_detailed_report.txt"; 189
my $htmlFile = "$reportDir/report.html"; 190
191
192
open (my $htmlfh, ">", $htmlFile) || die $!; 193
194
my $htmlHeader = <<HEADER; 195
<!DOCTYPE html> 196
<html xmlns="http://www.w3.org/1999/xhtml"> 197
<head> 198
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> 199
200
<style type="text/css"> 201
202
.label { 203
text-align: right; 204
width: 50px; 205
} 206 80 }
207 81
.data { 208
text-align: left; 209
padding-left: 8px; 210
width: 100px; 211
} 212
213 82
.uline { 214
text-decoration: underline; 215
} 216
217
.seq { 218
border: 2px solid black; 219
height: 70px; 220
width: 100%; 221
overflow-x: auto; 222
overflow-y: hidden; 223
margin: 1em 0; 224
background: gray; 225
color: white; 226
} 227
228
img { 229
display: block; 230
margin-left: auto; 231
margin-right: auto; 232
height: 250px; 233
width: auto; 234
max-width: 1500px; 235
max-height: 300px; 236
} 237
238
</style> 239
<title>Inferring repeats of $gs_repUnit TMS</title> 240
</head> 241
<br /> 242
<h1 style='text-align:center'>Inferred Repeats Based On ${gs_repUnit}-TMS Bundles</h1> 243
<body> 244
245
HEADER 246
247
print $htmlfh $htmlHeader; 248
open (my $sumh, ">", $sumFile) || die $!; 249
open (my $deth, ">", $detailsFile) || die $!; 250
251
252
#Header for summary table 253
print $sumh "#Accession\tQ_bundle\tS_bundle\tQ_len\tS_len\tE-value\tIdentity\tGSAT\tAln_len\tQ_cov\tS_cov\n"; 254
255
256
# print Data::Dumper->Dump([$res ], [qw(*res )]); 257
# <STDIN>; 258
259
260
P:foreach my $id (sort {$a cmp $b} keys %$res) { 261
262
#Jump to next result if there are NO hits for this protein and 263
#ONLY good hits are going to be recorded. 264
unless (%{ $res->{$id} }) { 265
next P if ($goodHitsOnly); 266
} 267
268
269
print $deth "===========================================================================\n"; 270
print $htmlfh " <br /><hr style=\"border-style:solid; border-width:5px; color:black;\"/>\n"; 271
272
#There must be results to continue 273
unless (%{ $res->{$id} }) { 274
print $sumh "$id\tNo_hits\n"; 275
print $deth "$id\tNo_hits\n\n\n"; 276
print $htmlfh " <h2 style=\"text-align:center;\">$id</h2>\n <p><b>No candidate repeats found</b></p>\n"; 277
} 278
279
print $deth "$id\n\n"; 280
print $htmlfh " <h2 style=\"text-align:center;\">$id</h2>\n"; 281
282
283
284
285
#get the long bundle names 286
BS:foreach my $bundleName (sort {$a cmp $b} keys %{ $res->{$id} }) { 287
288
BN:foreach my $bundleNumber (sort {$a <=> $b} keys %{ $res->{$id}->{$bundleName} }) { 289
290
my $qName = $res->{$id}->{$bundleName}->{$bundleNumber}->{qName}; 291
my $qLen = $res->{$id}->{$bundleName}->{$bundleNumber}->{qLen}; 292
293
#Each of the hits for this bundle 294
my @hits_tmp = @{ $res->{$id}->{$bundleName}->{$bundleNumber}->{hits} }; 295
296
#To get rid of a warning when there is only one hit. 297
my @hits = (scalar (@hits_tmp) > 1)? 298
sort {$a->{hName} cmp $b->{hName}} @hits_tmp : @hits_tmp; 299
300
foreach my $hit (@hits) { 301
302
my $hName = $hit->{hName}; 303
my $hLen = $hit->{hLen}; 304
305
my $evalue = sprintf ("%.1e", $hit->{hEvalue}); 306
my $ident = sprintf ("%.1f", $hit->{hId} * 100); 307
my $sim = sprintf ("%.1f", $hit->{hSim} * 100); 308
my $gsat = sprintf ("%.1f", $hit->{gsat}); 309
310
my $alnLen = $hit->{alnLen}; 311
my $qCov = sprintf("%.1f", $hit->{qCov} * 100); 312
my $hCov = sprintf("%.1f", $hit->{hCov} * 100); 313
314
315
#The alignment 316
my $qstart = $hit->{qstart}; 317
my $qend = $hit->{qend}; 318
my $sstart = $hit->{sstart}; 319
my $send = $hit->{send}; 320
my $qSeq = $hit->{qSeq}; 321
my $homStr = $hit->{homStr}; 322
my $sSeq = $hit->{sSeq}; 323
324
my $plot = $hit->{plot}; 325
326
#For summary tab-delimitedfile (everything except the alignment) 327
print $sumh "$id\t$qName\t$hName\t$qLen\t$hLen\t$evalue\t$ident\t$gsat\t$alnLen\t$qCov\t$hCov\n"; 328
329
330
#Detailed report that includes the alignment 331
print $deth "----------\n"; 332
print $deth "$qName ($qLen) vs $hName ($hLen)\n\n"; 333
print $deth "E-value: $evalue Identity: ${ident}% GSAT: $gsat\n"; 334
print $deth "Q_cov: ${qCov}% S_cov: ${hCov}% Aln_length: $alnLen\n\n"; 335
print $deth "Alignment ($qName|${qstart}-$qend vs $hName|${sstart}-$send):\n$qSeq\n$homStr\n$sSeq\n\n\n"; 336
337
338
#The HTML report (includes alignment and hydropathy image 339
my $repHit = <<HIT; 340
341
<p><b>$qName ($qLen) vs $hName ($hLen)</b></p> 342
343
<table width="600px" border="0" cellspacing="0" cellpadding="2"> 344
<tr> 345
<td class='label'><b>E-value:</b></td> 346
<td class='data'>$evalue</td> 347
<td class='label'><b>Identity:</b></td> 348
<td class='data'>${ident}%</td> 349
<td class='label'><b>Similarity:</b></td> 350
<td class='data'>${sim}%</td> 351
<td class='label'><b>GSAT:</b></td> 352
<td class='data'>$gsat</td> 353
</tr> 354
<tr> 355
<td class='label'><b>Aln:</b></td> 356
<td class='data'>$alnLen</td> 357
<td class='label'><b>Q_cov:</b></td> 358
<td class='data'>${qCov}%</td> 359
<td class='label'><b>S_cov:</b></td> 360
<td class='data'>${hCov}%</td> 361
<td class='label'></td> 362
<td class='data'></td> 363
</tr> 364
</table> 365
366
<p><b>Alignment (</b>$qName:<b class="uline">${qstart}-$qend</b> vs $hName:<b class="uline">${sstart}-$send</b><b>):</b></p> 367
<div class='seq'> 368
<pre> 369
$qSeq 370
$homStr 371
$sSeq 372
</pre> 373
</div> 374
<a href="$plot" target="_blank"><img src="$plot"/></a> 375
<br /> 376
<hr /> 377
378
HIT 379
380
print $htmlfh $repHit; 381
382
} #hit 383
} #reference bundle number 384
} #Reference bundle name 385
} #Query protein 386
387
#Close HTML report 388
my $closeRep = <<CLOSE; 389
</body> 390
</html> 391
CLOSE 392
393
print $htmlfh $closeRep; 394
395
close $sumh; 396
close $deth; 397
close $htmlfh; 398
} 399
400
401
402
#========================================================================== 403 83 #==========================================================================
#Run ssearch36 between the different bundles in a sequence 404 84 #Option -s
405 85
sub align_bundles { 406 86 sub read_seqsFile {
87 my ($opt, $value) = @_;
407 88
my ($seqId, $lhr_bundleSeqFiles, $lhr_topHits) = @_; 408 89 unless (-f $value && !(-z $value)) {
409 90 die "Error: file with sequences does not exist or is empty!\n";
%$lhr_topHits = (); 410
411
#Directory where the sequences of TMS bundles are saved 412
my $sequencesDir = undef; 413
my $alignmentsDir = undef; 414
my $hydroPlotsDir = undef; 415
416
if ($mode eq 'all') { 417
$sequencesDir = getSequencesDir(); 418
$alignmentsDir = getAlignmentsDir(); 419
$hydroPlotsDir = getPlotsDir(); 420
} 421 91 }
else { 422
$sequencesDir = getSequencesDir($seqId); 423
$alignmentsDir = getAlignmentsDir($seqId); 424
$hydroPlotsDir = getPlotsDir($seqId); 425
} 426
die "Error: invalid sequences dir" unless ($sequencesDir); 427
die "Error: invalid alignments dir" unless ($alignmentsDir); 428
die "Error: invalid plots dir" unless ($hydroPlotsDir); 429
430 92
431 93 $seqsFile = $value;
# print Data::Dumper->Dump([$lhr_bundleSeqFiles ], [qw(*files )]); 432
# <STDIN>; 433
434
435
#The bundle that will be used as reference for the comparison 436
REF:foreach my $bundle (sort {$a <=> $b} keys %$lhr_bundleSeqFiles) { 437
438
my $rFile = "$sequencesDir/" . $lhr_bundleSeqFiles->{$bundle}->[0]; 439
440
441
#Id to name ssearch36 output files 442
my $id = $lhr_bundleSeqFiles->{$bundle}->[0]; 443
$id =~ s/\.faa//; 444
445
446
#For naming GSAT files (ID of system or protein accession) 447
my $tcAcc = ($id =~ /(\S+)_bundle.*/)? $1 : undef; 448
die "Could not extract accession from $id!" unless ($id); 449
450
451
# print Data::Dumper->Dump([$id, $tcAcc ], [qw(*id *tcAcc)]); 452
# <STDIN>; 453
454
455
#-------------------------------------------------------------------- 456
#Get the non-overlapping bundles to compare them against the 457
#reference bundle 458
459
my @cmpFiles = (); 460
461
#Initialize the index to the first non-overlapping bundle 462
my $next_bundle_idx = $bundle + $gs_repUnit; 463
464
CMP:while (1) { 465
466
#Exit if next bundle is not in bundles hash 467
last CMP unless (exists $lhr_bundleSeqFiles->{$next_bundle_idx}); 468
469
#Get file name for this non-overlapping bundle 470
my $cmpBundle = $sequencesDir . "/" . $lhr_bundleSeqFiles->{$next_bundle_idx}->[0]; 471
push (@cmpFiles, $cmpBundle); 472
473
#Update the index to the next non-overlapping bundle 474
$next_bundle_idx = $next_bundle_idx + $gs_repUnit; 475
} 476
477
#go to next reference bundle if there are no non-overlapping bundles. 478
next REF unless (@cmpFiles); 479
480
481
# print Data::Dumper->Dump([\@cmpFiles ], [qw(*cmpFiles )]); 482
# <STDIN>; 483
484
485
#-------------------------------------------------------------------- 486
#Now run ssearch36 of the reference bundle against all its 487
#non-overlapping bundles 488
489
#put all non-overlapping bundles into a file 490
my $libFile = "$sequencesDir/lib_$id.faa"; 491
my $cmd = "cat " . join(" ", @cmpFiles) . " > $libFile"; 492
system $cmd; 493
494
495
#run ssearch36 of $rFile vs @cmpFile 496
my $ssearchOut = "$alignmentsDir/ssearch_$id.out"; 497
my $ssearch_params = qq(-p $compStats -E $gs_evalue -s BL62 -W 0 $rFile $libFile > $ssearchOut); 498
system "ssearch36 $ssearch_params" unless (-f $ssearchOut); 499
500
501
# print Data::Dumper->Dump([$ssearchOut ], [qw(*ssearchOut )]); 502
# <STDIN>; 503
504
505
#--------------------------------------------------------------------- 506
#Estimate here the spacing between x-ticks for hydropathy plots 507
508
my $protLen = $origSeqLength{$seqId}; 509
510
my $xticksSpacing = undef; 511
if ($protLen <= 500) { 512
$xticksSpacing = 25; 513
} 514
elsif ($protLen <= 1000) { 515
$xticksSpacing = 50; 516
} 517
else { 518
$xticksSpacing = 100; 519
} 520
521
522
523
#-------------------------------------------------------------------- 524
#parse ssearch36 output. For BioPerl resouces check: 525
#http://search.cpan.org/dist/BioPerl/Bio/SearchIO.pm 526
#https://classes.soe.ucsc.edu/bme060/Winter07/bptutorial.html 527
528
my $parser = new Bio::SearchIO (-format => 'fasta', -file => $ssearchOut); 529
530
531
#put hir the top hits 532
my %lh_hits = (); 533
534
535
while (my $result = $parser->next_result) { 536
537
538
my $qLen = $result->query_length; 539
$lh_hits{$bundle}{qName} = $result->query_name; 540
$lh_hits{$bundle}{qLen} = $qLen; 541
$lh_hits{$bundle}{hits} = []; 542
543
544
HIT:while (my $hit = $result->next_hit) { 545
546
HSP:while(my $hsp = $hit->next_hsp) { 547
548
549
# print Data::Dumper->Dump([$hsp ], [qw(*hsp )]); 550
# <STDIN>; 551
552
553
my %tmp = (); 554
555
my $alnLen = $hsp->hsp_length; 556
my $hLen = $hit->length; 557
my $hEvalue = $hsp->evalue; 558
my $hId = $hsp->frac_identical('total'); #identity in the alignment 559
my $hSim = $hsp->frac_conserved('total'); #similarity in the alignment 560
561
562
#coordinates in the alignment to properly calculate coverages 563
my $qstart = $hsp->start('query'); 564
my $qend = $hsp->end('query'); 565
my $sstart = $hsp->start('subject'); 566
my $send = $hsp->end('subject'); 567
568
569
#Calculate coverages properly (do not use alignment length as it includes gaps 570
571
my $qCov_tmp = ($qend - $qstart + 1) / $qLen; 572
my $qCov = ($qCov_tmp > 1.0)? 1.0 : $qCov_tmp; 573
574
my $hCov_tmp = ($send - $sstart + 1) / $hLen; 575
my $hCov = ($hCov_tmp > 1.0)? 1.0 : $hCov_tmp; 576
577
578
# print Data::Dumper->Dump([$qLen, $qCov, $hLen, $hCov, $gs_coverage, $hEvalue, $gs_evalue, $hId, $gs_identity], 579
# [qw(*qLen *qCov $hLen *hCov *coverageCutoff *evalue *evalCutoff *hId *IDcutoff)]); 580
# <STDIN>; 581
582
583
#Before storing hit results check minimum coverage, identity and evalue 584
next HSP unless (($qCov >= $gs_coverage || $hCov >= $gs_coverage) && 585
($hEvalue <= $gs_evalue) && ($hId >= $gs_identity)); 586
587
588
#hit identity 589
$tmp{hName} = $hit->name; 590
$tmp{hLen} = $hLen; 591
592
593
#hit statistics 594
$tmp{alnLen} = $alnLen; 595
$tmp{hEvalue} = $hEvalue; 596
$tmp{hId} = $hId; 597
$tmp{hSim} = $hSim; 598
$tmp{qCov} = $qCov; 599
$tmp{hCov} = $hCov; 600
601
602
#The alignment 603
$tmp{qstart} = $qstart; 604
$tmp{qend} = $qend; 605
$tmp{sstart} = $sstart; 606
$tmp{send} = $send; 607
608
$tmp{qSeq} = $hsp->query_string; 609
$tmp{sSeq} = $hsp->hit_string; 610
$tmp{homStr} = $hsp->homology_string; 611
612
613
#Get the GSAT score 614
my $gsat_outFile = "$alignmentsDir/${tcAcc}_" . $lh_hits{$bundle}{qName} . "_vs_" . $tmp{hName} . ".gsat"; 615
616
617
# print "gsat.py $tmp{qSeq} $tmp{sSeq} $gsatShuffles > $gsat_outFile\n"; 618
# exit; 619
620
system "gsat.py $tmp{qSeq} $tmp{sSeq} $gsatShuffles > $gsat_outFile" unless (-f $gsat_outFile); 621
622
my $gsat_score = TCDB::Assorted::get_gsat_score ($gsat_outFile); 623
$tmp{gsat} = $gsat_score; 624
625
626
# print Data::Dumper->Dump([\%tmp ], [qw(*matchData )]); 627
# <STDIN>; 628
629
630
#GSAT is the last filter 631
next HSP unless ($gsat_score >= $min_gsat_score); 632
633
#------------------------------------------------------------ 634
#Generate quod plot with the repeat 635
636
my $whole_prot_seq = "$gs_seqDir/${seqId}.faa"; 637
die "Protein sequence not found: $whole_prot_seq" unless (-f $whole_prot_seq); 638
639
640
my $plotFile = "$hydroPlotsDir/${seqId}_" . $lh_hits{$bundle}{qName} . "_vs_" . $tmp{hName}; 641
my $fileName = "../$plotsDir/${seqId}_" . $lh_hits{$bundle}{qName} . "_vs_" . $tmp{hName} . ".png"; 642
my $plotTitle = $lh_hits{$bundle}{qName} . " vs " . $tmp{hName}; 643
644
#Get hydrophobic peaks coords 645
my $hydroPeaks = $gh_tms{$seqId}; 646
die "No hydrophobic peaks found for sequence: $seqId" unless (@{ $hydroPeaks }); 647
648
649
#format the hydrophobic peaks for quod 650
my @peaks = map { join ("-", @$_) . ":orange" } @$hydroPeaks; 651
my $pstring = join (" ", @peaks); 652
653
654
#---------- 655
#Calculate the positions of the aligned section of each bundle in the full sequence. 656
657
my $q_bid = ($lh_hits{$bundle}{qName} =~ /BDL(\d+)/)? $1 : undef; 658
my $s_bid = ( $tmp{hName} =~ /BDL(\d+)/)? $1 : undef; 659
die "Could not extract bundle number for: $lh_hits{$bundle}{qName} or $tmp{hName}" unless ($q_bid && $s_bid); 660
661
662
#extract initial positions for both bundles 663
my $qbstart = $lhr_bundleSeqFiles->{$q_bid}->[1]; 664
my $qbend = $lhr_bundleSeqFiles->{$q_bid}->[2]; #$qLen - 1; 665
my $sbstart = $lhr_bundleSeqFiles->{$s_bid}->[1]; 666
my $sbend = $lhr_bundleSeqFiles->{$s_bid}->[2]; #$hLen - 1; 667
die "Could not extract coords for bundle $q_bid" unless ($qbstart && $qbend); 668
die "Could not extract coords for bundle $s_bid" unless ($sbstart && $sbend); 669
670
671
#Calculate bundle positions here 672
my $qgp_start = $qbstart + ($qstart - 1); 673
my $qgp_end = $qbstart + ($qend - 1); 674
675
my $sgp_start = $sbstart + ($sstart - 1); 676
my $sgp_end = $sbstart + ($send - 1); 677
678
679
#Format the coordinates for the repeats now 680
my $qrep = "${qgp_start}-${qgp_end}:green"; 681
my $srep = "${sgp_start}-${sgp_end}:blue"; 682
683
#Format the coordinates for the bar delimiting the bundles 684
my $bars = "-w ${qbstart}-${qbend}::1 ${sbstart}-${sbend}::1"; 685
686
#The quod command line 687
my $cmd = "quod.py $whole_prot_seq -t png -l '$plotTitle' -o $plotFile -q -r 80 $bars --xticks $xticksSpacing -nt +0 -at ${pstring} ${qrep} ${srep}"; 688
689
my $img = "${plotFile}.png"; 690
system $cmd unless (-f $img); 691
die "Could not generate plot: $img" unless (-f $img); 692
693
$tmp{plot} = $fileName; 694
695
696
#load the data into the hits section for this bundle 697
push (@{ $lh_hits{$bundle}{hits} }, \%tmp); 698
699
700
} #HSP 701
} #HIT 702
} #While 703
704
705
#Add results to the topHits hash 706
if (@{ $lh_hits{$bundle}{hits} }) { 707
$lhr_topHits->{$id} = \%lh_hits; 708
} 709
710
} 711
} 712 94 }
713 95
714 96
715
716
#========================================================================== 717 97 #==========================================================================
#Given a sequence, its TMS coordinates and a repeat size (rsize), cut the 718 98 #Option -t
#sequence in TMS bundles of length rsize. 719
720 99
100 sub read_tmsFile {
101 my ($opt, $value) = @_;
721 102
sub cut_seq_in_tms_regions { 722 103 unless (-f $value && !(-z $value)) {
723 104 die "Error in option -t: File with TMSs (hhmtop output) does not exist or is empty!\n";
my ($ls_pid, $ls_repeat, $lhr_tms, $lhr_seqSegs) = @_; 724
725
726
%$lhr_seqSegs = (); 727
728
729
#Get the directory where bundle sequences will be saved 730
my $sequencesDir = undef; 731
732
if ($mode eq 'all') { 733
$sequencesDir = getSequencesDir(); 734
} 735 105 }
else { 736
$sequencesDir = getSequencesDir($ls_pid); 737
} 738
die "Error: invalid sequence dir" unless ($sequencesDir); 739
740 106
741 107 $tmsFile = $value;
#---------------------------------------------------------------------- 742
#Get the coordinates of the overlapping bundles 743
744
my @la_tms = @{ $lhr_tms->{$ls_pid} }; 745
746
747
748
#Get the Length of the sequence of the query protein 749
my $seqFile = "$gs_seqDir/${ls_pid}.faa"; 750
my $obj = Bio::SeqIO->new(-file => $seqFile , -format => "fasta"); 751
my $seqObj = $obj->next_seq; 752
my $qlength = $seqObj->length; 753
die "Could not extract protein length." unless ($qlength); 754
755
#Store the length of the original sequence for proper calculation of 756
#the x-ticks in the hydropathy plots of the results 757
$origSeqLength{$ls_pid} = $qlength; 758
759
760
761
#Number of TMS in protein 762
my $ls_ntms = scalar (@la_tms); 763
764
765
766
for (my $idx=1; $idx <= ($ls_ntms - $ls_repeat + 1); $idx++) { 767
768
#TMS in bundle 769
my $left_tms = $la_tms[$idx - 1]; 770
my $right_tms = $la_tms[$idx + $ls_repeat - 2]; 771
772
773
#The coordinates of the bundle 774
my $left_pos = (($left_tms->[0] - $gs_tail) <= 0)? 1 : $left_tms->[0] - $gs_tail; 775
#my $right_pos = (($right_tms->[1] + $gs_tail) >= $qlength)? $right_tms->[1] : $right_tms->[1] + $gs_tail; 776
my $right_pos = (($right_tms->[1] + $gs_tail) >= $qlength)? $qlength - 1 : $right_tms->[1] + $gs_tail; 777
778
779
#Cut and name the bundles only if bundle file does not exist 780
my $outfile = "${ls_pid}_bundle${idx}"; 781
unless (-f "$sequencesDir/${outfile}.faa") { 782
783
#cutting bundle 784
my $args = qq(-if $seqFile -od $sequencesDir -of $outfile -rangeCut -s $left_pos -e $right_pos -t 0); 785
system "tmsplit $args > /dev/null"; 786
787
#replace protein ID with bundle number to the ID so alignments can be easily identified 788
system qq(perl -i -pe 's/>\\S+/>BDL$idx/' $sequencesDir/${outfile}.faa); 789
} 790
791
$lhr_seqSegs->{$idx} = ["${outfile}.faa", $left_pos, $right_pos]; 792
} 793
} 794 108 }
795 109
796 110
797
798
#========================================================================== 799 111 #==========================================================================
#Read file with the TMS coordinates of the input proteins. The TMS 800 112 #Option -d
#must have been validated with WHAT to make sure they are reliable. 801
802 113
114 sub read_seqsDir {
115 my ($opt, $value) = @_;
803 116
sub read_tms_coordinates_file { 804 117 die "Error: directory with sequences does not exist." unless (-d $value);
805 118
my ($s_coordsFile, $hr_tms) = @_; 806 119 $seqsDir = $value;
807
open (my $fileh, "<", $s_coordsFile) || die $!; 808
809
#----------------------------------------------------------------- 810
#The format of this file is protein ID followed by pairs of 811
#coordinates separated by dash: 812
# 2.A.43.1.1-O60931 1-20 25-35 50-68 .... 813
if ($infileFmt eq 'tms') { 814
815
while(<$fileh>) { 816
chomp; 817
818
#ignore empty lines; 819
next unless ($_); 820
821
#extract id and TMSs coordinates 822
my ($id, @tms_str) = split(/\s+/, $_); 823
my @tms = map { [ split(/-/, $_) ] } @tms_str; 824
825
826
#For debugging purposes 827
# next unless ($id eq 'WP_100644534'); 828
829
830
$hr_tms->{$id} = \@tms; 831
832
#Verify that the sequence is available for this protein 833