Commit 68b41f16a35757f498478ba2822e6978013608f2
1 parent
6342e60576
Exists in
master
update tmsRepeat.pl, alignSeqsFiles.pl and locateFragment.pl to work with python3 scripts
Showing 3 changed files with 128 additions and 1080 deletions Inline Diff
alignSeqsFiles.pl
View file @
68b41f1
#!/usr/bin/env perl -w | 1 | 1 | #!/usr/bin/env perl -w | |
2 | 2 | |||
use strict; | 3 | 3 | use strict; | |
use warnings; | 4 | 4 | use warnings; | |
use Data::Dumper; | 5 | 5 | use Data::Dumper; | |
6 | 6 | |||
$Data::Dumper::Deepcopy = 1; | 7 | 7 | $Data::Dumper::Deepcopy = 1; | |
8 | 8 | |||
use Getopt::Long; | 9 | 9 | use Getopt::Long; | |
use LWP; | 10 | 10 | use LWP; | |
use Bio::SeqIO; | 11 | 11 | use Bio::SeqIO; | |
use Bio::SearchIO; | 12 | 12 | use Bio::SearchIO; | |
13 | 13 | |||
#Local libraries | 14 | 14 | #Local libraries | |
use TCDB::CheckDependencies; | 15 | 15 | use TCDB::CheckDependencies; | |
use TCDB::Domain::PfamParser; | 16 | 16 | use TCDB::Domain::PfamParser; | |
use TCDB::Domain::Characterize; | 17 | 17 | use TCDB::Domain::Characterize; | |
use TCDB::Assorted; | 18 | 18 | use TCDB::Assorted; | |
19 | 19 | |||
20 | 20 | |||
########################################################################### | 21 | 21 | ########################################################################### | |
# | 22 | 22 | # | |
# Comapre two files with fasta sequences and report the alignment parameters | 23 | 23 | # Comapre two files with fasta sequences and report the alignment parameters | |
# Along with hydropathy plots and PFAM domains. | 24 | 24 | # Along with hydropathy plots and PFAM domains. | |
# | 25 | 25 | # | |
########################################################################### | 26 | 26 | ########################################################################### | |
27 | 27 | |||
#========================================================================== | 28 | 28 | #========================================================================== | |
#Check dependencies | 29 | 29 | #Check dependencies | |
30 | 30 | |||
my @dependencies = ("zgrep", "blastp", "ssearch36", "hmmtop", "blastdbcmd", | 31 | 31 | my @dependencies = ("zgrep", "blastp", "ssearch36", "hmmtop", "blastdbcmd", | |
"hmmscan"); | 32 | 32 | "hmmscan"); | |
my $CheckDep_obj = new TCDB::CheckDependencies(); | 33 | 33 | my $CheckDep_obj = new TCDB::CheckDependencies(); | |
$CheckDep_obj -> dependencies_list(\@dependencies); | 34 | 34 | $CheckDep_obj -> dependencies_list(\@dependencies); | |
$CheckDep_obj -> checkDependencies; | 35 | 35 | $CheckDep_obj -> checkDependencies; | |
36 | 36 | |||
37 | 37 | |||
#This will prevent quod and alnquod from going into interactive mode | 38 | 38 | #This will prevent quod and alnquod from going into interactive mode | |
$ENV{"MPLBACKEND"} = "agg"; | 39 | 39 | $ENV{"MPLBACKEND"} = "agg"; | |
40 | 40 | |||
41 | 41 | |||
#========================================================================== | 42 | 42 | #========================================================================== | |
#Read command line arguments | 43 | 43 | #Read command line arguments | |
44 | 44 | |||
my $qfile = ""; | 45 | 45 | my $qfile = ""; | |
my $qProt = ""; | 46 | 46 | my $qProt = ""; | |
my $sfile = ""; | 47 | 47 | my $sfile = ""; | |
my $sProt = ""; | 48 | 48 | my $sProt = ""; | |
my $qlabel = "Query"; | 49 | 49 | my $qlabel = "Query"; | |
my $slabel = "Subject"; | 50 | 50 | my $slabel = "Subject"; | |
my $outdir = ""; | 51 | 51 | my $outdir = ""; | |
my $prog = 'ssearch36'; #'blastp'; | 52 | 52 | my $prog = 'ssearch36'; #'blastp'; | |
my $evalue = 1e-4; | 53 | 53 | my $evalue = 1e-4; | |
my $identity = 20.0; | 54 | 54 | my $identity = 20.0; | |
my $coverage = 40.0; | 55 | 55 | my $coverage = 40.0; | |
my $covControl = "X"; | 56 | 56 | my $covControl = "X"; | |
my $blastComp = "F"; #2; | 57 | 57 | my $blastComp = "F"; #2; | |
my $segFilter = 'no'; | 58 | 58 | my $segFilter = 'no'; | |
my $minLength = 30; #Min legnth of proteins to analyze (without gaps) | 59 | 59 | my $minLength = 30; #Min legnth of proteins to analyze (without gaps) | |
my $subMatrix = 'BL50'; | 60 | 60 | my $subMatrix = 'BL50'; | |
61 | my $hyd_qylim = undef; #Y-axis limits for query hydropathy plot [low, high] | |||
62 | my $hyd_sylim = undef; #Y-axis limits for subject hydropathy plot [low, high] | |||
61 | 63 | |||
#this can be used to remove long sequences from results | 62 | 64 | #this can be used to remove long sequences from results | |
my $maxProtLength = 100000; #default threshold to allow any length | 63 | 65 | my $maxProtLength = 100000; #default threshold to allow any length | |
my $LengthControl = "N"; #same meaning as $covControl | 64 | 66 | my $LengthControl = "N"; #same meaning as $covControl | |
65 | 67 | |||
#internal directories | 66 | 68 | #internal directories | |
my $filesDir = ""; | 67 | 69 | my $filesDir = ""; | |
my $plotsDir = ""; | 68 | 70 | my $plotsDir = ""; | |
my $seqDir = ""; | 69 | 71 | my $seqDir = ""; | |
my $blastDir = ""; | 70 | 72 | my $blastDir = ""; | |
71 | 73 | |||
72 | 74 | |||
read_command_line(); | 73 | 75 | read_command_line(); | |
74 | 76 | |||
#print Data::Dumper->Dump([$qfile, $qProt, $sfile, $sProt, $qlabel, $slabel, $outdir, $prog, | 75 | 77 | #print Data::Dumper->Dump([$qfile, $qProt, $sfile, $sProt, $qlabel, $slabel, $outdir, $prog, | |
# $evalue, $coverage, $covControl, $blastComp, $segFilter, $maxProtLength, | 76 | 78 | # $evalue, $coverage, $covControl, $blastComp, $segFilter, $maxProtLength, | |
# $LengthControl], | 77 | 79 | # $LengthControl], | |
# [qw(*qfile *qProt *sfile *sProt *qlabel *slabel *outdir *prog | 78 | 80 | # [qw(*qfile *qProt *sfile *sProt *qlabel *slabel *outdir *prog | |
# *evalue *coverage *covControl *blastComp *segFilter *maxProtLength | 79 | 81 | # *evalue *coverage *covControl *blastComp *segFilter *maxProtLength | |
# *LengthControl)]); | 80 | 82 | # *LengthControl)]); | |
#exit; | 81 | 83 | #exit; | |
82 | 84 | |||
83 | 85 | |||
84 | 86 | |||
#========================================================================== | 85 | 87 | #========================================================================== | |
#Output files | 86 | 88 | #Output files | |
87 | 89 | |||
#The alignment file by blastp or ssearch36 | 88 | 90 | #The alignment file by blastp or ssearch36 | |
my $alnFile = "$filesDir/${prog}.out"; | 89 | 91 | my $alnFile = "$filesDir/${prog}.out"; | |
90 | 92 | |||
#The results of running hmmscan | 91 | 93 | #The results of running hmmscan | |
my $pfamFile = "$filesDir/hmmscan.out"; | 92 | 94 | my $pfamFile = "$filesDir/hmmscan.out"; | |
93 | 95 | |||
#The results from running hmmtop | 94 | 96 | #The results from running hmmtop | |
my $hmmtopFile = "$filesDir/hmmtop.out"; | 95 | 97 | my $hmmtopFile = "$filesDir/hmmtop.out"; | |
96 | 98 | |||
#The blast database to retrieve sequences for ploting | 97 | 99 | #The blast database to retrieve sequences for ploting | |
my $blastdb = "$blastDir/sequences"; | 98 | 100 | my $blastdb = "$blastDir/sequences"; | |
99 | 101 | |||
100 | 102 | |||
101 | 103 | |||
#========================================================================== | 102 | 104 | #========================================================================== | |
#Run the alignment first | 103 | 105 | #Run the alignment first | |
104 | 106 | |||
print "Running $prog and parsing output....\n"; | 105 | 107 | print "Running $prog and parsing output....\n"; | |
run_alignment(); | 106 | 108 | run_alignment(); | |
107 | 109 | |||
108 | 110 | |||
my @alnHits = (); | 109 | 111 | my @alnHits = (); | |
if ($prog eq 'blastp') { parse_blast(\@alnHits); } | 110 | 112 | if ($prog eq 'blastp') { parse_blast(\@alnHits); } | |
elsif ($prog eq 'ssearch36') { parse_ssearch(\@alnHits)} | 111 | 113 | elsif ($prog eq 'ssearch36') { parse_ssearch(\@alnHits)} | |
112 | 114 | |||
#print Data::Dumper->Dump([\@alnHits ], [qw($alnHits )]); | 113 | 115 | #print Data::Dumper->Dump([\@alnHits ], [qw($alnHits )]); | |
#exit; | 114 | 116 | #exit; | |
115 | 117 | |||
116 | 118 | |||
die "No significant blastHits found!\n" unless (@alnHits); | 117 | 119 | die "No significant blastHits found!\n" unless (@alnHits); | |
118 | 120 | |||
119 | 121 | |||
120 | 122 | |||
#========================================================================== | 121 | 123 | #========================================================================== | |
#Run pfam (get clans, hmmtop, and parse results | 122 | 124 | #Run pfam (get clans, hmmtop, and parse results | |
123 | 125 | |||
my %pfamHits = (); | 124 | 126 | my %pfamHits = (); | |
my %clans = (); | 125 | 127 | my %clans = (); | |
my %hmmtopHits = (); | 126 | 128 | my %hmmtopHits = (); | |
run_pfam_hmmtop(\%pfamHits, \%hmmtopHits,\%clans ); | 127 | 129 | run_pfam_hmmtop(\%pfamHits, \%hmmtopHits,\%clans ); | |
128 | 130 | |||
#print Data::Dumper->Dump([\%clans], [qw(*clans)]); | 129 | 131 | #print Data::Dumper->Dump([\%clans], [qw(*clans)]); | |
#exit; | 130 | 132 | #exit; | |
131 | 133 | |||
132 | 134 | |||
#========================================================================== | 133 | 135 | #========================================================================== | |
#Parse the alignment results to make sure there are signficant results, | 134 | 136 | #Parse the alignment results to make sure there are signficant results, | |
#get domains for significant hits and plot the corresponding hydropathies. | 135 | 137 | #get domains for significant hits and plot the corresponding hydropathies. | |
136 | 138 | |||
137 | 139 | |||
138 | 140 | |||
139 | 141 | |||
print "Geneating report...\n"; | 140 | 142 | print "Geneating report...\n"; | |
generate_report(); | 141 | 143 | generate_report(); | |
142 | 144 | |||
143 | 145 | |||
144 | 146 | |||
#========================================================================== | 145 | 147 | #========================================================================== | |
################ Subroutines definition beyond ths point ############## | 146 | 148 | ################ Subroutines definition beyond ths point ############## | |
#========================================================================== | 147 | 149 | #========================================================================== | |
148 | 150 | |||
149 | 151 | |||
#========================================================================== | 150 | 152 | #========================================================================== | |
#Generate output for significant hits | 151 | 153 | #Generate output for significant hits | |
152 | 154 | |||
sub generate_report { | 153 | 155 | sub generate_report { | |
154 | 156 | |||
155 | 157 | |||
#Prepare output files | 156 | 158 | #Prepare output files | |
my $htmlFile = "$outdir/report.html"; | 157 | 159 | my $htmlFile = "$outdir/report.html"; | |
my $plotsFile = "$outdir/plots.html"; | 158 | 160 | my $plotsFile = "$outdir/plots.html"; | |
159 | 161 | |||
160 | 162 | |||
my $htmlHeader = <<HEADER; | 161 | 163 | my $htmlHeader = <<HEADER; | |
<!DOCTYPE html> | 162 | 164 | <!DOCTYPE html> | |
<html xmlns="http://www.w3.org/1999/xhtml"> | 163 | 165 | <html xmlns="http://www.w3.org/1999/xhtml"> | |
<head> | 164 | 166 | <head> | |
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> | 165 | 167 | <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> | |
166 | 168 | |||
<style type="text/css"> | 167 | 169 | <style type="text/css"> | |
168 | 170 | |||
.label { | 169 | 171 | .label { | |
text-align: right; | 170 | 172 | text-align: right; | |
width: 50px; | 171 | 173 | width: 50px; | |
} | 172 | 174 | } | |
173 | 175 | |||
.data { | 174 | 176 | .data { | |
text-align: left; | 175 | 177 | text-align: left; | |
padding-left: 8px; | 176 | 178 | padding-left: 8px; | |
width: 100px; | 177 | 179 | width: 100px; | |
} | 178 | 180 | } | |
179 | 181 | |||
.uline { | 180 | 182 | .uline { | |
text-decoration: underline; | 181 | 183 | text-decoration: underline; | |
} | 182 | 184 | } | |
183 | 185 | |||
.pfam { | 184 | 186 | .pfam { | |
text-align: center; | 185 | 187 | text-align: center; | |
vertical-align: middle; | 186 | 188 | vertical-align: middle; | |
} | 187 | 189 | } | |
188 | 190 | |||
.seq { | 189 | 191 | .seq { | |
border: 2px solid black; | 190 | 192 | border: 2px solid black; | |
height: 70px; | 191 | 193 | height: 70px; | |
width: 100%; | 192 | 194 | width: 100%; | |
overflow-x: auto; | 193 | 195 | overflow-x: auto; | |
overflow-y: hidden; | 194 | 196 | overflow-y: hidden; | |
margin: 1em 0; | 195 | 197 | margin: 1em 0; | |
background: gray; | 196 | 198 | background: gray; | |
color: white; | 197 | 199 | color: white; | |
} | 198 | 200 | } | |
199 | 201 | |||
200 | 202 | |||
.dom { | 201 | 203 | .dom { | |
border: 2px solid black; | 202 | 204 | border: 2px solid black; | |
height: 100px; | 203 | 205 | height: 100px; | |
width: 100%; | 204 | 206 | width: 100%; | |
overflow-x: auto; | 205 | 207 | overflow-x: auto; | |
overflow-y: auto; | 206 | 208 | overflow-y: auto; | |
margin: 1em 0; | 207 | 209 | margin: 1em 0; | |
background: gray; | 208 | 210 | background: gray; | |
color: white; | 209 | 211 | color: white; | |
} | 210 | 212 | } | |
211 | 213 | |||
img { | 212 | 214 | img { | |
display: block; | 213 | 215 | display: block; | |
margin-left: auto; | 214 | 216 | margin-left: auto; | |
margin-right: auto; | 215 | 217 | margin-right: auto; | |
height: 250px; | 216 | 218 | height: 250px; | |
width: auto; | 217 | 219 | width: auto; | |
max-width: 1500px; | 218 | 220 | max-width: 1500px; | |
max-height: 300px; | 219 | 221 | max-height: 300px; | |
} | 220 | 222 | } | |
221 | 223 | |||
</style> | 222 | 224 | </style> | |
<title>$qlabel vs $slabel</title> | 223 | 225 | <title>$qlabel vs $slabel</title> | |
</head> | 224 | 226 | </head> | |
<br /> | 225 | 227 | <br /> | |
<h1 style='text-align:center'>$qlabel vs $slabel</h1> | 226 | 228 | <h1 style='text-align:center'>$qlabel vs $slabel</h1> | |
<body> | 227 | 229 | <body> | |
228 | 230 | |||
HEADER | 229 | 231 | HEADER | |
230 | 232 | |||
231 | 233 | |||
open (my $outh, ">", $htmlFile) || die $!; | 232 | 234 | open (my $outh, ">", $htmlFile) || die $!; | |
print $outh $htmlHeader; | 233 | 235 | print $outh $htmlHeader; | |
234 | 236 | |||
235 | 237 | |||
foreach my $hit (sort by_evalue @alnHits) { | 236 | 238 | foreach my $hit (sort by_evalue @alnHits) { | |
237 | 239 | |||
238 | 240 | |||
my $qacc = $hit->{qacc}; | 239 | 241 | my $qacc = $hit->{qacc}; | |
my $qlen = $hit->{qlen}; | 240 | 242 | my $qlen = $hit->{qlen}; | |
my $qseq = $hit->{qseq}; | 241 | 243 | my $qseq = $hit->{qseq}; | |
my $qcov = sprintf("%.1f", $hit->{qcov}); | 242 | 244 | my $qcov = sprintf("%.1f", $hit->{qcov}); | |
my $qstart = $hit->{qstart}; | 243 | 245 | my $qstart = $hit->{qstart}; | |
my $qend = $hit->{qend}; | 244 | 246 | my $qend = $hit->{qend}; | |
245 | 247 | |||
my $sacc = $hit->{sacc}; | 246 | 248 | my $sacc = $hit->{sacc}; | |
my $slen = $hit->{slen}; | 247 | 249 | my $slen = $hit->{slen}; | |
my $sseq = $hit->{sseq}; | 248 | 250 | my $sseq = $hit->{sseq}; | |
my $scov = sprintf("%.1f", $hit->{scov}); | 249 | 251 | my $scov = sprintf("%.1f", $hit->{scov}); | |
my $sstart = $hit->{sstart}; | 250 | 252 | my $sstart = $hit->{sstart}; | |
my $send = $hit->{send}; | 251 | 253 | my $send = $hit->{send}; | |
252 | 254 | |||
my $eval = sprintf ("%.1e", $hit->{evalue}); | 253 | 255 | my $eval = sprintf ("%.1e", $hit->{evalue}); | |
my $id = sprintf ("%.1f", $hit->{id}); | 254 | 256 | my $id = sprintf ("%.1f", $hit->{id}); | |
my $hstr = $hit->{hstr}; | 255 | 257 | my $hstr = $hit->{hstr}; | |
256 | 258 | |||
my $alnHit = <<HIT; | 257 | 259 | my $alnHit = <<HIT; | |
258 | 260 | |||
<br /><hr style=\"border-style:solid; border-width:5px; color:black;\"/> | 259 | 261 | <br /><hr style=\"border-style:solid; border-width:5px; color:black;\"/> | |
260 | 262 | |||
<p><b>$qacc ($qlen) vs $sacc ($slen)</b></p> | 261 | 263 | <p><b>$qacc ($qlen) vs $sacc ($slen)</b></p> | |
262 | 264 | |||
<table width="600px" border="0" cellspacing="0" cellpadding="2"> | 263 | 265 | <table width="600px" border="0" cellspacing="0" cellpadding="2"> | |
<tr> | 264 | 266 | <tr> | |
<td class='label'><b>E-value:</b></td> | 265 | 267 | <td class='label'><b>E-value:</b></td> | |
<td class='data'>$eval</td> | 266 | 268 | <td class='data'>$eval</td> | |
<td class='label'><b>Identity:</b></td> | 267 | 269 | <td class='label'><b>Identity:</b></td> | |
<td class='data'>${id}%</td> | 268 | 270 | <td class='data'>${id}%</td> | |
<td class='label'><b>Q_coverage:</b></td> | 269 | 271 | <td class='label'><b>Q_coverage:</b></td> | |
<td class='data'>${qcov}%</td> | 270 | 272 | <td class='data'>${qcov}%</td> | |
<td class='label'><b>S_coverage:</b></td> | 271 | 273 | <td class='label'><b>S_coverage:</b></td> | |
<td class='data'>${scov}%</td> | 272 | 274 | <td class='data'>${scov}%</td> | |
</tr> | 273 | 275 | </tr> | |
<tr> | 274 | 276 | <tr> | |
<td class='label'><b>Q_aln:</b></td> | 275 | 277 | <td class='label'><b>Q_aln:</b></td> | |
<td class='data'>${qstart}-$qend</td> | 276 | 278 | <td class='data'>${qstart}-$qend</td> | |
<td class='label'><b>S_aln:</b></td> | 277 | 279 | <td class='label'><b>S_aln:</b></td> | |
<td class='data'>${sstart}-$send</td> | 278 | 280 | <td class='data'>${sstart}-$send</td> | |
<td class='label'></td> | 279 | 281 | <td class='label'></td> | |
<td class='data'></td> | 280 | 282 | <td class='data'></td> | |
<td class='label'></td> | 281 | 283 | <td class='label'></td> | |
<td class='data'></td> | 282 | 284 | <td class='data'></td> | |
</tr> | 283 | 285 | </tr> | |
</table> | 284 | 286 | </table> | |
<br /> | 285 | 287 | <br /> | |
286 | 288 | |||
<p><b>Alignment:</b></p> | 287 | 289 | <p><b>Alignment:</b></p> | |
<div class='seq'> | 288 | 290 | <div class='seq'> | |
<pre> | 289 | 291 | <pre> | |
$qseq | 290 | 292 | $qseq | |
$hstr | 291 | 293 | $hstr | |
$sseq | 292 | 294 | $sseq | |
</pre> | 293 | 295 | </pre> | |
</div> | 294 | 296 | </div> | |
295 | 297 | |||
HIT | 296 | 298 | HIT | |
297 | 299 | |||
print $outh $alnHit; | 298 | 300 | print $outh $alnHit; | |
299 | 301 | |||
300 | 302 | |||
#Generate the hydropathy plots | 301 | 303 | #Generate the hydropathy plots | |
my $good = run_quod($qacc, $sacc, $qstart, $qend, $sstart, $send, $qseq, $sseq); | 302 | 304 | my $good = run_quod($qacc, $sacc, $qstart, $qend, $sstart, $send, $qseq, $sseq); | |
die "Could not generate plots for hit: $qacc vs $sacc" unless ($good); | 303 | 305 | die "Could not generate plots for hit: $qacc vs $sacc" unless ($good); | |
304 | 306 | |||
my $domData = generate_domain_data($qacc, $sacc); | 305 | 307 | my $domData = generate_domain_data($qacc, $sacc); | |
my $domHTML = ""; | 306 | 308 | my $domHTML = ""; | |
if ($domData) { | 307 | 309 | if ($domData) { | |
$domHTML =<<DATA; | 308 | 310 | $domHTML =<<DATA; | |
<br /> | 309 | 311 | <br /> | |
<hr /> | 310 | 312 | <hr /> | |
<p><b>Pfam info:</b></p> | 311 | 313 | <p><b>Pfam info:</b></p> | |
<div class='dom'> | 312 | 314 | <div class='dom'> | |
313 | 315 | |||
$domData | 314 | 316 | $domData | |
315 | 317 | |||
</div> | 316 | 318 | </div> | |
DATA | 317 | 319 | DATA | |
} | 318 | 320 | } | |
319 | 321 | |||
my $plot_aln = "plots/${qacc}_vs_${sacc}_qs${qstart}_qe${qend}_ss${sstart}_se${send}.png"; | 320 | 322 | my $plot_aln = "plots/${qacc}_vs_${sacc}_qs${qstart}_qe${qend}_ss${sstart}_se${send}.png"; | |
my $qplot = "plots/${qacc}_vs_${sacc}_qaln_qs${qstart}_qe${qend}.png"; | 321 | 323 | my $qplot = "plots/${qacc}_vs_${sacc}_qaln_qs${qstart}_qe${qend}.png"; | |
my $splot = "plots/${qacc}_vs_${sacc}_saln_ss${sstart}_se${send}.png"; | 322 | 324 | my $splot = "plots/${qacc}_vs_${sacc}_saln_ss${sstart}_se${send}.png"; | |
323 | 325 | |||
#now include the plots | 324 | 326 | #now include the plots | |
my $prtPlots =<<PLOTS; | 325 | 327 | my $prtPlots =<<PLOTS; | |
326 | 328 | |||
<br /> | 327 | 329 | <br /> | |
<table style="width:100%"> | 328 | 330 | <table style="width:100%"> | |
<tr> | 329 | 331 | <tr> | |
<td><a href="$qplot" target="_blank"><img src="$qplot" alt="$qacc"></a></td> | 330 | 332 | <td><a href="$qplot" target="_blank"><img src="$qplot" alt="$qacc"></a></td> | |
<td><a href="$splot" target="_blank"><img src="$splot" alt="$sacc"></a></td> | 331 | 333 | <td><a href="$splot" target="_blank"><img src="$splot" alt="$sacc"></a></td> | |
</tr> | 332 | 334 | </tr> | |
<tr> | 333 | 335 | <tr> | |
<td colspan="2" style="text-align: center;"> | 334 | 336 | <td colspan="2" style="text-align: center;"> | |
<a href="$plot_aln" target="_blank"><img src="$plot_aln" alt="$qacc vs $sacc alignment"></a> | 335 | 337 | <a href="$plot_aln" target="_blank"><img src="$plot_aln" alt="$qacc vs $sacc alignment"></a> | |
</td> | 336 | 338 | </td> | |
</tr> | 337 | 339 | </tr> | |
</table> | 338 | 340 | </table> | |
339 | 341 | |||
$domHTML | 340 | 342 | $domHTML | |
341 | 343 | |||
PLOTS | 342 | 344 | PLOTS | |
343 | 345 | |||
print $outh $prtPlots; | 344 | 346 | print $outh $prtPlots; | |
} | 345 | 347 | } | |
346 | 348 | |||
#Close HTML report | 347 | 349 | #Close HTML report | |
my $closeRep = <<CLOSE; | 348 | 350 | my $closeRep = <<CLOSE; | |
</body> | 349 | 351 | </body> | |
</html> | 350 | 352 | </html> | |
CLOSE | 351 | 353 | CLOSE | |
352 | 354 | |||
print $outh $closeRep; | 353 | 355 | print $outh $closeRep; | |
354 | 356 | |||
close $outh; | 355 | 357 | close $outh; | |
} | 356 | 358 | } | |
357 | 359 | |||
358 | 360 | |||
#========================================================================== | 359 | 361 | #========================================================================== | |
#Generate domain data for the html report | 360 | 362 | #Generate domain data for the html report | |
361 | 363 | |||
362 | 364 | |||
sub generate_domain_data { | 363 | 365 | sub generate_domain_data { | |
364 | 366 | |||
my ($q, $s) = @_; | 365 | 367 | my ($q, $s) = @_; | |
366 | 368 | |||
#Format of PFAM hash: | 367 | 369 | #Format of PFAM hash: | |
# push (@{ $out->{$qacc}->{$pfamID} }, | 368 | 370 | # push (@{ $out->{$qacc}->{$pfamID} }, | |
# {pfamid=> $pfamID, dlen=>$pfamLen, dstart=>$dstart, dend=>$dend, evalue=>$eval, | 369 | 371 | # {pfamid=> $pfamID, dlen=>$pfamLen, dstart=>$dstart, dend=>$dend, evalue=>$eval, | |
# dname=>$pfamName, def=>$def, qlen=>$qlen, qstart=>$qstart, qend=>$qend }); | 370 | 372 | # dname=>$pfamName, def=>$def, qlen=>$qlen, qstart=>$qstart, qend=>$qend }); | |
371 | 373 | |||
372 | 374 | |||
373 | 375 | |||
374 | 376 | |||
my @cols = qw(Query Domain Clan Dom_length E-value Dom_start Dom_end Q_Start Q_end Dom_Name Dom_Info); | 375 | 377 | my @cols = qw(Query Domain Clan Dom_length E-value Dom_start Dom_end Q_Start Q_end Dom_Name Dom_Info); | |
my $colStr = " <th>" . join ("</th>\n <th>", @cols) . | 376 | 378 | my $colStr = " <th>" . join ("</th>\n <th>", @cols) . | |
"</th>\n"; | 377 | 379 | "</th>\n"; | |
378 | 380 | |||
my $header =<<HEADER; | 379 | 381 | my $header =<<HEADER; | |
<table border='1', style='width:100%'> | 380 | 382 | <table border='1', style='width:100%'> | |
<tr> | 381 | 383 | <tr> | |
$colStr | 382 | 384 | $colStr | |
</tr> | 383 | 385 | </tr> | |
HEADER | 384 | 386 | HEADER | |
385 | 387 | |||
386 | 388 | |||
my $res = ""; | 387 | 389 | my $res = ""; | |
foreach my $prot ($q, $s) { | 388 | 390 | foreach my $prot ($q, $s) { | |
389 | 391 | |||
if (exists $pfamHits{$prot}) { | 390 | 392 | if (exists $pfamHits{$prot}) { | |
my @Doms = keys %{ $pfamHits{$prot} }; | 391 | 393 | my @Doms = keys %{ $pfamHits{$prot} }; | |
392 | 394 | |||
foreach my $d (@Doms) { | 393 | 395 | foreach my $d (@Doms) { | |
394 | 396 | |||
my $clan = ($clans{$d})? $clans{$d} : "N/A"; | 395 | 397 | my $clan = ($clans{$d})? $clans{$d} : "N/A"; | |
396 | 398 | |||
my @hits = @{ $pfamHits{$prot}{$d} }; | 397 | 399 | my @hits = @{ $pfamHits{$prot}{$d} }; | |
foreach my $hit (@hits) { | 398 | 400 | foreach my $hit (@hits) { | |
my $dlen = $hit->{dlen}; | 399 | 401 | my $dlen = $hit->{dlen}; | |
my $eval = $hit->{evalue}; | 400 | 402 | my $eval = $hit->{evalue}; | |
my $qstart = $hit->{qstart}; | 401 | 403 | my $qstart = $hit->{qstart}; | |
my $qend = $hit->{qend}; | 402 | 404 | my $qend = $hit->{qend}; | |
my $dstart = $hit->{dstart}; | 403 | 405 | my $dstart = $hit->{dstart}; | |
my $dend = $hit->{dend}; | 404 | 406 | my $dend = $hit->{dend}; | |
my $name = $hit->{dname}; | 405 | 407 | my $name = $hit->{dname}; | |
my $def = $hit->{def}; | 406 | 408 | my $def = $hit->{def}; | |
407 | 409 | |||
$res .=<<ROW; | 408 | 410 | $res .=<<ROW; | |
<tr> | 409 | 411 | <tr> | |
<td class="pfam">$prot</td> | 410 | 412 | <td class="pfam">$prot</td> | |
<td class="pfam">$d</td> | 411 | 413 | <td class="pfam">$d</td> | |
<td class="pfam">$clan</td> | 412 | 414 | <td class="pfam">$clan</td> | |
<td class="pfam">$dlen</td> | 413 | 415 | <td class="pfam">$dlen</td> | |
<td class="pfam">$eval</td> | 414 | 416 | <td class="pfam">$eval</td> | |
<td class="pfam">$dstart</td> | 415 | 417 | <td class="pfam">$dstart</td> | |
<td class="pfam">$dend</td> | 416 | 418 | <td class="pfam">$dend</td> | |
<td class="pfam">$qstart</td> | 417 | 419 | <td class="pfam">$qstart</td> | |
<td class="pfam">$qend</td> | 418 | 420 | <td class="pfam">$qend</td> | |
<td class="pfam">$name</td> | 419 | 421 | <td class="pfam">$name</td> | |
<td class="pfam">$def</td> | 420 | 422 | <td class="pfam">$def</td> | |
</tr> | 421 | 423 | </tr> | |
ROW | 422 | 424 | ROW | |
} | 423 | 425 | } | |
} | 424 | 426 | } | |
} | 425 | 427 | } | |
} | 426 | 428 | } | |
427 | 429 | |||
#Return final result | 428 | 430 | #Return final result | |
if ($res) { | 429 | 431 | if ($res) { | |
$header .= $res; | 430 | 432 | $header .= $res; | |
$header .= " </table>\n"; | 431 | 433 | $header .= " </table>\n"; | |
return $header; | 432 | 434 | return $header; | |
} | 433 | 435 | } | |
else { | 434 | 436 | else { | |
return $res; | 435 | 437 | return $res; | |
} | 436 | 438 | } | |
} | 437 | 439 | } | |
438 | 440 | |||
439 | 441 | |||
440 | 442 | |||
441 | 443 | |||
#========================================================================== | 442 | 444 | #========================================================================== | |
#Run quod on the query, subject and the alignment. | 443 | 445 | #Run quod on the query, subject and the alignment. | |
444 | 446 | |||
445 | 447 | |||
448 | #quod.py -q -l "HEB99829" -o plot.png --width 15 --edgecolor red --xticks 25 --no-tms +0 --add-tms 9-32 43-67 98-121 132-151 164-181 192-215 224-241:orange -w 17-245:+2.7:+:Alignment --region-font 12 --add-region 20-245:'PF07556':-2.8,-2.6:red,black:tc --mark +0:K,R,H:black --xlim 0 400 -- HEB99829.faa | |||
449 | ||||
450 | ||||
sub run_quod { | 446 | 451 | sub run_quod { | |
447 | 452 | |||
my ($q, $s, $qs, $qe, $ss, $se, $qseq, $sseq) = @_; | 448 | 453 | my ($q, $s, $qs, $qe, $ss, $se, $qseq, $sseq) = @_; | |
449 | 454 | |||
450 | 455 | |||
#extract sequences for query and subject | 451 | 456 | #extract sequences for query and subject | |
extract_full_sequences($q,$s); | 452 | 457 | extract_full_sequences($q,$s); | |
453 | 458 | |||
454 | 459 | |||
#----------------------------------------------------------------- | 455 | 460 | #----------------------------------------------------------------- | |
#Run quod for the alignment | 456 | 461 | #Run quod for the alignment | |
457 | 462 | |||
#First save aligned segments to files | 458 | 463 | #First save aligned segments to files | |
my $qalnFile ="$seqDir/${q}_aln.faa"; | 459 | 464 | my $qalnFile ="$seqDir/${q}_aln.faa"; | |
open(my $qfh, '>', $qalnFile) || die $!; | 460 | 465 | open(my $qfh, '>', $qalnFile) || die $!; | |
print $qfh ">$q alignment\n$qseq\n"; | 461 | 466 | print $qfh ">$q alignment\n$qseq\n"; | |
close $qfh; | 462 | 467 | close $qfh; | |
463 | 468 | |||
my $salnFile ="$seqDir/${s}_aln.faa"; | 464 | 469 | my $salnFile ="$seqDir/${s}_aln.faa"; | |
open(my $sfh, '>', $salnFile) || die $!; | 465 | 470 | open(my $sfh, '>', $salnFile) || die $!; | |
print $sfh ">$s alignment\n$sseq\n"; | 466 | 471 | print $sfh ">$s alignment\n$sseq\n"; | |
close $sfh; | 467 | 472 | close $sfh; | |
468 | 473 | |||
469 | 474 | |||
#Note alnquod requires to add the extension to the image name | 470 | 475 | #Note alnquod requires to add the extension to the image name | |
my $alnFig = "$plotsDir/${q}_vs_${s}_qs${qs}_qe${qe}_ss${ss}_se${se}.png"; | 471 | 476 | my $alnFig = "$plotsDir/${q}_vs_${s}_qs${qs}_qe${qe}_ss${ss}_se${se}.png"; | |
my $cmd1 = qq(alnquod.py --grid -q -l "$q (red) and $s (blue)" -o $alnFig --xticks 25 --width 15 -- $qalnFile $seqDir/${q}.faa $salnFile $seqDir/${s}.faa); | 472 | 477 | my $cmd1 = qq(quod.py -q -l "$q (red) and $s (blue)" -o $alnFig --xticks 25 --width 15 --edgecolor +0:red +1:blue --facecolor +0:orange +1:cyan --multi frag -- $qalnFile $seqDir/${q}.faa $salnFile $seqDir/${s}.faa); | |
#print "$cmd1\n\n"; | 473 | 478 | # print "$cmd1\n\n"; | |
479 | # exit; | |||
system $cmd1 unless (-f "${alnFig}"); | 474 | 480 | system $cmd1 unless (-f "${alnFig}"); | |
return undef unless (-f "${alnFig}"); | 475 | 481 | return undef unless (-f "${alnFig}"); | |
476 | 482 | |||
477 | 483 | |||
#----------------------------------------------------------------- | 478 | 484 | #----------------------------------------------------------------- | |
#Run quod for the full sequencess of the query and subject proteins | 479 | 485 | #Run quod for the full sequencess of the query and subject proteins | |
480 | 486 | |||
481 | 487 | |||
#Extract TMS coordinates for query | 482 | 488 | #Extract TMS coordinates for query | |
die "Error: no hmmtop results for: $q" unless (exists $hmmtopHits{$q}); | 483 | 489 | die "Error: no hmmtop results for: $q" unless (exists $hmmtopHits{$q}); | |
my $qTMS = ""; | 484 | 490 | my $qTMS = ""; | |
if (scalar @{ $hmmtopHits{$q}{coords} } > 0) { | 485 | 491 | if (scalar @{ $hmmtopHits{$q}{coords} } > 0) { | |
$qTMS = "-at " . join(",", @{ $hmmtopHits{$q}{coords} }) . ":orange"; | 486 | 492 | $qTMS = "--add-tms " . join(",", @{ $hmmtopHits{$q}{coords} }) . ":orange"; | |
} | 487 | 493 | } | |
488 | 494 | |||
489 | 495 | |||
#Plot query hydropathy | 490 | 496 | #Plot query hydropathy | |
my $qPfam = get_pfam_coords_for_quod($q, "red"); | 491 | 497 | my $qPfam = get_pfam_coords_for_quod($q, "red"); | |
my $qName = "$plotsDir/${q}_vs_${s}_qaln_qs${qs}_qe${qe}"; | 492 | 498 | my $qName = "$plotsDir/${q}_vs_${s}_qaln_qs${qs}_qe${qe}.png"; | |
my $cmd2 = qq(quod.py --grid -q -l "$q" -o $qName --width 15 --color red --xticks 25 -w ${qs}-${qe}::1 -t png -nt +0 $qTMS $qPfam -- $seqDir/${q}.faa); | 493 | 499 | my $cmd2 = qq(quod.py -q -l "$q" -o $qName --width 15 --edgecolor red --xticks 25 -w ${qs}-${qe}:+2.7:+:Alignment --no-tms +0 $qTMS $qPfam -- $seqDir/${q}.faa); | |
#print "$cmd2\n\n"; | 494 | 500 | # print "$cmd2\n\n"; | |
system $cmd2 unless (-f "${qName}.png"); | 495 | 501 | # exit; | |
return undef unless (-f "${qName}.png"); | 496 | 502 | system $cmd2 unless (-f $qName); | |
503 | return undef unless (-f $qName); | |||
497 | 504 | |||
498 | 505 | |||
499 | 506 | |||
#TMS coords for the subject | 500 | 507 | #TMS coords for the subject | |
die "Error: no hmmtop results for: $s" unless (exists $hmmtopHits{$s}); | 501 | 508 | die "Error: no hmmtop results for: $s" unless (exists $hmmtopHits{$s}); | |
my $sTMS = ""; | 502 | 509 | my $sTMS = ""; | |
if (scalar @{ $hmmtopHits{$s}{coords} } > 0) { | 503 | 510 | if (scalar @{ $hmmtopHits{$s}{coords} } > 0) { | |
$sTMS = "-at " . join(",", @{ $hmmtopHits{$s}{coords} }) . ":cyan"; | 504 | 511 | $sTMS = "--add-tms " . join(",", @{ $hmmtopHits{$s}{coords} }) . ":cyan"; | |
} | 505 | 512 | } | |
506 | 513 | |||
#Plot Subject hydropaty | 507 | 514 | #Plot Subject hydropaty | |
my $sPfam = get_pfam_coords_for_quod($s, "blue"); | 508 | 515 | my $sPfam = get_pfam_coords_for_quod($s, "blue"); | |
my $sName = "$plotsDir/${q}_vs_${s}_saln_ss${ss}_se${se}"; | 509 | 516 | my $sName = "$plotsDir/${q}_vs_${s}_saln_ss${ss}_se${se}.png"; | |
my $cmd3 = qq(quod.py --grid -q -l "$s" -o $sName --width 15 --color blue --xticks 25 -w ${ss}-${se}::1 -t png -nt +0 $sTMS $sPfam -- $seqDir/${s}.faa); | 510 | 517 | my $cmd3 = qq(quod.py -q -l "$s" -o $sName --width 15 --edgecolor blue --xticks 25 -w ${ss}-${se}:+2.7:+:Alignment --no-tms +0 $sTMS $sPfam -- $seqDir/${s}.faa); | |
#print "$cmd3\n\n"; | 511 | 518 | # print "$cmd3\n\n"; | |
system $cmd3 unless (-f "${sName}.png"); | 512 | 519 | # exit; | |
return undef unless (-f "${sName}.png"); | 513 | 520 | system $cmd3 unless (-f $sName); | |
521 | return undef unless (-f $sName); | |||
514 | 522 | |||
515 | 523 | |||
return 1; | 516 | 524 | return 1; | |
} | 517 | 525 | } | |
518 | 526 | |||
#========================================================================== | 519 | 527 | #========================================================================== | |
#Get the string for quod that will plot the PFAM domains | 520 | 528 | #Get the string for quod that will plot the PFAM domains | |
521 | 529 | |||
sub get_pfam_coords_for_quod { | 522 | 530 | sub get_pfam_coords_for_quod { | |
523 | 531 | |||
my ($prot, $color) = @_; | 524 | 532 | my ($prot, $color) = @_; | |
525 | 533 | |||
#Format of PFAM hash: | 526 | 534 | #Format of PFAM hash: | |
# push (@{ $out->{$qacc}->{$pfamID} }, | 527 | 535 | # push (@{ $out->{$qacc}->{$pfamID} }, | |
# {pfamid=> $pfamID, dlen=>$pfamLen, dstart=>$dstart, dend=>$dend, | 528 | 536 | # {pfamid=> $pfamID, dlen=>$pfamLen, dstart=>$dstart, dend=>$dend, | |
# def=>$def, qlen=>$qlen, qstart=>$qstart, qend=>$qend }); | 529 | 537 | # def=>$def, qlen=>$qlen, qstart=>$qstart, qend=>$qend }); | |
530 | 538 | |||
531 | 539 | |||
532 | 540 | |||
my $str = ""; | 533 | 541 | my $str = ""; | |
534 | 542 | |||
if (exists $pfamHits{$prot}) { | 535 | 543 | if (exists $pfamHits{$prot}) { | |
my @Doms = keys %{ $pfamHits{$prot} }; | 536 | 544 | my @Doms = keys %{ $pfamHits{$prot} }; | |
my $dcnt = 0; | 537 | 545 | my $dcnt = 0; | |
$str = "--region-font 12 -ar "; | 538 | 546 | $str = "--region-font 12 --add-region "; | |
foreach my $d (@Doms) { | 539 | 547 | foreach my $d (@Doms) { | |
540 | 548 | |||
my @hits = @{ $pfamHits{$prot}{$d} }; | 541 | 549 | my @hits = @{ $pfamHits{$prot}{$d} }; | |
foreach my $hit (@hits) { | 542 | 550 | foreach my $hit (@hits) { | |
my $left = $hit->{qstart}; | 543 | 551 | my $left = $hit->{qstart}; | |
my $right = $hit->{qend}; | 544 | 552 | my $right = $hit->{qend}; | |
545 | 553 | |||
my $ypos = -2.8 + $dcnt * 0.4; | 546 | 554 | my $yposl = -2.8 + $dcnt * 0.4; #domain bottom coord | |
$str .= "${left}-${right}:'${d}':${ypos}:$color "; | 547 | 555 | my $yposh = $yposl + 0.15; #domain height coord | |
556 | ||||
557 | $str .= "${left}-${right}:'${d}':${yposl},${yposh}:$color,black:tc "; | |||
$dcnt++; | 548 | 558 | $dcnt++; | |
} | 549 | 559 | } | |
} | 550 | 560 | } | |
} | 551 | 561 | } | |
552 | 562 | |||
return $str; | 553 | 563 | return $str; | |
554 | 564 | |||
} | 555 | 565 | } | |
556 | 566 | |||
557 | 567 | |||
#========================================================================== | 558 | 568 | #========================================================================== | |
#Extract the full sequences of the query and subject proteins | 559 | 569 | #Extract the full sequences of the query and subject proteins | |
#Examples: AKM80767.1 | 560 | 570 | #Examples: AKM80767.1 | |
561 | 571 | |||
562 | 572 | |||
563 | 573 | |||
sub extract_full_sequences { | 564 | 574 | sub extract_full_sequences { | |
565 | 575 | |||
my ($q, $s) = @_; | 566 | 576 | my ($q, $s) = @_; | |
567 | 577 | |||
568 | 578 | |||
my $q_seq = "$seqDir/${q}.faa"; | 569 | 579 | my $q_seq = "$seqDir/${q}.faa"; | |
my $s_seq = "$seqDir/${s}.faa"; | 570 | 580 | my $s_seq = "$seqDir/${s}.faa"; | |
571 | 581 | |||
#extract the query secuence from tcdb and the subject from the custom blastdb | 572 | 582 | #extract the query secuence from tcdb and the subject from the custom blastdb | |
my $cmd1 = qq(blastdbcmd -db $blastdb -entry $q -target_only -out $q_seq); | 573 | 583 | my $cmd1 = qq(blastdbcmd -db $blastdb -entry $q -target_only -out $q_seq); | |
system "$cmd1" unless (-f $q_seq && !(-z $q_seq)); | 574 | 584 | system "$cmd1" unless (-f $q_seq && !(-z $q_seq)); | |
die "Could not extract sequence for $q" unless (-f $q_seq && !(-z $q_seq)); | 575 | 585 | die "Could not extract sequence for $q" unless (-f $q_seq && !(-z $q_seq)); | |
576 | 586 | |||
577 | 587 | |||
my $cmd2 = qq(blastdbcmd -db $blastdb -entry $s -target_only -out $s_seq); | 578 | 588 | my $cmd2 = qq(blastdbcmd -db $blastdb -entry $s -target_only -out $s_seq); | |
system "$cmd2" unless (-f $s_seq && !(-z $s_seq)); | 579 | 589 | system "$cmd2" unless (-f $s_seq && !(-z $s_seq)); | |
die "Could not extract sequence for $s" unless (-f $s_seq && !(-z $s_seq)); | 580 | 590 | die "Could not extract sequence for $s" unless (-f $s_seq && !(-z $s_seq)); | |
} | 581 | 591 | } | |
582 | 592 | |||
583 | 593 | |||
584 | 594 | |||
585 | 595 | |||
586 | 596 | |||
587 | 597 | |||
588 | 598 | |||
#========================================================================== | 589 | 599 | #========================================================================== | |
#Sort alignmnet results by E-value | 590 | 600 | #Sort alignmnet results by E-value | |
591 | 601 | |||
sub by_evalue { | 592 | 602 | sub by_evalue { | |
$a->{evalue} <=> $b->{evalue}; | 593 | 603 | $a->{evalue} <=> $b->{evalue}; | |
} | 594 | 604 | } | |
595 | 605 | |||
596 | 606 | |||
597 | 607 | |||
#========================================================================== | 598 | 608 | #========================================================================== | |
#Run PFAM, hmmtop and parse results | 599 | 609 | #Run PFAM, hmmtop and parse results | |
600 | 610 | |||
601 | 611 | |||
sub run_pfam_hmmtop { | 602 | 612 | sub run_pfam_hmmtop { | |
my ($pfamOut, $hmmtopOut, $pfamClans) = @_; | 603 | 613 | my ($pfamOut, $hmmtopOut, $pfamClans) = @_; | |
604 | 614 | |||
605 | 615 | |||
#---------------------------------------------------------------------- | 606 | 616 | #---------------------------------------------------------------------- | |
#Generate blast DB for easy sequence retrieval | 607 | 617 | #Generate blast DB for easy sequence retrieval | |
608 | 618 | |||
print "Generate Blast DB with sequences for fast sequence retrieval...\n"; | 609 | 619 | print "Generate Blast DB with sequences for fast sequence retrieval...\n"; | |
610 | 620 | |||
#Get the sequences for which hmmscan will run | 611 | 621 | #Get the sequences for which hmmscan will run | |
my $allSeqsFile = "$seqDir/all_seqs.faa"; | 612 | 622 | my $allSeqsFile = "$seqDir/all_seqs.faa"; | |
system qq(cat $qfile $sfile > $allSeqsFile) unless (-f $allSeqsFile && !(-z $allSeqsFile)); | 613 | 623 | system qq(cat $qfile $sfile > $allSeqsFile) unless (-f $allSeqsFile && !(-z $allSeqsFile)); | |
die "Could not generate file: $allSeqsFile" unless (-f $allSeqsFile && !(-z $allSeqsFile)); | 614 | 624 | die "Could not generate file: $allSeqsFile" unless (-f $allSeqsFile && !(-z $allSeqsFile)); | |
615 | 625 | |||
616 | 626 | |||
#Generate blastdb ...assuming there are no duplicate sequences. | 617 | 627 | #Generate blastdb ...assuming there are no duplicate sequences. | |
my $cmd1 = qq(makeblastdb -dbtype prot -in $allSeqsFile -title '$qlabel plus $slabel' -parse_seqids -hash_index -out $blastdb); | 618 | 628 | my $cmd1 = qq(makeblastdb -dbtype prot -in $allSeqsFile -title '$qlabel plus $slabel' -parse_seqids -hash_index -out $blastdb); | |
print "$cmd1\n"; | 619 | 629 | print "$cmd1\n"; | |
system $cmd1 unless (-f "${blastdb}.pin"); | 620 | 630 | system $cmd1 unless (-f "${blastdb}.pin"); | |
system "rm $allSeqsFile" if (-f $allSeqsFile); | 621 | 631 | system "rm $allSeqsFile" if (-f $allSeqsFile); | |
622 | 632 | |||
623 | 633 | |||
#---------------------------------------------------------------------- | 624 | 634 | #---------------------------------------------------------------------- | |
#Get the accessions of the top hits in the alignments | 625 | 635 | #Get the accessions of the top hits in the alignments | |
626 | 636 | |||
#get the accessions with significant hits | 627 | 637 | #get the accessions with significant hits | |
my %accList = (); | 628 | 638 | my %accList = (); | |
foreach my $hit (@alnHits) { | 629 | 639 | foreach my $hit (@alnHits) { | |
$accList{$hit->{qacc}} = 1; | 630 | 640 | $accList{$hit->{qacc}} = 1; | |
$accList{$hit->{sacc}} = 1; | 631 | 641 | $accList{$hit->{sacc}} = 1; | |
} | 632 | 642 | } | |
633 | 643 | |||
634 | 644 | |||
#Save accessions to a file | 635 | 645 | #Save accessions to a file | |
my $idFile = "$seqDir/top_hits_accs.txt"; | 636 | 646 | my $idFile = "$seqDir/top_hits_accs.txt"; | |
unless (-f $idFile) { | 637 | 647 | unless (-f $idFile) { | |
open (my $afh, ">", $idFile) || die $!; | 638 | 648 | open (my $afh, ">", $idFile) || die $!; | |
print $afh join("\n", keys %accList), "\n"; | 639 | 649 | print $afh join("\n", keys %accList), "\n"; | |
close $afh; | 640 | 650 | close $afh; | |
} | 641 | 651 | } | |
642 | 652 | |||
#---------------------------------------------------------------------- | 643 | 653 | #---------------------------------------------------------------------- | |
#Extract full sequences for top hits. | 644 | 654 | #Extract full sequences for top hits. | |
645 | 655 | |||
my $topHitsSeqs = "$seqDir/topHits.faa"; | 646 | 656 | my $topHitsSeqs = "$seqDir/topHits.faa"; | |
my $cmdTopHits = qq(blastdbcmd -db $blastdb -entry_batch $idFile -target_only -out $topHitsSeqs); | 647 | 657 | my $cmdTopHits = qq(blastdbcmd -db $blastdb -entry_batch $idFile -target_only -out $topHitsSeqs); | |
system $cmdTopHits unless (-f $topHitsSeqs && !(-z $topHitsSeqs)); | 648 | 658 | system $cmdTopHits unless (-f $topHitsSeqs && !(-z $topHitsSeqs)); | |
649 | 659 | |||
650 | 660 | |||
#---------------------------------------------------------------------- | 651 | 661 | #---------------------------------------------------------------------- | |
#run hmmscan on all the sequences for both files | 652 | 662 | #run hmmscan on all the sequences for both files | |
653 | 663 | |||
print "\nRunning hmmscan and parsing output....\n"; | 654 | 664 | print "\nRunning hmmscan and parsing output....\n"; | |
655 | 665 | |||
my $pfamDB = ($ENV{PFAMDB})? $ENV{PFAMDB} : "$ENV{RESEARCH_DATA}/pfam/pfamdb/Pfam-A.hmm"; | 656 | 666 | my $pfamDB = ($ENV{PFAMDB})? $ENV{PFAMDB} : "$ENV{RESEARCH_DATA}/pfam/pfamdb/Pfam-A.hmm"; | |
my $cmd2 = qq(hmmscan --cpu 4 --noali --cut_ga -o /dev/null --domtblout $pfamFile $pfamDB $topHitsSeqs); | 657 | 667 | my $cmd2 = qq(hmmscan --cpu 4 --noali --cut_ga -o /dev/null --domtblout $pfamFile $pfamDB $topHitsSeqs); | |
system $cmd2 unless (-f $pfamFile && !(-z $pfamFile)); | 658 | 668 | system $cmd2 unless (-f $pfamFile && !(-z $pfamFile)); | |
659 | 669 | |||
660 | 670 | |||
#parse Pfam output | 661 | 671 | #parse Pfam output | |
TCDB::Assorted::parse_pfam($pfamFile, $pfamOut, $pfamClans); | 662 | 672 | TCDB::Assorted::parse_pfam($pfamFile, $pfamOut, $pfamClans); | |
# print Data::Dumper->Dump([$pfamOut, $pfamClans ], [qw(*pfamOut *pfamClans )]); | 663 | 673 | # print Data::Dumper->Dump([$pfamOut, $pfamClans ], [qw(*pfamOut *pfamClans )]); | |
# exit; | 664 | 674 | # exit; | |
665 | 675 | |||
#---------------------------------------------------------------------- | 666 | 676 | #---------------------------------------------------------------------- | |
#Extract clans | 667 | 677 | #Extract clans | |
668 | 678 | |||
TCDB::Assorted::get_clans($pfamClans, $filesDir); | 669 | 679 | TCDB::Assorted::get_clans($pfamClans, $filesDir); | |
# print Data::Dumper->Dump([$pfamClans ], [qw(*clans )]); | 670 | 680 | # print Data::Dumper->Dump([$pfamClans ], [qw(*clans )]); | |
# exit; | 671 | 681 | # exit; | |
672 | 682 | |||
#-------------------------------------------------------------------------- | 673 | 683 | #-------------------------------------------------------------------------- | |
#Run hmmtop on top hits for later hydropathy plots. | 674 | 684 | #Run hmmtop on top hits for later hydropathy plots. | |
675 | 685 | |||
print "Runnign HMMTOP and parsing output...\n"; | 676 | 686 | print "Runnign HMMTOP and parsing output...\n"; | |
677 | 687 | |||
my $cmd3 = qq(hmmtop -if=$topHitsSeqs -of=$hmmtopFile -sf=FAS -pi=spred -is=pseudo); | 678 | 688 | my $cmd3 = qq(hmmtop -if=$topHitsSeqs -of=$hmmtopFile -sf=FAS -pi=spred -is=pseudo); | |
system $cmd3 unless (-f $hmmtopFile); | 679 | 689 | system $cmd3 unless (-f $hmmtopFile); | |
system "rm $topHitsSeqs" if (-f $topHitsSeqs); | 680 | 690 | system "rm $topHitsSeqs" if (-f $topHitsSeqs); | |
681 | 691 | |||
#Parse hmmtop output | 682 | 692 | #Parse hmmtop output | |
TCDB::Assorted::parse_hmmtop($hmmtopOut, $hmmtopFile); | 683 | 693 | TCDB::Assorted::parse_hmmtop($hmmtopOut, $hmmtopFile); | |
684 | 694 | |||
} | 685 | 695 | } | |
686 | 696 | |||
687 | 697 | |||
#========================================================================== | 688 | 698 | #========================================================================== | |
#Parse ssearch36 output | 689 | 699 | #Parse ssearch36 output | |
690 | 700 | |||
sub parse_ssearch { | 691 | 701 | sub parse_ssearch { | |
692 | 702 | |||
my $out = shift; | 693 | 703 | my $out = shift; | |
694 | 704 | |||
my $parser = new Bio::SearchIO (-format => 'fasta', -file => $alnFile); | 695 | 705 | my $parser = new Bio::SearchIO (-format => 'fasta', -file => $alnFile); | |
696 | 706 | |||
my $formatTmp = $parser->format(); | 697 | 707 | my $formatTmp = $parser->format(); | |
# print Data::Dumper->Dump([$formatTmp ], [qw(*fileFormat )]); | 698 | 708 | # print Data::Dumper->Dump([$formatTmp ], [qw(*fileFormat )]); | |
# exit; | 699 | 709 | # exit; | |
700 | 710 | |||
while (my $result = $parser->next_result) { | 701 | 711 | while (my $result = $parser->next_result) { | |
702 | 712 | |||
my $qacc = $result->query_name; | 703 | 713 | my $qacc = $result->query_name; | |
my $qlen = $result->query_length; | 704 | 714 | my $qlen = $result->query_length; | |
705 | 715 | |||
706 | 716 | |||
HIT:while (my $hit = $result->next_hit) { | 707 | 717 | HIT:while (my $hit = $result->next_hit) { | |
HSP:while(my $hsp = $hit->next_hsp) { | 708 | 718 | HSP:while(my $hsp = $hit->next_hsp) { | |
709 | 719 | |||
#Alignment parameters | 710 | 720 | #Alignment parameters | |
my $sacc = $hit->name; | 711 | 721 | my $sacc = $hit->name; | |
my $slen = $hit->length; | 712 | 722 | my $slen = $hit->length; | |
my $eval = $hsp->evalue; | 713 | 723 | my $eval = $hsp->evalue; | |
my $id = $hsp->frac_identical('total') * 100; | 714 | 724 | my $id = $hsp->frac_identical('total') * 100; | |
715 | 725 | |||
#coordinates and sequence | 716 | 726 | #coordinates and sequence | |
my $qstart = $hsp->start('query'); | 717 | 727 | my $qstart = $hsp->start('query'); | |
my $qend = $hsp->end('query'); | 718 | 728 | my $qend = $hsp->end('query'); | |
my $sstart = $hsp->start('subject'); | 719 | 729 | my $sstart = $hsp->start('subject'); | |
my $send = $hsp->end('subject'); | 720 | 730 | my $send = $hsp->end('subject'); | |
my $qseq = $hsp->query_string; | 721 | 731 | my $qseq = $hsp->query_string; | |
my $sseq = $hsp->hit_string; | 722 | 732 | my $sseq = $hsp->hit_string; | |
my $hstr = $hsp->homology_string; | 723 | 733 | my $hstr = $hsp->homology_string; | |
724 | 734 | |||
725 | 735 | |||
#Check first that both proteins have the right length | 726 | 736 | #Check first that both proteins have the right length | |
next HSP if (max_length_violation($qlen, $slen, $maxProtLength, $LengthControl)); | 727 | 737 | next HSP if (max_length_violation($qlen, $slen, $maxProtLength, $LengthControl)); | |
728 | 738 | |||
#If the alignment has less than $minLength aas, ignore it | 729 | 739 | #If the alignment has less than $minLength aas, ignore it | |
my $qtmp = $qseq; $qtmp =~ s/-//g; | 730 | 740 | my $qtmp = $qseq; $qtmp =~ s/-//g; | |
my $stmp = $sseq; $stmp =~ s/-//g; | 731 | 741 | my $stmp = $sseq; $stmp =~ s/-//g; | |
next HSP if (length($qtmp) < $minLength || length($stmp) < $minLength); | 732 | 742 | next HSP if (length($qtmp) < $minLength || length($stmp) < $minLength); | |
733 | 743 | |||
#Calculate coverages properly (do not use alignment length as it includes gaps | 734 | 744 | #Calculate coverages properly (do not use alignment length as it includes gaps | |
my $qCov_tmp = ($qend - $qstart + 1) / $qlen * 100; | 735 | 745 | my $qCov_tmp = ($qend - $qstart + 1) / $qlen * 100; | |
my $qcov = ($qCov_tmp > 100.0)? 100 : $qCov_tmp; | 736 | 746 | my $qcov = ($qCov_tmp > 100.0)? 100 : $qCov_tmp; | |
737 | 747 | |||
my $sCov_tmp = ($send - $sstart + 1) / $slen * 100; | 738 | 748 | my $sCov_tmp = ($send - $sstart + 1) / $slen * 100; | |
my $scov = ($sCov_tmp > 100.0)? 100 : $sCov_tmp; | 739 | 749 | my $scov = ($sCov_tmp > 100.0)? 100 : $sCov_tmp; | |
740 | 750 | |||
741 | 751 | |||
if ($eval <= $evalue && TCDB::Assorted::coverage_ok($qcov, $scov, $coverage, $covControl)) { | 742 | 752 | if ($eval <= $evalue && TCDB::Assorted::coverage_ok($qcov, $scov, $coverage, $covControl)) { | |
743 | 753 | |||
push(@{ $out }, {qacc=>$qacc, sacc=>$sacc, qlen=>$qlen, slen=>$slen, qcov=>$qcov, | 744 | 754 | push(@{ $out }, {qacc=>$qacc, sacc=>$sacc, qlen=>$qlen, slen=>$slen, qcov=>$qcov, | |
scov=>$scov, evalue=>$eval, id=>$id, qstart=>$qstart, qend=>$qend, | 745 | 755 | scov=>$scov, evalue=>$eval, id=>$id, qstart=>$qstart, qend=>$qend, | |
sstart=>$sstart, send=>$send, qseq=>$qseq, sseq=>$sseq, hstr=>$hstr}); | 746 | 756 | sstart=>$sstart, send=>$send, qseq=>$qseq, sseq=>$sseq, hstr=>$hstr}); | |
} | 747 | 757 | } | |
} # hsp | 748 | 758 | } # hsp | |
} # hit | 749 | 759 | } # hit | |
} # result | 750 | 760 | } # result | |
} | 751 | 761 | } | |
752 | 762 | |||
753 | 763 | |||
754 | 764 | |||
#========================================================================== | 755 | 765 | #========================================================================== | |
#Test whether the lengths of two proteins are withing a predefined | 756 | 766 | #Test whether the lengths of two proteins are withing a predefined | |
#legnth specified by the user. This are the options for control: | 757 | 767 | #legnth specified by the user. This are the options for control: | |
# X: Either protein is larger than the cutoff | 758 | 768 | # X: Either protein is larger than the cutoff | |
# B: Both proteins are larger than the cutoff | 759 | 769 | # B: Both proteins are larger than the cutoff | |
# Q: Only the query protein is larger than the cutoff | 760 | 770 | # Q: Only the query protein is larger than the cutoff | |
# S: Only the subject protein is larger than the cutoff | 761 | 771 | # S: Only the subject protein is larger than the cutoff | |
# N: No control. Any length is ok. | 762 | 772 | # N: No control. Any length is ok. | |
763 | 773 | |||
sub max_length_violation { | 764 | 774 | sub max_length_violation { | |
765 | 775 | |||
my ($qlen, $slen, $maxLen, $control) = @_; | 766 | 776 | my ($qlen, $slen, $maxLen, $control) = @_; | |
767 | 777 | |||
if ($control eq "X") { | 768 | 778 | if ($control eq "X") { | |
(($qlen >= $maxLen) || ($slen >= $maxLen))? return 1 : return 0; | 769 | 779 | (($qlen >= $maxLen) || ($slen >= $maxLen))? return 1 : return 0; | |
} | 770 | 780 | } | |
771 | 781 | |||
if ($control eq "B") { | 772 | 782 | if ($control eq "B") { | |
(($qlen >= $maxLen) && ($slen >= $maxLen))? return 1 : return 0; | 773 | 783 | (($qlen >= $maxLen) && ($slen >= $maxLen))? return 1 : return 0; | |
} | 774 | 784 | } | |
775 | 785 | |||
if ($control eq "Q") { | 776 | 786 | if ($control eq "Q") { | |
($qlen >= $maxLen)? return 1 : return 0; | 777 | 787 | ($qlen >= $maxLen)? return 1 : return 0; | |
} | 778 | 788 | } | |
779 | 789 | |||
if ($control eq "S") { | 780 | 790 | if ($control eq "S") { | |
($slen >= $maxLen)? return 1 : return 0; | 781 | 791 | ($slen >= $maxLen)? return 1 : return 0; | |
} | 782 | 792 | } | |
783 | 793 | |||
if ($control eq "N") { | 784 | 794 | if ($control eq "N") { | |
return 0; | 785 | 795 | return 0; | |
} | 786 | 796 | } | |
787 | 797 | |||
die "Unknown control mode: $control"; | 788 | 798 | die "Unknown control mode: $control"; | |
789 | 799 | |||
} | 790 | 800 | } | |
791 | 801 | |||
792 | 802 | |||
793 | 803 | |||
794 | 804 | |||
#========================================================================== | 795 | 805 | #========================================================================== | |
#Parse blast output | 796 | 806 | #Parse blast output | |
797 | 807 | |||
798 | 808 | |||
sub parse_blast { | 799 | 809 | sub parse_blast { | |
my $out = shift; | 800 | 810 | my $out = shift; | |
801 | 811 | |||
open (my $fh, "<", $alnFile) || die $!; | 802 | 812 | open (my $fh, "<", $alnFile) || die $!; | |
LINE:while (<$fh>) { | 803 | 813 | LINE:while (<$fh>) { | |
chomp; | 804 | 814 | chomp; | |
next unless ($_); | 805 | 815 | next unless ($_); | |
next if (/^#/); | 806 | 816 | next if (/^#/); | |
807 | 817 | |||
#Blast columns: qacc sacc qlen slen evalue pident qstart qend sstart send qseq sseq | 808 | 818 | #Blast columns: qacc sacc qlen slen evalue pident qstart qend sstart send qseq sseq | |
my ($qacc, $sacc, $qlen, $slen, $eval, $id, $qstart, $qend, $sstart, $send, $qseq, $sseq) = split (/\t/, $_); | 809 | 819 | my ($qacc, $sacc, $qlen, $slen, $eval, $id, $qstart, $qend, $sstart, $send, $qseq, $sseq) = split (/\t/, $_); | |
810 | 820 | |||
811 | 821 | |||
if ($eval <= $evalue) { | 812 | 822 | if ($eval <= $evalue) { | |
813 | 823 | |||
my $qcov = ($qend - $qstart + 1) / $qlen * 100; | 814 | 824 | my $qcov = ($qend - $qstart + 1) / $qlen * 100; | |
my $scov = ($send - $sstart + 1) / $slen * 100; | 815 | 825 | my $scov = ($send - $sstart + 1) / $slen * 100; | |
816 | 826 | |||
if (TCDB::Assorted::coverage_ok($qcov, $scov, $coverage, $covControl)) { | 817 | 827 | if (TCDB::Assorted::coverage_ok($qcov, $scov, $coverage, $covControl)) { | |
818 | 828 | |||
push(@{ $out }, {qacc=>$qacc, sacc=>$sacc, qlen=>$qlen, slen=>$slen, qcov=>$qcov, | 819 | 829 | push(@{ $out }, {qacc=>$qacc, sacc=>$sacc, qlen=>$qlen, slen=>$slen, qcov=>$qcov, | |
scov=>$scov, evalue=>$eval, id=>$id, qstart=>$qstart, qend=>$qend, | 820 | 830 | scov=>$scov, evalue=>$eval, id=>$id, qstart=>$qstart, qend=>$qend, | |
sstart=>$sstart, send=>$send, qseq=>$qseq, sseq=>$sseq}); | 821 | 831 | sstart=>$sstart, send=>$send, qseq=>$qseq, sseq=>$sseq}); | |
} | 822 | 832 | } | |
} | 823 | 833 | } | |
} | 824 | 834 | } | |
close $fh; | 825 | 835 | close $fh; | |
} | 826 | 836 | } | |
827 | 837 | |||
828 | 838 | |||
829 | 839 | |||
830 | 840 | |||
831 | 841 | |||
#========================================================================== | 832 | 842 | #========================================================================== | |
#Run the alignemnt between the two files depending on the program | 833 | 843 | #Run the alignemnt between the two files depending on the program | |
#Selected by the user. | 834 | 844 | #Selected by the user. | |
835 | 845 | |||
sub run_alignment { | 836 | 846 | sub run_alignment { | |
837 | 847 | |||
my $cmd = ""; | 838 | 848 | my $cmd = ""; | |
839 | 849 | |||
840 | 850 | |||
if ($prog eq 'blastp') { | 841 | 851 | if ($prog eq 'blastp') { | |
842 | 852 | |||
my $compStr = "-comp_based_stats $blastComp"; | 843 | 853 | my $compStr = "-comp_based_stats $blastComp"; | |
my $segStr = "-seg $segFilter"; | 844 | 854 | my $segStr = "-seg $segFilter"; | |
my $outFmt = qq(-outfmt '7 qacc sacc qlen slen evalue pident qstart qend sstart send qseq sseq'); | 845 | 855 | my $outFmt = qq(-outfmt '7 qacc sacc qlen slen evalue pident qstart qend sstart send qseq sseq'); | |
846 | 856 | |||
#Run blast | 847 | 857 | #Run blast | |
$cmd = qq(blastp -query $qfile -subject $sfile -matrix BLOSUM62 -out $alnFile $outFmt -evalue $evalue -use_sw_tback $compStr $segStr); | 848 | 858 | $cmd = qq(blastp -query $qfile -subject $sfile -matrix BLOSUM62 -out $alnFile $outFmt -evalue $evalue -use_sw_tback $compStr $segStr); | |
print "$cmd\n"; | 849 | 859 | print "$cmd\n"; | |
system $cmd unless (-f $alnFile && !(-z $alnFile)); | 850 | 860 | system $cmd unless (-f $alnFile && !(-z $alnFile)); | |
851 | 861 | |||
#Append command line to the end of results file | 852 | 862 | #Append command line to the end of results file | |
open (my $fh, ">>", $alnFile) || die $!; | 853 | 863 | open (my $fh, ">>", $alnFile) || die $!; | |
print $fh "\n# $cmd\n"; | 854 | 864 | print $fh "\n# $cmd\n"; | |
close $fh; | 855 | 865 | close $fh; |
locateFragment.pl
View file @
68b41f1
#!/usr/bin/env perl -w | 1 | 1 | #!/usr/bin/env perl -w | |
2 | 2 | |||
use strict; | 3 | 3 | use strict; | |
use warnings; | 4 | 4 | use warnings; | |
use Data::Dumper; | 5 | 5 | use Data::Dumper; | |
6 | 6 | |||
$Data::Dumper::Deepcopy = 1; | 7 | 7 | $Data::Dumper::Deepcopy = 1; | |
8 | 8 | |||
use Getopt::Long; | 9 | 9 | use Getopt::Long; | |
use Bio::SearchIO; | 10 | 10 | use Bio::SearchIO; | |
use Bio::SeqIO; | 11 | 11 | use Bio::SeqIO; | |
12 | 12 | |||
13 | 13 | |||
#Local libraries | 14 | 14 | #Local libraries | |
use TCDB::CheckDependencies; | 15 | 15 | use TCDB::CheckDependencies; | |
use TCDB::Assorted; | 16 | 16 | use TCDB::Assorted; | |
17 | 17 | |||
18 | 18 | |||
########################################################################### | 19 | 19 | ########################################################################### | |
# | 20 | 20 | # | |
# Parse GBLAST output file and return the sequence of the proteins that | 21 | 21 | # Parse GBLAST output file and return the sequence of the proteins that | |
# meet the user's criteria. | 22 | 22 | # meet the user's criteria. | |
# | 23 | 23 | # | |
########################################################################### | 24 | 24 | ########################################################################### | |
25 | 25 | |||
26 | 26 | |||
#This will prevent quod and alnquod from going into interactive mode | 27 | 27 | #This will prevent quod and alnquod from going into interactive mode | |
$ENV{"MPLBACKEND"} = "agg"; | 28 | 28 | $ENV{"MPLBACKEND"} = "agg"; | |
29 | 29 | |||
30 | 30 | |||
#========================================================================== | 31 | 31 | #========================================================================== | |
#Check dependencies | 32 | 32 | #Check dependencies | |
33 | 33 | |||
my @dependencies = ("glsearch36", "blastdbcmd"); | 34 | 34 | my @dependencies = ("glsearch36", "blastdbcmd"); | |
my $CheckDep_obj = new TCDB::CheckDependencies(); | 35 | 35 | my $CheckDep_obj = new TCDB::CheckDependencies(); | |
$CheckDep_obj -> dependencies_list(\@dependencies); | 36 | 36 | $CheckDep_obj -> dependencies_list(\@dependencies); | |
$CheckDep_obj -> checkDependencies; | 37 | 37 | $CheckDep_obj -> checkDependencies; | |
38 | 38 | |||
39 | 39 | |||
40 | 40 | |||
41 | 41 | |||
#========================================================================== | 42 | 42 | #========================================================================== | |
#Read command line arguments | 43 | 43 | #Read command line arguments | |
44 | 44 | |||
my $fragment = undef; | 45 | 45 | my $fragment = undef; | |
my $accession = undef; | 46 | 46 | my $accession = undef; | |
my $accFile = undef | 47 | 47 | my $accFile = undef | |
my $outdir = undef; | 48 | 48 | my $outdir = undef; | |
my $blastdb = undef; | 49 | 49 | my $blastdb = undef; | |
my $evalue = 1e-2; | 50 | 50 | my $evalue = 1e-2; | |
my $subMatrix = 'BL50'; | 51 | 51 | my $subMatrix = 'BL50'; | |
my $quiet = 0; | 52 | 52 | my $quiet = 0; | |
my $interactive = 0; | 53 | 53 | my $interactive = 0; | |
54 | 54 | |||
read_command_line(); | 55 | 55 | read_command_line(); | |
56 | 56 | |||
#print Data::Dumper->Dump([$fragment, $accession, $accFile, $outdir, $blastdb, $evalue, $quiet], | 57 | 57 | #print Data::Dumper->Dump([$fragment, $accession, $accFile, $outdir, $blastdb, $evalue, $quiet], | |
# [qw(*fragment *accession *accFile *outdir *blastdb *evalue *quiet)]); | 58 | 58 | # [qw(*fragment *accession *accFile *outdir *blastdb *evalue *quiet)]); | |
#exit; | 59 | 59 | #exit; | |
60 | 60 | |||
61 | 61 | |||
62 | 62 | |||
63 | 63 | |||
#========================================================================== | 64 | 64 | #========================================================================== | |
#Get the sequences for the fragment and full proteins in files | 65 | 65 | #Get the sequences for the fragment and full proteins in files | |
66 | 66 | |||
67 | 67 | |||
my ($fragFile, $protFile) = @{ getSequences($fragment, $accession) }; | 68 | 68 | my ($fragFile, $protFile) = @{ getSequences($fragment, $accession) }; | |
#print Data::Dumper->Dump([$fragFile, $protFile ], [qw(*fragFile *protFile)]); | 69 | 69 | #print Data::Dumper->Dump([$fragFile, $protFile ], [qw(*fragFile *protFile)]); | |
70 | 70 | |||
die "Both files must exist:\n $fragFile\n $protFile" unless (-f $fragFile && -f $protFile); | 71 | 71 | die "Both files must exist:\n $fragFile\n $protFile" unless (-f $fragFile && -f $protFile); | |
72 | 72 | |||
#========================================================================== | 73 | 73 | #========================================================================== | |
#align fragment to full protein | 74 | 74 | #align fragment to full protein | |
75 | 75 | |||
my $alignFile = "$outdir/${accession}_glsearch.out"; | 76 | 76 | my $alignFile = "$outdir/${accession}_glsearch.out"; | |
my $cmd = qq(glsearch36 -s BL62 -z 21 -k 10000 -E $evalue -s $subMatrix $fragFile $protFile > $alignFile); | 77 | 77 | my $cmd = qq(glsearch36 -s BL62 -z 21 -k 10000 -E $evalue -s $subMatrix $fragFile $protFile > $alignFile); | |
system $cmd unless (-f $alignFile); | 78 | 78 | system $cmd unless (-f $alignFile); | |
79 | 79 | |||
80 | 80 | |||
#========================================================================== | 81 | 81 | #========================================================================== | |
#Parse glsearch output and run quod | 82 | 82 | #Parse glsearch output and run quod | |
83 | 83 | |||
84 | 84 | |||
run_quod ($alignFile, $protFile); | 85 | 85 | run_quod ($alignFile, $protFile); | |
86 | 86 | |||
87 | 87 | |||
88 | 88 | |||
89 | 89 | |||
90 | 90 | |||
#========================================================================== | 91 | 91 | #========================================================================== | |
################ Subroutines definition beyond ths point ############## | 92 | 92 | ################ Subroutines definition beyond ths point ############## | |
#========================================================================== | 93 | 93 | #========================================================================== | |
94 | 94 | |||
95 | 95 | |||
sub run_quod { | 96 | 96 | sub run_quod { | |
97 | 97 | |||
my ($alignment, $sequence) = @_; | 98 | 98 | my ($alignment, $sequence) = @_; | |
99 | 99 | |||
100 | 100 | |||
my $parser = new Bio::SearchIO (-format => 'fasta', -file => $alignment); | 101 | 101 | my $parser = new Bio::SearchIO (-format => 'fasta', -file => $alignment); | |
102 | 102 | |||
my @res = (); | 103 | 103 | my @res = (); | |
104 | 104 | |||
#---------------------------------------------------------------------- | 105 | 105 | #---------------------------------------------------------------------- | |
#Parse glsearch output | 106 | 106 | #Parse glsearch output | |
107 | 107 | |||
my $hsp_cnt = 1; | 108 | 108 | my $hsp_cnt = 1; | |
while (my $result = $parser->next_result) { | 109 | 109 | while (my $result = $parser->next_result) { | |
110 | 110 | |||
HIT:while (my $hit = $result->next_hit) { | 111 | 111 | HIT:while (my $hit = $result->next_hit) { | |
HSP:while(my $hsp = $hit->next_hsp) { | 112 | 112 | HSP:while(my $hsp = $hit->next_hsp) { | |
113 | 113 | |||
my %data = (); | 114 | 114 | my %data = (); | |
my $key = "hsp_$hsp_cnt"; | 115 | 115 | my $key = "hsp_$hsp_cnt"; | |
my $hId = $hsp->frac_identical('total'); #identity in the alignment | 116 | 116 | my $hId = $hsp->frac_identical('total'); #identity in the alignment | |
my $hSim = $hsp->frac_conserved('total'); #similarity in the alignment | 117 | 117 | my $hSim = $hsp->frac_conserved('total'); #similarity in the alignment | |
118 | 118 | |||
#Alignment parameters | 119 | 119 | #Alignment parameters | |
$data{hsp} = $key; | 120 | 120 | $data{hsp} = $key; | |
$data{evalue} = $hsp->evalue; | 121 | 121 | $data{evalue} = $hsp->evalue; | |
$data{id} = $hId; | 122 | 122 | $data{id} = $hId; | |
$data{sim} = $hSim; | 123 | 123 | $data{sim} = $hSim; | |
124 | 124 | |||
#coordinates in the alignment to plot bars | 125 | 125 | #coordinates in the alignment to plot bars | |
$data{qstart} = $hsp->start('query'); | 126 | 126 | $data{qstart} = $hsp->start('query'); | |
$data{qend} = $hsp->end('query'); | 127 | 127 | $data{qend} = $hsp->end('query'); | |
$data{sstart} = $hsp->start('subject'); | 128 | 128 | $data{sstart} = $hsp->start('subject'); | |
$data{send} = $hsp->end('subject'); | 129 | 129 | $data{send} = $hsp->end('subject'); | |
130 | 130 | |||
push (@res, \%data); | 131 | 131 | push (@res, \%data); | |
$hsp_cnt++; | 132 | 132 | $hsp_cnt++; | |
} | 133 | 133 | } | |
} | 134 | 134 | } | |
} | 135 | 135 | } | |
136 | 136 | |||
die "Error: no match found between fragment and sequence" unless (@res); | 137 | 137 | die "Error: no match found between fragment and sequence" unless (@res); | |
# print Data::Dumper->Dump([\@res ], [qw(*res )]); | 138 | 138 | # print Data::Dumper->Dump([\@res ], [qw(*res )]); | |
139 | 139 | |||
140 | 140 | |||
#---------------------------------------------------------------------- | 141 | 141 | #---------------------------------------------------------------------- | |
#Generate quod plot | 142 | 142 | #Generate quod plot | |
143 | 143 | |||
#Format string for the regions | 144 | 144 | #Format string for the regions | |
my $regions = "-at "; | 145 | 145 | my $regions = "--add-tms "; | |
my $coords = ""; | 146 | 146 | my $coords = ""; | |
foreach my $hit (@res) { | 147 | 147 | foreach my $hit (@res) { | |
148 | 148 | |||
$coords = $hit->{sstart} . "-" . $hit->{send}; | 149 | 149 | $coords = $hit->{sstart} . "-" . $hit->{send}; | |
$regions .= "${coords}:green"; | 150 | 150 | $regions .= "${coords}:green"; | |
151 | 151 | |||
#only the best HSP is required to be plotted | 152 | 152 | #only the best HSP is required to be plotted | |
last; | 153 | 153 | last; | |
} | 154 | 154 | } | |
155 | 155 | |||
my $outPlot = "$outdir/${accession}_map_frag.png"; | 156 | 156 | my $outPlot = "$outdir/${accession}_map_frag.png"; | |
my $qstring = ($quiet)? "-q -o $outPlot" : "-o $outPlot"; | 157 | 157 | my $qstring = ($quiet)? "-q -o $outPlot" : "-o $outPlot"; | |
my $iString = ($interactive)? "-o $outPlot --show" : ""; | 158 | 158 | my $iString = ($interactive)? "-o $outPlot --show" : ""; | |
159 | 159 | |||
my $cmd = qq(quod.py $qstring $iString -l "$accession ($coords)" --xticks 25 --grid $regions -- $sequence); | 160 | 160 | my $cmd = qq(quod.py $qstring $iString -l "$accession ($coords)" --xticks 25 --grid $regions -- $sequence); | |
print "$cmd\n"; | 161 | 161 | print "$cmd\n"; | |
system $cmd; | 162 | 162 | system $cmd; | |
} | 163 | 163 | } | |
164 | 164 | |||
165 | 165 | |||
166 | 166 | |||
167 | 167 | |||
sub getSequences { | 168 | 168 | sub getSequences { | |
169 | 169 | |||
my ($frag, $acc) = @_; | 170 | 170 | my ($frag, $acc) = @_; | |
171 | 171 | |||
172 | 172 | |||
# print Data::Dumper->Dump([$frag, $acc, $accFile ], [qw(*frag *acc *accFile)]); | 173 | 173 | # print Data::Dumper->Dump([$frag, $acc, $accFile ], [qw(*frag *acc *accFile)]); | |
# exit; | 174 | 174 | # exit; | |
175 | 175 | |||
176 | 176 | |||
#Sequence for full protein | 177 | 177 | #Sequence for full protein | |
my $accSeq = (-f $accFile)? $accFile : "$outdir/${acc}.faa"; | 178 | 178 | my $accSeq = (-f $accFile)? $accFile : "$outdir/${acc}.faa"; | |
179 | 179 | |||
180 | 180 | |||
#Save fragment to file | 181 | 181 | #Save fragment to file | |
my $tmpFile = "$outdir/${acc}_frag.faa"; | 182 | 182 | my $tmpFile = "$outdir/${acc}_frag.faa"; | |
183 | 183 | |||
184 | 184 | |||
if (-f $frag) { | 185 | 185 | if (-f $frag) { | |
system "mv $frag $tmpFile" unless (-f $tmpFile); | 186 | 186 | system "mv $frag $tmpFile" unless (-f $tmpFile); | |
} | 187 | 187 | } | |
else { | 188 | 188 | else { | |
unless (-f $tmpFile) { | 189 | 189 | unless (-f $tmpFile) { | |
open (my $fh, ">", $tmpFile) || die $!; | 190 | 190 | open (my $fh, ">", $tmpFile) || die $!; | |
print $fh ">${acc} fragment\n$frag\n"; | 191 | 191 | print $fh ">${acc} fragment\n$frag\n"; | |
close $fh; | 192 | 192 | close $fh; | |
} | 193 | 193 | } | |
} | 194 | 194 | } | |
195 | 195 | |||
unless (-f $accSeq) { | 196 | 196 | unless (-f $accSeq) { | |
#Blast DB to be used | 197 | 197 | #Blast DB to be used | |
my $db = ($blastdb)? $blastdb : 'nr'; | 198 | 198 | my $db = ($blastdb)? $blastdb : 'nr'; | |
199 | 199 | |||
my $cmd = qq(blastdbcmd -db $db -entry $acc -target_only -outfmt '\%f' -out $accSeq); | 200 | 200 | my $cmd = qq(blastdbcmd -db $db -entry $acc -target_only -outfmt '\%f' -out $accSeq); | |
system $cmd; | 201 | 201 | system $cmd; | |
202 | 202 | |||
#Remove the version and annotations from the sequence file | 203 | 203 | #Remove the version and annotations from the sequence file | |
my $cmd2 = qq(perl -i.bkp -pe 's/^\\>(\\w+)\.\*/\\>\$1/;' $accSeq); | 204 | 204 | my $cmd2 = qq(perl -i.bkp -pe 's/^\\>(\\w+)\.\*/\\>\$1/;' $accSeq); | |
system $cmd2 unless (-f "${accSeq}.pkp"); | 205 | 205 | system $cmd2 unless (-f "${accSeq}.pkp"); | |
} | 206 | 206 | } | |
207 | 207 | |||
#Return files to be aligned | 208 | 208 | #Return files to be aligned | |
return [$tmpFile, $accSeq]; | 209 | 209 | return [$tmpFile, $accSeq]; | |
} | 210 | 210 | } | |
211 | 211 | |||
212 | 212 | |||
213 | 213 | |||
214 | 214 | |||
#=========================================================================== | 215 | 215 | #=========================================================================== | |
#Read command line and print help | 216 | 216 | #Read command line and print help | |
217 | 217 | |||
218 | 218 | |||
sub read_command_line { | 219 | 219 | sub read_command_line { | |
220 | 220 | |||
print_help() unless (@ARGV); | 221 | 221 | print_help() unless (@ARGV); | |
222 | 222 | |||
my $status = GetOptions( | 223 | 223 | my $status = GetOptions( | |
"i|acc-file=s" => \&read_accFile, | 224 | 224 | "i|acc-file=s" => \&read_accFile, | |
"a|accession=s" => \&read_accession, | 225 | 225 | "a|accession=s" => \&read_accession, | |
"f|fragment=s" => \&read_fragment, | 226 | 226 | "f|fragment=s" => \&read_fragment, | |
"o|outdir=s" => \$outdir, | 227 | 227 | "o|outdir=s" => \$outdir, | |
"bdb|blastdb=s" => \&read_blastdb, | 228 | 228 | "bdb|blastdb=s" => \&read_blastdb, | |
"e|evalue=f" => \$evalue, | 229 | 229 | "e|evalue=f" => \$evalue, | |
"m|sub-matrix=s" => \&read_subMatrix, | 230 | 230 | "m|sub-matrix=s" => \&read_subMatrix, | |
"t|interactive!" => \$interactive, | 231 | 231 | "t|interactive!" => \$interactive, | |
"q|quiet!" => \$quiet, | 232 | 232 | "q|quiet!" => \$quiet, | |
"h|help" => sub { print_help(); }, | 233 | 233 | "h|help" => sub { print_help(); }, | |
"<>" => sub { die "Error: Unknown argument: $_[0]\n"; }); | 234 | 234 | "<>" => sub { die "Error: Unknown argument: $_[0]\n"; }); | |
exit unless ($status); | 235 | 235 | exit unless ($status); | |
236 | 236 | |||
237 | 237 | |||
die "Error: option -f is mandatory." unless ($fragment); | 238 | 238 | die "Error: option -f is mandatory." unless ($fragment); | |
die "Error: options -i or -a are mandatory." unless ($accession || $accFile); | 239 | 239 | die "Error: options -i or -a are mandatory." unless ($accession || $accFile); | |
die "Error: flags -t and -q cannot be set at the same time!" if ($quiet && $interactive); | 240 | 240 | die "Error: flags -t and -q cannot be set at the same time!" if ($quiet && $interactive); | |
241 | 241 | |||
#Default value for output directory | 242 | 242 | #Default value for output directory | |
$outdir = "." unless ($outdir); | 243 | 243 | $outdir = "." unless ($outdir); | |
system "mkdir -p $outdir" unless (-d $outdir); | 244 | 244 | system "mkdir -p $outdir" unless (-d $outdir); | |
245 | 245 |
tmsRepeat.pl
View file @
68b41f1
#!/usr/bin/env perl -w | 1 | 1 | #!/usr/bin/env perl | |
2 | 2 | |||
use warnings; | 3 | 3 | no warnings; | |
use strict; | 4 | 4 | use strict; | |
use Data::Dumper; | 5 | 5 | use Data::Dumper; | |
6 | 6 | |||
$Data::Dumper::Deepcopy = 1; | 7 | 7 | use TCDB::Repeats; | |
$Data::Dumper::Indent = 1; | 8 | |||
#$Data::Dumper::Purity = 0; | 9 | |||
$Data::Dumper::Sortkeys = 1; | 10 | |||
11 | ||||
use Getopt::Long; | 12 | 8 | use Getopt::Long; | |
use Bio::SearchIO; | 13 | |||
use Bio::SeqIO; | 14 | |||
15 | 9 | |||
16 | 10 | |||
use TCDB::CheckDependencies; | 17 | 11 | my $seqsDir = undef; #'/Users/amedrano/Desktop/Mai_tmsRepeat/sequences'; | |
use TCDB::Assorted; | 18 | 12 | my $seqsFile = undef; | |
13 | my $tmsFile = undef; #'/Users/amedrano/Desktop/Mai_tmsRepeat/tms.hmmtop'; | |||
14 | my $outDir = "Repeats"; #'/Users/amedrano/Desktop/Mai_tmsRepeat/RepeatUnits/ResultsOOP'; | |||
19 | 15 | |||
16 | my $evalue = 1e-2; | |||
17 | my $coverage = 0.85; | |||
18 | my $identity = 0.2; | |||
20 | 19 | |||
20 | my @tmsRanges = (); | |||
21 | 21 | |||
#========================================================================== | 22 | 22 | read_command_line(); | |
#Check dependencies | 23 | |||
24 | 23 | |||
my @dependencies = ("water", "ssearch36", "extractFamily.pl", "tmsplit", "quod.py"); | 25 | |||
my $CheckDep_obj = new TCDB::CheckDependencies(); | 26 | |||
$CheckDep_obj -> dependencies_list(\@dependencies); | 27 | |||
$CheckDep_obj -> checkDependencies; | 28 | |||
29 | 24 | |||
30 | 25 | #print Data::Dumper->Dump([$seqsDir, $seqsFile, $tmsFile, $outDir], | ||
31 | 26 | # [qw(*seqsDir *seqsFile *tmsFile *outDir )]); | ||
#========================================================================== | 32 | |||
#Read command line options | 33 | |||
34 | ||||
my $gs_infile = ""; | 35 | |||
my $infileFmt = "hmmtop"; #The other option is 'tms' which is the ID and TMS | 36 | |||
my $gs_idFormat = ""; | 37 | |||
my $gs_repUnit = 0; | 38 | |||
my $gs_seqDir = ""; | 39 | |||
my $gs_tail = 5; | 40 | |||
my $gs_evalue = 0.1; | 41 | |||
my $gs_coverage = 0.8; | 42 | |||
my $gs_identity = 0.25; | 43 | |||
my $gsatShuffles = 1000; | 44 | |||
my $min_gsat_score = 4.0; | 45 | |||
46 | ||||
my $compStatsFlag = 1; | 47 | |||
my $compStats = ""; | 48 | |||
my $outdir = "repeats"; | 49 | |||
my $repDir = "reports"; | 50 | |||
my $seqDir = "sequences"; | 51 | |||
my $alignDir = "alignments"; | 52 | |||
my $plotsDir = "plots"; | 53 | |||
my $goodHitsOnly = 1; #print only significant results, ignore everything else | 54 | |||
55 | ||||
56 | ||||
#all (all sequences in output file) | 57 | |||
#each (generate one directory per sequence.. for better organization) | 58 | |||
#debug (it will print the contents of the hash table one sequences at a time) | 59 | |||
my $mode = "all"; | 60 | |||
61 | ||||
read_command_line_arguments(); | 62 | |||
63 | ||||
#print Data::Dumper->Dump([$gs_infile, $gs_idFormat, $gs_repUnit, $gs_seqDir, | 64 | |||
# $gs_tail, $gs_evalue, $gs_coverage, $gs_identity, $gsatShuffles, $compStatsFlag, $compStats], | 65 | |||
# [qw(*infile *idFormat *repUnit *seqDir *tail *evalue | 66 | |||
# *coverage *identity $gsatShuffles *compStatFlag *compStats)]); | 67 | |||
#exit; | 68 | 27 | #exit; | |
69 | 28 | |||
# ssearch36 -p -k 1000 -z 11 -E 1.0 -s BL62 -W 0 4.B.1_4tms_all/sequences/4.B.1.1.2-Q4QLL1_bundle1.faa 4.B.1_4tms_all/sequences/lib_4.B.1.1.2-Q4QLL1_bundle1.faa | 70 | |||
71 | 29 | |||
30 | #my $repObj = TCDB::Repeat->new('seqsDir' => $seqsDir, | |||
31 | # 'tmsFile' => $tmsFile, | |||
32 | # 'outDir' => $outDir, | |||
33 | # 'ranges2searchTMS' => \@TMSranges); | |||
72 | 34 | |||
#========================================================================== | 73 | |||
#Read file with coordinates of TMSs and verify that the sequences are | 74 | |||
#available | 75 | |||
76 | 35 | |||
my %gh_tms = (); | 77 | 36 | my @TMSranges = ([1, 3], [4, 6]); | |
78 | 37 | |||
read_tms_coordinates_file($gs_infile, \%gh_tms); | 79 | 38 | my $repObj = TCDB::Repeat->new(); | |
80 | 39 | |||
#print Data::Dumper->Dump([ \%gh_tms], [qw(*tms )]); | 81 | 40 | #$repObj->tmsFile($tmsFile); | |
#exit; | 82 | 41 | #$repObj->seqsDir($seqsDir); | |
42 | $repObj->seqsFile($seqsFile); | |||
43 | $repObj->outDir($outDir); | |||
44 | $repObj->evalueCutoff($evalue); | |||
45 | $repObj->identityCutoff($identity); | |||
46 | $repObj->coverageCutoff($coverage); | |||
47 | $repObj->TMSranges2search(\@TMSranges); | |||
83 | 48 | |||
49 | $repObj-> findRepeatsTMSranges(); | |||
84 | 50 | |||
#=========================================================================== | 85 | 51 | #print Data::Dumper->Dump([$repObj ], [qw(*repObj)]); | |
#Main Output directory | 86 | |||
87 | 52 | |||
#Root directory for all results | 88 | |||
system "mkdir -p $outdir" unless (-d $outdir); | 89 | |||
die "Could not generate output directory: $outdir" unless (-d $outdir); | 90 | |||
91 | 53 | |||
92 | 54 | |||
93 | 55 | |||
#========================================================================== | 94 | |||
#Search for repeats inside query sequences | 95 | |||
96 | 56 | |||
my %results = (); | 97 | |||
my %origSeqLength = (); #To calculate x-ticks spacing in hydropathy plots | 98 | |||
99 | ||||
foreach my $ls_sid (keys %gh_tms) { | 100 | |||
101 | ||||
my %gh_bundleSeqs = (); | 102 | |||
my %gh_topHits = (); | 103 | |||
104 | ||||
105 | ||||
print "Processing: $ls_sid\n"; | 106 | |||
107 | ||||
108 | ||||
#Clean results if one output directory is generated per input sequence | 109 | |||
%results = () if ($mode eq 'each'); | 110 | |||
111 | ||||
112 | ||||
#Cut sequences in non overlaping regions with as many TMS as the | 113 | |||
#repeat unit we want to find. | 114 | |||
cut_seq_in_tms_regions ($ls_sid, $gs_repUnit, \%gh_tms, \%gh_bundleSeqs); | 115 | |||
116 | ||||
117 | ||||
# print Data::Dumper->Dump([\%gh_bundleSeqs ], [qw(*bundleSeqs)]); | 118 | |||
# <STDIN>; | 119 | |||
120 | ||||
121 | ||||
#run ssearch to find potential repeats. | 122 | |||
align_bundles($ls_sid,\%gh_bundleSeqs, \%gh_topHits); | 123 | |||
124 | ||||
125 | ||||
# print Data::Dumper->Dump([\%gh_topHits ], [qw(*topHits )]); | 126 | |||
# <STDIN>; | 127 | |||
128 | ||||
129 | ||||
#Collect results for final table | 130 | |||
$results{$ls_sid} = \%gh_topHits; | 131 | |||
132 | ||||
#present results per input sequence to verify everything looks fine. | 133 | |||
if ($mode eq 'debug') { | 134 | |||
print Data::Dumper->Dump([\%gh_topHits], [qw(*topHits)]); | 135 | |||
<STDIN>; | 136 | |||
} | 137 | |||
138 | ||||
print_reports(\%results) if ($mode eq 'each'); | 139 | |||
} | 140 | |||
141 | ||||
142 | ||||
143 | ||||
144 | ||||
145 | ||||
#=========================================================================== | 146 | 57 | #=========================================================================== | |
#Print final results in summarized or detailed format | 147 | 58 | #Read command line and print help | |
148 | 59 | |||
#print Data::Dumper->Dump([\%results ], [qw(*results )]); | 149 | |||
#<STDIN>; | 150 | |||
151 | 60 | |||
print_reports(\%results) if ($mode eq 'all'); | 152 | 61 | sub read_command_line { | |
153 | 62 | |||
63 | print_help() unless (@ARGV); | |||
154 | 64 | |||
65 | my $status = GetOptions( | |||
66 | "s|seqs-file=s" => \&read_seqsFile, | |||
67 | "d|seqs-dir=s" => \&read_seqsDir, | |||
68 | "o|outdir=s" => \&read_outdir, | |||
69 | "t|tms=s" => \&read_tmsFile, | |||
70 | "e|evalue=f" => \$evalue, | |||
71 | "i|identity=f" => \$identity, | |||
72 | "c|coverage=f" => \$coverage, | |||
73 | "h|help" => sub { print_help(); }, | |||
74 | "<>" => sub { die "Error: Unknown argument: $_[0]\n"; }); | |||
75 | exit unless ($status); | |||
155 | 76 | |||
156 | 77 | |||
157 | 78 | #Validadte input file option | ||
########################################################################### | 158 | 79 | die "Error: no sequence file detected!" unless ($seqsFile); | |
# # | 159 | |||
# Subroutine definition # | 160 | |||
# # | 161 | |||
########################################################################### | 162 | |||
163 | ||||
164 | ||||
#print final_report | 165 | |||
166 | ||||
sub print_reports { | 167 | |||
168 | ||||
my $res = shift; | 169 | |||
170 | ||||
171 | ||||
#Get the directory where reports will be saved | 172 | |||
my $reportDir = undef; | 173 | |||
if ($mode eq 'all') { | 174 | |||
$reportDir = getReportsDir(); | 175 | |||
} | 176 | |||
else { | 177 | |||
178 | ||||
#one id per report | 179 | |||
my @ids = keys %$res; | 180 | |||
my $seqId = $ids[0]; | 181 | |||
182 | ||||
$reportDir = getReportsDir($seqId); | 183 | |||
} | 184 | |||
die "Error: invalid report dir" unless ($reportDir); | 185 | |||
186 | ||||
187 | ||||
my $sumFile = "$reportDir/repeats_summary_report.txt"; | 188 | |||
my $detailsFile = "$reportDir/repeats_detailed_report.txt"; | 189 | |||
my $htmlFile = "$reportDir/report.html"; | 190 | |||
191 | ||||
192 | ||||
open (my $htmlfh, ">", $htmlFile) || die $!; | 193 | |||
194 | ||||
my $htmlHeader = <<HEADER; | 195 | |||
<!DOCTYPE html> | 196 | |||
<html xmlns="http://www.w3.org/1999/xhtml"> | 197 | |||
<head> | 198 | |||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> | 199 | |||
200 | ||||
<style type="text/css"> | 201 | |||
202 | ||||
.label { | 203 | |||
text-align: right; | 204 | |||
width: 50px; | 205 | |||
} | 206 | 80 | } | |
207 | 81 | |||
.data { | 208 | |||
text-align: left; | 209 | |||
padding-left: 8px; | 210 | |||
width: 100px; | 211 | |||
} | 212 | |||
213 | 82 | |||
.uline { | 214 | |||
text-decoration: underline; | 215 | |||
} | 216 | |||
217 | ||||
.seq { | 218 | |||
border: 2px solid black; | 219 | |||
height: 70px; | 220 | |||
width: 100%; | 221 | |||
overflow-x: auto; | 222 | |||
overflow-y: hidden; | 223 | |||
margin: 1em 0; | 224 | |||
background: gray; | 225 | |||
color: white; | 226 | |||
} | 227 | |||
228 | ||||
img { | 229 | |||
display: block; | 230 | |||
margin-left: auto; | 231 | |||
margin-right: auto; | 232 | |||
height: 250px; | 233 | |||
width: auto; | 234 | |||
max-width: 1500px; | 235 | |||
max-height: 300px; | 236 | |||
} | 237 | |||
238 | ||||
</style> | 239 | |||
<title>Inferring repeats of $gs_repUnit TMS</title> | 240 | |||
</head> | 241 | |||
<br /> | 242 | |||
<h1 style='text-align:center'>Inferred Repeats Based On ${gs_repUnit}-TMS Bundles</h1> | 243 | |||
<body> | 244 | |||
245 | ||||
HEADER | 246 | |||
247 | ||||
print $htmlfh $htmlHeader; | 248 | |||
open (my $sumh, ">", $sumFile) || die $!; | 249 | |||
open (my $deth, ">", $detailsFile) || die $!; | 250 | |||
251 | ||||
252 | ||||
#Header for summary table | 253 | |||
print $sumh "#Accession\tQ_bundle\tS_bundle\tQ_len\tS_len\tE-value\tIdentity\tGSAT\tAln_len\tQ_cov\tS_cov\n"; | 254 | |||
255 | ||||
256 | ||||
# print Data::Dumper->Dump([$res ], [qw(*res )]); | 257 | |||
# <STDIN>; | 258 | |||
259 | ||||
260 | ||||
P:foreach my $id (sort {$a cmp $b} keys %$res) { | 261 | |||
262 | ||||
#Jump to next result if there are NO hits for this protein and | 263 | |||
#ONLY good hits are going to be recorded. | 264 | |||
unless (%{ $res->{$id} }) { | 265 | |||
next P if ($goodHitsOnly); | 266 | |||
} | 267 | |||
268 | ||||
269 | ||||
print $deth "===========================================================================\n"; | 270 | |||
print $htmlfh " <br /><hr style=\"border-style:solid; border-width:5px; color:black;\"/>\n"; | 271 | |||
272 | ||||
#There must be results to continue | 273 | |||
unless (%{ $res->{$id} }) { | 274 | |||
print $sumh "$id\tNo_hits\n"; | 275 | |||
print $deth "$id\tNo_hits\n\n\n"; | 276 | |||
print $htmlfh " <h2 style=\"text-align:center;\">$id</h2>\n <p><b>No candidate repeats found</b></p>\n"; | 277 | |||
} | 278 | |||
279 | ||||
print $deth "$id\n\n"; | 280 | |||
print $htmlfh " <h2 style=\"text-align:center;\">$id</h2>\n"; | 281 | |||
282 | ||||
283 | ||||
284 | ||||
285 | ||||
#get the long bundle names | 286 | |||
BS:foreach my $bundleName (sort {$a cmp $b} keys %{ $res->{$id} }) { | 287 | |||
288 | ||||
BN:foreach my $bundleNumber (sort {$a <=> $b} keys %{ $res->{$id}->{$bundleName} }) { | 289 | |||
290 | ||||
my $qName = $res->{$id}->{$bundleName}->{$bundleNumber}->{qName}; | 291 | |||
my $qLen = $res->{$id}->{$bundleName}->{$bundleNumber}->{qLen}; | 292 | |||
293 | ||||
#Each of the hits for this bundle | 294 | |||
my @hits_tmp = @{ $res->{$id}->{$bundleName}->{$bundleNumber}->{hits} }; | 295 | |||
296 | ||||
#To get rid of a warning when there is only one hit. | 297 | |||
my @hits = (scalar (@hits_tmp) > 1)? | 298 | |||
sort {$a->{hName} cmp $b->{hName}} @hits_tmp : @hits_tmp; | 299 | |||
300 | ||||
foreach my $hit (@hits) { | 301 | |||
302 | ||||
my $hName = $hit->{hName}; | 303 | |||
my $hLen = $hit->{hLen}; | 304 | |||
305 | ||||
my $evalue = sprintf ("%.1e", $hit->{hEvalue}); | 306 | |||
my $ident = sprintf ("%.1f", $hit->{hId} * 100); | 307 | |||
my $sim = sprintf ("%.1f", $hit->{hSim} * 100); | 308 | |||
my $gsat = sprintf ("%.1f", $hit->{gsat}); | 309 | |||
310 | ||||
my $alnLen = $hit->{alnLen}; | 311 | |||
my $qCov = sprintf("%.1f", $hit->{qCov} * 100); | 312 | |||
my $hCov = sprintf("%.1f", $hit->{hCov} * 100); | 313 | |||
314 | ||||
315 | ||||
#The alignment | 316 | |||
my $qstart = $hit->{qstart}; | 317 | |||
my $qend = $hit->{qend}; | 318 | |||
my $sstart = $hit->{sstart}; | 319 | |||
my $send = $hit->{send}; | 320 | |||
my $qSeq = $hit->{qSeq}; | 321 | |||
my $homStr = $hit->{homStr}; | 322 | |||
my $sSeq = $hit->{sSeq}; | 323 | |||
324 | ||||
my $plot = $hit->{plot}; | 325 | |||
326 | ||||
#For summary tab-delimitedfile (everything except the alignment) | 327 | |||
print $sumh "$id\t$qName\t$hName\t$qLen\t$hLen\t$evalue\t$ident\t$gsat\t$alnLen\t$qCov\t$hCov\n"; | 328 | |||
329 | ||||
330 | ||||
#Detailed report that includes the alignment | 331 | |||
print $deth "----------\n"; | 332 | |||
print $deth "$qName ($qLen) vs $hName ($hLen)\n\n"; | 333 | |||
print $deth "E-value: $evalue Identity: ${ident}% GSAT: $gsat\n"; | 334 | |||
print $deth "Q_cov: ${qCov}% S_cov: ${hCov}% Aln_length: $alnLen\n\n"; | 335 | |||
print $deth "Alignment ($qName|${qstart}-$qend vs $hName|${sstart}-$send):\n$qSeq\n$homStr\n$sSeq\n\n\n"; | 336 | |||
337 | ||||
338 | ||||
#The HTML report (includes alignment and hydropathy image | 339 | |||
my $repHit = <<HIT; | 340 | |||
341 | ||||
<p><b>$qName ($qLen) vs $hName ($hLen)</b></p> | 342 | |||
343 | ||||
<table width="600px" border="0" cellspacing="0" cellpadding="2"> | 344 | |||
<tr> | 345 | |||
<td class='label'><b>E-value:</b></td> | 346 | |||
<td class='data'>$evalue</td> | 347 | |||
<td class='label'><b>Identity:</b></td> | 348 | |||
<td class='data'>${ident}%</td> | 349 | |||
<td class='label'><b>Similarity:</b></td> | 350 | |||
<td class='data'>${sim}%</td> | 351 | |||
<td class='label'><b>GSAT:</b></td> | 352 | |||
<td class='data'>$gsat</td> | 353 | |||
</tr> | 354 | |||
<tr> | 355 | |||
<td class='label'><b>Aln:</b></td> | 356 | |||
<td class='data'>$alnLen</td> | 357 | |||
<td class='label'><b>Q_cov:</b></td> | 358 | |||
<td class='data'>${qCov}%</td> | 359 | |||
<td class='label'><b>S_cov:</b></td> | 360 | |||
<td class='data'>${hCov}%</td> | 361 | |||
<td class='label'></td> | 362 | |||
<td class='data'></td> | 363 | |||
</tr> | 364 | |||
</table> | 365 | |||
366 | ||||
<p><b>Alignment (</b>$qName:<b class="uline">${qstart}-$qend</b> vs $hName:<b class="uline">${sstart}-$send</b><b>):</b></p> | 367 | |||
<div class='seq'> | 368 | |||
<pre> | 369 | |||
$qSeq | 370 | |||
$homStr | 371 | |||
$sSeq | 372 | |||
</pre> | 373 | |||
</div> | 374 | |||
<a href="$plot" target="_blank"><img src="$plot"/></a> | 375 | |||
<br /> | 376 | |||
<hr /> | 377 | |||
378 | ||||
HIT | 379 | |||
380 | ||||
print $htmlfh $repHit; | 381 | |||
382 | ||||
} #hit | 383 | |||
} #reference bundle number | 384 | |||
} #Reference bundle name | 385 | |||
} #Query protein | 386 | |||
387 | ||||
#Close HTML report | 388 | |||
my $closeRep = <<CLOSE; | 389 | |||
</body> | 390 | |||
</html> | 391 | |||
CLOSE | 392 | |||
393 | ||||
print $htmlfh $closeRep; | 394 | |||
395 | ||||
close $sumh; | 396 | |||
close $deth; | 397 | |||
close $htmlfh; | 398 | |||
} | 399 | |||
400 | ||||
401 | ||||
402 | ||||
#========================================================================== | 403 | 83 | #========================================================================== | |
#Run ssearch36 between the different bundles in a sequence | 404 | 84 | #Option -s | |
405 | 85 | |||
sub align_bundles { | 406 | 86 | sub read_seqsFile { | |
87 | my ($opt, $value) = @_; | |||
407 | 88 | |||
my ($seqId, $lhr_bundleSeqFiles, $lhr_topHits) = @_; | 408 | 89 | unless (-f $value && !(-z $value)) { | |
409 | 90 | die "Error: file with sequences does not exist or is empty!\n"; | ||
%$lhr_topHits = (); | 410 | |||
411 | ||||
#Directory where the sequences of TMS bundles are saved | 412 | |||
my $sequencesDir = undef; | 413 | |||
my $alignmentsDir = undef; | 414 | |||
my $hydroPlotsDir = undef; | 415 | |||
416 | ||||
if ($mode eq 'all') { | 417 | |||
$sequencesDir = getSequencesDir(); | 418 | |||
$alignmentsDir = getAlignmentsDir(); | 419 | |||
$hydroPlotsDir = getPlotsDir(); | 420 | |||
} | 421 | 91 | } | |
else { | 422 | |||
$sequencesDir = getSequencesDir($seqId); | 423 | |||
$alignmentsDir = getAlignmentsDir($seqId); | 424 | |||
$hydroPlotsDir = getPlotsDir($seqId); | 425 | |||
} | 426 | |||
die "Error: invalid sequences dir" unless ($sequencesDir); | 427 | |||
die "Error: invalid alignments dir" unless ($alignmentsDir); | 428 | |||
die "Error: invalid plots dir" unless ($hydroPlotsDir); | 429 | |||
430 | 92 | |||
431 | 93 | $seqsFile = $value; | ||
# print Data::Dumper->Dump([$lhr_bundleSeqFiles ], [qw(*files )]); | 432 | |||
# <STDIN>; | 433 | |||
434 | ||||
435 | ||||
#The bundle that will be used as reference for the comparison | 436 | |||
REF:foreach my $bundle (sort {$a <=> $b} keys %$lhr_bundleSeqFiles) { | 437 | |||
438 | ||||
my $rFile = "$sequencesDir/" . $lhr_bundleSeqFiles->{$bundle}->[0]; | 439 | |||
440 | ||||
441 | ||||
#Id to name ssearch36 output files | 442 | |||
my $id = $lhr_bundleSeqFiles->{$bundle}->[0]; | 443 | |||
$id =~ s/\.faa//; | 444 | |||
445 | ||||
446 | ||||
#For naming GSAT files (ID of system or protein accession) | 447 | |||
my $tcAcc = ($id =~ /(\S+)_bundle.*/)? $1 : undef; | 448 | |||
die "Could not extract accession from $id!" unless ($id); | 449 | |||
450 | ||||
451 | ||||
# print Data::Dumper->Dump([$id, $tcAcc ], [qw(*id *tcAcc)]); | 452 | |||
# <STDIN>; | 453 | |||
454 | ||||
455 | ||||
#-------------------------------------------------------------------- | 456 | |||
#Get the non-overlapping bundles to compare them against the | 457 | |||
#reference bundle | 458 | |||
459 | ||||
my @cmpFiles = (); | 460 | |||
461 | ||||
#Initialize the index to the first non-overlapping bundle | 462 | |||
my $next_bundle_idx = $bundle + $gs_repUnit; | 463 | |||
464 | ||||
CMP:while (1) { | 465 | |||
466 | ||||
#Exit if next bundle is not in bundles hash | 467 | |||
last CMP unless (exists $lhr_bundleSeqFiles->{$next_bundle_idx}); | 468 | |||
469 | ||||
#Get file name for this non-overlapping bundle | 470 | |||
my $cmpBundle = $sequencesDir . "/" . $lhr_bundleSeqFiles->{$next_bundle_idx}->[0]; | 471 | |||
push (@cmpFiles, $cmpBundle); | 472 | |||
473 | ||||
#Update the index to the next non-overlapping bundle | 474 | |||
$next_bundle_idx = $next_bundle_idx + $gs_repUnit; | 475 | |||
} | 476 | |||
477 | ||||
#go to next reference bundle if there are no non-overlapping bundles. | 478 | |||
next REF unless (@cmpFiles); | 479 | |||
480 | ||||
481 | ||||
# print Data::Dumper->Dump([\@cmpFiles ], [qw(*cmpFiles )]); | 482 | |||
# <STDIN>; | 483 | |||
484 | ||||
485 | ||||
#-------------------------------------------------------------------- | 486 | |||
#Now run ssearch36 of the reference bundle against all its | 487 | |||
#non-overlapping bundles | 488 | |||
489 | ||||
#put all non-overlapping bundles into a file | 490 | |||
my $libFile = "$sequencesDir/lib_$id.faa"; | 491 | |||
my $cmd = "cat " . join(" ", @cmpFiles) . " > $libFile"; | 492 | |||
system $cmd; | 493 | |||
494 | ||||
495 | ||||
#run ssearch36 of $rFile vs @cmpFile | 496 | |||
my $ssearchOut = "$alignmentsDir/ssearch_$id.out"; | 497 | |||
my $ssearch_params = qq(-p $compStats -E $gs_evalue -s BL62 -W 0 $rFile $libFile > $ssearchOut); | 498 | |||
system "ssearch36 $ssearch_params" unless (-f $ssearchOut); | 499 | |||
500 | ||||
501 | ||||
# print Data::Dumper->Dump([$ssearchOut ], [qw(*ssearchOut )]); | 502 | |||
# <STDIN>; | 503 | |||
504 | ||||
505 | ||||
#--------------------------------------------------------------------- | 506 | |||
#Estimate here the spacing between x-ticks for hydropathy plots | 507 | |||
508 | ||||
my $protLen = $origSeqLength{$seqId}; | 509 | |||
510 | ||||
my $xticksSpacing = undef; | 511 | |||
if ($protLen <= 500) { | 512 | |||
$xticksSpacing = 25; | 513 | |||
} | 514 | |||
elsif ($protLen <= 1000) { | 515 | |||
$xticksSpacing = 50; | 516 | |||
} | 517 | |||
else { | 518 | |||
$xticksSpacing = 100; | 519 | |||
} | 520 | |||
521 | ||||
522 | ||||
523 | ||||
#-------------------------------------------------------------------- | 524 | |||
#parse ssearch36 output. For BioPerl resouces check: | 525 | |||
#http://search.cpan.org/dist/BioPerl/Bio/SearchIO.pm | 526 | |||
#https://classes.soe.ucsc.edu/bme060/Winter07/bptutorial.html | 527 | |||
528 | ||||
my $parser = new Bio::SearchIO (-format => 'fasta', -file => $ssearchOut); | 529 | |||
530 | ||||
531 | ||||
#put hir the top hits | 532 | |||
my %lh_hits = (); | 533 | |||
534 | ||||
535 | ||||
while (my $result = $parser->next_result) { | 536 | |||
537 | ||||
538 | ||||
my $qLen = $result->query_length; | 539 | |||
$lh_hits{$bundle}{qName} = $result->query_name; | 540 | |||
$lh_hits{$bundle}{qLen} = $qLen; | 541 | |||
$lh_hits{$bundle}{hits} = []; | 542 | |||
543 | ||||
544 | ||||
HIT:while (my $hit = $result->next_hit) { | 545 | |||
546 | ||||
HSP:while(my $hsp = $hit->next_hsp) { | 547 | |||
548 | ||||
549 | ||||
# print Data::Dumper->Dump([$hsp ], [qw(*hsp )]); | 550 | |||
# <STDIN>; | 551 | |||
552 | ||||
553 | ||||
my %tmp = (); | 554 | |||
555 | ||||
my $alnLen = $hsp->hsp_length; | 556 | |||
my $hLen = $hit->length; | 557 | |||
my $hEvalue = $hsp->evalue; | 558 | |||
my $hId = $hsp->frac_identical('total'); #identity in the alignment | 559 | |||
my $hSim = $hsp->frac_conserved('total'); #similarity in the alignment | 560 | |||
561 | ||||
562 | ||||
#coordinates in the alignment to properly calculate coverages | 563 | |||
my $qstart = $hsp->start('query'); | 564 | |||
my $qend = $hsp->end('query'); | 565 | |||
my $sstart = $hsp->start('subject'); | 566 | |||
my $send = $hsp->end('subject'); | 567 | |||
568 | ||||
569 | ||||
#Calculate coverages properly (do not use alignment length as it includes gaps | 570 | |||
571 | ||||
my $qCov_tmp = ($qend - $qstart + 1) / $qLen; | 572 | |||
my $qCov = ($qCov_tmp > 1.0)? 1.0 : $qCov_tmp; | 573 | |||
574 | ||||
my $hCov_tmp = ($send - $sstart + 1) / $hLen; | 575 | |||
my $hCov = ($hCov_tmp > 1.0)? 1.0 : $hCov_tmp; | 576 | |||
577 | ||||
578 | ||||
# print Data::Dumper->Dump([$qLen, $qCov, $hLen, $hCov, $gs_coverage, $hEvalue, $gs_evalue, $hId, $gs_identity], | 579 | |||
# [qw(*qLen *qCov $hLen *hCov *coverageCutoff *evalue *evalCutoff *hId *IDcutoff)]); | 580 | |||
# <STDIN>; | 581 | |||
582 | ||||
583 | ||||
#Before storing hit results check minimum coverage, identity and evalue | 584 | |||
next HSP unless (($qCov >= $gs_coverage || $hCov >= $gs_coverage) && | 585 | |||
($hEvalue <= $gs_evalue) && ($hId >= $gs_identity)); | 586 | |||
587 | ||||
588 | ||||
#hit identity | 589 | |||
$tmp{hName} = $hit->name; | 590 | |||
$tmp{hLen} = $hLen; | 591 | |||
592 | ||||
593 | ||||
#hit statistics | 594 | |||
$tmp{alnLen} = $alnLen; | 595 | |||
$tmp{hEvalue} = $hEvalue; | 596 | |||
$tmp{hId} = $hId; | 597 | |||
$tmp{hSim} = $hSim; | 598 | |||
$tmp{qCov} = $qCov; | 599 | |||
$tmp{hCov} = $hCov; | 600 | |||
601 | ||||
602 | ||||
#The alignment | 603 | |||
$tmp{qstart} = $qstart; | 604 | |||
$tmp{qend} = $qend; | 605 | |||
$tmp{sstart} = $sstart; | 606 | |||
$tmp{send} = $send; | 607 | |||
608 | ||||
$tmp{qSeq} = $hsp->query_string; | 609 | |||
$tmp{sSeq} = $hsp->hit_string; | 610 | |||
$tmp{homStr} = $hsp->homology_string; | 611 | |||
612 | ||||
613 | ||||
#Get the GSAT score | 614 | |||
my $gsat_outFile = "$alignmentsDir/${tcAcc}_" . $lh_hits{$bundle}{qName} . "_vs_" . $tmp{hName} . ".gsat"; | 615 | |||
616 | ||||
617 | ||||
# print "gsat.py $tmp{qSeq} $tmp{sSeq} $gsatShuffles > $gsat_outFile\n"; | 618 | |||
# exit; | 619 | |||
620 | ||||
system "gsat.py $tmp{qSeq} $tmp{sSeq} $gsatShuffles > $gsat_outFile" unless (-f $gsat_outFile); | 621 | |||
622 | ||||
my $gsat_score = TCDB::Assorted::get_gsat_score ($gsat_outFile); | 623 | |||
$tmp{gsat} = $gsat_score; | 624 | |||
625 | ||||
626 | ||||
# print Data::Dumper->Dump([\%tmp ], [qw(*matchData )]); | 627 | |||
# <STDIN>; | 628 | |||
629 | ||||
630 | ||||
#GSAT is the last filter | 631 | |||
next HSP unless ($gsat_score >= $min_gsat_score); | 632 | |||
633 | ||||
#------------------------------------------------------------ | 634 | |||
#Generate quod plot with the repeat | 635 | |||
636 | ||||
my $whole_prot_seq = "$gs_seqDir/${seqId}.faa"; | 637 | |||
die "Protein sequence not found: $whole_prot_seq" unless (-f $whole_prot_seq); | 638 | |||
639 | ||||
640 | ||||
my $plotFile = "$hydroPlotsDir/${seqId}_" . $lh_hits{$bundle}{qName} . "_vs_" . $tmp{hName}; | 641 | |||
my $fileName = "../$plotsDir/${seqId}_" . $lh_hits{$bundle}{qName} . "_vs_" . $tmp{hName} . ".png"; | 642 | |||
my $plotTitle = $lh_hits{$bundle}{qName} . " vs " . $tmp{hName}; | 643 | |||
644 | ||||
#Get hydrophobic peaks coords | 645 | |||
my $hydroPeaks = $gh_tms{$seqId}; | 646 | |||
die "No hydrophobic peaks found for sequence: $seqId" unless (@{ $hydroPeaks }); | 647 | |||
648 | ||||
649 | ||||
#format the hydrophobic peaks for quod | 650 | |||
my @peaks = map { join ("-", @$_) . ":orange" } @$hydroPeaks; | 651 | |||
my $pstring = join (" ", @peaks); | 652 | |||
653 | ||||
654 | ||||
#---------- | 655 | |||
#Calculate the positions of the aligned section of each bundle in the full sequence. | 656 | |||
657 | ||||
my $q_bid = ($lh_hits{$bundle}{qName} =~ /BDL(\d+)/)? $1 : undef; | 658 | |||
my $s_bid = ( $tmp{hName} =~ /BDL(\d+)/)? $1 : undef; | 659 | |||
die "Could not extract bundle number for: $lh_hits{$bundle}{qName} or $tmp{hName}" unless ($q_bid && $s_bid); | 660 | |||
661 | ||||
662 | ||||
#extract initial positions for both bundles | 663 | |||
my $qbstart = $lhr_bundleSeqFiles->{$q_bid}->[1]; | 664 | |||
my $qbend = $lhr_bundleSeqFiles->{$q_bid}->[2]; #$qLen - 1; | 665 | |||
my $sbstart = $lhr_bundleSeqFiles->{$s_bid}->[1]; | 666 | |||
my $sbend = $lhr_bundleSeqFiles->{$s_bid}->[2]; #$hLen - 1; | 667 | |||
die "Could not extract coords for bundle $q_bid" unless ($qbstart && $qbend); | 668 | |||
die "Could not extract coords for bundle $s_bid" unless ($sbstart && $sbend); | 669 | |||
670 | ||||
671 | ||||
#Calculate bundle positions here | 672 | |||
my $qgp_start = $qbstart + ($qstart - 1); | 673 | |||
my $qgp_end = $qbstart + ($qend - 1); | 674 | |||
675 | ||||
my $sgp_start = $sbstart + ($sstart - 1); | 676 | |||
my $sgp_end = $sbstart + ($send - 1); | 677 | |||
678 | ||||
679 | ||||
#Format the coordinates for the repeats now | 680 | |||
my $qrep = "${qgp_start}-${qgp_end}:green"; | 681 | |||
my $srep = "${sgp_start}-${sgp_end}:blue"; | 682 | |||
683 | ||||
#Format the coordinates for the bar delimiting the bundles | 684 | |||
my $bars = "-w ${qbstart}-${qbend}::1 ${sbstart}-${sbend}::1"; | 685 | |||
686 | ||||
#The quod command line | 687 | |||
my $cmd = "quod.py $whole_prot_seq -t png -l '$plotTitle' -o $plotFile -q -r 80 $bars --xticks $xticksSpacing -nt +0 -at ${pstring} ${qrep} ${srep}"; | 688 | |||
689 | ||||
my $img = "${plotFile}.png"; | 690 | |||
system $cmd unless (-f $img); | 691 | |||
die "Could not generate plot: $img" unless (-f $img); | 692 | |||
693 | ||||
$tmp{plot} = $fileName; | 694 | |||
695 | ||||
696 | ||||
#load the data into the hits section for this bundle | 697 | |||
push (@{ $lh_hits{$bundle}{hits} }, \%tmp); | 698 | |||
699 | ||||
700 | ||||
} #HSP | 701 | |||
} #HIT | 702 | |||
} #While | 703 | |||
704 | ||||
705 | ||||
#Add results to the topHits hash | 706 | |||
if (@{ $lh_hits{$bundle}{hits} }) { | 707 | |||
$lhr_topHits->{$id} = \%lh_hits; | 708 | |||
} | 709 | |||
710 | ||||
} | 711 | |||
} | 712 | 94 | } | |
713 | 95 | |||
714 | 96 | |||
715 | ||||
716 | ||||
#========================================================================== | 717 | 97 | #========================================================================== | |
#Given a sequence, its TMS coordinates and a repeat size (rsize), cut the | 718 | 98 | #Option -t | |
#sequence in TMS bundles of length rsize. | 719 | |||
720 | 99 | |||
100 | sub read_tmsFile { | |||
101 | my ($opt, $value) = @_; | |||
721 | 102 | |||
sub cut_seq_in_tms_regions { | 722 | 103 | unless (-f $value && !(-z $value)) { | |
723 | 104 | die "Error in option -t: File with TMSs (hhmtop output) does not exist or is empty!\n"; | ||
my ($ls_pid, $ls_repeat, $lhr_tms, $lhr_seqSegs) = @_; | 724 | |||
725 | ||||
726 | ||||
%$lhr_seqSegs = (); | 727 | |||
728 | ||||
729 | ||||
#Get the directory where bundle sequences will be saved | 730 | |||
my $sequencesDir = undef; | 731 | |||
732 | ||||
if ($mode eq 'all') { | 733 | |||
$sequencesDir = getSequencesDir(); | 734 | |||
} | 735 | 105 | } | |
else { | 736 | |||
$sequencesDir = getSequencesDir($ls_pid); | 737 | |||
} | 738 | |||
die "Error: invalid sequence dir" unless ($sequencesDir); | 739 | |||
740 | 106 | |||
741 | 107 | $tmsFile = $value; | ||
#---------------------------------------------------------------------- | 742 | |||
#Get the coordinates of the overlapping bundles | 743 | |||
744 | ||||
my @la_tms = @{ $lhr_tms->{$ls_pid} }; | 745 | |||
746 | ||||
747 | ||||
748 | ||||
#Get the Length of the sequence of the query protein | 749 | |||
my $seqFile = "$gs_seqDir/${ls_pid}.faa"; | 750 | |||
my $obj = Bio::SeqIO->new(-file => $seqFile , -format => "fasta"); | 751 | |||
my $seqObj = $obj->next_seq; | 752 | |||
my $qlength = $seqObj->length; | 753 | |||
die "Could not extract protein length." unless ($qlength); | 754 | |||
755 | ||||
#Store the length of the original sequence for proper calculation of | 756 | |||
#the x-ticks in the hydropathy plots of the results | 757 | |||
$origSeqLength{$ls_pid} = $qlength; | 758 | |||
759 | ||||
760 | ||||
761 | ||||
#Number of TMS in protein | 762 | |||
my $ls_ntms = scalar (@la_tms); | 763 | |||
764 | ||||
765 | ||||
766 | ||||
for (my $idx=1; $idx <= ($ls_ntms - $ls_repeat + 1); $idx++) { | 767 | |||
768 | ||||
#TMS in bundle | 769 | |||
my $left_tms = $la_tms[$idx - 1]; | 770 | |||
my $right_tms = $la_tms[$idx + $ls_repeat - 2]; | 771 | |||
772 | ||||
773 | ||||
#The coordinates of the bundle | 774 | |||
my $left_pos = (($left_tms->[0] - $gs_tail) <= 0)? 1 : $left_tms->[0] - $gs_tail; | 775 | |||
#my $right_pos = (($right_tms->[1] + $gs_tail) >= $qlength)? $right_tms->[1] : $right_tms->[1] + $gs_tail; | 776 | |||
my $right_pos = (($right_tms->[1] + $gs_tail) >= $qlength)? $qlength - 1 : $right_tms->[1] + $gs_tail; | 777 | |||
778 | ||||
779 | ||||
#Cut and name the bundles only if bundle file does not exist | 780 | |||
my $outfile = "${ls_pid}_bundle${idx}"; | 781 | |||
unless (-f "$sequencesDir/${outfile}.faa") { | 782 | |||
783 | ||||
#cutting bundle | 784 | |||
my $args = qq(-if $seqFile -od $sequencesDir -of $outfile -rangeCut -s $left_pos -e $right_pos -t 0); | 785 | |||
system "tmsplit $args > /dev/null"; | 786 | |||
787 | ||||
#replace protein ID with bundle number to the ID so alignments can be easily identified | 788 | |||
system qq(perl -i -pe 's/>\\S+/>BDL$idx/' $sequencesDir/${outfile}.faa); | 789 | |||
} | 790 | |||
791 | ||||
$lhr_seqSegs->{$idx} = ["${outfile}.faa", $left_pos, $right_pos]; | 792 | |||
} | 793 | |||
} | 794 | 108 | } | |
795 | 109 | |||
796 | 110 | |||
797 | ||||
798 | ||||
#========================================================================== | 799 | 111 | #========================================================================== | |
#Read file with the TMS coordinates of the input proteins. The TMS | 800 | 112 | #Option -d | |
#must have been validated with WHAT to make sure they are reliable. | 801 | |||
802 | 113 | |||
114 | sub read_seqsDir { | |||
115 | my ($opt, $value) = @_; | |||
803 | 116 | |||
sub read_tms_coordinates_file { | 804 | 117 | die "Error: directory with sequences does not exist." unless (-d $value); | |
805 | 118 | |||
my ($s_coordsFile, $hr_tms) = @_; | 806 | 119 | $seqsDir = $value; | |
807 | ||||
open (my $fileh, "<", $s_coordsFile) || die $!; | 808 | |||
809 | ||||
#----------------------------------------------------------------- | 810 | |||
#The format of this file is protein ID followed by pairs of | 811 | |||
#coordinates separated by dash: | 812 | |||
# 2.A.43.1.1-O60931 1-20 25-35 50-68 .... | 813 | |||
if ($infileFmt eq 'tms') { | 814 | |||
815 | ||||
while(<$fileh>) { | 816 | |||
chomp; | 817 | |||
818 | ||||
#ignore empty lines; | 819 | |||
next unless ($_); | 820 | |||
821 | ||||
#extract id and TMSs coordinates | 822 | |||
my ($id, @tms_str) = split(/\s+/, $_); | 823 | |||
my @tms = map { [ split(/-/, $_) ] } @tms_str; | 824 | |||
825 | ||||
826 | ||||
#For debugging purposes | 827 | |||
# next unless ($id eq 'WP_100644534'); | 828 | |||
829 | ||||
830 | ||||
$hr_tms->{$id} = \@tms; | 831 | |||
832 | ||||
#Verify that the sequence is available for this protein | 833 |