Commit 538dec87bcf488605c35aa8fd2e872326adff65d
1 parent
bbf0ef5765
Exists in
master
updated scripts to run with new versions of quod and extractTCDB
Showing 3 changed files with 56 additions and 28 deletions Inline Diff
alignSeqsFiles.pl
View file @
538dec8
#!/usr/bin/env perl -w | 1 | 1 | #!/usr/bin/env perl -w | |
2 | 2 | |||
use strict; | 3 | 3 | use strict; | |
use warnings; | 4 | 4 | use warnings; | |
use Data::Dumper; | 5 | 5 | use Data::Dumper; | |
6 | 6 | |||
$Data::Dumper::Deepcopy = 1; | 7 | 7 | $Data::Dumper::Deepcopy = 1; | |
8 | 8 | |||
use Getopt::Long; | 9 | 9 | use Getopt::Long; | |
use LWP; | 10 | 10 | use LWP; | |
use Bio::SeqIO; | 11 | 11 | use Bio::SeqIO; | |
use Bio::SearchIO; | 12 | 12 | use Bio::SearchIO; | |
13 | 13 | |||
#Local libraries | 14 | 14 | #Local libraries | |
use TCDB::CheckDependencies; | 15 | 15 | use TCDB::CheckDependencies; | |
use TCDB::Domain::PfamParser; | 16 | 16 | use TCDB::Domain::PfamParser; | |
use TCDB::Domain::Characterize; | 17 | 17 | use TCDB::Domain::Characterize; | |
use TCDB::Assorted; | 18 | 18 | use TCDB::Assorted; | |
19 | 19 | |||
20 | 20 | |||
########################################################################### | 21 | 21 | ########################################################################### | |
# | 22 | 22 | # | |
# Comapre two files with fasta sequences and report the alignment parameters | 23 | 23 | # Comapre two files with fasta sequences and report the alignment parameters | |
# Along with hydropathy plots and PFAM domains. | 24 | 24 | # Along with hydropathy plots and PFAM domains. | |
# | 25 | 25 | # | |
########################################################################### | 26 | 26 | ########################################################################### | |
27 | 27 | |||
#========================================================================== | 28 | 28 | #========================================================================== | |
#Check dependencies | 29 | 29 | #Check dependencies | |
30 | 30 | |||
my @dependencies = ("zgrep", "blastp", "ssearch36", "hmmtop", "blastdbcmd", | 31 | 31 | my @dependencies = ("zgrep", "blastp", "ssearch36", "hmmtop", "blastdbcmd", | |
"hmmscan"); | 32 | 32 | "hmmscan"); | |
my $CheckDep_obj = new TCDB::CheckDependencies(); | 33 | 33 | my $CheckDep_obj = new TCDB::CheckDependencies(); | |
$CheckDep_obj -> dependencies_list(\@dependencies); | 34 | 34 | $CheckDep_obj -> dependencies_list(\@dependencies); | |
$CheckDep_obj -> checkDependencies; | 35 | 35 | $CheckDep_obj -> checkDependencies; | |
36 | 36 | |||
37 | 37 | |||
#This will prevent quod and alnquod from going into interactive mode | 38 | 38 | #This will prevent quod and alnquod from going into interactive mode | |
$ENV{"MPLBACKEND"} = "agg"; | 39 | 39 | $ENV{"MPLBACKEND"} = "agg"; | |
40 | 40 | |||
41 | 41 | |||
#========================================================================== | 42 | 42 | #========================================================================== | |
#Read command line arguments | 43 | 43 | #Read command line arguments | |
44 | 44 | |||
my $qfile = ""; | 45 | 45 | my $qfile = ""; | |
my $qProt = ""; | 46 | 46 | my $qProt = ""; | |
my $sfile = ""; | 47 | 47 | my $sfile = ""; | |
my $sProt = ""; | 48 | 48 | my $sProt = ""; | |
my $qlabel = "Query"; | 49 | 49 | my $qlabel = "Query"; | |
my $slabel = "Subject"; | 50 | 50 | my $slabel = "Subject"; | |
my $outdir = ""; | 51 | 51 | my $outdir = ""; | |
my $prog = 'ssearch36'; #'blastp'; | 52 | 52 | my $prog = 'ssearch36'; #'blastp'; | |
my $evalue = 1e-4; | 53 | 53 | my $evalue = 1e-4; | |
my $identity = 20.0; | 54 | 54 | my $identity = 20.0; | |
my $coverage = 40.0; | 55 | 55 | my $coverage = 40.0; | |
my $covControl = "X"; | 56 | 56 | my $covControl = "X"; | |
my $blastComp = "F"; #2; | 57 | 57 | my $blastComp = "F"; #2; | |
my $segFilter = 'no'; | 58 | 58 | my $segFilter = 'no'; | |
my $minLength = 30; #Min legnth of proteins to analyze (without gaps) | 59 | 59 | my $minLength = 30; #Min legnth of proteins to analyze (without gaps) | |
my $subMatrix = 'BL50'; | 60 | 60 | my $subMatrix = 'BL50'; | |
my $hyd_qylim = undef; #Y-axis limits for query hydropathy plot [low, high] | 61 | 61 | my $hyd_qylim = undef; #Y-axis limits for query hydropathy plot [low, high] | |
my $hyd_sylim = undef; #Y-axis limits for subject hydropathy plot [low, high] | 62 | 62 | my $hyd_sylim = undef; #Y-axis limits for subject hydropathy plot [low, high] | |
63 | 63 | |||
#this can be used to remove long sequences from results | 64 | 64 | #this can be used to remove long sequences from results | |
my $maxProtLength = 100000; #default threshold to allow any length | 65 | 65 | my $maxProtLength = 100000; #default threshold to allow any length | |
my $LengthControl = "N"; #same meaning as $covControl | 66 | 66 | my $LengthControl = "N"; #same meaning as $covControl | |
67 | 67 | |||
#internal directories | 68 | 68 | #internal directories | |
my $filesDir = ""; | 69 | 69 | my $filesDir = ""; | |
my $plotsDir = ""; | 70 | 70 | my $plotsDir = ""; | |
my $seqDir = ""; | 71 | 71 | my $seqDir = ""; | |
my $blastDir = ""; | 72 | 72 | my $blastDir = ""; | |
73 | 73 | |||
74 | 74 | |||
read_command_line(); | 75 | 75 | read_command_line(); | |
76 | 76 | |||
#print Data::Dumper->Dump([$qfile, $qProt, $sfile, $sProt, $qlabel, $slabel, $outdir, $prog, | 77 | 77 | #print Data::Dumper->Dump([$qfile, $qProt, $sfile, $sProt, $qlabel, $slabel, $outdir, $prog, | |
# $evalue, $coverage, $covControl, $blastComp, $segFilter, $maxProtLength, | 78 | 78 | # $evalue, $coverage, $covControl, $blastComp, $segFilter, $maxProtLength, | |
# $LengthControl], | 79 | 79 | # $LengthControl], | |
# [qw(*qfile *qProt *sfile *sProt *qlabel *slabel *outdir *prog | 80 | 80 | # [qw(*qfile *qProt *sfile *sProt *qlabel *slabel *outdir *prog | |
# *evalue *coverage *covControl *blastComp *segFilter *maxProtLength | 81 | 81 | # *evalue *coverage *covControl *blastComp *segFilter *maxProtLength | |
# *LengthControl)]); | 82 | 82 | # *LengthControl)]); | |
#exit; | 83 | 83 | #exit; | |
84 | 84 | |||
85 | 85 | |||
86 | 86 | |||
#========================================================================== | 87 | 87 | #========================================================================== | |
#Output files | 88 | 88 | #Output files | |
89 | 89 | |||
#The alignment file by blastp or ssearch36 | 90 | 90 | #The alignment file by blastp or ssearch36 | |
my $alnFile = "$filesDir/${prog}.out"; | 91 | 91 | my $alnFile = "$filesDir/${prog}.out"; | |
92 | 92 | |||
#The results of running hmmscan | 93 | 93 | #The results of running hmmscan | |
my $pfamFile = "$filesDir/hmmscan.out"; | 94 | 94 | my $pfamFile = "$filesDir/hmmscan.out"; | |
95 | 95 | |||
#The results from running hmmtop | 96 | 96 | #The results from running hmmtop | |
my $hmmtopFile = "$filesDir/hmmtop.out"; | 97 | 97 | my $hmmtopFile = "$filesDir/hmmtop.out"; | |
98 | 98 | |||
#The blast database to retrieve sequences for ploting | 99 | 99 | #The blast database to retrieve sequences for ploting | |
my $blastdb = "$blastDir/sequences"; | 100 | 100 | my $blastdb = "$blastDir/sequences"; | |
101 | 101 | |||
102 | 102 | |||
103 | 103 | |||
#========================================================================== | 104 | 104 | #========================================================================== | |
#Run the alignment first | 105 | 105 | #Run the alignment first | |
106 | 106 | |||
print "Running $prog and parsing output....\n"; | 107 | 107 | print "Running $prog and parsing output....\n"; | |
run_alignment(); | 108 | 108 | run_alignment(); | |
109 | 109 | |||
110 | 110 | |||
my @alnHits = (); | 111 | 111 | my @alnHits = (); | |
if ($prog eq 'blastp') { parse_blast(\@alnHits); } | 112 | 112 | if ($prog eq 'blastp') { parse_blast(\@alnHits); } | |
elsif ($prog eq 'ssearch36') { parse_ssearch(\@alnHits)} | 113 | 113 | elsif ($prog eq 'ssearch36') { parse_ssearch(\@alnHits)} | |
114 | 114 | |||
#print Data::Dumper->Dump([\@alnHits ], [qw($alnHits )]); | 115 | 115 | #print Data::Dumper->Dump([\@alnHits ], [qw($alnHits )]); | |
#exit; | 116 | 116 | #exit; | |
117 | 117 | |||
118 | 118 | |||
die "No significant blastHits found!\n" unless (@alnHits); | 119 | 119 | die "No significant blastHits found!\n" unless (@alnHits); | |
120 | 120 | |||
121 | 121 | |||
122 | 122 | |||
#========================================================================== | 123 | 123 | #========================================================================== | |
#Run pfam (get clans, hmmtop, and parse results | 124 | 124 | #Run pfam (get clans, hmmtop, and parse results | |
125 | 125 | |||
my %pfamHits = (); | 126 | 126 | my %pfamHits = (); | |
my %clans = (); | 127 | 127 | my %clans = (); | |
my %hmmtopHits = (); | 128 | 128 | my %hmmtopHits = (); | |
run_pfam_hmmtop(\%pfamHits, \%hmmtopHits,\%clans ); | 129 | 129 | run_pfam_hmmtop(\%pfamHits, \%hmmtopHits,\%clans ); | |
130 | 130 | |||
#print Data::Dumper->Dump([\%clans], [qw(*clans)]); | 131 | 131 | #print Data::Dumper->Dump([\%clans], [qw(*clans)]); | |
#exit; | 132 | 132 | #exit; | |
133 | 133 | |||
134 | 134 | |||
#========================================================================== | 135 | 135 | #========================================================================== | |
#Parse the alignment results to make sure there are signficant results, | 136 | 136 | #Parse the alignment results to make sure there are signficant results, | |
#get domains for significant hits and plot the corresponding hydropathies. | 137 | 137 | #get domains for significant hits and plot the corresponding hydropathies. | |
138 | 138 | |||
139 | 139 | |||
140 | 140 | |||
141 | 141 | |||
print "Geneating report...\n"; | 142 | 142 | print "Geneating report...\n"; | |
generate_report(); | 143 | 143 | generate_report(); | |
144 | 144 | |||
145 | 145 | |||
146 | 146 | |||
#========================================================================== | 147 | 147 | #========================================================================== | |
################ Subroutines definition beyond ths point ############## | 148 | 148 | ################ Subroutines definition beyond ths point ############## | |
#========================================================================== | 149 | 149 | #========================================================================== | |
150 | 150 | |||
151 | 151 | |||
#========================================================================== | 152 | 152 | #========================================================================== | |
#Generate output for significant hits | 153 | 153 | #Generate output for significant hits | |
154 | 154 | |||
sub generate_report { | 155 | 155 | sub generate_report { | |
156 | 156 | |||
157 | 157 | |||
#Prepare output files | 158 | 158 | #Prepare output files | |
my $htmlFile = "$outdir/report.html"; | 159 | 159 | my $htmlFile = "$outdir/report.html"; | |
my $tsvFile = "$outdir/report.tsv"; | 160 | 160 | my $tsvFile = "$outdir/report.tsv"; | |
my $plotsFile = "$outdir/plots.html"; | 161 | 161 | my $plotsFile = "$outdir/plots.html"; | |
162 | 162 | |||
my $tsvHeader = "#Query\tSubject\tQlen\tSlen\tE-value\tIdentity\tQstart\tQend\tQcov\tSstart\tSend\tScov\tQseq\tSseq\n"; | 163 | 163 | my $tsvHeader = "#Query\tSubject\tQlen\tSlen\tE-value\tIdentity\tQstart\tQend\tQcov\tSstart\tSend\tScov\tQseq\tSseq\n"; | |
164 | 164 | |||
my $htmlHeader = <<HEADER; | 165 | 165 | my $htmlHeader = <<HEADER; | |
<!DOCTYPE html> | 166 | 166 | <!DOCTYPE html> | |
<html xmlns="http://www.w3.org/1999/xhtml"> | 167 | 167 | <html xmlns="http://www.w3.org/1999/xhtml"> | |
<head> | 168 | 168 | <head> | |
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> | 169 | 169 | <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> | |
170 | 170 | |||
<style type="text/css"> | 171 | 171 | <style type="text/css"> | |
172 | 172 | |||
.label { | 173 | 173 | .label { | |
text-align: right; | 174 | 174 | text-align: right; | |
width: 50px; | 175 | 175 | width: 50px; | |
} | 176 | 176 | } | |
177 | 177 | |||
.data { | 178 | 178 | .data { | |
text-align: left; | 179 | 179 | text-align: left; | |
padding-left: 8px; | 180 | 180 | padding-left: 8px; | |
width: 100px; | 181 | 181 | width: 100px; | |
} | 182 | 182 | } | |
183 | 183 | |||
.uline { | 184 | 184 | .uline { | |
text-decoration: underline; | 185 | 185 | text-decoration: underline; | |
} | 186 | 186 | } | |
187 | 187 | |||
.pfam { | 188 | 188 | .pfam { | |
text-align: center; | 189 | 189 | text-align: center; | |
vertical-align: middle; | 190 | 190 | vertical-align: middle; | |
} | 191 | 191 | } | |
192 | 192 | |||
.seq { | 193 | 193 | .seq { | |
border: 2px solid black; | 194 | 194 | border: 2px solid black; | |
height: 70px; | 195 | 195 | height: 70px; | |
width: 100%; | 196 | 196 | width: 100%; | |
overflow-x: auto; | 197 | 197 | overflow-x: auto; | |
overflow-y: hidden; | 198 | 198 | overflow-y: hidden; | |
margin: 1em 0; | 199 | 199 | margin: 1em 0; | |
background: gray; | 200 | 200 | background: gray; | |
color: white; | 201 | 201 | color: white; | |
} | 202 | 202 | } | |
203 | 203 | |||
204 | 204 | |||
.dom { | 205 | 205 | .dom { | |
border: 2px solid black; | 206 | 206 | border: 2px solid black; | |
height: 100px; | 207 | 207 | height: 100px; | |
width: 100%; | 208 | 208 | width: 100%; | |
overflow-x: auto; | 209 | 209 | overflow-x: auto; | |
overflow-y: auto; | 210 | 210 | overflow-y: auto; | |
margin: 1em 0; | 211 | 211 | margin: 1em 0; | |
background: gray; | 212 | 212 | background: gray; | |
color: white; | 213 | 213 | color: white; | |
} | 214 | 214 | } | |
215 | 215 | |||
img { | 216 | 216 | img { | |
display: block; | 217 | 217 | display: block; | |
margin-left: auto; | 218 | 218 | margin-left: auto; | |
margin-right: auto; | 219 | 219 | margin-right: auto; | |
height: 250px; | 220 | 220 | height: 250px; | |
width: auto; | 221 | 221 | width: auto; | |
max-width: 1500px; | 222 | 222 | max-width: 1500px; | |
max-height: 300px; | 223 | 223 | max-height: 300px; | |
} | 224 | 224 | } | |
225 | 225 | |||
</style> | 226 | 226 | </style> | |
<title>$qlabel vs $slabel</title> | 227 | 227 | <title>$qlabel vs $slabel</title> | |
</head> | 228 | 228 | </head> | |
<br /> | 229 | 229 | <br /> | |
<h1 style='text-align:center'>$qlabel vs $slabel</h1> | 230 | 230 | <h1 style='text-align:center'>$qlabel vs $slabel</h1> | |
<body> | 231 | 231 | <body> | |
232 | 232 | |||
HEADER | 233 | 233 | HEADER | |
234 | 234 | |||
235 | 235 | |||
open (my $tsvh, ">", $tsvFile) || die $!; | 236 | 236 | open (my $tsvh, ">", $tsvFile) || die $!; | |
print $tsvh $tsvHeader; | 237 | 237 | print $tsvh $tsvHeader; | |
238 | 238 | |||
open (my $outh, ">", $htmlFile) || die $!; | 239 | 239 | open (my $outh, ">", $htmlFile) || die $!; | |
print $outh $htmlHeader; | 240 | 240 | print $outh $htmlHeader; | |
241 | 241 | |||
242 | 242 | |||
foreach my $hit (sort by_evalue @alnHits) { | 243 | 243 | foreach my $hit (sort by_evalue @alnHits) { | |
244 | 244 | |||
245 | 245 | |||
my $qacc = $hit->{qacc}; | 246 | 246 | my $qacc = $hit->{qacc}; | |
my $qlen = $hit->{qlen}; | 247 | 247 | my $qlen = $hit->{qlen}; | |
my $qseq = $hit->{qseq}; | 248 | 248 | my $qseq = $hit->{qseq}; | |
my $qcov = sprintf("%.1f", $hit->{qcov}); | 249 | 249 | my $qcov = sprintf("%.1f", $hit->{qcov}); | |
my $qstart = $hit->{qstart}; | 250 | 250 | my $qstart = $hit->{qstart}; | |
my $qend = $hit->{qend}; | 251 | 251 | my $qend = $hit->{qend}; | |
252 | 252 | |||
my $sacc = $hit->{sacc}; | 253 | 253 | my $sacc = $hit->{sacc}; | |
my $slen = $hit->{slen}; | 254 | 254 | my $slen = $hit->{slen}; | |
my $sseq = $hit->{sseq}; | 255 | 255 | my $sseq = $hit->{sseq}; | |
my $scov = sprintf("%.1f", $hit->{scov}); | 256 | 256 | my $scov = sprintf("%.1f", $hit->{scov}); | |
my $sstart = $hit->{sstart}; | 257 | 257 | my $sstart = $hit->{sstart}; | |
my $send = $hit->{send}; | 258 | 258 | my $send = $hit->{send}; | |
259 | 259 | |||
my $eval = sprintf ("%.1e", $hit->{evalue}); | 260 | 260 | my $eval = sprintf ("%.1e", $hit->{evalue}); | |
my $id = sprintf ("%.1f", $hit->{id}); | 261 | 261 | my $id = sprintf ("%.1f", $hit->{id}); | |
my $hstr = $hit->{hstr}; | 262 | 262 | my $hstr = $hit->{hstr}; | |
263 | 263 | |||
my $alnHit = <<HIT; | 264 | 264 | my $alnHit = <<HIT; | |
265 | 265 | |||
<br /><hr style=\"border-style:solid; border-width:5px; color:black;\"/> | 266 | 266 | <br /><hr style=\"border-style:solid; border-width:5px; color:black;\"/> | |
267 | 267 | |||
<p><b>$qacc ($qlen) vs $sacc ($slen)</b></p> | 268 | 268 | <p><b>$qacc ($qlen) vs $sacc ($slen)</b></p> | |
269 | 269 | |||
<table width="600px" border="0" cellspacing="0" cellpadding="2"> | 270 | 270 | <table width="600px" border="0" cellspacing="0" cellpadding="2"> | |
<tr> | 271 | 271 | <tr> | |
<td class='label'><b>E-value:</b></td> | 272 | 272 | <td class='label'><b>E-value:</b></td> | |
<td class='data'>$eval</td> | 273 | 273 | <td class='data'>$eval</td> | |
<td class='label'><b>Identity:</b></td> | 274 | 274 | <td class='label'><b>Identity:</b></td> | |
<td class='data'>${id}%</td> | 275 | 275 | <td class='data'>${id}%</td> | |
<td class='label'><b>Q_coverage:</b></td> | 276 | 276 | <td class='label'><b>Q_coverage:</b></td> | |
<td class='data'>${qcov}%</td> | 277 | 277 | <td class='data'>${qcov}%</td> | |
<td class='label'><b>S_coverage:</b></td> | 278 | 278 | <td class='label'><b>S_coverage:</b></td> | |
<td class='data'>${scov}%</td> | 279 | 279 | <td class='data'>${scov}%</td> | |
</tr> | 280 | 280 | </tr> | |
<tr> | 281 | 281 | <tr> | |
<td class='label'><b>Q_aln:</b></td> | 282 | 282 | <td class='label'><b>Q_aln:</b></td> | |
<td class='data'>${qstart}-$qend</td> | 283 | 283 | <td class='data'>${qstart}-$qend</td> | |
<td class='label'><b>S_aln:</b></td> | 284 | 284 | <td class='label'><b>S_aln:</b></td> | |
<td class='data'>${sstart}-$send</td> | 285 | 285 | <td class='data'>${sstart}-$send</td> | |
<td class='label'></td> | 286 | 286 | <td class='label'></td> | |
<td class='data'></td> | 287 | 287 | <td class='data'></td> | |
<td class='label'></td> | 288 | 288 | <td class='label'></td> | |
<td class='data'></td> | 289 | 289 | <td class='data'></td> | |
</tr> | 290 | 290 | </tr> | |
</table> | 291 | 291 | </table> | |
<br /> | 292 | 292 | <br /> | |
293 | 293 | |||
<p><b>Alignment:</b></p> | 294 | 294 | <p><b>Alignment:</b></p> | |
<div class='seq'> | 295 | 295 | <div class='seq'> | |
<pre> | 296 | 296 | <pre> | |
$qseq | 297 | 297 | $qseq | |
$hstr | 298 | 298 | $hstr | |
$sseq | 299 | 299 | $sseq | |
</pre> | 300 | 300 | </pre> | |
</div> | 301 | 301 | </div> | |
302 | 302 | |||
HIT | 303 | 303 | HIT | |
304 | 304 | |||
print $outh $alnHit; | 305 | 305 | print $outh $alnHit; | |
306 | 306 | |||
print $tsvh "$qacc\t$sacc\t$qlen\t$slen\t$eval\t$id\t$qstart\t$qend\t$qcov\t$sstart\t$send\t$scov\t$qseq\t$sseq\n"; | 307 | 307 | print $tsvh "$qacc\t$sacc\t$qlen\t$slen\t$eval\t$id\t$qstart\t$qend\t$qcov\t$sstart\t$send\t$scov\t$qseq\t$sseq\n"; | |
308 | 308 | |||
#Generate the hydropathy plots | 309 | 309 | #Generate the hydropathy plots | |
my $good = run_quod($qacc, $sacc, $qstart, $qend, $sstart, $send, $qseq, $sseq); | 310 | 310 | my $good = run_quod($qacc, $sacc, $qstart, $qend, $sstart, $send, $qseq, $sseq); | |
die "Could not generate plots for hit: $qacc vs $sacc" unless ($good); | 311 | 311 | die "Could not generate plots for hit: $qacc vs $sacc" unless ($good); | |
312 | 312 | |||
my $domData = generate_domain_data($qacc, $sacc); | 313 | 313 | my $domData = generate_domain_data($qacc, $sacc); | |
my $domHTML = ""; | 314 | 314 | my $domHTML = ""; | |
if ($domData) { | 315 | 315 | if ($domData) { | |
$domHTML =<<DATA; | 316 | 316 | $domHTML =<<DATA; | |
<br /> | 317 | 317 | <br /> | |
<hr /> | 318 | 318 | <hr /> | |
<p><b>Pfam info:</b></p> | 319 | 319 | <p><b>Pfam info:</b></p> | |
<div class='dom'> | 320 | 320 | <div class='dom'> | |
321 | 321 | |||
$domData | 322 | 322 | $domData | |
323 | 323 | |||
</div> | 324 | 324 | </div> | |
DATA | 325 | 325 | DATA | |
} | 326 | 326 | } | |
327 | 327 | |||
my $plot_aln = "plots/${qacc}_vs_${sacc}_qs${qstart}_qe${qend}_ss${sstart}_se${send}.png"; | 328 | 328 | my $plot_aln = "plots/${qacc}_vs_${sacc}_qs${qstart}_qe${qend}_ss${sstart}_se${send}.png"; | |
my $qplot = "plots/${qacc}_vs_${sacc}_qaln_qs${qstart}_qe${qend}.png"; | 329 | 329 | my $qplot = "plots/${qacc}_vs_${sacc}_qaln_qs${qstart}_qe${qend}.png"; | |
my $splot = "plots/${qacc}_vs_${sacc}_saln_ss${sstart}_se${send}.png"; | 330 | 330 | my $splot = "plots/${qacc}_vs_${sacc}_saln_ss${sstart}_se${send}.png"; | |
331 | 331 | |||
#now include the plots | 332 | 332 | #now include the plots | |
my $prtPlots =<<PLOTS; | 333 | 333 | my $prtPlots =<<PLOTS; | |
334 | 334 | |||
<br /> | 335 | 335 | <br /> | |
<table style="width:100%"> | 336 | 336 | <table style="width:100%"> | |
<tr> | 337 | 337 | <tr> | |
<td><a href="$qplot" target="_blank"><img src="$qplot" alt="$qacc"></a></td> | 338 | 338 | <td><a href="$qplot" target="_blank"><img src="$qplot" alt="$qacc"></a></td> | |
<td><a href="$splot" target="_blank"><img src="$splot" alt="$sacc"></a></td> | 339 | 339 | <td><a href="$splot" target="_blank"><img src="$splot" alt="$sacc"></a></td> | |
</tr> | 340 | 340 | </tr> | |
<tr> | 341 | 341 | <tr> | |
<td colspan="2" style="text-align: center;"> | 342 | 342 | <td colspan="2" style="text-align: center;"> | |
<a href="$plot_aln" target="_blank"><img src="$plot_aln" alt="$qacc vs $sacc alignment"></a> | 343 | 343 | <a href="$plot_aln" target="_blank"><img src="$plot_aln" alt="$qacc vs $sacc alignment"></a> | |
</td> | 344 | 344 | </td> | |
</tr> | 345 | 345 | </tr> | |
</table> | 346 | 346 | </table> | |
347 | 347 | |||
$domHTML | 348 | 348 | $domHTML | |
349 | 349 | |||
PLOTS | 350 | 350 | PLOTS | |
351 | 351 | |||
print $outh $prtPlots; | 352 | 352 | print $outh $prtPlots; | |
} | 353 | 353 | } | |
354 | 354 | |||
#Close HTML report | 355 | 355 | #Close HTML report | |
my $closeRep = <<CLOSE; | 356 | 356 | my $closeRep = <<CLOSE; | |
</body> | 357 | 357 | </body> | |
</html> | 358 | 358 | </html> | |
CLOSE | 359 | 359 | CLOSE | |
360 | 360 | |||
print $outh $closeRep; | 361 | 361 | print $outh $closeRep; | |
362 | 362 | |||
close $outh; | 363 | 363 | close $outh; | |
close $tsvh; | 364 | 364 | close $tsvh; | |
} | 365 | 365 | } | |
366 | 366 | |||
367 | 367 | |||
#========================================================================== | 368 | 368 | #========================================================================== | |
#Generate domain data for the html report | 369 | 369 | #Generate domain data for the html report | |
370 | 370 | |||
371 | 371 | |||
sub generate_domain_data { | 372 | 372 | sub generate_domain_data { | |
373 | 373 | |||
my ($q, $s) = @_; | 374 | 374 | my ($q, $s) = @_; | |
375 | 375 | |||
#Format of PFAM hash: | 376 | 376 | #Format of PFAM hash: | |
# push (@{ $out->{$qacc}->{$pfamID} }, | 377 | 377 | # push (@{ $out->{$qacc}->{$pfamID} }, | |
# {pfamid=> $pfamID, dlen=>$pfamLen, dstart=>$dstart, dend=>$dend, evalue=>$eval, | 378 | 378 | # {pfamid=> $pfamID, dlen=>$pfamLen, dstart=>$dstart, dend=>$dend, evalue=>$eval, | |
# dname=>$pfamName, def=>$def, qlen=>$qlen, qstart=>$qstart, qend=>$qend }); | 379 | 379 | # dname=>$pfamName, def=>$def, qlen=>$qlen, qstart=>$qstart, qend=>$qend }); | |
380 | 380 | |||
381 | 381 | |||
382 | 382 | |||
383 | 383 | |||
my @cols = qw(Query Domain Clan Dom_length E-value Dom_start Dom_end Q_Start Q_end Dom_Name Dom_Info); | 384 | 384 | my @cols = qw(Query Domain Clan Dom_length E-value Dom_start Dom_end Q_Start Q_end Dom_Name Dom_Info); | |
my $colStr = " <th>" . join ("</th>\n <th>", @cols) . | 385 | 385 | my $colStr = " <th>" . join ("</th>\n <th>", @cols) . | |
"</th>\n"; | 386 | 386 | "</th>\n"; | |
387 | 387 | |||
my $header =<<HEADER; | 388 | 388 | my $header =<<HEADER; | |
<table border='1', style='width:100%'> | 389 | 389 | <table border='1', style='width:100%'> | |
<tr> | 390 | 390 | <tr> | |
$colStr | 391 | 391 | $colStr | |
</tr> | 392 | 392 | </tr> | |
HEADER | 393 | 393 | HEADER | |
394 | 394 | |||
395 | 395 | |||
my $res = ""; | 396 | 396 | my $res = ""; | |
foreach my $prot ($q, $s) { | 397 | 397 | foreach my $prot ($q, $s) { | |
398 | 398 | |||
if (exists $pfamHits{$prot}) { | 399 | 399 | if (exists $pfamHits{$prot}) { | |
my @Doms = keys %{ $pfamHits{$prot} }; | 400 | 400 | my @Doms = keys %{ $pfamHits{$prot} }; | |
401 | 401 | |||
foreach my $d (@Doms) { | 402 | 402 | foreach my $d (@Doms) { | |
403 | 403 | |||
my $clan = ($clans{$d})? $clans{$d} : "N/A"; | 404 | 404 | my $clan = ($clans{$d})? $clans{$d} : "N/A"; | |
405 | 405 | |||
my @hits = @{ $pfamHits{$prot}{$d} }; | 406 | 406 | my @hits = @{ $pfamHits{$prot}{$d} }; | |
foreach my $hit (@hits) { | 407 | 407 | foreach my $hit (@hits) { | |
my $dlen = $hit->{dlen}; | 408 | 408 | my $dlen = $hit->{dlen}; | |
my $eval = $hit->{evalue}; | 409 | 409 | my $eval = $hit->{evalue}; | |
my $qstart = $hit->{qstart}; | 410 | 410 | my $qstart = $hit->{qstart}; | |
my $qend = $hit->{qend}; | 411 | 411 | my $qend = $hit->{qend}; | |
my $dstart = $hit->{dstart}; | 412 | 412 | my $dstart = $hit->{dstart}; | |
my $dend = $hit->{dend}; | 413 | 413 | my $dend = $hit->{dend}; | |
my $name = $hit->{dname}; | 414 | 414 | my $name = $hit->{dname}; | |
my $def = $hit->{def}; | 415 | 415 | my $def = $hit->{def}; | |
416 | 416 | |||
$res .=<<ROW; | 417 | 417 | $res .=<<ROW; | |
<tr> | 418 | 418 | <tr> | |
<td class="pfam">$prot</td> | 419 | 419 | <td class="pfam">$prot</td> | |
<td class="pfam">$d</td> | 420 | 420 | <td class="pfam">$d</td> | |
<td class="pfam">$clan</td> | 421 | 421 | <td class="pfam">$clan</td> | |
<td class="pfam">$dlen</td> | 422 | 422 | <td class="pfam">$dlen</td> | |
<td class="pfam">$eval</td> | 423 | 423 | <td class="pfam">$eval</td> | |
<td class="pfam">$dstart</td> | 424 | 424 | <td class="pfam">$dstart</td> | |
<td class="pfam">$dend</td> | 425 | 425 | <td class="pfam">$dend</td> | |
<td class="pfam">$qstart</td> | 426 | 426 | <td class="pfam">$qstart</td> | |
<td class="pfam">$qend</td> | 427 | 427 | <td class="pfam">$qend</td> | |
<td class="pfam">$name</td> | 428 | 428 | <td class="pfam">$name</td> | |
<td class="pfam">$def</td> | 429 | 429 | <td class="pfam">$def</td> | |
</tr> | 430 | 430 | </tr> | |
ROW | 431 | 431 | ROW | |
} | 432 | 432 | } | |
} | 433 | 433 | } | |
} | 434 | 434 | } | |
} | 435 | 435 | } | |
436 | 436 | |||
#Return final result | 437 | 437 | #Return final result | |
if ($res) { | 438 | 438 | if ($res) { | |
$header .= $res; | 439 | 439 | $header .= $res; | |
$header .= " </table>\n"; | 440 | 440 | $header .= " </table>\n"; | |
return $header; | 441 | 441 | return $header; | |
} | 442 | 442 | } | |
else { | 443 | 443 | else { | |
return $res; | 444 | 444 | return $res; | |
} | 445 | 445 | } | |
} | 446 | 446 | } | |
447 | 447 | |||
448 | 448 | |||
449 | 449 | |||
450 | 450 | |||
#========================================================================== | 451 | 451 | #========================================================================== | |
#Run quod on the query, subject and the alignment. | 452 | 452 | #Run quod on the query, subject and the alignment. | |
453 | 453 | |||
454 | 454 | |||
#quod.py -q -l "HEB99829" -o plot.png --width 15 --edgecolor red --xticks 25 --no-tms +0 --add-tms 9-32 43-67 98-121 132-151 164-181 192-215 224-241:orange -w 17-245:+2.7:+:Alignment --region-font 12 --add-region 20-245:'PF07556':-2.8,-2.6:red,black:tc --mark +0:K,R,H:black --xlim 0 400 -- HEB99829.faa | 455 | 455 | #quod.py -q -l "HEB99829" -o plot.png --width 15 --edgecolor red --xticks 25 --no-tms +0 --add-tms 9-32 43-67 98-121 132-151 164-181 192-215 224-241:orange -w 17-245:+2.7:+:Alignment --region-font 12 --add-region 20-245:'PF07556':-2.8,-2.6:red,black:tc --mark +0:K,R,H:black --xlim 0 400 -- HEB99829.faa | |
456 | 456 | |||
457 | 457 | |||
sub run_quod { | 458 | 458 | sub run_quod { | |
459 | 459 | |||
my ($q, $s, $qs, $qe, $ss, $se, $qseq, $sseq) = @_; | 460 | 460 | my ($q, $s, $qs, $qe, $ss, $se, $qseq, $sseq) = @_; | |
461 | 461 | |||
462 | 462 | |||
#extract sequences for query and subject | 463 | 463 | #extract sequences for query and subject | |
extract_full_sequences($q,$s); | 464 | 464 | extract_full_sequences($q,$s); | |
465 | 465 | |||
466 | 466 | |||
#----------------------------------------------------------------- | 467 | 467 | #----------------------------------------------------------------- | |
#Run quod for the alignment | 468 | 468 | #Run quod for the alignment | |
469 | 469 | |||
#First save aligned segments to files | 470 | 470 | #First save aligned segments to files | |
my $qalnFile ="$seqDir/${q}_aln.faa"; | 471 | 471 | my $qalnFile ="$seqDir/${q}_aln.faa"; | |
open(my $qfh, '>', $qalnFile) || die $!; | 472 | 472 | open(my $qfh, '>', $qalnFile) || die $!; | |
print $qfh ">$q alignment\n$qseq\n"; | 473 | 473 | print $qfh ">$q alignment\n$qseq\n"; | |
close $qfh; | 474 | 474 | close $qfh; | |
475 | 475 | |||
my $salnFile ="$seqDir/${s}_aln.faa"; | 476 | 476 | my $salnFile ="$seqDir/${s}_aln.faa"; | |
open(my $sfh, '>', $salnFile) || die $!; | 477 | 477 | open(my $sfh, '>', $salnFile) || die $!; | |
print $sfh ">$s alignment\n$sseq\n"; | 478 | 478 | print $sfh ">$s alignment\n$sseq\n"; | |
close $sfh; | 479 | 479 | close $sfh; | |
480 | 480 | |||
my $labelsStr = "--axis-font 18.0 --tick-font 15.0 --title-font 20.0 --xlabel Residues --ylabel Hydropathy --width 15 --xticks 25"; | 481 | 481 | my $labelsStr = "--axis-font 18.0 --tick-font 15.0 --title-font 20.0 --xlabel Residues --ylabel Hydropathy --width 15 --xticks 25"; | |
my $ylimStr = "--ylim -3 3"; | 482 | 482 | my $ylimStr = "--ylim -3 3"; | |
483 | 483 | |||
484 | 484 | |||
#Note alnquod requires to add the extension to the image name | 485 | 485 | #Plot alignment | |
my $alnFig = "$plotsDir/${q}_vs_${s}_qs${qs}_qe${qe}_ss${ss}_se${se}.png"; | 486 | 486 | my $alnFig = "$plotsDir/${q}_vs_${s}_qs${qs}_qe${qe}_ss${ss}_se${se}.png"; | |
my $cmd1 = qq(quod.py -q $labelsStr -l "$q (red) and $s (blue)" -o $alnFig $ylimStr --edgecolor +0:red +1:blue --facecolor +0:orange +1:cyan --multi frag -- $qalnFile $seqDir/${q}.faa $salnFile $seqDir/${s}.faa); | 487 | 487 | my $cmd1 = qq(quod.py -q $labelsStr -l "$q (red) and $s (blue)" -o $alnFig $ylimStr --edgecolor +0:red +1:blue --facecolor +0:orange +1:cyan --multi frag -- $qalnFile $seqDir/${q}.faa $salnFile $seqDir/${s}.faa); | |
# print "$cmd1\n\n"; | 488 | 488 | # print "$cmd1\n\n"; | |
# exit; | 489 | 489 | # exit; | |
system $cmd1 unless (-f "${alnFig}"); | 490 | 490 | system $cmd1 unless (-f "${alnFig}"); | |
return undef unless (-f "${alnFig}"); | 491 | 491 | return undef unless (-f "${alnFig}"); | |
492 | 492 | |||
493 | 493 | |||
#----------------------------------------------------------------- | 494 | 494 | #----------------------------------------------------------------- | |
#Run quod for the full sequencess of the query and subject proteins | 495 | 495 | #Run quod for the full sequencess of the query and subject proteins | |
496 | 496 | |||
497 | 497 | |||
#Extract TMS coordinates for query | 498 | 498 | #Extract TMS coordinates for query | |
die "Error: no hmmtop results for: $q" unless (exists $hmmtopHits{$q}); | 499 | 499 | die "Error: no hmmtop results for: $q" unless (exists $hmmtopHits{$q}); | |
my $qTMS = ""; | 500 | 500 | my $qTMS = ""; | |
if (scalar @{ $hmmtopHits{$q}{coords} } > 0) { | 501 | 501 | if (scalar @{ $hmmtopHits{$q}{coords} } > 0) { | |
$qTMS = "--add-tms " . join(",", @{ $hmmtopHits{$q}{coords} }) . ":orange"; | 502 | 502 | $qTMS = "--add-tms " . join(",", @{ $hmmtopHits{$q}{coords} }) . ":orange"; | |
} | 503 | 503 | } | |
504 | 504 | |||
505 | 505 | |||
#Plot query hydropathy | 506 | 506 | #Plot query hydropathy | |
my $qPfam = get_pfam_coords_for_quod($q, "red"); | 507 | 507 | my $qPfam = get_pfam_coords_for_quod($q, "red"); | |
my $qName = "$plotsDir/${q}_vs_${s}_qaln_qs${qs}_qe${qe}.png"; | 508 | 508 | my $qName = "$plotsDir/${q}_vs_${s}_qaln_qs${qs}_qe${qe}.png"; | |
my $cmd2 = qq(quod.py -q $labelsStr -l "$q" -o $qName --edgecolor red $ylimStr -w ${qs}-${qe}:+2.7:+:Alignment --no-tms +0 $qTMS $qPfam -- $seqDir/${q}.faa); | 509 | 509 | my $cmd2 = qq(quod.py -q $labelsStr -l "$q" -o $qName --edgecolor red $ylimStr -w ${qs}-${qe}:+2.7:+:Alignment --no-tms +0 $qTMS $qPfam -- $seqDir/${q}.faa); | |
# print "$cmd2\n\n"; | 510 | 510 | # print "$cmd2\n\n"; | |
# exit; | 511 | 511 | # exit; | |
system $cmd2 unless (-f $qName); | 512 | 512 | system $cmd2 unless (-f $qName); | |
return undef unless (-f $qName); | 513 | 513 | return undef unless (-f $qName); | |
514 | 514 | |||
515 | 515 | |||
516 | 516 | |||
#TMS coords for the subject | 517 | 517 | #TMS coords for the subject | |
die "Error: no hmmtop results for: $s" unless (exists $hmmtopHits{$s}); | 518 | 518 | die "Error: no hmmtop results for: $s" unless (exists $hmmtopHits{$s}); | |
my $sTMS = ""; | 519 | 519 | my $sTMS = ""; | |
if (scalar @{ $hmmtopHits{$s}{coords} } > 0) { | 520 | 520 | if (scalar @{ $hmmtopHits{$s}{coords} } > 0) { | |
$sTMS = "--add-tms " . join(",", @{ $hmmtopHits{$s}{coords} }) . ":cyan"; | 521 | 521 | $sTMS = "--add-tms " . join(",", @{ $hmmtopHits{$s}{coords} }) . ":cyan"; | |
} | 522 | 522 | } | |
523 | 523 | |||
#Plot Subject hydropaty | 524 | 524 | #Plot Subject hydropaty | |
my $sPfam = get_pfam_coords_for_quod($s, "blue"); | 525 | 525 | my $sPfam = get_pfam_coords_for_quod($s, "blue"); | |
my $sName = "$plotsDir/${q}_vs_${s}_saln_ss${ss}_se${se}.png"; | 526 | 526 | my $sName = "$plotsDir/${q}_vs_${s}_saln_ss${ss}_se${se}.png"; | |
my $cmd3 = qq(quod.py -q $labelsStr -l "$s" -o $sName --edgecolor blue $ylimStr -w ${ss}-${se}:+2.7:+:Alignment --no-tms +0 $sTMS $sPfam -- $seqDir/${s}.faa); | 527 | 527 | my $cmd3 = qq(quod.py -q $labelsStr -l "$s" -o $sName --edgecolor blue $ylimStr -w ${ss}-${se}:+2.7:+:Alignment --no-tms +0 $sTMS $sPfam -- $seqDir/${s}.faa); | |
# print "$cmd3\n\n"; | 528 | 528 | # print "$cmd3\n\n"; | |
# exit; | 529 | 529 | # exit; | |
system $cmd3 unless (-f $sName); | 530 | 530 | system $cmd3 unless (-f $sName); | |
return undef unless (-f $sName); | 531 | 531 | return undef unless (-f $sName); | |
532 | 532 | |||
533 | 533 | |||
return 1; | 534 | 534 | return 1; | |
} | 535 | 535 | } | |
536 | 536 | |||
#========================================================================== | 537 | 537 | #========================================================================== | |
#Get the string for quod that will plot the PFAM domains | 538 | 538 | #Get the string for quod that will plot the PFAM domains | |
539 | 539 | |||
sub get_pfam_coords_for_quod { | 540 | 540 | sub get_pfam_coords_for_quod { | |
541 | 541 | |||
my ($prot, $color) = @_; | 542 | 542 | my ($prot, $color) = @_; | |
543 | 543 | |||
#Format of PFAM hash: | 544 | 544 | #Format of PFAM hash: | |
# push (@{ $out->{$qacc}->{$pfamID} }, | 545 | 545 | # push (@{ $out->{$qacc}->{$pfamID} }, | |
# {pfamid=> $pfamID, dlen=>$pfamLen, dstart=>$dstart, dend=>$dend, | 546 | 546 | # {pfamid=> $pfamID, dlen=>$pfamLen, dstart=>$dstart, dend=>$dend, | |
# def=>$def, qlen=>$qlen, qstart=>$qstart, qend=>$qend }); | 547 | 547 | # def=>$def, qlen=>$qlen, qstart=>$qstart, qend=>$qend }); | |
548 | 548 | |||
549 | 549 | |||
550 | 550 | |||
my $str = ""; | 551 | 551 | my $str = ""; | |
552 | 552 | |||
if (exists $pfamHits{$prot}) { | 553 | 553 | if (exists $pfamHits{$prot}) { | |
my @Doms = keys %{ $pfamHits{$prot} }; | 554 | 554 | my @Doms = keys %{ $pfamHits{$prot} }; | |
my $dcnt = 0; | 555 | 555 | my $dcnt = 0; | |
$str = "--region-font 12 --add-region "; | 556 | 556 | $str = "--region-font 12 --add-region "; | |
foreach my $d (@Doms) { | 557 | 557 | foreach my $d (@Doms) { | |
558 | 558 | |||
my @hits = @{ $pfamHits{$prot}{$d} }; | 559 | 559 | my @hits = @{ $pfamHits{$prot}{$d} }; | |
foreach my $hit (@hits) { | 560 | 560 | foreach my $hit (@hits) { | |
my $left = $hit->{qstart}; | 561 | 561 | my $left = $hit->{qstart}; | |
my $right = $hit->{qend}; | 562 | 562 | my $right = $hit->{qend}; | |
563 | 563 | |||
my $yposl = -2.8 + $dcnt * 0.4; #domain bottom coord | 564 | 564 | my $yposl = -2.8 + $dcnt * 0.4; #domain bottom coord | |
my $yposh = $yposl + 0.15; #domain height coord | 565 | 565 | my $yposh = $yposl + 0.15; #domain height coord | |
566 | 566 | |||
$str .= "${left}-${right}:'${d}':${yposl},${yposh}:$color,black:tc "; | 567 | 567 | $str .= "${left}-${right}:'${d}':${yposl},${yposh}:$color,black:tc "; | |
$dcnt++; | 568 | 568 | $dcnt++; | |
} | 569 | 569 | } | |
} | 570 | 570 | } | |
} | 571 | 571 | } | |
572 | 572 | |||
return $str; | 573 | 573 | return $str; | |
574 | 574 | |||
} | 575 | 575 | } | |
576 | 576 | |||
577 | 577 | |||
#========================================================================== | 578 | 578 | #========================================================================== | |
#Extract the full sequences of the query and subject proteins | 579 | 579 | #Extract the full sequences of the query and subject proteins | |
#Examples: AKM80767.1 | 580 | 580 | #Examples: AKM80767.1 | |
581 | 581 | |||
582 | 582 | |||
583 | 583 | |||
sub extract_full_sequences { | 584 | 584 | sub extract_full_sequences { | |
585 | 585 | |||
my ($q, $s) = @_; | 586 | 586 | my ($q, $s) = @_; | |
587 | 587 | |||
588 | 588 | |||
my $q_seq = "$seqDir/${q}.faa"; | 589 | 589 | my $q_seq = "$seqDir/${q}.faa"; | |
my $s_seq = "$seqDir/${s}.faa"; | 590 | 590 | my $s_seq = "$seqDir/${s}.faa"; | |
591 | 591 | |||
#extract the query secuence from tcdb and the subject from the custom blastdb | 592 | 592 | #extract the query secuence from tcdb and the subject from the custom blastdb | |
my $cmd1 = qq(blastdbcmd -db $blastdb -entry $q -target_only -out $q_seq); | 593 | 593 | my $cmd1 = qq(blastdbcmd -db $blastdb -entry $q -target_only -out $q_seq); | |
system "$cmd1" unless (-f $q_seq && !(-z $q_seq)); | 594 | 594 | system "$cmd1" unless (-f $q_seq && !(-z $q_seq)); | |
die "Could not extract sequence for $q" unless (-f $q_seq && !(-z $q_seq)); | 595 | 595 | die "Could not extract sequence for $q" unless (-f $q_seq && !(-z $q_seq)); | |
596 | 596 | |||
597 | 597 | |||
my $cmd2 = qq(blastdbcmd -db $blastdb -entry $s -target_only -out $s_seq); | 598 | 598 | my $cmd2 = qq(blastdbcmd -db $blastdb -entry $s -target_only -out $s_seq); | |
system "$cmd2" unless (-f $s_seq && !(-z $s_seq)); | 599 | 599 | system "$cmd2" unless (-f $s_seq && !(-z $s_seq)); | |
die "Could not extract sequence for $s" unless (-f $s_seq && !(-z $s_seq)); | 600 | 600 | die "Could not extract sequence for $s" unless (-f $s_seq && !(-z $s_seq)); | |
} | 601 | 601 | } | |
602 | 602 | |||
603 | 603 | |||
604 | 604 | |||
605 | 605 | |||
606 | 606 | |||
607 | 607 | |||
608 | 608 | |||
#========================================================================== | 609 | 609 | #========================================================================== | |
#Sort alignmnet results by E-value | 610 | 610 | #Sort alignmnet results by E-value | |
611 | 611 | |||
sub by_evalue { | 612 | 612 | sub by_evalue { | |
$a->{evalue} <=> $b->{evalue}; | 613 | 613 | $a->{evalue} <=> $b->{evalue}; | |
} | 614 | 614 | } | |
615 | 615 | |||
616 | 616 | |||
617 | 617 | |||
#========================================================================== | 618 | 618 | #========================================================================== | |
#Run PFAM, hmmtop and parse results | 619 | 619 | #Run PFAM, hmmtop and parse results | |
620 | 620 | |||
621 | 621 | |||
sub run_pfam_hmmtop { | 622 | 622 | sub run_pfam_hmmtop { | |
my ($pfamOut, $hmmtopOut, $pfamClans) = @_; | 623 | 623 | my ($pfamOut, $hmmtopOut, $pfamClans) = @_; | |
624 | 624 | |||
625 | 625 | |||
#---------------------------------------------------------------------- | 626 | 626 | #---------------------------------------------------------------------- | |
#Generate blast DB for easy sequence retrieval | 627 | 627 | #Generate blast DB for easy sequence retrieval | |
628 | 628 | |||
print "Generate Blast DB with sequences for fast sequence retrieval...\n"; | 629 | 629 | print "Generate Blast DB with sequences for fast sequence retrieval...\n"; | |
630 | 630 | |||
#Get the sequences for which hmmscan will run | 631 | 631 | #Get the sequences for which hmmscan will run | |
my $allSeqsFile = "$seqDir/all_seqs.faa"; | 632 | 632 | my $allSeqsFile = "$seqDir/all_seqs.faa"; | |
system qq(cat $qfile $sfile > $allSeqsFile) unless (-f $allSeqsFile && !(-z $allSeqsFile)); | 633 | 633 | system qq(cat $qfile $sfile > $allSeqsFile) unless (-f $allSeqsFile && !(-z $allSeqsFile)); | |
die "Could not generate file: $allSeqsFile" unless (-f $allSeqsFile && !(-z $allSeqsFile)); | 634 | 634 | die "Could not generate file: $allSeqsFile" unless (-f $allSeqsFile && !(-z $allSeqsFile)); | |
635 | 635 | |||
636 | 636 | |||
#Generate blastdb ...assuming there are no duplicate sequences. | 637 | 637 | #Generate blastdb ...assuming there are no duplicate sequences. | |
my $cmd1 = qq(makeblastdb -dbtype prot -in $allSeqsFile -title '$qlabel plus $slabel' -parse_seqids -hash_index -out $blastdb); | 638 | 638 | my $cmd1 = qq(makeblastdb -dbtype prot -in $allSeqsFile -title '$qlabel plus $slabel' -parse_seqids -hash_index -out $blastdb); | |
print "$cmd1\n"; | 639 | 639 | print "$cmd1\n"; | |
system $cmd1 unless (-f "${blastdb}.pin"); | 640 | 640 | system $cmd1 unless (-f "${blastdb}.pin"); | |
system "rm $allSeqsFile" if (-f $allSeqsFile); | 641 | 641 | system "rm $allSeqsFile" if (-f $allSeqsFile); | |
642 | 642 | |||
643 | 643 | |||
#---------------------------------------------------------------------- | 644 | 644 | #---------------------------------------------------------------------- | |
#Get the accessions of the top hits in the alignments | 645 | 645 | #Get the accessions of the top hits in the alignments | |
646 | 646 | |||
#get the accessions with significant hits | 647 | 647 | #get the accessions with significant hits | |
my %accList = (); | 648 | 648 | my %accList = (); | |
foreach my $hit (@alnHits) { | 649 | 649 | foreach my $hit (@alnHits) { | |
$accList{$hit->{qacc}} = 1; | 650 | 650 | $accList{$hit->{qacc}} = 1; | |
$accList{$hit->{sacc}} = 1; | 651 | 651 | $accList{$hit->{sacc}} = 1; | |
} | 652 | 652 | } | |
653 | 653 | |||
654 | 654 | |||
#Save accessions to a file | 655 | 655 | #Save accessions to a file | |
my $idFile = "$seqDir/top_hits_accs.txt"; | 656 | 656 | my $idFile = "$seqDir/top_hits_accs.txt"; | |
unless (-f $idFile) { | 657 | 657 | unless (-f $idFile) { | |
open (my $afh, ">", $idFile) || die $!; | 658 | 658 | open (my $afh, ">", $idFile) || die $!; | |
print $afh join("\n", keys %accList), "\n"; | 659 | 659 | print $afh join("\n", keys %accList), "\n"; | |
close $afh; | 660 | 660 | close $afh; | |
} | 661 | 661 | } | |
662 | 662 | |||
#---------------------------------------------------------------------- | 663 | 663 | #---------------------------------------------------------------------- | |
#Extract full sequences for top hits. | 664 | 664 | #Extract full sequences for top hits. | |
665 | 665 | |||
my $topHitsSeqs = "$seqDir/topHits.faa"; | 666 | 666 | my $topHitsSeqs = "$seqDir/topHits.faa"; | |
my $cmdTopHits = qq(blastdbcmd -db $blastdb -entry_batch $idFile -target_only -out $topHitsSeqs); | 667 | 667 | my $cmdTopHits = qq(blastdbcmd -db $blastdb -entry_batch $idFile -target_only -out $topHitsSeqs); | |
system $cmdTopHits unless (-f $topHitsSeqs && !(-z $topHitsSeqs)); | 668 | 668 | system $cmdTopHits unless (-f $topHitsSeqs && !(-z $topHitsSeqs)); | |
669 | 669 | |||
670 | 670 | |||
#---------------------------------------------------------------------- | 671 | 671 | #---------------------------------------------------------------------- | |
#run hmmscan on all the sequences for both files | 672 | 672 | #run hmmscan on all the sequences for both files | |
673 | 673 | |||
print "\nRunning hmmscan and parsing output....\n"; | 674 | 674 | print "\nRunning hmmscan and parsing output....\n"; | |
675 | 675 | |||
my $pfamDB = ($ENV{PFAMDB})? $ENV{PFAMDB} : "$ENV{RESEARCH_DATA}/pfam/pfamdb/Pfam-A.hmm"; | 676 | 676 | my $pfamDB = ($ENV{PFAMDB})? $ENV{PFAMDB} : "$ENV{RESEARCH_DATA}/DB/domainDBs/xfamDB/Pfam-A.hmm"; | |
my $cmd2 = qq(hmmscan --cpu 4 --noali --cut_ga -o /dev/null --domtblout $pfamFile $pfamDB $topHitsSeqs); | 677 | 677 | my $cmd2 = qq(hmmscan --cpu 4 --noali --cut_ga -o /dev/null --domtblout $pfamFile $pfamDB $topHitsSeqs); | |
system $cmd2 unless (-f $pfamFile && !(-z $pfamFile)); | 678 | 678 | system $cmd2 unless (-f $pfamFile && !(-z $pfamFile)); | |
679 | 679 | |||
680 | 680 | |||
#parse Pfam output | 681 | 681 | #parse Pfam output | |
TCDB::Assorted::parse_pfam($pfamFile, $pfamOut, $pfamClans); | 682 | 682 | TCDB::Assorted::parse_pfam($pfamFile, $pfamOut, $pfamClans); | |
# print Data::Dumper->Dump([$pfamOut, $pfamClans ], [qw(*pfamOut *pfamClans )]); | 683 | 683 | # print Data::Dumper->Dump([$pfamOut, $pfamClans ], [qw(*pfamOut *pfamClans )]); | |
# exit; | 684 | 684 | # exit; | |
685 | 685 | |||
#---------------------------------------------------------------------- | 686 | 686 | #---------------------------------------------------------------------- | |
#Extract clans | 687 | 687 | #Extract clans | |
688 | 688 | |||
TCDB::Assorted::get_clans($pfamClans, $filesDir); | 689 | 689 | TCDB::Assorted::get_clans($pfamClans, $filesDir); | |
# print Data::Dumper->Dump([$pfamClans ], [qw(*clans )]); | 690 | 690 | # print Data::Dumper->Dump([$pfamClans ], [qw(*clans )]); | |
# exit; | 691 | 691 | # exit; | |
692 | 692 | |||
#-------------------------------------------------------------------------- | 693 | 693 | #-------------------------------------------------------------------------- | |
#Run hmmtop on top hits for later hydropathy plots. | 694 | 694 | #Run hmmtop on top hits for later hydropathy plots. | |
695 | 695 | |||
print "Runnign HMMTOP and parsing output...\n"; | 696 | 696 | print "Runnign HMMTOP and parsing output...\n"; | |
697 | 697 | |||
my $cmd3 = qq(hmmtop -if=$topHitsSeqs -of=$hmmtopFile -sf=FAS -pi=spred -is=pseudo); | 698 | 698 | my $cmd3 = qq(hmmtop -if=$topHitsSeqs -of=$hmmtopFile -sf=FAS -pi=spred -is=pseudo); | |
system $cmd3 unless (-f $hmmtopFile); | 699 | 699 | system $cmd3 unless (-f $hmmtopFile); | |
system "rm $topHitsSeqs" if (-f $topHitsSeqs); | 700 | 700 | system "rm $topHitsSeqs" if (-f $topHitsSeqs); | |
701 | 701 | |||
#Parse hmmtop output | 702 | 702 | #Parse hmmtop output | |
TCDB::Assorted::parse_hmmtop($hmmtopOut, $hmmtopFile); | 703 | 703 | TCDB::Assorted::parse_hmmtop($hmmtopOut, $hmmtopFile); | |
704 | 704 | |||
} | 705 | 705 | } | |
706 | 706 | |||
707 | 707 | |||
#========================================================================== | 708 | 708 | #========================================================================== | |
#Parse ssearch36 output | 709 | 709 | #Parse ssearch36 output | |
710 | 710 | |||
sub parse_ssearch { | 711 | 711 | sub parse_ssearch { | |
712 | 712 | |||
my $out = shift; | 713 | 713 | my $out = shift; | |
714 | 714 | |||
my $parser = new Bio::SearchIO (-format => 'fasta', -file => $alnFile); | 715 | 715 | my $parser = new Bio::SearchIO (-format => 'fasta', -file => $alnFile); | |
716 | 716 | |||
my $formatTmp = $parser->format(); | 717 | 717 | my $formatTmp = $parser->format(); | |
# print Data::Dumper->Dump([$formatTmp ], [qw(*fileFormat )]); | 718 | 718 | # print Data::Dumper->Dump([$formatTmp ], [qw(*fileFormat )]); | |
# exit; | 719 | 719 | # exit; | |
720 | 720 | |||
while (my $result = $parser->next_result) { | 721 | 721 | while (my $result = $parser->next_result) { | |
722 | 722 | |||
my $qacc = $result->query_name; | 723 | 723 | my $qacc = $result->query_name; | |
my $qlen = $result->query_length; | 724 | 724 | my $qlen = $result->query_length; | |
725 | 725 | |||
726 | 726 | |||
HIT:while (my $hit = $result->next_hit) { | 727 | 727 | HIT:while (my $hit = $result->next_hit) { | |
HSP:while(my $hsp = $hit->next_hsp) { | 728 | 728 | HSP:while(my $hsp = $hit->next_hsp) { | |
729 | 729 | |||
#Alignment parameters | 730 | 730 | #Alignment parameters | |
my $sacc = $hit->name; | 731 | 731 | my $sacc = $hit->name; | |
my $slen = $hit->length; | 732 | 732 | my $slen = $hit->length; | |
my $eval = $hsp->evalue; | 733 | 733 | my $eval = $hsp->evalue; | |
my $id = $hsp->frac_identical('total') * 100; | 734 | 734 | my $id = $hsp->frac_identical('total') * 100; | |
735 | 735 | |||
#coordinates and sequence | 736 | 736 | #coordinates and sequence | |
my $qstart = $hsp->start('query'); | 737 | 737 | my $qstart = $hsp->start('query'); | |
my $qend = $hsp->end('query'); | 738 | 738 | my $qend = $hsp->end('query'); | |
my $sstart = $hsp->start('subject'); | 739 | 739 | my $sstart = $hsp->start('subject'); | |
my $send = $hsp->end('subject'); | 740 | 740 | my $send = $hsp->end('subject'); | |
my $qseq = $hsp->query_string; | 741 | 741 | my $qseq = $hsp->query_string; | |
my $sseq = $hsp->hit_string; | 742 | 742 | my $sseq = $hsp->hit_string; | |
my $hstr = $hsp->homology_string; | 743 | 743 | my $hstr = $hsp->homology_string; | |
744 | 744 | |||
745 | 745 | |||
#Check first that both proteins have the right length | 746 | 746 | #Check first that both proteins have the right length | |
next HSP if (max_length_violation($qlen, $slen, $maxProtLength, $LengthControl)); | 747 | 747 | next HSP if (max_length_violation($qlen, $slen, $maxProtLength, $LengthControl)); | |
748 | 748 | |||
#If the alignment has less than $minLength aas, ignore it | 749 | 749 | #If the alignment has less than $minLength aas, ignore it | |
my $qtmp = $qseq; $qtmp =~ s/-//g; | 750 | 750 | my $qtmp = $qseq; $qtmp =~ s/-//g; | |
my $stmp = $sseq; $stmp =~ s/-//g; | 751 | 751 | my $stmp = $sseq; $stmp =~ s/-//g; | |
next HSP if (length($qtmp) < $minLength || length($stmp) < $minLength); | 752 | 752 | next HSP if (length($qtmp) < $minLength || length($stmp) < $minLength); | |
753 | 753 | |||
#Calculate coverages properly (do not use alignment length as it includes gaps | 754 | 754 | #Calculate coverages properly (do not use alignment length as it includes gaps | |
my $qCov_tmp = ($qend - $qstart + 1) / $qlen * 100; | 755 | 755 | my $qCov_tmp = ($qend - $qstart + 1) / $qlen * 100; | |
my $qcov = ($qCov_tmp > 100.0)? 100 : $qCov_tmp; | 756 | 756 | my $qcov = ($qCov_tmp > 100.0)? 100 : $qCov_tmp; | |
757 | 757 | |||
my $sCov_tmp = ($send - $sstart + 1) / $slen * 100; | 758 | 758 | my $sCov_tmp = ($send - $sstart + 1) / $slen * 100; | |
my $scov = ($sCov_tmp > 100.0)? 100 : $sCov_tmp; | 759 | 759 | my $scov = ($sCov_tmp > 100.0)? 100 : $sCov_tmp; | |
760 | 760 | |||
761 | 761 | |||
if ($eval <= $evalue && TCDB::Assorted::coverage_ok($qcov, $scov, $coverage, $covControl)) { | 762 | 762 | if ($eval <= $evalue && TCDB::Assorted::coverage_ok($qcov, $scov, $coverage, $covControl)) { | |
763 | 763 | |||
push(@{ $out }, {qacc=>$qacc, sacc=>$sacc, qlen=>$qlen, slen=>$slen, qcov=>$qcov, | 764 | 764 | push(@{ $out }, {qacc=>$qacc, sacc=>$sacc, qlen=>$qlen, slen=>$slen, qcov=>$qcov, | |
scov=>$scov, evalue=>$eval, id=>$id, qstart=>$qstart, qend=>$qend, | 765 | 765 | scov=>$scov, evalue=>$eval, id=>$id, qstart=>$qstart, qend=>$qend, | |
sstart=>$sstart, send=>$send, qseq=>$qseq, sseq=>$sseq, hstr=>$hstr}); | 766 | 766 | sstart=>$sstart, send=>$send, qseq=>$qseq, sseq=>$sseq, hstr=>$hstr}); | |
} | 767 | 767 | } | |
} # hsp | 768 | 768 | } # hsp | |
} # hit | 769 | 769 | } # hit | |
} # result | 770 | 770 | } # result | |
} | 771 | 771 | } | |
772 | 772 | |||
773 | 773 | |||
774 | 774 | |||
#========================================================================== | 775 | 775 | #========================================================================== | |
#Test whether the lengths of two proteins are withing a predefined | 776 | 776 | #Test whether the lengths of two proteins are withing a predefined | |
#legnth specified by the user. This are the options for control: | 777 | 777 | #legnth specified by the user. This are the options for control: | |
# X: Either protein is larger than the cutoff | 778 | 778 | # X: Either protein is larger than the cutoff | |
# B: Both proteins are larger than the cutoff | 779 | 779 | # B: Both proteins are larger than the cutoff | |
# Q: Only the query protein is larger than the cutoff | 780 | 780 | # Q: Only the query protein is larger than the cutoff | |
# S: Only the subject protein is larger than the cutoff | 781 | 781 | # S: Only the subject protein is larger than the cutoff | |
# N: No control. Any length is ok. | 782 | 782 | # N: No control. Any length is ok. | |
783 | 783 | |||
sub max_length_violation { | 784 | 784 | sub max_length_violation { | |
785 | 785 | |||
my ($qlen, $slen, $maxLen, $control) = @_; | 786 | 786 | my ($qlen, $slen, $maxLen, $control) = @_; | |
787 | 787 | |||
if ($control eq "X") { | 788 | 788 | if ($control eq "X") { | |
(($qlen >= $maxLen) || ($slen >= $maxLen))? return 1 : return 0; | 789 | 789 | (($qlen >= $maxLen) || ($slen >= $maxLen))? return 1 : return 0; | |
} | 790 | 790 | } | |
791 | 791 | |||
if ($control eq "B") { | 792 | 792 | if ($control eq "B") { | |
(($qlen >= $maxLen) && ($slen >= $maxLen))? return 1 : return 0; | 793 | 793 | (($qlen >= $maxLen) && ($slen >= $maxLen))? return 1 : return 0; | |
} | 794 | 794 | } | |
795 | 795 | |||
if ($control eq "Q") { | 796 | 796 | if ($control eq "Q") { | |
($qlen >= $maxLen)? return 1 : return 0; | 797 | 797 | ($qlen >= $maxLen)? return 1 : return 0; | |
} | 798 | 798 | } | |
799 | 799 | |||
if ($control eq "S") { | 800 | 800 | if ($control eq "S") { | |
($slen >= $maxLen)? return 1 : return 0; | 801 | 801 | ($slen >= $maxLen)? return 1 : return 0; | |
} | 802 | 802 | } | |
803 | 803 | |||
if ($control eq "N") { | 804 | 804 | if ($control eq "N") { | |
return 0; | 805 | 805 | return 0; | |
} | 806 | 806 | } | |
807 | 807 | |||
die "Unknown control mode: $control"; | 808 | 808 | die "Unknown control mode: $control"; | |
809 | 809 | |||
} | 810 | 810 | } | |
811 | 811 | |||
812 | 812 | |||
813 | 813 | |||
814 | 814 | |||
#========================================================================== | 815 | 815 | #========================================================================== | |
#Parse blast output | 816 | 816 | #Parse blast output | |
817 | 817 | |||
818 | 818 | |||
sub parse_blast { | 819 | 819 | sub parse_blast { | |
my $out = shift; | 820 | 820 | my $out = shift; | |
821 | 821 | |||
open (my $fh, "<", $alnFile) || die $!; | 822 | 822 | open (my $fh, "<", $alnFile) || die $!; | |
LINE:while (<$fh>) { | 823 | 823 | LINE:while (<$fh>) { | |
chomp; | 824 | 824 | chomp; | |
next unless ($_); | 825 | 825 | next unless ($_); | |
next if (/^#/); | 826 | 826 | next if (/^#/); | |
827 | 827 | |||
#Blast columns: qacc sacc qlen slen evalue pident qstart qend sstart send qseq sseq | 828 | 828 | #Blast columns: qacc sacc qlen slen evalue pident qstart qend sstart send qseq sseq | |
my ($qacc, $sacc, $qlen, $slen, $eval, $id, $qstart, $qend, $sstart, $send, $qseq, $sseq) = split (/\t/, $_); | 829 | 829 | my ($qacc, $sacc, $qlen, $slen, $eval, $id, $qstart, $qend, $sstart, $send, $qseq, $sseq) = split (/\t/, $_); | |
830 | 830 | |||
831 | 831 | |||
if ($eval <= $evalue) { | 832 | 832 | if ($eval <= $evalue) { | |
833 | 833 | |||
my $qcov = ($qend - $qstart + 1) / $qlen * 100; | 834 | 834 | my $qcov = ($qend - $qstart + 1) / $qlen * 100; | |
my $scov = ($send - $sstart + 1) / $slen * 100; | 835 | 835 | my $scov = ($send - $sstart + 1) / $slen * 100; | |
836 | 836 | |||
if (TCDB::Assorted::coverage_ok($qcov, $scov, $coverage, $covControl)) { | 837 | 837 | if (TCDB::Assorted::coverage_ok($qcov, $scov, $coverage, $covControl)) { | |
838 | 838 | |||
push(@{ $out }, {qacc=>$qacc, sacc=>$sacc, qlen=>$qlen, slen=>$slen, qcov=>$qcov, | 839 | 839 | push(@{ $out }, {qacc=>$qacc, sacc=>$sacc, qlen=>$qlen, slen=>$slen, qcov=>$qcov, | |
scov=>$scov, evalue=>$eval, id=>$id, qstart=>$qstart, qend=>$qend, | 840 | 840 | scov=>$scov, evalue=>$eval, id=>$id, qstart=>$qstart, qend=>$qend, | |
sstart=>$sstart, send=>$send, qseq=>$qseq, sseq=>$sseq}); | 841 | 841 | sstart=>$sstart, send=>$send, qseq=>$qseq, sseq=>$sseq}); | |
} | 842 | 842 | } | |
} | 843 | 843 | } | |
} | 844 | 844 | } | |
close $fh; | 845 | 845 | close $fh; | |
} | 846 | 846 | } | |
847 | 847 | |||
848 | 848 | |||
849 | 849 | |||
850 | 850 | |||
851 | 851 | |||
#========================================================================== | 852 | 852 | #========================================================================== | |
#Run the alignemnt between the two files depending on the program | 853 | 853 | #Run the alignemnt between the two files depending on the program | |
#Selected by the user. | 854 | 854 | #Selected by the user. | |
855 | 855 |
locateFragment.pl
View file @
538dec8
#!/usr/bin/env perl -w | 1 | 1 | #!/usr/bin/env perl -w | |
2 | 2 | |||
use strict; | 3 | 3 | use strict; | |
use warnings; | 4 | 4 | use warnings; | |
use Data::Dumper; | 5 | 5 | use Data::Dumper; | |
6 | 6 | |||
$Data::Dumper::Deepcopy = 1; | 7 | 7 | $Data::Dumper::Deepcopy = 1; | |
8 | 8 | |||
use Getopt::Long; | 9 | 9 | use Getopt::Long; | |
use Bio::SearchIO; | 10 | 10 | use Bio::SearchIO; | |
use Bio::SeqIO; | 11 | 11 | use Bio::SeqIO; | |
12 | 12 | |||
13 | 13 | |||
#Local libraries | 14 | 14 | #Local libraries | |
use TCDB::CheckDependencies; | 15 | 15 | use TCDB::CheckDependencies; | |
use TCDB::Assorted; | 16 | 16 | use TCDB::Assorted; | |
17 | 17 | |||
18 | 18 | |||
########################################################################### | 19 | 19 | ########################################################################### | |
# | 20 | 20 | # | |
# Parse GBLAST output file and return the sequence of the proteins that | 21 | 21 | # Parse GBLAST output file and return the sequence of the proteins that | |
# meet the user's criteria. | 22 | 22 | # meet the user's criteria. | |
# | 23 | 23 | # | |
########################################################################### | 24 | 24 | ########################################################################### | |
25 | 25 | |||
26 | 26 | |||
#This will prevent quod and alnquod from going into interactive mode | 27 | 27 | #This will prevent quod and alnquod from going into interactive mode | |
$ENV{"MPLBACKEND"} = "agg"; | 28 | 28 | $ENV{"MPLBACKEND"} = "agg"; | |
29 | 29 | |||
30 | 30 | |||
#========================================================================== | 31 | 31 | #========================================================================== | |
#Check dependencies | 32 | 32 | #Check dependencies | |
33 | 33 | |||
my @dependencies = ("glsearch36", "blastdbcmd"); | 34 | 34 | my @dependencies = ("glsearch36", "blastdbcmd"); | |
my $CheckDep_obj = new TCDB::CheckDependencies(); | 35 | 35 | my $CheckDep_obj = new TCDB::CheckDependencies(); | |
$CheckDep_obj -> dependencies_list(\@dependencies); | 36 | 36 | $CheckDep_obj -> dependencies_list(\@dependencies); | |
$CheckDep_obj -> checkDependencies; | 37 | 37 | $CheckDep_obj -> checkDependencies; | |
38 | 38 | |||
39 | 39 | |||
40 | 40 | |||
41 | 41 | |||
#========================================================================== | 42 | 42 | #========================================================================== | |
#Read command line arguments | 43 | 43 | #Read command line arguments | |
44 | 44 | |||
my $fragment = undef; | 45 | 45 | my $fragment = undef; | |
my $accession = undef; | 46 | 46 | my $accession = undef; | |
my $accFile = undef | 47 | 47 | my $accFile = undef; | |
my $outdir = undef; | 48 | 48 | my $outdir = undef; | |
my $blastdb = undef; | 49 | 49 | my $blastdb = "uniref90"; | |
my $evalue = 1e-2; | 50 | 50 | my $evalue = 1e-2; | |
my $subMatrix = 'BL50'; | 51 | 51 | my $subMatrix = 'BL50'; | |
my $quiet = 0; | 52 | 52 | my $quiet = 0; | |
my $interactive = 0; | 53 | 53 | my $interactive = 0; | |
54 | 54 | |||
read_command_line(); | 55 | 55 | read_command_line(); | |
56 | 56 | |||
#print Data::Dumper->Dump([$fragment, $accession, $accFile, $outdir, $blastdb, $evalue, $quiet], | 57 | 57 | #print Data::Dumper->Dump([$fragment, $accession, $accFile, $outdir, $blastdb, $evalue, $quiet], | |
# [qw(*fragment *accession *accFile *outdir *blastdb *evalue *quiet)]); | 58 | 58 | # [qw(*fragment *accession *accFile *outdir *blastdb *evalue *quiet)]); | |
#exit; | 59 | 59 | #exit; | |
60 | 60 | |||
61 | 61 | |||
62 | 62 | |||
63 | 63 | |||
#========================================================================== | 64 | 64 | #========================================================================== | |
#Get the sequences for the fragment and full proteins in files | 65 | 65 | #Get the sequences for the fragment and full proteins in files | |
66 | 66 | |||
67 | 67 | |||
my ($fragFile, $protFile) = @{ getSequences($fragment, $accession) }; | 68 | 68 | my ($fragFile, $protFile) = @{ getSequences($fragment, $accession) }; | |
#print Data::Dumper->Dump([$fragFile, $protFile ], [qw(*fragFile *protFile)]); | 69 | 69 | #print Data::Dumper->Dump([$fragFile, $protFile ], [qw(*fragFile *protFile)]); | |
70 | 70 | |||
die "Both files must exist:\n $fragFile\n $protFile" unless (-f $fragFile && -f $protFile); | 71 | 71 | die "Both files must exist:\n $fragFile\n $protFile" unless (-f $fragFile && -f $protFile); | |
72 | 72 | |||
#========================================================================== | 73 | 73 | #========================================================================== | |
#align fragment to full protein | 74 | 74 | #align fragment to full protein | |
75 | 75 | |||
my $alignFile = "$outdir/${accession}_glsearch.out"; | 76 | 76 | my $alignFile = "$outdir/${accession}_glsearch.out"; | |
my $cmd = qq(glsearch36 -s BL62 -z 21 -k 10000 -E $evalue -s $subMatrix $fragFile $protFile > $alignFile); | 77 | 77 | my $cmd = qq(glsearch36 -s BL62 -z 21 -k 10000 -E $evalue -s $subMatrix $fragFile $protFile > $alignFile); | |
system $cmd unless (-f $alignFile); | 78 | 78 | system $cmd unless (-f $alignFile); | |
79 | 79 | |||
80 | 80 | |||
#========================================================================== | 81 | 81 | #========================================================================== | |
#Parse glsearch output and run quod | 82 | 82 | #Parse glsearch output and run quod | |
83 | 83 | |||
84 | 84 | |||
run_quod ($alignFile, $protFile); | 85 | 85 | run_quod ($alignFile, $protFile); | |
86 | 86 | |||
87 | 87 | |||
88 | 88 | |||
89 | 89 | |||
90 | 90 | |||
#========================================================================== | 91 | 91 | #========================================================================== | |
################ Subroutines definition beyond ths point ############## | 92 | 92 | ################ Subroutines definition beyond ths point ############## | |
#========================================================================== | 93 | 93 | #========================================================================== | |
94 | 94 | |||
95 | 95 | |||
sub run_quod { | 96 | 96 | sub run_quod { | |
97 | 97 | |||
my ($alignment, $sequence) = @_; | 98 | 98 | my ($alignment, $sequence) = @_; | |
99 | 99 | |||
100 | 100 | |||
my $parser = new Bio::SearchIO (-format => 'fasta', -file => $alignment); | 101 | 101 | my $parser = new Bio::SearchIO (-format => 'fasta', -file => $alignment); | |
102 | 102 | |||
my @res = (); | 103 | 103 | my @res = (); | |
104 | 104 | |||
#---------------------------------------------------------------------- | 105 | 105 | #---------------------------------------------------------------------- | |
#Parse glsearch output | 106 | 106 | #Parse glsearch output | |
107 | 107 | |||
my $hsp_cnt = 1; | 108 | 108 | my $hsp_cnt = 1; | |
while (my $result = $parser->next_result) { | 109 | 109 | while (my $result = $parser->next_result) { | |
110 | 110 | |||
HIT:while (my $hit = $result->next_hit) { | 111 | 111 | HIT:while (my $hit = $result->next_hit) { | |
HSP:while(my $hsp = $hit->next_hsp) { | 112 | 112 | HSP:while(my $hsp = $hit->next_hsp) { | |
113 | 113 | |||
my %data = (); | 114 | 114 | my %data = (); | |
my $key = "hsp_$hsp_cnt"; | 115 | 115 | my $key = "hsp_$hsp_cnt"; | |
my $hId = $hsp->frac_identical('total'); #identity in the alignment | 116 | 116 | my $hId = $hsp->frac_identical('total'); #identity in the alignment | |
my $hSim = $hsp->frac_conserved('total'); #similarity in the alignment | 117 | 117 | my $hSim = $hsp->frac_conserved('total'); #similarity in the alignment | |
118 | 118 | |||
#Alignment parameters | 119 | 119 | #Alignment parameters | |
$data{hsp} = $key; | 120 | 120 | $data{hsp} = $key; | |
$data{evalue} = $hsp->evalue; | 121 | 121 | $data{evalue} = $hsp->evalue; | |
$data{id} = $hId; | 122 | 122 | $data{id} = $hId; | |
$data{sim} = $hSim; | 123 | 123 | $data{sim} = $hSim; | |
124 | 124 | |||
#coordinates in the alignment to plot bars | 125 | 125 | #coordinates in the alignment to plot bars | |
$data{qstart} = $hsp->start('query'); | 126 | 126 | $data{qstart} = $hsp->start('query'); | |
$data{qend} = $hsp->end('query'); | 127 | 127 | $data{qend} = $hsp->end('query'); | |
$data{sstart} = $hsp->start('subject'); | 128 | 128 | $data{sstart} = $hsp->start('subject'); | |
$data{send} = $hsp->end('subject'); | 129 | 129 | $data{send} = $hsp->end('subject'); | |
130 | 130 | |||
push (@res, \%data); | 131 | 131 | push (@res, \%data); | |
$hsp_cnt++; | 132 | 132 | $hsp_cnt++; | |
} | 133 | 133 | } | |
} | 134 | 134 | } | |
} | 135 | 135 | } | |
136 | 136 | |||
die "Error: no match found between fragment and sequence" unless (@res); | 137 | 137 | die "Error: no match found between fragment and sequence" unless (@res); | |
# print Data::Dumper->Dump([\@res ], [qw(*res )]); | 138 | 138 | # print Data::Dumper->Dump([\@res ], [qw(*res )]); | |
139 | 139 | |||
140 | 140 | |||
#---------------------------------------------------------------------- | 141 | 141 | #---------------------------------------------------------------------- | |
#Generate quod plot | 142 | 142 | #Generate quod plot | |
143 | 143 | |||
#Format string for the regions | 144 | 144 | #Format string for the regions | |
my $regions = "--add-tms "; | 145 | 145 | my $regions = "--add-tms "; | |
my $coords = ""; | 146 | 146 | my $coords = ""; | |
foreach my $hit (@res) { | 147 | 147 | foreach my $hit (@res) { | |
148 | 148 | |||
$coords = $hit->{sstart} . "-" . $hit->{send}; | 149 | 149 | $coords = $hit->{sstart} . "-" . $hit->{send}; | |
$regions .= "${coords}:green"; | 150 | 150 | $regions .= "${coords}:green"; | |
151 | 151 | |||
#only the best HSP is required to be plotted | 152 | 152 | #only the best HSP is required to be plotted | |
last; | 153 | 153 | last; | |
} | 154 | 154 | } | |
155 | 155 | |||
my $outPlot = "$outdir/${accession}_map_frag.png"; | 156 | 156 | my $outPlot = "$outdir/${accession}_map_frag.png"; | |
my $qstring = ($quiet)? "-q -o $outPlot" : "-o $outPlot"; | 157 | 157 | my $qstring = ($quiet)? "-q" : ""; | |
my $iString = ($interactive)? "-o $outPlot --show" : ""; | 158 | 158 | my $iString = "-o $outPlot"; | |
159 | 159 | |||
my $cmd = qq(quod.py $qstring $iString -l "$accession ($coords)" --xticks 25 --grid $regions -- $sequence); | 160 | 160 | my $cmd = qq(quod.py $qstring $iString -l "$accession ($coords)" --xticks 25 --grid $regions -- $sequence 2>/dev/null); | |
print "$cmd\n"; | 161 | 161 | #print "$cmd\n"; | |
system $cmd; | 162 | 162 | system $cmd; | |
} | 163 | 163 | } | |
164 | 164 | |||
165 | 165 | |||
166 | 166 | |||
167 | 167 | |||
sub getSequences { | 168 | 168 | sub getSequences { | |
169 | 169 | |||
my ($frag, $acc) = @_; | 170 | 170 | my ($frag, $acc) = @_; | |
171 | 171 | |||
172 | 172 | |||
# print Data::Dumper->Dump([$frag, $acc, $accFile ], [qw(*frag *acc *accFile)]); | 173 | 173 | # print Data::Dumper->Dump([$frag, $acc, $accFile ], [qw(*frag *acc *accFile)]); | |
# exit; | 174 | 174 | # exit; | |
175 | 175 | |||
176 | 176 | |||
#Sequence for full protein | 177 | 177 | #Sequence for full protein | |
my $accSeq = (-f $accFile)? $accFile : "$outdir/${acc}.faa"; | 178 | 178 | my $accSeq = ($accFile && -f $accFile)? $accFile : "$outdir/${acc}.faa"; | |
179 | 179 | |||
180 | 180 | |||
#Save fragment to file | 181 | 181 | #Save fragment to file | |
my $tmpFile = "$outdir/${acc}_frag.faa"; | 182 | 182 | my $tmpFile = "$outdir/${acc}_frag.faa"; | |
183 | 183 | |||
184 | 184 | |||
if (-f $frag) { | 185 | 185 | if (-f $frag) { | |
system "mv $frag $tmpFile" unless (-f $tmpFile); | 186 | 186 | system "mv $frag $tmpFile" unless (-f $tmpFile); | |
} | 187 | 187 | } | |
else { | 188 | 188 | else { | |
unless (-f $tmpFile) { | 189 | 189 | unless (-f $tmpFile) { | |
open (my $fh, ">", $tmpFile) || die $!; | 190 | 190 | open (my $fh, ">", $tmpFile) || die $!; | |
print $fh ">${acc} fragment\n$frag\n"; | 191 | 191 | print $fh ">${acc} fragment\n$frag\n"; | |
close $fh; | 192 | 192 | close $fh; | |
} | 193 | 193 | } | |
} | 194 | 194 | } | |
195 | 195 | |||
unless (-f $accSeq) { | 196 | 196 | unless (-f $accSeq) { | |
#Blast DB to be used | 197 | 197 | #Blast DB to be used | |
my $db = ($blastdb)? $blastdb : 'nr'; | 198 | 198 | my $db = ($blastdb)? $blastdb : 'nr'; | |
199 | 199 | |||
my $cmd = qq(blastdbcmd -db $db -entry $acc -target_only -outfmt '\%f' -out $accSeq); | 200 | 200 | my $cmd = qq(blastdbcmd -db $db -entry $acc -target_only -outfmt '\%f' -out $accSeq); | |
system $cmd; | 201 | 201 | system $cmd; | |
202 | 202 | |||
#Remove the version and annotations from the sequence file | 203 | 203 | #Remove the version and annotations from the sequence file | |
my $cmd2 = qq(perl -i.bkp -pe 's/^\\>(\\w+)\.\*/\\>\$1/;' $accSeq); | 204 | 204 | my $cmd2 = qq(perl -i -pe 's/^\\>(\\w+)\.\*/\\>\$1/;' $accSeq); | |
system $cmd2 unless (-f "${accSeq}.pkp"); | 205 | 205 | system $cmd2 unless (-f "${accSeq}.pkp"); | |
} | 206 | 206 | } | |
207 | 207 | |||
#Return files to be aligned | 208 | 208 | #Return files to be aligned | |
return [$tmpFile, $accSeq]; | 209 | 209 | return [$tmpFile, $accSeq]; | |
} | 210 | 210 | } | |
211 | 211 | |||
212 | 212 | |||
213 | 213 | |||
214 | 214 | |||
#=========================================================================== | 215 | 215 | #=========================================================================== | |
#Read command line and print help | 216 | 216 | #Read command line and print help | |
217 | 217 | |||
218 | 218 | |||
sub read_command_line { | 219 | 219 | sub read_command_line { | |
220 | 220 | |||
print_help() unless (@ARGV); | 221 | 221 | print_help() unless (@ARGV); | |
222 | 222 | |||
my $status = GetOptions( | 223 | 223 | my $status = GetOptions( | |
"i|acc-file=s" => \&read_accFile, | 224 | 224 | "i|acc-file=s" => \&read_accFile, | |
"a|accession=s" => \&read_accession, | 225 | 225 | "a|accession=s" => \&read_accession, | |
"f|fragment=s" => \&read_fragment, | 226 | 226 | "f|fragment=s" => \&read_fragment, | |
"o|outdir=s" => \$outdir, | 227 | 227 | "o|outdir=s" => \$outdir, | |
"bdb|blastdb=s" => \&read_blastdb, | 228 | 228 | "bdb|blastdb=s" => \&read_blastdb, | |
"e|evalue=f" => \$evalue, | 229 | 229 | "e|evalue=f" => \$evalue, | |
"m|sub-matrix=s" => \&read_subMatrix, | 230 | 230 | "m|sub-matrix=s" => \&read_subMatrix, | |
"t|interactive!" => \$interactive, | 231 | 231 | "t|interactive!" => \$interactive, | |
"q|quiet!" => \$quiet, | 232 | 232 | "q|quiet!" => \$quiet, | |
"h|help" => sub { print_help(); }, | 233 | 233 | "h|help" => sub { print_help(); }, | |
"<>" => sub { die "Error: Unknown argument: $_[0]\n"; }); | 234 | 234 | "<>" => sub { die "Error: Unknown argument: $_[0]\n"; }); | |
exit unless ($status); | 235 | 235 | exit unless ($status); | |
236 | 236 | |||
237 | 237 | |||
die "Error: option -f is mandatory." unless ($fragment); | 238 | 238 | die "Error: option -f is mandatory." unless ($fragment); | |
die "Error: options -i or -a are mandatory." unless ($accession || $accFile); | 239 | 239 | die "Error: options -i or -a are mandatory." unless ($accession || $accFile); | |
die "Error: flags -t and -q cannot be set at the same time!" if ($quiet && $interactive); | 240 | 240 | die "Error: flags -t and -q cannot be set at the same time!" if ($quiet && $interactive); | |
241 | 241 | |||
#Default value for output directory | 242 | 242 | #Default value for output directory | |
$outdir = "." unless ($outdir); | 243 | 243 | $outdir = "." unless ($outdir); | |
system "mkdir -p $outdir" unless (-d $outdir); | 244 | 244 | system "mkdir -p $outdir" unless (-d $outdir); | |
245 | 245 | |||
#When accession is not given, take the first part of the input file name as accession | 246 | 246 | #When accession is not given, take the first part of the input file name as accession | |
unless ($accession) { | 247 | 247 | unless ($accession) { | |
my @pathComp = split(/\//, $accFile); | 248 | 248 | my @pathComp = split(/\//, $accFile); | |
$accession = $pathComp[-1]; | 249 | 249 | $accession = $pathComp[-1]; | |
$accession =~ s/(\.faa|\.fasta)$//g; | 250 | 250 | $accession =~ s/(\.faa|\.fasta)$//g; | |
} | 251 | 251 | } | |
252 | 252 | |||
} | 253 | 253 | } | |
254 | 254 | |||
255 | 255 | |||
#========================================================================== | 256 | 256 | #========================================================================== | |
#Option -i | 257 | 257 | #Option -i | |
258 | 258 | |||
sub read_accFile { | 259 | 259 | sub read_accFile { | |
my ($opt, $value) = @_; | 260 | 260 | my ($opt, $value) = @_; | |
261 | 261 | |||
unless (-f $value && !(-z $value)) { | 262 | 262 | unless (-f $value && !(-z $value)) { | |
die "Error in option -i: File with sequences does not exist or is empty!\n"; | 263 | 263 | die "Error in option -i: File with sequences does not exist or is empty!\n"; | |
} | 264 | 264 | } | |
265 | 265 | |||
$accFile = $value; | 266 | 266 | $accFile = $value; | |
} | 267 | 267 | } | |
268 | 268 | |||
269 | 269 | |||
#========================================================================== | 270 | 270 | #========================================================================== | |
#Option -a | 271 | 271 | #Option -a | |
272 | 272 | |||
sub read_accession { | 273 | 273 | sub read_accession { | |
my ($opt, $value) = @_; | 274 | 274 | my ($opt, $value) = @_; | |
275 | 275 | |||
#Remove version number if any | 276 | 276 | #Remove version number if any | |
$value =~ s/\.\d+$//; | 277 | 277 | #$value =~ s/\.\d+$//; |
locfrag.sh
View file @
538dec8
1 | 1 | |||
#Help | 2 | 2 | #Help | |
if [ "$1" = "-h" ] | 3 | 3 | if [ "$1" = "-h" ] | |
then | 4 | 4 | then | |
echo "Locate aligned fragments within 2 full proteins" | 5 | 5 | echo "Locate aligned fragments within 2 full proteins" | |
echo "Arguments:" | 6 | 6 | echo "Arguments:" | |
echo " 1. NCBI Accession of first protein 1" | 7 | 7 | echo " 1. Accession of protein 1" | |
echo " 2. Aligned fragment of protein 1" | 8 | 8 | echo " 2. Aligned fragment of protein 1" | |
echo " 3. NCBI Accession of first protein 2" | 9 | 9 | echo " 3. Accession of protein 2" | |
echo " 4. Aligned fragment of protein 2" | 10 | 10 | echo " 4. Aligned fragment of protein 2" | |
echo " 5. (Optional) substitution matrix to use. (Defaul: BL50)" | 11 | 11 | echo " 5. (optional) blast DB to extract sequences from (Default: uniref90)" | |
echo " 6. (Optional) Indicate whether plots will be shown (Values: show/quiet; Default: show)" | 12 | 12 | echo " 6. (Optional) substitution matrix to use. (Defaul: BL50)" | |
13 | echo " 7. (Optional) Indicate whether plots will be shown (Values: show/quiet; Default: show)" | |||
exit 1 | 13 | 14 | exit 1 | |
fi | 14 | 15 | fi | |
15 | 16 | |||
16 | 17 | |||
17 | 18 | |||
#Define the substitution matrix to work with | 18 | 19 | #Define the substitution matrix to work with | |
mat="BL50" | 19 | 20 | mat="BL50" | |
mode="" | 20 | 21 | mode="" | |
22 | db="uniref90" | |||
21 | 23 | |||
24 | #Identify the blast DB to extract sequences from | |||
25 | if [[ ! -z "$5" ]] && ([[ "$5" != "quiet" ]] && [[ "$5" != "show" ]] && [[ "$5" != "uniref90" ]]) | |||
26 | then | |||
27 | db="$5" | |||
28 | fi | |||
22 | 29 | |||
#Identify the type of substitution matrix, if given | 23 | 30 | #Identify the type of substitution matrix, if given | |
if [[ ! -z "$5" ]] && ([[ "$5" != "quiet" ]] && [[ "$5" != "show" ]]) | 24 | 31 | if [[ ! -z "$6" ]] && ([[ "$6" != "quiet" ]] && [[ "$6" != "show" ]] && [[ "$6" != "uniref90" ]]) | |
then | 25 | 32 | then | |
mat=$5 | 26 | 33 | mat="$6" | |
fi | 27 | 34 | fi | |
28 | 35 | |||
29 | ||||
#Check the mode of operation: quiet/show | 30 | 36 | #Check the mode of operation: quiet/show | |
if [[ "$5" == "quiet" ]] || [[ "$6" == "quiet" ]] | 31 | 37 | if [[ "$6" == "quiet" ]] || [[ "$7" == "quiet" ]] | |
then | 32 | 38 | then | |
mode="-q" | 33 | 39 | mode="-q" | |
fi | 34 | 40 | fi | |
35 | 41 | |||
locateFragment.pl -a $1 -f $2 $mode | 36 | 42 | ||
locateFragment.pl -a $3 -f $4 $mode | 37 | 43 | ||
44 | #localizing fragments | |||
45 | locateFragment.pl -a $1 -f $2 $mode -bdb $db | |||
46 | locateFragment.pl -a $3 -f $4 $mode -bdb $db | |||
47 | ||||
48 | #Aligning fragments and full sequences | |||
alignSeqsFiles.pl -q $1_frag.faa -ql $1_frag -s $3_frag.faa -sl $3_frag -e 0.1 -c 20 -cc X -m $mat | 38 | 49 | alignSeqsFiles.pl -q $1_frag.faa -ql $1_frag -s $3_frag.faa -sl $3_frag -e 0.1 -c 20 -cc X -m $mat | |
alignSeqsFiles.pl -q $1.faa -ql $1 -s $3.faa -sl $3 -e 0.1 -c 5 -cc X -m $mat | 39 | 50 | alignSeqsFiles.pl -q $1.faa -ql $1 -s $3.faa -sl $3 -e 0.1 -c 5 -cc X -m $mat | |
40 | 51 | |||
41 | 52 | |||
#Open html reports if apropriate | 42 | 53 | #Open html reports if apropriate | |
if [[ $mode != "-q" ]] | 43 | 54 | if [[ $mode != "-q" ]] | |
then | 44 | 55 | then |