Commit 68b41f16a35757f498478ba2822e6978013608f2
1 parent
6342e60576
Exists in
master
update tmsRepeat.pl, alignSeqsFiles.pl and locateFragment.pl to work with python3 scripts
Showing 3 changed files with 128 additions and 1080 deletions Side-by-side Diff
alignSeqsFiles.pl
View file @
68b41f1
... | ... | @@ -58,6 +58,8 @@ |
58 | 58 | my $segFilter = 'no'; |
59 | 59 | my $minLength = 30; #Min length of proteins to analyze (without gaps) |
60 | 60 | my $subMatrix = 'BL50'; |
61 | +my $hyd_qylim = undef; #Y-axis limits for query hydropathy plot [low, high] | |
62 | +my $hyd_sylim = undef; #Y-axis limits for subject hydropathy plot [low, high] | |
61 | 63 | |
62 | 64 | #this can be used to remove long sequences from results |
63 | 65 | my $maxProtLength = 100000; #default threshold to allow any length |
... | ... | @@ -443,6 +445,9 @@ |
443 | 445 | #Run quod on the query, subject and the alignment. |
444 | 446 | |
445 | 447 | |
448 | +#quod.py -q -l "HEB99829" -o plot.png --width 15 --edgecolor red --xticks 25 --no-tms +0 --add-tms 9-32 43-67 98-121 132-151 164-181 192-215 224-241:orange -w 17-245:+2.7:+:Alignment --region-font 12 --add-region 20-245:'PF07556':-2.8,-2.6:red,black:tc --mark +0:K,R,H:black --xlim 0 400 -- HEB99829.faa | |
449 | + | |
450 | + | |
446 | 451 | sub run_quod { |
447 | 452 | |
448 | 453 | my ($q, $s, $qs, $qe, $ss, $se, $qseq, $sseq) = @_; |
... | ... | @@ -469,8 +474,9 @@ |
469 | 474 | |
470 | 475 | #Note alnquod requires to add the extension to the image name |
471 | 476 | my $alnFig = "$plotsDir/${q}_vs_${s}_qs${qs}_qe${qe}_ss${ss}_se${se}.png"; |
472 | - my $cmd1 = qq(alnquod.py --grid -q -l "$q (red) and $s (blue)" -o $alnFig --xticks 25 --width 15 -- $qalnFile $seqDir/${q}.faa $salnFile $seqDir/${s}.faa); | |
473 | - #print "$cmd1\n\n"; | |
477 | + my $cmd1 = qq(quod.py -q -l "$q (red) and $s (blue)" -o $alnFig --xticks 25 --width 15 --edgecolor +0:red +1:blue --facecolor +0:orange +1:cyan --multi frag -- $qalnFile $seqDir/${q}.faa $salnFile $seqDir/${s}.faa); | |
478 | +# print "$cmd1\n\n"; | |
479 | +# exit; | |
474 | 480 | system $cmd1 unless (-f "${alnFig}"); |
475 | 481 | return undef unless (-f "${alnFig}"); |
476 | 482 | |
477 | 483 | |
... | ... | @@ -483,17 +489,18 @@ |
483 | 489 | die "Error: no hmmtop results for: $q" unless (exists $hmmtopHits{$q}); |
484 | 490 | my $qTMS = ""; |
485 | 491 | if (scalar @{ $hmmtopHits{$q}{coords} } > 0) { |
486 | - $qTMS = "-at " . join(",", @{ $hmmtopHits{$q}{coords} }) . ":orange"; | |
492 | + $qTMS = "--add-tms " . join(",", @{ $hmmtopHits{$q}{coords} }) . ":orange"; | |
487 | 493 | } |
488 | 494 | |
489 | 495 | |
490 | 496 | #Plot query hydropathy |
491 | 497 | my $qPfam = get_pfam_coords_for_quod($q, "red"); |
492 | - my $qName = "$plotsDir/${q}_vs_${s}_qaln_qs${qs}_qe${qe}"; | |
493 | - my $cmd2 = qq(quod.py --grid -q -l "$q" -o $qName --width 15 --color red --xticks 25 -w ${qs}-${qe}::1 -t png -nt +0 $qTMS $qPfam -- $seqDir/${q}.faa); | |
494 | - #print "$cmd2\n\n"; | |
495 | - system $cmd2 unless (-f "${qName}.png"); | |
496 | - return undef unless (-f "${qName}.png"); | |
498 | + my $qName = "$plotsDir/${q}_vs_${s}_qaln_qs${qs}_qe${qe}.png"; | |
499 | + my $cmd2 = qq(quod.py -q -l "$q" -o $qName --width 15 --edgecolor red --xticks 25 -w ${qs}-${qe}:+2.7:+:Alignment --no-tms +0 $qTMS $qPfam -- $seqDir/${q}.faa); | |
500 | +# print "$cmd2\n\n"; | |
501 | +# exit; | |
502 | + system $cmd2 unless (-f $qName); | |
503 | + return undef unless (-f $qName); | |
497 | 504 | |
498 | 505 | |
499 | 506 | |
500 | 507 | |
... | ... | @@ -501,16 +508,17 @@ |
501 | 508 | die "Error: no hmmtop results for: $s" unless (exists $hmmtopHits{$s}); |
502 | 509 | my $sTMS = ""; |
503 | 510 | if (scalar @{ $hmmtopHits{$s}{coords} } > 0) { |
504 | - $sTMS = "-at " . join(",", @{ $hmmtopHits{$s}{coords} }) . ":cyan"; | |
511 | + $sTMS = "--add-tms " . join(",", @{ $hmmtopHits{$s}{coords} }) . ":cyan"; | |
505 | 512 | } |
506 | 513 | |
507 | 514 | #Plot Subject hydropathy |
508 | 515 | my $sPfam = get_pfam_coords_for_quod($s, "blue"); |
509 | - my $sName = "$plotsDir/${q}_vs_${s}_saln_ss${ss}_se${se}"; | |
510 | - my $cmd3 = qq(quod.py --grid -q -l "$s" -o $sName --width 15 --color blue --xticks 25 -w ${ss}-${se}::1 -t png -nt +0 $sTMS $sPfam -- $seqDir/${s}.faa); | |
511 | - #print "$cmd3\n\n"; | |
512 | - system $cmd3 unless (-f "${sName}.png"); | |
513 | - return undef unless (-f "${sName}.png"); | |
516 | + my $sName = "$plotsDir/${q}_vs_${s}_saln_ss${ss}_se${se}.png"; | |
517 | + my $cmd3 = qq(quod.py -q -l "$s" -o $sName --width 15 --edgecolor blue --xticks 25 -w ${ss}-${se}:+2.7:+:Alignment --no-tms +0 $sTMS $sPfam -- $seqDir/${s}.faa); | |
518 | +# print "$cmd3\n\n"; | |
519 | +# exit; | |
520 | + system $cmd3 unless (-f $sName); | |
521 | + return undef unless (-f $sName); | |
514 | 522 | |
515 | 523 | |
516 | 524 | return 1; |
... | ... | @@ -535,7 +543,7 @@ |
535 | 543 | if (exists $pfamHits{$prot}) { |
536 | 544 | my @Doms = keys %{ $pfamHits{$prot} }; |
537 | 545 | my $dcnt = 0; |
538 | - $str = "--region-font 12 -ar "; | |
546 | + $str = "--region-font 12 --add-region "; | |
539 | 547 | foreach my $d (@Doms) { |
540 | 548 | |
541 | 549 | my @hits = @{ $pfamHits{$prot}{$d} }; |
... | ... | @@ -543,8 +551,10 @@ |
543 | 551 | my $left = $hit->{qstart}; |
544 | 552 | my $right = $hit->{qend}; |
545 | 553 | |
546 | - my $ypos = -2.8 + $dcnt * 0.4; | |
547 | - $str .= "${left}-${right}:'${d}':${ypos}:$color "; | |
554 | + my $yposl = -2.8 + $dcnt * 0.4; #domain bottom coord | |
555 | + my $yposh = $yposl + 0.15; #domain height coord | |
556 | + | |
557 | + $str .= "${left}-${right}:'${d}':${yposl},${yposh}:$color,black:tc "; | |
548 | 558 | $dcnt++; |
549 | 559 | } |
550 | 560 | } |
locateFragment.pl
View file @
68b41f1
tmsRepeat.pl
View file @
68b41f1
Diff suppressed. Click to show
1 | -#!/usr/bin/env perl -w | |
1 | +#!/usr/bin/env perl | |
2 | 2 | |
3 | -use warnings; | |
3 | +no warnings; | |
4 | 4 | use strict; |
5 | 5 | use Data::Dumper; |
6 | 6 | |
7 | -$Data::Dumper::Deepcopy = 1; | |
8 | -$Data::Dumper::Indent = 1; | |
9 | -#$Data::Dumper::Purity = 0; | |
10 | -$Data::Dumper::Sortkeys = 1; | |
11 | - | |
7 | +use TCDB::Repeats; | |
12 | 8 | use Getopt::Long; |
13 | -use Bio::SearchIO; | |
14 | -use Bio::SeqIO; | |
15 | 9 | |
16 | 10 | |
17 | -use TCDB::CheckDependencies; | |
18 | -use TCDB::Assorted; | |
11 | +my $seqsDir = undef; #'/Users/amedrano/Desktop/Mai_tmsRepeat/sequences'; | |
12 | +my $seqsFile = undef; | |
13 | +my $tmsFile = undef; #'/Users/amedrano/Desktop/Mai_tmsRepeat/tms.hmmtop'; | |
14 | +my $outDir = "Repeats"; #'/Users/amedrano/Desktop/Mai_tmsRepeat/RepeatUnits/ResultsOOP'; | |
19 | 15 | |
16 | +my $evalue = 1e-2; | |
17 | +my $coverage = 0.85; | |
18 | +my $identity = 0.2; | |
20 | 19 | |
20 | +my @tmsRanges = (); | |
21 | 21 | |
22 | -#========================================================================== | |
23 | -#Check dependencies | |
22 | +read_command_line(); | |
24 | 23 | |
25 | -my @dependencies = ("water", "ssearch36", "extractFamily.pl", "tmsplit", "quod.py"); | |
26 | -my $CheckDep_obj = new TCDB::CheckDependencies(); | |
27 | -$CheckDep_obj -> dependencies_list(\@dependencies); | |
28 | -$CheckDep_obj -> checkDependencies; | |
29 | 24 | |
30 | - | |
31 | - | |
32 | -#========================================================================== | |
33 | -#Read command line options | |
34 | - | |
35 | -my $gs_infile = ""; | |
36 | -my $infileFmt = "hmmtop"; #The other option is 'tms' which is the ID and TMS | |
37 | -my $gs_idFormat = ""; | |
38 | -my $gs_repUnit = 0; | |
39 | -my $gs_seqDir = ""; | |
40 | -my $gs_tail = 5; | |
41 | -my $gs_evalue = 0.1; | |
42 | -my $gs_coverage = 0.8; | |
43 | -my $gs_identity = 0.25; | |
44 | -my $gsatShuffles = 1000; | |
45 | -my $min_gsat_score = 4.0; | |
46 | - | |
47 | -my $compStatsFlag = 1; | |
48 | -my $compStats = ""; | |
49 | -my $outdir = "repeats"; | |
50 | -my $repDir = "reports"; | |
51 | -my $seqDir = "sequences"; | |
52 | -my $alignDir = "alignments"; | |
53 | -my $plotsDir = "plots"; | |
54 | -my $goodHitsOnly = 1; #print only significant results, ignore everything else | |
55 | - | |
56 | - | |
57 | -#all (all sequences in output file) | |
58 | -#each (generate one directory per sequence.. for better organization) | |
59 | -#debug (it will print the contents of the hash table one sequences at a time) | |
60 | -my $mode = "all"; | |
61 | - | |
62 | -read_command_line_arguments(); | |
63 | - | |
64 | -#print Data::Dumper->Dump([$gs_infile, $gs_idFormat, $gs_repUnit, $gs_seqDir, | |
65 | -# $gs_tail, $gs_evalue, $gs_coverage, $gs_identity, $gsatShuffles, $compStatsFlag, $compStats], | |
66 | -# [qw(*infile *idFormat *repUnit *seqDir *tail *evalue | |
67 | -# *coverage *identity $gsatShuffles *compStatFlag *compStats)]); | |
25 | +#print Data::Dumper->Dump([$seqsDir, $seqsFile, $tmsFile, $outDir], | |
26 | +# [qw(*seqsDir *seqsFile *tmsFile *outDir )]); | |
68 | 27 | #exit; |
69 | 28 | |
70 | -# ssearch36 -p -k 1000 -z 11 -E 1.0 -s BL62 -W 0 4.B.1_4tms_all/sequences/4.B.1.1.2-Q4QLL1_bundle1.faa 4.B.1_4tms_all/sequences/lib_4.B.1.1.2-Q4QLL1_bundle1.faa | |
71 | 29 | |
30 | +#my $repObj = TCDB::Repeat->new('seqsDir' => $seqsDir, | |
31 | +# 'tmsFile' => $tmsFile, | |
32 | +# 'outDir' => $outDir, | |
33 | +# 'ranges2searchTMS' => \@TMSranges); | |
72 | 34 | |
73 | -#========================================================================== | |
74 | -#Read file with coordinates of TMSs and verify that the sequences are | |
75 | -#available | |
76 | 35 | |
77 | -my %gh_tms = (); | |
36 | +my @TMSranges = ([1, 3], [4, 6]); | |
78 | 37 | |
79 | -read_tms_coordinates_file($gs_infile, \%gh_tms); | |
38 | +my $repObj = TCDB::Repeat->new(); | |
80 | 39 | |
81 | -#print Data::Dumper->Dump([ \%gh_tms], [qw(*tms )]); | |
82 | -#exit; | |
40 | +#$repObj->tmsFile($tmsFile); | |
41 | +#$repObj->seqsDir($seqsDir); | |
42 | +$repObj->seqsFile($seqsFile); | |
43 | +$repObj->outDir($outDir); | |
44 | +$repObj->evalueCutoff($evalue); | |
45 | +$repObj->identityCutoff($identity); | |
46 | +$repObj->coverageCutoff($coverage); | |
47 | +$repObj->TMSranges2search(\@TMSranges); | |
83 | 48 | |
49 | +$repObj-> findRepeatsTMSranges(); | |
84 | 50 | |
85 | -#=========================================================================== | |
86 | -#Main Output directory | |
51 | +#print Data::Dumper->Dump([$repObj ], [qw(*repObj)]); | |
87 | 52 | |
88 | -#Root directory for all results | |
89 | -system "mkdir -p $outdir" unless (-d $outdir); | |
90 | -die "Could not generate output directory: $outdir" unless (-d $outdir); | |
91 | 53 | |
92 | 54 | |
93 | 55 | |
94 | -#========================================================================== | |
95 | -#Search for repeats inside query sequences | |
96 | 56 | |
97 | -my %results = (); | |
98 | -my %origSeqLength = (); #To calculate x-ticks spacing in hydropathy plots | |
99 | - | |
100 | -foreach my $ls_sid (keys %gh_tms) { | |
101 | - | |
102 | - my %gh_bundleSeqs = (); | |
103 | - my %gh_topHits = (); | |
104 | - | |
105 | - | |
106 | - print "Processing: $ls_sid\n"; | |
107 | - | |
108 | - | |
109 | - #Clean results if one output directory is generated per input sequence | |
110 | - %results = () if ($mode eq 'each'); | |
111 | - | |
112 | - | |
113 | - #Cut sequences in non overlaping regions with as many TMS as the | |
114 | - #repeat unit we want to find. | |
115 | - cut_seq_in_tms_regions ($ls_sid, $gs_repUnit, \%gh_tms, \%gh_bundleSeqs); | |
116 | - | |
117 | - | |
118 | -# print Data::Dumper->Dump([\%gh_bundleSeqs ], [qw(*bundleSeqs)]); | |
119 | -# <STDIN>; | |
120 | - | |
121 | - | |
122 | - #run ssearch to find potential repeats. | |
123 | - align_bundles($ls_sid,\%gh_bundleSeqs, \%gh_topHits); | |
124 | - | |
125 | - | |
126 | -# print Data::Dumper->Dump([\%gh_topHits ], [qw(*topHits )]); | |
127 | -# <STDIN>; | |
128 | - | |
129 | - | |
130 | - #Collect results for final table | |
131 | - $results{$ls_sid} = \%gh_topHits; | |
132 | - | |
133 | - #present results per input sequence to verify everything looks fine. | |
134 | - if ($mode eq 'debug') { | |
135 | - print Data::Dumper->Dump([\%gh_topHits], [qw(*topHits)]); | |
136 | - <STDIN>; | |
137 | - } | |
138 | - | |
139 | - print_reports(\%results) if ($mode eq 'each'); | |
140 | -} | |
141 | - | |
142 | - | |
143 | - | |
144 | - | |
145 | - | |
146 | 57 | #=========================================================================== |
147 | -#Print final results in summarized or detailed format | |
58 | +#Read command line and print help | |
148 | 59 | |
149 | -#print Data::Dumper->Dump([\%results ], [qw(*results )]); | |
150 | -#<STDIN>; | |
151 | 60 | |
152 | -print_reports(\%results) if ($mode eq 'all'); | |
61 | +sub read_command_line { | |
153 | 62 | |
63 | + print_help() unless (@ARGV); | |
154 | 64 | |
65 | + my $status = GetOptions( | |
66 | + "s|seqs-file=s" => \&read_seqsFile, | |
67 | + "d|seqs-dir=s" => \&read_seqsDir, | |
68 | + "o|outdir=s" => \&read_outdir, | |
69 | + "t|tms=s" => \&read_tmsFile, | |
70 | + "e|evalue=f" => \$evalue, | |
71 | + "i|identity=f" => \$identity, | |
72 | + "c|coverage=f" => \$coverage, | |
73 | + "h|help" => sub { print_help(); }, | |
74 | + "<>" => sub { die "Error: Unknown argument: $_[0]\n"; }); | |
75 | + exit unless ($status); | |
155 | 76 | |
156 | 77 | |
157 | - | |
158 | -########################################################################### | |
159 | -# # | |
160 | -# Subroutine definition # | |
161 | -# # | |
162 | -########################################################################### | |
163 | - | |
164 | - | |
165 | -#print final_report | |
166 | - | |
167 | -sub print_reports { | |
168 | - | |
169 | - my $res = shift; | |
170 | - | |
171 | - | |
172 | - #Get the directory where reports will be saved | |
173 | - my $reportDir = undef; | |
174 | - if ($mode eq 'all') { | |
175 | - $reportDir = getReportsDir(); | |
176 | - } | |
177 | - else { | |
178 | - | |
179 | - #one id per report | |
180 | - my @ids = keys %$res; | |
181 | - my $seqId = $ids[0]; | |
182 | - | |
183 | - $reportDir = getReportsDir($seqId); | |
184 | - } | |
185 | - die "Error: invalid report dir" unless ($reportDir); | |
186 | - | |
187 | - | |
188 | - my $sumFile = "$reportDir/repeats_summary_report.txt"; | |
189 | - my $detailsFile = "$reportDir/repeats_detailed_report.txt"; | |
190 | - my $htmlFile = "$reportDir/report.html"; | |
191 | - | |
192 | - | |
193 | - open (my $htmlfh, ">", $htmlFile) || die $!; | |
194 | - | |
195 | - my $htmlHeader = <<HEADER; | |
196 | -<!DOCTYPE html> | |
197 | -<html xmlns="http://www.w3.org/1999/xhtml"> | |
198 | - <head> | |
199 | - <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> | |
200 | - | |
201 | - <style type="text/css"> | |
202 | - | |
203 | -.label { | |
204 | - text-align: right; | |
205 | - width: 50px; | |
78 | + #Validate input file option | |
79 | + die "Error: no sequence file detected!" unless ($seqsFile); | |
206 | 80 | } |
207 | 81 | |
208 | -.data { | |
209 | - text-align: left; | |
210 | - padding-left: 8px; | |
211 | - width: 100px; | |
212 | -} | |
213 | 82 | |
214 | -.uline { | |
215 | - text-decoration: underline; | |
216 | -} | |
217 | - | |
218 | -.seq { | |
219 | - border: 2px solid black; | |
220 | - height: 70px; | |
221 | - width: 100%; | |
222 | - overflow-x: auto; | |
223 | - overflow-y: hidden; | |
224 | - margin: 1em 0; | |
225 | - background: gray; | |
226 | - color: white; | |
227 | -} | |
228 | - | |
229 | -img { | |
230 | - display: block; | |
231 | - margin-left: auto; | |
232 | - margin-right: auto; | |
233 | - height: 250px; | |
234 | - width: auto; | |
235 | - max-width: 1500px; | |
236 | - max-height: 300px; | |
237 | -} | |
238 | - | |
239 | - </style> | |
240 | - <title>Inferring repeats of $gs_repUnit TMS</title> | |
241 | - </head> | |
242 | - <br /> | |
243 | - <h1 style='text-align:center'>Inferred Repeats Based On ${gs_repUnit}-TMS Bundles</h1> | |
244 | - <body> | |
245 | - | |
246 | -HEADER | |
247 | - | |
248 | - print $htmlfh $htmlHeader; | |
249 | - open (my $sumh, ">", $sumFile) || die $!; | |
250 | - open (my $deth, ">", $detailsFile) || die $!; | |
251 | - | |
252 | - | |
253 | - #Header for summary table | |
254 | - print $sumh "#Accession\tQ_bundle\tS_bundle\tQ_len\tS_len\tE-value\tIdentity\tGSAT\tAln_len\tQ_cov\tS_cov\n"; | |
255 | - | |
256 | - | |
257 | -# print Data::Dumper->Dump([$res ], [qw(*res )]); | |
258 | -# <STDIN>; | |
259 | - | |
260 | - | |
261 | - P:foreach my $id (sort {$a cmp $b} keys %$res) { | |
262 | - | |
263 | - #Jump to next result if there are NO hits for this protein and | |
264 | - #ONLY good hits are going to be recorded. | |
265 | - unless (%{ $res->{$id} }) { | |
266 | - next P if ($goodHitsOnly); | |
267 | - } | |
268 | - | |
269 | - | |
270 | - print $deth "===========================================================================\n"; | |
271 | - print $htmlfh " <br /><hr style=\"border-style:solid; border-width:5px; color:black;\"/>\n"; | |
272 | - | |
273 | - #There must be results to continue | |
274 | - unless (%{ $res->{$id} }) { | |
275 | - print $sumh "$id\tNo_hits\n"; | |
276 | - print $deth "$id\tNo_hits\n\n\n"; | |
277 | - print $htmlfh " <h2 style=\"text-align:center;\">$id</h2>\n <p><b>No candidate repeats found</b></p>\n"; | |
278 | - } | |
279 | - | |
280 | - print $deth "$id\n\n"; | |
281 | - print $htmlfh " <h2 style=\"text-align:center;\">$id</h2>\n"; | |
282 | - | |
283 | - | |
284 | - | |
285 | - | |
286 | - #get the long bundle names | |
287 | - BS:foreach my $bundleName (sort {$a cmp $b} keys %{ $res->{$id} }) { | |
288 | - | |
289 | - BN:foreach my $bundleNumber (sort {$a <=> $b} keys %{ $res->{$id}->{$bundleName} }) { | |
290 | - | |
291 | - my $qName = $res->{$id}->{$bundleName}->{$bundleNumber}->{qName}; | |
292 | - my $qLen = $res->{$id}->{$bundleName}->{$bundleNumber}->{qLen}; | |
293 | - | |
294 | - #Each of the hits for this bundle | |
295 | - my @hits_tmp = @{ $res->{$id}->{$bundleName}->{$bundleNumber}->{hits} }; | |
296 | - | |
297 | - #To get rid of a warning when there is only one hit. | |
298 | - my @hits = (scalar (@hits_tmp) > 1)? | |
299 | - sort {$a->{hName} cmp $b->{hName}} @hits_tmp : @hits_tmp; | |
300 | - | |
301 | - foreach my $hit (@hits) { | |
302 | - | |
303 | - my $hName = $hit->{hName}; | |
304 | - my $hLen = $hit->{hLen}; | |
305 | - | |
306 | - my $evalue = sprintf ("%.1e", $hit->{hEvalue}); | |
307 | - my $ident = sprintf ("%.1f", $hit->{hId} * 100); | |
308 | - my $sim = sprintf ("%.1f", $hit->{hSim} * 100); | |
309 | - my $gsat = sprintf ("%.1f", $hit->{gsat}); | |
310 | - | |
311 | - my $alnLen = $hit->{alnLen}; | |
312 | - my $qCov = sprintf("%.1f", $hit->{qCov} * 100); | |
313 | - my $hCov = sprintf("%.1f", $hit->{hCov} * 100); | |
314 | - | |
315 | - | |
316 | - #The alignment | |
317 | - my $qstart = $hit->{qstart}; | |
318 | - my $qend = $hit->{qend}; | |
319 | - my $sstart = $hit->{sstart}; | |
320 | - my $send = $hit->{send}; | |
321 | - my $qSeq = $hit->{qSeq}; | |
322 | - my $homStr = $hit->{homStr}; | |
323 | - my $sSeq = $hit->{sSeq}; | |
324 | - | |
325 | - my $plot = $hit->{plot}; | |
326 | - | |
327 | - #For summary tab-delimitedfile (everything except the alignment) | |
328 | - print $sumh "$id\t$qName\t$hName\t$qLen\t$hLen\t$evalue\t$ident\t$gsat\t$alnLen\t$qCov\t$hCov\n"; | |
329 | - | |
330 | - | |
331 | - #Detailed report that includes the alignment | |
332 | - print $deth "----------\n"; | |
333 | - print $deth "$qName ($qLen) vs $hName ($hLen)\n\n"; | |
334 | - print $deth "E-value: $evalue Identity: ${ident}% GSAT: $gsat\n"; | |
335 | - print $deth "Q_cov: ${qCov}% S_cov: ${hCov}% Aln_length: $alnLen\n\n"; | |
336 | - print $deth "Alignment ($qName|${qstart}-$qend vs $hName|${sstart}-$send):\n$qSeq\n$homStr\n$sSeq\n\n\n"; | |
337 | - | |
338 | - | |
339 | - #The HTML report (includes alignment and hydropathy image | |
340 | - my $repHit = <<HIT; | |
341 | - | |
342 | - <p><b>$qName ($qLen) vs $hName ($hLen)</b></p> | |
343 | - | |
344 | - <table width="600px" border="0" cellspacing="0" cellpadding="2"> | |
345 | - <tr> | |
346 | - <td class='label'><b>E-value:</b></td> | |
347 | - <td class='data'>$evalue</td> | |
348 | - <td class='label'><b>Identity:</b></td> | |
349 | - <td class='data'>${ident}%</td> | |
350 | - <td class='label'><b>Similarity:</b></td> | |
351 | - <td class='data'>${sim}%</td> | |
352 | - <td class='label'><b>GSAT:</b></td> | |
353 | - <td class='data'>$gsat</td> | |
354 | - </tr> | |
355 | - <tr> | |
356 | - <td class='label'><b>Aln:</b></td> | |
357 | - <td class='data'>$alnLen</td> | |
358 | - <td class='label'><b>Q_cov:</b></td> | |
359 | - <td class='data'>${qCov}%</td> | |
360 | - <td class='label'><b>S_cov:</b></td> | |
361 | - <td class='data'>${hCov}%</td> | |
362 | - <td class='label'></td> | |
363 | - <td class='data'></td> | |
364 | - </tr> | |
365 | - </table> | |
366 | - | |
367 | - <p><b>Alignment (</b>$qName:<b class="uline">${qstart}-$qend</b> vs $hName:<b class="uline">${sstart}-$send</b><b>):</b></p> | |
368 | - <div class='seq'> | |
369 | - <pre> | |
370 | -$qSeq | |
371 | -$homStr | |
372 | -$sSeq | |
373 | - </pre> | |
374 | - </div> | |
375 | - <a href="$plot" target="_blank"><img src="$plot"/></a> | |
376 | - <br /> | |
377 | - <hr /> | |
378 | - | |
379 | -HIT | |
380 | - | |
381 | - print $htmlfh $repHit; | |
382 | - | |
383 | - } #hit | |
384 | - } #reference bundle number | |
385 | - } #Reference bundle name | |
386 | - } #Query protein | |
387 | - | |
388 | - #Close HTML report | |
389 | - my $closeRep = <<CLOSE; | |
390 | - </body> | |
391 | -</html> | |
392 | -CLOSE | |
393 | - | |
394 | - print $htmlfh $closeRep; | |
395 | - | |
396 | - close $sumh; | |
397 | - close $deth; | |
398 | - close $htmlfh; | |
399 | -} | |
400 | - | |
401 | - | |
402 | - | |
403 | 83 | #========================================================================== |
404 | -#Run ssearch36 between the different bundles in a sequence | |
84 | +#Option -s | |
405 | 85 | |
406 | -sub align_bundles { | |
86 | +sub read_seqsFile { | |
87 | + my ($opt, $value) = @_; | |
407 | 88 | |
408 | - my ($seqId, $lhr_bundleSeqFiles, $lhr_topHits) = @_; | |
409 | - | |
410 | - %$lhr_topHits = (); | |
411 | - | |
412 | - #Directory where the sequences of TMS bundles are saved | |
413 | - my $sequencesDir = undef; | |
414 | - my $alignmentsDir = undef; | |
415 | - my $hydroPlotsDir = undef; | |
416 | - | |
417 | - if ($mode eq 'all') { | |
418 | - $sequencesDir = getSequencesDir(); | |
419 | - $alignmentsDir = getAlignmentsDir(); | |
420 | - $hydroPlotsDir = getPlotsDir(); | |
89 | + unless (-f $value && !(-z $value)) { | |
90 | + die "Error: file with sequences does not exist or is empty!\n"; | |
421 | 91 | } |
422 | - else { | |
423 | - $sequencesDir = getSequencesDir($seqId); | |
424 | - $alignmentsDir = getAlignmentsDir($seqId); | |
425 | - $hydroPlotsDir = getPlotsDir($seqId); | |
426 | - } | |
427 | - die "Error: invalid sequences dir" unless ($sequencesDir); | |
428 | - die "Error: invalid alignments dir" unless ($alignmentsDir); | |
429 | - die "Error: invalid plots dir" unless ($hydroPlotsDir); | |
430 | 92 | |
431 | - | |
432 | -# print Data::Dumper->Dump([$lhr_bundleSeqFiles ], [qw(*files )]); | |
433 | -# <STDIN>; | |
434 | - | |
435 | - | |
436 | - #The bundle that will be used as reference for the comparison | |
437 | - REF:foreach my $bundle (sort {$a <=> $b} keys %$lhr_bundleSeqFiles) { | |
438 | - | |
439 | - my $rFile = "$sequencesDir/" . $lhr_bundleSeqFiles->{$bundle}->[0]; | |
440 | - | |
441 | - | |
442 | - #Id to name ssearch36 output files | |
443 | - my $id = $lhr_bundleSeqFiles->{$bundle}->[0]; | |
444 | - $id =~ s/\.faa//; | |
445 | - | |
446 | - | |
447 | - #For naming GSAT files (ID of system or protein accession) | |
448 | - my $tcAcc = ($id =~ /(\S+)_bundle.*/)? $1 : undef; | |
449 | - die "Could not extract accession from $id!" unless ($id); | |
450 | - | |
451 | - | |
452 | -# print Data::Dumper->Dump([$id, $tcAcc ], [qw(*id *tcAcc)]); | |
453 | -# <STDIN>; | |
454 | - | |
455 | - | |
456 | - #-------------------------------------------------------------------- | |
457 | - #Get the non-overlapping bundles to compare them against the | |
458 | - #reference bundle | |
459 | - | |
460 | - my @cmpFiles = (); | |
461 | - | |
462 | - #Initialize the index to the first non-overlapping bundle | |
463 | - my $next_bundle_idx = $bundle + $gs_repUnit; | |
464 | - | |
465 | - CMP:while (1) { | |
466 | - | |
467 | - #Exit if next bundle is not in bundles hash | |
468 | - last CMP unless (exists $lhr_bundleSeqFiles->{$next_bundle_idx}); | |
469 | - | |
470 | - #Get file name for this non-overlapping bundle | |
471 | - my $cmpBundle = $sequencesDir . "/" . $lhr_bundleSeqFiles->{$next_bundle_idx}->[0]; | |
472 | - push (@cmpFiles, $cmpBundle); | |
473 | - | |
474 | - #Update the index to the next non-overlapping bundle | |
475 | - $next_bundle_idx = $next_bundle_idx + $gs_repUnit; | |
476 | - } | |
477 | - | |
478 | - #go to next reference bundle if there are no non-overlapping bundles. | |
479 | - next REF unless (@cmpFiles); | |
480 | - | |
481 | - | |
482 | -# print Data::Dumper->Dump([\@cmpFiles ], [qw(*cmpFiles )]); | |
483 | -# <STDIN>; | |
484 | - | |
485 | - | |
486 | - #-------------------------------------------------------------------- | |
487 | - #Now run ssearch36 of the reference bundle against all its | |
488 | - #non-overlapping bundles | |
489 | - | |
490 | - #put all non-overlapping bundles into a file | |
491 | - my $libFile = "$sequencesDir/lib_$id.faa"; | |
492 | - my $cmd = "cat " . join(" ", @cmpFiles) . " > $libFile"; | |
493 | - system $cmd; | |
494 | - | |
495 | - | |
496 | - #run ssearch36 of $rFile vs @cmpFile | |
497 | - my $ssearchOut = "$alignmentsDir/ssearch_$id.out"; | |
498 | - my $ssearch_params = qq(-p $compStats -E $gs_evalue -s BL62 -W 0 $rFile $libFile > $ssearchOut); | |
499 | - system "ssearch36 $ssearch_params" unless (-f $ssearchOut); | |
500 | - | |
501 | - | |
502 | -# print Data::Dumper->Dump([$ssearchOut ], [qw(*ssearchOut )]); | |
503 | -# <STDIN>; | |
504 | - | |
505 | - | |
506 | - #--------------------------------------------------------------------- | |
507 | - #Estimate here the spacing between x-ticks for hydropathy plots | |
508 | - | |
509 | - my $protLen = $origSeqLength{$seqId}; | |
510 | - | |
511 | - my $xticksSpacing = undef; | |
512 | - if ($protLen <= 500) { | |
513 | - $xticksSpacing = 25; | |
514 | - } | |
515 | - elsif ($protLen <= 1000) { | |
516 | - $xticksSpacing = 50; | |
517 | - } | |
518 | - else { | |
519 | - $xticksSpacing = 100; | |
520 | - } | |
521 | - | |
522 | - | |
523 | - | |
524 | - #-------------------------------------------------------------------- | |
525 | - #parse ssearch36 output. For BioPerl resouces check: | |
526 | - #http://search.cpan.org/dist/BioPerl/Bio/SearchIO.pm | |
527 | - #https://classes.soe.ucsc.edu/bme060/Winter07/bptutorial.html | |
528 | - | |
529 | - my $parser = new Bio::SearchIO (-format => 'fasta', -file => $ssearchOut); | |
530 | - | |
531 | - | |
532 | - #put here the top hits | |
533 | - my %lh_hits = (); | |
534 | - | |
535 | - | |
536 | - while (my $result = $parser->next_result) { | |
537 | - | |
538 | - | |
539 | - my $qLen = $result->query_length; | |
540 | - $lh_hits{$bundle}{qName} = $result->query_name; | |
541 | - $lh_hits{$bundle}{qLen} = $qLen; | |
542 | - $lh_hits{$bundle}{hits} = []; | |
543 | - | |
544 | - | |
545 | - HIT:while (my $hit = $result->next_hit) { | |
546 | - | |
547 | - HSP:while(my $hsp = $hit->next_hsp) { | |
548 | - | |
549 | - | |
550 | -# print Data::Dumper->Dump([$hsp ], [qw(*hsp )]); | |
551 | -# <STDIN>; | |
552 | - | |
553 | - | |
554 | - my %tmp = (); | |
555 | - | |
556 | - my $alnLen = $hsp->hsp_length; | |
557 | - my $hLen = $hit->length; | |
558 | - my $hEvalue = $hsp->evalue; | |
559 | - my $hId = $hsp->frac_identical('total'); #identity in the alignment | |
560 | - my $hSim = $hsp->frac_conserved('total'); #similarity in the alignment | |
561 | - | |
562 | - | |
563 | - #coordinates in the alignment to properly calculate coverages | |
564 | - my $qstart = $hsp->start('query'); | |
565 | - my $qend = $hsp->end('query'); | |
566 | - my $sstart = $hsp->start('subject'); | |
567 | - my $send = $hsp->end('subject'); | |
568 | - | |
569 | - | |
570 | - #Calculate coverages properly (do not use alignment length as it includes gaps | |
571 | - | |
572 | - my $qCov_tmp = ($qend - $qstart + 1) / $qLen; | |
573 | - my $qCov = ($qCov_tmp > 1.0)? 1.0 : $qCov_tmp; | |
574 | - | |
575 | - my $hCov_tmp = ($send - $sstart + 1) / $hLen; | |
576 | - my $hCov = ($hCov_tmp > 1.0)? 1.0 : $hCov_tmp; | |
577 | - | |
578 | - | |
579 | -# print Data::Dumper->Dump([$qLen, $qCov, $hLen, $hCov, $gs_coverage, $hEvalue, $gs_evalue, $hId, $gs_identity], | |
580 | -# [qw(*qLen *qCov $hLen *hCov *coverageCutoff *evalue *evalCutoff *hId *IDcutoff)]); | |
581 | -# <STDIN>; | |
582 | - | |
583 | - | |
584 | - #Before storing hit results check minimum coverage, identity and evalue | |
585 | - next HSP unless (($qCov >= $gs_coverage || $hCov >= $gs_coverage) && | |
586 | - ($hEvalue <= $gs_evalue) && ($hId >= $gs_identity)); | |
587 | - | |
588 | - | |
589 | - #hit identity | |
590 | - $tmp{hName} = $hit->name; | |
591 | - $tmp{hLen} = $hLen; | |
592 | - | |
593 | - | |
594 | - #hit statistics | |
595 | - $tmp{alnLen} = $alnLen; | |
596 | - $tmp{hEvalue} = $hEvalue; | |
597 | - $tmp{hId} = $hId; | |
598 | - $tmp{hSim} = $hSim; | |
599 | - $tmp{qCov} = $qCov; | |
600 | - $tmp{hCov} = $hCov; | |
601 | - | |
602 | - | |
603 | - #The alignment | |
604 | - $tmp{qstart} = $qstart; | |
605 | - $tmp{qend} = $qend; | |
606 | - $tmp{sstart} = $sstart; | |
607 | - $tmp{send} = $send; | |
608 | - | |
609 | - $tmp{qSeq} = $hsp->query_string; | |
610 | - $tmp{sSeq} = $hsp->hit_string; | |
611 | - $tmp{homStr} = $hsp->homology_string; | |
612 | - | |
613 | - | |
614 | - #Get the GSAT score | |
615 | - my $gsat_outFile = "$alignmentsDir/${tcAcc}_" . $lh_hits{$bundle}{qName} . "_vs_" . $tmp{hName} . ".gsat"; | |
616 | - | |
617 | - | |
618 | -# print "gsat.py $tmp{qSeq} $tmp{sSeq} $gsatShuffles > $gsat_outFile\n"; | |
619 | -# exit; | |
620 | - | |
621 | - system "gsat.py $tmp{qSeq} $tmp{sSeq} $gsatShuffles > $gsat_outFile" unless (-f $gsat_outFile); | |
622 | - | |
623 | - my $gsat_score = TCDB::Assorted::get_gsat_score ($gsat_outFile); | |
624 | - $tmp{gsat} = $gsat_score; | |
625 | - | |
626 | - | |
627 | -# print Data::Dumper->Dump([\%tmp ], [qw(*matchData )]); | |
628 | -# <STDIN>; | |
629 | - | |
630 | - | |
631 | - #GSAT is the last filter | |
632 | - next HSP unless ($gsat_score >= $min_gsat_score); | |
633 | - | |
634 | - #------------------------------------------------------------ | |
635 | - #Generate quod plot with the repeat | |
636 | - | |
637 | - my $whole_prot_seq = "$gs_seqDir/${seqId}.faa"; | |
638 | - die "Protein sequence not found: $whole_prot_seq" unless (-f $whole_prot_seq); | |
639 | - | |
640 | - | |
641 | - my $plotFile = "$hydroPlotsDir/${seqId}_" . $lh_hits{$bundle}{qName} . "_vs_" . $tmp{hName}; | |
642 | - my $fileName = "../$plotsDir/${seqId}_" . $lh_hits{$bundle}{qName} . "_vs_" . $tmp{hName} . ".png"; | |
643 | - my $plotTitle = $lh_hits{$bundle}{qName} . " vs " . $tmp{hName}; | |
644 | - | |
645 | - #Get hydrophobic peaks coords | |
646 | - my $hydroPeaks = $gh_tms{$seqId}; | |
647 | - die "No hydrophobic peaks found for sequence: $seqId" unless (@{ $hydroPeaks }); | |
648 | - | |
649 | - | |
650 | - #format the hydrophobic peaks for quod | |
651 | - my @peaks = map { join ("-", @$_) . ":orange" } @$hydroPeaks; | |
652 | - my $pstring = join (" ", @peaks); | |
653 | - | |
654 | - | |
655 | - #---------- | |
656 | - #Calculate the positions of the aligned section of each bundle in the full sequence. | |
657 | - | |
658 | - my $q_bid = ($lh_hits{$bundle}{qName} =~ /BDL(\d+)/)? $1 : undef; | |
659 | - my $s_bid = ( $tmp{hName} =~ /BDL(\d+)/)? $1 : undef; | |
660 | - die "Could not extract bundle number for: $lh_hits{$bundle}{qName} or $tmp{hName}" unless ($q_bid && $s_bid); | |
661 | - | |
662 | - | |
663 | - #extract initial positions for both bundles | |
664 | - my $qbstart = $lhr_bundleSeqFiles->{$q_bid}->[1]; | |
665 | - my $qbend = $lhr_bundleSeqFiles->{$q_bid}->[2]; #$qLen - 1; | |
666 | - my $sbstart = $lhr_bundleSeqFiles->{$s_bid}->[1]; | |
667 | - my $sbend = $lhr_bundleSeqFiles->{$s_bid}->[2]; #$hLen - 1; | |
668 | - die "Could not extract coords for bundle $q_bid" unless ($qbstart && $qbend); | |
669 | - die "Could not extract coords for bundle $s_bid" unless ($sbstart && $sbend); | |
670 | - | |
671 | - | |
672 | - #Calculate bundle positions here | |
673 | - my $qgp_start = $qbstart + ($qstart - 1); | |
674 | - my $qgp_end = $qbstart + ($qend - 1); | |
675 | - | |
676 | - my $sgp_start = $sbstart + ($sstart - 1); | |
677 | - my $sgp_end = $sbstart + ($send - 1); | |
678 | - | |
679 | - | |
680 | - #Format the coordinates for the repeats now | |
681 | - my $qrep = "${qgp_start}-${qgp_end}:green"; | |
682 | - my $srep = "${sgp_start}-${sgp_end}:blue"; | |
683 | - | |
684 | - #Format the coordinates for the bar delimiting the bundles | |
685 | - my $bars = "-w ${qbstart}-${qbend}::1 ${sbstart}-${sbend}::1"; | |
686 | - | |
687 | - #The quod command line | |
688 | - my $cmd = "quod.py $whole_prot_seq -t png -l '$plotTitle' -o $plotFile -q -r 80 $bars --xticks $xticksSpacing -nt +0 -at ${pstring} ${qrep} ${srep}"; | |
689 | - | |
690 | - my $img = "${plotFile}.png"; | |
691 | - system $cmd unless (-f $img); | |
692 | - die "Could not generate plot: $img" unless (-f $img); | |
693 | - | |
694 | - $tmp{plot} = $fileName; | |
695 | - | |
696 | - | |
697 | - #load the data into the hits section for this bundle | |
698 | - push (@{ $lh_hits{$bundle}{hits} }, \%tmp); | |
699 | - | |
700 | - | |
701 | - } #HSP | |
702 | - } #HIT | |
703 | - } #While | |
704 | - | |
705 | - | |
706 | - #Add results to the topHits hash | |
707 | - if (@{ $lh_hits{$bundle}{hits} }) { | |
708 | - $lhr_topHits->{$id} = \%lh_hits; | |
709 | - } | |
710 | - | |
711 | - } | |
93 | + $seqsFile = $value; | |
712 | 94 | } |
713 | 95 | |
714 | 96 | |
715 | - | |
716 | - | |
717 | 97 | #========================================================================== |
718 | -#Given a sequence, its TMS coordinates and a repeat size (rsize), cut the | |
719 | -#sequence in TMS bundles of length rsize. | |
98 | +#Option -t | |
720 | 99 | |
100 | +sub read_tmsFile { | |
101 | + my ($opt, $value) = @_; | |
721 | 102 | |
722 | -sub cut_seq_in_tms_regions { | |
723 | - | |
724 | - my ($ls_pid, $ls_repeat, $lhr_tms, $lhr_seqSegs) = @_; | |
725 | - | |
726 | - | |
727 | - %$lhr_seqSegs = (); | |
728 | - | |
729 | - | |
730 | - #Get the directory where bundle sequences will be saved | |
731 | - my $sequencesDir = undef; | |
732 | - | |
733 | - if ($mode eq 'all') { | |
734 | - $sequencesDir = getSequencesDir(); | |
103 | + unless (-f $value && !(-z $value)) { | |
104 | + die "Error in option -t: File with TMSs (hmmtop output) does not exist or is empty!\n"; | |
735 | 105 | } |
736 | - else { | |
737 | - $sequencesDir = getSequencesDir($ls_pid); | |
738 | - } | |
739 | - die "Error: invalid sequence dir" unless ($sequencesDir); | |
740 | 106 | |
741 | - | |
742 | - #---------------------------------------------------------------------- | |
743 | - #Get the coordinates of the overlapping bundles | |
744 | - | |
745 | - my @la_tms = @{ $lhr_tms->{$ls_pid} }; | |
746 | - | |
747 | - | |
748 | - | |
749 | - #Get the Length of the sequence of the query protein | |
750 | - my $seqFile = "$gs_seqDir/${ls_pid}.faa"; | |
751 | - my $obj = Bio::SeqIO->new(-file => $seqFile , -format => "fasta"); | |
752 | - my $seqObj = $obj->next_seq; | |
753 | - my $qlength = $seqObj->length; | |
754 | - die "Could not extract protein length." unless ($qlength); | |
755 | - | |
756 | - #Store the length of the original sequence for proper calculation of | |
757 | - #the x-ticks in the hydropathy plots of the results | |
758 | - $origSeqLength{$ls_pid} = $qlength; | |
759 | - | |
760 | - | |
761 | - | |
762 | - #Number of TMS in protein | |
763 | - my $ls_ntms = scalar (@la_tms); | |
764 | - | |
765 | - | |
766 | - | |
767 | - for (my $idx=1; $idx <= ($ls_ntms - $ls_repeat + 1); $idx++) { | |
768 | - | |
769 | - #TMS in bundle | |
770 | - my $left_tms = $la_tms[$idx - 1]; | |
771 | - my $right_tms = $la_tms[$idx + $ls_repeat - 2]; | |
772 | - | |
773 | - | |
774 | - #The coordinates of the bundle | |
775 | - my $left_pos = (($left_tms->[0] - $gs_tail) <= 0)? 1 : $left_tms->[0] - $gs_tail; | |
776 | - #my $right_pos = (($right_tms->[1] + $gs_tail) >= $qlength)? $right_tms->[1] : $right_tms->[1] + $gs_tail; | |
777 | - my $right_pos = (($right_tms->[1] + $gs_tail) >= $qlength)? $qlength - 1 : $right_tms->[1] + $gs_tail; | |
778 | - | |
779 | - | |
780 | - #Cut and name the bundles only if bundle file does not exist | |
781 | - my $outfile = "${ls_pid}_bundle${idx}"; | |
782 | - unless (-f "$sequencesDir/${outfile}.faa") { | |
783 | - | |
784 | - #cutting bundle | |
785 | - my $args = qq(-if $seqFile -od $sequencesDir -of $outfile -rangeCut -s $left_pos -e $right_pos -t 0); | |
786 | - system "tmsplit $args > /dev/null"; | |
787 | - | |
788 | - #replace protein ID with bundle number to the ID so alignments can be easily identified | |
789 | - system qq(perl -i -pe 's/>\\S+/>BDL$idx/' $sequencesDir/${outfile}.faa); | |
790 | - } | |
791 | - | |
792 | - $lhr_seqSegs->{$idx} = ["${outfile}.faa", $left_pos, $right_pos]; | |
793 | - } | |
107 | + $tmsFile = $value; | |
794 | 108 | } |
795 | 109 | |
796 | 110 | |
797 | - | |
798 | - | |
799 | 111 | #========================================================================== |
800 | -#Read file with the TMS coordinates of the input proteins. The TMS | |
801 | -#must have been validated with WHAT to make sure they are reliable. | |
112 | +#Option -d | |
802 | 113 | |
114 | +sub read_seqsDir { | |
115 | + my ($opt, $value) = @_; | |
803 | 116 | |
804 | -sub read_tms_coordinates_file { | |
117 | + die "Error: directory with sequences does not exist." unless (-d $value); | |
805 | 118 | |
806 | - my ($s_coordsFile, $hr_tms) = @_; | |
807 | - | |
808 | - open (my $fileh, "<", $s_coordsFile) || die $!; | |
809 | - | |
810 | - #----------------------------------------------------------------- | |
811 | - #The format of this file is protein ID followed by pairs of | |
812 | - #coordinates separated by dash: | |
813 | - # 2.A.43.1.1-O60931 1-20 25-35 50-68 .... | |
814 | - if ($infileFmt eq 'tms') { | |
815 | - | |
816 | - while(<$fileh>) { | |
817 | - chomp; | |
818 | - | |
819 | - #ignore empty lines; | |
820 | - next unless ($_); | |
821 | - | |
822 | - #extract id and TMSs coordinates | |
823 | - my ($id, @tms_str) = split(/\s+/, $_); | |
824 | - my @tms = map { [ split(/-/, $_) ] } @tms_str; | |
825 | - | |
826 | - | |
827 | - #For debugging purposes | |
828 | -# next unless ($id eq 'WP_100644534'); | |
829 | - | |
830 | - | |
831 | - $hr_tms->{$id} = \@tms; | |
832 | - | |
833 | - #Verify that the sequence is available for this protein | |
834 | - unless (-f "$gs_seqDir/${id}.faa" && ! (-z "$gs_seqDir/${id}.faa")) { | |
835 | - die "Could not find sequence for protein: $id in dir: $gs_seqDir -->"; | |
836 | - } | |
837 | - } #while | |
838 | - } | |
839 | - | |
840 | - #Input file is in HMMTOP format | |
841 | - else { | |
842 | - while(<$fileh>) { | |
843 | - chomp; | |
844 | - | |
845 | - #Remove trailing spaces | |
846 | - s/\s+$//; | |
847 | - | |
848 | - #ignore empty lines | |
849 | - next unless ($_); | |
850 | - | |
851 | - | |
852 | - #parse hmmtop line | |
853 | - my ($id, $ntms, $tms_str) = (/\S+\s+\d+\s+(\S+).+(IN|OUT)\s+(\d+)\s+([\d\s-]+)/)? ($1, $3, $4) : (); | |
854 | - | |
855 | - #For debugging purposes | |
856 | -# next unless ($id eq 'WP_100644534'); | |
857 | - | |
858 | - | |
859 | - if ($id && $ntms && $tms_str) { | |
860 | - | |
861 | - #extract the pairs of coordinates for TMS | |
862 | - my @coords = split(/\s+/, $tms_str); | |
863 | - my @tms = (); | |
864 | - for (my $i=0; $i < $#coords; $i += 2) { | |
865 | - push (@tms, [$coords[$i], $coords[$i+1]]); | |
866 | - } | |
867 | - | |
868 | - $hr_tms->{$id} = \@tms; | |
869 | - | |
870 | - } | |
871 | - else { | |
872 | - print "problem parsing HMMTOP line: $_\n";; | |
873 | - print Data::Dumper->Dump([$id, $ntms, $tms_str ], [qw(*id *ntms *tms_str )]); | |
874 | - exit;; | |
875 | - } | |
876 | - } | |
877 | - } | |
878 | - | |
879 | - close $fileh; | |
119 | + $seqsDir = $value; | |
880 | 120 | } |
881 | 121 | |
882 | 122 | |
883 | - | |
884 | 123 | #========================================================================== |
885 | -#Get the directory where the sequences of bundles will be saved. | |
124 | +#Option -o | |
886 | 125 | |
887 | -sub getSequencesDir { | |
126 | +sub read_outdir { | |
127 | + my ($opt, $value) = @_; | |
888 | 128 | |
889 | - my $protId = shift; | |
890 | - | |
891 | - my $dir = undef; | |
892 | - | |
893 | - if ($mode eq 'all') { | |
894 | - $dir = "$outdir/$seqDir"; | |
895 | - } | |
896 | - else { | |
897 | - die "Error: protein accession missing" unless ($protId); | |
898 | - $dir = "$outdir/$protId/$seqDir"; | |
899 | - } | |
900 | - | |
901 | - system "mkdir -p $dir" unless (-d $dir); | |
902 | - die "No dir for bundle sequences found: $dir" unless (-d $dir); | |
903 | - | |
904 | - return $dir; | |
129 | + $outDir = $value; | |
905 | 130 | } |
906 | 131 | |
907 | 132 | |
908 | 133 | #========================================================================== |
909 | -#Get the directory where the alignments will be saved | |
134 | +#option -h | |
910 | 135 | |
911 | -sub getAlignmentsDir { | |
912 | 136 | |
913 | - my $protId = shift; | |
914 | - | |
915 | - my $dir = undef; | |
916 | - | |
917 | - if ($mode eq 'all') { | |
918 | - $dir = "$outdir/$alignDir"; | |
919 | - } | |
920 | - else { | |
921 | - die "Error: protein accession missing" unless ($protId); | |
922 | - $dir = "$outdir/$protId/$alignDir"; | |
923 | - } | |
924 | - | |
925 | - system "mkdir -p $dir" unless (-d $dir); | |
926 | - die "No dir for alignments found: $dir" unless (-d $dir); | |
927 | - | |
928 | - return $dir; | |
929 | -} | |
930 | - | |
931 | - | |
932 | -#========================================================================== | |
933 | -#Get the directory where hydropathy plots will be saved | |
934 | - | |
935 | -sub getPlotsDir { | |
936 | - | |
937 | - my $protId = shift; | |
938 | - | |
939 | - my $dir = undef; | |
940 | - | |
941 | - if ($mode eq 'all') { | |
942 | - $dir = "$outdir/$plotsDir"; | |
943 | - } | |
944 | - else { | |
945 | - die "Error: protein accession missing" unless ($protId); | |
946 | - $dir = "$outdir/$protId/$plotsDir"; | |
947 | - } | |
948 | - | |
949 | - system "mkdir -p $dir" unless (-d $dir); | |
950 | - die "No dir for plots found: $dir" unless (-d $dir); | |
951 | - | |
952 | - return $dir; | |
953 | -} | |
954 | - | |
955 | - | |
956 | -#========================================================================== | |
957 | -#Get the directory where the reports will be saved | |
958 | - | |
959 | -sub getReportsDir { | |
960 | - | |
961 | - my $protId = shift; | |
962 | - | |
963 | - my $dir = undef; | |
964 | - | |
965 | - if ($mode eq 'all') { | |
966 | - $dir = "$outdir/$repDir"; | |
967 | - } | |
968 | - else { | |
969 | - die "Error: protein accession missing" unless ($protId); | |
970 | - $dir = "$outdir/$protId/$repDir"; | |
971 | - } | |
972 | - | |
973 | - system "mkdir -p $dir" unless (-d $dir); | |
974 | - die "No dir for reports found: $dir" unless (-d $dir); | |
975 | - | |
976 | - return $dir; | |
977 | -} | |
978 | - | |
979 | - | |
980 | - | |
981 | - | |
982 | - | |
983 | -#========================================================================== | |
984 | -#Read command-line arguments | |
985 | - | |
986 | -sub read_command_line_arguments { | |
987 | - | |
988 | - #if no arguments are given print the help | |
989 | - if (! @ARGV) { | |
990 | - print_help(); | |
991 | - } | |
992 | - | |
993 | - #---------------------------------------------------------------------- | |
994 | - #Parse command line arguments | |
995 | - | |
996 | - my $ls_status = GetOptions( | |
997 | - "i|infile=s" => \$gs_infile, | |
998 | - "if|infile-format=s" => \$infileFmt, | |
999 | - "o|outdir=s" => \$outdir, | |
1000 | - "f|id-format=s" => \$gs_idFormat, | |
1001 | - "r|rep-unit=i" => \$gs_repUnit, | |
1002 | - "t|tail-size=i" => \$gs_tail, | |
1003 | - "s|seqs=s" => \$gs_seqDir, | |
1004 | - "e|evalue=f" => \$gs_evalue, | |
1005 | - "c|coverage=f" => \$gs_coverage, | |
1006 | - "id|identity=f" => \$gs_identity, | |
1007 | - "ncs|no-comp-stats!" => \$compStatsFlag, | |
1008 | - "gs|gsat-shuffles=i" => \$gsatShuffles, | |
1009 | - "z|gsat-cutoff=f" => \$min_gsat_score, | |
1010 | - "m|mode=s" => \$mode, | |
1011 | - "h|help" => sub { print_help(); }, | |
1012 | - | |
1013 | - #For arguments that do not look like valid options | |
1014 | - "<>" => sub { die "Error: Unknown argument: $_[0]\n"; } | |
1015 | - ); | |
1016 | - die "\n" unless ($ls_status); | |
1017 | - | |
1018 | - #---------------------------------------------------------------------- | |
1019 | - #Validate command line arguments | |
1020 | - | |
1021 | - die "Error: argument -i is mandatory.\n" unless ($gs_infile); | |
1022 | - die "Error: argument -r is mandatory and must be greater than 0.0\n" unless ($gs_repUnit > 0); | |
1023 | - die "Error: augument -t must be grater than 0 and less than 16\n" if ($gs_tail > 15 || $gs_tail < 0); | |
1024 | - die "Error: argument -e must be greater than 0\n" unless ($gs_evalue >=0 ); | |
1025 | - die "Error: argument -c must be between 0.5 and 1.0\n" unless ($gs_coverage >= 0.0 && $gs_coverage <= 1.0); | |
1026 | - die "Error: argument -id must be between 0.25 and 1.0\n" unless ($gs_identity >= 0.0 && $gs_identity <= 1.0); | |
1027 | - | |
1028 | - #Option -f | |
1029 | - $gs_idFormat = lc $gs_idFormat; | |
1030 | - unless ($gs_idFormat =~ /^(tc|tca|o)$/) { | |
1031 | - die "Error: There are 3 Valid options for -f (tc, tca, o)\n"; | |
1032 | - } | |
1033 | - | |
1034 | - | |
1035 | - #option -if | |
1036 | - $infileFmt = lc $infileFmt; | |
1037 | - unless ($infileFmt =~ /^(hmmtop|tms)$/) { | |
1038 | - die "Error: invalid input file format: '$infileFmt' (Valid options: hmmtop, tms).\n"; | |
1039 | - } | |
1040 | - | |
1041 | - | |
1042 | - #option -m | |
1043 | - $mode = lc $mode; | |
1044 | - unless ($mode =~ /^(all|each|debug)$/) { | |
1045 | - die "Error: invalid mode of operation '$mode'. Valid options are: all, each!\n"; | |
1046 | - } | |
1047 | - | |
1048 | - | |
1049 | - #Option -s | |
1050 | - unless (-d $gs_seqDir) { | |
1051 | - die "Error: Directory with sequences must exits -> $gs_seqDir\n"; | |
1052 | - } | |
1053 | - | |
1054 | - | |
1055 | - #Validate GSAT cutoff | |
1056 | - unless ($min_gsat_score >= 0) { | |
1057 | - die "Use GSAT cutoff >= 3.0!\n"; | |
1058 | - } | |
1059 | - | |
1060 | - | |
1061 | - #option -ncs | |
1062 | - $compStats = ($compStatsFlag)? "" : "-k 1000 -z 11"; | |
1063 | -} | |
1064 | - | |
1065 | - | |
1066 | - | |
1067 | 137 | sub print_help { |
1068 | 138 | |
1069 | - my $help = <<'HELP'; | |
139 | + my $help = <<'HELP'; | |
1070 | 140 | |
1071 | -This script searches for regions of TMSs repeated in a full protein. | |
141 | +This program searches for repeats between different user-specified | |
142 | +regions of proteins. | |
1072 | 143 | |
1073 | --i, --infile {path} | |
1074 | - Input file with id/accession(s) of the protein(s) to analyze and the coordinates | |
1075 | - of the TMSs in that protein(s). Use option -if to specify the format of this | |
1076 | - file. | |
1077 | - (Argument is mandatory). | |
144 | + Command line options: | |
1078 | 145 | |
1079 | --if, --infile-format {string} (optional) | |
1080 | - Format of the TMS coordenates. It can be either 'tms' or 'hmmtop'. | |
1081 | - (Default: hmmtop) | |
146 | + -s, --seqs-file {file} (mandatory) | |
147 | + Path to file in fasta format with all the input sequences. | |
148 | + This option is incompatible with option -d. But one of the | |
149 | + two options must be given. | |
1082 | 150 | |
1083 | --o, --outdir {path} | |
1084 | - Output directory where results will be saved. | |
1085 | - (Default: repeats) | |
151 | + -d, --seqs-dir {path} (optional) | |
152 | + Path to directory where the input sequences are located. | |
153 | + This option is incompatible with option -s. But one of the | |
154 | + two options must be given. | |
1086 | 155 | |
1087 | --s, --seqs {path} | |
1088 | - Directory to access the sequences in FASTA format that will be used to | |
1089 | - search for repeats. One file per sequence, and the name of the file is | |
1090 | - the accession of the protein followed by '.faa' | |
1091 | - (Argument is mandatory) | |
156 | + -o, --oudir {path} (optional) | |
157 | + Path to the output directory. | |
158 | + (Default: ./tmsRepeat) | |
1092 | 159 | |
1093 | --f, --id-format {string} | |
1094 | - Acceptable formats for identifiers: | |
1095 | - tc plain tcdb identifier of a system (e.g., 2.A.1.8.1) | |
1096 | - tca tcdb id and accession separated by dash (e.g. 2.A.1.8.3-Q9R6U5) | |
1097 | - o other, it can be refSeq, uniprot or custom, but it is requried | |
1098 | - that is is a single string without spaces. | |
1099 | - (Argument is mandatory) | |
160 | + -t, --tms {file} (optional) | |
161 | + File with the output of hmmtop for the input sequences, if available. | |
162 | + (Default: run hmmtop on input sequences) | |
1100 | 163 | |
1101 | --r, --repeat-unit {int) | |
1102 | - Size in TMS of the repeat unit to search in the protein. | |
1103 | - (Argument is mandatory) | |
164 | + -e, --evalue {float} (optional) | |
165 | + Maximal evalue cutoff for the aligned segments. | |
166 | + (Default: 0.001) | |
1104 | 167 | |
1105 | --t, --tail-size {int} | |
1106 | - Number of residues to add to the beginning and end of TMS regions before | |
1107 | - running comparisons. Value should be less than or equal to 15 residues. | |
1108 | - (Default: 5); | |
168 | + -i, --identity {float} (optional) | |
169 | + Minimal identity in aligned regions. | |
170 | + (Default: 0.2) | |
1109 | 171 | |
1110 | --e, --evalue {float} | |
1111 | - Maximum evalue to consider an alignment between two TMS bundles significant. | |
1112 | - (Default: 0.1); | |
172 | + -c, --coverage {float} (optional) | |
173 | + Minimal coverage cutoff within the range: [0, 1] for the coverage of aligned regions. | |
174 | + (Default: 0.85) | |
1113 | 175 | |
1114 | --ncs, --no-comp-stats {FLAG} | |
1115 | - If present, this flag indicates that E-values will not be corrected using | |
1116 | - compositional statistics. | |
1117 | - (Default: apply correction). | |
176 | + -h, --help | |
177 | + Display this help. Also displayed if script is run without arguments. | |
1118 | 178 | |
1119 | --c, --coverage {float} | |
1120 | - Minimum alignment coverage of the smallest bundle to consider an alignment | |
1121 | - signifiant. | |
1122 | - (Default: 0.8) | |
1123 | - | |
1124 | --id, --identity {float} | |
1125 | - Minimum identity, expressed as a float in the 0-1 range, to consider an | |
1126 | - alignment signficant. | |
1127 | - (Defatul: 0.25); | |
1128 | - | |
1129 | --gs, --gsat-shuffles {int} | |
1130 | - Number of shuffles that will be used to run GSAT on good matches. | |
1131 | - (Default: 1000); | |
1132 | - | |
1133 | --z, --gsat-cutoff {int} | |
1134 | - Minimum GSAT score cutoff to select good hits. | |
1135 | - (Default: 4.0) | |
1136 | - | |
1137 | --h, --help | |
1138 | - Print this help message. It takes precedence to any other option. | |
1139 | - | |
1140 | 179 | HELP |
1141 | 180 | |
1142 | - print $help; | |
1143 | - exit; | |
1144 | - | |
181 | + print $help; | |
182 | + exit; | |
1145 | 183 | } |