Commit 68b41f16a35757f498478ba2822e6978013608f2

Authored by Luis Arturo Medrano-Soto
1 parent 6342e60576
Exists in master

update tmsRepeat.pl, alignSeqsFiles.pl and locateFragment.pl to work with python3 scripts

Showing 3 changed files with 128 additions and 1080 deletions Side-by-side Diff

alignSeqsFiles.pl View file @ 68b41f1
... ... @@ -58,6 +58,8 @@
58 58 my $segFilter = 'no';
59 59 my $minLength = 30; #Min length of proteins to analyze (without gaps)
60 60 my $subMatrix = 'BL50';
  61 +my $hyd_qylim = undef; #Y-axis limits for query hydropathy plot [low, high]
  62 +my $hyd_sylim = undef; #Y-axis limits for subject hydropathy plot [low, high]
61 63  
62 64 #this can be used to remove long sequences from results
63 65 my $maxProtLength = 100000; #default threshold to allow any length
... ... @@ -443,6 +445,9 @@
443 445 #Run quod on the query, subject and the alignment.
444 446  
445 447  
  448 +#quod.py -q -l "HEB99829" -o plot.png --width 15 --edgecolor red --xticks 25 --no-tms +0 --add-tms 9-32 43-67 98-121 132-151 164-181 192-215 224-241:orange -w 17-245:+2.7:+:Alignment --region-font 12 --add-region 20-245:'PF07556':-2.8,-2.6:red,black:tc --mark +0:K,R,H:black --xlim 0 400 -- HEB99829.faa
  449 +
  450 +
446 451 sub run_quod {
447 452  
448 453 my ($q, $s, $qs, $qe, $ss, $se, $qseq, $sseq) = @_;
... ... @@ -469,8 +474,9 @@
469 474  
470 475 #Note alnquod requires to add the extension to the image name
471 476 my $alnFig = "$plotsDir/${q}_vs_${s}_qs${qs}_qe${qe}_ss${ss}_se${se}.png";
472   - my $cmd1 = qq(alnquod.py --grid -q -l "$q (red) and $s (blue)" -o $alnFig --xticks 25 --width 15 -- $qalnFile $seqDir/${q}.faa $salnFile $seqDir/${s}.faa);
473   - #print "$cmd1\n\n";
  477 + my $cmd1 = qq(quod.py -q -l "$q (red) and $s (blue)" -o $alnFig --xticks 25 --width 15 --edgecolor +0:red +1:blue --facecolor +0:orange +1:cyan --multi frag -- $qalnFile $seqDir/${q}.faa $salnFile $seqDir/${s}.faa);
  478 +# print "$cmd1\n\n";
  479 +# exit;
474 480 system $cmd1 unless (-f "${alnFig}");
475 481 return undef unless (-f "${alnFig}");
476 482  
477 483  
... ... @@ -483,17 +489,18 @@
483 489 die "Error: no hmmtop results for: $q" unless (exists $hmmtopHits{$q});
484 490 my $qTMS = "";
485 491 if (scalar @{ $hmmtopHits{$q}{coords} } > 0) {
486   - $qTMS = "-at " . join(",", @{ $hmmtopHits{$q}{coords} }) . ":orange";
  492 + $qTMS = "--add-tms " . join(",", @{ $hmmtopHits{$q}{coords} }) . ":orange";
487 493 }
488 494  
489 495  
490 496 #Plot query hydropathy
491 497 my $qPfam = get_pfam_coords_for_quod($q, "red");
492   - my $qName = "$plotsDir/${q}_vs_${s}_qaln_qs${qs}_qe${qe}";
493   - my $cmd2 = qq(quod.py --grid -q -l "$q" -o $qName --width 15 --color red --xticks 25 -w ${qs}-${qe}::1 -t png -nt +0 $qTMS $qPfam -- $seqDir/${q}.faa);
494   - #print "$cmd2\n\n";
495   - system $cmd2 unless (-f "${qName}.png");
496   - return undef unless (-f "${qName}.png");
  498 + my $qName = "$plotsDir/${q}_vs_${s}_qaln_qs${qs}_qe${qe}.png";
  499 + my $cmd2 = qq(quod.py -q -l "$q" -o $qName --width 15 --edgecolor red --xticks 25 -w ${qs}-${qe}:+2.7:+:Alignment --no-tms +0 $qTMS $qPfam -- $seqDir/${q}.faa);
  500 +# print "$cmd2\n\n";
  501 +# exit;
  502 + system $cmd2 unless (-f $qName);
  503 + return undef unless (-f $qName);
497 504  
498 505  
499 506  
500 507  
... ... @@ -501,16 +508,17 @@
501 508 die "Error: no hmmtop results for: $s" unless (exists $hmmtopHits{$s});
502 509 my $sTMS = "";
503 510 if (scalar @{ $hmmtopHits{$s}{coords} } > 0) {
504   - $sTMS = "-at " . join(",", @{ $hmmtopHits{$s}{coords} }) . ":cyan";
  511 + $sTMS = "--add-tms " . join(",", @{ $hmmtopHits{$s}{coords} }) . ":cyan";
505 512 }
506 513  
507 514 #Plot Subject hydropathy
508 515 my $sPfam = get_pfam_coords_for_quod($s, "blue");
509   - my $sName = "$plotsDir/${q}_vs_${s}_saln_ss${ss}_se${se}";
510   - my $cmd3 = qq(quod.py --grid -q -l "$s" -o $sName --width 15 --color blue --xticks 25 -w ${ss}-${se}::1 -t png -nt +0 $sTMS $sPfam -- $seqDir/${s}.faa);
511   - #print "$cmd3\n\n";
512   - system $cmd3 unless (-f "${sName}.png");
513   - return undef unless (-f "${sName}.png");
  516 + my $sName = "$plotsDir/${q}_vs_${s}_saln_ss${ss}_se${se}.png";
  517 + my $cmd3 = qq(quod.py -q -l "$s" -o $sName --width 15 --edgecolor blue --xticks 25 -w ${ss}-${se}:+2.7:+:Alignment --no-tms +0 $sTMS $sPfam -- $seqDir/${s}.faa);
  518 +# print "$cmd3\n\n";
  519 +# exit;
  520 + system $cmd3 unless (-f $sName);
  521 + return undef unless (-f $sName);
514 522  
515 523  
516 524 return 1;
... ... @@ -535,7 +543,7 @@
535 543 if (exists $pfamHits{$prot}) {
536 544 my @Doms = keys %{ $pfamHits{$prot} };
537 545 my $dcnt = 0;
538   - $str = "--region-font 12 -ar ";
  546 + $str = "--region-font 12 --add-region ";
539 547 foreach my $d (@Doms) {
540 548  
541 549 my @hits = @{ $pfamHits{$prot}{$d} };
... ... @@ -543,8 +551,10 @@
543 551 my $left = $hit->{qstart};
544 552 my $right = $hit->{qend};
545 553  
546   - my $ypos = -2.8 + $dcnt * 0.4;
547   - $str .= "${left}-${right}:'${d}':${ypos}:$color ";
  554 + my $yposl = -2.8 + $dcnt * 0.4; #domain bottom coord
  555 + my $yposh = $yposl + 0.15; #domain height coord
  556 +
  557 + $str .= "${left}-${right}:'${d}':${yposl},${yposh}:$color,black:tc ";
548 558 $dcnt++;
549 559 }
550 560 }
locateFragment.pl View file @ 68b41f1
... ... @@ -142,7 +142,7 @@
142 142 #Generate quod plot
143 143  
144 144 #Format string for the regions
145   - my $regions = "-at ";
  145 + my $regions = "--add-tms ";
146 146 my $coords = "";
147 147 foreach my $hit (@res) {
148 148  
tmsRepeat.pl View file @ 68b41f1
Diff suppressed. Click to show
1   -#!/usr/bin/env perl -w
  1 +#!/usr/bin/env perl
2 2  
3   -use warnings;
  3 +no warnings;
4 4 use strict;
5 5 use Data::Dumper;
6 6  
7   -$Data::Dumper::Deepcopy = 1;
8   -$Data::Dumper::Indent = 1;
9   -#$Data::Dumper::Purity = 0;
10   -$Data::Dumper::Sortkeys = 1;
11   -
  7 +use TCDB::Repeats;
12 8 use Getopt::Long;
13   -use Bio::SearchIO;
14   -use Bio::SeqIO;
15 9  
16 10  
17   -use TCDB::CheckDependencies;
18   -use TCDB::Assorted;
  11 +my $seqsDir = undef; #'/Users/amedrano/Desktop/Mai_tmsRepeat/sequences';
  12 +my $seqsFile = undef;
  13 +my $tmsFile = undef; #'/Users/amedrano/Desktop/Mai_tmsRepeat/tms.hmmtop';
  14 +my $outDir = "Repeats"; #'/Users/amedrano/Desktop/Mai_tmsRepeat/RepeatUnits/ResultsOOP';
19 15  
  16 +my $evalue = 1e-2;
  17 +my $coverage = 0.85;
  18 +my $identity = 0.2;
20 19  
  20 +my @tmsRanges = ();
21 21  
22   -#==========================================================================
23   -#Check dependencies
  22 +read_command_line();
24 23  
25   -my @dependencies = ("water", "ssearch36", "extractFamily.pl", "tmsplit", "quod.py");
26   -my $CheckDep_obj = new TCDB::CheckDependencies();
27   -$CheckDep_obj -> dependencies_list(\@dependencies);
28   -$CheckDep_obj -> checkDependencies;
29 24  
30   -
31   -
32   -#==========================================================================
33   -#Read command line options
34   -
35   -my $gs_infile = "";
36   -my $infileFmt = "hmmtop"; #The other option is 'tms' which is the ID and TMS
37   -my $gs_idFormat = "";
38   -my $gs_repUnit = 0;
39   -my $gs_seqDir = "";
40   -my $gs_tail = 5;
41   -my $gs_evalue = 0.1;
42   -my $gs_coverage = 0.8;
43   -my $gs_identity = 0.25;
44   -my $gsatShuffles = 1000;
45   -my $min_gsat_score = 4.0;
46   -
47   -my $compStatsFlag = 1;
48   -my $compStats = "";
49   -my $outdir = "repeats";
50   -my $repDir = "reports";
51   -my $seqDir = "sequences";
52   -my $alignDir = "alignments";
53   -my $plotsDir = "plots";
54   -my $goodHitsOnly = 1; #print only significant results, ignore everything else
55   -
56   -
57   -#all (all sequences in output file)
58   -#each (generate one directory per sequence.. for better organization)
59   -#debug (it will print the contents of the hash table one sequences at a time)
60   -my $mode = "all";
61   -
62   -read_command_line_arguments();
63   -
64   -#print Data::Dumper->Dump([$gs_infile, $gs_idFormat, $gs_repUnit, $gs_seqDir,
65   -# $gs_tail, $gs_evalue, $gs_coverage, $gs_identity, $gsatShuffles, $compStatsFlag, $compStats],
66   -# [qw(*infile *idFormat *repUnit *seqDir *tail *evalue
67   -# *coverage *identity $gsatShuffles *compStatFlag *compStats)]);
  25 +#print Data::Dumper->Dump([$seqsDir, $seqsFile, $tmsFile, $outDir],
  26 +# [qw(*seqsDir *seqsFile *tmsFile *outDir )]);
68 27 #exit;
69 28  
70   -# ssearch36 -p -k 1000 -z 11 -E 1.0 -s BL62 -W 0 4.B.1_4tms_all/sequences/4.B.1.1.2-Q4QLL1_bundle1.faa 4.B.1_4tms_all/sequences/lib_4.B.1.1.2-Q4QLL1_bundle1.faa
71 29  
  30 +#my $repObj = TCDB::Repeat->new('seqsDir' => $seqsDir,
  31 +# 'tmsFile' => $tmsFile,
  32 +# 'outDir' => $outDir,
  33 +# 'ranges2searchTMS' => \@TMSranges);
72 34  
73   -#==========================================================================
74   -#Read file with coordinates of TMSs and verify that the sequences are
75   -#available
76 35  
77   -my %gh_tms = ();
  36 +my @TMSranges = ([1, 3], [4, 6]);
78 37  
79   -read_tms_coordinates_file($gs_infile, \%gh_tms);
  38 +my $repObj = TCDB::Repeat->new();
80 39  
81   -#print Data::Dumper->Dump([ \%gh_tms], [qw(*tms )]);
82   -#exit;
  40 +#$repObj->tmsFile($tmsFile);
  41 +#$repObj->seqsDir($seqsDir);
  42 +$repObj->seqsFile($seqsFile);
  43 +$repObj->outDir($outDir);
  44 +$repObj->evalueCutoff($evalue);
  45 +$repObj->identityCutoff($identity);
  46 +$repObj->coverageCutoff($coverage);
  47 +$repObj->TMSranges2search(\@TMSranges);
83 48  
  49 +$repObj-> findRepeatsTMSranges();
84 50  
85   -#===========================================================================
86   -#Main Output directory
  51 +#print Data::Dumper->Dump([$repObj ], [qw(*repObj)]);
87 52  
88   -#Root directory for all results
89   -system "mkdir -p $outdir" unless (-d $outdir);
90   -die "Could not generate output directory: $outdir" unless (-d $outdir);
91 53  
92 54  
93 55  
94   -#==========================================================================
95   -#Search for repeats inside query sequences
96 56  
97   -my %results = ();
98   -my %origSeqLength = (); #To calculate x-ticks spacing in hydropathy plots
99   -
100   -foreach my $ls_sid (keys %gh_tms) {
101   -
102   - my %gh_bundleSeqs = ();
103   - my %gh_topHits = ();
104   -
105   -
106   - print "Processing: $ls_sid\n";
107   -
108   -
109   - #Clean results if one output directory is generated per input sequence
110   - %results = () if ($mode eq 'each');
111   -
112   -
113   - #Cut sequences in non overlaping regions with as many TMS as the
114   - #repeat unit we want to find.
115   - cut_seq_in_tms_regions ($ls_sid, $gs_repUnit, \%gh_tms, \%gh_bundleSeqs);
116   -
117   -
118   -# print Data::Dumper->Dump([\%gh_bundleSeqs ], [qw(*bundleSeqs)]);
119   -# <STDIN>;
120   -
121   -
122   - #run ssearch to find potential repeats.
123   - align_bundles($ls_sid,\%gh_bundleSeqs, \%gh_topHits);
124   -
125   -
126   -# print Data::Dumper->Dump([\%gh_topHits ], [qw(*topHits )]);
127   -# <STDIN>;
128   -
129   -
130   - #Collect results for final table
131   - $results{$ls_sid} = \%gh_topHits;
132   -
133   - #present results per input sequence to verify everything looks fine.
134   - if ($mode eq 'debug') {
135   - print Data::Dumper->Dump([\%gh_topHits], [qw(*topHits)]);
136   - <STDIN>;
137   - }
138   -
139   - print_reports(\%results) if ($mode eq 'each');
140   -}
141   -
142   -
143   -
144   -
145   -
146 57 #===========================================================================
147   -#Print final results in summarized or detailed format
  58 +#Read command line and print help
148 59  
149   -#print Data::Dumper->Dump([\%results ], [qw(*results )]);
150   -#<STDIN>;
151 60  
152   -print_reports(\%results) if ($mode eq 'all');
  61 +sub read_command_line {
153 62  
  63 + print_help() unless (@ARGV);
154 64  
  65 + my $status = GetOptions(
  66 + "s|seqs-file=s" => \&read_seqsFile,
  67 + "d|seqs-dir=s" => \&read_seqsDir,
  68 + "o|outdir=s" => \&read_outdir,
  69 + "t|tms=s" => \&read_tmsFile,
  70 + "e|evalue=f" => \$evalue,
  71 + "i|identity=f" => \$identity,
  72 + "c|coverage=f" => \$coverage,
  73 + "h|help" => sub { print_help(); },
  74 + "<>" => sub { die "Error: Unknown argument: $_[0]\n"; });
  75 + exit unless ($status);
155 76  
156 77  
157   -
158   -###########################################################################
159   -# #
160   -# Subroutine definition #
161   -# #
162   -###########################################################################
163   -
164   -
165   -#print final_report
166   -
167   -sub print_reports {
168   -
169   - my $res = shift;
170   -
171   -
172   - #Get the directory where reports will be saved
173   - my $reportDir = undef;
174   - if ($mode eq 'all') {
175   - $reportDir = getReportsDir();
176   - }
177   - else {
178   -
179   - #one id per report
180   - my @ids = keys %$res;
181   - my $seqId = $ids[0];
182   -
183   - $reportDir = getReportsDir($seqId);
184   - }
185   - die "Error: invalid report dir" unless ($reportDir);
186   -
187   -
188   - my $sumFile = "$reportDir/repeats_summary_report.txt";
189   - my $detailsFile = "$reportDir/repeats_detailed_report.txt";
190   - my $htmlFile = "$reportDir/report.html";
191   -
192   -
193   - open (my $htmlfh, ">", $htmlFile) || die $!;
194   -
195   - my $htmlHeader = <<HEADER;
196   -<!DOCTYPE html>
197   -<html xmlns="http://www.w3.org/1999/xhtml">
198   - <head>
199   - <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
200   -
201   - <style type="text/css">
202   -
203   -.label {
204   - text-align: right;
205   - width: 50px;
  78 + #Validate input file option
  79 + die "Error: no sequence file detected!" unless ($seqsFile);
206 80 }
207 81  
208   -.data {
209   - text-align: left;
210   - padding-left: 8px;
211   - width: 100px;
212   -}
213 82  
214   -.uline {
215   - text-decoration: underline;
216   -}
217   -
218   -.seq {
219   - border: 2px solid black;
220   - height: 70px;
221   - width: 100%;
222   - overflow-x: auto;
223   - overflow-y: hidden;
224   - margin: 1em 0;
225   - background: gray;
226   - color: white;
227   -}
228   -
229   -img {
230   - display: block;
231   - margin-left: auto;
232   - margin-right: auto;
233   - height: 250px;
234   - width: auto;
235   - max-width: 1500px;
236   - max-height: 300px;
237   -}
238   -
239   - </style>
240   - <title>Inferring repeats of $gs_repUnit TMS</title>
241   - </head>
242   - <br />
243   - <h1 style='text-align:center'>Inferred Repeats Based On ${gs_repUnit}-TMS Bundles</h1>
244   - <body>
245   -
246   -HEADER
247   -
248   - print $htmlfh $htmlHeader;
249   - open (my $sumh, ">", $sumFile) || die $!;
250   - open (my $deth, ">", $detailsFile) || die $!;
251   -
252   -
253   - #Header for summary table
254   - print $sumh "#Accession\tQ_bundle\tS_bundle\tQ_len\tS_len\tE-value\tIdentity\tGSAT\tAln_len\tQ_cov\tS_cov\n";
255   -
256   -
257   -# print Data::Dumper->Dump([$res ], [qw(*res )]);
258   -# <STDIN>;
259   -
260   -
261   - P:foreach my $id (sort {$a cmp $b} keys %$res) {
262   -
263   - #Jump to next result if there are NO hits for this protein and
264   - #ONLY good hits are going to be recorded.
265   - unless (%{ $res->{$id} }) {
266   - next P if ($goodHitsOnly);
267   - }
268   -
269   -
270   - print $deth "===========================================================================\n";
271   - print $htmlfh " <br /><hr style=\"border-style:solid; border-width:5px; color:black;\"/>\n";
272   -
273   - #There must be results to continue
274   - unless (%{ $res->{$id} }) {
275   - print $sumh "$id\tNo_hits\n";
276   - print $deth "$id\tNo_hits\n\n\n";
277   - print $htmlfh " <h2 style=\"text-align:center;\">$id</h2>\n <p><b>No candidate repeats found</b></p>\n";
278   - }
279   -
280   - print $deth "$id\n\n";
281   - print $htmlfh " <h2 style=\"text-align:center;\">$id</h2>\n";
282   -
283   -
284   -
285   -
286   - #get the long bundle names
287   - BS:foreach my $bundleName (sort {$a cmp $b} keys %{ $res->{$id} }) {
288   -
289   - BN:foreach my $bundleNumber (sort {$a <=> $b} keys %{ $res->{$id}->{$bundleName} }) {
290   -
291   - my $qName = $res->{$id}->{$bundleName}->{$bundleNumber}->{qName};
292   - my $qLen = $res->{$id}->{$bundleName}->{$bundleNumber}->{qLen};
293   -
294   - #Each of the hits for this bundle
295   - my @hits_tmp = @{ $res->{$id}->{$bundleName}->{$bundleNumber}->{hits} };
296   -
297   - #To get rid of a warning when there is only one hit.
298   - my @hits = (scalar (@hits_tmp) > 1)?
299   - sort {$a->{hName} cmp $b->{hName}} @hits_tmp : @hits_tmp;
300   -
301   - foreach my $hit (@hits) {
302   -
303   - my $hName = $hit->{hName};
304   - my $hLen = $hit->{hLen};
305   -
306   - my $evalue = sprintf ("%.1e", $hit->{hEvalue});
307   - my $ident = sprintf ("%.1f", $hit->{hId} * 100);
308   - my $sim = sprintf ("%.1f", $hit->{hSim} * 100);
309   - my $gsat = sprintf ("%.1f", $hit->{gsat});
310   -
311   - my $alnLen = $hit->{alnLen};
312   - my $qCov = sprintf("%.1f", $hit->{qCov} * 100);
313   - my $hCov = sprintf("%.1f", $hit->{hCov} * 100);
314   -
315   -
316   - #The alignment
317   - my $qstart = $hit->{qstart};
318   - my $qend = $hit->{qend};
319   - my $sstart = $hit->{sstart};
320   - my $send = $hit->{send};
321   - my $qSeq = $hit->{qSeq};
322   - my $homStr = $hit->{homStr};
323   - my $sSeq = $hit->{sSeq};
324   -
325   - my $plot = $hit->{plot};
326   -
327   - #For summary tab-delimitedfile (everything except the alignment)
328   - print $sumh "$id\t$qName\t$hName\t$qLen\t$hLen\t$evalue\t$ident\t$gsat\t$alnLen\t$qCov\t$hCov\n";
329   -
330   -
331   - #Detailed report that includes the alignment
332   - print $deth "----------\n";
333   - print $deth "$qName ($qLen) vs $hName ($hLen)\n\n";
334   - print $deth "E-value: $evalue Identity: ${ident}% GSAT: $gsat\n";
335   - print $deth "Q_cov: ${qCov}% S_cov: ${hCov}% Aln_length: $alnLen\n\n";
336   - print $deth "Alignment ($qName|${qstart}-$qend vs $hName|${sstart}-$send):\n$qSeq\n$homStr\n$sSeq\n\n\n";
337   -
338   -
339   - #The HTML report (includes alignment and hydropathy image
340   - my $repHit = <<HIT;
341   -
342   - <p><b>$qName ($qLen) vs $hName ($hLen)</b></p>
343   -
344   - <table width="600px" border="0" cellspacing="0" cellpadding="2">
345   - <tr>
346   - <td class='label'><b>E-value:</b></td>
347   - <td class='data'>$evalue</td>
348   - <td class='label'><b>Identity:</b></td>
349   - <td class='data'>${ident}%</td>
350   - <td class='label'><b>Similarity:</b></td>
351   - <td class='data'>${sim}%</td>
352   - <td class='label'><b>GSAT:</b></td>
353   - <td class='data'>$gsat</td>
354   - </tr>
355   - <tr>
356   - <td class='label'><b>Aln:</b></td>
357   - <td class='data'>$alnLen</td>
358   - <td class='label'><b>Q_cov:</b></td>
359   - <td class='data'>${qCov}%</td>
360   - <td class='label'><b>S_cov:</b></td>
361   - <td class='data'>${hCov}%</td>
362   - <td class='label'></td>
363   - <td class='data'></td>
364   - </tr>
365   - </table>
366   -
367   - <p><b>Alignment (</b>$qName:<b class="uline">${qstart}-$qend</b> vs $hName:<b class="uline">${sstart}-$send</b><b>):</b></p>
368   - <div class='seq'>
369   - <pre>
370   -$qSeq
371   -$homStr
372   -$sSeq
373   - </pre>
374   - </div>
375   - <a href="$plot" target="_blank"><img src="$plot"/></a>
376   - <br />
377   - <hr />
378   -
379   -HIT
380   -
381   - print $htmlfh $repHit;
382   -
383   - } #hit
384   - } #reference bundle number
385   - } #Reference bundle name
386   - } #Query protein
387   -
388   - #Close HTML report
389   - my $closeRep = <<CLOSE;
390   - </body>
391   -</html>
392   -CLOSE
393   -
394   - print $htmlfh $closeRep;
395   -
396   - close $sumh;
397   - close $deth;
398   - close $htmlfh;
399   -}
400   -
401   -
402   -
403 83 #==========================================================================
404   -#Run ssearch36 between the different bundles in a sequence
  84 +#Option -s
405 85  
406   -sub align_bundles {
  86 +sub read_seqsFile {
  87 + my ($opt, $value) = @_;
407 88  
408   - my ($seqId, $lhr_bundleSeqFiles, $lhr_topHits) = @_;
409   -
410   - %$lhr_topHits = ();
411   -
412   - #Directory where the sequences of TMS bundles are saved
413   - my $sequencesDir = undef;
414   - my $alignmentsDir = undef;
415   - my $hydroPlotsDir = undef;
416   -
417   - if ($mode eq 'all') {
418   - $sequencesDir = getSequencesDir();
419   - $alignmentsDir = getAlignmentsDir();
420   - $hydroPlotsDir = getPlotsDir();
  89 + unless (-f $value && !(-z $value)) {
  90 + die "Error: file with sequences does not exist or is empty!\n";
421 91 }
422   - else {
423   - $sequencesDir = getSequencesDir($seqId);
424   - $alignmentsDir = getAlignmentsDir($seqId);
425   - $hydroPlotsDir = getPlotsDir($seqId);
426   - }
427   - die "Error: invalid sequences dir" unless ($sequencesDir);
428   - die "Error: invalid alignments dir" unless ($alignmentsDir);
429   - die "Error: invalid plots dir" unless ($hydroPlotsDir);
430 92  
431   -
432   -# print Data::Dumper->Dump([$lhr_bundleSeqFiles ], [qw(*files )]);
433   -# <STDIN>;
434   -
435   -
436   - #The bundle that will be used as reference for the comparison
437   - REF:foreach my $bundle (sort {$a <=> $b} keys %$lhr_bundleSeqFiles) {
438   -
439   - my $rFile = "$sequencesDir/" . $lhr_bundleSeqFiles->{$bundle}->[0];
440   -
441   -
442   - #Id to name ssearch36 output files
443   - my $id = $lhr_bundleSeqFiles->{$bundle}->[0];
444   - $id =~ s/\.faa//;
445   -
446   -
447   - #For naming GSAT files (ID of system or protein accession)
448   - my $tcAcc = ($id =~ /(\S+)_bundle.*/)? $1 : undef;
449   - die "Could not extract accession from $id!" unless ($id);
450   -
451   -
452   -# print Data::Dumper->Dump([$id, $tcAcc ], [qw(*id *tcAcc)]);
453   -# <STDIN>;
454   -
455   -
456   - #--------------------------------------------------------------------
457   - #Get the non-overlapping bundles to compare them against the
458   - #reference bundle
459   -
460   - my @cmpFiles = ();
461   -
462   - #Initialize the index to the first non-overlapping bundle
463   - my $next_bundle_idx = $bundle + $gs_repUnit;
464   -
465   - CMP:while (1) {
466   -
467   - #Exit if next bundle is not in bundles hash
468   - last CMP unless (exists $lhr_bundleSeqFiles->{$next_bundle_idx});
469   -
470   - #Get file name for this non-overlapping bundle
471   - my $cmpBundle = $sequencesDir . "/" . $lhr_bundleSeqFiles->{$next_bundle_idx}->[0];
472   - push (@cmpFiles, $cmpBundle);
473   -
474   - #Update the index to the next non-overlapping bundle
475   - $next_bundle_idx = $next_bundle_idx + $gs_repUnit;
476   - }
477   -
478   - #go to next reference bundle if there are no non-overlapping bundles.
479   - next REF unless (@cmpFiles);
480   -
481   -
482   -# print Data::Dumper->Dump([\@cmpFiles ], [qw(*cmpFiles )]);
483   -# <STDIN>;
484   -
485   -
486   - #--------------------------------------------------------------------
487   - #Now run ssearch36 of the reference bundle against all its
488   - #non-overlapping bundles
489   -
490   - #put all non-overlapping bundles into a file
491   - my $libFile = "$sequencesDir/lib_$id.faa";
492   - my $cmd = "cat " . join(" ", @cmpFiles) . " > $libFile";
493   - system $cmd;
494   -
495   -
496   - #run ssearch36 of $rFile vs @cmpFile
497   - my $ssearchOut = "$alignmentsDir/ssearch_$id.out";
498   - my $ssearch_params = qq(-p $compStats -E $gs_evalue -s BL62 -W 0 $rFile $libFile > $ssearchOut);
499   - system "ssearch36 $ssearch_params" unless (-f $ssearchOut);
500   -
501   -
502   -# print Data::Dumper->Dump([$ssearchOut ], [qw(*ssearchOut )]);
503   -# <STDIN>;
504   -
505   -
506   - #---------------------------------------------------------------------
507   - #Estimate here the spacing between x-ticks for hydropathy plots
508   -
509   - my $protLen = $origSeqLength{$seqId};
510   -
511   - my $xticksSpacing = undef;
512   - if ($protLen <= 500) {
513   - $xticksSpacing = 25;
514   - }
515   - elsif ($protLen <= 1000) {
516   - $xticksSpacing = 50;
517   - }
518   - else {
519   - $xticksSpacing = 100;
520   - }
521   -
522   -
523   -
524   - #--------------------------------------------------------------------
525   - #parse ssearch36 output. For BioPerl resouces check:
526   - #http://search.cpan.org/dist/BioPerl/Bio/SearchIO.pm
527   - #https://classes.soe.ucsc.edu/bme060/Winter07/bptutorial.html
528   -
529   - my $parser = new Bio::SearchIO (-format => 'fasta', -file => $ssearchOut);
530   -
531   -
532   - #put hir the top hits
533   - my %lh_hits = ();
534   -
535   -
536   - while (my $result = $parser->next_result) {
537   -
538   -
539   - my $qLen = $result->query_length;
540   - $lh_hits{$bundle}{qName} = $result->query_name;
541   - $lh_hits{$bundle}{qLen} = $qLen;
542   - $lh_hits{$bundle}{hits} = [];
543   -
544   -
545   - HIT:while (my $hit = $result->next_hit) {
546   -
547   - HSP:while(my $hsp = $hit->next_hsp) {
548   -
549   -
550   -# print Data::Dumper->Dump([$hsp ], [qw(*hsp )]);
551   -# <STDIN>;
552   -
553   -
554   - my %tmp = ();
555   -
556   - my $alnLen = $hsp->hsp_length;
557   - my $hLen = $hit->length;
558   - my $hEvalue = $hsp->evalue;
559   - my $hId = $hsp->frac_identical('total'); #identity in the alignment
560   - my $hSim = $hsp->frac_conserved('total'); #similarity in the alignment
561   -
562   -
563   - #coordinates in the alignment to properly calculate coverages
564   - my $qstart = $hsp->start('query');
565   - my $qend = $hsp->end('query');
566   - my $sstart = $hsp->start('subject');
567   - my $send = $hsp->end('subject');
568   -
569   -
570   - #Calculate coverages properly (do not use alignment length as it includes gaps
571   -
572   - my $qCov_tmp = ($qend - $qstart + 1) / $qLen;
573   - my $qCov = ($qCov_tmp > 1.0)? 1.0 : $qCov_tmp;
574   -
575   - my $hCov_tmp = ($send - $sstart + 1) / $hLen;
576   - my $hCov = ($hCov_tmp > 1.0)? 1.0 : $hCov_tmp;
577   -
578   -
579   -# print Data::Dumper->Dump([$qLen, $qCov, $hLen, $hCov, $gs_coverage, $hEvalue, $gs_evalue, $hId, $gs_identity],
580   -# [qw(*qLen *qCov $hLen *hCov *coverageCutoff *evalue *evalCutoff *hId *IDcutoff)]);
581   -# <STDIN>;
582   -
583   -
584   - #Before storing hit results check minimum coverage, identity and evalue
585   - next HSP unless (($qCov >= $gs_coverage || $hCov >= $gs_coverage) &&
586   - ($hEvalue <= $gs_evalue) && ($hId >= $gs_identity));
587   -
588   -
589   - #hit identity
590   - $tmp{hName} = $hit->name;
591   - $tmp{hLen} = $hLen;
592   -
593   -
594   - #hit statistics
595   - $tmp{alnLen} = $alnLen;
596   - $tmp{hEvalue} = $hEvalue;
597   - $tmp{hId} = $hId;
598   - $tmp{hSim} = $hSim;
599   - $tmp{qCov} = $qCov;
600   - $tmp{hCov} = $hCov;
601   -
602   -
603   - #The alignment
604   - $tmp{qstart} = $qstart;
605   - $tmp{qend} = $qend;
606   - $tmp{sstart} = $sstart;
607   - $tmp{send} = $send;
608   -
609   - $tmp{qSeq} = $hsp->query_string;
610   - $tmp{sSeq} = $hsp->hit_string;
611   - $tmp{homStr} = $hsp->homology_string;
612   -
613   -
614   - #Get the GSAT score
615   - my $gsat_outFile = "$alignmentsDir/${tcAcc}_" . $lh_hits{$bundle}{qName} . "_vs_" . $tmp{hName} . ".gsat";
616   -
617   -
618   -# print "gsat.py $tmp{qSeq} $tmp{sSeq} $gsatShuffles > $gsat_outFile\n";
619   -# exit;
620   -
621   - system "gsat.py $tmp{qSeq} $tmp{sSeq} $gsatShuffles > $gsat_outFile" unless (-f $gsat_outFile);
622   -
623   - my $gsat_score = TCDB::Assorted::get_gsat_score ($gsat_outFile);
624   - $tmp{gsat} = $gsat_score;
625   -
626   -
627   -# print Data::Dumper->Dump([\%tmp ], [qw(*matchData )]);
628   -# <STDIN>;
629   -
630   -
631   - #GSAT is the last filter
632   - next HSP unless ($gsat_score >= $min_gsat_score);
633   -
634   - #------------------------------------------------------------
635   - #Generate quod plot with the repeat
636   -
637   - my $whole_prot_seq = "$gs_seqDir/${seqId}.faa";
638   - die "Protein sequence not found: $whole_prot_seq" unless (-f $whole_prot_seq);
639   -
640   -
641   - my $plotFile = "$hydroPlotsDir/${seqId}_" . $lh_hits{$bundle}{qName} . "_vs_" . $tmp{hName};
642   - my $fileName = "../$plotsDir/${seqId}_" . $lh_hits{$bundle}{qName} . "_vs_" . $tmp{hName} . ".png";
643   - my $plotTitle = $lh_hits{$bundle}{qName} . " vs " . $tmp{hName};
644   -
645   - #Get hydrophobic peaks coords
646   - my $hydroPeaks = $gh_tms{$seqId};
647   - die "No hydrophobic peaks found for sequence: $seqId" unless (@{ $hydroPeaks });
648   -
649   -
650   - #format the hydrophobic peaks for quod
651   - my @peaks = map { join ("-", @$_) . ":orange" } @$hydroPeaks;
652   - my $pstring = join (" ", @peaks);
653   -
654   -
655   - #----------
656   - #Calculate the positions of the aligned section of each bundle in the full sequence.
657   -
658   - my $q_bid = ($lh_hits{$bundle}{qName} =~ /BDL(\d+)/)? $1 : undef;
659   - my $s_bid = ( $tmp{hName} =~ /BDL(\d+)/)? $1 : undef;
660   - die "Could not extract bundle number for: $lh_hits{$bundle}{qName} or $tmp{hName}" unless ($q_bid && $s_bid);
661   -
662   -
663   - #extract initial positions for both bundles
664   - my $qbstart = $lhr_bundleSeqFiles->{$q_bid}->[1];
665   - my $qbend = $lhr_bundleSeqFiles->{$q_bid}->[2]; #$qLen - 1;
666   - my $sbstart = $lhr_bundleSeqFiles->{$s_bid}->[1];
667   - my $sbend = $lhr_bundleSeqFiles->{$s_bid}->[2]; #$hLen - 1;
668   - die "Could not extract coords for bundle $q_bid" unless ($qbstart && $qbend);
669   - die "Could not extract coords for bundle $s_bid" unless ($sbstart && $sbend);
670   -
671   -
672   - #Calculate bundle positions here
673   - my $qgp_start = $qbstart + ($qstart - 1);
674   - my $qgp_end = $qbstart + ($qend - 1);
675   -
676   - my $sgp_start = $sbstart + ($sstart - 1);
677   - my $sgp_end = $sbstart + ($send - 1);
678   -
679   -
680   - #Format the coordinates for the repeats now
681   - my $qrep = "${qgp_start}-${qgp_end}:green";
682   - my $srep = "${sgp_start}-${sgp_end}:blue";
683   -
684   - #Format the coordinates for the bar delimiting the bundles
685   - my $bars = "-w ${qbstart}-${qbend}::1 ${sbstart}-${sbend}::1";
686   -
687   - #The quod command line
688   - my $cmd = "quod.py $whole_prot_seq -t png -l '$plotTitle' -o $plotFile -q -r 80 $bars --xticks $xticksSpacing -nt +0 -at ${pstring} ${qrep} ${srep}";
689   -
690   - my $img = "${plotFile}.png";
691   - system $cmd unless (-f $img);
692   - die "Could not generate plot: $img" unless (-f $img);
693   -
694   - $tmp{plot} = $fileName;
695   -
696   -
697   - #load the data into the hits section for this bundle
698   - push (@{ $lh_hits{$bundle}{hits} }, \%tmp);
699   -
700   -
701   - } #HSP
702   - } #HIT
703   - } #While
704   -
705   -
706   - #Add results to the topHits hash
707   - if (@{ $lh_hits{$bundle}{hits} }) {
708   - $lhr_topHits->{$id} = \%lh_hits;
709   - }
710   -
711   - }
  93 + $seqsFile = $value;
712 94 }
713 95  
714 96  
715   -
716   -
717 97 #==========================================================================
718   -#Given a sequence, its TMS coordinates and a repeat size (rsize), cut the
719   -#sequence in TMS bundles of length rsize.
  98 +#Option -t
720 99  
  100 +sub read_tmsFile {
  101 + my ($opt, $value) = @_;
721 102  
722   -sub cut_seq_in_tms_regions {
723   -
724   - my ($ls_pid, $ls_repeat, $lhr_tms, $lhr_seqSegs) = @_;
725   -
726   -
727   - %$lhr_seqSegs = ();
728   -
729   -
730   - #Get the directory where bundle sequences will be saved
731   - my $sequencesDir = undef;
732   -
733   - if ($mode eq 'all') {
734   - $sequencesDir = getSequencesDir();
  103 + unless (-f $value && !(-z $value)) {
  104 + die "Error in option -t: File with TMSs (hhmtop output) does not exist or is empty!\n";
735 105 }
736   - else {
737   - $sequencesDir = getSequencesDir($ls_pid);
738   - }
739   - die "Error: invalid sequence dir" unless ($sequencesDir);
740 106  
741   -
742   - #----------------------------------------------------------------------
743   - #Get the coordinates of the overlapping bundles
744   -
745   - my @la_tms = @{ $lhr_tms->{$ls_pid} };
746   -
747   -
748   -
749   - #Get the Length of the sequence of the query protein
750   - my $seqFile = "$gs_seqDir/${ls_pid}.faa";
751   - my $obj = Bio::SeqIO->new(-file => $seqFile , -format => "fasta");
752   - my $seqObj = $obj->next_seq;
753   - my $qlength = $seqObj->length;
754   - die "Could not extract protein length." unless ($qlength);
755   -
756   - #Store the length of the original sequence for proper calculation of
757   - #the x-ticks in the hydropathy plots of the results
758   - $origSeqLength{$ls_pid} = $qlength;
759   -
760   -
761   -
762   - #Number of TMS in protein
763   - my $ls_ntms = scalar (@la_tms);
764   -
765   -
766   -
767   - for (my $idx=1; $idx <= ($ls_ntms - $ls_repeat + 1); $idx++) {
768   -
769   - #TMS in bundle
770   - my $left_tms = $la_tms[$idx - 1];
771   - my $right_tms = $la_tms[$idx + $ls_repeat - 2];
772   -
773   -
774   - #The coordinates of the bundle
775   - my $left_pos = (($left_tms->[0] - $gs_tail) <= 0)? 1 : $left_tms->[0] - $gs_tail;
776   - #my $right_pos = (($right_tms->[1] + $gs_tail) >= $qlength)? $right_tms->[1] : $right_tms->[1] + $gs_tail;
777   - my $right_pos = (($right_tms->[1] + $gs_tail) >= $qlength)? $qlength - 1 : $right_tms->[1] + $gs_tail;
778   -
779   -
780   - #Cut and name the bundles only if bundle file does not exist
781   - my $outfile = "${ls_pid}_bundle${idx}";
782   - unless (-f "$sequencesDir/${outfile}.faa") {
783   -
784   - #cutting bundle
785   - my $args = qq(-if $seqFile -od $sequencesDir -of $outfile -rangeCut -s $left_pos -e $right_pos -t 0);
786   - system "tmsplit $args > /dev/null";
787   -
788   - #replace protein ID with bundle number to the ID so alignments can be easily identified
789   - system qq(perl -i -pe 's/>\\S+/>BDL$idx/' $sequencesDir/${outfile}.faa);
790   - }
791   -
792   - $lhr_seqSegs->{$idx} = ["${outfile}.faa", $left_pos, $right_pos];
793   - }
  107 + $tmsFile = $value;
794 108 }
795 109  
796 110  
797   -
798   -
799 111 #==========================================================================
800   -#Read file with the TMS coordinates of the input proteins. The TMS
801   -#must have been validated with WHAT to make sure they are reliable.
  112 +#Option -d
802 113  
  114 +sub read_seqsDir {
  115 + my ($opt, $value) = @_;
803 116  
804   -sub read_tms_coordinates_file {
  117 + die "Error: directory with sequences does not exist." unless (-d $value);
805 118  
806   - my ($s_coordsFile, $hr_tms) = @_;
807   -
808   - open (my $fileh, "<", $s_coordsFile) || die $!;
809   -
810   - #-----------------------------------------------------------------
811   - #The format of this file is protein ID followed by pairs of
812   - #coordinates separated by dash:
813   - # 2.A.43.1.1-O60931 1-20 25-35 50-68 ....
814   - if ($infileFmt eq 'tms') {
815   -
816   - while(<$fileh>) {
817   - chomp;
818   -
819   - #ignore empty lines;
820   - next unless ($_);
821   -
822   - #extract id and TMSs coordinates
823   - my ($id, @tms_str) = split(/\s+/, $_);
824   - my @tms = map { [ split(/-/, $_) ] } @tms_str;
825   -
826   -
827   - #For debugging purposes
828   -# next unless ($id eq 'WP_100644534');
829   -
830   -
831   - $hr_tms->{$id} = \@tms;
832   -
833   - #Verify that the sequence is available for this protein
834   - unless (-f "$gs_seqDir/${id}.faa" && ! (-z "$gs_seqDir/${id}.faa")) {
835   - die "Could not find sequence for protein: $id in dir: $gs_seqDir -->";
836   - }
837   - } #while
838   - }
839   -
840   - #Input file is in HMMTOP format
841   - else {
842   - while(<$fileh>) {
843   - chomp;
844   -
845   - #Remove trailing spaces
846   - s/\s+$//;
847   -
848   - #ignore empty lines
849   - next unless ($_);
850   -
851   -
852   - #parse hmmtop line
853   - my ($id, $ntms, $tms_str) = (/\S+\s+\d+\s+(\S+).+(IN|OUT)\s+(\d+)\s+([\d\s-]+)/)? ($1, $3, $4) : ();
854   -
855   - #For debugging purposes
856   -# next unless ($id eq 'WP_100644534');
857   -
858   -
859   - if ($id && $ntms && $tms_str) {
860   -
861   - #extract the pairs of coordinates for TMS
862   - my @coords = split(/\s+/, $tms_str);
863   - my @tms = ();
864   - for (my $i=0; $i < $#coords; $i += 2) {
865   - push (@tms, [$coords[$i], $coords[$i+1]]);
866   - }
867   -
868   - $hr_tms->{$id} = \@tms;
869   -
870   - }
871   - else {
872   - print "problem parsing HMMTOP line: $_\n";;
873   - print Data::Dumper->Dump([$id, $ntms, $tms_str ], [qw(*id *ntms *tms_str )]);
874   - exit;;
875   - }
876   - }
877   - }
878   -
879   - close $fileh;
  119 + $seqsDir = $value;
880 120 }
881 121  
882 122  
883   -
884 123 #==========================================================================
885   -#Get the directory where the sequences of bundles will be saved.
  124 +#Option -o
886 125  
887   -sub getSequencesDir {
  126 +sub read_outdir {
  127 + my ($opt, $value) = @_;
888 128  
889   - my $protId = shift;
890   -
891   - my $dir = undef;
892   -
893   - if ($mode eq 'all') {
894   - $dir = "$outdir/$seqDir";
895   - }
896   - else {
897   - die "Error: protein accession missing" unless ($protId);
898   - $dir = "$outdir/$protId/$seqDir";
899   - }
900   -
901   - system "mkdir -p $dir" unless (-d $dir);
902   - die "No dir for bundle sequences found: $dir" unless (-d $dir);
903   -
904   - return $dir;
  129 + $outDir = $value;
905 130 }
906 131  
907 132  
908 133 #==========================================================================
909   -#Get the directory where the alignments will be saved
  134 +#option -h
910 135  
911   -sub getAlignmentsDir {
912 136  
913   - my $protId = shift;
914   -
915   - my $dir = undef;
916   -
917   - if ($mode eq 'all') {
918   - $dir = "$outdir/$alignDir";
919   - }
920   - else {
921   - die "Error: protein accession missing" unless ($protId);
922   - $dir = "$outdir/$protId/$alignDir";
923   - }
924   -
925   - system "mkdir -p $dir" unless (-d $dir);
926   - die "No dir for alignments found: $dir" unless (-d $dir);
927   -
928   - return $dir;
929   -}
930   -
931   -
932   -#==========================================================================
933   -#Get the directory where hydropathy plots will be saved
934   -
935   -sub getPlotsDir {
936   -
937   - my $protId = shift;
938   -
939   - my $dir = undef;
940   -
941   - if ($mode eq 'all') {
942   - $dir = "$outdir/$plotsDir";
943   - }
944   - else {
945   - die "Error: protein accession missing" unless ($protId);
946   - $dir = "$outdir/$protId/$plotsDir";
947   - }
948   -
949   - system "mkdir -p $dir" unless (-d $dir);
950   - die "No dir for plots found: $dir" unless (-d $dir);
951   -
952   - return $dir;
953   -}
954   -
955   -
956   -#==========================================================================
957   -#Get the directory where the reports will be saved
958   -
959   -sub getReportsDir {
960   -
961   - my $protId = shift;
962   -
963   - my $dir = undef;
964   -
965   - if ($mode eq 'all') {
966   - $dir = "$outdir/$repDir";
967   - }
968   - else {
969   - die "Error: protein accession missing" unless ($protId);
970   - $dir = "$outdir/$protId/$repDir";
971   - }
972   -
973   - system "mkdir -p $dir" unless (-d $dir);
974   - die "No dir for reports found: $dir" unless (-d $dir);
975   -
976   - return $dir;
977   -}
978   -
979   -
980   -
981   -
982   -
983   -#==========================================================================
984   -#Read command-line arguments
985   -
986   -sub read_command_line_arguments {
987   -
988   - #if no arguments are given print the help
989   - if (! @ARGV) {
990   - print_help();
991   - }
992   -
993   - #----------------------------------------------------------------------
994   - #Parse command line arguments
995   -
996   - my $ls_status = GetOptions(
997   - "i|infile=s" => \$gs_infile,
998   - "if|infile-format=s" => \$infileFmt,
999   - "o|outdir=s" => \$outdir,
1000   - "f|id-format=s" => \$gs_idFormat,
1001   - "r|rep-unit=i" => \$gs_repUnit,
1002   - "t|tail-size=i" => \$gs_tail,
1003   - "s|seqs=s" => \$gs_seqDir,
1004   - "e|evalue=f" => \$gs_evalue,
1005   - "c|coverage=f" => \$gs_coverage,
1006   - "id|identity=f" => \$gs_identity,
1007   - "ncs|no-comp-stats!" => \$compStatsFlag,
1008   - "gs|gsat-shuffles=i" => \$gsatShuffles,
1009   - "z|gsat-cutoff=f" => \$min_gsat_score,
1010   - "m|mode=s" => \$mode,
1011   - "h|help" => sub { print_help(); },
1012   -
1013   - #For arguments that do not look like valid options
1014   - "<>" => sub { die "Error: Unknown argument: $_[0]\n"; }
1015   - );
1016   - die "\n" unless ($ls_status);
1017   -
1018   - #----------------------------------------------------------------------
1019   - #Validate command line arguments
1020   -
1021   - die "Error: argument -i is mandatory.\n" unless ($gs_infile);
1022   - die "Error: argument -r is mandatory and must be greater than 0.0\n" unless ($gs_repUnit > 0);
1023   - die "Error: augument -t must be grater than 0 and less than 16\n" if ($gs_tail > 15 || $gs_tail < 0);
1024   - die "Error: argument -e must be greater than 0\n" unless ($gs_evalue >=0 );
1025   - die "Error: argument -c must be between 0.5 and 1.0\n" unless ($gs_coverage >= 0.0 && $gs_coverage <= 1.0);
1026   - die "Error: argument -id must be between 0.25 and 1.0\n" unless ($gs_identity >= 0.0 && $gs_identity <= 1.0);
1027   -
1028   - #Option -f
1029   - $gs_idFormat = lc $gs_idFormat;
1030   - unless ($gs_idFormat =~ /^(tc|tca|o)$/) {
1031   - die "Error: There are 3 Valid options for -f (tc, tca, o)\n";
1032   - }
1033   -
1034   -
1035   - #option -if
1036   - $infileFmt = lc $infileFmt;
1037   - unless ($infileFmt =~ /^(hmmtop|tms)$/) {
1038   - die "Error: invalid input file format: '$infileFmt' (Valid options: hmmtop, tms).\n";
1039   - }
1040   -
1041   -
1042   - #option -m
1043   - $mode = lc $mode;
1044   - unless ($mode =~ /^(all|each|debug)$/) {
1045   - die "Error: invalid mode of operation '$mode'. Valid options are: all, each!\n";
1046   - }
1047   -
1048   -
1049   - #Option -s
1050   - unless (-d $gs_seqDir) {
1051   - die "Error: Directory with sequences must exits -> $gs_seqDir\n";
1052   - }
1053   -
1054   -
1055   - #Validate GSAT cutoff
1056   - unless ($min_gsat_score >= 0) {
1057   - die "Use GSAT cutoff >= 3.0!\n";
1058   - }
1059   -
1060   -
1061   - #option -ncs
1062   - $compStats = ($compStatsFlag)? "" : "-k 1000 -z 11";
1063   -}
1064   -
1065   -
1066   -
1067 137 sub print_help {
1068 138  
1069   - my $help = <<'HELP';
  139 + my $help = <<'HELP';
1070 140  
1071   -This script searches for regions of TMSs repeated in a full protein.
  141 +This program searches for repeats between different user-specified
  142 +regions of proteins.
1072 143  
1073   --i, --infile {path}
1074   - Input file with id/accession(s) of the protein(s) to analyze and the coordinates
1075   - of the TMSs in that protein(s). Use option -if to specify the format of this
1076   - file.
1077   - (Argument is mandatory).
  144 + Command line options:
1078 145  
1079   --if, --infile-format {string} (optional)
1080   - Format of the TMS coordenates. It can be either 'tms' or 'hmmtop'.
1081   - (Default: hmmtop)
  146 + -s, --seqs-file {file} (mandatory)
  147 + Path to file in fasta format with all the input sequences.
  148 + This option is incompatible with option -d. But one of the
  149 + two options must be given.
1082 150  
1083   --o, --outdir {path}
1084   - Output directory where results will be saved.
1085   - (Default: repeats)
  151 + -d, --seqs-dir {path} (optional)
  152 + Path to directory where the input sequences are located.
  153 + This option is incompatible with option -s. But one of the
  154 + two options must be given.
1086 155  
1087   --s, --seqs {path}
1088   - Directory to access the sequences in FASTA format that will be used to
1089   - search for repeats. One file per sequence, and the name of the file is
1090   - the accession of the protein followed by '.faa'
1091   - (Argument is mandatory)
  156 + -o, --outdir {path} (optional)
  157 + Path to the output directory.
  158 + (Default: ./tmsRepeat)
1092 159  
1093   --f, --id-format {string}
1094   - Acceptable formats for identifiers:
1095   - tc plain tcdb identifier of a system (e.g., 2.A.1.8.1)
1096   - tca tcdb id and accession separated by dash (e.g. 2.A.1.8.3-Q9R6U5)
1097   - o other, it can be refSeq, uniprot or custom, but it is requried
1098   - that is is a single string without spaces.
1099   - (Argument is mandatory)
  160 + -t, --tms {file} (optional)
  161 + File with the output of hmmtop for the input sequences, if available.
  162 + (Default: run hmmtop on input sequences)
1100 163  
1101   --r, --repeat-unit {int)
1102   - Size in TMS of the repeat unit to search in the protein.
1103   - (Argument is mandatory)
  164 + -e, --evalue {float} (optional)
  165 + Maximal evalue cutoff for the aligned segments.
  166 + (Default: 0.001)
1104 167  
1105   --t, --tail-size {int}
1106   - Number of residues to add to the beginning and end of TMS regions before
1107   - running comparisons. Value should be less than or equal to 15 residues.
1108   - (Default: 5);
  168 + -i, --identity {float} (optional)
  169 + Minimal identity in aligned regions.
  170 + (Default: 0.2)
1109 171  
1110   --e, --evalue {float}
1111   - Maximum evalue to consider an alignment between two TMS bundles significant.
1112   - (Default: 0.1);
  172 + -c, --coverage {float} (optional)
  173 + Minimal coverage cutoff within the range: [0, 1] for the coverage of aligned regions.
  174 + (Default: 0.85)
1113 175  
1114   --ncs, --no-comp-stats {FLAG}
1115   - If present, this flag indicates that E-values will not be corrected using
1116   - compositional statistics.
1117   - (Default: apply correction).
  176 + -h, --help
  177 + Display this help. Also displayed if script is run without arguments.
1118 178  
1119   --c, --coverage {float}
1120   - Minimum alignment coverage of the smallest bundle to consider an alignment
1121   - signifiant.
1122   - (Default: 0.8)
1123   -
1124   --id, --identity {float}
1125   - Minimum identity, expressed as a float in the 0-1 range, to consider an
1126   - alignment signficant.
1127   - (Defatul: 0.25);
1128   -
1129   --gs, --gsat-shuffles {int}
1130   - Number of shuffles that will be used to run GSAT on good matches.
1131   - (Default: 1000);
1132   -
1133   --z, --gsat-cutoff {int}
1134   - Minimum GSAT score cutoff to select good hits.
1135   - (Default: 4.0)
1136   -
1137   --h, --help
1138   - Print this help message. It takes precedence to any other option.
1139   -
1140 179 HELP
1141 180  
1142   - print $help;
1143   - exit;
1144   -
  181 + print $help;
  182 + exit;
1145 183 }