UCSD_WLU / repeatFinder

Commit 68b41f16a35757f498478ba2822e6978013608f2

Authored by Luis Arturo Medrano-Soto 2022-12-05 17:46:17 -0800

Exists in master

update tmsRepeat.pl, alignSeqsFiles.pl and locateFragment.pl to work with python3 scripts

Showing 3 changed files with 128 additions and 1080 deletions Side-by-side Diff

alignSeqsFiles.pl
locateFragment.pl
tmsRepeat.pl

...	...	@@ -58,6 +58,8 @@
58	58	my $segFilter = 'no';
59	59	my $minLength = 30; #Min length of proteins to analyze (without gaps)
60	60	my $subMatrix = 'BL50';
	61	+my $hyd_qylim = undef; #Y-axis limits for query hydropathy plot [low, high]
	62	+my $hyd_sylim = undef; #Y-axis limits for subject hydropathy plot [low, high]
61	63
62	64	#this can be used to remove long sequences from results
63	65	my $maxProtLength = 100000; #default threshold to allow any length
...	...	@@ -443,6 +445,9 @@
443	445	#Run quod on the query, subject and the alignment.
444	446
445	447
	448	+#quod.py -q -l "HEB99829" -o plot.png --width 15 --edgecolor red --xticks 25 --no-tms +0 --add-tms 9-32 43-67 98-121 132-151 164-181 192-215 224-241:orange -w 17-245:+2.7:+:Alignment --region-font 12 --add-region 20-245:'PF07556':-2.8,-2.6:red,black:tc --mark +0:K,R,H:black --xlim 0 400 -- HEB99829.faa
	449	+
	450	+
446	451	sub run_quod {
447	452
448	453	my ($q, $s, $qs, $qe, $ss, $se, $qseq, $sseq) = @_;
...	...	@@ -469,8 +474,9 @@
469	474
470	475	#Note alnquod requires to add the extension to the image name
471	476	my $alnFig = "$plotsDir/${q}_vs_${s}_qs${qs}_qe${qe}_ss${ss}_se${se}.png";
472		- my $cmd1 = qq(alnquod.py --grid -q -l "$q (red) and $s (blue)" -o $alnFig --xticks 25 --width 15 -- $qalnFile $seqDir/${q}.faa $salnFile $seqDir/${s}.faa);
473		- #print "$cmd1\n\n";
	477	+ my $cmd1 = qq(quod.py -q -l "$q (red) and $s (blue)" -o $alnFig --xticks 25 --width 15 --edgecolor +0:red +1:blue --facecolor +0:orange +1:cyan --multi frag -- $qalnFile $seqDir/${q}.faa $salnFile $seqDir/${s}.faa);
	478	+# print "$cmd1\n\n";
	479	+# exit;
474	480	system $cmd1 unless (-f "${alnFig}");
475	481	return undef unless (-f "${alnFig}");
476	482
477	483
...	...	@@ -483,17 +489,18 @@
483	489	die "Error: no hmmtop results for: $q" unless (exists $hmmtopHits{$q});
484	490	my $qTMS = "";
485	491	if (scalar @{ $hmmtopHits{$q}{coords} } > 0) {
486		- $qTMS = "-at " . join(",", @{ $hmmtopHits{$q}{coords} }) . ":orange";
	492	+ $qTMS = "--add-tms " . join(",", @{ $hmmtopHits{$q}{coords} }) . ":orange";
487	493	}
488	494
489	495
490	496	#Plot query hydropathy
491	497	my $qPfam = get_pfam_coords_for_quod($q, "red");
492		- my $qName = "$plotsDir/${q}_vs_${s}_qaln_qs${qs}_qe${qe}";
493		- my $cmd2 = qq(quod.py --grid -q -l "$q" -o $qName --width 15 --color red --xticks 25 -w ${qs}-${qe}::1 -t png -nt +0 $qTMS $qPfam -- $seqDir/${q}.faa);
494		- #print "$cmd2\n\n";
495		- system $cmd2 unless (-f "${qName}.png");
496		- return undef unless (-f "${qName}.png");
	498	+ my $qName = "$plotsDir/${q}_vs_${s}_qaln_qs${qs}_qe${qe}.png";
	499	+ my $cmd2 = qq(quod.py -q -l "$q" -o $qName --width 15 --edgecolor red --xticks 25 -w ${qs}-${qe}:+2.7:+:Alignment --no-tms +0 $qTMS $qPfam -- $seqDir/${q}.faa);
	500	+# print "$cmd2\n\n";
	501	+# exit;
	502	+ system $cmd2 unless (-f $qName);
	503	+ return undef unless (-f $qName);
497	504
498	505
499	506
500	507
...	...	@@ -501,16 +508,17 @@
501	508	die "Error: no hmmtop results for: $s" unless (exists $hmmtopHits{$s});
502	509	my $sTMS = "";
503	510	if (scalar @{ $hmmtopHits{$s}{coords} } > 0) {
504		- $sTMS = "-at " . join(",", @{ $hmmtopHits{$s}{coords} }) . ":cyan";
	511	+ $sTMS = "--add-tms " . join(",", @{ $hmmtopHits{$s}{coords} }) . ":cyan";
505	512	}
506	513
507	514	#Plot Subject hydropathy
508	515	my $sPfam = get_pfam_coords_for_quod($s, "blue");
509		- my $sName = "$plotsDir/${q}_vs_${s}_saln_ss${ss}_se${se}";
510		- my $cmd3 = qq(quod.py --grid -q -l "$s" -o $sName --width 15 --color blue --xticks 25 -w ${ss}-${se}::1 -t png -nt +0 $sTMS $sPfam -- $seqDir/${s}.faa);
511		- #print "$cmd3\n\n";
512		- system $cmd3 unless (-f "${sName}.png");
513		- return undef unless (-f "${sName}.png");
	516	+ my $sName = "$plotsDir/${q}_vs_${s}_saln_ss${ss}_se${se}.png";
	517	+ my $cmd3 = qq(quod.py -q -l "$s" -o $sName --width 15 --edgecolor blue --xticks 25 -w ${ss}-${se}:+2.7:+:Alignment --no-tms +0 $sTMS $sPfam -- $seqDir/${s}.faa);
	518	+# print "$cmd3\n\n";
	519	+# exit;
	520	+ system $cmd3 unless (-f $sName);
	521	+ return undef unless (-f $sName);
514	522
515	523
516	524	return 1;
...	...	@@ -535,7 +543,7 @@
535	543	if (exists $pfamHits{$prot}) {
536	544	my @Doms = keys %{ $pfamHits{$prot} };
537	545	my $dcnt = 0;
538		- $str = "--region-font 12 -ar ";
	546	+ $str = "--region-font 12 --add-region ";
539	547	foreach my $d (@Doms) {
540	548
541	549	my @hits = @{ $pfamHits{$prot}{$d} };
...	...	@@ -543,8 +551,10 @@
543	551	my $left = $hit->{qstart};
544	552	my $right = $hit->{qend};
545	553
546		- my $ypos = -2.8 + $dcnt * 0.4;
547		- $str .= "${left}-${right}:'${d}':${ypos}:$color ";
	554	+ my $yposl = -2.8 + $dcnt * 0.4; #domain bottom coord
	555	+ my $yposh = $yposl + 0.15; #domain height coord
	556	+
	557	+ $str .= "${left}-${right}:'${d}':${yposl},${yposh}:$color,black:tc ";
548	558	$dcnt++;
549	559	}
550	560	}

...	...	@@ -142,7 +142,7 @@
142	142	#Generate quod plot
143	143
144	144	#Format string for the regions
145		- my $regions = "-at ";
	145	+ my $regions = "--add-tms ";
146	146	my $coords = "";
147	147	foreach my $hit (@res) {
148	148

Diff suppressed. Click to show

1		-#!/usr/bin/env perl -w
	1	+#!/usr/bin/env perl
2	2
3		-use warnings;
	3	+no warnings;
4	4	use strict;
5	5	use Data::Dumper;
6	6
7		-$Data::Dumper::Deepcopy = 1;
8		-$Data::Dumper::Indent = 1;
9		-#$Data::Dumper::Purity = 0;
10		-$Data::Dumper::Sortkeys = 1;
11		-
	7	+use TCDB::Repeats;
12	8	use Getopt::Long;
13		-use Bio::SearchIO;
14		-use Bio::SeqIO;
15	9
16	10
17		-use TCDB::CheckDependencies;
18		-use TCDB::Assorted;
	11	+my $seqsDir = undef; #'/Users/amedrano/Desktop/Mai_tmsRepeat/sequences';
	12	+my $seqsFile = undef;
	13	+my $tmsFile = undef; #'/Users/amedrano/Desktop/Mai_tmsRepeat/tms.hmmtop';
	14	+my $outDir = "Repeats"; #'/Users/amedrano/Desktop/Mai_tmsRepeat/RepeatUnits/ResultsOOP';
19	15
	16	+my $evalue = 1e-2;
	17	+my $coverage = 0.85;
	18	+my $identity = 0.2;
20	19
	20	+my @tmsRanges = ();
21	21
22		-#==========================================================================
23		-#Check dependencies
	22	+read_command_line();
24	23
25		-my @dependencies = ("water", "ssearch36", "extractFamily.pl", "tmsplit", "quod.py");
26		-my $CheckDep_obj = new TCDB::CheckDependencies();
27		-$CheckDep_obj -> dependencies_list(\@dependencies);
28		-$CheckDep_obj -> checkDependencies;
29	24
30		-
31		-
32		-#==========================================================================
33		-#Read command line options
34		-
35		-my $gs_infile = "";
36		-my $infileFmt = "hmmtop"; #The other option is 'tms' which is the ID and TMS
37		-my $gs_idFormat = "";
38		-my $gs_repUnit = 0;
39		-my $gs_seqDir = "";
40		-my $gs_tail = 5;
41		-my $gs_evalue = 0.1;
42		-my $gs_coverage = 0.8;
43		-my $gs_identity = 0.25;
44		-my $gsatShuffles = 1000;
45		-my $min_gsat_score = 4.0;
46		-
47		-my $compStatsFlag = 1;
48		-my $compStats = "";
49		-my $outdir = "repeats";
50		-my $repDir = "reports";
51		-my $seqDir = "sequences";
52		-my $alignDir = "alignments";
53		-my $plotsDir = "plots";
54		-my $goodHitsOnly = 1; #print only significant results, ignore everything else
55		-
56		-
57		-#all (all sequences in output file)
58		-#each (generate one directory per sequence.. for better organization)
59		-#debug (it will print the contents of the hash table one sequences at a time)
60		-my $mode = "all";
61		-
62		-read_command_line_arguments();
63		-
64		-#print Data::Dumper->Dump([$gs_infile, $gs_idFormat, $gs_repUnit, $gs_seqDir,
65		-# $gs_tail, $gs_evalue, $gs_coverage, $gs_identity, $gsatShuffles, $compStatsFlag, $compStats],
66		-# [qw(infile idFormat repUnit seqDir tail evalue
67		-# coverage identity $gsatShuffles compStatFlag compStats)]);
	25	+#print Data::Dumper->Dump([$seqsDir, $seqsFile, $tmsFile, $outDir],
	26	+# [qw(seqsDir seqsFile tmsFile outDir )]);
68	27	#exit;
69	28
70		-# ssearch36 -p -k 1000 -z 11 -E 1.0 -s BL62 -W 0 4.B.1_4tms_all/sequences/4.B.1.1.2-Q4QLL1_bundle1.faa 4.B.1_4tms_all/sequences/lib_4.B.1.1.2-Q4QLL1_bundle1.faa
71	29
	30	+#my $repObj = TCDB::Repeat->new('seqsDir' => $seqsDir,
	31	+# 'tmsFile' => $tmsFile,
	32	+# 'outDir' => $outDir,
	33	+# 'ranges2searchTMS' => \@TMSranges);
72	34
73		-#==========================================================================
74		-#Read file with coordinates of TMSs and verify that the sequences are
75		-#available
76	35
77		-my %gh_tms = ();
	36	+my @TMSranges = ([1, 3], [4, 6]);
78	37
79		-read_tms_coordinates_file($gs_infile, \%gh_tms);
	38	+my $repObj = TCDB::Repeat->new();
80	39
81		-#print Data::Dumper->Dump([ \%gh_tms], [qw(*tms )]);
82		-#exit;
	40	+#$repObj->tmsFile($tmsFile);
	41	+#$repObj->seqsDir($seqsDir);
	42	+$repObj->seqsFile($seqsFile);
	43	+$repObj->outDir($outDir);
	44	+$repObj->evalueCutoff($evalue);
	45	+$repObj->identityCutoff($identity);
	46	+$repObj->coverageCutoff($coverage);
	47	+$repObj->TMSranges2search(\@TMSranges);
83	48
	49	+$repObj-> findRepeatsTMSranges();
84	50
85		-#===========================================================================
86		-#Main Output directory
	51	+#print Data::Dumper->Dump([$repObj ], [qw(*repObj)]);
87	52
88		-#Root directory for all results
89		-system "mkdir -p $outdir" unless (-d $outdir);
90		-die "Could not generate output directory: $outdir" unless (-d $outdir);
91	53
92	54
93	55
94		-#==========================================================================
95		-#Search for repeats inside query sequences
96	56
97		-my %results = ();
98		-my %origSeqLength = (); #To calculate x-ticks spacing in hydropathy plots
99		-
100		-foreach my $ls_sid (keys %gh_tms) {
101		-
102		- my %gh_bundleSeqs = ();
103		- my %gh_topHits = ();
104		-
105		-
106		- print "Processing: $ls_sid\n";
107		-
108		-
109		- #Clean results if one output directory is generated per input sequence
110		- %results = () if ($mode eq 'each');
111		-
112		-
113		- #Cut sequences in non overlaping regions with as many TMS as the
114		- #repeat unit we want to find.
115		- cut_seq_in_tms_regions ($ls_sid, $gs_repUnit, \%gh_tms, \%gh_bundleSeqs);
116		-
117		-
118		-# print Data::Dumper->Dump([\%gh_bundleSeqs ], [qw(*bundleSeqs)]);
119		-# <STDIN>;
120		-
121		-
122		- #run ssearch to find potential repeats.
123		- align_bundles($ls_sid,\%gh_bundleSeqs, \%gh_topHits);
124		-
125		-
126		-# print Data::Dumper->Dump([\%gh_topHits ], [qw(*topHits )]);
127		-# <STDIN>;
128		-
129		-
130		- #Collect results for final table
131		- $results{$ls_sid} = \%gh_topHits;
132		-
133		- #present results per input sequence to verify everything looks fine.
134		- if ($mode eq 'debug') {
135		- print Data::Dumper->Dump([\%gh_topHits], [qw(*topHits)]);
136		- <STDIN>;
137		- }
138		-
139		- print_reports(\%results) if ($mode eq 'each');
140		-}
141		-
142		-
143		-
144		-
145		-
146	57	#===========================================================================
147		-#Print final results in summarized or detailed format
	58	+#Read command line and print help
148	59
149		-#print Data::Dumper->Dump([\%results ], [qw(*results )]);
150		-#<STDIN>;
151	60
152		-print_reports(\%results) if ($mode eq 'all');
	61	+sub read_command_line {
153	62
	63	+ print_help() unless (@ARGV);
154	64
	65	+ my $status = GetOptions(
	66	+ "s\|seqs-file=s" => \&read_seqsFile,
	67	+ "d\|seqs-dir=s" => \&read_seqsDir,
	68	+ "o\|outdir=s" => \&read_outdir,
	69	+ "t\|tms=s" => \&read_tmsFile,
	70	+ "e\|evalue=f" => \$evalue,
	71	+ "i\|identity=f" => \$identity,
	72	+ "c\|coverage=f" => \$coverage,
	73	+ "h\|help" => sub { print_help(); },
	74	+ "<>" => sub { die "Error: Unknown argument: $_[0]\n"; });
	75	+ exit unless ($status);
155	76
156	77
157		-
158		-###########################################################################
159		-# #
160		-# Subroutine definition #
161		-# #
162		-###########################################################################
163		-
164		-
165		-#print final_report
166		-
167		-sub print_reports {
168		-
169		- my $res = shift;
170		-
171		-
172		- #Get the directory where reports will be saved
173		- my $reportDir = undef;
174		- if ($mode eq 'all') {
175		- $reportDir = getReportsDir();
176		- }
177		- else {
178		-
179		- #one id per report
180		- my @ids = keys %$res;
181		- my $seqId = $ids[0];
182		-
183		- $reportDir = getReportsDir($seqId);
184		- }
185		- die "Error: invalid report dir" unless ($reportDir);
186		-
187		-
188		- my $sumFile = "$reportDir/repeats_summary_report.txt";
189		- my $detailsFile = "$reportDir/repeats_detailed_report.txt";
190		- my $htmlFile = "$reportDir/report.html";
191		-
192		-
193		- open (my $htmlfh, ">", $htmlFile) \|\| die $!;
194		-
195		- my $htmlHeader = <<HEADER;
196		-<!DOCTYPE html>
197		-<html xmlns="http://www.w3.org/1999/xhtml">
198		- <head>
199		- <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
200		-
201		- <style type="text/css">
202		-
203		-.label {
204		- text-align: right;
205		- width: 50px;
	78	+ #Validate input file option
	79	+ die "Error: no sequence file detected!" unless ($seqsFile);
206	80	}
207	81
208		-.data {
209		- text-align: left;
210		- padding-left: 8px;
211		- width: 100px;
212		-}
213	82
214		-.uline {
215		- text-decoration: underline;
216		-}
217		-
218		-.seq {
219		- border: 2px solid black;
220		- height: 70px;
221		- width: 100%;
222		- overflow-x: auto;
223		- overflow-y: hidden;
224		- margin: 1em 0;
225		- background: gray;
226		- color: white;
227		-}
228		-
229		-img {
230		- display: block;
231		- margin-left: auto;
232		- margin-right: auto;
233		- height: 250px;
234		- width: auto;
235		- max-width: 1500px;
236		- max-height: 300px;
237		-}
238		-
239		- </style>
240		- <title>Inferring repeats of $gs_repUnit TMS</title>
241		- </head>
242		- <br />
243		- <h1 style='text-align:center'>Inferred Repeats Based On ${gs_repUnit}-TMS Bundles</h1>
244		- <body>
245		-
246		-HEADER
247		-
248		- print $htmlfh $htmlHeader;
249		- open (my $sumh, ">", $sumFile) \|\| die $!;
250		- open (my $deth, ">", $detailsFile) \|\| die $!;
251		-
252		-
253		- #Header for summary table
254		- print $sumh "#Accession\tQ_bundle\tS_bundle\tQ_len\tS_len\tE-value\tIdentity\tGSAT\tAln_len\tQ_cov\tS_cov\n";
255		-
256		-
257		-# print Data::Dumper->Dump([$res ], [qw(*res )]);
258		-# <STDIN>;
259		-
260		-
261		- P:foreach my $id (sort {$a cmp $b} keys %$res) {
262		-
263		- #Jump to next result if there are NO hits for this protein and
264		- #ONLY good hits are going to be recorded.
265		- unless (%{ $res->{$id} }) {
266		- next P if ($goodHitsOnly);
267		- }
268		-
269		-
270		- print $deth "===========================================================================\n";
271		- print $htmlfh " <br /><hr style=\"border-style:solid; border-width:5px; color:black;\"/>\n";
272		-
273		- #There must be results to continue
274		- unless (%{ $res->{$id} }) {
275		- print $sumh "$id\tNo_hits\n";
276		- print $deth "$id\tNo_hits\n\n\n";
277		- print $htmlfh " <h2 style=\"text-align:center;\">$id</h2>\n <p><b>No candidate repeats found</b></p>\n";
278		- }
279		-
280		- print $deth "$id\n\n";
281		- print $htmlfh " <h2 style=\"text-align:center;\">$id</h2>\n";
282		-
283		-
284		-
285		-
286		- #get the long bundle names
287		- BS:foreach my $bundleName (sort {$a cmp $b} keys %{ $res->{$id} }) {
288		-
289		- BN:foreach my $bundleNumber (sort {$a <=> $b} keys %{ $res->{$id}->{$bundleName} }) {
290		-
291		- my $qName = $res->{$id}->{$bundleName}->{$bundleNumber}->{qName};
292		- my $qLen = $res->{$id}->{$bundleName}->{$bundleNumber}->{qLen};
293		-
294		- #Each of the hits for this bundle
295		- my @hits_tmp = @{ $res->{$id}->{$bundleName}->{$bundleNumber}->{hits} };
296		-
297		- #To get rid of a warning when there is only one hit.
298		- my @hits = (scalar (@hits_tmp) > 1)?
299		- sort {$a->{hName} cmp $b->{hName}} @hits_tmp : @hits_tmp;
300		-
301		- foreach my $hit (@hits) {
302		-
303		- my $hName = $hit->{hName};
304		- my $hLen = $hit->{hLen};
305		-
306		- my $evalue = sprintf ("%.1e", $hit->{hEvalue});
307		- my $ident = sprintf ("%.1f", $hit->{hId} * 100);
308		- my $sim = sprintf ("%.1f", $hit->{hSim} * 100);
309		- my $gsat = sprintf ("%.1f", $hit->{gsat});
310		-
311		- my $alnLen = $hit->{alnLen};
312		- my $qCov = sprintf("%.1f", $hit->{qCov} * 100);
313		- my $hCov = sprintf("%.1f", $hit->{hCov} * 100);
314		-
315		-
316		- #The alignment
317		- my $qstart = $hit->{qstart};
318		- my $qend = $hit->{qend};
319		- my $sstart = $hit->{sstart};
320		- my $send = $hit->{send};
321		- my $qSeq = $hit->{qSeq};
322		- my $homStr = $hit->{homStr};
323		- my $sSeq = $hit->{sSeq};
324		-
325		- my $plot = $hit->{plot};
326		-
327		- #For summary tab-delimitedfile (everything except the alignment)
328		- print $sumh "$id\t$qName\t$hName\t$qLen\t$hLen\t$evalue\t$ident\t$gsat\t$alnLen\t$qCov\t$hCov\n";
329		-
330		-
331		- #Detailed report that includes the alignment
332		- print $deth "----------\n";
333		- print $deth "$qName ($qLen) vs $hName ($hLen)\n\n";
334		- print $deth "E-value: $evalue Identity: ${ident}% GSAT: $gsat\n";
335		- print $deth "Q_cov: ${qCov}% S_cov: ${hCov}% Aln_length: $alnLen\n\n";
336		- print $deth "Alignment ($qName\|${qstart}-$qend vs $hName\|${sstart}-$send):\n$qSeq\n$homStr\n$sSeq\n\n\n";
337		-
338		-
339		- #The HTML report (includes alignment and hydropathy image
340		- my $repHit = <<HIT;
341		-
342		- <p><b>$qName ($qLen) vs $hName ($hLen)</b></p>
343		-
344		- <table width="600px" border="0" cellspacing="0" cellpadding="2">
345		- <tr>
346		- <td class='label'><b>E-value:</b></td>
347		- <td class='data'>$evalue</td>
348		- <td class='label'><b>Identity:</b></td>
349		- <td class='data'>${ident}%</td>
350		- <td class='label'><b>Similarity:</b></td>
351		- <td class='data'>${sim}%</td>
352		- <td class='label'><b>GSAT:</b></td>
353		- <td class='data'>$gsat</td>
354		- </tr>
355		- <tr>
356		- <td class='label'><b>Aln:</b></td>
357		- <td class='data'>$alnLen</td>
358		- <td class='label'><b>Q_cov:</b></td>
359		- <td class='data'>${qCov}%</td>
360		- <td class='label'><b>S_cov:</b></td>
361		- <td class='data'>${hCov}%</td>
362		- <td class='label'></td>
363		- <td class='data'></td>
364		- </tr>
365		- </table>
366		-
367		- <p><b>Alignment (</b>$qName:<b class="uline">${qstart}-$qend</b> vs $hName:<b class="uline">${sstart}-$send</b><b>):</b></p>
368		- <div class='seq'>
369		- <pre>
370		-$qSeq
371		-$homStr
372		-$sSeq
373		- </pre>
374		- </div>
375		- <a href="$plot" target="_blank"><img src="$plot"/></a>
376		- <br />
377		- <hr />
378		-
379		-HIT
380		-
381		- print $htmlfh $repHit;
382		-
383		- } #hit
384		- } #reference bundle number
385		- } #Reference bundle name
386		- } #Query protein
387		-
388		- #Close HTML report
389		- my $closeRep = <<CLOSE;
390		- </body>
391		-</html>
392		-CLOSE
393		-
394		- print $htmlfh $closeRep;
395		-
396		- close $sumh;
397		- close $deth;
398		- close $htmlfh;
399		-}
400		-
401		-
402		-
403	83	#==========================================================================
404		-#Run ssearch36 between the different bundles in a sequence
	84	+#Option -s
405	85
406		-sub align_bundles {
	86	+sub read_seqsFile {
	87	+ my ($opt, $value) = @_;
407	88
408		- my ($seqId, $lhr_bundleSeqFiles, $lhr_topHits) = @_;
409		-
410		- %$lhr_topHits = ();
411		-
412		- #Directory where the sequences of TMS bundles are saved
413		- my $sequencesDir = undef;
414		- my $alignmentsDir = undef;
415		- my $hydroPlotsDir = undef;
416		-
417		- if ($mode eq 'all') {
418		- $sequencesDir = getSequencesDir();
419		- $alignmentsDir = getAlignmentsDir();
420		- $hydroPlotsDir = getPlotsDir();
	89	+ unless (-f $value && !(-z $value)) {
	90	+ die "Error: file with sequences does not exist or is empty!\n";
421	91	}
422		- else {
423		- $sequencesDir = getSequencesDir($seqId);
424		- $alignmentsDir = getAlignmentsDir($seqId);
425		- $hydroPlotsDir = getPlotsDir($seqId);
426		- }
427		- die "Error: invalid sequences dir" unless ($sequencesDir);
428		- die "Error: invalid alignments dir" unless ($alignmentsDir);
429		- die "Error: invalid plots dir" unless ($hydroPlotsDir);
430	92
431		-
432		-# print Data::Dumper->Dump([$lhr_bundleSeqFiles ], [qw(*files )]);
433		-# <STDIN>;
434		-
435		-
436		- #The bundle that will be used as reference for the comparison
437		- REF:foreach my $bundle (sort {$a <=> $b} keys %$lhr_bundleSeqFiles) {
438		-
439		- my $rFile = "$sequencesDir/" . $lhr_bundleSeqFiles->{$bundle}->[0];
440		-
441		-
442		- #Id to name ssearch36 output files
443		- my $id = $lhr_bundleSeqFiles->{$bundle}->[0];
444		- $id =~ s/\.faa//;
445		-
446		-
447		- #For naming GSAT files (ID of system or protein accession)
448		- my $tcAcc = ($id =~ /(\S+)_bundle.*/)? $1 : undef;
449		- die "Could not extract accession from $id!" unless ($id);
450		-
451		-
452		-# print Data::Dumper->Dump([$id, $tcAcc ], [qw(id tcAcc)]);
453		-# <STDIN>;
454		-
455		-
456		- #--------------------------------------------------------------------
457		- #Get the non-overlapping bundles to compare them against the
458		- #reference bundle
459		-
460		- my @cmpFiles = ();
461		-
462		- #Initialize the index to the first non-overlapping bundle
463		- my $next_bundle_idx = $bundle + $gs_repUnit;
464		-
465		- CMP:while (1) {
466		-
467		- #Exit if next bundle is not in bundles hash
468		- last CMP unless (exists $lhr_bundleSeqFiles->{$next_bundle_idx});
469		-
470		- #Get file name for this non-overlapping bundle
471		- my $cmpBundle = $sequencesDir . "/" . $lhr_bundleSeqFiles->{$next_bundle_idx}->[0];
472		- push (@cmpFiles, $cmpBundle);
473		-
474		- #Update the index to the next non-overlapping bundle
475		- $next_bundle_idx = $next_bundle_idx + $gs_repUnit;
476		- }
477		-
478		- #go to next reference bundle if there are no non-overlapping bundles.
479		- next REF unless (@cmpFiles);
480		-
481		-
482		-# print Data::Dumper->Dump([\@cmpFiles ], [qw(*cmpFiles )]);
483		-# <STDIN>;
484		-
485		-
486		- #--------------------------------------------------------------------
487		- #Now run ssearch36 of the reference bundle against all its
488		- #non-overlapping bundles
489		-
490		- #put all non-overlapping bundles into a file
491		- my $libFile = "$sequencesDir/lib_$id.faa";
492		- my $cmd = "cat " . join(" ", @cmpFiles) . " > $libFile";
493		- system $cmd;
494		-
495		-
496		- #run ssearch36 of $rFile vs @cmpFile
497		- my $ssearchOut = "$alignmentsDir/ssearch_$id.out";
498		- my $ssearch_params = qq(-p $compStats -E $gs_evalue -s BL62 -W 0 $rFile $libFile > $ssearchOut);
499		- system "ssearch36 $ssearch_params" unless (-f $ssearchOut);
500		-
501		-
502		-# print Data::Dumper->Dump([$ssearchOut ], [qw(*ssearchOut )]);
503		-# <STDIN>;
504		-
505		-
506		- #---------------------------------------------------------------------
507		- #Estimate here the spacing between x-ticks for hydropathy plots
508		-
509		- my $protLen = $origSeqLength{$seqId};
510		-
511		- my $xticksSpacing = undef;
512		- if ($protLen <= 500) {
513		- $xticksSpacing = 25;
514		- }
515		- elsif ($protLen <= 1000) {
516		- $xticksSpacing = 50;
517		- }
518		- else {
519		- $xticksSpacing = 100;
520		- }
521		-
522		-
523		-
524		- #--------------------------------------------------------------------
525		- #parse ssearch36 output. For BioPerl resouces check:
526		- #http://search.cpan.org/dist/BioPerl/Bio/SearchIO.pm
527		- #https://classes.soe.ucsc.edu/bme060/Winter07/bptutorial.html
528		-
529		- my $parser = new Bio::SearchIO (-format => 'fasta', -file => $ssearchOut);
530		-
531		-
532		- #put hir the top hits
533		- my %lh_hits = ();
534		-
535		-
536		- while (my $result = $parser->next_result) {
537		-
538		-
539		- my $qLen = $result->query_length;
540		- $lh_hits{$bundle}{qName} = $result->query_name;
541		- $lh_hits{$bundle}{qLen} = $qLen;
542		- $lh_hits{$bundle}{hits} = [];
543		-
544		-
545		- HIT:while (my $hit = $result->next_hit) {
546		-
547		- HSP:while(my $hsp = $hit->next_hsp) {
548		-
549		-
550		-# print Data::Dumper->Dump([$hsp ], [qw(*hsp )]);
551		-# <STDIN>;
552		-
553		-
554		- my %tmp = ();
555		-
556		- my $alnLen = $hsp->hsp_length;
557		- my $hLen = $hit->length;
558		- my $hEvalue = $hsp->evalue;
559		- my $hId = $hsp->frac_identical('total'); #identity in the alignment
560		- my $hSim = $hsp->frac_conserved('total'); #similarity in the alignment
561		-
562		-
563		- #coordinates in the alignment to properly calculate coverages
564		- my $qstart = $hsp->start('query');
565		- my $qend = $hsp->end('query');
566		- my $sstart = $hsp->start('subject');
567		- my $send = $hsp->end('subject');
568		-
569		-
570		- #Calculate coverages properly (do not use alignment length as it includes gaps
571		-
572		- my $qCov_tmp = ($qend - $qstart + 1) / $qLen;
573		- my $qCov = ($qCov_tmp > 1.0)? 1.0 : $qCov_tmp;
574		-
575		- my $hCov_tmp = ($send - $sstart + 1) / $hLen;
576		- my $hCov = ($hCov_tmp > 1.0)? 1.0 : $hCov_tmp;
577		-
578		-
579		-# print Data::Dumper->Dump([$qLen, $qCov, $hLen, $hCov, $gs_coverage, $hEvalue, $gs_evalue, $hId, $gs_identity],
580		-# [qw(qLen qCov $hLen hCov coverageCutoff evalue evalCutoff hId IDcutoff)]);
581		-# <STDIN>;
582		-
583		-
584		- #Before storing hit results check minimum coverage, identity and evalue
585		- next HSP unless (($qCov >= $gs_coverage \|\| $hCov >= $gs_coverage) &&
586		- ($hEvalue <= $gs_evalue) && ($hId >= $gs_identity));
587		-
588		-
589		- #hit identity
590		- $tmp{hName} = $hit->name;
591		- $tmp{hLen} = $hLen;
592		-
593		-
594		- #hit statistics
595		- $tmp{alnLen} = $alnLen;
596		- $tmp{hEvalue} = $hEvalue;
597		- $tmp{hId} = $hId;
598		- $tmp{hSim} = $hSim;
599		- $tmp{qCov} = $qCov;
600		- $tmp{hCov} = $hCov;
601		-
602		-
603		- #The alignment
604		- $tmp{qstart} = $qstart;
605		- $tmp{qend} = $qend;
606		- $tmp{sstart} = $sstart;
607		- $tmp{send} = $send;
608		-
609		- $tmp{qSeq} = $hsp->query_string;
610		- $tmp{sSeq} = $hsp->hit_string;
611		- $tmp{homStr} = $hsp->homology_string;
612		-
613		-
614		- #Get the GSAT score
615		- my $gsat_outFile = "$alignmentsDir/${tcAcc}_" . $lh_hits{$bundle}{qName} . "_vs_" . $tmp{hName} . ".gsat";
616		-
617		-
618		-# print "gsat.py $tmp{qSeq} $tmp{sSeq} $gsatShuffles > $gsat_outFile\n";
619		-# exit;
620		-
621		- system "gsat.py $tmp{qSeq} $tmp{sSeq} $gsatShuffles > $gsat_outFile" unless (-f $gsat_outFile);
622		-
623		- my $gsat_score = TCDB::Assorted::get_gsat_score ($gsat_outFile);
624		- $tmp{gsat} = $gsat_score;
625		-
626		-
627		-# print Data::Dumper->Dump([\%tmp ], [qw(*matchData )]);
628		-# <STDIN>;
629		-
630		-
631		- #GSAT is the last filter
632		- next HSP unless ($gsat_score >= $min_gsat_score);
633		-
634		- #------------------------------------------------------------
635		- #Generate quod plot with the repeat
636		-
637		- my $whole_prot_seq = "$gs_seqDir/${seqId}.faa";
638		- die "Protein sequence not found: $whole_prot_seq" unless (-f $whole_prot_seq);
639		-
640		-
641		- my $plotFile = "$hydroPlotsDir/${seqId}_" . $lh_hits{$bundle}{qName} . "_vs_" . $tmp{hName};
642		- my $fileName = "../$plotsDir/${seqId}_" . $lh_hits{$bundle}{qName} . "_vs_" . $tmp{hName} . ".png";
643		- my $plotTitle = $lh_hits{$bundle}{qName} . " vs " . $tmp{hName};
644		-
645		- #Get hydrophobic peaks coords
646		- my $hydroPeaks = $gh_tms{$seqId};
647		- die "No hydrophobic peaks found for sequence: $seqId" unless (@{ $hydroPeaks });
648		-
649		-
650		- #format the hydrophobic peaks for quod
651		- my @peaks = map { join ("-", @$_) . ":orange" } @$hydroPeaks;
652		- my $pstring = join (" ", @peaks);
653		-
654		-
655		- #----------
656		- #Calculate the positions of the aligned section of each bundle in the full sequence.
657		-
658		- my $q_bid = ($lh_hits{$bundle}{qName} =~ /BDL(\d+)/)? $1 : undef;
659		- my $s_bid = ( $tmp{hName} =~ /BDL(\d+)/)? $1 : undef;
660		- die "Could not extract bundle number for: $lh_hits{$bundle}{qName} or $tmp{hName}" unless ($q_bid && $s_bid);
661		-
662		-
663		- #extract initial positions for both bundles
664		- my $qbstart = $lhr_bundleSeqFiles->{$q_bid}->[1];
665		- my $qbend = $lhr_bundleSeqFiles->{$q_bid}->[2]; #$qLen - 1;
666		- my $sbstart = $lhr_bundleSeqFiles->{$s_bid}->[1];
667		- my $sbend = $lhr_bundleSeqFiles->{$s_bid}->[2]; #$hLen - 1;
668		- die "Could not extract coords for bundle $q_bid" unless ($qbstart && $qbend);
669		- die "Could not extract coords for bundle $s_bid" unless ($sbstart && $sbend);
670		-
671		-
672		- #Calculate bundle positions here
673		- my $qgp_start = $qbstart + ($qstart - 1);
674		- my $qgp_end = $qbstart + ($qend - 1);
675		-
676		- my $sgp_start = $sbstart + ($sstart - 1);
677		- my $sgp_end = $sbstart + ($send - 1);
678		-
679		-
680		- #Format the coordinates for the repeats now
681		- my $qrep = "${qgp_start}-${qgp_end}:green";
682		- my $srep = "${sgp_start}-${sgp_end}:blue";
683		-
684		- #Format the coordinates for the bar delimiting the bundles
685		- my $bars = "-w ${qbstart}-${qbend}::1 ${sbstart}-${sbend}::1";
686		-
687		- #The quod command line
688		- my $cmd = "quod.py $whole_prot_seq -t png -l '$plotTitle' -o $plotFile -q -r 80 $bars --xticks $xticksSpacing -nt +0 -at ${pstring} ${qrep} ${srep}";
689		-
690		- my $img = "${plotFile}.png";
691		- system $cmd unless (-f $img);
692		- die "Could not generate plot: $img" unless (-f $img);
693		-
694		- $tmp{plot} = $fileName;
695		-
696		-
697		- #load the data into the hits section for this bundle
698		- push (@{ $lh_hits{$bundle}{hits} }, \%tmp);
699		-
700		-
701		- } #HSP
702		- } #HIT
703		- } #While
704		-
705		-
706		- #Add results to the topHits hash
707		- if (@{ $lh_hits{$bundle}{hits} }) {
708		- $lhr_topHits->{$id} = \%lh_hits;
709		- }
710		-
711		- }
	93	+ $seqsFile = $value;
712	94	}
713	95
714	96
715		-
716		-
717	97	#==========================================================================
718		-#Given a sequence, its TMS coordinates and a repeat size (rsize), cut the
719		-#sequence in TMS bundles of length rsize.
	98	+#Option -t
720	99
	100	+sub read_tmsFile {
	101	+ my ($opt, $value) = @_;
721	102
722		-sub cut_seq_in_tms_regions {
723		-
724		- my ($ls_pid, $ls_repeat, $lhr_tms, $lhr_seqSegs) = @_;
725		-
726		-
727		- %$lhr_seqSegs = ();
728		-
729		-
730		- #Get the directory where bundle sequences will be saved
731		- my $sequencesDir = undef;
732		-
733		- if ($mode eq 'all') {
734		- $sequencesDir = getSequencesDir();
	103	+ unless (-f $value && !(-z $value)) {
	104	+ die "Error in option -t: File with TMSs (hhmtop output) does not exist or is empty!\n";
735	105	}
736		- else {
737		- $sequencesDir = getSequencesDir($ls_pid);
738		- }
739		- die "Error: invalid sequence dir" unless ($sequencesDir);
740	106
741		-
742		- #----------------------------------------------------------------------
743		- #Get the coordinates of the overlapping bundles
744		-
745		- my @la_tms = @{ $lhr_tms->{$ls_pid} };
746		-
747		-
748		-
749		- #Get the Length of the sequence of the query protein
750		- my $seqFile = "$gs_seqDir/${ls_pid}.faa";
751		- my $obj = Bio::SeqIO->new(-file => $seqFile , -format => "fasta");
752		- my $seqObj = $obj->next_seq;
753		- my $qlength = $seqObj->length;
754		- die "Could not extract protein length." unless ($qlength);
755		-
756		- #Store the length of the original sequence for proper calculation of
757		- #the x-ticks in the hydropathy plots of the results
758		- $origSeqLength{$ls_pid} = $qlength;
759		-
760		-
761		-
762		- #Number of TMS in protein
763		- my $ls_ntms = scalar (@la_tms);
764		-
765		-
766		-
767		- for (my $idx=1; $idx <= ($ls_ntms - $ls_repeat + 1); $idx++) {
768		-
769		- #TMS in bundle
770		- my $left_tms = $la_tms[$idx - 1];
771		- my $right_tms = $la_tms[$idx + $ls_repeat - 2];
772		-
773		-
774		- #The coordinates of the bundle
775		- my $left_pos = (($left_tms->[0] - $gs_tail) <= 0)? 1 : $left_tms->[0] - $gs_tail;
776		- #my $right_pos = (($right_tms->[1] + $gs_tail) >= $qlength)? $right_tms->[1] : $right_tms->[1] + $gs_tail;
777		- my $right_pos = (($right_tms->[1] + $gs_tail) >= $qlength)? $qlength - 1 : $right_tms->[1] + $gs_tail;
778		-
779		-
780		- #Cut and name the bundles only if bundle file does not exist
781		- my $outfile = "${ls_pid}_bundle${idx}";
782		- unless (-f "$sequencesDir/${outfile}.faa") {
783		-
784		- #cutting bundle
785		- my $args = qq(-if $seqFile -od $sequencesDir -of $outfile -rangeCut -s $left_pos -e $right_pos -t 0);
786		- system "tmsplit $args > /dev/null";
787		-
788		- #replace protein ID with bundle number to the ID so alignments can be easily identified
789		- system qq(perl -i -pe 's/>\\S+/>BDL$idx/' $sequencesDir/${outfile}.faa);
790		- }
791		-
792		- $lhr_seqSegs->{$idx} = ["${outfile}.faa", $left_pos, $right_pos];
793		- }
	107	+ $tmsFile = $value;
794	108	}
795	109
796	110
797		-
798		-
799	111	#==========================================================================
800		-#Read file with the TMS coordinates of the input proteins. The TMS
801		-#must have been validated with WHAT to make sure they are reliable.
	112	+#Option -d
802	113
	114	+sub read_seqsDir {
	115	+ my ($opt, $value) = @_;
803	116
804		-sub read_tms_coordinates_file {
	117	+ die "Error: directory with sequences does not exist." unless (-d $value);
805	118
806		- my ($s_coordsFile, $hr_tms) = @_;
807		-
808		- open (my $fileh, "<", $s_coordsFile) \|\| die $!;
809		-
810		- #-----------------------------------------------------------------
811		- #The format of this file is protein ID followed by pairs of
812		- #coordinates separated by dash:
813		- # 2.A.43.1.1-O60931 1-20 25-35 50-68 ....
814		- if ($infileFmt eq 'tms') {
815		-
816		- while(<$fileh>) {
817		- chomp;
818		-
819		- #ignore empty lines;
820		- next unless ($_);
821		-
822		- #extract id and TMSs coordinates
823		- my ($id, @tms_str) = split(/\s+/, $_);
824		- my @tms = map { [ split(/-/, $_) ] } @tms_str;
825		-
826		-
827		- #For debugging purposes
828		-# next unless ($id eq 'WP_100644534');
829		-
830		-
831		- $hr_tms->{$id} = \@tms;
832		-
833		- #Verify that the sequence is available for this protein
834		- unless (-f "$gs_seqDir/${id}.faa" && ! (-z "$gs_seqDir/${id}.faa")) {
835		- die "Could not find sequence for protein: $id in dir: $gs_seqDir -->";
836		- }
837		- } #while
838		- }
839		-
840		- #Input file is in HMMTOP format
841		- else {
842		- while(<$fileh>) {
843		- chomp;
844		-
845		- #Remove trailing spaces
846		- s/\s+$//;
847		-
848		- #ignore empty lines
849		- next unless ($_);
850		-
851		-
852		- #parse hmmtop line
853		- my ($id, $ntms, $tms_str) = (/\S+\s+\d+\s+(\S+).+(IN\|OUT)\s+(\d+)\s+([\d\s-]+)/)? ($1, $3, $4) : ();
854		-
855		- #For debugging purposes
856		-# next unless ($id eq 'WP_100644534');
857		-
858		-
859		- if ($id && $ntms && $tms_str) {
860		-
861		- #extract the pairs of coordinates for TMS
862		- my @coords = split(/\s+/, $tms_str);
863		- my @tms = ();
864		- for (my $i=0; $i < $#coords; $i += 2) {
865		- push (@tms, [$coords[$i], $coords[$i+1]]);
866		- }
867		-
868		- $hr_tms->{$id} = \@tms;
869		-
870		- }
871		- else {
872		- print "problem parsing HMMTOP line: $_\n";;
873		- print Data::Dumper->Dump([$id, $ntms, $tms_str ], [qw(id ntms *tms_str )]);
874		- exit;;
875		- }
876		- }
877		- }
878		-
879		- close $fileh;
	119	+ $seqsDir = $value;
880	120	}
881	121
882	122
883		-
884	123	#==========================================================================
885		-#Get the directory where the sequences of bundles will be saved.
	124	+#Option -o
886	125
887		-sub getSequencesDir {
	126	+sub read_outdir {
	127	+ my ($opt, $value) = @_;
888	128
889		- my $protId = shift;
890		-
891		- my $dir = undef;
892		-
893		- if ($mode eq 'all') {
894		- $dir = "$outdir/$seqDir";
895		- }
896		- else {
897		- die "Error: protein accession missing" unless ($protId);
898		- $dir = "$outdir/$protId/$seqDir";
899		- }
900		-
901		- system "mkdir -p $dir" unless (-d $dir);
902		- die "No dir for bundle sequences found: $dir" unless (-d $dir);
903		-
904		- return $dir;
	129	+ $outDir = $value;
905	130	}
906	131
907	132
908	133	#==========================================================================
909		-#Get the directory where the alignments will be saved
	134	+#option -h
910	135
911		-sub getAlignmentsDir {
912	136
913		- my $protId = shift;
914		-
915		- my $dir = undef;
916		-
917		- if ($mode eq 'all') {
918		- $dir = "$outdir/$alignDir";
919		- }
920		- else {
921		- die "Error: protein accession missing" unless ($protId);
922		- $dir = "$outdir/$protId/$alignDir";
923		- }
924		-
925		- system "mkdir -p $dir" unless (-d $dir);
926		- die "No dir for alignments found: $dir" unless (-d $dir);
927		-
928		- return $dir;
929		-}
930		-
931		-
932		-#==========================================================================
933		-#Get the directory where hydropathy plots will be saved
934		-
935		-sub getPlotsDir {
936		-
937		- my $protId = shift;
938		-
939		- my $dir = undef;
940		-
941		- if ($mode eq 'all') {
942		- $dir = "$outdir/$plotsDir";
943		- }
944		- else {
945		- die "Error: protein accession missing" unless ($protId);
946		- $dir = "$outdir/$protId/$plotsDir";
947		- }
948		-
949		- system "mkdir -p $dir" unless (-d $dir);
950		- die "No dir for plots found: $dir" unless (-d $dir);
951		-
952		- return $dir;
953		-}
954		-
955		-
956		-#==========================================================================
957		-#Get the directory where the reports will be saved
958		-
959		-sub getReportsDir {
960		-
961		- my $protId = shift;
962		-
963		- my $dir = undef;
964		-
965		- if ($mode eq 'all') {
966		- $dir = "$outdir/$repDir";
967		- }
968		- else {
969		- die "Error: protein accession missing" unless ($protId);
970		- $dir = "$outdir/$protId/$repDir";
971		- }
972		-
973		- system "mkdir -p $dir" unless (-d $dir);
974		- die "No dir for reports found: $dir" unless (-d $dir);
975		-
976		- return $dir;
977		-}
978		-
979		-
980		-
981		-
982		-
983		-#==========================================================================
984		-#Read command-line arguments
985		-
986		-sub read_command_line_arguments {
987		-
988		- #if no arguments are given print the help
989		- if (! @ARGV) {
990		- print_help();
991		- }
992		-
993		- #----------------------------------------------------------------------
994		- #Parse command line arguments
995		-
996		- my $ls_status = GetOptions(
997		- "i\|infile=s" => \$gs_infile,
998		- "if\|infile-format=s" => \$infileFmt,
999		- "o\|outdir=s" => \$outdir,
1000		- "f\|id-format=s" => \$gs_idFormat,
1001		- "r\|rep-unit=i" => \$gs_repUnit,
1002		- "t\|tail-size=i" => \$gs_tail,
1003		- "s\|seqs=s" => \$gs_seqDir,
1004		- "e\|evalue=f" => \$gs_evalue,
1005		- "c\|coverage=f" => \$gs_coverage,
1006		- "id\|identity=f" => \$gs_identity,
1007		- "ncs\|no-comp-stats!" => \$compStatsFlag,
1008		- "gs\|gsat-shuffles=i" => \$gsatShuffles,
1009		- "z\|gsat-cutoff=f" => \$min_gsat_score,
1010		- "m\|mode=s" => \$mode,
1011		- "h\|help" => sub { print_help(); },
1012		-
1013		- #For arguments that do not look like valid options
1014		- "<>" => sub { die "Error: Unknown argument: $_[0]\n"; }
1015		- );
1016		- die "\n" unless ($ls_status);
1017		-
1018		- #----------------------------------------------------------------------
1019		- #Validate command line arguments
1020		-
1021		- die "Error: argument -i is mandatory.\n" unless ($gs_infile);
1022		- die "Error: argument -r is mandatory and must be greater than 0.0\n" unless ($gs_repUnit > 0);
1023		- die "Error: augument -t must be grater than 0 and less than 16\n" if ($gs_tail > 15 \|\| $gs_tail < 0);
1024		- die "Error: argument -e must be greater than 0\n" unless ($gs_evalue >=0 );
1025		- die "Error: argument -c must be between 0.5 and 1.0\n" unless ($gs_coverage >= 0.0 && $gs_coverage <= 1.0);
1026		- die "Error: argument -id must be between 0.25 and 1.0\n" unless ($gs_identity >= 0.0 && $gs_identity <= 1.0);
1027		-
1028		- #Option -f
1029		- $gs_idFormat = lc $gs_idFormat;
1030		- unless ($gs_idFormat =~ /^(tc\|tca\|o)$/) {
1031		- die "Error: There are 3 Valid options for -f (tc, tca, o)\n";
1032		- }
1033		-
1034		-
1035		- #option -if
1036		- $infileFmt = lc $infileFmt;
1037		- unless ($infileFmt =~ /^(hmmtop\|tms)$/) {
1038		- die "Error: invalid input file format: '$infileFmt' (Valid options: hmmtop, tms).\n";
1039		- }
1040		-
1041		-
1042		- #option -m
1043		- $mode = lc $mode;
1044		- unless ($mode =~ /^(all\|each\|debug)$/) {
1045		- die "Error: invalid mode of operation '$mode'. Valid options are: all, each!\n";
1046		- }
1047		-
1048		-
1049		- #Option -s
1050		- unless (-d $gs_seqDir) {
1051		- die "Error: Directory with sequences must exits -> $gs_seqDir\n";
1052		- }
1053		-
1054		-
1055		- #Validate GSAT cutoff
1056		- unless ($min_gsat_score >= 0) {
1057		- die "Use GSAT cutoff >= 3.0!\n";
1058		- }
1059		-
1060		-
1061		- #option -ncs
1062		- $compStats = ($compStatsFlag)? "" : "-k 1000 -z 11";
1063		-}
1064		-
1065		-
1066		-
1067	137	sub print_help {
1068	138
1069		- my $help = <<'HELP';
	139	+ my $help = <<'HELP';
1070	140
1071		-This script searches for regions of TMSs repeated in a full protein.
	141	+This program searches for repeats between different user-specified
	142	+regions of proteins.
1072	143
1073		--i, --infile {path}
1074		- Input file with id/accession(s) of the protein(s) to analyze and the coordinates
1075		- of the TMSs in that protein(s). Use option -if to specify the format of this
1076		- file.
1077		- (Argument is mandatory).
	144	+ Command line options:
1078	145
1079		--if, --infile-format {string} (optional)
1080		- Format of the TMS coordenates. It can be either 'tms' or 'hmmtop'.
1081		- (Default: hmmtop)
	146	+ -s, --seqs-file {file} (mandatory)
	147	+ Path to file in fasta format with all the input sequences.
	148	+ This option is incompatible with option -d. But one of the
	149	+ two options must be given.
1082	150
1083		--o, --outdir {path}
1084		- Output directory where results will be saved.
1085		- (Default: repeats)
	151	+ -d, --seqs-dir {path} (optional)
	152	+ Path to directory where the input sequences are located.
	153	+ This option is incompatible with option -s. But one of the
	154	+ two options must be given.
1086	155
1087		--s, --seqs {path}
1088		- Directory to access the sequences in FASTA format that will be used to
1089		- search for repeats. One file per sequence, and the name of the file is
1090		- the accession of the protein followed by '.faa'
1091		- (Argument is mandatory)
	156	+ -o, --oudir {path} (optional)
	157	+ Path to the output directory.
	158	+ (Default: ./tmsRepeat)
1092	159
1093		--f, --id-format {string}
1094		- Acceptable formats for identifiers:
1095		- tc plain tcdb identifier of a system (e.g., 2.A.1.8.1)
1096		- tca tcdb id and accession separated by dash (e.g. 2.A.1.8.3-Q9R6U5)
1097		- o other, it can be refSeq, uniprot or custom, but it is requried
1098		- that is is a single string without spaces.
1099		- (Argument is mandatory)
	160	+ -t, --tms {file} (optional)
	161	+ File with the output of hmmtop for the input sequences, if available.
	162	+ (Default: run hmmtop on input sequences)
1100	163
1101		--r, --repeat-unit {int)
1102		- Size in TMS of the repeat unit to search in the protein.
1103		- (Argument is mandatory)
	164	+ -e, --evalue {float} (optional)
	165	+ Maximal evalue cutoff for the aligned segments.
	166	+ (Default: 0.001)
1104	167
1105		--t, --tail-size {int}
1106		- Number of residues to add to the beginning and end of TMS regions before
1107		- running comparisons. Value should be less than or equal to 15 residues.
1108		- (Default: 5);
	168	+ -i, --identity {float} (optional)
	169	+ Minimal identity in aligned regions.
	170	+ (Default: 0.2)
1109	171
1110		--e, --evalue {float}
1111		- Maximum evalue to consider an alignment between two TMS bundles significant.
1112		- (Default: 0.1);
	172	+ -c, --coverage {float} (optional)
	173	+ Minimal coverage cutoff within the range: [0, 1] for the coverage of aligned regions.
	174	+ (Default: 0.85)
1113	175
1114		--ncs, --no-comp-stats {FLAG}
1115		- If present, this flag indicates that E-values will not be corrected using
1116		- compositional statistics.
1117		- (Default: apply correction).
	176	+ -h, --help
	177	+ Display this help. Also displayed if script is run without arguments.
1118	178
1119		--c, --coverage {float}
1120		- Minimum alignment coverage of the smallest bundle to consider an alignment
1121		- signifiant.
1122		- (Default: 0.8)
1123		-
1124		--id, --identity {float}
1125		- Minimum identity, expressed as a float in the 0-1 range, to consider an
1126		- alignment signficant.
1127		- (Defatul: 0.25);
1128		-
1129		--gs, --gsat-shuffles {int}
1130		- Number of shuffles that will be used to run GSAT on good matches.
1131		- (Default: 1000);
1132		-
1133		--z, --gsat-cutoff {int}
1134		- Minimum GSAT score cutoff to select good hits.
1135		- (Default: 4.0)
1136		-
1137		--h, --help
1138		- Print this help message. It takes precedence to any other option.
1139		-
1140	179	HELP
1141	180
1142		- print $help;
1143		- exit;
1144		-
	181	+ print $help;
	182	+ exit;
1145	183	}