UCSD_WLU / repeatFinder

Commit 68b41f16a35757f498478ba2822e6978013608f2

Authored by Luis Arturo Medrano-Soto 2 years ago

Exists in master

update tmsRepeat.pl, alignSeqsFiles.pl and locateFragment.pl to work with python3 scripts

Showing 3 changed files with 128 additions and 1080 deletions Inline Diff

alignSeqsFiles.pl
locateFragment.pl
tmsRepeat.pl

#!/usr/bin/env perl

# NOTE: the former "-w" switch was dropped from the shebang. On Linux the
# kernel hands "perl -w" to env as ONE argument, so the script would not
# start; "use warnings" below already enables warnings for this file.

use strict;
use warnings;
use Data::Dumper;

$Data::Dumper::Deepcopy = 1;

use Getopt::Long;
use LWP;
use Bio::SeqIO;
use Bio::SearchIO;

#Local libraries
use TCDB::CheckDependencies;
use TCDB::Domain::PfamParser;
use TCDB::Domain::Characterize;
use TCDB::Assorted;


###########################################################################
#
# Compare two files with fasta sequences and report the alignment parameters
# along with hydropathy plots and PFAM domains.
#
###########################################################################

#==========================================================================
#Check dependencies

# NOTE(review): makeblastdb and quod.py are also invoked further down in
# this script but are not listed here -- confirm whether they should be.
my @dependencies = ("zgrep", "blastp", "ssearch36", "hmmtop", "blastdbcmd",
                    "hmmscan");
my $CheckDep_obj = TCDB::CheckDependencies->new();
$CheckDep_obj->dependencies_list(\@dependencies);
$CheckDep_obj->checkDependencies;


#This will prevent quod and alnquod from going into interactive mode
$ENV{"MPLBACKEND"} = "agg";


#==========================================================================
#Read command line arguments

my $qfile      = "";
my $qProt      = "";
my $sfile      = "";
my $sProt      = "";
my $qlabel     = "Query";
my $slabel     = "Subject";
my $outdir     = "";
my $prog       = 'ssearch36'; #'blastp';
my $evalue     = 1e-4;
my $identity   = 20.0;
my $coverage   = 40.0;
my $covControl = "X";
my $blastComp  = "F"; #2;
my $segFilter  = 'no';
my $minLength  = 30;  #Min length of proteins to analyze (without gaps)
my $subMatrix  = 'BL50';
my $hyd_qylim  = undef; #Y-axis limits for query hydropathy plot [low, high]
my $hyd_sylim  = undef; #Y-axis limits for subject hydropathy plot [low, high]

#this can be used to remove long sequences from results
my $maxProtLength = 100000; #default threshold to allow any length
my $LengthControl = "N";    #same meaning as $covControl

#internal directories
my $filesDir = "";
my $plotsDir = "";
my $seqDir   = "";
my $blastDir = "";


#The defaults above are expected to be overridden by the command line
read_command_line();

#print Data::Dumper->Dump([$qfile, $qProt, $sfile, $sProt, $qlabel, $slabel, $outdir, $prog,
# $evalue, $coverage, $covControl, $blastComp, $segFilter, $maxProtLength,
# $LengthControl],
# [qw(qfile qProt sfile sProt qlabel slabel outdir prog
# evalue coverage covControl blastComp segFilter maxProtLength
# *LengthControl)]);
#exit;



#==========================================================================
#Output files

#The alignment file by blastp or ssearch36
my $alnFile = "$filesDir/${prog}.out";

#The results of running hmmscan
my $pfamFile = "$filesDir/hmmscan.out";

#The results from running hmmtop
my $hmmtopFile = "$filesDir/hmmtop.out";

#The blast database to retrieve sequences for plotting
my $blastdb = "$blastDir/sequences";



#==========================================================================
#Run the alignment first

print "Running $prog and parsing output....\n";
run_alignment();


my @alnHits = ();
if    ($prog eq 'blastp')    { parse_blast(\@alnHits); }
elsif ($prog eq 'ssearch36') { parse_ssearch(\@alnHits); }
else {
    #Without a parser @alnHits stays empty; say why instead of reporting
    #a misleading "no hits" error.
    die "Unsupported alignment program: $prog\n";
}

#print Data::Dumper->Dump([\@alnHits ], [qw($alnHits )]);
#exit;


die "No significant blastHits found!\n" unless (@alnHits);



#==========================================================================
#Run pfam (get clans), hmmtop, and parse results

my %pfamHits   = ();
my %clans      = ();
my %hmmtopHits = ();
run_pfam_hmmtop(\%pfamHits, \%hmmtopHits, \%clans);

#print Data::Dumper->Dump([\%clans], [qw(*clans)]);
#exit;


#==========================================================================
#Parse the alignment results to make sure there are significant results,
#get domains for significant hits and plot the corresponding hydropathies.


print "Geneating report...\n";
generate_report();
	142	144
	143	145
	144	146
#==========================================================================	145	147	#==========================================================================
################ Subroutine definitions beyond this point ##############	146	148	################ Subroutine definitions beyond this point ##############
#==========================================================================	147	149	#==========================================================================
	148	150
	149	151
#==========================================================================	150	152	#==========================================================================
#Generate output for significant hits	151	153	#Generate output for significant hits
	152	154
# Write the HTML report ($outdir/report.html) for every alignment hit.
#
# Takes no arguments; reads the file-level variables $outdir, $qlabel,
# $slabel and @alnHits. Hits are processed in by_evalue order. For each hit
# the alignment summary is printed, run_quod() is called to produce the
# plots, and the table returned by generate_domain_data() is embedded when
# it is non-empty.
#
# Dies if the report cannot be opened or closed, or if run_quod() reports
# failure for a hit.
sub generate_report {

    #Prepare output file
    my $htmlFile = "$outdir/report.html";

    #The labels end up in HTML text nodes, so neutralize markup characters.
    my $escape_html = sub {
        my ($text) = @_;
        $text =~ s/&/&amp;/g;
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;
        $text =~ s/"/&quot;/g;
        return $text;
    };
    my $title = $escape_html->("$qlabel vs $slabel");


    #Visible content (<br />, <h1>) must live inside <body>, not between
    #</head> and <body>.
    my $htmlHeader = <<HEADER;
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />

<style type="text/css">

.label {
text-align: right;
width: 50px;
}

.data {
text-align: left;
padding-left: 8px;
width: 100px;
}

.uline {
text-decoration: underline;
}

.pfam {
text-align: center;
vertical-align: middle;
}

.seq {
border: 2px solid black;
height: 70px;
width: 100%;
overflow-x: auto;
overflow-y: hidden;
margin: 1em 0;
background: gray;
color: white;
}


.dom {
border: 2px solid black;
height: 100px;
width: 100%;
overflow-x: auto;
overflow-y: auto;
margin: 1em 0;
background: gray;
color: white;
}

img {
display: block;
margin-left: auto;
margin-right: auto;
height: 250px;
width: auto;
max-width: 1500px;
max-height: 300px;
}

</style>
<title>$title</title>
</head>
<body>
<br />
<h1 style='text-align:center'>$title</h1>

HEADER


    open(my $outh, ">", $htmlFile)
        or die "Could not open $htmlFile for writing: $!";
    print $outh $htmlHeader;


    foreach my $hit (sort by_evalue @alnHits) {

        my $qacc   = $hit->{qacc};
        my $qlen   = $hit->{qlen};
        my $qseq   = $hit->{qseq};
        my $qcov   = sprintf("%.1f", $hit->{qcov});
        my $qstart = $hit->{qstart};
        my $qend   = $hit->{qend};

        my $sacc   = $hit->{sacc};
        my $slen   = $hit->{slen};
        my $sseq   = $hit->{sseq};
        my $scov   = sprintf("%.1f", $hit->{scov});
        my $sstart = $hit->{sstart};
        my $send   = $hit->{send};

        my $eval = sprintf("%.1e", $hit->{evalue});
        my $id   = sprintf("%.1f", $hit->{id});
        my $hstr = $hit->{hstr};

        my $alnHit = <<HIT;

<br /><hr style=\"border-style:solid; border-width:5px; color:black;\"/>

<p><b>$qacc ($qlen) vs $sacc ($slen)</b></p>

<table width="600px" border="0" cellspacing="0" cellpadding="2">
<tr>
<td class='label'><b>E-value:</b></td>
<td class='data'>$eval</td>
<td class='label'><b>Identity:</b></td>
<td class='data'>${id}%</td>
<td class='label'><b>Q_coverage:</b></td>
<td class='data'>${qcov}%</td>
<td class='label'><b>S_coverage:</b></td>
<td class='data'>${scov}%</td>
</tr>
<tr>
<td class='label'><b>Q_aln:</b></td>
<td class='data'>${qstart}-$qend</td>
<td class='label'><b>S_aln:</b></td>
<td class='data'>${sstart}-$send</td>
<td class='label'></td>
<td class='data'></td>
<td class='label'></td>
<td class='data'></td>
</tr>
</table>
<br />

<p><b>Alignment:</b></p>
<div class='seq'>
<pre>
$qseq
$hstr
$sseq
</pre>
</div>

HIT

        print $outh $alnHit;


        #Generate the hydropathy plots
        my $good = run_quod($qacc, $sacc, $qstart, $qend, $sstart, $send, $qseq, $sseq);
        die "Could not generate plots for hit: $qacc vs $sacc" unless ($good);

        #Pfam table is only shown when at least one domain row exists
        my $domData = generate_domain_data($qacc, $sacc);
        my $domHTML = "";
        if ($domData) {
            $domHTML =<<DATA;
<br />
<hr />
<p><b>Pfam info:</b></p>
<div class='dom'>

$domData

</div>
DATA
        }

        #Image paths are relative to the report; the names mirror the ones
        #built in run_quod.
        my $plot_aln = "plots/${qacc}_vs_${sacc}_qs${qstart}_qe${qend}_ss${sstart}_se${send}.png";
        my $qplot    = "plots/${qacc}_vs_${sacc}_qaln_qs${qstart}_qe${qend}.png";
        my $splot    = "plots/${qacc}_vs_${sacc}_saln_ss${sstart}_se${send}.png";

        #now include the plots
        my $prtPlots =<<PLOTS;

<br />
<table style="width:100%">
<tr>
<td><a href="$qplot" target="_blank"><img src="$qplot" alt="$qacc"></a></td>
<td><a href="$splot" target="_blank"><img src="$splot" alt="$sacc"></a></td>
</tr>
<tr>
<td colspan="2" style="text-align: center;">
<a href="$plot_aln" target="_blank"><img src="$plot_aln" alt="$qacc vs $sacc alignment"></a>
</td>
</tr>
</table>

$domHTML

PLOTS

        print $outh $prtPlots;
    }

    #Close HTML report
    my $closeRep = <<CLOSE;
</body>
</html>
CLOSE

    print $outh $closeRep;

    #Buffered write errors only surface at close time, so check it.
    close($outh) or die "Could not close $htmlFile: $!";
}
	357	359
	358	360
#==========================================================================	359	361	#==========================================================================
#Generate domain data for the html report	360	362	#Generate domain data for the html report
	361	363
	362	364
# Build the HTML table with the Pfam domain hits of a query/subject pair.
#
# Arguments: ($q, $s) -- accessions used as keys into the file-level
#            %pfamHits hash.
# Returns:   the complete <table> markup when at least one domain hit was
#            found for either protein; the empty string otherwise.
#
# Format of PFAM hash:
#   push (@{ $out->{$qacc}->{$pfamID} },
#     {pfamid=> $pfamID, dlen=>$pfamLen, dstart=>$dstart, dend=>$dend, evalue=>$eval,
#      dname=>$pfamName, def=>$def, qlen=>$qlen, qstart=>$qstart, qend=>$qend });
sub generate_domain_data {

    my ($q, $s) = @_;

    my @cols = qw(Query Domain Clan Dom_length E-value Dom_start Dom_end Q_Start Q_end Dom_Name Dom_Info);
    my $colStr = " <th>" . join ("</th>\n <th>", @cols) .
        "</th>\n";

    #Attributes are separated by whitespace only (no comma) in HTML.
    my $header =<<HEADER;
<table border='1' style='width:100%'>
<tr>
$colStr
</tr>
HEADER


    my $res = "";
    foreach my $prot ($q, $s) {

        next unless (exists $pfamHits{$prot});

        #Sorted so that the row order is the same on every run.
        foreach my $d (sort keys %{ $pfamHits{$prot} }) {

            #Domains without a known clan are reported as N/A
            my $clan = $clans{$d} || "N/A";

            foreach my $hit (@{ $pfamHits{$prot}{$d} }) {
                my $dlen   = $hit->{dlen};
                my $eval   = $hit->{evalue};
                my $qstart = $hit->{qstart};
                my $qend   = $hit->{qend};
                my $dstart = $hit->{dstart};
                my $dend   = $hit->{dend};
                my $name   = $hit->{dname};
                my $def    = $hit->{def};

                $res .=<<ROW;
<tr>
<td class="pfam">$prot</td>
<td class="pfam">$d</td>
<td class="pfam">$clan</td>
<td class="pfam">$dlen</td>
<td class="pfam">$eval</td>
<td class="pfam">$dstart</td>
<td class="pfam">$dend</td>
<td class="pfam">$qstart</td>
<td class="pfam">$qend</td>
<td class="pfam">$name</td>
<td class="pfam">$def</td>
</tr>
ROW
            }
        }
    }

    #No rows: return the empty string so the caller can skip the section
    return $res unless ($res);

    return $header . $res . " </table>\n";
}
	438	440
	439	441
	440	442
	441	443
#==========================================================================	442	444	#==========================================================================
#Run quod on the query, subject and the alignment.	443	445	#Run quod on the query, subject and the alignment.
	444	446
	445	447
		448	#quod.py -q -l "HEB99829" -o plot.png --width 15 --edgecolor red --xticks 25 --no-tms +0 --add-tms 9-32 43-67 98-121 132-151 164-181 192-215 224-241:orange -w 17-245:+2.7:+:Alignment --region-font 12 --add-region 20-245:'PF07556':-2.8,-2.6:red,black:tc --mark +0:K,R,H:black --xlim 0 400 -- HEB99829.faa
		449
		450
# Produce the three hydropathy plots (alignment, query, subject) for a hit.
#
# Arguments: ($q, $s)     query and subject accessions
#            ($qs, $qe)   alignment start/end on the query
#            ($ss, $se)   alignment start/end on the subject
#            ($qseq, $sseq) aligned segments as they appear in the alignment
# Returns:   1 when the three PNG files exist after running; undef as soon
#            as one of them is missing.
# Dies when a sequence file cannot be written or when %hmmtopHits has no
# entry for the query/subject.
sub run_quod {

    my ($q, $s, $qs, $qe, $ss, $se, $qseq, $sseq) = @_;

    #Full-length sequences must be available in $seqDir
    extract_full_sequences($q, $s);

    #Write one aligned segment to $seqDir and hand back the file name
    my $save_segment = sub {
        my ($acc, $residues) = @_;
        my $path = "$seqDir/${acc}_aln.faa";
        open(my $fh, '>', $path) || die $!;
        print $fh ">$acc alignment\n$residues\n";
        close $fh;
        return $path;
    };

    #Run a plotting command only if its image is missing, then report
    #whether the image exists.
    my $render = sub {
        my ($command, $image) = @_;
        system $command unless (-f $image);
        return (-f $image) ? 1 : 0;
    };

    #Option string with the hmmtop TMS coordinates (empty when no TMS)
    my $tms_option = sub {
        my ($acc, $color) = @_;
        die "Error: no hmmtop results for: $acc" unless (exists $hmmtopHits{$acc});
        my @coords = @{ $hmmtopHits{$acc}{coords} };
        return "" unless (@coords);
        return "--add-tms " . join(",", @coords) . ":$color";
    };


    #-----------------------------------------------------------------
    #Plot of the aligned region

    my $qalnFile = $save_segment->($q, $qseq);
    my $salnFile = $save_segment->($s, $sseq);

    #The image name carries its extension
    my $alnFig = "$plotsDir/${q}_vs_${s}_qs${qs}_qe${qe}_ss${ss}_se${se}.png";
    my $cmd1 = qq(quod.py -q -l "$q (red) and $s (blue)" -o $alnFig --xticks 25 --width 15 --edgecolor +0:red +1:blue --facecolor +0:orange +1:cyan --multi frag -- $qalnFile $seqDir/${q}.faa $salnFile $seqDir/${s}.faa);
    return undef unless ($render->($cmd1, $alnFig));


    #-----------------------------------------------------------------
    #Plot of the full query sequence

    my $qTMS  = $tms_option->($q, "orange");
    my $qPfam = get_pfam_coords_for_quod($q, "red");
    my $qName = "$plotsDir/${q}_vs_${s}_qaln_qs${qs}_qe${qe}.png";
    my $cmd2 = qq(quod.py -q -l "$q" -o $qName --width 15 --edgecolor red --xticks 25 -w ${qs}-${qe}:+2.7:+:Alignment --no-tms +0 $qTMS $qPfam -- $seqDir/${q}.faa);
    return undef unless ($render->($cmd2, $qName));


    #-----------------------------------------------------------------
    #Plot of the full subject sequence

    my $sTMS  = $tms_option->($s, "cyan");
    my $sPfam = get_pfam_coords_for_quod($s, "blue");
    my $sName = "$plotsDir/${q}_vs_${s}_saln_ss${ss}_se${se}.png";
    my $cmd3 = qq(quod.py -q -l "$s" -o $sName --width 15 --edgecolor blue --xticks 25 -w ${ss}-${se}:+2.7:+:Alignment --no-tms +0 $sTMS $sPfam -- $seqDir/${s}.faa);
    return undef unless ($render->($cmd3, $sName));

    return 1;
}
	518	526
#==========================================================================	519	527	#==========================================================================
#Get the string for quod that will plot the PFAM domains	520	528	#Get the string for quod that will plot the PFAM domains
	521	529
# Build the quod.py option string that draws the Pfam domains of a protein.
#
# Arguments: $prot  -- accession used as key into the file-level %pfamHits
#            $color -- color given to every region of this protein
# Returns:   "--region-font 12 --add-region <spec> <spec> ... " with one
#            spec per domain hit, or the empty string when the protein has
#            no domain hits (so no option without values reaches quod.py).
#
# Format of PFAM hash:
#   push (@{ $out->{$qacc}->{$pfamID} },
#     {pfamid=> $pfamID, dlen=>$pfamLen, dstart=>$dstart, dend=>$dend,
#      def=>$def, qlen=>$qlen, qstart=>$qstart, qend=>$qend });
sub get_pfam_coords_for_quod {

    my ($prot, $color) = @_;

    return "" unless (exists $pfamHits{$prot});

    my @regions = ();

    #Sorted so each domain gets the same vertical slot on every run.
    foreach my $d (sort keys %{ $pfamHits{$prot} }) {

        foreach my $hit (@{ $pfamHits{$prot}{$d} }) {
            my $left  = $hit->{qstart};
            my $right = $hit->{qend};

            #Every region is stacked 0.4 units above the previous one.
            my $yposl = -2.8 + scalar(@regions) * 0.4; #lower y value
            my $yposh = $yposl + 0.15;                 #upper y value (presumably bar top -- confirm in quod.py docs)

            push @regions, "${left}-${right}:'${d}':${yposl},${yposh}:$color,black:tc";
        }
    }

    #A key with no hits must not yield a dangling --add-region option.
    return "" unless (@regions);

    return "--region-font 12 --add-region " . join(" ", @regions) . " ";
}
	556	566
	557	567
#==========================================================================	558	568	#==========================================================================
#Extract the full sequences of the query and subject proteins	559	569	#Extract the full sequences of the query and subject proteins
#Examples: AKM80767.1	560	570	#Examples: AKM80767.1
	561	571
	562	572
	563	573
# Make sure the full sequences of the query and subject are available as
# $seqDir/<accession>.faa, extracting them from the blast database in
# $blastdb when the file is missing or empty.
#
# Arguments: ($q, $s) -- query and subject accessions (e.g. AKM80767.1)
# Returns:   nothing useful.
# Dies when a sequence file is still missing or empty after blastdbcmd ran.
sub extract_full_sequences {

    my ($q, $s) = @_;

    #Query first, then subject
    foreach my $acc ($q, $s) {

        my $seqFile = "$seqDir/${acc}.faa";

        #An existing non-empty file is reused as is.
        unless (-f $seqFile && -s $seqFile) {
            #List form: arguments go straight to blastdbcmd, so accessions
            #or paths with shell metacharacters/spaces cannot be
            #reinterpreted by a shell.
            system('blastdbcmd', '-db', $blastdb, '-entry', $acc,
                   '-target_only', '-out', $seqFile);
        }

        die "Could not extract sequence for $acc" unless (-f $seqFile && -s $seqFile);
    }
}
	582	592
	583	593
	584	594
	585	595
	586	596
	587	597
	588	598
#==========================================================================	589	599	#==========================================================================
#Sort alignment results by E-value	590	600	#Sort alignment results by E-value
	591	601
# Comparison routine for sort(): orders hash references by the numeric
# value of their {evalue} entry, smallest first. It reads the two elements
# being compared from sort's package variables $a and $b.
sub by_evalue {
    my ($first, $second) = map { $_->{evalue} } ($a, $b);
    return $first <=> $second;
}
	595	605
	596	606
	597	607
#==========================================================================	598	608	#==========================================================================
#Run PFAM, hmmtop and parse results	599	609	#Run PFAM, hmmtop and parse results
	600	610
	601	611
# Run hmmscan (Pfam) and HMMTOP on the proteins that appear in the
# significant alignments (@alnHits) and parse both outputs.
#
# Arguments:
#   $pfamOut   - handed to TCDB::Assorted::parse_pfam to receive the parsed
#                Pfam hits (presumably a hash reference; confirm in caller)
#   $hmmtopOut - handed to TCDB::Assorted::parse_hmmtop to receive the parsed
#                HMMTOP predictions
#   $pfamClans - handed to parse_pfam and to TCDB::Assorted::get_clans
#
# Relies on the file-level variables $seqDir, $qfile, $sfile, $qlabel,
# $slabel, $blastdb, @alnHits, $pfamFile, $hmmtopFile and $filesDir.
#
# Every external program is skipped when its output file already exists, so
# the sub can be re-run on the same output directory.
# Returns whatever TCDB::Assorted::parse_hmmtop returns.
sub run_pfam_hmmtop {
    my ($pfamOut, $hmmtopOut, $pfamClans) = @_;

    #True when the path is a regular file with content
    my $has_data = sub { my $path = shift; return (-f $path && !(-z $path)); };

    #Delete a temporary file without spawning a shell
    my $remove = sub {
        my $path = shift;
        return unless (-f $path);
        unlink($path) or warn "Could not remove $path: $!\n";
    };


    #----------------------------------------------------------------------
    #Generate blast DB for easy sequence retrieval

    print "Generate Blast DB with sequences for fast sequence retrieval...\n";

    #Get the sequences for which hmmscan will run
    my $allSeqsFile = "$seqDir/all_seqs.faa";
    system qq(cat $qfile $sfile > $allSeqsFile) unless ($has_data->($allSeqsFile));
    die "Could not generate file: $allSeqsFile" unless ($has_data->($allSeqsFile));


    #Generate blastdb ...assuming there are no duplicate sequences.
    my $cmd1 = qq(makeblastdb -dbtype prot -in $allSeqsFile -title '$qlabel plus $slabel' -parse_seqids -hash_index -out $blastdb);
    print "$cmd1\n";
    system $cmd1 unless (-f "${blastdb}.pin");
    $remove->($allSeqsFile);


    #----------------------------------------------------------------------
    #Get the accessions of the top hits in the alignments

    #get the accessions with significant hits (hash keys remove duplicates)
    my %accList = ();
    foreach my $hit (@alnHits) {
        $accList{$hit->{qacc}} = 1;
        $accList{$hit->{sacc}} = 1;
    }


    #Save accessions to a file
    my $idFile = "$seqDir/top_hits_accs.txt";
    unless (-f $idFile) {
        open (my $afh, ">", $idFile) or die "Could not open $idFile: $!";
        print $afh join("\n", keys %accList), "\n";
        #Buffered write errors only show up when the handle is closed
        close $afh or die "Could not write $idFile: $!";
    }

    #----------------------------------------------------------------------
    #Extract full sequences for top hits.

    my $topHitsSeqs = "$seqDir/topHits.faa";
    my $cmdTopHits  = qq(blastdbcmd -db $blastdb -entry_batch $idFile -target_only -out $topHitsSeqs);
    system $cmdTopHits unless ($has_data->($topHitsSeqs));

    #Report the problem instead of letting hmmscan/hmmtop fail on a missing input
    warn "Warning: could not extract top-hit sequences to: $topHitsSeqs\n"
        unless ($has_data->($topHitsSeqs));


    #----------------------------------------------------------------------
    #run hmmscan on all the sequences for both files

    print "\nRunning hmmscan and parsing output....\n";

    my $pfamDB = ($ENV{PFAMDB})? $ENV{PFAMDB} : "$ENV{RESEARCH_DATA}/pfam/pfamdb/Pfam-A.hmm";
    my $cmd2 = qq(hmmscan --cpu 4 --noali --cut_ga -o /dev/null --domtblout $pfamFile $pfamDB $topHitsSeqs);
    system $cmd2 unless ($has_data->($pfamFile));


    #parse Pfam output
    TCDB::Assorted::parse_pfam($pfamFile, $pfamOut, $pfamClans);
    #  print Data::Dumper->Dump([$pfamOut, $pfamClans ], [qw(pfamOut pfamClans )]);
    #  exit;

    #----------------------------------------------------------------------
    #Extract clans

    TCDB::Assorted::get_clans($pfamClans, $filesDir);
    #  print Data::Dumper->Dump([$pfamClans ], [qw(*clans )]);
    #  exit;

    #----------------------------------------------------------------------
    #Run hmmtop on top hits for later hydropathy plots.

    print "Runnign HMMTOP and parsing output...\n";

    my $cmd3 = qq(hmmtop -if=$topHitsSeqs -of=$hmmtopFile -sf=FAS -pi=spred -is=pseudo);
    system $cmd3 unless (-f $hmmtopFile);
    $remove->($topHitsSeqs);

    #Parse hmmtop output
    return TCDB::Assorted::parse_hmmtop($hmmtopOut, $hmmtopFile);
}
	686	696
	687	697
#==========================================================================	688	698	#==========================================================================
#Parse ssearch36 output	689	699	#Parse ssearch36 output
	690	700
# Parse the ssearch36 report stored in $alnFile and collect the HSPs that
# satisfy the user's filters.
#
# Arguments:
#   $out - array reference. For every accepted HSP a hash reference is
#          pushed onto it with the keys qacc, sacc, qlen, slen, qcov, scov,
#          evalue, id, qstart, qend, sstart, send, qseq, sseq and hstr.
#          'id' and the coverages are percentages (0-100).
#
# Filters, in order:
#   1. protein lengths  (max_length_violation with $maxProtLength/$LengthControl)
#   2. ungapped aligned length of query and subject >= $minLength
#   3. E-value <= $evalue and TCDB::Assorted::coverage_ok with
#      $coverage/$covControl
#
# Relies on the file-level variables $alnFile, $maxProtLength,
# $LengthControl, $minLength, $evalue, $coverage and $covControl.
sub parse_ssearch {

    my $out = shift;

    my $parser = Bio::SearchIO->new(-format => 'fasta', -file => $alnFile);

    while (my $result = $parser->next_result) {

        my $qacc = $result->query_name;
        my $qlen = $result->query_length;

      HIT:
        while (my $hit = $result->next_hit) {

            my $sacc = $hit->name;
            my $slen = $hit->length;

            #The length filter only depends on the two proteins, so it is
            #evaluated once per hit instead of once per HSP.
            next HIT if (max_length_violation($qlen, $slen, $maxProtLength, $LengthControl));

          HSP:
            while (my $hsp = $hit->next_hsp) {

                #Alignment parameters
                my $eval = $hsp->evalue;
                my $id   = $hsp->frac_identical('total') * 100;

                #coordinates and sequence
                my $qstart = $hsp->start('query');
                my $qend   = $hsp->end('query');
                my $sstart = $hsp->start('subject');
                my $send   = $hsp->end('subject');
                my $qseq   = $hsp->query_string;
                my $sseq   = $hsp->hit_string;
                my $hstr   = $hsp->homology_string;

                #If the alignment has less than $minLength aas, ignore it
                #(gap characters do not count as aligned residues)
                (my $qtmp = $qseq) =~ s/-//g;
                (my $stmp = $sseq) =~ s/-//g;
                next HSP if (length($qtmp) < $minLength || length($stmp) < $minLength);

                #Calculate coverages from the coordinates (the alignment
                #length includes gaps) and cap them at 100%
                my $qCov_tmp = ($qend - $qstart + 1) / $qlen * 100;
                my $qcov     = ($qCov_tmp > 100.0)? 100 : $qCov_tmp;

                my $sCov_tmp = ($send - $sstart + 1) / $slen * 100;
                my $scov     = ($sCov_tmp > 100.0)? 100 : $sCov_tmp;

                next HSP unless ($eval <= $evalue &&
                                 TCDB::Assorted::coverage_ok($qcov, $scov, $coverage, $covControl));

                push(@{ $out }, {qacc=>$qacc,     sacc=>$sacc, qlen=>$qlen,     slen=>$slen, qcov=>$qcov,
                                 scov=>$scov,     evalue=>$eval, id=>$id,       qstart=>$qstart, qend=>$qend,
                                 sstart=>$sstart, send=>$send, qseq=>$qseq,     sseq=>$sseq, hstr=>$hstr});
            } # hsp
        } # hit
    } # result

    return;
}
	752	762
	753	763
	754	764
#==========================================================================	755	765	#==========================================================================
#Test whether the lengths of two proteins are within a predefined	756	766	#Test whether the lengths of two proteins are within a predefined
#length specified by the user. These are the options for control:	757	767	#length specified by the user. These are the options for control:
# X: Either protein is larger than the cutoff	758	768	# X: Either protein is larger than the cutoff
# B: Both proteins are larger than the cutoff	759	769	# B: Both proteins are larger than the cutoff
# Q: Only the query protein is larger than the cutoff	760	770	# Q: Only the query protein is larger than the cutoff
# S: Only the subject protein is larger than the cutoff	761	771	# S: Only the subject protein is larger than the cutoff
# N: No control. Any length is ok.	762	772	# N: No control. Any length is ok.
	763	773
# Decide whether a query/subject pair breaks the maximum protein length.
#
# Arguments:
#   $qlen    - length of the query protein
#   $slen    - length of the subject protein
#   $maxLen  - length cutoff; a protein violates it when length >= $maxLen
#   $control - which protein(s) are tested:
#                X  violation if either protein reaches the cutoff
#                B  violation only if both proteins reach the cutoff
#                Q  only the query is tested
#                S  only the subject is tested
#                N  no control, never a violation
#
# Returns 1 for a violation and 0 otherwise.
# Dies with "Unknown control mode: ..." for any other value of $control.
sub max_length_violation {

    my ($qlen, $slen, $maxLen, $control) = @_;

    #One test per control mode; only the selected one is evaluated
    my %test_for = (
        X => sub { ($qlen >= $maxLen) || ($slen >= $maxLen) },
        B => sub { ($qlen >= $maxLen) && ($slen >= $maxLen) },
        Q => sub { $qlen >= $maxLen },
        S => sub { $slen >= $maxLen },
        N => sub { 0 },
    );

    my $test = $test_for{$control};
    die "Unknown control mode: $control" unless ($test);

    return 1 if ($test->());
    return 0;
}
	791	801
	792	802
	793	803
	794	804
#==========================================================================	795	805	#==========================================================================
#Parse blast output	796	806	#Parse blast output
	797	807
	798	808
# Read the tabular blastp report in $alnFile and keep the alignments that
# pass the E-value and coverage filters.
#
# Arguments:
#   $out - array reference. For every accepted alignment a hash reference is
#          pushed onto it with the keys qacc, sacc, qlen, slen, qcov, scov,
#          evalue, id, qstart, qend, sstart, send, qseq and sseq.
#
# Expected columns (tab separated):
#   qacc sacc qlen slen evalue pident qstart qend sstart send qseq sseq
# Empty lines and lines starting with '#' are ignored.
#
# Relies on the file-level variables $alnFile, $evalue, $coverage and
# $covControl, and on TCDB::Assorted::coverage_ok.
sub parse_blast {
    my $out = shift;

    my @columns = qw(qacc sacc qlen slen evalue id qstart qend sstart send qseq sseq);

    open (my $fh, "<", $alnFile) || die $!;
  LINE:
    while (my $line = <$fh>) {
        chomp $line;
        next LINE if (!$line || $line =~ /^#/);

        #Map the blast columns onto named fields
        my %aln;
        @aln{@columns} = split (/\t/, $line);

        next LINE unless ($aln{evalue} <= $evalue);

        #Coverage as the percentage of each protein spanned by the alignment
        my $qcov = ($aln{qend} - $aln{qstart} + 1) / $aln{qlen} * 100;
        my $scov = ($aln{send} - $aln{sstart} + 1) / $aln{slen} * 100;

        next LINE unless (TCDB::Assorted::coverage_ok($qcov, $scov, $coverage, $covControl));

        push(@{ $out }, { %aln, qcov => $qcov, scov => $scov });
    }
    close $fh;
}
	827	837
	828	838
	829	839
	830	840
	831	841
#==========================================================================	832	842	#==========================================================================
#Run the alignment between the two files depending on the program	833	843	#Run the alignment between the two files depending on the program
#Selected by the user.	834	844	#Selected by the user.
	835	845
sub run_alignment {	836	846	sub run_alignment {
	837	847
my $cmd = "";	838	848	my $cmd = "";
	839	849
	840	850
if ($prog eq 'blastp') {	841	851	if ($prog eq 'blastp') {
	842	852
my $compStr = "-comp_based_stats $blastComp";	843	853	my $compStr = "-comp_based_stats $blastComp";
my $segStr = "-seg $segFilter";	844	854	my $segStr = "-seg $segFilter";
my $outFmt = qq(-outfmt '7 qacc sacc qlen slen evalue pident qstart qend sstart send qseq sseq');	845	855	my $outFmt = qq(-outfmt '7 qacc sacc qlen slen evalue pident qstart qend sstart send qseq sseq');
	846	856
#Run blast	847	857	#Run blast
$cmd = qq(blastp -query $qfile -subject $sfile -matrix BLOSUM62 -out $alnFile $outFmt -evalue $evalue -use_sw_tback $compStr $segStr);	848	858	$cmd = qq(blastp -query $qfile -subject $sfile -matrix BLOSUM62 -out $alnFile $outFmt -evalue $evalue -use_sw_tback $compStr $segStr);
print "$cmd\n";	849	859	print "$cmd\n";
system $cmd unless (-f $alnFile && !(-z $alnFile));	850	860	system $cmd unless (-f $alnFile && !(-z $alnFile));
	851	861
#Append command line to the end of results file	852	862	#Append command line to the end of results file
open (my $fh, ">>", $alnFile) \|\| die $!;	853	863	open (my $fh, ">>", $alnFile) \|\| die $!;
print $fh "\n# $cmd\n";	854	864	print $fh "\n# $cmd\n";
close $fh;	855	865	close $fh;

#!/usr/bin/env perl -w	1	1	#!/usr/bin/env perl -w
	2	2
use strict;	3	3	use strict;
use warnings;	4	4	use warnings;
use Data::Dumper;	5	5	use Data::Dumper;
	6	6
$Data::Dumper::Deepcopy = 1;	7	7	$Data::Dumper::Deepcopy = 1;
	8	8
use Getopt::Long;	9	9	use Getopt::Long;
use Bio::SearchIO;	10	10	use Bio::SearchIO;
use Bio::SeqIO;	11	11	use Bio::SeqIO;
	12	12
	13	13
#Local libraries	14	14	#Local libraries
use TCDB::CheckDependencies;	15	15	use TCDB::CheckDependencies;
use TCDB::Assorted;	16	16	use TCDB::Assorted;
	17	17
	18	18
###########################################################################	19	19	###########################################################################
#	20	20	#
# Parse GBLAST output file and return the sequence of the proteins that	21	21	# Parse GBLAST output file and return the sequence of the proteins that
# meet the user's criteria.	22	22	# meet the user's criteria.
#	23	23	#
###########################################################################	24	24	###########################################################################
	25	25
	26	26
#This will prevent quod and alnquod from going into interactive mode
$ENV{"MPLBACKEND"} = "agg";


#==========================================================================
#Check dependencies

my @dependencies = ("glsearch36", "blastdbcmd");
my $CheckDep_obj = TCDB::CheckDependencies->new();
$CheckDep_obj->dependencies_list(\@dependencies);
$CheckDep_obj->checkDependencies;




#==========================================================================
#Read command line arguments
#(these file-level variables are filled in by read_command_line and are
#read directly by the subroutines below)

my $fragment    = undef;
my $accession   = undef;
my $accFile     = undef;
my $outdir      = undef;
my $blastdb     = undef;
my $evalue      = 1e-2;
my $subMatrix   = 'BL50';
my $quiet       = 0;
my $interactive = 0;

read_command_line();

#print Data::Dumper->Dump([$fragment, $accession, $accFile, $outdir, $blastdb, $evalue, $quiet],
#                         [qw(fragment accession accFile outdir blastdb evalue *quiet)]);
#exit;




#==========================================================================
#Get the sequences for the fragment and full proteins in files


my ($fragFile, $protFile) = @{ getSequences($fragment, $accession) };
#print Data::Dumper->Dump([$fragFile, $protFile ], [qw(fragFile protFile)]);

die "Both files must exist:\n $fragFile\n $protFile" unless (-f $fragFile && -f $protFile);

#==========================================================================
#align fragment to full protein

#The substitution matrix is given only once, with the value selected by the
#user (-m option, default BL50).
my $alignFile = "$outdir/${accession}_glsearch.out";
my $cmd = qq(glsearch36 -z 21 -k 10000 -E $evalue -s $subMatrix $fragFile $protFile > $alignFile);
system $cmd unless (-f $alignFile);


#==========================================================================
#Parse glsearch output and run quod


run_quod ($alignFile, $protFile);
	86	86
	87	87
	88	88
	89	89
	90	90
#==========================================================================	91	91	#==========================================================================
################ Subroutine definitions beyond this point ##############	92	92	################ Subroutine definitions beyond this point ##############
#==========================================================================	93	93	#==========================================================================
	94	94
	95	95
# Parse a glsearch36 report and draw, with quod.py, the hydropathy plot of
# the full protein with the region matched by the fragment highlighted.
#
# Arguments:
#   $alignment - glsearch36 output file (parsed by Bio::SearchIO, 'fasta' format)
#   $sequence  - FASTA file with the full protein, passed to quod.py
#
# Only the first HSP in the report is plotted (it is taken to be the best
# one; the HSPs are not re-sorted here). The plot is written to
# "$outdir/${accession}_map_frag.png".
#
# Relies on the file-level variables $outdir, $accession, $quiet and
# $interactive.
# Dies if the report contains no HSP. Returns the status of the quod.py
# command as given by system().
sub run_quod {

    my ($alignment, $sequence) = @_;

    my $parser = Bio::SearchIO->new(-format => 'fasta', -file => $alignment);

    #----------------------------------------------------------------------
    #Parse glsearch output

    my @res     = ();
    my $hsp_cnt = 1;

    while (my $result = $parser->next_result) {
      HIT:
        while (my $hit = $result->next_hit) {
          HSP:
            while (my $hsp = $hit->next_hsp) {

                push (@res, {
                    #Alignment parameters
                    hsp    => "hsp_$hsp_cnt",
                    evalue => $hsp->evalue,
                    id     => $hsp->frac_identical('total'),  #identity in the alignment
                    sim    => $hsp->frac_conserved('total'),  #similarity in the alignment

                    #coordinates in the alignment to plot bars
                    qstart => $hsp->start('query'),
                    qend   => $hsp->end('query'),
                    sstart => $hsp->start('subject'),
                    send   => $hsp->end('subject'),
                });

                $hsp_cnt++;
            }
        }
    }

    die "Error: no match found between fragment and sequence" unless (@res);
    #  print Data::Dumper->Dump([\@res ], [qw(*res )]);


    #----------------------------------------------------------------------
    #Generate quod plot

    #Format string for the region: only the first HSP is plotted
    my $best    = $res[0];
    my $coords  = $best->{sstart} . "-" . $best->{send};
    my $regions = "--add-tms ${coords}:green";

    my $outPlot = "$outdir/${accession}_map_frag.png";
    my $qstring = ($quiet)? "-q -o $outPlot" : "-o $outPlot";
    my $iString = ($interactive)? "-o $outPlot --show" : "";

    my $cmd = qq(quod.py $qstring $iString -l "$accession ($coords)"  --xticks 25 --grid $regions -- $sequence);
    print "$cmd\n";

    #Keep going on failure as before, but do not fail silently
    my $status = system $cmd;
    warn "Warning: quod.py did not finish successfully (status: $status)\n" if ($status != 0);

    return $status;
}
	164	164
	165	165
	166	166
	167	167
# Put the fragment and the full protein sequence in FASTA files ready to be
# aligned.
#
# Arguments:
#   $frag - either the path to a file with the fragment or the fragment
#           sequence itself
#   $acc  - accession of the full protein
#
# Fragment: if $frag is an existing file it is MOVED to
# "$outdir/${acc}_frag.faa"; otherwise $frag is written to that file as a
# sequence with the header ">$acc fragment". An existing fragment file is
# never overwritten.
#
# Full protein: $accFile is used when it was given and exists. Otherwise
# "$outdir/${acc}.faa" is used and, if it does not exist, the sequence is
# extracted with blastdbcmd from $blastdb (or 'nr' when no database was
# given) and its header is reduced to the accession without version.
#
# Relies on the file-level variables $accFile, $outdir and $blastdb.
# Returns an array reference: [fragment file, full protein file].
sub getSequences {

    my ($frag, $acc) = @_;

    #  print Data::Dumper->Dump([$frag, $acc, $accFile ], [qw(frag acc *accFile)]);
    #  exit;


    #Sequence for full protein ($accFile is undef when option -i is not used)
    my $accSeq = (defined $accFile && -f $accFile)? $accFile : "$outdir/${acc}.faa";


    #Save fragment to file
    my $tmpFile = "$outdir/${acc}_frag.faa";

    if (-f $frag) {
        #List form: the paths are not interpreted by a shell
        system('mv', $frag, $tmpFile) unless (-f $tmpFile);
    }
    elsif (!(-f $tmpFile)) {
        open (my $fh, ">", $tmpFile) or die "Could not open $tmpFile: $!";
        print $fh ">${acc} fragment\n$frag\n";
        close $fh or die "Could not write $tmpFile: $!";
    }


    unless (-f $accSeq) {

        #Blast DB to be used
        my $db = ($blastdb)? $blastdb : 'nr';

        my $cmd = qq(blastdbcmd -db $db -entry $acc -target_only -outfmt '%f' -out $accSeq);
        system $cmd;

        #Remove the version and annotations from the sequence file. This
        #point is only reached for a freshly extracted file, so the clean up
        #always applies; a missing file is reported by the caller.
        if (-f $accSeq) {
            my $cmd2 = qq(perl -i.bkp -pe 's/^\\>(\\w+).*/\\>\$1/;' $accSeq);
            system $cmd2;
        }
    }

    #Return files to be aligned
    return [$tmpFile, $accSeq];
}
	211	211
	212	212
	213	213
	214	214
#===========================================================================	215	215	#===========================================================================
#Read command line and print help	216	216	#Read command line and print help
	217	217
	218	218
sub read_command_line {	219	219	sub read_command_line {
	220	220
print_help() unless (@ARGV);	221	221	print_help() unless (@ARGV);
	222	222
my $status = GetOptions(	223	223	my $status = GetOptions(
"i\|acc-file=s" => \&read_accFile,	224	224	"i\|acc-file=s" => \&read_accFile,
"a\|accession=s" => \&read_accession,	225	225	"a\|accession=s" => \&read_accession,
"f\|fragment=s" => \&read_fragment,	226	226	"f\|fragment=s" => \&read_fragment,
"o\|outdir=s" => \$outdir,	227	227	"o\|outdir=s" => \$outdir,
"bdb\|blastdb=s" => \&read_blastdb,	228	228	"bdb\|blastdb=s" => \&read_blastdb,
"e\|evalue=f" => \$evalue,	229	229	"e\|evalue=f" => \$evalue,
"m\|sub-matrix=s" => \&read_subMatrix,	230	230	"m\|sub-matrix=s" => \&read_subMatrix,
"t\|interactive!" => \$interactive,	231	231	"t\|interactive!" => \$interactive,
"q\|quiet!" => \$quiet,	232	232	"q\|quiet!" => \$quiet,
"h\|help" => sub { print_help(); },	233	233	"h\|help" => sub { print_help(); },
"<>" => sub { die "Error: Unknown argument: $_[0]\n"; });	234	234	"<>" => sub { die "Error: Unknown argument: $_[0]\n"; });
exit unless ($status);	235	235	exit unless ($status);
	236	236
	237	237
die "Error: option -f is mandatory." unless ($fragment);	238	238	die "Error: option -f is mandatory." unless ($fragment);
die "Error: options -i or -a are mandatory." unless ($accession \|\| $accFile);	239	239	die "Error: options -i or -a are mandatory." unless ($accession \|\| $accFile);
die "Error: flags -t and -q cannot be set at the same time!" if ($quiet && $interactive);	240	240	die "Error: flags -t and -q cannot be set at the same time!" if ($quiet && $interactive);
	241	241
#Default value for output directory	242	242	#Default value for output directory
$outdir = "." unless ($outdir);	243	243	$outdir = "." unless ($outdir);
system "mkdir -p $outdir" unless (-d $outdir);	244	244	system "mkdir -p $outdir" unless (-d $outdir);
	245	245

#!/usr/bin/env perl -w	1	1	#!/usr/bin/env perl
	2	2
use warnings;	3	3	no warnings;
use strict;	4	4	use strict;
use Data::Dumper;	5	5	use Data::Dumper;
	6	6
$Data::Dumper::Deepcopy = 1;	7	7	use TCDB::Repeats;
$Data::Dumper::Indent = 1;	8
#$Data::Dumper::Purity = 0;	9
$Data::Dumper::Sortkeys = 1;	10
	11
use Getopt::Long;	12	8	use Getopt::Long;
use Bio::SearchIO;	13
use Bio::SeqIO;	14
	15	9
	16	10
use TCDB::CheckDependencies;	17	11	my $seqsDir = undef; #'/Users/amedrano/Desktop/Mai_tmsRepeat/sequences';
use TCDB::Assorted;	18	12	my $seqsFile = undef;
		13	my $tmsFile = undef; #'/Users/amedrano/Desktop/Mai_tmsRepeat/tms.hmmtop';
		14	my $outDir = "Repeats"; #'/Users/amedrano/Desktop/Mai_tmsRepeat/RepeatUnits/ResultsOOP';
	19	15
		16	my $evalue = 1e-2;
		17	my $coverage = 0.85;
		18	my $identity = 0.2;
	20	19
		20	my @tmsRanges = ();
	21	21
#==========================================================================	22	22	read_command_line();
#Check dependencies	23
	24	23
my @dependencies = ("water", "ssearch36", "extractFamily.pl", "tmsplit", "quod.py");	25
my $CheckDep_obj = new TCDB::CheckDependencies();	26
$CheckDep_obj -> dependencies_list(\@dependencies);	27
$CheckDep_obj -> checkDependencies;	28
	29	24
	30	25	#print Data::Dumper->Dump([$seqsDir, $seqsFile, $tmsFile, $outDir],
	31	26	# [qw(seqsDir seqsFile tmsFile outDir )]);
#==========================================================================	32
#Read command line options	33
	34
my $gs_infile = "";	35
my $infileFmt = "hmmtop"; #The other option is 'tms' which is the ID and TMS	36
my $gs_idFormat = "";	37
my $gs_repUnit = 0;	38
my $gs_seqDir = "";	39
my $gs_tail = 5;	40
my $gs_evalue = 0.1;	41
my $gs_coverage = 0.8;	42
my $gs_identity = 0.25;	43
my $gsatShuffles = 1000;	44
my $min_gsat_score = 4.0;	45
	46
my $compStatsFlag = 1;	47
my $compStats = "";	48
my $outdir = "repeats";	49
my $repDir = "reports";	50
my $seqDir = "sequences";	51
my $alignDir = "alignments";	52
my $plotsDir = "plots";	53
my $goodHitsOnly = 1; #print only significant results, ignore everything else	54
	55
	56
#all (all sequences in output file)	57
#each (generate one directory per sequence.. for better organization)	58
#debug (it will print the contents of the hash table one sequences at a time)	59
my $mode = "all";	60
	61
read_command_line_arguments();	62
	63
#print Data::Dumper->Dump([$gs_infile, $gs_idFormat, $gs_repUnit, $gs_seqDir,	64
# $gs_tail, $gs_evalue, $gs_coverage, $gs_identity, $gsatShuffles, $compStatsFlag, $compStats],	65
# [qw(infile idFormat repUnit seqDir tail evalue	66
# coverage identity $gsatShuffles compStatFlag compStats)]);	67
#exit;	68	27	#exit;
	69	28
# ssearch36 -p -k 1000 -z 11 -E 1.0 -s BL62 -W 0 4.B.1_4tms_all/sequences/4.B.1.1.2-Q4QLL1_bundle1.faa 4.B.1_4tms_all/sequences/lib_4.B.1.1.2-Q4QLL1_bundle1.faa	70
	71	29
		30	#my $repObj = TCDB::Repeat->new('seqsDir' => $seqsDir,
		31	# 'tmsFile' => $tmsFile,
		32	# 'outDir' => $outDir,
		33	# 'ranges2searchTMS' => \@TMSranges);
	72	34
#==========================================================================	73
#Read file with coordinates of TMSs and verify that the sequences are	74
#available	75
	76	35
my %gh_tms = ();	77	36	my @TMSranges = ([1, 3], [4, 6]);
	78	37
read_tms_coordinates_file($gs_infile, \%gh_tms);	79	38	my $repObj = TCDB::Repeat->new();
	80	39
#print Data::Dumper->Dump([ \%gh_tms], [qw(*tms )]);	81	40	#$repObj->tmsFile($tmsFile);
#exit;	82	41	#$repObj->seqsDir($seqsDir);
		42	$repObj->seqsFile($seqsFile);
		43	$repObj->outDir($outDir);
		44	$repObj->evalueCutoff($evalue);
		45	$repObj->identityCutoff($identity);
		46	$repObj->coverageCutoff($coverage);
		47	$repObj->TMSranges2search(\@TMSranges);
	83	48
		49	$repObj-> findRepeatsTMSranges();
	84	50
#===========================================================================	85	51	#print Data::Dumper->Dump([$repObj ], [qw(*repObj)]);
#Main Output directory	86
	87	52
#Root directory for all results	88
system "mkdir -p $outdir" unless (-d $outdir);	89
die "Could not generate output directory: $outdir" unless (-d $outdir);	90
	91	53
	92	54
	93	55
#==========================================================================	94
#Search for repeats inside query sequences	95
	96	56
my %results = ();	97
my %origSeqLength = (); #To calculate x-ticks spacing in hydropathy plots	98
	99
foreach my $ls_sid (keys %gh_tms) {	100
	101
my %gh_bundleSeqs = ();	102
my %gh_topHits = ();	103
	104
	105
print "Processing: $ls_sid\n";	106
	107
	108
#Clean results if one output directory is generated per input sequence	109
%results = () if ($mode eq 'each');	110
	111
	112
#Cut sequences in non overlaping regions with as many TMS as the	113
#repeat unit we want to find.	114
cut_seq_in_tms_regions ($ls_sid, $gs_repUnit, \%gh_tms, \%gh_bundleSeqs);	115
	116
	117
# print Data::Dumper->Dump([\%gh_bundleSeqs ], [qw(*bundleSeqs)]);	118
# <STDIN>;	119
	120
	121
#run ssearch to find potential repeats.	122
align_bundles($ls_sid,\%gh_bundleSeqs, \%gh_topHits);	123
	124
	125
# print Data::Dumper->Dump([\%gh_topHits ], [qw(*topHits )]);	126
# <STDIN>;	127
	128
	129
#Collect results for final table	130
$results{$ls_sid} = \%gh_topHits;	131
	132
#present results per input sequence to verify everything looks fine.	133
if ($mode eq 'debug') {	134
print Data::Dumper->Dump([\%gh_topHits], [qw(*topHits)]);	135
<STDIN>;	136
}	137
	138
print_reports(\%results) if ($mode eq 'each');	139
}	140
	141
	142
	143
	144
	145
#===========================================================================	146	57	#===========================================================================
#Print final results in summarized or detailed format	147	58	#Read command line and print help
	148	59
#print Data::Dumper->Dump([\%results ], [qw(*results )]);	149
#<STDIN>;	150
	151	60
print_reports(\%results) if ($mode eq 'all');	152	61	sub read_command_line {
	153	62
		63	print_help() unless (@ARGV);
	154	64
		65	my $status = GetOptions(
		66	"s\|seqs-file=s" => \&read_seqsFile,
		67	"d\|seqs-dir=s" => \&read_seqsDir,
		68	"o\|outdir=s" => \&read_outdir,
		69	"t\|tms=s" => \&read_tmsFile,
		70	"e\|evalue=f" => \$evalue,
		71	"i\|identity=f" => \$identity,
		72	"c\|coverage=f" => \$coverage,
		73	"h\|help" => sub { print_help(); },
		74	"<>" => sub { die "Error: Unknown argument: $_[0]\n"; });
		75	exit unless ($status);
	155	76
	156	77
	157	78	#Validadte input file option
###########################################################################	158	79	die "Error: no sequence file detected!" unless ($seqsFile);
# #	159
# Subroutine definition #	160
# #	161
###########################################################################	162
	163
	164
#print final_report	165
	166
sub print_reports {	167
	168
my $res = shift;	169
	170
	171
#Get the directory where reports will be saved	172
my $reportDir = undef;	173
if ($mode eq 'all') {	174
$reportDir = getReportsDir();	175
}	176
else {	177
	178
#one id per report	179
my @ids = keys %$res;	180
my $seqId = $ids[0];	181
	182
$reportDir = getReportsDir($seqId);	183
}	184
die "Error: invalid report dir" unless ($reportDir);	185
	186
	187
my $sumFile = "$reportDir/repeats_summary_report.txt";	188
my $detailsFile = "$reportDir/repeats_detailed_report.txt";	189
my $htmlFile = "$reportDir/report.html";	190
	191
	192
open (my $htmlfh, ">", $htmlFile) \|\| die $!;	193
	194
my $htmlHeader = <<HEADER;	195
<!DOCTYPE html>	196
<html xmlns="http://www.w3.org/1999/xhtml">	197
<head>	198
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />	199
	200
<style type="text/css">	201
	202
.label {	203
text-align: right;	204
width: 50px;	205
}	206	80	}
	207	81
.data {	208
text-align: left;	209
padding-left: 8px;	210
width: 100px;	211
}	212
	213	82
.uline {	214
text-decoration: underline;	215
}	216
	217
.seq {	218
border: 2px solid black;	219
height: 70px;	220
width: 100%;	221
overflow-x: auto;	222
overflow-y: hidden;	223
margin: 1em 0;	224
background: gray;	225
color: white;	226
}	227
	228
img {	229
display: block;	230
margin-left: auto;	231
margin-right: auto;	232
height: 250px;	233
width: auto;	234
max-width: 1500px;	235
max-height: 300px;	236
}	237
	238
</style>	239
<title>Inferring repeats of $gs_repUnit TMS</title>	240
</head>	241
<br />	242
<h1 style='text-align:center'>Inferred Repeats Based On ${gs_repUnit}-TMS Bundles</h1>	243
<body>	244
	245
HEADER	246
	247
print $htmlfh $htmlHeader;	248
open (my $sumh, ">", $sumFile) \|\| die $!;	249
open (my $deth, ">", $detailsFile) \|\| die $!;	250
	251
	252
#Header for summary table	253
print $sumh "#Accession\tQ_bundle\tS_bundle\tQ_len\tS_len\tE-value\tIdentity\tGSAT\tAln_len\tQ_cov\tS_cov\n";	254
	255
	256
# print Data::Dumper->Dump([$res ], [qw(*res )]);	257
# <STDIN>;	258
	259
	260
P:foreach my $id (sort {$a cmp $b} keys %$res) {	261
	262
#Jump to next result if there are NO hits for this protein and	263
#ONLY good hits are going to be recorded.	264
unless (%{ $res->{$id} }) {	265
next P if ($goodHitsOnly);	266
}	267
	268
	269
print $deth "===========================================================================\n";	270
print $htmlfh " <br /><hr style=\"border-style:solid; border-width:5px; color:black;\"/>\n";	271
	272
#There must be results to continue	273
unless (%{ $res->{$id} }) {	274
print $sumh "$id\tNo_hits\n";	275
print $deth "$id\tNo_hits\n\n\n";	276
print $htmlfh " <h2 style=\"text-align:center;\">$id</h2>\n <p><b>No candidate repeats found</b></p>\n";	277
}	278
	279
print $deth "$id\n\n";	280
print $htmlfh " <h2 style=\"text-align:center;\">$id</h2>\n";	281
	282
	283
	284
	285
#get the long bundle names	286
BS:foreach my $bundleName (sort {$a cmp $b} keys %{ $res->{$id} }) {	287
	288
BN:foreach my $bundleNumber (sort {$a <=> $b} keys %{ $res->{$id}->{$bundleName} }) {	289
	290
my $qName = $res->{$id}->{$bundleName}->{$bundleNumber}->{qName};	291
my $qLen = $res->{$id}->{$bundleName}->{$bundleNumber}->{qLen};	292
	293
#Each of the hits for this bundle	294
my @hits_tmp = @{ $res->{$id}->{$bundleName}->{$bundleNumber}->{hits} };	295
	296
#To get rid of a warning when there is only one hit.	297
my @hits = (scalar (@hits_tmp) > 1)?	298
sort {$a->{hName} cmp $b->{hName}} @hits_tmp : @hits_tmp;	299
	300
foreach my $hit (@hits) {	301
	302
my $hName = $hit->{hName};	303
my $hLen = $hit->{hLen};	304
	305
my $evalue = sprintf ("%.1e", $hit->{hEvalue});	306
my $ident = sprintf ("%.1f", $hit->{hId} * 100);	307
my $sim = sprintf ("%.1f", $hit->{hSim} * 100);	308
my $gsat = sprintf ("%.1f", $hit->{gsat});	309
	310
my $alnLen = $hit->{alnLen};	311
my $qCov = sprintf("%.1f", $hit->{qCov} * 100);	312
my $hCov = sprintf("%.1f", $hit->{hCov} * 100);	313
	314
	315
#The alignment	316
my $qstart = $hit->{qstart};	317
my $qend = $hit->{qend};	318
my $sstart = $hit->{sstart};	319
my $send = $hit->{send};	320
my $qSeq = $hit->{qSeq};	321
my $homStr = $hit->{homStr};	322
my $sSeq = $hit->{sSeq};	323
	324
my $plot = $hit->{plot};	325
	326
#For summary tab-delimitedfile (everything except the alignment)	327
print $sumh "$id\t$qName\t$hName\t$qLen\t$hLen\t$evalue\t$ident\t$gsat\t$alnLen\t$qCov\t$hCov\n";	328
	329
	330
#Detailed report that includes the alignment	331
print $deth "----------\n";	332
print $deth "$qName ($qLen) vs $hName ($hLen)\n\n";	333
print $deth "E-value: $evalue Identity: ${ident}% GSAT: $gsat\n";	334
print $deth "Q_cov: ${qCov}% S_cov: ${hCov}% Aln_length: $alnLen\n\n";	335
print $deth "Alignment ($qName\|${qstart}-$qend vs $hName\|${sstart}-$send):\n$qSeq\n$homStr\n$sSeq\n\n\n";	336
	337
	338
#The HTML report (includes alignment and hydropathy image	339
my $repHit = <<HIT;	340
	341
<p><b>$qName ($qLen) vs $hName ($hLen)</b></p>	342
	343
<table width="600px" border="0" cellspacing="0" cellpadding="2">	344
<tr>	345
<td class='label'><b>E-value:</b></td>	346
<td class='data'>$evalue</td>	347
<td class='label'><b>Identity:</b></td>	348
<td class='data'>${ident}%</td>	349
<td class='label'><b>Similarity:</b></td>	350
<td class='data'>${sim}%</td>	351
<td class='label'><b>GSAT:</b></td>	352
<td class='data'>$gsat</td>	353
</tr>	354
<tr>	355
<td class='label'><b>Aln:</b></td>	356
<td class='data'>$alnLen</td>	357
<td class='label'><b>Q_cov:</b></td>	358
<td class='data'>${qCov}%</td>	359
<td class='label'><b>S_cov:</b></td>	360
<td class='data'>${hCov}%</td>	361
<td class='label'></td>	362
<td class='data'></td>	363
</tr>	364
</table>	365
	366
<p><b>Alignment (</b>$qName:<b class="uline">${qstart}-$qend</b> vs $hName:<b class="uline">${sstart}-$send</b><b>):</b></p>	367
<div class='seq'>	368
<pre>	369
$qSeq	370
$homStr	371
$sSeq	372
</pre>	373
</div>	374
<a href="$plot" target="_blank"><img src="$plot"/></a>	375
<br />	376
<hr />	377
	378
HIT	379
	380
print $htmlfh $repHit;	381
	382
} #hit	383
} #reference bundle number	384
} #Reference bundle name	385
} #Query protein	386
	387
#Close HTML report	388
my $closeRep = <<CLOSE;	389
</body>	390
</html>	391
CLOSE	392
	393
print $htmlfh $closeRep;	394
	395
close $sumh;	396
close $deth;	397
close $htmlfh;	398
}	399
	400
	401
	402
#==========================================================================	403	83	#==========================================================================
#Run ssearch36 between the different bundles in a sequence	404	84	#Option -s
	405	85
sub align_bundles {	406	86	sub read_seqsFile {
		87	my ($opt, $value) = @_;
	407	88
my ($seqId, $lhr_bundleSeqFiles, $lhr_topHits) = @_;	408	89	unless (-f $value && !(-z $value)) {
	409	90	die "Error: file with sequences does not exist or is empty!\n";
%$lhr_topHits = ();	410
	411
#Directory where the sequences of TMS bundles are saved	412
my $sequencesDir = undef;	413
my $alignmentsDir = undef;	414
my $hydroPlotsDir = undef;	415
	416
if ($mode eq 'all') {	417
$sequencesDir = getSequencesDir();	418
$alignmentsDir = getAlignmentsDir();	419
$hydroPlotsDir = getPlotsDir();	420
}	421	91	}
else {	422
$sequencesDir = getSequencesDir($seqId);	423
$alignmentsDir = getAlignmentsDir($seqId);	424
$hydroPlotsDir = getPlotsDir($seqId);	425
}	426
die "Error: invalid sequences dir" unless ($sequencesDir);	427
die "Error: invalid alignments dir" unless ($alignmentsDir);	428
die "Error: invalid plots dir" unless ($hydroPlotsDir);	429
	430	92
	431	93	$seqsFile = $value;
# print Data::Dumper->Dump([$lhr_bundleSeqFiles ], [qw(*files )]);	432
# <STDIN>;	433
	434
	435
#The bundle that will be used as reference for the comparison	436
REF:foreach my $bundle (sort {$a <=> $b} keys %$lhr_bundleSeqFiles) {	437
	438
my $rFile = "$sequencesDir/" . $lhr_bundleSeqFiles->{$bundle}->[0];	439
	440
	441
#Id to name ssearch36 output files	442
my $id = $lhr_bundleSeqFiles->{$bundle}->[0];	443
$id =~ s/\.faa//;	444
	445
	446
#For naming GSAT files (ID of system or protein accession)	447
my $tcAcc = ($id =~ /(\S+)_bundle.*/)? $1 : undef;	448
die "Could not extract accession from $id!" unless ($id);	449
	450
	451
# print Data::Dumper->Dump([$id, $tcAcc ], [qw(id tcAcc)]);	452
# <STDIN>;	453
	454
	455
#--------------------------------------------------------------------	456
#Get the non-overlapping bundles to compare them against the	457
#reference bundle	458
	459
my @cmpFiles = ();	460
	461
#Initialize the index to the first non-overlapping bundle	462
my $next_bundle_idx = $bundle + $gs_repUnit;	463
	464
CMP:while (1) {	465
	466
#Exit if next bundle is not in bundles hash	467
last CMP unless (exists $lhr_bundleSeqFiles->{$next_bundle_idx});	468
	469
#Get file name for this non-overlapping bundle	470
my $cmpBundle = $sequencesDir . "/" . $lhr_bundleSeqFiles->{$next_bundle_idx}->[0];	471
push (@cmpFiles, $cmpBundle);	472
	473
#Update the index to the next non-overlapping bundle	474
$next_bundle_idx = $next_bundle_idx + $gs_repUnit;	475
}	476
	477
#go to next reference bundle if there are no non-overlapping bundles.	478
next REF unless (@cmpFiles);	479
	480
	481
# print Data::Dumper->Dump([\@cmpFiles ], [qw(*cmpFiles )]);	482
# <STDIN>;	483
	484
	485
#--------------------------------------------------------------------	486
#Now run ssearch36 of the reference bundle against all its	487
#non-overlapping bundles	488
	489
#put all non-overlapping bundles into a file	490
my $libFile = "$sequencesDir/lib_$id.faa";	491
my $cmd = "cat " . join(" ", @cmpFiles) . " > $libFile";	492
system $cmd;	493
	494
	495
#run ssearch36 of $rFile vs @cmpFile	496
my $ssearchOut = "$alignmentsDir/ssearch_$id.out";	497
my $ssearch_params = qq(-p $compStats -E $gs_evalue -s BL62 -W 0 $rFile $libFile > $ssearchOut);	498
system "ssearch36 $ssearch_params" unless (-f $ssearchOut);	499
	500
	501
# print Data::Dumper->Dump([$ssearchOut ], [qw(*ssearchOut )]);	502
# <STDIN>;	503
	504
	505
#---------------------------------------------------------------------	506
#Estimate here the spacing between x-ticks for hydropathy plots	507
	508
my $protLen = $origSeqLength{$seqId};	509
	510
my $xticksSpacing = undef;	511
if ($protLen <= 500) {	512
$xticksSpacing = 25;	513
}	514
elsif ($protLen <= 1000) {	515
$xticksSpacing = 50;	516
}	517
else {	518
$xticksSpacing = 100;	519
}	520
	521
	522
	523
#--------------------------------------------------------------------	524
#parse ssearch36 output. For BioPerl resouces check:	525
#http://search.cpan.org/dist/BioPerl/Bio/SearchIO.pm	526
#https://classes.soe.ucsc.edu/bme060/Winter07/bptutorial.html	527
	528
my $parser = new Bio::SearchIO (-format => 'fasta', -file => $ssearchOut);	529
	530
	531
#put hir the top hits	532
my %lh_hits = ();	533
	534
	535
while (my $result = $parser->next_result) {	536
	537
	538
my $qLen = $result->query_length;	539
$lh_hits{$bundle}{qName} = $result->query_name;	540
$lh_hits{$bundle}{qLen} = $qLen;	541
$lh_hits{$bundle}{hits} = [];	542
	543
	544
HIT:while (my $hit = $result->next_hit) {	545
	546
HSP:while(my $hsp = $hit->next_hsp) {	547
	548
	549
# print Data::Dumper->Dump([$hsp ], [qw(*hsp )]);	550
# <STDIN>;	551
	552
	553
my %tmp = ();	554
	555
my $alnLen = $hsp->hsp_length;	556
my $hLen = $hit->length;	557
my $hEvalue = $hsp->evalue;	558
my $hId = $hsp->frac_identical('total'); #identity in the alignment	559
my $hSim = $hsp->frac_conserved('total'); #similarity in the alignment	560
	561
	562
#coordinates in the alignment to properly calculate coverages	563
my $qstart = $hsp->start('query');	564
my $qend = $hsp->end('query');	565
my $sstart = $hsp->start('subject');	566
my $send = $hsp->end('subject');	567
	568
	569
#Calculate coverages properly (do not use alignment length as it includes gaps	570
	571
my $qCov_tmp = ($qend - $qstart + 1) / $qLen;	572
my $qCov = ($qCov_tmp > 1.0)? 1.0 : $qCov_tmp;	573
	574
my $hCov_tmp = ($send - $sstart + 1) / $hLen;	575
my $hCov = ($hCov_tmp > 1.0)? 1.0 : $hCov_tmp;	576
	577
	578
# print Data::Dumper->Dump([$qLen, $qCov, $hLen, $hCov, $gs_coverage, $hEvalue, $gs_evalue, $hId, $gs_identity],	579
# [qw(qLen qCov $hLen hCov coverageCutoff evalue evalCutoff hId IDcutoff)]);	580
# <STDIN>;	581
	582
	583
#Before storing hit results check minimum coverage, identity and evalue	584
next HSP unless (($qCov >= $gs_coverage \|\| $hCov >= $gs_coverage) &&	585
($hEvalue <= $gs_evalue) && ($hId >= $gs_identity));	586
	587
	588
#hit identity	589
$tmp{hName} = $hit->name;	590
$tmp{hLen} = $hLen;	591
	592
	593
#hit statistics	594
$tmp{alnLen} = $alnLen;	595
$tmp{hEvalue} = $hEvalue;	596
$tmp{hId} = $hId;	597
$tmp{hSim} = $hSim;	598
$tmp{qCov} = $qCov;	599
$tmp{hCov} = $hCov;	600
	601
	602
#The alignment	603
$tmp{qstart} = $qstart;	604
$tmp{qend} = $qend;	605
$tmp{sstart} = $sstart;	606
$tmp{send} = $send;	607
	608
$tmp{qSeq} = $hsp->query_string;	609
$tmp{sSeq} = $hsp->hit_string;	610
$tmp{homStr} = $hsp->homology_string;	611
	612
	613
#Get the GSAT score	614
my $gsat_outFile = "$alignmentsDir/${tcAcc}_" . $lh_hits{$bundle}{qName} . "_vs_" . $tmp{hName} . ".gsat";	615
	616
	617
# print "gsat.py $tmp{qSeq} $tmp{sSeq} $gsatShuffles > $gsat_outFile\n";	618
# exit;	619
	620
system "gsat.py $tmp{qSeq} $tmp{sSeq} $gsatShuffles > $gsat_outFile" unless (-f $gsat_outFile);	621
	622
my $gsat_score = TCDB::Assorted::get_gsat_score ($gsat_outFile);	623
$tmp{gsat} = $gsat_score;	624
	625
	626
# print Data::Dumper->Dump([\%tmp ], [qw(*matchData )]);	627
# <STDIN>;	628
	629
	630
#GSAT is the last filter	631
next HSP unless ($gsat_score >= $min_gsat_score);	632
	633
#------------------------------------------------------------	634
#Generate quod plot with the repeat	635
	636
my $whole_prot_seq = "$gs_seqDir/${seqId}.faa";	637
die "Protein sequence not found: $whole_prot_seq" unless (-f $whole_prot_seq);	638
	639
	640
my $plotFile = "$hydroPlotsDir/${seqId}_" . $lh_hits{$bundle}{qName} . "_vs_" . $tmp{hName};	641
my $fileName = "../$plotsDir/${seqId}_" . $lh_hits{$bundle}{qName} . "_vs_" . $tmp{hName} . ".png";	642
my $plotTitle = $lh_hits{$bundle}{qName} . " vs " . $tmp{hName};	643
	644
#Get hydrophobic peaks coords	645
my $hydroPeaks = $gh_tms{$seqId};	646
die "No hydrophobic peaks found for sequence: $seqId" unless (@{ $hydroPeaks });	647
	648
	649
#format the hydrophobic peaks for quod	650
my @peaks = map { join ("-", @$_) . ":orange" } @$hydroPeaks;	651
my $pstring = join (" ", @peaks);	652
	653
	654
#----------	655
#Calculate the positions of the aligned section of each bundle in the full sequence.	656
	657
my $q_bid = ($lh_hits{$bundle}{qName} =~ /BDL(\d+)/)? $1 : undef;	658
my $s_bid = ( $tmp{hName} =~ /BDL(\d+)/)? $1 : undef;	659
die "Could not extract bundle number for: $lh_hits{$bundle}{qName} or $tmp{hName}" unless ($q_bid && $s_bid);	660
	661
	662
#extract initial positions for both bundles	663
my $qbstart = $lhr_bundleSeqFiles->{$q_bid}->[1];	664
my $qbend = $lhr_bundleSeqFiles->{$q_bid}->[2]; #$qLen - 1;	665
my $sbstart = $lhr_bundleSeqFiles->{$s_bid}->[1];	666
my $sbend = $lhr_bundleSeqFiles->{$s_bid}->[2]; #$hLen - 1;	667
die "Could not extract coords for bundle $q_bid" unless ($qbstart && $qbend);	668
die "Could not extract coords for bundle $s_bid" unless ($sbstart && $sbend);	669
	670
	671
#Calculate bundle positions here	672
my $qgp_start = $qbstart + ($qstart - 1);	673
my $qgp_end = $qbstart + ($qend - 1);	674
	675
my $sgp_start = $sbstart + ($sstart - 1);	676
my $sgp_end = $sbstart + ($send - 1);	677
	678
	679
#Format the coordinates for the repeats now	680
my $qrep = "${qgp_start}-${qgp_end}:green";	681
my $srep = "${sgp_start}-${sgp_end}:blue";	682
	683
#Format the coordinates for the bar delimiting the bundles	684
my $bars = "-w ${qbstart}-${qbend}::1 ${sbstart}-${sbend}::1";	685
	686
#The quod command line	687
my $cmd = "quod.py $whole_prot_seq -t png -l '$plotTitle' -o $plotFile -q -r 80 $bars --xticks $xticksSpacing -nt +0 -at ${pstring} ${qrep} ${srep}";	688
	689
my $img = "${plotFile}.png";	690
system $cmd unless (-f $img);	691
die "Could not generate plot: $img" unless (-f $img);	692
	693
$tmp{plot} = $fileName;	694
	695
	696
#load the data into the hits section for this bundle	697
push (@{ $lh_hits{$bundle}{hits} }, \%tmp);	698
	699
	700
} #HSP	701
} #HIT	702
} #While	703
	704
	705
#Add results to the topHits hash	706
if (@{ $lh_hits{$bundle}{hits} }) {	707
$lhr_topHits->{$id} = \%lh_hits;	708
}	709
	710
}	711
}	712	94	}
	713	95
	714	96
	715
	716
#==========================================================================	717	97	#==========================================================================
#Given a sequence, its TMS coordinates and a repeat size (rsize), cut the	718	98	#Option -t
#sequence in TMS bundles of length rsize.	719
	720	99
		100	sub read_tmsFile {
		101	my ($opt, $value) = @_;
	721	102
sub cut_seq_in_tms_regions {	722	103	unless (-f $value && !(-z $value)) {
	723	104	die "Error in option -t: File with TMSs (hhmtop output) does not exist or is empty!\n";
my ($ls_pid, $ls_repeat, $lhr_tms, $lhr_seqSegs) = @_;	724
	725
	726
%$lhr_seqSegs = ();	727
	728
	729
#Get the directory where bundle sequences will be saved	730
my $sequencesDir = undef;	731
	732
if ($mode eq 'all') {	733
$sequencesDir = getSequencesDir();	734
}	735	105	}
else {	736
$sequencesDir = getSequencesDir($ls_pid);	737
}	738
die "Error: invalid sequence dir" unless ($sequencesDir);	739
	740	106
	741	107	$tmsFile = $value;
#----------------------------------------------------------------------	742
#Get the coordinates of the overlapping bundles	743
	744
my @la_tms = @{ $lhr_tms->{$ls_pid} };	745
	746
	747
	748
#Get the Length of the sequence of the query protein	749
my $seqFile = "$gs_seqDir/${ls_pid}.faa";	750
my $obj = Bio::SeqIO->new(-file => $seqFile , -format => "fasta");	751
my $seqObj = $obj->next_seq;	752
my $qlength = $seqObj->length;	753
die "Could not extract protein length." unless ($qlength);	754
	755
#Store the length of the original sequence for proper calculation of	756
#the x-ticks in the hydropathy plots of the results	757
$origSeqLength{$ls_pid} = $qlength;	758
	759
	760
	761
#Number of TMS in protein	762
my $ls_ntms = scalar (@la_tms);	763
	764
	765
	766
for (my $idx=1; $idx <= ($ls_ntms - $ls_repeat + 1); $idx++) {	767
	768
#TMS in bundle	769
my $left_tms = $la_tms[$idx - 1];	770
my $right_tms = $la_tms[$idx + $ls_repeat - 2];	771
	772
	773
#The coordinates of the bundle	774
my $left_pos = (($left_tms->[0] - $gs_tail) <= 0)? 1 : $left_tms->[0] - $gs_tail;	775
#my $right_pos = (($right_tms->[1] + $gs_tail) >= $qlength)? $right_tms->[1] : $right_tms->[1] + $gs_tail;	776
my $right_pos = (($right_tms->[1] + $gs_tail) >= $qlength)? $qlength - 1 : $right_tms->[1] + $gs_tail;	777
	778
	779
#Cut and name the bundles only if bundle file does not exist	780
my $outfile = "${ls_pid}_bundle${idx}";	781
unless (-f "$sequencesDir/${outfile}.faa") {	782
	783
#cutting bundle	784
my $args = qq(-if $seqFile -od $sequencesDir -of $outfile -rangeCut -s $left_pos -e $right_pos -t 0);	785
system "tmsplit $args > /dev/null";	786
	787
#replace protein ID with bundle number to the ID so alignments can be easily identified	788
system qq(perl -i -pe 's/>\\S+/>BDL$idx/' $sequencesDir/${outfile}.faa);	789
}	790
	791
$lhr_seqSegs->{$idx} = ["${outfile}.faa", $left_pos, $right_pos];	792
}	793
}	794	108	}
	795	109
	796	110
	797
	798
#==========================================================================	799	111	#==========================================================================
#Read file with the TMS coordinates of the input proteins. The TMS	800	112	#Option -d
#must have been validated with WHAT to make sure they are reliable.	801
	802	113
		114	sub read_seqsDir {
		115	my ($opt, $value) = @_;
	803	116
sub read_tms_coordinates_file {	804	117	die "Error: directory with sequences does not exist." unless (-d $value);
	805	118
my ($s_coordsFile, $hr_tms) = @_;	806	119	$seqsDir = $value;
	807
open (my $fileh, "<", $s_coordsFile) \|\| die $!;	808
	809
#-----------------------------------------------------------------	810
#The format of this file is protein ID followed by pairs of	811
#coordinates separated by dash:	812
# 2.A.43.1.1-O60931 1-20 25-35 50-68 ....	813
if ($infileFmt eq 'tms') {	814
	815
while(<$fileh>) {	816
chomp;	817
	818
#ignore empty lines;	819
next unless ($_);	820
	821
#extract id and TMSs coordinates	822
my ($id, @tms_str) = split(/\s+/, $_);	823
my @tms = map { [ split(/-/, $_) ] } @tms_str;	824
	825
	826
#For debugging purposes	827
# next unless ($id eq 'WP_100644534');	828
	829
	830
$hr_tms->{$id} = \@tms;	831
	832
#Verify that the sequence is available for this protein	833