Commit 9f332056e88eedaa57bcc71267eb489bd80226c2
1 parent
b344449ca4
Exists in
master
Fixed annoying bug where the origial input file(s) to getDomainTopology.pl was(were) modified
Showing 1 changed file with 11 additions and 3 deletions Inline Diff
getDomainTopology.pl
View file @
9f33205
#!/usr/bin/env perl -w | 1 | 1 | #!/usr/bin/env perl -w | |
2 | 2 | |||
use strict; | 3 | 3 | use strict; | |
use warnings; | 4 | 4 | use warnings; | |
use Data::Dumper; | 5 | 5 | use Data::Dumper; | |
#use List::Util qw(sum); | 6 | 6 | #use List::Util qw(sum); | |
7 | 7 | |||
use TCDB::Assorted; | 8 | 8 | use TCDB::Assorted; | |
use TCDB::Domain::PfamParser; | 9 | 9 | use TCDB::Domain::PfamParser; | |
use TCDB::Domain::Characterize; | 10 | 10 | use TCDB::Domain::Characterize; | |
11 | 11 | |||
use Getopt::Long; | 12 | 12 | use Getopt::Long; | |
13 | 13 | |||
#========================================================================== | 14 | 14 | #========================================================================== | |
#Global variables | 15 | 15 | #Global variables | |
16 | 16 | |||
#Query family or families | 17 | 17 | #Query family or families | |
my @fams = (); | 18 | 18 | my @fams = (); | |
19 | 19 | |||
#This is an option for TCDB::Assorted::getSystemAccessions() | 20 | 20 | #This is an option for TCDB::Assorted::getSystemAccessions() | |
my $treatAsSuperfamily = 0; | 21 | 21 | my $treatAsSuperfamily = 0; | |
22 | 22 | |||
#Options for TCDB::Domain::PfamParser | 23 | 23 | #Options for TCDB::Domain::PfamParser | |
my $domain_cov = 0.7; | 24 | 24 | my $domain_cov = 0.7; | |
my $prot_cov = 0.1; | 25 | 25 | my $prot_cov = 0.1; | |
my $evalue = 1e-5; | 26 | 26 | my $evalue = 1e-5; | |
my $prop_prots_w_domain = 0.05; | 27 | 27 | my $prop_prots_w_domain = 0.05; | |
28 | 28 | |||
#Options for TCDB::Domain::Characterize | 29 | 29 | #Options for TCDB::Domain::Characterize | |
my $rootDir = "."; | 30 | 30 | my $rootDir = "."; | |
31 | 31 | |||
#To extract the TCIDs of refernece families | 32 | 32 | #To extract the TCIDs of refernece families | |
my $tcdbSeqsFile = "$ENV{RESEARCH_DATA}/pfam/download/tcdb.faa"; | 33 | 33 | my $tcdbSeqsFile = "$ENV{RESEARCH_DATA}/pfam/download/tcdb.faa"; | |
my $pfamFile = "$ENV{RESEARCH_DATA}/pfam/tcdb.pfam-a.hmmscan.bz2"; | 34 | 34 | my $pfamFile = "$ENV{RESEARCH_DATA}/pfam/tcdb.pfam-a.hmmscan.bz2"; | |
my $blastdb = "$ENV{HOME}/db/blastdb/tcdb"; | 35 | 35 | my $blastdb = "$ENV{HOME}/db/blastdb/tcdb"; | |
my $prog = "ssearch36"; | 36 | 36 | my $prog = "ssearch36"; | |
my @candProjProts = (); | 37 | 37 | my @candProjProts = (); | |
my $analysisLevel = 'system'; | 38 | 38 | my $analysisLevel = 'system'; | |
39 | 39 | |||
40 | 40 | |||
41 | 41 | |||
#Read command line topology | 42 | 42 | #Read command line topology | |
read_command_line_arguments(); | 43 | 43 | read_command_line_arguments(); | |
44 | 44 | |||
45 | 45 | |||
die "TCDB sequences file not found or empty --> $tcdbSeqsFile\n" unless (-f $tcdbSeqsFile && !(-z $tcdbSeqsFile)); | 46 | 46 | die "TCDB sequences file not found or empty --> $tcdbSeqsFile\n" unless (-f $tcdbSeqsFile && !(-z $tcdbSeqsFile)); | |
die "TCDB hmmscan output file not found --> $pfamFile\n" unless (-f $pfamFile && !(-z $pfamFile)); | 47 | 47 | die "TCDB hmmscan output file not found --> $pfamFile\n" unless (-f $pfamFile && !(-z $pfamFile)); | |
48 | 48 | |||
49 | 49 | |||
#print Data::Dumper->Dump([\@fams, $treatAsSuperfamily, $rootDir, $tcdbSeqsFile, $pfamFile, $blastdb, $prog, $domain_cov, | 50 | 50 | #print Data::Dumper->Dump([\@fams, $treatAsSuperfamily, $rootDir, $tcdbSeqsFile, $pfamFile, $blastdb, $prog, $domain_cov, | |
# $prot_cov, $evalue, $prop_prots_w_domain, \@candProjProts], | 51 | 51 | # $prot_cov, $evalue, $prop_prots_w_domain, \@candProjProts], | |
# [qw(*fams *treatAsSuperfamily *rootDir *tcdbSeqsFile *pfamFile *blastdb *prog *domain_cov | 52 | 52 | # [qw(*fams *treatAsSuperfamily *rootDir *tcdbSeqsFile *pfamFile *blastdb *prog *domain_cov | |
# *prot_cov *evalue *prop_prots_w_domain *candProjProts)]); | 53 | 53 | # *prot_cov *evalue *prop_prots_w_domain *candProjProts)]); | |
#exit; | 54 | 54 | #exit; | |
55 | 55 | |||
56 | 56 | |||
#========================================================================== | 57 | 57 | #========================================================================== | |
#Split tcdb systems into single-component multi-component | 58 | 58 | #Split tcdb systems into single-component multi-component | |
59 | 59 | |||
60 | 60 | |||
if ($treatAsSuperfamily) { | 61 | 61 | if ($treatAsSuperfamily) { | |
62 | 62 | |||
my $tcids = getSystemAccessions($tcdbSeqsFile, 'both', $analysisLevel, \@fams, $treatAsSuperfamily); | 63 | 63 | my $tcids = getSystemAccessions($tcdbSeqsFile, 'both', $analysisLevel, \@fams, $treatAsSuperfamily); | |
64 | 64 | |||
# print Data::Dumper->Dump([$tcids ], [qw(*tcids )]); | 65 | 65 | # print Data::Dumper->Dump([$tcids ], [qw(*tcids )]); | |
# exit; | 66 | 66 | # exit; | |
67 | 67 | |||
68 | 68 | |||
69 | 69 | |||
#========================================================================== | 70 | 70 | #========================================================================== | |
#Setup the thresholds for parsing the PFAM output | 71 | 71 | #Setup the thresholds for parsing the PFAM output | |
72 | 72 | |||
73 | 73 | |||
my $obj = new TCDB::Domain::PfamParser(); | 74 | 74 | my $obj = new TCDB::Domain::PfamParser(); | |
$obj->pfamFile($pfamFile); | 75 | 75 | $obj->pfamFile($pfamFile); | |
$obj->analysisLevel($analysisLevel); | 76 | 76 | $obj->analysisLevel($analysisLevel); | |
$obj->domCovCutoff($domain_cov); | 77 | 77 | $obj->domCovCutoff($domain_cov); | |
$obj->tcCovCutoff($prot_cov); | 78 | 78 | $obj->tcCovCutoff($prot_cov); | |
$obj->evalueCutoff($evalue); | 79 | 79 | $obj->evalueCutoff($evalue); | |
$obj->minProtsDom($prop_prots_w_domain); | 80 | 80 | $obj->minProtsDom($prop_prots_w_domain); | |
$obj->treatAsSuperfamily($treatAsSuperfamily); | 81 | 81 | $obj->treatAsSuperfamily($treatAsSuperfamily); | |
82 | 82 | |||
83 | 83 | |||
my %domFreq = (); | 84 | 84 | my %domFreq = (); | |
my %domCoords = (); | 85 | 85 | my %domCoords = (); | |
$obj->getDomainStatsForUserFamilies(\@fams, $tcids, \%domFreq, \%domCoords); | 86 | 86 | $obj->getDomainStatsForUserFamilies(\@fams, $tcids, \%domFreq, \%domCoords); | |
87 | 87 | |||
# print Data::Dumper->Dump([ \%domFreq, \%domCoords ], [qw( *domFreq *domCoords )]); | 88 | 88 | # print Data::Dumper->Dump([ \%domFreq, \%domCoords ], [qw( *domFreq *domCoords )]); | |
# exit; | 89 | 89 | # exit; | |
90 | 90 | |||
91 | 91 | |||
92 | 92 | |||
93 | 93 | |||
94 | 94 | |||
#========================================================================== | 95 | 95 | #========================================================================== | |
#Attempt to rescue the domains that were not recognized by PFAM in some | 96 | 96 | #Attempt to rescue the domains that were not recognized by PFAM in some | |
#Family members | 97 | 97 | #Family members | |
98 | 98 | |||
99 | 99 | |||
my $rescueObj = new TCDB::Domain::Characterize(); | 100 | 100 | my $rescueObj = new TCDB::Domain::Characterize(); | |
$rescueObj->rootDir($rootDir); | 101 | 101 | $rescueObj->rootDir($rootDir); | |
$rescueObj->tcdbFaa($tcdbSeqsFile); | 102 | 102 | $rescueObj->tcdbFaa($tcdbSeqsFile); | |
$rescueObj->domCoords(\%domCoords); | 103 | 103 | $rescueObj->domCoords(\%domCoords); | |
$rescueObj->domFreq(\%domFreq); | 104 | 104 | $rescueObj->domFreq(\%domFreq); | |
$rescueObj->tcids($tcids); | 105 | 105 | $rescueObj->tcids($tcids); | |
$rescueObj->searchWith($prog); | 106 | 106 | $rescueObj->searchWith($prog); | |
$rescueObj->blastdb($blastdb); | 107 | 107 | $rescueObj->blastdb($blastdb); | |
$rescueObj->evalue($evalue); | 108 | 108 | $rescueObj->evalue($evalue); | |
$rescueObj->treatAsSuperfamily($treatAsSuperfamily); | 109 | 109 | $rescueObj->treatAsSuperfamily($treatAsSuperfamily); | |
110 | 110 | |||
111 | 111 | |||
$rescueObj->rescueDomains(\@fams); | 112 | 112 | $rescueObj->rescueDomains(\@fams); | |
113 | 113 | |||
} | 114 | 114 | } | |
else { | 115 | 115 | else { | |
116 | 116 | |||
foreach my $fam (@fams) { | 117 | 117 | foreach my $fam (@fams) { | |
118 | 118 | |||
my $tcids = getSystemAccessions($tcdbSeqsFile, 'both', $analysisLevel, [$fam], $treatAsSuperfamily); | 119 | 119 | my $tcids = getSystemAccessions($tcdbSeqsFile, 'both', $analysisLevel, [$fam], $treatAsSuperfamily); | |
120 | 120 | |||
# print Data::Dumper->Dump([$tcids ], [qw( *tcids )]); | 121 | 121 | # print Data::Dumper->Dump([$tcids ], [qw( *tcids )]); | |
# exit; | 122 | 122 | # exit; | |
123 | 123 | |||
124 | 124 | |||
125 | 125 | |||
#========================================================================== | 126 | 126 | #========================================================================== | |
#Setup the thresholds for parsing the PFAM output | 127 | 127 | #Setup the thresholds for parsing the PFAM output | |
128 | 128 | |||
129 | 129 | |||
my $obj = new TCDB::Domain::PfamParser(); | 130 | 130 | my $obj = new TCDB::Domain::PfamParser(); | |
$obj->pfamFile($pfamFile); | 131 | 131 | $obj->pfamFile($pfamFile); | |
$obj->analysisLevel($analysisLevel); | 132 | 132 | $obj->analysisLevel($analysisLevel); | |
$obj->domCovCutoff($domain_cov); | 133 | 133 | $obj->domCovCutoff($domain_cov); | |
$obj->tcCovCutoff($prot_cov); | 134 | 134 | $obj->tcCovCutoff($prot_cov); | |
$obj->evalueCutoff($evalue); | 135 | 135 | $obj->evalueCutoff($evalue); | |
$obj->minProtsDom($prop_prots_w_domain); | 136 | 136 | $obj->minProtsDom($prop_prots_w_domain); | |
137 | 137 | |||
138 | 138 | |||
my %domFreq = (); | 139 | 139 | my %domFreq = (); | |
my %domCoords = (); | 140 | 140 | my %domCoords = (); | |
$obj->getDomainStatsForUserFamilies([], $tcids, \%domFreq, \%domCoords); | 141 | 141 | $obj->getDomainStatsForUserFamilies([], $tcids, \%domFreq, \%domCoords); | |
142 | 142 | |||
#print Data::Dumper->Dump([ \%domFreq, \%domCoords ], [qw( *domFreq *domCoords )]); | 143 | 143 | #print Data::Dumper->Dump([ \%domFreq, \%domCoords ], [qw( *domFreq *domCoords )]); | |
#exit; | 144 | 144 | #exit; | |
145 | 145 | |||
146 | 146 | |||
147 | 147 | |||
148 | 148 | |||
149 | 149 | |||
#========================================================================== | 150 | 150 | #========================================================================== | |
#Attempt to rescue the domains that were not recognized by PFAM in some | 151 | 151 | #Attempt to rescue the domains that were not recognized by PFAM in some | |
#Family members | 152 | 152 | #Family members | |
153 | 153 | |||
154 | 154 | |||
my $rescueObj = new TCDB::Domain::Characterize(); | 155 | 155 | my $rescueObj = new TCDB::Domain::Characterize(); | |
$rescueObj->rootDir($rootDir); | 156 | 156 | $rescueObj->rootDir($rootDir); | |
$rescueObj->tcdbFaa($tcdbSeqsFile); | 157 | 157 | $rescueObj->tcdbFaa($tcdbSeqsFile); | |
$rescueObj->domCoords(\%domCoords); | 158 | 158 | $rescueObj->domCoords(\%domCoords); | |
$rescueObj->domFreq(\%domFreq); | 159 | 159 | $rescueObj->domFreq(\%domFreq); | |
$rescueObj->tcids($tcids); | 160 | 160 | $rescueObj->tcids($tcids); | |
$rescueObj->searchWith($prog); | 161 | 161 | $rescueObj->searchWith($prog); | |
$rescueObj->blastdb($blastdb); | 162 | 162 | $rescueObj->blastdb($blastdb); | |
$rescueObj->evalue($evalue); | 163 | 163 | $rescueObj->evalue($evalue); | |
$rescueObj->domCovCutoff($domain_cov); | 164 | 164 | $rescueObj->domCovCutoff($domain_cov); | |
$rescueObj->treatAsSuperfamily($treatAsSuperfamily); | 165 | 165 | $rescueObj->treatAsSuperfamily($treatAsSuperfamily); | |
166 | 166 | |||
$rescueObj->rescueDomains(); | 167 | 167 | $rescueObj->rescueDomains(); | |
} | 168 | 168 | } | |
} | 169 | 169 | } | |
170 | 170 | |||
171 | 171 | |||
172 | 172 | |||
173 | 173 | |||
########################################################################### | 174 | 174 | ########################################################################### | |
## Functions ## | 175 | 175 | ## Functions ## | |
########################################################################### | 176 | 176 | ########################################################################### | |
177 | 177 | |||
178 | 178 | |||
179 | 179 | |||
sub read_command_line_arguments { | 180 | 180 | sub read_command_line_arguments { | |
181 | 181 | |||
#if no arguments are given print the help | 182 | 182 | #if no arguments are given print the help | |
if (! @ARGV) { | 183 | 183 | if (! @ARGV) { | |
print_help(); | 184 | 184 | print_help(); | |
} | 185 | 185 | } | |
186 | 186 | |||
#---------------------------------------------------------------------- | 187 | 187 | #---------------------------------------------------------------------- | |
#Parse command line arguments | 188 | 188 | #Parse command line arguments | |
189 | 189 | |||
my $status = GetOptions( | 190 | 190 | my $status = GetOptions( | |
"f|family=s" => \&read_fams, #TCIDs of families to analyze (comma separated) | 191 | 191 | "f|family=s" => \&read_fams, #TCIDs of families to analyze (comma separated) | |
192 | 192 | |||
#Options for TCDB::Domain::PfamParser | 193 | 193 | #Options for TCDB::Domain::PfamParser | |
"dc|domain-cov=f" => \$domain_cov, | 194 | 194 | "dc|domain-cov=f" => \$domain_cov, | |
"pc|protein-cov=f" => \$prot_cov, | 195 | 195 | "pc|protein-cov=f" => \$prot_cov, | |
"e|evalue=f" => \$evalue, | 196 | 196 | "e|evalue=f" => \$evalue, | |
"m|prots-w-domain=f" => \$prop_prots_w_domain, | 197 | 197 | "m|prots-w-domain=f" => \$prop_prots_w_domain, | |
198 | 198 | |||
#Options for TCDB::Domain::Characterize | 199 | 199 | #Options for TCDB::Domain::Characterize | |
"pt|proj-targets=s" => \&read_proj_targets, #Target Proteins, NOT in TCDB, to project domains onto | 200 | 200 | "pt|proj-targets=s" => \&read_proj_targets, #Target Proteins, NOT in TCDB, to project domains onto | |
"o|outdir=s" => \&read_root_dir, #Ouput root directory | 201 | 201 | "o|outdir=s" => \&read_root_dir, #Ouput root directory | |
"s|tcdb-seqs=s" => \&read_tcdb_seqs, #File with all sequences in TCDB | 202 | 202 | "s|tcdb-seqs=s" => \&read_tcdb_seqs, #File with all sequences in TCDB | |
"sf|superfamily!" => \$treatAsSuperfamily, #File with the sequences of the reference family | 203 | 203 | "sf|superfamily!" => \$treatAsSuperfamily, #File with the sequences of the reference family | |
"pfam=s" => \&read_pfam, #hmmscan output file for whole TCDB | 204 | 204 | "pfam=s" => \&read_pfam, #hmmscan output file for whole TCDB | |
"b|blastdb=s" => \&read_blastdb, #Full path of blastdb to extract sequences | 205 | 205 | "b|blastdb=s" => \&read_blastdb, #Full path of blastdb to extract sequences | |
"p|rescue-prog=s" => \&read_prog, #Read the program that will be used to rescue domains (blastp|ssearch36) | 206 | 206 | "p|rescue-prog=s" => \&read_prog, #Read the program that will be used to rescue domains (blastp|ssearch36) | |
"h|help" => sub { print_help(); }, | 207 | 207 | "h|help" => sub { print_help(); }, | |
208 | 208 | |||
#For arguments that do not look like valid options | 209 | 209 | #For arguments that do not look like valid options | |
"<>" => sub { die "Error: Unknown argument: $_[0]\n"; } | 210 | 210 | "<>" => sub { die "Error: Unknown argument: $_[0]\n"; } | |
); | 211 | 211 | ); | |
die "\n" unless ($status); | 212 | 212 | die "\n" unless ($status); | |
213 | 213 | |||
#---------------------------------------------------------------------- | 214 | 214 | #---------------------------------------------------------------------- | |
#Validate command line arguments | 215 | 215 | #Validate command line arguments | |
216 | 216 | |||
217 | 217 | |||
die "Error: Options -f and -pt are incompatible" if (@fams && @candProjProts); | 218 | 218 | die "Error: Options -f and -pt are incompatible" if (@fams && @candProjProts); | |
die "Error: either -f or -pt must be given" unless (@fams || @candProjProts); | 219 | 219 | die "Error: either -f or -pt must be given" unless (@fams || @candProjProts); | |
220 | 220 | |||
if (@candProjProts) { | 221 | 221 | if (@candProjProts) { | |
prepare_seqs_for_projection(); | 222 | 222 | prepare_seqs_for_projection(); | |
} | 223 | 223 | } | |
224 | 224 | |||
} | 225 | 225 | } | |
226 | 226 | |||
227 | 227 | |||
#========================================================================== | 228 | 228 | #========================================================================== | |
#Setup the environment for projection of domains onto sequences that are | 229 | 229 | #Setup the environment for projection of domains onto sequences that are | |
#not present in TCDB. | 230 | 230 | #not present in TCDB. | |
231 | 231 | |||
sub prepare_seqs_for_projection { | 232 | 232 | sub prepare_seqs_for_projection { | |
233 | 233 | |||
my $tcdbDir = "$rootDir/tcdb"; | 234 | 234 | my $tcdbDir = "$rootDir/tcdb"; | |
system "mkdir -p $tcdbDir" unless (-d $tcdbDir); | 235 | 235 | system "mkdir -p $tcdbDir" unless (-d $tcdbDir); | |
236 | 236 | |||
237 | #to prevent modifying the original files, here I'll save the input | |||
238 | #sequences with the artificial TCIDs | |||
239 | my $origInfilesDir = "$rootDir/inputFiles"; | |||
240 | system "mkdir -p $origInfilesDir" unless (-d $origInfilesDir); | |||
241 | ||||
242 | ||||
#generate an empty "TCDB sequence file" that will contains proteins not in TCDB | 237 | 243 | #generate an empty "TCDB sequence file" that will contains proteins not in TCDB | |
my $new_tcdbSeqsFile = "$tcdbDir/tcdb.faa"; | 238 | 244 | my $new_tcdbSeqsFile = "$tcdbDir/tcdb.faa"; | |
system "cat /dev/null > $new_tcdbSeqsFile"; | 239 | 245 | system "cat /dev/null > $new_tcdbSeqsFile"; | |
240 | 246 | |||
241 | 247 | |||
#---------------------------------------------------------------------- | 242 | 248 | #---------------------------------------------------------------------- | |
#generate the new TCDB database relevant for the projection | 243 | 249 | #generate the new TCDB database relevant for the projection | |
244 | 250 | |||
foreach my $pair (@candProjProts) { | 245 | 251 | foreach my $pair (@candProjProts) { | |
246 | 252 | |||
my $tcid = $pair->[0]; | 247 | 253 | my $tcid = $pair->[0]; | |
my $tgtF = $pair->[1]; | 248 | 254 | my $tgtF = $pair->[1]; | |
249 | 255 | |||
256 | my @comp = split(/\//, $tgtF); | |||
257 | my $tgtFileName = $comp[-1]; | |||
250 | 258 | |||
#Add family to the main array (as if provided by the -f commandline option) | 251 | 259 | #Add family to the main array (as if provided by the -f commandline option) | |
push (@fams, $tcid); | 252 | 260 | push (@fams, $tcid); | |
253 | 261 | |||
254 | 262 | |||
#extracts the tcids of the systems under reference family | 255 | 263 | #extracts the tcids of the systems under reference family | |
# my $tcdbSeqs = $tcdbSeqsFile; #"$ENV{HOME}/db/blastdb/tcdb.faa"; | 256 | 264 | # my $tcdbSeqs = $tcdbSeqsFile; #"$ENV{HOME}/db/blastdb/tcdb.faa"; | |
# die "TCDB sequences not found: $tcdbSeqs" unless (-f $tcdbSeqs); | 257 | 265 | # die "TCDB sequences not found: $tcdbSeqs" unless (-f $tcdbSeqs); | |
258 | 266 | |||
my $sysHash = TCDB::Assorted::getSystemAccessions($tcdbSeqsFile, 'both', 'system', [$tcid], 0); | 259 | 267 | my $sysHash = TCDB::Assorted::getSystemAccessions($tcdbSeqsFile, 'both', 'system', [$tcid], 0); | |
260 | 268 | |||
# print Data::Dumper->Dump([$sysHash, $tcdbSeqs], [qw(*sysHash *tcdbSeqs)]); | 261 | 269 | # print Data::Dumper->Dump([$sysHash, $tcdbSeqs], [qw(*sysHash *tcdbSeqs)]); | |
# <STDIN>; | 262 | 270 | # <STDIN>; | |
263 | 271 | |||
#determine the TCID that will be used as reference for the target sequences | 264 | 272 | #determine the TCID that will be used as reference for the target sequences | |
my @systems = @{ $sysHash->{$tcid} }; | 265 | 273 | my @systems = @{ $sysHash->{$tcid} }; | |
die "Could not find TCIDs for $tcid in $tcdbSeqsFile" unless (@systems); | 266 | 274 | die "Could not find TCIDs for $tcid in $tcdbSeqsFile" unless (@systems); | |
267 | 275 | |||
my $tgtTC = $systems[-1]->[0]; | 268 | 276 | my $tgtTC = $systems[-1]->[0]; | |
$tgtTC =~ s/\.\d+$/\.10000/; | 269 | 277 | $tgtTC =~ s/\.\d+$/\.10000/; | |
270 | 278 | |||
271 | 279 | |||
#Replace the TCID in the file corresponding to the target proteins | 272 | 280 | #Replace the TCID in the file corresponding to the target proteins | |
my $cmd1 = qq(perl -i.orig -pe 's/\\>([a-zA-Z0-9_-]+).*/\\>${tgtTC}-\$1/;' $tgtF); | 273 | 281 | my $cmd1 = qq(perl -pe 's/\\>([a-zA-Z0-9_-]+).*/\\>${tgtTC}-\$1/;' $tgtF > $origInfilesDir/$tgtFileName); | |
system $cmd1 unless (-f "${tgtF}.orig"); | 274 | 282 | system $cmd1 unless (-f "$origInfilesDir/$tgtFileName"); | |
275 | 283 | |||
276 | 284 | |||
#Extract sequences for reference family | 277 | 285 | #Extract sequences for reference family | |
my $outFile = "$tcdbDir/tcdb-${tcid}.faa"; | 278 | 286 | my $outFile = "$tcdbDir/tcdb-${tcid}.faa"; | |
my $cmd2 = qq(extractTCDB.pl -i $tcid -o $tcdbDir -d $tcdbSeqsFile); | 279 | 287 | my $cmd2 = qq(extractTCDB.pl -i $tcid -o $tcdbDir -d $tcdbSeqsFile); | |
system $cmd2 unless (-f $outFile); | 280 | 288 | system $cmd2 unless (-f $outFile); | |
die "Could not generate sequence file: $outFile" unless (-f $outFile); | 281 | 289 | die "Could not generate sequence file: $outFile" unless (-f $outFile); | |
282 | 290 | |||
283 | 291 | |||
#Add family and target sequences to the new TCDB family | 284 | 292 | #Add family and target sequences to the new TCDB family | |
my $cmd3 = qq(cat $outFile $tgtF >> $new_tcdbSeqsFile); | 285 | 293 | my $cmd3 = qq(cat $outFile $origInfilesDir/$tgtFileName >> $new_tcdbSeqsFile); | |
system $cmd3; | 286 | 294 | system $cmd3; | |
} | 287 | 295 | } | |
288 | 296 | |||
#---------------------------------------------------------------------- | 289 | 297 | #---------------------------------------------------------------------- | |
#Generate the PFam database | 290 | 298 | #Generate the PFam database | |
291 | 299 | |||
my $pfamD = "$rootDir/pfam"; | 292 | 300 | my $pfamD = "$rootDir/pfam"; | |
system "mkdir -p $pfamD" unless (-d $pfamD); | 293 | 301 | system "mkdir -p $pfamD" unless (-d $pfamD); | |
294 | 302 | |||
my $pfamTMPfile = "$pfamD/tcdb_pfam.out"; | 295 | 303 | my $pfamTMPfile = "$pfamD/tcdb_pfam.out"; | |
$pfamFile = "${pfamTMPfile}.bz2"; | 296 | 304 | $pfamFile = "${pfamTMPfile}.bz2"; | |
297 | 305 | |||
#run Pfam | 298 | 306 | #run Pfam | |
my $cmd4 = qq (hmmscan --cpu 4 --noali --cut_ga -o /dev/null --domtblout $pfamTMPfile $ENV{RESEARCH_DATA}/pfam/pfamdb/Pfam-A.hmm $new_tcdbSeqsFile); | 299 | 307 | my $cmd4 = qq (hmmscan --cpu 4 --noali --cut_ga -o /dev/null --domtblout $pfamTMPfile $ENV{RESEARCH_DATA}/pfam/pfamdb/Pfam-A.hmm $new_tcdbSeqsFile); | |
system $cmd4 unless (-f $pfamTMPfile || -f $pfamFile); | 300 | 308 | system $cmd4 unless (-f $pfamTMPfile || -f $pfamFile); | |
301 | 309 | |||
#compress pfam file | 302 | 310 | #compress pfam file | |
my $cmd5 = qq(bzip2 $pfamTMPfile); | 303 | 311 | my $cmd5 = qq(bzip2 $pfamTMPfile); | |
system $cmd5 unless (-f $pfamFile); | 304 | 312 | system $cmd5 unless (-f $pfamFile); | |
305 | 313 | |||
306 | 314 | |||
#---------------------------------------------------------------------- | 307 | 315 | #---------------------------------------------------------------------- | |
#now generate the blast DB | 308 | 316 | #now generate the blast DB | |
309 | 317 | |||
my $blastD = "$rootDir/blastdb"; | 310 | 318 | my $blastD = "$rootDir/blastdb"; | |
system "mkdir -p $blastD" unless (-d $blastD); | 311 | 319 | system "mkdir -p $blastD" unless (-d $blastD); | |
312 | 320 | |||
$blastdb = "$blastD/tcdb"; | 313 | 321 | $blastdb = "$blastD/tcdb"; | |
314 | 322 | |||
my $cmd6 = qq(extractTCDB.pl -i tcdb -o $blastD -f blast -d $new_tcdbSeqsFile); | 315 | 323 | my $cmd6 = qq(extractTCDB.pl -i tcdb -o $blastD -f blast -d $new_tcdbSeqsFile); | |
system $cmd6 unless (-f "${blastdb}.pin"); | 316 | 324 | system $cmd6 unless (-f "${blastdb}.pin"); | |
317 | 325 | |||
318 | 326 | |||
#For all purposes update the TCDB sequence file to point to the new "customized" file. | 319 | 327 | #For all purposes update the TCDB sequence file to point to the new "customized" file. | |
$tcdbSeqsFile = $new_tcdbSeqsFile; | 320 | 328 | $tcdbSeqsFile = $new_tcdbSeqsFile; | |
} | 321 | 329 | } | |
322 | 330 | |||
323 | 331 | |||
324 | 332 | |||
325 | 333 | |||
326 | 334 | |||
#========================================================================== | 327 | 335 | #========================================================================== | |
#Read the -pt option. It is expected that the user provides the family to which | 328 | 336 | #Read the -pt option. It is expected that the user provides the family to which | |
#the target proteins are expected to belong. Example format should is: | 329 | 337 | #the target proteins are expected to belong. Example format should is: | |
# -pt {tcid_1},{file with target sequences 1}:{tcid_2},{file with target sequences 2}. | 330 | 338 | # -pt {tcid_1},{file with target sequences 1}:{tcid_2},{file with target sequences 2}. | |
# | 331 | 339 | # | |
#NOTE: This option is incompatible with -f | 332 | 340 | #NOTE: This option is incompatible with -f | |
333 | 341 | |||
sub read_proj_targets { | 334 | 342 | sub read_proj_targets { | |
335 | 343 | |||
my ($opt, $value) = @_; | 336 | 344 | my ($opt, $value) = @_; | |
337 | 345 | |||
my @pairs = split (/:/, $value); | 338 | 346 | my @pairs = split (/:/, $value); | |
die "No significant argument passed to option -pt" unless (@pairs); | 339 | 347 | die "No significant argument passed to option -pt" unless (@pairs); |