Commit da6670c0fef4f224e47a3ee8f4a79eb5ca9396d2
1 parent
9f332056e8
Exists in
master
updated getDomainTopology to work with CDD and rpsblast
Showing 2 changed files with 218 additions and 43 deletions Inline Diff
getDomainTopology.pl
View file @
da6670c
#!/usr/bin/env perl -w | 1 | 1 | #!/usr/bin/env perl -w | |
2 | 2 | |||
use strict; | 3 | 3 | use strict; | |
use warnings; | 4 | 4 | use warnings; | |
use Data::Dumper; | 5 | 5 | use Data::Dumper; | |
#use List::Util qw(sum); | 6 | 6 | #use List::Util qw(sum); | |
7 | 7 | |||
use TCDB::Assorted; | 8 | 8 | use TCDB::Assorted; | |
use TCDB::Domain::PfamParser; | 9 | 9 | use TCDB::Domain::PfamParser; | |
10 | use TCDB::Domain::CDDparser; | |||
use TCDB::Domain::Characterize; | 10 | 11 | use TCDB::Domain::Characterize; | |
11 | 12 | |||
use Getopt::Long; | 12 | 13 | use Getopt::Long; | |
13 | 14 | |||
15 | # | |||
16 | #Domain projections should work both with CDD and Pfam | |||
17 | # | |||
18 | ||||
#========================================================================== | 14 | 19 | #========================================================================== | |
#Global variables | 15 | 20 | #Global variables | |
16 | 21 | |||
#Query family or families | 17 | 22 | #Query family or families | |
my @fams = (); | 18 | 23 | my @fams = (); | |
19 | 24 | |||
#This is an option for TCDB::Assorted::getSystemAccessions() | 20 | 25 | #This is an option for TCDB::Assorted::getSystemAccessions() | |
my $treatAsSuperfamily = 0; | 21 | 26 | my $treatAsSuperfamily = 0; | |
22 | 27 | |||
#Options for TCDB::Domain::PfamParser | 23 | 28 | #Options for TCDB::Domain::PfamParser | |
my $domain_cov = 0.7; | 24 | 29 | my $domain_cov = 0.7; | |
my $prot_cov = 0.1; | 25 | 30 | my $prot_cov = 0.1; | |
my $evalue = 1e-5; | 26 | 31 | my $evalue = 1e-3; | |
my $prop_prots_w_domain = 0.05; | 27 | 32 | my $prop_prots_w_domain = 0.05; | |
28 | 33 | |||
34 | my $domainAnalysisMode = "cdd"; # cdd or pfam | |||
35 | ||||
#Options for TCDB::Domain::Characterize | 29 | 36 | #Options for TCDB::Domain::Characterize | |
my $rootDir = "."; | 30 | 37 | my $rootDir = "."; | |
31 | 38 | |||
#To extract the TCIDs of reference families | 32 | 39 | #To extract the TCIDs of reference families | |
my $tcdbSeqsFile = "$ENV{RESEARCH_DATA}/pfam/download/tcdb.faa"; | 33 | 40 | my $tcdbSeqsFile = "$ENV{RESEARCH_DATA}/DB/domainDBs/TCDB/domainScans/tcdb.faa"; | |
my $pfamFile = "$ENV{RESEARCH_DATA}/pfam/tcdb.pfam-a.hmmscan.bz2"; | 34 | 41 | my $pfamFile = ""; | |
42 | my $cddFile = ""; | |||
my $blastdb = "$ENV{HOME}/db/blastdb/tcdb"; | 35 | 43 | my $blastdb = "$ENV{HOME}/db/blastdb/tcdb"; | |
my $prog = "ssearch36"; | 36 | 44 | my $prog = "ssearch36"; | |
my @candProjProts = (); | 37 | 45 | my @candProjProts = (); | |
my $analysisLevel = 'system'; | 38 | 46 | my $analysisLevel = 'system'; | |
39 | 47 | |||
40 | 48 | |||
41 | 49 | |||
#Read command line arguments | 42 | 50 | #Read command line arguments | |
read_command_line_arguments(); | 43 | 51 | read_command_line_arguments(); | |
44 | 52 | |||
45 | 53 | |||
die "TCDB sequences file not found or empty --> $tcdbSeqsFile\n" unless (-f $tcdbSeqsFile && !(-z $tcdbSeqsFile)); | 46 | 54 | #Input files validation. | |
die "TCDB hmmscan output file not found --> $pfamFile\n" unless (-f $pfamFile && !(-z $pfamFile)); | 47 | 55 | die "TCDB sequences file not found or empty --> $tcdbSeqsFile." unless (-f $tcdbSeqsFile && !(-z $tcdbSeqsFile)); | |
48 | 56 | |||
57 | if ($domainAnalysisMode eq "cdd") { | |||
58 | die "TCDB CDD rpsblast output file not found: $cddFile" unless (-f $cddFile && !(-z $cddFile)); | |||
59 | } | |||
60 | else { | |||
61 | die "TCDB hmmscan output file not found: $pfamFile" unless (-f $pfamFile && !(-z $pfamFile)); | |||
62 | } | |||
49 | 63 | |||
#print Data::Dumper->Dump([\@fams, $treatAsSuperfamily, $rootDir, $tcdbSeqsFile, $pfamFile, $blastdb, $prog, $domain_cov, | 50 | 64 | ||
65 | ||||
66 | ||||
67 | #print Data::Dumper->Dump([$domainAnalysisMode, \@fams, $treatAsSuperfamily, $rootDir, $tcdbSeqsFile, $pfamFile, $cddFile, $blastdb, $prog, $domain_cov, | |||
# $prot_cov, $evalue, $prop_prots_w_domain, \@candProjProts], | 51 | 68 | # $prot_cov, $evalue, $prop_prots_w_domain, \@candProjProts], | |
# [qw(*fams *treatAsSuperfamily *rootDir *tcdbSeqsFile *pfamFile *blastdb *prog *domain_cov | 52 | 69 | # [qw(*domainAnalysisMode *fams *treatAsSuperfamily *rootDir *tcdbSeqsFile *pfamFile *cddFile *blastdb *prog *domain_cov | |
# *prot_cov *evalue *prop_prots_w_domain *candProjProts)]); | 53 | 70 | # *prot_cov *evalue *prop_prots_w_domain *candProjProts)]); | |
#exit; | 54 | 71 | #exit; | |
55 | 72 | |||
56 | 73 | |||
#========================================================================== | 57 | 74 | #========================================================================== | |
#Split tcdb systems into single-component and multi-component | 58 | 75 | #Split tcdb systems into single-component and multi-component | |
59 | 76 | |||
60 | 77 | |||
if ($treatAsSuperfamily) { | 61 | 78 | if ($treatAsSuperfamily) { | |
62 | 79 | |||
my $tcids = getSystemAccessions($tcdbSeqsFile, 'both', $analysisLevel, \@fams, $treatAsSuperfamily); | 63 | 80 | my $tcids = getSystemAccessions($tcdbSeqsFile, 'both', $analysisLevel, \@fams, $treatAsSuperfamily); | |
64 | 81 | |||
# print Data::Dumper->Dump([$tcids ], [qw(*tcids )]); | 65 | 82 | #print Data::Dumper->Dump([$tcids ], [qw(*tcids )]); | |
# exit; | 66 | 83 | #exit; | |
67 | 84 | |||
68 | ||||
69 | ||||
#========================================================================== | 70 | 85 | #========================================================================== | |
#Setup the thresholds for parsing the PFAM output | 71 | 86 | #Setup the thresholds for parsing the PFAM output | |
72 | 87 | |||
88 | my $obj = ""; | |||
89 | if ($domainAnalysisMode eq "cdd") { | |||
90 | $obj = new TCDB::Domain::CDDparser(); | |||
91 | $obj->cddFile($cddFile); | |||
92 | } | |||
93 | else { | |||
94 | $obj = new TCDB::Domain::PfamParser(); | |||
95 | $obj->pfamFile($pfamFile); | |||
96 | } | |||
73 | 97 | |||
my $obj = new TCDB::Domain::PfamParser(); | 74 | |||
$obj->pfamFile($pfamFile); | 75 | |||
$obj->analysisLevel($analysisLevel); | 76 | 98 | $obj->analysisLevel($analysisLevel); | |
$obj->domCovCutoff($domain_cov); | 77 | 99 | $obj->domCovCutoff($domain_cov); | |
$obj->tcCovCutoff($prot_cov); | 78 | 100 | $obj->tcCovCutoff($prot_cov); | |
$obj->evalueCutoff($evalue); | 79 | 101 | $obj->evalueCutoff($evalue); | |
$obj->minProtsDom($prop_prots_w_domain); | 80 | 102 | $obj->minProtsDom($prop_prots_w_domain); | |
$obj->treatAsSuperfamily($treatAsSuperfamily); | 81 | 103 | $obj->treatAsSuperfamily($treatAsSuperfamily); | |
82 | 104 | |||
83 | 105 | |||
my %domFreq = (); | 84 | 106 | my %domFreq = (); | |
my %domCoords = (); | 85 | 107 | my %domCoords = (); | |
$obj->getDomainStatsForUserFamilies(\@fams, $tcids, \%domFreq, \%domCoords); | 86 | 108 | $obj->getDomainStatsForUserFamilies(\@fams, $tcids, \%domFreq, \%domCoords); | |
87 | 109 | |||
# print Data::Dumper->Dump([ \%domFreq, \%domCoords ], [qw( *domFreq *domCoords )]); | 88 | 110 | # print Data::Dumper->Dump([ \%domFreq, \%domCoords ], [qw( *domFreq *domCoords )]); | |
# exit; | 89 | 111 | # exit; | |
90 | 112 | |||
91 | 113 | |||
92 | 114 | |||
93 | 115 | |||
94 | 116 | |||
#========================================================================== | 95 | 117 | #========================================================================== | |
#Attempt to rescue the domains that were not recognized by PFAM in some | 96 | 118 | #Attempt to rescue the domains that were not recognized by PFAM in some | |
#Family members | 97 | 119 | #Family members | |
98 | 120 | |||
99 | 121 | |||
my $rescueObj = new TCDB::Domain::Characterize(); | 100 | 122 | my $rescueObj = new TCDB::Domain::Characterize(); | |
$rescueObj->rootDir($rootDir); | 101 | 123 | $rescueObj->rootDir($rootDir); | |
$rescueObj->tcdbFaa($tcdbSeqsFile); | 102 | 124 | $rescueObj->tcdbFaa($tcdbSeqsFile); | |
$rescueObj->domCoords(\%domCoords); | 103 | 125 | $rescueObj->domCoords(\%domCoords); | |
$rescueObj->domFreq(\%domFreq); | 104 | 126 | $rescueObj->domFreq(\%domFreq); | |
$rescueObj->tcids($tcids); | 105 | 127 | $rescueObj->tcids($tcids); | |
$rescueObj->searchWith($prog); | 106 | 128 | $rescueObj->searchWith($prog); | |
$rescueObj->blastdb($blastdb); | 107 | 129 | $rescueObj->blastdb($blastdb); | |
$rescueObj->evalue($evalue); | 108 | 130 | $rescueObj->evalue($evalue); | |
$rescueObj->treatAsSuperfamily($treatAsSuperfamily); | 109 | 131 | $rescueObj->treatAsSuperfamily($treatAsSuperfamily); | |
110 | 132 | |||
111 | 133 | |||
$rescueObj->rescueDomains(\@fams); | 112 | 134 | $rescueObj->rescueDomains(\@fams); | |
113 | 135 | |||
} | 114 | 136 | } | |
else { | 115 | 137 | else { | |
116 | 138 | |||
foreach my $fam (@fams) { | 117 | 139 | foreach my $fam (@fams) { | |
118 | 140 | |||
my $tcids = getSystemAccessions($tcdbSeqsFile, 'both', $analysisLevel, [$fam], $treatAsSuperfamily); | 119 | 141 | my $tcids = getSystemAccessions($tcdbSeqsFile, 'both', $analysisLevel, [$fam], $treatAsSuperfamily); | |
120 | 142 | |||
# print Data::Dumper->Dump([$tcids ], [qw( *tcids )]); | 121 | 143 | # print Data::Dumper->Dump([$tcids ], [qw( *tcids )]); | |
# exit; | 122 | 144 | # exit; | |
123 | 145 | |||
124 | 146 | |||
125 | 147 | |||
#========================================================================== | 126 | 148 | #========================================================================== | |
#Setup the thresholds for parsing the PFAM output | 127 | 149 | #Setup the thresholds for parsing the PFAM output | |
128 | 150 | |||
129 | 151 | |||
my $obj = new TCDB::Domain::PfamParser(); | 130 | 152 | my $obj = ""; | |
$obj->pfamFile($pfamFile); | 131 | 153 | if ($domainAnalysisMode eq "cdd") { | |
154 | $obj = new TCDB::Domain::CDDparser(); | |||
155 | $obj->cddInFile($cddFile); | |||
156 | } | |||
157 | else { | |||
158 | $obj = new TCDB::Domain::PfamParser(); | |||
159 | $obj->pfamFile($pfamFile); | |||
160 | } | |||
161 | ||||
$obj->analysisLevel($analysisLevel); | 132 | 162 | $obj->analysisLevel($analysisLevel); | |
$obj->domCovCutoff($domain_cov); | 133 | 163 | $obj->domCovCutoff($domain_cov); | |
$obj->tcCovCutoff($prot_cov); | 134 | 164 | $obj->tcCovCutoff($prot_cov); | |
$obj->evalueCutoff($evalue); | 135 | 165 | $obj->evalueCutoff($evalue); | |
$obj->minProtsDom($prop_prots_w_domain); | 136 | 166 | $obj->minProtsDom($prop_prots_w_domain); | |
137 | 167 | |||
138 | 168 | |||
my %domFreq = (); | 139 | 169 | my %domFreq = (); | |
my %domCoords = (); | 140 | 170 | my %domCoords = (); | |
$obj->getDomainStatsForUserFamilies([], $tcids, \%domFreq, \%domCoords); | 141 | 171 | $obj->getDomainStatsForUserFamilies([], $tcids, \%domFreq, \%domCoords); | |
142 | 172 | |||
#print Data::Dumper->Dump([ \%domFreq, \%domCoords ], [qw( *domFreq *domCoords )]); | 143 | 173 | # print Data::Dumper->Dump([ \%domFreq, \%domCoords ], [qw( *domFreq *domCoords )]); | |
#exit; | 144 | 174 | # exit; | |
145 | 175 | |||
146 | 176 | |||
147 | 177 | |||
148 | 178 | |||
149 | 179 | |||
#========================================================================== | 150 | 180 | #========================================================================== | |
#Attempt to rescue the domains that were not recognized by PFAM in some | 151 | 181 | #Attempt to rescue the domains that were not recognized by PFAM/CDD in some | |
#Family members | 152 | 182 | #Family members | |
153 | 183 | |||
154 | 184 | |||
my $rescueObj = new TCDB::Domain::Characterize(); | 155 | 185 | my $rescueObj = new TCDB::Domain::Characterize(); | |
$rescueObj->rootDir($rootDir); | 156 | 186 | $rescueObj->rootDir($rootDir); | |
$rescueObj->tcdbFaa($tcdbSeqsFile); | 157 | 187 | $rescueObj->tcdbFaa($tcdbSeqsFile); | |
$rescueObj->domCoords(\%domCoords); | 158 | 188 | $rescueObj->domCoords(\%domCoords); | |
$rescueObj->domFreq(\%domFreq); | 159 | 189 | $rescueObj->domFreq(\%domFreq); | |
$rescueObj->tcids($tcids); | 160 | 190 | $rescueObj->tcids($tcids); | |
$rescueObj->searchWith($prog); | 161 | 191 | $rescueObj->searchWith($prog); | |
$rescueObj->blastdb($blastdb); | 162 | 192 | $rescueObj->blastdb($blastdb); | |
$rescueObj->evalue($evalue); | 163 | 193 | $rescueObj->evalue($evalue); | |
$rescueObj->domCovCutoff($domain_cov); | 164 | 194 | $rescueObj->domCovCutoff($domain_cov); | |
$rescueObj->treatAsSuperfamily($treatAsSuperfamily); | 165 | 195 | $rescueObj->treatAsSuperfamily($treatAsSuperfamily); | |
166 | 196 | |||
$rescueObj->rescueDomains(); | 167 | 197 | $rescueObj->rescueDomains(); | |
} | 168 | 198 | } | |
} | 169 | 199 | } | |
170 | 200 | |||
171 | 201 | |||
172 | 202 | |||
173 | 203 | |||
########################################################################### | 174 | 204 | ########################################################################### | |
## Functions ## | 175 | 205 | ## Functions ## | |
########################################################################### | 176 | 206 | ########################################################################### | |
177 | 207 | |||
178 | 208 | |||
179 | 209 | |||
sub read_command_line_arguments { | 180 | 210 | sub read_command_line_arguments { | |
181 | 211 | |||
#if no arguments are given print the help | 182 | 212 | #if no arguments are given print the help | |
if (! @ARGV) { | 183 | 213 | if (! @ARGV) { | |
print_help(); | 184 | 214 | print_help(); | |
} | 185 | 215 | } | |
186 | 216 | |||
#---------------------------------------------------------------------- | 187 | 217 | #---------------------------------------------------------------------- | |
#Parse command line arguments | 188 | 218 | #Parse command line arguments | |
189 | 219 | |||
my $status = GetOptions( | 190 | 220 | my $status = GetOptions( | |
"f|family=s" => \&read_fams, #TCIDs of families to analyze (comma separated) | 191 | 221 | "f|family=s" => \&read_fams, #TCIDs of families to analyze (comma separated) | |
222 | "dam|domain-analysis-mode=s" => \&read_domainAnalysisMode, #Perform the analysis based on CDD or Pfam domains | |||
192 | 223 | |||
#Options for TCDB::Domain::PfamParser | 193 | 224 | #Options for TCDB::Domain::PfamParser and TCDB::Domain::CDDparser | |
"dc|domain-cov=f" => \$domain_cov, | 194 | 225 | "dc|domain-cov=f" => \$domain_cov, | |
"pc|protein-cov=f" => \$prot_cov, | 195 | 226 | "pc|protein-cov=f" => \$prot_cov, | |
"e|evalue=f" => \$evalue, | 196 | 227 | "e|evalue=f" => \$evalue, | |
"m|prots-w-domain=f" => \$prop_prots_w_domain, | 197 | 228 | "m|prots-w-domain=f" => \$prop_prots_w_domain, | |
198 | 229 | |||
#Options for TCDB::Domain::Characterize | 199 | 230 | #Options for TCDB::Domain::Characterize | |
"pt|proj-targets=s" => \&read_proj_targets, #Target Proteins, NOT in TCDB, to project domains onto | 200 | 231 | "pt|proj-targets=s" => \&read_proj_targets, #Target Proteins, NOT in TCDB, to project domains onto | |
"o|outdir=s" => \&read_root_dir, #Output root directory | 201 | 232 | "o|outdir=s" => \&read_root_dir, #Output root directory | |
"s|tcdb-seqs=s" => \&read_tcdb_seqs, #File with all sequences in TCDB | 202 | 233 | "s|tcdb-seqs=s" => \&read_tcdb_seqs, #File with all sequences in TCDB | |
"sf|superfamily!" => \$treatAsSuperfamily, #Negatable flag: analyze all input families together as one superfamily | 203 | 234 | "sf|superfamily!" => \$treatAsSuperfamily, #Negatable flag: analyze all input families together as one superfamily | |
235 | "cdd=s" => \&read_cdd, #rpsblast output file for whole TCDB | |||
"pfam=s" => \&read_pfam, #hmmscan output file for whole TCDB | 204 | 236 | "pfam=s" => \&read_pfam, #hmmscan output file for whole TCDB | |
"b|blastdb=s" => \&read_blastdb, #Full path of blastdb to extract sequences | 205 | 237 | "b|blastdb=s" => \&read_blastdb, #Full path of blastdb to extract sequences | |
"p|rescue-prog=s" => \&read_prog, #Read the program that will be used to rescue domains (blastp|ssearch36) | 206 | 238 | "p|rescue-prog=s" => \&read_prog, #Read the program that will be used to rescue domains (blastp|ssearch36) | |
"h|help" => sub { print_help(); }, | 207 | 239 | "h|help" => sub { print_help(); }, | |
208 | 240 | |||
#For arguments that do not look like valid options | 209 | 241 | #For arguments that do not look like valid options | |
"<>" => sub { die "Error: Unknown argument: $_[0]\n"; } | 210 | 242 | "<>" => sub { die "Error: Unknown argument: $_[0]\n"; } | |
); | 211 | 243 | ); | |
die "\n" unless ($status); | 212 | 244 | die "\n" unless ($status); | |
213 | 245 | |||
#---------------------------------------------------------------------- | 214 | 246 | #---------------------------------------------------------------------- | |
#Validate command line arguments | 215 | 247 | #Validate command line arguments | |
216 | 248 | |||
249 | if ($domainAnalysisMode eq "cdd") { | |||
250 | $cddFile = "$ENV{RESEARCH_DATA}/DB/domainDBs/TCDB/domainScans/tcdb.cdd.rpsblast.bz2" unless ($cddFile); | |||
217 | 251 | |||
die "Error: Options -f and -pt are incompatible" if (@fams && @candProjProts); | 218 | 252 | die "Error: the use of option -pfam with [-dam cdd] is incompatible" if ($pfamFile); | |
253 | } | |||
254 | else { | |||
255 | $pfamFile = "$ENV{RESEARCH_DATA}/DB/domainDBs/TCDB/domainScans/tcdb.pfam-a.hmmscan.bz2" unless ($pfamFile); | |||
256 | ||||
257 | die "Error: the use of option -cdd with [-dam pfam] is incompatible" if ($cddFile); | |||
258 | } | |||
259 | ||||
260 | die "Error: options -f and -pt are incompatible" if (@fams && @candProjProts); | |||
die "Error: either -f or -pt must be given" unless (@fams || @candProjProts); | 219 | 261 | die "Error: either -f or -pt must be given" unless (@fams || @candProjProts); | |
220 | 262 | |||
263 | ||||
if (@candProjProts) { | 221 | 264 | if (@candProjProts) { | |
prepare_seqs_for_projection(); | 222 | 265 | prepare_seqs_for_projection(); | |
} | 223 | 266 | } | |
224 | 267 | |||
} | 225 | 268 | } | |
226 | 269 | |||
227 | 270 | |||
#========================================================================== | 228 | 271 | #========================================================================== | |
#Setup the environment for projection of domains onto sequences that are | 229 | 272 | #Setup the environment for projection of domains onto sequences that are | |
#not present in TCDB. | 230 | 273 | #not present in TCDB. | |
231 | 274 | |||
sub prepare_seqs_for_projection { | 232 | 275 | sub prepare_seqs_for_projection { | |
233 | 276 | |||
my $tcdbDir = "$rootDir/tcdb"; | 234 | 277 | my $tcdbDir = "$rootDir/tcdb"; | |
system "mkdir -p $tcdbDir" unless (-d $tcdbDir); | 235 | 278 | system "mkdir -p $tcdbDir" unless (-d $tcdbDir); | |
236 | 279 | |||
#to prevent modifying the original files, here I'll save the input | 237 | 280 | #to prevent modifying the original files, here I'll save the input | |
#sequences with the artificial TCIDs | 238 | 281 | #sequences with the artificial TCIDs | |
my $origInfilesDir = "$rootDir/inputFiles"; | 239 | 282 | my $origInfilesDir = "$rootDir/inputFiles"; | |
system "mkdir -p $origInfilesDir" unless (-d $origInfilesDir); | 240 | 283 | system "mkdir -p $origInfilesDir" unless (-d $origInfilesDir); | |
241 | 284 | |||
242 | 285 | |||
#generate an empty "TCDB sequence file" that will contains proteins not in TCDB | 243 | 286 | #generate an empty "TCDB sequence file" that will contain proteins not in TCDB | |
my $new_tcdbSeqsFile = "$tcdbDir/tcdb.faa"; | 244 | 287 | my $new_tcdbSeqsFile = "$tcdbDir/tcdb.faa"; | |
system "cat /dev/null > $new_tcdbSeqsFile"; | 245 | 288 | system "cat /dev/null > $new_tcdbSeqsFile"; | |
246 | 289 | |||
247 | 290 | |||
#---------------------------------------------------------------------- | 248 | 291 | #---------------------------------------------------------------------- | |
#generate the new TCDB database relevant for the projection | 249 | 292 | #generate the new TCDB database relevant for the projection | |
250 | 293 | |||
foreach my $pair (@candProjProts) { | 251 | 294 | foreach my $pair (@candProjProts) { | |
252 | 295 | |||
my $tcid = $pair->[0]; | 253 | 296 | my $tcid = $pair->[0]; | |
my $tgtF = $pair->[1]; | 254 | 297 | my $tgtF = $pair->[1]; | |
255 | 298 | |||
my @comp = split(/\//, $tgtF); | 256 | 299 | my @comp = split(/\//, $tgtF); | |
my $tgtFileName = $comp[-1]; | 257 | 300 | my $tgtFileName = $comp[-1]; | |
258 | 301 | |||
#Add family to the main array (as if provided by the -f commandline option) | 259 | 302 | #Add family to the main array (as if provided by the -f commandline option) | |
push (@fams, $tcid); | 260 | 303 | push (@fams, $tcid); | |
261 | 304 | |||
262 | 305 | |||
#extracts the tcids of the systems under reference family | 263 | 306 | #extracts the tcids of the systems under reference family | |
# my $tcdbSeqs = $tcdbSeqsFile; #"$ENV{HOME}/db/blastdb/tcdb.faa"; | 264 | 307 | # my $tcdbSeqs = $tcdbSeqsFile; #"$ENV{HOME}/db/blastdb/tcdb.faa"; | |
# die "TCDB sequences not found: $tcdbSeqs" unless (-f $tcdbSeqs); | 265 | 308 | # die "TCDB sequences not found: $tcdbSeqs" unless (-f $tcdbSeqs); | |
266 | 309 | |||
my $sysHash = TCDB::Assorted::getSystemAccessions($tcdbSeqsFile, 'both', 'system', [$tcid], 0); | 267 | 310 | my $sysHash = TCDB::Assorted::getSystemAccessions($tcdbSeqsFile, 'both', 'system', [$tcid], 0); | |
268 | 311 | |||
# print Data::Dumper->Dump([$sysHash, $tcdbSeqs], [qw(*sysHash *tcdbSeqs)]); | 269 | 312 | # print Data::Dumper->Dump([$sysHash, $tcdbSeqs], [qw(*sysHash *tcdbSeqs)]); | |
# <STDIN>; | 270 | 313 | # <STDIN>; | |
271 | 314 | |||
#determine the TCID that will be used as reference for the target sequences | 272 | 315 | #determine the TCID that will be used as reference for the target sequences | |
my @systems = @{ $sysHash->{$tcid} }; | 273 | 316 | my @systems = @{ $sysHash->{$tcid} }; | |
die "Could not find TCIDs for $tcid in $tcdbSeqsFile" unless (@systems); | 274 | 317 | die "Could not find TCIDs for $tcid in $tcdbSeqsFile" unless (@systems); | |
275 | 318 | |||
my $tgtTC = $systems[-1]->[0]; | 276 | 319 | my $tgtTC = $systems[-1]->[0]; | |
$tgtTC =~ s/\.\d+$/\.10000/; | 277 | 320 | $tgtTC =~ s/\.\d+$/\.10000/; | |
278 | 321 | |||
279 | 322 | |||
#Replace the TCID in the file corresponding to the target proteins | 280 | 323 | #Replace the TCID in the file corresponding to the target proteins | |
my $cmd1 = qq(perl -pe 's/\\>([a-zA-Z0-9_-]+).*/\\>${tgtTC}-\$1/;' $tgtF > $origInfilesDir/$tgtFileName); | 281 | 324 | my $cmd1 = qq(perl -pe 's/\\>([a-zA-Z0-9_-]+).*/\\>${tgtTC}-\$1/;' $tgtF > $origInfilesDir/$tgtFileName); | |
system $cmd1 unless (-f "$origInfilesDir/$tgtFileName"); | 282 | 325 | system $cmd1 unless (-f "$origInfilesDir/$tgtFileName"); | |
283 | 326 | |||
284 | 327 | |||
#Extract sequences for reference family | 285 | 328 | #Extract sequences for reference family | |
my $outFile = "$tcdbDir/tcdb-${tcid}.faa"; | 286 | 329 | my $outFile = "$tcdbDir/tcdb-${tcid}.faa"; | |
my $cmd2 = qq(extractTCDB.pl -i $tcid -o $tcdbDir -d $tcdbSeqsFile); | 287 | 330 | my $cmd2 = qq(extractTCDB.pl -i $tcid -o $tcdbDir -d $tcdbSeqsFile); | |
system $cmd2 unless (-f $outFile); | 288 | 331 | system $cmd2 unless (-f $outFile); | |
die "Could not generate sequence file: $outFile" unless (-f $outFile); | 289 | 332 | die "Could not generate sequence file: $outFile" unless (-f $outFile); | |
290 | 333 | |||
291 | 334 | |||
#Add family and target sequences to the new TCDB family | 292 | 335 | #Add family and target sequences to the new TCDB family | |
my $cmd3 = qq(cat $outFile $origInfilesDir/$tgtFileName >> $new_tcdbSeqsFile); | 293 | 336 | my $cmd3 = qq(cat $outFile $origInfilesDir/$tgtFileName >> $new_tcdbSeqsFile); | |
system $cmd3; | 294 | 337 | system $cmd3; | |
} | 295 | 338 | } | |
296 | 339 | |||
#---------------------------------------------------------------------- | 297 | 340 | #---------------------------------------------------------------------- | |
#Generate the PFam database | 298 | 341 | #Run PFam or CDD on the sequences | |
299 | 342 | |||
my $pfamD = "$rootDir/pfam"; | 300 | 343 | #Analysis with CDD | |
system "mkdir -p $pfamD" unless (-d $pfamD); | 301 | 344 | if ($domainAnalysisMode eq "cdd") { | |
345 | my $cddDir = "$rootDir/cdd"; | |||
346 | system "mkdir -p $cddDir" unless (-d $cddDir); | |||
302 | 347 | |||
my $pfamTMPfile = "$pfamD/tcdb_pfam.out"; | 303 | 348 | my $cddTMPfile = "$cddDir/tcdb_cdd.out"; | |
$pfamFile = "${pfamTMPfile}.bz2"; | 304 | 349 | $cddFile = "${cddTMPfile}.bz2"; | |
305 | 350 | |||
#run Pfam | 306 | 351 | #run cdd | |
my $cmd4 = qq (hmmscan --cpu 4 --noali --cut_ga -o /dev/null --domtblout $pfamTMPfile $ENV{RESEARCH_DATA}/pfam/pfamdb/Pfam-A.hmm $new_tcdbSeqsFile); | 307 | 352 | my $cddDB = "$ENV{RESEARCH_DATA}/DB/domainDBs/cddDB/cdd"; | |
system $cmd4 unless (-f $pfamTMPfile || -f $pfamFile); | 308 | 353 | my $ofmt = "7 qacc qlen sallacc slen evalue bitscore lengt qstart qend qcovhsp sstart send stitle"; | |
354 | my $cmd4 = qq (rpsblast -db $cddDB -query $new_tcdbSeqsFile -evalue $evalue -outfmt '${ofmt}' -out $cddTMPfile); | |||
355 | system $cmd4 unless (-f $cddTMPfile || -f $cddFile); | |||
309 | 356 | |||
#compress pfam file | 310 | 357 | #compress pfam file | |
my $cmd5 = qq(bzip2 $pfamTMPfile); | 311 | 358 | my $cmd5 = qq(bzip2 $cddTMPfile); | |
system $cmd5 unless (-f $pfamFile); | 312 | 359 | system $cmd5 unless (-f $cddFile); | |
360 | } | |||
313 | 361 | |||
362 | #Analysis with Pfam | |||
363 | else { | |||
364 | my $pfamD = "$rootDir/pfam"; | |||
365 | system "mkdir -p $pfamD" unless (-d $pfamD); | |||
314 | 366 | |||
367 | my $pfamTMPfile = "$pfamD/tcdb_pfam.out"; | |||
368 | $pfamFile = "${pfamTMPfile}.bz2"; | |||
369 | ||||
370 | #run Pfam | |||
371 | my $cmd4 = qq (hmmscan --cpu 4 --noali --cut_ga -o /dev/null --domtblout $pfamTMPfile $ENV{RESEARCH_DATA}/DB/domainDBs/xfamDB/Pfam.hmm $new_tcdbSeqsFile); | |||
372 | system $cmd4 unless (-f $pfamTMPfile || -f $pfamFile); | |||
373 | ||||
374 | #compress pfam file | |||
375 | my $cmd5 = qq(bzip2 $pfamTMPfile); | |||
376 | system $cmd5 unless (-f $pfamFile); | |||
377 | } | |||
378 | ||||
#---------------------------------------------------------------------- | 315 | 379 | #---------------------------------------------------------------------- | |
#now generate the blast DB | 316 | 380 | #now generate the blast DB | |
317 | 381 | |||
my $blastD = "$rootDir/blastdb"; | 318 | 382 | my $blastD = "$rootDir/blastdb"; | |
system "mkdir -p $blastD" unless (-d $blastD); | 319 | 383 | system "mkdir -p $blastD" unless (-d $blastD); | |
320 | 384 | |||
$blastdb = "$blastD/tcdb"; | 321 | 385 | $blastdb = "$blastD/tcdb"; | |
322 | 386 | |||
my $cmd6 = qq(extractTCDB.pl -i tcdb -o $blastD -f blast -d $new_tcdbSeqsFile); | 323 | 387 | my $cmd6 = qq(extractTCDB.pl -i tcdb -o $blastD -f blast -d $new_tcdbSeqsFile); | |
system $cmd6 unless (-f "${blastdb}.pin"); | 324 | 388 | system $cmd6 unless (-f "${blastdb}.pin"); | |
325 | 389 | |||
326 | 390 | |||
#For all purposes update the TCDB sequence file to point to the new "customized" file. | 327 | 391 | #For all purposes update the TCDB sequence file to point to the new "customized" file. | |
$tcdbSeqsFile = $new_tcdbSeqsFile; | 328 | 392 | $tcdbSeqsFile = $new_tcdbSeqsFile; | |
} | 329 | 393 | } | |
330 | 394 | |||
331 | 395 | |||
332 | 396 | |||
333 | 397 | |||
334 | 398 | |||
399 | ||||
#========================================================================== | 335 | 400 | #========================================================================== | |
#Read the -pt option. It is expected that the user provides the family to which | 336 | 401 | #Read the -pt option. It is expected that the user provides the family to which | |
#the target proteins are expected to belong. Example format should is: | 337 | 402 | #the target proteins are expected to belong. Example format: | |
# -pt {tcid_1},{file with target sequences 1}:{tcid_2},{file with target sequences 2}. | 338 | 403 | # -pt {tcid_1},{file with target sequences 1}:{tcid_2},{file with target sequences 2}. | |
# | 339 | 404 | # | |
#NOTE: This option is incompatible with -f | 340 | 405 | #NOTE: This option is incompatible with -f | |
341 | 406 | |||
sub read_proj_targets { | 342 | 407 | sub read_proj_targets { | |
343 | 408 | |||
my ($opt, $value) = @_; | 344 | 409 | my ($opt, $value) = @_; | |
345 | 410 | |||
my @pairs = split (/:/, $value); | 346 | 411 | my @pairs = split (/:/, $value); | |
die "No significant argument passed to option -pt" unless (@pairs); | 347 | 412 | die "No valid argument passed to option -pt" unless (@pairs); | |
348 | 413 | |||
foreach my $pair (@pairs) { | 349 | 414 | foreach my $pair (@pairs) { | |
my ($tc, $file) = split (/,/, $pair); | 350 | 415 | my ($tc, $file) = split (/,/, $pair); | |
die "Error: not a valid {tcid},{file} pair: $pair" unless ($tc && $file); | 351 | 416 | die "Error: not a valid {tcid},{file} pair: $pair" unless ($tc && $file); | |
352 | 417 | |||
TCDB::Assorted::validate_tcdb_id([$tc]); | 353 | 418 | TCDB::Assorted::validate_tcdb_id([$tc]); | |
354 | 419 | |||
unless (-f $file && !(-z $file)) { | 355 | 420 | unless (-f $file && !(-z $file)) { | |
die "File with projection targets for $tc was not found or empty: $file"; | 356 | 421 | die "File with projection targets for $tc was not found or empty: $file"; | |
} | 357 | 422 | } | |
358 | 423 | |||
push (@candProjProts, [$tc, $file]); | 359 | 424 | push (@candProjProts, [$tc, $file]); | |
360 | 425 | |||
} | 361 | 426 | } | |
} | 362 | 427 | } | |
363 | 428 | |||
364 | 429 | |||
365 | 430 | |||
#========================================================================== | 366 | 431 | #========================================================================== | |
#Read the -f option | 367 | 432 | #Read the -f option | |
368 | 433 | |||
sub read_fams { | 369 | 434 | sub read_fams { | |
370 | 435 | |||
my ($opt, $value) = @_; | 371 | 436 | my ($opt, $value) = @_; | |
372 | 437 | |||
@fams = split (/,/, $value); | 373 | 438 | @fams = split (/,/, $value); | |
374 | 439 | |||
TCDB::Assorted::validate_tcdb_id(\@fams); | 375 | 440 | TCDB::Assorted::validate_tcdb_id(\@fams); | |
} | 376 | 441 | } | |
377 | 442 | |||
378 | 443 | |||
#========================================================================== | 379 | 444 | #========================================================================== | |
#Read the -d option | 380 | 445 | #Read the -dam option (Domain Analysis Mode) | |
381 | 446 | |||
447 | sub read_domainAnalysisMode { | |||
448 | ||||
449 | my ($opt, $value) = @_; | |||
450 | ||||
451 | my $tmp = lc $value; | |||
452 | die "Unrecognized mode: $value" unless ($tmp =~ /(cdd|pfam)/); | |||
453 | ||||
454 | $domainAnalysisMode = $tmp; | |||
455 | } | |||
456 | ||||
457 | ||||
458 | ||||
459 | #========================================================================== | |||
460 | #Read the -o option | |||
461 | ||||
sub read_root_dir { | 382 | 462 | sub read_root_dir { | |
383 | 463 | |||
my ($opt, $value) = @_; | 384 | 464 | my ($opt, $value) = @_; | |
385 | 465 | |||
system "mkdir -p $value" unless (-d $value); | 386 | 466 | system "mkdir -p $value" unless (-d $value); | |
387 | 467 | |||
$rootDir = $value; | 388 | 468 | $rootDir = $value; | |
} | 389 | 469 | } | |
390 | 470 | |||
391 | 471 | |||
#========================================================================== | 392 | 472 | #========================================================================== | |
#Read the -s option | 393 | 473 | #Read the -s option | |
394 | 474 | |||
sub read_tcdb_seqs { | 395 | 475 | sub read_tcdb_seqs { | |
396 | 476 | |||
my ($opt, $value) = @_; | 397 | 477 | my ($opt, $value) = @_; | |
398 | 478 | |||
die "File with TCDB sequences must exist and not be empty: $value" unless (-f $value && !(-z $value)); | 399 | 479 | die "File with TCDB sequences must exist and not be empty: $value" unless (-f $value && !(-z $value)); | |
400 | 480 | |||
$tcdbSeqsFile = $value; | 401 | 481 | $tcdbSeqsFile = $value; | |
} | 402 | 482 | } | |
403 | 483 | |||
404 | 484 | |||
405 | 485 |
testCDDparser.pl
View file @
da6670c
File was created | 1 | #!/usr/bin/env perl -w | ||
2 | ||||
3 | use strict; | |||
4 | use warnings; | |||
5 | use Data::Dumper; | |||
6 | #use List::Util qw(sum); | |||
7 | ||||
8 | use TCDB::Assorted; | |||
9 | use TCDB::Domain::PfamParser; | |||
10 | use TCDB::Domain::CDDparser; | |||
11 | use TCDB::Domain::Characterize; | |||
12 | ||||
13 | use Getopt::Long; | |||
14 | ||||
15 | # | |||
16 | #Domain projections should work both with CDD and Pfam | |||
17 | # | |||
18 | ||||
19 | #========================================================================== | |||
20 | #Global variables | |||
21 | ||||
22 | #Query family or families | |||
23 | my @fams = ("2.A.123"); | |||
24 | ||||
25 | #This is an option for TCDB::Assorted::getSystemAccessions() | |||
26 | my $treatAsSuperfamily = 0; | |||
27 | ||||
28 | #Options for parsers TCDB::Domain::PfamParser and TCDB::Domain::CDDparser | |||
29 | my $domain_cov = 0.7; | |||
30 | my $prot_cov = 0.1; | |||
31 | my $evalue = 1e-5; | |||
32 | my $prop_prots_w_domain = 0.05; | |||
33 | ||||
34 | #Options for TCDB::Domain::Characterize | |||
35 | my $rootDir = "."; | |||
36 | ||||
37 | #To extract the TCIDs of reference families | |||
38 | my $tcdbSeqsFile = "$ENV{RESEARCH_DATA}/DB/domainDBs/TCDB/domainScans/tcdb.faa"; | |||
39 | my $pfamFile = "$ENV{RESEARCH_DATA}/DB/domainDBs/TCDB/domainScans/tcdb.pfam-a.hmmscan.bz2"; | |||
40 | my $cddFile = "$ENV{RESEARCH_DATA}/DB/domainDBs/TCDB/domainScans/tcdb.cdd.rpsblast.bz2"; | |||
41 | my $blastdb = "$ENV{HOME}/db/blastdb/tcdb"; | |||
42 | my $prog = "ssearch36"; | |||
43 | my @candProjProts = (); | |||
44 | my $analysisLevel = 'system'; | |||
45 | ||||
46 | ||||
47 | ||||
48 | my $tcids = getSystemAccessions($tcdbSeqsFile, 'both', $analysisLevel, \@fams, $treatAsSuperfamily); | |||
49 | #print Data::Dumper->Dump([$tcids ], [qw(*tcids)]); | |||
50 | ||||
51 |