Commit da6670c0fef4f224e47a3ee8f4a79eb5ca9396d2

Authored by Luis Arturo Medrano-Soto
1 parent 9f332056e8
Exists in master

updated getDomainTopology to work with CDD and rpsblast

Showing 2 changed files with 218 additions and 43 deletions Inline Diff

getDomainTopology.pl View file @ da6670c
#!/usr/bin/env perl -w 1 1 #!/usr/bin/env perl -w
2 2
use strict; 3 3 use strict;
use warnings; 4 4 use warnings;
use Data::Dumper; 5 5 use Data::Dumper;
#use List::Util qw(sum); 6 6 #use List::Util qw(sum);
7 7
use TCDB::Assorted; 8 8 use TCDB::Assorted;
use TCDB::Domain::PfamParser; 9 9 use TCDB::Domain::PfamParser;
10 use TCDB::Domain::CDDparser;
use TCDB::Domain::Characterize; 10 11 use TCDB::Domain::Characterize;
11 12
use Getopt::Long; 12 13 use Getopt::Long;
13 14
15 #
16 #Domain projections should work both with CDD and Pfam
17 #
18
#========================================================================== 14 19 #==========================================================================
#Global variables 15 20 #Global variables
16 21
#Query family or families 17 22 #Query family or families
my @fams = (); 18 23 my @fams = ();
19 24
#This is an option for TCDB::Assorted::getSystemAccessions() 20 25 #This is an option for TCDB::Assorted::getSystemAccessions()
my $treatAsSuperfamily = 0; 21 26 my $treatAsSuperfamily = 0;
22 27
#Options for TCDB::Domain::PfamParser 23 28 #Options for TCDB::Domain::PfamParser
my $domain_cov = 0.7; 24 29 my $domain_cov = 0.7;
my $prot_cov = 0.1; 25 30 my $prot_cov = 0.1;
my $evalue = 1e-5; 26 31 my $evalue = 1e-3;
my $prop_prots_w_domain = 0.05; 27 32 my $prop_prots_w_domain = 0.05;
28 33
34 my $domainAnalysisMode = "cdd"; # cdd or pfam
35
#Options for TCDB::Domain::Characterize 29 36 #Options for TCDB::Domain::Characterize
my $rootDir = "."; 30 37 my $rootDir = ".";
31 38
#To extract the TCIDs of reference families 32 39 #To extract the TCIDs of reference families
my $tcdbSeqsFile = "$ENV{RESEARCH_DATA}/pfam/download/tcdb.faa"; 33 40 my $tcdbSeqsFile = "$ENV{RESEARCH_DATA}/DB/domainDBs/TCDB/domainScans/tcdb.faa";
my $pfamFile = "$ENV{RESEARCH_DATA}/pfam/tcdb.pfam-a.hmmscan.bz2"; 34 41 my $pfamFile = "";
42 my $cddFile = "";
my $blastdb = "$ENV{HOME}/db/blastdb/tcdb"; 35 43 my $blastdb = "$ENV{HOME}/db/blastdb/tcdb";
my $prog = "ssearch36"; 36 44 my $prog = "ssearch36";
my @candProjProts = (); 37 45 my @candProjProts = ();
my $analysisLevel = 'system'; 38 46 my $analysisLevel = 'system';
39 47
40 48
41 49
#Read command line topology 42 50 #Read command line topology
read_command_line_arguments(); 43 51 read_command_line_arguments();
44 52
45 53
die "TCDB sequences file not found or empty --> $tcdbSeqsFile\n" unless (-f $tcdbSeqsFile && !(-z $tcdbSeqsFile)); 46 54 #Input files validation.
die "TCDB hmmscan output file not found --> $pfamFile\n" unless (-f $pfamFile && !(-z $pfamFile)); 47 55 die "TCDB sequences file not found or empty --> $tcdbSeqsFile." unless (-f $tcdbSeqsFile && !(-z $tcdbSeqsFile));
48 56
57 if ($domainAnalysisMode eq "cdd") {
58 die "TCDB CDD rpsblast output file not found: $cddFile" unless (-f $cddFile && !(-z $cddFile));
59 }
60 else {
61 die "TCDB hmmscan output file not found: $pfamFile" unless (-f $pfamFile && !(-z $pfamFile));
62 }
49 63
#print Data::Dumper->Dump([\@fams, $treatAsSuperfamily, $rootDir, $tcdbSeqsFile, $pfamFile, $blastdb, $prog, $domain_cov, 50 64
65
66
67 #print Data::Dumper->Dump([$domainAnalysisMode, \@fams, $treatAsSuperfamily, $rootDir, $tcdbSeqsFile, $pfamFile, $cddFile, $blastdb, $prog, $domain_cov,
# $prot_cov, $evalue, $prop_prots_w_domain, \@candProjProts], 51 68 # $prot_cov, $evalue, $prop_prots_w_domain, \@candProjProts],
# [qw(*fams *treatAsSuperfamily *rootDir *tcdbSeqsFile *pfamFile *blastdb *prog *domain_cov 52 69 # [qw(*domainAnalysisMode *fams *treatAsSuperfamily *rootDir *tcdbSeqsFile *pfamFile *cddFile *blastdb *prog *domain_cov
# *prot_cov *evalue *prop_prots_w_domain *candProjProts)]); 53 70 # *prot_cov *evalue *prop_prots_w_domain *candProjProts)]);
#exit; 54 71 #exit;
55 72
56 73
#========================================================================== 57 74 #==========================================================================
#Split tcdb systems into single-component multi-component 58 75 #Split tcdb systems into single-component multi-component
59 76
60 77
if ($treatAsSuperfamily) { 61 78 if ($treatAsSuperfamily) {
62 79
my $tcids = getSystemAccessions($tcdbSeqsFile, 'both', $analysisLevel, \@fams, $treatAsSuperfamily); 63 80 my $tcids = getSystemAccessions($tcdbSeqsFile, 'both', $analysisLevel, \@fams, $treatAsSuperfamily);
64 81
# print Data::Dumper->Dump([$tcids ], [qw(*tcids )]); 65 82 #print Data::Dumper->Dump([$tcids ], [qw(*tcids )]);
# exit; 66 83 #exit;
67 84
68
69
#========================================================================== 70 85 #==========================================================================
#Setup the thresholds for parsing the PFAM output 71 86 #Setup the thresholds for parsing the PFAM output
72 87
88 my $obj = "";
89 if ($domainAnalysisMode eq "cdd") {
90 $obj = new TCDB::Domain::CDDparser();
91 $obj->cddFile($cddFile);
92 }
93 else {
94 $obj = new TCDB::Domain::PfamParser();
95 $obj->pfamFile($pfamFile);
96 }
73 97
my $obj = new TCDB::Domain::PfamParser(); 74
$obj->pfamFile($pfamFile); 75
$obj->analysisLevel($analysisLevel); 76 98 $obj->analysisLevel($analysisLevel);
$obj->domCovCutoff($domain_cov); 77 99 $obj->domCovCutoff($domain_cov);
$obj->tcCovCutoff($prot_cov); 78 100 $obj->tcCovCutoff($prot_cov);
$obj->evalueCutoff($evalue); 79 101 $obj->evalueCutoff($evalue);
$obj->minProtsDom($prop_prots_w_domain); 80 102 $obj->minProtsDom($prop_prots_w_domain);
$obj->treatAsSuperfamily($treatAsSuperfamily); 81 103 $obj->treatAsSuperfamily($treatAsSuperfamily);
82 104
83 105
my %domFreq = (); 84 106 my %domFreq = ();
my %domCoords = (); 85 107 my %domCoords = ();
$obj->getDomainStatsForUserFamilies(\@fams, $tcids, \%domFreq, \%domCoords); 86 108 $obj->getDomainStatsForUserFamilies(\@fams, $tcids, \%domFreq, \%domCoords);
87 109
# print Data::Dumper->Dump([ \%domFreq, \%domCoords ], [qw( *domFreq *domCoords )]); 88 110 # print Data::Dumper->Dump([ \%domFreq, \%domCoords ], [qw( *domFreq *domCoords )]);
# exit; 89 111 # exit;
90 112
91 113
92 114
93 115
94 116
#========================================================================== 95 117 #==========================================================================
#Attempt to rescue the domains that were not recognized by PFAM in some 96 118 #Attempt to rescue the domains that were not recognized by PFAM in some
#Family members 97 119 #Family members
98 120
99 121
my $rescueObj = new TCDB::Domain::Characterize(); 100 122 my $rescueObj = new TCDB::Domain::Characterize();
$rescueObj->rootDir($rootDir); 101 123 $rescueObj->rootDir($rootDir);
$rescueObj->tcdbFaa($tcdbSeqsFile); 102 124 $rescueObj->tcdbFaa($tcdbSeqsFile);
$rescueObj->domCoords(\%domCoords); 103 125 $rescueObj->domCoords(\%domCoords);
$rescueObj->domFreq(\%domFreq); 104 126 $rescueObj->domFreq(\%domFreq);
$rescueObj->tcids($tcids); 105 127 $rescueObj->tcids($tcids);
$rescueObj->searchWith($prog); 106 128 $rescueObj->searchWith($prog);
$rescueObj->blastdb($blastdb); 107 129 $rescueObj->blastdb($blastdb);
$rescueObj->evalue($evalue); 108 130 $rescueObj->evalue($evalue);
$rescueObj->treatAsSuperfamily($treatAsSuperfamily); 109 131 $rescueObj->treatAsSuperfamily($treatAsSuperfamily);
110 132
111 133
$rescueObj->rescueDomains(\@fams); 112 134 $rescueObj->rescueDomains(\@fams);
113 135
} 114 136 }
else { 115 137 else {
116 138
foreach my $fam (@fams) { 117 139 foreach my $fam (@fams) {
118 140
my $tcids = getSystemAccessions($tcdbSeqsFile, 'both', $analysisLevel, [$fam], $treatAsSuperfamily); 119 141 my $tcids = getSystemAccessions($tcdbSeqsFile, 'both', $analysisLevel, [$fam], $treatAsSuperfamily);
120 142
# print Data::Dumper->Dump([$tcids ], [qw( *tcids )]); 121 143 # print Data::Dumper->Dump([$tcids ], [qw( *tcids )]);
# exit; 122 144 # exit;
123 145
124 146
125 147
#========================================================================== 126 148 #==========================================================================
#Setup the thresholds for parsing the PFAM output 127 149 #Setup the thresholds for parsing the PFAM output
128 150
129 151
my $obj = new TCDB::Domain::PfamParser(); 130 152 my $obj = "";
$obj->pfamFile($pfamFile); 131 153 if ($domainAnalysisMode eq "cdd") {
154 $obj = new TCDB::Domain::CDDparser();
155 $obj->cddInFile($cddFile);
156 }
157 else {
158 $obj = new TCDB::Domain::PfamParser();
159 $obj->pfamFile($pfamFile);
160 }
161
$obj->analysisLevel($analysisLevel); 132 162 $obj->analysisLevel($analysisLevel);
$obj->domCovCutoff($domain_cov); 133 163 $obj->domCovCutoff($domain_cov);
$obj->tcCovCutoff($prot_cov); 134 164 $obj->tcCovCutoff($prot_cov);
$obj->evalueCutoff($evalue); 135 165 $obj->evalueCutoff($evalue);
$obj->minProtsDom($prop_prots_w_domain); 136 166 $obj->minProtsDom($prop_prots_w_domain);
137 167
138 168
my %domFreq = (); 139 169 my %domFreq = ();
my %domCoords = (); 140 170 my %domCoords = ();
$obj->getDomainStatsForUserFamilies([], $tcids, \%domFreq, \%domCoords); 141 171 $obj->getDomainStatsForUserFamilies([], $tcids, \%domFreq, \%domCoords);
142 172
#print Data::Dumper->Dump([ \%domFreq, \%domCoords ], [qw( *domFreq *domCoords )]); 143 173 # print Data::Dumper->Dump([ \%domFreq, \%domCoords ], [qw( *domFreq *domCoords )]);
#exit; 144 174 # exit;
145 175
146 176
147 177
148 178
149 179
#========================================================================== 150 180 #==========================================================================
#Attempt to rescue the domains that were not recognized by PFAM in some 151 181 #Attempt to rescue the domains that were not recognized by PFAM/CDD in some
#Family members 152 182 #Family members
153 183
154 184
my $rescueObj = new TCDB::Domain::Characterize(); 155 185 my $rescueObj = new TCDB::Domain::Characterize();
$rescueObj->rootDir($rootDir); 156 186 $rescueObj->rootDir($rootDir);
$rescueObj->tcdbFaa($tcdbSeqsFile); 157 187 $rescueObj->tcdbFaa($tcdbSeqsFile);
$rescueObj->domCoords(\%domCoords); 158 188 $rescueObj->domCoords(\%domCoords);
$rescueObj->domFreq(\%domFreq); 159 189 $rescueObj->domFreq(\%domFreq);
$rescueObj->tcids($tcids); 160 190 $rescueObj->tcids($tcids);
$rescueObj->searchWith($prog); 161 191 $rescueObj->searchWith($prog);
$rescueObj->blastdb($blastdb); 162 192 $rescueObj->blastdb($blastdb);
$rescueObj->evalue($evalue); 163 193 $rescueObj->evalue($evalue);
$rescueObj->domCovCutoff($domain_cov); 164 194 $rescueObj->domCovCutoff($domain_cov);
$rescueObj->treatAsSuperfamily($treatAsSuperfamily); 165 195 $rescueObj->treatAsSuperfamily($treatAsSuperfamily);
166 196
$rescueObj->rescueDomains(); 167 197 $rescueObj->rescueDomains();
} 168 198 }
} 169 199 }
170 200
171 201
172 202
173 203
########################################################################### 174 204 ###########################################################################
## Functions ## 175 205 ## Functions ##
########################################################################### 176 206 ###########################################################################
177 207
178 208
179 209
sub read_command_line_arguments { 180 210 sub read_command_line_arguments {
181 211
#if no arguments are given print the help 182 212 #if no arguments are given print the help
if (! @ARGV) { 183 213 if (! @ARGV) {
print_help(); 184 214 print_help();
} 185 215 }
186 216
#---------------------------------------------------------------------- 187 217 #----------------------------------------------------------------------
#Parse command line arguments 188 218 #Parse command line arguments
189 219
my $status = GetOptions( 190 220 my $status = GetOptions(
"f|family=s" => \&read_fams, #TCIDs of families to analyze (comma separated) 191 221 "f|family=s" => \&read_fams, #TCIDs of families to analyze (comma separated)
222 "dam|domain-analysis-mode=s" => \&read_domainAnalysisMode, #Perform the analysis based on CDD or Pfam domains
192 223
#Options for TCDB::Domain::PfamParser 193 224 #Options for TCDB::Domain::PfamParser and TCDB::Domain::CDDparser
"dc|domain-cov=f" => \$domain_cov, 194 225 "dc|domain-cov=f" => \$domain_cov,
"pc|protein-cov=f" => \$prot_cov, 195 226 "pc|protein-cov=f" => \$prot_cov,
"e|evalue=f" => \$evalue, 196 227 "e|evalue=f" => \$evalue,
"m|prots-w-domain=f" => \$prop_prots_w_domain, 197 228 "m|prots-w-domain=f" => \$prop_prots_w_domain,
198 229
#Options for TCDB::Domain::Characterize 199 230 #Options for TCDB::Domain::Characterize
"pt|proj-targets=s" => \&read_proj_targets, #Target Proteins, NOT in TCDB, to project domains onto 200 231 "pt|proj-targets=s" => \&read_proj_targets, #Target Proteins, NOT in TCDB, to project domains onto
"o|outdir=s" => \&read_root_dir, #Output root directory 201 232 "o|outdir=s" => \&read_root_dir, #Output root directory
"s|tcdb-seqs=s" => \&read_tcdb_seqs, #File with all sequences in TCDB 202 233 "s|tcdb-seqs=s" => \&read_tcdb_seqs, #File with all sequences in TCDB
"sf|superfamily!" => \$treatAsSuperfamily, #File with the sequences of the reference family 203 234 "sf|superfamily!" => \$treatAsSuperfamily, #File with the sequences of the reference family
235 "cdd=s" => \&read_cdd, #rpsblast output file for whole TCDB
"pfam=s" => \&read_pfam, #hmmscan output file for whole TCDB 204 236 "pfam=s" => \&read_pfam, #hmmscan output file for whole TCDB
"b|blastdb=s" => \&read_blastdb, #Full path of blastdb to extract sequences 205 237 "b|blastdb=s" => \&read_blastdb, #Full path of blastdb to extract sequences
"p|rescue-prog=s" => \&read_prog, #Read the program that will be used to rescue domains (blastp|ssearch36) 206 238 "p|rescue-prog=s" => \&read_prog, #Read the program that will be used to rescue domains (blastp|ssearch36)
"h|help" => sub { print_help(); }, 207 239 "h|help" => sub { print_help(); },
208 240
#For arguments that do not look like valid options 209 241 #For arguments that do not look like valid options
"<>" => sub { die "Error: Unknown argument: $_[0]\n"; } 210 242 "<>" => sub { die "Error: Unknown argument: $_[0]\n"; }
); 211 243 );
die "\n" unless ($status); 212 244 die "\n" unless ($status);
213 245
#---------------------------------------------------------------------- 214 246 #----------------------------------------------------------------------
#Validate command line arguments 215 247 #Validate command line arguments
216 248
249 if ($domainAnalysisMode eq "cdd") {
250 $cddFile = "$ENV{RESEARCH_DATA}/DB/domainDBs/TCDB/domainScans/tcdb.cdd.rpsblast.bz2" unless ($cddFile);
217 251
die "Error: Options -f and -pt are incompatible" if (@fams && @candProjProts); 218 252 die "Error: the use of option -pfam with [-dam cdd] is incompatible" if ($pfamFile);
253 }
254 else {
255 $pfamFile = "$ENV{RESEARCH_DATA}/DB/domainDBs/TCDB/domainScans/tcdb.pfam-a.hmmscan.bz2" unless ($pfamFile);
256
257 die "Error: the use of option -cdd with [-dam pfam] is incompatible" if ($cddFile);
258 }
259
260 die "Error: options -f and -pt are incompatible" if (@fams && @candProjProts);
die "Error: either -f or -pt must be given" unless (@fams || @candProjProts); 219 261 die "Error: either -f or -pt must be given" unless (@fams || @candProjProts);
220 262
263
if (@candProjProts) { 221 264 if (@candProjProts) {
prepare_seqs_for_projection(); 222 265 prepare_seqs_for_projection();
} 223 266 }
224 267
} 225 268 }
226 269
227 270
#========================================================================== 228 271 #==========================================================================
#Setup the environment for projection of domains onto sequences that are 229 272 #Setup the environment for projection of domains onto sequences that are
#not present in TCDB. 230 273 #not present in TCDB.
231 274
sub prepare_seqs_for_projection { 232 275 sub prepare_seqs_for_projection {
233 276
my $tcdbDir = "$rootDir/tcdb"; 234 277 my $tcdbDir = "$rootDir/tcdb";
system "mkdir -p $tcdbDir" unless (-d $tcdbDir); 235 278 system "mkdir -p $tcdbDir" unless (-d $tcdbDir);
236 279
#to prevent modifying the original files, here I'll save the input 237 280 #to prevent modifying the original files, here I'll save the input
#sequences with the artificial TCIDs 238 281 #sequences with the artificial TCIDs
my $origInfilesDir = "$rootDir/inputFiles"; 239 282 my $origInfilesDir = "$rootDir/inputFiles";
system "mkdir -p $origInfilesDir" unless (-d $origInfilesDir); 240 283 system "mkdir -p $origInfilesDir" unless (-d $origInfilesDir);
241 284
242 285
#generate an empty "TCDB sequence file" that will contains proteins not in TCDB 243 286 #generate an empty "TCDB sequence file" that will contain proteins not in TCDB
my $new_tcdbSeqsFile = "$tcdbDir/tcdb.faa"; 244 287 my $new_tcdbSeqsFile = "$tcdbDir/tcdb.faa";
system "cat /dev/null > $new_tcdbSeqsFile"; 245 288 system "cat /dev/null > $new_tcdbSeqsFile";
246 289
247 290
#---------------------------------------------------------------------- 248 291 #----------------------------------------------------------------------
#generate the new TCDB database relevant for the projection 249 292 #generate the new TCDB database relevant for the projection
250 293
foreach my $pair (@candProjProts) { 251 294 foreach my $pair (@candProjProts) {
252 295
my $tcid = $pair->[0]; 253 296 my $tcid = $pair->[0];
my $tgtF = $pair->[1]; 254 297 my $tgtF = $pair->[1];
255 298
my @comp = split(/\//, $tgtF); 256 299 my @comp = split(/\//, $tgtF);
my $tgtFileName = $comp[-1]; 257 300 my $tgtFileName = $comp[-1];
258 301
#Add family to the main array (as if provided by the -f commandline option) 259 302 #Add family to the main array (as if provided by the -f commandline option)
push (@fams, $tcid); 260 303 push (@fams, $tcid);
261 304
262 305
#extracts the tcids of the systems under reference family 263 306 #extracts the tcids of the systems under reference family
# my $tcdbSeqs = $tcdbSeqsFile; #"$ENV{HOME}/db/blastdb/tcdb.faa"; 264 307 # my $tcdbSeqs = $tcdbSeqsFile; #"$ENV{HOME}/db/blastdb/tcdb.faa";
# die "TCDB sequences not found: $tcdbSeqs" unless (-f $tcdbSeqs); 265 308 # die "TCDB sequences not found: $tcdbSeqs" unless (-f $tcdbSeqs);
266 309
my $sysHash = TCDB::Assorted::getSystemAccessions($tcdbSeqsFile, 'both', 'system', [$tcid], 0); 267 310 my $sysHash = TCDB::Assorted::getSystemAccessions($tcdbSeqsFile, 'both', 'system', [$tcid], 0);
268 311
# print Data::Dumper->Dump([$sysHash, $tcdbSeqs], [qw(*sysHash *tcdbSeqs)]); 269 312 # print Data::Dumper->Dump([$sysHash, $tcdbSeqs], [qw(*sysHash *tcdbSeqs)]);
# <STDIN>; 270 313 # <STDIN>;
271 314
#determine the TCID that will be used as reference for the target sequences 272 315 #determine the TCID that will be used as reference for the target sequences
my @systems = @{ $sysHash->{$tcid} }; 273 316 my @systems = @{ $sysHash->{$tcid} };
die "Could not find TCIDs for $tcid in $tcdbSeqsFile" unless (@systems); 274 317 die "Could not find TCIDs for $tcid in $tcdbSeqsFile" unless (@systems);
275 318
my $tgtTC = $systems[-1]->[0]; 276 319 my $tgtTC = $systems[-1]->[0];
$tgtTC =~ s/\.\d+$/\.10000/; 277 320 $tgtTC =~ s/\.\d+$/\.10000/;
278 321
279 322
#Replace the TCID in the file corresponding to the target proteins 280 323 #Replace the TCID in the file corresponding to the target proteins
my $cmd1 = qq(perl -pe 's/\\>([a-zA-Z0-9_-]+).*/\\>${tgtTC}-\$1/;' $tgtF > $origInfilesDir/$tgtFileName); 281 324 my $cmd1 = qq(perl -pe 's/\\>([a-zA-Z0-9_-]+).*/\\>${tgtTC}-\$1/;' $tgtF > $origInfilesDir/$tgtFileName);
system $cmd1 unless (-f "$origInfilesDir/$tgtFileName"); 282 325 system $cmd1 unless (-f "$origInfilesDir/$tgtFileName");
283 326
284 327
#Extract sequences for reference family 285 328 #Extract sequences for reference family
my $outFile = "$tcdbDir/tcdb-${tcid}.faa"; 286 329 my $outFile = "$tcdbDir/tcdb-${tcid}.faa";
my $cmd2 = qq(extractTCDB.pl -i $tcid -o $tcdbDir -d $tcdbSeqsFile); 287 330 my $cmd2 = qq(extractTCDB.pl -i $tcid -o $tcdbDir -d $tcdbSeqsFile);
system $cmd2 unless (-f $outFile); 288 331 system $cmd2 unless (-f $outFile);
die "Could not generate sequence file: $outFile" unless (-f $outFile); 289 332 die "Could not generate sequence file: $outFile" unless (-f $outFile);
290 333
291 334
#Add family and target sequences to the new TCDB family 292 335 #Add family and target sequences to the new TCDB family
my $cmd3 = qq(cat $outFile $origInfilesDir/$tgtFileName >> $new_tcdbSeqsFile); 293 336 my $cmd3 = qq(cat $outFile $origInfilesDir/$tgtFileName >> $new_tcdbSeqsFile);
system $cmd3; 294 337 system $cmd3;
} 295 338 }
296 339
#---------------------------------------------------------------------- 297 340 #----------------------------------------------------------------------
#Generate the PFam database 298 341 #Run PFam or CDD on the sequences
299 342
my $pfamD = "$rootDir/pfam"; 300 343 #Analysis with CDD
system "mkdir -p $pfamD" unless (-d $pfamD); 301 344 if ($domainAnalysisMode eq "cdd") {
345 my $cddDir = "$rootDir/cdd";
346 system "mkdir -p $cddDir" unless (-d $cddDir);
302 347
my $pfamTMPfile = "$pfamD/tcdb_pfam.out"; 303 348 my $cddTMPfile = "$cddDir/tcdb_cdd.out";
$pfamFile = "${pfamTMPfile}.bz2"; 304 349 $cddFile = "${cddTMPfile}.bz2";
305 350
#run Pfam 306 351 #run cdd
my $cmd4 = qq (hmmscan --cpu 4 --noali --cut_ga -o /dev/null --domtblout $pfamTMPfile $ENV{RESEARCH_DATA}/pfam/pfamdb/Pfam-A.hmm $new_tcdbSeqsFile); 307 352 my $cddDB = "$ENV{RESEARCH_DATA}/DB/domainDBs/cddDB/cdd";
system $cmd4 unless (-f $pfamTMPfile || -f $pfamFile); 308 353 my $ofmt = "7 qacc qlen sallacc slen evalue bitscore lengt qstart qend qcovhsp sstart send stitle";
354 my $cmd4 = qq (rpsblast -db $cddDB -query $new_tcdbSeqsFile -evalue $evalue -outfmt '${ofmt}' -out $cddTMPfile);
355 system $cmd4 unless (-f $cddTMPfile || -f $cddFile);
309 356
#compress pfam file 310 357 #compress pfam file
my $cmd5 = qq(bzip2 $pfamTMPfile); 311 358 my $cmd5 = qq(bzip2 $cddTMPfile);
system $cmd5 unless (-f $pfamFile); 312 359 system $cmd5 unless (-f $cddFile);
360 }
313 361
362 #Analysis with Pfam
363 else {
364 my $pfamD = "$rootDir/pfam";
365 system "mkdir -p $pfamD" unless (-d $pfamD);
314 366
367 my $pfamTMPfile = "$pfamD/tcdb_pfam.out";
368 $pfamFile = "${pfamTMPfile}.bz2";
369
370 #run Pfam
371 my $cmd4 = qq (hmmscan --cpu 4 --noali --cut_ga -o /dev/null --domtblout $pfamTMPfile $ENV{RESEARCH_DATA}/DB/domainDBs/xfamDB/Pfam.hmm $new_tcdbSeqsFile);
372 system $cmd4 unless (-f $pfamTMPfile || -f $pfamFile);
373
374 #compress pfam file
375 my $cmd5 = qq(bzip2 $pfamTMPfile);
376 system $cmd5 unless (-f $pfamFile);
377 }
378
#---------------------------------------------------------------------- 315 379 #----------------------------------------------------------------------
#now generate the blast DB 316 380 #now generate the blast DB
317 381
my $blastD = "$rootDir/blastdb"; 318 382 my $blastD = "$rootDir/blastdb";
system "mkdir -p $blastD" unless (-d $blastD); 319 383 system "mkdir -p $blastD" unless (-d $blastD);
320 384
$blastdb = "$blastD/tcdb"; 321 385 $blastdb = "$blastD/tcdb";
322 386
my $cmd6 = qq(extractTCDB.pl -i tcdb -o $blastD -f blast -d $new_tcdbSeqsFile); 323 387 my $cmd6 = qq(extractTCDB.pl -i tcdb -o $blastD -f blast -d $new_tcdbSeqsFile);
system $cmd6 unless (-f "${blastdb}.pin"); 324 388 system $cmd6 unless (-f "${blastdb}.pin");
325 389
326 390
#For all purposes update the TCDB sequence file to point to the new "customized" file. 327 391 #For all purposes update the TCDB sequence file to point to the new "customized" file.
$tcdbSeqsFile = $new_tcdbSeqsFile; 328 392 $tcdbSeqsFile = $new_tcdbSeqsFile;
} 329 393 }
330 394
331 395
332 396
333 397
334 398
399
#========================================================================== 335 400 #==========================================================================
#Read the -pt option. It is expected that the user provides the family to which 336 401 #Read the -pt option. It is expected that the user provides the family to which
#the target proteins are expected to belong. Example format should is: 337 402 #the target proteins are expected to belong. Example format:
# -pt {tcid_1},{file with target sequences 1}:{tcid_2},{file with target sequences 2}. 338 403 # -pt {tcid_1},{file with target sequences 1}:{tcid_2},{file with target sequences 2}.
# 339 404 #
#NOTE: This option is incompatible with -f 340 405 #NOTE: This option is incompatible with -f
341 406
sub read_proj_targets { 342 407 sub read_proj_targets {
343 408
my ($opt, $value) = @_; 344 409 my ($opt, $value) = @_;
345 410
my @pairs = split (/:/, $value); 346 411 my @pairs = split (/:/, $value);
die "No significant argument passed to option -pt" unless (@pairs); 347 412 die "No valid argument passed to option -pt" unless (@pairs);
348 413
foreach my $pair (@pairs) { 349 414 foreach my $pair (@pairs) {
my ($tc, $file) = split (/,/, $pair); 350 415 my ($tc, $file) = split (/,/, $pair);
die "Error: not a valid {tcid},{file} pair: $pair" unless ($tc && $file); 351 416 die "Error: not a valid {tcid},{file} pair: $pair" unless ($tc && $file);
352 417
TCDB::Assorted::validate_tcdb_id([$tc]); 353 418 TCDB::Assorted::validate_tcdb_id([$tc]);
354 419
unless (-f $file && !(-z $file)) { 355 420 unless (-f $file && !(-z $file)) {
die "File with projection targets for $tc was not found or empty: $file"; 356 421 die "File with projection targets for $tc was not found or empty: $file";
} 357 422 }
358 423
push (@candProjProts, [$tc, $file]); 359 424 push (@candProjProts, [$tc, $file]);
360 425
} 361 426 }
} 362 427 }
363 428
364 429
365 430
#========================================================================== 366 431 #==========================================================================
#Read the -f option 367 432 #Read the -f option
368 433
sub read_fams { 369 434 sub read_fams {
370 435
my ($opt, $value) = @_; 371 436 my ($opt, $value) = @_;
372 437
@fams = split (/,/, $value); 373 438 @fams = split (/,/, $value);
374 439
TCDB::Assorted::validate_tcdb_id(\@fams); 375 440 TCDB::Assorted::validate_tcdb_id(\@fams);
} 376 441 }
377 442
378 443
#========================================================================== 379 444 #==========================================================================
#Read the -d option 380 445 #Read the -dam option (Domain Analysis Mode)
381 446
447 sub read_domainAnalysisMode {
448
449 my ($opt, $value) = @_;
450
451 my $tmp = lc $value;
452 die "Unrecognized mode: $value" unless ($tmp =~ /(cdd|pfam)/);
453
454 $domainAnalysisMode = $tmp;
455 }
456
457
458
459 #==========================================================================
460 #Read the -o option
461
sub read_root_dir { 382 462 sub read_root_dir {
383 463
my ($opt, $value) = @_; 384 464 my ($opt, $value) = @_;
385 465
system "mkdir -p $value" unless (-d $value); 386 466 system "mkdir -p $value" unless (-d $value);
387 467
$rootDir = $value; 388 468 $rootDir = $value;
} 389 469 }
390 470
391 471
#========================================================================== 392 472 #==========================================================================
#Read the -s option 393 473 #Read the -s option
394 474
sub read_tcdb_seqs { 395 475 sub read_tcdb_seqs {
396 476
my ($opt, $value) = @_; 397 477 my ($opt, $value) = @_;
398 478
die "File with TCDB sequences must exist and not be empty: $value" unless (-f $value && !(-z $value)); 399 479 die "File with TCDB sequences must exist and not be empty: $value" unless (-f $value && !(-z $value));
400 480
$tcdbSeqsFile = $value; 401 481 $tcdbSeqsFile = $value;
} 402 482 }
403 483
404 484
405 485
testCDDparser.pl View file @ da6670c
File was created 1 #!/usr/bin/env perl -w
2
3 use strict;
4 use warnings;
5 use Data::Dumper;
6 #use List::Util qw(sum);
7
8 use TCDB::Assorted;
9 use TCDB::Domain::PfamParser;
10 use TCDB::Domain::CDDparser;
11 use TCDB::Domain::Characterize;
12
13 use Getopt::Long;
14
15 #
16 #Domain projections should work both with CDD and Pfam
17 #
18
19 #==========================================================================
20 #Global variables
21
22 #Query family or families
23 my @fams = ("2.A.123");
24
25 #This is an option for TCDB::Assorted::getSystemAccessions()
26 my $treatAsSuperfamily = 0;
27
28 #Options for parsers TCDB::Domain::PfamParser and TCDB::Domain::CDDparser
29 my $domain_cov = 0.7;
30 my $prot_cov = 0.1;
31 my $evalue = 1e-5;
32 my $prop_prots_w_domain = 0.05;
33
34 #Options for TCDB::Domain::Characterize
35 my $rootDir = ".";
36
37 #To extract the TCIDs of reference families
38 my $tcdbSeqsFile = "$ENV{RESEARCH_DATA}/DB/domainDBs/TCDB/domainScans/tcdb.faa";
39 my $pfamFile = "$ENV{RESEARCH_DATA}/DB/domainDBs/TCDB/domainScans/tcdb.pfam-a.hmmscan.bz2";
40 my $cddFile = "$ENV{RESEARCH_DATA}/DB/domainDBs/TCDB/domainScans/tcdb.cdd.rpsblast.bz2";
41 my $blastdb = "$ENV{HOME}/db/blastdb/tcdb";
42 my $prog = "ssearch36";
43 my @candProjProts = ();
44 my $analysisLevel = 'system';
45
46
47
48 my $tcids = getSystemAccessions($tcdbSeqsFile, 'both', $analysisLevel, \@fams, $treatAsSuperfamily);
49 #print Data::Dumper->Dump([$tcids ], [qw(*tcids)]);
50
51