Commit da6670c0fef4f224e47a3ee8f4a79eb5ca9396d2

Authored by Luis Arturo Medrano-Soto
1 parent 9f332056e8
Exists in master

updated getDomainTopology to work with CDD and rpsblast

Showing 2 changed files with 218 additions and 43 deletions Side-by-side Diff

getDomainTopology.pl View file @ da6670c
... ... @@ -7,10 +7,15 @@
7 7  
8 8 use TCDB::Assorted;
9 9 use TCDB::Domain::PfamParser;
  10 +use TCDB::Domain::CDDparser;
10 11 use TCDB::Domain::Characterize;
11 12  
12 13 use Getopt::Long;
13 14  
  15 +#
  16 +#Domain projections should work both with CDD and Pfam
  17 +#
  18 +
14 19 #==========================================================================
15 20 #Global variables
16 21  
17 22  
18 23  
... ... @@ -23,15 +28,18 @@
23 28 #Options for TCDB::Domain::PfamParser
24 29 my $domain_cov = 0.7;
25 30 my $prot_cov = 0.1;
26   -my $evalue = 1e-5;
  31 +my $evalue = 1e-3;
27 32 my $prop_prots_w_domain = 0.05;
28 33  
  34 +my $domainAnalysisMode = "cdd"; # cdd or pfam
  35 +
29 36 #Options for TCDB::Domain::Characterize
30 37 my $rootDir = ".";
31 38  
32 39 #To extract the TCIDs of refernece families
33   -my $tcdbSeqsFile = "$ENV{RESEARCH_DATA}/pfam/download/tcdb.faa";
34   -my $pfamFile = "$ENV{RESEARCH_DATA}/pfam/tcdb.pfam-a.hmmscan.bz2";
  40 +my $tcdbSeqsFile = "$ENV{RESEARCH_DATA}/DB/domainDBs/TCDB/domainScans/tcdb.faa";
  41 +my $pfamFile = "";
  42 +my $cddFile = "";
35 43 my $blastdb = "$ENV{HOME}/db/blastdb/tcdb";
36 44 my $prog = "ssearch36";
37 45 my @candProjProts = ();
38 46  
39 47  
40 48  
... ... @@ -43,13 +51,22 @@
43 51 read_command_line_arguments();
44 52  
45 53  
46   -die "TCDB sequences file not found or empty --> $tcdbSeqsFile\n" unless (-f $tcdbSeqsFile && !(-z $tcdbSeqsFile));
47   -die "TCDB hmmscan output file not found --> $pfamFile\n" unless (-f $pfamFile && !(-z $pfamFile));
  54 +#Input files validation.
  55 +die "TCDB sequences file not found or empty --> $tcdbSeqsFile." unless (-f $tcdbSeqsFile && !(-z $tcdbSeqsFile));
48 56  
  57 +if ($domainAnalysisMode eq "cdd") {
  58 + die "TCDB CDD rpsblast output file not found: $cddFile" unless (-f $cddFile && !(-z $cddFile));
  59 +}
  60 +else {
  61 + die "TCDB hmmscan output file not found: $pfamFile" unless (-f $pfamFile && !(-z $pfamFile));
  62 +}
49 63  
50   -#print Data::Dumper->Dump([\@fams, $treatAsSuperfamily, $rootDir, $tcdbSeqsFile, $pfamFile, $blastdb, $prog, $domain_cov,
  64 +
  65 +
  66 +
  67 +#print Data::Dumper->Dump([$domainAnalysisMode, \@fams, $treatAsSuperfamily, $rootDir, $tcdbSeqsFile, $pfamFile, $cddFile, $blastdb, $prog, $domain_cov,
51 68 # $prot_cov, $evalue, $prop_prots_w_domain, \@candProjProts],
52   -# [qw(*fams *treatAsSuperfamily *rootDir *tcdbSeqsFile *pfamFile *blastdb *prog *domain_cov
  69 +# [qw(*domainAnalysisMode *fams *treatAsSuperfamily *rootDir *tcdbSeqsFile *pfamFile *cddFile *blastdb *prog *domain_cov
53 70 # *prot_cov *evalue *prop_prots_w_domain *candProjProts)]);
54 71 #exit;
55 72  
56 73  
57 74  
58 75  
59 76  
... ... @@ -60,19 +77,24 @@
60 77  
61 78 if ($treatAsSuperfamily) {
62 79  
63   - my $tcids = getSystemAccessions($tcdbSeqsFile, 'both', $analysisLevel, \@fams, $treatAsSuperfamily);
  80 + my $tcids = getSystemAccessions($tcdbSeqsFile, 'both', $analysisLevel, \@fams, $treatAsSuperfamily);
64 81  
65   -# print Data::Dumper->Dump([$tcids ], [qw(*tcids )]);
66   -# exit;
  82 + #print Data::Dumper->Dump([$tcids ], [qw(*tcids )]);
  83 + #exit;
67 84  
68   -
69   -
70 85 #==========================================================================
71 86 #Setup the thresholds for parsing the PFAM output
72 87  
  88 + my $obj = "";
  89 + if ($domainAnalysisMode eq "cdd") {
  90 + $obj = new TCDB::Domain::CDDparser();
  91 + $obj->cddFile($cddFile);
  92 + }
  93 + else {
  94 + $obj = new TCDB::Domain::PfamParser();
  95 + $obj->pfamFile($pfamFile);
  96 + }
73 97  
74   - my $obj = new TCDB::Domain::PfamParser();
75   - $obj->pfamFile($pfamFile);
76 98 $obj->analysisLevel($analysisLevel);
77 99 $obj->domCovCutoff($domain_cov);
78 100 $obj->tcCovCutoff($prot_cov);
... ... @@ -127,8 +149,16 @@
127 149 #Setup the thresholds for parsing the PFAM output
128 150  
129 151  
130   - my $obj = new TCDB::Domain::PfamParser();
131   - $obj->pfamFile($pfamFile);
  152 + my $obj = "";
  153 + if ($domainAnalysisMode eq "cdd") {
  154 + $obj = new TCDB::Domain::CDDparser();
  155 + $obj->cddInFile($cddFile);
  156 + }
  157 + else {
  158 + $obj = new TCDB::Domain::PfamParser();
  159 + $obj->pfamFile($pfamFile);
  160 + }
  161 +
132 162 $obj->analysisLevel($analysisLevel);
133 163 $obj->domCovCutoff($domain_cov);
134 164 $obj->tcCovCutoff($prot_cov);
135 165  
... ... @@ -140,15 +170,15 @@
140 170 my %domCoords = ();
141 171 $obj->getDomainStatsForUserFamilies([], $tcids, \%domFreq, \%domCoords);
142 172  
143   - #print Data::Dumper->Dump([ \%domFreq, \%domCoords ], [qw( *domFreq *domCoords )]);
144   - #exit;
  173 +# print Data::Dumper->Dump([ \%domFreq, \%domCoords ], [qw( *domFreq *domCoords )]);
  174 +# exit;
145 175  
146 176  
147 177  
148 178  
149 179  
150 180 #==========================================================================
151   - #Attempt to rescue the domains that were not recognized by PFAM in some
  181 + #Attempt to rescue the domains that were not recognized by PFAM/CDD in some
152 182 #Family members
153 183  
154 184  
155 185  
... ... @@ -188,9 +218,10 @@
188 218 #Parse command line arguments
189 219  
190 220 my $status = GetOptions(
191   - "f|family=s" => \&read_fams, #TCIDs of families to analyze (comma separated)
  221 + "f|family=s" => \&read_fams, #TCIDs of families to analyze (comma separated)
  222 + "dam|domain-analysis-mode=s" => \&read_domainAnalysisMode, #Perform the analysis based on CDD or Pfam domains
192 223  
193   - #Options for TCDB::Domain::PfamParser
  224 + #Options for TCDB::Domain::PfamParser and TCDB::Domain::CDDparser
194 225 "dc|domain-cov=f" => \$domain_cov,
195 226 "pc|protein-cov=f" => \$prot_cov,
196 227 "e|evalue=f" => \$evalue,
... ... @@ -201,6 +232,7 @@
201 232 "o|outdir=s" => \&read_root_dir, #Ouput root directory
202 233 "s|tcdb-seqs=s" => \&read_tcdb_seqs, #File with all sequences in TCDB
203 234 "sf|superfamily!" => \$treatAsSuperfamily, #File with the sequences of the reference family
  235 + "cdd=s" => \&read_cdd, #rpsblast output file for whole TCDB
204 236 "pfam=s" => \&read_pfam, #hmmscan output file for whole TCDB
205 237 "b|blastdb=s" => \&read_blastdb, #Full path of blastdb to extract sequences
206 238 "p|rescue-prog=s" => \&read_prog, #Read the program that will be used to rescue domains (blastp|ssearch36)
207 239  
208 240  
... ... @@ -214,10 +246,21 @@
214 246 #----------------------------------------------------------------------
215 247 #Validate command line arguments
216 248  
  249 + if ($domainAnalysisMode eq "cdd") {
  250 + $cddFile = "$ENV{RESEARCH_DATA}/DB/domainDBs/TCDB/domainScans/tcdb.cdd.rpsblast.bz2" unless ($cddFile);
217 251  
218   - die "Error: Options -f and -pt are incompatible" if (@fams && @candProjProts);
  252 + die "Error: the use of option -pfam with [-dam cdd] is incompatible" if ($pfamFile);
  253 + }
  254 + else {
  255 + $pfamFile = "$ENV{RESEARCH_DATA}/DB/domainDBs/TCDB/domainScans/tcdb.pfam-a.hmmscan.bz2" unless ($pfamFile);
  256 +
  257 + die "Error: the use of option -cdd with [-dam pfam] is incompatible" if ($cddFile);
  258 + }
  259 +
  260 + die "Error: options -f and -pt are incompatible" if (@fams && @candProjProts);
219 261 die "Error: either -f or -pt must be given" unless (@fams || @candProjProts);
220 262  
  263 +
221 264 if (@candProjProts) {
222 265 prepare_seqs_for_projection();
223 266 }
... ... @@ -240,7 +283,7 @@
240 283 system "mkdir -p $origInfilesDir" unless (-d $origInfilesDir);
241 284  
242 285  
243   - #generate an empty "TCDB sequence file" that will contains proteins not in TCDB
  286 + #generate an empty "TCDB sequence file" that will contain proteins not in TCDB
244 287 my $new_tcdbSeqsFile = "$tcdbDir/tcdb.faa";
245 288 system "cat /dev/null > $new_tcdbSeqsFile";
246 289  
247 290  
248 291  
249 292  
250 293  
251 294  
252 295  
... ... @@ -295,23 +338,44 @@
295 338 }
296 339  
297 340 #----------------------------------------------------------------------
298   - #Generate the PFam database
  341 + #Run PFam or CDD on the sequences
299 342  
300   - my $pfamD = "$rootDir/pfam";
301   - system "mkdir -p $pfamD" unless (-d $pfamD);
  343 + #Analysis with CDD
  344 + if ($domainAnalysisMode eq "cdd") {
  345 + my $cddDir = "$rootDir/cdd";
  346 + system "mkdir -p $cddDir" unless (-d $cddDir);
302 347  
303   - my $pfamTMPfile = "$pfamD/tcdb_pfam.out";
304   - $pfamFile = "${pfamTMPfile}.bz2";
  348 + my $cddTMPfile = "$cddDir/tcdb_cdd.out";
  349 + $cddFile = "${cddTMPfile}.bz2";
305 350  
306   - #run Pfam
307   - my $cmd4 = qq (hmmscan --cpu 4 --noali --cut_ga -o /dev/null --domtblout $pfamTMPfile $ENV{RESEARCH_DATA}/pfam/pfamdb/Pfam-A.hmm $new_tcdbSeqsFile);
308   - system $cmd4 unless (-f $pfamTMPfile || -f $pfamFile);
  351 + #run cdd
  352 + my $cddDB = "$ENV{RESEARCH_DATA}/DB/domainDBs/cddDB/cdd";
  353 + my $ofmt = "7 qacc qlen sallacc slen evalue bitscore lengt qstart qend qcovhsp sstart send stitle";
  354 + my $cmd4 = qq (rpsblast -db $cddDB -query $new_tcdbSeqsFile -evalue $evalue -outfmt '${ofmt}' -out $cddTMPfile);
  355 + system $cmd4 unless (-f $cddTMPfile || -f $cddFile);
309 356  
310   - #compress pfam file
311   - my $cmd5 = qq(bzip2 $pfamTMPfile);
312   - system $cmd5 unless (-f $pfamFile);
  357 + #compress pfam file
  358 + my $cmd5 = qq(bzip2 $cddTMPfile);
  359 + system $cmd5 unless (-f $cddFile);
  360 + }
313 361  
  362 + #Analysis with Pfam
  363 + else {
  364 + my $pfamD = "$rootDir/pfam";
  365 + system "mkdir -p $pfamD" unless (-d $pfamD);
314 366  
  367 + my $pfamTMPfile = "$pfamD/tcdb_pfam.out";
  368 + $pfamFile = "${pfamTMPfile}.bz2";
  369 +
  370 + #run Pfam
  371 + my $cmd4 = qq (hmmscan --cpu 4 --noali --cut_ga -o /dev/null --domtblout $pfamTMPfile $ENV{RESEARCH_DATA}/DB/domainDBs/xfamDB/Pfam.hmm $new_tcdbSeqsFile);
  372 + system $cmd4 unless (-f $pfamTMPfile || -f $pfamFile);
  373 +
  374 + #compress pfam file
  375 + my $cmd5 = qq(bzip2 $pfamTMPfile);
  376 + system $cmd5 unless (-f $pfamFile);
  377 + }
  378 +
315 379 #----------------------------------------------------------------------
316 380 #now generate the blast DB
317 381  
318 382  
... ... @@ -332,9 +396,10 @@
332 396  
333 397  
334 398  
  399 +
335 400 #==========================================================================
336 401 #Read the -pt option. It is expected that the user provides the family to which
337   -#the target proteins are expected to belong. Example format should is:
  402 +#the target proteins are expected to belong. Example format:
338 403 # -pt {tcid_1},{file with target sequences 1}:{tcid_2},{file with target sequences 2}.
339 404 #
340 405 #NOTE: This option is incompatible with -f
... ... @@ -344,7 +409,7 @@
344 409 my ($opt, $value) = @_;
345 410  
346 411 my @pairs = split (/:/, $value);
347   - die "No significant argument passed to option -pt" unless (@pairs);
  412 + die "No valid argument passed to option -pt" unless (@pairs);
348 413  
349 414 foreach my $pair (@pairs) {
350 415 my ($tc, $file) = split (/,/, $pair);
351 416  
... ... @@ -377,8 +442,23 @@
377 442  
378 443  
379 444 #==========================================================================
380   -#Read the -d option
  445 +#Read the -dam option (Domain Analysis Mode)
381 446  
  447 +sub read_domainAnalysisMode {
  448 +
  449 + my ($opt, $value) = @_;
  450 +
  451 + my $tmp = lc $value;
  452 + die "Unrecognized mode: $value" unless ($tmp =~ /(cdd|pfam)/);
  453 +
  454 + $domainAnalysisMode = $tmp;
  455 +}
  456 +
  457 +
  458 +
  459 +#==========================================================================
  460 +#Read the -o option
  461 +
382 462 sub read_root_dir {
383 463  
384 464 my ($opt, $value) = @_;
... ... @@ -404,6 +484,19 @@
404 484  
405 485  
406 486 #==========================================================================
  487 +#Read the -cdd option
  488 +
  489 +
  490 +sub read_cdd {
  491 +
  492 + my ($opt, $value) = @_;
  493 +
  494 + die "File with TCDB sequences must exist and not be empty: $value" unless (-f $value && !(-z $value));
  495 +
  496 + $cddFile = $value;
  497 +}
  498 +
  499 +#==========================================================================
407 500 #Read the -pfam option
408 501  
409 502 sub read_pfam {
... ... @@ -416,6 +509,8 @@
416 509 }
417 510  
418 511  
  512 +
  513 +
419 514 #==========================================================================
420 515 #Read the -b option
421 516  
... ... @@ -432,7 +527,7 @@
432 527  
433 528  
434 529 #==========================================================================
435   -#Read the - option
  530 +#Read the -p option
436 531  
437 532 sub read_prog {
438 533  
... ... @@ -475,6 +570,10 @@
475 570 This option is incompatible with option -f. But either -f or -pt
476 571 must be given.
477 572  
  573 + -dam, --domain-analysis-mode (string) (default: cdd)
  574 + Specify if analysis will be performed on the output of rpsblast against CDD
  575 + (cdd) or hmmscan againts Pfam (pfam).
  576 +
478 577 -dc, --domain-cov {float} (Optional; Default: 0.7)
479 578 Minimum coverage of the Pfam domain to consider it a match. If coverage
480 579 is less than the specified threshold, the coverage must apply to the
481 580  
482 581  
... ... @@ -494,16 +593,21 @@
494 593 -o, --outdir {path} (Default: .)
495 594 Directory where results and intermediary files will be saved.
496 595  
497   - -s, --tcdb-seqs {file} (Optional; Default: $RESEARCH_DATA/pfam/download/tcdb.faa)
  596 + -s, --tcdb-seqs {file} (Optional; Default: $RESEARCH_DATA/DB/domainDBs/TCDB/domainScans/tcdb.faa)
498 597 FASTA file with all sequences in TCDB (as generated by program
499   - extractTCDB.pl). This file will be used to extract sequences
  598 + extractTCDB.pl. This file will be used to extract sequences
500 599 and TCIDs for all members of the input family. This allows to
501 600 freeze TCDB contents at a specific date, for the purpose defining
502 601 a project.
503 602  
504   - -pfam {file} (Optional; Default: $RESEARCH_DATA/pfam/tcdb.pfam-a.hmmscan.bz2)
505   - The output of running hmmscan against all TCDB, or at least the input
506   - family(-ies).
  603 + -pfam {file} (Optional; Default: $RESEARCH_DATA/DB/domainDBs/TCDB/domainScans/tcdb.pfam-a.hmmscan.bz2)
  604 + The output of running hmmscan against all TCDB, or at least the input family(-ies). This option is not
  605 + compatible with option -cdd
  606 +
  607 + -cdd {file} (Optional; Default: $RESEARCH_DATA/DB/domainDBs/TCDB/domainScans/tcdb.cdd.rpsblast.bz2)
  608 + The output of running rpsblast against CDD using format options:
  609 + [-outfmt '7 qacc qlen sallacc slen evalue bitscore lengt qstart qend qcovhsp sstart send stitle']
  610 + This option is not compatible with option -pfam.
507 611  
508 612 -b, --blastdb {path} (Default: $HOME/db/blastdb/tcdb)
509 613 Full path to the blast database containing all the sequences in
testCDDparser.pl View file @ da6670c
  1 +#!/usr/bin/env perl -w
  2 +
  3 +use strict;
  4 +use warnings;
  5 +use Data::Dumper;
  6 +#use List::Util qw(sum);
  7 +
  8 +use TCDB::Assorted;
  9 +use TCDB::Domain::PfamParser;
  10 +use TCDB::Domain::CDDparser;
  11 +use TCDB::Domain::Characterize;
  12 +
  13 +use Getopt::Long;
  14 +
  15 +#
  16 +#Domain projections should work both with CDD and Pfam
  17 +#
  18 +
  19 +#==========================================================================
  20 +#Global variables
  21 +
  22 +#Query family or families
  23 +my @fams = ("2.A.123");
  24 +
  25 +#This is an option for TCDB::Assorted::getSystemAccessions()
  26 +my $treatAsSuperfamily = 0;
  27 +
  28 +#Options for parsers TCDB::Domain::PfamParser and TCDB::Domain::CDDparser
  29 +my $domain_cov = 0.7;
  30 +my $prot_cov = 0.1;
  31 +my $evalue = 1e-5;
  32 +my $prop_prots_w_domain = 0.05;
  33 +
  34 +#Options for TCDB::Domain::Characterize
  35 +my $rootDir = ".";
  36 +
  37 +#To extract the TCIDs of refernece families
  38 +my $tcdbSeqsFile = "$ENV{RESEARCH_DATA}/DB/domainDBs/TCDB/domainScans/tcdb.faa";
  39 +my $pfamFile = "$ENV{RESEARCH_DATA}/DB/domainDBs/TCDB/domainScans/tcdb.pfam-a.hmmscan.bz2";
  40 +my $cddFile = "$ENV{RESEARCH_DATA}/DB/domainDBs/TCDB/domainScans/tcdb.cdd.rpsblast.bz2";
  41 +my $blastdb = "$ENV{HOME}/db/blastdb/tcdb";
  42 +my $prog = "ssearch36";
  43 +my @candProjProts = ();
  44 +my $analysisLevel = 'system';
  45 +
  46 +
  47 +
  48 +my $tcids = getSystemAccessions($tcdbSeqsFile, 'both', $analysisLevel, \@fams, $treatAsSuperfamily);
  49 +#print Data::Dumper->Dump([$tcids ], [qw(*tcids)]);
  50 +
  51 +
  52 +#==========================================================================
  53 +#Setup the thresholds for parsing the PFAM output
  54 +
  55 +
  56 +my $obj = new TCDB::Domain::CDDparser();
  57 +$obj->cddInFile($cddFile);
  58 +$obj->analysisLevel($analysisLevel);
  59 +$obj->domCovCutoff($domain_cov);
  60 +$obj->tcCovCutoff($prot_cov);
  61 +$obj->evalueCutoff($evalue);
  62 +$obj->minProtsDom($prop_prots_w_domain);
  63 +$obj->treatAsSuperfamily($treatAsSuperfamily);
  64 +
  65 +
  66 +my %domFreq = ();
  67 +my %domCoords = ();
  68 +$obj->getDomainStatsForUserFamilies(\@fams, $tcids, \%domFreq, \%domCoords);
  69 +
  70 +print Data::Dumper->Dump([ \%domFreq, \%domCoords ], [qw( *domFreq *domCoords )]);
  71 +exit;