diff --git a/getDomainTopology.pl b/getDomainTopology.pl index fd02b6e..ecce9e5 100755 --- a/getDomainTopology.pl +++ b/getDomainTopology.pl @@ -234,6 +234,12 @@ sub prepare_seqs_for_projection { my $tcdbDir = "$rootDir/tcdb"; system "mkdir -p $tcdbDir" unless (-d $tcdbDir); + #to prevent modifying the original files, here I'll save the input + #sequences with the artificial TCIDs + my $origInfilesDir = "$rootDir/inputFiles"; + system "mkdir -p $origInfilesDir" unless (-d $origInfilesDir); + + #generate an empty "TCDB sequence file" that will contains proteins not in TCDB my $new_tcdbSeqsFile = "$tcdbDir/tcdb.faa"; system "cat /dev/null > $new_tcdbSeqsFile"; @@ -247,6 +253,8 @@ sub prepare_seqs_for_projection { my $tcid = $pair->[0]; my $tgtF = $pair->[1]; + my @comp = split(/\//, $tgtF); + my $tgtFileName = $comp[-1]; #Add family to the main array (as if provided by the -f commandline option) push (@fams, $tcid); @@ -270,8 +278,8 @@ sub prepare_seqs_for_projection { #Replace the TCID in the file corresponding to the target proteins - my $cmd1 = qq(perl -i.orig -pe 's/\\>([a-zA-Z0-9_-]+).*/\\>${tgtTC}-\$1/;' $tgtF); - system $cmd1 unless (-f "${tgtF}.orig"); + my $cmd1 = qq(perl -pe 's/\\>([a-zA-Z0-9_-]+).*/\\>${tgtTC}-\$1/;' $tgtF > $origInfilesDir/$tgtFileName); + system $cmd1 unless (-f "$origInfilesDir/$tgtFileName"); #Extract sequences for reference family @@ -282,7 +290,7 @@ sub prepare_seqs_for_projection { #Add family and target sequences to the new TCDB family - my $cmd3 = qq(cat $outFile $tgtF >> $new_tcdbSeqsFile); + my $cmd3 = qq(cat $outFile $origInfilesDir/$tgtFileName >> $new_tcdbSeqsFile); system $cmd3; }