#!usr/bin/perl -w # code to perform the covarion test. # argument= a file name. # The file should contain info about the data matrix, tree, clades, # the type of test to be done, the characters to exlude... # Results will be located in a directory named "results" if not # otherwise specified (in the input file) # output: - a conclusion file # - the estimated tree, in case estimation has been performed. # # example of use: # perl covarionTest.pl inputfile # perl covarionTest.pl < inputfile # # that script calls the following other programs: # paup # resolvetree.pl for resolving trees with 0-length branches # seq-gen # oneTestStatistic (C program) for computing the value of the test # statistic W on one data set. # # example of input file (or standard input): # # excludeChar=3 NBoot=10 ras=1 treeMethod=par treeFile=MLtree.nex # matrixFile=myMatrix_nexus_or_phylip_format # partition=Amborella,Calycanthus, Marchantia,Physcomitrella ,Anthoceros,Psilotum # model=REV ncat=4 directory=myDirName # # # - matrixFile: must be in phylip or nexus format. # - treeMethod can be "parsimony" (or "par"), "likelihood" (or "like") # If "par", the MP tree is computed and used or bootstrapping. # If "like", it will be the ML tree. In all other cases, the # tree is to be read from a file. # - the tree in treeFile must be in Nexus format. # - model can be one of "HKY" "F84" or "REV". default=HKY. Indicates what # substitution model will be used for both estimation of parameters # and simulation. # - ncat: integer value. default=4. Number of gamma rate categories, used # both in estimation and simulation. # - nboot= number of bootstrap replicates. default=1000. # - ras: type of test. 0: homogeneity test. 1: RAS vs RAS+COV test. # default=1. # - directory: name of directory for output. default="results" # - partition: list of one clade's taxon names, separated by commas. # - ecludeChar: if =3 then 3d positions are ecluded. if =12 then 1st # and 2d positions are ecluded. Otherwise, all positions are included. # the test statistic is performed on sites with no gaps only. Bootstrap # replicates have a number of characters equal to the number of no-gaps # characters of the matrix. # Ooops! I'm not sure how ambiguous characters are treated. I need to look # into the C program "oneTestStatistic". # - I haven't checked yet that interleave is ok. # - model=JC not permitted. Should I add another option like "nst=1" # and/or "basefrequencies=equal" to allow for simpler models? # ---------------------------- random seed ----------------------- $seed = time ^ $$ ^ unpack "%L*", `ps axww | gzip`; srand($seed); # ---------------------------------------------------------------- $SeqGen = "./seq-gen"; $Paup = "paup"; $oneStat = "./oneTestStatistic"; $resolvetree= "resolvetree.pl"; $excludeChar=0; #0: no character excluded. 3: 3d positions excluded #12: 1st and 2d positions excluded $NBoot = 1000; $ras = 1; # 1 for the RAS vs RAS+COV test (left-sided test), # 0 for the homogeneity test (right-sided test). $model="HKY"; # could also be F84 or REV. $ncat = 4; # number of gamma rate categories $directory="results/"; # ----------------- Read standard input ------------------------------- while (<>){ if (/\bexcludeChar\s*=\s*(\d+)/i){ $excludeChar =$1;} if (/\bNBoot\s*=\s*(\d+)/i){$NBoot =$1;} if (/\bras\s*=\s*(\d)/i){$ras=$1;} if (/\btreeMethod\s*=\s*(\w+)/i){$speed =$1;} if (/\bmodel\s*=\s*(\w+)/i){$model =$1;} if (/\bncat\s*=\s*(\d+)/i){$ncat =$1;} if (/\bmatrixFile\s*=\s*([^\s]+)/i){$DataMatrix =$1;} if (/\btreeFile\s*=\s*([^\s]+)/i){$ModelTreeNex =$1;} if (/\bpartition\s*=((\s*\w+\s*,)+\s*\w+)/i){$partition =$1;} if (/\bdirectory\s*=\s*(\w+)/i){$directory ="$1/";} } # ------------------------------------------------------------------ if ($excludeChar eq 12){print "first and second characters excluded\n";} elsif ($excludeChar eq 3){print "third characters excluded\n";} else {print "all characters included\n";} if ($NBoot > 1){ print "$NBoot replicates will be simulated for parametric bootstrapping\n";} else {die "bad value for NBoot: $NBoot\n";} if ($ras){ print "test with rate variation: RAS vs RAS+COV\n";} else{ print "homogeneity test\n";} if ($speed && $speed =~ /^par/i){ $speed="par"; print "MP (maximum parsimony) tree will be computed and used for bootstrapping\n";} elsif ($speed && $speed =~ /^like/i){ $speed="like"; print "ML (maximum likelihood) tree will be computed and used for bootstrapping\n";} else{ $speed="fast"; die "I can't open tree file $ModelTreeNex\n" if (!(-r $ModelTreeNex)); print "Tree read from file $ModelTreeNex\n"; } if (!(-d $directory)) { mkdir $directory; } else {die "ooops! I don't want to overwrite directory $directory\n";} if ($DataMatrix){ die "I can't open Data matrix file $DataMatrix\n" if (!(-r $DataMatrix)); print "Data matrix read from file $DataMatrix (phylip or nexus format)\n"; } else{ die "no entry for matrixFile\n";} if ($partition){ @clade=split(/[(\s*,\s*)]/,$partition); foreach (@clade){ # this loop removes extra beginning spaces and empty names s/^\s*//; if (!($_ eq "")){ push @cladeA, $_;} } print "Partition defined by the group: @cladeA\n"; } else{ die "no entry for partition\n";} if (($model eq "HKY")||($model eq "F84")||($model eq "REV")){ print "Model for estimation and simulation: $model\n"; } else{ die "bad model of substitution: $model\n";} if ($ncat > 1){ print "gamma rate distribution with $ncat categories\n";} else{ die "bad number of gamma rate categories: $ncat\n";} # ---------- read data matrix ---------------------------------------- # ---------- build data structure for the partition ------------------ open FH1, "<$DataMatrix"; while (){ chomp; if ($_){ if (/^#nexus/i){ $format="nexus"; } else{ $format="phylip";} last; } } close FH1; if ($format eq "phylip"){ open FH1, "<$DataMatrix"; while () { # get nTax, NChar and taxon names @separatelines = split(/\r/,$_); foreach (@separatelines){ if (/^\s*\d+\s+\d+\s*$/){ ($nTaxa,$NChar)=split; } elsif (/[ACGT]/) { $name=(split)[0]; push @taxonNames, $name; } } } close FH1; } elsif ($format eq "nexus"){ open FH1, "<$DataMatrix"; my $start; my $ntax; while () { @separatelines = split(/\r/,$_); foreach (@separatelines){ if (/dimensions/i){ if (/ntax\s*=\s*(\d+)/i){$nTaxa= $1;} if (/nchar\s*=\s*(\d+)/i){$NChar=$1;} } if (/end;/i){$start=0;} if ($start){ if (/[ACGT]/) { $ntax++; if ($ntax<=$nTaxa){ push @taxonNames, (split)[0]; } } } if (/matrix/i){$start=1;} } } close FH1; } $nTax=@taxonNames; if (!($nTaxa eq $nTax)){ die "problem in reading the data matrix, on the number of taxa\n";} foreach $tax (@taxonNames){ my $notfound=1; foreach (@cladeA){ if ($tax eq $_){ $notfound=0; last;} } if ($notfound){ push @cladeNA, $tax;} } $NtaxA = @cladeA; $NtaxNA= @cladeNA; if (($NtaxA<2)||($NtaxNA<2)){ die "each partition should contain at least 2 taxa\n";} # ----------------------------------------------------------------------- $conclusionFile = $directory . "conclusion"; $tempDir = $directory."temp/"; if (!(-d $tempDir)) { mkdir $tempDir;} $DataMatrixPhy = $tempDir . "DataMatrix.phy"; $DataMatrixNex = $tempDir . "DataMatrix.nex"; $PauplogFile = $tempDir . "Paup.log"; $EstimTreePhy = $tempDir . "PaupTree.altphy"; $EstimTreeNex = $tempDir . "PaupTree.nex"; $EstimTreeAltNex = $tempDir . "PaupTree.altnex"; $SGtreeFile= $tempDir . "SGtree.phy"; $newOriginalMatrixPhy = $tempDir . "originalDataMatrix.phy"; $CladeDefinitionFile = $tempDir . "CladeInfo.txt"; open FH4, ">$conclusionFile"; print FH4 "Covarion test:\t".($ras?"RAS vs RAS+COV\n":"homogeneous vs heterogeneous\n"); print FH4 "\nData matrix:\t$DataMatrix\n"; print FH4 "'clade' 1 has $NtaxA taxa: @cladeA\n"; print FH4 "'clade' 2 has $NtaxNA taxa: @cladeNA\n"; print FH4 "Number of characters:\t$NChar\n"; print FH4 "excluded positions:\t"; if ($excludeChar eq 12){ print FH4 "1st and 2d\n";} elsif ($excludeChar eq 3){print FH4 "3d\n";} else {print FH4 "none\n";} close FH4; # ----------------- write a new phylip data file -------------------------- # ----------------- excluding characters if necessary --------------------- if ($excludeChar eq 12){ $NChar=int($NChar/3);} elsif ($excludeChar eq 3) {$rest=$NChar%3; $NChar=int($NChar/3)*2+$rest;} open FH1, "<$DataMatrix"; my $start; if ($format eq "phylip"){$start=1;} while () { @separatelines = split(/\r/,$_); foreach (@separatelines){ if (/end;/i){$start=0;} if ($start){ if (/[ACGT]/) { ($name,$nucleotides)=split; while ($nucleotides){ $i= ++$i{$name}; $nuc=substr($nucleotides,0,1,""); if ($excludeChar eq 3){ $condition = ($i%3?1:0);} elsif ($excludeChar eq 12){ $condition = ($i%3?0:1);} else{ $condition = 1;} if ($condition){ $sequence{$name} .= $nuc; } } } } if (/matrix/i){$start=1;} } } close FH1; open FH2, ">$newOriginalMatrixPhy"; print FH2 "$nTaxa $NChar\n"; foreach (@taxonNames){ print FH2 "$_ $sequence{$_}\n";} close FH2; $DataMatrix=$newOriginalMatrixPhy; # ---------- compute the test statistic on the original dataset ----------- # ------------------------------------------------------------------------- open FH1, ">$CladeDefinitionFile"; print FH1 "cladeA @cladeA\n"; print FH1 "cladeNA @cladeNA\n"; close FH1; $LTestString = "$oneStat -c $CladeDefinitionFile < $DataMatrix"; $originalTestvalue = `$LTestString`; chomp $originalTestvalue; open FH2, ">>$conclusionFile"; if ($excludeChar){ print FH2 "Number of included characters:\t$NChar\n\n";} close FH2; # --------------- Convert the data file into a nexus file ------------------ # -------------------------------------------------------------------------- open FH1, "<$DataMatrix"; open FH2, ">$DataMatrixNex"; print FH2 "#nexus\n"; print FH2 "\nbegin data;\ndimensions ntax=$nTaxa nchar=$NChar;\n"; print FH2 "format datatype=dna gap=-;\nmatrix\n"; while () { if (/[ACGT]/) { print FH2;} } print FH2 "\n;\nend;\n"; close FH1; # ------------------ Paup block ------------------------- $paupSeed = int(10000*rand()); if (!($model eq "REV")){ $PaupModel = "nst=2 basefreq=estimate tratio=estimate variant=$model ";} else { $PaupModel = "nst=6 basefreq=estimate rmatrix=estimate ";} if ($ras){$PaupModel .= "rates=gamma shape=estimate ncat=$ncat ";} # +Gamma print FH2 "\nbegin paup;\n"; print FH2 "set maxtrees=100 increase=no warntsave=no warnTree=no autoclose=yes status=no;\n"; print FH2 "constraints mycons=((@cladeA),(@cladeNA));\n"; if ($speed eq "par"){ print FH2 "set criterion= parsimony;\n"; print FH2 "hsearch swap=tbr rseed=$paupSeed enforce=yes constraints=mycons;\n"; print FH2 "ConTree / treefile=$EstimTreeNex replace=yes;\n"; print FH2 "Gettrees file=$EstimTreeNex;\n"; } elsif ($speed eq "like"){ print FH2 "set criterion= likelihood;\nlset $PaupModel;\n"; print FH2 "hsearch swap=tbr enforce=yes constraint=mycons timelimit=1200;\n"; print FH2 "ConTree / treefile=$EstimTreeNex replace=yes;\n"; print FH2 "Gettrees file=$EstimTreeNex;\n"; } else{ print FH2 "Gettrees file=$ModelTreeNex;\n"; } print FH2 "set criterion= likelihood ;\n"; print FH2 "lset $PaupModel;\n"; print FH2 "log start file=$PauplogFile replace=yes;\n"; print FH2 "lscore;\n"; print FH2 "lset basefreq=previous ".($ras?" shape=previous ":""); if ($model eq "REV"){ print FH2 "rmatrix=previous ;\n";} else{ print FH2 "tratio=previous ;\n";} print FH2 "exclude gapped; \nlog stop;\n"; print FH2 "savetrees file=$EstimTreeAltNex format=altnexus brlens=yes replace=yes;\n"; print FH2 "quit;\n"; print FH2 "end;\n"; close FH2; # --- run paup --------- system( "$Paup $DataMatrixNex" ); # --- store the results (MPtree, parameters, number of no-gap sites,...) ----- if (($speed eq "par")||($speed eq "like")){ if ($speed eq "par"){$MP="MP";} else{$MP="ML";} $MPtreeFile = $directory.$MP."tree.nex"; system("cp $EstimTreeNex $MPtreeFile"); } open FH1, "<$PauplogFile"; while (){ if ($start){ if (/^Shape/){ $start=0; $alpha=(split)[1]; } elsif (/^\s+[ACGT]\s/){ push @freq, (split)[1]; } elsif (/^\s+[ACG][CGT]\s/){ push @rmatrix, (split)[1]; } elsif (/^\s*exp\. ratio/){ $titv=(split)[-1]; } } if (/^Base frequencies/){$start=1;} if (/Number of included characters/){ $NChar=(split)[5]; } } close FH1; open FH1, "<$EstimTreeAltNex"; open FH2, ">$EstimTreePhy"; while (){ if (/tree PAUP_1/){ my $tree = (split)[4]; print FH2 "$tree\n"; } } open FH1, ">>$conclusionFile"; if ($speed eq "par"){print FH1 "estimation:\tMP tree\n";} elsif ($speed eq "like"){print FH1 "estimation:\tlikelihood tree\n";} else { print FH1 "tree:\tfrom file $ModelTreeNex.\n";} print FH1 "parameter estimates:\n\tbase frequencies (ACGT)= @freq\n"; if ($model eq "REV"){ print FH1 "\tRmatrix rates= @rmatrix\n";} else {print FH1 "\ttitv ratio = $titv\n";} if ($ras){ print FH1 "\tshape (alpha) = $alpha\n";} print FH1 "Number of characters with no gaps:\t$NChar\n"; print FH1 "Test statistic:\t\t$originalTestvalue\n"; print FH1 "Number of bootstrap replicates:\t$NBoot\n"; close FH1; # --------------- Resolve the tree with 0-length branches---------------- # ----------------------------------------------------------------------- system("perl $resolvetree $EstimTreePhy > $SGtreeFile"); # ---------------- Bootstrap analysis ------------------------------------ # ------------------------------------------------------------------------ @statvalues = (); # see if alpha is not infinite ... if ( ($ras && !($alpha =~ /\d/)) or !(($model eq "REV")or($titv =~ /\d/))){ die "infinite parameter estimate\n"; } print STDOUT "Parametric bootstrap starting: $NBoot replicates.\n"; # ---------------------- sequence simulation ------------------- $SGoptions = "-m$model -l $NChar -f @freq "; if (($model eq "HKY")||($model eq "F84")){ $SGoptions .= "-t $titv "; } else {$SGoptions .= "-r @rmatrix ";} if ($ras){ $SGoptions .= "-a $alpha -g $ncat "; } $SGoptions .= "-q -or "; $SGfiles = " < $SGtreeFile > $DataMatrixPhy"; foreach (1..$NBoot){ print STDOUT "."; $SGseed = int(100000*rand()); $SGnewOptions = $SGoptions." -z $SGseed "; system("$SeqGen $SGnewOptions $SGfiles "); # ---- get the test statistic on that bootstrap dataset --- $statString = "$oneStat -c $CladeDefinitionFile < $DataMatrixPhy"; $statvalue = `$statString`; chomp $statvalue; push @statvalues, $statvalue; } print STDOUT "\n"; # ------------ compute p-value ------------------------------- # ------------------------------------------------------------ $pvalue=0; foreach (@statvalues){ if ($ras?($_<=$originalTestvalue):($_>=$originalTestvalue)){ $pvalue++; } } $pvalue /= @statvalues; if ($pvalue eq 0){ $minp = 1/$NBoot; $pvalue = "<$minp"; } open FH1, ">>$conclusionFile"; print FH1 "p-value:\t$pvalue\n\n"; close FH1; print "p-value is $pvalue\n\n"; # ------------ file management ------------------ unlink glob "$tempDir*"; rmdir $tempDir;