lg = function(n){ # returns the smallest integer >= the log of n in base 2 if (n<=0) return(NaN) else return(ceiling(log2(n))) } ln<-function(n){ floor(log2(n)) } # this one is the largest integer <= log of n in base 2. des = function(n){ # returns the number of bits required to describe n, # when there is no information about how large n is. # Uses logarithmic ramp. See Li & Vitanyi 1997. l = length(n); na = sum(is.na(n)) if (na){ return(NA) quit } # in case n is a vector, all description lengths are added. if (l>=2){ # n is a vector len = 0; for (i in 1:l){ len = len + des(n[i]); } len; } else { # in case n is a number if (n<0) return(NaN) else { len = 1; while (n>=2){ n = ln(n); len = len+n+1; } return(len); } } } cutoff <- function(Ntax,Ngroup=1,nni.dist=0,NNI=FALSE){ if (Ngroup==1){ cat("enter a value for l (number of trees in H1 hypothesis)\n");} else{ length<-(Ngroup-1)*lg(2*Ntax-3)+des(Ngroup); if (NNI){length<-length+des(nni.dist)+sum(nni.dist)*(lg(Ntax-3)+1);} else{length<-length+(Ngroup-1)*(2*Ntax-4+Ntax*lg(Ntax));} length/(2+lg(2*Ntax-3)); } } DL = function(score,Ngroup,Nchar,Ntax,nni.dist=c(),Nletters=4){ # Returns the Description Length of a data with "Ntax" taxa and # "Nchar" characters, partitionned into "Ngroup" groups and with # parsimony score "score". This score is the sum of all scores for # each group, where each group is allowed to have its own best MP tree. # The code will describe trees separately if no nni distances are provided. # Otherwise, trees other than the first will be described through their # NNI differences with the first tree. In this case, # the NNI distances between trees 2, 3, etc. and tree 1 must be given # as the vector in nni.dist. # This description assumes there are 4 letters (ACGT) by default. # For more letters, the option Nletters can be used. b = lg(Nletters) NNI = length(nni.dist) # number of bits required to describe a single letter. lgNedge = lg(2*Ntax-3); # this is the number of bits required to describe an edge of the tree onetreedes = 2*Ntax-4+Ntax*lg(Ntax); # this is the number of bits required to describe the first tree. res = onetreedes + Ngroup*lgNedge + 2*b*Nchar + score*(b+lgNedge) if (Ngroup>1){ res = res +des(Ngroup) if (NNI){ if (NNI == Ngroup-1){ res = res + des(nni.dist) + sum(nni.dist)*(lg(Ntax-3)+1) } else{ cat("not enough or too many NNI distances\n") } } else{ res = res + (Ngroup-1)*onetreedes; } } res; } find.MDL = function(inputfile="scores_partitions", outputfile="MDL.out",rm.groups=TRUE){ # input: file with a list of partitions (column 1), their scores (column 2), # and their number of groups (column 3). # The number of taxa, total number of characters and whether gaps were # counted as a separate charater needs to be given at the beginning of # the file. # rm.groups: if TRUE, removes partitions that correspond to a single group # and do not include all of the data. Keep true partitions only. # output: file with the same data along with the Description Length of each # partition, sorted by DL. gendat = read.table(inputfile,header=T,com="[",nrows=1) Ntax = gendat[1,1]; Nchar = gendat[1,2]; gap = gendat[1,3] cat("Ntax:",Ntax,"Nchar:",Nchar,"gap:",gap,"\n") score.data = read.table(inputfile,header=T,com="[",skip=2, colClasses=c("character",rep("integer",2))) Npartitions = dim(score.data)[1] if (rm.groups){ ind=c() for (i in 1:Npartitions){ if( sum(strsplit(score.data[i,1],"")[[1]] == 0) ==0) ind = c(ind,i) } score.data = score.data[ind,] Npartitions=length(ind) } print(str(score.data)) dl = c(); for (i in 1:Npartitions){ partition = score.data[i,1]; dl = c(dl, DL(score.data[i,2],score.data[i,3],Nchar,Ntax,Nletters=4+gap)) } score.data$DL = dl; # now sorting by Description Length and writing up the output file. ord = order(score.data$DL) cat("The 5 best partitions are:\n") print(score.data[ord[1:5],]) comments = paste("[MDL analysis from ",inputfile,"]\n") write(file=outputfile,x=comments) write.table(score.data[ord,],file=outputfile,quote=FALSE,row.names=FALSE, append=TRUE) } find.MDL.nni = function(inputfile="MDL_4genes.out", outputfile="MDLnni.out", nni.dist=list(c(),c(7),c(7),c(7),c(5),c(5),c(5),c(4),c(5,7),c(5,5),c(6,6),c(4,7),c(5,7),c(6,6),c(5,6,7))){ # input: file with a list of partitions (column 1), their scores (column 2), # their number of groups (column 3) and their DL calculated based on a # separate description of the trees (column 4). Only the first N # partitions will be considered, where N is the number of items in "nni.dist". # nni.dist is a list of vectors, one for each partition. # For each partition, the vector contains the nni # distances between the first tree and the other trees. For a partition with # k groups, the vector should be of length k-1. # The number of taxa, total number of characters and whether gaps were # counted as a separate charater need to be given at the beginning of # the file. # output: file with the same data for the first Npart partitions, along # with the Description Length of each partition based on the NNI description # of the trees (except for the first tree). gendat = read.table(inputfile,header=T,com="[",nrows=1) Ntax = gendat[1,1]; Nchar = gendat[1,2]; gap = gendat[1,3] cat("Ntax:",Ntax,"Nchar:",Nchar,"gap:",gap,"\n") Npart = length(nni.dist) score.data = read.table(inputfile,header=T,com="[",skip=2,nrows=Npart, colClasses=c("character",rep("integer",2))) print(str(score.data)) score.data[,1] = as.character(score.data[,1]) dl = c(); for (i in 1:Npart){ partition = score.data[i,1]; dl = c(dl, DL(score.data[i,2],score.data[i,3],Nchar,Ntax,Nletters=4+gap, nni.dist=nni.dist[[i]])) } score.data$DLnni = dl; # now sorting by Description Length and writing up the output file. ord = order(score.data$DLnni) print(score.data[ord,]) comments = paste("Ntax Nchar gap\n",Ntax,Nchar,gap,"\n[MDL analysis from ",inputfile,"]\n") write(file=outputfile,x=comments,append=TRUE) write.table(score.data[ord,c(1:3,5:6)],file=outputfile,quote=FALSE,col.names=TRUE,row.names=FALSE,append=TRUE) } index2group = function(index=i,Ngenes=8){ group = "" for (g in 1:Ngenes){ a = floor(index/2); r = as.character(index-2*a) group = paste(group,r,sep="") index = a; } return(group) } group2index = function(group="10000000"){ Ngenes = nchar(group) index=0; a=1; for (g in 1:Ngenes){ r = as.numeric(substr(group,1,1)) group = substr(group,2,Ngenes+1-g) index = index + a*r a = 2*a } return(index) }