/* Nucleotide Sequence Generator - seq-gen, version 1.2.5 */ /* (c) Copyright 1996-2001, Andrew Rambaut & Nick Grassly */ /* Department of Zoology, University of Oxford */ #include #include #include #include #include #include #ifdef __MWERKS__ #ifdef __DROP_CONSOLE__ #include "DropConsole.h" #else #include #endif #endif #include "global.h" #include "treefile.h" #include "evolve.h" #include "models.h" #include "progress.h" #include "random.h" #include "covarion.h" #define PROGRAM_NAME "seq-gen-cov" #define VERSION_NUMBER "Version 1.2.5-covarion added" char *ModelNames[numModels]={ "F84", "HKY", "REV" }; int printHelp, treeFile, textFile, numDatasets, numTrees; int scaleTrees, scaleBranches, ancestorSeq, writeAncestors, writeRates; int writeLengths, writeHidStates; int *partitionLengths; double *partitionRates; double treeScale, branchScale; double covarionSpeed; char treeFileName[256]; char textFileName[256]; int hasAlignment, numSequences, numSites; char **names; char **sequences; FILE *tree_fv; /* prototypes */ static void PrintTitle(); static void PrintUsage(); static void PrintVerbose(FILE *fv); static void ReadParams(); static void ReadFileParams(); static void AllocateMemory(); static void ReadFile(); static int OpenTreeFile(); /* functions */ static void PrintTitle() { fprintf(stderr, "Sequence Generator - %s\n", PROGRAM_NAME); fprintf(stderr, "%s\n", VERSION_NUMBER); fprintf(stderr, "(c) Copyright, 1996-2000 Andrew Rambaut and Nick Grassly\n"); fprintf(stderr, "Department of Zoology, University of Oxford\n"); fprintf(stderr, "South Parks Road, Oxford OX1 3PS, U.K.\n\n"); } static void PrintUsage() { fprintf(stderr, "Usage: seq-gen-cov [-m MODEL] [-l #] [-n #] [-p #] [-s # | -d #]\n"); fprintf(stderr, " [-c #1 #2 #3 | -a # [-g #]][-f #A #C #G #T] [-t #] [-k #]\n"); fprintf(stderr, " [-z #] [-o[p][r][n]] [-w[a][r][h][l]] [-x NAME] [-q] [-h]\n"); fprintf(stderr, " [-v #1 #2] [treefile]\n"); fprintf(stderr, " -m: MODEL = F84, HKY, REV [default = F84]\n"); fprintf(stderr, " -l: # = sequence length [default = 1000].\n"); fprintf(stderr, " -n: # = simulated datasets per tree [default = 1].\n"); fprintf(stderr, " -p: # = number of partitions (and trees) per sequence [default = 1].\n"); fprintf(stderr, " -s: # = branch length scaling factor [default = 1.0].\n"); fprintf(stderr, " -d: # = total tree scale [default = use branch lengths].\n"); fprintf(stderr, " -c: #1 #2 #3 = rates for codon position heterogeneity [default = none].\n"); fprintf(stderr, " -a: # = shape (alpha) for gamma rate heterogeneity [default = none].\n"); fprintf(stderr, " -g: # = number of gamma rate categories [default = continuous].\n"); fprintf(stderr, " -i: # = proportion of invariable sites [default = 0.0].\n"); fprintf(stderr, " -t: # = transition-transversion ratio [default = equal rate].\n"); fprintf(stderr, " -r: #1 #2 #3 #4 #5 #6= general rate matrix [default = all 1.0].\n"); fprintf(stderr, " -f: #A #C #G #T = nucleotide frequencies [default = all equal].\n"); fprintf(stderr, " -k: # = use sequence k as ancestral (needs alignment) [default = random].\n"); fprintf(stderr, " -z: # = seed for random number generator [default = system generated].\n"); fprintf(stderr, " -o: Output file format [default = PHYLIP]\n"); fprintf(stderr, " p PHYLIP format\n"); fprintf(stderr, " r relaxed PHYLIP format\n"); fprintf(stderr, " n NEXUS format\n"); fprintf(stderr, " -w: Write additional information [default = none]\n"); fprintf(stderr, " a Write ancestral sequences for each node\n"); fprintf(stderr, " r Write rate for each site\n"); fprintf(stderr, " h Write covarion state (on = + off = -) for each site and each node.\n"); fprintf(stderr, " l Write effective length for each site (covarion process)\n"); fprintf(stderr, " -x: NAME = a text file to insert after every dataset [default = none].\n"); fprintf(stderr, " -h: Give this help message\n"); fprintf(stderr, " -q: Quiet\n"); fprintf(stderr, " -v: #1 #2 = ON frequency and speed of the covarion evolution [default = none].\n"); fprintf(stderr, " speed is the average number of swithes per (average) nucleotide substitution.\n\n"); fprintf(stderr, " treefile: name of tree file [default = trees on stdin]\n"); } void ReadParams(int argc, char **argv) { int i, j; char ch, *P, st[4]; double cov[2]; model=HKY; scaleTrees=0; treeScale=0.0; scaleBranches=0; branchScale=0.0; maxPartitions=1; numPartitions=1; userSeed = 0; numCats=1; rateHetero=NoRates; catRate[0]=1.0; gammaShape=1.0; invariableSites=0; proportionInvariable = 0.0; covarion=0; covarionSpeed=0.0; ONfreq=0.5; equalFreqs = 1; equalTstv = 1; freq[A]=freq[C]=freq[G]=freq[T]=0.25; tstv=0.50002; Rmat[0]=Rmat[1]=Rmat[2]=Rmat[3]=Rmat[4]=Rmat[5]=1.0; numBases=-1; numDatasets=1; ancestorSeq=0; writeAncestors=0; writeRates=0; writeLengths=0; writeHidStates=0; verbose=1; fileFormat = PHYLIPFormat; quiet=0; treeFile=0; textFile=0; for (i=1; i= 1.0) { fprintf(stderr, "Bad Proportion of Invariable Sites: %s\n\n", argv[i]); exit(0); } invariableSites = 1; break; case 'A': if (rateHetero==CodonRates) { fprintf(stderr, "You can only have codon rates or gamma rates not both\n\n", argv[i]); exit(0); } if (rateHetero==NoRates) rateHetero=GammaRates; if (GetDoubleParams(argc, argv, &i, P, 1, &gammaShape) || gammaShape<=0.0) { fprintf(stderr, "Bad Gamma Shape: %s\n\n", argv[i]); exit(0); } break; case 'G': if (rateHetero==CodonRates) { fprintf(stderr, "You can only have codon rates or gamma rates not both\n\n", argv[i]); exit(0); } rateHetero=DiscreteGammaRates; if (GetIntParams(argc, argv, &i, P, 1, &numCats) || numCats<2 || numCats>MAX_RATE_CATS) { fprintf(stderr, "Bad number of Gamma Categories: %s\n\n", argv[i]); exit(0); } break; case 'F': equalFreqs = 0; if (GetDoubleParams(argc, argv, &i, P, 4, freq)) { fprintf(stderr, "Bad Nucleotide Frequencies: %s\n\n", argv[i]); exit(0); } break; case 'T': equalTstv = 0; if (GetDoubleParams(argc, argv, &i, P, 1, &tstv)) { fprintf(stderr, "Bad Transition-Transversion Ratio: %s\n\n", argv[i]); exit(0); } break; case 'R': equalTstv = 0; if (GetDoubleParams(argc, argv, &i, P, 6, Rmat)) { fprintf(stderr, "Bad General Rate Matrix: %s\n\n", argv[i]); exit(0); } if (Rmat[5]!=1.0) { for (j=0; j<5; j++) Rmat[j]/=Rmat[5]; Rmat[5]=1.0; } break; case 'D': scaleTrees=1; if (GetDoubleParams(argc, argv, &i, P, 1, &treeScale) || treeScale<=0.0) { fprintf(stderr, "Bad Total Tree Scale: %s\n\n", argv[i]); exit(0); } if (scaleBranches) { fprintf(stderr, "You can't specify both the -d and -s options\n\n", argv[i]); exit(0); } break; case 'S': scaleBranches=1; if (GetDoubleParams(argc, argv, &i, P, 1, &branchScale) || branchScale<=0.0) { fprintf(stderr, "Bad Branch Length Scale: %s\n\n", argv[i]); exit(0); } if (scaleTrees) { fprintf(stderr, "You can't specify both the -d and -s options\n\n", argv[i]); exit(0); } break; case 'K': if (GetIntParams(argc, argv, &i, P, 1, &ancestorSeq) || numCats<1) { fprintf(stderr, "Bad ancestral sequence number: %s\n\n", argv[i]); exit(0); } break; case 'Z': userSeed = 1; if (GetIntParams(argc, argv, &i, P, 1, &randomSeed)) { fprintf(stderr, "Bad random number generator seed: %s\n\n", argv[i]); exit(0); } break; case 'O': switch (*P) { case 'P': fileFormat=PHYLIPFormat; break; case 'R': fileFormat=RelaxedFormat; break; case 'N': fileFormat=NEXUSFormat; break; default: fprintf(stderr, "Unknown output format: %s\n\n", argv[i]); PrintUsage(); exit(0); } break; case 'W': switch (*P) { case 'A': writeAncestors=1; break; case 'R': writeRates=1; break; case 'H': writeHidStates=1; break; case 'L': writeLengths=1; break; default: fprintf(stderr, "Unknown write mode: %s\n\n", argv[i]); PrintUsage(); exit(0); } break; case 'Q': quiet=1; break; case 'V': if (GetDoubleParams(argc, argv, &i, P, 2, cov) ||cov[0]<=0.0 || cov[0]>=1.0||cov[1]<= 0.0) { fprintf(stderr, "Bad Covarion Parameters: %s %s \n\n", argv[i-1], argv[i]); exit(0); } covarion=1; ONfreq=cov[0]; covarionSpeed=cov[1]; break; default: fprintf(stderr, "Illegal command parameter: %s\n\n", argv[i]); PrintUsage(); exit(0); break; } } } } void PrintVerbose(FILE *fv) { int i; fprintf(fv, "Simulations of %d taxa, %d bases\n", numTaxa, numBases); fprintf(fv, " for %d tree(s) with %d dataset(s) per tree\n", numTrees, numDatasets); if (numPartitions > 1) { fprintf(fv, " and %d partitions (and trees) per dataset\n", numPartitions); fprintf(fv, " Partition No. Sites Relative Rate\n"); for (i = 0; i < numPartitions; i++) fprintf(fv, " %4d %7d %lf\n", i+1, partitionLengths[i], partitionRates[i]); } fputc('\n', fv); if (scaleTrees) { fprintf(fv, "Branch lengths of trees scaled so that tree is %G from root to tip\n\n", treeScale); } else if (scaleBranches) { fprintf(fv, "Branch lengths of trees multiplied by %G\n\n", branchScale); } else { fprintf(fv, "Branch lengths assumed to be number of substitutions per site\n\n"); } if (rateHetero==CodonRates) { fprintf(fv, "Codon position rate heterogeneity:\n"); fprintf(fv, " rates = 1:%f 2:%f 3:%f\n", catRate[0], catRate[1], catRate[2]); } else if (rateHetero==GammaRates) { fprintf(fv, "Continuous gamma rate heterogeneity:\n"); fprintf(fv, " shape = %f\n", gammaShape); } else if (rateHetero==DiscreteGammaRates) { fprintf(fv, "Discrete gamma rate heterogeneity:\n"); fprintf(fv, " shape = %f, %d categories\n", gammaShape, numCats); } else fprintf(fv, "Rate homogeneity of sites.\n"); if (invariableSites) { fprintf(fv, "Invariable sites model:\n"); fprintf(fv, " proportion invariable = %f\n", proportionInvariable); } fprintf(fv, "Model=%s\n", ModelNames[model]); if (equalTstv) fprintf(fv, " Rate of transitions and transversions equal:\n"); if (model==F84) fprintf(fv, " transition/transversion ratio = %G (K=%G)\n", tstv, kappa); else if (model==HKY) fprintf(fv, " transition/transversion ratio = %G (kappa=%G)\n", tstv, kappa); else if (model==REV) { fprintf(fv, " rate matrix = gamma1:%7.4f alpha1:%7.4f beta1:%7.4f\n", Rmat[0], Rmat[1], Rmat[2]); fprintf(fv, " beta2:%7.4f alpha2:%7.4f\n", Rmat[3], Rmat[4]); fprintf(fv, " gamma2: %7.4f\n", Rmat[5]); /* fprintf(stderr, " mean transition/transversion ratio = %G\n", tstv);*/ } if (equalFreqs) fprintf(fv, " Base frequencies equal:\n"); fprintf(fv, " frequencies = A:%G C:%G G:%G T:%G\n\n", freq[A], freq[C], freq[G], freq[T]); if (covarion) { fprintf(fv, "Covarion evolution:\n ON frequency = %G\n",ONfreq); fprintf(fv, " speed (average number of swithes / substitution) = %G\n\n",covarionSpeed); } else fprintf(fv, "No covarion evolution \n\n"); } void ReadFileParams() { char ch, st[256]; hasAlignment=0; ch=fgetc(stdin); while (!feof(stdin) && isspace(ch)) ch=fgetc(stdin); ungetc(ch, stdin); if (ch!='(' && isdigit(ch)) { fgets(st, 255, stdin); if ( sscanf( st, " %d %d", &numSequences, &numSites)!=2 ) { fprintf(stderr, "Unable to read parameters from standard input\n"); exit(0); } hasAlignment=1; } } void AllocateMemory() { int i; names=(char **)AllocMem(sizeof(char *)*numSequences, "names", "AllocateMemory", 0); sequences=(char **)AllocMem(sizeof(char *)*numSequences, "sequences", "AllocateMemory", 0); /* hiddenStates= */ for (i=0; i 1) { fprintf(stderr, "Writing ancestral sequences can only be used for a single partition.\n"); exit(0); } /* if (!userSeed) randomSeed = time(NULL); */ if (!userSeed) randomSeed = time(NULL) + clock(); SetSeed(randomSeed); if (printHelp) { PrintTitle(); PrintUsage(); exit(0); } if (!quiet) PrintTitle(); ReadFileParams(); if ((ancestorSeq>0 && !hasAlignment) || ancestorSeq>numSequences) { fprintf(stderr, "Bad ancestral sequence number\n"); exit(0); } if (textFile) { if ( (text_fv=fopen(textFileName, "rt"))==NULL ) { fprintf(stderr, "Error opening text file for insertion into output: '%s'\n", textFileName); exit(0); } } ancestor=NULL; if (hasAlignment) { AllocateMemory(); ReadFile(); if (numBases<0) numBases=numSites; if (ancestorSeq>0) { if (numBases!=numSites) { fprintf(stderr, "Ancestral sequence is of a different length to the simulated sequences\n"); exit(0); } ancestor=sequences[ancestorSeq-1]; } } else if (numBases<0) numBases=1000; SetModel(model); numTaxa=-1; scale=1.0; treeSet = (TTree **)malloc(sizeof(TTree **) * maxPartitions); if (treeSet==NULL) { fprintf(stderr, "Out of memory\n"); exit(0); } partitionLengths = (int *)malloc(sizeof(int) * maxPartitions); if (partitionLengths==NULL) { fprintf(stderr, "Out of memory\n"); exit(0); } partitionRates = (double *)malloc(sizeof(double) * maxPartitions); if (partitionRates==NULL) { fprintf(stderr, "Out of memory\n"); exit(0); } for (i = 0; i < maxPartitions; i++) { if ((treeSet[i]=NewTree())==NULL) { fprintf(stderr, "Out of memory\n"); exit(0); } } numTrees = OpenTreeFile(); CreateRates(); treeNo=0; do { partitionLengths[0] = -1; ReadTree(tree_fv, treeSet[0], treeNo+1, 0, NULL, &partitionLengths[0], &partitionRates[0]); if (treeNo==0) { numTaxa=treeSet[0]->numTips; if (!quiet) fprintf(stderr, "Random number generator seed: %ld\n\n", randomSeed); if (fileFormat == NEXUSFormat) { fprintf(stdout, "#NEXUS\n"); fprintf(stdout, "[\nGenerated by %s %s\n\n", PROGRAM_NAME, VERSION_NUMBER); PrintVerbose(stdout); fprintf(stdout, "]\n\n"); } } else if (treeSet[0]->numTips != numTaxa) { fprintf(stderr, "All trees must have the same number of tips.\n"); exit(0); } if (maxPartitions == 1) { if (partitionLengths[0] != -1) { fprintf(stderr, "\nWARNING: The treefile contained partion lengths but only one partition\n"); fprintf(stderr, "was specified.\n"); } partitionLengths[0] = numBases; } sumLength = partitionLengths[0]; i = 1; while (sumLength < numBases && i <= maxPartitions) { if (!IsTreeAvail(tree_fv)) { fprintf(stderr, "\nA set of trees number %d had less partition length (%d) than\n"); fprintf(stderr, "was required to make a sequence of length %d.\n", treeNo + 1, sumLength, numBases); exit(0); } ReadTree(tree_fv, treeSet[i], treeNo+1, treeSet[0]->numTips, treeSet[0]->names, &partitionLengths[i], &partitionRates[i]); if (treeSet[i]->numTips != numTaxa) { fprintf(stderr, "All trees must have the same number of tips.\n"); exit(0); } sumLength += partitionLengths[i]; i++; } if (i > maxPartitions) { fprintf(stderr, "\nA set of trees number %d had more partitions (%d) than\n"); fprintf(stderr, "was specified in the user options (%d).\n", treeNo + 1, i, maxPartitions); } numPartitions = i; if (sumLength != numBases) { fprintf(stderr, "The sum of the partition lengths in the treefile does not equal\n"); fprintf(stderr, "the specified number of sites.\n"); exit(0); } for (i = 0; i < numPartitions; i++) { CreateSequences(treeSet[i], partitionLengths[i]); if (covarion) { CreateHiddenStates(treeSet[i], partitionLengths[i]); } } if (numPartitions > 1) { sum = 0.0; for (i = 0; i < numPartitions; i++) sum += partitionRates[i] * partitionLengths[i]; for (i = 0; i < numPartitions; i++) partitionRates[i] *= numBases / sum; } if (treeNo==0 && verbose && !quiet) { PrintVerbose(stderr); InitProgressBar(numTrees*numDatasets); DrawProgressBar(); } for (i=0; irooted) { fprintf(stderr, "To scale tree length, they must be rooted and ultrametric.\n"); exit(0); } scale *= treeScale/treeSet[j]->totalLength; } else if (scaleBranches) scale *= branchScale; if (covarion) { covarionRate = scale*covarionSpeed; EvolveHiddenStates(treeSet[j], k, partitionLengths[j]); } if (covarion) { scale /= ONfreq; } EvolveSequences(treeSet[j], k, partitionLengths[j], scale, ancestor); k += partitionLengths[j]; } if (writeAncestors) WriteAncestralSequences(stdout, treeSet[0]); else WriteSequences(stdout, (numTrees > 1 ? treeNo+1 : -1), (numDatasets > 1 ? i+1 : -1), treeSet, partitionLengths); if (writeRates) { WriteRates(stderr); } if (writeHidStates && covarion) { WriteHiddenStates(stderr, treeSet[0]); } if (writeLengths && covarion) { WriteAncestralEffectiveLengths(stderr, treeSet[0]); } if (textFile) { while (!feof(text_fv)) { ch = fgetc(text_fv); if (!feof(text_fv)) fputc(ch, stdout); } fputc('\n', stdout); rewind(text_fv); } if (verbose && !quiet) ProgressBar(); } for (i = 0; i < numPartitions; i++) DisposeTree(treeSet[i]); treeNo++; } while (IsTreeAvail(tree_fv)); /* for (i = 0; i < maxPartitions; i++) FreeTree(treeSet[i]); */ if (treeFile) fclose(tree_fv); if (textFile) fclose(text_fv); totalSecs = (double)(clock() - totalStart) / CLOCKS_PER_SEC; if (!quiet) { fprintf(stderr, "Time taken: %G seconds\n", totalSecs); if (verboseMemory) fprintf(stderr, "Total memory used: %ld\n", totalMem); } return 0; }