/*
* File : correlations.c
* Author : Gavin Sherlock
* Date : September 29th 1999
* Version 1.0
*
* This program reads a preclustering file, and produces a file that contains a sorted list of correlations.
* The user can input a cutoff (as low as .5) and a number (as many as 50)
*/
#include "correlations.h"
/* Main */
int main(int argc, char *argv[]){
char ifile[1024]; /* Path names must be 1024 chars or less */
int numExperiments=0;
int numLines=0;
float *dataMatrix;
clusterRec cluster;
float *eWeights;
FILE *istream;
char **experimentNames;
/* get relevant input information */
if(argc==1 || argc==0){
printf ("Enter input file: ");
gets (ifile);
GetUserInput();
}else{
ParseOptions(ifile, argc, argv);
}
/* Read in the Data */
istream=OpenInFile(ifile);
GetDataSize(istream, &numExperiments, &numLines);
cluster.numGenes=numLines; /* put these in experiment to cut down on number of arguments that need passing */
DoMemoryAllocation(&eWeights, numExperiments, &experimentNames, &cluster, &dataMatrix);
rewind(istream);
ReadInData(istream, &cluster, numExperiments, eWeights, experimentNames, dataMatrix);
fclose (istream);
/* transform data if necessary */
if(gLogData) LogTransformData(dataMatrix, cluster.numGenes, numExperiments);
MakeCorrelations(&cluster, dataMatrix, numExperiments, eWeights, ifile);
free (dataMatrix);
free (eWeights);
FreeExperimentNames(experimentNames, numExperiments);
FreeCluster(&cluster);
printf("Finished\n");
fflush(stdout);
return 0;
}
/* Function: OpenInFile
* Usage: istream=OpenInFile(ifile)
* ----------------------
* This functions attempts to open a file using the name
* passed into it, and reports an error if it fails, otherwise
* returns a file handle for reading.
*/
FILE* OpenInFile(char *ifile){
FILE *istream;
if ((istream = fopen(ifile, "rb")) == NULL){ /* Open the file OK? */
Error("\nError opening input\n"); /* NO, say so */
} /* end if */
return (istream);
}
/*
* Function : GetFilePrefix
* Usage : prefix=GetFilePrefix(ifile)
* -----------------------------------
* This function returns the prefix to a filename, which
* is all characters up to the last '.' character, or if
* one is not encountered, then all characters. If a unique
* identifier was passed in, via the -u option, then the directory
* path, plus the UID will be returned instead.
*/
char *GetFilePrefix(char *ifile){
char* prefix;
int letter=0;
int last=0;
int seen=0;
int dirLet=0;
while (ifile[letter]!='\0'){
if (ifile[letter]=='.'){
last=letter;
seen=1;
}else if (ifile[letter]=='/'){
seen=0;
if (gUID) dirLet=letter; /* remember the directory path */
}
letter++;
if (letter==1024) Error("The name of your input file is longer than 1024 characters");
}
if (seen && !gUID){ /* saw a period */
prefix=malloc((last+2)*sizeof(char));
if (prefix==NULL) Error ("Not enough room for prefix\n");
strncpy(prefix, ifile, last+1);
prefix[last+1]='\0';
}else if (!gUID){ /* no period */
prefix=malloc((letter+2)*sizeof(char));
if (prefix==NULL) Error ("Not enough room for prefix\n");
strcpy(prefix, ifile);
prefix[letter]='.';
prefix[letter+1]='\0';
}else{ /* a UID was passed in */
prefix=malloc((dirLet+strlen(gPrefix)+3)*sizeof(char));
if (dirLet) {
strncpy(prefix, ifile, dirLet+1);
prefix[dirLet+1]='\0';
strcat(prefix, gPrefix);
}else{
strcpy(prefix, gPrefix);
}
strcat(prefix, ".");
prefix[dirLet+2+strlen(gPrefix)]='\0';
}
return (prefix);
}
/*
* Function: GetUserInput
* Usage: GetUserInput()
* ---------------------------
* This function gets input form the user, as regards whether they want to cluster
* experiments, and whether they want to use a centered metric for correlation.
* It also checks whether they want to partition the data (and how), whether they
* want to log transform the data, and whether they want to normalize of filter the data.
*/
void GetUserInput(){
GetTransformationOptions();
GetGeneMetric();
GetCutOff();
GetNumCorrelations();
}
/*
* Function: GetTransformationOptions
* Usage:
* ----------------------------------
* This function checks whether they want to log transform the data
*/
void GetTransformationOptions(void){
char inputLine[64];
printf("\nDo the data need to be log transformed?(yes/no)\n");
CheckYesOrNo(inputLine);
if ((!strcmp(inputLine, "yes"))||(!strcmp(inputLine, "YES"))){
gLogData=1;
}
}
/*
* Function: GetGeneMetric
* Usage: GetGeneMetric()
* ------------------------
* This function checks whether they want to use a centered or
* uncentered metric for the genes, if pearson was chosen as the
* distance metric.
*/
void GetGeneMetric(void){
char inputLine[64];
printf("Do you want to use a correlation with a centered metric (yes/no)? \n");
CheckYesOrNo(inputLine);
if ((!strcmp(inputLine, "yes"))||(!strcmp(inputLine, "YES"))){
gCentered=1;
}
}
/*
* Function: GetCutOff
* Usage: GetCutoff()
* -------------------
* This functions finds out what cutoff the user wants. If they enter less than 0.2
* it will use 0.2. The default is 0.8
*/
void GetCutOff(void){
char inputLine[64];
printf("What do you want to use as a cutoff?\n");
printf("(Enter a value greater than 0.2, or hit return to use the default of 0.8)");
gets(inputLine);
if (!strcmp(inputLine, "")){
/* they just hit return, so use default */
}else{
gCutOff=StringToReal(inputLine);
if (gCutOff < 0.2){
printf("You entered a cutoff less than 0.2. .2 will be used as a cutoff\n");
gCutOff = 0.2;
}
if (gCutOff > 1) Error("You chose a cutoff greater than 1");
}
}
/*
* Function: GetNumCorrelations
* Usage : GetNumCorrelations();
* -----------------------------
* This function finds out the number of correlations that the person wants to store.
* The default is 20, and they can have up to 50.
*/
void GetNumCorrelations(void){
char inputLine[64];
printf("How many correlations do you want?\n");
printf("(Enter a number, or hit return to get the default of 20)");
gets(inputLine);
if (!strcmp(inputLine, "")){
/* they just hit return, so use default */
}else{
gMaxNumCorrelations=(int)StringToReal(inputLine);
if (gMaxNumCorrelations < 1) Error("You chose a value less than 1");
}
}
/*
* Function: GetYesOrNo
* Usage:
* --------------------
* This function simply asks for an input, and checks whether it is yes or no.
* It stays continues prompting for a yes or no until one is received.
*/
void CheckYesOrNo(char *inputLine){
while(1){
gets(inputLine);
if (!((!strcmp(inputLine, "yes"))||(!strcmp(inputLine, "YES"))||(!strcmp(inputLine, "no"))
||(!strcmp(inputLine, "NO")))){
printf("Please answer yes or no\n");
}else{
break;
}
}
}
/*
* Function: ParseOptions
* Usage: ParseOptions(ifile, argc, argv)
* ----------------------------------------------
* This function parses supplied command line options.
* Command line options are as follows:
* -f filename
* -g 1|2 ; 1 indicates non-centered metric, 2 indicates centered metric. 1 is default.
* -e 0|1|2 ; 0 indicates no experiment clustering, see above for 1 and 2. 0 is the default.
* The arguments can be passed in any order.
*/
void ParseOptions(char *ifile, int argc, char** argv){
int i;
int foundFileName=0;
int length;
if (!(strcmp("-h", argv[1]))){ /* they want help */
Usage();
exit(0); /* NOTE EXITING PROGRAM HERE!!!! */
}
if ((argc-1)%2)Error("Incorrect number of command line arguments");
for(i=1; i<argc; i+=2){
if (!(strcmp("-l", argv[i]))){ /* log transformation */
if(!(strcmp("0", argv[i+1]))){ /* default, no action needed */
}else if (!(strcmp("1", argv[i+1]))){
gLogData=1;
}else{
Error("Unrecognized value for the \"-l\" option. Only 0 and 1 are valid");
}
}else if (!(strcmp("-u", argv[i]))){ /* want a unique id */
length=strlen(argv[i+1]);
gPrefix=malloc((length+1)*sizeof(char));
strcpy(gPrefix, argv[i+1]);
gPrefix[length]='\0';
gUID=1;
}else if (!(strcmp("-f", argv[i]))){ /* filename */
strcpy(ifile, argv[i+1]);
foundFileName=1;
}else if (!(strcmp("-corr", argv[i]))){ /* correlation parameter */
if (!(strcmp("1", argv[i+1]))){ /* default, no action needed */
}else if (!(strcmp("2", argv[i+1]))){
gCentered=1;
}else{
Error("Unrecognized value for the \"-corr\" option. Only 1 and 2 are valid");
}
}else if (!(strcmp("-cutoff", argv[i]))){ /* cutoff parameter */
gCutOff=StringToReal(argv[i+1]);
if (gCutOff <0.2){
printf("You entered a cutoff less than 0.2. .2 will be used as a cutoff\n");
gCutOff = 0.2;
}
if (gCutOff > 1) Error("You chose a cutoff greater than 1");
}else if (!(strcmp("-num", argv[i]))){ /* number of correlations that they want */
gMaxNumCorrelations=(int)StringToReal(argv[i+1]);
if (gMaxNumCorrelations < 1) Error("You chose a value less than 1");
}else if (!(strcmp("-showCorr", argv[i]))){ /* indicating whether they want the correlations */
gShowCorrelations = (int)StringToReal(argv[i+1]);
if (gShowCorrelations > 1 || gShowCorrelations < 0){
Error("You have chosen an out of range value for defining whether to show correlations or not.");
}
}else{
Error("Unrecognized command line option. Only -f, -corr, -cutoff, -num, -l and -u are valid (currently!)");
}
}
if(!foundFileName)Error("You passed in command line arguments without specifying a filename");
}
/*
* Function: Usage
* Usage : Usage()
* ---------------
* This functions prints out the command line arguments that may be used with the program,
* and what they mean.
*/
void Usage(void){
printf("\n\nThe program \"correlations\" will take a preclustering file as an input, and produce a\n");
printf("file containing the correlations for each gene in sorted order.\n");
printf("The output file will be named with the same stem as the input file, but with a .stdCor suffix\n\n");
printf("Usage:\n");
printf("The following command line arguments may be used:\n\n");
printf("-f Allows you to specify the preclustering filename. Relative paths may be used\n\n");
printf("-corr 1|2 Allows you to specify whether you want an uncentered (1) or a centered (2) metric.\n");
printf(" 1 is the default\n\n");
printf("-cutoff Allows you to specify a cutoff, correlations above which will be stored\n\n");
printf("-num Allows you to specify the number of correlations that you would like to store\n");
printf(" 20 is the default\n\n");
printf("-l 0|1 Allows you to specify if you want to log transform the data (1)\n");
printf(" 0 is the default\n\n");
printf("-u Allows you to specify a unique id by which you output file will be named\n");
printf(" eg. correlations -f sample.pcl -u 888\n");
printf(" will produce an output file named 888.stdCor\n\n");
printf("-showCorr 0|1 specifies whether you want to see thhe correlations themselves.\n");
printf(" 1 is the default\n\n");
printf("Questions or comments should be addressed to sherlock@genome.stanford.edu\n\n");
}
/*
* Function: OpenOutFile
* Usage: outfile=OpenOutFile("outfile.txt")
* -----------------------------------------
* This functions attempts to open a file using the name
* passed into it, and reports an error if it fails, otherwise
* returns a file handle for writing.
*/
FILE* OpenOutFile(char *ofile){
FILE *ostream;
if ((ostream = fopen(ofile, "w")) == NULL){ /* Open the file OK? */
Error("\nError opening output file: %s\n", ofile); /* NO, say so */
} /* end if */
return (ostream);
}
/*
* Function: OpenForAppend
* Usage: outfile=OpenForAppend("outfile.txt")
* -----------------------------------------
* This functions attempts to open a file for appending using the name
* passed into it, and reports an error if it fails, otherwise
* returns a file handle for appending.
*/
FILE* OpenForAppend(char *ofile){
FILE *ostream;
if ((ostream = fopen(ofile, "a")) == NULL){ /* Open the file OK? */
Error("\nError opening output file: %s\n", ofile); /* NO, say so */
} /* end if */
return (ostream);
}
/* Function: GetDataSize
* Usage:GetDataSize(istream, &numExperiments, &numLines)
* --------------------------------------------------
* This functions parses the data file, to determine the number
* of genes and experiments represented therein. Because the
* variables are passed in by address then there is no return argument.
*/
void GetDataSize(FILE *istream, int *numExperiments, int *numLines){
int nextbyte;
int extraChar;
printf("Getting size of data...\n");
fflush(stdout);
while ((nextbyte = fgetc(istream))){ /* get characters from input file */
if (nextbyte=='\t'){ /* count tabs to get number of columns */
(*numExperiments)++;
}
/* the next bit looks for the end of line, whether it's PC ('\r\n'),
* Mac ('\r') or UNIX ('\n'). It checks to see if the following character
* has to do with the end of line also. If not, it puts it back in the stream.
*/
if ((nextbyte=='\r') || (nextbyte=='\n')){ /* look for end of line */
extraChar=fgetc(istream);
if (extraChar!='\n' && extraChar!=EOF){
ungetc(extraChar, istream);
}
break; /* only reading first line to get number of columns */
}
}
(*numExperiments)-=2; /* the first two tabs didn't precede data so decrease the number */
while((nextbyte = fgetc(istream)) != EOF){ /* Read char.s until end of file */
switch (nextbyte) /* What char was read? */
{
case '\r': /* on the Mac or the PC, \r will be the first thing seen to indicate an end of line */
(*numLines)++;
extraChar=fgetc(istream); /* get rid of extra character if needs be */
if (extraChar!='\n' && extraChar!=EOF){
ungetc(extraChar, istream);
}
break;
case '\n': /* on UNIX \n will be seen as the end of line */
(*numLines)++;
break;
default:
break;
} /* end switch */
} /* end while */
(*numLines)--; /* because there's an extra line in the file */
}
/*
* Function: DoMemoryAllocation
* Usage: DoMemoryAllocation(&eWeights, numExperiments, &experimentNames, &cluster, &dataMatrix);
* -------------------------------------------------------------------------------------------------------------------
* This function does the DMA that is needed for all the data structures that are required,
* including the dataMatrix, the experiment names, and the nameRecs.
*/
void DoMemoryAllocation(float **eWeights, int numExperiments, char ***experimentNames,
clusterRec *cluster, float **dataMatrix){
nameRec *namePtr;
*eWeights=malloc(numExperiments*sizeof(float));
if (*eWeights==NULL) Error("Not enough memory available for eWeights");
*experimentNames=malloc(numExperiments*sizeof(char*));
if (*experimentNames==NULL) Error ("No memory available for experimentNames");
(*cluster).genes=malloc(cluster->numGenes * sizeof(nameRec)); /* allocate the memory for the name records */
if ((*cluster).genes==NULL) Error("No memory available for nameRecs");
/* now initialize the genes array */
for (namePtr=&(cluster->genes[0]); namePtr<&(cluster->genes[cluster->numGenes]); namePtr++){
InitializeArray(namePtr);
}
/* allocate dataMatrix */
*dataMatrix=malloc(numExperiments*(cluster->numGenes)*sizeof(float));
if (*dataMatrix==NULL) Error("No memory available for dataMatrix");
}
/*
* Function: ReadInData
* Usage: ReadInData(istream, &cluster, numExperiments, eWeights, numLines, &experimentNames, dataMatrix)
* -------------------------------------------------------------
* This functions parses the data file, to retrieve the log transformed data.
* it also retrieves the eWeights, that form the second line of the file.
*/
float *ReadInData(FILE *istream, clusterRec *cluster, int numExperiments, float *eWeights, char **experimentNames, float *dataMatrix){
char buffer[1024];
char nextbyte;
char extraChar;
int currCol=0;
int letter=0;
int dataCol=0;
int currLine=0;
printf("Reading Data...\n");
fflush(stdout);
/* parse the first line for the experiment names
* should really consolidate code for parsing first and second lines
* by switching again, dependent on the lineNumber, 1 or 2
*/
while ((nextbyte=fgetc(istream))){
if (nextbyte=='\t'){
buffer[letter]='\0';
switch (++currCol){ /* depends what column we're looking at as to what we'll do */
case 1:
case 2:
case 3:
letter=0;
break;
default: /* other columns must be data columns */
if (letter!=0){
experimentNames[dataCol]=malloc((letter+1)*sizeof(char));
}else{
Error("An experiment is unnamed at data column %d\n", dataCol+1);
}
if (experimentNames[dataCol]==NULL) Error("Not enough memory for experiment name %d", dataCol+1);
strcpy(experimentNames[dataCol], buffer);
letter=0;
dataCol++;
break;
} /* end switch */
}else if (nextbyte=='\r' || nextbyte=='\n'){
buffer[letter]='\0';
if (letter!=0){
experimentNames[dataCol]=malloc((letter+1)*sizeof(char));
}else{
Error("An experiment is unnamed at data column %d\n", dataCol+1);
}
if (experimentNames[dataCol]==NULL) Error("Not enough memory for experiment name %d", dataCol+1);
strcpy(experimentNames[dataCol], buffer);
extraChar=fgetc(istream); /* get rid of extra character if needs be */
if (extraChar!='\n' && extraChar!=EOF){
ungetc(extraChar, istream);
}
break;
}else{
if (letter>=1024) Error("An experiment name longer than 1024 bytes exists in the file on the first line.");
buffer[letter++]=nextbyte;
}
}
letter=0;
dataCol=0;
currCol=0;
/* Now read in the eWeights line */
while ((nextbyte=fgetc(istream))){
if (nextbyte=='\t'){
buffer[letter]='\0';
switch (++currCol){
case 1:
case 2:
case 3:
letter=0;
break;
default: /* other columns must be data columns */
eWeights[dataCol]=StringToReal(buffer);
letter=0;
dataCol++;
break;
} /* end switch */
}else if (nextbyte=='\r' || nextbyte=='\n'){
buffer[letter]='\0';
eWeights[dataCol]=StringToReal(buffer);
extraChar=fgetc(istream); /* get rid of extra character if needs be */
if (extraChar!='\n' && extraChar!=EOF){
ungetc(extraChar, istream);
}
break;
}else{
if (letter>=1024) Error("A token longer than 1024 bytes exists in the file on the second line.");
buffer[letter++]=nextbyte;
}
}
for (currLine=0; currLine<cluster->numGenes; currLine++){
ReadOneLine(istream, dataMatrix, numExperiments, currLine, &(cluster->genes[currLine]));
}
printf("Done reading data...\n");
fflush(stdout);
return (dataMatrix);
}
/*
* Function: InitializeArray
* Usage: InitializeArray(&(names[count]));
* ----------------------------------------
* This simply initializes the fields of each nameRec to be zero or NULL
*/
void InitializeArray(nameRec *names){
names->orf=NULL;
names->name=NULL;
names->joined=0;
names->numCorrelations=0;
names->last=NULL;
names->first=NULL;
}
/* Function: ReadOneLine
* Usage: ReadOneLine(istream, dataMatrix, &numDataPoints, numExperiments, currLine, &(names[currLine]))
* ---------------------------------------------------
* This function is called from ReadInData, and will parse one
* line of data, depositing the data into dataMatrix, the space for
* which was allocated in the ReadInData function.
*/
void ReadOneLine(FILE *istream, float *dataMatrix, int numExperiments, int currLine, nameRec *names){
char buffer[1024];
int currCol=0;
int dataCol=0;
int tabCount=0; /* if two tabs next to each other, then a null value exists */
char nextbyte;
char extraChar;
int letter=0;
/* initialize gene records */
while ((nextbyte=fgetc(istream))!=EOF){
switch (nextbyte){
case '\t':
tabCount++;
if (dataCol>=numExperiments) Error("There are more columns of data than expected on data line %d. Only %d columns were expected.\n", currLine+1, numExperiments);
if (tabCount==2){ /* a null data point exists, or there's a missing name */
if (currCol>2){
dataMatrix[currLine*numExperiments+dataCol]=NODATA;
currCol++;
dataCol++;
tabCount=1;
letter=0;
break;
}else{
tabCount=0;
}
}
buffer[letter]='\0';
switch (currCol++){
case 0: /* First column is the ORF name */
if (letter>0){
names->orf=malloc((letter+1)*sizeof(char));
if (names->orf==NULL) Error("Not Enough Memory Available for orfs\n");
strcpy(names->orf, buffer);
}else{
/*printf("Missing Name\n");*/
}
letter=0;
break;
case 1: /* Second column is the SGD name */
if (letter>0){
names->name=malloc((letter+1)*sizeof(char));
if (names->name==NULL) Error("Not Enough Memory Available for names\n");
strcpy(names->name, buffer);
}else{
/*printf("Missing Name\n");*/
}
letter=0;
break;
case 2: /* Third column is the row weight */
names->rowWeight=(float)(StringToReal(buffer));
letter=0;
break;
default: /* other columns must be data columns */
dataMatrix[currLine*numExperiments+dataCol]=(float)(StringToReal(buffer));
letter=0;
dataCol++;
break;
}
break;
case '\r':
case '\n':
extraChar=fgetc(istream); /* get rid of extra character if needs be */
if (extraChar!='\n' && extraChar!=EOF){
ungetc(extraChar, istream);
}
tabCount++;
if (dataCol>=numExperiments) Error("There are more columns of data than expected on data line %d. Only %d columns were expected, but %d were found.\n", currLine+1, numExperiments, dataCol+1);
if(dataCol<numExperiments-1)Error("There are less columns of data than expected on data line %d. %d columns were expected, but only %d were found.\n", currLine+1, numExperiments, dataCol+1);
if (tabCount==2){ /* a null data point exists */
dataMatrix[currLine*numExperiments+dataCol]=NODATA;
}else{
buffer[letter]='\0';
dataMatrix[currLine*numExperiments+dataCol]=(float)(StringToReal(buffer));
}
return;
break;
default:
if (letter>=1024) Error("A token longer than 1024 bytes exists in the file on line %d", currLine);
buffer[letter++]=nextbyte;
tabCount=0;
break;
}
}
}
/*
* Function: StringToReal
* Usage: s=StringToReal(buffer)
* ---------------------------
* This function converts a string representing a real number
* into its corresponding value. If the string is not a legal
* floating point number, or it contains extraneous characters,
* StringToReal signals an Error condition. This code was taken
* from Eric Roberts' book "The Art and Science of C".
*/
double StringToReal(char *s){
double result;
char dummy;
if (s==NULL) Error("NULL string passed to StringToReal");
if (sscanf(s, " %lg %c", &result, &dummy) !=1){
Error("StringToReal called on illegal number %s", s);
}
return (result);
}
/*
* Function: MakeCorrelations
* Usage: MakeCorrelations(&cluster, dataMatrix, numExperiments, eWeights, numGenes, 'G')
* MakeCorrelations(&cluster, dataMatrix, numExperiments, eWeights, numExperiments, 'E')
* --------------------------------------------------
* This function generates the correlation coefficients
* by comparing each profile to every other profile.
* It cuts the work in half by not comparing A vs B, and
* B vs A, which should be identical. It will save up to gMaxNumCorrelations
* of the top correlations. This allows correlation scores to
* be freed once the node has been joined. The scores must be in
* sorted order. There are pointers to both the top and bottom scores,
* so that it can rapidly be decided whether a correlation is the highest,
* or whether a newly calculated correlation should be included.
* By passing in different "total", and "type", the function can be made to
* calculate correlations for experiments, or genes.
*/
void MakeCorrelations(clusterRec *cluster, float *dataMatrix, int numExperiments, float *eWeights, char *ifile){
int counter;
int comparedToCounter;
float pearsonCorrelation;
FILE *outfile;
int numAfterWhichToPrint=400000/cluster->numGenes;
char *filename;
printf("Making correlations\n");
fflush(stdout);
MakeFileName(ifile, &filename);
outfile = OpenOutFile(filename);
for (counter=0; counter<cluster->numGenes-1; counter++){
for (comparedToCounter=counter+1; comparedToCounter<cluster->numGenes; comparedToCounter++){
pearsonCorrelation=CalculateCorrelation(&dataMatrix[counter*numExperiments], &dataMatrix[comparedToCounter*numExperiments], numExperiments, eWeights);
/* Once we have the correlation coefficient we have to check
* if it's greater than the last value in the linked list
* for BOTH the geneCounter gene, and the comparedToCounter
* gene. This way every linked list will contain its top
* five most similar correlations. If you only checked the
* geneCounter gene, then it is possible (it does happen, I
* checked) that you would not be storing the top correlations
* for some genes anywhere.
*/
CheckToInsert(cluster, counter, comparedToCounter, pearsonCorrelation);
CheckToInsert(cluster, comparedToCounter, counter, pearsonCorrelation);
}
/* now print out relevant correlations for gene, then free those correlations */
fprintf(outfile, "%s", cluster->genes[counter].orf);
PrintOneGene(cluster->genes[counter].first, outfile, cluster);
FreeCorrelations(&(cluster->genes[counter].first));
if (!(counter%numAfterWhichToPrint)){
printf ("%d\n", counter);
fflush(stdout);
}
}
/* print correlations for final gene */
fprintf(outfile, "%s", cluster->genes[counter].orf);
PrintOneGene(cluster->genes[counter].first, outfile, cluster);
FreeCorrelations(&(cluster->genes[counter].first));
printf("Done Making Correlations\n");
fflush(stdout);
fclose(outfile);
free(filename);
}
/*
* Function: CalculateCorrelation
* Usage: pearsonCorrelation=CalculateCorrelation(geneCounter, comparedToCounter, dataMatrix, numExperiments, eWeights, type, cluster->numGenes)
* ------------------------------------------------------------------------------------------------------------------------
* This function calculates the correlation between two expression profiles, whose identities are
* indicated by geneCounter and comparedToCounter. The algorithm calculates the pearson correlation
* and only utilizes data where the two profiles have data (experiments) in common. It is able
* to keep running totals, so that it does not have to go over the datasets once again after it
* has found which experiments are in common between two profiles. This code is taken almost verbatim
* from Mike Eisen's original correlation code. One bug in that code has been fixed (the original
* code wrongly used numExperiments, instead of Count, in the final calculation) and the data access looks
* somewhat different to reflect the nature of the way that I have stored the data in what was a
* dynamically allocated array. I have used pointers all the time to access the array data. It looks
* ugly and confusing, as I'm updating the pointers themselves, rather than a separate variable. It is,
* however, far quicker (~35%!).
*/
float CalculateCorrelation(float *genePtr, float *cmpPtr, int numExperiments, float *colPtr){
float Sum1, Sum2, Sum11, Sum22, Sum12, Ave1, Ave2, Count, Corr, norm;
register float colWeight;
register float geneVal;
register float cmpVal;
float *max=genePtr+numExperiments;
Sum1 = Sum2 = Sum11 = Sum22 = Sum12 = Count = Ave1 = Ave2 = 0;
Corr=-1.0;
for(; genePtr<max; genePtr++, cmpPtr++, colPtr++){
geneVal=*genePtr;
cmpVal=*cmpPtr;
if(geneVal!=NODATA && cmpVal!=NODATA){
colWeight=*colPtr;
Sum1+=colWeight * geneVal;
Sum2+=colWeight * cmpVal;
Sum11+=colWeight * geneVal * geneVal;
Sum22+=colWeight * cmpVal * cmpVal;
Sum12+=colWeight * geneVal * cmpVal;
Count+=colWeight;
}
}
if (Count){
if(gCentered){
Ave1 = Sum1/Count;
Ave2 = Sum2/Count;
norm=sqrt((Sum11 - 2 * Ave1 * Sum1 + Count * Ave1 * Ave1)*(Sum22 - 2 * Ave2 * Sum2 + Count * Ave2 * Ave2));
if (norm>0){
Corr = (Sum12 - Sum1 * Ave2 - Sum2 * Ave1 + Count * Ave1 * Ave2) /norm;
}
}else{
norm=sqrt(Sum11*Sum22);
if (norm>0){
Corr=(Sum12/norm);
}
}
}else{
Corr=0;
}
return (Corr);
}
/*
* Function: CheckToInsert
* Usage: CheckToInsert(cluster, geneCounter, comparedToCounter, pearsonCorrelation);
* -------------------------------------------------------------------------------------
* This function checks whether a correlation that was just made is
* worthy of insertion into a list of correlations, based on it's value.
* if so it makes a new record, which contains a number that indicates
* the gene to which the geneCounter gene is correlated to, then calls
* a function to insert it. It will also check whether there are enough
* correlations already in the list, and delete the lowest one if appropriate.
*/
void CheckToInsert(clusterRec *cluster, int geneCounter, int comparedToCounter, float pearsonCorrelation){
correlationRec *newOne;
if (pearsonCorrelation > gCutOff && ((cluster->genes[geneCounter].last==NULL)||
(pearsonCorrelation > cluster->genes[geneCounter].last->corr)|| (cluster->genes[geneCounter].numCorrelations<gMaxNumCorrelations))){
if (cluster->genes[geneCounter].numCorrelations==gMaxNumCorrelations){
cluster->genes[geneCounter].last->corr=pearsonCorrelation;
cluster->genes[geneCounter].last->ORFnumber=comparedToCounter;
cluster->genes[geneCounter].last=SwitchLast(&(cluster->genes[geneCounter].first), cluster->genes[geneCounter].last);
}else{
newOne=MakeNewRecord(pearsonCorrelation, comparedToCounter);
InsertSorted(&(cluster->genes[geneCounter].first), newOne);
if (newOne->next==NULL){
cluster->genes[geneCounter].last=newOne;
}
++cluster->genes[geneCounter].numCorrelations;
}
}
}
/*
* Function: SwitchLast
*/
correlationRec *SwitchLast(correlationRec **list, correlationRec* newOne){
correlationRec *curr, *prev;
prev=NULL;
for (curr=*list; curr->next!=NULL; curr=curr->next){
if (curr->corr<newOne->corr) break; /* We passed it, so now want to insert here */
prev=curr;
}
/* now curr points to the entry to delete (we already actually knew this)
* and prev points to the entry that will now be the end of the list
*/
newOne->next=curr;
if (prev!=NULL){
prev->next=newOne;
}else{
*list=newOne;
}
for(;curr->next!=newOne;curr=curr->next){}
curr->next=NULL;
return curr;
}
/*
* Function: MakeNewRecord
* Usage: MakeNewRecord(pearsonCorrelation, comparedToCounter)
* -----------------------------------------------------------
* This function allocates space for a new correlationRec
* and initializes its fields
*/
correlationRec *MakeNewRecord(double correlation, int geneNumber){
correlationRec *newOne;
newOne=malloc(sizeof(correlationRec));
if (newOne==NULL) Error("Not enough memory available for a new correlationRec");
newOne->corr=correlation;
newOne->ORFnumber=geneNumber;
newOne->next=NULL;
return (newOne);
}
/*
* Function: InsertSorted
* Usage: InsertSorted(&(cluster->genes[geneCounter]), cp)
* ----------------------------------------------------------
* This function will insert a correlationRec in order (with
* respect to the value of the correlation it contains). It
* handles the special case of their being no previous correlationRecs
* in the list. This is why the list has to be passed in as a double
* pointer.
*/
void InsertSorted(correlationRec **list, correlationRec *newOne){
correlationRec *prev, *curr;
prev=NULL;
for (curr=*list; curr!=NULL; curr=curr->next){
if (curr->corr<newOne->corr) break; /* We passed it, so now want to insert here */
prev=curr;
}
/* now, "prev" points to the one before where the newOne will be inserted,
next "curr" points to the one after */
newOne->next=curr;
if (prev !=NULL){
prev->next=newOne;
}else{
*list = newOne;
}
}
/*
* Function: DeleteLast
* Usage: cluster->genes[geneCounter].last=DeleteLast(cluster->genes[geneCounter].first)
* -------------------------------------------------------
* This function is invoked when the last correlation in a list is to a gene
* that has just been joined into a node. Hence the correlationRec associated
* with that correlation is deleted, and the one in the list previous to it
* is returned as now being at the end of the list.
*/
correlationRec *DeleteLast(correlationRec **list){
correlationRec *curr, *prev;
prev=NULL;
for (curr=*list; curr->next!=NULL; curr=curr->next){
prev=curr;
}
/* now curr points to the entry to delete (we already actually knew this)
* and prev points to the entry that will now be the end of the list
*/
prev->next=NULL;
free (curr);
return prev;
}
/*
* Function: FreeCluster
* Usage: FreeCluster(cluster)
* ---------------------------------
* This function frees the memory associated with a particular dataset,
* including all ORF and genes names.
*/
void FreeCluster(clusterRec *cluster){
int counter;
for (counter=0; counter<cluster->numGenes; counter++){
if (cluster->genes[counter].orf!=NULL) free(cluster->genes[counter].orf);
if (cluster->genes[counter].name!=NULL) free(cluster->genes[counter].name);
}
free (cluster->genes);
}
/*
* Function: FreeCorrelations
* Usage: FreeCorrelations(&(cluster->genes[node1].first))
* ----------------------------------------------------------
* This function walks through a linked list freeing the memory
* associated with each stored correlation.
*/
void FreeCorrelations(correlationRec **node){
correlationRec *curr, *next;
for (curr=*node; curr!=NULL;){
next=curr->next;
free(curr);
curr=next;
}
}
/*
* Function: MakeFile
* Usage: MakeFileNames(ifile, &fileName)
* -----------------------------------------------------------------------
* This function simply determines from the name of the input file, what the
* output files should be called.
*/
void MakeFileName(char *ifile, char **fileName){
char* prefix;
prefix=GetFilePrefix(ifile);
*fileName=malloc((strlen(prefix)+7)*sizeof(char));
if(*fileName==NULL)Error("Not enough memory for filenames");
strcpy(*fileName, prefix);
strcpy(*fileName+strlen(prefix), "stdCor\0");
free(prefix);
}
/*
* Function: Error
* Usage: Error(msg, ...)
* ----------------------
* This function generates an error string, expanding % constructions
* appearing in the error message string just as printf does.
* After printing the error message, the program terminates,
* This code was taken from Eric Roberts' "The Art and Science of C".
*/
void Error(char *msg, ...){
va_list args;
va_start(args, msg);
fprintf(stderr, "Error: ");
vfprintf(stderr, msg, args);
fprintf(stderr, "\n");
va_end(args);
exit(1);
}
/* Function: FreeExperimentNames
* Usage: FreeExperimentNames(experimentNames, numExperiments)
* --------------------------------------------------------------
* This function frees the storage associated with keeping track of the experiment names
*/
void FreeExperimentNames(char **experimentNames, int numExperiments){
int i;
for (i=0; i<numExperiments; i++){
free (experimentNames[i]);
}
free(experimentNames);
}
/*
* Function: PrintOneGene
* Usage: PrintOneGene(cluster->genes[counter].first, outfile)
* --------------------------------------------------------------
* This function will print out the correlations to a particular gene,
* The output is tab delimited, and consists of both the gene name,
* and the correlation value itself.
*/
void PrintOneGene(correlationRec *list, FILE *outfile, clusterRec *cluster){
correlationRec *curr;
for (curr=list; curr!=NULL; curr=curr->next){
fprintf(outfile, "\t%s", cluster->genes[(curr->ORFnumber)].orf);
if (gShowCorrelations){
fprintf(outfile, "\t%f", curr->corr);
}
}
fprintf(outfile, "\n");
}
/*
* Function : LogTransformData
* Usage : LogTransformData(dataMatrix, numGenes, numExperiments)
* --------------------------------------------------------------
* This function transforms all the data in the dataMatrix into
* into log base2
*/
void LogTransformData(float *dataMatrix, int numGenes, int numExperiments){
int gene;
int experiment;
float log2=log(2);
printf("Log Transforming Data\n");
for (experiment=0; experiment<numExperiments; experiment++){
for (gene=0; gene<numGenes; gene++){
if (dataMatrix[gene*numExperiments+experiment]!=NODATA){
if (dataMatrix[gene*numExperiments+experiment]<=0){
Error("There's is a value in gene %d, experiment %d, at or below zero, that can't be logged", gene, experiment);
}else{
dataMatrix[gene*numExperiments+experiment]=log(dataMatrix[gene*numExperiments+experiment])/log2;
}
}
}
}
}