📄 main.c
字号:
/*----------------------------------------------------------------------- | Sample classification and feature selection for high-dimensional | | genomic data using a genetic algorithm and k-nearest neighbor method | | | | Leping Li, Ph.D. | | | | National Institute of Environmental Health Sciences | | National Institute of Health | | | | Li3@niehs.nih.gov | | | | Date: March 26, 1999 | | Last Modification: Sept. 18, 2004 | | | | copyright (c) 1999-2003 | -----------------------------------------------------------------------*/#include <stdio.h>#include <stdlib.h>#include <math.h>#include <string.h>#include <malloc.h>#include <ctype.h>#include <time.h>#include "ga_knn.h"void rank_by_variation(double **expValue,int numSamples,int numVariables, SampleInfo *sample,char **variableName);int main(int argc,char **argv) { FILE *fp,*f_update,*fq2,*fq3; register int iii,ii,jj,kk,i,j,m,n; int numNiches,populationSize,numGenerations,numRuns; int numVariables,chromosomeLength,numTopGenes; int numSamples,numTraining,numTesting; int knn,numSolutionObtained,numSolutionsSpecified; int logTransform,medianCentering,zTransform,rangeScale,application,majorityRule; int totalCorrectOneRun; double percentCorrectOneRun[200]; /* allows up to 200 top-ranked features */ SampleInfo *sample; /* sample names and classes */ Neighbor **neighbors; /* knn neighbors & distances */ Fitness **fitness; /* fitness scores, all chr */ Fitness *niche; /* fitness scores, niches */ Wheel *weight; /* weights for chromosomes */ Class *class; /* sample class */ int ***allChr; /* all chromosomes */ int *bestChr; /* near-optimal chromosome */ int *predictedClass; /* predicted class */ double **expValue; /* expression data matrix */ char dataFileName[200]; /* input data file name */ char *c,**variableName; char **missingIndicator; int seed,target_R2,printFitnessScore; int solutionFound,bestNicheId,bestChrId,worstChrId; Solution *selectCount,*countCopy; /* no. times a feature selected */ time_t t; if (argc<23) { printf("ga_knn argument: \n"); printf(" -a application\n"); printf(" 1 = application 1 variable selection using all samples.\n"); printf(" 2 = application 2 variable selection using only the training samples.\n"); printf(" 3 = application 3 leave-one-out cross validation.\n"); printf(" 4 = application 4 split the data into training and test sets multiple times.\n\n"); printf(" -c classification rule\n"); printf(" 1 = majority rule (default).\n"); printf(" 2 = consensus rule.\n\n"); printf(" -d chromosome length [integer]\n"); printf(" 10 or 20 should be ok for most cases.\n\n"); printf(" -f data file name [char]\n\n"); printf(" -k k-nearest neighbors[interger]\n"); printf(" try 3 or 5.\n\n"); printf(" -n number of samples in the data [integer]\n\n"); printf(" -p print fitness score on screen for every generation of GA run\n"); printf(" 0 = no print\n"); printf(" 1 = print\n\n"); printf(" -r target termination cutoff [integer]\n\n"); printf(" should be less or equal to the number of training samples\n"); printf(" -s total number of solutions to be collected [integer]\n"); printf(" try 5000.\n\n"); printf(" -t number of training samples [integer]\n"); printf(" should be less or equal to total number of samples.\n\n"); printf(" -v number of variables (genes or m/z ratios) in your data [integer]\n\n"); printf(" -N standardization multiple choices are allowed e.g. -N 1 2 3 [Default: none]\n"); printf(" 1 = log2 transformation\n"); printf(" 2 = median cetering of columns\n"); printf(" 3 = z-transformation of rows\n"); printf(" 4 = range scale (may be used for SELDI-TOF data)\n\n"); exit(0); } logTransform=-1; medianCentering=-1; zTransform=-1; knn=-1; chromosomeLength=-1; rangeScale=-1; application=-1; majorityRule=-1; numVariables=-1; numSamples=-1; numTraining=-1; numSolutionsSpecified=-1; target_R2=-1; printFitnessScore=-1; for (i=0; i<argc; i++) printf("%s\n",argv[i]); for (i=1; i<argc-1; i++) { if (argv[i][0]=='-') { switch(argv[i][1]) { case 'a': application=atoi(argv[i+1]); break; case 'c': majorityRule=atoi(argv[i+1]); break; case 'd': chromosomeLength=atoi(argv[i+1]); break; case 'f': strcpy(dataFileName,argv[i+1]); break; case 'k': knn=atoi(argv[i+1]); break; case 'n': numSamples=atoi(argv[i+1]); break; case 'p': printFitnessScore=atoi(argv[i+1]); break; case 'r': target_R2=atoi(argv[i+1]); break; case 's': numSolutionsSpecified=atoi(argv[i+1]); break; case 't': numTraining=atoi(argv[i+1]); break; case 'v': numVariables=atoi(argv[i+1]); break; case 'N': if (atoi(argv[i+1])==1 && argv[i+1][0]!='-') logTransform=1; if (i+1<argc && atoi(argv[i+1])==2 && argv[i+1][0]!='-') medianCentering=1; if (i+1<argc && atoi(argv[i+1])==3 && argv[i+1][0]!='-') zTransform=1; if (i+1<argc && atoi(argv[i+1])==4 && argv[i+1][0]!='-') rangeScale=1; break; default: break; } } } seed=time(0); random_initialize(seed); numNiches=3; populationSize=50; numGenerations=50; if (application<1 || application>4) application=1; if (majorityRule !=1 && majorityRule!=2) majorityRule=1; if (application==3) numTraining=numSamples-1; numTesting=numSamples-numTraining; if (numTesting<1) { numTraining=numSamples; numTesting=0; } printf("\nData info:\n"); printf(" Total number of samples: %d\n",numSamples); printf(" Number of samples in training set: %d\n",numTraining); printf(" Number of samples in test set: %d\n",numTesting); printf(" Number of variables (genes or m/z...): %d\n",numVariables); printf(" Data file: %s\n",dataFileName); printf("\nGA parameters:\n"); printf(" Number of niches: %d\n",numNiches); printf(" Number of generations: %d\n",numGenerations); printf(" Population size: %d\n",populationSize); printf(" Chromosome length (d): %d\n",chromosomeLength); printf(" Termination fitness cutoff: %d\n",target_R2); printf("\nK-nearest neighbors:\n"); printf(" KNN: %d\n",knn); printf("Others:\n"); printf(" Number of near-optimal chromosomes: %d\n",numSolutionsSpecified); if (numNiches<1 || populationSize<1 || numGenerations<1 || numSamples<1 || numVariables<1 || chromosomeLength<1 || knn<1 || numSolutionsSpecified <1 || target_R2<0) { printf("Invalid input... parameter(s) is negative or zero. Bye...\n"); exit(0); } if (target_R2>numTraining) { printf("\nError: Termination fitness cutoff is greater than the number of training samples!\n\n"); exit(0); } /* finished reading parameters and names of input files */ /* read data file delimited) */ sample=alloc_sample(numSamples); variableName=alloc_char_char(numVariables+5,MAX_NUM_CHAR); /* up to 3000 characters long */ missingIndicator=alloc_char_char(numVariables+5,numSamples); expValue=read_data(argv[1],dataFileName,numSamples,numVariables,variableName,sample,missingIndicator); #ifdef RANK_BY_VARIATION rank_by_variation(expValue,numSamples,numVariables,sample,variableName); exit(0);#endif#ifdef DEBUG_PRINT debug_print(expValue,numSamples,numVariables,variableName,sample,0);#endif class=assign_class(sample,numSamples); missing_value_impute(expValue,class,sample,missingIndicator,numVariables,numSamples); if (logTransform==1) log_transform(expValue,numSamples,numVariables); /* make sure knn is not too large */ check_knn(numSamples,class->min,knn,application); /*-------------------------------------------------------------------*/ /* data standardization and transformation */ /*-------------------------------------------------------------------*/ if (medianCentering==1) { printf("\n\nMedian centering each sample (column).\n"); center_array(expValue,numSamples,numVariables); } if (zTransform==1) { printf("applying z-transformation (standardization, mean=0 sd=1) to each varialbe (row).\n"); autoscale(expValue,numSamples,numVariables); } if (rangeScale==1) { printf("applying range scale to each varialbe (row).\n"); range_scale(expValue,numSamples,numVariables); } if (chromosomeLength>=numVariables) chromosomeLength=numVariables; /*--------------------------------------------------------*/ /* additional memory allocations */ /*--------------------------------------------------------*/ neighbors =alloc_neighbors(numSamples,numSamples); allChr =alloc_int_int_int(numNiches,populationSize,chromosomeLength); bestChr =alloc_int(chromosomeLength); fitness =alloc_fitness_fitness(numNiches,populationSize); niche =alloc_fitness(numNiches); selectCount =alloc_solution(numVariables); countCopy =alloc_solution(numVariables); weight =alloc_wheel(populationSize); predictedClass=alloc_int(numSamples); fq2=NULL; if (application==4) { /* split the data into training and test sets multiple times */ numRuns=50; fq2=fopen("prediction_percentage.txt","w"); fq3=fopen("prediction_test_set.txt","w"); } else if (application==3) { /* leave one out cross validation */ fq3=fopen("prediction_loocv.txt","w"); numRuns=numSamples; } else { /* applications 1 & 2 */ fq3=fopen("prediction_test_set.txt","w"); numRuns=1; } fp=fopen("ga_knn_info.txt","w"); c=(char *)calloc(50,sizeof(char)); t=time(NULL); c=asctime(localtime(&t)); fprintf(fp,"##################################################################\n"); fprintf(fp," Gene Selection and Sample Classification Using\n"); fprintf(fp," the Genetic algorithm/k-nearest neighbors algorithm\n\n"); fprintf(fp," Author: Leping Li\n\n"); fprintf(fp," National Institute of Environmental Health Sciences\n"); fprintf(fp," Research Triangle Park, NC 27709\n"); fprintf(fp," Email: Li3@niehs.nih.gov\n\n"); fprintf(fp," Copyright (c) 1999-2003\n\n"); fprintf(fp," ---------------------Version 1.02--------------------- \n"); fprintf(fp," Last modification: September 18, 2004\n\n"); fprintf(fp,"Date & time calculation performed: %s\n",c); fprintf(fp,"\nData info:\n"); fprintf(fp," Total number of samples: %d\n",numSamples); fprintf(fp," Number of samples in training set: %d\n",numTraining); fprintf(fp," Number of samples in test set: %d\n",numTesting); fprintf(fp," Number of variables (genes or m/z...): %d\n",numVariables); fprintf(fp," Data file: %s\n",dataFileName); fprintf(fp,"\nGA parameters:\n"); fprintf(fp," Number of niches: %d\n",numNiches); fprintf(fp," Number of generations: %d\n",numGenerations); fprintf(fp," Population size: %d\n",populationSize); fprintf(fp," Chromosome length (d): %d\n",chromosomeLength); fprintf(fp," Termination fitness cutoff: %d\n",target_R2); fprintf(fp," Number of solutions specified: %d\n",numSolutionsSpecified); fprintf(fp,"\nOthers:\n"); fprintf(fp," Random seed number: %d\n",seed); fprintf(fp,"\nTotal number of classes: %4d, individual class type: ",class->num_class); for (i=0; i<class->num_class; i++) fprintf(fp,"%3d ",class->type[i]); fprintf(fp,".\n"); fprintf(fp,"Number of samples in each class: "); for (j=0; j<class->num_class; j++) fprintf(fp,"%3d [class - %1d] ",class->count[j],class->type[j]); fprintf(fp,"\n"); fprintf(fp,"Minimal and maximal numbers of samples in a class: %4d %4d\n",class->min,class->max); fprintf(fp,"\n"); for (i=0; i<class->num_class; i++) { fprintf(fp," class[%d]: ",class->type[i]); for (j=0; j<class->count[i]; j++) { fprintf(fp,"%4d ",class->which[i][j]+1); } fprintf(fp,"\n"); } if (majorityRule==0) { fprintf(fp,"\nKNN:\n"); fprintf(fp," k-nearest neighbors=%2d\n",knn); fprintf(fp," A consensus rule applies.\n"); fprintf(fp," - All neighbors must agree (e.g, %d out of %d).\n\n",knn,knn); } else if (majorityRule==1) { printf("\nKNN: using a majority rule.\n\n"); fprintf(fp,"\nKNN:\n"); fprintf(fp," k-nearest neighbors=%2d\n",knn); fprintf(fp," A majority rule applies.\n"); fprintf(fp," - A majority of the neighbors must agree that is AT LEAST\n"); fprintf(fp," %d out of the %d neighbors must be the same type.\n", (int)(ceil((double)knn/2.0)),knn); } if (application==4) { if (numTesting !=0) { fprintf(fp,"\nApplication 4...\n"); fprintf(fp," The data set is randomly divided into a training and test set.\n"); fprintf(fp," The number of samples in each class in both training and test\n"); fprintf(fp," sets are proportional.\n\n"); fprintf(fp," The training set is used to develop a classification rule which\n"); fprintf(fp," is subsequently used to predict the classes of the test samples\n"); fprintf(fp," This process is repeated many times (default is 50), each of which.\n"); fprintf(fp," uses different training and test sets that are randonly partitioned.\n\n"); } } if (application==3) { fprintf(fp,"\nApplication 3...\n"); fprintf(fp," A leave-one-out cross-validation (LOOCV) is carried out.\n"); } fprintf(fp," \nInformation on data processing:\n"); if (logTransform==1) fprintf(fp," Data are log2 transformed.\n\n"); if (medianCentering==1) fprintf(fp," Each sample (column) is centered by the median of all elements in the col.\n"); if (zTransform==1) fprintf(fp," Each row (variable, e.g., m/z or gene) is z-transformed (mean=0 sd=1).\n"); if (rangeScale==1) fprintf(fp," Each row (variable, e.g., m/z or gene) is range-standardized (Between 0 and 1).\n"); fprintf(fp,"##################################################################\n"); fflush (fp); fclose(fp); for (j=0;j<numVariables;j++) { selectCount[j].index=j; selectCount[j].total=0; } printf("\nRuning GA/KNN ... It may take a few minutes to many hours depending on\n"); printf("the size of your data and how difficult the classes can be separated. In a\n"); printf("few minutes, it should print out on screen the number of solutions obtained\n"); printf("so far. If it takes too long, kill the job (Control/C), reduce fitness\n"); printf("cuotff and restart it.\n\n"); for (iii=0; iii<numRuns; iii++) { if (application==4) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -