/* * Copyright © 2006. Patrick Meirmans. * * You can use or modify this source code for any use, provided that when it is used for a scientific publication, * a reference is given to: * * P.G. Meirmans, 2006, USING THE AMOVA FRAMEWORK TO ESTIMATE A STANDARDISED GENETIC DIFFERENTIATION MEASURE, Evolution, 60 (11), p.2399-2402 * * for bugs or any other remarks, please email to pmeirmans _at_ cfl _dot_ forestry _dot_ ca */ #include #include #include #include #include #define VERSION 0.1 #define EMAIL "pmeirmans" #define MAILDOMAIN "cfl.forestry.ca" int max_num_alleles (int ***markerdata, int num_locs, int num_inds, int max_ploi, int num_digits); int main(void) { int a, p, k, i, j, l, v, w, y, z; int num_digits = 2; int num_inds; int num_pops; int num_locs; int max_ploi; int getout; int fstat = 0; char s[100], d[100]; char in_name[50]; char out_name[50]; char c; char commentline[202]; char *pop_names; char *ind_names; char **loc_names; int ***markerdata; int *pop_vector; int allele_num; int *alleles; int allele, recode_counter, name_counter, found; FILE *reading, *writing; printf("\n --------------------------------------------------------------------"); printf("\n\n recodeData version %.1f, by Patrick Meirmans", VERSION); printf("\n\n Canadian Forest Service"); printf("\n %s@%s", EMAIL, MAILDOMAIN); printf("\n\n --------------------------------------------------------------------\n"); i = 0; while (i == 0){ /*Loop for opening input-file*/ printf("\n\nFiles with the extension .dat are expected to be in Fstat-format."); printf("\nAll other files are expected to be in GenoType-format."); printf("\n\nGive name of input file: "); fgets(in_name, 50, stdin); j = strlen(in_name); in_name[j-1] = '\0'; reading = fopen(in_name, "r"); if(reading == NULL){ printf("ERROR: Could not open input-file\n"); j = strlen(in_name); if(in_name[j - 4] != '.'){ printf("\nTry opening %s.txt (y/n)? ", in_name); fgets(s, 50, stdin); printf("\n"); if (s[0] == 'y' || s[0] == 'Y'){ sprintf(in_name, "%s.txt", in_name); reading = fopen(in_name, "r"); if(reading == NULL){ printf("ERROR: Could not open input-file\n"); } else{ i++; } } } } else{ i++; } } i = strlen(in_name); if(i > 4){ if(in_name[i - 4] == '.' && in_name[i - 3] == 'd' && in_name[i - 2] == 'a' && in_name[i - 1] == 't'){ fstat = 1; printf("Data read from Fstat-format\n\n"); } } if(fstat == 0){ /* When data is in GenoType format */ fgets(commentline, 200, reading); /*lees commentline en stop deze in array*/ fscanf(reading, "%d", &num_inds); /*lees aantal individuen */ fscanf(reading, "%d", &num_pops); /*lees aantal populaties */ fscanf(reading, "%d", &num_locs); /*lees aantal loci */ fscanf(reading, "%d", &max_ploi); /*lees maximum ploidie niveau */ fscanf(reading, "%d", &num_digits); /*lees aantal digits per allel */ if ((pop_names = malloc(num_pops * 26 * sizeof(char))) == NULL) { (void)printf("ERROR: pop_names malloc failed, more memory needed"); (void)exit(EXIT_FAILURE); } if ((pop_vector = calloc(num_inds, sizeof(int))) == NULL) { (void)printf("ERROR: pop_vector malloc failed, more memory needed"); (void)exit(EXIT_FAILURE); } if ((ind_names = malloc((num_inds +1) * 12 * sizeof(char))) == NULL) { (void)printf("ERROR: ind_names malloc failed, more memory needed"); (void)exit(EXIT_FAILURE); } if ((loc_names = malloc(num_locs * sizeof(char *))) == NULL) { (void)printf("ERROR: ind_names malloc failed, more memory needed"); (void)exit(EXIT_FAILURE); } for(l = 0; l < num_locs; l++){ if ((loc_names[l] = malloc(25 * sizeof(char))) == NULL) { (void)printf("ERROR: ind_names malloc failed, more memory needed"); (void)exit(EXIT_FAILURE); } } /* do 3d m/calloc for markerdata */ if ((markerdata = malloc(num_inds * sizeof(int **))) == NULL) { (void)printf("ERROR: markerdata malloc failed, more memory needed"); (void)exit(EXIT_FAILURE); } for(i = 0; i < num_inds; i++){ if ((markerdata[i] = malloc(num_locs * sizeof(int *))) == NULL) { (void)printf("ERROR: markerdata malloc%d failed, more memory needed", i); (void)exit(EXIT_FAILURE); } } for(i = 0; i < num_inds; i++){ for(l = 0; l < num_locs; l++){ if ((markerdata[i][l] = calloc(max_ploi, sizeof(int))) == NULL) { (void)printf("ERROR: markerdata malloc %d %d failed, more memory needed", i, l); (void)exit(EXIT_FAILURE); } } } for(l = 0; l < num_locs; l++){ sprintf(&loc_names[l][0], "Locus%d", l+1); } for(i = 0; i < num_pops; i++){ fscanf(reading, "%s", (pop_names + (i * 25))); } for(i = 0; i < num_inds; i++){ while( (isdigit(c = fgetc(reading))) == 0) { if (c == EOF){ (void)printf("ERROR: wrong number of individuals"); (void)exit(EXIT_FAILURE); } } ungetc(c, reading); fscanf(reading, "%d", &pop_vector[i]); /*i = individu, l = locus, a =allel*/ if(pop_vector[i] > num_pops){ (void)printf("ERROR: individual %d population number too high (%d)", i, pop_vector[i]); (void)exit(EXIT_FAILURE); } fscanf(reading, "%s", (ind_names + (i * 12))); for(l = 0; l < num_locs; l++){ fscanf(reading, "%s", s); if (isdigit(s[0]) == 0 || s[0] == '\n' || s[0] == EOF){ (void)printf("ERROR: input file error, individual %d, locus %d", (i + 1), (l + 1)); (void)exit(EXIT_FAILURE); } v = strlen(s); w = v / (num_digits); /*w = aantal hele allelen*/ y = (v % num_digits); /*y = overgebleven karakters*/ if(w > max_ploi || (y > 0 && (w + 1) > max_ploi) ){ (void)printf("ERROR: ploidy higher than maximum, individual %d, locus %d", (i + 1), (l + 1)); (void)exit(EXIT_FAILURE); } v = 0; if(y > 0){ strncpy(d, s, y); d[y] = '\0'; markerdata[i][l][v] = atoi(d); v++; } if(w > 0){ for(z = 0; z < w; z++){ strncpy(d, &s[y + z * num_digits], num_digits); d[num_digits] = '\0'; markerdata[i][l][v] = atoi(d); v++; } } } } } if(fstat == 1){ /* When data is in Fstat format */ num_inds = 0; max_ploi = 2; sprintf(commentline, "Data read from Fstat file\n"); fscanf(reading, "%d", &num_pops); /*lees aantal populaties */ fscanf(reading, "%d", &num_locs); /*lees aantal loci */ fscanf(reading, "%d", &getout); /*lees maximum allele length, not used */ fscanf(reading, "%d", &num_digits); /*lees aantal digits per allel */ /* malloc for locus names */ if ((loc_names = malloc(num_locs * sizeof(char *))) == NULL) { (void)printf("ERROR: ind_names malloc failed, more memory needed"); (void)exit(EXIT_FAILURE); } for(l = 0; l < num_locs; l++){ if ((loc_names[l] = malloc(25 * sizeof(char))) == NULL) { (void)printf("ERROR: ind_names malloc failed, more memory needed"); (void)exit(EXIT_FAILURE); } } for(l = 0; l < num_locs; l++){ fscanf(reading, "%s", loc_names[l]); /* read locus names */ } num_inds = 0; while((c = fgetc(reading)) != EOF){ ungetc(c, reading); fscanf(reading, "%d", &i); for(l = 0; l < num_locs; l++){ fscanf(reading, "%s", s); if (isdigit(s[0]) == 0 || s[0] == '\n' || s[0] == EOF){ (void)printf("ERROR: input file error, individual %d, locus %d", (i + 1), (l + 1)); (void)exit(EXIT_FAILURE); } v = strlen(s); w = v / (num_digits); /*w = aantal hele allelen*/ y = (v % num_digits); /*y = overgebleven karakters*/ if(y > 0){ w++; } if(w > max_ploi){ max_ploi = w; } } num_inds++; } fclose(reading); /*close and reopen file, get back to beginning of data */ reading = fopen(in_name, "r"); for(i = 1; i <= (num_locs + 4); i++){ fscanf(reading, "%s", s); } /*do bunch of mallocs*/ if ((pop_names = malloc(num_pops * 25 * sizeof(char))) == NULL) { (void)printf("ERROR: pop_names malloc failed, more memory needed"); (void)exit(EXIT_FAILURE); } if ((pop_vector = malloc(num_inds * sizeof(int))) == NULL) { (void)printf("ERROR: pop_names malloc failed, more memory needed"); (void)exit(EXIT_FAILURE); } if ((ind_names = malloc(num_inds * 12 * sizeof(char))) == NULL) { (void)printf("ERROR: ind_names malloc failed, more memory needed"); (void)exit(EXIT_FAILURE); } /* do 3d m/calloc for markerdata */ if ((markerdata = malloc(num_inds * sizeof(int **))) == NULL) { (void)printf("ERROR: markerdata malloc failed, more memory needed"); (void)exit(EXIT_FAILURE); } for(i = 0; i < num_inds; i++){ if ((markerdata[i] = malloc(num_locs * sizeof(int *))) == NULL) { (void)printf("ERROR: markerdata malloc%d failed, more memory needed", i); (void)exit(EXIT_FAILURE); } } for(i = 0; i < num_inds; i++){ for(l = 0; l < num_locs; l++){ if ((markerdata[i][l] = calloc(max_ploi, sizeof(int))) == NULL) { (void)printf("ERROR: markerdata malloc %d %d failed, more memory needed", i, l); (void)exit(EXIT_FAILURE); } } } for(i = 0; i < num_pops; i++){ /*generic population names */ sprintf((pop_names + (i * 25)), "pop%d", i + 1); } /*start reading data */ for(i = 0; i < num_inds; i++){ fscanf(reading, "%d", &pop_vector[i]); /*i = individu, l = locus, a =allel*/ if(pop_vector[i] > num_pops){ (void)printf("ERROR: individual %d population number too high: %d", i, pop_vector[i]); (void)exit(EXIT_FAILURE); } sprintf((ind_names + (i * 12)), "ind%d", i + 1); for(l = 0; l < num_locs; l++){ fscanf(reading, "%s", s); if (isdigit(s[0]) == 0 || s[0] == '\n' || s[0] == EOF){ (void)printf("ERROR: input file error, individual %d, locus %d", (i + 1), (l + 1)); (void)exit(EXIT_FAILURE); } v = strlen(s); w = v / (num_digits); /*w = aantal hele allelen*/ y = (v % num_digits); /*y = overgebleven karakters*/ v = 0; if(y > 0){ strncpy(d, s, y); d[y] = '\0'; markerdata[i][l][v] = atoi(d); v++; } if(w > 0){ for(z = 0; z < w; z++){ strncpy(d, &s[y + z * num_digits], num_digits); d[num_digits] = '\0'; markerdata[i][l][v] = atoi(d); v++; } } } } } /* print overview of input */ printf("Data overview:\n"); printf("%d individuals\n", num_inds); if(num_pops == 1){ printf("1 population\n"); } else{ printf("%d populations\n", num_pops); } if(num_locs == 1){ printf("1 locus\n"); } else{ printf("%d loci\n", num_locs); } if(num_digits == 1){ printf("1 digit per allele\n"); } else{ printf("%d digits per allele\n\n", num_digits); } allele_num = max_num_alleles(markerdata, num_locs, num_inds, max_ploi, num_digits); printf("Maximum number of alleles per locus: %d\n", abs(allele_num)); fflush(stdout); printf("\nType return to continue"); fgets(s, 50, stdin); /* alloc memory for allele names */ if ((alleles = malloc(allele_num * sizeof(int))) == NULL) { (void)printf("ERROR: alleles malloc failed, more memory needed"); (void)exit(EXIT_FAILURE); } /* now recode to make everything population specific */ for(l = 0; l < num_locs; l++){ recode_counter = 0; for(p = 1; p <= num_pops; p++){ /* reset allele names */ name_counter = 0; for(a = 0; a < allele_num; a++){ alleles[a] = 0; } /*check all alleles for all individuals for this pop */ for(i = 0; i < num_inds; i++){ if(pop_vector[i] == p){ for(a = 0; a < max_ploi; a++){ allele = markerdata[i][l][a]; if(allele != 0){ /*check if name has already been put into names vector, if so get index of name to use for new name */ found = 0; for(k = 0; k < name_counter && found == 0; k++){ if(allele == alleles[k]){ found = 1; } } if(found == 0){ alleles[name_counter] = allele; name_counter++; k = name_counter; } markerdata[i][l][a] = recode_counter + k; /*new name */ } } } } recode_counter += name_counter; } } if(recode_counter > 999){ printf("\n\nThe new number of alleles (%d) is higher than the maximum allowed in Fstat-files (999), it may not be possible to perform any analyses with the outputfile.\n", recode_counter); } /*now write the data in Fstat format*/ i = 0; while (i == 0){ printf("\nGive name of ouput file: "); fgets(out_name, 50, stdin); j = strlen(out_name); out_name[j-1] = '\0'; writing = fopen(out_name, "w"); if(writing == NULL){ printf("ERROR: Could not write to file\n"); } else{ i++; } } fprintf(writing, "%d\t%d\t%d\t%d\n", num_pops, num_locs, 999, 3); for(l = 0; l < num_locs; l++){ fprintf(writing, "%s\n", loc_names[l]); } for(i = 0; i < num_inds; i++){ fprintf(writing, "%d", pop_vector[i]); for(l = 0; l < num_locs; l++){ fprintf(writing, "\t"); for(a = 0; a < max_ploi; a++){ fprintf(writing, "%03d", markerdata[i][l][a]); } } fprintf(writing, "\n"); } printf("\n\nFile recoded successfully\n"); fclose(writing); return 0; } /* counts maximum number of alleles per locus */ int max_num_alleles(int ***markerdata, int num_locs, int num_inds, int max_ploi, int num_digits){ int i, k, l, a; int present; int *alleles; int allele_i; int allele_num_loc; int max_num = 0; /* first put all alleles into overly big array */ if ((alleles = calloc(num_locs * pow(10, num_digits), sizeof(int))) == NULL) { (void)printf("ERROR: alleles malloc failed, more memory needed"); (void)exit(EXIT_FAILURE); } for(l = 0; l < num_locs; l++){ allele_num_loc = 0; for(i = 0; i < num_inds; i++){ for(a = 0; a < max_ploi; a++){ allele_i = markerdata[i][l][a]; if (allele_i != 0){ present = k = 0; while(present == 0 && k < allele_num_loc){ if (allele_i == alleles[k]){ present++; } k++; } if(present == 0){ alleles[allele_num_loc] = allele_i; allele_num_loc++; } } } } if(allele_num_loc > max_num){ max_num = allele_num_loc; } } free(alleles); return max_num; }