Con estos pequeños archivos se puede hacer un analisis de kmer rápido y simple en linux. Para compilar solamente ejecutar:
gcc -o kmer kmer.c
Luego para analizar la ocurrencia de 41mer ejecutar:
cat fastQfile.fq | parser | ./kmer 41 > salida.txt
La salida será algo así:
AAATAATGTACGGGGGAGATGCATGAATTTTTCGGATAAAA 190
AATAATGTACGGGGGAGATGCATGAATTTTTCGGATAAAAT 189
ATAATGTACGGGGGAGATGCATGAATTTTTCGGATAAAATC 190
TAATGTACGGGGGAGATGCATGAATTTTTCGGATAAAATCT 193
AATGTACGGGGGAGATGCATGAATTTTTCGGATAAAATCTG 193
ATGTACGGGGGAGATGCATGAATTTTTCGGATAAAATCTGA 193
En la primer columna está el mer y en la segunda la frecuencia. Los archivos son:
–parser–
#!/bin/bash ##parser from fastQ to kmer if [ $# -eq 0 ]; then echo "usage: parser file.fq > file.txt"; exit 1; fi #total of lines lines=$(cat $1 | wc -l); for i in $(seq 2 4 $lines); do sed $i'q;d' $1 done exit 0;
–kmer.c–
/** * Author: Matias Neiff and Tatiana Ponce * Created: 21.08.2020 * * Software to evaluate the occurrence of kmers in a dna sequence. Use it good, use it for good. * otput: kmer [tab] frecuency * * This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. * You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. **/ #include <stdio.h> #include <stdlib.h> #include <errno.h> #include <string.h> #include <unistd.h> // this value has to be sufficient big to host the entire sequence #define MAX_SEQ_LEN 24845291000 // this is the read buffer #define MAX_READ_LEN 100 * 1024 # 24845291 //compare to array string and return true if are identical int compareArrays(char a[], char b[], int n) { for (int i=0; i<n; ++i) if (a[i] != b[i]) return 0; return 1; } int usage() { printf("Usage: cat file | kmer k\n"); exit(1); } int main(int argc, char **argv) { int k; char* p; errno = 0; long conv; // at least need two arguments if (argc == 2) { conv = strtol(argv[1], &p, 10); } else { usage(); } //check if K is integer if (errno != 0 || *p != '\0' || conv > 9999 || conv < 1) { usage(); } else { k = conv; } // allocate memory for the three big strings char* seq; char* buff; char* prevkmer; seq = (char*) malloc(MAX_SEQ_LEN*sizeof(char)); buff = (char*) malloc(MAX_READ_LEN*sizeof(char)); prevkmer = (char*) malloc(MAX_SEQ_LEN*sizeof(char)); if (seq == NULL) { printf("Memory not allocated.\n"); exit(0); } // we need to store the bytes readed trouhgt the pipe, and the total of bytes readed ssize_t bytesRead=0; ssize_t totalRead=0; bytesRead = read( STDIN_FILENO, buff, MAX_READ_LEN ); while (bytesRead != 0) { buff[bytesRead]='\0'; strcat(seq,buff); totalRead = totalRead + bytesRead; bytesRead = read( STDIN_FILENO, buff, MAX_READ_LEN ); } // add the null to the end to complete the string format of C seq[totalRead]='\0'; char kmer[k]; char window[k]; int seqlen = totalRead; int freq; prevkmer[0]='\0'; //start the analysis for(int index=0;index<seqlen-k;index++) { // select kmer for (int i=0;i<k;i++) { kmer[i]=seq[i+index]; } // prevkmer store already proccessed kmer to avoid procces twice kmer[k]='\0'; // do not procces if was already proccess with this kmer or if have an return car (incomplete mer) if (!strstr(prevkmer,kmer) && !strstr(kmer,"\n")) { // store as previous kmer to not proccess again strcat(prevkmer,kmer); freq=0; // search frecuency of this kmer for (int x=0;x<seqlen-k;x++) { //fill the window for (int w=0;w<k;w++) { window[w]=seq[x+w]; } // if kmer equal to window if(compareArrays(kmer,window,k)) { freq++;x=x+k;} } if (freq>1) printf("%s\t%d\n",kmer,freq); } } //end for free(seq); free(buff); return 0; } // end function: main