Con estos pequeños archivos se puede hacer un analisis de kmer rápido y simple en linux. Para compilar solamente ejecutar:
gcc -o kmer kmer.c
Luego para analizar la ocurrencia de 41mer ejecutar:
cat fastQfile.fq | parser | ./kmer 41 > salida.txt
La salida será algo así:
AAATAATGTACGGGGGAGATGCATGAATTTTTCGGATAAAA 190
AATAATGTACGGGGGAGATGCATGAATTTTTCGGATAAAAT 189
ATAATGTACGGGGGAGATGCATGAATTTTTCGGATAAAATC 190
TAATGTACGGGGGAGATGCATGAATTTTTCGGATAAAATCT 193
AATGTACGGGGGAGATGCATGAATTTTTCGGATAAAATCTG 193
ATGTACGGGGGAGATGCATGAATTTTTCGGATAAAATCTGA 193
En la primer columna está el mer y en la segunda la frecuencia. Los archivos son:
–parser–
04 | echo "usage: parser file.fq > file.txt" ; |
08 | lines=$( cat $1 | wc -l); |
09 | for i in $( seq 2 4 $lines); do |
–kmer.c–
/**
* Author: Matias Neiff and Tatiana Ponce
* Created: 21.08.2020
*
* Software to evaluate the occurrence of kmers in a dna sequence. Use it good, use it for good.
* otput: kmer [tab] frecuency
*
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.
**/
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>
// this value has to be sufficient big to host the entire sequence
#define MAX_SEQ_LEN 24845291000
// this is the read buffer
#define MAX_READ_LEN 100 * 1024
# 24845291
//compare to array string and return true if are identical
int compareArrays(char a[], char b[], int n)
{
for (int i=0; i<n; ++i) if (a[i] != b[i]) return 0;
return 1;
}
int usage() {
printf("Usage: cat file | kmer k\n");
exit(1);
}
int main(int argc, char **argv) {
int k;
char* p;
errno = 0;
long conv;
// at least need two arguments
if (argc == 2) {
conv = strtol(argv[1], &p, 10);
} else {
usage();
}
//check if K is integer
if (errno != 0 || *p != '\0' || conv > 9999 || conv < 1) {
usage();
} else {
k = conv;
}
// allocate memory for the three big strings
char* seq;
char* buff;
char* prevkmer;
seq = (char*) malloc(MAX_SEQ_LEN*sizeof(char));
buff = (char*) malloc(MAX_READ_LEN*sizeof(char));
prevkmer = (char*) malloc(MAX_SEQ_LEN*sizeof(char));
if (seq == NULL) {
printf("Memory not allocated.\n");
exit(0);
}
// we need to store the bytes readed trouhgt the pipe, and the total of bytes readed
ssize_t bytesRead=0;
ssize_t totalRead=0;
bytesRead = read( STDIN_FILENO, buff, MAX_READ_LEN );
while (bytesRead != 0) {
buff[bytesRead]='\0';
strcat(seq,buff);
totalRead = totalRead + bytesRead;
bytesRead = read( STDIN_FILENO, buff, MAX_READ_LEN );
}
// add the null to the end to complete the string format of C
seq[totalRead]='\0';
char kmer[k];
char window[k];
int seqlen = totalRead;
int freq;
prevkmer[0]='\0';
//start the analysis
for(int index=0;index<seqlen-k;index++) {
// select kmer
for (int i=0;i<k;i++) {
kmer[i]=seq[i+index];
}
// prevkmer store already proccessed kmer to avoid procces twice
kmer[k]='\0';
// do not procces if was already proccess with this kmer or if have an return car (incomplete mer)
if (!strstr(prevkmer,kmer) && !strstr(kmer,"\n")) {
// store as previous kmer to not proccess again
strcat(prevkmer,kmer);
freq=0;
// search frecuency of this kmer
for (int x=0;x<seqlen-k;x++) {
//fill the window
for (int w=0;w<k;w++) {
window[w]=seq[x+w];
}
// if kmer equal to window
if(compareArrays(kmer,window,k)) { freq++;x=x+k;}
}
if (freq>1) printf("%s\t%d\n",kmer,freq);
}
} //end for
free(seq);
free(buff);
return 0;
} // end function: main
This entry was posted on Monday, August 22nd, 2022 at 5:30 am and is filed under Uncategorized. You can follow any responses to this entry through the RSS 2.0 feed.
Both comments and pings are currently closed.