Analisis de kmer

Con estos pequeños archivos se puede hacer un analisis de kmer rápido y simple en linux. Para compilar solamente ejecutar:
gcc -o kmer kmer.c

Luego para analizar la ocurrencia de 41mer ejecutar:
cat fastQfile.fq | parser | ./kmer 41 > salida.txt

La salida será algo así:

AAATAATGTACGGGGGAGATGCATGAATTTTTCGGATAAAA 190
AATAATGTACGGGGGAGATGCATGAATTTTTCGGATAAAAT 189
ATAATGTACGGGGGAGATGCATGAATTTTTCGGATAAAATC 190
TAATGTACGGGGGAGATGCATGAATTTTTCGGATAAAATCT 193
AATGTACGGGGGAGATGCATGAATTTTTCGGATAAAATCTG 193
ATGTACGGGGGAGATGCATGAATTTTTCGGATAAAATCTGA 193

En la primer columna está el mer y en la segunda la frecuencia. Los archivos son:

–parser–

#!/bin/bash
##parser from fastQ to kmer
if [ $# -eq 0 ]; then
    echo "usage: parser file.fq > file.txt";
    exit 1;
fi
#total of lines
lines=$(cat $1 | wc -l);
for i in $(seq 2 4 $lines); do
    sed $i'q;d' $1
done
exit 0;

–kmer.c–

/**
 * Author:    Matias Neiff and Tatiana Ponce
 * Created:   21.08.2020
 *
 * Software to evaluate the occurrence of kmers in a dna sequence. Use it good, use it for good.
 * otput: kmer [tab] frecuency
 * 
 * This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.
 **/

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>
// this value has to be sufficient big to host the entire sequence
#define MAX_SEQ_LEN 24845291000
// this is the read buffer
#define MAX_READ_LEN 100 * 1024
# 24845291

//compare to array string and return true if are identical
int compareArrays(char a[], char b[], int n)
{
  for (int i=0; i<n; ++i) if (a[i] != b[i]) return 0;
  return 1;
}

int usage() {
        printf("Usage: cat file | kmer k\n");
	exit(1);
}

int main(int argc, char **argv) {
    int k;
    char* p;
    errno = 0;
    long conv;
    // at least need two arguments
    if (argc == 2) {
	conv = strtol(argv[1], &p, 10);
    } else {
	usage();
    }

    //check if K is integer
    if (errno != 0 || *p != '\0' || conv > 9999 || conv < 1) {
	usage();
    } else {
	k = conv;
    }
    // allocate memory for the three big strings
    char* seq;
    char* buff;
    char* prevkmer;
    seq = (char*) malloc(MAX_SEQ_LEN*sizeof(char));
    buff = (char*) malloc(MAX_READ_LEN*sizeof(char));
    prevkmer = (char*) malloc(MAX_SEQ_LEN*sizeof(char));
    if (seq == NULL) {
        printf("Memory not allocated.\n");
        exit(0);
    }
    // we need to store the bytes readed trouhgt the pipe, and the total of bytes readed
    ssize_t bytesRead=0;
    ssize_t totalRead=0;
    bytesRead = read( STDIN_FILENO, buff, MAX_READ_LEN );
    while (bytesRead != 0) {
	buff[bytesRead]='\0';
	strcat(seq,buff);
	totalRead = totalRead + bytesRead;
	bytesRead = read( STDIN_FILENO, buff, MAX_READ_LEN );
    }
    // add the null to the end to complete the string format of C
    seq[totalRead]='\0';
    char kmer[k];
    char window[k];
    int seqlen = totalRead;
    int freq;
    prevkmer[0]='\0';
    //start the analysis
    for(int index=0;index<seqlen-k;index++) {
	// select kmer
	for (int i=0;i<k;i++) {
	    kmer[i]=seq[i+index];
	}
       // prevkmer store already proccessed kmer to avoid procces twice
	kmer[k]='\0';
	// do not procces if was already proccess with this kmer or if have an return car (incomplete mer)
	if (!strstr(prevkmer,kmer) && !strstr(kmer,"\n")) {
	// store as previous kmer to not proccess again
    	strcat(prevkmer,kmer);
	freq=0;
	// search frecuency of this kmer
	for (int x=0;x<seqlen-k;x++) {
	    //fill the window
	    for (int w=0;w<k;w++) {
		window[w]=seq[x+w];
	    }
	    // if kmer equal to window
	    if(compareArrays(kmer,window,k)) { freq++;x=x+k;}	    
	}
	if (freq>1) printf("%s\t%d\n",kmer,freq);
	}
    } //end for
    free(seq);
    free(buff);
  return 0;
} // end function: main

Comments are closed.