Skip to content

Instantly share code, notes, and snippets.

@lh3
Last active August 29, 2015 14:05
Show Gist options
  • Save lh3/432050e7eefec3662e93 to your computer and use it in GitHub Desktop.
Save lh3/432050e7eefec3662e93 to your computer and use it in GitHub Desktop.
#include <zlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include "kseq.h"
KSEQ_INIT(gzFile, gzread)
unsigned char seq_nt6_table[256] = {
0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 1, 5, 2, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 1, 5, 2, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5
};
int main(int argc, char *argv[])
{
gzFile fp;
kseq_t *ks;
int kmer, i, l, mask;
char *nei;
if (argc < 2) {
fprintf(stderr, "Usage: kfreq <kmer> <in.fa>\n");
return 1;
}
// get the k-mer
l = strlen(argv[1]);
for (i = kmer = 0; i < l; ++i) {
int c = seq_nt6_table[(int)argv[1][i]];
assert(c >= 1 && c <= 4);
kmer = kmer << 2 | (c - 1);
}
mask = (1<<2*l) - 1;
// get the neighbors
nei = calloc(1, 1<<2*l);
for (i = 0; i < l; ++i) {
int j, x, c;
c = kmer >> 2*i & 3;
x = kmer & ~(3 << 2*i);
for (j = 0; j < 4; ++j)
if (j != c) nei[x|j<<2*i] = 1;
}
fp = argc == 2 || strcmp(argv[2], "-") == 0? gzdopen(fileno(stdin), "r") : gzopen(argv[2], "r");
ks = kseq_init(fp);
while (kseq_read(ks) >= 0) {
int k, x[2], cnt, cnt_nei;
x[0] = x[1] = k = cnt = cnt_nei = 0;
for (i = 0; i < ks->seq.l; ++i) {
int c = seq_nt6_table[(int)ks->seq.s[i]];
if (c >= 1 && c <= 4) {
x[0] = (x[0] << 2 | (c - 1)) & mask;
x[1] = (x[1] >> 2 | (4 - c) << 2*(l-1));
if (k < l) ++k;
if (k == l) {
if (x[0] == kmer || x[1] == kmer) ++cnt;
else if (nei[x[0]] || nei[x[1]]) ++cnt_nei;
}
} else k = 0;
}
printf("%s\t%ld\t%d\t%d\n", ks->name.s, ks->seq.l, cnt, cnt_nei);
}
kseq_destroy(ks);
gzclose(fp);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment