Last active
January 8, 2017 10:24
-
-
Save Muzietto/7345f9aa910a409fe7eac8bdf93a008d to your computer and use it in GitHub Desktop.
Kinda SumPrefix solution using differences to express distances from relevant targets
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function genomicRangeQuery(nucleotides, mins, maxs) { | |
var result = new Array(mins.length).fill(-1); | |
var distances = distancesFromLastSeen(nucleotides); | |
for (var i = 0; i < mins.length; i++) { | |
var segmentWidth = maxs[i] - mins[i] + 1; | |
if (distances['A'][maxs[i]] < segmentWidth) { | |
result[i] = 1; | |
continue; | |
} | |
if (distances['C'][maxs[i]] < segmentWidth) { | |
result[i] = 2; | |
continue; | |
} | |
if (distances['G'][maxs[i]] < segmentWidth) { | |
result[i] = 3; | |
continue; | |
} | |
if (distances['T'][maxs[i]] < segmentWidth) { | |
result[i] = 4; | |
continue; | |
} | |
} | |
return result; | |
} | |
function distancesFromLastSeen(nucleotides) { | |
var nucs = nucleotides.split(''); | |
var result = { | |
'A': new Array(nucs.length).fill(-1), | |
'C': new Array(nucs.length).fill(-1), | |
'G': new Array(nucs.length).fill(-1), | |
'T': new Array(nucs.length).fill(-1) | |
} | |
var lastSeen_A = -1; | |
var lastSeen_C = -1; | |
var lastSeen_G = -1; | |
var lastSeen_T = -1; | |
for (var i = 0; i < nucs.length; i++) { | |
if (nucs[i] === 'A') lastSeen_A = i; | |
if (nucs[i] === 'C') lastSeen_C = i; | |
if (nucs[i] === 'G') lastSeen_G = i; | |
if (nucs[i] === 'T') lastSeen_T = i; | |
result['A'][i] = i - lastSeen_A; | |
result['C'][i] = i - lastSeen_C; | |
result['G'][i] = i - lastSeen_G; | |
result['T'][i] = i - lastSeen_T; | |
} | |
return result; | |
} |
We preprocess the DNA string storing at each position the distance from the last time we saw each nucleotide. Obviously the preprocess creates a rectangular matrix.
For each query we check the last positions in each queried subsequence. If the distance from e.g. 'A' is less than the subsequence width, our subsequence contains at least one adenine.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Problem is presented here.
Given a DNA sequence e.g. 'AGTACCTGA' we want to know whether a given subsequence contains a given nucleotide without having to parse the whole subsequence at each query.
What are we to do?