Skip to content

Instantly share code, notes, and snippets.

@KirillLykov
Created April 7, 2019 10:11
Show Gist options
  • Select an option

  • Save KirillLykov/12ec47fecb238aeb6699389661ed0028 to your computer and use it in GitHub Desktop.

Select an option

Save KirillLykov/12ec47fecb238aeb6699389661ed0028 to your computer and use it in GitHub Desktop.
Count unique subsequences of DNA string, solution using DFA
// Count number of unique subsequencies of DNA sequence
#include <bits/stdc++.h>
using namespace std;
using lint = long long int;
// only 4 letters
int char2int(char c) {
if (c == 'A') return 0;
if (c == 'C') return 1;
if (c == 'G') return 2;
return 3;
}
// internal representation if DFA is array Nx4
const vector<vector<int>> buildDFA(const string& s) {
const size_t n = s.size();
vector<vector<int>> dfa(n+1);
vector<int> next(4, -1);
for (int i = n; i >= 0; --i) {
dfa[i] = next;
if (i != 0)
next[char2int(s[i-1])] = i;
}
return dfa;
}
lint countUniqueSubseq(const string& s) {
const size_t n = s.size();
// 1. Build DFA by the input sequece
// For that, for each position i in s
// consider edges labeled with 'ACGT' to be
// the index of the first occurence of the
// corresponding index on the right.
// Enumerate position in string starting from 1 in DFA.
// For example,
// 123
// aba => 0 -a-> 1 -a->3
// | b ^
// | V /
// \-b-> 2 --a
const vector<vector<int>>& dfa = buildDFA(s);
// 2. Now we can compute for each vertex number of
// pathes which end in it. Lets call this value weight
// Vertices are numbred in topolocial order, which means
// that we can traver dfa from left to right and carry forward
// weight of current vertex
vector<lint> weight(n+1,0); // weight[1] correspond to s[0]
weight[0] = 1;
for (int i = 0; i <= n; ++i) {
for (int letter = 0; letter < 4; ++letter) {
if (dfa[i][letter] != -1)
weight[dfa[i][letter]] += weight[i];
}
}
// 3. Just accumulate all the weights
lint res = 0;
for (int i = 1; i <= n; ++i)
res = res + weight[i];
return res;
}
int main() {
ios_base::sync_with_stdio(0); cin.tie(NULL);
string s;
cin >> s;
cout << countUniqueSubseq(s) << endl;
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment