Skip to content

Instantly share code, notes, and snippets.

@jedavidson
Last active January 10, 2025 11:04
Show Gist options
  • Save jedavidson/2ca7b94079670b12ba2dc5efbc30df52 to your computer and use it in GitHub Desktop.
Save jedavidson/2ca7b94079670b12ba2dc5efbc30df52 to your computer and use it in GitHub Desktop.
#include <cassert>
#include <string>
// A storage class for DNA sequences, which are sequences (i.e. strings)
// consisting of one of four bases: A, C, T and G.
class dna_store {
public:
// Adds a sequence to the store. You may assume that the sequence
// is a nonempty string made up only of the letters {A, C, T, G}.
void add(const std::string &seq) {
return;
}
// Searches for a DNA sequence in the store.
bool contains(const std::string &seq) {
return false;
}
// As for contains, except allows method callers to add wildcards, by
// writing . anywhere in a search sequence to stand for any letter.
bool contains_with_wildcard(const std::string &seq) const {
return false;
}
};
// Compile: clang++ -std=c++20 dna.cpp -o dna
int main() {
auto store = dna_store{};
for (const auto &seq : {"AC", "AT", "ACTG", "ACTGAGT", "A", "C"}) {
store.add(seq);
}
for (const auto &seq : {"AC", "AT", "ACTG", "ACTGAGT", "A", "C"}) {
assert(store.contains(seq));
}
assert(!store.contains("CC"));
assert(store.contains_with_wildcard("."));
assert(!store.contains_with_wildcard("..."));
assert(store.contains_with_wildcard("AC..AG."));
for (const auto &seq : {"AC", "AT", "ACTG", "ACTGAGT", "A", "C"}) {
assert(store.contains_with_wildcard(seq));
}
assert(!store.contains_with_wildcard("CC"));
}
class DNAStore:
def __init__(self):
"""
Creates a new DNA store.
"""
pass
def add(self, sequence: str):
"""
Adds a sequence to the DNA store. You may assume that the sequence
is a nonempty string made up only of the letters {A, C, T, G}.
"""
pass
def contains(self, sequence: str) -> bool:
"""
Searches for a DNA sequence in the DNA store.
"""
return True
def contains_with_wildcard(self, sequence: str) -> bool:
"""
As for contains, except allows method callers to add wildcards, by
writing . anywhere in a search sequence to stand for any letter.
"""
return True
if __name__ == "__main__":
store = DNAStore()
for sequence in ("AC", "AT", "ACTG", "ACTGAGT", "A", "C"):
store.add(sequence)
for sequence in ("AC", "AT", "ACTG", "ACTGAGT", "A", "C"):
assert store.contains(sequence)
assert not store.contains("CC")
assert store.contains_with_wildcard(".")
assert not store.contains_with_wildcard("...")
assert store.contains_with_wildcard("AC..AG.")
for sequence in ("AC", "AT", "ACTG", "ACTGAGT", "A", "C"):
assert store.contains_with_wildcard(sequence)
assert not store.contains_with_wildcard("CC")
@jedavidson
Copy link
Author

jedavidson commented Feb 18, 2023

Sample C++ solution:

#include <cassert>
#include <memory>
#include <stack>
#include <string>
#include <unordered_map>
#include <utility>

class dna_store {
  public:
    void add(const std::string &seq) {
        auto curr = std::ref(root);
        for (const auto base : seq) {
            auto &child = curr.get().children[base];
            if (!child) {
                child.reset(new node());
            }
            curr = *child;
        }

        curr.get().is_terminal = true;
    }

    bool contains(const std::string &seq) const {
        auto curr = std::cref(root);
        for (const auto base : seq) {
            const auto &children = curr.get().children;
            const auto it = children.find(base);
            if (it == children.end()) {
                return false;
            }
            curr = *it->second;
        }

        return curr.get().is_terminal;
    }

    bool contains_with_wildcard(const std::string &seq) const {
        auto stack = std::stack<std::pair<const node &, int>>{};
        stack.emplace(root, 0);
        while (!stack.empty()) {
            const auto [curr, i] = stack.top();
            stack.pop();
            if (i == seq.size()) {
                if (curr.is_terminal) {
                    return true;
                }
                continue;
            }

            auto base = seq[i];
            if (base == '.') {
                for (const auto &[_, child] : curr.children) {
                    stack.emplace(*child, i + 1);
                }
            } else {
                const auto &children = curr.children;
                const auto it = children.find(base);
                if (it != children.end()) {
                    stack.emplace(*it->second, i + 1);
                }
            }
        }

        return false;
    }

  private:
    struct node {
        std::unordered_map<char, std::unique_ptr<node>> children;
        bool is_terminal;
    };

    node root;
};

// Compile: clang++ -std=c++20 dna.cpp -o dna
int main() {
    auto store = dna_store{};
    for (const auto &seq : {"AC", "AT", "ACTG", "ACTGAGT", "A", "C"}) {
        store.add(seq);
    }

    for (const auto &seq : {"AC", "AT", "ACTG", "ACTGAGT", "A", "C"}) {
        assert(store.contains(seq));
    }
    assert(!store.contains("CC"));

    assert(store.contains_with_wildcard("."));
    assert(!store.contains_with_wildcard("..."));
    assert(store.contains_with_wildcard("AC..AG."));
    for (const auto &seq : {"AC", "AT", "ACTG", "ACTGAGT", "A", "C"}) {
        assert(store.contains_with_wildcard(seq));
    }
    assert(!store.contains_with_wildcard("CC"));
}

Sample Python solution:

from collections import defaultdict


class Node:
    def __init__(self) -> None:
        self.children = defaultdict(Node)
        self.is_terminal = False

class DNAStore:
    def __init__(self):
        self.root = Node()

    def add(self, sequence: str):
        curr = self.root
        for base in sequence:
            curr = curr.children[base]

        curr.is_terminal = True

    def contains(self, sequence: str) -> bool:
        curr = self.root
        for base in sequence:
            if base not in curr.children:
                return False
            curr = curr.children[base]

        return curr.is_terminal

    def contains_with_wildcard(self, sequence: str) -> bool:
        stack = [(self.root, 0)]
        while stack:
            curr, i = stack.pop()
            if curr.is_terminal and i == len(sequence):
                return True
            elif i == len(sequence):
                continue

            base = sequence[i]
            if base == '.':
                for child in curr.children.values():
                    stack.append((child, i + 1))
            elif base in curr.children:
                stack.append((curr.children[base], i + 1))

        return False


if __name__ == "__main__":
    store = DNAStore()

    for sequence in ("AC", "AT", "ACTG", "ACTGAGT", "A", "C"):
        store.add(sequence)

    for sequence in ("AC", "AT", "ACTG", "ACTGAGT", "A", "C"):
        assert store.contains(sequence)
    assert not store.contains("CC")

    assert store.contains_with_wildcard(".")
    assert not store.contains_with_wildcard("...")
    assert store.contains_with_wildcard("AC..AG.")
    for sequence in ("AC", "AT", "ACTG", "ACTGAGT", "A", "C"):
        assert store.contains_with_wildcard(sequence)
    assert not store.contains_with_wildcard("CC")

Time complexity is

  • $\Theta(n)$ for insert and contains, and
  • worst-case $O(4^n)$ for contains_with_wildcard, which is realised when the store contains all $n$-length sequences and a sequence of $n$ wildcards is searched for.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment