Skip to content

Instantly share code, notes, and snippets.

Created July 10, 2020 21:37
Show Gist options
  • Save AlexRiina/b32c3eca896480f0d7d6091a561542b9 to your computer and use it in GitHub Desktop.
Save AlexRiina/b32c3eca896480f0d7d6091a561542b9 to your computer and use it in GitHub Desktop.
Find duplicate notes in Anki
Text clustering of Anki cards to find duplicated concepts. Prints a handful of randomly chosen notes, each followed by a list of similar notes.
pip install anki_sqlalchemy beautifulsoup4 scikit-learn
cp ${ANKI_DATABASE:?replace me} backup.db
import re
import bs4
import random
from anki_sqlalchemy import Note, Collection
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
# Matches a single cloze deletion such as {{c1::answer}} or
# {{c1::answer::hint}}; the text to keep is captured in the "answer" group.
CLOZE_EXTRACT = re.compile(r"{{(?P<group>.*?)::(?P<answer>.*?)(::.*?)?}}")

# Work against the backup copy of the collection, never the live database.
engine = create_engine("sqlite:///backup.db", echo=True)
Session = sessionmaker(bind=engine)
session = Session()

# .one() raises unless the query yields exactly one Collection row.
col = session.query(Collection).one()

# Pick the note type named "Cloze" and select every note that uses it.
cloze = next(
    model
    for model in col.models.values()
    if model["name"] == "Cloze"
)
cloze_notes = session.query(Note).filter_by(model_id=cloze["id"])

# Position of the "Text" field inside each note's field list.
text_ord = next(
    field["ord"]
    for field in cloze["flds"]
    if field["name"] == "Text"
)
def flatten(a: str) -> str:
    """Reduce the raw text of a cloze note to plain lower-case text.

    Args:
        a: Raw field content, possibly containing HTML markup, ``&nbsp;``
            entities and cloze deletions matched by ``CLOZE_EXTRACT``.

    Returns:
        The text with ``&nbsp;`` turned into ordinary spaces, HTML tags
        removed, every cloze deletion replaced by its answer, and the
        whole string lower-cased.
    """
    # Replace the entity before parsing so it becomes a regular space
    # instead of being decoded to a non-breaking space by the HTML parser.
    a = a.replace("&nbsp;", " ")
    if '<' in a:
        # Name the parser explicitly: without it bs4 picks whichever parser
        # happens to be installed and emits a warning on every call.
        a = bs4.BeautifulSoup(a, "html.parser").get_text()
    return CLOZE_EXTRACT.sub(r"\g<answer>", a).lower()
# Map each note id to its flattened text. Keying by note id (instead of
# collecting bare strings in a set) is what makes .keys()/.values() below
# valid, keeps ids and texts aligned, and preserves word-for-word duplicate
# notes, which are exactly what this script is meant to surface.
cloze_pairs = {note.id: flatten(note.fields[text_ord]) for note in cloze_notes}
cloze_ids, cloze_texts = list(cloze_pairs.keys()), list(cloze_pairs.values())

# Term-frequency vectors without IDF weighting, ignoring English stop words.
tfidf = TfidfVectorizer(use_idf=False, stop_words='english')
text_vectors = tfidf.fit_transform(cloze_texts)

# The ball tree works on dense data. toarray() yields a plain ndarray;
# todense() would yield an np.matrix, which recent scikit-learn rejects.
dense_vectors = text_vectors.toarray()

# The estimator has to be fitted before it can answer neighbour queries.
nbrs = NearestNeighbors(n_neighbors=3, algorithm="ball_tree")
nbrs.fit(dense_vectors)
neighbors_lists = nbrs.radius_neighbors(
    dense_vectors, radius=0.9, return_distance=False
)

# Keep only notes that have at least one neighbour besides themselves, and
# drop the note's own index from its neighbour list.
neighbors_lists = [
    (index, list(set(neighbors) - {index}))
    for index, neighbors in enumerate(neighbors_lists)
    if len(neighbors) > 1  # more than self
]

# Show up to 12 random notes; random.sample raises ValueError when asked for
# more items than the population holds, so cap the sample size.
sample_size = min(12, len(neighbors_lists))
for idx, neighbors in random.sample(neighbors_lists, sample_size):
    print(f"{cloze_texts[idx]} is similar to")
    for neighbor in neighbors:
        print('\t', cloze_texts[neighbor][:50], len(cloze_texts[neighbor]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment