Skip to content

Instantly share code, notes, and snippets.

@AlexRiina
Created July 10, 2020 21:37
Show Gist options
  • Save AlexRiina/b32c3eca896480f0d7d6091a561542b9 to your computer and use it in GitHub Desktop.
Save AlexRiina/b32c3eca896480f0d7d6091a561542b9 to your computer and use it in GitHub Desktop.
Find duplicate notes in Anki
"""
Text clustering of Anki cards to find duplicated concepts. Prints a handful of notes, each followed by a list of similar notes.
```
pip install anki_sqlalchemy bs4 scikit-learn
cp ${ANKI_DATABASE:?replace me} backup.db
python anki_similarity.py
```
"""
import re
import bs4
import random
from anki_sqlalchemy import Note, Collection
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
# Matches one cloze deletion of the form ``{{<group>::<answer>}}``, with an
# optional third ``::``-separated part; the named group ``answer`` captures the
# text between the first and second separators.
CLOZE_EXTRACT = re.compile(r"{{(?P<group>.*?)::(?P<answer>.*?)(::.*?)?}}")

# Work against the backup copy of the collection; echo=True makes SQLAlchemy
# log the SQL it issues.
engine = create_engine("sqlite:///backup.db", echo=True)
Session = sessionmaker(bind=engine)
session = Session()

# .one() raises unless the query yields exactly one Collection row.
col = session.query(Collection).one()

# First model named "Cloze"; next() raises StopIteration when there is none.
cloze = next(
    filter(lambda model: model["name"] == "Cloze", col.models.values())
)
cloze_notes = session.query(Note).filter_by(model_id=cloze["id"])

# "ord" of the model's "Text" field, used later to index into note.fields.
text_field = next(
    filter(lambda field: field["name"] == "Text", cloze["flds"])
)
text_ord = text_field["ord"]
def flatten(a: str) -> str:
    """Reduce a cloze note field to plain, lower-cased text.

    Replaces ``&nbsp;`` entities with spaces, strips HTML markup when the text
    contains a ``<``, and substitutes every cloze deletion matched by
    ``CLOZE_EXTRACT`` with its answer text.

    Args:
        a: Raw field text, possibly containing HTML and cloze markup.

    Returns:
        The flattened text, lower-cased.
    """
    a = a.replace("&nbsp;", " ")
    if '<' in a:
        # Name the parser explicitly: without it BeautifulSoup uses whichever
        # parser happens to be installed and emits GuessedAtParserWarning, so
        # the extracted text could differ from one machine to another.
        a = bs4.BeautifulSoup(a, "html.parser").get_text()
    return CLOZE_EXTRACT.sub(r"\g<answer>", a).lower()
# Map each cloze note id to its flattened text, then split into parallel lists
# so that a row index in the vector matrix identifies a note.
cloze_pairs = {note.id: flatten(note.fields[text_ord]) for note in cloze_notes}
cloze_ids, cloze_texts = list(cloze_pairs.keys()), list(cloze_pairs.values())

# Term-frequency vectors (IDF weighting disabled) with English stop words
# removed.
tfidf = TfidfVectorizer(use_idf=False, stop_words='english')
text_vectors = tfidf.fit_transform(cloze_texts)

# Densify once and reuse. toarray() yields a plain ndarray, whereas todense()
# yields an np.matrix, which newer scikit-learn releases refuse to accept.
dense_vectors = text_vectors.toarray()

nbrs = NearestNeighbors(n_neighbors=3, algorithm="ball_tree")
nbrs.fit(dense_vectors)
neighbors_lists = nbrs.radius_neighbors(
    dense_vectors, radius=0.9, return_distance=False
)

# Keep only notes that have at least one neighbour besides themselves, and
# drop the note's own index from its neighbour list (order preserved).
neighbors_lists = [
    (index, [neighbor for neighbor in neighbors if neighbor != index])
    for index, neighbors in enumerate(neighbors_lists)
    if len(neighbors) > 1  # more than self
]

# random.sample raises ValueError when asked for more items than exist, so
# cap the sample at the number of notes that actually have neighbours.
sample_size = min(12, len(neighbors_lists))
for idx, neighbors in random.sample(neighbors_lists, sample_size):
    print(f"{cloze_texts[idx]} is similar to")
    for neighbor in neighbors:
        # Show a 50-character preview plus the full length of each neighbour.
        print('\t', cloze_texts[neighbor][:50], len(cloze_texts[neighbor]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment