Created
July 10, 2020 21:37
-
-
Save AlexRiina/b32c3eca896480f0d7d6091a561542b9 to your computer and use it in GitHub Desktop.
Find duplicate notes in Anki
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Text clustering of Anki cards to find duplicated concepts. Prints a handful of notes, each followed by a list of similar notes.
```
pip install anki_sqlalchemy bs4 scikit-learn
cp ${ANKI_DATABASE:?replace me} backup.db
python anki_similarity.py
```
"""
import re | |
import bs4 | |
import random | |
from anki_sqlalchemy import Note, Collection | |
from sqlalchemy import create_engine | |
from sqlalchemy.orm import sessionmaker | |
from sklearn.neighbors import NearestNeighbors | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
# Matches cloze markup of the form {{group::answer}}, optionally followed by
# one more "::"-separated section before the closing braces.
CLOZE_EXTRACT = re.compile(r"{{(?P<group>.*?)::(?P<answer>.*?)(::.*?)?}}")

# Work against the backup copy of the collection; echo=True makes SQLAlchemy
# log every SQL statement it issues.
engine = create_engine("sqlite:///backup.db", echo=True)
Session = sessionmaker(bind=engine)
session = Session()

# .one() requires the query to yield exactly one Collection row.
col = session.query(Collection).one()

# First note model named "Cloze"; next() raises StopIteration if none exists.
cloze = next(
    filter(lambda model: model["name"] == "Cloze", col.models.values())
)

# Query over every note that uses the Cloze model.
cloze_notes = session.query(Note).filter_by(model_id=cloze["id"])

# Position ("ord") of the field named "Text" within the Cloze model's fields.
text_ord = next(
    filter(lambda field: field["name"] == "Text", cloze["flds"])
)["ord"]
def flatten(a: str) -> str:
    """Reduce a note's raw field text to lowercase plain text for comparison.

    Non-breaking spaces become ordinary spaces, HTML markup is stripped when
    the text contains a ``<``, and every cloze deletion matched by
    ``CLOZE_EXTRACT`` is replaced by its answer text.

    Args:
        a: Raw field content, possibly containing HTML and cloze markup.

    Returns:
        The flattened, lowercased text.
    """
    # Normalize non-breaking spaces whether they are written as the HTML
    # entity or as the literal U+00A0 character.
    a = a.replace("&nbsp;", " ").replace("\xa0", " ")
    if "<" in a:
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed, and so bs4 does not warn
        # about having to guess one.
        a = bs4.BeautifulSoup(a, "html.parser").get_text()
    # Keep only the answer part of each cloze deletion.
    return CLOZE_EXTRACT.sub(r"\g<answer>", a).lower()
# Flatten the "Text" field of every cloze note, keyed by note id.
cloze_pairs = {note.id: flatten(note.fields[text_ord]) for note in cloze_notes}
cloze_ids, cloze_texts = list(cloze_pairs.keys()), list(cloze_pairs.values())

# Vectorize the texts without IDF weighting, dropping English stop words.
tfidf = TfidfVectorizer(use_idf=False, stop_words='english')
text_vectors = tfidf.fit_transform(cloze_texts)

# Densify once and reuse the result for both fitting and querying.
# toarray() returns a plain ndarray; todense() returns np.matrix, which newer
# scikit-learn versions refuse as estimator input.
dense_vectors = text_vectors.toarray()

nbrs = NearestNeighbors(n_neighbors=3, algorithm="ball_tree")
nbrs.fit(dense_vectors)

# For every note, the indices of all notes within distance 0.9 of it.
neighbors_lists = nbrs.radius_neighbors(
    dense_vectors, radius=0.9, return_distance=False
)

# Keep only notes that have a neighbour besides themselves, as
# (note index, [neighbour indices without the note itself]) pairs.
neighbors_lists = [
    (index, list(set(neighbors) - {index}))
    for index, neighbors in enumerate(neighbors_lists)
    if len(neighbors) > 1  # more than self
]
# Report up to 12 randomly chosen notes together with their similar notes.
# The sample size is clamped because random.sample raises ValueError when
# asked for more items than the population holds.
sample_size = min(12, len(neighbors_lists))
for idx, neighbors in random.sample(neighbors_lists, sample_size):
    print(f"{cloze_texts[idx]} is similar to")
    for neighbor in neighbors:
        # First 50 characters of the similar note, then its full length.
        print('\t', cloze_texts[neighbor][:50], len(cloze_texts[neighbor]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment