Last active
May 19, 2024 21:48
-
-
Save cryptocoinserver/399eff4505708bca8f7074ab6eebe8cb to your computer and use it in GitHub Desktop.
This script checks for similar cards in a deck and adds tags to them. It uses TF-IDF and cosine similarity to calculate the similarity between cards. It connects to the Anki collection with ankipandas, so Anki needs to be closed while the script is running. For the changes to show up in Anki, run "Tools" > "Check database" before opening "Browse".
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
import sqlite3 | |
import zipfile | |
import numpy as np | |
import pandas as pd | |
import spacy | |
from ankipandas import Collection | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from spacy.lang.de.stop_words import STOP_WORDS | |
from tqdm import tqdm | |
tqdm.pandas() | |
class AnkiCardSimilarity:
    """Tag notes of an Anki deck whose text is similar to another note.

    The note fields are cleaned (HTML tags and non-letter characters removed,
    lower-cased, stop words dropped, optionally lemmatized), vectorized with
    TF-IDF, and compared pairwise.  Every pair scoring above ``threshold``
    gets a shared ``SimilarityCheck::Score...`` tag, and the note with the
    shorter cleaned text additionally gets
    ``SimilarityCheck::less-information``.
    """

    # Matches one HTML tag such as ``<br>`` or ``</div>``.
    _HTML_TAG = re.compile("<[^<]+?>")
    # Matches every character that is not an ASCII letter or German umlaut/ß.
    _NON_LETTER = re.compile("[^a-zA-ZäöüÄÖÜß]")

    def __init__(self, deck_name, nlp, stopwords, lemmatization, threshold=0.8):
        """Open the collection and store the configuration.

        Args:
            deck_name: Prefix of the deck name(s) whose notes are checked.
            nlp: Callable returning tokens with a ``lemma_`` attribute (a
                spaCy pipeline); only used when ``lemmatization`` is true.
            stopwords: Container of lower-case words to drop.
            lemmatization: Whether to lemmatize the cleaned text.
            threshold: A pair is reported when its similarity is strictly
                greater than this value.
        """
        self.col = Collection()
        self.deck_name = deck_name
        self.nlp = nlp
        self.stopwords = stopwords
        self.lemmatization = lemmatization
        self.threshold = threshold

    @staticmethod
    def _unique_in_order(tags):
        """Return ``tags`` without duplicates, keeping first-seen order."""
        return list(dict.fromkeys(tags))

    def _preprocess(self, text):
        """Return the cleaned, lower-cased (and optionally lemmatized) text."""
        # Replace tags with a space, not "": otherwise "eins<br>zwei" is
        # glued into the single bogus token "einszwei".
        text = self._HTML_TAG.sub(" ", text)
        text = self._NON_LETTER.sub(" ", text)
        text = text.lower()
        text = " ".join(
            word for word in text.split() if word not in self.stopwords
        )
        if self.lemmatization:
            text = " ".join(token.lemma_ for token in self.nlp(text))
        return text

    def _preprocess_notes(self, notes):
        """Add a ``preprocced`` column holding the cleaned text of each note.

        The ``nflds`` entry of every note (a list of field strings) is joined
        with spaces and then passed through :meth:`_preprocess`.
        """
        print("preprocessing notes")
        notes["preprocced"] = notes["nflds"].progress_apply(lambda x: " ".join(x))
        notes["preprocced"] = notes["preprocced"].progress_apply(self._preprocess)
        return notes

    def _get_similar_notes(self, notes):
        """Return ``(nid_a, nid_b, similarity)`` for every similar note pair.

        Pairs are reported once (row of ``nid_a`` before row of ``nid_b``) and
        ordered by row position.  Only similarities strictly greater than
        ``self.threshold`` are returned.

        NOTE: pairs with a similarity of exactly zero are not stored in the
        sparse product, so a negative threshold would not report them; the
        threshold is assumed to be non-negative.
        """
        print("calculating similarity")
        if len(notes) < 2:
            # Nothing to compare; also avoids fitting a vectorizer on no data.
            return []

        tfidf_matrix = TfidfVectorizer().fit_transform(notes["preprocced"])
        # TfidfVectorizer L2-normalizes rows by default, so the product of the
        # matrix with its transpose is the cosine similarity.  ``@`` is used
        # because np.dot is not aware of SciPy sparse matrices.
        similarities = (tfidf_matrix @ tfidf_matrix.T).tocoo()

        # Keep the strict upper triangle (each pair once, no self-pairs) and
        # only the entries above the threshold, without a Python-level
        # O(n^2) loop over a sparse matrix.
        keep = (similarities.row < similarities.col) & (
            similarities.data > self.threshold
        )
        pairs = sorted(
            zip(
                similarities.row[keep].tolist(),
                similarities.col[keep].tolist(),
                similarities.data[keep].tolist(),
            )
        )
        nids = notes["nid"].tolist()
        return [(nids[i], nids[j], score) for i, j, score in pairs]

    def _add_tags(self, notes, similar_notes):
        """Append similarity tags to ``notes["ntags"]`` and return ``notes``.

        Both notes of a pair receive the same
        ``SimilarityCheck::Score<score>-nids<a>*<b>`` tag.  The note whose
        cleaned text is strictly shorter gets
        ``SimilarityCheck::less-information``; on a tie the second note of the
        pair gets it.  Duplicate tags are removed while keeping the existing
        tag order, so notes without new tags are left as they were.
        """
        print("adding tags to cards")
        # Work on plain Python lists and write the column back once instead
        # of rebuilding a boolean mask over all notes for every pair.
        row_tags = [list(tags) for tags in notes["ntags"]]
        texts = notes["preprocced"].tolist()
        rows_by_nid = {}
        for row, nid in enumerate(notes["nid"].tolist()):
            rows_by_nid.setdefault(nid, []).append(row)

        for note1, note2, similarity in tqdm(similar_notes):
            pair_tag = f"SimilarityCheck::Score{similarity}-nids{note1}*{note2}"
            rows1 = rows_by_nid[note1]
            rows2 = rows_by_nid[note2]
            for row in rows1 + rows2:
                row_tags[row].append(pair_tag)

            if len(texts[rows1[0]]) < len(texts[rows2[0]]):
                shorter_rows = rows1
            else:
                shorter_rows = rows2
            for row in shorter_rows:
                row_tags[row].append("SimilarityCheck::less-information")

        notes["ntags"] = pd.Series(
            [self._unique_in_order(tags) for tags in row_tags],
            index=notes.index,
            dtype=object,
        )
        return notes

    def _save_cards(self):
        """Print a summary of the pending changes and write the collection."""
        self.col.summarize_changes(output="print")
        self.col.write(modify=True)

    def run(self):
        """Run the whole process: select, preprocess, compare, tag, save."""
        cards = self.col.cards.merge_notes()
        # Prefix match on the deck name column.
        cards = cards[cards["cdeck"].str.startswith(self.deck_name)]
        note_ids = cards.nid

        selected_notes = self.col.notes[self.col.notes.nid.isin(note_ids)]
        # Expose ``nid`` as a regular column for the helper methods.
        selected_notes = selected_notes.reset_index()
        selected_notes = self._preprocess_notes(selected_notes)
        similar_notes = self._get_similar_notes(selected_notes)
        selected_notes = self._add_tags(selected_notes, similar_notes)

        selected_notes = selected_notes.set_index("nid")
        # The helper column must not be written back to the collection.
        selected_notes = selected_notes.drop(columns=["preprocced"])
        selected_notes.info()

        # Update the notes in the collection and persist them.
        self.col.notes.update(selected_notes)
        self._save_cards()
if __name__ == "__main__":
    # Check the "Statistik" deck using the large German spaCy pipeline and
    # the German stop-word list, with lemmatization switched on.
    german_pipeline = spacy.load("de_core_news_lg")
    checker = AnkiCardSimilarity(
        deck_name="Statistik",
        nlp=german_pipeline,
        stopwords=STOP_WORDS,
        lemmatization=True,
        threshold=0.8,
    )
    checker.run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I would like to add a variation of this code that supports English and comments out the ankipandas write step (which raises an error), so that the results can be added to Anki manually by searching for nid:"unspaced, unquoted comma-separated output from a local file". (It also removes unused imports and the default threshold value; because use cases vary, the threshold needs to be fine-tuned to your content.)