Skip to content

Instantly share code, notes, and snippets.

@cryptocoinserver
Last active May 19, 2024 21:48
Show Gist options
  • Save cryptocoinserver/399eff4505708bca8f7074ab6eebe8cb to your computer and use it in GitHub Desktop.
Save cryptocoinserver/399eff4505708bca8f7074ab6eebe8cb to your computer and use it in GitHub Desktop.
This script checks for similar cards in a deck and adds tags to them. It uses TF-IDF and cosine similarity to calculate the similarity between cards. It connects to the Anki collection with ankipandas, so Anki needs to be closed while the script is running. For the changes to show up in Anki, run "Tools" > "Check database" before opening "Browse".
import os
import re
import sqlite3
import zipfile
import numpy as np
import pandas as pd
import spacy
from ankipandas import Collection
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.de.stop_words import STOP_WORDS
from tqdm import tqdm
# Register tqdm's pandas integration so that `progress_apply` is available
# on the Series this script preprocesses.
tqdm.pandas()
class AnkiCardSimilarity:
    """Tag near-duplicate notes of an Anki deck via TF-IDF cosine similarity.

    The collection is opened with ankipandas in ``__init__``; ``run`` selects
    the notes of every deck whose name starts with ``deck_name``, scores all
    note pairs, tags the pairs scoring above ``threshold`` and writes the
    modified notes back to the collection.

    Args:
        deck_name: Prefix of the deck name(s) whose notes are compared.
        nlp: Callable spaCy-style pipeline; only used when ``lemmatization``
            is true (its tokens' ``lemma_`` attribute is read).
        stopwords: Container of lower-case words dropped before scoring.
        lemmatization: Whether to replace every word by its lemma.
        threshold: Pairs with a cosine similarity strictly greater than this
            value are reported.
    """

    # One HTML tag such as ``<br>`` or ``</div>``.
    _HTML_TAG = re.compile(r"<[^<]+?>")
    # Anything that is not an ASCII or German letter.
    _NON_LETTER = re.compile(r"[^a-zA-ZäöüÄÖÜß]")

    def __init__(self, deck_name, nlp, stopwords, lemmatization, threshold=0.8):
        self.col = Collection()
        self.deck_name = deck_name
        self.nlp = nlp
        self.stopwords = stopwords
        self.lemmatization = lemmatization
        self.threshold = threshold

    def _preprocess(self, text):
        """Return ``text`` reduced to lower-case (optionally lemmatized) words.

        HTML tags and every non-letter character are replaced by a space,
        then stop words are removed.
        """
        # Tags become a space (not ""), otherwise "foo<br>bar" would be
        # merged into the single bogus word "foobar".
        text = self._HTML_TAG.sub(" ", text)
        text = self._NON_LETTER.sub(" ", text).lower()
        text = " ".join(
            word for word in text.split() if word not in self.stopwords
        )
        if self.lemmatization:
            text = " ".join(token.lemma_ for token in self.nlp(text))
        return text

    def _preprocess_notes(self, notes):
        """Add a ``preprocced`` column holding the cleaned text of each note.

        The ``nflds`` entry of a note (a list of field strings) is joined
        with spaces and passed through ``_preprocess``. ``notes`` is modified
        in place and returned.
        """
        print("preprocessing notes")
        notes["preprocced"] = notes["nflds"].progress_apply(
            lambda fields: self._preprocess(" ".join(fields))
        )
        return notes

    def _get_similar_notes(self, notes):
        """Return ``(nid_a, nid_b, score)`` for every pair above the threshold.

        Pairs are reported once (``nid_a`` is the note that comes first in
        ``notes``) and ordered by the position of their notes in ``notes``.
        An empty list is returned when no note has any text left after
        preprocessing.
        """
        print("calculating similarity")
        texts = list(notes["preprocced"])
        if not any(text.strip() for text in texts):
            # TfidfVectorizer cannot build a vocabulary from blank documents.
            return []

        # TfidfVectorizer L2-normalises each row by default, so the product
        # of the matrix with its transpose holds the cosine similarities.
        tfidf_matrix = TfidfVectorizer().fit_transform(texts)
        # ``@`` is the matrix product supported by SciPy sparse containers;
        # ``np.dot`` is not sparse-aware.
        product = (tfidf_matrix @ tfidf_matrix.T).tocoo()

        if self.threshold >= 0:
            # Pairs missing from the sparse result score 0 and can never
            # exceed a non-negative threshold, so only stored entries matter.
            rows, cols, scores = product.row, product.col, product.data
        else:
            # A negative threshold also selects zero-similarity pairs.
            dense = product.toarray()
            rows, cols = np.triu_indices(dense.shape[0], k=1)
            scores = dense[rows, cols]

        keep = (rows < cols) & (scores > self.threshold)
        rows, cols, scores = rows[keep], cols[keep], scores[keep]
        order = np.lexsort((cols, rows))  # by first note, then second note

        nids = notes["nid"].tolist()
        return [
            (nids[i], nids[j], score)
            for i, j, score in zip(
                rows[order].tolist(),
                cols[order].tolist(),
                scores[order].tolist(),
            )
        ]

    def _add_tags(self, notes, similar_notes):
        """Append similarity tags to the ``ntags`` lists of ``notes``.

        Both notes of a pair receive a
        ``SimilarityCheck::Score<score>-nids<a>*<b>`` tag; the note with the
        shorter preprocessed text (the second one on a tie) additionally
        receives ``SimilarityCheck::less-information``. Duplicate tags are
        removed while keeping the existing tag order. ``notes`` is modified
        in place and returned.
        """
        print("adding tags to cards")
        text_length = {}
        for nid, text in zip(notes["nid"], notes["preprocced"]):
            text_length.setdefault(nid, len(text))

        # Collect the new tags per note first so the frame is written once
        # instead of being scanned several times for every pair.
        new_tags = {}
        for note1, note2, similarity in similar_notes:
            pair_tag = f"SimilarityCheck::Score{similarity}-nids{note1}*{note2}"
            new_tags.setdefault(note1, []).append(pair_tag)
            new_tags.setdefault(note2, []).append(pair_tag)
            if text_length[note1] < text_length[note2]:
                shorter = note1
            else:
                shorter = note2
            new_tags[shorter].append("SimilarityCheck::less-information")

        # dict.fromkeys de-duplicates without shuffling, so notes that got
        # no new tag keep their tag list unchanged.
        notes["ntags"] = [
            list(dict.fromkeys(list(tags) + new_tags.get(nid, [])))
            for nid, tags in zip(notes["nid"], notes["ntags"])
        ]
        return notes

    def _save_cards(self):
        """Print a summary of the pending changes and write the collection."""
        self.col.summarize_changes(output="print")
        self.col.write(modify=True)

    def run(self):
        """Run the whole process: select, preprocess, compare, tag, save."""
        cards = self.col.cards.merge_notes()
        cards = cards[cards["cdeck"].str.startswith(self.deck_name)]

        selected_notes = self.col.notes[self.col.notes.nid.isin(cards.nid)]
        selected_notes = selected_notes.reset_index()
        if selected_notes.empty:
            print(f'no notes found in decks starting with "{self.deck_name}"')
            return

        selected_notes = self._preprocess_notes(selected_notes)
        similar_notes = self._get_similar_notes(selected_notes)
        selected_notes = self._add_tags(selected_notes, similar_notes)

        selected_notes = selected_notes.set_index("nid")
        # The helper column must not be written back to the collection.
        selected_notes = selected_notes.drop(columns=["preprocced"])
        selected_notes.info()

        # update notes in collection
        self.col.notes.update(selected_notes)
        self._save_cards()
if __name__ == "__main__":
    # Configuration: every deck whose name starts with `deck_name` is checked.
    deck_name = "Statistik"
    # German spaCy model, used for lemmatization.
    nlp = spacy.load("de_core_news_lg")
    stopwords = STOP_WORDS
    lemmatization = True
    # Pairs scoring above this cosine similarity get tagged.
    threshold = 0.8

    anki_card_similarity = AnkiCardSimilarity(
        deck_name, nlp, stopwords, lemmatization, threshold
    )
    anki_card_similarity.run()
@kornkaobat
Copy link

kornkaobat commented Feb 3, 2024

I would like to add a variation of this code. It changes the code to support English and comments out the ankipandas write step (because of its write error), so that the results can be added to Anki manually using nid:"unspaced, unquoted comma output from local file". (It also removes unused imports and the default threshold value; because use cases vary, the threshold will need to be fine-tuned to your content.)

import re

import numpy as np
import spacy
from ankipandas import Collection
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en.stop_words import STOP_WORDS
from tqdm import tqdm

# Register tqdm's pandas integration so that `progress_apply` is available
# on the Series this script preprocesses.
tqdm.pandas()


class AnkiCardSimilarity:
    """Find near-duplicate notes of an Anki deck via TF-IDF cosine similarity.

    The collection is opened with ankipandas in ``__init__``; ``run`` selects
    the notes of every deck whose name starts with ``deck_name``, scores all
    note pairs and writes the note ids of the similar notes to
    ``output.txt``. Writing tags back to the collection is disabled in this
    variant.

    Args:
        deck_name: Prefix of the deck name(s) whose notes are compared.
        nlp: Callable spaCy-style pipeline; only used when ``lemmatization``
            is true (its tokens' ``lemma_`` attribute is read).
        stopwords: Container of lower-case words dropped before scoring.
        lemmatization: Whether to replace every word by its lemma.
        threshold: Pairs with a cosine similarity strictly greater than this
            value are reported.
    """

    # One HTML tag such as ``<br>`` or ``</div>``.
    _HTML_TAG = re.compile(r"<[^<]+?>")
    # Restricts characters based on model language (ASCII letters only).
    _NON_LETTER = re.compile(r"[^a-zA-Z]")

    def __init__(self, deck_name, nlp, stopwords, lemmatization, threshold):
        self.col = Collection()
        self.deck_name = deck_name
        self.nlp = nlp
        self.stopwords = stopwords
        self.lemmatization = lemmatization
        self.threshold = threshold

    def _preprocess(self, text):
        """Return ``text`` reduced to lower-case (optionally lemmatized) words.

        HTML tags and every non-letter character are replaced by a space,
        then stop words are removed.
        """
        # Tags become a space (not ""), otherwise "foo<br>bar" would be
        # merged into the single bogus word "foobar". The content between
        # tags is preserved.
        text = self._HTML_TAG.sub(" ", text)
        text = self._NON_LETTER.sub(" ", text).lower()
        text = " ".join(
            word for word in text.split() if word not in self.stopwords
        )
        if self.lemmatization:
            text = " ".join(token.lemma_ for token in self.nlp(text))
        return text

    def _preprocess_notes(self, notes):
        """Add a ``preprocced`` column holding the cleaned text of each note.

        The ``nflds`` entry of a note (a list of field strings) is joined
        with spaces and passed through ``_preprocess``. ``notes`` is modified
        in place and returned.
        """
        print("preprocessing notes")
        notes["preprocced"] = notes["nflds"].progress_apply(
            lambda fields: self._preprocess(" ".join(fields))
        )
        return notes

    def _find_pairs(self, notes):
        """Return ``(nid_a, nid_b, score)`` for every pair above the threshold."""
        texts = list(notes["preprocced"])
        if not any(text.strip() for text in texts):
            # TfidfVectorizer cannot build a vocabulary from blank documents.
            return []

        # TfidfVectorizer L2-normalises each row by default, so the product
        # of the matrix with its transpose holds the cosine similarities.
        tfidf_matrix = TfidfVectorizer().fit_transform(texts)
        # ``@`` is the matrix product supported by SciPy sparse containers;
        # ``np.dot`` is not sparse-aware.
        product = (tfidf_matrix @ tfidf_matrix.T).tocoo()

        if self.threshold >= 0:
            # Pairs missing from the sparse result score 0 and can never
            # exceed a non-negative threshold, so only stored entries matter.
            rows, cols, scores = product.row, product.col, product.data
        else:
            # A negative threshold also selects zero-similarity pairs.
            dense = product.toarray()
            rows, cols = np.triu_indices(dense.shape[0], k=1)
            scores = dense[rows, cols]

        keep = (rows < cols) & (scores > self.threshold)
        rows, cols, scores = rows[keep], cols[keep], scores[keep]
        order = np.lexsort((cols, rows))  # by first note, then second note

        # tolist() yields plain Python ints/floats, so the ids are printed
        # as bare numbers regardless of the installed NumPy version.
        nids = notes["nid"].tolist()
        return [
            (nids[i], nids[j], score)
            for i, j, score in zip(
                rows[order].tolist(),
                cols[order].tolist(),
                scores[order].tolist(),
            )
        ]

    def _get_similar_notes(self, notes):
        """Score all note pairs and dump the similar note ids to a file.

        Returns a list of ``(nid_a, nid_b, score)`` tuples, one per pair
        whose similarity exceeds the threshold, ordered by the position of
        the notes in ``notes``. The second note id of every pair is written
        to ``output.txt`` (in the working directory) as a Python list.
        """
        print("calculating similarity")
        similar_cards = self._find_pairs(notes)

        # The context manager closes (and flushes) the file reliably.
        with open("output.txt", "w", encoding="utf-8") as f:
            print([pair[1] for pair in similar_cards], file=f)
        return similar_cards

    def _add_tags(self, notes, similar_notes):
        """Append similarity tags to the ``ntags`` lists of ``notes``.

        Both notes of a pair receive a
        ``SimilarityCheck::Score<score>-nids<a>*<b>`` tag; the note with the
        shorter preprocessed text (the second one on a tie) additionally
        receives ``SimilarityCheck::less-information``. Duplicate tags are
        removed while keeping the existing tag order. ``notes`` is modified
        in place and returned.
        """
        print("adding tags to cards")
        text_length = {}
        for nid, text in zip(notes["nid"], notes["preprocced"]):
            text_length.setdefault(nid, len(text))

        # Collect the new tags per note first so the frame is written once
        # instead of being scanned several times for every pair.
        new_tags = {}
        for note1, note2, similarity in similar_notes:
            pair_tag = f"SimilarityCheck::Score{similarity}-nids{note1}*{note2}"
            new_tags.setdefault(note1, []).append(pair_tag)
            new_tags.setdefault(note2, []).append(pair_tag)
            if text_length[note1] < text_length[note2]:
                shorter = note1
            else:
                shorter = note2
            new_tags[shorter].append("SimilarityCheck::less-information")

        # dict.fromkeys de-duplicates without shuffling, so notes that got
        # no new tag keep their tag list unchanged.
        notes["ntags"] = [
            list(dict.fromkeys(list(tags) + new_tags.get(nid, [])))
            for nid, tags in zip(notes["nid"], notes["ntags"])
        ]
        return notes

    # Saving is disabled in this variant:
    # def _save_cards(self):
        # self.col.summarize_changes(output="print")
        # self.col.write(modify=True)

    def run(self):
        """Run the process: select, preprocess, compare, write ``output.txt``."""
        cards = self.col.cards.merge_notes()
        cards = cards[cards["cdeck"].str.startswith(self.deck_name)]

        selected_notes = self.col.notes[self.col.notes.nid.isin(cards.nid)]
        selected_notes = selected_notes.reset_index()

        selected_notes = self._preprocess_notes(selected_notes)
        # Called for its side effect: the similar note ids land in output.txt.
        self._get_similar_notes(selected_notes)

        # Tagging and writing back to the collection are intentionally
        # disabled; re-enable the steps below to restore them.
        # selected_notes = self._add_tags(selected_notes, similar_notes)
        # selected_notes = selected_notes.set_index("nid")
        # selected_notes = selected_notes.drop(columns=["preprocced"])
        # selected_notes.info()
        # self.col.notes.update(selected_notes)
        # self._save_cards()

if __name__ == "__main__":
    # --------------------------------
    # The similarity cut-off is used in two passes:
    #   Step 1: remove irrelevant notes with threshold = 0.3
    #   Step 2: the value below marks cards that are too similar to an
    #           existing pivot card for removal.
    threshold = 0.325
    # --------------------------------
    lemmatization = True
    stopwords = STOP_WORDS
    nlp = spacy.load("en_core_web_lg")
    deck_name = "deckname"

    anki_card_similarity = AnkiCardSimilarity(
        deck_name=deck_name,
        nlp=nlp,
        stopwords=stopwords,
        lemmatization=lemmatization,
        threshold=threshold,
    )
    anki_card_similarity.run()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment