AlexRiina · July 10, 2020 21:37
diff --git a/anki_similarity.py b/anki_similarity.py
 """
 Text clustering of Anki cards to find duplicated concepts. Prints a handful of notes, each followed by a list of similar notes.

 ```
 pip install anki_sqlalchemy bs4 scikit-learn
 cp ${ANKI_DATABASE:?replace me} backup.db
 python anki_similarity.py
 ```
 """

 import re
 import bs4
 import random
 from anki_sqlalchemy import Note, Collection
 from sqlalchemy import create_engine
 from sqlalchemy.orm import sessionmaker

 from sklearn.neighbors import NearestNeighbors
 from sklearn.feature_extraction.text import TfidfVectorizer

 # One cloze deletion: {{<group>::<answer>}} with an optional ::<hint> tail.
 CLOZE_EXTRACT = re.compile(r"{{(?P<group>.*?)::(?P<answer>.*?)(::.*?)?}}")

 # Work against the backup copy of the collection, never the live database.
 engine = create_engine("sqlite:///backup.db", echo=True)
 Session = sessionmaker(bind=engine)
 session = Session()

 col = session.query(Collection).one()

 # First note type whose name is "Cloze"; next() raises StopIteration if
 # there is none.
 cloze = next(
     note_type for note_type in col.models.values() if note_type["name"] == "Cloze"
 )
 cloze_notes = session.query(Note).filter_by(model_id=cloze["id"])

 # "ord" of the field named "Text"; used below to index note.fields.
 text_ord = next(spec for spec in cloze["flds"] if spec["name"] == "Text")["ord"]


 def flatten(a: str) -> str:
     """Reduce a cloze note's field text to lowercase plain text.

     Replaces ``&nbsp;`` entities with spaces, strips HTML markup when the
     text contains a ``<``, and substitutes every cloze deletion matched by
     ``CLOZE_EXTRACT`` with its answer.

     Args:
         a: Raw field text, possibly containing HTML and cloze markers.

     Returns:
         The flattened, lowercased text.
     """
     a = a.replace("&nbsp;", " ")

     if '<' in a:
         # Name the parser explicitly: otherwise Beautiful Soup picks whichever
         # parser happens to be installed (and warns), so the extracted text
         # can differ from one machine to the next.
         a = bs4.BeautifulSoup(a, "html.parser").get_text()

     return CLOZE_EXTRACT.sub(r"\g<answer>", a).lower()


 # Map each cloze note id to its flattened text, then split into parallel
 # lists so that a row index in the vector matrix identifies a note.
 cloze_pairs = {note.id: flatten(note.fields[text_ord]) for note in cloze_notes}
 cloze_ids, cloze_texts = list(cloze_pairs.keys()), list(cloze_pairs.values())

 # Term vectors with IDF weighting disabled and English stop words removed.
 tfidf = TfidfVectorizer(use_idf=False, stop_words='english')
 text_vectors = tfidf.fit_transform(cloze_texts)

 # Densify once, as a plain ndarray: todense() returns np.matrix, which
 # current scikit-learn estimators refuse to accept.
 dense_vectors = text_vectors.toarray()

 nbrs = NearestNeighbors(n_neighbors=3, algorithm="ball_tree")
 nbrs.fit(dense_vectors)
 neighbors_lists = nbrs.radius_neighbors(
     dense_vectors, radius=0.9, return_distance=False
 )
 # Keep only notes with at least one neighbour other than themselves, and
 # drop the note's own index from its neighbour list.
 neighbors_lists = [
     (index, list(set(neighbors) - {index}))
     for index, neighbors in enumerate(neighbors_lists)
     if len(neighbors) > 1  # more than self
 ]

 # random.sample raises ValueError when asked for more items than exist, so
 # cap the sample at the number of notes that actually have neighbours.
 sample_size = min(12, len(neighbors_lists))
 for idx, neighbors in random.sample(neighbors_lists, sample_size):
     print(f"{cloze_texts[idx]} is similar to")

     for neighbor in neighbors:
         print('\t', cloze_texts[neighbor][:50], len(cloze_texts[neighbor]))
	"""
	Text clustering of Anki cards to find duplicated concepts. Prints a handful of notes, each followed by a list of similar notes.

	```
	pip install anki_sqlalchemy bs4 scikit-learn
	cp ${ANKI_DATABASE:?replace me} backup.db
	python anki_similarity.py
	```
	"""

	import re
	import bs4
	import random
	from anki_sqlalchemy import Note, Collection
	from sqlalchemy import create_engine
	from sqlalchemy.orm import sessionmaker

	from sklearn.neighbors import NearestNeighbors
	from sklearn.feature_extraction.text import TfidfVectorizer

	# One cloze deletion: {{<group>::<answer>}} with an optional ::<hint> tail.
	# The lazy `.*?` quantifiers are required: a bare `.?` matches at most one
	# character, so multi-character groups and answers would never match.
	CLOZE_EXTRACT = re.compile(r"{{(?P<group>.*?)::(?P<answer>.*?)(::.*?)?}}")

	# Work against the backup copy of the collection, never the live database.
	engine = create_engine("sqlite:///backup.db", echo=True)
	Session = sessionmaker(bind=engine)
	session = Session()

	col = session.query(Collection).one()

	# First note type whose name is "Cloze"; next() raises StopIteration if
	# there is none.
	cloze = next(
	    note_type for note_type in col.models.values() if note_type["name"] == "Cloze"
	)
	cloze_notes = session.query(Note).filter_by(model_id=cloze["id"])

	# "ord" of the field named "Text"; used later to index note.fields.
	text_ord = next(spec for spec in cloze["flds"] if spec["name"] == "Text")["ord"]


	def flatten(a: str) -> str:
	    """Reduce a cloze note's field text to lowercase plain text.

	    Replaces ``&nbsp;`` entities with spaces, strips HTML markup when the
	    text contains a ``<``, and substitutes every cloze deletion matched by
	    ``CLOZE_EXTRACT`` with its answer.

	    Args:
	        a: Raw field text, possibly containing HTML and cloze markers.

	    Returns:
	        The flattened, lowercased text.
	    """
	    # Target the HTML entity itself; it must be spelled out literally here.
	    a = a.replace("&nbsp;", " ")

	    if '<' in a:
	        # Name the parser explicitly: otherwise Beautiful Soup picks whichever
	        # parser happens to be installed (and warns), so the extracted text
	        # can differ from one machine to the next.
	        a = bs4.BeautifulSoup(a, "html.parser").get_text()

	    return CLOZE_EXTRACT.sub(r"\g<answer>", a).lower()


	# Map each cloze note id to its flattened text, then split into parallel
	# lists so that a row index in the vector matrix identifies a note.
	cloze_pairs = {note.id: flatten(note.fields[text_ord]) for note in cloze_notes}
	cloze_ids, cloze_texts = list(cloze_pairs.keys()), list(cloze_pairs.values())

	# Term vectors with IDF weighting disabled and English stop words removed.
	tfidf = TfidfVectorizer(use_idf=False, stop_words='english')
	text_vectors = tfidf.fit_transform(cloze_texts)

	# Densify once, as a plain ndarray: todense() returns np.matrix, which
	# current scikit-learn estimators refuse to accept.
	dense_vectors = text_vectors.toarray()

	nbrs = NearestNeighbors(n_neighbors=3, algorithm="ball_tree")
	nbrs.fit(dense_vectors)
	neighbors_lists = nbrs.radius_neighbors(
	    dense_vectors, radius=0.9, return_distance=False
	)
	# Keep only notes with at least one neighbour other than themselves, and
	# drop the note's own index from its neighbour list.
	neighbors_lists = [
	    (index, list(set(neighbors) - {index}))
	    for index, neighbors in enumerate(neighbors_lists)
	    if len(neighbors) > 1  # more than self
	]

	# random.sample raises ValueError when asked for more items than exist, so
	# cap the sample at the number of notes that actually have neighbours.
	sample_size = min(12, len(neighbors_lists))
	for idx, neighbors in random.sample(neighbors_lists, sample_size):
	    print(f"{cloze_texts[idx]} is similar to")

	    for neighbor in neighbors:
	        print('\t', cloze_texts[neighbor][:50], len(cloze_texts[neighbor]))