Skip to content

Instantly share code, notes, and snippets.

@bwesterb
Created November 1, 2019 10:29
Show Gist options
  • Save bwesterb/4601744ad81c5092a4927d0c30ad07d4 to your computer and use it in GitHub Desktop.
Save bwesterb/4601744ad81c5092a4927d0c30ad07d4 to your computer and use it in GitHub Desktop.
import csv
import json
from unidecode import unidecode
# Characters kept when tokenising; every other character acts as a word
# separator.
anum = set('abcdefghijklmnopqrstuvwxyz0123456789')

# Maps each suffix string to a list of (song index, "artist - title") pairs,
# one pair per (artist suffix, title suffix) combination that produced it.
sfxs = {}


def _words(text):
    """Lowercase *text* and split it into runs of characters from ``anum``."""
    return ''.join(c if c in anum else ' ' for c in text.lower()).split()


def add(a, t, i):
    """Register every artist-suffix + title-suffix combination of one song.

    a -- artist name
    t -- song title
    i -- index of the song in the loaded song list

    Each key is a suffix of the artist's words followed by a suffix of the
    title's words, joined by single spaces.  Both suffixes are non-empty, so
    a song whose artist or title contains no characters from ``anum`` adds
    nothing.  Entries are appended to the module-level ``sfxs``.
    """
    awords = _words(a)
    twords = _words(t)
    label = a + ' - ' + t
    for j in range(len(awords)):
        for k in range(len(twords)):
            sfx = ' '.join(awords[j:] + twords[k:])
            sfxs.setdefault(sfx, []).append((i, label))


print("Computing suffices...")
# Load inside a context manager so the file handle is closed promptly, and
# read it as UTF-8 rather than whatever the locale default happens to be.
with open('all_songs.json', encoding='utf-8') as songs_file:
    songs = json.load(songs_file)
for i, x in enumerate(songs):
    # Artist and title go through unidecode() before being tokenised.
    add(unidecode(x['artist']), unidecode(x['title']), i)
print(f" {len(sfxs)}")
print("Sorting suffices")
sorted_sfxs = sorted(sfxs)

print("Writing")
# newline='' leaves line endings to the csv module, as its documentation
# asks for any file object handed to csv.writer.
with open('marietje', 'w', newline='') as f:
    w = csv.writer(f, delimiter=",", quoting=csv.QUOTE_ALL)
    for sfx in sorted_sfxs:
        entries = sfxs[sfx]
        if len(entries) == 1:
            # Unique suffix: write the index stored with its single entry.
            # The global ``i`` must not be used here; after the loading loop
            # it only holds the index of the last song.
            w.writerow([sfx, entries[0][0]])
        else:
            # Ambiguous suffix: one row per entry, ordered by the
            # "artist - title" label, with a NUL byte plus the rank appended
            # to the key so every row key stays distinct.
            for j, i_at in enumerate(sorted(entries, key=lambda e: e[1])):
                w.writerow([sfx + f"\0{j}", i_at[0]])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment