Skip to content

Instantly share code, notes, and snippets.

@rgov
Created April 14, 2020 23:25
Show Gist options
  • Select an option

  • Save rgov/9f0665b83dee1c004e33e898a628b62f to your computer and use it in GitHub Desktop.

Select an option

Save rgov/9f0665b83dee1c004e33e898a628b62f to your computer and use it in GitHub Desktop.
import numpy as np
from sklearn import linear_model
def letvec(w):
    """Return the letter-count vector of the string ``w``.

    The result is a length-26 NumPy integer array whose i-th entry is the
    number of occurrences in ``w`` of the i-th lowercase letter of
    'abcdefghijklmnopqrstuvwxyz'.  Any other character (uppercase letters,
    spaces, digits, punctuation) is ignored, so callers that want
    case-insensitive counts must lowercase ``w`` first.
    """
    return np.array([w.count(k) for k in 'abcdefghijklmnopqrstuvwxyz'])
# Per-letter frequencies for a..z, expressed in percent (they are divided by
# 100 below).  The values match the commonly cited English letter-frequency
# table.
freq = [
    8.167, 1.492, 2.202, 4.253, 12.702, 2.228, 2.015, 6.094, 6.966, 0.153,
    1.292, 4.025, 2.406, 6.749, 7.507, 1.929, 0.095, 5.987, 6.327, 9.356,
    2.758, 0.978, 2.560, 0.150, 1.994, 0.077,
]
assert len(freq) == 26

# from https://github.com/first20hours/google-10000-english
with open('google-10000-english-usa-no-swears.txt', encoding='utf-8') as f:
    # One word per line; blank lines are skipped so they do not become
    # empty "words" (all-zero feature columns) in the matrix below.
    words = [w for w in (line.lower().rstrip() for line in f) if w]

# Shape (26, n_words): column j is the letter-count vector of words[j], so
# each letter is a "sample" and each word is a "feature" for the regression.
wordmat = np.array([letvec(w) for w in words]).transpose()
assert (letvec(words[0]) == wordmat[:, 0]).all()

# Desired (fractional) count of each letter in a text of target_len letters.
target_len = 200
target_vec = np.array(freq) * target_len / 100.0

# Sparse, non-negative regression: find per-word weights whose weighted sum
# of letter-count vectors approximates target_vec.
lars = linear_model.LassoLars(alpha=0.01, fit_intercept=False, positive=True, max_iter=50000)
lars.fit(wordmat, target_vec)

# Round each weight to a whole number of repetitions of that word.
solution = []
for word, weight in zip(words, lars.coef_):
    solution.extend([word] * int(round(weight)))
solution = ' '.join(solution)

# NOTE: len(solution) counts the separating spaces as well as the letters.
print(len(solution), solution)
# `solution` is already a single string here; count its letters directly.
print('got: ', letvec(solution))
# Round (rather than truncate) so the target row is comparable with the
# rounded word multiplicities used to build the solution.
print('want:', np.rint(target_vec).astype(int))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment