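# Puzzle generator for Pimantle (and, commented out at the bottom, Semantle) word maps.
# For each secret word it ranks a pruned word2vec vocabulary by similarity, lays the words
# out on a 2-D spiral around the answer, and writes the result as a compact binary file
# under ../public/ (presumably where the game's front end reads it from).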
import codecs
import json
import random
import struct
from collections import defaultdict
from pprint import pprint

import numpy as np
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors

from semantle_words_3 import frpergJbeqf as old_frpergJbeqf
from semantle_words_4 import frpergJbeqf
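# Word frequencies from the English Wikipedia dump, one "word count" pair per line.
# (Loaded here but not referenced again in this script.)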
frequency = defaultdict(lambda: 0)
with open("../../../enwiki-20190320-words-frequency.txt", 'r', encoding="utf-8") as f:
    for line in f:
        pair = line.split()
        frequency[pair[0].lower()] = int(pair[1])
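# Small helpers: unit-normalize a vector, and convert polar (rho, phi) to Cartesian (x, y).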
def normalize(v):
    norm = np.linalg.norm(v)
    if norm == 0:
        return v
    return v / norm


def pol2cart(rho, phi):
    x = rho * np.cos(phi)
    y = rho * np.sin(phi)
    return (x, y)
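# Build the full word map for one secret word and write it to a .bin file under ../public/.
# For "semantle" puzzles the answer comes from the ROT13-obfuscated frpergJbeqf list;
# for "pimantle" puzzles the caller supplies the answer via `word`.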
def generate_puzzle(puzzle_index=1, puzzle_type="pimantle", word=""):
    model: KeyedVectors = KeyedVectors.load("../../../pruned_words.bin")
    if puzzle_type == "semantle":
        secret_word = codecs.encode(frpergJbeqf[puzzle_index], 'rot_13')
    else:
        secret_word = word
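    # Rank the entire vocabulary by similarity to the secret word (topn covers every key,
    # not just the top 1,000 that the variable names suggest).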
    best_1000 = model.similar_by_key(secret_word, topn=len(model.index_to_key))
    best_1000_labels = [secret_word] + [x[0] for x in best_1000]
    best_1000_indexes = [model.key_to_index[label] for label in best_1000_labels]
    best_1000_similarity = [1.0] + [x[1] for x in best_1000]
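    # Note: the `+` below is a NumPy broadcast rather than list concatenation: the secret
    # word's vector is added element-wise to every row (the secret word already sits first
    # in best_1000_labels), and each row is then re-normalized.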
    best_1000_vectors = [model[secret_word]] + np.array([model[x] for x in best_1000_labels])
    for vec in best_1000_vectors:
        vec[:] = normalize(vec)
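    # Squash the normalized vectors down to one dimension with t-SNE; this 1-D embedding
    # becomes each word's angular position on the map.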
    best_1000_scaler = TSNE(n_components=1, init="random", learning_rate="auto", verbose=1).fit_transform(
        best_1000_vectors)
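    # Convert to polar coordinates: the radius grows exponentially as similarity drops
    # (so the secret word sits at the origin), and the angle is the rescaled t-SNE value
    # plus a random twist that scales with similarity, spreading the words into a spiral.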
    similarity_twist = random.uniform(-1.5 * np.pi, 1.5 * np.pi)
    base_twist = random.uniform(-2 * np.pi, 2 * np.pi)
    similarity_array = np.array(best_1000_similarity)
    best_1000_polar = np.column_stack(pol2cart(
        50_000 ** (1 - similarity_array) - 1,
        np.squeeze(
            (best_1000_scaler - np.min(best_1000_scaler)) / np.ptp(best_1000_scaler) * np.pi * 2
        ) + similarity_array * similarity_twist + base_twist))
    best_1000_vecs = best_1000_polar / 50_000
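    # Relaxation pass: nudge each point toward its 10 nearest on-map neighbors, weighting
    # each displacement by semantic similarity to that neighbor and by (1 - similarity to
    # the secret word), so points near the center barely move.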
    neighbors = NearestNeighbors(n_neighbors=10, n_jobs=-1).fit(best_1000_vecs).kneighbors(return_distance=False)
    neighbor_similarities = []
    neighbor_distances = []
    for i, neighbor in enumerate(neighbors):
        node = best_1000_labels[i]
        neighbor_words = [best_1000_labels[x] for x in neighbor]
        neighbor_similarities.append([model.similarity(node, x) for x in neighbor_words])
        neighbor_distances.append([best_1000_vecs[x] - best_1000_vecs[i] for x in neighbor])
    neighbor_similarities = np.array(neighbor_similarities, ndmin=2)
    neighbor_distances = np.array(neighbor_distances, ndmin=3)
    total_distances = np.array([
        [distance * similarity * (1 - radius) for distance, similarity in zip(distances, similarities)]
        for distances, similarities, radius in zip(neighbor_distances, neighbor_similarities, best_1000_similarity)
    ])
    total_distances = np.sum(total_distances, axis=1)
    best_1000_vecs += total_distances
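    # Serialize the map: a little-endian uint32 header holding the secret word's vocabulary
    # index, followed by one 14-byte record per word (uint32 index, float32 x, float32 y,
    # float16 similarity), sorted by similarity in descending order.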
    dump = []
    for i in range(len(best_1000_labels)):
        dump.append([best_1000_indexes[i], round(best_1000_vecs[i][0], 6), round(best_1000_vecs[i][1], 6),
                     round(best_1000_similarity[i], 3)])
    dump.sort(key=lambda x: x[3], reverse=True)

    buff = bytearray()
    buff.extend(struct.pack("<I", model.key_to_index[secret_word]))
    for entry in dump:
        buff.extend(struct.pack("<Iffe", entry[0], entry[1], entry[2], entry[3]))
    with open(f"../public/{'secret_words' if puzzle_type == 'pimantle' else 'semantle_words'}/secret_word_{puzzle_index}.bin", "wb") as f:
        f.write(buff)
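# Driver: read the upcoming Pimantle answers, shuffle them, and generate one map per word,
# numbering the puzzles from 233 onward and logging each answer ROT13-encoded.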
pimantle_puzzles = []
with open("pimantle_words_2.txt", 'r', encoding="utf-8") as f:
    for line in f:
        pimantle_puzzles.append(line.strip())
random.shuffle(pimantle_puzzles)

for i, puzzle in enumerate(pimantle_puzzles):
    generate_puzzle(puzzle_index=i + 233, word=puzzle, puzzle_type="pimantle")
    with open("puzzle_log.txt", 'a') as f:
        f.write(f"pimantle puzzle {i + 233} generated: {codecs.encode(puzzle, 'rot_13')}\n")
# for i in range(200, 400):
#     print("generating semantle puzzle", i)
#     if i > 199 or frpergJbeqf[i] != old_frpergJbeqf[i]:
#         generate_puzzle(puzzle_index=i, puzzle_type="semantle", word=frpergJbeqf[i])
#     else:
#         print("skipping", i)
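# For reference, a minimal sketch (hypothetical, not used above) of reading a generated
# file back into (vocab_index, x, y, similarity) tuples:
#
#     with open("../public/secret_words/secret_word_233.bin", "rb") as f:
#         data = f.read()
#     (secret_index,) = struct.unpack_from("<I", data, 0)
#     entries = list(struct.iter_unpack("<Iffe", data[4:]))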