$ python3 -m venv ~/.embeddings
$ source ~/.embeddings/bin/activate
$ pip install -U \
sentence-transformers \
pandas \
umap-learn
$ wget https://gist.githubusercontent.com/trag1c/f74b2ab3589bc4ce5706f934616f6195/raw/5aa7de70fc83664017cb97dd02cbf6dc76b9e4a3/nouns.txt
$ wc -l nouns.txt # 40,940 lines, 378 KB
$ head nouns.txt
'hood
a
a'man
a-horizon
a-line
a-list
a-team
aa
aachen
aalborg
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap.umap_ as umap
# Embed every noun in nouns.txt, project the embeddings to 2-D with UMAP,
# t-SNE and PCA, and write everything to a JSON Lines file.
model = SentenceTransformer('intfloat/multilingual-e5-large-instruct')

# Read one noun per line with plain file I/O rather than pd.read_csv:
# read_csv's default NA handling converts entries such as "nan", "null" or
# "n/a" into float NaN, and it would mis-parse any line containing a comma
# or a double quote. Blank lines are skipped, as read_csv did.
with open('nouns.txt', encoding='utf-8') as fh:
    nouns = [line.strip() for line in fh if line.strip()]
df = pd.DataFrame({'noun': nouns})

embeddings = model.encode(df['noun'].tolist(), device='cpu')  # ~5 minutes
df['embedding'] = list(embeddings)  # one vector per row
# asarray avoids copying when encode() already returned an ndarray.
embeddings_array = np.asarray(embeddings)

# Initialize UMAP to reduce to 2 components
# You might need to adjust parameters like n_neighbors and min_dist
reducer = umap.UMAP(
    n_components=2,
    random_state=42,
    n_neighbors=15,
    min_dist=0.1,
)
embeddings_umap = reducer.fit_transform(embeddings_array)
df['umap_x'] = embeddings_umap[:, 0]
df['umap_y'] = embeddings_umap[:, 1]

# Initialize t-SNE to reduce to 2 components
# You might need to adjust parameters like max_iter and
# perplexity depending on your data
tsne = TSNE(
    n_components=2,
    random_state=42,
    max_iter=300,
    perplexity=30,
)
embeddings_tsne = tsne.fit_transform(embeddings_array)
df['tsne_x'] = embeddings_tsne[:, 0]
df['tsne_y'] = embeddings_tsne[:, 1]

# Initialize PCA to reduce to 2 components
# random_state is fixed like the other two reducers so that the projection
# is reproducible if scikit-learn selects its randomized SVD solver.
pca = PCA(n_components=2, random_state=42)
embeddings_2d = pca.fit_transform(embeddings_array)
df['pca_x'] = embeddings_2d[:, 0]
df['pca_y'] = embeddings_2d[:, 1]

# Write one JSON object per line. index=False is not passed:
# orient='records' never emits the index, and older pandas releases raise
# ValueError when index=False is combined with this orient.
df.to_json(
    'english_nouns_embeddings.json',
    orient='records',
    lines=True,
)
# The resulting JSON file is 543 MB uncompressed.
$ head -n1 english_nouns_embeddings.json | jq -S .
{
"embedding": [
0.0209693518,
0.0295862984,
-0.0088079795,
-0.0459533148,
0.0224828105,
-0.0211671218,
-0.0128744617,
0.018182138,
0.0353978872,
-0.0108804181,
0.0141729573,
0.0424714759,
-0.0265897792,
-0.0214717165,
-0.0192336626,
-0.0085277101,
-0.0232005138,
0.0296546668,
-0.0113789486,
-0.0069909217,
0.0345830396,
-0.0235321596,
-0.0262915511,
-0.0516209342,
-0.0222521424,
-0.0102051198,
-0.0220041163,
-0.0317917876,
-0.0506815612,
-0.0395757817,
-0.0132756773,
0.0400672108,
-0.0094182435,
-0.0526665263,
-0.0304834172,
0.0561967604,
0.0243229512,
0.025846405,
-0.051909145,
0.01715694,
-0.0173763763,
0.0734321326,
0.0289790798,
-0.0244779885,
-0.017198436,
0.0105180833,
0.018143123,
0.0024040719,
0.0090004411,
0.0404919237,
0.0496148244,
0.0264843348,
-0.0212947894,
-0.0201991722,
-0.0395246297,
0.0233553257,
-0.0122203827,
-0.0142700244,
-0.0511177443,
0.0280076284,
-0.0358541943,
0.0033826509,
0.0178490616,
-0.0191307329,
-0.0374656953,
0.0613618828,
0.0410517976,
0.0405144356,
-0.0476211309,
0.021690134,
-0.006072619,
0.0103548625,
-0.0186568461,
-0.0061095143,
-0.017931642,
-0.0265272036,
0.0313385949,
-0.0071906247,
0.0120600658,
-0.0211219769,
0.0810838267,
0.0108550508,
-0.0044385786,
-0.0221983567,
0.0107457777,
0.0332191549,
0.0458686128,
0.0032653683,
0.0322599821,
0.0309017655,
-0.0248356145,
0.0559167266,
0.0333743989,
-0.056417428,
-0.0357224122,
-0.0163866319,
0.0492218435,
0.0393368602,
-0.0197999924,
-0.0390380844,
-0.0542951524,
-0.0221371595,
0.0203299522,
-0.0139840292,
-0.0205444191,
0.0215668194,
0.0305597559,
0.0342049673,
-0.0124181751,
0.026684396,
0.059180133,
0.0307563059,
0.018508058,
-0.0217743106,
-0.0456136167,
0.0302798301,
-0.0083205104,
-0.0379479863,
-0.0408654213,
0.0417722017,
0.0330663286,
0.0355529152,
0.029629739,
-0.0403966755,
0.0628189966,
-0.0325347632,
0.0185809694,
0.0388640389,
-0.0407639146,
-0.0147091877,
0.0221831575,
0.0251516588,
-0.0669362918,
-0.0119042322,
-0.0367835127,
-0.033422675,
-0.0012959191,
-0.0364792347,
-0.0082611069,
0.0266640168,
-0.0409457013,
-0.0053990013,
0.0188553706,
-0.0373026468,
-0.0558662862,
-0.051155176,
-0.0266269278,
0.0372218601,
-0.0260336194,
-0.0217141714,
-0.0087615754,
-0.0400849655,
-0.0243186951,
0.0159475394,
0.0240793172,
-0.0329706445,
0.0305529144,
0.0193705242,
0.021987373,
-0.0119589949,
-0.0116854357,
-0.0299953092,
-0.0107918112,
-0.0239745174,
0.0048658471,
-0.0400158353,
0.0501299351,
0.0281819161,
0.0172789302,
-0.013480789,
-0.0336827077,
-0.0407149568,
0.0375330448,
-0.0669582635,
0.0001516601,
0.0394926704,
0.0097546149,
0.022112336,
0.0328667462,
-0.0254752766,
0.0095153712,
0.0528677925,
0.0392245837,
-0.0350097939,
-0.0162911769,
0.0347194523,
0.0297702141,
0.0089268526,
0.0425772071,
0.0469902977,
-0.0011375649,
0.0174294543,
-0.0290927999,
0.0331506431,
-0.0032818238,
0.0470519327,
0.0115588736,
0.0260876603,
-0.0434244014,
-0.0306274407,
-0.0122683942,
0.0308749545,
-0.0535881631,
-0.0039703725,
0.0003285207,
-0.0088744182,
-0.0402123518,
-0.0163041968,
0.0315622203,
-0.0311529599,
-0.0455186889,
-0.0229611862,
0.0308164172,
0.0198655128,
-0.0179250576,
-0.0365995839,
-0.045459833,
-0.028752774,
0.0290797893,
0.0326028429,
0.0111602386,
0.0352096558,
-0.0219787359,
0.0508713163,
0.0518390834,
0.0116550727,
0.0391072594,
0.0525139086,
0.0243314058,
0.0166568924,
0.0231482349,
0.0099797053,
-0.0645264536,
-0.0321495794,
0.0203887112,
0.0052086976,
-0.0512637682,
-0.0035339238,
0.023472093,
0.0269396026,
-0.0427433364,
0.0185829941,
-0.0258611199,
-0.031797953,
-0.0319291875,
-0.0040472895,
0.0387544595,
-0.0447691344,
0.0446037725,
-0.0192819294,
0.0305649452,
-0.0211614333,
-0.0381722748,
0.0129556246,
-0.0000011376,
0.0316919982,
0.037226297,
-0.0320734233,
-0.0006805841,
-0.036002025,
-0.0066234721,
-0.0238228049,
-0.005316101,
0.0109401448,
0.0289450064,
0.0163936876,
-0.0156585146,
-0.0104713961,
-0.0260060541,
-0.0339103043,
-0.0555517152,
-0.036610689,
-0.0686736926,
-0.0509143472,
0.0055140834,
0.0076908832,
0.0011184398,
-0.0367619842,
-0.0031454354,
-0.0387876704,
0.0285349321,
-0.019006839,
0.0317083485,
-0.0180730466,
-0.0000141082,
-0.0105660344,
0.0409711078,
-0.0007463103,
0.046723865,
-0.0132832862,
0.0349774472,
-0.000787138,
0.0635638461,
0.0462373346,
0.017122373,
-0.0276494753,
-0.0345153473,
0.0168291479,
-0.0340132043,
0.0290941447,
-0.0202005021,
-0.0096772565,
0.0097899884,
0.0028106626,
-0.0522424057,
-0.0201352034,
-0.0100116413,
0.0293398518,
-0.0667592213,
0.0331836715,
0.016991023,
0.0402692072,
0.0091954079,
-0.0347385965,
-0.030812813,
-0.0163064171,
-0.0245329738,
0.0397904515,
0.0007123339,
0.0107593564,
-0.0307044219,
-0.0276725646,
0.0029435889,
0.0200403407,
0.0340115577,
0.0281871352,
0.0315990672,
-0.0299624037,
0.0296624657,
-0.023852922,
0.0424911715,
-0.0079179192,
0.0232507773,
0.0551960878,
-0.0192315616,
0.0289863497,
-0.0482864566,
0.0357451253,
0.0316745937,
-0.0360504501,
0.0359704606,
-0.0240616146,
-0.0568793006,
-0.041521538,
0.0423808768,
-0.0336635225,
0.0629118159,
-0.0111614801,
0.026405625,
0.0220579058,
0.0073547233,
-0.0611607581,
0.0378272869,
0.0395223498,
0.0077721481,
0.035675969,
0.0499266535,
-0.0494170487,
-0.0155187342,
-0.0292464793,
0.0225869492,
0.018405214,
0.0485708751,
-0.0117317773,
-0.0548212826,
-0.0193721373,
0.0118090287,
-0.0102043618,
-0.0449720956,
-0.0388428904,
0.0360574573,
-0.0008622319,
-0.0402790084,
0.0279318579,
0.0083922101,
-0.0206332989,
-0.0623439811,
-0.0283181779,
0.0194201842,
-0.0360995606,
-0.0146288136,
-0.0343820043,
0.0150679192,
-0.0287550855,
-0.0173342843,
-0.0353738181,
0.1212059706,
0.0410613269,
0.0594697334,
-0.0411364995,
-0.0070348345,
0.0079861665,
0.017971959,
0.0110538732,
0.0390482582,
-0.0024026046,
-0.0353778563,
0.014001416,
0.0229275227,
-0.0100745223,
0.0200572368,
0.044213146,
0.0083899386,
0.0368935503,
0.0474514626,
-0.023946343,
0.044176057,
-0.0184172317,
0.0643697158,
-0.0037938014,
-0.0414736606,
-0.012716678,
-0.0305536389,
0.0295148306,
-0.0499612056,
0.0213498622,
-0.0293764416,
0.0247344561,
-0.04535513,
-0.0242566094,
0.0144114988,
0.0154011678,
0.0250254441,
-0.0226014517,
0.0198221207,
-0.0134301828,
-0.0087794065,
-0.0360809751,
-0.0215101093,
0.0120552434,
-0.0110391984,
0.0349443741,
0.0247976463,
-0.0180906542,
-0.0331310071,
0.0365732014,
0.0014791918,
0.0325356834,
-0.0295489207,
-0.0544516556,
-0.0193147901,
-0.033356864,
-0.0164007936,
-0.0502496101,
0.0265280437,
0.0060925703,
0.0502372161,
0.0341086313,
-0.0251622461,
-0.0261270944,
0.0254502334,
0.0334242284,
-0.0223658625,
-0.0221175216,
-0.0383289754,
-0.0079270527,
-0.0294444785,
-0.0294273868,
-0.0048331949,
0.0168414917,
0.0450549126,
-0.0041373195,
-0.0304317512,
0.0390468389,
0.0295974873,
-0.0131256906,
0.0482443571,
-0.0348374359,
-0.0401860513,
0.0134029957,
0.0632714778,
-0.0295371562,
0.0223626588,
0.0216887817,
-0.0408438593,
0.0354124792,
0.0271598957,
0.0407038219,
-0.0113523025,
0.0447149687,
0.0515610762,
-0.0254576467,
0.0421642549,
0.0016588916,
-0.0400784574,
-0.0405693576,
-0.0093578855,
0.026271116,
-0.0287056975,
-0.0404037163,
-0.0193084944,
-0.0327932239,
-0.0138056781,
0.0255068317,
0.025912568,
-0.0126587125,
0.020332545,
-0.0077821352,
0.0290759746,
-0.03563153,
0.0462082513,
0.0201782435,
0.0451675206,
-0.0551839508,
0.0270315371,
0.0681210309,
-0.0284195505,
-0.0032899112,
-0.0313073434,
0.0423563011,
-0.0336817726,
0.0293466263,
-0.0008110962,
0.0156540889,
0.0148133337,
0.0478417054,
0.0214194581,
-0.0185815543,
-0.0323111303,
0.0128650926,
0.020402059,
-0.0104735494,
0.0469952859,
-0.0105279442,
-0.0304869786,
0.0366102457,
0.0149554564,
0.0309617445,
-0.0316070244,
0.0125645753,
0.0279128887,
0.0471985713,
-0.0322069339,
-0.0408290699,
-0.006752301,
-0.0099181067,
0.0246835668,
-0.0301144999,
0.0007761294,
0.0156682525,
-0.0210529659,
-0.0142926583,
0.0035671161,
0.022564115,
0.0006949899,
-0.0356223918,
-0.0275947452,
-0.0033707574,
-0.0333421454,
0.0814901888,
0.0319818072,
0.0303727482,
-0.0355849862,
0.0384520143,
0.0183307379,
0.0113455607,
-0.012709639,
-0.0340244211,
0.0052889735,
0.0233105049,
0.0253675934,
-0.0248706359,
-0.0161531884,
0.020203365,
-0.0119545115,
0.0266116261,
0.0015284149,
0.0506447032,
0.0288682655,
-0.0344327092,
-0.0427137278,
0.0164817795,
0.0211026557,
-0.0268885437,
-0.0176220592,
0.0108112954,
-0.0092313858,
0.0416985601,
-0.0222866815,
0.0213053208,
-0.0326152146,
-0.0076078107,
-0.0118786199,
0.0414005257,
-0.0330643244,
-0.014406533,
-0.0183761306,
-0.0666471943,
0.0087963324,
-0.0227254033,
-0.0050551924,
-0.0135915671,
0.0267581809,
0.0049198424,
0.0232600663,
0.0439702943,
-0.0033316161,
-0.0458412915,
0.0262015704,
0.0313095637,
0.0312864818,
0.0300762411,
0.0283417627,
0.0482886322,
0.0091890767,
-0.0200660564,
0.0079197874,
0.0058328044,
0.0328955874,
-0.0507892594,
-0.0351615734,
-0.0378128216,
0.0568035543,
0.0093723312,
-0.0007100698,
-0.0312908404,
-0.0117230108,
0.0187149681,
-0.0201639421,
-0.0319319814,
-0.0417805761,
-0.0142896157,
0.0121932253,
0.0212670974,
0.0534488671,
-0.0344997607,
0.0595792904,
-0.02013533,
0.0246236008,
0.0042019305,
0.0089885257,
0.0521499924,
-0.0444618873,
-0.0438503399,
0.0097546717,
0.013979855,
0.0345792323,
-0.0616038404,
0.0141092837,
0.0469235964,
-0.025821209,
-0.0312981084,
0.0087995725,
-0.0371481851,
-0.0204385351,
-0.031223027,
0.0102228839,
-0.0092473822,
-0.0457439125,
-0.0162340682,
-0.0357426852,
0.0244887955,
0.0055851829,
-0.0418862551,
0.0012564531,
-0.0660171956,
0.0620723069,
-0.0126664573,
-0.031107774,
-0.0314597823,
0.0451110192,
0.0146396356,
-0.0378251635,
-0.0116761848,
-0.0163542107,
-0.0380556732,
-0.0454918928,
0.0305940136,
0.028994102,
-0.0074521531,
0.0008181499,
0.0155450404,
-0.0309920236,
0.0529217049,
0.0051110564,
-0.027355006,
-0.0318875499,
-0.0009333206,
-0.0248232968,
0.0127897337,
-0.0034162374,
0.0247016437,
-0.0165034197,
-0.0388171747,
-0.0231051464,
-0.0325858183,
0.0385181569,
-0.0175988898,
-0.0252954829,
-0.0224670563,
-0.0341567621,
-0.006135799,
-0.0235438906,
0.0200684387,
0.0412655436,
0.0129300421,
-0.0130231436,
-0.0354661196,
-0.046388682,
-0.0054452177,
0.0016485839,
0.0222829618,
0.0313020907,
-0.0035835647,
-0.0203108396,
0.0224885568,
-0.028149616,
-0.0071915374,
-0.0194772165,
0.0241824128,
0.0204026848,
0.0073143337,
0.029760154,
0.0159036554,
0.0225175284,
-0.015433616,
0.0208503548,
-0.0099719223,
-0.0255125761,
-0.0144161787,
-0.0495240502,
-0.0189127903,
-0.0296715423,
-0.0179692786,
-0.0503818542,
-0.0400476418,
0.0013398189,
0.0443602912,
0.0234716777,
-0.0287368111,
0.0371714942,
0.0107460925,
-0.0081192423,
0.0290751047,
-0.0062438874,
-0.0260490924,
-0.0159503855,
0.0542117693,
-0.0184430685,
-0.0001741781,
0.000105841,
0.0387855135,
-0.0454050675,
0.0197075289,
-0.0263075717,
-0.0466152541,
-0.0000517273,
-0.0407572724,
0.0419508219,
-0.0103557967,
-0.0106247691,
0.01329502,
-0.0242548492,
0.0026721237,
-0.0307056867,
0.0446519591,
-0.0385647044,
-0.0201329049,
-0.0328766666,
-0.0227338206,
-0.0471698269,
0.0280558448,
-0.0421798155,
0.0155909369,
0.028434081,
0.0205651801,
0.0171259306,
-0.0262722075,
-0.0418900512,
-0.0464811064,
0.0544118509,
-0.0377529524,
-0.010740743,
0.0328052007,
0.0474963635,
0.0329601169,
-0.0237841383,
0.0705928057,
-0.006729302,
0.0402654596,
-0.0601078272,
-0.0055932449,
0.0124147236,
-0.0190048851,
-0.0137882056,
-0.0133231683,
-0.0283970572,
0.0310874227,
-0.0289976168,
-0.0318350047,
0.0166407116,
0.034679573,
0.0308916308,
-0.0140700368,
-0.0268861856,
0.0401877426,
0.0284167994,
-0.0332013033,
0.013229182,
-0.0155338338,
0.0144393016,
-0.0053956243,
0.0018296494,
0.0035085836,
0.0317150056,
0.0099322228,
0.0011139049,
0.0100091966,
-0.0245791171,
-0.0447922572,
-0.0098157022,
-0.0167717542,
0.0208248887,
0.0267619006,
-0.0394736081,
0.0342808142,
-0.0176907256,
-0.0079401312,
-0.0325489417,
-0.000136078,
-0.0459268279,
-0.0205733497,
-0.030992426,
0.0258086361,
-0.0096477475,
0.0337738246,
0.0217929054,
-0.0099123102,
0.0262974706,
-0.0392559543,
-0.0261121076,
-0.0383180864,
0.0248299967,
0.0225088671,
0.0481038801,
0.0214297511,
0.0067982767,
-0.0047251037,
0.0192816295,
0.025647385,
0.0276637208,
-0.0286916886,
-0.0399720371,
-0.0515545867,
0.0031136288,
-0.044454433,
-0.0496935435,
-0.0071890517,
-0.0246596057,
0.0019692394,
0.0235680323,
-0.0646600574,
0.0376344696,
0.030344978,
0.0349166021,
-0.0516276583,
-0.0142631456,
0.0244923756,
0.0302715562,
0.0298976712,
-0.0289576091,
-0.0081273736,
0.0327973962,
0.0049907914,
-0.0411853082,
0.0213390049,
-0.0022287539,
-0.0243975446,
-0.0429922007,
0.0262807719,
-0.0540811867,
-0.0225217231,
0.0401476435,
-0.0347246975,
0.0188993979,
-0.0435507856,
-0.0276457723,
0.0111918133,
0.0131014818,
0.0323058441,
-0.026858503,
-0.022298621,
0.0214992072,
0.0202586278,
0.0123076849,
-0.0448534973,
0.0254809484,
-0.0089451885,
0.0247238986,
0.0094152214,
-0.0396872684,
-0.0049851462,
-0.0493030436,
-0.0076600472,
-0.0098495325,
0.0261633303,
0.0226382837,
0.0130626643,
-0.0403791144,
0.0194507521,
-0.0074133319,
-0.0322876349,
0.0390661284,
-0.0157513861,
-0.0232072789,
-0.0470984094,
0.0295367986,
-0.0086077694,
0.0023800987,
-0.0450371541,
0.0434041545,
-0.0752443075,
0.0371450111,
0.0121524232,
-0.0163129028,
-0.0222043246,
-0.027562689,
0.0792250633,
-0.0461375415,
0.0463698804,
-0.0236569718,
-0.0382023938,
-0.0239726454,
-0.022758849,
-0.0277945865,
0.0390588827,
-0.0323359407,
-0.0151317567,
-0.0384403728,
-0.0280766897,
0.0371154621,
-0.041675929,
-0.032662753,
0.0172623806,
-0.0005475966,
0.0271037742,
0.0006433257,
-0.0168039426,
0.0409340709,
-0.0406970643,
-0.0092932684,
0.0338824689,
-0.0222130045,
0.0350924544,
-0.0158042423,
-0.0223131422,
0.0174127258,
0.0427834354,
-0.0071554137,
-0.0322854705,
0.0292007923,
0.0156856608,
-0.0471513718,
0.048447337,
0.0183705296,
0.0196946822,
0.0249012057,
0.055753082,
0.049804423,
0.0095135421,
-0.018171696,
-0.0304422602,
0.0013614555,
0.0260705967,
0.0129869459,
0.0690831989,
-0.0123054367,
0.0183607712,
-0.025714837,
0.038501028,
0.0157119315,
0.0149366427,
-0.0733807087,
0.0145389112,
0.0364543013,
-0.0317060612,
-0.0254941229,
0.039663706,
-0.0340209715,
-0.0249860864,
0.0298891813,
-0.0083171288,
0.038008593,
-0.02306097,
-0.0496346876,
0.030797882,
-0.0400088541,
0.0218714513,
-0.0286973603,
0.0429913178,
-0.0013876557,
-0.0249241609,
0.005262584,
0.0352865122,
0.0386088155,
0.0526812449,
0.0158120226,
-0.0322280042,
0.0142834345,
0.0503321029,
0.0596599281,
0.0551041663,
-0.0521138385,
-0.024790477,
-0.0027090984,
0.0322798342,
-0.0368478782,
0.0060664196,
-0.0347755663,
-0.0091366656,
0.0440737791,
0.0134830838,
0.0105462698,
0.0221342091,
-0.0110708773,
-0.0341020115,
0.0155420471,
-0.0004499416,
0.0299057104,
0.0636167303,
0.0486852899,
-0.0117931152,
-0.0233345088,
0.0397362895
],
"noun": "'hood",
"pca_x": -0.0585507676,
"pca_y": 0.071247533,
"tsne_x": -2.656069994,
"tsne_y": 5.315205574,
"umap_x": -0.5095627308,
"umap_y": -1.0305397511
}