Semantle, created by David Turner, is an interesting spin on the recent trend of word guessing games. Instead of comparing spellings, Semantle scores each guess by its semantic similarity to the secret word. Here I threw together a bit of code to visually explore a game of Semantle using a UMAP representation of the underlying word2vec word embedding. You can follow along with my bumbling Semantle guesses or, probably more fun, visualize your own.
All of the packages used here can be installed with pip; all except Babyplots are also available via conda.
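For reference, a single pip command along these lines should cover the dependencies (the package names are my assumption of the PyPI names, in particular umap-learn for the umap import):

pip install gensim pandas numpy umap-learn tqdm babyplots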
# import re
# from hashlib import sha1
import gensim
import pandas as pd
import umap.umap_ as umap
import numpy as np
from numpy import log10
from tqdm import tqdm
from babyplots import Babyplot
First we load the full word2vec model (not included in the repository, but you can find it here).
# model = gensim.models.KeyedVectors.load_word2vec_format("../GoogleNews-vectors-negative300.bin", binary=True)
Semantle has a list of allowed words and a list of banned words. We need to filter the model by these so that our UMAP is representative (filtering code adapted from the original Semantle source code). These lists are also not included in this repository, but you can find them in the Semantle repository.
# allowable_words = set()
# with open("words_alpha.txt") as walpha:
#     for line in walpha.readlines():
#         allowable_words.add(line.strip())

# banned_hashes = set()
# with open("banned.txt") as f:
#     for line in f:
#         banned_hashes.add(line.strip())

# simple_word = re.compile("^[a-z]*$")
# words = []
# for word in model.key_to_index:
#     if simple_word.match(word) and word in allowable_words:
#         h = sha1()
#         h.update(("banned" + word).encode("ascii"))
#         hash = h.hexdigest()
#         if hash not in banned_hashes:
#             words.append(word)

# len(words)
Now we create a subset of the word2vec model with just the allowed vectors (this takes quite a while, because I add the vectors one by one instead of in batches, which would be faster; a sketch of a batch variant follows below).
# w2v_allowed = gensim.models.keyedvectors.KeyedVectors(300)
# for word in tqdm(words):
#     v = model.get_vector(word)
#     w2v_allowed.add_vector(word, v)
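As an aside, the loop above could probably be replaced by a single batch call; a minimal sketch, assuming the gensim 4.x KeyedVectors API (add_vectors) and the full model and words list from the cells above, commented out here like the rest of the one-time preprocessing:

# w2v_allowed = gensim.models.keyedvectors.KeyedVectors(300)
# vectors = model[words]  # one (len(words), 300) array instead of many single lookups
# w2v_allowed.add_vectors(words, vectors)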
Save it, so we don't have to do it again.
# w2v_allowed.save_word2vec_format("allowed_word2vec.bin", binary=True)
Now we load the model with only the allowed words.
model = gensim.models.KeyedVectors.load_word2vec_format("allowed_word2vec.bin", binary=True)
This example is based on Semantle #53, where the secret word was "shot". So we first get the 1000 closest words to "shot".
secret_word = "shot"
topn = 1000
top_words = model.most_similar(secret_word, topn=topn)
top_words.insert(0, (secret_word, 1))
included_words = [x[0] for x in top_words]
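As a quick sanity check, these similarities are what Semantle (to my understanding) reports as the score, scaled by 100; for example, for the runner-up in top_words:

# Cosine similarity between the secret word and a close word, scaled like a Semantle score
# (the x100 scaling is my assumption about how the game reports it).
print(model.similarity(secret_word, "shots") * 100)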
Now we load my guesses. You can also do this for your own Semantle attempts by replacing the words in guesses.txt (and the secret word).
guesses = []
with open("guesses.txt", "r") as guesses_file:
for line in guesses_file:
guesses.append(line.rstrip("\n"))
Next, we get the 1000 closest words around each guess, keeping only words that are not already included.
n_words_around_guess = 1000
for w in tqdm(guesses):
    if w in included_words:
        continue
    try:
        words_around_guess = model.most_similar(w, topn=n_words_around_guess)
        words_around_guess = [x for x in words_around_guess if x[0] not in included_words]
        top_words += words_around_guess
        included_words += [x[0] for x in words_around_guess]
    except KeyError:
        continue
len(top_words)
100%|██████████| 60/60 [00:04<00:00, 12.51it/s]
20826
Put the secret word at the end of the guess list, so that the plotted path of guesses ends at the solution.
guesses.append(secret_word)
Here, we get the vectors for the selected words...
model_reduced = model[[w[0] for w in top_words]]
model_reduced.shape
(20826, 300)
... and run the UMAP dimensionality reduction.
reducer = umap.UMAP(metric='cosine', n_neighbors=15, min_dist=0.05, random_state=42, n_components=3)
embedding = reducer.fit_transform(model_reduced)
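Since UMAP on roughly 20,000 points can take a while, you may want to cache the result; a small optional sketch (the file name is just an example):

np.save("embedding.npy", embedding)      # cache the reduced coordinates
# embedding = np.load("embedding.npy")   # reload later instead of re-running UMAP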
Finally, we create a dataframe to organize the data for visualization...
d = pd.DataFrame(embedding, columns=['umap1', 'umap2', 'umap3'])
d['word'] = [w[0] for w in top_words]
d['similarity'] = [w[1] for w in top_words]
d['log_similarity'] = d['similarity'].apply(log10)
d['word_index'] = np.arange(len(d)) + 1
d['log_word_index'] = d['word_index'].apply(log10)
d['word_index_rev'] = len(d) - d['word_index']
d['log_word_index_rev'] = 1 - d["log_word_index"]
d.head()
|   | umap1 | umap2 | umap3 | word | similarity | log_similarity | word_index | log_word_index | word_index_rev | log_word_index_rev |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 11.525807 | 8.440066 | 6.365793 | shot | 1.000000 | 0.000000 | 1 | 0.000000 | 20825 | 1.000000 |
| 1 | 11.597922 | 8.653831 | 6.365990 | shots | 0.694082 | -0.158589 | 2 | 0.301030 | 20824 | 0.698970 |
| 2 | 11.313740 | 8.286745 | 6.431290 | shooting | 0.646509 | -0.189426 | 3 | 0.477121 | 20823 | 0.522879 |
| 3 | 11.395639 | 8.349465 | 6.409628 | shoot | 0.602124 | -0.220314 | 4 | 0.602060 | 20822 | 0.397940 |
| 4 | 11.395675 | 8.422284 | 6.300252 | fired | 0.552951 | -0.257313 | 5 | 0.698970 | 20821 | 0.301030 |
... get the coordinates of the guessed words ...
guesses = sorted(set([g for g in guesses if g in d["word"].tolist()]), key=guesses.index)
d_guesses = d.loc[d["word"].isin(guesses)]
d_guesses = d_guesses.set_index("word").loc[guesses].reset_index()
d_guesses["order"] = d_guesses.index
... and create the babyplots visualization of the UMAP and the guessed path. Drag the mouse over the plot to rotate and shift+scroll to zoom in and out.
bp = Babyplot(background_color="#262020ff")
bp.add_plot_from_dataframe(
    d,
    "pointCloud",
    "values",
    "log_word_index_rev",
    ["umap1", "umap2", "umap3"],
    {
        "colorScale": "Spectral"
    }
)
bp.add_plot_from_dataframe(d_guesses, "line", "values", "order", ["umap1", "umap2", "umap3"], {
    "colorScale": "YlGnBu",
    "labels": d_guesses["word"].tolist(),
    "labelSize": 80,
    "labelColor": "white",
    "colorScaleInverted": True
})
bp