Skip to content

Instantly share code, notes, and snippets.

@jweinst1
Created November 3, 2025 00:08
Show Gist options
  • Save jweinst1/07d0b3b4b558048fbc2b1cff36299cb2 to your computer and use it in GitHub Desktop.
Save jweinst1/07d0b3b4b558048fbc2b1cff36299cb2 to your computer and use it in GitHub Desktop.
hamming in a skip gram context
#include <vector>
#include <cstdint>
#include <cstdio>
#include <cassert>
#include <iostream>
#include <bitset>
#include <queue>
#include <unordered_map>
#include <string>
typedef std::vector<std::vector<std::string>> WordBank;
/**
* This is a way of implementing skip-gram in a Hamming-distance setting.
* Instead of float dimensions, each bit represents a possible context the word appears in.
* Two vectors are compared via their Hamming distance.
* Granularity is achieved through specific contexts of variable length, e.g.
* "the (target) is good"
* "the (target) is really good"
* The two contexts above set two different bits.
* This allows more sensitivity in the Hamming world.
*
* */
/**
* For now this is just word ahead contexts
* */
/**
 * Assigns a numeric id to every distinct word in `bank` and records it in `corp`.
 *
 * Sequences are visited in order and words left to right. A word that is
 * already a key of `corp` keeps the id it has. A new word receives the number
 * of entries currently in `corp`, so with an initially empty map the ids are
 * 0, 1, 2, ... in order of first appearance.
 *
 * Calling this again with another bank on the same map continues the
 * numbering instead of restarting at 0, so ids stay unique provided the
 * existing ids are the dense range 0..size-1 (which is what this function
 * itself produces).
 *
 * @param bank  Word sequences to index.
 * @param corp  Word -> id map, updated in place.
 */
static void bankToCorp(const WordBank& bank, std::unordered_map<std::string, size_t>& corp) {
    for (const auto& seq : bank) {
        for (const auto& word : seq) {
            // The id argument is evaluated before try_emplace runs, i.e. it is
            // the map size *before* the insertion. try_emplace leaves the map
            // untouched when the word is already present.
            corp.try_emplace(word, corp.size());
        }
    }
}
/**
 * Writes every entry of `corp` to stdout, one per line, as "<word> -> <id>".
 *
 * Entries are printed in the map's own iteration order.
 *
 * @param corp  Word -> id map to dump.
 */
static void printCorp(const std::unordered_map<std::string, size_t>& corp) {
    for (auto entry = corp.begin(); entry != corp.end(); ++entry) {
        const std::string& word = entry->first;
        const size_t id = entry->second;
        std::printf("%s -> %zu\n", word.c_str(), id);
    }
}
/**
 * Builds the context bit vector of `target`.
 *
 * For every occurrence of `target` in `bank` that has a following word, the
 * bit whose index is that following word's id in `corp` is set in the result.
 *
 * @param corp    Word -> id map; the id is used as a bit index.
 * @param bank    Word sequences to scan.
 * @param target  Word whose following-word contexts are collected.
 * @return        Bit set with one bit per distinct following word; 0 if
 *                `target` never appears with a word after it.
 *
 * Following words that are missing from `corp`, or whose id does not fit in a
 * size_t bit index, trigger an assert in debug builds and are skipped when
 * asserts are compiled out.
 */
static size_t seqToBits(const std::unordered_map<std::string, size_t>& corp, const WordBank& bank, const std::string& target) {
    // Number of bit positions available in the result.
    constexpr size_t kBitCount = sizeof(size_t) * 8;

    size_t result = 0;
    for (const auto& seq : bank) {
        // `i + 1 < size()` instead of `i < size() - 1`: the latter wraps around
        // to SIZE_MAX for an empty sequence and reads out of bounds.
        for (size_t i = 0; i + 1 < seq.size(); ++i) {
            if (seq[i] != target) {
                continue;
            }
            const auto found = corp.find(seq[i + 1]);
            assert(found != corp.end());
            if (found == corp.end()) {
                continue;  // never dereference end() when asserts are disabled
            }
            const size_t bit = found->second;
            assert(bit < kBitCount);
            if (bit >= kBitCount) {
                continue;  // shifting by >= the type width is undefined behaviour
            }
            // Shift a size_t, not an int: `1 << bit` is only valid up to bit 30.
            result |= size_t{1} << bit;
        }
    }
    return result;
}
/**
 * Demo driver: indexes a two-sentence word bank, prints the word -> id table,
 * then prints the 8-bit context vector of the word "the".
 */
int main(int argc, char const *argv[])
{
    // Two sentences that differ only in the word following "the".
    static const WordBank myBank = {{"the", "ball", "is", "good"}, {"the", "town", "is", "good"}};

    // Build and dump the word -> id table.
    std::unordered_map<std::string, size_t> corp;
    bankToCorp(myBank, corp);
    printCorp(corp);

    // One bit per distinct word seen directly after "the".
    const std::bitset<8> contextBits(seqToBits(corp, myBank, "the"));
    std::cout << contextBits << "\n";

    /*
     * Output recorded by the author (the order of the table lines follows
     * the unordered_map's iteration order):
     *
     *   town -> 4
     *   good -> 3
     *   ball -> 1
     *   is -> 2
     *   the -> 0
     *   00010010
     *
     * Bits 1 ("ball") and 4 ("town") are set.
     */
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment