Dmitry Nikolayev (macleginn)

@macleginn
macleginn / bpe.py
Created November 15, 2024 10:33
BPE without word separation
import urllib.request
from collections import defaultdict
raw_bytes = urllib.request.urlopen(
    'http://www.sls.hawaii.edu/bley-vroman/brown.txt')
brown_corpus = raw_bytes.read().decode('utf8').replace('\r\n', '\n')
B = brown_corpus[:250]
def find_pairs(text):
    pairs = defaultdict(int)
    # Count adjacent symbol pairs over the whole sequence, with no
    # special treatment of word boundaries (plausible completion of
    # the truncated preview).
    for a, b in zip(text, text[1:]):
        pairs[(a, b)] += 1
    return pairs
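A minimal sketch of the merge loop that would follow, assuming BPE is run over the flat symbol sequence; merge_pair and the merge count are illustrative assumptions, not part of the original gist:

def merge_pair(symbols, pair):
    # Replace each occurrence of the given adjacent pair with a single
    # merged symbol (hypothetical helper).
    merged, i = [], 0
    while i < len(symbols):
        if i + 1 < len(symbols) and (symbols[i], symbols[i + 1]) == pair:
            merged.append(symbols[i] + symbols[i + 1])
            i += 2
        else:
            merged.append(symbols[i])
            i += 1
    return merged

symbols = list(B)
for _ in range(10):  # a handful of merges over the 250-character sample
    pairs = find_pairs(symbols)
    symbols = merge_pair(symbols, max(pairs, key=pairs.get))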
import pickle
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
with open('sts_attributions/shelf_approx_attr_l-9_N-100.pkl', 'rb') as inp:
    shelf_approx = pickle.load(inp)
@macleginn
macleginn / xsbert_worker_process.py
Last active September 8, 2023 12:50
XSBERT worker process
import os
import sys
import pickle
import requests
import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import Pooling
from sentence_transformers import util
from xsbert import models
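A hedged sketch of the worker loop these imports presumably lead into, reusing the preview's imports; the queue URL, the empty-queue signal, and the job fields are assumptions rather than code from the gist:

QUEUE_URL = 'http://localhost:20000'  # assumed to match the queue server below

def worker_loop(attribution_fn):
    # Poll the queue server for jobs until it reports an empty queue.
    while True:
        resp = requests.get(QUEUE_URL)
        if resp.status_code != 200:  # assumed signal for an exhausted queue
            break
        job = resp.json()
        result = attribution_fn(job)  # hypothetical per-job computation
        with open(f"attributions_{job['id']}.pkl", 'wb') as out:  # assumed 'id' field
            pickle.dump(result, out)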
@macleginn
macleginn / xsbert_queue_server.py
Created September 8, 2023 11:44
XSBERT queue server
import json
from http.server import BaseHTTPRequestHandler, HTTPServer
import pandas as pd
hostName = "localhost"
serverPort = 20000
# A global variable to store the queue elements
queue = []
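A plausible continuation, sketching how BaseHTTPRequestHandler could hand out one queue element per GET request; the status codes and payload shape are assumptions:

class QueueHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        # Pop the next job off the global queue and return it as JSON.
        if queue:
            body = json.dumps(queue.pop(0)).encode('utf8')
            self.send_response(200)
            self.send_header('Content-Type', 'application/json')
            self.end_headers()
            self.wfile.write(body)
        else:
            self.send_response(204)  # assumed signal for an exhausted queue
            self.end_headers()

if __name__ == '__main__':
    HTTPServer((hostName, serverPort), QueueHandler).serve_forever()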
@macleginn
macleginn / clusterise_domain.py
Created December 20, 2022 08:28
Clustering of fine-grained CMP domains based on SBERT sentence similarities
from collections import defaultdict
from itertools import combinations
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
def compute_kernel_bias(vecs, k=None):
    """
    Compute the whitening kernel and bias: y = (x + bias) @ kernel.
    Code taken from: https://github.com/bojone/BERT-whitening
    (the preview is truncated; the body below follows that repository).
    """
    mu = vecs.mean(axis=0, keepdims=True)
    cov = np.cov(vecs.T)
    u, s, vh = np.linalg.svd(cov)
    W = np.dot(u, np.diag(1 / np.sqrt(s)))
    return (W if k is None else W[:, :k]), -mu
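The kernel and bias are typically applied with a companion transform; a sketch following the same repository, assuming vecs is a NumPy array of sentence embeddings:

def transform_and_normalize(vecs, kernel=None, bias=None):
    # Apply the whitening transform, then length-normalise each row.
    if kernel is not None and bias is not None:
        vecs = (vecs + bias).dot(kernel)
    return vecs / ((vecs ** 2).sum(axis=1, keepdims=True) ** 0.5)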
import os
import sys
import shutil
def copy_tree(src, dst):
    '''
    Copy a directory tree from src to dst, ignoring dangling
    symlinks, retrieving the files symlinks point to, and
    breaking cycles, i.e. never copying the same directory twice.
    '''
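The body is cut off in the preview; a hedged sketch of one way to implement the documented behaviour, using resolved real paths for cycle detection (the whole body is an assumption, not the original code):

def copy_tree_sketch(src, dst, seen=None):
    if seen is None:
        seen = set()
    real = os.path.realpath(src)
    if real in seen:  # already visited: break symlink cycles
        return
    seen.add(real)
    os.makedirs(dst, exist_ok=True)
    for name in os.listdir(src):
        s, d = os.path.join(src, name), os.path.join(dst, name)
        if not os.path.exists(s):  # exists() follows links, so dangling symlinks are skipped
            continue
        if os.path.isdir(s):
            copy_tree_sketch(s, d, seen)
        else:
            shutil.copyfile(s, d)  # copies the file a symlink points to, not the link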
@macleginn
macleginn / predict_from_CLS.py
Last active June 3, 2022 13:54
Training and evaluation code for a simple model that predicts a token removed from a sentence
import json
from math import ceil
from random import shuffle
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from transformers import AdamW, get_scheduler
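A minimal sketch of the kind of model the description implies: encode the sentence and predict the removed token from the first ([CLS]) embedding. The encoder name and the single linear head are assumptions:

class CLSTokenPredictor(nn.Module):
    def __init__(self, encoder_name='bert-base-uncased'):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(encoder_name)
        hidden_size = self.encoder.config.hidden_size
        self.head = nn.Linear(hidden_size, self.encoder.config.vocab_size)

    def forward(self, input_ids, attention_mask):
        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = out.last_hidden_state[:, 0]  # first ([CLS]) token
        return self.head(cls_embedding)  # logits over the vocabulary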
@macleginn
macleginn / simulation_step.py
Last active February 11, 2022 12:01
A step in the simulation of random feature spread on a network guided by NPM
import numpy as np
# We're given an n by n distance matrix *D* with transfer
# probabilities for a given pair of nodes (for any feature),
# a feature matrix *M*, and a dropout probability p_d.
# We convert the transfer probabilities to no-transfer probabilities
# and take their logs
L = np.log(1 - D)
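Under these definitions, summing the logs over all current carriers of a feature gives the log-probability that a node receives it from no one; a sketch of one full step, where the dropout scheme and update order are assumptions:

def simulation_step(M, L, p_d, rng=None):
    # For node i and feature k, (L @ M)[i, k] = sum_j L[i, j] * M[j, k]
    # is the log-probability that no carrier of k transfers it to i.
    if rng is None:
        rng = np.random.default_rng()
    p_transfer = 1.0 - np.exp(L @ M)
    gains = rng.random(M.shape) < p_transfer
    losses = rng.random(M.shape) < p_d  # random feature dropout
    return np.where(losses, 0, np.maximum(M, gains.astype(M.dtype)))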
# Collect all possible punctuation characters together
import sys
from unicodedata import category
chrs = (chr(i) for i in range(sys.maxunicode + 1))
punctuation = set(c for c in chrs if category(c).startswith("P"))
# Hyphens can occur inside words
punctuation.remove('-')
def tokenize(s, lower_case=False):
    # Plausible completion of the truncated preview: pad punctuation
    # with spaces, then split on whitespace.
    if lower_case:
        s = s.lower()
    for p in punctuation:
        if p in s:
            s = s.replace(p, f' {p} ')
    return s.split()
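For illustration, a call under the completion assumed above:

print(tokenize('Hello, world - or not!'))
# ['Hello', ',', 'world', '-', 'or', 'not', '!']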
@macleginn
macleginn / get_roberta_word_embeddings.py
Created June 21, 2021 07:17
Code for extracting word embeddings from RoBERTa
def rm_whitespace(s):
    if s.startswith('Ġ'):
        return s[1:]
    else:
        return s
def get_tokens_with_ranges(input_string, tokenizer):
    '''
    RoBERTa prepends 'Ġ' to the beginning of what it
    treats as the start of a new word, i.e. a token that
    was preceded by whitespace in the input.
    '''
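    # Plausible completion of the truncated preview (this body is an
    # assumption, not the original code): tokenise, strip the 'Ġ'
    # marker, and align each token with its character range in the
    # original string.
    tokens = tokenizer.tokenize(input_string)
    ranges, cursor = [], 0
    for token in tokens:
        word = rm_whitespace(token)
        start = input_string.find(word, cursor)
        ranges.append((start, start + len(word)))
        cursor = start + len(word)
    return tokens, ranges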