Dmitry Nikolayev (macleginn)

@macleginn
macleginn / bpe.py
Created November 15, 2024 10:33
BPE without word separation
import urllib.request
from collections import defaultdict
raw_bytes = urllib.request.urlopen(
    'http://www.sls.hawaii.edu/bley-vroman/brown.txt')
brown_corpus = raw_bytes.read().decode('utf8').replace('\r\n', '\n')
B = brown_corpus[:250]
def find_pairs(text):
    pairs = defaultdict(int)
    # Count adjacent symbol pairs over the whole sequence, with no
    # special treatment of word boundaries (plausible completion of
    # the truncated preview).
    for a, b in zip(text, text[1:]):
        pairs[(a, b)] += 1
    return pairs
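A minimal sketch of the merge loop that would follow, assuming BPE is run over the flat symbol sequence; merge_pair and the merge count are illustrative assumptions, not part of the original gist:

def merge_pair(symbols, pair):
    # Replace each occurrence of the given adjacent pair with a single
    # merged symbol (hypothetical helper).
    merged, i = [], 0
    while i < len(symbols):
        if i + 1 < len(symbols) and (symbols[i], symbols[i + 1]) == pair:
            merged.append(symbols[i] + symbols[i + 1])
            i += 2
        else:
            merged.append(symbols[i])
            i += 1
    return merged

symbols = list(B)
for _ in range(10):  # a handful of merges over the 250-character sample
    pairs = find_pairs(symbols)
    symbols = merge_pair(symbols, max(pairs, key=pairs.get))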
import pickle
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
with open('sts_attributions/shelf_approx_attr_l-9_N-100.pkl', 'rb') as inp:
    shelf_approx = pickle.load(inp)
@macleginn
macleginn / xsbert_worker_process.py
Last active September 8, 2023 12:50
XSBERT worker process
import os
import sys
import pickle
import requests
import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import Pooling
from sentence_transformers import util
from xsbert import models
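A hedged sketch of the worker loop these imports presumably lead into, reusing the preview's imports; the queue URL, the empty-queue signal, and the job fields are assumptions rather than code from the gist:

QUEUE_URL = 'http://localhost:20000'  # assumed to match the queue server below

def worker_loop(attribution_fn):
    # Poll the queue server for jobs until it reports an empty queue.
    while True:
        resp = requests.get(QUEUE_URL)
        if resp.status_code != 200:  # assumed signal for an exhausted queue
            break
        job = resp.json()
        result = attribution_fn(job)  # hypothetical per-job computation
        with open(f"attributions_{job['id']}.pkl", 'wb') as out:  # assumed 'id' field
            pickle.dump(result, out)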
@macleginn
macleginn / xsbert_queue_server.py
Created September 8, 2023 11:44
XSBERT queue server
import json
from http.server import BaseHTTPRequestHandler, HTTPServer
import pandas as pd
hostName = "localhost"
serverPort = 20000
# A global variable to store the queue elements
queue = []
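A plausible continuation, sketching how BaseHTTPRequestHandler could hand out one queue element per GET request; the status codes and payload shape are assumptions:

class QueueHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        # Pop the next job off the global queue and return it as JSON.
        if queue:
            body = json.dumps(queue.pop(0)).encode('utf8')
            self.send_response(200)
            self.send_header('Content-Type', 'application/json')
            self.end_headers()
            self.wfile.write(body)
        else:
            self.send_response(204)  # assumed signal for an exhausted queue
            self.end_headers()

if __name__ == '__main__':
    HTTPServer((hostName, serverPort), QueueHandler).serve_forever()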
@macleginn
macleginn / clusterise_domain.py
Created December 20, 2022 08:28
Clustering of fine-grained CMP domains based on SBERT sentence similarities
from collections import defaultdict
from itertools import combinations
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
def compute_kernel_bias(vecs, k=None):
    """
    Compute the whitening kernel and bias: y = (x + bias) @ kernel.
    Code taken from: https://github.com/bojone/BERT-whitening
    (the preview is truncated; the body below follows that repository).
    """
    mu = vecs.mean(axis=0, keepdims=True)
    cov = np.cov(vecs.T)
    u, s, vh = np.linalg.svd(cov)
    W = np.dot(u, np.diag(1 / np.sqrt(s)))
    return (W if k is None else W[:, :k]), -mu
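The kernel and bias are typically applied with a companion transform; a sketch following the same repository, assuming vecs is a NumPy array of sentence embeddings:

def transform_and_normalize(vecs, kernel=None, bias=None):
    # Apply the whitening transform, then length-normalise each row.
    if kernel is not None and bias is not None:
        vecs = (vecs + bias).dot(kernel)
    return vecs / ((vecs ** 2).sum(axis=1, keepdims=True) ** 0.5)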
import os
import sys
import shutil
def copy_tree(src, dst):
    '''
    Copy a directory tree from src to dst, ignoring dangling
    symlinks, retrieving the files symlinks point to, and
    breaking cycles, i.e. never copying the same directory twice.
    '''
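The body is cut off in the preview; a hedged sketch of one way to implement the documented behaviour, using resolved real paths for cycle detection (the whole body is an assumption, not the original code):

def copy_tree_sketch(src, dst, seen=None):
    if seen is None:
        seen = set()
    real = os.path.realpath(src)
    if real in seen:  # already visited: break symlink cycles
        return
    seen.add(real)
    os.makedirs(dst, exist_ok=True)
    for name in os.listdir(src):
        s, d = os.path.join(src, name), os.path.join(dst, name)
        if not os.path.exists(s):  # exists() follows links, so dangling symlinks are skipped
            continue
        if os.path.isdir(s):
            copy_tree_sketch(s, d, seen)
        else:
            shutil.copyfile(s, d)  # copies the file a symlink points to, not the link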
@macleginn
macleginn / predict_from_CLS.py
Last active June 3, 2022 13:54
Training and evaluation code for a simple model that predicts a token removed from a sentence
import json
from math import ceil
from random import shuffle
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from transformers import AdamW, get_scheduler
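A minimal sketch of the kind of model the description implies: encode the sentence and predict the removed token from the first ([CLS]) embedding. The encoder name and the single linear head are assumptions:

class CLSTokenPredictor(nn.Module):
    def __init__(self, encoder_name='bert-base-uncased'):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(encoder_name)
        hidden_size = self.encoder.config.hidden_size
        self.head = nn.Linear(hidden_size, self.encoder.config.vocab_size)

    def forward(self, input_ids, attention_mask):
        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = out.last_hidden_state[:, 0]  # first ([CLS]) token
        return self.head(cls_embedding)  # logits over the vocabulary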
@macleginn
macleginn / simulation_step.py
Last active February 11, 2022 12:01
A step in the simulation of random feature spread on a network guided by NPM
import numpy as np
# We're given an n by n distance matrix *D* with transfer
# probabilities for a given pair of nodes (for any feature),
# a feature matrix *M*, and a dropout probability p_d.
# We convert the transfer probabilities to no-transfer probabilities
# and take their logs
L = np.log(1 - D)
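Under these definitions, summing the logs over all current carriers of a feature gives the log-probability that a node receives it from no one; a sketch of one full step, where the dropout scheme and update order are assumptions:

def simulation_step(M, L, p_d, rng=None):
    # For node i and feature k, (L @ M)[i, k] = sum_j L[i, j] * M[j, k]
    # is the log-probability that no carrier of k transfers it to i.
    if rng is None:
        rng = np.random.default_rng()
    p_transfer = 1.0 - np.exp(L @ M)
    gains = rng.random(M.shape) < p_transfer
    losses = rng.random(M.shape) < p_d  # random feature dropout
    return np.where(losses, 0, np.maximum(M, gains.astype(M.dtype)))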
# Collect all possible punctuation characters together
import sys
from unicodedata import category
chrs = (chr(i) for i in range(sys.maxunicode + 1))
punctuation = set(c for c in chrs if category(c).startswith("P"))
# Hyphens can occur inside words
punctuation.remove('-')
def tokenize(s, lower_case=False):
    # Plausible completion of the truncated preview: pad punctuation
    # with spaces, then split on whitespace.
    if lower_case:
        s = s.lower()
    for p in punctuation:
        if p in s:
            s = s.replace(p, f' {p} ')
    return s.split()
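For illustration, a call under the completion assumed above:

print(tokenize('Hello, world - or not!'))
# ['Hello', ',', 'world', '-', 'or', 'not', '!']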
@macleginn
macleginn / get_roberta_word_embeddings.py
Created June 21, 2021 07:17
Code for extracting word embeddings from RoBERTa
def rm_whitespace(s):
    if s.startswith('Ġ'):
        return s[1:]
    else:
        return s
def get_tokens_with_ranges(input_string, tokenizer):
    '''
    RoBERTa prepends 'Ġ' to the beginning of what it
    treats as the start of a new word, i.e. a token that
    was preceded by whitespace in the input.
    '''
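    # Plausible completion of the truncated preview (this body is an
    # assumption, not the original code): tokenise, strip the 'Ġ'
    # marker, and align each token with its character range in the
    # original string.
    tokens = tokenizer.tokenize(input_string)
    ranges, cursor = [], 0
    for token in tokens:
        word = rm_whitespace(token)
        start = input_string.find(word, cursor)
        ranges.append((start, start + len(word)))
        cursor = start + len(word)
    return tokens, ranges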