This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib.request | |
from collections import defaultdict | |
# Download the Brown corpus text. The response is used as a context manager
# so the underlying HTTP connection is closed as soon as the body is read.
with urllib.request.urlopen(
        'http://www.sls.hawaii.edu/bley-vroman/brown.txt') as raw_bytes:
    # Decode the payload as UTF-8 and turn '\r\n' line endings into '\n'.
    brown_corpus = raw_bytes.read().decode('utf8').replace('\r\n', '\n')
# First 250 characters of the corpus.
B = brown_corpus[:250]
def find_pairs(text): | |
pairs = defaultdict(int) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pickle | |
import numpy as np | |
import pandas as pd | |
import torch | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
from tqdm.auto import tqdm | |
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open('sts_attributions/shelf_approx_attr_l-9_N-100.pkl', 'rb') as inp:
    shelf_approx = pickle.load(inp)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import pickle | |
import requests | |
import torch | |
from sentence_transformers import SentenceTransformer | |
from sentence_transformers.models import Pooling | |
from sentence_transformers import util | |
from xsbert import models |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
from http.server import BaseHTTPRequestHandler, HTTPServer | |
import pandas as pd | |
# Host name and port number; presumably the address the HTTP server is bound
# to (the code that uses them is outside this excerpt -- confirm).
hostName = "localhost"
serverPort = 20000
# A global variable to store the queue elements
queue = []
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict | |
from itertools import combinations | |
import pandas as pd | |
import numpy as np | |
from sentence_transformers import SentenceTransformer, util | |
def compute_kernel_bias(vecs, k=None): | |
""" | |
Code taken from: https://github.com/bojone/BERT-whitening |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import shutil | |
def copy_tree(src, dst): | |
''' | |
Copy a directory tree from src to dst ignoring dangling | |
symlinks, retrieving files symlinks point to, and | |
breaking the cycles, i.e. never copying the same |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
from math import ceil | |
from random import shuffle | |
import torch | |
import torch.nn as nn | |
from transformers import AutoTokenizer, AutoModel | |
from transformers import AdamW, get_scheduler |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
# We're given an n by n distance matrix *D* with transfer
# probabilities for a given pair of nodes (for any feature),
# a feature matrix *M*, and a dropout probability p_d.
# We convert the transfer probabilities to no-transfer probabilities
# and take their logs.
# log1p(-D) is mathematically log(1 - D), but it keeps its precision when
# the transfer probabilities are very close to 0.
L = np.log1p(-D)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Gather every possible punctuation character
import sys | |
from unicodedata import category | |
# Lazily enumerate every Unicode code point as a one-character string.
chrs = (chr(i) for i in range(sys.maxunicode + 1))
# Keep the characters whose Unicode general category starts with "P".
punctuation = {c for c in chrs if category(c).startswith("P")}
# A hyphen can occur inside words, so it is taken out of the set.
punctuation.remove('-')
def tokenize(s, lower_case=False): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def rm_whitespace(s):
    """Return *s* without its leading 'Ġ' character, if it has one.

    Only the first character is dropped; a string that does not start
    with 'Ġ' is returned unchanged.
    """
    if s.startswith('Ġ'):
        return s[1:]
    return s
def get_tokens_with_ranges(input_string, tokenizer): | |
''' | |
RoBERTa prepends 'Ġ' to the beginning of what it |
NewerOlder