Skip to content

Instantly share code, notes, and snippets.

@morrisalp
morrisalp / grequests_tqdm.py
Last active September 18, 2023 12:19
send async HTTP requests using grequests with tqdm progress bar
from tqdm import tqdm
import requests, grequests
class ProgressSession:
    """Track completed requests for a batch of URLs on a tqdm progress bar."""

    def __init__(self, urls):
        """Remember *urls* and open a progress bar sized to their count."""
        self.urls = urls
        self.pbar = tqdm(total=len(urls), desc='Making async requests')

    def update(self, r, *args, **kwargs):
        """Advance the bar by one step for a non-redirect response *r*.

        The extra positional and keyword arguments are accepted and ignored,
        so the method can be called with whatever else the caller passes.
        """
        # Redirect responses are skipped and do not advance the bar.
        if r.is_redirect:
            return
        self.pbar.update()
@morrisalp
morrisalp / spacy_newline.py
Last active February 12, 2020 11:50
spaCy English model with sentence segmentation on newlines
import spacy
# Load the English pipeline through the 'en' shortcut name.
# NOTE(review): shortcut names such as 'en' are understood to have been
# removed in spaCy v3 (use a full package name there) — confirm against the
# installed spaCy version.
nlp = spacy.load('en')
def set_custom_boundaries(doc):
    """Mark the token following every newline token as a sentence start.

    Args:
        doc: A sequence of tokens (presumably a spaCy ``Doc`` — confirm with
            the caller). Each token must expose ``text``, ``i`` and a
            writable ``is_sent_start``.

    Returns:
        The same ``doc`` object, modified in place.
    """
    # The last token is skipped because there is no following token to mark.
    for token in doc[:-1]:
        # A run of consecutive newlines arrives as one whitespace token
        # (e.g. "\n\n"), so test for containment rather than equality;
        # otherwise blank-line paragraph breaks never start a sentence.
        if "\n" in token.text:
            doc[token.i + 1].is_sent_start = True
    return doc
@morrisalp
morrisalp / transformer.py
Last active February 2, 2021 09:12
minimal TF 2.0 (+ Keras) example of a transformer, based on the Peter Bloem article "Transformers from Scratch" (http://www.peterbloem.nl/blog/transformers)
from tensorflow.keras.layers import Input, Dense, Lambda, Reshape, Activation, Layer, LayerNormalization, Add
from tensorflow.keras.models import Sequential
from tensorflow.keras import Model
import tensorflow as tf
class SelfAttention(Layer):
    """Keras layer parameterised by a number of attention heads."""

    def __init__(self, heads = 8, **kwargs):
        """Initialise the layer.

        Args:
            heads: Number of attention heads; stored as ``self.heads``.
            **kwargs: Forwarded unchanged to the base ``Layer`` constructor,
                so the standard Keras layer arguments (e.g. ``name``,
                ``dtype``, ``trainable``) can be supplied.
        """
        super().__init__(**kwargs)
        self.heads = heads
@morrisalp
morrisalp / load_conll2003.py
Created November 19, 2019 17:22
load CONLL2003 dataset using Pandas
import pandas as pd
def read_conll(filename):
    """Load a space-separated CoNLL-2003 style file, one row per token.

    Args:
        filename: Path (or any source accepted by ``pandas.read_csv``) of a
            file with four space-separated columns per token line and blank
            lines between sentences.

    Returns:
        A new DataFrame with string columns ``TOKEN``, ``POS``, ``CHUNK``
        and ``NE`` plus an integer ``SENTENCE`` column. ``SENTENCE`` counts
        the blank separator lines seen before each token, so all tokens of
        one sentence share a value. Separator rows themselves are dropped;
        the original row index is kept.
    """
    df = pd.read_csv(
        filename,
        sep=' ',
        header=None,
        names=['TOKEN', 'POS', 'CHUNK', 'NE'],
        # Keep fields such as "NA" or "null" as literal strings, and let
        # blank lines come through as empty strings instead of NaN.
        keep_default_na=False,
        # 3 == csv.QUOTE_NONE: quote characters are ordinary tokens.
        quoting=3,
        # Blank lines are the sentence separators, so they must be kept.
        skip_blank_lines=False,
    )
    # Each blank line bumps the running sentence counter.
    df['SENTENCE'] = (df.TOKEN == '').cumsum()
    # Return an independent frame so callers can add or modify columns
    # without pandas warning about writing to a copy of a slice.
    return df[df.TOKEN != ''].copy()
@morrisalp
morrisalp / .vimrc
Last active October 5, 2019 07:08
my personal .vimrc
" Use UTF-8 as Vim's internal character encoding.
set encoding=utf-8
" Carry the current line's indentation over to a newly opened line.
set autoindent
" Insert spaces instead of literal tab characters.
set expandtab
" Display a tab character as 4 columns wide.
set tabstop=4
" Indent and dedent in steps of 4 columns.
set shiftwidth=4
" Show line numbers.
set number
" Highlight all search matches, and show matches while the pattern is typed.
set hlsearch incsearch
" Show command-line completion candidates in a menu.
set wildmenu
" Show the partially typed command in the last line of the screen.
set showcmd
" Turn on syntax highlighting.
syntax on
@morrisalp
morrisalp / wiktionary_category.py
Last active June 24, 2023 21:05
Get all page names in a given Wiktionary category (e.g. "English lemmas") using the MediaWiki Action API.
import requests
def pages_in_wiktionary_category(category_name, language='en', timeout=30):
    """Yield the title of every page in a Wiktionary category.

    Results are fetched lazily in batches of up to 500 through the
    ``categorymembers`` list of the site's ``api.php`` endpoint; the next
    batch is requested only once the previous one has been consumed.

    Args:
        category_name: Category name without the ``Category:`` prefix,
            e.g. ``"English lemmas"``.
        language: Subdomain of wiktionary.org to query.
        timeout: Seconds to wait for each HTTP request before giving up.

    Yields:
        The ``title`` string of each category member.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
        KeyError: If the JSON response has no ``query`` result.
    """
    url = f'https://{language}.wiktionary.org/w/api.php'
    # Passing the query as a dict lets requests URL-encode every value.
    # Category names may contain spaces, '&' or non-ASCII characters, and
    # continuation tokens contain '|'; raw interpolation corrupts those.
    params = {
        'action': 'query',
        'list': 'categorymembers',
        'cmtitle': f'Category:{category_name}',
        'cmlimit': 500,
        'format': 'json',
    }
    while True:
        response = requests.get(url, params=params, timeout=timeout)
        # Fail loudly on HTTP errors instead of trying to parse an error page.
        response.raise_for_status()
        obj = response.json()
        for member in obj['query']['categorymembers']:
            yield member['title']
        if 'continue' not in obj:
            break
        # Send the whole continuation object back with the next request,
        # not just the 'cmcontinue' entry.
        params.update(obj['continue'])