Skip to content

Instantly share code, notes, and snippets.

View pmbaumgartner's full-sized avatar

Peter Baumgartner pmbaumgartner

View GitHub Profile
@pmbaumgartner
pmbaumgartner / docx-cli-search.md
Created July 19, 2021 15:17
Search the contents of Word docs via CLI

Search Contents of Word Documents from the Terminal

You'll need ripgrep and pandoc to get started. You can read more about ripgrep at https://github.com/BurntSushi/ripgrep and about pandoc at https://pandoc.org. I use both of these frequently and they're quite helpful.

You can install them both with homebrew:

brew install pandoc ripgrep
@pmbaumgartner
pmbaumgartner / cloud-init.yaml
Last active August 13, 2025 19:50
Multipass & Docker Setup
#cloud-config
package_upgrade: true
ssh_authorized_keys:
- <your key>
packages:
- apt-transport-https
- ca-certificates
- curl
@pmbaumgartner
pmbaumgartner / softie.py
Last active July 12, 2024 13:50
Create a soft label classifier from any scikit-learn regressor object
from sklearn.base import BaseEstimator, ClassifierMixin
from scipy.special import expit, logit
class SoftLabelClassifier(BaseEstimator, ClassifierMixin):
def __init__(self, regressor, eps=0.001):
self.regressor = regressor
self.eps = eps
def fit(self, X, y=None):
@pmbaumgartner
pmbaumgartner / dash.py
Created December 15, 2021 00:10
simple app for clicking on data to label
import dash
from dash import dcc
from dash import html
from dash.dependencies import Input, Output
import json
# Module-level list, initially empty.
# NOTE(review): presumably collects the data points the user clicks on for
# labelling — confirm against the callbacks defined further down.
clicked = []
# Stylesheet URL(s) passed to the Dash constructor below.
external_stylesheets = ["https://codepen.io/chriddyp/pen/bWLwgP.css"]
# The Dash application object, created with the external stylesheet above.
app = dash.Dash(external_stylesheets=external_stylesheets)
import random
from typing import List, Union
import shapely.geometry as geo
from tqdm import tqdm
MultiLineStringType = Union[List[geo.LineString], geo.MultiLineString]
def overlap_lines(
@pmbaumgartner
pmbaumgartner / spacy_colab_gpu.py
Last active January 7, 2022 00:52
Install spaCy with correct cuda version on Colab with GPU
# SETUP
# IPython/Colab shell escape (`!`), not plain Python: run nvcc, keep only the
# "<major>.<minor>" that follows the word "release", and strip the dot
# (e.g. "11.1" -> "111").
cuda_version = !nvcc --version | grep -Po '^.*release \K([0-9]+\.[0-9])' | sed 's/\.//g'
# REQUIREMENTS
from pathlib import Path
# The shell capture above is a list-like of output lines; `.s` is used to get
# it as a single string, which is interpolated into the spaCy extras name
# (cuda<version>). NOTE(review): assumes nvcc printed exactly one matching line.
reqs = f"""
spacy[cuda{cuda_version.s},transformers,lookups]==3.2.1
"""
# Write the requirements file into the current working directory, then
# install from it with pip (again via a shell escape).
Path("requirements.txt").write_text(reqs)
!pip install --quiet -r requirements.txt
@pmbaumgartner
pmbaumgartner / cleaning_tokenizer.py
Created January 10, 2022 15:49
Clean in spaCy Tokenizer
from spacy.tokenizer import Tokenizer
class CTLTokenizer(Tokenizer):
# https://stackoverflow.com/a/58718664
def __call__(self, string) -> spacy.tokens.Doc:
string = self.clean_string(string)
doc = super().__call__(string)
return doc
def clean_string(self, string: str) -> str:
@pmbaumgartner
pmbaumgartner / digit_ngram_suggester.py
Last active July 1, 2024 16:08
A span candidate suggester function for spaCy that suggests spans containing a digit.
from typing import Optional, Iterable, cast, List
from thinc.api import get_current_ops, Ops
from thinc.types import Ragged, Ints1d
from spacy.pipeline.spancat import Suggester
from spacy.tokens import Doc
from spacy.util import registry
@registry.misc("ngram_digits_suggester.v1")
@pmbaumgartner
pmbaumgartner / getiosevka.sh
Last active September 22, 2025 10:29
Iosevka Ubuntu Install - edited from https://blog.programster.org/install-iosevka-fonts
# Iosevka release to download (the tag name without its leading "v").
RELEASE="15.2.0"
# -p creates the missing /tmp/iosevka-font parent on a fresh machine and makes
# the script safe to re-run when the directory already exists.
mkdir -p "/tmp/iosevka-font/v$RELEASE"
# Abort if we could not enter the scratch directory; otherwise every archive
# below would be downloaded into whatever directory the caller was in.
cd "/tmp/iosevka-font/v$RELEASE" || exit 1
# Fetch the TTF archive for each Iosevka variant of this release.
wget https://github.com/be5invis/Iosevka/releases/download/v$RELEASE/ttf-iosevka-$RELEASE.zip
wget https://github.com/be5invis/Iosevka/releases/download/v$RELEASE/ttf-iosevka-aile-$RELEASE.zip
wget https://github.com/be5invis/Iosevka/releases/download/v$RELEASE/ttf-iosevka-curly-$RELEASE.zip
wget https://github.com/be5invis/Iosevka/releases/download/v$RELEASE/ttf-iosevka-curly-slab-$RELEASE.zip
wget https://github.com/be5invis/Iosevka/releases/download/v$RELEASE/ttf-iosevka-etoile-$RELEASE.zip
wget https://github.com/be5invis/Iosevka/releases/download/v$RELEASE/ttf-iosevka-fixed-$RELEASE.zip
@pmbaumgartner
pmbaumgartner / dep-displacy.py
Created June 23, 2022 20:20
dependency matcher displacy example
from spacy import displacy
from spacy.tokens import Doc
from spacy.util import get_lang_class
from pathlib import Path
# Hand-annotated example sentence: the three lists below are parallel, one
# entry per token.
words = ["The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "fox"]
# heads[i] is the absolute index of token i's syntactic head; the root
# ("jumped", index 4) points at itself.
heads = [3, 3, 3, 4, 4, 4, 8, 8, 5]
# deps[i] is the label of the arc from token i to heads[i]. The final three
# labels follow the heads above: "the" -> det of "fox", "lazy" -> amod of
# "fox", and "fox" -> pobj of "over" (previously these three were rotated,
# labelling the determiner as pobj).
deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
# Build the Doc on a blank English vocab so no trained pipeline is required.
doc = Doc(get_lang_class("en")().vocab, words=words, heads=heads, deps=deps)