Skip to content

Instantly share code, notes, and snippets.

View wesslen's full-sized avatar

Ryan Wesslen wesslen

View GitHub Profile
@wesslen
wesslen / relations_validation.py
Last active June 5, 2023 17:21
Prodigy relations validation with a validate_answer callback that checks that both ends of each relation are labeled entities
# Prodigy v1.11.x; some imports will change for v1.12+
import copy
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
import srsly
from spacy.language import Language
from spacy.tokens import Doc, Span, Token
from spacy.util import filter_spans
@wesslen
wesslen / jsonl-to-coco.py
Last active April 6, 2023 13:45
Convert Prodigy JSONL bounding-box annotations to COCO format
import json
from typing import List
import srsly
import typer
app = typer.Typer()
def convert_to_coco(input_file: str, output_file: str):
# Load the JSONL file using srsly
{"text":"How Silicon Valley Pushed Coding Into American Classrooms","meta":{"source":"The New York Times","i":0}}
{"text":"Women in Tech Speak Frankly on Culture of Harassment","meta":{"source":"The New York Times","i":1}}
{"text":"Silicon Valley Investors Flexed Their Muscles in Uber Fight","meta":{"source":"The New York Times","i":2}}
{"text":"Uber is a Creature of an Industry Struggling to Grow Up","meta":{"source":"The New York Times","i":3}}
{"text":"\u2018The Internet Is Broken\u2019: @ev Is Trying to Salvage It","meta":{"source":"The New York Times","i":4}}
{"text":"The South Park Commons Fills a Hole in the Tech Landscape","meta":{"source":"The New York Times","i":5}}
{"text":"The Closing of the Republican Mind","meta":{"source":"The New York Times","i":6}}
{"text":"Writers From the Right and Left on Trump Jr., the Future of the F.B.I., Health Care and More","meta":{"source":"The New York Times","i":7}}
{"text":"Daily Report: From Lean to Fat Start-Ups","meta":{"source":"The New York Times","i":8}}
{"
@wesslen
wesslen / overlapping.jsonl
Last active March 16, 2023 15:50
Textcat classification with pre-annotated overlapping spans, see https://support.prodi.gy/t/textcat-using-span-overlapping-view/6434
{"text":"Biomaterials and medical devices are broadly used in the diagnosis, treatment, repair, replacement or enhancing functions of human tissues or organs. Although the living conditions of human beings have been steadily improved in most parts of the world. ","label":"ID: 27047681","spans":[{ "start": 0, "end": 12, "label": "ORG" },{ "start": 0, "end": 12, "label": "ORG_2" }]}
@wesslen
wesslen / ner_manual.py
Created March 1, 2023 13:11
Prodigy ner.manual recipe modifying the port number
from typing import List, Optional
import spacy
import prodigy
from prodigy.components.loaders import JSONL
from prodigy.components.preprocess import add_tokens
from prodigy.models.matcher import PatternMatcher
from prodigy.util import split_string
# Helper function for removing token information from examples
@wesslen
wesslen / prodigy-ner-streamlit.py
Created February 27, 2023 19:58
Python script for Prodigy NER dataset viewer using Streamlit
"""
Example of a Streamlit app for an interactive Prodigy NER dataset viewer.
Requires the Prodigy annotation tool to be installed: https://prodi.gy
See here for details on Streamlit: https://streamlit.io.
"""
import streamlit as st
from prodigy.components.db import connect
from prodigy.models.ner import merge_spans
import pandas as pd
import spacy
@wesslen
wesslen / dog_image_recipe.py
Created February 20, 2023 15:25
Dog API Prodigy recipe
import prodigy
import requests
import random
from typing import List
from prodigy.components.preprocess import fetch_media
from prodigy.util import split_string
from prodigy import set_hashes
def get_stream(labels):
while True:
@wesslen
wesslen / partition-jsonl.py
Created February 17, 2023 19:25
Partition jsonl with clumper
"""Prepare files for ML steps. Converts the JSONL file."""
import typer
import random
import pathlib
from rich.console import Console
from clumper import Clumper
from pathlib import Path
def prepare(input_path: Path, train_jsonl: Path, valid_jsonl: Path):
@wesslen
wesslen / textcat_sent_sequence.py
Last active January 30, 2023 15:59
Prodigy recipe for binary text classification at the sentence level, with each sentence highlighted within the context of its paragraph.
import prodigy
import spacy
from prodigy.components.loaders import JSONL
@prodigy.recipe(
"textcat_sent_sequence",
dataset=("Dataset to save answers to", "positional", None, str),
examples=("Examples to load from disk", "positional", None, str),
model=("spaCy model to load", "positional", None, str),
label=("Label for annotated data", "positional", None, str),
@wesslen
wesslen / 01-preprocess-sentences.py
Last active September 7, 2022 13:35
spaCy sentencizer script
import spacy
import srsly # to easily read/write JSONL etc.
nlp = spacy.load("en_core_web_sm") # or whatever you need
examples = srsly.read_jsonl("./data.jsonl")
texts = (eg["text"] for eg in examples)
new_examples = []
for doc in nlp.pipe(texts):
for sent in doc.sents: