wesslen’s gists

wesslen / relations_validation.py

Last active June 5, 2023 17:21

Prodigy relations validation with a validate_answer callback that checks that both relations are labeled entities

	# Prodigy v1.11.x; some imports will change for v1.12+
	import copy
	from pathlib import Path
	from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union

	import srsly
	from spacy.language import Language
	from spacy.tokens import Doc, Span, Token
	from spacy.util import filter_spans

wesslen / jsonl-to-coco.py

Last active April 6, 2023 13:45

Convert Prodigy JSONL bounding box annotations to COCO format

	import json
	from typing import List
	import srsly
	import typer

	app = typer.Typer()


	def convert_to_coco(input_file: str, output_file: str):
	# Load the JSONL file using srsly

wesslen / nyt_text_dedup.jsonl

Created April 3, 2023 16:37

	{"text":"How Silicon Valley Pushed Coding Into American Classrooms","meta":{"source":"The New York Times","i":0}}
	{"text":"Women in Tech Speak Frankly on Culture of Harassment","meta":{"source":"The New York Times","i":1}}
	{"text":"Silicon Valley Investors Flexed Their Muscles in Uber Fight","meta":{"source":"The New York Times","i":2}}
	{"text":"Uber is a Creature of an Industry Struggling to Grow Up","meta":{"source":"The New York Times","i":3}}
	{"text":"\u2018The Internet Is Broken\u2019: @ev Is Trying to Salvage It","meta":{"source":"The New York Times","i":4}}
	{"text":"The South Park Commons Fills a Hole in the Tech Landscape","meta":{"source":"The New York Times","i":5}}
	{"text":"The Closing of the Republican Mind","meta":{"source":"The New York Times","i":6}}
	{"text":"Writers From the Right and Left on Trump Jr., the Future of the F.B.I., Health Care and More","meta":{"source":"The New York Times","i":7}}
	{"text":"Daily Report: From Lean to Fat Start-Ups","meta":{"source":"The New York Times","i":8}}
	{"

wesslen / overlapping.jsonl

Last active March 16, 2023 15:50

Textcat classification with pre-annotated overlapping spans, see https://support.prodi.gy/t/textcat-using-span-overlapping-view/6434

{"text":"Biomaterials and medical devices are broadly used in the diagnosis, treatment, repair, replacement or enhancing functions of human tissues or organs. Although the living conditions of human beings have been steadily improved in most parts of the world. ","label":"ID: 27047681","spans":[{ "start": 0, "end": 12, "label": "ORG" },{ "start": 0, "end": 12, "label": "ORG_2" }]}

wesslen / ner_manual.py

Created March 1, 2023 13:11

Prodigy ner.manual recipe modifying the port number

	from typing import List, Optional
	import spacy
	import prodigy
	from prodigy.components.loaders import JSONL
	from prodigy.components.preprocess import add_tokens
	from prodigy.models.matcher import PatternMatcher
	from prodigy.util import split_string


	# Helper function for removing token information from examples

wesslen / prodigy-ner-streamlit.py

Created February 27, 2023 19:58

Python script for Prodigy NER dataset viewer using Streamlit

	"""
	Example of a Streamlit app for an interactive Prodigy NER dataset viewer.
	Requires the Prodigy annotation tool to be installed: https://prodi.gy
	See here for details on Streamlit: https://streamlit.io.
	"""
	import streamlit as st
	from prodigy.components.db import connect
	from prodigy.models.ner import merge_spans
	import pandas as pd
	import spacy

wesslen / dog_image_recipe.py

Created February 20, 2023 15:25

Dog API Prodigy recipe

	import prodigy
	import requests
	import random
	from typing import List
	from prodigy.components.preprocess import fetch_media
	from prodigy.util import split_string
	from prodigy import set_hashes

	def get_stream(labels):
	while True:

wesslen / partition-jsonl.py

Created February 17, 2023 19:25

Partition a JSONL file with clumper

	"""Prepare files for ML steps. Converts the JSONL file."""
	import typer
	import random
	import pathlib
	from rich.console import Console
	from clumper import Clumper
	from pathlib import Path


	def prepare(input_path: Path, train_jsonl: Path, valid_jsonl: Path):

wesslen / textcat_sent_sequence.py

Last active January 30, 2023 15:59

Prodigy recipe for binary text classification at the sentence level (highlighted) within the context of a paragraph.

	import prodigy
	import spacy
	from prodigy.components.loaders import JSONL

	@prodigy.recipe(
	"textcat_sent_sequence",
	dataset=("Dataset to save answers to", "positional", None, str),
	examples=("Examples to load from disk", "positional", None, str),
	model=("spaCy model to load", "positional", None, str),
	label=("Label for annotated data", "positional", None, str),

wesslen / 01-preprocess-sentences.py

Last active September 7, 2022 13:35

spaCy sentencizer script

	import spacy
	import srsly # to easily read/write JSONL etc.

	nlp = spacy.load("en_core_web_sm") # or whatever you need
	examples = srsly.read_jsonl("./data.jsonl")
	texts = (eg["text"] for eg in examples)

	new_examples = []
	for doc in nlp.pipe(texts):
	for sent in doc.sents:

Ryan Wesslen wesslen