Skip to content

Instantly share code, notes, and snippets.

#!/bin/sed -f
# Sed script to produce Penn Treebank tokenization on arbitrary raw text.
# Yeah, sure.
# expected input: raw text with ONE SENTENCE TOKEN PER LINE
# by Robert MacIntyre, University of Pennsylvania, late 1995.
# If this wasn't such a trivial program, I'd include all that stuff about
@jdkato
jdkato / Demo.c
Created August 9, 2017 06:07 — forked from mikeando/Demo.c
Example of using C++ from C.
#include "HMyClass.h"
#include <stdio.h>
/*
 * my_eh - error-handler callback for the demo.
 *
 * Writes the given message to standard output, prefixed with "my_eh: " and
 * followed by a newline.
 *
 * error_message: NUL-terminated text describing the error.
 * unused:        opaque pointer that this handler does not read.
 */
void my_eh( const char * error_message, void * unused)
{
    (void)unused; /* intentionally ignored by this handler */
    fprintf(stdout, "my_eh: %s\n", error_message);
}
int main()
{
@jdkato
jdkato / analysis.md
Last active January 30, 2018 18:41
Linode Analysis
@jdkato
jdkato / api.go
Created July 13, 2018 17:08
`prose` v2.0.0 API
package main
import (
"gopkg.in/jdkato/prose.v2"
)
func main() {
// You can pass "functional options" to control the document-creation
// pipeline -- e.g., `prose.WithExtraction(false)` disables named-entity
// extraction.
@jdkato
jdkato / tokenize.go
Last active July 13, 2018 17:13
`prose` v2.0.0 example
package main
import (
"gopkg.in/jdkato/prose.v2"
)
func main() {
doc, _ := prose.NewDocument("This is a sentence.")
for _, token := range doc.Tokens() {
fmt.Println(token.Text)
Type Example
Email addresses Jane.Doe@example.com
Hashtags #trending
Mentions @jdkato
URLs https://github.com/jdkato/prose
Emoticons :-), >:(, o_0, etc.
@jdkato
jdkato / golden-rules.csv
Last active July 13, 2018 17:37
Segment Evaluations
Name Language License GRS (English) GRS (Other) Speed
Pragmatic Segmenter Ruby MIT 98.08% (51/52) 100.00% 3.84 s
prose Go MIT 75.00% (39/52) N/A 0.96 s (different hardware)
TactfulTokenizer Ruby GNU GPLv3 65.38% (34/52) 48.57% 46.32 s
OpenNLP Java APLv2 59.62% (31/52) 45.71% 1.27 s
Stanford CoreNLP Java GNU GPLv3 59.62% (31/52) 31.43% 0.92 s
Splitta Python APLv2 55.77% (29/52) 37.14% N/A
Punkt Python APLv2 46.15% (24/52) 48.57% 1.79 s
SRX English Ruby GNU GPLv3 30.77% (16/52) 28.57% 6.19 s
Scalpel Ruby GNU GPLv3 28.85% (15/52) 20.00% 0.13 s
Library Accuracy† 5-Run Average (seconds)
NLTK 0.893 7.224
prose 0.961 2.538
package main
import (
"gopkg.in/jdkato/prose.v2"
)
func main() {
doc, _ := prose.NewDocument("Lebron James plays basketball in Los Angeles.")
for _, ent := range doc.Entities() {
fmt.Println(ent.Text, ent.Label)
package main
import (
"gopkg.in/jdkato/prose.v2"
)
func main() {
// Where `ents` is `[]prose.LabeledEntity` from some external source.
//
// "iOS" is what we've named this model, but it can be anything.