@b2m
Created May 13, 2025 13:44
Documented FastAPI wrapper around spaCy to be used together with OpenRefine. Run with `uv run spacy_fastapi.py`.
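For a quick smoke test of the running service, here is a minimal client sketch (an assumption-based example, not part of the script: it presumes the server was started with `uv run spacy_fastapi.py` and is listening on `127.0.0.1:5000` as configured at the bottom of the file; only the Python standard library is used):

```python
import json
from urllib.request import Request, urlopen

BASE_URL = "http://127.0.0.1:5000"  # assumption: default host/port from the script below


def post_json(path: str, payload: dict):
    """POST a JSON payload to the service and return the decoded JSON response."""
    request = Request(
        BASE_URL + path,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urlopen(request) as response:
        return json.loads(response.read().decode("utf-8"))


# Detect the language of a sentence, e.g. "de".
print(post_json("/langdetect", {"text": "Martin Luther war in Wittenberg."}))

# Extract named entities; each entry has start, end, text and label.
print(post_json("/ner", {"text": "Martin Luther war in Wittenberg."}))
```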
# /// script
# requires-python = ">=3.12,<3.13"
# dependencies = [
# "spacy==3.8.5",
# "langdetect",
# "fastapi",
# "pydantic",
# "uvicorn",
# "de_core_news_lg @ https://github.com/explosion/spacy-models/releases/download/de_core_news_lg-3.8.0/de_core_news_lg-3.8.0.tar.gz",
# ]
# ///
import re
from http import HTTPStatus
from typing import List
import de_core_news_lg as model
import uvicorn
from fastapi import FastAPI
from langdetect import detect
from pydantic import BaseModel, Field

app = FastAPI(
    title="NLP services based on spaCy",
    description="""
Provides some NLP components from [spaCy](https://spacy.io/) as a web service.

- spaCy: 3.8.5
- Model: [de_core_news_lg](https://spacy.io/models/de#de_core_news_lg)
""",
    docs_url="/",
)

# load the full trained pipeline; individual components (e.g. NER)
# are disabled per endpoint via nlp.select_pipes where not needed
nlp = model.load()


class Lang_Request(BaseModel):
    """
    Request with text to perform language detection.
    """

    text: str = Field(
        ...,
        title="Text",
        description="Text to detect language from.",
        example="Martin Luther war in Wittenberg.",
    )


class NER_Request(BaseModel):
    """
    Request with text to perform NER.
    """

    text: str = Field(
        ...,
        title="Text",
        description="Text to extract entities from.",
        example="Martin Luther war in Wittenberg.",
    )


class Normalize_Request(BaseModel):
    """
    Request with text to perform normalization.
    """

    text: str = Field(
        ...,
        title="Text",
        description="Text to normalize.",
        example="Wälder (Wald) Hügel Berge Segel-Boote 42.",
    )


class Distance_Request(BaseModel):
    """
    Request with two texts to perform distance/similarity calculation.
    """

    text1: str = Field(
        ...,
        title="Text 1",
        description="First text sample to compare to second text sample.",
        example="Berg",
    )
    text2: str = Field(
        ...,
        title="Text 2",
        description="Second text sample to compare to first text sample.",
        example="Hügel",
    )


class Entity(BaseModel):
    """
    Named Entity found in the text.
    """

    start: int = Field(
        ...,
        title="Start",
        description="Start of the entity as token index in the text.",
        ge=0,
        example=0,
    )
    end: int = Field(
        ...,
        title="End",
        description="End of the entity as token index in the text (index of the first token after the entity).",
        ge=1,
        example=2,
    )
    text: str = Field(
        ...,
        title="Text",
        description="The text of the Named Entity.",
        min_length=1,
        example="Martin Luther",
    )
    label: str = Field(
        ...,
        title="Label",
        description="The label (type) for the Named Entity.",
        example="PER",
    )


def spacy_similarity(text1: str, text2: str) -> float:
    """Similarity of the two document vectors (NER is not needed for this, so it is disabled)."""
    with nlp.select_pipes(disable=["ner"]):
        doc_a = nlp(text1)
        doc_b = nlp(text2)
        sim = doc_a.similarity(doc_b)
    return sim


@app.post(
    "/langdetect",
    summary="Detect the language of the text.",
    responses={
        HTTPStatus.OK: {
            "description": "ISO 639-1 representation of the detected language for the given text.",
            "content": {"application/json": {"example": "de"}},
        }
    },
    tags=["Analyze"],
)
def langdetect(lang_request: Lang_Request) -> str:
    """
    Tries to determine the language of the given text and returns the [ISO 639-1 code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes).
    """
    return detect(lang_request.text)


@app.post(
    "/similarity",
    summary="Calculate vector similarity between texts.",
    response_description="The vector based similarity between the two given strings.",
    tags=["Distance"],
)
def similarity(distance_request: Distance_Request) -> float:
    """
    Uses the model's word vectors to calculate a similarity score between the two strings, with 0 as the lowest value (no similarity) and 1 as the highest.
    """
    return spacy_similarity(distance_request.text1, distance_request.text2)


@app.post(
    "/distance",
    summary="Calculate vector distance between texts.",
    response_description="The vector based distance between the two given strings.",
    tags=["Distance"],
)
def distance(distance_request: Distance_Request) -> float:
    """
    Same as `1 - similarity`.

    You can use this for nearest neighbor clustering where you need a distance instead of a similarity, with 1 as the highest value (highest distance).
    """
    return 1 - spacy_similarity(distance_request.text1, distance_request.text2)


@app.post(
    "/normalize",
    summary="Normalize the given text.",
    responses={
        HTTPStatus.OK: {
            "description": "Normalized words separated by whitespace.",
            "content": {
                "application/json": {"example": "42 berg boot hügel segel wald"}
            },
        }
    },
    tags=["Keying"],
)
def normalize(normalize_request: Normalize_Request) -> str:
    """
    Normalize a given text for clustering.

    1. All non-word characters are removed.
    2. Words are reduced to their lemma.
    3. Words are transformed to lowercase.
    4. Stop words are removed.
    5. Duplicates are removed.
    6. Words are ordered alphabetically.
    """
    text = re.sub(r"\W|_", " ", normalize_request.text)
    text = re.sub(r"\s+", " ", text)
    with nlp.select_pipes(disable=["ner"]):
        doc = nlp(text)
    lemmas = " ".join(
        sorted({token.lemma_.lower() for token in doc if not token.is_stop})
    )
    return lemmas


@app.post(
    "/ner",
    response_model=List[Entity],
    summary="Perform NER on text.",
    response_description="List of found entities.",
    tags=["Analyze"],
)
def ner(ner_request: NER_Request) -> list[Entity]:
    """
    Performs Named Entity Recognition on the given `text`.

    Returns the found entities as a list; `start` and `end` are token indices.
    """
    doc = nlp(ner_request.text)
    return [
        Entity(start=ent.start, end=ent.end, text=ent.text, label=ent.label_)
        for ent in doc.ents
    ]


if __name__ == "__main__":
    uvicorn.run("spacy_fastapi:app", host="127.0.0.1", port=5000)
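The `/normalize` and `/distance` endpoints are meant for clustering workflows (a keying function and a nearest-neighbor distance function, e.g. when clustering values from OpenRefine). A small sketch of how they relate, assuming the script is saved as `spacy_fastapi.py` and its dependencies (including the `de_core_news_lg` model) are installed in the current environment:

```python
# assumption: spacy_fastapi.py is importable from the current environment;
# the endpoint functions can be called directly without starting the server
from spacy_fastapi import Distance_Request, Normalize_Request, distance, normalize, similarity

pair = Distance_Request(text1="Berg", text2="Hügel")
sim = similarity(pair)
dist = distance(pair)
print(sim, dist)  # distance is defined as 1 - similarity
assert abs((1 - sim) - dist) < 1e-9

# Keying function for key-collision clustering: spelling variants collapse to the same key.
print(normalize(Normalize_Request(text="Wälder (Wald) Hügel Berge Segel-Boote 42.")))
# -> "42 berg boot hügel segel wald"
```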