@b2m
Created May 13, 2025 13:44
Documented FastAPI wrapper around spaCy to be used together with OpenRefine. Run with `uv run spacy_fastapi.py`.
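For a quick smoke test of the running service, here is a minimal client sketch (an assumption-based example, not part of the script: it presumes the server was started with `uv run spacy_fastapi.py` and is listening on `127.0.0.1:5000` as configured at the bottom of the file; only the Python standard library is used):

```python
import json
from urllib.request import Request, urlopen

BASE_URL = "http://127.0.0.1:5000"  # assumption: default host/port from the script below


def post_json(path: str, payload: dict):
    """POST a JSON payload to the service and return the decoded JSON response."""
    request = Request(
        BASE_URL + path,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urlopen(request) as response:
        return json.loads(response.read().decode("utf-8"))


# Detect the language of a sentence, e.g. "de".
print(post_json("/langdetect", {"text": "Martin Luther war in Wittenberg."}))

# Extract named entities; each entry has start, end, text and label.
print(post_json("/ner", {"text": "Martin Luther war in Wittenberg."}))
```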
# /// script
# requires-python = ">=3.12,<3.13"
# dependencies = [
# "spacy==3.8.5",
# "langdetect",
# "fastapi",
# "pydantic",
# "uvicorn",
# "de_core_news_lg @ https://github.com/explosion/spacy-models/releases/download/de_core_news_lg-3.8.0/de_core_news_lg-3.8.0.tar.gz",
# ]
# ///
import re
from http import HTTPStatus
from typing import List
import de_core_news_lg as model
import uvicorn
from fastapi import FastAPI
from langdetect import detect
from pydantic import BaseModel, Field

app = FastAPI(
    title="NLP services based on spaCy",
    description="""
Provides some NLP components from [spaCy](https://spacy.io/) as a web service.

- spaCy: 3.8.5
- Model: [de_core_news_lg](https://spacy.io/models/de#de_core_news_lg)
""",
    docs_url="/",
)

# load the full trained pipeline; individual components (e.g. NER)
# are disabled per endpoint via nlp.select_pipes where not needed
nlp = model.load()


class Lang_Request(BaseModel):
    """
    Request with text to perform language detection.
    """

    text: str = Field(
        ...,
        title="Text",
        description="Text to detect language from.",
        example="Martin Luther war in Wittenberg.",
    )


class NER_Request(BaseModel):
    """
    Request with text to perform NER.
    """

    text: str = Field(
        ...,
        title="Text",
        description="Text to extract entities from.",
        example="Martin Luther war in Wittenberg.",
    )


class Normalize_Request(BaseModel):
    """
    Request with text to perform normalization.
    """

    text: str = Field(
        ...,
        title="Text",
        description="Text to normalize.",
        example="Wälder (Wald) Hügel Berge Segel-Boote 42.",
    )


class Distance_Request(BaseModel):
    """
    Request with two texts to perform distance/similarity calculation.
    """

    text1: str = Field(
        ...,
        title="Text 1",
        description="First text sample to compare to second text sample.",
        example="Berg",
    )
    text2: str = Field(
        ...,
        title="Text 2",
        description="Second text sample to compare to first text sample.",
        example="Hügel",
    )


class Entity(BaseModel):
    """
    Named Entity found in the text.
    """

    start: int = Field(
        ...,
        title="Start",
        description="Start of the entity as token index in the text.",
        ge=0,
        example=0,
    )
    end: int = Field(
        ...,
        title="End",
        description="End of the entity as token index in the text (index of the first token after the entity).",
        ge=1,
        example=2,
    )
    text: str = Field(
        ...,
        title="Text",
        description="The text of the Named Entity.",
        min_length=1,
        example="Martin Luther",
    )
    label: str = Field(
        ...,
        title="Label",
        description="The label (type) for the Named Entity.",
        example="PER",
    )


def spacy_similarity(text1: str, text2: str) -> float:
    """Similarity of the two document vectors (NER is not needed for this, so it is disabled)."""
    with nlp.select_pipes(disable=["ner"]):
        doc_a = nlp(text1)
        doc_b = nlp(text2)
        sim = doc_a.similarity(doc_b)
    return sim


@app.post(
    "/langdetect",
    summary="Detect the language of the text.",
    responses={
        HTTPStatus.OK: {
            "description": "ISO 639-1 representation of the detected language for the given text.",
            "content": {"application/json": {"example": "de"}},
        }
    },
    tags=["Analyze"],
)
def langdetect(lang_request: Lang_Request) -> str:
    """
    Tries to determine the language of the given text and returns the [ISO 639-1 code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes).
    """
    return detect(lang_request.text)


@app.post(
    "/similarity",
    summary="Calculate vector similarity between texts.",
    response_description="The vector based similarity between the two given strings.",
    tags=["Distance"],
)
def similarity(distance_request: Distance_Request) -> float:
    """
    Uses the model's word vectors to calculate a similarity score between the two strings, with 0 as the lowest value (no similarity) and 1 as the highest.
    """
    return spacy_similarity(distance_request.text1, distance_request.text2)


@app.post(
    "/distance",
    summary="Calculate vector distance between texts.",
    response_description="The vector based distance between the two given strings.",
    tags=["Distance"],
)
def distance(distance_request: Distance_Request) -> float:
    """
    Same as `1 - similarity`.

    You can use this for nearest neighbor clustering where you need a distance instead of a similarity, with 1 as the highest value (highest distance).
    """
    return 1 - spacy_similarity(distance_request.text1, distance_request.text2)


@app.post(
    "/normalize",
    summary="Normalize the given text.",
    responses={
        HTTPStatus.OK: {
            "description": "Normalized words separated by whitespace.",
            "content": {
                "application/json": {"example": "42 berg boot hügel segel wald"}
            },
        }
    },
    tags=["Keying"],
)
def normalize(normalize_request: Normalize_Request) -> str:
    """
    Normalize a given text for clustering.

    1. All non-word characters are removed.
    2. Words are reduced to their lemma.
    3. Words are transformed to lowercase.
    4. Stop words are removed.
    5. Duplicates are removed.
    6. Words are ordered alphabetically.
    """
    text = re.sub(r"\W|_", " ", normalize_request.text)
    text = re.sub(r"\s+", " ", text)
    with nlp.select_pipes(disable=["ner"]):
        doc = nlp(text)
    lemmas = " ".join(
        sorted({token.lemma_.lower() for token in doc if not token.is_stop})
    )
    return lemmas


@app.post(
    "/ner",
    response_model=List[Entity],
    summary="Perform NER on text.",
    response_description="List of found entities.",
    tags=["Analyze"],
)
def ner(ner_request: NER_Request) -> list[Entity]:
    """
    Performs Named Entity Recognition on the given `text`.

    Returns the found entities as a list; `start` and `end` are token indices.
    """
    doc = nlp(ner_request.text)
    return [
        Entity(start=ent.start, end=ent.end, text=ent.text, label=ent.label_)
        for ent in doc.ents
    ]


if __name__ == "__main__":
    uvicorn.run("spacy_fastapi:app", host="127.0.0.1", port=5000)
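The `/normalize` and `/distance` endpoints are meant for clustering workflows (a keying function and a nearest-neighbor distance function, e.g. when clustering values from OpenRefine). A small sketch of how they relate, assuming the script is saved as `spacy_fastapi.py` and its dependencies (including the `de_core_news_lg` model) are installed in the current environment:

```python
# assumption: spacy_fastapi.py is importable from the current environment;
# the endpoint functions can be called directly without starting the server
from spacy_fastapi import Distance_Request, Normalize_Request, distance, normalize, similarity

pair = Distance_Request(text1="Berg", text2="Hügel")
sim = similarity(pair)
dist = distance(pair)
print(sim, dist)  # distance is defined as 1 - similarity
assert abs((1 - sim) - dist) < 1e-9

# Keying function for key-collision clustering: spelling variants collapse to the same key.
print(normalize(Normalize_Request(text="Wälder (Wald) Hügel Berge Segel-Boote 42.")))
# -> "42 berg boot hügel segel wald"
```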