Skip to content

Instantly share code, notes, and snippets.

@b2m
Created May 13, 2025 13:45
Show Gist options
  • Save b2m/91f3b812bcf0975c4d2cb3230099366a to your computer and use it in GitHub Desktop.
Save b2m/91f3b812bcf0975c4d2cb3230099366a to your computer and use it in GitHub Desktop.
Documented FastAPI wrapper around RapidFuzz to be used together with OpenRefine. Run with `uv run rapidfuzz_fastapi.py`.
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "fastapi",
# "pydantic",
# "uvicorn",
# "rapidfuzz",
# ]
# ///
from typing import Optional
import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel, Field
from rapidfuzz import fuzz
from rapidfuzz.utils import default_process
# Address the server binds to. Both values are also interpolated into the
# example client URL embedded in the API description below.
host = "127.0.0.1"
port = 5000

app = FastAPI(
    # Serve the interactive documentation at the root path.
    docs_url="/",
    title="Levenshtein services based on RapidFuzz",
    # NOTE: doubled braces ({{ }}) are f-string escapes producing the literal
    # braces of the embedded Jython example; {host}/{port} are substituted.
    description=f"""
Provides the functions from [RapidFuzz](https://github.com/rapidfuzz/RapidFuzz) as web service.
Use from [custom clustering](https://openrefine.org/docs/manual/cellediting#custom-clustering-methods)
in [OpenRefine](https://openrefine.org/) with the following *Jython* code:
```python
import json, urllib, urllib2
url = "http://{host}:{port}/ratio"
request_data = json.dumps({{
"text1": value1.encode("utf-8"),
"text2": value2.encode("utf-8"),
"process": True,
}})
request = urllib2.Request(
url,
request_data,
{{"Content-Type": "application/json"}},
)
response = urllib2.urlopen(request)
return json.dumps(json.load(response), ensure_ascii=False)
```
""",
)
class Fuzz_Request(BaseModel):
    """
    Request with two texts to perform Levenshtein distance calculation.
    """

    # First of the two strings to compare (required).
    # `examples` (a list) replaces the deprecated `example` extra keyword,
    # which Pydantic v2 only accepts with a deprecation warning.
    text1: str = Field(
        ...,
        title="Text 1",
        description="First text sample to compare to second text sample.",
        examples=["fuzzy was a bear"],
    )
    # Second of the two strings to compare (required).
    text2: str = Field(
        ...,
        title="Text 2",
        description="Second text sample to compare to first text sample.",
        examples=["fuzzy wuzzy was a bear"],
    )
    # Defaults to True. The endpoints pass rapidfuzz's `default_process` as
    # processor when this is truthy; False or null disables preprocessing.
    process: Optional[bool] = Field(
        True,
        title="Process Text",
        description="Indicator whether text should get processed (remove non alphanumeric characters, trimming whitespace, converting to lowercase).",
    )
# Shared `response_description` for every endpoint; shown in the generated
# API documentation. (Fixed the duplicated word "as as".)
response_text = "The distance between the two given strings as a ratio between 0 (near) and 1.0 (remote)."
def convert_ratio(ratio: float) -> float:
    """Turn a 0 - 100 similarity score into a 0 - 1.0 distance.

    A similarity of 100 maps to a distance of 0.0 and a similarity of 0
    maps to a distance of 1.0.
    """
    # Invert first, then rescale: same operation order as (100 - r) / 100.
    dissimilarity = 100.0 - ratio
    return dissimilarity / 100.0
@app.post(
    "/ratio",
    response_description=response_text,
    summary="Ratio",
)
def ratio(fuzz_request: Fuzz_Request) -> float:
    """
    Calculates the normalized Indel similarity.
    See <https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#ratio> for details.
    """
    # Only preprocess the texts when the request asks for it.
    processor = default_process if fuzz_request.process else None
    similarity = fuzz.ratio(
        fuzz_request.text1, fuzz_request.text2, processor=processor
    )
    # The API exposes a distance, so the similarity score is converted.
    return convert_ratio(similarity)
@app.post(
    "/partial_ratio",
    response_description=response_text,
    summary="Partial Ratio",
)
def partial_ratio(fuzz_request: Fuzz_Request) -> float:
    """
    Searches for the optimal alignment of the shorter string in the longer string
    and returns the [ratio](https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#ratio)
    for this alignment.
    See <https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#partial-ratio> for details.
    """
    # Only preprocess the texts when the request asks for it.
    processor = default_process if fuzz_request.process else None
    similarity = fuzz.partial_ratio(
        fuzz_request.text1, fuzz_request.text2, processor=processor
    )
    # The API exposes a distance, so the similarity score is converted.
    return convert_ratio(similarity)
@app.post(
    "/token_set_ratio",
    response_description=response_text,
    summary="Token Set Ratio",
)
def token_set_ratio(fuzz_request: Fuzz_Request) -> float:
    """
    Compares the words in the strings based on unique and common words between them using
    [ratio](https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#ratio).
    See <https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#token-set-ratio> for details.
    """
    # Only preprocess the texts when the request asks for it.
    processor = default_process if fuzz_request.process else None
    similarity = fuzz.token_set_ratio(
        fuzz_request.text1, fuzz_request.text2, processor=processor
    )
    # The API exposes a distance, so the similarity score is converted.
    return convert_ratio(similarity)
@app.post(
    "/partial_token_set_ratio",
    response_description=response_text,
    summary="Partial Token Set Ratio",
)
def partial_token_set_ratio(fuzz_request: Fuzz_Request) -> float:
    """
    Compares the words in the strings based on unique and common words between them using
    [partial_ratio](https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#partial-ratio).
    See <https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#partial-token-set-ratio> for details.
    """
    # Only preprocess the texts when the request asks for it.
    processor = default_process if fuzz_request.process else None
    similarity = fuzz.partial_token_set_ratio(
        fuzz_request.text1, fuzz_request.text2, processor=processor
    )
    # The API exposes a distance, so the similarity score is converted.
    return convert_ratio(similarity)
@app.post(
    "/token_sort_ratio",
    response_description=response_text,
    summary="Token Sort Ratio",
)
def token_sort_ratio(fuzz_request: Fuzz_Request) -> float:
    """
    Sorts the words in the strings and calculates the
    [ratio](https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#ratio) between them.
    See <https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#token-sort-ratio> for details.
    """
    # Only preprocess the texts when the request asks for it.
    processor = default_process if fuzz_request.process else None
    similarity = fuzz.token_sort_ratio(
        fuzz_request.text1, fuzz_request.text2, processor=processor
    )
    # The API exposes a distance, so the similarity score is converted.
    return convert_ratio(similarity)
@app.post(
    "/partial_token_sort_ratio",
    # Was "Partial Ratio" (copy-paste from /partial_ratio), which made two
    # different operations indistinguishable in the generated API docs.
    summary="Partial Token Sort Ratio",
    response_description=response_text,
)
def partial_token_sort_ratio(fuzz_request: Fuzz_Request) -> float:
    """
    Sorts the words in the strings and calculates the
    [partial_ratio](https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#partial-ratio) between them.
    See <https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#partial-token-sort-ratio> for details.
    """
    # Only preprocess the texts when the request asks for it.
    processor = default_process if fuzz_request.process else None
    similarity = fuzz.partial_token_sort_ratio(
        fuzz_request.text1,
        fuzz_request.text2,
        processor=processor,
    )
    # The API exposes a distance, so the similarity score is converted.
    return convert_ratio(similarity)
@app.post(
    "/token_ratio",
    response_description=response_text,
    summary="Token Ratio",
)
def token_ratio(fuzz_request: Fuzz_Request) -> float:
    """
    Helper method that returns the **maximum** of
    [token_set_ratio](https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#token-set-ratio) and
    [token_sort_ratio](https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#token-sort-ratio).
    See <https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#token-ratio> for details.
    """
    # Only preprocess the texts when the request asks for it.
    processor = default_process if fuzz_request.process else None
    similarity = fuzz.token_ratio(
        fuzz_request.text1, fuzz_request.text2, processor=processor
    )
    # The API exposes a distance, so the similarity score is converted.
    return convert_ratio(similarity)
@app.post(
    "/partial_token_ratio",
    response_description=response_text,
    summary="Partial Token Ratio",
)
def partial_token_ratio(fuzz_request: Fuzz_Request) -> float:
    """
    Helper method that returns the **maximum** of
    [partial_token_set_ratio](https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#partial-token-set-ratio) and
    [partial_token_sort_ratio](https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#partial-token-sort-ratio).
    See <https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#partial-token-ratio> for details.
    """
    # Only preprocess the texts when the request asks for it.
    processor = default_process if fuzz_request.process else None
    similarity = fuzz.partial_token_ratio(
        fuzz_request.text1, fuzz_request.text2, processor=processor
    )
    # The API exposes a distance, so the similarity score is converted.
    return convert_ratio(similarity)
@app.post(
    "/weighted_ratio",
    summary="Weighted Ratio",
    response_description=response_text,
)
def weighted_ratio(fuzz_request: Fuzz_Request) -> float:
    """
    Calculates a weighted ratio based on the other ratio algorithms.
    See <https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#wratio> for details.
    """
    # Only preprocess the texts when the request asks for it.
    processor = default_process if fuzz_request.process else None
    # RapidFuzz spells this function `WRatio`; the previous `fuzz.Wratio`
    # does not exist and raised AttributeError on every request.
    similarity = fuzz.WRatio(
        fuzz_request.text1,
        fuzz_request.text2,
        processor=processor,
    )
    # The API exposes a distance, so the similarity score is converted.
    return convert_ratio(similarity)
if __name__ == "__main__":
    # Hand uvicorn the application object itself rather than the import
    # string "rapidfuzz_fastapi:app": the string form only works when this
    # file keeps that exact name and is importable, and it makes uvicorn
    # import the module a second time. Neither reload nor multiple workers
    # (the features that need an import string) are used here.
    uvicorn.run(app, host=host, port=port)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment