Created
May 13, 2025 13:45
-
-
Save b2m/91f3b812bcf0975c4d2cb3230099366a to your computer and use it in GitHub Desktop.
Documented FastAPI wrapper around RapidFuzz to be used together with OpenRefine. Run with `uv run rapidfuzz_fastapi.py`.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /// script | |
# requires-python = ">=3.12" | |
# dependencies = [ | |
# "fastapi", | |
# "pydantic", | |
# "uvicorn", | |
# "rapidfuzz", | |
# ] | |
# /// | |
from typing import Optional | |
import uvicorn | |
from fastapi import FastAPI | |
from pydantic import BaseModel, Field | |
from rapidfuzz import fuzz | |
from rapidfuzz.utils import default_process | |
# Address the development server binds to; both values are also interpolated
# into the sample client code shown in the API description.
host: str = "127.0.0.1"
port: int = 5000
# Markdown description published with the API documentation. It is an f-string
# so the sample client code always shows the configured host and port; the
# literal braces of that sample are therefore doubled.
_api_description = f"""
Provides the functions from [RapidFuzz](https://github.com/rapidfuzz/RapidFuzz) as web service.
Use from [custom clustering](https://openrefine.org/docs/manual/cellediting#custom-clustering-methods)
in [OpenRefine](https://openrefine.org/) with the following *Jython* code:
```python
import json, urllib, urllib2
url = "http://{host}:{port}/ratio"
request_data = json.dumps({{
"text1": value1.encode("utf-8"),
"text2": value2.encode("utf-8"),
"process": True,
}})
request = urllib2.Request(
url,
request_data,
{{"Content-Type": "application/json"}},
)
response = urllib2.urlopen(request)
return json.dumps(json.load(response), ensure_ascii=False)
```
"""

# The application object every endpoint below registers itself on.
app = FastAPI(
    title="Levenshtein services based on RapidFuzz",
    description=_api_description,
    # Place the generated documentation page at the root path.
    docs_url="/",
)
class Fuzz_Request(BaseModel):
    """
    Request with two texts to perform Levenshtein distance calculation.
    """

    # NOTE: sample values are declared with ``examples=[...]`` (the JSON Schema
    # keyword). The singular ``example=`` form is an undeclared extra keyword
    # on ``Field``, which Pydantic v2 deprecates.

    # First of the two strings to compare (required).
    text1: str = Field(
        ...,
        title="Text 1",
        description="First text sample to compare to second text sample.",
        examples=["fuzzy was a bear"],
    )
    # Second of the two strings to compare (required).
    text2: str = Field(
        ...,
        title="Text 2",
        description="Second text sample to compare to first text sample.",
        examples=["fuzzy wuzzy was a bear"],
    )
    # Preprocessing switch; defaults to True. The endpoints treat any falsy
    # value (False or null) as "do not preprocess".
    process: Optional[bool] = Field(
        True,
        title="Process Text",
        description="Indicator whether text should get processed (remove non alphanumeric characters, trimming whitespace, converting to lowercase).",
    )
# Shared OpenAPI response description used by every endpoint in this module.
response_text = "The distance between the two given strings as a ratio between 0 (near) and 1.0 (remote)."
def convert_ratio(ratio: float) -> float:
    """Turn a similarity score on the 0 - 100 scale into a distance on the 0 - 1.0 scale.

    A similarity of 100 maps to a distance of 0.0 and a similarity of 0 maps
    to a distance of 1.0.
    """
    # How far the score falls short of a perfect match, still on the 0 - 100 scale.
    shortfall = 100.0 - ratio
    return shortfall / 100.0
@app.post("/ratio", summary="Ratio", response_description=response_text)
def ratio(fuzz_request: Fuzz_Request) -> float:
    """
    Calculates the normalized Indel similarity.
    See <https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#ratio> for details.
    """
    # Apply RapidFuzz's default preprocessing only when the client asked for it.
    processor = default_process if fuzz_request.process else None
    similarity = fuzz.ratio(
        fuzz_request.text1,
        fuzz_request.text2,
        processor=processor,
    )
    # The score is a 0 - 100 similarity; the API answers with a 0 - 1.0 distance.
    return convert_ratio(similarity)
@app.post("/partial_ratio", summary="Partial Ratio", response_description=response_text)
def partial_ratio(fuzz_request: Fuzz_Request) -> float:
    """
    Searches for the optimal alignment of the shorter string in the longer string
    and returns the [ratio](https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#ratio)
    for this alignment.
    See <https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#partial-ratio> for details.
    """
    # Apply RapidFuzz's default preprocessing only when the client asked for it.
    processor = default_process if fuzz_request.process else None
    similarity = fuzz.partial_ratio(
        fuzz_request.text1,
        fuzz_request.text2,
        processor=processor,
    )
    # The score is a 0 - 100 similarity; the API answers with a 0 - 1.0 distance.
    return convert_ratio(similarity)
@app.post("/token_set_ratio", summary="Token Set Ratio", response_description=response_text)
def token_set_ratio(fuzz_request: Fuzz_Request) -> float:
    """
    Compares the words in the strings based on unique and common words between them using
    [ratio](https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#ratio).
    See <https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#token-set-ratio> for details.
    """
    # Apply RapidFuzz's default preprocessing only when the client asked for it.
    processor = default_process if fuzz_request.process else None
    similarity = fuzz.token_set_ratio(
        fuzz_request.text1,
        fuzz_request.text2,
        processor=processor,
    )
    # The score is a 0 - 100 similarity; the API answers with a 0 - 1.0 distance.
    return convert_ratio(similarity)
@app.post(
    "/partial_token_set_ratio",
    summary="Partial Token Set Ratio",
    response_description=response_text,
)
def partial_token_set_ratio(fuzz_request: Fuzz_Request) -> float:
    """
    Compares the words in the strings based on unique and common words between them using
    [partial_ratio](https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#partial-ratio).
    See <https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#partial-token-set-ratio> for details.
    """
    # Apply RapidFuzz's default preprocessing only when the client asked for it.
    processor = default_process if fuzz_request.process else None
    similarity = fuzz.partial_token_set_ratio(
        fuzz_request.text1,
        fuzz_request.text2,
        processor=processor,
    )
    # The score is a 0 - 100 similarity; the API answers with a 0 - 1.0 distance.
    return convert_ratio(similarity)
@app.post("/token_sort_ratio", summary="Token Sort Ratio", response_description=response_text)
def token_sort_ratio(fuzz_request: Fuzz_Request) -> float:
    """
    Sorts the words in the strings and calculates the
    [ratio](https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#ratio) between them.
    See <https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#token-sort-ratio> for details.
    """
    # Apply RapidFuzz's default preprocessing only when the client asked for it.
    processor = default_process if fuzz_request.process else None
    similarity = fuzz.token_sort_ratio(
        fuzz_request.text1,
        fuzz_request.text2,
        processor=processor,
    )
    # The score is a 0 - 100 similarity; the API answers with a 0 - 1.0 distance.
    return convert_ratio(similarity)
@app.post(
    "/partial_token_sort_ratio",
    # Was "Partial Ratio", a copy-paste slip that duplicated the summary of
    # the /partial_ratio endpoint in the generated documentation.
    summary="Partial Token Sort Ratio",
    response_description=response_text,
)
def partial_token_sort_ratio(fuzz_request: Fuzz_Request) -> float:
    """
    Sorts the words in the strings and calculates the
    [partial_ratio](https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#partial-ratio) between them.
    See <https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#partial-token-sort-ratio> for details.
    """
    # Apply RapidFuzz's default preprocessing only when the client asked for it.
    processor = default_process if fuzz_request.process else None
    similarity = fuzz.partial_token_sort_ratio(
        fuzz_request.text1,
        fuzz_request.text2,
        processor=processor,
    )
    # The score is a 0 - 100 similarity; the API answers with a 0 - 1.0 distance.
    return convert_ratio(similarity)
@app.post("/token_ratio", summary="Token Ratio", response_description=response_text)
def token_ratio(fuzz_request: Fuzz_Request) -> float:
    """
    Helper method that returns the **maximum** of
    [token_set_ratio](https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#token-set-ratio) and
    [token_sort_ratio](https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#token-sort-ratio).
    See <https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#token-ratio> for details.
    """
    # Apply RapidFuzz's default preprocessing only when the client asked for it.
    processor = default_process if fuzz_request.process else None
    similarity = fuzz.token_ratio(
        fuzz_request.text1,
        fuzz_request.text2,
        processor=processor,
    )
    # The score is a 0 - 100 similarity; the API answers with a 0 - 1.0 distance.
    return convert_ratio(similarity)
@app.post(
    "/partial_token_ratio",
    summary="Partial Token Ratio",
    response_description=response_text,
)
def partial_token_ratio(fuzz_request: Fuzz_Request) -> float:
    """
    Helper method that returns the **maximum** of
    [partial_token_set_ratio](https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#partial-token-set-ratio) and
    [partial_token_sort_ratio](https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#partial-token-sort-ratio).
    See <https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#partial-token-ratio> for details.
    """
    # Apply RapidFuzz's default preprocessing only when the client asked for it.
    processor = default_process if fuzz_request.process else None
    similarity = fuzz.partial_token_ratio(
        fuzz_request.text1,
        fuzz_request.text2,
        processor=processor,
    )
    # The score is a 0 - 100 similarity; the API answers with a 0 - 1.0 distance.
    return convert_ratio(similarity)
@app.post("/weighted_ratio", summary="Weighted Ratio", response_description=response_text)
def weighted_ratio(fuzz_request: Fuzz_Request) -> float:
    """
    Calculates a weighted ratio based on the other ratio algorithms.
    See <https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#wratio> for details.
    """
    # Apply RapidFuzz's default preprocessing only when the client asked for it.
    processor = default_process if fuzz_request.process else None
    # The RapidFuzz function is spelled ``WRatio``; the previous ``Wratio``
    # lookup raised AttributeError on every request to this endpoint.
    similarity = fuzz.WRatio(
        fuzz_request.text1,
        fuzz_request.text2,
        processor=processor,
    )
    # The score is a 0 - 100 similarity; the API answers with a 0 - 1.0 distance.
    return convert_ratio(similarity)
if __name__ == "__main__":
    # Hand uvicorn the application object itself rather than the import string
    # "rapidfuzz_fastapi:app": the string only resolves while the file keeps
    # that exact name, and it makes uvicorn import this module a second time.
    uvicorn.run(app, host=host, port=port)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment