Skip to content

Instantly share code, notes, and snippets.

@sahuguet
Last active October 10, 2024 01:05
Show Gist options
  • Save sahuguet/9d98c0287eb6641b1a87822509633d46 to your computer and use it in GitHub Desktop.
Save sahuguet/9d98c0287eb6641b1a87822509633d46 to your computer and use it in GitHub Desktop.
Presidio example
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyNcUZJO94FrY1ndKHIXI2R1",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/sahuguet/9d98c0287eb6641b1a87822509633d46/untitled8.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "QWgSB3ca4n0P"
},
"outputs": [],
"source": [
"# Install Presidio with the %pip magic so the packages land in the environment\n",
"# this kernel is actually running in; -q trims the log without hiding errors.\n",
"%pip install -q presidio_analyzer presidio_anonymizer\n",
"# Download the spaCy model `en_core_web_lg`.\n",
"!python -m spacy download en_core_web_lg"
]
},
{
"cell_type": "code",
"source": [
"from presidio_analyzer import AnalyzerEngine, PatternRecognizer\n",
"from presidio_anonymizer import AnonymizerEngine\n",
"from presidio_anonymizer.entities import OperatorConfig\n",
"import json\n",
"from pprint import pprint"
],
"metadata": {
"id": "Zi2xQ1vg47zL"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Sample paragraph fed to the analyzer and the anonymizer below.\n",
"# Keep the wording byte-for-byte: the character offsets in the recorded\n",
"# analyzer output refer to positions in this exact string.\n",
"text_to_anonymize = (\n",
"    \"His name is Mr. Jones.\\n\"\n",
"    \"He has both French and US passports.\\n\"\n",
"    \"His phone number is tel:212-555-5555.\\n\"\n",
"    \"He worked for Nokia and NYU between 2001 and 2006.\\n\"\n",
"    \"He is making $100,000 a year.\\n\"\n",
"    \"He got is PhD in 2001 from BYU.\\n\"\n",
")"
],
"metadata": {
"id": "QJAKCueP4_zU"
},
"execution_count": 15,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Create an analyzer with its default settings and scan the sample text,\n",
"# treating it as English.\n",
"analyzer = AnalyzerEngine()\n",
"analyzer_results = analyzer.analyze(\n",
"    language=\"en\",\n",
"    text=text_to_anonymize,\n",
")\n",
"\n",
"# Show the raw findings: each entry reports type, start, end and score.\n",
"print(analyzer_results)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "6VYEXUd45A9w",
"outputId": "2c3b5b6c-76b0-47d4-cff3-de8df80f357b"
},
"execution_count": 16,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - CreditCardRecognizer supported languages: es, registry supported languages: en\n",
"WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - CreditCardRecognizer supported languages: it, registry supported languages: en\n",
"WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - CreditCardRecognizer supported languages: pl, registry supported languages: en\n",
"WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - EsNifRecognizer supported languages: es, registry supported languages: en\n",
"WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - EsNieRecognizer supported languages: es, registry supported languages: en\n",
"WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - ItDriverLicenseRecognizer supported languages: it, registry supported languages: en\n",
"WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - ItFiscalCodeRecognizer supported languages: it, registry supported languages: en\n",
"WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - ItVatCodeRecognizer supported languages: it, registry supported languages: en\n",
"WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - ItIdentityCardRecognizer supported languages: it, registry supported languages: en\n",
"WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - ItPassportRecognizer supported languages: it, registry supported languages: en\n",
"WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - PlPeselRecognizer supported languages: pl, registry supported languages: en\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"[type: PERSON, start: 16, end: 21, score: 0.85, type: NRP, start: 35, end: 41, score: 0.85, type: LOCATION, start: 46, end: 48, score: 0.85, type: DATE_TIME, start: 126, end: 147, score: 0.85, type: DATE_TIME, start: 196, end: 200, score: 0.85, type: PHONE_NUMBER, start: 84, end: 96, score: 0.75]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# One operator, registered under the \"DEFAULT\" key, swaps a detected span\n",
"# for a fixed placeholder.\n",
"redaction = OperatorConfig(\"replace\", {\"new_value\": \"<REDACTED>\"})\n",
"\n",
"anonymizer = AnonymizerEngine()\n",
"anonymized_results = anonymizer.anonymize(\n",
"    text=text_to_anonymize,\n",
"    analyzer_results=analyzer_results,\n",
"    operators={\"DEFAULT\": redaction},\n",
")\n",
"\n",
"# Print the redacted version of the sample text.\n",
"print(f\"text: {anonymized_results.text}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "RZWobKFW5yLs",
"outputId": "c79b8e76-071c-4e4c-ff4a-e7da8f241b50"
},
"execution_count": 17,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"text: His name is Mr. <REDACTED>.\n",
"He has both <REDACTED> and <REDACTED> passports.\n",
"His phone number is tel:<REDACTED>.\n",
"He worked for Nokia and NYU <REDACTED>.\n",
"He is making $100,000 a year.\n",
"He got is PhD in <REDACTED> from BYU.\n",
"\n"
]
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment