Last active
October 10, 2024 01:05
-
-
Save sahuguet/9d98c0287eb6641b1a87822509633d46 to your computer and use it in GitHub Desktop.
Presidio example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"authorship_tag": "ABX9TyNcUZJO94FrY1ndKHIXI2R1", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/sahuguet/9d98c0287eb6641b1a87822509633d46/untitled8.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "QWgSB3ca4n0P" | |
}, | |
"outputs": [], | |
"source": [ | |
"# download presidio\n", | |
"!pip install presidio_analyzer presidio_anonymizer > /dev/null\n", | |
"!python -m spacy download en_core_web_lg" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from presidio_analyzer import AnalyzerEngine, PatternRecognizer\n", | |
"from presidio_anonymizer import AnonymizerEngine\n", | |
"from presidio_anonymizer.entities import OperatorConfig\n", | |
"import json\n", | |
"from pprint import pprint" | |
], | |
"metadata": { | |
"id": "Zi2xQ1vg47zL" | |
}, | |
"execution_count": 2, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"text_to_anonymize = \"\"\"His name is Mr. Jones.\n", | |
"He has both French and US passports.\n", | |
"His phone number is tel:212-555-5555.\n", | |
"He worked for Nokia and NYU between 2001 and 2006.\n", | |
"He is making $100,000 a year.\n", | |
"He got is PhD in 2001 from BYU.\n", | |
"\"\"\"" | |
], | |
"metadata": { | |
"id": "QJAKCueP4_zU" | |
}, | |
"execution_count": 15, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"analyzer = AnalyzerEngine()\n", | |
"analyzer_results = analyzer.analyze(text=text_to_anonymize, language='en')\n", | |
"\n", | |
"print(analyzer_results)" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "6VYEXUd45A9w", | |
"outputId": "2c3b5b6c-76b0-47d4-cff3-de8df80f357b" | |
}, | |
"execution_count": 16, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stderr", | |
"text": [ | |
"WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - CreditCardRecognizer supported languages: es, registry supported languages: en\n", | |
"WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - CreditCardRecognizer supported languages: it, registry supported languages: en\n", | |
"WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - CreditCardRecognizer supported languages: pl, registry supported languages: en\n", | |
"WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - EsNifRecognizer supported languages: es, registry supported languages: en\n", | |
"WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - EsNieRecognizer supported languages: es, registry supported languages: en\n", | |
"WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - ItDriverLicenseRecognizer supported languages: it, registry supported languages: en\n", | |
"WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - ItFiscalCodeRecognizer supported languages: it, registry supported languages: en\n", | |
"WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - ItVatCodeRecognizer supported languages: it, registry supported languages: en\n", | |
"WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - ItIdentityCardRecognizer supported languages: it, registry supported languages: en\n", | |
"WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - ItPassportRecognizer supported languages: it, registry supported languages: en\n", | |
"WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - PlPeselRecognizer supported languages: pl, registry supported languages: en\n" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"[type: PERSON, start: 16, end: 21, score: 0.85, type: NRP, start: 35, end: 41, score: 0.85, type: LOCATION, start: 46, end: 48, score: 0.85, type: DATE_TIME, start: 126, end: 147, score: 0.85, type: DATE_TIME, start: 196, end: 200, score: 0.85, type: PHONE_NUMBER, start: 84, end: 96, score: 0.75]\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"anonymizer = AnonymizerEngine()\n", | |
"\n", | |
"anonymized_results = anonymizer.anonymize(\n", | |
" text=text_to_anonymize,\n", | |
" analyzer_results=analyzer_results,\n", | |
" operators={\"DEFAULT\": OperatorConfig(\"replace\", {\"new_value\": \"<REDACTED>\"})})\n", | |
"\n", | |
"print(f\"text: {anonymized_results.text}\")" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "RZWobKFW5yLs", | |
"outputId": "c79b8e76-071c-4e4c-ff4a-e7da8f241b50" | |
}, | |
"execution_count": 17, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"text: His name is Mr. <REDACTED>.\n", | |
"He has both <REDACTED> and <REDACTED> passports.\n", | |
"His phone number is tel:<REDACTED>.\n", | |
"He worked for Nokia and NYU <REDACTED>.\n", | |
"He is making $100,000 a year.\n", | |
"He got is PhD in <REDACTED> from BYU.\n", | |
"\n" | |
] | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment