Created
March 18, 2023 08:25
-
-
Save jarnaldich/24ece34b6fb441c3ef8878a39a265b82 to your computer and use it in GitHub Desktop.
[Near Duplicate Detection] #data #qc #jupyter #python #nltk #jaccard #levenshtein
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"language_info": { | |
"codemirror_mode": { | |
"name": "python", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8" | |
}, | |
"kernelspec": { | |
"name": "python", | |
"display_name": "Python (Pyodide)", | |
"language": "python" | |
} | |
}, | |
"nbformat_minor": 4, | |
"nbformat": 4, | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"source": "import numbers\nimport pandas as pd\nfrom pandas.api.types import is_string_dtype\nfrom js import fetch\nfrom collections import defaultdict\nimport nltk\nimport matplotlib", | |
"metadata": { | |
"trusted": true | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": "%%javascript\nwindow.saveJSONP = async (urlString, file_path, mime_type='text/json', binary=false) => {\n const sc = document.createElement('script');\n var url = new URL(urlString);\n url.searchParams.append('callback', 'window.corsCallBack');\n \n sc.src = url.toString();\n\n window.corsCallBack = async (data) => {\n console.log(data);\n\n // Open (or create) the file storage\n var open = indexedDB.open('JupyterLite Storage');\n\n // Create the schema\n open.onupgradeneeded = function() {\n throw Error('Error opening IndexedDB. Should not ever need to upgrade JupyterLite Storage Schema');\n };\n\n open.onsuccess = function() {\n // Start a new transaction\n var db = open.result;\n var tx = db.transaction(\"files\", \"readwrite\");\n var store = tx.objectStore(\"files\");\n\n var now = new Date();\n\n var value = {\n 'name': file_path.split(/[\\\\/]/).pop(),\n 'path': file_path,\n 'format': binary ? 'binary' : 'text',\n 'created': now.toISOString(),\n 'last_modified': now.toISOString(),\n 'content': JSON.stringify(data),\n 'mimetype': mime_type,\n 'type': 'file',\n 'writable': true\n }; \n\n const countRequest = store.count(file_path);\n countRequest.onsuccess = () => {\n console.log(countRequest.result);\n if(countRequest.result > 0) {\n store.put(value, file_path);\n } else {\n store.add(value, file_path);\n } \n }; \n\n // Close the db when the transaction is done\n tx.oncomplete = function() {\n db.close();\n };\n }\n }\n\n document.getElementsByTagName('head')[0].appendChild(sc);\n}\n", | |
"metadata": { | |
"trusted": true | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": "%%javascript\nvar url = 'https://opendata-ajuntament.barcelona.cat/data/es/api/3/action/datastore_search?resource_id=69ae574f-adfc-4660-8f81-73103de169ff'\nwindow.saveJSONP(url, 'data/menors.json')\n", | |
"metadata": { | |
"trusted": true | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": "import json\nimport pandas as pd\n\nwith open('data/menors.json', 'r') as f:\n data = json.load(f)\n \ndf = pd.read_json(json.dumps(data['result']['records']))", | |
"metadata": { | |
"trusted": true | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": "def near_duplicates(factors, min_jaccard: float, max_levenshtein: int):\n trigrams = [ set(''.join(g) for g in nltk.ngrams(f, 3)) for f in factors ]\n jaccard = dict()\n levenshtein = dict()\n for i in range(len(factors)):\n for j in range(i+1, len(factors)):\n denom = float(len(trigrams[i] | trigrams[j]))\n if denom > 0:\n jaccard[(i,j)] = float(len(trigrams[i] & trigrams[j])) / denom\n else:\n jaccard[(i,j)] = np.NaN\n levenshtein[(i,j)] = nltk.edit_distance(factors[i], factors[j])\n\n acum = []\n for (i,j),v in jaccard.items():\n if v >= min_jaccard and levenshtein[(i,j)] <= max_levenshtein: \n acum.append([i,j,factors[i], factors[j], jaccard[(i,j)], levenshtein[(i,j)]])\n\n return pd.DataFrame(acum, columns=['i', 'j', 'factor_i', 'factor_j', 'jaccard_ij', 'levenshtein_ij'])", | |
"metadata": { | |
"trusted": true | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": "def df_dups(df, cols=None, except_cols=[], min_jaccard=0.3, max_levenshtein=4):\n acum = []\n \n if cols is None:\n cols = df.columns\n\n if isinstance(min_jaccard, numbers.Number):\n mj = defaultdict(lambda : min_jaccard)\n else:\n mj = min_jaccard\n\n if isinstance(max_levenshtein, numbers.Number):\n ml = defaultdict(lambda: max_levenshtein)\n else:\n ml = max_levenshtein\n\n for c in cols:\n\n if c in except_cols or not is_string_dtype(df[c]):\n continue\n \n print(c)\n\n factors = df[c].factorize()[1]\n col_dups = near_duplicates(factors, mj[c], ml[c])\n col_dups['col'] = c\n acum.append(col_dups)\n\n return pd.concat(acum)", | |
"metadata": { | |
"trusted": true | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": "df_dups(df, cols=['Proveïdor', 'Objecte del contracte', \n 'Tipus Contracte'])", | |
"metadata": { | |
"trusted": true | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": "", | |
"metadata": {}, | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment