Created
May 21, 2022 18:45
-
-
Save alonsosilvaallende/f14e217df7e2290b4b1383224d46b7ed to your computer and use it in GitHub Desktop.
Untitled2.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2022-05-21T18:45:59.145104Z", | |
"end_time": "2022-05-21T18:45:59.247338Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "%load_ext autoreload\n%autoreload 2", | |
"execution_count": 1, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2022-05-21T18:45:59.260636Z", | |
"end_time": "2022-05-21T18:45:59.309081Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "import re\nfrom pathlib import Path", | |
"execution_count": 2, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2022-05-21T18:45:59.317768Z", | |
"end_time": "2022-05-21T18:45:59.377366Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Data directory (expects docs/prev.txt and docs/borrador.txt below it), built from\n# the current user's home folder instead of one hard-coded account name.\npath = Path.home() / 'Dropbox' / 'Constitutions'", | |
"execution_count": 3, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2022-05-21T18:45:59.385581Z", | |
"end_time": "2022-05-21T18:45:59.451204Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "def read_and_filter(filename):\n    \"\"\"Read a text file and tidy up its line breaks.\n\n    Runs of blank lines are collapsed into a single newline, and a line break\n    (with any whitespace before it) that sits between two word characters is\n    replaced by a space, so hard-wrapped sentences are joined back together.\n\n    Args:\n        filename: Path of the text file to read.\n\n    Returns:\n        The cleaned text as one string.\n    \"\"\"\n    # Decode explicitly as UTF-8: the texts contain accented characters and the\n    # platform default encoding is not UTF-8 everywhere.\n    with open(filename, 'rt', encoding='utf-8') as f:\n        text = f.read()\n    # Raw strings stop \\w and \\s from being read as invalid string escapes.\n    text = re.sub(r'\\n\\n+', '\\n', text)\n    text = re.sub(r'(?<=\\w)\\s*\\n(?=\\w)', ' ', text)\n    # NOTE(review): as written, the next two calls replace a string with itself\n    # and change nothing; the original literals may have held a double space or\n    # a non-printing character that was lost -- confirm against the input files.\n    text = text.replace(' ', ' ')\n    text = text.replace('', '')\n    return text", | |
"execution_count": 4, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2022-05-21T18:45:59.458028Z", | |
"end_time": "2022-05-21T18:45:59.504457Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Load the earlier text (docs/prev.txt) and preview its first 500 characters.\nprev = read_and_filter(path / 'docs/prev.txt')\nprint(prev[:500])", | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "FIJA EL TEXTO REFUNDIDO, COORDINADO Y SISTEMATIZADO DE LA CONSTITUCIÓN POLÍTICA DE LA REPÚBLICA DE CHILE\n Núm. 100.- Santiago, 17 de septiembre de 2005.-\n Visto: En uso de las facultades que me confiere el artículo 2° de la Ley Nº 20.050, y teniendo presente lo dispuesto en el artículo 32 N°8 de la Constitución Política de 1980,\n Decreto:\n Fíjase el siguiente texto refundido, coordinado y sistematizado de la Constitución Política de la República:\n Capítulo I\n BASES DE LA INSTITUCI\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2022-05-21T18:45:59.509213Z", | |
"end_time": "2022-05-21T18:45:59.575341Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Load the draft text (docs/borrador.txt) and preview its first 500 characters.\ntext = read_and_filter(path / 'docs/borrador.txt')\nprint(text[:500])", | |
"execution_count": 6, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "•\nEste documento es un consolidado que reúne las normas aprobadas por el Pleno de la Convención Constitucional, ordenadas por comisión. La relación de números de los artículos obedece a lo dispuesto en los respectivos informes y no es el orden definitivo, ya que ese proceso deberá ser realizado por la Comisión de Armonización\n(Actualizado el 14.05.22).\nCONSOLIDADO NORMAS APROBADAS PARA LA PROPUESTA CONSTITUCIONAL POR EL PLENO DE LA CONVENCIÓN CAPÍTULO (COM 1)\nDE LA DEMOCRACIA 1.- Artículo 2°.- D\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2022-05-21T18:45:59.579981Z", | |
"end_time": "2022-05-21T18:46:04.262361Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "import spacy\n\n# Spanish spaCy pipeline used for the part-of-speech and morphology tagging.\nnlp = spacy.load('es_core_news_sm')", | |
"execution_count": 7, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2022-05-21T18:46:04.264969Z", | |
"end_time": "2022-05-21T18:46:19.068355Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Lower-case both texts and run each one through the spaCy pipeline.\ndoc_prev, doc_new = (nlp(raw.lower()) for raw in (prev, text))", | |
"execution_count": 8, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2022-05-21T18:46:19.074076Z", | |
"end_time": "2022-05-21T18:46:19.234341Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Tokens tagged as nouns, for the earlier and the new document respectively.\nnouns_prev, nouns_new = (\n    [tok for tok in doc if tok.pos_ == 'NOUN']\n    for doc in (doc_prev, doc_new)\n)", | |
"execution_count": 9, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2022-05-21T18:46:19.239032Z", | |
"end_time": "2022-05-21T18:46:19.347063Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Tokens tagged as adjectives, for the earlier and the new document respectively.\nadj_prev, adj_new = (\n    [tok for tok in doc if tok.pos_ == 'ADJ']\n    for doc in (doc_prev, doc_new)\n)", | |
"execution_count": 10, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2022-05-21T18:46:19.350096Z", | |
"end_time": "2022-05-21T18:46:19.414888Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Noun and adjective counts: (nouns prev, adjectives prev, nouns new, adjectives new).\ntuple(map(len, (nouns_prev, adj_prev, nouns_new, adj_new)))", | |
"execution_count": 11, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 11, | |
"data": { | |
"text/plain": "(8343, 3300, 14633, 5410)" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2022-05-21T18:46:19.417095Z", | |
"end_time": "2022-05-21T18:46:19.527890Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Adjectives per noun in the earlier text, then in the new one.\nprint(f\"Proporción adjetivos/sustantivos antigua: {len(adj_prev) / len(nouns_prev):.3f}\")\nprint(f\"Proporción adjetivos/sustantivos nueva: {len(adj_new) / len(nouns_new):.3f}\")", | |
"execution_count": 12, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "Proporción adjetivos/sustantivos antigua: 0.396\nProporción adjetivos/sustantivos nueva: 0.370\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2022-05-21T18:46:19.539397Z", | |
"end_time": "2022-05-21T18:46:19.683402Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Morphological features of every noun, as one plain dict per token.\nnouns_prev_dict = [noun.morph.to_dict() for noun in nouns_prev]\nnouns_new_dict = [noun.morph.to_dict() for noun in nouns_new]", | |
"execution_count": 13, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2022-05-21T18:46:19.689146Z", | |
"end_time": "2022-05-21T18:46:19.799027Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Collect the 'Gender' feature of every noun that has one; nouns whose feature\n# dict carries no 'Gender' key are skipped.\ngenders_prev = [feats['Gender'] for feats in nouns_prev_dict if 'Gender' in feats]\ngenders_new = [feats['Gender'] for feats in nouns_new_dict if 'Gender' in feats]", | |
"execution_count": 14, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2022-05-21T18:46:19.803567Z", | |
"end_time": "2022-05-21T18:46:19.905935Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Gender counts: (prev Masc, prev Fem, new Masc, new Fem).\ntuple(\n    genders.count(tag)\n    for genders in (genders_prev, genders_new)\n    for tag in ('Masc', 'Fem')\n)", | |
"execution_count": 15, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 15, | |
"data": { | |
"text/plain": "(4037, 3676, 6168, 7400)" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2022-05-21T18:46:19.911115Z", | |
"end_time": "2022-05-21T18:46:19.994379Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "def _gender_share(genders, tag):\n    \"\"\"Return the percentage of `tag` among the 'Masc' and 'Fem' entries of `genders`.\"\"\"\n    return 100 * genders.count(tag) / (genders.count('Masc') + genders.count('Fem'))\n\n\n# Share of masculine, then feminine, nouns in the earlier and the new text.\nprint(f\"Porcentaje de sustantivos masculinos antigua: {_gender_share(genders_prev, 'Masc'):.1f}%\")\nprint(f\"Porcentaje de sustantivos masculinos nueva: {_gender_share(genders_new, 'Masc'):.1f}%\")\nprint(f\"Porcentaje de sustantivos femeninos antigua: {_gender_share(genders_prev, 'Fem'):.1f}%\")\nprint(f\"Porcentaje de sustantivos femeninos nueva: {_gender_share(genders_new, 'Fem'):.1f}%\")", | |
"execution_count": 16, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "Porcentaje de sustantivos masculinos antigua: 52.3%\nPorcentaje de sustantivos masculinos nueva: 45.5%\nPorcentaje de sustantivos femeninos antigua: 47.7%\nPorcentaje de sustantivos femeninos nueva: 54.5%\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "", | |
"execution_count": null, | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python" | |
}, | |
"language_info": { | |
"name": "python", | |
"version": "3.10.4", | |
"mimetype": "text/x-python", | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"pygments_lexer": "ipython3", | |
"nbconvert_exporter": "python", | |
"file_extension": ".py" | |
}, | |
"gist": { | |
"id": "", | |
"data": { | |
"description": "Untitled2.ipynb", | |
"public": true | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment