Created
August 10, 2015 21:25
-
-
Save cmstewart/1703e16f374ee2f2826b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:6c1e3a0587a0f04d4fd364271ca5075ff91211fc60442351747d433f77b6fcf4" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Introducci\u00f3n \n", | |
"\n", | |
"\"\"\"Este proyecto se trata de un trabajo de corpus sobre las diferencias ling\u00fc\u00edsticas entre hablantes nativos y hablantes de segunda lengua\"\"\"\n", | |
"\n", | |
"\n", | |
"## Adquisici\u00f3n de los dados para los hablantes nativos\n", | |
"\n", | |
"import requests\n", | |
"import zipfile\n", | |
"import StringIO\n", | |
"\n", | |
"zip_file_url = 'http://lenguajeacademico.info/proyecto/corpus_CLAE.zip'\n", | |
"r = requests.get(zip_file_url)\n", | |
"z = zipfile.ZipFile(StringIO.StringIO(r.content))\n", | |
"z.extractall()\n", | |
"\n", | |
"%cd corpus_CLAE/\n", | |
"%rm C*.txt" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"/Users/christopherstewart/Desktop/sp_heritage/5/1.4/corpus_CLAE\n" | |
] | |
} | |
], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"## Las 20 palabras m\u00e1s usadas por los hablantes nativos.\n", | |
"\n", | |
"import nltk\n", | |
"from nltk.corpus import PlaintextCorpusReader\n", | |
"from nltk import Text\n", | |
"from nltk import FreqDist\n", | |
"\n", | |
"L1_path = '.'\n", | |
"L1_corp = [w.lower() for w in Text(PlaintextCorpusReader(L1_path, '.*', encoding='utf-8').words()) if w.isalpha()]\n", | |
"L1_toks_fd = FreqDist(L1_corp)\n", | |
"L1_toks_fd.tabulate(15)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" de la que y el en los a se las del es por una un \n", | |
"17965 12392 8313 7765 7301 7087 5289 5143 3694 3637 3263 2805 2626 2514 2351 \n" | |
] | |
} | |
], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"## Las 20 palabras m\u00e1s usadas por los hablantes L2 (del corpus de Maria [email del 25 de mayo])\n", | |
"\n", | |
"L2_path = '.'\n", | |
"L2_corp = [w.lower() for w in Text(PlaintextCorpusReader(L2_path, 'CORPUS_TOTAL.txt', encoding='utf-8').words()) if w.isalpha()]\n", | |
"L2_toks_fd = FreqDist(L2_corp)\n", | |
"L2_toks_fd.tabulate(15)\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" de que los la en a y el no es para por un una se \n", | |
"2654 2154 1464 1400 1365 1234 1174 1167 671 621 568 563 546 510 489 \n" | |
] | |
} | |
], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"## Los 10 pares de palabras m\u00e1s usados en el corpus L1.\n", | |
"\n", | |
"from nltk.util import bigrams\n", | |
"\n", | |
"L1_bigrs = FreqDist(bigrams(L1_corp))\n", | |
"L1_bigrs_top = [i[0] for i in L1_bigrs.most_common(10)]\n", | |
"for x in L1_bigrs_top:\n", | |
" print x[0], x[1]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"de la\n", | |
"de los\n", | |
"en el\n", | |
"de las\n", | |
"en la\n", | |
"a la\n", | |
"que se\n", | |
"la geograf\u00eda\n", | |
"lo que\n", | |
"y la\n" | |
] | |
} | |
], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"## Los 10 pares de palabras m\u00e1s usados en el corpus L2.\n", | |
"\n", | |
"L2_bigrs = FreqDist(bigrams(L2_corp))\n", | |
"L2_bigrs_top = [i[0] for i in L2_bigrs.most_common(10)]\n", | |
"for x in L2_bigrs_top:\n", | |
" print x[0], x[1]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"estados unidos\n", | |
"de los\n", | |
"los inmigrantes\n", | |
"a los\n", | |
"de la\n", | |
"los estados\n", | |
"que los\n", | |
"en el\n", | |
"a la\n", | |
"en los\n" | |
] | |
} | |
], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"## Los 10 'trigrams' m\u00e1s usados en el corpus L1.\n", | |
"\n", | |
"from nltk.util import trigrams\n", | |
"\n", | |
"L1_trigrs = FreqDist(trigrams(L1_corp))\n", | |
"L1_trigrs_top = [i[0] for i in L1_trigrs.most_common(10)]\n", | |
"for x in L1_trigrs_top:\n", | |
" print x[0], x[1], x[2]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"de la geograf\u00eda\n", | |
"de la historia\n", | |
"a trav\u00e9s de\n", | |
"a partir de\n", | |
"una de las\n", | |
"el desarrollo de\n", | |
"por lo que\n", | |
"una serie de\n", | |
"uno de los\n", | |
"esperanza de vida\n" | |
] | |
} | |
], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"## Los 10 'trigrams' m\u00e1s usados en el corpus L2.\n", | |
"\n", | |
"L2_trigrs = FreqDist(trigrams(L2_corp))\n", | |
"L2_trigrs_top = [i[0] for i in L2_trigrs.most_common(10)]\n", | |
"for x in L2_trigrs_top:\n", | |
" print x[0], x[1], x[2]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"los estados unidos\n", | |
"en los estados\n", | |
"que los inmigrantes\n", | |
"en este pa\u00eds\n", | |
"las personas que\n", | |
"a los estudiantes\n", | |
"los ee uu\n", | |
"de los estados\n", | |
"de los inmigrantes\n", | |
"una reforma migratoria\n" | |
] | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"## Los 10 'tetragrams' m\u00e1s usados en el corpus L1.\n", | |
"\n", | |
"from nltk.util import ngrams\n", | |
"\n", | |
"def make_ngrams(input_list, n):\n", | |
" return zip(*[input_list[i:] for i in range(n)])\n", | |
"\n", | |
"L1_tetra = FreqDist(make_ngrams(L1_corp, 4))\n", | |
"\n", | |
"L1_tetragrs_top = [i[0] for i in L1_tetra.most_common(10)]\n", | |
"for x in L1_tetragrs_top:\n", | |
" print x[0], x[1], x[2], x[3]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"la mayor\u00eda de los\n", | |
"\u00edndice de desarrollo humano\n", | |
"a lo largo de\n", | |
"la esperanza de vida\n", | |
"de las naciones unidas\n", | |
"de tal forma que\n", | |
"teor\u00eda de la geograf\u00eda\n", | |
"don quijote de la\n", | |
"en el caso de\n", | |
"de vida al nacer\n" | |
] | |
} | |
], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"## Los 10 'tetragrams' m\u00e1s usados en el corpus L2.\n", | |
"\n", | |
"L2_tetra = FreqDist(make_ngrams(L2_corp, 4))\n", | |
"\n", | |
"L2_tetragrs_top = [i[0] for i in L2_tetra.most_common(10)]\n", | |
"for x in L2_tetragrs_top:\n", | |
" print x[0], x[1], x[2], x[3]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"en los estados unidos\n", | |
"de los estados unidos\n", | |
"a los estados unidos\n", | |
"en los ee uu\n", | |
"mi punto de vista\n", | |
"la mayor\u00eda de los\n", | |
"estoy de acuerdo con\n", | |
"que los inmigrantes no\n", | |
"los estados unidos no\n", | |
"que las personas que\n" | |
] | |
} | |
], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"## Los 10 'pentagrams' m\u00e1s usados en el corpus L1.\n", | |
"\n", | |
"L1_penta = FreqDist(make_ngrams(L1_corp, 5))\n", | |
"\n", | |
"L1_penta_top = [i[0] for i in L1_penta.most_common(10)]\n", | |
"for x in L1_penta_top:\n", | |
" print x[0], x[1], x[2], x[3], x[4]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"esperanza de vida al nacer\n", | |
"don quijote de la mancha\n", | |
"los derechos de los trabajadores\n", | |
"tasa bruta combinada de matriculaci\u00f3n\n", | |
"a lo largo de la\n", | |
"la teor\u00eda de la geograf\u00eda\n", | |
"ense\u00f1anza primaria secundaria y terciaria\n", | |
"tasa de alfabetizaci\u00f3n de adultos\n", | |
"el \u00edndice de desarrollo humano\n", | |
"en ense\u00f1anza primaria secundaria y\n" | |
] | |
} | |
], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"## Los 10 'pentagrams' m\u00e1s usados en el corpus L2.\n", | |
"\n", | |
"L2_penta = FreqDist(make_ngrams(L2_corp, 5))\n", | |
"\n", | |
"L2_penta_top = [i[0] for i in L2_penta.most_common(10)]\n", | |
"for x in L2_penta_top:\n", | |
" print x[0], x[1], x[2], x[3], x[4]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"desde mi punto de vista\n", | |
"la disciplina en las escuelas\n", | |
"congreso de los estados unidos\n", | |
"el congreso de los estados\n", | |
"en mi punto de vista\n", | |
"es el spanglish un idioma\n", | |
"diferencias entre lpac y ard\n", | |
"los estados unidos los inmigrantes\n", | |
"un proyecto de responsabilidad social\n", | |
"de los estados unidos y\n" | |
] | |
} | |
], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"## Los 10 'sextagrams' m\u00e1s usados en el corpus L1.\n", | |
"\n", | |
"L1_sexta = FreqDist(make_ngrams(L1_corp, 6))\n", | |
"\n", | |
"L1_sexta_top = [i[0] for i in L1_sexta.most_common(10)]\n", | |
"for x in L1_sexta_top:\n", | |
" print x[0], x[1], x[2], x[3], x[4], x[5]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"en ense\u00f1anza primaria secundaria y terciaria\n", | |
"de matriculaci\u00f3n en ense\u00f1anza primaria secundaria\n", | |
"tasa bruta combinada de matriculaci\u00f3n en\n", | |
"matriculaci\u00f3n en ense\u00f1anza primaria secundaria y\n", | |
"bruta combinada de matriculaci\u00f3n en ense\u00f1anza\n", | |
"combinada de matriculaci\u00f3n en ense\u00f1anza primaria\n", | |
"perspectivas de poblaci\u00f3n en el mundo\n", | |
"de las perspectivas de poblaci\u00f3n en\n", | |
"la esperanza de vida al nacer\n", | |
"las perspectivas de poblaci\u00f3n en el\n" | |
] | |
} | |
], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"## Los 10 'sextagrams' m\u00e1s usados en el corpus L2.\n", | |
"\n", | |
"L2_sexta = FreqDist(make_ngrams(L2_corp, 6))\n", | |
"\n", | |
"L2_sexta_top = [i[0] for i in L2_sexta.most_common(10)]\n", | |
"for x in L2_sexta_top:\n", | |
" print x[0], x[1], x[2], x[3], x[4], x[5]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"el congreso de los estados unidos\n", | |
"formar un grupo con el fin\n", | |
"el fin de promover las necesidades\n", | |
"grupo con el fin de promover\n", | |
"un grupo con el fin de\n", | |
"la econom\u00eda de los estados unidos\n", | |
"y diferencias entre lpac y ard\n", | |
"similitudes y diferencias entre lpac y\n", | |
"con el fin de promover las\n", | |
"inmigrantes no est\u00e1 disminuyendo sino al\n" | |
] | |
} | |
], | |
"prompt_number": 13 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment