Last active
June 17, 2022 08:13
-
-
Save quadrismegistus/0fb3821cd00d7879bf6ce3bc9ab954fe to your computer and use it in GitHub Desktop.
lltk-readme.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3.7.7 64-bit", | |
"language": "python", | |
"name": "python37764bit686b23b387564d06bfd55da2c42a5653" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.7-final" | |
}, | |
"colab": { | |
"name": "lltk-readme.ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"include_colab_link": true | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/quadrismegistus/0fb3821cd00d7879bf6ce3bc9ab954fe/lltk-readme.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "CsMBZWcQl83A" | |
}, | |
"source": [ | |
"# lltk\n", | |
"\n", | |
"Literary Language Tool Kit (LLTK): corpora, models, and tools for the study of complex language." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "nB0mhKvnl83B" | |
}, | |
"source": [ | |
"## Install\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "58WUG38Il83B" | |
}, | |
"source": [ | |
"# install\n", | |
"!pip install -qU lltk-dh" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "wdB-rKRxG6yy" | |
}, | |
"source": [ | |
"# terminal commands\n", | |
"!lltk -h" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ef53mRODl83C" | |
}, | |
"source": [ | |
"# or load lltk for use within python\n", | |
"import lltk" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "Osa_OFyVl83C" | |
}, | |
"source": [ | |
"## Load corpus" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "RzMCHl6XHsR_" | |
}, | |
"source": [ | |
"# show corpora in markdown\n", | |
"lltk.show()" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "fPszjt9Ul83C" | |
}, | |
"source": [ | |
"# load a corpus: e.g. the txtLAB450 dataset of 450 En/Fr/Ger novels \n", | |
"corpus = lltk.load('TxtLab')" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "_PKoIizBl83C" | |
}, | |
"source": [ | |
"## Accessing metadata" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "B1kSX0_8l83D" | |
}, | |
"source": [ | |
"# metadata as dataframe\n", | |
"corpus.meta" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "fncfUpXmvaU2" | |
}, | |
"source": [ | |
"# filter metadata\n", | |
"smpl=corpus.meta.query('1770<=year<1780 & language==\"English\"')\n", | |
"smpl" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "76zmAdtkvplV" | |
}, | |
"source": [ | |
"## Plotting metadata" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "F2JwvRxUl83D" | |
}, | |
"source": [ | |
"# Breakdown of novels' language\n", | |
"corpus.meta['language'].value_counts().plot.pie()" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "nP4I0GRul83D" | |
}, | |
"source": [ | |
"# Breakdown of author gender\n", | |
"corpus.meta['gender'].value_counts().plot.pie()" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "bxW_g0hnl83E" | |
}, | |
"source": [ | |
"# Distribution of years per language\n", | |
"corpus.meta.groupby('language').year.plot.density()" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "tveInQQEl83E" | |
}, | |
"source": [ | |
"## Accessing data" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "mLmZcuB5v35C" | |
}, | |
"source": [ | |
"### Most frequent words (MFW)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"tags": [], | |
"id": "gf_OzwLcl83E" | |
}, | |
"source": [ | |
"# Most frequent words overall\n", | |
"corpus.mfw_df()" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"tags": [], | |
"id": "85hP7a22l83E" | |
}, | |
"source": [ | |
"# Or, in more detail:\n", | |
"\n", | |
"# set texts\n", | |
"english_texts = [t for t in corpus.texts() if t.language=='English']\n", | |
"# or: english_texts = corpus.meta.query('language==\"English\"')\n", | |
"\n", | |
"# create dataframe\n", | |
"top_noun_by_period = corpus.mfw_df(\n", | |
" texts=english_texts, # specify texts (otherwise all)\n", | |
" # how words work\n", | |
" only_pos = {'n*'}, # only part(s) of speech (Penn Treebank, n*)\n", | |
" n=25, # Specify number of top words\n", | |
" by_ntext=False, # Count number of documents not number of words\n", | |
" by_fpm=True, # Count by within-text relative sums\n", | |
" min_count=None, # Minimum count of word\n", | |
" excl_stopwords=False, # Exclude stopwords (set in config.txt)\n", | |
" excl_top=200, # Exclude words ranked 1:`not_top`\n", | |
" \n", | |
" # how periods work\n", | |
" yearbin=50, # Average relative counts across `yearbin` periods\n", | |
" col_group='period', # Which column to store `yearbin` period on\n", | |
" n_by_period=None, # Number of top words per period\n", | |
" keep_periods=True, # Keep periods in output dataframe\n", | |
" n_agg='median', # How to aggregate across periods\n", | |
" min_periods=None, # minimum number of periods a word must appear in\n", | |
" valtype='fpm', # valtype to compute top words by\n", | |
" #**attrs\n", | |
")\n", | |
"top_noun_by_period" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "HCeFIYXcl83E" | |
}, | |
"source": [ | |
"# Simple plot\n", | |
"import plotnine as p9\n", | |
"p9.options.figure_size=(11,9)\n", | |
"p9.ggplot(\n", | |
" p9.aes(x='period',y='fpm',label='word',group='word'),\n", | |
" data=top_noun_by_period\n", | |
") + p9.facet_wrap('pos0') + p9.geom_line(alpha=0.25) + p9.geom_text(size=8) + p9.theme_classic()" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "R6Xadd-Il83F" | |
}, | |
"source": [ | |
"## Document-term matrix" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "z9KwR4Tdl83F" | |
}, | |
"source": [ | |
"# Get a document-term matrix\n", | |
"corpus.dtm()" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "N2Swvs999_-G" | |
}, | |
"source": [ | |
"# Filter\n", | |
"tfs = corpus.dtm(\n", | |
" texts=[t for t in corpus.texts() if t.language=='English'],\n", | |
" n=1000,\n", | |
" only_pos={'n*'},\n", | |
" tf=True,\n", | |
" meta=['year']\n", | |
")\n", | |
"fpms = tfs * 1000000\n", | |
"fpms" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "BpBVFc5Cl83G" | |
}, | |
"source": [ | |
"## Most distinctive words" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "frPbarZGIkdN" | |
}, | |
"source": [ | |
"# Set of texts to use\n", | |
"english_texts = [t for t in corpus.texts() if t.language=='English']\n", | |
"\n", | |
"# Set of words to use\n", | |
"top_nouns = corpus.mfw(\n", | |
" n=1000,\n", | |
" texts=english_texts,\n", | |
" only_pos={'n*'} \n", | |
")\n", | |
"\n", | |
"# Make a tfidf dtm\n", | |
"tfidfs=corpus.dtm(\n", | |
" texts=english_texts,\n", | |
" words=top_nouns, \n", | |
" tfidf=True,\n", | |
")\n", | |
"tfidfs" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ya7y8Zsnl83G" | |
}, | |
"source": [ | |
"# sort by text\n", | |
"tfidfs.T['EN_1927_Woolf,Virginia_TotheLighthouse_Novel'].sort_values(ascending=False).head(10)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "qT26sGRbD3VM" | |
}, | |
"source": [ | |
"# sort by text\n", | |
"tfidfs.T['EN_1771_Mackenzie,Henry_TheManofFeeling_Novel'].sort_values(ascending=False).head(10)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "1sFDNPk1l83G" | |
}, | |
"source": [ | |
"# Group tfidf into MDW\n", | |
"corpus.mdw(\n", | |
" 'gender',\n", | |
" texts=corpus.meta.query('language==\"English\" & 1800<=year<1900'),\n", | |
" agg='median',\n", | |
" n=2000,\n", | |
" pos_only={'n*'}\n", | |
")" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "XHE_-a4vl83G" | |
}, | |
"source": [ | |
"## Others...\n", | |
"\n", | |
"Clustering, word2vec, classification, character networks. Documentation forthcoming but check out [the models folder](https://github.com/quadrismegistus/lltk/tree/master/lltk/model)." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"" | |
], | |
"metadata": { | |
"id": "LNnx-j8ptU5g" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment