{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "## Imports"
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "import re\n\nimport numpy as np\nimport lda\nimport nltk\nimport gensim\nfrom nltk.data import find\nfrom nltk.corpus import wordnet as wn\nfrom nltk.corpus import stopwords\nfrom nltk.tokenize import regexp_tokenize\nfrom sklearn.feature_extraction.text import CountVectorizer\nfrom gensim.models import Word2Vec\n",
"execution_count": 1,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Functions"
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "def load_text(file_path):\n \"\"\"Given a file path, loads a .txt file.\n Also removes chapter and section headings.\n Returns a single string.\n \"\"\"\n with open (file_path, 'r', encoding='utf-8') as jsm:\n text = jsm.read()\n \n return re.sub('\\s+', ' ',\n re.sub(r'[A-Z]{2,}', '',\n re.sub('((?<=[A-Z])\\sI | I\\s(?=[A-Z]))', ' ', text)))",
"execution_count": 2,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "def split_on_sentence(text):\n \"\"\"Tokenize the text on sentences.\n Returns a list of strings (sentences).\n \"\"\"\n sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')\n return sent_tokenizer.tokenize(text)",
"execution_count": 3,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "def re_punc(text):\n \"\"\"Remove all punctuation. Keep apostrophes.\"\"\"\n return re.sub(r'[!\"#$%&()*+,\\-\\./:;<=>?@\\[\\]^_`\\\\{\\|}]+', '', text)",
"execution_count": 4,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "def remove_punctuation(sentences):\n \"\"\"Remove punctuation based on `re_punc`.\n Returns either a list of string or a single string,\n based on the input type.\n \"\"\"\n if type(sentences) is list:\n return [re_punc(sentence).strip() for sentence in sentences]\n else:\n return re_punc(sentences).strip()",
"execution_count": 5,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "def split_on_word(text):\n \"\"\"Use regular expression tokenizer.\n Keep apostrophes.\n Returns a list of lists, one list for each sentence:\n [[word, word], [word, word, ..., word], ...].\n \"\"\"\n if type(text) is list:\n return [regexp_tokenize(sentence, pattern=\"\\w+(?:[-']\\w+)*\") for sentence in text]\n else:\n return regexp_tokenize(text, pattern=\"\\w+(?:[-']\\w+)*\")",
"execution_count": 6,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "def normalize(tokenized_words):\n \"\"\"Removes stop words, numbers, short words, and lowercases text.\n Returns a list of lists, one list for each sentence:\n [[word, word], [word, word, ..., word], ...].\n \"\"\"\n stop_words = stopwords.words('english')\n return [[w.lower() for w in sent\n if (w.lower() not in stop_words) and\\\n (not(w.lower().isnumeric())) and\\\n (len(w) > 2)]\n for sent in tokenized_words]",
"execution_count": 7,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "text = load_text('/Users/JS/Code/_INFO256/text-collection/jsm-collection.txt')",
"execution_count": 8,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "sentences = split_on_sentence(text)",
"execution_count": 9,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "words = split_on_word(remove_punctuation(sentences))",
"execution_count": 10,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "words_norm = normalize(words)",
"execution_count": 11,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "sents_norm = [' '.join(s) for s in words_norm]",
"execution_count": 12,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "vectorizer = CountVectorizer(analyzer = \"word\", \n tokenizer = None, \n preprocessor = None, \n stop_words = None)",
"execution_count": 13,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "X = vectorizer.fit_transform(sents_norm).toarray()",
"execution_count": 14,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "vocab = vectorizer.get_feature_names()",
"execution_count": 15,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Latent Dirichlet Allocation"
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "model = lda.LDA(n_topics=5, n_iter=1500, random_state=1868)\nmodel.fit(X)",
"execution_count": 16,
"outputs": [
{
"output_type": "stream",
"text": "WARNING:lda:all zero row in document-term matrix found\n",
"name": "stderr"
},
{
"output_type": "execute_result",
"data": {
"text/plain": "<lda.lda.LDA at 0x114771518>"
},
"metadata": {},
"execution_count": 16
}
]
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "topic_word = model.topic_word_",
"execution_count": 17,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "n_top_words = 26",
"execution_count": 18,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "for i, topic_dist in enumerate(topic_word):\n topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]\n print('Topic ' + str(i) + ': ', end='\\n')\n print(' '.join(topic_words), end='\\n\\n')",
"execution_count": 19,
"outputs": [
{
"output_type": "stream",
"text": "Topic 0: \none laws causes cause law may effect would two nature case phenomena cases another must circumstances effects different therefore every method known number even phenomenon\n\nTopic 1: \none would government may even power people human society good men much general persons every great state life opinion others public person moral without though\n\nTopic 2: \nwould capital labor money value production wages country cost one price demand increase produce may england commodities amount quantity land profits rate much less profit\n\nTopic 3: \npolitical economy time first see book great years made father history also work new states review chapter philosophy two principles english united chap last french\n\nTopic 4: \none may general name things proposition must science true thing word meaning two propositions nature therefore subject class names mind case truth every fact first\n\n",
"name": "stdout"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "I decided to try to find topics in my text collection using LDA. I initially had some issues getting `gensim` to work, so I used [this implementation](https://github.com/ariddell/lda/). (I was eventually able to install `gensim`.) The \"documents\" in my document-term matrix were sentences from my text collection. I used my code from the keyphrase extraction assignment to process the text and then used Scikit-Learn's `CountVectorizer` to create the document-term matrix. I tried several values for the `n_topics` parameter and settled on five. Because my text collection includes all of Mill's available works from Project Gutenberg, there is variety in topics. Had I used a single text, the results may not have been as good as they were. The challenge with LDA (and perhaps k-means, too) is how to describe the topics that result. Below is my attempt.\n\nTopic descriptions:\n\n* **0**: Law\n* **1**: Government\n* **2**: Economics\n* **3**: Political Economy\n* **4**: Science"
},
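{
"metadata": {},
"cell_type": "markdown",
"source": "For reference, a roughly equivalent fit with `gensim`'s `LdaModel` would look something like the sketch below, reusing `words_norm` and the same five-topic setting. This is illustrative only, not the run whose output is shown above."
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# Sketch only: an alternative LDA fit using gensim, reusing `words_norm` from above.\n# Assumes the same five-topic setting as the lda.LDA model; parameters are illustrative.\nfrom gensim import corpora, models\n\ndictionary = corpora.Dictionary(words_norm)\ncorpus = [dictionary.doc2bow(sent) for sent in words_norm]\nlda_gensim = models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=10)\nlda_gensim.print_topics(num_topics=5, num_words=10)",
"execution_count": null,
"outputs": []
},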
{
"metadata": {},
"cell_type": "markdown",
"source": "## Word2Vec"
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))\nmodel = gensim.models.Word2Vec.load_word2vec_format(word2vec_sample, binary=False)",
"execution_count": 20,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Nouns"
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "five_nouns = ['abeyance', 'arsenic', 'women', 'constitution', 'ethics']\npos='n'",
"execution_count": 21,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "print('NOUNS', end='\\n\\n')\nfor w in five_nouns:\n print(w, end='\\n')\n print('Word2Vec', [t[0] for t in model.most_similar(positive=w, topn = 5)], end='\\n')\n print('WordNet (Synset)', [s for s in wn.synsets(w, pos)][:3], end='\\n')\n print('WordNet (Lemmas)', [s.lemma_names() for s in wn.synsets(w, pos)][:3], end='\\n\\n')",
"execution_count": 22,
"outputs": [
{
"output_type": "stream",
"text": "NOUNS\n\nabeyance\nWord2Vec ['pending', 'limbo', 'quashed', 'cognizance', 'indefinitely']\nWordNet (Synset) [Synset('abeyance.n.01')]\nWordNet (Lemmas) [['abeyance', 'suspension']]\n\narsenic\nWord2Vec ['chromium', 'nitrate', 'nitrates', 'cadmium', 'benzene']\nWordNet (Synset) [Synset('arsenic.n.01'), Synset('arsenic.n.02')]\nWordNet (Lemmas) [['arsenic', 'arsenic_trioxide', 'arsenous_anhydride', 'arsenous_oxide', 'white_arsenic', 'ratsbane'], ['arsenic', 'As', 'atomic_number_33']]\n\nwomen\nWord2Vec ['men', 'Women', 'girls', 'females', 'mothers']\nWordNet (Synset) [Synset('woman.n.01'), Synset('woman.n.02'), Synset('charwoman.n.01')]\nWordNet (Lemmas) [['woman', 'adult_female'], ['woman'], ['charwoman', 'char', 'cleaning_woman', 'cleaning_lady', 'woman']]\n\nconstitution\nWord2Vec ['Constitution', 'constitutional', 'constitutions', 'Constitutions', 'Constitutional']\nWordNet (Synset) [Synset('fundamental_law.n.01'), Synset('constitution.n.02'), Synset('united_states_constitution.n.01')]\nWordNet (Lemmas) [['fundamental_law', 'organic_law', 'constitution'], ['constitution', 'establishment', 'formation', 'organization', 'organisation'], ['United_States_Constitution', 'U.S._Constitution', 'US_Constitution', 'Constitution', 'Constitution_of_the_United_States']]\n\nethics\nWord2Vec ['Ethics', 'ethical', 'morality', 'morals', 'impropriety']\nWordNet (Synset) [Synset('ethical_motive.n.01'), Synset('ethics.n.02'), Synset('ethic.n.01')]\nWordNet (Lemmas) [['ethical_motive', 'ethics', 'morals', 'morality'], ['ethics', 'moral_philosophy'], ['ethic', 'moral_principle', 'value-system', 'value_orientation']]\n\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "five_adjectives = ['philosophical', 'honorable', 'excellent', 'historical', 'opposed']\npos = 'a'",
"execution_count": 23,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "print('ADJECTIVES', end='\\n\\n')\nfor w in five_adjectives:\n print(w, end='\\n')\n print('Word2Vec', [t[0] for t in model.most_similar(positive=w, topn = 5)], end='\\n')\n print('WordNet (Synset)', [s for s in wn.synsets(w, pos)][:3], end='\\n')\n print('WordNet (Lemmas)', [s.lemma_names() for s in wn.synsets(w, pos)][:3], end='\\n\\n')",
"execution_count": 24,
"outputs": [
{
"output_type": "stream",
"text": "ADJECTIVES\n\nphilosophical\nWord2Vec ['philosophic', 'metaphysical', 'theological', 'philosophically', 'humanistic']\nWordNet (Synset) [Synset('philosophic.a.01'), Synset('philosophical.s.02')]\nWordNet (Lemmas) [['philosophic', 'philosophical'], ['philosophical', 'philosophic']]\n\nhonorable\nWord2Vec ['honorably', 'noble', 'dignified', 'selfless', 'sincere']\nWordNet (Synset) [Synset('honest.a.01'), Synset('honorable.a.02'), Synset('ethical.s.03')]\nWordNet (Lemmas) [['honest', 'honorable'], ['honorable', 'honourable'], ['ethical', 'honorable', 'honourable']]\n\nexcellent\nWord2Vec ['terrific', 'superb', 'exceptional', 'fantastic', 'good']\nWordNet (Synset) [Synset('excellent.s.01')]\nWordNet (Lemmas) [['excellent', 'first-class', 'fantabulous', 'splendid']]\n\nhistorical\nWord2Vec ['historic', 'Historical', 'archeological', 'archaeological', 'historians']\nWordNet (Synset) [Synset('historical.a.01'), Synset('historical.s.02'), Synset('historic.s.01')]\nWordNet (Lemmas) [['historical'], ['historical'], ['historic', 'historical']]\n\nopposed\nWord2Vec ['oppose', 'opposes', 'objected', 'advocated', 'objecting']\nWordNet (Synset) [Synset('opposed.a.01')]\nWordNet (Lemmas) [['opposed']]\n\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "five_verbs = ['commenced', 'accomplish', 'visit', 'discuss', 'consider']\npos = 'v'",
"execution_count": 25,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "print('VERBS', end='\\n\\n')\nfor w in five_verbs:\n print(w, end='\\n')\n print('Word2Vec', [t[0] for t in model.most_similar(positive=w, topn = 5)], end='\\n')\n print('WordNet (Synset)', [s for s in wn.synsets(w, pos)][:3], end='\\n')\n print('WordNet (Lemmas)', [s.lemma_names() for s in wn.synsets(w, pos)][:3], end='\\n\\n')",
"execution_count": 26,
"outputs": [
{
"output_type": "stream",
"text": "VERBS\n\ncommenced\nWord2Vec ['commence', 'commencing', 'commences', 'recommence', 'initiated']\nWordNet (Synset) [Synset('get_down.v.07'), Synset('begin.v.03'), Synset('start.v.06')]\nWordNet (Lemmas) [['get_down', 'begin', 'get', 'start_out', 'start', 'set_about', 'set_out', 'commence'], ['begin', 'lead_off', 'start', 'commence'], ['start', 'start_up', 'embark_on', 'commence']]\n\naccomplish\nWord2Vec ['accomplishing', 'accomplished', 'achieve', 'accomplishes', 'attain']\nWordNet (Synset) [Synset('carry_through.v.01'), Synset('achieve.v.01')]\nWordNet (Lemmas) [['carry_through', 'accomplish', 'execute', 'carry_out', 'action', 'fulfill', 'fulfil'], ['achieve', 'accomplish', 'attain', 'reach']]\n\nvisit\nWord2Vec ['visiting', 'visits', 'visited', 'trip', 'Visiting']\nWordNet (Synset) [Synset('visit.v.01'), Synset('travel_to.v.01'), Synset('visit.v.03')]\nWordNet (Lemmas) [['visit', 'see'], ['travel_to', 'visit'], ['visit', 'call_in', 'call']]\n\ndiscuss\nWord2Vec ['discussed', 'discussing', 'talk', 'discusses', 'examine']\nWordNet (Synset) [Synset('discourse.v.01'), Synset('hash_out.v.01')]\nWordNet (Lemmas) [['discourse', 'talk_about', 'discuss'], ['hash_out', 'discuss', 'talk_over']]\n\nconsider\nWord2Vec ['considering', 'contemplate', 'reconsider', 'recommend', 'considered']\nWordNet (Synset) [Synset('see.v.05'), Synset('study.v.03'), Synset('consider.v.03')]\nWordNet (Lemmas) [['see', 'consider', 'reckon', 'view', 'regard'], ['study', 'consider'], ['consider', 'take', 'deal', 'look_at']]\n\n",
"name": "stdout"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Overall, Word2Vec performed better than WordNet Synsets. For WordNet, I printed both the Synsets and the lemmas.\n\nThe \"arsenic\" example was interesting. Word2Vec gave related words *other* than arsenic, such as \"chromium\" and \"nitrate,\" while WordNet results gave results such as \"arsenic_trioxide\" and \"arsenous_oxide.\" For \"women,\" Word2Vec returned \"Women.\" Because of this, I tried editing the words in `pruned.word2vec.txt`, but was unable to. In some cases, WordNet gave terms that were more helpful in understanding the search term. For \"constitution,\" for example, Word2Vec returned words like \"constitutional\" and \"Constitutions,\" but WordNet returned \"fundamental_law\" and \"establishment.\" For vebs, the results were quite even."
}
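{
"metadata": {},
"cell_type": "markdown",
"source": "One way to work around case-variant duplicates such as \"Women\" for \"women\", without editing `pruned.word2vec.txt`, would be to over-request neighbors and filter them case-insensitively. The helper below (`most_similar_nocase`, written for this note, not part of gensim) is a sketch of that idea."
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# Sketch: drop case-variant duplicates (e.g. 'Women' for 'women') from the\n# Word2Vec neighbors instead of editing pruned.word2vec.txt.\n# `most_similar_nocase` is a helper written here, not part of gensim.\ndef most_similar_nocase(model, word, topn=5):\n    \"\"\"Return up to `topn` neighbors, skipping case-insensitive duplicates.\"\"\"\n    seen = {word.lower()}\n    results = []\n    for neighbor, _ in model.most_similar(positive=word, topn=topn * 3):\n        if neighbor.lower() not in seen:\n            seen.add(neighbor.lower())\n            results.append(neighbor)\n        if len(results) == topn:\n            break\n    return results\n\nmost_similar_nocase(model, 'women')",
"execution_count": null,
"outputs": []
}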
],
"metadata": {
"language_info": {
"file_extension": ".py",
"codemirror_mode": {
"version": 3,
"name": "ipython"
},
"mimetype": "text/x-python",
"version": "3.4.2",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"name": "python"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}