This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### R code from vignette source 'Presentation_2.Rnw'
### Encoding: UTF-8

###################################################
### code chunk number 1: init
###################################################
# Set the console output width option to 60 characters per line.
options(width = 60)

###################################################
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import requests | |
| from bs4 import BeautifulSoup | |
| import nltk | |
# Download the NLTK "stopwords" and "vader_lexicon" resources.
for resource in ("stopwords", "vader_lexicon"):
    nltk.download(resource)

### Pull
# Request the Project Gutenberg text file and keep the raw response bytes.
url = 'http://www.gutenberg.org/files/501/501-0.txt'
res = requests.get(url)
html_page = res.content
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk.tokenize import RegexpTokenizer

# Tokenise on runs of word characters and lower-case every token in one pass.
tokenizer = RegexpTokenizer(r'\w+')
tokens = [word.lower() for word in tokenizer.tokenize(text)]

# Report the longest token. To also report its length, append
#   + " is " + str(len(max(tokens, key=len))) + " characters long"
# to the printed expression.
print("Longest word in text: " + max(tokens, key=len))

## Longest real word
# Drop "cutterigsloop" (presumably not a real word -- confirm) so that a later
# longest-word lookup skips it.
tokens = list(filter(lambda word: word != "cutterigsloop", tokens))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def Flesch_Kincaid(text):
    """Estimate the Flesch-Kincaid grade level of *text*.

    The score is ``0.39 * (words / sentences) + 11.8 * (syllables / words)
    - 15.59``.

    Sentences are delimited by runs of ``.``, ``!`` or ``?``; fragments that
    contain no words (for example the empty string after a final full stop)
    are ignored so they do not drag the average sentence length down.  Words
    are whitespace-separated tokens.  Syllables are approximated as the number
    of vowel groups (``a e i o u y``, case-insensitive) in each word, with a
    minimum of one syllable per word.

    Args:
        text: The passage to score.

    Returns:
        The grade level as a float.  For text without any words both averages
        are taken as zero, which yields ``-15.59``.
    """
    import re  # local import so the function does not rely on file-level imports

    # Each sentence becomes its list of words; word-less fragments are dropped.
    sentences = [
        words
        for words in (part.split() for part in re.split(r'[.!?]+', text))
        if words
    ]
    words = [word for sentence in sentences for word in sentence]

    if not words:
        # Nothing to average over; avoid dividing by zero.
        avg_sentence_len = 0.0
        mean_syllables_per_word = 0.0
    else:
        avg_sentence_len = len(words) / float(len(sentences))
        # A run of adjacent vowels counts as one syllable ("book" -> 1).
        syllables = sum(
            max(1, len(re.findall(r'[aeiouy]+', word.lower())))
            for word in words
        )
        mean_syllables_per_word = syllables / float(len(words))

    return (0.39 * avg_sentence_len) + (11.8 * mean_syllables_per_word) - 15.59
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def text_processing(input_text):
    """Tokenise *input_text*, drop English stop words and lemmatise the rest.

    Stop words are removed both before and after lemmatisation.  Filtering
    only afterwards lets stop words slip through whenever the lemmatiser
    alters them into a form that is no longer in the stop-word list
    (presumably how "wa" and "u" arose from "was" and "us" -- the original
    patched just those two by hand).  Filtering the raw tokens first removes
    that whole class of artefacts; the post-lemmatisation filter and the
    explicit "wa"/"u" exclusion are kept so nothing the original removed is
    let back in.

    Args:
        input_text: The text to process.  It is compared against NLTK's
            lower-case English stop-word list as-is, so callers should
            lower-case it beforehand.

    Returns:
        A list of lemmatised tokens, in order of appearance, with stop words
        removed.
    """
    stops = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    # Known lemmatiser artefacts excluded by the original implementation.
    weird = {"wa", "u"}

    raw_tokens = tokenizer.tokenize(input_text)
    # Filter before lemmatising so mangled stop words never get created.
    content_tokens = (token for token in raw_tokens if token not in stops)
    lemmas = (lemmatizer.lemmatize(token) for token in content_tokens)
    # Filter again: a lemma may itself be a stop word.
    return [
        lemma for lemma in lemmas
        if lemma not in stops and lemma not in weird
    ]


values = text_processing(text)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Remove every ASCII punctuation character from the text.
text = text.translate(str.maketrans('', '', string.punctuation))

# Map each named character to its species and substitute word by word;
# words without an entry are kept unchanged.
animals = {"polynesia":"parrot", "gubgub":"pig", "cheechee":"monkey", "tootoo":"owl","pushmipullyu":"two-headed unicorn", "whitey":"mouse", "jip":"dog", "dabdab":"duck", "toggle":"horse", "cheapside":"sparrow"}
text = ' '.join(animals.get(word, word) for word in text.split())

# Read the animal names file, one entry per line, dropping the line break.
# Note that this rebinds `animals` from the mapping above to a list.
with open("../data/Animal_Names.txt", "r") as f:
    animals = [line.split("\n")[0] for line in f]

# Names from the file that also occur among the tokens.
Animals_in_Text = set(animals).intersection(tokens)
print("Doctor Dolittle interacts with " + str(len(Animals_in_Text)) + " different kinds of animals")

# Keep only those names in `values` and build their frequency distribution.
values = [word for word in values if word in Animals_in_Text]
freq = nltk.FreqDist(values)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from collections import Counter | |
| import numpy as np | |
| from wordcloud import WordCloud, ImageColorGenerator | |
| from PIL import Image | |
| import matplotlib.pyplot as plt | |
| from matplotlib.backends.backend_pdf import PdfPages | |
## Split into chapters
# Counter initialised to 1 (its use lies outside this excerpt), followed by
# cutting the text at every literal occurrence of "chapter".
count = 1
Chapters = text.split(sep="chapter")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def Chapter_Sentiment(Chapter_Text): | |
| def Average(lst): | |
| return float(sum(lst)) / len(lst) | |
| sid = SentimentIntensityAnalyzer() | |
| sentences = Chapter_Text.split('.') | |
| sentiment_scores = [] | |
| for item in sentences: | |
| sentiment = sid.polarity_scores(item) | |
| sentiment_value = sentiment['compound'] | |
| sentiment_scores.append(sentiment_value) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read the war terms file, one entry per line, dropping the line break.
with open("../data/War_Terms.txt", "r") as f:
    war_terms = [line.split("\n")[0] for line in f]

# Terms from the file that also occur among the tokens.
War_Terms_in_Text = set(war_terms).intersection(tokens)
print('Number of war terms used in the story: '+ str(len(War_Terms_in_Text)))

# Every token occurrence that is one of those terms, in text order.
values = [token for token in tokens if token in War_Terms_in_Text]

# Print each full-stop-delimited sentence that contains "death".
sentences = text.split('.')
for item in sentences:
    if "death" not in item:
        continue
    print(item)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download the Project Gutenberg text file, normalise it, cut it down to the
# span between the two marker phrases and save the result to disk.
url = 'http://www.gutenberg.org/files/501/501-0.txt'
# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops an HTTP error page from being saved as the book.
res = requests.get(url, timeout=30)
res.raise_for_status()
html_page = res.content
soup = BeautifulSoup(html_page, 'html.parser')
# get_text() returns the document text itself.  Calling str() on the list from
# find_all(text=True) produced the list's repr instead, which embeds brackets,
# quotes and escaped "\\r\\n" sequences in the data.
text = soup.get_text()
# Flatten line breaks to spaces, drop underscores and lower-case everything.
text = text.replace("\n", " ").replace("\r", " ").replace("_", "").lower()
# Keep what lies between the first "the first chapter" and the following
# "illustration: the end"; raises IndexError if the first marker is absent.
text = text.split("the first chapter")[1].split("illustration: the end")[0]
# Explicit UTF-8 so non-ASCII characters are written the same on every platform.
with open("../data/Doctor_Dolittle.txt", "w", encoding="utf-8") as f:
    f.write(text)
Older | Newer