This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### R code from vignette source 'Presentation_2.Rnw'
### Encoding: UTF-8

###################################################
### code chunk number 1: init
###################################################
## Narrow the console output so printed results fit on presentation slides.
options(width=60)

###################################################
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fetch the plain-text edition of "The Story of Doctor Dolittle"
# (Project Gutenberg ebook #501) and stage the NLTK resources used later.
import requests
from bs4 import BeautifulSoup
import nltk

# Corpora needed by later chunks: English stop words and the VADER lexicon.
nltk.download("stopwords")
nltk.download('vader_lexicon')

### Pull
url = 'http://www.gutenberg.org/files/501/501-0.txt'
res = requests.get(url)
html_page = res.content
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk.tokenize import RegexpTokenizer

# \w+ keeps alphanumeric runs and silently drops punctuation.
tokenizer = RegexpTokenizer(r'\w+')
# `text` is produced by the download/cleanup chunk of this gist.
tokens = tokenizer.tokenize(text)
tokens = [i.lower() for i in tokens]

## Uncomment and remove the ")" to get length of longest word
print("Longest word in text: " + max(tokens, key=len) )# + " is " + str(len(max(tokens, key=len))) + " characters long")

## Longest real word
# "cutterigsloop" is a run-together artifact, not a real word — drop it so
# the next longest-word query returns a genuine word.
tokens = [y for y in tokens if y != "cutterigsloop"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def Flesch_Kincaid(text):
    """Return an approximate Flesch-Kincaid grade level for *text*.

    Uses deliberately crude heuristics:
      - sentences are '.'-delimited;
      - each vowel letter (a, e, i, o, u, y) counts as one syllable,
        which assumes lower-cased input;
      - words are space-separated tokens.

    Formula: 0.39 * avg words/sentence + 11.8 * avg syllables/word - 15.59.
    """
    sentences = text.split('.')
    avg_sentence_len = sum(len(x.split()) for x in sentences) / len(sentences)
    # One "syllable" per vowel character — a rough but fast estimate.
    syllables = sum(1 for ch in text if ch in "aeiouy")
    word_count = len(text.split(' '))
    mean_syllables_per_word = syllables / float(word_count)
    return (0.39 * avg_sentence_len) + (11.8 * mean_syllables_per_word) - 15.59
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def text_processing(input_text):
    """Tokenise, lemmatise, and strip stop words from *input_text*.

    Relies on the module-level `tokenizer` (RegexpTokenizer) plus NLTK's
    WordNetLemmatizer and English stop-word list. Returns the surviving
    tokens as a list.
    """
    tokens = tokenizer.tokenize(input_text)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(i) for i in tokens]
    stops = set(stopwords.words('english'))
    values = [i for i in tokens if i not in stops]
    # Presumably lemmatiser artefacts ("was" -> "wa", "us" -> "u") —
    # not real words, so discard them. TODO confirm intent.
    weird = ["wa", "u"]
    values = [i for i in values if i not in weird]
    return values

values = text_processing(text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Strip punctuation so the name -> species substitution below matches bare words.
text = text.translate(str.maketrans('', '', string.punctuation))

# Map each animal character's name to its species before counting animals.
animals = {"polynesia":"parrot", "gubgub":"pig", "cheechee":"monkey", "tootoo":"owl","pushmipullyu":"two-headed unicorn", "whitey":"mouse", "jip":"dog", "dabdab":"duck", "toggle":"horse", "cheapside":"sparrow"}
text = ' '.join([animals.get(i, i) for i in text.split()])

# Reference list of animal names, one per line; reuse of the `animals`
# name is deliberate — the dict above is no longer needed.
with open("../data/Animal_Names.txt", "r") as f:
    animals = f.readlines()
animals = [x.split("\n")[0] for x in animals]

Animals_in_Text = set(animals) & set(tokens)
print("Doctor Dolittle interacts with " + str(len(Animals_in_Text)) + " different kinds of animals")

# Keep only the animal tokens for the frequency distribution.
values = [i for i in values if i in Animals_in_Text]
freq = nltk.FreqDist(values)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter
import numpy as np
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

## Split into chapters
# The cleaned text is lower-cased, so the literal "chapter" delimits chapters.
Chapters = text.split("chapter")
# Running chapter counter — presumably incremented by plotting code not
# shown in this excerpt; verify against the full gist.
count = 1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def Chapter_Sentiment(Chapter_Text):
    """Score the sentiment of each '.'-delimited sentence in *Chapter_Text*.

    Uses NLTK's VADER analyser; collects each sentence's 'compound' score
    into sentiment_scores.

    NOTE(review): this excerpt is truncated — the aggregation/return of
    sentiment_scores (likely via the inner Average helper) lies outside the
    visible chunk; confirm against the full source.
    """
    def Average(lst):
        # Arithmetic mean as a float; raises ZeroDivisionError on [].
        return float(sum(lst)) / len(lst)
    sid = SentimentIntensityAnalyzer()
    sentences = Chapter_Text.split('.')
    sentiment_scores = []
    for item in sentences:
        sentiment = sid.polarity_scores(item)
        # 'compound' is VADER's normalised overall score in [-1, 1].
        sentiment_value = sentiment['compound']
        sentiment_scores.append(sentiment_value)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Count how many distinct war-related terms from the reference list appear
# in the story's token set.
with open("../data/War_Terms.txt", "r") as f:
    war_terms = f.readlines()
war_terms = [x.split("\n")[0] for x in war_terms]

War_Terms_in_Text = set(war_terms) & set(tokens)
print('Number of war terms used in the story: '+ str(len(War_Terms_in_Text)))
values = [i for i in tokens if i in War_Terms_in_Text]

# Show every sentence that mentions "death" for manual inspection.
sentences = text.split('.')
for item in sentences:
    if "death" in item:
        print(item)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download the book, clean it, and cache the story body to a local file so
# later runs can skip the network round trip.
url = 'http://www.gutenberg.org/files/501/501-0.txt'
res = requests.get(url)
html_page = res.content
soup = BeautifulSoup(html_page, 'html.parser')
text = soup.find_all(text=True)
text = str(text)
# Normalise: drop real and escaped newlines/carriage returns, remove
# underscores (Gutenberg's italics markers), and lower-case everything.
text = text.replace("\n", " ").replace("\r", " ").replace("\\r", " ").replace("\\n", " ").replace("_", "").lower()
# Keep only the story body: after the first chapter heading and before the
# closing illustration marker (strips the Gutenberg boilerplate).
text = text.split("the first chapter")[1].split("illustration: the end")[0]
with open("../data/Doctor_Dolittle.txt", "w") as f:
    f.write(text)
OlderNewer