Exploratory data-cleaning and NLP utilities: summarize missing values in a
pandas DataFrame, fuzzy-match and normalize named entities extracted from
article text, and run a first-pass exploration of the `allservicecalls.csv`
service-call export.
def show_missing(df):
    """
    Return per-column missing-value counts and percentages.

    "Missing" is measured three ways:
    - true nulls (NaN / None),
    - empty or single-space strings ('' or ' '),
    - the literal strings 'nan' / 'NaN'.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with count and percentage columns for
        each of the three kinds of missingness.
    """
    null_count = df.isnull().sum()
    null_percentage = (null_count / df.shape[0]) * 100
    empty_count = pd.Series(((df == ' ') | (df == '')).sum())
    empty_percentage = (empty_count / df.shape[0]) * 100
    nan_count = pd.Series(((df == 'nan') | (df == 'NaN')).sum())
    nan_percentage = (nan_count / df.shape[0]) * 100
    # BUG FIX: the original computed these statistics but never returned
    # them, despite the docstring's promise; assemble and return them.
    return pd.DataFrame({
        'num_missing': null_count,
        'missing_percentage': null_percentage,
        'num_empty': empty_count,
        'empty_percentage': empty_percentage,
        'nan_count': nan_count,
        'nan_percentage': nan_percentage,
    })
# Pool every entity seen in any article; this is the candidate set that
# fuzzy matching draws from.
choices = {entity for article in articles for entity in article}

# Replace each article's entities with their best fuzzy match from the
# pooled candidates (deduplicating within an article via set()).
cleaned_articles = []
for article in articles:
    cleaned_articles.append(
        [process.extractOne(entity, choices)[0] for entity in set(article)]
    )
def clean(text):
    """
    Normalize raw text into a list of lemmatized tokens.

    ASCII-folds the unicode, lowercases, strips non-word characters,
    splits on whitespace, drops stop words, and lemmatizes the rest.

    Parameters
    ----------
    text : str

    Returns
    -------
    list of str
        Lemmatized, lowercased tokens with stop words removed.
    """
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words('english') + ADDITIONAL_STOPWORDS
    text = (unicodedata.normalize('NFKD', text)
            .encode('ascii', 'ignore')
            .decode('utf-8', 'ignore')
            .lower())
    # NOTE(review): the original was truncated after .encode(...); the
    # decode/lower/regex/return tail below is the conventional completion
    # of this pattern — confirm against the source notebook.
    words = re.sub(r'[^\w\s]', '', text).split()
    return [wnl.lemmatize(word) for word in words if word not in stopwords]
import re

from joblib import Parallel, delayed
def chunker(iterable, total_length, chunksize):
    """Lazily yield successive chunksize-sized slices of iterable."""
    for start in range(0, total_length, chunksize):
        yield iterable[start:start + chunksize]
def flatten(list_of_lists):
    """Concatenate the sublists of list_of_lists into one flat list."""
    combined = []
    for sublist in list_of_lists:
        combined.extend(sublist)
    return combined
def process_chunk(texts): |
# Tally mentions per canonical entity, folding together the alias
# spellings that refer to the same thing.
entity_counts = []
alias_groups = [
    ('Democrats', ['Democrats', 'Dems', 'Democrat']),
    ('Americans', ['American', 'Americans']),
    ('Congress', ['House', 'Senate', 'Congress']),
    ('America', ['U.S.', 'the United States', 'America']),
    ('Republicans', ['Republican', 'Republicans']),
]
for label, aliases in alias_groups:
    total = df_counts.loc[df_counts.entity.isin(aliases)]['count'].sum()
    entity_counts.append((label, total))
# These two totals were hard-coded in the original analysis.
entity_counts.append(('China', 533))
entity_counts.append(('FBI', 316))
import pandas as pd
# Print every expression result in a notebook cell, not just the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Display options: show all columns, all rows, and full column contents.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# BUG FIX: -1 for max_colwidth was deprecated and removed in pandas 1.0;
# None is the supported way to disable truncation.
pd.set_option('display.max_colwidth', None)
# Load the raw service-call export and take a first look at it.
df = pd.read_csv('allservicecalls.csv')
# Bare expressions below rely on the notebook displaying every result.
df.head()
df.info()
# Per-column missingness summary (helper defined above).
show_missing(df)
# Record counts per department; dropna=False keeps the NaN bucket
# visible in the tally.
df['Dept'].value_counts(dropna=False)
# Split the frame on whether 'Dept' is populated so each subset can be
# inspected separately.
df_null = df.loc[df['Dept'].isnull()]
df_null.head()
df_null.shape
df_notnull = df.loc[df['Dept'].notnull()]
df_notnull.head()
df_notnull.shape