Skip to content

Instantly share code, notes, and snippets.

@rohitr360
Created May 8, 2016 16:27
Show Gist options
  • Save rohitr360/9a386689e3819d9f071faac59d9b98fa to your computer and use it in GitHub Desktop.
Save rohitr360/9a386689e3819d9f071faac59d9b98fa to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
import re, os, time, csv, random, pymongo, math, html, string, html2text
from pymongo import MongoClient
import xml.etree.ElementTree as ET
import pymongo
import sklearn
#Currently we have a pandas DataFrame with a row for each article classification
#The fields assumed when writing this code are 'grade', 'text', 'title', 'date', etc.
def get_lm_words(
    path='../../pluribus_labs/earnings_call/data/dictionary_data/LoughranMcDonald_MasterDictionary_2014.csv',
):
    """Load the dictionary word list and cache it in the module global ``lm_words``.

    Args:
        path: Location of a CSV file that has a ``Word`` column. Defaults to
            the Loughran-McDonald master dictionary path used by this script.

    Returns:
        The ``Word`` column of the CSV as a pandas Series.
    """
    # get_tf_vector() reads this global when no explicit word list is passed.
    global lm_words
    lm_words = pd.read_csv(path)['Word']
    return lm_words
def get_tf_vector(text, words=None):
    """Count how often each dictionary word occurs in ``text``.

    The text is upper-cased and split on whitespace; a token counts only when
    it equals a dictionary word exactly.

    Args:
        text: The document to score.
        words: Optional iterable of (upper-case) dictionary words. When
            omitted, the module global ``lm_words`` set by ``get_lm_words()``
            is used.

    Returns:
        A list of integer counts, one per distinct word, in the order the
        words first appear in the dictionary.
    """
    from collections import Counter

    if words is None:
        words = lm_words
    # One pass over the text instead of re-scanning it for every word.
    token_counts = Counter(text.upper().split())
    # dict.fromkeys drops repeated words while keeping first-seen order, so
    # the vector length matches the number of distinct dictionary words.
    return [token_counts[word] for word in dict.fromkeys(words)]
def get_grade_tf_df(df):
    """Add a ``tf_vector`` column to ``df`` and return the grade/vector pair.

    ``get_tf_vector`` is applied to every value of the ``text`` column and the
    result is stored on ``df`` itself as ``tf_vector`` (the caller's frame is
    modified). The return value is the two-column selection
    ``['grade', 'tf_vector']``.
    """
    df['tf_vector'] = df.text.map(get_tf_vector)
    grade_and_vector = df[['grade', 'tf_vector']]
    return grade_and_vector
def df_to_csv(df, path):
    """Write ``df`` as CSV to ``path`` and pass back whatever ``to_csv`` returns."""
    written = df.to_csv(path)
    return written
def logistic_regression_fit(grade_tf_df):
    """Fit a logistic regression that predicts ``grade`` from ``tf_vector``.

    Args:
        grade_tf_df: DataFrame with a ``grade`` column (the labels) and a
            ``tf_vector`` column whose cells each hold one term-frequency
            vector, as produced by ``get_grade_tf_df``.

    Returns:
        The fitted ``LogisticRegression`` model.
    """
    # A bare ``import sklearn`` does not load the linear_model submodule, so
    # import the estimator explicitly.
    from sklearn.linear_model import LogisticRegression

    grade = grade_tf_df.grade
    # Each cell is one vector; stack them into a 2-D (samples x words) array.
    tf_matrix = np.array([list(vector) for vector in grade_tf_df.tf_vector])
    LR_model = LogisticRegression()
    LR_model.fit(tf_matrix, grade)
    return LR_model
def logistic_regression_predict(LR_fit, tf_matrix):
    """Classify each term-frequency vector with a fitted model.

    Args:
        LR_fit: A fitted model exposing ``predict``.
        tf_matrix: The tf vectors to grade, one per row.

    Returns:
        Whatever ``LR_fit.predict`` returns: one classification (grade) per
        input tf vector.
    """
    # Use the tf_matrix argument; the previous code referenced an undefined
    # name and raised NameError.
    LR_predict = LR_fit.predict(tf_matrix)
    return LR_predict
def logistic_regression_summary(LR_fit):
    """Return a summary of a fitted model.

    If the model provides a ``summary()`` method, its result is returned
    unchanged. scikit-learn estimators do not have one, so for those a dict
    of the fitted parameters is returned instead.

    Args:
        LR_fit: A fitted model.

    Returns:
        ``LR_fit.summary()`` when available; otherwise a dict with the keys
        ``'classes'``, ``'coefficients'`` and ``'intercept'`` (a value is
        ``None`` when the model lacks the matching attribute).
    """
    summary = getattr(LR_fit, 'summary', None)
    if callable(summary):
        return summary()
    return {
        'classes': getattr(LR_fit, 'classes_', None),
        'coefficients': getattr(LR_fit, 'coef_', None),
        'intercept': getattr(LR_fit, 'intercept_', None),
    }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment