Skip to content

Instantly share code, notes, and snippets.

@mistermichaelll
Last active May 31, 2019 15:39
Show Gist options
  • Select an option

  • Save mistermichaelll/28f38a9e10677c50f6c5fc42b84533b0 to your computer and use it in GitHub Desktop.

Select an option

Save mistermichaelll/28f38a9e10677c50f6c5fc42b84533b0 to your computer and use it in GitHub Desktop.
This script parses text interviews for certain words/phrases. It writes a DataFrame to a csv file containing the word/phrase searched for, subject ID number, subject gender, subject age, subject country, appearance number, the phrase before and after, and the word before and after.
# =====================================================
# This script was created to help parse the information
# in a text interview programmatically as opposed to
# doing so by hand. Currently, this program will
# find a specified word, number the first occurrence of
# the word, and return the phrase before and after the
# word.
# =====================================================
import os
import re
import pandas as pd
import numpy as np
# Directory holding the interview transcripts. "~" is expanded when the file
# is opened, and the trailing slash is required because file names are
# appended by plain string concatenation.
current_path = "~/Documents/Transcriptions/Transcriptions/"
# Directory the result CSV files are written to (same "~" expansion and
# trailing-slash requirement as above).
dataframe_path = "~/Documents/Transcriptions/DataFrames/"
# this function gets the
# text file read into Python
# --------------------------
def text_grabber(file_path, file_name):
    """Read a transcript file into a list of lines.

    Args:
        file_path: Directory containing the transcript. A leading ``~`` is
            expanded. The value is concatenated directly with ``file_name``,
            so it must end with a path separator.
        file_name: Name of the text file inside ``file_path``.

    Returns:
        The lines of the file as a list of strings, line endings included.

    Raises:
        OSError: If the file cannot be opened.
    """
    full_path = os.path.expanduser(file_path + file_name)
    # The context manager closes the handle even if reading raises.
    with open(full_path, encoding="utf-8") as transcript:
        return transcript.readlines()
# this function helps us count the number
# of times that a word appears
# in the entire text
# ----------------------------
def word_counter(text_object, word_to_find):
    """Count the whitespace-separated tokens that match a pattern.

    Each line is lower-cased and split on whitespace. A token contributes one
    to the total when ``word_to_find``, used as a regular expression, matches
    anywhere inside it.

    Args:
        text_object: Iterable of text lines.
        word_to_find: Regular-expression pattern to look for.

    Returns:
        The number of matching tokens across all lines.
    """
    return sum(
        1
        for raw_line in text_object
        for token in raw_line.lower().split()
        if re.search(word_to_find, token)
    )
# This function returns the
# appearance number in an array.
# ------------------------------
def word_num(text_object, word_to_find):
    """Number the lines on which a pattern appears.

    A line counts once, however many times the pattern occurs on it. Lines
    are lower-cased before ``word_to_find`` is applied as a regular
    expression.

    Args:
        text_object: Iterable of text lines.
        word_to_find: Regular-expression pattern to look for.

    Returns:
        The list ``[1, 2, ..., k]`` where ``k`` is the number of matching
        lines.
    """
    matching_lines = sum(
        1
        for raw_line in text_object
        if re.search(word_to_find, raw_line.lower())
    )
    return list(range(1, matching_lines + 1))
# This function gets the phrase before a word
# ------------------------------------
def get_before(text_object, word_to_find):
    """Return the text preceding the first match on each matching line.

    Lines are lower-cased and ``word_to_find`` is applied as a regular
    expression. The line is cut at the position where the expression
    actually matched, so the result is correct even when the pattern is not
    a literal substring of the line (for example ``"uh+m"`` matching
    ``"uhhm"``).

    Args:
        text_object: Iterable of text lines.
        word_to_find: Regular-expression pattern to look for.

    Returns:
        One string per matching line: everything before the first match.
    """
    phrases = []
    for raw_line in text_object:
        lowered = raw_line.lower()
        match = re.search(word_to_find, lowered)
        if match is not None:
            # Slice at the match itself rather than re-searching for the
            # pattern text as a literal substring.
            phrases.append(lowered[:match.start()])
    return phrases
# This function gets
# the phrase after a word
# ------------------------
def get_after(text_object, word_to_find):
    """Return the text following the first match on each matching line.

    Lines are lower-cased and ``word_to_find`` is applied as a regular
    expression. The line is cut at the end of the actual match, so the
    result is correct even when the pattern is not a literal substring of
    the line (for example ``"uh+m"`` matching ``"uhhm"``).

    Args:
        text_object: Iterable of text lines.
        word_to_find: Regular-expression pattern to look for.

    Returns:
        One string per matching line: everything after the first match,
        including any trailing newline.
    """
    phrases = []
    for raw_line in text_object:
        lowered = raw_line.lower()
        match = re.search(word_to_find, lowered)
        if match is not None:
            # Slice at the match itself rather than re-searching for the
            # pattern text as a literal substring.
            phrases.append(lowered[match.end():])
    return phrases
# this function gets the
# immediate word before a word
# ----------------------------
def get_word_before(text_object, word_to_find):
    """Return the word immediately before the first match on each line.

    Lines are lower-cased and ``word_to_find`` is applied as a regular
    expression to the unstripped line, the same test ``word_num`` uses, so
    both functions select the same lines. The text before the actual match
    is split on whitespace and its last token is taken.

    Args:
        text_object: Iterable of text lines.
        word_to_find: Regular-expression pattern to look for.

    Returns:
        One string per matching line: the preceding word, or ``"NA"`` when
        nothing precedes the match on that line (the phrase continues from
        the previous line).
    """
    words = []
    for raw_line in text_object:
        lowered = raw_line.lower()
        match = re.search(word_to_find, lowered)
        if match is None:
            continue
        # split() discards surrounding whitespace, so no strip() is needed.
        preceding = lowered[:match.start()].split()
        words.append(preceding[-1] if preceding else "NA")
    return words
# this function gets the
# immediate word after a given word
# ---------------------------------
def get_word_after(text_object, word_to_find):
    """Return the word immediately after the first match on each line.

    Lines are lower-cased and ``word_to_find`` is applied as a regular
    expression to the unstripped line, the same test ``word_num`` uses, so
    both functions select the same lines. The text after the actual match
    is split on whitespace and its first token is taken.

    Args:
        text_object: Iterable of text lines.
        word_to_find: Regular-expression pattern to look for.

    Returns:
        One string per matching line: the following word, or ``"NA"`` when
        nothing follows the match on that line (the phrase continues on the
        next line).
    """
    words = []
    for raw_line in text_object:
        lowered = raw_line.lower()
        match = re.search(word_to_find, lowered)
        if match is None:
            continue
        # split() discards surrounding whitespace, so no strip() is needed.
        following = lowered[match.end():].split()
        words.append(following[0] if following else "NA")
    return words
# this function assigns an ID number for use when
# later combining dataframes together.
# ------------------------------------------------
def assign_id_number(text_object, word_to_find, ID_num):
    """Repeat the subject ID once per line that contains the word.

    Args:
        text_object: Iterable of text lines.
        word_to_find: Regular-expression pattern passed on to ``word_num``.
        ID_num: Subject identifier to repeat.

    Returns:
        A list holding ``ID_num`` once for every entry ``word_num`` returns.
    """
    # word_num scans the whole text, so call it once rather than once per
    # appended element.
    return [ID_num] * len(word_num(text_object, word_to_find))
# this function lists the word that we are looking at
# so there is more context
# ---------------------------------------------------
def assign_word_var(text_object, word_to_find, word):
    """Repeat the examined word once per line that contains it.

    Args:
        text_object: Iterable of text lines.
        word_to_find: Regular-expression pattern passed on to ``word_num``.
        word: Label to repeat.

    Returns:
        A list holding ``word`` once for every entry ``word_num`` returns.
    """
    # word_num scans the whole text, so call it once rather than once per
    # appended element.
    return [word] * len(word_num(text_object, word_to_find))
# this function gives subject a gender
# ------------------------------------
def assign_gender(text_object, word_to_find, gender):
    """Repeat the subject's gender once per line that contains the word.

    Args:
        text_object: Iterable of text lines.
        word_to_find: Regular-expression pattern passed on to ``word_num``.
        gender: Value to repeat.

    Returns:
        A list holding ``gender`` once for every entry ``word_num`` returns.
    """
    # word_num scans the whole text, so call it once rather than once per
    # appended element.
    return [gender] * len(word_num(text_object, word_to_find))
# this function gives subject an age
# ------------------------------------
def assign_age(text_object, word_to_find, age):
    """Repeat the subject's age once per line that contains the word.

    Args:
        text_object: Iterable of text lines.
        word_to_find: Regular-expression pattern passed on to ``word_num``.
        age: Value to repeat.

    Returns:
        A list holding ``age`` once for every entry ``word_num`` returns.
    """
    # word_num scans the whole text, so call it once rather than once per
    # appended element.
    return [age] * len(word_num(text_object, word_to_find))
# this function gives subject a country of origin
# -----------------------------------------------
def assign_country(text_object, word_to_find, country):
    """Repeat the subject's country once per line that contains the word.

    Args:
        text_object: Iterable of text lines.
        word_to_find: Regular-expression pattern passed on to ``word_num``.
        country: Value to repeat.

    Returns:
        A list holding ``country`` once for every entry ``word_num`` returns.
    """
    # word_num scans the whole text, so call it once rather than once per
    # appended element.
    return [country] * len(word_num(text_object, word_to_find))
# this function runs all the previous functions,
# writes the result to a csv, and returns a
# Pandas dataframe of the information we want.
# # --------------------------------------------
def main_part(file_name, current_path, dataframe_path, word_to_find, subject_ID_num, gender, age, country):
    """Analyse one interview, write the result to CSV and return it.

    Reads the transcript, collects one row per line on which
    ``word_to_find`` appears, prints a short summary, and writes the table
    to ``<dataframe_path><word_to_find>_<subject_ID_num>.csv``.

    Args:
        file_name: Transcript file name inside ``current_path``.
        current_path: Directory of the transcript (must end with a
            separator; ``~`` is expanded).
        dataframe_path: Directory the CSV is written to (must end with a
            separator; ``~`` is expanded).
        word_to_find: Word or regular-expression pattern to examine. It is
            also used in the CSV file name.
        subject_ID_num: Subject identifier, repeated on every row.
        gender: Subject gender, repeated on every row.
        age: Subject age, repeated on every row.
        country: Subject country, repeated on every row.

    Returns:
        A pandas DataFrame with the columns Word_Examined, Subject_ID_Num,
        Gender, Age, Country, Word_Appearance_Num_This_Interview,
        Phrase_Before, Phrase_After, Word_Before and Word_After.
    """
    text_object = text_grabber(current_path, file_name)
    num = np.array(word_num(text_object, word_to_find))
    phrase_before = np.array(get_before(text_object, word_to_find))
    phrase_after = np.array(get_after(text_object, word_to_find))
    word_before = np.array(get_word_before(text_object, word_to_find))
    word_after = np.array(get_word_after(text_object, word_to_find))
    id_num = np.array(assign_id_number(text_object, word_to_find, subject_ID_num))
    word_var = np.array(assign_word_var(text_object, word_to_find, word_to_find))
    gender = np.array(assign_gender(text_object, word_to_find, gender))
    age = np.array(assign_age(text_object, word_to_find, age))
    country = np.array(assign_country(text_object, word_to_find, country))
    df = pd.DataFrame({
        "Word_Examined": word_var,
        "Subject_ID_Num": id_num,
        "Gender": gender,
        "Age": age,
        "Country": country,
        "Word_Appearance_Num_This_Interview": num,
        "Phrase_Before": phrase_before,
        "Phrase_After": phrase_after,
        "Word_Before": word_before,
        "Word_After": word_after,
    })
    csv_name = word_to_find + "_" + str(subject_ID_num) + ".csv"
    print("The word", "\"", word_to_find, "\"", "appeared", word_counter(text_object, word_to_find), "total times in this subject's interview.")
    # num already holds one entry per selected line; no need to rescan.
    print(len(num), "uses of the word were selected (based on whether they were its first appearance on a line).")
    # Write first so the confirmation below is only printed on success.
    df.to_csv(os.path.expanduser(dataframe_path + csv_name), index=False)
    # Report the directory the file was actually written to.
    print("A DataFrame containing this information was written to \n\"", dataframe_path, "\" with the name", csv_name, ".\n")
    return df
# Run the analysis for the word "uhm" in transcript 1718.txt and keep the
# resulting table at module level.
df = main_part(
    file_name="1718.txt",
    current_path=current_path,
    dataframe_path=dataframe_path,
    word_to_find="uhm",
    subject_ID_num="1718",
    gender="F",
    age=18,
    country="United States",
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment