Last active
May 31, 2019 15:39
-
-
Save mistermichaelll/28f38a9e10677c50f6c5fc42b84533b0 to your computer and use it in GitHub Desktop.
This script parses text interviews for certain words/phrases. It writes a DataFrame to a CSV file containing the word/phrase searched for, subject ID number, subject gender, subject age, subject country, appearance number, the phrase before and after, and the word before and after.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # ===================================================== | |
| # This script was created to help parse the information | |
| # in a text interview programmatically as opposed to | |
| # doing so by hand. Currently, this program will | |
| # find a specified word, number the first occurrence of | |
| # the word, and return the phrase before and after the | |
| # word. | |
| # ===================================================== | |
| import os | |
| import re | |
| import pandas as pd | |
| import numpy as np | |
# Directory the interview transcript is read from. It is joined to the file
# name by plain string concatenation, so the trailing slash is required;
# the leading "~" is expanded with os.path.expanduser when the file is opened.
| current_path = "~/Documents/Transcriptions/Transcriptions/" | |
# Directory the resulting CSV is written to. It is also concatenated
# directly with the output file name, so the trailing slash is required.
| dataframe_path = "~/Documents/Transcriptions/DataFrames/" | |
| # this function gets the | |
| # text file read into Python | |
| # -------------------------- | |
def text_grabber(file_path, file_name):
    """Read a UTF-8 text file and return its lines.

    Args:
        file_path: Directory containing the file. It is concatenated
            directly with ``file_name``, so it must end with a path
            separator. A leading ``~`` is expanded to the home directory.
        file_name: Name of the file inside ``file_path``.

    Returns:
        A list with one string per line of the file; line endings are kept.

    Raises:
        OSError: If the file cannot be opened.
    """
    full_path = os.path.expanduser(file_path + file_name)
    # The context manager closes the handle even if reading fails; the
    # previous version left the file open.
    with open(full_path, encoding="utf-8") as handle:
        return handle.readlines()
| # this function helps us count the number | |
| # of times that a word appears | |
| # in the entire text | |
| # ---------------------------- | |
def word_counter(text_object, word_to_find):
    """Count every occurrence of ``word_to_find`` in the text.

    Each line is lower-cased and all non-overlapping matches on it are
    counted. Matching is done on the whole line rather than on
    whitespace-separated tokens, so multi-word phrases are counted too
    and a word repeated inside one token is counted each time.

    Args:
        text_object: Iterable of text lines.
        word_to_find: Word or phrase to look for. It is interpreted as a
            regular expression and is matched against lower-cased text,
            so it should itself be lower case.

    Returns:
        Total number of matches across all lines.
    """
    # Compile once instead of handing the raw pattern to ``re`` per token.
    pattern = re.compile(word_to_find)
    return sum(len(pattern.findall(line.lower())) for line in text_object)
| # This function returns the | |
| # appearance number in an array. | |
| # ------------------------------ | |
def word_num(text_object, word_to_find):
    """Number the lines on which ``word_to_find`` appears.

    A line is selected when its lower-cased text contains at least one
    match; only the first appearance on a line is counted.

    Args:
        text_object: Iterable of text lines.
        word_to_find: Word or phrase to look for. It is interpreted as a
            regular expression and matched against lower-cased text.

    Returns:
        ``[1, 2, ..., k]`` where ``k`` is the number of selected lines.
    """
    pattern = re.compile(word_to_find)
    # ``search`` stops at the first hit; the full match list is not needed.
    selected = sum(1 for line in text_object if pattern.search(line.lower()))
    return list(range(1, selected + 1))
| # This function gets the phrase before | |
| # ------------------------------------ | |
def get_before(text_object, word_to_find):
    """Return the text preceding the first match on each matching line.

    Lines are lower-cased before searching. The line is split at the
    position of the regular-expression match itself, so selection and
    splitting always agree (a literal ``str.partition`` would return the
    whole line whenever the pattern is not a plain literal).

    Args:
        text_object: Iterable of text lines.
        word_to_find: Word or phrase to look for. It is interpreted as a
            regular expression and matched against lower-cased text.

    Returns:
        A list with, for every matching line, the lower-cased text before
        the first match (an empty string when the match starts the line).
    """
    pattern = re.compile(word_to_find)
    phrases = []
    for line in text_object:
        lowered = line.lower()
        match = pattern.search(lowered)
        if match is not None:
            phrases.append(lowered[:match.start()])
    return phrases
| # This function gets | |
| # the phrase after a word | |
| # ------------------------ | |
def get_after(text_object, word_to_find):
    """Return the text following the first match on each matching line.

    Lines are lower-cased before searching. The line is split at the end
    of the regular-expression match itself, so selection and splitting
    always agree (a literal ``str.partition`` would return an empty string
    whenever the pattern is not a plain literal).

    Args:
        text_object: Iterable of text lines.
        word_to_find: Word or phrase to look for. It is interpreted as a
            regular expression and matched against lower-cased text.

    Returns:
        A list with, for every matching line, the lower-cased text after
        the first match, including any trailing newline of the line.
    """
    pattern = re.compile(word_to_find)
    phrases = []
    for line in text_object:
        lowered = line.lower()
        match = pattern.search(lowered)
        if match is not None:
            phrases.append(lowered[match.end():])
    return phrases
| # this function gets the | |
| # immediate word before a word | |
| # ---------------------------- | |
def get_word_before(text_object, word_to_find):
    """Return the word immediately before the first match on each line.

    Lines are lower-cased and stripped of surrounding whitespace before
    searching. The line is split at the regular-expression match itself,
    so the word returned is always the one next to the matched text.

    Args:
        text_object: Iterable of text lines.
        word_to_find: Word or phrase to look for. It is interpreted as a
            regular expression and matched against lower-cased text.

    Returns:
        A list with, for every matching line, the whitespace-delimited
        word just before the match, or ``"NA"`` when the match is the
        first thing on the line (the preceding word would be on the
        previous line).
    """
    pattern = re.compile(word_to_find)
    words = []
    for line in text_object:
        cleaned = line.lower().strip()
        match = pattern.search(cleaned)
        if match is None:
            continue
        preceding = cleaned[:match.start()].split()
        # An empty list means nothing precedes the match on this line.
        words.append(preceding[-1] if preceding else "NA")
    return words
| # this function gets the | |
| # immediate word after a given word | |
| # --------------------------------- | |
def get_word_after(text_object, word_to_find):
    """Return the word immediately after the first match on each line.

    Lines are lower-cased and stripped of surrounding whitespace before
    searching. The line is split at the end of the regular-expression
    match itself, so the word returned is always the one next to the
    matched text.

    Args:
        text_object: Iterable of text lines.
        word_to_find: Word or phrase to look for. It is interpreted as a
            regular expression and matched against lower-cased text.

    Returns:
        A list with, for every matching line, the whitespace-delimited
        word just after the match, or ``"NA"`` when the match is the last
        thing on the line (the following word would be on the next line).
    """
    pattern = re.compile(word_to_find)
    words = []
    for line in text_object:
        cleaned = line.lower().strip()
        match = pattern.search(cleaned)
        if match is None:
            continue
        following = cleaned[match.end():].split()
        # An empty list means nothing follows the match on this line.
        words.append(following[0] if following else "NA")
    return words
| # this function assigns an ID number for use when | |
| # later combining dataframes together. | |
| # ------------------------------------------------ | |
def assign_id_number(text_object, word_to_find, ID_num):
    """Repeat the subject ID once per selected line.

    Args:
        text_object: Iterable of text lines.
        word_to_find: Word or phrase passed on to ``word_num``.
        ID_num: Subject identifier to repeat.

    Returns:
        A list holding ``ID_num`` once for every entry that ``word_num``
        returns for the same text and word.
    """
    # Scan the text once; calling word_num in a loop condition re-read the
    # whole text on every iteration.
    return [ID_num] * len(word_num(text_object, word_to_find))
| # this function lists the word that we are looking at | |
| # so there is more context | |
| # --------------------------------------------------- | |
def assign_word_var(text_object, word_to_find, word):
    """Repeat the examined word once per selected line.

    Args:
        text_object: Iterable of text lines.
        word_to_find: Word or phrase passed on to ``word_num``.
        word: Label to repeat in the output.

    Returns:
        A list holding ``word`` once for every entry that ``word_num``
        returns for the same text and word.
    """
    # Scan the text once; calling word_num in a loop condition re-read the
    # whole text on every iteration.
    return [word] * len(word_num(text_object, word_to_find))
| # this function gives subject a gender | |
| # ------------------------------------ | |
def assign_gender(text_object, word_to_find, gender):
    """Repeat the subject's gender once per selected line.

    Args:
        text_object: Iterable of text lines.
        word_to_find: Word or phrase passed on to ``word_num``.
        gender: Gender value to repeat.

    Returns:
        A list holding ``gender`` once for every entry that ``word_num``
        returns for the same text and word.
    """
    # Scan the text once; calling word_num in a loop condition re-read the
    # whole text on every iteration.
    return [gender] * len(word_num(text_object, word_to_find))
| # this function gives subject an age | |
| # ------------------------------------ | |
def assign_age(text_object, word_to_find, age):
    """Repeat the subject's age once per selected line.

    Args:
        text_object: Iterable of text lines.
        word_to_find: Word or phrase passed on to ``word_num``.
        age: Age value to repeat.

    Returns:
        A list holding ``age`` once for every entry that ``word_num``
        returns for the same text and word.
    """
    # Scan the text once; calling word_num in a loop condition re-read the
    # whole text on every iteration.
    return [age] * len(word_num(text_object, word_to_find))
| # this function gives subject a country of origin | |
| # ----------------------------------------------- | |
def assign_country(text_object, word_to_find, country):
    """Repeat the subject's country of origin once per selected line.

    Args:
        text_object: Iterable of text lines.
        word_to_find: Word or phrase passed on to ``word_num``.
        country: Country value to repeat.

    Returns:
        A list holding ``country`` once for every entry that ``word_num``
        returns for the same text and word.
    """
    # Scan the text once; calling word_num in a loop condition re-read the
    # whole text on every iteration.
    return [country] * len(word_num(text_object, word_to_find))
| # this function runs all the previous functions, | |
| # writes the result to a csv, and returns a | |
| # Pandas dataframe of the information we want. | |
| # # -------------------------------------------- | |
def main_part(file_name, current_path, dataframe_path, word_to_find, subject_ID_num, gender, age, country):
    """Analyse one interview for one word and save the result as a CSV.

    Reads the transcript, builds one row per line on which the word
    appears, writes the table to ``dataframe_path`` and prints a summary.

    Args:
        file_name: Transcript file name inside ``current_path``.
        current_path: Directory of the transcript; must end with a path
            separator because it is concatenated with ``file_name``.
        dataframe_path: Directory for the CSV; must end with a path
            separator because it is concatenated with the CSV name.
        word_to_find: Word or phrase to examine.
        subject_ID_num: Subject identifier; also used in the CSV name
            ``<word_to_find>_<subject_ID_num>.csv``.
        gender: Subject gender, repeated on every row.
        age: Subject age, repeated on every row.
        country: Subject country of origin, repeated on every row.

    Returns:
        The pandas DataFrame that was written to disk.
    """
    text_object = text_grabber(current_path, file_name)

    num = np.array(word_num(text_object, word_to_find))
    phrase_before = np.array(get_before(text_object, word_to_find))
    phrase_after = np.array(get_after(text_object, word_to_find))
    word_before = np.array(get_word_before(text_object, word_to_find))
    word_after = np.array(get_word_after(text_object, word_to_find))
    id_num = np.array(assign_id_number(text_object, word_to_find, subject_ID_num))
    word_var = np.array(assign_word_var(text_object, word_to_find, word_to_find))
    gender_col = np.array(assign_gender(text_object, word_to_find, gender))
    age_col = np.array(assign_age(text_object, word_to_find, age))
    country_col = np.array(assign_country(text_object, word_to_find, country))

    df = pd.DataFrame({
        "Word_Examined": word_var,
        "Subject_ID_Num": id_num,
        "Gender": gender_col,
        "Age": age_col,
        "Country": country_col,
        "Word_Appearance_Num_This_Interview": num,
        "Phrase_Before": phrase_before,
        "Phrase_After": phrase_after,
        "Word_Before": word_before,
        "Word_After": word_after,
    })

    csv_name = word_to_find + "_" + str(subject_ID_num) + ".csv"
    # Write first so the summary below is only printed once the file exists.
    df.to_csv(os.path.expanduser(dataframe_path + csv_name), index=False)

    print("The word", "\"", word_to_find, "\"", "appeared", word_counter(text_object, word_to_find), "total times in this subject's interview.")
    # len(num) is the number of selected lines; no need to rescan the text.
    print(len(num), "uses of the word were selected (based on whether they were its first appearance on a line).")
    # Report the directory the CSV was actually written to (dataframe_path,
    # not the transcript directory).
    print("A DataFrame containing this information was written to \n\"", dataframe_path, "\" with the name", csv_name, ".\n")
    return df
# Run the analysis for subject 1718 and the filler word "uhm"; the
# resulting table is kept in ``df`` and also written to dataframe_path.
df = main_part(
    file_name="1718.txt",
    current_path=current_path,
    dataframe_path=dataframe_path,
    word_to_find="uhm",
    subject_ID_num="1718",
    gender="F",
    age=18,
    country="United States",
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment