mistermichaelll · May 31, 2019 15:39
diff --git a/span_ling.py b/span_ling.py
 # =====================================================
 # This script was created to help parse the information
 # in a text interview programmatically as opposed to 
 # doing so by hand. Currently, this program will 
 # find a specified word, number the first occurrence of
 # the word, and return the phrase before and after the 
 # word. 
 # =====================================================

 import os 
 import re
 import pandas as pd 
 import numpy as np 
 # Directory prefixes for the input transcriptions and the output CSVs.
 # They are joined to file names by plain string concatenation, so each
 # one ends with a path separator; the leading "~" is expanded with
 # os.path.expanduser at the point where a path is actually used.
 current_path = "~/Documents/Transcriptions/Transcriptions/"
 dataframe_path = "~/Documents/Transcriptions/DataFrames/"

 # this function gets the 
 # text file read into Python
 # --------------------------
 def text_grabber(file_path, file_name):
     """Read a UTF-8 text file and return its lines.

     Parameters
     ----------
     file_path : str
         Directory prefix. It is concatenated directly with ``file_name``
         (so it should end with a path separator) and may start with "~".
     file_name : str
         Name of the file inside ``file_path``.

     Returns
     -------
     list of str
         The lines of the file, line endings included.
     """
     full_path = os.path.expanduser(file_path + file_name)
     # the context manager closes the handle even when reading fails;
     # the previous version left the file open
     with open(full_path, encoding="utf-8") as handle:
         return handle.readlines()

 # this function helps us count the number
 # of times that a word appears 
 # in the entire text
 # ----------------------------
 def word_counter(text_object, word_to_find):
     """Count the tokens in the text that contain ``word_to_find``.

     Every line is lower-cased and split on whitespace; a token counts
     once when the regular expression ``word_to_find`` matches anywhere
     inside it.

     Parameters
     ----------
     text_object : iterable of str
         Lines of the interview.
     word_to_find : str
         Pattern handed to the ``re`` module.

     Returns
     -------
     int
         Number of matching tokens over all lines.
     """
     total = 0
     for raw_line in text_object:
         tokens = raw_line.lower().split()
         total += sum(1 for token in tokens if re.search(word_to_find, token))
     return total

 # This function returns the 
 # appearance number in an array.  
 # ------------------------------
 def word_num(text_object, word_to_find):
     """Number the lines that contain ``word_to_find``, starting at 1.

     A line is selected when the regular expression ``word_to_find``
     matches its lower-cased text; several matches on one line still
     produce a single entry.

     Returns
     -------
     list of int
         ``[1, 2, ..., k]`` where ``k`` is the number of selected lines.
     """
     selected = [
         entry for entry in text_object
         if re.search(word_to_find, entry.lower())
     ]
     return list(range(1, len(selected) + 1))

 # This function gets the phrase before
 # ------------------------------------
 def get_before(text_object, word_to_find):
     """Return the text preceding the first match on each matching line.

     Each line is lower-cased and searched with the regular expression
     ``word_to_find``. The line is cut at the position of that match, so
     detection and splitting always agree (the earlier literal
     ``str.partition`` returned the whole line whenever the pattern
     matched but was not literally present).

     Parameters
     ----------
     text_object : iterable of str
         Lines of the interview.
     word_to_find : str
         Pattern handed to the ``re`` module.

     Returns
     -------
     list of str
         One lower-cased prefix per matching line, in line order.
     """
     phrases = []
     for line in text_object:
         lowered = line.lower()
         match = re.search(word_to_find, lowered)
         if match is not None:
             phrases.append(lowered[:match.start()])
     return phrases

 # This function gets 
 # the phrase after a word 
 # ------------------------
 def get_after(text_object, word_to_find):
     """Return the text following the first match on each matching line.

     Each line is lower-cased and searched with the regular expression
     ``word_to_find``. The remainder is taken from the end of that match,
     so detection and splitting always agree (the earlier literal
     ``str.partition`` returned an empty string whenever the pattern
     matched but was not literally present).

     Parameters
     ----------
     text_object : iterable of str
         Lines of the interview.
     word_to_find : str
         Pattern handed to the ``re`` module.

     Returns
     -------
     list of str
         One lower-cased suffix (line ending included) per matching line.
     """
     phrases = []
     for line in text_object:
         lowered = line.lower()
         match = re.search(word_to_find, lowered)
         if match is not None:
             phrases.append(lowered[match.end():])
     return phrases

 # this function gets the 
 # immediate word before a word
 # ----------------------------
 def get_word_before(text_object, word_to_find):
     """Return the word immediately before the first match on each line.

     Each line is lower-cased, stripped and searched with the regular
     expression ``word_to_find``. The text before the match is split on
     whitespace and its last token is kept. Cutting at the match position
     (rather than with a literal ``str.partition``) keeps the result
     correct when the pattern is a real regular expression.

     Returns
     -------
     list of str
         One entry per matching line; "NA" when nothing precedes the
         match on that line (e.g. the phrase started on the previous line).
     """
     words = []
     for line in text_object:
         cleaned = line.lower().strip()
         match = re.search(word_to_find, cleaned)
         if match is None:
             continue
         preceding = cleaned[:match.start()].split()
         words.append(preceding[-1] if preceding else "NA")
     return words

 # this function gets the 
 # immediate word after a given word 
 # ---------------------------------
 def get_word_after(text_object, word_to_find):
     """Return the word immediately after the first match on each line.

     Each line is lower-cased, stripped and searched with the regular
     expression ``word_to_find``. The text after the match is split on
     whitespace and its first token is kept. Cutting at the match position
     (rather than with a literal ``str.partition``) keeps the result
     correct when the pattern is a real regular expression.

     Returns
     -------
     list of str
         One entry per matching line; "NA" when nothing follows the match
         on that line (e.g. the phrase continues on the next line).
     """
     words = []
     for line in text_object:
         cleaned = line.lower().strip()
         match = re.search(word_to_find, cleaned)
         if match is None:
             continue
         following = cleaned[match.end():].split()
         words.append(following[0] if following else "NA")
     return words

 # this function assigns an ID number for use when 
 # later combining dataframe together. 
 # ------------------------------------------------
 def assign_id_number(text_object, word_to_find, ID_num):
     """Repeat ``ID_num`` once for every line selected by ``word_num``.

     The result lines up row-for-row with the other per-occurrence
     columns so tables from different subjects can be combined later.

     Returns
     -------
     list
         ``ID_num`` repeated ``len(word_num(text_object, word_to_find))``
         times.
     """
     # word_num rescans the whole text, so evaluate it once rather than
     # on every iteration of a loop condition
     row_count = len(word_num(text_object, word_to_find))
     return [ID_num] * row_count

 # this function lists the word that we are looking at
 # so there is more context 
 # ---------------------------------------------------
 def assign_word_var(text_object, word_to_find, word):
     """Repeat ``word`` once for every line selected by ``word_num``.

     Gives each row of the table a label for the word being examined.

     Returns
     -------
     list
         ``word`` repeated ``len(word_num(text_object, word_to_find))``
         times.
     """
     # word_num rescans the whole text, so evaluate it once rather than
     # on every iteration of a loop condition
     row_count = len(word_num(text_object, word_to_find))
     return [word] * row_count

 # this function gives subject a gender
 # ------------------------------------
 def assign_gender(text_object, word_to_find, gender):
     """Repeat ``gender`` once for every line selected by ``word_num``.

     Returns
     -------
     list
         ``gender`` repeated ``len(word_num(text_object, word_to_find))``
         times.
     """
     # word_num rescans the whole text, so evaluate it once rather than
     # on every iteration of a loop condition
     row_count = len(word_num(text_object, word_to_find))
     return [gender] * row_count

 # this function gives subject an age
 # ------------------------------------
 def assign_age(text_object, word_to_find, age):
     """Repeat ``age`` once for every line selected by ``word_num``.

     Returns
     -------
     list
         ``age`` repeated ``len(word_num(text_object, word_to_find))``
         times.
     """
     # word_num rescans the whole text, so evaluate it once rather than
     # on every iteration of a loop condition
     row_count = len(word_num(text_object, word_to_find))
     return [age] * row_count

 # this function gives subject a country of origin
 # -----------------------------------------------
 def assign_country(text_object, word_to_find, country):
     """Repeat ``country`` once for every line selected by ``word_num``.

     Returns
     -------
     list
         ``country`` repeated ``len(word_num(text_object, word_to_find))``
         times.
     """
     # word_num rescans the whole text, so evaluate it once rather than
     # on every iteration of a loop condition
     row_count = len(word_num(text_object, word_to_find))
     return [country] * row_count

 # this function runs all the previous functions, 
 # writes the result to a csv, and returns a 
 # Pandas dataframe of the information we want.
 # # -------------------------------------------- 
 def main_part(file_name, current_path, dataframe_path, word_to_find, subject_ID_num, gender, age, country):
     """Build, save and return the per-occurrence table for one interview.

     Reads ``current_path + file_name``, produces one row for every line
     that contains ``word_to_find`` and writes the table to
     ``dataframe_path + word_to_find + "_" + str(subject_ID_num) + ".csv"``.

     Parameters
     ----------
     file_name : str
         Transcription file to read.
     current_path : str
         Directory prefix of the transcription file.
     dataframe_path : str
         Directory prefix the CSV is written under.
     word_to_find : str
         Word (pattern) to look for.
     subject_ID_num, gender, age, country
         Subject metadata copied onto every row.

     Returns
     -------
     pandas.DataFrame
         The table that was written to disk.
     """
     text_object = text_grabber(current_path, file_name)

     # columns derived from the text, one entry per selected line
     num = np.array(word_num(text_object, word_to_find))
     phrase_before = np.array(get_before(text_object, word_to_find))
     phrase_after = np.array(get_after(text_object, word_to_find))
     word_before = np.array(get_word_before(text_object, word_to_find))
     word_after = np.array(get_word_after(text_object, word_to_find))

     # constant columns, repeated to the same length
     id_num = np.array(assign_id_number(text_object, word_to_find, subject_ID_num))
     word_var = np.array(assign_word_var(text_object, word_to_find, word_to_find))
     gender = np.array(assign_gender(text_object, word_to_find, gender))
     age = np.array(assign_age(text_object, word_to_find, age))
     country = np.array(assign_country(text_object, word_to_find, country))

     df = pd.DataFrame({"Word_Examined":word_var, "Subject_ID_Num":id_num, "Gender":gender, "Age":age, "Country":country, "Word_Appearance_Num_This_Interview":num,
     "Phrase_Before":phrase_before, "Phrase_After":phrase_after, "Word_Before":word_before, "Word_After":word_after})

     csv_name = word_to_find + "_" + str(subject_ID_num) + ".csv"
     df.to_csv(os.path.expanduser(dataframe_path + csv_name), index = False)

     # report only after the write succeeded, and name the directory the
     # file really went to (dataframe_path, not the input directory)
     print("The word", "\"", word_to_find, "\"", "appeared", word_counter(text_object, word_to_find), "total times in this subject's interview.")
     print(len(num), "uses of the word were selected (based on whether they were its first appearance on a line).")
     print("A DataFrame containing this information was written to \n\"", dataframe_path, "\" with the name", csv_name, ".\n")

     return(df)

 # Runs the analysis as soon as the script executes: searches "1718.txt"
 # for "uhm", writes the CSV and keeps the resulting table in the
 # module-level name df.
 df = main_part("1718.txt", current_path, dataframe_path, "uhm", "1718", "F", 18, "United States")
	# =====================================================
	# This script was created to help parse the information
	# in a text interview programmatically as opposed to
	# doing so by hand. Currently, this program will
	# find a specified word, number the first occurrence of
	# the word, and return the phrase before and after the
	# word.
	# =====================================================

	import os
	import re
	import pandas as pd
	import numpy as np
	# Directory prefixes for the input transcriptions and the output CSVs.
	# They are joined to file names by plain string concatenation, so each
	# one ends with a path separator; the leading "~" is expanded with
	# os.path.expanduser at the point where a path is actually used.
	current_path = "~/Documents/Transcriptions/Transcriptions/"
	dataframe_path = "~/Documents/Transcriptions/DataFrames/"

	# this function gets the
	# text file read into Python
	# --------------------------
	def text_grabber(file_path, file_name):
	    """Read a UTF-8 text file and return its lines.

	    Parameters
	    ----------
	    file_path : str
	        Directory prefix. It is concatenated directly with ``file_name``
	        (so it should end with a path separator) and may start with "~".
	    file_name : str
	        Name of the file inside ``file_path``.

	    Returns
	    -------
	    list of str
	        The lines of the file, line endings included.
	    """
	    full_path = os.path.expanduser(file_path + file_name)
	    # the context manager closes the handle even when reading fails;
	    # the previous version left the file open
	    with open(full_path, encoding="utf-8") as handle:
	        return handle.readlines()

	# this function helps us count the number
	# of times that a word appears
	# in the entire text
	# ----------------------------
	def word_counter(text_object, word_to_find):
	    """Count the tokens in the text that contain ``word_to_find``.

	    Every line is lower-cased and split on whitespace; a token counts
	    once when the regular expression ``word_to_find`` matches anywhere
	    inside it. (The block structure, lost in the pasted source, is
	    restored here.)

	    Parameters
	    ----------
	    text_object : iterable of str
	        Lines of the interview.
	    word_to_find : str
	        Pattern handed to the ``re`` module.

	    Returns
	    -------
	    int
	        Number of matching tokens over all lines.
	    """
	    total = 0
	    for raw_line in text_object:
	        tokens = raw_line.lower().split()
	        total += sum(1 for token in tokens if re.search(word_to_find, token))
	    return total

	# This function returns the
	# appearance number in an array.
	# ------------------------------
	def word_num(text_object, word_to_find):
	    """Number the lines that contain ``word_to_find``, starting at 1.

	    A line is selected when the regular expression ``word_to_find``
	    matches its lower-cased text; several matches on one line still
	    produce a single entry. (The block structure, lost in the pasted
	    source, is restored here.)

	    Returns
	    -------
	    list of int
	        ``[1, 2, ..., k]`` where ``k`` is the number of selected lines.
	    """
	    selected = [
	        entry for entry in text_object
	        if re.search(word_to_find, entry.lower())
	    ]
	    return list(range(1, len(selected) + 1))

	# This function gets the phrase before
	# ------------------------------------
	def get_before(text_object, word_to_find):
	    """Return the text preceding the first match on each matching line.

	    Each line is lower-cased and searched with the regular expression
	    ``word_to_find``. The line is cut at the position of that match, so
	    detection and splitting always agree (a literal ``str.partition``
	    returns the whole line whenever the pattern matches but is not
	    literally present).

	    Parameters
	    ----------
	    text_object : iterable of str
	        Lines of the interview.
	    word_to_find : str
	        Pattern handed to the ``re`` module.

	    Returns
	    -------
	    list of str
	        One lower-cased prefix per matching line, in line order.
	    """
	    phrases = []
	    for line in text_object:
	        lowered = line.lower()
	        match = re.search(word_to_find, lowered)
	        if match is not None:
	            phrases.append(lowered[:match.start()])
	    return phrases

	# This function gets
	# the phrase after a word
	# ------------------------
	def get_after(text_object, word_to_find):
	    """Return the text following the first match on each matching line.

	    Each line is lower-cased and searched with the regular expression
	    ``word_to_find``. The remainder is taken from the end of that match,
	    so detection and splitting always agree (a literal ``str.partition``
	    returns an empty string whenever the pattern matches but is not
	    literally present).

	    Parameters
	    ----------
	    text_object : iterable of str
	        Lines of the interview.
	    word_to_find : str
	        Pattern handed to the ``re`` module.

	    Returns
	    -------
	    list of str
	        One lower-cased suffix (line ending included) per matching line.
	    """
	    phrases = []
	    for line in text_object:
	        lowered = line.lower()
	        match = re.search(word_to_find, lowered)
	        if match is not None:
	            phrases.append(lowered[match.end():])
	    return phrases

	# this function gets the
	# immediate word before a word
	# ----------------------------
	def get_word_before(text_object, word_to_find):
	    """Return the word immediately before the first match on each line.

	    Each line is lower-cased, stripped and searched with the regular
	    expression ``word_to_find``. The text before the match is split on
	    whitespace and its last token is kept. Cutting at the match position
	    (rather than with a literal ``str.partition``) keeps the result
	    correct when the pattern is a real regular expression.

	    Returns
	    -------
	    list of str
	        One entry per matching line; "NA" when nothing precedes the
	        match on that line (e.g. the phrase started on the previous line).
	    """
	    words = []
	    for line in text_object:
	        cleaned = line.lower().strip()
	        match = re.search(word_to_find, cleaned)
	        if match is None:
	            continue
	        preceding = cleaned[:match.start()].split()
	        words.append(preceding[-1] if preceding else "NA")
	    return words

	# this function gets the
	# immediate word after a given word
	# ---------------------------------
	def get_word_after(text_object, word_to_find):
	    """Return the word immediately after the first match on each line.

	    Each line is lower-cased, stripped and searched with the regular
	    expression ``word_to_find``. The text after the match is split on
	    whitespace and its first token is kept. Cutting at the match position
	    (rather than with a literal ``str.partition``) keeps the result
	    correct when the pattern is a real regular expression.

	    Returns
	    -------
	    list of str
	        One entry per matching line; "NA" when nothing follows the match
	        on that line (e.g. the phrase continues on the next line).
	    """
	    words = []
	    for line in text_object:
	        cleaned = line.lower().strip()
	        match = re.search(word_to_find, cleaned)
	        if match is None:
	            continue
	        following = cleaned[match.end():].split()
	        words.append(following[0] if following else "NA")
	    return words

	# this function assigns an ID number for use when
	# later combining dataframe together.
	# ------------------------------------------------
	def assign_id_number(text_object, word_to_find, ID_num):
	    """Repeat ``ID_num`` once for every line selected by ``word_num``.

	    The result lines up row-for-row with the other per-occurrence
	    columns so tables from different subjects can be combined later.

	    Returns
	    -------
	    list
	        ``ID_num`` repeated ``len(word_num(text_object, word_to_find))``
	        times.
	    """
	    # word_num rescans the whole text, so evaluate it once rather than
	    # on every iteration of a loop condition
	    row_count = len(word_num(text_object, word_to_find))
	    return [ID_num] * row_count

	# this function lists the word that we are looking at
	# so there is more context
	# ---------------------------------------------------
	def assign_word_var(text_object, word_to_find, word):
	    """Repeat ``word`` once for every line selected by ``word_num``.

	    Gives each row of the table a label for the word being examined.

	    Returns
	    -------
	    list
	        ``word`` repeated ``len(word_num(text_object, word_to_find))``
	        times.
	    """
	    # word_num rescans the whole text, so evaluate it once rather than
	    # on every iteration of a loop condition
	    row_count = len(word_num(text_object, word_to_find))
	    return [word] * row_count

	# this function gives subject a gender
	# ------------------------------------
	def assign_gender(text_object, word_to_find, gender):
	    """Repeat ``gender`` once for every line selected by ``word_num``.

	    Returns
	    -------
	    list
	        ``gender`` repeated ``len(word_num(text_object, word_to_find))``
	        times.
	    """
	    # word_num rescans the whole text, so evaluate it once rather than
	    # on every iteration of a loop condition
	    row_count = len(word_num(text_object, word_to_find))
	    return [gender] * row_count

	# this function gives subject an age
	# ------------------------------------
	def assign_age(text_object, word_to_find, age):
	    """Repeat ``age`` once for every line selected by ``word_num``.

	    Returns
	    -------
	    list
	        ``age`` repeated ``len(word_num(text_object, word_to_find))``
	        times.
	    """
	    # word_num rescans the whole text, so evaluate it once rather than
	    # on every iteration of a loop condition
	    row_count = len(word_num(text_object, word_to_find))
	    return [age] * row_count

	# this function gives subject a country of origin
	# -----------------------------------------------
	def assign_country(text_object, word_to_find, country):
	    """Repeat ``country`` once for every line selected by ``word_num``.

	    Returns
	    -------
	    list
	        ``country`` repeated ``len(word_num(text_object, word_to_find))``
	        times.
	    """
	    # word_num rescans the whole text, so evaluate it once rather than
	    # on every iteration of a loop condition
	    row_count = len(word_num(text_object, word_to_find))
	    return [country] * row_count

	# this function runs all the previous functions,
	# writes the result to a csv, and returns a
	# Pandas dataframe of the information we want.
	# # --------------------------------------------
	def main_part(file_name, current_path, dataframe_path, word_to_find, subject_ID_num, gender, age, country):
	    """Build, save and return the per-occurrence table for one interview.

	    Reads ``current_path + file_name``, produces one row for every line
	    that contains ``word_to_find`` and writes the table to
	    ``dataframe_path + word_to_find + "_" + str(subject_ID_num) + ".csv"``.

	    Parameters
	    ----------
	    file_name : str
	        Transcription file to read.
	    current_path : str
	        Directory prefix of the transcription file.
	    dataframe_path : str
	        Directory prefix the CSV is written under.
	    word_to_find : str
	        Word (pattern) to look for.
	    subject_ID_num, gender, age, country
	        Subject metadata copied onto every row.

	    Returns
	    -------
	    pandas.DataFrame
	        The table that was written to disk.
	    """
	    text_object = text_grabber(current_path, file_name)

	    # columns derived from the text, one entry per selected line
	    num = np.array(word_num(text_object, word_to_find))
	    phrase_before = np.array(get_before(text_object, word_to_find))
	    phrase_after = np.array(get_after(text_object, word_to_find))
	    word_before = np.array(get_word_before(text_object, word_to_find))
	    word_after = np.array(get_word_after(text_object, word_to_find))

	    # constant columns, repeated to the same length
	    id_num = np.array(assign_id_number(text_object, word_to_find, subject_ID_num))
	    word_var = np.array(assign_word_var(text_object, word_to_find, word_to_find))
	    gender = np.array(assign_gender(text_object, word_to_find, gender))
	    age = np.array(assign_age(text_object, word_to_find, age))
	    country = np.array(assign_country(text_object, word_to_find, country))

	    df = pd.DataFrame({"Word_Examined":word_var, "Subject_ID_Num":id_num, "Gender":gender, "Age":age, "Country":country, "Word_Appearance_Num_This_Interview":num,
	    "Phrase_Before":phrase_before, "Phrase_After":phrase_after, "Word_Before":word_before, "Word_After":word_after})

	    csv_name = word_to_find + "_" + str(subject_ID_num) + ".csv"
	    df.to_csv(os.path.expanduser(dataframe_path + csv_name), index = False)

	    # report only after the write succeeded, and name the directory the
	    # file really went to (dataframe_path, not the input directory)
	    print("The word", "\"", word_to_find, "\"", "appeared", word_counter(text_object, word_to_find), "total times in this subject's interview.")
	    print(len(num), "uses of the word were selected (based on whether they were its first appearance on a line).")
	    print("A DataFrame containing this information was written to \n\"", dataframe_path, "\" with the name", csv_name, ".\n")

	    return(df)

	# Runs the analysis as soon as the script executes: searches "1718.txt"
	# for "uhm", writes the CSV and keeps the resulting table in the
	# module-level name df.
	df = main_part("1718.txt", current_path, dataframe_path, "uhm", "1718", "F", 18, "United States")
No results found