Last active
May 9, 2019 01:24
-
-
Save octoparse/3abc6771a87e49e34c9fa18f2ed7d91e to your computer and use it in GitHub Desktop.
Data Science: What is the near future of Superheroines?
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import collections
import re
def get_first_name(aString):
    """Extract actors' first names from a run-together cast string.

    The cast column is assumed to look like
    ``'Leonard Nimoy*Chris PineZachary QuintoZoe SaldanaKarl Urban'``, i.e.
    each actor's surname is glued to the next actor's first name (optionally
    with a ``*`` in between). TODO confirm against the real data file.

    Args:
        aString: Raw cast string; may be empty or ``None``.

    Returns:
        A list of first names in order of appearance. Empty input yields an
        empty list so callers can always iterate the result. Tokens with no
        capital letter, or with three or more (e.g. ``'McGregorHayden'``),
        are skipped because the split point cannot be determined.
    """
    if not aString:
        return []

    # e.g. ['Leonard', 'NimoyChris', 'PineZachary', 'QuintoZoe', ..., 'Urban']
    tokens = aString.replace('*', '').split(' ')

    first_names = []
    for token in tokens:
        capitals = re.findall(r'[A-Z]', token)
        if len(capitals) == 1:
            # A lone word: the first actor's first name, or the trailing
            # surname that is dropped below.
            first_names.append(token)
        elif len(capitals) == 2:
            # 'NimoyChris' -> 'Chris'. Searching from index 1 keeps the match
            # off the leading capital when both capitals are the same letter.
            first_names.append(token[token.find(capitals[1], 1):])

    # The final collected token is the last actor's surname, not a first name.
    return first_names[:-1]
def read_marvel(file_name):
    """Read movie titles from a text file holding one title per line.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A list of titles in file order. Line terminators are removed and
        blank lines (including the one implied by a trailing newline) are
        skipped, so no empty-string title is ever returned.
    """
    titles = []
    with open(file_name) as f:
        for line in f:
            title = line.rstrip('\r\n')
            if title:
                titles.append(title)
    return titles
def read_csv(file_name):  # file_name = 'boxoffice.csv'
    """Load (movie name, actor first names, year) tuples from a CSV file.

    Each line is split on plain commas; lines with fewer than three fields
    are ignored and fields beyond the third are discarded.

    NOTE(review): quoted fields containing commas are not supported by this
    simple split; confirm the data file never contains them.

    Args:
        file_name: Path of the comma-separated file to read.

    Returns:
        A list of ``(name, first_names, year)`` tuples in file order, where
        ``first_names`` is whatever ``get_first_name`` returns for the second
        column and ``year`` is the third column as a string.
    """
    movie_list = []
    with open(file_name) as f:
        for line in f:
            # Strip the terminator explicitly so a CRLF file does not leave
            # a stray '\r' on the year column.
            fields = line.rstrip('\r\n').split(',')
            if len(fields) < 3:
                continue
            # 0: movie name, 1: actor names, 2: year
            movie_list.append((fields[0], get_first_name(fields[1]), fields[2]))
    return movie_list
def lookup_gender(filename):
    """Build a first-name -> gender mapping from a two-column CSV file.

    Args:
        filename: Path of a file whose lines have the form
            ``firstname,gender``.

    Returns:
        A dict mapping each first name to its gender string. When a name
        occurs more than once, the last occurrence wins.

    Raises:
        ValueError: If a non-blank line does not contain exactly one comma.
    """
    gender_dict = {}
    with open(filename) as f:
        for line in f:
            record = line.rstrip('\r\n')
            # Skip blank lines instead of assuming the file ends with exactly
            # one trailing newline; otherwise a file without a final newline
            # would silently lose its last record.
            if not record:
                continue
            firstname, gender = record.split(',')
            gender_dict[firstname] = gender
    return gender_dict
## SCRIPT begins
# Reads name.csv, boxoffice.csv and marvel_movies.txt from the working
# directory and prints, per release year, how many actors of each gender
# appear -- first for all movies, then for the Marvel titles only.


def _known_genders(actors, lookup):
    """Return the genders of the actors whose first name is in *lookup*.

    Names missing from the lookup are skipped, so one unknown actor never
    discards (or corrupts) the rest of a movie's cast.
    """
    return [lookup[name] for name in actors if name in lookup]


def _print_gender_counts(genders_by_year):
    """Print one line per year: the year followed by a {gender: count} dict."""
    for year, genders in genders_by_year.items():
        # Single-argument print() gives identical output on Python 2 and 3.
        print('%s %s' % (year, dict(collections.Counter(genders))))


# construct a first name to gender dictionary
gender_dict = lookup_gender('name.csv')
movie_list = read_csv('boxoffice.csv')  # movie_list has 3 columns: name, actor list, year
all_movie_dict = collections.OrderedDict()
# Newest year first; the year column is compared as a string.
sorted_movie_list = sorted(movie_list, key=lambda x: x[2], reverse=True)

##all_actors = set()
##for m in sorted_movie_list:
##    for n in m[1]:
##        all_actors.add(n)
##
##with open('raw_name.csv','w') as f:
##    for name in all_actors:
##        f.write(name + '\n')

## Analyze all movies' actor gender by year
all_year_dict = collections.OrderedDict()
for m in sorted_movie_list:
    all_movie_dict[m[0]] = [m[1], m[2]]
    year = m[2]
    actors = m[1]
    # Previously a KeyError here was swallowed and the *previous* movie's
    # gender list was reused, double-counting it; unknown names are now
    # simply left out of the tally.
    all_year_dict.setdefault(year, []).extend(_known_genders(actors, gender_dict))

print(' key: year, value: list of gender of male/female')
_print_gender_counts(all_year_dict)

## Analyze Marvel's movie actor gender by year
# construct a dict, key: year, value: list of gender of male/female
year_dict = collections.OrderedDict()
for marvel_movie in read_marvel('marvel_movies.txt'):
    if not marvel_movie:
        # A blank line in the title file is not a movie.
        continue
    # A title missing from boxoffice.csv still raises KeyError here so that
    # a mismatch between the two data files is not silently hidden.
    year = str(all_movie_dict[marvel_movie][1])
    actors = all_movie_dict[marvel_movie][0]
    year_dict.setdefault(year, []).extend(_known_genders(actors, gender_dict))

# construct a dict, key: male/female, value: count
print('key: male/female, value: count')
_print_gender_counts(year_dict)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment