Download the HTML source of the IMDb list page (http://imdb.com/list/ls076752033/) and save it as data.html. Then run parse_imdb.py, which generates all_seasons.csv.
Then you can run plot.py.
Remember to run both scripts with Python 3.
| actor;season_1;season_2;season_3;season_4;season_5;season_6 | |
| Tyrion Lannister;52:45;65:0;50:0;47:45;44:0;34:0 | |
| Jon Snow;54:45;33:45;36:0;42:30;49:15;52:0 | |
| Daenerys Targaryen;54:0;38:45;30:45;27:30;41:45;28:45 | |
| Cersei Lannister;28:0;36:15;22:0;37:15;49:45;28:30 | |
| Sansa Stark;24:0;34:0;33:30;32:45;34:45;40:30 | |
| Arya Stark;33:30;38:45;26:30;25:45;34:45;30:0 | |
| Jaime Lannister;21:0;18:15;34:45;36:0;21:45;30:45 | |
| Theon Greyjoy;16:45;40:15;22:30;12:15;15:30;16:30 | |
| Samwell Tarly;21:15;11:15;24:0;26:0;23:30;16:0 | |
| Jorah Mormont;27:30;17:30;18:45;13:30;29:45;10:30 | |
| Petyr 'Littlefinger' Baelish;25:45;21:15;8:45;19:0;18:30;9:0 | |
| Eddard 'Ned' Stark;92:30;0:0;0:0;0:0;0:0;7:15 | |
| Brienne of Tarth;0:0;18:0;21:15;20:0;14:45;15:30 | |
| Davos Seaworth;0:0;19:15;19:0;11:30;12:30;27:15 | |
| Bran Stark;25:45;13:15;16:45;12:0;0:0;18:15 | |
| Catelyn Stark;42:15;24:0;16:30;0:0;0:0;0:0 | |
| Lord Varys;19:0;14:0;15:45;6:30;10:30;16:0 | |
| Tywin Lannister;10:15;17:15;21:15;28:30;1:0;0:0 | |
| Margaery Tyrell;0:0;9:30;21:15;17:45;11:15;18:15 | |
| Robb Stark;24:45;27:30;25:30;0:0;0:0;0:0 | |
| Stannis Baratheon;0:0;19:45;14:15;10:0;29:15;0:0 | |
| Sandor 'The Hound' Clegane;6:45;11:45;16:0;28:30;0:0;9:45 | |
| Joffrey Baratheon;17:45;17:30;20:15;14:45;0:0;0:0 | |
| Ramsay Bolton;0:0;0:0;12:45;13:30;18:30;21:15 | |
| Melisandre;0:0;12:45;18:0;5:15;11:45;18:0 | |
| Bronn;15:0;14:30;6:45;8:45;13:30;5:30 | |
| Gilly;0:0;3:15;13:30;9:15;11:45;15:15 | |
| Ygritte;0:0;17:45;25:0;8:15;0:0;0:0 | |
| Shae;8:0;16:0;14:30;8:45;0:0;0:0 | |
| Daario Naharis;0:0;0:0;6:30;7:30;19:15;12:45 | |
| Missandei;0:0;0:0;11:30;9:30;13:15;11:30 | |
| Tommen Baratheon;0:45;4:15;0:0;9:45;11:30;17:0 | |
| Tormund Giantsbane;0:0;0:0;8:45;7:15;12:15;13:30 | |
| Podrick Payne;0:0;3:45;5:30;16:0;9:0;7:0 | |
| Olenna Tyrell;0:0;0:0;13:45;8:0;8:0;8:30 | |
| High Sparrow;0:0;0:0;0:0;0:0;17:0;20:30 | |
| Barristan Selmy;8:45;0:0;13:30;7:45;7:15;0:0 | |
| Grand Maester Pycelle;13:45;6:15;2:45;5:15;2:30;4:45 | |
| Grey Worm;0:0;0:0;5:30;6:30;10:15;10:0 | |
| Loras Tyrell;5:45;8:0;6:15;1:45;4:30;5:45 | |
| Talisa Maegyr;0:0;14:45;16:0;0:0;0:0;0:0 | |
| Robert Baratheon;30:30;0:0;0:0;0:0;0:0;0:0 | |
| Roose Bolton;0:0;3:30;8:45;5:45;9:0;3:15 | |
| Osha;8:0;9:30;8:45;0:0;0:0;3:30 | |
| Hodor;3:0;6:0;7:0;7:0;0:0;6:30 | |
| Gendry;2:45;9:0;17:0;0:0;0:0;0:0 | |
| Oberyn Martell;0:0;0:0;0:0;28:30;0:0;0:0 | |
| Eddison Tollett;0:0;5:0;3:45;5:45;4:15;9:30 | |
| Yara Greyjoy;0:0;10:0;2:45;2:15;0:0;12:0 | |
| Meera Reed;0:0;0:0;8:15;9:15;0:0;9:30 | |
| Jaqen H'ghar;0:0;8:0;0:0;0:0;11:15;7:15 | |
| Alliser Thorne;6:45;0:0;0:0;9:15;5:45;4:15 | |
| Khal Drogo;22:45;2:15;0:0;0:0;0:0;0:0 | |
| Renly Baratheon;9:30;14:30;0:0;0:0;0:0;0:0 | |
| Maester Luwin;6:45;16:45;0:0;0:0;0:0;0:0 | |
| Ros;8:15;10:15;3:15;0:0;0:0;0:0 | |
| Grenn;6:15;5:0;3:0;7:0;0:0;0:0 | |
| Mance Rayder;0:0;0:0;6:45;7:0;7:15;0:0 | |
| Jeor Mormont;10:0;7:0;4:0;0:0;0:0;0:0 | |
| Viserys Targaryen;20:30;0:0;0:0;0:0;0:0;0:0 | |
| Qyburn;0:0;0:0;5:45;2:15;4:15;7:0 | |
| Jojen Reed;0:0;0:0;9:45;9:15;0:0;0:0 | |
| Maester Aemon;5:30;0:0;1:15;6:0;6:15;0:0 | |
| Gregor 'The Mountain' Clegane;2:30;1:45;0:0;4:45;0:30;9:15 | |
| Ellaria Sand;0:0;0:0;0:0;6:45;8:45;3:0 | |
| Lancel Lannister;3:15;6:30;0:0;0:0;4:0;5:15 | |
| Shireen Baratheon;0:0;0:0;4:0;2:45;11:30;0:0 | |
| Edmure Tully;0:0;0:0;10:30;0:0;0:0;7:15 | |
| Lysa Arryn;6:0;0:0;0:0;10:30;0:0;0:0 | |
| Meryn Trant;3:0;2:45;1:15;1:45;7:15;0:0 | |
| Brynden 'Blackfish' Tully;0:0;0:0;9:30;0:0;0:0;6:30 | |
| Walder Frey;3:0;0:0;7:0;0:0;0:0;5:45 | |
| Thoros of Myr;0:0;0:0;12:45;0:0;0:0;2:15 | |
| Janos Slynt;1:0;4:15;0:0;5:15;3:45;0:0 | |
| Locke;0:0;0:0;7:0;6:0;0:0;0:0 | |
| Myranda;0:0;0:0;3:30;1:30;7:15;0:45 | |
| Rickon Stark;1:15;4:15;3:45;0:0;0:0;3:15 | |
| Rodrik Cassel;10:0;2:0;0:0;0:0;0:0;0:45 | |
| Waif;0:0;0:0;0:0;0:0;4:30;8:15 | |
| Hot Pie;0:45;5:30;4:30;2:0;0:0;0:0 | |
| Rast;4:15;0:0;4:0;4:30;0:0;0:0 | |
| Septa Unella;0:0;0:0;0:0;0:0;7:30;5:0 | |
| Olly;0:0;0:0;0:0;3:15;7:15;2:0 | |
| Doreah;9:30;3:0;0:0;0:0;0:0;0:0 | |
| Balon Greyjoy;0:0;6:30;2:30;0:0;0:0;3:15 | |
| Benjen Stark;6:15;0:0;0:0;0:0;0:0;5:45 | |
| Pypar;7:0;0:0;0:45;4:15;0:0;0:0 | |
| Yoren;6:0;6:0;0:0;0:0;0:0;0:0 | |
| Myrcella Baratheon;0:45;1:30;0:0;0:0;8:0;1:30 | |
| Hizdahr zo Loraq;0:0;0:0;0:0;2:0;9:45;0:0 | |
| Mace Tyrell;0:0;0:0;0:0;3:45;3:30;4:15 | |
| Robin Arryn;3:45;0:0;0:0;4:45;0:30;2:15 | |
| Beric Dondarrion;0:30;0:0;8:15;0:0;0:0;2:30 | |
| Karl Tanner;0:0;0:0;0:0;2:30;8:45;0:0 | |
| Selyse Baratheon;0:0;0:15;2:0;5:15;3:45;0:0 | |
| Xaro Xhoan Daxos;0:0;10:30;0:0;0:0;0:0;0:0 | |
| Irri;7:0;3:0;0:0;0:0;0:0;0:0 | |
| Lady Crane;0:0;0:0;0:0;0:0;0:0;10:0 | |
| Qhorin Halfhand;0:0;9:30;0:0;0:0;0:0;0:0 | |
| Orell;0:0;0:0;9:0;0:0;0:0;0:0 |
| from bs4 import BeautifulSoup | |
| from html import unescape | |
| import re | |
| import requests as r | |
def time_to_float(time):
    """Convert an ``"a:b"`` string into the number ``a + b/60``.

    A string without a colon is returned as a plain ``int``. An empty part
    before the colon (for example ``":30"``) counts as zero.

    Raises:
        ValueError: if a part is not an integer or there are more than two
            colon-separated parts.
    """
    parts = time.split(":")
    if len(parts) == 1:
        return int(parts[0])
    whole, sixtieths = parts
    # ``whole or 0`` maps the empty string to 0 and leaves any other text
    # for int() to parse.
    return int(whole or 0) + int(sixtieths) / 60
def float_to_time(value):
    """Format a fractional amount as ``"whole:sixtieths"`` (no zero padding).

    This is the inverse of ``a + b/60``: ``52.75`` becomes ``"52:45"`` and
    ``0.75`` becomes ``"0:45"``.

    Args:
        value: non-negative int or float.

    Returns:
        The formatted string, e.g. ``"65:0"``.
    """
    # Round to whole sixtieths once, up front. Values built by summing
    # floats can sit a hair below the exact result (0.1 + 0.7 gives
    # 0.7999999999999999), and truncating with int() would then report
    # 47 instead of 48.
    total_sixtieths = int(round(value * 60))
    whole, sixtieths = divmod(total_sixtieths, 60)
    return "{0}:{1}".format(whole, sixtieths)
# The IMDb page was downloaded by hand and saved as data.html.
# Read it inside a context manager so the file handle is always closed.
with open("data.html") as page:
    soup = BeautifulSoup(page, "html.parser")

# The first matching div is skipped; the rest are processed in document
# order by the loop below.
mydivs = soup.find_all("div", class_=["info", "description"])[1:]
name_regex = re.compile(r">(.*?)</a>")
season_regex = re.compile(r"\* [Ss]eason (\d):.*<(.*)>")

# Maps a character name to six accumulated per-season values.
characters = dict()
# Name taken from the most recent div that had a link; None until one is seen.
character = None
for div in mydivs:
    if div.a:
        # html.unescape() works on strings, so serialise the tag first;
        # otherwise entities such as &amp; stay in the extracted name.
        text = unescape(str(div.a))
        character = name_regex.findall(text)[0]
        characters[character] = [0, 0, 0, 0, 0, 0]
    elif character is not None:
        # A div without a link: scan its bare text nodes for lines of the
        # form "* Season N: ... <time>" and add them to the current name.
        for children in div.children:
            if "NavigableString" in type(children).__name__:
                text = str(children)
                duration = season_regex.findall(text)
                if duration:
                    season, time = duration[0]
                    characters[character][int(season) - 1] += time_to_float(time)

with open("all_seasons.csv", "w") as db:
    db.write("actor;season_1;season_2;season_3;season_4;season_5;season_6\n")
    for character, season_values in characters.items():
        db.write("{0};{1}\n".format(
            character,
            ";".join(map(float_to_time, season_values))))
| #!/anaconda/bin/python3 | |
| import matplotlib.pyplot as plt | |
| import pandas as pd | |
| import numpy as np | |
| plt.style.use("ggplot") | |
# Times are stored as "minutes:seconds"; the minutes part is not split into
# hours, so it may be larger than 59.
def time_to_float(value):
    """Convert a ``"minutes:seconds"`` string into fractional minutes.

    The literal string ``"0"`` is mapped to the integer ``0``.

    Raises:
        ValueError: if the string does not hold exactly two integer parts.
    """
    if value == "0":
        return 0
    minutes, seconds = [int(part) for part in value.split(":")]
    return minutes + seconds / 60
def plot_by_time(df):
    """Draw one stacked horizontal bar per row and save ``all_actors.png``.

    Args:
        df: DataFrame with an ``actor`` column first and a ``total`` column
            last; every column in between is stacked as one bar segment.
    """
    fig = plt.figure(figsize=(8, 20))
    ax = fig.add_subplot(111)
    season_columns = df.columns[1:-1]
    df.plot.barh(ax=ax, x='actor', y=season_columns,
                 stacked=True, width=.6)
    # Take the median of the numeric column only. DataFrame.median() over
    # the whole frame also visits the string 'actor' column, which
    # pandas >= 2.0 rejects with a TypeError.
    ax.axvline(df['total'].median(), color='black', linestyle="dashed")
    ax.set_title("Screen time of GOT characters")
    ax.set_ylabel("")
    ax.set_xlabel("Time in minutes")
    fig.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
    # NOTE(review): the label order assumes matplotlib reports the dashed
    # median line before the bar containers - confirm on the rendered figure.
    ax.legend(["Median", "Season 1", "Season 2", "Season 3", "Season 4", "Season 5", "Season 6"])
    fig.text(.02, .005, "Source: http://imdb.com/list/ls076752033/")
    fig.savefig("all_actors.png", dpi=300, format="png")
def plot_by_time_by_season(df):
    """Plot the ten largest rows of every season column on a 2x3 grid.

    One subplot is drawn per column between the first and the last column
    of ``df``, and the figure is saved as ``char_by_season.png``.

    Args:
        df: DataFrame with an ``actor`` column first and one trailing
            column after the season columns.
    """
    fig = plt.figure(figsize=(16, 12))
    palette = plt.get_cmap('Set1')
    colors = [palette(step) for step in np.linspace(0, 1, 6)]
    for position, season in enumerate(df.columns[1:-1], start=1):
        ax = fig.add_subplot(2, 3, position)
        ax.set_title(season.replace("_", " ").title())
        # Sort ascending by this season and keep the tail, i.e. the ten
        # largest values, so the biggest bar ends up on top.
        top_ten = df.sort_values(by=season).iloc[-10:]
        top_ten.plot.barh(ax=ax, x='actor', y=season,
                          stacked=True, width=.6, legend=False,
                          color=colors[position - 1])
        ax.set_ylabel("")
        ax.set_xlabel("Time in minutes")
    fig.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
    fig.text(.02, .005, "Source: http://imdb.com/list/ls076752033/")
    fig.savefig("char_by_season.png", dpi=300, format="png")
def by_house(df):
    """Plot stacked per-season totals for the ten largest houses.

    The house is taken to be the last word of the ``actor`` value. Rows
    whose name is a single word are dropped, and "Jon Snow" is assigned to
    'Stark'. The figure is saved as ``by_house.png``.

    Args:
        df: DataFrame with an ``actor`` column first, the season columns
            after it, and a ``total`` column. It is not modified.
    """
    # Work on a copy so the caller's frame does not gain a 'house' column.
    # An extra trailing column would shift the positional column slices
    # used by the other plotting code in this file.
    df = df.copy()
    # Everything after 'actor' except the aggregate and helper columns is a
    # season column.
    season_columns = [column for column in df.columns[1:]
                      if column not in ("total", "house")]
    # In most cases the last name is the house.
    df['house'] = df['actor'].map(lambda name: name.split(" ")[-1])
    # Special case: Jon Snow is counted as a Stark.
    df.loc[df['actor'] == "Jon Snow", 'house'] = 'Stark'
    # Drop people without a last name.
    df = df.loc[df['actor'].map(lambda name: len(name.split(" "))) > 1]
    # Restrict the aggregation to the numeric columns so the string
    # 'actor' column is never summed.
    grouped = df.groupby(by='house')[season_columns + ['total']].sum()
    grouped = grouped.reset_index().sort_values(by='total')
    fig = plt.figure()
    ax = fig.add_subplot(111)
    grouped.iloc[-10:].plot.barh(ax=ax, x='house', y=season_columns,
                                 stacked=True, width=.6)
    ax.legend(["Season 1", "Season 2", "Season 3", "Season 4", "Season 5", "Season 6"])
    ax.set_title("Screen time by house")
    ax.set_xlabel("Minutes")
    fig.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
    fig.savefig("by_house.png", dpi=300, format="png")
def by_season(df):
    """Plot, for each of the six seasons, the ten houses with the most time.

    The house is taken to be the last word of the ``actor`` value. Rows
    whose name is a single word are dropped, and "Jon Snow" is assigned to
    'Stark'. The six subplots sit on a 2x3 grid and the figure is saved as
    ``by_season.png``.

    Args:
        df: DataFrame with an ``actor`` column and numeric columns
            ``season_1`` to ``season_6``. It is not modified.
    """
    # Work on a copy so the caller's frame does not gain a 'house' column.
    df = df.copy()
    # In most cases the last name is the house.
    df['house'] = df['actor'].map(lambda name: name.split(" ")[-1])
    # Special case: Jon Snow is counted as a Stark.
    df.loc[df['actor'] == "Jon Snow", 'house'] = 'Stark'
    # Drop people without a last name.
    df = df.loc[df['actor'].map(lambda name: len(name.split(" "))) > 1]
    seasons = ["season_%d" % number for number in range(1, 7)]
    # Sum only the season columns so the string 'actor' column is never
    # aggregated.
    grouped = df.groupby(by='house')[seasons].sum().reset_index()
    fig = plt.figure(figsize=(19, 10))
    cmap = plt.get_cmap('Set1')
    colors = [cmap(i) for i in np.linspace(0, 1, 6)]
    # Subplot positions are 1-based and fill the grid row by row, so the
    # position is simply the season's running number.
    for index, season in enumerate(seasons, start=1):
        ax = fig.add_subplot(2, 3, index)
        top_ten = grouped.sort_values(by=season).iloc[-10:]
        top_ten.plot.barh(ax=ax, x='house', y=season, legend=False,
                          color=colors[index - 1])
        ax.set_ylabel("House")
        ax.set_xlabel("Minutes")
        ax.set_title(season.replace("_", " ").title())
    fig.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
    fig.text(.02, .005, "Source: http://imdb.com/list/ls076752033/")
    fig.savefig("by_season.png", dpi=300, format="png")
df = pd.read_csv("all_seasons.csv", sep=";")

# Every column after the first holds one season. Capture the list before
# the aggregate column is appended.
season_columns = list(df.columns[1:])
df['total'] = 0
for column in season_columns:
    # Convert the "minutes:seconds" strings to fractional minutes, then
    # accumulate the per-row total.
    df[column] = df[column].map(time_to_float)
    df['total'] += df[column]
df = df.sort_values(by="total")

# Print each season's summed screen time, converted from minutes to hours.
for column in season_columns:
    print(df[column].sum() / 60)

# Just uncomment the plot you want
#plot_by_time(df)
#plot_by_time_by_season(df)
#by_house(df)
#by_season(df)
#plt.show()