# Parser as part of this interview challenge: https://github.com/GuildEducationInc/data-engineer-project
# Dataset: https://www.kaggle.com/rounakbanik/the-movies-dataset
# Alternative ETL process, including DB schema: https://github.com/guenthermi/the-movie-database-import
import ast

import pandas as pd

keywords_df = pd.read_csv("the-movies-dataset/keywords.csv", nrows=1000)
keywords_df = keywords_df.rename(columns={"id": "movie_id"})
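# For orientation (an illustrative example, not copied from the file): each row of
# keywords.csv pairs a movie id with a JSON-like string of keyword dicts, e.g.
#   movie_id: 862
#   keywords: "[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}]"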
# The keywords column holds a JSON-like string.
# Eval each row's string, turning it into a Python list of dicts.
# Use ast.literal_eval() instead of eval() for injection safety.
# Convert the lists of dicts to new columns in a new df.
df = keywords_df['keywords'].apply(ast.literal_eval).apply(pd.Series)
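# df is now wide: one column per keyword position (0, 1, 2, ...), where each cell is
# either a {'id': ..., 'name': ...} dict or NaN for movies with fewer keywords than
# the longest list in this slice.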
# There's still nested json-like structure in the new columns.
# Take a similar approach to split the remaining data into columns.
# We will also convert the data to long form,
# so each movie_id will have a row for every plot type it has.
to_concat = []
for col in df.columns:
    tdf = df[col].apply(pd.Series)[['id', 'name']]
    tdf.index = keywords_df['movie_id'].copy()
    to_concat.append(tdf)
df = pd.concat(to_concat, axis="rows")
# Because some movies have fewer plot types than others,
# there were null values in some of our split columns.
# Let's drop those, rename our id column to something clearer,
# sort our index by movie and plot, and reset our index.
df = (df
      .dropna(how='all')
      .rename(columns={"id": "plot_id"})
      .reset_index()
      .sort_values(by=["movie_id", "plot_id"])
      .reset_index(drop=True))
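# Note: plot_id will typically carry a float dtype at this point (the NaN padding
# upstream promotes the ids), so the CSV below can contain values like 931.0.
# If plain integer ids are preferred, an optional cast such as
#     df = df.astype({"plot_id": "Int64"})
# could be added before writing; "Int64" is pandas' nullable integer dtype.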
# Write out our cleaned-up CSV file without the generic range index.
df.to_csv("the-movies-dataset/keywords_clean_long_form.csv", index=False)
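# Optional sanity check (a minimal sketch; the path and column names assume the
# cleanup above ran as written): read the long-form file back and confirm its shape.
check_df = pd.read_csv("the-movies-dataset/keywords_clean_long_form.csv")
assert list(check_df.columns) == ["movie_id", "plot_id", "name"]
assert len(check_df) > 0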