Created
December 14, 2017 07:22
-
-
Save a-y-khan/b0c4c7413febcbc0219a3740f8306e91 to your computer and use it in GitHub Desktop.
Convert Movie dataset CSV files to Pandas DataFrames
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import ast
from datetime import datetime
# NOTE: distutils was removed from the standard library in Python 3.12 (PEP 632).
from distutils.util import strtobool
from typing import Dict, List, Union

import numpy as np
import pandas as pd
def convert_to_dict_list(string: str) -> Union[List[Dict], None]:
    """
    Use Python's Abstract Syntax Tree (ast) module to convert a string to a list of dict objects.
    This is an easy way to deal with the lists and the single-quoted dicts, which the JSON converter won't handle.

    Returns the parsed list, or None when the input is not a string, is too short,
    cannot be parsed as a Python literal, or parses to something other than a list.
    """
    # initial sanity check for malformed input: need minimum 2 characters to form a valid list ("[]")
    if not isinstance(string, str) or len(string) <= 1:
        return None
    try:
        converted_value = ast.literal_eval(string)
    except (ValueError, TypeError, SyntaxError):
        # literal_eval raises SyntaxError for truncated/garbled text, ValueError for
        # non-literal nodes (e.g. a bare name such as nan) and TypeError for e.g. unhashable dict keys
        return None
    return converted_value if isinstance(converted_value, list) else None
def convert_to_dict(string: str) -> Union[Dict, None]:
    """
    Use Python's Abstract Syntax Tree (ast) module to convert a string to a dict object.
    This is an easy way to deal with the single-quoted dicts, which the JSON converter won't handle.

    A list holding exactly one dict is unwrapped to that dict.
    Returns None when the input is not a string, is too short, cannot be parsed
    as a Python literal, or does not yield a dict.
    """
    # initial sanity check for malformed input: need minimum 2 characters to form a valid dict ("{}")
    if not isinstance(string, str) or len(string) <= 1:
        return None
    try:
        converted_value = ast.literal_eval(string)
    except (ValueError, TypeError, SyntaxError):
        # literal_eval raises SyntaxError for truncated/garbled text, ValueError for
        # non-literal nodes and TypeError for e.g. unhashable dict keys
        return None
    if isinstance(converted_value, list) and len(converted_value) == 1:
        # single-element list: the element itself is the candidate dict
        converted_value = converted_value[0]
    return converted_value if isinstance(converted_value, dict) else None
# Spellings accepted as booleans (same sets distutils.util.strtobool recognised).
_TRUE_STRINGS = frozenset(('y', 'yes', 't', 'true', 'on', '1'))
_FALSE_STRINGS = frozenset(('n', 'no', 'f', 'false', 'off', '0'))


def convert_to_bool_object(string: str) -> Union[bool, float]:
    """
    Convert a truth-value string to a bool, case-insensitively.

    Returns True/False for a recognised spelling, and NaN (a float) for anything else,
    including non-string input. Mixing bools and NaN promotes the resulting column to object.
    """
    try:
        token = string.lower()
    except AttributeError:
        # not a string: treat as a missing value
        return float('nan')
    if token in _TRUE_STRINGS:
        return True
    if token in _FALSE_STRINGS:
        return False
    # promotes to object
    return float('nan')
def convert_to_float(string: str) -> float:
    """
    Convert a string to a float.

    Returns NaN when the value cannot be interpreted as a float
    (empty string, non-numeric text, or a non-string/non-number such as None).
    """
    try:
        return float(string)
    except (ValueError, TypeError):
        return float('nan')
def cleanup_str(string: str) -> str:
    """
    Basic string cleanup. If the string matches any empty or null patterns, return empty string.

    None, empty values and any value containing the substring 'NaN' become ''.
    Values that support neither emptiness nor containment checks (e.g. numbers) also become ''.
    Everything else is returned unchanged.
    """
    # None must be tested before any length/containment check, which would raise on it
    if string is None:
        return ''
    try:
        if not string or 'NaN' in string:
            return ''
    except TypeError:
        # value does not support the `in` operator (e.g. an int)
        return ''
    return string
def convert_to_date(string: str) -> Union[datetime, None]:
    """
    Parse a 'YYYY-MM-DD' date string into a datetime.

    Returns None when the value does not match that format or is not a string.
    """
    try:
        return datetime.strptime(string, '%Y-%m-%d')
    except (ValueError, TypeError):
        # ValueError: text does not match the format; TypeError: input is not a string
        return None
def convert_to_imdb_id(id: str) -> Union[str, None]:
    """
    Format IMDB IDs to follow the same ID convention (starts with 'tt').

    The value is first passed through cleanup_str. An empty/null ID stays an empty
    string instead of becoming the meaningless ID 'tt'. An ID that already starts
    with 'tt' is returned as is; any other ID gets the 'tt' prefix.
    Returns None when the cleaned value is not a string.
    """
    cleaned = cleanup_str(id)
    if not cleaned:
        # nothing to prefix: keep missing IDs recognisable as missing
        return ''
    try:
        if cleaned.startswith('tt'):
            return cleaned
    except AttributeError:
        # cleaned value is not a string
        return None
    return 'tt{}'.format(cleaned)
def convert_to_id(id: str) -> np.int32:
    """
    Convert an ID string to a 32-bit integer.

    Returns -1 (as int32) when the value cannot be converted, e.g. empty string,
    non-integer text, None, or a number outside the int32 range.
    """
    try:
        return np.int32(id)
    except (ValueError, TypeError, OverflowError):
        return np.int32(-1)
def _load_csv(path: str, stem: str, columns: list) -> pd.DataFrame:
    """Read '<path>/<stem>.csv' using `columns`, an ordered list of (column name, converter) pairs."""
    return pd.read_csv('{}/{}.csv'.format(path, stem),
                       # header=0 combined with names= replaces the header row of the file
                       header=0, delimiter=',',
                       names=[name for name, _ in columns],
                       converters=dict(columns))


def _report_and_pickle(df: pd.DataFrame, path: str, stem: str) -> None:
    """Print the dtypes and memory usage of `df`, then pickle it to '<path>/<stem>.p'."""
    print(df.dtypes)
    # DataFrame.info() prints its own report and returns None, so it must not be wrapped in print()
    df.info(memory_usage='deep')
    df.to_pickle('{}/{}.p'.format(path, stem))


def main(path: str) -> None:
    """
    Convert the movie dataset CSV files found in `path` to pickled Pandas DataFrames.

    Reads credits, keywords, movies_metadata, links, links_small and ratings_small
    ('<path>/<name>.csv'), prints a short report for each, and writes '<path>/<name>.p'.
    """
    # credits.csv, keywords.csv and movies_metadata.csv contain "stringified" JSON Objects
    # converter needed since default does not convert stringified JSON
    credits_df = _load_csv(path, 'credits', [
        ('cast', convert_to_dict_list),
        ('crew', convert_to_dict_list),
        ('tmdbId', convert_to_id),
    ])
    _report_and_pickle(credits_df, path, 'credits')

    keywords_df = _load_csv(path, 'keywords', [
        ('tmdbId', convert_to_id),
        ('keywords', convert_to_dict_list),
    ])
    _report_and_pickle(keywords_df, path, 'keywords')

    movies_metadata_df = _load_csv(path, 'movies_metadata', [
        ('adult', convert_to_bool_object),
        ('belongs_to_collection', convert_to_dict),
        ('budget', convert_to_float),
        ('genres', convert_to_dict_list),
        ('homepage', cleanup_str),
        ('tmdbId', convert_to_id),
        ('imdbId', convert_to_imdb_id),
        ('original_language', cleanup_str),
        ('original_title', cleanup_str),
        ('overview', cleanup_str),
        ('popularity', convert_to_float),
        ('poster_path', cleanup_str),
        ('production_companies', convert_to_dict_list),
        ('production_countries', convert_to_dict_list),
        ('release_date', convert_to_date),
        ('revenue', convert_to_float),
        ('runtime', convert_to_float),
        ('spoken_languages', convert_to_dict_list),
        ('status', cleanup_str),
        ('tagline', cleanup_str),
        ('title', cleanup_str),
        ('video', convert_to_bool_object),
        ('vote_average', convert_to_float),
        ('vote_count', convert_to_float),
    ])
    # cleanup malformed rows: a row whose 'adult' value did not parse as a boolean is dropped
    malformed_rows = movies_metadata_df.index[movies_metadata_df['adult'].isnull()]
    movies_metadata_df = movies_metadata_df.drop(malformed_rows)
    movies_metadata_df['adult'] = movies_metadata_df.adult.astype(bool)
    # NOTE(review): a 'video' value that is still NaN at this point becomes True,
    # because NaN is truthy -- confirm that is acceptable for this dataset
    movies_metadata_df['video'] = movies_metadata_df.video.astype(bool)
    _report_and_pickle(movies_metadata_df, path, 'movies_metadata')

    # Simple CSV files
    link_columns = [
        ('movieId', convert_to_id),
        ('imdbId', convert_to_imdb_id),
        ('tmdbId', convert_to_id),
    ]
    links_df = _load_csv(path, 'links', link_columns)
    _report_and_pickle(links_df, path, 'links')

    links_small_df = _load_csv(path, 'links_small', link_columns)
    _report_and_pickle(links_small_df, path, 'links_small')

    ratings_small_df = _load_csv(path, 'ratings_small', [
        ('userId', convert_to_id),
        ('movieId', convert_to_id),
        ('rating', convert_to_float),
        ('timestamp', convert_to_float),
    ])
    _report_and_pickle(ratings_small_df, path, 'ratings_small')
# Script entry point: parse the data directory from the command line and run the conversion.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        # adjacent string literals are concatenated, so the first one must end with a space
        description='Load and import movies data files to Pandas dataframes. '
                    'Load files from and output pickled dataframes to PATH. Default is "."')
    parser.add_argument('--path', default='.', required=False,
                        help='directory containing the CSV files; pickles are written there too')
    args = parser.parse_args()
    main(args.path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment