Skip to content

Instantly share code, notes, and snippets.

@gaphex
Created June 25, 2019 13:10
Show Gist options
  • Save gaphex/bb4548f4fb625ba3a19a0e70d868b0a4 to your computer and use it in GitHub Desktop.
Save gaphex/bb4548f4fb625ba3a19a0e70d868b0a4 to your computer and use it in GitHub Desktop.
import pandas as pd
import json
# Fetch the MovieSummaries archive from cs.cmu.edu and unpack it in the
# current working directory; the code below reads files under MovieSummaries/.
# NOTE: the leading "!" is an IPython/Jupyter shell escape, so these two
# lines only run inside a notebook (or IPython), not under plain `python`.
!wget http://www.cs.cmu.edu/~ark/personas/data/MovieSummaries.tar.gz
!tar -xvzf MovieSummaries.tar.gz
# Both inputs are tab-separated files without a header row, so columns are
# addressed by integer position further down.
tsv_options = {"sep": '\t', "header": None}

# plots_df rows are later unpacked as (movie id, plot text).
plots_df = pd.read_csv('MovieSummaries/plot_summaries.txt', **tsv_options)
# meta_df columns 0, 2 and 8 are later read as movie id, title and genre JSON.
meta_df = pd.read_csv('MovieSummaries/movie.metadata.tsv', **tsv_options)
# Map each movie id to its plot summary (each row unpacks as id, plot text).
plot = {movie_id: summary for movie_id, summary in plots_df.values}

# Map each movie id to its title and list of genre labels. The genre column
# holds a JSON object; only its values are kept. Movies whose genre object
# is empty are skipped entirely.
metadata = {}
for movie_id, title, genre_json in meta_df[[0, 2, 8]].values:
    genre_labels = list(json.loads(genre_json).values())
    if not genre_labels:
        continue
    metadata[movie_id] = {"name": title, "genre": genre_labels}

# Join the two tables on movie id, keeping only ids present in both.
# NOTE: the result is keyed by title, so movies that share a title
# overwrite one another and only the last one visited survives.
movie_data = {}
for movie_id in set(plot) & set(metadata):
    entry = metadata[movie_id]
    movie_data[entry['name']] = {"genre": entry['genre'],
                                 "plot": plot[movie_id]}
# Flatten movie_data into three parallel lists that share the dict's
# iteration order: plot texts (X), genre label lists (Y) and titles (names).
names = list(movie_data)
X = [record['plot'] for record in movie_data.values()]
Y = [record['genre'] for record in movie_data.values()]

# Replace the raw plot texts with the output of bert_vectorizer.
# NOTE(review): bert_vectorizer is not defined in this snippet; presumably it
# returns one embedding per input text -- confirm against its definition.
X = bert_vectorizer(X, verbose=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment