Skip to content

Instantly share code, notes, and snippets.

@erap129
Last active February 5, 2022 19:10
Show Gist options
  • Save erap129/7eecb658e93b16e72cb7273abcdfcef1 to your computer and use it in GitHub Desktop.
Save erap129/7eecb658e93b16e72cb7273abcdfcef1 to your computer and use it in GitHub Desktop.
def get_wikipedia_page_name(raw_name):
    """Map a raw movie name to the top Wikipedia search result.

    Runs ``wikipedia.search`` on ``raw_name`` and returns the first title it
    yields. When the search yields nothing, an empty string is returned.
    """
    search_hits = wikipedia.search(raw_name)
    return search_hits[0] if search_hits else ''
def get_movie_plot(page_name):
    """Return the "Plot" section text of a Wikipedia page, or '' if unavailable.

    Args:
        page_name: Wikipedia page title, looked up verbatim
            (``auto_suggest=False``).

    Returns:
        The text found between the ``Plot ==`` heading marker and the next
        run of ``=`` that is followed by a space and an uppercase ASCII
        letter, taken from the page content with all newlines removed.
        An empty string is returned when ``page_name`` is empty, when the
        page lookup fails, when a disambiguation offers no option containing
        ``'film'``, or when the pattern does not match.
    """
    if not page_name:
        # Nothing to look up; avoid a pointless network round-trip.
        return ''
    try:
        try:
            page = wikipedia.page(page_name, auto_suggest=False)
        except wikipedia.DisambiguationError as e:
            # Ambiguous title: fall back to the first candidate that names a
            # film. Previously the fetched film page was thrown away and ''
            # was returned unconditionally.
            film_option = next(
                (option for option in e.options if 'film' in option), None
            )
            if film_option is None:
                return ''
            page = wikipedia.page(film_option, auto_suggest=False)
        movie_page_content = str(page.content)
    except (wikipedia.PageError, wikipedia.DisambiguationError, KeyError):
        # DisambiguationError is listed here too because the fallback lookup
        # above can itself be ambiguous; treat that as "no plot" instead of
        # letting it propagate to the caller.
        return ''
    # Newlines are stripped so the non-greedy group can span the whole section.
    re_groups = re.search(
        "Plot ==(.*?)=+ [A-Z]", movie_page_content.replace('\n', '')
    )
    return re_groups.group(1) if re_groups else ''
# Resolve every raw movie name to a Wikipedia page title, then pull the plot
# text for that page. The functions are passed directly; wrapping them in a
# lambda added nothing.
movies_df['wikipedia_page_name'] = movies_df['movie_name'].progress_apply(get_wikipedia_page_name)
movies_df['movie_plot'] = movies_df['wikipedia_page_name'].progress_apply(get_movie_plot)
# get_movie_plot reports failure as '' rather than NaN, so an isna()-only
# count would always be 0. Count empty strings as well as NaN.
missing_plots = movies_df['movie_plot'].isna() | (movies_df['movie_plot'] == '')
print(f'There are {missing_plots.sum()} missing (empty or NaN) movie plots')
movies_df[['movie_name', 'movie_plot']].head()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment