Last active
October 20, 2022 19:30
-
-
Save aleenprd/090c49fd0fbb1f53afbfcc758dfcbf52 to your computer and use it in GitHub Desktop.
scrape_imdb_reviews_pages
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _collect_reviews(link: str, episode_number: int, driver_service: Service) -> pd.DataFrame:
    """Scrape the reviews reachable from ``link`` and tag them with an episode number.

    Args:
        link (str): URL of an episode page or of the show itself; it is
            converted to a reviews-page URL via ``get_reviews_page``.
        episode_number (int): value written to the ``episode_number`` column
            of every scraped row.
        driver_service (Service): forwarded unchanged to
            ``scroll_reviews_and_cook_soup``.

    Returns:
        pd.DataFrame: whatever ``scrape_reviews_page`` returns, with an
        added ``episode_number`` column.
    """
    reviews_page = get_reviews_page(link)
    print("Parsing Reviews at: ", reviews_page)
    reviews_soup = scroll_reviews_and_cook_soup(
        link=reviews_page, driver_service=driver_service)
    reviews_df = scrape_reviews_page(reviews_soup)
    reviews_df["episode_number"] = episode_number
    return reviews_df


@timing
def main(season_link: str, show_link: str, driver_service: Service, output_path: str) -> None:
    """Scrape an IMDB season's per-episode reviews plus the show's general reviews.

    One dataframe is built per episode link found on the season page, one more
    is built for the show-level reviews (tagged with ``episode_number`` 0), and
    all of them are concatenated and written to a single CSV file.

    Args:
        season_link (str): URL pointing to the season page.
        show_link (str): URL pointing to the show, used for the general reviews.
        driver_service (Service): driver service handed to the scraping helpers.
        output_path (str): path, including filename, where the CSV is saved.
    """
    # One dataframe per scraped page; concatenated once at the end.
    re_dfs = []

    episodes_links = get_episodes_links(link=season_link, driver_service=driver_service)
    print("Episodes: ", episodes_links)

    # tqdm renders a progress bar so we can see how far along the scrape is.
    for ep in tqdm(episodes_links):
        # The episode number is whatever follows the last "ep" in the link.
        # It is parsed before any scraping so a malformed link fails fast.
        episode_number = int(ep.split("ep")[-1])
        re_dfs.append(_collect_reviews(ep, episode_number, driver_service))
        sleep(5)  # Pause between episodes so the server is not flooded with requests.

    # Show-level (general) reviews are stored under episode number 0.
    re_dfs.append(_collect_reviews(show_link, 0, driver_service))

    season_reviews_df = pd.concat(re_dfs)
    season_reviews_df.to_csv(output_path, header=True, index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment