Last active
October 21, 2022 19:22
-
-
Save aleenprd/981194b2197a462161d66680525a74c1 to your computer and use it in GitHub Desktop.
scrape_reviews_page
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def scrape_reviews_page(reviews_soup: BeautifulSoup) -> pd.DataFrame:
    """Scrape an IMDB reviews page into a DataFrame.

    Every ``div`` with class ``lister-item`` found in the soup is treated as
    one review and produces exactly one row. For each review the rating,
    user name, review date, title, body text, number of helpful reactions
    and total number of reactions are extracted. A field whose element is
    not found is stored as ``None`` so all columns keep the same length.

    Note:
        Element lookup is delegated to ``fetch_el_if_available`` (defined
        elsewhere). It is assumed to return the matched element's text as
        a ``str``, or ``None`` when the element is absent -- TODO confirm.

    Args:
        reviews_soup (BeautifulSoup): soup of the entirely loaded reviews page.

    Returns:
        df_out (pd.DataFrame): one row per review with the columns
            ``review_rating``, ``user_name``, ``review_date``,
            ``review_title``, ``review_text``, ``num_helpful_reactions``
            and ``num_total_reactions``.
    """
    # Initialize dataframe columns as empty lists to be populated
    df_out = pd.DataFrame()
    review_ratings = []
    user_names = []
    review_dates = []
    review_titles = []
    review_texts = []
    num_helpful_reactions = []
    num_total_reactions = []

    # Find all review boxes on the page so we can iterate over them
    review_boxes = reviews_soup.find_all('div', {"class": "lister-item"})

    for review in review_boxes:
        # Rating of review: keep only the part before the "/" as a float
        review_rating = fetch_el_if_available(review, "div", "ipl-ratings-bar")
        if review_rating is not None:
            review_rating = float(review_rating.replace("\n", "").split("/")[0])
        review_ratings.append(review_rating)

        # User name: first space-separated token of the name/date element
        user_name_and_date = fetch_el_if_available(review, "div", "display-name-date")
        if user_name_and_date is not None:
            user_name_and_date = user_name_and_date.replace("\n", "").split(" ")
            user_names.append(user_name_and_date[0])
        else:
            user_names.append(None)

        # Review date
        review_date = fetch_el_if_available(review, "span", "review-date")
        if review_date is not None:
            review_date = review_date.replace("\n", "").strip()
        review_dates.append(review_date)

        # Title of review
        review_title = fetch_el_if_available(review, "a", "title")
        if review_title is not None:
            review_title = review_title.replace("\n", "")
        review_titles.append(review_title)

        # Text of review. The guard must test review_text itself: testing
        # review_title here crashed on reviews with a title but no text and
        # skipped the newline cleanup on reviews with text but no title.
        review_text = fetch_el_if_available(review, "div", "text")
        if review_text is not None:
            review_text = review_text.replace("\n", "")
        review_texts.append(review_text)

        # Review reactions: token 0 is the helpful count and token 3 the
        # total count (presumably text shaped like "N out of M ..." --
        # verify against the page). Thousands separators are removed first.
        reactions = fetch_el_if_available(review, "div", "actions")
        if reactions is not None:
            reactions = reactions.replace("\n", "").strip().split(" ")
            num_helpful_reactions.append(float(reactions[0].replace(",", "")))
            num_total_reactions.append(float(reactions[3].replace(",", "")))
        else:
            num_helpful_reactions.append(None)
            num_total_reactions.append(None)

    df_out["review_rating"] = review_ratings
    df_out["user_name"] = user_names
    df_out["review_date"] = review_dates
    df_out["review_title"] = review_titles
    df_out["review_text"] = review_texts
    df_out["num_helpful_reactions"] = num_helpful_reactions
    df_out["num_total_reactions"] = num_total_reactions

    return df_out
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment