aleenprd · October 21, 2022 19:22
diff --git a/scrape_reviews_page.py b/scrape_reviews_page.py
 def scrape_reviews_page(reviews_soup: BeautifulSoup) -> pd.DataFrame:
    """Scrape an IMDB reviews page into a DataFrame with one row per review.

    Every ``div`` with class ``lister-item`` is treated as one review box.
    Each field is looked up through ``fetch_el_if_available`` (defined
    elsewhere in this module); the code below relies on it returning either
    a string or ``None``. A field that is not found is stored as ``None`` so
    that all columns keep the same length.

    Args:
        reviews_soup (BeautifulSoup): soup of the entirely loaded reviews page.

    Returns:
        df_out (pd.DataFrame): a Pandas DataFrame with the columns
        ``review_rating`` (float), ``user_name``, ``review_date``,
        ``review_title``, ``review_text``, ``num_helpful_reactions`` (float)
        and ``num_total_reactions`` (float).

    Raises:
        ValueError: if a rating or reaction count cannot be parsed as a number.
        IndexError: if the reactions text has fewer than four
            space-separated tokens.
    """
    # Initialize dataframe columns as empty lists to be populated
    df_out = pd.DataFrame()
    review_ratings = []
    user_names = []
    review_dates = []
    review_titles = []
    review_texts = []
    num_helpful_reactions = []
    num_total_reactions = []

    # Find all review boxes on the page so we can iterate over them
    review_boxes = reviews_soup.find_all('div', {"class": "lister-item"})

    for review in review_boxes:
        # Rating of review: keep only the part before the "/" as a number
        review_rating = fetch_el_if_available(review, "div", "ipl-ratings-bar")
        if review_rating is not None:
            review_rating = float(review_rating.replace("\n", "").split("/")[0])
        review_ratings.append(review_rating)

        # User name: first space-separated token of the name/date element
        user_name_and_date = fetch_el_if_available(review, "div", "display-name-date")
        if user_name_and_date is not None:
            user_name_and_date = user_name_and_date.replace("\n", "").split(" ")
            user_names.append(user_name_and_date[0])
        else:
            user_names.append(None)

        # Review date
        review_date = fetch_el_if_available(review, "span", "review-date")
        if review_date is not None:
            review_date = review_date.replace("\n", "").strip()
        review_dates.append(review_date)

        # Title of review
        review_title = fetch_el_if_available(review, "a", "title")
        if review_title is not None:
            review_title = review_title.replace("\n", "")
        review_titles.append(review_title)

        # Text of review
        review_text = fetch_el_if_available(review, "div", "text")
        # Guard on the text itself, not on the title: a review may have one
        # without the other, and calling .replace on None would raise.
        if review_text is not None:
            review_text = review_text.replace("\n", "")
        review_texts.append(review_text)

        # Review reactions
        # NOTE(review): tokens 0 and 3 are presumably the counts in a phrase
        # like "<helpful> out of <total> found this helpful" - confirm
        # against the live page markup.
        reactions = fetch_el_if_available(review, "div", "actions")
        if reactions is not None:
            reactions = reactions.replace("\n", "").strip().split(" ")
            # Counts may contain thousands separators, e.g. "1,234"
            num_helpful_reactions.append(float(reactions[0].replace(",", "")))
            num_total_reactions.append(float(reactions[3].replace(",", "")))
        else:
            num_helpful_reactions.append(None)
            num_total_reactions.append(None)

    df_out["review_rating"] = review_ratings
    df_out["user_name"] = user_names
    df_out["review_date"] = review_dates
    df_out["review_title"] = review_titles
    df_out["review_text"] = review_texts
    df_out["num_helpful_reactions"] = num_helpful_reactions
    df_out["num_total_reactions"] = num_total_reactions

    return df_out
	def scrape_reviews_page(reviews_soup: BeautifulSoup) -> pd.DataFrame:
	    """Scrape an IMDB reviews page into a DataFrame with one row per review.

	    Every ``div`` with class ``lister-item`` is treated as one review box.
	    Each field is looked up through ``fetch_el_if_available`` (defined
	    elsewhere in this module); the code below relies on it returning either
	    a string or ``None``. A field that is not found is stored as ``None`` so
	    that all columns keep the same length.

	    Args:
	        reviews_soup (BeautifulSoup): soup of the entirely loaded reviews page.

	    Returns:
	        df_out (pd.DataFrame): a Pandas DataFrame with the columns
	        ``review_rating`` (float), ``user_name``, ``review_date``,
	        ``review_title``, ``review_text``, ``num_helpful_reactions`` (float)
	        and ``num_total_reactions`` (float).

	    Raises:
	        ValueError: if a rating or reaction count cannot be parsed as a number.
	        IndexError: if the reactions text has fewer than four
	            space-separated tokens.
	    """
	    # Initialize dataframe columns as empty lists to be populated
	    df_out = pd.DataFrame()
	    review_ratings = []
	    user_names = []
	    review_dates = []
	    review_titles = []
	    review_texts = []
	    num_helpful_reactions = []
	    num_total_reactions = []

	    # Find all review boxes on the page so we can iterate over them
	    review_boxes = reviews_soup.find_all('div', {"class": "lister-item"})

	    for review in review_boxes:
	        # Rating of review: keep only the part before the "/" as a number
	        review_rating = fetch_el_if_available(review, "div", "ipl-ratings-bar")
	        if review_rating is not None:
	            review_rating = float(review_rating.replace("\n", "").split("/")[0])
	        review_ratings.append(review_rating)

	        # User name: first space-separated token of the name/date element
	        user_name_and_date = fetch_el_if_available(review, "div", "display-name-date")
	        if user_name_and_date is not None:
	            user_name_and_date = user_name_and_date.replace("\n", "").split(" ")
	            user_names.append(user_name_and_date[0])
	        else:
	            user_names.append(None)

	        # Review date
	        review_date = fetch_el_if_available(review, "span", "review-date")
	        if review_date is not None:
	            review_date = review_date.replace("\n", "").strip()
	        review_dates.append(review_date)

	        # Title of review
	        review_title = fetch_el_if_available(review, "a", "title")
	        if review_title is not None:
	            review_title = review_title.replace("\n", "")
	        review_titles.append(review_title)

	        # Text of review
	        review_text = fetch_el_if_available(review, "div", "text")
	        # Guard on the text itself, not on the title: a review may have one
	        # without the other, and calling .replace on None would raise.
	        if review_text is not None:
	            review_text = review_text.replace("\n", "")
	        review_texts.append(review_text)

	        # Review reactions
	        # NOTE(review): tokens 0 and 3 are presumably the counts in a phrase
	        # like "<helpful> out of <total> found this helpful" - confirm
	        # against the live page markup.
	        reactions = fetch_el_if_available(review, "div", "actions")
	        if reactions is not None:
	            reactions = reactions.replace("\n", "").strip().split(" ")
	            # Counts may contain thousands separators, e.g. "1,234"
	            num_helpful_reactions.append(float(reactions[0].replace(",", "")))
	            num_total_reactions.append(float(reactions[3].replace(",", "")))
	        else:
	            num_helpful_reactions.append(None)
	            num_total_reactions.append(None)

	    df_out["review_rating"] = review_ratings
	    df_out["user_name"] = user_names
	    df_out["review_date"] = review_dates
	    df_out["review_title"] = review_titles
	    df_out["review_text"] = review_texts
	    df_out["num_helpful_reactions"] = num_helpful_reactions
	    df_out["num_total_reactions"] = num_total_reactions

	    return df_out