This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from urllib import request | |
from bs4 import BeautifulSoup | |
from fake_useragent import UserAgent | |
from typing import Union | |
from time import sleep | |
class WorldPostCodeScraper: | |
"""Scraper class for https://worldpostalcode.com/.""" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Python executable which scrapes IMDB for reviews.""" | |
import argparse | |
import pandas as pd | |
from time import sleep | |
from tqdm import tqdm | |
from dependencies.general import timing | |
from dependencies.scrapers import ImdbReviewScraper |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class ImdbReviewScraper(Scraper): | |
"""Implements methods for scraping IMDB. | |
Inherited Attributes: | |
chromedriver (chromedriver): a Chrome webdriver for Selenium. | |
Own Methods: | |
@staticmethod get_ratings_page | |
@staticmethod get_reviews_page | |
get_episodes_links |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class ImdbReviewScraper(Scraper): | |
"""Implements methods for scraping IMDB. | |
Inherited Attributes: | |
chromedriver (chromedriver): a Chrome webdriver for Selenium. | |
Own Methods: | |
@staticmethod get_ratings_page | |
@staticmethod get_reviews_page | |
get_episodes_links |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class ScraperException(Exception):
    """Starting point for Scraper exceptions.

    More specific scraper errors derive from this class, so catching
    ``ScraperException`` also catches every one of its subclasses.
    """
class ImdbScraperException(ScraperException):
    """Starting point for IMDB scraper exceptions.

    Derives from ``ScraperException``, so handlers written for the generic
    scraper error also catch this one.
    """
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Data manipulation | |
import pandas as pd | |
import re as regex | |
# Scraping | |
from bs4 import BeautifulSoup | |
from selenium import webdriver | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.chrome.service import Service | |
from webdriver_manager.chrome import ChromeDriverManager |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@timing | |
def main(season_link: str, show_link: str, driver_service: Service, output_path: str) -> None: | |
"""Main function to scrape an IMDB season's reviews for each episode and also the general reviews. | |
Args: | |
season_link (str): URL pointing to season page. | |
show_link (str): URL pointing to show general reviews. | |
driver_service (Service): a Chrome web driver. | |
output_path (str): path including filename where we want to save the CSV. | |
""" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def scrape_reviews_page(reviews_soup: BeautifulSoup) -> pd.DataFrame: | |
"""Scrape IMDB reviews page. | |
Note: Extracts ratings, usernames, review date, titles, review body text, | |
number of reactions, total reactions to review. | |
Args: | |
reviews_soup (BeautifulSoup): soup of the entirely loaded reviews page. | |
Returns: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import Union | |
def fetch_el_if_available(soup: BeautifulSoup, element_type: str, class_type: str) -> Union[str, None]: | |
"""Returns element text if found, otherwise returns None. | |
Args: | |
soup (BeautifulSoup): a b24 soup. | |
element_type (str): HTML type e.g. 'div'. | |
class_type (str): the class of the desired element. | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_ratings_page(episode_page: str, suffix: str = "/ratings/?ref_=tt_ov_rt") -> str:
    """Build a ratings page URL from an episode page URL.

    Everything from the last "/" of ``episode_page`` onwards is discarded and
    ``suffix`` is appended to what remains.

    Args:
        episode_page (str): URL of the episode page; the text following its
            last "/" (and that "/" itself) is dropped.
        suffix (str): path and query string appended to the trimmed URL.

    Returns:
        str: the trimmed URL followed by ``suffix``.
    """
    # rpartition yields the text before the last "/"; it is an empty string
    # when the input contains no "/" at all.
    base_url, _, _ = episode_page.rpartition("/")
    return base_url + suffix
def get_reviews_page(episode_page: str, suffix: str = "/reviews?ref_=tt_urv") -> str:
    """Build a reviews page URL from an episode page URL.

    Everything from the last "/" of ``episode_page`` onwards is discarded and
    ``suffix`` is appended to what remains.

    Args:
        episode_page (str): URL of the episode page; the text following its
            last "/" (and that "/" itself) is dropped.
        suffix (str): path and query string appended to the trimmed URL.

    Returns:
        str: the trimmed URL followed by ``suffix``.
    """
    # rpartition yields the text before the last "/"; it is an empty string
    # when the input contains no "/" at all.
    base_url, _, _ = episode_page.rpartition("/")
    return base_url + suffix
NewerOlder