Skip to content

Instantly share code, notes, and snippets.

@tonyahowe
Created November 28, 2021 16:27
Show Gist options
  • Save tonyahowe/000e78ff7a846e9bdaca910a13e2009a to your computer and use it in GitHub Desktop.
Save tonyahowe/000e78ff7a846e9bdaca910a13e2009a to your computer and use it in GitHub Desktop.
IMDB_fetcher
import requests
from bs4 import BeautifulSoup
import pandas as pd
def findByTagAndAttrs(soup, ifFailed, *args):
    """Walk a chain of (tag, attrs) pairs with successive .find() calls.

    Each pair in *args is applied to the result of the previous find, so
    ("div", {...}), ("span", {...}) locates a span inside that div.
    Returns the stripped text of the final element, or ifFailed when args
    is empty or any step fails to match.

    BUG FIX: the original loop called soup.find() on every iteration,
    discarding each intermediate result, so only the last pair mattered.
    """
    if not args:
        return ifFailed
    try:
        result = soup
        for tag, attrs in args:
            # If the previous step matched nothing, result is None here and
            # the AttributeError below turns that into ifFailed.
            result = result.find(tag, attrs)
        return result.text.strip()
    except (AttributeError, TypeError):
        return ifFailed
def findByText(soup, attr, text_to_find, next_attr):
    """Find the <attr> element whose text equals text_to_find, then return
    the stripped text of the following <next_attr> element.

    Returns None when either lookup fails to match.
    """
    try:
        return soup.find(attr, text=text_to_find).find_next(next_attr).text.strip()
    except AttributeError:
        # .find() returned None (no match) -> chained call raises.
        return None
def fetch_record_info(id) -> pd.core.series.Series:
    """
    Fetch one IMDb title page (and its plot-summary page) and print the
    scraped fields: countries of origin, storyline, budget, opening
    weekend, production company and synopsis.

    id -- IMDb title id, e.g. "tt0078748".

    NOTE(review): despite the annotation, this prototype only prints the
    values and returns None; the production version below builds the row.
    """
    url = f"https://www.imdb.com/title/{id}/"
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    # Every scrape is guarded: IMDb omits sections for some titles, and an
    # unmatched .find() returns None, which raises AttributeError on the
    # chained call -- the original crashed on any missing section.
    try:
        origin_links = soup.find("li", {"data-testid": "title-details-origin"}).find_all("a")
        countries_of_origin = ", ".join(a.text for a in origin_links)
    except AttributeError:
        countries_of_origin = None
    print(countries_of_origin)
    try:
        storyline = soup.find("div", {"data-testid": "storyline-plot-summary"}).find("div").find("div").text
    except AttributeError:
        storyline = None
    print(storyline)
    try:
        budget = soup.find("li", {"data-testid": "title-boxoffice-budget"}).find("li").find("span").text.strip().split(" ")[0]
    except AttributeError:
        budget = None
    print(budget)
    try:
        opening_weekend = soup.find("li", {"data-testid": "title-boxoffice-openingweekenddomestic"}).find("li").find("span").text
    except AttributeError:
        opening_weekend = None
    print(opening_weekend)
    try:
        production_company = soup.find("li", {"data-testid": "title-details-companies"}).find("li").find("a").text
    except AttributeError:
        production_company = None
    print(production_company)
    # The full synopsis lives on a separate page.
    url = f"https://www.imdb.com/title/{id}/plotsummary"
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    try:
        synopsis = soup.find("ul", {"id": "plot-synopsis-content"}).find("li").text
    except AttributeError:
        synopsis = None
    print(synopsis)
if __name__=="__main__":
    # Smoke-test the scraper with a single IMDb title id.
    fetch_record_info("tt0078748")
# * ************************************
# * version
# * ************************************
__version__ = "1.0.0"
# * ************************************
# * imports
# * ************************************
import os
import sys
import logging
from time import sleep
from numpy import product
# from typing import List
# import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re # re = regular expression
from imdb_data_settings import *
# * ***************************************
# * convenience variables - do not modify
# * ***************************************
# These module-level caches are populated by init() from the settings in
# imdb_data_settings (lower-cased once so matching is case-insensitive).
__source_column_names = []
__output_column_names = []
__titletype_to_include = []
"lower cased titletype to include in search"
__genres_to_include = []
"lower cased genres to include in search"
__genres_to_exclude = []
"lower cased genres to exclude in search"
# * ************************************
# * functions
# * ************************************
# need to add a count function for all films in file with horror in genre and either movie or short in titletype
# after specific titles subsetted, need a count of all these films
def init():
    """
    Initialize the program:
    - lower-case the configured search criteria into the module-level
      convenience variables
    - determine the source and output column names
    - create an empty output file (only when OVERWRITE_OUTPUT_FILE is set
      or no output file exists yet)
    """
    global __source_column_names, __output_column_names, __genres_to_include, __genres_to_exclude, __titletype_to_include
    # Lower-case the search terms once so later matching is case-insensitive.
    # (The original defined a `lowercase` helper twice and map()'d it.)
    __genres_to_include = [genre.lower() for genre in GENRES_TO_INCLUDE]
    __genres_to_exclude = [genre.lower() for genre in GENRES_TO_EXCLUDE]
    __titletype_to_include = [titletype.lower() for titletype in TITLETYPE_TO_INCLUDE]
    # Get the header row from the source file if names were not configured.
    if SOURCE_COLUMN_NAMES and len(SOURCE_COLUMN_NAMES) > 0:
        __source_column_names = SOURCE_COLUMN_NAMES
    else:
        # nrows=0 reads no data rows but still yields the column headers.
        df = pd.read_csv(SOURCE_FILENAME, nrows=0, sep=SOURCE_FILE_DELIMITER)
        __source_column_names = df.columns.values.tolist()
    logging.info("Source file column names: \n%s",
                 __source_column_names)
    __output_column_names = __source_column_names + ADDITIONAL_COLUMNS
    logging.info("Destination file column names: \n%s",
                 __output_column_names)
    if OVERWRITE_OUTPUT_FILE or not os.path.isfile(OUTPUT_FILENAME):
        # Write just the header row; batches are appended to it later.
        output_df = pd.DataFrame(columns=__output_column_names)
        output_df.to_csv(
            OUTPUT_FILENAME, sep=OUTPUT_FILE_DELIMITER, index=False)
def fetch_and_save_batch(start_at) -> bool:
    """
    Fetch and scrape the next BATCH_SIZE records starting at row start_at
    of the source file, appending matching rows to the output file.

    Returns False when the source file has no records left, True otherwise.
    """
    logging.info(f"fetching records {start_at} to {start_at + BATCH_SIZE}")
    # header=None: a mid-file batch has no header row, so the configured
    # column names are supplied explicitly.
    source_df = pd.read_csv(SOURCE_FILENAME, sep=SOURCE_FILE_DELIMITER, header=None,
                            skiprows=start_at, nrows=BATCH_SIZE, names=__source_column_names)
    logging.debug(source_df)
    if source_df.empty:
        return False
    # Collect matching rows in a list and build the DataFrame once at the
    # end: DataFrame.append() is deprecated (removed in pandas 2.0) and
    # re-appending per row is quadratic anyway.
    fetched_rows = []
    for source_row in source_df.iloc:
        if match_genre_search_criteria(
            text=source_row[GENRES_COLUMN_INDEX],
            include_list=__genres_to_include,
            exclude_list=__genres_to_exclude,
            match_all=MATCH_ALL_GENRES
        ) and match_titletype_search_criteria(
            text=source_row[TITLE_TYPE_COLUMN_INDEX],
            include_list=__titletype_to_include,
            match_all=MATCH_ALL_TITLETYPES
        ):
            logging.debug("fetching data for: %s", source_row.to_dict())
            # source_row plus the scraped additional fields
            destination_row = fetch_record_info(source_row)
            logging.debug("fetched: %s", source_row.to_dict())
            fetched_rows.append(destination_row)
            sleep(DELAY_BETWEEN_RECORDS)  # be polite to the website
    output_df = pd.DataFrame(fetched_rows, columns=__output_column_names)
    # mode="a": append this batch below previously saved batches.
    output_df.to_csv(OUTPUT_FILENAME, mode="a",
                     sep=OUTPUT_FILE_DELIMITER, index=False, header=False)
    logging.info("Saving %i records", len(output_df))
    return True
def fetch_record_info(source_row) -> pd.core.series.Series:
    """
    Scrape the IMDb title page (and plot-summary page) for the record in
    source_row and store the scraped values under the ADDITIONAL_COLUMNS
    keys of the row.  Sections missing from the page leave the field
    unset.  Returns the augmented row (also mutated in place).
    """
    # BUG FIX: the original f-string interpolated the *builtin* id()
    # function into the URL; the title id must come from the row itself.
    title_id = source_row[ID_COLUMN_INDEX].strip()

    def _col(output_index):
        # BUG FIX: the *_COLUMN_INDEX settings are positions in the full
        # output row, but ADDITIONAL_COLUMNS holds only the appended
        # columns, so indexing it directly raised IndexError (silently
        # swallowed by the original bare excepts) and no field was ever
        # stored.  STORYLINE_COLUMN_INDEX is the first appended column
        # (matching ADDITIONAL_COLUMNS order in imdb_data_settings).
        return ADDITIONAL_COLUMNS[output_index - STORYLINE_COLUMN_INDEX]

    url = f"https://www.imdb.com/title/{title_id}/"
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    # Each scrape is guarded individually: IMDb omits sections for many
    # titles, and an unmatched .find() returns None, which raises
    # AttributeError on the chained call.
    try:
        origin_links = soup.find("li", {"data-testid": "title-details-origin"}).find_all("a")
        source_row[_col(COUNTRY_COLUMN_INDEX)] = ", ".join(a.text for a in origin_links)
    except AttributeError:
        pass
    try:
        source_row[_col(STORYLINE_COLUMN_INDEX)] = soup.find("div", {"data-testid": "storyline-plot-summary"}).find("div").find("div").text
    except AttributeError:
        pass
    try:
        source_row[_col(BUDGET_COLUMN_INDEX)] = soup.find("li", {"data-testid": "title-boxoffice-budget"}).find("li").find("span").text.strip().split(" ")[0]
    except AttributeError:
        pass
    try:
        source_row[_col(OPENING_WEEKEND_COLUMN_INDEX)] = soup.find("li", {"data-testid": "title-boxoffice-openingweekenddomestic"}).find("li").find("span").text
    except AttributeError:
        pass
    try:
        source_row[_col(PRODUCTION_COMPANY_COLUMN_INDEX)] = soup.find("li", {"data-testid": "title-details-companies"}).find("li").find("a").text
    except AttributeError:
        pass
    try:
        source_row[_col(RATING_COLUMN_INDEX)] = soup.find("div", {"data-testid": "hero-rating-bar__aggregate-rating__score"}).find("span").text
    except AttributeError:
        pass
    try:
        # BUG FIX: the original stored `rating` into the votes column.
        source_row[_col(NUMBER_OF_VOTES_COLUMN_INDEX)] = soup.find("div", {"class": "AggregateRatingButton__TotalRatingAmount-sc-1ll29m0-3"}).text
    except AttributeError:
        pass
    # The full synopsis lives on a separate page.
    url = f"https://www.imdb.com/title/{title_id}/plotsummary"
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    try:
        source_row[_col(SYNOPSIS_COLUMN_INDEX)] = soup.find("ul", {"id": "plot-synopsis-content"}).find("li").text
    except AttributeError:
        pass
    return source_row
# * ************************************
# * main
# * ************************************
if __name__ == "__main__":
    logging.info("**** Welcome to data fetcher ****")
    init()
    start_at = START_AT_INDEX
    # Keep fetching batches until the source file is exhausted
    # (fetch_and_save_batch returns False) or the configured cap is hit.
    while fetch_and_save_batch(start_at):
        start_at += BATCH_SIZE
        # can be removed: optional cap on total records processed
        if SOURCE_RECORDS_TO_FETCH and start_at >= SOURCE_RECORDS_TO_FETCH:
            break
        logging.info(
            f"sleeping between batches for {DELAY_BETWEEN_BATCHES} sec...")
        sleep(DELAY_BETWEEN_BATCHES)
# * ************************************
# * version
# * ************************************
__version__ = "1.0.0"
import logging
import sys
# * ************************************
# * knobs - modify as needed
# * ************************************
DEBUG_LEVEL = logging.DEBUG
"""
print debug messages with level <= DEBUG_LEVEL
logging.DEBUG -> print every thing including python logs (only for trouble shooting)
use logging.debug('my message') for this level
logging.INFO -> print only informational messages (RECOMMENDED)
use logging.info('my message') for this level
logging.NOSET -> disable all messages
"""
logging.basicConfig(stream=sys.stdout, level=DEBUG_LEVEL)
SOURCE_FILENAME = "titles.tsv"
"""
source file with all film id's and genres
"""
SOURCE_FILE_DELIMITER = '\t'
"""
delimiter used in source file - can be '\t' or ','
"""
SOURCE_COLUMN_NAMES = None
"""
set to array of strings represting column names in output file
must correspond one to one with source file columns
MUST BE SET if source file doesnot include a header row
set to None to use the same names as the source file (ONLY if header row exists)
"""
ID_COLUMN_INDEX = 0
"""
index of id column - first column -> index = 0, etc..
"""
TITLE_TYPE_COLUMN_INDEX = 1
TITLE_COLUMN_INDEX = 2
YEAR_COLUMN_INDEX = 5
GENRES_COLUMN_INDEX = 8
"""
index of genres column - first column -> index = 0, etc..
"""
STORYLINE_COLUMN_INDEX = 9
SYNOPSIS_COLUMN_INDEX = 10
COUNTRY_COLUMN_INDEX = 11
BUDGET_COLUMN_INDEX = 12
OPENING_WEEKEND_COLUMN_INDEX = 13
PRODUCTION_COMPANY_COLUMN_INDEX = 14
RATING_COLUMN_INDEX = 15
NUMBER_OF_VOTES_COLUMN_INDEX = 16
TITLETYPE_TO_INCLUDE = ["short", "movie"]
MATCH_ALL_TITLETYPES = False
GENRES_TO_INCLUDE = ["Horror"]
"""
genre(s) to search by - search can be for ALL or for ANY
"""
MATCH_ALL_GENRES = True
"""
set to true to match ALL genres, false to match ANY genre
"""
GENRES_TO_EXCLUDE = []
"""
exclude any films that have ANY of these genre(s)
"""
OUTPUT_FILENAME = "imdb_horror_data_TH.tsv"
"""
file to output result to - existing file will be overwritten
"""
OUTPUT_FILE_DELIMITER = '\t'
"""
delimiter used in output file - can be '\t' or ','
"""
ADDITIONAL_COLUMNS = ["storyline", "synopsis", "countries_of_origin", "budget", "opening_weekend", "production_company", "rating", "number_of_votes"]
"""
additional columns to add to output - update as needed in fetch_record_info()
"""
START_AT_INDEX = 101
"""
set to 1 to start at begining of source file - 0 if file does not have a header row
set to higher value to skip that many records
in case a previous run was aborted, make sure to set INIT_OUTPUT_FILE to False
"""
OVERWRITE_OUTPUT_FILE = False
"""
if true, create new output file and overwrite existing one
if false, file must already exist and data will be appended to it
"""
BATCH_SIZE = 100
"""
number of records to process from source file each batch
"""
SOURCE_RECORDS_TO_FETCH = 100
"""
total number of records to process from source file - set to None to process all
"""
DELAY_BETWEEN_RECORDS = 2
"""
number of seconds to delay between fetching records
"""
DELAY_BETWEEN_BATCHES = 5
"""
number of seconds for additional delay between batches
"""
OUTPUT_RECORDS_TO_PLOT = 150
"""
number of records in output file to process and plot - set to None for all
"""
MATCH_ALL_SEARCH_WORDS = False
"""
set to True to match ALL words - False to match ANY of the words
"""
IGNORE_NUMBERS_IN_SEARCH = True
"""
numbers can cause problems for some plotting packages
"""
SEARCH_WORDS = ['artist', 'author', 'filmmaker', 'musician',
'sculptor', 'painter', 'painting', 'dancer', 'director',
'create', 'creator', 'creative', 'created',
'creates', 'photographer', 'photograph', 'music', 'dance', 'graffiti']
"""
key words to search for in title, summary and synopsis
"""
SEARCH_WORDS_EXCLUDED = []
"""
exclude records that have ANY of these words in title, summary or synopsis
"""
PLOT_HTML_FILE_NAME = 'imdb_horror_data.html'
"""
name html plot file
"""
# * ************************************
# * common functions
# * ************************************
def match_genre_search_criteria(text, include_list, exclude_list=None, match_all=False) -> bool:
    """
    Check whether the genre string matches the search criteria.

    The exclusion list is checked first and always vetoes: if ANY excluded
    word occurs in text, return False.  Otherwise, with match_all=True
    every word in include_list must occur in text; with match_all=False a
    single occurrence suffices.  text is lower-cased before matching; the
    word lists are expected to be lower-cased already.
    """
    # Fix: the original used a mutable default (exclude_list=[]), which is
    # one shared list object across all calls.
    if exclude_list is None:
        exclude_list = []
    text = text.lower()
    # check exclusion list first
    if any(word in text for word in exclude_list):
        return False
    # check inclusion list; all([]) is True / any([]) is False, matching
    # the original's behavior for an empty include_list.
    if match_all:
        return all(word in text for word in include_list)
    return any(word in text for word in include_list)
def match_titletype_search_criteria(text, include_list, match_all=False) -> bool:
    """
    Check whether the title-type string matches the include list: with
    match_all=True every word must occur in text, otherwise any single
    occurrence is enough.  text is lower-cased before matching.
    """
    text = text.lower()
    # all([]) is True / any([]) is False, matching the original's result
    # for an empty include_list.
    if match_all:
        return all(word in text for word in include_list)
    return any(word in text for word in include_list)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment