Skip to content

Instantly share code, notes, and snippets.

@eduardogpg
Last active November 25, 2023 14:44
Show Gist options
  • Save eduardogpg/85a88210bf51fb86787f2530391981e9 to your computer and use it in GitHub Desktop.
Save eduardogpg/85a88210bf51fb86787f2530391981e9 to your computer and use it in GitHub Desktop.
Una clase del BC de Ciencia de datos.
import csv
import json
import requests
from bs4 import BeautifulSoup
# Spanish for "Upcoming releases"; presumably the heading of the calendar
# page. It is not referenced by the code visible in this file.
TITLE = 'Próximos lanzamientos'
# IMDb release calendar; the query string asks for the MX region.
URL = 'https://www.imdb.com/calendar/?region=MX'
# Work plan for this script (English summary of the Spanish note below):
#   1. Get the HTML markup.
#      - If the HTML file does not exist locally, create it.
#      - If the HTML file exists locally, read its contents.
#   2. Extract the information: name, categories and cast.
#   3. Generate a CSV file.
# NOTE: main() currently calls only the JSON export; its CSV call is
# commented out.
"""
1.- Obtener el maqueto HTML
- Si el archivo HTML no existe de forma local, crearlo.
- Si el archivo HTML existe de forma local, obtener su contenido.
2.- Obtener la información
- Nombre
- Categorias
- Reparto
3.- Generar un archivo CSV
"""
def get_imdb_content():
    """Download the IMDb release-calendar page.

    Returns:
        str: The HTML of the page when the server answers with status 200.
        None: If the request fails (connection error, timeout, ...) or the
            server answers with any other status code.
    """
    headers = {
        # Browser-like User-Agent header sent with the request.
        'User-Agent': 'Mozilla/5.0',
    }

    try:
        # requests never times out on its own; without an explicit timeout a
        # stalled server would block the script forever.
        response = requests.get(URL, headers=headers, timeout=10)
    except requests.RequestException:
        # Network-level failures count as "request was not successful".
        return None

    if response.status_code == 200:
        return response.text

    return None
def create_imdb_file_local(content):
    """Store the markup of the web page in the local file ``imdb.html``.

    Writing is best-effort: I/O and encoding errors are ignored, because the
    file is only a cache and the caller already holds the content in memory.

    Args:
        content: HTML text to store. When it is ``None`` nothing is written,
            so an existing cache file is never truncated by a failed
            download.
    """
    if content is None:
        return

    try:
        # Explicit UTF-8 so the file does not depend on the platform's
        # default encoding; get_imdb_file_local reads it back the same way.
        with open('imdb.html', 'w', encoding='utf-8') as file:
            file.write(content)
    except (OSError, UnicodeError):
        # Deliberately ignored: failing to cache must not abort the script.
        pass


def get_imdb_file_local():
    """Read the contents of the local file ``imdb.html``.

    Returns:
        str: The text stored in the file (may be an empty string).
        None: If the file does not exist, cannot be read, or is not valid
            UTF-8.
    """
    try:
        with open('imdb.html', 'r', encoding='utf-8') as file:
            return file.read()
    except (OSError, UnicodeError):
        # A missing or unreadable cache is reported as "no content".
        return None
def get_local_imdb_content():
    """Return the IMDb page markup, preferring the local copy.

    The local file is read first. Only when it yields no content is the page
    downloaded from the server, and the download is then stored locally.

    Returns:
        The markup as a string, or whatever ``get_imdb_content`` returned
        (``None`` on failure) when no local copy is available.
    """
    content = get_imdb_file_local()
    if content:
        return content

    content = get_imdb_content()
    if content:
        # Only real markup is cached; a failed download must not create or
        # overwrite the local file.
        create_imdb_file_local(content)

    return content
def _inline_list_texts(ul):
    """Return the ``span`` text of every ``li`` in *ul*, or ``[]`` if *ul* is ``None``."""
    if ul is None:
        return []
    return [item.span.text for item in ul.find_all('li')]


def create_movie(tag):
    """Extract the name, categories and cast from one calendar entry.

    Args:
        tag: Element of a single entry; it must offer BeautifulSoup's
            ``find`` API.

    Returns:
        tuple: ``(name, categories, cast)`` where ``name`` is a string and
        the other two are lists of strings. A list is empty when the entry
        has no matching ``<ul>``.

    Raises:
        AttributeError: If the summary container or the title link inside it
            is missing from the entry.
    """
    main_div = tag.find('div', {'class': 'ipc-metadata-list-summary-item__c'})
    if main_div is None:
        # Same exception type the bare attribute access would raise, but with
        # a message that says what is actually missing.
        raise AttributeError(
            'entry has no "ipc-metadata-list-summary-item__c" container'
        )

    name = main_div.div.a.text

    # Both lists share these classes and differ only in the last two.
    list_classes = (
        'ipc-inline-list ipc-inline-list--show-dividers '
        'ipc-inline-list--no-wrap ipc-inline-list--inline '
    )
    ul_categories = main_div.find('ul', {
        'class': list_classes + 'ipc-metadata-list-summary-item__tl base',
    })
    ul_cast = main_div.find('ul', {
        'class': list_classes + 'ipc-metadata-list-summary-item__stl base',
    })

    # Either list may be absent from an entry; treat that as "no items".
    categories = _inline_list_texts(ul_categories)
    cast = _inline_list_texts(ul_cast)

    return (name, categories, cast)
def create_csv_movies_file(movies):
    """Write the movies to ``movies.csv`` using ``-`` as the field delimiter.

    The file starts with the header row ``name-categories-cast``. Categories
    and cast are each collapsed into one comma-separated field.

    Args:
        movies: Iterable of ``(name, categories, cast)`` sequences, where
            ``categories`` and ``cast`` are iterables of strings.
    """
    # newline='' lets the csv module control line endings (otherwise rows end
    # in '\r\r\n' on Windows); UTF-8 keeps non-ASCII titles portable.
    with open('movies.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter="-")
        writer.writerow(['name', 'categories', 'cast'])
        writer.writerows(
            [
                movie[0],            # name
                ",".join(movie[1]),  # categories
                ",".join(movie[2]),  # cast
            ]
            for movie in movies
        )
def create_json_movies_file(movies):
    """Dump the movies to ``movies.json`` as a list of objects.

    Each movie becomes an object with the keys ``name``, ``categories`` and
    ``cast``, taken from positions 0, 1 and 2 of the movie sequence. The
    output is indented with four spaces.

    Args:
        movies: Iterable of ``(name, categories, cast)`` sequences.
    """
    records = []
    for entry in movies:
        record = {
            'name': entry[0],
            'categories': entry[1],
            'cast': entry[2],
        }
        records.append(record)

    with open('movies.json', 'w', encoding='UTF-8') as handle:
        json.dump(records, handle, indent=4)
def main():
    """Scrape the upcoming releases and export them to ``movies.json``.

    The markup comes from the local copy or, failing that, from the server.
    Every matching calendar entry is turned into a
    ``(name, categories, cast)`` tuple.

    Raises:
        SystemExit: If no markup could be obtained, so the script stops with
            a readable message instead of failing inside the HTML parser.
    """
    content = get_local_imdb_content()
    if content is None:
        raise SystemExit('Could not obtain the IMDb calendar page markup.')

    soup = BeautifulSoup(content, 'html.parser')

    # NOTE(review): the trailing 'sc-8c2b7f1f-0 bpqYIE' classes look
    # build-generated; confirm they are stable if this stops matching.
    li_tags = soup.find_all('li', {
        'data-testid': 'coming-soon-entry',
        'class': 'ipc-metadata-list-summary-item ipc-metadata-list-summary-item--click sc-8c2b7f1f-0 bpqYIE'
    })

    movies = [create_movie(tag) for tag in li_tags]

    # Only the JSON export is enabled; create_csv_movies_file(movies) is the
    # CSV alternative.
    create_json_movies_file(movies)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment