Created
March 30, 2019 13:40
-
-
Save sergiolucero/304af786375e3ffd80df33171b7c9fe1 to your computer and use it in GitHub Desktop.
television scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os, requests, time | |
| import sqlite3, pandas as pd | |
| from bs4 import BeautifulSoup | |
# Read from the environment; os.getenv yields None when STATIC_FOLDER is unset.
STATIC_FOLDER = os.getenv('STATIC_FOLDER')

# Channel slugs that get_date() interpolates into the mi.tv schedule URL.
# Every slug is listed exactly once: a repeated entry ('tnt' used to appear
# twice) makes the main loop scrape and store that channel's rows twice.
CANALES = [
    'rec-tv', 'canal-13-c', 'mega', 'canal-13', 'tvn', 'chilevision', 'ucv-tv',
    'tnt', 'la-red', 'fox', 'hbo', 'sony', 'axn', 'warner', 'a-e-mundo',
    'universal-channel', 'space', 'fx',
]
# Slug left out of the list above: 'tvn-24h'
def _text_after(node, tag, css_class=None):
    """Return the text of the first `tag` element after `node`, or '' if none.

    Uses BeautifulSoup's ``find_next``, so the match is the next element in
    document order (not necessarily a descendant of `node`).
    """
    attrs = {'class': css_class} if css_class else {}
    found = node.find_next(tag, attrs=attrs)
    # find_next returns None when nothing matches; avoid AttributeError on .text
    return found.text if found is not None else ''


def get_date(channel, date):
    """Download and parse one channel's schedule for one day from mi.tv.

    Args:
        channel: channel slug used in the mi.tv URL (e.g. 'tvn').
        date: day to fetch, as the string placed in the URL (the caller in
            this file passes 'YYYY-MM-DD').

    Returns:
        pandas.DataFrame with columns ['fecha', 'hora', 'programa', 'episodio'],
        one row per ``div.content`` element found in the response. The frame is
        empty (but keeps those columns) when the page has no such elements.

    Raises:
        requests.exceptions.RequestException: on connection errors or when the
            server does not answer within the timeout.
    """
    # NOTE(review): the trailing -180 looks like a UTC offset in minutes
    # (UTC-3); confirm against the mi.tv endpoint.
    url = f'https://mi.tv/cl/async/channel/{channel}/{date}/-180'
    # A timeout keeps a single stalled connection from hanging the whole scrape.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'lxml')

    # One pass over the programme blocks; only the title is whitespace-stripped,
    # matching the stored format of the time and episode fields.
    rows = [
        {
            'fecha': date,
            'hora': _text_after(block, 'span', 'time'),
            'programa': _text_after(block, 'h2').strip(),
            'episodio': _text_after(block, 'span', 'sub-title'),
        }
        for block in soup.find_all('div', attrs={'class': 'content'})
    ]
    return pd.DataFrame(rows, columns=['fecha', 'hora', 'programa', 'episodio'])
| ######################## | |
def channel_scraper(channel, dates):
    """Collect the schedule of one channel over several days.

    Args:
        channel: channel slug forwarded to get_date().
        dates: iterable of date strings, each forwarded to get_date().

    Returns:
        pandas.DataFrame: the per-day frames concatenated in the order of
        `dates`, with an extra 'canal' column holding `channel`.
    """
    daily_frames = []
    for day in dates:
        daily_frames.append(get_date(channel, day))

    schedule = pd.concat(daily_frames)
    schedule['canal'] = channel
    return schedule
| ##### | |
if __name__ == '__main__':
    # Script entry point: scrape every channel in CANALES for the chosen
    # months and store the combined schedule in the 'parrilla' table of
    # television.db (replacing any previous table).
    import calendar  # local import: only the entry point needs it

    YEAR = 2018
    MESES = [10, 11, 12]
    t0 = time.time()
    frames = []       # one DataFrame per (channel, month)
    total_rows = 0    # running row count for the progress line
    for c in CANALES:
        for m in MESES:
            # Use the real month length so dates such as 2018-11-31 are
            # never requested.
            days_in_month = calendar.monthrange(YEAR, m)[1]
            dates = [f'{YEAR}-{m:02d}-{d:02d}'
                     for d in range(1, days_in_month + 1)]
            frame = channel_scraper(c, dates)
            # Collect frames and concatenate once at the end:
            # DataFrame.append was removed in pandas 2.0 and copied the whole
            # accumulated frame on every call.
            frames.append(frame)
            total_rows += len(frame)
            dt = time.time() - t0
            # Progress: channel, month, rows so far, rows per second.
            print(c, m, total_rows, round(total_rows / dt, 2))

    df = pd.concat(frames) if frames else pd.DataFrame()
    conn = sqlite3.connect('television.db')
    try:
        df.to_sql('parrilla', conn, if_exists='replace')
    finally:
        # sqlite3 connections are not closed automatically; release the file.
        conn.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment