Skip to content

Instantly share code, notes, and snippets.

@sakethramanujam
Created October 22, 2019 09:14
Show Gist options
  • Save sakethramanujam/e6f9fb4329e76ca209719637a0d08377 to your computer and use it in GitHub Desktop.
Save sakethramanujam/e6f9fb4329e76ca209719637a0d08377 to your computer and use it in GitHub Desktop.
Python script to scrape device-activity details from a Netflix usage log.
import pandas as pd
import urllib
from bs4 import BeautifulSoup
import argparse
def _parse():
    """Parse the command-line arguments of the script.

    Both options are mandatory: without ``-html`` there is nothing to read,
    and without ``-f/--filename`` the extracted rows would not be saved
    anywhere. Marking them as required lets argparse print a usage message
    and exit instead of failing later with an unrelated error.

    Returns:
        argparse.Namespace: with attributes ``html`` (path of the HTML file)
        and ``filename`` (name of the output file).
    """
    parser = argparse.ArgumentParser(
        description='Scrape details from a saved usage log page into a CSV file.',
    )
    parser.add_argument(
        '-html', type=str, required=True, help='path to html file',
    )
    parser.add_argument(
        '-f', '--filename', type=str, required=True,
        help='name to save the output',
    )
    return parser.parse_args()
def _generate(path: str, outfile: str):
    """Extract the device-activity entries of a saved HTML page into a CSV.

    The page is expected to contain a ``<ul class="structural retable">``
    whose ``<li>`` items each carry an ``<h3>`` (used as the device name)
    and any number of ``<div class="activityAccess">`` entries. Every such
    entry becomes one row with the columns ``device``, ``ip``,
    ``location``, ``date`` and ``time``.

    Args:
        path: Filesystem path (absolute or relative) of the HTML file.
        outfile: Destination handed to ``DataFrame.to_csv``.

    Raises:
        OSError: If ``path`` cannot be opened.
        ValueError: If the page has no ``ul`` with class
            ``structural retable``.
    """
    # Read the file directly instead of going through a hand-built
    # 'file:///' URL: that URL resolved relative paths against the
    # filesystem root, relied on ``urllib.request`` having been imported
    # by some other module, and never closed the handle. Bytes are passed
    # on so BeautifulSoup still detects the encoding itself.
    with open(path, 'rb') as handle:
        soup = BeautifulSoup(handle.read(), 'html.parser')

    listing = soup.find('ul', attrs={'class': 'structural retable'})
    if listing is None:
        raise ValueError(
            "no <ul class='structural retable'> element found in " + repr(path)
        )

    columns = ['device', 'ip', 'location', 'date', 'time']
    rows = []
    for item in listing.find_all('li'):
        device = item.find('h3').text
        for access in item.find_all('div', attrs={'class': 'activityAccess'}):
            # The first nested div holds "<location> - <ip>".
            location_ip = access.find('div').text.split(' - ')
            # The activityDate text is split on ','; the date is whatever
            # follows the first ':' of the first part, the time is the
            # second part (presumably "<label>: <date>, <time>" - confirm
            # against a real page).
            last_used = access.find(
                'div', attrs={'class': 'activityDate'}
            ).text.split(',')
            rows.append({
                'device': device,
                'ip': location_ip[1],
                'location': location_ip[0],
                'date': last_used[0].split(':')[1],
                'time': last_used[1],
            })

    # Fixing the column order keeps the layout identical to the row dicts
    # and still writes a header line when no entries were found.
    frame = pd.DataFrame(rows, columns=columns)
    frame.to_csv(outfile, index=False)
def main():
    """Run the script: read the CLI options, echo the output name, convert.

    The output file name is printed before the conversion starts, then the
    HTML file given on the command line is converted by ``_generate``.
    """
    cli = _parse()
    destination, source = cli.filename, cli.html
    print(destination)
    _generate(path=source, outfile=destination)
# Only run the conversion when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment