Created
October 22, 2019 09:14
-
-
Save sakethramanujam/e6f9fb4329e76ca209719637a0d08377 to your computer and use it in GitHub Desktop.
Python script to scrape details from a Netflix usage log.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import urllib | |
from bs4 import BeautifulSoup | |
import argparse | |
def _parse(): | |
parser = argparse.ArgumentParser() | |
parser.add_argument('-html', type=str, help='path to html file') | |
parser.add_argument('-f', '--filename', type=str, help='name to save the output') | |
return parser.parse_args() | |
def _generate(path: str, outfile: str):
    """Extract device access records from a saved activity page into a CSV.

    The page is expected to contain a ``<ul class="structural retable">``
    whose ``<li>`` items each describe one device: an ``<h3>`` with the
    device name and one ``<div class="activityAccess">`` per access. Each
    access holds a first ``<div>`` with ``"<location> - <ip>"`` text and a
    ``<div class="activityDate">`` with ``"<label>:<date>,<time>"`` text.

    Args:
        path: Filesystem path of the saved HTML file.
        outfile: Path of the CSV file to write.

    Raises:
        OSError: If ``path`` cannot be opened.
        ValueError: If the page does not contain the activity list.
    """
    # Read the file directly (as bytes, so BeautifulSoup still performs its
    # own encoding detection). This avoids relying on ``urllib.request``
    # having been loaded as a side effect of ``import urllib``, works for
    # relative paths, and guarantees the handle is closed.
    with open(path, 'rb') as page:
        soup = BeautifulSoup(page.read(), 'html.parser')

    activity_list = soup.find('ul', attrs={'class': 'structural retable'})
    if activity_list is None:
        raise ValueError(
            "no <ul class='structural retable'> element found in " + repr(path)
        )

    data = []
    for item in activity_list.find_all('li'):
        device = item.find('h3').text
        for access in item.find_all('div', attrs={'class': 'activityAccess'}):
            location_ip = access.find('div').text.split(' - ')
            last_used = access.find(
                'div', attrs={'class': 'activityDate'}
            ).text.split(',')
            data.append({
                'device': device,
                'ip': location_ip[1],
                'location': location_ip[0],
                # Text before the first ':' is the field label; the date
                # is the piece that follows it.
                'date': last_used[0].split(':')[1],
                'time': last_used[1],
            })

    # Naming the columns keeps the header row even when no access was found.
    df = pd.DataFrame(
        data, columns=['device', 'ip', 'location', 'date', 'time']
    )
    df.to_csv(outfile, index=False)
def main():
    """Script entry point: parse the CLI options, echo the output name, run the scrape."""
    cli = _parse()
    target_csv, source_html = cli.filename, cli.html
    # Echo the destination so the user can see where the CSV is going.
    print(target_csv)
    _generate(path=source_html, outfile=target_csv)
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment