Skip to content

Instantly share code, notes, and snippets.

@justdoit0823
Created February 6, 2020 05:15
Show Gist options
  • Save justdoit0823/9df8d50b094eb62de55e7c68bd983cd6 to your computer and use it in GitHub Desktop.
Save justdoit0823/9df8d50b094eb62de55e7c68bd983cd6 to your computer and use it in GitHub Desktop.
Collect Spark job durations from the YARN application web page.
import bs4
import requests
def parse_ts(s):
    """Convert a duration string such as ``'1.5 min'`` to milliseconds.

    The text must be a number followed by whitespace and a unit; anything
    after the unit is ignored.

    Args:
        s: Duration text, e.g. ``'12 ms'``, ``'3 s'``, ``'1.5 min'`` or
            ``'2 h'``. Surrounding whitespace is ignored.

    Returns:
        The duration in milliseconds as a float.

    Raises:
        ValueError: If the text has no value/unit pair, the value is not a
            number, or the unit is not one of ``ms``, ``s``, ``min``, ``h``.
    """
    ms_per_unit = {
        'ms': 1,
        's': 1000,
        'min': 1000 * 60,
        'h': 1000 * 60 * 60,
    }

    # split() without a separator collapses runs of any whitespace, so
    # '3  s' or a tab-separated pair no longer yields an empty unit.
    parts = s.split()
    if len(parts) < 2:
        raise ValueError(f'cannot parse duration {s!r}')

    v, unit = parts[:2]
    v = float(v)
    if unit not in ms_per_unit:
        raise ValueError(f'unknown time unit {unit}')
    return v * ms_per_unit[unit]
def get_page_duration(url, i, timeout=30):
    """Sum the durations listed on one page of the completed-jobs table.

    Fetches ``url`` with the query parameter ``completedJob.page`` set to
    ``i``, finds the table whose id is ``completedJob-table`` and adds up
    the fourth cell of every data row, parsed with ``parse_ts``.

    Args:
        url: Address of the page that contains the table.
        i: Page number sent as ``completedJob.page``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Total duration in milliseconds; ``0`` when the table has no data
        rows.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
        ValueError: If the page has no ``completedJob-table`` or a duration
            cell cannot be parsed.
    """
    response = requests.get(
        url, params={'completedJob.page': i}, timeout=timeout)
    # Fail on HTTP errors here instead of parsing an error page below.
    response.raise_for_status()

    # Name the parser explicitly: without it BeautifulSoup warns and picks
    # whichever parser happens to be installed.
    sp = bs4.BeautifulSoup(response.text, 'html.parser')
    table = sp.find('table', attrs={'id': 'completedJob-table'})
    if table is None:
        raise ValueError(f'no completedJob-table found at {url} (page {i})')

    duration_column = 3  # presumably the "Duration" column -- verify against the page
    total = 0
    for tr in table.find_all('tr'):
        cells = tr.find_all('td')
        # Header rows use <th>, so they have no <td> cells to index.
        if len(cells) <= duration_column:
            continue
        total += parse_ts(cells[duration_column].text)
    return total
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment