Skip to content

Instantly share code, notes, and snippets.

@sleirsgoevy
Last active March 15, 2019 21:30
Show Gist options
  • Save sleirsgoevy/d9a47dfe59ea5126edf09943ac65e361 to your computer and use it in GitHub Desktop.
Save sleirsgoevy/d9a47dfe59ea5126edf09943ac65e361 to your computer and use it in GitHub Desktop.
SBrowse - a minimalistic pure-Python text-mode browser
import html.parser, urllib.parse, urllib.request, pydoc, random
class Parser(html.parser.HTMLParser):
    """Render an HTML document as plain text while collecting its links and form controls.

    The parser keeps several parallel stacks:

    * ``tag_stack``    -- names of the currently open tags.
    * ``repr_stack``   -- one text buffer per open tag (plus the document
      buffer at index 0); a tag's buffer is merged into its parent's buffer
      when the tag is closed.
    * ``table_stack``  -- ``(rows, depth)`` for each open table, or ``None``
      while inside a cell (so nested rows/cells are not attributed to the
      outer table).
    * ``li_stack``     -- ``None`` for a bulleted list, or the running item
      number for a numbered list.
    * ``form_stack``   -- ``(attrs, fields, values)`` for each open form,
      where ``fields`` is a list of ``(name, type, default)`` tuples.
    * ``tarea_stack``  -- per open ``a``/``button``/``textarea``: a falsy
      value when nothing was opened in the text, ``True`` for a link, or
      ``(form, field_index, prefix_len)`` for a form-bound control.
    * ``select_stack`` -- the option list of the open ``select`` (or
      ``None``), with ``True``/``False`` pushed for each open ``option``.

    ``interactive`` lists everything the user can refer to by number: URL
    strings for links/images and ``(form, name)`` tuples for form controls.
    """

    def __init__(self):
        super().__init__()
        self.li_stack = [None]
        self.table_stack = [None]
        self.tag_stack = []
        self.repr_stack = ['']
        self.form_stack = [None]
        self.tarea_stack = []
        self.select_stack = [None]
        self.interactive = []

    def interact(self, x):
        """Register *x* as an interactive element and return its index."""
        self.interactive.append(x)
        return len(self.interactive) - 1

    def handle_starttag(self, tag, attrs):
        """Open *tag*: push its buffers and emit any leading text for it."""
        self.tag_stack.append(tag)
        self.repr_stack.append('')
        attrs = dict(attrs)
        form = self.form_stack[-1]
        if tag == 'table':
            self.table_stack.append(([], 0))
        elif tag in ('tr', 'td', 'th') and self.table_stack[-1] is not None:
            rows, depth = self.table_stack[-1]
            depth += 1
            self.table_stack[-1] = (rows, depth)
            # Depth 1 means a row is starting (either a <tr>, or a cell that
            # appears without an enclosing <tr>).
            if depth == 1:
                rows.append([])
        elif tag == 'ul':
            self.li_stack.append(None)
        elif tag == 'ol':
            self.li_stack.append(0)
        elif tag == 'form':
            self.form_stack.append((attrs, [], {}))
        elif tag == 'input':
            tp = attrs.get('type', 'text')
            val = attrs.get('value', '')
            # A submit button is usable even without a name; it is stored
            # under the empty name.
            name = attrs.get('name', None if tp != 'submit' else '')
            if name is not None and form is not None:
                form[1].append((name, tp, val))
            # NOTE(review): the original indentation was lost; every
            # non-hidden input is shown, including ones outside a form.
            if tp != 'hidden':
                self.repr_stack[-1] += '[%d:%s:%s=%s] ' % (self.interact((form, name)), tp, name, val)
        elif tag == 'button' and attrs.get('type', None) != 'submit':
            self.tarea_stack.append(None)
        elif tag == 'textarea' or (tag == 'button' and attrs.get('type', None) == 'submit'):
            tp = 'textarea' if tag == 'textarea' else 'submit'
            name = attrs.get('name', None if tag == 'textarea' else '')
            value = attrs.get('value', '')
            if name is not None and form is not None:
                form[1].append((name, tp, value))
                label = name + '=' + value if tag == 'button' else name
                prefix = '[%d:%s:%s:' % (self.interact((form, name)), tag, label)
                self.repr_stack[-1] += prefix
                # Remember how long the display prefix is so that the
                # textarea's content can later be separated from it.
                self.tarea_stack.append((form, len(form[1]) - 1, len(prefix)))
            else:
                self.tarea_stack.append(False)
        elif tag == 'li':
            if self.li_stack[-1] is None:
                self.repr_stack[-1] += '\n * '
            else:
                self.li_stack[-1] += 1
                self.repr_stack[-1] += '\n %d. ' % self.li_stack[-1]
        elif tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'pre', 'div', 'br'):
            self.repr_stack[-1] += '\n'
            if tag[0] == 'h':
                self.repr_stack[-1] += '#' * int(tag[1:]) + ' '
        elif tag == 'a':
            if 'href' not in attrs:
                self.tarea_stack.append(False)
            else:
                self.repr_stack[-1] += '[%d: ' % self.interact(attrs['href'])
                self.tarea_stack.append(True)
        elif tag == 'select':
            name = attrs.get('name', None)
            if form is not None and name is not None:
                options = []
                form[1].append((name, 'select', options))
                self.repr_stack[-1] += '[%d:select:%s:' % (self.interact((form, name)), name)
                self.select_stack.append(options)
            else:
                self.select_stack.append(None)
        elif tag == 'option':
            value = attrs.get('value', None)
            # Only an option directly inside a registered <select> (whose
            # stack entry is the option list) is recorded.
            if isinstance(self.select_stack[-1], list) and value is not None:
                self.select_stack[-1].append((value, None, 'selected' in attrs))
                self.select_stack.append(True)
            else:
                self.select_stack.append(False)
        elif tag == 'img':
            src = attrs.get('src', None)
            alt = attrs.get('alt', src)
            if src is not None:
                self.repr_stack[-1] += '[%d: %s] ' % (self.interact(src), alt)
        # Inside a cell the enclosing table is hidden so that stray
        # tr/td/th tags are not counted against it.
        if tag in ('td', 'th'):
            self.table_stack.append(None)

    def handle_data(self, data):
        """Append text to the buffer of the innermost open tag."""
        self.repr_stack[-1] += data

    def handle_endtag(self, tag):
        """Close *tag* and every tag opened after it; ignore unmatched end tags."""
        if tag in self.tag_stack:
            i = len(self.tag_stack) - 1
            while self.tag_stack[i] != tag:
                i -= 1
            while len(self.tag_stack) > i:
                self.do_handle_endtag(self.tag_stack.pop())

    def close(self):
        """Finish parsing and close every tag that is still open."""
        html.parser.HTMLParser.close(self)
        while self.tag_stack:
            self.do_handle_endtag(self.tag_stack.pop())

    @staticmethod
    def _render_table(rows):
        """Draw *rows* (a list of lists of cell strings) as an ASCII-art grid."""
        # default=0 keeps a table without any rows from crashing max().
        ncols = max(map(len, rows), default=0)
        nrows = len(rows)
        # The extra trailing zero-sized entry produces the closing border.
        col_width = [0] * (ncols + 1)
        row_height = [0] * (nrows + 1)
        for y, row in enumerate(rows):
            for x, cell in enumerate(row):
                lines = cell.split('\n')
                col_width[x] = max(col_width[x], *map(len, lines))
                row_height[y] = max(row_height[y], len(lines))
        width = sum(col_width) + ncols + 1
        height = sum(row_height) + nrows + 1
        grid = [[' '] * width for _ in range(height)]
        col_offset = []
        offset = 0
        for w in col_width:
            for line in grid:
                line[offset] = '|'
            col_offset.append(offset + 1)
            offset += w + 1
        row_offset = []
        offset = 0
        for h in row_height:
            border = grid[offset]
            for j, ch in enumerate(border):
                border[j] = '+' if ch == '|' else '-'
            row_offset.append(offset + 1)
            offset += h + 1
        for y, row in enumerate(rows):
            for x, cell in enumerate(row):
                for dy, text in enumerate(cell.split('\n')):
                    for dx, ch in enumerate(text):
                        grid[row_offset[y] + dy][col_offset[x] + dx] = ch
        return '\n'.join(map(''.join, grid))

    def do_handle_endtag(self, tag):
        """Close one tag: finalize its text and merge it into the parent buffer."""
        if tag in ('td', 'th'):
            # Drop the None pushed when the cell was opened.
            self.table_stack.pop()
        if tag in ('tr', 'td', 'th') and self.table_stack[-1] is not None:
            rows, depth = self.table_stack[-1]
            if tag != 'tr':
                # The cell text goes into the table data, not the page text.
                rows[-1].append(cut_newlines(self.repr_stack[-1]))
                self.repr_stack[-1] = ''
            self.table_stack[-1] = (rows, depth - 1)
        elif tag == 'table':
            rows = self.table_stack.pop()[0]
            self.repr_stack[-1] += '\n' + self._render_table(rows) + '\n'
        elif tag in ('ul', 'ol'):
            self.li_stack.pop()
        elif tag == 'form':
            self.form_stack.pop()
        elif tag in ('button', 'textarea', 'a'):
            entry = self.tarea_stack.pop()
            if entry:
                if tag == 'textarea':
                    # The default value is the textarea's content only,
                    # without the '[N:textarea:name:' display prefix.
                    owner, index, prefix_len = entry
                    content = self.repr_stack[-1][prefix_len:]
                    owner[1][index] = owner[1][index][:2] + (content,)
                self.repr_stack[-1] += ']'
        elif tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li', 'pre', 'div'):
            self.repr_stack[-1] += '\n'
        elif tag == 'option':
            if self.select_stack.pop():
                value, _, selected = self.select_stack[-1][-1]
                self.select_stack[-1][-1] = (value, self.repr_stack[-1].strip(), selected)
            self.repr_stack[-1] += '|'
        elif tag == 'select':
            # Compare against None: a registered select with no recorded
            # options has an empty (falsy) list but still needs closing.
            if self.select_stack.pop() is not None:
                text = self.repr_stack[-1].rstrip()
                if text.endswith('|'):
                    text = text[:-1]
                self.repr_stack[-1] = text + ']'
        elif tag in ('style', 'script'):
            self.repr_stack[-1] = ''
        # Merge this tag's buffer into its parent's buffer.
        finished = self.repr_stack.pop()
        self.repr_stack[-1] += finished
def cut_newlines(x):
    """Strip trailing whitespace from each line and collapse runs of blank lines.

    At most two consecutive empty lines are kept anywhere in the result.
    """
    kept = []
    for raw in x.split('\n'):
        kept.append(raw.rstrip())
        # Once the last three lines are all empty, drop the newest one.
        if len(kept) >= 3 and not (kept[-1] or kept[-2] or kept[-3]):
            kept.pop()
    return '\n'.join(kept)
def process_html(html):
    """Parse an HTML string and return ``(text, interactive)``.

    ``text`` is the plain-text rendering with blank lines collapsed and
    ``interactive`` is the parser's list of links and form controls.
    """
    parser = Parser()
    parser.feed(html)
    parser.close()
    rendered = cut_newlines(parser.repr_stack[0])
    return (rendered, parser.interactive)
def _charset_label(text):
    """Return the encoding label that follows the first ``charset=`` in *text*.

    Leading quotes are skipped and the label ends at the first character
    that cannot be part of an encoding name, so ``charset="utf-8">`` and
    ``charset=utf-8;`` both yield ``utf-8``. May return an empty string.
    """
    tail = text.split('charset=', 1)[1].lstrip('"\';')
    end = 0
    while end < len(tail) and (tail[end].isalnum() or tail[end] in '-_.:+'):
        end += 1
    return tail[:end]


def load_page(req):
    """Read an HTTP response and return ``(final_url, (text, interactive))``.

    The encoding is taken from the Content-Type header when it names a
    charset; otherwise the body is searched for a ``charset=`` declaration.
    A missing, empty or unknown charset falls back to latin-1 instead of
    raising, and undecodable bytes are replaced.
    """
    data = req.read()
    # Whitespace is removed and case folded so that 'Charset = UTF-8' matches.
    content_type = ''.join(req.getheader('Content-Type', '').split()).lower()
    if 'charset=' in content_type:
        charset = _charset_label(content_type)
    else:
        body = ''.join(data.decode('latin-1').split())
        charset = _charset_label(body) if 'charset=' in body else ''
    try:
        text = data.decode(charset or 'latin-1', 'replace')
    except LookupError:
        # Unknown or non-text codec name: latin-1 can decode any byte string.
        text = data.decode('latin-1', 'replace')
    return (req.geturl(), process_html(text))
def _link_target(base_url, interactive, index_text):
    """Resolve the numbered link *index_text* of the current page to an absolute URL."""
    target = interactive[int(index_text)]
    if not isinstance(target, str):
        raise ValueError("Not a link")
    return urllib.parse.urljoin(base_url, target)


def _show_response(browsing_stack, response):
    """Parse *response*, push it onto the history and display it in the pager."""
    browsing_stack.append(load_page(response))
    pydoc.pager(browsing_stack[-1][1][0])


def browse_loop():
    """Run the interactive command prompt until end of input.

    Commands: ``show``/``page``, ``go N``, ``goto URL``, ``back``,
    ``history``, ``use N``, ``reload``, ``editor sys``/``editor py`` and
    ``download [N]``. Errors raised while executing a command are printed
    and the prompt continues.
    """
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor)
    # The attribute is ``addheaders``; any other name is silently ignored
    # and the custom User-Agent would never be sent.
    opener.addheaders = [('user-agent', 'w3m/0.5.2')]
    browsing_stack = [('about:blank', ('', []))]
    syseditor = True
    while True:
        base_url, (html, interactive) = browsing_stack[-1]
        base_url = base_url.strip()
        try:
            cmd = input('> ')
            if cmd in ('show', 'page'):
                pydoc.pager(html)
            elif cmd.startswith('go '):
                link = _link_target(base_url, interactive, cmd[3:])
                _show_response(browsing_stack, opener.open(link))
            elif cmd == 'back':
                if len(browsing_stack) < 2:
                    raise ValueError("Nowhere to go")
                browsing_stack.pop()
            elif cmd.startswith('goto '):
                link = urllib.parse.urljoin(base_url, cmd[5:])
                _show_response(browsing_stack, opener.open(link))
            elif cmd == 'history':
                for entry in reversed(browsing_stack):
                    print(entry[0])
            elif cmd.startswith('use '):
                lid = int(cmd[4:])
                if isinstance(interactive[lid], str):
                    raise ValueError("Not a control")
                form, name = interactive[lid]
                if form is None:
                    raise ValueError("Control is not part of a form")
                for n, t, v in form[1]:
                    if n == name:
                        break
                else:
                    # Without this the loop would fall through and use
                    # whichever field happened to be last.
                    raise ValueError("Control is not registered in its form")
                form[2][n] = ask_for_value(n, t, v, syseditor=syseditor)
                if t == 'submit':
                    _, response = submit_form(opener, form, base_url)
                    _show_response(browsing_stack, response)
            elif cmd == 'reload':
                _show_response(browsing_stack, opener.open(base_url))
            elif cmd == 'editor sys':
                syseditor = True
            elif cmd == 'editor py':
                syseditor = False
            elif (cmd + ' ').startswith('download '):
                if cmd == 'download':
                    link = base_url
                else:
                    link = _link_target(base_url, interactive, cmd[9:])
                savepath = input('Save to: ')
                with opener.open(link) as file:
                    with open(savepath, 'wb') as wfile:
                        wfile.write(file.read())
            else:
                raise ValueError("Unknown cmd")
        except EOFError:
            break
        # Ctrl+C aborts only the current command; SystemExit still propagates.
        except (Exception, KeyboardInterrupt) as e:
            print(e)
class StringPayload:
    """A plain text form value: field name ``n`` and string value ``s``."""

    def __init__(self, n, s):
        self.n, self.s = n, s

    def __str__(self):
        return self.s

    def get_payload(self):
        """Return this field as one UTF-8 encoded multipart/form-data part."""
        part = 'Content-Disposition: form-data; name="%s"\r\n\r\n%s' % (self.n, str(self))
        return part.encode('utf-8')
class FilePayload:
    """An uploaded file: field name ``n``, file name ``f`` and raw bytes ``d``."""

    def __init__(self, n, f, d):
        self.n, self.f, self.d = n, f, d

    def __str__(self):
        return self.f

    def get_payload(self):
        """Return this file as one multipart/form-data part (header plus raw bytes)."""
        header = 'Content-Disposition: form-data; name="%s"; filename="%s"\r\n\r\n' % (self.n, self.f)
        return header.encode('utf-8') + self.d
def ask_for_value(name, tp, default, syseditor=True):
    """Ask the user for the value of one form control and return it as a payload.

    name      -- the control's name.
    tp        -- the control type ('radio', 'checkbox', 'submit', 'textarea',
                 'select', 'file', 'password' or anything else for free text).
    default   -- the control's default; for 'select' this is the list of
                 ``(value, label, selected)`` options.
    syseditor -- for textareas, launch the system editor on a temporary file
                 instead of reading lines from the prompt.

    Returns a StringPayload, or a FilePayload for file uploads.
    """
    if tp in ('radio', 'checkbox', 'submit'):
        print('Set', name, 'to', default)
        return StringPayload(name, default)
    elif tp == 'textarea':
        import os
        import shlex
        import sys
        import tempfile
        if syseditor:
            with tempfile.NamedTemporaryFile() as f:
                if sys.platform == 'win32':
                    os.system('edit ' + shlex.quote(f.name))
                else:
                    os.system('editor ' + shlex.quote(f.name))
                return StringPayload(name, f.read().decode())
        else:
            print('Press Ctrl+D or Ctrl+C to exit.')
            lines = []
            while True:
                try:
                    lines.append(input() + '\n')
                except EOFError:
                    return StringPayload(name, ''.join(lines))
                except KeyboardInterrupt:
                    if input('Save? (y/N) ').strip().lower() == 'y':
                        return StringPayload(name, ''.join(lines))
                    else:
                        return StringPayload(name, default)
    elif tp == 'select':
        print('Choose an option:')
        dflt = ('', 'None')
        for i, (value, label, selected) in enumerate(default):
            if selected:
                dflt = (value, label)
            print('%d. %s' % (i + 1, label))
        ans = input('Choose an option [%s]: ' % dflt[1])
        try:
            choice = int(ans)
        except ValueError:
            choice = 0
        # Only 1..len(default) are valid; 0 or a negative number must not
        # wrap around to an option counted from the end of the list.
        if 1 <= choice <= len(default):
            return StringPayload(name, default[choice - 1][0])
        return StringPayload(name, dflt[0])
    elif tp == 'file':
        import os.path
        while True:
            try:
                fn = input('File name: ')
            except KeyboardInterrupt:
                return StringPayload(name, default)
            # isfile rather than exists: a directory cannot be uploaded.
            if not os.path.isfile(fn):
                print('File not found')
            else:
                break
        with open(fn, 'rb') as file:
            return FilePayload(name, os.path.split(fn)[1], file.read())
    elif tp == 'password':
        import getpass
        return StringPayload(name, getpass.getpass())
    else:
        return StringPayload(name, input('Value [%s]: ' % default) or default)
def submit_form(opener, form, base_url):
    """Submit *form* through *opener* and return ``(url, response)``.

    form     -- ``(attrs, fields, values)``: the <form> attributes, the list
                of ``(name, type, default)`` fields, and the dict of values
                chosen so far (name -> payload). ``values`` is updated in
                place: the unnamed entry is dropped and hidden fields are
                filled in.
    base_url -- URL of the page containing the form; a relative or missing
                ``action`` is resolved against it.

    POST forms are sent with the form's enctype (urlencoded by default);
    every other method is sent as GET with the data as the query string.
    """
    attrs, fields, values = form
    # An unnamed submit button is stored under '' and must not be sent.
    values.pop('', None)
    for field_name, field_type, field_value in fields:
        if field_type == 'hidden':
            values[field_name] = StringPayload(field_name, field_value)
    # 'or' also covers attributes present without a value (parsed as None).
    is_post = (attrs.get('method') or 'get').lower() == 'post'
    enctype = attrs.get('enctype') if is_post else None
    # A form without an action submits to the page's own URL.
    url = urllib.parse.urljoin(base_url, attrs.get('action') or '')
    if enctype == 'multipart/form-data':
        parts = [v.get_payload() for v in values.values()]
        delim = b''
        # Pick a boundary that is non-empty and occurs in none of the parts.
        while not delim or any(delim in part for part in parts):
            delim = ('--------%d' % random.randrange(1000000000000000)).encode('ascii')
        data = b'--' + delim + b'\r\n' + (b'\r\n--' + delim + b'\r\n').join(parts) + b'\r\n--' + delim + b'--\r\n'
        enctype += '; boundary=' + delim.decode('ascii')
    elif enctype == 'text/plain':
        data = urllib.parse.urlencode(values).replace('+', '%2B').replace('%20', '+').encode('ascii')
    else:
        data = urllib.parse.urlencode(values).encode('ascii')
        enctype = 'application/x-www-form-urlencoded'
    if not is_post:
        # The form data replaces any query already present in the action URL
        # instead of being appended after a second '?'.
        target = urllib.parse.urlsplit(url)._replace(query=data.decode('ascii'))
        url = urllib.parse.urlunsplit(target)
        return (url, opener.open(url))
    return (url, opener.open(urllib.request.Request(url, data, {'Content-Type': enctype})))
if __name__ == '__main__':
    # readline is imported only for its side effect on input(); it is
    # optional, so its absence is ignored.
    try:
        import readline
    except ImportError:
        pass
    browse_loop()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment