Last active
March 15, 2019 21:30
-
-
Save sleirsgoevy/d9a47dfe59ea5126edf09943ac65e361 to your computer and use it in GitHub Desktop.
SBrowse - a minimalistic pure-Python text-mode browser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import html.parser, urllib.parse, urllib.request, pydoc, random | |
class Parser(html.parser.HTMLParser):
    """Render HTML as plain text while collecting links and form controls.

    Every open element gets its own text buffer on ``repr_stack``; when the
    element is closed its buffer is post-processed and concatenated onto the
    parent's buffer, so ``repr_stack[0]`` holds the rendered page once
    ``close()`` has run.

    Attributes:
        tag_stack: names of the currently open elements, outermost first.
        repr_stack: one text buffer per open element plus the page buffer.
        li_stack: per open list, ``None`` for ``<ul>`` or the running item
            number for ``<ol>``.
        table_stack: per open table, ``(rows, depth)``; ``None`` entries mark
            "not directly inside a table" (page level, or inside a cell).
        form_stack: per open form, ``(attrs, fields, values)`` where fields
            is a list of ``(name, type, value)`` tuples and values is the
            dict later filled with the user's choices.
        tarea_stack: one entry per open ``<a>``, ``<button>`` or
            ``<textarea>``; truthy when a closing ``]`` must be written.
        select_stack: option list of the open ``<select>`` (or ``None``),
            with ``True``/``False`` pushed on top for each open ``<option>``.
        interactive: numbered targets shown as ``[N: ...]`` in the text;
            a URL string for links and images, ``(form, name)`` for controls.
    """

    _BLOCK_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'pre', 'div')

    def __init__(self):
        super().__init__()
        self.li_stack = [None]
        self.table_stack = [None]
        self.tag_stack = []
        self.repr_stack = ['']
        self.form_stack = [None]
        self.tarea_stack = []
        self.select_stack = [None]
        self.interactive = []

    def interact(self, x):
        """Register *x* as an interactive target and return its number."""
        self.interactive.append(x)
        return len(self.interactive) - 1

    def handle_starttag(self, tag, attrs):
        """Open *tag*: push a fresh text buffer and update element state."""
        self.tag_stack.append(tag)
        self.repr_stack.append('')
        attrs = dict(attrs)
        form = self.form_stack[-1]
        if tag == 'table':
            self.table_stack.append(([], 0))
        elif tag in ('tr', 'td', 'th') and self.table_stack[-1] is not None:
            rows, depth = self.table_stack[-1]
            depth += 1
            self.table_stack[-1] = (rows, depth)
            # The outermost row/cell element starts a new table row, so a
            # bare <td> without <tr> still gets a row to live in.
            if depth == 1:
                rows.append([])
        elif tag == 'ul':
            self.li_stack.append(None)
        elif tag == 'ol':
            self.li_stack.append(0)
        elif tag == 'form':
            self.form_stack.append((attrs, [], {}))
        elif tag == 'input':
            tp = attrs.get('type', 'text')
            val = attrs.get('value', '')
            # Only submit inputs are kept when they have no name.
            name = attrs.get('name', None if tp != 'submit' else '')
            if name is not None and form is not None:
                form[1].append((name, tp, val))
                if tp != 'hidden':
                    self.repr_stack[-1] += '[%d:%s:%s=%s] ' % (
                        self.interact((form, name)), tp, name, val)
        elif tag == 'button' and attrs.get('type', None) != 'submit':
            self.tarea_stack.append(None)
        elif tag == 'textarea' or (tag == 'button'
                                   and attrs.get('type', None) == 'submit'):
            tp = 'textarea' if tag == 'textarea' else 'submit'
            name = attrs.get('name', None if tag == 'textarea' else '')
            value = attrs.get('value', '')
            if name is not None and form is not None:
                form[1].append((name, tp, value))
                label = name + '=' + value if tag == 'button' else name
                self.repr_stack[-1] += '[%d:%s:%s:' % (
                    self.interact((form, name)), tag, label)
                # Remember where the display prefix ends so that the closing
                # tag can store only the element's own text as the value.
                self.tarea_stack.append(
                    (form, len(form[1]) - 1, len(self.repr_stack[-1])))
            else:
                self.tarea_stack.append(False)
        elif tag == 'li':
            if self.li_stack[-1] is None:
                self.repr_stack[-1] += '\n * '
            else:
                self.li_stack[-1] += 1
                self.repr_stack[-1] += '\n %d. ' % self.li_stack[-1]
        elif tag in self._BLOCK_TAGS or tag == 'br':
            self.repr_stack[-1] += '\n'
            if tag[0] == 'h':
                # Markdown-like heading marker, one '#' per level.
                self.repr_stack[-1] += '#' * int(tag[1:]) + ' '
        elif tag == 'a':
            if 'href' not in attrs:
                self.tarea_stack.append(False)
            else:
                self.repr_stack[-1] += '[%d: ' % self.interact(attrs['href'])
                self.tarea_stack.append(True)
        elif tag == 'select':
            name = attrs.get('name', None)
            if form is not None and name is not None:
                form[1].append((name, 'select', []))
                self.repr_stack[-1] += '[%d:select:%s:' % (
                    self.interact((form, name)), name)
                # Share the list stored in the field so options land there.
                self.select_stack.append(form[1][-1][-1])
            else:
                self.select_stack.append(None)
        elif tag == 'option':
            value = attrs.get('value', None)
            if (self.select_stack[-1] not in (None, True, False)
                    and value is not None):
                self.select_stack[-1].append(
                    (value, None, 'selected' in attrs))
                self.select_stack.append(True)
            else:
                self.select_stack.append(False)
        elif tag == 'img':
            src = attrs.get('src', None)
            alt = attrs.get('alt', src)
            if src is not None:
                self.repr_stack[-1] += '[%d: %s] ' % (self.interact(src), alt)
        if tag in ('td', 'th'):
            # Content of a cell is not "directly inside a table".
            self.table_stack.append(None)

    def handle_data(self, data):
        """Append character data to the innermost open element's buffer."""
        self.repr_stack[-1] += data

    def handle_endtag(self, tag):
        """Close *tag* and every element still open inside it.

        End tags that match no open element are ignored.
        """
        if tag in self.tag_stack:
            i = len(self.tag_stack) - 1
            while self.tag_stack[i] != tag:
                i -= 1
            while len(self.tag_stack) > i:
                self.do_handle_endtag(self.tag_stack.pop())

    def close(self):
        """Finish parsing and close all elements that were left open."""
        super().close()
        while self.tag_stack:
            self.do_handle_endtag(self.tag_stack.pop())

    @staticmethod
    def _render_table(data):
        """Draw *data*, a non-empty list of rows of cell strings, as a grid."""
        ncols = max(map(len, data))
        nrows = len(data)
        # One extra zero-sized column/row so the closing border gets drawn.
        col_width = [0] * (ncols + 1)
        row_height = [0] * (nrows + 1)
        for y, row in enumerate(data):
            for x, cell in enumerate(row):
                lines = cell.split('\n')
                col_width[x] = max(col_width[x], *map(len, lines))
                row_height[y] = max(row_height[y], len(lines))
        table_width = sum(col_width) + ncols + 1
        table_height = sum(row_height) + nrows + 1
        grid = [[' '] * table_width for _ in range(table_height)]
        col_offset = []
        offset = 0
        for width in col_width:
            for j in range(table_height):
                grid[j][offset] = '|'
            col_offset.append(offset + 1)
            offset += width + 1
        row_offset = []
        offset = 0
        for height in row_height:
            for j in range(table_width):
                grid[offset][j] = '+' if grid[offset][j] == '|' else '-'
            row_offset.append(offset + 1)
            offset += height + 1
        for y, row in enumerate(data):
            for x, cell in enumerate(row):
                for dy, line in enumerate(cell.split('\n')):
                    for dx, char in enumerate(line):
                        grid[row_offset[y] + dy][col_offset[x] + dx] = char
        return '\n'.join(map(''.join, grid))

    def do_handle_endtag(self, tag):
        """Finish one element and merge its text buffer into its parent's."""
        if tag in ('td', 'th'):
            # Drop the None that was pushed when the cell was opened.
            self.table_stack.pop()
        if tag in ('tr', 'td', 'th') and self.table_stack[-1] is not None:
            rows, depth = self.table_stack[-1]
            if tag != 'tr':
                # Cell text moves into the table data, not the page text.
                rows[-1].append(cut_newlines(self.repr_stack[-1]))
                self.repr_stack[-1] = ''
            self.table_stack[-1] = (rows, depth - 1)
        elif tag == 'table':
            rows = self.table_stack.pop()[0]
            # A table without any row has nothing to draw.
            if rows:
                self.repr_stack[-1] += '\n' + self._render_table(rows) + '\n'
        elif tag in ('ul', 'ol'):
            self.li_stack.pop()
        elif tag == 'form':
            self.form_stack.pop()
        elif tag in ('button', 'textarea', 'a'):
            q = self.tarea_stack.pop()
            if q:
                self.repr_stack[-1] += ']'
                if tag == 'textarea':
                    form, index, start = q
                    # The default value is the text between the display
                    # prefix and the closing bracket just written.
                    form[1][index] = (
                        form[1][index][:2] + (self.repr_stack[-1][start:-1],))
        elif tag in self._BLOCK_TAGS or tag == 'li':
            self.repr_stack[-1] += '\n'
        elif tag == 'option':
            if self.select_stack.pop():
                value, _, selected = self.select_stack[-1][-1]
                self.select_stack[-1][-1] = (
                    value, self.repr_stack[-1].strip(), selected)
                self.repr_stack[-1] += '|'
        elif tag == 'select':
            options = self.select_stack.pop()
            if options is not None:
                if options:
                    # Replace the last character (normally the '|' written
                    # after the final option) with the closing bracket.
                    self.repr_stack[-1] = self.repr_stack[-1][:-1] + ']'
                else:
                    self.repr_stack[-1] += ']'
        elif tag in ('style', 'script'):
            self.repr_stack[-1] = ''
        self.repr_stack[-1] = self.repr_stack[-2] + self.repr_stack.pop()
def cut_newlines(x):
    """Strip trailing whitespace per line and cap blank-line runs at two.

    Lines consisting only of whitespace count as blank.
    """
    kept = []
    blank_run = 0
    for raw in x.split('\n'):
        line = raw.rstrip()
        blank_run = 0 if line else blank_run + 1
        # The third and later blank lines of a run are dropped.
        if blank_run <= 2:
            kept.append(line)
    return '\n'.join(kept)
def process_html(html):
    """Render an HTML string to text.

    Returns a ``(text, interactive)`` pair: the page text with blank-line
    runs trimmed by ``cut_newlines``, and the parser's list of numbered
    interactive targets.
    """
    parser = Parser()
    parser.feed(html)
    # close() also closes every element the document left open.
    parser.close()
    page_text = cut_newlines(parser.repr_stack[0])
    return (page_text, parser.interactive)
def _sniff_charset(text):
    """Return the label following the first ``charset=`` in *text*, or None.

    *text* is expected to have had its whitespace removed already.  One
    optional quote (single or double) after the ``=`` is skipped.
    """
    import re
    match = re.search(r'charset=["\']?([A-Za-z0-9._:+-]+)', text,
                      re.IGNORECASE)
    return match.group(1) if match else None


def load_page(req):
    """Read a response object and render its body as a page.

    The character set is taken from the Content-Type header when it names
    one, otherwise from the first ``charset=`` found in the body itself,
    otherwise latin-1 is used.

    Args:
        req: an object providing ``read()``, ``getheader()`` and
            ``geturl()``.

    Returns:
        ``(final_url, process_html(decoded_body))``.
    """
    data = req.read()
    content_type = ''.join(req.getheader('Content-Type', '').split())
    charset = _sniff_charset(content_type)
    if charset is None:
        # latin-1 maps every byte, so this lossy view is safe to search.
        body = ''.join(data.decode('latin-1').split())
        charset = _sniff_charset(body) or 'latin-1'
    try:
        text = data.decode(charset, 'replace')
    except LookupError:
        # The page named an encoding Python does not know; showing the text
        # as latin-1 beats failing to show the page at all.
        text = data.decode('latin-1', 'replace')
    return (req.geturl(), process_html(text))
def _link_at(interactive, index_text, base_url):
    """Resolve the numbered link *index_text* against *base_url*.

    Raises ValueError when the text is not a number or the target is a form
    control, and IndexError when no target has that number.
    """
    target = interactive[int(index_text)]
    if not isinstance(target, str):
        raise ValueError("Not a link")
    return urllib.parse.urljoin(base_url, target)


def browse_loop():
    """Run the interactive prompt until end of input.

    Commands: ``show``/``page``, ``go N``, ``goto URL``, ``back``,
    ``history``, ``use N``, ``reload``, ``editor sys``/``editor py`` and
    ``download [N]``.  Any error raised by a command is printed and the
    prompt continues.
    """
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor)
    # OpenerDirector reads ``addheaders``; any other attribute name is
    # silently ignored and the default User-Agent would be sent.
    opener.addheaders = [('user-agent', 'w3m/0.5.2')]
    browsing_stack = [('about:blank', ('', []))]
    syseditor = True

    def show(response):
        """Load *response*, push it on the history and page its text."""
        browsing_stack.append(load_page(response))
        pydoc.pager(browsing_stack[-1][1][0])

    while True:
        base_url, (html, interactive) = browsing_stack[-1]
        base_url = base_url.strip()
        try:
            cmd = input('> ')
            if cmd in ('show', 'page'):
                pydoc.pager(html)
            elif cmd.startswith('go '):
                show(opener.open(_link_at(interactive, cmd[3:], base_url)))
            elif cmd == 'back':
                if len(browsing_stack) < 2:
                    raise ValueError("Nowhere to go")
                browsing_stack.pop()
            elif cmd.startswith('goto '):
                show(opener.open(urllib.parse.urljoin(base_url, cmd[5:])))
            elif cmd == 'history':
                for entry in reversed(browsing_stack):
                    print(entry[0])
            elif cmd.startswith('use '):
                control = interactive[int(cmd[4:])]
                if isinstance(control, str):
                    raise ValueError("Not a control")
                form, name = control
                # Find the field's type and default; the loop variables
                # keep the matching (or, failing that, the last) field.
                for n, t, v in form[1]:
                    if n == name:
                        break
                form[2][n] = ask_for_value(n, t, v, syseditor=syseditor)
                if t == 'submit':
                    _, response = submit_form(opener, form, base_url)
                    show(response)
            elif cmd == 'reload':
                show(opener.open(base_url))
            elif cmd == 'editor sys':
                syseditor = True
            elif cmd == 'editor py':
                syseditor = False
            elif (cmd + ' ').startswith('download '):
                if cmd == 'download':
                    link = base_url
                else:
                    link = _link_at(interactive, cmd[9:], base_url)
                savepath = input('Save to: ')
                with opener.open(link) as file:
                    with open(savepath, 'wb') as wfile:
                        wfile.write(file.read())
            else:
                raise ValueError("Unknown cmd")
        except EOFError:
            break
        except (Exception, KeyboardInterrupt) as e:
            # Ctrl+C cancels the current command instead of quitting.
            print(e)
class StringPayload:
    """A textual form value: field name ``n`` and string value ``s``."""

    def __init__(self, n, s):
        self.n, self.s = n, s

    def __str__(self):
        # urlencode() stringifies values, so this is the submitted text.
        return self.s

    def get_payload(self):
        """Return this field as the bytes of one multipart/form-data part."""
        header = 'Content-Disposition: form-data; name="' + str(self.n) + '"'
        part = header + '\r\n\r\n' + str(self)
        return part.encode('utf-8')
class FilePayload:
    """An uploaded file: field name ``n``, file name ``f``, content ``d``."""

    def __init__(self, n, f, d):
        self.n, self.f, self.d = n, f, d

    def __str__(self):
        # In non-multipart encodings the file is represented by its name.
        return self.f

    def get_payload(self):
        """Return this file as the bytes of one multipart/form-data part."""
        header = ('Content-Disposition: form-data; name="' + str(self.n)
                  + '"; filename="' + str(self.f) + '"\r\n\r\n')
        return header.encode('utf-8') + self.d
def _edit_in_system_editor(initial):
    """Let the user edit *initial* in the system editor; return the result.

    The text is written to a temporary file that is closed before the editor
    starts and re-opened by name afterwards, so the result is seen even when
    the editor replaces the file instead of rewriting it in place.  If the
    editor cannot be started the text comes back unchanged.
    """
    import os
    import subprocess
    import sys
    import tempfile
    on_windows = sys.platform == 'win32'
    editor = 'edit' if on_windows else 'editor'
    fd, path = tempfile.mkstemp(suffix='.txt')
    try:
        with os.fdopen(fd, 'wb') as f:
            f.write((initial or '').encode('utf-8'))
        try:
            # An argument list avoids shell quoting problems with the path;
            # on Windows the shell is still used to locate the command.
            subprocess.call([editor, path], shell=on_windows)
        except OSError as e:
            print('Could not start %s: %s' % (editor, e))
        with open(path, 'rb') as f:
            return f.read().decode('utf-8', 'replace')
    finally:
        os.unlink(path)


def ask_for_value(name, tp, default, syseditor=True):
    """Ask the user for the value of one form control.

    Args:
        name: field name.
        tp: control type (``radio``, ``checkbox``, ``submit``, ``textarea``,
            ``select``, ``file``, ``password``; anything else is a text
            prompt).
        default: the control's current value; for ``select`` a list of
            ``(value, label, selected)`` tuples.
        syseditor: edit textareas in the system editor when true, otherwise
            read lines from the prompt.

    Returns:
        A ``StringPayload``, or a ``FilePayload`` for a chosen file.
    """
    if tp in ('radio', 'checkbox', 'submit'):
        print('Set', name, 'to', default)
        return StringPayload(name, default)
    if tp == 'textarea':
        if syseditor:
            return StringPayload(name, _edit_in_system_editor(default))
        print('Press Ctrl+D or Ctrl+C to exit.')
        lines = []
        while True:
            try:
                lines.append(input() + '\n')
            except EOFError:
                return StringPayload(name, ''.join(lines))
            except KeyboardInterrupt:
                if input('Save? (y/N) ').strip().lower() == 'y':
                    return StringPayload(name, ''.join(lines))
                return StringPayload(name, default)
    if tp == 'select':
        print('Choose an option:')
        dflt = ('', 'None')
        for i, (value, label, selected) in enumerate(default):
            if selected:
                dflt = (value, label)
            print('%d. %s' % (i + 1, label))
        ans = input('Choose an option [%s]: ' % dflt[1])
        try:
            choice = int(ans)
        except ValueError:
            choice = 0
        # Only the numbers that were listed are valid; 0 and negatives
        # must not wrap around to the end of the option list.
        if 1 <= choice <= len(default):
            return StringPayload(name, default[choice - 1][0])
        return StringPayload(name, dflt[0])
    if tp == 'file':
        import os.path
        while True:
            try:
                fn = input('File name: ')
            except KeyboardInterrupt:
                return StringPayload(name, default)
            if os.path.isfile(fn):
                break
            print('File not found')
        with open(fn, 'rb') as file:
            return FilePayload(name, os.path.split(fn)[1], file.read())
    if tp == 'password':
        import getpass
        return StringPayload(name, getpass.getpass())
    return StringPayload(name, input('Value [%s]: ' % default) or default)
def submit_form(opener, form, base_url):
    """Submit *form* and return ``(url, response)``.

    Args:
        opener: object whose ``open()`` accepts a URL string or a
            ``urllib.request.Request``.
        form: ``(attrs, fields, values)``; *fields* is a list of
            ``(name, type, value)`` and *values* maps names to payloads
            chosen so far.  Hidden fields are added to *values* here and the
            entry named ``''`` (an unnamed submit button) is removed.
        base_url: URL of the page containing the form.

    A missing ``action`` submits to *base_url*.  For GET the encoded fields
    replace the query string of the action URL.  For POST the body follows
    the form's ``enctype`` (urlencoded by default).
    """
    attrs, fields, values = form
    values.pop('', None)
    for n, t, v in fields:
        if t == 'hidden':
            values[n] = StringPayload(n, v)
    target = urllib.parse.urljoin(base_url, attrs.get('action') or '')
    if (attrs.get('method') or 'get').lower() != 'post':
        query = urllib.parse.urlencode(values)
        parts = urllib.parse.urlsplit(target)
        url = urllib.parse.urlunsplit(parts._replace(query=query))
        return (url, opener.open(url))
    enctype = (attrs.get('enctype') or '').lower()
    if enctype == 'multipart/form-data':
        payloads = [v.get_payload() for v in values.values()]
        # Draw boundaries until one does not occur in any part; always draw
        # at least once so the boundary is never empty.
        while True:
            delim = ('--------%d' % random.randrange(10 ** 15)).encode('ascii')
            if not any(delim in part for part in payloads):
                break
        data = b''.join(b'--' + delim + b'\r\n' + part + b'\r\n'
                        for part in payloads)
        data += b'--' + delim + b'--\r\n'
        enctype += '; boundary=' + delim.decode('ascii')
    elif enctype == 'text/plain':
        # text/plain bodies are unescaped "name=value" lines.
        data = ''.join('%s=%s\r\n' % (n, v)
                       for n, v in values.items()).encode('utf-8')
    else:
        data = urllib.parse.urlencode(values).encode('ascii')
        enctype = 'application/x-www-form-urlencoded'
    request = urllib.request.Request(target, data, {'Content-Type': enctype})
    return (target, opener.open(request))
# Script entry point: start the interactive browser prompt.
if __name__ == '__main__':
    # Importing readline is enough to give input() line editing and
    # history; the module is optional and absent on some platforms.
    try: import readline
    except ImportError: pass
    browse_loop()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment