Created
April 30, 2009 11:19
-
-
Save simonw/104413 to your computer and use it in GitHub Desktop.
Turn a BeautifulSoup form into a dict of fields and default values - useful for screen scraping forms and then resubmitting them
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def extract_form_fields(self, soup):
    """Turn a BeautifulSoup form into a dict of fields and default values.

    Collects the value that every named ``<input>``, ``<textarea>`` and
    ``<select>`` found under ``soup`` would submit by default, which is
    useful for scraping a form and then re-submitting it.

    Attributes are read only through ``tag.get(...)`` so the function does
    not rely on ``has_key`` (which exists only on Python 2 dicts and older
    BeautifulSoup tags).

    Args:
        self: Unused; kept so existing method-style callers keep working.
        soup: The tag (typically the ``<form>``) to search. It must provide
            ``findAll``, and the tags it yields must provide ``get`` and
            ``string``.

    Returns:
        A dict mapping each control name to its default value: a string for
        inputs, textareas and single selects, and a list of strings for
        ``<select multiple>``. Unchecked checkboxes and radio groups with no
        checked member map to ``''``. Controls without a ``name`` attribute
        are skipped.

    Raises:
        AssertionError: If a named input has an unsupported type, or a
            non-multiple select has more than one selected option.
    """
    fields = {}

    for tag in soup.findAll('input'):
        name = tag.get('name')
        if name is None:
            # Controls without a name are never part of a submission.
            continue
        # A missing or empty type attribute means type="text" in HTML, and
        # the attribute value is case-insensitive.
        kind = (tag.get('type') or 'text').lower()

        if kind in ('text', 'hidden', 'password', 'submit', 'image'):
            # Single name/value fields.
            fields[name] = tag.get('value', '')
        elif kind in ('checkbox', 'radio'):
            value = ''
            if tag.get('checked') is not None:
                # A checked box without an explicit value submits "on".
                value = tag.get('value', 'on')
            # A checked member always wins; an unchecked one only fills in
            # the empty default the first time the name is seen.
            if value or name not in fields:
                fields[name] = value
        else:
            # Raised explicitly (not via ``assert``) so it survives ``-O``.
            raise AssertionError('input type %s not supported' % kind)

    for tag in soup.findAll('textarea'):
        name = tag.get('name')
        if name is not None:
            fields[name] = tag.string or ''

    for tag in soup.findAll('select'):
        name = tag.get('name')
        if name is None:
            continue
        options = tag.findAll('option')
        chosen = [opt for opt in options if opt.get('selected') is not None]
        # If no option is marked selected, go with the first one.
        if not chosen and options:
            chosen = options[:1]

        values = []
        for opt in chosen:
            value = opt.get('value')
            if value is None:
                # An option without a value attribute submits its text.
                value = (opt.string or '').strip()
            values.append(value)

        if tag.get('multiple') is not None:
            fields[name] = values
        else:
            if len(values) > 1:
                raise AssertionError(
                    'select %s has more than one selected option' % name)
            fields[name] = values[0] if values else ''

    return fields
oh this is good
This turned out to be very helpful but I am not sure why there is a self.
Hey @simonw, Would you mind putting this under a free license, e.g. the Apache or MIT license? :)
Thx!
def extract_form_fields(soup):
    """Turn a BeautifulSoup form into a dict of fields and default values.

    Walks every ``<input>``, ``<textarea>`` and ``<select>`` under ``soup``
    and records the value each named control would submit by default.

    Attribute presence is tested with ``Tag.has_attr``: on a BeautifulSoup 4
    tag the ``in`` operator looks at the tag's children, not its attributes,
    so ``'value' in tag`` cannot be used for this.

    Args:
        soup: The BeautifulSoup 4 tag (typically the ``<form>``) to search.

    Returns:
        A dict mapping control names to default values. Inputs, textareas
        and single selects map to a string; ``<select multiple>`` maps to a
        list of strings. Unchecked checkboxes and radio groups without a
        checked member map to ``''``. Controls without a ``name`` attribute
        are left out.

    Raises:
        AssertionError: If a named input has an unsupported type, or a
            non-multiple select has more than one selected option.
    """
    single_value_types = ('text', 'hidden', 'password', 'submit', 'image')
    toggle_types = ('checkbox', 'radio')
    fields = {}

    for input_tag in soup.find_all('input'):
        if not input_tag.has_attr('name'):
            # Nameless controls are never submitted.
            continue
        name = input_tag.get('name')
        # HTML falls back to type="text" when the attribute is absent or
        # empty; the value itself is case-insensitive.
        input_type = (input_tag.get('type') or 'text').lower()

        if input_type in single_value_types:
            fields[name] = input_tag.get('value', '')
            continue

        if input_type in toggle_types:
            checked_value = ''
            if input_tag.has_attr('checked'):
                # A checked box with no value attribute submits "on".
                checked_value = input_tag.get('value', 'on')
            # Keep an already-recorded checked value; only let an unchecked
            # member supply the empty default for a name not seen yet.
            if checked_value or name not in fields:
                fields[name] = checked_value
            continue

        # Explicit raise instead of ``assert`` so the check survives ``-O``.
        raise AssertionError('input type %s not supported' % input_type)

    for textarea in soup.find_all('textarea'):
        if textarea.has_attr('name'):
            fields[textarea.get('name')] = textarea.string or ''

    for select in soup.find_all('select'):
        if not select.has_attr('name'):
            continue
        options = select.find_all('option')
        selected_options = [
            option for option in options if option.has_attr('selected')
        ]
        # If no option is marked selected, go with the first one.
        if not selected_options and options:
            selected_options = [options[0]]

        # An option without a value attribute submits its text content.
        values = [
            option.get('value') if option.has_attr('value')
            else (option.string or '').strip()
            for option in selected_options
        ]

        if select.has_attr('multiple'):
            fields[select.get('name')] = values
            continue
        if len(values) > 1:
            raise AssertionError(
                'select %s has more than one selected option'
                % select.get('name'))
        fields[select.get('name')] = values[0] if values else ''

    return fields
Slightly modified for Python 3
Thanks! It was missing this check for inputs without a type attribute:
if not 'type' in input:
continue
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks! That saved me a bunch of time.