Skip to content

Instantly share code, notes, and snippets.

@jwlin
Created November 2, 2016 05:03
Show Gist options
  • Save jwlin/cd305b747b35acfd3d372848daa169d4 to your computer and use it in GitHub Desktop.
Save jwlin/cd305b747b35acfd3d372848daa169d4 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Extract features from the <input> tags of HTML form files and write them to per-form corpus files.
"""
import sys, os, random, datetime
from bs4 import BeautifulSoup
from preprocess import extract_features
def random_pick(parent_dir, limit=None):
    """Return the id prefixes of the entries in *parent_dir*, in random order.

    The id of an entry is the part of its file name before the first '-'.
    One id is produced per directory entry, so an id shared by several files
    appears several times in the result.

    Args:
        parent_dir: directory whose entries are listed.
        limit: optional maximum number of ids to keep after shuffling.
            ``None`` (the default) keeps every id.

    Returns:
        A shuffled list of id strings, truncated to *limit* items if given.

    Raises:
        ValueError: if *limit* is negative.
    """
    if limit is not None and limit < 0:
        raise ValueError('limit must be non-negative, got %r' % (limit,))
    ids = [fname.split('-')[0] for fname in os.listdir(parent_dir)]
    random.shuffle(ids)
    return ids if limit is None else ids[:limit]
if __name__ == '__main__':
    # Script entry point: for every selected form file, parse the HTML,
    # find its <input> tags of the configured types and write one
    # extract_features() line per tag into '<form name>.corpus'.
    current_dir = os.path.dirname(__file__)
    form_dir = 'forms'
    forms_path = os.path.join(current_dir, form_dir)

    # Each run writes into its own timestamped directory under 'corpus'.
    train_dir = os.path.join(
        current_dir, 'corpus',
        'trial-' + datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
    os.makedirs(train_dir)

    input_types = ['text', 'email', 'password']
    training_ids = random_pick(forms_path)
    # Single-argument print form gives the same output on Python 2 and 3.
    print('training_ids: %s' % (training_ids,))
    # Set lookup avoids rescanning the id list for every file name.
    training_id_set = set(training_ids)

    for fname in os.listdir(forms_path):
        if fname.split('-')[0] not in training_id_set:
            continue

        with open(os.path.join(forms_path, fname), 'r') as f:
            # Markup is lower-cased before parsing.
            dom = f.read().lower()
        soup = BeautifulSoup(dom, 'html5lib')

        file_name = os.path.splitext(fname)[0]
        # train_dir already starts with current_dir; joining current_dir
        # again would duplicate it whenever current_dir is a non-empty
        # relative path.
        c_path = os.path.join(train_dir, file_name + '.corpus')

        lines = []
        for input_type in input_types:
            for input_tag in soup.find_all('input', attrs={'type': input_type}):
                lines.append(extract_features(input_tag) + '\n')

        # Open the corpus file once per form, and only when there is
        # something to write, so forms without matching inputs still
        # produce no file.
        if lines:
            with open(c_path, 'a') as cf:
                cf.writelines(lines)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment