Skip to content

Instantly share code, notes, and snippets.

@knabben
Created April 11, 2018 12:23
Show Gist options
  • Select an option

  • Save knabben/0eb65a01c2d79d8b76d6f0bcd12c86ce to your computer and use it in GitHub Desktop.

Select an option

Save knabben/0eb65a01c2d79d8b76d6f0bcd12c86ce to your computer and use it in GitHub Desktop.
Chapter classifier
import collections
from pymongo import MongoClient
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
# Connect to MongoDB using pymongo's default connection settings.
client = MongoClient()
# Cursor over every document of the `col_dump` collection in database `db`.
cursor = client.db.col_dump.find({})
# Maps a chapter label to the list of feature dicts gathered for that label.
label_feats = collections.defaultdict(list)
# Chapter classifier
def bag_of_words(words):
    """Turn a raw text string into a presence-feature dict.

    Non-breaking spaces, slashes and periods are replaced by spaces, the text
    is split on whitespace, and English stopwords (NLTK corpus) are dropped.
    The 500 best bigrams ranked by chi-squared are then added next to the
    individual tokens.

    Args:
        words: the text to featurise, as a single string.

    Returns:
        A dict mapping each kept token (str) and each selected bigram
        (2-tuple of str) to ``True``.
    """
    words = words.replace('\xa0', ' ').replace('/', ' ').replace('.', ' ')
    badwords = set(stopwords.words('english'))
    # Filter in reading order. Passing the tokens through a set first would
    # hand the collocation finder an arbitrary (hash-seed dependent) order,
    # so the "bigrams" would not be pairs of words that were ever adjacent.
    filtered_words = [word for word in words.split() if word not in badwords]
    bigram_finder = BigramCollocationFinder.from_words(filtered_words)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 500)
    # Duplicate tokens collapse naturally because they become dict keys.
    return {feature: True for feature in bigrams + filtered_words}
def split_label_feats(lfeats, split=0.9):
    """Split per-label feature lists into training and test sets.

    For every label, the first ``int(len(feats) * split)`` entries go to the
    training set and the remainder to the test set.

    Args:
        lfeats: mapping of label -> sequence of feature sets.
        split: fraction of each label's entries used for training.

    Returns:
        ``(train_feats, test_feats)``, each a list of ``(feat, label)`` tuples.
    """
    training, held_out = [], []
    for tag, samples in lfeats.items():
        boundary = int(len(samples) * split)
        head, tail = samples[:boundary], samples[boundary:]
        training += [(sample, tag) for sample in head]
        held_out += [(sample, tag) for sample in tail]
    return training, held_out
# Build one feature dict per stored exercise, grouped under a label made of
# the first two whitespace-separated tokens of the exercise name.
for item in cursor:
    title = item['name'].split()
    label = '-'.join(title[:2])
    # Text of the first option flagged as correct.
    answer = [opt for opt in item['options'] if opt['correct'] == 1][0]['sentence']
    label_feats[label].append(
        bag_of_words(item['sentence'].strip() + ' ' + answer)
    )

# Train on the leading part of each label's samples, score on the rest.
train_feats, test_feats = split_label_feats(label_feats)
nb_class = NaiveBayesClassifier.train(train_feats)
print('ACCURACY: ', accuracy(nb_class, test_feats))
---
import configparser
import pprint
import requests
from bs4 import BeautifulSoup
class Config:
    """Crawler settings plus a logged-in HTTP session.

    Reads the ``DEFAULT`` section of ``config.ini`` (keys used here:
    ``Bearer``, ``Username``, ``Password``) and performs the site login as
    part of construction.
    """

    def __init__(self):
        """Load ``config.ini``, prepare shared helpers and log in.

        Raises:
            KeyError: if a required key is absent from the ``DEFAULT`` section.
            RuntimeError: if the login request does not end with HTTP 200.
        """
        config = configparser.ConfigParser()
        config.read('config.ini')
        self.config = config['DEFAULT']
        # Shared pretty-printer used by the crawler for console output.
        self.pp = pprint.PrettyPrinter(indent=4)
        # Session keeps the cookies obtained during login.
        self.request_session = requests.Session()
        # Bearer-token header sent with API requests.
        self.headers = {
            'Authorization': 'Bearer {0}'.format(self.config['Bearer'])
        }
        self.start_login()

    def start_login(self):
        """Fetch the landing page for a CSRF token, then post the login form.

        Raises:
            RuntimeError: if the login request does not end with HTTP 200.
        """
        response = self.request_session.get('https://wiseuponline.com.br')
        resp = BeautifulSoup(response.content, "lxml")
        # NOTE(review): assumes the first <input> of the first <form> carries
        # the CSRF token - confirm against the live page markup.
        csrf_token = resp.find('form').find('input').attrs['value']
        response = self.request_session.post(
            'https://wiseuponline.com.br/site/login',
            {
                'LoginForm[username]': self.config['Username'],
                'LoginForm[password]': self.config['Password'],
                '_csrf-frontend': csrf_token
            },
            headers={'X-CSRF-Token': csrf_token}
        )
        # An explicit check instead of `assert`: assertions are removed when
        # Python runs with -O, which would let a failed login go unnoticed.
        if response.status_code != 200:
            raise RuntimeError(
                'login failed with HTTP status {0}'.format(
                    response.status_code))
---
import requests
from pymongo import MongoClient
from bs4 import BeautifulSoup
from dump.config import Config
from dump.utils import extract_json, make_request, iterate_response
# Shared settings object; exposes `headers`, `pp` and `request_session`
# used by the functions below.
config = Config()
# MongoDB client created with pymongo's default connection settings.
client = MongoClient()
# Base URL that every API endpoint in this module is built from.
API_URL = 'https://api.wiseuponline.com.br/v1'
def post_correct(list_item, children_id, item_id, answer_id):
    """Submit ``answer_id`` as the answer for a sequence item.

    The payload is sent as multipart form fields. When ``list_item`` is
    truthy, the ``percentualProgress`` value from the JSON reply is printed.

    Args:
        list_item: whether to print the progress reported by the API.
        children_id: id of the sequence item being answered (used in the URL).
        item_id: sent as the ``trail_item_id`` field.
        answer_id: sent as the ``answer`` field.
    """
    endpoint = '{0}/sequences/items/{1}/answer'.format(API_URL, children_id)
    # (None, value) tuples make requests emit plain multipart fields.
    form_fields = {
        'trail_item_id': (None, str(item_id)),
        'answer': (None, str(answer_id)),
    }
    reply = requests.post(endpoint, files=form_fields, headers=config.headers)
    if not list_item:
        return
    progress = reply.json()['percentualProgress']
    config.pp.pprint('{}% -----------'.format(progress))
def post_correct_item(list_item, data, item_id, children_id):
    """Pick the option marked correct in ``data`` and submit it as the answer.

    Args:
        list_item: when truthy, print the question text and the correct
            option's text before submitting.
        data: mapping with an ``item`` entry holding ``sentence`` (HTML) and
            ``options`` (each with ``id``, ``sentence`` and ``correct``).
        item_id: trail item id forwarded to ``post_correct``.
        children_id: sequence item id forwarded to ``post_correct``.

    Raises:
        IndexError: if no option has ``correct == 1``.
    """
    # On Python 3 filter() returns a lazy iterator that cannot be indexed,
    # so collect the matches into a list before taking the first one.
    correct_options = [option for option in data['item']['options']
                       if option['correct'] == 1]
    if not correct_options:
        raise IndexError(
            'no option marked correct for item {0}'.format(item_id))
    correct_item = correct_options[0]
    if list_item:
        # The question is stored as HTML; print only its text content.
        sentence = BeautifulSoup(data['item']['sentence'],
                                 'lxml').text.strip()
        config.pp.pprint(sentence)
        config.pp.pprint(correct_item['sentence'])
    post_correct(list_item, children_id, item_id, correct_item['id'])
def fetch_trail_items(trail_id, print_data=True):
    """List the items of a trail.

    Args:
        trail_id: id of the trail to query.
        print_data: when truthy, print each item's lesson title.

    Returns:
        A list of ``{'item_id': ..., 'trail_id': ...}`` dicts, one per item
        returned by the API.
    """
    endpoint = '{0}/trails/{1}/items'.format(API_URL, trail_id)
    entries = iterate_response(make_request(endpoint))
    collected = []
    for entry in entries:
        if print_data:
            config.pp.pprint(entry['lesson']['title'])
        collected.append({'item_id': entry['id'], 'trail_id': trail_id})
    return collected
def fetch_item_exercises(trail_id, item_id, print_data=True):
    """Load the exercise data embedded in a trail item's sequence page.

    The API is asked for the trail item; the id of its first ``preparation``
    entry and the item's own id are then used to build the sequence page URL,
    which is fetched with the logged-in session.

    Args:
        trail_id: id of the trail.
        item_id: id of the item inside the trail.
        print_data: accepted for signature parity; not used here.

    Returns:
        Whatever ``extract_json`` extracts from the page content.
    """
    endpoint = '{0}/trails/{1}/items/{2}'.format(API_URL, trail_id, item_id)
    detail = make_request(endpoint).json()
    preparation_id = detail['preparation'][0]['id']
    detail_id = detail['id']
    page_url = 'https://wiseuponline.com.br/sequence/{0}-{1}'.format(
        preparation_id, detail_id)
    page = config.request_session.get(page_url)
    return extract_json(page.content)
def exercises_options(exercises, print_data=True):
    """Fetch the ``item`` payload for every child of every exercise group.

    Args:
        exercises: iterable of dicts; only entries with a ``children`` key
            are processed.
        print_data: when truthy, print each processed entry's ``name``.

    Returns:
        A list with the ``item`` value of each child's API response.
    """
    collected = []
    for entry in exercises:
        if 'children' not in entry:  # ignore breadcrumb data
            continue
        if print_data:
            config.pp.pprint(entry['name'])
        for child in entry['children']:
            endpoint = '{0}/sequences/items/{1}'.format(API_URL, child['id'])
            collected.append(make_request(endpoint).json()['item'])
    return collected
def persist_on_database(rows, name):
    """Insert ``rows`` into the ``col_dump`` collection of database ``db``.

    Any exception is reported on stdout together with the rows and is not
    re-raised.

    Args:
        rows: documents to insert.
        name: accepted but not used.

    Returns:
        The ``insert_many`` result, or ``None`` when the insert failed.
    """
    try:
        outcome = client['db']['col_dump'].insert_many(rows)
    except Exception as error:
        print('ERROR: ', error, rows)
        return None
    return outcome
---
def make_request(url):
    """GET ``url`` with the configured authorization headers.

    Returns:
        The response object produced by ``requests.get``.
    """
    auth_headers = config.headers
    return requests.get(url, headers=auth_headers)
def iterate_response(response):
    """Return the decoded JSON body of ``response`` as a new list.

    Iterating a JSON array yields its elements; iterating a JSON object
    yields its keys.
    """
    return list(response.json())
def extract_json(data):
    """Pull the ``progressOptions`` value out of a page's second script tag.

    Args:
        data: HTML markup of the page.

    Returns:
        The ``progressOptions`` entry of the JSON embedded in the script.
    """
    soup = BeautifulSoup(data, "lxml")
    scripts = soup.findAll('script')
    # NOTE(review): the [21:-2] slice presumably strips a fixed JavaScript
    # wrapper around the JSON literal - confirm against the page source.
    raw_json = scripts[1].text[21:-2]
    return json.loads(raw_json)['progressOptions']
def format_options(options):
    """Reduce an exercise payload to the fields that get stored.

    Args:
        options: mapping with ``name``, ``sentence`` (HTML) and ``options``
            (each having ``sentence`` and ``correct``).

    Returns:
        A dict with ``name``, the text content of ``sentence``, and a list of
        ``{'sentence', 'correct'}`` dicts under ``options``.
    """
    plain_sentence = BeautifulSoup(options['sentence'], 'lxml').text
    choices = [
        {'sentence': choice['sentence'], 'correct': choice['correct']}
        for choice in options['options']
    ]
    return {
        'name': options['name'],
        'sentence': plain_sentence,
        'options': choices,
    }
---
import argparse
from dump.utils import (
iterate_response, make_request, format_options)
from dump.data import (
fetch_trail_items, fetch_item_exercises, exercises_options,
persist_on_database)
def main(args, url='https://api.wiseuponline.com.br/v1/trails'):
    """Crawl trails, their items and exercises, and store them in MongoDB.

    Args:
        args: parsed command-line namespace; ``args.list`` toggles console
            output and ``args.level`` (optional) restricts the crawl to
            trails whose level slug matches it.
        url: endpoint that lists the trails.
    """
    levels = iterate_response(make_request(url))
    print_data = args.list

    print('------- Starting Crawler -------')
    if args.level:
        levels = [entry for entry in levels
                  if entry['level']['slug'] == args.level]

    # Fetch levels data
    for level in reversed(levels):
        name = level['level']['name']
        trail_id = level.get('id')
        if print_data:
            print(name, trail_id)

        # Fetch all trails items
        for trail in fetch_trail_items(trail_id, print_data):
            trail_id = trail['trail_id']
            item_id = trail['item_id']
            if print_data:
                print(trail_id, item_id)

            # Fetch trails exercises
            exs = fetch_item_exercises(trail_id, item_id, print_data)

            # Format options and insert on mongodb
            rows = []
            for options in exercises_options(exs, print_data):
                rows.append(format_options(options))
            persist_on_database(rows, name)

        # NOTE(review): nesting reconstructed from a copy without indentation;
        # the separator is assumed to print once per level - confirm.
        print('--------------')
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # `--list` is store_true with default=True, so on its own it can never
    # switch output off. The default (printing on) and the flag are kept for
    # compatibility; `--no-list` writes False to the same destination.
    parser.add_argument('--list', dest='list', default=True,
                        action='store_true',
                        help='print crawled data while running (default)')
    parser.add_argument('--no-list', dest='list', action='store_false',
                        help='do not print crawled data')
    parser.add_argument('level', nargs='?',
                        help='only crawl trails whose level slug matches')
    args = parser.parse_args()
    main(args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment