Created
April 11, 2018 12:23
-
-
Save knabben/0eb65a01c2d79d8b76d6f0bcd12c86ce to your computer and use it in GitHub Desktop.
Chapter classifier
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import collections | |
| from pymongo import MongoClient | |
| from nltk.corpus import stopwords | |
| from nltk.collocations import BigramCollocationFinder | |
| from nltk.metrics import BigramAssocMeasures | |
| from nltk.classify import NaiveBayesClassifier | |
| from nltk.classify.util import accuracy | |
# Maps each label to the list of feature dicts collected for it.
label_feats = collections.defaultdict(list)

# MongoClient() is created with no arguments (pymongo's default connection).
client = MongoClient()
# Cursor over every document of collection `col_dump` in database `db`.
cursor = client['db']['col_dump'].find({})
| # Chapter classifier | |
def bag_of_words(words):
    """Build a presence-feature dict from a piece of text.

    Non-breaking spaces, slashes and full stops are replaced by spaces and
    the text is split on whitespace. Tokens found in NLTK's English stopword
    list are dropped, then up to 500 bigrams ranked by chi-squared are added
    alongside the remaining single words.

    Args:
        words: The raw text as a single string.

    Returns:
        A dict mapping every kept word (str) and every selected bigram
        (2-tuple of str) to ``True``.
    """
    words = words.replace('\xa0', ' ').replace('/', ' ').replace('.', ' ')
    badwords = set(stopwords.words('english'))
    # Filter in order instead of using a set difference: a set discards token
    # order, so bigrams built from it would pair words that were never
    # adjacent in the text and would differ from run to run.
    # NOTE(review): the comparison is case-sensitive, so capitalised
    # stopwords such as "The" are kept -- confirm whether that is intended.
    filtered_words = [word for word in words.split() if word not in badwords]
    bigram_finder = BigramCollocationFinder.from_words(filtered_words)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 500)
    # Repeated tokens collapse here because the result is keyed by feature.
    return {feature: True for feature in bigrams + filtered_words}
def split_label_feats(lfeats, split=0.9):
    """Partition labelled feature sets into training and test lists.

    For every label, the leading ``int(len(feats) * split)`` entries go to
    the training list and the remaining entries go to the test list.

    Args:
        lfeats: Mapping of label -> sequence of feature sets.
        split: Fraction of each label's entries used for training.

    Returns:
        A ``(train_feats, test_feats)`` tuple; each element is a list of
        ``(feature_set, label)`` pairs.
    """
    training, testing = [], []
    for label, samples in lfeats.items():
        boundary = int(len(samples) * split)
        for sample in samples[:boundary]:
            training.append((sample, label))
        for sample in samples[boundary:]:
            testing.append((sample, label))
    return training, testing
# Collect one feature dict per stored document, grouped under a label made of
# the first two whitespace-separated tokens of the document's name.
for item in cursor:
    name_tokens = item['name'].split()
    label = '-'.join(name_tokens[:2])
    # Sentence of the first option whose `correct` field equals 1.
    correct_options = [opt for opt in item['options'] if opt['correct'] == 1]
    answer = correct_options[0]['sentence']
    text = item['sentence'].strip() + ' ' + answer
    label_feats[label].append(bag_of_words(text))

# Train on the training portion, then report accuracy on the held-out portion.
train_feats, test_feats = split_label_feats(label_feats)
nb_class = NaiveBayesClassifier.train(train_feats)
print('ACCURACY: ', accuracy(nb_class, test_feats))
| --- | |
| import configparser | |
| import pprint | |
| import requests | |
| from bs4 import BeautifulSoup | |
class Config():
    """Runtime settings plus an HTTP session that is logged in on creation.

    Reads the ``DEFAULT`` section of ``config.ini``; the keys used here are
    ``Bearer``, ``Username`` and ``Password``.
    """

    def __init__(self):
        """Load ``config.ini``, build the shared helpers and log in."""
        config = configparser.ConfigParser()
        config.read('config.ini')
        self.config = config['DEFAULT']
        # Shared pretty-printer for console output.
        self.pp = pprint.PrettyPrinter(indent=4)
        # A single session object is used for the login requests below.
        self.request_session = requests.Session()
        # Authorization header built from the configured bearer token.
        self.headers = {
            'Authorization': 'Bearer {0}'.format(self.config['Bearer'])
        }
        self.start_login()

    def start_login(self):
        """Log in through the site's HTML login form.

        Fetches the landing page, takes the ``value`` attribute of the first
        ``<input>`` inside the first ``<form>`` as the CSRF token, and posts
        it together with the configured username and password.

        Raises:
            AssertionError: If the login request does not answer HTTP 200.
        """
        response = self.request_session.get('https://wiseuponline.com.br')
        resp = BeautifulSoup(response.content, "lxml")
        csrf_token = resp.find('form').find('input').attrs['value']
        response = self.request_session.post(
            'https://wiseuponline.com.br/site/login',
            {
                'LoginForm[username]': self.config['Username'],
                'LoginForm[password]': self.config['Password'],
                '_csrf-frontend': csrf_token
            },
            headers={'X-CSRF-Token': csrf_token}
        )
        # Raise explicitly instead of using an `assert` statement: asserts
        # are removed when Python runs with -O, which would let a failed
        # login pass silently. AssertionError is kept so the failure type
        # seen by callers does not change.
        if response.status_code != 200:
            raise AssertionError(
                'login failed: HTTP {0}'.format(response.status_code))
| --- | |
| import requests | |
| from pymongo import MongoClient | |
| from bs4 import BeautifulSoup | |
| from dump.config import Config | |
| from dump.utils import extract_json, make_request, iterate_response | |
# Base URL that the endpoint templates in this module are formatted with.
API_URL = 'https://api.wiseuponline.com.br/v1'

# Both objects are created once, when the module is imported.
config = Config()
client = MongoClient()
def post_correct(list_item, children_id, item_id, answer_id):
    """Submit an answer for a sequence item and optionally print progress.

    Args:
        list_item: When truthy, print the ``percentualProgress`` value taken
            from the JSON response.
        children_id: Sequence item id placed in the endpoint URL.
        item_id: Sent as the ``trail_item_id`` field.
        answer_id: Sent as the ``answer`` field.
    """
    endpoint = '{0}/sequences/items/{1}/answer'.format(API_URL, children_id)
    # Passed through `files=` as (None, value) tuples, i.e. fields without a
    # filename.
    form_fields = {
        'trail_item_id': (None, str(item_id)),
        'answer': (None, str(answer_id)),
    }
    response = requests.post(
        endpoint, files=form_fields, headers=config.headers)
    if not list_item:
        return
    progress = response.json()['percentualProgress']
    config.pp.pprint('{}% -----------'.format(progress))
def post_correct_item(list_item, data, item_id, children_id):
    """Pick the correct option of an exercise and submit it as the answer.

    Args:
        list_item: When truthy, print the exercise sentence (with HTML
            markup removed) and the correct option's sentence first.
        data: Exercise payload; ``data['item']['options']`` and
            ``data['item']['sentence']`` are read.
        item_id: Forwarded to ``post_correct``.
        children_id: Forwarded to ``post_correct``.

    Raises:
        IndexError: If no option has ``correct == 1``.
    """
    # On Python 3 filter() returns a lazy iterator that cannot be indexed,
    # so the matching options are collected into a list before taking [0].
    correct_options = [
        option for option in data['item']['options']
        if option['correct'] == 1
    ]
    correct_item = correct_options[0]
    if list_item:
        sentence = BeautifulSoup(data['item']['sentence'],
                                 'lxml').text.strip()
        config.pp.pprint(sentence)
        config.pp.pprint(correct_item['sentence'])
    post_correct(list_item, children_id, item_id, correct_item['id'])
def fetch_trail_items(trail_id, print_data=True):
    """Fetch the items of a trail.

    Args:
        trail_id: Id placed in the ``/trails/<id>/items`` endpoint URL.
        print_data: When truthy, pretty-print each item's lesson title.

    Returns:
        A list of ``{'item_id': ..., 'trail_id': ...}`` dicts, one per item
        in the response.
    """
    response = make_request('{0}/trails/{1}/items'.format(API_URL, trail_id))
    collected = []
    for entry in iterate_response(response):
        if print_data:
            config.pp.pprint(entry['lesson']['title'])
        collected.append({'item_id': entry['id'], 'trail_id': trail_id})
    return collected
def fetch_item_exercises(trail_id, item_id, print_data=True):
    """Fetch the exercise data attached to one trail item.

    The trail item is requested from the API; the id of its first
    ``preparation`` entry and its own id form the URL of a ``sequence`` page,
    which is downloaded with the shared session and handed to
    ``extract_json``.

    Args:
        trail_id: Id of the trail.
        item_id: Id of the item inside the trail.
        print_data: Accepted but not used by this function.

    Returns:
        Whatever ``extract_json`` returns for the sequence page content.
    """
    api_url = '{0}/trails/{1}/items/{2}'.format(API_URL, trail_id, item_id)
    payload = make_request(api_url).json()
    preparation_id = payload['preparation'][0]['id']
    page_url = 'https://wiseuponline.com.br/sequence/{0}-{1}'.format(
        preparation_id, payload['id'])
    page = config.request_session.get(page_url)
    return extract_json(page.content)
def exercises_options(exercises, print_data=True):
    """Fetch the ``item`` payload of every child of the given exercises.

    Args:
        exercises: Iterable of dicts; entries without a ``children`` key are
            skipped.
        print_data: When truthy, pretty-print the name of each entry that
            has children.

    Returns:
        A list with the ``item`` value of each child's API response, in
        traversal order.
    """
    collected = []
    for entry in exercises:
        # Entries lacking `children` are breadcrumb data and are ignored.
        if 'children' not in entry:
            continue
        if print_data:
            config.pp.pprint(entry['name'])
        for child in entry['children']:
            child_url = '{0}/sequences/items/{1}'.format(API_URL, child['id'])
            payload = make_request(child_url).json()
            collected.append(payload['item'])
    return collected
def persist_on_database(rows, name):
    """Insert ``rows`` into collection ``col_dump`` of database ``db``.

    This is best effort: any exception raised by the insert is printed and
    swallowed.

    Args:
        rows: Documents to insert.
        name: Not used by this function; kept so existing callers work.

    Returns:
        The value returned by ``insert_many``, or ``None`` when ``rows`` is
        empty or the insert failed.
    """
    # insert_many() refuses an empty document list, so an empty batch is
    # skipped instead of being reported as an error.
    if not rows:
        return None
    try:
        return client['db']['col_dump'].insert_many(rows)
    except Exception as e:
        print('ERROR: ', e, rows)
        return None
| --- | |
def make_request(url):
    """Send a GET request to ``url`` using the shared ``config.headers``.

    Returns:
        The response object returned by ``requests.get``.
    """
    auth_headers = config.headers
    return requests.get(url, headers=auth_headers)
def iterate_response(response):
    """Return the elements of the response's decoded JSON body as a list.

    Args:
        response: Object exposing a ``json()`` method that returns an
            iterable.
    """
    return list(response.json())
def extract_json(data):
    """Extract the ``progressOptions`` value embedded in an HTML page.

    Args:
        data: HTML content handed to BeautifulSoup (``lxml`` parser).

    Returns:
        The ``progressOptions`` entry of the JSON decoded from the second
        ``<script>`` element.
    """
    soup = BeautifulSoup(data, "lxml")
    scripts = soup.findAll('script')
    # The slice drops the first 21 and the last 2 characters of the script
    # text before decoding -- presumably a wrapper around the JSON literal;
    # TODO confirm against the actual page.
    raw_script = scripts[1].text
    payload = json.loads(raw_script[21:-2])
    return payload['progressOptions']
def format_options(options):
    """Reduce an exercise payload to the fields that get stored.

    Args:
        options: Dict with ``name``, ``sentence`` (HTML) and ``options`` (a
            list of dicts that each have ``sentence`` and ``correct``).

    Returns:
        A dict with ``name``, the exercise ``sentence`` as text with markup
        removed, and ``options`` cut down to ``sentence`` and ``correct``.
    """
    plain_sentence = BeautifulSoup(options['sentence'], 'lxml').text
    choices = [
        {'sentence': choice['sentence'], 'correct': choice['correct']}
        for choice in options['options']
    ]
    return {
        'name': options['name'],
        'sentence': plain_sentence,
        'options': choices,
    }
| --- | |
| import argparse | |
| from dump.utils import ( | |
| iterate_response, make_request, format_options) | |
| from dump.data import ( | |
| fetch_trail_items, fetch_item_exercises, exercises_options, | |
| persist_on_database) | |
def main(args, url='https://api.wiseuponline.com.br/v1/trails'):
    """Crawl trails, their items and exercises, and store the exercises.

    Args:
        args: Parsed command-line namespace; ``args.list`` switches console
            output on and ``args.level`` (optional) keeps only trails whose
            ``level.slug`` equals it.
        url: Endpoint that lists the trails.
    """
    data = iterate_response(make_request(url))
    print_data = args.list
    print('------- Starting Crawler -------')

    if args.level:
        data = [entry for entry in data
                if entry['level']['slug'] == args.level]

    # Levels are processed in reverse of the order the API returned them.
    for level_entry in reversed(data):
        name = level_entry['level']['name']
        trail_id = level_entry.get('id')
        if print_data:
            print(name, trail_id)

        # Every item of the trail.
        for trail in fetch_trail_items(trail_id, print_data):
            trail_id = trail['trail_id']
            item_id = trail['item_id']
            if print_data:
                print(trail_id, item_id)

            # Exercises of the item, formatted and inserted on mongodb.
            exs = fetch_item_exercises(trail_id, item_id, print_data)
            rows = []
            for options in exercises_options(exs, print_data):
                rows.append(format_options(options))
            persist_on_database(rows, name)
            print('--------------')
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # `--list` is a store_true flag whose default is already True, so on its
    # own it can never change anything. `--no-list` writes False to the same
    # destination so the output can be switched off; the default stays True.
    parser.add_argument('--list', default=True, action='store_true')
    parser.add_argument('--no-list', dest='list', action='store_false')
    # Optional level slug used to restrict the crawl.
    parser.add_argument('level', nargs='?')
    args = parser.parse_args()
    main(args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment