# -*- coding: utf-8 -*-
import os, sys, re

# Make the sibling 'lib' directory importable
lib_path = os.path.realpath(os.path.join(os.path.abspath(os.path.dirname(__file__)), '..', 'lib'))
if lib_path not in sys.path:
    sys.path[0:0] = [lib_path]
# Main
import json
import random
import numpy as np
import pandas as pd
from difflib import SequenceMatcher
#from sklearn.feature_extraction import DictVectorizer
#from sklearn import svm, preprocessing, cluster#, cross_validation
#from sklearn.metrics import precision_recall_curve, auc, classification_report, precision_recall_fscore_support
#from sklearn.preprocessing import StandardScaler
from nltk.corpus import stopwords
#from textblob import Word, TextBlob
# DBSCAN
#####################################################
################# Temporary Helpers #################

# Page index -> text-node indices of the labelled fields (not yet used below)
DATA_LOOKUP = {
    0: {'breadcrumbs': 18, 'title': 19, 'price': 20},
    1: {'breadcrumbs': 18, 'title': 19, 'price': 20},
    2: {'breadcrumbs': 18, 'title': 19, 'price': 20},
    3: {'breadcrumbs': 20, 'title': 21, 'price': 22},
    4: {'breadcrumbs': 20, 'title': 21, 'price': 22},
    5: {'breadcrumbs': 20, 'title': 21, 'price': 22},
    6: {'breadcrumbs': 20, 'title': 21, 'price': 22},
    7: {'breadcrumbs': 11, 'title': 13, 'price': 17},
    8: {'breadcrumbs': 11, 'title': 13, 'price': 17},
}
#####################################################
##################### Constants #####################

STOP = stopwords.words('english')
#####################################################
###################### Helpers ######################

def load_data(file):
    with open(file) as f:
        data = json.load(f)
    return data
def longest_substring(string_list):
    """Longest substring common to every string in string_list."""
    string_list = [s.strip() for s in string_list if s is not None]
    if not string_list:
        return ""
    long_comm_substr = string_list[0]
    for s in string_list[1:]:
        # Narrow the candidate against each remaining string in turn
        match = SequenceMatcher(None, long_comm_substr, s).find_longest_match(0, len(long_comm_substr), 0, len(s))
        if match.size == 0:
            return ""
        long_comm_substr = long_comm_substr[match.a: match.a + match.size]
    return long_comm_substr
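# Illustrative example (hypothetical page titles):
#   longest_substring(["Acme Widget | Shop", "Acme Widget | Blog"]) -> "Acme Widget | "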
def trim_ends(string):
    """Cut everything from the first ' |' onwards (site-name suffixes in titles)."""
    return re.sub(r'\s\|.*', "", string)
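# Illustrative: trim_ends("Acme Widget | Shop") -> "Acme Widget"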
def char_count(string):
    """Counts of (alpha, digit, whitespace, other) characters in string."""
    d = {'a': 0, 'd': 0, 'w': 0, 's': 0}
    for char in string:
        if char.isalpha():
            d['a'] += 1
        elif char.isdigit():
            d['d'] += 1
        elif char.isspace():
            d['w'] += 1
        else:
            d['s'] += 1
    return [d['a'], d['d'], d['w'], d['s']]
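# Illustrative: char_count("ABC 123!") -> [3, 3, 1, 1]  (alpha, digit, space, other)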
def split_stopwords(split_string):
    """Partition a list of words into (non-stopwords, stopwords)."""
    stop_words = []
    non_stopws = []
    for s in split_string:
        s = s.lower()  # Lower first so the stopword lookup works properly
        if s in STOP:
            stop_words.append(s)
        else:
            non_stopws.append(s)
    return (non_stopws, stop_words)
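# Illustrative: split_stopwords(["the", "Red", "Widget"]) -> (['red', 'widget'], ['the'])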
def dense_matrix(string):
    """Encode a string as (position, ordinal) pairs."""
    return [(c, ord(v)) for c, v in enumerate(string)]

def string(dense_matrix):
    """Inverse of dense_matrix: rebuild the string from (position, ordinal) pairs."""
    return ''.join([chr(c[1]) for c in dense_matrix])
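# Round trip (illustrative): string(dense_matrix("abc")) == "abc"
#   dense_matrix("abc") -> [(0, 97), (1, 98), (2, 99)]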
#####################################################

class Page_Object:
    def __init__(self, i):
        self.title = trim_ends(longest_substring(i['titles']))
        self.page_width = float(i['body']['bound']['width'])
        self.page_height = float(i['body']['bound']['height'])
        self.norm_len_texts = 1/float(len(i['texts']))
        # e.g. "16px" -> 16 (assumes a two-digit pixel value)
        self.norm_def_font_size = 1/float(int(i['body']['computed']['font-size'][:2]))
        #print "\n", self.title
        #[Text_Object((c*self.norm_len_texts), to, self.page_width, self.page_height, self.norm_def_font_size) for c, to in enumerate(i['texts'])]
        # Only the first text node is processed for now
        for c, to in enumerate(i['texts']):
            if c < 1:
                Text_Object((c*self.norm_len_texts), to, self.page_width, self.page_height, self.norm_def_font_size)
class Text_Object:
    def __init__(self, index, text_object, pw, ph, pndfs):
        self.index = index
        self.norm_coords = (text_object['bound']['top']/ph, text_object['bound']['left']/pw)
        self.perc_area = ((text_object['bound']['height'] * text_object['bound']['width'])/(pw * ph))*100
        self.features = self.extract_features(text_object['text'])

    def extract_features(self, text_list):
        features = []
        for string in text_list:
            string = string.strip()
            split_string = string.split()
            if not split_string:
                continue  # Skip empty/whitespace-only strings
            chars_breakdown = char_count(string)
            number_of_words = len(split_string)
            average_word_length = sum(len(s) for s in split_string)/float(len(split_string))
            words, stop_ws = split_stopwords(split_string)  # Avoid shadowing nltk's stopwords
            len_words, len_stopwords = len(words), len(stop_ws)
            word_dm = [dense_matrix(word) for word in words]
            # Collect the per-string features
            features.append((chars_breakdown, number_of_words, average_word_length, len_words, len_stopwords, word_dm))
        return features
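# Minimal usage sketch (hypothetical values, matching the JSON shape assumed above):
#   to = {'bound': {'top': 120.0, 'left': 40.0, 'height': 24.0, 'width': 300.0},
#         'text': ['Acme Widget - only 9.99']}
#   Text_Object(0.0, to, 1280.0, 2000.0, 1/16.0).features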
def main():
    data = [load_data(file) for file in DATA]
    for c, page in enumerate(data):
        page_object = Page_Object(page)

if __name__ == '__main__':
    DATA_ROOT = os.path.realpath(os.path.join(os.path.abspath(os.path.dirname(__file__)), '..', 'data/_extracts/json'))
    files = [os.path.join(DATA_ROOT, file) for file in os.listdir(DATA_ROOT)]
    DATA = files[:3] + files[7:15]
    main()