This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re, os | |
from collections import Counter | |
from symspellpy.symspellpy import SymSpell as SymSpellPy, Verbosity | |
class SpellCheck: | |
def __init__(self, dictionary=None, verbose=0): | |
self.verbose = verbose | |
self.dictionary = dictionary | |
def correction(self, text): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Problem: | |
How to Convert PDF to Image with Python Script ? | |
$ sudo apt-get install libmagickwand-dev | |
$ pip install Wand | |
""" | |
from PIL import Image as Img |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import PIL.Image | |
import PIL.ImageDraw | |
import wand.image | |
import sys, os | |
from io import BytesIO | |
def draw_rect(self, bbox_or_obj, | |
fill=DEFAULT_FILL, | |
stroke=DEFAULT_STROKE, | |
stroke_width=DEFAULT_STROKE_WIDTH): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io | |
from PIL import Image | |
import pytesseract | |
from wand.image import Image as wi | |
pdf = wi(filename = "sample2.pdf", resolution = 300) | |
pdfImage = pdf.convert('jpeg') | |
imageBlobs = [] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class TableFinder(object): | |
""" | |
Given a PDF page, finds table structures. | |
""" | |
def __init__(self, page, settings={}): | |
for k in settings.keys(): | |
if k not in DEFAULT_TABLE_SETTINGS: | |
raise ValueError("Unrecognized table setting: '{0}'".format( | |
k | |
)) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def find_tables(self, table_settings=None):
    """
    Find the tables on this object by running a ``TableFinder`` over it.

    ``table_settings`` is passed through to ``TableFinder`` unchanged.
    When it is omitted (or ``None``), a new empty dict is used.

    Returns the ``tables`` attribute of the constructed ``TableFinder``.
    """
    # A ``{}`` default is created once and shared by every call; build a
    # fresh dict per call so nothing downstream can leak state between calls.
    settings = {} if table_settings is None else table_settings
    return TableFinder(self, settings).tables
def extract_tables(self, table_settings=None):
    """
    Extract every table found on this object.

    ``table_settings`` is forwarded to ``self.find_tables``. When it is
    omitted (or ``None``), a new empty dict is forwarded instead.

    Returns a list with the result of ``extract()`` for each table, in the
    order ``find_tables`` returned them.
    """
    # A ``{}`` default is created once and shared by every call; normalise
    # here so each call hands ``find_tables`` its own dict.
    settings = {} if table_settings is None else table_settings
    return [table.extract() for table in self.find_tables(settings)]
def extract_table(self, table_settings={}): | |
tables = self.find_tables(table_settings) | |
# Return the largest table, as measured by number of cells. |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50): | |
if useTrainCV: | |
xgb_param = alg.get_xgb_params() | |
xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values) | |
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds, | |
metrics='auc', early_stopping_rounds=early_stopping_rounds, show_progress=False) | |
alg.set_params(n_estimators=cvresult.shape[0]) | |
#Fit the algorithm on the data |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Import libraries: | |
import pandas as pd | |
import numpy as np | |
import xgboost as xgb | |
from xgboost.sklearn import XGBClassifier | |
from sklearn import cross_validation, metrics #Additional scikit-learn functions | |
from sklearn.grid_search import GridSearchCV #Performing grid search | |
import matplotlib.pylab as plt | |
%matplotlib inline |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
car : 52.74809002876282 | |
car : 54.43572402000427 | |
car : 61.86940670013428 | |
car : 64.99541997909546 | |
car : 54.53670620918274 | |
car : 54.111236333847046 | |
car : 55.92341423034668 | |
person : 54.37796711921692 | |
person : 61.132240295410156 | |
car : 70.4900324344635 |
NewerOlder