Last active
May 12, 2020 23:49
-
-
Save jackbow/4235f45284b17655a76b8a060876ce13 to your computer and use it in GitHub Desktop.
For turning the Costco inventory PDFs into CSVs with prices for easy ordering. To use, download this file & run `python3 costco.py` in your terminal in the same directory as the file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import string | |
import re | |
import sys | |
print("Upload the Costco pdf to https://ocr.space/, and select table recognition option, OCR Engine1, and save the output text to a txt file in the same directory as this script.") | |
path = input("Next, enter the name of the file: ") | |
f = open(path, 'r') | |
def hasNumbers(inputString): | |
return any(char.isdigit() for char in inputString) | |
def hasAlpha(inputString): | |
return any(char.isalpha() for char in inputString) | |
def split_on_char(s): | |
match = re.compile("[^\W\d]").search(s) | |
return [s[:match.start()], s[match.start():]] | |
def split_on_digit(s): | |
match = re.compile("\d").search(s) | |
return [s[:match.start()], s[match.start():]] | |
def split_on_money(s): | |
match = re.compile("(\d+)?\.?\d+$").search(s) | |
return [s[:match.start()], s[match.start():]] | |
def split_on_newline(s): | |
match = re.compile("\\n").search(s) | |
return [s[:match.start()], s[match.start():]] | |
def splitInsert(listt, idx, opt): | |
if opt == 'c': | |
csplit = split_on_char | |
if opt == 'd': | |
csplit = split_on_digit | |
if opt == 'm': | |
csplit = split_on_money | |
if opt == 'n': | |
csplit = split_on_newline | |
spp = csplit(listt[idx]) | |
listt[idx] = spp[0] | |
listt.insert(idx+1, spp[1]) | |
return listt | |
lines = f.readlines() | |
nlines = [] | |
for i,v in enumerate(lines): | |
if not ((not v[0].isdigit()) or | |
(v[0] == '*') or | |
(v[0:9].count('/', 0, 8) >= 2) or | |
(v.count('icc.coop') > 0)): | |
# split line | |
sp = v.split('\t') | |
# remove trailing spaces | |
for j in range(len(sp)): | |
if sp[j][-1] == ' ': | |
sp[j] = sp[j][0:-1] | |
contains_nums = [True, False, False, True, True, False] | |
contains_alph = [False, True, True, True, False, False] | |
# split bad data | |
if len(sp) != 6: | |
for j in range(len(sp)): | |
if (hasNumbers(sp[j]) != contains_nums[j] or | |
hasAlpha(sp[j]) != contains_alph[j]): | |
if j == 0: | |
sp = splitInsert(sp, j, 'c') | |
if j == 1: | |
sp = splitInsert(sp, j, 'd') | |
if j == 2: | |
if not sp[j][0].isdigit(): | |
sp = splitInsert(sp, j, 'd') | |
elif sp[j][-1].isdigit(): | |
sp = splitInsert(sp, j, 'm') | |
if j == 3: | |
if hasAlpha(sp[j]) and hasNumbers(sp[j]): | |
sp = splitInsert(sp, j, 'm') | |
elif sp[j][-1] == '\n': | |
sp = splitInsert(sp, j, 'n') | |
if len(sp[j]) > 0 and (sp[j][-1] == '\n'): | |
sp = splitInsert(sp, j, 'n') | |
# fill missing data | |
if len(sp) != 6: | |
for j in range(len(sp)): | |
if (hasNumbers(sp[j]) != contains_nums[j] or | |
hasAlpha(sp[j]) != contains_alph[j]): | |
sp.insert(j, ' ') | |
if len(sp) == 6: | |
break | |
# remove spaces in item # & price | |
sp[0] = sp[0].replace(' ', '') | |
sp[-2] = sp[-2].replace(' ', '') | |
nlines.append('\t'.join(sp)) | |
# if index error, run this code, and remove the \t from the relevant part: | |
# `print(v)` | |
# `lines[i] = <the text with the extra \t removed>` | |
# eg. | |
# `print(v): '843142\tLOG CABIN\tSYRUP\t2/64 OUNCE CONTAINERS\t6\t.99\t\n'` | |
# `lines[i] = '843142\tLOG CABIN\tSYRUP\t2/64 OUNCE CONTAINERS\t6.99\t\n'` | |
file = open('costco_inventory.csv', 'w') | |
file.writelines(nlines) | |
print("Number of items found: " + str(len(nlines))) | |
print("Output saved to costco_inventory.csv") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment