Skip to content

Instantly share code, notes, and snippets.

Last active May 12, 2020 23:49
Show Gist options
  • Save jackbow/4235f45284b17655a76b8a060876ce13 to your computer and use it in GitHub Desktop.
Save jackbow/4235f45284b17655a76b8a060876ce13 to your computer and use it in GitHub Desktop.
Turns the Costco inventory PDFs into CSVs with prices for easy ordering. To use, download this file and run it with `python3 <name of this file>` from a terminal opened in the same directory as the file.
import string
import re
import sys
# Explain the manual OCR step, then open the text file the user names.
# NOTE(review): the instruction text reads "Upload the Costco pdf to," with no
# destination -- a site name or URL appears to be missing; confirm and restore.
print("Upload the Costco pdf to, and select table recognition option, OCR Engine1, and save the output text to a txt file in the same directory as this script.")
path = input("Next, enter the name of the file: ")
try:
    # `f` is left open on purpose: the conversion code further down reads it.
    f = open(path, 'r')
except OSError as err:
    # A mistyped or missing file name is the likely failure here; exit with a
    # readable message instead of a traceback.
    sys.exit("Could not open {!r}: {}".format(path, err))
def hasNumbers(inputString):
    """Return True if at least one character of ``inputString`` is a digit.

    Each character is tested with ``str.isdigit``; an empty string yields
    False.
    """
    return any(char.isdigit() for char in inputString)
def hasAlpha(inputString):
    """Return True if at least one character of ``inputString`` is a letter.

    Each character is tested with ``str.isalpha``; an empty string yields
    False.
    """
    return any(char.isalpha() for char in inputString)
def split_on_char(s):
    """Split ``s`` in two at its first letter (or underscore).

    Returns ``[before, rest]`` where ``rest`` starts at the first character
    matching ``[^\\W\\d]`` (a word character that is not a digit).  If ``s``
    contains no such character the whole string is returned as ``before`` and
    ``rest`` is the empty string, instead of raising AttributeError.
    """
    # Raw string: "\W" / "\d" in a normal literal are invalid escape sequences.
    match = re.search(r"[^\W\d]", s)
    if match is None:
        return [s, ""]
    cut = match.start()
    return [s[:cut], s[cut:]]
def split_on_digit(s):
    """Split ``s`` in two at its first digit.

    Returns ``[before, rest]`` where ``rest`` starts at the first character
    matching ``\\d``.  If ``s`` contains no digit the whole string is returned
    as ``before`` and ``rest`` is the empty string, instead of raising
    AttributeError.
    """
    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    match = re.search(r"\d", s)
    if match is None:
        return [s, ""]
    cut = match.start()
    return [s[:cut], s[cut:]]
def split_on_money(s):
    """Split ``s`` in two where a trailing number (e.g. a price) begins.

    The trailing number is whatever ``(\\d+)?\\.?\\d+$`` matches: optional
    digits, an optional dot, then digits up to the end of the string (``$``
    also matches just before a final newline).  Returns ``[before, number]``;
    ``number`` keeps any trailing newline.  If ``s`` does not end in a number
    the whole string is returned as ``before`` and the second element is the
    empty string, instead of raising AttributeError.
    """
    # Raw string: "\d" / "\." in a normal literal are invalid escape sequences.
    match = re.search(r"(\d+)?\.?\d+$", s)
    if match is None:
        return [s, ""]
    cut = match.start()
    return [s[:cut], s[cut:]]
def split_on_newline(s):
    """Split ``s`` in two at its first newline.

    Returns ``[before, rest]`` where ``rest`` starts with the newline itself.
    If ``s`` contains no newline the whole string is returned as ``before``
    and ``rest`` is the empty string, instead of raising AttributeError.
    """
    # A literal character needs no regex; partition leaves sep/tail empty
    # when the newline is absent.
    before, newline, after = s.partition("\n")
    return [before, newline + after]
def splitInsert(listt, idx, opt):
    """Split ``listt[idx]`` into two entries, in place, and return ``listt``.

    ``opt`` selects where the entry is cut:
        'c' -> split_on_char     'd' -> split_on_digit
        'm' -> split_on_money    'n' -> split_on_newline

    The first piece replaces ``listt[idx]`` and the second piece is inserted
    at ``idx + 1``, so the list grows by one element.

    Raises:
        ValueError: if ``opt`` is not one of the four codes above (this used
            to surface as an UnboundLocalError on ``csplit``).
    """
    splitters = {
        'c': split_on_char,
        'd': split_on_digit,
        'm': split_on_money,
        'n': split_on_newline,
    }
    try:
        csplit = splitters[opt]
    except KeyError:
        raise ValueError("unknown split option: {!r}".format(opt)) from None
    head, tail = csplit(listt[idx])
    listt[idx] = head
    listt.insert(idx + 1, tail)
    return listt
# Main conversion pass: read the OCR text and try to normalise each
# tab-separated row to six fields.
# NOTE(review): this copy of the script has lost its indentation, so the
# nesting of the statements below has to be restored before it will run.
lines = f.readlines()
# NOTE(review): nothing visible in this section appends to `nlines`; the
# statement that collects finished rows appears to be missing -- confirm.
nlines = []
for i,v in enumerate(lines):
# Row filter: a line is processed only if it starts with a digit, does not
# start with '*', and has fewer than two '/' among its first eight characters
# (presumably to skip date-like lines -- confirm).
# NOTE(review): `v.count('')` counts the empty string, which is always
# len(v) + 1 (> 0), so as written the whole condition is False for every line.
# A character seems to have been lost from that literal -- confirm and restore.
if not ((not v[0].isdigit()) or
(v[0] == '*') or
(v[0:9].count('/', 0, 8) >= 2) or
(v.count('') > 0)):
# Split the line into fields on tabs.
sp = v.split('\t')
# Remove one trailing space from each field.
# NOTE(review): `sp[j][-1]` raises IndexError on an empty field (e.g. two
# consecutive tabs).
for j in range(len(sp)):
if sp[j][-1] == ' ':
sp[j] = sp[j][0:-1]
# Expected shape of the six fields: whether each one should contain digits /
# letters. Matches the example row in the comment near the end of this section
# (item #, two text fields, pack description, price, trailing field).
contains_nums = [True, False, False, True, True, False]
contains_alph = [False, True, True, True, False, False]
# Split bad data: when the row does not have six fields, cut any field whose
# digit/letter profile does not match its expected column.
# NOTE(review): range(len(sp)) is evaluated once, and contains_nums[j] /
# contains_alph[j] raise IndexError for j >= 6 (a row with more than six
# fields).
if len(sp) != 6:
for j in range(len(sp)):
if (hasNumbers(sp[j]) != contains_nums[j] or
hasAlpha(sp[j]) != contains_alph[j]):
# Field 0 should be digits only: cut at the first letter.
if j == 0:
sp = splitInsert(sp, j, 'c')
# Field 1 should be letters only: cut at the first digit.
if j == 1:
sp = splitInsert(sp, j, 'd')
# Field 2: cut at the first digit if it starts with a non-digit, otherwise
# peel a trailing number off the end.
if j == 2:
if not sp[j][0].isdigit():
sp = splitInsert(sp, j, 'd')
elif sp[j][-1].isdigit():
sp = splitInsert(sp, j, 'm')
# Field 3: if it mixes letters and digits, peel a trailing number off the end;
# otherwise, if it ends in a newline, split the newline into its own field.
if j == 3:
if hasAlpha(sp[j]) and hasNumbers(sp[j]):
sp = splitInsert(sp, j, 'm')
elif sp[j][-1] == '\n':
sp = splitInsert(sp, j, 'n')
# Split a trailing newline off a non-empty field.
# NOTE(review): which loop/condition this belongs to cannot be determined
# from this copy -- confirm nesting.
if len(sp[j]) > 0 and (sp[j][-1] == '\n'):
sp = splitInsert(sp, j, 'n')
# Fill missing data: if the row is still not six fields, insert a single-space
# placeholder before each field whose profile does not match its column.
if len(sp) != 6:
for j in range(len(sp)):
if (hasNumbers(sp[j]) != contains_nums[j] or
hasAlpha(sp[j]) != contains_alph[j]):
sp.insert(j, ' ')
if len(sp) == 6:
# Remove spaces in the item # (first field) and the price (second-to-last).
sp[0] = sp[0].replace(' ', '')
sp[-2] = sp[-2].replace(' ', '')
# If an IndexError is raised, print the offending row and replace it with the
# same text minus the extra \t:
# `print(v)`
# `lines[i] = <the text with the extra \t removed>`
# e.g.
# `print(v): '843142\tLOG CABIN\tSYRUP\t2/64 OUNCE CONTAINERS\t6\t.99\t\n'`
# `lines[i] = '843142\tLOG CABIN\tSYRUP\t2/64 OUNCE CONTAINERS\t6.99\t\n'`
# NOTE(review): the output file is opened here but nothing visible writes to
# it or closes it; the CSV-writing statements appear to be missing -- confirm.
file = open('costco_inventory.csv', 'w')
print("Number of items found: " + str(len(nlines)))
print("Output saved to costco_inventory.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment