jackbow · May 12, 2020 23:49
diff --git a/costco.py b/costco.py
 import string
 import re
 import sys

 print("Upload the Costco pdf to https://ocr.space/, and select table recognition option, OCR Engine1, and save the output text to a txt file in the same directory as this script.")

 path = input("Next, enter the name of the file: ")
 f = open(path, 'r')

 def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

 def hasAlpha(inputString):
    return any(char.isalpha() for char in inputString)

 def split_on_char(s):
    match = re.compile("[^\W\d]").search(s)
    return [s[:match.start()], s[match.start():]]

 def split_on_digit(s):
    match = re.compile("\d").search(s)
    return [s[:match.start()], s[match.start():]]

 def split_on_money(s):
    match = re.compile("(\d+)?\.?\d+$").search(s)
    return [s[:match.start()], s[match.start():]]

 def split_on_newline(s):
    match = re.compile("\\n").search(s)
    return [s[:match.start()], s[match.start():]]

 def splitInsert(listt, idx, opt):
    if opt == 'c':
        csplit = split_on_char
    if opt == 'd':
        csplit = split_on_digit
    if opt == 'm':
        csplit = split_on_money
    if opt == 'n':
        csplit = split_on_newline
    spp = csplit(listt[idx])
    listt[idx] = spp[0]
    listt.insert(idx+1, spp[1])
    return listt

 lines = f.readlines()
 nlines = []
 for i,v in enumerate(lines):
    if not ((not v[0].isdigit()) or 
            (v[0] == '*') or
            (v[0:9].count('/', 0, 8) >= 2) or
            (v.count('icc.coop') > 0)):
        
        # split line
        sp = v.split('\t')
        
        # remove trailing spaces
        for j in range(len(sp)):
            if sp[j][-1] == ' ':
                sp[j] = sp[j][0:-1]
        
        contains_nums = [True, False, False, True, True, False]
        contains_alph = [False, True, True, True, False, False]
        
        # split bad data
        if len(sp) != 6:
            for j in range(len(sp)):
                if (hasNumbers(sp[j]) != contains_nums[j] or
                   hasAlpha(sp[j]) != contains_alph[j]):
                    if j == 0:
                        sp = splitInsert(sp, j, 'c')
                    if j == 1:
                        sp = splitInsert(sp, j, 'd')
                    if j == 2:
                        if not sp[j][0].isdigit():
                            sp = splitInsert(sp, j, 'd')
                        elif sp[j][-1].isdigit():
                            sp = splitInsert(sp, j, 'm')
                    if j == 3:
                        if hasAlpha(sp[j]) and hasNumbers(sp[j]):
                            sp = splitInsert(sp, j, 'm')
                        elif sp[j][-1] == '\n':
                            sp = splitInsert(sp, j, 'n')
                if len(sp[j]) > 0 and (sp[j][-1] == '\n'):
                    sp = splitInsert(sp, j, 'n')
                            
        # fill missing data
        if len(sp) != 6:
            for j in range(len(sp)):
                if (hasNumbers(sp[j]) != contains_nums[j] or
                   hasAlpha(sp[j]) != contains_alph[j]):
                    sp.insert(j, ' ')
                    if len(sp) == 6:
                        break
                        
        # remove spaces in item # & price
        sp[0] = sp[0].replace(' ', '')
        sp[-2] = sp[-2].replace(' ', '')
        
        nlines.append('\t'.join(sp))

 # if index error, run this code, and remove the \t from the relevant part:
 # `print(v)`
 # `lines[i] = <the text with the extra \t removed>`
 # eg.
 # `print(v): '843142\tLOG CABIN\tSYRUP\t2/64 OUNCE CONTAINERS\t6\t.99\t\n'`
 # `lines[i] = '843142\tLOG CABIN\tSYRUP\t2/64 OUNCE CONTAINERS\t6.99\t\n'`

 file = open('costco_inventory.csv', 'w')
 file.writelines(nlines)

 print("Number of items found: " + str(len(nlines)))
 print("Output saved to costco_inventory.csv")
	import string
	import re
	import sys

	print("Upload the Costco pdf to https://ocr.space/, and select table recognition option, OCR Engine1, and save the output text to a txt file in the same directory as this script.")

	path = input("Next, enter the name of the file: ")
	f = open(path, 'r')

	def hasNumbers(inputString):
	return any(char.isdigit() for char in inputString)

	def hasAlpha(inputString):
	return any(char.isalpha() for char in inputString)

	def split_on_char(s):
	match = re.compile("[^\W\d]").search(s)
	return [s[:match.start()], s[match.start():]]

	def split_on_digit(s):
	match = re.compile("\d").search(s)
	return [s[:match.start()], s[match.start():]]

	def split_on_money(s):
	match = re.compile("(\d+)?\.?\d+$").search(s)
	return [s[:match.start()], s[match.start():]]

	def split_on_newline(s):
	match = re.compile("\\n").search(s)
	return [s[:match.start()], s[match.start():]]

	def splitInsert(listt, idx, opt):
	if opt == 'c':
	csplit = split_on_char
	if opt == 'd':
	csplit = split_on_digit
	if opt == 'm':
	csplit = split_on_money
	if opt == 'n':
	csplit = split_on_newline
	spp = csplit(listt[idx])
	listt[idx] = spp[0]
	listt.insert(idx+1, spp[1])
	return listt

	lines = f.readlines()
	nlines = []
	for i,v in enumerate(lines):
	if not ((not v[0].isdigit()) or
	(v[0] == '*') or
	(v[0:9].count('/', 0, 8) >= 2) or
	(v.count('icc.coop') > 0)):

	# split line
	sp = v.split('\t')

	# remove trailing spaces
	for j in range(len(sp)):
	if sp[j][-1] == ' ':
	sp[j] = sp[j][0:-1]

	contains_nums = [True, False, False, True, True, False]
	contains_alph = [False, True, True, True, False, False]

	# split bad data
	if len(sp) != 6:
	for j in range(len(sp)):
	if (hasNumbers(sp[j]) != contains_nums[j] or
	hasAlpha(sp[j]) != contains_alph[j]):
	if j == 0:
	sp = splitInsert(sp, j, 'c')
	if j == 1:
	sp = splitInsert(sp, j, 'd')
	if j == 2:
	if not sp[j][0].isdigit():
	sp = splitInsert(sp, j, 'd')
	elif sp[j][-1].isdigit():
	sp = splitInsert(sp, j, 'm')
	if j == 3:
	if hasAlpha(sp[j]) and hasNumbers(sp[j]):
	sp = splitInsert(sp, j, 'm')
	elif sp[j][-1] == '\n':
	sp = splitInsert(sp, j, 'n')
	if len(sp[j]) > 0 and (sp[j][-1] == '\n'):
	sp = splitInsert(sp, j, 'n')

	# fill missing data
	if len(sp) != 6:
	for j in range(len(sp)):
	if (hasNumbers(sp[j]) != contains_nums[j] or
	hasAlpha(sp[j]) != contains_alph[j]):
	sp.insert(j, ' ')
	if len(sp) == 6:
	break

	# remove spaces in item # & price
	sp[0] = sp[0].replace(' ', '')
	sp[-2] = sp[-2].replace(' ', '')

	nlines.append('\t'.join(sp))

	# if index error, run this code, and remove the \t from the relevant part:
	# `print(v)`
	# `lines[i] = <the text with the extra \t removed>`
	# eg.
	# `print(v): '843142\tLOG CABIN\tSYRUP\t2/64 OUNCE CONTAINERS\t6\t.99\t\n'`
	# `lines[i] = '843142\tLOG CABIN\tSYRUP\t2/64 OUNCE CONTAINERS\t6.99\t\n'`

	file = open('costco_inventory.csv', 'w')
	file.writelines(nlines)

	print("Number of items found: " + str(len(nlines)))
	print("Output saved to costco_inventory.csv")
No results found