Skip to content

Instantly share code, notes, and snippets.

@tbmreza
Created May 15, 2019 07:47
Show Gist options
  • Save tbmreza/5e98c2d4cc24bf5ef9294b93713c876a to your computer and use it in GitHub Desktop.
Save tbmreza/5e98c2d4cc24bf5ef9294b93713c876a to your computer and use it in GitHub Desktop.
from collections import namedtuple
import re
''' TXT structure sample:
"PROVINSI"(489,185),(652,185),(652,219),(489,219)
"JAWA"(658,185),(747,185),(747,219),(658,219)
"BARAT"(758,185),(866,185),(866,219),(758,219)
"KABUPATEN"(508,220),(704,220),(704,252),(508,252)
"BEKASI"(724,220),(848,220),(848,252),(724,252)
"NIK"(187,266),(256,266),(256,303),(187,303)
":"(377,260),(391,260),(391,311),(377,311)
'''
# Build `pairs_list`: one [lower-cased text, Bounding(x0, x1, y0, y1)] entry
# per detected word read from the cleaned OCR output file.
gcloud_output_clean_path = 'path_to_txt'

text_pattern = r'"(.+)"' # Detects string between quotes.
xy_pattern = r'(\d+)' # Detects numbers.

# Loop invariants: compile the patterns and create the tuple class only once,
# so every row shares the same `Bounding` type.
text_regex = re.compile(text_pattern)
xy_regex = re.compile(xy_pattern)
To_namedtuple = namedtuple('Bounding', 'x0 x1 y0 y1')

with open(gcloud_output_clean_path, "r") as the_file:
    lines = the_file.readlines()

pairs_list = []
for line_number, each_line in enumerate(lines, start=1):
    # Blank lines (typically a trailing newline at end of file) carry no data.
    if not each_line.strip():
        continue

    text_match = text_regex.search(each_line)
    if text_match is None:
        raise ValueError(
            'line {}: no quoted text found: {!r}'.format(line_number, each_line)
        )
    texts = text_match.group(1)

    # Delete the quoted text before looking for numbers, so that digits
    # inside the detected text are not mistaken for coordinates.
    points = xy_regex.findall(text_regex.sub('', each_line))
    if len(points) < 6:
        raise ValueError(
            'line {}: expected at least 6 coordinate values, found {}: {!r}'.format(
                line_number, len(points), each_line
            )
        )

    # Raw coordinate data is represented as [(x0,y0),(x1,y0),(x1,y1),(x0,y1)],
    # so after flattening, x0, x1, y0, y1 sit at indexes 0, 4, 1, 5.
    points_tuple = To_namedtuple(
        int(points[0]), int(points[4]), int(points[1]), int(points[5])
    )

    if len(texts) > 1: # Omit single character detected text.
        pairs_list.append([texts.lower(), points_tuple])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment