Created
May 15, 2019 07:47
-
-
Save tbmreza/5e98c2d4cc24bf5ef9294b93713c876a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Parse OCR word/bounding-box lines into ``[text, Bounding]`` pairs.

Each input line holds one detected text in double quotes followed by the
four corner points of its bounding box (presumably cleaned Google Cloud
OCR output, judging by the path variable name -- confirm with the producer
of the file).

TXT structure sample:
    "PROVINSI"(489,185),(652,185),(652,219),(489,219)
    "JAWA"(658,185),(747,185),(747,219),(658,219)
    "BARAT"(758,185),(866,185),(866,219),(758,219)
    "KABUPATEN"(508,220),(704,220),(704,252),(508,252)
    "BEKASI"(724,220),(848,220),(848,252),(724,252)
    "NIK"(187,266),(256,266),(256,303),(187,303)
    ":"(377,260),(391,260),(391,311),(377,311)
"""
from collections import namedtuple
import re

# Axis-aligned bounding box: left/right x and top/bottom y of a detected text.
Bounding = namedtuple('Bounding', 'x0 x1 y0 y1')

# Compiled once instead of on every loop iteration.
# Greedy on purpose: spans from the first to the last quote on the line, so
# a detected text that itself contains a quote character is kept whole.
_TEXT_PATTERN = re.compile(r'"(.+)"')
_NUMBER_PATTERN = re.compile(r'(\d+)')

gcloud_output_clean_path = 'path_to_txt'


def parse_line(line):
    """Parse one line into ``(text, Bounding)``.

    Args:
        line: A line such as ``'"NIK"(187,266),(256,266),(256,303),(187,303)'``.

    Returns:
        ``(text, Bounding)`` with the text in its original case, or ``None``
        when the line is blank.

    Raises:
        ValueError: If a non-blank line has no quoted text or fewer than six
            coordinate numbers.
    """
    if not line.strip():
        return None

    match = _TEXT_PATTERN.search(line)
    if match is None:
        raise ValueError('no quoted text in line: %r' % line)
    text = match.group(1)

    # Delete the quoted text first so digits inside it are not mistaken
    # for coordinates.
    remainder = _TEXT_PATTERN.sub('', line)
    points = _NUMBER_PATTERN.findall(remainder)
    if len(points) < 6:
        raise ValueError('expected at least 6 coordinates in line: %r' % line)

    # Raw coordinates are laid out as (x0,y0),(x1,y0),(x1,y1),(x0,y1), so
    # x0, x1, y0, y1 sit at flat positions 0, 4, 1, 5 respectively.
    bounding = Bounding(
        int(points[0]), int(points[4]), int(points[1]), int(points[5]))
    return text, bounding


def parse_lines(lines):
    """Build the list of ``[lowercased_text, Bounding]`` pairs.

    Blank lines are skipped, and so are detected texts that are a single
    character long.

    Args:
        lines: Iterable of input lines (an open text file works).

    Returns:
        A list of two-item lists ``[text.lower(), Bounding]`` in input order.
    """
    pairs = []
    for line in lines:
        parsed = parse_line(line)
        if parsed is None:
            continue
        text, bounding = parsed
        # Length is checked on the original text, before lowercasing.
        if len(text) > 1:  # Omit single character detected text.
            pairs.append([text.lower(), bounding])
    return pairs


def load_pairs(path):
    """Read the text file at *path* and return its parsed pairs."""
    with open(path, "r") as the_file:
        return parse_lines(the_file)


# Guarded so importing this module does not read the file; when run as a
# script, ``pairs_list`` is still defined at module scope as before.
if __name__ == "__main__":
    pairs_list = load_pairs(gcloud_output_clean_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment