Skip to content

Instantly share code, notes, and snippets.

@matbee-eth
Last active May 1, 2024 20:16
Show Gist options
  • Save matbee-eth/15d858b65c2053c390914e5a2e818735 to your computer and use it in GitHub Desktop.
Save matbee-eth/15d858b65c2053c390914e5a2e818735 to your computer and use it in GitHub Desktop.
Parser for the CSV dataset at google-research-datasets/screen_annotation
(base) ➜ screen_annotation git:(main) ✗ python parse_dataset_entry.py
[('TOOLBAR', 0, 999, 31, 109, [('Inbox', 40, 158, 54, 85)]), ('CHATROOMS', 0, 333, 108, 171), ('NAVIGATION_BAR', 0, 998, 109, 171, [('DIRECT', 331, 664, 109, 171), ('CONTACTS', 672, 998, 110, 173)]), ('Inbox', 193, 409, 194, 222), ('Message', 273, 539, 226, 252), ('now', 904, 965, 202, 222), ('check', 903, 962, 228, 254), ('Friends', 391, 605, 346, 371)]
[('LIST_ITEM', 30, 482, 16, 232, [('Memoirs', 123, 398, 165, 220)]), ('LIST_ITEM', 500, 966, 16, 221, [('Comics', 660, 812, 165, 197)]), ('LIST_ITEM', 26, 487, 252, 556, [('gallery', 65, 452, 265, 488), ('Investing', 77, 450, 505, 536)]), ('LIST_ITEM', 512, 966, 254, 556, [('Spanish', 576, 897, 503, 536)]), ('LIST_ITEM', 19, 488, 591, 863, [('stop', 63, 454, 601, 827)]), ('LIST_ITEM', 509, 976, 589, 861, [('heart', 537, 939, 602, 823), ('TEXT', 539, 915, 843, 859)]), ('SKIP', 0, 167, 858, 935), ('PAGER_INDICATOR', 444, 547, 880, 910), ('NAVIGATION_BAR', 0, 998, 936, 999, [])]
[('TOOLBAR', 0, 999, 31, 104, [('Me', 285, 705, 37, 72), ('Summer', 2011, 229, 750, 75)]), ('mistletoe', 0, 994, 104, 725), ('LIST_ITEM', 0, 998, 727, 795, [('facebook', 27, 109, 737, 783), ('Facebook', 122, 298, 744, 776)]), ('LIST_ITEM', 0, 998, 794, 863, [('twitter', 25, 109, 804, 852), ('Twitter', 127, 251, 813, 844)]), ('LIST_ITEM', 0, 998, 863, 931, [('Apps', 124, 324, 882, 914)])]
[('Works', 193, 809, 645, 682), ('NAVIGATION_BAR', 0, 998, 936, 999, [])]
import re
# Commas that separate top-level elements: a comma is a separator unless the
# next parenthesis after it is a closing one (i.e. it sits inside "( ... )").
_TOP_LEVEL_COMMA = re.compile(r',\s*(?![^()]*\))')

# "<word> n n n n" at the end of an element, optionally followed by a
# parenthesised, comma-separated list of child elements.
_ELEMENT = re.compile(
    r'(\w+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)(?:\s+\((.*)\))?$'
)

# Same shape for a child element.  It is anchored to the end of the part so
# that the *last* four integers are taken as the box; without the anchor a
# number inside the text (e.g. "Summer 2011 229 750 75 97") shifts the box.
_CHILD = re.compile(r'(\w+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)$')

# Children inside the parentheses are separated by plain commas.
_CHILD_COMMA = re.compile(r',\s*')


def _box(match):
    """Return ``(label, n1, n2, n3, n4)`` from groups 1-5 of *match*."""
    label, *numbers = match.group(1, 2, 3, 4, 5)
    return (label, *map(int, numbers))


def parse_dataset_entry(entry):
    """Parse one screen-annotation string into a list of tuples.

    *entry* is a comma-separated list of elements.  Each element ends with
    four integers, optionally followed by a parenthesised list of child
    elements, e.g. ``"TOOLBAR 0 999 31 109 (TEXT Inbox 40 158 54 85)"``.

    The label of an element is the single word (``\\w+``) immediately
    before its four integers, so ``"TEXT Team Inbox 193 409 194 222"``
    yields the label ``"Inbox"``.  Elements whose token before the integers
    is not a word (for example a caption ending in ``" ."`` or ``" >"``)
    do not match and are skipped.

    NOTE(review): the sample data suggests the four integers are
    ``x_min, x_max, y_min, y_max`` on a 0-999 grid -- confirm against the
    dataset documentation.  They are returned in source order, unmodified.

    Returns:
        A list with one tuple per matched top-level element:
        ``(label, n1, n2, n3, n4)`` when the element has no (or empty)
        parentheses, otherwise ``(label, n1, n2, n3, n4, children)`` where
        ``children`` is a possibly empty list of ``(label, n1, n2, n3, n4)``
        tuples for the children that matched.
    """
    structured_entries = []
    for element in _TOP_LEVEL_COMMA.split(entry):
        # Strip so stray whitespace/newlines cannot defeat the `$` anchor.
        match = _ELEMENT.search(element.strip())
        if not match:
            continue

        box = _box(match)
        details = match.group(6)
        if not details:
            # No parentheses, or empty "()": plain five-field tuple.
            structured_entries.append(box)
            continue

        children = []
        for part in _CHILD_COMMA.split(details):
            child = _CHILD.search(part.strip())
            if child:
                children.append(_box(child))
        structured_entries.append(box + (children,))
    return structured_entries
# Sample annotation strings used to exercise parse_dataset_entry.  Each string
# is a comma-separated list of UI elements; an element may carry a
# parenthesised list of child elements.
entries = [
    "TOOLBAR 0 999 31 109 (TEXT Inbox 40 158 54 85, PICTOGRAM a white speech bubble with a plus sign on a blue background . 671 789 39 102, PICTOGRAM a magnifying glass icon on a blue background . 787 906 39 101, PICTOGRAM three white circles are lined up on a blue background . 908 999 39 101), BUTTON CHATROOMS 0 333 108 171, NAVIGATION_BAR 0 998 109 171 (BUTTON DIRECT 331 664 109 171, BUTTON CONTACTS 672 998 110 173), PICTOGRAM a green speech bubble with two white circles inside of it is in a yellow circle . 29 166 184 260, TEXT Team Inbox 193 409 194 222, PICTOGRAM three yellow stars are shining on a white background . 412 462 193 223, TEXT You: 199 267 226 251, TEXT Picture Message 273 539 226 252, TEXT now 904 965 202 222, PICTOGRAM check 903 962 228 254, TEXT Inbox is more fun with friends! 258 736 319 345, BUTTON Invite Friends 391 605 346 371",
    "LIST_ITEM 30 482 16 232 (PICTOGRAM a blue circle with a white check mark in the middle . 75 458 23 148, TEXT Biographies & Memoirs 123 398 165 220), LIST_ITEM 500 966 16 221 (PICTOGRAM a circle with two bubbles in it on a white background . 527 935 27 148, TEXT Comics 660 812 165 197), LIST_ITEM 26 487 252 556 (PICTOGRAM gallery 65 452 265 488, TEXT Business & Investing 77 450 505 536), LIST_ITEM 512 966 254 556 (PICTOGRAM a book with a foreign language on it is in a circle . 538 932 265 490, TEXT Books in Spanish 576 897 503 536), LIST_ITEM 19 488 591 863 (PICTOGRAM stop 63 454 601 827), LIST_ITEM 509 976 589 861 (PICTOGRAM heart 537 939 602 823, TEXT 539 915 843 859), BUTTON SKIP 0 167 858 935, PAGER_INDICATOR 444 547 880 910, NAVIGATION_BAR 0 998 936 999 (PICTOGRAM a white triangle on a black background is a very small triangle . 185 256 945 983, PICTOGRAM a white circle on a black background . 462 532 945 984, PICTOGRAM a white square on a black background . 744 815 947 984), BUTTON NEXT > 766 998 863 932",
    "PICTOGRAM a white arrow pointing to the left on a black background . 0 118 31 104, TOOLBAR 0 999 31 104 (TEXT That Should Be Me 285 705 37 72, TEXT Justin Bieber/Power Hits Summer 2011 229 750 75 97, PICTOGRAM a white music note with three lines on a black background . 864 998 34 102), IMAGE a poster for justin bieber 's under the mistletoe 0 994 104 725, LIST_ITEM 0 998 727 795 (PICTOGRAM facebook 27 109 737 783, TEXT Facebook 122 298 744 776), LIST_ITEM 0 998 794 863 (PICTOGRAM twitter 25 109 804 852, TEXT Twitter 127 251 813 844), LIST_ITEM 0 998 863 931 (PICTOGRAM a blue square with three white dots on it . 27 109 873 921, TEXT Other Apps 124 324 882 914)",
    "TEXT An error occurred while initializing the YouTube player. 75 916 438 470, BUTTON How Drivewyze PreClear Works 193 809 645 682, NAVIGATION_BAR 0 998 936 999 (PICTOGRAM a white triangle on a black background is a triangle icon . 182 257 944 984, PICTOGRAM a white circle on a black background . 460 536 944 984, PICTOGRAM a white square on a black background . 743 819 945 984)",
]

# Run the demo only when executed as a script, and print one parsed list per
# sample: a bare list comprehension here would build the results and then
# discard them, producing no output.
if __name__ == "__main__":
    for entry in entries:
        print(parse_dataset_entry(entry))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment