Last active
October 18, 2023 15:20
-
-
Save AntonFriberg/d3440c7d6f4d57d8ac64cb70b4aefd4f to your computer and use it in GitHub Desktop.
Extract key value pairs from string with quotes in Python 3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Extract key value pairs in Python 3 using shlex and regex.""" | |
import re | |
import shlex | |
def regex_kv_pairs(text, item_sep=r"\s", value_sep="="):
    """
    Parse key-value pairs from a shell-like text with regex.

    The original author reports this approach to be roughly 25 times faster
    than the shlex approach (not measured here).

    Parameters:
        text: the string to scan for ``key<value_sep>value`` entries.
        item_sep: regular-expression fragment that terminates an entry
            (whitespace by default). It is inserted into the pattern as-is.
        value_sep: literal string separating a key from its value. It is
            escaped, so regex metacharacters in it are matched literally.

    A value may be wrapped in single or double quotes; the wrapping quotes
    are removed, but any other quote characters inside the value are kept
    and backslash escapes are not interpreted. When a key occurs more than
    once, the last occurrence wins.

    Returns a dict with the keys and values from the text input
    """
    # The pattern is assembled from pieces instead of patching a template with
    # str.replace(): replacing every "=" also rewrote the "=" of the
    # "(?P=quote)" backreference, which made any non-default value_sep raise
    # re.error. re.VERBOSE is deliberately not used so that a literal space or
    # "#" inside item_sep is not discarded by the regex compiler.
    pattern = "".join(
        (
            r"(?P<key>[\w\-]+)",  # key: word characters and '-'
            re.escape(value_sep),  # literal key/value separator
            r"""(?P<quote>["']?)""",  # optional opening quote
            r"(?P<value>[\S\s]*?)",  # value: shortest possible match
            r"(?P=quote)",  # closing quote equals the opening one
            rf"(?:$|(?:{item_sep}))",  # entry ends at item_sep or end of text
        )
    )
    return {
        match.group("key"): match.group("value")
        for match in re.finditer(pattern, text)
    }
def parse_kv_pairs(text, item_sep=" ", value_sep="="):
    """
    Parse key-value pairs from a shell-like text with shlex.

    This approach has behavior very similar to the standard shell parsing:
    quotes group characters into one value and, in POSIX mode, backslash
    escapes are honoured.

    Parameters:
        text: the string to split into ``key<value_sep>value`` words.
        item_sep: characters that separate one pair from the next.
        value_sep: literal string separating a key from its value; only the
            first occurrence in a word is used, so the value may contain it.

    Returns a dict with the keys and values from the text input. When a key
    occurs more than once, the last occurrence wins.

    Raises ValueError when a word does not contain value_sep.
    """
    # initialize a lexer, in POSIX mode (to properly handle escaping)
    lexer = shlex.shlex(text, posix=True)
    # only the characters in item_sep separate words
    lexer.whitespace = item_sep
    # Split on whitespace only, so every other unquoted character (value_sep,
    # punctuation, non-ASCII letters, ...) stays inside the current word.
    # This replaces a hand-maintained list of extra word characters that
    # broke on anything it did not list, such as ',' or ';'.
    lexer.whitespace_split = True
    # shlex checks comment characters before word characters, so an unquoted
    # '#' used to silently drop the rest of the line. Disable comments so '#'
    # is an ordinary character, as shlex.split() does by default.
    lexer.commenters = ""

    pairs = {}
    for word in lexer:
        # partition splits on the first value_sep only, so a value_sep inside
        # the value is not a problem
        key, found_sep, value = word.partition(value_sep)
        if not found_sep:
            raise ValueError(
                f"expected 'key{value_sep}value' but got {word!r}"
            )
        pairs[key] = value
    return pairs
# Sample inputs: a plain pair list with a quoted value, and a syslog-ng style
# message whose double-quoted value spans two lines and contains single quotes.
SIMPLE = "key1=value1 key2='value2,still_value2,not_key1=\"not_value1\"'"
ADVANCED = '''subsystem="syslog-ng" message="I/O error occurred while writing;
fd='20', error='Invalid argument (22)'"'''

# Both parsers strip only the quotes that wrap a value; quote characters nested
# inside the value and the embedded newline are kept.
SHLEX_SIMPLE = parse_kv_pairs(SIMPLE)
# {
#   'key1': 'value1',
#   'key2': 'value2,still_value2,not_key1="not_value1"'
# }
REGEX_SIMPLE = regex_kv_pairs(SIMPLE)
# {
#   'key1': 'value1',
#   'key2': 'value2,still_value2,not_key1="not_value1"'
# }
SHLEX_ADVANCED = parse_kv_pairs(ADVANCED)
# {
#   'subsystem': 'syslog-ng',
#   'message': "I/O error occurred while writing;\nfd='20', error='Invalid argument (22)'"
# }
REGEX_ADVANCED = regex_kv_pairs(ADVANCED)
# {
#   'subsystem': 'syslog-ng',
#   'message': "I/O error occurred while writing;\nfd='20', error='Invalid argument (22)'"
# }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, thank you for the valuable code. However, I am dealing with this
case:
{'CPU': 'i7 13700H 2.4GHz', 'RAM': '16 GB DDR5 2 khe (1 khe 8 GB + 1 khe 8 GB) 5200 MHz', 'Ổ cứng': 'Hỗ trợ thêm 1 khe cắm SSD M.2 PCIe Gen 4 mở rộng (nâng cấp tối đa 1 TB) 1 TB SSD NVMe PCIe Gen 4', 'Màn hình': '16" WQXGA (2560 x 1600) 165Hz', 'Card màn hình': 'Card rời RTX 4060 8GB', 'Cổng kết nối': 'HDMI LAN (RJ45) 1 x USB 3.2 (Always on) 1 x USB 3.2 Jack tai nghe 3.5 mm 1 x USB-C 3.2 Gen 2 (hỗ trợ truyền dữ liệu and DisplayPort 1.4) 1 x USB-C 3.2 (hỗ trợ truyền dữ liệu, Power Delivery 140W và DisplayPort 1.4)', 'Đặc biệt': 'Có đèn bàn phím', 'Hệ điều hành': 'Windows 11 Home SL', 'Thiết kế': 'Vỏ nhựa - nắp lưng bằng kim loại', 'Kích thước, khối lượng': 'Dài 359.7 mm - Rộng 260.3 mm - Dày 19.9 ~ 25.2 mm - Nặng 2.4 kg', '': ''}
Do you have any idea how to parse this string into key-value pairs?
Thank you very much!