Skip to content

Instantly share code, notes, and snippets.

@AntonFriberg
Last active October 18, 2023 15:20
Show Gist options
  • Save AntonFriberg/d3440c7d6f4d57d8ac64cb70b4aefd4f to your computer and use it in GitHub Desktop.
Save AntonFriberg/d3440c7d6f4d57d8ac64cb70b4aefd4f to your computer and use it in GitHub Desktop.
Extract key value pairs from string with quotes in Python 3
"""Extract key value pairs in Python 3 using shlex and regex."""
import re
import shlex
def regex_kv_pairs(text, item_sep=r"\s", value_sep="="):
    """Parse key-value pairs from a shell-like text with a regular expression.

    The original author reports this approach as roughly 25 times faster
    than the shlex-based one (not re-measured here). Unlike shlex it does
    no escape processing: a value is the raw text between the optional
    opening quote and the matching closing quote.

    Args:
        text: The string to parse, e.g. ``key1=value1 key2='some value'``.
        item_sep: Regular-expression fragment matching the separator that
            follows an item (default: any whitespace character). It is
            inserted into the pattern verbatim, so regex syntax applies.
        value_sep: Literal string separating a key from its value. It is
            escaped before being placed in the pattern.

    Returns:
        A dict mapping each key to its value with the surrounding quotes
        removed. When a key occurs more than once the last value wins.

    Raises:
        ValueError: If ``value_sep`` is empty.
        re.error: If ``item_sep`` is not a valid regular expression.
    """
    if not value_sep:
        raise ValueError("empty separator")
    # The pattern is assembled piece by piece rather than by str.replace on
    # a template: replacing "=" in a template also rewrites the "(?P=quote)"
    # backreference and breaks the pattern for any other separator.
    # re.VERBOSE is deliberately not used, so a literal space (or '#') in
    # item_sep keeps its meaning.
    pattern = "".join(
        (
            r"(?P<key>[\w\-]+)",        # key: word characters and '-'
            re.escape(value_sep),       # literal key/value separator
            r"""(?P<quote>["']?)""",    # optional opening quote
            r"(?P<value>[\S\s]*?)",     # value: shortest possible match
            r"(?P=quote)",              # closing quote equals the opening one
            rf"(?:$|{item_sep})",       # item ends at end of string or separator
        )
    )
    regex = re.compile(pattern)
    return {
        match.group("key"): match.group("value")
        for match in regex.finditer(text)
    }
def parse_kv_pairs(text, item_sep=" ", value_sep="="):
    """Parse key-value pairs from a shell-like text with shlex.

    Quoting and backslash escaping follow shlex's POSIX mode, so the
    behavior is very similar to standard shell word splitting.

    Args:
        text: The string to parse, e.g. ``key1=value1 key2='some value'``.
        item_sep: Characters that separate items; every character of this
            string is treated as a separator by the lexer.
        value_sep: Literal string separating a key from its value. Only the
            first occurrence in an item is used, so the value itself may
            contain the separator.

    Returns:
        A dict mapping each key to its value. When a key occurs more than
        once the last value wins.

    Raises:
        ValueError: If an item does not contain ``value_sep``, if
            ``value_sep`` is empty, or if shlex rejects the input (for
            example an unterminated quote).
    """
    # POSIX mode gives shell-style quote removal and escape handling.
    lexer = shlex.shlex(text, posix=True)
    # Only the characters of item_sep end a word.
    lexer.whitespace = item_sep
    # Split on separators only, so any unquoted character (',', ';', '=',
    # non-Latin letters, ...) stays part of the word instead of becoming a
    # stray single-character token.
    lexer.whitespace_split = True
    # shlex checks comment characters before word characters, so '#' would
    # otherwise discard the rest of the line even inside a value.
    lexer.commenters = ""

    pairs = {}
    for word in lexer:
        # partition splits at the first separator only, like maxsplit=1.
        key, found, value = word.partition(value_sep)
        if not found:
            raise ValueError(
                f"item {word!r} does not contain the separator {value_sep!r}"
            )
        pairs[key] = value
    return pairs
# Sample inputs and the result of running each parser on them.
# SIMPLE: one bare value and one single-quoted value that itself contains
# commas, an '=' and double quotes.
SIMPLE = "key1=value1 key2='value2,still_value2,not_key1=\"not_value1\"'"
# ADVANCED: double-quoted values; the second one contains ';', ',', nested
# single quotes and (because the literal spans two lines) a newline.
ADVANCED = '''subsystem="syslog-ng" message="I/O error occurred while writing;
fd='20', error='Invalid argument (22)'"'''
SHLEX_SIMPLE = parse_kv_pairs(SIMPLE)
# {
# 'key1': 'value1',
# 'key2': 'value2,still_value2,not_key1="not_value1"'
# }
REGEX_SIMPLE = regex_kv_pairs(SIMPLE)
# The regex returns the raw text between the outer quotes, so the inner
# double quotes are kept:
# {
# 'key1': 'value1',
# 'key2': 'value2,still_value2,not_key1="not_value1"'
# }
SHLEX_ADVANCED = parse_kv_pairs(ADVANCED)
# {
# 'subsystem': 'syslog-ng',
# 'message': "I/O error occurred while writing;\nfd='20', error='Invalid argument (22)'"
# }
REGEX_ADVANCED = regex_kv_pairs(ADVANCED)
# Same as the shlex result; the nested single quotes are kept:
# {
# 'subsystem': 'syslog-ng',
# 'message': "I/O error occurred while writing;\nfd='20', error='Invalid argument (22)'"
# }
@sontung2310
Copy link

Hi, thank you for the valuable code. However, I am dealing with this case:

{'CPU': 'i7 13700H 2.4GHz', 'RAM': '16 GB DDR5 2 khe (1 khe 8 GB + 1 khe 8 GB) 5200 MHz', 'Ổ cứng': 'Hỗ trợ thêm 1 khe cắm SSD M.2 PCIe Gen 4 mở rộng (nâng cấp tối đa 1 TB) 1 TB SSD NVMe PCIe Gen 4', 'Màn hình': '16" WQXGA (2560 x 1600) 165Hz', 'Card màn hình': 'Card rời RTX 4060 8GB', 'Cổng kết nối': 'HDMI LAN (RJ45) 1 x USB 3.2 (Always on) 1 x USB 3.2 Jack tai nghe 3.5 mm 1 x USB-C 3.2 Gen 2 (hỗ trợ truyền dữ liệu and DisplayPort 1.4) 1 x USB-C 3.2 (hỗ trợ truyền dữ liệu, Power Delivery 140W và DisplayPort 1.4)', 'Đặc biệt': 'Có đèn bàn phím', 'Hệ điều hành': 'Windows 11 Home SL', 'Thiết kế': 'Vỏ nhựa - nắp lưng bằng kim loại', 'Kích thước, khối lượng': 'Dài 359.7 mm - Rộng 260.3 mm - Dày 19.9 ~ 25.2 mm - Nặng 2.4 kg', '': ''}

Do you have any idea how to parse this string into key-value pairs?
Thank you very much!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment