Created
May 14, 2021 22:36
-
-
Save sam-writer/20fa3f6ec8e162b86f601db13f0f907e to your computer and use it in GitHub Desktop.
Converting Python character indices to UTF-16 indices
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import itertools | |
from typing import Tuple | |
def ucp_to_utf16_charmap(s: str) -> list:
    """Map every Python index of *s* to its UTF-16 code-unit offset.

    Python indexes strings by Unicode code point, whereas JavaScript and
    Java index by UTF-16 code unit. Code points above U+FFFF occupy two
    code units (a surrogate pair); every other code point occupies one.

    Mostly copied from
    https://stackoverflow.com/questions/56280011/keeping-java-string-offsets-with-unicode-consistent-in-python

    Args:
        s: The text whose indices should be mapped.

    Returns:
        A list of ``len(s) + 1`` ints in which element ``i`` is the UTF-16
        offset of Python index ``i``. The last element is the UTF-16 length
        of ``s``, so exclusive span ends can be looked up too.
    """
    # Derive the width from the code point rather than encoding each
    # character: encoding raises UnicodeEncodeError on lone surrogates,
    # which a Python str may contain and which take one UTF-16 code unit.
    unit_widths = (2 if ord(ch) > 0xFFFF else 1 for ch in s)
    return [0] + list(itertools.accumulate(unit_widths))
def span_to_js(s: str, span: Tuple[int, int]) -> Tuple[int, int]:
    """Convert a Python (code point) span over *s* into a UTF-16 span.

    Args:
        s: The text the span refers to.
        span: ``(start, end)`` indices into ``s`` as Python counts them.

    Returns:
        ``(start, end)`` expressed in UTF-16 code units, looked up in the
        offset table produced by ``ucp_to_utf16_charmap``.
    """
    offsets = ucp_to_utf16_charmap(s)
    js_start = offsets[span[0]]
    js_end = offsets[span[1]]
    return js_start, js_end
if __name__ == "__main__":
    # The leading emoji is a ZWJ sequence: MAN + light skin tone +
    # ZERO WIDTH JOINER + FIRE ENGINE. It is spelled with escapes so the
    # invisible U+200D cannot be dropped by an editor or a copy/paste,
    # which would shift the "firemen" span below by one code point.
    s = "\U0001F468\U0001F3FB\u200D\U0001F692 firemen drive firetrucks"
    issue_spans = [(5, 12)]  # "firemen" span in python, i.e. s[5:12]
    js_spans = [span_to_js(s, sp) for sp in issue_spans]
    print(js_spans)  # [(8, 15)]
    # note: in js, s.substring(8, 15) is the correct subspan
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment