sam-writer · May 14, 2021 22:36
diff --git a/ucp_to_u16.py b/ucp_to_u16.py
 import itertools
 from typing import Tuple


 def ucp_to_utf16_charmap(s: str) -> list:
     """Map Python string indices to UTF-16 code-unit offsets.

     Mostly copied from
     https://stackoverflow.com/questions/56280011/keeping-java-string-offsets-with-unicode-consistent-in-python

     Python indexes ``str`` by Unicode code point, while UTF-16 based
     runtimes index by code unit. A code point above U+FFFF needs two
     UTF-16 code units (a surrogate pair); every other code point needs one.

     Args:
         s: The text whose indices should be translated.

     Returns:
         A list of ``len(s) + 1`` ints where element ``i`` is the UTF-16
         offset of Python index ``i``. The last element is the total
         UTF-16 length, so exclusive span ends can be looked up as well.
     """
     # The width comes straight from the code point instead of encoding each
     # character: the UTF-16 codec refuses lone surrogates, which still
     # occupy exactly one code unit.
     unit_widths = (2 if ord(ch) > 0xFFFF else 1 for ch in s)
     return [0] + list(itertools.accumulate(unit_widths))


 def span_to_js(s: str, span: Tuple[int, int]) -> Tuple[int, int]:
     """Translate a Python (code point) span of ``s`` into UTF-16 offsets.

     Both endpoints of ``span`` are looked up in the table returned by
     ``ucp_to_utf16_charmap(s)`` and returned in the same order.
     """
     offsets = ucp_to_utf16_charmap(s)
     js_start = offsets[span[0]]
     js_end = offsets[span[1]]
     return js_start, js_end


 if __name__ == "__main__":
     # Demo: the text starts with an emoji sequence, so the Python indices
     # of "firemen" differ from the offsets JavaScript would use.
     s = "👨🏻‍🚒 firemen drive firetrucks"
     issue_spans = [(5,12)]  # where "firemen" sits when indexing in python
     js_spans = [
         span_to_js(s, py_span)
         for py_span in issue_spans
     ]
     # expected output: [(8, 15)]
     # in js, s.substring(8, 15) yields the matching substring
     print(js_spans)
	import itertools
	from typing import Tuple


	def ucp_to_utf16_charmap(s: str) -> list:
	    """Map Python string indices to UTF-16 code-unit offsets.

	    Mostly copied from
	    https://stackoverflow.com/questions/56280011/keeping-java-string-offsets-with-unicode-consistent-in-python

	    Python indexes ``str`` by Unicode code point, while UTF-16 based
	    runtimes index by code unit. A code point above U+FFFF needs two
	    UTF-16 code units (a surrogate pair); every other code point needs one.

	    Args:
	        s: The text whose indices should be translated.

	    Returns:
	        A list of ``len(s) + 1`` ints where element ``i`` is the UTF-16
	        offset of Python index ``i``. The last element is the total
	        UTF-16 length, so exclusive span ends can be looked up as well.
	    """
	    # The width comes straight from the code point instead of encoding each
	    # character: the UTF-16 codec refuses lone surrogates, which still
	    # occupy exactly one code unit.
	    unit_widths = (2 if ord(ch) > 0xFFFF else 1 for ch in s)
	    return [0] + list(itertools.accumulate(unit_widths))


	def span_to_js(s: str, span: Tuple[int, int]) -> Tuple[int, int]:
	    """Translate a Python (code point) span of ``s`` into UTF-16 offsets.

	    Args:
	        s: The text the span refers to.
	        span: ``(start, end)`` indices into ``s`` as Python counts them.

	    Returns:
	        The ``(start, end)`` pair looked up in the table produced by
	        ``ucp_to_utf16_charmap(s)``.
	    """
	    charmap = ucp_to_utf16_charmap(s)
	    return charmap[span[0]], charmap[span[1]]


	if __name__ == "__main__":
	    # Demo: the text starts with an emoji sequence, so the Python indices
	    # of "firemen" differ from the offsets JavaScript would use.
	    issue_spans = [(5,12)]  # "firemen" span in python
	    s = "👨🏻‍🚒 firemen drive firetrucks"
	    js_spans = [span_to_js(s, sp) for sp in issue_spans]
	    print(js_spans)  # [(8, 15)]
	    # note: in js, s.substring(8, 15) is the correct subspan