Skip to content

Instantly share code, notes, and snippets.

@kunrenzhilu
Created March 7, 2022 01:18
Show Gist options
  • Save kunrenzhilu/ab92463b6d9f7d2a71bb8c6b715db411 to your computer and use it in GitHub Desktop.
Save kunrenzhilu/ab92463b6d9f7d2a71bb8c6b715db411 to your computer and use it in GitHub Desktop.
Unicode handling in Python
import io
import json
def to_json_line(record):
    """Serialize *record* to a single-line JSON document as unicode text.

    ``ensure_ascii=False`` keeps non-ASCII characters literal instead of
    turning them into backslash-u escapes. On Python 2, ``json.dumps`` with
    ``ensure_ascii=False`` returns a byte ``str`` when no value in *record*
    is unicode (for example when every string field is null); such a result
    is decoded here so the caller always receives text that a file opened
    with ``io.open(..., encoding=...)`` accepts.

    Args:
        record: A JSON-serializable mapping.

    Returns:
        The JSON document as unicode text (``unicode`` on Python 2, ``str``
        on Python 3), without a trailing newline.
    """
    line = json.dumps(record, ensure_ascii=False)
    if isinstance(line, bytes):
        # Only reachable on Python 2 (bytes is str there); Python 3's
        # json.dumps always returns text.
        line = line.decode("utf-8")
    return line


if __name__ == "__main__":
    # NOTE(review): spark, col and res_path are not defined in this snippet;
    # presumably they come from the surrounding PySpark session -- confirm.
    df = spark.table("item_lastest").filter(
        col("country") == "TW"
    ).select(
        "itemid", "name"
    )
    # python2
    # io.open with an explicit encoding takes unicode text and encodes it
    # to UTF-8 when writing; "a" appends to an existing file.
    with io.open(res_path, "a", encoding='utf-8') as f:
        for s in df.collect():
            # It is necessary to pass ensure_ascii=False to json.dumps; it is
            # not necessary to pass encoding="utf-8". The result stays unicode:
            # u'{"itemid": 2959458, "name": "\u6771\u4eac\u65c5\u904a\u5168\u653b\u7565 + \u65c5\u904a\u65e5\u8a9e\u5168\u653b\u7565 2016-17\u5e74\u7248"}'
            line = to_json_line(s.asDict())
            # Encoding that unicode result explicitly yields the UTF-8 bytes:
            # '{"itemid": 2959458, "name": "\xe6\x9d\xb1\xe4\xba\xac\xe6\x97\x85\xe9\x81\x8a\xe5\x85\xa8\xe6\x94\xbb\xe7\x95\xa5 + \xe6\x97\x85\xe9\x81\x8a\xe6\x97\xa5\xe8\xaa\x9e\xe5\x85\xa8\xe6\x94\xbb\xe7\x95\xa5 2016-17\xe5\xb9\xb4\xe7\x89\x88"}'
            print(line.encode("utf-8"))
            # Write the unicode text itself; the file object does the encoding.
            f.write(line + "\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment