zhehaowang · May 22, 2020 19:26
diff --git a/csv2xlsx.py b/csv2xlsx.py
 import os
 import glob
 import csv
 from xlsxwriter.workbook import Workbook

 # Convert each listed CSV file into an .xlsx workbook with the same base name.
 for csvfile in ["out.csv"]:
     # Drop the trailing four characters (".csv") and append the new suffix.
     xlsx_path = csvfile[:-4] + '.xlsx'
     workbook = Workbook(xlsx_path)
     worksheet = workbook.add_worksheet()
     with open(csvfile, 'r', newline='', encoding='utf-8') as f:
         # Copy every CSV field into the worksheet cell at the same
         # (row, column) position.
         for row_idx, fields in enumerate(csv.reader(f)):
             for col_idx, cell in enumerate(fields):
                 worksheet.write(row_idx, col_idx, cell)
     workbook.close()
diff --git a/utf8csv.py b/utf8csv.py
 import re

 def should_flip(ints):
     """Decide whether a short byte window should have 128 added to each byte.

     The window appears to be a candidate UTF-8 sequence whose high bits were
     stripped (e.g. ``e5 b9 b4`` showing up as ``65 39 34``, i.e. ``"e94"``);
     the caller restores it by adding 128 to every byte.

     Args:
         ints: a sequence of integer byte values (for example a ``bytes``
             slice).  Windows of length 3 and 4 can be accepted; any other
             length is rejected.

     Returns:
         True if the window looks like a stripped multi-byte sequence,
         False otherwise.
     """
     characters = "".join(chr(c) for c in ints)

     # Heuristic for 3-byte windows whose stripped bytes happen to look like
     # ordinary CSV text: a lead letter in e-h followed by two characters
     # drawn from digits, comma or newline (samples seen: "e94", "f57", "g,,").
     if len(characters) == 3:
         if re.match(r"^[edfgh][,\n0-9][,\n0-9]$", characters):
             return True

     # Anything else made purely of alphanumerics / newline / comma /
     # underscore is treated as genuine text and left alone.
     plain_text = all(
         chr(c).isalnum() or chr(c) in ['\n', ',', '_'] for c in ints
     )
     if plain_text:
         return False

     # A stripped continuation byte (10xxxxxx -> 00xxxxxx) must have BOTH top
     # bits clear.  Testing bit 6 alone would also accept 0x80-0xBF, and the
     # caller's "+ 128" would then exceed 255 and fail in int.to_bytes().
     later_bytes_match = all((0b11000000 & i) == 0 for i in ints[1:])

     if len(ints) == 4:
         # Stripped 4-byte lead: 11110xxx -> 01110xxx.
         return (0b11111000 & ints[0] == 0b01110000) and later_bytes_match
     if len(ints) == 3:
         # Stripped 3-byte lead: 1110xxxx -> 0110xxxx.
         return (0b11110000 & ints[0] == 0b01100000) and later_bytes_match
     # 2-byte sequences (and every other length) are deliberately not handled.
     return False

 # Re-encode the sample CSV: walk the raw bytes, and wherever a 3-byte window
 # is accepted by should_flip() write those bytes with 128 added; otherwise
 # copy a single byte through unchanged and advance by one.
 with open('/Users/zwang/Downloads/Shanghai_adminLand_200sample_for_Weiping.csv', 'rb') as f, open('out.csv', 'wb') as outfile:
     content = f.read()

     i = 0
     window_size = 3
     while i < len(content):
         seg = content[i:i + window_size]
         if should_flip(seg):
             # Accepted window: shift every byte up by 128 and skip past it.
             for value in seg:
                 outfile.write((value + 128).to_bytes(1, byteorder='little'))
             i += window_size
         else:
             # Not a candidate: emit the current byte as-is.
             outfile.write(content[i].to_bytes(1, byteorder='little'))
             i += 1

        # c = chr(content[i])
        # num = content[i]
        # if c.isalnum() or c in ['\n', ',', '_']:
        #     pass
        # else:
        #     num += 128
        # outfile.write(num.to_bytes(1, byteorder='little'))
        # i += 1


 # with open('/Users/zwang/Downloads/Shanghai_adminLand_200sample_for_Weiping.csv', 'rb') as f:
 #     # reader = csv.reader(f)
 #     content = f.read()
 #     # for c in content:
 #     #     print(c), 

 #     utf8str = content.decode("utf-8")
 #     line = utf8str.split('\n')[1:5]
 #     print("".join(line).encode("utf-8").hex())
 #     # part = line.split(',')[1]
 #     # partbin = part.encode("utf-8")

 #     # print(partbin.hex())
 #     # print(u"f\x17%".encode("utf-8"))
 #     # print(u"f\x17%".encode("utf-8").hex())

 #     # print(u"f\x17".encode("utf-8").decode(""))

 #     # print(u"年 月 日".encode("gb2312"))

 #     # print(u"年 月 日".encode("utf-8"))
 #     # 2016 e94 9 f\x1c\x08 13 f\x17%
    
 # actual unicode: \xe5\xb9\xb4  \xe6\x9c\x88  \xe6\x97\xa5
 # given:          \x65\x3e\x34  \x66\x1c\x08  \x66\x17\x25
	import os
	import glob
	import csv
	from xlsxwriter.workbook import Workbook

	# Convert each listed CSV file into an .xlsx workbook with the same base
	# name.  (Block nesting restored; the flattened text was not valid Python.)
	for csvfile in ["out.csv"]:
	    # Drop the trailing four characters (".csv") and append the new suffix.
	    workbook = Workbook(csvfile[:-4] + '.xlsx')
	    worksheet = workbook.add_worksheet()
	    with open(csvfile, 'r', newline='', encoding='utf-8') as f:
	        reader = csv.reader(f)
	        # Copy every CSV field into the cell at the same (row, column).
	        for r, row in enumerate(reader):
	            for c, col in enumerate(row):
	                worksheet.write(r, c, col)
	    workbook.close()
	import re

	def should_flip(ints):
	    """Decide whether a short byte window should have 128 added to each byte.

	    The window appears to be a candidate UTF-8 sequence whose high bits
	    were stripped (e.g. ``e5 b9 b4`` showing up as ``65 39 34``, i.e.
	    ``"e94"``); the caller restores it by adding 128 to every byte.

	    Args:
	        ints: a sequence of integer byte values (for example a ``bytes``
	            slice).  Windows of length 3 and 4 can be accepted; any other
	            length is rejected.

	    Returns:
	        True if the window looks like a stripped multi-byte sequence,
	        False otherwise.
	    """
	    characters = "".join(chr(c) for c in ints)

	    # Heuristic for 3-byte windows whose stripped bytes look like ordinary
	    # CSV text: a lead letter in e-h followed by two characters drawn from
	    # digits, comma or newline (samples seen: "e94", "f57", "g,,").
	    if len(characters) == 3:
	        if re.match(r"^[edfgh][,\n0-9][,\n0-9]$", characters):
	            return True

	    # Anything else made purely of alphanumerics / newline / comma /
	    # underscore is treated as genuine text and left alone.
	    all_alphanum = all(
	        chr(c).isalnum() or chr(c) in ['\n', ',', '_'] for c in ints
	    )
	    if all_alphanum:
	        return False

	    # A stripped continuation byte (10xxxxxx -> 00xxxxxx) must have BOTH
	    # top bits clear.  Testing bit 6 alone would also accept 0x80-0xBF, and
	    # the caller's "+ 128" would then exceed 255 and fail in int.to_bytes().
	    later_bytes_match = all((0b11000000 & i) == 0 for i in ints[1:])

	    if len(ints) == 4:
	        # Stripped 4-byte lead: 11110xxx -> 01110xxx.
	        return (0b11111000 & ints[0] == 0b01110000) and later_bytes_match
	    elif len(ints) == 3:
	        # Stripped 3-byte lead: 1110xxxx -> 0110xxxx.
	        return (0b11110000 & ints[0] == 0b01100000) and later_bytes_match
	    else:
	        # 2-byte sequences (and every other length) are not handled.
	        return False

	# Re-encode the sample CSV: walk the raw bytes, and wherever a 3-byte
	# window is accepted by should_flip() write those bytes with 128 added;
	# otherwise copy one byte through unchanged.  (Block nesting restored; the
	# flattened text was not valid Python.)
	with open('/Users/zwang/Downloads/Shanghai_adminLand_200sample_for_Weiping.csv', 'rb') as f, open('out.csv', 'wb') as outfile:
	    content = f.read()

	    i = 0
	    while i < len(content):
	        flipped = False
	        # Only 3-byte windows are tried.
	        for j in [3]:
	            if should_flip(content[i:i + j]):
	                seg = content[i:i + j]
	                # Accepted window: shift every byte up by 128.
	                for k in range(len(seg)):
	                    outfile.write((seg[k] + 128).to_bytes(1, byteorder='little'))
	                i += j
	                flipped = True
	                break
	        if flipped:
	            continue
	        else:
	            # Not a candidate: emit the current byte as-is.
	            outfile.write(content[i].to_bytes(1, byteorder='little'))
	            i += 1

	# c = chr(content[i])
	# num = content[i]
	# if c.isalnum() or c in ['\n', ',', '_']:
	# pass
	# else:
	# num += 128
	# outfile.write(num.to_bytes(1, byteorder='little'))
	# i += 1


	# with open('/Users/zwang/Downloads/Shanghai_adminLand_200sample_for_Weiping.csv', 'rb') as f:
	# # reader = csv.reader(f)
	# content = f.read()
	# # for c in content:
	# # print(c),

	# utf8str = content.decode("utf-8")
	# line = utf8str.split('\n')[1:5]
	# print("".join(line).encode("utf-8").hex())
	# # part = line.split(',')[1]
	# # partbin = part.encode("utf-8")

	# # print(partbin.hex())
	# # print(u"f\x17%".encode("utf-8"))
	# # print(u"f\x17%".encode("utf-8").hex())

	# # print(u"f\x17".encode("utf-8").decode(""))

	# # print(u"年月日".encode("gb2312"))

	# # print(u"年月日".encode("utf-8"))
	# # 2016 e94 9 f\x1c\x08 13 f\x17%

	# actual unicode: \xe5\xb9\xb4 \xe6\x9c\x88 \xe6\x97\xa5
	# given: \x65\x3e\x34 \x66\x1c\x08 \x66\x17\x25