Last active
July 18, 2019 10:24
-
-
Save anjia0532/6db48b0886d91d9a663e5a9fd19f2aaa to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# Author AnJia([email protected] https://anjia0532.github.io) | |
import argparse | |
import sys, os | |
import io | |
# Python 2 only: re-expose sys.setdefaultencoding (removed from the sys
# namespace at interpreter start-up) so implicit str/unicode conversions use
# UTF-8 instead of ASCII. Python 3 has no builtin ``reload`` and its str type
# is already unicode, so the hack is skipped there instead of raising
# NameError on import.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf8')

# Substrings removed from every input line before it is split into cells:
# each key is replaced by its value (backslash and double quote -> "").
black_dict = {"\\": "", "\"": ""}
def main():
    """Repair a CSV file whose rows were split across several lines.

    Command-line options:
        --cols       expected number of cells per row; -1 (the default)
                     takes the cell count of the first line read.
        --src        path of the CSV file to read (required).
        --dest       path of the CSV file the repaired rows are appended to
                     (required).
        --encoding   encoding used to read ``src`` (default utf-8).
        --chunksize  number of repaired rows buffered before each write to
                     ``dest`` (default 10000, must be positive).
        --delimiter  cell separator (default ",").

    Each line is stripped, the substrings listed in ``black_dict`` are
    replaced, and the line is split on the delimiter. A line with fewer than
    ``cols`` cells is treated as a fragment: fragments are glued together
    (last pending cell + first new cell are concatenated) until at least
    ``cols`` cells are collected. Every emitted row is truncated to ``cols``
    cells.

    Exits with status -1 when ``src`` or ``dest`` is missing or ``chunksize``
    is not positive.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--cols', type=int, dest='cols', action='store', default=-1,
                        help="count of columns,default first line's cells")
    parser.add_argument('--src', type=str, dest='src', action='store', default='',
                        help='path to source csv file')
    parser.add_argument('--dest', type=str, dest='dest', action='store', default='',
                        help='path to dest csv file')
    parser.add_argument('--encoding', type=str, dest='encoding', action='store', default='utf-8',
                        help='file encoding,default utf-8')
    parser.add_argument('--chunksize', type=int, dest='chunksize', action='store', default=10000,
                        help='batch lines to write dest file,default 10000')
    parser.add_argument('--delimiter', type=str, dest='delimiter', action='store', default=',',
                        help='csv delimiter,default ,')
    args = parser.parse_args()

    cols = args.cols
    src = args.src
    dest = args.dest
    encoding = args.encoding
    chunksize = args.chunksize
    delimiter = args.delimiter

    if not (src and dest) or chunksize <= 0:
        print("invaild args!")
        sys.exit(-1)

    pending = []  # cells of a row that is still shorter than ``cols``
    lines = []    # repaired rows waiting to be flushed to ``dest``

    with io.open(src, encoding=encoding) as fp:
        # Iterate the file lazily so large inputs are never fully in memory.
        for line in fp:
            line = line.strip()
            for k, v in black_dict.items():
                line = line.replace(k, v)
            cells = line.split(delimiter)

            if cols == -1:
                # No --cols given: the first line defines the row width.
                cols = len(cells)

            if len(cells) < cols or (pending and len(pending) < cols):
                if not pending:
                    pending = cells
                else:
                    # The line break fell inside a cell: rejoin its two
                    # halves, then append the remaining cells.
                    pending[-1] += cells[0]
                    pending.extend(cells[1:])
                if len(pending) >= cols:
                    cells = pending
                    pending = []

            if not pending:
                lines.append(delimiter.join(cells[0:cols]) + "\n")
                # Flush only a full buffer; an empty buffer must not trigger
                # a no-op reopen of the destination file.
                if len(lines) >= chunksize:
                    write_to_file(dest=dest, lines=lines)
                    lines = []

    # NOTE: a fragment still pending at end of file never reached ``cols``
    # cells and is not written.
    # NOTE: no encoding is passed on, so ``dest`` is written with
    # write_to_file's default encoding; --encoding only applies to ``src``.
    write_to_file(dest=dest, lines=lines)
def write_to_file(dest, lines=None, encoding='utf-8'):
    """Append ``lines`` to the text file ``dest``.

    The parent directory of ``dest`` is created when it does not exist yet.

    Args:
        dest: path of the file to append to; may be a bare file name.
        lines: iterable of text lines (each already carrying its line
            ending); ``None`` is treated as no lines.
        encoding: encoding used to write the file, utf-8 by default.

    Raises:
        OSError: if the parent directory cannot be created or the file
            cannot be opened.
    """
    if lines is None:
        lines = []

    parent = os.path.split(dest)[0]
    # A bare file name has an empty parent and os.makedirs('') would raise,
    # so only create directories when there is a directory component.
    if parent:
        try:
            os.makedirs(parent)
        except OSError:
            # An already existing directory is fine (Python 2 has no
            # exist_ok flag); any other failure is a real error.
            if not os.path.isdir(parent):
                raise

    with io.open(file=dest, mode="a+", encoding=encoding) as fp:
        fp.writelines(lines)
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
src csv
python cmd
python clean_csv.py --src=src.csv --dest=dest.csv --chunksize=50000 --cols=-1 --encoding=utf-8 --delimiter=,
dest csv