phucnh · September 10, 2019 05:45
diff --git a/README.md b/README.md
diff --git a/app.py b/app.py
 # -*- coding: utf-8 -*-

 import argparse
 import dask.dataframe as ddf
 import os
 import traceback


 def detect_null_columns(file_path, file_type):
    """Print the rate of empty values for every column that has any.

    Loads the delimited text file at ``file_path`` with
    ``dask.dataframe.read_csv``, prints the first rows as a sample, then
    prints a summary whose index is the column name and whose value is the
    fraction of rows (0.0 ~ 1.0) that are null in that column. Columns
    without any null value are left out of the summary.

    Args:
        file_path: Path to an existing regular file.
        file_type: ``'csv'`` (comma separated) or ``'tsv'`` (tab separated).

    Raises:
        FileNotFoundError: ``file_path`` does not exist or is not a file.
        ValueError: ``file_type`` is neither ``'csv'`` nor ``'tsv'``.
    """
    # os.path.isfile() is already False for a path that does not exist,
    # so a separate os.path.exists() check is not needed.
    if not os.path.isfile(file_path):
        raise FileNotFoundError(
            f'[{file_path}] file is not found or is not file')

    # Map each supported file type to its field delimiter.
    delimiters = {'tsv': '\t', 'csv': ','}
    if file_type not in delimiters:
        raise ValueError(f'Unknown file type [{file_type}]')
    print(f'File type is [{file_type}]')
    delimiter = delimiters[file_type]

    # Load the delimited file as a dask DataFrame.
    read_df = ddf.read_csv(file_path, sep=delimiter,
                           engine='python', encoding='utf-8')

    print('Sample data:')
    print(read_df.head())

    # Compute the null rate of every column once, then keep only the
    # columns whose rate is non-zero (i.e. that contain at least one null).
    # This avoids indexing the column list with a not-yet-computed mask and
    # avoids reading the data a second time.
    null_rates = read_df.isnull().mean().compute(scheduler='processes')
    null_columns_summary = null_rates[null_rates > 0]

    print('\n')

    print("""Rate of empty values, grouped by column.
 Left is column name, right is rate of empty values (range 0.0 ~ 1.0):""")
    print(null_columns_summary)


 def run(args):
    """Run the null-column report for the parsed command-line arguments.

    ``args`` must expose ``path`` and ``type`` attributes. Any exception
    raised while producing the report is printed together with its
    traceback and is not re-raised.
    """
    try:
        detect_null_columns(args.path, args.type)
    except Exception as err:
        # Report the failure on stdout instead of propagating it.
        print('Got unexpected exception:', err)
        print(traceback.format_exc())


 if __name__ == '__main__':
    # Script entry point: both --path and --type are mandatory options.
    parser = argparse.ArgumentParser(description='CLI argument parser')
    parser.add_argument(
        '--path',
        type=str,
        required=True,
        help='Path to csv file',
    )
    parser.add_argument(
        '--type',
        type=str,
        required=True,
        help='Type of file (e.g. csv, tsv)',
    )

    run(parser.parse_args())
diff --git a/requirements.txt b/requirements.txt
 argparse==1.4.0
 # MUST use cloudpickle 1.1.1 because >=1.2.0 works only with Python 3.7.
 # After https://github.com/cloudpipe/cloudpickle/pull/299 is released,
 # bump the cloudpickle version.
 cloudpickle==1.1.1
 dask==2.3.0
 dask[dataframe]
 fsspec==0.4.4
 numpy==1.17.2
 pandas==0.25.1
 python-dateutil==2.8.0
 pytz==2019.2
 six==1.12.0
 toolz==0.10.0
	# -*- coding: utf-8 -*-

	import argparse
	import dask.dataframe as ddf
	import os
	import traceback


	def detect_null_columns(file_path, file_type):
	    """Print the rate of empty values for every column that has any.

	    Loads the delimited text file at ``file_path`` with
	    ``dask.dataframe.read_csv``, prints the first rows as a sample, then
	    prints a summary whose index is the column name and whose value is
	    the fraction of rows (0.0 ~ 1.0) that are null in that column.
	    Columns without any null value are left out of the summary.

	    Args:
	        file_path: Path to an existing regular file.
	        file_type: ``'csv'`` (comma separated) or ``'tsv'`` (tab separated).

	    Raises:
	        FileNotFoundError: ``file_path`` does not exist or is not a file.
	        ValueError: ``file_type`` is neither ``'csv'`` nor ``'tsv'``.
	    """
	    # os.path.isfile() is already False for a path that does not exist,
	    # so a separate os.path.exists() check is not needed.
	    if not os.path.isfile(file_path):
	        raise FileNotFoundError(
	            f'[{file_path}] file is not found or is not file')

	    # Map each supported file type to its field delimiter.
	    delimiters = {'tsv': '\t', 'csv': ','}
	    if file_type not in delimiters:
	        raise ValueError(f'Unknown file type [{file_type}]')
	    print(f'File type is [{file_type}]')
	    delimiter = delimiters[file_type]

	    # Load the delimited file as a dask DataFrame.
	    read_df = ddf.read_csv(file_path, sep=delimiter,
	                           engine='python', encoding='utf-8')

	    print('Sample data:')
	    print(read_df.head())

	    # Compute the null rate of every column once, then keep only the
	    # columns whose rate is non-zero (i.e. that contain at least one
	    # null). This avoids indexing the column list with a not-yet-computed
	    # mask and avoids reading the data a second time.
	    null_rates = read_df.isnull().mean().compute(scheduler='processes')
	    null_columns_summary = null_rates[null_rates > 0]

	    print('\n')

	    print("""Rate of empty values, grouped by column.
	Left is column name, right is rate of empty values (range 0.0 ~ 1.0):""")
	    print(null_columns_summary)


	def run(args):
	    """Run the null-column report for the parsed command-line arguments.

	    ``args`` must expose ``path`` and ``type`` attributes. Any exception
	    raised while producing the report is printed together with its
	    traceback and is not re-raised.
	    """
	    try:
	        detect_null_columns(args.path, args.type)
	    except Exception as err:
	        # Report the failure on stdout instead of propagating it.
	        print('Got unexpected exception:', err)
	        print(traceback.format_exc())


	if __name__ == '__main__':
	    # Script entry point: both --path and --type are mandatory options.
	    parser = argparse.ArgumentParser(description='CLI argument parser')
	    parser.add_argument(
	        '--path',
	        type=str,
	        required=True,
	        help='Path to csv file',
	    )
	    parser.add_argument(
	        '--type',
	        type=str,
	        required=True,
	        help='Type of file (e.g. csv, tsv)',
	    )

	    run(parser.parse_args())
	argparse==1.4.0
	# MUST use cloudpickle 1.1.1 because >=1.2.0 works only with Python 3.7.
	# After https://github.com/cloudpipe/cloudpickle/pull/299 is released,
	# bump the cloudpickle version.
	cloudpickle==1.1.1
	dask==2.3.0
	dask[dataframe]
	fsspec==0.4.4
	numpy==1.17.2
	pandas==0.25.1
	python-dateutil==2.8.0
	pytz==2019.2
	six==1.12.0
	toolz==0.10.0