daa233 · October 23, 2017 04:12 · cherifmad · Jun 12, 2018
diff --git a/count_files.py b/count_files.py
 # -*- encoding=utf-8 -*-
 # Python2 代码

 import os
 import csv

 # Open the csv and write headers.
 with open("files_count.txt",'wb') as out:
    outwriter = csv.writer(out)
    # outwriter.writerow(['Directory','FilesInDir','FilesIncludingSubdirs'])

    # Track total number of files in each subdirectory by absolute path
    totals = {}

    # topdown=False iterates lowest level (leaf) subdirectories first.
    # This way I can collect grand totals of files per subdirectory.
    for path,dirs,files in os.walk('FINAL_Plankton_Segments_12082014',topdown=False):
        files_in_current_directory = len(files)

        # Start with the files in the current directory and compute a
        # total for all subdirectories, which will be in the `totals`
        # dictionary already due to topdown=False.
        files_including_subdirs = files_in_current_directory
        for d in dirs:
            fullpath = os.path.abspath(os.path.join(path,d))

            # On my Windows system, Junctions weren't included in os.walk,
            # but would show up in the subdirectory list.  this try skips
            # them because they won't be in the totals dictionary.
            try:
                files_including_subdirs += totals[fullpath]
            except KeyError as e:
                print 'KeyError: {} may be symlink/junction'.format(e)

        totals[os.path.abspath(path)] = files_including_subdirs
        # outwriter.writerow([path,files_in_current_directory,files_including_subdirs])
        outwriter.writerow([path,files_in_current_directory])

 # # convert csv file to list and print
 # import csv
 # with open('Subject_Task_Count.csv', 'rb') as f:
 #     reader = csv.reader(f)
 #     my_list = list(reader)


 import numpy as np
 np.set_printoptions(suppress=True) # 不使用科学计数法显示数字

 data = np.genfromtxt('files_count.txt', delimiter=',')
 counts = data[:-1, [1]] # exclude the last one, it is not a class of plankton images
 print counts

 max_num = np.max(counts)
 min_num = np.min(counts)
 imbalance_rate = max_num / min_num

 print "max_num = " + str(max_num)  # 3883
 print "min_num = " + str(min_num)  # 20
 print "imbalance_rate = " + str(imbalance_rate)  # 194.15
	# -- encoding=utf-8 --
	# Python2 代码

	import os
	import csv

	# Open the csv and write headers.
	with open("files_count.txt",'wb') as out:
	outwriter = csv.writer(out)
	# outwriter.writerow(['Directory','FilesInDir','FilesIncludingSubdirs'])

	# Track total number of files in each subdirectory by absolute path
	totals = {}

	# topdown=False iterates lowest level (leaf) subdirectories first.
	# This way I can collect grand totals of files per subdirectory.
	for path,dirs,files in os.walk('FINAL_Plankton_Segments_12082014',topdown=False):
	files_in_current_directory = len(files)

	# Start with the files in the current directory and compute a
	# total for all subdirectories, which will be in the `totals`
	# dictionary already due to topdown=False.
	files_including_subdirs = files_in_current_directory
	for d in dirs:
	fullpath = os.path.abspath(os.path.join(path,d))

	# On my Windows system, Junctions weren't included in os.walk,
	# but would show up in the subdirectory list. this try skips
	# them because they won't be in the totals dictionary.
	try:
	files_including_subdirs += totals[fullpath]
	except KeyError as e:
	print 'KeyError: {} may be symlink/junction'.format(e)

	totals[os.path.abspath(path)] = files_including_subdirs
	# outwriter.writerow([path,files_in_current_directory,files_including_subdirs])
	outwriter.writerow([path,files_in_current_directory])

	# # convert csv file to list and print
	# import csv
	# with open('Subject_Task_Count.csv', 'rb') as f:
	# reader = csv.reader(f)
	# my_list = list(reader)


	import numpy as np
	np.set_printoptions(suppress=True) # 不使用科学计数法显示数字

	data = np.genfromtxt('files_count.txt', delimiter=',')
	counts = data[:-1, [1]] # exclude the last one, it is not a class of plankton images
	print counts

	max_num = np.max(counts)
	min_num = np.min(counts)
	imbalance_rate = max_num / min_num

	print "max_num = " + str(max_num) # 3883
	print "min_num = " + str(min_num) # 20
	print "imbalance_rate = " + str(imbalance_rate) # 194.15