@zacharysyoung
Last active January 16, 2022 23:24

Merge hundreds of CSVs, each with millions of rows

500 CSVs, each with over 1 million rows, need to be merged into one CSV.

  • each CSV represents a sensor that recorded a value and the timestamp of the recording, with millions of timestamp/value rows
  • all CSVs have the same number of rows

How can we "merge" the CSVs such that each sensor's value-column is added to the merged CSV (500 value columns), and the timestamps for each row for each sensor are averaged into a single column?

sensor1.csv
  timestamp  value
  10000        1.9
  10010        2.2
  ... (millions of rows)

sensor2.csv
  timestamp  value
  10004        3.5
  10012        4.3

... (498 more files)

The result should look like this (the timestamp in this file is the average of all the timestamps from all 500+ input files; the sensor names, e.g. sensor1, sensor2, etc., come from the filenames):

merged_file.csv
  timestamp  sensor1  sensor2  ...  sensor500
  10002          1.9      3.5            2.1
  10011          2.2      4.3            3.5

How can we merge without reading/holding everything in RAM?

Solution

I see your post as two distinct questions: 1) how to merge the 500 CSVs, and 2) whatever comes next (some DB?).

This is a solution for the first question. I'm using Python; there are languages/runtimes that could do this faster, but I think Python will give you a good first start at the problem, and I expect it will be more accessible and easier for you to use.

Also, my solution is predicated on the fact that all 500 CSVs have identical row counts.

My solution opens all 500 CSVs at once, creates an outer loop over a set number of rows, and an inner loop over each CSV:

  • The inner loop reads the timestamp and value from the current row of each CSV, averages the 500 timestamps into a single value, and keeps the 500 sensor values in their own columns; all of that goes into one merged row with 501 columns.

  • The outer loop repeats that process for as many rows as there are across all 500 CSVs.

I generated some sample data, 500 CSVs each with 1_000_000 rows, for 6.5G of CSVs. I ran the following script on my M1 MacBook Air. It completed in 8.3 minutes, peaked at 34.6M of RAM, and produced a final CSV that is about 2G on disk.

import csv
import glob

# Fill this in based on your own knowledge, or, based on the output of 'analyze_stats.py'
NUM_ROWS = 1_000_000

# Get all sensor CSVs
sensor_filenames = glob.glob('sensor*.csv')

# Sort: trim leading 6 chars, 'sensor', and trailing 4 chars, '.csv', leaving just the number in the middle
sensor_filenames = sorted(sensor_filenames, key=lambda x: int(x[6:-4]))

# Get handles to all files, and create input CSV readers
sensor_readers = []
for sensor_fname in sensor_filenames:
    f = open(sensor_fname, newline='')
    sensor_readers.append(csv.reader(f))

# Create output CSV writer
f = open('merged_sensors.csv', 'w', newline='')
writer = csv.writer(f)

# Discard all sensor headers
for reader in sensor_readers:
    next(reader)

# Build up output header and write
output_header = ['timestamp']
for sensor_fname in sensor_filenames:
    sensor_name = sensor_fname[:-4]  # trim off '.csv'
    output_header.append(sensor_name)
writer.writerow(output_header)

row_count = 0
while row_count < NUM_ROWS:
    row_count += 1
    values = []
    timestamps = []
    for reader in sensor_readers:
        row = next(reader)

        ts, val = row
        timestamps.append(int(ts))
        values.append(val)
    
    if row_count % 1000 == 0:
        print(f'Merged {row_count} rows')

    avg_ts = int(sum(timestamps) / len(timestamps))
    writer.writerow([avg_ts] + values)

# Close the output file so any buffered rows are flushed to disk
f.close()

I haven't profiled this, but I believe the only real allocations of memory that add up are going to be:

  • the 500 file handles and CSV readers (which are small), held for the entirety of the process
  • each row from the input CSVs in the inner loop
  • the final merged row in the outer loop
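
If you want to check the peak-memory number on your own run, one option is to print the process's peak resident set size at the end of the merge script, using the standard-library resource module (a minimal sketch, not something the script above does); on macOS, running the script under /usr/bin/time -l reports the same figure without touching the code:

import resource

# Peak resident set size of this process so far.
# Note: ru_maxrss is reported in bytes on macOS but in kilobytes on Linux.
peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
print(f'Peak RSS: {peak}')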

At the top of the script I mention analyze_stats.py. Even if this were my data, I'd be very patient and break down the entire process into multiple steps, each of which I could verify, and I would ultimately arrive at the correct, final CSV. This is especially true for me trying to help you, because I don't control the data or "know" it like you do, so I'm going to offer up this bigger process:

  1. Read all the CSVs and record some stats: headers, column counts, and especially row counts.

  2. Analyze those stats for "conformance", making sure no CSV deviates from your idea of what it should be, and especially get confirmation that all 500 CSVs have the same number of columns and rows.

  3. Use the proven row count as input into the merge process (one way to read that count straight from Sensor_stats.csv is sketched after this list).

    There are ways to write the merge script so it doesn't have to know "the row count" up front, but it's more code, it's slightly more confusing, and it won't help you if there ever is a problem... you probably don't want to find out on row 2 million that there was a problem "somewhere"; I know I hate it when that happens.
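
To make step 3 concrete: instead of hand-editing NUM_ROWS, the merge script could read the proven count straight out of Sensor_stats.csv. A minimal sketch, assuming get_sensor_stats.py and analyze_stats.py have already run cleanly:

import csv

def get_proven_row_count(stats_fname='Sensor_stats.csv'):
    # analyze_stats.py should have already confirmed these are all identical
    with open(stats_fname, newline='') as f:
        counts = {int(row['Num Rows']) for row in csv.DictReader(f)}
    assert len(counts) == 1, f'expected one row count, got {sorted(counts)}'
    return counts.pop()

NUM_ROWS = get_proven_row_count()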

If you're new to Python or its CSV readers/writers, I recommend you read these scripts first.

  • [get_sensor_stats.py]1: reads all your sensor data and records the header, the minimum and maximum number of columns seen, and the row count for each CSV; it writes all those stats out to a CSV

  • [analyze_stats.py]2: reads in the stats CSV and checks to make sure the header and column counts meet pre-defined values; it also keeps a tally of the row counts for each file and will let you know if any files have different row counts (there's an example stats CSV just below)
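
For reference, here's the shape of the stats CSV that ties those two scripts together (the columns come from ROW_TMPL in get_sensor_stats.py; the values are just illustrative):

Sensor_stats.csv
  Sensor,Header,Min Cols,Max Cols,Num Rows
  1,"timestamp,value",2,2,1000000
  2,"timestamp,value",2,2,1000000
  ... (one row per sensor)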

Also, here's the script I used to generate my 6G of sample data:

  • [gen_sensor_data.py]3: is my attempt to meaningfully represent your problem space, both in size and complexity (which is very easy, thankfully 🙂); there's a quick size check just below
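
As a rough sanity check on those sizes: gen_sensor_data.py writes 1,000,000 rows per file, and a typical row like '9999994,19.9' plus a newline is about 13 bytes, so the total input should come out to roughly what I measured:

# Back-of-envelope estimate of the total input size (13 bytes/row is an approximation)
approx_bytes = 500 * 1_000_000 * 13
print(f'{approx_bytes / 1e9:.1f} GB')  # ~6.5 GB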

Footnotes

  1. https://gist.github.com/zacharysyoung/d9f3c64886f0d24737e8d96880c2b9d8#file-get_sensor_stats-py

  2. https://gist.github.com/zacharysyoung/d9f3c64886f0d24737e8d96880c2b9d8#file-analyze_stats-py

  3. https://gist.github.com/zacharysyoung/d9f3c64886f0d24737e8d96880c2b9d8#file-gen_sensor_data-py

analyze_stats.py

#!/usr/bin/env python3
import csv
import sys
from collections import defaultdict

COLS_NUM = 2
HEADER = 'timestamp,value'

# Ideally, one row count with a list of *all* sensors, e.g., {1_000_000: ['1', '2', ... '500']}
row_count_sensors_map = defaultdict(list)

row_count_discrepancy = False
header_error = False
column_error = False

with open('Sensor_stats.csv', newline='') as f:
    reader = csv.DictReader(f)
    for stat_row in reader:
        sensor = stat_row['Sensor']

        header = stat_row['Header']
        if header != HEADER:
            header_error = True
            print(f'Error with sensor {sensor}, header problem: expected {HEADER}, got {header}')

        min_cols = int(stat_row['Min Cols'])
        max_cols = int(stat_row['Max Cols'])
        if min_cols != COLS_NUM or max_cols != COLS_NUM:
            column_error = True
            print(f'Error with sensor {sensor}, column-count problem: some rows have as few as {min_cols}, some have as many as {max_cols}')

        row_count = int(stat_row['Num Rows'])
        row_count_sensors_map[row_count].append(sensor)

# More than one row count was found
if len(row_count_sensors_map.keys()) > 1:
    row_count_discrepancy = True
    print('Error, found different row counts across all sensor CSVs:')
    for row_count, sensors in row_count_sensors_map.items():
        sensors = sensors[:20] + ['...'] if len(sensors) > 20 else sensors
        print(f'  Row count {row_count}: {",".join(sensors)}')

if header_error or column_error or row_count_discrepancy:
    sys.exit(1)

# Else, all is good!
row_count = list(row_count_sensors_map.keys())[0]
sensors = row_count_sensors_map[row_count]
print(f'All {len(sensors)} sensors are conformant:')
print(f'  Header:  "{HEADER}"')
print(f'  Columns: {COLS_NUM}')
print(f'  Rows:    {row_count}')

gen_sensor_data.py

#!/usr/bin/env python3
import csv
import random

for i in range(1, 501):
    fname = f'sensor{i}.csv'
    with open(fname, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['timestamp', 'value'])

        # Generate a series of timestamps, all sequential, but with some random variance on the order of 1-in-10 seconds
        for k in range(0, 10_000_000, 10):
            var = random.randrange(1, 10)
            ts = k + var

            whole = random.randrange(0, 20)
            part = random.randrange(0, 10)
            val = f'{whole}.{part}'

            writer.writerow([ts, val])

    print(f'Wrote {fname}')

get_sensor_stats.py

#!/usr/bin/env python3
import csv
import glob

# Get all sensor CSVs
sensor_filenames = glob.glob('sensor*.csv')

# Sort: trim leading 6 chars, 'sensor', and trailing 4 chars, '.csv', leaving just the number in the middle
sensor_filenames = sorted(sensor_filenames, key=lambda x: int(x[6:-4]))

# After iterating all rows of a sensor CSV, record some stats for that CSV
ROW_TMPL = {'Sensor': None, 'Header': None,
            'Min Cols': None, 'Max Cols': None, 'Num Rows': None}
all_rows = []

for sensor_fname in sensor_filenames:
    f = open(sensor_fname, newline='')
    reader = csv.reader(f)

    header = next(reader)
    min_cols = len(header)
    max_cols = len(header)

    # Don't count header as a row
    row_count = 0
    for row in reader:
        row_count += 1
        col_count = len(row)
        if col_count < min_cols:
            min_cols = col_count
        if col_count > max_cols:
            max_cols = col_count

    row = dict(ROW_TMPL)  # copy dict/hash-map
    row['Sensor'] = sensor_fname[6:-4]
    row['Header'] = ','.join(header)
    row['Min Cols'] = min_cols
    row['Max Cols'] = max_cols
    row['Num Rows'] = row_count
    all_rows.append(row)

    f.close()

with open('Sensor_stats.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=ROW_TMPL.keys())
    writer.writeheader()
    writer.writerows(all_rows)