Skip to content

Instantly share code, notes, and snippets.

@CountParadox
Created March 15, 2023 05:27
Show Gist options
  • Save CountParadox/68f69fb6ab608433b4781ecae7005e60 to your computer and use it in GitHub Desktop.
Save CountParadox/68f69fb6ab608433b4781ecae7005e60 to your computer and use it in GitHub Desktop.
import csv
import os
import xml.etree.ElementTree as ET
# Directory containing the dataset files
data_dir = '/path/to/dataset/files/'
# Threshold for outlier detection (in standard deviations from the mean)
threshold = 3
# Root element of the output XML report; one <record> child per replaced value
report = ET.Element('report')


def find_outlier_columns(row, limit):
    """Return the 0-based indexes of the outlier cells in *row*.

    A numeric cell is an outlier when it lies more than ``limit`` population
    standard deviations away from the mean of the *other* numeric cells in
    the same row.  Non-numeric cells (headers, labels, blanks) are never
    flagged and are left out of the statistics.

    Args:
        row: Sequence of cell strings, as produced by ``csv.reader``.
        limit: Allowed distance from the mean, in standard deviations.

    Returns:
        List of column indexes, in ascending order.  Empty when the row has
        fewer than three numeric cells, because a spread cannot be estimated
        from fewer than two other values.
    """
    # Parse every cell once, up front.  All statistics are taken from this
    # snapshot, so flagging one cell never affects how the others are judged.
    numbers = []
    for cell in row:
        try:
            numbers.append(float(cell))
        except ValueError:
            numbers.append(None)

    flagged = []
    for j, value in enumerate(numbers):
        if value is None:
            continue
        others = [x for k, x in enumerate(numbers)
                  if k != j and x is not None]
        # With zero other values the mean is undefined (division by zero);
        # with one, the deviation is always 0 and any difference would be
        # reported as an outlier.
        if len(others) < 2:
            continue
        mean = sum(others) / len(others)
        std_dev = (sum((x - mean) ** 2 for x in others) / len(others)) ** 0.5
        if abs(value - mean) > limit * std_dev:
            flagged.append(j)
    return flagged


def clean_file(path, limit, report_root):
    """Replace outliers in one CSV file with 'DATAERROR' and log each change.

    Args:
        path: Path of the CSV file; it is rewritten in place when changed.
        limit: Outlier threshold passed to ``find_outlier_columns``.
        report_root: XML element that receives one ``record`` child per
            replaced value (filename, 1-based line and column, old value,
            new value).

    Returns:
        Number of values that were replaced.
    """
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(path, newline='') as f:
        data = list(csv.reader(f))

    name = os.path.basename(path)
    replaced = 0
    for i, row in enumerate(data):
        for j in find_outlier_columns(row, limit):
            record = ET.SubElement(report_root, 'record')
            record.set('filename', name)
            record.set('line', str(i + 1))
            record.set('column', str(j + 1))
            record.set('value', str(float(row[j])))
            record.set('new_value', 'DATAERROR')
            row[j] = 'DATAERROR'
            replaced += 1

    # Only touch the file when something changed, so clean files keep their
    # original quoting and line endings.
    if replaced:
        with open(path, 'w', newline='') as f:
            csv.writer(f).writerows(data)
    return replaced


def main():
    """Clean every CSV file in ``data_dir`` and write ``report.xml``."""
    # Sorted so that the order of records in the report is reproducible.
    for filename in sorted(os.listdir(data_dir)):
        path = os.path.join(data_dir, filename)
        if filename.endswith('.csv') and os.path.isfile(path):
            clean_file(path, threshold, report)

    # XML without a declaration is read as UTF-8, so write it as UTF-8
    # regardless of the platform's default encoding.
    with open('report.xml', 'w', encoding='utf-8') as f:
        f.write(ET.tostring(report, encoding='unicode'))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment