Created
March 15, 2023 05:27
-
-
Save CountParadox/68f69fb6ab608433b4781ecae7005e60 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import os
import xml.etree.ElementTree as ET

# Set the directory containing the dataset files
data_dir = '/path/to/dataset/files/'

# Set the threshold for outlier detection (in standard deviations from the mean)
threshold = 3


def find_row_outliers(row, limit=None):
    """Return the outlier cells of one CSV row as ``(column_index, value)`` pairs.

    Each cell is compared against the mean and (population) standard
    deviation of the *other* cells in the same row; it is an outlier when it
    lies more than ``limit`` standard deviations from that mean.

    Args:
        row: Sequence of cell strings as produced by ``csv.reader``.
        limit: Number of standard deviations tolerated. Defaults to the
            module-level ``threshold``.

    Returns:
        A list of ``(zero_based_column, float_value)`` tuples in column
        order. The list is empty when the row has fewer than two cells
        (there are no peers to compare against) or when any cell is not
        parseable as a float (such rows are left untouched).
    """
    if limit is None:
        limit = threshold

    peer_count = len(row) - 1
    if peer_count < 1:
        # A lone cell has no peers; dividing by peer_count would raise
        # ZeroDivisionError.
        return []

    try:
        # Parse once, up front. Working on this snapshot means a cell that
        # gets flagged does not prevent later cells in the row from being
        # checked.
        numbers = [float(cell) for cell in row]
    except ValueError:
        # Skip rows that contain non-numeric values
        return []

    flagged = []
    for col, value in enumerate(numbers):
        peers = numbers[:col] + numbers[col + 1:]
        mean = sum(peers) / peer_count
        std_dev = (sum((p - mean) ** 2 for p in peers) / peer_count) ** 0.5
        # NOTE: when all peers are identical std_dev is 0, so any deviation
        # at all counts as an outlier.
        if abs(value - mean) > limit * std_dev:
            flagged.append((col, value))
    return flagged


def process_file(path, report, limit=None):
    """Replace outliers in one CSV file with 'DATAERROR' and log them.

    Args:
        path: Path of the CSV file to scan; it is rewritten in place only
            when at least one cell was replaced.
        report: ``xml.etree.ElementTree.Element`` that receives one
            ``<record>`` child per replaced cell (filename, 1-based line and
            column, original value, new value).
        limit: Outlier threshold forwarded to ``find_row_outliers``.

    Returns:
        The number of cells that were replaced.
    """
    filename = os.path.basename(path)

    # newline='' lets the csv module handle line endings itself, which is
    # required for quoted fields that contain embedded newlines.
    with open(path, newline='') as f:
        data = list(csv.reader(f))

    replaced = 0
    for line_no, row in enumerate(data, start=1):
        for col, value in find_row_outliers(row, limit):
            # Replace the outlier value with 'DATAERROR'
            row[col] = 'DATAERROR'
            # Add a record of the change to the output report
            record = ET.SubElement(report, 'record')
            record.set('filename', filename)
            record.set('line', str(line_no))
            record.set('column', str(col + 1))
            record.set('value', str(value))
            record.set('new_value', 'DATAERROR')
            replaced += 1

    if replaced:
        # Write the updated data back to the file
        with open(path, 'w', newline='') as f:
            csv.writer(f).writerows(data)
    return replaced


def main():
    """Scan every ``.csv`` file in ``data_dir`` and write ``report.xml``."""
    # Initialize the output XML report
    report = ET.Element('report')

    # Loop through all files in the data directory (sorted so the report
    # order is reproducible)
    for filename in sorted(os.listdir(data_dir)):
        if filename.endswith('.csv'):
            process_file(os.path.join(data_dir, filename), report)

    # Write the output report to a file. The serialized text carries no XML
    # declaration, so it must be stored as UTF-8 to be well-formed.
    with open('report.xml', 'w', encoding='utf-8') as f:
        f.write(ET.tostring(report, encoding='unicode'))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment