Last active
January 8, 2025 08:51
-
-
Save sevab/d0a10dbba82d87f87a9244f326299b9b to your computer and use it in GitHub Desktop.
Filter CSV rows matching substring (case insensitive) in Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This script filters rows from a CSV file based on a case-insensitive substring match. | |
Usage: | |
python script.py <csv_filename> <substring> [--preserve-header] [--columns=column1,column2,...] | |
Arguments: | |
<csv_filename> The name of the input CSV file to process. | |
<substring> The substring to search for (case-insensitive). | |
[--preserve-header] (Optional) Include this flag to preserve the header row in the output file. | |
[--columns=...] (Optional) Specify a comma-separated list of column names to search against. | |
If not provided, all columns will be searched. | |
Output: | |
A new CSV file named "filtered_<csv_filename>" is created, containing only the rows | |
that match the specified substring. If the --preserve-header flag is used, the header | |
row will also be included in the output file. | |
Example: | |
python script.py data.csv apple --preserve-header --columns=name,address | |
- This will create a file "filtered_data.csv" containing rows where the substring "apple" | |
(case-insensitive) appears in the "name" or "address" columns, including the header row. | |
Error Handling: | |
- If the input file does not exist, an error message is displayed. | |
- Other exceptions are caught and printed for debugging purposes. | |
Author: | |
Seva Baskin | |
""" | |
import csv | |
import sys | |
if len(sys.argv) < 3: | |
print("Usage: python script.py <csv_filename> <substring> [--preserve-header] [--columns=column1,column2,...]") | |
sys.exit(1) | |
input_filename = sys.argv[1] | |
substring = sys.argv[2].lower() | |
preserve_header = "--preserve-header" in sys.argv | |
columns_arg = next((arg for arg in sys.argv if arg.startswith("--columns=")), None) | |
columns = columns_arg.split("=", 1)[1].split(",") if columns_arg else None | |
try: | |
output_filename = f"filtered_{input_filename}" | |
with open(input_filename, mode="r", encoding="utf-8") as infile, \ | |
open(output_filename, mode="w", encoding="utf-8", newline="") as outfile: | |
reader = csv.reader(infile) | |
writer = csv.writer(outfile) | |
header = next(reader, None) | |
if preserve_header and header: | |
writer.writerow(header) | |
# Map column names to indices if columns are specified | |
column_indices = None | |
if columns and header: | |
column_indices = [header.index(col) for col in columns if col in header] | |
for row in reader: | |
if column_indices: | |
# Search only specified columns | |
if any(substring in row[idx].lower() for idx in column_indices): | |
writer.writerow(row) | |
else: | |
# Search all columns | |
if any(substring in cell.lower() for cell in row): | |
writer.writerow(row) | |
print(f"Filtered rows written to {output_filename}") | |
except FileNotFoundError: | |
print(f"Error: File {input_filename} not found.") | |
except ValueError as e: | |
print(f"Error: {e}. Check if column names are correct.") | |
except Exception as e: | |
print(f"An error occurred: {e}") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment