Last active
November 3, 2019 13:52
-
-
Save ettorerizza/6d11f8ce91cfb288c9865d066f93f059 to your computer and use it in GitHub Desktop.
Import the content of each file in a folder into a single CSV,
where each row contains the content of one file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
""" | |
Import the content of each file in a folder into a single CSV,
where each row contains the content of one file.
Arguments:
-i or --inputfolder : path to the folder containing the files
-o or --output : path and name of the resulting CSV file
-f or --filetype (optional) : filter on file extension (html, txt, etc.); default: every extension
Example of use on the command line:
python textfolder_to_csv.py -i test -o test.csv
You can also filter by file extension, given without the dot:
python textfolder_to_csv.py -i path/to/filefolder -o path/test.csv -f html
""" | |
import csv | |
from pathlib import Path | |
import argparse | |
def read_flattened(file_path):
    """Return the content of *file_path* collapsed onto a single line.

    The file is read as bytes and decoded line by line as UTF-8; byte
    sequences that are not valid UTF-8 are dropped (``errors='ignore'``).
    Each line is stripped of surrounding whitespace and the lines are
    joined with a single space.
    """
    with open(str(file_path), 'rb') as one_text:
        return ' '.join(
            line.decode(encoding='utf-8', errors='ignore').strip()
            for line in one_text
        )


def folder_to_csv(input_folder, output_file, filetype="*"):
    """Write one CSV row per file found directly inside *input_folder*.

    Args:
        input_folder: folder whose files are read (not recursive).
        output_file: path of the CSV to create; an existing file is
            overwritten.
        filetype: file extension to keep, with or without a leading dot.
            The default ``"*"`` keeps every file that has an extension.

    Returns:
        The number of data rows written (the header is not counted).
    """
    folder = Path(input_folder)
    # Tolerate ".html" as well as "html"; a doubled dot would match nothing.
    pattern = '*.%s' % (filetype.lstrip('.') or '*')
    # newline='' is required by the csv module; without it every row ends
    # in "\r\r\n" on Windows and shows up as a blank line.
    with open(str(output_file), 'w', encoding='utf-8', newline='') as out_file:
        # Resolved after opening so that the file is guaranteed to exist.
        output_path = Path(output_file).resolve()
        csv_out = csv.writer(out_file)
        csv_out.writerow(['FileName', 'Content'])
        written = 0
        # Sorted so that the row order does not depend on the file system.
        for file_name in sorted(folder.glob(pattern)):
            # The pattern can also match directories, which cannot be read.
            if not file_name.is_file():
                continue
            # Never feed the half-written output back in as an input row.
            if file_name.resolve() == output_path:
                continue
            csv_out.writerow([str(file_name), read_flattened(file_name)])
            written += 1
    return written


def main(argv=None):
    """Parse the command line and build the CSV.

    Args:
        argv: list of arguments to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        ``0`` on success. Invalid arguments end the process through
        argparse's usual usage error.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--inputfolder", type=str, required=True,
                        help="input folder")
    parser.add_argument("-o", "--output", type=str, required=True,
                        help="output csv")
    parser.add_argument("-f", "--filetype", type=str,
                        help="file extension without the dot", default="*")
    args = parser.parse_args(argv)
    if not Path(args.inputfolder).is_dir():
        # A mistyped folder would otherwise yield a header-only CSV silently.
        parser.error("input folder not found: %s" % args.inputfolder)
    folder_to_csv(args.inputfolder, args.output, args.filetype)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment