Last active
August 29, 2015 14:08
-
-
Save dnmfarrell/9d0f1e568c564ff69b10 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use strict;
use warnings;
use autodie;
use File::Basename qw(fileparse);
use HTML::Entities;
# This program can be called with a filepath or a folder path containing
# files to clean.

# Refuse to run unless an argument was given AND it names something that
# exists. Both conditions must hold: testing "@ARGV or not -e ..." lets an
# empty argument list slip through, because -e on undef is false.
die 'Provide a filepath to clean' unless @ARGV and -e $ARGV[0];

# if it's a file
if (-f $ARGV[0])
{
    clean_file($ARGV[0]);
}
# if it's a dir, loop through the files
elsif (-d $ARGV[0])
{
    opendir my $IN_DIR, $ARGV[0];

    # Get the complete file list BEFORE we start writing new files into the
    # same dir, so the freshly written clean_* files are not picked up.
    my @entries = readdir $IN_DIR;
    closedir $IN_DIR;

    for my $entry (@entries)
    {
        # skip the "." and ".." entries
        next if $entry =~ /^\.+$/;

        my $path = $ARGV[0] . '/' . $entry;

        # only regular files can be cleaned; skip sub-directories and the like
        next unless -f $path;

        # clean each file
        clean_file($path);
    }
}
# Strip the HTML markup from one file and write the plain-text result next
# to it.
#
# Takes:   the path of the file to clean (decoded as UTF-8).
# Writes:  a file named "clean_<original name>" (encoded as UTF-8) in the
#          same directory as the input file.
# Returns: nothing.
# Dies:    via autodie if a file cannot be opened or closed.
sub clean_file
{
    my $filepath = shift;

    # fileparse splits on whatever the current platform treats as a path
    # separator and hands back the directory part with its trailing
    # separator ("./" for a bare file name), so plain concatenation is safe.
    my ($file_name, $dir_path) = fileparse($filepath);
    my $clean_path = $dir_path . 'clean_' . $file_name;

    # :encoding(UTF-8) validates the input; the bare :utf8 layer does not.
    open my $IN_FILE, '<:encoding(UTF-8)', $filepath;

    # Slurp the whole file instead of going line by line: a tag may be
    # wrapped over several lines, and a per-line substitution would only
    # remove part of it.
    my $html = do { local $/; <$IN_FILE> };
    close $IN_FILE;
    $html = '' unless defined $html;

    # remove tags ([^>] also matches newlines, so multi-line tags go too)
    $html =~ s/<[^>]+>//g;

    open my $OUT_FILE, '>:encoding(UTF-8)', $clean_path;

    # decode_entities function comes from HTML::Entities module
    print {$OUT_FILE} decode_entities($html);

    # Close explicitly so that autodie reports a failed (buffered) write.
    close $OUT_FILE;

    return;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment