Last active
April 3, 2024 09:10
-
-
Save feklee/7fbdc380abd41fb5d75b5a35e7324463 to your computer and use it in GitHub Desktop.
Finds URLs of files in my directory tree
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
# All URLs of the format <http://f76.eu/a*> are for identifying my | |
# documents. See: http://f76.eu/a0 | |
# | |
# This script finds all files when run from the root of the directory | |
# tree with my documents. | |
# Example configuration `~/.config/parse_docurls.conf`: | |
# | |
# doc_dir = '~/windows/Dropbox/My Data' | |
# max_file_size = 1 # KiB | |
# Felix E. Klee <[email protected]> | |
use strict;
use warnings;
use feature "state";

use ConfigReader::Simple;

# Fixed locations; glob expands the leading "~".
my $config_filename = glob "~/.config/parse_docurls.conf";
my $output_filename = glob "~/.local/share/docurl_cache";

# Populated from the config file at startup (see read_config).
my $doc_dir;
my $max_file_size;
# Read the user's configuration file (~/.config/parse_docurls.conf).
#
# Returns the pair ($doc_dir, $max_file_size).  Dies when the file
# cannot be parsed, or when doc_dir is missing or false (a path of
# "0" or "" would also trigger the die, which is acceptable here).
sub read_config {
    my $config = ConfigReader::Simple->new($config_filename);
    # On failure the constructor returns a non-reference and leaves
    # the reason in the package variable $ERROR.
    die "Could not read config: $ConfigReader::Simple::ERROR\n"
        unless ref $config;
    my $doc_dir = $config->get('doc_dir') || die "Missing config";
    my $max_file_size = $config->get("max_file_size");
    # Expand the path via glob; the inner single quotes keep glob
    # from splitting on whitespace (e.g. "My Data").
    # NOTE(review): quoting may also suppress "~" expansion in some
    # File::Glob modes — confirm with a doc_dir that starts with "~".
    $doc_dir = glob "'$doc_dir'";
    return ($doc_dir, $max_file_size);
}
# All IDs start with an "a", which is followed by the actual
# identifier (the "sub ID").  Extract the sub ID; die on any ID that
# lacks the "a" prefix.
sub sub_id {
    my $id = shift;
    die('ID "' . $id . '" is in wrong format')
        unless substr($id, 0, 1) eq 'a';
    return substr($id, 1);
}
# Inverse of sub_id: rebuild a full ID by restoring the "a" prefix.
sub id_from_sub_id {
    my $sub_id = shift;
    return "a$sub_id";
}
# Return the ID following $id.  Sub-ID digits count in the order
# 0-9, a-z, A-Z; when the most significant digit overflows past "Z"
# the sub ID grows by one digit (e.g. "aZ" -> "a00").
sub next_id {
    my $id = shift;
    # Same format check sub_id performs, inlined here.
    die('ID "' . $id . '" is in wrong format')
        unless substr($id, 0, 1) eq 'a';
    # Work least-significant digit first.
    my @digits = reverse split //, substr($id, 1);
    my $count  = scalar @digits;
    for my $pos (0 .. $count - 1) {
        my $digit = $digits[$pos];
        if ($digit =~ /[0-8a-yA-Y]/) {
            # Plain increment within a digit band; no carry.
            $digits[$pos] = chr(ord($digit) + 1);
            last;
        }
        if ($digit eq '9') { $digits[$pos] = 'a'; last; }
        if ($digit eq 'z') { $digits[$pos] = 'A'; last; }
        # "Z" wraps to "0" and carries into the next position; a
        # carry out of the last (most significant) digit widens it.
        $digits[$pos] = ($pos == $count - 1) ? '00' : '0';
    }
    return 'a' . join('', reverse @digits);
}
# Turn one "./<path>:DocURL: http://f76.eu/<id>" line (as emitted by
# grep -H) into "<id>\t<path>".  Lines that do not match are only
# chomped and returned unchanged.
sub format_line {
    my $line = shift;
    # When designing regular expressions, beware of carriage returns
    # in DOS style line endings.  They can cause weird effects in
    # substitutions.
    chomp($line);
    # The dot in "f76.eu" is now escaped; unescaped it matched any
    # character (e.g. "f76Xeu").  The m{} delimiter avoids the
    # leaning-toothpick escapes of the URL's slashes.
    $line =~ s{\./(.*):DocURL:\s*http://f76\.eu/([[:alnum:]]*)}{$2\t$1}g;
    return $line;
}
# Compare two IDs for sorting: shorter IDs first, then digit order
# 0-9, a-z, A-Z.  Swapping the case of both copies makes a plain
# string cmp yield that digit order, because ASCII sorts uppercase
# before lowercase.  Returns <0, 0 or >0 like cmp.
sub compare_ids {
    my ($x, $y) = @_;
    $x =~ tr/A-Za-z/a-zA-Z/;
    $y =~ tr/A-Za-z/a-zA-Z/;
    # The previous version computed this expression twice and threw
    # the first result away; the dead statement is gone.
    return (length $x <=> length $y || $x cmp $y);
}
# Sort comparator for "<id>\t<path>" lines; $a and $b are supplied
# by sort.  Orders by the ID in the first tab-separated field.
sub compare_lines {
    my ($id_a) = split(/\t/, $a);
    my ($id_b) = split(/\t/, $b);
    return compare_ids($id_a, $id_b);
}
# Report (on STDERR) any run of IDs missing between $$expected_id
# and $id.
#
# $expected_id is a reference to a scalar; it is advanced with
# next_id past every missing ID, so on return it compares >= $id.
# Returns the pair ($gap_found, $expected_id_found), the latter true
# when $id is exactly the ID that was expected next.
sub report_gap {
    my ($id, $expected_id) = @_;
    my $first_id = $$expected_id;   # first potentially missing ID
    my $last_missing_id;
    # Step the expected ID forward until it catches up with $id;
    # every ID stepped over is absent from the document tree.
    while (compare_ids($id, $$expected_id) > 0) {
        $last_missing_id = $$expected_id;
        $$expected_id = next_id($$expected_id);
    }
    my $gap_found = defined($last_missing_id);
    if ($gap_found) {
        # Print either "Missing: X" or a range "Missing: X - Y".
        print STDERR "Missing: ".$first_id;
        if ($last_missing_id ne $first_id) {
            print STDERR " - ".$last_missing_id;
        }
        print STDERR "\n";
    }
    my $expected_id_found = $id eq $$expected_id;
    return ($gap_found, $expected_id_found);
}
# Report (on STDERR) when $id equals the ID passed in the previous
# call.  Callers pass IDs in sorted order, so duplicates arrive
# adjacently.  Returns true iff $id is a duplicate.
sub report_if_duplicate {
    my $id = shift;
    state $prev_id;
    # Guard the very first call: $prev_id is still undef there and
    # the bare "eq" used to emit an uninitialized-value warning.
    my $duplicate_found = defined($prev_id) && $id eq $prev_id;
    $prev_id = $id;
    print STDERR "Duplicate: ".$id."\n" if $duplicate_found;
    return $duplicate_found;
}
# Check the sorted "<id>\t<path>" lines for duplicate IDs and for
# gaps in the ID sequence, starting from "a0".  Problems are printed
# on STDERR, and the returned copy of @lines is augmented with
# "# ..." marker lines (plus a trailing summary when anything was
# found).
sub report_inconsistencies {
    my @lines = @_;
    my $expected_id = "a0";
    my @augmented_lines;
    my $errors_found;
    foreach (@lines) {
        my ($id) = split(/\t/, $_);
        # report_if_duplicate keeps the previous ID in "state", so
        # it only works because the lines arrive sorted.
        my $duplicate_found = report_if_duplicate $id;
        if ($duplicate_found) {
            # Keep the offending line, but comment it out.
            $_ = "# Duplicate: ".$_;
            $errors_found = 1;
        }
        # report_gap advances $expected_id (by reference) past any
        # missing IDs.
        my ($gap_found, $expected_id_found) = report_gap($id, \$expected_id);
        if ($expected_id_found) {
            $expected_id = next_id($expected_id);
        }
        if ($gap_found) {
            push(@augmented_lines, "# Gap");
            $errors_found = 1;
        }
        push(@augmented_lines, $_);
    }
    if ($errors_found) {
        push(@augmented_lines, "");
        push(@augmented_lines, "# Errors found, see comments above.");
    }
    return @augmented_lines;
}
# Current local time as "YYYY-MM-DD hh:mm:ss".
sub timestamp {
    my @t = localtime(time);
    # localtime: 0=sec 1=min 2=hour 3=mday 4=mon(0-based) 5=year-1900
    return sprintf('%04d-%02d-%02d %02d:%02d:%02d',
                   $t[5] + 1900, $t[4] + 1, $t[3], $t[2], $t[1], $t[0]);
}
# Return a copy of the given lines with a blank separator line and a
# "# Last update: ..." footer appended.
sub append_timestamp {
    my @annotated = @_;
    push @annotated, "", "# Last update: " . timestamp;
    return @annotated;
}
# Strip CR/LF anywhere plus leading/trailing whitespace from the
# caller's variable, in place: $_[0] aliases the first argument.
sub clean_up {
    for ($_[0]) {
        s/\r|\n//g;
        s/^\s+|\s+$//g;
    }
}
# Print a human-readable summary of the run to STDOUT, including the
# hint that raising max_file_size may recover missing IDs.
sub summarize {
    print "Finished parsing: $doc_dir\n";
    print "Wrote output to: $output_filename\n";
    print "Maximum file size searched: $max_file_size KiB\n";
    print "If there are missing IDs, consider increasing file size for searching.\n";
}
# Write the cache file: a fixed explanatory header followed by the
# given lines, one per line.  Dies when the file cannot be opened or
# when flushing at close fails.
sub write_output {
    # Lexical filehandle instead of the old package-global OUT.
    open(my $out, '>', $output_filename) or die $!;
    print {$out} <<~EOF;
    # DocURL cache. For information about f76.eu DocURLs and how to
    # rebuild the cache, see:
    #
    # http://f76.eu/a0
    #
    # This file was generated by:
    #
    # $0
    #
    # Felix E. Klee <felix.klee\@inka.de>
    EOF
    print {$out} join("\n", @_);
    print {$out} "\n";
    # Buffered write errors only surface at close, so check it.
    close($out) or die $!;
}
# Size argument for GNU `find`.  An upper bound written as "-Nk"
# matches files strictly below N KiB, so add one to include files of
# exactly $max_file_size KiB.
sub find_size {
    my $limit_kib = $max_file_size + 1;
    return "-${limit_kib}k";
}
# Run find+grep over the current directory tree and return the
# formatted "<id>\t<path>" lines; each line is also echoed to
# STDOUT.  Dies when the pipe cannot be opened.
sub parse_docs {
    my $find_cmd =
        'find -type f -size '.find_size.' -exec grep -Hi "^DocURL:" {} \;';
    my @lines;
    # Three-arg pipe open with a lexical handle, checked for failure
    # (the old bareword FIND open was unchecked and never closed).
    # A single command string still goes through the shell, which
    # the quoting inside $find_cmd relies on.
    open(my $find_fh, '-|', $find_cmd)
        or die "Cannot run find: $!";
    while (<$find_fh>) {
        my $line = format_line($_);
        clean_up $line;
        print "$line\n";
        push(@lines, $line);
    }
    close($find_fh);
    return @lines;
}
# Main program: read config, scan the document tree, sort and check
# the collected lines, write the cache, report.
($doc_dir, $max_file_size) = read_config;
# Low-precedence "or" is required here: with "||" the die bound to
# $doc_dir (always true), so a failed chdir went unnoticed and the
# scan silently ran in the wrong directory.
chdir $doc_dir or die "Cannot enter " . $doc_dir;
my @lines = parse_docs;
my @sorted_lines = sort compare_lines @lines;
my @augmented_lines = report_inconsistencies(@sorted_lines);
@augmented_lines = append_timestamp(@augmented_lines);
write_output @augmented_lines;
summarize;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment