Created
June 22, 2022 20:01
-
-
Save xrat/d8b92e32e22af0cf8e05c1e7538cd2f5 to your computer and use it in GitHub Desktop.
Awk script to flatten an rss2email multipart digest
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Flatten an rss2email multipart digest
#
# Tested only with GNU Awk, and with rss2email up to 3.13.1.
# This script expects the "encodings" setting in r2e to prefer UTF-8.
# It also expects that the input was run through `formail -c`, and it
# requires reformime (on e.g. Debian, part of the maildrop package).
# Print one flattened digest entry to standard output.
#
# Takes no arguments and returns nothing; it reads these globals:
#   subject - entry title, printed after a "> " marker
#   text    - collected body lines of the entry
#   id      - value taken from the X-RSS-ID header
#   date    - entry date, printed in parentheses after the id
function output() {
    # separator line between entries
    print "-----------------"
    #print "From: " from
    print "> " subject
    print ""
    print text
    #print url   # the url is included in text
    print id " (p. " date ")"
    print ""
}
# Preserve the outer mail header but change its content type.
#
# The range runs from the first input line up to and including the first
# empty line. Header lines are copied through, except Content-Type and
# Content-Transfer-Encoding, which are replaced by the plain-text UTF-8
# values printed just before the empty separator line.
#
# headerdone is raised only when the separator line is reached. Setting it
# earlier lets the (headerdone) rule run on header lines, where its
# getline calls swallow lines that this rule then never gets to print.
NR == 1, /^$/ {
    if ($0 ~ /^$/) {
        print "Content-Type: text/plain; charset=\"utf-8\""
        print "Content-Transfer-Encoding: 8bit"
        print
        headerdone = 1
        next
    }
    if ($0 !~ /^(Content-Type|Content-Transfer-Encoding):/) print
}
# Print the collected entry when a MIME boundary line is reached.
# Fires only if an X-RSS-ID has been seen (id is non-empty); "next" then
# skips the remaining rules for the boundary line itself.
/^--=========/ && id { output(); next }
# Quote a string so that /bin/sh sees it as exactly one literal word:
# wrap it in single quotes and rewrite each embedded ' as '\''.
# Header values come from feeds, so they must never reach the shell raw.
function shell_quote(s) {
    gsub(/'/, "'\\''", s)
    return "'" s "'"
}

# Collect the fields of each digest entry once the outer header is done.
#
# Sets the globals contenttype, subject, date, id, url and text. When an
# X-RSS-URL header is seen, the lines that follow are read as the entry
# body up to the next boundary line, and the entry is printed via output().
(headerdone) {
    if ($1 == "Content-Type:") {
        # NOTE(review): contenttype is assigned here but not read anywhere
        # in the visible script.
        if ($2 == "text/plain;" && $3 == "charset=\"utf-8\"") contenttype = "utf8"
        else if ($2 == "text/plain;" && $3 ~ /iso-8859-1/) contenttype = "l9"
        else contenttype = substr($0, 15)
    }
    if ($1 == "Subject:") {
        subject = substr($0, 10)
        # Append folded continuation lines (leading space or tab).
        # "> 0" matters: getline returns -1 on a read error, which a bare
        # truth test would treat as success and loop on forever.
        # When the loop stops, $0 holds the line after the subject; the
        # tests below still get to examine it.
        while ((getline) > 0 && $0 ~ /^[ \t]/) { subject = subject $0 }
        # Decode RFC 2047 encoded words; on failure subject keeps its raw value.
        cmd = "reformime -c utf-8 -h " shell_quote(subject)
        cmd | getline subject
        close(cmd)
    }
    if ($1 == "Date:") { # example: Date: Fri, 24 Sep 2021 16:15:00 -0000
        date = substr($0, 7)
        # Reformat with date(1) -d; on failure date keeps its raw value.
        cmd = "date -d " shell_quote(date) " +'%Y-%m-%d %H:%M'"
        cmd | getline date
        close(cmd)
    }
    # if ($1 == "From:") {
    #     from = substr($0, 7)
    #     cmd = "reformime -c utf-8 -h " shell_quote(from); cmd | getline from; close(cmd)
    # }
    # I like X-RSS-ID b/c it sometimes has short urls
    if ($1 == "X-RSS-ID:") {
        id = $2
    }
    if ($1 == "X-RSS-URL:") {
        url = $2 # only used to recognise the "URL: ..." line inside the body
        text = ""
        # process body of message
        lastline = ""
        while ((getline) > 0 && $0 !~ /^--=========/) {
            # squeeze empty lines to 1 (will also skip leading empty lines)
            if (/^$/ && lastline == "") continue
            # skip images
            if (/^\[!\[\]\(.*\)$/) continue
            # Strip the "URL: " label. Compare literally: URLs are full of
            # regex metacharacters (?, +, .) and must not be used as a pattern.
            if (index($0, "URL: " url) == 1) $0 = substr($0, 6)
            # compile text
            if (text) { text = text "\n" $0 } else { text = $0 }
            lastline = $0
        }
        # The loop stopped on a boundary line: the entry is complete.
        if ($0 ~ /^--=========/) { output(); next }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment