Last active
March 13, 2024 14:50
Fix the Markdown in the old format v1 of signal-export
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl
# 2024-03-12 huy
# Retroactively fixes past chats to conform with the current version of signal-export.
# This is important because signal-export detects duplicates only if the messages match exactly,
# and the format changes have created duplicates that aren't exactly the same
# and thus can't be automatically removed.
# Note this script cleans up entries so that the next run of signal-export should detect duplicates but
# this script doesn't itself remove duplicates.
#
# Usage: run it from the directory that contains index.md. The input is never modified;
# the fixed copy is written to index.FIXED.md in the same directory.
#
# Details:
# - For transition from format 1 to format 2, adds newlines at end of messages:
#   See https://github.com/carderne/signal-export/commit/4816f1b3480a5785b9fddfa7b44841333ee3f755
# - For transition from format 1 or 2 to format 3, fixes blockquotes:
#   See https://github.com/carderne/signal-export/commit/4816f1b3480a5785b9fddfa7b44841333ee3f755
#   and https://github.com/carderne/signal-export/pull/112/commits/44213b23d4b3b53a9d523e89952ad76f8600de50

# `use warnings` replaces the `-w` shebang switch: `#!/usr/bin/env perl -w` is not portable
# because some kernels pass "perl -w" to env as one single argument.
use strict;
use warnings;

# Optionally, specify the timestamp of the first message that was exported using --newlines (v1.8.0+).
# If not defined (assign undef), then we assume that you have never run with --newlines.
# WARNING: if there are multiple timestamps that match, you'll have to manually fix the events
# in the index.md yourself before running this script to disambiguate.
# But if you do so, be careful not to save the file in an editor that automatically
# strips trailing whitespace as that will break signal-export's deduplication.
my $START_OF_FORMAT2 = '[2024-01-08 10:45]';

# Optionally, specify the timestamp of the first message that was exported using the blockquote
# fixes from https://github.com/carderne/signal-export/tree/better-quotes
# Left undefined by default; uncomment the assignment below to enable it.
my $START_OF_FORMAT3;
#$START_OF_FORMAT3 = '[2024-03-11 09:00]';

my $SRC = './index.md';
my $DST = './index.FIXED.md';

# A line that starts a message: "[YYYY-MM-DD HH:MM] " at the beginning of the line.
my $DATE_LINE_RE = qr/^\[\d{4}-\d{2}-\d{2} \d{2}:\d{2}\] /;

# No encoding layer on purpose: lines are passed through as raw bytes so that
# everything this script does not explicitly rewrite stays byte-identical.
open(my $src_fh, '<', $SRC) or die "Can't open $SRC: $!";
open(my $dst_fh, '>', $DST) or die "Can't open $DST: $!";

# State of the line-by-line state machine.
my $format = 1;                          # format regime (1, 2 or 3) of the current line
my $just_saw_date = 0;                   # previous line was a message timestamp line
my $just_saw_blank_line_after_date = 0;  # format 2: saw the blank line following a timestamp line
my $newline_needed = 0;                  # format 1: a message was seen, so the next one needs a blank line before it
my $in_blockquote = 0;                   # 0 = outside a quote, 1 = quote opened, 2 = past its first quoted line

while (my $line = <$src_fh>) {
    # Format 1 regime: fix newlines and (best effort) blockquotes
    if ($format == 1) {
        # This is the second line of the blockquote, which is the only line that's properly
        # quoted so we don't need to change anything.
        if ($in_blockquote == 1 && $line =~ /^> /) {
            $in_blockquote = 2;
            $just_saw_date = 0;
        }
        # If in blockquote, keep quoting until we detect the end of the blockquote
        elsif ($in_blockquote && $line !~ /^>$/) {
            # Note that unless this is the second line of the blockquote,
            # this will match `> blah`, which will be properly converted to `> > blah`
            $line =~ s/^/> /;
            $just_saw_date = 0;
        }
        elsif ($line =~ $DATE_LINE_RE) {
            # Add the newline for the previous message.
            # NOTE: the newline is only emitted when the next timestamp line shows up,
            # so the very last message of the file does not get one here.
            print {$dst_fh} "\n" if $newline_needed;
            $newline_needed = 1;
            $just_saw_date = 1;
            # Check for the date marker for the beginning of format 2
            # NOTE: rindex(..., 0) == 0 is just an efficient way to check for a prefix (nothing special)
            if (defined $START_OF_FORMAT2 && rindex($line, $START_OF_FORMAT2, 0) == 0) {
                $format = 2;
                # Format 2 already has the newlines, so stop adding them.
                $newline_needed = 0;
            }
        }
        else {
            # Check for start of blockquote: a bare `>` right after the timestamp line,
            # which is turned into an empty line.
            if ($just_saw_date && !$in_blockquote && $line =~ /^>$/) {
                $line =~ s/^>//;
                $in_blockquote = 1;
            }
            # If end of blockquote
            # XXX of course, this is ambiguous: what if the quote actually contains a `>`?
            elsif ($in_blockquote && $line =~ /^>$/) {
                $line =~ s/^>//;
                $in_blockquote = 0;
            }
            $just_saw_date = 0;
        }
    }
    # Format 2 regime: only fix blockquotes (best effort)
    # This must be a comparison (==). An assignment here is always true and resets
    # $format to 2 on every line, so the format 3 regime would never be honoured.
    elsif ($format == 2) {
        # This is the second line of the blockquote, which is the only line that's properly
        # quoted so we don't need to change anything.
        # NOTE(review): outside a quote the blank-line flag is only cleared by the next
        # timestamp line, so a later `> ` line in the same message also lands here.
        if ($just_saw_blank_line_after_date && $line =~ /^> /) {
            $in_blockquote = 2;
            $just_saw_date = 0;
            $just_saw_blank_line_after_date = 0;
        }
        # If in blockquote, keep quoting until the next blank line
        # XXX This is very ambiguous as the quotation could just have an empty line,
        # but there's nothing we can do.
        # However, you may be able to detect this discrepancy after the next export
        # with the proper blockquoting, because the message(s) won't be deduplicated
        # and you'll notice messages out of order:
        # - Find the older message, fix the blockquote manually (or just copy the new version of the message)
        # - Re-run the export and the duplicates should be removed
        elsif ($in_blockquote && $line !~ /^$/) {
            $line =~ s/^/> /;
            $just_saw_date = 0;
            $just_saw_blank_line_after_date = 0;
        }
        elsif ($line =~ $DATE_LINE_RE) {
            $just_saw_date = 1;
            $just_saw_blank_line_after_date = 0;
            $in_blockquote = 0;
            # Check for the date marker for the beginning of format 3
            # (Efficient way to check for prefix)
            if (defined $START_OF_FORMAT3 && rindex($line, $START_OF_FORMAT3, 0) == 0) {
                $format = 3;
            }
        }
        else {
            # Check for start of blockquote: the empty line right after the timestamp line
            if ($just_saw_date && !$in_blockquote && $line =~ /^$/) {
                $just_saw_blank_line_after_date = 1;
            }
            # If end of blockquote
            # XXX of course, this is ambiguous: what if the quote actually contains an empty line
            elsif ($in_blockquote && $line =~ /^$/) {
                $in_blockquote = 0;
                $just_saw_blank_line_after_date = 0;
            }
            $just_saw_date = 0;
        }
    }
    # Format 3 regime: no change
    print {$dst_fh} $line;
}

close($src_fh) or die "Can't close $SRC: $!";
# Buffered write errors only surface at close, so check it.
close($dst_fh) or die "Can't close $DST: $!";
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment