Last active
November 7, 2021 13:24
-
-
Save deliciouslytyped/5cd53011147a5634d64db59ec956320d to your computer and use it in GitHub Desktop.
Download the text messages of a (public) Gitter room without logging in, and view them as semi-readable output
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# NOTE: errexit (-e) is ignored inside a function whenever that function is invoked as part of
# an `&&` / `||` list (other than as its last command) or as an `if`/`while` condition. That is
# why -e can appear to be useless with functions; check critical commands explicitly.
set -euo pipefail
#TODO simplify this back down by separating the render functionality
#TODO port to tcl?
# =========== utilities =========== #
# (root, [extra glow args...]) -> ()
# Render every markdown page below "$root/md" with glow into "$root/out.bin" and open the result
# in less. NOTE: this function ends with `exit`, so it terminates the calling (sub)shell.
function render() {
  local root=$1
  shift
  local glow page

  # Use a glow built from nixpkgs when nix-build is available, otherwise whatever is on PATH.
  if command -v nix-build &> /dev/null; then
    glow=$(nix-build '<nixpkgs>' -I nixpkgs=channel:nixos-unstable -A glow --no-out-link)/bin/glow
  else
    glow=glow
  fi

  # Start from an empty scratch file so that leftovers of an interrupted run are not appended to.
  : > "$root/out.bin.new"

  # glow doesnt seem to handle large files well, so the pages are rendered one at a time.
  # Pages are visited in descending numeric order of their file names.
  while IFS= read -r page; do
    echo "$root/md/$page"
    # glow doesnt obey width when redirecting, hence the pipe through tee
    "$glow" -s dark "$@" "$root/md/$page" | tee -a "$root/out.bin.new" > /dev/null
  done < <(find "$root/md/" -type f -printf "%P\n" | sort -n -r)

  # Scratch file, final file and pager all live under $root, independent of the current directory.
  mv "$root/out.bin.new" "$root/out.bin"
  less -R "$root/out.bin"
  exit
}
# =========== utilities =========== #
# (url) -> (g CHANID, g TOKEN, g BEFOREID)
# Scrape the room page at `url` for the room id, the access token and the id of the last chat item
# found in the page. Returns non-zero, with a message on stderr, when the page cannot be downloaded
# or the room id / token cannot be extracted from it.
function getmagic() {
  local url=$1
  local page jsonvals

  page=$(curl --silent --fail "$url") || {
    echo "getmagic: could not download $url" >&2
    return 1
  }

  # we extract the necessary fields from a json object in script tags #TODO maybe the workings of the accessToken is in the public source somewhere?
  jsonvals=$(
    echo "$page" |
      gawk 'match($0, /window.troupeContext = (.*);<\/script>/, m) { print m[1] }' |
      jq -r ".troupe.id, .accessToken"
  ) || jsonvals=""

  # jq prints the two values on separate lines; split on newlines only so that each value is kept
  # intact. read reaches EOF before it sees its NUL delimiter and therefore reports failure even
  # though both variables were assigned, hence the `|| true`.
  # https://www.etalabs.net/sh_tricks.html via https://stackoverflow.com/a/6779351
  IFS=$'\n' read -r -d '' CHANID TOKEN <<< "$jsonvals" || true

  # jq -r prints the literal string "null" for a missing field, so reject that as well as "".
  if [[ -z "$CHANID" || "$CHANID" == null || -z "$TOKEN" || "$TOKEN" == null ]]; then
    echo "getmagic: could not find the room id and access token in $url" >&2
    return 1
  fi

  BEFOREID=$(
    echo "$page" |
      pup '[class~="chat-item"] json{}' | # narrow the elements and return them as json
      jq -r '.[] .class | capture("model-id-(?<id>[0-9a-f]{24})") .id' | # get the ids out of the class
      tail -n 1
  )
}
# (beforeid, chanid, token, counter, root) -> (g last_id, g retVal)
# Download the (up to 100) messages preceding `beforeid` into "$root/json/$counter.json".
# Afterwards retVal is "true" and last_id holds the id of the first item of the page, or retVal is
# "false" and last_id is empty when the request failed, jq failed, or the page had no items.
function getpage() {
  local beforeid=$1 chanid=$2 token=$3 counter=$4 root=$5
  local page_file="$root/json/$counter.json"
  local ids

  # I just used "copy as curl" but https://gitlab.com/gitterHQ/webapp/-/blob/master/server/api/v1/rooms/chat-messages.js
  #NOTE using -o as opposed to IO redirection means we dont write a file on failure?
  if ! curl --silent --fail "https://gitter.im/api/v1/rooms/$chanid/chatMessages?lookups%5B%5D=user&includeThreads=false&beforeId=$beforeid&limit=100" \
    -H "x-access-token: $token" \
    -o "$page_file"; then
    # Do not fall through to jq: a file left behind by an earlier run could be parsed by mistake.
    echo "getpage: request for page $counter failed" >&2
    last_id=
    retVal=false
    return 0
  fi

  #the next page is fetched by passing the next "beforeId", which is taken from the first item listed
  if ids=$(jq -r ".items | .[] .id" < "$page_file") && [[ -n "$ids" ]]; then
    last_id=${ids%%$'\n'*}
    retVal=true
  else
    # Either jq failed or the page has no items; an empty beforeId must not be handed on.
    last_id=
    retVal=false
  fi
}
# (counter, root) -> ()
# Convert "$root/json/$counter.json" into "$root/md/$counter.md": for every message a blank
# separator, a "username date" line and then the message text.
function extractMarkdown() {
  local counter=$1 root=$2
  local prog

  # The jq program is kept in a single-quoted string so that the shell leaves $names and
  # backticks alone; jq itself ignores the # comments.
  prog='
    .lookups as $lookups |   # we need to use lookups later to convert sender ids to usernames
    .items[] |               # convert the list of messages to a stream of messages
    . as $item |             # remember the message; inside the user lookup below "." is the user entry
    "\n",                    # separate messages in the markdown
    (.fromUser as $uid | $lookups.users | .[] | select(.id == $uid)   # look up the user entry from the uid in the message
      | .username + " " + $item.sent),                                # return the username with the date
    # Fenced sections must start at the beginning of the line, as required by markdown; any other
    # message is indented a bit for readability. An if/else is needed here: the // operator would
    # pass on the boolean result of startswith instead of the text.
    (.text | if startswith("```") then . else " " + . end)
  '

  jq -r "$prog" < "$root/json/$counter.json" > "$root/md/$counter.md"
}
#If we're on NixOS. We use this strat instead of nix-shell shebang because we need selfexec to be fast.
#(url) -> ()
# Replaces the current process with a nix-shell that provides pup, glow and jq and re-runs this
# script inside it with IN_MY_SHELL=1 set; on success this function does not return.
function selfsetup() {
  local url=$1
  local self run_cmd

  self=$(realpath "$0")
  # --run takes a single command string that is parsed by a shell again, so both words are
  # shell-quoted with %q; spaces or quotes in the script path or the URL then survive intact.
  printf -v run_cmd 'IN_MY_SHELL=1 %q %q' "$self" "$url"

  echo Entering nix-shell.
  exec nix-shell -I nixpkgs=channel:nixos-unstable -p pup glow jq --run "$run_cmd"
}
# (root) -> (g STARTED)
# One-time setup: print the caveat, export STARTED=1 (exported so that processes started from
# this one inherit it) and create the output directory.
function init() {
  local root=$1

  printf '%s\n' 'This program may fail silently.'
  export STARTED=1
  # `--` keeps a directory name that begins with "-" from being parsed as an option.
  mkdir -p -- "$root"
}
# dependencies: bash, curl, pup, glow, jq, gawk, xargs, find
# (url) -> ()
# Downloads one page of the room history and converts it to markdown, then re-executes this script
# for the next (older) page. Once a page cannot be fetched or parsed, or yields no id to continue
# from, everything downloaded so far is rendered instead.
function main() {
  local url root

  # ${1+x} expands safely under `set -u`, so nounset does not have to be toggled for these checks.
  if [[ -z "${1+x}" ]]; then
    echo -e "USAGE: $0 gitter_url\nrender and extractMarkdown can be called via: ( . $0; somefunc somearg )\nThere is information about API longevity at https://gitter.im/gitter/api?at=5f74bee5cfe2f9049a14ae3e"
    exit 1
  fi
  url=$1
  # the last two path components of the url, joined by "-", name the output directory
  root=$(echo "$url" | rev | cut -d "/" -f -2 | rev | sed "s/\//-/")

  # Only hop into nix-shell when it exists; elsewhere the tools are expected on PATH.
  if [[ -z "${IN_MY_SHELL+x}" ]] && command -v nix-shell &> /dev/null; then
    selfsetup "$url"
  fi
  if [[ -z "${STARTED+x}" ]]; then
    init "$root"
  fi

  #COUNTER, BEFOREID, TOKEN and CHANID are passed via env vars during tail recursion
  COUNTER=${COUNTER:-0}
  BEFOREID=${BEFOREID:-}
  TOKEN=${TOKEN:-}
  CHANID=${CHANID:-}

  #need to set the initial value the first time around
  if [[ -z "$BEFOREID" || -z "$CHANID" || -z "$TOKEN" ]]; then
    getmagic "$url" || {
      echo "main: could not read the room id, token or first message id from $url" >&2
      exit 1
    }
  fi

  mkdir -p "$root/json" "$root/md"
  echo "Fetching $BEFOREID at counter $COUNTER"
  sleep 1 # naive rate limit, probably unnecessary
  getpage "$BEFOREID" "$CHANID" "$TOKEN" "$COUNTER" "$root"

  # Stop when the page could not be fetched/parsed or gave no id to continue from. An empty
  # BEFOREID would send the next run back to the newest page and the download would never end.
  # render exits the script.
  if [[ "${retVal:-false}" != true || -z "${last_id:-}" ]]; then
    render "$root"
  fi

  extractMarkdown "$COUNTER" "$root"

  # tail recursion :P
  exec /usr/bin/env TOKEN="$TOKEN" CHANID="$CHANID" BEFOREID="$last_id" COUNTER=$((COUNTER + 1)) "$0" "$url"
}
# check if we are sourced, like python if __name__ == "__main__" ; https://stackoverflow.com/a/23009039
# NOTE(review): because main is followed by `|| true`, bash ignores `set -e` inside main and in
# every function it calls, so a failing command there does not abort the script. This is kept
# as-is; removing it would change how failures during the download are handled.
if [[ "$0" == "${BASH_SOURCE[0]}" ]]; then
  main "$@" || true
fi
Cool, can we also push it to IRC? :)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Edit: all of this might be unnecessary? https://matrix.org/blog/2020/09/30/welcoming-gitter-to-matrix/
TODO: