-
-
Save vadorvatsal/1d9365bcf6550228d55299a0ffe38246 to your computer and use it in GitHub Desktop.
This script decodes Google News generated encoded, internal URLs for RSS items
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Google News started generate encoded, internal URLs for RSS items | |
* https://news.google.com/rss/search?q=New%20York%20when%3A30d&hl=en-US&gl=US&ceid=US:en | |
* | |
* This script decodes URLs into original one, for example URL | |
* https://news.google.com/__i/rss/rd/articles/CBMiSGh0dHBzOi8vdGVjaGNydW5jaC5jb20vMjAyMi8xMC8yNy9uZXcteW9yay1wb3N0LWhhY2tlZC1vZmZlbnNpdmUtdHdlZXRzL9IBAA?oc=5 | |
* | |
* contains this | |
* https://techcrunch.com/2022/10/27/new-york-post-hacked-offensive-tweets/ | |
* | |
* In path after articles/ goes Base64 encoded binary data | |
* | |
* Format is the following: | |
* <prefix> <len bytes> <URL bytes> <len bytes> <amp URL bytes> [<suffix>] | |
* | |
* <prefix> - 0x08, 0x13, 0x22 | |
* <suffix> - 0xd2, 0x01, 0x00 (sometimes missing??) | |
* <len bytes> - formatted as 0x40 or 0x81 0x01 sometimes | |
* | |
* FIXME: What will happen if URL more than 255 bytes?? | |
*/ | |
const decodeGoogleNewsUrl = (sourceUrl: string) => { | |
const url = new URL(sourceUrl); | |
const path = url.pathname.split("/"); | |
if ( | |
url.hostname === "news.google.com" && | |
path.length > 1 && | |
path[path.length - 2] === "articles" | |
) { | |
const base64 = path[path.length - 1]; | |
let str = atob(base64); | |
const prefix = Buffer.from([0x08, 0x13, 0x22]).toString("binary"); | |
if (str.startsWith(prefix)) { | |
str = str.substring(prefix.length); | |
} | |
const suffix = Buffer.from([0xd2, 0x01, 0x00]).toString("binary"); | |
if (str.endsWith(suffix)) { | |
str = str.substring(0, str.length - suffix.length); | |
} | |
// One or two bytes to skip | |
const bytes = Uint8Array.from(str, c => c.charCodeAt(0)); | |
const len = bytes.at(0)!; | |
if (len >= 0x80) { | |
str = str.substring(2, len + 1); | |
} else { | |
str = str.substring(1, len + 1); | |
} | |
return str; | |
} else { | |
return sourceUrl; | |
} | |
}; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment