Last active
August 30, 2019 18:14
-
-
Save arthurtsang/557b32c568af6fdfd411312b0850a710 to your computer and use it in GitHub Desktop.
Extracting HTML from GMail
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Search the authorized user's mailbox and fetch the raw content of every match.
 *
 * Issues a single `users.messages.list` call with `gmailSearchQuery` as the
 * `q` parameter (e.g. "to: someone@example.com subject: spam"), then fetches
 * each listed message through `getRawMessage`, all in parallel.
 *
 * Only the messages returned by that one list call are fetched; this method
 * does not follow any pagination token.
 *
 * @param gmailSearchQuery search expression passed as the `q` parameter
 * @returns one `{ id, raw }` entry per listed message, or an empty array when
 *          the response lists no messages
 * @throws rejects with the underlying error when authorization, the list call,
 *         or any individual message fetch fails
 */
async searchEmail(gmailSearchQuery: string): Promise<any[]> {
  // authorize() is defined elsewhere in the class; presumably yields an OAuth2 client.
  const auth = await this.authorize();
  const gmail = google.gmail({ version: 'v1', auth });

  // Promise form (no callback): a failed request rejects this async method
  // instead of falling through to a dereference of an undefined response.
  const res = await gmail.users.messages.list({
    userId: 'me',
    q: gmailSearchQuery
  });

  const messages = res.data.messages;
  if (!messages || messages.length === 0) {
    return [];
  }

  // Any single failed fetch rejects the whole search rather than leaving the
  // caller waiting on a promise that never settles.
  return Promise.all(messages.map(m => this.getRawMessage(gmail, m.id)));
}
/**
 * Fetch one message by id, requesting `format: 'raw'`.
 *
 * @param gmail Gmail API client to issue the request with
 * @param message_id id of the message to fetch
 * @returns the message id together with the `raw` field of the response
 *          (assumed to be the base64-encoded RFC 2822 message — confirm
 *          against the Gmail API reference)
 * @throws rejects with the underlying error when the request fails
 */
private async getRawMessage(
  gmail: gmail_v1.Gmail,
  message_id: string
): Promise<{ id: string; raw?: string | null }> {
  // Promise form (no callback): on failure the await throws, so the response
  // is never dereferenced on the error path.
  const res = await gmail.users.messages.get({
    userId: 'me',
    id: message_id,
    format: 'raw'
  });
  return {
    id: message_id,
    raw: res.data.raw
  };
}
/**
 * Extract the HTML part from a raw (base64-encoded) mail and load it into cheerio.
 *
 * The decoded message is scanned line by line. Collection starts at the first
 * line containing "DOCTYPE html" (matched case-insensitively) and stops at the
 * first subsequent line starting with "----", which is treated as the part
 * boundary. Lines ending in "=" are taken as quoted-printable soft line breaks
 * and are merged with the following line.
 *
 * Limitations: lines are trimmed and blank lines are dropped, and "=3D" is the
 * only quoted-printable escape that is decoded.
 *
 * @param raw base64-encoded raw message
 * @returns the cheerio instance loaded with the extracted HTML (an empty
 *          document if no DOCTYPE line is found)
 */
getHtmlFromEmailBody(raw: string) {
  const lines = Buffer.from(raw, 'base64')
    .toString()
    // Non-capturing alternation: a capturing group would add the separators
    // themselves to the resulting array.
    .split(/\r\n|\n|\r/)
    .map(s => s.trim())
    .filter(s => s.length > 0);

  const htmlLines: string[] = [];
  let pending = ''; // accumulated text of a soft-wrapped line
  let inHtml = false;

  for (const line of lines) {
    if (!inHtml && /doctype html/i.test(line)) {
      inHtml = true;
    }
    if (!inHtml) {
      continue;
    }
    if (line.startsWith('----')) {
      break;
    }
    // Detect the soft line break on the still-encoded line: a literal "=" at
    // the end of a line is encoded as "=3D" and must not be mistaken for one.
    if (line.endsWith('=')) {
      pending += line.slice(0, -1);
    } else {
      htmlLines.push(pending + line);
      pending = '';
    }
  }
  // Keep a trailing soft-wrapped fragment instead of silently dropping it.
  if (pending.length > 0) {
    htmlLines.push(pending);
  }

  // Decode "=3D" only after soft-wrapped lines are merged, and join with
  // newlines: Array.prototype.join() defaults to "," which would inject commas
  // into the markup.
  const html = htmlLines.map(l => l.replace(/=3D/g, '=')).join('\n');
  return cheerio.load(html);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment