Skip to content

Instantly share code, notes, and snippets.

@IvanaGyro
Created February 26, 2025 14:19
Show Gist options
  • Save IvanaGyro/e1139713d18821bc6ec3407edf542349 to your computer and use it in GitHub Desktop.
Save IvanaGyro/e1139713d18821bc6ec3407edf542349 to your computer and use it in GitHub Desktop.
Crawler for Microsoft Teams Channel Posts
/**
* Crawler for Microsoft Teams Channel Posts
*
* This script uses Puppeteer to automate a browser session, crawl a specified Microsoft Teams channel,
* extract posts along with their attachments and images, convert post content from HTML to Markdown, and
* then save the results as JSON, YAML, and Markdown files.
*
* Steps to use this script:
* 1. Install Node.js and all required modules.
* 2. Run the script using the command: `node crawler.js`.
* 3. When the browser opens, log in to your Microsoft Teams account.
* 4. Navigate to the channel you want to crawl.
* 5. Press the SPACE key in the terminal to start the crawler.
* 6. Wait for the crawler to finish processing all posts; a success message will be displayed in the terminal.
* 7. Repeat steps 4 to 6 for any additional channels you wish to crawl.
* 8. Press the Q key in the terminal to exit the program.
*/
const path = require('path');
const puppeteer = require('puppeteer');
const TurndownService = require('@joplin/turndown');
const turndownPluginGfm = require('@joplin/turndown-plugin-gfm');
const { URL } = require('url');
const fs = require('fs');
const { v4: uuidv4 } = require('uuid');
const yaml = require('js-yaml');
const { gfm } = turndownPluginGfm;
const turndownService = new TurndownService();
turndownService.use(gfm);
// Global variables (shared Puppeteer state; initialized in the bootstrap IIFE at the bottom of the file)
let browserInstance = null; // the single Puppeteer Browser instance
let downloadPageInstance = null; // dedicated page used only for driving file downloads
let cdpSession = null; // CDP session on the download page (Fetch interception + Page.setDownloadBehavior)
// Global variable: List of URLs to skip download via Puppeteer.
// Files in this list should be downloaded manually as they may not download correctly via the automated browser.
const SKIP_URLS = [];
const DOWNLOAD_BASE_DIR = 'resources';
// Re-pointed per channel in crawlChannel(): "./<channelName>_resources".
let downloadDir = `./${DOWNLOAD_BASE_DIR}`;
// Set by the 'c' key handler; polled (and cleared) by pauseUntilResume().
let resumeCrawl = false;
// Queue for download jobs; each job is an object: { url: string, resolve: Function, reject: Function }
const downloadJobs = [];
// Simple mutex: true while processDownloadQueue() is handling a job.
let downloadLock = false;
const DEBUG = false;
if (!DEBUG) {
  // Silence console.debug entirely unless debugging is enabled.
  console.debug = () => {};
}
/**
 * Resolves after the given delay.
 *
 * @param {number} ms - The number of milliseconds to sleep.
 * @returns {Promise<void>}
 */
async function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Polls `conditionFn` every `interval` ms until it yields a truthy value.
 *
 * A thrown error (or rejected promise) from `conditionFn` is treated as
 * "condition not met yet". A non-positive `timeout` disables the deadline
 * entirely, so the poll runs until the condition holds.
 *
 * @param {() => boolean|Promise<boolean>} conditionFn - A function that returns a boolean or a Promise that resolves to a boolean.
 * @param {Object} [options] - Optional settings.
 * @param {number} [options.timeout=30000] - Maximum time (in ms) to wait; <= 0 waits forever.
 * @param {number} [options.interval=500] - Polling interval in ms.
 * @returns {Promise<void>}
 * @throws {Error} If the condition isn't met within the timeout.
 */
function waitUntil(conditionFn, { timeout = 30000, interval = 500 } = {}) {
  const startedAt = Date.now();
  return new Promise((resolve, reject) => {
    const poller = setInterval(async () => {
      let ready = false;
      try {
        ready = await conditionFn();
      } catch {
        // A failing check simply means "not ready yet".
      }
      if (ready) {
        clearInterval(poller);
        resolve();
      } else if (timeout > 0 && Date.now() - startedAt > timeout) {
        clearInterval(poller);
        reject(new Error(`waitUntil() timed out after ${timeout} ms`));
      }
    }, interval);
  });
}
/**
 * Blocks until the user presses the "C" key, which sets the module-level
 * `resumeCrawl` flag; the flag is cleared afterwards so the next pause
 * starts fresh.
 *
 * @returns {Promise<void>}
 */
async function pauseUntilResume() {
  console.log("Crawler paused. Press C key to resume.");
  const userResumed = () => resumeCrawl;
  await waitUntil(userResumed, { interval: 100, timeout: 0 });
  resumeCrawl = false;
}
/**
 * Scrolls up the Teams message pane to load additional (older) content.
 *
 * Teams lazy-loads history when scrolling up, which normally grows the pane's
 * scrollHeight; this function waits for that growth and, failing that, falls
 * back to checking that scrollTop actually decreased.
 *
 * @param {import('puppeteer').Page} page - The Puppeteer page instance.
 * @param {number|null} [scrollPx=null] - Pixels to scroll; defaults to the viewport height.
 * @returns {Promise<number>} The new scroll top position (0 means the top was reached).
 * @throws If neither the pane height grows nor the scroll position moves
 *   before both waitForFunction timeouts expire.
 */
async function scrollUp(page, scrollPx = null) {
  const messagePane = await page.$('[data-tid="channel-pane-viewport"]');
  const clientHeight = await messagePane.evaluate((el) => el.clientHeight);
  console.debug(`clientHeight: ${clientHeight}`);
  if (scrollPx === null) {
    // Default: scroll by one full viewport height.
    scrollPx = clientHeight;
  }
  const getScrollTop = async () => messagePane.evaluate((el) => el.scrollTop);
  const getScrollHeight = async () => messagePane.evaluate((el) => el.scrollHeight);
  const oldTop = await getScrollTop();
  const oldHeight = await getScrollHeight();
  console.debug(`oldTop: ${oldTop}`);
  console.debug(`oldHeight: ${oldHeight}`);
  if (oldTop === 0) {
    // Already at the very top; nothing more to load.
    return 0;
  }
  // Scroll up by 'scrollPx' pixels
  await messagePane.evaluate((el, old, delta) => {
    el.scrollTo(0, Math.max(0, old - delta));
  }, oldTop, scrollPx);
  // Teams may trigger network requests when scrolling; wait for height change or scrollTop change
  try {
    await page.waitForFunction(
      (old) => {
        const pane = document.querySelector('[data-tid="channel-pane-viewport"]');
        return pane.scrollHeight > old;
      },
      { timeout: 10000 },
      oldHeight
    );
  } catch {
    // Height did not grow (presumably no more lazy-loaded content); verify
    // that the scroll position itself moved instead.
    console.debug(`max height: ${await getScrollHeight()}`);
    await page.waitForFunction(
      (old) => {
        const pane = document.querySelector('[data-tid="channel-pane-viewport"]');
        return pane.scrollTop < old;
      },
      { timeout: 500 },
      oldTop
    );
  }
  const newTop = await getScrollTop();
  console.debug(`newTop: ${newTop}`);
  console.debug(`newHeight: ${await getScrollHeight()}`);
  return newTop;
}
/**
 * Enqueues a download job for the specified URL and kicks the queue
 * processor; jobs are handled one at a time by processDownloadQueue().
 *
 * @param {string} url - The URL to download.
 * @returns {Promise<string>} Resolves with the local file path of the downloaded file.
 */
function download(url) {
  return new Promise((resolve, reject) => {
    const job = { url, resolve, reject };
    downloadJobs.push(job);
    processDownloadQueue();
  });
}
/**
 * Processes the download job queue sequentially (one job at a time).
 *
 * Two strategies are used depending on the URL origin:
 *  - SharePoint attachments: navigate the hidden download page to the URL and
 *    let Chrome's configured download behavior (see crawlChannel) drop the file
 *    into `downloadDir`; the file is then renamed with a UUID prefix. If the
 *    file never appears, the crawler pauses so the user can download it
 *    manually, and finally falls back to an empty path.
 *  - Inline images: read the HTTP response body directly and write it out under
 *    a generated filename, because Teams serves different images under the
 *    same URL filename.
 *
 * The function re-invokes itself in `finally` so queued jobs keep draining;
 * `downloadLock` guarantees only one job runs at a time.
 *
 * Fix: corrected the typo "and than resume" -> "and then resume" in the
 * manual-download error message.
 */
async function processDownloadQueue() {
  if (downloadLock) return; // another invocation is already draining the queue
  const job = downloadJobs.shift();
  if (!job) return;
  downloadLock = true;
  try {
    const parsedUrl = new URL(job.url);
    console.debug(`downloading ${job.url}`);
    const urlPath = parsedUrl.pathname;
    let fileName = decodeURIComponent(urlPath.substring(urlPath.lastIndexOf('/') + 1));
    let filePath = null;
    let response = null;
    if (parsedUrl.origin === 'https://ntucc365.sharepoint.com') {
      // For attachments from SharePoint (Teams design), sometimes downloading via Puppeteer
      // fails, so we skip those URLs.
      if (!SKIP_URLS.includes(job.url)) {
        try {
          response = await downloadPageInstance.goto(job.url, { timeout: 0, waitUntil: 'networkidle0' });
        } catch (err) {
          // Ignore errors like net::ERR_ABORTED since the file might still download via CDP
        }
      }
      const oldFilePath = path.join(downloadDir, fileName);
      let retry = 2;
      while (retry--) {
        try {
          // Wait for Chrome to finish writing the file into the download dir.
          await waitUntil(() => fs.existsSync(oldFilePath), { timeout: 10000, interval: 500 });
          // Prefix with a UUID so attachments with identical names never collide.
          filePath = path.join(downloadDir, `${uuidv4()}-${fileName}`);
          fs.renameSync(oldFilePath, filePath);
          break;
        } catch {
          if (retry) {
            console.error(`Failed to find the downloaded file.\n` +
              `url: ${job.url}\n` +
              `path: ${oldFilePath}\n` +
              `Please manually download the file and then resume.`);
            await pauseUntilResume();
          } else {
            console.warn("Leave the file path empty.");
            filePath = ''; // empty path marks the attachment as unavailable downstream
          }
        }
      }
    } else {
      // For displayed images: read the body straight off the HTTP response.
      response = await downloadPageInstance.goto(job.url, { timeout: 0, waitUntil: 'networkidle0' });
      const contentType = response.headers()['content-type'] || '';
      let extension;
      if (contentType.includes('jpeg')) {
        extension = 'jpg';
      } else if (contentType.includes('png')) {
        extension = 'png';
      } else if (contentType.includes('gif')) {
        extension = 'gif';
      } else {
        extension = contentType.split('/')[1];
      }
      // Assign a new unique filename because the filename in the URL is always the same
      // for different images.
      fileName = `${uuidv4()}.${extension}`;
      const responseBuffer = await response.buffer();
      filePath = path.join(downloadDir, fileName);
      await fs.promises.writeFile(filePath, responseBuffer);
    }
    job.resolve(filePath);
  } catch (error) {
    job.reject(error);
  } finally {
    downloadLock = false;
    processDownloadQueue();
  }
}
/**
 * Parses a post element from the Teams channel.
 *
 * Most of the work runs inside the browser context (postElement.evaluate): it
 * clones the post content, flattens emojis and mentions to plain text,
 * downloads attachments/images through the exposed `window.download` bridge
 * (see page.exposeFunction in the bootstrap below), and rewrites links to the
 * local copies. Back in Node, the resulting HTML is converted to Markdown.
 *
 * @param {import('puppeteer').ElementHandle} postElement - The DOM element representing a post.
 * @returns {Promise<Object|null>} The parsed post object
 *   ({ title, author, timestamp, createTime, content, attachments }),
 *   or null if the post is deleted.
 */
async function parsePost(postElement) {
  const parsedPost = await postElement.evaluate(async (elem) => {
    const postContent = elem.querySelector('[id^="post-message-render"] [id^="message-body"] [data-reply-chain-id]');
    if (!postContent) { // Deleted post
      return null;
    }
    const contentClone = postContent.querySelector('[id^="content"]')?.cloneNode(true);
    const title = postContent.querySelector('h2')?.innerText || '';
    const author = elem.querySelector('[id^="author"]').innerText;
    // The first <time> element's id embeds the epoch timestamp after the first '-'.
    let timestamp = elem.getElementsByTagName('time')[0].id.split('-')[1];
    timestamp = Number(timestamp);
    const createTime = new Date(timestamp).toLocaleString('zh-TW', {
      hour12: false,
      year: 'numeric',
      month: '2-digit',
      day: '2-digit',
      hour: '2-digit',
      minute: '2-digit',
      second: '2-digit',
    });
    // Replace emoji animations with actual characters (Teams design)
    contentClone.querySelectorAll('[data-tid="emoticon-renderer"]').forEach((el) => {
      el.querySelectorAll('img').forEach((imgEl) => {
        imgEl.outerHTML = imgEl.getAttribute('alt');
      });
    });
    // Replace mention tags with plain text
    contentClone.querySelectorAll('[itemtype="http://schema.skype.com/Mention"]').forEach((el) => {
      if (el.parentNode.hasAttribute('data-lpc-hover-target-id')) {
        el.parentNode.outerHTML = el.parentNode.innerText;
      } else {
        el.outerHTML = el.innerText;
      }
    });
    // Maps each remote attachment URL to its downloaded local path so the
    // anchor hrefs in the content can be rewritten afterwards.
    const remoteToLocal = {};
    const attachmentNodes = postContent.querySelector('[id^=attachments]')?.childNodes || [];
    const attachmentResults = await Promise.all(
      Array.from(attachmentNodes).map(async (attachment) => {
        if (attachment.hasAttribute('title')) {
          // Normal attachment: the title attribute carries "filename\r\nurl".
          let [filename, url] = attachment.getAttribute('title').split('\r\n');
          const parsed = new URL(url);
          if (parsed.origin !== 'https://ntucc365.sharepoint.com') {
            // Non-SharePoint link: keep the remote URL as-is.
            return [url, null];
          }
          // Strip the query string before downloading.
          const fullPath = parsed.origin + parsed.pathname;
          const localLink = await window.download(fullPath);
          remoteToLocal[url] = localLink;
          return [localLink, null];
        } else if (attachment.querySelector('img[data-gallery-src]') != null) {
          // Image attachment: download the full-size image and point the
          // element's src at the local copy.
          const imgElem = attachment.querySelector('img[data-gallery-src]');
          const fullSizeImagePath = imgElem.getAttribute('data-gallery-src');
          const localPath = await window.download(fullSizeImagePath);
          imgElem.setAttribute('src', localPath);
          return [localPath, imgElem];
        } else {
          // Unrecognized attachment node; contributes nothing.
          return [null, null];
        }
      })
    );
    const attachments = [];
    attachmentResults.forEach(([link, imageElem]) => {
      if (link != null) attachments.push(link);
      // Image attachments are appended into the content so they appear in the Markdown.
      if (imageElem != null) contentClone.appendChild(imageElem);
    });
    // Download images and update their src attribute
    await Promise.all(
      Array.from(contentClone.querySelectorAll('img[data-gallery-src]')).map(async (img) => {
        const fullSizeImagePath = img.getAttribute('data-gallery-src');
        const localPath = await window.download(fullSizeImagePath);
        img.setAttribute('src', localPath);
      })
    );
    // Update anchor tags if they point to attachments
    contentClone.querySelectorAll('a').forEach((el) => {
      const href = el.getAttribute('href');
      if (remoteToLocal[href] != null) {
        el.setAttribute('href', remoteToLocal[href]);
      }
    });
    return { title, author, timestamp, createTime, outerHTML: contentClone.outerHTML, attachments };
  });
  if (parsedPost === null) return null;
  // Convert the cleaned-up HTML to Markdown in the Node context.
  parsedPost.content = turndownService
    .turndown(parsedPost.outerHTML)
    .replace(/\u00A0/g, '\u0020'); // Replace non-breaking spaces with normal spaces
  delete parsedPost.outerHTML;
  return parsedPost;
}
/**
 * Renders the extracted posts as a single Markdown document and writes it
 * to disk.
 *
 * Each post becomes: optional "## title" heading, author/creation metadata,
 * an attachment list (local links, UUID prefix stripped from display names),
 * the post body, and a horizontal rule separator.
 *
 * @param {Array<Object>} articles - Array of post objects.
 * @param {string} markdownFilePath - The path where the Markdown file will be saved.
 */
function postToMarkdown(articles, markdownFilePath) {
  const chunks = [];
  for (const { title, author, createTime, content, attachments } of articles) {
    if (title) {
      chunks.push(`## ${title}\n\n`);
    }
    chunks.push(`- **Author:** ${author}\n- **Created:** ${createTime}\n`);
    if (attachments && attachments.length > 0) {
      chunks.push(`- **Attachments:**\n`);
      for (const localPath of attachments) {
        if (localPath === '') {
          chunks.push(" - (the attachment was unavailable while crawling)\n");
        } else {
          // substring(37) strips the 36-character UUID prefix plus the dash.
          const baseName = path.basename(localPath);
          chunks.push(` - [${baseName.substring(37)}](${encodeURIComponent(localPath)})\n`);
        }
      }
      chunks.push(`\n`);
    }
    chunks.push(`\n`);
    chunks.push(`${content}\n\n---\n\n`);
  }
  fs.writeFileSync(markdownFilePath, chunks.join(''), 'utf8');
}
/**
 * Crawls the currently open Microsoft Teams channel page to extract posts.
 *
 * Scrolls the channel pane upward until the top is reached (or scrolling
 * stalls), parsing each not-yet-seen reply chain along the way, then writes
 * the sorted posts to <channelName>.json, <channelName>.yaml and
 * <channelName>.md. Downloads land in "./<channelName>_resources".
 *
 * @param {import('puppeteer').Page} page - The Puppeteer page instance.
 * @returns {Promise<void>}
 */
async function crawlChannel(page) {
  const seenPosts = new Set(); // post element ids already processed (the pane recycles elements)
  const posts = [];
  const channelName = await page.evaluate(() => {
    return document.querySelector('[data-tid="channelTitle-text"]').innerText;
  });
  console.log(`Crawling started. Channel:${channelName}`);
  // Redirect all downloads for this crawl into a channel-specific folder.
  downloadDir = `./${channelName}_${DOWNLOAD_BASE_DIR}`;
  const absoluteDownloadDir = path.resolve(downloadDir);
  if (!fs.existsSync(absoluteDownloadDir)) {
    fs.mkdirSync(absoluteDownloadDir, { recursive: true });
  }
  // Set Chrome’s download behavior to use the designated folder.
  await cdpSession.send('Page.setDownloadBehavior', {
    behavior: 'allow',
    downloadPath: absoluteDownloadDir,
  });
  let currentScrollTop = -1;
  // Keep scrolling up until scrollUp() reports the pane is at the very top.
  while (currentScrollTop !== 0) {
    const postElements = await page.$$('[id^="reply-chain-summary"]');
    for (const postElem of postElements) {
      const postId = await page.evaluate((el) => el.getAttribute("id"), postElem);
      if (!seenPosts.has(postId)) {
        console.info(`Found new post. HTML element id: ${postId}`);
        try {
          const parsedPost = await parsePost(postElem);
          if (parsedPost != null) {
            posts.push(parsedPost);
          }
        } catch (err) {
          // Parsing failed; pause so the user can inspect/fix, then continue.
          console.error(err);
          await pauseUntilResume();
        }
        // Mark as seen even on failure so a broken post is not retried forever.
        seenPosts.add(postId);
      }
    }
    console.debug(`Found ${postElements.length} elements matching [id^="reply-chain-summary"].`);
    try {
      currentScrollTop = await scrollUp(page);
      await sleep(500);
    } catch {
      // scrollUp timed out (no height/position change): treat as end of history.
      break;
    }
  }
  console.info(`Total posts: ${posts.length}`);
  // Posts were collected newest-first; sort chronologically by timestamp.
  posts.sort((a, b) => a.timestamp - b.timestamp);
  const jsonString = JSON.stringify(posts, null, 4);
  const yamlString = yaml.dump(posts);
  try {
    fs.writeFileSync(`${channelName}.json`, jsonString);
    fs.writeFileSync(`${channelName}.yaml`, yamlString);
    postToMarkdown(posts, `${channelName}.md`);
    console.log(`Crawl the channel successfully: ${channelName}`);
  } catch (err) {
    console.error(`Error writing file: ${channelName}`, err);
  }
}
// Bootstrap: launches the browser, wires up download interception, and binds
// the terminal key controls (SPACE = crawl, C = resume after pause, Q = quit).
(async () => {
  // Launch Puppeteer with a visible browser; the persistent userDataDir keeps
  // the Teams login session across runs.
  browserInstance = await puppeteer.launch({
    headless: false,
    userDataDir: path.join(__dirname, 'puppeteer_user_data'),
    defaultViewport: null,
  });
  // Hidden helper page dedicated to downloads (used by processDownloadQueue).
  downloadPageInstance = await browserInstance.newPage();
  cdpSession = await downloadPageInstance.createCDPSession();
  // Configure Fetch to intercept responses from SharePoint and force file downloads instead of inline display
  // Workaround from https://stackoverflow.com/a/63232618/6663588
  await cdpSession.send('Fetch.enable', {
    patterns: [{ urlPattern: 'https://ntucc365.sharepoint.com/*', requestStage: 'Response' }],
  });
  cdpSession.on('Fetch.requestPaused', async (event) => {
    const { requestId, responseHeaders } = event;
    const headers = responseHeaders || [];
    const contentTypeHeader = headers.find(
      (header) => header.name.toLowerCase() === 'content-type'
    );
    if (
      contentTypeHeader &&
      (contentTypeHeader.value === 'application/pdf' || contentTypeHeader.value.includes('xml'))
    ) {
      // PDFs and Office/XML documents would render inline; adding a
      // Content-Disposition: attachment header forces Chrome to download them.
      headers.push({ name: 'Content-Disposition', value: 'attachment' });
      const response = await cdpSession.send('Fetch.getResponseBody', { requestId });
      await cdpSession.send('Fetch.fulfillRequest', {
        requestId,
        responseCode: 200,
        responseHeaders: headers,
        body: response.body,
      });
    } else {
      // Anything else passes through untouched.
      await cdpSession.send('Fetch.continueRequest', { requestId });
    }
  });
  const page = await browserInstance.newPage();
  // Expose the Node-side download() so in-page code (parsePost) can call
  // window.download(url).
  await page.exposeFunction('download', download);
  // Navigate to the target Teams channel URL
  const targetUrl = 'https://teams.microsoft.com/v2';
  await page.goto(targetUrl, { waitUntil: 'networkidle2' });
  await page.setBypassCSP(true);
  console.log('Before starting to crawl the posts, please login to your' +
    ' Teams account and switch to the desired channel for crawling.');
  console.log('Press SPACE to start/stop crawling. Press Q to quit.');
  let isCrawling = false;
  // Set raw mode for stdin to capture key presses.
  process.stdin.setRawMode(true);
  process.stdin.on('data', async (key) => {
    const keyStr = key.toString();
    if (keyStr === ' ') {
      if (!isCrawling) {
        isCrawling = true;
        await crawlChannel(page);
        isCrawling = false;
      } else {
        // NOTE(review): this only flips the flag and prints a message;
        // crawlChannel() never reads isCrawling, so an in-progress crawl
        // is NOT actually interrupted by pressing SPACE again.
        isCrawling = false;
        console.info('Crawling stopped.');
      }
    } else if (keyStr === 'c') {
      // Resume a crawl paused by pauseUntilResume().
      resumeCrawl = true;
    } else if (keyStr.toLowerCase() === 'q') {
      console.log('Exiting...');
      await browserInstance.close();
      process.exit(0);
    }
  });
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment