/**
 * Crawler for Microsoft Teams Channel Posts
 *
 * This script uses Puppeteer to automate a browser session, crawl a specified Microsoft Teams channel,
 * extract posts along with their attachments and images, convert post content from HTML to Markdown, and
 * then save the results as JSON, YAML, and Markdown files.
 *
 * Steps to use this script:
 * 1. Install Node.js and all required modules.
 * 2. Run the script using the command: `node crawler.js`.
 * 3. When the browser opens, log in to your Microsoft Teams account.
 * 4. Navigate to the channel you want to crawl.
 * 5. Press the SPACE key in the terminal to start the crawler.
 * 6. Wait for the crawler to finish processing all posts; a success message will be displayed in the terminal.
 * 7. Repeat steps 4 to 6 for any additional channels you wish to crawl.
 * 8. Press the Q key in the terminal to exit the program.
 */
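
// The dependencies below are all published on npm. Assuming this gist ships
// without a package.json, something like the following should install them:
//   npm install puppeteer @joplin/turndown @joplin/turndown-plugin-gfm uuid js-yaml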
const path = require('path');
const puppeteer = require('puppeteer');
const TurndownService = require('@joplin/turndown');
const turndownPluginGfm = require('@joplin/turndown-plugin-gfm');
const { URL } = require('url');
const fs = require('fs');
const { v4: uuidv4 } = require('uuid');
const yaml = require('js-yaml');

const { gfm } = turndownPluginGfm;
const turndownService = new TurndownService();
turndownService.use(gfm);

// Global variables
let browserInstance = null;
let downloadPageInstance = null;
let cdpSession = null;
// Global variable: List of URLs to skip download via Puppeteer.
// Files in this list should be downloaded manually as they may not download correctly via the automated browser.
const SKIP_URLS = [];
const DOWNLOAD_BASE_DIR = 'resources';
let downloadDir = `./${DOWNLOAD_BASE_DIR}`;
let resumeCrawl = false;
// Queue for download jobs; each job is an object: { url: string, resolve: Function, reject: Function }
const downloadJobs = [];
let downloadLock = false;
const DEBUG = false;
if (!DEBUG) {
  console.debug = () => {};
}

/**
 * Sleeps for a given duration.
 *
 * @param {number} ms - The number of milliseconds to sleep.
 * @returns {Promise<void>}
 */
async function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Repeatedly polls a condition function until it returns true or a timeout is reached.
 *
 * @param {() => boolean|Promise<boolean>} conditionFn - A function that returns a boolean or a Promise that resolves to a boolean.
 * @param {Object} [options] - Optional settings.
 * @param {number} [options.timeout=30000] - Maximum time (in ms) to wait.
 * @param {number} [options.interval=500] - Polling interval in ms.
 * @returns {Promise<void>}
 * @throws {Error} If the condition isn’t met within the timeout.
 */
function waitUntil(conditionFn, { timeout = 30000, interval = 500 } = {}) {
  return new Promise((resolve, reject) => {
    const start = Date.now();
    const timer = setInterval(async () => {
      let result = false;
      try {
        result = await conditionFn();
      } catch (err) {
        // Error is treated as "not ready"
      }
      if (result) {
        clearInterval(timer);
        resolve();
        return;
      }
      if (timeout > 0 && Date.now() - start > timeout) {
        clearInterval(timer);
        reject(new Error(`waitUntil() timed out after ${timeout} ms`));
      }
    }, interval);
  });
}
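
// Usage notes for waitUntil(): passing { timeout: 0 } disables the timeout and
// polls forever (as pauseUntilResume() does below). A typical call elsewhere in
// this script blocks until a file appears on disk, e.g.
//   await waitUntil(() => fs.existsSync(somePath), { timeout: 10000 });
// with `somePath` standing in for the expected download path.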

/**
 * Pauses the crawler until the user presses the "C" key.
 *
 * @returns {Promise<void>}
 */
async function pauseUntilResume() {
  console.log("Crawler paused. Press the C key to resume.");
  await waitUntil(() => resumeCrawl, { timeout: 0, interval: 100 });
  resumeCrawl = false;
}

/**
 * Scrolls up the Teams message pane to load additional content.
 *
 * @param {import('puppeteer').Page} page - The Puppeteer page instance.
 * @param {number|null} [scrollPx=null] - Pixels to scroll; defaults to the viewport height.
 * @returns {Promise<number>} The new scroll top position.
 */
async function scrollUp(page, scrollPx = null) {
  const messagePane = await page.$('[data-tid="channel-pane-viewport"]');
  const clientHeight = await messagePane.evaluate((el) => el.clientHeight);
  console.debug(`clientHeight: ${clientHeight}`);
  if (scrollPx === null) {
    scrollPx = clientHeight;
  }
  const getScrollTop = async () => messagePane.evaluate((el) => el.scrollTop);
  const getScrollHeight = async () => messagePane.evaluate((el) => el.scrollHeight);
  const oldTop = await getScrollTop();
  const oldHeight = await getScrollHeight();
  console.debug(`oldTop: ${oldTop}`);
  console.debug(`oldHeight: ${oldHeight}`);
  if (oldTop === 0) {
    return 0;
  }
  // Scroll up by 'scrollPx' pixels
  await messagePane.evaluate((el, old, delta) => {
    el.scrollTo(0, Math.max(0, old - delta));
  }, oldTop, scrollPx);
  // Teams may trigger network requests when scrolling; wait for height change or scrollTop change
  try {
    await page.waitForFunction(
      (old) => {
        const pane = document.querySelector('[data-tid="channel-pane-viewport"]');
        return pane.scrollHeight > old;
      },
      { timeout: 10000 },
      oldHeight
    );
  } catch {
    console.debug(`max height: ${await getScrollHeight()}`);
    await page.waitForFunction(
      (old) => {
        const pane = document.querySelector('[data-tid="channel-pane-viewport"]');
        return pane.scrollTop < old;
      },
      { timeout: 500 },
      oldTop
    );
  }
  const newTop = await getScrollTop();
  console.debug(`newTop: ${newTop}`);
  console.debug(`newHeight: ${await getScrollHeight()}`);
  return newTop;
}

/**
 * Enqueues a download job for the specified URL.
 *
 * @param {string} url - The URL to download.
 * @returns {Promise<string>} Resolves with the local file path of the downloaded file.
 */
function download(url) {
  return new Promise((resolve, reject) => {
    downloadJobs.push({ url, resolve, reject });
    processDownloadQueue();
  });
}
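
// Design note: downloads are serialized rather than run in parallel. Every job
// navigates the single shared `downloadPageInstance` tab, and Chrome drops all
// completed files into one shared download directory, so `downloadLock` ensures
// only one navigation (and one expected file on disk) is in flight at a time.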

/**
 * Processes the download job queue sequentially.
 *
 * Continues processing until there are no jobs left.
 */
async function processDownloadQueue() {
  if (downloadLock) return;
  const job = downloadJobs.shift();
  if (!job) return;
  downloadLock = true;
  try {
    const parsedUrl = new URL(job.url);
    console.debug(`downloading ${job.url}`);
    const urlPath = parsedUrl.pathname;
    let fileName = decodeURIComponent(urlPath.substring(urlPath.lastIndexOf('/') + 1));
    let filePath = null;
    let response = null;
    if (parsedUrl.origin === 'https://ntucc365.sharepoint.com') {
      // For attachments from SharePoint (Teams design), downloading via Puppeteer
      // sometimes fails, so we skip the URLs listed in SKIP_URLS.
      if (!SKIP_URLS.includes(job.url)) {
        try {
          response = await downloadPageInstance.goto(job.url, { timeout: 0, waitUntil: 'networkidle0' });
        } catch (err) {
          // Ignore errors like net::ERR_ABORTED since the file might still download via CDP
        }
      }
      const oldFilePath = path.join(downloadDir, fileName);
      let retry = 2;
      while (retry--) {
        try {
          await waitUntil(() => fs.existsSync(oldFilePath), { timeout: 10000, interval: 500 });
          filePath = path.join(downloadDir, `${uuidv4()}-${fileName}`);
          fs.renameSync(oldFilePath, filePath);
          break;
        } catch {
          if (retry) {
            console.error(
              `Failed to find the downloaded file.\n` +
              `url: ${job.url}\n` +
              `path: ${oldFilePath}\n` +
              `Please manually download the file and then resume.`
            );
            await pauseUntilResume();
          } else {
            console.warn("Leaving the file path empty.");
            filePath = '';
          }
        }
      }
    } else {
      // For displayed images
      response = await downloadPageInstance.goto(job.url, { timeout: 0, waitUntil: 'networkidle0' });
      const contentType = response.headers()['content-type'] || '';
      let extension;
      if (contentType.includes('jpeg')) {
        extension = 'jpg';
      } else if (contentType.includes('png')) {
        extension = 'png';
      } else if (contentType.includes('gif')) {
        extension = 'gif';
      } else {
        extension = contentType.split('/')[1];
      }
      // Assign a new unique filename because the filename in the URL is always the same
      // for different images.
      fileName = `${uuidv4()}.${extension}`;
      const responseBuffer = await response.buffer();
      filePath = path.join(downloadDir, fileName);
      await fs.promises.writeFile(filePath, responseBuffer);
    }
    job.resolve(filePath);
  } catch (error) {
    job.reject(error);
  } finally {
    downloadLock = false;
    processDownloadQueue();
  }
}
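
// Recovery flow: when a SharePoint file never appears in `downloadDir`, the
// crawler pauses. Save the file there manually under its original name (the
// `fileName` derived from the URL above), then press C; the retry loop picks
// it up and renames it with a UUID prefix, just like an automatic download.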

/**
 * Parses a post element from the Teams channel.
 *
 * @param {import('puppeteer').ElementHandle} postElement - The DOM element representing a post.
 * @returns {Promise<Object|null>} The parsed post object, or null if the post is deleted.
 */
async function parsePost(postElement) {
  const parsedPost = await postElement.evaluate(async (elem) => {
    const postContent = elem.querySelector('[id^="post-message-render"] [id^="message-body"] [data-reply-chain-id]');
    if (!postContent) { // Deleted post
      return null;
    }
    const contentClone = postContent.querySelector('[id^="content"]')?.cloneNode(true);
    const title = postContent.querySelector('h2')?.innerText || '';
    const author = elem.querySelector('[id^="author"]').innerText;
    let timestamp = elem.getElementsByTagName('time')[0].id.split('-')[1];
    timestamp = Number(timestamp);
    const createTime = new Date(timestamp).toLocaleString('zh-TW', {
      hour12: false,
      year: 'numeric',
      month: '2-digit',
      day: '2-digit',
      hour: '2-digit',
      minute: '2-digit',
      second: '2-digit',
    });
    // Replace emoji animations with actual characters (Teams design)
    contentClone.querySelectorAll('[data-tid="emoticon-renderer"]').forEach((el) => {
      el.querySelectorAll('img').forEach((imgEl) => {
        imgEl.outerHTML = imgEl.getAttribute('alt');
      });
    });
    // Replace mention tags with plain text
    contentClone.querySelectorAll('[itemtype="http://schema.skype.com/Mention"]').forEach((el) => {
      if (el.parentNode.hasAttribute('data-lpc-hover-target-id')) {
        el.parentNode.outerHTML = el.parentNode.innerText;
      } else {
        el.outerHTML = el.innerText;
      }
    });
    const remoteToLocal = {};
    const attachmentNodes = postContent.querySelector('[id^=attachments]')?.childNodes || [];
    const attachmentResults = await Promise.all(
      Array.from(attachmentNodes).map(async (attachment) => {
        if (attachment.hasAttribute('title')) {
          // Normal attachment
          let [filename, url] = attachment.getAttribute('title').split('\r\n');
          const parsed = new URL(url);
          if (parsed.origin !== 'https://ntucc365.sharepoint.com') {
            return [url, null];
          }
          const fullPath = parsed.origin + parsed.pathname;
          const localLink = await window.download(fullPath);
          remoteToLocal[url] = localLink;
          return [localLink, null];
        } else if (attachment.querySelector('img[data-gallery-src]') != null) {
          // Image attachment
          const imgElem = attachment.querySelector('img[data-gallery-src]');
          const fullSizeImagePath = imgElem.getAttribute('data-gallery-src');
          const localPath = await window.download(fullSizeImagePath);
          imgElem.setAttribute('src', localPath);
          return [localPath, imgElem];
        } else {
          return [null, null];
        }
      })
    );
    const attachments = [];
    attachmentResults.forEach(([link, imageElem]) => {
      if (link != null) attachments.push(link);
      if (imageElem != null) contentClone.appendChild(imageElem);
    });
    // Download images and update their src attribute
    await Promise.all(
      Array.from(contentClone.querySelectorAll('img[data-gallery-src]')).map(async (img) => {
        const fullSizeImagePath = img.getAttribute('data-gallery-src');
        const localPath = await window.download(fullSizeImagePath);
        img.setAttribute('src', localPath);
      })
    );
    // Update anchor tags if they point to attachments
    contentClone.querySelectorAll('a').forEach((el) => {
      const href = el.getAttribute('href');
      if (remoteToLocal[href] != null) {
        el.setAttribute('href', remoteToLocal[href]);
      }
    });
    return { title, author, timestamp, createTime, outerHTML: contentClone.outerHTML, attachments };
  });
  if (parsedPost === null) return null;
  parsedPost.content = turndownService
    .turndown(parsedPost.outerHTML)
    .replace(/\u00A0/g, '\u0020'); // Replace non-breaking spaces with normal spaces
  delete parsedPost.outerHTML;
  return parsedPost;
}
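
// Note on `window.download`: parsePost runs inside the page via evaluate(),
// where Node modules are unavailable. page.exposeFunction('download', download)
// (set up in the main IIFE below) bridges that gap: calling window.download(url)
// in the browser invokes the Node-side download() and resolves to the local
// file path, which is why attachment downloads can be awaited from page code.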

/**
 * Converts the extracted posts into a Markdown file.
 *
 * @param {Array<Object>} articles - Array of post objects.
 * @param {string} markdownFilePath - The path where the Markdown file will be saved.
 */
function postToMarkdown(articles, markdownFilePath) {
  let markdownContent = '';
  articles.forEach((article) => {
    const { title, author, createTime, content, attachments } = article;
    let metadata = `- **Author:** ${author}\n- **Created:** ${createTime}\n`;
    if (attachments && attachments.length > 0) {
      metadata += `- **Attachments:**\n`;
      attachments.forEach((filePath) => {
        if (filePath === '') {
          metadata += "  - (the attachment was unavailable while crawling)\n";
        } else {
          const filename = path.basename(filePath);
          // substring(37) strips the `${uuidv4()}-` prefix (36 characters plus
          // the hyphen) that download() adds to SharePoint attachments.
          metadata += `  - [${filename.substring(37)}](${encodeURIComponent(filePath)})\n`;
        }
      });
      metadata += `\n`;
    }
    metadata += `\n`;
    if (title) {
      markdownContent += `## ${title}\n\n`;
    }
    markdownContent += metadata;
    markdownContent += `${content}\n\n---\n\n`;
  });
  fs.writeFileSync(markdownFilePath, markdownContent, 'utf8');
}
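
// For reference, each post renders roughly like this in the generated Markdown
// (the title, author, and filename are illustrative; attachment links point
// into the channel's resources folder, URI-encoded):
//
//   ## Post title
//
//   - **Author:** Jane Doe
//   - **Created:** 2025/02/26 14:19:00
//   - **Attachments:**
//     - [report.pdf](my-channel_resources%2F<uuid>-report.pdf)
//
//   Post body converted from HTML to Markdown...
//
//   ---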

/**
 * Crawls the Microsoft Teams channel page to extract posts.
 *
 * @param {import('puppeteer').Page} page - The Puppeteer page instance.
 * @returns {Promise<void>}
 */
async function crawlChannel(page) {
  const seenPosts = new Set();
  const posts = [];
  const channelName = await page.evaluate(() => {
    return document.querySelector('[data-tid="channelTitle-text"]').innerText;
  });
  console.log(`Crawling started. Channel: ${channelName}`);
  downloadDir = `./${channelName}_${DOWNLOAD_BASE_DIR}`;
  const absoluteDownloadDir = path.resolve(downloadDir);
  if (!fs.existsSync(absoluteDownloadDir)) {
    fs.mkdirSync(absoluteDownloadDir, { recursive: true });
  }
  // Set Chrome’s download behavior to use the designated folder.
  await cdpSession.send('Page.setDownloadBehavior', {
    behavior: 'allow',
    downloadPath: absoluteDownloadDir,
  });
  let currentScrollTop = -1;
  while (currentScrollTop !== 0) {
    const postElements = await page.$$('[id^="reply-chain-summary"]');
    for (const postElem of postElements) {
      const postId = await page.evaluate((el) => el.getAttribute("id"), postElem);
      if (!seenPosts.has(postId)) {
        console.info(`Found new post. HTML element id: ${postId}`);
        try {
          const parsedPost = await parsePost(postElem);
          if (parsedPost != null) {
            posts.push(parsedPost);
          }
        } catch (err) {
          console.error(err);
          await pauseUntilResume();
        }
        seenPosts.add(postId);
      }
    }
    console.debug(`Found ${postElements.length} elements matching [id^="reply-chain-summary"].`);
    try {
      currentScrollTop = await scrollUp(page);
      await sleep(500);
    } catch {
      break;
    }
  }
  console.info(`Total posts: ${posts.length}`);
  posts.sort((a, b) => a.timestamp - b.timestamp);
  const jsonString = JSON.stringify(posts, null, 4);
  const yamlString = yaml.dump(posts);
  try {
    fs.writeFileSync(`${channelName}.json`, jsonString);
    fs.writeFileSync(`${channelName}.yaml`, yamlString);
    postToMarkdown(posts, `${channelName}.md`);
    console.log(`Crawled the channel successfully: ${channelName}`);
  } catch (err) {
    console.error(`Error writing file: ${channelName}`, err);
  }
}
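
// Output per crawled channel: <channel>.json, <channel>.yaml, and <channel>.md
// in the working directory, plus a ./<channel>_resources folder holding the
// downloaded attachments and images that the Markdown links point to.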

(async () => {
  // Launch Puppeteer with a visible browser
  browserInstance = await puppeteer.launch({
    headless: false,
    userDataDir: path.join(__dirname, 'puppeteer_user_data'),
    defaultViewport: null,
  });
  downloadPageInstance = await browserInstance.newPage();
  cdpSession = await downloadPageInstance.createCDPSession();
  // Configure Fetch to intercept responses from SharePoint and force file downloads instead of inline display
  // Workaround from https://stackoverflow.com/a/63232618/6663588
  await cdpSession.send('Fetch.enable', {
    patterns: [{ urlPattern: 'https://ntucc365.sharepoint.com/*', requestStage: 'Response' }],
  });
  cdpSession.on('Fetch.requestPaused', async (event) => {
    const { requestId, responseHeaders } = event;
    const headers = responseHeaders || [];
    const contentTypeHeader = headers.find(
      (header) => header.name.toLowerCase() === 'content-type'
    );
    if (
      contentTypeHeader &&
      (contentTypeHeader.value === 'application/pdf' || contentTypeHeader.value.includes('xml'))
    ) {
      headers.push({ name: 'Content-Disposition', value: 'attachment' });
      const response = await cdpSession.send('Fetch.getResponseBody', { requestId });
      await cdpSession.send('Fetch.fulfillRequest', {
        requestId,
        responseCode: 200,
        responseHeaders: headers,
        body: response.body,
      });
    } else {
      await cdpSession.send('Fetch.continueRequest', { requestId });
    }
  });
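
  // Note: Fetch.fulfillRequest expects `body` as a base64 string, and
  // Fetch.getResponseBody reports via its `base64Encoded` flag whether the body
  // is already base64 (it is for binary content such as PDFs). The pass-through
  // above relies on that flag being true; a plain-text body would need
  // Buffer.from(body).toString('base64') first.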

  const page = await browserInstance.newPage();
  await page.exposeFunction('download', download);
  // Navigate to the target Teams channel URL
  const targetUrl = 'https://teams.microsoft.com/v2';
  await page.goto(targetUrl, { waitUntil: 'networkidle2' });
  await page.setBypassCSP(true);
  console.log('Before starting to crawl the posts, please log in to your' +
    ' Teams account and switch to the desired channel for crawling.');
  console.log('Press SPACE to start/stop crawling. Press Q to quit.');
  let isCrawling = false;
  // Set raw mode for stdin to capture key presses.
  process.stdin.setRawMode(true);
  process.stdin.on('data', async (key) => {
    const keyStr = key.toString();
    if (keyStr === ' ') {
      if (!isCrawling) {
        isCrawling = true;
        await crawlChannel(page);
        isCrawling = false;
      } else {
        // Note: this only clears the flag; an in-progress crawlChannel() run
        // finishes on its own rather than being aborted.
        isCrawling = false;
        console.info('Crawling stopped.');
      }
    } else if (keyStr === 'c') {
      resumeCrawl = true;
    } else if (keyStr.toLowerCase() === 'q') {
      console.log('Exiting...');
      await browserInstance.close();
      process.exit(0);
    }
  });
})();