/**
 * Crawler for Microsoft Teams Channel Posts
 *
 * This script uses Puppeteer to automate a browser session, crawl a specified Microsoft Teams channel,
 * extract posts along with their attachments and images, convert post content from HTML to Markdown, and
 * then save the results as JSON, YAML, and Markdown files.
 *
 * Steps to use this script:
 * 1. Install Node.js and all required modules.
 * 2. Run the script using the command: `node crawler.js`.
 * 3. When the browser opens, log in to your Microsoft Teams account.
 * 4. Navigate to the channel you want to crawl.
 * 5. Press the SPACE key in the terminal to start the crawler.
 * 6. Wait for the crawler to finish processing all posts; a success message will be displayed in the terminal.
 * 7. Repeat steps 4 to 6 for any additional channels you wish to crawl.
 * 8. Press the Q key in the terminal to exit the program.
 */
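
// The dependencies below are all published on npm. Assuming this gist ships
// without a package.json, something like the following should install them:
//   npm install puppeteer @joplin/turndown @joplin/turndown-plugin-gfm uuid js-yaml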
const path = require('path');
const puppeteer = require('puppeteer');
const TurndownService = require('@joplin/turndown');
const turndownPluginGfm = require('@joplin/turndown-plugin-gfm');
const { URL } = require('url');
const fs = require('fs');
const { v4: uuidv4 } = require('uuid');
const yaml = require('js-yaml');

const { gfm } = turndownPluginGfm;
const turndownService = new TurndownService();
turndownService.use(gfm);

// Global variables
let browserInstance = null;
let downloadPageInstance = null;
let cdpSession = null;
// Global variable: List of URLs to skip download via Puppeteer.
// Files in this list should be downloaded manually as they may not download correctly via the automated browser.
const SKIP_URLS = [];
const DOWNLOAD_BASE_DIR = 'resources';
let downloadDir = `./${DOWNLOAD_BASE_DIR}`;
let resumeCrawl = false;
// Queue for download jobs; each job is an object: { url: string, resolve: Function, reject: Function }
const downloadJobs = [];
let downloadLock = false;
const DEBUG = false;
if (!DEBUG) {
  console.debug = () => {};
}

/**
 * Sleeps for a given duration.
 *
 * @param {number} ms - The number of milliseconds to sleep.
 * @returns {Promise<void>}
 */
async function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Repeatedly polls a condition function until it returns true or a timeout is reached.
 *
 * @param {() => boolean|Promise<boolean>} conditionFn - A function that returns a boolean or a Promise that resolves to a boolean.
 * @param {Object} [options] - Optional settings.
 * @param {number} [options.timeout=30000] - Maximum time (in ms) to wait.
 * @param {number} [options.interval=500] - Polling interval in ms.
 * @returns {Promise<void>}
 * @throws {Error} If the condition isn’t met within the timeout.
 */
function waitUntil(conditionFn, { timeout = 30000, interval = 500 } = {}) {
  return new Promise((resolve, reject) => {
    const start = Date.now();
    const timer = setInterval(async () => {
      let result = false;
      try {
        result = await conditionFn();
      } catch (err) {
        // Error is treated as "not ready"
      }
      if (result) {
        clearInterval(timer);
        resolve();
        return;
      }
      if (timeout > 0 && Date.now() - start > timeout) {
        clearInterval(timer);
        reject(new Error(`waitUntil() timed out after ${timeout} ms`));
      }
    }, interval);
  });
}
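
// Usage notes for waitUntil(): passing { timeout: 0 } disables the timeout and
// polls forever (as pauseUntilResume() does below). A typical call elsewhere in
// this script blocks until a file appears on disk, e.g.
//   await waitUntil(() => fs.existsSync(somePath), { timeout: 10000 });
// with `somePath` standing in for the expected download path.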

/**
 * Pauses the crawler until the user presses the "C" key.
 *
 * @returns {Promise<void>}
 */
async function pauseUntilResume() {
  console.log("Crawler paused. Press the C key to resume.");
  await waitUntil(() => resumeCrawl, { timeout: 0, interval: 100 });
  resumeCrawl = false;
}

/**
 * Scrolls up the Teams message pane to load additional content.
 *
 * @param {import('puppeteer').Page} page - The Puppeteer page instance.
 * @param {number|null} [scrollPx=null] - Pixels to scroll; defaults to the viewport height.
 * @returns {Promise<number>} The new scroll top position.
 */
async function scrollUp(page, scrollPx = null) {
  const messagePane = await page.$('[data-tid="channel-pane-viewport"]');
  const clientHeight = await messagePane.evaluate((el) => el.clientHeight);
  console.debug(`clientHeight: ${clientHeight}`);
  if (scrollPx === null) {
    scrollPx = clientHeight;
  }
  const getScrollTop = async () => messagePane.evaluate((el) => el.scrollTop);
  const getScrollHeight = async () => messagePane.evaluate((el) => el.scrollHeight);
  const oldTop = await getScrollTop();
  const oldHeight = await getScrollHeight();
  console.debug(`oldTop: ${oldTop}`);
  console.debug(`oldHeight: ${oldHeight}`);
  if (oldTop === 0) {
    return 0;
  }
  // Scroll up by 'scrollPx' pixels
  await messagePane.evaluate((el, old, delta) => {
    el.scrollTo(0, Math.max(0, old - delta));
  }, oldTop, scrollPx);
  // Teams may trigger network requests when scrolling; wait for height change or scrollTop change
  try {
    await page.waitForFunction(
      (old) => {
        const pane = document.querySelector('[data-tid="channel-pane-viewport"]');
        return pane.scrollHeight > old;
      },
      { timeout: 10000 },
      oldHeight
    );
  } catch {
    console.debug(`max height: ${await getScrollHeight()}`);
    await page.waitForFunction(
      (old) => {
        const pane = document.querySelector('[data-tid="channel-pane-viewport"]');
        return pane.scrollTop < old;
      },
      { timeout: 500 },
      oldTop
    );
  }
  const newTop = await getScrollTop();
  console.debug(`newTop: ${newTop}`);
  console.debug(`newHeight: ${await getScrollHeight()}`);
  return newTop;
}

/**
 * Enqueues a download job for the specified URL.
 *
 * @param {string} url - The URL to download.
 * @returns {Promise<string>} Resolves with the local file path of the downloaded file.
 */
function download(url) {
  return new Promise((resolve, reject) => {
    downloadJobs.push({ url, resolve, reject });
    processDownloadQueue();
  });
}
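
// Design note: downloads are serialized rather than run in parallel. Every job
// navigates the single shared `downloadPageInstance` tab, and Chrome drops all
// completed files into one shared download directory, so `downloadLock` ensures
// only one navigation (and one expected file on disk) is in flight at a time.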

/**
 * Processes the download job queue sequentially.
 *
 * Continues processing until there are no jobs left.
 */
async function processDownloadQueue() {
  if (downloadLock) return;
  const job = downloadJobs.shift();
  if (!job) return;
  downloadLock = true;
  try {
    const parsedUrl = new URL(job.url);
    console.debug(`downloading ${job.url}`);
    const urlPath = parsedUrl.pathname;
    let fileName = decodeURIComponent(urlPath.substring(urlPath.lastIndexOf('/') + 1));
    let filePath = null;
    let response = null;
    if (parsedUrl.origin === 'https://ntucc365.sharepoint.com') {
      // For attachments from SharePoint (Teams design), downloading via Puppeteer
      // sometimes fails, so we skip the URLs listed in SKIP_URLS.
      if (!SKIP_URLS.includes(job.url)) {
        try {
          response = await downloadPageInstance.goto(job.url, { timeout: 0, waitUntil: 'networkidle0' });
        } catch (err) {
          // Ignore errors like net::ERR_ABORTED since the file might still download via CDP
        }
      }
      const oldFilePath = path.join(downloadDir, fileName);
      let retry = 2;
      while (retry--) {
        try {
          await waitUntil(() => fs.existsSync(oldFilePath), { timeout: 10000, interval: 500 });
          filePath = path.join(downloadDir, `${uuidv4()}-${fileName}`);
          fs.renameSync(oldFilePath, filePath);
          break;
        } catch {
          if (retry) {
            console.error(
              `Failed to find the downloaded file.\n` +
              `url: ${job.url}\n` +
              `path: ${oldFilePath}\n` +
              `Please manually download the file and then resume.`
            );
            await pauseUntilResume();
          } else {
            console.warn("Leaving the file path empty.");
            filePath = '';
          }
        }
      }
    } else {
      // For displayed images
      response = await downloadPageInstance.goto(job.url, { timeout: 0, waitUntil: 'networkidle0' });
      const contentType = response.headers()['content-type'] || '';
      let extension;
      if (contentType.includes('jpeg')) {
        extension = 'jpg';
      } else if (contentType.includes('png')) {
        extension = 'png';
      } else if (contentType.includes('gif')) {
        extension = 'gif';
      } else {
        extension = contentType.split('/')[1];
      }
      // Assign a new unique filename because the filename in the URL is always the same
      // for different images.
      fileName = `${uuidv4()}.${extension}`;
      const responseBuffer = await response.buffer();
      filePath = path.join(downloadDir, fileName);
      await fs.promises.writeFile(filePath, responseBuffer);
    }
    job.resolve(filePath);
  } catch (error) {
    job.reject(error);
  } finally {
    downloadLock = false;
    processDownloadQueue();
  }
}
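
// Recovery flow: when a SharePoint file never appears in `downloadDir`, the
// crawler pauses. Save the file there manually under its original name (the
// `fileName` derived from the URL above), then press C; the retry loop picks
// it up and renames it with a UUID prefix, just like an automatic download.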

/**
 * Parses a post element from the Teams channel.
 *
 * @param {import('puppeteer').ElementHandle} postElement - The DOM element representing a post.
 * @returns {Promise<Object|null>} The parsed post object, or null if the post is deleted.
 */
async function parsePost(postElement) {
  const parsedPost = await postElement.evaluate(async (elem) => {
    const postContent = elem.querySelector('[id^="post-message-render"] [id^="message-body"] [data-reply-chain-id]');
    if (!postContent) { // Deleted post
      return null;
    }
    const contentClone = postContent.querySelector('[id^="content"]')?.cloneNode(true);
    const title = postContent.querySelector('h2')?.innerText || '';
    const author = elem.querySelector('[id^="author"]').innerText;
    let timestamp = elem.getElementsByTagName('time')[0].id.split('-')[1];
    timestamp = Number(timestamp);
    const createTime = new Date(timestamp).toLocaleString('zh-TW', {
      hour12: false,
      year: 'numeric',
      month: '2-digit',
      day: '2-digit',
      hour: '2-digit',
      minute: '2-digit',
      second: '2-digit',
    });
    // Replace emoji animations with actual characters (Teams design)
    contentClone.querySelectorAll('[data-tid="emoticon-renderer"]').forEach((el) => {
      el.querySelectorAll('img').forEach((imgEl) => {
        imgEl.outerHTML = imgEl.getAttribute('alt');
      });
    });
    // Replace mention tags with plain text
    contentClone.querySelectorAll('[itemtype="http://schema.skype.com/Mention"]').forEach((el) => {
      if (el.parentNode.hasAttribute('data-lpc-hover-target-id')) {
        el.parentNode.outerHTML = el.parentNode.innerText;
      } else {
        el.outerHTML = el.innerText;
      }
    });
    const remoteToLocal = {};
    const attachmentNodes = postContent.querySelector('[id^=attachments]')?.childNodes || [];
    const attachmentResults = await Promise.all(
      Array.from(attachmentNodes).map(async (attachment) => {
        if (attachment.hasAttribute('title')) {
          // Normal attachment
          let [filename, url] = attachment.getAttribute('title').split('\r\n');
          const parsed = new URL(url);
          if (parsed.origin !== 'https://ntucc365.sharepoint.com') {
            return [url, null];
          }
          const fullPath = parsed.origin + parsed.pathname;
          const localLink = await window.download(fullPath);
          remoteToLocal[url] = localLink;
          return [localLink, null];
        } else if (attachment.querySelector('img[data-gallery-src]') != null) {
          // Image attachment
          const imgElem = attachment.querySelector('img[data-gallery-src]');
          const fullSizeImagePath = imgElem.getAttribute('data-gallery-src');
          const localPath = await window.download(fullSizeImagePath);
          imgElem.setAttribute('src', localPath);
          return [localPath, imgElem];
        } else {
          return [null, null];
        }
      })
    );
    const attachments = [];
    attachmentResults.forEach(([link, imageElem]) => {
      if (link != null) attachments.push(link);
      if (imageElem != null) contentClone.appendChild(imageElem);
    });
    // Download images and update their src attribute
    await Promise.all(
      Array.from(contentClone.querySelectorAll('img[data-gallery-src]')).map(async (img) => {
        const fullSizeImagePath = img.getAttribute('data-gallery-src');
        const localPath = await window.download(fullSizeImagePath);
        img.setAttribute('src', localPath);
      })
    );
    // Update anchor tags if they point to attachments
    contentClone.querySelectorAll('a').forEach((el) => {
      const href = el.getAttribute('href');
      if (remoteToLocal[href] != null) {
        el.setAttribute('href', remoteToLocal[href]);
      }
    });
    return { title, author, timestamp, createTime, outerHTML: contentClone.outerHTML, attachments };
  });
  if (parsedPost === null) return null;
  parsedPost.content = turndownService
    .turndown(parsedPost.outerHTML)
    .replace(/\u00A0/g, '\u0020'); // Replace non-breaking spaces with normal spaces
  delete parsedPost.outerHTML;
  return parsedPost;
}
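
// Note on `window.download`: parsePost runs inside the page via evaluate(),
// where Node modules are unavailable. page.exposeFunction('download', download)
// (set up in the main IIFE below) bridges that gap: calling window.download(url)
// in the browser invokes the Node-side download() and resolves to the local
// file path, which is why attachment downloads can be awaited from page code.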

/**
 * Converts the extracted posts into a Markdown file.
 *
 * @param {Array<Object>} articles - Array of post objects.
 * @param {string} markdownFilePath - The path where the Markdown file will be saved.
 */
function postToMarkdown(articles, markdownFilePath) {
  let markdownContent = '';
  articles.forEach((article) => {
    const { title, author, createTime, content, attachments } = article;
    let metadata = `- **Author:** ${author}\n- **Created:** ${createTime}\n`;
    if (attachments && attachments.length > 0) {
      metadata += `- **Attachments:**\n`;
      attachments.forEach((filePath) => {
        if (filePath === '') {
          metadata += "  - (the attachment was unavailable while crawling)\n";
        } else {
          const filename = path.basename(filePath);
          // substring(37) strips the `${uuidv4()}-` prefix (36 characters plus
          // the hyphen) that download() adds to SharePoint attachments.
          metadata += `  - [${filename.substring(37)}](${encodeURIComponent(filePath)})\n`;
        }
      });
      metadata += `\n`;
    }
    metadata += `\n`;
    if (title) {
      markdownContent += `## ${title}\n\n`;
    }
    markdownContent += metadata;
    markdownContent += `${content}\n\n---\n\n`;
  });
  fs.writeFileSync(markdownFilePath, markdownContent, 'utf8');
}
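
// For reference, each post renders roughly like this in the generated Markdown
// (the title, author, and filename are illustrative; attachment links point
// into the channel's resources folder, URI-encoded):
//
//   ## Post title
//
//   - **Author:** Jane Doe
//   - **Created:** 2025/02/26 14:19:00
//   - **Attachments:**
//     - [report.pdf](my-channel_resources%2F<uuid>-report.pdf)
//
//   Post body converted from HTML to Markdown...
//
//   ---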

/**
 * Crawls the Microsoft Teams channel page to extract posts.
 *
 * @param {import('puppeteer').Page} page - The Puppeteer page instance.
 * @returns {Promise<void>}
 */
async function crawlChannel(page) {
  const seenPosts = new Set();
  const posts = [];
  const channelName = await page.evaluate(() => {
    return document.querySelector('[data-tid="channelTitle-text"]').innerText;
  });
  console.log(`Crawling started. Channel: ${channelName}`);
  downloadDir = `./${channelName}_${DOWNLOAD_BASE_DIR}`;
  const absoluteDownloadDir = path.resolve(downloadDir);
  if (!fs.existsSync(absoluteDownloadDir)) {
    fs.mkdirSync(absoluteDownloadDir, { recursive: true });
  }
  // Set Chrome’s download behavior to use the designated folder.
  await cdpSession.send('Page.setDownloadBehavior', {
    behavior: 'allow',
    downloadPath: absoluteDownloadDir,
  });
  let currentScrollTop = -1;
  while (currentScrollTop !== 0) {
    const postElements = await page.$$('[id^="reply-chain-summary"]');
    for (const postElem of postElements) {
      const postId = await page.evaluate((el) => el.getAttribute("id"), postElem);
      if (!seenPosts.has(postId)) {
        console.info(`Found new post. HTML element id: ${postId}`);
        try {
          const parsedPost = await parsePost(postElem);
          if (parsedPost != null) {
            posts.push(parsedPost);
          }
        } catch (err) {
          console.error(err);
          await pauseUntilResume();
        }
        seenPosts.add(postId);
      }
    }
    console.debug(`Found ${postElements.length} elements matching [id^="reply-chain-summary"].`);
    try {
      currentScrollTop = await scrollUp(page);
      await sleep(500);
    } catch {
      break;
    }
  }
  console.info(`Total posts: ${posts.length}`);
  posts.sort((a, b) => a.timestamp - b.timestamp);
  const jsonString = JSON.stringify(posts, null, 4);
  const yamlString = yaml.dump(posts);
  try {
    fs.writeFileSync(`${channelName}.json`, jsonString);
    fs.writeFileSync(`${channelName}.yaml`, yamlString);
    postToMarkdown(posts, `${channelName}.md`);
    console.log(`Crawled the channel successfully: ${channelName}`);
  } catch (err) {
    console.error(`Error writing file: ${channelName}`, err);
  }
}
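
// Output per crawled channel: <channel>.json, <channel>.yaml, and <channel>.md
// in the working directory, plus a ./<channel>_resources folder holding the
// downloaded attachments and images that the Markdown links point to.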

(async () => {
  // Launch Puppeteer with a visible browser
  browserInstance = await puppeteer.launch({
    headless: false,
    userDataDir: path.join(__dirname, 'puppeteer_user_data'),
    defaultViewport: null,
  });
  downloadPageInstance = await browserInstance.newPage();
  cdpSession = await downloadPageInstance.createCDPSession();
  // Configure Fetch to intercept responses from SharePoint and force file downloads instead of inline display
  // Workaround from https://stackoverflow.com/a/63232618/6663588
  await cdpSession.send('Fetch.enable', {
    patterns: [{ urlPattern: 'https://ntucc365.sharepoint.com/*', requestStage: 'Response' }],
  });
  cdpSession.on('Fetch.requestPaused', async (event) => {
    const { requestId, responseHeaders } = event;
    const headers = responseHeaders || [];
    const contentTypeHeader = headers.find(
      (header) => header.name.toLowerCase() === 'content-type'
    );
    if (
      contentTypeHeader &&
      (contentTypeHeader.value === 'application/pdf' || contentTypeHeader.value.includes('xml'))
    ) {
      headers.push({ name: 'Content-Disposition', value: 'attachment' });
      const response = await cdpSession.send('Fetch.getResponseBody', { requestId });
      await cdpSession.send('Fetch.fulfillRequest', {
        requestId,
        responseCode: 200,
        responseHeaders: headers,
        body: response.body,
      });
    } else {
      await cdpSession.send('Fetch.continueRequest', { requestId });
    }
  });
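
  // Note: Fetch.fulfillRequest expects `body` as a base64 string, and
  // Fetch.getResponseBody reports via its `base64Encoded` flag whether the body
  // is already base64 (it is for binary content such as PDFs). The pass-through
  // above relies on that flag being true; a plain-text body would need
  // Buffer.from(body).toString('base64') first.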

  const page = await browserInstance.newPage();
  await page.exposeFunction('download', download);
  // Navigate to the target Teams channel URL
  const targetUrl = 'https://teams.microsoft.com/v2';
  await page.goto(targetUrl, { waitUntil: 'networkidle2' });
  await page.setBypassCSP(true);
  console.log('Before starting to crawl the posts, please log in to your' +
    ' Teams account and switch to the desired channel for crawling.');
  console.log('Press SPACE to start/stop crawling. Press Q to quit.');
  let isCrawling = false;
  // Set raw mode for stdin to capture key presses.
  process.stdin.setRawMode(true);
  process.stdin.on('data', async (key) => {
    const keyStr = key.toString();
    if (keyStr === ' ') {
      if (!isCrawling) {
        isCrawling = true;
        await crawlChannel(page);
        isCrawling = false;
      } else {
        // Note: this only clears the flag; an in-progress crawlChannel() run
        // finishes on its own rather than being aborted.
        isCrawling = false;
        console.info('Crawling stopped.');
      }
    } else if (keyStr === 'c') {
      resumeCrawl = true;
    } else if (keyStr.toLowerCase() === 'q') {
      console.log('Exiting...');
      await browserInstance.close();
      process.exit(0);
    }
  });
})();