Last active
July 27, 2023 10:18
-
-
Save pnutmath/0b52dd13c67b6dbce2cdf1146447ed87 to your computer and use it in GitHub Desktop.
Scrape a documentation site and post generated Q&As to a Discourse forum
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import fetch from 'node-fetch'; | |
import cheerio from 'cheerio'; | |
import { Configuration, OpenAIApi } from 'openai'; | |
// --- Configuration ------------------------------------------------------
// Replace the placeholder values before running.
const OPENAI_API_KEY = '<YOUR_OPENAI_API_KEY>';
// NOTE(review): request URLs below are built as `${DISCOURSE_SERVER_URL}posts.json`,
// so this value should end with a trailing '/'.
const DISCOURSE_SERVER_URL = '<YOUR_DISCOURSE_SERVER_URL>';
const DISCOURSE_API_KEY = '<YOUR_DISCOURSE_API_KEY>';
// Usernames the script posts questions as (picked at random per topic).
const DISCOURSE_POST_CREATORS = ['<CREATOR1>', '<CREATOR2>'];
// Usernames the script posts answers as (picked at random per topic).
const DISCOURSE_PRODUCT_MODERATORS = ['<MODERATOR1>', '<MODERATOR2>'];
// Discourse category that receives the generated topics.
const CATEGORY_ID = 1;
// Documentation site to crawl.
const ROOT_DOCUMENTATION_PATH = 'https://abhay.dev/';

// Shared OpenAI client used by all Q&A generation calls.
const configuration = new Configuration({ apiKey: OPENAI_API_KEY });
const openai = new OpenAIApi(configuration);
// Parse the webpage content | |
/**
 * Fetch a page and extract its title and visible body text.
 *
 * @param {string} url - Absolute URL of the page to parse.
 * @returns {Promise<{pageTitle: string, bodyContent: string, url: string}>}
 * @throws {Error} on network failure or a non-2xx HTTP status.
 */
const parseContent = async (url) => {
  const result = await fetch(url);
  // Fail loudly on error responses instead of parsing an error page as docs.
  if (!result.ok) {
    throw new Error(`HTTP ${result.status} fetching ${url}`);
  }
  const text = await result.text();
  const $ = cheerio.load(text);
  // Strip elements that carry no documentation content.
  $('script, noscript, style, footer, nav, iframe').remove();
  const bodyContent = $('body').text().trim();
  const pageTitle = $('title').text();
  return { pageTitle, bodyContent, url };
};
// Crawl function | |
/**
 * Crawl every page reachable under rootURL and return the parsed content
 * of each page.
 *
 * Fixes over the original:
 * - `startsWith(`${rootURL}/`)` produced a double slash when rootURL already
 *   ended in '/' (as ROOT_DOCUMENTATION_PATH does), so no link ever matched
 *   and only the root page was crawled. The prefix is now normalized.
 * - Each page was fetched twice (once via parseContent, once for links);
 *   it is now fetched and parsed exactly once.
 * - A single malformed href no longer aborts link collection for the page.
 *
 * @param {string} rootURL - Root of the documentation site.
 * @returns {Promise<Array<{pageTitle: string, bodyContent: string, url: string}>>}
 */
const crawlAndParse = async (rootURL) => {
  // Normalize so prefix matching works whether or not rootURL ends in '/'.
  const prefix = rootURL.endsWith('/') ? rootURL : `${rootURL}/`;
  const queue = [rootURL];
  const seen = new Set(queue);
  const crawledPages = [];
  while (queue.length) {
    const currentUrl = queue.pop();
    try {
      const response = await fetch(currentUrl);
      if (!response.ok) {
        throw new Error(`HTTP ${response.status}`);
      }
      const html = await response.text();
      const $ = cheerio.load(html);
      // Collect links BEFORE stripping nav/footer, so menu links still count
      // (the original collected links from an unstripped second fetch).
      const hyperlinks = $('a[href]').map((_, el) => $(el).attr('href')).get();
      for (const link of hyperlinks) {
        let cleanLink;
        try {
          // Resolve relative hrefs against the current page.
          cleanLink = new URL(link, currentUrl).toString();
        } catch {
          continue; // skip hrefs that are not valid URLs
        }
        if (cleanLink === prefix || cleanLink.startsWith(prefix)) {
          // Remove fragment identifiers so #anchors don't count as new pages.
          cleanLink = cleanLink.split('#')[0];
          if (!seen.has(cleanLink)) {
            seen.add(cleanLink);
            queue.push(cleanLink);
          }
        }
      }
      // Now strip non-content elements and extract the page text.
      $('script, noscript, style, footer, nav, iframe').remove();
      crawledPages.push({
        pageTitle: $('title').text(),
        bodyContent: $('body').text().trim(),
        url: currentUrl,
      });
    } catch (err) {
      console.error(`Failed to process ${currentUrl}: ${err}`);
    }
  }
  return crawledPages;
};
// Generate Q&A using OpenAI API | |
/**
 * Generate discussion Q&As for one crawled page via the OpenAI chat API.
 *
 * @param {{pageTitle: string, bodyContent: string, url: string}} page
 * @returns {Promise<Array<{t: string, q: string, a: string}>>} topics,
 *   questions and Markdown answers; [] if the API call or parsing fails,
 *   so the caller's loop can continue with the remaining pages.
 */
const generateQA = async ({ pageTitle, bodyContent, url }) => {
  console.info(`Generating Q&A for ${url}`);
  console.time(`generateQA:${url}`);
  try {
    const completion = await openai.createChatCompletion({
      model: 'gpt-4',
      messages: [
        {
          role: 'system',
          content: `You are an AI trained to assist moderators of Discourse servers in generating initial posts from existing documentation. You're given the title and content of a webpage, along with its URL. Your task is to analyze the content and formulate a multiple relevant discussion topic (t), craft a detailed question to instigate discussion (q), and prepare an answer(a). The answer should be in Markdown format and include the reference document URL. Double escape the newline characters in answer. The response should be a JSON array with keys 't', 'q', and 'a'.`
        },
        {
          role: 'user',
          content: `Page Title: ${pageTitle}\n\nContent:\n${bodyContent}\n\nURL: ${url}`
        }
      ],
    });
    // Parse Q&As from the response, then turn the model's double-escaped
    // newlines back into real newlines in each answer.
    const qas = JSON.parse(completion.data.choices[0].message.content);
    return qas.map(qa => ({ ...qa, a: qa.a.replace(/\\n/g, '\n') }));
  } catch (err) {
    // The original logged the API error but then crashed dereferencing an
    // undefined completion; fail soft instead.
    console.error(`Q&A generation failed for ${url}: ${err}`);
    return [];
  } finally {
    console.timeEnd(`generateQA:${url}`);
  }
};
// Function to select a random user from an array | |
const getRandomUser = users => users[Math.floor(Math.random() * users.length)]; | |
// Return a uniformly random Date between start (inclusive) and end (exclusive).
function randomDate(start, end) {
  const startMs = start.getTime();
  const span = end.getTime() - startMs;
  return new Date(startMs + Math.random() * span);
}
// Window for back-dating posts: the seven days ending now.
const now = new Date();
const oneWeekAgo = new Date(now);
oneWeekAgo.setDate(oneWeekAgo.getDate() - 7);
// NOTE(review): randomDateTime is never read below — postToDiscourse calls
// randomDate(oneWeekAgo, now) directly per post. Kept for compatibility.
const randomDateTime = randomDate(oneWeekAgo, now);
// Post the Q&As to Discourse | |
/**
 * Post one Q&A to Discourse: the question as a new topic authored by a
 * random creator, the answer as a reply authored by a random moderator.
 *
 * Fix over the original: `.catch(err => console.error(err))` turned a
 * failed fetch into `undefined`, after which `.json()` crashed; network
 * errors are now caught and the Q&A is skipped with a log message.
 *
 * @param {{t: string, q: string, a: string}} qa - Topic title, question, answer.
 * @param {number} category - Discourse category id for the new topic.
 */
const postToDiscourse = async (qa, category) => {
  // Select a random user from the list of users
  const creator = getRandomUser(DISCOURSE_POST_CREATORS);
  const moderator = getRandomUser(DISCOURSE_PRODUCT_MODERATORS);

  // Post the question as a new topic
  let questionData;
  try {
    const questionResponse = await fetch(`${DISCOURSE_SERVER_URL}posts.json`, {
      method: 'POST',
      headers: {
        'Api-Key': DISCOURSE_API_KEY,
        'Api-Username': creator,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({
        title: qa.t,
        raw: qa.q,
        category,
        // Back-date the topic to a random moment in the last week.
        created_at: randomDate(oneWeekAgo, now).toISOString(),
        // embed_url: '' // add documentation url if you want to embed it
      })
    });
    questionData = await questionResponse.json();
  } catch (err) {
    console.error(err);
    console.log(`(${qa.t} [Creator: ${creator}]) Failed to post question.`)
    return;
  }
  if (questionData.errors) {
    console.error(JSON.stringify(questionData));
    console.log(`(${qa.t} [Creator: ${creator}]) Failed to post question.`)
    return;
  }
  console.info(`Question posted (${qa.t}): ${DISCOURSE_SERVER_URL}t/${questionData.topic_id}`);

  // Post the answer as a reply to the new topic with a different user
  let answerData;
  try {
    const answerResponse = await fetch(`${DISCOURSE_SERVER_URL}posts.json`, {
      method: 'POST',
      headers: {
        'Api-Key': DISCOURSE_API_KEY,
        'Api-Username': moderator,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({
        raw: qa.a,
        topic_id: questionData.topic_id, // topic_id of the question you've just created
      })
    });
    answerData = await answerResponse.json();
  } catch (err) {
    console.error(err);
    console.log(`(${qa.t} [Moderator: ${moderator}]) Failed to post answer.`)
    return;
  }
  if (answerData.errors) {
    console.error(JSON.stringify(answerData));
    console.log(`(${qa.t} [Moderator: ${moderator}]) Failed to post answer.`)
    return;
  }
  console.info(`Answer posted (${qa.t}): ${DISCOURSE_SERVER_URL}t/${answerData.topic_id}`);
};
// Main function to scrape the site, generate Q&As and post them | |
/**
 * Entry point: crawl the documentation site, generate Q&As for each page,
 * then post every Q&A to Discourse sequentially (posts are serialized on
 * purpose to avoid hammering the Discourse API).
 */
const main = async () => {
  // Step 1: Crawl all pages and parse the content
  const pagesContent = await crawlAndParse(ROOT_DOCUMENTATION_PATH);
  console.info(`Pages crawled and found ${pagesContent.length} pages`);
  for (const content of pagesContent) {
    console.info(`Page: ${content.url}`);
  }

  // Step 2: Generate Q&A for each page
  const allQAs = [];
  for (const content of pagesContent) {
    const qas = await generateQA(content);
    console.info(`Generated ${qas.length} Q&As for ${content.url}`);
    console.log(JSON.stringify(qas, null, 2))
    allQAs.push(...qas);
  }

  // Step 3: Post all Q&As to Discourse
  for (const qa of allQAs) {
    await postToDiscourse(qa, CATEGORY_ID);
  }
};

// The original left the promise floating, so any rejection became an
// unhandled rejection; surface failures and set a non-zero exit code.
main().catch((err) => {
  console.error(`Fatal: ${err}`);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment