openai web-qa node.js
import axios from 'axios';
import { load } from 'cheerio';
import fs from 'node:fs';
import { setTimeout } from 'node:timers/promises';
import { createObjectCsvWriter } from 'csv-writer';
import tiktoken from 'tiktoken-node';
import { OpenAIApi, Configuration } from 'openai';
import distance from 'compute-cosine-distance';
// The target site: bare hostname and full starting URL (left blank here).
const domain = '';
const full_url = '';

const openai = new OpenAIApi(new Configuration({
    apiKey: '' // your OpenAI API key
}));
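
// Both values above are left blank to fill in; as an illustration (hypothetical
// site), they would look like:
//   const domain = 'example.com';             // hostname used for folder names
//   const full_url = 'https://example.com/';  // page the crawl starts from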
// Fetch a page and return the raw href values of all its anchor tags.
const getHyperlinks = async (url) => {
    try {
        const response = await axios.get(url);
        if (response.status !== 200 || !response.headers['content-type']?.includes('text/html')) {
            return [];
        }
        const $ = load(response.data);
        const hyperlinks = [];
        $('a[href]').each((_, element) => {
            hyperlinks.push($(element).attr('href'));
        });
        return hyperlinks;
    } catch (error) {
        return [];
    }
}
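
// getHyperlinks returns hrefs exactly as written in the page, so the result can
// mix absolute URLs, relative paths, fragments, and mailto links, e.g.
// (illustrative): ['https://example.com/docs', '/pricing', '#top',
// 'mailto:hi@example.com']. getDomainHyperlinks below normalizes that mix.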
// Normalize a page's links to absolute, same-domain URLs without query
// strings, fragments, or trailing slashes, deduplicated.
const getDomainHyperlinks = async (localDomain, url) => {
    const hyperlinks = await getHyperlinks(url);
    const cleanLinks = [];
    hyperlinks.forEach((link) => {
        let cleanLink = null;
        if (/^https?:\/\/.+/.test(link)) {
            // Absolute URL: keep it only if it points at the same domain.
            const urlObj = new URL(link);
            if (urlObj.hostname === localDomain) {
                cleanLink = `${urlObj.protocol}//${urlObj.hostname}${urlObj.pathname}`;
            }
        } else {
            // Relative URL: skip fragments, mailto links, and bare www links.
            if (link.startsWith('/')) {
                link = link.slice(1);
            } else if (link.startsWith('#') || link.startsWith('mailto:') || link.startsWith('www.')) {
                return;
            }
            cleanLink = `https://${localDomain}/${link.split(/[?#]/)[0]}`;
        }
        if (cleanLink) {
            if (cleanLink.endsWith('/')) {
                cleanLink = cleanLink.slice(0, -1);
            }
            cleanLinks.push(cleanLink);
        }
    });
    return Array.from(new Set(cleanLinks));
}
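
// A sketch of the normalization, assuming a localDomain of 'example.com':
//   '/docs/intro?ref=nav'          -> 'https://example.com/docs/intro'
//   'https://example.com/about/'   -> 'https://example.com/about'
//   'https://other.com/x', '#top'  -> dropped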
// Breadth-first crawl of the site: save each page's visible text under
// text/<domain>/, then enqueue its unseen same-domain links.
const crawl = async (url) => {
    const localDomain = new URL(url).hostname;
    const queue = [url];
    const seen = new Set([url]);

    if (!fs.existsSync('text/')) {
        fs.mkdirSync('text/');
    }
    if (!fs.existsSync(`text/${localDomain}/`)) {
        fs.mkdirSync(`text/${localDomain}/`);
    }

    while (queue.length > 0) {
        const currentUrl = queue.shift();
        console.log(currentUrl);

        // Derive a flat file name from the URL: drop the protocol and any
        // port, and turn slashes into underscores.
        const fileName = currentUrl.replace(/https?:\/\//, '').replace(/:\d+/g, '').replace(/\//g, '_');
        const filePath = `text/${localDomain}/${fileName}.txt`;

        // Small delay between requests to avoid hammering the server.
        await setTimeout(100);

        try {
            const response = await axios.get(currentUrl);
            if (response.status !== 200 || !response.headers['content-type']?.includes('text/html')) {
                continue;
            }
            const $ = load(response.data);
            // Drop non-content elements before extracting the body text.
            $('script, style, link, noscript').remove();
            const text = $('body').text().replace(/\n+\s*/g, '\n').replace(/ +/g, ' ').trim();
            fs.writeFileSync(filePath, text);

            const hyperlinks = await getDomainHyperlinks(localDomain, currentUrl);
            hyperlinks.forEach((link) => {
                if (!seen.has(link)) {
                    queue.push(link);
                    seen.add(link);
                }
            });
        } catch (error) {
            console.log('Failed to process', currentUrl);
        }
    }
}
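
// As a concrete sketch (hypothetical URL): crawling 'https://example.com/docs/intro'
// writes the page text to text/example.com/example.com_docs_intro.txt, and each
// same-domain link is visited exactly once thanks to the `seen` set.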
console.log('Crawling...');
await crawl(full_url);
console.log('Done crawling');

console.log('Processing...');
const texts = [];
const tokenizer = tiktoken.getEncoding('cl100k_base');

const files = await fs.promises.readdir(`text/${domain}/`);
for (const file of files) {
    const text = await fs.promises.readFile(`text/${domain}/${file}`, 'utf8');
    // Collapse all whitespace so each page becomes one long line of text.
    const fixedText = text.replace(/\n/g, ' ').replace(/\\n/g, ' ').replace(/\s+/g, ' ');
    // Strip the leading domain and the trailing '.txt' from the file name.
    const filename = file.slice(domain.length, -4);
    if (filename && fixedText) {
        texts.push({
            fname: filename,
            text: fixedText,
            n_tokens: tokenizer.encode(fixedText).length,
        });
    }
}

if (!fs.existsSync('processed/')) {
    fs.mkdirSync('processed/');
}

const csvWriter = createObjectCsvWriter({
    path: 'processed/scraped.csv',
    header: [
        { id: 'fname', title: 'fname' },
        { id: 'text', title: 'text' },
    ]
});
await csvWriter.writeRecords(texts);
console.log('Done processing');
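
// cl100k_base is the encoding used by text-embedding-ada-002, so the n_tokens
// counted here match the token counts the embedding endpoint will see.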
const maxTokens = 500;

// Split a long text into chunks of at most maxTokens tokens, breaking on
// sentence boundaries ('. ').
const splitIntoMany = (text) => {
    const sentences = text.split('. ');
    const nTokens = sentences.map((sentence) => tokenizer.encode(' ' + sentence).length);

    const chunks = [];
    let tokensSoFar = 0;
    let chunk = [];

    for (let i = 0; i < sentences.length; i++) {
        const sentence = sentences[i];
        const token = nTokens[i];

        // If adding this sentence would exceed the limit, close the current chunk.
        if (tokensSoFar + token > maxTokens) {
            chunks.push(chunk.join('. ') + '.');
            chunk = [];
            tokensSoFar = 0;
        }

        // Skip any single sentence that is longer than the limit on its own.
        if (token > maxTokens) {
            continue;
        }

        chunk.push(sentence);
        tokensSoFar += token + 1;
    }

    // Flush the last, possibly partial chunk.
    if (chunk.length > 0) {
        chunks.push(chunk.join('. ') + '.');
    }

    return chunks;
}
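
// For instance, a page of roughly 1,200 tokens would come back as three chunks
// of at most 500 tokens each, split on sentence boundaries; any single sentence
// longer than maxTokens is dropped rather than split mid-sentence.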

const shortened = [];
for (const text of texts) {
    if (text.n_tokens > maxTokens) {
        shortened.push(...splitIntoMany(text.text));
    } else {
        shortened.push(text.text);
    }
}
console.log('Embedding...');

// Embed every chunk (all requests run concurrently via Promise.all).
const df = await Promise.all(shortened
    .filter(t => t !== '.')
    .map(async (text) => ({
        text,
        n_tokens: tokenizer.encode(text).length,
        embeddings: await openai.createEmbedding({
            input: text,
            model: 'text-embedding-ada-002',
        }).then((res) => res.data.data[0].embedding)
    })));

const csvWriter2 = createObjectCsvWriter({
    path: 'processed/embeddings.csv',
    header: [
        { id: 'text', title: 'text' },
        { id: 'n_tokens', title: 'n_tokens' },
        { id: 'embeddings', title: 'embeddings' },
    ]
});
await csvWriter2.writeRecords(df.map((row) => ({
    text: row.text,
    n_tokens: row.n_tokens,
    embeddings: JSON.stringify(row.embeddings),
})));
console.log('Done embedding');
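
// text-embedding-ada-002 returns a 1536-dimensional vector per input, so each
// row of processed/embeddings.csv holds a chunk's text, its token count, and
// the JSON-encoded vector.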
// Build a context string for a question: embed the question, rank the chunks
// by cosine distance, and take the closest ones until max_len tokens are used.
const create_context = async (question, max_len = 1800) => {
    const q_embeddings = await openai.createEmbedding({
        input: question,
        model: 'text-embedding-ada-002',
    });
    const embeddings = q_embeddings.data.data[0].embedding;

    // Sort chunks from most to least similar (smallest cosine distance first).
    const sortedDf = [...df].map((row) => {
        return {
            ...row,
            distance: distance(embeddings, row.embeddings)
        };
    }).sort((a, b) => a.distance - b.distance);

    const returns = [];
    let cur_len = 0;
    for (let i = 0; i < sortedDf.length; i++) {
        const row = sortedDf[i];
        cur_len += row.n_tokens + 4;
        if (cur_len > max_len) {
            break;
        }
        returns.push(row.text);
    }
    return returns.join('\n\n###\n\n');
}
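
// The returned context is the closest chunks joined with '###' separators:
//   <closest chunk>\n\n###\n\n<next-closest chunk>\n\n###\n\n...
// stopping once roughly max_len (default 1800) tokens have been accumulated.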
// Answer a question with text-davinci-003, grounded in the retrieved context.
const answer_question = async (question, max_len = 1800) => {
    const context = await create_context(question, max_len);
    console.log('Context:\n' + context + '\n\n');
    console.log('Question:\n' + question + '\n\n');
    console.log('Answering...');
    try {
        const response = await openai.createCompletion({
            prompt: `Answer the question based on the context below, and if the question can't be answered based on the context, say "I don't know"\n\nContext: ${context}\n\n---\n\nQuestion: ${question}\nAnswer:`,
            temperature: 0,
            max_tokens: 150,
            top_p: 1,
            frequency_penalty: 0,
            presence_penalty: 0,
            stop: null,
            model: 'text-davinci-003',
        });
        return response.data.choices[0].text.trim();
    } catch (e) {
        console.log(e);
        return '';
    }
}
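
// temperature: 0 makes the completion deterministic, which keeps answers
// anchored to the supplied context; max_tokens: 150 caps the answer's length.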
console.log('Answering...');

// Read questions from stdin, one per line, and print the answers.
process.stdin.resume();
process.stdin.setEncoding('utf8');
process.stdin.on('data', async function (chunk) {
    const question = chunk.trim();
    if (question) {
        console.log(await answer_question(question));
    }
});
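
// Usage sketch, assuming the script is saved as index.mjs (the top-level await
// requires running it as an ES module on a recent Node.js):
//   npm install axios cheerio csv-writer tiktoken-node openai compute-cosine-distance
//   node index.mjs
// then type a question and press Enter; the answer is printed to stdout.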