Forked from hrishioa/load_and_process_open_source_licenses.ts
Created
May 9, 2023 07:45
-
-
Save tuantranf/d12214413e67ea55c2460ccb5b036003 to your computer and use it in GitHub Desktop.
Simple TypeScript file demonstrating chunked, chained LLM calls to process large amounts of text.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Requires the gpt library from https://github.com/hrishioa/socrate, plus cli-progress, ansi-colors and langchain's Playwright web loader.
// Created by Hrishi Olickel ([email protected]) (@hrishioa). Reach out if you have trouble running this.
import { ThunkQueue } from '../../utils/simplethrottler';
import {
  AcceptedModels,
  Messages,
  askChatGPT,
  getMessagesTokenCount,
  getProperJSONFromGPT,
  modelProperties,
} from '../base';
const cliProgress = require('cli-progress');
const colors = require('ansi-colors');
import fs from 'fs';
import { Browser, Page, PlaywrightWebBaseLoader } from 'langchain/document_loaders/web/playwright';
/**
 * A license record as stored in the JSON files this script reads and writes.
 */
type License = {
  /** Display name of the license. */
  licenseName: string;
  /** Full text of the license. */
  licenseContent: string;
  /** `licenseContent` split into paragraphs; optional because it is only filled in on demand. */
  licenseContentParts?: string[];
  /** Structured answers about the license; the type also admits a raw string. */
  processedAnswers?: ProcessedAnswers | string;
};
/**
 * Answers extracted from a license text. Each field answers the question in
 * its doc comment.
 */
type ProcessedAnswers = {
  /** What conditions must be followed for commercial use? "" if there are no conditions. */
  commercialConditions: string;
  /** Does using code licensed under this license require any changes to the licensing of the derivative work? */
  downstreamChanges: string;
  /** Should all derivative work that uses code licensed under this license, also be distributed under the same license? */
  persistent: boolean;
  /**
   * Viral effect means that combining copyleft licensed work with a work
   * licensed under a different license leads to the resulting work (an
   * aggregate work) falling under the copyleft license. Is this license viral?
   */
  viral: boolean;
  /** Does this license require that the source code be published? */
  requirePublish: boolean;
};
/**
 * Source-text copy of the `ProcessedAnswers` type, embedded verbatim into the
 * prompt so the model knows which JSON shape to produce. It is a plain string,
 * so it must be kept in sync with the real type by hand.
 */
const PROCESSED_ANSWERS_SPEC = `type ProcessedAnswers = {
commercialConditions: string; // What conditions must be followed for commercial use? "" if there are no conditions.
downstreamChanges: string; // Does using code licensed under this license require any changes to the licensing of the derivative work?
persistent: boolean; // Should all derivative work that uses code licensed under this license, also be distributed under the same license?
viral: boolean; // Viral effect means that combining copyleft licensed work with a work licensed under a different license leads to the resulting work (an aggregate work) falling under the copyleft license. Is this license viral?
requirePublish: boolean; // Does this license require that the source code be published?
}`;
/** Verbose logging switch: on only when the `COPILOT_IS_DEBUG` environment variable is exactly `'true'`. */
const DEBUG = process.env.COPILOT_IS_DEBUG === 'true';
/**
 * Loads https://spdx.org/licenses/ in a headless browser and collects the text
 * of every element marked `typeof="spdx:License"`.
 *
 * @returns The collected texts with all whitespace removed; empty entries are
 *   dropped. Callers in this file use each entry as a license id.
 */
export async function getLicenseList(): Promise<string[]> {
  const loader = new PlaywrightWebBaseLoader('https://spdx.org/licenses/', {
    launchOptions: {
      headless: true,
    },
    gotoOptions: {
      waitUntil: 'domcontentloaded',
    },
    async evaluate(page: Page, browser: Browser) {
      // The scrape result is consumed as a single string below, so the texts
      // are joined with '~' here and split apart again after scraping.
      return await page.evaluate(() =>
        Array.from(
          document.querySelectorAll('[typeof="spdx:License"]'),
          (licenseTag) => licenseTag.textContent ?? ''
        ).join('~')
      );
    },
  });

  const licenseStrs = await loader.scrape();

  return (
    licenseStrs
      .split('~')
      .map((licenseStr) => licenseStr.replace(/\s/g, ''))
      // ''.split('~') is [''], so an empty scrape would otherwise yield one bogus id.
      .filter((licenseId) => licenseId.length > 0)
  );
}
/**
 * Loads `https://spdx.org/licenses/<licenseId>.html` in a headless browser and
 * extracts the license name and text from the `spdx:name` and
 * `spdx:licenseText` elements.
 *
 * @param licenseId Id used to build the page URL (inserted as-is).
 * @returns The license name and full text.
 * @throws Error when the scraped page does not contain both fields.
 */
export async function getLicense(licenseId: string): Promise<License> {
  const loader = new PlaywrightWebBaseLoader(`https://spdx.org/licenses/${licenseId}.html`, {
    launchOptions: {
      headless: true,
    },
    gotoOptions: {
      waitUntil: 'domcontentloaded',
    },
    async evaluate(page: Page, browser: Browser) {
      return await page.evaluate(() => {
        // A missing element is reported as null and rejected below with a
        // message naming the license, instead of failing inside the page.
        const nameTag = document.querySelector('[property="spdx:name"]');
        const textTag = document.querySelector('[property="spdx:licenseText"]');
        return JSON.stringify({
          licenseName: nameTag ? nameTag.textContent : null,
          licenseContent: textTag ? textTag.textContent : null,
        });
      });
    },
  });

  const licenseStr = await loader.scrape();
  const parsed: unknown = JSON.parse(licenseStr);
  if (typeof parsed !== 'object' || parsed === null) {
    throw new Error(`Unexpected scrape result for license "${licenseId}"`);
  }

  const { licenseName, licenseContent } = parsed as {
    licenseName?: unknown;
    licenseContent?: unknown;
  };
  if (typeof licenseName !== 'string' || typeof licenseContent !== 'string') {
    throw new Error(`Could not find the license name and text on the SPDX page for "${licenseId}"`);
  }

  return { licenseName, licenseContent };
}
/**
 * Splits license text into paragraphs.
 *
 * Paragraphs are separated by one or more blank (or whitespace-only) lines.
 * Within each paragraph, leading/trailing whitespace is removed and every run
 * of whitespace (including single line breaks) is collapsed to one space.
 *
 * @param licenseText Raw license text.
 * @returns The non-empty paragraphs, in order; `[]` for blank input.
 */
function splitLicenseIntoParagraphs(licenseText: string): string[] {
  return (
    licenseText
      // Split on multiple consecutive line breaks.
      .split(/\n\s*\n/)
      // The `g` flag matters: without it only the first whitespace run is collapsed.
      .map((paragraph) => paragraph.trim().replace(/\s+/g, ' '))
      // Leading/trailing blank lines produce empty paragraphs; drop them.
      .filter((paragraph) => paragraph.length > 0)
  );
}
/**
 * Asks the chat model to fill in `ProcessedAnswers` for a license, feeding the
 * license text in chunks that fit the model's token limit.
 *
 * Each call takes as many leading paragraphs as fit, sends them together with
 * the answers produced for earlier chunks, and recurses on the remaining
 * paragraphs. Only the answer for the final chunk is parsed and returned.
 *
 * Side effect: when `license.licenseContentParts` is missing it is computed
 * from `license.licenseContent` and stored on the passed-in object.
 *
 * @param license The license to process.
 * @param previousAnswers Raw model output for the preceding chunks, if any.
 * @returns The parsed answers; the raw model output (a string) when it could
 *   not be turned into JSON; or `null` when the model call did not return a
 *   complete message.
 */
async function processLicenseWithGPT(
  license: License,
  previousAnswers?: string
): Promise<ProcessedAnswers | string | null> {
  const BASEMODEL: AcceptedModels = 'gpt-3.5-turbo';
  // 1000 tokens are held back from the model limit, presumably for the prompt
  // scaffolding and the reply.
  const LICENSE_CONTENT_TOKEN_LIMIT = modelProperties[BASEMODEL].tokenLimit - 1000;

  if (!license.licenseContentParts) {
    license.licenseContentParts = splitLicenseIntoParagraphs(license.licenseContent);
  }
  const allParts = license.licenseContentParts;

  // prettier-ignore
  const prompts = {
    systemPrompt: (licenseContent: string, licenseName: string) =>
`You are a commercial license processor that can only output valid JSON.
LICENSE_NAME: ${licenseName}
LICENSE_CONTENT_PART:
\`\`\`
${licenseContent}
\`\`\`
`,
    startingPrompt: (previousAnswers?: string) =>
`PROCESSED_ANSWERS_SPEC:
\`\`\`typescript
${PROCESSED_ANSWERS_SPEC}
\`\`\`
ANSWERS_FOR_PREVIOUS_PARTS:
${previousAnswers ? previousAnswers : 'None'}
LICENSE_CONTENT_PART contains part of a code license. ANSWERS_FOR_PREVIOUS_PARTS contains ProcessedAnswers about the previous parts of the license. Use ANSWERS_FOR_PREVIOUS_PARTS and LICENSE_CONTENT_PART to generate a new JSON in the spec of PROCESSED_ANSWERS_SPEC, answering the questions therein.
Processed Answers JSON:
{
`
  }

  // Take leading paragraphs until the next one would reach the token budget.
  // The first paragraph is always taken, even if it alone is over budget:
  // otherwise the chunk would be empty and the recursion below would be handed
  // the exact same parts again, forever.
  const chunkParts: string[] = [];
  let chunkTokenCount = 0;
  for (const part of allParts) {
    const partTokenCount = getMessagesTokenCount([
      {
        role: 'system',
        content: part,
      },
    ]);
    const fits = chunkTokenCount + partTokenCount < LICENSE_CONTENT_TOKEN_LIMIT;
    if (!fits && chunkParts.length > 0) break;
    chunkParts.push(part);
    chunkTokenCount += partTokenCount;
  }
  // Keep paragraph boundaries visible to the model rather than gluing the
  // last word of one paragraph onto the first word of the next.
  const trimmedLicenseContent = chunkParts.join('\n\n');
  const remainingPartsToProcess = allParts.slice(chunkParts.length);

  const messages: Messages = [
    {
      role: 'system',
      content: prompts.systemPrompt(trimmedLicenseContent, license.licenseName),
    },
    {
      role: 'user',
      content: prompts.startingPrompt(previousAnswers),
    },
  ];

  if (DEBUG)
    console.log(
      'Processing part starting with ',
      trimmedLicenseContent.slice(0, 100),
      '...'
    );

  const result = await askChatGPT(messages, BASEMODEL, undefined, undefined, 1);

  if (result.response.type !== 'completeMessage') {
    console.error(
      'Error processing ',
      license.licenseName,
      ' - ',
      result.response
    );
    return null;
  }

  // The prompt ends with an opening brace, so the reply is its continuation.
  const completeMessage = '{' + result.response.completeMessage;
  if (DEBUG) console.log('Got {', completeMessage, '.');

  if (remainingPartsToProcess.length > 0) {
    return await processLicenseWithGPT(
      { ...license, licenseContentParts: remainingPartsToProcess },
      completeMessage
    );
  }

  try {
    const processedAnswers: ProcessedAnswers = JSON.parse(completeMessage);
    return processedAnswers;
  } catch (err) {
    // Not valid JSON as-is: try the JSON-coercion helper before giving up.
    const betterJSON = await getProperJSONFromGPT(completeMessage, 1);
    if (betterJSON.success) {
      if (DEBUG)
        console.log('JSON coercion got us ', betterJSON.extractedJSON, '.');
      return betterJSON.extractedJSON;
    }
    if (DEBUG)
      console.error('Error processing ', completeMessage, ' - ', err);
    // Hand back the raw text; the caller in this file treats a string as a failure.
    return completeMessage;
  }
}
/**
 * Downloads license texts from spdx.org into `./tmp_data/licenses.json`.
 *
 * Fetches the license id list, then downloads up to `MAX_LICENSES` of them
 * through a `ThunkQueue(50)`, rewriting the JSON file after every successful
 * download. A progress bar is shown while downloading, and ids that failed are
 * reported once it finishes.
 */
async function loadLicenses(): Promise<void> {
  const DATA_DIR = './tmp_data';
  const LICENSES_PATH = './tmp_data/licenses.json';
  // This is just to make sure we don't keep ringing up super costly GPT-3
  // charges. 51 keeps the previous cut-off (list indexes 0 through 50).
  const MAX_LICENSES = 51;

  console.log('Getting license list...');
  const licenseList = (await getLicenseList()).slice(0, MAX_LICENSES);

  const licenses: { [key: string]: License } = fs.existsSync(LICENSES_PATH)
    ? JSON.parse(fs.readFileSync(LICENSES_PATH, 'utf8'))
    : {};
  // Without the directory every write below throws and would be miscounted as
  // a failed download.
  fs.mkdirSync(DATA_DIR, { recursive: true });

  console.log('Downloading licenses...');
  const pBar = new cliProgress.SingleBar({
    format:
      'Downloading Licenses |' +
      colors.cyan('{bar}') +
      '| {percentage}% || {value}/{total} Licenses ({eta}s left) || Errored: {errorCount} Current: {licenseId}',
    barCompleteChar: '\u2588',
    barIncompleteChar: '\u2591',
    hideCursor: true,
  });

  let errorCount = 0;
  const erroredLicenses: string[] = [];

  // The total is the number of licenses actually attempted, so the bar can
  // reach 100%.
  pBar.start(licenseList.length, 0, {
    licenseId: licenseList[0],
    errorCount: errorCount,
  });

  try {
    const licenseQueue = new ThunkQueue(50);

    for (const licenseId of licenseList) {
      licenseQueue.add(async () => {
        try {
          licenses[licenseId] = await getLicense(licenseId);
          fs.writeFileSync(LICENSES_PATH, JSON.stringify(licenses, null, 2));
        } catch (err: unknown) {
          erroredLicenses.push(licenseId);
          errorCount++;
          if (DEBUG) console.error('Error downloading ', licenseId, ' - ', err);
        }
        pBar.increment(1, {
          licenseId: licenseId,
          errorCount: errorCount,
        });
      });
    }

    await licenseQueue.waitForAll();
  } finally {
    // Always stop the bar so the cursor hidden by `hideCursor` is restored.
    pBar.stop();
  }

  if (erroredLicenses.length > 0) {
    console.error(
      `Failed to download ${erroredLicenses.length} license(s): ${erroredLicenses.join(', ')}`
    );
  }
}
/**
 * Runs every license in `./tmp_data/licenses.json` through
 * `processLicenseWithGPT`, one at a time, and stores the successful results in
 * `./tmp_data/processedLicenses.json`.
 *
 * Licenses already present in the processed file are skipped. The processed
 * file is rewritten after every success. A `null` or string result, or a
 * thrown error, counts as a failure and is not stored.
 */
async function processLicenses(): Promise<void> {
  const LICENSES_PATH = './tmp_data/licenses.json';
  const PROCESSED_PATH = './tmp_data/processedLicenses.json';

  const licenseList: { [key: string]: License } = fs.existsSync(LICENSES_PATH)
    ? JSON.parse(fs.readFileSync(LICENSES_PATH, 'utf8'))
    : {};

  const processedLicenseList: { [key: string]: License } = fs.existsSync(PROCESSED_PATH)
    ? JSON.parse(fs.readFileSync(PROCESSED_PATH, 'utf8'))
    : {};

  const pBar = new cliProgress.SingleBar({
    format:
      'Processing Licenses |' +
      colors.cyan('{bar}') +
      '| {percentage}% || {value}/{total} Licenses ({eta}s left) || Errored: {errorCount}, Succeeded: {successCount} Current: {licenseId}',
    barCompleteChar: '\u2588',
    barIncompleteChar: '\u2591',
    hideCursor: true,
  });

  let errorCount = 0,
    successCount = 0;

  pBar.start(Object.keys(licenseList).length, 0, {
    licenseId: 'None',
    errorCount: errorCount,
    successCount: successCount,
  });

  try {
    // One at a time: the queue is created with a limit of 1.
    const licenseQueue = new ThunkQueue(1);

    for (const [licenseId, license] of Object.entries(licenseList)) {
      licenseQueue.add(async () => {
        if (!processedLicenseList[licenseId]) {
          try {
            const processedAnswers = await processLicenseWithGPT(license);

            if (processedAnswers === null || typeof processedAnswers === 'string') {
              errorCount++;
            } else {
              license.processedAnswers = processedAnswers;
              processedLicenseList[licenseId] = license;
              successCount++;
              fs.writeFileSync(
                PROCESSED_PATH,
                JSON.stringify(processedLicenseList, null, 2)
              );
            }
          } catch (err: unknown) {
            // A failed request for one license must not abort the whole run.
            errorCount++;
            if (DEBUG) console.error('Error processing ', licenseId, ' - ', err);
          }
        }

        pBar.increment(1, {
          licenseId: licenseId,
          errorCount: errorCount,
          successCount: successCount,
        });
      });
    }

    await licenseQueue.waitForAll();
  } finally {
    // Always stop the bar so the cursor hidden by `hideCursor` is restored.
    pBar.stop();
  }
}
// Script entry point: download the licenses. The GPT processing step is left
// disabled; uncomment it to run it after the download.
(async function loadAndProcessLicenses() {
  await loadLicenses();
  // await processLicenses();
})().catch((err: unknown) => {
  // Report the failure explicitly and exit non-zero instead of leaving a
  // floating rejected promise.
  console.error('loadAndProcessLicenses failed:', err);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment