// Requires the gpt library from https://github.com/hrishioa/socrate and the progress bar library.
// Created by Hrishi Olickel (hrishioa@gmail.com) (@hrishioa). Reach out if you have trouble running this.

import fs from 'fs';
import path from 'path';
import { Browser, Page, PlaywrightWebBaseLoader } from 'langchain/document_loaders/web/playwright';
import { ThunkQueue } from '../../utils/simplethrottler';
import {
  AcceptedModels,
  Messages,
  askChatGPT,
  getMessagesTokenCount,
  getProperJSONFromGPT,
  modelProperties,
} from '../base';

const cliProgress = require('cli-progress');
const colors = require('ansi-colors');

/** A license as scraped from spdx.org, optionally enriched with GPT answers. */
type License = {
  licenseName: string;
  licenseContent: string;
  /** Paragraphs of `licenseContent`; filled in lazily by `processLicenseWithGPT`. */
  licenseContentParts?: string[];
  /** Parsed answers, or the raw model output when it could not be parsed. */
  processedAnswers?: ProcessedAnswers | string;
};

type ProcessedAnswers = {
  commercialConditions: string; // What conditions must be followed for commercial use? "" if there are no conditions.
  downstreamChanges: string; // Does using code licensed under this license require any changes to the licensing of the derivative work?
  persistent: boolean; // Should all derivative work that uses code licensed under this license, also be distributed under the same license?
  viral: boolean; // Viral effect means that combining copyleft licensed work with a work licensed under a different license leads to the resulting work (an aggregate work) falling under the copyleft license. Is this license viral?
  requirePublish: boolean; // Does this license require that the source code be published?
};

// Textual copy of the ProcessedAnswers type above; it is embedded in the prompt
// so the model knows which JSON shape to produce. Keep the two in sync.
const PROCESSED_ANSWERS_SPEC = `type ProcessedAnswers = {
  commercialConditions: string; // What conditions must be followed for commercial use? "" if there are no conditions.
  downstreamChanges: string; // Does using code licensed under this license require any changes to the licensing of the derivative work?
  persistent: boolean; // Should all derivative work that uses code licensed under this license, also be distributed under the same license?
  viral: boolean; // Viral effect means that combining copyleft licensed work with a work licensed under a different license leads to the resulting work (an aggregate work) falling under the copyleft license. Is this license viral?
  requirePublish: boolean; // Does this license require that the source code be published?
}`;

const DEBUG = process.env.COPILOT_IS_DEBUG === 'true';

const LICENSES_PATH = './tmp_data/licenses.json';
const PROCESSED_LICENSES_PATH = './tmp_data/processedLicenses.json';

// Cap on how many licenses are downloaded per run (indices 0..50 inclusive).
// This is just to make sure we don't keep ringing up super costly GPT-3 charges.
const MAX_LICENSES_TO_DOWNLOAD = 51;

/** Reads and parses a JSON file, returning `fallback` when the file does not exist. */
function readJsonFile<T>(filePath: string, fallback: T): T {
  return fs.existsSync(filePath)
    ? (JSON.parse(fs.readFileSync(filePath, 'utf8')) as T)
    : fallback;
}

/** Pretty-prints `data` to `filePath`, creating the parent directory if needed. */
function writeJsonFile(filePath: string, data: unknown): void {
  fs.mkdirSync(path.dirname(filePath), { recursive: true });
  fs.writeFileSync(filePath, JSON.stringify(data, null, 2));
}

/**
 * Scrapes https://spdx.org/licenses/ and returns the text of every element
 * marked `typeof="spdx:License"`, with all whitespace removed.
 *
 * @returns The non-empty identifiers found on the page, in document order.
 */
export async function getLicenseList(): Promise<string[]> {
  const loader = new PlaywrightWebBaseLoader('https://spdx.org/licenses/', {
    launchOptions: {
      headless: true,
    },
    gotoOptions: {
      waitUntil: 'domcontentloaded',
    },
    async evaluate(page: Page, _browser: Browser) {
      // Runs inside the page; the loader expects a single string back, so the
      // entries are joined with '~' and split again below.
      return await page.evaluate(() =>
        Array.from(
          document.querySelectorAll('[typeof="spdx:License"]'),
          (licenseTag) => licenseTag.textContent ?? ''
        ).join('~')
      );
    },
  });

  const licenseStrs = await loader.scrape();

  return licenseStrs
    .split('~')
    .map((licenseStr) => licenseStr.replace(/\s/g, ''))
    // An empty scrape would otherwise yield a single bogus '' identifier.
    .filter((licenseId) => licenseId.length > 0);
}

/**
 * Scrapes the name and full text of one license from its spdx.org page.
 *
 * @param licenseId Identifier used to build `https://spdx.org/licenses/<id>.html`.
 * @returns The license name and content found on the page.
 * @throws Error when the page lacks the expected name or text element.
 */
export async function getLicense(licenseId: string): Promise<License> {
  const loader = new PlaywrightWebBaseLoader(`https://spdx.org/licenses/${licenseId}.html`, {
    launchOptions: {
      headless: true,
    },
    gotoOptions: {
      waitUntil: 'domcontentloaded',
    },
    async evaluate(page: Page, _browser: Browser) {
      // Missing elements are reported as null here and rejected below, where
      // the license id is available for the error message.
      return await page.evaluate(() =>
        JSON.stringify({
          licenseName: document.querySelector('[property="spdx:name"]')?.textContent ?? null,
          licenseContent: document.querySelector('[property="spdx:licenseText"]')?.textContent ?? null,
        })
      );
    },
  });

  const licenseStr = await loader.scrape();
  const parsed = JSON.parse(licenseStr) as { licenseName?: unknown; licenseContent?: unknown };

  if (typeof parsed.licenseName !== 'string' || typeof parsed.licenseContent !== 'string') {
    throw new Error(`Could not find license name/text on the SPDX page for "${licenseId}"`);
  }

  return {
    licenseName: parsed.licenseName,
    licenseContent: parsed.licenseContent,
  };
}

/**
 * Splits license text into paragraphs on blank lines.
 *
 * Each paragraph is trimmed and has every run of whitespace collapsed to a
 * single space; paragraphs that end up empty are dropped.
 */
function splitLicenseIntoParagraphs(licenseText: string): string[] {
  return licenseText
    .split(/\n\s*\n/)
    .map((paragraph) => paragraph.trim().replace(/\s+/g, ' '))
    .filter((paragraph) => paragraph.length > 0);
}

/**
 * Asks the chat model to fill in `ProcessedAnswers` for a license.
 *
 * The license is fed paragraph by paragraph: as many leading paragraphs as fit
 * in the token budget are sent in one request, and the function then recurses
 * on the remaining paragraphs, passing the model's answer so far along as
 * `previousAnswers`.
 *
 * Side effect: when `license.licenseContentParts` is unset it is populated on
 * the passed-in object.
 *
 * @param license The license to process.
 * @param previousAnswers Raw model output for the earlier parts, if any.
 * @returns The parsed answers; the raw model output (string) when it could not
 *   be turned into JSON; or `null` when the model call did not produce a
 *   complete message.
 */
async function processLicenseWithGPT(
  license: License,
  previousAnswers?: string
): Promise<ProcessedAnswers | string | null> {
  const BASEMODEL: AcceptedModels = 'gpt-3.5-turbo';
  const LICENSE_CONTENT_TOKEN_LIMIT = modelProperties[BASEMODEL].tokenLimit - 1000;

  if (!license.licenseContentParts)
    license.licenseContentParts = splitLicenseIntoParagraphs(
      license.licenseContent
    );

  // prettier-ignore
  const prompts = {
    systemPrompt: (licenseContent: string, licenseName: string) =>
`You are a commercial license processor that can only output valid JSON.

LICENSE_NAME: ${licenseName}

LICENSE_CONTENT_PART:
\`\`\`
${licenseContent}
\`\`\`
`,
    startingPrompt: (previousAnswers?: string) =>
`PROCESSED_ANSWERS_SPEC:
\`\`\`typescript
${PROCESSED_ANSWERS_SPEC}
\`\`\`

ANSWERS_FOR_PREVIOUS_PARTS: ${previousAnswers ? previousAnswers : 'None'}

LICENSE_CONTENT_PART contains part of a code license. ANSWERS_FOR_PREVIOUS_PARTS contains ProcessedAnswers about the previous parts of the license. Use ANSWERS_FOR_PREVIOUS_PARTS and LICENSE_CONTENT_PART to generate a new JSON in the spec of PROCESSED_ANSWERS_SPEC, answering the questions therein.

Processed Answers JSON:
{
`
  }

  const partsForThisCall: string[] = [];
  let tokensForThisCall = 0;
  let remainingPartsToProcess: string[] = [];

  const allParts = license.licenseContentParts;
  for (const [index, part] of allParts.entries()) {
    const partTokenCount = getMessagesTokenCount([
      {
        role: 'system',
        content: part,
      },
    ]);

    const fitsInBudget =
      tokensForThisCall + partTokenCount < LICENSE_CONTENT_TOKEN_LIMIT;

    // Always consume at least one part per call. Without this, a single
    // paragraph larger than the budget would leave the remaining list
    // unchanged and the recursion below would never terminate.
    if (fitsInBudget || partsForThisCall.length === 0) {
      partsForThisCall.push(part);
      tokensForThisCall += partTokenCount;
    } else {
      remainingPartsToProcess = allParts.slice(index);
      break;
    }
  }

  // Keep paragraph boundaries visible to the model instead of fusing the
  // last word of one paragraph with the first word of the next.
  const trimmedLicenseContent = partsForThisCall.join('\n\n');

  const messages: Messages = [
    {
      role: 'system',
      content: prompts.systemPrompt(trimmedLicenseContent, license.licenseName),
    },
    {
      role: 'user',
      content: prompts.startingPrompt(previousAnswers),
    },
  ];

  if (DEBUG)
    console.log(
      'Processing part starting with ',
      trimmedLicenseContent.slice(0, 100),
      '...'
    );

  const result = await askChatGPT(messages, BASEMODEL, undefined, undefined, 1);

  if (result.response.type !== 'completeMessage') {
    console.error(
      'Error processing ',
      license.licenseName,
      ' - ',
      result.response
    );
    return null;
  }

  // The prompt ends with an opening brace, so the completion is expected to
  // continue from there; put the brace back before parsing.
  const completeMessage = '{' + result.response.completeMessage;

  if (DEBUG) console.log('Got {', completeMessage, '.');

  if (remainingPartsToProcess.length > 0) {
    return await processLicenseWithGPT(
      { ...license, licenseContentParts: remainingPartsToProcess },
      completeMessage
    );
  }

  try {
    const processedAnswers: ProcessedAnswers = JSON.parse(completeMessage);
    return processedAnswers;
  } catch (err) {
    // Not valid JSON: give the JSON-repair helper one attempt before giving up.
    const betterJSON = await getProperJSONFromGPT(completeMessage, 1);

    if (betterJSON.success) {
      if (DEBUG)
        console.log('JSON coercion got us ', betterJSON.extractedJSON, '.');
      return betterJSON.extractedJSON;
    }

    if (DEBUG)
      console.error('Error processing ', completeMessage, ' - ', err);
    return completeMessage;
  }
}

/**
 * Downloads up to `MAX_LICENSES_TO_DOWNLOAD` licenses from spdx.org into
 * `./tmp_data/licenses.json`, rewriting the file after every successful
 * download and showing a progress bar while doing so.
 *
 * Individual download failures do not abort the run; they are counted and
 * listed once the queue has drained.
 */
async function loadLicenses() {
  console.log('Getting license list...');

  const licenseList = await getLicenseList();
  const licensesToDownload = licenseList.slice(0, MAX_LICENSES_TO_DOWNLOAD);

  console.log('Downloading licenses...');

  const pBar = new cliProgress.SingleBar({
    format:
      'Downloading Licenses |' +
      colors.cyan('{bar}') +
      '| {percentage}% || {value}/{total} Licenses ({eta}s left) || Errored: {errorCount} Current: {licenseId}',
    barCompleteChar: '\u2588',
    barIncompleteChar: '\u2591',
    hideCursor: true,
  });

  let errorCount = 0;

  // The total is the number of licenses actually queued, so the bar can
  // reach 100%.
  pBar.start(licensesToDownload.length, 0, {
    licenseId: licensesToDownload[0],
    errorCount: errorCount,
  });

  const licenses = readJsonFile<{ [key: string]: License }>(LICENSES_PATH, {});

  const erroredLicenses: { licenseId: string; reason: string }[] = [];

  // NOTE(review): 50 is presumably the queue's concurrency limit - confirm
  // against ThunkQueue.
  const licenseQueue = new ThunkQueue(50);

  try {
    for (const licenseId of licensesToDownload) {
      licenseQueue.add(async () => {
        try {
          licenses[licenseId] = await getLicense(licenseId);
          writeJsonFile(LICENSES_PATH, licenses);
        } catch (err) {
          erroredLicenses.push({
            licenseId,
            reason: err instanceof Error ? err.message : String(err),
          });
          errorCount++;
        }

        pBar.increment(1, {
          licenseId: licenseId,
          errorCount: errorCount,
        });
      });
    }

    await licenseQueue.waitForAll();
  } finally {
    // Stopping the bar restores the cursor hidden by `hideCursor`.
    pBar.stop();
  }

  if (erroredLicenses.length > 0) {
    console.error('Failed to download licenses:');
    for (const { licenseId, reason } of erroredLicenses) {
      console.error(`  ${licenseId}: ${reason}`);
    }
  }
}

/**
 * Runs every license in `./tmp_data/licenses.json` through
 * `processLicenseWithGPT`, one at a time, and stores the successful results in
 * `./tmp_data/processedLicenses.json`.
 *
 * Licenses already present in the processed file are skipped, so the function
 * can be re-run to resume. A `null` or string (unparseable) result counts as
 * an error and is not stored.
 */
async function processLicenses() {
  const licenseList = readJsonFile<{ [key: string]: License }>(LICENSES_PATH, {});

  const processedLicenseList = readJsonFile<{ [key: string]: License }>(
    PROCESSED_LICENSES_PATH,
    {}
  );

  const pBar = new cliProgress.SingleBar({
    format:
      'Processing Licenses |' +
      colors.cyan('{bar}') +
      '| {percentage}% || {value}/{total} Licenses ({eta}s left) || Errored: {errorCount}, Succeeded: {successCount} Current: {licenseId}',
    barCompleteChar: '\u2588',
    barIncompleteChar: '\u2591',
    hideCursor: true,
  });

  let errorCount = 0,
    successCount = 0;

  pBar.start(Object.keys(licenseList).length, 0, {
    licenseId: 'None',
    errorCount: errorCount,
    successCount: successCount,
  });

  const licenseQueue = new ThunkQueue(1);

  try {
    for (const [licenseId, license] of Object.entries(licenseList)) {
      licenseQueue.add(async () => {
        if (!processedLicenseList[licenseId]) {
          const processedAnswers = await processLicenseWithGPT(license);

          if (processedAnswers === null || typeof processedAnswers === 'string') {
            errorCount++;
          } else {
            license.processedAnswers = processedAnswers;
            processedLicenseList[licenseId] = license;
            successCount++;
            writeJsonFile(PROCESSED_LICENSES_PATH, processedLicenseList);
          }
        }

        pBar.increment(1, {
          licenseId: licenseId,
          errorCount: errorCount,
          successCount: successCount,
        });
      });
    }

    await licenseQueue.waitForAll();
  } finally {
    // Stopping the bar restores the cursor hidden by `hideCursor`.
    pBar.stop();
  }
}

// Script entry point: download the licenses. GPT processing is left disabled
// and can be enabled by uncommenting the call below.
(async function loadAndProcessLicenses() {
  await loadLicenses();
  // await processLicenses();
})().catch((err: unknown) => {
  console.error('License pipeline failed - ', err);
  process.exitCode = 1;
});