Skip to content

Instantly share code, notes, and snippets.

@ivarprudnikov
Created January 4, 2025 19:31
Show Gist options
  • Save ivarprudnikov/081f5a6fb7a041ecb323a792d9f7ab88 to your computer and use it in GitHub Desktop.
Save ivarprudnikov/081f5a6fb7a041ecb323a792d9f7ab88 to your computer and use it in GitHub Desktop.
Summarise MS Stream transcripts after downloading them from the website as JSON files.

Lecture transcripts

I got the transcripts from the MS Stream website by inspecting the requests in the browser's network tab. Filter the network tab to list the paths that contain "transcript" and select the largest response. That response contains the transcript JSON, which was copied to this directory as transcript.MMMDD.json.

Convert JSON to Markdown

Make sure Node.js is on your PATH, then run the script:

node convertTranscript.js

The script will sort the entries in each transcript file and write the content to a Markdown file. The output will be in the form of transcript.MMMDD.json.md.

Summarise the transcripts using OpenAI

You will need to install the dependencies first:

npm install

Then copy your OpenAI API key to the .env file so that it looks like this:

OPENAI_API_KEY=your_api_key_here

Then run the script:

node --env-file=.env summariseTranscript.js
/**
* This Node.js script reads each transcript file downloaded from MS Stream,
* extracts the text of every entry in the sorted JSON `entries` array,
* and writes those lines to the equivalent Markdown file.
*/
const fs = require('fs');
const path = require('path');
// Transcripts are read from, and Markdown is written to, the directory containing this script.
const directoryPath = __dirname;

/**
 * Converts a transcript offset into a number of seconds.
 *
 * Offsets in the transcript JSON are strings such as "00:00:28.4578205"
 * (colon-separated, most significant unit first). Subtracting two such strings
 * yields NaN, so they must be converted to numbers before they can be compared.
 *
 * @param {string|number} offset - Offset string, or a number that is returned as is.
 * @returns {number} Offset in seconds, or NaN when the value cannot be parsed.
 */
function offsetToSeconds(offset) {
  if (typeof offset === 'number') {
    return offset;
  }
  return String(offset)
    .split(':')
    .reduce((total, part) => total * 60 + Number(part), 0);
}

/**
 * Sort comparator ordering transcript entries by ascending `startOffset`.
 *
 * @param {{startOffset: string|number}} a
 * @param {{startOffset: string|number}} b
 * @returns {number} Negative, zero or positive; zero when either offset is unparseable.
 */
function compareByStartOffset(a, b) {
  const difference = offsetToSeconds(a.startOffset) - offsetToSeconds(b.startOffset);
  return Number.isNaN(difference) ? 0 : difference;
}

/**
 * Renders transcript entries as Markdown: one line of text per entry, with a
 * bold "<speaker> is saying:" heading whenever the speaker changes.
 *
 * @param {Array<{startOffset: string|number, speakerDisplayName: string, text: string}>} entries
 * @returns {string} Markdown text for the whole transcript.
 */
function entriesToMarkdown(entries) {
  let markdown = '';
  let currentSpeaker = null;
  // Sort a copy so the parsed transcript object is left untouched.
  for (const entry of [...entries].sort(compareByStartOffset)) {
    if (entry.speakerDisplayName !== currentSpeaker) {
      markdown += `\n\n**${entry.speakerDisplayName} is saying:**\n\n`;
      currentSpeaker = entry.speakerDisplayName;
    }
    markdown += `${entry.text}\n`;
  }
  return markdown;
}

/**
 * Reads one transcript JSON file and writes its Markdown rendering next to it
 * as `<file>.md`. Failures are logged and do not stop other files from being
 * processed.
 *
 * @param {string} file - File name (not a path) inside `directoryPath`.
 */
function convertTranscriptFile(file) {
  fs.readFile(path.join(directoryPath, file), 'utf8', (readErr, data) => {
    if (readErr) {
      console.log('Error reading file:', readErr);
      return;
    }

    let markdownTranscript;
    try {
      const transcript = JSON.parse(data);
      if (!transcript || !Array.isArray(transcript.entries)) {
        throw new TypeError(`${file} does not contain an "entries" array`);
      }
      markdownTranscript = entriesToMarkdown(transcript.entries);
    } catch (parseErr) {
      console.log('Error parsing JSON:', parseErr);
      return;
    }

    const outName = `${file}.md`;
    fs.writeFile(path.join(directoryPath, outName), markdownTranscript, (writeErr) => {
      if (writeErr) {
        console.log('Error writing file:', writeErr);
      } else {
        console.log(`Successfully wrote transcript to ${outName}`);
      }
    });
  });
}

// Convert every `transcript*.json` file in the script directory; report the rest as skipped.
fs.readdir(directoryPath, (err, files) => {
  if (err) {
    console.log(`Unable to scan directory: ${err}`);
    return;
  }
  for (const file of files) {
    if (path.extname(file) === '.json' && file.startsWith('transcript')) {
      convertTranscriptFile(file);
    } else {
      console.log(`Skipping file: ${file}`);
    }
  }
});
{
"name": "transcripts",
"version": "1.0.0",
"description": "",
"main": "",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "",
"license": "ISC",
"dependencies": {
"openai": "^4.77.3"
}
}
/**
* Uses the openai module to send the text of each Markdown transcript to the
* OpenAI chat completions API and writes the returned summary to a new file.
*/
const OpenAI = require('openai');
const fs = require('fs');
const path = require('path');
// Transcripts are read from, and summaries written to, the directory containing this script.
// Declared with `const` so it is not created as an implicit global.
const directoryPath = __dirname;
// The API key comes from the environment, e.g. loaded via `node --env-file=.env`.
const client = new OpenAI({
  apiKey: process.env['OPENAI_API_KEY'],
});
/**
 * Asks the OpenAI chat completions API for a Markdown summary of one transcript.
 *
 * @param {string} fileName - Name of a transcript file located next to this script.
 * @returns {Promise<string>} Content of the first choice returned by the model.
 */
async function summarizeTranscript(fileName) {
  const transcriptPath = path.join(__dirname, fileName);
  const transcriptText = await fs.promises.readFile(transcriptPath, 'utf8');

  // The whole transcript is embedded in a single user message after the instructions.
  const prompt = `Provide a detailed summary of the following lecture transcript and make sure to cover each topic of the course content, no topic should be omitted from the summary, present the summary in markdown with appropriate headings:\n\n${transcriptText}`;

  const response = await client.chat.completions.create({
    messages: [{ role: 'user', content: prompt }],
    model: 'gpt-4o',
  });

  const { message } = response.choices[0];
  return message.content;
}
/**
 * Summarises every `transcript*.md` file in the script directory, one at a
 * time, writing each result to `summarised.<original file name>`.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const dirEntries = await fs.promises.readdir(directoryPath);
  const isTranscriptMarkdown = (name) =>
    name.startsWith('transcript') && path.extname(name) === '.md';

  for (const name of dirEntries.filter(isTranscriptMarkdown)) {
    console.log(`Summarising file: ${name}`);
    const summary = await summarizeTranscript(name);
    const targetPath = path.join(directoryPath, `summarised.${name}`);
    await fs.promises.writeFile(targetPath, summary);
  }

  console.log('Summarization complete.');
}
// Handle a failed run (e.g. a file-system or API error) explicitly rather than
// leaving the promise rejection unhandled, and signal it through the exit code.
main().catch((err) => {
  console.error('Summarisation failed:', err);
  process.exitCode = 1;
});
{
"$schema": "http://stream.office.com/schemas/transcript.json",
"version": "1.0.0",
"type": "Transcript",
"entries": [
{
"id": "065b775a-afc5-4027-add5-a82c0aa38efb/75",
"speechServiceResultId": "ca3f3e56f82e406781c817e5fecb35e3",
"text": "Hello world",
"speakerId": "3861413c-a8a0-45df-9173-4444e243011@6edb49c1-bf72-4eea-8b3f-a7fd0a25b68c",
"speakerDisplayName": "John Doe",
"confidence": 0.543203,
"startOffset": "00:00:28.4578205",
"endOffset": "00:00:30.0178205",
"hasBeenEdited": false,
"roomId": null,
"spokenLanguageTag": "en-gb"
},
{
"id": "065b775a-afc5-4027-add5-a82c0aa38efb/132",
"speechServiceResultId": "d0efe07cc98f4a57b1fb190d39c14520",
"text": "I want to introduce you to some content here.",
"speakerId": "3861413c-a8a0-45df-9173-4444e243011@6edb49c1-bf72-4eea-8b3f-a7fd0a25b68c",
"speakerDisplayName": "John Doe",
"confidence": 0.5790199,
"startOffset": "00:00:30.9278205",
"endOffset": "00:00:56.0078205",
"hasBeenEdited": false,
"roomId": null,
"spokenLanguageTag": "en-gb"
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment