Skip to content

Instantly share code, notes, and snippets.

@pietrop
Last active December 5, 2019 21:04
Show Gist options
  • Save pietrop/a6a4d23e9b83b539cad74e34360e2935 to your computer and use it in GitHub Desktop.
Save pietrop/a6a4d23e9b83b539cad74e34360e2935 to your computer and use it in GitHub Desktop.
Copy and paste this script into the browser console to scrape C-SPAN captions from a video page.
/**
* Scrapes caption text from the C-SPAN website.
*
* Example page
* https://www.c-span.org/video/?465757-1/federal-reserve-chair-powell-announces-interest-rate-cut-signals-pause
*/
// possible function to clean up text
// https://github.com/voxmedia/c-span_opened_captions_server/blob/master/index.js#L61
/**
 * Cleans up raw, caption-style transcript text so it reads like prose.
 *
 * The steps run in this order: lower-case everything, replace CRLF line
 * breaks with spaces, convert music-note markers, tighten spacing before
 * punctuation, capitalise honorifics and the name that follows them,
 * capitalise the first letter of each sentence, and put `>>` speaker changes
 * and "name:" speaker prompts on their own paragraphs.
 *
 * The upstream implementation this was adapted from also collapsed redundant
 * spaces and applied a word-substitution list; neither helper is available in
 * this script, so those steps are not performed.
 *
 * @param {string} str - Raw transcript text.
 * @returns {string} The cleaned-up text.
 */
function formatText(str) {
  // Local replacement for the external `s.capitalize` helper, which is not
  // defined in this script: upper-case the first character only.
  const capitalize = (word) => word.charAt(0).toUpperCase() + word.slice(1);

  return str
    .toLowerCase()
    // Remove random line breaks. A regex with the `g` flag is required here:
    // a string pattern would only replace the first occurrence.
    .replace(/\r\n/g, ' ')
    // Music notes. `b\x19*` looks like a UTF-8 music note whose bytes lost
    // their high bit in the caption feed (assumption, not confirmed).
    .replace(/\s+b\x19\*\s+/g, '\n\n🎵\n\n')
    // Remove blank space before punctuation.
    .replace(/\s+(!|\?|;|:|,|\.|')/g, '$1')
    // Handle honorifics: capitalise the title and the first letter of the
    // word that follows it.
    .replace(
      / (sen\.?|rep\.?|mr\.?|mrs\.?|ms\.?|dr\.?) (\w)/gi,
      (match, title, initial) => ` ${capitalize(title)} ${initial.toUpperCase()}`
    )
    // Capitalise the first letter of sentences (also normalises the gap
    // after the terminator to a single space).
    .replace(
      /(!|\?|:|\.|>>)\s+(\w)/g,
      (match, terminator, letter) => `${terminator} ${letter.toUpperCase()}`
    )
    // `>>` seems to be used instead of repeating speaker prompts in back and
    // forths, so start a new paragraph at each one.
    .replace(/\s*>>\s*/g, '\n\n>> ')
    // Put speaker prompts ("name:") on new lines.
    .replace(/(\.|"|!|\?|—)\s*([a-zA-Z. ]{2,30}:)/g, '$1\n\n$2');
}
// Expand every collapsed transcript section by clicking its "show more" link.
for (const showMoreLink of document.querySelectorAll('.hidden-full-transcript-link')) {
  showMoreLink.click();
}

// Collect the visible text of each transcript snippet, in document order.
const result = Array.from(
  document.querySelectorAll('.short_transcript'),
  (snippet) => snippet.innerText
);

// One snippet per line, then tidy the combined text and print it.
const resultString = result.join('\n');
const resultCleaned = formatText(resultString);
console.log(resultCleaned);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment