Last active
December 5, 2019 21:04
-
-
Save pietrop/a6a4d23e9b83b539cad74e34360e2935 to your computer and use it in GitHub Desktop.
Copy and paste this script into the browser console to scrape C-SPAN captions from a video page.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Scrapes the caption text from a video page on the C-SPAN website.
 *
 * Example page:
 * https://www.c-span.org/video/?465757-1/federal-reserve-chair-powell-announces-interest-rate-cut-signals-pause
 */
// Possible function to clean up the caption text, adapted from:
// https://github.com/voxmedia/c-span_opened_captions_server/blob/master/index.js#L61
/**
 * Normalise raw caption text into a more readable transcript.
 *
 * The text is lower-cased first and then selectively re-capitalised:
 * honorifics (Sen., Rep., Mr., Mrs., Ms., Dr.) plus the first letter of the
 * following word, and the first letter after sentence-ending punctuation,
 * a colon or a `>>` marker. `>>` markers and speaker prompts ("Name:") are
 * moved onto their own paragraphs.
 *
 * NOTE: the upstream implementation this was adapted from also applied a
 * word-replacement list and collapsed redundant spaces at this point. Both
 * steps are omitted because their helpers are not available in this script.
 *
 * @param {string} str - Raw caption text.
 * @returns {string} The cleaned-up text.
 */
function formatText(str) {
  // Local replacement for the external `s.capitalize` helper the original
  // code relied on; `s` is never defined in this script, so calling it threw
  // a ReferenceError as soon as an honorific was found.
  const capitalize = (word) => `${word.charAt(0).toUpperCase()}${word.slice(1)}`;

  return str
    .toLowerCase()
    // Remove random line breaks. A string pattern only replaces the first
    // occurrence, so a global regex is required to catch all of them.
    .replace(/\r\n/g, ' ')
    // Music notes (every occurrence, not just the first).
    .replace(/\s+b\x19\*\s+/g, '\n\n🎵\n\n')
    // Remove blank space before punctuation.
    .replace(/\s+(!|\?|;|:|,|\.|')/g, '$1')
    // Handle honorifics: capitalise the title and the initial of the next word.
    .replace(
      / (sen\.?|rep\.?|mr\.?|mrs\.?|ms\.?|dr\.?) (\w)/gi,
      (match, title, initial) => ` ${capitalize(title)} ${initial.toUpperCase()}`
    )
    // Capitalise the first letter of sentences. Note that the whitespace run
    // after the punctuation is collapsed to a single space.
    .replace(
      /(!|\?|:|\.|>>)\s+(\w)/g,
      (match, punctuation, initial) => `${punctuation} ${initial.toUpperCase()}`
    )
    // ">>" seems to be used instead of repeating speaker prompts in back and forths.
    .replace(/\s*>>\s*/g, '\n\n>> ')
    // Put speaker prompts on new lines.
    .replace(/(\.|"|!|\?|—)\s*([a-zA-Z. ]{2,30}:)/g, '$1\n\n$2');
}
// Click every "show more" link so the hidden transcript text is expanded.
for (const link of document.querySelectorAll('.hidden-full-transcript-link')) {
  link.click();
}

// Collect the visible text of each transcript element, in document order.
const result = Array.from(
  document.querySelectorAll('.short_transcript'),
  (paragraph) => paragraph.innerText
);

// Join the pieces one per line, clean the text up and print it to the console.
const resultString = result.join('\n');
const resultCleaned = formatText(resultString);
console.log(resultCleaned);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment