Skip to content

Instantly share code, notes, and snippets.

@twobob
Last active June 18, 2022 15:19
Show Gist options
  • Save twobob/e772c3b207fc8cfca5c2e4008aefceb0 to your computer and use it in GitHub Desktop.
Save twobob/e772c3b207fc8cfca5c2e4008aefceb0 to your computer and use it in GitHub Desktop.
A Python script that converts each .mp4 to .wav, runs speech recognition on the audio to extract the spoken words, and writes a "frequency.txt" words-per-minute summary plus a transcript and a word-distribution report for each file as .txt files.
# The wildcard import stays first so that every explicit import below wins
# any name clash with whatever speech_recognition happens to expose.
from speech_recognition import *

import contextlib
import glob
import os
import subprocess
import sys
import time
import wave
from os import walk, path

import nltk
from pydub import AudioSegment
from pydub.silence import split_on_silence
def split(filepath):
    """Load the audio at *filepath* and cut it wherever the speaker pauses.

    The recording is handed to pydub's ``split_on_silence`` with a minimum
    silence length of 500, a silence threshold 16 below the recording's own
    ``dBFS`` value, and 250 of silence kept around each piece.

    Returns whatever ``split_on_silence`` returns (the audio chunks).
    """
    recording = AudioSegment.from_file(filepath)
    # The threshold is relative to this recording's loudness, not absolute.
    quiet_threshold = recording.dBFS - 16
    return split_on_silence(
        recording,
        min_silence_len=500,
        silence_thresh=quiet_threshold,
        keep_silence=250,  # optional
    )
def recognize(audio_file):
    """Transcribe an entire audio file with Google speech recognition.

    Args:
        audio_file: Passed straight to ``AudioFile``; callers in this file
            supply a path to a .wav file.

    Returns:
        The recognized text, or the literal string ``"No speech detected"``
        when no transcript could be produced. Callers compare against that
        exact string, so it must not change.
    """
    recognizer = Recognizer()
    with AudioFile(audio_file) as source:
        audio = recognizer.record(source)
    try:
        return recognizer.recognize_google(audio)
    except UnknownValueError:
        # The service understood the request but heard no intelligible speech.
        return "No speech detected"
    except Exception as err:
        # Stay best-effort (one failed request must not abort a long batch),
        # but say why on stderr instead of silently claiming silence. Unlike
        # a bare ``except:``, this lets Ctrl+C / SystemExit stop the script.
        print(
            "Speech recognition failed for {}: {}".format(audio_file, err),
            file=sys.stderr,
        )
        return "No speech detected"
def recognize_chunks(chunk):
    """Transcribe a single in-memory audio chunk.

    The chunk is padded with silence on both sides, written to the scratch
    file ``.//tempchunk.wav`` in the current directory, and that file is then
    transcribed with ``recognize``.

    Args:
        chunk: An audio segment that can be concatenated with
            ``AudioSegment.silent(...)`` (the chunks produced by ``split``).

    Returns:
        The recognized text, or ``"No speech detected"`` when ``recognize``
        could not produce a transcript.
    """
    temp_path = ".//tempchunk.wav"
    # Create a silence chunk that's 0.5 seconds (or 500 ms) long for padding.
    silence_chunk = AudioSegment.silent(duration=500)
    # Add the padding chunk to beginning and end of the entire chunk.
    audio_chunk = silence_chunk + chunk + silence_chunk
    # export() returns the open output file; close it right away so the
    # handle is not leaked and the data is flushed before it is read back.
    # (No bitrate is passed: it has no effect on a plain WAV export.)
    audio_chunk.export(temp_path, format="wav").close()
    # Reuse the whole-file recognizer rather than duplicating its logic.
    return recognize(temp_path)
# Sentinel that recognize()/recognize_chunks() return when nothing was heard.
_NO_SPEECH = "No speech detected"
# Recordings longer than this many seconds are transcribed chunk by chunk.
_CHUNKING_THRESHOLD_SECONDS = 120
# Minimum chunk length (milliseconds) before a new chunk is started.
_TARGET_CHUNK_MS = 25 * 1000  # 25 seconds


def _convert_videos(root_dir, wav_suffix):
    """Run ffmpeg to create a 16 kHz mono PCM .wav beside each .mp4 lacking one."""
    for video in glob.iglob(root_dir + "**/*.mp4", recursive=True):
        # splitext strips only the extension; str.replace would also remove
        # any ".mp4" that appears earlier in the path.
        target = path.splitext(video)[0] + wav_suffix
        if not path.exists(target):
            subprocess.call(['ffmpeg', '-i', video, '-acodec', 'pcm_s16le',
                             '-ac', '1', '-ar', '16000', target])


def _wav_duration(filename):
    """Return the length of a .wav in seconds, cached in ``<filename>.time.txt``."""
    cache = filename + ".time.txt"
    if path.exists(cache):
        with open(cache, "r") as fh:
            time_in_file = fh.readline()
        try:
            duration = float(time_in_file)
        except ValueError:
            pass  # unreadable cache: fall through and measure the file again
        else:
            print(time_in_file)
            return duration
    with contextlib.closing(wave.open(filename, 'r')) as wav:
        duration = wav.getnframes() / float(wav.getframerate())
    with open(cache, "w") as out:
        out.write(str(duration) + '\n')
    return duration


def _merge_chunks(chunks, target_length):
    """Join consecutive chunks until each merged chunk reaches target_length."""
    merged = []
    for chunk in chunks:
        if merged and len(merged[-1]) < target_length:
            merged[-1] += chunk
        else:
            merged.append(chunk)
    return merged


def _transcribe(filename, duration):
    """Return the words spoken in filename as one space-separated string.

    A transcript cached in ``<filename>.txt`` is reused when present.
    Otherwise long recordings are split on silence and recognized chunk by
    chunk, and short ones are recognized in a single request.
    """
    cache = filename + ".txt"
    if path.exists(cache):
        with open(cache, 'r') as fh:
            lines = fh.readlines()
        print('Total Lines', len(lines))
        # Normalize whitespace so cached and fresh transcripts look the same
        # (the cache stores one "word \n" per line).
        return " ".join(" ".join(lines).split())

    if duration <= _CHUNKING_THRESHOLD_SECONDS:
        return recognize(filename)

    print("total words so far in array from chunks: ", end='')
    pieces = []
    running_total = 0
    # _merge_chunks copes with an empty list (a recording that is all silence).
    for chunk in _merge_chunks(split(filename), _TARGET_CHUNK_MS):
        text = recognize_chunks(chunk)
        if text != _NO_SPEECH:
            pieces.append(text)
            running_total += len(text.split())
        print(running_total, end=" ")
    print()
    # Join with a space so the last word of one chunk is not fused with the
    # first word of the next.
    return " ".join(pieces)


def _write_reports(filename, words):
    """Write the transcript and word-frequency report unless they already exist."""
    tokens = words.split()

    transcript = filename + ".txt"
    if not path.exists(transcript):
        with open(transcript, "w") as out:
            out.writelines(token + ' \n' for token in tokens)

    distribution = filename + ".distribution.txt"
    if not path.exists(distribution):
        with open(distribution, "w") as out:
            # create a frequency distribution
            fdist = nltk.FreqDist(tokens)
            # Drop only the trailing "wav" (the heading keeps its final dot);
            # str.replace would also hit "wav" elsewhere in the path.
            out.write("word frequency report for " + filename[:-len("wav")] + "\n\n")
            # report the top 150 most spoken words
            for word, count in fdist.most_common(150):
                out.write(word.ljust(20) + str(count) + " \n")


def _summary_line(name, duration, words):
    """Format one frequency.txt line: length, word total and words per minute."""
    word_count = len(words.split())
    mins, secs = divmod(duration, 60)
    # Use the full duration so clips shorter than a minute do not divide by 0.
    per_minute = word_count * 60 / duration if duration > 0 else 0.0
    return "{}: mins:{} secs:{} total words:{} words per min:{}".format(
        path.splitext(name)[0].ljust(90), str(int(mins)).ljust(2),
        str(int(secs)).ljust(2), str(word_count).ljust(2), per_minute)


def main():
    """Transcribe every recording under the current directory and report on it.

    Steps:
      1. Convert each .mp4 without a matching .wav using ffmpeg.
      2. For each .wav (except the tempchunk scratch file) obtain its duration
         and transcript, reusing the ``.time.txt`` / ``.txt`` caches if present,
         and write the per-file transcript and distribution reports.
      3. Write one summary line per file with detected speech to
         ``frequency.txt`` and echo it to stdout.
      4. Remove the ``tempchunk.wav`` scratch file and print the elapsed time.
    """
    start = time.time()
    root_dir = "./"
    wav_suffix = ".wav"
    all_words = {}
    durations = {}

    _convert_videos(root_dir, wav_suffix)

    for filename in glob.iglob(root_dir + "**/*" + wav_suffix, recursive=True):
        if "tempchunk" in filename:
            print(filename, 'skipped')
            continue
        duration = _wav_duration(filename)
        durations[filename] = duration
        print(filename, duration, "total time")
        print(filename, 'processing')
        words = _transcribe(filename, duration)
        all_words[filename] = words
        _write_reports(filename, words)

    frequency = []
    for key, value in all_words.items():
        if value != _NO_SPEECH:
            frequency.append(_summary_line(key, durations[key], value))
        else:
            print("No speech detected in {}".format(key))

    with open("frequency.txt", "w") as out:
        for line in frequency:
            print(line + '\n')
            out.write(line + '\n')

    if os.path.exists("tempchunk.wav"):
        os.remove("tempchunk.wav")
    print(time.time() - start, "Total execution time")
# Run the whole pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
@twobob
Copy link
Author

twobob commented Jun 18, 2022

Thrown together for fun to analyse the words per minute of the excellent videos by James Sharman
https://twitter.com/WeirdBoyJim
https://www.youtube.com/watch?v=3iHag4k4yEg&list=PLFhc0MFC8MiCDOh3cGFji3qQfXziB9yOw

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment