Last active
June 18, 2022 15:19
-
-
Save twobob/e772c3b207fc8cfca5c2e4008aefceb0 to your computer and use it in GitHub Desktop.
A Python script that converts .mp4 files to .wav, runs speech recognition on the audio to extract the spoken words, and writes a "frequency.txt" summary plus a per-file word list and word-distribution report, each as a .txt file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The wildcard import stays first so that every explicit import below takes
# precedence over any same-named symbol it happens to re-export.
from speech_recognition import *

import contextlib
import glob
import os
import subprocess
import time
import wave
from os import walk, path

import nltk
from pydub import AudioSegment
from pydub.silence import split_on_silence
def split(filepath):
    """Load the audio at *filepath* and cut it into chunks at silent passages.

    Returns whatever ``split_on_silence`` produces for the loaded segment.
    """
    audio = AudioSegment.from_file(filepath)
    # The silence threshold is relative: 16 below the clip's own dBFS level.
    quiet_level = audio.dBFS - 16
    return split_on_silence(
        audio,
        min_silence_len=500,
        silence_thresh=quiet_level,
        keep_silence=250,  # optional
    )
def recognize(audio_file):
    """Transcribe *audio_file* with ``Recognizer.recognize_google``.

    Args:
        audio_file: Path of an audio file that ``AudioFile`` can open.

    Returns:
        The recognised text, or the literal string ``"No speech detected"``
        when recognition raises any ordinary exception.
    """
    r = Recognizer()
    with AudioFile(audio_file) as source:
        audio = r.record(source)
    try:
        return r.recognize_google(audio)
    except Exception:
        # Recognition stays best-effort, but catching Exception instead of a
        # bare ``except:`` lets KeyboardInterrupt / SystemExit propagate, so
        # Ctrl-C during a long run is no longer reported as "no speech".
        return "No speech detected"
def recognize_chunks(chunk):
    """Transcribe one in-memory audio chunk.

    The chunk is padded with silence, exported to the scratch file
    ``.//tempchunk.wav`` and then transcribed by ``recognize`` so that both
    code paths share a single recognition/error-handling implementation.

    Args:
        chunk: An audio segment (as produced by ``split``).

    Returns:
        The recognised text, or ``"No speech detected"`` when recognition
        fails (see ``recognize``).
    """
    # Create a silence chunk that's 0.5 seconds (or 500 ms) long for padding.
    silence_chunk = AudioSegment.silent(duration=500)
    # Add the padding chunk to beginning and end of the entire chunk.
    audio_chunk = silence_chunk + chunk + silence_chunk
    # Export the padded chunk to the scratch wav file.
    audio_chunk.export(
        ".//tempchunk.wav",
        bitrate="44100",
        format="wav",
    )
    return recognize(".//tempchunk.wav")
# Sentinel string returned by recognize()/recognize_chunks() on failure.
_NO_SPEECH = "No speech detected"


def _strip_suffix(name, suffix):
    """Return *name* without a trailing *suffix*; the rest is left untouched."""
    return name[:-len(suffix)] if name.endswith(suffix) else name


def _convert_videos(root_dir, wav_suffix):
    """Run ffmpeg on every .mp4 under *root_dir* that has no .wav sibling yet."""
    for video in glob.iglob(root_dir + "**/*.mp4", recursive=True):
        # splitext only touches the extension, unlike str.replace which would
        # also rewrite ".mp4" occurring inside a directory name.
        wav_name = os.path.splitext(video)[0] + wav_suffix
        if os.path.exists(wav_name):
            continue
        status = subprocess.call(['ffmpeg', '-i', video, '-acodec',
                                  'pcm_s16le', '-ac', '1', '-ar', '16000', wav_name])
        if status != 0:
            print(video, "ffmpeg failed with exit code", status)


def _wav_duration(filename):
    """Return the clip length in seconds, cached in ``<filename>.time.txt``."""
    cache = filename + ".time.txt"
    if os.path.exists(cache):
        with open(cache, "r") as fh:
            time_in_file = fh.readline()
        print(time_in_file)
        try:
            return float(time_in_file)
        except ValueError:
            pass  # empty or corrupt cache: fall through and measure again
    with contextlib.closing(wave.open(filename, 'r')) as wav:
        duration = wav.getnframes() / float(wav.getframerate())
    with open(cache, "w") as out:
        out.write(str(duration) + '\n')
    return duration


def _read_cached_words(word_file):
    """Read a one-word-per-line transcript back as a single-space-joined string."""
    with open(word_file, "r") as fh:
        lines = fh.readlines()
    print('Total Lines', len(lines))
    # Normalising whitespace makes a cached transcript identical to a fresh
    # one, so word counts and the no-speech sentinel behave the same on re-runs.
    return " ".join(" ".join(lines).split())


def _transcribe(filename, duration, target_length):
    """Transcribe *filename*, chunking on silence when it is longer than 120 s."""
    if duration <= 120:
        return recognize(filename)
    print("total words so far in array from chunks: ", end='')
    chunks = split(filename)
    if not chunks:
        return ""  # nothing but silence was found
    # Merge neighbouring chunks until each is at least target_length long.
    merged = [chunks[0]]
    for chunk in chunks[1:]:
        if len(merged[-1]) < target_length:
            merged[-1] += chunk
        else:
            merged.append(chunk)
    pieces = []
    spoken = 0
    for chunk in merged:
        text = recognize_chunks(chunk)
        if text != _NO_SPEECH:
            pieces.append(text)
            spoken += len(text.split())
            print(str(spoken), end=" ")
    # Join with a space so words at chunk boundaries are not fused together.
    return " ".join(pieces)


def _write_word_list(word_file, tokens):
    """Write *tokens* to *word_file*, one word per line."""
    with open(word_file, "w") as out:
        out.writelines(token + ' \n' for token in tokens)


def _write_distribution(dist_file, filename, tokens):
    """Write the 150 most frequent words of *tokens* to *dist_file*."""
    # Build the distribution before opening the file so a failure here does
    # not leave an empty report behind that would block regeneration.
    fdist = nltk.FreqDist(tokens)
    with open(dist_file, "w") as out:
        out.write("word frequency report for " + _strip_suffix(filename, ".wav") + "\n\n")
        for word, count in fdist.most_common(150):
            out.write(word.ljust(20) + str(count) + " \n")


def _frequency_line(filename, duration, words):
    """Format the summary line (length, word total, words per minute) for one clip."""
    total = len(words.split())
    mins, secs = divmod(int(duration), 60)
    # Rate is computed from the full duration, so clips under a minute no
    # longer divide by zero.
    per_min = round(total * 60 / duration, 2) if duration > 0 else 0.0
    return "{}: mins:{} secs:{} total words:{} words per min:{}".format(
        _strip_suffix(filename, ".wav").ljust(90), str(mins).ljust(2),
        str(secs).ljust(2), str(total).ljust(2), per_min)


def main():
    """Transcribe every clip under the current directory and write reports.

    Steps, all relative to ``./``:
      1. Convert each ``.mp4`` without a ``.wav`` sibling using ffmpeg.
      2. For each ``.wav`` (except the ``tempchunk`` scratch file) obtain its
         duration and transcript, reusing ``<wav>.time.txt`` / ``<wav>.txt``
         when they already exist, and write ``<wav>.distribution.txt`` if it
         is missing.
      3. Write one summary line per clip to ``frequency.txt``.
      4. Remove ``tempchunk.wav`` and print the elapsed time.
    """
    start = time.time()
    root_dir = "./"
    wav_suffix = ".wav"
    # we need to split files that are longer than like 2 minutes
    # minimum chunk length
    target_length = 25 * 1000  # 25 seconds
    all_words = {}
    durations = {}

    _convert_videos(root_dir, wav_suffix)

    for filename in glob.iglob(root_dir + "**/*.wav", recursive=True):
        if not filename.endswith('.wav') or "tempchunk" in filename:
            print(filename, 'skipped')
            continue
        duration = _wav_duration(filename)
        durations[filename] = duration
        print(filename, duration, "total time")
        print(filename, 'processing')

        word_file = filename + ".txt"
        if os.path.exists(word_file):
            words = _read_cached_words(word_file)
        else:
            words = _transcribe(filename, duration, target_length)
            _write_word_list(word_file, words.split())
        all_words[filename] = words

        dist_file = filename + ".distribution.txt"
        if not os.path.exists(dist_file):
            _write_distribution(dist_file, filename, words.split())

    frequency = []
    for key, value in all_words.items():
        if value != _NO_SPEECH:
            frequency.append(_frequency_line(key, durations[key], value))
        else:
            print("No speech detected in {}".format(key))

    with open("frequency.txt", "w") as o:
        for line in frequency:
            print(line + '\n')
            o.write(line + '\n')

    if os.path.exists("tempchunk.wav"):
        os.remove("tempchunk.wav")
    print(time.time() - start, "Total execution time")
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thrown together for fun to analyse the words-per-minute rate of the excellent videos by James Sharman.
https://twitter.com/WeirdBoyJim
https://www.youtube.com/watch?v=3iHag4k4yEg&list=PLFhc0MFC8MiCDOh3cGFji3qQfXziB9yOw