from pydub import AudioSegment

mp3_audio = AudioSegment.from_file(r"audio_full.wav", format="wav")
print(len(mp3_audio) / (1000 * 60))  # length of the recording in minutes

# A 12-minute recording is broken into four 3-minute files (slicing is done by milliseconds).
counter_audio = 180
split_audio = [mp3_audio[:180 * 1000]]
for i in range(1, 4):
    split_audio.append(mp3_audio[counter_audio * 1000:(counter_audio + 180) * 1000])
    counter_audio += 180
for i, segment in enumerate(split_audio):
    segment.export(f"audio_part_{i + 1}.wav", format="wav")

# A python package for music and audio analysis.
# https://librosa.org/doc/latest/index.html
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

# load model and tokenizer
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

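The preview above stops after the model is loaded. A minimal sketch of the transcription step that typically follows, reusing the `tokenizer` and `model` objects from the block above; the file name `audio_part_1.wav` is an assumed placeholder for one of the 3-minute slices produced earlier, not a path taken from the original code.

# Read one audio slice at the 16 kHz sampling rate wav2vec2 expects
# ("audio_part_1.wav" is an assumed placeholder path).
speech, rate = librosa.load("audio_part_1.wav", sr=16000)

# Turn the waveform into model inputs, run the forward pass, and decode
# the most likely token at each time step into text.
input_values = tokenizer(speech, return_tensors="pt").input_values
logits = model(input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
transcription = tokenizer.batch_decode(predicted_ids)[0]
print(transcription)
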
>>> import praw
>>> import pandas as pd
>>> from sklearn.cluster import KMeans
>>> from sklearn.feature_extraction.text import TfidfVectorizer
>>> import random
>>> import numpy as np
>>> from transformers import RobertaTokenizer
>>> roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
>>> reddit = praw.Reddit(client_id='client id',
...                      client_secret='client secret',
...                      user_agent='user agent')

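The preview cuts off before the scraping and clustering steps that the imports above (pandas, TfidfVectorizer, KMeans) point toward. A minimal sketch of that pipeline under assumed parameters: the subreddit name, post limit, and number of clusters are placeholders, not values from the original notebook, and the RoBERTa tokenizer presumably feeds a later step that is not shown here.

# Pull recent post titles from a subreddit (name and limit are assumptions).
titles = [submission.title for submission in reddit.subreddit('MachineLearning').hot(limit=100)]
posts_df = pd.DataFrame({'title': titles})

# Vectorize the titles with TF-IDF and group them with KMeans.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(posts_df['title'])
kmeans = KMeans(n_clusters=5, random_state=42)
posts_df['cluster'] = kmeans.fit_predict(tfidf_matrix)
print(posts_df['cluster'].value_counts())
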
>>> import praw
>>> reddit = praw.Reddit(client_id='client id', #1
...                      client_secret='client secret',
...                      user_agent='user agent')
Version 7.1.0 of praw is outdated. Version 7.2.0 was released Wednesday February 24, 2021.
>>> def replies_of(top_level_comment, comment_list): #2
...     if len(top_level_comment.replies) == 0:
...         return
...     else:
...         for reply in top_level_comment.replies:
...             comment_list.append(reply.body)
...             replies_of(reply, comment_list)

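A sketch of how `replies_of` might be driven to flatten whole comment trees into one list; the subreddit name and submission limit are placeholders, and `replace_more` is called first so that every nested reply is loaded before recursing.

comment_list = []
for submission in reddit.subreddit('MachineLearning').hot(limit=10):
    # Expand "MoreComments" placeholders so the full reply tree is available.
    submission.comments.replace_more(limit=None)
    for top_level_comment in submission.comments:
        comment_list.append(top_level_comment.body)
        replies_of(top_level_comment, comment_list)
print(len(comment_list))
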
>>> text_data_test = "Life is what life happens when you're busy life making other life plans."
>>> index_list = []
>>> flag = 0
>>> count = 0
>>> word_length = len('life')
>>> while 'life' in text_data_test.lower():
...     return_index = text_data_test.lower().find('life')
...     print(return_index)
...     if return_index == -1:
...         break
...     index_list.append(return_index + flag)  # index of this match in the original string
...     flag += return_index + word_length       # offset consumed so far
...     count += 1
...     text_data_test = text_data_test[return_index + word_length:]
0
9
26
14
>>> print(index_list)
[0, 13, 43, 61]

>>> from nltk.tokenize import word_tokenize
>>> text_data = "Life is what happens when you're busy making other plans."
>>> duplicate_data = "what happens when you're busy"
>>> original_tokens = word_tokenize(text_data)
>>> duplicate_tokens = word_tokenize(duplicate_data)
>>> # Convert all the characters to lower case because this method is case sensitive.
>>> original_tokens = [token.lower() for token in original_tokens]
>>> duplicate_tokens = [token.lower() for token in duplicate_tokens]
>>> original_trigrams = []
>>> for i in range(len(original_tokens) - 2):
...     original_trigrams.append(tuple(original_tokens[i:i + 3]))
...
>>> duplicate_trigrams = []
>>> for i in range(len(duplicate_tokens) - 2):
...     duplicate_trigrams.append(tuple(duplicate_tokens[i:i + 3]))
...
>>> # Fraction of the duplicate text's trigrams that also occur in the original text.
>>> matches = [trigram for trigram in duplicate_trigrams if trigram in original_trigrams]
>>> print(len(matches) / len(duplicate_trigrams))
1.0

>>> from gensim.summarization import keywords
>>> text = """spaCy is an open-source software library for advanced natural language processing,
written in the programming languages Python and Cython. The library is published under the MIT license
and its main developers are Matthew Honnibal and Ines Montani, the founders of the software company Explosion."""
>>> print(keywords(text))
language
languages
software
company

>>> from rake_nltk import Rake
>>> rake_nltk_var = Rake()
>>> text = """spaCy is an open-source software library for advanced natural language processing,
written in the programming languages Python and Cython. The library is published under the MIT license
and its main developers are Matthew Honnibal and Ines Montani, the founders of the software company Explosion."""
>>> rake_nltk_var.extract_keywords_from_text(text)
>>> keyword_extracted = rake_nltk_var.get_ranked_phrases()
>>> print(keyword_extracted)
['advanced natural language processing', 'software company explosion',
'programming languages python', 'source software library', 'mit license', ...]

>>> import yake
>>> kw_extractor = yake.KeywordExtractor()
>>> text = """spaCy is an open-source software library for advanced natural language processing, written in the programming languages Python and Cython. The library is published under the MIT license and its main developers are Matthew Honnibal and Ines Montani, the founders of the software company Explosion."""
>>> language = "en"
>>> max_ngram_size = 3
>>> deduplication_threshold = 0.9
>>> numOfKeywords = 20
>>> custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)
>>> keywords = custom_kw_extractor.extract_keywords(text)
>>> for kw in keywords:
...     print(kw)  # each entry is a (keyword, score) tuple; lower scores are more relevant

>>> import spacy
>>> nlp = spacy.load("en_core_sci_lg")
>>> text = """spaCy is an open-source software library for advanced natural language processing,
written in the programming languages Python and Cython. The library is published under the MIT license
and its main developers are Matthew Honnibal and Ines Montani, the founders of the software company Explosion."""
>>> doc = nlp(text)
>>> print(doc.ents)
(spaCy, open-source software library, written, programming languages,
Python, Cython, library, MIT, license, developers, Matthew Honnibal,
Ines, Montani, founders, software company)