NewscatcherAPI · December 30, 2021 09:55 · shantanuo · Dec 30, 2021
diff --git a/all_summary.py b/all_summary.py
 # Pull the 'summary' text out of every fetched article, then use the first
 # one as the sample sentence for the tokenization examples.
 summary = [item['summary'] for item in articles]
 sentence = summary[0]
diff --git a/article_structure.py b/article_structure.py
 {'_id': 'baff70092abcf695d73af5186c4df82f',
 '_score': 26.395569,
 'author': None,
 'authors': [],
 'clean_url': 'cnbctv18.com',
 'country': 'US',
 'excerpt': "Some analysts believe an Ethereum ETF can be more successful than the Bitcoin ETF. For example, Grayscale's Ethereum Trust (ETHE) is witnessing more institutional investors flocking to Grayscale's…",
 'is_opinion': False,
 'language': 'en',
 'link': 'https://www.cnbctv18.com/cryptocurrency/is-ethereum-etf-on-the-way-grayscale-ceo-deciphers-11178362.htm',
 'media': 'https://images.cnbctv18.com/wp-content/uploads/2021/09/ether-1019x573.jpg',
 'published_date': '2021-10-21 13:14:33',
 'published_date_precision': 'timezone unknown',
 'rank': 16951,
 'rights': 'cnbctv18.com',
 'summary': "Following the debut of Bitcoin futures ETF in the United States, the crypto market is abuzz with talks of an impending Ether ETF.Speaking on a show on CNBC, Michael Sonnenshein, CEO of Grayscale -- an asset management company with $52 billion in assets under management -- says it is possible. He said it 'stands to reason' the Securities and Exchange Committee (SEC) will proactively consider bringing Ethereum ETF and other similar products in the US market.Canada already has Bitcoin, Ethereum ETFsWhile US regulators have allowed Bitcoin futures ETF to be traded on the exchanges, Canada has allowed both Bitcoin and Ethereum ETFs.",
 'title': 'Is Ethereum ETF on the way? Grayscale CEO deciphers',
 'topic': 'news',
 'twitter_account': None
 }
diff --git a/demo.py b/demo.py
 import time

 # Interactive demo: read a text and a library choice from stdin, then run the
 # selected tokenization pipeline(s). `spacy_pipeline` and `nltk_pipeline` are
 # not defined in this snippet -- presumably they wrap the spaCy / NLTK steps
 # shown in the other snippets; confirm against their definitions.
 text = input("Enter the text to be tokenized: \n")
 choice = input("\nEnter Your choice of library: \n->spaCy (s) \n->NLTK (n) \n->Both (b)\n")
 if choice == 's':
     spacy_pipeline(text)
 elif choice == 'n':
     nltk_pipeline(text)
 elif choice == 'b':
     # `time` is the module, so it cannot be called directly; perf_counter()
     # is the clock intended for measuring elapsed durations.
     start = time.perf_counter()
     print("\t\t\tspaCy\n")
     spacy_pipeline(text)
     print(f"Time taken by spaCy: {time.perf_counter() - start}s")
     # Second run must exercise (and be labelled as) the NLTK pipeline.
     start = time.perf_counter()
     print("\n\t\t\tnltk\n")
     nltk_pipeline(text)
     print(f"Time taken by NLTK: {time.perf_counter() - start}s")
 else:
     print("Invalid choice!")
diff --git a/fetch_articles.py b/fetch_articles.py
 from newscatcherapi import NewsCatcherApiClient

 # Placeholder credential; replace with a real NewsCatcher API key.
 API_KEY = "YOUR API KEY GOES HERE"

 newscatcherapi = NewsCatcherApiClient(x_api_key=API_KEY)

 # Query English-language results matching any of the three crypto terms.
 data = newscatcherapi.get_search(
     q="Bitcoin OR Ethereum OR crypto",
     lang='en',
     page_size=100,
 )
 # The response is indexed by 'articles'; show the first entry as a sample.
 articles = data['articles']
 print(articles[0])
diff --git a/import_nltk.py b/import_nltk.py
 import nltk

 # Download the NLTK data packages by name. Presumably these back the
 # word_tokenize, WordNetLemmatizer and stopwords usages in the other snippets.
 for resource in ('punkt', 'wordnet', 'stopwords'):
     nltk.download(resource)
diff --git a/install_libraries b/install_libraries
 # Install the third-party packages: spaCy, NLTK and the NewsCatcher API client.
 # NOTE(review): the spaCy snippet loads 'en_core_web_sm'; that model presumably
 # has to be installed separately (python -m spacy download en_core_web_sm) -- confirm.
 pip install spacy
 pip install nltk
 pip install newscatcherapi
diff --git a/nltk_pipeline.py b/nltk_pipeline.py
 from nltk.tokenize import word_tokenize  
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
 from nltk.stem.porter import *
 # Split the sample sentence into word and punctuation tokens.
 tokens = word_tokenize(sentence)

 #Stemming
 # Reduce every token to its Porter stem.
 stemmer = PorterStemmer()
 stemed_tokens = [stemmer.stem(token) for token in tokens]

 #Lemmatization
 # Run the WordNet lemmatizer over the already-stemmed tokens.
 lemmatizer = WordNetLemmatizer()
 nltk_lemma_list = [lemmatizer.lemmatize(stem) for stem in stemed_tokens]

 print("Stemming + Lemmatization:")
 print(nltk_lemma_list)

 """
 Stemming + Lemmatization:
 ['follow', 'the', 'debut', 'of', 'bitcoin', 'futur', 
 'etf', 'in', 'the', 'unit', 'state', ',', 'the', 'crypto', 
 'market', 'is', 'abuzz', 'with', 'talk', 'of', 'an', 'impend', 
 'ether', 'etf.speak', 'on', 'a', 'show', 'on', 'cnbc', ',', 'michael', 
 'sonnenshein', ',', 'ceo', 'of', 'grayscal', '--', 'an', 'asset', 'manag',
 'compani', 'with', '$', '52', 'billion', 'in', 'asset', 'under', 'manag',
 '--', 'say', 'it', 'is', 'possibl', '.', 'He', 'said', 'it', "'stand", 'to', 
 'reason', "'", 'the', 'secur', 'and', 'exchang', 'committe', '(', 'sec', ')', 
 'will', 'proactiv', 'consid', 'bring', 'ethereum', 'etf', 'and', 'other', 'similar', 
 'product', 'in', 'the', 'US', 'market.canada', 'alreadi', 'ha', 'bitcoin', ',',
 'ethereum', 'etfswhil', 'US', 'regul', 'have', 'allow', 'bitcoin', 'futur', 'etf', 'to',
 'be', 'trade', 'on', 'the', 'exchang', ',', 'canada', 'ha', 'allow', 'both', 'bitcoin', 
 'and', 'ethereum', 'etf', '.']
 """
diff --git a/nltk_stopwords.py b/nltk_stopwords.py
 #Removing the stopwords
 # Keep only the tokens that are absent from NLTK's English stop-word list;
 # the list is turned into a set once so each membership test is cheap.
 nltk_stop_words = set(stopwords.words("english"))
 normalized_tokens = [token for token in nltk_lemma_list if token not in nltk_stop_words]

 #Removing the punctuations
 normalized_tokens = remove_punctuations(normalized_tokens)
 print(" ")
 print("\nText after removing stopwords & punctuations:\n")
 print(normalized_tokens)


 """
 Text after removing stopwords & punctuations:
 
 ['follow', 'debut', 'bitcoin', 'futur', 'etf', 'unit', 'state',
 'crypto', 'market', 'abuzz', 'talk', 'impend', 'ether', 'etf.speak',
 'show', 'cnbc', 'michael', 'sonnenshein', 'ceo', 'grayscal', 'asset', 
 'manag', 'compani', '$', '52', 'billion', 'asset', 'manag', 'say', 
 'possibl', 'He', 'said', "'stand", 'reason', "'", 'secur', 'exchang', 
 'committe', 'sec', 'proactiv', 'consid', 'bring', 'ethereum', 'etf',
 'similar', 'product', 'US', 'market.canada', 'alreadi', 'ha', 'bitcoin',
 'ethereum', 'etfswhil', 'US', 'regul', 'allow', 'bitcoin', 'futur', 'etf',
 'trade', 'exchang', 'canada', 'ha', 'allow', 'bitcoin', 'ethereum', 'etf']
 """
diff --git a/punc_remove_function.py b/punc_remove_function.py
 # Helper for dropping stand-alone punctuation tokens from a list of string
 # tokens (spaCy Token objects expose `is_punct`, but the pipelines here pass
 # plain strings, so a literal lookup is used instead).
 def remove_punctuations(normalized_tokens):
     """Remove punctuation tokens from ``normalized_tokens`` in place.

     Args:
         normalized_tokens: mutable list of tokens; any element equal to one
             of the punctuation marks below is dropped.

     Returns:
         The same list object that was passed in, with every punctuation
         token removed.
     """
     punctuations = ('?', ':', '!', ',', '.', ';', '|', '(', ')', '--')
     # Rebuild the contents through a slice assignment rather than calling
     # .remove() while iterating: removing during iteration shifts the
     # remaining items and skips the element right after each removal, so
     # adjacent punctuation tokens used to survive.
     normalized_tokens[:] = [
         word for word in normalized_tokens if word not in punctuations
     ]
     return normalized_tokens
diff --git a/remove_punc.py b/remove_punc.py
 # Helper for dropping stand-alone punctuation tokens from a list of string
 # tokens (spaCy Token objects expose `is_punct`, but the pipelines here pass
 # plain strings, so a literal lookup is used instead).
 def remove_punctuations(normalized_tokens):
     """Remove punctuation tokens from ``normalized_tokens`` in place.

     Args:
         normalized_tokens: mutable list of tokens; any element equal to one
             of the punctuation marks below is dropped.

     Returns:
         The same list object that was passed in, with every punctuation
         token removed.
     """
     punctuations = ('?', ':', '!', ',', '.', ';', '|', '(', ')', '--')
     # Filtering into a fresh list and writing it back avoids mutating the
     # list while it is being iterated, which used to skip the token that
     # followed each removed one (e.g. two commas in a row).
     kept = [word for word in normalized_tokens if word not in punctuations]
     normalized_tokens[:] = kept
     return normalized_tokens
diff --git a/spacy_get_lemma.py b/spacy_get_lemma.py
 # Collect the lemma string (`lemma_`) of every token in the processed doc.
 lemma_list = [token.lemma_ for token in doc]
 print("Lemmatized tokens:\n")
 print(lemma_list)



 """Lemmatized tokens:
 ['follow', 'the', 'debut', 'of', 'Bitcoin', 'future', 
 'etf', 'in', 'the', 'United', 'States', ',', 'the', 'crypto', 
 'market', 'be', 'abuzz', 'with', 'talk', 'of', 'an', 'impend', 
 'Ether', 'etf.speake', 'on', 'a', 'show', 'on', 'CNBC', ',', 'Michael', 
 'Sonnenshein', ',', 'ceo', 'of', 'Grayscale', '--', 'an', 'asset',
 'management', 'company', 'with', '$', '52', 'billion', 'in', 'asset', 
 'under', 'management', '--', 'say', '-PRON-', 'be', 'possible', '.', 
 '-PRON-', 'say', '-PRON-', "'", 'stand', 'to', 'reason', "'", 'the', 
 'Securities', 'and', 'Exchange', 'Committee', '(', 'SEC', ')', 'will',
 'proactively', 'consider', 'bring', 'Ethereum', 'etf', 'and', 'other', 
 'similar', 'product', 'in', 'the', 'US', 'market', '.', 'Canada', 'already', 
 'have', 'Bitcoin', ',', 'Ethereum', 'ETFsWhile', 'US', 'regulator', 'have', 
 'allow', 'Bitcoin', 'future', 'etf', 'to', 'be', 'trade', 'on', 'the', 'exchange',
 ',', 'Canada', 'have', 'allow', 'both', 'Bitcoin', 'and', 'Ethereum', 'etf', '.']
 """
diff --git a/spacy_nlp_pipeline_ex.py b/spacy_nlp_pipeline_ex.py
 import spacy
 # Only the parser and NER are switched off. The tagger and lemmatizer are the
 # components we want, and in spaCy v3 pipelines they depend on `tok2vec`
 # (shared embeddings the tagger listens to) and `attribute_ruler` (maps tags
 # to the POS values the rule-based lemmatizer reads), so those must stay on.
 nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
 # Run the remaining pipeline over the sample sentence.
 doc = nlp(sentence)
diff --git a/spacy_punc.py b/spacy_punc.py
 #Removing the stopwords
 # Look each lemma up in the pipeline vocabulary and keep it only when its
 # lexeme is not flagged as a stop word.
 normalized_tokens = [lemma for lemma in lemma_list if not nlp.vocab[lemma].is_stop]
 # Then strip the stand-alone punctuation tokens.
 normalized_tokens = remove_punctuations(normalized_tokens)
 print("\nText after removing stopwords & punctuations:\n")
 print(normalized_tokens)

 """Text after removing stopwords & punctuations:
 
 ['follow', 'debut', 'Bitcoin', 'future', 'etf', 'United', 
 'States', 'crypto', 'market', 'abuzz', 'talk', 'impend', 
 'Ether', 'etf.speake', 'CNBC', 'Michael', 'Sonnenshein', 'ceo', 
 'Grayscale', 'asset', 'management', 'company', '$', '52', 'billion', 
 'asset', 'management', '-PRON-', 'possible', '-PRON-', '-PRON-', "'", 
 'stand', 'reason', "'", 'Securities', 'Exchange', 'Committee', 'SEC', 
 'proactively', 'consider', 'bring', 'Ethereum', 'etf', 'similar', 'product', 
 'market', 'Canada', 'Bitcoin', 'Ethereum', 'ETFsWhile', 'regulator', 'allow', 
 'Bitcoin', 'future', 'etf', 'trade', 'exchange', 'Canada', 'allow', 'Bitcoin',
 'Ethereum', 'etf']
 """
diff --git a/spacy_tokenizer_example.py b/spacy_tokenizer_example.py
 from spacy.tokenizer import Tokenizer
 from spacy.lang.en import English
 # Despite its name, `vocab` holds the blank English language object; its
 # `tokenizer` attribute is the tokenizer with the default English settings.
 vocab = English()
 tokenizer = vocab.tokenizer
 # Tokenize the sample sentence and show the resulting tokens as a list.
 tokens = tokenizer(sentence)
 print([token for token in tokens])

 """[Following, the, debut, of, Bitcoin, futures, ETF, in, the, United, 
 States, ,, the, crypto, market, is, abuzz, with, talks, of, an, impending, 
 Ether, ETF.Speaking, on, a, show, on, CNBC, ,, Michael, Sonnenshein, ,, CEO, 
 of, Grayscale, --, an, asset, management, company, with, $, 52, billion, in, 
 assets, under, management, --, says, it, is, possible, ., He, said, it, ', 
 stands, to, reason, ', the, Securities, and, Exchange, Committee, (, SEC, ), 
 will, proactively, consider, bringing, Ethereum, ETF, and, other, similar, 
 products, in, the, US, market, ., Canada, already, has, Bitcoin, ,, Ethereum, 
 ETFsWhile, US, regulators, have, allowed, Bitcoin, futures, ETF, to, be, traded, 
 on, the, exchanges, ,, Canada, has, allowed, both, Bitcoin, and, Ethereum, ETFs, .]"""
	# Pull the 'summary' text out of every fetched article, then use the first
	# one as the sample sentence for the tokenization examples.
	summary = [item['summary'] for item in articles]
	sentence = summary[0]
	{'_id': 'baff70092abcf695d73af5186c4df82f',
	'_score': 26.395569,
	'author': None,
	'authors': [],
	'clean_url': 'cnbctv18.com',
	'country': 'US',
	'excerpt': "Some analysts believe an Ethereum ETF can be more successful than the Bitcoin ETF. For example, Grayscale's Ethereum Trust (ETHE) is witnessing more institutional investors flocking to Grayscale's…",
	'is_opinion': False,
	'language': 'en',
	'link': 'https://www.cnbctv18.com/cryptocurrency/is-ethereum-etf-on-the-way-grayscale-ceo-deciphers-11178362.htm',
	'media': 'https://images.cnbctv18.com/wp-content/uploads/2021/09/ether-1019x573.jpg',
	'published_date': '2021-10-21 13:14:33',
	'published_date_precision': 'timezone unknown',
	'rank': 16951,
	'rights': 'cnbctv18.com',
	'summary': "Following the debut of Bitcoin futures ETF in the United States, the crypto market is abuzz with talks of an impending Ether ETF.Speaking on a show on CNBC, Michael Sonnenshein, CEO of Grayscale -- an asset management company with $52 billion in assets under management -- says it is possible. He said it 'stands to reason' the Securities and Exchange Committee (SEC) will proactively consider bringing Ethereum ETF and other similar products in the US market.Canada already has Bitcoin, Ethereum ETFsWhile US regulators have allowed Bitcoin futures ETF to be traded on the exchanges, Canada has allowed both Bitcoin and Ethereum ETFs.",
	'title': 'Is Ethereum ETF on the way? Grayscale CEO deciphers',
	'topic': 'news',
	'twitter_account': None
	}
	import time

	# Interactive demo: read a text and a library choice from stdin, then run the
	# selected tokenization pipeline(s). `spacy_pipeline` and `nltk_pipeline` are
	# not defined in this snippet -- presumably they wrap the spaCy / NLTK steps
	# shown in the other snippets; confirm against their definitions.
	text = input("Enter the text to be tokenized: \n")
	choice = input("\nEnter Your choice of library: \n->spaCy (s) \n->NLTK (n) \n->Both (b)\n")
	if choice == 's':
	    spacy_pipeline(text)
	elif choice == 'n':
	    nltk_pipeline(text)
	elif choice == 'b':
	    # `time` is the module, so it cannot be called directly; perf_counter()
	    # is the clock intended for measuring elapsed durations.
	    start = time.perf_counter()
	    print("\t\t\tspaCy\n")
	    spacy_pipeline(text)
	    print(f"Time taken by spaCy: {time.perf_counter() - start}s")
	    # Second run must exercise (and be labelled as) the NLTK pipeline.
	    start = time.perf_counter()
	    print("\n\t\t\tnltk\n")
	    nltk_pipeline(text)
	    print(f"Time taken by NLTK: {time.perf_counter() - start}s")
	else:
	    print("Invalid choice!")
	from newscatcherapi import NewsCatcherApiClient

	# Placeholder credential; replace with a real NewsCatcher API key.
	API_KEY = "YOUR API KEY GOES HERE"

	newscatcherapi = NewsCatcherApiClient(x_api_key=API_KEY)

	# Query English-language results matching any of the three crypto terms.
	data = newscatcherapi.get_search(
	    q="Bitcoin OR Ethereum OR crypto",
	    lang='en',
	    page_size=100,
	)
	# The response is indexed by 'articles'; show the first entry as a sample.
	articles = data['articles']
	print(articles[0])
	import nltk

	# Download the NLTK data packages by name. Presumably these back the
	# word_tokenize, WordNetLemmatizer and stopwords usages in the other snippets.
	for resource in ('punkt', 'wordnet', 'stopwords'):
	    nltk.download(resource)
	from nltk.tokenize import word_tokenize
	from nltk.corpus import stopwords
	from nltk.stem import WordNetLemmatizer
	from nltk.stem.porter import *
	# Split the sample sentence into word and punctuation tokens.
	tokens = word_tokenize(sentence)

	#Stemming
	# Reduce every token to its Porter stem.
	stemmer = PorterStemmer()
	stemed_tokens = [stemmer.stem(token) for token in tokens]

	#Lemmatization
	# Run the WordNet lemmatizer over the already-stemmed tokens.
	lemmatizer = WordNetLemmatizer()
	nltk_lemma_list = [lemmatizer.lemmatize(stem) for stem in stemed_tokens]

	print("Stemming + Lemmatization:")
	print(nltk_lemma_list)

	"""
	Stemming + Lemmatization:
	['follow', 'the', 'debut', 'of', 'bitcoin', 'futur',
	'etf', 'in', 'the', 'unit', 'state', ',', 'the', 'crypto',
	'market', 'is', 'abuzz', 'with', 'talk', 'of', 'an', 'impend',
	'ether', 'etf.speak', 'on', 'a', 'show', 'on', 'cnbc', ',', 'michael',
	'sonnenshein', ',', 'ceo', 'of', 'grayscal', '--', 'an', 'asset', 'manag',
	'compani', 'with', '$', '52', 'billion', 'in', 'asset', 'under', 'manag',
	'--', 'say', 'it', 'is', 'possibl', '.', 'He', 'said', 'it', "'stand", 'to',
	'reason', "'", 'the', 'secur', 'and', 'exchang', 'committe', '(', 'sec', ')',
	'will', 'proactiv', 'consid', 'bring', 'ethereum', 'etf', 'and', 'other', 'similar',
	'product', 'in', 'the', 'US', 'market.canada', 'alreadi', 'ha', 'bitcoin', ',',
	'ethereum', 'etfswhil', 'US', 'regul', 'have', 'allow', 'bitcoin', 'futur', 'etf', 'to',
	'be', 'trade', 'on', 'the', 'exchang', ',', 'canada', 'ha', 'allow', 'both', 'bitcoin',
	'and', 'ethereum', 'etf', '.']
	"""
	#Removing the stopwords
	# Keep only the tokens that are absent from NLTK's English stop-word list;
	# the list is turned into a set once so each membership test is cheap.
	nltk_stop_words = set(stopwords.words("english"))
	normalized_tokens = [token for token in nltk_lemma_list if token not in nltk_stop_words]

	#Removing the punctuations
	normalized_tokens = remove_punctuations(normalized_tokens)
	print(" ")
	print("\nText after removing stopwords & punctuations:\n")
	print(normalized_tokens)


	"""
	Text after removing stopwords & punctuations:

	['follow', 'debut', 'bitcoin', 'futur', 'etf', 'unit', 'state',
	'crypto', 'market', 'abuzz', 'talk', 'impend', 'ether', 'etf.speak',
	'show', 'cnbc', 'michael', 'sonnenshein', 'ceo', 'grayscal', 'asset',
	'manag', 'compani', '$', '52', 'billion', 'asset', 'manag', 'say',
	'possibl', 'He', 'said', "'stand", 'reason', "'", 'secur', 'exchang',
	'committe', 'sec', 'proactiv', 'consid', 'bring', 'ethereum', 'etf',
	'similar', 'product', 'US', 'market.canada', 'alreadi', 'ha', 'bitcoin',
	'ethereum', 'etfswhil', 'US', 'regul', 'allow', 'bitcoin', 'futur', 'etf',
	'trade', 'exchang', 'canada', 'ha', 'allow', 'bitcoin', 'ethereum', 'etf']
	"""
	# Helper for dropping stand-alone punctuation tokens from a list of string
	# tokens (spaCy Token objects expose `is_punct`, but the pipelines here pass
	# plain strings, so a literal lookup is used instead).
	def remove_punctuations(normalized_tokens):
	    """Remove punctuation tokens from ``normalized_tokens`` in place.

	    Args:
	        normalized_tokens: mutable list of tokens; any element equal to one
	            of the punctuation marks below is dropped.

	    Returns:
	        The same list object that was passed in, with every punctuation
	        token removed.
	    """
	    # The pipe entry is a plain '|'; the backslash-escaped form never
	    # matched a real '|' token.
	    punctuations = ('?', ':', '!', ',', '.', ';', '|', '(', ')', '--')
	    # Filter into a new list and write it back instead of calling
	    # .remove() during iteration, which skipped the token following each
	    # removed one (adjacent punctuation survived).
	    normalized_tokens[:] = [
	        word for word in normalized_tokens if word not in punctuations
	    ]
	    return normalized_tokens
	# Collect the lemma string (`lemma_`) of every token in the processed doc.
	lemma_list = [token.lemma_ for token in doc]
	print("Lemmatized tokens:\n")
	print(lemma_list)



	"""Lemmatized tokens:
	['follow', 'the', 'debut', 'of', 'Bitcoin', 'future',
	'etf', 'in', 'the', 'United', 'States', ',', 'the', 'crypto',
	'market', 'be', 'abuzz', 'with', 'talk', 'of', 'an', 'impend',
	'Ether', 'etf.speake', 'on', 'a', 'show', 'on', 'CNBC', ',', 'Michael',
	'Sonnenshein', ',', 'ceo', 'of', 'Grayscale', '--', 'an', 'asset',
	'management', 'company', 'with', '$', '52', 'billion', 'in', 'asset',
	'under', 'management', '--', 'say', '-PRON-', 'be', 'possible', '.',
	'-PRON-', 'say', '-PRON-', "'", 'stand', 'to', 'reason', "'", 'the',
	'Securities', 'and', 'Exchange', 'Committee', '(', 'SEC', ')', 'will',
	'proactively', 'consider', 'bring', 'Ethereum', 'etf', 'and', 'other',
	'similar', 'product', 'in', 'the', 'US', 'market', '.', 'Canada', 'already',
	'have', 'Bitcoin', ',', 'Ethereum', 'ETFsWhile', 'US', 'regulator', 'have',
	'allow', 'Bitcoin', 'future', 'etf', 'to', 'be', 'trade', 'on', 'the', 'exchange',
	',', 'Canada', 'have', 'allow', 'both', 'Bitcoin', 'and', 'Ethereum', 'etf', '.']
	"""
	import spacy
	# Only the parser and NER are switched off. The tagger and lemmatizer are the
	# components we want, and in spaCy v3 pipelines they depend on `tok2vec`
	# (shared embeddings the tagger listens to) and `attribute_ruler` (maps tags
	# to the POS values the rule-based lemmatizer reads), so those must stay on.
	nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
	# Run the remaining pipeline over the sample sentence.
	doc = nlp(sentence)