Created
November 1, 2023 06:23
-
-
Save codingdudecom/9bc1b19ac82c556331da0c4f2efc7885 to your computer and use it in GitHub Desktop.
NLP Python code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from js import fetch | |
import nltk | |
from nltk.util import ngrams | |
from pathlib import Path | |
import os, sys, io, zipfile | |
# English stop words (NLTK's standard list), inlined as a comma-separated
# string so the nltk "stopwords" corpus never has to be downloaded at runtime.
stopwords = (
    "i,me,my,myself,we,our,ours,ourselves,you,your,yours,yourself,yourselves,he,him,his,himself,she,her,hers,herself,it,its,itself,they,them,their,theirs,themselves,what,which,who,whom,this,that,these,those,am,is,are,was,were,be,been,being,have,has,had,having,do,does,did,doing,a,an,the,and,but,if,or,because,as,until,while,of,at,by,for,with,about,against,between,into,through,during,before,after,above,below,to,from,up,down,in,out,on,off,over,under,again,further,then,once,here,there,when,where,why,how,all,any,both,each,few,more,most,other,some,such,no,nor,not,only,own,same,so,than,too,very,s,t,can,will,just,don,should,now"
).split(",")

# Guard flag: set True once the punkt tokenizer data has been fetched and
# unpacked, so repeated calls skip the download.
punkt_downloaded = False
async def download_punkt():
    """Fetch and unpack the NLTK 'punkt' tokenizer data (idempotent).

    Downloads punkt.zip via the browser fetch API (Pyodide `js` bridge) and
    extracts it under /nltk_data/tokenizers/, one of the default locations
    where nltk searches for tokenizer models.  Subsequent calls are no-ops
    once the module-level `punkt_downloaded` flag is set.
    """
    global punkt_downloaded
    if punkt_downloaded:
        return
    response = await fetch('https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip')
    js_buffer = await response.arrayBuffer()
    # to_py() yields a memoryview over the JS ArrayBuffer; tobytes() copies
    # it into an immutable Python bytes object we can write to disk.
    payload = js_buffer.to_py().tobytes()
    target_dir = Path("/nltk_data/tokenizers")
    target_dir.mkdir(parents=True, exist_ok=True)
    archive = target_dir / "punkt.zip"
    archive.write_bytes(payload)
    # Context manager closes the archive handle (the original leaked the
    # open ZipFile object).
    with zipfile.ZipFile(archive) as zf:
        zf.extractall(path=str(target_dir))
    punkt_downloaded = True
async def extract_keywords(text):
    """Extract the most frequent multi-word phrases from *text*.

    Ensures the punkt tokenizer data is available (downloaded on first use),
    tokenizes the text, keeps only alphanumeric tokens that are not stop
    words, then collects up to 10 of the most common bigrams, trigrams and
    quadgrams (in that order).

    Args:
        text: The input document as a single string.

    Returns:
        A list of ``[phrase, count]`` pairs, where ``phrase`` is the
        space-joined n-gram and ``count`` its frequency.
    """
    # Delegate to the shared downloader instead of duplicating its body here
    # (the original inlined an identical copy of download_punkt, complete
    # with the same unclosed-ZipFile leak).
    await download_punkt()

    tokens = nltk.word_tokenize(text)
    # Single pass: keep alphanumeric tokens, drop stop words (case-insensitive).
    filtered = [tok for tok in tokens
                if tok.isalnum() and tok.lower() not in stopwords]

    # Top 10 phrases per n-gram size; result order matches the original:
    # all bigrams first, then trigrams, then quadgrams.
    data = []
    for n in (2, 3, 4):
        data.extend(nltk.FreqDist(ngrams(filtered, n)).most_common(10))

    return [[" ".join(gram), count] for gram, count in data]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment