Created
November 1, 2023 06:23
-
-
Save codingdudecom/9bc1b19ac82c556331da0c4f2efc7885 to your computer and use it in GitHub Desktop.
NLP Python code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from js import fetch | |
import nltk | |
from nltk.util import ngrams | |
from pathlib import Path | |
import os, sys, io, zipfile | |
# English stop words (NLTK's standard list), inlined as a comma-separated
# string so the nltk "stopwords" corpus never has to be downloaded at runtime.
stopwords = (
    "i,me,my,myself,we,our,ours,ourselves,you,your,yours,yourself,yourselves,he,him,his,himself,she,her,hers,herself,it,its,itself,they,them,their,theirs,themselves,what,which,who,whom,this,that,these,those,am,is,are,was,were,be,been,being,have,has,had,having,do,does,did,doing,a,an,the,and,but,if,or,because,as,until,while,of,at,by,for,with,about,against,between,into,through,during,before,after,above,below,to,from,up,down,in,out,on,off,over,under,again,further,then,once,here,there,when,where,why,how,all,any,both,each,few,more,most,other,some,such,no,nor,not,only,own,same,so,than,too,very,s,t,can,will,just,don,should,now"
).split(",")

# Guard flag: set True once the punkt tokenizer data has been fetched and
# unpacked, so repeated calls skip the download.
punkt_downloaded = False
async def download_punkt():
    """Fetch and unpack the NLTK 'punkt' tokenizer data (idempotent).

    Downloads punkt.zip via the browser fetch API (Pyodide `js` bridge) and
    extracts it under /nltk_data/tokenizers/, one of the default locations
    where nltk searches for tokenizer models.  Subsequent calls are no-ops
    once the module-level `punkt_downloaded` flag is set.
    """
    global punkt_downloaded
    if punkt_downloaded:
        return
    response = await fetch('https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip')
    js_buffer = await response.arrayBuffer()
    # to_py() yields a memoryview over the JS ArrayBuffer; tobytes() copies
    # it into an immutable Python bytes object we can write to disk.
    payload = js_buffer.to_py().tobytes()
    target_dir = Path("/nltk_data/tokenizers")
    target_dir.mkdir(parents=True, exist_ok=True)
    archive = target_dir / "punkt.zip"
    archive.write_bytes(payload)
    # Context manager closes the archive handle (the original leaked the
    # open ZipFile object).
    with zipfile.ZipFile(archive) as zf:
        zf.extractall(path=str(target_dir))
    punkt_downloaded = True
async def extract_keywords(text):
    """Extract the most frequent multi-word phrases from *text*.

    Ensures the punkt tokenizer data is available (downloaded on first use),
    tokenizes the text, keeps only alphanumeric tokens that are not stop
    words, then collects up to 10 of the most common bigrams, trigrams and
    quadgrams (in that order).

    Args:
        text: The input document as a single string.

    Returns:
        A list of ``[phrase, count]`` pairs, where ``phrase`` is the
        space-joined n-gram and ``count`` its frequency.
    """
    # Delegate to the shared downloader instead of duplicating its body here
    # (the original inlined an identical copy of download_punkt, complete
    # with the same unclosed-ZipFile leak).
    await download_punkt()

    tokens = nltk.word_tokenize(text)
    # Single pass: keep alphanumeric tokens, drop stop words (case-insensitive).
    filtered = [tok for tok in tokens
                if tok.isalnum() and tok.lower() not in stopwords]

    # Top 10 phrases per n-gram size; result order matches the original:
    # all bigrams first, then trigrams, then quadgrams.
    data = []
    for n in (2, 3, 4):
        data.extend(nltk.FreqDist(ngrams(filtered, n)).most_common(10))

    return [[" ".join(gram), count] for gram, count in data]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment