Last active: November 2, 2022
Introduction to NLTK Library
# Getting started with NLTK scripts - used in blog post:
# https://towardsdatascience.com/getting-started-with-nltk-eb4ed6eb7a37

from nltk import tokenize

python_wiki = '''
Python is a high-level, interpreted, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation.
Python is dynamically-typed and garbage-collected. It supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming. It is often described as a "batteries included" language due to its comprehensive standard library.
Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language and first released it in 1991 as Python 0.9.0. Python 2.0 was released in 2000 and introduced new features such as list comprehensions, cycle-detecting garbage collection, reference counting, and Unicode support. Python 3.0, released in 2008, was a major revision that is not completely backward-compatible with earlier versions. Python 2 was discontinued with version 2.7.18 in 2020.
Python consistently ranks as one of the most popular programming languages.
'''
# Whitespace Tokenizer - splits on spaces, tabs and newlines only
ws_tok = tokenize.WhitespaceTokenizer()
token_list = ws_tok.tokenize(python_wiki)
print(token_list[0:10])
# Treebank Tokenizer - follows the Penn Treebank conventions
tb_tokenizer = tokenize.treebank.TreebankWordTokenizer()
token_list = tb_tokenizer.tokenize(python_wiki)
print(token_list[0:10])
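The two tokenizers above differ mainly in how they handle punctuation; a minimal side-by-side sketch (the example sentence is illustrative, taken from the text used above):

```python
from nltk import tokenize

text = 'Python is dynamically-typed and garbage-collected.'

ws_tokens = tokenize.WhitespaceTokenizer().tokenize(text)
tb_tokens = tokenize.treebank.TreebankWordTokenizer().tokenize(text)

# Whitespace splitting leaves the final period glued to the last word;
# the Treebank tokenizer separates it into its own token
print(ws_tokens)  # ['Python', 'is', 'dynamically-typed', 'and', 'garbage-collected.']
print(tb_tokens)  # ['Python', 'is', 'dynamically-typed', 'and', 'garbage-collected', '.']
```

Note that neither tokenizer splits hyphenated words - `dynamically-typed` stays a single token in both.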
# Default Word Tokenizer - requires the punkt models:
# nltk.download('punkt')
from nltk import word_tokenize
token_list = word_tokenize(python_wiki)
print(token_list[0:10])
# Porter Stemmer
from nltk.stem import PorterStemmer
porter = PorterStemmer()
porter_tokens = [porter.stem(token) for token in token_list]
# Lancaster Stemmer
from nltk.stem import LancasterStemmer
lanc = LancasterStemmer()
lanc_tokens = [lanc.stem(token) for token in token_list]
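The Lancaster stemmer is noticeably more aggressive than Porter, often cutting stems shorter; a quick comparison on a few words (the word list is illustrative, not from the text above):

```python
from nltk.stem import PorterStemmer, LancasterStemmer

porter = PorterStemmer()
lanc = LancasterStemmer()

# Lancaster tends to produce shorter, sometimes non-word stems,
# while Porter is more conservative
for word in ['programming', 'comprehensions', 'maximum']:
    print(word, '->', porter.stem(word), '/', lanc.stem(word))
```

Neither stemmer guarantees a dictionary word as output, which is why the lemmatizer below is often preferred when readable output matters.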
# WordNet Lemmatizer - requires the wordnet corpus:
# nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemma_tokens = [lemmatizer.lemmatize(token) for token in token_list]
# POS Tagger - requires the tagger model:
# nltk.download('averaged_perceptron_tagger')
import nltk
pos_tags = nltk.pos_tag(token_list)
print(pos_tags[0:10])
# Get Lemma Tag Function - maps Penn Treebank POS tags
# to the WordNet POS codes the lemmatizer expects
def get_lemma_tag(pos_tag):
    if pos_tag.startswith('J'):
        return 'a'
    elif pos_tag.startswith('V'):
        return 'v'
    elif pos_tag.startswith('N'):
        return 'n'
    elif pos_tag.startswith('R'):
        return 'r'
    else:
        # Fall back to noun, the lemmatizer's default POS
        return 'n'
# Bi-grams
list(nltk.bigrams(token_list))

# N-grams
list(nltk.ngrams(token_list, 4))
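N-grams pair naturally with `nltk.FreqDist` for counting how often each sequence occurs; a small sketch on a toy token list (the list is illustrative):

```python
from nltk import FreqDist, bigrams

tokens = ['to', 'be', 'or', 'not', 'to', 'be']

# Count how often each adjacent pair of tokens occurs
fd = FreqDist(bigrams(tokens))
print(fd.most_common(1))  # [(('to', 'be'), 2)]
```

The same pattern works with `nltk.ngrams(tokens, n)` for any window size `n`.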