Jzzhou nlpjoe

Sentence Tokenization

>>> from nltk import tokenize
>>> para = "Hello. My name is Jacob. Today you'll be learning NLTK."
>>> sents = tokenize.sent_tokenize(para)
>>> sents
['Hello.', 'My name is Jacob.', "Today you'll be learning NLTK."]

两个quad空格	a \qquad b	a \qquad b	两个m的宽度
quad空格	a \quad b	a \quad b	一个m的宽度
大空格	a\ b	a\ b	1/3m宽度
中等空格	a\;b	a\;b	2/7m宽度
小空格	a\,b	a\,b	1/6m宽度
没有空格	ab	ab\,	
紧贴	a\!b	a\!b	缩进1/6m宽度

class EditDistance(object):
    """Edit-distance scorer that remembers whether transpositions are allowed.

    NOTE(review): only the beginning of the scoring routine is present in
    this snippet; for differing inputs it currently produces no score.
    """

    def __init__(self, allow_transpose=True):
        # No score has been computed yet; the transpose switch is stored as given.
        self.score = None
        self.allow_transpose = allow_transpose

    def score_edit_distance(self, source, target):
        """Return 0 when ``source`` equals ``target``.

        For differing inputs the visible code only measures ``source`` and
        then falls off the end, so the result is ``None``.
        """
        identical = source == target
        if identical:
            return 0
        source_length = len(source)

How to check if a string in Python is in ASCII?

Python 3 way:

def isascii(s):
    """Return True if every character of the string ``s`` is ASCII.

    ASCII characters occupy exactly one byte in UTF-8 (the default encoding
    of ``str.encode()``), so a pure-ASCII string has the same length before
    and after encoding; any other character needs more than one byte.
    """
    try:
        return len(s) == len(s.encode())
    except UnicodeEncodeError:
        # Lone surrogates cannot be encoded to UTF-8 at all, and they are
        # certainly not ASCII.
        return False

ASCII characters are each encoded as exactly 1 byte in UTF-8 (the default encoding used by `str.encode()`), so an all-ASCII string has the same length before and after encoding; any non-ASCII character is encoded as 2, 3, or 4 bytes, which makes the encoded bytes longer than the original string.

Q

>>> x = {'a':1, 'b': 2}
>>> y = {'b':10, 'c': 11}
>>> z = x.update(y)
>>> print(z)
None
>>> x
{'a': 1, 'b': 10, 'c': 11}

	from nltk.corpus import wordnet

	def get_wordnet_pos(self, treebank_tag):
	    """Map a Penn Treebank POS tag to the corresponding WordNet POS constant.

	    The mapping looks only at the first letter of the tag:
	    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN,
	    'R' -> wordnet.ADV.

	    Returns None for any tag that starts with a different letter.
	    """
	    if treebank_tag.startswith('J'):
	        return wordnet.ADJ
	    elif treebank_tag.startswith('V'):
	        return wordnet.VERB
	    elif treebank_tag.startswith('N'):
	        return wordnet.NOUN
	    elif treebank_tag.startswith('R'):
	        return wordnet.ADV
	    # NOTE(review): unmapped tags yield None; callers must choose their own
	    # fallback part of speech before passing the result on.
	    return None

	import nltk

	# Read the whole sample document into memory.
	with open('sample.txt', 'r') as f:
	    sample = f.read()

	# Pipeline: sentence split -> word tokenization -> POS tagging -> NE chunking.
	sentences = nltk.sent_tokenize(sample)
	tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
	tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
	# binary=True asks the chunker to mark named entities with a single NE label
	# rather than distinguishing entity types (see the NLTK ne_chunk docs).
	chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

	from enum import Enum

	# Functional API: the second argument is one whitespace-separated string of
	# member names; members are numbered starting from 1.
	Animal = Enum('Animal', 'ant bee cat dog')

	# Access members by attribute:
	Animal.ant
	Animal.bee

	def edit_distance(str_a, str_b):
	    """
	    Set up the tables for an edit-distance computation from str_a to str_b.

	    Operation codes: 0 = match, 1 = substitution, 2 = insertion, 3 = deletion.

	    NOTE(review): only the table initialisation is present in this snippet;
	    it prints the freshly initialised distance table and returns None.
	    """
	    len_a = len(str_a) + 1
	    len_b = len(str_b) + 1
	    # Both tables have (len(str_a) + 1) rows and (len(str_b) + 1) columns,
	    # with every cell starting at infinity.
	    dist = [[float('inf')] * len_b for _ in range(len_a)]
	    operation = [[float('inf')] * len_b for _ in range(len_a)]
	    print(dist)

	from itertools import chain
	from collections import Counter

	class StringUtils(object):

	@staticmethod
	def get_m_2_ngrams(input_list, min, max):
	"""
	获得给定大小的子串，比如给定参数(items, 1, 2)
	返回：[[a], [a, b], [b], [b, c], [c], [c, d], [d]]的生成器