>>> from nltk import tokenize
>>> para = "Hello. My name is Jacob. Today you'll be learning NLTK."
>>> sents = tokenize.sent_tokenize(para)
>>> sents
['Hello.', 'My name is Jacob.', "Today you'll be learning NLTK."]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from nltk.corpus import wordnet | |
| def get_wordnet_pos(self, treebank_tag): | |
| if treebank_tag.startswith('J'): | |
| return wordnet.ADJ | |
| elif treebank_tag.startswith('V'): | |
| return wordnet.VERB | |
| elif treebank_tag.startswith('N'): | |
| return wordnet.NOUN | |
| elif treebank_tag.startswith('R'): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import nltk | |
| with open('sample.txt', 'r') as f: | |
| sample = f.read() | |
| sentences = nltk.sent_tokenize(sample) | |
| tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences] | |
| tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences] | |
| chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from enum import Enum | |
| Animal = Enum('Animal', ('ant bee cat dog')) | |
| 访问: | |
| Animal.ant | |
| Animal.bee |
| 两个quad空格 | `a \qquad b` | $a \qquad b$ | 两个m的宽度 |
| quad空格 | `a \quad b` | $a \quad b$ | 一个m的宽度 |
| 大空格 | `a\ b` | $a\ b$ | 1/3m宽度 |
| 中等空格 | `a\;b` | $a\;b$ | 2/7m宽度 |
| 小空格 | `a\,b` | $a\,b$ | 1/6m宽度 |
| 没有空格 | `ab` | $ab$ | |
| 紧贴 | `a\!b` | $a\!b$ | 缩进1/6m宽度 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def edit_distance(str_a, str_b): | |
| """ | |
| a到b | |
| 0:正确;1:字符替换;2:插入;3:删除 | |
| """ | |
| len_a = len(str_a) + 1 | |
| len_b = len(str_b) + 1 | |
| dist = [[float('inf') for row in range(len_b)] for col in range(len_a)] | |
| operation = [[float('inf') for row in range(len_b)] for col in range(len_a)] | |
| print(dist) |
class EditDistance(object):
def __init__(self, allow_transpose=True):
self.allow_transpose = allow_transpose
self.score = None
def score_edit_distance(self, source, target):
if source == target:
return 0
s_pos = len(source)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from itertools import chain | |
| from collections import Counter | |
| class StringUtils(object): | |
| @staticmethod | |
| def get_m_2_ngrams(input_list, min, max): | |
| """ | |
| 获得给定大小的子串,比如给定参数(items, 1, 2) | |
| 返回:[[a], [a, b], [b], [b, c], [c], [c, d], [d]]的生成器 |
How to check if a string in Python is in ASCII?
Python 3 way:
def isascii(s):
    """Return True if every character of the string *s* is ASCII.

    The check compares the number of characters in *s* with the number of
    bytes in its UTF-8 encoding: ASCII characters encode to exactly one
    byte each, while every other character encodes to more than one, so
    the two lengths are equal only for an all-ASCII string.

    Args:
        s: The text to inspect (a ``str``).

    Returns:
        True if *s* is empty or contains only ASCII characters, otherwise
        False.
    """
    try:
        encoded = s.encode("utf-8")
    except UnicodeEncodeError:
        # Lone surrogates (e.g. "\ud800") cannot be encoded to UTF-8 at
        # all; they are not ASCII, so report False instead of raising.
        return False
    return len(encoded) == len(s)
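Each ASCII character encodes to exactly one byte in UTF-8 (the default encoding used by `str.encode()`), so an all-ASCII string has the same length before and after encoding. Any non-ASCII character encodes to two to four bytes, so its presence makes the encoded bytes longer than the original string. (On Python 3.7+, the built-in `str.isascii()` method performs this check directly.)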