Created
February 4, 2020 01:33
-
-
Save zeddee/4f347f438393b5f5876576a8e197b053 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# From https://course.spacy.io/chapter1
from spacy.lang.en import English

# Create the English nlp object; calling it on a string returns a Doc.
nlp = English()
doc = nlp("hello world!!! one 123")

# Print each token's text on its own line.
for token in doc:
    print(token.text)

# NOTE(review): the original indentation was lost; this print is assumed
# to run once, after the loop, showing the whole Doc.
print(doc)

# A slice from a Doc is a Span object.
span = doc[1:4]
print(span.text)

# Lexical attributes: one list per attribute, aligned by token position.
print("Lexical attributes")
print("========================================")
print("Index:\t", [token.i for token in doc])
print("Text:\t", [token.text for token in doc])
# Label corrected from "is_alphs" so it matches the attribute it reports.
print("is_alpha:\t", [token.is_alpha for token in doc])
print("is_punct:\t", [token.is_punct for token in doc])
print("like_num:\t", [token.like_num for token in doc])
doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)

# Iterate over the tokens in the doc to find percentage figures:
# a number-like token immediately followed by a '%' token.
for token in doc:
    # Only tokens that resemble a number can start a percentage figure.
    if not token.like_num:
        continue

    # Position of the token that follows this one. A number-like token at
    # the very end of the doc has no successor, so skip it rather than
    # letting doc[...] raise IndexError.
    next_index = token.i + 1
    if next_index >= len(doc):
        continue

    # Check if the next token's text equals '%'.
    if doc[next_index].text == "%":
        print("Percentage found:", token.text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment