# Import the English language class
from spacy.lang.en import English
# Create the nlp object
nlp = English()
# Process a text
doc = nlp("This is a sentence.")
# Print a slice of the document (tokens at index 2 up to, but not including, 5)
print(doc[2:5].text)
# Print the document text
print(doc.text)
When you call nlp on a string, spaCy first tokenizes the text and creates a Doc object. In this exercise, you’ll learn more about the Doc, as well as its views, Token and Span.
# Import the English language class and create the nlp object
from spacy.lang.en import English
nlp = English()
# Process the text
doc = nlp("I like tree kangaroos and narwhals.")
# Select the first token
first_token = doc[0]
# Print the first token's text
print(first_token.text)
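Indexing a Doc with a single integer gives you a Token; indexing with a slice gives you a Span. As a small follow-up sketch for the same sentence (the slice indices below are assumptions based on how this text tokenizes), you could also pull out spans:
# A slice of the Doc is a Span: "tree kangaroos"
tree_kangaroos = doc[2:4]
print(tree_kangaroos.text)
# Slices are end-exclusive, so doc[2:6] covers "tree kangaroos and narwhals"
tree_kangaroos_and_narwhals = doc[2:6]
print(tree_kangaroos_and_narwhals.text)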
Use spaCy’s Doc and Token objects, and lexical attributes, to find percentages in a text. We’ll be looking for two consecutive tokens: a number and a percent sign.
from spacy.lang.en import English
nlp = English()
# Process the text
doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)
# Iterate over the tokens in the doc
for token in doc:
    # Check if the token resembles a number
    if token.like_num:
        # Get the next token in the document
        next_token = doc[token.i + 1]
        # Check if the next token's text equals '%'
        if next_token.text == "%":
            print("Percentage found:", token.text)
import spacy
# Load the small English model
nlp = spacy.load("en_core_web_sm")
text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"
# Process the text
doc = nlp(text)
# Print the document text
print(doc.text)
For each token, print the token text, the token’s .pos_ (part-of-speech tag) and the token’s .dep_ (dependency label).
import spacy
nlp = spacy.load("en_core_web_sm")
text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"
# Process the text
doc = nlp(text)
for token in doc:
    # Get the token text, part-of-speech tag and dependency label
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    # This is for formatting only
    print("{:<12}{:<10}{:<10}".format(token_text, token_pos, token_dep))
import spacy
nlp = spacy.load("en_core_web_sm")
text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"
# Process the text
doc = nlp(text)
# Iterate over the predicted entities
for ent in doc.ents:
    # Print the entity text and its label
    print(ent.text, ent.label_)
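If a label such as "GPE" or "ORG" isn't self-explanatory, spacy.explain returns a short description of it (or None for strings it doesn't know); a quick sketch:
# Look up quick definitions of entity labels
print(spacy.explain("GPE"))
print(spacy.explain("ORG"))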
import spacy
nlp = spacy.load("en_core_web_sm")
text = "New iPhone X release date leaked as Apple reveals pre-orders by mistake"
# Process the text
doc = nlp(text)
# Iterate over the entities
for ent in doc.ents:
    # Print the entity text and label
    print(ent.text, ent.label_)
# Get the span for "iPhone X"
iphone_x = doc[1:3]
# Print the span text
print("Missing entity:", iphone_x.text)
Note: Models are statistical and not always right. Whether their predictions are correct depends on the training data and the text you’re processing.
Rule-based matching
import spacy
# Import the Matcher
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
doc = nlp("New iPhone X release date leaked as Apple reveals pre-orders by mistake")
# Initialize the Matcher with the shared vocabulary
matcher = Matcher(nlp.vocab)
# Create a pattern matching two tokens: "iPhone" and "X"
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]
# Add the pattern to the matcher
matcher.add("IPHONE_X_PATTERN", None, pattern)
# Use the matcher on the doc
matches = matcher(doc)
print("Matches:", [doc[start:end].text for match_id, start, end in matches])
Write more complex match patterns using different token attributes and operators.
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
doc = nlp(
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper."
)
# Write a pattern for full iOS versions ("iOS 7", "iOS 11", "iOS 10")
pattern = [{"TEXT": "iOS"}, {"IS_DIGIT": True}]
# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("IOS_VERSION_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))
# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)
Write one pattern that only matches forms of “download” (tokens with the lemma “download”), followed by a token with the part-of-speech tag 'PROPN' (proper noun).
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
doc = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)
# Write a pattern that matches a form of "download" plus proper noun
pattern = [{"LEMMA": "download"}, {"POS": "PROPN"}]
# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("DOWNLOAD_THINGS_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))
# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)
Write one pattern that matches adjectives ('ADJ') followed by one or two 'NOUN's (one noun and one optional noun).
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
doc = nlp(
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses."
)
# Write a pattern for adjective plus one or two nouns
pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "?"}]
# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("ADJ_NOUN_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))
# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)
Look up the string “cat” in nlp.vocab.strings to get the hash. Look up the hash to get back the string.
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("I have a cat")
# Look up the hash for the word "cat"
cat_hash = nlp.vocab.strings["cat"]
print(cat_hash)
# Look up the cat_hash to get the string
cat_string = nlp.vocab.strings[cat_hash]
print(cat_string)
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("David Bowie is a PERSON")
# Look up the hash for the string label "PERSON"
person_hash = nlp.vocab.strings["PERSON"]
print(person_hash)
# Look up the person_hash to get the string
person_string = nlp.vocab.strings[person_hash]
print(person_string)
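Hashes can't be reversed: looking a hash up only works if that string store has already seen the string. A hedged sketch of what goes wrong otherwise, using a second, blank German pipeline purely for illustration:
from spacy.lang.de import German
# A blank German pipeline with its own, separate string store
nlp_de = German()
bowie_hash = nlp.vocab.strings["Bowie"]
print(bowie_hash)
# This line raises an error: the German vocab has never seen the string
# "Bowie", so the hash can't be resolved back to a string there
print(nlp_de.vocab.strings[bowie_hash])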