Last active
September 21, 2023 17:14
-
-
Save ines/b320cb8441b590eedf19137599ce6685 to your computer and use it in GitHub Desktop.
Streamlit + spaCy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
pip install streamlit
pip install spacy
python -m spacy download en_core_web_sm
python -m spacy download en_core_web_md
python -m spacy download de_core_news_sm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import streamlit as st | |
import spacy | |
from spacy import displacy | |
import pandas as pd | |
# spaCy pipeline packages offered in the sidebar's "Model name" selector.
SPACY_MODEL_NAMES = ["en_core_web_sm", "en_core_web_md", "de_core_news_sm"]
# Initial content of the "Text to analyze" text area.
DEFAULT_TEXT = "Mark Zuckerberg is the CEO of Facebook."
# Scrollable, bordered container; "{}" is filled with displaCy's HTML output.
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
# `ignore_hash` was renamed to `allow_output_mutation`; the old keyword is no
# longer accepted by `st.cache`.
# NOTE(review): newer Streamlit releases recommend `st.cache_resource` for
# objects like loaded models -- consider migrating once the minimum supported
# Streamlit version is confirmed.
@st.cache(allow_output_mutation=True)
def load_model(name):
    """Load a spaCy pipeline and return it, cached across Streamlit reruns.

    Args:
        name: Pipeline identifier, passed unchanged to ``spacy.load``.

    Returns:
        The object returned by ``spacy.load(name)``.
    """
    return spacy.load(name)
# `ignore_hash` was renamed to `allow_output_mutation`; the old keyword is no
# longer accepted by `st.cache`.
@st.cache(allow_output_mutation=True)
def process_text(model_name, text):
    """Run `text` through the named pipeline and return the result, cached.

    Args:
        model_name: Pipeline identifier handed to ``load_model``.
        text: The raw text to process.

    Returns:
        Whatever the loaded pipeline returns when called on ``text``.
    """
    nlp = load_model(model_name)
    return nlp(text)
# --- Sidebar introduction -------------------------------------------------
st.sidebar.title("Interactive spaCy visualizer")
st.sidebar.markdown(
    """
Process text with [spaCy](https://spacy.io) models and visualize named entities,
dependencies and more. Uses spaCy's built-in
[displaCy](http://spacy.io/usage/visualizers) visualizer under the hood.
"""
)

# --- Model selection ------------------------------------------------------
spacy_model = st.sidebar.selectbox("Model name", SPACY_MODEL_NAMES)
# Show a temporary notice while the model loads, then remove it again.
model_load_state = st.info(f"Loading model '{spacy_model}'...")
nlp = load_model(spacy_model)
model_load_state.empty()

# --- Text input -----------------------------------------------------------
text = st.text_area("Text to analyze", DEFAULT_TEXT)
doc = process_text(spacy_model, text)
# Dependency visualisation is only shown when the pipeline has a parser.
if "parser" in nlp.pipe_names:
    st.header("Dependency Parse & Part-of-speech tags")
    st.sidebar.header("Dependency Parse")
    split_sents = st.sidebar.checkbox("Split sentences", value=True)
    collapse_punct = st.sidebar.checkbox("Collapse punctuation", value=True)
    collapse_phrases = st.sidebar.checkbox("Collapse phrases")
    compact = st.sidebar.checkbox("Compact mode")
    # Rendering options forwarded to displaCy.
    options = {
        "collapse_punct": collapse_punct,
        "collapse_phrases": collapse_phrases,
        "compact": compact,
    }
    # Either one standalone Doc per sentence, or the whole document at once.
    docs = [span.as_doc() for span in doc.sents] if split_sents else [doc]
    for sent in docs:
        html = displacy.render(sent, style="dep", options=options)
        # Double newlines seem to mess with the rendering
        html = html.replace("\n\n", "\n")
        # Quote the sentence above its parse only when there are several.
        if split_sents and len(docs) > 1:
            st.markdown(f"> {sent.text}")
        st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
# Entity visualisation is only shown when the pipeline has an NER component.
if "ner" in nlp.pipe_names:
    st.header("Named Entities")
    st.sidebar.header("Named Entities")
    ner_labels = nlp.get_pipe("ner").labels
    # Preselect only the labels this model actually defines: label schemes
    # differ between models, and Streamlit's multiselect rejects default
    # values that are not among the options.
    default_labels = [
        label for label in ["PERSON", "ORG", "GPE", "LOC"] if label in ner_labels
    ]
    labels = st.sidebar.multiselect("Entity labels", ner_labels, default_labels)
    html = displacy.render(doc, style="ent", options={"ents": labels})
    # Newlines seem to mess with the rendering
    html = html.replace("\n", " ")
    st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
    # Tabular view of the entities whose label is currently selected.
    attrs = ["text", "label_", "start", "end", "start_char", "end_char"]
    if "entity_linker" in nlp.pipe_names:
        attrs.append("kb_id_")
    data = [
        [str(getattr(ent, attr)) for attr in attrs]
        for ent in doc.ents
        if ent.label_ in labels
    ]
    df = pd.DataFrame(data, columns=attrs)
    st.dataframe(df)
# Category scores are only shown when the pipeline has a text classifier.
if "textcat" in nlp.pipe_names:
    st.header("Text Classification")
    st.markdown(f"> {text}")
    # One row per (label, score) pair from the processed document.
    df = pd.DataFrame(list(doc.cats.items()), columns=("Label", "Score"))
    st.dataframe(df)
# The similarity demo is shown only when the model's meta reports a non-zero
# vector width.
vector_size = nlp.meta.get("vectors", {}).get("width", 0)
if vector_size:
    st.header("Vectors & Similarity")
    st.code(nlp.meta["vectors"])
    text1 = st.text_input("Text or word 1", "apple")
    text2 = st.text_input("Text or word 2", "orange")
    doc1 = process_text(spacy_model, text1)
    doc2 = process_text(spacy_model, text2)
    similarity = doc1.similarity(doc2)
    # Scores above 0.5 are shown as a success box, the rest as an error box.
    if similarity > 0.5:
        st.success(similarity)
    else:
        st.error(similarity)
st.header("Token attributes")

# The table is built only when the button is pressed.
if st.button("Show token attributes"):
    # Token attributes to display, one table column each.
    attrs = [
        "idx",
        "text",
        "lemma_",
        "pos_",
        "tag_",
        "dep_",
        "head",
        "ent_type_",
        "ent_iob_",
        "shape_",
        "is_alpha",
        "is_ascii",
        "is_digit",
        "is_punct",
        "like_num",
    ]
    # Every value is stringified so all columns share one type.
    data = [[str(getattr(token, attr)) for attr in attrs] for token in doc]
    df = pd.DataFrame(data, columns=attrs)
    st.dataframe(df)
# JSON dump of the processed document, shown on demand.
st.header("JSON Doc")
if st.button("Show JSON Doc"):
    st.json(doc.to_json())

# JSON dump of the loaded model's meta data, shown on demand.
st.header("JSON model meta")
if st.button("Show JSON model meta"):
    st.json(nlp.meta)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Nice example!
The `ignore_hash` argument has been renamed to `allow_output_mutation` here.
This should be changed in lines 12 and 17.