Skip to content

Instantly share code, notes, and snippets.

@DeNeutoy
Created February 17, 2020 17:41
Show Gist options
  • Save DeNeutoy/b20860b40b9fa9d33675893c56afde42 to your computer and use it in GitHub Desktop.
Save DeNeutoy/b20860b40b9fa9d33675893c56afde42 to your computer and use it in GitHub Desktop.
scispacy demo
import streamlit as st
import spacy
from spacy import displacy
import pandas as pd
from scispacy.umls_linking import UmlsEntityLinker
from scispacy.abbreviation import AbbreviationDetector
# General-purpose scispaCy pipelines offered in the "Model name" selectbox.
SPACY_MODEL_NAMES = [
    "en_core_sci_sm",
    "en_core_sci_md",
    "en_core_sci_lg",
]

# Specialised NER pipelines offered in the "NER Model" selectbox.
NER_MODEL_NAMES = [
    "en_ner_craft_md",
    "en_ner_jnlpba_md",
    "en_ner_bc5cdr_md",
    "en_ner_bionlp13cg_md",
]

# Text pre-filled in the input area.
DEFAULT_TEXT = (
    "Spinal and bulbar muscular atrophy (SBMA) is an inherited motor neuron "
    "disease caused by the expansion of a polyglutamine tract within the "
    "androgen receptor (AR). SBMA can be caused by this easily."
)

# Bordered, horizontally scrollable container; "{}" receives the markup
# produced by displaCy.
HTML_WRAPPER = (
    '<div style="overflow-x: auto; border: 1px solid #e6e9ef; '
    'border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">'
    "{}</div>"
)
@st.cache(allow_output_mutation=True)
def load_model(name):
    """Load the spaCy pipeline *name* and append an abbreviation detector.

    Wrapped in ``st.cache`` so Streamlit can reuse the result across script
    reruns instead of loading the model again.

    Args:
        name: Model name passed straight to ``spacy.load``.

    Returns:
        The loaded pipeline with an ``AbbreviationDetector`` added to it.
    """
    pipeline = spacy.load(name)
    # The detector is constructed from the pipeline it is then added to.
    pipeline.add_pipe(AbbreviationDetector(pipeline))
    return pipeline
@st.cache(allow_output_mutation=True)
def process_text(model_name, text):
    """Run *text* through the pipeline named *model_name*.

    Args:
        model_name: Name of the model, resolved through ``load_model``.
        text: Raw text to process.

    Returns:
        Whatever the loaded pipeline returns when called on ``text``.
    """
    return load_model(model_name)(text)
@st.cache(allow_output_mutation=True)
def load_linker():
    """Create the UMLS entity linker with abbreviation resolution enabled.

    Returns:
        A ``UmlsEntityLinker`` built with ``resolve_abbreviations=True``.
    """
    return UmlsEntityLinker(resolve_abbreviations=True)
def _load_with_notice(message, model_name):
    """Show *message* while ``load_model(model_name)`` runs, then clear it."""
    notice = st.info(message)
    model = load_model(model_name)
    notice.empty()
    return model


# Page title and sidebar introduction.
st.title("Scispacy - Demo")
st.sidebar.markdown(
    """
Analyze text with [ScispaCy](https://allenai.github.io/scispacy/) models and visualize entity linking, named entities,
dependencies and more.
"""
)

# General-purpose pipeline chosen in the sidebar, plus the entity linker.
spacy_model = st.sidebar.selectbox("Model name", SPACY_MODEL_NAMES)
nlp = _load_with_notice(f"Loading model '{spacy_model}'...", spacy_model)
linker = load_linker()

# Entity-linking controls; the slider value is written onto the linker.
st.sidebar.header("Entity Linking")
threshold = st.sidebar.slider(
    "Mention Threshold", min_value=0.0, max_value=1.0, value=0.85
)
linker.threshold = threshold
show_only_top = st.sidebar.checkbox("Show only top entity per mention", value=True)

# Specialised NER pipeline chosen in the sidebar.
st.sidebar.header("Specialized NER")
ner_model = st.sidebar.selectbox("NER Model", NER_MODEL_NAMES)
ner = _load_with_notice(f"Loading NER Model '{ner_model}'...", ner_model)

# Text input; the same text is processed by both selected pipelines.
st.header("Enter some text:")
text = st.text_area("", DEFAULT_TEXT)
doc = process_text(spacy_model, text)
ner_doc = process_text(ner_model, text)
# Entity linking: highlighted mentions followed by a table of linked concepts.
st.header("Entity Linking")
st.markdown("Mentions are detected with the standard pipeline's mention detector.")
# Newlines in the displaCy markup seem to upset the rendering, so they are
# replaced with spaces before the markup is embedded.
entity_markup = displacy.render(doc, style="ent").replace("\n", " ")
st.write(HTML_WRAPPER.format(entity_markup), unsafe_allow_html=True)

# One row per (mention, candidate concept) pair.
linked_rows = []
for mention in linker(doc).ents:
    for cui, score in mention._.umls_ents:
        concept = linker.umls.cui_to_entity[cui]
        linked_rows.append(
            [
                mention.text,
                concept.canonical_name,
                cui,
                ",".join(concept.types),
                score,
                mention.start,
                mention.end,
            ]
        )
        if show_only_top:
            # Keep only the first candidate listed for this mention.
            break

columns = ["text", "Canonical Name", "Concept ID", "TUI(s)", "Score", "start", "end"]
linked_table = pd.DataFrame(linked_rows, columns=columns)
# Left-align both the cells and the header row.
left_aligned = linked_table.style.set_properties(**{"text-align": "left"})
left_aligned.set_table_styles([{"selector": "th", "props": [("text-align", "left")]}])
st.markdown("Entities are linked to the Unified Medical Language System (UMLS).")
st.table(left_aligned)
# Specialised NER: highlighted entities followed by a table of their spans.
st.header("Specialized NER")
available_labels = ner.get_pipe("ner").labels
selected_labels = st.sidebar.multiselect(
    "Entity labels",
    available_labels,  # Options.
    available_labels,  # Default to all selected.
)
ner_markup = displacy.render(
    ner_doc, style="ent", options={"ents": selected_labels}
)
# Newlines in the displaCy markup seem to upset the rendering.
ner_markup = ner_markup.replace("\n", " ")
st.write(HTML_WRAPPER.format(ner_markup), unsafe_allow_html=True)

# Every entity of the NER doc is listed; each attribute is shown as a string.
span_fields = ["text", "label_", "start", "end", "start_char", "end_char"]
span_rows = [
    [str(getattr(span, field)) for field in span_fields] for span in ner_doc.ents
]
st.dataframe(pd.DataFrame(span_rows, columns=span_fields))
st.header("Dependency Parse & Part-of-speech tags")
# A checkbox is used as the toggle instead of st.button: a button is only True
# on the single rerun triggered by its click, so toggling any of the sidebar
# options created below would rerun the script with the button False and make
# both the options and the parse disappear. A checkbox keeps its state, so the
# section stays open while the options are adjusted.
if st.checkbox("Show Parser and Tagger"):
    st.sidebar.header("Dependency Parse")
    split_sents = st.sidebar.checkbox("Split sentences", value=True)
    collapse_punct = st.sidebar.checkbox("Collapse punctuation", value=True)
    collapse_phrases = st.sidebar.checkbox("Collapse phrases")
    compact = st.sidebar.checkbox("Compact mode")
    options = {
        "collapse_punct": collapse_punct,
        "collapse_phrases": collapse_phrases,
        "compact": compact,
    }
    # One document per sentence when splitting, otherwise the whole doc.
    docs = [span.as_doc() for span in doc.sents] if split_sents else [doc]
    # The sentence text is only echoed when there are several separate parses.
    show_sentence_text = split_sents and len(docs) > 1
    for sent in docs:
        html = displacy.render(sent, options=options)
        # Double newlines seem to mess with the rendering
        html = html.replace("\n\n", "\n")
        if show_sentence_text:
            st.markdown(f"> {sent.text}")
        st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
# Token attributes: shown as a table when the button is clicked.
st.header("Token attributes")
if st.button("Show token attributes"):
    token_fields = [
        # Position and surface form.
        "idx", "text",
        # Linguistic annotations.
        "lemma_", "pos_", "tag_", "dep_", "head",
        # Entity annotations.
        "ent_type_", "ent_iob_",
        # Shape and lexical flags.
        "shape_", "is_alpha", "is_ascii", "is_digit", "is_punct", "like_num",
    ]
    # Each attribute value is shown as a string.
    token_rows = [
        [str(getattr(token, field)) for field in token_fields] for token in doc
    ]
    st.dataframe(pd.DataFrame(token_rows, columns=token_fields))
# Two JSON viewers with the same layout: a header, then a button that shows
# the payload. Payloads are callables so nothing is computed until the
# corresponding button is clicked.
for heading, button_label, make_payload in (
    ("JSON Doc", "Show JSON Doc", doc.to_json),
    ("JSON model meta", "Show JSON model meta", lambda: nlp.meta),
):
    st.header(heading)
    if st.button(button_label):
        st.json(make_payload())
# Sidebar credits.
acknowledgements_markdown = """
Adapted from [Ines Montani](https://ines.io/)'s brilliant [spaCy](https://spacy.io) Streamlit Demo to work with Scispacy.
Scispacy is maintained by [Mark Neumann](https://markneumann.xyz) and [Daniel King](https://www.linkedin.com/in/daniel-king-2190a595) at the [Allen Institute for Artificial Intelligence](https://allenai.org/).
"""
st.sidebar.header("Acknowledgements")
st.sidebar.markdown(acknowledgements_markdown)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment