Reverse Engineer SpaCy Patterns - Pattern Matching
pip install git+https://github.com/sai-prasanna/spacy-pattern-builder.git
pip install streamlit
python -m spacy download en # Or any other model you wish to use.
streamlit run streamlit_pattern_builder.py
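The app wraps spacy-pattern-builder's build_dependency_pattern, which turns the tokens you select into a list of node specs that spaCy 2.x's DependencyMatcher can consume. As a rough sketch of the output shape (the DEP/TAG values here are illustrative, not the exact output for any particular sentence):

pattern = [
    {"SPEC": {"NODE_NAME": "node1"}, "PATTERN": {"DEP": "ROOT", "TAG": "VBD"}},
    {"SPEC": {"NODE_NAME": "node0", "NBOR_NAME": "node1", "NBOR_RELOP": ">"},
     "PATTERN": {"DEP": "nsubj", "TAG": "NNP"}},
    {"SPEC": {"NODE_NAME": "node2", "NBOR_NAME": "node1", "NBOR_RELOP": ">"},
     "PATTERN": {"DEP": "dobj", "TAG": "NNS"}},
]

You can view and edit this JSON directly in the app before testing it against new sentences.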
# coding: utf-8
"""Reverse engineer spaCy dependency and POS patterns from example sentences."""
import streamlit as st
import spacy
import time
from spacy import displacy
from spacy_pattern_builder import util
from spacy_pattern_builder import build_dependency_pattern
from spacy.matcher import DependencyMatcher
import json
SPACY_MODEL_NAMES = ["en_core_web_sm", "en_core_web_md", "de_core_news_sm"]
DEFAULT_SEED_SENTENCE = "CycleCycle introduced efficient methods for reverse engineering patterns from text."
DEFAULT_SEED_TOKEN_IDS = (0, 1, 3)
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
DEFAULT_TEST = """Mr. Robot introduced new methods for hacking ecorp.
The AI self-discovered an advanced method to automatically mine patterns.
They introduced efficient methods for forming patterns.
"""
@st.cache(allow_output_mutation=True)
def load_model(name):
    return spacy.load(name)


@st.cache(allow_output_mutation=True)
def process_text(model_name, text):
    nlp = load_model(model_name)
    return nlp(text)


def build_dep_pattern(sentence, match_tokens, feature_dict):
    return build_dependency_pattern(sentence, match_tokens, feature_dict=feature_dict)


def smallest_connected_subgraph(selected_tokens, sentence):
    return util.smallest_connected_subgraph(selected_tokens, sentence) if len(selected_tokens) > 1 else []
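
# util.smallest_connected_subgraph returns the tokens of the smallest connected
# dependency subgraph containing every selected token, pulling in whatever
# intermediate heads are needed to connect them. As a rough example (the exact
# result depends on the parse): selecting only "CycleCycle" and "methods" in
# the default seed sentence should also pull in their shared head "introduced".
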
def build_selective_parse_tree(sentence, selected_tokens, pos_level):
    selected_parse_tree = {
        "words": [
            {
                "text": token.text,
                # Label only the selected tokens: fine-grained tags for the
                # "low" POS level, coarse universal POS tags otherwise.
                "tag": (token.tag_ if pos_level == "low" else token.pos_)
                if token in selected_tokens
                else "",
            }
            for token in sentence
        ]
    }
    arcs = []
    for token in selected_tokens:
        if token.head == token or token.head not in selected_tokens:
            continue
        if token.i < token.head.i:
            arcs.append(
                {"start": token.i, "end": token.head.i, "label": token.dep_, "dir": "left"}
            )
        else:
            arcs.append(
                {
                    "start": token.head.i,
                    "end": token.i,
                    "label": token.dep_,
                    "dir": "right",
                }
            )
    selected_parse_tree["arcs"] = arcs
    return selected_parse_tree
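
# For reference, render_manual_parse below feeds displaCy's documented
# "manual" render format, e.g.:
# {
#     "words": [{"text": "This", "tag": "DT"}, ...],
#     "arcs": [{"start": 0, "end": 1, "label": "nsubj", "dir": "left"}, ...],
# }
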
def render_manual_parse(selected_parse_tree, options):
    html = displacy.render(selected_parse_tree, manual=True, options=options)
    # Double newlines seem to mess with the rendering.
    html = html.replace("\n\n", "\n")
    st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
def main():
    st.sidebar.title("Interactive spaCy Reverse Engineering")
    st.sidebar.markdown(
        """
## Reverse engineer dependency and POS patterns from data.

Using my fork of [Spacy Pattern Builder](https://github.com/cyclecycle/spacy-pattern-builder)
+ [displaCy](https://explosion.ai/demos/displacy) + [Streamlit](https://streamlit.io/)
        """
    )
    spacy_model = st.sidebar.selectbox("Model name", SPACY_MODEL_NAMES)
    st.sidebar.header("POS")
    pos_level = st.sidebar.radio("POS level", ("high", "low"))
    st.header("Dependency Parse & POS")
    st.sidebar.header("Dependency Parse")
    compact = st.sidebar.checkbox("Compact mode")
    model_load_state = st.info(f"Loading model '{spacy_model}'...")
    nlp = load_model(spacy_model)
    model_load_state.empty()

    st.header("Seed sentence")
    seed_text = st.text_area("Text", DEFAULT_SEED_SENTENCE)
    if not seed_text:
        st.error("Enter some seed text.")
        return
    doc = process_text(spacy_model, seed_text)
    if "parser" not in nlp.pipe_names:
        st.error("Dependency parser not found for the selected model.")
        return
    options = {
        "compact": compact,
        "collapse_punct": False,
    }
    seed_sentence = next(doc.sents).as_doc()
    parse_tree = build_selective_parse_tree(seed_sentence, seed_sentence, pos_level)
    render_manual_parse(parse_tree, options)
st.header("Select tokens that form a pattern.")
selected_tokens = [token for token in seed_sentence if st.checkbox(token.text, key=f"{token.i}", value=seed_text==DEFAULT_SEED_SENTENCE and token.i in DEFAULT_SEED_TOKEN_IDS)]
match_tokens = smallest_connected_subgraph(selected_tokens, seed_sentence)
if not match_tokens:
st.info("Select few more tokens")
time.sleep(0.1) # Sleep is needed for streamlit to sync first checkbox click for some odd reason.
return
    # Show the matched subgraph.
    st.header("Matched subgraph")
    parse_tree = build_selective_parse_tree(seed_sentence, match_tokens, pos_level)
    render_manual_parse(parse_tree, options)

    st.header("Token attributes to reverse engineer.")
    feature_dict = {}  # Checking both boxes below reproduces the library's default feature_dict.
    if st.checkbox("Dependency graph", value=True):
        feature_dict["DEP"] = "dep_"
    if st.checkbox("POS", value=True):
        if pos_level == "high":
            feature_dict["POS"] = "pos_"
        else:
            feature_dict["TAG"] = "tag_"
    if not feature_dict:
        st.error("Select at least one feature to build the matcher.")
        return
    pattern = build_dep_pattern(seed_sentence, match_tokens, feature_dict=feature_dict)
    st.info("Pattern created successfully.")
st.write("-----------------")
if st.checkbox("Show/Edit Spacy matcher pattern"):
pattern = json.loads(st.text_area("Pattern", json.dumps(pattern)))
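
    # Note: this targets spaCy 2.x's DependencyMatcher, whose add() signature
    # is (key, on_match, pattern) and whose matches come back as
    # (match_id, token_id_lists) pairs, one index list per way the pattern
    # matched the sentence; hence the token_idxs[0] below.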
st.header("Test your newly created pattern")
test_sentences = st.text_area("Type/Paste some sentences.", value=DEFAULT_TEST if seed_text == DEFAULT_SEED_SENTENCE else '')
matcher = DependencyMatcher(nlp.vocab)
matcher.add('pattern', None, pattern)
for sentence_id, sentence in enumerate(nlp(test_sentences).sents):
st.write("----------")
sentence = sentence.as_doc()
st.markdown(f"**Test Sentence {sentence_id}**: {sentence}")
match_shown = False
for match_id, token_idxs in matcher(sentence):
if token_idxs:
matched_tokens = [sentence[i] for i in token_idxs[0]]
match_shown = True
st.markdown("**Match:** " + " ".join([
f"**{t.text}**" if t in matched_tokens else t.text
for t in sentence
]))
if not match_shown:
st.info("No Match")
if st.checkbox('View Dependency parse', key=sentence_id):
parse_tree = build_selective_parse_tree(sentence, sentence, pos_level)
render_manual_parse(parse_tree, options)
main()