Last active
April 10, 2020 23:14
-
-
Save sai-prasanna/e580408f9d26f73759b083f600667e22 to your computer and use it in GitHub Desktop.
Reverse Engineer SpaCy Patterns - Pattern Matching
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
pip install git+https://github.com/sai-prasanna/spacy-pattern-builder.git
pip install streamlit | |
python -m spacy download en # Or any other model you wish to use. | |
streamlit run streamlit_pattern_builder.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
""" | |
Reverse Engineer Spacy. | |
""" | |
import streamlit as st | |
import spacy | |
import time | |
from spacy import displacy | |
from spacy_pattern_builder import util | |
from spacy_pattern_builder import build_dependency_pattern | |
from spacy.matcher import DependencyMatcher | |
import json | |
# spaCy models offered in the sidebar; the German model lets users try a
# non-English parser.
SPACY_MODEL_NAMES = ["en_core_web_sm", "en_core_web_md", "de_core_news_sm"]
# Sentence pre-filled in the seed-text box on first load.
DEFAULT_SEED_SENTENCE = "CycleCycle introduced efficient methods for reverse engineering patterns from text."
# Token indices (into the default seed sentence) pre-ticked as the example selection.
DEFAULT_SEED_TOKEN_IDS = (0, 1, 3)
# Scrollable container for displaCy HTML so wide parse trees don't overflow the page.
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
# Sentences pre-filled in the pattern-testing box when the default seed is unchanged.
DEFAULT_TEST = """Mr. Robot introduced new methods for hacking ecorp.
The AI self-discovered an advanced method to automatically mine patterns.
They introduced efficient methods for forming patterns.
"""
@st.cache(allow_output_mutation=True)
def load_model(name):
    """Load the spaCy pipeline *name*, cached across Streamlit reruns."""
    model = spacy.load(name)
    return model
@st.cache(allow_output_mutation=True)
def process_text(model_name, text):
    """Parse *text* with the (cached) model and cache the resulting Doc."""
    pipeline = load_model(model_name)
    return pipeline(text)
def build_dep_pattern(sentence, match_tokens, feature_dict):
    """Thin wrapper over spacy_pattern_builder's build_dependency_pattern."""
    pattern = build_dependency_pattern(
        sentence, match_tokens, feature_dict=feature_dict
    )
    return pattern
def smallest_connected_subgraph(selected_tokens, sentence):
    """Return the smallest dependency subgraph connecting the selected tokens.

    With fewer than two tokens there is nothing to connect, so an empty
    list is returned without consulting the pattern-builder utility.
    """
    if len(selected_tokens) <= 1:
        return []
    return util.smallest_connected_subgraph(selected_tokens, sentence)
def build_selective_parse_tree(sentence, selected_tokens, pos_level):
    """Build a displaCy "manual"-format parse dict highlighting a selection.

    Every token of *sentence* appears as a word, but only tokens present in
    *selected_tokens* get a POS label and dependency arcs; all other tokens
    render with an empty tag so the selection stands out.

    Args:
        sentence: iterable of spaCy tokens (Doc or Span).
        selected_tokens: tokens whose tag and arcs should be rendered.
        pos_level: "low" for fine-grained tags (token.tag_), anything else
            for coarse POS (token.pos_).

    Returns:
        dict with "words" and "arcs" keys, suitable for
        displacy.render(..., manual=True).
    """
    # BUG FIX: the original nested ternary parsed as
    # `tag_ if low else (pos_ if selected else "")`, so in "low" mode every
    # token showed its tag whether selected or not. The selection check now
    # gates both tag levels.
    tag_attr = "tag_" if pos_level == "low" else "pos_"
    selected_parse_tree = {
        "words": [
            {
                "text": token.text,
                "tag": getattr(token, tag_attr) if token in selected_tokens else "",
            }
            for token in sentence
        ]
    }
    arcs = []
    for token in selected_tokens:
        # Skip the sentence root (its own head) and arcs that would leave
        # the selected subgraph.
        if token.head == token or token.head not in selected_tokens:
            continue
        if token.i < token.head.i:
            arcs.append(
                {"start": token.i, "end": token.head.i, "label": token.dep_, "dir": "left"}
            )
        else:
            arcs.append(
                {
                    "start": token.head.i,
                    "end": token.i,
                    "label": token.dep_,
                    "dir": "right",
                }
            )
    selected_parse_tree["arcs"] = arcs
    return selected_parse_tree
def render_manual_parse(selected_parse_tree, options):
    """Render a manual-format parse tree as inline HTML in the app."""
    markup = displacy.render(selected_parse_tree, manual=True, options=options)
    # Double newlines in displaCy's output break the rendering, so collapse
    # them before handing the HTML to Streamlit.
    markup = markup.replace("\n\n", "\n")
    st.write(HTML_WRAPPER.format(markup), unsafe_allow_html=True)
def main():
    """Streamlit app: interactively reverse engineer spaCy matcher patterns.

    Flow: pick a model -> parse a seed sentence -> select tokens -> build a
    dependency/POS pattern from the selection -> test the pattern against
    user-supplied sentences with DependencyMatcher.
    """
    st.sidebar.title("Interactive spaCy Reverse Engineering")
    st.sidebar.markdown(
        """
## Reverse engineer dependency and POS patterns from data.
Using my fork of [Spacy Pattern Builder](https://github.com/cyclecycle/spacy-pattern-builder) + [displaCy](https://explosion.ai/demos/displacy)
+ [Streamlit](streamlit.io/)
"""
    )
    spacy_model = st.sidebar.selectbox("Model name", SPACY_MODEL_NAMES)
    st.sidebar.header("POS")
    # "high" -> coarse universal POS (pos_); "low" -> fine-grained tags (tag_).
    pos_level = st.sidebar.radio("POS level", ("high", "low"))
    st.header("Dependency Parse & POS")
    st.sidebar.header("Dependency Parse")
    compact = st.sidebar.checkbox("Compact mode")
    model_load_state = st.info(f"Loading model '{spacy_model}'...")
    nlp = load_model(spacy_model)
    model_load_state.empty()
    st.header("Seed sentence")
    seed_text = st.text_area("Text", DEFAULT_SEED_SENTENCE)
    if not seed_text:
        st.error("Enter some seed text.")
        return
    doc = process_text(spacy_model, seed_text)
    if "parser" not in nlp.pipe_names:
        st.error("Dependency Parser not found for the selected model.")
        return
    options = {
        "compact": compact,
        "collapse_punct": False,
    }
    # Use only the first sentence, as a standalone Doc so token indices
    # start at 0.
    seed_sentence = next(doc.sents).as_doc()
    parse_tree = build_selective_parse_tree(seed_sentence, seed_sentence, pos_level)
    render_manual_parse(parse_tree, options)
    st.header("Select tokens that form a pattern.")
    # One checkbox per token; the example selection is only pre-ticked while
    # the seed text is untouched.
    selected_tokens = [token for token in seed_sentence if st.checkbox(token.text, key=f"{token.i}", value=seed_text==DEFAULT_SEED_SENTENCE and token.i in DEFAULT_SEED_TOKEN_IDS)]
    match_tokens = smallest_connected_subgraph(selected_tokens, seed_sentence)
    if not match_tokens:
        st.info("Select few more tokens")
        time.sleep(0.1)  # Sleep is needed for streamlit to sync first checkbox click for some odd reason.
        return
    # Printing Matched Subgraph
    st.header("Matched subgraph")
    parse_tree = build_selective_parse_tree(seed_sentence, match_tokens, pos_level)
    render_manual_parse(parse_tree, options)
    st.header("Token attributes to reverse engineer.")
    feature_dict = { }  # This here is equal to the default feature_dict
    if st.checkbox("Dependency graph", value=True):
        feature_dict['DEP'] = 'dep_'
    if st.checkbox("POS", value=True):
        if pos_level == "high":
            feature_dict['POS'] = 'pos_'
        else:
            feature_dict['TAG'] = 'tag_'
    if not feature_dict:
        st.error("Select at least one feature to build matcher.")
        return
    pattern = build_dep_pattern(seed_sentence, match_tokens, feature_dict=feature_dict)
    st.info("Pattern created successfully.")
    st.write("-----------------")
    if st.checkbox("Show/Edit Spacy matcher pattern"):
        # Round-trip through JSON so the user can hand-edit the pattern text.
        pattern = json.loads(st.text_area("Pattern", json.dumps(pattern)))
    st.header("Test your newly created pattern")
    test_sentences = st.text_area("Type/Paste some sentences.", value=DEFAULT_TEST if seed_text == DEFAULT_SEED_SENTENCE else '')
    matcher = DependencyMatcher(nlp.vocab)
    matcher.add('pattern', None, pattern)
    for sentence_id, sentence in enumerate(nlp(test_sentences).sents):
        st.write("----------")
        # Re-index each test sentence from 0 so matcher indices line up.
        sentence = sentence.as_doc()
        st.markdown(f"**Test Sentence {sentence_id}**: {sentence}")
        match_shown = False
        for match_id, token_idxs in matcher(sentence):
            if token_idxs:
                matched_tokens = [sentence[i] for i in token_idxs[0]]
                match_shown = True
                # Bold the matched tokens within the sentence text.
                st.markdown("**Match:** " + " ".join([
                    f"**{t.text}**" if t in matched_tokens else t.text
                    for t in sentence
                ]))
        if not match_shown:
            st.info("No Match")
        if st.checkbox('View Dependency parse', key=sentence_id):
            parse_tree = build_selective_parse_tree(sentence, sentence, pos_level)
            render_manual_parse(parse_tree, options)
if __name__ == "__main__":
    # Streamlit executes the script with __name__ == "__main__", so the app
    # still launches under `streamlit run`; the guard just makes the module
    # importable without side effects.
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment