# noema
import spacy
from spacy.matcher import Matcher
import math
import pandas as pd
# ============================================================================
# Installation Requirements:
# 1. pip install spacy
# 2. python -m spacy download en_core_web_sm
# ============================================================================
def setup_nlp():
    """Initialize spaCy language model."""
    try:
        nlp = spacy.load("en_core_web_sm")
    except OSError:
        raise RuntimeError(
            "spaCy model 'en_core_web_sm' not found. "
            "Install it with: python -m spacy download en_core_web_sm"
        )
    return nlp
def create_matcher(nlp):
    """Create and configure the Matcher with action-oriented patterns."""
    matcher = Matcher(nlp.vocab)
    # Patterns use lemmas and POS for robustness across word forms
    matcher.add("SHOULD_VERB", [[{"LEMMA": "should"}, {"POS": "VERB"}]])
    matcher.add("NEED_TO", [[{"LEMMA": "need"}, {"LOWER": "to"}, {"POS": "VERB"}]])
    matcher.add("MUST_VERB", [[{"LEMMA": "must"}, {"POS": "VERB"}]])
    matcher.add("RECOMMEND", [[{"LEMMA": "recommend"}]])
    matcher.add("SUGGEST", [[{"LEMMA": "suggest"}]])
    matcher.add("IMPROVE", [[{"LEMMA": "improve"}]])
    matcher.add("FOLLOW_UP", [[{"LEMMA": "follow"}, {"LEMMA": "up"}]])
    return matcher
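
# A minimal sketch (not part of the original pipeline) showing how the
# patterns behave: one phrase can fire several patterns, e.g. "should improve"
# matches SHOULD_VERB while "improve" alone also matches IMPROVE, so matches
# can overlap and are counted separately. The sample sentence is hypothetical.
def demo_matcher(nlp, matcher):
    """Print the pattern name and matched span for a sample sentence."""
    doc = nlp("You should improve the booking system and follow up next week.")
    for match_id, start, end in matcher(doc):
        print(nlp.vocab.strings[match_id], "->", doc[start:end].text)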
def actionability_score(text, nlp, matcher):
    """
    Calculate an actionability score for a given text.

    Args:
        text (str): The input text to analyze
        nlp: spaCy language model
        matcher: Configured spaCy Matcher

    Returns:
        float: Actionability score between 0 and ~1
    """
    doc = nlp(text)
    matches = matcher(doc)
    # Count verbs using spaCy (more efficient and consistent than NLTK)
    verbs = sum(1 for token in doc if token.pos_ == "VERB")
    # Imperative heuristic: ROOT verb without subject
    imperatives = 0
    for sent in doc.sents:
        for token in sent:
            if token.dep_ == "ROOT" and token.pos_ == "VERB":
                has_subject = any(
                    child.dep_ in ("nsubj", "nsubjpass")
                    for child in token.children
                )
                if not has_subject:
                    imperatives += 1
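    # Example: in "Call me tomorrow", the ROOT verb "Call" has no nsubj or
    # nsubjpass child, so it counts as an imperative; in "You should call",
    # "You" attaches as nsubj, so it does not.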
    suggestions = len(matches)
    N = len(doc)  # Use spaCy token count for consistency
    # Avoid division by zero
    raw = (verbs + imperatives + suggestions) / (N + 1e-6)
    # Length-based sigmoid scaling (downweights very short texts)
    N_MID = 12
    K = 0.6
    scale = 1 / (1 + math.exp(-K * (N - N_MID)))
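    # Worked example of the scaling (illustrative): with K = 0.6, N_MID = 12,
    #   N = 5  -> scale = 1 / (1 + e^4.2)   ~ 0.015  (very short text near-zeroed)
    #   N = 12 -> scale = 0.5                        (midpoint)
    #   N = 30 -> scale = 1 / (1 + e^-10.8) ~ 1.0    (long text passes through)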
    return raw * scale
def analyze_text_detailed(text, nlp, matcher):
    """
    Analyze text and print a detailed breakdown of the scoring components.
    """
    doc = nlp(text)
    score = actionability_score(text, nlp, matcher)
    print(f"\n{'='*60}")
    print(f"Text: '{text}'")
    print(f"Tokens ({len(doc)}): {[token.text for token in doc]}")
    # Matcher matches
    matches = matcher(doc)
    print(f"\nMatcher matches ({len(matches)}):")
    for match_id, start, end in matches:
        pattern_name = nlp.vocab.strings[match_id]
        matched_text = doc[start:end].text
        print(f"  - {pattern_name}: '{matched_text}'")
    # Imperatives (same heuristic as in actionability_score)
    imperatives = []
    for sent in doc.sents:
        for token in sent:
            if token.dep_ == "ROOT" and token.pos_ == "VERB":
                has_subject = any(
                    child.dep_ in ("nsubj", "nsubjpass")
                    for child in token.children
                )
                if not has_subject:
                    imperatives.append(token.text)
    print(f"\nImperatives detected ({len(imperatives)}): {imperatives}")
    # Verb count
    verbs = sum(1 for token in doc if token.pos_ == "VERB")
    print(f"spaCy verb count: {verbs}")
    # Score breakdown (mirrors the computation in actionability_score)
    raw_numerator = verbs + len(imperatives) + len(matches)
    raw = raw_numerator / (len(doc) + 1e-6)
    scale = 1 / (1 + math.exp(-0.6 * (len(doc) - 12)))
    print("\nScore breakdown:")
    print(f"  Raw numerator (verbs + imperatives + matches): {raw_numerator}")
    print(f"  Token count (N): {len(doc)}")
    print(f"  Raw score: {raw:.4f}")
    print(f"  Length scale (sigmoid): {scale:.4f}")
    print(f"  Final actionability score: {score:.4f}")
    return score
# ============================================================================
# Unit Test Harness
# ============================================================================
if __name__ == "__main__":
    # Setup
    nlp = setup_nlp()
    matcher = create_matcher(nlp)
    data = pd.read_csv('fft.csv')  # Expects a CSV with a 'free_text' column of feedback comments
    data.dropna(subset=['free_text'], inplace=True)
    texts = data['free_text'].tolist()
    test_cases = texts[13400:14000]
    # Inline test cases covering various actionability levels
    # test_cases = [
    #     "I m considering the receptionist staff lie saying they ve completed a task when they haven t done also neglecting said that practice has nothing to do with me being stocked which is pretty pretty unbelievable Also I do expect the GP to call when they say they re going to if my appointment is scheduled for 12 to 12 10 why the hell they called when I m in the police station at 6 pm is beyond me I want the MRI results tomorrow they were sent on Monday",
    #     "I feel rather processed than listened to as a patient we have lost the GP patient relationship and it s quite sad it feels to me there is a loss of compassion among the Dr s most Good because they are overstretched or it might be generational Most have absolutely no idea about ones condition or couldn t be bothered reading ones history it feels impersonal and a waste of time doing a whole repetition of the problems one has as time is obviously limited it s also not holistic any more as you can only speak about one thing at a time Its not only in your health centre btw It seems rather systemic as I had a horrendous even dangerous experience while I was hospitalized recently the same for my partner who was just released from hospital there is a huge problem with communication and reporting it seems On a lighter note your reception staff has really stepped up and all for the better at least they know what they are doing now which wasn t the case months ago which was more than annoying Regarding the service you are providing it is prompt and reliable Look times have changed where your GP knew everything about you and your family and I was lucky to have had the privilege to experience a completely different approach to healthcare but those days a truly over and in my opinion it s a great shame",
    #     "Maybe we could consider updating the library someday",
    #     "Short text",
    #     "Fix bugs",  # Very short but actionable
    # ]
    print("Testing Actionability Score Function")
    print("="*60)
    results = []
    for text in test_cases:
        score = analyze_text_detailed(text, nlp, matcher)
        results.append((text, score))
    # Summary sorted by score
    print("\n" + "="*60)
    print("SUMMARY (sorted by actionability score):")
    print("="*60)
    for text, score in sorted(results, key=lambda x: x[1], reverse=True):
        truncated = text[:250] + "..." if len(text) > 250 else text
        print(f"{score:.4f} | {truncated}")
    print("\n" + "="*60)
    print("✅ Test completed successfully!")
"""
Theme Count Function
Counts distinct semantic themes in text based on noun chunk similarity.
Requires: pip install spacy sentence-transformers scikit-learn numpy
python -m spacy download en_core_web_sm
"""
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import spacy
from sentence_transformers import SentenceTransformer
def theme_count(text: str, nlp, embedder, min_dist: float = 0.35) -> int:
    """
    Estimate the number of distinct themes in a text.

    Args:
        text: Input text to analyze
        nlp: spaCy language model (e.g., en_core_web_sm)
        embedder: SentenceTransformer model for embeddings
        min_dist: Minimum semantic distance (1 - similarity) to count as
            different themes

    Returns:
        Integer estimate of theme count (>= 1)
    """
    # Extract content units (noun chunks); fall back to nouns if no chunks
    doc = nlp(text)
    chunks = [nc.text for nc in doc.noun_chunks]
    # Fallback: individual noun lemmas if no noun chunks found
    if not chunks:
        chunks = [t.lemma_ for t in doc if t.pos_ == "NOUN" and not t.is_stop]
    # Single theme if insufficient content units
    if len(chunks) < 2:
        return 1
    # Generate embeddings and compute the pairwise similarity matrix
    embs = embedder.encode(chunks, normalize_embeddings=True)
    sim = cosine_similarity(embs)
    # Count pairs that are sufficiently different
    # Condition: 1 - sim >= min_dist  <=>  sim <= (1 - min_dist)
    different_pairs = np.sum(np.triu(sim <= (1 - min_dist), k=1))
    # Heuristic: theme count based on ratio of different pairs to total chunks
    tc = max(1, int(1 + different_pairs / max(1, len(chunks))))
    return tc
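
# Worked example of the heuristic (illustrative numbers): with 4 noun chunks
# and min_dist = 0.35, a pair counts as "different" when cosine similarity
# <= 0.65. If all 6 upper-triangle pairs qualify, the estimate is
# max(1, int(1 + 6 / 4)) = 2 themes; if no pair qualifies, it stays at 1.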
# --- Self-contained test block ---
if __name__ == "__main__":
    # Model initialization (downloads on first run)
    print("Loading models...")
    try:
        nlp = spacy.load("en_core_web_sm")
    except OSError:
        print("spaCy model not found. Run: python -m spacy download en_core_web_sm")
        exit(1)
    try:
        embedder = SentenceTransformer('all-MiniLM-L6-v2')
    except Exception as e:
        print(f"Error loading SentenceTransformer: {e}")
        exit(1)
    # Test cases
    test_texts = [
        "The cat sat on the mat. Dogs run in the park.",
        "I object strongly to having to explain my symptons to a receptionist What happened to patient confidentiality.",
        "I live in Earl s court It took me a while to get in Chelsea. The nurse measured my blood pressure which I do daily at home. She said I had to review the medicines with pharmacist. I went back to the reception and the receptionist in an upset way asked my I had to see the pharmacist I explained the reasons and he said the pharmacist was not able to see me and she would phone me the day after. Perhaps before taking an appointment you should check if your employees are available On top of this the day after the pharmacist said that the medicine I am taking is not available in the UK so she cancelled my current prescription and closed the call I really struggle to believe that she could not prescribe any sort of other medicines from the sartan family Overall the clinic is not providing a good service",
        "The reception person did not acknowledge my presence so I waited until she was ready to communicate At that point I d expect a greeting and perhaps recognition I hadn t interrupted her computer interaction Not forthcoming As this was my first visit to this site I was directed to wait but had to ask where Fortunately I wasn t presenting in a stressed mind frame but this is a communication failure similar to what I remembered from the previous premises in SW10 so not promising",
        "One reception staff needs to be trained how to phares the sentences and speak softer",
        "Doctors nurse and the receptionist are helpful."
    ]
    # Run tests
    print("\n" + "="*60)
    print("THEME COUNT ANALYSIS")
    print("="*60)
    for i, text in enumerate(test_texts, 1):
        count = theme_count(text, nlp, embedder, min_dist=0.35)
        print(f"\nTest {i}: '{text[:100]}...'")
        print(f"  Estimated themes: {count}")