Textual Topography
;; Textual topography
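;; Usage note (added for context): select a region (or you will be prompted for
;; text) and run M-x textual-topography, then pick a domain and a
;; characteristic / anti-characteristic pair. The text is written under
;; ~/Documents/, the Python script below is invoked on it, and the resulting
;; SVG and HTML plots are linked from a new org-mode buffer.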
(defun textual-topography ()
  "Prompt for domain, characteristic/anti-characteristic pair, and selected text.
Save the selected text to a file and call a Python script with the arguments."
  (interactive)
  ;; Define the domains and their associated characteristic/anti-characteristic pairs
  (setq domain-characteristics
        '(("Mood" . (("Happy" . "Unhappy") ("Calm" . "Agitated") ("Optimistic" . "Pessimistic")))
          ("Style" . (("Complex" . "Simple")))
          ("Personality" . (("Outgoing" . "Shy") ("Confident" . "Timid")))))
  ;; Prompt the user to select a domain
  (setq domain (completing-read "Select a domain: " (mapcar 'car domain-characteristics)))
  ;; Retrieve the list of characteristic/anti-characteristic pairs for the selected domain
  (setq char-anti-char-pairs (cdr (assoc domain domain-characteristics)))
  ;; Prompt the user to select a characteristic/anti-characteristic pair
  (setq selected-pair
        (completing-read "Select a characteristic/anti-characteristic: "
                         (mapcar (lambda (pair) (concat (car pair) " / " (cdr pair)))
                                 char-anti-char-pairs)))
  ;; Split the selected pair into characteristic and anti-characteristic
  (let ((selected-pair-split (assoc selected-pair
                                    (mapcar (lambda (pair)
                                              (cons (concat (car pair) " / " (cdr pair)) pair))
                                            char-anti-char-pairs))))
    (setq characteristic (car (cdr selected-pair-split)))
    (setq anti-characteristic (cdr (cdr selected-pair-split))))
  ;; Use the active region if there is one, otherwise ask for the text
  (if (use-region-p)
      (setq selected-text (buffer-substring-no-properties (region-beginning) (region-end)))
    (setq selected-text (read-string "Enter text manually: ")))
  ;; Define the file location outside of the let block so it stays accessible later
  (setq file-location (concat "~/Documents/" domain "-" characteristic "-" anti-characteristic ".txt"))
  ;; Save the selected text to a file in ~/Documents/
  (with-temp-file file-location
    (insert selected-text))
  (message "Text saved to %s" file-location)
  ;; Call the Python script with the four arguments
  (let ((python-script "~/repos/org-roam-ai/writing/textual_topography.py"))
    (shell-command (format "source ~/.zshrc && conda activate ml3 && python3 %s %s %s %s %s"
                           python-script domain characteristic anti-characteristic file-location)))
  ;; Close the *Shell Command Output* window if it is displayed
  (when (get-buffer-window "*Shell Command Output*")
    (delete-window (get-buffer-window "*Shell Command Output*")))
  ;; Create an org-mode buffer with the image and a link to the interactive plot
  (let ((svg-file (concat "~/Documents/minimalistic_" domain "_" characteristic ".svg"))
        (html-file (concat "~/Documents/" domain "_" characteristic ".html"))
        (org-buffer (generate-new-buffer "*Domain Characteristics*")))
    (switch-to-buffer org-buffer)
    (org-mode)
    ;; Add a title heading
    (insert (format "#+TITLE: Textual Topography: %s - %s / %s\n" domain characteristic anti-characteristic))
    ;; Insert the image using org-mode link syntax
    (insert (format "[[file:%s]]\n" svg-file))
    ;; Insert a link to the interactive HTML file
    (insert (format "[[file:%s]]\n" html-file))
    (org-toggle-inline-images)
    (message "Org buffer created with image and link.")))
import os
import re
import argparse
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import random
from langchain.llms import OpenAI
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
load_dotenv()
embedding = OpenAIEmbeddings(model="text-embedding-3-small")
def split_text_into_sentences(text):
    """
    Splits a given text into sentences using punctuation marks (. ! ?).

    Args:
        text (str): The text to be split into sentences.

    Returns:
        list of str: A list of sentences derived from the text.
    """
    # Use a regular expression to split the text based on punctuation
    pattern = r'(?<=[.!?])\s+'
    sentences = re.split(pattern, text)
    # Strip leading/trailing whitespace from each sentence and filter out empty ones
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    return sentences
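# For example (illustration only), split_text_into_sentences("Hello. How are you? Fine!")
# returns ["Hello.", "How are you?", "Fine!"]. Note that abbreviations such as "Mr." also
# trigger a split, since any ".", "!" or "?" followed by whitespace is treated as a
# sentence boundary.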
def get_embeddings(query):
    return embedding.embed_query(query)

def get_documents_embeddings(documents):
    return embedding.embed_documents(documents)
def build_prompt_to_follow_input_theme(domain, characteristic, input_sentences, n_samples=10):
    n_samples = min(len(input_sentences), n_samples)
    sentences_sample = random.sample(input_sentences, n_samples)
    sentences_prompt_example = "\n\n".join(sentences_sample)
    return (f"Generate an example of a sentence in which the {domain} is extremely {characteristic} "
            f"and that fits the theme of the following sentences: \n{sentences_prompt_example}")
def generate_instances_of_domain_and_characteristic(domain, characteristic, n=10, verbose=False, prompt=None):
    # Initialize the OpenAI model with a higher temperature for variety
    llm = OpenAI(temperature=0.8, max_tokens=200)
    # Define the prompt
    if not prompt:
        prompt = f"Generate an example of a sentence extracted from a literary classic novel in which the {domain} is {characteristic}."
    # Generate multiple examples by calling the model multiple times
    examples = [llm(prompt) for _ in range(n)]
    if verbose:
        # Print the generated examples
        for i, example in enumerate(examples, 1):
            print(f"Example {i}:\n{example}\n")
    return examples
def get_reference_embedding(sentences):
    """
    Generates a reference embedding from a list of sentences.

    Args:
        sentences (list of str): A list of sentences for which to compute the reference embedding.

    Returns:
        np.ndarray: The reference embedding obtained by averaging the sentence embeddings.
    """
    # Retrieve embeddings for the given sentences
    sentence_embeddings = get_documents_embeddings(sentences)
    # Compute the mean of all sentence embeddings to get a reference embedding
    reference_embedding = np.mean(sentence_embeddings, axis=0)
    return reference_embedding
def rescale_scores(scores, min_val=-1, max_val=1):
    min_score = min(scores)
    max_score = max(scores)
    # Handle the case where all scores are the same
    if min_score == max_score:
        # All scores are identical, so the choice of constant is arbitrary
        return [min_val] * len(scores)
    scaled_scores = [
        min_val + (score - min_score) * (max_val - min_val) / (max_score - min_score)
        for score in scores
    ]
    return scaled_scores
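# Worked example (illustration only): rescale_scores([0, 5, 10]) maps the minimum to -1,
# the maximum to 1 and interpolates linearly in between, returning [-1.0, 0.0, 1.0].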
def alignment_to_a_reference_embedding_normalized_by_difference(sentence_embeddings, reference_embedding, negative_reference_embedding):
    scores = []
    concept_difference = reference_embedding - negative_reference_embedding
    # Normalize the concept difference vector
    unit_vector = concept_difference / np.linalg.norm(concept_difference)
    for sentence_embedding in sentence_embeddings:
        # Calculate the projection of the segment on the unit vector
        score = np.dot(sentence_embedding - reference_embedding, unit_vector)
        scores.append(score)
    return scores
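# A minimal sketch (not part of the original script) of what the score above computes,
# using tiny made-up 2-D vectors instead of real embeddings: each sentence embedding is
# projected onto the axis pointing from the anti-characteristic reference towards the
# characteristic reference, so larger values mean the sentence sits further towards the
# characteristic pole. The _demo_* name is hypothetical and is never called by the script.
def _demo_projection_score():
    reference = np.array([1.0, 0.0])            # "characteristic" reference embedding
    negative_reference = np.array([-1.0, 0.0])  # "anti-characteristic" reference embedding
    sentence = np.array([0.5, 0.5])             # a sentence embedding
    unit_vector = (reference - negative_reference) / np.linalg.norm(reference - negative_reference)
    # Projection of (sentence - reference) on the concept axis: 0 means "as characteristic
    # as the reference itself", negative values drift towards the anti-characteristic pole.
    return np.dot(sentence - reference, unit_vector)  # -0.5 for these vectors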
def moving_average(data, window_size):
    """Calculate the moving average of the given data with the specified window size."""
    return np.convolve(data, np.ones(window_size) / window_size, mode='valid')
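# Note (illustration only): with mode='valid' the smoothed series is shorter than the
# input by window_size - 1, e.g. np.convolve([1, 2, 3, 4], np.ones(2) / 2, mode='valid')
# gives [1.5, 2.5, 3.5]. The plotting functions below truncate the sentence positions
# to match this shorter length.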
def calculate_sentence_positions(sentences):
    """Calculate cumulative word positions of the sentences."""
    positions = []
    current_position = 0
    for sentence in sentences:
        positions.append(current_position)
        current_position += len(sentence.split())  # Use word count for position
    return positions
def plot_scores(scores, sentences, domain, characteristic, window_size=10):
    """Plot smoothed scores with a moving average."""
    sentence_positions = calculate_sentence_positions(sentences)
    # Calculate the moving average of the scores
    smoothed_scores = moving_average(scores, window_size)
    # Adjust the sentence positions to match the length of the smoothed scores
    valid_positions = sentence_positions[:len(smoothed_scores)]
    plt.figure(figsize=(12, 6))
    plt.plot(valid_positions, smoothed_scores, marker='o', color='#5c3e99')
    plt.title(f"{domain} as {characteristic} score per sentence")
    plt.xlabel("Position in Text (Words)")
    plt.ylabel("Alignment with characteristic")
    plt.grid()
    # plt.show()
    plt.savefig(f"/Users/luis.moneda/Documents/{domain}_{characteristic}.svg")
def plot_scores_interactive(scores, sentences, domain, characteristic, window_size=10):
    """Plot smoothed scores with a moving average and an interactive hover feature."""
    sentence_positions = calculate_sentence_positions(sentences)
    # Calculate the moving average of the scores
    smoothed_scores = moving_average(scores, window_size)
    # Adjust the sentence positions to match the length of the smoothed scores
    valid_positions = sentence_positions[:len(smoothed_scores)]
    # Create the interactive plot
    fig = go.Figure()
    # Add the plot trace; each point shows its sentence on hover
    fig.add_trace(go.Scatter(
        x=valid_positions,
        y=smoothed_scores,
        mode='markers+lines',
        line=dict(color="#5c3e99"),
        hovertemplate=[
            f"{sentence}"
            for sentence in sentences[:len(smoothed_scores)]
        ],
    ))
    # Set the layout
    fig.update_layout(
        title=f"{domain} as {characteristic} score per sentence",
        xaxis_title="Position in Text (Words)",
        yaxis_title="Alignment with characteristic",
        template="plotly",
        hovermode="closest"
    )
    # Save the plot as an HTML file
    # fig.show()
    fig.write_html(os.path.expanduser(f'/Users/luis.moneda/Documents/{domain}_{characteristic}.html'))
def plot_scores_minimalistic(scores, sentences, domain, characteristic, anti_characteristic, window_size=10):
    """
    Plot smoothed scores with a moving average in a minimalist style.
    - No x-axis or y-axis ticks.
    - A dashed grey line at y=0 labeled as the "{characteristic} / {anti_characteristic} frontier".
    - No box around the plot (only the x axis is kept).
    """
    sentence_positions = calculate_sentence_positions(sentences)
    # Calculate the moving average of the scores
    smoothed_scores = moving_average(scores, window_size)
    # Adjust the sentence positions to match the length of the smoothed scores
    valid_positions = sentence_positions[:len(smoothed_scores)]
    plt.figure(figsize=(12, 6))
    # Plot the smoothed scores
    plt.plot(valid_positions,
             smoothed_scores,
             color='#5c3e99')
    # Add the dashed line for the frontier
    plt.axhline(0, color='grey', linestyle='--', linewidth=1, alpha=0.7, zorder=0)
    plt.text(
        valid_positions[-1], 0,
        f" {characteristic} / {anti_characteristic} frontier",
        color='grey', fontsize=10, va='bottom', ha='right', alpha=0.8
    )
    # Minimalist adjustments
    plt.title(f"Textual topography for {characteristic} {domain}", fontsize=16)
    plt.xlabel("Position in text")
    # Remove axis ticks
    plt.xticks([])
    plt.yticks([])
    # Remove the grid and the box (spines) around the plot
    plt.grid(False)
    for spine in ['top', 'right', 'left']:
        plt.gca().spines[spine].set_visible(False)
    # Emphasize the bottom spine, the only one left visible
    plt.gca().spines['bottom'].set_linewidth(1.5)
    plt.savefig(f"/Users/luis.moneda/Documents/minimalistic_{domain}_{characteristic}.svg",
                bbox_inches="tight")
def read_file(file_name):
    try:
        with open(file_name, 'r') as file:
            return file.read()
    except FileNotFoundError:
        return f"Error: The file '{file_name}' was not found."
    except IOError:
        return "Error: There was an issue reading the file."
def textual_topography(domain, characteristic, anti_characteristic, text_file):
    text = read_file(text_file)
    sentences = split_text_into_sentences(text)
    sentences_embeddings = get_documents_embeddings(sentences)
    n_samples = min(max(1, int(0.1 * len(sentences))), 20)
    prompt = build_prompt_to_follow_input_theme(domain,
                                                characteristic,
                                                sentences,
                                                n_samples=n_samples)
    # The themed prompt is currently disabled; the generic prompt inside
    # generate_instances_of_domain_and_characteristic is used instead
    prompt = None
    examples_for_reference = generate_instances_of_domain_and_characteristic(domain,
                                                                             characteristic,
                                                                             n=30,
                                                                             verbose=False,
                                                                             prompt=prompt)
    negative_prompt = build_prompt_to_follow_input_theme(domain,
                                                         anti_characteristic,
                                                         sentences,
                                                         n_samples=n_samples)
    negative_prompt = None
    negative_examples_for_reference = generate_instances_of_domain_and_characteristic(domain,
                                                                                      anti_characteristic,
                                                                                      n=30,
                                                                                      verbose=False,
                                                                                      prompt=negative_prompt)
    reference_embedding = get_reference_embedding(examples_for_reference)
    negative_reference_embedding = get_reference_embedding(negative_examples_for_reference)
    scores = alignment_to_a_reference_embedding_normalized_by_difference(sentences_embeddings,
                                                                         reference_embedding,
                                                                         negative_reference_embedding)
    scores = rescale_scores(scores)
    window_size = min(max(1, int(0.1 * len(sentences))), 15)
    plot_scores(scores,
                sentences,
                domain,
                characteristic,
                window_size=window_size)
    plot_scores_interactive(scores,
                            sentences,
                            domain,
                            characteristic,
                            window_size=window_size)
    plot_scores_minimalistic(scores,
                             sentences,
                             domain,
                             characteristic,
                             anti_characteristic,
                             window_size=window_size)
if __name__ == "__main__":
    # Set up the argument parser
    parser = argparse.ArgumentParser(description="Process domain, characteristic, anti-characteristic, and text-file-name.")
    # Add the arguments
    parser.add_argument('domain', type=str, help="Domain name")
    parser.add_argument('characteristic', type=str, help="Characteristic name")
    parser.add_argument('anti_characteristic', type=str, help="Anti-characteristic name")
    parser.add_argument('text_file', type=str, help="Text file location and name")
    # Parse the arguments
    args = parser.parse_args()
    # Call the main function with the parsed arguments
    textual_topography(args.domain, args.characteristic, args.anti_characteristic, args.text_file)
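# Example invocation (illustration only; the text path mirrors the file the Elisp
# command writes, and assumes the script is run from its own directory):
#   python3 textual_topography.py Mood Happy Unhappy ~/Documents/Mood-Happy-Unhappy.txt
# This produces Mood_Happy.svg, minimalistic_Mood_Happy.svg and Mood_Happy.html in the
# hard-coded /Users/luis.moneda/Documents/ output directory.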