Created
January 4, 2025 15:43
-
-
Save lgmoneda/dc02923c165689e5ac72902aee08e512 to your computer and use it in GitHub Desktop.
Textual Topography
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Textual topography | |
(defun textual-topography ()
  "Prompt for domain, characteristic/anti-characteristic pair, and selected text.
Save selected text to a file and call a Python script with the arguments.

The text is the active region, or a string read from the minibuffer
when no region is active.  After the script has run, open an Org buffer
that links to the SVG and HTML files the script is expected to write
under ~/Documents/."
  (interactive)
  ;; Bind everything locally with `let*' so the command does not leak
  ;; global variables such as `domain' or `selected-text'.
  (let* ((domain-characteristics
          ;; Domains and their characteristic/anti-characteristic pairs.
          '(("Mood" . (("Happy" . "Unhappy") ("Calm" . "Agitated") ("Optimistic" . "Pessimist")))
            ("Style" . (("Complex" . "Simple")))
            ("Personality" . (("Outgoing" . "Shy") ("Confident" . "Timid")))))
         (domain (completing-read "Select a domain: "
                                  (mapcar #'car domain-characteristics)
                                  nil t))
         ;; Alist mapping each "Characteristic / Anti" label to its pair,
         ;; used both as the completion table and to decode the answer.
         (labelled-pairs
          (mapcar (lambda (pair)
                    (cons (concat (car pair) " / " (cdr pair)) pair))
                  (cdr (assoc domain domain-characteristics))))
         (selected-pair
          (cdr (assoc (completing-read
                       "Select a characteristic/anti-characteristic: "
                       labelled-pairs nil t)
                      labelled-pairs)))
         (characteristic (car selected-pair))
         (anti-characteristic (cdr selected-pair))
         (selected-text
          (if (use-region-p)
              (buffer-substring-no-properties (region-beginning) (region-end))
            (read-string "Enter text manually: ")))
         ;; Expand "~" here: `shell-quote-argument' would escape it and
         ;; the shell would then no longer expand it.
         (file-location
          (expand-file-name
           (concat "~/Documents/" domain "-" characteristic "-" anti-characteristic ".txt")))
         (python-script
          (expand-file-name "~/repos/org-roam-ai/writing/textual_topography.py"))
         (svg-file (concat "~/Documents/minimalistic_" domain "_" characteristic ".svg"))
         (html-file (concat "~/Documents/" domain "_" characteristic ".html")))
    ;; An empty answer at either prompt leaves nothing to score.
    (unless selected-pair
      (user-error "No characteristic/anti-characteristic pair selected"))
    ;; Save the selected text to a file in ~/Documents/
    (with-temp-file file-location
      (insert selected-text))
    (message "Text saved to %s" file-location)
    ;; Call the Python script with the four arguments.  Every argument is
    ;; shell-quoted so spaces or metacharacters cannot break the command.
    (shell-command
     (format "source ~/.zshrc && conda activate ml3 && python3 %s %s %s %s %s"
             (shell-quote-argument python-script)
             (shell-quote-argument domain)
             (shell-quote-argument characteristic)
             (shell-quote-argument anti-characteristic)
             (shell-quote-argument file-location)))
    ;; Hide the shell output window, but only if one was actually shown:
    ;; short output goes to the echo area, and (delete-window nil) would
    ;; delete the selected window instead.
    (let ((output-window (get-buffer-window "*Shell Command Output*")))
      (when (and output-window (not (one-window-p)))
        (delete-window output-window)))
    ;; Create the org-mode buffer with an image and link
    (switch-to-buffer (generate-new-buffer "*Domain Characteristics*"))
    (org-mode)
    (insert (format "#+TITLE: Textual Topography: %s - %s / %s\n"
                    domain characteristic anti-characteristic))
    ;; Insert the image using org-mode syntax
    (insert (format "[[file:%s]]\n" svg-file))
    ;; Insert the link to the HTML file
    (insert (format "[[file:%s]]\n" html-file))
    (org-toggle-inline-images)
    (message "Org buffer created with image and link.")))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
import argparse | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import plotly.graph_objects as go | |
import random | |
from langchain.llms import OpenAI | |
from langchain_openai import OpenAIEmbeddings | |
from dotenv import load_dotenv | |
# Load variables from a .env file into the environment before any client is
# created (presumably this is where the OpenAI API key comes from — confirm).
load_dotenv()
# Module-level embedding client shared by get_embeddings() and
# get_documents_embeddings().
embedding = OpenAIEmbeddings(model="text-embedding-3-small")
def split_text_into_sentences(text):
    """
    Break a text into sentences at whitespace that follows '.', '!' or '?'.

    Args:
        text (str): The text to be split into sentences.

    Returns:
        list of str: The sentences, each stripped of surrounding whitespace;
        pieces that are empty after stripping are dropped.
    """
    # The look-behind keeps the terminating punctuation on the sentence.
    trimmed_pieces = (piece.strip() for piece in re.split(r'(?<=[.!?])\s+', text))
    return [piece for piece in trimmed_pieces if piece]
def get_embeddings(query):
    """Embed a single query string with the module-level ``embedding`` client.

    Returns whatever ``embedding.embed_query`` returns for ``query``.
    """
    query_vector = embedding.embed_query(query)
    return query_vector
def get_documents_embeddings(documents):
    """Embed a batch of texts with the module-level ``embedding`` client.

    Returns whatever ``embedding.embed_documents`` returns for ``documents``.
    """
    document_vectors = embedding.embed_documents(documents)
    return document_vectors
def build_prompt_to_follow_input_theme(domain, characteristic, input_sentences, n_samples=10):
    """Build an LLM prompt asking for a sentence that matches the input's theme.

    Args:
        domain (str): The domain named in the prompt (e.g. "Mood").
        characteristic (str): The characteristic the generated sentence should show.
        input_sentences (list of str): Sentences to sample theme examples from.
        n_samples (int): Maximum number of sentences to sample; capped at
            ``len(input_sentences)``.

    Returns:
        str: The prompt, ending with the randomly sampled sentences separated
        by blank lines.
    """
    sample_size = min(len(input_sentences), n_samples)
    theme_examples = "\n\n".join(random.sample(input_sentences, sample_size))
    return f"Generate an example of sentence that the {domain} is extremely {characteristic} and it fits the theme from the following sentences: \n {theme_examples}"
def generate_instances_of_domain_and_characteristic(domain, characteristic, n=10, verbose=False, prompt=None):
    """Ask the LLM ``n`` times for an example sentence and collect the answers.

    Args:
        domain (str): The domain named in the default prompt.
        characteristic (str): The characteristic named in the default prompt.
        n (int): Number of independent LLM calls to make.
        verbose (bool): When true, print every generated example.
        prompt (str, optional): Prompt to send instead of the default one;
            a falsy value selects the default.

    Returns:
        list: One LLM response per call, in call order.
    """
    # Fall back to the built-in prompt when none (or an empty one) is given.
    prompt = prompt or f"Generate an example of sentence extracted from a literary classic novel in which the {domain} is {characteristic}."
    # A higher temperature gives variety across the repeated calls.
    llm = OpenAI(temperature=0.8, max_tokens=200)
    examples = [llm(prompt) for _ in range(n)]
    if verbose:
        for number, example in enumerate(examples, start=1):
            print(f"Example {number}:\n{example}\n")
    return examples
def get_reference_embedding(sentences):
    """
    Build one reference embedding by averaging the embeddings of several sentences.

    Args:
        sentences (list of str): Sentences to embed via ``get_documents_embeddings``.

    Returns:
        np.ndarray: The element-wise mean (over axis 0) of the sentence embeddings.
    """
    return np.mean(get_documents_embeddings(sentences), axis=0)
def rescale_scores(scores, min_val=-1, max_val=1):
    """Linearly rescale scores so the smallest maps to ``min_val`` and the largest to ``max_val``.

    Args:
        scores: Iterable of numeric scores.
        min_val: Value assigned to the lowest score.
        max_val: Value assigned to the highest score.

    Returns:
        list: The rescaled scores in input order. An empty input yields an
        empty list; when all scores are equal every entry is ``min_val``.
    """
    # Materialize once so one-shot iterables survive the min/max passes.
    scores = list(scores)
    if not scores:
        # min()/max() would raise on an empty sequence.
        return []
    min_score = min(scores)
    max_score = max(scores)
    # All scores equal: the range is zero, so avoid dividing by it.
    if min_score == max_score:
        return [min_val] * len(scores)
    return [
        min_val + (score - min_score) * (max_val - min_val) / (max_score - min_score)
        for score in scores
    ]
def alignment_to_a_reference_embedding_normalized_by_difference(sentence_embeddings, reference_embedding, negative_reference_embedding):
    """Project each sentence embedding onto the positive-minus-negative reference axis.

    Args:
        sentence_embeddings: Iterable of embedding vectors to score.
        reference_embedding (np.ndarray): Embedding of the positive concept.
        negative_reference_embedding (np.ndarray): Embedding of the opposite concept.

    Returns:
        list: One score per sentence embedding: the dot product of
        (embedding - reference_embedding) with the unit vector pointing from
        the negative reference towards the positive reference.
    """
    axis = reference_embedding - negative_reference_embedding
    # Normalize so the scores are lengths along the axis.
    axis_unit = axis / np.linalg.norm(axis)
    return [
        np.dot(sentence_vector - reference_embedding, axis_unit)
        for sentence_vector in sentence_embeddings
    ]
def moving_average(data, window_size):
    """Return the moving average of ``data`` over windows of ``window_size`` points.

    Uses ``np.convolve`` in 'valid' mode with a uniform kernel.
    """
    uniform_kernel = np.ones(window_size) / window_size
    return np.convolve(data, uniform_kernel, mode='valid')
def calculate_sentence_positions(sentences):
    """Return, for each sentence, the number of words that precede it in the text.

    Words are counted by splitting each sentence on whitespace, so the first
    sentence is at position 0.
    """
    # Running totals: offsets[i] is the word offset of sentence i, and the
    # final entry (total word count) is dropped before returning.
    offsets = [0]
    for sentence in sentences:
        offsets.append(offsets[-1] + len(sentence.split()))
    return offsets[:-1]
def plot_scores(scores, sentences, domain, characteristic, window_size=10, output_dir="~/Documents"):
    """Plot smoothed scores with a moving average and save the figure as an SVG.

    Args:
        scores: Per-sentence scores, in the same order as ``sentences``.
        sentences (list of str): The scored sentences; their cumulative word
            counts give the x positions.
        domain (str): Domain name, used in the title and file name.
        characteristic (str): Characteristic name, used in the title and file name.
        window_size (int): Moving-average window, in sentences.
        output_dir (str): Directory the file is written to; a leading ``~``
            is expanded to the current user's home directory.

    The figure is written to ``{output_dir}/{domain}_{characteristic}.svg``.
    """
    sentence_positions = calculate_sentence_positions(sentences)
    # Calculate the moving average of the scores
    smoothed_scores = moving_average(scores, window_size)
    # Adjust the sentence positions to match the length of smoothed scores
    valid_positions = sentence_positions[:len(smoothed_scores)]
    fig = plt.figure(figsize=(12, 6))
    plt.plot(valid_positions, smoothed_scores, marker='o', color='#5c3e99')
    plt.title(f"{domain} as {characteristic} score per sentence")
    plt.xlabel("Position in Text (Words)")
    plt.ylabel("Alignment with characteristic")
    plt.grid()
    # Resolve the path from the user's home instead of a hard-coded account,
    # matching the ~/Documents location the Emacs command reads from.
    output_path = os.path.join(os.path.expanduser(output_dir), f"{domain}_{characteristic}.svg")
    plt.savefig(output_path)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
def plot_scores_interactive(scores, sentences, domain, characteristic, window_size=10, output_dir="~/Documents"):
    """Plot smoothed scores with a moving average and an interactive hover feature.

    Args:
        scores: Per-sentence scores, in the same order as ``sentences``.
        sentences (list of str): The scored sentences; each plotted point
            shows its sentence on hover.
        domain (str): Domain name, used in the title and file name.
        characteristic (str): Characteristic name, used in the title and file name.
        window_size (int): Moving-average window, in sentences.
        output_dir (str): Directory the file is written to; a leading ``~``
            is expanded to the current user's home directory.

    The plot is written to ``{output_dir}/{domain}_{characteristic}.html``.
    """
    sentence_positions = calculate_sentence_positions(sentences)
    # Calculate the moving average of the scores
    smoothed_scores = moving_average(scores, window_size)
    # Adjust the sentence positions to match the length of smoothed scores
    point_count = len(smoothed_scores)
    valid_positions = sentence_positions[:point_count]
    # Create the interactive plot
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=valid_positions,
        y=smoothed_scores,
        mode='markers+lines',
        line=dict(color="#5c3e99"),
        # One hover text per plotted point: the sentence at that position.
        hovertemplate=[f"{sentence}" for sentence in sentences[:point_count]],
    ))
    # Set the layout
    fig.update_layout(
        title=f"{domain} as {characteristic} score per sentence",
        xaxis_title="Position in Text (Words)",
        yaxis_title="Alignment with characteristic",
        template="plotly",
        hovermode="closest"
    )
    # expanduser only has an effect on a "~" path; the previous hard-coded
    # absolute path made it a no-op and tied the script to one account.
    output_path = os.path.join(os.path.expanduser(output_dir), f"{domain}_{characteristic}.html")
    fig.write_html(output_path)
def plot_scores_minimalistic(scores, sentences, domain, characteristic, anti_characteristic, window_size=10, output_dir="~/Documents"):
    """
    Plot smoothed scores with a moving average in a minimalist style.

    - No x-axis or y-axis ticks.
    - A dashed grey line at y=0 labeled as the "{characteristic} / {anti_characteristic} frontier".
    - No box around the plot (only x axis).

    Args:
        scores: Per-sentence scores, in the same order as ``sentences``.
        sentences (list of str): The scored sentences; their cumulative word
            counts give the x positions.
        domain (str): Domain name, used in the title and file name.
        characteristic (str): Characteristic name, used in the title, label and file name.
        anti_characteristic (str): Opposite characteristic, used in the frontier label.
        window_size (int): Moving-average window, in sentences.
        output_dir (str): Directory the file is written to; a leading ``~``
            is expanded to the current user's home directory.

    The figure is written to
    ``{output_dir}/minimalistic_{domain}_{characteristic}.svg``.
    """
    sentence_positions = calculate_sentence_positions(sentences)
    # Calculate the moving average of the scores
    smoothed_scores = moving_average(scores, window_size)
    # Adjust the sentence positions to match the length of smoothed scores
    valid_positions = sentence_positions[:len(smoothed_scores)]
    fig = plt.figure(figsize=(12, 6))
    # Plot the smoothed scores
    plt.plot(valid_positions,
             smoothed_scores,
             color='#5c3e99')
    # Add the dashed line for the frontier, labeled at the right-hand end
    plt.axhline(0, color='grey', linestyle='--', linewidth=1, alpha=0.7, zorder=0)
    plt.text(
        valid_positions[-1], 0,
        f" {characteristic} / {anti_characteristic} frontier",
        color='grey', fontsize=10, va='bottom', ha='right', alpha=0.8
    )
    # Minimalist adjustments
    plt.title(f"Textual topography for {characteristic} {domain}", fontsize=16)
    plt.xlabel("Position in text")
    # Remove axis ticks
    plt.xticks([])
    plt.yticks([])
    # Remove grid and box: hide every spine except the bottom one
    plt.grid(False)
    axes = plt.gca()
    for spine in ['top', 'right', 'left']:
        axes.spines[spine].set_visible(False)
    # Only the bottom axis stays visible; the left spine is hidden above, so
    # it gets no line-width styling.
    axes.spines['bottom'].set_linewidth(1.5)
    # Resolve the path from the user's home instead of a hard-coded account,
    # matching the ~/Documents location the Emacs command reads from.
    output_path = os.path.join(os.path.expanduser(output_dir),
                               f"minimalistic_{domain}_{characteristic}.svg")
    plt.savefig(output_path, bbox_inches="tight")
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
def read_file(file_name):
    """Read a text file as UTF-8 and return its contents.

    Args:
        file_name (str): Path of the file to read.

    Returns:
        str: The file contents. On failure an "Error: ..." message is
        returned instead of raising.

    NOTE(review): callers cannot tell an error message from real file
    content, so a failed read may end up being processed as text — consider
    raising instead once callers are ready for it.
    """
    try:
        # Decode explicitly as UTF-8 rather than with the locale default, so
        # non-ASCII text reads the same on every machine.
        with open(file_name, 'r', encoding='utf-8') as handle:
            return handle.read()
    except FileNotFoundError:
        return f"Error: The file '{file_name}' was not found."
    except IOError:
        return "Error: There was an issue reading the file."
def textual_topography(domain, characteristic, anti_characteristic, text_file, follow_input_theme=False):
    """Score every sentence of a text along a characteristic axis and plot the result.

    The text is split into sentences and embedded. LLM-generated example
    sentences for ``characteristic`` and ``anti_characteristic`` are averaged
    into two reference embeddings; each sentence is projected onto the axis
    between them, the scores are rescaled, and three plots are written
    (standard, interactive and minimalistic).

    Args:
        domain (str): Domain being evaluated (e.g. "Mood").
        characteristic (str): Positive end of the axis.
        anti_characteristic (str): Negative end of the axis.
        text_file (str): Path of the text file to analyse.
        follow_input_theme (bool): When true, the example-generation prompts
            include sentences sampled from the input text. Defaults to False,
            which uses the generic built-in prompt.

    Raises:
        ValueError: If the file yields no sentences.
    """
    text = read_file(text_file)
    sentences = split_text_into_sentences(text)
    if not sentences:
        # Fail before any embedding/LLM call is made for an empty input.
        raise ValueError(f"No sentences found in '{text_file}'.")
    sentences_embeddings = get_documents_embeddings(sentences)

    # None makes the generator fall back to its generic prompt.
    prompt = None
    negative_prompt = None
    if follow_input_theme:
        # Sample roughly 10% of the sentences (at least 1, at most 20).
        n_samples = min(max(1, int(0.1 * len(sentences))), 20)
        prompt = build_prompt_to_follow_input_theme(domain,
                                                    characteristic,
                                                    sentences,
                                                    n_samples=n_samples)
        negative_prompt = build_prompt_to_follow_input_theme(domain,
                                                             anti_characteristic,
                                                             sentences,
                                                             n_samples=n_samples)

    examples_for_reference = generate_instances_of_domain_and_characteristic(domain,
                                                                             characteristic,
                                                                             n=30,
                                                                             verbose=False,
                                                                             prompt=prompt)
    negative_examples_for_reference = generate_instances_of_domain_and_characteristic(domain,
                                                                                      anti_characteristic,
                                                                                      n=30,
                                                                                      verbose=False,
                                                                                      prompt=negative_prompt)
    reference_embedding = get_reference_embedding(examples_for_reference)
    negative_reference_embedding = get_reference_embedding(negative_examples_for_reference)
    scores = alignment_to_a_reference_embedding_normalized_by_difference(sentences_embeddings,
                                                                         reference_embedding,
                                                                         negative_reference_embedding)
    scores = rescale_scores(scores)
    # Smooth over roughly 10% of the sentences (at least 1, at most 15).
    window_size = min(max(1, int(0.1 * len(sentences))), 15)
    plot_scores(scores,
                sentences,
                domain,
                characteristic,
                window_size=window_size)
    plot_scores_interactive(scores,
                            sentences,
                            domain,
                            characteristic,
                            window_size=window_size)
    plot_scores_minimalistic(scores,
                             sentences,
                             domain,
                             characteristic,
                             anti_characteristic,
                             window_size=window_size)
if __name__ == "__main__":
    # Command-line entry point: four positional string arguments, in the
    # order the Emacs command passes them.
    cli = argparse.ArgumentParser(description="Process domain, characteristic, anti-characteristic, and text-file-name.")
    for argument_name, argument_help in (
        ('domain', "Domain name"),
        ('characteristic', "Characteristic name"),
        ('anti_characteristic', "Anti-characteristic name"),
        ('text_file', "Text file location and name"),
    ):
        cli.add_argument(argument_name, type=str, help=argument_help)
    options = cli.parse_args()
    # Run the analysis with the parsed arguments
    textual_topography(options.domain,
                       options.characteristic,
                       options.anti_characteristic,
                       options.text_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment