@graylan0
Last active December 19, 2023 02:33
To create a more advanced tokenizer with NLTK that targets action or attention words, we can combine Named Entity Recognition (NER) with a custom list of action/attention keywords. NER helps identify proper nouns and other entities, which are often the points of attention or action in a text, while the keyword list catches terms typically associated with actions or attention-grabbing concepts.
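Before bringing in NER, the keyword half of the idea can be illustrated on its own: a case-insensitive membership test against a small keyword set is enough to flag attention words. The keyword set below is purely hypothetical for illustration; in practice it would be domain-specific.

```python
# Hypothetical keyword set for illustration; a real one would be
# domain-specific (action verbs, destinations, named topics, etc.).
ACTION_ATTENTION_KEYWORDS = {"travel", "visit", "see", "paris"}

def is_attention_word(token: str) -> bool:
    # Case-insensitive membership test against the keyword set.
    return token.lower() in ACTION_ATTENTION_KEYWORDS

print([t for t in "I am going to Paris".split() if is_attention_word(t)])
# → ['Paris']
```

Note the lowercasing on the token side: without it, capitalized tokens like "Paris" would never match a lowercase keyword set.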
Here's how you can implement this:
1. **Install NLTK and Download Required Data**:
Ensure NLTK is installed and download the datasets needed for tokenization and named entity recognition. Note that `nltk.download(...)` is a Python call, not a shell command:
```bash
pip install nltk
```
```python
import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
```
2. **Implement the Advanced Tokenizer**:
The tokenizer uses NLTK to tokenize the text, tag parts of speech, and run named entity recognition to identify entities; it also checks each token against the custom action/attention keywords.
```python
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag, ne_chunk
from nltk.tree import Tree
from typing import Dict, List, Optional


def extract_entities(tokenized_message: List[str]) -> List[str]:
    """Use Named Entity Recognition to extract entities from the tokenized message."""
    pos_tags = pos_tag(tokenized_message)
    named_entities = ne_chunk(pos_tags, binary=True)
    entities = []
    for chunk in named_entities:
        if isinstance(chunk, Tree):
            entities.append(" ".join(token for token, pos in chunk.leaves()))
    return entities


def is_action_attention_token(token: str, entities: List[str],
                              action_attention_keywords: List[str]) -> bool:
    """Check whether a token is an entity or a custom keyword (case-insensitive)."""
    keywords = {keyword.lower() for keyword in action_attention_keywords}
    return token in entities or token.lower() in keywords


def chunk_messages(tokenized_message: List[str], max_chunk_size: int,
                   action_attention_keywords: List[str]) -> List[str]:
    """Chunk a tokenized message into smaller parts, breaking at action/attention tokens."""
    chunks = []
    current_chunk = []
    entities = extract_entities(tokenized_message)
    for token in tokenized_message:
        current_chunk.append(token)
        if len(current_chunk) >= max_chunk_size or is_action_attention_token(
                token, entities, action_attention_keywords):
            chunks.append(' '.join(current_chunk))
            current_chunk = []
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks


def format_messages(messages: List[Dict[str, str]], max_chunk_size: int = 10,
                    action_attention_keywords: Optional[List[str]] = None) -> str:
    """Format messages for Llama-2 chat models with chunking based on NER and custom keywords."""
    # Use None as the default to avoid a shared mutable default argument.
    if action_attention_keywords is None:
        action_attention_keywords = []
    formatted_messages = []
    for message in messages:
        # Tokenize the message content.
        tokenized_content = word_tokenize(message["content"])
        # Chunk the tokenized content.
        chunks = chunk_messages(tokenized_content, max_chunk_size, action_attention_keywords)
        # Format each chunk.
        for chunk in chunks:
            formatted_messages.append(f"<s>[{message['role'].upper()}] {chunk}</s>")
    return "\n".join(formatted_messages)


# Example usage
dialog = [
    {"role": "system", "content": "Always answer with Haiku"},
    {"role": "user", "content": "I am going to Paris, what should I see?"},
]

# Define some action/attention keywords.
action_attention_keywords = ['Paris', 'Haiku', 'travel', 'visit']

formatted_dialog = format_messages(dialog, 10, action_attention_keywords)
print(formatted_dialog)
```
In this implementation, `extract_entities` uses NLTK's named entity recognition to find entities in the text. The `is_action_attention_token` function checks whether a token is an entity or (case-insensitively) one of the specified action/attention keywords. The chunking logic in `chunk_messages` then breaks the token stream on these criteria, as well as whenever the maximum chunk size is reached.
This approach provides a more context-aware chunking mechanism, considering both named entities and custom-defined keywords that are significant for action or attention.
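To see the chunking behavior in isolation, here is a simplified, NER-free variant of the same boundary rule (a sketch: `extract_entities` is replaced by an empty entity list, and `simple_chunk` is a name introduced here for illustration):

```python
from typing import List

def simple_chunk(tokens: List[str], max_chunk_size: int, keywords: List[str]) -> List[str]:
    # Same boundary rule as chunk_messages, but without the NER step:
    # close a chunk when it reaches max_chunk_size or ends on a keyword.
    kw = {k.lower() for k in keywords}
    chunks, current = [], []
    for token in tokens:
        current.append(token)
        if len(current) >= max_chunk_size or token.lower() in kw:
            chunks.append(' '.join(current))
            current = []
    if current:
        chunks.append(' '.join(current))
    return chunks

print(simple_chunk("I am going to Paris what should I see".split(), 10, ['Paris', 'visit']))
# → ['I am going to Paris', 'what should I see']
```

The keyword "Paris" closes the first chunk mid-stream even though the size limit was never reached, which is exactly the context-aware break the full implementation produces.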