Skip to content

Instantly share code, notes, and snippets.

@CoffeeVampir3
Created August 4, 2023 13:37
Show Gist options
  • Save CoffeeVampir3/14673d7f3296e180fd9690b4bdf9bf7f to your computer and use it in GitHub Desktop.
Save CoffeeVampir3/14673d7f3296e180fd9690b4bdf9bf7f to your computer and use it in GitHub Desktop.
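Notebook that splits a book into sentences, ranks them by adjective/adverb count using NLTK part-of-speech tags, and saves the 100 most descriptive ones to a JSONL file.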
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"id": "0727e4f1-6539-45b0-8d05-5e1abb26ea28",
"metadata": {},
"outputs": [],
"source": [
"with open('datasets/Book.txt', 'r') as file:\n",
"    text = file.read()\n",
"text = text.replace('\\n', ' ')\n",
"sentences = text.split('.')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "59b268e1-f155-47ea-8a13-42d4d87903f4",
"metadata": {},
"outputs": [],
"source": [
"new_sentences = []\n",
"for sentence in sentences:\n",
" sentence = sentence.strip()\n",
" if len(sentence) > 80:\n",
" # Split on comma\n",
" parts = [part.strip() for part in sentence.split(',')]\n",
" if len(parts) > 1: # If sentence could be split\n",
" new_sentences.extend(parts)\n",
" else:\n",
" new_sentences.append(sentence)\n",
"\n",
"sentences = new_sentences"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fb84a662-4cd2-40f4-985b-31fba70ec7f4",
"metadata": {},
"outputs": [],
"source": [
"[print(x) for x in sentences[:10]]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "86f9ef96-b7eb-43e8-adf7-82620db375af",
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
"nltk.download('punkt')\n",
"nltk.download('averaged_perceptron_tagger')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "47bacb44-3279-4417-a94c-7e2435b72918",
"metadata": {},
"outputs": [],
"source": [
"def count_descriptive_words(sentence):\n",
"    words = nltk.word_tokenize(sentence)\n",
"    tagged = nltk.pos_tag(words)\n",
"    descriptive_count = sum(1 for word, tag in tagged if tag in ('JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS'))\n",
"    return descriptive_count\n",
"descriptive_counts = [(sentence, count_descriptive_words(sentence)) for sentence in sentences]\n",
"descriptive_counts = sorted(descriptive_counts, key=lambda x: x[1], reverse=True)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "1b81d8c9-a640-4d39-98b7-fdea3b622e12",
"metadata": {},
"outputs": [],
"source": [
"top_100 = descriptive_counts[:100]\n",
"items = [x.encode('ascii', 'ignore').decode() for x, y in top_100]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "18d88cd9-b289-49d5-9b42-0dedb16ab57e",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"with open('descriptive_sentences.jsonl', 'w') as file:\n",
"    for item in items:\n",
"        line = json.dumps({\"text\": item})\n",
"        file.write(line + '\\n')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment