akash-ch2812’s gists

akash-ch2812 / Vocab_len_pad_seq.py

Created July 24, 2020 06:31

	# Vocabulary size: number of distinct words the tokenizer counted, plus one
	# (presumably the extra slot is for the reserved padding index -- confirm).
	vocab_len = 1 + len(tokenizer.word_counts)
	print(f"Vocabulary length - {vocab_len}")

	# Longest caption, measured in single-space-separated pieces; used for padding.
	max_caption_len = max(len(caption.split(" ")) for caption in all_captions)
	print(f"Maximum length of caption - {max_caption_len}")

akash-ch2812 / Create_Training_Dataset.py

Created July 24, 2020 06:43

	from keras.preprocessing.sequence import pad_sequences
	from keras.utils import to_categorical

	# generator function to generate inputs for model
	def create_trianing_data(captions, images, tokenizer, max_caption_length, vocab_len, photos_per_batch):

	X1, X2, y = list(), list(), list()
	n=0

	# loop through every image

akash-ch2812 / CaptionModel.py

Created July 24, 2020 08:00

	import keras

	def create_model(max_caption_length, vocab_length):

	# sub network for handling the image feature part
	input_layer1 = keras.Input(shape=(18432))
	feature1 = keras.layers.Dropout(0.2)(input_layer1)
	feature2 = keras.layers.Dense(max_caption_length*4, activation='relu')(feature1)
	feature3 = keras.layers.Dense(max_caption_length*4, activation='relu')(feature2)
	feature4 = keras.layers.Dense(max_caption_length*4, activation='relu')(feature3)

akash-ch2812 / Word_embedding.py

Created July 24, 2020 08:10

	# load spaCy's large English pipeline
	import spacy
	nlp = spacy.load('en_core_web_lg')

	# zero-filled embedding matrix: one row per vocabulary index,
	# one column per embedding dimension
	embedding_dimension = 300
	embedding_matrix = np.zeros(shape=(vocab_len, embedding_dimension))

	# iterate over every word in the vocabulary and look up its corresponding vector
	for word, index in tokenizer.word_index.items():

akash-ch2812 / Train_Caption_Model.py

Created July 24, 2020 08:20

	# Batch size shared by the data generator and the steps-per-epoch
	# computation, so the two values cannot drift apart.
	photos_per_batch = 32

	# get training data
	# Pass `vocab_len`, the same vocabulary size that is given to create_model below.
	train_data = create_trianing_data(train_image_captions, train_image_features, tokenizer, max_caption_len, vocab_len, photos_per_batch)

	# initialize model
	model = create_model(max_caption_len, vocab_len)

	# number of full batches per epoch (floor division drops any partial batch)
	steps_per_epochs = len(train_image_captions) // photos_per_batch

	# compile model
	model.compile(optimizer='adam', loss='categorical_crossentropy')

akash-ch2812 / Generate_Captions.py

Last active July 24, 2020 08:37

	import matplotlib.pyplot as plt
	import seaborn as sns
	from PIL import Image
	%matplotlib inline

	# method for generating captions
	def generate_captions(model, image, tokenizer.word_index, max_caption_length, tokenizer.index_word):

	# input is <start>
	input_text = '<start>'

akash-ch2812 / Data_Preprocessing.py

Last active September 6, 2020 02:38

	import spacy
	import PyPDF2

	# spacy english model (large)
	nlp = spacy.load('en_core_web_lg')

	# method for reading a pdf file
	def readPdfFile(filename, folder_name):

	# storing path of PDF-Documents folder

akash-ch2812 / Similar_Keywords.py

Created August 14, 2020 09:06

	# convert keywords to vector
	def createKeywordsVectors(keyword, nlp):
	    """Return the vector that ``nlp`` produces for ``keyword``.

	    Args:
	        keyword: Text to vectorize; passed unchanged to ``nlp``.
	        nlp: Callable mapping text to an object that exposes a ``vector``
	            attribute (presumably a loaded spaCy pipeline -- confirm
	            against the caller).

	    Returns:
	        The ``vector`` attribute of the object returned by ``nlp(keyword)``.
	    """
	    doc = nlp(keyword)  # convert to document object
	    return doc.vector


	# method to find cosine similarity
	def cosineSimilarity(vect1, vect2):
	# return cosine distance

akash-ch2812 / Search.py

Created August 14, 2020 09:14

	from spacy.matcher import PhraseMatcher
	from scipy import spatial

	# method for searching keyword from the text
	def search_for_keyword(keyword, doc_obj, nlp):
	phrase_matcher = PhraseMatcher(nlp.vocab)
	phrase_list = [nlp(keyword)]
	phrase_matcher.add("Text Extractor", None, *phrase_list)

	matched_items = phrase_matcher(doc_obj)

akash-ch2812 / Create_Titanic_Model.py

Created September 1, 2020 12:21

	import pandas as pd
	import numpy as np

	from sklearn.model_selection import train_test_split
	from sklearn.preprocessing import MinMaxScaler
	from sklearn.linear_model import LogisticRegression

	from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

	# load dataset