This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#importing the libraries | |
import tensorflow as tf | |
import numpy as np | |
import re | |
import time |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Loading the datasets
# Each file is read in full and split on "\n" into a list of raw records.
# The context managers close each file handle as soon as it has been read.
with open("movie_lines.txt", encoding="utf-8", errors="ignore") as _lines_file:
    lines = _lines_file.read().split("\n")
with open("movie_conversations.txt", encoding="utf-8", errors="ignore") as _conversations_file:
    conversations = _conversations_file.read().split("\n")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Creating a dictionary that maps each line id to its text.
# A record is split on " +++$+++ "; the first field is used as the key and the
# fifth field as the value. Records without exactly five fields are skipped.
id2line = {}
for line in lines:
    _line1 = line.split(" +++$+++ ")
    if len(_line1) == 5:
        id2line[_line1[0]] = _line1[4]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Creating a list of conversations: one list of line ids per conversation.
conversation_ids = []
# The final element of `conversations` is skipped -- presumably the empty
# string that split("\n") leaves behind when the file ends with a newline.
for conversation in conversations[:-1]:
    # Take the last " +++$+++ "-separated field, drop its first and last
    # characters, then remove every single quote and space so that only a
    # comma-separated run of ids remains.
    _conversation = conversation.split(" +++$+++ ")[-1][1:-1].replace("'", "").replace(" ", "")
    conversation_ids.append(_conversation.split(","))
# Backward-compatible alias: the list was originally defined under this
# misspelled name while later code reads `conversation_ids`.
coversation_ids = conversation_ids
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Mapping Questions and Answers
# Every pair of consecutive line ids in a conversation yields one question
# (the earlier line) and one answer (the line that follows it).
Questions = []
Answers = []
for conversation in conversation_ids:
    # zip against the list shifted by one walks the consecutive pairs; a
    # conversation with fewer than two ids contributes nothing.
    for _question_id, _answer_id in zip(conversation, conversation[1:]):
        Questions.append(id2line[_question_id])
        Answers.append(id2line[_answer_id])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Cleaning the text
def clean_text(text):
    """Return *text* lower-cased with a few English contractions expanded.

    The substitutions are applied in order on the lower-cased string:
    "he's", "she's", "i'm", "that's", "what's" and "where's" are expanded,
    and every "'ll" suffix becomes " will".

    NOTE(review): this excerpt appears to be truncated after the "'ll" rule;
    only the visible substitutions are applied here. The cleaned string is
    returned because callers store the result and later call ``.split()`` on it.
    """
    substitutions = (
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"i'm", "i am"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"\'ll", " will"),
    )
    text = text.lower()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Cleaning Questions
# Each raw question is passed through clean_text, preserving order.
Clean_questions = [clean_text(question) for question in Questions]
# Cleaning Answers
# Each cleaned answer is stored in the list (previously the cleaned value was
# computed and then discarded, leaving Clean_answers empty).
Clean_answers = [clean_text(answer) for answer in Answers]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Creating a dictionary that maps each word to its number of occurrences.
# Words are the whitespace-separated tokens of each cleaned sentence.
word2count = {}
# The questions are counted first, then the answers.
# NOTE(review): the answers pass was only a dangling "#For answers" header in
# this excerpt; it is filled in here to mirror the questions pass -- confirm.
for _sentences in (Clean_questions, Clean_answers):
    for sentence in _sentences:
        for word in sentence.split():
            word2count[word] = word2count.get(word, 0) + 1
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Creating a dictionary that maps each word to a unique integer, keeping only
# the words whose count meets the frequency threshold.
threshold = 20
questionswordstoint = {}
wordnumber = 0
for word, count in word2count.items():
    if count >= threshold:
        # NOTE(review): the body of this branch was missing from the excerpt;
        # it is completed as "assign the next free integer, then advance the
        # counter" -- confirm against the full file.
        questionswordstoint[word] = wordnumber
        wordnumber += 1
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Setting a threshold and mapping each word to a unique integer.
threshold = 20
word_number = 0
dict_integer = {}
for word, frequency in word2count.items():
    # Strictly greater than the threshold: a word seen exactly `threshold`
    # times is left out. The comparison uses the constant instead of a
    # repeated literal 20.
    if frequency > threshold:
        # Integers are handed out consecutively from 0 in iteration order.
        dict_integer[word] = word_number
        word_number += 1