Skip to content

Instantly share code, notes, and snippets.

View HarshSingh16's full-sized avatar

Harshdeep Singh HarshSingh16

View GitHub Profile
#importing the libraries
import tensorflow as tf
import numpy as np
import re
import time
#Loading the datasets
# Read each corpus file inside a context manager so the file handle is
# closed as soon as its contents are in memory.
with open("movie_lines.txt", encoding="utf-8", errors="ignore") as lines_file:
    lines = lines_file.read().split("\n")
with open("movie_conversations.txt", encoding="utf-8", errors="ignore") as conversations_file:
    conversations = conversations_file.read().split("\n")

# Map field 0 of each record to field 4 (by the name, line id -> line text).
# Records that do not split into exactly five " +++$+++ "-separated fields
# are skipped.
id2line = {}
for line in lines:
    _line1 = line.split(" +++$+++ ")
    if len(_line1) == 5:
        id2line[_line1[0]] = _line1[4]
#Creating a LIST for conversations
# For every record, take the last " +++$+++ "-separated field, drop its first
# and last character (presumably the enclosing brackets), remove quotes and
# spaces, and split on commas to obtain the list of line ids.
# The final element of `conversations` is skipped by the [:-1] slice.
conversation_ids = []
for conversation in conversations[:-1]:
    _conversation = (
        conversation.split(" +++$+++ ")[-1][1:-1].replace("'", "").replace(" ", "")
    )
    conversation_ids.append(_conversation.split(","))
# Keep the original (misspelled) name bound to the same list so any code that
# still refers to it keeps working.
coversation_ids = conversation_ids

##Mapping Questions and Answers
# Each pair of consecutive line ids in a conversation yields one sample:
# the earlier line is the question, the following line is the answer.
Questions = []
Answers = []
for conversation in conversation_ids:
    for asked_id, replied_id in zip(conversation, conversation[1:]):
        Questions.append(id2line[asked_id])
        Answers.append(id2line[replied_id])
##Cleaning the text
def clean_text(text):
    """Lowercase *text* and expand a few English contractions.

    Args:
        text: The string to normalise.

    Returns:
        The lowercased string with "he's", "she's", "i'm", "that's",
        "what's", "where's" and the "'ll" suffix expanded.
    """
    text = text.lower()
    # Note: the "he's" pattern also matches inside "she's", which already
    # produces "she is"; the explicit "she's" rule is kept for clarity.
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"\'ll", " will", text)
    # Return the cleaned string; without this the function yields None.
    return text
#Cleaning Questions
# Run every raw question through clean_text, preserving order.
Clean_questions = [clean_text(question) for question in Questions]
#Cleaning Answers
# Same for the answers; each cleaned answer is stored so that Clean_answers
# lines up index-for-index with Clean_questions.
Clean_answers = [clean_text(answer) for answer in Answers]
#Creating a dictionary that maps each word to its occurrences
# word2count[word] is the number of times the whitespace-separated token
# `word` appears across the cleaned sentences.
word2count = {}
#For questions
for sentence in Clean_questions:
    for word in sentence.split():
        word2count[word] = word2count.get(word, 0) + 1
#For answers
# Answer tokens are accumulated into the same dictionary.
for sentence in Clean_answers:
    for word in sentence.split():
        word2count[word] = word2count.get(word, 0) + 1
#Creating a dictionary that maps each word to a unique integer and also checking if the frequency threshold is met
# Words whose count is at least `threshold` receive consecutive integer ids,
# starting at 0, in the iteration order of word2count.
threshold = 20
questionswordstoint = {}
wordnumber = 0
for word, count in word2count.items():
    if count >= threshold:
        questionswordstoint[word] = wordnumber
        wordnumber += 1
#SETTING A THRESHOLD AND MAPPING EACH WORD TO A UNIQUE INTEGER
# Words that occur strictly more than `threshold` times receive consecutive
# integer ids, starting at 0, in the iteration order of word2count.
# After the loop, word_number equals the number of ids handed out.
threshold = 20
word_number = 0
dict_integer = {}
for word, frequency in word2count.items():
    # Compare against the named threshold instead of repeating the literal.
    if frequency > threshold:
        dict_integer[word] = word_number
        word_number += 1