irdanish11 · October 16, 2019 08:14
diff --git a/preprocessing.py b/preprocessing.py
 # Removing the redundant lines: keep the first occurrence of each
 # description, in row order, and time the pass.
 start_time = time.time()
 descriptions = data['description']  # looked up once instead of once per row
 unique_data = []
 # Shadow set for constant-time duplicate checks; unique_data keeps the order.
 # NOTE(review): assumes every description is hashable (str / NaN) - confirm.
 seen = set()
 for i in range(len(data)):
    description = descriptions[i]
    if description not in seen:
        seen.add(description)
        unique_data.append(description)
        # Progress is reported only when row i is itself a new description.
        if i % 5000 == 0:
            print('{0} lines have been processed'.format(i))
 # Elapsed wall-clock seconds for the de-duplication pass.
 end_time = time.time()
 print('Total time:', end_time - start_time)

 # Keep only the entries whose type is exactly str; anything else found in
 # unique_data is dropped.
 unique_data_str = [entry for entry in unique_data if type(entry) is str]

 # Cleaning the data
 # Everything that is not an ASCII letter or a space is removed.  The class
 # has to read A-Z: the range A-z also covers the six ASCII characters that
 # sit between 'Z' and 'a' ([ \ ] ^ _ `), which would then survive cleaning.
 non_letters = re.compile(r'[^a-zA-Z ]+')
 clean_data = []
 for text in unique_data_str:
    a = non_letters.sub('', text).strip()
    if a:  # nothing left once non-letters are removed -> skip the entry
        clean_data.append(clean_text(a))

 # Keep only the lines whose word count lies between 2 and 25 (inclusive);
 # lines that are too short or too long are discarded.
 short_data = [line for line in clean_data if 2 <= len(line.split()) <= 25]

 # Tally how often each word occurs in the corpus.  When the loop is done,
 # len(word2count) is the number of distinct words and total_words is the
 # number of word tokens seen.
 word2count = {}
 total_words = 0
 for sentence in short_data:
    tokens = sentence.split()
    total_words += len(tokens)
    for token in tokens:
        word2count[token] = word2count.get(token, 0) + 1
        
 # Collect the words that occur at least `threshold` times; one-character
 # words are left out however frequent they are.
 threshold = 15
 word15 = [word for word, count in word2count.items()
           if count >= threshold and len(word) > 1]
            
 # Removing the words from each string which are not listed in word15.
 # Membership is tested against a set built once, rather than scanning the
 # word15 list for every word of every line.
 frequent_words = set(word15)
 data_15 = []
 for line in short_data:
    kept = [word for word in line.split() if word in frequent_words]
    # Every kept word is prefixed with a space, so a non-empty result starts
    # with one leading space and an all-filtered line becomes ''.
    # NOTE(review): the leading space looks unintentional - confirm that
    # nothing downstream relies on it before changing the format.
    str1 = ''.join(' ' + word for word in kept)
    data_15.append(str1)

 # After the unnecessary words are stripped, keep only the lines that still
 # have between 3 and 15 words (inclusive); the rest are too short or too long.
 short_data_consize = [line for line in data_15 if 3 <= len(line.split()) <= 15]
	# Removing the redundant lines: keep the first occurrence of each
	# description, in row order, and time the pass.
	start_time = time.time()
	descriptions = data['description']  # looked up once instead of once per row
	unique_data = []
	# Shadow set for constant-time duplicate checks; unique_data keeps the order.
	# NOTE(review): assumes every description is hashable (str / NaN) - confirm.
	seen = set()
	for i in range(len(data)):
	    description = descriptions[i]
	    if description not in seen:
	        seen.add(description)
	        unique_data.append(description)
	        # Progress is reported only when row i is itself a new description.
	        if i % 5000 == 0:
	            print('{0} lines have been processed'.format(i))
	# Elapsed wall-clock seconds for the de-duplication pass.
	end_time = time.time()
	print('Total time:', end_time - start_time)

	# Keep only the entries whose type is exactly str; anything else found in
	# unique_data is dropped.
	unique_data_str = [entry for entry in unique_data if type(entry) is str]

	# Cleaning the data
	# Everything that is not an ASCII letter or a space is removed.  The class
	# has to read A-Z: the range A-z also covers the six ASCII characters that
	# sit between 'Z' and 'a' ([ \ ] ^ _ `), which would then survive cleaning.
	non_letters = re.compile(r'[^a-zA-Z ]+')
	clean_data = []
	for text in unique_data_str:
	    a = non_letters.sub('', text).strip()
	    if a:  # nothing left once non-letters are removed -> skip the entry
	        clean_data.append(clean_text(a))

	# Keep only the lines whose word count lies between 2 and 25 (inclusive);
	# lines that are too short or too long are discarded.
	short_data = [line for line in clean_data if 2 <= len(line.split()) <= 25]

	# Tally how often each word occurs in the corpus.  When the loop is done,
	# len(word2count) is the number of distinct words and total_words is the
	# number of word tokens seen.
	word2count = {}
	total_words = 0
	for text in short_data:
	    for word in text.split():
	        word2count[word] = word2count.get(word, 0) + 1
	        total_words += 1

	# Collect the words that occur at least `threshold` times; one-character
	# words are left out however frequent they are.
	word15 = []
	threshold = 15
	for word, count in word2count.items():
	    if count >= threshold and len(word) > 1:
	        word15.append(word)

	# Removing the words from each string which are not listed in word15.
	# Membership is tested against a set built once, rather than scanning the
	# word15 list for every word of every line.
	frequent_words = set(word15)
	data_15 = []
	for line in short_data:
	    kept = [word for word in line.split() if word in frequent_words]
	    # Every kept word is prefixed with a space, so a non-empty result starts
	    # with one leading space and an all-filtered line becomes ''.
	    # NOTE(review): the leading space looks unintentional - confirm that
	    # nothing downstream relies on it before changing the format.
	    str1 = ''.join(' ' + word for word in kept)
	    data_15.append(str1)

	# After the unnecessary words are stripped, keep only the lines that still
	# have between 3 and 15 words (inclusive); the rest are too short or too long.
	short_data_consize = [line for line in data_15 if 3 <= len(line.split()) <= 15]
No results found