Skip to content

Instantly share code, notes, and snippets.

@irdanish11
Last active October 16, 2019 08:14
Show Gist options
  • Select an option

  • Save irdanish11/69818c36e5228bc006cd739d00ce9bc2 to your computer and use it in GitHub Desktop.

Select an option

Save irdanish11/69818c36e5228bc006cd739d00ce9bc2 to your computer and use it in GitHub Desktop.
Removing redundant data, cleaning the data, and removing the sentences that are too short or too long.
# Remove duplicate descriptions, keeping the first occurrence of each one
# in its original order, and report how long the pass took.
start_time = time.time()
unique_data = []
# Set mirror of unique_data: membership tests are O(1) instead of rescanning
# the whole list for every row.
seen_descriptions = set()
# Iterate over the column itself rather than `data['description'][i]`, which
# is a label-based lookup for each i in range(len(data)).
# NOTE(review): assumes `data['description']` is iterable with one hashable
# value per row (e.g. a pandas column of strings/NaN) -- confirm.
for i, description in enumerate(data['description']):
    if description not in seen_descriptions:
        seen_descriptions.add(description)
        unique_data.append(description)
    # Progress report every 5000 rows.
    if i % 5000 == 0:
        print('{0} lines have been processed'.format(i))
# The former `print(start_time - time.clock)` subtracted the function object
# itself (it was never called), which raises TypeError, and `time.clock` no
# longer exists on Python 3.8+. It is dropped; the elapsed time is printed below.
end_time = time.time()
print('Total time:', end_time - start_time)
# Keep only the entries whose type is exactly `str`; anything else found in
# unique_data is left out.
unique_data_str = [entry for entry in unique_data if type(entry) is str]
# Clean the data: strip everything except ASCII letters and spaces, trim the
# result, and pass each non-empty string through clean_text (defined elsewhere).
# The character class is `a-zA-Z`; the former `A-z` range also covered the
# punctuation that sits between 'Z' and 'a' in ASCII ([ \ ] ^ _ `), so those
# characters were never removed.
non_letter_pattern = re.compile(r'[^a-zA-Z ]+')
clean_data = []
for text in unique_data_str:
    letters_only = non_letter_pattern.sub('', text).strip()
    # Skip entries that have nothing left after stripping.
    if letters_only:
        clean_data.append(clean_text(letters_only))
# Drop the lines that are too short or too long: only lines with 2 to 25
# whitespace-separated words (inclusive) are kept.
short_data = [line for line in clean_data if 2 <= len(line.split()) <= 25]
# Count how many times each word appears in the corpus. The number of
# distinct words is len(word2count) once the loop has finished.
word2count = {}
total_words = 0
for sentence in short_data:
    for token in sentence.split():
        word2count[token] = word2count.get(token, 0) + 1
        # NOTE(review): the source lost its indentation; this assumes
        # total_words is incremented once per token -- confirm.
        total_words += 1
# Build the list of words that appear at least `threshold` times in the
# corpus and are longer than a single character.
threshold = 15
word15 = [
    word
    for word, count in word2count.items()
    if count >= threshold and len(word) > 1
]
# Remove from every line the words that are not in word15, i.e. the words
# that did not reach the frequency threshold.
# A set copy of word15 makes each membership test O(1); testing against the
# list rescanned the whole vocabulary for every token of every line.
frequent_words = set(word15)
data_15 = []
for line in short_data:
    kept_words = [word for word in line.split() if word in frequent_words]
    # Every kept word is prefixed with one space, so a non-empty result starts
    # with a space -- the same string the former repeated " ".join built.
    data_15.append(''.join(' ' + word for word in kept_words))
# After the unnecessary words have been removed, drop the lines that are now
# too short or too long: only lines with 3 to 15 words (inclusive) are kept.
short_data_consize = [line for line in data_15 if 3 <= len(line.split()) <= 15]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment