Last active
October 16, 2019 08:14
-
-
Save irdanish11/69818c36e5228bc006cd739d00ce9bc2 to your computer and use it in GitHub Desktop.
Removing redundant data, cleaning the data, and removing the sentences that are too short or too long.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Remove the redundant lines: collect each distinct description once,
# keeping the order in which it first appears.
start_time = time.time()
unique_data = []
# Membership is tested against a set so each lookup is O(1) instead of a
# scan over the growing result list. Assumes the descriptions are hashable
# (e.g. strings / NaN) -- TODO confirm against how `data` is loaded.
seen = set()
descriptions = data['description']
for i in range(len(data)):
    description = descriptions[i]
    if description not in seen:
        seen.add(description)
        unique_data.append(description)
    if i % 5000 == 0:
        print('{0}'.format(i)+' lines have been processed')
        # Elapsed seconds so far, measured with time.time() against the
        # start stamp. NOTE(review): the position of this progress print
        # inside the loop was reconstructed -- confirm.
        print(time.time() - start_time)
end_time = time.time()
print('Total time:', end_time - start_time)
# Keep only the entries whose type is exactly str; anything else that ended
# up in unique_data is dropped.
unique_data_str = [entry for entry in unique_data if type(entry) is str]
# Cleaning the data: remove every character that is not an ASCII letter or a
# space, trim the result, and pass each non-empty string through clean_text
# (defined elsewhere).
# The character class must be A-Z, not A-z: the range A-z also covers the
# punctuation that sits between 'Z' and 'a' in ASCII ([ \ ] ^ _ `), which
# would then survive the substitution.
non_letters = re.compile(r'[^a-zA-Z ]+')
clean_data = []
for text in unique_data_str:
    a = non_letters.sub('', text).strip()
    if a:
        clean_data.append(clean_text(a))
# Remove the lines which are too short or too long: keep only the lines made
# of 2 to 25 whitespace-separated words (both bounds inclusive).
short_data = [
    sentence
    for sentence in clean_data
    if 2 <= len(sentence.split()) <= 25
]
# Count how often each word appears in the corpus. The number of unique
# words is the number of keys in word2count.
word2count = {}
total_words = 0
for sentence in short_data:
    for token in sentence.split():
        word2count[token] = word2count.get(token, 0) + 1
        # NOTE(review): counts every word occurrence; the nesting level of
        # this increment was reconstructed -- confirm.
        total_words += 1
# Build the list of words that appear at least `threshold` (15) times and
# are longer than a single character.
threshold = 15
word15 = [
    token
    for token, occurrences in word2count.items()
    if occurrences >= threshold and len(token) > 1
]
# Remove from each line the words that are not in word15, i.e. the words
# that are too rare or only one character long. One output string is
# produced per input line; a line with no surviving words becomes ''.
# A set is built once so each membership test is O(1) rather than a scan of
# the word15 list.
frequent_words = set(word15)
# The kept words are joined with single spaces in one pass, so the result
# carries no leading space.
data_15 = [
    " ".join(word for word in line.split() if word in frequent_words)
    for line in short_data
]
# Remove the lines which are too short or too long after the unnecessary
# words were dropped: keep only lines of 3 to 15 words (inclusive).
short_data_consize = []
for sentence in data_15:
    n_words = len(sentence.split())
    if n_words < 3 or n_words > 15:
        continue
    short_data_consize.append(sentence)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment