This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def find_similar_articles(news, similarity): | |
| news_title_tokenized = "" | |
| if(re.match(r'^https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)$', news)): | |
| news_article = Article(news) | |
| news_article.download() | |
| news_article.parse() | |
| news_title_tokenized = news_title_tokenization(preproccess_text(news_article.title)) | |
| else: | |
| news_title_tokenized = news_title_tokenization(preproccess_text(news)) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def news_title_tokenization(message):
    """Tokenize *message*, drop English stop words, and stem the rest.

    Args:
        message: Text to tokenize. Callers in this file pass the output of
            ``preproccess_text``.

    Returns:
        A list of Porter-stemmed tokens in their original order, excluding
        every token that appears verbatim (case-sensitive comparison) in
        NLTK's English stop-word list.
    """
    # The corpus reader returns a list; a set makes each membership test O(1)
    # instead of scanning the whole stop-word list for every token.
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    stemmer = PorterStemmer()
    return [
        stemmer.stem(word)
        for word in word_tokenize(message)
        if word not in english_stopwords
    ]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def preproccess_text(text_messages): | |
| # change words to lower case - Hello, HELLO, hello are all the same word | |
| processed = text_messages.lower() | |
| # Remove unnecessary noise | |
| processed = re.sub(r'\[[0-9]+\]|\[[a-z]+\]|\[[A-Z]+\]|\\\\|\\r|\\t|\\n|\\', ' ', processed) | |
| # Remove punctuation | |
| processed = re.sub(r'[.,\/#!%\^&\*;\[\]:|+{}=\-\'"_”“`~(’)?]', ' ', processed) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def remove_unnecessary_noise(text_messages):
    r"""Replace escape-sequence residue and bracketed markers with spaces.

    Each of the following is replaced by a single space, in this order:

    1. Runs of three, or else two, consecutive groups made of a literal
       backslash followed by three ASCII alphanumerics (for example the
       text ``\xe2\x80\x99``).
    2. Square-bracketed runs that are all digits, all lower-case or all
       upper-case ASCII letters (``[12]``, ``[ab]``, ``[AB]``).
    3. A doubled backslash, a backslash followed by ``r``, ``t`` or ``n``
       (the two-character text, not the control character), and finally
       any backslash that is still left.

    Args:
        text_messages: The string to clean.

    Returns:
        The cleaned string. A lone backslash group such as ``\xe2`` only
        loses its backslash, so ``xe2`` remains in the output.
    """
    # Greedy {2,3} tries three groups first and falls back to two at the same
    # position, which yields the same substitutions as a three-group pass
    # followed by a separate two-group pass.
    text_messages = re.sub(r'(?:\\[a-zA-Z0-9]{3}){2,3}', ' ', text_messages)
    # Bracketed markers and leftover backslash escapes. The bare backslash
    # alternative comes last so the longer alternatives win first.
    text_messages = re.sub(
        r'\[[0-9]+\]|\[[a-z]+\]|\[[A-Z]+\]|\\\\|\\r|\\t|\\n|\\',
        ' ',
        text_messages,
    )
    return text_messages
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| nltk | |
| beautifulsoup4 | |
| selenium>=2.44.0,<3.0.0 | |
| requests | |
| unidecode | |
| vcrpy | |
| future | |
| fake-useragent | |
| newspaper3k | |
| scikit-learn |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def convertText(text): | |
| words = word_tokenize(text) | |
| new_string = '' | |
| for msg in words: | |
| new_word = '' | |
| alpha_flag = False | |
| digit_flag = False | |
| for c in msg: | |
| if c.isalpha(): | |
| alpha_flag = True |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def preproccess_text(text_messages): | |
| # change words to lower case - Hello, HELLO, hello are all the same word | |
| processed = text_messages.lower() | |
| # Replace email addresses with 'almtemail' | |
| processed = re.sub(r'^.+@[^\.].*\.[a-z]{2,}$', ' almtemail ', processed) | |
| # Replace phone numbers (formats include parentheses, spaces, no spaces, dashes) with 'nmrtlpn' | |
| processed = re.sub(r'(\()?(\+62|62|0)(\d{2,3})?\)?[ .-]?\d{2,4}[ .-]?\d{2,4}[ .-]?\d{2,4}', ' nmrtlpn ', processed) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #Train the sentence tokenizer | |
| f=open("indonesian_sent_tokenizer_corpus/indonesian-promotion-text.txt", "r") | |
| if f.mode == 'r': | |
| train_text = preproccess_text(f.read()) | |
| f.close() | |
| path = 'indonesian_sent_tokenizer_corpus/tempo/txt' | |
| for foldername in os.listdir(path): | |
| new_path = path + '/' + foldername | |
| for filename in os.listdir(new_path): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def preproccess_df(text_messages):# change words to lower case - Hello, HELLO, hello are all the same word | |
| processed = text_messages.str.lower() | |
| # Replace email addresses with 'almtemail' | |
| processed = processed.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', ' almtemail ') | |
| # Replace phone numbers (formats include parentheses, spaces, no spaces, dashes) with 'nmrtlpn' | |
| processed = processed.str.replace(r'(\()?(\+62|62|0)(\d{2,3})?\)?[ .-]?\d{2,4}[ .-]?\d{2,4}[ .-]?\d{2,4}', ' nmrtlpn' ) | |
| # Replace URLs with 'almtweb' |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load 'sms_classifier_corpus/data.txt' into a DataFrame. The file is read
# without a header row, and fields are split on the marker "<%>".
# NOTE(review): pandas treats a multi-character `sep` as a regular expression,
# which is presumably why engine='python' is requested here — confirm.
df = pd.read_csv('sms_classifier_corpus/data.txt', engine='python', sep="<%>", header=None)
OlderNewer