agustinustheo’s gists

agustinustheo / find_similar_articles

Last active February 22, 2019 12:45

Find Similar Articles function for the Filtering Fake News blog

	def find_similar_articles(news, similarity):
	news_title_tokenized = ""

	if(re.match(r'^https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)$', news)):
	news_article = Article(news)
	news_article.download()
	news_article.parse()
	news_title_tokenized = news_title_tokenization(preproccess_text(news_article.title))
	else:
	news_title_tokenized = news_title_tokenization(preproccess_text(news))

agustinustheo / news_title_tokenization

Created February 22, 2019 12:43

News Title Tokenization function for Filtering Fake News Blog

	def news_title_tokenization(message):
		"""Tokenize a news title, drop English stopwords and stem what remains.

		Args:
			message: The (already preprocessed) title text to tokenize.

		Returns:
			A list with the Porter stem of every token produced by
			``word_tokenize`` that is not an NLTK English stopword, in the
			order the tokens appear.
		"""
		# A set gives O(1) membership tests; the stopword corpus is returned as a list.
		stopwords = set(nltk.corpus.stopwords.words('english'))
		ps = PorterStemmer()
		return [ps.stem(word) for word in word_tokenize(message) if word not in stopwords]

agustinustheo / preproccess_text

Created February 22, 2019 12:45

Preprocess Text function for Filtering Fake News Blog

	def preproccess_text(text_messages):
	# change words to lower case - Hello, HELLO, hello are all the same word
	processed = text_messages.lower()

	# Remove unnecessary noise
	processed = re.sub(r'\[[0-9]+\]\|\[[a-z]+\]\|\[[A-Z]+\]\|\\\\\|\\r\|\\t\|\\n\|\\', ' ', processed)

	# Remove punctuation
	processed = re.sub(r'[.,\/#!%\^&\*;\[\]:\|+{}=\-\'"_”“`~(’)?]', ' ', processed)

agustinustheo / remove_unnecessary_noise

Created February 22, 2019 12:46

Remove Unnecessary Noise function for Filtering Fake News Blog

	def remove_unnecessary_noise(text_messages):
		r"""Replace escape-sequence residue and bracketed markers with spaces.

		Three substitutions run in order, each replacing a match with one space:

		1. Three consecutive groups of a backslash followed by three
		   alphanumeric characters (e.g. ``\xe2\x80\x99``).
		2. Two such consecutive groups (e.g. ``\xc3\xa9``).
		3. Bracketed runs of digits, lowercase letters or uppercase letters
		   (``[12]``, ``[ab]``, ``[AB]``), a doubled backslash, a backslash
		   followed by ``r``, ``t`` or ``n``, or any remaining single backslash.

		Args:
			text_messages: The text to clean.

		Returns:
			The text with every match replaced by a single space.
		"""
		# Alternation must be a bare `|`; an escaped `\|` matches a literal pipe
		# character and would stop these patterns from matching the noise at all.
		text_messages = re.sub(r'(?:\\[a-zA-Z0-9]{3}){3}', ' ', text_messages)
		text_messages = re.sub(r'(?:\\[a-zA-Z0-9]{3}){2}', ' ', text_messages)
		text_messages = re.sub(r'\[[0-9]+\]|\[[a-z]+\]|\[[A-Z]+\]|\\\\|\\r|\\t|\\n|\\', ' ', text_messages)

		return text_messages

agustinustheo / requirements.txt

Created February 22, 2019 12:47

Required libraries for Filtering Fake News Blog

agustinustheo / convertText.py

Created July 24, 2019 03:34

agustinustheo / preprocessText.py

Created July 30, 2019 05:27

Preprocess Text function for SMS Classifier Blog

	def preproccess_text(text_messages):
	# change words to lower case - Hello, HELLO, hello are all the same word
	processed = text_messages.lower()

	# Replace email addresses with 'almtemail'
	processed = re.sub(r'^.+@[^\.].*\.[a-z]{2,}$', ' almtemail ', processed)

	# Replace phone numbers (formats include parentheses, spaces, no spaces, dashes) with 'nmrtlpn'
	processed = re.sub(r'($)?(\+62\|62\|0)(\d{2,3})?$?[ .-]?\d{2,4}[ .-]?\d{2,4}[ .-]?\d{2,4}', ' nmrtlpn ', processed)

agustinustheo / trainTokenizer.py

Created July 30, 2019 05:42

Train Tokenizer code for SMS Classifier Blog

	# Train the sentence tokenizer
	# Read the promotion-text corpus and preprocess it. The context manager
	# closes the file even if reading or preprocessing raises; a file opened
	# with mode "r" always reports that mode, so no mode check is needed.
	with open("indonesian_sent_tokenizer_corpus/indonesian-promotion-text.txt", "r") as f:
		train_text = preproccess_text(f.read())

	path = 'indonesian_sent_tokenizer_corpus/tempo/txt'
	for foldername in os.listdir(path):
	new_path = path + '/' + foldername
	for filename in os.listdir(new_path):

agustinustheo / preprocessDataframe.py

Created July 30, 2019 06:09

Preprocess Dataframe function for SMS Classifier Blog

	def preproccess_df(text_messages):# change words to lower case - Hello, HELLO, hello are all the same word
	processed = text_messages.str.lower()

	# Replace email addresses with 'almtemail'
	processed = processed.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', ' almtemail ')

	# Replace phone numbers (formats include parentheses, spaces, no spaces, dashes) with 'nmrtlpn'
	processed = processed.str.replace(r'($)?(\+62\|62\|0)(\d{2,3})?$?[ .-]?\d{2,4}[ .-]?\d{2,4}[ .-]?\d{2,4}', ' nmrtlpn' )

	# Replace URLs with 'almtweb'

agustinustheo / loadingDataset.py

Last active July 30, 2019 06:12

Loading dataset for SMS Classifier Blog

# Load the SMS corpus: no header row, fields separated by the "<%>" marker.
# The multi-character separator is why the Python parsing engine is requested.
df = pd.read_csv(
    'sms_classifier_corpus/data.txt',
    sep="<%>",
    header=None,
    engine='python',
)

	def remove_unnecessary_noise(text_messages):
		r"""Replace escape-sequence residue and bracketed markers with spaces.

		Three substitutions run in order, each replacing a match with one space:

		1. Three consecutive groups of a backslash followed by three
		   alphanumeric characters (e.g. ``\xe2\x80\x99``).
		2. Two such consecutive groups (e.g. ``\xc3\xa9``).
		3. Bracketed runs of digits, lowercase letters or uppercase letters
		   (``[12]``, ``[ab]``, ``[AB]``), a doubled backslash, a backslash
		   followed by ``r``, ``t`` or ``n``, or any remaining single backslash.

		Args:
			text_messages: The text to clean.

		Returns:
			The text with every match replaced by a single space.
		"""
		# Alternation must be a bare `|`; an escaped `\|` matches a literal pipe
		# character and would stop these patterns from matching the noise at all.
		text_messages = re.sub(r'(?:\\[a-zA-Z0-9]{3}){3}', ' ', text_messages)
		text_messages = re.sub(r'(?:\\[a-zA-Z0-9]{3}){2}', ' ', text_messages)
		text_messages = re.sub(r'\[[0-9]+\]|\[[a-z]+\]|\[[A-Z]+\]|\\\\|\\r|\\t|\\n|\\', ' ', text_messages)

		return text_messages

theo agustinustheo