Skip to content

Instantly share code, notes, and snippets.

View audhiaprilliant's full-sized avatar
🎯
Focusing

Audhi Aprilliant audhiaprilliant

🎯
Focusing
View GitHub Profile
@audhiaprilliant
audhiaprilliant / zipfs_law_autostopwords_1.py
Last active April 20, 2022 01:27
How to Automatically Build Stopwords
# HTTP library for Python
import requests
# Regular expression
import re
# Array manipulation
import collections
# Data manipulation
@audhiaprilliant
audhiaprilliant / zipfs_law_autostopwords_2.py
Created April 20, 2022 01:28
How to Automatically Build Stopwords
# Plain-text sources to process, one URL per entry (order is preserved)
urls = [
    'https://www.gutenberg.org/files/1661/1661-0.txt',
    'https://www.gutenberg.org/files/2701/2701-0.txt',
    'https://www.gutenberg.org/files/11/11-0.txt',
    'https://www.gutenberg.org/files/98/98-0.txt',
    'https://www.gutenberg.org/files/74/74-0.txt',
]
# Text
@audhiaprilliant
audhiaprilliant / zipfs_law_autostopwords_3.py
Created April 20, 2022 01:29
How to Automatically Build Stopwords
# Lower text
def lowerCase(text: str) -> str:
    """Return a copy of ``text`` with all cased characters lowercased.

    Args:
        text: The string to convert.

    Returns:
        The result of ``text.lower()``; the input string is not modified.
    """
    return text.lower()
# Numbers removal
def numberRemoval(text):
return re.sub(
pattern = '\d',
repl = ' ',
string = text
@audhiaprilliant
audhiaprilliant / zipfs_law_autostopwords_4.py
Last active April 23, 2022 12:32
How to Automatically Build Stopwords
# How to get a list of top words
def getTopWords(
text: str
):
# Split text by its whitespace
list_words = text.split()
# Count the word frequencies
word_freq = collections.Counter(list_words)
# Get top n words that have highest frequencies
top_words = word_freq.most_common()
@audhiaprilliant
audhiaprilliant / zipfs_law_autostopwords_5.py
Last active April 23, 2022 12:33
How to Automatically Build Stopwords
# Accumulator list, created empty here (populated by later code outside this excerpt)
l_data = []
# Count stored in the first entry of top_words
# NOTE(review): presumably the highest word frequency, assuming top_words is the
# output of collections.Counter.most_common() (sorted most-common first) — confirm
max_freq = top_words[0][1]
# Alpha, fixed to 1
# NOTE(review): presumably the exponent of the Zipf's-law curve — confirm against the loop that uses it
alpha = 1
# Loop
@audhiaprilliant
audhiaprilliant / zipfs_law_autostopwords_6.py
Last active April 23, 2022 12:33
How to Automatically Build Stopwords
# Data viz
plotnine.options.figure_size = (10, 4.8)
(
ggplot(
data = df[:20]
)+
geom_bar(
aes(
x = 'word',
y = 'actual_freq'
@audhiaprilliant
audhiaprilliant / zipfs_law_autostopwords_7.py
Last active April 25, 2022 14:46
How to Automatically Build Stopwords
# Data viz
plotnine.options.figure_size = (10, 4.8)
(
ggplot(
data = df
)+
geom_line(
aes(
x = 'rank',
y = 'zipf_freq',
@audhiaprilliant
audhiaprilliant / zipfs_law_autostopwords_8.py
Last active April 25, 2022 14:46
How to Automatically Build Stopwords
# Fit the ideal Zipf's line in log-log space: log(zipf_freq) regressed on log(rank).
# The rank values are reshaped into a single-column 2-D array before taking the log.
linear = LinearRegression()
linear.fit(
    np.log(np.array(df['rank']).reshape(-1, 1)),
    np.log(df['zipf_freq']),
)
# Print slope and intercept
print('Intercept: {intercept}\nSlope: {slope}'.format(
intercept = linear.intercept_,
slope = linear.coef_[0]
@audhiaprilliant
audhiaprilliant / zipfs_law_autostopwords.ipynb
Last active April 25, 2022 14:53
How to Automatically Build Stopwords
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@audhiaprilliant
audhiaprilliant / monty_hall_problem_1.py
Created April 25, 2022 15:29
Simulation of Monty Hall Problem
# Data frame manipulation
import pandas as pd
# Mathematical operations
import numpy as np
# Data visualization
import plotnine
from plotnine import *