Last active
March 30, 2021 16:09
-
-
Save lgelape/01cda13cbb9ecf06e9131873d509910b to your computer and use it in GitHub Desktop.
Código de análise de dados da matéria "De cobras a funk: a guinada na comunicação digital do Instituto Butantan", Núcleo Jornalismo
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################################################################################### | |
################################################################################################### | |
######### | |
######### "De cobras a funk: a guinada na comunicação digital do Instituto Butantan" | |
######### | |
######### Lucas Gelape | |
######### Analise de dados da materia publicada no Nucleo Jornalismo | |
######### | |
## Pacotes | |
library(dplyr) | |
library(lubridate) | |
library(stringr) | |
library(tidytext) | |
library(tidyr) | |
library(funprog) | |
library(purrr) | |
library(tm) | |
## Funcoes | |
# "Not in" operator: the element-wise negation of %in%
`%nin%` <- function(x, table) {
  !(x %in% table)
}
# Removes unwanted words (stopwords) from a single text string.
#
# x:                a length-one character vector holding the text to clean.
# palavras_remover: character vector of words to drop; defaults to the
#                   global `stopwords` vector, so existing one-argument
#                   calls keep working.
#
# Returns a length-one character vector with the remaining words joined by
# single spaces.
limpar_texto <- function(x, palavras_remover = stopwords) {
  # Split on any run of whitespace, not just a literal space: otherwise a
  # stopword next to a line break or tab stays glued to its neighbour and
  # is never removed.
  tokens <- unlist(strsplit(x, "[[:space:]]+"))
  # nzchar() drops the empty token produced by leading whitespace
  mantidos <- tokens[!(tokens %in% palavras_remover) & nzchar(tokens)]
  paste(mantidos, collapse = " ")
}
################################################################################################### | |
# Load the tweet dataset (tweets up to 17/03)
twitter <- readRDS("butantan_tweets.rds")

# Shift timestamps back three hours (time-zone adjustment; presumably
# UTC -> UTC-3, TODO confirm) and derive the columns used by the analyses
# and the chart: calendar date, month (as an ordered-by-period factor),
# day of month and engagement (retweets + favourites).
twitter <- twitter %>%
  mutate(
    created_at = created_at - hours(3),
    dia_mes_ano = as.Date(created_at),
    # Levels run June..December then January..March so months sort in
    # chronological order of the analysed period
    mes = factor(
      month(created_at),
      levels = c(6:12, 1:3),
      labels = c(
        "Jun", "Jul", "Ago", "Set", "Out",
        "Nov", "Dez", "Jan", "Fev", "Mar"
      )
    ),
    dia = day(created_at),
    engajamento = retweet_count + favorite_count
  )
### AVERAGE TWEETS PER DAY
# Per-month tweet total divided by the number of distinct days of the month
# on which at least one tweet appears (used in the chart)
tweets_dia <- twitter %>%
  group_by(mes) %>%
  summarise(
    dias = n_distinct(dia),
    total_mes = n(),
    tweets_dia = total_mes / dias
  ) %>%
  ungroup()
### ENGAGEMENT
# Monthly engagement total for Butantan's posts
engajamento_mes_twitter <- twitter %>%
  # Drop retweets so only posts authored by Butantan remain
  filter(is_retweet != "TRUE") %>%
  group_by(mes) %>%
  summarise(engajamento_mensal = sum(engajamento)) %>%
  ungroup()
# Links to the 5 posts with the highest engagement (retweets excluded);
# the result is printed, not stored
twitter %>%
  filter(is_retweet != "TRUE") %>%
  slice_max(order_by = engajamento, n = 5) %>%
  mutate(
    link = paste0("https://twitter.com/butantanoficial/status/", status_id)
  ) %>%
  select(link)
### REPLIES TO USERS
# Monthly count and percentage of tweets that are replies
replies <- twitter %>%
  mutate(
    # 1 when the tweet replies to some account other than "butantanoficial"
    # itself, 0 otherwise (including tweets that are not replies at all)
    resposta = as.numeric(
      !is.na(reply_to_screen_name) &
        reply_to_screen_name != "butantanoficial"
    )
  ) %>%
  group_by(mes) %>%
  summarise(
    resposta = sum(resposta),
    # `resposta` here is the monthly sum computed on the line above
    percentual = (resposta / n()) * 100
  ) %>%
  ungroup()
### WORD COUNT
# Load the general stopword list; the sourced script is expected to define
# `stopwords_pt_final_noaccent`, which is used below.
# NOTE(review): this runs R code fetched from a remote URL -- consider
# vendoring the file locally.
source("https://gist.githubusercontent.com/lgelape/edcc0250f21bcc5710c0a9fd0488d1ea/raw/960e11e214d6a32df5dd38e4d0f251a992b46d57/stopwords_pt.R")

# Extra stopwords identified during this analysis
stopwords_butantan <- data.frame(
  word = c(
    "https", "t.co", "butantan", "equipebutantan", "instituto",
    "10", "120", "12h45", "19", "2020", "oi", "ola", "ne"
  ),
  source = "analise_propria"
)

# Final stopword table (general list + analysis-specific words) and the
# plain character vector of words taken from it
stopwords_pt_final_noaccent <- bind_rows(
  stopwords_pt_final_noaccent,
  stopwords_butantan
)
stopwords <- stopwords_pt_final_noaccent[["word"]]
# Normalise the tweet text: transliterate to ASCII, lower-case it and strip
# punctuation.
# stri_trans_general() lives in stringi, which this script never attaches
# (library(stringr) does not attach it), so it is called with an explicit
# namespace.
# NOTE(review): stripping punctuation before stopword removal presumably
# fuses URLs into single tokens, so the "https" / "t.co" stopwords may never
# match -- confirm.
twitter <- twitter %>%
  mutate(
    text = tolower(stringi::stri_trans_general(text, "Latin-ASCII")),
    text = removePunctuation(text)
  )

# Remove the stopwords from every tweet; map_chr() returns the character
# vector directly (one cleaned string per tweet)
textos_limpos <- map_chr(.x = twitter$text, .f = limpar_texto)

# Store the cleaned text as a column of the dataset
twitter$textos_limpos <- textos_limpos
# The 5 most repeated words in each analysis period
# (period 1: before 2020-12-01; period 2: from that date on)
palavras_mais_repetidas <- twitter %>%
  unnest_tokens(word, textos_limpos) %>%
  mutate(periodo = ifelse(dia_mes_ano < as.Date("2020-12-01"), 1, 2)) %>%
  # Word frequency within each period
  count(periodo, word) %>%
  group_by(periodo) %>%
  # Ties at the cut-off are kept, so a period may yield more than 5 rows
  slice_max(order_by = n, n = 5) %>%
  arrange(desc(n)) %>%
  # NOTE(review): rank() is ascending, so within a period the most frequent
  # word receives the largest `posicao` -- confirm this is intended
  mutate(posicao = rank(n)) %>%
  ungroup()
################################################################################################### | |
# Load the Facebook posts dataset
facebook <- readRDS("butantan_desde0106.rds")

# Derive the columns used in the analysis: calendar date, month number,
# day of month, month as a factor ordered June..March, and engagement
# (sum of shares, comments and every reaction count)
facebook <- facebook %>%
  mutate(
    dia_redondo = as.Date(date),
    mes_lubridate = month(date),
    dia = day(date),
    mes = factor(
      mes_lubridate,
      levels = c(6:12, 1:3),
      labels = c(
        "Jun", "Jul", "Ago", "Set", "Out",
        "Nov", "Dez", "Jan", "Fev", "Mar"
      )
    ),
    engajamento = actual_shareCount + actual_likeCount + actual_loveCount +
      actual_commentCount + actual_wowCount + actual_hahaCount +
      actual_sadCount + actual_angryCount + actual_thankfulCount +
      actual_careCount
  )
### POSTS PER DAY
# Monthly post total and average number of posts per day of the month
posts_dia <- facebook %>%
  group_by(mes) %>%
  summarise(
    # NOTE(review): days_in_month() receives a bare month number here, so
    # no year information is available to it -- confirm February is
    # handled as intended
    dias = unique(days_in_month(mes_lubridate)),
    total_mes = n()
  ) %>%
  ungroup() %>%
  # March is counted as 17 days (presumably because the data ends on
  # 17 March -- confirm)
  mutate(dias = ifelse(mes == "Mar", 17, dias)) %>%
  # The column keeps the name `tweets_dia` although it holds Facebook posts
  mutate(tweets_dia = total_mes / dias)
################################################################################################### |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment