scrape front covers from Charlie Hebdo – source: http://stripsjournal.canalblog.com/
#
# download 338 Charlie Hebdo covers with keywords
#
library(dplyr)
library(XML)
library(lubridate)
library(stringr)
library(ggplot2)
source("charlie-themes.r")
dir.create("pages", showWarnings = FALSE)
dir.create("covers", showWarnings = FALSE)
covers = "charlie.csv"
if(!file.exists(covers)) {
  d = data.frame()
  for(i in seq(330, 0, -10)) {
    cat("Parsing page", sprintf("%2.0f", i / 10))
    file = paste0("pages/page-", i, ".html")
    if(!file.exists(file))
      h = try(download.file(paste0("http://stripsjournal.canalblog.com/tag/Les%20Unes%20de%20Charlie%20Hebdo/p",
                                   i, "-0.html"), file, mode = "wb", quiet = TRUE), silent = TRUE)
    # guard against failed downloads: the file might be missing or empty
    if(!file.exists(file) || !file.info(file)$size) {
      cat(": failed\n")
      if(file.exists(file)) file.remove(file)
    } else {
      h = htmlParse(file)
      addr = xpathSApply(h, "//div[@class='blogbody']//meta[@itemprop='url']/@content")
      date = xpathSApply(h, "//div[@class='blogbody']//meta[@itemprop='dateCreated']/@content")
      titl = xpathSApply(h, "//div[@class='blogbody']//h3", xmlValue)
      dat = gsub("(.*) - (.*) - (.*)", "\\3", titl)
      dat = gsub("(.*)- (.*)", "\\2", dat)
      dat = gsub("octore", "octobre", dat)
      dat = gsub("sept\\.", "sep.", dat)
      dat = parse_date_time(gsub("\\.", "", dat), "%d %m %y", locale = "fr_FR")
      dat = as.Date(dat)
      # fix one unparsable date
      dat[ addr == "http://stripsjournal.canalblog.com/archives/2012/11/14/26045046.html" ] =
        as.Date("2012-11-14")
file = gsub("(.*) - (.*) - (.*)", "\\2 - \\1", titl) | |
file = gsub("Charlie Hebdo Nª", "", file) | |
file = paste(dat, file) | |
kwd = xpathSApply(h, "//div[@class='blogbody']//h3/following-sibling::div[@class='itemfooter'][1]") | |
kwd = lapply(kwd, xpathSApply, "a[@rel='tag']", xmlValue) | |
# fix a few keywords | |
kwd = lapply(kwd, function(x) { | |
y = tolower(x[ !x %in% c("Charlie Hebdo", "Les Unes de Charlie Hebdo") ]) | |
y[ y == "aubry" ] = "martine aubry" | |
y[ y == "crise économique 2009..." ] = "crise économique 2009" | |
y[ y == "tapie" ] = "bernard tapie" | |
y[ y == "emmigration" ] = "émigration" | |
y[ y == "univercité" ] = "université" | |
return(gsub("œ", "oe", y)) | |
}) | |
kwd = sapply(kwd, paste0, collapse = ";") | |
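      # keywords are stored as a single ";"-separated string per cover and
      # split again downstream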
      aut = xpathSApply(h, "//div[@class='blogbody']//h3/following-sibling::div[@class='itemfooter'][2]")
      aut = lapply(aut, xpathSApply, "a[contains(@href, 'archives')]", xmlValue)
      aut = sapply(aut, head, 1)
      # img = xpathSApply(h, "//div[@class='blogbody']//h3/following-sibling::p//img/@src")
      img = xpathSApply(h, "//div[@class='blogbody']//h3/following-sibling::div[@class='itemfooter'][1]/following-sibling::p[2]")
      img = sapply(img, xpathSApply, "a/img/@src")
      img = sapply(img, function(x) ifelse(is.null(x), NA, x))
      # fix six parser errors
      img[ addr == "http://stripsjournal.canalblog.com/archives/2012/04/30/24141374.html" ] =
        "http://p0.storage.canalblog.com/06/27/177230/75268181.jpg"
      img[ addr == "http://stripsjournal.canalblog.com/archives/2013/11/20/28478698.html" ] =
        "http://p2.storage.canalblog.com/22/38/177230/91674307.jpg"
      img[ addr == "http://stripsjournal.canalblog.com/archives/2013/11/12/28419822.html" ] =
        "http://p1.storage.canalblog.com/15/42/177230/91450058_o.jpg"
      img[ addr == "http://stripsjournal.canalblog.com/archives/2013/11/05/28369503.html" ] =
        "http://p2.storage.canalblog.com/28/47/177230/91251269_o.jpg"
      img[ addr == "http://stripsjournal.canalblog.com/archives/2013/10/29/28316947.html" ] =
        "http://p4.storage.canalblog.com/44/10/177230/91032930_o.jpg"
      img[ addr == "http://stripsjournal.canalblog.com/archives/2015/01/12/31306656.html" ] =
        "http://p6.storage.canalblog.com/68/20/177230/101510919_o.png"
      file = paste0(file, gsub("(.*)\\.(gif|jpg|png)", ".\\2", img))
      d = rbind(d, data.frame(post_page = i,
                              post_url = addr, post_date = date, post_title = titl,
                              date = dat, tags = kwd, author = aut, image = img,
                              file, stringsAsFactors = FALSE))
      cat(":", sprintf("%3.0f", nrow(d)), "total covers\n")
    }
  }
  # fix three missing authors
  d$author[ d$author == "Index Dessinateurs" ] = NA
  d$author[ d$post_url == "http://stripsjournal.canalblog.com/archives/2009/10/28/30971837.html" ] =
    "Luz"
  d$author[ d$post_url == "http://stripsjournal.canalblog.com/archives/2014/05/21/30964112.html" ] =
    "Cabu"
  d$author[ d$post_url == "http://stripsjournal.canalblog.com/archives/2014/05/14/30964082.html" ] =
    "Cabu"
  write.csv(d, covers, row.names = FALSE)
}
d = read.csv(covers, stringsAsFactors = FALSE)
for(i in which(!is.na(d$image))) {
  file = paste0("covers/", d$file[ i ])
  if(!file.exists(file))
    try(download.file(d$image[ i ], file, quiet = TRUE), silent = TRUE)
}
# dates
print(table(year(d$date), exclude = NULL))
d$year = year(d$date)
d$quarter = paste0(d$year, "_", quarter(d$date))
# issue numbers
d$num = str_extract(d$post_title, "°\\d+")
d$num[ is.na(d$num) ] = str_extract(d$post_title[ is.na(d$num) ], "Charlie Hebdo \\d+")
d$num = gsub("\\D", "", d$num)
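# sanity check: every cover should carry a distinct issue number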
stopifnot(n_distinct(d$num) == nrow(d))
# authors
print(table(d$author, exclude = NULL))
# keywords
terms = unlist(strsplit(d$tags, ";"))
cat("\n", length(unique(terms)), "unique keywords:\n")
print(table(terms)[ table(terms) > quantile(table(terms), .99) ])
# themes
stopifnot(!length(terms[ is.na(themes(terms)) ]))
# complete edge list
full = data.frame()
for(i in 1:nrow(d)) {
  y = unlist(strsplit(d$tags[ i ], ";"))
  full = rbind(full, data.frame(
    expand.grid(i = y, j = y, stringsAsFactors = FALSE),
    author = d$author[ i ],
    quarter = d$quarter[ i ],
    weight = 1 / (1 + str_count(d$tags[ i ], ";")) # inverse weighting
  ))
}
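# NB: a cover with k tags contributes k^2 (i, j) pairs above, each weighted
# 1 / k, so heavily-tagged covers do not dominate the co-occurrence counts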
csv = data.frame(tag = unique(terms), theme = themes(unique(terms))) %>%
  arrange(tag)
head(csv)
csv$nums = NA
K = strsplit(d$tags, ";")
for(i in csv$tag) {
  k = lapply(K, function(x) i %in% x)
  k = as.numeric(d$num[ which(sapply(k, isTRUE)) ])
  if(length(k))
    csv$nums[ csv$tag == i ] = paste0(k[ order(k) ], collapse = ";")
}
write.csv(csv, "charlie-tags.csv", row.names = FALSE)
#
# Thematic plots
#
th = c(
  "Armée/Police" = "Varia",
  "Capitalisme" = "Varia",
  "France" = "Varia",
  "International" = "International",
  "Medias" = "Varia",
  "Politique" = "Politique",
  "Religion" = "Religion",
  "Showbiz" = "Varia",
  "Terrorisme" = "Violence",
  "Varia" = "Varia",
  "Violence" = "Violence"
)
yy = lapply(d$tags, function(x) {
  y = unlist(strsplit(x, ";"))
  y = th[ themes(y, simplify = FALSE) ]
  unique(y)
})
dd = data.frame()
for(i in unique(d$year)) {
  y = unlist(yy[ d$year == i ])
  print(i)
  print(table(y))
  dd = rbind(dd, data.frame(year = i, theme = names(table(y)),
                            freq = as.vector(table(y)),
                            prop = as.vector(table(y) / sum(table(y)))))
}
colors = c(
  "Politique" = "#80b1d3",     # Set3 / light blue
  "International" = "#fdb462", # Set3 / light orange
  "Religion" = "#4daf4a",      # Set1 / bright green
  "Terrorisme" = "#e41a1c",    # Set1 / bright red
  "Varia" = "#999999",         # Set1 / grey
  "Violence" = "#984ea3"       # Set1 / bright purple
)
qplot(data = dd, fill = theme, y = prop, x = factor(year), stat = "identity", geom = "bar") +
  scale_fill_manual("Thème", values = colors) +
  theme_bw() +
  theme(panel.grid = element_blank()) +
  labs(y = "Fréquence relative\n", x = "\nAnnée")
ggsave("plots/charlie_themes_prop.pdf", width = 9, height = 8)
qplot(data = dd, fill = theme, y = freq, x = factor(year), stat = "identity", geom = "bar") +
  scale_fill_manual("Thème", values = colors) +
  theme_bw() +
  theme(panel.grid = element_blank()) +
  labs(y = "Fréquence brute\n", x = "\nAnnée")
ggsave("plots/charlie_themes_freq.pdf", width = 9, height = 8)
# done
#
# quick and dirty ERGM specification for the co-occurrence network
# NB: to be run after the exploratory models in charlie-models.r
#
# edge list
e = subset(full[, 1:2 ], i != j)
e$u = apply(e, 1, function(x) paste0(sort(x), collapse = "///"))
e = data.frame(table(e$u))
e = data.frame(i = gsub("(.*)///(.*)", "\\1", e$Var1),
               j = gsub("(.*)///(.*)", "\\2", e$Var1),
               n = e$Freq / 2)
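# NB: expand.grid() produced both (i, j) and (j, i) for every pair, so sorting
# each pair and halving the counts collapses them into undirected edges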
# network
n = network(e[, 1:2 ], directed = FALSE)
t = themes(network.vertex.names(n), simplify = FALSE)
n %n% "covers" = 338
n %v% "theme" = t
set.edge.attribute(n, "weight", ifelse(e$n > 10, 10, e$n))
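# edge weights are capped at 10: only the ten pairs listed below co-occur
# more often than that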
print(table(n %e% "weight"))
# > subset(e, n > 10)
#                        i                   j  n
# 779            élections      présidentielle 23
# 781            élections présidentielle 2012 22
# 789            élections             sarkozy 17
# 871       extrême droite                  fn 17
# 880       extrême droite       marine le pen 14
# 961                   fn       marine le pen 15
# 1189          intégrisme            religion 17
# 1563 présidentielle 2012             sarkozy 15
# 1571      présidentielle présidentielle 2012 22
# 1575      présidentielle             sarkozy 15
th = c(
  "Armée/Police" = "Varia",
  "Capitalisme" = "Varia",
  "France" = "Varia",
  "International" = "International",
  "Medias" = "Varia",
  "Politique" = "Politique",
  "Religion" = "Religion",
  "Showbiz" = "Varia",
  "Terrorisme" = "Violence",
  "Varia" = "Varia",
  "Violence" = "Violence"
)
# collapse themes
n %v% "theme" = th[ n %v% "theme" ]
print(table(n %v% "theme", exclude = NULL))
E = ergm(n ~ nodemix("theme"))
b = which(names(coef(E)) %in% c("mix.theme.France.France",
                                "mix.theme.International.International",
                                "mix.theme.Politique.Politique",
                                "mix.theme.Religion.Religion",
                                "mix.theme.Showbiz.Showbiz",
                                "mix.theme.Terrorisme.Terrorisme",
                                "mix.theme.Varia.Varia",
                                "mix.theme.Violence.Violence"))
b = c(b, which(grepl("Varia", names(coef(E)))))
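# b indexes every within-theme mixing term plus all terms involving Varia;
# passing it as `base` below drops those terms from nodemix(), leaving only
# between-theme mixing among the substantive themes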
#
# Baseline model
#
B = ergm(n ~ edges +
           nodefactor("theme", base = 4) +
           nodematch("theme", diff = TRUE, keep = c(1:3, 5)) +
           nodemix("theme", base = b),
         parallel = 4,
         control = control.ergm(seed = 4575,
                                MCMLE.termination = "precision",
                                MCMLE.effectiveSize = 50,   # enable adaptive burnin and interval
                                MCMLE.MCMC.precision = .01, # control how much noise is tolerable
                                MCMLE.maxit = 50))
print(summary(B))
#
# GWDSP-adjusted model
#
D = ergm(n ~ edges +
           nodefactor("theme", base = 4) +
           nodematch("theme", diff = TRUE, keep = c(1:3, 5)) +
           nodemix("theme", base = b) +
           gwdsp(1, fixed = TRUE),
         parallel = 4,
         control = control.ergm(seed = 4575,
                                MCMLE.termination = "precision",
                                MCMLE.effectiveSize = 50,   # enable adaptive burnin and interval
                                MCMLE.MCMC.precision = .01, # control how much noise is tolerable
                                MCMLE.maxit = 50))
print(summary(D))
#
# GWD-adjusted models, at various values of alpha decay
#
a = seq(0, 5, by = 0.1)
alphas = c()
L = list()
for(i in a) {
  cat("\nAttempting to fit at decay alpha =", i, "\n")
  E = try(ergm(n ~ edges +
                 nodefactor("theme", base = 4) +
                 nodematch("theme", diff = TRUE, keep = c(1:3, 5)) +
                 nodemix("theme", base = b) +
                 gwdegree(i, fixed = TRUE), # gwesp(1, fixed = TRUE), gwdsp(1, fixed = TRUE)
               parallel = 4,
               control = control.ergm(seed = 4575,
                                      MCMLE.termination = "precision",
                                      MCMLE.effectiveSize = 50,   # enable adaptive burnin and interval
                                      MCMLE.MCMC.precision = .01, # control how much noise is tolerable
                                      MCMLE.maxit = 50)), silent = TRUE)
  if(!"try-error" %in% class(E)) {
    print(summary(E))
    L[[ which(a == i) ]] = E
    alphas = c(alphas, i)
  }
}
L = L[ !sapply(L, is.null) ]
r = data.frame()
for(i in 1:length(L)) {
  r = rbind(r, data.frame(
    x = names(coef(L[[ i ]])),
    b = summary(L[[ i ]])$coefs[, 1],
    se = summary(L[[ i ]])$coefs[, 2],
    alpha = alphas[ i ],
    stringsAsFactors = FALSE))
}
r$ub = r$b + 2 * r$se
r$lb = r$b - 2 * r$se
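# lb/ub are approximate 95% confidence bounds (estimate ± 2 standard errors)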
# see how the GWD term moves: a < 1 and a > 3 are alright
qplot(data = subset(r, grepl("gwd", x)),
      x = factor(alpha), y = b, ymin = lb, ymax = ub, geom = "pointrange") +
  geom_hline(yintercept = 0, lty = "dashed") +
  facet_wrap(~ alpha, scales = "free", nrow = 1) +
  labs(y = "Coefficient estimate\n", x = "\nGWD alpha decay parameter") +
  theme_bw() +
  theme(panel.grid = element_blank())
r = subset(r, alpha < 1 | alpha > 3)
qplot(data = subset(r, grepl("nodefactor", x)),
      x = factor(alpha), y = b, ymin = lb, ymax = ub, geom = "pointrange") +
  geom_hline(yintercept = 0, lty = "dashed") +
  facet_wrap(~ x) +
  theme_bw()
qplot(data = subset(r, grepl("nodematch", x)),
      x = factor(alpha), y = b, ymin = lb, ymax = ub, geom = "pointrange") +
  geom_hline(yintercept = 0, lty = "dashed") +
  facet_wrap(~ x) +
  theme_bw()
qplot(data = subset(r, grepl("mix", x)),
      x = factor(alpha), y = b, ymin = lb, ymax = ub, geom = "pointrange") +
  geom_hline(yintercept = 0, lty = "dashed") +
  facet_wrap(~ x) +
  labs(y = "Coefficient estimate\n", x = "\nGWD alpha decay parameter") +
  theme_bw() +
  theme(panel.grid = element_blank())
save(B, D, L, alphas, n, file = "charlie-ergm.rda")
# have a nice day
#
# quick and dirty ERGM specifications for the co-occurrence networks
#
library(ergm)
library(ergm.count)
coefs = data.frame()
for(i in ls(pattern = "^net_")) {
  nn = get(i)
  # collapse general themes
  nn %v% "theme" = ifelse(nn %v% "theme" %in% c("France", "International"), "Varia", nn %v% "theme")
  cat("\n", i, ":\n\n")
  print(table(nn %v% "theme", exclude = NULL))
  E = ergm(nn ~ nodemix("theme"))
  b = which(names(coef(E)) %in% c("mix.theme.France.France", # absent after the collapse above
                                  "mix.theme.International.International",
                                  "mix.theme.Religion.Religion",
                                  "mix.theme.Terrorisme.Terrorisme",
                                  "mix.theme.Varia.Varia",
                                  "mix.theme.Violence.Violence"))
  b = c(b, which(grepl("Varia", names(coef(E)))))
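  # as in the baseline ERGM script: exclude within-theme terms and all terms
  # involving Varia from the nodemix() estimates below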
  if(!exists(paste0("ergm_", i))) {
    # binary model for all networks
    E = ergm(nn ~ edges + nodefactor("theme") + nodemix("theme", base = b),
             control = control.ergm(seed = 4575,
                                    MCMLE.termination = "precision",
                                    MCMLE.effectiveSize = 50,   # enable adaptive burnin and interval
                                    MCMLE.MCMC.precision = .01, # control how much noise is tolerable
                                    MCMLE.maxit = 50))
    # add weighted model for author-specific networks
    if(!grepl("\\d", i)) {
      W = try(ergm(nn ~ nonzero + sum + nodefactor("theme") + nodemix("theme", base = b),
                   response = "weight", reference = ~ Poisson,
                   control = control.ergm(seed = 4575,
                                          MCMC.prop.weights = "0inflated",
                                          MCMLE.trustregion = 1000,
                                          MCMLE.termination = "precision",
                                          MCMLE.effectiveSize = 50,   # enable adaptive burnin and interval
                                          MCMLE.MCMC.precision = .01, # control how much noise is tolerable
                                          MCMLE.maxit = 50)))
      assign(paste0("wergm_", i), W)
    }
  } else {
    E = get(paste0("ergm_", i))
  }
  assign(paste0("ergm_", i), E)
  coefs = rbind(coefs, data.frame(
    x0 = "B",
    network = gsub("net_", "", i),
    x = names(coef(E)),
    b = coef(E),
    se = summary(E)$coefs[ , 2 ],
    stringsAsFactors = FALSE
  ))
  if(exists(paste0("wergm_", i))) {
    W = get(paste0("wergm_", i))
    if(!"try-error" %in% class(W))
      coefs = rbind(coefs, data.frame(
        x0 = "W",
        network = gsub("net_", "", i),
        x = names(coef(W)),
        b = coef(W),
        se = summary(W)$coefs[ , 2 ],
        stringsAsFactors = FALSE
      ))
  }
}
#
# temporal ERGM results
#
qplot(data = subset(coefs, grepl("20(09|10|11|12|13|14)", network) &
                      x0 == "B" & grepl("mix", x) & !is.infinite(b) & se < 10),
      y = b, x = x0, color = x) +
  geom_segment(aes(xend = x0, y = b - 2 * se, yend = b + 2 * se)) +
  geom_hline(yintercept = 0, lty = "dashed") +
  facet_grid(x ~ network) +
  scale_color_brewer("Coefficients", palette = "Set1") +
  labs(x = NULL, y = "log Pr( co-occurrence )\n") +
  theme_bw() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        panel.grid = element_blank(),
        legend.position = "bottom")
ggsave("ergm_time.pdf", width = 9, height = 8)
#
# author-specific ERGM results, binary and valued
#
coefs$x = gsub("sum\\.", "", coefs$x)
qplot(data = subset(coefs, grepl("Cabu|Charb|Riss|Luz", network) &
                      grepl("mix", x) & !is.infinite(b) & se < 10),
      y = b, x = x0, color = x) +
  geom_segment(aes(xend = x0, y = b - 2 * se, yend = b + 2 * se)) +
  geom_hline(yintercept = 0, lty = "dashed") +
  facet_grid(x ~ network) +
  scale_color_brewer("Coefficients", palette = "Set1") +
  labs(x = NULL, y = "log Pr( co-occurrence )\n") +
  theme_bw() +
  theme(#axis.text.x = element_blank(),
        #axis.ticks.x = element_blank(),
        panel.grid = element_blank(),
        legend.position = "bottom")
ggsave("ergm_authors.pdf", width = 9, height = 8)
# kthxbye
#
# build temporal and author-specific co-occurrence networks
#
library(animation)
library(dplyr)
library(GGally)
library(ggplot2)
library(network)
source("charlie-data.r")
dir.create("plots", showWarnings = FALSE)
colors = c(
  "France" = "#80b1d3",        # Set3 / light blue
  "International" = "#fdb462", # Set3 / light orange
  "Religion" = "#4daf4a",      # Set1 / bright green
  "Terrorisme" = "#e41a1c",    # Set1 / bright red
  "Varia" = "#999999",         # Set1 / grey
  "Violence" = "#984ea3"       # Set1 / bright purple
)
#
# time-specific term networks
#
prop = data.frame()
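# build one network per quarter with more than 10 covers, plus 2015_1, which
# is retained regardless (presumably to keep the early-2015 covers despite
# their small count)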
for(k in names(table(d$quarter)[ table(d$quarter) > 10 | names(table(d$quarter)) == "2015_1" ])) {
  # remove self-loops and select quarter
  e = subset(full, i != j & quarter == k)[, 1:2 ]
  cat("Building:", k, nrow(e), "edges\n")
  # aggregate similar rows
  e$u = apply(e, 1, function(x) paste0(sort(x), collapse = "///"))
  e = data.frame(table(e$u))
  e = data.frame(i = gsub("(.*)///(.*)", "\\1", e$Var1),
                 j = gsub("(.*)///(.*)", "\\2", e$Var1),
                 n = e$Freq / 2)
  n = network(e[, 1:2 ], directed = FALSE)
  t = themes(network.vertex.names(n))
  n %n% "period" = k
  n %v% "theme" = t
  set.edge.attribute(n, "weight", e$n)
  ## print(table(n %e% "weight"))
  # compare network dimensions
  nv0 = network.size(n)
  ne0 = network.edgecount(n)
  nd = network.density(n)
  g = ggnet(n, label.nodes = network.vertex.names(n),
            node.group = t, node.color = colors[ names(colors) %in% t ],
            size = 0, label.size = 4, segment.color = "grey50",
            segment.alpha = .5) +
    theme(legend.key = element_blank(), legend.position = "bottom") +
    ggtitle(paste("Période", gsub("_", "-", k),
                  "n =", table(d$quarter)[ names(table(d$quarter)) == k ],
                  "'unes'", nv0, "mots-clés", ne0, "co-occurrences\n"))
  ggsave(paste0("plots/network_", k, ".pdf"), g, width = 12, height = 12)
  assign(paste0("plot_", k), g)
  assign(paste0("net_", k), n)
  assign(paste0("edges_", k), e)
  # subset to selected themes
  t = t %in% c("Religion", "Terrorisme", "Violence")
  delete.vertices(n, which(!t))
  prop = rbind(prop, data.frame(t = k,
                                n = table(d$quarter)[ names(table(d$quarter)) == k ],
                                d = nd,
                                nv0, nv1 = network.size(n),
                                ne0, ne1 = network.edgecount(n),
                                stringsAsFactors = FALSE))
  print(tail(prop, 1))
}
stop() # NB: halts sourcing here; the plots below are presumably run by hand
# dimensions
summary(prop$d)
prop$pv = with(prop, nv1 / nv0)
prop$pe = with(prop, ne1 / ne0)
# plot
prop$year = substr(prop$t, 1, 4)
prop$quarter = substr(prop$t, 6, 7)
prop = arrange(prop, -pe) %>%
  mutate(color = 1:n()) %>%
  mutate(color = cut(color, breaks = c(1, 3, 7, Inf), include.lowest = TRUE))
qplot(data = subset(prop, year %in% 2009:2014),
      x = quarter, y = pe, group = year, fill = color,
      stat = "identity", geom = "bar") +
  geom_hline(yintercept = prop$pe[ prop$t == "2015_1" ], lty = "dashed") +
  facet_wrap(~ year, nrow = 1) +
  scale_fill_manual("", values = c("[1,3]" = "#2c7fb8", "(3,7]" = "#7fcdbb", "(7,Inf]" = "#edf8b1")) +
  guides(fill = FALSE) +
  labs(y = "Proportion des liens 'Religion, Terrorisme, Violence'\n",
       x = "\nPériode annuelle : 1. janvier-mars 2. avril-juin 3. juillet-septembre 4. octobre-décembre") +
  theme_bw() +
  theme(panel.grid.major.x = element_blank())
ggsave("plots/prop_edges.pdf", width = 9, height = 7)
prop = arrange(prop, -pv) %>%
  mutate(color = 1:n()) %>%
  mutate(color = cut(color, breaks = c(1, 3, 7, Inf), include.lowest = TRUE))
qplot(data = subset(prop, year %in% 2009:2014),
      x = quarter, y = pv, group = year, fill = color,
      stat = "identity", geom = "bar") +
  geom_hline(yintercept = prop$pv[ prop$t == "2015_1" ], lty = "dashed") +
  facet_wrap(~ year, nrow = 1) +
  scale_fill_manual("", values = c("[1,3]" = "#2c7fb8", "(3,7]" = "#7fcdbb", "(7,Inf]" = "#edf8b1")) +
  guides(fill = FALSE) +
  labs(y = "Proportion des mots-clés 'Religion, Terrorisme, Violence'\n",
       x = "\nPériode annuelle : 1. janvier-mars 2. avril-juin 3. juillet-septembre 4. octobre-décembre") +
  theme_bw() +
  theme(panel.grid.major.x = element_blank())
ggsave("plots/prop_nodes.pdf", width = 9, height = 7)
# movie
saveGIF({
  for(i in sort(ls(pattern = "plot_\\d+")))
    print(get(i))
}, movie.name = "network_quarters.gif", ani.width = 600, ani.height = 600)
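# NB: saveGIF() shells out to an external converter (ImageMagick by default,
# configurable via ani.options("convert")), which must be installed separately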
#
# author-specific term networks
#
for(k in names(table(d$author)[ table(d$author) > 50 ])) {
  # remove self-loops and select author
  e = subset(full, i != j & author == k)[, 1:2 ]
  cat("Building:", k, nrow(e), "edges\n")
  # aggregate similar rows
  e$u = apply(e, 1, function(x) paste0(sort(x), collapse = "///"))
  e = data.frame(table(e$u))
  e = data.frame(i = gsub("(.*)///(.*)", "\\1", e$Var1),
                 j = gsub("(.*)///(.*)", "\\2", e$Var1),
                 n = e$Freq / 2)
  n = network(e[, 1:2 ], directed = FALSE)
  t = themes(network.vertex.names(n))
  n %n% "period" = k
  n %v% "theme" = t
  set.edge.attribute(n, "weight", e$n)
  ## print(table(n %e% "weight"))
  # build the plot before storing it (as in the quarterly loop above)
  g = ggnet(n, label.nodes = network.vertex.names(n),
            node.group = t, node.color = colors[ names(colors) %in% t ],
            size = 0, label.size = 4, segment.color = "grey50",
            segment.alpha = .5) +
    theme(legend.key = element_blank(), legend.position = "bottom") +
    ggtitle(paste(k, ", n =", table(d$author)[ names(table(d$author)) == k ],
                  "'unes'", network.size(n), "mots-clés",
                  network.edgecount(n), "co-occurrences\n"))
  ggsave(paste0("plots/network_", k, ".pdf"), g, width = 12, height = 12)
  assign(paste0("plot_", k), g)
  assign(paste0("net_", k), n)
  assign(paste0("edges_", k), e)
}
# done
#
# themes(): map each keyword to a broad theme
#
themes <- function(x, simplify = TRUE) {
  y = rep(NA, length(x))
  # France
  y[ x %in% c("bretagne", "corse", "guadeloupe", "marseille", "nice", "paris",
              "provence-alpes-côte d'azur", "régions françaises",
              "toulouse") ] = "France"
  # French politics
  y[ x %in% c("allègre", "bayrou", "bernard tapie",
              "bernadette chirac", "besson", "bettencourt", "borloo", "boutin",
              "cécile duflot", "chirac", "christiane taubira", "clearstream",
              "cohn-bendit", "copé", "darcos", "dati", "de gaulle", "droite",
              "dsk", "elysée", "européennes", "eva joly", "extrême droite",
              "fabius", "fillon", "florian philippot", "fn", "françois hollande",
              "françois mitterrand", "frédéric mitterrand", "gauche", "grenelle",
              "guéant", "hortefeux", "hulot", "jean-luc mélenchon", "jean-marc ayrault",
              "jean-yves le drian", "jérôme cahuzac", "juppé", "kouchner",
              "lagarde", "le pen", "législatives", "les verts", "luc chatel",
              "luc ferry", "mam", "manuel valls", "marine le pen", "martine aubry",
              "michel sapin", "ministre", "montebourg", "municipales",
              "nadine morano", "najat belkacem", "nathalie kosciusko-morizet",
              "patrick buisson", "pierre moscovici", "président", "présidentielle",
              "présidentielle 2007", "présidentielle 2012", "présidentielle 2017",
              "primaires", "ps", "rama yade", "régionales", "rolex",
              "roselyne bachelot", "sarkozy", "ségolène royal", "sénat", "sénateur",
              "ump", "valérie trierweiler", "villepin", "woerth",
              "xavier darcos") ] = "Politique"
  # economy: banks, capitalism, industry
  y[ x %in% c("air france", "airbus", "banques", "bernard arnault", "bnp", "bourse",
              "capitalisme", "crédit lyonnais", "crise", "crise économique 2009",
              "crise financière 2008", "entreprises", "florange", "fmi", "krach", "lcl",
              "les firmes", "les riches", "medef", "parisot", "stock-options", "trader",
              "wall street") ] = "Capitalisme"
  # showbiz and the like
  y[ x %in% c("anne sinclair", "bigard", "carla bruni", "closer", "delarue",
              "dieudonné", "frigide barjot", "galliano", "gérard depardieu",
              "georges lautner", "houellebecq",
              "jean-pierre pernaut", "johnny hallyday", "madonna", "michael jackson",
              "mireille mathieu", "miss france", "polanski", "raymond domenech", "yannick noah",
              "zemmour") ] = "Showbiz"
  # international politics
  y[ x %in% c("afghanistan", "ahmadinejad", "algérie", "allemagne", "bachar al-assad",
              "belgique", "ben ali", "berlusconi", "brésil", "bush", "durban", "egypte",
              "gaza", "grèce", "haïti", "irak", "israël", "israël-palestine", "japon",
              "kadhafi", "les dictatures", "libye", "mali", "maroc", "merkel", "mitt romney",
              "moubarak", "nigeria", "norvège", "obama", "palestine", "poutine", "pussy riot", "qatar",
              "rio", "roumanie", "russie", "suisse", "syrie", "tunisie", "ukraine", "usa") ] = "International"
  # terrorism
  y[ x %in% c("al-qaida", "11 septembre", "ben laden", "jihad", "mohamed merah", "otage",
              "terrorisme") ] = "Terrorisme"
  # religion
  y[ x %in% c("antisémitisme", "athées", "benoit xvi", "bible", "burka", "charia", "coran",
              "dieu", "évêque", "françois 1er", "intégrisme", "jesus", "laïcité", "lourdes",
              "mahomet", "opus dei", "pape", "prophète", "ramadan", "religion", "torah",
              "vatican", "voile") ] = "Religion"
  # army and police
  y[ x %in% c("14 juillet", "armée", "armes", "crs", "police") ] = "Armée/Police"
  # violence
  y[ x %in% c("assassinat", "attentats", "discrimination", "guerre", "homophobie",
              "incendie criminel", "massacre", "parricide", "pédophilie", "sexisme",
              "torture", "tueur de masse", "violence", "xénophobie") ] = "Violence"
  # media
  y[ x %in% c("cavanna", "le monde", "libération", "figaro", "france télévision", "hara-kiri",
              "radio", "radio france", "siné", "télé", "tf1") ] = "Medias"
  # residuals
  # 'bac' is the diploma, not the police unit
  # 'bizutage' is not necessarily violent
  y[ x %in% c("2022", "absentéisme", "agriculture", "alcoolisme",
              "alimentation", "amende", "anniversaire", "astérix", "aviation", "avion", "bac",
              "ballerine", "banditisme", "bateau", "batman", "bizutage", "bonne année", "bonus",
              "budget", "ça déraille", "carbone", "caricature", "catastrophes naturelles",
              "charter", "chômage", "cinéma", "climat", "clonage", "clowns",
              "cocaïne", "colonialisme", "coluche", "concept cars", "consommation",
              "corruption", "crash test", "croissance",
              "délation", "démocratie", "derrick", "dimanche", "disparus", "drh", "drogue",
              "ebola", "école", "écologie", "économie",
              "écoute", "education nationale", "élections", "elvis",
              "émigration", "enseignement", "environnement",
              "épidémie", "essence", "été", "europe", "examen", "expulsions",
              "facebook", "famille", "farine animale", "femen",
              "foot", "france", "france télécom",
              "gigolo", "glou glou", "grève", "grippe",
              "hitler", "humanisme", "immigration", "impôts", "intempéries", "internet",
              "iron man", "jeanne d'arc", "jo", "journal", "journalisme", "journée de...",
              "justice", "l'oréal",
              "la poste", "le cimetière", "livres", "logement", "lois",
              "malaise", "manifestations", "mariage",
              "mc donald's", "médicament", "mer", "mode",
              "nazisme", "négationnisme", "népotisme", "noël",
              "nucléaire", "nudisme", "ouverture",
              "patrimoine", "pétrole", "phallocrate", "pharmaceutique",
              "philosophie", "pip", "pirates", "pitbulls",
              "plage", "pma", "pollution", "poséidon adventure",
              "préservatif", "presse",
              "prisons", "privatisation", "prof", "prostitution", "pub",
              "recyclage", "réforme", "régime",
              "remaniement", "rentrée", "répression", "retraite",
              "révolution", "rigueur", "rumeur", "santé", "science",
              "sdf", "sécurité", "sécurité sociale",
              "sexualité", "smic", "social",
              "soeur emmanuelle", "sondages", "sotchi", "sport", "st valentin",
              "suicide", "surpopulation", "taser", "taxe", "téléphone", "téléthon",
              "tempête", "théâtre", "totalitarisme", "train", "travail", "tva", "twitter",
              "université", "vacances", "vache folle",
              "vie privée", "voeux", "voiture") ] = "Varia"
  # collapse to fewer themes by default; callers that need the full set
  # pass simplify = FALSE
  if(simplify) {
    y[ y %in% c("Showbiz", "Politique") ] = "France"
    y[ y %in% c("Capitalisme", "Armée/Police", "Medias") ] = "Varia"
  }
  return(y)
}
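# quick sanity check of the mapping, with the default simplify = TRUE:
# themes(c("sarkozy", "coran", "ebola")) should return
# c("France", "Religion", "Varia")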