Skip to content

Instantly share code, notes, and snippets.

@briatte
Created May 16, 2013 13:43
Show Gist options
  • Save briatte/5591824 to your computer and use it in GitHub Desktop.
Save briatte/5591824 to your computer and use it in GitHub Desktop.
scrape and plot Beijing's pollution data
# Target data source.
link = "https://raw.github.com/briatte/ida/master/data/beijing.aqi.2013.txt"
file = "data/beijing.aqi.2013.txt"
if(!file.exists(file)) download(link, file)
# Read CSV file.
bp <- read.csv(file, stringsAsFactors = FALSE)
# Check result.
head(bp)
bp$time <- strptime(bp$time, format = "%Y-%m-%d %T")
ggplot(data = bp, aes(x = time, y = PM)) +
geom_line(color = "gray80") +
geom_point(color = "blue", alpha = .5) +
geom_smooth(fill = "lightblue") +
labs(x = NULL, y = "Fine particles (PM2.5) 24hr avg")
# from Jan 6 to May 12
structure(list(PM = c(174.7, 150.3, 203, 118.8, 25.2, 78.5, 135.5,
248.2, 379, 361.2, 333.2, 568.5, 630.6, 416.2, 343.4, 299.5,
157.3, 114.9, 145.3, 112.2, 63, 75.7, 276.8, 333.6, 189.4, 89.9,
107.2, 198.8, 379.3, 378, 180.2, 19.4, 34.7, 73.9, 121.5, 161.3,
244.2, 312.9, 338, 406.3, 461.9, 433.1, 340.5, 277.9, 97.4, 34.5,
22.6, 49, 113.3, 188.8, 131.6, 37.5, 52.1, 147.9, 93.7, 8.8,
12.6, 16.1, 60.3, 130.6, 188.5, 153.5, 59.8, 32.6, 65.2, 100.7,
215.5, 315, 211.2, 70.3, 40.3, 40.5, 80.8, 145.3, 213, 124, 14.7,
22.3, 21.1, 32.9, 83.4, 109.1, 212.6, 206.7, 85, 56.1, 149, 266.1,
332.8, 261.2, 159.4, 153, 216.4, 294.2, 245.7, 294.8, 18, 10.5,
14, 48.8, 103.7, 101, 45.2, 85.8, 183.4, 218, 315.2, 222.1, 125.9,
21.9, 42.1, 126.5, 54.4, 112.8, 170.4, 256.5, 311.4, 265.2, 260.4,
59.6, 64.9, 97.5, 157.6, 64.5, 55.1, 67.9, 113.6, 153.7, 196.5,
92.7, 18.4, 37, 80.9, 95.9, 88.9, 94.5, 126.2, 200.4, 207.3,
95.6, 64.9, 116.7, 144.6, 123.4, 75.2, 83.2, 102.7, 59.7, 13,
19.8, 60.3, 19.9, 14.8, 14.3, 14.6, 18.5, 21.1, 39.2, 56.5, 81.6,
61.5, 61.6, 71.7, 69.2, 48.2, 47.5, 32, 12.1, 75.5, 133, 166.2,
179.1, 199.5, 181, 77.9, 17.1, 20.6, 82.5, 76.2, 39.7, 30, 28.3,
21.4, 35.3, 45.7, 61.5, 82.9, 72.9, 64.1, 84.7, 86.6, 113.8,
186.3, 256.5, 214.3, 166.7, 179, 157.9, 124.3, 67.7, 49.7, 64.2,
48.8, 50.2, 53.2, 41.2, 48.9, 64), time = structure(list(sec = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), min = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
hour = c(12L, 0L, 12L, 0L, 12L, 0L, 12L, 0L, 12L, 0L, 12L,
0L, 12L, 0L, 12L, 0L, 12L, 0L, 12L, 0L, 12L, 0L, 0L, 12L,
0L, 12L, 0L, 0L, 12L, 0L, 12L, 0L, 12L, 0L, 12L, 0L, 12L,
0L, 12L, 0L, 12L, 0L, 12L, 0L, 12L, 0L, 12L, 0L, 12L, 0L,
12L, 0L, 12L, 12L, 0L, 12L, 0L, 12L, 0L, 12L, 0L, 12L, 0L,
12L, 0L, 12L, 12L, 0L, 12L, 0L, 12L, 0L, 12L, 0L, 12L, 0L,
12L, 0L, 12L, 0L, 0L, 12L, 0L, 12L, 0L, 12L, 0L, 12L, 0L,
12L, 0L, 12L, 0L, 12L, 0L, 12L, 12L, 0L, 12L, 0L, 12L, 0L,
12L, 12L, 0L, 12L, 12L, 0L, 12L, 12L, 0L, 0L, 12L, 0L, 12L,
0L, 12L, 0L, 12L, 12L, 0L, 12L, 0L, 0L, 0L, 12L, 0L, 12L,
0L, 0L, 12L, 0L, 12L, 0L, 12L, 0L, 12L, 0L, 12L, 0L, 12L,
0L, 12L, 0L, 12L, 0L, 12L, 0L, 12L, 0L, 12L, 0L, 12L, 0L,
12L, 12L, 0L, 12L, 0L, 12L, 0L, 12L, 0L, 12L, 12L, 0L, 12L,
0L, 0L, 12L, 0L, 12L, 0L, 12L, 0L, 12L, 0L, 0L, 0L, 12L,
0L, 12L, 0L, 12L, 0L, 12L, 0L, 12L, 0L, 12L, 0L, 12L, 0L,
12L, 0L, 12L, 0L, 12L, 0L, 12L, 0L, 12L, 0L, 12L, 0L, 12L,
0L, 12L), mday = c(6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L, 10L,
11L, 11L, 12L, 12L, 13L, 13L, 14L, 14L, 15L, 15L, 16L, 16L,
17L, 18L, 18L, 19L, 19L, 20L, 22L, 22L, 23L, 23L, 24L, 24L,
25L, 25L, 26L, 26L, 27L, 27L, 28L, 28L, 29L, 29L, 30L, 31L,
1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 6L, 6L, 7L, 7L, 8L, 8L,
9L, 9L, 10L, 10L, 11L, 11L, 12L, 13L, 13L, 14L, 14L, 15L,
15L, 16L, 16L, 17L, 17L, 18L, 18L, 19L, 20L, 20L, 21L, 21L,
22L, 22L, 23L, 23L, 24L, 24L, 25L, 25L, 26L, 26L, 27L, 27L,
28L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 5L, 5L, 7L, 8L, 8L, 9L,
10L, 11L, 13L, 14L, 14L, 15L, 15L, 16L, 17L, 19L, 20L, 20L,
21L, 22L, 23L, 24L, 25L, 25L, 26L, 27L, 27L, 28L, 28L, 29L,
29L, 30L, 30L, 31L, 31L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L,
5L, 5L, 6L, 7L, 8L, 8L, 9L, 9L, 10L, 11L, 11L, 12L, 12L,
13L, 14L, 15L, 15L, 16L, 17L, 17L, 18L, 20L, 21L, 22L, 22L,
23L, 23L, 24L, 24L, 25L, 26L, 27L, 28L, 29L, 29L, 30L, 30L,
1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 7L, 7L, 8L,
8L, 9L, 9L, 10L, 10L, 11L, 11L, 12L, 12L), mon = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), year = c(113L,
113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L,
113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L,
113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L,
113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L,
113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L,
113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L,
113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L,
113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L,
113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L,
113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L,
113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L,
113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L,
113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L,
113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L,
113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L,
113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L,
113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L,
113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L,
113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L,
113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L,
113L, 113L, 113L, 113L, 113L, 113L, 113L), wday = c(0L, 1L,
1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 0L, 0L, 1L, 1L,
2L, 2L, 3L, 3L, 4L, 5L, 5L, 6L, 6L, 0L, 2L, 2L, 3L, 3L, 4L,
4L, 5L, 5L, 6L, 6L, 0L, 0L, 1L, 1L, 2L, 2L, 3L, 4L, 5L, 5L,
6L, 6L, 0L, 0L, 1L, 1L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L,
0L, 0L, 1L, 1L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 0L, 0L,
1L, 1L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 0L, 0L, 1L, 1L,
2L, 2L, 3L, 3L, 4L, 5L, 5L, 6L, 6L, 0L, 0L, 1L, 2L, 2L, 4L,
5L, 5L, 6L, 0L, 1L, 3L, 4L, 4L, 5L, 5L, 6L, 0L, 2L, 3L, 3L,
4L, 5L, 6L, 0L, 1L, 1L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L,
0L, 0L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 0L, 1L,
1L, 2L, 2L, 3L, 4L, 4L, 5L, 5L, 6L, 0L, 1L, 1L, 2L, 3L, 3L,
4L, 6L, 0L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 5L, 6L, 0L, 1L, 1L,
2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 0L, 0L, 1L, 1L, 2L,
2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 0L, 0L), yday = c(5L,
6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L, 10L, 11L, 11L, 12L,
12L, 13L, 13L, 14L, 14L, 15L, 15L, 16L, 17L, 17L, 18L, 18L,
19L, 21L, 21L, 22L, 22L, 23L, 23L, 24L, 24L, 25L, 25L, 26L,
26L, 27L, 27L, 28L, 28L, 29L, 30L, 31L, 31L, 32L, 32L, 33L,
33L, 34L, 34L, 35L, 36L, 36L, 37L, 37L, 38L, 38L, 39L, 39L,
40L, 40L, 41L, 41L, 42L, 43L, 43L, 44L, 44L, 45L, 45L, 46L,
46L, 47L, 47L, 48L, 48L, 49L, 50L, 50L, 51L, 51L, 52L, 52L,
53L, 53L, 54L, 54L, 55L, 55L, 56L, 56L, 57L, 57L, 58L, 59L,
59L, 60L, 60L, 61L, 61L, 62L, 63L, 63L, 65L, 66L, 66L, 67L,
68L, 69L, 71L, 72L, 72L, 73L, 73L, 74L, 75L, 77L, 78L, 78L,
79L, 80L, 81L, 82L, 83L, 83L, 84L, 85L, 85L, 86L, 86L, 87L,
87L, 88L, 88L, 89L, 89L, 90L, 90L, 91L, 91L, 92L, 92L, 93L,
93L, 94L, 94L, 95L, 96L, 97L, 97L, 98L, 98L, 99L, 100L, 100L,
101L, 101L, 102L, 103L, 104L, 104L, 105L, 106L, 106L, 107L,
109L, 110L, 111L, 111L, 112L, 112L, 113L, 113L, 114L, 115L,
116L, 117L, 118L, 118L, 119L, 119L, 120L, 120L, 121L, 121L,
122L, 122L, 123L, 123L, 124L, 124L, 125L, 125L, 126L, 126L,
127L, 127L, 128L, 128L, 129L, 129L, 130L, 130L, 131L, 131L
), isdst = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L)), .Names = c("sec", "min", "hour", "mday", "mon",
"year", "wday", "yday", "isdst"), class = c("POSIXlt", "POSIXt"
))), .Names = c("PM", "time"), row.names = c(NA, -208L), class = "data.frame")
## David Ruau's code to scrape and plot Beijing air pollution:
## http://brainchronicle.blogspot.co.uk/2012/07/twitter-analysis-of-air-pollution-in.html
## The code was adapted for the Twitter 1.1 API, which now requires ROAuth.
## You will first need to include individual consumer keys at '###' to run it.
require(twitteR)
require(ROAuth)
require(ggplot2)
require(grid)
## ROAuth.
cred <- OAuthFactory$new(consumerKey= "###",
consumerSecret= "###",
requestURL="https://api.twitter.com/oauth/request_token",
accessURL="https://api.twitter.com/oauth/access_token",
authURL="https://api.twitter.com/oauth/authorize")
cred$handshake()
registerTwitterOAuth(cred)
## Back to David Ruau's code.
pol <- userTimeline('BeijingAir', n=3200)
length(pol)
# 3200
myGrep <- function(x){
grep("PM2.5 24hr avg;", x$getText(), value=T)
}
POL <- unlist(lapply(pol, myGrep))
# cleaning no data tweets
POL <- POL[-grep("No Data", POL)]
# uncomment the following to combine with previous extract
# allPM <- unique(c(allPM, POL))
allPM <- POL
time <- sub("^(.*) to .*", "\\1", allPM)
# to posix time
time <- strptime(time, format="%m-%d-%Y %R")
PM <- as.numeric(sub("^.* 24hr avg; (.*); .*; .*", "\\1", allPM, perl=T))
data <- data.frame(PM=PM, time=time)
data <- data[order(data$time),]
yrange <- c(25, 75, 125, 175, 250, 400)
tsize=4
textPos <- as.POSIXct(strsplit(as.character(min(data$time)), " ")[[1]][1])
p <- qplot(time, PM, data=data, geom=c("blank"), group=1)
p +
labs(x = "Time", y = "Fine particles (PM2.5) 24hr avg", size = expression(log[10](pval))) +
opts(title="Air pollution in Beijing\nTwitter @BeijingAir", panel.background=theme_rect(colour="white")) +
geom_hline(aes(yintercept=50), colour="green", alpha=I(1/5), size=2) +
geom_hline(aes(yintercept=100), colour="yellow", alpha=I(1/5), size=2) +
geom_hline(aes(yintercept=150), colour="orange", alpha=I(1/5), size=2) +
geom_hline(aes(yintercept=200), colour="red", alpha=I(1/5), size=2) +
geom_hline(aes(yintercept=300), colour="darkred", alpha=I(1/5), size=2) +
geom_path(aes(time, PM), data=data, group=1) +
annotate("text", x=textPos, y=yrange[1], label="good", size=tsize, colour="grey70") +
annotate("text", x=textPos, y=yrange[2], label="moderate", size=tsize, colour="grey70") +
annotate("text", x=textPos, y=yrange[3], label="unhealthy", size=tsize, colour="grey70") +
annotate("text", x=textPos, y=yrange[4], label="unhealthy +", size=tsize, colour="grey70") +
annotate("text", x=textPos, y=yrange[5], label="unhealthy ++", size=tsize, colour="grey70") +
annotate("text", x=textPos, y=yrange[6], label="hazardous", size=tsize, colour="grey70") +
opts(title="Air pollution in Beijing\nTwitter @BeijingAir",
panel.background=theme_rect(colour="white"))
## saving
write.csv(data, "data/beijing.aqi.2013.txt", row.names = FALSE)
head(data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment