CharlesNepote · April 7, 2019 22:20
diff --git a/off.Rmd b/off.Rmd
 ---
 # title: Open Food Facts growth
 output:
  html_document:
    df_print: paged
    code_folding: hide
    toc: true
    toc_float: true
  html_notebook: default
  pdf_document: default
 fontsize: 8pt
 ---

 This document analyzes [Open Food Facts](https://world.openfoodfacts.org) growth. It is based on the Open Food Facts official [CSV export](https://world.openfoodfacts.org/data).

 This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. The results appear beneath the code. 

 [comment]: # Text after that is never exported whatever output we choose.

 [comment]: # (Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Ctrl+Shift+Enter*.)

 Present code needs some R libraries: `tidyr`, `dplyr`, `lubridate`, `reshape2` and `ggplot2`.
 ```{r error = FALSE, warning = FALSE, message = FALSE, results = "hide"}
 # https://bookdown.org/yihui/rmarkdown/r-code.html
 library(tidyr)      # tidy data
 library(dplyr)      # data maniuplation
 library(lubridate)  # manages dates
 library(reshape2)   # To use melt function
 Sys.setlocale("LC_TIME", "C") # To have dates localized to default (english)
 ```

 ```{r}
 # 0. Download CSV file: 
 #    https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv

 file <- "/media/DONNEES/0/OFF/base/en.openfoodfacts.org.products.csv"
 fileDate <- file.info(file)$mtime
 cat("File downloaded on: ", format(fileDate, format = "%Y-%m-%d"))
 ```

 ---
 title: 'Open Food Facts growth, `r format(fileDate, format = "%B %Y")`'
 ---

 # Extract from original raw export
 ```{r warning = FALSE}
 # 1. read raw extract from original OFF: created_datetime,creator,countries_en
 # https://stackoverflow.com/questions/44585168/select-a-subsample-of-columns-when-reading-a-file-in-the-tidyverse
 library(readr)
 OFF_raw <- read_tsv(file, quote = "", # Change efault quote behaviour
                    col_types = cols_only('code' = col_guess(),         # guess type
                                      'created_datetime' = col_guess(), # guess type
                                      'creator' = col_guess(),          # guess type
                                      'countries_en' = '?',
                                      'categories_tags' = '?',
                                      'nutrition-score-fr_100g' = '?')) # guess type

 # Print problems occured during import
 problems(OFF_raw)

 # Print 3 first lines of the imported file
 head(OFF_raw, n = 3L)

 summary(OFF_raw)
 ```

 # Separate countries_en column values into mutiple rows

 And sort data by `created_datetime`.

 ```{r}
 # 2. Transform countries_en into multiple lines
 #    4056489010258  date-limite-app  2018-10-21 18:32:19  France,Germany
 #    =>
 #    4056489010258  date-limite-app  2018-10-21 18:32:19  France
 #    4056489010258  date-limite-app  2018-10-21 18:32:19  Germany
 OFF_raw_sep <- separate_rows(OFF_raw, countries_en, sep = ",")

 # 3. Sort file by created_datetime column
 OFF_raw_sep_sorted <- OFF_raw_sep[with(OFF_raw_sep, order(created_datetime)), ]

 head(OFF_raw_sep_sorted, n = 5L)
 ```

 # Count number of products for each country and per user
 ```{r}
 # 4. Count the number of products for each country (new column p_p_country)
 OFF_raw_sep_sorted_cum <- OFF_raw_sep_sorted %>%
  group_by(countries_en) %>%
  mutate(p_p_country = row_number())

 # 5. Count number of products per user for each country (new column p_p_user)
 OFF_raw_sep_sorted_cum <- OFF_raw_sep_sorted_cum %>%
  group_by(creator) %>%
  mutate(p_p_user = row_number())

 head(OFF_raw_sep_sorted_cum, n = 5L)

 # 5b. Count number of products per user for all countries (new column p_p_user)
 OFF_raw_sorted_user_cum <- OFF_raw[with(OFF_raw, order(created_datetime)), ] %>%
  group_by(creator) %>%
  mutate(p_p_user = row_number())

 head(OFF_raw_sorted_user_cum, n = 5L)



 ```

 \pagebreak

 # Questions

 - Started at the beginning of 2012, the first year of Open Food Facts is not very relevant; do we have to exclude 2012 and maybe even 2013, 2014...?

 # Global setup for all graphs

 ```{r error = FALSE, warning = FALSE, message = FALSE}
 library(ggplot2) # https://ggplot2.tidyverse.org/
 library(scales)  # https://ggplot2.tidyverse.org/reference/scale_date.html

 # Common caption for all graphs
 OFF.caption = list(labs(caption = paste(
      "Data: https://world.openfoodfacts.org/data - ",
      format(fileDate, format = "%Y-%m-%d"))))

 # Common theme for all graphs
 theme_off <- theme_gray() +
  theme(legend.position = c(0.15, 0.5),
        legend.text = element_text(size = 10),
        legend.key.size = unit(7, "mm"),
        plot.caption = element_text(size = 7))
 ```


 \pagebreak

 # All countries with more than 500 products
 ```{r}
 OFF_5_25 <- filter(OFF_raw_sep_sorted_cum, p_p_country >= 500)

 country500P <- ggplot (data=OFF_5_25, aes(created_datetime, p_p_country)) + 
               geom_line(aes(group = countries_en, color = countries_en))
 country500P + theme_off + theme(legend.position = c(0.20, 0.6),
                                legend.text = element_text(size = 8),
                                legend.key.size = unit(4, "mm")) +
              scale_y_continuous(labels = comma) + # comma format (100,000)
              scale_x_datetime(date_breaks="1 year", date_labels = "%Y") +
              coord_cartesian(ylim  = c(0, 280000)) #+ # adjusts the visible area
              #coord_cartesian(xlim = as.Date(c("2012-01-01", "2019-06-01")))

 country500Pb <- country500P + coord_cartesian(ylim  = c(280000, 560000)) #+ # adjusts the visible area
  #coord_cartesian(xlim = as.Date(c("2012-01-01", "2019-06-01")))
 country500Pb + theme_off + theme(legend.position = c(0.20, 0.6),
                                legend.text = element_text(size = 8),
                                legend.key.size = unit(4, "mm")) +
              scale_y_continuous(labels = comma) + # comma format (100,000)
              scale_x_datetime(date_breaks="1 year", date_labels = "%Y")

 ```

 \pagebreak

 # World vs France products growth
 ```{r}
 OFF_world_vs_fr_growth <- OFF_raw_sep_sorted %>%
  mutate(countries_en = ifelse(countries_en != "France", "world", "France")) %>%
  group_by(countries_en) %>%
  mutate(p_p_country = row_number()) %>% na.omit()

 cdf2 <- ggplot (data=OFF_world_vs_fr_growth, aes(created_datetime, p_p_country)) + 
 		geom_line(aes(group = countries_en, color = countries_en))
 cdf2 + labs(title = "World vs France products growth") + theme_off + 
  OFF.caption + scale_y_continuous(labels = comma)
 ```


 \pagebreak

 # Countries with more than 5000 products, except France and the USA
 ```{r}
 OFF_5_25 <- filter(OFF_raw_sep_sorted_cum,
                   countries_en != "France" & 
                     countries_en != "United States" & 
                     p_p_country >= 5000 & 
                     p_p_country <= 32000)

 cdf2 <- ggplot (data=OFF_5_25, aes(created_datetime, p_p_country)) + 
        geom_line(aes(group = countries_en, color = countries_en))
 cdf2 + theme_off + scale_y_continuous(labels = comma)
 ```

 \pagebreak

 # Countries with more than 2000 products, except France and the USA
 ```{r}
 OFF_2_32 <- filter(OFF_raw_sep_sorted_cum,
                   countries_en != "France" & 
                     countries_en != "United States" & 
                     p_p_country >= 2000 & 
                     p_p_country <= 32000)

 cdf2 <- ggplot (data=OFF_2_32, aes(created_datetime, p_p_country)) + 
        geom_line(aes(group = countries_en, color = countries_en))
 cdf2 + theme_off + scale_y_continuous(labels = comma)
 ```

 \pagebreak

 # Countries with more than 2000 products, except France and the USA
 (from 2018)
 ```{r}

 cdf2 <- ggplot (data=OFF_5_25, aes(created_datetime, p_p_country)) + 
        geom_line(aes(group = countries_en, color = countries_en))
 cdf2 + theme_off + scale_y_continuous(labels = comma) + 
  scale_x_datetime(limits = ymd_h(c("2019-02-01 00", "2019-05-01 00")))

 ```


 \pagebreak

 # Countries with 1000+ products except the 7 biggest ones
 ```{r}
 OFF_1_32 <- filter(OFF_raw_sep_sorted_cum,
                   countries_en != "France" & 
                     countries_en != "United States" & 
                     countries_en != "Germany" &
                     countries_en != "Switzerland" &
                     countries_en != "Belgium" &
                     countries_en != "Spain" &
                     countries_en != "United Kingdom" &
                     p_p_country >= 1000 & 
                     p_p_country <= 32000)

 cdf2 <- ggplot (data=OFF_1_32, aes(created_datetime, p_p_country))

 cdf2 + geom_line(aes(group = countries_en, color = countries_en)) +
       theme_off
 ```



 \pagebreak

 # Product creations by country by month
 ```{r}
 OFF_creations_p_country_p_month <- filter(OFF_raw_sep_sorted_cum,
                   countries_en != "France" & 
                     countries_en != "United States" & 
                     countries_en != "Germany" &
                     countries_en != "Switzerland" &
                     countries_en != "Belgium" &
                     countries_en != "Spain" &
                     countries_en != "United Kingdom" &
                     p_p_country >= 900 & 
                     p_p_country <= 30000)  %>%
  group_by(created_datetime = floor_date(created_datetime, "month"), countries_en) %>%
  summarise(creations_p_country_p_month = n())

 OFF_creations_p_country_p_month <- filter(OFF_creations_p_country_p_month,
                                          creations_p_country_p_month >= 40)

 gr_pc <- ggplot (data=OFF_creations_p_country_p_month, aes(created_datetime, creations_p_country_p_month)) +
        geom_line(aes(group = countries_en, color = countries_en))
 
 gr_pc + labs(title="Product creations by country by month") +
       theme_off + OFF.caption +
       theme(legend.position = c(0.15, 0.5)) +
       scale_x_datetime(date_breaks="1 year", date_labels = "%Y") + #+
       scale_x_datetime(limits = ymd_h(c("2018-01-01 00", "2019-03-30 00"))) +
       coord_cartesian(ylim  = c(0, 600)) #+ # adjusts the visible area
 ```












 \pagebreak

 # Users who created 5000+ products
 ```{r}
 OFF_5_800000 <- filter(OFF_raw_sep_sorted_cum,
                     p_p_user >= 5000 & 
                     p_p_user <= 800000)

 cdf2 <- ggplot (data=OFF_5_800000, aes(created_datetime, p_p_user)) + 
        geom_line(aes(group = creator, color = creator))

 cdf2 + labs(title="Users with 5000+ products") + 
       theme_off +
       theme(legend.position = c(0.22, 0.6)) +
       scale_y_continuous(labels = comma) +
       scale_x_datetime(date_breaks="1 year", date_labels = "%Y")
 ```

 \pagebreak

 # Users compared to Yuka
 ```{r}
 OFF_products_growth_per_user <- OFF_raw %>%
  select(code, creator, created_datetime) %>%
  mutate(creator = ifelse(creator != "kiliweb", "other", "kiliweb")) %>%
  arrange(created_datetime) %>%
  group_by(creator) %>%
  mutate(pr_p_user = row_number()) %>% na.omit()

 gr_OFF_products_growth_per_user <- ggplot (data=OFF_products_growth_per_user, aes(created_datetime, pr_p_user)) + 
 		geom_line(aes(group = creator, color = creator))

 gr_OFF_products_growth_per_user + labs(title="Regular users vs kiliweb growth") + 
 	   theme_off + OFF.caption +
 	   theme(legend.position = c(0.22, 0.6)) +
     scale_colour_discrete(labels=c("kiliweb", "others")) +
 	   scale_y_continuous(labels = comma) +
 	   scale_x_datetime(date_breaks="1 year", date_labels = "%Y")
 ```

 \pagebreak

 # Users compared to Yuka (France)
 ```{r}
 OFF_FR_products_growth_per_user <- OFF_raw_sep_sorted %>%
  select(countries_en, creator, created_datetime) %>%
  filter(countries_en == "France") %>%
  mutate(creator = ifelse(creator != "kiliweb", "other", "kiliweb")) %>%
  arrange(created_datetime) %>%
  group_by(creator) %>%
  mutate(pr_p_user = row_number()) %>% na.omit()

 gr_OFF_FR_products_growth_per_user <- ggplot (data=OFF_FR_products_growth_per_user, aes(created_datetime, pr_p_user)) + 
 		geom_line(aes(group = creator, color = creator))

 gr_OFF_FR_products_growth_per_user + labs(title="France growth: regular users vs kiliweb") + 
 	   theme_off + OFF.caption +
 	   theme(legend.position = c(0.22, 0.6)) +
     scale_colour_discrete(labels=c("kiliweb", "others")) +
 	   scale_y_continuous(labels = comma) +
 	   scale_x_datetime(date_breaks="1 year", date_labels = "%Y")
 ```


 \pagebreak

 # Users creations range
 ```{r}

 # created_datetime | stephane    | 1
 # created_datetime | marianne    | 1
 # created_datetime | stephane    | 2
 # created_datetime | stephane    | 3

 OFF_user_creations <- select(OFF_raw_sorted_user_cum,
                             created_datetime, creator) %>%
  summarise(creations = n()) %>%
  mutate(range = cut(creations, breaks = c(0, 1000, 100000, 200000, 1000000)))

 # kiliweb                     | 380582
 # usda-ndb-import             | 171153
 # openfoodfacts-contributors  | 118268
 # date-limite-app             | 31496


 cdf2 <- ggplot (data = OFF_user_creations, aes(range, creations, fill = creations)) + 
        geom_bar(stat = "identity")

 cdf2 + labs(title="Users with 5000+ products") + 
       theme_off +
       scale_y_continuous(labels = comma)
 ```

 \pagebreak

 # Users creations treemap
 ```{r}
 library(treemap)
 # creator                     | creations
 # kiliweb                     | 380582   
 # usda-ndb-import             | 171153  
 # openfoodfacts-contributors  | 118268  

 # Takes long time to produce (more than 5 minutes)

 treemap(OFF_user_creations,
        index = "creator",
        vSize = "creations",
        type = "index",
        title = "Number of products created")

 ```

 \pagebreak

 # Top 10 users creations proportion, related to all creations
 ```{r}
 library(treemap)

 top_users_by_creations <- OFF_user_creations %>%
  mutate(creator = ifelse(creations > 2000, creator, "users below 2000 created products")) %>%
  mutate(type = ifelse(creator == "kiliweb", creator, 
                       ifelse(grepl("import$", creator), "import", "user"))) %>%
  group_by(creator, type)
  


 top_users_by_creations <- top_users_by_creations  %>%
  summarise(nb = sum(creations)) %>%
  ungroup() %>%
  mutate(creator_text = paste(creator, "\n", round((nb / sum(nb)*100),1), "%"))

 # creator                     | creations  | group
 # kiliweb                     | 380582     | kiliweb
 # usda-ndb-import             | 171153     | usda-ndb-import
 # openfoodfacts-contributors  | 118268     | openfoodfacts-contributors
 # xxx                         | 2          | others

 treemap(top_users_by_creations,
 		index = c("type", "creator_text"),
 		vSize = "nb",
 		type = "index",
 		title = "Number of products created",
    align.labels = list(c("centre","centre"),c("left","top")),
 		fontsize.labels = c(25,9))
 ```


 \pagebreak

 # User creation activity by month
 ```{r}
 OFF_creations_p_month <- OFF_raw_sep_sorted_cum  %>%
  group_by(created_datetime = floor_date(created_datetime, "month"), creator) %>%
  summarise(creations_p_user_p_month = n())

 OFF_user_creation_activity <- OFF_creations_p_month %>%
  group_by(created_datetime,
           creator_ranges = cut (creations_p_user_p_month,
                                 breaks = c(0, 5, 10, 100,1000,10000,500000),
                                 dig.lab = 7)) %>% # 1000 and not 1e+03
  summarise(users_p_range = n())

 gr_uca <- ggplot (data=OFF_user_creation_activity, aes(created_datetime, users_p_range)) + 
        geom_line(aes(group = creator_ranges, color = creator_ranges))

 gr_uca + labs(title="User creation activity by month") + 
       theme_off + OFF.caption +
       theme(legend.position = c(0.15, 0.7)) +
       scale_x_datetime(date_breaks="1 year", date_labels = "%Y") +
       scale_y_continuous(name = "users", breaks = c(0, 1, 2, 5, 10, 20, 30, 40, 60, 80, 100, 200, 300, 400, 500, 600)) +
       coord_trans(y = "log10") +
       scale_color_hue(name = "New products")#, labels = c(as.character(creator_ranges))
 ```
 This graph only shows new products created by the users and not the modifications. In France it begins more and more difficult to find new products.

 10,000-500,000 creations per month is probably only due to massive imports. We could hide this category as it is not very relevant.

 0-5, 5-10 and 10-100 creations might be divided to understand how Open Food Facts could help user to create more products each months.

 \pagebreak

 # User creation activity by month
 ```{r}
 OFF_creations_p_month <- OFF_raw_sep_sorted_cum  %>%
  group_by(created_datetime = floor_date(created_datetime, "month"), creator) %>%
  summarise(creations_p_user_p_month = n())

 OFF_user_creation_activity <- OFF_creations_p_month %>%
  group_by(created_datetime,
           creator_ranges = cut (creations_p_user_p_month,
                                 breaks = c(0, 2, 5, 10, 50, 100, 1000, 500000),
                                 dig.lab = 7)) %>% # 1000 and not 1e+03
  summarise(users_p_range = n())

 gr_uca <- ggplot (data=OFF_user_creation_activity, aes(created_datetime, users_p_range)) + 
        geom_line(aes(group = creator_ranges, color = creator_ranges))

 gr_uca + labs(title="User creation activity by month") + 
       theme_off + OFF.caption +
       theme(legend.position = c(0.15, 0.7)) +
       scale_x_datetime(date_breaks="1 year", date_labels = "%Y") +
       scale_y_continuous(name = "users", breaks = c(0, 1, 2, 5, 10, 20, 30, 40, 60, 80, 100, 200, 300, 400, 500, 600)) +
       coord_trans(y = "log10") +
       scale_color_hue(name = "New products")#, labels = c(as.character(creator_ranges))
 ```
 This graph only shows new products created by the users and not the modifications. In France it begins more and more difficult to find new products.

 10,000-500,000 creations per month is probably only due to massive imports. We could hide this category as it is not very relevant.

 0-5, 5-10 and 10-100 creations might be divided to understand how Open Food Facts could help user to create more products each months.

 \pagebreak

 # Monthly active contributors (todo)

 TODO
 https://meta.wikimedia.org/wiki/Research:Modeling_monthly_active_editors

 ### Editor Classes
 - **Monthly Active Editors** (MAE)
    Editors who save at least 5 revisions within one month.
 - **New Active Editors** (NAE)
    Newly registered users who save at least 5 revisions in the month that they registered.
 - **Surviving New Active Editors** (SNAE)
    New Active Editors from the previous month who continued to make at least 5 edits in the current month.
 - **Recurring Old Active Editors** (ROAE)
    Non-new Active Editors from the previous month who continued to make at least 5 edits in the current month.
 - **Reactivated Editors** (RAE)
    All other active editors who (1) were not active in the previous month and (2) were not a Newly registered user in the current month.

 ### Basic equation
 MAEm = NAEm + SNAEm + ROAEm + RAEm

 ```{r}

 # See: http://larmarange.github.io/analyse-R/analyse-de-survie.html

 # Starting from OFF_creations_p_month
 # 2012-01-01 | stephane | 1
 # 2012-02-01 | andre    | 10

 # OFF_editor_classes <- OFF_raw_sep_sorted_cum %>%
 #   group_by(created_datetime = floor_date(created_datetime, "month"), creator) %>%
 #   summarise(creations_p_user_p_month = n())

       
 ```

 \pagebreak

 # New 'creators' per month per country
 ```{r}
 # countries_en | start | end | creators_p_product
 new_users_p_months <- OFF_raw_sep_sorted_cum %>%
  filter(p_p_user == 1 & p_p_country >= 5000) %>% 
  group_by(countries_en, month=floor_date(created_datetime, "month")) %>% 
  summarise(nb = n())

 gr_new_users_p_months <- ggplot (data = new_users_p_months, aes (month, nb)) + 
  geom_line(aes(group = countries_en, color = countries_en))

 lims <- as.POSIXct(strptime(c("2014-01-01","2019-02-28"), format = "%Y-%m-d%"))
 gr_new_users_p_months + labs(title="New creators per month per country") + 
       theme_off +
       theme(legend.position = c(0.22, 0.6)) +
       scale_y_continuous(labels = comma) +
       coord_cartesian(ylim  = c(0, 200)) +  # adjusts the visible area
       scale_x_datetime(date_breaks="1 year", date_labels = "%Y", limits = lims)
 ```

 Note: graph visible area has been cut to provide better readability. French peak at the end of 2018 reached 719 new creators, following Open Food Facts interview during "Envoyé spécial" prime time TV show.


 \pagebreak

 # New 'creators' per week per country except France
 ```{r}
 new_users_p_week <- OFF_raw_sep_sorted_cum %>% 
  filter(p_p_user == 1 & 
           p_p_country >= 5000 & 
           countries_en != "France") %>% 
  group_by(countries_en, week=floor_date(created_datetime, "week")) %>% 
  summarise(nb = n())

 gr_new_users_p_week <- ggplot (data = new_users_p_week, aes (week, nb)) + 
  geom_line(aes(group = countries_en, color = countries_en))

 gr_new_users_p_week + labs(title="New creators per week per country") + 
       theme_off +
       theme(legend.position = c(0.22, 0.6)) +
       scale_y_continuous(labels = comma) +
       scale_x_datetime(date_breaks="1 year", date_labels = "%Y")
 ```

 \pagebreak

 # New product creations by day of the week
 ```{r}
 # Filter huge USDA  and Openfood CH imports
 OFF_raw_sep_sorted_cum_wo_usda <- OFF_raw_sep_sorted_cum %>% filter(creator != "usda-ndb-import" & creator != "openfood-ch-import")
 gr_product_creations_by_day <- ggplot (data = OFF_raw_sep_sorted_cum_wo_usda, aes (x = weekdays(created_datetime))) + 
   geom_bar()

 gr_product_creations_by_day + labs(title="New product creations by day of the week") + 
        theme_off + OFF.caption +
        scale_y_continuous(labels = comma) +
  scale_x_discrete(limits = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))
        # geom_text(hjust=1.4, vjust=0.6, color="white", size=3)
       # theme(legend.position = c(0.22, 0.6)) +
       # scale_x_datetime(date_breaks="1 year", date_labels = "%Y")
 ```
 Massive imports are filtered, not to disturb the graph.

 \pagebreak

 # New product creations by hour of the day
 ```{r}
 # Filter huge USDA  and Openfood CH imports
 gr_product_creations_by_day <- ggplot (data = OFF_raw_sep_sorted_cum_wo_usda, aes (x = hour(created_datetime))) + 
   geom_bar()

 gr_product_creations_by_day + labs(title="New product creations by hour of the day") + 
        theme_off + OFF.caption +
        scale_y_continuous(labels = comma) +
        scale_x_continuous(breaks = seq(0, 24, 1))
        # geom_text(hjust=1.4, vjust=0.6, color="white", size=3)
 ```
 Massive imports are filtered, not to disturb the graph. It is based on the `created_datetime` field which represent the date and UTC time of creation of the product in OFF database.

 This graph has to be reworked. It should be produced per country.


 \pagebreak

 # Nutriscore

 ```{r}
 # New table: countries_en | total | missing | completed  | ratio_ns
 library(stringr)
 nutriscore_data <- OFF_raw_sep_sorted_cum %>%
  group_by(countries_en) %>% 
  filter(!grepl("(en:baby-foods|en:baby-milks|en:alcoholic-beverages)",categories_tags)) %>% 
  filter(!grepl("(en:waters|en:coffees|en:teas|en:herbal-teas)",categories_tags)) %>% 
  filter(!grepl("(fr:levure|en:honeys|en:vinegars)",categories_tags))

 nutriscore_pr_world <- nutriscore_data %>%
  summarise(world_total = n(),
            world_completed_ns = sum(!is.na(`nutrition-score-fr_100g`)),
            world_ratio_ns = (world_completed_ns/world_total))

 p <- 1500 # Top countries
 nutriscore_pr_in_top_countries <- nutriscore_data %>%
  filter(n() >= p) %>% 
  summarise(total = n(),
            missing_ns = sum(is.na(`nutrition-score-fr_100g`)),
            missing_ct_tags = sum(is.na(`categories_tags`)),
            completed_ns = sum(!is.na(`nutrition-score-fr_100g`)),
            ratio_ns = (completed_ns/total))

 # Number of nutriscored products in France
 ns_pr_france <- subset(nutriscore_pr_world, countries_en =="France")

 # Total number of nutriscored products
 ns_pr_total_world <- nutriscore_pr_world %>% summarise(ns_pr_total = sum(world_completed_ns),
                                                       pr_total = sum(world_total))

 # total number of products
 tot_nb_of_products <- nrow(OFF_raw)

 # potential number of Nutri-Scored products
 OFF_raw_ns <- OFF_raw %>% filter(!grepl("(en:baby-foods|en:baby-milks|en:alcoholic-beverages)", categories_tags)) %>% filter(!grepl("(en:waters|en:coffees|en:teas|en:herbal-teas)", categories_tags)) %>% filter(!grepl("(fr:levure|en:honeys|en:vinegars)", categories_tags))
 pot_nb_of_ns_products <- nrow(OFF_raw_ns) # potential number of nutriscored products

 # total number of Nutri-Scored products
 tot_nb_of_ns_products <- nrow(na.omit(OFF_raw_ns))

 # percent of total nutriscored products
 # round(tot_nb_of_ns_products/pot_nb_of_ns_products*100, digits = 1)

 ```
 Open Food Facts compute Nutri-Score for each product in whatever country. In the statistics and the graphs below, if a product is sold in two countries, it is counted in both countries. But world total number of Nutri-Scored products will count only one product.

 Open Food Facts database contains **`r format(tot_nb_of_products, big.mark = ",")`** products: 

 - **`r format(pot_nb_of_ns_products, big.mark = ",")`** represent all products that could be Nutri-Scored
 - **`r format(tot_nb_of_ns_products, big.mark = ",")`** of which are Nutri-Scored (`r round(tot_nb_of_ns_products/pot_nb_of_ns_products*100, digits=1)`%);
 - France hold **`r format(as.numeric(ns_pr_france$world_completed_ns), big.mark = ",")`** products with a Nutri-Score (which some of them are also sold in some other countries).

 \pagebreak

 # Nutri-Score in countries with more than `r p` products (abs)

 ```{r, size = 'tiny'}
 # 1. Using ggplot2: Your data is in the wide format You need to put it in the long format. Generally speaking, long format is better for variables comparison".
 # https://stackoverflow.com/a/21236376/4098096

 ns_pr_top_countries.wo.France <- nutriscore_pr_in_top_countries %>% 
  filter(countries_en != "France")

 gr_nutriscore_p_country <- ggplot (data = ns_pr_top_countries.wo.France, aes (reorder(countries_en,completed_ns), completed_ns)) + 
  geom_col() + coord_flip()

 gr_nutriscore_p_country + 
  labs(title = paste("Nutriscore per country with more than", p, "products"),
       subtitle = paste("(Except France which hold ",
                        format(as.numeric(ns_pr_france$world_completed_ns), big.mark = ","),
                        "products with a Nutri-Score)"),
                        y = "Nutri-Scored products", x = "countries") + 
  scale_y_continuous(labels = comma) +
  geom_text(aes(label = completed_ns), hjust=1, vjust=0.6, color="white", size=3) +
  theme_off + OFF.caption

 ```


 \pagebreak

 # Nutriscore in countries with more than `r p` products (%)
 ```{r}
 gr_ns_pr_in_top_countries <- ggplot (data = nutriscore_pr_in_top_countries, aes (reorder(countries_en,ratio_ns), ratio_ns)) + 
  geom_col() + coord_flip()

 gr_ns_pr_in_top_countries + 
  labs(title = paste("Nutriscore in countries with more than", p, "products (%)"),
       y = "Nutri-Score", x = "countries") + 
  geom_text(aes(label = round(ratio_ns*100, 0)),
            hjust=1.4, vjust=0.6, color="white", size=3) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1),
                     breaks = seq(0, 1.1, 0.1)) + 
  theme_off + OFF.caption

 ```

 \pagebreak

 # Missing category: the main issue for Nutri-Score
 [comment]: # 1. With somthing else than ggplot2 : https://www.r-graph-gallery.com/202-barplot-for-likert-type-items/

 [comment]: # 2. Lattice, # https://www.statmethods.net/advgraphs/trellis.html

 [comment]: # 1. Using ggplot2: Your data is in the wide format You need to put it in the long format. Generally speaking, long format is better for variables comparison".
 [comment]: # https://stackoverflow.com/a/21236376/4098096


 ```{r, size = 'tiny'}
 nutriscore_p_country2 <- OFF_raw_sep_sorted_cum %>%
  group_by(countries_en) %>% filter(n() >= p) %>% 
  filter(!grepl("(en:baby-foods|en:baby-milks|en:alcoholic-beverages)",categories_tags)) %>%
  filter(!grepl("(en:waters|en:coffees|en:teas|en:herbal-teas)",categories_tags)) %>%
  filter(!grepl("(fr:levure|en:honeys|en:vinegars)",categories_tags)) %>%
  summarise(missing_ct_tags = sum(is.na(`categories_tags`)/n()),
            completed_ns = sum(!is.na(`nutrition-score-fr_100g`))/n()) %>%
            arrange(completed_ns) %>%
            mutate(countries_en = factor (countries_en, countries_en))
 nutriscore_p_country.m <- reshape2::melt(nutriscore_p_country2,id.vars = "countries_en")

 gr_nutriscore_p_country.m <- ggplot (data = nutriscore_p_country.m, aes (x = countries_en, y = value, fill = variable, order = -as.numeric(variable))) + 
  geom_bar(stat = "identity") + coord_flip()

 gr_nutriscore_p_country.m + 
  labs(title = paste("Nutriscore per country with more than", p, "products"),
                     y = "Products with Nutri-Score and without categories",
                     x = "countries") +
  scale_fill_discrete(labels = c("Missing category","Nutriscore")) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1),
                     breaks = seq(0, 1.1, 0.1)) +
  theme_off + OFF.caption

 ```

 Only two countries have more than 50% of products with the Nutri-Score. 


 [comment]: # (Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Ctrl+Alt+I*.)

 [comment]: # (When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Ctrl+Shift+K* to preview the HTML file).)

 [comment]: # (The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike *Knit*, *Preview* does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.)