juanchiem · May 21, 2022 11:50
diff --git a/sql b/sql
 How to use R with SQLlite
 ================
  
  This is a walkthrough of how you can use SQLite with R from my twitter
 [@neilgcurrie](https://twitter.com/neilgcurrie).

 Often in R we read in files of mixed formats. This gets messy fast and
 when the data is too big R will lag or fall over. Enter the database.
 SQLite is software used to interact with databases and is one of the
 most popular engines around. We can work with SQLite directly from R.

 Let’s create a database. We need the DBI and RSQLite packages to
 interact with our database. The dbConnect function creates a connection
 to a database when you supply a file name. When no database exists it
 creates a blank one. This one is created in my working directory.

 library(DBI)     
 library(RSQLite)

 filename <- glue::glue("{here::here()}/football-data.db")

 con <- DBI::dbConnect(RSQLite::SQLite(), filename)

 DBI::dbListTables(con)

 ## character(0)

 Now we add some data to our database. In this example I have taken
 football (soccer) results data from football-data.co.uk for the Premier
 League and La Liga. We use the dbWriteTable function to create new
 tables with specified names and write the data all in one step.

 library(readr)

 # Create a table

 epl_2122 <- readr::read_csv("https://www.football-data.co.uk/mmz4281/2122/E0.csv", 
                            show_col_types = FALSE)

 DBI::dbWriteTable(con, name = "england_premier_league", value = epl_2122)

 dbListTables(con) 

 ## [1] "england_premier_league"

 # You can look at a tables fields

 DBI::dbListFields(con, name = "england_premier_league")

 ##   [1] "Div"       "Date"      "Time"      "HomeTeam"  "AwayTeam"  "FTHG"     
 ##   [7] "FTAG"      "FTR"       "HTHG"      "HTAG"      "HTR"       "Referee"  
 ##  [13] "HS"        "AS"        "HST"       "AST"       "HF"        "AF"       
 ##  [19] "HC"        "AC"        "HY"        "AY"        "HR"        "AR"       
 ##  [25] "B365H"     "B365D"     "B365A"     "BWH"       "BWD"       "BWA"      
 ##  [31] "IWH"       "IWD"       "IWA"       "PSH"       "PSD"       "PSA"      
 ##  [37] "WHH"       "WHD"       "WHA"       "VCH"       "VCD"       "VCA"      
 ##  [43] "MaxH"      "MaxD"      "MaxA"      "AvgH"      "AvgD"      "AvgA"     
 ##  [49] "B365>2.5"  "B365<2.5"  "P>2.5"     "P<2.5"     "Max>2.5"   "Max<2.5"  
 ##  [55] "Avg>2.5"   "Avg<2.5"   "AHh"       "B365AHH"   "B365AHA"   "PAHH"     
 ##  [61] "PAHA"      "MaxAHH"    "MaxAHA"    "AvgAHH"    "AvgAHA"    "B365CH"   
 ##  [67] "B365CD"    "B365CA"    "BWCH"      "BWCD"      "BWCA"      "IWCH"     
 ##  [73] "IWCD"      "IWCA"      "PSCH"      "PSCD"      "PSCA"      "WHCH"     
 ##  [79] "WHCD"      "WHCA"      "VCCH"      "VCCD"      "VCCA"      "MaxCH"    
 ##  [85] "MaxCD"     "MaxCA"     "AvgCH"     "AvgCD"     "AvgCA"     "B365C>2.5"
 ##  [91] "B365C<2.5" "PC>2.5"    "PC<2.5"    "MaxC>2.5"  "MaxC<2.5"  "AvgC>2.5" 
 ##  [97] "AvgC<2.5"  "AHCh"      "B365CAHH"  "B365CAHA"  "PCAHH"     "PCAHA"    
 ## [103] "MaxCAHH"   "MaxCAHA"   "AvgCAHH"   "AvgCAHA"

 # Create another table

 ll_2122 <- read_csv("https://www.football-data.co.uk/mmz4281/2122/SP1.csv", 
                    show_col_types = FALSE)

 dbWriteTable(con, name = "spain_la_liga", value = ll_2122)

 dbListTables(con)

 ## [1] "england_premier_league" "spain_la_liga"

 Imagine we want to add an extra season of Premier League data to our
 table england_premier_league. Again, we use the dbWriteTable function
 but now we use the option append = TRUE.

 epl_2021 <- readr::read_csv("https://www.football-data.co.uk/mmz4281/2021/E0.csv",
                            show_col_types = FALSE)

 dbWriteTable(con, name = "england_premier_league", value = epl_2021, 
             append = TRUE)

 Now we have some data in there let’s query the database. We can send a
 SQL query from R to the database and pull the queried data back into R.
 We type SQL code as a normal string and pass it to the dbGetQuery
 function.

 query <- "SELECT Date, HomeTeam, AwayTeam, FTHG, FTAG
          FROM   england_premier_league"

 football_data1 <- DBI::dbGetQuery(con, statement = query)

 print(head(football_data1))

 ##         Date   HomeTeam       AwayTeam FTHG FTAG
 ## 1 13/08/2021  Brentford        Arsenal    2    0
 ## 2 14/08/2021 Man United          Leeds    5    1
 ## 3 14/08/2021    Burnley       Brighton    1    2
 ## 4 14/08/2021    Chelsea Crystal Palace    3    0
 ## 5 14/08/2021    Everton    Southampton    3    1
 ## 6 14/08/2021  Leicester         Wolves    1    0

 # Say you only wanted games with more than 2.5 goals scored

 query <- "SELECT Date, HomeTeam, AwayTeam, FTHG, FTAG
          FROM   england_premier_league
          WHERE  FTHG + FTAG > 2.5"

 football_data2 <- DBI::dbGetQuery(con, statement = query)

 print(head(football_data2))

 ##         Date   HomeTeam       AwayTeam FTHG FTAG
 ## 1 14/08/2021 Man United          Leeds    5    1
 ## 2 14/08/2021    Burnley       Brighton    1    2
 ## 3 14/08/2021    Chelsea Crystal Palace    3    0
 ## 4 14/08/2021    Everton    Southampton    3    1
 ## 5 14/08/2021    Watford    Aston Villa    3    2
 ## 6 14/08/2021    Norwich      Liverpool    0    3

 This is really powerful. This data could be far too big for R to handle
 all at once - no problem with this approach. You would only need the
 memory to handle the queried, and hopefully smaller, dataset.

 What if you don’t know SQL? You can use the dbplyr package with all the
 great syntax you are familiar with from dplyr! R will turn your R code
 into SQL code and send it to the database without needing to know any
 SQL.

 library(dplyr, quietly = TRUE, warn.conflicts = FALSE)
 library(dbplyr, quietly = TRUE, warn.conflicts = FALSE)

 football_data3 <- con |>
  dplyr::tbl("england_premier_league") |>
  dplyr::select(Date, HomeTeam, AwayTeam, FTHG, FTAG) |>
  dplyr::filter(FTHG + FTAG > 2.5)

 print(head(football_data3))


 ## # Source:   lazy query [?? x 5]
 ## # Database: sqlite 3.38.5
 ## #   [/Users/neilcurrie/freelance/socials/threads/football-data.db]
 ##   Date       HomeTeam   AwayTeam        FTHG  FTAG
 ##   <chr>      <chr>      <chr>          <dbl> <dbl>
 ## 1 14/08/2021 Man United Leeds              5     1
 ## 2 14/08/2021 Burnley    Brighton           1     2
 ## 3 14/08/2021 Chelsea    Crystal Palace     3     0
 ## 4 14/08/2021 Everton    Southampton        3     1
 ## 5 14/08/2021 Watford    Aston Villa        3     2
 ## 6 14/08/2021 Norwich    Liverpool          0     3

 footballdata3 isn’t quite the same as a normal tibble/data.frame. That’s
 because R hasn’t pulled the data in yet. This keeps things fast. To pull
 it in we use the collect function.

 football_data3 <- dplyr::collect(football_data3)
 print(football_data3)

 ## # A tibble: 384 × 5
 ##    Date       HomeTeam   AwayTeam        FTHG  FTAG
 ##    <chr>      <chr>      <chr>          <dbl> <dbl>
 ##  1 14/08/2021 Man United Leeds              5     1
 ##  2 14/08/2021 Burnley    Brighton           1     2
 ##  3 14/08/2021 Chelsea    Crystal Palace     3     0
 ##  4 14/08/2021 Everton    Southampton        3     1
 ##  5 14/08/2021 Watford    Aston Villa        3     2
 ##  6 14/08/2021 Norwich    Liverpool          0     3
 ##  7 15/08/2021 Newcastle  West Ham           2     4
 ##  8 21/08/2021 Leeds      Everton            2     2
 ##  9 21/08/2021 Man City   Norwich            5     0
 ## 10 23/08/2021 West Ham   Leicester          4     1
 ## # … with 374 more rows

 football_data2 |> 
  dplyr::as_tibble() |> 
  identical(football_data3) # exactly the same

 ## [1] TRUE

 ``` r
 # remember to close your connection to the database

 dbDisconnect(con) 
 ```

 Thanks for reading, if you liked this then follow me on twitter for
 mainly R and data science stuff (though I make no promises)
 [@neilgcurrie](https://twitter.com/neilgcurrie).
	How to use R with SQLlite
	================

	This is a walkthrough of how you can use SQLite with R from my twitter
	[@neilgcurrie](https://twitter.com/neilgcurrie).

	Often in R we read in files of mixed formats. This gets messy fast and
	when the data is too big R will lag or fall over. Enter the database.
	SQLite is software used to interact with databases and is one of the
	most popular engines around. We can work with SQLite directly from R.

	Let’s create a database. We need the DBI and RSQLite packages to
	interact with our database. The dbConnect function creates a connection
	to a database when you supply a file name. When no database exists it
	creates a blank one. This one is created in my working directory.

	library(DBI)
	library(RSQLite)

	filename <- glue::glue("{here::here()}/football-data.db")

	con <- DBI::dbConnect(RSQLite::SQLite(), filename)

	DBI::dbListTables(con)

	## character(0)

	Now we add some data to our database. In this example I have taken
	football (soccer) results data from football-data.co.uk for the Premier
	League and La Liga. We use the dbWriteTable function to create new
	tables with specified names and write the data all in one step.

	library(readr)

	# Create a table

	epl_2122 <- readr::read_csv("https://www.football-data.co.uk/mmz4281/2122/E0.csv",
	show_col_types = FALSE)

	DBI::dbWriteTable(con, name = "england_premier_league", value = epl_2122)

	dbListTables(con)

	## [1] "england_premier_league"

	# You can look at a tables fields

	DBI::dbListFields(con, name = "england_premier_league")

	## [1] "Div" "Date" "Time" "HomeTeam" "AwayTeam" "FTHG"
	## [7] "FTAG" "FTR" "HTHG" "HTAG" "HTR" "Referee"
	## [13] "HS" "AS" "HST" "AST" "HF" "AF"
	## [19] "HC" "AC" "HY" "AY" "HR" "AR"
	## [25] "B365H" "B365D" "B365A" "BWH" "BWD" "BWA"
	## [31] "IWH" "IWD" "IWA" "PSH" "PSD" "PSA"
	## [37] "WHH" "WHD" "WHA" "VCH" "VCD" "VCA"
	## [43] "MaxH" "MaxD" "MaxA" "AvgH" "AvgD" "AvgA"
	## [49] "B365>2.5" "B365<2.5" "P>2.5" "P<2.5" "Max>2.5" "Max<2.5"
	## [55] "Avg>2.5" "Avg<2.5" "AHh" "B365AHH" "B365AHA" "PAHH"
	## [61] "PAHA" "MaxAHH" "MaxAHA" "AvgAHH" "AvgAHA" "B365CH"
	## [67] "B365CD" "B365CA" "BWCH" "BWCD" "BWCA" "IWCH"
	## [73] "IWCD" "IWCA" "PSCH" "PSCD" "PSCA" "WHCH"
	## [79] "WHCD" "WHCA" "VCCH" "VCCD" "VCCA" "MaxCH"
	## [85] "MaxCD" "MaxCA" "AvgCH" "AvgCD" "AvgCA" "B365C>2.5"
	## [91] "B365C<2.5" "PC>2.5" "PC<2.5" "MaxC>2.5" "MaxC<2.5" "AvgC>2.5"
	## [97] "AvgC<2.5" "AHCh" "B365CAHH" "B365CAHA" "PCAHH" "PCAHA"
	## [103] "MaxCAHH" "MaxCAHA" "AvgCAHH" "AvgCAHA"

	# Create another table

	ll_2122 <- read_csv("https://www.football-data.co.uk/mmz4281/2122/SP1.csv",
	show_col_types = FALSE)

	dbWriteTable(con, name = "spain_la_liga", value = ll_2122)

	dbListTables(con)

	## [1] "england_premier_league" "spain_la_liga"

	Imagine we want to add an extra season of Premier League data to our
	table england_premier_league. Again, we use the dbWriteTable function
	but now we use the option append = TRUE.

	epl_2021 <- readr::read_csv("https://www.football-data.co.uk/mmz4281/2021/E0.csv",
	show_col_types = FALSE)

	dbWriteTable(con, name = "england_premier_league", value = epl_2021,
	append = TRUE)

	Now we have some data in there let’s query the database. We can send a
	SQL query from R to the database and pull the queried data back into R.
	We type SQL code as a normal string and pass it to the dbGetQuery
	function.

	query <- "SELECT Date, HomeTeam, AwayTeam, FTHG, FTAG
	FROM england_premier_league"

	football_data1 <- DBI::dbGetQuery(con, statement = query)

	print(head(football_data1))

	## Date HomeTeam AwayTeam FTHG FTAG
	## 1 13/08/2021 Brentford Arsenal 2 0
	## 2 14/08/2021 Man United Leeds 5 1
	## 3 14/08/2021 Burnley Brighton 1 2
	## 4 14/08/2021 Chelsea Crystal Palace 3 0
	## 5 14/08/2021 Everton Southampton 3 1
	## 6 14/08/2021 Leicester Wolves 1 0

	# Say you only wanted games with more than 2.5 goals scored

	query <- "SELECT Date, HomeTeam, AwayTeam, FTHG, FTAG
	FROM england_premier_league
	WHERE FTHG + FTAG > 2.5"

	football_data2 <- DBI::dbGetQuery(con, statement = query)

	print(head(football_data2))

	## Date HomeTeam AwayTeam FTHG FTAG
	## 1 14/08/2021 Man United Leeds 5 1
	## 2 14/08/2021 Burnley Brighton 1 2
	## 3 14/08/2021 Chelsea Crystal Palace 3 0
	## 4 14/08/2021 Everton Southampton 3 1
	## 5 14/08/2021 Watford Aston Villa 3 2
	## 6 14/08/2021 Norwich Liverpool 0 3

	This is really powerful. This data could be far too big for R to handle
	all at once - no problem with this approach. You would only need the
	memory to handle the queried, and hopefully smaller, dataset.

	What if you don’t know SQL? You can use the dbplyr package with all the
	great syntax you are familiar with from dplyr! R will turn your R code
	into SQL code and send it to the database without needing to know any
	SQL.

	library(dplyr, quietly = TRUE, warn.conflicts = FALSE)
	library(dbplyr, quietly = TRUE, warn.conflicts = FALSE)

	football_data3 <- con \|>
	dplyr::tbl("england_premier_league") \|>
	dplyr::select(Date, HomeTeam, AwayTeam, FTHG, FTAG) \|>
	dplyr::filter(FTHG + FTAG > 2.5)

	print(head(football_data3))


	## # Source: lazy query [?? x 5]
	## # Database: sqlite 3.38.5
	## # [/Users/neilcurrie/freelance/socials/threads/football-data.db]
	## Date HomeTeam AwayTeam FTHG FTAG
	## <chr> <chr> <chr> <dbl> <dbl>
	## 1 14/08/2021 Man United Leeds 5 1
	## 2 14/08/2021 Burnley Brighton 1 2
	## 3 14/08/2021 Chelsea Crystal Palace 3 0
	## 4 14/08/2021 Everton Southampton 3 1
	## 5 14/08/2021 Watford Aston Villa 3 2
	## 6 14/08/2021 Norwich Liverpool 0 3

	footballdata3 isn’t quite the same as a normal tibble/data.frame. That’s
	because R hasn’t pulled the data in yet. This keeps things fast. To pull
	it in we use the collect function.

	football_data3 <- dplyr::collect(football_data3)
	print(football_data3)

	## # A tibble: 384 × 5
	## Date HomeTeam AwayTeam FTHG FTAG
	## <chr> <chr> <chr> <dbl> <dbl>
	## 1 14/08/2021 Man United Leeds 5 1
	## 2 14/08/2021 Burnley Brighton 1 2
	## 3 14/08/2021 Chelsea Crystal Palace 3 0
	## 4 14/08/2021 Everton Southampton 3 1
	## 5 14/08/2021 Watford Aston Villa 3 2
	## 6 14/08/2021 Norwich Liverpool 0 3
	## 7 15/08/2021 Newcastle West Ham 2 4
	## 8 21/08/2021 Leeds Everton 2 2
	## 9 21/08/2021 Man City Norwich 5 0
	## 10 23/08/2021 West Ham Leicester 4 1
	## # … with 374 more rows

	football_data2 \|>
	dplyr::as_tibble() \|>
	identical(football_data3) # exactly the same

	## [1] TRUE

	``` r
	# remember to close your connection to the database

	dbDisconnect(con)
	```

	Thanks for reading, if you liked this then follow me on twitter for
	mainly R and data science stuff (though I make no promises)
	[@neilgcurrie](https://twitter.com/neilgcurrie).