GUI SQL client: MySQL Workbench
command line SQL client (next slide)
generic SQL access in R: DBI::dbGetQuery()
RMySQL
through R Markdown chunks (see labs & hw)
through dplyr
with dbplyr
dplyr
+ SQL connectiondplyr
can access a SQL database directly
Instead of tbl_df
, you have a tbl_sql
dplyr
+ SQL connectiondplyr
can access a SQL database directly
Instead of tbl_df
, you have a tbl_sql
Data is stored and processed in SQL
dplyr
+ SQL connectiondplyr
can access a SQL database directly
Instead of tbl_df
, you have a tbl_sql
Data is stored and processed in SQL
dplyr
to SQL translation via show_query()
https://en.wikipedia.org/wiki/Big_data
"Size" | size | hardware | software |
---|---|---|---|
small | < several GB | RAM | R |
medium | several GB -- a few TB | hard disk | SQL |
big | many TB or more | cluster | Spark? |
title <- db %>% tbl("title")class(title)
## [1] "tbl_MySQLConnection" "tbl_dbi" "tbl_sql" ## [4] "tbl_lazy" "tbl"
title
## # Source: table<title> [?? x 12]## # Database: mysql 5.5.58-0ubuntu0.14.04.1-log [sds192@scidb.smith.edu:/imdb]## id title imdb_index kind_id production_year imdb_id phonetic_code## <int> <chr> <chr> <int> <int> <int> <chr> ## 1 78460 Adul… <NA> 7 2017 NA A3432 ## 2 70273 (201… <NA> 7 2016 NA <NA> ## 3 60105 (201… <NA> 7 2014 NA <NA> ## 4 32120 (200… <NA> 7 2008 NA <NA> ## 5 97554 Schm… <NA> 7 2001 NA S2543 ## 6 57966 (#1.… <NA> 7 2013 NA <NA> ## 7 76391 Anni… <NA> 7 1971 NA A5162 ## 8 11952 Angu… <NA> 7 2009 NA A5214 ## 9 1554 New … <NA> 7 2003 NA N6452 ## 10 58442 Kiss… <NA> 7 2011 NA K2523 ## # … with more rows, and 5 more variables: episode_of_id <int>, season_nr <int>,## # episode_nr <int>, series_years <chr>, md5sum <chr>
title
contains 4.6 million rows, but...print(object.size(title), units = "Kb")
## 4.3 Kb
title
contains 4.6 million rows, but...print(object.size(title), units = "Kb")
## 4.3 Kb
title
looks like a data.frame
but...nrow(title)
## [1] NA
title
contains 4.6 million rows, but...print(object.size(title), units = "Kb")
## 4.3 Kb
title
looks like a data.frame
but...nrow(title)
## [1] NA
class(title)
## [1] "tbl_MySQLConnection" "tbl_dbi" "tbl_sql" ## [4] "tbl_lazy" "tbl"
title
contains 4.6 million rows, but...print(object.size(title), units = "Kb")
## 4.3 Kb
title
looks like a data.frame
but...nrow(title)
## [1] NA
class(title)
## [1] "tbl_MySQLConnection" "tbl_dbi" "tbl_sql" ## [4] "tbl_lazy" "tbl"
data.frame
show_query()
star_wars <- title %>% filter(title == "Star Wars", kind_id == 1) %>% select(production_year, title)
show_query()
star_wars <- title %>% filter(title == "Star Wars", kind_id == 1) %>% select(production_year, title)
star_wars
## # Source: lazy query [?? x 2]## # Database: mysql 5.5.58-0ubuntu0.14.04.1-log [sds192@scidb.smith.edu:/imdb]## production_year title ## <int> <chr> ## 1 1977 Star Wars
show_query()
star_wars <- title %>% filter(title == "Star Wars", kind_id == 1) %>% select(production_year, title)
star_wars
## # Source: lazy query [?? x 2]## # Database: mysql 5.5.58-0ubuntu0.14.04.1-log [sds192@scidb.smith.edu:/imdb]## production_year title ## <int> <chr> ## 1 1977 Star Wars
show_query(star_wars)
## <SQL>## SELECT `production_year`, `title`## FROM `title`## WHERE ((`title` = 'Star Wars') AND (`kind_id` = 1.0))
https://dbplyr.tidyverse.org/articles/translation-function.html#known-functions-1
library(dbplyr)translate_sql(ceiling(mpg))
## <SQL> CEIL(`mpg`)
https://dbplyr.tidyverse.org/articles/translation-function.html#known-functions-1
library(dbplyr)translate_sql(ceiling(mpg))
## <SQL> CEIL(`mpg`)
translate_sql(mean(mpg))
## Warning: Missing values are always removed in SQL.## Use `AVG(x, na.rm = TRUE)` to silence this warning## This warning is displayed only once per session.
## <SQL> AVG(`mpg`) OVER ()
https://dbplyr.tidyverse.org/articles/translation-function.html#known-functions-1
library(dbplyr)translate_sql(ceiling(mpg))
## <SQL> CEIL(`mpg`)
translate_sql(mean(mpg))
## Warning: Missing values are always removed in SQL.## Use `AVG(x, na.rm = TRUE)` to silence this warning## This warning is displayed only once per session.
## <SQL> AVG(`mpg`) OVER ()
translate_sql(cyl == 4)
## <SQL> `cyl` = 4.0
https://dbplyr.tidyverse.org/articles/translation-function.html#known-functions-1
library(dbplyr)translate_sql(ceiling(mpg))
## <SQL> CEIL(`mpg`)
translate_sql(mean(mpg))
## Warning: Missing values are always removed in SQL.## Use `AVG(x, na.rm = TRUE)` to silence this warning## This warning is displayed only once per session.
## <SQL> AVG(`mpg`) OVER ()
translate_sql(cyl == 4)
## <SQL> `cyl` = 4.0
translate_sql(cyl %in% c(4, 6, 8))
## <SQL> `cyl` IN (4.0, 6.0, 8.0)
https://dbplyr.tidyverse.org/articles/translation-function.html#unknown-functions-1
# no PASTE() in SQLtranslate_sql(paste0("hp", "wt", "vs"))
## <SQL> CONCAT_WS('', 'hp', 'wt', 'vs')
https://dbplyr.tidyverse.org/articles/translation-function.html#unknown-functions-1
# no PASTE() in SQLtranslate_sql(paste0("hp", "wt", "vs"))
## <SQL> CONCAT_WS('', 'hp', 'wt', 'vs')
# works, but no CONCAT() in Rtranslate_sql(CONCAT("hp", "wt", "vs"))
## <SQL> CONCAT('hp', 'wt', 'vs')
https://dbplyr.tidyverse.org/articles/translation-function.html#unknown-functions-1
# no PASTE() in SQLtranslate_sql(paste0("hp", "wt", "vs"))
## <SQL> CONCAT_WS('', 'hp', 'wt', 'vs')
# works, but no CONCAT() in Rtranslate_sql(CONCAT("hp", "wt", "vs"))
## <SQL> CONCAT('hp', 'wt', 'vs')
# nonsensetranslate_sql(CRAZY_FUNCTION(mpg))
## <SQL> CRAZY_FUNCTION(`mpg`)
title %>% filter(title %like% '%Star Wars%', kind_id == 1, !is.na(production_year)) %>% select(title, production_year) %>% arrange(production_year)
## # Source: lazy query [?? x 2]## # Database: mysql 5.5.58-0ubuntu0.14.04.1-log [sds192@scidb.smith.edu:/imdb]## # Ordered by: production_year## title production_year## <chr> <int>## 1 Star Wars 1977## 2 Star Wars: Episode V - The Empire Strikes Back 1980## 3 Star Wars Underoos 1980## 4 Star Wars: Episode VI - Return of the Jedi 1983## 5 Tezukuri no Star Wars 1990## 6 Star Wars: Episode I - The Phantom Menace 1999## 7 Star Wars Gangsta Rap 2000## 8 Star Wars Returns 2001## 9 Star Wars: Attack of the Clones - A Jigsaw Puzzle 2002## 10 Star Wars Episode V 1/2: The Han Solo Affair 2002## # … with more rows
title %>% filter(title %like% '%Star Wars%', kind_id == 1, !is.na(production_year)) %>% mutate(before_dash = SUBSTRING_INDEX(title, '-', 1)) %>% select(before_dash, production_year) %>% arrange(production_year)
## # Source: lazy query [?? x 2]## # Database: mysql 5.5.58-0ubuntu0.14.04.1-log [sds192@scidb.smith.edu:/imdb]## # Ordered by: production_year## before_dash production_year## <chr> <int>## 1 "Star Wars" 1977## 2 "Star Wars: Episode V " 1980## 3 "Star Wars Underoos" 1980## 4 "Star Wars: Episode VI " 1983## 5 "Tezukuri no Star Wars" 1990## 6 "Star Wars: Episode I " 1999## 7 "Star Wars Gangsta Rap" 2000## 8 "Star Wars Returns" 2001## 9 "Star Wars: Attack of the Clones " 2002## 10 "Star Wars Episode V 1/2: The Han Solo Affair" 2002## # … with more rows
old_movies <- title %>% filter( production_year < 1960, kind_id == 1 )class(old_movies)
## [1] "tbl_MySQLConnection" "tbl_dbi" "tbl_sql" ## [4] "tbl_lazy" "tbl"
dim(old_movies)
## [1] NA 12
print(object.size(old_movies), units = "Kb")
## 7.4 Kb
collect()
to bring into Rold_movies_local <- old_movies %>% collect()class(old_movies_local)
## [1] "tbl_df" "tbl" "data.frame"
dim(old_movies_local)
## [1] 215234 12
print(object.size(old_movies_local), units = "Mb")
## 48.8 Mb
Pipelines that don't ask for data
Processing triggered when data is needed
print()
, head()
, glimpse()
, etc.collect()
Keyboard shortcuts
↑, ←, Pg Up, k | Go to previous slide |
↓, →, Pg Dn, Space, j | Go to next slide |
Home | Go to first slide |
End | Go to last slide |
Number + Return | Go to specific slide |
b / m / f | Toggle blackout / mirrored / fullscreen mode |
c | Clone slideshow |
p | Toggle presenter mode |
t | Restart the presentation timer |
?, h | Toggle this help |
Esc | Back to slideshow |