A data.frame

library(tidyverse)
head(mtcars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
dim(mtcars)
## [1] 32 11
class(mtcars)
## [1] "data.frame"

Make a SQLite database

db <- src_sqlite(":memory:", create = TRUE)
mtcars2 <- mtcars %>%
  tibble::rownames_to_column(var = "model") %>%
  copy_to(dest = db, df = .)
dim(mtcars2)
## [1] NA 12
head(mtcars2)
## # Source:   lazy query [?? x 12]
## # Database: sqlite 3.22.0 [:memory:]
##   model     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##   <chr>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mazda …  21       6   160   110  3.9   2.62  16.5     0     1     4     4
## 2 Mazda …  21       6   160   110  3.9   2.88  17.0     0     1     4     4
## 3 Datsun…  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
## 4 Hornet…  21.4     6   258   110  3.08  3.22  19.4     1     0     3     1
## 5 Hornet…  18.7     8   360   175  3.15  3.44  17.0     0     0     3     2
## 6 Valiant  18.1     6   225   105  2.76  3.46  20.2     1     0     3     1
class(mtcars2)
## [1] "tbl_SQLiteConnection" "tbl_dbi"              "tbl_sql"             
## [4] "tbl_lazy"             "tbl"

Get some results

# works
mtcars %>%
  group_by(cyl) %>%
  summarize(N = n(), avg_mpg = mean(mpg), 
            range = paste(mean(mpg), mean(disp), sep = "-"))
## # A tibble: 3 x 4
##     cyl     N avg_mpg range                            
##   <dbl> <int>   <dbl> <chr>                            
## 1     4    11    26.7 26.6636363636364-105.136363636364
## 2     6     7    19.7 19.7428571428571-183.314285714286
## 3     8    14    15.1 15.1-353.1
# works now, but didn't use to!
mtcars2 %>%
  group_by(cyl) %>%
  summarize(N = n(), avg_mpg = mean(mpg), 
            range = paste(mean(mpg), mean(disp), sep = "-")) %>%
  show_query()
## <SQL>
## Warning: Missing values are always removed in SQL.
## Use `mean(x, na.rm = TRUE)` to silence this warning
## This warning is displayed only once per session.
## SELECT `cyl`, COUNT() AS `N`, AVG(`mpg`) AS `avg_mpg`, AVG(`mpg`) || '-' || AVG(`disp`) AS `range`
## FROM `.`
## GROUP BY `cyl`
mtcars3 <- mtcars2 %>% 
  collect()
# works, but why???
mtcars2 %>%
  group_by(cyl) %>%
  summarize(N = n(), avg_mpg = mean(mpg), 
            range = avg(mpg) %||% "-" %||% mean(disp)) %>%
  show_query()
## <SQL>
## SELECT `cyl`, COUNT() AS `N`, AVG(`mpg`) AS `avg_mpg`, avg(`mpg`) || '-' || AVG(`disp`) AS `range`
## FROM `.`
## GROUP BY `cyl`

Connect to PostgreSQL

library(tidyverse)
db <- src_postgres(
  dbname = "postgres", 
  host = "localhost", port = 5432, 
  user = "postgres", password = "postgres"
)
class(db)
## [1] "src_PostgreSQLConnection" "src_dbi"                 
## [3] "src_sql"                  "src"
str(db)
## List of 2
##  $ con  :Formal class 'PostgreSQLConnection' [package "RPostgreSQL"] with 1 slot
##   .. ..@ Id: int [1:2] 20901 0
##  $ disco:<environment: 0x559abc2be548> 
##  - attr(*, "class")= chr [1:4] "src_PostgreSQLConnection" "src_dbi" "src_sql" "src"

Connect to a real database

library(tidyverse)
db <- src_postgres(
  dbname = "imdb", 
  host = "localhost", port = 5432, 
  user = "postgres", password = "postgres"
)
src_tbls(db)
## [1] "movies"    "people"    "role_type" "cast_info"
movies <- tbl(db, "movies")
movies
## # Source:   table<movies> [?? x 5]
## # Database: postgres 9.5.17 [postgres@localhost:5432/imdb]
##         id title                      production_year  votes rating
##      <int> <chr>                                <int>  <int>  <dbl>
##  1 1696072 (500) Days of Summer                  2009 237877    7.8
##  2 1697216 10 Things I Hate About You            1999 141525    7  
##  3 1697262 10,000 BC                             2008  82065    4.9
##  4 1698059 12 Angry Men                          1957 251500    8.9
##  5 1698236 127 Hours                             2010 184672    7.7
##  6 1698327 13 Going on 30                        2004  75575    6  
##  7 1698534 1408                                  2007 149826    6.8
##  8 1698721 16 Blocks                             2006  88119    6.6
##  9 1698796 17 Again                              2009  90147    6.3
## 10 1700353 2 Fast 2 Furious                      2003 107460    5.5
## # … with more rows

Compare memory footprints

print(object.size(movies), units = "Kb")
## 3.8 Kb
print(object.size(movies %>% collect()), units = "Kb")
## 154.7 Kb
DBI::dbGetQuery(db$con, "SELECT people.name, 
           COUNT(DISTINCT movies.title) as num_films 
           FROM movies 
           JOIN cast_info ON movies.id = cast_info.movie_id 
           JOIN people ON people.id = cast_info.person_id 
           GROUP BY people.id 
           ORDER BY num_films desc 
           LIMIT 10;")
##                  name num_films
## 1       Willis, Bruce        35
## 2     De Niro, Robert        35
## 3  Jackson, Samuel L.        34
## 4        Depp, Johnny        31
## 5          Pitt, Brad        30
## 6   Spielberg, Steven        29
## 7       Welker, Frank        28
## 8         Damon, Matt        28
## 9     Freeman, Morgan        27
## 10      Cage, Nicolas        27