A data.frame
library(tidyverse)
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
dim(mtcars)
## [1] 32 11
class(mtcars)
## [1] "data.frame"
Make a SQLite database
db <- src_sqlite(":memory:", create = TRUE)
mtcars2 <- mtcars %>%
tibble::rownames_to_column(var = "model") %>%
copy_to(dest = db, df = .)
dim(mtcars2)
## [1] NA 12
head(mtcars2)
## # Source: lazy query [?? x 12]
## # Database: sqlite 3.22.0 [:memory:]
## model mpg cyl disp hp drat wt qsec vs am gear carb
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mazda … 21 6 160 110 3.9 2.62 16.5 0 1 4 4
## 2 Mazda … 21 6 160 110 3.9 2.88 17.0 0 1 4 4
## 3 Datsun… 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
## 4 Hornet… 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
## 5 Hornet… 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
## 6 Valiant 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
class(mtcars2)
## [1] "tbl_SQLiteConnection" "tbl_dbi" "tbl_sql"
## [4] "tbl_lazy" "tbl"
Get some results
# works
mtcars %>%
group_by(cyl) %>%
summarize(N = n(), avg_mpg = mean(mpg),
range = paste(mean(mpg), mean(disp), sep = "-"))
## # A tibble: 3 x 4
## cyl N avg_mpg range
## <dbl> <int> <dbl> <chr>
## 1 4 11 26.7 26.6636363636364-105.136363636364
## 2 6 7 19.7 19.7428571428571-183.314285714286
## 3 8 14 15.1 15.1-353.1
# works now, but didn't use to!
mtcars2 %>%
group_by(cyl) %>%
summarize(N = n(), avg_mpg = mean(mpg),
range = paste(mean(mpg), mean(disp), sep = "-")) %>%
show_query()
## <SQL>
## Warning: Missing values are always removed in SQL.
## Use `mean(x, na.rm = TRUE)` to silence this warning
## This warning is displayed only once per session.
## SELECT `cyl`, COUNT() AS `N`, AVG(`mpg`) AS `avg_mpg`, AVG(`mpg`) || '-' || AVG(`disp`) AS `range`
## FROM `.`
## GROUP BY `cyl`
mtcars3 <- mtcars2 %>%
collect()
# works, but why???
mtcars2 %>%
group_by(cyl) %>%
summarize(N = n(), avg_mpg = mean(mpg),
range = avg(mpg) %||% "-" %||% mean(disp)) %>%
show_query()
## <SQL>
## SELECT `cyl`, COUNT() AS `N`, AVG(`mpg`) AS `avg_mpg`, avg(`mpg`) || '-' || AVG(`disp`) AS `range`
## FROM `.`
## GROUP BY `cyl`
library(tidyverse)
db <- src_postgres(
dbname = "postgres",
host = "localhost", port = 5432,
user = "postgres", password = "postgres"
)
class(db)
## [1] "src_PostgreSQLConnection" "src_dbi"
## [3] "src_sql" "src"
str(db)
## List of 2
## $ con :Formal class 'PostgreSQLConnection' [package "RPostgreSQL"] with 1 slot
## .. ..@ Id: int [1:2] 20901 0
## $ disco:<environment: 0x559abc2be548>
## - attr(*, "class")= chr [1:4] "src_PostgreSQLConnection" "src_dbi" "src_sql" "src"
Connect to a real database
library(tidyverse)
db <- src_postgres(
dbname = "imdb",
host = "localhost", port = 5432,
user = "postgres", password = "postgres"
)
src_tbls(db)
## [1] "movies" "people" "role_type" "cast_info"
movies <- tbl(db, "movies")
movies
## # Source: table<movies> [?? x 5]
## # Database: postgres 9.5.17 [postgres@localhost:5432/imdb]
## id title production_year votes rating
## <int> <chr> <int> <int> <dbl>
## 1 1696072 (500) Days of Summer 2009 237877 7.8
## 2 1697216 10 Things I Hate About You 1999 141525 7
## 3 1697262 10,000 BC 2008 82065 4.9
## 4 1698059 12 Angry Men 1957 251500 8.9
## 5 1698236 127 Hours 2010 184672 7.7
## 6 1698327 13 Going on 30 2004 75575 6
## 7 1698534 1408 2007 149826 6.8
## 8 1698721 16 Blocks 2006 88119 6.6
## 9 1698796 17 Again 2009 90147 6.3
## 10 1700353 2 Fast 2 Furious 2003 107460 5.5
## # … with more rows
Compare memory footprints
print(object.size(movies), units = "Kb")
## 3.8 Kb
print(object.size(movies %>% collect()), units = "Kb")
## 154.7 Kb
DBI::dbGetQuery(db$con, "SELECT people.name,
COUNT(DISTINCT movies.title) as num_films
FROM movies
JOIN cast_info ON movies.id = cast_info.movie_id
JOIN people ON people.id = cast_info.person_id
GROUP BY people.id
ORDER BY num_films desc
LIMIT 10;")
## name num_films
## 1 Willis, Bruce 35
## 2 De Niro, Robert 35
## 3 Jackson, Samuel L. 34
## 4 Depp, Johnny 31
## 5 Pitt, Brad 30
## 6 Spielberg, Steven 29
## 7 Welker, Frank 28
## 8 Damon, Matt 28
## 9 Freeman, Morgan 27
## 10 Cage, Nicolas 27