show_up_to_work(have_breakfast(glam_up(dress(shower(wake_up("I"))))))
show_up_to_work( have_breakfast( glam_up( dress( shower( wake_up("I") ) ) ) ))
morning1 <- wake_up("I")morning2 <- shower(morning1)morning3 <- dress(morning2)morning4 <- glam_up(morning3)morning5 <- have_breakfast(morning4)morning6 <- show_up_to_work(morning5)
morning1 <- wake_up("I")morning2 <- shower(morning1)morning3 <- dress(morning2)morning4 <- glam_up(morning3)morning5 <- have_breakfast(morning4)morning6 <- show_up_to_work(morning5)
morning_routine <- "I" %>% wake_up() %>% shower() %>% dress() %>% glam_up() %>% have_breakfast() %>% show_up_to_work()
%>%
for expressing a linear sequence of multiple operationsRStudio shortcut: Ctrl/Cmd + Shift + M
library(readxl)library(tidyverse)time_use_raw <- read_xlsx("data/time-use-oecd.xlsx")time_use_raw
#> # A tibble: 461 x 3#> Country Category `Time (minutes)`#> <chr> <chr> <dbl>#> 1 Australia Paid work 211.#> 2 Austria Paid work 280.#> 3 Belgium Paid work 194.#> 4 Canada Paid work 269.#> 5 Denmark Paid work 200.#> 6 Estonia Paid work 231.#> # … with 455 more rows
ggplot(time_use_raw) + geom_col(aes( Country, `Time (minutes)`, fill = Category))
time_use_raw %>% ggplot() + geom_col(aes( Country, `Time (minutes)`, fill = Category))
{dplyr} is a grammar of data manipulation, providing a consistent set of verbs that help you solve the most common data manipulation challenges:
mutate()
adds new variables that are functions of existing variablesselect()
picks variables based on their names.filter()
picks cases based on their values.summarise()
reduces multiple values down to a single summary.arrange()
changes the ordering of the rows.These all combine naturally with group_by()
which allows you to perform any operation “by group”.
time_use <- time_use_raw %>% rename( # new_name = old_name country = Country, category = Category, minutes = `Time (minutes)`)time_use
#> # A tibble: 461 x 3#> country category minutes#> <chr> <chr> <dbl>#> 1 Australia Paid work 211.#> 2 Austria Paid work 280.#> 3 Belgium Paid work 194.#> 4 Canada Paid work 269.#> 5 Denmark Paid work 200.#> 6 Estonia Paid work 231.#> # … with 455 more rows
time_use %>% distinct(category)
#> # A tibble: 14 x 1#> category #> <chr> #> 1 Paid work #> 2 Education #> 3 Care for household members #> 4 Housework #> 5 Shopping #> 6 Other unpaid work & volunteering#> 7 Sleep #> 8 Eating and drinking #> 9 Personal care #> 10 Sports #> 11 Attending events #> 12 Seeing friends #> 13 TV and Radio #> 14 Other leisure activities
time_use %>% distinct(country, category)
#> # A tibble: 461 x 2#> country category #> <chr> <chr> #> 1 Australia Paid work#> 2 Austria Paid work#> 3 Belgium Paid work#> 4 Canada Paid work#> 5 Denmark Paid work#> 6 Estonia Paid work#> # … with 455 more rows
slice()
: subsets rows using their positionstime_use %>% slice((n() - 4):n())
#> # A tibble: 5 x 3#> country category minutes#> <chr> <chr> <dbl>#> 1 UK Other leisure activities 98.4#> 2 USA Other leisure activities 73.5#> 3 China Other leisure activities 53.0#> 4 India Other leisure activities 109. #> 5 South Africa Other leisure activities 81.8
n()
is a context dependent function: the current group size
group_by()
: group by one or more variablestime_use %>% group_by(country)
#> # A tibble: 461 x 3#> # Groups: country [33]#> country category minutes#> <chr> <chr> <dbl>#> 1 Australia Paid work 211.#> 2 Austria Paid work 280.#> 3 Belgium Paid work 194.#> 4 Canada Paid work 269.#> 5 Denmark Paid work 200.#> 6 Estonia Paid work 231.#> # … with 455 more rows
group_by()
: use in conjunction with other verbstime_use %>% group_by(country) %>% slice((n() - 4):n())
#> # A tibble: 165 x 3#> # Groups: country [33]#> country category minutes#> <chr> <chr> <dbl>#> 1 Australia Sports 19.0 #> 2 Australia Attending events 6.00#> 3 Australia Seeing friends 40.0 #> 4 Australia TV and Radio 140. #> 5 Australia Other leisure activities 76.1 #> 6 Austria Sports 32.1 #> # … with 159 more rows
slice()
shortcuts: slice_head()
& slice_tail()
time_use %>% group_by(country) %>% slice_tail(n = 5)
#> # A tibble: 165 x 3#> # Groups: country [33]#> country category minutes#> <chr> <chr> <dbl>#> 1 Australia Sports 19.0 #> 2 Australia Attending events 6.00#> 3 Australia Seeing friends 40.0 #> 4 Australia TV and Radio 140. #> 5 Australia Other leisure activities 76.1 #> 6 Austria Sports 32.1 #> # … with 159 more rows
ungroup()
: removes groupingtime_use %>% group_by(country) %>% slice_tail(n = 5) %>% ungroup()
#> # A tibble: 165 x 3#> country category minutes#> <chr> <chr> <dbl>#> 1 Australia Sports 19.0 #> 2 Australia Attending events 6.00#> 3 Australia Seeing friends 40.0 #> 4 Australia TV and Radio 140. #> 5 Australia Other leisure activities 76.1 #> 6 Austria Sports 32.1 #> # … with 159 more rows
slice_sample()
randomly selects rowsset.seed(220) # an arbitrary numbertime_use %>% slice_sample(n = 10)
#> # A tibble: 10 x 3#> country category minutes#> <chr> <chr> <dbl>#> 1 Norway Housework 82.9#> 2 Austria Education 26.9#> 3 Spain Other leisure activities 84.7#> 4 Estonia Paid work 231. #> 5 Canada Education 36.0#> 6 Turkey Education 29.0#> 7 Netherlands Housework 105. #> 8 Australia Sleep 512. #> 9 Australia Other unpaid work & volunteering 57.5#> 10 Germany Housework 109.
arrange()
: arrange rows by columnstime_use %>% arrange(minutes)
#> # A tibble: 461 x 3#> country category minutes#> <chr> <chr> <dbl>#> 1 Japan Care for household members 0 #> 2 Lithuania Attending events 1.35#> 3 China Attending events 2.00#> 4 Turkey Attending events 2.58#> 5 Poland Attending events 2.81#> 6 Korea Attending events 4.45#> # … with 455 more rows
arrange()
in ascending order by default
arrange()
: arrange rows by columnstime_use %>% arrange(desc(minutes)) # -minutes
#> # A tibble: 461 x 3#> country category minutes#> <chr> <chr> <dbl>#> 1 South Africa Sleep 553.#> 2 China Sleep 542.#> 3 Estonia Sleep 530.#> 4 India Sleep 528.#> 5 USA Sleep 528.#> 6 New Zealand Sleep 526 #> # … with 455 more rows
use desc()
in descending order
arrange()
: arrange rows by columnstime_use %>% arrange(country, desc(minutes))
#> # A tibble: 461 x 3#> country category minutes#> <chr> <chr> <dbl>#> 1 Australia Sleep 512. #> 2 Australia Paid work 211. #> 3 Australia TV and Radio 140. #> 4 Australia Housework 132. #> 5 Australia Eating and drinking 89.1#> 6 Australia Other leisure activities 76.1#> # … with 455 more rows
filter()
: subsets rows using columnstime_use %>% filter(minutes == max(minutes))
#> # A tibble: 1 x 3#> country category minutes#> <chr> <chr> <dbl>#> 1 South Africa Sleep 553.
filter()
observations that satisfy your conditions
filter()
: subsets rows using columnstime_use %>% group_by(country) %>% filter(minutes == max(minutes))
#> # A tibble: 33 x 3#> # Groups: country [33]#> country category minutes#> <chr> <chr> <dbl>#> 1 Australia Sleep 512.#> 2 Austria Sleep 498.#> 3 Belgium Sleep 513.#> 4 Canada Sleep 520 #> 5 Denmark Sleep 489.#> 6 Estonia Sleep 530.#> # … with 27 more rows
filter()
: subsets rows using columnsanz <- c("Australia", "New Zealand")time_use %>% filter(country %in% anz)
#> # A tibble: 28 x 3#> country category minutes#> <chr> <chr> <dbl>#> 1 Australia Paid work 211. #> 2 New Zealand Paid work 241. #> 3 Australia Education 27.0#> 4 New Zealand Education 29 #> 5 Australia Care for household members 44.5#> 6 New Zealand Care for household members 30 #> # … with 22 more rows
filter()
: subsets rows using columnstime_use %>% filter(!(country %in% anz)) # ! logical negation (NOT)
#> # A tibble: 433 x 3#> country category minutes#> <chr> <chr> <dbl>#> 1 Austria Paid work 280.#> 2 Belgium Paid work 194.#> 3 Canada Paid work 269.#> 4 Denmark Paid work 200.#> 5 Estonia Paid work 231.#> 6 Finland Paid work 200.#> # … with 427 more rows
filter()
: subsets rows using columnstime_use %>% filter(country %in% anz, minutes > 30)## time_use %>% ## filter(country %in% anz & minutes > 30)
#> # A tibble: 19 x 3#> country category minutes#> <chr> <chr> <dbl>#> 1 Australia Paid work 211. #> 2 New Zealand Paid work 241. #> 3 Australia Care for household members 44.5#> 4 Australia Housework 132. #> 5 New Zealand Housework 110. #> 6 Australia Other unpaid work & volunteering 57.5#> 7 New Zealand Other unpaid work & volunteering 59 #> 8 Australia Sleep 512. #> 9 New Zealand Sleep 526 #> 10 Australia Eating and drinking 89.1#> 11 New Zealand Eating and drinking 80 #> 12 Australia Personal care 56.0#> 13 New Zealand Personal care 42 #> 14 Australia Seeing friends 40.0#> 15 New Zealand Seeing friends 69 #> 16 Australia TV and Radio 140. #> 17 New Zealand TV and Radio 124 #> 18 Australia Other leisure activities 76.1#> 19 New Zealand Other leisure activities 85
filter()
: subsets rows using columns
filter()
: subsets rows using columnstime_use %>% filter(country %in% anz | minutes > 30)
#> # A tibble: 337 x 3#> country category minutes#> <chr> <chr> <dbl>#> 1 Australia Paid work 211.#> 2 Austria Paid work 280.#> 3 Belgium Paid work 194.#> 4 Canada Paid work 269.#> 5 Denmark Paid work 200.#> 6 Estonia Paid work 231.#> # … with 331 more rows
|
: either ... or ...
x <- c(TRUE, FALSE, TRUE, FALSE)y <- c(FALSE, TRUE, TRUE, FALSE)
x & y
#> [1] FALSE FALSE TRUE FALSE
x | y
#> [1] TRUE TRUE TRUE FALSE
x && y
#> [1] FALSE
x || y
#> [1] TRUE
filter()
: subsets rows using columnstime_use_anz <- time_use %>% filter(country %in% anz)
select()
: subsets columns using their names and typestime_use_anz %>% select(country, category)## time_use_anz %>% ## select(-minutes) # !minutes## time_use_anz %>% ## select(country:category)
#> # A tibble: 28 x 2#> country category #> <chr> <chr> #> 1 Australia Paid work #> 2 New Zealand Paid work #> 3 Australia Education #> 4 New Zealand Education #> 5 Australia Care for household members#> 6 New Zealand Care for household members#> # … with 22 more rows
Selection helpers
starts_with()
: starts with a prefix.ends_with()
: ends with a suffix.contains()
: contains a literal string.time_use_anz %>% select(starts_with("c"))
#> # A tibble: 28 x 2#> country category #> <chr> <chr> #> 1 Australia Paid work #> 2 New Zealand Paid work #> 3 Australia Education #> 4 New Zealand Education #> 5 Australia Care for household members#> 6 New Zealand Care for household members#> # … with 22 more rows
mutate()
: creates, modifies, and deletes columnstime_use_anz2 <- time_use_anz %>% mutate( # new_column = f(existing_column) hours = minutes / 60, iso = case_when( country == "Australia" ~ "AU", TRUE ~ "NZ"))time_use_anz2
#> # A tibble: 28 x 5#> country category minutes hours iso #> <chr> <chr> <dbl> <dbl> <chr>#> 1 Australia Paid work 211. 3.52 AU #> 2 New Zealand Paid work 241. 4.02 NZ #> 3 Australia Education 27.0 0.450 AU #> 4 New Zealand Education 29 0.483 NZ #> 5 Australia Care for household members 44.5 0.742 AU #> 6 New Zealand Care for household members 30 0.5 NZ #> # … with 22 more rows
mutate()
: creates, modifies, and deletes columns
case_when()
: when it's the case, do something
z <- 1:10case_when( # LHS ~ RHS # logical cond ~ replacement val z < 5 ~ "less than 5", z > 5 ~ "greater than 5", TRUE ~ "equal to 5")
#> [1] "less than 5" "less than 5" #> [3] "less than 5" "less than 5" #> [5] "equal to 5" "greater than 5"#> [7] "greater than 5" "greater than 5"#> [9] "greater than 5" "greater than 5"
summarise()
: summarises to one rowtime_use_anz2 %>% summarise( # summarize() min = min(hours), max = max(hours), avg = mean(hours))
#> # A tibble: 1 x 3#> min max avg#> <dbl> <dbl> <dbl>#> 1 0.1 8.77 1.72
summarise()
: summarises each group to fewer rowstime_use_anz2 %>% group_by(category) %>% summarise( min = min(hours), max = max(hours), avg = mean(hours))
#> # A tibble: 14 x 4#> category min max avg#> <chr> <dbl> <dbl> <dbl>#> 1 Attending events 0.1 0.100 0.100#> 2 Care for household members 0.5 0.742 0.621#> 3 Eating and drinking 1.33 1.48 1.41 #> 4 Education 0.450 0.483 0.467#> 5 Housework 1.83 2.20 2.02 #> 6 Other leisure activities 1.27 1.42 1.34 #> 7 Other unpaid work & volunteering 0.959 0.983 0.971#> 8 Paid work 3.52 4.02 3.77 #> 9 Personal care 0.7 0.934 0.817#> 10 Seeing friends 0.667 1.15 0.908#> 11 Shopping 0.4 0.484 0.442#> 12 Sleep 8.54 8.77 8.65 #> 13 Sports 0.283 0.317 0.300#> 14 TV and Radio 2.07 2.33 2.20
%>%
time_use_anz <- time_use %>% filter(country %in% anz)
time_use_anz2 <- time_use_anz %>% mutate( # new_column = f(existing_column) hours = minutes / 60, iso = case_when( country == "Australia" ~ "AU", TRUE ~ "NZ"))
time_use_anz2 %>% group_by(category) %>% summarise( min = min(hours), max = max(hours), avg = mean(hours))
time_use %>% filter(country %in% anz) %>% mutate(hours = minutes / 60) %>% group_by(category) %>% summarise( min = min(hours), max = max(hours), avg = mean(hours))
%>%
time_use %>% filter(country %in% anz) %>% mutate(hours = minutes / 60) %>% group_by(category) %>% summarise( min = min(hours), max = max(hours), avg = mean(hours))
%>%
time_use_anz %>% group_by(country) %>% summarise(n = n())## time_use_anz %>% ## count(country)## time_use_anz %>% ## group_by(Country) %>% ## tally()
#> # A tibble: 2 x 2#> country n#> <chr> <int>#> 1 Australia 14#> 2 New Zealand 14
library(RSQLite)con <- dbConnect(SQLite(), dbname = "data/pisa/pisa-student.db")pisa_db <- tbl(con, "pisa")pisa_db
#> # Source: table<pisa> [?? x 22]#> # Database: sqlite 3.34.1#> # [/Users/wany568/Teaching/stats220/lectures/data/pisa/pisa-student.db]#> year country school_id student_id mother_educ father_educ#> <dbl> <chr> <chr> <chr> <chr> <chr> #> 1 2000 ALB 1001 1 <NA> <NA> #> 2 2000 ALB 1001 3 <NA> <NA> #> 3 2000 ALB 1001 6 <NA> <NA> #> 4 2000 ALB 1001 8 <NA> <NA> #> 5 2000 ALB 1001 11 <NA> <NA> #> 6 2000 ALB 1001 12 <NA> <NA> #> # … with more rows, and 16 more variables: gender <chr>,#> # computer <chr>, internet <chr>, math <dbl>, read <dbl>,#> # science <dbl>, stu_wgt <dbl>, desk <chr>, room <chr>,#> # dishwasher <chr>, television <chr>, computer_n <chr>,#> # car <chr>, book <chr>, wealth <dbl>, escs <dbl>
pisa_sql <- pisa_db %>% filter(country %in% c("NZL", "AUS")) %>% group_by(country) %>% summarise(avg_math = mean(math, na.rm = TRUE)) %>% arrange(desc(avg_math))pisa_sql
#> # Source: lazy query [?? x 2]#> # Database: sqlite 3.34.1#> # [/Users/wany568/Teaching/stats220/lectures/data/pisa/pisa-student.db]#> # Ordered by: desc(avg_math)#> country avg_math#> <chr> <dbl>#> 1 NZL 511.#> 2 AUS 502.
Show SQL queries
pisa_sql %>% show_query()
#> <SQL>#> SELECT `country`, AVG(`math`) AS `avg_math`#> FROM `pisa`#> WHERE (`country` IN ('NZL', 'AUS'))#> GROUP BY `country`#> ORDER BY `avg_math` DESC
Retrieve results to R
pisa_sql %>% collect()
#> # A tibble: 2 x 2#> country avg_math#> <chr> <dbl>#> 1 NZL 511.#> 2 AUS 502.
A key is a variable (or set of variables) that uniquely identifies an observation.
time_use
#> # A tibble: 461 x 3#> country category minutes#> <chr> <chr> <dbl>#> 1 Australia Paid work 211.#> 2 Austria Paid work 280.#> 3 Belgium Paid work 194.#> 4 Canada Paid work 269.#> 5 Denmark Paid work 200.#> 6 Estonia Paid work 231.#> # … with 455 more rows
(country_code <- read_csv("data/countrycode.csv"))
#> # A tibble: 100 x 2#> country country_name#> <chr> <chr> #> 1 AZE Azerbaijan #> 2 ARG Argentina #> 3 AUS Australia #> 4 AUT Austria #> 5 BEL Belgium #> 6 BRA Brazil #> # … with 94 more rows
What's the key for the
time_use
data?
00:30
country
from country_code
to time_use
, by common valuescountry
from time_use
, but country_name
from country_code
time_use %>% filter(country %in% c("New Zealand", "USA")) %>% distinct(country, .keep_all = TRUE)
#> # A tibble: 2 x 3#> country category minutes#> <chr> <chr> <dbl>#> 1 New Zealand Paid work 241.#> 2 USA Paid work 251.
country_code %>% filter(country_name %in% c("New Zealand", "United States"))
#> # A tibble: 2 x 2#> country country_name #> <chr> <chr> #> 1 NZL New Zealand #> 2 USA United States
Mutating joins: add new variables to one data frame from matching observations in another.
inner_join()
: All rows from x
where there are matching values in y
, and all columns from x
and y
image credit: Garrick Aden-Buie
inner_join()
: All rows from x
where there are matching values in y
, and all columns from x
and y
time_use %>% inner_join(country_code, by = c("country" = "country_name"))
#> # A tibble: 377 x 4#> country category minutes country.y#> <chr> <chr> <dbl> <chr> #> 1 Australia Paid work 211. AUS #> 2 Austria Paid work 280. AUT #> 3 Belgium Paid work 194. BEL #> 4 Canada Paid work 269. CAN #> 5 Denmark Paid work 200. DNK #> 6 Estonia Paid work 231. EST #> # … with 371 more rows
by
defaultcountry
column, dup name will be .y
suffixleft_join()
: All rows from x
, and all columns from x
and y
. Rows in x
with no match in y
will have NA
values in the new columns.image credit: Garrick Aden-Buie
left_join()
: All rows from x
, and all columns from x
and y
. Rows in x
with no match in y
will have NA
values in the new columns.time_use %>% left_join(country_code, by = c("country" = "country_name"))
#> # A tibble: 461 x 4#> country category minutes country.y#> <chr> <chr> <dbl> <chr> #> 1 Australia Paid work 211. AUS #> 2 Austria Paid work 280. AUT #> 3 Belgium Paid work 194. BEL #> 4 Canada Paid work 269. CAN #> 5 Denmark Paid work 200. DNK #> 6 Estonia Paid work 231. EST #> # … with 455 more rows
left_join()
: All rows from x
, and all columns from x
and y
. Rows in x
with no match in y
will have NA
values in the new columns.time_use %>% left_join(country_code, by = c("country" = "country_name"))%>% filter(country %in% c("New Zealand", "USA")) %>% group_by(country) %>% slice_head()
#> # A tibble: 2 x 4#> # Groups: country [2]#> country category minutes country.y#> <chr> <chr> <dbl> <chr> #> 1 New Zealand Paid work 241. NZL #> 2 USA Paid work 251. <NA>
right_join()
: All rows from y
, and all columns from x
and y
. Rows in y
with no match in x
will have NA
values in the new columns.
image credit: Garrick Aden-Buie
right_join()
: All rows from y
, and all columns from x
and y
. Rows in y
with no match in x
will have NA
values in the new columns.time_use %>% right_join(country_code, by = c("country" = "country_name"))
#> # A tibble: 450 x 4#> country category minutes country.y#> <chr> <chr> <dbl> <chr> #> 1 Australia Paid work 211. AUS #> 2 Austria Paid work 280. AUT #> 3 Belgium Paid work 194. BEL #> 4 Canada Paid work 269. CAN #> 5 Denmark Paid work 200. DNK #> 6 Estonia Paid work 231. EST #> # … with 444 more rows
full_join()
: All rows and all columns from both x
and y
. Where there are not matching values, returns NA
for the one missing.
image credit: Garrick Aden-Buie
full_join()
: All rows and all columns from both x
and y
. Where there are not matching values, returns NA
for the one missing.time_use %>% full_join(country_code, by = c("country" = "country_name"))
#> # A tibble: 534 x 4#> country category minutes country.y#> <chr> <chr> <dbl> <chr> #> 1 Australia Paid work 211. AUS #> 2 Austria Paid work 280. AUT #> 3 Belgium Paid work 194. BEL #> 4 Canada Paid work 269. CAN #> 5 Denmark Paid work 200. DNK #> 6 Estonia Paid work 231. EST #> # … with 528 more rows
Filtering joins: filter observations from one data frame based on whether or not they match an observation in the other table.
semi_join()
: All rows from x
where there are matching values in y
, keeping just columns from x
.
image credit: Garrick Aden-Buie
semi_join()
: All rows from x
where there are matching values in y
, keeping just columns from x
.time_use %>% semi_join(country_code, by = c("country" = "country_name"))
#> # A tibble: 377 x 3#> country category minutes#> <chr> <chr> <dbl>#> 1 Australia Paid work 211.#> 2 Austria Paid work 280.#> 3 Belgium Paid work 194.#> 4 Canada Paid work 269.#> 5 Denmark Paid work 200.#> 6 Estonia Paid work 231.#> # … with 371 more rows
anti_join()
: All rows from x
where there are not matching values in y
, keeping just columns from x
.
image credit: Garrick Aden-Buie
anti_join()
: All rows from x
where there are not matching values in y
, keeping just columns from x
.time_use %>% anti_join(country_code, by = c("country" = "country_name"))
#> # A tibble: 84 x 3#> country category minutes#> <chr> <chr> <dbl>#> 1 Korea Paid work 288.#> 2 UK Paid work 235.#> 3 USA Paid work 251.#> 4 China Paid work 315.#> 5 India Paid work 272.#> 6 South Africa Paid work 189.#> # … with 78 more rows
pull()
transmute()
relocate()
bind_rows()
bind_cols()
union()
union_all()
setdiff()
intersect()
show_up_to_work(have_breakfast(glam_up(dress(shower(wake_up("I"))))))
show_up_to_work( have_breakfast( glam_up( dress( shower( wake_up("I") ) ) ) ))
Keyboard shortcuts
↑, ←, Pg Up, k | Go to previous slide |
↓, →, Pg Dn, Space, j | Go to next slide |
Home | Go to first slide |
End | Go to last slide |
Number + Return | Go to specific slide |
b / m / f | Toggle blackout / mirrored / fullscreen mode |
c | Clone slideshow |
p | Toggle presenter mode |
t | Restart the presentation timer |
?, h | Toggle this help |
Esc | Back to slideshow |