pacman::p_load(dplyr)
tbl <- as_tibble(PlantGrowth)
tbl2 <- head(tbl)Playing with Your Data using {dplyr}
{dplyr} package for processing the data.
Load the Data
In the following example, we use a partial of the PlantGrowth dataset.
Read this article for more information about the PlantGrowth dataset itself.
mutate()
baseR
tbl2$new <- tbl2$weight + 2Create new column
tbl2 <- tbl2 %>% mutate(new = weight + 2)Replace the value in-place
tbl2 <- tbl2 %>% mutate(weight = weight + 2)Note that when we export it to excel, the backtick won’t shown in the excel file.
tbl2 %>%
mutate(
`Name with space` = "Hello!",
number10 = 10
)# A tibble: 6 × 5
weight group new `Name with space` number10
<dbl> <fct> <dbl> <chr> <dbl>
1 6.17 ctrl 6.17 Hello! 10
2 7.58 ctrl 7.58 Hello! 10
3 7.18 ctrl 7.18 Hello! 10
4 8.11 ctrl 8.11 Hello! 10
5 6.5 ctrl 6.5 Hello! 10
6 6.61 ctrl 6.61 Hello! 10
For the 3
tbl2 %>%
mutate(v1 = 1, v2 = 2, v3 = 3) %>%
mutate(
across(starts_with("v"), ~ .x + 20))# A tibble: 6 × 6
weight group new v1 v2 v3
<dbl> <fct> <dbl> <dbl> <dbl> <dbl>
1 6.17 ctrl 6.17 21 22 23
2 7.58 ctrl 7.58 21 22 23
3 7.18 ctrl 7.18 21 22 23
4 8.11 ctrl 8.11 21 22 23
5 6.5 ctrl 6.5 21 22 23
6 6.61 ctrl 6.61 21 22 23
select()
tbl3 <- tbl2 %>%
mutate(var1 = 1, var2 = 2, var3 = "text", var4 = "word")
tbl3# A tibble: 6 × 7
weight group new var1 var2 var3 var4
<dbl> <fct> <dbl> <dbl> <dbl> <chr> <chr>
1 6.17 ctrl 6.17 1 2 text word
2 7.58 ctrl 7.58 1 2 text word
3 7.18 ctrl 7.18 1 2 text word
4 8.11 ctrl 8.11 1 2 text word
5 6.5 ctrl 6.5 1 2 text word
6 6.61 ctrl 6.61 1 2 text word
tbl3 %>%
select(group, var1, var4)# A tibble: 6 × 3
group var1 var4
<fct> <dbl> <chr>
1 ctrl 1 word
2 ctrl 1 word
3 ctrl 1 word
4 ctrl 1 word
5 ctrl 1 word
6 ctrl 1 word
tbl3 %>%
select(group, var1:var4)# A tibble: 6 × 5
group var1 var2 var3 var4
<fct> <dbl> <dbl> <chr> <chr>
1 ctrl 1 2 text word
2 ctrl 1 2 text word
3 ctrl 1 2 text word
4 ctrl 1 2 text word
5 ctrl 1 2 text word
6 ctrl 1 2 text word
tbl3 %>%
select(-group)# A tibble: 6 × 6
weight new var1 var2 var3 var4
<dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 6.17 6.17 1 2 text word
2 7.58 7.58 1 2 text word
3 7.18 7.18 1 2 text word
4 8.11 8.11 1 2 text word
5 6.5 6.5 1 2 text word
6 6.61 6.61 1 2 text word
tbl3 %>%
select(where(is.numeric))# A tibble: 6 × 4
weight new var1 var2
<dbl> <dbl> <dbl> <dbl>
1 6.17 6.17 1 2
2 7.58 7.58 1 2
3 7.18 7.18 1 2
4 8.11 8.11 1 2
5 6.5 6.5 1 2
6 6.61 6.61 1 2
Select literally everything but move the column var to the front.
tbl3 %>%
select(var1, everything())# A tibble: 6 × 7
var1 weight group new var2 var3 var4
<dbl> <dbl> <fct> <dbl> <dbl> <chr> <chr>
1 1 6.17 ctrl 6.17 2 text word
2 1 7.58 ctrl 7.58 2 text word
3 1 7.18 ctrl 7.18 2 text word
4 1 8.11 ctrl 8.11 2 text word
5 1 6.5 ctrl 6.5 2 text word
6 1 6.61 ctrl 6.61 2 text word
filter()
tbl %>%
filter(weight > 6 & group == "trt2")# A tibble: 2 × 2
weight group
<dbl> <fct>
1 6.31 trt2
2 6.15 trt2
tbl %>%
filter(weight > 6 | weight < 4)# A tibble: 6 × 2
weight group
<dbl> <fct>
1 6.11 ctrl
2 3.59 trt1
3 3.83 trt1
4 6.03 trt1
5 6.31 trt2
6 6.15 trt2
Now we want to keep only trt1 and trt2 observations
Option 1:
tbl %>%
filter(group == "trt1" | group == "trt2")# A tibble: 20 × 2
weight group
<dbl> <fct>
1 4.81 trt1
2 4.17 trt1
3 4.41 trt1
4 3.59 trt1
5 5.87 trt1
6 3.83 trt1
7 6.03 trt1
8 4.89 trt1
9 4.32 trt1
10 4.69 trt1
11 6.31 trt2
12 5.12 trt2
13 5.54 trt2
14 5.5 trt2
15 5.37 trt2
16 5.29 trt2
17 4.92 trt2
18 6.15 trt2
19 5.8 trt2
20 5.26 trt2
Option 2:
tbl %>%
filter(group %in% c("trt1", "trt2"))# A tibble: 20 × 2
weight group
<dbl> <fct>
1 4.81 trt1
2 4.17 trt1
3 4.41 trt1
4 3.59 trt1
5 5.87 trt1
6 3.83 trt1
7 6.03 trt1
8 4.89 trt1
9 4.32 trt1
10 4.69 trt1
11 6.31 trt2
12 5.12 trt2
13 5.54 trt2
14 5.5 trt2
15 5.37 trt2
16 5.29 trt2
17 4.92 trt2
18 6.15 trt2
19 5.8 trt2
20 5.26 trt2
Option 3:
tbl %>%
filter(group != "ctrl")# A tibble: 20 × 2
weight group
<dbl> <fct>
1 4.81 trt1
2 4.17 trt1
3 4.41 trt1
4 3.59 trt1
5 5.87 trt1
6 3.83 trt1
7 6.03 trt1
8 4.89 trt1
9 4.32 trt1
10 4.69 trt1
11 6.31 trt2
12 5.12 trt2
13 5.54 trt2
14 5.5 trt2
15 5.37 trt2
16 5.29 trt2
17 4.92 trt2
18 6.15 trt2
19 5.8 trt2
20 5.26 trt2
tbl %>%
filter(weight > mean(weight))# A tibble: 17 × 2
weight group
<dbl> <fct>
1 5.58 ctrl
2 5.18 ctrl
3 6.11 ctrl
4 5.17 ctrl
5 5.33 ctrl
6 5.14 ctrl
7 5.87 trt1
8 6.03 trt1
9 6.31 trt2
10 5.12 trt2
11 5.54 trt2
12 5.5 trt2
13 5.37 trt2
14 5.29 trt2
15 6.15 trt2
16 5.8 trt2
17 5.26 trt2
arrange()
Firstly we gather the data. Note that we make use of slice(), whose function is to select data of discrete ranges.
tbl4 <- tbl %>%
slice(1:3, 11:13, 21:23)
# this keeps only rows 1,2,3,11,12,13,21,22,23Sort by weight (default: ascending)
tbl4 %>%
arrange(weight) # A tibble: 9 × 2
weight group
<dbl> <fct>
1 4.17 ctrl
2 4.17 trt1
3 4.41 trt1
4 4.81 trt1
5 5.12 trt2
6 5.18 ctrl
7 5.54 trt2
8 5.58 ctrl
9 6.31 trt2
Sort by weight, but descending:
tbl4 %>%
arrange(desc(weight))# A tibble: 9 × 2
weight group
<dbl> <fct>
1 6.31 trt2
2 5.58 ctrl
3 5.54 trt2
4 5.18 ctrl
5 5.12 trt2
6 4.81 trt1
7 4.41 trt1
8 4.17 ctrl
9 4.17 trt1
Firstly sort by “group”. Within each “group” we sort by “weight”.
tbl4 %>%
arrange(group, weight)# A tibble: 9 × 2
weight group
<dbl> <fct>
1 4.17 ctrl
2 5.18 ctrl
3 5.58 ctrl
4 4.17 trt1
5 4.41 trt1
6 4.81 trt1
7 5.12 trt2
8 5.54 trt2
9 6.31 trt2
Custom order (e.g. sort not alphabetically):
myorder <- c("trt1", "ctrl", "trt2")
tbl4 %>%
arrange(
match(group, myorder),
weight
)# A tibble: 9 × 2
weight group
<dbl> <fct>
1 4.17 trt1
2 4.41 trt1
3 4.81 trt1
4 4.17 ctrl
5 5.18 ctrl
6 5.58 ctrl
7 5.12 trt2
8 5.54 trt2
9 6.31 trt2
summarise()
tbl %>%
# no group_by
summarise(my_mean = mean(weight))# A tibble: 1 × 1
my_mean
<dbl>
1 5.07
But why not use the built-in mean() function? (i.e. in this case mean(tbl$weight)).
But summary() shines when combining with other functions. E.g. we can get the mean of each control groups:
tbl %>%
group_by(group) %>%
summarise(my_mean = mean(weight))# A tibble: 3 × 2
group my_mean
<fct> <dbl>
1 ctrl 5.03
2 trt1 4.66
3 trt2 5.53
We can also elaborate by combining with multiple functions:
tbl %>%
group_by(group) %>%
summarise(
Mean = mean(weight),
StdDev = sd(weight),
Min = min(weight),
Median = median(weight),
Max = max(weight),
n_Obs = n(),
)# A tibble: 3 × 7
group Mean StdDev Min Median Max n_Obs
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 ctrl 5.03 0.583 4.17 5.15 6.11 10
2 trt1 4.66 0.794 3.59 4.55 6.03 10
3 trt2 5.53 0.443 4.92 5.44 6.31 10
We can even group by more than one groups
tbl %>%
mutate(larger5 = case_when(
weight > 5 ~ "yes",
weight < 5 ~ "no"
)) %>%
group_by(group, larger5) %>%
summarise(
n_Obs = n(),
Mean = mean(weight)
)`summarise()` has grouped output by 'group'. You can override using the
`.groups` argument.
# A tibble: 6 × 4
# Groups: group [3]
group larger5 n_Obs Mean
<fct> <chr> <int> <dbl>
1 ctrl no 4 4.45
2 ctrl yes 6 5.42
3 trt1 no 8 4.34
4 trt1 yes 2 5.95
5 trt2 no 1 4.92
6 trt2 yes 9 5.59
What’s next?
Here are some tests for you.