::p_load(dplyr)
pacman<- as_tibble(PlantGrowth)
tbl <- head(tbl) tbl2
Playing with Your Data using {dplyr}
{dplyr}
package for processing the data.
Load the Data
In the following example, we use a partial of the PlantGrowth
dataset.
Read this article for more information about the PlantGrowth
dataset itself.
mutate()
baseR
$new <- tbl2$weight + 2 tbl2
Create new column
<- tbl2 %>% mutate(new = weight + 2) tbl2
Replace the value in-place
<- tbl2 %>% mutate(weight = weight + 2) tbl2
Note that when we export it to excel, the backtick won’t shown in the excel file.
%>%
tbl2 mutate(
`Name with space` = "Hello!",
number10 = 10
)
# A tibble: 6 × 5
weight group new `Name with space` number10
<dbl> <fct> <dbl> <chr> <dbl>
1 6.17 ctrl 6.17 Hello! 10
2 7.58 ctrl 7.58 Hello! 10
3 7.18 ctrl 7.18 Hello! 10
4 8.11 ctrl 8.11 Hello! 10
5 6.5 ctrl 6.5 Hello! 10
6 6.61 ctrl 6.61 Hello! 10
For the 3
%>%
tbl2 mutate(v1 = 1, v2 = 2, v3 = 3) %>%
mutate(
across(starts_with("v"), ~ .x + 20))
# A tibble: 6 × 6
weight group new v1 v2 v3
<dbl> <fct> <dbl> <dbl> <dbl> <dbl>
1 6.17 ctrl 6.17 21 22 23
2 7.58 ctrl 7.58 21 22 23
3 7.18 ctrl 7.18 21 22 23
4 8.11 ctrl 8.11 21 22 23
5 6.5 ctrl 6.5 21 22 23
6 6.61 ctrl 6.61 21 22 23
select()
<- tbl2 %>%
tbl3 mutate(var1 = 1, var2 = 2, var3 = "text", var4 = "word")
tbl3
# A tibble: 6 × 7
weight group new var1 var2 var3 var4
<dbl> <fct> <dbl> <dbl> <dbl> <chr> <chr>
1 6.17 ctrl 6.17 1 2 text word
2 7.58 ctrl 7.58 1 2 text word
3 7.18 ctrl 7.18 1 2 text word
4 8.11 ctrl 8.11 1 2 text word
5 6.5 ctrl 6.5 1 2 text word
6 6.61 ctrl 6.61 1 2 text word
%>%
tbl3 select(group, var1, var4)
# A tibble: 6 × 3
group var1 var4
<fct> <dbl> <chr>
1 ctrl 1 word
2 ctrl 1 word
3 ctrl 1 word
4 ctrl 1 word
5 ctrl 1 word
6 ctrl 1 word
%>%
tbl3 select(group, var1:var4)
# A tibble: 6 × 5
group var1 var2 var3 var4
<fct> <dbl> <dbl> <chr> <chr>
1 ctrl 1 2 text word
2 ctrl 1 2 text word
3 ctrl 1 2 text word
4 ctrl 1 2 text word
5 ctrl 1 2 text word
6 ctrl 1 2 text word
%>%
tbl3 select(-group)
# A tibble: 6 × 6
weight new var1 var2 var3 var4
<dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 6.17 6.17 1 2 text word
2 7.58 7.58 1 2 text word
3 7.18 7.18 1 2 text word
4 8.11 8.11 1 2 text word
5 6.5 6.5 1 2 text word
6 6.61 6.61 1 2 text word
%>%
tbl3 select(where(is.numeric))
# A tibble: 6 × 4
weight new var1 var2
<dbl> <dbl> <dbl> <dbl>
1 6.17 6.17 1 2
2 7.58 7.58 1 2
3 7.18 7.18 1 2
4 8.11 8.11 1 2
5 6.5 6.5 1 2
6 6.61 6.61 1 2
Select literally everything but move the column var
to the front.
%>%
tbl3 select(var1, everything())
# A tibble: 6 × 7
var1 weight group new var2 var3 var4
<dbl> <dbl> <fct> <dbl> <dbl> <chr> <chr>
1 1 6.17 ctrl 6.17 2 text word
2 1 7.58 ctrl 7.58 2 text word
3 1 7.18 ctrl 7.18 2 text word
4 1 8.11 ctrl 8.11 2 text word
5 1 6.5 ctrl 6.5 2 text word
6 1 6.61 ctrl 6.61 2 text word
filter()
%>%
tbl filter(weight > 6 & group == "trt2")
# A tibble: 2 × 2
weight group
<dbl> <fct>
1 6.31 trt2
2 6.15 trt2
%>%
tbl filter(weight > 6 | weight < 4)
# A tibble: 6 × 2
weight group
<dbl> <fct>
1 6.11 ctrl
2 3.59 trt1
3 3.83 trt1
4 6.03 trt1
5 6.31 trt2
6 6.15 trt2
Now we want to keep only trt1
and trt2
observations
Option 1:
%>%
tbl filter(group == "trt1" | group == "trt2")
# A tibble: 20 × 2
weight group
<dbl> <fct>
1 4.81 trt1
2 4.17 trt1
3 4.41 trt1
4 3.59 trt1
5 5.87 trt1
6 3.83 trt1
7 6.03 trt1
8 4.89 trt1
9 4.32 trt1
10 4.69 trt1
11 6.31 trt2
12 5.12 trt2
13 5.54 trt2
14 5.5 trt2
15 5.37 trt2
16 5.29 trt2
17 4.92 trt2
18 6.15 trt2
19 5.8 trt2
20 5.26 trt2
Option 2:
%>%
tbl filter(group %in% c("trt1", "trt2"))
# A tibble: 20 × 2
weight group
<dbl> <fct>
1 4.81 trt1
2 4.17 trt1
3 4.41 trt1
4 3.59 trt1
5 5.87 trt1
6 3.83 trt1
7 6.03 trt1
8 4.89 trt1
9 4.32 trt1
10 4.69 trt1
11 6.31 trt2
12 5.12 trt2
13 5.54 trt2
14 5.5 trt2
15 5.37 trt2
16 5.29 trt2
17 4.92 trt2
18 6.15 trt2
19 5.8 trt2
20 5.26 trt2
Option 3:
%>%
tbl filter(group != "ctrl")
# A tibble: 20 × 2
weight group
<dbl> <fct>
1 4.81 trt1
2 4.17 trt1
3 4.41 trt1
4 3.59 trt1
5 5.87 trt1
6 3.83 trt1
7 6.03 trt1
8 4.89 trt1
9 4.32 trt1
10 4.69 trt1
11 6.31 trt2
12 5.12 trt2
13 5.54 trt2
14 5.5 trt2
15 5.37 trt2
16 5.29 trt2
17 4.92 trt2
18 6.15 trt2
19 5.8 trt2
20 5.26 trt2
%>%
tbl filter(weight > mean(weight))
# A tibble: 17 × 2
weight group
<dbl> <fct>
1 5.58 ctrl
2 5.18 ctrl
3 6.11 ctrl
4 5.17 ctrl
5 5.33 ctrl
6 5.14 ctrl
7 5.87 trt1
8 6.03 trt1
9 6.31 trt2
10 5.12 trt2
11 5.54 trt2
12 5.5 trt2
13 5.37 trt2
14 5.29 trt2
15 6.15 trt2
16 5.8 trt2
17 5.26 trt2
arrange()
Firstly we gather the data. Note that we make use of slice()
, whose function is to select data of discrete ranges.
<- tbl %>%
tbl4 slice(1:3, 11:13, 21:23)
# this keeps only rows 1,2,3,11,12,13,21,22,23
Sort by weight (default: ascending)
%>%
tbl4 arrange(weight)
# A tibble: 9 × 2
weight group
<dbl> <fct>
1 4.17 ctrl
2 4.17 trt1
3 4.41 trt1
4 4.81 trt1
5 5.12 trt2
6 5.18 ctrl
7 5.54 trt2
8 5.58 ctrl
9 6.31 trt2
Sort by weight, but descending:
%>%
tbl4 arrange(desc(weight))
# A tibble: 9 × 2
weight group
<dbl> <fct>
1 6.31 trt2
2 5.58 ctrl
3 5.54 trt2
4 5.18 ctrl
5 5.12 trt2
6 4.81 trt1
7 4.41 trt1
8 4.17 ctrl
9 4.17 trt1
Firstly sort by “group”. Within each “group” we sort by “weight”.
%>%
tbl4 arrange(group, weight)
# A tibble: 9 × 2
weight group
<dbl> <fct>
1 4.17 ctrl
2 5.18 ctrl
3 5.58 ctrl
4 4.17 trt1
5 4.41 trt1
6 4.81 trt1
7 5.12 trt2
8 5.54 trt2
9 6.31 trt2
Custom order (e.g. sort not alphabetically):
<- c("trt1", "ctrl", "trt2")
myorder
%>%
tbl4 arrange(
match(group, myorder),
weight )
# A tibble: 9 × 2
weight group
<dbl> <fct>
1 4.17 trt1
2 4.41 trt1
3 4.81 trt1
4 4.17 ctrl
5 5.18 ctrl
6 5.58 ctrl
7 5.12 trt2
8 5.54 trt2
9 6.31 trt2
summarise()
%>%
tbl # no group_by
summarise(my_mean = mean(weight))
# A tibble: 1 × 1
my_mean
<dbl>
1 5.07
But why not use the built-in mean()
function? (i.e. in this case mean(tbl$weight)
).
But summary()
shines when combining with other functions. E.g. we can get the mean of each control groups:
%>%
tbl group_by(group) %>%
summarise(my_mean = mean(weight))
# A tibble: 3 × 2
group my_mean
<fct> <dbl>
1 ctrl 5.03
2 trt1 4.66
3 trt2 5.53
We can also elaborate by combining with multiple functions:
%>%
tbl group_by(group) %>%
summarise(
Mean = mean(weight),
StdDev = sd(weight),
Min = min(weight),
Median = median(weight),
Max = max(weight),
n_Obs = n(),
)
# A tibble: 3 × 7
group Mean StdDev Min Median Max n_Obs
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 ctrl 5.03 0.583 4.17 5.15 6.11 10
2 trt1 4.66 0.794 3.59 4.55 6.03 10
3 trt2 5.53 0.443 4.92 5.44 6.31 10
We can even group by more than one groups
%>%
tbl mutate(larger5 = case_when(
> 5 ~ "yes",
weight < 5 ~ "no"
weight %>%
)) group_by(group, larger5) %>%
summarise(
n_Obs = n(),
Mean = mean(weight)
)
`summarise()` has grouped output by 'group'. You can override using the
`.groups` argument.
# A tibble: 6 × 4
# Groups: group [3]
group larger5 n_Obs Mean
<fct> <chr> <int> <dbl>
1 ctrl no 4 4.45
2 ctrl yes 6 5.42
3 trt1 no 8 4.34
4 trt1 yes 2 5.95
5 trt2 no 1 4.92
6 trt2 yes 9 5.59
What’s next?
Here are some tests for you.