library(dplyr)
library(ggplot2)
# Load the data with jhur::read_yts(), then select "Sample_Size",
# "Education", and "LocationAbbr". Name this data "yts".
yts <- jhur::read_yts() %>%
  select(Sample_Size, Education, LocationAbbr)
# Alt:
# yts <- read_csv("http://jhudatascience.org/intro_to_R_class/data/Youth_Tobacco_Survey_YTS_Data.csv")
# Make a boxplot of "Sample_Size" by "Education" using
# aes(x = Education, y = Sample_Size) and geom_boxplot().
yts %>%
  ggplot(aes(x = Education, y = Sample_Size)) +
  geom_boxplot()
## Warning: Removed 425 rows containing non-finite values (stat_boxplot).
# Use group_by and tally to count up the number of lines of data for each
# "Education" group.
yts %>%
  group_by(Education) %>%
  tally()
## # A tibble: 2 x 2
## Education n
## <chr> <int>
## 1 High School 4588
## 2 Middle School 5206
# Make "Education" a factor using the mutate and factor functions. Use the
# levels argument inside factor to reorder "Education". Reorder this variable
# so that "Middle School" comes before "High School". Assign the output the
# name "yts_fct".
yts_fct <- yts %>%
  mutate(
    Education = factor(Education, levels = c("Middle School", "High School"))
  )
# Repeat the boxplot and the tally table using "yts_fct". In the tally output
# below, "Middle School" is now listed before "High School".
yts_fct %>%
  ggplot(aes(x = Education, y = Sample_Size)) +
  geom_boxplot()
## Warning: Removed 425 rows containing non-finite values (stat_boxplot).
yts_fct %>%
  group_by(Education) %>%
  tally()
## # A tibble: 2 x 2
## Education n
## <fct> <int>
## 1 Middle School 5206
## 2 High School 4588
# BONUS
# Convert "LocationAbbr" to a factor using the mutate and factor functions.
# Do not add a levels = argument.
yts_fct <- yts_fct %>%
  mutate(LocationAbbr = factor(LocationAbbr))
# group_by "LocationAbbr", then use mutate to create a new column
# "med_sample_size" that is the median "Sample_Size". Because of the group_by,
# a median "Sample_Size" will automatically be created for each unique level
# in "LocationAbbr". Use the median function with na.rm = TRUE.
#
# The grouping is dropped at the end so that later verbs applied to yts_fct
# do not silently operate per "LocationAbbr".
yts_fct <- yts_fct %>%
  group_by(LocationAbbr) %>%
  mutate(med_sample_size = median(Sample_Size, na.rm = TRUE)) %>%
  ungroup()
# Using the forcats package, create a plot that:
# - uses the mapping argument and the fct_reorder function to order the x-axis
#   by "med_sample_size"
# - is a boxplot (geom_boxplot)
library(forcats)
yts_fct %>%
  ggplot(
    mapping = aes(
      x = fct_reorder(LocationAbbr, med_sample_size),
      y = Sample_Size
    )
  ) +
  geom_boxplot()
## Warning: Removed 425 rows containing non-finite values (stat_boxplot).