library(dplyr)
library(ggplot2)
# Load the Youth Tobacco Survey data with jhur::read_yts(), then select()
# "Sample_Size", "Education", and "LocationAbbr". Name this data "yts".
yts <- jhur::read_yts() %>%
  select(Sample_Size, Education, LocationAbbr)
# Alt:
# yts <- read_csv("http://jhudatascience.org/intro_to_R_class/data/Youth_Tobacco_Survey_YTS_Data.csv")
# Boxplot of "Sample_Size" for each "Education" level, built with
# aes(x = Education, y = Sample_Size) and geom_boxplot().
yts %>%
  ggplot(aes(x = Education, y = Sample_Size)) +
  geom_boxplot()
## Warning: Removed 425 rows containing non-finite values (stat_boxplot).
# Use group_by() and tally() to count up the number of lines of data for each
# "Education" group.
yts %>%
  group_by(Education) %>%
  tally()
## # A tibble: 2 x 2
## Education n
## <chr> <int>
## 1 High School 4588
## 2 Middle School 5206
# Make "Education" a factor with the mutate() and factor() functions. Use the
# levels argument inside factor() to reorder "Education" so that
# "Middle School" comes before "High School". Assign the output the name
# "yts_fct".
yts_fct <- yts %>%
  mutate(
    Education = factor(Education, levels = c("Middle School", "High School"))
  )
# Repeat the boxplot and the tally table using "yts_fct". With the releveled
# factor, the tally table now lists "Middle School" before "High School".
yts_fct %>%
  ggplot(aes(x = Education, y = Sample_Size)) +
  geom_boxplot()
## Warning: Removed 425 rows containing non-finite values (stat_boxplot).
yts_fct %>%
  group_by(Education) %>%
  tally()
## # A tibble: 2 x 2
## Education n
## <fct> <int>
## 1 Middle School 5206
## 2 High School 4588
# BONUS ----
# Convert "LocationAbbr" to a factor with the mutate() and factor() functions.
# Do not add a levels = argument.
yts_fct <- yts_fct %>%
  mutate(LocationAbbr = factor(LocationAbbr))
# group_by() "LocationAbbr", then use mutate() to create a new column
# "med_sample_size" that is the median "Sample_Size". Because the data are
# grouped, a median "Sample_Size" is computed for each unique level in
# "LocationAbbr". Use the median() function with na.rm = TRUE.
yts_fct <- yts_fct %>%
  group_by(LocationAbbr) %>%
  mutate(med_sample_size = median(Sample_Size, na.rm = TRUE)) %>%
  # Drop the grouping once the per-group median is attached, so later verbs
  # applied to yts_fct are not silently evaluated per LocationAbbr.
  ungroup()
# Using the forcats package, create a plot that:
#   - uses the mapping argument and the fct_reorder() function to order the
#     x-axis ("LocationAbbr") by "med_sample_size"
#   - is a boxplot (geom_boxplot()) of "Sample_Size"
library(forcats)
yts_fct %>%
  ggplot(
    mapping = aes(
      x = fct_reorder(LocationAbbr, med_sample_size),
      y = Sample_Size
    )
  ) +
  geom_boxplot()
## Warning: Removed 425 rows containing non-finite values (stat_boxplot).