library(readr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ dplyr 1.0.6
## ✓ tibble 3.1.2 ✓ stringr 1.4.0
## ✓ tidyr 1.1.3 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(jhur)
First, create a vector of class integer called int_vect
that starts at 1, goes up to 10, and repeats this sequence 3 times using rep()
(hint: use seq()).
# Integer sequence 1..10, repeated three times end to end (30 values).
int_vect <- rep(seq(from = 1, to = 10), times = 3)
rand_vect
(hint: use sample()). Because we are using a random vector, let’s use the set.seed()
function to make sure we all have the same result - this code is already in the code chunk for you. Simply create the vector below the set.seed()
line.
# Fix the random number generator state so everyone draws the same values.
set.seed(1234)
# Draw 30 integers from 1..30; with replacement, so values may repeat.
rand_vect <- sample(
  1:30,
  size = 30,
  replace = TRUE
)
c(TRUE, TRUE, FALSE)
10 times called TF_vect.
Also create a vector that repeats c("TRUE", "TRUE", "FALSE")
10 times called TF_vect2.
TF_vect <- rep(c(TRUE, TRUE, FALSE), times = 10)
# Same pattern, but written as quoted strings, repeated 10 times.
TF_vect2 <- rep(
  c("TRUE", "TRUE", "FALSE"),
  times = 10
)
Create a tibble combining these vectors together called vect_data
using the following code.
# Combine the four length-30 vectors into one tibble, one column per vector.
vect_data <- tibble(
  int_vect = int_vect,
  rand_vect = rand_vect,
  TF_vect = TF_vect,
  TF_vect2 = TF_vect2
)
slice_sample()
function. Try this a few times to see how the results change.
slice_sample(vect_data, n = 5)
## # A tibble: 5 x 4
## int_vect rand_vect TF_vect TF_vect2
## <int> <int> <lgl> <chr>
## 1 3 26 FALSE FALSE
## 2 4 22 TRUE TRUE
## 3 6 4 TRUE TRUE
## 4 5 5 TRUE TRUE
## 5 2 16 TRUE TRUE
# Draw another 5 random rows; no seed is reset here, so each call differs.
vect_data %>%
  slice_sample(n = 5)
## # A tibble: 5 x 4
## int_vect rand_vect TF_vect TF_vect2
## <int> <int> <lgl> <chr>
## 1 10 24 FALSE FALSE
## 2 5 22 FALSE FALSE
## 3 8 9 TRUE TRUE
## 4 7 21 FALSE FALSE
## 5 10 20 TRUE TRUE
# Third draw of 5 random rows, to compare against the earlier samples.
vect_data %>%
  slice_sample(n = 5)
## # A tibble: 5 x 4
## int_vect rand_vect TF_vect TF_vect2
## <int> <int> <lgl> <chr>
## 1 6 26 TRUE TRUE
## 2 2 4 FALSE FALSE
## 3 3 26 FALSE FALSE
## 4 3 24 TRUE TRUE
## 5 9 5 FALSE FALSE
TF_vect
is logical. Check to see if TF_vect2
is logical. Why are the results what they are?
is.logical(TF_vect)
## [1] TRUE
# Check whether the quoted version is of class logical.
TF_vect2 %>%
  is.logical()
## [1] FALSE
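Logical vectors do not have quotes around the TRUE
and FALSE
values. Because the values in TF_vect2 are quoted, R stores them as character strings, so TF_vect2 is a character vector rather than a logical one.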
mutate()
function to create a new variable in the data set named type_fact
that is of class factor
made from the int_vect
variable. Take a look at the data.
vect_data <- mutate(vect_data, type_fact = factor(int_vect))
# Display the tibble, including the new type_fact column.
print(vect_data)
## # A tibble: 30 x 5
## int_vect rand_vect TF_vect TF_vect2 type_fact
## <int> <int> <lgl> <chr> <fct>
## 1 1 28 TRUE TRUE 1
## 2 2 16 TRUE TRUE 2
## 3 3 26 FALSE FALSE 3
## 4 4 22 TRUE TRUE 4
## 5 5 5 TRUE TRUE 5
## 6 6 12 FALSE FALSE 6
## 7 7 15 TRUE TRUE 7
## 8 8 9 TRUE TRUE 8
## 9 9 5 FALSE FALSE 9
## 10 10 6 TRUE TRUE 10
## # … with 20 more rows
read_circulator()
function from jhur
package. Assign it to circ
variable. Use str()
function to take a look at the data and learn column types.
circ <- read_circulator()
## Rows: 1146 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): day, date
## dbl (13): orangeBoardings, orangeAlightings, orangeAverage, purpleBoardings,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show the structure of circ: one line per column with its type.
circ %>%
  str()
## spec_tbl_df [1,146 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ day : chr [1:1146] "Monday" "Tuesday" "Wednesday" "Thursday" ...
## $ date : chr [1:1146] "01/11/2010" "01/12/2010" "01/13/2010" "01/14/2010" ...
## $ orangeBoardings : num [1:1146] 877 777 1203 1194 1645 ...
## $ orangeAlightings: num [1:1146] 1027 815 1220 1233 1643 ...
## $ orangeAverage : num [1:1146] 952 796 1212 1214 1644 ...
## $ purpleBoardings : num [1:1146] NA NA NA NA NA NA NA NA NA NA ...
## $ purpleAlightings: num [1:1146] NA NA NA NA NA NA NA NA NA NA ...
## $ purpleAverage : num [1:1146] NA NA NA NA NA NA NA NA NA NA ...
## $ greenBoardings : num [1:1146] NA NA NA NA NA NA NA NA NA NA ...
## $ greenAlightings : num [1:1146] NA NA NA NA NA NA NA NA NA NA ...
## $ greenAverage : num [1:1146] NA NA NA NA NA NA NA NA NA NA ...
## $ bannerBoardings : num [1:1146] NA NA NA NA NA NA NA NA NA NA ...
## $ bannerAlightings: num [1:1146] NA NA NA NA NA NA NA NA NA NA ...
## $ bannerAverage : num [1:1146] NA NA NA NA NA NA NA NA NA NA ...
## $ daily : num [1:1146] 952 796 1212 1214 1644 ...
## - attr(*, "spec")=
## .. cols(
## .. day = col_character(),
## .. date = col_character(),
## .. orangeBoardings = col_double(),
## .. orangeAlightings = col_double(),
## .. orangeAverage = col_double(),
## .. purpleBoardings = col_double(),
## .. purpleAlightings = col_double(),
## .. purpleAverage = col_double(),
## .. greenBoardings = col_double(),
## .. greenAlightings = col_double(),
## .. greenAverage = col_double(),
## .. bannerBoardings = col_double(),
## .. bannerAlightings = col_double(),
## .. bannerAverage = col_double(),
## .. daily = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
mutate()
function to create a new column named date_formatted
that is of Date
class. The new variable is created from date
column. Hint: use mdy()
function.
circ <- mutate(circ, date_formatted = mdy(date))
date_formatted
variable to be before date
using the relocate
function. Take a look at the data using glimpse()
. Note the difference between date
and date_formatted
columns.
circ <- circ %>% relocate(date_formatted, .before = date)
# Compact column-wise view; compare the types of date_formatted and date.
circ %>%
  glimpse()
## Rows: 1,146
## Columns: 16
## $ day <chr> "Monday", "Tuesday", "Wednesday", "Thursday", "Friday…
## $ date_formatted <date> 2010-01-11, 2010-01-12, 2010-01-13, 2010-01-14, 2010…
## $ date <chr> "01/11/2010", "01/12/2010", "01/13/2010", "01/14/2010…
## $ orangeBoardings <dbl> 877, 777, 1203, 1194, 1645, 1457, 839, 999, 1023, 137…
## $ orangeAlightings <dbl> 1027, 815, 1220, 1233, 1643, 1524, 938, 1000, 1047, 1…
## $ orangeAverage <dbl> 952.0, 796.0, 1211.5, 1213.5, 1644.0, 1490.5, 888.5, …
## $ purpleBoardings <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ purpleAlightings <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ purpleAverage <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ greenBoardings <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ greenAlightings <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ greenAverage <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ bannerBoardings <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ bannerAlightings <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ bannerAverage <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ daily <dbl> 952.0, 796.0, 1211.5, 1213.5, 1644.0, 1490.5, 888.5, …
classes_data
that combines the vect_data
from the first part of the lab and circ
. Use glimpse()
to look at the data.
classes_data <-list(vect_data, circ)
# Inspect the list: each element is shown with its own columns.
classes_data %>%
  glimpse()
## List of 2
## $ : tibble [30 × 5] (S3: tbl_df/tbl/data.frame)
## ..$ int_vect : int [1:30] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ rand_vect: int [1:30] 28 16 26 22 5 12 15 9 5 6 ...
## ..$ TF_vect : logi [1:30] TRUE TRUE FALSE TRUE TRUE FALSE ...
## ..$ TF_vect2 : chr [1:30] "TRUE" "TRUE" "FALSE" "TRUE" ...
## ..$ type_fact: Factor w/ 10 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ : tibble [1,146 × 16] (S3: tbl_df/tbl/data.frame)
## ..$ day : chr [1:1146] "Monday" "Tuesday" "Wednesday" "Thursday" ...
## ..$ date_formatted : Date[1:1146], format: "2010-01-11" "2010-01-12" ...
## ..$ date : chr [1:1146] "01/11/2010" "01/12/2010" "01/13/2010" "01/14/2010" ...
## ..$ orangeBoardings : num [1:1146] 877 777 1203 1194 1645 ...
## ..$ orangeAlightings: num [1:1146] 1027 815 1220 1233 1643 ...
## ..$ orangeAverage : num [1:1146] 952 796 1212 1214 1644 ...
## ..$ purpleBoardings : num [1:1146] NA NA NA NA NA NA NA NA NA NA ...
## ..$ purpleAlightings: num [1:1146] NA NA NA NA NA NA NA NA NA NA ...
## ..$ purpleAverage : num [1:1146] NA NA NA NA NA NA NA NA NA NA ...
## ..$ greenBoardings : num [1:1146] NA NA NA NA NA NA NA NA NA NA ...
## ..$ greenAlightings : num [1:1146] NA NA NA NA NA NA NA NA NA NA ...
## ..$ greenAverage : num [1:1146] NA NA NA NA NA NA NA NA NA NA ...
## ..$ bannerBoardings : num [1:1146] NA NA NA NA NA NA NA NA NA NA ...
## ..$ bannerAlightings: num [1:1146] NA NA NA NA NA NA NA NA NA NA ...
## ..$ bannerAverage : num [1:1146] NA NA NA NA NA NA NA NA NA NA ...
## ..$ daily : num [1:1146] 952 796 1212 1214 1644 ...
Bonus: Use range()
function on date_formatted
variable to display the range of dates in the data set. How does this compare to that of date
? Why?
# Earliest and latest value of the Date-class column.
range(circ[["date_formatted"]])
## [1] "2010-01-11" "2013-03-01"
# Same call on the original character column, for comparison.
range(circ[["date"]])
## [1] "01/01/2011" "12/31/2012"
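The result of range(circ$date) is not the true range of dates. The date column is of class character, so its values are sorted alphabetically as text (for example, "01/01/2011" sorts before "01/11/2010") rather than chronologically. The date_formatted column is of class Date, so range(circ$date_formatted) gives the actual earliest and latest dates.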
Bonus: Use table()
function on day
variable to display the count of orangeBoardings
observations with each day. Which day had the most boardings? Is this true for the other routes (purpleBoardings
, greenBoardings
, bannerBoardings
)?
# Total orange-route boardings per day of the week.
# count() with wt sums the weight column per group and drops missing values.
circ %>%
  count(day, wt = orangeBoardings)
## # A tibble: 7 x 2
## day n
## <chr> <dbl>
## 1 Friday 584070
## 2 Monday 479914
## 3 Saturday 437682
## 4 Sunday 283648
## 5 Thursday 495142
## 6 Tuesday 488682
## 7 Wednesday 501440
# Total purple-route boardings per day of the week.
# count() with wt sums the weight column per group and drops missing values.
circ %>%
  count(day, wt = purpleBoardings)
## # A tibble: 7 x 2
## day n
## <chr> <dbl>
## 1 Friday 698199
## 2 Monday 561344
## 3 Saturday 501958
## 4 Sunday 333439
## 5 Thursday 590142
## 6 Tuesday 576164
## 7 Wednesday 601997
# Total green-route boardings per day of the week.
# count() with wt sums the weight column per group and drops missing values.
circ %>%
  count(day, wt = greenBoardings)
## # A tibble: 7 x 2
## day n
## <chr> <dbl>
## 1 Friday 162829
## 2 Monday 136575
## 3 Saturday 109121
## 4 Sunday 81044
## 5 Thursday 145750
## 6 Tuesday 148742
## 7 Wednesday 151503
# Total banner-route boardings per day of the week.
# count() with wt sums the weight column per group and drops missing values.
circ %>%
  count(day, wt = bannerBoardings)
## # A tibble: 7 x 2
## day n
## <chr> <dbl>
## 1 Friday 39530
## 2 Monday 28953
## 3 Saturday 35957
## 4 Sunday 25950
## 5 Thursday 32561
## 6 Tuesday 29231
## 7 Wednesday 31809