Completed homework should be submitted on CoursePlus as an Rmd file. Please see the course website for more information about submitting assignments: https://jhudatascience.org/intro_to_r/syllabus.html#submitting-assignments.
Homework will be graded for correct output, not code style. All assignments are due at the end of the course. Please see the course website for more information about grading: https://jhudatascience.org/intro_to_r/syllabus.html#grading.
## you can add more, or change...these are suggestions
library(tidyverse)
library(readr)
library(dplyr)
library(ggplot2)
library(tidyr)1. Bring the following dataset into R.
read_csv() and assign it the name “mort”.mort <- read_csv("https://jhudatascience.org/intro_to_r/data/mortality.csv")## New names:
## Rows: 197 Columns: 255
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (254): 1760, 1761, 1762, 1763, 1764, 1765, 1766, 1767, 1768,
## 1769, 1770,...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`2. Run the colnames() function to take a look at the dataset column names. You should see that there was originally no name for the first column and that R replaced it with “…1”. Rename the first column of “mort” to “country” using the rename() function in dplyr.
colnames(mort)##   [1] "...1" "1760" "1761" "1762" "1763" "1764" "1765" "1766" "1767" "1768"
##  [11] "1769" "1770" "1771" "1772" "1773" "1774" "1775" "1776" "1777" "1778"
##  [21] "1779" "1780" "1781" "1782" "1783" "1784" "1785" "1786" "1787" "1788"
##  [31] "1789" "1790" "1791" "1792" "1793" "1794" "1795" "1796" "1797" "1798"
##  [41] "1799" "1800" "1801" "1802" "1803" "1804" "1805" "1806" "1807" "1808"
##  [51] "1809" "1810" "1811" "1812" "1813" "1814" "1815" "1816" "1817" "1818"
##  [61] "1819" "1820" "1821" "1822" "1823" "1824" "1825" "1826" "1827" "1828"
##  [71] "1829" "1830" "1831" "1832" "1833" "1834" "1835" "1836" "1837" "1838"
##  [81] "1839" "1840" "1841" "1842" "1843" "1844" "1845" "1846" "1847" "1848"
##  [91] "1849" "1850" "1851" "1852" "1853" "1854" "1855" "1856" "1857" "1858"
## [101] "1859" "1860" "1861" "1862" "1863" "1864" "1865" "1866" "1867" "1868"
## [111] "1869" "1870" "1871" "1872" "1873" "1874" "1875" "1876" "1877" "1878"
## [121] "1879" "1880" "1881" "1882" "1883" "1884" "1885" "1886" "1887" "1888"
## [131] "1889" "1890" "1891" "1892" "1893" "1894" "1895" "1896" "1897" "1898"
## [141] "1899" "1900" "1901" "1902" "1903" "1904" "1905" "1906" "1907" "1908"
## [151] "1909" "1910" "1911" "1912" "1913" "1914" "1915" "1916" "1917" "1918"
## [161] "1919" "1920" "1921" "1922" "1923" "1924" "1925" "1926" "1927" "1928"
## [171] "1929" "1930" "1931" "1932" "1933" "1934" "1935" "1936" "1937" "1938"
## [181] "1939" "1940" "1941" "1942" "1943" "1944" "1945" "1946" "1947" "1948"
## [191] "1949" "1950" "1951" "1952" "1953" "1954" "1955" "1956" "1957" "1958"
## [201] "1959" "1960" "1961" "1962" "1963" "1964" "1965" "1966" "1967" "1968"
## [211] "1969" "1970" "1971" "1972" "1973" "1974" "1975" "1976" "1977" "1978"
## [221] "1979" "1980" "1981" "1982" "1983" "1984" "1985" "1986" "1987" "1988"
## [231] "1989" "1990" "1991" "1992" "1993" "1994" "1995" "1996" "1997" "1998"
## [241] "1999" "2000" "2001" "2002" "2003" "2004" "2005" "2006" "2007" "2008"
## [251] "2009" "2010" "2030" "2050" "2099"mort <- mort %>% rename(country = `...1`)3. Select only the numeric type columns (select()). Then, create the variable “year” from column names by using the colnames() function to extract them.
year <- mort %>%
  select(-country) %>%
  colnames()
# OR
year <- mort %>%
  select(starts_with(c("1", "2"))) %>%
  colnames()
# OR
year <- mort %>%
  select(where(is.numeric)) %>%
  colnames()4. What is the typeof() for “year”? If it’s not an integer, turn it into integer form with as.integer().
typeof(year)## [1] "character"year <- as.integer(year)
# "year" is of type "character".5. Use the pct_complete() function in the naniar package to determine the percent missing data in “mort”. You might need to load and install naniar!
library(naniar)
pct_complete(mort)## [1] 66.95332# "mort" is 66.95332 percent complete.6. Are there any countries that have a complete record in “mort” across all years? Just look at the output here, don’t reassign it. Hint: look for complete records by dropping all NAs from the dataset using drop_na().
drop_na(mort)## # A tibble: 2 × 255
##   country  `1760` `1761` `1762` `1763` `1764` `1765` `1766` `1767` `1768` `1769`
##   <chr>     <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 Sweden     2.21   2.30   2.79   2.94   2.44   2.35   2.23   2.34   2.44   2.40
## 2 United …   2.20   2.35   2.32   2.32   2.37   2.39   2.27   2.29   2.28   2.32
## # ℹ 244 more variables: `1770` <dbl>, `1771` <dbl>, `1772` <dbl>, `1773` <dbl>,
## #   `1774` <dbl>, `1775` <dbl>, `1776` <dbl>, `1777` <dbl>, `1778` <dbl>,
## #   `1779` <dbl>, `1780` <dbl>, `1781` <dbl>, `1782` <dbl>, `1783` <dbl>,
## #   `1784` <dbl>, `1785` <dbl>, `1786` <dbl>, `1787` <dbl>, `1788` <dbl>,
## #   `1789` <dbl>, `1790` <dbl>, `1791` <dbl>, `1792` <dbl>, `1793` <dbl>,
## #   `1794` <dbl>, `1795` <dbl>, `1796` <dbl>, `1797` <dbl>, `1798` <dbl>,
## #   `1799` <dbl>, `1800` <dbl>, `1801` <dbl>, `1802` <dbl>, `1803` <dbl>, …# Yes - but just two!7. Reshape the “complete” data to long form.
pivot_longer().!COLUMN or -COLUMN means everything except COLUMN.long <-
  pivot_longer(mort, -country, names_to = "year", values_to = "mortality")
# OR
long <-
  pivot_longer(mort, !country, names_to = "year", values_to = "mortality")
# OR
long <-
  pivot_longer(mort, starts_with(c("1", "2")), names_to = "year", values_to = "mortality")8. Bring an additional dataset into R.
read_tsv() and assign it the name “pop”.pop <- read_tsv("https://jhudatascience.org/intro_to_r/data/country_pop.txt")## Rows: 242 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): Country (or dependent territory), Date, % of world population, Source
## dbl (1): Rank
## num (1): Population
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.9. Rename the second column in “pop” to “country” and the column “% of world population”, to “percent”. Use the rename() function. Don’t forget to reassign the renamed data to “pop”.
pop <- pop %>%
  rename(
    country = `Country (or dependent territory)`,
    percent = `% of world population`
  )10. Sort the data in “pop” by “Population” from largest to smallest using arrange() and desc(). After sorting, select() “country” to create an one-column tibble of countries ordered by population. Assign this data the name “country_ordered”.
country_ordered <- pop %>%
  arrange(desc(Population)) %>%
  select(country)11. Subset “long” based on years 2000-2010, including 2000 and 2010 and call this “long_sub” using & or the between() function. Confirm your filtering worked by looking at the range of “year”. If you’re getting a strange error, make sure you created the “year” column in problem #7.
long_sub <- long %>% filter(year >= 2000 & year <= 2010)
long_sub %>%
  pull(year) %>%
  range() # confirm it worked## [1] "2000" "2010"12. Further subset long_sub. You will filter for specific countries using filter() and the %in% operator. Only include countries in this list: c("Venezuela", "Bahrain", "Estonia", "Iran", "Thailand", "Canada"). Make sure to reassign to “long_sub”.
long_sub <- long_sub %>%
  filter(country %in% c("Venezuela", "Bahrain", "Estonia", "Iran", "Thailand", "Canada"))13. Use pivot_wider() to turn the “year” column of “long_sub” into multiple columns, each representing a different year. Fill values (values_from=) with “mortality”. Assign this pivoted dataset the name “mort_sub”.
mort_sub <- long_sub %>%
  pivot_wider(id_cols = country, names_from = year, values_from = mortality)14. Using “country_ordered” and “mort_sub”, right_join() the two datasets by “country”. Use the pipe %>% to join this dataset to “pop”, keeping only the data on the lefthand side of the join. Call this “joined”.
joined <- country_ordered %>%
  right_join(mort_sub, by = "country") %>%
  left_join(pop, by = "country")15. The values in the table are percentages of the total population (not proportion).
select() only “country”, “Population”, and “mort_count” and view the data.Justification is just for fun. The main point is that decisions in your analysis should depend on your reasoning not how many lines of code it takes :)
# Use 2010: There appears to be a downward trend in mortality rates, so using 2010 could be the most accurate for future years.
joined %>%
  mutate(mort_count = Population * `2010` / 100) %>%
  select(country, Population, mort_count)## # A tibble: 6 × 3
##   country   Population mort_count
##   <chr>          <dbl>      <dbl>
## 1 Iran        77056000     79995.
## 2 Thailand    65926261     58939.
## 3 Canada      35002447     17119.
## 4 Venezuela   28946101     40455.
## 5 Estonia      1294455      1122.
## 6 Bahrain      1234571      1609.# OR
# Use an average 2000-2010: Using the average is more robust to fluctuations and uncertainty in the coming years.
avg_mort <- rowMeans(joined %>% select(starts_with("2")))
joined <- joined %>% mutate(avg_pct_2000 = avg_mort)
joined %>%
  mutate(mort_count = Population * avg_pct_2000 / 100) %>%
  select(country, Population, mort_count)## # A tibble: 6 × 3
##   country   Population mort_count
##   <chr>          <dbl>      <dbl>
## 1 Iran        77056000    107019.
## 2 Thailand    65926261     65944.
## 3 Canada      35002447     16886.
## 4 Venezuela   28946101     46433.
## 5 Estonia      1294455      1265.
## 6 Bahrain      1234571      1772.The following questions are not required for full credit, but can make up for any points lost on other questions.
A. Bring the following dataset into R.
read_excel() from the readxl package and assign it the name “asthma”.library(readxl)
asthma <- read_excel("asthma.xlsx", sheet = "Age Group (Years)")B. Rename the column Weighted Number With Current Asthma to “asthma_count” using rename(). Replace the original “asthma” object by calling the new dataset “asthma”.
asthma <- asthma %>% rename("asthma_count" = `Weighted Number With Current Asthma`)C. Separate Percent (SE) into two separate columns: “percent” and “SE” using the separate() function. Replace the original “asthma” object by calling the new dataset “asthma”.
asthma <- asthma %>% separate(`Percent (SE)`, into = c("percent", "SE"), sep = " ")D. Remove the parentheses around the numbers in the new SE column. You should use a combination of str_replace(), pull() (because stringr package functions work on vectors not dataframes!) and mutate(). Replace the original “asthma” object by calling the new dataset “asthma”.
pattern = to find the starting parenthesis is “[(]”pattern = to find for the ending parenthesis is “[)]”replacement = for both can be empty quotation marks: “”asthma <- asthma %>% mutate(SE = str_replace(pull(asthma, SE), pattern = "[(]", replacement = ""))
asthma <- asthma %>% mutate(SE = str_replace(pull(asthma, SE), pattern = "[)]", replacement = ""))E. Determine the class of “percent” and “SE”. Can you take the mean values? Why or why not?
class(pull(asthma, percent))## [1] "character"class(pull(asthma, SE))## [1] "character"# No. Both are character type and math cannot be performed on characters.F. Use as.numeric() to convert “percent” and “SE” to numeric class. Calculate the mean for both.
asthma %>%
  mutate(percent = as.numeric(percent)) %>%
  pull(percent) %>%
  mean()## [1] 7.771429asthma %>%
  mutate(SE = as.numeric(SE)) %>%
  pull(SE) %>%
  mean()## [1] 0.655