Prepare the input data

download.file(url="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv", 
              destfile = "data/time_series_covid19_confirmed_global.csv")
time_series_confirmed <- read_csv("data/time_series_covid19_confirmed_global.csv")%>%
  rename(Province_State = "Province/State", Country_Region = "Country/Region")
head(time_series_confirmed)
## # A tibble: 6 x 254
##   Province_State Country_Region   Lat   Long `1/22/20` `1/23/20` `1/24/20`
##   <chr>          <chr>          <dbl>  <dbl>     <dbl>     <dbl>     <dbl>
## 1 <NA>           Afghanistan     33.9  67.7          0         0         0
## 2 <NA>           Albania         41.2  20.2          0         0         0
## 3 <NA>           Algeria         28.0   1.66         0         0         0
## 4 <NA>           Andorra         42.5   1.52         0         0         0
## 5 <NA>           Angola         -11.2  17.9          0         0         0
## 6 <NA>           Antigua and B…  17.1 -61.8          0         0         0
## # … with 247 more variables: `1/25/20` <dbl>, `1/26/20` <dbl>, `1/27/20` <dbl>,
## #   `1/28/20` <dbl>, `1/29/20` <dbl>, `1/30/20` <dbl>, `1/31/20` <dbl>,
## #   `2/1/20` <dbl>, `2/2/20` <dbl>, `2/3/20` <dbl>, `2/4/20` <dbl>,
## #   `2/5/20` <dbl>, `2/6/20` <dbl>, `2/7/20` <dbl>, `2/8/20` <dbl>,
## #   `2/9/20` <dbl>, `2/10/20` <dbl>, `2/11/20` <dbl>, `2/12/20` <dbl>,
## #   `2/13/20` <dbl>, `2/14/20` <dbl>, `2/15/20` <dbl>, `2/16/20` <dbl>,
## #   `2/17/20` <dbl>, `2/18/20` <dbl>, `2/19/20` <dbl>, `2/20/20` <dbl>,
## #   `2/21/20` <dbl>, `2/22/20` <dbl>, `2/23/20` <dbl>, `2/24/20` <dbl>,
## #   `2/25/20` <dbl>, `2/26/20` <dbl>, `2/27/20` <dbl>, `2/28/20` <dbl>,
## #   `2/29/20` <dbl>, `3/1/20` <dbl>, `3/2/20` <dbl>, `3/3/20` <dbl>,
## #   `3/4/20` <dbl>, `3/5/20` <dbl>, `3/6/20` <dbl>, `3/7/20` <dbl>,
## #   `3/8/20` <dbl>, `3/9/20` <dbl>, `3/10/20` <dbl>, `3/11/20` <dbl>,
## #   `3/12/20` <dbl>, `3/13/20` <dbl>, `3/14/20` <dbl>, `3/15/20` <dbl>,
## #   `3/16/20` <dbl>, `3/17/20` <dbl>, `3/18/20` <dbl>, `3/19/20` <dbl>,
## #   `3/20/20` <dbl>, `3/21/20` <dbl>, `3/22/20` <dbl>, `3/23/20` <dbl>,
## #   `3/24/20` <dbl>, `3/25/20` <dbl>, `3/26/20` <dbl>, `3/27/20` <dbl>,
## #   `3/28/20` <dbl>, `3/29/20` <dbl>, `3/30/20` <dbl>, `3/31/20` <dbl>,
## #   `4/1/20` <dbl>, `4/2/20` <dbl>, `4/3/20` <dbl>, `4/4/20` <dbl>,
## #   `4/5/20` <dbl>, `4/6/20` <dbl>, `4/7/20` <dbl>, `4/8/20` <dbl>,
## #   `4/9/20` <dbl>, `4/10/20` <dbl>, `4/11/20` <dbl>, `4/12/20` <dbl>,
## #   `4/13/20` <dbl>, `4/14/20` <dbl>, `4/15/20` <dbl>, `4/16/20` <dbl>,
## #   `4/17/20` <dbl>, `4/18/20` <dbl>, `4/19/20` <dbl>, `4/20/20` <dbl>,
## #   `4/21/20` <dbl>, `4/22/20` <dbl>, `4/23/20` <dbl>, `4/24/20` <dbl>,
## #   `4/25/20` <dbl>, `4/26/20` <dbl>, `4/27/20` <dbl>, `4/28/20` <dbl>,
## #   `4/29/20` <dbl>, `4/30/20` <dbl>, `5/1/20` <dbl>, `5/2/20` <dbl>,
## #   `5/3/20` <dbl>, …

Create long format for the confirmed data

time_series_confirmed_long <- time_series_confirmed %>% 
               pivot_longer(-c(Province_State, Country_Region, Lat, Long),
                            names_to = "Date", values_to = "Confirmed") 
head(time_series_confirmed_long)
## # A tibble: 6 x 6
##   Province_State Country_Region   Lat  Long Date    Confirmed
##   <chr>          <chr>          <dbl> <dbl> <chr>       <dbl>
## 1 <NA>           Afghanistan     33.9  67.7 1/22/20         0
## 2 <NA>           Afghanistan     33.9  67.7 1/23/20         0
## 3 <NA>           Afghanistan     33.9  67.7 1/24/20         0
## 4 <NA>           Afghanistan     33.9  67.7 1/25/20         0
## 5 <NA>           Afghanistan     33.9  67.7 1/26/20         0
## 6 <NA>           Afghanistan     33.9  67.7 1/27/20         0

Create long format for the death data

download.file(url="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv", 
               destfile = "data/time_series_covid19_deaths_global.csv")
time_series_deaths <- read_csv("data/time_series_covid19_deaths_global.csv")%>%
  rename(Province_State = "Province/State", Country_Region = "Country/Region")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `Province/State` = col_character(),
##   `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.
time_series_deaths_long <- time_series_deaths %>% 
               pivot_longer(-c(Province_State, Country_Region, Lat, Long),
                            names_to = "Date", values_to = "Deaths") 
head(time_series_deaths_long)
## # A tibble: 6 x 6
##   Province_State Country_Region   Lat  Long Date    Deaths
##   <chr>          <chr>          <dbl> <dbl> <chr>    <dbl>
## 1 <NA>           Afghanistan     33.9  67.7 1/22/20      0
## 2 <NA>           Afghanistan     33.9  67.7 1/23/20      0
## 3 <NA>           Afghanistan     33.9  67.7 1/24/20      0
## 4 <NA>           Afghanistan     33.9  67.7 1/25/20      0
## 5 <NA>           Afghanistan     33.9  67.7 1/26/20      0
## 6 <NA>           Afghanistan     33.9  67.7 1/27/20      0

Create a key column and join confirmed and death dataframes.

time_series_confirmed_long <- time_series_confirmed_long %>% 
  unite(Key, Province_State, Country_Region, Date, sep = ".", remove = FALSE)
time_series_deaths_long <- time_series_deaths_long %>% 
  unite(Key, Province_State, Country_Region, Date, sep = ".") %>% 
  select(Key, Deaths)
time_series_long_joined <- full_join(time_series_confirmed_long,
              time_series_deaths_long, by = c("Key")) %>% 
              select(-Key)
head(time_series_long_joined)
## # A tibble: 6 x 7
##   Province_State Country_Region   Lat  Long Date    Confirmed Deaths
##   <chr>          <chr>          <dbl> <dbl> <chr>       <dbl>  <dbl>
## 1 <NA>           Afghanistan     33.9  67.7 1/22/20         0      0
## 2 <NA>           Afghanistan     33.9  67.7 1/23/20         0      0
## 3 <NA>           Afghanistan     33.9  67.7 1/24/20         0      0
## 4 <NA>           Afghanistan     33.9  67.7 1/25/20         0      0
## 5 <NA>           Afghanistan     33.9  67.7 1/26/20         0      0
## 6 <NA>           Afghanistan     33.9  67.7 1/27/20         0      0

Change the date format

library(lubridate)
time_series_long_joined$Date <- mdy(time_series_long_joined$Date)
time_series_long_joined_counts <- time_series_long_joined %>% 
  pivot_longer(-c(Province_State, Country_Region, Lat, Long, Date),
               names_to = "Report_Type", values_to = "Counts")
head(time_series_long_joined_counts)
## # A tibble: 6 x 7
##   Province_State Country_Region   Lat  Long Date       Report_Type Counts
##   <chr>          <chr>          <dbl> <dbl> <date>     <chr>        <dbl>
## 1 <NA>           Afghanistan     33.9  67.7 2020-01-22 Confirmed        0
## 2 <NA>           Afghanistan     33.9  67.7 2020-01-22 Deaths           0
## 3 <NA>           Afghanistan     33.9  67.7 2020-01-23 Confirmed        0
## 4 <NA>           Afghanistan     33.9  67.7 2020-01-23 Deaths           0
## 5 <NA>           Afghanistan     33.9  67.7 2020-01-24 Confirmed        0
## 6 <NA>           Afghanistan     33.9  67.7 2020-01-24 Deaths           0

Example Plot

time_series_long_joined_counts %>% 
  group_by(Country_Region, Report_Type, Date) %>% 
  summarise(Counts = sum(Counts)) %>% 
  filter (Country_Region == "US") %>% 
    ggplot(aes(x = Date,  y = log2(Counts), fill = Report_Type, color = Report_Type)) + 
    geom_point() +
    geom_line() +
    ggtitle("US COVID-19 Cases")

Exercise

  1. Revise the above example for joining tables using 6/13/2020 and 9/13/2020. Plot the data as a bar plot. Make sure your graph looks nice in the report by adjusting the height in the R code chunk header.
US_time <- time_series_long_joined_counts %>% filter(Date >= as.Date("2020-06-13") & Date <= as.Date("2020-09-13")) %>%
  group_by(Country_Region, Report_Type, Date) %>% 
  summarise(Counts = sum(Counts)) %>% filter (Country_Region == "US")
## `summarise()` regrouping output by 'Country_Region', 'Report_Type' (override with `.groups` argument)
datatable(US_time)
p1 <- US_time %>% 
  ggplot(aes(x = Date,  y = log2(Counts), fill=Report_Type)) + 
    geom_bar(stat="identity") +
    ggtitle("US COVID-19 Cases") + 
    facet_wrap(~Report_Type)
p1

  1. Add more informative x and y labels and a title to the above graph in Ex2.
p1 + 
  labs(title = "US COVID-19 Cases",
        x = "Date",
        y = "Counts")

  1. Using the time series data. Make a plot of the total number of confirmed deaths per day worldwide
time_series_long_joined_counts %>% filter(Report_Type=="Deaths") %>%
  group_by(Date,Report_Type) %>%
  summarize(Counts = sum(Counts)) %>%
  ggplot(aes(x=Date,y=log2(Counts))) + 
    geom_line() +
    theme_bw() +
    labs(title = "Worlwide COVID19 Deaths",
        x = "Date",
        y = "Deaths") +
    theme(axis.text.x = element_text(size = 12),
          axis.text.y = element_text(size = 12),
          text=element_text(size = 16))

  1. Use Mutate to make a new column with deaths/confirmed cases (Hint: this might be best done after joining the tables, but before pivoting longer).
tslj_drate <- time_series_long_joined %>% mutate(DRate=ifelse(Deaths==0,0,Deaths/Confirmed)) %>%
  pivot_longer(-c(Province_State, Country_Region, Lat, Long, Date),
               names_to = "Report_Type", values_to = "Counts")
head(tslj_drate)
## # A tibble: 6 x 7
##   Province_State Country_Region   Lat  Long Date       Report_Type Counts
##   <chr>          <chr>          <dbl> <dbl> <date>     <chr>        <dbl>
## 1 <NA>           Afghanistan     33.9  67.7 2020-01-22 Confirmed        0
## 2 <NA>           Afghanistan     33.9  67.7 2020-01-22 Deaths           0
## 3 <NA>           Afghanistan     33.9  67.7 2020-01-22 DRate            0
## 4 <NA>           Afghanistan     33.9  67.7 2020-01-23 Confirmed        0
## 5 <NA>           Afghanistan     33.9  67.7 2020-01-23 Deaths           0
## 6 <NA>           Afghanistan     33.9  67.7 2020-01-23 DRate            0
  1. Plot US deaths/confirmed cases per day.
tslj_drate %>% filter(Report_Type=="DRate") %>%
  group_by(Country_Region,Date,Report_Type) %>%
  summarize(Counts = sum(Counts)) %>%
  filter(Country_Region=="US") %>%
  ggplot(aes(x=Date,y=Counts)) + 
    geom_line() +
    theme_bw() +
    labs(title = "US COVID19 - Death to Confirmed Case Ratio",
        x = "Date",
        y = "Death Rate") +
    theme(axis.text.x = element_text(size = 12),
          axis.text.y = element_text(size = 12),
          text=element_text(size = 16))

  1. Make a single graph with from the times series data with the 10 countries with the hightest death total.
Top10 <- time_series_long_joined_counts %>% filter(Report_Type=="Deaths") %>% group_by(Country_Region) %>% summarise(Counts=sum(Counts)) %>% top_n(10)
Top10
## # A tibble: 10 x 2
##    Country_Region   Counts
##    <chr>             <dbl>
##  1 Brazil         11221271
##  2 France          4951523
##  3 India           5018592
##  4 Iran            2328120
##  5 Italy           5905648
##  6 Mexico          5580224
##  7 Peru            2313290
##  8 Spain           4849599
##  9 United Kingdom  6366694
## 10 US             22300040
time_series_long_joined_counts %>% filter(Report_Type=="Deaths" & Country_Region%in%Top10$Country_Region) %>%
  group_by(Country_Region,Date,Report_Type) %>%
  summarize(Counts = sum(Counts)) %>%
  ggplot(aes(x=Date,y=Counts,color=Country_Region)) + 
    geom_line(size=1) +
    theme_bw() +
    labs(title = "COVID19 Deaths of Top10 Highest",
        x = "Date",
        y = "Death") +
    theme(axis.text.x = element_text(size = 12),
          axis.text.y = element_text(size = 12),
          text=element_text(size = 16))

  1. Using Facet_Wraps make a graph from the times series data with the 10 countries with the hightest death total. Format the table to look nice in the report by changing the number of rows in your graph and/or adjusting the height in the R code chunk header.
time_series_long_joined_counts %>% filter(Report_Type=="Deaths" & Country_Region%in%Top10$Country_Region) %>%
  group_by(Country_Region,Date,Report_Type) %>%
  summarize(Counts = sum(Counts)) %>%
  ggplot(aes(x=Date,y=Counts,color=Country_Region)) + 
    geom_line(size=1) +
    theme_bw() +
    facet_wrap(~Country_Region, ncol=3) +
    labs(title = "COVID19 Deaths of Top10 Highest",
        x = "Date",
        y = "Death") +
    theme(axis.text.x = element_text(size = 12),
          axis.text.y = element_text(size = 12),
          text=element_text(size = 16))

  1. Using Facet_Wraps make a graph from the times series data for confirmed cases for all US states and territories. Format the table to look nice in the report by changing the number of rows in your graph and/or adjusting the height in the R code chunk header.
time_series_long_joined_counts %>% filter(Report_Type=="Confirmed" & Country_Region=="United Kingdom") %>%
  group_by(Province_State,Date) %>%
  summarize(Counts = sum(Counts)) %>%
  ggplot(aes(x=Date,y=Counts,color=Province_State)) + 
    geom_line(size=1) +
    theme_bw() +
    facet_wrap(~Province_State, ncol=3, scales = "free_y") +
    labs(title = "COVID19 Deaths of UK Territories",
        x = "Date",
        y = "Death") +
    theme(axis.text.x = element_text(size = 12),
          axis.text.y = element_text(size = 12),
          text=element_text(size = 16))

  1. Apply a theme other than the one used in the Data Carpentries tutorial to the graph in Ex8.
time_series_long_joined_counts %>% filter(Report_Type=="Confirmed" & Country_Region=="United Kingdom") %>%
  group_by(Province_State,Date) %>%
  summarize(Counts = sum(Counts)) %>%
  ggplot(aes(x=Date,y=Counts,color=Province_State)) + 
    geom_line(size=1) +
    theme_linedraw() +
    facet_wrap(~Province_State, ncol=3, scales = "free_y") +
    labs(title = "COVID19 Deaths of UK Territories",
        x = "Date",
        y = "Death") +
    theme(axis.text.x = element_text(size = 12),
          axis.text.y = element_text(size = 12),
          text=element_text(size = 16))