Prepare the input data
download.file(url="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv",
destfile = "data/time_series_covid19_confirmed_global.csv")
time_series_confirmed <- read_csv("data/time_series_covid19_confirmed_global.csv")%>%
rename(Province_State = "Province/State", Country_Region = "Country/Region")
head(time_series_confirmed)
## # A tibble: 6 x 254
## Province_State Country_Region Lat Long `1/22/20` `1/23/20` `1/24/20`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 <NA> Afghanistan 33.9 67.7 0 0 0
## 2 <NA> Albania 41.2 20.2 0 0 0
## 3 <NA> Algeria 28.0 1.66 0 0 0
## 4 <NA> Andorra 42.5 1.52 0 0 0
## 5 <NA> Angola -11.2 17.9 0 0 0
## 6 <NA> Antigua and B… 17.1 -61.8 0 0 0
## # … with 247 more variables: `1/25/20` <dbl>, `1/26/20` <dbl>, `1/27/20` <dbl>,
## # `1/28/20` <dbl>, `1/29/20` <dbl>, `1/30/20` <dbl>, `1/31/20` <dbl>,
## # `2/1/20` <dbl>, `2/2/20` <dbl>, `2/3/20` <dbl>, `2/4/20` <dbl>,
## # `2/5/20` <dbl>, `2/6/20` <dbl>, `2/7/20` <dbl>, `2/8/20` <dbl>,
## # `2/9/20` <dbl>, `2/10/20` <dbl>, `2/11/20` <dbl>, `2/12/20` <dbl>,
## # `2/13/20` <dbl>, `2/14/20` <dbl>, `2/15/20` <dbl>, `2/16/20` <dbl>,
## # `2/17/20` <dbl>, `2/18/20` <dbl>, `2/19/20` <dbl>, `2/20/20` <dbl>,
## # `2/21/20` <dbl>, `2/22/20` <dbl>, `2/23/20` <dbl>, `2/24/20` <dbl>,
## # `2/25/20` <dbl>, `2/26/20` <dbl>, `2/27/20` <dbl>, `2/28/20` <dbl>,
## # `2/29/20` <dbl>, `3/1/20` <dbl>, `3/2/20` <dbl>, `3/3/20` <dbl>,
## # `3/4/20` <dbl>, `3/5/20` <dbl>, `3/6/20` <dbl>, `3/7/20` <dbl>,
## # `3/8/20` <dbl>, `3/9/20` <dbl>, `3/10/20` <dbl>, `3/11/20` <dbl>,
## # `3/12/20` <dbl>, `3/13/20` <dbl>, `3/14/20` <dbl>, `3/15/20` <dbl>,
## # `3/16/20` <dbl>, `3/17/20` <dbl>, `3/18/20` <dbl>, `3/19/20` <dbl>,
## # `3/20/20` <dbl>, `3/21/20` <dbl>, `3/22/20` <dbl>, `3/23/20` <dbl>,
## # `3/24/20` <dbl>, `3/25/20` <dbl>, `3/26/20` <dbl>, `3/27/20` <dbl>,
## # `3/28/20` <dbl>, `3/29/20` <dbl>, `3/30/20` <dbl>, `3/31/20` <dbl>,
## # `4/1/20` <dbl>, `4/2/20` <dbl>, `4/3/20` <dbl>, `4/4/20` <dbl>,
## # `4/5/20` <dbl>, `4/6/20` <dbl>, `4/7/20` <dbl>, `4/8/20` <dbl>,
## # `4/9/20` <dbl>, `4/10/20` <dbl>, `4/11/20` <dbl>, `4/12/20` <dbl>,
## # `4/13/20` <dbl>, `4/14/20` <dbl>, `4/15/20` <dbl>, `4/16/20` <dbl>,
## # `4/17/20` <dbl>, `4/18/20` <dbl>, `4/19/20` <dbl>, `4/20/20` <dbl>,
## # `4/21/20` <dbl>, `4/22/20` <dbl>, `4/23/20` <dbl>, `4/24/20` <dbl>,
## # `4/25/20` <dbl>, `4/26/20` <dbl>, `4/27/20` <dbl>, `4/28/20` <dbl>,
## # `4/29/20` <dbl>, `4/30/20` <dbl>, `5/1/20` <dbl>, `5/2/20` <dbl>,
## # `5/3/20` <dbl>, …
Create a key column and join confirmed and death dataframes.
time_series_confirmed_long <- time_series_confirmed_long %>%
unite(Key, Province_State, Country_Region, Date, sep = ".", remove = FALSE)
time_series_deaths_long <- time_series_deaths_long %>%
unite(Key, Province_State, Country_Region, Date, sep = ".") %>%
select(Key, Deaths)
time_series_long_joined <- full_join(time_series_confirmed_long,
time_series_deaths_long, by = c("Key")) %>%
select(-Key)
head(time_series_long_joined)
## # A tibble: 6 x 7
## Province_State Country_Region Lat Long Date Confirmed Deaths
## <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 <NA> Afghanistan 33.9 67.7 1/22/20 0 0
## 2 <NA> Afghanistan 33.9 67.7 1/23/20 0 0
## 3 <NA> Afghanistan 33.9 67.7 1/24/20 0 0
## 4 <NA> Afghanistan 33.9 67.7 1/25/20 0 0
## 5 <NA> Afghanistan 33.9 67.7 1/26/20 0 0
## 6 <NA> Afghanistan 33.9 67.7 1/27/20 0 0
Example Plot
time_series_long_joined_counts %>%
group_by(Country_Region, Report_Type, Date) %>%
summarise(Counts = sum(Counts)) %>%
filter (Country_Region == "US") %>%
ggplot(aes(x = Date, y = log2(Counts), fill = Report_Type, color = Report_Type)) +
geom_point() +
geom_line() +
ggtitle("US COVID-19 Cases")

Exercise
- Revise the above example for joining tables using 6/13/2020 and 9/13/2020. Plot the data as a bar plot. Make sure your graph looks nice in the report by adjusting the height in the R code chunk header.
US_time <- time_series_long_joined_counts %>% filter(Date >= as.Date("2020-06-13") & Date <= as.Date("2020-09-13")) %>%
group_by(Country_Region, Report_Type, Date) %>%
summarise(Counts = sum(Counts)) %>% filter (Country_Region == "US")
## `summarise()` regrouping output by 'Country_Region', 'Report_Type' (override with `.groups` argument)
p1 <- US_time %>%
ggplot(aes(x = Date, y = log2(Counts), fill=Report_Type)) +
geom_bar(stat="identity") +
ggtitle("US COVID-19 Cases") +
facet_wrap(~Report_Type)
p1

- Add more informative x and y labels and a title to the above graph in Ex2.
p1 +
labs(title = "US COVID-19 Cases",
x = "Date",
y = "Counts")

- Using the time series data. Make a plot of the total number of confirmed deaths per day worldwide
time_series_long_joined_counts %>% filter(Report_Type=="Deaths") %>%
group_by(Date,Report_Type) %>%
summarize(Counts = sum(Counts)) %>%
ggplot(aes(x=Date,y=log2(Counts))) +
geom_line() +
theme_bw() +
labs(title = "Worlwide COVID19 Deaths",
x = "Date",
y = "Deaths") +
theme(axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12),
text=element_text(size = 16))

- Use Mutate to make a new column with deaths/confirmed cases (Hint: this might be best done after joining the tables, but before pivoting longer).
tslj_drate <- time_series_long_joined %>% mutate(DRate=ifelse(Deaths==0,0,Deaths/Confirmed)) %>%
pivot_longer(-c(Province_State, Country_Region, Lat, Long, Date),
names_to = "Report_Type", values_to = "Counts")
head(tslj_drate)
## # A tibble: 6 x 7
## Province_State Country_Region Lat Long Date Report_Type Counts
## <chr> <chr> <dbl> <dbl> <date> <chr> <dbl>
## 1 <NA> Afghanistan 33.9 67.7 2020-01-22 Confirmed 0
## 2 <NA> Afghanistan 33.9 67.7 2020-01-22 Deaths 0
## 3 <NA> Afghanistan 33.9 67.7 2020-01-22 DRate 0
## 4 <NA> Afghanistan 33.9 67.7 2020-01-23 Confirmed 0
## 5 <NA> Afghanistan 33.9 67.7 2020-01-23 Deaths 0
## 6 <NA> Afghanistan 33.9 67.7 2020-01-23 DRate 0
- Plot US deaths/confirmed cases per day.
tslj_drate %>% filter(Report_Type=="DRate") %>%
group_by(Country_Region,Date,Report_Type) %>%
summarize(Counts = sum(Counts)) %>%
filter(Country_Region=="US") %>%
ggplot(aes(x=Date,y=Counts)) +
geom_line() +
theme_bw() +
labs(title = "US COVID19 - Death to Confirmed Case Ratio",
x = "Date",
y = "Death Rate") +
theme(axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12),
text=element_text(size = 16))

- Make a single graph with from the times series data with the 10 countries with the hightest death total.
Top10 <- time_series_long_joined_counts %>% filter(Report_Type=="Deaths") %>% group_by(Country_Region) %>% summarise(Counts=sum(Counts)) %>% top_n(10)
Top10
## # A tibble: 10 x 2
## Country_Region Counts
## <chr> <dbl>
## 1 Brazil 11221271
## 2 France 4951523
## 3 India 5018592
## 4 Iran 2328120
## 5 Italy 5905648
## 6 Mexico 5580224
## 7 Peru 2313290
## 8 Spain 4849599
## 9 United Kingdom 6366694
## 10 US 22300040
time_series_long_joined_counts %>% filter(Report_Type=="Deaths" & Country_Region%in%Top10$Country_Region) %>%
group_by(Country_Region,Date,Report_Type) %>%
summarize(Counts = sum(Counts)) %>%
ggplot(aes(x=Date,y=Counts,color=Country_Region)) +
geom_line(size=1) +
theme_bw() +
labs(title = "COVID19 Deaths of Top10 Highest",
x = "Date",
y = "Death") +
theme(axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12),
text=element_text(size = 16))

- Using Facet_Wraps make a graph from the times series data with the 10 countries with the hightest death total. Format the table to look nice in the report by changing the number of rows in your graph and/or adjusting the height in the R code chunk header.
time_series_long_joined_counts %>% filter(Report_Type=="Deaths" & Country_Region%in%Top10$Country_Region) %>%
group_by(Country_Region,Date,Report_Type) %>%
summarize(Counts = sum(Counts)) %>%
ggplot(aes(x=Date,y=Counts,color=Country_Region)) +
geom_line(size=1) +
theme_bw() +
facet_wrap(~Country_Region, ncol=3) +
labs(title = "COVID19 Deaths of Top10 Highest",
x = "Date",
y = "Death") +
theme(axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12),
text=element_text(size = 16))

- Using Facet_Wraps make a graph from the times series data for confirmed cases for all US states and territories. Format the table to look nice in the report by changing the number of rows in your graph and/or adjusting the height in the R code chunk header.
time_series_long_joined_counts %>% filter(Report_Type=="Confirmed" & Country_Region=="United Kingdom") %>%
group_by(Province_State,Date) %>%
summarize(Counts = sum(Counts)) %>%
ggplot(aes(x=Date,y=Counts,color=Province_State)) +
geom_line(size=1) +
theme_bw() +
facet_wrap(~Province_State, ncol=3, scales = "free_y") +
labs(title = "COVID19 Deaths of UK Territories",
x = "Date",
y = "Death") +
theme(axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12),
text=element_text(size = 16))

- Apply a theme other than the one used in the Data Carpentries tutorial to the graph in Ex8.
time_series_long_joined_counts %>% filter(Report_Type=="Confirmed" & Country_Region=="United Kingdom") %>%
group_by(Province_State,Date) %>%
summarize(Counts = sum(Counts)) %>%
ggplot(aes(x=Date,y=Counts,color=Province_State)) +
geom_line(size=1) +
theme_linedraw() +
facet_wrap(~Province_State, ncol=3, scales = "free_y") +
labs(title = "COVID19 Deaths of UK Territories",
x = "Date",
y = "Death") +
theme(axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12),
text=element_text(size = 16))
