Putting it all together and getting organized
Overview
Teaching: 30 min
Exercises: 10 min
Questions
How can I choose a folder structure that will make report writing easier?
Objectives
TBD
Putting it all together and getting organized
Instructor notes
The goal of this last part is to put together all the pieces started in the previous lesson to build a manuscript that is fully automated. Most of the files are already provided, but they are missing the key pieces that make everything work. In this lesson, participants need to copy and paste the chunks of code listed below into the appropriate files. At the end, their directory should look like the content of https://github.com/fmichonneau/teaching-automation:
|
`-- data-raw/
| |
| `-- Afghanistan-gdp-percapita.csv
| `-- Albania-gdp-percapita.csv
| `-- ....
|
`-- data-output/
|
`-- fig/
|
`-- R/
| |
| `-- figures.R
| `-- data.R
| `-- utils.R
| `-- dependencies.R
|
`-- tests/
|
`-- manuscript.Rmd
`-- make.R
Tests
testthat
has a convenient function called test_dir
that will run tests
included in files in a given directory. We can use it to run all the tests in
our tests/
folder.
## test_dir() is provided by the testthat package; if the package is not
## attached first, R stops with: could not find function "test_dir"
library(testthat)
test_dir("tests/")
Let’s turn it into a function, so we’ll be able to add some additional
functionality to it a little later. We are also going to save it at the root
of our working directory in the file called make.R
:
## add this to make.R
## Run every test file found in the tests/ folder of the working directory.
## The value is whatever test_dir() returns.
make_tests <- function() {
  test_dir(path = "tests/")
}
Figures
This is the code to generate the two figures in the manuscript:
## add this to R/figures.R
## Line plot of mean life expectancy through time.
##
## mean_lifeExp: data frame with the columns `year`, `mean_lifeExp` and
##   `continent` (these are the names mapped in aes() and facet_wrap()).
## Returns the ggplot object: one coloured line per continent, one facet per
## continent, legend placed at the top.
plot_summary_lifeExp_by_continent <- function(mean_lifeExp) {
  mapping <- aes(x = year, y = mean_lifeExp, colour = continent)
  ggplot(mean_lifeExp, mapping) +
    geom_line() +
    facet_wrap(~ continent) +
    theme(legend.position = "top")
}
## Plot the slope of mean life expectancy before and after a break year.
##
## mean_lifeExp: data frame handed unchanged to get_coef_before_after().
## year_break:   year separating the "before" and "after" periods.
## Returns the ggplot object: one point per continent and period, with the two
## points of each continent joined by a path.
plot_change_trend <- function(mean_lifeExp, year_break) {
  trends <- get_coef_before_after(mean_lifeExp, year_break)
  mapping <- aes(x = period, y = trend, colour = continent, group = continent)
  ggplot(trends, mapping) +
    geom_point() +
    geom_path()
}
## -----
This is the code that saves the two figures as PDF files:
## add this to make.R
## Generate both manuscript figures as PDF files.
##
## path: folder the PDF files are written to.
## ...:  forwarded as-is to make_summary_by_continent() and
##       make_change_trend().
## The value is the result of the last call, make_change_trend().
make_figures <- function(path = "fig", ...) {
  ## Figure 1: life expectancy summary by continent
  make_summary_by_continent(path = path, ...)
  ## Figure 2: change in trend before/after the break year
  make_change_trend(path = path, ...)
}
## Write the "summary by continent" figure to <path>/summary_by_continent.pdf.
##
## path: folder the PDF file is written to.
## ...:  forwarded to make_pdf().
make_summary_by_continent <- function(path = "fig", ...) {
  gdp <- gather_gdp_data()
  continent_means <- get_mean_lifeExp(gdp)
  fig <- plot_summary_lifeExp_by_continent(continent_means)
  ## print(fig) is passed as an unevaluated argument so that make_pdf()
  ## decides when the drawing actually happens.
  make_pdf(print(fig), file = file.path(path, "summary_by_continent.pdf"), ...)
}
## Write the "change in trend" figure to <path>/change_trend.pdf.
##
## path: folder the PDF file is written to.
## year: break year separating the "before" and "after" periods.
## ...:  forwarded to make_pdf().
make_change_trend <- function(path = "fig", year = 1980, ...) {
  mean_lifeExp <- get_mean_lifeExp(gather_gdp_data())
  ## plot_change_trend() names this argument `year_break`. Spell it out in
  ## full instead of relying on R's partial argument matching (`year = `),
  ## which breaks silently as soon as another formal starts with "year".
  p <- plot_change_trend(mean_lifeExp, year_break = year)
  make_pdf(print(p), file = file.path(path, "change_trend.pdf"), ...)
}
## -----
Data
This is the code that generates the intermediate datasets:
## add this to R/data.R
## Read every per-country file and stack them into a single data frame.
##
## path: folder searched for files whose name ends in "gdp-percapita.csv".
## Returns the row-bound content of all matching files (NULL when no file
## matches, because rbind() of nothing is NULL).
gather_gdp_data <- function(path = "data-raw") {
  csv_paths <- list.files(
    path = path,
    pattern = "gdp-percapita\\.csv$",
    full.names = TRUE
  )
  country_tables <- lapply(csv_paths, read.csv)
  do.call("rbind", country_tables)
}
## Mean life expectancy for each continent/year combination.
##
## gdp: data frame with the columns `continent`, `year` and `lifeExp`.
## Returns a plain data frame with one row per continent and year and the
## average in the column `mean_lifeExp`.
get_mean_lifeExp <- function(gdp) {
  by_continent_year <- group_by(gdp, continent, year)
  yearly_means <- summarize(by_continent_year, mean_lifeExp = mean(lifeExp))
  as.data.frame(yearly_means)
}
## Mean life expectancy per continent for the most recent year in the data.
##
## gdp: data frame with the columns `continent`, `year` and `lifeExp`.
## Returns a plain data frame with one row per continent and the average in
## the column `latest_lifeExp`.
get_latest_lifeExp <- function(gdp) {
  ## keep only the rows of the last year present in the dataset
  most_recent <- filter(gdp, year == max(gdp$year))
  by_continent <- group_by(most_recent, continent)
  continent_means <- summarize(by_continent, latest_lifeExp = mean(lifeExp))
  as.data.frame(continent_means)
}
## Slope of mean life expectancy over time, before and after a break year.
##
## For each continent, two linear models `mean_lifeExp ~ year` are fitted: one
## on the rows with year <= year_break ("before") and one on the rows with
## year > year_break ("after"). Only the slope (second coefficient) is kept.
##
## mean_lifeExp: data frame with the columns `continent`, `year` and
##   `mean_lifeExp`.
## year_break:   last year that belongs to the "before" period.
## Returns a data frame with two rows per continent and the columns
##   `continent`, `period` (factor with levels "before", "after") and
##   `trend` (numeric slope).
get_coef_before_after <- function(mean_lifeExp, year_break) {
  coef_by_continent <- lapply(unique(mean_lifeExp$continent), function(cont) {
    mdl_before <- lm(mean_lifeExp ~ year,
                     data = mean_lifeExp,
                     subset = (continent == cont & year <= year_break))
    mdl_after <- lm(mean_lifeExp ~ year,
                    data = mean_lifeExp,
                    subset = (continent == cont & year > year_break))
    rbind(c(as.character(cont), "before", coef(mdl_before)[2]),
          c(as.character(cont), "after", coef(mdl_after)[2]))
  })
  coef_before_after <- as.data.frame(do.call("rbind", coef_by_continent))
  coef_before_after <- setNames(coef_before_after,
                                c("continent", "period", "trend"))
  ## The slopes went through a character matrix, so `trend` is a factor on
  ## R < 4.0 and a character vector on R >= 4.0. The former
  ## `levels(x)[x]` idiom only handles the factor case and fails on
  ## characters; converting through as.character() is correct for both.
  coef_before_after$trend <- as.numeric(as.character(coef_before_after$trend))
  coef_before_after$period <- factor(coef_before_after$period,
                                     levels = c("before", "after"))
  coef_before_after
}
## -----
This is the code to generate the CSV files that contain the intermediate
datasets that are needed to draw the figures. The function make_data
generates
both datasets at once.
## add this to make.R
## Generate both intermediate CSV datasets at once.
##
## path:    folder the CSV files are written to.
## verbose: currently unused; kept so existing calls keep working.
## The value is the result of the last call, make_mean_lifeExp_data().
make_data <- function(path = "data-output", verbose = TRUE) {
  make_gdp_data(path)
  ## `path` must be forwarded here as well, otherwise a custom folder is
  ## honoured for gdp.csv only and mean_lifeExp.csv still lands in the
  ## default "data-output" folder.
  make_mean_lifeExp_data(path)
}
## Write the combined raw dataset to <path>/gdp.csv.
##
## path: folder the CSV file is written to.
make_gdp_data <- function(path = "data-output") {
  all_countries <- gather_gdp_data()
  make_csv(all_countries, file = file.path(path, "gdp.csv"))
}
## Write the mean life expectancy per continent and year to
## <path>/mean_lifeExp.csv.
##
## path: folder the CSV file is written to.
make_mean_lifeExp_data <- function(path = "data-output") {
  all_countries <- gather_gdp_data()
  make_csv(
    get_mean_lifeExp(all_countries),
    file = file.path(path, "mean_lifeExp.csv")
  )
}
## -----
Cleaning
The only way to ensure that your analysis is reproducible is to delete all the intermediate and final products to make sure your functions can recreate everything from the raw data and your code.
Having the figures and the intermediate data files isolated in their own folders in your working directory will allow you to make sure you only delete these generated files, and none of the original data.
## add this to make.R
## Delete the generated CSV files.
##
## path: folder whose *.csv files are removed.
## Invisibly returns the logical vector produced by file.remove(), one
## element per deleted file (logical(0) when there was nothing to delete).
clean_data <- function(path = "data-output") {
  ## Anchor on the literal ".csv" extension: the bare pattern "csv$" would
  ## also delete any file whose name merely ends with the letters "csv".
  to_rm <- list.files(path = path, pattern = "\\.csv$", full.names = TRUE)
  res <- file.remove(to_rm)
  invisible(res)
}
## Delete the generated PDF figures.
##
## path: folder whose *.pdf files are removed.
## Invisibly returns the logical vector produced by file.remove(), one
## element per deleted file (logical(0) when there was nothing to delete).
clean_figures <- function(path = "fig") {
  ## Anchor on the literal ".pdf" extension: the bare pattern "pdf$" would
  ## also delete any file whose name merely ends with the letters "pdf".
  to_rm <- list.files(path = path, pattern = "\\.pdf$", full.names = TRUE)
  res <- file.remove(to_rm)
  invisible(res)
}
## -----
Make everything
These are wrapper functions to generate/delete everything:
## add this to make.R
## Render manuscript.Rmd to HTML.
##
## Invisibly returns TRUE when manuscript.html exists in the working
## directory after rendering, FALSE otherwise.
make_ms <- function() {
  rmarkdown::render(input = "manuscript.Rmd", output_format = "html_document")
  ms_exists <- file.exists("manuscript.html")
  invisible(ms_exists)
}
## Delete the rendered manuscript (manuscript.html in the working directory).
##
## Invisibly returns the result of file.remove(): TRUE when the file was
## deleted, FALSE otherwise.
clean_ms <- function() {
  invisible(file.remove("manuscript.html"))
}
## Rebuild every generated product, in dependency order.
## The value is the result of the last step, make_ms().
make_all <- function() {
  make_data()     # intermediate CSV datasets
  make_figures()  # PDF figures
  make_tests()    # test suite
  make_ms()       # rendered manuscript
}
## Delete every generated product so that everything can be rebuilt from the
## raw data. The value is the result of the last step, clean_ms().
clean_all <- function() {
  clean_data()     # intermediate CSV datasets
  clean_figures()  # PDF figures
  clean_ms()       # rendered manuscript
}
## -----
Before we continue, we are going to replace the make_tests function with something a little more comprehensive:
## add this to make.R
## Run the test suite in tests/ when testthat is available.
##
## In a non-interactive session (e.g. Rscript on a CI server) R is terminated
## with exit status 1 as soon as one test fails or errors, so that the calling
## process can detect the failure. When testthat is not installed, the tests
## are skipped with a message and NULL is returned.
make_tests <- function() {
  if (!require(testthat)) {
    message("skipped the tests, testthat not available.")
    return(NULL)
  }
  p <- test_dir("tests/")
  ## test_dir() returns a list of class "testthat_results" in testthat >= 1.0,
  ## so `p$failed` is NULL there and failures went undetected. The data frame
  ## view exposes the per-test `failed` count and `error` flag.
  ## NOTE(review): recent testthat versions appear to make test_dir() itself
  ## raise an error on failure by default -- confirm against the installed
  ## version.
  results <- as.data.frame(p)
  has_failures <- any(results$failed > 0, na.rm = TRUE) ||
    any(results$error, na.rm = TRUE)
  if (!interactive() && has_failures) {
    q("no", status = 1, FALSE)
  }
}
## -----
Key Points
TBD