if (!requireNamespace("pacman", quietly = TRUE)) {
install.packages("pacman")
}::p_load(dm, data.table, DiagrammeR, miceadds, readxl, s3fs,htmlwidgets,DiagrammeR,DiagrammeRsvg,rsvg,htmltools) pacman
Creating the ERA Data Model
1 Overview
This guide describes how to build a structured data model for the agronomic component of the Evidence for Resilient Agriculture (ERA) dataset using dm
. It includes steps for downloading, cleaning, and linking the data tables into a coherent data model object.
2 Packages and Setup
<- "downloaded_data"
dl_dir if (!dir.exists(dl_dir)) dir.create(dl_dir, showWarnings = FALSE)
3 Download Agronomic Dataset
<- s3fs::S3FileSystem$new(anonymous = TRUE)
s3 <- "s3://digital-atlas/era"
era_s3 <- s3$dir_ls(file.path(era_s3, "data"))
all_files
<- tail(
target_file grep(".RData", grep("agronomic", all_files, value = TRUE), value = TRUE),
1
)
<- file.path(getwd(), dl_dir, basename(target_file))
save_path
if (!file.exists(save_path)) {
$file_download(target_file, save_path, overwrite = TRUE)
s3 }
<- miceadds::load.Rdata2(
agronomic_data file = basename(save_path),
path = dirname(save_path)
)
4 Download and Import Master Vocabulary
<- "https://github.com/peetmate/era_codes/raw/main/era_master_sheet.xlsx"
era_vocab_url <- file.path(dl_dir, basename(era_vocab_url))
era_vocab_local if(!file.exists(era_vocab_local)){
download.file(era_vocab_url, era_vocab_local, mode = "wb")
}
<- readxl::excel_sheets(era_vocab_local)
sheet_names <- sheet_names[!grepl("sheet|Sheet", sheet_names)]
sheet_names
<- sapply(
era_master_codes
sheet_names,FUN = function(x) data.table::data.table(readxl::read_excel(era_vocab_local, sheet = x)),
USE.NAMES = TRUE
)
5 Clean and Harmonize Tables
<-"Prod.Out"
tab<-agronomic_data[[tab]]
datasetnames(data,c("P.Product.Subtype","P.Product"),c("Product.Subtype","Product.Simple"),skip_absent = T)
<-data
agronomic_data[[tab]]
<-"MT.Out"
tab<-agronomic_data[[tab]]
datasetnames(data,c("P.Product"),c("Product.Simple"),skip_absent = T)
<-merge(data,agronomic_data$Prod.Out[,.(B.Code,Product.Simple,Product.Subtype)],by=c("B.Code","Product.Simple"))
data<-data
agronomic_data[[tab]]
<-"Var.Out"
tab<-agronomic_data[[tab]]
datasetnames(data,c("V.Product"),c("Product.Simple"),skip_absent = T)
<-merge(data,agronomic_data$Prod.Out[,.(B.Code,Product.Simple,Product.Subtype)],by=c("B.Code","Product.Simple"))
data<-data
agronomic_data[[tab]]
<-"Chems.Out"
tab<-agronomic_data[[tab]]
datasetnames(data,c("Times"),c("Time"),skip_absent = T)
<-data
agronomic_data[[tab]]
<-era_master_codes$vars
datasetnames(data,c("V.Product"),c("Product.Simple"),skip_absent = T)
:=V.Var1][,V.Species:=V.Species1][,c("V.Var1","V.Species1","V.Subspecies1"):=NULL]
data[,V.Var$vars<-data
era_master_codes
<-era_master_codes$chem
data<-data[,.(AOM,C.Type...3,C.Type.AI,C.Name,C.Is.Name.Commercial,C.AI)]
chemsetnames(chem,"C.Type...3","C.Type",skip_absent = T)
<-chem[!is.na(C.Type)]
chem
<-data[,.(C.Name.Commercial,C.Name.AI...10,C.Animal,C.Crop,C.Is.Chem,C.Type...14)]
chem.commsetnames(chem.comm,c("C.Name.AI...10","C.Type...14"),c("C.Name.AI","C.Type"),skip_absent = T)
<-chem.comm[!is.na(C.Name.Commercial)]
chem.comm
<-data[,.(C.Name.AI...16,C.Name.AI.CHEBI,C.Tname.AI.Type_gpt)]
chem.aisetnames(chem.ai,c("C.Name.AI...16"),c("C.Name.AI"))
<-chem.ai[!is.na(C.Name.AI)]
chem.ai
$chem<-chem
era_master_codes$chem.ai<-chem.ai
era_master_codes$chem.comm<-chem.comm
era_master_codes
<-"Out.Out"
tab<-agronomic_data[[tab]]
datasetnames(data,"Out.Subind","Subindicator",skip_absent = T)
<-data
agronomic_data[[tab]]
<-"Res.Method"
tab<-agronomic_data[[tab]]
datasetnames(data,"M.Tree","Tree.Latin.Name",skip_absent = T)
<-data
agronomic_data[[tab]]
<-"AF.Trees"
tab<-agronomic_data[[tab]]
datasetnames(data,"AF.Tree","Tree.Latin.Name",skip_absent = T)
<-data
agronomic_data[[tab]]
# Dev.Note: Probably needs converting from wide to long format
<-"Int.Out"
tab<-agronomic_data[[tab]]
data:=IN.Level.Name] # Note this multiple T.Names is concatenated with a "..."
data[,T.Name<-data
agronomic_data[[tab]]
<-"Rot.Seq"
tab<-agronomic_data[[tab]]
data!is.na(R.Prod2)|!is.na(IN.Code),IN.Level.Name:=R.Treatment]
data[is.na(R.Prod2) & is.na(IN.Code),T.Name:=R.Treatment]
data[<-data agronomic_data[[tab]]
6 Create Data Model
- Ensure consistent key naming and uniqueness before setting PKs/FKs.
<- dm(
era_agronomy.model Pub.Out = agronomic_data$Pub.Out,
Site.Out = agronomic_data$Site.Out,
Times.Out = agronomic_data$Times.Out,
Times.Clim = agronomic_data$Times.Clim,
Prod.Out = agronomic_data$Prod.Out,
ExpD.Out = agronomic_data$ExpD.Out,
Var.Out = agronomic_data$Var.Out,
Till.Out = agronomic_data$Till.Out,
Plant.Out = agronomic_data$Plant.Out,
Plant.Method = agronomic_data$Plant.Method,
PD.Codes = agronomic_data$PD.Codes,
PD.Out = agronomic_data$PD.Out,
Chems.Code = agronomic_data$Chems.Code,
Chems.Out = agronomic_data$Chems.Out,
# Chems.AI = agronomic_data$Chems.AI,
Res.Out = agronomic_data$Res.Out,
Res.Method = agronomic_data$Res.Method,
# Res.Comp = agronomic_data$Res.Comp,
Fert.Out = agronomic_data$Fert.Out,
Fert.Method = agronomic_data$Fert.Method,
# Fert.Comp = agronomic_data$Fert.Comp,
Har.Out = agronomic_data$Har.Out,
pH.Out = agronomic_data$pH.Out,
pH.Method = agronomic_data$pH.Method,
WH.Out = agronomic_data$WH.Out,
AF.Out = agronomic_data$AF.Out,
AF.Trees=agronomic_data$AF.Trees,
Weed.Out=agronomic_data$Weed.Out,
Other.Out=agronomic_data$Other.Out,
Irrig.Codes=agronomic_data$Irrig.Codes,
Irrig.Method=agronomic_data$Irrig.Method,
MT.Out = agronomic_data$MT.Out,
Int.Out = agronomic_data$Int.Out,
Rot.Out = agronomic_data$Rot.Out,
Rot.Seq = agronomic_data$Rot.Seq,
Rot.Seq.Summ = agronomic_data$Rot.Seq.Summ,
Out.Out = agronomic_data$Out.Out,
Data.Out=agronomic_data$Data.Out,
era_master_codes.fert = era_master_codes$fert,
era_master_codes.prod = era_master_codes$prod,
era_master_codes.vars = era_master_codes$vars,
era_master_codes.chem = era_master_codes$chem,
era_master_codes.residues = era_master_codes$residues,
era_master_codes.trees = era_master_codes$trees,
era_master_codes.out = era_master_codes$out,
era_master_codes.prac = era_master_codes$prac) |>
# PRIMARY KEYS
dm_add_pk(Pub.Out, B.Code) |>
dm_add_pk(Site.Out, c(B.Code,Site.ID))|>
dm_add_pk(Times.Out, c(B.Code,Time)) |>
dm_add_pk(Times.Clim, c(B.Code,Site.ID,Time)) |>
dm_add_pk(Prod.Out, c(B.Code,Product.Subtype,Product.Simple)) |>
dm_add_pk(ExpD.Out, B.Code) |>
dm_add_pk(Var.Out, c(B.Code,V.Level.Name)) |>
dm_add_pk(Till.Out, c(B.Code,Till.Level.Name)) |>
dm_add_pk(Plant.Out, c(B.Code,P.Level.Name)) |>
dm_add_pk(Plant.Method, c(B.Code,P.Level.Name)) |>
dm_add_pk(PD.Codes, c(B.Code,PD.Level.Name)) |>
dm_add_pk(PD.Out, c(B.Code,PD.Level.Name,Site.ID ,Time)) |>
dm_add_pk(Chems.Code, c(B.Code,C.Level.Name)) |>
dm_add_pk(Chems.Out, c(B.Code,C.Level.Name,C.Type,C.Name)) |>
#dm_add_pk(Chems.AI, c(B.Code,C.Name)) |>
dm_add_pk(Fert.Out, c(B.Code,F.Level.Name)) |>
dm_add_pk(Fert.Method, c(B.Code,F.Level.Name,F.Type)) |>
#dm_add_pk(Fert.Comp, c(B.Code,F.Type)) |>
dm_add_pk(Res.Out, c(B.Code,M.Level.Name)) |>
dm_add_pk(Res.Method, c(B.Code,M.Level.Name)) |>
#dm_add_pk(Res.Comp, c(B.Code,M.Tree,M.Material)) |>
dm_add_pk(Har.Out, c(B.Code,H.Level.Name)) |>
dm_add_pk(pH.Out, c(B.Code,pH.Level.Name)) |>
dm_add_pk(pH.Method, c(B.Code,pH.Level.Name)) |>
dm_add_pk(WH.Out, c(B.Code,WH.Level.Name)) |>
dm_add_pk(AF.Out, c(B.Code,AF.Level.Name)) |>
dm_add_pk(AF.Trees, c(B.Code,AF.Level.Name)) |>
dm_add_pk(Weed.Out, c(B.Code,W.Level.Name)) |>
dm_add_pk(Other.Out, c(B.Code,O.Level.Name)) |>
dm_add_pk(Irrig.Codes, c(B.Code,I.Level.Name)) |>
dm_add_pk(Irrig.Method, c(B.Code,I.Level.Name)) |>
dm_add_pk(MT.Out, c(T.Name,B.Code)) |>
dm_add_pk(Int.Out, c(IN.Level.Name,B.Code)) |>
dm_add_pk(Rot.Out, c(R.Level.Name,B.Code)) |>
dm_add_pk(Rot.Seq, c(R.Level.Name,T.Name,IN.Level.Name,B.Code)) |>
dm_add_pk(Rot.Seq.Summ, c(R.Level.Name,B.Code)) |>
dm_add_pk(Out.Out, c(Out.Code.Joined,B.Code)) |>
dm_add_pk(Data.Out, c(Site.ID,Time,T.Name,IN.Level.Name,R.Level.Name,Out.Code.Joined,B.Code)) |>
dm_add_pk(era_master_codes.fert , F.Type) |>
dm_add_pk(era_master_codes.prod, c(Product.Subtype,Product.Simple)) |>
dm_add_pk(era_master_codes.vars, c(Product.Simple,V.Var)) |>
dm_add_pk(era_master_codes.chem, c(C.Type,C.Name)) |>
dm_add_pk(era_master_codes.residues, M.Material) |>
dm_add_pk(era_master_codes.trees, Tree.Latin.Name) |>
dm_add_pk(era_master_codes.out, Subindicator) |>
dm_add_pk(era_master_codes.prac, Subpractice) |>
# FOREIGN KEYS
dm_add_fk(Site.Out, B.Code, Pub.Out) |>
dm_add_fk(Times.Out, B.Code, Pub.Out) |>
dm_add_fk(Times.Clim, c(B.Code,Time), Times.Out) |>
dm_add_fk(Times.Clim, c(B.Code,Site.ID), Site.Out) |>
dm_add_fk(Prod.Out, B.Code, Pub.Out) |>
dm_add_fk(Prod.Out, c(Product.Subtype,Product.Simple), era_master_codes.prod) |>
dm_add_fk(ExpD.Out, B.Code, Pub.Out) |>
dm_add_fk(Plant.Out, B.Code,Pub.Out) |>
dm_add_fk(Plant.Method, c(B.Code,P.Level.Name),Plant.Out) |>
dm_add_fk(Var.Out, c(Product.Simple,V.Var),era_master_codes.vars) |>
dm_add_fk(Var.Out, c(B.Code,Product.Subtype,Product.Simple), Prod.Out) |>
dm_add_fk(Var.Out, B.Code, Pub.Out) |>
dm_add_fk(Till.Out, B.Code, Pub.Out) |>
dm_add_fk(Till.Out, c(B.Code,Site.ID), Site.Out) |>
dm_add_fk(Till.Out, c(B.Code,Time), Times.Out) |>
dm_add_fk(PD.Codes, B.Code, Pub.Out) |>
dm_add_fk(PD.Out, c(B.Code,PD.Level.Name), PD.Codes) |>
dm_add_fk(PD.Out, c(B.Code,Site.ID), Site.Out) |>
dm_add_fk(PD.Out, c(B.Code,Time), Times.Out) |>
dm_add_fk(Chems.Code, B.Code, Pub.Out) |>
dm_add_fk(Chems.Out, c(B.Code,C.Level.Name), Chems.Code) |>
dm_add_fk(Chems.Out, c(B.Code,Site.ID), Site.Out) |>
dm_add_fk(Chems.Out, c(B.Code,Time), Times.Out) |>
dm_add_fk(Chems.Out, c(C.Type,C.Name), era_master_codes.chem) |>
dm_add_fk(Fert.Out, B.Code, Pub.Out) |>
dm_add_fk(Fert.Method, c(B.Code,F.Level.Name), Fert.Out) |>
dm_add_fk(Fert.Method, F.Type, era_master_codes.fert ) |>
dm_add_fk(Fert.Method, c(B.Code,Time), Times.Out) |>
dm_add_fk(Fert.Method, c(B.Code,Site.ID), Site.Out ) |>
dm_add_fk(Res.Out, B.Code, Pub.Out) |>
dm_add_fk(Res.Method, c(B.Code,M.Level.Name), Res.Out) |>
dm_add_fk(Res.Method, M.Material, era_master_codes.residues) |>
dm_add_fk(Res.Method, Tree.Latin.Name, era_master_codes.trees) |>
dm_add_fk(Res.Method, c(B.Code,Time), Times.Out) |>
dm_add_fk(Res.Method, c(B.Code,Site.ID), Site.Out ) |>
dm_add_fk(pH.Out, B.Code, Pub.Out) |>
dm_add_fk(pH.Method, c(B.Code,pH.Level.Name), pH.Out) |>
dm_add_fk(WH.Out, B.Code, Pub.Out) |>
dm_add_fk(AF.Out, B.Code, Pub.Out) |>
dm_add_fk(AF.Trees, c(B.Code,AF.Level.Name), AF.Out) |>
dm_add_fk(AF.Trees, Tree.Latin.Name, era_master_codes.trees) |>
dm_add_fk(Weed.Out, B.Code, Pub.Out) |>
dm_add_fk(Other.Out, B.Code, Pub.Out) |>
dm_add_fk(Irrig.Codes, B.Code, Pub.Out) |>
dm_add_fk(Irrig.Method, c(B.Code,I.Level.Name), Irrig.Codes) |>
dm_add_fk(Irrig.Method, c(B.Code,Time), Times.Out) |>
dm_add_fk(Irrig.Method, c(B.Code,Site.ID), Site.Out ) |>
dm_add_fk(MT.Out, c(B.Code,Product.Subtype,Product.Simple), Prod.Out) |>
dm_add_fk(MT.Out, c(B.Code,F.Level.Name), Fert.Out) |>
dm_add_fk(MT.Out, c(B.Code,P.Level.Name), Plant.Out) |>
dm_add_fk(MT.Out, c(B.Code,V.Level.Name), Var.Out) |>
dm_add_fk(MT.Out, c(B.Code,Till.Level.Name), Till.Out) |>
dm_add_fk(MT.Out, c(B.Code,PD.Level.Name), PD.Codes) |>
dm_add_fk(MT.Out, c(B.Code,C.Level.Name), Chems.Code) |>
dm_add_fk(MT.Out, c(B.Code,M.Level.Name), Res.Out) |>
dm_add_fk(MT.Out, c(B.Code,H.Level.Name), Har.Out) |>
dm_add_fk(MT.Out, c(B.Code,pH.Level.Name), pH.Out) |>
dm_add_fk(MT.Out, c(B.Code,WH.Level.Name), WH.Out) |>
dm_add_fk(MT.Out, c(B.Code,AF.Level.Name), AF.Out) |>
dm_add_fk(MT.Out, c(B.Code,W.Level.Name), Weed.Out) |>
dm_add_fk(MT.Out, c(B.Code,O.Level.Name), Other.Out) |>
dm_add_fk(MT.Out, c(B.Code,I.Level.Name), Irrig.Codes) |>
dm_add_fk(Int.Out, c(B.Code,T.Name),MT.Out) |>
dm_add_fk(Rot.Seq, c(B.Code,T.Name),MT.Out) |>
dm_add_fk(Rot.Seq, c(B.Code,IN.Level.Name),Int.Out) |>
dm_add_fk(Rot.Seq, c(B.Code,R.Level.Name),Rot.Out) |>
dm_add_fk(Rot.Seq, c(B.Code,Time),Times.Out) |>
dm_add_fk(Rot.Seq.Summ, c(B.Code,R.Level.Name),Rot.Out) |>
dm_add_fk(Out.Out, Subindicator,era_master_codes.out) |>
dm_add_fk(Data.Out, c(B.Code,Out.Code.Joined),Out.Out) |>
dm_add_fk(Data.Out, c(B.Code,Site.ID),Site.Out) |>
dm_add_fk(Data.Out, c(B.Code,Time),Times.Out) |>
dm_add_fk(Data.Out, c(B.Code,T.Name),MT.Out) |>
dm_add_fk(Data.Out, c(B.Code,IN.Level.Name),Int.Out) |>
dm_add_fk(Data.Out, c(B.Code,R.Level.Name),Rot.Out)
7 Save Data Model
<- gsub(".RData", "_model.rds", basename(target_file))
save_name <- file.path(dl_dir, save_name)
save_file saveRDS(era_agronomy.model, file = save_file)
8 Show Data Model
- The data model enables efficient checking of table linkages and enforcement of relational structure.
- For a visualisation, you can use:
<- dm::dm_draw(era_agronomy.model)
g <- export_svg(g)
svg cat(svg)