Introduction

Overview

This notebook assembles and cleans heterogeneous livestock-nutrition trials into a single, analysis-ready dataset that can be used to quantify enteric-methane (CH₄) emissions and emission-intensity for cattle, sheep and goats. Using R, we use ERA metadata, harmonise units, fill nutritional gaps (five-step cascade: original paper → Feedipedia → ILRI feeds DB → proximate-composition equation of Weiss & Tebbe → ingredient means), and reconstruct diet-level gross energy (GE) where necessary.

Emission calculations

Daily GE intake is converted to annual CH₄ with the IPCC Tier 2 equation. Methane-conversion factors (Ym) follow the 2019 IPCC defaults but are adjusted for diet forage level and dairy productivity where the data allow. Absolute emissions are then paired with weight-gain and milk-/meat-yield records to derive emission-intensity (kg CH₄ kg-product⁻¹). A change-detection routine classifies every treatment diet relative to its control (addition, substitution, dosage change) so diet strategies can be linked directly to shifts in EI.

Why this matters

Ruminants in sub-Saharan Africa often operate far below potential productivity; improving feed quality can therefore lower both absolute and per-kilogram emissions. By turning hundred of individual studies into a coherent Tier-2 dataset, this pipeline provides a transparent basis for (i) benchmarking baseline emissions, (ii) identifying high-leverage nutritional interventions, and (iii) stress-testing alternative CH₄ equations or region-specific Ym values.

A detailed methodological rationale is given in Livestock Emission Method.docx.

1) Set up

This section installs and loads required packages, connects to the S3 bucket containing ERA data, and loads both the livestock and camel metadata into memory.

# Install and load 'pacman' package manager if not already installed

library(ERAg)
library(stringr)
library(knitr)
library(dplyr)
library(DT)
library(readxl)
library(tidyr)
library(arrow)
library(ggplot2)
library(data.table)
library(purrr)
library(janitor)
library(ggalt)
# Set up connection to S3 bucket
s3 <- s3fs::S3FileSystem$new(anonymous = TRUE)
era_s3 <- "s3://digital-atlas/era"

# List the files in the s3 bucket
files <- s3$dir_ls(file.path(era_s3,"data"))

# Identify the latest skinny_cow_2022-YYYY-MM-DD.RData file
files <- tail(grep(".RData", grep("skinny_cow_2022", files, value=TRUE), value=TRUE), 1)

# Download to local if not already present
save_path <- file.path(getwd(), basename(files))
if(!file.exists(save_path)){
  s3$file_download(files, save_path, overwrite = TRUE)
}

# Load the data
livestock_metadata <- miceadds::load.Rdata2(file=basename(save_path), path=dirname(save_path))
# Set up connection to S3 bucket for camel data 
s3 <- s3fs::S3FileSystem$new(anonymous = TRUE)
era_s3 <- "s3://digital-atlas/era"

# List the files in the s3 bucket
files <- s3$dir_ls(file.path(era_s3,"data"))

# Identify the latest skinny_cow_2022-YYYY-MM-DD.RData file
files <- tail(grep(".RData", grep("courageous_camel_2024", files, value=TRUE), value=TRUE), 1)

# Download to local if not already present
save_path <- file.path(getwd(), basename(files))
if(!file.exists(save_path)){
  s3$file_download(files, save_path, overwrite = TRUE)
}

# Load the data
camel <- miceadds::load.Rdata2(file=basename(save_path), path=dirname(save_path))

to DO

missing_unit_feed_intake<- camel$Data.Out %>%
  select(B.Code, T.Name, ED.Intake.Item,
         ED.Mean.T, ED.Error, Out.Unit, Out.Subind,Out.Code.Joined) %>%
  filter(Out.Subind == "Feed Intake") %>%
  filter(is.na(Out.Unit))

fill in feed intakes

# #Add units where they are NA for Feed-Intake rows
# camel$Data.Out[
#   Out.Subind == "Feed Intake" & is.na(Out.Unit),                 # target rows
#   Out.Unit := sub("^.*?\\.\\.(.*?)\\.\\..*$", "\\1", Out.Code.Joined)  # extract middle chunk
# ]
# 
# feed_intake<-camel$Data.Out%>%
#   filter(Out.Subind=="Feed Intake")
# 
# sort(unique(feed_intake$Out.Unit), na.last = TRUE)   # A-Z, NAs at the end
##### 0. column templates ------------------------------------------------------
diet_cols   <- c("B.Code","A.Level.Name","D.Item","D.Type","D.Amount",
                 "D.Unit.Amount","D.Unit.Time","D.Unit.Animals",
                 "DC.Is.Dry","D.Ad.lib")
mt_cols     <- c("B.Code","A.Level.Name","T.Animals","T.Name","T.Control")
comp_cols   <- c("B.Code","D.Item","is_entire_diet",
                 "DC.Variable","DC.Value","DC.Unit")
digest_cols <- c("B.Code","D.Item","is_entire_diet",
                 "DD.Variable","DD.Value","DD.Unit")
data_cols   <- c("B.Code","A.Level.Name","Out.Code.Joined",
                 "ED.Intake.Item","ED.Intake.Item.Raw",
                 "ED.Mean.T","ED.Error","Out.Subind","Out.Unit",
                 "Out.WG.Start","Out.WG.Unit","is_entire_diet","T.Animals")
prod_cols   <- c("B.Code","P.Product")

##### 1. livestock tables (unchanged) ----------------------------------------
diet_livestock   <- livestock_metadata$Animals.Diet        [ , ..diet_cols ][, Source:="livestock"]
mt_livestock     <- livestock_metadata$MT.Out              [ , ..mt_cols   ][, Source:="livestock"]
comp_livestock   <- livestock_metadata$Animals.Diet.Comp   [ , ..comp_cols ][, Source:="livestock"]
digest_livestock <- livestock_metadata$Animals.Diet.Digest [ , ..digest_cols][, Source:="livestock"]
data_livestock   <- livestock_metadata$Data.Out            [ , ..data_cols ][, Source:="livestock"]
prod_livestock   <- livestock_metadata$Prod.Out            [ , ..prod_cols ][, Source:="livestock"]

##### 2. camel – build one key that links everything --------------------------
## 2·1  T-level information from MT.Out  (diet names + control flag)
diet_key <- camel$MT.Out[ , .(B.Code, T.Name, A.Level.Name, T.Control,Herd.Level.Name,Herd.Sublevel.Name)]

## 2·2  Animal numbers & start-weights from Herd.Out
herd_info <- camel$Herd.Out[ ,
   .(B.Code,
    Herd.Level.Name,
    Herd.Sublevel.Name,
    Herd.Sex,
    Herd.Stage,
     T.Animals    = Herd.N,
     Out.WG.Start = Herd.Start.Weight,
     Out.WG.Unit  = Herd.Start.Weight.Unit)
]

##### 3. camel tables ----------------------------------------------------------
## 3·1  Animals.Diet
diet_camel <- camel$Animal.Diet[ , ..diet_cols ][ , Source:="camel"]

## 3·2  MT.Out  (add T.Animals via herd_info)
mt_camel <- merge(diet_key,
                  herd_info,
                  by = c("B.Code","Herd.Level.Name","Herd.Sublevel.Name"),
                  all.x = TRUE)[, Source := "camel"]
        # ..mt_cols  **includes T.Animals**


## 3·3  Comp / Digest
comp_camel <- camel$Animal.Diet.Comp[ ,
       .(B.Code, D.Item, is_entire_diet,
         DC.Variable = DN.Variable,
         DC.Value    = DN.Value,
         DC.Unit     = DN.Unit)][, Source:="camel"]

digest_camel <- camel$Animal.Diet.Digest[ ,
       .(B.Code, D.Item, is_entire_diet,
         DD.Variable, DD.Value, DD.Unit)][, Source:="camel"]

## 3·4  Data.Out   (join diet_key → A.Level.Name, then herd_info)
data_camel <- merge(camel$Data.Out,                # brings in T.Name*
                    diet_key,                      # adds A.Level.Name & control
                    by = c("B.Code","T.Name"),
                    all.x = TRUE, sort = FALSE)

data_camel <- merge(data_camel, herd_info, by = c("B.Code","Herd.Level.Name","Herd.Sublevel.Name"),
                    all.x = TRUE, sort = FALSE)

data_camel <- data_camel[ ,
   .(B.Code, A.Level.Name,
     ED.Intake.Item,
     ED.Intake.Item.Raw = ED.Intake.Item,
     ED.Mean.T, ED.Error, Out.Unit, Out.Subind,
     Out.WG.Start, Out.WG.Unit,
     T.Animals)][, Source:="camel"]

## 3·5  product table
prod_camel <- camel$Herd.Out[ ,
   .(B.Code,
     P.Product = V.Product,
     Herd.Stage)][, Source:="camel"]

##### 4·a. harmonise Pub.Out --------------------------------------------------
# Get all columns from both pub tables
pub_cols <- union(names(livestock_metadata$Pub.Out), names(camel$Pub.Out))

# Add missing columns to each table
for (col in setdiff(pub_cols, names(livestock_metadata$Pub.Out))) {
  livestock_metadata$Pub.Out[, (col) := NA]
}
for (col in setdiff(pub_cols, names(camel$Pub.Out))) {
  camel$Pub.Out[, (col) := NA]
}

# Reorder columns to match
setcolorder(livestock_metadata$Pub.Out, pub_cols)
setcolorder(camel$Pub.Out, pub_cols)

# Add Source columns (optional, for tracking origin)
livestock_metadata$Pub.Out[, Source := "livestock"]
camel$Pub.Out[, Source := "camel"]



##### 4. bind camel + livestock ----------------------------------------------
diet_all   <- rbindlist(list(diet_livestock,   diet_camel  ), fill = TRUE)
mt_all     <- rbindlist(list(mt_livestock,     mt_camel    ), fill = TRUE)
comp_all   <- rbindlist(list(comp_livestock,   comp_camel  ), fill = TRUE)
digest_all <- rbindlist(list(digest_livestock, digest_camel), fill = TRUE)
data_all   <- rbindlist(list(data_livestock,   data_camel  ), fill = TRUE)
prod_all   <- rbindlist(list(prod_livestock,   prod_camel  ), fill = TRUE)
pub_all <- rbind(livestock_metadata$Pub.Out,camel$Pub.Out)
pub_all <- rbindlist(list(livestock_metadata$Pub.Out, camel$Pub.Out), fill = TRUE)

merged_metadata <- list(
    Pub.Out= pub_all,
  Animals.Diet        = diet_all,
  MT.Out              = mt_all,
  Animals.Diet.Comp   = comp_all,
  Animals.Diet.Digest = digest_all,
  Data.Out            = data_all,
  Prod.Out            = prod_all
)
# Clean up intermediate objects
rm(
  diet_cols, mt_cols, comp_cols, digest_cols, data_cols, prod_cols,
  diet_livestock, mt_livestock, comp_livestock, digest_livestock, data_livestock, prod_livestock,
  diet_camel, mt_camel, comp_camel, digest_camel, data_camel, prod_camel, herd_weight,comp_all,data_all,diet_all,digest_all,mt_all,prod_all,livestock_metadata,camel,herd_info,diet_key)

# (Optional) Run gc() to free memory
gc()
##           used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 3727077 199.1    7450001 397.9  4703350 251.2
## Vcells 7085788  54.1   16486262 125.8 14341494 109.5

1.1 Correct T.Control INCLUDE IN ERA DEV

When t control is yes for one of the diets then automatically enter No for the other diets of the same paper. Also for missing t control if the name of the diet contains control or Con or Zero then automatically deduce this is a control diet and others are treatment diets.

MT <- copy(merged_metadata$MT.Out)


MT[, T.Control := tolower(trimws(as.character(T.Control))) ]


MT[
  ## diet name suggests a control …
  grepl("\\bzero\\b|\\bcon\\b|control", A.Level.Name, ignore.case = TRUE) &

  ## … but flag is still empty / ambiguous
  (is.na(T.Control) | T.Control %in% c("0", "false","na")),

  T.Control := "yes"
]



MT[, T.Control := {
  has_ctrl <- any(T.Control == "yes", na.rm = TRUE)          # does study have a control?
  
  if (has_ctrl) {
    # 4·a  normalise obvious “no” variants
    flag <- fifelse(
      T.Control %in% c("false", "0", "n", "no"), "no", T.Control
    )
    # 4·b  ensure exactly one category: yes / no
    fifelse(flag == "yes", "yes", "no")
    
  } else {
    # keep whatever was there (even "0", "false", NA, …)
    T.Control
  }
}, by = B.Code]


MT[, T.Control := fifelse(
  (is.na(T.Control) | T.Control %in% c("", "na")) & any(T.Control == "yes", na.rm = TRUE),
  "no",
  T.Control
), by = B.Code]


merged_metadata$MT.Out <- MT


controls_multi <- MT[
  T.Control == "yes", 
  .SD[uniqueN(A.Level.Name) > 1], 
  by = B.Code
]

controls_multi[, .(B.Code, A.Level.Name, T.Control)]
##      B.Code                     A.Level.Name T.Control
##      <char>                           <char>    <char>
##   1: HK0287 Conc Commercial...Concentrate HM       yes
##   2: HK0287                  Conc Commercial       yes
##   3: HK0287                   Concentrate HM       yes
##   4: JS0340         MSP0 Enzyme...MSP0 Yeast       yes
##   5: JS0340                      MSP0 Enzyme       yes
##  ---                                                  
## 265: JO1088                    100g MOBYS RS       yes
## 266: JO1088                    200g MOBYS RS       yes
## 267: JO1088                    300g MOBYS RS       yes
## 268: JO1095                           Diet 1       yes
## 269: JO1095                           Diet 5       yes

1.2) Correct A.Level.Name

Some feed intake entries are not matched to any ingredient data due to discrepancies in A.Level.Name between the feed intake and ingredient datasets for the same study codes. In this section, we build a table to flag studies where diet names do not align, helping to identify cases that may require manual review or correction.

# Step 1: Prepare cleaned input
ingredient_levels <- merged_metadata$Animals.Diet %>%
  distinct(B.Code, A.Level.Name) %>%
  mutate(Source = "Ingredient")

feedintake_levels <- merged_metadata$Data.Out %>%
  filter(Out.Subind == "Feed Intake", !is.na(A.Level.Name)) %>%
  distinct(B.Code, A.Level.Name) %>%
  mutate(Source = "FeedIntake")

# Step 2: Create mismatch table
combined_levels <- bind_rows(ingredient_levels, feedintake_levels)

comparison_table <- combined_levels %>%
  group_by(B.Code, A.Level.Name) %>%
  summarise(
    Sources = paste(sort(unique(Source)), collapse = " & "),
    .groups = "drop"
  ) %>%
  mutate(
    Match_Status = case_when(
      Sources == "FeedIntake & Ingredient" ~ "Match",
      Sources == "Ingredient" ~ "Only in Ingredient",
      Sources == "FeedIntake" ~ "Only in FeedIntake",
      TRUE ~ "Unknown"
    )
  )

# Step 3: Identify B.Codes with exactly 1 unmatched in both
to_correct <- comparison_table %>%
  filter(Match_Status %in% c("Only in Ingredient", "Only in FeedIntake")) %>%
  group_by(B.Code) %>%
  summarise(
    n_ing = sum(Match_Status == "Only in Ingredient"),
    n_feed = sum(Match_Status == "Only in FeedIntake"),
    Ingredient_Name = A.Level.Name[Match_Status == "Only in Ingredient"][1],
    FeedIntake_Name = A.Level.Name[Match_Status == "Only in FeedIntake"][1],
    .groups = "drop"
  ) %>%
  filter(n_ing == 1, n_feed == 1)

# Step 4: Apply the corrections to Data.Out (rename unmatched FeedIntake names)
merged_metadata$Data.Out <- merged_metadata$Data.Out %>%
  left_join(to_correct, by = c("B.Code", "A.Level.Name" = "FeedIntake_Name")) %>%
  mutate(
    A.Level.Name = coalesce(Ingredient_Name, A.Level.Name)
  ) %>%
  select(-Ingredient_Name)


rm(ingredient_levels,feedintake_levels,to_correct,combined_levels,comparison_table)

# Get merged stats
merged_stats <- list(
  papers = length(unique(merged_metadata$Animals.Diet$B.Code)),
  diets = length(unique(merged_metadata$Animals.Diet$A.Level.Name)),
  ingredients = length(unique(merged_metadata$Animals.Diet$D.Item))
)

1.3) Correct ingredient units

Some ingredients appear multiple times with amounts reported in different units. In this section, we prioritize entries that are already in preferred units to minimize the need for conversions.

## keep ingredient that appear twice with different units only once 

# Define your unit preferences
preferred_units <- c("g", "kg")
preferred_time <- "day"
preferred_animals <- "individual"

# Prioritize and deduplicate
merged_metadata$Animals.Diet <- merged_metadata$Animals.Diet %>%
  # Add a priority score to sort by best rows
  mutate(
    priority = case_when(
      D.Unit.Amount %in% preferred_units &
        D.Unit.Time == preferred_time &
        D.Unit.Animals == preferred_animals ~ 1,
      TRUE ~ 2  # lower priority
    )
  ) %>%
  group_by(B.Code, A.Level.Name, D.Item) %>%
  slice_min(order_by = priority, with_ties = FALSE) %>%
  ungroup() %>%
  select(-priority)  # optional: drop helper column

rm(preferred_units,preferred_time,preferred_animals)


### correct extraction errors 

merged_metadata$Animals.Diet.Comp <- merged_metadata$Animals.Diet.Comp %>%
  mutate(
    DC.Value = ifelse(
      B.Code == "BO1006" &
      DC.Variable == "Ash" &
      D.Item == "Ration 2" & 
      as.numeric(DC.Value) == 628,
      6.28,
      DC.Value
    )
  )

1.4 Take out grazing experiments

species_by_bcode <- merged_metadata$Prod.Out %>%
  select(B.Code, P.Product) %>%
  distinct()

# 2. Papers per species before filtering
# papers_before <- merged_metadata$Animals.Diet %>%
#   select(B.Code) %>%
#   distinct() %>%
#   left_join(species_by_bcode, by = "B.Code") %>%
#   count(P.Product, name = "papers_before")

# Step 1: Identify A.Level.Name (diet) that include Grazing
grazing_diets <- merged_metadata$Animals.Diet %>%
  filter(D.Type == "Grazing") %>%
  distinct(B.Code, A.Level.Name)

length(unique(grazing_diets$B.Code))
## [1] 63
# Step 2: Count how many non-grazing diets exist per B.Code
nongrazing_counts <- merged_metadata$Animals.Diet %>%
  filter(!(D.Type == "Grazing")) %>%
  distinct(B.Code, A.Level.Name) %>%
  group_by(B.Code) %>%
  summarise(n_nongrazing = n(), .groups = "drop")

length(unique(nongrazing_counts$B.Code))
## [1] 712
# Step 3: Keep only diets that are not marked as Grazing AND are in B.Codes with at least one non-grazing diet
merged_metadata$Animals.Diet <- merged_metadata$Animals.Diet %>%
  anti_join(grazing_diets, by = c("B.Code", "A.Level.Name")) %>%  # remove all diets that were grazing
  semi_join(nongrazing_counts, by = "B.Code")  # keep only papers with at least one non-grazing diet


species_by_bcode <- merged_metadata$Prod.Out %>%
  select(B.Code, P.Product) %>%
  distinct()

# 2. Papers per species before filtering
# papers_after <- merged_metadata$Animals.Diet %>%
#   select(B.Code) %>%
#   distinct() %>%
#   left_join(species_by_bcode, by = "B.Code") %>%
#   count(P.Product, name = "papers_after")

1.5) Correct D.Type PETE HAS TO CHANGE THIS ERA DEV

Some ingredients have more than one type, for those we merge the ingredients with the masterhseet to get the correct D.Type

# # Step 1: Load master sheet
# url <- "https://raw.githubusercontent.com/Mlolita26/ERA_Docs/main/era_master_sheet(7).xlsx"
# temp_file <- tempfile(fileext = ".xlsx")
# download.file(url, destfile = temp_file, mode = "wb")
# 
# # Step 2: Read Ani diet and AOM sheets
# ani_diet <- read_excel(temp_file, sheet = "ani_diet")
# aom <- read_excel(temp_file, sheet = "AOM") %>% filter(L5 == "Feed Ingredient")
# 
# # Step 3: Clean Ani diet — rename columns and trim whitespace
# ani_diet_clean <- ani_diet %>%
#   rename_with(~ str_replace_all(., "\\s+", "_")) %>%
#   mutate(across(everything(), ~ str_trim(as.character(.))))
# 
# # Step 4: Create long-format lookup for D.Item corrections
# ditem_lookup <- ani_diet_clean %>% 
#   select(D.Item.Root.Comp.Proc_Major, D.Item, D.Item.Root.Other.Comp.Proc_All) %>% 
#   pivot_longer(cols = c(D.Item, D.Item.Root.Other.Comp.Proc_All),
#                names_to = "Source_Field", values_to = "Name") %>% 
#   filter(!is.na(Name), !is.na(D.Item.Root.Comp.Proc_Major)) %>% 
#   separate_rows(Name, sep = "[,;]\\s*") %>%   # ← changed line
#   mutate(Name = str_to_lower(str_trim(Name)),
#          D.Item.Root.Comp.Proc_Major = str_trim(D.Item.Root.Comp.Proc_Major)) %>% 
#   group_by(Name) %>% 
#   slice(1) %>% 
#   ungroup()
# 
# 
# # Step 5: Apply correction to Animals.Diet using the lookup
# animals_diet_updated <- merged_metadata$Animals.Diet %>%
#   mutate(D.Item = str_to_lower(str_trim(D.Item))) %>%
#   left_join(ditem_lookup, by = c("D.Item" = "Name")) %>%
#   mutate(D.Item = coalesce(D.Item.Root.Comp.Proc_Major, D.Item)) %>%
#   select(-D.Item.Root.Comp.Proc_Major)
# 
# # Step 6: Clean and prepare AOM mapping
# aom_clean <- aom %>%
#   rename_with(~ str_replace_all(., "\\s+", "_")) %>%
#   mutate(
#     Corrected_Type = ifelse(
#       str_to_lower(L6) %in% c("other ingredients", "forage plants") & !is.na(L7),
#       L7,                       # use L7 when rule matches
#       L6                        # otherwise keep L6
#     ),
#     Edge_Value = str_trim(Edge_Value),
#     Synonym    = str_trim(Synonym)
#   )
# 
# # Step 7: Build mapping table from Edge_Value + Synonym
# direct_match <- aom_clean %>%
#   select(D.Item = Edge_Value, Corrected_Type) %>%
#   filter(!is.na(D.Item))
# 
# synonym_match <- aom_clean %>%
#   filter(!is.na(Synonym)) %>%
#   separate_rows(Synonym, sep = ";\\s*") %>%
#   mutate(Synonym = str_trim(Synonym)) %>%
#   select(D.Item = Synonym, Corrected_Type)
# 
# aom_mapping <- bind_rows(direct_match, synonym_match) %>%
#   mutate(D.Item = str_to_lower(str_trim(D.Item))) %>%
#   distinct(D.Item, Corrected_Type)
# 
# # Step 8: Join to assign Corrected_Type → D.Type
# animals_diet_updated <- animals_diet_updated %>%
#   mutate(D.Item = str_to_lower(str_trim(D.Item))) %>%
#   left_join(aom_mapping, by = "D.Item") %>%
#   mutate(D.Type = ifelse(!is.na(Corrected_Type), Corrected_Type, D.Type)) %>%
#   select(-Corrected_Type)
# 
# # # Step 9: Save back to merged metadata
# merged_metadata$Animals.Diet <- animals_diet_updated
# # 
# # # Step 10: Optional check — items with no match
# # check <- animals_diet_updated %>%
# #   filter(is.na(Corrected_Type) & D.Type != "entire diet")
# # 

also when same species so same in mastersheet they should have the same caterogy

those need to be categforized because there is nothing for d item

to_be_categorized<-merged_metadata$Animals.Diet%>%
  filter(is.na(D.Item))%>%
  filter(D.Type!="Entire Diet")

##1.8) Chnage corn to Maize

# ── Change “Corn” → “Maize” (whole-word match, case-insensitive) ─────────────
fix_corn <- function(df) {
  df %>% 
    mutate(
      D.Item = ifelse(
        str_detect(trimws(D.Item), regex("^corn$", ignore_case = TRUE)),
        "Maize",
        D.Item
      )
    )
}

merged_metadata$Animals.Diet        <- fix_corn(merged_metadata$Animals.Diet)
merged_metadata$Animals.Diet.Comp   <- fix_corn(merged_metadata$Animals.Diet.Comp)
merged_metadata$Animals.Diet.Digest <- fix_corn(merged_metadata$Animals.Diet.Digest)

1.7) Summary of the data

# Define species of interest
species_list <- c("Cattle", "Sheep", "Goat")

# Initialize result containers
species_summary <- list()
bcode_table <- list()

# Loop over each species
for (species in species_list) {
  
  # All B.Codes for this species
  species_bcodes <- merged_metadata$Prod.Out %>%
    filter(P.Product == species) %>%
    pull(B.Code) %>%
    unique()
  
  # Feed intake B.Codes
  feed_data <- merged_metadata$Data.Out %>%
    filter(B.Code %in% species_bcodes, !is.na(ED.Mean.T)) %>%
    filter(Out.Subind=="Feed Intake")%>%
    select(B.Code, D.Item = ED.Intake.Item)
  
  feed_bcodes <- unique(feed_data$B.Code)
  
  # GE B.Codes
  ge_data <- merged_metadata$Animals.Diet.Comp %>%
    filter(B.Code %in% species_bcodes, DC.Variable == "GE", !is.na(DC.Value)) %>%
    select(B.Code, D.Item)
  
  ge_bcodes <- unique(ge_data$B.Code)
  
  # Matched B.Codes (join on both B.Code and D.Item)
  matched_bcodes <- inner_join(feed_data, ge_data, by = c("B.Code", "D.Item")) %>%
    pull(B.Code) %>%
    unique()
  
  # Store species-level summary
  species_summary[[species]] <- tibble(
    Species = species,
    Total_BCodes = length(species_bcodes),
    With_FEED = length(feed_bcodes),
    With_GE = length(ge_bcodes),
    With_BOTH = length(matched_bcodes)
  )
  
  # Store long-format bcode list
  bcode_table[[species]] <- tibble(
    B.Code = c(species_bcodes, feed_bcodes, ge_bcodes, matched_bcodes),
    Category = rep(c("All", "With_FEED", "With_GE", "With_BOTH"), 
                   times = c(length(species_bcodes), length(feed_bcodes), length(ge_bcodes), length(matched_bcodes))),
    Species = species
  ) %>% distinct()
}

# Bind results
species_summary_df <- bind_rows(species_summary)
bcode_table_df <- bind_rows(bcode_table)

2) Harmonize units of ingredient amounts

This step standardizes ingredient amount units (e.g., from mg, kg, %, etc.) to a consistent base (usually grams per individual per day), ensuring reliable downstream calculations and comparisons.

# diet ingredients: breakdown of all ingredients within each diet
ingredients <- merged_metadata$Animals.Diet %>%
  dplyr::select(
    B.Code,
    A.Level.Name,
    D.Item,
    D.Type,
    D.Amount,
    D.Unit.Amount,
    D.Unit.Time,
    D.Unit.Animals,
    DC.Is.Dry,
    D.Ad.lib
  ) %>%
  mutate(D.Amount = as.numeric(D.Amount))

# Function to harmonize diet ingredient units
harmonize_units <- function(df) {
  df %>%
    mutate(
      # Record original units (optional, for debugging/traceability)
      Units = paste(D.Unit.Amount, D.Unit.Time, D.Unit.Animals, sep=";"),

      # Harmonize weight and volume units to a consistent base
      D.Amount = case_when(
        D.Unit.Amount %in% c("kg", "mg", "g/100g", "l", "kg/t", "g/300l", 
                             "mg/kg", "kg/100kg", "kg/100kg body weight", 
                             "kg/kg metabolic weight", "g/kg metabolic weight (0.75)", 
                             "g/kg DMI", "g/L", "g/kg DM") ~ {
          case_when(
            D.Unit.Amount == "kg" ~ D.Amount * 1000,                      # kg -> g
            D.Unit.Amount == "mg" ~ D.Amount / 1000,                     # mg -> g
            D.Unit.Amount == "g/100g" ~ D.Amount * 10,                   # g/100g -> g/kg
            D.Unit.Amount == "l" ~ D.Amount * 1000,                      # l -> ml
            D.Unit.Amount == "kg/t" ~ D.Amount * 1000,                   # kg/t -> g/kg
            D.Unit.Amount == "g/300l" ~ D.Amount / 300,                  # g/300l -> g/L
            D.Unit.Amount == "mg/kg" ~ D.Amount / 1000,                  # mg/kg -> g/kg
            D.Unit.Amount %in% c("kg/100kg", "kg/100kg body weight") ~ D.Amount * 10,     # -> g/kg body weight
            D.Unit.Amount == "kg/kg metabolic weight" ~ D.Amount * 1000, # -> g/kg metabolic weight
            D.Unit.Amount %in% c("g/kg body weight (0.75)", "g/kg Body Weight (0.75)") ~ D.Amount, # unchanged, just normalize name
            D.Unit.Amount == "g/kg DMI" ~ D.Amount,
            D.Unit.Amount == "g/kg DM" ~ D.Amount,
            TRUE ~ D.Amount
          )
        },
        TRUE ~ D.Amount
      ),

      # Update D.Unit.Amount to reflect new base
      D.Unit.Amount = case_when(
        D.Unit.Amount == "kg"  ~ "g",
        D.Unit.Amount == "mg"  ~ "g",
        D.Unit.Amount == "g/100g" ~ "g/kg",
        D.Unit.Amount == "l"  ~ "ml",
        D.Unit.Amount == "kg/t" ~ "g/kg",
        D.Unit.Amount == "g/300l" ~ "g/L",
        D.Unit.Amount == "mg/kg" ~ "g/kg",
        D.Unit.Amount %in% c("kg/100kg", "kg/100kg body weight") ~ "g/kg body weight",
        D.Unit.Amount %in% c("g/kg body weight (0.75)", "g/kg Body Weight (0.75)") ~ "g/kg metabolic weight",
        D.Unit.Amount == "kg/kg metabolic weight" ~ "g/kg metabolic weight",
        D.Unit.Amount == "g/kg metabolic weight (0.75)" ~ "g/kg metabolic weight",
        D.Unit.Amount == "g/kg DMI" ~ "g/kg",
        D.Unit.Amount == "g/kg DM" ~ "g/kg",
        D.Unit.Amount == "g/L" ~ "g/L",
        D.Unit.Amount == "g/kg Body Weight" ~ "g/kg body weight",  # case fix
        TRUE ~ D.Unit.Amount
      ),

      # Harmonize time units to day
      D.Amount = case_when(
        D.Unit.Time == "week" ~ D.Amount / 7,
        D.Unit.Time == "month" ~ D.Amount / 30,
        D.Unit.Time == "2 weeks" ~ D.Amount / 14,
        D.Unit.Time == "4 days interval" ~ D.Amount / 4,
        D.Unit.Time == "2x/day" ~ D.Amount * 2,
        TRUE ~ D.Amount
      ),
      D.Unit.Time = ifelse(!is.na(D.Unit.Time), "day", D.Unit.Time)
    ) %>%
    # Set unit fields to NA if amount is NA
    mutate(
      D.Unit.Amount   = ifelse(is.na(D.Amount), NA_character_, D.Unit.Amount),
      D.Unit.Time     = ifelse(is.na(D.Amount), NA_character_, D.Unit.Time),
      D.Unit.Animals  = ifelse(is.na(D.Amount), NA_character_, D.Unit.Animals)
    )
}



# Apply the harmonization
ingredients <- harmonize_units(ingredients)

# Load replicate info to adjust for "all animals in replicate"
reps <- merged_metadata$MT.Out %>%
  mutate(T.Animals = as.numeric(T.Animals)) %>%
  distinct(B.Code, A.Level.Name, .keep_all = TRUE)


ingredients <- ingredients %>%
  left_join(reps %>% dplyr::select(B.Code, A.Level.Name, T.Animals), by=c("B.Code","A.Level.Name")) %>%
  mutate(
    D.Amount = case_when(
      D.Unit.Animals == "all animals in replicate" & !is.na(T.Animals) ~ D.Amount / T.Animals,
      TRUE ~ D.Amount
    ),
    D.Unit.Animals = ifelse(D.Unit.Animals == "all animals in replicate", "individual", D.Unit.Animals)
  )

# Refresh 'Units' column after these adjustments (optional)
ingredients <- ingredients %>%
  mutate(Units = paste(D.Unit.Amount, D.Unit.Time, D.Unit.Animals, sep=";"))

rm(harmonize_units)

2.1) Suspicious dry amounts TO DO ADD IN ERA DEV

This step identifies ingredients where DC.Is.Dry is marked as NA or unspecified. We cross-check these with moisture values from the composition table to infer whether the ingredient is likely dry or wet. We also flag inconsistencies between the ingredient name (e.g., containing “dried”) and the recorded dry status.

# --- 1. Classify ingredients from Moisture or DM values ---
dry_classified <- bind_rows(
  # Moisture-based rules
  merged_metadata$Animals.Diet.Comp %>%
    filter(str_to_lower(DC.Variable) %in% c("moisture", "moist")) %>%
    mutate(
      Predictor = "Moisture",
      Value = as.numeric(DC.Value),
      Unit = str_to_lower(DC.Unit),
      Dry_Prediction = case_when(
        Unit %in% c("%", "% dm", "g/100g") & Value < 15  ~ TRUE,
        Unit %in% c("%", "% dm", "g/100g") & Value > 50  ~ FALSE,
        Unit == "g/kg" & Value < 150                     ~ TRUE,
        Unit == "g/kg" & Value > 500                     ~ FALSE,
        TRUE ~ NA
      )
    ),
  # DM-based rules
  merged_metadata$Animals.Diet.Comp %>%
    filter(str_to_lower(DC.Variable) == "dm") %>%
    mutate(
      Predictor = "DM",
      Value = as.numeric(DC.Value),
      Unit = str_to_lower(DC.Unit),
      Dry_Prediction = case_when(
        Unit %in% c("%", "% dm", "g/100g") & Value > 85  ~ TRUE,
        Unit %in% c("%", "% dm", "g/100g") & Value < 50  ~ FALSE,
        Unit == "g/kg" & Value > 850                    ~ TRUE,
        Unit == "g/kg" & Value < 500                    ~ FALSE,
        TRUE ~ NA
      )
    )
) %>%
  select(B.Code, D.Item, Predictor, Dry_Prediction) %>%
  distinct() %>%
  group_by(B.Code, D.Item, Predictor) %>%
  summarise(Dry_Prediction = first(na.omit(Dry_Prediction)), .groups = "drop") %>%
  pivot_wider(names_from = Predictor, values_from = Dry_Prediction, names_prefix = "Dry_Prediction_")

# --- 2. Detect "dry" or "dried" in name ---
dry_name_match <- ingredients %>%
  mutate(
    name_lower = str_to_lower(D.Item),
    Dry_by_name = str_detect(
      name_lower,
      "\\bdry\\b|\\bdried\\b|\\bmeal\\b|\\bground\\b|\\bmilled\\b|\\bflour\\b"
    ) & !str_detect(name_lower, "fish meal boiled")
  ) %>%
  select(B.Code, A.Level.Name,D.Item, Dry_by_name)


extra_dry <- ingredients %>%
  filter(
    D.Type %in% c("Non-ERA Feed", "Other Feed","Other Ingredients","Supplement"),
    !str_detect(str_to_lower(D.Item), "water|oil")
  ) %>%
  select(B.Code,A.Level.Name, D.Item) 

# Remove A.Level.Name from extra_dry before combining
extra_dry <- extra_dry %>%
  select(B.Code, D.Item) %>%
  distinct() %>%
  mutate(DC.Is.Dry = "Yes")


# --- 3. Combine predictions ---
# Combine dry predictions from DM/moisture, name-based detection, and extra rules
dry_prediction_final <- bind_rows(
  dry_classified %>%
    mutate(Prediction_Source = "Moisture_or_DM") %>%
    filter(Dry_Prediction_Moisture == TRUE | Dry_Prediction_DM == TRUE) %>%
    select(B.Code, D.Item) %>%
    mutate(DC.Is.Dry = "Yes"),
  
  dry_name_match %>%
    filter(Dry_by_name) %>%
    select(B.Code, D.Item) %>%
    mutate(DC.Is.Dry = "Yes", Prediction_Source = "Name_match"),
  
  extra_dry %>%
    select(B.Code, D.Item) %>%
    mutate(DC.Is.Dry = "Yes", Prediction_Source = "Extra_rule")
) %>%
  distinct(B.Code, D.Item, .keep_all = TRUE)


# --- 4. Apply predictions to ingredients table (update only predicted dry) ---
# 1. Join on D.Item
ingredients <- ingredients %>%
  left_join(dry_prediction_final, by = c("B.Code", "D.Item"), suffix = c("", ".from_DItem"))

# 2. Join also on A.Level.Name
ingredients <- ingredients %>%
  left_join(
    dry_prediction_final %>%
      rename(A.Level.Name = D.Item),
    by = c("B.Code", "A.Level.Name"),
    suffix = c("", ".from_AName")
  )

# 3. Apply logic: keep existing "Yes", but if original is "No" or NA, check if any of the predictions is "Yes"
ingredients <- ingredients %>%
  mutate(
    DC.Is.Dry = case_when(
      (is.na(DC.Is.Dry) | str_to_lower(DC.Is.Dry) %in% c("no","unspecified")) &
        (DC.Is.Dry.from_DItem == "Yes" | DC.Is.Dry.from_AName == "Yes") ~ "Yes",
      TRUE ~ DC.Is.Dry
    )
  ) %>%
  select(-DC.Is.Dry.from_DItem, -DC.Is.Dry.from_AName)


#take out water 
ingredients<-ingredients%>%
filter(D.Item != "Water" | is.na(D.Item))


rm(dry_classified,dry_name_match,dry_prediction_final,extra_dry)

Subset for Dry data

When feed intake data is missing and we rely on ingredient-level amounts, we need to ensure that the quantities used refer to the dry form of the ingredient. To do this, we subset the data to include only ingredients identified as dried—based on either the declared DC.Is.Dry value or our earlier flagging using ingredient names and moisture content predictions. This ensures consistency and accuracy in downstream calculations.

#take out unspecified ingredients

species_by_bcode <- merged_metadata$Prod.Out %>%
  select(B.Code, P.Product) %>%
  distinct()

# 2. Papers per species before filtering
# papers_before <- ingredients %>%
#   select(B.Code) %>%
#   distinct() %>%
#   left_join(species_by_bcode, by = "B.Code") %>%
#   count(P.Product, name = "papers_before")
# length(unique(ingredients$B.Code))


# Step 1: Identify A.Level.Name groups with at least one ingredient NOT dry
non_dry_diets <- ingredients %>%
  filter(is.na(DC.Is.Dry) | str_to_lower(DC.Is.Dry) %in% c("no", "n", "unspecified")) %>%
  distinct(B.Code, A.Level.Name)

# Step 2: Filter out these diets entirely from ingredients
ingredients <- ingredients %>%
  anti_join(non_dry_diets, by = c("B.Code", "A.Level.Name"))

length(unique(ingredients$A.Level.Name))
## [1] 1339
rm(non_dry_diets)

length(unique(ingredients$B.Code))
## [1] 492
species_by_bcode <- merged_metadata$Prod.Out %>%
  select(B.Code, P.Product) %>%
  distinct()

# 2. Papers per species before filtering
# papers_after <- ingredients %>%
#   select(B.Code) %>%
#   distinct() %>%
#   left_join(species_by_bcode, by = "B.Code") %>%
#   count(P.Product, name = "papers_after")

2.2) Filter data with acceptable units

We filter the dataset to retain only ingredients with acceptable units. Uncommon or unexpected unit formats are excluded to ensure consistency and simplify downstream processing

# Define a list of acceptable units
acceptable_units <- c(
  "g/kg", "g", "%", "ml", "L", "kg", "g/L",
  "g/kg metabolic weight", "% Diet", 
  "g/kg body weight", "ml/kg", "% Body Mass", 
  "% Concentrate", "g/kg body weight"  # leave only one if repeated
)


# Filter rows to keep only acceptable units
ingredients <- ingredients %>%
  filter(D.Unit.Amount %in% acceptable_units | is.na(D.Unit.Amount)) %>%
  mutate(D.Item = ifelse(is.na(D.Item), "Unspecified", D.Item))

#tracking paper loss because of harmnozied units
#total_studies_harmonized<- length(unique(ingredients_harmonized$B.Code))
#total_ingredients_harmonized<- length(unique(ingredients_harmonized$D.Item))
#total_diets_harmonized<- length(unique(ingredients_harmonized$A.Level.Name))

rm(acceptable_units)

2.3) Convert body weight units by linking animals weight to data

# Pre-filter Data.Out to keep only the first non-NA weight per group
weights_unique <- merged_metadata$Data.Out %>%
  filter(Out.Subind == "Weight Gain") %>%
  mutate(
    Out.WG.Unit = tolower(Out.WG.Unit),
    Out.WG.Start = case_when(
      Out.WG.Unit == "g" ~ Out.WG.Start / 1000,
      Out.WG.Unit == "mg" ~ Out.WG.Start / 1e6,
      Out.WG.Unit %in% c("kg", "unspecified", NA) ~ Out.WG.Start,
      TRUE ~ NA_real_
    ),
    Out.WG.Unit = ifelse(!is.na(Out.WG.Start), "kg", NA_character_)
  ) %>%
  filter(!is.na(Out.WG.Start)) %>%
  group_by(B.Code, A.Level.Name) %>%
  summarise(
    Out.WG.Start = mean(Out.WG.Start, na.rm = TRUE),
    Out.WG.Unit = "kg",
    .groups = "drop"
  )



# Step 3: Join body weight and keep only ingredients with weight
ingredients <- ingredients %>%
  left_join(
    weights_unique,
    by = c("B.Code", "A.Level.Name")) 


# Step 4: Convert D.Amount to grams based on body weight and update unit accordingly
ingredients <- ingredients %>%
  mutate(
    # Recalculate D.Amount using body weight or metabolic weight, result in kg
    D.Amount = case_when(
      D.Unit.Amount == "g/kg body weight" & !is.na(Out.WG.Start) ~ (D.Amount * Out.WG.Start) / 1000,
      D.Unit.Amount == "g/kg metabolic weight" & !is.na(Out.WG.Start) ~ (D.Amount * Out.WG.Start^0.75) / 1000,
      D.Unit.Amount == "% Body Mass" & !is.na(Out.WG.Start) ~ ((D.Amount / 100) * Out.WG.Start),
      TRUE ~ D.Amount  # unchanged if not convertible
    ),

    # Update unit to "kg" if recalculated
    D.Unit.Amount = case_when(
      D.Unit.Amount %in% c("g/kg body weight", "g/kg metabolic weight", "% Body Mass") &
        !is.na(Out.WG.Start) ~ "kg",
      TRUE ~ D.Unit.Amount
    )
  )

2.4) Estimate amounts using feed intake

For ingredients reported in relative units such as %, % Diet, or g/kg, we can estimate their absolute amounts when total diet feed intake is available. This allows us to convert proportions into standardized quantities.

missing_unit_feed_intake<- merged_metadata$Data.Out %>%
  select(B.Code, A.Level.Name, ED.Intake.Item, ED.Intake.Item.Raw,
         ED.Mean.T, ED.Error, Out.Unit, Out.Subind,is_entire_diet,Source,Out.Code.Joined) %>%
  filter(Out.Subind == "Feed Intake") %>%
  filter(is.na(Out.Unit))

2.4.1) Harmonization of feed intake data

feed_intake <- merged_metadata$Data.Out %>%
  select(B.Code, A.Level.Name, ED.Intake.Item, ED.Intake.Item.Raw,
         ED.Mean.T, ED.Error, Out.Unit, Out.Subind,is_entire_diet,T.Animals) %>%
  filter(Out.Subind == "Feed Intake") %>%
  filter(!is.na(ED.Mean.T))%>%
  mutate(
    Out.Unit = tolower(trimws(Out.Unit)),

    # Extract unit components
    Out.Unit.Animals = str_extract(Out.Unit, "\\b(individual|replicate|animal replicate)\\b"),
    Out.Unit.Time    = str_extract(Out.Unit, "\\b(day|d|yr|year)\\b"),
    Out.Unit.Amount  = Out.Unit %>%
      str_remove_all("\\b(individual|replicate|animal replicate)\\b") %>%
      str_remove_all("\\b(day|d|yr|year)\\b") %>%
      str_replace_all("/+", "/") %>%
      str_trim() %>%
      str_remove("/$"))


# Step 3: Convert replicate-level intake to individual-level when possible
feed_intake <- feed_intake %>%
  mutate(
    # Ensure numeric type
    ED.Mean.T = as.numeric(ED.Mean.T),
    T.Animals = as.numeric(T.Animals),

    # Convert intake and update unit
    ED.Mean.T = case_when(
      Out.Unit.Animals == "replicate" & !is.na(T.Animals) & T.Animals > 0 ~ ED.Mean.T / T.Animals,
      TRUE ~ ED.Mean.T
    ),
    Out.Unit.Animals = case_when(
      Out.Unit.Animals == "replicate" & !is.na(T.Animals) & T.Animals > 0 ~ "individual",
      TRUE ~ Out.Unit.Animals
    )
  )
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `T.Animals = as.numeric(T.Animals)`.
## Caused by warning:
## ! NAs introduced by coercion
# Define preferred units
preferred_units <- c("kg dm", "g dm")
preferred_time <- "day"
preferred_animals <- "individual"

# Create filtered table with prioritization
feed_intake <- feed_intake %>%
  mutate(
    priority = case_when(
      Out.Unit.Amount %in% preferred_units &
        Out.Unit.Time == preferred_time &
        Out.Unit.Animals == preferred_animals &
        !is.na(ED.Error) ~ 1,
      Out.Unit.Amount %in% preferred_units &
        Out.Unit.Time == preferred_time &
        Out.Unit.Animals == preferred_animals ~ 2,
      TRUE ~ 3
    )
  ) %>%
  group_by(B.Code, A.Level.Name, ED.Intake.Item) %>%
  slice_min(order_by = priority, with_ties = FALSE) %>%
  ungroup() %>%
  select(-priority)  # optional



### keep unique units of ingredients 


# Step 3: Join body weight
feed_intake <- feed_intake %>%
  left_join(
    weights_unique %>%
      select(B.Code, A.Level.Name, Out.WG.Start, Out.WG.Unit),
    by = c("B.Code", "A.Level.Name")
  ) %>%
  group_by(A.Level.Name) %>%
  mutate(
    Out.WG.Start = ifelse(is.na(Out.WG.Start),
                          first(na.omit(Out.WG.Start)),
                          Out.WG.Start)
  ) %>%
  ungroup() %>%
  distinct()

# Step 4: Convert ED.Mean.T to kg/day
feed_intake <- feed_intake %>%
  mutate(
    Out.Unit = trimws(tolower(Out.Unit)),
    Out.Unit.Amount = trimws(str_remove(Out.Unit.Amount, "/$")),
    ED.Mean.T = case_when(
      Out.Unit.Amount %in% c("g", "g dm") ~ ED.Mean.T / 1000,
      Out.Unit.Amount %in% c("kg", "kg dm") ~ ED.Mean.T,
      Out.Unit.Amount == "g dm/100 g" ~ ED.Mean.T * 10 / 1000,
      Out.Unit.Amount == "g/week" ~ ED.Mean.T / 7 / 1000,
      Out.Unit.Amount == "g dm/kg metabolic weight" ~ (ED.Mean.T * Out.WG.Start^0.75) / 1000,
      Out.Unit.Amount == "g dm/kg body weight" ~ (ED.Mean.T * Out.WG.Start) / 1000,
      Out.Unit.Amount == "mg" ~ ED.Mean.T / 1e6,
      Out.Unit.Amount %in% c("g/kg body weight", "g dm/kg body weight") ~ (ED.Mean.T * Out.WG.Start) / 1000,
      Out.Unit.Amount %in% c("g/kg body weight (0.75)", "g dm/kg body weight (0.75)") ~ (ED.Mean.T * Out.WG.Start^0.75) / 1000,
      Out.Unit.Amount %in% c("g/kg metabolic weight","g dm/kg metabolic weight") ~ (ED.Mean.T * Out.WG.Start^0.75) / 1000,
      Out.Unit.Amount == "kg dm/kg metabolic weight" ~ ED.Mean.T * Out.WG.Start^0.75,
      Out.Unit.Amount %in% c("% body weight", "dm % body weight") ~ (ED.Mean.T / 100) * Out.WG.Start,
      Out.Unit.Amount == "g/g body weight" ~ (ED.Mean.T * Out.WG.Start) / 1000,
      TRUE ~ NA_real_
    ),
    Out.Unit.Amount = case_when(
      Out.Unit.Amount %in% c("g", "g dm", "g dm/100 g", "g/week", "g/kg body weight",
                             "g dm/kg body weight", "g/kg body weight (0.75)",
                             "g dm/kg body weight (0.75)", "g/kg metabolic weight","g dm/kg metabolic weight","g/g body weight",
                             "kg dm/kg metabolic weight", "% body weight", 
                             "dm % body weight", "kg", "kg dm", "mg") ~ "kg",
      TRUE ~ Out.Unit.Amount
    )
  ) %>%
  select(-Out.Unit)

feed_intake <- feed_intake %>%
  group_by(B.Code, A.Level.Name, ED.Intake.Item.Raw) %>%
  arrange(
    desc(!is.na(ED.Mean.T)), 
    desc(Out.Unit.Amount == "kg"),
    desc(Out.Unit.Animals == "individual"),
    desc(Out.Unit.Time == "day")
  ) %>%
  slice(1) %>%
  ungroup()

2.4.2) Unit error check on feed intake

Performs basic outlier checks on feed intake data, identifying suspiciously high or low intake values based on predefined unit thresholds.

# Assuming your data is stored in a data frame named `feed_intake_data`

# Filter for units that imply kg/day/individual and check if ED.Mean.T > 50
# Identify suspicious feed intake values
## ---- detect_suspicious_feed_intake -----------------------------------------
# Thresholds
high_kg <- 50      # > 50 kg d-1 animal-1  ⟹ unrealistically high
low_kg  <- 0.1     # < 0.1 kg d-1 animal-1 ⟹ unrealistically low
low_g_min <- 1     # 1–20 g range unlikely for daily intake
low_g_max <- 20

suspicious_feed_intake <- feed_intake %>% 
  mutate(
    flag = case_when(
      Out.Unit.Amount %in% c("kg", "kg dm") &
      Out.Unit.Time   == "day"              &
      Out.Unit.Animals == "individual" & ED.Mean.T >  high_kg ~ "too_high_kg",

      Out.Unit.Amount %in% c("kg", "kg dm") &
      Out.Unit.Time   == "day"              &
      Out.Unit.Animals == "individual" & ED.Mean.T <  low_kg  ~ "too_low_kg",

      Out.Unit.Amount %in% c("g",  "g dm")  &
      ED.Mean.T >= low_g_min & ED.Mean.T <= low_g_max          ~ "implausibly_low_g",

      TRUE ~ NA_character_
    )
  ) %>% 
  filter(!is.na(flag)) %>% 
  select(B.Code, A.Level.Name, ED.Mean.T,
         Out.Unit.Amount, Out.Unit.Time, Out.Unit.Animals, flag)

print(suspicious_feed_intake)
## # A tibble: 579 × 7
##    B.Code A.Level.Name ED.Mean.T Out.Unit.Amount Out.Unit.Time Out.Unit.Animals
##    <chr>  <chr>            <dbl> <chr>           <chr>         <chr>           
##  1 AG0051 NG              0.0083 kg              day           individual      
##  2 AG0051 NGD             0.0082 kg              day           individual      
##  3 AG0051 PG              0.0091 kg              day           individual      
##  4 AG0051 PGD             0.0092 kg              day           individual      
##  5 AN0007 T1              0.0696 kg              day           individual      
##  6 AN0007 T2              0.0735 kg              day           individual      
##  7 AN0007 T3              0.0808 kg              day           individual      
##  8 AN0007 T4              0.0769 kg              day           individual      
##  9 AN0007 T5              0.0802 kg              day           individual      
## 10 AN0020 Control T1      0.0503 kg              day           individual      
## # ℹ 569 more rows
## # ℹ 1 more variable: flag <chr>
low_intake_codes <- feed_intake %>% 
  filter(
    # (a) < 0.1 kg / day / individual
    (Out.Unit.Amount %in% c("kg", "kg dm") &
     Out.Unit.Time   == "day" &
     Out.Unit.Animals == "individual" &
     ED.Mean.T < 0.1) |
    
    # (b) 1–20 g reported in gram units
    (Out.Unit.Amount %in% c("g", "g dm") &
     ED.Mean.T >= 1 & ED.Mean.T <= 20)
  ) %>% 
  distinct(B.Code) %>%           # keep each code only once
  pull()                         # return as a plain character vector

low_intake_codes
##   [1] "AG0051"     "AN0007"     "AN0020"     "B01063"     "BO1012"    
##   [6] "BO1015"     "BO1030"     "BO1042"     "BO1048"     "BO1061"    
##  [11] "BO1062"     "BO1064"     "BO1075"     "BO1095"     "CJ1001"    
##  [16] "CJ1006"     "CJ1015"     "DK0037"     "EM1049"     "EM1060"    
##  [21] "EM1062"     "EM1065"     "EM1066"     "EM1068"     "EM1069"    
##  [26] "EM1070"     "EM1077"     "EM1081"     "EM1092"     "EM1097"    
##  [31] "EM1100"     "EO0032"     "EO0047"     "EO0086"     "EO0102"    
##  [36] "HK0005"     "HK0007"     "HK0011"     "HK0014"     "HK0017"    
##  [41] "HK0037"     "HK0045"     "HK0066.2"   "HK0067"     "HK0102.2"  
##  [46] "HK0118"     "HK0228"     "HK0260"     "HK0319"     "HK0331"    
##  [51] "HK0333"     "HK0337"     "JO1001"     "JO1010"     "JO1015"    
##  [56] "JO1020"     "JO1022"     "JO1023"     "JO1025"     "JO1026"    
##  [61] "JO1027"     "JO1033"     "JO1034"     "JO1036"     "JO1038"    
##  [66] "JO1040"     "JO1041"     "JO1042"     "JO1050"     "JO1057"    
##  [71] "JO1068"     "JO1069"     "JO1070"     "JO1071"     "JO1074"    
##  [76] "JO1075"     "JO1077"     "JO1078"     "JS0047"     "JS0048"    
##  [81] "JS0076"     "JS0079.1"   "JS0093"     "JS0095"     "JS0123"    
##  [86] "JS0124"     "JS0125"     "JS0134"     "JS0135"     "JS0139"    
##  [91] "JS0177"     "JS0192"     "JS0193"     "JS0206"     "JS0207"    
##  [96] "JS0208"     "JS0209"     "JS0281"     "JS0291"     "JS0295"    
## [101] "JS0296"     "JS0309"     "JS0342"     "JS0370"     "JS0371"    
## [106] "JS0393"     "LM0120"     "LM0266"     "LM0277.1"   "LM0289"    
## [111] "LM0310"     "LM0312"     "LM0334"     "LM0335"     "LM0354"    
## [116] "NJ0003"     "NJ0047"     "NN0021"     "NN0083"     "NN0085"    
## [121] "NN0098"     "NN0108"     "NN0114"     "NN0115"     "NN0118"    
## [126] "NN0119.2"   "NN0189"     "NN0198"     "NN0201"     "NN0205"    
## [131] "NN0211"     "NN0212"     "NN0214"     "NN0230"     "NN0251"    
## [136] "NN0272.1"   "NN0272.1.2" "NN0272.2"   "NN0321"     "NN0325"    
## [141] "NN0326"     "NN0358"     "NN0359"     "NN0367"     "NN0377"    
## [146] "NN0396"

Some papers were identified with errors. This section corrects those error if they are still present before the next update of the data.

# Step: Convert suspiciously high kg values that are likely mislabelled g
feed_intake <- feed_intake %>%
  mutate(
    Out.Unit.Amount = case_when(
      Out.Unit.Amount == "kg" & Out.Unit.Animals == "individual" & Out.Unit.Time == "day" & ED.Mean.T > 50 ~ "g",  # correct mislabeling
      TRUE ~ Out.Unit.Amount
    )
  )

2.4.3) Use of feed intake for ingredient amounts

If feed intake data is there for the entire diet and that the ingredients in ingredients dataset are in % or % of diet or g/kg we can convert

# Step 5: Estimate ingredient absolute amounts for diets with "Entire Diet" intake
ingredients <- ingredients %>%
  # Join only feed intake values where ED.Intake.Item is "Entire Diet"
  left_join(
    feed_intake %>%
      filter(ED.Intake.Item == "Entire Diet") %>%
      select(B.Code, A.Level.Name, Intake_kg_day = ED.Mean.T),
    by = c("B.Code", "A.Level.Name")
  ) %>%
  mutate(
    # Estimate absolute amount (in grams)
    D.Amount = case_when(
      D.Unit.Amount == "%" & !is.na(Intake_kg_day) ~ (D.Amount / 100) * Intake_kg_day * 1000,
      D.Unit.Amount == "% Diet" & !is.na(Intake_kg_day) ~ (D.Amount / 100) * Intake_kg_day * 1000,
      D.Unit.Amount == "g/kg" & !is.na(Intake_kg_day) ~ D.Amount * Intake_kg_day,
      TRUE ~ D.Amount
    ),
    
    # Update unit only if we did a conversion
    D.Unit.Amount = case_when(
      D.Unit.Amount %in% c("%", "% Diet", "g/kg") & !is.na(Intake_kg_day) ~ "g",
      TRUE ~ D.Unit.Amount
    )
  ) %>%
  select(-Intake_kg_day)

3) Subset data

3.1) Take out basal diets

We excluded the basal diets because, in most of the livestock trials in our database, the basal ration is offered ad libitum and therefore lacks two pieces of information that are essential for a fair, productivity-based comparison: (i) actual feed-intake measurements and (ii) corresponding enteric-methane estimates. Without intake data we cannot convert emissions—or, in many cases, even estimate them—on a per-kilogram-of-product basis, and retaining these records would artificially inflate the number of “treatments” while contributing no usable information to the efficiency analyses. By focusing on the control diet (with measured intake) and the formulated treatment diets, we keep a coherent set of observations in which every data point can be expressed as both absolute emissions and emission intensity, allowing robust, like-for-like comparisons across studies.

ingredients<-ingredients%>%
  filter(A.Level.Name!="Base")

4) Replace feed intake data where available

We prioritize feed intake data over the amounts reported in the methods section, as feed intake is the key variable needed for emission calculations. When feed intake data is unavailable, we fall back on the harmonized ingredient amounts extracted earlier.

4.1) Feed intake data at the diet level

# Step 5: Remove feed intake entries that are actually full-diet labels
feed_intake_ingredients <- feed_intake %>%
  filter(str_trim(ED.Intake.Item) != str_trim(A.Level.Name) & 
         str_trim(ED.Intake.Item) != "Entire Diet") %>%
  filter(!is.na(ED.Mean.T))


# Step 6: Merge with ingredients and override with ED.Mean.T when available
# FIXED VERSION
# Merge with ingredients and override with ED.Mean.T when available
ingredients <- ingredients %>%
  left_join(
    feed_intake_ingredients %>%
      select(B.Code, A.Level.Name, ED.Intake.Item, ED.Mean.T, Out.Unit.Amount, Out.Unit.Animals, Out.Unit.Time),
    by = c("B.Code", "A.Level.Name", "D.Item" = "ED.Intake.Item")
  ) %>%
  mutate(
    from_feed_intake = !is.na(ED.Mean.T),
    D.Amount = if_else(from_feed_intake, ED.Mean.T, D.Amount),
    D.Unit.Amount = if_else(from_feed_intake & !is.na(Out.Unit.Amount), Out.Unit.Amount, D.Unit.Amount),
    D.Unit.Animals = if_else(from_feed_intake & !is.na(Out.Unit.Animals), Out.Unit.Animals, D.Unit.Animals),
    D.Unit.Time = if_else(from_feed_intake & !is.na(Out.Unit.Time), Out.Unit.Time, D.Unit.Time)
  ) %>%
  select(-ED.Mean.T, -Out.Unit.Amount, -Out.Unit.Animals, -Out.Unit.Time) %>%
  distinct()


# Step 8: Final conversion of unit to grams (g)
ingredients <- ingredients %>%
  mutate(
    D.Amount = case_when(
      D.Unit.Amount == "kg" ~ D.Amount * 1000,
      TRUE ~ D.Amount
    ),
    D.Unit.Amount = case_when(
      D.Unit.Amount == "kg" ~ "g",
      TRUE ~ D.Unit.Amount
    )
  )

4.2) Feed intake at the diet level

TO do : See BO1070 we want the full diet to be included from feed intake which is not the case now see feed intake data the goal would be to add rows in ingredient to add info at the entire diet level have a close look to make it the same format as entire diets already in ingredients

# Step: Extract feed intake entries that refer to the entire diet
feed_intake_diet <- feed_intake %>%
  filter(ED.Intake.Item == A.Level.Name | ED.Intake.Item == "Entire Diet")

# Step: Merge and overwrite ingredients using diet-level feed intake
ingredients <- ingredients %>%
  left_join(
    feed_intake_diet %>%
      select(B.Code, A.Level.Name, ED.Intake.Item, ED.Intake.Item.Raw, ED.Mean.T, Out.Unit.Amount, Out.Unit.Animals, Out.Unit.Time),
    by = c("B.Code", "A.Level.Name", "D.Type" = "ED.Intake.Item.Raw")
  ) %>%
  mutate(
    from_feed_intake2 = !is.na(ED.Mean.T),
    D.Amount = if_else(from_feed_intake2, ED.Mean.T, D.Amount),
    D.Unit.Amount = if_else(from_feed_intake2 & !is.na(Out.Unit.Amount), Out.Unit.Amount, D.Unit.Amount),
    D.Unit.Animals = if_else(from_feed_intake2 & !is.na(Out.Unit.Animals), Out.Unit.Animals, D.Unit.Animals),
    D.Unit.Time = if_else(from_feed_intake2 & !is.na(Out.Unit.Time), Out.Unit.Time, D.Unit.Time)
  ) %>%
  select(-ED.Mean.T, -Out.Unit.Amount, -Out.Unit.Animals, -Out.Unit.Time,-ED.Intake.Item) %>%
  distinct()


# Step 8: Final conversion of unit to grams (g)
ingredients <- ingredients %>%
  mutate(
    D.Amount = case_when(
      D.Unit.Amount == "kg" ~ D.Amount * 1000,
      TRUE ~ D.Amount
    ),
    D.Unit.Amount = case_when(
      D.Unit.Amount == "kg" ~ "g",
      TRUE ~ D.Unit.Amount
    )
  )

6) Complete the nutrition and digestibility papers

Not all ingredients and diets are currently included in the digestibility and composition tables—only those for which data is available. We aim to add the missing ones to help complete the dataset and enable more comprehensive analysis.

# --- Step 1: Get all unique (B.Code, D.Item, Source) from the Diet table ---
all_diet_items <- merged_metadata$Animals.Diet %>%
  distinct(B.Code, A.Level.Name, D.Item, Source) %>%
  mutate(
    is_entire_diet = if_else(is.na(D.Item), TRUE, FALSE),
    D.Item = coalesce(D.Item, A.Level.Name)
  ) %>%
  select(B.Code, D.Item, is_entire_diet, Source) %>%
  distinct()

# --- Step 2: Get unique variables in the nutrition and digestibility datasets ---
nutrition_variables <- unique(na.omit(merged_metadata$Animals.Diet.Comp$DC.Variable))
digestibility_variables <- unique(na.omit(merged_metadata$Animals.Diet.Digest$DD.Variable))

# --- Step 3: Expand to full grid of all D.Item × variables (by B.Code and Source) ---
# Nutrition (Comp)
comp_missing <- all_diet_items %>%
  crossing(DC.Variable = nutrition_variables) %>%
  anti_join(
    merged_metadata$Animals.Diet.Comp %>% select(B.Code, D.Item, DC.Variable),
    by = c("B.Code", "D.Item", "DC.Variable")
  ) %>%
  mutate(
    DC.Value = NA_real_,
    DC.Unit = NA_character_
  )

# Digestibility (Digest)
digest_missing <- all_diet_items %>%
  crossing(DD.Variable = digestibility_variables) %>%
  anti_join(
    merged_metadata$Animals.Diet.Digest %>% select(B.Code, D.Item, DD.Variable),
    by = c("B.Code", "D.Item", "DD.Variable")
  ) %>%
  mutate(
    DD.Value = NA_real_,
    DD.Unit = NA_character_
  )

# --- Step 4: Bind the missing values into the existing datasets ---
merged_metadata$Animals.Diet.Comp <- bind_rows(merged_metadata$Animals.Diet.Comp, comp_missing)
merged_metadata$Animals.Diet.Digest <- bind_rows(merged_metadata$Animals.Diet.Digest, digest_missing)
# --- Step 5: Ensure each ingredient has a row for each nutrition variable ---
comp_completion <- merged_metadata$Animals.Diet.Comp %>%
  distinct(B.Code, D.Item, is_entire_diet, Source) %>%
  crossing(DC.Variable = nutrition_variables) %>%
  anti_join(
    merged_metadata$Animals.Diet.Comp %>% select(B.Code, D.Item, DC.Variable),
    by = c("B.Code", "D.Item", "DC.Variable")
  ) %>%
  mutate(
    DC.Value = NA_real_,
    DC.Unit = NA_character_
  )

# --- Step 6: Ensure each ingredient has a row for each digestibility variable ---
digest_completion <- merged_metadata$Animals.Diet.Digest %>%
  distinct(B.Code, D.Item, is_entire_diet, Source) %>%
  crossing(DD.Variable = digestibility_variables) %>%
  anti_join(
    merged_metadata$Animals.Diet.Digest %>% select(B.Code, D.Item, DD.Variable),
    by = c("B.Code", "D.Item", "DD.Variable")
  ) %>%
  mutate(
    DD.Value = NA_real_,
    DD.Unit = NA_character_
  )

# --- Step 7: Bind additional completion rows ---
merged_metadata$Animals.Diet.Comp <- bind_rows(merged_metadata$Animals.Diet.Comp, comp_completion)
merged_metadata$Animals.Diet.Digest <- bind_rows(merged_metadata$Animals.Diet.Digest, digest_completion)

rm(comp_completion,comp_missing,digest_completion,digest_missing)
# Step 1: Get species for each B.Code
species_by_bcode <- merged_metadata$Prod.Out %>%
  distinct(B.Code, P.Product)

# Step 2: Filter Comp table for GE values
# ge_papers <- merged_metadata$Animals.Diet.Comp %>%
#   filter(DC.Variable == "GE", !is.na(DC.Value)) %>%
#   distinct(B.Code) %>%
#   left_join(species_by_bcode, by = "B.Code") %>%
#   count(P.Product, name = "papers_with_GE")
# 
# # View result
# ge_papers

6.2) Correct nutrition units

# ── 2.  Build a “variable → units” summary table ───────────────────────────────
table_units <- merged_metadata$Animals.Diet.Comp %>%
  # keep rows where a unit is actually recorded
  filter(!is.na(DC.Unit) & DC.Unit != "NA" & str_trim(DC.Unit) != "") %>%
  # one row per unique (variable, unit) pair
  distinct(DC.Variable, DC.Unit) %>%
  # collapse multiple units for the same variable into a single comma-separated
  group_by(DC.Variable) %>%
  summarise(units = paste(sort(unique(DC.Unit)), collapse = ", "),
            .groups = "drop") %>%
  arrange(DC.Variable)

# ── 3.  Inspect / export ───────────────────────────────────────────────────────
print(table_units)
## # A tibble: 62 × 2
##    DC.Variable units                                                            
##    <chr>       <chr>                                                            
##  1 ADF         %, % DM, g, g/100g, g/100g DM, g/kg, g/kg DM                     
##  2 ADL         %, % DM, g/100g, g/100g DM, g/kg, g/kg DM                        
##  3 Ash         %, % DM, g/100g, g/100g DM, g/kg, g/kg DM, Unspecified           
##  4 CF          %, % DM, g/kg, g/kg DM                                           
##  5 CFbr        %, % DM, g/100g DM, g/kg, g/kg DM                                
##  6 CFibre      %, g/100g, g/100g DM, g/kg, g/kg(Prosopis)/ % (diet), Unspecified
##  7 CH          %, % DM, g/kg DM                                                 
##  8 CP          %, % DM, %/(g/100g for pumpkin leaves), g, g/100g, g/100g DM, g/…
##  9 CT          %, % DM, g/kg, g/kg DM, mg/g                                     
## 10 Ca          %, % DM, g/100g, g/kg, g/kg DM                                   
## # ℹ 52 more rows
# write_csv(table_units, "variable_units_lookup.csv")   # if you want a file
###############################################################################
##  Fix the mis-labelled GE unit for four specific ingredients
##  *and* only in the paper NN0106
###############################################################################

bad_items <- c(
  "Biscuit Waste Meal Dried(Sun)",
  "Leucaena leucocephala Leaves Dried",
  "Maize",
  "Wheat Offal"
)

merged_metadata$Animals.Diet.Comp <- merged_metadata$Animals.Diet.Comp %>% 
  mutate(
    DC.Unit = if_else(
      D.Item      %in% bad_items      &   # right ingredient name
      B.Code  ==  "NN0106"        &   # right paper / source
      DC.Variable ==  "GE"            &   # only Gross Energy rows
      DC.Unit     ==  "kcal/g",           # only if label is wrong
      "kcal/kg",                          # corrected label
      DC.Unit                             
    )
  )
animals <- merged_metadata$Animals.Diet.Comp %>% mutate(value_num = suppressWarnings(as.numeric(DC.Value)))

# 1. nutrient (% / g-based) ➜ g / kg DM ========================================
vars_gkg <- c("NDF", "EE", "CP", "Ash")

conv_gkg <- tribble(
  ~DC.Unit,       ~factor,
  "%",             10,
  "% DM",          10,
  "g/100g",        10,
  "g/100g DM",     10,
  "g/kg",           1,
  "g/kg DM",        1
)

animals <- animals %>%
  left_join(conv_gkg, by = "DC.Unit") %>%
  mutate(
    value_std_gkg = if_else(DC.Variable %in% vars_gkg & !is.na(factor),
                            value_num * factor, NA_real_),
    unit_std_gkg  = if_else(!is.na(value_std_gkg), "g/kg", NA_character_)
  ) %>%
  select(-factor)

# 2. energy ➜ MJ / kg (DM) =====================================================
vars_MJ <- c("GE", "ME", "NE")

conv_MJ <- tribble(
  ~DC.Unit,       ~factorMJ,
  "MJ/kg",           1,
  "MJ/kg DM",        1,
  "MJ/100g",        10,
  "MJ/100g DM",     10,
  "MJ/g",        1e3,
  "kJ/kg",       0.001,
  "kJ/kg DM",    0.001,
  "kJ/100g",     0.01,
  "kJ/100g DM",  0.01,
  "kJ/g",           1,
  "J/mg",           1,          # J per mg ×1e-6 MJ/J ×1e6 mg/kg
  "kcal/kg",     0.004184,
  "kcal/kg DM",  0.004184,
  "kcal/100g",   0.04184,
  "kcal/g",      4.184,
  "Mcal/kg",     4.184,
  "Mcal/kg DM",  4.184
)

animals <- animals %>%
  left_join(conv_MJ, by = "DC.Unit") %>%
  mutate(
    value_std_MJ  = if_else(DC.Variable %in% vars_MJ & !is.na(factorMJ),
                            value_num * factorMJ, NA_real_),
    unit_std_MJ   = if_else(!is.na(value_std_MJ), "MJ/kg", NA_character_)
  ) %>%
  select(-factorMJ)

# 3. overwrite DC.Value / DC.Unit where a conversion succeeded -----------------
animals <- animals %>%
  mutate(
    DC.Value = coalesce(value_std_gkg, value_std_MJ, value_num) |> format(scientific = FALSE),
    DC.Unit  = coalesce(unit_std_gkg, unit_std_MJ, DC.Unit)
  ) %>%
  select(-value_num, -value_std_gkg, -value_std_MJ,
         -unit_std_gkg, -unit_std_MJ)

# 4. save back into the metadata list ------------------------------------------
merged_metadata$Animals.Diet.Comp <- animals
# ── END CHUNK ──────────────────────────────────────────────────────────────────

6.3) Coverage of nutritent

coverage_raw <- merged_metadata$Animals.Diet.Comp %>%
  filter(!is.na(DC.Variable)) %>%
  mutate(
    # Clean and detect actual missing values
    DC.Value_clean = str_trim(as.character(DC.Value)),
    DC.Value_clean = na_if(DC.Value_clean, ""),
    DC.Value_clean = na_if(DC.Value_clean, "NA"),
    DC.Value_clean = na_if(DC.Value_clean, "na")
  ) %>%
  group_by(DC.Variable) %>%
  summarise(
    n_total = n(),
    n_non_missing = sum(!is.na(DC.Value_clean)),
    coverage_percent = round(100 * n_non_missing / n_total, 1),
    .groups = "drop"
  ) %>%
  arrange(desc(coverage_percent))

7.1) add nutrition and digestibility from feedipedia

feedipedia_unit_audit <- feedipedia %>%
  mutate(
    Variable = as.character(Variable),
    Unit     = trimws(Unit)
  ) %>% 
  group_by(Variable, Unit) %>% 
  summarise(n_rows = n(), .groups = "drop") %>% 
  arrange(Variable, desc(n_rows)) %>% 
  group_by(Variable) %>% 
  mutate(
    n_units = n(),
    flag    = if_else(n_units > 1, "⚠︎ multiple units", "")
  ) %>% 
  ungroup()

# ── wide human-friendly table  ---------------------------------------------
unit_summary <- feedipedia_unit_audit %>% 
  unite(unit_n, Unit, n_rows, sep = " (n = ") %>% 
  mutate(unit_n = paste0(unit_n, ")")) %>% 
  group_by(Variable, flag) %>% 
  summarise(units = paste(unit_n, collapse = "; "), .groups = "drop") %>% 
  arrange(Variable)

###7.1) Harmonize feedipedia units

# helper -------------------------------------------------------------
to_canonical <- function(val, unit, vclass){
  u  <- str_trim(tolower(unit))
  nu <- unit              # keep the original unit by default
  v  <- val               # keep the original value by default

  if (vclass == "energy"){                 # ---------- ENERGY ----------
    if (u %in% c("mcal/kg dm","mcal/kg","mcal/kg d.m.")){
      v  <- val * 4.184 ; nu <- "MJ/kg"
    } else if (u %in% c("kcal/kg dm","kcal/kg")){
      v  <- val * 0.004184 ; nu <- "MJ/kg"
    } else if (u %in% c("mj/kg","mj/kg dm")){
      nu <- "MJ/kg"
    }

  } else {                                 # -------- NUTRIENT ----------
    if (u %in% c("% dm","%","% as fed")){
      v  <- val * 10 ;   nu <- "g/kg"
    } else if (u %in% c("mg/kg dm","mg/kg")){
      v  <- val / 1000 ; nu <- "g/kg"
    } else if (u %in% c("g/100g dm","g/100g")){
      v  <- val * 10 ;   nu <- "g/kg"
    } else if (u == "g dm"){
      nu <- "g/kg"
    }
  }
  list(val = v, unit = nu)
}

# ── 0 · variable lists ──────────────────────────────────────────────────────
nutrition_vars <- str_trim(c(
  "ADF", "Crude protein", "Ether extract", "Ash","NDF"
))

energy_vars <- c(                          #  ✱ NEW: long names added here ✱
  "GE", "Gross energy",
  "ME", "Metabolizable energy",
  "NE",
  "AME", "AMEn", "TME", "TMEn",
  "ME ruminants", "ME ruminants (FAO, 1982)", "ME ruminants (gas production)",
  "MEn growing pig", "MEn rabbit",
  "NE growing pig"
)

# ── 1 · conversion ─────────────────────────────────────────────────────────
feedipedia <- feedipedia %>%
  rowwise() %>%
  mutate(
    conv = list(
      if (Variable %in% c(nutrition_vars, energy_vars))      # ← look in BOTH lists
        to_canonical(
          Avg, Unit,
          if_else(Variable %in% energy_vars, "energy", "nutrient")
        )
      else list(val = Avg, unit = Unit)      # leave untouched
    )
  ) %>%
  tidyr::unnest_wider(conv) %>%
  mutate(Avg = val, Unit = unit) %>%
  select(-val, -unit) %>%
  ungroup()

7.2) add nutrition and digestibility from feedipedia

# --- A) Add Feedipedia node from mastersheet into Comp and Digest tables
add_feedipedia_node <- function(df, mastersheet) {
  df %>%
    left_join(
      mastersheet %>% select(Edge_Value, Feedipedia),
      by = c("D.Item" = "Edge_Value")
    ) %>%
    distinct()
}

merged_metadata$Animals.Diet.Comp   <- add_feedipedia_node(merged_metadata$Animals.Diet.Comp, mastersheet)
merged_metadata$Animals.Diet.Digest <- add_feedipedia_node(merged_metadata$Animals.Diet.Digest, mastersheet)

# --- B) Standardize Feedipedia variables and assign categories
digestibility_keywords <- c(
  "digestibility", "degradability", "energy digestibility", 
  "DE", "ME", "NE", "AMEn", "OM digestibility", "DM digestibility",
  "ruminants", "rabbit", "pig", "poultry", "horse", "salmonids", "hydrolysis"
)

feedipedia <- feedipedia %>%
  mutate(
    Variable = case_when(
      tolower(Variable) == "gross energy" ~ "GE",
      tolower(Variable) == "crude protein" ~ "CP",
      tolower(Variable) == "ether extract" ~ "EE",
      tolower(Variable) == "ash" ~ "Ash",
      tolower(Variable) == "neutral detergent fibre
" ~ "NDF",
      TRUE ~ Variable
    ),
    category = case_when(
      Variable %in% c("GE", "CP", "EE", "Ash","NDF") ~ "nutrition",
      str_detect(Variable, regex(paste(digestibility_keywords, collapse = "|"), ignore_case = TRUE)) ~ "digestibility",
      TRUE ~ "nutrition"
    ),
    feedipedia_node = as.numeric(feedipedia_node)
  )

# --- C) Ensure source columns exist
if (!"nutrition_source" %in% names(merged_metadata$Animals.Diet.Comp)) {
  merged_metadata$Animals.Diet.Comp$nutrition_source <- NA_character_
}
if (!"digestibility_source" %in% names(merged_metadata$Animals.Diet.Digest)) {
  merged_metadata$Animals.Diet.Digest$digestibility_source <- NA_character_
}

# --- D) Nutrition variables: GE, CP, EE, Ash
nutrition_vars <- c("GE", "CP", "EE", "Ash","NDF")

feedipedia_nutrition <- feedipedia %>%
  filter(Variable %in% nutrition_vars) %>%
  group_by(feedipedia_node, Variable, Unit) %>%
  summarise(Value = mean(Avg, na.rm = TRUE), .groups = "drop") %>%
  rename(DC.Variable = Variable, DC.Value.feedipedia = Value)

# --- E) Join nutrition into Animals.Diet.Comp
merged_metadata$Animals.Diet.Comp <- merged_metadata$Animals.Diet.Comp %>% 
  mutate(DC.Value = suppressWarnings(as.numeric(DC.Value))) %>% 
  left_join(
    feedipedia_nutrition,
    by = c("Feedipedia" = "feedipedia_node", "DC.Variable")
  ) %>% 
  mutate(
    # 1.  If DC.Value is missing *and* Feedipedia has a value,
    #     copy the Feedipedia unit; otherwise keep the existing one
    DC.Unit  = if_else(is.na(DC.Value) & !is.na(DC.Value.feedipedia),
                       Unit,                   # unit from Feedipedia
                       DC.Unit),               # keep existing
    
    # 2.  Now copy the value itself
    DC.Value = coalesce(DC.Value, DC.Value.feedipedia),
    
    # 3.  Flag the source when we used Feedipedia
    nutrition_source = if_else(
      is.na(nutrition_source) & !is.na(DC.Value.feedipedia),
      "feedipedia", nutrition_source)
  ) %>% 
  select(-DC.Value.feedipedia, -Unit)




# --- F) Digestibility: Energy digestibility, ruminants → DE
feedipedia_digestibility <- feedipedia %>%
  filter(str_detect(tolower(Variable), "energy digestibility")) %>%
  mutate(
    DD.Variable = "DE",  # Map all energy digestibility to "DE"
    feedipedia_node = as.numeric(feedipedia_node)
  ) %>%
  group_by(feedipedia_node, DD.Variable, Unit) %>%
  summarise(DD.Value.feedipedia = mean(Avg, na.rm = TRUE), .groups = "drop")

merged_metadata$Animals.Diet.Digest <- merged_metadata$Animals.Diet.Digest %>%
  mutate(Feedipedia = as.numeric(Feedipedia)) %>%
  left_join(feedipedia_digestibility, by = c("Feedipedia" = "feedipedia_node", "DD.Variable")) %>%
  mutate(
    value_missing = is.na(DD.Value),
    DD.Value = coalesce(DD.Value, DD.Value.feedipedia),
    DD.Unit  = if_else(value_missing & !is.na(DD.Value.feedipedia), Unit, DD.Unit),
    digestibility_source = if_else(value_missing & !is.na(DD.Value.feedipedia), "feedipedia", digestibility_source)
  ) %>%
  select(-DD.Value.feedipedia, -Unit, -value_missing)
# Step 1: Get species for each B.Code
species_by_bcode <- merged_metadata$Prod.Out %>%
  distinct(B.Code, P.Product)

# # Step 2: Filter Comp table for GE values
# ge_papers <- merged_metadata$Animals.Diet.Comp %>%
#   filter(DC.Variable == "GE", !is.na(DC.Value)) %>%
#   distinct(B.Code) %>%
#   left_join(species_by_bcode, by = "B.Code") %>%
#   count(P.Product, name = "papers_with_GE")
# 
# # View result
# ge_papers

8) Infer nutritional and digestibility data from raw data and feedipedia

Now that we have both raw and Feedipedia data, we can use them to infer missing values. If some instances of an ingredient have nutritional or digestibility data while others do not, we can impute the missing values using the average of the available ones.

8.1) Nutritional composition inferrence

# --- Standardize Units in Animals.Diet.Comp ---
merged_metadata$Animals.Diet.Comp <- merged_metadata$Animals.Diet.Comp %>% 
  mutate(
    # 1) normalise text --------------------------------------------------------
    DC.Unit     = tolower(trimws(DC.Unit)),
    DC.Value    = as.numeric(DC.Value)
  ) %>% 
  
  # 2) convert values ----------------------------------------------------------
  mutate(
    DC.Value = case_when(
      # --- gross energy → MJ kg-1 -------------------------------------------
      DC.Variable == "GE" ~ case_when(
        DC.Unit %in% c("mj/kg", "mj/kg dm")                       ~ DC.Value,
        DC.Unit %in% c("kcal/kg", "kcal/kg dm")                   ~ DC.Value * 0.004184,
        DC.Unit ==  "kcal/g"                                      ~ DC.Value * 4.184,
        DC.Unit ==  "kcal/100g"                                   ~ DC.Value * 0.04184,   # fixed
        DC.Unit ==  "cal/g"                                       ~ DC.Value * 0.004184,  # fixed
        DC.Unit ==  "mcal/kg"                                     ~ DC.Value * 4.184,
        DC.Unit %in% c("kj/kg", "kj/kg dm")                       ~ DC.Value / 1000,
        DC.Unit ==  "kj/g"                                        ~ DC.Value,             # already MJ/kg
        DC.Unit %in% c("kj/100g", "kj/100g dm", "kj/100 g dm")    ~ DC.Value * 0.01,
        DC.Unit ==  "mj/100g"                                     ~ DC.Value * 10,
        DC.Unit ==  "j/mg"                                        ~ DC.Value,             # fixed
        TRUE                                                      ~ NA_real_
      ),
      
      # --- everything else → g kg-1 -----------------------------------------
      TRUE ~ case_when(
        DC.Unit %in% c("%", "% dm", "% as fed", "g/100g", "g/100g dm") ~ DC.Value * 10,
        DC.Unit %in% c("g", "g/kg", "g/kg dm")                         ~ DC.Value,
        DC.Unit %in% c("kg", "kg/day")                                 ~ DC.Value * 1000,
        TRUE                                                           ~ NA_real_
      )
    ),
    
    # 3) harmonised unit labels ----------------------------------------------
    DC.Unit = if_else(DC.Variable == "GE" & !is.na(DC.Value), "MJ/kg", "g/kg")
  )



# --- Compute Mean Value Per Ingredient and Variable ---
ingredient_means <- merged_metadata$Animals.Diet.Comp %>%
  filter(is_entire_diet == FALSE, !is.na(DC.Value)) %>%
  group_by(D.Item, DC.Variable) %>%
  summarise(mean_value = mean(DC.Value, na.rm = TRUE), .groups = "drop")

merged_metadata$Animals.Diet.Comp <- merged_metadata$Animals.Diet.Comp %>%
  left_join(ingredient_means, by = c("D.Item", "DC.Variable")) %>%
  mutate(
    nutrition_source = case_when(
      nutrition_source == "feedipedia" ~ "feedipedia",  # preserve Feedipedia
      is.na(DC.Value) & !is_entire_diet & !is.na(mean_value) ~ "inferred_from_same_ingredient",  # imputed
      is.na(nutrition_source) & !is.na(DC.Value) ~ "raw data",  # only if there's a value
      TRUE ~ nutrition_source  # keep whatever is there
    ),
    DC.Value = if_else(
      is.na(DC.Value) & !is_entire_diet & !is.na(mean_value),
      mean_value,
      DC.Value
    )
  ) %>%
  select(-mean_value)

8.2) Digestibility inferrence

# --- B) Compute Mean Digestibility Per Ingredient and Variable ---
digest_means <- merged_metadata$Animals.Diet.Digest %>%
  filter(DD.Variable == "DE", is_entire_diet == FALSE, !is.na(DD.Value)) %>%
  group_by(D.Item, DD.Variable, DD.Unit) %>%
  summarise(mean_digest = mean(DD.Value, na.rm = TRUE), .groups = "drop")


# --- C) Fill Missing Digestibility Values with Ingredient Mean ---
merged_metadata$Animals.Diet.Digest <- merged_metadata$Animals.Diet.Digest %>%
  left_join(digest_means, by = c("D.Item", "DD.Variable")) %>%
  mutate(
    digestibility_source = case_when(
      digestibility_source == "feedipedia" ~ "feedipedia",
      is.na(DD.Value) & !is_entire_diet & !is.na(mean_digest) ~ "inferred_from_same_ingredient",
      is.na(digestibility_source) ~ "raw data",
      TRUE ~ digestibility_source
    ),
    DD.Value = if_else(
      is.na(DD.Value) & !is_entire_diet & !is.na(mean_digest),
      mean_digest,
      DD.Value
    )
  ) %>%
  select(-mean_digest)

since there is only very few digestibility in the raw data logic that we dont gain any data by doing this because feedipedia is infered for all the ingredients we have the info for.

# ✅ Count how many papers had GE filled by Feedipedia, and how many of those are Cattle / Sheep / Goat

# Step 1: Filter for Feedipedia-filled GE entries
ge_filled_by_feedipedia <- merged_metadata$Animals.Diet.Comp %>%
  filter(DC.Variable == "GE", nutrition_source == "feedipedia") %>%
  distinct(B.Code)

# Step 2: Total number of studies
n_ge_feedipedia <- nrow(ge_filled_by_feedipedia)

# Step 3: Add species info and filter to ruminants
ge_species_feedipedia <- ge_filled_by_feedipedia %>%
  left_join(merged_metadata$`Prod.Out` %>% select(B.Code, P.Product), by = "B.Code") %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  distinct(B.Code, P.Product)

# Step 4: Count per species
# ge_species_counts <- ge_species_feedipedia %>%
#   count(P.Product, name = "Papers_with_GE_from_Feedipedia")
# 
# # Display summary
# cat("📌 Total papers with GE filled from Feedipedia:", n_ge_feedipedia, "\n")
# cat("📌 Of those, species breakdown:\n")
# print(ge_species_counts)
# Step 1: Get species for each B.Code
species_by_bcode <- merged_metadata$Prod.Out %>%
  distinct(B.Code, P.Product)

# # Step 2: Filter Comp table for GE values
# ge_papers <- merged_metadata$Animals.Diet.Comp %>%
#   filter(DC.Variable == "GE", !is.na(DC.Value)) %>%
#   distinct(B.Code) %>%
#   left_join(species_by_bcode, by = "B.Code") %>%
#   count(P.Product, name = "papers_with_GE")
# 
# # View result
# ge_papers
# Define species of interest
species_list <- c("Cattle", "Sheep", "Goat")

# Initialize result containers
species_summary <- list()
bcode_table <- list()

# Loop over each species
for (species in species_list) {
  
  # All B.Codes for this species
  species_bcodes <- merged_metadata$Prod.Out %>%
    filter(P.Product == species) %>%
    pull(B.Code) %>%
    unique()
  
  # Feed intake B.Codes
  feed_data <- merged_metadata$Data.Out %>%
    filter(B.Code %in% species_bcodes, !is.na(ED.Mean.T)) %>%
    select(B.Code, D.Item = ED.Intake.Item)
  
  feed_bcodes <- unique(feed_data$B.Code)
  
  # GE B.Codes
  ge_data <- merged_metadata$Animals.Diet.Comp %>%
    filter(B.Code %in% species_bcodes, DC.Variable == "GE", !is.na(DC.Value)) %>%
    select(B.Code, D.Item)
  
  ge_bcodes <- unique(ge_data$B.Code)
  
  # Matched B.Codes (join on both B.Code and D.Item)
  matched_bcodes <- inner_join(feed_data, ge_data, by = c("B.Code", "D.Item")) %>%
    pull(B.Code) %>%
    unique()
  
  # Store species-level summary
  species_summary[[species]] <- tibble(
    Species = species,
    Total_BCodes = length(species_bcodes),
    With_FEED = length(feed_bcodes),
    With_GE = length(ge_bcodes),
    With_BOTH = length(matched_bcodes)
  )
  
  # Store long-format bcode list
  bcode_table[[species]] <- tibble(
    B.Code = c(species_bcodes, feed_bcodes, ge_bcodes, matched_bcodes),
    Category = rep(c("All", "With_FEED", "With_GE", "With_BOTH"), 
                   times = c(length(species_bcodes), length(feed_bcodes), length(ge_bcodes), length(matched_bcodes))),
    Species = species
  ) %>% distinct()
}

# Bind results
species_summary_df <- bind_rows(species_summary)
bcode_table_df <- bind_rows(bcode_table)

# Show results
print(species_summary_df)
## # A tibble: 3 × 5
##   Species Total_BCodes With_FEED With_GE With_BOTH
##   <chr>          <int>     <int>   <int>     <int>
## 1 Cattle           127       127     104        23
## 2 Sheep            222       222     195        66
## 3 Goat             186       185     162        39
head(bcode_table_df)
## # A tibble: 6 × 3
##   B.Code Category Species
##   <chr>  <chr>    <chr>  
## 1 NN0188 All      Cattle 
## 2 NN0293 All      Cattle 
## 3 AG0018 All      Cattle 
## 4 AG0051 All      Cattle 
## 5 EO0041 All      Cattle 
## 6 EO0067 All      Cattle

###7.3 Coverage after Feedipedia

coverage_feedipedia <- merged_metadata$Animals.Diet.Comp %>%
  filter(!is.na(DC.Variable)) %>%
  mutate(
    # Clean and detect actual missing values
    DC.Value_clean = str_trim(as.character(DC.Value)),
    DC.Value_clean = na_if(DC.Value_clean, ""),
    DC.Value_clean = na_if(DC.Value_clean, "NA"),
    DC.Value_clean = na_if(DC.Value_clean, "na")
  ) %>%
  group_by(DC.Variable) %>%
  summarise(
    n_total = n(),
    n_non_missing = sum(!is.na(DC.Value_clean)),
    coverage_percent = round(100 * n_non_missing / n_total, 1),
    .groups = "drop"
  ) %>%
  arrange(desc(coverage_percent))

9) Tagg diet practices

9.1) Tagg ingredient with control or trt

##— build the control key from your merged MT.Out
ctrl_key <- merged_metadata$MT.Out[, .(
  B.Code,
  A.Level.Name,
  T.Name,
  T.Control
)]

##— example join back onto your ingredient table
# (assuming your ingredients DT has B.Code + A.Level.Name)
ingredients <- merge(
  x = ingredients, 
  y = ctrl_key, 
  by = c("B.Code","A.Level.Name"), 
  all.x = TRUE, 
  sort = FALSE
)

10) Relative % of each ingredient in the diet

This table presents the relative percentage of each ingredient within a diet. To ensure accurate proportion calculations, we first filtered the dataset to include only diets with consistent units. Diets with incompatible or non-harmonizable units were excluded, as their proportions could not be reliably determined.

# (a) Ingredient-based totals ----------------------------------------------------
consistent_diets <- ingredients %>%                       # ignore "Entire Diet" rows
  filter(D.Type != "Entire Diet") %>%
  group_by(B.Code, A.Level.Name) %>%
  mutate(Unit_Count = n_distinct(D.Unit.Amount)) %>%        # always 1 after conversion
  filter(Unit_Count == 1) %>%                               # keep only single-unit diets
  ungroup()

diet_completeness <- consistent_diets %>% 
  filter(D.Type != "Entire Diet") %>%
  group_by(B.Code, A.Level.Name) %>%
  summarise(
    n_total      = n(),
    n_nonmissing = sum(!is.na(D.Amount)),
    .groups      = "drop"
  ) %>%
  filter(n_total == n_nonmissing)

total_ingr <- consistent_diets %>%
  semi_join(diet_completeness, by = c("B.Code", "A.Level.Name")) %>%
  group_by(B.Code, A.Level.Name, D.Unit.Amount) %>%
  summarise(Total_Amount_ingr = sum(D.Amount), .groups = "drop")

# (b) Feed-intake figures that already represent the whole diet -----------------
feed_intake_diet_g <- feed_intake_diet %>% 
  mutate(
    # convert the numeric amount ---------------------------------------------
    Intake_Amount_g = case_when(
      Out.Unit.Amount == "kg" ~ ED.Mean.T * 1000,  # kg → g
      Out.Unit.Amount == "g"  ~ ED.Mean.T,         # already g
      TRUE                    ~ NA_real_           # any other unit → NA
    ),
    # overwrite the *unit* once it has been converted -------------------------
    Out.Unit.Amount = case_when(
      Out.Unit.Amount %in% c("kg", "g") ~ "g",      # now everything is g
      TRUE                              ~ Out.Unit.Amount
    )
  ) %>% 
  rename(D.Unit.Amount = Out.Unit.Amount) %>%       # unify the column name
  select(B.Code, A.Level.Name, D.Unit.Amount, Intake_Amount_g)


total_amounts <- full_join(
  feed_intake_diet_g,
  total_ingr,
  by = c("B.Code", "A.Level.Name", "D.Unit.Amount")
) %>%
  mutate(
    Total_Amount = coalesce(Intake_Amount_g, Total_Amount_ingr)
  ) %>%
  select(B.Code, A.Level.Name, D.Unit.Amount, Total_Amount)


# ────────────────────────────────────────────────────────────────────────────────
# 2.  Check unit correspondence  -------------------------------------------------
unit_check <- ingredients %>%
  select(B.Code, A.Level.Name, Ingredient_Unit = D.Unit.Amount) %>%
  distinct() %>%
  left_join(
    total_amounts %>%
      select(B.Code, A.Level.Name, Total_Unit = D.Unit.Amount),
    by = c("B.Code", "A.Level.Name")
  ) %>%
  mutate(Consistent_Units = Ingredient_Unit == Total_Unit)

# View the correspondence table (interactive)
datatable(
  unit_check,
  options = list(pageLength = 10, scrollX = TRUE, autoWidth = TRUE),
  caption = "Unit correspondence between ingredient rows and diet totals"
)
# Keep only diets whose units are consistent
consistent_unit_diets <- unit_check %>%
  filter(Consistent_Units) %>%
  select(B.Code, A.Level.Name)

ingredients_clean <- ingredients %>%
  semi_join(consistent_unit_diets, by = c("B.Code", "A.Level.Name"))

totals_clean <- total_amounts %>%
  semi_join(consistent_unit_diets, by = c("B.Code", "A.Level.Name"))

# ────────────────────────────────────────────────────────────────────────────────
# 3.  Calculate percentage contribution of each ingredient ----------------------
diet_percentages <- ingredients_clean %>% 
  filter(D.Type != "Entire Diet") %>%        # <-- keep the pipe!
  left_join(totals_clean,
            by = c("B.Code", "A.Level.Name", "D.Unit.Amount")) %>% 
  mutate(
    Percentage_of_Diet = ifelse(Total_Amount > 0,
                                (D.Amount / Total_Amount) * 100,
                                NA_real_)
  ) %>% 
  select(
    B.Code, A.Level.Name, D.Item, D.Type,
    D.Amount, D.Unit.Amount, Percentage_of_Diet
  )

# Interactive tables ------------------------------------------------------------
datatable(
  totals_clean,
  options = list(pageLength = 10, scrollX = TRUE, autoWidth = TRUE),
  caption = "Total amount of each diet (grams)"
)
datatable(
  diet_percentages,
  options = list(pageLength = 10, scrollX = TRUE, autoWidth = TRUE),
  caption = "Relative percentage of each ingredient within its diet"
)
# Clean up helper objects if desired
rm(consistent_diets, diet_completeness, total_ingr,
   feed_intake_diet_g, unit_check, consistent_unit_diets)
# Check if each diet sums close to 100%
validation <- diet_percentages %>%
  group_by(B.Code, A.Level.Name) %>%
  summarise(Total_Percentage = sum(Percentage_of_Diet, na.rm=TRUE)) %>%
  ungroup()

# Display diets and their total percentages
# datatable(
#   validation, 
#   options = list(
#     pageLength = 10,         # Show 10 rows per page by default
#     scrollX = TRUE,          # Enable horizontal scrolling
#     autoWidth = TRUE,        # Adjust column width automatically
#     searchHighlight = TRUE   # Highlight search terms
#   ),
#   caption = "Validation table"
# )

The lines that have 0 as validation are the ones that were feeding ad libitum for which we do not have a precise amount so it’s impossible to calculate the relative proportion of the ingredients.

11) Emission calculations

11.1 Estimation of Enteric Methane Emissions Using IPCC Tier 2

Enteric methane (CH₄) emissions were estimated using the IPCC Tier 2 methodology, which incorporates animal-specific data such as gross energy intake (GE) and methane conversion factors (Ym). This approach improves accuracy over Tier 1 by reflecting diet composition, feed intake, and species-specific differences.

11.2 Estimation of Daily GE Intake per Animal

For GE values reported at the entire diet level, daily gross energy intake per animal was calculated as:

\[ \text{GE}_{\text{daily}} = \text{GE}_{\text{diet}} \times \text{Diet Dry Amount} \]

Where:

  • \(\text{GE}_{\text{diet}}\) is the energy content of the complete diet (MJ/kg)
  • \(\text{Diet Dry Amount}\) is the daily dry matter ingredient (kg/day/animal). We can also infer from feed intake if missing.Needs to be dry matter so we will use only data when DC.is.Dry=yes.

To do that we will have ton convert all feed intake data into kg/day/animal.

When GE values were reported at the ingredient level, and information on diet composition (ingredient proportions) was available, the diet-level GE was reconstructed as:

\[ \text{GE}_{\text{diet}} = \sum_{i=1}^{n} (\text{GE}_i \times p_i) \]

Where:

  • \(\text{GE}_i\) is the GE value of ingredient \(i\)
  • \(p_i\) is the proportion of ingredient \(i\) in the total diet

Then, daily intake was computed using the same formula above.

11.3. Methane Emissions Calculation (Tier 2)

Annual methane emissions were calculated using the IPCC Tier 2 equation:

\[ \text{CH}_4 = \frac{\text{GE}_{\text{daily}} \times \text{Ym} \times 365}{55.65} \]

Where:

  • \(\text{CH}_4\) = annual enteric methane emissions (kg CH₄/animal/year)
  • \(\text{GE}_{\text{daily}}\) = daily gross energy intake per animal (MJ/day)
  • \(\text{Ym}\) = methane conversion factor:
    • Cattle: 0.065 (6.5%)
    • Sheep and Goats: 0.060 (6.0%)
  • 365 = days per year
  • 55.65 = energy content of methane (MJ/kg CH₄)

11.4 Data Requirements and Supporting Tables

1. Table of Diet Composition Proportions

This table defines the proportion of each ingredient in the overall diet:

# Create an interactive table for "total_amounts"
datatable(
  diet_percentages, 
  options = list(
    pageLength = 10,         # Show 10 rows per page by default
    scrollX = TRUE,          # Enable horizontal scrolling
    autoWidth = TRUE,        # Adjust column width automatically
    searchHighlight = TRUE   # Highlight search terms
  ),
  caption = "Interactive Table: Relative % of Ingredients"
)

Used to compute reconstructed GE_diet from ingredient-level GE values:

\[ \text{GE}_{\text{diet}} = \sum_{i=1}^{n} (\text{GE}_i \times p_i) \]


2. Table of Ingredient Dry Amounts

This table stores the amount of each ingredient fed per animal per day, already in or converted to dry matter:

# This 'ingredients' now contains all diets + basal integrated.
# Table 1: Final diet amounts with harmonized units (including basal)
datatable(
  ingredients, 
  options = list(
    pageLength = 10,         # Show 10 rows per page by default
    scrollX = TRUE,          # Enable horizontal scrolling
    autoWidth = TRUE,        # Adjust column width automatically
    searchHighlight = TRUE   # Highlight search terms
  ),
  caption = "Interactive Table: Diet Ingredients"
)

Useful when GE is per ingredient and diet is not proportionally described.


3. Table of Total Diet Dry Amounts

This is the total daily dry matter offered or consumed per animal, either directly from intake or inferred from ingredient sums:

datatable(
  total_amounts, 
  options = list(
    pageLength = 10,         # Show 10 rows per page by default
    scrollX = TRUE,          # Enable horizontal scrolling
    autoWidth = TRUE,        # Adjust column width automatically
    searchHighlight = TRUE   # Highlight search terms
  ),
  caption = "Interactive Table: Total Amounts of Diet Ingredients"
)

\[ \text{GE}_{\text{daily}} = \text{GE}_{\text{diet}} \times \text{Diet Dry Amount} \]

# 1. Total papers per species
species_counts <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Total_Studies")

# 2. All papers with GE (non-NA)
GE_data <- merged_metadata$Animals.Diet.Comp %>%
  filter(DC.Variable == "GE", !is.na(DC.Value) & DC.Value != "") %>%
  distinct(B.Code)

# 3. Papers with GE and species (intersection)
GE_species <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  semi_join(GE_data, by = "B.Code") %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Studies_with_GE")

# 4. Combine the two
summary_table <- species_counts %>%
  full_join(GE_species, by = "P.Product") %>%
  mutate(across(everything(), ~replace_na(., 0)))

# Display table
kable(summary_table, caption = "Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)")
Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)
P.Product Total_Studies Studies_with_GE
Cattle 127 104
Goat 186 162
Sheep 222 195

12) Gross energy data

12.1) Gross energy harmonization

# 1. Total papers per species
species_counts <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Total_Studies")

# 2. All papers with GE (non-NA)
GE_data <- merged_metadata$Animals.Diet.Comp %>%
  filter(DC.Variable == "GE") %>%
  filter(DC.Variable == "GE", !is.na(DC.Value) & DC.Value != "") %>%
  distinct(B.Code)

# 3. Papers with GE and species (intersection)
GE_species <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  semi_join(GE_data, by = "B.Code") %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Studies_with_GE")

# 4. Combine the two
summary_table <- species_counts %>%
  full_join(GE_species, by = "P.Product") %>%
  mutate(across(everything(), ~replace_na(., 0)))

# Display table
kable(summary_table, caption = "Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)")
Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)
P.Product Total_Studies Studies_with_GE
Cattle 127 104
Goat 186 162
Sheep 222 195
# Inspect the units used for GE


Gross_energy <- merged_metadata$Animals.Diet.Comp %>%
  filter(DC.Variable == "GE")

Gross_energy$DC.Value<-as.numeric(Gross_energy$DC.Value)

unique(Gross_energy$DC.Unit)
## [1] "g/kg"  "MJ/kg"
# Harmonize only GE values and overwrite DC.Value
## ---- harmonise_gross_energy_to_MJkg -----------------------------------------
Gross_energy <- Gross_energy %>% 
  mutate(
    ## 1) lower-case + trim once ----------------------------------------------
    unit_raw    = tolower(trimws(DC.Unit))
  ) %>% 
  
  ## 2) convert GE values to MJ/kg --------------------------------------------
  mutate(
    DC.Value = case_when(
      DC.Variable == "GE" ~ case_when(
        unit_raw %in% c("mj/kg", "mj/kg dm")                       ~ DC.Value,
        unit_raw %in% c("kcal/kg", "kcal/kg dm")                   ~ DC.Value * 0.004184,
        unit_raw ==  "kcal/g"                                      ~ DC.Value * 4.184,
        unit_raw ==  "kcal/100g"                                   ~ DC.Value * 0.04184,  # 1 kcal = 0.004184 MJ, ×10 (100 g→kg)
        unit_raw ==  "cal/g"                                       ~ DC.Value * 0.004184, # 1 cal g⁻¹ → 0.004184 MJ kg⁻¹
        unit_raw ==  "mcal/kg"                                     ~ DC.Value * 4.184,
        unit_raw %in% c("kj/kg", "kj/kg dm")                       ~ DC.Value / 1000,
        unit_raw ==  "kj/g"                                        ~ DC.Value,            # already MJ kg⁻¹ (kJ g⁻¹ ×1000)
        unit_raw %in% c("kj/100g", "kj/100g dm", "kj/100 g dm")    ~ DC.Value * 0.01,
        unit_raw ==  "mj/100g"                                     ~ DC.Value * 10,
        unit_raw ==  "j/mg"                                        ~ DC.Value,            # J mg⁻¹ → MJ kg⁻¹ (10⁻⁶ / 10⁻⁶)
        TRUE                                                       ~ NA_real_
      ),
      TRUE ~ DC.Value  # non-GE rows stay intact
    ),
    
    ## 3) update unit label ----------------------------------------------------
    DC.Unit = if_else(DC.Variable == "GE" & !is.na(DC.Value), "MJ/kg", unit_raw)
  ) %>% 
  
  ## 4) tidy up ---------------------------------------------------------------
  select(-unit_raw)




unique(Gross_energy$DC.Unit)
## [1] "g/kg"  "MJ/kg"
Gross_energy <- Gross_energy %>%
  # STEP 1: Create GE_source from nutrition_source
  mutate(
    GE_source = case_when(
      nutrition_source == "feedipedia" ~ "feedipedia",
      nutrition_source == "inferred_from_same_ingredient" ~ "inferred_from_same_ingredient",
      nutrition_source == "raw data" ~ "raw data",
      TRUE ~ NA_character_
    )
  )%>%
  select(-"nutrition_source")

g/kg ❌ Not an energy unit Likely a mistake, investigate or exclude % / % DM / % as fed ❌ Not energy units Same as above Unspecified ❌ Ambiguous Investigate or exclude

# 1. Total papers per species
species_counts <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Total_Studies")

# 2. All papers with GE (non-NA)
GE_data <- Gross_energy %>%
  filter(DC.Variable == "GE", !is.na(DC.Value) & DC.Value != "") %>%
  distinct(B.Code)

# 3. Papers with GE and species (intersection)
GE_species <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  semi_join(GE_data, by = "B.Code") %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Studies_with_GE")

# 4. Combine the two
summary_table <- species_counts %>%
  full_join(GE_species, by = "P.Product") %>%
  mutate(across(everything(), ~replace_na(., 0)))

# Display table
kable(summary_table, caption = "Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)")
Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)
P.Product Total_Studies Studies_with_GE
Cattle 127 89
Goat 186 146
Sheep 222 178

12.2 Gross energy data check

Gross_energy_check <- Gross_energy %>%
  filter(!is.na(DC.Value))%>%
  filter(GE_source=="raw data")

12.3) Gross energy for non nutritional items

# ── 1 · Identify the non-energy ingredients ───────────────────────────────────
nonenergy_types <- c("Non-ERA Feed", "Supplement")   # ← NEW: add more if needed

# ── 1 · Look-up table of those items ─────────────────────────────────────────
nonenergy_lookup <- merged_metadata$Animals.Diet %>%   # or use `ingredients`
  filter(D.Type %in% nonenergy_types) %>%              # ← NEW: %in% the vector
  distinct(B.Code, D.Item)

# ── 2 · Make sure every (paper × item) has a GE row ───────────────────────────
missing_nonera_ge <- nonenergy_lookup %>% 
  anti_join(                                               # keep only those
    Gross_energy %>% filter(DC.Variable == "GE") %>%       # already in GE table
      select(B.Code, D.Item),
    by = c("B.Code", "D.Item")
  ) %>% 
  mutate(
    DC.Variable = "GE",
    DC.Value    = 0,
    DC.Unit     = "MJ/kg",
    GE_source   = "assumed_zero"
  )

# ── 3 · Bind and override (value, unit, source) for every Non-ERA row ─────────
Gross_energy <- Gross_energy %>% 
  # add the rows that were completely missing
  bind_rows(missing_nonera_ge) %>% 
  
  # tag which rows are Non-ERA so we can overwrite them
  left_join(nonenergy_lookup %>% mutate(flag_nonera = TRUE),
            by = c("B.Code", "D.Item")) %>% 
  
  mutate(
    DC.Value  = if_else(DC.Variable == "GE" & flag_nonera, 0, DC.Value),
    DC.Unit   = if_else(DC.Variable == "GE" & flag_nonera, "MJ/kg", DC.Unit),
    GE_source = if_else(DC.Variable == "GE" & flag_nonera, "assumed_zero", GE_source)
  ) %>% 
  select(-flag_nonera)                                       # tidy up

13) Estimate gross energy for ingredients from proximate nutritional composition

To estimate Gross Energy (GE) when direct measurements are not available, we use a predictive equation based on proximate nutritional components: crude protein (CP), ether extract (EE), and ash. This method provides an accurate approximation of the energy content of feed ingredients based on their macronutrient composition.

The equation used is adapted from Weiss et al. (2019) and is expressed as:

GE (Mcal/kg) = (0.056 × CP) + (0.094 × EE) + [0.042 × (100 − CP − EE − Ash)]
GE (MJ/kg) = GE (Mcal/kg) × 4.184
# --- STEP 1: Prepare predictors (CP, EE, Ash) as % of DM ---
# Keep relevant metadata for later
metadata_cols <- merged_metadata$Animals.Diet.Comp %>%
  select(B.Code, D.Item, is_entire_diet, Feedipedia) %>%
  distinct()

ge_predictors <- merged_metadata$Animals.Diet.Comp %>%
  filter(DC.Variable %in% c("CP", "EE", "Ash")) %>%
  mutate(
    DC.Unit = trimws(tolower(DC.Unit)),
    DC.Value = as.numeric(DC.Value),

    # Harmonize to % of DM
    DC.Value = case_when(
      DC.Unit %in% c("%", "% dm", "% as fed", "g/100g", "g/100g dm") ~ DC.Value,
      DC.Unit %in% c("g", "g/kg", "g/kg dm")                         ~ DC.Value / 10,
      DC.Unit %in% c("kg", "kg/day")                                ~ DC.Value * 100,
      TRUE ~ NA_real_
    ),
    DC.Unit = "%"
  ) %>%
  group_by(B.Code, D.Item, DC.Variable) %>%
  summarise(DC.Value = mean(DC.Value, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(names_from = DC.Variable, values_from = DC.Value) %>%
  left_join(metadata_cols, by = c("B.Code", "D.Item"))

# --- STEP 2: Apply Weiss GE equation only where CP, EE, Ash are all present ---
ge_predicted <- ge_predictors %>%
  mutate(
    GE_Mcal_per_kg = ifelse(
      !is.na(CP) & !is.na(EE) & !is.na(Ash),
      (CP * 0.056) + (EE * 0.094) + ((100 - CP - EE - Ash) * 0.042),
      NA_real_
    ),
    GE_MJ_per_kg = GE_Mcal_per_kg * 4.184
  ) %>%
  select(B.Code, D.Item, GE_MJ_per_kg, is_entire_diet, Feedipedia)


# --- STEP 3: Merge and only impute where GE is missing ---
Gross_energy <- Gross_energy %>%
  
  # STEP 2: Join GE predictions (e.g. from Weiss)
  full_join(ge_predicted, by = c("B.Code", "D.Item","is_entire_diet","Feedipedia")) %>%
  
  # STEP 3: Only update GE_source if it is still missing
  mutate(
    GE_source = case_when(
      is.na(GE_source) & !is.na(GE_MJ_per_kg) & is.na(DC.Value) ~ "predicted_from_Weiss_GE",
      TRUE ~ GE_source
    ),
    DC.Value = if_else(is.na(DC.Value), GE_MJ_per_kg, DC.Value),
    DC.Unit = "MJ/kg",
    DC.Variable = "GE"
  ) %>%
  select(-GE_MJ_per_kg)
# 1. Total papers per species
species_counts <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Total_Studies")

# 2. All papers with GE (non-NA)
GE_data <- Gross_energy %>%
  filter(DC.Variable == "GE", !is.na(DC.Value) & DC.Value != "") %>%
  distinct(B.Code)

# 3. Papers with GE and species (intersection)
GE_species <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  semi_join(GE_data, by = "B.Code") %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Studies_with_GE")

# 4. Combine the two
summary_table <- species_counts %>%
  full_join(GE_species, by = "P.Product") %>%
  mutate(across(everything(), ~replace_na(., 0)))

# Display table
kable(summary_table, caption = "Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)")
Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)
P.Product Total_Studies Studies_with_GE
Cattle 127 119
Goat 186 179
Sheep 222 217

14) Infer gross energy from data

As we did in 9) we are inferring again GE data after estimation by proximal nutritional composition.

ingredient_means <- Gross_energy %>%
  filter(is_entire_diet == FALSE, !is.na(DC.Value)) %>%
  group_by(D.Item, DC.Variable) %>%
  summarise(mean_value = mean(DC.Value, na.rm = TRUE), .groups = "drop")

# --- Fill Missing Values with Mean ---
Gross_energy <- Gross_energy %>%
  left_join(ingredient_means, by = c("D.Item", "DC.Variable")) %>%
  mutate(
    GE_source = case_when(
      is.na(DC.Value) & !is_entire_diet & !is.na(mean_value) ~ "inferred_from_same_ingredient-Step2",
      TRUE ~ GE_source
    ),
    DC.Value = if_else(
      is.na(DC.Value) & !is_entire_diet & !is.na(mean_value),
      mean_value,
      DC.Value
    )
  ) %>%
  select(-mean_value)
# 1. Total papers per species
species_counts <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Total_Studies")

# 2. All papers with GE (non-NA)
GE_data <- Gross_energy %>%
  filter(DC.Variable == "GE", !is.na(DC.Value) & DC.Value != "") %>%
  distinct(B.Code)

# 3. Papers with GE and species (intersection)
GE_species <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  semi_join(GE_data, by = "B.Code") %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Studies_with_GE")

# 4. Combine the two
summary_table <- species_counts %>%
  full_join(GE_species, by = "P.Product") %>%
  mutate(across(everything(), ~replace_na(., 0)))

# Display table
kable(summary_table, caption = "Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)")
Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)
P.Product Total_Studies Studies_with_GE
Cattle 127 119
Goat 186 179
Sheep 222 217

15) Estimate Digestible energy from proximate nutritional composition

# --- STEP 4: Prepare predictors for Weiss DE equation ---
# --- STEP 1: Store relevant metadata before pivot ---
de_metadata_cols <- merged_metadata$Animals.Diet.Comp %>%
  select(B.Code, D.Item, is_entire_diet, Feedipedia) %>%
  distinct()

# --- STEP 2: Prepare DE predictors and harmonize units ---
de_predictors <- merged_metadata$Animals.Diet.Comp %>%
  filter(DC.Variable %in% c("CP", "EE", "Ash", "NDF")) %>%
  mutate(
    DC.Unit = trimws(tolower(DC.Unit)),
    DC.Value = as.numeric(DC.Value),

    # Harmonize to g/kg DM
    DC.Value = case_when(
      DC.Unit %in% c("%", "% dm", "% as fed", "g/100g", "g/100g dm") ~ DC.Value * 10,
      DC.Unit %in% c("g", "g/kg", "g/kg dm")                         ~ DC.Value,
      DC.Unit %in% c("kg", "kg/day")                                ~ DC.Value * 1000,
      TRUE ~ NA_real_
    ),
    DC.Unit = "g/kg"
  ) %>%
  group_by(B.Code, D.Item, DC.Variable) %>%
  summarise(DC.Value = mean(DC.Value, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(names_from = DC.Variable, values_from = DC.Value) %>%
  left_join(de_metadata_cols, by = c("B.Code", "D.Item"))

# --- STEP 3: Apply Weiss DE equation and convert to GE ---
de_predicted <- de_predictors %>%
  mutate(
    DE_Mcal_per_kg = ifelse(
      !is.na(CP) & !is.na(EE) & !is.na(Ash) & !is.na(NDF),
      1.044 + 0.0101 * CP + 0.0161 * EE - 0.0037 * Ash - 0.0023 * NDF,
      NA_real_
    ),
    DE_MJ_per_kg = DE_Mcal_per_kg * 4.184,
    GE_MJ_per_kg = ifelse(!is.na(DE_MJ_per_kg), DE_MJ_per_kg / 0.54, NA_real_)
  ) %>%
  select(B.Code, D.Item, GE_MJ_per_kg, is_entire_diet, Feedipedia)


# --- STEP 6: Fill remaining missing GE using DE-based predictions only if GE is still missing ---
Gross_energy <- full_join(Gross_energy, de_predicted, by = c("B.Code", "D.Item", "is_entire_diet", "Feedipedia")) %>%
  mutate(
    DC.Value = if_else(is.na(DC.Value) & !is.na(GE_MJ_per_kg), GE_MJ_per_kg, DC.Value),
    GE_source = case_when(
      is.na(GE_source) & is.na(DC.Value) & !is.na(GE_MJ_per_kg) ~ "predicted_from_Weiss_DE",
      TRUE ~ GE_source  # keep existing label (e.g., feedipedia, raw data, inferred, or missing)
    ),
    DC.Unit = "MJ/kg",
    DC.Variable = "GE"
  ) %>%
  select(-GE_MJ_per_kg)
GE_data <- Gross_energy %>%
  filter(DC.Variable == "GE", !is.na(DC.Value) & DC.Value != "") %>%
  distinct(B.Code)

# 3. Papers with GE and species (intersection)
GE_species <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  semi_join(GE_data, by = "B.Code") %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Studies_with_GE")

summary_table <- species_counts %>%
  full_join(GE_species, by = "P.Product") %>%
  mutate(across(everything(), ~replace_na(., 0)))

kable(summary_table, caption = "Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)")
Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)
P.Product Total_Studies Studies_with_GE
Cattle 127 119
Goat 186 179
Sheep 222 217

We dont get more papers seems logical because probably the same ingredient as the equation for GE.

16) Convert digestible energy into gross energy

Converts digestible energy (DE) to gross energy (GE) using a default digestibility factor (54%) when DE is reported but GE is not.

# 1. Total papers per species
species_counts <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Total_Studies")

# 2. All papers with GE (non-NA)
DE_data <- merged_metadata$Animals.Diet.Comp %>%
  filter(DC.Variable == "DE", !is.na(DC.Value) & DC.Value != "") %>%
  distinct(B.Code)

# 3. Papers with GE and species (intersection)
DE_species <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  semi_join(DE_data, by = "B.Code") %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Studies_with_DE")

# 4. Combine the two
summary_table <- species_counts %>%
  full_join(DE_species, by = "P.Product") %>%
  mutate(across(everything(), ~replace_na(., 0)))

# Display table
kable(summary_table, caption = "Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)")
Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)
P.Product Total_Studies Studies_with_DE
Cattle 127 0
Goat 186 2
Sheep 222 2

Most of the DE data is coming from feediepdia which is expressed as a % of DE so we can’t convert that data back using the IPCC method.

# # Define digestibility conversion factor (Song et al., 2025)
# default_DE_fraction <- 0.54  # 54%
# 
# # 1. Extract DE rows from merged_metadata
# de_data <- merged_metadata$Animals.Diet.Digest %>%
#   filter(DD.Variable == "DE", !is.na(DD.Value) & DD.Value != "") %>%
#   mutate(
#     DD.Value = as.numeric(DD.Value),
#     DD.Unit = trimws(tolower(DD.Unit)),
#     DD.Value = case_when(
#       DD.Unit %in% c("mj/kg", "mj/kg dm") ~ DD.Value,
#       DD.Unit == "mcal/kg" ~ DD.Value * 4.184,
#       DD.Unit == "kcal/kg" ~ DD.Value / 239,
#       DD.Unit == "kcal/kg dm" ~ DD.Value / 239,
#       DD.Unit == "kcal/100g" ~ (DD.Value * 10) / 239,
#       DD.Unit == "kj/g" ~ DD.Value / 1000,
#       DD.Unit == "kj/100g" ~ DD.Value / 100,
#       TRUE ~ NA_real_
#     ),
#     DD.Unit = "MJ/kg"
#   ) %>%
#   mutate(
#     GE_from_DE = DD.Value / default_DE_fraction
#   ) %>%
#   select(B.Code, D.Item, GE_from_DE)
# 
# # 2. Merge GE_from_DE into your existing Gross_energy dataset
# Gross_energy <- Gross_energy %>%
#   left_join(de_data, by = c("B.Code", "D.Item")) %>%
#   mutate(
#     # Flag only when GE was previously missing and DE is available
#     GE_source = case_when(
#       is.na(GE_source) & is.na(DC.Value) & !is.na(GE_from_DE) ~ "inferred_from_DE",
#       TRUE ~ GE_source
#     ),
#     # Fill in missing GE values with GE_from_DE
#     DC.Value = if_else(is.na(DC.Value), GE_from_DE, DC.Value),
#     DC.Unit = "MJ/kg",
#     DC.Variable = "GE"
#   ) %>%
#   select(-GE_from_DE)
# 1. Total studies per species
species_total <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  distinct(B.Code, P.Product)

# 2. GE: Only B.Codes where DC.Variable == GE and DC.Value is NOT NA
ge_bcodes <- Gross_energy %>%
  filter(DC.Variable == "GE", !is.na(DC.Value)) %>%
  distinct(B.Code) %>%
  pull()

# 3. Feed: Only B.Codes where D.Amount is NOT NA
feed_amount_bcodes <- ingredients %>%
  filter(!is.na(D.Amount)) %>%
  distinct(B.Code) %>%
  pull()

# 4. B.Codes with BOTH GE value AND feed amount
both_bcodes <- intersect(ge_bcodes, feed_amount_bcodes)

# 5. Build final summary table by species
final_summary <- species_total %>%
  mutate(
    Has_GE = B.Code %in% ge_bcodes,
    Has_FEED_or_Amount = B.Code %in% feed_amount_bcodes,
    Has_BOTH = B.Code %in% both_bcodes
  ) %>%
  group_by(P.Product) %>%
  summarise(
    Total_Studies = n_distinct(B.Code),
    Studies_with_GE = sum(Has_GE),
    Studies_with_FEED_or_Amount = sum(Has_FEED_or_Amount),
    Studies_with_BOTH = sum(Has_BOTH)
  ) %>%
  arrange(P.Product)

# Show summary
knitr::kable(final_summary, caption = "Final Summary (with actual GE and Feed Amount values)")
Final Summary (with actual GE and Feed Amount values)
P.Product Total_Studies Studies_with_GE Studies_with_FEED_or_Amount Studies_with_BOTH
Cattle 127 119 63 62
Goat 186 179 105 102
Sheep 222 217 147 145
# Optional: List the actual B.Codes with both
bcode_list_both <- species_total %>%
  filter(B.Code %in% both_bcodes) %>%
  arrange(P.Product, B.Code)

# Show the B.Codes table
knitr::kable(bcode_list_both, caption = "B.Codes with BOTH GE (DC.Value) and Feed Intake (D.Amount)")
B.Codes with BOTH GE (DC.Value) and Feed Intake (D.Amount)
B.Code P.Product
AG0018 Cattle
BO1033 Cattle
BO1038 Cattle
BO1040 Cattle
BO1054 Cattle
BO1055 Cattle
BO1070 Cattle
BO1072 Cattle
BO1080 Cattle
CJ1005 Cattle
CJ1006 Cattle
EM1022 Cattle
EM1047 Cattle
EM1050 Cattle
EM1079 Cattle
EM1084 Cattle
EM1101 Cattle
HK0086 Cattle
HK0138 Cattle
HK0270 Cattle
JO1004 Cattle
JO1045 Cattle
JO1046 Cattle
JO1050 Cattle
JO1053 Cattle
JO1091 Cattle
JS0176 Cattle
JS0205 Cattle
JS0213.1 Cattle
JS0214 Cattle
JS0215.1 Cattle
JS0215.2 Cattle
JS0269.2 Cattle
JS0324.1 Cattle
JS0324.2 Cattle
LM0101 Cattle
LM0115 Cattle
LM0117.1 Cattle
LM0117.2 Cattle
LM0117.3 Cattle
LM0123 Cattle
LM0124 Cattle
LM0126 Cattle
LM0151.1 Cattle
LM0151.2 Cattle
LM0227.1 Cattle
LM0227.2 Cattle
LM0228.1 Cattle
LM0228.2 Cattle
LM0229 Cattle
NJ0016 Cattle
NJ0025 Cattle
NN0005 Cattle
NN0188 Cattle
NN0207.2 Cattle
NN0216 Cattle
NN0227 Cattle
NN0253 Cattle
NN0255 Cattle
NN0280.1 Cattle
NN0299 Cattle
NN0315 Cattle
AG0042 Goat
AN0057 Goat
BO1002 Goat
BO1003 Goat
BO1005 Goat
BO1009 Goat
BO1019 Goat
BO1026 Goat
BO1029 Goat
BO1037 Goat
BO1042 Goat
BO1046 Goat
BO1065 Goat
BO1068 Goat
BO1074 Goat
BO1076 Goat
BO1079 Goat
BO1083 Goat
BO1088 Goat
BO1091 Goat
BO1093 Goat
BO1098 Goat
BO1099 Goat
CJ1002 Goat
CJ1014 Goat
EM1005 Goat
EM1011 Goat
EM1014 Goat
EM1025 Goat
EM1028 Goat
EM1030 Goat
EM1045 Goat
EM1052 Goat
EM1053 Goat
EM1056 Goat
EM1072 Goat
EM1073 Goat
EM1074 Goat
EM1077 Goat
EM1078 Goat
EM1100 Goat
EM1107 Goat
EO0078 Goat
HK0029 Goat
HK0107 Goat
HK0108 Goat
HK0143 Goat
HK0171 Goat
JO1003 Goat
JO1005 Goat
JO1010 Goat
JO1023 Goat
JO1031 Goat
JO1034 Goat
JO1039.2 Goat
JO1044 Goat
JO1048 Goat
JO1051 Goat
JO1060 Goat
JO1079 Goat
JO1080 Goat
JO1081 Goat
JO1087 Goat
JO1099 Goat
JO1100 Goat
JS0048 Goat
JS0095 Goat
JS0115 Goat
JS0118 Goat
JS0119 Goat
JS0172 Goat
JS0192 Goat
JS0197 Goat
JS0206 Goat
JS0211 Goat
JS0243 Goat
JS0261 Goat
JS0289.2 Goat
JS0292 Goat
JS0349 Goat
JS0424 Goat
LM0103 Goat
LM0163 Goat
LM0248 Goat
LM0266 Goat
LM0280 Goat
LM0322 Goat
LM0329 Goat
LM0349 Goat
NJ0013 Goat
NN0092.2 Goat
NN0211 Goat
NN0229 Goat
NN0236 Goat
NN0251 Goat
NN0272.1 Goat
NN0272.1.2 Goat
NN0276.2 Goat
NN0361 Goat
NN0362 Goat
NN0374 Goat
NN0375 Goat
AN0007 Sheep
AN0021 Sheep
AN0029 Sheep
BO1000 Sheep
BO1004 Sheep
BO1006 Sheep
BO1008 Sheep
BO1011 Sheep
BO1017 Sheep
BO1020 Sheep
BO1021 Sheep
BO1024 Sheep
BO1025 Sheep
BO1027 Sheep
BO1032 Sheep
BO1034 Sheep
BO1036 Sheep
BO1046 Sheep
BO1048 Sheep
BO1049 Sheep
BO1050 Sheep
BO1051 Sheep
BO1059 Sheep
BO1070 Sheep
BO1082 Sheep
BO1085 Sheep
BO1086 Sheep
BO1087 Sheep
BO1096 Sheep
CJ1001 Sheep
CJ1004 Sheep
CJ1015 Sheep
EM1017 Sheep
EM1020 Sheep
EM1032 Sheep
EM1034 Sheep
EM1037 Sheep
EM1038 Sheep
EM1039 Sheep
EM1046 Sheep
EM1048 Sheep
EM1075 Sheep
EM1076 Sheep
EM1081 Sheep
EM1087 Sheep
EM1090 Sheep
EM1094 Sheep
EM1102 Sheep
EM1104 Sheep
EO0046 Sheep
EO0074 Sheep
HK0011 Sheep
HK0033 Sheep
HK0154 Sheep
HK0260 Sheep
HK0329 Sheep
JO1013 Sheep
JO1015 Sheep
JO1020 Sheep
JO1022 Sheep
JO1024 Sheep
JO1027 Sheep
JO1028 Sheep
JO1030 Sheep
JO1035 Sheep
JO1036 Sheep
JO1038 Sheep
JO1039.1 Sheep
JO1039.3 Sheep
JO1039.5a Sheep
JO1041 Sheep
JO1042 Sheep
JO1052 Sheep
JO1057 Sheep
JO1059 Sheep
JO1064 Sheep
JO1082 Sheep
JO1084 Sheep
JO1085 Sheep
JO1086 Sheep
JO1094 Sheep
JS0046 Sheep
JS0140 Sheep
JS0147 Sheep
JS0207 Sheep
JS0209 Sheep
JS0210.2 Sheep
JS0249 Sheep
JS0262 Sheep
JS0263 Sheep
JS0265 Sheep
JS0271 Sheep
JS0272 Sheep
JS0273 Sheep
JS0281 Sheep
JS0285 Sheep
JS0289.1 Sheep
JS0290 Sheep
JS0291 Sheep
JS0301 Sheep
JS0311 Sheep
JS0314 Sheep
JS0344 Sheep
JS0419 Sheep
LM0018 Sheep
LM0020 Sheep
LM0104 Sheep
LM0114 Sheep
LM0139 Sheep
LM0289 Sheep
LM0295 Sheep
LM0297 Sheep
LM0298 Sheep
LM0299 Sheep
LM0311 Sheep
LM0336 Sheep
LM0339 Sheep
NJ0008 Sheep
NJ0032 Sheep
NJ0044 Sheep
NN0007 Sheep
NN0021 Sheep
NN0022 Sheep
NN0106 Sheep
NN0179 Sheep
NN0180.1 Sheep
NN0189 Sheep
NN0196 Sheep
NN0197 Sheep
NN0200 Sheep
NN0201 Sheep
NN0202 Sheep
NN0203 Sheep
NN0204 Sheep
NN0205 Sheep
NN0207.1 Sheep
NN0212 Sheep
NN0214 Sheep
NN0217 Sheep
NN0224 Sheep
NN0234 Sheep
NN0316 Sheep
NN0329 Sheep
NN0345 Sheep
NN0377 Sheep

determine ingredient types based on nutritional values

Category Description Organic Matter (% DM) Ash (% DM) Protein (% DM) NDF (% DM) ADF (% DM) ADL (% DM) Starch (% DM) Ether Extract (% DM) Gross Energy (MJ kg-¹ DM) Metabolizable Energy (MJ kg-¹ DM) DM digestibility Feeds’ examples (incl. industrial by-products)
I High-digestible energy feeds (non-fibre carbohydrate sources: starch & sugars) ~96–98 ~2–4 8–12 Low (10–20) Low (4–8) Very low (0.5–2) High (50–75) Low (2–4) ~17–19 12–14 Very high (>80 %) Maize grain, barley, wheat, wheat bran, rice bran, hominy feed, molasses, beet pulp, citrus pulp
II High-digestible energy feeds (fat/oil sources) >99 <1 <5 Very low (<5) Very low (<2) Negligible (<0.5) Negligible (<1) Very high (80–99) ~30–38 20–25 Very high (>90 %) Vegetable oils (and residues), tallow, animal fats, protected fats
III High-protein feeds ~92–95 ~5–8 35–50 Low–moderate (10–30) Low–moderate (6–10) Low (1–3) Low (<5) Low–moderate (2–6) ~18–20 11–13 Very high (>75 %) Soybean meal, canola meal, cottonseed meal, sunflower meal, distillers’ grains, fish meal, meat & bone meal, blood meal
IV Good-quality forages & moderately digestible by-products ~90–93 ~7–10 12–20 Moderate (40–50) Moderate (25–35) Moderate (3–6) Low (1–5) Low (1–3) ~16–18 8–11 Moderate (55–70 %) Young grass hay, good-quality silages, dried beet tops, peanut hulls, soybean hulls
V Poor-quality roughages & fibrous by-products ~90–94 ~6–10 3–8 High (60–75) High (40–50) High (6–10) Very low (<2) Very low (<2) ~15–17 5–8 Low (<50 %) Mature pasture, straw, bagasse, maize stover, rice hulls, peanut shells, cottonseed hulls
VI Mineral/vitamin supplements ~50 (variable) ~50–95 Negligible Negligible Negligible Negligible Negligible Negligible Negligible Negligible Highly digestible (N/A) Mineral premixes, vitamin premixes, salt licks, bone meal, mineralised animal by-products
###############################################################################
## Draw updated feed‑classification decision tree   (DiagrammeR)
###############################################################################
# install.packages("DiagrammeR")  # run once if not installed
library(DiagrammeR)
## Warning: package 'DiagrammeR' was built under R version 4.4.3
grViz("
digraph feed_tree {
  graph [rankdir = TB, fontsize = 10, fontname = Helvetica]
  node  [shape = rectangle, style = filled, fillcolor = GhostWhite,
         fontname = Helvetica, fontsize = 9]

  start   [label = 'Start\\n(NDF, CP, Ash, EE, GE)']
  VI_q    [label = 'Ash > 500 g/kg ?']
  II_q    [label = 'EE ≥ 300 g/kg\\nAND GE > 25 MJ/kg ?']
  III_q   [label = 'CP ≥ 300 g/kg ?']
  V_q     [label = 'NDF ≥ 600 g/kg ?']
  IV_q    [label = '400 ≤ NDF < 600 ?']
  I_chk   [label = 'Low fibre (<400) &\\nCP<300 & EE<300 & Ash≤500 &\\nGE 15–25 (if known) ?',
           fontsize = 8]

  VI  [label = 'VI  | Mineral / vitamin',      fillcolor = LemonChiffon]
  II  [label = 'II  | Fat / oil energy',       fillcolor = LemonChiffon]
  III [label = 'III | High‑protein',           fillcolor = LemonChiffon]
  V   [label = 'V   | Poor roughage',          fillcolor = LemonChiffon]
  IV  [label = 'IV  | Good forage /\\nmoderate by‑product', fillcolor = LemonChiffon]
  I   [label = 'I   | Starch / sugar energy',  fillcolor = LemonChiffon]
  NAx [label = 'NA  | Unclassified',           fillcolor = Tomato, fontcolor = white]

  start -> VI_q
  VI_q  -> VI [label = 'Yes']
  VI_q  -> II_q [label = 'No']

  II_q  -> II [label = 'Yes']
  II_q  -> III_q [label = 'No']

  III_q -> III [label = 'Yes']
  III_q -> V_q  [label = 'No']

  V_q   -> V  [label = 'Yes']
  V_q   -> IV_q [label = 'No']

  IV_q  -> IV [label = 'Yes']
  IV_q  -> I_chk [label = 'No']

  I_chk -> I   [label = 'Yes']
  I_chk -> NAx [label = 'No']
}
")
###############################################################################
################################################################################
## 17‑new)  Ingredient‑level feed‑class assignment
##          • averages all rows of an ingredient first
##          • NDF / CP / Ash / EE in g kg‑¹ DM ; GE in MJ kg‑¹ DM
################################################################################
library(dplyr)
library(tidyr)

vars_needed <- c("NDF", "CP", "Ash", "EE", "GE")

# ── 1 · Build one mean row per ingredient ────────────────────────────────────
nutr_wide <- merged_metadata$Animals.Diet.Comp %>% 
  filter(DC.Variable %in% vars_needed, !is_entire_diet) %>% 
  mutate(DC.Value = as.numeric(DC.Value)) %>% 
  group_by(D.Item, DC.Variable) %>%                 # ← only D.Item, no B.Code
  summarise(val = mean(DC.Value, na.rm = TRUE), .groups = "drop") %>% 
  pivot_wider(names_from = DC.Variable, values_from = val)

# ── 2 · Explicit six‑class decision rules ------------------------------------
classify_feed_gkg <- function(NDF, CP, Ash, EE, GE){
  ok  <- function(x) !is.na(x)
  nOK <- sum(ok(NDF), ok(CP), ok(Ash), ok(EE), ok(GE))
  
  if(ok(Ash) && Ash > 500)                                       return("VI")   # mineral
  if(ok(EE)  && EE  >= 300 && ok(GE) && GE > 25)                 return("II")  # fat/oil
  if(ok(CP)  && CP  >= 300)                                      return("III") # protein
  if(ok(NDF) && NDF >= 600)                                      return("V")   # poor roughage
  if(ok(NDF) && NDF >= 400 && NDF < 600)                         return("IV")  # good forage
  if(nOK > 0 &&
     (!ok(NDF) || NDF < 400) &&
     (!ok(CP)  || CP  < 300) &&
     (!ok(EE)  || EE  < 300) &&
     (!ok(Ash) || Ash <= 500) &&
     (!ok(GE)  || (GE >= 15 & GE <= 25)))                        return("I")   # starch/sugar
  
  NA_character_                                                                # unclassified
}

# ── 3 · Apply the classifier --------------------------------------------------
nutr_wide <- nutr_wide %>% 
  rowwise() %>% 
  mutate(Category = classify_feed_gkg(NDF, CP, Ash, EE, GE)) %>% 
  ungroup()

# ── 4 · Carry the ingredient‑level Category back into your data --------------
#    (join only on D.Item because the class is the same for every paper)
merged_metadata$Animals.Diet.Comp <- merged_metadata$Animals.Diet.Comp %>% 
  left_join(nutr_wide %>% select(D.Item, Category), by = "D.Item")

if (exists("ingredients")) {
  ingredients <- ingredients %>% 
    left_join(nutr_wide %>% select(D.Item, Category), by = "D.Item")
}

# ── 5 · Quick audit ----------------------------------------------------------
table(nutr_wide$Category, useNA = "ifany")
## 
##    I   II  III   IV    V <NA> 
##  597    9  138  273  264  646
################################################################################

supplements

## A · one authoritative lookup ------------------------------------------------
type_lookup <- merged_metadata$Animals.Diet %>% 
  distinct(B.Code, D.Item, D.Type)

vi_types <- c("Non-ERA Feed", "Supplement")
vi_regex <- regex("supplement", ignore_case = TRUE)

set_vi <- function(df){
  df2 <- df
  
  # add D.Type if it isn't there yet
  if(!"D.Type" %in% names(df2)){
    df2 <- df2 %>% left_join(type_lookup, by = c("B.Code", "D.Item"))
  }
  
  # if D.Type still missing, just leave Category as-is
  if(!"D.Type" %in% names(df2)){
    warning("D.Type still absent after join – Category unchanged for this table")
    return(df2)
  }
  
  df2 %>% 
    mutate(
      Category = if_else(
        !is.na(D.Type) &
        (D.Type %in% vi_types | str_detect(D.Type, vi_regex)),
        "VI",
        Category
      )
    )
}

## B · run it on both tables ---------------------------------------------------
merged_metadata$Animals.Diet.Comp <- set_vi(merged_metadata$Animals.Diet.Comp)
## Warning in left_join(., type_lookup, by = c("B.Code", "D.Item")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 279 of `x` matches multiple rows in `y`.
## ℹ Row 5198 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
if (exists("ingredients")) {
  ingredients <- set_vi(ingredients)
}

## optional sanity check
table(merged_metadata$Animals.Diet.Comp$Category, useNA = "ifany")
## 
##      I     II    III     IV      V     VI   <NA> 
##  79495    704  47565  42525  60043  73472 217534
###############################################################################
## A · rebuild / refresh Category column in nutr_wide --------------------------
nutr_wide <- nutr_wide %>% 
  select(-Category) %>%                                  # drop any old class
  left_join(
    merged_metadata$Animals.Diet.Comp %>% 
      distinct(D.Item, Category),                        # <- no B.Code here
    by = "D.Item"
  )


## B · 1) overall share of rows that now have a class --------------------------
overall <- nutr_wide %>% 
  summarise(
    Total_rows    = n(),
    Assigned_rows = sum(!is.na(Category)),
    `% assigned`  = round(100 * Assigned_rows / Total_rows, 1)
  )

kable(overall, caption = "Overall coverage of the Category field")
Overall coverage of the Category field
Total_rows Assigned_rows % assigned
1958 1392 71.1
## C · 2) number of **unique** ingredients per category ------------------------
uniq_per_cat <- nutr_wide %>% 
  filter(!is.na(Category)) %>% 
  group_by(Category) %>% 
  summarise(
    Unique_ingredients = n_distinct(D.Item),
    Rows               = n(),                 # row count if you still want it
    .groups = "drop"
  ) %>% 
  arrange(Category)

kable(uniq_per_cat, caption = "Unique ingredients by category")
Unique ingredients by category
Category Unique_ingredients Rows
I 594 594
II 8 8
III 135 135
IV 273 273
V 263 263
VI 119 119
## D · 3) ten *random* ingredients in each category ---------------------------
## D · 3) ten *random* ingredients in each category ---------------------------
set.seed(222)   # remove or change for a different random draw

rand10 <- nutr_wide %>% 
  filter(!is.na(Category)) %>% 
  group_by(Category) %>% 
  slice_sample(prop = 1) %>%         # randomise the whole group
  slice_head(n = 10) %>%             # keep up to 10 rows
  mutate(rank = row_number()) %>% 
  ungroup()

rand10_wide <- rand10 %>% 
  select(Category, rank, D.Item) %>% 
  tidyr::pivot_wider(names_from = rank, values_from = D.Item)

kable(rand10_wide,
      caption = "Ten random ingredients in every category")
Ten random ingredients in every category
Category 1 2 3 4 5 6 7 8 9 10
I Gamba grass 50% starter Hominy Sweet Potato Peels Grower Diet 1 Cotton Seed Cake||Milling;Undecorticated Calliandra calothyrsus Leaves Dried(Air,Shade) Ground Finisher Pithecellobium dulce Leaves Dried(Sun) Grewia similis Dried
II Sesame Seed Dried Ground Soaked Sunflower Seed Sunflower Seed Ground Sesbania sesban Seed Sesame Seed Ground Sesame Seed Sesame Seed Cake Autoclaved Dried Ground Sunflower Ground NA NA
III Corn Steep Liquor Soybean Meal Winged Bean Meal Pelleted Sun-dried Fish Meal Soybean||Milling Termite Dried Ground Cotton Seed Cake Ground Sesbania sesban Leaves Dried(Air) Linseed Cake Canola Cake
IV Oat||Drying Pasture Ficus exasperata Maize||Chopping||Ensilation Wheat||Milling Tribulus terrestris Euphorbia hirta Poultry Litter Deep Stacked Dried Groundnut Haulm Cocoa Bean Shell Dried Ground
V Grass Dried HSB silage Unspecified Grass Native Grass||Chopping||Drying Centrosema pubescens Leaves Andropogon Forest Pasture Pennisetum purpureum Wild Chopped Trifolium alexandrinum Straw||Urea Treatment 3% Urea treated maize stover with 5% molasses
VI Monocalcium Phosphate Kanwa-Potassium Salt Elancoban Minerals and Vitamins Premix Avatec Free Gossypol Calcium Oxide Mineral Lick Binder Bone Ash
###############################################################################
## E · Ingredients that remain unclassified (Category == NA)
###############################################################################
###############################################################################
## E · Ingredients still unclassified, counted by unique study
##     (“one vote per B.Code”)
###############################################################################
unclass_freq <- merged_metadata$Animals.Diet.Comp %>% 
  filter(is.na(Category)) %>% 
  filter(is_entire_diet=="FALSE")%>%# keep only the NA‑class rows
  distinct(D.Item, B.Code) %>%         # one row per ingredient *per study*
  count(D.Item, sort = TRUE, name = "Studies")   # how many studies

kable(head(unclass_freq, 20),
      caption = "Top 20 unclassified ingredients (ranked by # of studies)")
Top 20 unclassified ingredients (ranked by # of studies)
D.Item Studies
Water 232
Salt 173
Concentrate 58
Wheat Offal 46
Maize Meal 35
Soybean Oil 21
Blood Meal 17
Vegetable Oil 17
Palm Oil 16
Cod Liver Oil 12
Rice Offal 12
Lucerne 11
Oyster Shells 11
Sunflower Oil 11
Maize Oil 10
Unspecified Molasses Meal 10
Fish Oil 9
Salt Block 9
Salt Lick 7
Cashew Shell 6

practice diet comparison

# ──────────────────────────────────────────────────────────────────────────────
# 0 · Pre-processing helpers ----------------------------------------------------
# Convert D.Amount to numeric and drop strict duplicates that would otherwise
# blow up the joins further down the pipeline.
# ──────────────────────────────────────────────────────────────────────────────

ingredients_clean <- ingredients %>% 
  mutate(D.Amount = as.numeric(D.Amount)) %>% 
  distinct(B.Code, A.Level.Name, D.Item, D.Amount,
           D.Unit.Amount, D.Unit.Time, D.Unit.Animals,
           T.Control,                             # <- keep it
           .keep_all = TRUE)


# Optional sanity check ---------------------------------------------------------

dup_left <- ingredients_clean %>%
  count(B.Code, A.Level.Name, D.Item, D.Amount,
        D.Unit.Amount, D.Unit.Time, D.Unit.Animals) %>%
  filter(n > 1)

if (nrow(dup_left) == 0) {
  message("✅ all identical-amount duplicates removed.")
} else {
  message("⚠️ still some duplicates—investigate dup_left.")
}
## ⚠️ still some duplicates—investigate dup_left.
# ──────────────────────────────────────────────────────────────────────────────
# quantify_practice_changes() ---------------------------------------------------
#   Computes how one diet changes relative to another within each study.
#   * Works diet-by-diet, study-by-study (keyed by B.Code)
#   * Collapses duplicate ingredient rows *within* each diet before joining
#   * Avoids Cartesian explosions without requiring allow.cartesian = TRUE
# ──────────────────────────────────────────────────────────────────────────────

quantify_practice_changes <- function(ingredients_clean, min_pct = 0.5) {
  ing <- as.data.table(copy(ingredients_clean))  # never modify caller’s object

  #── 0 · basic housekeeping ---------------------------------------------------
  ing[, D.Amount := as.numeric(D.Amount)]
  ing[, T.Control := tolower(trimws(T.Control))]

  # kg → g so we have only one mass basis to compare
  ing[D.Unit.Amount == "kg", `:=`(D.Amount = D.Amount * 1000,
                                    D.Unit.Amount = "g")]

  # % of diet DM per ingredient (computed diet-by-diet)
  ing[, total_g := sum(D.Amount, na.rm = TRUE), by = .(B.Code, A.Level.Name)]
  ing[, pct_diet := (D.Amount / total_g) * 100]

  #── 1 · work study-by-study --------------------------------------------------
  result <- ing[, {
    dt <- .SD                         # only rows for this B.Code (paper)

    # Identify control diets BEFORE collapsing, because we need T.Control ----
    ctrl  <- unique(dt[T.Control == "yes", A.Level.Name])
    diets <- unique(dt$A.Level.Name)

    # Collapse duplicate ingredient rows *within* each diet -------------------
    dt <- dt[, .(
      Category          = first(Category),          # keep the category
      pct_diet        = sum(pct_diet),
      D.Unit.Amount   = first(D.Unit.Amount),
      D.Unit.Time     = first(D.Unit.Time),
      D.Unit.Animals  = first(D.Unit.Animals)
    ), by = .(A.Level.Name, D.Item)]

    # Build comparison pairs --------------------------------------------------
    pairs <- list()
    if (length(ctrl)) {
      trt <- setdiff(diets, ctrl)
      if (length(trt)) {
        for (c_diet in ctrl) {
          for (t_diet in trt) {
            pairs[[length(pairs) + 1]] <- c(c_diet, t_diet)
          }
        }
      }
    }
    if (length(diets) > 1) {
      pairs <- c(pairs, combn(diets, 2, simplify = FALSE))
    }

    # If no pairs, emit empty DT ---------------------------------------------
    if (length(pairs) == 0) {
      NULL
    } else {
      rbindlist(lapply(pairs, function(p) {
        diet_A <- p[1]; diet_B <- p[2]
        dA <- dt[A.Level.Name == diet_A]
        dB <- dt[A.Level.Name == diet_B]

        grid <- merge(
          dA[, .(D.Item, Category, pct_A = pct_diet,
                   unit_A = paste(D.Unit.Amount, D.Unit.Time, D.Unit.Animals))],
          dB[, .(D.Item, Category, pct_B = pct_diet,
                   unit_B = paste(D.Unit.Amount, D.Unit.Time, D.Unit.Animals))],
          by = c("D.Item", "Category"), all = TRUE, allow.cartesian = FALSE
        )

        # Unit compatibility check -------------------------------------------
        grid[, unit_ok := (is.na(unit_A) | is.na(unit_B) | unit_A == unit_B)]
        unit_mismatch <- grid[unit_ok == FALSE, D.Item]
        grid <- grid[unit_ok == TRUE]

        # Fill NAs, compute deltas, apply threshold ---------------------------
        grid[is.na(pct_A), pct_A := 0]
        grid[is.na(pct_B), pct_B := 0]
        grid[, delta := pct_B - pct_A]
        grid <- grid[abs(delta) >= min_pct]

        if (nrow(grid) == 0) return(NULL)

        # Aggregate change metrics (overall) ----------------------------------
        total_added    <- grid[pct_A == 0 & delta  > 0,  sum(delta)]
        total_removed  <- grid[pct_B == 0 & delta  < 0, -sum(delta)]
        total_increase <- grid[pct_A > 0 & pct_B > 0 & delta > 0,  sum(delta)]
        total_decrease <- grid[pct_A > 0 & pct_B > 0 & delta < 0, -sum(delta)]
        n_changed      <- nrow(grid)

        # Aggregate *by D.Type_Nutri* ----------------------------------------------
        fmt <- function(dt) {
          if (nrow(dt) == 0) return(NA_character_)
          paste(paste0(dt$Category, ":", round(dt$V1,2)), collapse = "; ")
        }

        add_by_type  <- fmt(grid[pct_A == 0 & delta  > 0,  .(V1 = sum(delta)), by = Category])
        rem_by_type  <- fmt(grid[pct_B == 0 & delta  < 0,  .(V1 = -sum(delta)), by = Category])
        inc_by_type  <- fmt(grid[pct_A > 0 & pct_B > 0 & delta > 0,  .(V1 = sum(delta)), by = Category])
        dec_by_type  <- fmt(grid[pct_A > 0 & pct_B > 0 & delta < 0,  .(V1 = -sum(delta)), by = Category])

        # Practice label ------------------------------------------------------
        practice_type <- fifelse(total_added   > 0 & total_removed > 0, "substitution",
                          fifelse(total_added   > 0 & total_removed == 0, "addition",
                          fifelse(total_added   == 0 & total_removed > 0, "removal",
                                 "dosage_change")))

        compare_to <- if (diet_A %in% ctrl) "control" else paste0("other_trt:", diet_A)

        data.table(
          diet_A               = diet_A,
          diet_B               = diet_B,
          compare_to           = compare_to,
          practice_type        = practice_type,
          total_pct_added      = round(total_added,   2),
          total_pct_removed    = round(total_removed, 2),
          total_pct_increase   = round(total_increase,2),
          total_pct_decrease   = round(total_decrease,2),
          added_by_type        = add_by_type,
          removed_by_type      = rem_by_type,
          increase_by_type     = inc_by_type,
          decrease_by_type     = dec_by_type,
          n_ingredients_changed= n_changed,
          unit_mismatch        = paste(unit_mismatch, collapse = "; ")
        )
      }), fill = TRUE)
    }
  }, by = .(B.Code)]

  # final column order ---------------------------------------------------------
  setcolorder(result,
              c("B.Code","diet_A","diet_B","compare_to","practice_type",
                "total_pct_added","total_pct_removed",
                "total_pct_increase","total_pct_decrease",
                "added_by_type","removed_by_type",
                "increase_by_type","decrease_by_type",
                "n_ingredients_changed","unit_mismatch"))

  as.data.frame(result)
}


changes <- quantify_practice_changes(ingredients_clean, min_pct = 1)  # 1 % threshold


changes<- changes %>% 
  mutate(pair_id = map2_chr(diet_A, diet_B, ~paste(sort(c(.x, .y)), collapse = "|"))) %>% 
  distinct(B.Code, pair_id, .keep_all = TRUE) %>%        # keep the first orientation
  select(-pair_id)
# ── 1 · Helper: explode one “…_by_type” column into tidy rows ─────────────
explode <- function(.data, col, kind) {                         # { 1
  .data %>% 
    select(B.Code, diet_A, diet_B, all_of(col)) %>% 
    mutate(pair_id = paste(diet_A, "→", diet_B)) %>%            # tag each comparison
    mutate(across(all_of(col), ~ ifelse(is.na(.x) | .x == "", NA, .x))) %>% 
    separate_rows(all_of(col), sep = ";\\s*") %>%               # "A:10; B:20" → rows
    filter(!is.na(.data[[col]])) %>% 
    separate(.data[[col]], into = c("Category", "pct"),           # "A:10" → A | 10
             sep = ":", convert = TRUE, fill = "right") %>% 
    mutate(
      # ─ collapse categories ────────────────────────────────────────────
      Category = case_when(
        str_detect(Category, regex("supplement",  ignore_case = TRUE)) ~ "Supplement",
        str_detect(Category, regex("concentrate", ignore_case = TRUE)) ~ "Concentrate",
        TRUE                                                        ~ Category
      ),
      change_kind = kind
    ) %>% 
    filter(Category != "Entire Diet")                               # drop whole-diet rows
}                                                                 # } 1

# ── 2 · Build a long table of ingredient-type moves ────────────────────
type_changes <- bind_rows(                                       # ( 2
  explode(changes, "added_by_type",    "added"),
  explode(changes, "removed_by_type",  "removed"),
  explode(changes, "increase_by_type", "increase"),
  explode(changes, "decrease_by_type", "decrease")
)                                                                 # ) 2

# ── 3 · Count how many comparisons show each Practice × Type combo ────
summary <- type_changes %>% 
  group_by(Category, change_kind) %>% 
  summarise(n_comparisons = n_distinct(pair_id), .groups = "drop")

# ── 4 · Plot: bar height = number of comparisons ───────────────────────
ggplot(summary,
       aes(Category, n_comparisons, fill = change_kind)) +
  geom_col(position = "dodge") +
  labs(title = "How often each practice affected each ingredient type",
       x     = "Ingredient type",
       y     = "# of diet-pair comparisons",
       fill  = "Change kind") +
  theme_bw() +
  theme(axis.text.x  = element_text(angle = 45, hjust = 1),
        legend.position = "top")

changes %>% 
  mutate(abs_delta = case_when(
            practice_type == "addition"      ~  total_pct_added,
            practice_type == "removal"       ~  total_pct_removed,
            practice_type == "substitution"  ~  pmax(total_pct_added,
                                                    total_pct_removed),
            practice_type == "dosage_change" ~  pmax(total_pct_increase,
                                                    total_pct_decrease)
          )) %>% 
  ggplot(aes(practice_type, abs_delta, fill = practice_type)) +
  geom_violin(trim = FALSE, alpha = .4, colour = NA) +
  geom_boxplot(width = .15, outlier.shape = NA) +
  stat_summary(fun = median, geom = "point", size = 2, colour = "black") +
  labs(x = NULL, y = "Magnitude (%-points of diet DM)",
       title = "Distribution of practice intensities") +
  theme_bw() +
  theme(legend.position = "none")

17) Calculation of CH4 emissions

To estimate enteric methane emissions, a methane conversion factor (Ym) is applied to the gross energy (GE) intake of livestock. Ym represents the proportion of the animal’s gross energy intake that is converted to methane and varies based on the species, productivity level, and diet composition. The IPCC (2019) provides updated Ym values based on feed digestibility and forage content. Below is the logic applied:

kable(
  data.frame(
    Category = c("Dairy Cattle (High)", "Dairy Cattle (Medium)", "Dairy Cattle (Low)", "Dairy Cattle (Unknown)",
                 "Meat Cattle (>75% forage)", "Meat Cattle (15–75% forage)", "Meat Cattle (<15% forage)", "Meat Cattle (Unknown)",
                 "Sheep", "Lambs", "Goats"),
    Ym_Percent = c(5.7, 6.3, 6.5, 6.0, 7.0, 6.3, 4.0, 6.5, 6.7, 4.5, 5.5),
    Condition = c("> 8500 kg milk/year", "5000–8500 kg milk/year", "< 5000 kg milk/year", "Unknown milk yield",
                  "Forage > 75%", "Forage 15–75%", "Forage < 15%", "Unknown forage %", 
                  "Default for Sheep", "If 'lamb' in Herd.Stage", "Default for Goats")
  ),
  caption = "Assigned Ym Values Based on IPCC 2019 Guidelines and Available Data"
)
Assigned Ym Values Based on IPCC 2019 Guidelines and Available Data
Category Ym_Percent Condition
Dairy Cattle (High) 5.7 > 8500 kg milk/year
Dairy Cattle (Medium) 6.3 5000–8500 kg milk/year
Dairy Cattle (Low) 6.5 < 5000 kg milk/year
Dairy Cattle (Unknown) 6.0 Unknown milk yield
Meat Cattle (>75% forage) 7.0 Forage > 75%
Meat Cattle (15–75% forage) 6.3 Forage 15–75%
Meat Cattle (<15% forage) 4.0 Forage < 15%
Meat Cattle (Unknown) 6.5 Unknown forage %
Sheep 6.7 Default for Sheep
Lambs 4.5 If ‘lamb’ in Herd.Stage
Goats 5.5 Default for Goats

17.1) Ym estimations

# Step 1: Identify purpose of each B.Code from the production output table
# Dairy = Milk Yield, Meat = Meat Yield

#harmonize yield data units 

yield_data<-merged_metadata$Data.Out%>%
  filter(Out.Subind %in% c("Meat Yield","Milk Yield"))

yield_data <- yield_data %>%
  mutate(
    Out.Unit = tolower(trimws(Out.Unit)),  # Normalize casing and whitespace
    Out.Unit = case_when(
      # Common direct mappings
      Out.Unit %in% c("kg", "kg/d", "kg/day", "kg/day/individual", "kg/individual/day", 
                      "kg/d/individual", "kg/individual") ~ "kg/individual/day",
      
      Out.Unit %in% c("g", "g/d", "g/day", "g/day/individual", "g/d/individual", 
                      "g/individual/day", "g/individual") ~ "g/individual/day",
      
      Out.Unit %in% c("mg/d/individual") ~ "mg/individual/day",
      
      Out.Unit %in% c("l/d", "l/day", "l/d/individual", "l/individual/day", "L/individual/day",
                      "l/individual") ~ "l/individual/day",
      
      Out.Unit %in% c("ml/d/individual", "ml/individual", "ml/individual/day") ~ "ml/individual/day",
      
      
      # Composite or unclear units
      Out.Unit == "g/kg/individual" ~ "g/kg/individual",

      
      Out.Unit == "kg/replicate/experiment" ~ "kg/replicate/experiment",
      
      TRUE ~ Out.Unit  # fallback
    )
  )

yield_data <- yield_data %>%
  mutate(
    Out.Unit = tolower(trimws(Out.Unit)),

    # Convert to standard units
    ED.Mean.T = case_when(
      Out.Unit == "g/individual/day" ~ ED.Mean.T / 1000,         # g → kg
      Out.Unit == "mg/individual/day" ~ ED.Mean.T / 1e6,         # mg → kg
      Out.Unit == "ml/individual/day" ~ ED.Mean.T / 1000,        # ml → l
      TRUE ~ ED.Mean.T                                            # leave others
    ),

    # Normalize units
    Out.Unit = case_when(
      Out.Unit %in% c("kg/individual/day", "g/individual/day", "mg/individual/day") ~ "kg/individual/day",
      Out.Unit %in% c("l/individual/day", "ml/individual/day") ~ "l/individual/day",
      TRUE ~ NA_character_  # mark invalid units
    )
  ) %>%
  # Keep only acceptable units
  filter(Out.Unit %in% c("kg/individual/day", "l/individual/day"))

milk_yield <- yield_data %>%
  filter(Out.Subind == "Milk Yield", Out.Unit == "kg/individual/day") %>%
  group_by(B.Code) %>%
  summarise(
    milk_kg_year = mean(ED.Mean.T, na.rm = TRUE) * 365,
    .groups = "drop"
  )


# Step 1: Get purpose (Milk vs. Meat) per B.Code, prioritizing Milk if both exist
purpose_by_code <- merged_metadata$Data.Out %>%
  filter(Out.Subind %in% c("Milk Yield", "Meat Yield")) %>%
  mutate(
    Purpose = case_when(
      Out.Subind == "Milk Yield" ~ "Dairy",
      Out.Subind == "Meat Yield" ~ "Meat",
      TRUE ~ NA_character_
    )
  ) %>%
  group_by(B.Code) %>%
  # Prioritize Dairy if both exist
  summarise(Purpose = if ("Dairy" %in% Purpose) "Dairy" else first(Purpose), .groups = "drop")



ingredients_cleaned <- ingredients %>%
  mutate(
    D.Amount = ifelse(D.Unit.Amount == "g", D.Amount / 1000, D.Amount),
    D.Unit.Amount = ifelse(D.Unit.Amount == "g", "kg", D.Unit.Amount)
  )

# Step 3: Keep only diets where all units are now consistent (all in kg)
diet_valid_units <- ingredients_cleaned %>%
  group_by(B.Code, A.Level.Name) %>%
  filter(n_distinct(D.Unit.Amount, na.rm = TRUE) == 1) %>%
  ungroup()


# Step 2: Tag forage types (you can expand this list as needed)
forage_types <- c("Forage Trees", "Crop Byproduct", 
                  "Herbaceous Fodders", "Agroforestry Fodders")

forage_proportions <- diet_valid_units %>%
  filter(D.Type != "Entire Diet") %>%
  group_by(B.Code, A.Level.Name) %>%
  summarise(
    any_na = any(is.na(D.Amount)),  # Check for NA in D.Amount
    total_diet = ifelse(any_na, NA_real_, sum(D.Amount, na.rm = TRUE)),
    total_forage = ifelse(any_na, NA_real_, sum(D.Amount[D.Type %in% forage_types], na.rm = TRUE)),
    n_ingredients = n(),
    is_forage = all(D.Type[1] %in% forage_types),
    D.Type.first = D.Type[1],
    D.Unit.Amount = first(D.Unit.Amount),
    forage_prop = case_when(
      n_ingredients == 1 & D.Type.first %in% forage_types ~ 1,
      n_ingredients == 1 & !(D.Type.first %in% forage_types) ~ 0,
      total_diet > 0 ~ total_forage / total_diet,
      TRUE ~ NA_real_
    ),
    .groups = "drop"
  )
# 
# ingredients <- ingredients %>%                       # your table
#   # 1 ── throw away the old “source‑based” label -----------------------------
#   select(-D.Type) %>%                                       # drop the column
# 
#   # 2 ── remove any diet that still has an NA in D.Type_Nutri ----------------
#   group_by(B.Code, T.Name) %>%       # <<– keys that identify a single diet
#   filter( all(!is.na(D.Type_Nutri)) ) %>%  # keep diet only if none are NA
#   ungroup()

17.2) Reconstruct Gross Energy at the diet level from ingredient data

Reconstructs the gross energy of the entire diet from the weighted average of GE values of individual ingredients, when diet-level GE is not reported.

# --- Calculate GE_diet (MJ/kg) from ingredient GE and amount ---

# 1. Join GE values to ingredient amounts
# --- Calculate GE_diet (MJ/kg) ONLY if all ingredients have GE ---

# 1. Join GE values to ingredient amounts
ingredients_with_GE <- ingredients %>%
  filter(D.Type!="Entire Diet")%>%
  mutate(Diet_ID = paste(B.Code, A.Level.Name, sep = "__")) %>%
  left_join(
    Gross_energy %>%
      filter(!is_entire_diet) %>%
      select(B.Code, D.Item, DC.Value),
    by = c("B.Code", "D.Item")
  ) %>%
  mutate(
    D.Amount_kg = case_when(
      D.Unit.Amount == "g" ~ D.Amount / 1000,
      D.Unit.Amount == "kg" ~ D.Amount,
      TRUE ~ NA_real_
    )
  )

# 2. Count ingredients with/without GE
ingredient_counts <- ingredients_with_GE %>%
  group_by(Diet_ID) %>%
  summarise(
    n_total = n(),
    n_with_GE = sum(!is.na(DC.Value)),
    .groups = "drop"
  ) %>%
  filter(n_total == n_with_GE)

# 3. Reconstruct GE_diet (MJ/kg)
GE_from_ingredients <- ingredients_with_GE %>%
  semi_join(ingredient_counts, by = "Diet_ID") %>%
  group_by(Diet_ID, B.Code, A.Level.Name) %>%
  summarise(
    GE_diet = sum(DC.Value * D.Amount_kg) / sum(D.Amount_kg),
    .groups = "drop"
  )

# 4. Use GE_diet when full diet-level GE is missing
# Join forage proportions into GE_diet_reconstructed
GE_diet_reconstructed <- GE_from_ingredients %>%
  left_join(total_amounts %>%
              select(B.Code, A.Level.Name, Total_Amount, D.Unit.Amount),
            by = c("B.Code", "A.Level.Name")) %>%
  left_join(merged_metadata$Prod.Out %>% select(B.Code, P.Product, Herd.Stage), by = "B.Code") %>%
  left_join(purpose_by_code, by = "B.Code") %>%
  left_join(forage_proportions %>% select(B.Code, A.Level.Name, forage_prop), 
            by = c("B.Code", "A.Level.Name")) %>%
  left_join(milk_yield, by = "B.Code") %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  mutate(
    Total_Amount_kg = case_when(
      D.Unit.Amount == "g" ~ Total_Amount / 1000,
      D.Unit.Amount == "kg" ~ Total_Amount,
      TRUE ~ NA_real_
    ),
    D.Unit.Amount = "kg",  # <- FORCE ALL TO BE KG
    
    Ym = case_when(
      P.Product == "Cattle" & Purpose == "Dairy" & milk_kg_year > 8500 ~ 0.057,
      P.Product == "Cattle" & Purpose == "Dairy" & milk_kg_year >= 5000 ~ 0.063,
      P.Product == "Cattle" & Purpose == "Dairy" & milk_kg_year < 5000 ~ 0.065,
      P.Product == "Cattle" & Purpose == "Dairy" & is.na(milk_kg_year)  ~ 0.06,
      P.Product == "Cattle" & Purpose == "Meat" & forage_prop > 0.75 ~ 0.07,
      P.Product == "Cattle" & Purpose == "Meat" & forage_prop > 0.15 ~ 0.063,
      P.Product == "Cattle" & Purpose == "Meat" & forage_prop >= 0 ~ 0.04,
      P.Product == "Cattle" & Purpose == "Meat" & is.na(forage_prop) ~ 0.065,
      P.Product == "Sheep" & grepl("lamb", Herd.Stage, ignore.case = TRUE) ~ 0.045,
      P.Product == "Sheep" ~ 0.067,
      P.Product == "Goat" ~ 0.055,
      TRUE ~ NA_real_
    ),
    
    GE_daily = ifelse(!is.na(GE_diet) & !is.na(Total_Amount_kg),
                      GE_diet * Total_Amount_kg,
                      NA_real_),
    
    CH4_kg_year = ifelse(!is.na(GE_daily) & !is.na(Ym),
                         GE_daily * Ym * 365 / 55.65,
                         NA_real_),
    
    CH4_flag = case_when(
      is.na(GE_diet) ~ "missing_GE_diet",
      is.na(Total_Amount_kg) ~ "missing_amount",
      is.na(Ym) ~ "missing_Ym",
      is.na(GE_daily) ~ "GE_not_calculated",
      is.na(CH4_kg_year) ~ "CH4_not_calculated",
      TRUE ~ "GE_diet_based"
    ),
    
    source = "GE_diet_from_ingredients"
  ) %>%
  rename(D.Item = A.Level.Name) %>%
  select(B.Code, D.Item, P.Product, Herd.Stage, GE_daily, CH4_kg_year, 
         Total_Amount = Total_Amount_kg, D.Unit.Amount, ED.Mean.T = Total_Amount_kg, 
         CH4_flag, Ym, source)

17.3) Gross energy at the entire diet level

Gross_energy_entire <- Gross_energy %>%
  filter(is_entire_diet == TRUE) %>%
  
  # Add feed amount and unit
  left_join(
    total_amounts %>% select(B.Code, A.Level.Name, Total_Amount, D.Unit.Amount),
    by = c("B.Code" = "B.Code", "D.Item" = "A.Level.Name")
  ) %>%

  # Add product info and herd stage
  left_join(
    merged_metadata$Prod.Out %>% select(B.Code, P.Product, Herd.Stage),
    by = "B.Code"
  ) %>%

  # Add dairy vs meat purpose
  left_join(purpose_by_code, by = "B.Code") %>%

  mutate(
    # Convert Total_Amount to kg/day
    Total_Amount = case_when(
      D.Unit.Amount == "g" ~ Total_Amount / 1000,
      D.Unit.Amount == "kg" ~ Total_Amount,
      TRUE ~ NA_real_
    ),

    # Fix unit
    D.Unit.Amount = case_when(
      D.Unit.Amount == "g" ~ "kg",
      TRUE ~ D.Unit.Amount
    ),

    # Diagnostic reason for failure
    reason_missing = case_when(
      is.na(Total_Amount) & D.Unit.Amount %in% c("g/kg", "%", "% Diet") ~ "Cannot convert unit to kg",
      TRUE ~ NA_character_
    ),

    # Gross energy intake
    GE_daily = ifelse(
      !is.na(Total_Amount) & !is.na(DC.Value),
      DC.Value * Total_Amount,
      NA_real_
    )
  )

17.4) Merge GE data with feed intake

  # Make sure 'purpose_by_code' is unique per B.Code
purpose_by_code_unique <- purpose_by_code %>%
  distinct(B.Code, Purpose)

# Then join safely before using 'Purpose'
GE_all <- Gross_energy_entire %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  left_join(forage_proportions %>% select(B.Code, A.Level.Name, forage_prop), 
            by = c("B.Code",  "D.Item" = "A.Level.Name")) %>%
  left_join(milk_yield, by = "B.Code") %>%
  left_join(
    feed_intake_diet %>% 
      select(B.Code, A.Level.Name, ED.Mean.T),
    by = c("B.Code", "D.Item" = "A.Level.Name")
  ) %>%
  mutate(
    Total_Amount = if_else(!is.na(ED.Mean.T), ED.Mean.T, Total_Amount),
    D.Unit.Amount = if_else(!is.na(ED.Mean.T), "kg", D.Unit.Amount),
Ym = case_when(
  # --- Dairy cattle based on productivity ---
  P.Product == "Cattle" & Purpose == "Dairy" & milk_kg_year > 8500 ~ 0.057,
  P.Product == "Cattle" & Purpose == "Dairy" & milk_kg_year >= 5000 ~ 0.063,
  P.Product == "Cattle" & Purpose == "Dairy" & milk_kg_year < 5000 ~ 0.065,
  P.Product == "Cattle" & Purpose == "Dairy" & is.na(milk_kg_year)  ~ 0.06,
  
  # --- Meat cattle based on forage proportions ---
  P.Product == "Cattle" & Purpose == "Meat" & forage_prop > 0.75 ~ 0.07,
  P.Product == "Cattle" & Purpose == "Meat" & forage_prop > 0.15 ~ 0.063,
  P.Product == "Cattle" & Purpose == "Meat" & forage_prop >= 0 ~ 0.04,
  P.Product == "Cattle" & Purpose == "Meat" & is.na(forage_prop) ~ 0.065,

  # --- Sheep & Lambs ---
  P.Product == "Sheep" & grepl("lamb", Herd.Stage, ignore.case = TRUE) ~ 0.045,
  P.Product == "Sheep" ~ 0.067,

  # --- Goats ---
  P.Product == "Goat" ~ 0.055,
  
  TRUE ~ NA_real_
),
    
    GE_daily = ifelse(!is.na(DC.Value) & !is.na(Total_Amount),
                      DC.Value * Total_Amount,
                      NA_real_),
    
    CH4_kg_year = ifelse(!is.na(GE_daily) & !is.na(Ym),
                         GE_daily * Ym * 365 / 55.65,
                         NA_real_),
    
    CH4_flag = case_when(
      is.na(DC.Value) ~ "missing_GE_value",
      is.na(Ym) ~ "missing_Ym",
      is.na(Total_Amount) ~ "missing_amount",
      is.na(GE_daily) ~ "GE_not_calculated",
      is.na(CH4_kg_year) ~ "CH4_not_calculated",
      !is.na(ED.Mean.T) ~ "amount_inferred from feed intake",
      TRUE ~ "ok"
    )
  )
###############################################################################
## SINGLE CHUNK: Sum Ingredient Intake & Merge into Entire-Diet GE + CH₄ Calc ##
###############################################################################

# 1) Sum ingredient-level feed intake -----------------------------------------


ingredient_intake_summed <- feed_intake_ingredients %>%
  group_by(B.Code, A.Level.Name) %>%
  summarise(
    sum_ingredient_intake = sum(ED.Mean.T, na.rm = TRUE),
    .groups = "drop"
  )

# 2) Merge this ingredient sum into the entire-diet GE table ------------------
#    Then calculate final intake, daily GE, and CH₄.
GE_all_reconstructed <- Gross_energy_entire %>%
    left_join(forage_proportions %>% select(B.Code, A.Level.Name, forage_prop), 
            by = c("B.Code",  "D.Item" = "A.Level.Name")) %>%
  left_join(milk_yield, by = "B.Code") %>%
  # Add species info and keep only ruminants
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  
  # Merge the ingredient-level summed intake (aliasing A.Level.Name to D.Item)
  left_join(
    ingredient_intake_summed,
    by = c("B.Code", "D.Item" = "A.Level.Name")
  ) %>%

  # Compute CH₄-related fields
  mutate(
    # Methane conversion factor by species
Ym = case_when(
  # --- Dairy cattle based on productivity ---
  P.Product == "Cattle" & Purpose == "Dairy" & milk_kg_year > 8500 ~ 0.057,
  P.Product == "Cattle" & Purpose == "Dairy" & milk_kg_year >= 5000 ~ 0.063,
  P.Product == "Cattle" & Purpose == "Dairy" & milk_kg_year < 5000 ~ 0.065,
  P.Product == "Cattle" & Purpose == "Dairy" & is.na(milk_kg_year)  ~ 0.06,
  
  # --- Meat cattle based on forage proportions ---
  P.Product == "Cattle" & Purpose == "Meat" & forage_prop > 0.75 ~ 0.07,
  P.Product == "Cattle" & Purpose == "Meat" & forage_prop > 0.15 ~ 0.063,
  P.Product == "Cattle" & Purpose == "Meat" & forage_prop >= 0 ~ 0.04,
  P.Product == "Cattle" & Purpose == "Meat" & is.na(forage_prop) ~ 0.065,

  # --- Sheep & Lambs ---
  P.Product == "Sheep" & grepl("lamb", Herd.Stage, ignore.case = TRUE) ~ 0.045,
  P.Product == "Sheep" ~ 0.067,

  # --- Goats ---
  P.Product == "Goat" ~ 0.055,
  
  TRUE ~ NA_real_
),

    # Final daily intake (kg/day): pick the best available, but avoid using DC.Value
    final_intake_kg_day = case_when(
      !is.na(Total_Amount) & D.Unit.Amount == "kg" ~ Total_Amount,
      !is.na(sum_ingredient_intake) ~ sum_ingredient_intake,
      TRUE ~ NA_real_
    ),
    
    # Track source of intake
    final_intake_source = case_when(
      !is.na(Total_Amount) & D.Unit.Amount == "kg" ~ "Total_Amount",
      !is.na(sum_ingredient_intake) ~ "Total feed intake",
      TRUE ~ "Missing"
    ),
    
    # Daily GE intake (MJ/day) = GE content × daily intake
    GE_daily = case_when(
      !is.na(final_intake_kg_day) & !is.na(DC.Value) ~ DC.Value * final_intake_kg_day,
      TRUE ~ NA_real_
    ),
    
    # Annual CH₄ emissions
    CH4_kg_year = ifelse(
      !is.na(GE_daily) & !is.na(Ym),
      GE_daily * Ym * 365 / 55.65,
      NA_real_
    ),
    
    # Flag potential issues
    CH4_flag = case_when(
      is.na(DC.Value) ~ "missing_GE_value",
      is.na(Ym) ~ "missing_Ym",
      is.na(final_intake_kg_day) ~ "missing_final_intake",
      is.na(GE_daily) ~ "GE_not_calculated",
      is.na(CH4_kg_year) ~ "CH4_not_calculated",
      TRUE ~ "ok"
    )
  )

17.5) Final calculation

#combine
# 1. Tag and select relevant columns from both tables
GE_all_clean <- GE_all %>%
  mutate(source = "GE_all") %>%
  select(
    B.Code, 
    D.Item, 
    P.Product,
    Herd.Stage,
    GE_daily, 
    CH4_kg_year,
    Total_Amount,
    D.Unit.Amount,
    ED.Mean.T, 
    CH4_flag,
    Ym,
    source
  )

GE_reconstructed_clean <- GE_all_reconstructed %>%
  mutate(source = "Reconstructed") %>%
  rename(ED.Mean.T = final_intake_kg_day) %>%
  select(
    B.Code, 
    D.Item, 
    P.Product, 
    Herd.Stage,
    GE_daily, 
    CH4_kg_year, 
    Total_Amount,
    D.Unit.Amount,
    ED.Mean.T, 
    CH4_flag, 
    Ym,
    source
  )

GE_combined <- bind_rows(GE_all_clean, GE_reconstructed_clean, GE_diet_reconstructed) %>%
  mutate(
    source_priority = case_when(
      source == "GE_all" ~ 1,
      source == "Reconstructed" ~ 2,
      source == "GE_diet_from_ingredients" ~ 3,
      TRUE ~ 4
    ),
    has_CH4 = !is.na(CH4_kg_year)
  ) %>%
  arrange(B.Code, D.Item, desc(has_CH4), source_priority) %>%
  distinct(B.Code, D.Item, .keep_all = TRUE) %>%
  select(-has_CH4, -source_priority)


# Define non-emitting or pre-ruminant stages
non_emitting_stages <- c(
   "Kid", "Kid Male", "Calf Female", "Weaned Calf Male",
  "Young Male", "Young Female", 
   "Intact Kid Male"
)

# Filter them out
GE_combined <- GE_combined %>%
  filter(!(Herd.Stage %in% non_emitting_stages))

GE_combined <- GE_combined %>%
  left_join(ctrl_key, by = c("B.Code", "D.Item" = "A.Level.Name"))



GE_data <- GE_combined %>%
  filter(!is.na(GE_daily) & GE_daily != "") %>%
  distinct(B.Code)

# 3. Papers with GE and species (intersection)
GE_species <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  semi_join(GE_data, by = "B.Code") %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Studies_with_GE")

summary_table <- species_counts %>%
  full_join(GE_species, by = "P.Product") %>%
  mutate(across(everything(), ~replace_na(., 0)))

kable(summary_table, caption = "Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)")
Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)
P.Product Total_Studies Studies_with_GE
Cattle 127 43
Goat 186 68
Sheep 222 90
# Count unique B.Code with non-missing CH4 emissions
n_CH4_codes <- GE_combined %>%
  filter(!is.na(CH4_kg_year)) %>%
  distinct(B.Code) %>%
  nrow()

n_CH4_codes
## [1] 186
n_CH4_codes_by_product <- GE_combined %>%
  filter(!is.na(CH4_kg_year)) %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "n_CH4_codes")

n_CH4_codes_by_product
##    P.Product n_CH4_codes
##       <char>       <int>
## 1:    Cattle          28
## 2:      Goat          68
## 3:     Sheep          90
# Step 1: Get list of B.Codes from GE_combined
relevant_BCodes <- GE_combined %>%
  distinct(B.Code)

# Step 2: Filter Animals.Diet.Comp for relevant B.Codes
diet_data_filtered <- merged_metadata$Animals.Diet %>%
  semi_join(relevant_BCodes, by = "B.Code")

# Step 3: Count unique diets and ingredients
total_diets <- diet_data_filtered %>%
  filter(!is.na(A.Level.Name)) %>%
  distinct(B.Code, A.Level.Name) %>%
  nrow()

total_ingredients <- diet_data_filtered %>%
  filter(!is.na(D.Item)) %>%
  distinct(B.Code, D.Item) %>%
  nrow()

# Step 4: Output
cat("Total unique diets:", total_diets, "\n")
## Total unique diets: 1868
cat("Total unique ingredients:", total_ingredients, "\n")
## Total unique ingredients: 2686

total number of codes that have ge and have both treatment and control

# Identify B.Codes that contain both control and treatment diets
b_codes_with_both_ctrl_and_trt <- GE_combined %>%
  filter(!is.na(CH4_kg_year)) %>%
  filter(T.Control %in% c("yes", "no")) %>%   # Only if T.Control is properly labeled
  group_by(B.Code) %>%
  summarise(n_control_status = n_distinct(T.Control), .groups = "drop") %>%
  filter(n_control_status > 1)   # Keep only those with both Yes and No

# Count of such B.Codes
n_both_ctrl_and_trt <- nrow(b_codes_with_both_ctrl_and_trt)
cat("Number of B.Codes with both control and treatment diets:", n_both_ctrl_and_trt, "\n")
## Number of B.Codes with both control and treatment diets: 89

papers that have no tagg for t control

check_control<-ingredients%>%
  filter(is.na(T.Control))%>%
  select(B.Code,A.Level.Name,D.Item,T.Control)

filter ge combined with only papers with both control and treatments

GE_combined <- GE_combined %>%
  semi_join(b_codes_with_both_ctrl_and_trt, by = "B.Code")

GE_species <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  semi_join(GE_data, by = "B.Code") %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Studies_with_GE")

summary_table <- species_counts %>%
  full_join(GE_species, by = "P.Product") %>%
  mutate(across(everything(), ~replace_na(., 0)))

kable(summary_table, caption = "Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)")
Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)
P.Product Total_Studies Studies_with_GE
Cattle 127 43
Goat 186 68
Sheep 222 90
# Count unique B.Code with non-missing CH4 emissions
n_CH4_codes <- GE_combined %>%
  filter(!is.na(CH4_kg_year)) %>%
  distinct(B.Code) %>%
  nrow()

n_CH4_codes
## [1] 89
n_CH4_codes_by_product <- GE_combined %>%
  filter(!is.na(CH4_kg_year)) %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "n_CH4_codes")

n_CH4_codes_by_product
##    P.Product n_CH4_codes
##       <char>       <int>
## 1:    Cattle          17
## 2:      Goat          37
## 3:     Sheep          35
relevant_BCodes <- GE_combined %>%
  distinct(B.Code)

# Step 2: Filter Animals.Diet.Comp for relevant B.Codes
diet_data_filtered <- merged_metadata$Animals.Diet %>%
  semi_join(relevant_BCodes, by = "B.Code")

# Step 3: Count unique diets and ingredients
total_diets <- diet_data_filtered %>%
  filter(!is.na(A.Level.Name)) %>%
  distinct(B.Code, A.Level.Name) %>%
  nrow()

total_ingredients <- diet_data_filtered %>%
  filter(!is.na(D.Item)) %>%
  distinct(B.Code, D.Item) %>%
  nrow()

# Step 4: Output
cat("Total unique diets:", total_diets, "\n")
## Total unique diets: 448
cat("Total unique ingredients:", total_ingredients, "\n")
## Total unique ingredients: 674

Analysis based on practices

################################################################################
##  Join “changes” with emission-intensity of BOTH diets in every comparison ##
###############################################################################

## keep only the columns we need from ei_prod
ei_core <- ei_prod %>% 
  select(B.Code,
         A.Level.Name,           # diet name
         EI_CH4_prod,            # kg CH4 · kg-product⁻¹
         CH4_kg_year,            # (optional, if you want absolute CH4 later)
         Product, P.Product,     # meat / milk & species
         T.Control,              # yes / no
         Yield_kg_day, Yield_kg_year)   # keep yields if you like

ei_pairs <- changes %>% 
  ## ---- attach intensity for diet_A -----------------------------------------
  left_join(ei_core,
            by = c("B.Code", "diet_A" = "A.Level.Name"),
            suffix = c("", "_A")) %>%          # tmp suffix to avoid clashes
  rename(EI_A = EI_CH4_prod) %>%               # keep only the field we need
  ## ---- attach intensity for diet_B -----------------------------------------
  left_join(ei_core,
            by = c("B.Code", "diet_B" = "A.Level.Name"),
            suffix = c("_A", "_B")) %>%        # second join, new suffix
  rename(EI_B = EI_CH4_prod) %>%               
  ## ---- derive deltas & practice magnitude -----------------------------------
  mutate(
    delta_EI_prod = EI_B - EI_A,               # +  ⇒ diet_B is worse
    abs_change    = coalesce(total_pct_added,    0) +
                    coalesce(total_pct_removed,  0) +
                    coalesce(total_pct_increase, 0) +
                    coalesce(total_pct_decrease, 0),
    pair_label    = paste(diet_A, "→", diet_B)
  ) %>% 
  ## keep only rows where we have both intensities
  filter(!is.na(delta_EI_prod))
## Warning in left_join(., ei_core, by = c("B.Code", diet_A = "A.Level.Name"), : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 4 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
## Warning in left_join(., ei_core, by = c("B.Code", diet_B = "A.Level.Name"), : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 4 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
            # keep only pairs with EI on both sides
library(dplyr)
library(stringr)

type_cols <- c("added_by_type",
               "removed_by_type",
               "increase_by_type",
               "decrease_by_type")

ei_pairs <- ei_pairs %>% 
  filter(
    !if_any(
      all_of(type_cols),
      ~ !is.na(.) &                               # real value, not missing
        str_detect(trimws(.), "^NA\\s*:")         # starts with “NA:”
    )
  )



species_counts <- ei_pairs %>% 
  distinct(B.Code, Species = P.Product_A) %>%   # 1 row per paper × species
  count(Species, name = "n_papers")   
###################################
##  Practice-vs-EI plotting chunk
###################################            # install.packages("patchwork") once

## ───────────────── 0 · Prep  ────────────────────────────────────────────────
ei_pairs_plot <- ei_pairs %>% 
  # 0·a  choose one species/product label
  mutate(Product = coalesce(Product_B, Product_A,
                            P.Product_B, P.Product_A)) %>% 
  # 0·b  compute yield delta before selecting
  mutate(delta_Yield = Yield_kg_day_B - Yield_kg_day_A) %>% 
  select(abs_change,
         delta_EI_prod,
         delta_Yield,
         practice_type,
         Product) %>% 
  filter(!is.na(Product),
         !is.na(delta_EI_prod))

## ───────────────── 1 · scatter, 2 · violin, 3 · trade-off ──────────────────
p1 <- ggplot(ei_pairs_plot,
             aes(abs_change, delta_EI_prod, colour = practice_type)) +
  geom_hline(yintercept = 0, linetype = 2, colour = "grey50") +
  geom_point(alpha = .7, size = 2) +
  geom_smooth(method = "lm", se = TRUE, colour = "black",
              formula = y ~ x) +
  scale_x_continuous("% of diet DM changed") +
  scale_y_continuous("Δ EI (kg CH₄ / kg product)") +
  facet_wrap(~ Product) +
  theme_bw() +
  theme(legend.position = "bottom")

p2 <- ggplot(ei_pairs_plot,
             aes(practice_type, delta_EI_prod, fill = practice_type)) +
  geom_violin(trim = FALSE, alpha = .35, colour = NA) +
  geom_boxplot(width = .18, outlier.shape = NA) +
  stat_summary(fun = median, geom = "point",
               size = 2, colour = "black") +
  geom_hline(yintercept = 0, linetype = 2, colour = "grey40") +
  scale_y_continuous("Δ EI (kg CH₄ / kg product)") +
  facet_wrap(~ Product, nrow = 1) +
  labs(x = NULL, title = "Emission-intensity shift by practice type") +
  theme_bw() +
  theme(legend.position = "none")

p3 <- ggplot(ei_pairs_plot,
             aes(delta_Yield, delta_EI_prod,
                 colour = practice_type,
                 size   = abs_change)) +
  geom_vline(xintercept = 0, linetype = 2, colour = "grey60") +
  geom_hline(yintercept = 0, linetype = 2, colour = "grey60") +
  geom_point(alpha = .7) +
  scale_x_continuous("Δ Yield (kg animal⁻¹ d⁻¹)",
                     labels = scales::comma) +
  scale_y_continuous("Δ EI (kg CH₄ / kg product)",
                     labels = scales::comma) +
  scale_size(range = c(2,6), name = "% moved") +
  facet_wrap(~ Product) +
  labs(title = "Productivity vs. emission-intensity trade-offs") +
  theme_minimal() +
  theme(legend.position = "bottom")

## ───────────────── 2 · show or combine ──────────────────────────────────────
p1

p2

p3

# (p1 | p2 | p3) + plot_layout(guides = "collect")   # uncomment for one panel
library(tidyverse)
## Warning: package 'tibble' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between()      masks data.table::between()
## ✖ readr::col_factor()   masks scales::col_factor()
## ✖ scales::discard()     masks purrr::discard()
## ✖ lubridate::duration() masks arrow::duration()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ dplyr::first()        masks data.table::first()
## ✖ lubridate::hour()     masks data.table::hour()
## ✖ lubridate::isoweek()  masks data.table::isoweek()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ dplyr::last()         masks data.table::last()
## ✖ lubridate::mday()     masks data.table::mday()
## ✖ lubridate::minute()   masks data.table::minute()
## ✖ lubridate::month()    masks data.table::month()
## ✖ lubridate::quarter()  masks data.table::quarter()
## ✖ lubridate::second()   masks data.table::second()
## ✖ purrr::transpose()    masks data.table::transpose()
## ✖ lubridate::wday()     masks data.table::wday()
## ✖ lubridate::week()     masks data.table::week()
## ✖ lubridate::yday()     masks data.table::yday()
## ✖ lubridate::year()     masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## ── 1 · tidy table -----------------------------------------------------------
plot_df <- ei_pairs %>% 
  filter(practice_type %in% c("addition", "substitution")) %>%     # only add / sub
  mutate(abs_change = total_pct_added + total_pct_removed +
                     total_pct_increase + total_pct_decrease) %>%  # magnitude
  
  separate_rows(added_by_type, sep = ";\\s*") %>%                  # explode list
  
  mutate(
    Ingredient_Type = str_remove(added_by_type, ":.*$") %>% str_trim(),
    Ingredient_Type = case_when(                                   # collapse names
      str_detect(Ingredient_Type, regex("supplement",  TRUE)) ~ "Supplement",
      str_detect(Ingredient_Type, regex("concentrate", TRUE)) ~ "Concentrate",
      TRUE                                                   ~ Ingredient_Type
    )
  ) %>% 
  filter(Ingredient_Type != "", Ingredient_Type != "Entire Diet") %>% 
  
  mutate(Product = coalesce(!!!syms(names(.)[str_detect(names(.), "Product")]))) %>% 
  
  select(Product, practice_type, Ingredient_Type,
         abs_change, delta_EI_prod)

## ── 2 · bubble panel ---------------------------------------------------------
ggplot(plot_df,
       aes(abs_change, delta_EI_prod,
           colour = Ingredient_Type,
           shape  = practice_type,
           size   = abs_change)) +
  geom_hline(yintercept = 0, linetype = 2, colour = "grey50") +
  geom_point(alpha = .75) +
  scale_size(range = c(3,10), guide = "none") +
  scale_x_continuous("% of diet DM changed") +
  scale_y_continuous("Δ EI (kg CH₄ / kg product)",
                     labels = scales::comma) +
  facet_wrap(~ Product) +
  labs(title  = "Effect of adding / substituting single ingredient categories",
       colour = "Ingredient category",
       shape  = "Practice") +
  theme_bw() +
  theme(legend.position = "right")

library(tidyverse)
library(ggalt)     # install.packages("ggalt")  for geom_dumbbell()


## ── 1 · tidy substitutions table --------------------------------------------
subs_db <- ei_pairs %>%
  filter(practice_type == "substitution") %>% 
  
  # explode the semicolon lists
  separate_rows(removed_by_type, added_by_type, sep = ";\\s*") %>% 
  
  # trim & strip trailing numbers, then normalise categories
  mutate(
    removed = removed_by_type %>% str_remove(":.*$") %>% str_trim() %>% str_to_lower(),
    added   = added_by_type   %>% str_remove(":.*$") %>% str_trim() %>% str_to_lower(),
    
    removed = case_when(
      str_detect(removed, "supplement")   ~ "Supplement",
      str_detect(removed, "concentrate")  ~ "Concentrate",
      TRUE                                ~ str_to_title(removed)
    ),
    added = case_when(
      str_detect(added, "supplement")     ~ "Supplement",
      str_detect(added, "concentrate")    ~ "Concentrate",
      TRUE                                ~ str_to_title(added)
    )
  ) %>% 
  
  # drop blanks & “Entire Diet”
  filter(removed != "", added != "",
         removed != "Entire Diet", added != "Entire Diet") %>% 
  
  # pair label + product label
  mutate(
    pair_id = paste(removed, "→", added),
    Product = coalesce(!!!syms(names(.)[str_detect(names(.), "Product")]))
  )

## ── 2 · dumb-bell plot -------------------------------------------------------
subs_db <- subs_db %>%                         # reuse the table
  mutate(effect = case_when(
    delta_EI_prod <= -0.02 ~ "Lower EI",
    delta_EI_prod >=  0.02 ~ "Higher EI",
    TRUE                   ~ "Neutral"
  ))

library(tidyverse)

## subs_db already built & classified with “effect” ----
## left  dot  = original EI  (grey)
## right dot  = new EI       (colour by effect)

ggplot(subs_db,
       aes(y = reorder(pair_id, delta_EI_prod))) +

  ## segment
  geom_segment(aes(x = 0, xend = delta_EI_prod),
               colour = "grey70") +

  ## left (before) dot
  geom_point(aes(x = 0),
             colour = "grey60",
             size   = 2) +

  ## right (after) dot – coloured by effect
  geom_point(aes(x = delta_EI_prod,
                 colour = effect),
             size = 2) +

  scale_colour_manual(values = c(
    "Lower EI"  = "#2b83ba",   # blue  = improvement
    "Neutral"   = "grey70",
    "Higher EI" = "#d7191c"    # red   = worse
  )) +

  facet_wrap(~ Product, scales = "free_y") +
  geom_vline(xintercept = 0, linetype = 2, colour = "grey50") +
  labs(x = "Δ EI (kg CH₄ / kg product)",
       y = NULL,
       colour = "Effect",
       title = "Direction & magnitude of each substitution") +
  theme_minimal()

add intensity of the substitution. how are subtitution done ? removed and added werid concentrate thing. do it by species ? how many pooints ?

library(tidyverse)
library(scales)      # for pretty breaks if you want

## ── 1 · tidy substitutions + magnitude --------------------------------------
subs_db <- ei_pairs %>%
  filter(practice_type == "substitution") %>% 

  ## magnitude of this reformulation
  mutate(pct_moved = 0.5 * (total_pct_added + total_pct_removed +   # counts each swap once
                   total_pct_increase + total_pct_decrease)
) %>% 
  
  separate_rows(removed_by_type, added_by_type, sep = ";\\s*") %>% 
  mutate(
    removed = removed_by_type %>% str_remove(":.*$") %>% str_trim() %>% str_to_lower(),
    added   = added_by_type   %>% str_remove(":.*$") %>% str_trim() %>% str_to_lower(),
    removed = case_when(
      str_detect(removed, "supplement")   ~ "Supplement",
      str_detect(removed, "concentrate")  ~ "Concentrate",
      TRUE                                ~ str_to_title(removed)),
    added   = case_when(
      str_detect(added, "supplement")     ~ "Supplement",
      str_detect(added, "concentrate")    ~ "Concentrate",
      TRUE                                ~ str_to_title(added))
  ) %>% 
  filter(removed != "", added != "",
         removed != "Entire Diet", added != "Entire Diet") %>% 
  
  mutate(pair_id = paste(removed, "→", added),
         Product = coalesce(!!!syms(names(.)[str_detect(names(.), "Product")])),
         effect  = case_when(
                      delta_EI_prod <= -0.02 ~ "Lower EI",
                      delta_EI_prod >=  0.02 ~ "Higher EI",
                      TRUE                   ~ "Neutral"))

## ── 2 · dumb-bell with magnitude encoded ------------------------------------
ggplot(subs_db,
       aes(y = reorder(pair_id, delta_EI_prod))) +

  ## segment thickness = % DM moved
  geom_segment(aes(x    = 0,
                   xend = delta_EI_prod,
                   size = pct_moved),        # <- line width
               colour = "grey70",
               lineend = "round") +

  ## before-swap dot (always grey)
  geom_point(aes(x = 0),
             colour = "grey60",
             size   = 2) +

  ## after-swap dot: colour = direction, size = % moved
  geom_point(aes(x     = delta_EI_prod,
                 colour = effect,
                 size   = pct_moved)) +

  scale_colour_manual(values = c(
    "Lower EI"  = "#2b83ba",
    "Neutral"   = "grey70",
    "Higher EI" = "#d7191c")) +

  scale_size(range = c(0.5, 4),      # tweak min/max stroke & dot sizes
             breaks = pretty_breaks(4),
             name = "% diet DM\nmoved") +

  facet_wrap(~ Product, scales = "free_y") +
  geom_vline(xintercept = 0, linetype = 2, colour = "grey50") +
  labs(x = "Δ EI (kg CH₄ / kg product)",
       y = NULL,
       colour = "Effect",
       title  = "Direction, magnitude and intensity of each substitution") +
  theme_minimal() +
  theme(legend.position = "right")

ggplot(subs_db,
       aes(x = delta_EI_prod,
           y = pct_moved,
           colour = effect,                 # blue / red / grey
           size   = pct_moved)) +           # double-encode size
  geom_vline(xintercept = 0, linetype = 2) +
  geom_point(alpha = .75) +
  scale_size(range = c(3,12), guide = "none") +
  scale_y_continuous("% of diet DM substituted") +
  scale_x_continuous("Δ EI (kg CH₄ / kg product)",
                     breaks = scales::pretty_breaks()) +
  facet_wrap(~ Product) +
  theme_bw()

heat_tbl <- subs_db %>% 
  group_by(removed, added, Product) %>% 
  summarise( mean_EI   = mean(delta_EI_prod),
             mean_pct  = mean(pct_moved),
             n = n(), .groups = "drop") %>% 
  filter(n >= 3)

# ggplot(heat_tbl,
#        aes(removed, added, fill = mean_EI)) +
#   geom_tile(colour = "white") +
#   geom_text(aes(label = round(mean_pct,0)), size = 3) +   # % printed on tile
#   scale_fill_gradient2(low = "#2b83ba", mid = "white",
#                        high = "#d7191c", midpoint = 0,
#                        name = "Δ EI") +
#   facet_wrap(~ Product) +
#   labs(x = "Category removed", y = "Category added",
#        title = "Average EI shift (colour) and % DM swapped (number)") +
#   theme_minimal() +
#   theme(axis.text.x = element_text(angle = 45, hjust = 1))
na_GE_counts <- Gross_energy %>% 
  filter(DC.Variable == "GE") %>%                         # only GE rows
  mutate(DC.Value = as.numeric(DC.Value)) %>%             # blanks → NA
  group_by(D.Item) %>% 
  summarise(
    n_rows = n(),                     # total GE rows for this item
    n_na   = sum(is.na(DC.Value)),    # how many of them are NA
    all_GE_missing = n_na == n_rows,  # TRUE ⇢ every GE row is NA
    .groups = "drop"
  )