Introduction

Overview

This notebook assembles and cleans heterogeneous livestock-nutrition trials into a single, analysis-ready dataset that can be used to quantify enteric-methane (CH₄) emissions and emission-intensity for cattle, sheep and goats. Using R, we use ERA metadata, harmonise units, fill nutritional gaps (five-step cascade: original paper → Feedipedia → ILRI feeds DB → proximate-composition equation of Weiss & Tebbe → ingredient means), and reconstruct diet-level gross energy (GE) where necessary.

Emission calculations

Daily GE intake is converted to annual CH₄ with the IPCC Tier 2 equation. Methane-conversion factors (Ym) follow the 2019 IPCC defaults but are adjusted for diet forage level and dairy productivity where the data allow. Absolute emissions are then paired with weight-gain and milk-/meat-yield records to derive emission-intensity (kg CH₄ kg-product⁻¹). A change-detection routine classifies every treatment diet relative to its control (addition, substitution, dosage change) so diet strategies can be linked directly to shifts in EI.

Why this matters

Ruminants in sub-Saharan Africa often operate far below potential productivity; improving feed quality can therefore lower both absolute and per-kilogram emissions. By turning hundred of individual studies into a coherent Tier-2 dataset, this pipeline provides a transparent basis for (i) benchmarking baseline emissions, (ii) identifying high-leverage nutritional interventions, and (iii) stress-testing alternative CH₄ equations or region-specific Ym values.

A detailed methodological rationale is given in Livestock Emission Method.docx.

1) Set up

This section installs and loads required packages, connects to the S3 bucket containing ERA data, and loads both the livestock and camel metadata into memory.

# Install and load 'pacman' package manager if not already installed

library(ERAg)
library(stringr)
library(knitr)
library(dplyr)
library(DT)
library(readxl)
library(tidyr)
library(arrow)
library(ggplot2)
library(data.table)
library(purrr)
library(janitor)
library(ggalt)

# Set up connection to S3 bucket
s3 <- s3fs::S3FileSystem$new(anonymous = TRUE)
era_s3 <- "s3://digital-atlas/era"

# -------------------------
# Load LATEST skinny_cow
# -------------------------
# List the files in the s3 bucket
files <- s3$dir_ls(file.path(era_s3, "data"))

# Identify the latest skinny_cow_2022-YYYY-MM-DD.RData file
files <- tail(grep(".RData$", grep("skinny_cow_2022", files, value = TRUE), value = TRUE), 1)

# Download to local if not already present
save_path <- file.path(getwd(), basename(files))
if (!file.exists(save_path)) {
  s3$file_download(files, save_path, overwrite = TRUE)
}

# Load the data (as a list of objects)
livestock_metadata <- miceadds::load.Rdata2(file = basename(save_path), path = dirname(save_path))

# -------------------------
# Build D.Item -> D.Type lookup from 2025-04-11.1 (source-of-truth)
# -------------------------
src_file <- file.path(era_s3, "data/skinny_cow_2022-2025-04-11.1.RData")
src_local <- file.path(getwd(), basename(src_file))
if (!file.exists(src_local)) {
  s3$file_download(src_file, src_local, overwrite = TRUE)
}
src_obj <- miceadds::load.Rdata2(file = basename(src_local), path = dirname(src_local))

# Ensure data.table
lookup_dt <- as.data.table(src_obj$Animals.Diet)[
  , .(D.Item = str_squish(as.character(D.Item)),
      D.Type = str_squish(as.character(D.Type)))
][
  !is.na(D.Item) & nzchar(D.Item) & !is.na(D.Type) & nzchar(D.Type)
]

# Resolve any (D.Item, D.Type) duplicates by taking the most frequent D.Type per D.Item
lookup_dt <- lookup_dt[
  , .N, by = .(D.Item, D.Type)
][
  order(D.Item, -N)
][
  , .SD[1], by = D.Item
][
  , .(D.Item, D.Type)
]

# -------------------------
# Backfill D.Type into latest Animals.Diet
# -------------------------
diet_latest <- as.data.table(livestock_metadata$Animals.Diet)

# Normalize join key to match lookup normalization
diet_latest[, D.Item := str_squish(as.character(D.Item))]

# Add D.Type column if missing
if (!"D.Type" %in% names(diet_latest)) {
  diet_latest[, D.Type := NA_character_]
} else {
  diet_latest[, D.Type := as.character(D.Type)]
}

# Join and fill only missing D.Type
diet_latest <- merge(
  diet_latest, lookup_dt,
  by = "D.Item", all.x = TRUE, suffixes = c("", ".src")
)
diet_latest[is.na(D.Type) & !is.na(D.Type.src), D.Type := D.Type.src]
diet_latest[, D.Type.src := NULL]

# Write back into the loaded object
livestock_metadata$Animals.Diet <- diet_latest
# Set up connection to S3 bucket for camel data
s3 <- s3fs::S3FileSystem$new(anonymous = TRUE)
era_s3 <- "s3://digital-atlas/era"

# List RData files related to courageous_camel_2024
camel_files <- s3$dir_ls(file.path(era_s3, "data"))
camel_files <- grep("courageous_camel_2024_draft_.*\\.RData$", camel_files, value = TRUE)

# Helper function to extract date and version
parse_versions <- function(fnames) {
  base <- sub(".*_draft_", "", fnames)
  base <- sub("\\.RData$", "", base)
  
  date_part <- sub("\\..*$", "", base)
  version_part <- ifelse(grepl("\\.", base),
                         as.integer(sub("^.*\\.", "", base)),
                         0L)
  
  data.frame(
    file = fnames,
    date = as.Date(date_part),
    version = version_part,
    stringsAsFactors = FALSE
  )
}

# Parse file names and determine latest
version_info <- parse_versions(basename(camel_files))
latest <- version_info[order(version_info$date, version_info$version, decreasing = TRUE), ][1, ]
latest_file <- camel_files[basename(camel_files) == latest$file]

# Download the file if not already present
save_path <- file.path(getwd(), basename(latest_file))
if (!file.exists(save_path)) {
  s3$file_download(latest_file, save_path, overwrite = TRUE)
}

# Load the data
camel <- miceadds::load.Rdata2(file = basename(save_path), path = dirname(save_path))

to DO

missing_unit_feed_intake<- camel$Data.Out %>%
  select(B.Code, T.Name, ED.Intake.Item,
         ED.Mean.T, ED.Error, Out.Unit, Out.Subind,Out.Code.Joined) %>%
  filter(Out.Subind == "Feed Intake") %>%
  filter(is.na(Out.Unit))

fill in feed intakes

# #Add units where they are NA for Feed-Intake rows
# camel$Data.Out[
#   Out.Subind == "Feed Intake" & is.na(Out.Unit),                 # target rows
#   Out.Unit := sub("^.*?\\.\\.(.*?)\\.\\..*$", "\\1", Out.Code.Joined)  # extract middle chunk
# ]
# 
# feed_intake<-camel$Data.Out%>%
#   filter(Out.Subind=="Feed Intake")
# 
# sort(unique(feed_intake$Out.Unit), na.last = TRUE)   # A-Z, NAs at the end
##############################################################################
# Canonical column templates (ALL use a D.Item field) ------------------------
##############################################################################
diet_cols   <- c("B.Code","A.Level.Name","D.Item",
                 "D.Type","D.Amount","D.Unit.Amount","D.Unit.Time",
                 "D.Unit.Animals","DC.Is.Dry","D.Ad.lib","D.Item.Group")

diet_cols_ext <- c(diet_cols, "D.Item.Root.Comp.Proc_Major")   # keep root too

mt_cols   <- c("B.Code","A.Level.Name","V.Level.Name",
               "T.Animals","T.Name","T.Control")

comp_cols <- c("B.Code","D.Item","is_group","is_entire_diet",
               "DC.Variable","DC.Value","DC.Unit")

digest_cols <- c("B.Code","D.Item","is_group","is_entire_diet",
                 "DD.Variable","DD.Value","DD.Unit")

data_cols <- c("B.Code","A.Level.Name","Out.Code.Joined",
               "ED.Intake.Item","ED.Intake.Item.Raw",
               "ED.Mean.T","ED.Error","Out.Subind","Out.Unit",
               "Out.WG.Start","Out.WG.Unit",
               "is_entire_diet","is_group","T.Animals","T.Name")

prod_cols <- c("B.Code","P.Product","P.Variety","P.Practice")

##############################################################################
# 1. LIVESTOCK tables --------------------------------------------------------
##############################################################################
## 1·1 Diet ------------------------------------------------------------------
diet_livestock <- copy(livestock_metadata$Animals.Diet)[
  , ..diet_cols_ext][, Source := "livestock"]

## 1·2 Lookup table :  (B.Code , FeedName)  ➜  Canonical root ---------------
lookup_ditem <- rbindlist(list(
  diet_livestock[
    !is.na(D.Item) &
    !is.na(D.Item.Root.Comp.Proc_Major),
    .(B.Code,
      FeedName  = D.Item,
      Canonical = D.Item.Root.Comp.Proc_Major)],
  diet_livestock[
    !is.na(D.Item.Root.Comp.Proc_Major),
    .(B.Code,
      FeedName  = D.Item.Root.Comp.Proc_Major,
      Canonical = D.Item.Root.Comp.Proc_Major)]
))[
  , .SD[1], by = .(B.Code, FeedName)]

setkey(lookup_ditem, B.Code, FeedName)

## 1·3 MT --------------------------------------------------------------------
mt_cols_ls  <- intersect(mt_cols, names(livestock_metadata$MT.Out))
mt_livestock <- livestock_metadata$MT.Out[, ..mt_cols_ls][
  , `:=`(Out.WG.Start = NA_real_,
         Out.WG.Unit  = NA_character_,
         Source       = "livestock")]

## 1·4 Composition -----------------------------------------------------------
comp_livestock <- merge(
  livestock_metadata$Animals.Diet.Comp,
  lookup_ditem,
  by.x = c("B.Code","D.Item"),
  by.y = c("B.Code","FeedName"),
  all.x = TRUE, allow.cartesian = TRUE
)[
  , D.Item := fcoalesce(Canonical, D.Item)
][
  , Canonical := NULL
][
  , ..comp_cols
][, Source := "livestock"]

## 1·5 Digestibility ---------------------------------------------------------
digest_livestock <- merge(
  livestock_metadata$Animals.Diet.Digest,
  lookup_ditem,
  by.x = c("B.Code","D.Item"),
  by.y = c("B.Code","FeedName"),
  all.x = TRUE, allow.cartesian = TRUE
)[
  , D.Item := fcoalesce(Canonical, D.Item)
][
  , Canonical := NULL
][
  , ..digest_cols
][, Source := "livestock"]

## 1·6 Data.Out  (canonicalise Intake item) ----------------------------------
# Canonicalise Intake item in Data.Out and keep legacy column names expected by data_cols
dt <- merge(
  livestock_metadata$Data.Out,
  lookup_ditem,
  by.x = c("B.Code","ED.Intake.Item"),
  by.y = c("B.Code","FeedName"),
  all.x = TRUE, allow.cartesian = TRUE
)

# Use Canonical when present, then drop it
dt[, ED.Intake.Item := fcoalesce(Canonical, ED.Intake.Item)]
dt[, Canonical := NULL]

# Map new column names back to old ones (so downstream ..data_cols still works)
if ("ED.Intake.Item.is_group" %in% names(dt) && !"is_group" %in% names(dt)) {
  setnames(dt, "ED.Intake.Item.is_group", "is_group")
}
if ("ED.Intake.Item.is_entire_diet" %in% names(dt) && !"is_entire_diet" %in% names(dt)) {
  setnames(dt, "ED.Intake.Item.is_entire_diet", "is_entire_diet")
}

# Now select the standard columns and tag the source
data_livestock <- dt[, ..data_cols][, Source := "livestock"]


## 1·7 Product ---------------------------------------------------------------
var_ls <- livestock_metadata$Var.Out[
  , .(P.Variety  = first(na.omit(V.Level.Name)),
      P.Practice = first(na.omit(V.Animal.Practice))), by = B.Code]

prod_livestock <- livestock_metadata$Prod.Out[
  , .(B.Code, P.Product)][var_ls, on = "B.Code"][
  , Source := "livestock"]

##############################################################################
# 2. CAMEL helper keys (unchanged) ------------------------------------------
##############################################################################
diet_key <- camel$MT.Out[
  , .(B.Code, T.Name, A.Level.Name, T.Control,
      Herd.Level.Name, Herd.Sublevel.Name)]

herd_info <- camel$Herd.Out[
  , .(B.Code, Herd.Level.Name, Herd.Sublevel.Name,
      Herd.Sex, Herd.Stage,
      V.Var, V.Animal.Practice,
      T.Animals              = Herd.N,
      Herd.Start.Weight      = Herd.Start.Weight,
      Herd.Start.Weight.Unit = Herd.Start.Weight.Unit)]

##############################################################################
# 3. CAMEL tables  – use root when present, fallback to D.Item --------------
##############################################################################
## 3·1 Diet ------------------------------------------------------------------
diet_camel <- copy(camel$Animal.Diet)[
  , D.Item := fcoalesce(D.Item.Root.Comp.Proc_Major, D.Item)
][
  , ..diet_cols
][, Source := "camel"]

## 3·2 Composition -----------------------------------------------------------
comp_camel <- copy(camel$Animal.Diet.Comp)[
  , D.Item := fcoalesce(D.Item.Root.Comp.Proc_Major, D.Item)
][
  , `:=`(DC.Variable = DN.Variable,
         DC.Value    = DN.Value,
         DC.Unit     = DN.Unit)
][
  , ..comp_cols
][, Source := "camel"]

## 3·3 Digestibility ---------------------------------------------------------
digest_camel <- copy(camel$Animal.Diet.Digest)[
  , D.Item := fcoalesce(D.Item.Root.Comp.Proc_Major, D.Item)
][
  , ..digest_cols
][, Source := "camel"]

## 3·4 MT.Out (unchanged) ----------------------------------------------------
mt_camel <- diet_key[
  herd_info, on = .(B.Code,Herd.Level.Name,Herd.Sublevel.Name), nomatch = 0L][
  camel$MT.Out[
    , .(B.Code, A.Level.Name, T.Name,
        T.Start.Weight,
        T.Start.Weight.Unit = `T.Start.Weight. Unit`)],
  on = .(B.Code,A.Level.Name,T.Name), nomatch = 0L][
  , `:=`(T.Start.Weight      = as.numeric(T.Start.Weight),
         Herd.Start.Weight   = as.numeric(Herd.Start.Weight),
         T.Start.Weight.Unit = as.character(T.Start.Weight.Unit),
         Herd.Start.Weight.Unit = as.character(Herd.Start.Weight.Unit))][
  , `:=`(Out.WG.Start = fcoalesce(T.Start.Weight, Herd.Start.Weight),
         Out.WG.Unit  = fifelse(!is.na(T.Start.Weight),
                                T.Start.Weight.Unit,
                                Herd.Start.Weight.Unit),
         Source       = "camel")][
  , !c("T.Start.Weight","T.Start.Weight.Unit",
       "Herd.Start.Weight","Herd.Start.Weight.Unit")]

## 3·5 Data.Out (unchanged) --------------------------------------------------
## 3·5 Data.Out (left-join + boolean-only Entire Diet) ------------------------
# Keep ALL rows from camel$Data.Out; attach A.Level.Name via (B.Code, T.Name),
# then attach herd fields via (B.Code, Herd.Level.Name, Herd.Sublevel.Name).

data_camel <- merge(
  camel$Data.Out,
  diet_key,
  by = c("B.Code","T.Name"),
  all.x = TRUE
)

data_camel <- merge(
  data_camel,
  herd_info,
  by = c("B.Code","Herd.Level.Name","Herd.Sublevel.Name"),
  all.x = TRUE
)

# Select/rename to your schema and enforce the boolean Entire Diet rule
data_camel <- data_camel[
  , .(B.Code,
      A.Level.Name,
      ED.Intake.Item,
      T.Name,
      D.Item.Root.Comp.Proc_Major,
      ED.Intake.Item.is_entire_diet,
      ED.Intake.Item.Raw = ED.Intake.Item,
      ED.Mean.T, ED.Error, Out.Unit, Out.Subind,
      Out.WG.Start = as.numeric(Herd.Start.Weight),
      Out.WG.Unit  = as.character(Herd.Start.Weight.Unit),
      T.Animals)
][
  # Boolean-only tagging: if camel says entire diet, force the token
  , ED.Intake.Item := fifelse(ED.Intake.Item.is_entire_diet == TRUE,
                              "Entire Diet", ED.Intake.Item)
][
  , Source := "camel"
]



prod_camel <- camel$Herd.Out[
  , .(B.Code,
      P.Product  = V.Product,
      P.Variety  = V.Var,
      P.Practice = V.Animal.Practice,
      Herd.Stage)][, Source := "camel"]

##############################################################################
# 4. Combine camel + livestock ----------------------------------------------
##############################################################################
diet_livestock_final <- diet_livestock[
  , `:=`(D.Item = D.Item.Root.Comp.Proc_Major,
         D.Item.Root.Comp.Proc_Major = NULL)]

diet_all   <- rbindlist(list(diet_livestock_final, diet_camel), fill = TRUE)
mt_all     <- rbindlist(list(mt_livestock, mt_camel), fill = TRUE)
comp_all   <- rbindlist(list(comp_livestock, comp_camel), fill = TRUE)
digest_all <- rbindlist(list(digest_livestock, digest_camel), fill = TRUE)
data_all   <- rbindlist(list(data_livestock, data_camel), fill = TRUE)
prod_all   <- rbindlist(list(prod_livestock, prod_camel), fill = TRUE)

##############################################################################
# 5. Master list -------------------------------------------------------------
##############################################################################
merged_metadata <- list(
  Animals.Diet        = diet_all,
  MT.Out              = mt_all,
  Animals.Diet.Comp   = comp_all,
  Animals.Diet.Digest = digest_all,
  Data.Out            = data_all,
  Prod.Out            = prod_all
)
merged_metadata$Data.Out <- merged_metadata$Data.Out %>% 
  mutate(
    ## 0 ────────────────────────────────────────────────────────────────
    ## normalize any variant spelling to a single "Entire Diet" token
    ED.Intake.Item = case_when(
      grepl("^\\s*entire\\s*diet\\s*$", tolower(coalesce(ED.Intake.Item, ""))) ~ "Entire Diet",
      TRUE ~ ED.Intake.Item
    ),

    ## 1 ────────────────────────────────────────────────────────────────
    ## overwrite with root ONLY for ingredient rows (never for "Entire Diet")
    ED.Intake.Item = if_else(
      !is.na(D.Item.Root.Comp.Proc_Major) & D.Item.Root.Comp.Proc_Major != "" &
        !grepl("^Entire Diet$", coalesce(ED.Intake.Item, "")),
      D.Item.Root.Comp.Proc_Major,
      ED.Intake.Item
    ),

    ## 2 ────────────────────────────────────────────────────────────────
    ## if still NA/blank, use T.Name
    ED.Intake.Item = if_else(
      (is.na(ED.Intake.Item) | ED.Intake.Item == "") & 
        !is.na(T.Name) & T.Name != "",
      T.Name,
      ED.Intake.Item
    ),

    ## 3 ────────────────────────────────────────────────────────────────
    ## set entire-diet flag from helper OR from the token
    is_entire_diet = case_when(
      !is.na(ED.Intake.Item.is_entire_diet) & ED.Intake.Item.is_entire_diet != "" ~ ED.Intake.Item.is_entire_diet,
      grepl("^Entire Diet$", coalesce(ED.Intake.Item, "")) ~ TRUE,
      TRUE ~ is_entire_diet
    )
  )
  


# 1) Build a lookup from MT with squished whitespace
mt_key <- as.data.table(camel$MT.Out)[
  , .(B.Code,
      T.Name_core = str_squish(as.character(T.Name)),
      A.Level.Name = str_squish(as.character(A.Level.Name)))
][
  # keep one row per (B.Code, T.Name_core), preferring non-NA A.Level.Name
  order(!is.na(A.Level.Name))
][
  , .SD[.N], by = .(B.Code, T.Name_core)
]

# 2) Prepare Data.Out: derive a "core" T.Name by stripping anything after "||"
dt <- as.data.table(merged_metadata$Data.Out)
dt[Source == "camel",
   T.Name_core := str_squish(sub("\\|\\|.*$", "", as.character(T.Name)))]

# 3) Left join to fill A.Level.Name where NA
dt[Source == "camel" & (is.na(A.Level.Name) | A.Level.Name == ""),
   A.Level.Name := mt_key[.SD, on = .(B.Code, T.Name_core), A.Level.Name]
]

# 4) Clean up helper
dt[, T.Name_core := NULL]

# 5) Write back
merged_metadata$Data.Out <- dt
## Keep only merged_metadata (add more names to the vector if needed)
rm(list = setdiff(ls(), c("merged_metadata")))
gc()   # optional: reclaim memory
##           used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 3738816 199.7    7526624 402.0  4590741 245.2
## Vcells 7354809  56.2   19812323 151.2 19811308 151.2

1.1 Correct T.Control INCLUDE IN ERA DEV

When t control is yes for one of the diets then automatically enter No for the other diets of the same paper. Also for missing t control if the name of the diet contains control or Con or Zero then automatically deduce this is a control diet and others are treatment diets.

MT <- copy(merged_metadata$MT.Out)


MT[, T.Control := tolower(trimws(as.character(T.Control))) ]


MT[
  ## diet name suggests a control …
  grepl("\\bzero\\b|\\bcon\\b|control", A.Level.Name, ignore.case = TRUE) &

  ## … but flag is still empty / ambiguous
  (is.na(T.Control) | T.Control %in% c("0", "false","na")),

  T.Control := "yes"
]



MT[, T.Control := {
  has_ctrl <- any(T.Control == "yes", na.rm = TRUE)          # does study have a control?
  
  if (has_ctrl) {
    # 4·a  normalise obvious “no” variants
    flag <- fifelse(
      T.Control %in% c("false", "0", "n", "no"), "no", T.Control
    )
    # 4·b  ensure exactly one category: yes / no
    fifelse(flag == "yes", "yes", "no")
    
  } else {
    # keep whatever was there (even "0", "false", NA, …)
    T.Control
  }
}, by = B.Code]


MT[, T.Control := fifelse(
  (is.na(T.Control) | T.Control %in% c("", "na")) & any(T.Control == "yes", na.rm = TRUE),
  "no",
  T.Control
), by = B.Code]


merged_metadata$MT.Out <- MT


controls_multi <- MT[
  T.Control == "yes", 
  .SD[uniqueN(A.Level.Name) > 1], 
  by = B.Code
]

controls_multi[, .(B.Code, A.Level.Name, T.Control)]
##      B.Code                     A.Level.Name T.Control
##      <char>                           <char>    <char>
##   1: HK0287 Conc Commercial...Concentrate HM       yes
##   2: HK0287                  Conc Commercial       yes
##   3: HK0287                   Concentrate HM       yes
##   4: JS0340         MSP0 Enzyme...MSP0 Yeast       yes
##   5: JS0340                      MSP0 Enzyme       yes
##  ---                                                  
## 261: JO1088                    100g MOBYS RS       yes
## 262: JO1088                    200g MOBYS RS       yes
## 263: JO1088                    300g MOBYS RS       yes
## 264: JO1095                           Diet 1       yes
## 265: JO1095                           Diet 5       yes
rm(controls_multi,MT)

1.2) Correct A.Level.Name

Some feed intake entries are not matched to any ingredient data due to discrepancies in A.Level.Name between the feed intake and ingredient datasets for the same study codes. In this section, we build a table to flag studies where diet names do not align, helping to identify cases that may require manual review or correction.

# --- Diet name alignment across Feed Intake, Diet Comp, and Digest -----------
# Uses Ingredient (Animals.Diet$A.Level.Name) as the canonical diet name.
# In Diet Comp and Digest, the diet name lives in D.Item, and we only consider rows with is.entire.diet == TRUE.


# -------- Helpers -------------------------------------------------------------
clean_name <- function(x) x %>% str_squish()  # add tolower() if you want case-insensitive matching

# Build a safe 1–1 mapping per B.Code between Ingredient name and a target source name.
# Expects data.frames with columns: B.Code and Diet (already cleaned).
build_mapping <- function(ingredient_df, target_df, target_label) {
  combined <- bind_rows(
    ingredient_df %>% mutate(Source = "Ingredient"),
    target_df     %>% mutate(Source = target_label)
  ) %>% distinct()

  comparison <- combined %>%
    group_by(B.Code, Diet) %>%
    summarise(Sources = paste(sort(unique(Source)), collapse = " & "), .groups = "drop") %>%
    mutate(
      Match_Status = dplyr::case_when(
        grepl(target_label, Sources) & grepl("Ingredient", Sources) ~ "Match",
        Sources == "Ingredient" ~ "Only in Ingredient",
        Sources == target_label  ~ paste("Only in", target_label),
        TRUE ~ "Other"
      )
    )

  comparison %>%
    filter(Match_Status %in% c("Only in Ingredient", paste("Only in", target_label))) %>%
    group_by(B.Code) %>%
    summarise(
      n_ing  = sum(Match_Status == "Only in Ingredient"),
      n_tar  = sum(Match_Status == paste("Only in", target_label)),
      Ingredient_Name = Diet[Match_Status == "Only in Ingredient"][1],
      Target_Name     = Diet[Match_Status == paste("Only in", target_label)][1],
      .groups = "drop"
    ) %>%
    filter(n_ing == 1, n_tar == 1)
}

# -------- 1) Collect cleaned diet names from each source ----------------------
ingredient_names <- merged_metadata$Animals.Diet %>%
  transmute(B.Code, Diet = clean_name(A.Level.Name)) %>%
  distinct()

feedintake_names <- merged_metadata$Data.Out %>%
  filter(Out.Subind == "Feed Intake", !is.na(A.Level.Name)) %>%
  transmute(B.Code, Diet = clean_name(A.Level.Name)) %>%
  distinct()

dietcomp_names <- merged_metadata$Animals.Diet.Comp %>%
  filter(is_entire_diet == TRUE) %>%
  transmute(B.Code, Diet = clean_name(D.Item)) %>%
  distinct()

digest_names <- merged_metadata$Animals.Diet.Digest %>%
  filter(is_entire_diet == TRUE) %>%
  transmute(B.Code, Diet = clean_name(D.Item)) %>%
  distinct()

# Presence BEFORE corrections (per (B.Code, Diet))
presence_before <- bind_rows(
  ingredient_names %>% mutate(Source = "Ingredient"),
  feedintake_names %>% mutate(Source = "FeedIntake"),
  dietcomp_names %>% mutate(Source = "DietComp"),
  digest_names %>% mutate(Source = "Digest")
) %>%
  mutate(val = 1L) %>%
  pivot_wider(names_from = Source, values_from = val, values_fill = 0L) %>%
  arrange(B.Code, Diet)

# -------- 2) Build 1–1 mappings vs Ingredient --------------------------------
to_correct_FI <- build_mapping(ingredient_names, feedintake_names, "FeedIntake")
to_correct_DC <- build_mapping(ingredient_names, dietcomp_names,  "DietComp")
to_correct_DG <- build_mapping(ingredient_names, digest_names,    "Digest")

# Optional: preview reports of what WILL change (before applying)
corrections_report_FI <- merged_metadata$Data.Out %>%
  filter(Out.Subind == "Feed Intake", !is.na(A.Level.Name)) %>%
  mutate(A.Level.Name_clean = clean_name(A.Level.Name)) %>%
  left_join(to_correct_FI, by = c("B.Code", "A.Level.Name_clean" = "Target_Name")) %>%
  transmute(B.Code, Old = A.Level.Name, New = dplyr::coalesce(Ingredient_Name, A.Level.Name)) %>%
  distinct() %>% filter(Old != New)

corrections_report_DC <- merged_metadata$Animals.Diet.Comp %>%
  filter(is_entire_diet == TRUE) %>%
  mutate(D.Item_clean = clean_name(D.Item)) %>%
  left_join(to_correct_DC, by = c("B.Code", "D.Item_clean" = "Target_Name")) %>%
  transmute(B.Code, Old = D.Item, New = dplyr::coalesce(Ingredient_Name, D.Item)) %>%
  distinct() %>% filter(Old != New)

corrections_report_DG <- merged_metadata$Animals.Diet.Digest %>%
  filter(is_entire_diet == TRUE) %>%
  mutate(D.Item_clean = clean_name(D.Item)) %>%
  left_join(to_correct_DG, by = c("B.Code", "D.Item_clean" = "Target_Name")) %>%
  transmute(B.Code, Old = D.Item, New = dplyr::coalesce(Ingredient_Name, D.Item)) %>%
  distinct() %>% filter(Old != New)

# -------- 3) Apply corrections (rename targets to Ingredient names) -----------
# FEED INTAKE
merged_metadata$Data.Out <- merged_metadata$Data.Out %>%
  mutate(A.Level.Name_clean = clean_name(A.Level.Name)) %>%
  left_join(to_correct_FI, by = c("B.Code", "A.Level.Name_clean" = "Target_Name")) %>%
  mutate(A.Level.Name = dplyr::coalesce(Ingredient_Name, A.Level.Name)) %>%
  select(-A.Level.Name_clean, -Ingredient_Name)

# DIET COMP (only where is.entire.diet == TRUE)
merged_metadata$Animals.Diet.Comp <- merged_metadata$Animals.Diet.Comp %>%
  mutate(D.Item_clean = clean_name(D.Item)) %>%
  left_join(to_correct_DC, by = c("B.Code", "D.Item_clean" = "Target_Name")) %>%
  mutate(
    D.Item = if_else(is_entire_diet == TRUE, dplyr::coalesce(Ingredient_Name, D.Item), D.Item)
  ) %>%
  select(-D.Item_clean, -Ingredient_Name)

# DIGEST (only where is.entire.diet == TRUE)
merged_metadata$Animals.Diet.Digest <- merged_metadata$Animals.Diet.Digest %>%
  mutate(D.Item_clean = clean_name(D.Item)) %>%
  left_join(to_correct_DG, by = c("B.Code", "D.Item_clean" = "Target_Name")) %>%
  mutate(
    D.Item = if_else(is_entire_diet == TRUE, dplyr::coalesce(Ingredient_Name, D.Item), D.Item)
  ) %>%
  select(-D.Item_clean, -Ingredient_Name)

# -------- 4) Presence AFTER corrections --------------------------------------
ingredient_names_after <- merged_metadata$Animals.Diet %>%
  transmute(B.Code, Diet = clean_name(A.Level.Name)) %>% distinct()

feedintake_names_after <- merged_metadata$Data.Out %>%
  filter(Out.Subind == "Feed Intake", !is.na(A.Level.Name)) %>%
  transmute(B.Code, Diet = clean_name(A.Level.Name)) %>% distinct()

dietcomp_names_after <- merged_metadata$Animals.Diet.Comp %>%
  filter(is_entire_diet == TRUE) %>%
  transmute(B.Code, Diet = clean_name(D.Item)) %>% distinct()

digest_names_after <- merged_metadata$Animals.Diet.Digest %>%
  filter(is_entire_diet == TRUE) %>%
  transmute(B.Code, Diet = clean_name(D.Item)) %>% distinct()

presence_after <- bind_rows(
  ingredient_names_after %>% mutate(Source = "Ingredient"),
  feedintake_names_after %>% mutate(Source = "FeedIntake"),
  dietcomp_names_after %>% mutate(Source = "DietComp"),
  digest_names_after %>% mutate(Source = "Digest")
) %>%
  mutate(val = 1L) %>%
  pivot_wider(names_from = Source, values_from = val, values_fill = 0L) %>%
  arrange(B.Code, Diet)

# Optional quick summaries
alignment_summary_before <- presence_before %>%
  summarise(
    n_rows = n(),
    full_matches = sum(Ingredient==1 & FeedIntake==1 & DietComp==1 & Digest==1),
    any_missing = sum(Ingredient + FeedIntake + DietComp + Digest < 4)
  )

alignment_summary_after <- presence_after %>%
  summarise(
    n_rows = n(),
    full_matches = sum(Ingredient==1 & FeedIntake==1 & DietComp==1 & Digest==1),
    any_missing = sum(Ingredient + FeedIntake + DietComp + Digest < 4)
  )

rm(alignment_summary_after,alignment_summary_before,corrections_report_DC,corrections_report_DG,corrections_report_FI,dietcomp_names,dietcomp_names_after,digest_names,digest_names_after,feedintake_names,feedintake_names_after,ingredient_names,ingredient_names_after,presence_after,presence_before,to_correct_DC,to_correct_DG,to_correct_FI)

1.3) Correct ingredient units

Some ingredients appear multiple times with amounts reported in different units. In this section, we prioritize entries that are already in preferred units to minimize the need for conversions.

## keep ingredient that appear twice with different units only once 

# Define your unit preferences
preferred_units <- c("g", "kg")
preferred_time <- "day"
preferred_animals <- "individual"

# Prioritize and deduplicate
merged_metadata$Animals.Diet <- merged_metadata$Animals.Diet %>%
  # Add a priority score to sort by best rows
  mutate(
    priority = case_when(
      D.Unit.Amount %in% preferred_units &
        D.Unit.Time == preferred_time &
        D.Unit.Animals == preferred_animals ~ 1,
      TRUE ~ 2  # lower priority
    )
  ) %>%
  group_by(B.Code, A.Level.Name, D.Item) %>%
  slice_min(order_by = priority, with_ties = FALSE) %>%
  ungroup() %>%
  select(-priority)  # optional: drop helper column

rm(preferred_units,preferred_time,preferred_animals)


### correct extraction errors 

merged_metadata$Animals.Diet.Comp <- merged_metadata$Animals.Diet.Comp %>%
  mutate(
    DC.Value = ifelse(
      B.Code == "BO1006" &
      DC.Variable == "Ash" &
      D.Item == "Ration 2" & 
      as.numeric(DC.Value) == 628,
      6.28,
      DC.Value
    )
  )
# Define species of interest
species_list <- c("Cattle", "Sheep", "Goat")

# Initialize result containers
species_summary <- list()
bcode_table <- list()

# Loop over each species
for (species in species_list) {
  
  # All B.Codes for this species
  species_bcodes <- merged_metadata$Prod.Out %>%
    filter(P.Product == species) %>%
    pull(B.Code) %>%
    unique()
  
  # Feed intake B.Codes
  feed_data <- merged_metadata$Data.Out %>%
    filter(B.Code %in% species_bcodes, !is.na(ED.Mean.T)) %>%
    filter(Out.Subind=="Feed Intake")%>%
    select(B.Code, D.Item = ED.Intake.Item)
  
  feed_bcodes <- unique(feed_data$B.Code)
  
  # GE B.Codes
  ge_data <- merged_metadata$Animals.Diet.Comp %>%
    filter(B.Code %in% species_bcodes, DC.Variable == "GE", !is.na(DC.Value)) %>%
    select(B.Code, D.Item)
  
  ge_bcodes <- unique(ge_data$B.Code)
  
  # Matched B.Codes (join on both B.Code and D.Item)
  matched_bcodes <- inner_join(feed_data, ge_data, by = c("B.Code", "D.Item")) %>%
    pull(B.Code) %>%
    unique()
  
  # Store species-level summary
  species_summary[[species]] <- tibble(
    Species = species,
    Total_BCodes = length(species_bcodes),
    With_FEED = length(feed_bcodes),
    With_GE = length(ge_bcodes),
    With_BOTH = length(matched_bcodes)
  )
  
  # Store long-format bcode list
  bcode_table[[species]] <- tibble(
    B.Code = c(species_bcodes, feed_bcodes, ge_bcodes, matched_bcodes),
    Category = rep(c("All", "With_FEED", "With_GE", "With_BOTH"), 
                   times = c(length(species_bcodes), length(feed_bcodes), length(ge_bcodes), length(matched_bcodes))),
    Species = species
  ) %>% distinct()
}
## Warning in inner_join(feed_data, ge_data, by = c("B.Code", "D.Item")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 315 of `x` matches multiple rows in `y`.
## ℹ Row 4 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
## Warning in inner_join(feed_data, ge_data, by = c("B.Code", "D.Item")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 2814 of `x` matches multiple rows in `y`.
## ℹ Row 4 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# Bind results
species_summary_df <- bind_rows(species_summary)
bcode_table_df <- bind_rows(bcode_table)

1.4 Take out grazing experiments

species_by_bcode <- merged_metadata$Prod.Out %>%
  select(B.Code, P.Product) %>%
  distinct()

# 2. Papers per species before filtering
# papers_before <- merged_metadata$Animals.Diet %>%
#   select(B.Code) %>%
#   distinct() %>%
#   left_join(species_by_bcode, by = "B.Code") %>%
#   count(P.Product, name = "papers_before")

# Step 1: Identify A.Level.Name (diet) that include Grazing
grazing_diets <- merged_metadata$Animals.Diet %>%
  filter(D.Type == "Grazing") %>%
  distinct(B.Code, A.Level.Name)

length(unique(grazing_diets$B.Code))
## [1] 63
# Step 2: Count how many non-grazing diets exist per B.Code
nongrazing_counts <- merged_metadata$Animals.Diet %>%
  filter(!(D.Type == "Grazing")) %>%
  distinct(B.Code, A.Level.Name) %>%
  group_by(B.Code) %>%
  summarise(n_nongrazing = n(), .groups = "drop")

length(unique(nongrazing_counts$B.Code))
## [1] 714
# Step 3: Keep only diets that are not marked as Grazing AND are in B.Codes with at least one non-grazing diet
merged_metadata$Animals.Diet <- merged_metadata$Animals.Diet %>%
  anti_join(grazing_diets, by = c("B.Code", "A.Level.Name")) %>%  # remove all diets that were grazing
  semi_join(nongrazing_counts, by = "B.Code")  # keep only papers with at least one non-grazing diet


species_by_bcode <- merged_metadata$Prod.Out %>%
  select(B.Code, P.Product) %>%
  distinct()

# 2. Papers per species before filtering
# papers_after <- merged_metadata$Animals.Diet %>%
#   select(B.Code) %>%
#   distinct() %>%
#   left_join(species_by_bcode, by = "B.Code") %>%
#   count(P.Product, name = "papers_after")

rm(grazing_diets,nongrazing_counts)
to_be_categorized<-merged_metadata$Animals.Diet%>%
  filter(is.na(D.Item))%>%
  filter(D.Type!="Entire Diet")

1.5) Summary of the data

# Define species of interest
species_list <- c("Cattle", "Sheep", "Goat")

# Initialize result containers
species_summary <- list()
bcode_table <- list()

# Loop over each species
for (species in species_list) {
  
  # All B.Codes for this species
  species_bcodes <- merged_metadata$Prod.Out %>%
    filter(P.Product == species) %>%
    pull(B.Code) %>%
    unique()
  
  # Feed intake B.Codes
  feed_data <- merged_metadata$Data.Out %>%
    filter(B.Code %in% species_bcodes, !is.na(ED.Mean.T)) %>%
    filter(Out.Subind=="Feed Intake")%>%
    select(B.Code, D.Item = ED.Intake.Item)
  
  feed_bcodes <- unique(feed_data$B.Code)
  
  # GE B.Codes
  ge_data <- merged_metadata$Animals.Diet.Comp %>%
    filter(B.Code %in% species_bcodes, DC.Variable == "GE", !is.na(DC.Value)) %>%
    select(B.Code, D.Item)
  
  ge_bcodes <- unique(ge_data$B.Code)
  
  # Matched B.Codes (join on both B.Code and D.Item)
  matched_bcodes <- inner_join(feed_data, ge_data, by = c("B.Code", "D.Item")) %>%
    pull(B.Code) %>%
    unique()
  
  # Store species-level summary
  species_summary[[species]] <- tibble(
    Species = species,
    Total_BCodes = length(species_bcodes),
    With_FEED = length(feed_bcodes),
    With_GE = length(ge_bcodes),
    With_BOTH = length(matched_bcodes)
  )
  
  # Store long-format bcode list
  bcode_table[[species]] <- tibble(
    B.Code = c(species_bcodes, feed_bcodes, ge_bcodes, matched_bcodes),
    Category = rep(c("All", "With_FEED", "With_GE", "With_BOTH"), 
                   times = c(length(species_bcodes), length(feed_bcodes), length(ge_bcodes), length(matched_bcodes))),
    Species = species
  ) %>% distinct()
}

# Bind results
species_summary_df <- bind_rows(species_summary)
bcode_table_df <- bind_rows(bcode_table)

2) Harmonize units of ingredient amounts

This step standardizes ingredient amount units (e.g., from mg, kg, %, etc.) to a consistent base (usually grams per individual per day), ensuring reliable downstream calculations and comparisons.

# Diet ingredients: breakdown of all ingredients within each diet
ingredients <- merged_metadata$Animals.Diet %>%
  mutate(D.Amount = as.numeric(D.Amount))

# Function to harmonize diet ingredient units (only change: kg/t handling)
harmonize_units <- function(df) {
  df %>%
    mutate(
      # Record original units
      Units = paste(D.Unit.Amount, D.Unit.Time, D.Unit.Animals, sep = ";"),

      # Harmonize weight and volume units to a consistent base
      D.Amount = case_when(
        D.Unit.Amount %in% c("kg", "mg", "g/100g", "l", "kg/t", "g/300l",
                             "mg/kg", "kg/100kg", "kg/100kg body weight",
                             "kg/kg metabolic weight", "g/kg metabolic weight (0.75)",
                             "g/kg DMI", "g/L", "g/kg DM") ~ {
          case_when(
            D.Unit.Amount == "kg" ~ D.Amount * 1000,
            D.Unit.Amount == "mg" ~ D.Amount / 1000,
            D.Unit.Amount == "g/100g" ~ D.Amount * 10,
            D.Unit.Amount == "l" ~ D.Amount * 1000,
            D.Unit.Amount == "kg/t" ~ D.Amount,
            D.Unit.Amount == "g/300l" ~ D.Amount / 300,
            D.Unit.Amount == "mg/kg" ~ D.Amount / 1000,
            D.Unit.Amount %in% c("kg/100kg", "kg/100kg body weight") ~ D.Amount * 10,
            D.Unit.Amount == "kg/kg metabolic weight" ~ D.Amount * 1000,
            D.Unit.Amount %in% c("g/kg body weight (0.75)", "g/kg Body Weight (0.75)") ~ D.Amount,
            D.Unit.Amount == "g/kg DMI" ~ D.Amount,
            D.Unit.Amount == "g/kg DM" ~ D.Amount,
            TRUE ~ D.Amount
          )
        },
        TRUE ~ D.Amount
      ),

      # Update D.Unit.Amount
      D.Unit.Amount = case_when(
        D.Unit.Amount == "kg" ~ "g",
        D.Unit.Amount == "mg" ~ "g",
        D.Unit.Amount == "g/100g" ~ "g/kg",
        D.Unit.Amount == "l" ~ "ml",
        D.Unit.Amount == "kg/t" ~ "g/kg",
        D.Unit.Amount == "g/300l" ~ "g/L",
        D.Unit.Amount == "mg/kg" ~ "g/kg",
        D.Unit.Amount %in% c("kg/100kg", "kg/100kg body weight") ~ "g/kg body weight",
        D.Unit.Amount %in% c("g/kg body weight (0.75)", "g/kg Body Weight (0.75)", "g/kg metabolic weight (0.75)") ~ "g/kg metabolic weight",
        D.Unit.Amount == "kg/kg metabolic weight" ~ "g/kg metabolic weight",
        D.Unit.Amount == "g/kg DMI" ~ "g/kg",
        D.Unit.Amount == "g/kg DM" ~ "g/kg",
        D.Unit.Amount == "g/L" ~ "g/L",
        D.Unit.Amount == "g/kg Body Weight" ~ "g/kg body weight",
        TRUE ~ D.Unit.Amount
      ),

      # Harmonize time units to day (excluding "individual", only changing "experiment")
    D.Amount = case_when(
  D.Unit.Time == "week" ~ D.Amount / 7,
  D.Unit.Time == "month" ~ D.Amount / 30,
  D.Unit.Time == "2 weeks" ~ D.Amount / 14,
  D.Unit.Time == "4 days interval" ~ D.Amount / 4,
  D.Unit.Time == "2x/day" ~ D.Amount * 2,
  TRUE ~ D.Amount  # Leave "experiment" and others untouched
),

      

      D.Unit.Time = case_when(
        D.Unit.Time %in% c("week", "month", "2 weeks", "4 days interval", "2x/day") ~ "day",
        D.Unit.Time == "experiment" ~ D.Unit.Time,
        TRUE ~ D.Unit.Time
      ),

      # Remove units if amount is NA
      D.Unit.Amount = ifelse(is.na(D.Amount), NA_character_, D.Unit.Amount),
      D.Unit.Time = ifelse(is.na(D.Amount), NA_character_, D.Unit.Time),
      D.Unit.Animals = ifelse(is.na(D.Amount), NA_character_, D.Unit.Animals)
    )
}

# Apply harmonization
ingredients <- harmonize_units(ingredients)

# Load replicate info
reps <- merged_metadata$MT.Out %>%
  mutate(T.Animals = as.numeric(T.Animals)) %>%
  distinct(B.Code, A.Level.Name, .keep_all = TRUE)

# Adjust for 'all animals in replicate'
ingredients <- ingredients %>%
  left_join(reps %>% dplyr::select(B.Code, A.Level.Name, T.Animals), by = c("B.Code", "A.Level.Name")) %>%
  mutate(
    D.Amount = case_when(
      D.Unit.Animals == "all animals in replicate" & !is.na(T.Animals) ~ D.Amount / T.Animals,
      TRUE ~ D.Amount
    ),
    D.Unit.Animals = ifelse(D.Unit.Animals == "all animals in replicate", "individual", D.Unit.Animals)
  )

# Refresh Units column
ingredients <- ingredients %>%
  mutate(Units = paste(D.Unit.Amount, D.Unit.Time, D.Unit.Animals, sep = ";"))

# Cleanup
rm(harmonize_units,reps,feed_data)

2.1) Suspicious dry amounts TO DO ADD IN ERA DEV

This chunk systematically fills in missing “DC.Is.Dry” flags by combining three complementary sources—analytical moisture/DM thresholds, name‐based cues (e.g. “dried,” “meal”), and broad feed/supplement rules—and then, where every measurable ingredient in a treatment is already known to be dry, assumes the lab used the same dry‐prep protocol for all feeds in that trial. By layering data‑driven thresholds, text mining, and a diet‑level consistency override, we ensure that no ingredient in a truly dried regimen slips through unflagged, while still preserving fresh‐vs.‐dry distinctions in mixed rations.

# ------------------------------------------------------------
# DRY flag with strict rule:
# 1) Use Moisture/DM thresholds when available (authoritative).
# 2) If NO Moisture/DM for an item, assume DRY ("Yes").
# 3) Diet-arm propagation: if all flagged items in the arm are "Yes",
#    fill any remaining NA in that arm with "Yes".
# 4) Drop "water" rows.
# No name/blanket/majority heuristics.
# ------------------------------------------------------------

# --- 1) Moisture/DM-based evidence per (B.Code, D.Item) ---------------------
dry_evidence <- bind_rows(
  # a) Moisture rules
  merged_metadata$Animals.Diet.Comp %>%
    filter(str_to_lower(DC.Variable) %in% c("moisture","moist")) %>%
    transmute(
      B.Code, D.Item,
      Predictor = "Moisture",
      Value = suppressWarnings(as.numeric(DC.Value)),
      Unit  = str_to_lower(str_squish(DC.Unit)),
      Dry_Prediction = dplyr::case_when(
        Unit %in% c("%","% dm","g/100g") ~ dplyr::case_when(
          Value <  15 ~ TRUE,   # low moisture ⇒ dry
          Value >  50 ~ FALSE,  # very wet     ⇒ not dry
          TRUE        ~ NA
        ),
        Unit == "g/kg" ~ dplyr::case_when(
          Value < 150 ~ TRUE,
          Value > 500 ~ FALSE,
          TRUE        ~ NA
        ),
        TRUE ~ NA
      )
    ),
  # b) DM rules
  merged_metadata$Animals.Diet.Comp %>%
    filter(str_to_lower(DC.Variable) == "dm") %>%
    transmute(
      B.Code, D.Item,
      Predictor = "DM",
      Value = suppressWarnings(as.numeric(DC.Value)),
      Unit  = str_to_lower(str_squish(DC.Unit)),
      Dry_Prediction = dplyr::case_when(
        Unit %in% c("%","% dm","g/100g") ~ dplyr::case_when(
          Value >  85 ~ TRUE,   # high DM ⇒ dry
          Value <  50 ~ FALSE,  # low  DM ⇒ not dry
          TRUE        ~ NA
        ),
        Unit == "g/kg" ~ dplyr::case_when(
          Value > 850 ~ TRUE,
          Value < 500 ~ FALSE,
          TRUE        ~ NA
        ),
        TRUE ~ NA
      )
    )
) %>%
  # keep one prediction per predictor per key (first non-NA)
  distinct(B.Code, D.Item, Predictor, Dry_Prediction) %>%
  group_by(B.Code, D.Item, Predictor) %>%
  summarise(Dry_Prediction = dplyr::first(na.omit(Dry_Prediction)), .groups = "drop") %>%
  # wide: Dry_Prediction_Moisture, Dry_Prediction_DM
  tidyr::pivot_wider(
    names_from = Predictor,
    values_from = Dry_Prediction,
    names_prefix = "Dry_Prediction_"
  ) %>%
  mutate(
    has_evidence = !is.na(Dry_Prediction_Moisture) | !is.na(Dry_Prediction_DM),
    # precedence: DM beats Moisture when both exist
    Dry_from_evidence = dplyr::case_when(
      !is.na(Dry_Prediction_DM)        ~ Dry_Prediction_DM,
      is.na(Dry_Prediction_DM) &
        !is.na(Dry_Prediction_Moisture) ~ Dry_Prediction_Moisture,
      TRUE                              ~ NA
    )
  ) %>%
  select(B.Code, D.Item, has_evidence, Dry_from_evidence)

# --- 2) Apply evidence to ingredients ---------------------------------------
ingredients <- ingredients %>%
  left_join(dry_evidence, by = c("B.Code","D.Item")) %>%
  mutate(
    # Evidence (DM/Moisture) is authoritative when present
    DC.Is.Dry = dplyr::case_when(
      Dry_from_evidence == TRUE  ~ "Yes",
      Dry_from_evidence == FALSE ~ "No",
      TRUE                       ~ DC.Is.Dry
    )
  )

# --- 3) Diet-level propagation (only fills NA when all flagged in arm are Yes)
ingredients <- ingredients %>%
  group_by(B.Code, A.Level.Name) %>%
  mutate(
    any_flag   = any(!is.na(DC.Is.Dry), na.rm = TRUE),
    all_dry    = all(DC.Is.Dry[!is.na(DC.Is.Dry)] == "Yes")
  ) %>%
  ungroup() %>%
  mutate(
    DC.Is.Dry = dplyr::case_when(
      any_flag & all_dry & is.na(DC.Is.Dry) ~ "Yes",
      TRUE                                  ~ DC.Is.Dry
    )
  ) %>%
  select(-any_flag, -all_dry)

# --- 4) Fallback: assume DRY only when NO Moisture/DM evidence --------------
# (i.e., we cannot decide because there is no proximate data)
ingredients <- ingredients %>%
  mutate(
    DC.Is.Dry = dplyr::case_when(
      is.na(DC.Is.Dry) & (has_evidence %in% c(FALSE, NA)) ~ "Yes",
      TRUE                                                ~ DC.Is.Dry
    )
  ) %>%
  select(-has_evidence, -Dry_from_evidence)

# --- 5) Drop pure "water" rows ----------------------------------------------
ingredients <- ingredients %>%
  filter(!(str_to_lower(D.Item) == "water"))

Subset for Dry data

When feed intake data is missing and we rely on ingredient-level amounts, we need to ensure that the quantities used refer to the dry form of the ingredient. To do this, we subset the data to include only ingredients identified as dried—based on either the declared DC.Is.Dry value or our earlier flagging using ingredient names and moisture content predictions. This ensures consistency and accuracy in downstream calculations.

#take out unspecified ingredients

species_by_bcode <- merged_metadata$Prod.Out %>%
  select(B.Code, P.Product) %>%
  distinct()

# 2. Papers per species before filtering
# papers_before <- ingredients %>%
#   select(B.Code) %>%
#   distinct() %>%
#   left_join(species_by_bcode, by = "B.Code") %>%
#   count(P.Product, name = "papers_before")
# length(unique(ingredients$B.Code))


# Step 1: Identify A.Level.Name groups with at least one ingredient NOT dry
non_dry_diets <- ingredients %>%
  filter(is.na(DC.Is.Dry) | str_to_lower(DC.Is.Dry) %in% c("no", "n", "unspecified")) %>%
  distinct(B.Code, A.Level.Name)

# Step 2: Filter out these diets entirely from ingredients
ingredients <- ingredients %>%
  anti_join(non_dry_diets, by = c("B.Code", "A.Level.Name"))

length(unique(ingredients$A.Level.Name))
## [1] 1903
rm(non_dry_diets)

length(unique(ingredients$B.Code))
## [1] 610
species_by_bcode <- merged_metadata$Prod.Out %>%
  select(B.Code, P.Product) %>%
  distinct()

#2. Papers per species before filtering
papers_after <- ingredients %>%
  select(B.Code) %>%
  distinct() %>%
  left_join(species_by_bcode, by = "B.Code") %>%
  count(P.Product, name = "papers_after")

2.2) Filter data with acceptable units

We filter the dataset to retain only ingredients with acceptable units. Uncommon or unexpected unit formats are excluded to ensure consistency and simplify downstream processing

# Define a list of acceptable units
acceptable_units <- c(
  "g/kg", "g", "%", "ml", "L", "kg", "g/L",
  "g/kg metabolic weight", "% Diet", 
  "g/kg body weight", "ml/kg", "% Body Mass", 
  "% Concentrate", "g/kg body weight"  # leave only one if repeated
)


# Filter rows to keep only acceptable units
ingredients <- ingredients %>%
  filter(D.Unit.Amount %in% acceptable_units | is.na(D.Unit.Amount)) %>%
  mutate(D.Item = ifelse(is.na(D.Item), "Unspecified", D.Item))

#tracking paper loss because of harmnozied units
#total_studies_harmonized<- length(unique(ingredients_harmonized$B.Code))
#total_ingredients_harmonized<- length(unique(ingredients_harmonized$D.Item))
#total_diets_harmonized<- length(unique(ingredients_harmonized$A.Level.Name))

rm(acceptable_units)

2.3) Convert body weight units by linking animals weight to data

# Pre-filter Data.Out to keep only the first non-NA weight per group
# Step 1: From Data.Out — compute unique weights
weights_data <- merged_metadata$Data.Out %>%
  filter(Out.Subind == "Weight Gain") %>%
  mutate(
    Out.WG.Unit = tolower(Out.WG.Unit),
    Out.WG.Start = case_when(
      Out.WG.Unit == "g" ~ Out.WG.Start / 1000,
      Out.WG.Unit == "mg" ~ Out.WG.Start / 1e6,
      Out.WG.Unit %in% c("kg", "unspecified", NA) ~ Out.WG.Start,
      TRUE ~ NA_real_
    ),
    Out.WG.Unit = ifelse(!is.na(Out.WG.Start), "kg", NA_character_)
  ) %>%
  filter(!is.na(Out.WG.Start)) %>%
  group_by(B.Code, A.Level.Name) %>%
  summarise(
    Out.WG.Start = mean(Out.WG.Start, na.rm = TRUE),
    Out.WG.Unit = "kg",
    .groups = "drop"
  )

# Step 2: From MT.Out — extract matching weight data
weights_mt <- merged_metadata$MT.Out %>%
  select(B.Code, A.Level.Name, Out.WG.Start, Out.WG.Unit) %>%
  filter(!is.na(Out.WG.Start)) %>%
  mutate(
    Out.WG.Start = as.numeric(Out.WG.Start),              # <- force numeric
    Out.WG.Unit = tolower(as.character(Out.WG.Unit)),     # <- force character and lowercase
    Out.WG.Start = case_when(
      Out.WG.Unit == "g" ~ Out.WG.Start / 1000,
      Out.WG.Unit == "mg" ~ Out.WG.Start / 1e6,
      Out.WG.Unit %in% c("kg", "unspecified", NA) ~ Out.WG.Start,
      TRUE ~ NA_real_
    ),
    Out.WG.Unit = ifelse(!is.na(Out.WG.Start), "kg", NA_character_)
  ) %>%
  distinct(B.Code, A.Level.Name, .keep_all = TRUE)


# Step 3: Combine both sources, then de-duplicate (keep priority if desired)
weights_unique <- bind_rows(weights_data, weights_mt) %>%
  arrange(B.Code, A.Level.Name) %>%
  distinct(B.Code, A.Level.Name, .keep_all = TRUE)




# Step 3: Join body weight and keep only ingredients with weight
ingredients <- ingredients %>%
  left_join(
    weights_unique,
    by = c("B.Code", "A.Level.Name")) 


# Step 4: Convert D.Amount to grams based on body weight and update unit accordingly
ingredients <- ingredients %>%
  mutate(
    # Recalculate D.Amount using body weight or metabolic weight, result in kg
    D.Amount = case_when(
      D.Unit.Amount == "g/kg body weight" & !is.na(Out.WG.Start) ~ (D.Amount * Out.WG.Start) / 1000,
      D.Unit.Amount == "g/kg metabolic weight" & !is.na(Out.WG.Start) ~ (D.Amount * Out.WG.Start^0.75) / 1000,
      D.Unit.Amount == "% Body Mass" & !is.na(Out.WG.Start) ~ ((D.Amount / 100) * Out.WG.Start),
      TRUE ~ D.Amount  # unchanged if not convertible
    ),

    # Update unit to "kg" if recalculated
    D.Unit.Amount = case_when(
      D.Unit.Amount %in% c("g/kg body weight", "g/kg metabolic weight", "% Body Mass") &
        !is.na(Out.WG.Start) ~ "kg",
      TRUE ~ D.Unit.Amount
    )
  )

2.4) Estimate amounts using feed intake

For ingredients reported in relative units such as %, % Diet, or g/kg, we can estimate their absolute amounts when total diet feed intake is available. This allows us to convert proportions into standardized quantities.

missing_unit_feed_intake<- merged_metadata$Data.Out %>%
  select(B.Code, A.Level.Name, ED.Intake.Item, ED.Intake.Item.Raw,
         ED.Mean.T, ED.Error, Out.Unit, Out.Subind,is_entire_diet,Source,Out.Code.Joined) %>%
  filter(Out.Subind == "Feed Intake") %>%
  filter(is.na(Out.Unit))

2.4.1) Harmonization of feed intake data

feed_intake <- merged_metadata$Data.Out %>%
  select(B.Code, A.Level.Name, ED.Intake.Item, ED.Intake.Item.Raw,
         ED.Mean.T, ED.Error, Out.Unit, Out.Subind,
         is_entire_diet, is_group, T.Animals) %>%
  filter(Out.Subind == "Feed Intake") %>%
  filter(!is.na(ED.Mean.T)) %>%
  ## ───────────────────────────────────────────────────────────────────────
  mutate(
    is_entire = is.na(ED.Intake.Item.Raw) | ED.Intake.Item.Raw == "",
    ED.Intake.Item.Raw = if_else(is_entire, "Entire Diet", ED.Intake.Item.Raw),
    ED.Intake.Item     = if_else(is_entire, "Entire Diet", ED.Intake.Item)
  ) %>%
  ## (optionally) drop the helper flag
  select(-is_entire) %>%    
  ## ───────────────────────────────────────────────────────────────────────
  mutate(
    Out.Unit = tolower(trimws(Out.Unit)),
    Out.Unit.Animals = str_extract(Out.Unit,
                                   "\\b(individual|replicate|animal replicate)\\b"),
    Out.Unit.Time    = str_extract(Out.Unit,
                                   "\\b(day|d|yr|year|experiment)\\b"),
    Out.Unit.Amount  = Out.Unit %>%
      str_remove_all("\\b(individual|replicate|animal replicate)\\b") %>%
      str_remove_all("\\b(day|d|yr|year)\\b") %>%
      str_replace_all("/+", "/") %>%
      str_trim() %>%
      str_remove("/$")
  )



# Step 3: Convert replicate-level intake to individual-level when possible
feed_intake <- feed_intake %>%
  mutate(
    # Ensure numeric type
    ED.Mean.T = as.numeric(ED.Mean.T),
    T.Animals = as.numeric(T.Animals),

    # Convert intake and update unit
    ED.Mean.T = case_when(
      Out.Unit.Animals == "replicate" & !is.na(T.Animals) & T.Animals > 0 ~ ED.Mean.T / T.Animals,
      TRUE ~ ED.Mean.T
    ),
    Out.Unit.Animals = case_when(
      Out.Unit.Animals == "replicate" & !is.na(T.Animals) & T.Animals > 0 ~ "individual",
      TRUE ~ Out.Unit.Animals
    )
  )
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `T.Animals = as.numeric(T.Animals)`.
## Caused by warning:
## ! NAs introduced by coercion
# Define preferred units
preferred_units <- c("kg dm", "g dm")
preferred_time <- "day"
preferred_animals <- "individual"

# Create filtered table with prioritization
feed_intake <- feed_intake %>%
  mutate(
    priority = case_when(
      Out.Unit.Amount %in% preferred_units &
        Out.Unit.Time == preferred_time &
        Out.Unit.Animals == preferred_animals &
        !is.na(ED.Error) ~ 1,
      Out.Unit.Amount %in% preferred_units &
        Out.Unit.Time == preferred_time &
        Out.Unit.Animals == preferred_animals ~ 2,
      TRUE ~ 3
    )
  ) %>%
  group_by(B.Code, A.Level.Name, ED.Intake.Item) %>%
  slice_min(order_by = priority, with_ties = FALSE) %>%
  ungroup() %>%
  select(-priority)  # optional



### keep unique units of ingredients 


# Step 3: Join body weight
feed_intake <- feed_intake %>%
  left_join(
    weights_unique %>%
      select(B.Code, A.Level.Name, Out.WG.Start, Out.WG.Unit),
    by = c("B.Code", "A.Level.Name")
  ) %>%
  group_by(A.Level.Name) %>%
  mutate(
    Out.WG.Start = ifelse(is.na(Out.WG.Start),
                          first(na.omit(Out.WG.Start)),
                          Out.WG.Start)
  ) %>%
  ungroup() %>%
  distinct()

# Step 4: Convert ED.Mean.T to kg/day
feed_intake <- feed_intake %>%
  mutate(
    Out.Unit = trimws(tolower(Out.Unit)),
    Out.Unit.Amount = trimws(str_remove(Out.Unit.Amount, "/$")),
    
    # Convert ED.Mean.T to kg/day where possible
    ED.Mean.T = case_when(
      Out.Unit.Amount %in% c("g", "g dm") ~ ED.Mean.T / 1000,
      Out.Unit.Amount %in% c("kg", "kg dm") ~ ED.Mean.T,
      Out.Unit.Amount == "g dm/100 g" ~ ED.Mean.T * 10 / 1000,
      Out.Unit.Amount == "g/week" ~ ED.Mean.T / 7 / 1000,
      Out.Unit.Amount == "mg" ~ ED.Mean.T / 1e6,
      
      # Body weight–based conversions only if Out.WG.Start is not NA
      Out.Unit.Amount %in% c("g dm/kg metabolic weight", "g/kg metabolic weight", "g dm/kg metabolic weight") & !is.na(Out.WG.Start) ~ 
        (ED.Mean.T * Out.WG.Start^0.75) / 1000,

      Out.Unit.Amount == "kg dm/kg metabolic weight" & !is.na(Out.WG.Start) ~ 
        ED.Mean.T * Out.WG.Start^0.75,

      Out.Unit.Amount %in% c("g dm/kg body weight", "g/kg body weight", "g/g body weight") & !is.na(Out.WG.Start) ~ 
        (ED.Mean.T * Out.WG.Start) / 1000,

      Out.Unit.Amount %in% c("g/kg body weight (0.75)", "g dm/kg body weight (0.75)") & !is.na(Out.WG.Start) ~ 
        (ED.Mean.T * Out.WG.Start^0.75) / 1000,

      Out.Unit.Amount %in% c("% body weight", "dm % body weight") & !is.na(Out.WG.Start) ~ 
        (ED.Mean.T / 100) * Out.WG.Start,

      TRUE ~ ED.Mean.T  # fallback: leave unchanged
    ),

    # Only update unit if we were able to convert to kg
    Out.Unit.Amount = case_when(
      Out.Unit.Amount %in% c("g", "g dm", "g dm/100 g", "g/week", "mg") ~ "kg",
      Out.Unit.Amount %in% c("g/kg body weight", "g dm/kg body weight", "g/g body weight") & !is.na(Out.WG.Start) ~ "kg",
      Out.Unit.Amount %in% c("g/kg metabolic weight", "g dm/kg metabolic weight", "g dm/kg metabolic weight") & !is.na(Out.WG.Start) ~ "kg",
      Out.Unit.Amount %in% c("kg dm/kg metabolic weight") & !is.na(Out.WG.Start) ~ "kg",
      Out.Unit.Amount %in% c("g/kg body weight (0.75)", "g dm/kg body weight (0.75)") & !is.na(Out.WG.Start) ~ "kg",
      Out.Unit.Amount %in% c("% body weight", "dm % body weight") & !is.na(Out.WG.Start) ~ "kg",
      TRUE ~ Out.Unit.Amount  # keep the original if no conversion
    )
  ) %>%
  select(-Out.Unit)

# Retain most interpretable row per intake item
feed_intake <- feed_intake %>%
  group_by(B.Code, A.Level.Name, ED.Intake.Item.Raw) %>%
  arrange(
    desc(!is.na(ED.Mean.T)), 
    desc(Out.Unit.Amount == "kg"),
    desc(Out.Unit.Animals == "individual"),
    desc(Out.Unit.Time == "day")
  ) %>%
  slice(1) %>%
  ungroup()

2.4.2) Unit error check on feed intake

Performs basic outlier checks on feed intake data, identifying suspiciously high or low intake values based on predefined unit thresholds.

# Assuming your data is stored in a data frame named `feed_intake_data`

# Filter for units that imply kg/day/individual and check if ED.Mean.T > 50
# Identify suspicious feed intake values
## ---- detect_suspicious_feed_intake -----------------------------------------
# Thresholds
high_kg <- 50      # > 50 kg d-1 animal-1  ⟹ unrealistically high
low_kg  <- 0.1     # < 0.1 kg d-1 animal-1 ⟹ unrealistically low
low_g_min <- 1     # 1–20 g range unlikely for daily intake
low_g_max <- 20

suspicious_feed_intake <- feed_intake %>% 
  mutate(
    flag = case_when(
      Out.Unit.Amount %in% c("kg", "kg dm") &
      Out.Unit.Time   == "day"              &
      Out.Unit.Animals == "individual" & ED.Mean.T >  high_kg ~ "too_high_kg",

      Out.Unit.Amount %in% c("kg", "kg dm") &
      Out.Unit.Time   == "day"              &
      Out.Unit.Animals == "individual" & ED.Mean.T <  low_kg  ~ "too_low_kg",

      Out.Unit.Amount %in% c("g",  "g dm")  &
      ED.Mean.T >= low_g_min & ED.Mean.T <= low_g_max          ~ "implausibly_low_g",

      TRUE ~ NA_character_
    )
  ) %>% 
  filter(!is.na(flag)) %>% 
  select(B.Code, A.Level.Name, ED.Mean.T,
         Out.Unit.Amount, Out.Unit.Time, Out.Unit.Animals, flag)

print(suspicious_feed_intake)
## # A tibble: 606 × 7
##    B.Code A.Level.Name ED.Mean.T Out.Unit.Amount Out.Unit.Time Out.Unit.Animals
##    <chr>  <chr>            <dbl> <chr>           <chr>         <chr>           
##  1 AG0051 NG              0.0083 kg              day           individual      
##  2 AG0051 NGD             0.0082 kg              day           individual      
##  3 AG0051 PG              0.0091 kg              day           individual      
##  4 AG0051 PGD             0.0092 kg              day           individual      
##  5 AN0007 T1              0.0696 kg              day           individual      
##  6 AN0007 T2              0.0735 kg              day           individual      
##  7 AN0007 T3              0.0808 kg              day           individual      
##  8 AN0007 T4              0.0769 kg              day           individual      
##  9 AN0007 T5              0.0802 kg              day           individual      
## 10 AN0020 Control T1      0.0503 kg              day           individual      
## # ℹ 596 more rows
## # ℹ 1 more variable: flag <chr>
low_intake_codes <- feed_intake %>% 
  filter(
    # (a) < 0.1 kg / day / individual
    (Out.Unit.Amount %in% c("kg", "kg dm") &
     Out.Unit.Time   == "day" &
     Out.Unit.Animals == "individual" &
     ED.Mean.T < 0.1) |
    
    # (b) 1–20 g reported in gram units
    (Out.Unit.Amount %in% c("g", "g dm") &
     ED.Mean.T >= 1 & ED.Mean.T <= 20)
  ) %>% 
  distinct(B.Code) %>%           # keep each code only once
  pull()                         # return as a plain character vector

low_intake_codes
##   [1] "AG0051"     "AN0007"     "AN0020"     "B01063"     "BO1012"    
##   [6] "BO1015"     "BO1017"     "BO1030"     "BO1042"     "BO1048"    
##  [11] "BO1061"     "BO1062"     "BO1064"     "BO1075"     "BO1095"    
##  [16] "BO1099"     "CJ1001"     "CJ1006"     "DK0037"     "EM1000"    
##  [21] "EM1049"     "EM1060"     "EM1062"     "EM1065"     "EM1066"    
##  [26] "EM1068"     "EM1070"     "EM1077"     "EM1081"     "EM1092"    
##  [31] "EM1097"     "EM1100"     "EO0032"     "EO0047"     "EO0086"    
##  [36] "EO0102"     "HK0005"     "HK0007"     "HK0011"     "HK0014"    
##  [41] "HK0017"     "HK0037"     "HK0045"     "HK0066.2"   "HK0067"    
##  [46] "HK0102.2"   "HK0118"     "HK0228"     "HK0260"     "HK0319"    
##  [51] "HK0331"     "HK0333"     "HK0337"     "JO1001"     "JO1010"    
##  [56] "JO1015"     "JO1020"     "JO1022"     "JO1023"     "JO1025"    
##  [61] "JO1026"     "JO1027"     "JO1033"     "JO1034"     "JO1036"    
##  [66] "JO1038"     "JO1040"     "JO1041"     "JO1042"     "JO1044"    
##  [71] "JO1057"     "JO1068"     "JO1069"     "JO1070"     "JO1071"    
##  [76] "JO1074"     "JO1075"     "JO1077"     "JO1078"     "JS0047"    
##  [81] "JS0048"     "JS0076"     "JS0079.1"   "JS0093"     "JS0095"    
##  [86] "JS0123"     "JS0124"     "JS0125"     "JS0134"     "JS0135"    
##  [91] "JS0139"     "JS0177"     "JS0192"     "JS0193"     "JS0206"    
##  [96] "JS0207"     "JS0208"     "JS0209"     "JS0281"     "JS0291"    
## [101] "JS0295"     "JS0296"     "JS0309"     "JS0342"     "JS0370"    
## [106] "JS0371"     "JS0393"     "LM0120"     "LM0266"     "LM0277.1"  
## [111] "LM0289"     "LM0310"     "LM0312"     "LM0334"     "LM0335"    
## [116] "LM0354"     "NJ0003"     "NJ0047"     "NN0021"     "NN0083"    
## [121] "NN0085"     "NN0098"     "NN0108"     "NN0114"     "NN0115"    
## [126] "NN0118"     "NN0119.2"   "NN0189"     "NN0198"     "NN0201"    
## [131] "NN0205"     "NN0211"     "NN0212"     "NN0214"     "NN0230"    
## [136] "NN0251"     "NN0272.1"   "NN0272.1.2" "NN0272.2"   "NN0321"    
## [141] "NN0325"     "NN0326"     "NN0358"     "NN0359"     "NN0367"    
## [146] "NN0377"     "NN0396"

Some papers were identified with errors. This section corrects those error if they are still present before the next update of the data.

# Step: Convert suspiciously high kg values that are likely mislabelled g
feed_intake <- feed_intake %>%
  mutate(
    Out.Unit.Amount = case_when(
      Out.Unit.Amount == "kg" & Out.Unit.Animals == "individual" & Out.Unit.Time == "day" & ED.Mean.T > 50 ~ "g",  # correct mislabeling
      TRUE ~ Out.Unit.Amount
    )
  )

2.4.3) Use of feed intake for ingredient amounts

If feed intake data is there for the entire diet and that the ingredients in ingredients dataset are in % or % of diet or g/kg we can convert

When we translate relative inclusion rates (“% of diet”, “g kg⁻¹ diet”) into absolute grams of dry matter, the correct denominator depends on how feed‑intake data are structured. The table below summarises every layout we encounter, what it looks like in the two raw data frames (feed_intake and ingredients), how the percentages should be interpreted, and therefore which intake value—total diet or group‑level mix—must be used for the conversion.

Scenario What you see in feed_intake What you see in ingredients What the % or g kg⁻¹ actually mean Correct denominator for conversion
A — No diet groups One row per diet level marked Entire Diet Every ingredient row has D.Item.Group = NA “%” = % of whole diet
“g/kg” = g per kg whole diet
Total‑diet intake (intake_total)
B — Mix is part of diet Two + rows per diet level:
is_group = TRUE for each mix (with its own intake)
is_entire_diet = TRUE
Rows inside the mix have D.Item.Group = mix name; stand‑alone ingredients have NA Inside‑mix “%” = % of mix
Stand‑alone “%” = % of whole diet
Inside‑mix → group intake (intake_group)
Stand‑alone → total‑diet intake
C — Mix is the whole diet Only one is_group = TRUE row and that diet level has no stand‑alone ingredients Every ingredient has the same D.Item.Group “%” = % of whole diet, even though recorded under a group label Total‑diet intake (because group = total)
D — Group intake duplicated in ingredients A row like D.Item == "Unspecified" in kg day⁻¹ that repeats the group’s intake Not a nutrient source → should be removed Drop the row entirely
E — Ingredient in feed_intake but not in ingredients A stand‑alone is_group = FALSE row Missing ingredient row We still need this ingredient and its intake Create a synthetic ingredient row with grams = ED.Mean.T × 1000
# Flag any row whose description is literally “Entire Diet”
feed_intake <- feed_intake %>% 
  mutate(
    is_entire_diet = ifelse(
      grepl("^\\s*entire\\s*diet\\s*$",
            tolower(coalesce(ED.Intake.Item, ED.Intake.Item.Raw, "")),
            ignore.case = TRUE),
      TRUE,
      is_entire_diet
    )
  )
# ─────────────────────────────────────────────────────────────────────────
# 0 ▸  Ensure we have a Source column (so we can tag “extracted” vs feed_intake)
# ─────────────────────────────────────────────────────────────────────────
ingredients <- ingredients %>%
  { if (!"Source" %in% names(.)) mutate(., Source = NA_character_) else . }

# ─────────────────────────────────────────────────────────────────────────
# 1 ▸  Pull diet‑level intakes
#    • total_intake = “Entire Diet” rows
#    • mix_intake   = rows flagged is_group
# ─────────────────────────────────────────────────────────────────────────
intake_total <- feed_intake %>%
  filter(is_entire_diet) %>%
  transmute(B.Code, A.Level.Name, total_intake = ED.Mean.T)

intake_group <- feed_intake %>%
  filter(is_group) %>%
  transmute(B.Code, A.Level.Name,
            D.Item.Group = ED.Intake.Item,
            mix_intake   = ED.Mean.T)

# ─────────────────────────────────────────────────────────────────────────
# 2 ▸  Drop “container” rows (scenario D)
# ─────────────────────────────────────────────────────────────────────────
group_lookup <- ingredients %>%
  filter(!is.na(D.Item.Group)) %>%
  distinct(B.Code, A.Level.Name, grp_name = str_trim(D.Item.Group))

ingredients <- ingredients %>%
  left_join(group_lookup, by = c("B.Code","A.Level.Name")) %>%
  filter(
    !(D.Item=="Unspecified"      &
      D.Unit.Amount=="kg"        &
      D.Unit.Time=="day"         &
      !is.na(grp_name)           &
      str_trim(D.Type)==grp_name)
  ) %>%
  select(-grp_name)
## Warning in left_join(., group_lookup, by = c("B.Code", "A.Level.Name")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 5 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# ─────────────────────────────────────────────────────────────────────────
# 3 ▸  Attach intakes & flag pure‑mix diets (scenarios A–C)
# ─────────────────────────────────────────────────────────────────────────
group_flag <- ingredients %>%
  left_join(intake_total, by = c("B.Code","A.Level.Name")) %>%
  left_join(intake_group, by = c("B.Code","A.Level.Name","D.Item.Group")) %>%
  group_by(B.Code,A.Level.Name) %>%
  summarise(one_mix = all(!is.na(D.Item.Group)) &&
                   n_distinct(D.Item.Group)==1,
            .groups="drop")
## Warning in left_join(., intake_total, by = c("B.Code", "A.Level.Name")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 8702 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
ingredients <- ingredients %>%
  left_join(group_flag, by = c("B.Code","A.Level.Name")) %>%
  left_join(intake_total, by = c("B.Code","A.Level.Name")) %>%
  left_join(intake_group, by = c("B.Code","A.Level.Name","D.Item.Group"))
## Warning in left_join(., intake_total, by = c("B.Code", "A.Level.Name")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 8702 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# ─────────────────────────────────────────────────────────────────────────
# 4 ▸  Convert “%” / “g/kg” → grams DM/day
#      A: no group → total_intake  
#      B: group part‑of‑diet → mix_intake for that ingredient  
#      C: pure‑mix → still use total_intake  
# ─────────────────────────────────────────────────────────────────────────
ingredients <- ingredients %>%
  mutate(
    base_intake = case_when(
      D.Unit.Amount %in% c("%","% Diet") &
        !is.na(mix_intake) & !one_mix ~ mix_intake,
      D.Unit.Amount %in% c("%","% Diet","g/kg") ~ total_intake,
      TRUE ~ NA_real_
    ),
    D.Amount = case_when(
      D.Unit.Amount %in% c("%","% Diet") & !is.na(base_intake) ~
        (D.Amount/100) * base_intake * 1000,
      D.Unit.Amount=="g/kg" & !is.na(base_intake) ~
        D.Amount * base_intake,
      TRUE ~ D.Amount
    ),
    D.Unit.Amount = if_else(
      D.Unit.Amount %in% c("%","% Diet","g/kg") & !is.na(base_intake),
      "g", D.Unit.Amount
    ),
    Source = coalesce(Source,"extracted"),
    basal_intake_status = NA_character_
  ) %>%
  select(-base_intake, -total_intake, -mix_intake, -one_mix)

# ─────────────────────────────────────────────────────────────────────────
# 5 ▸  Import truly missing items as basal diets (scenario E)
# ─────────────────────────────────────────────────────────────────────────
missing_basals <- feed_intake %>%
  filter(!is_group, !is_entire_diet) %>%
  anti_join(ingredients,
            by = c("B.Code","A.Level.Name","ED.Intake.Item"="D.Item"))

basal_ingredients <- missing_basals %>%
  transmute(
    B.Code,
    A.Level.Name      = paste0("Base_",A.Level.Name),
    D.Item            = ED.Intake.Item,
    D.Type            = "Imported_from_feed_intake",
    D.Amount          = case_when(
                         Out.Unit.Amount=="kg" ~ ED.Mean.T*1000,
                         Out.Unit.Amount=="g"  ~ ED.Mean.T,
                         TRUE                  ~ NA_real_
                       ),
    D.Unit.Amount     = "g",
    D.Unit.Time       = Out.Unit.Time,
    D.Unit.Animals    = Out.Unit.Animals,
    DC.Is.Dry         = NA,
    D.Ad.lib          = NA,
    D.Item.Group      = NA_character_,
    Units             = NA,
    T.Animals         = T.Animals,
    Prediction_Source = NA,
    Prediction_Source.from_AName = NA,
    Out.WG.Start      = Out.WG.Start,
    Out.WG.Unit       = Out.WG.Unit,
    Source            = "feed_intake",
    basal_intake_status = "imported"
  )

# ─────────────────────────────────────────────────────────────────────────
# 6 ▸  Combine & drop only imported basals that still lack an amount
# ─────────────────────────────────────────────────────────────────────────
ingredients <- bind_rows(ingredients, basal_ingredients)

3) Subset data

3.1) Take out basal diets

We excluded the basal diets because, in most of the livestock trials in our database, the basal ration is offered ad libitum and therefore lacks two pieces of information that are essential for a fair, productivity-based comparison: (i) actual feed-intake measurements and (ii) corresponding enteric-methane estimates. Without intake data we cannot convert emissions—or, in many cases, even estimate them—on a per-kilogram-of-product basis, and retaining these records would artificially inflate the number of “treatments” while contributing no usable information to the efficiency analyses. By focusing on the control diet (with measured intake) and the formulated treatment diets, we keep a coherent set of observations in which every data point can be expressed as both absolute emissions and emission intensity, allowing robust, like-for-like comparisons across studies.

ingredients <- ingredients %>%
  filter(!str_starts(A.Level.Name, "Base"))

ingredients <- ingredients %>%
  filter(!str_starts(D.Item.Group, "Base"))

merged_metadata$Animals.Diet.Comp <- merged_metadata$Animals.Diet.Comp %>%
  filter(!str_starts(D.Item, "Base"))

4) Replace feed intake data where available

We prioritize feed intake data over the amounts reported in the methods section, as feed intake is the key variable needed for emission calculations. When feed intake data is unavailable, we fall back on the harmonized ingredient amounts extracted earlier.

4.1) Feed intake data at the ingredient level

# Step 5: Remove feed intake entries that are actually full-diet labels
feed_intake_ingredients <- feed_intake %>%
  filter(
    str_trim(ED.Intake.Item) != str_trim(A.Level.Name) & 
    str_trim(ED.Intake.Item) != "Entire Diet" & 
    str_trim(ED.Intake.Item.Raw) != "Entire Diet"
  ) %>%
  filter(!is.na(ED.Mean.T))


# Step 6: Merge with ingredients and override with ED.Mean.T when available
# FIXED VERSION
# Merge with ingredients and override with ED.Mean.T when available
ingredients <- ingredients %>%
  left_join(
    feed_intake_ingredients %>%
      select(B.Code, A.Level.Name, ED.Intake.Item, ED.Mean.T, Out.Unit.Amount, Out.Unit.Animals, Out.Unit.Time),
    by = c("B.Code", "A.Level.Name", "D.Item" = "ED.Intake.Item")
  ) %>%
  mutate(
    from_feed_intake = !is.na(ED.Mean.T),
    D.Amount = if_else(from_feed_intake, ED.Mean.T, D.Amount),
    D.Unit.Amount = if_else(from_feed_intake & !is.na(Out.Unit.Amount), Out.Unit.Amount, D.Unit.Amount),
    D.Unit.Animals = if_else(from_feed_intake & !is.na(Out.Unit.Animals), Out.Unit.Animals, D.Unit.Animals),
    D.Unit.Time = if_else(from_feed_intake & !is.na(Out.Unit.Time), Out.Unit.Time, D.Unit.Time)
  ) %>%
  select(-ED.Mean.T, -Out.Unit.Amount, -Out.Unit.Animals, -Out.Unit.Time) %>%
  distinct()


# Step 8: Final conversion of unit to grams (g)
ingredients <- ingredients %>%
  mutate(
    D.Amount = case_when(
      D.Unit.Amount == "kg" ~ D.Amount * 1000,
      TRUE ~ D.Amount
    ),
    D.Unit.Amount = case_when(
      D.Unit.Amount == "kg" ~ "g",
      TRUE ~ D.Unit.Amount
    )
  )

4.2) Feed intake at the diet level

TO do : See BO1070 we want the full diet to be included from feed intake which is not the case now see feed intake data the goal would be to add rows in ingredient to add info at the entire diet level have a close look to make it the same format as entire diets already in ingredients

# 
# # --- 0 ▸ helper --------------------------------------------------------------
# # Convert relative (% / g kg⁻¹) units to grams given a denominator vector
# rel_to_abs <- function(df, denom) {
#   df %>% mutate(
#     D.Amount = case_when(
#       D.Unit.Amount %in% c("%", "% Diet") ~ (D.Amount / 100) * denom * 1000,
#      
#       TRUE                                   ~  D.Amount
#     ),
#     D.Unit.Amount = if_else(D.Unit.Amount %in% c("%", "% Diet"),
#                             "g", D.Unit.Amount)
#   )
# }
# 
# # ---------------------------------------------------------------------------
# #  1 ▸ build diet‑ and mix‑level intake look‑ups -----------------------------
# intake_total <- feed_intake %>%
#   filter(is_entire_diet) %>%
#   transmute(B.Code, A.Level.Name, diet_intake_kg = ED.Mean.T)
# 
# intake_group <- feed_intake %>%
#   filter(is_group) %>%
#   transmute(B.Code, A.Level.Name, D.Item.Group = ED.Intake.Item, mix_intake_kg = ED.Mean.T)
# 
# # ---------------------------------------------------------------------------
# #  2 ▸ organise diet structure ----------------------------------------------
# group_flag <- ingredients %>%
#   group_by(B.Code, A.Level.Name) %>%
#   summarise(first_group = first(na.omit(D.Item.Group)),
#             all_one_group = all(!is.na(D.Item.Group)) & n_distinct(D.Item.Group[!is.na(D.Item.Group)]) == 1,
#             .groups = "drop")
# 
# intake_denoms <- group_flag %>%
#   left_join(intake_total, by = c("B.Code", "A.Level.Name")) %>%
#   left_join(intake_group,  by = c("B.Code", "A.Level.Name", "first_group" = "D.Item.Group"))
# 
# # ---------------------------------------------------------------------------
# #  3 ▸ convert % / g kg⁻¹ rows -----------------------------------------------
# relative_rows <- ingredients %>% filter(D.Unit.Amount %in% c("%", "% Diet"))
# 
# ingredients_relative_fixed <- relative_rows %>%
#   left_join(intake_denoms, by = c("B.Code", "A.Level.Name")) %>%
#   mutate(den = coalesce(mix_intake_kg, diet_intake_kg)) %>%
#   rel_to_abs(denom = .$den) %>%
#   select(-den) %>%
#   mutate(Source = "converted")            # tag for precedence
# 
# # ---------------------------------------------------------------------------
# #  4 ▸ clone pure <Base> diets into each treatment ---------------------------
# basal_diets <- ingredients %>% filter(grepl("^Base$", A.Level.Name, ignore.case = TRUE))
# expand_basals <- function(code) {
#   trts <- ingredients %>% filter(B.Code == code & !grepl("^Base", A.Level.Name, ignore.case = TRUE)) %>% pull(A.Level.Name) %>% unique()
#   map_dfr(trts, \(trt) basal_diets %>% filter(B.Code == code) %>% mutate(A.Level.Name = trt))
# }
# basal_expanded <- basal_diets %>% pull(B.Code) %>% unique() %>% map_dfr(expand_basals)
# 
# # ---------------------------------------------------------------------------
# #  5 ▸ rows coming from feed_intake with prefixed diet name ------------------
# imported_match <- ingredients %>%
#   filter(Source == "feed_intake" & grepl("^Base_", A.Level.Name)) %>%
#   mutate(A.Level.Name = sub("^Base_", "", A.Level.Name))
# 
# # ---------------------------------------------------------------------------
# #  6 ▸ assemble with precedence ---------------------------------------------
# combine_tbl <- bind_rows(
#   imported_match,                                   # 1 author‑measured grams
#   ingredients_relative_fixed,                       # 2 converted grams (from originals)
#   ingredients %>% anti_join(relative_rows, by = c("B.Code","A.Level.Name","D.Item")), # 3 originals
#   basal_expanded                                    # 4 clones
# ) %>%
#   filter(!grepl("^Base_", A.Level.Name, ignore.case = TRUE))
# 
# # ---------------------------------------------------------------------------
# #  6b ▸ SECOND‑PASS conversion for cloned % rows --------------------------------
# # ---------------------------------------------------------------------------
# leftover_rel <- combine_tbl %>%
#   filter(D.Unit.Amount %in% c("%", "% Diet")) %>%
#   # drop any pre‑existing diet_intake columns to avoid .x/.y suffixes
#   select(-matches("^diet_intake_kg$")) %>%
#   left_join(intake_total, by = c("B.Code", "A.Level.Name")) %>%
#   mutate(
#     D.Amount = case_when(
#       D.Unit.Amount %in% c("%", "% Diet") ~ (D.Amount / 100) * diet_intake_kg * 1000,
#       TRUE                                   ~  D.Amount
#     ),
#     D.Unit.Amount = "g",
#     Source = ifelse(Source == "extracted", "converted", Source)
#   ) %>%
#   select(-diet_intake_kg)
# 
# combine_tbl <- combine_tbl %>%
#   anti_join(leftover_rel, by = c("B.Code", "A.Level.Name", "D.Item")) %>%
#   bind_rows(leftover_rel)
# 
# # precedence ---------------------------------------------------------------
# src_levels <- c("feed_intake", "converted", "extracted", "basal_expanded")
# 
# ingredients <- combine_tbl %>%
#   mutate(Source = factor(Source, levels = src_levels, ordered = TRUE)) %>%
#   arrange(B.Code, A.Level.Name, D.Item, Source) %>%
#   distinct(B.Code, A.Level.Name, D.Item, .keep_all = TRUE) %>%
#   ungroup() %>%
#   mutate(Source = as.character(Source),
#          Source = ifelse(is.na(Source), "basal_expanded", Source))
# 
# # ---------------------------------------------------------------------------
# #  7 ▸ single‑NA remainder fill  — **disabled at user request**
# # ---------------------------------------------------------------------------
# # The block that back‑calculates a missing basal amount has been removed for
# # now, so any ingredient still lacking `D.Amount` will remain NA.
# # ---------------------------------------------------------------------------
# 
# # ---------------------------------------------------------------------------
# #  8 ▸ integrity check -------------------------------------------------------
# qc_totals <- ingredients %>%
#   group_by(B.Code, A.Level.Name) %>%
#   summarise(sum_g = sum(D.Amount, na.rm = TRUE), .groups = "drop") %>%
#   left_join(intake_total, by = c("B.Code", "A.Level.Name")) %>%
#   mutate(rel_diff = abs(sum_g/1000 - diet_intake_kg) / diet_intake_kg) %>%
#   filter(rel_diff > 0.10)
# 
# if(nrow(qc_totals) > 0) warning("Diets with >10 % mass‑balance error present; see qc_totals.")
# 
# # ingredients_final is now harmonised (g DM d⁻¹) and ready for GE → CH₄
# ─────────────────────────────────────────────────────────────────────────────
# 1 ▸ diet-level intake DIRECTLY from feed_intake -----------------------------
feed_intake_diet <- feed_intake %>%
  filter(is_entire_diet) %>%                         # just the whole-diet rows
  transmute(B.Code, A.Level.Name,
            diet_intake_kg = ED.Mean.T,              # already in kg DM d-1
            Source = "feed_intake")

# ─────────────────────────────────────────────────────────────────────────────
# 2 ▸ diet-level intake CALCULATED from ingredients_final --------------------
diet_intake_calc <- ingredients %>%
  group_by(B.Code, A.Level.Name) %>%
  summarise(diet_intake_kg = sum(D.Amount, na.rm = TRUE) / 1000,  # g → kg
            .groups = "drop") %>%
  mutate(Source = "sum_ingredients")

# ─────────────────────────────────────────────────────────────────────────────
# 3 ▸ combine, giving precedence to the direct value -------------------------
feed_intake_diet <- feed_intake_diet %>%
  full_join(diet_intake_calc,
            by = c("B.Code", "A.Level.Name"),
            suffix = c("_feed", "_calc")) %>%
  transmute(
    B.Code, A.Level.Name,
    diet_intake_kg = coalesce(diet_intake_kg_feed, diet_intake_kg_calc),
    Source = ifelse(!is.na(diet_intake_kg_feed),
                    "feed_intake", "sum_ingredients")
  )

# diet_intake_final is ready to use

expand basal diets and see in feed intake if we have the basal diet for each treatment

library(dplyr)
library(stringr)
library(tidyr)
library(DT)
library(readr)

# --- Source data shortcuts ----------------------------------------------------
fi_raw <- merged_metadata$Data.Out %>% 
  filter(Out.Subind == "Feed Intake")

species_info <- merged_metadata$Prod.Out %>%
  distinct(B.Code, Species = P.Product)

diets_per_paper <- merged_metadata$Animals.Diet %>%
  distinct(B.Code, A.Level.Name) %>%
  count(B.Code, name = "n_diets")

# --- Helper: detect "Entire Diet" rows in feed intake ------------------------
fi_marked <- fi_raw %>%
  mutate(
    ED.Intake.Item   = str_squish(coalesce(ED.Intake.Item, "")),
    ED.Intake.Item.Raw = str_squish(coalesce(ED.Intake.Item.Raw, "")),
    is_entire_diet_row = 
      ED.Intake.Item == "" |
      str_detect(ED.Intake.Item,  regex("^entire\\s*diet$", ignore_case = TRUE)) |
      str_detect(ED.Intake.Item.Raw, regex("^entire\\s*diet$", ignore_case = TRUE))
  )

# --- Summaries per paper (B.Code) --------------------------------------------
fi_by_paper <- fi_marked %>%
  group_by(B.Code) %>%
  summarise(
    n_feed_rows           = n(),
    n_with_value          = sum(!is.na(ED.Mean.T)),
    n_with_unit           = sum(!is.na(Out.Unit)),
    n_entire_diet_rows    = sum(is_entire_diet_row, na.rm = TRUE),
    has_feed_intake       = any(!is.na(ED.Mean.T)),
    has_entire_diet_intake= any(is_entire_diet_row & !is.na(ED.Mean.T), na.rm = TRUE),
    .groups = "drop"
  )

all_papers <- merged_metadata$Animals.Diet %>% 
  distinct(B.Code) %>%
  left_join(species_info, by = "B.Code") %>%
  left_join(diets_per_paper, by = "B.Code") %>%
  left_join(fi_by_paper, by = "B.Code") %>%
  mutate(
    n_diets                = replace_na(n_diets, 0L),
    n_feed_rows            = replace_na(n_feed_rows, 0L),
    n_with_value           = replace_na(n_with_value, 0L),
    n_with_unit            = replace_na(n_with_unit, 0L),
    n_entire_diet_rows     = replace_na(n_entire_diet_rows, 0L),
    has_feed_intake        = replace_na(has_feed_intake, FALSE),
    has_entire_diet_intake = replace_na(has_entire_diet_intake, FALSE),
    Status = if_else(has_feed_intake, "Has feed-intake data", "No feed-intake data")
  ) %>%
  select(
    B.Code, Species, n_diets,
    Status,
    n_feed_rows, n_with_value, n_with_unit, n_entire_diet_rows,
    has_entire_diet_intake
  ) %>%
  arrange(Species, desc(Status), B.Code)

# --- Handy vectors if you need them elsewhere --------------------------------
with_feed_intake    <- all_papers %>% filter(Status == "Has feed-intake data") %>% pull(B.Code)
without_feed_intake <- all_papers %>% filter(Status == "No feed-intake data")  %>% pull(B.Code)

# --- Save CSV ----------------------------------------------------------------
write_csv(all_papers, "paper_feed_intake_summary.csv")

# --- Show interactive table --------------------------------------------------
datatable(
  all_papers,
  rownames = FALSE,
  filter = "top",
  extensions = c("Buttons"),
  options = list(
    pageLength = 25,
    dom = "Bfrtip",
    buttons = list("copy", "csv", "excel"),
    autoWidth = TRUE
  )
)

6) Complete the nutrition and digestibility papers

Not all ingredients and diets are currently included in the digestibility and composition tables—only those for which data is available. We aim to add the missing ones to help complete the dataset and enable more comprehensive analysis. when basal diet amount is expressed in % deduce first from the total and then see the feed intake vfalue

## 1 · ingredient‑level rows (what you already had)
items_tbl <- merged_metadata$Animals.Diet %>% 
  distinct(B.Code, A.Level.Name, D.Item, Source) %>% 
  mutate(
    D.Item         = coalesce(D.Item, A.Level.Name),   # fall back to level name
    kind           = "Item",
    is_group = FALSE
  ) %>% 
  select(B.Code, D.Item, kind,is_group, Source)

## 2 · group‑level rows (PP, “Cactus‑Acacia …”, etc.)
groups_tbl <- merged_metadata$Animals.Diet %>% 
  filter(!is.na(D.Item.Group)) %>%              # keep only lines that have a group tag
  distinct(B.Code, D.Item.Group, Source) %>% 
  mutate(
    kind           = "Group",
    is_group = TRUE) %>% 
  rename(D.Item = D.Item.Group) %>% 
  select(B.Code, D.Item, kind,is_group, Source)

## 3 · bind the two together and drop duplicates
all_diet_items <- bind_rows(items_tbl, groups_tbl) %>% distinct()



# --- Step 2: Get unique variables in the nutrition and digestibility datasets ---
nutrition_variables <- unique(na.omit(merged_metadata$Animals.Diet.Comp$DC.Variable))
digestibility_variables <- unique(na.omit(merged_metadata$Animals.Diet.Digest$DD.Variable))

# --- Step 3: Expand to full grid of all D.Item × variables (by B.Code and Source) ---
# Nutrition (Comp)
comp_missing <- all_diet_items %>%
  crossing(DC.Variable = nutrition_variables) %>%
  anti_join(
    merged_metadata$Animals.Diet.Comp %>% select(B.Code, D.Item,is_group, DC.Variable),
    by = c("B.Code", "D.Item", "DC.Variable","is_group")
  ) %>%
  mutate(
    DC.Value = NA_real_,
    DC.Unit = NA_character_
  )

# Digestibility (Digest)
digest_missing <- all_diet_items %>%
  crossing(DD.Variable = digestibility_variables) %>%
  anti_join(
    merged_metadata$Animals.Diet.Digest %>% select(B.Code, D.Item,is_group, DD.Variable),
    by = c("B.Code", "D.Item", "DD.Variable","is_group")
  ) %>%
  mutate(
    DD.Value = NA_real_,
    DD.Unit = NA_character_
  )

# --- Step 4: Bind the missing values into the existing datasets ---
merged_metadata$Animals.Diet.Comp <- bind_rows(merged_metadata$Animals.Diet.Comp, comp_missing)
merged_metadata$Animals.Diet.Digest <- bind_rows(merged_metadata$Animals.Diet.Digest, digest_missing)
# --- Step 5: Ensure each ingredient has a row for each nutrition variable ---
comp_completion <- merged_metadata$Animals.Diet.Comp %>%
  distinct(B.Code, D.Item, is_entire_diet, Source) %>%
  crossing(DC.Variable = nutrition_variables) %>%
  anti_join(
    merged_metadata$Animals.Diet.Comp %>% select(B.Code, D.Item, DC.Variable),
    by = c("B.Code", "D.Item", "DC.Variable")
  ) %>%
  mutate(
    DC.Value = NA_real_,
    DC.Unit = NA_character_
  )

# --- Step 6: Ensure each ingredient has a row for each digestibility variable ---
digest_completion <- merged_metadata$Animals.Diet.Digest %>%
  distinct(B.Code, D.Item, is_entire_diet, Source) %>%
  crossing(DD.Variable = digestibility_variables) %>%
  anti_join(
    merged_metadata$Animals.Diet.Digest %>% select(B.Code, D.Item, DD.Variable),
    by = c("B.Code", "D.Item", "DD.Variable")
  ) %>%
  mutate(
    DD.Value = NA_real_,
    DD.Unit = NA_character_
  )

# --- Step 7: Bind additional completion rows ---
merged_metadata$Animals.Diet.Comp <- bind_rows(merged_metadata$Animals.Diet.Comp, comp_completion)
merged_metadata$Animals.Diet.Digest <- bind_rows(merged_metadata$Animals.Diet.Digest, digest_completion)

rm(comp_completion,comp_missing,digest_completion,digest_missing)

##fill is entire diet column

library(dplyr)
library(stringr)

# Build lookups from Animals.Diet (ingredients and diets)
ingredient_lookup <- merged_metadata$Animals.Diet %>%
  transmute(B.Code,
            token = str_squish(str_to_lower(as.character(D.Item)))) %>%
  filter(token != "") %>%
  distinct() %>%
  mutate(is_ingredient_token = TRUE)

diet_lookup <- merged_metadata$Animals.Diet %>%
  transmute(B.Code,
            token = str_squish(str_to_lower(as.character(A.Level.Name)))) %>%
  filter(token != "") %>%
  distinct() %>%
  mutate(is_diet_token = TRUE)

# Helper to fill is_entire_diet in a given table (Comp or Digest)
fill_is_entire <- function(df) {
  df %>%
    mutate(
      D.Item_norm = str_squish(str_to_lower(as.character(D.Item))),
      is_entire_literal = str_detect(D.Item_norm, "^(entire|whole)\\s*diet$")
    ) %>%
    left_join(ingredient_lookup, by = c("B.Code", "D.Item_norm" = "token")) %>%
    left_join(diet_lookup,       by = c("B.Code", "D.Item_norm" = "token")) %>%
    mutate(
      is_entire_diet = case_when(
        is_entire_literal           ~ TRUE,   # literal Entire/Whole Diet
        !is.na(is_ingredient_token) ~ FALSE,  # matches ingredient
        !is.na(is_diet_token)       ~ TRUE,   # matches diet name
        TRUE                        ~ FALSE
      ),
      # Optional: if you keep is_group in these tables and want groups never entire:
      is_entire_diet = if ("is_group" %in% names(.)) if_else(is_group, FALSE, is_entire_diet) else is_entire_diet
    ) %>%
    select(-D.Item_norm, -is_entire_literal, -is_ingredient_token, -is_diet_token)
}

# Apply to both tables AFTER your completion/bind steps
merged_metadata$Animals.Diet.Comp   <- fill_is_entire(merged_metadata$Animals.Diet.Comp)
merged_metadata$Animals.Diet.Digest <- fill_is_entire(merged_metadata$Animals.Diet.Digest)
# Step 1: Get species for each B.Code
species_by_bcode <- merged_metadata$Prod.Out %>%
  distinct(B.Code, P.Product)

# Step 2: Filter Comp table for GE values
# ge_papers <- merged_metadata$Animals.Diet.Comp %>%
#   filter(DC.Variable == "GE", !is.na(DC.Value)) %>%
#   distinct(B.Code) %>%
#   left_join(species_by_bcode, by = "B.Code") %>%
#   count(P.Product, name = "papers_with_GE")
# 
# # View result
# ge_papers

6.2) Correct nutrition units

# ── 2.  Build a “variable → units” summary table ───────────────────────────────
table_units <- merged_metadata$Animals.Diet.Comp %>%
  # keep rows where a unit is actually recorded
  filter(!is.na(DC.Unit) & DC.Unit != "NA" & str_trim(DC.Unit) != "") %>%
  # one row per unique (variable, unit) pair
  distinct(DC.Variable, DC.Unit) %>%
  # collapse multiple units for the same variable into a single comma-separated
  group_by(DC.Variable) %>%
  summarise(units = paste(sort(unique(DC.Unit)), collapse = ", "),
            .groups = "drop") %>%
  arrange(DC.Variable)

# ── 3.  Inspect / export ───────────────────────────────────────────────────────
print(table_units)
## # A tibble: 62 × 2
##    DC.Variable units                                                            
##    <chr>       <chr>                                                            
##  1 ADF         %, % DM, g, g/100g, g/100g DM, g/kg, g/kg DM                     
##  2 ADL         %, % DM, g/100g, g/100g DM, g/kg, g/kg DM                        
##  3 Ash         %, % DM, g/100g, g/100g DM, g/kg, g/kg DM, Unspecified           
##  4 CF          %, % DM, g/kg, g/kg DM                                           
##  5 CFbr        %, % DM, g/100g DM, g/kg, g/kg DM                                
##  6 CFibre      %, g/100g, g/100g DM, g/kg, g/kg(Prosopis)/ % (diet), Unspecified
##  7 CH          %, % DM, g/kg DM                                                 
##  8 CP          %, % DM, %/(g/100g for pumpkin leaves), g, g/100g, g/100g DM, g/…
##  9 CT          %, % DM, g/kg, g/kg DM, mg/g                                     
## 10 Ca          %, % DM, g/100g, g/kg, g/kg DM                                   
## # ℹ 52 more rows
# write_csv(table_units, "variable_units_lookup.csv")   # if you want a file
###############################################################################
##  Fix the mis-labelled GE unit for four specific ingredients
##  *and* only in the paper NN0106
###############################################################################

bad_items <- c(
  "Biscuit Waste Meal Dried(Sun)",
  "Leucaena leucocephala Leaves Dried",
  "Maize",
  "Wheat Offal"
)

merged_metadata$Animals.Diet.Comp <- merged_metadata$Animals.Diet.Comp %>% 
  mutate(
    DC.Unit = if_else(
      D.Item      %in% bad_items      &   # right ingredient name
      B.Code  ==  "NN0106"        &   # right paper / source
      DC.Variable ==  "GE"            &   # only Gross Energy rows
      DC.Unit     ==  "kcal/g",           # only if label is wrong
      "kcal/kg",                          # corrected label
      DC.Unit                             
    )
  )
animals <- merged_metadata$Animals.Diet.Comp %>% mutate(value_num = suppressWarnings(as.numeric(DC.Value)))

# 1. nutrient (% / g-based) ➜ g / kg DM ========================================
vars_gkg <- c("NDF", "EE", "CP", "Ash")

conv_gkg <- tribble(
  ~DC.Unit,       ~factor,
  "%",             10,
  "% DM",          10,
  "g/100g",        10,
  "g/100g DM",     10,
  "g/kg",           1,
  "g/kg DM",        1
)

animals <- animals %>%
  left_join(conv_gkg, by = "DC.Unit") %>%
  mutate(
    value_std_gkg = if_else(DC.Variable %in% vars_gkg & !is.na(factor),
                            value_num * factor, NA_real_),
    unit_std_gkg  = if_else(!is.na(value_std_gkg), "g/kg", NA_character_)
  ) %>%
  select(-factor)

# 2. energy ➜ MJ / kg (DM) =====================================================
vars_MJ <- c("GE", "ME", "NE")

conv_MJ <- tribble(
  ~DC.Unit,       ~factorMJ,
  "MJ/kg",           1,
  "MJ/kg DM",        1,
  "MJ/100g",        10,
  "MJ/100g DM",     10,
  "MJ/g",        1e3,
  "kJ/kg",       0.001,
  "kJ/kg DM",    0.001,
  "kJ/100g",     0.01,
  "kJ/100g DM",  0.01,
  "kJ/g",           1,
  "J/mg",           1,          # J per mg ×1e-6 MJ/J ×1e6 mg/kg
  "kcal/kg",     0.004184,
  "kcal/kg DM",  0.004184,
  "kcal/100g",   0.04184,
  "kcal/g",      4.184,
  "Mcal/kg",     4.184,
  "Mcal/kg DM",  4.184
)

animals <- animals %>%
  left_join(conv_MJ, by = "DC.Unit") %>%
  mutate(
    value_std_MJ  = if_else(DC.Variable %in% vars_MJ & !is.na(factorMJ),
                            value_num * factorMJ, NA_real_),
    unit_std_MJ   = if_else(!is.na(value_std_MJ), "MJ/kg", NA_character_)
  ) %>%
  select(-factorMJ)

# 3. overwrite DC.Value / DC.Unit where a conversion succeeded -----------------
animals <- animals %>%
  mutate(
    DC.Value = coalesce(value_std_gkg, value_std_MJ, value_num) |> format(scientific = FALSE),
    DC.Unit  = coalesce(unit_std_gkg, unit_std_MJ, DC.Unit)
  ) %>%
  select(-value_num, -value_std_gkg, -value_std_MJ,
         -unit_std_gkg, -unit_std_MJ)

# 4. save back into the metadata list ------------------------------------------
merged_metadata$Animals.Diet.Comp <- animals
# ── END CHUNK ──────────────────────────────────────────────────────────────────

6.3) Coverage of nutritent

coverage_raw <- merged_metadata$Animals.Diet.Comp %>%
  filter(!is.na(DC.Variable)) %>%
  mutate(
    # Clean and detect actual missing values
    DC.Value_clean = str_trim(as.character(DC.Value)),
    DC.Value_clean = na_if(DC.Value_clean, ""),
    DC.Value_clean = na_if(DC.Value_clean, "NA"),
    DC.Value_clean = na_if(DC.Value_clean, "na")
  ) %>%
  group_by(DC.Variable) %>%
  summarise(
    n_total = n(),
    n_non_missing = sum(!is.na(DC.Value_clean)),
    coverage_percent = round(100 * n_non_missing / n_total, 1),
    .groups = "drop"
  ) %>%
  arrange(desc(coverage_percent))

7.1) add nutrition and digestibility from feedipedia

feedipedia_unit_audit <- feedipedia %>%
  mutate(
    Variable = as.character(Variable),
    Unit     = trimws(Unit)
  ) %>% 
  group_by(Variable, Unit) %>% 
  summarise(n_rows = n(), .groups = "drop") %>% 
  arrange(Variable, desc(n_rows)) %>% 
  group_by(Variable) %>% 
  mutate(
    n_units = n(),
    flag    = if_else(n_units > 1, "⚠︎ multiple units", "")
  ) %>% 
  ungroup()

# ── wide human-friendly table  ---------------------------------------------
unit_summary <- feedipedia_unit_audit %>% 
  unite(unit_n, Unit, n_rows, sep = " (n = ") %>% 
  mutate(unit_n = paste0(unit_n, ")")) %>% 
  group_by(Variable, flag) %>% 
  summarise(units = paste(unit_n, collapse = "; "), .groups = "drop") %>% 
  arrange(Variable)

###7.1) Harmonize feedipedia units

# helper -------------------------------------------------------------
to_canonical <- function(val, unit, vclass){
  u  <- str_trim(tolower(unit))
  nu <- unit              # keep the original unit by default
  v  <- val               # keep the original value by default

  if (vclass == "energy"){                 # ---------- ENERGY ----------
    if (u %in% c("mcal/kg dm","mcal/kg","mcal/kg d.m.")){
      v  <- val * 4.184 ; nu <- "MJ/kg"
    } else if (u %in% c("kcal/kg dm","kcal/kg")){
      v  <- val * 0.004184 ; nu <- "MJ/kg"
    } else if (u %in% c("mj/kg","mj/kg dm")){
      nu <- "MJ/kg"
    }

  } else {                                 # -------- NUTRIENT ----------
    if (u %in% c("% dm","%","% as fed")){
      v  <- val * 10 ;   nu <- "g/kg"
    } else if (u %in% c("mg/kg dm","mg/kg")){
      v  <- val / 1000 ; nu <- "g/kg"
    } else if (u %in% c("g/100g dm","g/100g")){
      v  <- val * 10 ;   nu <- "g/kg"
    } else if (u == "g dm"){
      nu <- "g/kg"
    }
  }
  list(val = v, unit = nu)
}

# ── 0 · variable lists ──────────────────────────────────────────────────────
nutrition_vars <- str_trim(c(
  "ADF", "Crude protein", "Ether extract", "Ash","NDF","CP"
))

energy_vars <- c(                          #  ✱ NEW: long names added here ✱
  "GE", "Gross energy",
  "ME", "Metabolizable energy",
  "NE",
  "AME", "AMEn", "TME", "TMEn",
  "ME ruminants", "ME ruminants (FAO, 1982)", "ME ruminants (gas production)",
  "MEn growing pig", "MEn rabbit",
  "NE growing pig"
)

# ── 1 · conversion ─────────────────────────────────────────────────────────
feedipedia <- feedipedia %>%
  rowwise() %>%
  mutate(
    conv = list(
      if (Variable %in% c(nutrition_vars, energy_vars))      # ← look in BOTH lists
        to_canonical(
          Avg, Unit,
          if_else(Variable %in% energy_vars, "energy", "nutrient")
        )
      else list(val = Avg, unit = Unit)      # leave untouched
    )
  ) %>%
  tidyr::unnest_wider(conv) %>%
  mutate(Avg = val, Unit = unit) %>%
  select(-val, -unit) %>%
  ungroup()

7.2) add nutrition and digestibility from feedipedia

# --------------------------------------------------------------------------- #
# A)  Attach Feedipedia node to Comp & Digest                                 #
# --------------------------------------------------------------------------- #
add_feedipedia_node <- function(df, mastersheet) {
  df %>% 
    left_join(mastersheet %>% select(Edge_Value, Feedipedia),
              by = c("D.Item" = "Edge_Value")) %>% 
    distinct()
}

merged_metadata$Animals.Diet.Comp   <- add_feedipedia_node(merged_metadata$Animals.Diet.Comp,   mastersheet)
merged_metadata$Animals.Diet.Digest <- add_feedipedia_node(merged_metadata$Animals.Diet.Digest, mastersheet)


# --------------------------------------------------------------------------- #
# B)  Clean Feedipedia dump                                                   #
# --------------------------------------------------------------------------- #
digestibility_keywords <- c(
  "digestibility","degradability","energy digestibility","DE","ME","NE","AMEn",
  "OM digestibility","DM digestibility","ruminants","rabbit","pig","poultry",
  "horse","salmonids","hydrolysis"
)

feedipedia <- feedipedia %>% 
  mutate(
    Variable = case_when(
      tolower(Variable) == "gross energy"            ~ "GE",
      tolower(Variable) == "crude protein"           ~ "CP",
      tolower(Variable) == "ether extract"           ~ "EE",
      tolower(Variable) == "ash"                     ~ "Ash",
      tolower(Variable) == "neutral detergent fibre" ~ "NDF",
      TRUE                                           ~ Variable
    ),
    category = case_when(
      Variable %in% c("GE","CP","EE","Ash","NDF")                                    ~ "nutrition",
      str_detect(Variable, regex(paste(digestibility_keywords, collapse="|"), TRUE)) ~ "digestibility",
      TRUE                                                                           ~ "nutrition"
    ),
    feedipedia_node = as.numeric(feedipedia_node),
    page_node       = as.numeric(page_node)
  )


# --------------------------------------------------------------------------- #
# C)  Node-level means  +  cross-walk (node ↔ page)                           #
# --------------------------------------------------------------------------- #
nutrition_vars <- c("GE","CP","EE","Ash","NDF")

## C1 – mean values per *true* node
feedipedia_nutri_node <- feedipedia %>% 
  filter(Variable %in% nutrition_vars) %>% 
  group_by(feedipedia_node, Variable, Unit) %>% 
  summarise(Value = mean(Avg, na.rm = TRUE), .groups = "drop") %>% 
  rename(DC.Variable = Variable, DC.Value.feedipedia = Value)

feedipedia_dig_node <- feedipedia %>% 
  filter(str_detect(tolower(Variable), "energy digestibility")) %>% 
  mutate(DD.Variable = "DE") %>% 
  group_by(feedipedia_node, DD.Variable, Unit) %>% 
  summarise(DD.Value.feedipedia = mean(Avg, na.rm = TRUE), .groups = "drop")

## C2 – cross-walk: duplicate every node–page pair into two lookup IDs
id_pairs <- feedipedia %>% distinct(feedipedia_node, page_node)

id_xwalk <- bind_rows(
  id_pairs %>% mutate(lookup_id = feedipedia_node),
  id_pairs %>% mutate(lookup_id = page_node)
) %>% 
  filter(!is.na(lookup_id)) %>% 
  distinct(feedipedia_node, lookup_id)

## C3 – attach node means to BOTH lookup IDs
feedipedia_nutri_ids <- id_xwalk %>% 
  left_join(feedipedia_nutri_node, by = "feedipedia_node")

feedipedia_dig_ids  <- id_xwalk %>% 
  left_join(feedipedia_dig_node,  by = "feedipedia_node")


# --------------------------------------------------------------------------- #
# D)  Source-tracking columns (create if missing)                             #
# --------------------------------------------------------------------------- #
if (!"nutrition_source"     %in% names(merged_metadata$Animals.Diet.Comp))
  merged_metadata$Animals.Diet.Comp$nutrition_source     <- NA_character_
if (!"digestibility_source" %in% names(merged_metadata$Animals.Diet.Digest))
  merged_metadata$Animals.Diet.Digest$digestibility_source <- NA_character_


# --------------------------------------------------------------------------- #
# E)  Fill proximate-analysis gaps in Comp                                    #
# --------------------------------------------------------------------------- #
merged_metadata$Animals.Diet.Comp <- merged_metadata$Animals.Diet.Comp %>% 
  mutate(DC.Value = suppressWarnings(as.numeric(DC.Value))) %>% 
  left_join(feedipedia_nutri_ids,
            by = c("Feedipedia" = "lookup_id", "DC.Variable")) %>% 
  mutate(
    use_fp   = is.na(DC.Value) & !is.na(DC.Value.feedipedia),
    DC.Unit  = if_else(use_fp, Unit, DC.Unit),
    DC.Value = coalesce(DC.Value, DC.Value.feedipedia),
    nutrition_source = if_else(use_fp, "feedipedia", nutrition_source)
  ) %>% 
  select(-DC.Value.feedipedia, -Unit, -use_fp)


# --------------------------------------------------------------------------- #
# F)  Fill DE digestibility gaps in Digest                                    #
# --------------------------------------------------------------------------- #
merged_metadata$Animals.Diet.Digest <- merged_metadata$Animals.Diet.Digest %>% 
  mutate(Feedipedia = as.numeric(Feedipedia)) %>% 
  left_join(feedipedia_dig_ids,
            by = c("Feedipedia" = "lookup_id", "DD.Variable")) %>% 
  mutate(
    use_fp   = is.na(DD.Value) & !is.na(DD.Value.feedipedia),
    DD.Unit  = if_else(use_fp, Unit, DD.Unit),
    DD.Value = coalesce(DD.Value, DD.Value.feedipedia),
    digestibility_source = if_else(use_fp, "feedipedia", digestibility_source)
  ) %>% 
  select(-DD.Value.feedipedia, -Unit, -use_fp)


# --------------------------------------------------------------------------- #
# Step 1: Get species for each B.Code
species_by_bcode <- merged_metadata$Prod.Out %>%
  distinct(B.Code, P.Product)

# Step 2: Filter Comp table for GE values
ge_papers <- merged_metadata$Animals.Diet.Comp %>%
  filter(DC.Variable == "GE", !is.na(DC.Value)) %>%
  distinct(B.Code) %>%
  left_join(species_by_bcode, by = "B.Code") %>%
  count(P.Product, name = "papers_with_GE")

# View result
ge_papers
##          P.Product papers_with_GE
##             <char>          <int>
##  1:        Buffalo              4
##  2:         Cattle            114
##  3:        Chicken             59
##  4:     Crustacean              1
##  5:           Fish             58
##  6:           Goat            171
##  7:     Guinea Pig              1
##  8: Japanese Quail              4
##  9:      Not Found             17
## 10:            Pig              1
## 11:           Pigs             29
## 12:          Quail              2
## 13:         Rabbit              8
## 14:          Sheep            214
## 15:          Snail              1
## 16:         Turkey              1
## 17:    Zebu Cattle              1

flag differences between raw data and feedipedia

# 0 ▸ make sure the Feedipedia key exists and is numeric ---------------------
for (tbl in c("Animals.Diet.Comp", "Animals.Diet.Digest")) {
  x <- merged_metadata[[tbl]]

  # rename lower-case column if present
  if (!"Feedipedia" %in% names(x) && "feedipedia" %in% names(x)) {
    x <- dplyr::rename(x, Feedipedia = feedipedia)
  }

  # stop early if the key is still missing
  if (!"Feedipedia" %in% names(x)) {
    stop(sprintf("Column 'Feedipedia' is missing from %s.", tbl))
  }

  merged_metadata[[tbl]] <- mutate(x, Feedipedia = as.numeric(Feedipedia))
}

# 1 ▸ build reference tables (mean per node × variable × unit) ---------------
ref_nutri <- feedipedia %>%
  filter(Variable %in% c("GE", "CP", "EE", "Ash", "NDF")) %>%
  group_by(feedipedia_node, Variable, Unit) %>%
  summarise(ref_val = mean(Avg, na.rm = TRUE), .groups = "drop") %>%
  mutate(Unit = str_trim(Unit))

ref_dig <- feedipedia %>%
  filter(str_detect(tolower(Variable), "energy digestibility")) %>%
  mutate(Variable = "DE") %>%
  group_by(feedipedia_node, Variable, Unit) %>%
  summarise(ref_val = mean(Avg, na.rm = TRUE), .groups = "drop") %>%
  mutate(Unit = str_trim(Unit))

# 2 ▸ nutrition QC -----------------------------------------------------------
qc_nutrition <- merged_metadata$Animals.Diet.Comp %>%
  filter(!is.na(DC.Value)) %>%                 # raw value exists
  mutate(DC.Unit = str_trim(DC.Unit)) %>%
  left_join(ref_nutri,
            by = c("Feedipedia" = "feedipedia_node",
                   "DC.Variable" = "Variable",
                   "DC.Unit"     = "Unit")) %>%
  filter(!is.na(ref_val)) %>%                  # have reference
  mutate(rel_diff = abs(DC.Value - ref_val) / ref_val) %>%
  filter(rel_diff > 0.10) %>%                  # > 10 % difference
  arrange(desc(rel_diff))

# 3 ▸ digestibility QC -------------------------------------------------------
qc_digestibility <- merged_metadata$Animals.Diet.Digest %>%
  filter(!is.na(DD.Value)) %>%
  mutate(DD.Unit = str_trim(DD.Unit)) %>%
  left_join(ref_dig,
            by = c("Feedipedia" = "feedipedia_node",
                   "DD.Variable" = "Variable",
                   "DD.Unit"     = "Unit")) %>%
  filter(!is.na(ref_val)) %>%
  mutate(rel_diff = abs(DD.Value - ref_val) / ref_val) %>%
  filter(rel_diff > 0.10) %>%
  arrange(desc(rel_diff))

# 4 ▸ quick report -----------------------------------------------------------
message(sprintf("⚠︎  Nutrition rows with >10 %% deviation: %s",
                (nrow(qc_nutrition))))
## ⚠︎  Nutrition rows with >10 % deviation: 1601
message(sprintf("⚠︎  Digestibility rows with >10 %% deviation: %s",
                (nrow(qc_digestibility))))
## ⚠︎  Digestibility rows with >10 % deviation: 1
# 5 ▸ optional interactive view ---------------------------------------------
# DT::datatable(qc_nutrition, caption = "Nutrition >10 % difference")
# DT::datatable(qc_digestibility, caption = "Digestibility >10 % difference")

some matching is wrong

# 1) Build node–page crosswalk from Feedipedia
id_pairs <- feedipedia %>%
  distinct(feedipedia_node, page_node) %>%
  filter(!is.na(feedipedia_node) & !is.na(page_node))

id_xwalk <- bind_rows(
  id_pairs %>% mutate(type = "feedipedia_node", lookup_id = feedipedia_node),
  id_pairs %>% mutate(type = "page_node",      lookup_id = page_node)
) %>%
  distinct(lookup_id, feedipedia_node, type)

# 2) Compare mastersheet codes with Feedipedia IDs
audit_fp <- mastersheet %>%
  filter(!is.na(Feedipedia)) %>%
  mutate(Feedipedia = as.numeric(Feedipedia)) %>%
  left_join(id_xwalk, by = c("Feedipedia" = "lookup_id")) %>%
  mutate(
    status = case_when(
      type == "feedipedia_node" ~ "OK (node match)",
      type == "page_node"       ~ "Wrong (page match → should use node)",
      is.na(type)               ~ "Unresolved (no match in feedipedia)"
    )
  ) %>%
  select(Edge_Value, Feedipedia, feedipedia_node, status) %>%
  distinct()
## Warning in left_join(., id_xwalk, by = c(Feedipedia = "lookup_id")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 272 of `x` matches multiple rows in `y`.
## ℹ Row 949 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# 3) Inspect mis-matches
audit_fp %>%
  arrange(status, Edge_Value) %>%
  print(n = 50)
## # A tibble: 576 × 4
##    Edge_Value                           Feedipedia feedipedia_node status       
##    <chr>                                     <dbl>           <dbl> <chr>        
##  1 Acacia Albida                             11539           11539 OK (node mat…
##  2 Acacia Albida Leaves                      11538           11538 OK (node mat…
##  3 Acacia brevispica                         11707           11707 OK (node mat…
##  4 Acacia brevispica Leaves                  11706           11706 OK (node mat…
##  5 Acacia brevispica Pods Dried              11707           11707 OK (node mat…
##  6 Acacia karroo                             12684           12684 OK (node mat…
##  7 Acacia nilotica                           11794           11794 OK (node mat…
##  8 Acacia nilotica Leaves                    11793           11793 OK (node mat…
##  9 Acacia nilotica Pods                      11795           11795 OK (node mat…
## 10 Acacia nilotica Pods Dried                11794           11794 OK (node mat…
## 11 Acacia senegal                            12174           12174 OK (node mat…
## 12 Acacia senegal Leaves                     12173           12173 OK (node mat…
## 13 Acacia senegal Pods Dried                 12174           12174 OK (node mat…
## 14 Acacia tortilis                           12810           12810 OK (node mat…
## 15 Acacia tortilis Leaves                    11512           11512 OK (node mat…
## 16 Acacia tortilis Leaves Dried Ground       12810           12810 OK (node mat…
## 17 Acacia tortilis Pods                      12809           12809 OK (node mat…
## 18 Acacia tortilis Pods Dried                11513           11513 OK (node mat…
## 19 Acacia tortilis Pods Ground               11513           11513 OK (node mat…
## 20 Adansonia digitata                        11821           11821 OK (node mat…
## 21 Adansonia digitata Seed                   11824           11824 OK (node mat…
## 22 Albizia lebbeck                           12243           12243 OK (node mat…
## 23 Albizia lebbeck Leaves                    12243           12243 OK (node mat…
## 24 Albizia lebbeck Leaves Ground             12242           12242 OK (node mat…
## 25 Alfalfa                                   11745           11745 OK (node mat…
## 26 Alfalfa Dried                             11744           11744 OK (node mat…
## 27 Alfalfa Dried Ground                      11744           11744 OK (node mat…
## 28 Andropogon gayanus                        12114           12114 OK (node mat…
## 29 Andropogon gayanus Dried                  12116           12116 OK (node mat…
## 30 Andropogon gayanus Ensiled                12117           12117 OK (node mat…
## 31 Andropogon guianensis                     12114           12114 OK (node mat…
## 32 Argan Pulp                                11775           11775 OK (node mat…
## 33 Argan Seed Cake                           11774           11774 OK (node mat…
## 34 Aristida adscensionis                     12011           12011 OK (node mat…
## 35 Atella                                    22524           22524 OK (node mat…
## 36 Atella Dried                              22524           22524 OK (node mat…
## 37 Atriplex halimus                          25283           25283 OK (node mat…
## 38 Atriplex halimus Leaves and Twigs         22432           22432 OK (node mat…
## 39 Atriplex nummularia                       22432           22432 OK (node mat…
## 40 Atriplex nummularia Leaves and Twigs      22432           22432 OK (node mat…
## 41 Azadirachta indica                        12312           12312 OK (node mat…
## 42 Azadirachta indica Leaves                 12312           12312 OK (node mat…
## 43 Balanites aegyptiaca                      11591           11591 OK (node mat…
## 44 Balanites aegyptiaca Leaves               11591           11591 OK (node mat…
## 45 Bambara Groundnut                         11555           11555 OK (node mat…
## 46 Bambara Groundnut Offal                   11559           11559 OK (node mat…
## 47 Bambara Groundnut Offal Ground            11559           11559 OK (node mat…
## 48 Bamboo Leaves                             12081           12081 OK (node mat…
## 49 Bambusa balcooa                           11799           11799 OK (node mat…
## 50 Bambusa bambos                            12081           12081 OK (node mat…
## # ℹ 526 more rows

8) Infer nutritional and digestibility data from raw data and feedipedia

Now that we have both raw and Feedipedia data, we can use them to infer missing values. If some instances of an ingredient have nutritional or digestibility data while others do not, we can impute the missing values using the average of the available ones.

8.1) Nutritional composition inferrence

# --- Standardize Units in Animals.Diet.Comp ---
merged_metadata$Animals.Diet.Comp <- merged_metadata$Animals.Diet.Comp %>% 
  mutate(
    # 1) normalise text --------------------------------------------------------
    DC.Unit     = tolower(trimws(DC.Unit)),
    DC.Value    = as.numeric(DC.Value)
  ) %>% 
  
  # 2) convert values ----------------------------------------------------------
  mutate(
    DC.Value = case_when(
      # --- gross energy → MJ kg-1 -------------------------------------------
      DC.Variable == "GE" ~ case_when(
        DC.Unit %in% c("mj/kg", "mj/kg dm")                       ~ DC.Value,
        DC.Unit %in% c("kcal/kg", "kcal/kg dm")                   ~ DC.Value * 0.004184,
        DC.Unit ==  "kcal/g"                                      ~ DC.Value * 4.184,
        DC.Unit ==  "kcal/100g"                                   ~ DC.Value * 0.04184,   # fixed
        DC.Unit ==  "cal/g"                                       ~ DC.Value * 0.004184,  # fixed
        DC.Unit ==  "mcal/kg"                                     ~ DC.Value * 4.184,
        DC.Unit %in% c("kj/kg", "kj/kg dm")                       ~ DC.Value / 1000,
        DC.Unit ==  "kj/g"                                        ~ DC.Value,             # already MJ/kg
        DC.Unit %in% c("kj/100g", "kj/100g dm", "kj/100 g dm")    ~ DC.Value * 0.01,
        DC.Unit ==  "mj/100g"                                     ~ DC.Value * 10,
        DC.Unit ==  "j/mg"                                        ~ DC.Value,             # fixed
        TRUE                                                      ~ NA_real_
      ),
      
      # --- everything else → g kg-1 -----------------------------------------
      TRUE ~ case_when(
        DC.Unit %in% c("%", "% dm", "% as fed", "g/100g", "g/100g dm") ~ DC.Value * 10,
        DC.Unit %in% c("g", "g/kg", "g/kg dm")                         ~ DC.Value,
        DC.Unit %in% c("kg", "kg/day")                                 ~ DC.Value * 1000,
        TRUE                                                           ~ NA_real_
      )
    ),
    
    # 3) harmonised unit labels ----------------------------------------------
    DC.Unit = if_else(DC.Variable == "GE" & !is.na(DC.Value), "MJ/kg", "g/kg")
  )

impute from feedipedia when values are off

## ── 7.3 · override large raw-vs-Feedipedia discrepancies ────────────────────
library(dplyr)

deviation_threshold <- 0.10          # >10 % deviation → treat raw value as unreliable

override_if_far <- function(data,
                            ref_tbl,
                            value_col,
                            unit_col,
                            var_col,
                            node_col,
                            src_col,
                            threshold = deviation_threshold) {

  # 0 ▸ add a copy of Unit that won’t be used as a key (so it survives the join)
  ref_tbl <- ref_tbl %>% mutate(ref_unit = Unit)

  # 1 ▸ build the join map (x = data, y = ref_tbl)
  join_cols <- setNames(
    c("feedipedia_node", "Variable", "Unit"),   # cols in ref_tbl (y)
    c(node_col,         var_col,    unit_col)   # cols in data    (x)
  )

  data %>%
    # 2 ▸ attach the reference mean (with the extra `ref_unit`)
    left_join(ref_tbl, by = join_cols) %>%
    # 3 ▸ flag rows whose raw value deviates by more than `threshold`
    mutate(
      rel_diff = abs(.data[[value_col]] - ref_val) / ref_val,
      override = !is.na(rel_diff) & rel_diff > threshold
    ) %>%
    # 4 ▸ overwrite value/unit and record provenance when override is TRUE
    mutate(
      !!value_col := if_else(override, ref_val, .data[[value_col]]),
      !!unit_col  := if_else(override, ref_unit, .data[[unit_col]]),
      !!src_col   := case_when(
        override ~ "feedipedia_override_large_diff",
        TRUE     ~ .data[[src_col]]
      )
    ) %>%
    # 5 ▸ clean up helpers
    select(-ref_val, -rel_diff, -override, -ref_unit)
}

# ── Apply to nutrition -------------------------------------------------------
merged_metadata$Animals.Diet.Comp <- override_if_far(
  data       = merged_metadata$Animals.Diet.Comp,
  ref_tbl    = ref_nutri,                 # built earlier
  value_col  = "DC.Value",
  unit_col   = "DC.Unit",
  var_col    = "DC.Variable",
  node_col   = "Feedipedia",
  src_col    = "nutrition_source"
)

# ── Apply to digestibility ---------------------------------------------------
merged_metadata$Animals.Diet.Digest <- override_if_far(
  data       = merged_metadata$Animals.Diet.Digest,
  ref_tbl    = ref_dig,                   # built earlier
  value_col  = "DD.Value",
  unit_col   = "DD.Unit",
  var_col    = "DD.Variable",
  node_col   = "Feedipedia",
  src_col    = "digestibility_source"
)

# ── Quick log ----------------------------------------------------------------
message(sprintf(
  "✅  Nutrition rows corrected:      %s",
  scales::comma(sum(merged_metadata$Animals.Diet.Comp$nutrition_source ==
                      "feedipedia_override_large_diff", na.rm = TRUE))
))
## ✅  Nutrition rows corrected:      1,610
message(sprintf(
  "✅  Digestibility rows corrected:  %s",
  scales::comma(sum(merged_metadata$Animals.Diet.Digest$digestibility_source ==
                      "feedipedia_override_large_diff", na.rm = TRUE))
))
## ✅  Digestibility rows corrected:  1
# --- Compute Mean Value Per Ingredient and Variable ---
ingredient_means <- merged_metadata$Animals.Diet.Comp %>%
  filter(is_entire_diet == FALSE, !is.na(DC.Value)) %>%
  group_by(D.Item, DC.Variable) %>%
  summarise(mean_value = mean(DC.Value, na.rm = TRUE), .groups = "drop")

merged_metadata$Animals.Diet.Comp <- merged_metadata$Animals.Diet.Comp %>%
  left_join(ingredient_means, by = c("D.Item", "DC.Variable")) %>%
  mutate(
    nutrition_source = case_when(
      nutrition_source == "feedipedia" ~ "feedipedia",  # preserve Feedipedia
      is.na(DC.Value) & !is_entire_diet & !is.na(mean_value) ~ "inferred_from_same_ingredient",  # imputed
      is.na(nutrition_source) & !is.na(DC.Value) ~ "raw data",  # only if there's a value
      TRUE ~ nutrition_source  # keep whatever is there
    ),
    DC.Value = if_else(
      is.na(DC.Value) & !is_entire_diet & !is.na(mean_value),
      mean_value,
      DC.Value
    )
  ) %>%
  select(-mean_value)

8.2) Digestibility inferrence

# --- B) Compute Mean Digestibility Per Ingredient and Variable ---
digest_means <- merged_metadata$Animals.Diet.Digest %>%
  filter(DD.Variable == "DE", is_entire_diet == FALSE, !is.na(DD.Value)) %>%
  group_by(D.Item, DD.Variable, DD.Unit) %>%
  summarise(mean_digest = mean(DD.Value, na.rm = TRUE), .groups = "drop")


# --- C) Fill Missing Digestibility Values with Ingredient Mean ---
merged_metadata$Animals.Diet.Digest <- merged_metadata$Animals.Diet.Digest %>%
  left_join(digest_means, by = c("D.Item", "DD.Variable")) %>%
  mutate(
    digestibility_source = case_when(
      digestibility_source == "feedipedia" ~ "feedipedia",
      is.na(DD.Value) & !is_entire_diet & !is.na(mean_digest) ~ "inferred_from_same_ingredient",
      is.na(digestibility_source) ~ "raw data",
      TRUE ~ digestibility_source
    ),
    DD.Value = if_else(
      is.na(DD.Value) & !is_entire_diet & !is.na(mean_digest),
      mean_digest,
      DD.Value
    )
  ) %>%
  select(-mean_digest)
## Warning in left_join(., digest_means, by = c("D.Item", "DD.Variable")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 4490 of `x` matches multiple rows in `y`.
## ℹ Row 179 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

since there is only very few digestibility in the raw data logic that we dont gain any data by doing this because feedipedia is infered for all the ingredients we have the info for.

# ✅ Count how many papers had GE filled by Feedipedia, and how many of those are Cattle / Sheep / Goat

# Step 1: Filter for Feedipedia-filled GE entries
ge_filled_by_feedipedia <- merged_metadata$Animals.Diet.Comp %>%
  filter(DC.Variable == "GE", nutrition_source == "feedipedia") %>%
  distinct(B.Code)

# Step 2: Total number of studies
n_ge_feedipedia <- nrow(ge_filled_by_feedipedia)

# Step 3: Add species info and filter to ruminants
ge_species_feedipedia <- ge_filled_by_feedipedia %>%
  left_join(merged_metadata$`Prod.Out` %>% select(B.Code, P.Product), by = "B.Code") %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  distinct(B.Code, P.Product)

# Step 4: Count per species
# ge_species_counts <- ge_species_feedipedia %>%
#   count(P.Product, name = "Papers_with_GE_from_Feedipedia")
# 
# # Display summary
# cat("📌 Total papers with GE filled from Feedipedia:", n_ge_feedipedia, "\n")
# cat("📌 Of those, species breakdown:\n")
# print(ge_species_counts)
# Step 1: Get species for each B.Code
species_by_bcode <- merged_metadata$Prod.Out %>%
  distinct(B.Code, P.Product)

# # Step 2: Filter Comp table for GE values
# ge_papers <- merged_metadata$Animals.Diet.Comp %>%
#   filter(DC.Variable == "GE", !is.na(DC.Value)) %>%
#   distinct(B.Code) %>%
#   left_join(species_by_bcode, by = "B.Code") %>%
#   count(P.Product, name = "papers_with_GE")
# 
# # View result
# ge_papers
# Define species of interest
species_list <- c("Cattle", "Sheep", "Goat")

# Initialize result containers
species_summary <- list()
bcode_table <- list()

# Loop over each species
for (species in species_list) {
  
  # All B.Codes for this species
  species_bcodes <- merged_metadata$Prod.Out %>%
    filter(P.Product == species) %>%
    pull(B.Code) %>%
    unique()
  
  # Feed intake B.Codes
  feed_data <- merged_metadata$Data.Out %>%
    filter(B.Code %in% species_bcodes, !is.na(ED.Mean.T)) %>%
    select(B.Code, D.Item = ED.Intake.Item)
  
  feed_bcodes <- unique(feed_data$B.Code)
  
  # GE B.Codes
  ge_data <- merged_metadata$Animals.Diet.Comp %>%
    filter(B.Code %in% species_bcodes, DC.Variable == "GE", !is.na(DC.Value)) %>%
    select(B.Code, D.Item)
  
  ge_bcodes <- unique(ge_data$B.Code)
  
  # Matched B.Codes (join on both B.Code and D.Item)
  matched_bcodes <- inner_join(feed_data, ge_data, by = c("B.Code", "D.Item")) %>%
    pull(B.Code) %>%
    unique()
  
  # Store species-level summary
  species_summary[[species]] <- tibble(
    Species = species,
    Total_BCodes = length(species_bcodes),
    With_FEED = length(feed_bcodes),
    With_GE = length(ge_bcodes),
    With_BOTH = length(matched_bcodes)
  )
  
  # Store long-format bcode list
  bcode_table[[species]] <- tibble(
    B.Code = c(species_bcodes, feed_bcodes, ge_bcodes, matched_bcodes),
    Category = rep(c("All", "With_FEED", "With_GE", "With_BOTH"), 
                   times = c(length(species_bcodes), length(feed_bcodes), length(ge_bcodes), length(matched_bcodes))),
    Species = species
  ) %>% distinct()
}

# Bind results
species_summary_df <- bind_rows(species_summary)
bcode_table_df <- bind_rows(bcode_table)

# Show results
print(species_summary_df)
## # A tibble: 3 × 5
##   Species Total_BCodes With_FEED With_GE With_BOTH
##   <chr>          <int>     <int>   <int>     <int>
## 1 Cattle           126       126     115        38
## 2 Sheep            223       223     214       106
## 3 Goat             186       185     173        72
head(bcode_table_df)
## # A tibble: 6 × 3
##   B.Code   Category Species
##   <chr>    <chr>    <chr>  
## 1 HK0263   All      Cattle 
## 2 AG0051   All      Cattle 
## 3 NN0356   All      Cattle 
## 4 NN0399   All      Cattle 
## 5 JS0269.2 All      Cattle 
## 6 JS0270   All      Cattle

###7.3 Coverage after Feedipedia

coverage_feedipedia <- merged_metadata$Animals.Diet.Comp %>%
  filter(!is.na(DC.Variable)) %>%
  mutate(
    # Clean and detect actual missing values
    DC.Value_clean = str_trim(as.character(DC.Value)),
    DC.Value_clean = na_if(DC.Value_clean, ""),
    DC.Value_clean = na_if(DC.Value_clean, "NA"),
    DC.Value_clean = na_if(DC.Value_clean, "na")
  ) %>%
  group_by(DC.Variable) %>%
  summarise(
    n_total = n(),
    n_non_missing = sum(!is.na(DC.Value_clean)),
    coverage_percent = round(100 * n_non_missing / n_total, 1),
    .groups = "drop"
  ) %>%
  arrange(desc(coverage_percent))

9) Tagg ingredient with control or trt

##— build the control key from your merged MT.Out
ctrl_key <- merged_metadata$MT.Out[, .(
  B.Code,
  A.Level.Name,
  T.Name,
  T.Control
)]

##— example join back onto your ingredient table
# (assuming your ingredients DT has B.Code + A.Level.Name)
ingredients <- merge(
  x = ingredients, 
  y = ctrl_key, 
  by = c("B.Code","A.Level.Name"), 
  all.x = TRUE, 
  sort = FALSE
)

10) Relative % of each ingredient in the diet

This table presents the relative percentage of each ingredient within a diet. To ensure accurate proportion calculations, we first filtered the dataset to include only diets with consistent units. Diets with incompatible or non-harmonizable units were excluded, as their proportions could not be reliably determined.

# ────────────────────────────────────────────────────────────────────────────
# (a) Ingredient-based totals  ----------------------------------------------
consistent_diets <- ingredients %>%                       # ignore “Entire Diet” rows
  filter(D.Type != "Entire Diet") %>%
  group_by(B.Code, A.Level.Name) %>%
  mutate(Unit_Count = n_distinct(D.Unit.Amount)) %>%      # always 1 after conversion
  filter(Unit_Count == 1) %>%                             # keep only single-unit diets
  ungroup()

diet_completeness <- consistent_diets %>% 
  group_by(B.Code, A.Level.Name) %>%
  summarise(
    n_total      = n(),
    n_nonmissing = sum(!is.na(D.Amount)),
    .groups      = "drop"
  ) %>%
  filter(n_total == n_nonmissing)

total_ingr <- consistent_diets %>%
  semi_join(diet_completeness, by = c("B.Code", "A.Level.Name")) %>%
  group_by(B.Code, A.Level.Name, D.Unit.Amount) %>%
  summarise(Total_Amount_ingr = sum(D.Amount), .groups = "drop")

# ────────────────────────────────────────────────────────────────────────────
# (b) Whole-diet feed-intake rows  ------------------------------------------
#     Use the ORIGINAL ‘feed_intake’ table, because that still carries
#     ED.Mean.T and Out.Unit.Amount.
feed_intake_diet_g <- feed_intake %>% 
  filter(is_entire_diet) %>%                               # only full-diet rows
  mutate(
    Intake_Amount_g = case_when(                           # kg → g
      Out.Unit.Amount == "kg" ~ ED.Mean.T * 1000,
      Out.Unit.Amount == "g"  ~ ED.Mean.T,
      TRUE                    ~ NA_real_
    ),
    D.Unit.Amount = "g"                                     # unify the unit
  ) %>% 
  select(B.Code, A.Level.Name, D.Unit.Amount, Intake_Amount_g)

# ────────────────────────────────────────────────────────────────────────────
# (c) Merge the two sources, prefer feed-intake -----------------------------
total_amounts <- full_join(
  feed_intake_diet_g,
  total_ingr,
  by = c("B.Code", "A.Level.Name", "D.Unit.Amount")
) %>%
  mutate(
    Total_Amount = coalesce(Intake_Amount_g, Total_Amount_ingr)
  ) %>%
  select(B.Code, A.Level.Name, D.Unit.Amount, Total_Amount)

# ────────────────────────────────────────────────────────────────────────────
# 2.  Check unit correspondence  ---------------------------------------------
unit_check <- ingredients %>%
  select(B.Code, A.Level.Name, Ingredient_Unit = D.Unit.Amount) %>%
  distinct() %>%
  left_join(
    total_amounts %>%
      select(B.Code, A.Level.Name, Total_Unit = D.Unit.Amount),
    by = c("B.Code", "A.Level.Name")
  ) %>%
  mutate(Consistent_Units = Ingredient_Unit == Total_Unit)

# Keep only diets whose units match
consistent_unit_diets <- unit_check %>%
  filter(Consistent_Units) %>%
  select(B.Code, A.Level.Name)

ingredients_clean <- ingredients %>%
  semi_join(consistent_unit_diets, by = c("B.Code", "A.Level.Name"))

totals_clean <- total_amounts %>%
  semi_join(consistent_unit_diets, by = c("B.Code", "A.Level.Name"))

# ────────────────────────────────────────────────────────────────────────────
# 3.  Percentage contribution of each ingredient -----------------------------
diet_percentages <- ingredients_clean %>% 
  filter(D.Type != "Entire Diet") %>%
  left_join(totals_clean,
            by = c("B.Code", "A.Level.Name", "D.Unit.Amount")) %>% 
  mutate(
    Percentage_of_Diet = ifelse(Total_Amount > 0,
                                (D.Amount / Total_Amount) * 100,
                                NA_real_)
  ) %>% 
  select(
    B.Code, A.Level.Name, D.Item, D.Type,
    D.Amount, D.Unit.Amount, Percentage_of_Diet
  )

# (optional) remove helpers --------------------------------------------------
rm(consistent_diets, diet_completeness, total_ingr,
   feed_intake_diet_g, unit_check, consistent_unit_diets)
# Check if each diet sums close to 100%
validation <- diet_percentages %>%
  group_by(B.Code, A.Level.Name) %>%
  summarise(Total_Percentage = sum(Percentage_of_Diet, na.rm=TRUE)) %>%
  ungroup()

# Display diets and their total percentages
# datatable(
#   validation, 
#   options = list(
#     pageLength = 10,         # Show 10 rows per page by default
#     scrollX = TRUE,          # Enable horizontal scrolling
#     autoWidth = TRUE,        # Adjust column width automatically
#     searchHighlight = TRUE   # Highlight search terms
#   ),
#   caption = "Validation table"
# )

The lines that have 0 as validation are the ones that were feeding ad libitum for which we do not have a precise amount so it’s impossible to calculate the relative proportion of the ingredients.

11) Emission calculations

11.1 Estimation of Enteric Methane Emissions Using IPCC Tier 2

Enteric methane (CH₄) emissions were estimated using the IPCC Tier 2 methodology, which incorporates animal-specific data such as gross energy intake (GE) and methane conversion factors (Ym). This approach improves accuracy over Tier 1 by reflecting diet composition, feed intake, and species-specific differences.

11.2 Estimation of Daily GE Intake per Animal

For GE values reported at the entire diet level, daily gross energy intake per animal was calculated as:

\[ \text{GE}_{\text{daily}} = \text{GE}_{\text{diet}} \times \text{Diet Dry Amount} \]

Where:

  • \(\text{GE}_{\text{diet}}\) is the energy content of the complete diet (MJ/kg)
  • \(\text{Diet Dry Amount}\) is the daily dry matter ingredient (kg/day/animal). We can also infer from feed intake if missing.Needs to be dry matter so we will use only data when DC.is.Dry=yes.

To do that we will have ton convert all feed intake data into kg/day/animal.

When GE values were reported at the ingredient level, and information on diet composition (ingredient proportions) was available, the diet-level GE was reconstructed as:

\[ \text{GE}_{\text{diet}} = \sum_{i=1}^{n} (\text{GE}_i \times p_i) \]

Where:

  • \(\text{GE}_i\) is the GE value of ingredient \(i\)
  • \(p_i\) is the proportion of ingredient \(i\) in the total diet

Then, daily intake was computed using the same formula above.

11.3. Methane Emissions Calculation (Tier 2)

Annual methane emissions were calculated using the IPCC Tier 2 equation:

\[ \text{CH}_4 = \frac{\text{GE}_{\text{daily}} \times \text{Ym} \times 365}{55.65} \]

Where:

  • \(\text{CH}_4\) = annual enteric methane emissions (kg CH₄/animal/year)
  • \(\text{GE}_{\text{daily}}\) = daily gross energy intake per animal (MJ/day)
  • \(\text{Ym}\) = methane conversion factor:
    • Cattle: 0.065 (6.5%)
    • Sheep and Goats: 0.060 (6.0%)
  • 365 = days per year
  • 55.65 = energy content of methane (MJ/kg CH₄)

11.4 Data Requirements and Supporting Tables

1. Table of Diet Composition Proportions

This table defines the proportion of each ingredient in the overall diet:

# Create an interactive table for "total_amounts"
datatable(
  diet_percentages, 
  options = list(
    pageLength = 10,         # Show 10 rows per page by default
    scrollX = TRUE,          # Enable horizontal scrolling
    autoWidth = TRUE,        # Adjust column width automatically
    searchHighlight = TRUE   # Highlight search terms
  ),
  caption = "Interactive Table: Relative % of Ingredients"
)

Used to compute reconstructed GE_diet from ingredient-level GE values:

\[ \text{GE}_{\text{diet}} = \sum_{i=1}^{n} (\text{GE}_i \times p_i) \]


2. Table of Ingredient Dry Amounts

This table stores the amount of each ingredient fed per animal per day, already in or converted to dry matter:

# This 'ingredients' now contains all diets + basal integrated.
# Table 1: Final diet amounts with harmonized units (including basal)
datatable(
  ingredients, 
  options = list(
    pageLength = 10,         # Show 10 rows per page by default
    scrollX = TRUE,          # Enable horizontal scrolling
    autoWidth = TRUE,        # Adjust column width automatically
    searchHighlight = TRUE   # Highlight search terms
  ),
  caption = "Interactive Table: Diet Ingredients"
)

Useful when GE is per ingredient and diet is not proportionally described.


3. Table of Total Diet Dry Amounts

This is the total daily dry matter offered or consumed per animal, either directly from intake or inferred from ingredient sums:

datatable(
  total_amounts, 
  options = list(
    pageLength = 10,         # Show 10 rows per page by default
    scrollX = TRUE,          # Enable horizontal scrolling
    autoWidth = TRUE,        # Adjust column width automatically
    searchHighlight = TRUE   # Highlight search terms
  ),
  caption = "Interactive Table: Total Amounts of Diet Ingredients"
)

\[ \text{GE}_{\text{daily}} = \text{GE}_{\text{diet}} \times \text{Diet Dry Amount} \]

# 1. Total papers per species
species_counts <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Total_Studies")

# 2. All papers with GE (non-NA)
GE_data <- merged_metadata$Animals.Diet.Comp %>%
  filter(DC.Variable == "GE", !is.na(DC.Value) & DC.Value != "") %>%
  distinct(B.Code)

# 3. Papers with GE and species (intersection)
GE_species <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  semi_join(GE_data, by = "B.Code") %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Studies_with_GE")

# 4. Combine the two
summary_table <- species_counts %>%
  full_join(GE_species, by = "P.Product") %>%
  mutate(across(everything(), ~replace_na(., 0)))

# Display table
kable(summary_table, caption = "Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)")
Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)
P.Product Total_Studies Studies_with_GE
Cattle 126 115
Goat 186 173
Sheep 223 214

12) Gross energy data

12.1) Gross energy harmonization

# 1. Total papers per species
species_counts <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Total_Studies")

# 2. All papers with GE (non-NA)
GE_data <- merged_metadata$Animals.Diet.Comp %>%
  filter(DC.Variable == "GE") %>%
  filter(DC.Variable == "GE", !is.na(DC.Value) & DC.Value != "") %>%
  distinct(B.Code)

# 3. Papers with GE and species (intersection)
GE_species <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  semi_join(GE_data, by = "B.Code") %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Studies_with_GE")

# 4. Combine the two
summary_table <- species_counts %>%
  full_join(GE_species, by = "P.Product") %>%
  mutate(across(everything(), ~replace_na(., 0)))

# Display table
kable(summary_table, caption = "Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)")
Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)
P.Product Total_Studies Studies_with_GE
Cattle 126 115
Goat 186 173
Sheep 223 214
# Inspect the units used for GE


Gross_energy <- merged_metadata$Animals.Diet.Comp %>%
  filter(DC.Variable == "GE")

Gross_energy$DC.Value<-as.numeric(Gross_energy$DC.Value)

unique(Gross_energy$DC.Unit)
## [1] "g/kg"  "MJ/kg"
Gross_energy <- Gross_energy %>%
  # STEP 1: Create GE_source from nutrition_source
  mutate(
    GE_source = case_when(
      nutrition_source == "feedipedia" ~ "feedipedia",
      nutrition_source == "inferred_from_same_ingredient" ~ "inferred_from_same_ingredient",
      nutrition_source == "raw data" ~ "raw data",
      TRUE ~ NA_character_
    )
  )%>%
  select(-"nutrition_source")

g/kg ❌ Not an energy unit Likely a mistake, investigate or exclude % / % DM / % as fed ❌ Not energy units Same as above Unspecified ❌ Ambiguous Investigate or exclude

# 1. Total papers per species
species_counts <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Total_Studies")

# 2. All papers with GE (non-NA)
GE_data <- Gross_energy %>%
  filter(DC.Variable == "GE", !is.na(DC.Value) & DC.Value != "") %>%
  distinct(B.Code)

# 3. Papers with GE and species (intersection)
GE_species <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  semi_join(GE_data, by = "B.Code") %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Studies_with_GE")

# 4. Combine the two
summary_table <- species_counts %>%
  full_join(GE_species, by = "P.Product") %>%
  mutate(across(everything(), ~replace_na(., 0)))

# Display table
kable(summary_table, caption = "Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)")
Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)
P.Product Total_Studies Studies_with_GE
Cattle 126 115
Goat 186 173
Sheep 223 214

12.2 Gross energy data check

Gross_energy_check <- Gross_energy %>%
  filter(!is.na(DC.Value))%>%
  filter(GE_source=="raw data")

12.3) Gross energy for non nutritional items

library(dplyr)
library(stringr)

# ── A · Define patterns for name-based matching ───────────────────────────────
salt_pat  <- "(?i)\\b(salt|sodium\\s*chloride|nacl|mineral\\s*(lick|block))\\b"
water_pat <- "(?i)\\bwater\\b"

# false positives to exclude (extend if you see more)
exclude_pat <- "(?i)\\b(water\\s*hyacinth|saltbush)\\b"

# ── B · Type-based lookup (as you had) ────────────────────────────────────────
nonenergy_types <- c("Non-ERA Feed", "Supplement")   # add more types if you have them

type_lookup <- merged_metadata$Animals.Diet %>%
  filter(D.Type %in% nonenergy_types) %>%
  distinct(B.Code, D.Item) %>%
  mutate(flag_nonenergy = TRUE)

# ── C · Name-based lookup for salt/water (avoids false positives) ─────────────
name_lookup <- merged_metadata$Animals.Diet %>%
  mutate(item_lc = str_squish(str_to_lower(D.Item))) %>%
  filter(
    (str_detect(item_lc, salt_pat) | str_detect(item_lc, water_pat)) &
      !str_detect(item_lc, exclude_pat)
  ) %>%
  distinct(B.Code, D.Item) %>%
  mutate(flag_nonenergy = TRUE)

# ── D · Union both lookups (type + name) ──────────────────────────────────────
nonenergy_lookup <- bind_rows(type_lookup, name_lookup) %>%
  distinct(B.Code, D.Item, .keep_all = TRUE)

# ── E · Ensure every (B.Code, D.Item) in lookup has a GE row (fill 0 if missing)
missing_nonenergy_ge <- nonenergy_lookup %>%
  anti_join(
    Gross_energy %>% filter(DC.Variable == "GE") %>% select(B.Code, D.Item),
    by = c("B.Code", "D.Item")
  ) %>%
  transmute(
    B.Code, D.Item,
    DC.Variable = "GE",
    DC.Value    = 0,
    DC.Unit     = "MJ/kg",
    GE_source   = "assumed_zero"
  )

# ── F · Bind, tag, and overwrite all matching rows to zero GE ─────────────────
Gross_energy <- Gross_energy %>%
  bind_rows(missing_nonenergy_ge) %>%
  left_join(nonenergy_lookup, by = c("B.Code", "D.Item")) %>%
  mutate(
    DC.Value  = if_else(DC.Variable == "GE" & flag_nonenergy, 0, DC.Value),
    DC.Unit   = if_else(DC.Variable == "GE" & flag_nonenergy, "MJ/kg", DC.Unit),
    GE_source = if_else(DC.Variable == "GE" & flag_nonenergy, "assumed_zero", GE_source)
  ) %>%
  select(-flag_nonenergy)
                             # tidy up

13) Estimate gross energy for ingredients from proximate nutritional composition

To estimate Gross Energy (GE) when direct measurements are not available, we use a predictive equation based on proximate nutritional components: crude protein (CP), ether extract (EE), and ash. This method provides an accurate approximation of the energy content of feed ingredients based on their macronutrient composition.

The equation used is adapted from Weiss et al. (2019) and is expressed as:

GE (Mcal/kg) = (0.056 × CP) + (0.094 × EE) + [0.042 × (100 − CP − EE − Ash)]
GE (MJ/kg) = GE (Mcal/kg) × 4.184
# --- STEP 1: Prepare predictors (CP, EE, Ash) as % of DM ---
# Keep relevant metadata for later
metadata_cols <- merged_metadata$Animals.Diet.Comp %>%
  select(B.Code, D.Item, is_entire_diet, Feedipedia) %>%
  distinct()

ge_predictors <- merged_metadata$Animals.Diet.Comp %>%
  filter(DC.Variable %in% c("CP", "EE", "Ash")) %>%
  mutate(
    DC.Unit = trimws(tolower(DC.Unit)),
    DC.Value = as.numeric(DC.Value),

    # Harmonize to % of DM
    DC.Value = case_when(
      DC.Unit %in% c("%", "% dm", "% as fed", "g/100g", "g/100g dm") ~ DC.Value,
      DC.Unit %in% c("g", "g/kg", "g/kg dm")                         ~ DC.Value / 10,
      DC.Unit %in% c("kg", "kg/day")                                ~ DC.Value * 100,
      TRUE ~ NA_real_
    ),
    DC.Unit = "%"
  ) %>%
  group_by(B.Code, D.Item, DC.Variable) %>%
  summarise(DC.Value = mean(DC.Value, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(names_from = DC.Variable, values_from = DC.Value) %>%
  left_join(metadata_cols, by = c("B.Code", "D.Item"))

# --- STEP 2: Apply Weiss GE equation only where CP, EE, Ash are all present ---
ge_predicted <- ge_predictors %>%
  mutate(
    GE_Mcal_per_kg = ifelse(
      !is.na(CP) & !is.na(EE) & !is.na(Ash),
      (CP * 0.056) + (EE * 0.094) + ((100 - CP - EE - Ash) * 0.042),
      NA_real_
    ),
    GE_MJ_per_kg = GE_Mcal_per_kg * 4.184
  ) %>%
  select(B.Code, D.Item, GE_MJ_per_kg, is_entire_diet, Feedipedia)


# --- STEP 3: Merge and only impute where GE is missing ---
Gross_energy <- Gross_energy %>%
  
  # STEP 2: Join GE predictions (e.g. from Weiss)
  full_join(ge_predicted, by = c("B.Code", "D.Item","is_entire_diet","Feedipedia")) %>%
  
  # STEP 3: Only update GE_source if it is still missing
  mutate(
    GE_source = case_when(
      is.na(GE_source) & !is.na(GE_MJ_per_kg) & is.na(DC.Value) ~ "predicted_from_Weiss_GE",
      TRUE ~ GE_source
    ),
    DC.Value = if_else(is.na(DC.Value), GE_MJ_per_kg, DC.Value),
    DC.Unit = "MJ/kg",
    DC.Variable = "GE"
  ) %>%
  select(-GE_MJ_per_kg)
# 1. Total papers per species
species_counts <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Total_Studies")

# 2. All papers with GE (non-NA)
GE_data <- Gross_energy %>%
  filter(DC.Variable == "GE", !is.na(DC.Value) & DC.Value != "") %>%
  distinct(B.Code)

# 3. Papers with GE and species (intersection)
GE_species <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  semi_join(GE_data, by = "B.Code") %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Studies_with_GE")

# 4. Combine the two
summary_table <- species_counts %>%
  full_join(GE_species, by = "P.Product") %>%
  mutate(across(everything(), ~replace_na(., 0)))

# Display table
kable(summary_table, caption = "Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)")
Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)
P.Product Total_Studies Studies_with_GE
Cattle 126 119
Goat 186 182
Sheep 223 221

14) Infer gross energy from data

As we did in 9) we are inferring again GE data after estimation by proximal nutritional composition.

library(dplyr)

# 1) Per-ingredient mean GE (exclude entire diets; treat NA as FALSE just for this calc)
ingredient_means <- Gross_energy %>%
  mutate(`_is_entire_diet` = coalesce(is_entire_diet, FALSE)) %>%   # NA -> FALSE
  filter(DC.Variable == "GE", `_is_entire_diet` == FALSE, !is.na(DC.Value)) %>%
  group_by(D.Item) %>%
  summarise(mean_value = mean(DC.Value, na.rm = TRUE), .groups = "drop")

# 2) Join and fill missing GE for ingredient rows only
Gross_energy <- Gross_energy %>%
  left_join(ingredient_means, by = "D.Item") %>%
  mutate(`_is_entire_diet` = coalesce(is_entire_diet, FALSE)) %>%   # NA -> FALSE (decision only)
  mutate(
    GE_source = case_when(
      DC.Variable == "GE" & is.na(DC.Value) & `_is_entire_diet` == FALSE & !is.na(mean_value)
        ~ "inferred_from_same_ingredient-Step2",
      TRUE ~ GE_source
    ),
    DC.Value = if_else(
      DC.Variable == "GE" & is.na(DC.Value) & `_is_entire_diet` == FALSE & !is.na(mean_value),
      mean_value,
      DC.Value
    ),
    # normalize unit whenever GE has a value (original or imputed)
    DC.Unit = if_else(DC.Variable == "GE" & !is.na(DC.Value), "MJ/kg", DC.Unit)
  ) %>%
  select(-mean_value, -`_is_entire_diet`)
# 1. Total papers per species
species_counts <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Total_Studies")

# 2. All papers with GE (non-NA)
GE_data <- Gross_energy %>%
  filter(DC.Variable == "GE", !is.na(DC.Value) & DC.Value != "") %>%
  distinct(B.Code)

# 3. Papers with GE and species (intersection)
GE_species <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  semi_join(GE_data, by = "B.Code") %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Studies_with_GE")

# 4. Combine the two
summary_table <- species_counts %>%
  full_join(GE_species, by = "P.Product") %>%
  mutate(across(everything(), ~replace_na(., 0)))

# Display table
kable(summary_table, caption = "Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)")
Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)
P.Product Total_Studies Studies_with_GE
Cattle 126 120
Goat 186 182
Sheep 223 221

15) Estimate Digestible energy from proximate nutritional composition

# --- STEP 4: Prepare predictors for Weiss DE equation ---
# --- STEP 1: Store relevant metadata before pivot ---
de_metadata_cols <- merged_metadata$Animals.Diet.Comp %>%
  select(B.Code, D.Item, is_entire_diet, Feedipedia) %>%
  distinct()

# --- STEP 2: Prepare DE predictors and harmonize units ---
de_predictors <- merged_metadata$Animals.Diet.Comp %>%
  filter(DC.Variable %in% c("CP", "EE", "Ash", "NDF")) %>%
  mutate(
    DC.Unit = trimws(tolower(DC.Unit)),
    DC.Value = as.numeric(DC.Value),

    # Harmonize to g/kg DM
    DC.Value = case_when(
      DC.Unit %in% c("%", "% dm", "% as fed", "g/100g", "g/100g dm") ~ DC.Value * 10,
      DC.Unit %in% c("g", "g/kg", "g/kg dm")                         ~ DC.Value,
      DC.Unit %in% c("kg", "kg/day")                                ~ DC.Value * 1000,
      TRUE ~ NA_real_
    ),
    DC.Unit = "g/kg"
  ) %>%
  group_by(B.Code, D.Item, DC.Variable) %>%
  summarise(DC.Value = mean(DC.Value, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(names_from = DC.Variable, values_from = DC.Value) %>%
  left_join(de_metadata_cols, by = c("B.Code", "D.Item"))

# --- STEP 3: Apply Weiss DE equation and convert to GE ---
de_predicted <- de_predictors %>%
  mutate(
    DE_Mcal_per_kg = ifelse(
      !is.na(CP) & !is.na(EE) & !is.na(Ash) & !is.na(NDF),
      1.044 + 0.0101 * CP + 0.0161 * EE - 0.0037 * Ash - 0.0023 * NDF,
      NA_real_
    ),
    DE_MJ_per_kg = DE_Mcal_per_kg * 4.184,
    GE_MJ_per_kg = ifelse(!is.na(DE_MJ_per_kg), DE_MJ_per_kg / 0.54, NA_real_)
  ) %>%
  select(B.Code, D.Item, GE_MJ_per_kg, is_entire_diet, Feedipedia)


# --- STEP 6: Fill remaining missing GE using DE-based predictions only if GE is still missing ---
Gross_energy <- full_join(Gross_energy, de_predicted, by = c("B.Code", "D.Item", "is_entire_diet", "Feedipedia")) %>%
  mutate(
    DC.Value = if_else(is.na(DC.Value) & !is.na(GE_MJ_per_kg), GE_MJ_per_kg, DC.Value),
    GE_source = case_when(
      is.na(GE_source) & is.na(DC.Value) & !is.na(GE_MJ_per_kg) ~ "predicted_from_Weiss_DE",
      TRUE ~ GE_source  # keep existing label (e.g., feedipedia, raw data, inferred, or missing)
    ),
    DC.Unit = "MJ/kg",
    DC.Variable = "GE"
  ) %>%
  select(-GE_MJ_per_kg)
GE_data <- Gross_energy %>%
  filter(DC.Variable == "GE", !is.na(DC.Value) & DC.Value != "") %>%
  distinct(B.Code)

# 3. Papers with GE and species (intersection)
GE_species <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  semi_join(GE_data, by = "B.Code") %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Studies_with_GE")

summary_table <- species_counts %>%
  full_join(GE_species, by = "P.Product") %>%
  mutate(across(everything(), ~replace_na(., 0)))

kable(summary_table, caption = "Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)")
Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)
P.Product Total_Studies Studies_with_GE
Cattle 126 120
Goat 186 182
Sheep 223 221

We dont get more papers seems logical because probably the same ingredient as the equation for GE.

16) Convert digestible energy into gross energy

Converts digestible energy (DE) to gross energy (GE) using a default digestibility factor (54%) when DE is reported but GE is not.

# 1. Total papers per species
species_counts <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Total_Studies")

# 2. All papers with GE (non-NA)
DE_data <- merged_metadata$Animals.Diet.Comp %>%
  filter(DC.Variable == "DE", !is.na(DC.Value) & DC.Value != "") %>%
  distinct(B.Code)

# 3. Papers with GE and species (intersection)
DE_species <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  semi_join(DE_data, by = "B.Code") %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Studies_with_DE")

# 4. Combine the two
summary_table <- species_counts %>%
  full_join(DE_species, by = "P.Product") %>%
  mutate(across(everything(), ~replace_na(., 0)))

# Display table
kable(summary_table, caption = "Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)")
Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)
P.Product Total_Studies Studies_with_DE
Cattle 126 0
Goat 186 2
Sheep 223 2

Most of the DE data is coming from feediepdia which is expressed as a % of DE so we can’t convert that data back using the IPCC method.

# # Define digestibility conversion factor (Song et al., 2025)
# default_DE_fraction <- 0.54  # 54%
# 
# # 1. Extract DE rows from merged_metadata
# de_data <- merged_metadata$Animals.Diet.Digest %>%
#   filter(DD.Variable == "DE", !is.na(DD.Value) & DD.Value != "") %>%
#   mutate(
#     DD.Value = as.numeric(DD.Value),
#     DD.Unit = trimws(tolower(DD.Unit)),
#     DD.Value = case_when(
#       DD.Unit %in% c("mj/kg", "mj/kg dm") ~ DD.Value,
#       DD.Unit == "mcal/kg" ~ DD.Value * 4.184,
#       DD.Unit == "kcal/kg" ~ DD.Value / 239,
#       DD.Unit == "kcal/kg dm" ~ DD.Value / 239,
#       DD.Unit == "kcal/100g" ~ (DD.Value * 10) / 239,
#       DD.Unit == "kj/g" ~ DD.Value / 1000,
#       DD.Unit == "kj/100g" ~ DD.Value / 100,
#       TRUE ~ NA_real_
#     ),
#     DD.Unit = "MJ/kg"
#   ) %>%
#   mutate(
#     GE_from_DE = DD.Value / default_DE_fraction
#   ) %>%
#   select(B.Code, D.Item, GE_from_DE)
# 
# # 2. Merge GE_from_DE into your existing Gross_energy dataset
# Gross_energy <- Gross_energy %>%
#   left_join(de_data, by = c("B.Code", "D.Item")) %>%
#   mutate(
#     # Flag only when GE was previously missing and DE is available
#     GE_source = case_when(
#       is.na(GE_source) & is.na(DC.Value) & !is.na(GE_from_DE) ~ "inferred_from_DE",
#       TRUE ~ GE_source
#     ),
#     # Fill in missing GE values with GE_from_DE
#     DC.Value = if_else(is.na(DC.Value), GE_from_DE, DC.Value),
#     DC.Unit = "MJ/kg",
#     DC.Variable = "GE"
#   ) %>%
#   select(-GE_from_DE)
# 1. Total studies per species
species_total <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  distinct(B.Code, P.Product)

# 2. GE: Only B.Codes where DC.Variable == GE and DC.Value is NOT NA
ge_bcodes <- Gross_energy %>%
  filter(DC.Variable == "GE", !is.na(DC.Value)) %>%
  distinct(B.Code) %>%
  pull()

# 3. Feed: Only B.Codes where D.Amount is NOT NA
feed_amount_bcodes <- ingredients %>%
  filter(!is.na(D.Amount)) %>%
  distinct(B.Code) %>%
  pull()

# 4. B.Codes with BOTH GE value AND feed amount
both_bcodes <- intersect(ge_bcodes, feed_amount_bcodes)

# 5. Build final summary table by species
final_summary <- species_total %>%
  mutate(
    Has_GE = B.Code %in% ge_bcodes,
    Has_FEED_or_Amount = B.Code %in% feed_amount_bcodes,
    Has_BOTH = B.Code %in% both_bcodes
  ) %>%
  group_by(P.Product) %>%
  summarise(
    Total_Studies = n_distinct(B.Code),
    Studies_with_GE = sum(Has_GE),
    Studies_with_FEED_or_Amount = sum(Has_FEED_or_Amount),
    Studies_with_BOTH = sum(Has_BOTH)
  ) %>%
  arrange(P.Product)

# Show summary
knitr::kable(final_summary, caption = "Final Summary (with actual GE and Feed Amount values)")
Final Summary (with actual GE and Feed Amount values)
P.Product Total_Studies Studies_with_GE Studies_with_FEED_or_Amount Studies_with_BOTH
Cattle 126 120 29 29
Goat 186 182 34 34
Sheep 223 221 55 55
# Optional: List the actual B.Codes with both
bcode_list_both <- species_total %>%
  filter(B.Code %in% both_bcodes) %>%
  arrange(P.Product, B.Code)

# Show the B.Codes table
knitr::kable(bcode_list_both, caption = "B.Codes with BOTH GE (DC.Value) and Feed Intake (D.Amount)")
B.Codes with BOTH GE (DC.Value) and Feed Intake (D.Amount)
B.Code P.Product
BO1023 Cattle
BO1028 Cattle
BO1033 Cattle
BO1038 Cattle
CJ1006 Cattle
EM1050 Cattle
EM1093 Cattle
EO0041 Cattle
HK0086 Cattle
HK0138 Cattle
JS0201.1 Cattle
JS0201.2 Cattle
JS0205 Cattle
JS0215.1 Cattle
JS0215.2 Cattle
JS0324.1 Cattle
JS0324.2 Cattle
JS0345 Cattle
LM0101 Cattle
LM0124 Cattle
LM0127 Cattle
LM0227.1 Cattle
LM0227.2 Cattle
LM0228.2 Cattle
NJ0016 Cattle
NJ0025 Cattle
NN0207.2 Cattle
NN0227 Cattle
NN0280.2 Cattle
AG0042 Goat
BO1015 Goat
BO1042 Goat
BO1076 Goat
BO1098 Goat
BO1099 Goat
DK0037 Goat
EM1011 Goat
EM1035 Goat
EO0032 Goat
EO0057 Goat
HK0107 Goat
HK0171 Goat
HK0182 Goat
HK0337 Goat
HK0350 Goat
JS0172 Goat
JS0192 Goat
JS0193 Goat
JS0196 Goat
JS0197 Goat
JS0208 Goat
JS0211 Goat
JS0231 Goat
JS0261 Goat
LM0248 Goat
LM0280 Goat
LM0351 Goat
NJ0013 Goat
NN0083 Goat
NN0092.2 Goat
NN0229 Goat
NN0374 Goat
NN0375 Goat
0 Sheep
AN0007 Sheep
AN0021 Sheep
AN0029 Sheep
BO1008 Sheep
BO1025 Sheep
BO1034 Sheep
BO1041 Sheep
BO1048 Sheep
BO1050 Sheep
BO1051 Sheep
BO1059 Sheep
CJ1004 Sheep
CJ1015 Sheep
DK0007 Sheep
EM1023 Sheep
EM1031 Sheep
EM1033 Sheep
EM1034 Sheep
EM1082 Sheep
EM1085 Sheep
EM1090 Sheep
EO0046 Sheep
HK0033 Sheep
HK0082 Sheep
JO1028 Sheep
JO1036 Sheep
JO1038 Sheep
JO1082 Sheep
JS0210.1 Sheep
JS0210.2 Sheep
JS0249 Sheep
JS0262 Sheep
JS0273 Sheep
JS0276 Sheep
JS0285 Sheep
JS0291 Sheep
JS0301 Sheep
JS0316 Sheep
JS0344 Sheep
JS0350 Sheep
JS0419 Sheep
LM0298 Sheep
NJ0008 Sheep
NJ0032 Sheep
NJ0044 Sheep
NN0081 Sheep
NN0083 Sheep
NN0180.2 Sheep
NN0200 Sheep
NN0207.1 Sheep
NN0212 Sheep
NN0234 Sheep
NN0329 Sheep
NN0370 Sheep
Gross_energy <- Gross_energy %>%
  filter(!str_starts(D.Item, "Base"))

missing_GE_items <- Gross_energy %>%   # replace with your dataframe name
  filter(DC.Variable == "GE", is.na(DC.Value)) %>%
  count(D.Item, sort = TRUE, name = "n_missing")

17) Calculation of CH4 emissions

To estimate enteric methane emissions, a methane conversion factor (Ym) is applied to the gross energy (GE) intake of livestock. Ym represents the proportion of the animal’s gross energy intake that is converted to methane and varies based on the species, productivity level, and diet composition. The IPCC (2019) provides updated Ym values based on feed digestibility and forage content. Below is the logic applied:

kable(
  data.frame(
    Category = c("Dairy Cattle (High)", "Dairy Cattle (Medium)", "Dairy Cattle (Low)", "Dairy Cattle (Unknown)",
                 "Meat Cattle (>75% forage)", "Meat Cattle (15–75% forage)", "Meat Cattle (<15% forage)", "Meat Cattle (Unknown)",
                 "Sheep", "Lambs", "Goats"),
    Ym_Percent = c(5.7, 6.3, 6.5, 6.0, 7.0, 6.3, 4.0, 6.5, 6.7, 4.5, 5.5),
    Condition = c("> 8500 kg milk/year", "5000–8500 kg milk/year", "< 5000 kg milk/year", "Unknown milk yield",
                  "Forage > 75%", "Forage 15–75%", "Forage < 15%", "Unknown forage %", 
                  "Default for Sheep", "If 'lamb' in Herd.Stage", "Default for Goats")
  ),
  caption = "Assigned Ym Values Based on IPCC 2019 Guidelines and Available Data"
)
Assigned Ym Values Based on IPCC 2019 Guidelines and Available Data
Category Ym_Percent Condition
Dairy Cattle (High) 5.7 > 8500 kg milk/year
Dairy Cattle (Medium) 6.3 5000–8500 kg milk/year
Dairy Cattle (Low) 6.5 < 5000 kg milk/year
Dairy Cattle (Unknown) 6.0 Unknown milk yield
Meat Cattle (>75% forage) 7.0 Forage > 75%
Meat Cattle (15–75% forage) 6.3 Forage 15–75%
Meat Cattle (<15% forage) 4.0 Forage < 15%
Meat Cattle (Unknown) 6.5 Unknown forage %
Sheep 6.7 Default for Sheep
Lambs 4.5 If ‘lamb’ in Herd.Stage
Goats 5.5 Default for Goats

17.1) Ym estimations

library(dplyr)
library(stringr)

# ── Catch ALL meat-yield outcomes (final BW, carcass, slaughter body, etc.) ──
meat_regex <- paste0(
  "(?i)",                                     # case-insensitive
  "\\bmeat\\s*yield\\b",                      # "Meat Yield-..."
  "|\\bfinal\\s*body\\s*weight\\s*meat\\s*yield\\b",  # reversed wording
  "|\\bslaughter\\s*body\\b",
  "|\\bcarcass\\b"                            # hot/cold/empty carcass
)

# 1) Filter yield rows: Milk + ALL meat outcomes
yield_data <- merged_metadata$Data.Out %>%
  filter(
    Out.Subind == "Milk Yield" |
    str_detect(Out.Subind, meat_regex)
  )

# 2) Normalize units to a small canonical set (same behavior as before)
yield_data <- yield_data %>%
  mutate(
    Out.Unit = tolower(trimws(Out.Unit)),
    Out.Unit = case_when(
      Out.Unit %in% c("kg","kg/d","kg/day","kg/day/individual","kg/individual/day",
                      "kg/d/individual","kg/individual") ~ "kg/individual/day",
      Out.Unit %in% c("g","g/d","g/day","g/day/individual","g/d/individual",
                      "g/individual/day","g/individual") ~ "g/individual/day",
      Out.Unit %in% c("mg/d/individual")                ~ "mg/individual/day",
      Out.Unit %in% c("l/d","l/day","l/d/individual","l/individual/day","l/individual",
                      "l/individual/day","l/individual") ~ "l/individual/day",
      Out.Unit %in% c("ml/d/individual","ml/individual","ml/individual/day") ~ "ml/individual/day",
      Out.Unit == "g/kg/individual"                     ~ "g/kg/individual",
      Out.Unit == "kg/replicate/experiment"             ~ "kg/replicate/experiment",
      TRUE ~ Out.Unit
    )
  ) %>%
  mutate(
    Out.Unit  = tolower(trimws(Out.Unit)),
    ED.Mean.T = case_when(
      Out.Unit == "g/individual/day"  ~ ED.Mean.T / 1000,  # g → kg
      Out.Unit == "mg/individual/day" ~ ED.Mean.T / 1e6,   # mg → kg
      Out.Unit == "ml/individual/day" ~ ED.Mean.T / 1000,  # mL → L
      TRUE ~ ED.Mean.T
    ),
    Out.Unit = case_when(
      Out.Unit %in% c("kg/individual/day","g/individual/day","mg/individual/day") ~ "kg/individual/day",
      Out.Unit %in% c("l/individual/day","ml/individual/day") ~ "l/individual/day",
      TRUE ~ NA_character_
    )
  ) %>%
  # Keep only acceptable units (same as before)
  filter(Out.Unit %in% c("kg/individual/day","l/individual/day"))

# 3) Annual milk yield per B.Code (unchanged)
milk_yield <- yield_data %>%
  filter(Out.Subind == "Milk Yield", Out.Unit == "kg/individual/day") %>%
  group_by(B.Code) %>%
  summarise(milk_kg_year = mean(ED.Mean.T, na.rm = TRUE) * 365, .groups = "drop")

# 4) Purpose per B.Code, now recognizing ALL meat outcomes; Dairy still wins
purpose_by_code <- merged_metadata$Data.Out %>%
  mutate(
    is_milk = Out.Subind == "Milk Yield",
    is_meat = str_detect(Out.Subind, meat_regex)
  ) %>%
  filter(is_milk | is_meat) %>%
  mutate(
    Purpose = case_when(
      is_milk ~ "Dairy",
      is_meat ~ "Meat",
      TRUE ~ NA_character_
    )
  ) %>%
  group_by(B.Code) %>%
  summarise(Purpose = if ("Dairy" %in% Purpose) "Dairy" else first(Purpose),
            .groups = "drop")

# 5) (unchanged) Harmonize ingredient amounts to kg and proceed with your pipeline
ingredients <- ingredients %>%
  mutate(
    D.Amount = ifelse(D.Unit.Amount == "g", D.Amount / 1000, D.Amount),
    D.Unit.Amount = ifelse(D.Unit.Amount == "g", "kg", D.Unit.Amount)
  )

diet_valid_units <- ingredients %>%
  group_by(B.Code, A.Level.Name) %>%
  filter(n_distinct(D.Unit.Amount, na.rm = TRUE) == 1) %>%
  ungroup()

forage_types <- c("Forage Trees","Crop Byproduct","Herbaceous Fodders","Agroforestry Fodders")

forage_proportions <- diet_valid_units %>%
  filter(D.Type != "Entire Diet") %>%
  group_by(B.Code, A.Level.Name) %>%
  summarise(
    any_na = any(is.na(D.Amount)),
    total_diet   = ifelse(any_na, NA_real_, sum(D.Amount, na.rm = TRUE)),
    total_forage = ifelse(any_na, NA_real_, sum(D.Amount[D.Type %in% forage_types], na.rm = TRUE)),
    n_ingredients = n(),
    is_forage = all(D.Type[1] %in% forage_types),
    D.Type.first = D.Type[1],
    D.Unit.Amount = first(D.Unit.Amount),
    forage_prop = case_when(
      n_ingredients == 1 & D.Type.first %in% forage_types    ~ 1,
      n_ingredients == 1 & !(D.Type.first %in% forage_types) ~ 0,
      total_diet > 0                                         ~ total_forage / total_diet,
      TRUE                                                   ~ NA_real_
    ),
    .groups = "drop"
  )
# 
# ingredients <- ingredients %>%                       # your table
#   # 1 ── throw away the old “source‑based” label -----------------------------
#   select(-D.Type) %>%                                       # drop the column
# 
#   # 2 ── remove any diet that still has an NA in D.Type_Nutri ----------------
#   group_by(B.Code, T.Name) %>%       # <<– keys that identify a single diet
#   filter( all(!is.na(D.Type_Nutri)) ) %>%  # keep diet only if none are NA
#   ungroup()

17.2) Reconstruct Gross Energy at the diet level from ingredient data

Reconstructs the gross energy of the entire diet from the weighted average of GE values of individual ingredients, when diet-level GE is not reported.

# --- Calculate GE_diet (MJ/kg) from ingredient GE and amount ---

# 1. Join GE values to ingredient amounts
# --- Calculate GE_diet (MJ/kg) ONLY if all ingredients have GE ---

# 1. Join GE values to ingredient amounts
# Collapse Gross_energy to mean DC.Value per D.Item
gross_energy_mean <- Gross_energy %>%
  filter(!is_entire_diet) %>%
  group_by(D.Item) %>%
  summarise(DC.Value = mean(DC.Value, na.rm = TRUE), .groups = "drop")

# Join by D.Item only
ingredients_with_GE <- ingredients %>%
  mutate(Diet_ID = paste(B.Code, A.Level.Name, sep = "__")) %>%
  left_join(gross_energy_mean, by = "D.Item") %>%
  mutate(
    D.Amount_kg = case_when(
      D.Unit.Amount == "g"  ~ D.Amount / 1000,
      D.Unit.Amount == "kg" ~ D.Amount,
      TRUE ~ NA_real_
    )
  )


# 2. Count ingredients with/without GE
ingredient_counts <- ingredients_with_GE %>%
  group_by(Diet_ID) %>%
  summarise(
    n_total = n(),
    n_with_GE = sum(!is.na(DC.Value)),
    .groups = "drop"
  ) %>%
  filter(n_total == n_with_GE)

# 3. Reconstruct GE_diet (MJ/kg)
GE_from_ingredients <- ingredients_with_GE %>%
  semi_join(ingredient_counts, by = "Diet_ID") %>%
  group_by(Diet_ID, B.Code, A.Level.Name) %>%
  summarise(
    GE_diet = sum(DC.Value * D.Amount_kg) / sum(D.Amount_kg),
    .groups = "drop"
  )

# 4. Use GE_diet when full diet-level GE is missing
# Join forage proportions into GE_diet_reconstructed
GE_diet_reconstructed <- GE_from_ingredients %>%
  left_join(total_amounts %>%
              select(B.Code, A.Level.Name, Total_Amount, D.Unit.Amount),
            by = c("B.Code", "A.Level.Name")) %>%
  left_join(merged_metadata$Prod.Out %>% select(B.Code, P.Product, Herd.Stage), by = "B.Code") %>%
  left_join(purpose_by_code, by = "B.Code") %>%
  left_join(forage_proportions %>% select(B.Code, A.Level.Name, forage_prop), 
            by = c("B.Code", "A.Level.Name")) %>%
  left_join(milk_yield, by = "B.Code") %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  mutate(
    Total_Amount_kg = case_when(
      D.Unit.Amount == "g" ~ Total_Amount / 1000,
      D.Unit.Amount == "kg" ~ Total_Amount,
      TRUE ~ NA_real_
    ),
    D.Unit.Amount = "kg",  # <- FORCE ALL TO BE KG
    
    Ym = case_when(
      P.Product == "Cattle" & Purpose == "Dairy" & milk_kg_year > 8500 ~ 0.057,
      P.Product == "Cattle" & Purpose == "Dairy" & milk_kg_year >= 5000 ~ 0.063,
      P.Product == "Cattle" & Purpose == "Dairy" & milk_kg_year < 5000 ~ 0.065,
      P.Product == "Cattle" & Purpose == "Dairy" & is.na(milk_kg_year)  ~ 0.06,
      P.Product == "Cattle" & Purpose == "Meat" & forage_prop > 0.75 ~ 0.07,
      P.Product == "Cattle" & Purpose == "Meat" & forage_prop > 0.15 ~ 0.063,
      P.Product == "Cattle" & Purpose == "Meat" & forage_prop >= 0 ~ 0.04,
      P.Product == "Cattle" & Purpose == "Meat" & is.na(forage_prop) ~ 0.065,
      P.Product == "Sheep" & grepl("lamb", Herd.Stage, ignore.case = TRUE) ~ 0.045,
      P.Product == "Sheep" ~ 0.067,
      P.Product == "Goat" ~ 0.055,
      TRUE ~ NA_real_
    ),
    
    GE_daily = ifelse(!is.na(GE_diet) & !is.na(Total_Amount_kg),
                      GE_diet * Total_Amount_kg,
                      NA_real_),
    
    CH4_kg_year = ifelse(!is.na(GE_daily) & !is.na(Ym),
                         GE_daily * Ym * 365 / 55.65,
                         NA_real_),
    
    CH4_flag = case_when(
      is.na(GE_diet) ~ "missing_GE_diet",
      is.na(Total_Amount_kg) ~ "missing_amount",
      is.na(Ym) ~ "missing_Ym",
      is.na(GE_daily) ~ "GE_not_calculated",
      is.na(CH4_kg_year) ~ "CH4_not_calculated",
      TRUE ~ "GE_diet_based"
    ),
    
    source = "GE_diet_from_ingredients"
  ) %>%
  rename(D.Item = A.Level.Name) %>%
  select(B.Code, D.Item, P.Product, Herd.Stage, GE_daily, CH4_kg_year, 
         Total_Amount = Total_Amount_kg, D.Unit.Amount, ED.Mean.T = Total_Amount_kg, 
         CH4_flag, Ym, source)

17.3) Gross energy at the entire diet level

Gross_energy_entire <- Gross_energy %>%
  filter(is_entire_diet == TRUE) %>%
  
  # Add feed amount and unit
  left_join(
    total_amounts %>% select(B.Code, A.Level.Name, Total_Amount, D.Unit.Amount),
    by = c("B.Code" = "B.Code", "D.Item" = "A.Level.Name")
  ) %>%

  # Add product info and herd stage
  left_join(
    merged_metadata$Prod.Out %>% select(B.Code, P.Product, Herd.Stage),
    by = "B.Code"
  ) %>%

  # Add dairy vs meat purpose
  left_join(purpose_by_code, by = "B.Code") %>%

  mutate(
    # Convert Total_Amount to kg/day
    Total_Amount = case_when(
      D.Unit.Amount == "g" ~ Total_Amount / 1000,
      D.Unit.Amount == "kg" ~ Total_Amount,
      TRUE ~ NA_real_
    ),

    # Fix unit
    D.Unit.Amount = case_when(
      D.Unit.Amount == "g" ~ "kg",
      TRUE ~ D.Unit.Amount
    ),

    # Diagnostic reason for failure
    reason_missing = case_when(
      is.na(Total_Amount) & D.Unit.Amount %in% c("g/kg", "%", "% Diet") ~ "Cannot convert unit to kg",
      TRUE ~ NA_character_
    ),

    # Gross energy intake
    GE_daily = ifelse(
      !is.na(Total_Amount) & !is.na(DC.Value),
      DC.Value * Total_Amount,
      NA_real_
    )
  )

17.4) Merge GE data with feed intake

  # Make sure 'purpose_by_code' is unique per B.Code
purpose_by_code_unique <- purpose_by_code %>%
  distinct(B.Code, Purpose)

# Then join safely before using 'Purpose'
GE_all <- Gross_energy_entire %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  left_join(forage_proportions %>% select(B.Code, A.Level.Name, forage_prop), 
            by = c("B.Code",  "D.Item" = "A.Level.Name")) %>%
  left_join(milk_yield, by = "B.Code") %>%
  left_join(
    feed_intake_diet %>% 
      select(B.Code, A.Level.Name, diet_intake_kg),
    by = c("B.Code", "D.Item" = "A.Level.Name")
  ) %>%
  mutate(
    Total_Amount = if_else(!is.na(diet_intake_kg), diet_intake_kg, Total_Amount),
    D.Unit.Amount = if_else(!is.na(diet_intake_kg), "kg", D.Unit.Amount),
Ym = case_when(
  # --- Dairy cattle based on productivity ---
  P.Product == "Cattle" & Purpose == "Dairy" & milk_kg_year > 8500 ~ 0.057,
  P.Product == "Cattle" & Purpose == "Dairy" & milk_kg_year >= 5000 ~ 0.063,
  P.Product == "Cattle" & Purpose == "Dairy" & milk_kg_year < 5000 ~ 0.065,
  P.Product == "Cattle" & Purpose == "Dairy" & is.na(milk_kg_year)  ~ 0.06,
  
  # --- Meat cattle based on forage proportions ---
  P.Product == "Cattle" & Purpose == "Meat" & forage_prop > 0.75 ~ 0.07,
  P.Product == "Cattle" & Purpose == "Meat" & forage_prop > 0.15 ~ 0.063,
  P.Product == "Cattle" & Purpose == "Meat" & forage_prop >= 0 ~ 0.04,
  P.Product == "Cattle" & Purpose == "Meat" & is.na(forage_prop) ~ 0.065,

  # --- Sheep & Lambs ---
  P.Product == "Sheep" & grepl("lamb", Herd.Stage, ignore.case = TRUE) ~ 0.045,
  P.Product == "Sheep" ~ 0.067,

  # --- Goats ---
  P.Product == "Goat" ~ 0.055,
  
  TRUE ~ NA_real_
),
    
    GE_daily = ifelse(!is.na(DC.Value) & !is.na(Total_Amount),
                      DC.Value * Total_Amount,
                      NA_real_),
    
    CH4_kg_year = ifelse(!is.na(GE_daily) & !is.na(Ym),
                         GE_daily * Ym * 365 / 55.65,
                         NA_real_),
    
    CH4_flag = case_when(
      is.na(DC.Value) ~ "missing_GE_value",
      is.na(Ym) ~ "missing_Ym",
      is.na(Total_Amount) ~ "missing_amount",
      is.na(GE_daily) ~ "GE_not_calculated",
      is.na(CH4_kg_year) ~ "CH4_not_calculated",
      !is.na(diet_intake_kg) ~ "amount_inferred from feed intake",
      TRUE ~ "ok"
    )
  )
# ──────────────────────────────────────────────────────────────────────────────
# 0) Keep only ingredient GE (not entire diet) and choose ONE value per item
#    using a priority order of sources you already created upstream.
#    Adjust the order if you prefer a different hierarchy.
# ──────────────────────────────────────────────────────────────────────────────
ge_priority <- c(
  "raw data",
  "feedipedia",
  "predicted_from_Weiss_GE",
  "predicted_from_Weiss_DE",
  "inferred_from_same_ingredient-Step2",
  "inferred_from_same_ingredient",
  "assumed_zero"
)

GE_ingredients_best <- Gross_energy %>%
  filter(is_entire_diet == FALSE, DC.Variable == "GE") %>%
  mutate(
    # normalize source label text
    GE_source = str_squish(tolower(GE_source)),
    # ensure unit is MJ/kg
    DC.Unit   = "MJ/kg",
    # rank by source priority
    src_rank  = match(GE_source, ge_priority)
  ) %>%
  arrange(B.Code, D.Item, src_rank, desc(!is.na(DC.Value))) %>%
  group_by(B.Code, D.Item) %>%
  # keep the single best row per (B.Code, D.Item)
  slice_head(n = 1) %>%
  ungroup() %>%
  transmute(
    B.Code,
    D.Item,
    GE_MJ_per_kg = as.numeric(DC.Value),
    GE_source
  )

# ──────────────────────────────────────────────────────────────────────────────
# 1) Ingredients only + attach GE + compute per-ingredient GE intake (MJ/day)
#    NOTE: we keep your existing columns intact and just append new ones.
# ──────────────────────────────────────────────────────────────────────────────
ingredients_enriched <- ingredients %>%
  # enforce "ingredients only" (i.e., exclude entire-diet pseudo-rows if present)
  filter(D.Type != "Entire Diet") %>%

  # join GE per (B.Code, D.Item)
  left_join(GE_ingredients_best, by = c("B.Code", "D.Item")) %>%

  # convert amounts to kg/day (where possible)
  mutate(
    D.Unit.Amount = str_squish(tolower(D.Unit.Amount)),
    Amount_kg_day = case_when(
      D.Unit.Amount == "kg" ~ D.Amount,
      D.Unit.Amount == "g"  ~ D.Amount / 1000,
      TRUE ~ NA_real_  # unsupported units remain NA
    ),

    # GE intake contributed by this ingredient (MJ/day)
    GE_MJ_day = if_else(!is.na(GE_MJ_per_kg) & !is.na(Amount_kg_day),
                        GE_MJ_per_kg * Amount_kg_day,
                        NA_real_)
  )

# ──────────────────────────────────────────────────────────────────────────────
# 2) (Optional) Quick diagnostics
# ──────────────────────────────────────────────────────────────────────────────
n_total_ing   <- nrow(ingredients_enriched)
n_with_GE     <- sum(!is.na(ingredients_enriched$GE_MJ_per_kg))
n_with_GEint  <- sum(!is.na(ingredients_enriched$GE_MJ_day))

message("Ingredients total: ", n_total_ing,
        " | with GE (MJ/kg): ", n_with_GE,
        " | with GE intake (MJ/day): ", n_with_GEint)
## Ingredients total: 2879 | with GE (MJ/kg): 2521 | with GE intake (MJ/day): 2157
# If you want to keep only your original columns + added GE columns in a view:
ingredients_view <- ingredients_enriched %>%
  select(
    B.Code, A.Level.Name, D.Item, D.Type, D.Amount, D.Unit.Amount, D.Unit.Time,
    D.Unit.Animals, DC.Is.Dry, D.Ad.lib, D.Item.Group, Source, Units,
    T.Animals, Out.WG.Start, Out.WG.Unit, basal_intake_status,
    Prediction_Source, Prediction_Source.from_AName, from_feed_intake,
    T.Name, T.Control,
    # New appended columns:
    GE_MJ_per_kg, GE_source, Amount_kg_day, GE_MJ_day
  )
###############################################################################
## SINGLE CHUNK: Sum Ingredient Intake & Merge into Entire-Diet GE + CH₄ Calc
###############################################################################

# 1) Sum ingredient-level feed intake ----------------------------------------
ingredient_intake_summed <- feed_intake_ingredients %>%
  dplyr::group_by(B.Code, A.Level.Name) %>%
  dplyr::summarise(
    sum_ingredient_intake = sum(ED.Mean.T, na.rm = TRUE),
    .groups = "drop"
  )

# 2) Merge sums into entire-diet GE table, then compute intake, GE, and CH₄ --
GE_all_reconstructed <- Gross_energy_entire %>%
  # Avoid .x/.y suffixes from prior columns
  dplyr::select(-dplyr::any_of(c("Purpose",
                                 "forage_prop",
                                 "diet_intake_kg",
                                 "sum_ingredient_intake",
                                 "Total_Amount_kg"))) %>%
  # Purpose (unique per paper)
  dplyr::left_join(
    purpose_by_code %>% dplyr::distinct(B.Code, Purpose),
    by = "B.Code"
  ) %>%
  # Forage proportion (match diet name to D.Item)
  dplyr::left_join(
    forage_proportions %>% dplyr::select(B.Code, A.Level.Name, forage_prop),
    by = c("B.Code", "D.Item" = "A.Level.Name")
  ) %>%
  # Milk yield (for dairy productivity bins)
  dplyr::left_join(milk_yield, by = "B.Code") %>%
  # Only ruminants
  dplyr::filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  # Entire-diet intake if available (kg DM d-1)
  dplyr::left_join(
    feed_intake_diet %>% dplyr::select(B.Code, A.Level.Name, diet_intake_kg),
    by = c("B.Code", "D.Item" = "A.Level.Name")
  ) %>%
  # Sum of ingredient-level intakes (kg DM d-1)
  dplyr::left_join(
    ingredient_intake_summed,
    by = c("B.Code", "D.Item" = "A.Level.Name")
  ) %>%
  # Compute CH4 fields ---------------------------------------------------------
  dplyr::mutate(
    # Standardize any lingering total-amount unit to kg
    Total_Amount_kg = dplyr::case_when(
      D.Unit.Amount == "kg" ~ Total_Amount,
      D.Unit.Amount == "g"  ~ Total_Amount / 1000,
      TRUE                  ~ NA_real_
    ),

    # Ym by species/purpose/forage (IPCC-style rules)
    Ym = dplyr::case_when(
      # Dairy cattle by productivity
      P.Product == "Cattle" & Purpose == "Dairy" & !is.na(milk_kg_year) & milk_kg_year > 8500 ~ 0.057,
      P.Product == "Cattle" & Purpose == "Dairy" & !is.na(milk_kg_year) & milk_kg_year >= 5000 ~ 0.063,
      P.Product == "Cattle" & Purpose == "Dairy" & !is.na(milk_kg_year) & milk_kg_year < 5000  ~ 0.065,
      P.Product == "Cattle" & Purpose == "Dairy" &  is.na(milk_kg_year)                          ~ 0.060,

      # Meat cattle by forage proportion
      P.Product == "Cattle" & Purpose == "Meat" & !is.na(forage_prop) & forage_prop > 0.75 ~ 0.070,
      P.Product == "Cattle" & Purpose == "Meat" & !is.na(forage_prop) & forage_prop > 0.15 ~ 0.063,
      P.Product == "Cattle" & Purpose == "Meat" & !is.na(forage_prop) & forage_prop >= 0   ~ 0.040,
      P.Product == "Cattle" & Purpose == "Meat" &  is.na(forage_prop)                      ~ 0.065,

      # Sheep & lambs
      P.Product == "Sheep" & stringr::str_detect(dplyr::coalesce(Herd.Stage, ""), "(?i)lamb") ~ 0.045,
      P.Product == "Sheep" ~ 0.067,

      # Goats
      P.Product == "Goat"  ~ 0.055,

      TRUE ~ NA_real_
    ),

    # Final daily intake (kg/day): prefer entire-diet intake, then total amount, then sum of ingredients
    final_intake_kg_day = dplyr::case_when(
      !is.na(diet_intake_kg)        ~ diet_intake_kg,
      !is.na(Total_Amount_kg)       ~ Total_Amount_kg,
      !is.na(sum_ingredient_intake) ~ sum_ingredient_intake,
      TRUE                          ~ NA_real_
    ),

    # Track source of intake
    final_intake_source = dplyr::case_when(
      !is.na(diet_intake_kg)        ~ "Entire diet feed intake",
      !is.na(Total_Amount_kg)       ~ "Total_Amount",
      !is.na(sum_ingredient_intake) ~ "Total feed intake (sum of ingredients)",
      TRUE                          ~ "Missing"
    ),

    # Daily GE intake (MJ/day) = GE content × daily intake
    GE_daily = dplyr::case_when(
      !is.na(final_intake_kg_day) & !is.na(DC.Value) ~ DC.Value * final_intake_kg_day,
      TRUE                                           ~ NA_real_
    ),

    # Annual CH4 emissions (kg/yr)
    CH4_kg_year = dplyr::if_else(
      !is.na(GE_daily) & !is.na(Ym),
      GE_daily * Ym * 365 / 55.65,
      NA_real_
    ),

    # Diagnostics
    CH4_flag = dplyr::case_when(
      is.na(DC.Value)            ~ "missing_GE_value",
      is.na(Ym)                  ~ "missing_Ym",
      is.na(final_intake_kg_day) ~ "missing_final_intake",
      is.na(GE_daily)            ~ "GE_not_calculated",
      is.na(CH4_kg_year)         ~ "CH4_not_calculated",
      TRUE                       ~ "ok"
    )
  )

17.5) Final calculation

#combine
# 1. Tag and select relevant columns from both tables
GE_all_clean <- GE_all %>%
  mutate(source = "GE_all") %>%
  select(
    B.Code, 
    D.Item, 
    P.Product,
    Herd.Stage,
    GE_daily, 
    CH4_kg_year,
    Total_Amount,
    D.Unit.Amount,
    diet_intake_kg, 
    CH4_flag,
    Ym,
    source
  )

GE_reconstructed_clean <- GE_all_reconstructed %>%
  mutate(source = "Reconstructed") %>%
  select(
    B.Code, 
    D.Item, 
    P.Product, 
    Herd.Stage,
    GE_daily, 
    CH4_kg_year, 
    Total_Amount,
    D.Unit.Amount,
    final_intake_kg_day, 
    CH4_flag, 
    Ym,
    source
  )

GE_combined <- bind_rows(GE_all_clean, GE_reconstructed_clean, GE_diet_reconstructed) %>%
  mutate(
    source_priority = case_when(
      source == "GE_all" ~ 1,
      source == "Reconstructed" ~ 2,
      source == "GE_diet_from_ingredients" ~ 3,
      TRUE ~ 4
    ),
    has_CH4 = !is.na(CH4_kg_year)
  ) %>%
  arrange(B.Code, D.Item, desc(has_CH4), source_priority) %>%
  distinct(B.Code, D.Item, .keep_all = TRUE) %>%
  select(-has_CH4, -source_priority)


# Define non-emitting or pre-ruminant stages
non_emitting_stages <- c(
   "Kid", "Kid Male", "Calf Female", "Weaned Calf Male",
  "Young Male", "Young Female", 
   "Intact Kid Male"
)

# Filter them out
GE_combined <- GE_combined %>%
  filter(!(Herd.Stage %in% non_emitting_stages))

GE_combined <- GE_combined %>%
  left_join(ctrl_key, by = c("B.Code", "D.Item" = "A.Level.Name"))



GE_data <- GE_combined %>%
  filter(!is.na(GE_daily) & GE_daily != "") %>%
  distinct(B.Code)

# 3. Papers with GE and species (intersection)
GE_species <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  semi_join(GE_data, by = "B.Code") %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Studies_with_GE")

summary_table <- species_counts %>%
  full_join(GE_species, by = "P.Product") %>%
  mutate(across(everything(), ~replace_na(., 0)))

kable(summary_table, caption = "Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)")
Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)
P.Product Total_Studies Studies_with_GE
Cattle 126 32
Goat 186 67
Sheep 223 68
# Count unique B.Code with non-missing CH4 emissions
n_CH4_codes <- GE_combined %>%
  filter(!is.na(CH4_kg_year)) %>%
  distinct(B.Code) %>%
  nrow()

n_CH4_codes
## [1] 162
n_CH4_codes_by_product <- GE_combined %>%
  filter(!is.na(CH4_kg_year)) %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "n_CH4_codes")

n_CH4_codes_by_product
##    P.Product n_CH4_codes
##       <char>       <int>
## 1:    Cattle          28
## 2:      Goat          66
## 3:     Sheep          68

variability of CH4 within codes with different diets

# 1. Keep only codes with CH4
GE_with_CH4 <- GE_combined %>%
  filter(!is.na(CH4_kg_year)) %>%
  filter(CH4_kg_year<500)%>%
  select(B.Code, D.Item, P.Product, CH4_kg_year)

# 2. Variability of CH4 within each code
ch4_variability <- GE_with_CH4 %>%
  group_by(B.Code, P.Product) %>%
  summarise(
    n_diets      = n_distinct(D.Item),
    min_CH4      = min(CH4_kg_year, na.rm = TRUE),
    max_CH4      = max(CH4_kg_year, na.rm = TRUE),
    mean_CH4     = mean(CH4_kg_year, na.rm = TRUE),
    sd_CH4       = sd(CH4_kg_year, na.rm = TRUE),
    range_CH4    = max_CH4 - min_CH4,
    .groups = "drop"
  ) %>%
  filter(n_diets > 1)   # only studies with >1 diet

# 3. Summary across products (Cattle, Sheep, Goat)
ch4_variability_summary <- ch4_variability %>%
  group_by(P.Product) %>%
  summarise(
    n_studies       = n(),
    avg_range       = mean(range_CH4, na.rm = TRUE),
    avg_sd          = mean(sd_CH4, na.rm = TRUE),
    max_within_diff = max(range_CH4, na.rm = TRUE),
    .groups = "drop"
  )

ggplot(ch4_variability, aes(x = reorder(B.Code, range_CH4), 
                            ymin = min_CH4, ymax = max_CH4, 
                            color = P.Product)) +
  geom_linerange(size = 1) +
  geom_point(aes(y = min_CH4), shape = 21, fill = "white", size = 2) +
  geom_point(aes(y = max_CH4), shape = 21, fill = "black", size = 2) +
  coord_flip() +
  labs(
    x = "Study (B.Code)", 
    y = "CH₄ emissions (kg/year)",
    title = "Within-study range of CH₄ emissions across diets"
  ) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ch4_variability_stats <- GE_with_CH4 %>%
  filter(CH4_kg_year <= 500) %>%               # remove outliers
  group_by(B.Code, P.Product) %>%
  summarise(
    n_diets   = n(),
    mean_CH4  = mean(CH4_kg_year, na.rm = TRUE),
    sd_CH4    = sd(CH4_kg_year, na.rm = TRUE),       # standard deviation
    iqr_CH4   = IQR(CH4_kg_year, na.rm = TRUE),      # interquartile range
    cv_CH4    = sd(CH4_kg_year, na.rm = TRUE) / mean(CH4_kg_year, na.rm = TRUE), # coefficient of variation
    .groups = "drop"
  )

ggplot(ch4_variability_stats, aes(x = P.Product, y = cv_CH4, fill = P.Product)) +
  geom_boxplot(alpha = 0.6) +
  geom_jitter(width = 0.2, alpha = 0.5) +
  labs(
    x = "Species",
    y = "CH₄ coefficient of variation",
    title = "Relative variability of CH₄ emissions within studies"
  ) +
  theme_minimal()
## Warning: Removed 18 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 18 rows containing missing values or values outside the scale range
## (`geom_point()`).

# 1. Keep only codes with CH4
GE_with_CH4 <- GE_combined %>%
  filter(!is.na(GE_daily)) %>%
  filter(CH4_kg_year<500)%>%
  select(B.Code, D.Item, P.Product, GE_daily)

# 2. Variability of CH4 within each code
ch4_variability <- GE_with_CH4 %>%
  group_by(B.Code, P.Product) %>%
  summarise(
    n_diets      = n_distinct(D.Item),
    min_CH4      = min(GE_daily, na.rm = TRUE),
    max_CH4      = max(GE_daily, na.rm = TRUE),
    mean_CH4     = mean(GE_daily, na.rm = TRUE),
    sd_CH4       = sd(GE_daily, na.rm = TRUE),
    range_CH4    = max_CH4 - min_CH4,
    .groups = "drop"
  ) %>%
  filter(n_diets > 1)   # only studies with >1 diet

# 3. Summary across products (Cattle, Sheep, Goat)
ch4_variability_summary <- ch4_variability %>%
  group_by(P.Product) %>%
  summarise(
    n_studies       = n(),
    avg_range       = mean(range_CH4, na.rm = TRUE),
    avg_sd          = mean(sd_CH4, na.rm = TRUE),
    max_within_diff = max(range_CH4, na.rm = TRUE),
    .groups = "drop"
  )

ch4_variability_stats <- GE_with_CH4 %>%
  filter(GE_daily <= 500) %>%               # remove outliers
  group_by(B.Code, P.Product) %>%
  summarise(
    n_diets   = n(),
    mean_CH4  = mean(GE_daily, na.rm = TRUE),
    sd_CH4    = sd(GE_daily, na.rm = TRUE),       # standard deviation
    iqr_CH4   = IQR(GE_daily, na.rm = TRUE),      # interquartile range
    cv_CH4    = sd(GE_daily, na.rm = TRUE) / mean(GE_daily, na.rm = TRUE), # coefficient of variation
    .groups = "drop"
  )

ggplot(ch4_variability_stats, aes(x = P.Product, y = cv_CH4, fill = P.Product)) +
  geom_boxplot(alpha = 0.6) +
  geom_jitter(width = 0.2, alpha = 0.5) +
  labs(
    x = "Species",
    y = "GE coefficient of variation",
    title = "Relative variability of GE within studies"
  ) +
  theme_minimal()
## Warning: Removed 18 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 18 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Step 1: Get list of B.Codes from GE_combined
relevant_BCodes <- GE_combined %>%
  distinct(B.Code)

# Step 2: Filter Animals.Diet.Comp for relevant B.Codes
diet_data_filtered <- merged_metadata$Animals.Diet %>%
  semi_join(relevant_BCodes, by = "B.Code")

# Step 3: Count unique diets and ingredients
total_diets <- diet_data_filtered %>%
  filter(!is.na(A.Level.Name)) %>%
  distinct(B.Code, A.Level.Name) %>%
  nrow()

total_ingredients <- diet_data_filtered %>%
  filter(!is.na(D.Item)) %>%
  distinct(B.Code, D.Item) %>%
  nrow()

# Step 4: Output
cat("Total unique diets:", total_diets, "\n")
## Total unique diets: 1317
cat("Total unique ingredients:", total_ingredients, "\n")
## Total unique ingredients: 2457

total number of codes that have ge and have both treatment and control

# Identify B.Codes that contain both control and treatment diets
b_codes_with_both_ctrl_and_trt <- GE_combined %>%
  filter(!is.na(CH4_kg_year)) %>%
  filter(T.Control %in% c("yes", "no")) %>%   # Only if T.Control is properly labeled
  group_by(B.Code) %>%
  summarise(n_control_status = n_distinct(T.Control), .groups = "drop") %>%
  filter(n_control_status > 1)   # Keep only those with both Yes and No

# Count of such B.Codes
n_both_ctrl_and_trt <- nrow(b_codes_with_both_ctrl_and_trt)
cat("Number of B.Codes with both control and treatment diets:", n_both_ctrl_and_trt, "\n")
## Number of B.Codes with both control and treatment diets: 100

papers that have no tagg for t control

check_control<-ingredients%>%
  filter(is.na(T.Control))%>%
  select(B.Code,A.Level.Name,D.Item,T.Control)

filter ge combined with only papers with both control and treatments

GE_combined <- GE_combined %>%
  semi_join(b_codes_with_both_ctrl_and_trt, by = "B.Code")

GE_species <- merged_metadata$`Prod.Out` %>%
  filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
  semi_join(GE_data, by = "B.Code") %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "Studies_with_GE")

summary_table <- species_counts %>%
  full_join(GE_species, by = "P.Product") %>%
  mutate(across(everything(), ~replace_na(., 0)))

kable(summary_table, caption = "Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)")
Summary of Data Availability for Enteric CH₄ Emissions (Tier 2)
P.Product Total_Studies Studies_with_GE
Cattle 126 32
Goat 186 67
Sheep 223 68
# Count unique B.Code with non-missing CH4 emissions
n_CH4_codes <- GE_combined %>%
  filter(!is.na(CH4_kg_year)) %>%
  distinct(B.Code) %>%
  nrow()

n_CH4_codes
## [1] 100
n_CH4_codes_by_product <- GE_combined %>%
  filter(!is.na(CH4_kg_year)) %>%
  distinct(B.Code, P.Product) %>%
  count(P.Product, name = "n_CH4_codes")

n_CH4_codes_by_product
##    P.Product n_CH4_codes
##       <char>       <int>
## 1:    Cattle          19
## 2:      Goat          44
## 3:     Sheep          37
relevant_BCodes <- GE_combined %>%
  distinct(B.Code)

# Step 2: Filter Animals.Diet.Comp for relevant B.Codes
diet_data_filtered <- merged_metadata$Animals.Diet %>%
  semi_join(relevant_BCodes, by = "B.Code")

# Step 3: Count unique diets and ingredients
total_diets <- diet_data_filtered %>%
  filter(!is.na(A.Level.Name)) %>%
  distinct(B.Code, A.Level.Name) %>%
  nrow()

total_ingredients <- diet_data_filtered %>%
  filter(!is.na(D.Item)) %>%
  distinct(B.Code, D.Item) %>%
  nrow()

# Step 4: Output
cat("Total unique diets:", total_diets, "\n")
## Total unique diets: 420
cat("Total unique ingredients:", total_ingredients, "\n")
## Total unique ingredients: 827
library(dplyr)
library(stringr)
library(tidyr)

# ──────────────────────────────────────────────────────────────────────────────
# 0) Keep only ingredient GE (not entire diet) and choose ONE value per item
#    using a priority order of sources you already created upstream.
#    Adjust the order if you prefer a different hierarchy.
# ──────────────────────────────────────────────────────────────────────────────
ge_priority <- c(
  "raw data",
  "feedipedia",
  "predicted_from_Weiss_GE",
  "predicted_from_Weiss_DE",
  "inferred_from_same_ingredient-Step2",
  "inferred_from_same_ingredient",
  "assumed_zero"
)

GE_ingredients_best <- Gross_energy %>%
  filter( DC.Variable == "GE") %>%
  mutate(
    # normalize source label text
    GE_source = str_squish(tolower(GE_source)),
    # ensure unit is MJ/kg
    DC.Unit   = "MJ/kg",
    # rank by source priority
    src_rank  = match(GE_source, ge_priority)
  ) %>%
  arrange(B.Code, D.Item, src_rank, desc(!is.na(DC.Value))) %>%
  group_by(B.Code, D.Item) %>%
  # keep the single best row per (B.Code, D.Item)
  slice_head(n = 1) %>%
  ungroup() %>%
  transmute(
    B.Code,
    D.Item,
    GE_MJ_per_kg = as.numeric(DC.Value),
    GE_source
  )

# ──────────────────────────────────────────────────────────────────────────────
# 1) Ingredients only + attach GE + compute per-ingredient GE intake (MJ/day)
#    NOTE: we keep your existing columns intact and just append new ones.
# ──────────────────────────────────────────────────────────────────────────────
ingredients_enriched <- ingredients %>%
  # enforce "ingredients only" (i.e., exclude entire-diet pseudo-rows if present)
  filter(D.Type != "Entire Diet") %>%

  # join GE per (B.Code, D.Item)
  left_join(GE_ingredients_best, by = c("B.Code", "D.Item")) %>%

  # convert amounts to kg/day (where possible)
  mutate(
    D.Unit.Amount = str_squish(tolower(D.Unit.Amount)),
    Amount_kg_day = case_when(
      D.Unit.Amount == "kg" ~ D.Amount,
      D.Unit.Amount == "g"  ~ D.Amount / 1000,
      TRUE ~ NA_real_  # unsupported units remain NA
    ),

    # GE intake contributed by this ingredient (MJ/day)
    GE_MJ_day = if_else(!is.na(GE_MJ_per_kg) & !is.na(Amount_kg_day),
                        GE_MJ_per_kg * Amount_kg_day,
                        NA_real_)
  )

# ──────────────────────────────────────────────────────────────────────────────
# 2) (Optional) Quick diagnostics
# ──────────────────────────────────────────────────────────────────────────────
n_total_ing   <- nrow(ingredients_enriched)
n_with_GE     <- sum(!is.na(ingredients_enriched$GE_MJ_per_kg))
n_with_GEint  <- sum(!is.na(ingredients_enriched$GE_MJ_day))

message("Ingredients total: ", n_total_ing,
        " | with GE (MJ/kg): ", n_with_GE,
        " | with GE intake (MJ/day): ", n_with_GEint)
## Ingredients total: 2879 | with GE (MJ/kg): 2521 | with GE intake (MJ/day): 2157
# If you want to keep only your original columns + added GE columns in a view:
ingredients_view <- ingredients_enriched %>%
  select(
    B.Code, A.Level.Name, D.Item, D.Type, D.Amount, D.Unit.Amount, D.Unit.Time,
    D.Unit.Animals, DC.Is.Dry, D.Ad.lib, D.Item.Group, Source, Units,
    T.Animals, Out.WG.Start, Out.WG.Unit, basal_intake_status,
    Prediction_Source, Prediction_Source.from_AName, from_feed_intake,
    T.Name, T.Control,
    # New appended columns:
    GE_MJ_per_kg, GE_source, Amount_kg_day, GE_MJ_day
  )
Variable Category I (High‑starch) Category II (High‑fat) Category III (Protein) Category IV (Good forage) Category V (Poor forage) Category VI (Mineral/Vit)
GE (MJ kg⁻¹ DM) 170 – 190 300 – 380 180 – 200 160 – 180 150 – 170
Ash (g kg⁻¹ DM) 20 – 40 0 – 10 50 – 80 70 – 100 60 – 100 ≥ 500
CP (g kg⁻¹ DM) 80 –120 0 – 50 350 – 500 120 – 200 30 – 80 ≈ 0
EE (g kg⁻¹ DM) 20 – 50 800 – 990 20 – 60 10 – 30 0 – 20 < 1
NDF (g kg⁻¹ DM) 100 – 200 0 – 50 100 – 300 400 – 500 600 – 750

18. Ingredient Classification methods

Method R function Core concept Variables needed One‑sentence rule
1 method1() Exact‑window filter GE, Ash, CP, EE, NDF
(all present)
Assign to the first category in which every variable falls inside its strict baseline window.
2 method2() Exact ± 10 % filter Same five Same as M1 but each numeric bound is widened ± 10 %.
3 method3() Priority‑variable filter (strict) Only the priority variables per category:
 • I → NDF
 • II → EE
 • III–V → CP & NDF
 • VI → Ash
If all priority variables exist and meet their strict windows, assign that category; missing non‑priority data are ignored.
4 method4() Priority‑variable ± 10 % Same Priority windows widened ± 10 %; non‑priority values (if present) still use strict windows and can only disqualify.
5 method5() Hierarchical decision tree (strict) Any subset of GE, Ash, CP, EE, NDF Gates tested in order:
EE ≥ 80 → II → else Ash ≥ 50 → VI → else CP ≥ 35 & NDF ≤ 30 → III → else NDF ≤ 20 → I → else NDF ≥ 60 → V → else 40 ≤ NDF ≤ 50 → IV → else unclassified.
6 method6() Hierarchical tree ± 10 % Same Same gates as M5 but every numeric constant is multiplied by 0.9 (lower) and 1.1 (upper) before testing.
7 method7() Hierarchical ± 10 % plus text overrides Same numerics + D.Item, D.Type Start with Method 6 result, then override:
 • oil|tallow|fat|grease in D.ItemII
 • protein‑meal regex (distillers grain, soybean meal, …) → III
 • D.Type ∈ “Supplement” or matches “supplement” → VI

All concentration variables are in g·kg⁻¹ DM except GE (MJ·kg⁻¹ DM).

18.1. Hierachical decision tree

Ing_classif <- merged_metadata$Animals.Diet.Comp

diets <- ingredients

Ing_classif <- Ing_classif %>% 
  filter(!D.Item %in% c("Salt", "Water"))


# mapping: variant (keep name) → donor (copy nutrients from here)
nutrient_donor_map <- tribble(
  ~variant,                                        ~donor,
  "Leucaena leucocephala Dried Ground",            "Leucaena leucocephala Leaves Ground",
  "Sheanut Cake Boiled Dried",                     "Sheanut Cake Ground",
  "maize straw dried ensiled ground urea treated", "maize straw ensiled urea treated",
  "leucaena leucocephala leaves dried",            "leucaena leucocephala leaves ground",
  "gliciridia sepium ground",                      "gliciridia sepium leaves dried ground",
  "sunflower",                                     "sunflower seed",
  "moringa stenopetala leaves dried",              "moringa stenopetala dried",
  "sesbania sesban leaves dried",                  "sesbania sesban leaves and twigs dried",
  "urochloa hybrid mulato ii dried",               "urochloa mosambicensis dried",
  "gliciridia sepium leaves ground",               "gliciridia sepium leaves dried ground",
  "leucaena leucocephala leaves dried ground",     "leucaena leucocephala leaves ground",
  "wheat straw effective microbes",                "wheat straw",
  "maize & wheat offal","maize offal",
  "leucaena leucocephala dried","leucaena leucocephala leaves ground"

)

# Normalize to lowercase for matching
nutrient_donor_map <- nutrient_donor_map %>%
  mutate(across(everything(), tolower))

Ing_classif <- Ing_classif %>%
  mutate(D.Item_lc = tolower(D.Item))

# --- Step 1: extract donor profiles
donor_profiles <- Ing_classif %>%
  mutate(D.Item_lc = tolower(D.Item)) %>%
  filter(D.Item_lc %in% nutrient_donor_map$donor)

# --- Step 2: build replacement rows
replacement_rows <- nutrient_donor_map %>%
  rename(Donor_lc = donor, Variant_lc = variant) %>%
  inner_join(donor_profiles, by = c("Donor_lc" = "D.Item_lc")) %>%
  mutate(
    D.Item_lc = Variant_lc,        # replace with variant name
    D.Item    = str_to_title(Variant_lc)  # keep nicer capitalization
  )

# --- Step 3: remove old variant rows and add replacements
Ing_classif <- Ing_classif %>%
  filter(!D.Item_lc %in% nutrient_donor_map$variant) %>%  # drop old variants
  bind_rows(replacement_rows %>% select(names(Ing_classif)))  # add clones
# --- Helpers ---------------------------------------------------------------
clean_key <- function(x) x %>% str_trim() %>% str_squish() %>% str_to_lower()

# Canonicalize names so duplicates collapse:
#   "unspecified molasses" → "molasses"
#   any "bran"             → "offal"
canonicalize_item <- function(x) {
  clean_key(x) %>%
    str_replace("^unspecified\\s+molasses$", "molasses") %>%
    str_replace_all("\\bbran\\b", "offal")
}

# Robust logical checks for columns stored as logical or "TRUE"/"FALSE"
is_false_or_na <- function(x) is.na(x) | x == FALSE | x == "FALSE"
is_true        <- function(x) !is.na(x) & (x == TRUE | x == "TRUE")

# Global D.Item → D.Type (built on canonical names)
type_lookup <- diets %>% 
  transmute(
    D.Item = canonicalize_item(D.Item),
    D.Type = str_trim(D.Type)
  ) %>%
  distinct(D.Item, .keep_all = TRUE)

# Canonicalize Ing_classif once
Ing_base <- Ing_classif %>%
  mutate(
    B.Code     = str_trim(B.Code),
    D.Item_raw = D.Item,
    D.Item     = canonicalize_item(D.Item)
  )

# Build (pivot-wide + attach D.Type; optional back-fill from global item means)
build_nutr_table <- function(df_long, type_lookup, backfill_by_item = TRUE) {
  target_vars <- c("CP","EE","NDF","Ash","GE")

  nutr <- df_long %>%
    filter(DC.Variable %in% target_vars) %>%
    mutate(DC.Value = suppressWarnings(as.numeric(DC.Value))) %>%
    pivot_wider(
      id_cols     = c(B.Code, D.Item),
      names_from  = DC.Variable,
      values_from = DC.Value,
      values_fn   = ~ mean(.x, na.rm = TRUE),
      values_fill = NA
    ) %>%
    left_join(type_lookup, by = "D.Item")

  if (backfill_by_item) {
    global_means <- df_long %>%
      filter(DC.Variable %in% target_vars) %>%
      mutate(DC.Value = suppressWarnings(as.numeric(DC.Value))) %>%
      group_by(D.Item, DC.Variable) %>%
      summarise(Value = mean(DC.Value, na.rm = TRUE), .groups = "drop") %>%
      pivot_wider(names_from = DC.Variable, values_from = Value)

    nutr <- nutr %>%
      left_join(global_means, by = "D.Item", suffix = c("", ".global")) %>%
      mutate(
        GE  = coalesce(GE,  GE.global),
        Ash = coalesce(Ash, Ash.global),
        CP  = coalesce(CP,  CP.global),
        EE  = coalesce(EE,  EE.global),
        NDF = coalesce(NDF, NDF.global)
      ) %>%
      select(-ends_with(".global"))
  }

  nutr
}

# Ingredients subset (non-entire, non-group) — WITH back-fill
Ing_ingredients_long <- Ing_base %>%
  filter(is_false_or_na(is_entire_diet)) %>%
  filter(is_false_or_na(is_group))

nutr_ing <- build_nutr_table(Ing_ingredients_long, type_lookup, backfill_by_item = TRUE) %>%
  mutate(scope = "ingredient") %>%
  relocate(scope, .before = B.Code)

# Diets subset (entire diets, non-group) — NO back-fill
Ing_diets_long <- Ing_base %>%
  filter(is_true(is_entire_diet)) %>%
  filter(is_false_or_na(is_group))

nutr_diet <- build_nutr_table(Ing_diets_long, type_lookup, backfill_by_item = FALSE) %>%
  mutate(scope = "diet") %>%
  relocate(scope, .before = B.Code)

# Combined (optional)
nutr_all <- bind_rows(nutr_ing, nutr_diet)

# Diagnostics: items without a mapped D.Type
unmatched_ing  <- anti_join(nutr_ing,  type_lookup, by = "D.Item")
unmatched_diet <- anti_join(nutr_diet, type_lookup, by = "D.Item")

# ── Canonical mean per ingredient (collapse to one row per D.Item) ──────────
canon_ing <- nutr_ing %>%
  group_by(D.Item) %>%
  summarise(
    GE  = mean(GE,  na.rm = TRUE),
    Ash = mean(Ash, na.rm = TRUE),
    CP  = mean(CP,  na.rm = TRUE),
    EE  = mean(EE,  na.rm = TRUE),
    NDF = mean(NDF, na.rm = TRUE),
    D.Type = first(D.Type),
    .groups = "drop"
  )

# Numeric windows (g/kg DM; GE in MJ/kg DM)
win <- list(
  GE  = list(I=c(170,190), II=c(300,380), III=c(180,200), IV=c(160,180), V=c(150,170)),
  Ash = list(I=c(20,40),   II=c(0,10),   III=c(50,80),   IV=c(70,100),  V=c(60,100), VI=c(500,1000)),
  CP  = list(I=c(80,120),  II=c(0,50),   III=c(350,500), IV=c(120,200), V=c(30,80)),
  EE  = list(I=c(20,50),   II=c(80,990), III=c(20,60),   IV=c(10,30),   V=c(0,20)),
  NDF = list(I=c(100,200), II=c(0,50),   III=c(100,300), IV=c(400,500), V=c(600,750))
)
expand10 <- function(x) c(x[1]*0.9, x[2]*1.1)
inside   <- function(x, rng) !is.na(x) & x >= rng[1] & x <= rng[2]

# Priority variables
priority <- list(
  I  = c("NDF"),
  II = c("EE"),
  III= c("CP","NDF"),
  IV = c("NDF","CP"),
  V  = c("NDF","CP"),
  VI = c("Ash")
)

# M1 strict all-five
method1 <- function(df, w = win){
  df %>% mutate(
    M1_Category = case_when(
      inside(GE,w$GE$I)   & inside(Ash,w$Ash$I)  & inside(CP,w$CP$I)   & inside(EE,w$EE$I)   & inside(NDF,w$NDF$I)   ~ "I",
      inside(GE,w$GE$II)  & inside(Ash,w$Ash$II) & inside(CP,w$CP$II)  & inside(EE,w$EE$II)  & inside(NDF,w$NDF$II)  ~ "II",
      inside(GE,w$GE$III) & inside(Ash,w$Ash$III)& inside(CP,w$CP$III) & inside(EE,w$EE$III) & inside(NDF,w$NDF$III) ~ "III",
      inside(GE,w$GE$IV)  & inside(Ash,w$Ash$IV) & inside(CP,w$CP$IV)  & inside(EE,w$EE$IV)  & inside(NDF,w$NDF$IV)  ~ "IV",
      inside(GE,w$GE$V)   & inside(Ash,w$Ash$V)  & inside(CP,w$CP$V)   & inside(EE,w$EE$V)   & inside(NDF,w$NDF$V)   ~ "V",
      inside(Ash,w$Ash$VI) ~ "VI",
      TRUE ~ NA_character_
    )
  )
}

# M2 ±10% all-five
method2 <- function(df){
  w10 <- map(win, ~ map(.x, expand10))
  method1(df, w10) %>% rename(M2_Category = M1_Category)
}

# Priority helper
method_priority <- function(df, w, out_name){
  df %>% rowwise() %>% mutate(
    "{out_name}" := {
      dat <- c(CP=CP, EE=EE, NDF=NDF, Ash=Ash, GE=GE)
      cat <- NA_character_
      for (cl in names(priority)) {
        req <- priority[[cl]]
        if (all(!is.na(dat[req])) &&
            all(map2_lgl(dat[req], req, \(v,r) inside(v, w[[r]][[cl]])))) {
          cat <- cl; break
        }
      }
      cat
    }
  ) %>% ungroup()
}
method3 <- function(df) method_priority(df, win,  "M3_Category")
method4 <- function(df){ w10 <- map(win, ~ map(.x, expand10)); method_priority(df, w10, "M4_Category") }

# Decision tree
tree_decide <- function(df, w){
  df %>% mutate(
    Category = case_when(
      EE  >= w$EE$II[1]                         ~ "II",
      Ash >= w$Ash$VI[1]                        ~ "VI",
      CP  >= w$CP$III[1] & NDF <= w$NDF$III[2]  ~ "III",
      NDF <= w$NDF$I[2]                         ~ "I",
      NDF >= w$NDF$V[1]                         ~ "V",
      NDF >= w$NDF$IV[1] & NDF <= w$NDF$IV[2]   ~ "IV",
      TRUE                                      ~ NA_character_
    )
  )
}
method5 <- function(df) tree_decide(df, win) %>% rename(M5_Category = Category)
method6 <- function(df){ w10 <- map(win, ~ map(.x, expand10)); tree_decide(df, w10) %>% rename(M6_Category = Category) }

# Method 7: start from M6 and apply name/type overrides
method7 <- function(df){
  out <- if ("M6_Category" %in% names(df)) df else method6(df)
  out$M7_Category <- out$M6_Category

  # normalize item names
  item_norm <- canonicalize_item(out$D.Item)

  # --- Category II: fats/oils ---
  oil_pat <- "\\b(oil|tallow|grease|lard)\\b"
  is_oil  <- str_detect(item_norm, regex(oil_pat, ignore_case = TRUE))
  out$M7_Category[is_oil] <- "II"

  # --- Category III: protein sources ---
  protein_pat <- "\\b(distillers?.*grain|soybean ground|canola meal|fish ground|blood ground)\\b"
  is_pmeal    <- str_detect(item_norm, regex(protein_pat, ignore_case = TRUE))
  is_fish     <- str_detect(item_norm, regex("\\bfish\\b", ignore_case = TRUE)) &
                 !str_detect(item_norm, regex("\\bfish\\s*oil\\b", ignore_case = TRUE))
  is_animprot <- str_detect(item_norm, regex("\\b(feather|larvae|snail|termite|shrimp|hydrolysate)\\b",
                                             ignore_case = TRUE))
  out$M7_Category[is_pmeal | is_fish | is_animprot] <- "III"

  # --- Category I: byproducts (bread waste only) ---
  is_bread <- str_detect(item_norm, regex("\\bbread waste\\b", ignore_case = TRUE))
  out$M7_Category[is_bread] <- "I"

  # --- Category VI: supplements / additives ---
  sup_types   <- c("Supplement", "Non-ERA Feed")
  vi_type_hit <- !is.na(out$D.Type) &
                 (out$D.Type %in% sup_types |
                  str_detect(out$D.Type, regex("supplement", ignore_case = TRUE)))
  is_shell    <- str_detect(item_norm, regex("\\boyster\\s*shells?\\b", ignore_case = TRUE))
  is_cellulose <- str_detect(
    item_norm,
    regex("\\b(cellulose|micro\\s*-?crystalline\\s+cellulose|carboxy\\s*methyl\\s*cellulose|\\bMCC\\b|\\bCMC\\b)\\b",
          ignore_case = TRUE)
  )
  is_nonfeed <- str_detect(item_norm,
                           regex("\\b(cement|polyethylene|humate|probiotic|solution|mixture|axtra xap)\\b",
                                 ignore_case = TRUE))
  out$M7_Category[vi_type_hit | is_shell | is_cellulose | is_nonfeed] <- "VI"

  out
}



# Run all methods so we can inspect coverage; M7 uses M6 base + overrides
classif_ing <- canon_ing %>%   # ← use canonical means for INGREDIENTS
  method1() %>% method2() %>% method3() %>% method4() %>%
  method5() %>% method6() %>% method7()

classif_diet <- nutr_diet %>%  # ← keep DIETS per-row (no mean collapse)
  method1() %>% method2() %>% method3() %>% method4() %>%
  method5() %>% method6() %>% method7()
# Helper to build coverage/counts/samples/unclassified per table
build_reports <- function(df){
  method_cols <- grep("^M[1-7]_Category$", names(df), value = TRUE)

  coverage_tbl <- purrr::map_dfr(method_cols, function(mc){
    tibble(
      Method   = sub("_Category$", "", mc),
      Total    = nrow(df),
      Assigned = sum(!is.na(df[[mc]])),
      Coverage = round(Assigned / Total * 100, 1)
    )
  })

  counts_tbl <- df %>%
    pivot_longer(all_of(method_cols), names_to = "Method", values_to = "Category") %>%
    filter(!is.na(Category)) %>%
    mutate(Method = sub("_Category$", "", Method)) %>%
    count(Method, Category, name = "n") %>%
    arrange(Method, desc(n))

  set.seed(123)
sample_tbl <- df %>%
  pivot_longer(all_of(method_cols), names_to = "Method", values_to = "Category") %>%
  filter(!is.na(Category)) %>%
  mutate(Method = sub("_Category$", "", Method)) %>%
  group_by(Method, Category) %>%
  # take distinct items per (Method,Category), sample up to 10 per group
  group_modify(function(.x, .y) {
    .x %>%
      distinct(D.Item, .keep_all = TRUE) %>%
      slice_sample(n = min(10, nrow(.)))
  }) %>%
  # still grouped by Method,Category here → summarise keeps those cols
  summarise(Examples = paste(D.Item, collapse = "; "), .groups = "drop") %>%
  arrange(Method, Category)


  unclassified_tbl <- df %>%
    pivot_longer(all_of(method_cols), names_to = "Method", values_to = "Category") %>%
    filter(is.na(Category)) %>%
    mutate(Method = sub("_Category$", "", Method)) %>%
    count(Method, D.Item, sort = TRUE) %>%
    group_by(Method) %>%
    slice_max(n, n = 10, with_ties = FALSE) %>%
    ungroup()

  list(coverage = coverage_tbl, counts = counts_tbl,
       samples = sample_tbl, unclassified = unclassified_tbl)
}

rep_ing  <- build_reports(classif_ing)
rep_diet <- build_reports(classif_diet)

# Display (ingredients)
datatable(classif_ing %>% arrange(M7_Category),
          caption = "INGREDIENTS: Classified rows (sorted by M7)",
          rownames = FALSE, options = list(pageLength = 30, dom = "tip"))
datatable(rep_ing$coverage,    caption = "INGREDIENTS: Coverage by method",    options = list(pageLength = 10, dom = "tip"))
datatable(rep_ing$counts,      caption = "INGREDIENTS: Counts per category",   options = list(pageLength = 25, dom = "tip"))
datatable(rep_ing$samples,     caption = "INGREDIENTS: Random examples",       options = list(pageLength = 25, dom = "tip"))
datatable(rep_ing$unclassified,caption = "INGREDIENTS: Top-10 unclassified",   options = list(pageLength = 25, dom = "tip"))
# Display (diets)
datatable(classif_diet %>% arrange(M7_Category),
          caption = "DIETS: Classified rows (sorted by M7)",
          rownames = FALSE, options = list(pageLength = 30, dom = "tip"))
datatable(rep_diet$coverage,    caption = "DIETS: Coverage by method",   options = list(pageLength = 10, dom = "tip"))
datatable(rep_diet$counts,      caption = "DIETS: Counts per category",  options = list(pageLength = 25, dom = "tip"))
datatable(rep_diet$samples,     caption = "DIETS: Random examples",      options = list(pageLength = 25, dom = "tip"))
datatable(rep_diet$unclassified,caption = "DIETS: Top-10 unclassified",  options = list(pageLength = 25, dom = "tip"))
# Canonicalize D.Item and A.Level.Name in ingredients before joining
ingredients_joinable <- ingredients %>%
  mutate(
    D.Item = canonicalize_item(D.Item),
    A.Level.Name = canonicalize_item(A.Level.Name)
  )

# Simplify classification tables (already canonicalized)
classif_ing_simple <- classif_ing %>%
  select(D.Item, Ingredient_Category = M7_Category)

classif_diet_simple <- classif_diet %>%
  rename(A.Level.Name = D.Item) %>%
  select(A.Level.Name, Diet_Category = M7_Category)

# Join the classifications
ingredients<- ingredients_joinable %>%
  left_join(classif_ing_simple,  by = "D.Item")%>%
  distinct()

# --- Final polish: map Roman numerals -> full names in ingredients ---
roman_to_name <- function(x) dplyr::recode(
  x,
  "I"   = "High-starch",
  "II"  = "High-fat",
  "III" = "Protein",
  "IV"  = "Good forage",
  "V"   = "Poor forage",
  "VI"  = "Mineral/Vitamin",
  .default = x,
  .missing = NA_character_
)

# Update the ingredient database
ingredients <- ingredients %>%
  dplyr::mutate(
    Ingredient_Category = roman_to_name(Ingredient_Category))
    # If you also keep a diet-level tag, uncomment:
    # , Diet_Category = roman_to_name(Diet_Category)
# ----------------------------
# Completeness within selected papers
# ----------------------------

# Selected papers (already built above)
selected_bcodes <- GE_data   # a data.frame with B.Code from your earlier step

# Ensure per-row INGREDIENT classifications exist (ingredients, by B.Code)
classif_ing_rows <- nutr_ing %>%
  method1() %>% method2() %>% method3() %>% method4() %>%
  method5() %>% method6() %>% method7()

# Choose which method’s category to test for completeness
method_col <- "M7_Category"

# --- Diet completeness (all diet rows classified for a paper) ---
diet_status <- classif_diet %>%
  semi_join(selected_bcodes, by = "B.Code") %>%
  group_by(B.Code) %>%
  summarise(
    diet_n             = n(),
    diet_n_classified  = sum(!is.na(.data[[method_col]])),
    diets_all_classified = diet_n > 0 & (diet_n_classified == diet_n),
    .groups = "drop"
  )

# --- Ingredient completeness (all ingredient rows classified for a paper) ---
ing_status <- classif_ing_rows %>%
  semi_join(selected_bcodes, by = "B.Code") %>%
  group_by(B.Code) %>%
  summarise(
    ing_n                = n(),
    ing_n_classified     = sum(!is.na(.data[[method_col]])),
    ingredients_all_classified = ing_n > 0 & (ing_n_classified == ing_n),
    .groups = "drop"
  )

# Combine statuses; treat missing as FALSE (paper may have diets but no ingredients, or vice versa)
paper_status <- full_join(diet_status, ing_status, by = "B.Code") %>%
  mutate(
    diets_all_classified       = replace_na(diets_all_classified, FALSE),
    ingredients_all_classified = replace_na(ingredients_all_classified, FALSE),
    either_complete            = diets_all_classified | ingredients_all_classified
  )

# Attach species for those selected papers (Cattle/Sheep/Goat)
paper_status <- paper_status %>%
  left_join(
    merged_metadata$`Prod.Out` %>%
      filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
      distinct(B.Code, P.Product),
    by = "B.Code"
  )

# ---------- Totals ----------
n_papers_diets_all        <- sum(paper_status$diets_all_classified)
n_papers_ingredients_all  <- sum(paper_status$ingredients_all_classified)
n_papers_either           <- sum(paper_status$either_complete)

n_papers_diets_all
## [1] 61
n_papers_ingredients_all
## [1] 49
n_papers_either
## [1] 98
# ---------- By species ----------
summary_by_product <- paper_status %>%
  group_by(P.Product) %>%
  summarise(
    Papers_total            = n_distinct(B.Code),
    Papers_diets_all        = sum(diets_all_classified),
    Papers_ingredients_all  = sum(ingredients_all_classified),
    Papers_either           = sum(either_complete),
    .groups = "drop"
  )

# View as a table
kable(summary_by_product, caption = "Papers with complete diet/ingredient classification (within selected papers)")
Papers with complete diet/ingredient classification (within selected papers)
P.Product Papers_total Papers_diets_all Papers_ingredients_all Papers_either
Cattle 32 15 10 21
Goat 67 23 15 35
Sheep 68 23 24 42
ingredients <-ingredients%>%
  filter(D.Item!="salt")

unmatched_ing_m7 <- ingredients %>%
  filter(is.na(Ingredient_Category)) %>%
  distinct(D.Item) %>%
  left_join(canon_ing, by = "D.Item") %>%
  arrange(D.Item)

k means

# ---------- Windows & priority (your spec) ----------
win <- list(
  GE  = list(I=c(170,190), II=c(300,380), III=c(180,200), IV=c(160,180), V=c(150,170)),
  Ash = list(I=c(20,40),   II=c(0,10),   III=c(50,80),   IV=c(70,100),  V=c(60,100), VI=c(500,1000)),
  CP  = list(I=c(80,120),  II=c(0,50),   III=c(350,500), IV=c(120,200), V=c(30,80)),
  EE  = list(I=c(20,50),   II=c(80,990), III=c(20,60),   IV=c(10,30),   V=c(0,20)),
  NDF = list(I=c(100,200), II=c(0,50),   III=c(100,300), IV=c(400,500), V=c(600,750))
)
priority <- list(
  I  = c("NDF","CP"),
  II = c("EE"),
  III= c("CP","NDF"),
  IV = c("NDF","CP"),
  V  = c("NDF","CP"),
  VI = c("Ash")
)

expand10 <- function(x) c(x[1]*0.9, x[2]*1.1)
mid      <- function(r) mean(r, na.rm = TRUE)
w10 <- map(win, ~ map(.x, expand10))  # ±10% windows (Method 6 spirit)

# ---------- 1) Build the working table from *your* dataset ----------
# Expecting unmatched_ing_m7 to already contain D.Item (+ optionally D.Type) and nutrients
unmatched <- unmatched_ing_m7 %>%
  select(any_of(c("D.Item","D.Type","GE","Ash","CP","EE","NDF"))) %>%
  group_by(across(any_of(c("D.Item","D.Type")))) %>%
  summarise(across(c(GE, Ash, CP, EE, NDF),
                   ~ suppressWarnings(mean(as.numeric(.), na.rm = TRUE))),
            .groups = "drop") %>%
  mutate(across(c(GE, Ash, CP, EE, NDF), ~ ifelse(is.nan(.), NA_real_, .))) %>%
  # drop rows where ALL five nutrients are NA
  filter(!(is.na(GE) & is.na(Ash) & is.na(CP) & is.na(EE) & is.na(NDF))) %>%
  mutate(
    n_values = rowSums(!is.na(across(c(GE, Ash, CP, EE, NDF)))),
    Missing  = pmap_chr(select(., GE, Ash, CP, EE, NDF), ~ {
      nm <- c("GE","Ash","CP","EE","NDF"); vv <- c(...); miss <- nm[is.na(vv)]
      if (length(miss)==0) "—" else paste(miss, collapse = ", ")
    })
  )

# ---------- 2) Priority-based score with HARD GATES ----------
# Standardized violation (0 if inside; >0 if outside by fraction of window width)
std_violation <- function(x, rng) {
  if (is.na(x) || any(is.na(rng))) return(NA_real_)
  L <- rng[1]; U <- rng[2]; W <- U - L; if (W <= 0) return(NA_real_)
  if (x < L) (L - x)/W else if (x > U) (x - U)/W else 0
}

cats <- names(priority)
missing_penalty <- 0.75  # penalty if a priority var is missing

scores <- matrix(NA_real_, nrow = nrow(unmatched), ncol = length(cats))
colnames(scores) <- cats

# Hard gates (fixes the “all go to II” issue):
gate_ii_lb  <- w10$EE$II[1]   # 72 (0.9 * 80)
gate_vi_lb  <- w10$Ash$VI[1]  # 450 (0.9 * 500)

for (j in seq_along(cats)) {
  cat <- cats[j]
  req <- priority[[cat]]

  # Per-variable violations for this category
  vlist <- lapply(req, function(v) {
    x   <- unmatched[[v]] %>% as.numeric()
    rng <- w10[[v]][[cat]]
    sapply(x, std_violation, rng = rng)
  })
  vmat <- do.call(cbind, vlist)

  # Replace NA violations (missing values) with bounded penalty
  vmat[is.na(vmat)] <- missing_penalty

  # Mean violation across this category’s priority vars
  s <- rowMeans(vmat)

  # Apply gates:
  if (cat == "II") {
    ee <- unmatched$EE
    s[is.na(ee) | ee < gate_ii_lb] <- Inf
  }
  if (cat == "VI") {
    ash <- unmatched$Ash
    s[is.na(ash) | ash < gate_vi_lb] <- Inf
  }

  scores[, j] <- s
}

best_idx   <- apply(scores, 1, which.min)
second_idx <- apply(scores, 1, function(v) order(v, na.last = TRUE)[2])

Assigned_Category    <- cats[best_idx]
Distance_Closest     <- scores[cbind(seq_len(nrow(scores)), best_idx)]
Second_Best_Category <- cats[second_idx]
Distance_Second      <- scores[cbind(seq_len(nrow(scores)), second_idx)]

# How many priority vars actually present for the chosen cat?
n_used_priority <- map2_int(seq_len(nrow(unmatched)), best_idx, function(i, j) {
  req <- priority[[cats[j]]]
  sum(!is.na(unmatched[i, req]))
})
Priority_vars_used <- map2_chr(seq_len(nrow(unmatched)), best_idx, function(i, j) {
  req <- priority[[cats[j]]]; have <- req[!is.na(unmatched[i, req])]
  if (length(have)==0) "—" else paste(have, collapse = ", ")
})
Priority_vars_missing <- map2_chr(seq_len(nrow(unmatched)), best_idx, function(i, j) {
  req <- priority[[cats[j]]]; miss <- req[is.na(unmatched[i, req])]
  if (length(miss)==0) "—" else paste(miss, collapse = ", ")
})

results <- unmatched %>%
  mutate(
    Assigned_Category    = Assigned_Category,
    Distance_Closest     = round(Distance_Closest, 3),
    Second_Best_Category = Second_Best_Category,
    Distance_Second      = round(Distance_Second, 3),
    n_used_priority      = n_used_priority,
    Priority_vars_used   = Priority_vars_used,
    Priority_vars_missing= Priority_vars_missing
  ) %>%
  arrange(Assigned_Category, Distance_Closest, desc(n_values), D.Item)

DT::datatable(
  results %>% select(D.Item, any_of("D.Type"), GE, Ash, CP, EE, NDF,
                     n_values, Missing,
                     Assigned_Category, Distance_Closest,
                     Second_Best_Category, Distance_Second,
                     n_used_priority, Priority_vars_used, Priority_vars_missing),
  caption = "M7-unclassified (from unmatched_ing_m7) → priority-window assignment (±10%), with gates for II/VI",
  options = list(pageLength = 50, dom = "tip"),
  rownames = FALSE
)
# ---------- 3) Quick plots (centroids midpoints + item segments) ----------
centroids_full <- tibble(
  Category = c("I","II","III","IV","V","VI"),
  GE  = c(mid(win$GE$I),  mid(win$GE$II),  mid(win$GE$III),  mid(win$GE$IV),  mid(win$GE$V),  NA_real_),
  Ash = c(mid(win$Ash$I), mid(win$Ash$II), mid(win$Ash$III), mid(win$Ash$IV), mid(win$Ash$V), mid(win$Ash$VI)),
  CP  = c(mid(win$CP$I),  mid(win$CP$II),  mid(win$CP$III),  mid(win$CP$IV),  mid(win$CP$V),  NA_real_),
  EE  = c(mid(win$EE$I),  mid(win$EE$II),  mid(win$EE$III),  mid(win$EE$IV),  mid(win$EE$V),  NA_real_),
  NDF = c(mid(win$NDF$I), mid(win$NDF$II), mid(win$NDF$III), mid(win$NDF$IV), mid(win$NDF$V), NA_real_)
)

plot_pair <- function(items_df, cents_df, xvar, yvar) {
  items_xy <- items_df %>% filter(!is.na(.data[[xvar]]), !is.na(.data[[yvar]]))
  cents_xy <- cents_df %>% filter(!is.na(.data[[xvar]]), !is.na(.data[[yvar]])) %>%
    transmute(Category, cx = .data[[xvar]], cy = .data[[yvar]])
  items_xy <- items_xy %>% left_join(cents_xy, by = c("Assigned_Category" = "Category"))

  ggplot() +
    geom_point(data = items_xy,
               aes(x = .data[[xvar]], y = .data[[yvar]], color = Assigned_Category),
               size = 2, alpha = 0.9) +
    geom_segment(data = items_xy %>% filter(!is.na(cx), !is.na(cy)),
                 aes(x = .data[[xvar]], y = .data[[yvar]], xend = cx, yend = cy, color = Assigned_Category),
                 alpha = 0.4, linewidth = 0.6) +
    geom_point(data = cents_xy, aes(x = cx, y = cy), shape = 4, size = 4, stroke = 1.1) +
    geom_text(data = cents_xy, aes(x = cx, y = cy, label = Category), vjust = -1, fontface = "bold") +
    labs(x = xvar, y = yvar, color = "Assigned",
         title = paste(xvar, "vs", yvar, "— items and category centroids")) +
    theme_minimal()
}

print(plot_pair(results, centroids_full, "CP", "NDF"))

print(plot_pair(results, centroids_full, "EE", "NDF"))

# ---- Assign the first (best) category back to `ingredients` ----
# Roman → full name map (keep if you already have it)
if (!exists("roman_to_name")) {
  roman_to_name <- function(x) dplyr::recode(
    x,
    "I"   = "High-starch",
    "II"  = "High-fat",
    "III" = "Protein",
    "IV"  = "Good forage",
    "V"   = "Poor forage",
    "VI"  = "Mineral/Vitamin",
    .default = x, .missing = NA_character_
  )
}

# Join keys (prefer D.Item + D.Type if you have both)
join_keys <- intersect(c("D.Item","D.Type"), names(ingredients))

# Make the assignment table unique on the join keys (prevents many-to-many)
assign_tbl <- results %>%
  mutate(Assigned_Category_Name = roman_to_name(Assigned_Category)) %>%
  select(any_of(c("D.Item","D.Type")), Assigned_Category, Assigned_Category_Name) %>%
  group_by(across(all_of(join_keys))) %>%
  summarise(
    Assigned_Category      = dplyr::first(Assigned_Category),
    Assigned_Category_Name = dplyr::first(Assigned_Category_Name),
    .groups = "drop"
  )

# Keep a backup of existing labels the first time you run this
if (!"Ingredient_Category_orig" %in% names(ingredients)) {
  ingredients <- ingredients %>%
    mutate(Ingredient_Category_orig = Ingredient_Category)
}

# Ensure Category_Source exists (so mutate below can reference it)
if (!"Category_Source" %in% names(ingredients)) {
  ingredients <- ingredients %>% mutate(Category_Source = NA_character_)
}

# Join and fill ONLY where Ingredient_Category is NA
ingredients <- ingredients %>%
  # dplyr >= 1.1: relationship arg suppresses warning if truly many-to-one
  left_join(assign_tbl, by = join_keys, relationship = "many-to-one") %>%
  mutate(
    Ingredient_Category = coalesce(Ingredient_Category, Assigned_Category_Name),
    Category_Source     = ifelse(
      is.na(Ingredient_Category_orig) & !is.na(Assigned_Category_Name),
      "Nearest-window (priority, ±10%, gated II/VI)",
      Category_Source
    )
  ) %>%
  select(-Assigned_Category, -Assigned_Category_Name)

# Quick sanity check
filled_n <- sum(is.na(ingredients$Ingredient_Category_orig) & !is.na(ingredients$Ingredient_Category))
message("Filled missing Ingredient_Category for ", filled_n, " ingredient rows.")
## Filled missing Ingredient_Category for 95 ingredient rows.
ingredients <-ingredients%>%
  filter(D.Item!="salt")

unmatched_ing_m7 <- ingredients %>%
  filter(is.na(Ingredient_Category)) %>%
  distinct(D.Item) %>%
  left_join(canon_ing, by = "D.Item") %>%
  arrange(D.Item)

papers with ing classified

# 1) Per-diet completeness: all ingredients classified?
diet_class_status <- ingredients %>%
  semi_join(GE_data, by = "B.Code") %>%
  filter(!is.na(A.Level.Name), !is.na(D.Item)) %>%
  # (Optionally exclude entire-diet placeholder rows if present)
  filter(is.na(D.Type) | D.Type != "Entire Diet") %>%
  group_by(B.Code, A.Level.Name) %>%
  summarise(
    n_ing          = n(),
    n_classified   = sum(!is.na(Ingredient_Category)),
    all_classified = n_ing > 0 & (n_ing == n_classified),
    .groups = "drop"
  )

# 2) Papers with at least TWO diets fully classified
papers_2plus <- diet_class_status %>%
  group_by(B.Code) %>%
  summarise(n_complete_diets = sum(all_classified), .groups = "drop") %>%
  filter(n_complete_diets >= 2)

# 3) Attach product and count per product (Cattle / Sheep / Goat)
papers_2plus_by_product <- papers_2plus %>%
  left_join(
    merged_metadata$`Prod.Out` %>%
      filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
      distinct(B.Code, P.Product),
    by = "B.Code"
  ) %>%
  count(P.Product, name = "Papers_with_≥2_fully_classified_diets")

# Show the summary table
knitr::kable(
  papers_2plus_by_product %>% arrange(P.Product),
  caption = "Papers per product with ≥2 diets where all ingredients are classified"
)
Papers per product with ≥2 diets where all ingredients are classified
P.Product Papers_with_≥2_fully_classified_diets
Cattle 15
Goat 19
Sheep 42
# (Optional) Show the actual B.Codes and how many complete diets each has
knitr::kable(
  papers_2plus %>%
    left_join(
      merged_metadata$`Prod.Out` %>%
        filter(P.Product %in% c("Cattle", "Sheep", "Goat")) %>%
        distinct(B.Code, P.Product),
      by = "B.Code"
    ) %>%
    arrange(P.Product, desc(n_complete_diets), B.Code),
  caption = "B.Codes with number of fully classified diets (within GE papers)"
)
B.Codes with number of fully classified diets (within GE papers)
B.Code n_complete_diets P.Product
JS0215.2 5 Cattle
EM1050 4 Cattle
JS0324.1 4 Cattle
JS0324.2 4 Cattle
NN0207.2 4 Cattle
NN0299 4 Cattle
BO1033 3 Cattle
EM1093 3 Cattle
JS0205 3 Cattle
NJ0025 3 Cattle
CJ1003 2 Cattle
HK0086 2 Cattle
JS0215.1 2 Cattle
LM0101 2 Cattle
NN0280.2 2 Cattle
JS0208 7 Goat
LM0248 7 Goat
NN0375 6 Goat
BO1015 5 Goat
HK0107 5 Goat
HK0171 5 Goat
BO1076 4 Goat
JS0192 4 Goat
JS0196 4 Goat
JS0197 4 Goat
BO1098 3 Goat
HK0182 3 Goat
HK0350 3 Goat
JS0193 3 Goat
JS0261 3 Goat
LM0280 3 Goat
NN0374 3 Goat
EO0057 2 Goat
NN0092.2 2 Goat
JS0291 16 Sheep
AN0007 5 Sheep
BO1025 5 Sheep
BO1051 5 Sheep
HK0033 5 Sheep
HK0082 5 Sheep
JS0301 5 Sheep
NJ0008 5 Sheep
0 4 Sheep
BO1034 4 Sheep
BO1048 4 Sheep
BO1050 4 Sheep
EM1023 4 Sheep
EM1031 4 Sheep
EM1034 4 Sheep
EO0046 4 Sheep
JS0210.1 4 Sheep
JS0316 4 Sheep
NJ0032 4 Sheep
NN0207.1 4 Sheep
NN0329 4 Sheep
BO1008 3 Sheep
CJ1004 3 Sheep
CJ1015 3 Sheep
EM1085 3 Sheep
EM1090 3 Sheep
JO1028 3 Sheep
JS0210.2 3 Sheep
JS0249 3 Sheep
JS0273 3 Sheep
JS0285 3 Sheep
JS0344 3 Sheep
JS0419 3 Sheep
NJ0044 3 Sheep
NN0081 3 Sheep
NN0234 3 Sheep
BO1059 2 Sheep
JO1082 2 Sheep
JS0262 2 Sheep
JS0350 2 Sheep
NN0200 2 Sheep
NN0212 2 Sheep

19 practice diet comparison

suppressPackageStartupMessages({
  library(dplyr); library(tidyr); library(stringr); library(purrr)
})

classify_diet_practice <- function(
  diet_percentages,
  total_amounts,
  ctrl_key,
  classif_ing_simple,
  TOTAL_TOL = 0.05,   # ±5% total DM offered
  ING_TOL   = 1.0,    # %-point threshold to count a change
  BAL_TOL   = 3.0     # %-point imbalance allowed for a "balanced swap"
){

  ensure_col <- function(df, col, fill = NA_character_) {
    if (!col %in% names(df)) df[[col]] <- fill
    df
  }

  norm_item <- function(x){
    x %>%
      stringr::str_to_lower() %>%
      stringr::str_replace_all("[[:space:]]+", " ") %>%
      stringr::str_trim()
  }

  # ---- Ingredient lookup (dedup + normalized) ----
  ing_lu <- classif_ing_simple %>%
    transmute(
      D.Item_norm = norm_item(D.Item),
      Ingredient_Category_lu = Ingredient_Category
    ) %>%
    filter(!is.na(D.Item_norm), D.Item_norm != "") %>%
    group_by(D.Item_norm) %>%
    summarise(Ingredient_Category_lu = dplyr::first(na.omit(Ingredient_Category_lu)),
              .groups = "drop")

  # ---- Control lookup (1 control per B.Code) ----
  ctrl_lu <- ctrl_key %>%
    filter(tolower(T.Control) == "yes") %>%
    arrange(B.Code, A.Level.Name) %>%
    group_by(B.Code) %>%
    summarise(Control = first(A.Level.Name), .groups = "drop")

  # ---- Totals to kg (aggregate duplicates) ----
  totals_kg <- total_amounts %>%
    mutate(Total_kg = case_when(
      D.Unit.Amount == "g"  ~ Total_Amount/1000,
      D.Unit.Amount == "kg" ~ Total_Amount,
      TRUE ~ NA_real_
    )) %>%
    group_by(B.Code, A.Level.Name) %>%
    summarise(Total_kg = sum(Total_kg, na.rm = TRUE), .groups = "drop")

  # ---- Percentages + robust category join ----
  dp <- ensure_col(diet_percentages, "Ingredient_Category")

  pct_long <- dp %>%
    select(B.Code, A.Level.Name, D.Item, Percentage_of_Diet, Ingredient_Category) %>%
    group_by(B.Code, A.Level.Name, D.Item, Ingredient_Category) %>%
    summarise(Percentage_of_Diet = sum(Percentage_of_Diet, na.rm = TRUE), .groups = "drop") %>%
    mutate(D.Item_norm = norm_item(D.Item)) %>%
    left_join(ing_lu, by = "D.Item_norm") %>%
    mutate(
      Ingredient_Category = coalesce(Ingredient_Category, Ingredient_Category_lu, "Unclassified")
    ) %>%
    select(-D.Item_norm, -Ingredient_Category_lu)

  # ---- Pairs (treatment vs control) ----
  pairs <- pct_long %>%
    distinct(B.Code, A.Level.Name) %>%
    inner_join(ctrl_lu, by = "B.Code") %>%
    filter(A.Level.Name != Control) %>%
    transmute(B.Code, Control, Trt = A.Level.Name) %>%
    distinct()

  trt_tbl <- pct_long %>%
    rename(Trt = A.Level.Name, p_trt = Percentage_of_Diet) %>%
    select(B.Code, Trt, D.Item, Ingredient_Category, p_trt)

  ctrl_tbl <- pct_long %>%
    rename(Control = A.Level.Name, p_ctrl = Percentage_of_Diet) %>%
    select(B.Code, Control, D.Item, p_ctrl)

  trt_vs_ctrl <- pairs %>%
    left_join(trt_tbl,  by = c("B.Code","Trt")) %>%
    left_join(ctrl_tbl, by = c("B.Code","Control","D.Item")) %>%
    mutate(
      p_trt  = coalesce(p_trt,  0),
      p_ctrl = coalesce(p_ctrl, 0),
      dP     = p_trt - p_ctrl,
      dir    = case_when(
        dP >=  ING_TOL ~ "increase",
        dP <= -ING_TOL ~ "decrease",
        TRUE           ~ "nochange"
      )
    )

  delta_totals <- pairs %>%
    left_join(totals_kg %>% rename(Total_trt_kg  = Total_kg),
              by = c("B.Code","Trt" = "A.Level.Name")) %>%
    left_join(totals_kg %>% rename(Total_ctrl_kg = Total_kg),
              by = c("B.Code","Control" = "A.Level.Name")) %>%
    mutate(ΔTOTAL = (Total_trt_kg - Total_ctrl_kg)/Total_ctrl_kg)

  # Precompute top-3 item changes (avoids the earlier vctrs error)
  top_changes <- trt_vs_ctrl %>%
    group_by(B.Code, Control, Trt) %>%
    arrange(desc(abs(dP)), .by_group = TRUE) %>%
    summarise(
      Top_changes = paste(
        head(sprintf("%s %s%.1f%%", D.Item, if_else(dP>=0,"+",""), dP), 3),
        collapse = "; "
      ),
      .groups = "drop"
    )

  summarize_types <- function(x) paste(sort(unique(na.omit(x))), collapse = ", ")

  paired <- trt_vs_ctrl %>%
    left_join(delta_totals %>% select(B.Code, Trt, ΔTOTAL), by = c("B.Code","Trt")) %>%
    group_by(B.Code, Control, Trt, ΔTOTAL) %>%
    summarise(
      POS_sum   = sum(pmax(dP,0),  na.rm = TRUE),
      NEG_sum   = sum(pmax(-dP,0), na.rm = TRUE),
      n_pos     = sum(dir=="increase",  na.rm = TRUE),
      n_neg     = sum(dir=="decrease",  na.rm = TRUE),
      Types_added     = summarize_types(Ingredient_Category[dir=="increase"]),
      Types_decreased = summarize_types(Ingredient_Category[dir=="decrease"]),
      .groups = "drop"
    ) %>%
    left_join(top_changes, by = c("B.Code","Control","Trt")) %>%
    mutate(
      Practice = case_when(
        is.finite(ΔTOTAL) & abs(ΔTOTAL) >  TOTAL_TOL & POS_sum >= ING_TOL ~ "Addition",
        is.finite(ΔTOTAL) & abs(ΔTOTAL) <= TOTAL_TOL &
          POS_sum >= ING_TOL & NEG_sum >= ING_TOL &
          abs(POS_sum - NEG_sum) <= BAL_TOL                               ~ "Substitution",
        is.finite(ΔTOTAL) & abs(ΔTOTAL) <= TOTAL_TOL &
          POS_sum >= ING_TOL & NEG_sum <  ING_TOL                         ~ "Addition",
        (POS_sum + NEG_sum) < ING_TOL                                     ~ "No change",
        TRUE                                                               ~ "Substitution"
      ),
      Magnitude_pct = case_when(
        Practice == "Addition"     ~ POS_sum,
        Practice == "Substitution" ~ 0.5*(POS_sum + NEG_sum),
        TRUE                       ~ 0
      )
    ) %>%
    rowwise() %>%
    mutate(
      Substitution_pairs = {
        if (Practice != "Substitution") {
          ""
        } else {
          b <- B.Code; t <- Trt
          pos_type <- trt_vs_ctrl %>%
            filter(B.Code == b, Trt == t) %>%
            group_by(Ingredient_Category) %>%
            summarise(pos = sum(pmax(dP,0),  na.rm=TRUE), .groups="drop") %>%
            filter(pos > 0) %>% arrange(desc(pos))
          neg_type <- trt_vs_ctrl %>%
            filter(B.Code == b, Trt == t) %>%
            group_by(Ingredient_Category) %>%
            summarise(neg = sum(pmax(-dP,0), na.rm=TRUE), .groups="drop") %>%
            filter(neg > 0) %>% arrange(desc(neg))
          np <- min(nrow(pos_type), nrow(neg_type))
          if (np == 0) "" else
            paste(paste0(neg_type$Ingredient_Category[seq_len(np)],
                         " → ",
                         pos_type$Ingredient_Category[seq_len(np)]),
                  collapse="; ")
        }
      }
    ) %>%
    ungroup() %>%
    arrange(B.Code, Control, Trt)

  paired
}

# ── Run it ──
practice_tbl <- classify_diet_practice(
  diet_percentages   = diet_percentages,
  total_amounts      = total_amounts,
  ctrl_key           = ctrl_key,
  classif_ing_simple = classif_ing_simple,
  TOTAL_TOL = 0.05, ING_TOL = 1.0, BAL_TOL = 3.0
)
suppressPackageStartupMessages({
  library(dplyr); library(stringr); library(tidyr); library(purrr)
})

# --- Rebuild the same control lookup used in the function ---
ctrl_lu <- ctrl_key %>%
  filter(tolower(T.Control) == "yes") %>%
  arrange(B.Code, A.Level.Name) %>%
  group_by(B.Code) %>%
  summarise(Control = first(A.Level.Name), .groups = "drop")

# --- Diet coverage per paper (from the raw diet_percentages you feed the function) ---
diets_per_code <- diet_percentages %>%
  filter(!is.na(B.Code), !is.na(A.Level.Name)) %>%
  group_by(B.Code) %>%
  summarise(
    n_diets = n_distinct(A.Level.Name),
    n_rows  = n(),                     # total ingredient rows for those diets
    .groups = "drop"
  )

# --- Candidates that *should* be classifiable: have a control & at least 2 diets ---
candidates <- diets_per_code %>%
  left_join(ctrl_lu %>% mutate(has_control = TRUE), by = "B.Code") %>%
  mutate(has_control = coalesce(has_control, FALSE)) %>%
  mutate(eligible = has_control & n_diets >= 2)

# --- B.Codes that actually produced a practice row ---
in_practice <- practice_tbl %>% distinct(B.Code)

# --- 1) Papers that could not be classified at all (no rows in practice_tbl) ---
unclassified_papers <- candidates %>%
  mutate(in_practice = B.Code %in% in_practice$B.Code) %>%
  filter(!in_practice | !eligible) %>%         # either not produced, or not eligible
  transmute(
    B.Code,
    n_diets,
    has_control,
    reason = case_when(
      !has_control                    ~ "No control diet flagged in ctrl_key",
      n_diets < 2                     ~ "Only one diet available (need ≥ 2)",
      has_control & n_diets >= 2 & !in_practice ~ "Pairing failed (no Trt vs Control built)",
      TRUE                            ~ "Other/unknown"
    )
  ) %>%
  arrange(desc(has_control), desc(n_diets), B.Code)

# --- 2) Papers that *were* classified, but every comparison is "No change" ---
no_change_papers <- practice_tbl %>%
  group_by(B.Code) %>%
  summarise(
    n_pairs         = n(),                         # number of Trt vs Control pairs
    n_no_change     = sum(Practice == "No change"),
    all_no_change   = n_pairs > 0 & n_no_change == n_pairs,
    .groups = "drop"
  ) %>%
  filter(all_no_change) %>%
  arrange(desc(n_pairs))

# (Optional) attach product/species to both tables
prod_lu <- merged_metadata$`Prod.Out` %>%
  distinct(B.Code, P.Product)

unclassified_papers <- unclassified_papers %>%
  left_join(prod_lu, by = "B.Code") %>%
  relocate(P.Product, .after = B.Code)

no_change_papers <- no_change_papers %>%
  left_join(prod_lu, by = "B.Code") %>%
  relocate(P.Product, .after = B.Code)

# --- Quick summaries you can show in a slide ---
summary_unclassified <- unclassified_papers %>%
  count(reason, name = "n_papers") %>%
  arrange(desc(n_papers))

summary_no_change_by_prod <- no_change_papers %>%
  count(P.Product, name = "n_papers_all_no_change")

# Peek
unclassified_papers %>% head(10)
## # A tibble: 10 × 5
##    B.Code   P.Product n_diets has_control reason                            
##    <chr>    <chr>       <int> <lgl>       <chr>                             
##  1 BO1028   Cattle          1 TRUE        Only one diet available (need ≥ 2)
##  2 CJ1006   Cattle          1 TRUE        Only one diet available (need ≥ 2)
##  3 DK0001   Pigs            1 TRUE        Only one diet available (need ≥ 2)
##  4 EM1069   Chicken         1 TRUE        Only one diet available (need ≥ 2)
##  5 HK0337   Goat            1 TRUE        Only one diet available (need ≥ 2)
##  6 JS0172   Goat            1 TRUE        Only one diet available (need ≥ 2)
##  7 JS0201.2 Cattle          1 TRUE        Only one diet available (need ≥ 2)
##  8 JS0345   Cattle          1 TRUE        Only one diet available (need ≥ 2)
##  9 LM0124   Cattle          1 TRUE        Only one diet available (need ≥ 2)
## 10 LM0227.1 Cattle          1 TRUE        Only one diet available (need ≥ 2)
no_change_papers %>% head(10)
## # A tibble: 7 × 5
##   B.Code   P.Product n_pairs n_no_change all_no_change
##   <chr>    <chr>       <int>       <int> <lgl>        
## 1 BO1012   Not Found       4           4 TRUE         
## 2 EM1062   Chicken         3           3 TRUE         
## 3 NN0207.1 Sheep           3           3 TRUE         
## 4 NN0207.2 Cattle          3           3 TRUE         
## 5 BO1008   Sheep           2           2 TRUE         
## 6 JS0350   Sheep           1           1 TRUE         
## 7 NN0092.2 Goat            1           1 TRUE
summary_unclassified
## # A tibble: 2 × 2
##   reason                              n_papers
##   <chr>                                  <int>
## 1 Only one diet available (need ≥ 2)        16
## 2 No control diet flagged in ctrl_key       12
summary_no_change_by_prod
## # A tibble: 5 × 2
##   P.Product n_papers_all_no_change
##   <chr>                      <int>
## 1 Cattle                         1
## 2 Chicken                        1
## 3 Goat                           1
## 4 Not Found                      1
## 5 Sheep                          3
# B.Codes that appear in the practice table
practice_codes <- practice_tbl %>%
  distinct(B.Code)

# Intersect with your ≥2-diets papers
practice_in_2plus <- practice_codes %>%
  semi_join(papers_2plus %>% select(B.Code), by = "B.Code")

# Attach species and count by product
practice_in_2plus_by_product <- practice_in_2plus %>%
  left_join(
    merged_metadata$`Prod.Out` %>%
      filter(P.Product %in% c("Cattle","Sheep","Goat")) %>%
      distinct(B.Code, P.Product),
    by = "B.Code"
  ) %>%
  count(P.Product, name = "Papers_with_practice_and_≥2_classified_diets")

# Show the summary
knitr::kable(
  practice_in_2plus_by_product,
  caption = "Papers per product that are in practice_tbl AND have ≥2 fully classified diets"
)
Papers per product that are in practice_tbl AND have ≥2 fully classified diets
P.Product Papers_with_practice_and_≥2_classified_diets
Cattle 12
Goat 18
Sheep 34
# (Optional) list the actual codes
knitr::kable(
  practice_in_2plus %>%
    left_join(
      merged_metadata$`Prod.Out` %>%
        distinct(B.Code, P.Product),
      by = "B.Code"
    ) %>%
    arrange(P.Product, B.Code),
  caption = "B.Codes with ≥2 fully classified diets that also have a practice"
)
B.Codes with ≥2 fully classified diets that also have a practice
B.Code P.Product
BO1033 Cattle
EM1093 Cattle
HK0086 Cattle
JS0205 Cattle
JS0215.1 Cattle
JS0215.2 Cattle
JS0324.1 Cattle
JS0324.2 Cattle
LM0101 Cattle
NJ0025 Cattle
NN0207.2 Cattle
NN0280.2 Cattle
BO1015 Goat
BO1076 Goat
BO1098 Goat
EO0057 Goat
HK0107 Goat
HK0171 Goat
HK0182 Goat
HK0350 Goat
JS0192 Goat
JS0193 Goat
JS0196 Goat
JS0197 Goat
JS0208 Goat
JS0261 Goat
LM0248 Goat
LM0280 Goat
NN0092.2 Goat
NN0375 Goat
BO1015 Not Found
0 Sheep
AN0007 Sheep
BO1008 Sheep
BO1025 Sheep
CJ1015 Sheep
EM1023 Sheep
EM1034 Sheep
EM1085 Sheep
EM1090 Sheep
EO0046 Sheep
HK0033 Sheep
HK0082 Sheep
JO1082 Sheep
JS0210.1 Sheep
JS0210.2 Sheep
JS0249 Sheep
JS0262 Sheep
JS0273 Sheep
JS0285 Sheep
JS0291 Sheep
JS0301 Sheep
JS0316 Sheep
JS0344 Sheep
JS0350 Sheep
JS0419 Sheep
NJ0008 Sheep
NJ0032 Sheep
NJ0044 Sheep
NN0081 Sheep
NN0200 Sheep
NN0207.1 Sheep
NN0212 Sheep
NN0234 Sheep
NN0329 Sheep
suppressPackageStartupMessages({
  library(dplyr); library(tidyr); library(stringr)
})

# --- Roman → full name (keeps non-Roman as-is) -----------------------------
roman_to_name <- function(x) dplyr::recode(
  x,
  "I"  = "High-starch",
  "II" = "High-fat",
  "III"= "Protein",
  "IV" = "Good forage",
  "V"  = "Poor forage",
  "VI" = "Mineral/Vitamin",
  .default = x, .missing = NA_character_
)

# --- Normalise item names to improve join hit-rate -------------------------
norm_item <- function(x){
  x %>%
    stringr::str_to_lower() %>%
    stringr::str_replace_all("[[:space:]]+", " ") %>%
    stringr::str_trim() %>%
    # a couple of handy harmonisations seen in your data:
    stringr::str_replace("^unspecified\\s+molasses$", "molasses") %>%
    stringr::str_replace_all("\\bbran\\b", "offal")
}

# --- 1) Ingredient lookup (D.Item -> Ingredient_Category) ------------------
stopifnot(all(c("D.Item","Ingredient_Category") %in% names(classif_ing_simple)))

ing_lu <- classif_ing_simple %>%
  transmute(
    D.Item_norm = norm_item(D.Item),
    Cat_ing = roman_to_name(Ingredient_Category)
  ) %>%
  filter(!is.na(D.Item_norm), D.Item_norm != "") %>%
  group_by(D.Item_norm) %>%
  summarise(Cat_ing = first(na.omit(Cat_ing)), .groups = "drop")

# --- 2) Per-arm diet category = top-contributing ingredient type ----------
# expects: diet_percentages has B.Code, A.Level.Name, D.Item, Percentage_of_Diet
stopifnot(all(c("B.Code","A.Level.Name","D.Item","Percentage_of_Diet") %in% names(diet_percentages)))

arm_cat <- diet_percentages %>%
  mutate(
    D.Item_norm = norm_item(D.Item),
    Percentage_of_Diet = coalesce(Percentage_of_Diet, 0)
  ) %>%
  left_join(ing_lu, by = "D.Item_norm") %>%
  # sum % by ingredient *category* within each arm
  group_by(B.Code, A.Level.Name, Cat_ing) %>%
  summarise(pct_cat = sum(Percentage_of_Diet, na.rm = TRUE), .groups = "drop") %>%
  # pick the max category per arm (ties broken deterministically)
  group_by(B.Code, A.Level.Name) %>%
  arrange(desc(pct_cat), Cat_ing, .by_group = TRUE) %>%
  slice(1) %>%
  ungroup() %>%
  transmute(
    B.Code, A.Level.Name,
    Diet_Category = if_else(is.na(Cat_ing) | Cat_ing == "", "Unknown", Cat_ing)
  )

diet_cat_lu <- distinct(arm_cat)

# (optional) quick diagnostic
# table(diet_cat_lu$Diet_Category, useNA = "ifany")

# --- 3) Control → Treatment switches (simple) ------------------------------
# ctrl_key must have B.Code, A.Level.Name, T.Control (TRUE/"yes" marks control)
stopifnot(all(c("B.Code","A.Level.Name","T.Control") %in% names(ctrl_key)))

ctrl_lu <- ctrl_key %>%
  mutate(is_ctrl = ifelse(is.logical(T.Control), T.Control, tolower(T.Control) == "yes")) %>%
  filter(is_ctrl) %>%
  arrange(B.Code, A.Level.Name) %>%
  group_by(B.Code) %>%
  summarise(Control = first(A.Level.Name), .groups = "drop")

pairs <- diet_cat_lu %>%
  distinct(B.Code, A.Level.Name) %>%
  inner_join(ctrl_lu, by = "B.Code") %>%
  filter(A.Level.Name != Control) %>%
  transmute(B.Code, Control, Trt = A.Level.Name)

ctl <- diet_cat_lu %>% transmute(B.Code, Control = A.Level.Name, Cat_ctrl = Diet_Category)
trt <- diet_cat_lu %>% transmute(B.Code, Trt     = A.Level.Name, Cat_trt  = Diet_Category)

diet_switches <- pairs %>%
  left_join(ctl, by = c("B.Code","Control")) %>%
  left_join(trt, by = c("B.Code","Trt")) %>%
  mutate(
    Cat_ctrl = ifelse(is.na(Cat_ctrl) | Cat_ctrl=="", "Unknown", Cat_ctrl),
    Cat_trt  = ifelse(is.na(Cat_trt)  | Cat_trt=="",  "Unknown", Cat_trt),
    Switch   = paste0(Cat_ctrl, " \u2192 ", Cat_trt),
    Practice_simple = dplyr::case_when(
      Cat_ctrl == "Unknown" | Cat_trt == "Unknown" ~ "Unknown",
      Cat_ctrl == Cat_trt                          ~ "Same category",
      TRUE                                         ~ "Category switch"
    )
  ) %>%
  arrange(B.Code, Control, Trt)

# counts + transition table
switch_counts <- diet_switches %>%
  count(Cat_ctrl, Cat_trt, name = "n") %>%
  arrange(desc(n))

transition_table <- switch_counts %>%
  tidyr::pivot_wider(names_from = Cat_trt, values_from = n, values_fill = 0) %>%
  arrange(Cat_ctrl)

# --- Results ready to use ---------------------------------------------------
# diet_switches    # one row per trt vs control with Switch text
# switch_counts    # tall counts of Cat_ctrl -> Cat_trt
# transition_table # wide matrix of transitions

Practice diet analysis

suppressPackageStartupMessages({
  library(dplyr); library(tidyr); library(stringr); library(ggplot2); library(scales)
})

# ---------- 0) Pick the EI/CH4 fields you want ------------------------------
ei_core <- ei_prod %>%
  select(B.Code,
         A.Level.Name,
         EI_CH4_prod,      # emission intensity (kg CH4 / kg product)
         CH4_kg_year,      # absolute (if available)
         Product, P.Product,
         T.Control,        # "yes"/"no" or TRUE/FALSE
         Yield_kg_day, Yield_kg_year)

# ---------- 1) Join practice_tbl to EI on both arms --------------------------
# practice_tbl must have: B.Code, Control, Trt, Practice, Magnitude_pct, ΔTOTAL, Substitution_pairs, ...
stopifnot(all(c("B.Code","Control","Trt","Practice","Magnitude_pct") %in% names(practice_tbl)))

practice_ei <- practice_tbl %>%
  # attach control EI
  left_join(ei_core, by = c("B.Code","Control" = "A.Level.Name")) %>%
  # attach treatment EI (suffix conflicts)
  left_join(ei_core, by = c("B.Code","Trt" = "A.Level.Name"),
            suffix = c("_ctrl", "_trt")) %>%
  # rename the key metrics so they're easier to read
  rename(
    EI_ctrl         = EI_CH4_prod_ctrl,
    EI_trt          = EI_CH4_prod_trt,
    CH4_ctrl        = CH4_kg_year_ctrl,
    CH4_trt         = CH4_kg_year_trt,
    Yield_day_ctrl  = Yield_kg_day_ctrl,
    Yield_day_trt   = Yield_kg_day_trt,
    Yield_year_ctrl = Yield_kg_year_ctrl,
    Yield_year_trt  = Yield_kg_year_trt
  ) %>%
  mutate(
    # deltas: + means treatment is worse/higher
    delta_EI_prod   = EI_trt  - EI_ctrl,
    delta_CH4_year  = CH4_trt - CH4_ctrl,
    delta_Yield_day = Yield_day_trt - Yield_day_ctrl,
    delta_Yield_yr  = Yield_year_trt - Yield_year_ctrl,
    # best-available product/species label
    Product = coalesce(Product_trt, Product_ctrl, P.Product_trt, P.Product_ctrl)
  ) %>%
  filter(!is.na(delta_EI_prod))   # keep only pairs with EI on both sides
## Warning in left_join(., ei_core, by = c("B.Code", Control = "A.Level.Name")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 17 of `x` matches multiple rows in `y`.
## ℹ Row 3 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
## Warning in left_join(., ei_core, by = c("B.Code", Trt = "A.Level.Name"), : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 17 of `x` matches multiple rows in `y`.
## ℹ Row 17 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# Quick species count (how many papers per species)
species_counts <- practice_ei %>%
  distinct(B.Code, Species = coalesce(P.Product_ctrl, P.Product_trt, Product_ctrl, Product_trt)) %>%
  count(Species, name = "n_papers") %>%
  arrange(desc(n_papers))

# ---------- 2) Simple summaries ---------------------------------------------
# (a) By practice (Addition/Substitution/No change)
summary_by_practice <- practice_ei %>%
  group_by(Practice) %>%
  summarise(
    n_pairs   = n(),
    mean_ΔEI  = mean(delta_EI_prod, na.rm = TRUE),
    median_ΔEI= median(delta_EI_prod, na.rm = TRUE),
    mean_mag  = mean(Magnitude_pct, na.rm = TRUE),
    .groups = "drop"
  )

# (b) By practice × species
summary_by_practice_species <- practice_ei %>%
  group_by(Product, Practice) %>%
  summarise(
    n_pairs   = n(),
    mean_ΔEI  = mean(delta_EI_prod, na.rm = TRUE),
    median_ΔEI= median(delta_EI_prod, na.rm = TRUE),
    mean_mag  = mean(Magnitude_pct, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(Product, Practice)

# ---------- 3) Plots (practice-level) ---------------------------------------
# 3a) Scatter: magnitude vs ΔEI by practice
p_scatter <- ggplot(practice_ei,
                    aes(Magnitude_pct, delta_EI_prod, colour = Practice)) +
  geom_hline(yintercept = 0, linetype = 2, colour = "grey60") +
  geom_point(alpha = .7, size = 2) +
  geom_smooth(method = "lm", se = TRUE, colour = "black", formula = y ~ x) +
  scale_x_continuous("% of diet DM moved (practice magnitude)") +
  scale_y_continuous("Δ EI (kg CH₄ / kg product)") +
  facet_wrap(~ Product) +
  theme_bw() +
  theme(legend.position = "bottom")

# 3b) Violin/box: ΔEI by practice
p_violin <- ggplot(practice_ei,
                   aes(Practice, delta_EI_prod, fill = Practice)) +
  geom_violin(trim = FALSE, alpha = .35, colour = NA) +
  geom_boxplot(width = .18, outlier.shape = NA) +
  stat_summary(fun = median, geom = "point", size = 2, colour = "black") +
  geom_hline(yintercept = 0, linetype = 2, colour = "grey50") +
  scale_y_continuous("Δ EI (kg CH₄ / kg product)") +
  facet_wrap(~ Product, nrow = 1) +
  labs(x = NULL, title = "Emission-intensity shift by practice") +
  theme_bw() +
  theme(legend.position = "none")

# 3c) Trade-off: ΔYield vs ΔEI; size = magnitude
p_tradeoff <- ggplot(practice_ei,
                     aes(delta_Yield_day, delta_EI_prod,
                         colour = Practice,
                         size   = Magnitude_pct)) +
  geom_vline(xintercept = 0, linetype = 2, colour = "grey60") +
  geom_hline(yintercept = 0, linetype = 2, colour = "grey60") +
  geom_point(alpha = .75) +
  scale_size(range = c(2,6), name = "% moved") +
  scale_x_continuous("Δ Yield (kg animal⁻¹ d⁻¹)", labels = comma) +
  scale_y_continuous("Δ EI (kg CH₄ / kg product)", labels = comma) +
  facet_wrap(~ Product) +
  labs(title = "Productivity vs emission-intensity trade-offs, by practice") +
  theme_minimal() +
  theme(legend.position = "bottom")

# Print them as needed:
# p_scatter; p_violin; p_tradeoff

# ---------- 4) Optional: look inside Substitution pairs ---------------------
# Uses the text column `Substitution_pairs` built in practice_tbl, e.g. "Forage → Concentrate; X → Y"
subs_pairs <- practice_ei %>%
  filter(Practice == "Substitution") %>%
  select(B.Code, Control, Trt, Product, Magnitude_pct, delta_EI_prod, Substitution_pairs) %>%
  mutate(Substitution_pairs = ifelse(is.na(Substitution_pairs), "", Substitution_pairs)) %>%
  filter(Substitution_pairs != "") %>%
  separate_rows(Substitution_pairs, sep = ";\\s*") %>%
  mutate(
    left  = str_trim(str_split_fixed(Substitution_pairs, "→", 2)[,1]),
    right = str_trim(str_split_fixed(Substitution_pairs, "→", 2)[,2])
  ) %>%
  filter(left != "", right != "") %>%
  mutate(pair_id = paste(left, "→", right))

# quick tally and mean ΔEI per pair (overall and by species)
subs_pair_summary <- subs_pairs %>%
  group_by(pair_id) %>%
  summarise(n = n(),
            mean_ΔEI = mean(delta_EI_prod, na.rm = TRUE),
            mean_mag = mean(Magnitude_pct, na.rm = TRUE),
            .groups = "drop") %>%
  arrange(desc(n))

subs_pair_by_product <- subs_pairs %>%
  group_by(Product, pair_id) %>%
  summarise(n = n(),
            mean_ΔEI = mean(delta_EI_prod, na.rm = TRUE),
            mean_mag = mean(Magnitude_pct, na.rm = TRUE),
            .groups = "drop") %>%
  arrange(Product, desc(n))

# Example plot: ΔEI vs % moved, coloured by pair direction
p_subs <- ggplot(subs_pairs,
                 aes(x = delta_EI_prod, y = Magnitude_pct,
                     colour = pair_id)) +
  geom_vline(xintercept = 0, linetype = 2, colour = "grey60") +
  geom_point(alpha = .8) +
  facet_wrap(~ Product) +
  scale_y_continuous("% of diet DM substituted") +
  scale_x_continuous("Δ EI (kg CH₄ / kg product)", breaks = pretty_breaks()) +
  labs(title = "Substitution pairs: direction & intensity vs ΔEI",
       colour = "Removed → Added") +
  theme_bw() +
  theme(legend.position = "right")

# ---------- 5) A compact “effect” label (optional) --------------------------
EI_THRESH <- 0.02   # tweak as you like (kg CH4 / kg product)
practice_ei <- practice_ei %>%
  mutate(effect = case_when(
    delta_EI_prod <= -EI_THRESH ~ "Lower EI",
    delta_EI_prod >=  EI_THRESH ~ "Higher EI",
    TRUE                        ~ "Neutral"
  ))

effect_by_practice_species <- practice_ei %>%
  group_by(Product, Practice, effect) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(Product, Practice) %>%
  mutate(pct = 100*n / sum(n)) %>%
  ungroup() %>%
  arrange(Product, Practice, desc(pct))

# View the main outputs:
species_counts
## # A tibble: 3 × 2
##   Species n_papers
##   <chr>      <int>
## 1 Goat           9
## 2 Sheep          8
## 3 Cattle         3
summary_by_practice
## # A tibble: 3 × 5
##   Practice     n_pairs   mean_ΔEI median_ΔEI mean_mag
##   <chr>          <int>      <dbl>      <dbl>    <dbl>
## 1 Addition         113  0.0000412  0.0000104     27.2
## 2 No change         42  0.0431     0.00256        0  
## 3 Substitution     268 -0.00232   -0.00277      107.
summary_by_practice_species
## # A tibble: 6 × 6
##   Product Practice     n_pairs  mean_ΔEI  median_ΔEI mean_mag
##   <chr>   <chr>          <int>     <dbl>       <dbl>    <dbl>
## 1 Meat    Addition         107 -0.000965  0.0000104      27.1
## 2 Meat    No change          9 -0.000900 -0.000619        0  
## 3 Meat    Substitution      17 -0.00614  -0.00000535     79.7
## 4 Milk    Addition           6  0.0180    0.00456        29.2
## 5 Milk    No change         33  0.0551    0.00256         0  
## 6 Milk    Substitution     251 -0.00206  -0.00277       109.
subs_pair_summary
## # A tibble: 9 × 4
##   pair_id               n    mean_ΔEI mean_mag
##   <chr>             <int>       <dbl>    <dbl>
## 1 II → Unclassified    10 -0.000120     128.  
## 2 I → II                8 -0.00404        6.22
## 3 II → VI               8 -0.00404        6.22
## 4 II → IV               3 -0.000389      26.1 
## 5 I → III               1 -0.00129       36.8 
## 6 IV → II               1 -0.00129       36.8 
## 7 Unclassified → IV     1 -0.0000137     32.4 
## 8 Unclassified → VI     1 -0.00000123    30   
## 9 V → Unclassified      1 -0.0000200     10
subs_pair_by_product
## # A tibble: 13 × 5
##    Product pair_id               n    mean_ΔEI mean_mag
##    <chr>   <chr>             <int>       <dbl>    <dbl>
##  1 Meat    II → Unclassified     9 -0.00000535   138   
##  2 Meat    I → II                4 -0.0261         6.22
##  3 Meat    II → VI               4 -0.0261         6.22
##  4 Meat    II → IV               2 -0.00000395    22.5 
##  5 Meat    Unclassified → IV     1 -0.0000137     32.4 
##  6 Meat    Unclassified → VI     1 -0.00000123    30   
##  7 Meat    V → Unclassified      1 -0.0000200     10   
##  8 Milk    I → II                4  0.0180         6.22
##  9 Milk    II → VI               4  0.0180         6.22
## 10 Milk    I → III               1 -0.00129       36.8 
## 11 Milk    II → IV               1 -0.00116       33.3 
## 12 Milk    II → Unclassified     1 -0.00115       33.3 
## 13 Milk    IV → II               1 -0.00129       36.8
p_scatter; p_violin; p_tradeoff; p_subs

suppressPackageStartupMessages({
  library(dplyr); library(tidyr); library(stringr); library(ggplot2); library(scales)
})

# -------- helpers -------------------------------------------------------------
norm_key <- function(x){
  x %>% tolower() %>% stringr::str_replace_all("[[:space:]]+", " ") %>% trimws()
}

# robust find of WG/ADG subinds (handles many name variants)
is_wg_subind <- function(x){
  str_detect(
    tolower(x),
    "(^|\\b)(weight\\s*gain|daily\\s*average\\s*weight\\s*gain|average\\s*daily\\s*weight\\s*gain|adg)(\\b|$)"
  )
}

# -------- 1) STANDARDISE WEIGHT GAIN to kg/day --------------------------------
# Source: merged_metadata$Data.Out  (your harmonised outcomes table)
# Uses weights_unique for BW^0.75 cases (g/kg metabolic weight).
stopifnot(exists("merged_metadata"), "Data.Out" %in% names(merged_metadata))
stopifnot(exists("weights_unique"))

wg_raw <- merged_metadata$Data.Out %>%
  filter(!is.na(Out.Subind)) %>%
  filter(is_wg_subind(Out.Subind)) %>%
  mutate(
    unit_raw = tolower(str_squish(Out.Unit)),
    value    = suppressWarnings(as.numeric(ED.Mean.T))
  ) %>%
  # keep only per-day measures or metabolic-weight forms
  filter(
    str_detect(unit_raw, "/d|/day| per day") |
    str_detect(unit_raw, "g/kg metabolic weight")
  ) %>%
  mutate(
    mass_unit = case_when(
      str_starts(unit_raw, "mg") ~ "mg",
      str_starts(unit_raw, "g")  ~ "g",
      str_starts(unit_raw, "kg") ~ "kg",
      TRUE                       ~ NA_character_
    ),
    gain_kg_day = case_when(
      mass_unit == "mg" ~ value / 1e6,
      mass_unit == "g"  ~ value / 1e3,
      mass_unit == "kg" ~ value,
      TRUE              ~ NA_real_
    )
  ) %>%
  # for g/kg metabolic weight, convert using BW^0.75
  left_join(
    weights_unique %>%
      transmute(B.Code, A.Level.Name, BW_start_kg = as.numeric(Out.WG.Start)),
    by = c("B.Code","A.Level.Name")
  ) %>%
  mutate(
    WG_kg_day_std = case_when(
      str_detect(unit_raw, "g/kg metabolic weight") &
        !is.na(value) & !is.na(BW_start_kg) ~ (value/1000) * (BW_start_kg^0.75),
      TRUE ~ gain_kg_day
    ),
    # normalised diet-arm key for safer joining later
    A_norm = norm_key(A.Level.Name)
  ) %>%
  filter(!is.na(WG_kg_day_std) & WG_kg_day_std > 0) %>%
  select(B.Code, A.Level.Name, A_norm, WG_kg_day_std)

# -------- 2) CH4 table with normalised arm names ------------------------------
# GE_combined has CH4_kg_year and diet arm under D.Item
ch4_tbl <- GE_combined %>%
  transmute(
    B.Code,
    A.Level.Name = D.Item,
    A_norm = norm_key(D.Item),
    CH4_kg_year = as.numeric(CH4_kg_year),
    P.Product,
    T.Control = ifelse(is.na(T.Control), NA_character_, T.Control)
  ) %>%
  filter(!is.na(CH4_kg_year))

# -------- 3) Build EI for weight gain (ei_wg) ---------------------------------
ei_wg <- ch4_tbl %>%
  inner_join(wg_raw, by = c("B.Code","A_norm")) %>%
  mutate(
    WG_kg_year = WG_kg_day_std * 365,
    EI_CH4_WG  = CH4_kg_year / WG_kg_year
  ) %>%
  filter(!is.na(EI_CH4_WG), EI_CH4_WG > 0, EI_CH4_WG < 1) %>%  # trim extremes
  select(B.Code,
         A.Level.Name = A.Level.Name.x,  # keep CH4-side label
         CH4_kg_year, P.Product, T.Control,
         WG_kg_day_std, WG_kg_year, EI_CH4_WG)
## Warning in inner_join(., wg_raw, by = c("B.Code", "A_norm")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 109 of `x` matches multiple rows in `y`.
## ℹ Row 1763 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# (Optional) quick diagnostics: unmatched WG or CH4
wg_unmatched  <- anti_join(wg_raw,  ch4_tbl, by = c("B.Code","A_norm")) %>% distinct(B.Code, A.Level.Name)
ch4_unmatched <- anti_join(ch4_tbl, wg_raw,  by = c("B.Code","A_norm")) %>% distinct(B.Code, A.Level.Name)
# View(wg_unmatched); View(ch4_unmatched)

# -------- 4) Join EI_WG to practice_tbl comparisons --------------------------
# Expect practice_tbl with columns: B.Code, Control, Trt, Practice, Magnitude_pct (from your earlier code)
stopifnot(exists("practice_tbl"))

ei_core <- ei_wg %>%
  select(B.Code, A.Level.Name, P.Product, T.Control,
         WG_kg_day_std, WG_kg_year, EI_CH4_WG, CH4_kg_year)

wg_pairs <- practice_tbl %>%
  transmute(B.Code, Control, Trt, Practice, Magnitude_pct) %>%
  # attach A (control)
  left_join(ei_core,
            by = c("B.Code","Control" = "A.Level.Name"),
            multiple = "first") %>%
  rename(
    P.Product_A    = P.Product,
    T.Control_A    = T.Control,
    WG_kg_day_A    = WG_kg_day_std,
    WG_kg_year_A   = WG_kg_year,
    EI_WG_A        = EI_CH4_WG,
    CH4_kg_year_A  = CH4_kg_year
  ) %>%
  # attach B (treatment)
  left_join(ei_core,
            by = c("B.Code","Trt" = "A.Level.Name"),
            multiple = "first") %>%
  rename(
    P.Product_B    = P.Product,
    T.Control_B    = T.Control,
    WG_kg_day_B    = WG_kg_day_std,
    WG_kg_year_B   = WG_kg_year,
    EI_WG_B        = EI_CH4_WG,
    CH4_kg_year_B  = CH4_kg_year
  ) %>%
  mutate(
    Species        = coalesce(P.Product_B, P.Product_A),
    delta_EI_WG    = EI_WG_B - EI_WG_A,          # (+) = worse intensity
    delta_WG_day   = WG_kg_day_B - WG_kg_day_A,  # change in ADG
    delta_CH4_year = CH4_kg_year_B - CH4_kg_year_A
  ) %>%
  filter(!is.na(EI_WG_A), !is.na(EI_WG_B))

# -------- 5) A couple of ready-to-run summaries/plots ------------------------

# A) Box/violin: EI shift by practice type (all species together)
ggplot(wg_pairs,
       aes(x = Practice, y = delta_EI_WG, fill = Practice)) +
  geom_violin(trim = FALSE, alpha = .35, colour = NA) +
  geom_boxplot(width = .15, outlier.shape = NA) +
  geom_hline(yintercept = 0, linetype = 2, colour = "grey40") +
  stat_summary(fun = median, geom = "point", size = 2, colour = "black") +
  scale_y_continuous("Δ EI (kg CH₄ per kg LW-gain)",
                     labels = comma_format()) +
  labs(x = NULL, title = "Change in CH₄ intensity of growth by practice") +
  theme_bw() + theme(legend.position = "none")

# B) Scatter: magnitude of reformulation vs intensity shift (faceted by species)
ggplot(wg_pairs,
       aes(x = Magnitude_pct, y = delta_EI_WG, colour = Practice)) +
  geom_hline(yintercept = 0, linetype = 2, colour = "grey60") +
  geom_point(alpha = .7, size = 2) +
  geom_smooth(method = "lm", se = TRUE, colour = "black") +
  scale_x_continuous("% of diet DM changed") +
  scale_y_continuous("Δ EI (kg CH₄ per kg LW-gain)",
                     labels = comma_format()) +
  facet_wrap(~ Species) +
  theme_bw() + theme(legend.position = "bottom")
## `geom_smooth()` using formula = 'y ~ x'

# C) Table: how many comparisons per species × practice
wg_pairs %>%
  count(Species, Practice, name = "n_pairs") %>%
  arrange(desc(n_pairs))
## # A tibble: 8 × 3
##   Species Practice     n_pairs
##   <chr>   <chr>          <int>
## 1 Sheep   Addition          22
## 2 Goat    Addition          13
## 3 Goat    Substitution       7
## 4 Cattle  Addition           5
## 5 Sheep   No change          5
## 6 Sheep   Substitution       4
## 7 Cattle  Substitution       2
## 8 Goat    No change          1
suppressPackageStartupMessages({
  library(dplyr); library(tidyr); library(stringr)
  library(ggplot2); library(forcats); library(scales)
})

# ── config ────────────────────────────────────────────────────────────────────
MIN_N <- 3         # hide groups with <3 comparisons to keep plots clean
FACET_BY_SPECIES <- FALSE  # set TRUE if you want species facets

# ── 0) Bring types onto the EI deltas (wg_pairs) ─────────────────────────────
stopifnot(exists("wg_pairs"), exists("practice_tbl"))

pairs_with_types <- wg_pairs %>%
  left_join(
    practice_tbl %>%
      select(B.Code, Control, Trt, Practice,
             Types_added, Types_decreased, Substitution_pairs),
    by = c("B.Code","Control","Trt","Practice")
  )

# ── 1) ADDITIONS: explode Types_added → one row per category added ───────────
added_long <- pairs_with_types %>%
  filter(Practice == "Addition") %>%
  mutate(Types_added = na_if(Types_added, "")) %>%
  separate_rows(Types_added, sep = "\\s*,\\s*") %>%
  mutate(Category_added = str_trim(Types_added)) %>%
  filter(!is.na(Category_added),
         Category_added != "", Category_added != "Unclassified")

# keep only categories with enough points
add_keep <- added_long %>%
  count(Category_added, name = "n") %>%
  filter(n >= MIN_N)

added_long <- added_long %>%
  semi_join(add_keep, by = "Category_added") %>%
  mutate(
    x_lab = fct_reorder(
      paste0(Category_added, " (n=", add_keep$n[match(Category_added, add_keep$Category_added)], ")"),
      delta_EI_WG,
      .fun = median, .desc = TRUE, na.rm = TRUE
    )
  )

# ── Plot A: ΔEI by CATEGORY ADDED ────────────────────────────────────────────
p_add <- ggplot(
  added_long,
  aes(x = delta_EI_WG, y = x_lab)
) +
  geom_vline(xintercept = 0, linetype = 2, colour = "grey65") +
  geom_boxplot(outlier.shape = NA, fill = "grey85") +
  geom_point(alpha = .55, size = 1.8) +
  scale_x_continuous("Δ EI (kg CH₄ per kg live-weight gain)", labels = comma) +
  labs(y = NULL, title = "Change in emission intensity by category ADDED") +
  theme_bw() +
  theme(panel.grid.minor = element_blank())

if (FACET_BY_SPECIES) {
  p_add <- p_add + facet_wrap(~ Species, ncol = 1, scales = "free_y")
}

p_add

# ── 2) SUBSTITUTIONS: explode "Removed → Added" pairs ────────────────────────
subs_long <- pairs_with_types %>%
  filter(Practice == "Substitution") %>%
  mutate(Substitution_pairs = coalesce(Substitution_pairs, "")) %>%
  separate_rows(Substitution_pairs, sep = ";\\s*") %>%
  mutate(Substitution_pairs = str_trim(Substitution_pairs)) %>%
  filter(Substitution_pairs != "") %>%
  separate(Substitution_pairs, into = c("Removed", "Added"),
           sep = "\\s*→\\s*", remove = FALSE) %>%
  filter(!Removed %in% c("", "Unclassified"),
         !Added   %in% c("", "Unclassified"))

# keep only pairs with enough points
subs_keep <- subs_long %>%
  count(Removed, Added, name = "n") %>%
  filter(n >= MIN_N)

subs_long <- subs_long %>%
  inner_join(subs_keep, by = c("Removed","Added")) %>%
  mutate(
    pair_lab = paste0(Removed, " → ", Added, " (n=", n, ")")
  ) %>%
  mutate(
    pair_lab = forcats::fct_reorder(
      pair_lab, delta_EI_WG,
      .fun  = ~ median(.x, na.rm = TRUE),
      .desc = TRUE
    )
  )

# ── Plot B: ΔEI by SUBSTITUTION PAIR ─────────────────────────────────────────
p_sub <- ggplot(
  subs_long,
  aes(x = delta_EI_WG, y = pair_lab)
) +
  geom_vline(xintercept = 0, linetype = 2, colour = "grey65") +
  geom_boxplot(outlier.shape = NA, fill = "grey85") +
  geom_point(aes(size = Magnitude_pct), alpha = .6) +   # size encodes % DM swapped
  scale_size(range = c(1.5, 6), name = "% diet DM moved") +
  scale_x_continuous("Δ EI (kg CH₄ per kg live-weight gain)", labels = comma) +
  labs(y = NULL, title = "Change in emission intensity by SUBSTITUTION (Removed → Added)") +
  theme_bw() +
  theme(panel.grid.minor = element_blank(),
        legend.position = "right")

if (FACET_BY_SPECIES) {
  p_sub <- p_sub + facet_wrap(~ Species, ncol = 1, scales = "free_y")
}

p_sub

# ── 3) (Optional) quick tables you may want in the text/report  --------------
add_summ <- added_long %>%
  group_by(Category_added) %>%
  summarise(n = n(),
            median_delta = median(delta_EI_WG, na.rm = TRUE),
            iqr_delta    = IQR(delta_EI_WG, na.rm = TRUE),
            .groups = "drop") %>%
  arrange(median_delta)

sub_summ <- subs_long %>%
  group_by(Removed, Added) %>%
  summarise(n = n(),
            median_delta = median(delta_EI_WG, na.rm = TRUE),
            iqr_delta    = IQR(delta_EI_WG, na.rm = TRUE),
            .groups = "drop") %>%
  arrange(median_delta)

add_summ
## # A tibble: 4 × 4
##   Category_added     n median_delta iqr_delta
##   <chr>          <int>        <dbl>     <dbl>
## 1 VI                 5    -0.108       0.306 
## 2 II                 5    -0.000818    0.0539
## 3 IV                10     0.00454     0.0441
## 4 V                  4     0.0131      0.0149
sub_summ
## # A tibble: 0 × 5
## # ℹ 5 variables: Removed <chr>, Added <chr>, n <int>, median_delta <dbl>,
## #   iqr_delta <dbl>