elegant code

get the average Sepal.Length per Species

■ Consider a better alternative:

# Manual version: loop over the species levels and fill a named vector
# with the mean Sepal.Length of each species, one element at a time.
groups <- levels(iris$Species)
averages <- c()
for(g in groups) averages[g] <- 
                 mean(iris$Sepal.Length[iris$Species==g])
rm(g, groups)  # remove the loop helpers from the workspace
averages

then click here to see the elegant approach:

# tapply() splits Sepal.Length by Species and applies mean() to each group;
# the result is named by species level.
tapply(iris$Sepal.Length, iris$Species, mean)

conditionally set invalid records to NA

■ Consider a better alternative:

# Row-by-row version: every row of DF is inspected individually.
for(i in c(1:nrow(DF))){
  if(DF[i, "column1"] != "valid"){  # Where column1 is not
   DF[i, "column2"] <- NA           # 'valid', set
  }                                 # column2 to NA
}

then click here to see the elegant approach:

# Vectorized: the logical mask selects every row where column1 is not
# "valid", and column2 is set to NA for all of them in one assignment.
DF$column2[DF$column1 != "valid"] <- NA

collect results

■ Consider a better alternative:

# Starts from an empty vector and appends one result per iteration with
# c(), so `results` is rebuilt on every pass through the loop.
results <- c()
for(i in 1:10000) {
  results <- c(results, some_calculation(i))
}

then click here to see the elegant approach:

# Apply some_calculation() to the same 10000 inputs as the loop version
# and collect all results in a single call.
results <- sapply(1:10000, some_calculation)

scale numeric columns

■ Consider a better alternative:

# Nested-loop version: for every numeric column, compute its mean and
# standard deviation (ignoring NAs), then standardize the column one
# cell at a time.
for(col in names(DF)) {
  if(is.numeric(DF[ ,col])) {
    mean_val <- mean(DF[ ,col], na.rm=TRUE)
    sd_val <- sd(DF[ ,col], na.rm=TRUE)
    for(i in 1:nrow(DF)) {
      DF[i, col] <- (DF[i, col] - mean_val) / sd_val
    }
  }
}

then click here to see the elegant approach:

# Flag the numeric columns, then let scale() center and standardize all of
# them in one call.
numcols <- sapply(DF, is.numeric)
DF[ ,numcols] <- scale(DF[ ,numcols])
rm(numcols)  # drop the helper mask from the workspace

read multiple csv files

■ Consider a better alternative:

# One hard-coded read.csv() call and one variable per input file, followed
# by an rbind() that has to name every one of them.
file1 <- read.csv("data1.csv")
file2 <- read.csv("data2.csv")
file3 <- read.csv("data3.csv")
# ... repeat for 50 files
combined <- rbind(file1, file2, file3) # ... and so on

then click here to see the elegant approach:

# `pattern` is a regular expression, not a shell glob: escape the dot and
# anchor at the end so only file names ending in ".csv" are matched.
files <- list.files(pattern = "\\.csv$", full.names = TRUE)
# Read every file and stack the resulting data frames row-wise.
combined <- do.call(rbind, lapply(files, read.csv))

count occurrences

■ Consider a better alternative:

# Hand-rolled tally: create a zero counter named after each distinct
# category, then walk through DF row by row and increment the counter of
# that row's category.
# NOTE(review): the local variable `cat` reuses the name of base::cat().
# NOTE(review): if DF$category is a factor, counts[cat] indexes by the
# factor's integer code rather than by name - confirm it is character.
categories <- unique(DF$category)
counts <- numeric(length(categories))
names(counts) <- categories
for(i in 1:nrow(DF)) {
  cat <- DF$category[i]
  counts[cat] <- counts[cat] + 1
}

then click here to see the elegant approach:

# table() tallies how often each distinct value of category occurs.
table(DF$category)

create age groups

■ Consider a better alternative:

# Row-by-row version: test each age against every bracket in turn and
# write the matching label; ages below 0 fall through to the final
# `else` and stay NA.
DF$age_group <- NA
for(i in 1:nrow(DF)) {
       if(DF$age[i] >=  0 & DF$age[i] < 18) DF$age_group[i] <- "0-17"
  else if(DF$age[i] >= 18 & DF$age[i] < 30) DF$age_group[i] <- "18-29"
  else if(DF$age[i] >= 30 & DF$age[i] < 50) DF$age_group[i] <- "30-49"
  else if(DF$age[i] >= 50 & DF$age[i] < 65) DF$age_group[i] <- "50-64"
  else if(DF$age[i] >= 65)                  DF$age_group[i] <- "65+"
  else                                      DF$age_group[i] <- NA
}

then click here to see the elegant approach:

# cut() bins age into the labelled groups. right = FALSE makes every
# interval closed on the left and open on the right, e.g. [18, 30), so an
# age of exactly 18 lands in "18-29". Ages below 0 match no interval and
# become NA. The result is a factor.
DF$age_group <- cut(DF$age, 
                    breaks = c(0, 18, 30, 50, 65, Inf),
                    labels = c("0-17", "18-29", "30-49", "50-64", "65+"),
                    right = FALSE)

summary statistics per group

■ Consider a better alternative:

# Example data: 40 rows with a Poisson-distributed column v1, an
# exponentially distributed column v2 and an alternating 0/1 group.
set.seed(4)
values <- data.frame(v1=rpois(40,3), v2=rexp(40), group=rep(0:1, 20))

library(dplyr)
# Summarise one column of `data` separately for each level of `group`.
#   data:     data frame that contains a `group` column
#   variable: name of the column to summarise, given as a string and
#             resolved with get() inside summarise()
# Returns a plain data.frame with one row per group and the columns
# Min, Q25, Median, Mean, Q75 and Max (NAs are ignored).
compare_groups <- function(data, variable) {
    summary_stats <- data %>%
    group_by(group) %>%
    summarise(
      Min = min(get(variable), na.rm = TRUE),
      Q25 = quantile(get(variable), 0.25, na.rm = TRUE),
      Median = median(get(variable), na.rm = TRUE),
      Mean = mean(get(variable), na.rm = TRUE),
      Q75 = quantile(get(variable), 0.75, na.rm = TRUE),
      Max = max(get(variable), na.rm = TRUE),
      .groups = "drop"
    )
    return(as.data.frame(summary_stats))
}
compare_groups(values, "v1")

then click here to see the elegant approach:

# The formula v1 ~ group splits v1 by group; summary() then reports the
# minimum, quartiles, median, mean and maximum of each group.
aggregate(v1~group, data=values, summary)

Cohen’s d for group mean differences

■ Consider a better alternative:

# Cohen's d for two numeric vectors: the difference of the group means
# divided by a pooled standard deviation, computed here as the square root
# of the unweighted average of the two group variances. NAs are ignored.
cohens_d <- function(group1, group2) {
  mean_diff <- mean(group1, na.rm = TRUE) - mean(group2, na.rm = TRUE)
  pooled_sd <- sqrt((var(group1, na.rm = TRUE) + var(group2, na.rm = TRUE)) / 2)
  return(mean_diff / pooled_sd)
}
# For each listed column, compare the rows with group "gr1" against the
# rows with group "gr2" and keep the absolute effect size.
variables_to_compare <- c("V1", "V2", "V3", "V4")
effect_sizes <- sapply(variables_to_compare, function(var) {
  gr1 <- values[values$group == "gr1", var]
  gr2 <- values[values$group == "gr2", var]
  abs(cohens_d(gr1, gr2))
})

then click here to see the elegant approach:

# Absolute Cohen's d per variable, computed by effectsize::cohens_d() from
# a `column ~ group` formula. abs() keeps the result in line with the
# hand-written version, which reports the magnitude regardless of sign.
# NOTE(review): assumes `values` has numeric columns V1..V4 and a
# two-level `group` column - confirm against the data in use.
sapply(c("V1","V2","V3","V4"), function(col)
  abs(effectsize::cohens_d(values[,col] ~ values$group)$Cohens_d))

More examples are shown in the fundamentals of programming tutorial slides.