elegant code
get the average Sepal.Length per Species
■ Consider a better alternative:
# Mean Sepal.Length for each level of iris$Species, collected in a named vector.
groups <- levels(iris$Species)
# Start from an empty vector; each loop pass adds one element named after the level.
averages <- c()
for(g in groups) averages[g] <-
mean(iris$Sepal.Length[iris$Species==g])
# Remove the loop variable and the helper vector, leaving `averages` in place.
rm(g, groups)
averages
conditionally set invalid records to NA
■ Consider a better alternative:
# Walk over the rows of DF one at a time and set column2 to NA wherever
# column1 is anything other than "valid".
# NOTE(review): an NA in column1 makes the `if` condition NA, which R rejects
# with an error.
for(i in c(1:nrow(DF))){
if(DF[i, "column1"] != "valid"){ # Where column1 is not
DF[i, "column2"] <- NA # 'valid', set
} # column2 to NA
}
scale numeric columns
■ Consider a better alternative:
# Standardize every numeric column of DF: subtract the column mean and divide
# by the column standard deviation, one cell at a time.
for(col in names(DF)) {
# Non-numeric columns are left untouched.
if(is.numeric(DF[ ,col])) {
# Both statistics are computed once, before the column is modified, ignoring NAs.
mean_val <- mean(DF[ ,col], na.rm=TRUE)
sd_val <- sd(DF[ ,col], na.rm=TRUE)
for(i in 1:nrow(DF)) {
DF[i, col] <- (DF[i, col] - mean_val) / sd_val
}
}
}
then click here to see the elegant approach:
read multiple csv files
■ Consider a better alternative:
# Read each CSV file into its own data frame, one hand-written call per file.
file1 <- read.csv("data1.csv")
file2 <- read.csv("data2.csv")
file3 <- read.csv("data3.csv")
# ... repeat for 50 files
combined <- rbind(file1, file2, file3) # ... and so on
then click here to see the elegant approach:
count occurrences
■ Consider a better alternative:
# Tally how many rows of DF carry each distinct value of DF$category.
categories <- unique(DF$category)
# One zero-initialised counter per category, addressed by the category's name.
counts <- numeric(length(categories))
names(counts) <- categories
# NOTE(review): assumes DF$category is a character vector; a factor would index
# `counts` by integer code rather than by name — confirm.
for(i in 1:nrow(DF)) {
cat <- DF$category[i]
counts[cat] <- counts[cat] + 1
}
create age groups
■ Consider a better alternative:
# Assign each row of DF to an age bracket based on DF$age, one row at a time.
# Every row starts out unassigned.
DF$age_group <- NA
for(i in 1:nrow(DF)) {
# Brackets are half-open: the lower bound is included, the upper bound excluded.
# Negative ages match none of the brackets and end up as NA in the final else.
# NOTE(review): an NA age makes the first condition NA, which `if` rejects with
# an error.
if(DF$age[i] >= 0 & DF$age[i] < 18) DF$age_group[i] <- "0-17"
else if(DF$age[i] >= 18 & DF$age[i] < 30) DF$age_group[i] <- "18-29"
else if(DF$age[i] >= 30 & DF$age[i] < 50) DF$age_group[i] <- "30-49"
else if(DF$age[i] >= 50 & DF$age[i] < 65) DF$age_group[i] <- "50-64"
else if(DF$age[i] >= 65) DF$age_group[i] <- "65+"
else DF$age_group[i] <- NA
}
then click here to see the elegant approach:
summary statistics per group
■ Consider a better alternative:
# Reproducible example data: 40 rows with a Poisson-distributed v1, an
# exponentially distributed v2, and a group indicator alternating 0, 1.
set.seed(4)
values <- data.frame(v1=rpois(40,3), v2=rexp(40), group=rep(0:1, 20))
# dplyr is loaded for the pipe, group_by() and summarise() used in compare_groups().
library(dplyr)
# Summarise one column of `data` separately for each value of its `group` column.
#
# data:     data frame containing a column named `group` and the column named
#           by `variable`.
# variable: name of the column to summarise, given as a character string.
#
# Returns a plain data frame with one row per group and the columns
# group, Min, Q25, Median, Mean, Q75 and Max. NAs are dropped before each
# statistic is computed.
compare_groups <- function(data, variable) {
summary_stats <- data %>%
group_by(group) %>%
summarise(
# get(variable) resolves the column from its string name for every statistic.
Min = min(get(variable), na.rm = TRUE),
Q25 = quantile(get(variable), 0.25, na.rm = TRUE),
Median = median(get(variable), na.rm = TRUE),
Mean = mean(get(variable), na.rm = TRUE),
Q75 = quantile(get(variable), 0.75, na.rm = TRUE),
Max = max(get(variable), na.rm = TRUE),
# Return an ungrouped result.
.groups = "drop"
)
# Convert the summarise() result to a base data frame before returning it.
return(as.data.frame(summary_stats))
}
compare_groups(values, "v1")
Cohen’s D for group mean differences
■ Consider a better alternative:
# Standardised difference between the means of two numeric vectors.
#
# group1, group2: numeric vectors; NAs are ignored in the means and variances.
#
# Returns mean(group1) - mean(group2) divided by the square root of the average
# of the two variances, so the sign follows group1 minus group2. The two
# variances are averaged with equal weight, regardless of group sizes.
cohens_d <- function(group1, group2) {
mean_diff <- mean(group1, na.rm = TRUE) - mean(group2, na.rm = TRUE)
pooled_sd <- sqrt((var(group1, na.rm = TRUE) + var(group2, na.rm = TRUE)) / 2)
return(mean_diff / pooled_sd)
}
# Absolute effect size for each listed variable, comparing the rows whose group
# is "gr1" against the rows whose group is "gr2".
# NOTE(review): the column names "V1".."V4" and the group labels "gr1"/"gr2" do
# not match the `values` data frame built in the summary-statistics example
# (columns v1, v2 and group 0/1) — presumably a different `values` is intended;
# confirm.
variables_to_compare <- c("V1", "V2", "V3", "V4")
effect_sizes <- sapply(variables_to_compare, function(var) {
gr1 <- values[values$group == "gr1", var]
gr2 <- values[values$group == "gr2", var]
abs(cohens_d(gr1, gr2))
})
then click here to see the elegant approach:
More examples are shown in the fundamentals of programming tutorial slides.