R语言数据框 #
一、数据框概述 #
数据框(Data Frame)是R语言中最常用的数据结构,用于存储表格数据。不同的列可以是不同的数据类型,但同一列中的所有元素必须是相同类型。
二、创建数据框 #
2.1 使用data.frame函数 #
r
# Build a data frame from three equal-length column vectors, one per column.
df <- data.frame(
name = c("张三", "李四", "王五"),
age = c(25, 30, 22),
city = c("北京", "上海", "广州"),
# Keep character columns as character instead of converting them to factors
# (this is the default since R 4.0, but is required on older versions).
stringsAsFactors = FALSE
)
print(df)
2.2 从向量创建 #
r
# Define the columns as standalone vectors first.
name <- c("张三", "李四", "王五")
age <- c(25, 30, 22)
city <- c("北京", "上海", "广州")
# Unnamed arguments take their column names from the variable names
# (name, age, city).
df <- data.frame(name, age, city, stringsAsFactors = FALSE)
print(df)
2.3 从矩阵创建 #
r
# A 3 x 4 integer matrix; matrix() fills column by column.
m <- matrix(1:12, nrow = 3, ncol = 4)
# The matrix has no column names, so the columns are named V1..V4.
df <- as.data.frame(m)
print(df)
2.4 从列表创建 #
r
# A named list whose elements have the same length (2 here).
my_list <- list(
name = c("张三", "李四"),
age = c(25, 30)
)
# Each list element becomes a column; the element names become column names.
df <- as.data.frame(my_list)
print(df)
2.5 创建空数据框 #
r
# A completely empty data frame: 0 rows and 0 columns.
df <- data.frame()
print(df)
# An empty data frame with a fixed schema: 0 rows, three typed columns.
# Zero-length vectors declare the column types without supplying data.
df <- data.frame(
name = character(),
age = numeric(),
city = character()
)
print(df)
三、数据框属性 #
3.1 维度 #
r
# Sample data: 3 rows, 2 columns.
df <- data.frame(
name = c("张三", "李四", "王五"),
age = c(25, 30, 22)
)
# c(rows, columns) -> 3 2
dim(df)
# Number of rows -> 3
nrow(df)
# Number of columns -> 2
ncol(df)
# A data frame is a list of columns, so length() is the column count -> 2
length(df)
3.2 列名和行名 #
r
# Sample data: 3 rows, 2 columns.
df <- data.frame(
name = c("张三", "李四", "王五"),
age = c(25, 30, 22)
)
# names() and colnames() both return the column names: "name" "age"
names(df)
colnames(df)
# Default row names are the row numbers as strings: "1" "2" "3"
rownames(df)
# Replace the row names; the new vector must have one unique entry per row.
rownames(df) <- c("A", "B", "C")
print(df)
3.3 结构查看 #
r
# Sample data: 3 rows, 2 columns.
df <- data.frame(
name = c("张三", "李四", "王五"),
age = c(25, 30, 22)
)
# Compact overview: dimensions plus each column's type and first values.
str(df)
# Per-column summary statistics.
summary(df)
# First rows (up to 6 by default).
head(df)
# Last rows (up to 6 by default).
tail(df)
四、访问数据 #
4.1 访问列 #
r
# Sample data: 3 rows, 3 columns.
df <- data.frame(
name = c("张三", "李四", "王五"),
age = c(25, 30, 22),
city = c("北京", "上海", "广州")
)
# `$` extracts one column as a vector.
df$name
# `[[` by name extracts one column as a vector.
df[["age"]]
# `[[` by position: the second column (age) as a vector.
df[[2]]
# Single `[` keeps the container: a one-column data frame.
df["city"]
# Matrix-style indexing with one column drops to a vector (drop = TRUE default).
df[, "name"]
df[, 2]
4.2 访问行 #
r
# Sample data: 3 rows, 2 columns.
df <- data.frame(
name = c("张三", "李四", "王五"),
age = c(25, 30, 22)
)
# First row (the empty column index selects all columns).
df[1, ]
# Rows 1 and 3.
df[c(1, 3), ]
# Rows 1 through 2.
df[1:2, ]
4.3 访问单元格 #
r
# Sample data: 3 rows, 2 columns.
df <- data.frame(
name = c("张三", "李四", "王五"),
age = c(25, 30, 22)
)
# Row 1, column 2 -> 25
df[1, 2]
# Row 2 of the "age" column -> 30
df[2, "age"]
# Extract the column first, then index into the vector -> "张三"
df$name[1]
4.4 逻辑索引 #
r
# Sample data: 3 rows, 2 columns.
df <- data.frame(
name = c("张三", "李四", "王五"),
age = c(25, 30, 22)
)
# Keep rows whose age is greater than 24 (张三, 李四).
df[df$age > 24, ]
# Keep rows whose name equals "张三".
df[df$name == "张三", ]
# Combine conditions with the element-wise `&` (not the scalar `&&`).
df[df$age >= 25 & df$age <= 30, ]
# subset() evaluates the condition inside the data frame, so columns can be
# referenced without the `df$` prefix.
subset(df, age > 24)
五、修改数据框 #
5.1 修改列 #
r
# Sample data: 3 rows, 2 columns.
df <- data.frame(
name = c("张三", "李四", "王五"),
age = c(25, 30, 22)
)
# Overwrite the whole column with new values.
df$age <- c(26, 31, 23)
# Vectorised update based on the current values -> 27, 32, 24.
df[["age"]] <- df$age + 1
print(df)
5.2 添加列 #
r
# Sample data: 3 rows, 2 columns.
df <- data.frame(
name = c("张三", "李四", "王五"),
age = c(25, 30, 22)
)
# Assigning to a name that does not exist yet appends a new column.
df$city <- c("北京", "上海", "广州")
# Same effect using single-bracket assignment.
df["gender"] <- c("男", "男", "女")
# cbind() returns a new data frame with the extra column bound on the right.
df <- cbind(df, score = c(85, 90, 78))
print(df)
5.3 删除列 #
r
# Three ways to delete a column. Each method starts from the same
# three-column data frame so the examples do not interfere with each other.
df <- data.frame(
  name = c("张三", "李四", "王五"),
  age = c(25, 30, 22),
  city = c("北京", "上海", "广州")
)

# Method 1: assign NULL to the column (modifies the copy in place).
by_null <- df
by_null$city <- NULL
print(by_null)

# Method 2: negative position. drop = FALSE guarantees the result stays a
# data frame even when only one column is left; with the default
# drop = TRUE a single remaining column collapses to a plain vector.
by_position <- df[, -2, drop = FALSE]
print(by_position)

# Method 3: exclude columns by name, again keeping the data frame structure.
df <- df[, !names(df) %in% c("age"), drop = FALSE]
print(df)
5.4 添加行 #
r
# Sample data: 2 rows, 2 columns.
df <- data.frame(
name = c("张三", "李四"),
age = c(25, 30)
)
# The new row is a one-row data frame with the same column names.
new_row <- data.frame(name = "王五", age = 22)
# rbind() matches columns by name and appends the row at the bottom.
df <- rbind(df, new_row)
print(df)
5.5 删除行 #
r
# Sample data: 3 rows, 2 columns.
df <- data.frame(
name = c("张三", "李四", "王五"),
age = c(25, 30, 22)
)
# A negative row index removes that row (here the first one).
df <- df[-1, ]
# Keep only the rows that do not match the condition; only 王五 remains.
df <- df[df$name != "李四", ]
print(df)
六、数据框操作 #
6.1 排序 #
r
# Sample data: 3 rows, 2 columns.
df <- data.frame(
name = c("张三", "李四", "王五"),
age = c(25, 30, 22)
)
# order() returns the row permutation that sorts age ascending.
df[order(df$age), ]
# Descending order.
df[order(df$age, decreasing = TRUE), ]
# Sort by age first, using name to break ties.
df[order(df$age, df$name), ]
6.2 去重 #
r
# Sample data in which row 3 is an exact duplicate of row 1.
df <- data.frame(
name = c("张三", "李四", "张三"),
age = c(25, 30, 25)
)
# Drop rows that repeat an earlier row across all columns.
unique(df)
# Equivalent: duplicated() flags repeats, `!` keeps the first occurrences.
df[!duplicated(df), ]
# De-duplicate on a single column: keep the first row for each name.
df[!duplicated(df$name), ]
6.3 合并数据框 #
r
# Left table: ids 1, 2, 3.
df1 <- data.frame(
id = c(1, 2, 3),
name = c("张三", "李四", "王五")
)
# Right table: ids 1, 2, 4.
df2 <- data.frame(
id = c(1, 2, 4),
score = c(85, 90, 78)
)
# Inner join: only ids present in both tables (1, 2).
merge(df1, df2, by = "id")
# Full outer join: all ids (1, 2, 3, 4); missing values are filled with NA.
merge(df1, df2, by = "id", all = TRUE)
# Left join: every row of df1 (1, 2, 3).
merge(df1, df2, by = "id", all.x = TRUE)
# Right join: every row of df2 (1, 2, 4).
merge(df1, df2, by = "id", all.y = TRUE)
6.4 分组统计 #
r
# Sample data: group A has values 10, 20, 50; group B has 30, 40.
df <- data.frame(
group = c("A", "A", "B", "B", "A"),
value = c(10, 20, 30, 40, 50)
)
# Formula interface `value ~ group`: apply the function to value per group.
# Returns a data frame with one row per group (mean of A and of B).
aggregate(value ~ group, data = df, mean)
# Per-group totals: A = 80, B = 70.
aggregate(value ~ group, data = df, sum)
# tapply() computes the same per-group mean, returned with the group
# labels as names rather than as a data frame.
tapply(df$value, df$group, mean)
七、处理缺失值 #
7.1 检查缺失值 #
r
# Sample data with one missing name (row 3) and one missing age (row 2).
df <- data.frame(
name = c("张三", "李四", NA),
age = c(25, NA, 22)
)
# Logical matrix of the same shape: TRUE where a value is missing.
is.na(df)
# Missing count per column (name = 1, age = 1).
colSums(is.na(df))
# Missing count per row (0, 1, 1).
rowSums(is.na(df))
# TRUE if there is at least one NA anywhere.
anyNA(df)
7.2 删除缺失值 #
r
# Sample data with one missing name (row 3) and one missing age (row 2).
df <- data.frame(
name = c("张三", "李四", NA),
age = c(25, NA, 22)
)
# Remove every row that contains any NA; only row 1 remains.
na.omit(df)
# Same rows, selected with a logical vector of fully populated rows.
df[complete.cases(df), ]
# Filter on one column only: rows 1 and 2 stay (row 2 still has an NA age).
df[!is.na(df$name), ]
7.3 替换缺失值 #
r
# Sample data with one missing name (row 3) and one missing age (row 2).
df <- data.frame(
  name = c("张三", "李四", NA),
  age = c(25, NA, 22),
  # Keep `name` as character. On R versions before 4.0 it would otherwise be
  # a factor, and assigning the new label "未知" below would yield NA with an
  # "invalid factor level" warning.
  stringsAsFactors = FALSE
)
# Impute missing ages with the mean of the observed ages; na.rm = TRUE is
# needed because mean() returns NA when any input is NA.
df$age[is.na(df$age)] <- mean(df$age, na.rm = TRUE)
# Replace missing names with a placeholder label.
df$name[is.na(df$name)] <- "未知"
print(df)
八、数据框转换 #
8.1 转置 #
r
# Sample data with one character column and one numeric column.
df <- data.frame(
name = c("张三", "李四"),
age = c(25, 30)
)
# t() converts the data frame to a matrix before transposing. Because the
# columns have mixed types, the result is a character matrix (the ages
# become strings), not a data frame.
t(df)
8.2 转换为矩阵 #
r
# Sample data: two numeric columns.
df <- data.frame(
a = c(1, 2, 3),
b = c(4, 5, 6)
)
# All columns are numeric, so this yields a 3 x 2 numeric matrix.
as.matrix(df)
# data.matrix() converts every column to numeric; for this all-numeric
# data frame the result matches as.matrix().
data.matrix(df)
8.3 转换为列表 #
r
# Sample data: 2 rows, 2 columns.
df <- data.frame(
name = c("张三", "李四"),
age = c(25, 30)
)
# One list element per column, each holding that column's vector.
as.list(df)
# One list element per distinct name, each holding a sub data frame of the
# matching rows.
split(df, df$name)
九、apply函数族 #
9.1 apply #
r
# Sample data: two numeric columns.
# NOTE: apply() converts the data frame to a matrix first. That is harmless
# here because every column is numeric, but with mixed column types the
# values would all be coerced to character.
df <- data.frame(
a = c(1, 2, 3),
b = c(4, 5, 6)
)
# MARGIN = 1: apply the function to each row -> 5, 7, 9.
apply(df, 1, sum)
# MARGIN = 2: apply the function to each column -> a = 6, b = 15.
apply(df, 2, sum)
# Column means -> a = 2, b = 5.
apply(df, 2, mean)
9.2 lapply和sapply #
r
# Sample data: two numeric columns.
df <- data.frame(
a = c(1, 2, 3),
b = c(4, 5, 6)
)
# lapply() iterates over the columns and always returns a list.
lapply(df, sum)
# sapply() does the same but simplifies the result to a named vector
# when it can (a = 2, b = 5).
sapply(df, mean)
# Class of each column ("numeric" for both).
sapply(df, class)
十、实践示例 #
10.1 学生成绩管理 #
r
# Student score sheet: one row per student with two subject scores.
students <- data.frame(
  id = 1:5,
  name = c("张三", "李四", "王五", "赵六", "钱七"),
  math = c(85, 90, 78, 92, 88),
  english = c(80, 85, 90, 75, 82)
)

# Derived columns: total of both subjects and their average.
score_sum <- students$math + students$english
students$total <- score_sum
students$average <- score_sum / 2

# Grade bands by average: >= 85 is "优秀", >= 70 is "良好", anything lower
# is "及格". The higher band is tested first, so it takes precedence.
is_excellent <- students$average >= 85
is_good <- students$average >= 70
students$grade <- ifelse(is_excellent, "优秀", ifelse(is_good, "良好", "及格"))

print(students)
10.2 数据清洗 #
r
# Raw input containing three kinds of bad values: an empty name, a negative
# age used as a placeholder, and a missing score.
raw_data <- data.frame(
  name = c("张三", "李四", "", "王五"),
  age = c(25, -1, 30, 22),
  score = c(85, 90, NA, 78)
)

# Step 1: normalise the bad values to NA so they can be handled uniformly.
blank_name <- raw_data$name == ""
raw_data$name[blank_name] <- NA

negative_age <- raw_data$age < 0
raw_data$age[negative_age] <- NA

# Step 2: drop every row that still contains an NA in any column.
raw_data <- na.omit(raw_data)

print(raw_data)
10.3 数据汇总 #
r
# Sales records: one row per sale with its region, product and amount.
sales <- data.frame(
region = c("北", "南", "北", "南", "北"),
product = c("A", "A", "B", "B", "A"),
amount = c(100, 150, 200, 180, 120)
)
# Total amount per region (北 = 420, 南 = 330).
aggregate(amount ~ region, data = sales, sum)
# Mean amount per product (B = 190; A is the mean of 100, 150, 120).
aggregate(amount ~ product, data = sales, mean)
# Total amount for every region/product combination present in the data.
aggregate(amount ~ region + product, data = sales, sum)
十一、总结 #
本章学习了:
- 数据框的创建方法
- 数据框属性查看
- 数据访问:列、行、单元格
- 数据修改:增删改
- 排序、去重、合并操作
- 缺失值处理
- 数据框转换
- apply函数族的应用
数据框是R语言数据分析的核心数据结构,掌握数据框操作是数据科学的基础!
最后更新:2026-03-27