R语言因子 #
一、因子概述 #
因子(Factor)是R语言中用于表示分类数据的数据类型。它将数据存储为整数向量,每个整数对应一个水平(level)。
二、创建因子 #
2.1 基本创建 #
r
# Turn a character vector into an unordered factor; the levels are the
# distinct values found in the data.
gender <- as.factor(c("男", "女", "男", "女", "男"))
print(gender)
# A factor reports class "factor".
class(gender)
# The set of levels, and how many there are.
levels(gender)
length(levels(gender))
2.2 指定水平 #
r
# Declare the full set of allowed sizes up front, so "XL" is a valid level
# even though no observation uses it.
size <- factor(
  x = c("S", "M", "L", "M", "S"),
  levels = c("S", "M", "L", "XL")
)
print(size)
levels(size)
2.3 指定标签 #
r
# Map the numeric codes 1/2/3 to readable labels: `levels` names the raw
# values, `labels` gives the display name for each, position by position.
status <- factor(
  x = c(1, 2, 1, 3, 2),
  levels = 1:3,
  labels = c("低", "中", "高")
)
print(status)
2.4 有序因子 #
r
# ordered() is shorthand for factor(..., ordered = TRUE); the order of
# `levels` defines the ranking from lowest to highest.
rating <- ordered(
  c("差", "中", "好", "好", "中"),
  levels = c("差", "中", "好")
)
print(rating)
# Ordered factors support comparison operators: "差" ranks below "好".
rating[1] < rating[3]
三、因子操作 #
3.1 查看因子信息 #
r
x <- as.factor(c("a", "b", "c", "a", "b"))
# Level names and level count.
levels(x)
length(levels(x))
# Per-level frequencies: as a contingency table, then as a named vector.
table(x)
summary(x)
3.2 访问元素 #
r
x <- as.factor(c("a", "b", "c", "a", "b"))
# Subsetting a factor returns a factor that keeps the full level set.
x[1]                  # single element
x[seq_len(3)]         # first three elements
x[seq(1, 5, by = 2)]  # elements 1, 3 and 5
3.3 修改元素 #
r
x <- as.factor(c("a", "b", "c", "a", "b"))
# Assigning an existing level works as expected.
x[1] <- "c"
print(x)
# "d" is not among the levels: R issues an "invalid factor level" warning
# and stores NA instead. This is shown here on purpose.
x[1] <- "d"
print(x)
3.4 添加水平 #
r
x <- as.factor(c("a", "b", "c"))
# Extend the level set with "d" first; only then is "d" a legal value.
x <- factor(x, levels = c(levels(x), "d"))
x[1] <- "d"
print(x)
四、水平管理 #
4.1 修改水平名称 #
r
x <- as.factor(c("a", "b", "c", "a", "b"))
# Replace the level names position by position ("a" -> "A", and so on);
# the underlying integer codes are left untouched.
levels(x) <- toupper(levels(x))
print(x)
4.2 重新编码水平 #
r
x <- as.factor(c("low", "medium", "high", "medium", "low"))
# A named list maps new level = old level(s) by name, so the result does not
# depend on the alphabetical order of the existing levels.
levels(x) <- list(low = "low", high = c("medium", "high"))
print(x)
4.3 删除未使用的水平 #
r
x <- as.factor(c("a", "b", "c", "a", "b"))
# Filtering out the "c" observations does not remove "c" from the levels.
x <- x[!(x == "c")]
print(x)
levels(x)
# droplevels() discards levels that no longer occur in the data.
x <- droplevels(x)
levels(x)
4.4 合并水平 #
r
x <- factor(c("red", "green", "blue", "red", "green"))
# Default levels are sorted alphabetically: "blue", "green", "red".
# A positional assignment such as c("warm", "cool", "cool") would therefore
# label blue as "warm" and red as "cool". Map by name instead, so the merge
# does not depend on level order: red -> warm, green/blue -> cool.
levels(x) <- list(warm = "red", cool = c("green", "blue"))
print(x)
4.5 重排序水平 #
r
x <- as.factor(c("a", "b", "c", "a", "b"))
levels(x)
# Rebuild the factor with the levels reversed ("c", "b", "a"); the values
# stay the same, only the level order changes.
x <- factor(x, levels = rev(levels(x)))
levels(x)
五、有序因子 #
5.1 创建有序因子 #
r
# Education attainment as an ordered factor; `levels` lists the ranks from
# lowest to highest.
education <- ordered(
  c("高中", "本科", "硕士", "博士", "本科"),
  levels = c("高中", "本科", "硕士", "博士")
)
print(education)
5.2 比较操作 #
r
# Comparisons between elements of an ordered factor follow the level order.
# Each test is the original comparison with its operands swapped.
education[2] > education[1]
education[4] < education[3]
education[5] >= education[1]
5.3 排序 #
r
education <- ordered(
  c("高中", "本科", "硕士", "博士", "本科"),
  levels = c("高中", "本科", "硕士", "博士")
)
# sort() returns the values arranged by level rank, not alphabetically.
sort(education)
# order() returns the index permutation that would sort the factor.
order(education)
六、因子与统计 #
6.1 频数统计 #
r
x <- as.factor(c("a", "b", "c", "a", "b", "a"))
# Absolute counts per level.
table(x)
# Relative frequencies: each count divided by the total.
prop.table(table(x))
6.2 交叉表 #
r
gender <- as.factor(c("男", "女", "男", "女", "男"))
status <- as.factor(c("及格", "及格", "不及格", "及格", "不及格"))
# Two-way contingency table: rows are gender levels, columns are status
# levels, and each cell counts the observations with that combination.
table(gender, status)
6.3 聚合分析 #
r
df <- data.frame(
  group = as.factor(c("A", "B", "A", "B", "A")),
  value = c(10, 20, 15, 25, 12)
)
# Mean of `value` within each `group`, returned as a data frame.
aggregate(value ~ group, data = df, FUN = mean)
# The same group means, returned as a named array.
with(df, tapply(value, group, mean))
七、因子转换 #
7.1 因子转字符 #
r
x <- as.factor(c("a", "b", "c"))
# Indexing the level names by the factor returns the label of each
# element, i.e. the character representation of the factor.
y <- levels(x)[x]
class(y)
7.2 因子转数值 #
r
x <- as.factor(c("1", "2", "3"))
# as.numeric() on a factor returns the internal integer codes, not the
# labels. With these labels the codes happen to equal the values, which
# hides the problem.
as.numeric(x)
# Convert the labels themselves: parse the levels once, then index them by
# the factor.
as.numeric(levels(x))[x]
7.3 数值转因子 #
r
x <- c(1, 2, 3, 1, 2)
# Each distinct number becomes one level.
y <- as.factor(x)
print(y)
# cut() bins a numeric vector into intervals. With right = TRUE (the
# default, spelled out here) the bins are (0,3], (3,6] and (6,10].
cut(1:10, breaks = c(0, 3, 6, 10), right = TRUE)
# The same bins with readable labels instead of interval notation.
cut(
  1:10,
  breaks = c(0, 3, 6, 10),
  labels = c("低", "中", "高"),
  right = TRUE
)
7.4 字符转因子 #
r
x <- c("apple", "banana", "apple", "cherry")
# The distinct strings become the levels, in sorted order.
y <- as.factor(x)
print(y)
八、forcats包 #
8.1 安装和加载 #
r
# Install forcats only when it is missing, so re-running this snippet does
# not download and reinstall the package every time.
if (!requireNamespace("forcats", quietly = TRUE)) {
  install.packages("forcats")
}
library(forcats)
8.2 重排序水平 #
r
# Attach forcats for the fct_* helpers used below.
library(forcats)
x <- factor(c("a", "b", "c", "a", "b"))
# Move level "c" to the front of the level order.
fct_relevel(x, "c")
# Order the levels by first appearance in the data.
fct_inorder(x)
# Order the levels by descending frequency.
fct_infreq(x)
8.3 合并水平 #
r
# Use unequal level frequencies. When every level occurs exactly once the
# counts all tie, and fct_lump(n = 2) returns the factor unchanged.
x <- factor(c("a", "a", "a", "b", "b", "c", "d", "e"))
# Merge the named levels "d" and "e" into one level called "other".
fct_collapse(x, other = c("d", "e"))
# Keep the 2 most frequent levels ("a" and "b"); all remaining levels are
# lumped into "Other".
fct_lump(x, n = 2)
8.4 重命名水平 #
r
x <- factor(c("a", "b", "c"))
# Rename levels; each argument is written as new_name = old_name.
fct_recode(x, "A" = "a", "B" = "b", "C" = "c")
九、实践示例 #
9.1 学生成绩分析 #
r
# Student records: gender is an unordered factor, grade is ordered from
# first year to fourth year.
students <- data.frame(
  name = c("张三", "李四", "王五", "赵六", "钱七"),
  gender = as.factor(c("男", "女", "男", "女", "男")),
  grade = ordered(
    c("大一", "大二", "大三", "大一", "大四"),
    levels = c("大一", "大二", "大三", "大四")
  ),
  score = c(85, 92, 78, 88, 95)
)
# Head counts per gender and per grade.
table(students$gender)
table(students$grade)
# Mean score within each gender.
tapply(students$score, students$gender, mean)
9.2 年龄分组 #
r
ages <- c(25, 35, 45, 55, 65, 75, 30, 40, 50, 60)
# Bin the ages into four labelled groups. Intervals are closed on the right
# (the default, spelled out here), so age 30 falls in (0,30] and age 50 in
# (30,50].
age_groups <- cut(
  ages,
  breaks = c(0, 30, 50, 70, 100),
  labels = c("青年", "中年", "中老年", "老年"),
  right = TRUE
)
table(age_groups)
9.3 满意度调查 #
r
# Survey answers as an ordered factor, ranked from least to most satisfied.
satisfaction <- factor(
  c("非常满意", "满意", "一般", "不满意", "满意", "非常满意"),
  levels = c("不满意", "一般", "满意", "非常满意"),
  ordered = TRUE
)
# Counts and proportions per answer.
table(satisfaction)
prop.table(table(satisfaction))
# median() rejects every factor, ordered ones included, with "need numeric
# data". quantile() accepts ordered factors for type 1 or 3; type 1 returns
# an observed level rather than averaging two middle values.
quantile(satisfaction, probs = 0.5, type = 1)
十、注意事项 #
10.1 字符串转因子 #
r
# With stringsAsFactors = FALSE the string column stays a character vector.
df <- data.frame(
  name = c("张三", "李四"),
  stringsAsFactors = FALSE
)
class(df[["name"]])
# With stringsAsFactors = TRUE the same column is converted to a factor.
df <- data.frame(
  name = c("张三", "李四"),
  stringsAsFactors = TRUE
)
class(df[["name"]])
10.2 数值因子陷阱 #
r
x <- as.factor(c("10", "20", "30"))
# Pitfall: this returns the internal codes 1, 2, 3, not 10, 20, 30.
as.numeric(x)
# Correct: convert the level labels to numbers, then index them by the
# factor to recover the original values.
as.numeric(levels(x))[x]
10.3 水平顺序 #
r
# Without `levels`, the levels are the sorted distinct values. The collation
# order depends on the locale and need not be the logical low < mid < high.
x <- as.factor(c("低", "中", "高"))
levels(x)
# Passing `levels` explicitly fixes the intended order.
x <- factor(
  c("低", "中", "高"),
  levels = c("低", "中", "高")
)
levels(x)
十一、总结 #
本章学习了:
- 因子的创建和基本操作
- 水平管理和修改
- 有序因子的使用
- 因子在统计分析中的应用
- 因子与其他类型的转换
- forcats包的使用
因子是R语言处理分类数据的核心工具,在统计分析和数据可视化中广泛应用!
最后更新:2026-03-27