R语言因子 #

一、因子概述 #

因子（Factor）是R语言中用于表示分类数据的数据类型。它将数据存储为整数向量，每个整数对应一个水平（level）。

二、创建因子 #

2.1 基本创建 #

# Build an unordered factor from a character vector; by default the levels
# are the sorted unique values.
gender <- factor(
  c("男", "女", "男", "女", "男")
)
print(gender)
class(gender)           # class of the object
levels(gender)          # the distinct levels
length(levels(gender))  # how many levels there are

2.2 指定水平 #

# Declare the full level set explicitly; "XL" remains a level even though
# no element of the data uses it.
size <- factor(
  x = c("S", "M", "L", "M", "S"),
  levels = c("S", "M", "L", "XL")
)
print(size)
levels(size)

2.3 指定标签 #

# Map the numeric codes 1/2/3 onto display labels: `levels` lists the raw
# values and `labels` gives the name shown for each of them, in order.
status <- factor(
  x = c(1, 2, 1, 3, 2),
  levels = 1:3,
  labels = c("低", "中", "高")
)
print(status)

2.4 有序因子 #

# An ordered factor: the order of `levels` defines the ranking 差 < 中 < 好.
rating <- ordered(
  c("差", "中", "好", "好", "中"),
  levels = c("差", "中", "好")
)
print(rating)
# Ordered factors support comparison operators.
rating[1] < rating[3]

三、因子操作 #

3.1 查看因子信息 #

x <- factor(c("a", "b", "c", "a", "b"))

levels(x)          # the level labels
length(levels(x))  # number of levels
table(x)           # count of observations per level
summary(x)         # per-level counts as a named integer vector

3.2 访问元素 #

x <- factor(c("a", "b", "c", "a", "b"))

# Subsetting a factor returns a factor that keeps the full level set.
x[1L]                 # first element
x[seq_len(3)]         # elements 1 to 3
x[seq(1, 5, by = 2)]  # elements 1, 3 and 5

3.3 修改元素 #

# Replacing elements of a factor: only values that are existing levels
# can be assigned.
x <- factor(c("a", "b", "c", "a", "b"))

# "c" is already a level, so the assignment succeeds.
x[1] <- "c"
print(x)

# "d" is not a level: R issues an "invalid factor level" warning and
# stores NA in that position instead.
x[1] <- "d"
print(x)

3.4 添加水平 #

x <- factor(c("a", "b", "c"))

# Extend the level set with "d" first; after that "d" can be assigned
# without producing NA.
x <- factor(x, levels = c(levels(x), "d"))
x[1] <- "d"
print(x)

四、水平管理 #

4.1 修改水平名称 #

x <- factor(c("a", "b", "c", "a", "b"))

# Rename every level in place; here each label becomes its upper-case form.
levels(x) <- toupper(levels(x))
print(x)

4.2 重新编码水平 #

x <- factor(c("low", "medium", "high", "medium", "low"))

# Name-based recoding: each list entry is new_level = old level(s), so
# "medium" and "high" are both folded into "high".
levels(x) <- list(low = "low", high = c("medium", "high"))
print(x)

4.3 删除未使用的水平 #

x <- factor(c("a", "b", "c", "a", "b"))
# Filter out the "c" observations; the level "c" itself is still attached.
x <- x[!(x %in% "c")]
print(x)
levels(x)

# droplevels() removes levels that no longer occur in the data.
x <- droplevels(x)
levels(x)

4.4 合并水平 #

x <- factor(c("red", "green", "blue", "red", "green"))

# The levels are stored in sorted order ("blue", "green", "red"), not in
# order of appearance, so a positional assignment such as
# c("warm", "cool", "cool") would label blue as "warm" and red as "cool".
# Mapping by name makes the result independent of the level order.
levels(x) <- list(
  warm = "red",
  cool = c("green", "blue")
)
print(x)

4.5 重排序水平 #

x <- factor(c("a", "b", "c", "a", "b"))

levels(x)
# Rebuild the factor with the levels reversed; the data values themselves
# are unchanged, only the level order differs.
x <- factor(x, levels = rev(levels(x)))
levels(x)

五、有序因子 #

5.1 创建有序因子 #

# Education attainment as an ordered factor; the `levels` vector fixes the
# ranking 高中 < 本科 < 硕士 < 博士.
education <- ordered(
  c("高中", "本科", "硕士", "博士", "本科"),
  levels = c("高中", "本科", "硕士", "博士")
)
print(education)

5.2 比较操作 #

# Comparisons between elements of the ordered factor `education` follow
# its level order.
education[2] > education[1]   # element 1 ranks below element 2
education[4] < education[3]   # element 3 ranks above element 4?
education[5] >= education[1]  # element 1 ranks no higher than element 5

5.3 排序 #

education <- ordered(
  c("高中", "本科", "硕士", "博士", "本科"),
  levels = c("高中", "本科", "硕士", "博士")
)

# Both functions follow the level order, not alphabetical order.
sort(education)   # the values arranged from lowest to highest level
order(education)  # the index permutation that produces that arrangement

六、因子与统计 #

6.1 频数统计 #

x <- factor(c("a", "b", "c", "a", "b", "a"))

# Absolute counts per level.
table(x)
# Relative frequencies: each count divided by the number of observations.
table(x) / length(x)

6.2 交叉表 #

gender <- factor(c("男", "女", "男", "女", "男"))
status <- factor(c("及格", "及格", "不及格", "及格", "不及格"))

# Two-way contingency table: rows are gender, columns are status.
table(gender = gender, status = status)

6.3 聚合分析 #

df <- data.frame(
  group = factor(c("A", "B", "A", "B", "A")),
  value = c(10, 20, 15, 25, 12)
)

# Mean of `value` within each group, returned as a data frame.
aggregate(value ~ group, data = df, FUN = mean)
# The same group means, returned as a named array.
with(df, tapply(value, group, mean))

七、因子转换 #

7.1 因子转字符 #

x <- factor(c("a", "b", "c"))
# Look up each element's label in the level vector to get plain strings.
y <- levels(x)[x]
class(y)

7.2 因子转数值 #

# Use values that differ from their internal codes so that the pitfall of
# calling as.numeric() directly on a factor is actually visible.
x <- factor(c("2", "4", "8"))
as.numeric(x)                # internal integer codes: 1 2 3, not the values
as.numeric(as.character(x))  # convert via the labels: 2 4 8
as.numeric(levels(x))[x]     # same result, converting each level only once

7.3 数值转因子 #

# Treat each distinct number as a category.
x <- c(1, 2, 3, 1, 2)
y <- factor(x)
print(y)

# Bin a numeric range into intervals (0,3], (3,6], (6,10] ...
cut(seq_len(10), breaks = c(0, 3, 6, 10))
# ... and optionally give each interval a readable label.
cut(
  seq_len(10),
  breaks = c(0, 3, 6, 10),
  labels = c("低", "中", "高")
)

7.4 字符转因子 #

# Convert a character vector to a factor; repeated strings share one level.
x <- c("apple", "banana", "apple", "cherry")
y <- as.factor(x)
print(y)

八、forcats包 #

8.1 安装和加载 #

# Install forcats only when it is not already available, so that re-running
# the script does not reinstall the package on every execution.
if (!requireNamespace("forcats", quietly = TRUE)) {
  install.packages("forcats")
}
library(forcats)

8.2 重排序水平 #

library(forcats)

x <- factor(c("a", "b", "c", "a", "b"))

# Move the level "c" to the front of the level list.
fct_relevel(x, "c", after = 0L)
# Order levels by first appearance in the data.
fct_inorder(x)
# Order levels by frequency, most common first.
fct_infreq(x)

8.3 合并水平 #

# Unequal frequencies are needed for lumping to have a visible effect: when
# every level occurs exactly once they all tie and nothing gets lumped.
x <- factor(c("a", "a", "a", "b", "b", "c", "d", "e"))

# Merge the named levels "d" and "e" into one new level called "other".
fct_collapse(x, other = c("d", "e"))
# Keep the 2 most frequent levels and lump the remainder into "Other".
# fct_lump_n() is the current replacement for the superseded fct_lump().
fct_lump_n(x, n = 2)

8.4 重命名水平 #

x <- factor(c("a", "b", "c"))

# Rename levels with new_name = "old_name" pairs.
fct_recode(
  x,
  A = "a",
  B = "b",
  C = "c"
)

九、实践示例 #

9.1 学生成绩分析 #

# Sample data: gender is an unordered factor, grade is an ordered factor
# ranked 大一 < 大二 < 大三 < 大四.
students <- data.frame(
  name = c("张三", "李四", "王五", "赵六", "钱七"),
  gender = factor(c("男", "女", "男", "女", "男")),
  grade = ordered(
    c("大一", "大二", "大三", "大一", "大四"),
    levels = c("大一", "大二", "大三", "大四")
  ),
  score = c(85, 92, 78, 88, 95)
)

table(students$gender)                       # head count per gender
table(students$grade)                        # head count per grade
with(students, tapply(score, gender, mean))  # mean score per gender

9.2 年龄分组 #

ages <- c(25, 35, 45, 55, 65, 75, 30, 40, 50, 60)

# Bin ages into right-closed intervals (0,30], (30,50], (50,70], (70,100],
# so an age equal to a break point belongs to the lower group.
age_groups <- cut(
  x = ages,
  breaks = c(0, 30, 50, 70, 100),
  labels = c("青年", "中年", "中老年", "老年"),
  right = TRUE
)
table(age_groups)

9.3 满意度调查 #

# Survey answers as an ordered factor ranked
# 不满意 < 一般 < 满意 < 非常满意.
satisfaction <- factor(
  c("非常满意", "满意", "一般", "不满意", "满意", "非常满意"),
  levels = c("不满意", "一般", "满意", "非常满意"),
  ordered = TRUE
)

table(satisfaction)              # counts per answer
prop.table(table(satisfaction))  # share of each answer
# median() stops with "need numeric data" on factors. quantile() accepts
# ordered factors for the discontinuous types 1 and 3, so the 50% quantile
# of type 1 gives the median category.
quantile(satisfaction, probs = 0.5, type = 1)

十、注意事项 #

10.1 字符串转因子 #

# With stringsAsFactors = FALSE the character column stays character.
df <- data.frame(name = c("张三", "李四"), stringsAsFactors = FALSE)
class(df[["name"]])

# With stringsAsFactors = TRUE the same column is converted to a factor.
df <- data.frame(name = c("张三", "李四"), stringsAsFactors = TRUE)
class(df[["name"]])

10.2 数值因子陷阱 #

x <- factor(c("10", "20", "30"))

# Direct conversion yields the internal codes 1 2 3, not the numbers.
as.numeric(x)
# Converting the level labels and indexing by the factor recovers 10 20 30.
as.numeric(levels(x))[x]

10.3 水平顺序 #

# Without `levels`, the level order comes from sorting the values, which
# for these strings need not be the intended 低 < 中 < 高 sequence.
x <- factor(c("低", "中", "高"))
levels(x)

# Passing `levels` explicitly pins the order.
x <- factor(
  c("低", "中", "高"),
  levels = c("低", "中", "高")
)
levels(x)

十一、总结 #

本章学习了：

- 因子的创建和基本操作
- 水平管理和修改
- 有序因子的使用
- 因子在统计分析中的应用
- 因子与其他类型的转换
- forcats包的使用

因子是R语言处理分类数据的核心工具，在统计分析和数据可视化中广泛应用！