合并连接 #
合并连接概述 #
Pandas 提供了多种合并 DataFrame 的方法,类似于 SQL 的连接操作。
text
┌─────────────────────────────────────────────────────────────┐
│ 合并连接方法 │
├─────────────────────────────────────────────────────────────┤
│ │
│ pd.merge() │
│ ├── 数据库风格连接 │
│ ├── 支持 inner, outer, left, right │
│ └── 基于一列或多列连接 │
│ │
│ df.join() │
│ ├── 基于索引连接 │
│ ├── 简化的 merge │
│ └── 适合多个 DataFrame 连接 │
│ │
│ pd.concat() │
│ ├── 沿轴拼接 │
│ ├── 支持行拼接和列拼接 │
│ └── 可设置忽略索引 │
│ │
└─────────────────────────────────────────────────────────────┘
准备数据 #
python
import pandas as pd
import numpy as np
# 员工表
employees = pd.DataFrame({
'employee_id': [1, 2, 3, 4, 5],
'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
'department_id': [10, 20, 10, 30, 20]
})
# 部门表
departments = pd.DataFrame({
'department_id': [10, 20, 40],
'department_name': ['Sales', 'Engineering', 'HR']
})
# 薪资表
salaries = pd.DataFrame({
'employee_id': [1, 2, 3, 4, 6],
'salary': [50000, 60000, 70000, 55000, 80000]
})
print("员工表:")
print(employees)
print("\n部门表:")
print(departments)
print("\n薪资表:")
print(salaries)
merge - 数据库风格连接 #
内连接(inner) #
python
# 内连接:只保留匹配的行
result = pd.merge(employees, departments, on='department_id', how='inner')
print(result)
# employee_id name department_id department_name
# 0 1 Alice 10 Sales
# 1 3 Charlie 10 Sales
# 2 2 Bob 20 Engineering
# 3 5 Eve 20 Engineering
左连接(left) #
python
# 左连接:保留左表所有行
result = pd.merge(employees, departments, on='department_id', how='left')
print(result)
# employee_id name department_id department_name
# 0 1 Alice 10 Sales
# 1 2 Bob 20 Engineering
# 2 3 Charlie 10 Sales
# 3 4 Diana 30 NaN
# 4 5 Eve 20 Engineering
右连接(right) #
python
# 右连接:保留右表所有行
result = pd.merge(employees, departments, on='department_id', how='right')
print(result)
# employee_id name department_id department_name
# 0 1.0 Alice 10 Sales
# 1 3.0 Charlie 10 Sales
# 2 2.0 Bob 20 Engineering
# 3 5.0 Eve 20 Engineering
# 4 NaN NaN 40 HR
外连接(outer) #
python
# 外连接:保留所有行
result = pd.merge(employees, departments, on='department_id', how='outer')
print(result)
# employee_id name department_id department_name
# 0 1.0 Alice 10 Sales
# 1 3.0 Charlie 10 Sales
# 2 2.0 Bob 20 Engineering
# 3 5.0 Eve 20 Engineering
# 4 4.0 Diana 30 NaN
# 5 NaN NaN 40 HR
不同列名连接 #
python
# 列名不同时
df1 = pd.DataFrame({'id': [1, 2, 3], 'value1': ['a', 'b', 'c']})
df2 = pd.DataFrame({'employee_id': [1, 2, 4], 'value2': ['x', 'y', 'z']})
result = pd.merge(df1, df2, left_on='id', right_on='employee_id', how='inner')
print(result)
# id value1 employee_id value2
# 0 1 a 1 x
# 1 2 b 2 y
多列连接 #
python
# 多列连接
df1 = pd.DataFrame({
'key1': ['a', 'a', 'b', 'b'],
'key2': [1, 2, 1, 2],
'value1': [10, 20, 30, 40]
})
df2 = pd.DataFrame({
'key1': ['a', 'a', 'b', 'c'],
'key2': [1, 2, 1, 1],
'value2': [100, 200, 300, 400]
})
result = pd.merge(df1, df2, on=['key1', 'key2'], how='inner')
print(result)
# key1 key2 value1 value2
# 0 a 1 10 100
# 1 a 2 20 200
# 2 b 1 30 300
索引连接 #
python
# 基于索引连接
df1 = pd.DataFrame({'value1': [1, 2, 3]}, index=['a', 'b', 'c'])
df2 = pd.DataFrame({'value2': [4, 5, 6]}, index=['a', 'b', 'd'])
result = pd.merge(df1, df2, left_index=True, right_index=True, how='inner')
print(result)
# value1 value2
# a 1 4
# b 2 5
处理重复列名 #
python
# 自动添加后缀
df1 = pd.DataFrame({'id': [1, 2], 'value': ['a', 'b']})
df2 = pd.DataFrame({'id': [1, 2], 'value': ['x', 'y']})
result = pd.merge(df1, df2, on='id')
print(result)
# id value_x value_y
# 0 1 a x
# 1 2 b y
# 自定义后缀
result = pd.merge(df1, df2, on='id', suffixes=('_left', '_right'))
print(result)
交叉连接 #
python
# 交叉连接(笛卡尔积)
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'B': ['x', 'y']})
result = pd.merge(df1, df2, how='cross')
print(result)
# A B
# 0 1 x
# 1 1 y
# 2 2 x
# 3 2 y
验证连接 #
python
# 验证连接类型
# 'one_to_one', 'one_to_many', 'many_to_one', 'many_to_many'
# 一对一
result = pd.merge(employees, salaries, on='employee_id', validate='one_to_one')
# 一对多
result = pd.merge(employees, departments, on='department_id', validate='many_to_one')
join - 基于索引连接 #
python
# 基本 join
df1 = pd.DataFrame({'value1': [1, 2, 3]}, index=['a', 'b', 'c'])
df2 = pd.DataFrame({'value2': [4, 5, 6]}, index=['a', 'b', 'd'])
result = df1.join(df2)
print(result)
# value1 value2
# a 1 4.0
# b 2 5.0
# c 3 NaN
# 指定连接方式
result = df1.join(df2, how='inner')
result = df1.join(df2, how='outer')
result = df1.join(df2, how='left') # 默认
# 连接多个 DataFrame
df3 = pd.DataFrame({'value3': [7, 8, 9]}, index=['a', 'b', 'c'])
result = df1.join([df2, df3])
print(result)
# 指定列作为连接键
result = employees.set_index('department_id').join(
departments.set_index('department_id'),
how='left'
)
concat - 拼接 #
行拼接 #
python
df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame({'A': [5, 6], 'B': [7, 8]})
# 默认行拼接
result = pd.concat([df1, df2])
print(result)
# A B
# 0 1 3
# 1 2 4
# 0 5 7
# 1 6 8
# 忽略索引
result = pd.concat([df1, df2], ignore_index=True)
print(result)
# A B
# 0 1 3
# 1 2 4
# 2 5 7
# 3 6 8
# 添加键标识
result = pd.concat([df1, df2], keys=['df1', 'df2'])
print(result)
# A B
# df1 0 1 3
# 1 2 4
# df2 0 5 7
# 1 6 8
列拼接 #
python
df1 = pd.DataFrame({'A': [1, 2, 3]})
df2 = pd.DataFrame({'B': [4, 5, 6]})
# 列拼接
result = pd.concat([df1, df2], axis=1)
print(result)
# A B
# 0 1 4
# 1 2 5
# 2 3 6
处理不同列 #
python
df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame({'A': [5, 6], 'C': [7, 8]})
# 外连接(默认):保留所有列
result = pd.concat([df1, df2])
print(result)
# A B C
# 0 1.0 3.0 NaN
# 1 2.0 4.0 NaN
# 0 5.0 NaN 7.0
# 1 6.0 NaN 8.0
# 内连接:只保留共有列
result = pd.concat([df1, df2], join='inner')
print(result)
# A
# 0 1
# 1 2
# 0 5
# 1 6
Series 拼接 #
python
s1 = pd.Series([1, 2, 3], name='A')
s2 = pd.Series([4, 5, 6], name='B')
# 行拼接
result = pd.concat([s1, s2])
print(result)
# 列拼接
result = pd.concat([s1, s2], axis=1)
print(result)
append(已弃用) #
python
# append 方法已弃用,建议使用 concat
# 旧写法
# df1.append(df2)
# 新写法
pd.concat([df1, df2])
合并多个 DataFrame #
python
# 使用 reduce 链式合并
from functools import reduce
dfs = [employees, salaries, departments]
result = reduce(lambda left, right: pd.merge(left, right, on='employee_id', how='left'),
dfs[:-1])
print(result)
# 使用 join 合并多个
df1 = pd.DataFrame({'A': [1, 2]}, index=['a', 'b'])
df2 = pd.DataFrame({'B': [3, 4]}, index=['a', 'b'])
df3 = pd.DataFrame({'C': [5, 6]}, index=['a', 'b'])
result = df1.join([df2, df3])
print(result)
合并方法选择指南 #
text
┌─────────────────────────────────────────────────────────────┐
│ 合并方法选择 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 场景 推荐方法 │
│ ──────────────────────── ──────────────────────────── │
│ 数据库风格连接 pd.merge() │
│ 基于索引连接 df.join() │
│ 简单拼接 pd.concat() │
│ 多个 DataFrame 连接 df.join([df1, df2, ...]) │
│ 交叉连接 pd.merge(how='cross') │
│ │
└─────────────────────────────────────────────────────────────┘
性能优化 #
使用索引加速 #
python
# 设置索引后连接更快
df1 = df1.set_index('key')
df2 = df2.set_index('key')
result = df1.join(df2)
指定数据类型 #
python
# 连接前确保键的数据类型一致
df1['key'] = df1['key'].astype(str)
df2['key'] = df2['key'].astype(str)
避免重复计算 #
python
# 一次性合并,避免多次合并
result = pd.merge(df1, df2, on='key').merge(df3, on='key')
下一步 #
掌握了合并连接后,接下来学习 数据重塑,了解如何转换数据的形状和结构!
最后更新:2026-04-04