合并连接 #

合并连接概述 #

Pandas 提供了多种合并 DataFrame 的方法，类似于 SQL 的连接操作。

text

┌─────────────────────────────────────────────────────────────┐
│                    合并连接方法                              │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  pd.merge()                                                 │
│  ├── 数据库风格连接                                         │
│  ├── 支持 inner, outer, left, right                        │
│  └── 基于一列或多列连接                                     │
│                                                             │
│  df.join()                                                  │
│  ├── 基于索引连接                                           │
│  ├── 简化的 merge                                           │
│  └── 适合多个 DataFrame 连接                                │
│                                                             │
│  pd.concat()                                                │
│  ├── 沿轴拼接                                               │
│  ├── 支持行拼接和列拼接                                     │
│  └── 可设置忽略索引                                         │
│                                                             │
└─────────────────────────────────────────────────────────────┘

准备数据 #

python

import pandas as pd
import numpy as np

# 员工表
employees = pd.DataFrame({
    'employee_id': [1, 2, 3, 4, 5],
    'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'department_id': [10, 20, 10, 30, 20]
})

# 部门表
departments = pd.DataFrame({
    'department_id': [10, 20, 40],
    'department_name': ['Sales', 'Engineering', 'HR']
})

# 薪资表
salaries = pd.DataFrame({
    'employee_id': [1, 2, 3, 4, 6],
    'salary': [50000, 60000, 70000, 55000, 80000]
})

print("员工表:")
print(employees)
print("\n部门表:")
print(departments)
print("\n薪资表:")
print(salaries)

merge - 数据库风格连接 #

内连接（inner） #

python

# 内连接：只保留匹配的行
result = pd.merge(employees, departments, on='department_id', how='inner')
print(result)
#    employee_id     name  department_id department_name
# 0            1    Alice             10           Sales
# 1            3  Charlie             10           Sales
# 2            2      Bob             20     Engineering
# 3            5      Eve             20     Engineering

左连接（left） #

python

# 左连接：保留左表所有行
result = pd.merge(employees, departments, on='department_id', how='left')
print(result)
#    employee_id     name  department_id department_name
# 0            1    Alice             10           Sales
# 1            2      Bob             20     Engineering
# 2            3  Charlie             10           Sales
# 3            4    Diana             30             NaN
# 4            5      Eve             20     Engineering

右连接（right） #

python

# 右连接：保留右表所有行
result = pd.merge(employees, departments, on='department_id', how='right')
print(result)
#    employee_id     name  department_id department_name
# 0          1.0    Alice             10           Sales
# 1          3.0  Charlie             10           Sales
# 2          2.0      Bob             20     Engineering
# 3          5.0      Eve             20     Engineering
# 4          NaN      NaN             40              HR

外连接（outer） #

python

# 外连接：保留所有行
result = pd.merge(employees, departments, on='department_id', how='outer')
print(result)
#    employee_id     name  department_id department_name
# 0          1.0    Alice             10           Sales
# 1          3.0  Charlie             10           Sales
# 2          2.0      Bob             20     Engineering
# 3          5.0      Eve             20     Engineering
# 4          4.0    Diana             30             NaN
# 5          NaN      NaN             40              HR

不同列名连接 #

python

# 列名不同时
df1 = pd.DataFrame({'id': [1, 2, 3], 'value1': ['a', 'b', 'c']})
df2 = pd.DataFrame({'employee_id': [1, 2, 4], 'value2': ['x', 'y', 'z']})

result = pd.merge(df1, df2, left_on='id', right_on='employee_id', how='inner')
print(result)
#    id value1  employee_id value2
# 0   1      a            1      x
# 1   2      b            2      y

多列连接 #

python

# 多列连接
df1 = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b'],
    'key2': [1, 2, 1, 2],
    'value1': [10, 20, 30, 40]
})

df2 = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'c'],
    'key2': [1, 2, 1, 1],
    'value2': [100, 200, 300, 400]
})

result = pd.merge(df1, df2, on=['key1', 'key2'], how='inner')
print(result)
#   key1  key2  value1  value2
# 0    a     1      10     100
# 1    a     2      20     200
# 2    b     1      30     300

索引连接 #

python

# 基于索引连接
df1 = pd.DataFrame({'value1': [1, 2, 3]}, index=['a', 'b', 'c'])
df2 = pd.DataFrame({'value2': [4, 5, 6]}, index=['a', 'b', 'd'])

result = pd.merge(df1, df2, left_index=True, right_index=True, how='inner')
print(result)
#    value1  value2
# a       1       4
# b       2       5

处理重复列名 #

python

# 自动添加后缀
df1 = pd.DataFrame({'id': [1, 2], 'value': ['a', 'b']})
df2 = pd.DataFrame({'id': [1, 2], 'value': ['x', 'y']})

result = pd.merge(df1, df2, on='id')
print(result)
#    id value_x value_y
# 0   1       a       x
# 1   2       b       y

# 自定义后缀
result = pd.merge(df1, df2, on='id', suffixes=('_left', '_right'))
print(result)

交叉连接 #

python

# 交叉连接（笛卡尔积）
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'B': ['x', 'y']})

result = pd.merge(df1, df2, how='cross')
print(result)
#    A  B
# 0  1  x
# 1  1  y
# 2  2  x
# 3  2  y

验证连接 #

python

# 验证连接类型
# 'one_to_one', 'one_to_many', 'many_to_one', 'many_to_many'

# 一对一
result = pd.merge(employees, salaries, on='employee_id', validate='one_to_one')

# 一对多
result = pd.merge(employees, departments, on='department_id', validate='many_to_one')

join - 基于索引连接 #

python

# 基本 join
df1 = pd.DataFrame({'value1': [1, 2, 3]}, index=['a', 'b', 'c'])
df2 = pd.DataFrame({'value2': [4, 5, 6]}, index=['a', 'b', 'd'])

result = df1.join(df2)
print(result)
#    value1  value2
# a       1     4.0
# b       2     5.0
# c       3     NaN

# 指定连接方式
result = df1.join(df2, how='inner')
result = df1.join(df2, how='outer')
result = df1.join(df2, how='left')  # 默认

# 连接多个 DataFrame
df3 = pd.DataFrame({'value3': [7, 8, 9]}, index=['a', 'b', 'c'])
result = df1.join([df2, df3])
print(result)

# 指定列作为连接键
result = employees.set_index('department_id').join(
    departments.set_index('department_id'),
    how='left'
)

concat - 拼接 #

行拼接 #

python

df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame({'A': [5, 6], 'B': [7, 8]})

# 默认行拼接
result = pd.concat([df1, df2])
print(result)
#    A  B
# 0  1  3
# 1  2  4
# 0  5  7
# 1  6  8

# 忽略索引
result = pd.concat([df1, df2], ignore_index=True)
print(result)
#    A  B
# 0  1  3
# 1  2  4
# 2  5  7
# 3  6  8

# 添加键标识
result = pd.concat([df1, df2], keys=['df1', 'df2'])
print(result)
#        A  B
# df1 0  1  3
#     1  2  4
# df2 0  5  7
#     1  6  8

列拼接 #

python

df1 = pd.DataFrame({'A': [1, 2, 3]})
df2 = pd.DataFrame({'B': [4, 5, 6]})

# 列拼接
result = pd.concat([df1, df2], axis=1)
print(result)
#    A  B
# 0  1  4
# 1  2  5
# 2  3  6

处理不同列 #

python

df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame({'A': [5, 6], 'C': [7, 8]})

# 外连接（默认）：保留所有列
result = pd.concat([df1, df2])
print(result)
#      A    B    C
# 0  1.0  3.0  NaN
# 1  2.0  4.0  NaN
# 0  5.0  NaN  7.0
# 1  6.0  NaN  8.0

# 内连接：只保留共有列
result = pd.concat([df1, df2], join='inner')
print(result)
#    A
# 0  1
# 1  2
# 0  5
# 1  6

Series 拼接 #

python

s1 = pd.Series([1, 2, 3], name='A')
s2 = pd.Series([4, 5, 6], name='B')

# 行拼接
result = pd.concat([s1, s2])
print(result)

# 列拼接
result = pd.concat([s1, s2], axis=1)
print(result)

append（已弃用） #

python

# append 方法已弃用，建议使用 concat
# 旧写法
# df1.append(df2)

# 新写法
pd.concat([df1, df2])

合并多个 DataFrame #

python

# 使用 reduce 链式合并
from functools import reduce

dfs = [employees, salaries, departments]

result = reduce(lambda left, right: pd.merge(left, right, on='employee_id', how='left'), 
                dfs[:-1])
print(result)

# 使用 join 合并多个
df1 = pd.DataFrame({'A': [1, 2]}, index=['a', 'b'])
df2 = pd.DataFrame({'B': [3, 4]}, index=['a', 'b'])
df3 = pd.DataFrame({'C': [5, 6]}, index=['a', 'b'])

result = df1.join([df2, df3])
print(result)

合并方法选择指南 #

text

┌─────────────────────────────────────────────────────────────┐
│                    合并方法选择                              │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  场景                      推荐方法                         │
│  ────────────────────────  ────────────────────────────    │
│  数据库风格连接            pd.merge()                       │
│  基于索引连接              df.join()                        │
│  简单拼接                  pd.concat()                      │
│  多个 DataFrame 连接       df.join([df1, df2, ...])        │
│  交叉连接                  pd.merge(how='cross')           │
│                                                             │
└─────────────────────────────────────────────────────────────┘

性能优化 #

使用索引加速 #

python

# 设置索引后连接更快
df1 = df1.set_index('key')
df2 = df2.set_index('key')
result = df1.join(df2)

指定数据类型 #

python

# 连接前确保键的数据类型一致
df1['key'] = df1['key'].astype(str)
df2['key'] = df2['key'].astype(str)

避免重复计算 #

python

# 一次性合并，避免多次合并
result = pd.merge(df1, df2, on='key').merge(df3, on='key')

下一步 #

掌握了合并连接后，接下来学习数据重塑，了解如何转换数据的形状和结构！