数据可视化 #

可视化概述 #

Pandas 的绘图接口（`DataFrame.plot` / `Series.plot`）默认以 Matplotlib 作为绘图后端（需要单独安装 Matplotlib），只需很少的代码就可以快速创建常用图表。

text

┌─────────────────────────────────────────────────────────────┐
│                    Pandas 绘图类型                           │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  基础图表                                                   │
│  ├── plot.line()      折线图                                │
│  ├── plot.bar()       柱状图                                │
│  ├── plot.barh()      横向柱状图                            │
│  ├── plot.hist()      直方图                                │
│  └── plot.box()       箱线图                                │
│                                                             │
│  统计图表                                                   │
│  ├── plot.kde()       核密度估计                            │
│  ├── plot.area()      面积图                                │
│  ├── plot.pie()       饼图                                  │
│  └── plot.scatter()   散点图                                │
│                                                             │
│  高级图表                                                   │
│  ├── plot.hexbin()    六边形图                              │
│  └── scatter_matrix() 散点图矩阵                            │
│                                                             │
└─────────────────────────────────────────────────────────────┘

准备环境 #

python

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Apply the chart style first: style sheets such as the seaborn ones set
# `font.sans-serif` themselves, so applying one after the font setup would
# silently discard the CJK font configured below.
plt.style.use('seaborn-v0_8-whitegrid')

# Fonts able to render Chinese text; Matplotlib uses the first one installed.
plt.rcParams['font.sans-serif'] = [
    'Arial Unicode MS',     # macOS
    'Microsoft YaHei',      # Windows
    'SimHei',               # Windows
    'Noto Sans CJK SC',     # Linux
    'WenQuanYi Micro Hei',  # Linux
]
# Draw the minus sign as an ASCII hyphen so it does not depend on the
# selected font having a glyph for the Unicode minus sign.
plt.rcParams['axes.unicode_minus'] = False

# Demo data: three random walks (cumulative sums of standard-normal draws)
# on a daily index starting 2024-01-01. The fixed seed makes the values
# reproducible; columns are generated in A, B, C order.
np.random.seed(42)
df = pd.DataFrame(
    {name: np.random.randn(100).cumsum() for name in ('A', 'B', 'C')},
    index=pd.date_range(start='2024-01-01', periods=100),
)

折线图 #

python

# Basic line chart: one line per column, x axis taken from the index
ax = df.plot.line()
ax.set_title('Line Plot')
plt.show()

# Single column
df['A'].plot.line()

# Per-column line styles, then line width and figure size
df.plot.line(style=['-', '--', ':'])
df.plot.line(figsize=(10, 6), linewidth=2)

# Marker drawn at every data point
df.plot.line(markersize=3, marker='o')

# One subplot per column; `layout` is (rows, columns)
df.plot.line(subplots=True, figsize=(10, 8))
df.plot.line(subplots=True, figsize=(15, 4), layout=(1, 3))

柱状图 #

python

# Sample data: four categories with two numeric measures each
df_bar = pd.DataFrame(
    [('A', 10, 15), ('B', 20, 25), ('C', 15, 20), ('D', 25, 30)],
    columns=['category', 'value1', 'value2'],
)

# Vertical bars through the accessor method ...
df_bar.plot.bar(x='category', y='value1')
# ... or, equivalently, through the `kind` argument
df_bar.plot(kind='bar', x='category', y='value1')

# Grouped bars: one bar per listed column within each category
df_bar.plot(kind='bar', x='category', y=['value1', 'value2'])

# Stacked bars
df_bar.plot(kind='bar', stacked=True, x='category')

# Horizontal bars
df_bar.plot(kind='barh', x='category', y=['value1', 'value2'])

# Custom colours, one per plotted column
df_bar.plot(kind='bar', x='category', color=['skyblue', 'salmon'])

直方图 #

python

# Sample data: 1000 draws each from a standard normal distribution and a
# gamma distribution (shape=2, scale=2)
df_hist = pd.DataFrame({
    'normal': np.random.standard_normal(1000),
    'gamma': np.random.gamma(shape=2, scale=2, size=1000),
})

# Histogram of a single column
df_hist['normal'].plot(kind='hist')

# All columns on the same axes; transparency keeps overlapping bars visible
df_hist.plot(kind='hist', alpha=0.5)

# Number of bins
df_hist.plot(kind='hist', bins=50)

# Stacked histogram
df_hist.plot(kind='hist', bins=30, stacked=True)

# One subplot per column
df_hist.plot(kind='hist', bins=30, subplots=True, figsize=(10, 4))

# Normalised histogram (density instead of counts), followed by a kernel
# density estimate of the same column
df_hist['normal'].plot(kind='hist', density=True)
df_hist['normal'].plot.density()

箱线图 #

python

# Sample data: three normally distributed columns with different means
# and standard deviations
df_box = pd.DataFrame({
    'A': np.random.normal(0, 1, 100),
    'B': np.random.normal(1, 1.5, 100),
    'C': np.random.normal(-1, 0.5, 100)
})

# Basic box plot: one box per column
df_box.plot.box()

# Grouped box plot. `by` must name an existing column, so a grouping column
# is added first; `df_box` itself has no 'group' column and passing
# `by='group'` to it raises a KeyError on pandas >= 1.4 (older versions
# silently ignored `by`).
df_box_grouped = df_box.assign(
    group=np.repeat(['X', 'Y'], len(df_box) // 2)  # first half X, second half Y
)
df_box_grouped.plot.box(column=['A', 'B', 'C'], by='group', figsize=(12, 4))

# Custom styling
df_box.plot.box(
    color='blue',
    vert=True,          # vertical boxes
    patch_artist=True,  # fill the boxes with colour
    showfliers=True     # draw outlier points
)

散点图 #

python

# Sample data: random x/y positions plus per-point size and colour values
df_scatter = pd.DataFrame({
    'x': np.random.standard_normal(100),
    'y': np.random.standard_normal(100),
    'size': 100 * np.random.rand(100),
    'color': np.random.rand(100),
})

# Basic scatter plot
df_scatter.plot(kind='scatter', x='x', y='y')

# Colour each point by the 'color' column, mapped through a colormap
df_scatter.plot(kind='scatter', x='x', y='y', c='color', colormap='viridis')

# Size each point by the 'size' column
df_scatter.plot(kind='scatter', x='x', y='y', s='size')

# Fixed styling shared by all points
df_scatter.plot(
    kind='scatter',
    x='x', y='y',
    s=50,
    c='red',
    edgecolors='black',
    alpha=0.5,
)

饼图 #

python

# Sample data
df_pie = pd.DataFrame({
    'category': ['A', 'B', 'C', 'D'],
    'value': [30, 25, 20, 25]
})

# Basic pie chart: `y` selects the column, `labels` names the wedges
df_pie.plot.pie(y='value', labels=df_pie['category'])

# Custom styling
df_pie.plot.pie(
    y='value',
    labels=df_pie['category'],
    autopct='%1.1f%%',  # percentage shown on each wedge
    startangle=90,
    colors=['skyblue', 'salmon', 'lightgreen', 'gold']
)

# Pie chart from a Series: the index supplies the wedge labels
df_pie.set_index('category')['value'].plot.pie()

# Multi-column pie charts: one pie per numeric column. A DataFrame pie
# needs either `y` or `subplots=True`, so `subplots=True` is required here.
df_pie_multi = df_pie.set_index('category').assign(value2=[10, 40, 30, 20])
df_pie_multi.plot.pie(subplots=True, figsize=(10, 5), autopct='%1.1f%%')

面积图 #

python

# Sample data: two random walks shifted up by 100
df_area = pd.DataFrame({
    column: 100 + np.random.randn(100).cumsum() for column in ('A', 'B')
})

# Basic area chart
df_area.plot(kind='area')

# Stacked areas, requested explicitly
df_area.plot(kind='area', stacked=True)

# Unstacked areas; transparency keeps overlapping regions visible
df_area.plot(kind='area', alpha=0.5, stacked=False)

核密度估计 #

python

# Kernel density estimate of a single column
# (`plot.density` is an alias of `plot.kde`)
df_hist['normal'].plot.density()

# One density curve per column
df_hist.plot.density()

# Explicit bandwidth for the estimator
df_hist['normal'].plot.density(bw_method=0.5)

六边形图 #

python

# Large point cloud for which an ordinary scatter plot would overplot.
# 'z' is an extra value attached to each point, used for aggregation below.
n = 10000
df_hex = pd.DataFrame({
    'x': np.random.randn(n),
    'y': np.random.randn(n),
    'z': np.random.uniform(0, 10, n),
})

# Hexagonal binning: colour encodes the number of points in each hexagon
df_hex.plot.hexbin(x='x', y='y', gridsize=20)

# Aggregate a value column per hexagon. `reduce_C_function` only has an
# effect when `C` is given; without `C` the plot shows plain counts.
df_hex.plot.hexbin(x='x', y='y', C='z', gridsize=20, reduce_C_function=np.mean)

散点图矩阵 #

python

from pandas.plotting import scatter_matrix

# Pairwise scatter plots of four random columns, with a kernel density
# estimate of each column on the diagonal
df_scatter = pd.DataFrame(data=np.random.randn(100, 4), columns=list('ABCD'))
scatter_matrix(df_scatter, diagonal='kde', figsize=(10, 10))
plt.show()

自定义图表 #

标题和标签 #

python

# `df.plot()` returns the Matplotlib Axes, which is customised directly
ax = df.plot()
# Title and both axis labels in a single call
ax.set(title='My Plot', xlabel='X Axis', ylabel='Y Axis')
# Replace the column names in the legend with custom labels
ax.legend(['Series A', 'Series B', 'Series C'])

图例 #

python

ax = df.plot()
# Fixed corner inside the axes
ax.legend(loc='upper left')
# Let Matplotlib choose the position
ax.legend(loc='best')
# Outside the axes: the legend's upper-left corner is anchored just to the
# right of the plotting area
ax.legend(loc='upper left', bbox_to_anchor=(1.05, 1))

网格 #

python

ax = df.plot()
# Turn the grid on with the current default appearance
ax.grid(True)
# Turn the grid on with dashed, half-transparent lines
ax.grid(True, alpha=0.5, linestyle='--')

坐标轴范围 #

python

ax = df.plot()
# `df` has a DatetimeIndex, so the x limits must be dates; plain numbers
# such as (0, 50) do not correspond to positions on a date axis.
ax.set_xlim(df.index[0], df.index[49])  # first 50 days
ax.set_ylim(-5, 5)

双 Y 轴 #

python

fig, ax1 = plt.subplots()

# Left y axis: column A in blue, with axis label and tick labels in the
# same colour
ax1.set_xlabel('X Axis')
ax1.set_ylabel('Y1', color='tab:blue')
df['A'].plot.line(color='tab:blue', ax=ax1)
ax1.tick_params(labelcolor='tab:blue', axis='y')

# Right y axis: a twin Axes sharing the x axis, column B in red
ax2 = ax1.twinx()
ax2.set_ylabel('Y2', color='tab:red')
df['B'].plot.line(color='tab:red', ax=ax2)
ax2.tick_params(labelcolor='tab:red', axis='y')

plt.show()

注释 #

python

ax = df['A'].plot()

# `df` has a DatetimeIndex, so the x axis holds dates: positions are given
# as index labels with the matching data values, not as plain numbers.

# Text placed at the 11th observation
ax.text(df.index[10], df['A'].iloc[10], 'Important Point', fontsize=12)

# Arrow pointing at the maximum of the series. The label position is an
# offset from that point in screen units, so it does not depend on the
# data range.
peak_date = df['A'].idxmax()
ax.annotate('Peak', xy=(peak_date, df['A'].max()),
            xytext=(30, -40), textcoords='offset points',
            arrowprops=dict(facecolor='black', shrink=0.05))

保存图表 #

python

# Save to a file; the format is chosen from the file extension
fig = df.plot().get_figure()
fig.savefig('plot.png')
fig.savefig('plot.pdf')
fig.savefig('plot.svg')

# Resolution in dots per inch
fig.savefig('plot.png', dpi=300)

# Size: `savefig()` has no `figsize` parameter. Resize the figure itself
# (width, height in inches) and then save it; alternatively pass
# `figsize=(10, 6)` to `df.plot()` when creating the chart.
fig.set_size_inches(10, 6)
fig.savefig('plot.png')

绘图风格 #

python

# List the style names available in this Matplotlib installation
print(plt.style.available)

# Apply a style globally; the three calls run one after another, exactly
# as written
for style_name in ('ggplot', 'seaborn-v0_8', 'dark_background'):
    plt.style.use(style_name)

# Apply a style only inside the `with` block
with plt.style.context('ggplot'):
    df.plot()
    plt.show()

实用案例 #

python

# A 2x2 grid of Axes; each chart is drawn into its own cell through `ax=`
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))

# Top left: line chart of column A
df['A'].plot(kind='line', title='Line Plot', ax=axes[0, 0])

# Top right: bar chart of the first ten rows
df.head(10).plot(kind='bar', title='Bar Plot', ax=axes[0, 1])

# Bottom left: histogram of column A
df['A'].plot(kind='hist', bins=30, title='Histogram', ax=axes[1, 0])

# Bottom right: box plot of all columns
df.plot(kind='box', title='Box Plot', ax=axes[1, 1])

# Adjust spacing so titles and tick labels do not overlap
fig.tight_layout()
plt.show()

下一步 #

掌握了数据可视化后，接下来学习性能优化，了解如何提升 Pandas 的运行效率！