数据清洗 #

数据清洗概述 #

数据清洗是数据分析的重要步骤，高质量的数据是分析结果可靠性的基础。

text

┌─────────────────────────────────────────────────────────────┐
│                    数据清洗任务                              │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  缺失值处理                                                 │
│  ├── 检测缺失值                                             │
│  ├── 删除缺失值                                             │
│  └── 填充缺失值                                             │
│                                                             │
│  重复值处理                                                 │
│  ├── 检测重复值                                             │
│  └── 删除重复值                                             │
│                                                             │
│  异常值处理                                                 │
│  ├── 检测异常值                                             │
│  ├── 删除异常值                                             │
│  └── 替换异常值                                             │
│                                                             │
│  数据规范化                                                 │
│  ├── 数据类型转换                                           │
│  ├── 字符串处理                                             │
│  └── 数据标准化                                             │
│                                                             │
└─────────────────────────────────────────────────────────────┘

缺失值处理 #

检测缺失值 #

python

import pandas as pd
import numpy as np

df = pd.DataFrame({
    'A': [1, 2, np.nan, 4, 5],
    'B': [np.nan, 2, 3, np.nan, 5],
    'C': ['a', 'b', np.nan, 'd', 'e'],
    'D': [1, 2, 3, 4, np.nan]
})

# Boolean mask: True where a value is missing
print(df.isna())
print(df.isnull())  # equivalent to isna()

# Boolean mask: True where a value is present
print(df.notna())

# Number of missing values per column
print(df.isna().sum())

# Number of missing values per row
print(df.isna().sum(axis=1))

# Fraction of missing values per column
print(df.isna().mean())

# Summary with non-null counts. info() writes the report itself and returns
# None, so it is called directly; wrapping it in print() adds a stray "None".
df.info()

删除缺失值 #

python

# Drop every row that contains at least one missing value
print(df.dropna())

# Drop every column that contains at least one missing value
print(df.dropna(axis=1))

# Drop only the rows in which every value is missing
print(df.dropna(how='all'))

# Drop rows in which any value is missing (same as the call with no arguments)
print(df.dropna(how='any'))  # default

# Keep only the rows that have at least 3 non-missing values
print(df.dropna(thresh=3))

# Look for missing values only in the listed columns
print(df.dropna(subset=['A', 'B']))

# Modify df in place instead of producing a new frame
df.dropna(inplace=True)

填充缺失值 #

python

df = pd.DataFrame({
    'A': [1, 2, np.nan, 4, 5],
    'B': [np.nan, 2, 3, np.nan, 5],
    'C': ['a', 'b', np.nan, 'd', 'e']
})

# Fill with a constant
print(df.fillna(0))

# Fill each column with its own value
print(df.fillna({'A': 0, 'B': 1, 'C': 'unknown'}))

# Fill with a per-column statistic
print(df.fillna(df.mean(numeric_only=True)))  # mean
print(df.fillna(df.median(numeric_only=True)))  # median
print(df.fillna(df.mode().iloc[0]))  # mode (first one when there are ties)

# Forward fill: carry the previous valid value down.
# ffill()/bfill() replace the deprecated fillna(method=...) spelling.
print(df.ffill())

# Backward fill: pull the next valid value up
print(df.bfill())

# Fill at most one consecutive missing value
print(df.ffill(limit=1))

# Interpolation is only defined for numeric data, so restrict it to the
# numeric columns; interpolating a frame with object columns is deprecated.
numeric = df.select_dtypes(include='number')
print(numeric.interpolate())  # linear interpolation (the default)
print(numeric.interpolate(method='linear'))
print(numeric.interpolate(method='polynomial', order=2))  # polynomial (needs scipy)

高级缺失值处理 #

python

# Group-wise fill: replace each missing value with the mean of its own group
df = pd.DataFrame({
    'group': ['A', 'A', 'B', 'B', 'A'],
    'value': [1, np.nan, 3, np.nan, 5]
})
group_means = df.groupby('group')['value'].transform('mean')
df['value'] = df['value'].fillna(group_means)
print(df)

# Conditional fill: where A is missing, derive it from B on the same row
df = pd.DataFrame({
    'A': [1, np.nan, 3, np.nan, 5],
    'B': [10, 20, 30, 40, 50]
})
fallback = df['B'] / 10
df['A'] = df['A'].where(df['A'].notna(), fallback)
print(df)

重复值处理 #

检测重复值 #

python

df = pd.DataFrame({
    'A': [1, 1, 2, 2, 3],
    'B': ['a', 'a', 'b', 'b', 'c'],
    'C': [10, 10, 20, 20, 30]
})

# Boolean mask marking rows that repeat an earlier row
print(df.duplicated())

# Show the duplicated rows
print(df[df.duplicated()])

# Show the rows that are not flagged as duplicates
print(df[~df.duplicated()])

# Decide duplication from the listed columns only
print(df.duplicated(subset=['A']))

# Treat the last occurrence as the one to keep; earlier ones are flagged
print(df.duplicated(keep='last'))

# Flag every member of a duplicate group
print(df.duplicated(keep=False))

# Count the duplicated rows
print(df.duplicated().sum())

删除重复值 #

python

# Drop duplicated rows, keeping the first occurrence
print(df.drop_duplicates())

# Decide duplication from the listed columns only
print(df.drop_duplicates(subset=['A']))

# Keep the last occurrence instead of the first
print(df.drop_duplicates(keep='last'))

# Drop every member of a duplicate group
print(df.drop_duplicates(keep=False))

# Modify df in place instead of producing a new frame
df.drop_duplicates(inplace=True)

异常值处理 #

检测异常值 #

python

import numpy as np

# Fixed seed so the example is reproducible: 100 standard-normal samples
# plus three injected outliers
np.random.seed(42)
df = pd.DataFrame({
    'value': np.concatenate([np.random.normal(0, 1, 100), [10, -10, 15]])
})

# Summary statistics give a first look at the value range
print(df.describe())

# Z-score method: flag values whose absolute z-score exceeds 3
from scipy import stats
df['z_score'] = np.abs(stats.zscore(df['value']))
outliers = df[df['z_score'] > 3]
print(outliers)

# IQR (interquartile range) method: flag values beyond 1.5 * IQR from the quartiles
Q1 = df['value'].quantile(0.25)
Q3 = df['value'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print(f'下界: {lower_bound}, 上界: {upper_bound}')
outliers = df[(df['value'] < lower_bound) | (df['value'] > upper_bound)]
print(outliers)

# Percentile method: flag values outside the 1st-99th percentile range
lower = df['value'].quantile(0.01)
upper = df['value'].quantile(0.99)
outliers = df[(df['value'] < lower) | (df['value'] > upper)]
# Show the result, as the two methods above do
print(outliers)

处理异常值 #

python

# Flag the outliers once, from the untouched 'value' column, so that every
# strategy below starts from the same data. Overwriting 'value' with NaN first
# would leave nothing for a later comparison-based mask to match.
is_outlier = (df['value'] < lower_bound) | (df['value'] > upper_bound)

# Strategy 1: drop the rows whose value lies outside the bounds
df_clean = df[(df['value'] >= lower_bound) & (df['value'] <= upper_bound)]

# Strategy 2: clip to the nearest bound
df['value_clipped'] = df['value'].clip(lower_bound, upper_bound)

# Strategy 3: replace outliers with missing values
df['value_nan'] = df['value'].mask(is_outlier)

# Strategy 4: replace outliers with the median of the original values
median = df['value'].median()
df['value_median'] = df['value'].mask(is_outlier, median)

可视化检测异常值 #

python

import matplotlib.pyplot as plt

# Box plot of the 'value' column
plt.figure(figsize=(8, 4))
plt.boxplot(df['value'])
plt.title('Box Plot')
plt.show()

# Scatter plot of 'value' against row position
plt.figure(figsize=(8, 4))
plt.scatter(range(len(df)), df['value'])
plt.title('Scatter Plot')
plt.show()

数据类型转换 #

类型检测与转换 #

python

# Every column starts out as strings
df = pd.DataFrame({
    'A': ['1', '2', '3'],
    'B': ['1.1', '2.2', '3.3'],
    'C': ['2024-01-01', '2024-01-02', '2024-01-03'],
    'D': ['True', 'False', 'True']
})

# Inspect the current dtypes
print(df.dtypes)

# Convert strings to numbers
df['A'] = pd.to_numeric(df['A'])
df['B'] = pd.to_numeric(df['B'])

# Convert to integer
df['A'] = df['A'].astype(int)

# Convert to datetime
df['C'] = pd.to_datetime(df['C'])

# Convert to boolean through an explicit string-to-bool mapping
df['D'] = df['D'].map({'True': True, 'False': False})

# Convert several columns at once
df = df.astype({'A': 'int32', 'B': 'float32'})

智能类型转换 #

python

# Let pandas pick the best available dtype for each column
df = df.convert_dtypes()
print(df.dtypes)

# Infer better dtypes for object columns
df = df.infer_objects()

字符串处理 #

清理字符串 #

python

df = pd.DataFrame({
    'name': ['  Alice  ', 'BOB', 'Charlie ', '  diana']
})

# Trim whitespace
df['name'] = df['name'].str.strip()    # both ends
df['name'] = df['name'].str.lstrip()   # left end only
df['name'] = df['name'].str.rstrip()   # right end only

# Case conversion
df['name_lower'] = df['name'].str.lower()
df['name_upper'] = df['name'].str.upper()
df['name_title'] = df['name'].str.title()
df['name_capitalize'] = df['name'].str.capitalize()

# Literal replacement. regex=False is spelled out because the default of
# `regex` changed between pandas versions.
df['name'] = df['name'].str.replace('a', 'A', regex=False)

# Regex replacement: collapse each run of whitespace into one space
df['name'] = df['name'].str.replace(r'\s+', ' ', regex=True)

字符串分割与提取 #

python

df = pd.DataFrame({
    'full_name': ['Alice Smith', 'Bob Johnson', 'Charlie Brown']
})

# Split on the first space only (n=1) so the result always has exactly two
# columns, even when a name has more than two words
df[['first', 'last']] = df['full_name'].str.split(' ', n=1, expand=True)

# Extract the first character; expand=False returns a Series for a single group
df['initial'] = df['full_name'].str.extract(r'^(\w)', expand=False)

# Extract several groups at once: one output column per capture group
df[['first_initial', 'last_initial']] = df['full_name'].str.extract(r'^(\w)\w+\s+(\w)')

数据规范化 #

标准化 #

python

from sklearn.preprocessing import StandardScaler, MinMaxScaler

df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': [10, 20, 30, 40, 50]
})

# Z-score standardization with scikit-learn. fit_transform returns a plain
# array, so the column labels and the index are restored explicitly.
scaler = StandardScaler()
df_std = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

# Min-max normalization with scikit-learn
scaler = MinMaxScaler()
df_norm = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

# Manual z-score standardization. StandardScaler divides by the population
# standard deviation, so ddof=0 is required to reproduce its result; the
# pandas default (ddof=1) gives different values.
df_std = (df - df.mean()) / df.std(ddof=0)

# Manual min-max normalization to the [0, 1] range
df_norm = (df - df.min()) / (df.max() - df.min())

数值离散化 #

python

df = pd.DataFrame({'age': [22, 25, 30, 35, 40, 45, 50, 55, 60]})

# Equal-width binning into 3 intervals
df['age_group'] = pd.cut(df['age'], bins=3)

# Custom bin edges with a label per bin (overwrites the column set above)
df['age_group'] = pd.cut(df['age'], 
                          bins=[0, 30, 50, 100], 
                          labels=['Young', 'Middle', 'Senior'])

# Equal-frequency binning into quartiles
df['age_quartile'] = pd.qcut(df['age'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])

数据清洗流程示例 #

python

def clean_data(df):
    """Run the full example cleaning pipeline and return a new DataFrame.

    The frame must contain the columns 'important_column', 'numeric_col',
    'text_col', 'value', 'date', 'amount' and 'name'.

    Steps, in order:
      1. Drop rows missing 'important_column'; fill 'numeric_col' with its
         median and 'text_col' with 'unknown'.
      2. Drop duplicated rows.
      3. Clip 'value' to [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
      4. Parse 'date' as datetime and 'amount' as numeric (unparseable
         amounts become NaN).
      5. Strip and title-case 'name'.
      6. Reset the index to 0..n-1.
    """
    # 1. Missing values: the median is taken after the required-column filter
    required = df.dropna(subset=['important_column'])
    fill_values = {
        'numeric_col': required['numeric_col'].median(),
        'text_col': 'unknown',
    }

    # 2. Duplicates are removed once the gaps are filled
    deduped = required.fillna(fill_values).drop_duplicates()

    # 3. IQR fences used to clip outliers
    q1, q3 = (deduped['value'].quantile(p) for p in (0.25, 0.75))
    spread = q3 - q1

    # 3-5. Replace the transformed columns in a single pass
    cleaned = deduped.assign(
        value=deduped['value'].clip(q1 - 1.5 * spread, q3 + 1.5 * spread),
        date=pd.to_datetime(deduped['date']),
        amount=pd.to_numeric(deduped['amount'], errors='coerce'),
        name=deduped['name'].str.strip().str.title(),
    )

    # 6. Fresh 0..n-1 index
    return cleaned.reset_index(drop=True)

数据质量检查 #

python

def data_quality_report(df):
    """Print a data-quality overview of ``df`` to standard output.

    The report lists, in order: the shape, the columns that have missing
    values with their counts, the number of duplicated rows, the dtypes,
    the ``describe()`` summary, and the unique-value count of each column.
    Nothing is returned.
    """
    banner = "=" * 50
    print(banner)
    print("数据质量报告")
    print(banner)

    print(f"\n数据形状: {df.shape}")

    # Only the columns that actually have missing values are shown
    print("\n缺失值统计:")
    na_counts = df.isna().sum()
    has_missing = na_counts > 0
    print(na_counts[has_missing])

    duplicate_rows = df.duplicated().sum()
    print("\n重复行数量:", duplicate_rows)

    print("\n数据类型:")
    print(df.dtypes)

    print("\n数值列统计:")
    print(df.describe())

    print("\n唯一值数量:")
    for column_name in df.columns:
        distinct = df[column_name].nunique()
        print(f"  {column_name}: {distinct}")

# Print the quality report for the frame currently bound to df
data_quality_report(df)

下一步 #

掌握了数据清洗后，接下来学习统计分析，深入了解如何进行描述性统计和相关性分析！