缺失值处理 #
LightGBM 缺失值处理机制 #
LightGBM 原生支持缺失值处理,无需手动填充缺失值。
text
┌─────────────────────────────────────────────────────────────┐
│ LightGBM 缺失值处理 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 默认策略: │
│ 1. 学习缺失值的最优分裂方向 │
│ 2. 缺失值可以被分到左子树或右子树 │
│ 3. 选择使增益最大的方向 │
│ │
│ 参数控制: │
│ - use_missing: 是否使用缺失值 (默认 True) │
│ - zero_as_missing: 是否将 0 视为缺失值 (默认 False) │
│ │
└─────────────────────────────────────────────────────────────┘
基本使用 #
自动处理缺失值 #
python
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import train_test_split
# Synthetic binary-classification dataset: 1000 rows, 10 features.
X = np.random.randn(1000, 10)
y = np.random.randint(0, 2, 1000)
# Blank out roughly 10% of all cells to simulate missing data.
missing_mask = np.random.rand(1000, 10) < 0.1
X[missing_mask] = np.nan
print(f"缺失值比例: {np.isnan(X).mean():.2%}")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# NaNs go straight into the Dataset -- no imputation step is required.
train_data = lgb.Dataset(X_train, label=y_train)
params = {
    'objective': 'binary',
    'metric': 'auc',
    'use_missing': True,  # default: trees learn the best split direction for NaNs
    'verbose': -1
}
model = lgb.train(params, train_data, num_boost_round=100)
print("模型训练完成")
参数配置 #
python
# Baseline parameter set for native missing-value handling.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'use_missing': True,       # enable learned split directions for NaNs (default)
    'zero_as_missing': False,  # treat zeros as real values, not missing (default)
    'verbose': -1
}
| 参数 | 说明 | 默认值 |
|---|---|---|
| use_missing | 是否启用缺失值处理 | True |
| zero_as_missing | 是否将 0 视为缺失值 | False |
缺失值处理策略 #
策略一:LightGBM 原生处理 #
python
def native_missing_handling(X, y):
    """Train a binary LightGBM model, letting it route NaNs natively.

    No imputation is performed; ``use_missing`` lets each tree learn the
    best split direction for missing values.
    """
    config = {
        'objective': 'binary',
        'use_missing': True,
        'verbose': -1
    }
    dataset = lgb.Dataset(X, label=y)
    return lgb.train(config, dataset, num_boost_round=100)
策略二:均值填充 #
python
from sklearn.impute import SimpleImputer
def mean_imputation(X_train, X_test):
    """Replace NaNs with each feature's training-set mean.

    The imputer is fit on ``X_train`` only, so no test-set statistics
    leak into the transformation of ``X_test``.
    """
    filler = SimpleImputer(strategy='mean')
    return filler.fit_transform(X_train), filler.transform(X_test)
策略三:中位数填充 #
python
def median_imputation(X_train, X_test):
    """Replace NaNs with each feature's training-set median.

    More robust than mean imputation when features contain outliers;
    statistics come from ``X_train`` only.
    """
    filler = SimpleImputer(strategy='median')
    return filler.fit_transform(X_train), filler.transform(X_test)
策略四:众数填充 #
python
def mode_imputation(X_train, X_test):
    """Replace NaNs with each feature's most frequent training value.

    Suitable for categorical-like numeric columns; the learned modes
    from ``X_train`` are reused on ``X_test``.
    """
    filler = SimpleImputer(strategy='most_frequent')
    return filler.fit_transform(X_train), filler.transform(X_test)
策略五:KNN 填充 #
python
from sklearn.impute import KNNImputer
def knn_imputation(X_train, X_test, n_neighbors=5):
    """Impute NaNs from the ``n_neighbors`` nearest rows' values.

    Fits the KNN imputer on the training split and applies the same
    fitted transform to the test split.
    """
    filler = KNNImputer(n_neighbors=n_neighbors)
    return filler.fit_transform(X_train), filler.transform(X_test)
缺失值分析 #
缺失值统计 #
python
import pandas as pd
def analyze_missing_values(X, feature_names=None):
    """Summarize per-feature missing values.

    Builds one record per column (name, NaN count, NaN ratio), prints
    only the features that actually contain missing values, and returns
    the full table sorted by missing ratio, highest first.
    """
    if feature_names is None:
        feature_names = [f'feature_{i}' for i in range(X.shape[1])]
    records = [
        {
            'feature': name,
            'missing_count': np.isnan(X[:, col]).sum(),
            'missing_ratio': np.isnan(X[:, col]).mean(),
        }
        for col, name in enumerate(feature_names)
    ]
    table = pd.DataFrame(records).sort_values('missing_ratio', ascending=False)
    print("缺失值统计:")
    print(table[table['missing_count'] > 0])
    return table
# Demo: mask ~10% of a random 1000x10 matrix as NaN, then summarize it.
X = np.random.randn(1000, 10)
X[np.random.rand(1000, 10) < 0.1] = np.nan
missing_stats = analyze_missing_values(X)
可视化缺失值 #
python
import matplotlib.pyplot as plt
def plot_missing_values(X, feature_names=None):
    """Draw a bar chart of the NaN ratio of every feature column."""
    n_cols = X.shape[1]
    if feature_names is None:
        feature_names = [f'feature_{i}' for i in range(n_cols)]
    # Column-wise mean of the NaN mask == per-feature missing ratio.
    ratios = np.isnan(X).mean(axis=0)
    plt.figure(figsize=(12, 6))
    plt.bar(feature_names, ratios)
    plt.xlabel('特征')
    plt.ylabel('缺失值比例')
    plt.title('各特征缺失值比例')
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
plot_missing_values(X)
处理策略对比 #
python
from sklearn.metrics import roc_auc_score
def compare_missing_strategies(X, y):
    """Compare missing-value strategies on one train/test split.

    Trains a LightGBM binary classifier three times -- with native NaN
    handling, with mean imputation, and with median imputation -- and
    reports the held-out AUC of each run. The train/predict/score
    sequence is factored into a single helper instead of being repeated
    per strategy.

    Parameters
    ----------
    X : 2-D array, possibly containing NaNs.
    y : binary labels.

    Returns
    -------
    pandas.DataFrame with columns 'strategy' and 'auc'.
    """
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # All strategies share the same model configuration so AUC
    # differences reflect only the missing-value treatment.
    params = {'objective': 'binary', 'use_missing': True, 'verbose': -1}

    def _fit_and_score(X_tr, X_te):
        # Train on X_tr with the shared params; return AUC on X_te.
        train_data = lgb.Dataset(X_tr, label=y_train)
        model = lgb.train(params, train_data, num_boost_round=100)
        return roc_auc_score(y_test, model.predict(X_te))

    results = []

    print("策略1: LightGBM 原生处理")
    auc = _fit_and_score(X_train, X_test)
    results.append({'strategy': 'Native', 'auc': auc})
    print(f"AUC: {auc:.4f}")

    print("\n策略2: 均值填充")
    X_train_imp, X_test_imp = mean_imputation(X_train, X_test)
    auc = _fit_and_score(X_train_imp, X_test_imp)
    results.append({'strategy': 'Mean', 'auc': auc})
    print(f"AUC: {auc:.4f}")

    print("\n策略3: 中位数填充")
    X_train_imp, X_test_imp = median_imputation(X_train, X_test)
    auc = _fit_and_score(X_train_imp, X_test_imp)
    results.append({'strategy': 'Median', 'auc': auc})
    print(f"AUC: {auc:.4f}")

    return pd.DataFrame(results)
# Demo: 20-feature data where the label depends on features 0 and 1;
# 15% of cells are masked as NaN (after the label is computed) before
# the strategy comparison runs.
X = np.random.randn(1000, 20)
y = (X[:, 0] + X[:, 1] > 0).astype(int)
X[np.random.rand(1000, 20) < 0.15] = np.nan
results = compare_missing_strategies(X, y)
print("\n结果对比:")
print(results)
高级处理技巧 #
创建缺失值指示特征 #
python
def create_missing_indicators(X):
    """Append one 0/1 indicator column per feature flagging NaN cells.

    The output has twice as many columns as ``X``: the original values
    followed by a binary missingness flag for each feature, so the model
    can exploit the fact that a value is missing at all.
    """
    flags = np.isnan(X).astype(int)
    augmented = np.hstack([X, flags])
    print(f"原始特征数: {X.shape[1]}")
    print(f"添加指示特征后: {augmented.shape[1]}")
    return augmented
X_with_indicators = create_missing_indicators(X)
分组填充 #
python
def group_imputation(X, group_column, value_column):
    """Fill NaNs in one column with that column's per-group mean.

    Rows are grouped by the values in column ``group_column``; each
    missing entry in column ``value_column`` is replaced by the mean of
    the observed values within its group. Entries that still cannot be
    filled that way (the whole group is missing, or the group key itself
    is NaN, which pandas groupby drops) fall back to the overall column
    mean, so no NaN survives as long as the column has at least one
    observed value.

    Parameters
    ----------
    X : 2-D numeric array.
    group_column : int, index of the grouping column.
    value_column : int, index of the column to impute.

    Returns
    -------
    numpy.ndarray with the value column imputed.
    """
    df = pd.DataFrame(X, columns=[f'col_{i}' for i in range(X.shape[1])])
    group_col = f'col_{group_column}'
    value_col = f'col_{value_column}'
    # Overall mean of the *observed* values, captured before any filling.
    overall_mean = df[value_col].mean()
    group_means = df.groupby(group_col)[value_col].transform('mean')
    df[value_col] = df[value_col].fillna(group_means)
    # Fallback for rows groupby could not cover (all-NaN group / NaN key).
    df[value_col] = df[value_col].fillna(overall_mean)
    return df.values
完整示例 #
python
# End-to-end example: train LightGBM on data with per-feature missing
# patterns, relying entirely on native NaN handling.
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
np.random.seed(42)  # reproducible data generation
n_samples = 5000
n_features = 20
X = np.random.randn(n_samples, n_features)
# Label depends on features 0 and 1 plus noise, so those features matter.
y = (X[:, 0] + X[:, 1] + np.random.randn(n_samples) * 0.5 > 0).astype(int)
# (column index, fraction of rows to blank out) pairs.
missing_patterns = [
    (0, 0.05),
    (1, 0.10),
    (2, 0.15),
    (3, 0.20),
    (4, 0.25)
]
for col, ratio in missing_patterns:
    # Pick distinct rows for this column and set them to NaN.
    missing_idx = np.random.choice(n_samples, int(n_samples * ratio), replace=False)
    X[missing_idx, col] = np.nan
print("缺失值分析:")
for i in range(n_features):
    missing_ratio = np.isnan(X[:, i]).mean()
    if missing_ratio > 0:
        print(f"特征 {i}: {missing_ratio:.2%} 缺失")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("\n训练模型(原生处理缺失值)...")
# NaNs are passed straight into the Dataset; no imputation step.
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
params = {
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'use_missing': True,  # learn the best split direction for missing values
    'verbose': -1
}
# Up to 500 rounds, logging every 100, early stop after 50 stale rounds.
model = lgb.train(
    params, train_data, num_boost_round=500,
    valid_sets=[valid_data],
    callbacks=[
        lgb.log_evaluation(100),
        lgb.early_stopping(50)
    ]
)
y_pred = model.predict(X_test)
auc = roc_auc_score(y_test, y_pred)
print(f"\n测试集 AUC: {auc:.4f}")
# Rank features by total split gain.
feature_importance = pd.DataFrame({
    'feature': [f'feature_{i}' for i in range(n_features)],
    'importance': model.feature_importance(importance_type='gain')
}).sort_values('importance', ascending=False)
print("\n特征重要性 Top 10:")
print(feature_importance.head(10))
lgb.plot_importance(model, max_num_features=10, figsize=(10, 6))
plt.title("特征重要性")
plt.tight_layout()
plt.show()
最佳实践 #
- 优先使用原生处理:LightGBM 的原生缺失值处理通常效果最好
- 分析缺失模式:了解缺失值的分布和原因
- 创建指示特征:有时缺失本身是有信息量的
- 对比多种策略:不同数据集最优策略可能不同
- 注意缺失比例:缺失比例过高时考虑删除特征
下一步 #
现在你已经掌握了 LightGBM 的缺失值处理,接下来学习 单机并行,了解 LightGBM 的并行训练机制!
最后更新:2026-04-04