Feature Engineering #
Feature Importance #
Built-in Feature Importance #
XGBoost provides three ways to compute feature importance:
python
import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
# Load the data
data = load_breast_cancer()
X, y = data.data, data.target
dtrain = xgb.DMatrix(X, label=y, feature_names=data.feature_names)
# Train a model
params = {'objective': 'binary:logistic', 'max_depth': 6}
model = xgb.train(params, dtrain, num_boost_round=100)
# Get feature importance scores
importance_weight = model.get_score(importance_type='weight')
importance_gain = model.get_score(importance_type='gain')
importance_cover = model.get_score(importance_type='cover')
Three Importance Types #
text
┌────────────────────────────────────────────────────────────┐
│                  Feature Importance Types                  │
├────────────────────────────────────────────────────────────┤
│                                                            │
│  Weight                                                    │
│  - Number of times the feature is used in splits           │
│  - Reflects how often the feature is used                  │
│  - A frequently used feature is not necessarily important  │
│                                                            │
│  Gain                                                      │
│  - Average gain from the splits that use the feature       │
│  - Reflects how much the feature contributes               │
│  - The most commonly used metric                           │
│                                                            │
│  Cover                                                     │
│  - Number of samples affected by splits on the feature     │
│  - Reflects the feature's coverage                         │
│                                                            │
└────────────────────────────────────────────────────────────┘
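Because the three metrics live on different scales, a quick way to compare them is to normalize each so that it sums to 1. A minimal sketch, reusing the importance dictionaries computed above:
python
# Normalize each importance type so the three metrics are directly comparable
def normalize(scores):
    total = sum(scores.values())
    return {k: v / total for k, v in scores.items()}

w = normalize(importance_weight)
g = normalize(importance_gain)
c = normalize(importance_cover)
# Top features by gain, with their normalized weight and cover alongside
for feat in sorted(g, key=g.get, reverse=True)[:5]:
    print(f"{feat:25s} weight={w.get(feat, 0):.3f} gain={g[feat]:.3f} cover={c.get(feat, 0):.3f}")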
Visualizing Feature Importance #
python
import pandas as pd
# Build a DataFrame
df_importance = pd.DataFrame({
'feature': list(importance_gain.keys()),
'gain': list(importance_gain.values()),
'weight': [importance_weight.get(f, 0) for f in importance_gain.keys()],
'cover': [importance_cover.get(f, 0) for f in importance_gain.keys()]
})
df_importance = df_importance.sort_values('gain', ascending=False)
# Visualize the three importance types side by side
fig, axes = plt.subplots(1, 3, figsize=(15, 8))
xgb.plot_importance(model, importance_type='weight', ax=axes[0], max_num_features=10)
axes[0].set_title('Feature Importance (Weight)')
xgb.plot_importance(model, importance_type='gain', ax=axes[1], max_num_features=10)
axes[1].set_title('Feature Importance (Gain)')
xgb.plot_importance(model, importance_type='cover', ax=axes[2], max_num_features=10)
axes[2].set_title('Feature Importance (Cover)')
plt.tight_layout()
plt.show()
SHAP Values #
Introduction to SHAP #
SHAP (SHapley Additive exPlanations) provides finer-grained, per-sample feature explanations:
python
import shap
# Create a SHAP explainer for the trained booster
explainer = shap.TreeExplainer(model)
# Compute SHAP values (one row per sample, one column per feature)
shap_values = explainer.shap_values(X)
# Global feature importance (summary plot)
shap.summary_plot(shap_values, X, feature_names=data.feature_names)
# Explain a single sample (matplotlib=True renders a static plot outside notebooks)
shap.force_plot(explainer.expected_value, shap_values[0], X[0],
                feature_names=data.feature_names, matplotlib=True)
# Dependence plot for a single feature
shap.dependence_plot('mean radius', shap_values, X, feature_names=data.feature_names)
SHAP Visualization #
python
# Summary (beeswarm) plot
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values, X, feature_names=data.feature_names, show=False)
plt.tight_layout()
plt.show()
# Bar plot of mean |SHAP| values
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values, X, feature_names=data.feature_names, plot_type='bar', show=False)
plt.tight_layout()
plt.show()
# Interaction effects
shap_interaction_values = explainer.shap_interaction_values(X)
shap.summary_plot(shap_interaction_values, X, feature_names=data.feature_names)
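Mean absolute SHAP values also give a global importance ranking, which ties in naturally with the feature selection techniques below. A minimal sketch based on the shap_values computed above:
python
# Rank features by mean |SHAP| value (global importance)
mean_abs_shap = np.abs(shap_values).mean(axis=0)
shap_ranking = pd.DataFrame({
    'feature': data.feature_names,
    'mean_abs_shap': mean_abs_shap
}).sort_values('mean_abs_shap', ascending=False)
print(shap_ranking.head(10))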
Feature Selection #
Importance-Based Feature Selection #
python
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier
# Method 1: scikit-learn's SelectFromModel
clf = XGBClassifier(n_estimators=100, max_depth=6)
clf.fit(X, y)
selector = SelectFromModel(clf, threshold='median', prefit=True)
X_selected = selector.transform(X)
print(f"原始特征数: {X.shape[1]}")
print(f"选择后特征数: {X_selected.shape[1]}")
# Method 2: manual selection by gain threshold
importance = clf.get_booster().get_score(importance_type='gain')
threshold = np.percentile(list(importance.values()), 50)
selected_features = [f for f, imp in importance.items() if imp > threshold]
print(f"选择的特征: {selected_features}")
Recursive Feature Elimination #
python
from sklearn.feature_selection import RFE, RFECV
# Recursive feature elimination
clf = XGBClassifier(n_estimators=100, max_depth=6)
rfe = RFE(estimator=clf, n_features_to_select=10)
X_rfe = rfe.fit_transform(X, y)
print(f"选择的特征索引: {np.where(rfe.support_)[0]}")
# Recursive feature elimination with cross-validation
rfecv = RFECV(estimator=clf, step=1, cv=5, scoring='roc_auc')
X_rfecv = rfecv.fit_transform(X, y)
print(f"最优特征数: {rfecv.n_features_}")
Permutation-Based Feature Selection #
python
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = XGBClassifier(n_estimators=100, max_depth=6)
clf.fit(X_train, y_train)
# Permutation importance on the held-out test set
perm_importance = permutation_importance(clf, X_test, y_test, n_repeats=10, random_state=42)
# Sort features by mean importance
sorted_idx = perm_importance.importances_mean.argsort()[::-1]
plt.figure(figsize=(10, 8))
plt.barh(range(len(sorted_idx)), perm_importance.importances_mean[sorted_idx])
plt.yticks(range(len(sorted_idx)), [data.feature_names[i] for i in sorted_idx])
plt.xlabel('Permutation Importance')
plt.tight_layout()
plt.show()
Feature Encoding #
Categorical Feature Encoding #
python
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Example data
df = pd.DataFrame({
'city': ['Beijing', 'Shanghai', 'Guangzhou', 'Beijing', 'Shanghai'],
'category': ['A', 'B', 'A', 'C', 'B'],
'price': [100, 200, 150, 120, 180],
'target': [0, 1, 0, 1, 1]
})
# 1. Label encoding
le = LabelEncoder()
df['city_encoded'] = le.fit_transform(df['city'])
# 2. One-hot encoding
df_onehot = pd.get_dummies(df, columns=['city', 'category'])
# 3. Target encoding
def target_encode(df, column, target, smooth=1.0):
    """
    Target encoding.
    Parameters:
    - df: DataFrame
    - column: column to encode
    - target: target column
    - smooth: smoothing parameter
    """
    global_mean = df[target].mean()
    stats = df.groupby(column)[target].agg(['mean', 'count'])
    # Smooth category means toward the global mean
    smoothed = (stats['count'] * stats['mean'] + smooth * global_mean) / (stats['count'] + smooth)
    return df[column].map(smoothed)
df['city_target'] = target_encode(df, 'city', 'target')
# 4. Frequency encoding
def frequency_encode(df, column):
    freq = df[column].value_counts(normalize=True)
    return df[column].map(freq)
df['city_freq'] = frequency_encode(df, 'city')
# 5. Leave-one-out encoding
def leave_one_out_encode(df, column, target):
    result = np.zeros(len(df))
    for i in range(len(df)):
        # Exclude the current sample
        mask = df.index != i
        mean_target = df.loc[mask].groupby(column)[target].mean()
        result[i] = mean_target.get(df.loc[i, column], df[target].mean())
    return result
df['city_loo'] = leave_one_out_encode(df, 'city', 'target')
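Note that target_encode and leave_one_out_encode above compute their statistics on the full dataset, which leaks label information into the training features. A common remedy is out-of-fold target encoding, where each row is encoded using only the other folds. A minimal sketch following the same smoothing idea (kfold_target_encode is an illustrative helper, not a library function):
python
from sklearn.model_selection import KFold

def kfold_target_encode(df, column, target, n_splits=5, smooth=1.0):
    """Out-of-fold target encoding: each row is encoded from the other folds only."""
    encoded = pd.Series(index=df.index, dtype=float)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    for train_idx, val_idx in kf.split(df):
        fold = df.iloc[train_idx]
        global_mean = fold[target].mean()
        stats = fold.groupby(column)[target].agg(['mean', 'count'])
        smoothed = (stats['count'] * stats['mean'] + smooth * global_mean) / (stats['count'] + smooth)
        # Categories unseen in the training fold fall back to the global mean
        encoded.iloc[val_idx] = df.iloc[val_idx][column].map(smoothed).fillna(global_mean).values
    return encoded

df['city_target_oof'] = kfold_target_encode(df, 'city', 'target', n_splits=2)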
XGBoost Native Categorical Features #
python
import xgboost as xgb
# XGBoost 1.3+ supports categorical features natively
df['city'] = df['city'].astype('category')
df['category'] = df['category'].astype('category')
X = df.drop('target', axis=1)
y = df['target']
dtrain = xgb.DMatrix(X, label=y, enable_categorical=True)
# enable_categorical is a DMatrix argument (set above), not a training parameter
params = {
    'objective': 'binary:logistic',
    'tree_method': 'hist',  # categorical splits require hist (or approx/gpu_hist)
    'max_depth': 6
}
model = xgb.train(params, dtrain, num_boost_round=100)
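The scikit-learn wrapper exposes the same switch as an estimator parameter in recent XGBoost releases (1.6+); a minimal sketch, assuming the same DataFrame with category dtypes as above:
python
from xgboost import XGBClassifier

# The sklearn wrapper accepts category-dtype columns directly;
# enable_categorical=True needs tree_method='hist' (or 'approx'/'gpu_hist')
clf = XGBClassifier(n_estimators=100, max_depth=6,
                    tree_method='hist', enable_categorical=True)
clf.fit(X, y)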
Feature Interactions #
Creating Interaction Features #
python
import pandas as pd
import numpy as np
# Example data
df = pd.DataFrame({
'price': [100, 200, 150, 300, 250],
'quantity': [10, 5, 8, 3, 6],
'rating': [4.5, 3.8, 4.2, 4.0, 3.5]
})
# 1. Numeric interactions
df['total'] = df['price'] * df['quantity']
df['price_per_quantity'] = df['price'] / df['quantity']
df['price_rating'] = df['price'] * df['rating']
# 2. Nonlinear transforms (log, sqrt)
df['log_price'] = np.log1p(df['price'])
df['sqrt_quantity'] = np.sqrt(df['quantity'])
# 3. Difference features
df['price_diff_mean'] = df['price'] - df['price'].mean()
# 4. Ratio features
df['price_ratio'] = df['price'] / df['price'].max()
Automatic Feature Interactions #
python
from sklearn.preprocessing import PolynomialFeatures
# Polynomial features (interaction terms only)
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(df[['price', 'quantity', 'rating']])
feature_names = poly.get_feature_names_out(['price', 'quantity', 'rating'])
df_poly = pd.DataFrame(X_poly, columns=feature_names)
Feature Transformations #
Numeric Transformations #
python
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer
# Standardization (zero mean, unit variance)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Min-max normalization
minmax = MinMaxScaler()
X_normalized = minmax.fit_transform(X)
# Robust scaling (median and IQR, less sensitive to outliers)
robust = RobustScaler()
X_robust = robust.fit_transform(X)
# Power transform (pushes the data toward a normal distribution)
power = PowerTransformer(method='yeo-johnson')
X_power = power.fit_transform(X)
# Log transform
X_log = np.log1p(X)
# Box-Cox transform
from scipy import stats
X_boxcox, _ = stats.boxcox(X[:, 0] + 1)  # requires strictly positive values
Binning #
python
from sklearn.preprocessing import KBinsDiscretizer
# Equal-width binning
kbins = KBinsDiscretizer(n_bins=5, encode='onehot', strategy='uniform')
X_binned = kbins.fit_transform(X)
# Equal-frequency (quantile) binning
kbins = KBinsDiscretizer(n_bins=5, encode='onehot', strategy='quantile')
X_binned = kbins.fit_transform(X)
# K-means binning
kbins = KBinsDiscretizer(n_bins=5, encode='onehot', strategy='kmeans')
X_binned = kbins.fit_transform(X)
# Manual binning
df['price_bin'] = pd.cut(df['price'], bins=[0, 100, 200, 300, np.inf], labels=['low', 'medium', 'high', 'very_high'])
Time Features #
python
import pandas as pd
# Create a time-series example
df = pd.DataFrame({
'date': pd.date_range('2024-01-01', periods=100, freq='D'),
'value': np.random.randn(100)
})
# Extract calendar features
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['dayofweek'] = df['date'].dt.dayofweek
df['dayofyear'] = df['date'].dt.dayofyear
df['weekofyear'] = df['date'].dt.isocalendar().week
df['quarter'] = df['date'].dt.quarter
df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)
df['is_month_start'] = df['date'].dt.is_month_start.astype(int)
df['is_month_end'] = df['date'].dt.is_month_end.astype(int)
# Cyclical encoding (preserves the circular nature of months and weekdays)
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
df['dayofweek_sin'] = np.sin(2 * np.pi * df['dayofweek'] / 7)
df['dayofweek_cos'] = np.cos(2 * np.pi * df['dayofweek'] / 7)
# Rolling-window features
df['rolling_mean_7'] = df['value'].rolling(window=7).mean()
df['rolling_std_7'] = df['value'].rolling(window=7).std()
df['rolling_max_7'] = df['value'].rolling(window=7).max()
# Lag features
df['lag_1'] = df['value'].shift(1)
df['lag_7'] = df['value'].shift(7)
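When these features feed a model, a random train/test split would leak future information into the past; splitting by time and dropping the rows left incomplete by the rolling/lag windows is the safer pattern. A minimal sketch on the df built above (the feature list and parameters are illustrative):
python
import xgboost as xgb

# Drop rows with NaNs introduced by the rolling/lag features, then split by time
feature_cols = ['dayofweek_sin', 'dayofweek_cos', 'rolling_mean_7', 'lag_1', 'lag_7']
df_model = df.dropna(subset=feature_cols).reset_index(drop=True)
split = int(len(df_model) * 0.8)
train, test = df_model.iloc[:split], df_model.iloc[split:]

dtrain = xgb.DMatrix(train[feature_cols], label=train['value'])
dtest = xgb.DMatrix(test[feature_cols], label=test['value'])
model = xgb.train({'objective': 'reg:squarederror', 'max_depth': 4}, dtrain, num_boost_round=50)
preds = model.predict(dtest)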
Feature Engineering Best Practices #
python
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
def feature_engineering_pipeline(df, target_column):
    """
    End-to-end feature engineering pipeline.
    """
    df = df.copy()
    # 1. Handle categorical features
    categorical_cols = df.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        if col != target_column:
            # Target encoding
            df[f'{col}_target'] = target_encode(df, col, target_column)
            # Frequency encoding
            df[f'{col}_freq'] = frequency_encode(df, col)
            # Drop the original column
            df.drop(col, axis=1, inplace=True)
    # 2. Handle missing values
    for col in df.columns:
        if df[col].isnull().any():
            df[f'{col}_is_missing'] = df[col].isnull().astype(int)
            df[col] = df[col].fillna(df[col].median())
    # 3. Create interaction features
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    numeric_cols = [c for c in numeric_cols if c != target_column]
    for i, col1 in enumerate(numeric_cols[:5]):  # limit the number of combinations
        for col2 in numeric_cols[i+1:i+3]:
            df[f'{col1}_x_{col2}'] = df[col1] * df[col2]
    # 4. Log transform
    for col in numeric_cols[:5]:
        if df[col].min() >= 0:
            df[f'{col}_log'] = np.log1p(df[col])
    return df
# Usage example
df = pd.read_csv('data.csv')
df_engineered = feature_engineering_pipeline(df, 'target')
X = df_engineered.drop('target', axis=1)
y = df_engineered['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
params = {'objective': 'binary:logistic', 'max_depth': 6, 'eta': 0.1}
model = xgb.train(params, dtrain, num_boost_round=100)
y_pred = model.predict(dtest)
print(f"AUC: {roc_auc_score(y_test, y_pred):.4f}")
Next Steps #
Now that you understand feature engineering, continue with Parameter Tuning to master hyperparameter optimization!
Last updated: 2026-04-04