Feature Engineering #
Feature Importance Analysis #
Getting Feature Importance #
python
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
import matplotlib.pyplot as plt
data = load_breast_cancer()
X, y = data.data, data.target
feature_names = data.feature_names
train_data = lgb.Dataset(X, label=y)
params = {'objective': 'binary', 'verbose': -1}
model = lgb.train(params, train_data, num_boost_round=100)
# 'split': how many times each feature is used in a split
importance_split = model.feature_importance(importance_type='split')
# 'gain': total gain of the splits that use each feature
importance_gain = model.feature_importance(importance_type='gain')
importance_df = pd.DataFrame({
    'feature': feature_names,
    'split': importance_split,
    'gain': importance_gain
})
importance_df = importance_df.sort_values('gain', ascending=False)
print(importance_df.head(10))
Visualizing Feature Importance #
python
def plot_feature_importance(model, feature_names, top_k=20):
    """Plot the top_k features by gain and split importance side by side."""
    importance_gain = model.feature_importance(importance_type='gain')
    importance_split = model.feature_importance(importance_type='split')
    df = pd.DataFrame({
        'feature': feature_names,
        'gain': importance_gain,
        'split': importance_split
    }).sort_values('gain', ascending=False).head(top_k)
    fig, axes = plt.subplots(1, 2, figsize=(14, 8))
    axes[0].barh(df['feature'], df['gain'])
    axes[0].set_xlabel('Total Gain')
    axes[0].set_title('Feature Importance (Gain)')
    axes[0].invert_yaxis()
    axes[1].barh(df['feature'], df['split'])
    axes[1].set_xlabel('Split Count')
    axes[1].set_title('Feature Importance (Split)')
    axes[1].invert_yaxis()
    plt.tight_layout()
    plt.show()
plot_feature_importance(model, feature_names)
Using plot_importance #
python
lgb.plot_importance(model, max_num_features=20, figsize=(10, 8))
plt.title("Feature Importance")
plt.tight_layout()
plt.show()
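plot_importance typically ranks features by split count by default; passing importance_type='gain' ranks them by total gain instead:
python
lgb.plot_importance(model, importance_type='gain', max_num_features=20, figsize=(10, 8))
plt.title("Feature Importance (Gain)")
plt.tight_layout()
plt.show()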
Feature Selection #
Importance-Based Feature Selection #
python
def select_features_by_importance(model, feature_names, threshold=0.01):
    """Keep features whose share of total gain importance is at least threshold."""
    importance = model.feature_importance(importance_type='gain')
    importance_normalized = importance / importance.sum()
    selected_mask = importance_normalized >= threshold
    selected_features = feature_names[selected_mask]
    print(f"Original number of features: {len(feature_names)}")
    print(f"Selected number of features: {len(selected_features)}")
    print(f"Reduction: {(1 - len(selected_features)/len(feature_names))*100:.1f}%")
    return selected_features

selected = select_features_by_importance(model, feature_names, threshold=0.01)
print(f"\nSelected features: {selected[:10]}")
Recursive Feature Elimination #
python
from sklearn.feature_selection import RFE
from lightgbm import LGBMClassifier
clf = LGBMClassifier(n_estimators=100, random_state=42, verbose=-1)
rfe = RFE(estimator=clf, n_features_to_select=20, step=5)
rfe.fit(X, y)
selected_features = feature_names[rfe.support_]
print(f"选择的特征: {selected_features}")
Cross-Validation-Based Feature Selection #
python
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest, f_classif
def select_features_cv(X, y, feature_names, max_features=20):
    """Evaluate different numbers of top-k features with cross-validation."""
    results = []
    for k in range(5, min(max_features + 1, X.shape[1] + 1), 5):
        selector = SelectKBest(f_classif, k=k)
        # note: for a stricter estimate, fit the selector inside each CV fold (e.g., via a Pipeline)
        X_selected = selector.fit_transform(X, y)
        clf = LGBMClassifier(n_estimators=100, random_state=42, verbose=-1)
        scores = cross_val_score(clf, X_selected, y, cv=5, scoring='roc_auc')
        results.append({
            'k': k,
            'mean_auc': scores.mean(),
            'std_auc': scores.std()
        })
    results_df = pd.DataFrame(results)
    plt.figure(figsize=(10, 6))
    plt.errorbar(results_df['k'], results_df['mean_auc'],
                 yerr=results_df['std_auc'], fmt='o-')
    plt.xlabel('Number of Features')
    plt.ylabel('AUC')
    plt.title('Number of Features vs. Model Performance')
    plt.grid(True)
    plt.show()
    return results_df

results = select_features_cv(X, y, feature_names)
print(results)
Feature Interactions #
Feature Interaction Analysis #
python
def analyze_feature_interactions(X, feature_names, top_k=10):
    """Rank feature pairs by absolute pairwise correlation, a simple proxy for potential interactions."""
    from itertools import combinations
    n_features = X.shape[1]
    interactions = []
    for i, j in combinations(range(n_features), 2):
        correlation = np.corrcoef(X[:, i], X[:, j])[0, 1]
        interactions.append({
            'feature_i': feature_names[i],
            'feature_j': feature_names[j],
            'correlation': abs(correlation)
        })
    interactions_df = pd.DataFrame(interactions)
    interactions_df = interactions_df.sort_values('correlation', ascending=False)
    print("Highly correlated feature pairs:")
    print(interactions_df.head(top_k))
    return interactions_df

interactions = analyze_feature_interactions(X, feature_names)
Creating Interaction Features #
python
from itertools import combinations

def create_interaction_features(X, feature_names, important_features):
    """Add pairwise products of the top important features as new columns."""
    X_new = X.copy()
    new_feature_names = list(feature_names)
    important_indices = [list(feature_names).index(f) for f in important_features]
    for i, j in combinations(important_indices[:5], 2):
        interaction = X[:, i] * X[:, j]
        X_new = np.column_stack([X_new, interaction])
        new_feature_names.append(f"{feature_names[i]}_x_{feature_names[j]}")
    print(f"Original number of features: {X.shape[1]}")
    print(f"New number of features: {X_new.shape[1]}")
    return X_new, new_feature_names

important_features = importance_df.head(5)['feature'].tolist()
X_new, new_feature_names = create_interaction_features(X, feature_names, important_features)
Feature Construction #
Statistical Features #
python
def create_statistical_features(X):
    """Append row-wise statistics (mean, std, max, min, range) as extra features."""
    X_new = X.copy()
    X_new = np.column_stack([X_new, X.mean(axis=1)])
    X_new = np.column_stack([X_new, X.std(axis=1)])
    X_new = np.column_stack([X_new, X.max(axis=1)])
    X_new = np.column_stack([X_new, X.min(axis=1)])
    X_new = np.column_stack([X_new, X.max(axis=1) - X.min(axis=1)])
    print(f"Original number of features: {X.shape[1]}")
    print(f"New number of features: {X_new.shape[1]}")
    return X_new

X_stats = create_statistical_features(X)
Group Statistics Features #
python
def create_group_statistics(X, group_feature_idx, value_feature_idx):
    """Append per-group mean and std of a value column, keyed by a (categorical) group column."""
    groups = X[:, group_feature_idx]
    values = X[:, value_feature_idx]
    group_means = {}
    group_stds = {}
    for g in np.unique(groups):
        mask = groups == g
        group_means[g] = values[mask].mean()
        group_stds[g] = values[mask].std()
    group_mean_feature = np.array([group_means[g] for g in groups])
    group_std_feature = np.array([group_stds[g] for g in groups])
    X_new = np.column_stack([X, group_mean_feature, group_std_feature])
    return X_new
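When the data lives in a DataFrame, the same group statistics are usually expressed with groupby().transform(); a minimal sketch with made-up 'group' and 'value' columns:
python
df_groups = pd.DataFrame({
    'group': ['a', 'a', 'b', 'b', 'b'],
    'value': [1.0, 3.0, 2.0, 4.0, 6.0],
})
# transform() broadcasts each group's statistic back onto the individual rows
df_groups['value_group_mean'] = df_groups.groupby('group')['value'].transform('mean')
df_groups['value_group_std'] = df_groups.groupby('group')['value'].transform('std')
print(df_groups)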
Time Features #
python
def create_time_features(dates):
    """Expand a sequence of dates into calendar-based features."""
    import pandas as pd
    # pd.to_datetime on a list returns a DatetimeIndex; wrap it in a Series to use the .dt accessor
    dates = pd.Series(pd.to_datetime(dates))
    features = pd.DataFrame({
        'year': dates.dt.year,
        'month': dates.dt.month,
        'day': dates.dt.day,
        'dayofweek': dates.dt.dayofweek,
        'dayofyear': dates.dt.dayofyear,
        'weekofyear': dates.dt.isocalendar().week,
        'quarter': dates.dt.quarter,
        'is_weekend': (dates.dt.dayofweek >= 5).astype(int),
        'is_month_start': dates.dt.is_month_start.astype(int),
        'is_month_end': dates.dt.is_month_end.astype(int)
    })
    return features
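Fields like month and day of week are cyclical (December wraps around to January). A common follow-up, sketched below as a hypothetical helper on the DataFrame returned by create_time_features, is to add sine/cosine encodings that preserve this continuity:
python
def add_cyclical_features(features):
    """Add sin/cos encodings for cyclical calendar fields (hypothetical helper)."""
    features = features.copy()
    features['month_sin'] = np.sin(2 * np.pi * features['month'] / 12)
    features['month_cos'] = np.cos(2 * np.pi * features['month'] / 12)
    features['dayofweek_sin'] = np.sin(2 * np.pi * features['dayofweek'] / 7)
    features['dayofweek_cos'] = np.cos(2 * np.pi * features['dayofweek'] / 7)
    return features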
Feature Scaling #
Standardization #
python
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(f"原始均值: {X.mean(axis=0)[:5]}")
print(f"缩放后均值: {X_scaled.mean(axis=0)[:5]}")
Normalization #
python
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)
print(f"原始范围: [{X.min():.2f}, {X.max():.2f}]")
print(f"归一化后范围: [{X_normalized.min():.2f}, {X_normalized.max():.2f}]")
Feature Encoding #
Label Encoding #
python
from sklearn.preprocessing import LabelEncoder
categories = ['A', 'B', 'C', 'A', 'B', 'C', 'A']
encoder = LabelEncoder()
encoded = encoder.fit_transform(categories)
print(f"原始数据: {categories}")
print(f"编码后: {encoded}")
print(f"类别: {encoder.classes_}")
Frequency Encoding #
python
def frequency_encode(series):
    """Replace each category with its relative frequency in the data."""
    freq = series.value_counts(normalize=True)
    return series.map(freq)

categories = pd.Series(['A', 'B', 'C', 'A', 'B', 'C', 'A', 'A', 'B'])
freq_encoded = frequency_encode(categories)
print(f"Original data: {categories.tolist()}")
print(f"Frequency encoded: {freq_encoded.tolist()}")
Target Encoding #
python
def target_encode(categories, target, smoothing=1.0):
    """Encode each category with the smoothed mean of the target within that category."""
    categories = pd.Series(categories)
    target = pd.Series(target)
    df = pd.DataFrame({'category': categories, 'target': target})
    global_mean = target.mean()
    stats = df.groupby('category')['target'].agg(['mean', 'count'])
    # shrink small categories toward the global mean
    smoothed_mean = (stats['count'] * stats['mean'] + smoothing * global_mean) / (stats['count'] + smoothing)
    return categories.map(smoothed_mean)

categories = ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C']
target = np.array([1, 0, 1, 1, 0, 0, 1, 0, 1])
target_encoded = target_encode(categories, target)
print(f"Target encoded: {target_encoded.tolist()}")
Dimensionality Reduction #
PCA #
python
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X)
print(f"原始维度: {X.shape[1]}")
print(f"降维后: {X_pca.shape[1]}")
print(f"解释方差比: {pca.explained_variance_ratio_.sum():.4f}")
Feature Clustering #
python
from sklearn.cluster import FeatureAgglomeration
agglo = FeatureAgglomeration(n_clusters=10)
X_clustered = agglo.fit_transform(X)
print(f"原始维度: {X.shape[1]}")
print(f"聚类后: {X_clustered.shape[1]}")
Complete Example #
python
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt
data = load_breast_cancer()
X, y = data.data, data.target
feature_names = data.feature_names
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
train_data = lgb.Dataset(X_train_scaled, label=y_train)
valid_data = lgb.Dataset(X_test_scaled, label=y_test, reference=train_data)
params = {
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'verbose': -1
}
model = lgb.train(
    params, train_data, num_boost_round=100,
    valid_sets=[valid_data],
    callbacks=[lgb.log_evaluation(20)]
)
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': model.feature_importance(importance_type='gain')
}).sort_values('importance', ascending=False)
print("\nTop 10 important features:")
print(importance_df.head(10))
lgb.plot_importance(model, max_num_features=15, figsize=(10, 8))
plt.title("Feature Importance")
plt.tight_layout()
plt.show()
selector = SelectKBest(f_classif, k=15)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)
selected_features = feature_names[selector.get_support()]
print(f"\n选择的特征: {selected_features}")
train_data_selected = lgb.Dataset(X_train_selected, label=y_train)
valid_data_selected = lgb.Dataset(X_test_selected, label=y_test, reference=train_data_selected)
model_selected = lgb.train(
    params, train_data_selected, num_boost_round=100,
    valid_sets=[valid_data_selected],
    callbacks=[lgb.log_evaluation(20)]
)
print(f"\nAll-features model AUC: {model.best_score['valid_0']['auc']:.4f}")
print(f"Selected-features model AUC: {model_selected.best_score['valid_0']['auc']:.4f}")
Next Steps #
Now that you have a handle on feature engineering with LightGBM, continue with Parameter Tuning Tips to learn how to optimize the model's hyperparameters!