Categorical Feature Handling #
Why Do Categorical Features Matter? #
LightGBM supports categorical features natively, which is one of its key advantages over other GBDT frameworks.
text
┌───────────────────────────────────────────────────────┐
│ Categorical feature handling: two approaches          │
├───────────────────────────────────────────────────────┤
│                                                       │
│ Traditional approach (one-hot encoding):              │
│  - High-cardinality feature → many new columns        │
│  - Sparse matrices, low computational efficiency      │
│  - Feature-dimension explosion                        │
│                                                       │
│ LightGBM native support:                              │
│  - No one-hot encoding needed                         │
│  - Optimal split-finding algorithm                    │
│  - Handles high-cardinality features efficiently      │
│                                                       │
└───────────────────────────────────────────────────────┘
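To make the dimensionality point concrete, here is a tiny illustration with purely synthetic data (not used in the sections below): a single high-cardinality column explodes into roughly one indicator column per category under one-hot encoding, while native support keeps it as one column.
python
import numpy as np
import pandas as pd

# One column with ~1,000 distinct values...
city = pd.Series(np.random.choice([f'city_{i}' for i in range(1000)], size=100_000))

# ...becomes roughly 1,000 indicator columns under one-hot encoding,
# but stays a single column when LightGBM handles it natively.
print(pd.get_dummies(city).shape)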
Basic Usage #
Specifying Categorical Features #
python
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Toy dataset with two numeric and two categorical columns
df = pd.DataFrame({
    'num_feature1': np.random.randn(1000),
    'num_feature2': np.random.randn(1000),
    'cat_feature1': np.random.choice(['A', 'B', 'C', 'D'], 1000),
    'cat_feature2': np.random.choice(['X', 'Y', 'Z'], 1000),
    'target': np.random.randint(0, 2, 1000)
})

# Cast the categorical columns to the pandas 'category' dtype
df['cat_feature1'] = df['cat_feature1'].astype('category')
df['cat_feature2'] = df['cat_feature2'].astype('category')

X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Tell LightGBM which columns are categorical (by name)
train_data = lgb.Dataset(
    X_train, label=y_train,
    categorical_feature=['cat_feature1', 'cat_feature2']
)

params = {'objective': 'binary', 'verbose': -1}
model = lgb.train(params, train_data, num_boost_round=100)
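The same works through the scikit-learn interface. A minimal sketch, reusing `X_train`, `y_train` and `X_test` from above: because the categorical columns already use the pandas 'category' dtype, the wrapper treats them as categorical by default.
python
from lightgbm import LGBMClassifier

# Columns with pandas 'category' dtype are picked up as categorical automatically
clf = LGBMClassifier(objective='binary', n_estimators=100, verbose=-1)
clf.fit(X_train, y_train)
print(clf.predict_proba(X_test)[:5])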
Specifying by Index #
python
# With a plain numpy array, categorical columns must be integer-encoded first
# (LightGBM does not accept string values in numeric input).
X_train_np = X_train.copy()
for col in ['cat_feature1', 'cat_feature2']:
    X_train_np[col] = X_train_np[col].cat.codes
train_data = lgb.Dataset(X_train_np.values, label=y_train.values,
                         categorical_feature=[2, 3])
Optimal Split Algorithm #
How the Algorithm Works #
LightGBM handles categorical features with an optimal split search driven by the objective function: within each node it sorts the categories by their accumulated gradient-to-Hessian ratio and then scans contiguous prefixes of that ordering for the best binary partition, following Fisher's classic result on optimal grouping. A simplified sketch:
python
def optimal_categorical_split(values, gradients, hessians):
    """
    Find the best binary partition of a categorical feature.

    Args:
        values: array of category values
        gradients: array of per-sample gradients
        hessians: array of per-sample Hessians
    Returns:
        best_split: set of categories sent to the left child
        best_gain: gain of the best split
    """
    unique_values = np.unique(values)
    # Accumulate gradient/Hessian statistics per category
    grad_sum = {}
    hess_sum = {}
    count = {}
    for val in unique_values:
        mask = values == val
        grad_sum[val] = np.sum(gradients[mask])
        hess_sum[val] = np.sum(hessians[mask])
        count[val] = np.sum(mask)  # could back a min_data_per_group-style check (not done in this sketch)
    # Sort categories by gradient/Hessian ratio, then scan contiguous prefixes
    sorted_values = sorted(unique_values, key=lambda x: grad_sum[x] / (hess_sum[x] + 1e-10))
    best_gain = -np.inf
    best_split = None
    for i in range(1, len(sorted_values)):
        left_values = set(sorted_values[:i])
        right_values = set(sorted_values[i:])
        left_grad = sum(grad_sum[v] for v in left_values)
        left_hess = sum(hess_sum[v] for v in left_values)
        right_grad = sum(grad_sum[v] for v in right_values)
        right_hess = sum(hess_sum[v] for v in right_values)
        total_grad = left_grad + right_grad
        total_hess = left_hess + right_hess
        # Split gain with a small L2 term (0.1) added to each leaf's Hessian
        left_score = (left_grad ** 2) / (left_hess + 0.1)
        right_score = (right_grad ** 2) / (right_hess + 0.1)
        total_score = (total_grad ** 2) / (total_hess + 0.1)
        gain = left_score + right_score - total_score
        if gain > best_gain:
            best_gain = gain
            best_split = left_values
    return best_split, best_gain

values = np.array(['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B'])
gradients = np.array([0.1, -0.2, 0.3, 0.1, -0.3, 0.2, 0.1, -0.2])
hessians = np.array([0.5, 0.6, 0.4, 0.5, 0.6, 0.4, 0.5, 0.6])
split, gain = optimal_categorical_split(values, gradients, hessians)
print(f"Best split (left group): {split}")
print(f"Best gain: {gain:.4f}")
Encoding Method Comparison #
One-Hot Encoding vs Native Support #
python
from sklearn.preprocessing import OneHotEncoder
import time

def compare_encoding_methods():
    """Compare one-hot encoding with LightGBM's native categorical handling."""
    n_samples = 10000
    n_categories = 100
    cat_feature = np.random.choice([f'cat_{i}' for i in range(n_categories)], n_samples)
    num_feature = np.random.randn(n_samples, 10)
    y = np.random.randint(0, 2, n_samples)

    print("Method 1: one-hot encoding")
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    cat_encoded = encoder.fit_transform(cat_feature.reshape(-1, 1))
    X_onehot = np.column_stack([num_feature, cat_encoded])
    print(f"Feature dimension: {X_onehot.shape[1]}")
    start = time.time()
    train_data = lgb.Dataset(X_onehot, label=y)
    params = {'objective': 'binary', 'verbose': -1}
    model = lgb.train(params, train_data, num_boost_round=100)
    onehot_time = time.time() - start
    print(f"Training time: {onehot_time:.2f}s")

    print("\nMethod 2: native categorical support")
    # Integer-encode the categories first: a numpy matrix with string values
    # is not accepted by LightGBM.
    _, cat_codes = np.unique(cat_feature, return_inverse=True)
    X_native = np.column_stack([num_feature, cat_codes])
    start = time.time()
    train_data = lgb.Dataset(X_native, label=y, categorical_feature=[10])
    params = {'objective': 'binary', 'verbose': -1}
    model = lgb.train(params, train_data, num_boost_round=100)
    native_time = time.time() - start
    print(f"Training time: {native_time:.2f}s")
    print(f"\nSpeed-up: {onehot_time / native_time:.2f}x")

compare_encoding_methods()
Label Encoding #
python
from sklearn.preprocessing import LabelEncoder

def label_encode_features(df, categorical_columns):
    """Label-encode the given columns; return the encoded frame and the fitted encoders."""
    df_encoded = df.copy()
    encoders = {}
    for col in categorical_columns:
        encoder = LabelEncoder()
        df_encoded[col] = encoder.fit_transform(df[col].astype(str))
        encoders[col] = encoder  # keep the encoder so new data can be transformed consistently
    return df_encoded, encoders

df = pd.DataFrame({
    'cat1': ['A', 'B', 'C', 'A', 'B'],
    'cat2': ['X', 'Y', 'Z', 'X', 'Y']
})
df_encoded, encoders = label_encode_features(df, ['cat1', 'cat2'])
print(df_encoded)
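Label encoding by itself imposes an arbitrary integer order on the categories. If you feed the encoded columns to LightGBM, declare them as categorical so the integers are treated as unordered IDs rather than ordinal values. A minimal sketch reusing `df_encoded` from above, with a made-up target just to show the API call (the frame is far too small to train anything meaningful):
python
# Hypothetical binary target, for illustration only
y = pd.Series([1, 0, 1, 0, 1])

# Declare the label-encoded columns as categorical so LightGBM does not
# interpret the integer codes as an ordered numeric feature.
train_data = lgb.Dataset(df_encoded, label=y, categorical_feature=['cat1', 'cat2'])
model = lgb.train({'objective': 'binary', 'verbose': -1}, train_data, num_boost_round=10)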
Target Encoding #
python
def target_encode(train_series, target, test_series=None, smoothing=1.0):
    """Smoothed target (mean) encoding of a categorical series."""
    target_mean = target.mean()
    # Per-category mean and count of the target
    stats = pd.DataFrame({
        'category': train_series,
        'target': target
    }).groupby('category')['target'].agg(['mean', 'count'])
    # Shrink category means toward the global mean; rare categories shrink more
    smoothed_mean = (stats['count'] * stats['mean'] + smoothing * target_mean) / (stats['count'] + smoothing)
    train_encoded = train_series.map(smoothed_mean)
    if test_series is not None:
        test_encoded = test_series.map(smoothed_mean)
        test_encoded = test_encoded.fillna(target_mean)  # unseen categories fall back to the global mean
        return train_encoded, test_encoded
    return train_encoded

categories = pd.Series(['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B'])
target = pd.Series([1, 0, 1, 1, 0, 0, 1, 0])
encoded = target_encode(categories, target)
print(encoded)
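One caveat: computing the encoding from the same rows it is applied to leaks the target and tends to inflate validation scores. A common remedy is out-of-fold encoding, where each fold is encoded with statistics fitted on the other folds. A minimal sketch built on the `target_encode` helper above:
python
from sklearn.model_selection import KFold

def target_encode_oof(train_series, target, n_splits=5, smoothing=1.0):
    """Out-of-fold target encoding to limit target leakage."""
    encoded = pd.Series(index=train_series.index, dtype=float)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    for fit_idx, enc_idx in kf.split(train_series):
        # Fit the encoding on one part of the data, apply it to the held-out part
        _, fold_encoded = target_encode(
            train_series.iloc[fit_idx], target.iloc[fit_idx],
            test_series=train_series.iloc[enc_idx], smoothing=smoothing
        )
        encoded.iloc[enc_idx] = fold_encoded.values
    return encoded

print(target_encode_oof(categories, target, n_splits=4))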
High-Cardinality Features #
Handling Many Categories #
python
def handle_high_cardinality(df, column, max_categories=50):
    """Keep the most frequent categories and lump the rest into 'OTHER'."""
    value_counts = df[column].value_counts()
    top_categories = value_counts.head(max_categories - 1).index.tolist()
    df_new = df.copy()
    df_new[column] = df[column].apply(
        lambda x: x if x in top_categories else 'OTHER'
    )
    print(f"Original number of categories: {len(value_counts)}")
    print(f"Number of categories after grouping: {len(df_new[column].unique())}")
    return df_new

df = pd.DataFrame({
    'high_card': [f'cat_{i}' for i in np.random.randint(0, 1000, 10000)]
})
df_processed = handle_high_cardinality(df, 'high_card', max_categories=20)
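A small variation on the same idea is to group by a frequency threshold instead of a fixed top-N, which is sometimes easier to reason about. The helper below is a sketch with names of my own choosing, applied to the `df` built above:
python
def group_rare_categories(df, column, min_count=50, other_label='OTHER'):
    """Merge categories seen fewer than min_count times into a single bucket."""
    counts = df[column].value_counts()
    rare = set(counts[counts < min_count].index)
    df_new = df.copy()
    # Keep frequent values, replace everything else with the bucket label
    df_new[column] = df[column].where(~df[column].isin(rare), other_label)
    return df_new

df_grouped = group_rare_categories(df, 'high_card', min_count=20)
print(df_grouped['high_card'].nunique())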
Parameter Configuration #
Categorical-Feature Parameters #
python
params = {
    'objective': 'binary',
    'categorical_column': [2, 3],  # alias of categorical_feature; column indices
    'min_data_per_group': 100,     # minimum amount of data per categorical group
    'cat_smooth': 10.0,            # smoothing for categorical splits
    'cat_l2': 10.0,                # extra L2 regularization for categorical splits
    'max_cat_threshold': 32,       # max number of split points considered per categorical feature
    'max_cat_to_onehot': 4,        # features with <= 4 categories use one-vs-other splits
    'verbose': -1
}
Parameter Reference #
| Parameter | Description | Default |
|---|---|---|
| min_data_per_group | Minimum amount of data per categorical group | 100 |
| cat_smooth | Smoothing for categorical splits; reduces the influence of noisy, low-count categories | 10.0 |
| cat_l2 | L2 regularization applied to categorical splits | 10.0 |
| max_cat_threshold | Maximum number of split points considered for a categorical feature | 32 |
| max_cat_to_onehot | Features with at most this many categories use one-vs-other splits | 4 |
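These parameters are easier to choose when you know the cardinality of each categorical column. The small helper below (my own naming, assuming a DataFrame with 'category' dtype columns such as `X_train` from the earlier example) reports it, along with which split algorithm would apply under the default `max_cat_to_onehot=4`:
python
def report_cardinality(df):
    """Print the number of distinct values for every 'category' column."""
    for col in df.select_dtypes(include='category').columns:
        n = df[col].nunique()
        print(f"{col}: {n} categories "
              f"({'one-vs-other' if n <= 4 else 'many-vs-many'} split with max_cat_to_onehot=4)")

report_cardinality(X_train)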
Complete Example #
python
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

np.random.seed(42)
n_samples = 5000

# Synthetic dataset mixing numeric and categorical features
df = pd.DataFrame({
    'age': np.random.randint(18, 70, n_samples),
    'income': np.random.randn(n_samples) * 10000 + 50000,
    'city': np.random.choice(['Beijing', 'Shanghai', 'Guangzhou', 'Shenzhen', 'Other'], n_samples),
    'education': np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], n_samples),
    'gender': np.random.choice(['M', 'F'], n_samples)
})

# The target depends on both numeric and categorical columns, plus noise
df['target'] = (
    (df['age'] > 40).astype(int) * 0.3 +
    (df['income'] > 60000).astype(int) * 0.3 +
    (df['city'].isin(['Beijing', 'Shanghai'])).astype(int) * 0.2 +
    (df['education'].isin(['Master', 'PhD'])).astype(int) * 0.2
)
df['target'] = (df['target'] + np.random.randn(n_samples) * 0.1 > 0.5).astype(int)

# Cast categorical columns and build train/validation Datasets
categorical_features = ['city', 'education', 'gender']
for col in categorical_features:
    df[col] = df[col].astype('category')

X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_data = lgb.Dataset(
    X_train, label=y_train,
    categorical_feature=categorical_features
)
valid_data = lgb.Dataset(
    X_test, label=y_test,
    reference=train_data,
    categorical_feature=categorical_features
)

params = {
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'min_data_per_group': 50,
    'cat_smooth': 10.0,
    'verbose': -1
}

print("Training model...")
model = lgb.train(
    params, train_data, num_boost_round=500,
    valid_sets=[valid_data],
    callbacks=[
        lgb.log_evaluation(100),
        lgb.early_stopping(50)
    ]
)

# Evaluate on the held-out set
y_pred = model.predict(X_test)
auc = roc_auc_score(y_test, y_pred)
print(f"\nTest AUC: {auc:.4f}")

# Plot the top feature importances
lgb.plot_importance(model, max_num_features=10, figsize=(10, 6))
plt.title("Feature Importance")
plt.tight_layout()
plt.show()
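One practical caveat for serving the model: new data should carry the same category encoding as the training data. When the model was trained from a pandas DataFrame, LightGBM remembers the training categories, but it is still good practice to build new frames with the same 'category' dtype; values never seen in training end up as missing after aligning to the training categories. A minimal sketch, reusing `model`, `X_train` and `categorical_features` from the example above; the new records and the unseen city are made up for illustration:
python
# Hypothetical new records arriving at prediction time
new_data = pd.DataFrame({
    'age': [35, 52],
    'income': [58000.0, 72000.0],
    'city': ['Beijing', 'Chengdu'],        # 'Chengdu' was never seen in training
    'education': ['Master', 'Bachelor'],
    'gender': ['F', 'M']
})

# Reuse the categories learned from the training data; unseen values become NaN
for col in categorical_features:
    new_data[col] = pd.Categorical(new_data[col], categories=X_train[col].cat.categories)

print(model.predict(new_data))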
Next Steps #
Now that you know how LightGBM handles categorical features, continue with Missing Value Handling to learn how to deal with missing values in your data!
Last updated: 2026-04-04