特征转换 #
概述 #
特征转换是将原始特征转换为更适合机器学习算法的新特征的过程。
转换类型 #
| 类型 | 描述 | 示例 |
|---|---|---|
| 数学变换 | 应用数学函数 | 对数、平方根 |
| 多项式扩展 | 生成多项式特征 | x², x*y |
| 离散化 | 连续值转离散 | 分箱 |
| 交互特征 | 特征组合 | x1 * x2 |
多项式特征 #
PolynomialFeatures #
python
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

# Three samples, two features each.
X = np.array([[1, 2], [3, 4], [5, 6]])

# degree=2 expands [x1, x2] to [1, x1, x2, x1^2, x1*x2, x2^2] -> 6 columns.
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
print(f"原始特征: {X.shape}")
print(f"多项式特征: {X_poly.shape}")
print(f"特征名: {poly.get_feature_names_out()}")
参数说明 #
| 参数 | 描述 | 默认值 |
|---|---|---|
| `degree` | 多项式次数 | 2 |
| `interaction_only` | 只生成交互项 | False |
| `include_bias` | 是否包含偏置项 | True |
python
poly = PolynomialFeatures(
degree=3,
interaction_only=True,
include_bias=False
)
X_poly = poly.fit_transform(X)
只生成交互项 #
python
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_inter = poly.fit_transform(X)
print(f"交互特征: {poly.get_feature_names_out()}")
多项式回归 #
python
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
X = np.linspace(0, 10, 100).reshape(-1, 1)
y = np.sin(X).ravel() + np.random.normal(0, 0.1, 100)
model = Pipeline([
('poly', PolynomialFeatures(degree=5)),
('linear', LinearRegression())
])
model.fit(X, y)
y_pred = model.predict(X)
plt.scatter(X, y, s=5, label='Data')
plt.plot(X, y_pred, 'r-', label='Polynomial Fit')
plt.legend()
FunctionTransformer #
基本使用 #
python
from sklearn.preprocessing import FunctionTransformer
import numpy as np
X = np.array([[1, 2], [3, 4]])
log_transformer = FunctionTransformer(np.log1p)
X_log = log_transformer.fit_transform(X)
print(f"原始数据:\n{X}")
print(f"对数转换:\n{X_log}")
自定义函数 #
python
def custom_transform(X):
    """Example element-wise transform for FunctionTransformer: x**2 + 1."""
    return 1 + X ** 2
transformer = FunctionTransformer(custom_transform)
X_transformed = transformer.fit_transform(X)
逆变换 #
python
transformer = FunctionTransformer(
func=np.log1p,
inverse_func=np.expm1
)
X_transformed = transformer.fit_transform(X)
X_original = transformer.inverse_transform(X_transformed)
常用转换函数 #
python
import numpy as np
transformers = {
'log': FunctionTransformer(np.log1p),
'sqrt': FunctionTransformer(np.sqrt),
'square': FunctionTransformer(np.square),
'reciprocal': FunctionTransformer(lambda x: 1 / (x + 1e-10))
}
自定义转换器 #
基于类实现 #
python
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
class LogTransformer(BaseEstimator, TransformerMixin):
    """Log-transform features as log(X + 1) in a configurable base.

    inverse_transform recovers the original values via base**X - 1.
    """

    def __init__(self, base=np.e):
        # Base of the logarithm (natural log by default).
        self.base = base

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn from the data.
        return self

    def transform(self, X):
        # Change-of-base formula: log_base(X + 1) = ln(X + 1) / ln(base).
        shifted = X + 1
        return np.log(shifted) / np.log(self.base)

    def inverse_transform(self, X):
        # Exact inverse of transform(): base**X - 1.
        return np.power(self.base, X) - 1
transformer = LogTransformer(base=10)
X_transformed = transformer.fit_transform(X)
特定列转换 #
python
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Select a subset of columns from a DataFrame or a 2-D array."""

    def __init__(self, columns):
        # Column labels (for DataFrames) or integer indices (for arrays).
        self.columns = columns

    def fit(self, X, y=None):
        # Nothing to fit.
        return self

    def transform(self, X):
        # Pandas objects expose `iloc`; use label/column selection for them,
        # positional slicing for plain ndarrays.
        if hasattr(X, 'iloc'):
            return X[self.columns]
        return X[:, self.columns]
selector = ColumnSelector(columns=[0, 2])
统计特征生成 #
python
class StatisticalFeatures(BaseEstimator, TransformerMixin):
    """Derive row-wise summary statistics as new features.

    For each sample produces five columns: mean, std, min, max,
    and range (max - min).
    """

    def fit(self, X, y=None):
        # Stateless: nothing to learn.
        return self

    def transform(self, X):
        X = np.array(X)
        row_min = X.min(axis=1)
        row_max = X.max(axis=1)
        columns = [
            X.mean(axis=1),
            X.std(axis=1),
            row_min,
            row_max,
            row_max - row_min,  # per-row range
        ]
        return np.column_stack(columns)
stat_transformer = StatisticalFeatures()
分箱转换 #
KBinsDiscretizer #
python
from sklearn.preprocessing import KBinsDiscretizer

# Ten one-dimensional samples with values 1..10.
X = np.array([[1], [2], [3], [4], [5], [6], [7], [8], [9], [10]])

discretizer = KBinsDiscretizer(
    n_bins=3,            # number of bins per feature
    encode='onehot',     # output is a one-hot encoded (sparse) matrix
    strategy='uniform'   # equal-width bins over the value range
)
X_binned = discretizer.fit_transform(X)
print(f"分箱边界: {discretizer.bin_edges_}")
编码方式 #
| 编码 | 描述 |
|---|---|
| `onehot` | 独热编码 |
| `onehot-dense` | 稠密独热编码 |
| `ordinal` | 序数编码 |
python
discretizer_ordinal = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')
X_ordinal = discretizer_ordinal.fit_transform(X)
分箱策略 #
| 策略 | 描述 |
|---|---|
| `uniform` | 等宽分箱 |
| `quantile` | 等频分箱 |
| `kmeans` | K-Means 分箱 |
特征交互 #
手动创建交互特征 #
python
import pandas as pd
df = pd.DataFrame({
'A': [1, 2, 3],
'B': [4, 5, 6]
})
df['A_times_B'] = df['A'] * df['B']
df['A_plus_B'] = df['A'] + df['B']
df['A_div_B'] = df['A'] / df['B']
print(df)
交互特征转换器 #
python
class InteractionFeatures(BaseEstimator, TransformerMixin):
    """Append pairwise interaction columns to the input matrix.

    interaction_type='multiply' appends x_i * x_j, 'add' appends
    x_i + x_j, for every pair i < j. The original columns are kept
    in front of the generated ones.
    """

    def __init__(self, interaction_type='multiply'):
        self.interaction_type = interaction_type

    def fit(self, X, y=None):
        # Stateless: nothing to learn.
        return self

    def transform(self, X):
        X = np.array(X)
        n = X.shape[1]
        # All unordered feature pairs (i, j) with i < j.
        pairs = [(i, j) for i in range(n) for j in range(i + 1, n)]
        if self.interaction_type == 'multiply':
            extra = [X[:, i] * X[:, j] for i, j in pairs]
        elif self.interaction_type == 'add':
            extra = [X[:, i] + X[:, j] for i, j in pairs]
        else:
            # Unknown type: emit no interaction columns (matches original).
            extra = []
        return np.column_stack([X] + extra)
数值转换 #
PowerTransformer #
python
from sklearn.preprocessing import PowerTransformer
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
pt = PowerTransformer(method='yeo-johnson')
X_transformed = pt.fit_transform(X)
print(f"λ 参数: {pt.lambdas_}")
Box-Cox 变换 #
python
pt_boxcox = PowerTransformer(method='box-cox')
X_positive = np.abs(X) + 0.1
X_boxcox = pt_boxcox.fit_transform(X_positive)
QuantileTransformer #
python
from sklearn.preprocessing import QuantileTransformer
qt = QuantileTransformer(
n_quantiles=100,
output_distribution='normal',
random_state=42
)
X_quantile = qt.fit_transform(X)
Pipeline 组合 #
完整特征工程 Pipeline #
python
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
# Fix: LogisticRegression was used below but never imported in this snippet.
from sklearn.linear_model import LogisticRegression

# Column groups: numeric features get scaled then expanded with
# degree-2 polynomial terms; categorical features pass through unchanged.
numeric_features = ['age', 'income']
categorical_features = ['city', 'gender']

numeric_transformer = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', 'passthrough', categorical_features)
])

# Full pipeline: preprocessing + classifier, fit/predict as one estimator.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])
FeatureUnion #
python
from sklearn.pipeline import FeatureUnion
combined_features = FeatureUnion([
('poly', PolynomialFeatures(degree=2)),
('log', FunctionTransformer(np.log1p))
])
pipe = Pipeline([
('features', combined_features),
('scaler', StandardScaler()),
('model', LinearRegression())
])
特定场景转换 #
时间特征提取 #
python
import pandas as pd
class TimeFeatures(BaseEstimator, TransformerMixin):
    """Expand a datetime column into calendar-based features.

    Adds year/month/day/hour/dayofweek columns plus an is_weekend
    flag (Saturday=5, Sunday=6) to a copy of the input DataFrame.
    """

    def __init__(self, time_column):
        # Name of the column to parse as datetimes.
        self.time_column = time_column

    def fit(self, X, y=None):
        # Stateless: nothing to learn.
        return self

    def transform(self, X):
        out = X.copy()
        timestamps = pd.to_datetime(out[self.time_column])
        out['year'] = timestamps.dt.year
        out['month'] = timestamps.dt.month
        out['day'] = timestamps.dt.day
        out['hour'] = timestamps.dt.hour
        out['dayofweek'] = timestamps.dt.dayofweek
        # dayofweek: Monday=0 ... Sunday=6, so 5/6 are the weekend.
        weekend = timestamps.dt.dayofweek.isin([5, 6])
        out['is_weekend'] = weekend.astype(int)
        return out
文本长度特征 #
python
class TextLengthFeatures(BaseEstimator, TransformerMixin):
    """Extract simple length statistics from raw text.

    For each document produces three columns: character count,
    word count, and unique-word count (whitespace tokenization).
    """

    def fit(self, X, y=None):
        # Stateless: nothing to learn.
        return self

    def transform(self, X):
        rows = []
        for text in X:
            words = text.split()
            rows.append([len(text), len(words), len(set(words))])
        return np.array(rows)
最佳实践 #
1. 选择合适的转换 #
| 数据特点 | 推荐转换 |
|---|---|
| 偏态分布 | PowerTransformer |
| 非线性关系 | PolynomialFeatures |
| 异常值 | QuantileTransformer |
| 零值多 | log1p |
2. 避免数据泄漏:转换器只在训练集上 fit,测试集仅 transform #
python
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
3. 使用 Pipeline #
python
pipe = Pipeline([
('transform', FunctionTransformer(np.log1p)),
('scale', StandardScaler()),
('model', LinearRegression())
])
4. 验证转换效果 #
python
import matplotlib.pyplot as plt
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
axes[0].hist(X, bins=30)
axes[0].set_title('Original')
axes[1].hist(X_transformed, bins=30)
axes[1].set_title('Transformed')
下一步 #
掌握特征转换后,继续学习 Pipeline流水线 了解如何组织完整的机器学习流程!
最后更新:2026-04-04