支持向量机 #

概述 #

支持向量机(Support Vector Machine, SVM)是一种强大的监督学习算法,通过寻找最优超平面来实现分类或回归。

核心思想 #

SVM 寻找能够最大化类别间隔的超平面:

text
        类别 B
            ○  ○  ○
               ○  ○
    ───────────────────  最大间隔超平面
               ×  ×
            ×  ×  ×
        类别 A

    ───────────────────  支持向量边界
               ○  ○
    ───────────────────  支持向量边界

SVM 类型 #

类型 用途
线性 SVM 线性可分数据 LinearSVC, LinearSVR
非线性 SVM 复杂边界数据 SVC, SVR

线性 SVM #

线性分类 #

python
# Linear SVM classification on a synthetic dataset.
from sklearn.svm import LinearSVC
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Binary classification problem: 10 features, 5 of them informative.
X, y = make_classification(
    n_samples=1000, n_features=10, n_informative=5,
    n_redundant=0, random_state=42
)

# Hold out a test set (default 25%) for an unbiased accuracy estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# dual='auto' lets scikit-learn pick the primal/dual formulation
# based on n_samples vs. n_features.
clf = LinearSVC(dual='auto', random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(f"准确率: {accuracy_score(y_test, y_pred):.4f}")

线性回归 #

python
# Linear SVM regression (LinearSVR) on synthetic data.
from sklearn.svm import LinearSVR
from sklearn.datasets import make_regression
from sklearn.metrics import r2_score

# 10-feature regression problem with Gaussian noise.
X, y = make_regression(n_samples=1000, n_features=10, noise=10, random_state=42)
# NOTE(review): train_test_split is imported by the earlier classification
# snippet; this fragment relies on that import already being in scope.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

reg = LinearSVR(dual='auto', random_state=42)
reg.fit(X_train, y_train)

y_pred = reg.predict(X_test)
print(f"R²: {r2_score(y_test, y_pred):.4f}")

参数说明 #

参数 描述 默认值
C 正则化参数 1.0
loss 损失函数 'squared_hinge'
penalty 正则化类型 'l2'
dual 对偶问题 'auto'

非线性 SVM #

使用核函数 #

python
# Non-linear SVM with an RBF kernel on a 2-feature dataset.
from sklearn.svm import SVC

# 2 informative features so the decision boundary can be visualized.
X, y = make_classification(
    n_samples=500, n_features=2, n_redundant=0,
    n_informative=2, n_clusters_per_class=1, random_state=42
)

# BUG FIX: the original fitted on X_train/y_train left over from the
# earlier 10-feature dataset instead of the data generated just above.
# Split the freshly generated data before fitting and scoring.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# gamma='scale' = 1 / (n_features * X.var()), the recommended default.
clf = SVC(kernel='rbf', C=1.0, gamma='scale')
clf.fit(X_train, y_train)

print(f"准确率: {clf.score(X_test, y_test):.4f}")

核函数类型 #

核函数 公式 适用场景
linear K(x, x') = x · x' 线性可分
poly K(x, x') = (γx·x' + r)^d 多项式关系
rbf K(x, x') = exp(-γ‖x-x'‖²) 复杂边界
sigmoid K(x, x') = tanh(γx·x' + r) 神经网络类似

核函数对比 #

python
# Fit one SVC per built-in kernel and report held-out accuracy for each,
# to see which kernel suits the current dataset best.
for kernel in ('linear', 'poly', 'rbf', 'sigmoid'):
    clf = SVC(kernel=kernel, random_state=42)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print(f"{kernel}: {score:.4f}")

核函数详解 #

RBF 核 #

python
# RBF-kernel SVC: the usual first choice for non-linear problems.
clf_rbf = SVC(kernel='rbf', C=1.0, gamma='scale')
clf_rbf.fit(X_train, y_train)

# Three ways to set gamma (these three are constructed but not fitted here):
# 'auto'  -> 1 / n_features
# 'scale' -> 1 / (n_features * X.var())
# float   -> fixed kernel width
clf_auto = SVC(kernel='rbf', gamma='auto')
clf_scale = SVC(kernel='rbf', gamma='scale')
clf_custom = SVC(kernel='rbf', gamma=0.1)

gamma 参数影响 #

python
import matplotlib.pyplot as plt
import numpy as np

gammas = [0.01, 0.1, 1, 10, 100]
scores = []

for gamma in gammas:
    clf = SVC(kernel='rbf', gamma=gamma)
    clf.fit(X_train, y_train)
    scores.append(clf.score(X_test, y_test))

plt.semilogx(gammas, scores, 'o-')
plt.xlabel('Gamma')
plt.ylabel('Accuracy')

多项式核 #

python
# Polynomial kernel: K(x, x') = (gamma * x.x' + coef0)^degree
clf_poly = SVC(
    kernel='poly',
    degree=3,       # polynomial degree d
    gamma='scale',
    coef0=1         # independent term r; shifts the kernel
)
clf_poly.fit(X_train, y_train)

正则化参数 C #

C 的作用 #

C 值 效果
大(如 100) 严格分类,可能过拟合
小(如 0.01) 允许误分类,可能欠拟合
python
# Trade-off study: a large C fits the training data tightly (overfit risk),
# a small C tolerates misclassifications (underfit risk). Comparing the
# train vs. test scores makes the gap visible.
for C in (0.01, 0.1, 1, 10, 100):
    clf = SVC(kernel='rbf', C=C)
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    print(f"C={C}: Train={train_score:.4f}, Test={test_score:.4f}")

多分类 #

策略 #

策略 描述
ovr 一对多(One-vs-Rest)
ovo 一对一(One-vs-One)
python
# Multiclass strategies demonstrated on iris (3 classes).
from sklearn.datasets import load_iris

iris = load_iris()
X, y = iris.data, iris.target

# decision_function_shape changes only the shape of decision_function's
# output ('ovr' vs. 'ovo'); training itself is unchanged.
clf_ovr = SVC(kernel='rbf', decision_function_shape='ovr')
clf_ovo = SVC(kernel='rbf', decision_function_shape='ovo')

clf_ovr.fit(X, y)
clf_ovo.fit(X, y)

# ovr: (n_samples, n_classes); ovo: (n_samples, n_classes*(n_classes-1)/2).
print("OVR decision shape:", clf_ovr.decision_function(X[:5]).shape)
print("OVO decision shape:", clf_ovo.decision_function(X[:5]).shape)

SVM 回归 #

SVR 基本使用 #

python
# Support Vector Regression with an RBF kernel.
from sklearn.svm import SVR

# Single-feature regression problem with noise.
X, y = make_regression(n_samples=500, n_features=1, noise=10, random_state=42)

# epsilon defines the tube within which residuals are not penalized.
svr = SVR(kernel='rbf', C=1.0, epsilon=0.1)
svr.fit(X, y)

# NOTE(review): this scores on the training data itself, so the R² is optimistic.
print(f"R²: {svr.score(X, y):.4f}")

epsilon 参数 #

python
# Effect of the epsilon-insensitive tube width on SVR fit quality.
# BUG FIX: the original looped over stale X_train/X_test left over from an
# unrelated earlier dataset; split the 1-feature regression data generated
# above so the scores are measured on a held-out part of the right data.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

epsilons = [0.01, 0.1, 0.5, 1.0]

for eps in epsilons:
    # Larger epsilon -> wider tube -> fewer support vectors, smoother fit.
    svr = SVR(kernel='rbf', epsilon=eps)
    svr.fit(X_train, y_train)
    print(f"ε={eps}: R²={svr.score(X_test, y_test):.4f}")

决策边界可视化 #

2D 可视化 #

python
# Visualize the RBF decision boundary on the two-moons dataset.
from sklearn.datasets import make_moons
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

X, y = make_moons(n_samples=200, noise=0.2, random_state=42)

clf = SVC(kernel='rbf', C=1.0, gamma='scale')
clf.fit(X, y)

# Build a dense prediction grid covering the data, with a 0.5 margin.
x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, 0.02),
    np.arange(y_min, y_max, 0.02)
)

# Predict a class for every grid point, then reshape back to grid form.
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Filled contours = predicted regions; scatter = the actual samples.
plt.contourf(xx, yy, Z, alpha=0.4, cmap=ListedColormap(['red', 'blue']))
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=ListedColormap(['red', 'blue']))
plt.title('SVM Decision Boundary (RBF Kernel)')

支持向量可视化 #

python
# Highlight the support vectors of the classifier fitted above:
# they are the only samples that determine the decision boundary.
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='bwr', alpha=0.5)
plt.scatter(
    clf.support_vectors_[:, 0],
    clf.support_vectors_[:, 1],
    s=100, facecolors='none', edgecolors='k'  # hollow black circles
)
plt.title('Support Vectors')
print(f"支持向量数量: {len(clf.support_vectors_)}")

数据预处理 #

特征缩放的重要性 #

python
# SVMs are distance-based, so feature scaling matters; a Pipeline
# guarantees the scaler is fitted on the training data only.
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='rbf'))
])

pipe.fit(X_train, y_train)
print(f"准确率: {pipe.score(X_test, y_test):.4f}")

不缩放 vs 缩放 #

python
# Side-by-side comparison: raw features vs. standardized features.
clf_no_scale = SVC(kernel='rbf')
clf_no_scale.fit(X_train, y_train)
score_no_scale = clf_no_scale.score(X_test, y_test)

# Same model, but with StandardScaler applied inside a Pipeline.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='rbf'))
])
pipe.fit(X_train, y_train)
score_scaled = pipe.score(X_test, y_test)

print(f"不缩放: {score_no_scale:.4f}")
print(f"缩放后: {score_scaled:.4f}")

超参数调优 #

GridSearchCV #

python
# Exhaustive grid search over C, gamma and kernel with 5-fold CV.
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.01, 0.1, 1],
    'kernel': ['rbf', 'poly']
}

grid_search = GridSearchCV(
    SVC(),
    param_grid,
    cv=5,               # 5-fold cross-validation
    scoring='accuracy',
    n_jobs=-1           # parallelize across all CPU cores
)

grid_search.fit(X_train, y_train)
print(f"最佳参数: {grid_search.best_params_}")
print(f"最佳分数: {grid_search.best_score_:.4f}")

RandomizedSearchCV #

python
# Randomized search: samples 50 settings from continuous log-uniform
# distributions — much cheaper than a full grid at similar quality.
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform

param_dist = {
    'C': loguniform(1e-1, 1e2),
    'gamma': loguniform(1e-3, 1e1),
    'kernel': ['rbf']
}

random_search = RandomizedSearchCV(
    SVC(),
    param_dist,
    n_iter=50,          # number of sampled parameter settings
    cv=5,
    random_state=42
)

random_search.fit(X_train, y_train)

大规模数据 #

使用 LinearSVC #

python
# For large datasets prefer LinearSVC: kernel SVC training cost grows
# super-linearly with n_samples, while liblinear scales much better.
from sklearn.svm import LinearSVC

clf = LinearSVC(dual='auto', random_state=42)
# NOTE(review): X_train_large / y_train_large are not defined anywhere in
# this document — they are placeholder names for a large training set.
clf.fit(X_train_large, y_train_large)

使用 SGDClassifier #

python
# SGDClassifier with hinge loss optimizes a linear-SVM objective via
# stochastic gradient descent — suitable for very large / streaming data.
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(loss='hinge', random_state=42)
clf.fit(X_train, y_train)

类别不平衡 #

使用 class_weight #

python
# 'balanced' weights each class inversely to its frequency:
# n_samples / (n_classes * bincount(y)).
clf = SVC(kernel='rbf', class_weight='balanced')
clf.fit(X_train, y_train)

手动设置权重 #

python
# Penalize mistakes on class 1 ten times as heavily as on class 0.
class_weights = {0: 1, 1: 10}
clf = SVC(kernel='rbf', class_weight=class_weights)

实战示例 #

手写数字识别 #

python
# Handwritten digit recognition (8x8 grayscale images, 10 classes).
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

digits = load_digits()
X, y = digits.data, digits.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Scale the pixel features, then an RBF-SVC with pre-tuned C and gamma.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='rbf', C=10, gamma=0.01))
])

pipe.fit(X_train, y_train)
print(f"准确率: {pipe.score(X_test, y_test):.4f}")

癌症诊断 #

python
# Breast-cancer diagnosis: binary classification with class imbalance.
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# class_weight='balanced' compensates for the benign/malignant imbalance.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='rbf', class_weight='balanced'))
])

pipe.fit(X_train, y_train)
print(f"准确率: {pipe.score(X_test, y_test):.4f}")

SVM 优缺点 #

优点 #

优点 描述
高维有效 在高维空间表现良好
内存高效 只使用支持向量
核函数灵活 可适应不同数据分布
泛化能力强 最大间隔原则

缺点 #

缺点 描述
大数据慢 训练时间复杂度高
参数敏感 需要调参
噪声敏感 对噪声和异常值敏感
可解释性差 黑盒模型

最佳实践 #

1. 特征缩放 #

python
# Always scale features before an SVM; using a Pipeline keeps the
# scaler's statistics confined to the training folds during CV.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

2. 选择核函数 #

python
# Rule of thumb: with more features than samples the data is often
# linearly separable, so a linear kernel suffices; otherwise try RBF.
# NOTE(review): n_features / n_samples are illustrative names here,
# not variables defined earlier in this document.
if n_features > n_samples:
    kernel = 'linear'
else:
    kernel = 'rbf'

3. 调参顺序 #

python
# Suggested tuning grids: coarse log-spaced C first, then refine gamma.
C_values = [0.1, 1, 10, 100]
gamma_values = ['scale', 'auto', 0.01, 0.1, 1]

4. 处理不平衡 #

python
# For imbalanced classes, reweight the errors rather than resampling.
clf = SVC(class_weight='balanced')

下一步 #

掌握 SVM 后,继续学习 集成方法 了解更强大的模型组合技术!

最后更新:2026-04-04