分类任务实战 #

二分类实战:信用卡欺诈检测 #

问题描述 #

text
┌─────────────────────────────────────────────────────────────┐
│                    信用卡欺诈检测                            │
├─────────────────────────────────────────────────────────────┤
│                                                              │
│  目标:识别信用卡交易是否为欺诈                              │
│                                                              │
│  挑战:                                                      │
│  - 极度不平衡的数据(欺诈 < 1%)                             │
│  - 需要高召回率(不能漏掉欺诈)                              │
│  - 实时性要求                                                │
│                                                              │
│  评估指标:                                                  │
│  - AUC-ROC                                                   │
│  - Precision-Recall AUC                                      │
│  - F1 Score                                                  │
│                                                              │
└─────────────────────────────────────────────────────────────┘

完整代码实现 #

python
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, classification_report,
    confusion_matrix, roc_curve, precision_recall_curve
)
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# 1. Data loading and exploration
def load_and_explore_data():
    """Build a simulated, heavily imbalanced binary dataset and print basic stats.

    Returns:
        (X, y): feature matrix and 0/1 label vector (~1% positives = "fraud").
    """
    # Simulated data (swap in the real transaction data in production).
    from sklearn.datasets import make_classification

    gen_kwargs = dict(
        n_samples=100000,
        n_features=20,
        n_informative=15,
        n_redundant=5,
        n_clusters_per_class=3,
        weights=[0.99, 0.01],  # ~1% fraud
        random_state=42,
    )
    features, labels = make_classification(**gen_kwargs)

    print("数据集信息:")
    print(f"  样本数: {len(features)}")
    print(f"  特征数: {features.shape[1]}")
    print(f"  欺诈比例: {labels.mean():.4f}")
    print(f"  欺诈样本: {labels.sum()}")

    return features, labels

X, y = load_and_explore_data()

# 2. Data preprocessing
def preprocess_data(X, y):
    """Split into stratified train/test sets and standardize the features.

    Args:
        X: feature matrix.
        y: label vector.

    Returns:
        X_train, X_test, y_train, y_test
    """
    # Stratified 80/20 split keeps the rare-class ratio identical in both sets.
    split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    X_train, X_test, y_train, y_test = split

    # Standardization (optional — tree-based XGBoost is insensitive to scaling).
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    print(f"\n训练集: {len(X_train)} 样本")
    print(f"测试集: {len(X_test)} 样本")

    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = preprocess_data(X, y)

# 3. Handling class imbalance
def handle_imbalance(X_train, y_train):
    """Compute the negative/positive count ratio used as XGBoost's scale_pos_weight.

    Args:
        X_train: unused; kept so the step signature mirrors the other helpers.
        y_train: 0/1 label array (numpy boolean masking is used on it).

    Returns:
        float: count(y == 0) / count(y == 1).
    """
    negatives = len(y_train[y_train == 0])
    positives = len(y_train[y_train == 1])
    scale_pos_weight = negatives / positives

    print(f"\n正负样本比例: {scale_pos_weight:.2f}")

    return scale_pos_weight

# Negative/positive ratio, fed into the 'scale_pos_weight' training parameter below.
scale_pos_weight = handle_imbalance(X_train, y_train)

# 4. Create DMatrix — XGBoost's internal data container for train/test sets
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# 5. Parameter configuration
params = {
    'objective': 'binary:logistic',     # binary classification, outputs probabilities
    'eval_metric': ['logloss', 'auc'],  # track both loss and ranking quality per round
    'max_depth': 6,
    'eta': 0.1,                         # learning rate
    'subsample': 0.8,                   # row sampling per tree
    'colsample_bytree': 0.8,            # column sampling per tree
    'scale_pos_weight': scale_pos_weight,  # up-weight the rare positive (fraud) class
    'seed': 42
}

# 6. Model training
def train_model(params, dtrain, dtest):
    """Train a boosted model with early stopping on the held-out test set.

    Args:
        params: xgboost training parameters.
        dtrain: training DMatrix.
        dtest: evaluation DMatrix used for early stopping.

    Returns:
        (model, evals_result): fitted booster and per-round metric history.
    """
    evals_result = {}

    # Watch both sets; stop once the test metric fails to improve for 50 rounds.
    watchlist = [(dtrain, 'train'), (dtest, 'test')]
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=500,
        evals=watchlist,
        early_stopping_rounds=50,
        evals_result=evals_result,
        verbose_eval=20,
    )

    print(f"\n最佳迭代: {model.best_iteration}")
    print(f"最佳分数: {model.best_score}")

    return model, evals_result

model, evals_result = train_model(params, dtrain, dtest)

# 7. Model evaluation
def evaluate_model(model, dtest, y_test):
    """Report threshold and ranking metrics for the fitted booster.

    Args:
        model: trained xgboost Booster.
        dtest: DMatrix holding the test features.
        y_test: true 0/1 labels for the test set.

    Returns:
        (y_pred_proba, y_pred): predicted probabilities and hard 0/1 labels.
    """
    y_pred_proba = model.predict(dtest)
    # Hard labels at the default 0.5 cut-off; in a fraud setting this threshold
    # would normally be tuned against the precision/recall trade-off.
    y_pred = np.where(y_pred_proba > 0.5, 1, 0)

    banner = "=" * 50
    print("\n" + banner)
    print("模型评估结果")
    print(banner)

    print(f"\n准确率: {accuracy_score(y_test, y_pred):.4f}")
    print(f"精确率: {precision_score(y_test, y_pred):.4f}")
    print(f"召回率: {recall_score(y_test, y_pred):.4f}")
    print(f"F1 分数: {f1_score(y_test, y_pred):.4f}")
    # Ranking metrics use the raw probabilities, not the thresholded labels.
    print(f"AUC-ROC: {roc_auc_score(y_test, y_pred_proba):.4f}")
    print(f"AUC-PR: {average_precision_score(y_test, y_pred_proba):.4f}")

    print("\n分类报告:")
    print(classification_report(y_test, y_pred, target_names=['正常', '欺诈']))

    print("\n混淆矩阵:")
    print(confusion_matrix(y_test, y_pred))

    return y_pred_proba, y_pred

y_pred_proba, y_pred = evaluate_model(model, dtest, y_test)

# 8. Visualization
def plot_results(evals_result, y_test, y_pred_proba):
    """Draw training curves plus ROC and PR curves in a 2x2 grid."""
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))

    # Top row: per-round metric histories for both data sets.
    curve_specs = [
        (axes[0, 0], 'logloss', 'Log Loss', 'Training Log Loss'),
        (axes[0, 1], 'auc', 'AUC', 'Training AUC'),
    ]
    for ax, metric, ylabel, title in curve_specs:
        ax.plot(evals_result['train'][metric], label='Train')
        ax.plot(evals_result['test'][metric], label='Test')
        ax.set_xlabel('Round')
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        ax.legend()
        ax.grid(True)

    # ROC curve with the chance diagonal for reference.
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_ax = axes[1, 0]
    roc_ax.plot(fpr, tpr, label=f'AUC = {roc_auc_score(y_test, y_pred_proba):.4f}')
    roc_ax.plot([0, 1], [0, 1], 'k--')
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title('ROC Curve')
    roc_ax.legend()
    roc_ax.grid(True)

    # PR curve — more informative than ROC under heavy class imbalance.
    precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
    pr_ax = axes[1, 1]
    pr_ax.plot(recall, precision, label=f'AUC = {average_precision_score(y_test, y_pred_proba):.4f}')
    pr_ax.set_xlabel('Recall')
    pr_ax.set_ylabel('Precision')
    pr_ax.set_title('Precision-Recall Curve')
    pr_ax.legend()
    pr_ax.grid(True)

    plt.tight_layout()
    plt.show()

plot_results(evals_result, y_test, y_pred_proba)

# 9. Feature importance
def plot_feature_importance(model, max_num_features=15):
    """Plot the top features ranked by total gain.

    Args:
        model: trained xgboost Booster.
        max_num_features: number of top features to display.
    """
    # BUG FIX: xgb.plot_importance creates its own figure when no Axes is
    # passed, so the original plt.figure(figsize=...) opened an extra, empty
    # window and the requested size was ignored. Create the Axes explicitly
    # and hand it to plot_importance instead.
    fig, ax = plt.subplots(figsize=(10, 6))
    xgb.plot_importance(model, ax=ax, max_num_features=max_num_features,
                        importance_type='gain')
    ax.set_title('Feature Importance (Gain)')
    fig.tight_layout()
    plt.show()

plot_feature_importance(model)

# 10. Persist the trained booster to disk in XGBoost's JSON serialization format
model.save_model('fraud_detection_model.json')
print("\n模型已保存为 fraud_detection_model.json")

多分类实战:鸢尾花分类 #

python
import numpy as np
import xgboost as xgb
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Load data
iris = load_iris()
X, y = iris.data, iris.target
feature_names = iris.feature_names  # column labels, reused when building DMatrix
target_names = iris.target_names  # class names for reports and plot ticks

print("数据集信息:")
print(f"  样本数: {len(X)}")
print(f"  特征数: {X.shape[1]}")
print(f"  类别数: {len(target_names)}")
print(f"  类别: {target_names}")

# 2. Split the data; stratify=y keeps the class proportions equal in both sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 3. Create DMatrix; feature_names carries readable column labels into importance plots
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_names)
dtest = xgb.DMatrix(X_test, label=y_test, feature_names=feature_names)

# 4. Parameter configuration
params = {
    'objective': 'multi:softprob',  # emit one probability per class
    'num_class': 3,                 # required by multi-class objectives
    'eval_metric': 'mlogloss',
    'max_depth': 4,
    'eta': 0.1,                     # learning rate
    'seed': 42
}

# 5. Train the model
evals_result = {}  # filled by xgb.train with per-round metric history
model = xgb.train(
    params,
    dtrain,
    num_boost_round=200,
    evals=[(dtrain, 'train'), (dtest, 'test')],
    early_stopping_rounds=20,  # stop when test mlogloss stops improving for 20 rounds
    evals_result=evals_result,
    verbose_eval=20
)

# 6. Predict — 'multi:softprob' returns a (n_samples, num_class) probability matrix
y_pred_proba = model.predict(dtest)
y_pred = np.argmax(y_pred_proba, axis=1)  # pick the most probable class per row

# 7. Evaluate
print("\n" + "="*50)
print("模型评估结果")
print("="*50)
print(f"\n准确率: {accuracy_score(y_test, y_pred):.4f}")
print("\n分类报告:")
print(classification_report(y_test, y_pred, target_names=target_names))

# 8. Confusion matrix visualization (fmt='d' renders integer counts)
plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=target_names, yticklabels=target_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# 9. Feature importance
# BUG FIX: xgb.plot_importance creates its own figure when no Axes is given,
# so the bare plt.figure(figsize=...) left an extra empty window behind and
# the requested size was ignored. Create an Axes and pass it explicitly.
fig, ax = plt.subplots(figsize=(10, 6))
xgb.plot_importance(model, ax=ax, importance_type='gain')
ax.set_title('Feature Importance')
fig.tight_layout()
plt.show()

文本分类实战 #

python
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# 1. Simulated text data: ten distinct reviews repeated 100 times
texts = [
    "This product is great, I love it!",
    "Terrible experience, never buying again",
    "Amazing quality and fast delivery",
    "Worst product ever, total waste of money",
    "Highly recommend this to everyone",
    "Disappointed with the quality",
    "Best purchase I've made this year",
    "Not worth the price",
    "Excellent customer service",
    "Poor quality, returned immediately"
] * 100

labels = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0] * 100  # 1 = positive, 0 = negative; aligned with texts

# 2. Feature extraction: TF-IDF over the 1000 most frequent non-stopword terms
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(texts)  # scipy sparse matrix; xgb.DMatrix accepts it directly
y = np.array(labels)

print(f"特征维度: {X.shape}")

# 3. Split the data
# NOTE(review): no stratify=y here; acceptable since the classes are exactly balanced
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 4. Create DMatrix (built directly from the sparse TF-IDF matrix)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# 5. Train (fixed round count; no eval set / early stopping in this example)
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 6,
    'eta': 0.1
}

model = xgb.train(params, dtrain, num_boost_round=100)

# 6. Evaluate: threshold the predicted probabilities at 0.5 for hard labels
y_pred_proba = model.predict(dtest)
y_pred = (y_pred_proba > 0.5).astype(int)

print(f"\n准确率: {accuracy_score(y_test, y_pred):.4f}")
print("\n分类报告:")
print(classification_report(y_test, y_pred))

分类任务最佳实践 #

python
def classification_best_practices():
    """Print a checklist of classification best practices, grouped by workflow stage."""
    # Ordered (stage, items) pairs; printed in declaration order.
    checklist = [
        ('数据准备', [
            '检查类别分布',
            '处理类别不平衡',
            '正确划分训练/测试集',
        ]),
        ('模型训练', [
            '使用分层抽样',
            '设置早停策略',
            '监控训练过程',
        ]),
        ('模型评估', [
            '选择合适的评估指标',
            '关注少数类性能',
            '使用交叉验证',
        ]),
        ('模型优化', [
            '调整决策阈值',
            '尝试不同的采样策略',
            '集成多个模型',
        ]),
    ]

    for category, items in checklist:
        print(f"\n{category}:")
        for item in items:
            print(f"  • {item}")

classification_best_practices()

下一步 #

现在你已经掌握了分类任务实战,接下来学习 回归任务 了解 XGBoost 在回归问题中的应用!

最后更新:2026-04-04