数据接口 #
Dataset 对象 #
创建 Dataset #
LightGBM 使用 Dataset 对象高效存储和处理数据:
python
import lightgbm as lgb
import numpy as np
import pandas as pd
data = np.random.randn(1000, 10)
label = np.random.randint(0, 2, 1000)
dataset = lgb.Dataset(data, label=label)
print(f"数据形状: {dataset.data.shape}")
print(f"标签形状: {dataset.label.shape}")
从不同数据源创建 #
从 NumPy 数组创建 #
python
import numpy as np
X = np.random.randn(1000, 20)
y = np.random.randint(0, 2, 1000)
train_data = lgb.Dataset(X, label=y)
从 Pandas DataFrame 创建 #
python
import pandas as pd
df = pd.DataFrame(np.random.randn(1000, 20),
columns=[f'feature_{i}' for i in range(20)])
df['target'] = np.random.randint(0, 2, 1000)
train_data = lgb.Dataset(df.drop('target', axis=1), label=df['target'])
从稀疏矩阵创建 #
python
from scipy.sparse import csr_matrix
sparse_data = csr_matrix(np.random.randn(1000, 100) * (np.random.rand(1000, 100) > 0.9))
train_data = lgb.Dataset(sparse_data, label=y)
Dataset 参数 #
python
dataset = lgb.Dataset(
data,
label=label,
weight=None,
group=None,
init_score=None,
feature_name='auto',
categorical_feature='auto',
params=None
)
参数说明 #
| 参数 | 说明 | 示例 |
|---|---|---|
| data | 特征数据 | numpy array, DataFrame |
| label | 标签数据 | array-like |
| weight | 样本权重 | array-like |
| group | 分组信息(排序任务) | array-like |
| init_score | 初始分数 | array-like |
| feature_name | 特征名称 | list |
| categorical_feature | 类别特征的索引或名称列表 | list |
| params | 其他参数 | dict |
设置样本权重 #
python
weights = np.random.rand(1000)
train_data = lgb.Dataset(X, label=y, weight=weights)
print(f"样本权重: {train_data.weight[:5]}")
设置特征名称 #
python
feature_names = [f'feature_{i}' for i in range(X.shape[1])]
train_data = lgb.Dataset(X, label=y, feature_name=feature_names)
print(f"特征名称: {train_data.feature_name[:5]}")
数据格式支持 #
CSV 文件 #
python
import pandas as pd
df = pd.read_csv('data.csv')
train_data = lgb.Dataset(df.drop('target', axis=1), label=df['target'])
直接加载 CSV:
python
# LightGBM reads file-loading options from the params dict, not from a URL
# query string (that syntax belongs to XGBoost's DMatrix).
train_data = lgb.Dataset('data.csv', params={'header': True, 'label_column': 0})
LibSVM 格式 #
python
train_data = lgb.Dataset('data.libsvm')
LibSVM 格式示例:
text
1 1:0.1 2:0.5 3:0.3
0 1:0.2 2:0.4 4:0.6
1 2:0.3 3:0.7 5:0.1
二进制文件 #
保存为二进制格式:
python
train_data = lgb.Dataset(X, label=y)
train_data.save_binary('train.bin')
加载二进制文件:
python
train_data = lgb.Dataset('train.bin')
内存优化 #
使用 free_raw_data #
python
train_data = lgb.Dataset(X, label=y, free_raw_data=True)
print(f"原始数据是否释放: {train_data.get_data() is None}")
分批加载数据 #
python
def batch_generator(batch_size=10000, n_samples=100000):
    """数据批次生成器 — yield synthetic (X, y) batches.

    Args:
        batch_size: maximum number of rows per batch.
        n_samples: total number of rows to generate (new parameter,
            defaults to the previous hard-coded 100000).

    Yields:
        Tuples ``(X_batch, y_batch)`` where X has 20 feature columns and
        y holds binary labels. The final batch is truncated so the total
        row count equals ``n_samples`` even when ``batch_size`` does not
        divide it evenly (the original always yielded full-size batches
        and could overshoot).
    """
    for start in range(0, n_samples, batch_size):
        count = min(batch_size, n_samples - start)  # clamp the tail batch
        X_batch = np.random.randn(count, 20)
        y_batch = np.random.randint(0, 2, count)
        yield X_batch, y_batch
# lgb.Dataset has no `add_data` method — rows cannot be appended after the
# object is created. Collect the batches first, then build one Dataset from
# the stacked arrays.
X_parts, y_parts = [], []
for X_batch, y_batch in batch_generator():
    X_parts.append(X_batch)
    y_parts.append(y_batch)
train_data = lgb.Dataset(np.vstack(X_parts), label=np.concatenate(y_parts))
使用数据子集 #
python
subset_idx = np.random.choice(len(X), size=int(len(X) * 0.5), replace=False)
train_data = lgb.Dataset(X[subset_idx], label=y[subset_idx])
验证集处理 #
创建验证集 #
python
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
reference 参数 #
reference 参数确保验证集使用与训练集相同的特征映射:
python
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
多个验证集 #
python
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4)
X_val1, X_val2, y_val1, y_val2 = train_test_split(X_temp, y_temp, test_size=0.5)
train_data = lgb.Dataset(X_train, label=y_train)
valid_data1 = lgb.Dataset(X_val1, label=y_val1, reference=train_data)
valid_data2 = lgb.Dataset(X_val2, label=y_val2, reference=train_data)
model = lgb.train(
params,
train_data,
valid_sets=[train_data, valid_data1, valid_data2],
valid_names=['train', 'val1', 'val2']
)
大数据处理 #
使用 max_bin 控制内存 #
python
params = {
'max_bin': 127,
'num_leaves': 31
}
train_data = lgb.Dataset(X, label=y, params=params)
使用 bin_construct_sample_cnt #
python
params = {
'bin_construct_sample_cnt': 100000
}
train_data = lgb.Dataset(X, label=y, params=params)
使用数据采样 #
python
params = {
'bagging_fraction': 0.8,
'bagging_freq': 5,
'feature_fraction': 0.8
}
类别特征处理 #
指定类别特征 #
python
X = pd.DataFrame({
'num_feature': np.random.randn(1000),
'cat_feature': np.random.choice(['A', 'B', 'C'], 1000)
})
y = np.random.randint(0, 2, 1000)
X['cat_feature'] = X['cat_feature'].astype('category')
train_data = lgb.Dataset(X, label=y, categorical_feature=['cat_feature'])
使用索引指定 #
python
X = np.random.randn(1000, 10)
X[:, 3] = np.random.choice([0, 1, 2], 1000)
X[:, 7] = np.random.choice([0, 1, 2, 3], 1000)
train_data = lgb.Dataset(X, label=y, categorical_feature=[3, 7])
类别特征编码 #
python
from sklearn.preprocessing import LabelEncoder
cat_data = ['A', 'B', 'C', 'A', 'B', 'C']
encoder = LabelEncoder()
encoded_data = encoder.fit_transform(cat_data)
print(f"原始数据: {cat_data}")
print(f"编码后: {encoded_data}")
数据集操作 #
获取数据信息 #
python
train_data = lgb.Dataset(X, label=y)
# num_data()/num_feature() query the internal (binned) dataset, which only
# exists after construction.
train_data.construct()
print(f"样本数量: {train_data.num_data()}")
print(f"特征数量: {train_data.num_feature()}")
# `feature_name` is a stored attribute (list or 'auto'), not a method —
# the accessor is get_feature_name().
print(f"特征名称: {train_data.get_feature_name()[:5]}")
数据集切片 #
python
subset = train_data.subset([0, 1, 2, 3, 4])
print(f"子集样本数: {subset.num_data()}")
数据集合并 #
python
data1 = lgb.Dataset(X[:500], label=y[:500])
data2 = lgb.Dataset(X[500:], label=y[500:])
combined = data1.add_data(data2)
print(f"合并后样本数: {combined.num_data()}")
数据验证 #
检查数据有效性 #
python
def validate_dataset(dataset):
    """Sanity-check a dataset before training.

    Raises ValueError when the dataset has no rows, no features, no
    labels, or fewer than two distinct label values; prints a
    confirmation message otherwise.
    """
    # Structural checks first: an empty dataset fails fast.
    if not dataset.num_data():
        raise ValueError("数据集为空")
    if not dataset.num_feature():
        raise ValueError("没有特征")
    label = dataset.get_label()
    if label is None:
        raise ValueError("缺少标签")
    # A classification target needs at least two distinct classes.
    if np.unique(label).size < 2:
        raise ValueError("标签只有一个类别")
    print("数据集验证通过")
train_data = lgb.Dataset(X, label=y)
validate_dataset(train_data)
检查数据分布 #
python
import matplotlib.pyplot as plt
def check_data_distribution(dataset):
    """检查数据分布 — plot a label histogram and boxplots of the leading features.

    Args:
        dataset: an lgb.Dataset; its raw data must still be available
            (i.e. not released via free_raw_data) for the feature plots.
    """
    label = dataset.get_label()
    plt.figure(figsize=(10, 4))
    plt.subplot(1, 2, 1)
    plt.hist(label, bins=30)
    plt.xlabel('标签值')
    plt.ylabel('频数')
    plt.title('标签分布')
    data = dataset.get_data()
    if data is not None:
        # get_data() returns whatever was passed in; `data[:, :5]` breaks on
        # a DataFrame, so normalize to an ndarray and clamp the column count
        # for datasets with fewer than 5 features.
        arr = np.asarray(data)
        n_show = min(5, arr.shape[1])
        plt.subplot(1, 2, 2)
        plt.boxplot(arr[:, :n_show])
        plt.xlabel('特征索引')
        plt.ylabel('特征值')
        plt.title('前5个特征分布')
    plt.tight_layout()
    plt.show()
check_data_distribution(train_data)
实用技巧 #
1. 预处理数据 #
python
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
train_data = lgb.Dataset(X_scaled, label=y)
2. 处理缺失值 #
python
X_with_nan = X.copy()
X_with_nan[np.random.rand(*X.shape) < 0.1] = np.nan
train_data = lgb.Dataset(X_with_nan, label=y)
params = {
'use_missing': True,
'zero_as_missing': False
}
3. 特征选择 #
python
from sklearn.feature_selection import SelectKBest, f_classif
selector = SelectKBest(f_classif, k=10)
X_selected = selector.fit_transform(X, y)
train_data = lgb.Dataset(X_selected, label=y)
下一步 #
现在你已经掌握了 LightGBM 的数据接口,接下来学习 参数配置,深入了解如何优化模型参数!
最后更新:2026-04-04