数据预处理 #
为什么需要数据预处理? #
text
┌─────────────────────────────────────────────────────────────┐
│ 数据预处理的重要性 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 原始数据问题: │
│ ├── 数据格式不统一 │
│ ├── 数值范围差异大 │
│ ├── 存在噪声和异常值 │
│ └── 数据量不足 │
│ │
│ 预处理目标: │
│ ├── 统一数据格式 │
│ ├── 标准化数值范围 │
│ ├── 清理噪声数据 │
│ └── 增强数据多样性 │
│ │
│ 好的数据预处理 = 更快的训练 + 更好的性能 │
│ │
└─────────────────────────────────────────────────────────────┘
数值数据预处理 #
归一化(Normalization) #
python
import keras
import numpy as np

# Synthetic training matrix: 1000 samples x 10 features with a wide
# spread (~N(0, 100^2)) to make normalization worthwhile.
x_train = np.random.randn(1000, 10) * 100

# Option 1: a Normalization layer learns per-feature mean/variance from
# the data via adapt() and standardizes inputs inside the model.
norm_layer = keras.layers.Normalization()
norm_layer.adapt(x_train)

model = keras.Sequential([
    norm_layer,
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(1),
])

# Option 2: the equivalent z-score transform done manually with NumPy.
mean = np.mean(x_train, axis=0)
std = np.std(x_train, axis=0)
x_train_normalized = (x_train - mean) / std
Rescaling #
python
import keras

# MNIST-style classifier. The Rescaling layer maps raw pixel bytes
# [0, 255] into [0, 1] as the first step of the model itself.
layers = [
    keras.layers.Rescaling(1./255, input_shape=(28, 28, 1)),
    keras.layers.Conv2D(32, 3, activation='relu'),
    keras.layers.Flatten(),
    keras.layers.Dense(10, activation='softmax'),
]
model = keras.Sequential(layers)
Min-Max 缩放 #
python
import numpy as np

# Min-max scaling: map every feature column linearly onto [0, 1].
# (x_train is assumed to come from the earlier snippets on this page.)
x_min = x_train.min(axis=0)
x_max = x_train.max(axis=0)
x_scaled = (x_train - x_min) / (x_max - x_min)
图像数据预处理 #
基本图像处理 #
python
import keras

# CIFAR-10 ships as uint8 images; convert to floats in [0, 1] first.
(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data()
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0

# Per-channel standardization using the commonly quoted CIFAR-10
# statistics; the 3-element lists broadcast over the channel axis.
mean = [0.4914, 0.4822, 0.4465]
std = [0.2023, 0.1994, 0.2010]
x_train, x_test = (x_train - mean) / std, (x_test - mean) / std
图像增强 #
python
import keras
# NOTE(review): ImageDataGenerator is the legacy tf.keras augmentation API;
# newer Keras releases steer toward tf.data pipelines or preprocessing
# layers instead -- confirm it exists in the installed Keras version.
# On-the-fly random augmentation: rotation up to 20 degrees, 20% shifts,
# horizontal flips, 20% zoom; new border pixels copy the nearest pixel.
datagen = keras.preprocessing.image.ImageDataGenerator(
rotation_range=20,
width_shift_range=0.2,
height_shift_range=0.2,
horizontal_flip=True,
zoom_range=0.2,
fill_mode='nearest'
)
# fit() computes data-dependent statistics; only needed for options such as
# featurewise_center, harmless for the transforms configured here.
datagen.fit(x_train)
# flow() yields augmented batches indefinitely; validation data is fed
# directly and is NOT augmented.
model.fit(
datagen.flow(x_train, y_train, batch_size=32),
epochs=100,
validation_data=(x_test, y_test)
)
KerasCV 数据增强 #
python
import keras
import keras_cv

# Bundle several KerasCV random augmentations into one reusable pipeline:
# horizontal flips, +/-20% rotation, 20% zoom, 20% contrast jitter.
augmenter = keras.Sequential([
    keras_cv.layers.RandomFlip(mode='horizontal'),
    keras_cv.layers.RandomRotation(factor=0.2),
    keras_cv.layers.RandomZoom(height_factor=0.2, width_factor=0.2),
    keras_cv.layers.RandomContrast(factor=0.2),
])

# The augmentation pipeline sits inside the model, before rescaling and
# the convolutional stack.
model = keras.Sequential([
    keras.layers.Input(shape=(32, 32, 3)),
    augmenter,
    keras.layers.Rescaling(1./255),
    keras.layers.Conv2D(32, 3, activation='relu'),
    keras.layers.Flatten(),
    keras.layers.Dense(10, activation='softmax'),
])
文本数据预处理 #
Tokenization #
python
import keras
texts = ['Hello world', 'Keras is great', 'Deep learning']
# NOTE(review): keras.preprocessing.text.Tokenizer is a legacy API; newer
# Keras recommends the TextVectorization layer -- confirm availability.
# Build a word index over the corpus, keeping at most 100 distinct tokens.
tokenizer = keras.preprocessing.text.Tokenizer(num_words=100)
tokenizer.fit_on_texts(texts)
# Each text becomes a list of integer word ids.
sequences = tokenizer.texts_to_sequences(texts)
print(sequences)
# Binary bag-of-words matrix: one row per text, one column per vocab word.
one_hot = tokenizer.texts_to_matrix(texts, mode='binary')
print(one_hot.shape)
Padding #
python
import keras

# Ragged integer sequences of lengths 3, 2 and 4.
sequences = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]

# Pad with zeros (and truncate) at the END of each sequence so every row
# has exactly 5 entries.
padded = keras.preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=5,
    padding='post',
    truncating='post',
)
print(padded)
TextVectorization 层 #
python
import keras

texts = ['Hello world', 'Keras is great', 'Deep learning']

# Map raw strings to fixed-length integer token sequences
# (vocabulary capped at 100 tokens, output padded/truncated to 10).
vectorizer = keras.layers.TextVectorization(
    max_tokens=100,
    output_mode='int',
    output_sequence_length=10,
)
# Learn the vocabulary from the corpus.
vectorizer.adapt(texts)

# With the vectorizer as its first layer, the model accepts raw text.
model = keras.Sequential([
    vectorizer,
    keras.layers.Embedding(100, 64),
    keras.layers.LSTM(32),
    keras.layers.Dense(1, activation='sigmoid'),
])
序列数据预处理 #
TimeseriesGenerator #
python
import keras
import numpy as np

# Toy series 0..99. TimeseriesGenerator windows `data` along axis 0, so
# data must be 2-D (timesteps, features) for the LSTM's (10, 1) input
# shape: the original 1-D array produced (batch, 10) samples, which
# mismatches input_shape=(10, 1) and fails at fit time.
data = np.arange(100).reshape(-1, 1)
targets = np.arange(100)

# Each sample is 10 consecutive points; its target is the value that
# immediately follows the window.
data_gen = keras.preprocessing.sequence.TimeseriesGenerator(
    data,
    targets,
    length=10,
    sampling_rate=1,
    stride=1,
    batch_size=2
)

model = keras.Sequential([
    keras.layers.LSTM(32, input_shape=(10, 1)),
    keras.layers.Dense(1)
])
# fit() requires a compiled model; the original snippet never compiled.
model.compile(optimizer='adam', loss='mse')
model.fit(data_gen, epochs=10)
滑动窗口 #
python
import numpy as np

def create_sequences(data, seq_length):
    """Build supervised (X, y) pairs from a 1-D series via a sliding window.

    Parameters
    ----------
    data : array-like
        One-dimensional series of values.
    seq_length : int
        Window length; each row of X holds `seq_length` consecutive values
        and the matching y entry is the value that follows the window.

    Returns
    -------
    (X, y) : tuple of np.ndarray
        X has shape (n, seq_length) and y has shape (n,), where
        n = max(len(data) - seq_length, 0). Correctly-shaped empty arrays
        are returned when the series is too short (the original code
        returned a shape-(0,) X in that case).
    """
    data = np.asarray(data)
    n = len(data) - seq_length
    if n <= 0:
        # Too short for even one window: preserve the 2-D shape contract.
        return (np.empty((0, seq_length), dtype=data.dtype),
                np.empty((0,), dtype=data.dtype))
    X = np.stack([data[i:i + seq_length] for i in range(n)])
    # y[i] == data[i + seq_length] for every window start i.
    y = data[seq_length:]
    return X, y

# Demo: windowed samples from a sampled sine wave.
data = np.sin(np.arange(0, 100, 0.1))
X, y = create_sequences(data, seq_length=20)
类别数据预处理 #
LabelEncoder #
python
from sklearn.preprocessing import LabelEncoder

# Map string class names onto integer ids (assigned in sorted label order).
labels = ['cat', 'dog', 'cat', 'bird', 'dog']
encoder = LabelEncoder()
encoder.fit(labels)
encoded = encoder.transform(labels)
print(encoded)
One-Hot 编码 #
python
import keras

# Integer class ids -> one-hot rows, e.g. id 1 -> [0., 1., 0.].
labels = [0, 1, 2, 1, 0]
one_hot = keras.utils.to_categorical(labels, num_classes=3)
print(one_hot)
数据管道 #
tf.data.Dataset #
python
import keras
import tensorflow as tf

# Shuffle -> batch -> prefetch: the canonical input-pipeline ordering.
dataset = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(buffer_size=10000)
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)  # overlap preprocessing with training
)
model.fit(dataset, epochs=10)
完整数据管道 #
python
import keras
import tensorflow as tf

def preprocess(image, label):
    """Scale pixels to [0, 1] and apply light random augmentation."""
    image = tf.cast(image, tf.float32) / 255.0
    image = tf.image.random_flip_left_right(image)
    image = tf.image.random_brightness(image, max_delta=0.1)
    return image, label

(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data()

# Training pipeline: per-sample augmentation, then shuffle/batch/prefetch.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
    .shuffle(10000)
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

# Evaluation pipeline: rescale only -- no random augmentation at test time.
test_dataset = (
    tf.data.Dataset.from_tensor_slices((x_test, y_test))
    .map(lambda x, y: (tf.cast(x, tf.float32) / 255.0, y))
    .batch(32)
)

model.fit(train_dataset, epochs=10, validation_data=test_dataset)
下一步 #
现在你已经掌握了数据预处理,接下来学习 图像数据增强,深入了解图像增强技术!
最后更新:2026-04-04