数据类型 #

Pandas 数据类型概述 #

Pandas 构建在 NumPy 之上，扩展了 NumPy 的类型系统，提供了更丰富的数据类型支持。

text

┌─────────────────────────────────────────────────────────────┐
│                    Pandas 数据类型                           │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  数值类型                                                   │
│  ├── int8, int16, int32, int64                             │
│  ├── uint8, uint16, uint32, uint64                         │
│  ├── float16, float32, float64                             │
│  └── complex64, complex128                                 │
│                                                             │
│  特殊类型                                                   │
│  ├── bool                                                  │
│  ├── object（混合类型）                                     │
│  ├── string（新字符串类型）                                 │
│  └── category（分类类型）                                   │
│                                                             │
│  时间类型                                                   │
│  ├── datetime64[ns]                                        │
│  ├── timedelta64[ns]                                       │
│  └── period                                                │
│                                                             │
│  可空类型（Nullable）                                       │
│  ├── Int8, Int16, Int32, Int64                             │
│  ├── UInt8, UInt16, UInt32, UInt64                         │
│  ├── Float32, Float64                                      │
│  ├── boolean                                               │
│  └── string                                                │
│                                                             │
└─────────────────────────────────────────────────────────────┘

查看数据类型 #

python

import pandas as pd
import numpy as np

df = pd.DataFrame({
    'int_col': [1, 2, 3],
    'float_col': [1.1, 2.2, 3.3],
    'str_col': ['a', 'b', 'c'],
    'bool_col': [True, False, True]
})

# 查看所有列的数据类型
print(df.dtypes)

# 查看单列数据类型
print(df['int_col'].dtype)

# 查看详细信息（info() 会直接打印结果并返回 None，无需再用 print 包裹）
df.info()

数值类型 #

整数类型 #

python

# 默认整数类型
s = pd.Series([1, 2, 3])
print(s.dtype)  # int64

# 指定整数类型
s = pd.Series([1, 2, 3], dtype='int32')
print(s.dtype)  # int32

# 无符号整数
s = pd.Series([1, 2, 3], dtype='uint8')
print(s.dtype)  # uint8

# 各类型范围
print('int8:', np.iinfo('int8'))    # -128 to 127
print('int16:', np.iinfo('int16'))  # -32768 to 32767
print('int32:', np.iinfo('int32'))  # -2147483648 to 2147483647
print('int64:', np.iinfo('int64'))  # -9223372036854775808 to 9223372036854775807

浮点类型 #

python

# 默认浮点类型
s = pd.Series([1.1, 2.2, 3.3])
print(s.dtype)  # float64

# 指定浮点类型
s = pd.Series([1.1, 2.2, 3.3], dtype='float32')
print(s.dtype)  # float32

# 特殊浮点值
s = pd.Series([1.0, np.inf, -np.inf, np.nan])
print(s)
# 0    1.0
# 1    inf
# 2   -inf
# 3    NaN
# dtype: float64

# 检测特殊值
print(np.isinf(s))   # 检测无穷
print(np.isnan(s))   # 检测 NaN
print(np.isfinite(s))  # 检测有限值

可空整数类型 #

传统整数类型不支持缺失值，Pandas 0.24 引入了可空整数类型：

python

# 传统整数类型不支持 NaN
s = pd.Series([1, 2, None], dtype='int64')  # 报错

# 使用可空整数类型
s = pd.Series([1, 2, None], dtype='Int64')  # 注意大写 I
print(s)
# 0       1
# 1       2
# 2    <NA>
# dtype: Int64

# 可空整数类型列表
# Int8, Int16, Int32, Int64
# UInt8, UInt16, UInt32, UInt64

可空浮点类型 #

python

# 可空浮点类型
s = pd.Series([1.1, 2.2, None], dtype='Float64')  # 注意大写 F
print(s)
# 0     1.1
# 1     2.2
# 2    <NA>
# dtype: Float64

# 可空浮点类型列表
# Float32, Float64

字符串类型 #

object 类型（传统） #

python

# 默认字符串类型
s = pd.Series(['a', 'b', 'c'])
print(s.dtype)  # object

# object 类型可以存储任意 Python 对象
s = pd.Series(['a', 1, None, [1, 2, 3]])
print(s.dtype)  # object

string 类型（推荐） #

python

# 新字符串类型（Pandas 1.0+）
s = pd.Series(['a', 'b', 'c'], dtype='string')
print(s.dtype)  # string

# 支持缺失值
s = pd.Series(['a', 'b', None], dtype='string')
print(s)
# 0       a
# 1       b
# 2    <NA>
# dtype: string

# string 类型的优势
# 1. 明确的字符串类型
# 2. 支持缺失值
# 3. 更好的性能（某些操作）

字符串操作 #

python

s = pd.Series(['apple', 'banana', 'cherry'], dtype='string')

# 字符串方法
print(s.str.upper())           # 大写
print(s.str.lower())           # 小写
print(s.str.len())             # 长度
print(s.str.contains('a'))     # 包含检测
print(s.str.startswith('a'))   # 开头检测
print(s.str.replace('a', 'X')) # 替换
print(s.str.slice(0, 3))       # 切片

布尔类型 #

传统布尔类型 #

python

s = pd.Series([True, False, True])
print(s.dtype)  # bool

# 不支持缺失值：不会报错，但 None 会被静默转换为 False
s = pd.Series([True, False, None], dtype='bool')  # 结果为 [True, False, False]

可空布尔类型 #

python

# 可空布尔类型
s = pd.Series([True, False, None], dtype='boolean')
print(s)
# 0     True
# 1    False
# 2     <NA>
# dtype: boolean

# 三值逻辑
print(s & True)   # 与运算
print(s | False)  # 或运算

分类类型 #

分类类型适合有限取值的列，可以节省内存并提高性能。

python

# 创建分类类型
s = pd.Series(['a', 'b', 'c', 'a', 'b'], dtype='category')
print(s)
# 0    a
# 1    b
# 2    c
# 3    a
# 4    b
# dtype: category
# Categories (3, object): ['a', 'b', 'c']

# 查看分类信息
print(s.cat.categories)  # Index(['a', 'b', 'c'], dtype='object')
print(s.cat.codes)       # 编码 [0, 1, 2, 0, 1]

# 指定分类顺序
s = pd.Series(['low', 'medium', 'high'], dtype='category')
s = s.cat.set_categories(['low', 'medium', 'high'], ordered=True)
print(s.cat.categories)

# 有序分类比较
print(s > 'low')  # 可以比较

分类类型操作 #

python

s = pd.Series(['a', 'b', 'c', 'a', 'b'], dtype='category')

# 添加分类
s = s.cat.add_categories(['d'])

# 删除分类
s = s.cat.remove_categories(['c'])

# 重命名分类
s = s.cat.rename_categories({'a': 'A', 'b': 'B'})

# 重排序（新顺序必须包含全部现有分类，此时为 'A'、'B'、'd'，否则会抛出 ValueError）
s = s.cat.reorder_categories(['A', 'B', 'd'], ordered=True)

分类类型优势 #

python

import pandas as pd

# 内存对比
s_str = pd.Series(['type1'] * 100000)
s_cat = pd.Series(['type1'] * 100000, dtype='category')

print('字符串内存:', s_str.memory_usage(deep=True))
print('分类内存:', s_cat.memory_usage(deep=True))
# 分类类型内存占用小得多

时间类型 #

datetime 类型 #

python

# 创建时间序列
s = pd.Series(pd.date_range('2024-01-01', periods=5))
print(s.dtype)  # datetime64[ns]

# 从字符串转换
s = pd.Series(['2024-01-01', '2024-01-02', '2024-01-03'])
s = pd.to_datetime(s)
print(s.dtype)  # datetime64[ns]

# 指定格式
s = pd.to_datetime(['01-01-2024', '01-02-2024'], format='%m-%d-%Y')

# 时间属性
s = pd.Series(pd.date_range('2024-01-01', periods=5))
print(s.dt.year)     # 年
print(s.dt.month)    # 月
print(s.dt.day)      # 日
print(s.dt.hour)     # 时
print(s.dt.minute)   # 分
print(s.dt.second)   # 秒
print(s.dt.dayofweek)  # 星期几（周一=0，周日=6）

timedelta 类型 #

python

# 创建时间差
s = pd.Series([pd.Timedelta(days=1), pd.Timedelta(days=2)])
print(s.dtype)  # timedelta64[ns]

# 从字符串转换
s = pd.to_timedelta(['1 day', '2 days', '3 hours'])

# 时间差运算
dates = pd.Series(pd.date_range('2024-01-01', periods=5))
deltas = pd.Series([pd.Timedelta(days=i) for i in range(5)])
print(dates + deltas)

Period 类型 #

python

# 创建周期
s = pd.Series(pd.period_range('2024-01', periods=5, freq='M'))
print(s.dtype)  # period[M]

# 周期属性
print(s.dt.year)
print(s.dt.month)
print(s.dt.to_timestamp())  # 转换为时间戳

类型转换 #

astype 方法 #

python

df = pd.DataFrame({
    'int_col': [1, 2, 3],
    'float_col': [1.1, 2.2, 3.3],
    'str_col': ['1', '2', '3']
})

# 单列转换
df['int_col'] = df['int_col'].astype(float)

# 多列转换
df = df.astype({
    'int_col': 'float32',
    'str_col': 'int64'
})

# 全部转换
df = df.astype(str)

to_numeric #

python

# 转换为数值
s = pd.Series(['1', '2', '3', 'a'])
s = pd.to_numeric(s)  # 报错

# 处理错误
s = pd.to_numeric(s, errors='ignore')  # 忽略错误，原样返回输入（pandas 2.2 起已弃用）
s = pd.to_numeric(s, errors='coerce')  # 无法解析的值转换为 NaN

# 指定类型
s = pd.to_numeric(s, downcast='integer')  # 自动选择最小整数类型
s = pd.to_numeric(s, downcast='float')    # 自动选择最小浮点类型

to_datetime #

python

# 转换为日期时间
s = pd.Series(['2024-01-01', '2024-01-02'])
s = pd.to_datetime(s)

# 处理错误
s = pd.Series(['2024-01-01', 'invalid'])
s = pd.to_datetime(s, errors='coerce')

# 指定格式
s = pd.to_datetime(s, format='%Y-%m-%d')

# 解析多列
df = pd.DataFrame({
    'year': [2024, 2024],
    'month': [1, 2],
    'day': [1, 1]
})
s = pd.to_datetime(df[['year', 'month', 'day']])

convert_dtypes #

python

# 自动转换为最佳类型
df = pd.DataFrame({
    'a': [1, 2, 3],
    'b': ['a', 'b', 'c'],
    'c': [True, False, True]
})

df = df.convert_dtypes()
print(df.dtypes)
# a      Int64
# b     string
# c    boolean

类型推断 #

python

# 推断类型
s = pd.Series([1, 2, 3])
print(s.infer_objects().dtype)

# 对 object 列做软类型推断（不会把字符串 '1' 解析为数字，非 object 列保持不变）
df = pd.DataFrame({
    'a': [1, 2, None],
    'b': ['1', '2', '3']
})
df = df.infer_objects()
print(df.dtypes)

内存优化 #

使用最小类型 #

python

import pandas as pd
import numpy as np

# 创建大数据集
df = pd.DataFrame({
    'small_int': np.random.randint(0, 100, 1000000),
    'big_int': np.random.randint(0, 1000000, 1000000),
    'float': np.random.randn(1000000),
    'category': np.random.choice(['A', 'B', 'C'], 1000000)
})

# 查看内存使用（object 列需传入 deep=True 才会统计字符串本身占用的内存）
print(df.memory_usage())

# 优化类型
df['small_int'] = df['small_int'].astype('int8')
df['big_int'] = df['big_int'].astype('int32')
df['float'] = df['float'].astype('float32')
df['category'] = df['category'].astype('category')

print('优化后:')
print(df.memory_usage())

类型选择建议 #

text

┌─────────────────────────────────────────────────────────────┐
│                    类型选择建议                              │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  整数：                                                     │
│  - 范围 0-255: uint8                                       │
│  - 范围 -128~127: int8                                     │
│  - 范围 -32768~32767: int16                                │
│  - 更大范围: int32, int64                                   │
│                                                             │
│  浮点：                                                     │
│  - 精度要求不高: float32                                    │
│  - 高精度: float64                                          │
│                                                             │
│  字符串：                                                   │
│  - 取值有限: category                                       │
│  - 取值多样: string                                         │
│                                                             │
│  布尔：                                                     │
│  - 无缺失值: bool                                           │
│  - 有缺失值: boolean                                        │
│                                                             │
└─────────────────────────────────────────────────────────────┘

下一步 #

掌握了数据类型后，接下来学习索引操作，深入了解 Pandas 的索引系统！