时间序列 #

时间序列概述 #

Pandas 提供了强大的时间序列处理功能，是金融、气象、物联网等领域的核心工具。

text

┌─────────────────────────────────────────────────────────────┐
│                    时间序列功能                              │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  时间创建                                                   │
│  ├── pd.Timestamp        时间戳                             │
│  ├── pd.DatetimeIndex    时间索引                           │
│  ├── pd.date_range       日期范围                           │
│  └── pd.to_datetime      转换为时间                         │
│                                                             │
│  时间操作                                                   │
│  ├── dt 访问器          时间属性                            │
│  ├── resample           重采样                              │
│  ├── rolling            滚动窗口                           │
│  └── shift              时间偏移                            │
│                                                             │
│  时间计算                                                   │
│  ├── pd.Timedelta       时间差                              │
│  ├── pd.DateOffset      日期偏移                            │
│  └── 时区处理                                             │
│                                                             │
└─────────────────────────────────────────────────────────────┘

时间创建 #

Timestamp #

python

import pandas as pd
import numpy as np

# Create a timestamp from a date string (time of day defaults to midnight)
ts = pd.Timestamp('2024-01-01')
print(ts)  # 2024-01-01 00:00:00

# Include a time of day, either in the string or as keyword components
ts = pd.Timestamp('2024-01-01 10:30:00')
ts = pd.Timestamp(year=2024, month=1, day=1, hour=10, minute=30)

# From a Unix timestamp; unit='s' means the integer counts seconds
ts = pd.Timestamp(1704067200, unit='s')

# Current time
ts = pd.Timestamp.now()

DatetimeIndex #

python

# 创建时间索引
idx = pd.DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03'])
print(idx)

# 使用 date_range
idx = pd.date_range('2024-01-01', periods=5)
print(idx)
# DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05'], dtype='datetime64[ns]', freq='D')

# 指定频率
idx = pd.date_range('2024-01-01', periods=5, freq='H')  # 小时
idx = pd.date_range('2024-01-01', periods=5, freq='M')  # 月末
idx = pd.date_range('2024-01-01', periods=5, freq='MS') # 月初
idx = pd.date_range('2024-01-01', periods=5, freq='W')  # 周
idx = pd.date_range('2024-01-01', periods=5, freq='Q')  # 季末
idx = pd.date_range('2024-01-01', periods=5, freq='Y')  # 年末

# 指定起止日期
idx = pd.date_range('2024-01-01', '2024-12-31', freq='M')

# 工作日
idx = pd.bdate_range('2024-01-01', periods=5)  # 只包含工作日

to_datetime #

python

# Convert a list of date strings
dates = ['2024-01-01', '2024-01-02', '2024-01-03']
dt = pd.to_datetime(dates)
print(dt)

# Give an explicit format when the strings are not ISO formatted
dates = ['01-01-2024', '01-02-2024']
dt = pd.to_datetime(dates, format='%m-%d-%Y')

# Handle values that cannot be parsed
dates = ['2024-01-01', 'invalid', '2024-01-03']
dt = pd.to_datetime(dates, errors='coerce')  # invalid values become NaT

# errors='ignore' was deprecated in pandas 2.2; to keep the input unchanged
# when parsing fails, catch the exception explicitly instead.
try:
    dt = pd.to_datetime(dates)
except ValueError:
    dt = pd.Index(dates)  # keep the original strings as they are

# Assemble datetimes from separate year / month / day columns
df = pd.DataFrame({
    'year': [2024, 2024],
    'month': [1, 2],
    'day': [1, 1]
})
dt = pd.to_datetime(df[['year', 'month', 'day']])

时间属性（dt 访问器） #

python

# Build a frame with one datetime column: five consecutive days at 10:30
df = pd.DataFrame({
    'date': pd.date_range('2024-01-01 10:30:00', periods=5, freq='D')
})

# Extract calendar and clock components through the .dt accessor
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['hour'] = df['date'].dt.hour
df['minute'] = df['date'].dt.minute
df['second'] = df['date'].dt.second

# Day of the week, as a number and as a name
df['dayofweek'] = df['date'].dt.dayofweek  # 0=Monday
df['day_name'] = df['date'].dt.day_name()
df['weekday'] = df['date'].dt.weekday

# Position within the year
df['dayofyear'] = df['date'].dt.dayofyear
df['weekofyear'] = df['date'].dt.isocalendar().week
df['quarter'] = df['date'].dt.quarter

# Boolean flags
df['is_month_start'] = df['date'].dt.is_month_start
df['is_month_end'] = df['date'].dt.is_month_end
df['is_leap_year'] = df['date'].dt.is_leap_year

print(df)

时间差（Timedelta） #

python

# Create a time difference from keyword components
td = pd.Timedelta(days=1)
td = pd.Timedelta(weeks=1, days=2, hours=3)

# Parse time differences from strings (a list of strings is also accepted)
td = pd.to_timedelta('1 day')
td = pd.to_timedelta('1 day 2 hours')
td = pd.to_timedelta(['1 day', '2 days', '3 days'])

# Timestamp arithmetic: add a Timedelta to a Timestamp
ts = pd.Timestamp('2024-01-01')
print(ts + pd.Timedelta(days=7))  # 2024-01-08 00:00:00

# Subtracting two datetime columns gives a duration column
df = pd.DataFrame({
    'start': pd.date_range('2024-01-01', periods=5),
    'end': pd.date_range('2024-01-10', periods=5)
})
df['duration'] = df['end'] - df['start']
df['days'] = df['duration'].dt.days  # whole-day part of each duration
print(df)

重采样（resample） #

重采样是时间序列分析的核心操作，用于改变时间频率。

python

# 创建时间序列
dates = pd.date_range('2024-01-01', periods=100, freq='D')
df = pd.DataFrame({
    'date': dates,
    'value': np.random.randn(100).cumsum()
})
df.set_index('date', inplace=True)

# 降采样（高频 → 低频）
# 月度均值
monthly = df.resample('M').mean()
print(monthly)

# 周度求和
weekly = df.resample('W').sum()

# 季度统计
quarterly = df.resample('Q').agg({
    'value': ['mean', 'std', 'count']
})

# 升采样（低频 → 高频）
# 日度数据 → 小时数据
hourly = df.resample('H').asfreq()  # 产生缺失值
hourly = df.resample('H').ffill()   # 前向填充
hourly = df.resample('H').bfill()   # 后向填充
hourly = df.resample('H').interpolate()  # 插值

# 自定义聚合
result = df.resample('W').agg({
    'value': ['mean', 'std', 'min', 'max']
})

# OHLC（开高低收）
ohlc = df.resample('W').ohlc()
print(ohlc)

重采样频率 #

text

┌─────────────────────────────────────────────────────────────┐
│                    重采样频率代码                            │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  代码      说明              旧写法（pandas 2.2 起弃用）      │
│  ──────    ────────────────  ────────────────────────────  │
│  s         秒               S                              │
│  min       分钟             T                              │
│  h         小时             H                              │
│  D         天                                              │
│  W         周                                              │
│  ME        月末             M                              │
│  MS        月初                                            │
│  QE        季末             Q                              │
│  QS        季初                                            │
│  YE        年末             Y                              │
│  YS        年初                                            │
│  B         工作日                                          │
│  bh        工作小时         BH                             │
│                                                             │
└─────────────────────────────────────────────────────────────┘

滚动窗口（rolling） #

python

# Build a daily series: 100 days of a cumulative random walk, indexed by date
dates = pd.date_range('2024-01-01', periods=100, freq='D')
df = pd.DataFrame({
    'date': dates,
    'value': np.random.randn(100).cumsum()
})
df.set_index('date', inplace=True)

# Rolling mean (moving average) over a 7-row window
df['rolling_mean'] = df['value'].rolling(window=7).mean()

# Rolling standard deviation
df['rolling_std'] = df['value'].rolling(window=7).std()

# Rolling maximum / minimum
df['rolling_max'] = df['value'].rolling(window=7).max()
df['rolling_min'] = df['value'].rolling(window=7).min()

# Rolling sum
df['rolling_sum'] = df['value'].rolling(window=7).sum()

# Rolling median
df['rolling_median'] = df['value'].rolling(window=7).median()

# Minimum number of observations: emit a value as soon as 1 row is available.
# This overwrites the 'rolling_mean' column computed above.
df['rolling_mean'] = df['value'].rolling(window=7, min_periods=1).mean()

# Centered window: the label sits in the middle of the window instead of at its end
df['centered'] = df['value'].rolling(window=7, center=True).mean()

print(df.head(10))

滚动窗口方法 #

python

# Common aggregations on a 7-row rolling window.
# Uses the `df` built in the rolling-window example; results are not stored.
df['value'].rolling(7).mean()      # mean
df['value'].rolling(7).std()       # standard deviation
df['value'].rolling(7).var()       # variance
df['value'].rolling(7).sum()       # sum
df['value'].rolling(7).min()       # minimum
df['value'].rolling(7).max()       # maximum
df['value'].rolling(7).median()    # median
df['value'].rolling(7).quantile(0.5)  # quantile (0.5 here)
# NOTE(review): corr() and cov() are called without an `other` series; per the
# pandas docs they then appear to default to the series itself - confirm that
# this is the intended demonstration.
df['value'].rolling(7).corr()      # correlation
df['value'].rolling(7).cov()       # covariance
df['value'].rolling(7).skew()      # skewness
df['value'].rolling(7).kurt()      # kurtosis

# Apply a custom function to each window (here: the window's range, max - min)
df['value'].rolling(7).apply(lambda x: x.max() - x.min())

扩展窗口（expanding） #

python

# Expanding window: each result covers everything from the first row to the current row
df['expanding_mean'] = df['value'].expanding().mean()
df['expanding_sum'] = df['value'].expanding().sum()
df['expanding_max'] = df['value'].expanding().max()

# Set the minimum number of observations explicitly (overwrites 'expanding_mean').
# NOTE(review): min_periods=1 looks like the default of expanding() already - confirm.
df['expanding_mean'] = df['value'].expanding(min_periods=1).mean()

指数加权移动平均（ewm） #

python

# Exponentially weighted mean and standard deviation, with the decay given as a span
df['ewm_mean'] = df['value'].ewm(span=7).mean()
df['ewm_std'] = df['value'].ewm(span=7).std()

# Alternative ways to give the decay: a smoothing factor (alpha) or a half-life.
# Each line overwrites the 'ewm_mean' column.
df['ewm_mean'] = df['value'].ewm(alpha=0.3).mean()
df['ewm_mean'] = df['value'].ewm(halflife=3).mean()

时间偏移（shift） #

python

# Shift values later in time (each line overwrites the 'shifted' column)
df['shifted'] = df['value'].shift(1)  # move down by one row
df['shifted'] = df['value'].shift(7)  # move down by 7 rows

# Shift values earlier in time
df['shifted'] = df['value'].shift(-1)  # move up by one row

# Differences between rows
df['diff'] = df['value'].diff()      # difference from the previous row
df['diff_7'] = df['value'].diff(7)   # difference from the row 7 rows earlier

# Percentage change from the previous row
df['pct_change'] = df['value'].pct_change()

时区处理 #

python

# Create a timezone-naive timestamp
ts = pd.Timestamp('2024-01-01 10:00:00')

# Attach a timezone (the wall-clock time is kept; it is now interpreted as UTC)
ts_utc = ts.tz_localize('UTC')
print(ts_utc)  # 2024-01-01 10:00:00+00:00

# Convert to other timezones (same instant, different wall-clock time)
ts_ny = ts_utc.tz_convert('America/New_York')
ts_shanghai = ts_utc.tz_convert('Asia/Shanghai')
print(ts_shanghai)  # 2024-01-01 18:00:00+08:00

# Create a timezone-aware date range
idx = pd.date_range('2024-01-01', periods=5, tz='Asia/Shanghai')
print(idx)

# List the available timezone names.
# Uses the standard-library zoneinfo module (Python 3.9+) instead of pytz,
# which is a third-party package and may not be installed.
from zoneinfo import available_timezones
all_timezones = sorted(available_timezones())  # the set is unordered; sort for stable output
print(all_timezones[:10])

时间序列实战案例 #

python

# 股票数据分析
dates = pd.date_range('2023-01-01', '2023-12-31', freq='B')  # 工作日
prices = pd.DataFrame({
    'open': 100 + np.random.randn(len(dates)).cumsum(),
    'high': 101 + np.random.randn(len(dates)).cumsum(),
    'low': 99 + np.random.randn(len(dates)).cumsum(),
    'close': 100 + np.random.randn(len(dates)).cumsum(),
    'volume': np.random.randint(1000000, 10000000, len(dates))
}, index=dates)

# 计算技术指标
prices['ma_5'] = prices['close'].rolling(5).mean()
prices['ma_20'] = prices['close'].rolling(20).mean()
prices['daily_return'] = prices['close'].pct_change()
prices['volatility'] = prices['daily_return'].rolling(20).std()

# 月度统计
monthly_stats = prices.resample('M').agg({
    'open': 'first',
    'high': 'max',
    'low': 'min',
    'close': 'last',
    'volume': 'sum'
})

print(monthly_stats)

下一步 #

掌握了时间序列后，接下来学习如何使用 Pandas 进行数据可视化！