数据过滤 #

数据过滤概述 #

数据过滤是数据分析中最常用的操作之一，Pandas 提供了多种灵活的过滤方法。

text

┌─────────────────────────────────────────────────────────────┐
│                    数据过滤方法                              │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  布尔索引                                                   │
│  ├── 比较运算符：==, !=, >, <, >=, <=                      │
│  ├── 逻辑运算符：&, |, ~                                    │
│  └── isin(), between()                                     │
│                                                             │
│  字符串过滤                                                 │
│  ├── str.contains()                                        │
│  ├── str.startswith() / str.endswith()                     │
│  └── str.match()                                           │
│                                                             │
│  查询语法                                                   │
│  └── df.query('expression')                                │
│                                                             │
│  高级过滤                                                   │
│  ├── where() / mask()                                      │
│  ├── filter()                                              │
│  └── 自定义函数                                             │
│                                                             │
└─────────────────────────────────────────────────────────────┘

准备数据 #

python

import pandas as pd
import numpy as np

# Seed NumPy's global RNG (kept from the original setup; the sample data
# itself is written out literally and does not depend on it).
np.random.seed(42)

# One tuple per employee, in the same order as `columns`.
columns = ['name', 'age', 'city', 'department', 'salary', 'performance']
rows = [
    ('Alice', 25, 'New York', 'Sales', 50000, 85),
    ('Bob', 30, 'London', 'Engineering', 70000, 92),
    ('Charlie', 35, 'Tokyo', 'Engineering', 80000, 88),
    ('Diana', 28, 'Paris', 'Sales', 55000, 78),
    ('Eve', 32, 'Berlin', 'Marketing', 60000, 90),
    ('Frank', 45, 'London', 'Engineering', 90000, 95),
    ('Grace', 29, 'Tokyo', 'Sales', 52000, 82),
    ('Henry', 38, 'New York', 'Marketing', 75000, 87),
]
df = pd.DataFrame(rows, columns=columns)

print(df)

基本条件过滤 #

比较运算符 #

python

# Equal to: rows where age is exactly 30
print(df[df['age'] == 30])

# Not equal to: everyone outside the Sales department
print(df[df['department'] != 'Sales'])

# Greater than
print(df[df['salary'] > 70000])

# Less than
print(df[df['age'] < 30])

# Greater than or equal to
print(df[df['performance'] >= 90])

# Less than or equal to
print(df[df['salary'] <= 55000])

多条件过滤 #

python

# AND (&): a row is kept only when both conditions hold
over_30 = df['age'] > 30
paid_over_70k = df['salary'] > 70000
print(df[over_30 & paid_over_70k])

# OR (|): a row is kept when either condition holds
under_28 = df['age'] < 28
over_40 = df['age'] > 40
print(df[under_28 | over_40])

# NOT (~): invert a condition
in_sales = df['department'] == 'Sales'
print(df[~in_sales])

# Combined: engineers who are either highly paid or top performers
in_engineering = df['department'] == 'Engineering'
paid_over_75k = df['salary'] > 75000
top_performer = df['performance'] > 90
print(df[in_engineering & (paid_over_75k | top_performer)])

注意运算符优先级 #

python

# Wrong: `&` binds more tightly than `>`, so without parentheses Python reads
# the expression as the chained comparison
#     df['age'] > (30 & df['salary']) > 70000
# which raises ValueError (the truth value of a Series is ambiguous).
# df[df['age'] > 30 & df['salary'] > 70000]  # raises ValueError

# Right: wrap each comparison in its own parentheses
print(df[(df['age'] > 30) & (df['salary'] > 70000)])

isin 方法 #

基本用法 #

python

# Keep rows whose city is one of the listed values
cities = ['New York', 'London']
print(df[df['city'].isin(cities)])

# Keep rows from the selected departments
departments = ['Engineering', 'Marketing']
print(df[df['department'].isin(departments)])

# Negate with ~: rows whose city is NOT in the list
print(df[~df['city'].isin(['New York', 'London'])])

结合其他条件 #

python

# Combine a membership test with an ordinary comparison
in_target_city = df['city'].isin(['New York', 'London'])
print(df[in_target_city & (df['salary'] > 60000)])

between 方法 #

python

# Numeric range (both ends included)
print(df[df['age'].between(30, 40)])
# Equivalent to
print(df[(df['age'] >= 30) & (df['age'] <= 40)])

# Boundary handling: `inclusive` chooses which ends of the range are included
print(df[df['salary'].between(60000, 80000, inclusive='both')])  # the default
print(df[df['salary'].between(60000, 80000, inclusive='neither')])
print(df[df['salary'].between(60000, 80000, inclusive='left')])
print(df[df['salary'].between(60000, 80000, inclusive='right')])

字符串过滤 #

contains 方法 #

python

# Substring test
print(df[df['name'].str.contains('a')])
print(df[df['name'].str.contains('a', case=False)])  # case-insensitive

# Regular expressions
print(df[df['name'].str.contains(r'^[AB]')])  # starts with A or B
print(df[df['name'].str.contains(r'e$')])     # ends with e

startswith / endswith #

python

# Prefix match
print(df[df['name'].str.startswith('A')])

# Suffix match
print(df[df['name'].str.endswith('e')])

# Several prefixes at once: pass a tuple of candidates
print(df[df['name'].str.startswith(('A', 'B'))])

match 方法 #

python

# Regex match; str.match only matches from the beginning of each string
print(df[df['name'].str.match(r'[AB]')])  # starts with A or B
print(df[df['name'].str.match(r'.*ie.*')])  # contains "ie"

字符串长度 #

python

# Filter on string length: names longer than five characters
name_length = df['name'].str.len()
print(df[name_length > 5])

query 方法 #

基本语法 #

python

# Single condition, written as a string expression over column names
print(df.query('age > 30'))

# Several conditions
print(df.query('age > 30 and salary > 70000'))
print(df.query('age > 30 & salary > 70000'))  # equivalent

# OR
print(df.query('age < 28 or age > 40'))

# Negation
print(df.query('not (department == "Sales")'))

使用变量 #

python

# Prefix a name with @ to refer to a Python variable from inside the expression
min_salary = 70000
target_dept = 'Engineering'

print(df.query('salary > @min_salary'))
print(df.query('department == @target_dept'))

in 操作 #

python

# Membership test with `in`
print(df.query('city in ["New York", "London"]'))

# Negated membership with `not in`
print(df.query('city not in ["New York", "London"]'))

索引查询 #

python

# Give the index a name (on a copy, so df itself is left unchanged)
df_indexed = df.copy()
df_indexed.index.name = 'id'

# The named index can then be referenced in the expression like a column
print(df_indexed.query('id in [0, 2, 4]'))

字符串方法 #

python

# String accessor methods called from inside the query expression
# NOTE(review): some pandas/numexpr combinations reportedly need
# engine='python' for .str calls inside query - verify on your version.
print(df.query('name.str.contains("a")'))
print(df.query('name.str.startswith("A")'))

where 和 mask #

where 方法 #

python

# where keeps the values where the condition is True; the rest become NaN.
# The result has the same shape as df - no rows are dropped.
print(df.where(df['salary'] > 70000))

# Use a specific replacement value instead of NaN
print(df.where(df['salary'] > 70000, other=0))

# Apply to selected columns only
print(df[['salary', 'performance']].where(df['salary'] > 70000, other=0))

mask 方法 #

python

# mask is the inverse of where: values where the condition is True are
# replaced, and values where it is False are kept
print(df.mask(df['salary'] > 70000))

# Use a specific replacement value instead of NaN
print(df.mask(df['salary'] > 70000, other='High Salary'))

filter 方法 #

python

# DataFrame.filter selects by label (column or index names), not by cell values.

# Select columns by name
print(df.filter(like='name'))  # columns whose name contains "name"
print(df.filter(like='a'))     # columns whose name contains "a"

# Regular expression on the column names
print(df.filter(regex='^[a-z]'))  # columns starting with a lowercase letter

# Explicit list of columns
print(df.filter(items=['name', 'age', 'salary']))

# Select rows by index label
print(df.filter(like='0', axis=0))  # rows whose index label contains "0"

高级过滤技巧 #

使用函数 #

python

# Row-wise predicate via apply: keep rows whose salary / age ratio exceeds 2000
high_ratio = df.apply(lambda row: row['salary'] / row['age'] > 2000, axis=1)
print(df[high_ratio])

# Named predicate, used below with df.apply(..., axis=1)
def filter_func(row):
    """Return a truthy value when salary > 60000 and performance > 85.

    ``row`` is anything indexable by the keys 'salary' and 'performance'.
    'performance' is only read when the salary check passes.
    """
    salary_ok = row['salary'] > 60000
    return salary_ok and row['performance'] > 85

# Evaluate filter_func on every row (axis=1), then keep the rows it accepts
accepted = df.apply(filter_func, axis=1)
print(df[accepted])

使用 np.where #

python

import numpy as np

# Add a label column: 'High' where salary exceeds 70000, otherwise 'Low'
is_high_salary = df['salary'] > 70000
df['salary_level'] = np.where(is_high_salary, 'High', 'Low')
print(df)

使用 np.select #

python

# Multi-condition classification. np.select uses the first condition that is
# True for each row, so the thresholds are listed from strictest to loosest.
conditions = [
    df['performance'] >= 90,
    df['performance'] >= 80,
    df['performance'] >= 70
]
# choices[i] is the label assigned when conditions[i] is the first match
choices = ['Excellent', 'Good', 'Average']
# Rows matching none of the conditions get the default label
df['rating'] = np.select(conditions, choices, default='Poor')
print(df)

使用 pd.cut #

python

# Binning: assign each age to a labelled interval
age_bins = [0, 30, 40, 100]
age_labels = ['Young', 'Middle', 'Senior']
df['age_group'] = pd.cut(df['age'], bins=age_bins, labels=age_labels)
print(df)

# Filter on the bin label
is_young = df['age_group'] == 'Young'
print(df[is_young])

使用 pd.qcut #

python

# Equal-frequency binning: split salaries into four quantile-based bins
df['salary_quartile'] = pd.qcut(df['salary'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
print(df)

# Filter on the quantile label
print(df[df['salary_quartile'] == 'Q4'])

缺失值过滤 #

python

# Sample frame with gaps: A misses one value, B misses two, C is complete
df_na = pd.DataFrame({
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, np.nan, 8],
    'C': [9, 10, 11, 12],
})

# Drop rows with missing data: first judged on column A only,
# then on every column (dropna)
a_present = df_na['A'].notna()
print(df_na[a_present])
print(df_na.dropna())

# Keep only the rows that have a missing value in at least one column
row_has_gap = df_na.isna().any(axis=1)
print(df_na[row_has_gap])

# Keep only the rows where column B is missing
b_missing = df_na['B'].isna()
print(df_na[b_missing])

重复值过滤 #

python

# Sample frame in which rows 0/1 and rows 2/3 are exact duplicates
df_dup = pd.DataFrame({
    'A': [1, 1, 2, 2, 3],
    'B': ['a', 'a', 'b', 'b', 'c'],
})

# drop_duplicates() returns the same rows as df_dup[~df_dup.duplicated()]:
# the first occurrence of each row is kept
print(df_dup.drop_duplicates())

# Keep the last occurrence instead of the first
print(df_dup.drop_duplicates(keep='last'))

# Judge duplicates on column A only
print(df_dup.drop_duplicates(subset=['A']))

性能优化 #

使用 isin 替代多个 or #

python

# Slow: one equality comparison per value, OR-ed together
in_new_york = df['city'] == 'New York'
in_london = df['city'] == 'London'
in_tokyo = df['city'] == 'Tokyo'
print(df[in_new_york | in_london | in_tokyo])

# Fast: a single membership test
target_cities = ['New York', 'London', 'Tokyo']
print(df[df['city'].isin(target_cities)])

使用 query 处理复杂条件 #

python

# With several conditions, a query string reads more clearly than nested
# parentheses and & / | operators
print(df.query('age > 30 and salary > 70000 and department in ["Engineering", "Marketing"]'))

避免链式操作 #

python

# Not recommended: chained indexing - two separate steps (filter the rows,
# then pick the columns from the intermediate result)
result = df[df['age'] > 30][['name', 'salary']]

# Recommended: a single .loc call selects the rows and the columns together
result = df.loc[df['age'] > 30, ['name', 'salary']]

过滤方法选择指南 #

text

┌─────────────────────────────────────────────────────────────┐
│                    过滤方法选择                              │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  场景                      推荐方法                         │
│  ────────────────────────  ────────────────────────────    │
│  单条件                    df[df['col'] > value]            │
│  多条件（与/或）           df[(cond1) & (cond2)]            │
│  特定值列表                df[df['col'].isin(values)]       │
│  范围                      df[df['col'].between(a, b)]      │
│  字符串匹配                df[df['col'].str.contains()]     │
│  复杂条件                  df.query('...')                  │
│  函数过滤                  df[df.apply(func, axis=1)]       │
│                                                             │
└─────────────────────────────────────────────────────────────┘

下一步 #

掌握了数据过滤后，接下来学习数据清洗，了解如何处理缺失值、重复值和异常值！