HBase查询数据 #

一、GET语法 #

1.1 基本语法 #

ruby

# 基本语法
get '表名', '行键'

# 示例
get 'user', 'user001'

1.2 查询指定列 #

ruby

# 查询指定列族
get 'user', 'user001', 'info'

# 查询指定列
get 'user', 'user001', 'info:name'

# 查询多个列
get 'user', 'user001', ['info:name', 'info:age']

1.3 完整语法 #

ruby

# 完整语法
get '表名', '行键', {选项}

# 示例
get 'user', 'user001', {COLUMN => 'info:name', VERSIONS => 3}

二、基本查询操作 #

2.1 查询整行 #

ruby

# 查询整行数据
get 'user', 'user001'

# 输出示例
COLUMN                   CELL
info:age                 timestamp=1704067200000, value=25
info:email               timestamp=1704067200000, value=zhangsan@example.com
info:name                timestamp=1704067200000, value=张三

2.2 查询指定列族 #

ruby

# 查询info列族
get 'user', 'user001', 'info'

# 输出示例
COLUMN                   CELL
info:age                 timestamp=1704067200000, value=25
info:email               timestamp=1704067200000, value=zhangsan@example.com
info:name                timestamp=1704067200000, value=张三

2.3 查询指定列 #

ruby

# 查询单列
get 'user', 'user001', 'info:name'

# 输出示例
COLUMN                   CELL
info:name                timestamp=1704067200000, value=张三

# 查询多列
get 'user', 'user001', ['info:name', 'info:age']

# 输出示例
COLUMN                   CELL
info:age                 timestamp=1704067200000, value=25
info:name                timestamp=1704067200000, value=张三

2.4 查询命名空间中的表 #

ruby

# 查询命名空间中的表
get 'myapp:user', 'user001'

三、版本查询 #

3.1 查询最新版本 #

ruby

# 默认查询最新版本
get 'user', 'user001', 'info:name'

# 等同于
get 'user', 'user001', {COLUMN => 'info:name', VERSIONS => 1}

3.2 查询多个版本 #

ruby

# 查询多个版本
get 'user', 'user001', {COLUMN => 'info:name', VERSIONS => 3}

# 输出示例
COLUMN                   CELL
info:name                timestamp=1704067260000, value=张三丰
info:name                timestamp=1704067230000, value=张三
info:name                timestamp=1704067200000, value=小张

3.3 查询所有版本 #

ruby

# 查询所有版本（使用Integer::MAX_VALUE）
get 'user', 'user001', {COLUMN => 'info:name', VERSIONS => 2147483647}

3.4 按时间戳查询 #

ruby

# 查询指定时间戳的版本
get 'user', 'user001', {COLUMN => 'info:name', TIMESTAMP => 1704067200000}

# 查询时间范围内的版本
get 'user', 'user001', {COLUMN => 'info:name', TIMERANGE => [1704067200000, 1704067300000]}

四、高级查询选项 #

4.1 查询选项列表 #

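选项	说明	示例
COLUMN	指定列	COLUMN => 'info:name'
COLUMNS	指定多列	COLUMNS => ['info:name', 'info:age']
VERSIONS	版本数	VERSIONS => 3
TIMESTAMP	时间戳	TIMESTAMP => 1704067200000
TIMERANGE	时间范围	TIMERANGE => [1704067200000, 1704067300000]
FILTER	过滤器	FILTER => "ValueFilter(=, 'binary:张三')"
MAXLENGTH	最大长度	MAXLENGTH => 100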

4.2 时间范围查询 #

ruby

# 查询时间范围内的数据
get 'user', 'user001', {TIMERANGE => [1704067200000, 1704067300000]}

# 查询指定时间戳
get 'user', 'user001', {TIMESTAMP => 1704067200000}

4.3 使用过滤器 #

ruby

# 值过滤器
get 'user', 'user001', {FILTER => "ValueFilter(=, 'binary:张三')"}

# 列名过滤器
get 'user', 'user001', {FILTER => "QualifierFilter(=, 'binary:name')"}

# 正则表达式过滤器
get 'user', 'user001', {FILTER => "ValueFilter(=, 'regexstring:张.*')"}

4.4 限制返回长度 #

ruby

# 限制返回值的最大长度
get 'user', 'user001', {COLUMN => 'info:content', MAXLENGTH => 100}

五、查询示例 #

5.1 用户信息查询 #

ruby

# 查询用户基本信息
get 'user', 'user001'

# 查询用户姓名
get 'user', 'user001', 'info:name'

# 查询用户多个属性
get 'user', 'user001', ['info:name', 'info:age', 'info:email']

# 查询用户历史版本
get 'user', 'user001', {COLUMN => 'info:name', VERSIONS => 5}

5.2 订单信息查询 #

ruby

# 查询订单详情
get 'order', 'order001'

# 查询订单状态
get 'order', 'order001', 'detail:status'

# 查询订单金额
get 'order', 'order001', 'detail:amount'

5.3 时序数据查询 #

ruby

# 查询传感器数据
get 'sensor', 'device001_1704067200'

# 查询温度数据
get 'sensor', 'device001_1704067200', 'data:temp'

# 查询多个指标
get 'sensor', 'device001_1704067200', ['data:temp', 'data:humidity']

六、Java API查询 #

6.1 基本查询 #

java

import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;

public class GetExample {
    public static void main(String[] args) throws Exception {
        Connection connection = ConnectionFactory.createConnection();
        Table table = connection.getTable(TableName.valueOf("user"));
        
        // 创建Get对象
        Get get = new Get(Bytes.toBytes("user001"));
        
        // 执行查询
        Result result = table.get(get);
        
        // 获取值
        byte[] nameBytes = result.getValue(
            Bytes.toBytes("info"), 
            Bytes.toBytes("name")
        );
        String name = Bytes.toString(nameBytes);
        System.out.println("Name: " + name);
        
        table.close();
        connection.close();
    }
}

6.2 查询指定列 #

java

// 创建Get对象
Get get = new Get(Bytes.toBytes("user001"));

// 添加要查询的列
get.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"));
get.addColumn(Bytes.toBytes("info"), Bytes.toBytes("age"));

// 执行查询
Result result = table.get(get);

6.3 查询多个版本 #

java

// 创建Get对象
Get get = new Get(Bytes.toBytes("user001"));

// 设置版本数
get.readAllVersions();  // 读取所有版本
// 或
get.readVersions(3);    // 读取最新3个版本

// 执行查询
Result result = table.get(get);

// 遍历版本
result.getColumnCells(Bytes.toBytes("info"), Bytes.toBytes("name"))
    .forEach(cell -> {
        String value = Bytes.toString(CellUtil.cloneValue(cell));
        long timestamp = cell.getTimestamp();
        System.out.println("Value: " + value + ", Timestamp: " + timestamp);
    });

6.4 批量查询 #

java

// 创建多个Get对象
List<Get> gets = new ArrayList<>();
gets.add(new Get(Bytes.toBytes("user001")));
gets.add(new Get(Bytes.toBytes("user002")));
gets.add(new Get(Bytes.toBytes("user003")));

// 批量查询
Result[] results = table.get(gets);

// 遍历结果
for (Result result : results) {
    if (!result.isEmpty()) {
        String name = Bytes.toString(
            result.getValue(Bytes.toBytes("info"), Bytes.toBytes("name"))
        );
        System.out.println("Name: " + name);
    }
}

七、查询优化 #

7.1 指定列查询 #

text

查询优化建议
├── 只查询需要的列
│   └── 减少数据传输
│
├── 避免查询整行
│   └── 除非确实需要所有列
│
└── 使用列族限定
    └── 减少StoreFile扫描

7.2 使用布隆过滤器 #

ruby

# 创建表时启用布隆过滤器
create 'user', {NAME => 'info', BLOOMFILTER => 'ROW'}

# ROW类型：根据RowKey过滤
# ROWCOL类型：根据RowKey+Column过滤

7.3 BlockCache优化 #

xml

<!-- hbase-site.xml -->

<!-- 调整BlockCache占堆内存的比例（默认值即为0.4；读多写少时可适当调大，但与MemStore上限之和不能超过0.8） -->
<property>
    <name>hfile.block.cache.size</name>
    <value>0.4</value>
</property>

7.4 查询缓存 #

java

// 开启查询缓存
Get get = new Get(Bytes.toBytes("user001"));
get.setCacheBlocks(true);  // 默认为true

// 对于一次性查询，可以关闭缓存
get.setCacheBlocks(false);

八、常见问题 #

8.1 行不存在 #

ruby

# 查询不存在的行
get 'user', 'user999'

# 输出为空（无结果）

8.2 列不存在 #

ruby

# 查询不存在的列
get 'user', 'user001', 'info:notexist'

# 输出为空（无结果）

8.3 版本数超过限制 #

ruby

# 创建表时版本数为3
create 'user', {NAME => 'info', VERSIONS => 3}

# 插入5个版本
put 'user', 'user001', 'info:name', 'v1'
put 'user', 'user001', 'info:name', 'v2'
put 'user', 'user001', 'info:name', 'v3'
put 'user', 'user001', 'info:name', 'v4'
put 'user', 'user001', 'info:name', 'v5'

# 查询所有版本
get 'user', 'user001', {COLUMN => 'info:name', VERSIONS => 10}

# 结果：只返回最新3个版本（按时间戳从新到旧：v5, v4, v3）

8.4 时间戳问题 #

ruby

# 时间范围查询注意边界
# TIMERANGE => [start, end)
# start包含，end不包含

get 'user', 'user001', {TIMERANGE => [1704067200000, 1704067300000]}
# 查询时间戳 >= 1704067200000 且 < 1704067300000 的数据

九、最佳实践 #

9.1 查询设计 #

text

查询设计建议
├── 只查询需要的列
├── 合理设置版本数
├── 使用布隆过滤器
└── 避免频繁查询同一行

9.2 性能优化 #

text

性能优化建议
├── 使用批量查询减少网络开销
├── 开启BlockCache
├── 合理设计RowKey
└── 预分区避免热点

9.3 数据一致性 #

text

数据一致性建议
├── 理解MVCC机制
├── 注意版本查询
└── 关注时间戳范围

十、总结 #

本节介绍了HBase查询数据：

操作	语法
查询整行	get '表', 'row'
查询列族	get '表', 'row', 'cf'
查询列	get '表', 'row', 'cf:col'
查询多列	get '表', 'row', ['cf:col1', 'cf:col2']
查询版本	get '表', 'row', {COLUMN => 'cf:col', VERSIONS => n}
时间范围	get '表', 'row', {TIMERANGE => [ts1, ts2]}

下一步，让我们学习删除数据！