Spanner高级查询 #
一、CTE (Common Table Expression) #
1.1 基本语法 #
sql
WITH cte_name AS (
SELECT ...
)
SELECT ... FROM cte_name;
1.2 简单CTE #
sql
-- 单个CTE
WITH active_users AS (
SELECT user_id, name
FROM users
WHERE status = 'active'
)
SELECT * FROM active_users;
-- 多个CTE
WITH
active_users AS (
SELECT user_id, name
FROM users
WHERE status = 'active'
),
user_orders AS (
SELECT user_id, COUNT(*) AS order_count
FROM orders
GROUP BY user_id
)
SELECT a.name, COALESCE(o.order_count, 0) AS orders
FROM active_users a
LEFT JOIN user_orders o ON a.user_id = o.user_id;
1.3 递归CTE #
sql
-- 递归CTE: 组织层级
WITH RECURSIVE org_hierarchy AS (
-- 基础查询: 顶级员工
SELECT emp_id, name, manager_id, 1 AS level
FROM employees
WHERE manager_id IS NULL
UNION ALL
-- 递归查询: 下级员工
SELECT e.emp_id, e.name, e.manager_id, h.level + 1
FROM employees e
INNER JOIN org_hierarchy h ON e.manager_id = h.emp_id
)
SELECT * FROM org_hierarchy ORDER BY level;
-- 递归CTE: 路径查找
WITH RECURSIVE paths AS (
SELECT
from_node,
to_node,
ARRAY[from_node, to_node] AS path,
1 AS distance
FROM edges
UNION ALL
SELECT
p.from_node,
e.to_node,
ARRAY_CONCAT(p.path, [e.to_node]),
p.distance + 1
FROM paths p
INNER JOIN edges e ON p.to_node = e.from_node
WHERE p.distance < 5 -- 限制深度
)
SELECT * FROM paths;
1.4 CTE优势 #
text
CTE优势:
├── 提高可读性
├── 避免重复子查询
├── 支持递归查询
├── 便于维护
└── 逻辑清晰
二、窗口函数 #
2.1 窗口函数语法 #
sql
function_name() OVER (
[PARTITION BY partition_expression]
[ORDER BY order_expression [ASC|DESC]]
[frame_clause]
)
2.2 排名函数 #
sql
-- ROW_NUMBER: 行号
SELECT
name,
salary,
ROW_NUMBER() OVER (ORDER BY salary DESC) AS row_num
FROM users;
-- RANK: 排名(有并列)
SELECT
name,
salary,
RANK() OVER (ORDER BY salary DESC) AS rank
FROM users;
-- DENSE_RANK: 紧凑排名
SELECT
name,
salary,
DENSE_RANK() OVER (ORDER BY salary DESC) AS dense_rank
FROM users;
-- PERCENT_RANK: 百分比排名
SELECT
name,
salary,
PERCENT_RANK() OVER (ORDER BY salary DESC) AS pct_rank
FROM users;
-- NTILE: 分组
SELECT
name,
salary,
NTILE(4) OVER (ORDER BY salary DESC) AS quartile
FROM users;
2.3 聚合窗口函数 #
sql
-- 累计求和
SELECT
order_date,
amount,
SUM(amount) OVER (ORDER BY order_date) AS running_total
FROM orders;
-- 移动平均
SELECT
order_date,
amount,
AVG(amount) OVER (
ORDER BY order_date
ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
) AS moving_avg
FROM orders;
-- 分组聚合
SELECT
dept_id,
name,
salary,
SUM(salary) OVER (PARTITION BY dept_id) AS dept_total,
AVG(salary) OVER (PARTITION BY dept_id) AS dept_avg
FROM users;
-- 累计计数
SELECT
order_date,
COUNT(*) OVER (ORDER BY order_date) AS cumulative_count
FROM orders;
2.4 偏移函数 #
sql
-- LAG: 前一行
SELECT
order_date,
amount,
LAG(amount) OVER (ORDER BY order_date) AS prev_amount,
amount - LAG(amount) OVER (ORDER BY order_date) AS diff
FROM orders;
-- LEAD: 后一行
SELECT
order_date,
amount,
LEAD(amount) OVER (ORDER BY order_date) AS next_amount
FROM orders;
-- 指定偏移量和默认值
SELECT
order_date,
amount,
LAG(amount, 2, 0) OVER (ORDER BY order_date) AS prev_2_amount
FROM orders;
-- FIRST_VALUE: 第一个值
SELECT
order_date,
amount,
FIRST_VALUE(amount) OVER (ORDER BY order_date) AS first_amount
FROM orders;
-- LAST_VALUE: 最后一个值
SELECT
order_date,
amount,
LAST_VALUE(amount) OVER (
ORDER BY order_date
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
) AS last_amount
FROM orders;
2.5 分帧 #
sql
-- ROWS帧
SELECT
order_date,
amount,
AVG(amount) OVER (
ORDER BY order_date
ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING
) AS moving_avg
FROM orders;
-- RANGE帧
SELECT
order_date,
amount,
SUM(amount) OVER (
ORDER BY order_date
RANGE BETWEEN INTERVAL 7 DAY PRECEDING AND CURRENT ROW
) AS weekly_total
FROM orders;
-- 帧类型
-- UNBOUNDED PRECEDING: 从开始
-- UNBOUNDED FOLLOWING: 到结束
-- CURRENT ROW: 当前行
-- n PRECEDING: 前n行
-- n FOLLOWING: 后n行
三、分区查询 #
3.1 分区表查询 #
sql
-- 创建分区表
CREATE TABLE logs (
log_id INT64 NOT NULL,
log_date DATE NOT NULL,
message STRING(MAX)
) PRIMARY KEY (log_id, log_date)
PARTITION BY DATE(log_date);
-- 查询特定分区
SELECT * FROM logs
WHERE log_date = DATE '2024-03-27';
-- 查询分区范围
SELECT * FROM logs
WHERE log_date BETWEEN DATE '2024-01-01' AND DATE '2024-03-31';
3.2 分区裁剪 #
sql
-- 分区裁剪: 只扫描需要的分区
SELECT * FROM logs
WHERE log_date >= DATE '2024-03-01';
-- 不推荐: 函数导致全分区扫描
SELECT * FROM logs
WHERE DATE(log_date) >= DATE '2024-03-01'; -- 不使用分区裁剪
-- 推荐: 直接使用分区列
SELECT * FROM logs
WHERE log_date >= DATE '2024-03-01'; -- 使用分区裁剪
3.3 分区管理 #
sql
-- 查看分区信息
SELECT
table_name,
partition_expression
FROM INFORMATION_SCHEMA.TABLES
WHERE table_name = 'logs';
-- 分区过期设置
CREATE TABLE logs (
log_id INT64 NOT NULL,
log_date DATE NOT NULL,
message STRING(MAX)
) PRIMARY KEY (log_id, log_date)
PARTITION BY DATE(log_date)
OPTIONS (
partition_expiration_days = 30
);
四、全文搜索 #
4.1 创建搜索索引 #
sql
-- 创建全文搜索索引
CREATE SEARCH INDEX idx_products_search ON products(name, description);
-- 使用搜索函数
SELECT * FROM products
WHERE SEARCH(name, 'laptop');
-- 多词搜索
SELECT * FROM products
WHERE SEARCH(description, 'high performance laptop');
4.2 搜索语法 #
sql
-- 基本搜索
SELECT * FROM products WHERE SEARCH(name, 'laptop');
-- 短语搜索
SELECT * FROM products WHERE SEARCH(description, '"high performance"');
-- 布尔搜索
SELECT * FROM products
WHERE SEARCH(description, 'laptop AND computer');
SELECT * FROM products
WHERE SEARCH(description, 'laptop OR desktop');
SELECT * FROM products
WHERE SEARCH(description, 'laptop NOT refurbished');
-- 通配符
SELECT * FROM products WHERE SEARCH(name, 'lap*');
4.3 搜索排名 #
sql
-- 使用SCORE函数获取相关性分数
SELECT
name,
description,
SCORE() AS relevance
FROM products
WHERE SEARCH(description, 'laptop')
ORDER BY relevance DESC;
五、JSON查询 #
5.1 JSON提取 #
sql
-- 提取JSON值
SELECT
id,
JSON_VALUE(data, '$.name') AS name,
JSON_VALUE(data, '$.address.city') AS city
FROM documents;
-- 提取JSON对象
SELECT
id,
JSON_QUERY(data, '$.address') AS address
FROM documents;
-- 提取数组元素
SELECT
id,
JSON_VALUE(data, '$.tags[0]') AS first_tag
FROM documents;
5.2 JSON条件 #
sql
-- JSON条件查询
SELECT * FROM documents
WHERE JSON_VALUE(data, '$.status') = 'active';
-- JSON存在性检查
SELECT * FROM documents
WHERE JSON_QUERY(data, '$.address') IS NOT NULL;
5.3 JSON修改 #
sql
-- JSON_SET: 设置值
SELECT JSON_SET(
JSON '{"name": "John", "age": 30}',
'$.age', 31
);
-- JSON_REMOVE: 删除值
SELECT JSON_REMOVE(
JSON '{"name": "John", "age": 30}',
'$.age'
);
六、数组查询 #
6.1 数组操作 #
sql
-- 查询数组元素
SELECT * FROM posts
WHERE 'tech' IN UNNEST(tags);
-- 展开数组
SELECT
post_id,
tag
FROM posts,
UNNEST(tags) AS tag;
-- 数组聚合
SELECT
user_id,
ARRAY_AGG(order_id) AS order_ids
FROM orders
GROUP BY user_id;
6.2 数组函数 #
sql
-- 数组长度
SELECT ARRAY_LENGTH(tags) FROM posts;
-- 数组包含
SELECT * FROM posts WHERE ARRAY_INCLUDES(tags, 'tech');
-- 数组过滤
SELECT ARRAY(
SELECT x FROM UNNEST([1, 2, 3, 4, 5]) AS x WHERE x > 2
);
-- 数组排序
SELECT ARRAY(
SELECT x FROM UNNEST([3, 1, 2]) AS x ORDER BY x
);
七、高级聚合 #
7.1 GROUPING SETS #
sql
-- 多维聚合
SELECT
dept_id,
status,
COUNT(*) AS count
FROM users
GROUP BY GROUPING SETS (
(dept_id, status),
(dept_id),
(status),
()
);
-- 等价于多个GROUP BY的UNION ALL
7.2 ROLLUP #
sql
-- ROLLUP: 层级聚合
SELECT
dept_id,
status,
COUNT(*) AS count
FROM users
GROUP BY ROLLUP (dept_id, status);
-- 生成:
-- (dept_id, status)
-- (dept_id)
-- ()
7.3 CUBE #
sql
-- CUBE: 所有组合聚合
SELECT
dept_id,
status,
COUNT(*) AS count
FROM users
GROUP BY CUBE (dept_id, status);
-- 生成:
-- (dept_id, status)
-- (dept_id)
-- (status)
-- ()
7.4 GROUPING函数 #
sql
-- GROUPING: 判断是否为聚合行
SELECT
dept_id,
status,
COUNT(*) AS count,
GROUPING(dept_id) AS is_dept_total,
GROUPING(status) AS is_status_total
FROM users
GROUP BY ROLLUP (dept_id, status);
八、查询优化 #
8.1 使用EXPLAIN #
sql
-- 查看执行计划
EXPLAIN SELECT u.name, o.order_id
FROM users u
INNER JOIN orders o ON u.user_id = o.user_id
WHERE u.status = 'active';
-- 查看实际执行统计
EXPLAIN ANALYZE SELECT u.name, o.order_id
FROM users u
INNER JOIN orders o ON u.user_id = o.user_id
WHERE u.status = 'active';
8.2 查询提示 #
sql
-- 强制使用索引
SELECT * FROM users@{FORCE_INDEX=idx_users_status}
WHERE status = 'active';
-- 设置JOIN顺序
SELECT * FROM users@{FORCE_JOIN_ORDER=TRUE}
INNER JOIN orders ON users.user_id = orders.user_id;
-- 设置查询超时
SELECT * FROM users@{QUERY_TIMEOUT=10s};
8.3 性能建议 #
text
高级查询性能建议:
├── 使用分区裁剪
├── 创建合适的索引
├── 使用窗口函数替代自连接
├── 使用CTE提高可读性
├── 避免SELECT *
└── 使用EXPLAIN分析
九、总结 #
高级查询功能:
| 功能 | 说明 |
|---|---|
| CTE | 提高可读性,支持递归 |
| 窗口函数 | 排名、聚合、偏移 |
| 分区查询 | 分区裁剪优化 |
| 全文搜索 | 文本搜索功能 |
| JSON查询 | JSON数据处理 |
| 数组查询 | 数组数据处理 |
最佳实践:
text
1. 使用CTE
└── 提高可读性和维护性
2. 使用窗口函数
└── 替代复杂的自连接
3. 利用分区裁剪
└── 减少扫描数据量
4. 创建合适的索引
└── 加速查询
5. 分析执行计划
└── 使用EXPLAIN优化
下一步,让我们学习分布式特性!
最后更新:2026-03-27