sql
WITH hash_bucket AS (
SELECT
*,
ROW_NUMBER() OVER (PARTITION BY bucket_no) AS bucket_rel_index,
COUNT(1) OVER (PARTITION BY bucket_no ) AS bucket_size
FROM
(
SELECT
*,
ABS(HASH(cid)) % 100000 AS bucket_no
FROM
xxxx
)
),
bucket_base AS
(
SELECT
bucket_no,
SUM(bucket_size) OVER (ORDER BY bucket_no ASC) - bucket_size AS bucket_base
FROM
(
SELECT
DISTINCT bucket_no,
bucket_size
FROM
hash_bucket
)
)
SELECT
t1.*,
t2.bucket_base + bucket_rel_index AS id_index
FROM
hash_bucket t1
JOIN
bucket_base t2
ON t1.bucket_no = t2.bucket_no
;
核心代码如上,主要思路先做分桶进行分布式局部排序、再基于桶大小得到全局索引
本文作者:ender
本文链接:
版权声明:本博客所有文章除特别声明外,均采用 BY-NC-SA 许可协议。转载请注明出处!