编辑
2025-07-23
数仓
0
请注意,本文编写于 294 天前,最后修改于 43 天前,其中某些信息可能已经过时。

目录

1. 建表

1. 建表

  1. text格式

    sql
    -- Text-format external table over OSS: each raw line lands in the msg column.
    -- (Fixed: the original statement ended with a stray extra semicolon "; ;".)
    CREATE EXTERNAL TABLE ods_crawler_google_keyword_bybatch_outtable_di (
        msg STRING
    )
    PARTITIONED BY (ymd STRING, batch STRING)
    ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
    STORED AS TEXTFILE
    LOCATION 'oss://oss-cn-shanghai-internal.aliyuncs.com/xxx/';
  2. json格式

    sql
    -- JSON-format external table: each line is parsed by the Hive JsonSerDe into `data`.
    CREATE EXTERNAL TABLE IF NOT EXISTS dialogue.dwd_ext_xingye_log_chatml_data (
        `data` STRING
    )
    PARTITIONED BY (
        data_type STRING,
        ymd STRING
    )
    ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
    STORED AS
        INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
        OUTPUTFORMAT 'org.apache.hadoop.mapred.TextOutputFormat'
    LOCATION 'oss://data-dlc/minimax-dialogue/data/chatml/';
  3. parquet格式

    sql
    -- Parquet-format external table for crawled HTML pages.
    CREATE EXTERNAL TABLE dwd_crawler_html_out_di (
        url STRING,
        html STRING,
        source_tag STRING,
        lang STRING
    )
    PARTITIONED BY (ymd STRING)
    STORED AS PARQUET
    LOCATION 'oss://data-dlc/minimax-dialogue/data/chatml/data_type=html_data/';
  4. 自定义解析器

    sql
    -- External table backed by a custom storage handler (WARC parser) shipped in a UDF jar.
    -- (Fixed: the original statement was missing its terminating semicolon.)
    CREATE EXTERNAL TABLE ods_crawler_sites_warc_data_extract_result_external_hi (
        warc_host STRING,
        meta_info STRING,
        final_url STRING,
        status_code STRING,
        data_dt STRING,
        biz_code STRING,
        target_url STRING,
        warc_date STRING,
        content_type STRING,
        content_length STRING,
        warc_file_name STRING,
        content STRING,
        import_msg STRING
    )
    PARTITIONED BY (
        host STRING,
        status STRING,
        ymd STRING,
        hour STRING
    )
    STORED BY 'com.crawler.udf.sites_warc.WarcStorageHandler'
    WITH SERDEPROPERTIES ('max_file_size' = '157286400')
    LOCATION 'oss://crawler-data-storage/Crawlerlee/'
    USING 'crawler_udf-1.0-SNAPSHOT.jar';

2. 使用

  1. 插入数据
    sql
    -- Prevent MaxCompute from generating a hidden ".odps" folder in the OSS output path.
    -- (Fixed: in the original one-line form, this "--" comment ran to end of line and
    -- commented out the entire INSERT statement.)
    SET odps.sql.unstructured.oss.commit.mode = true;

    INSERT OVERWRITE TABLE dwd_crawler_html_out_di PARTITION (ymd = '${P_DATE}')
    SELECT
        a.source_url,
        html,
        biz_code,
        parse_lang
    FROM dwd_crawler_content_filtered_detail_byscore_di a
    WHERE a.ymd = '${P_DATE}'
        AND a.score_level != 'low'
        AND COALESCE(html, '') != '';
  2. 读取数据
    -- (Fixed: in the original one-line form the leading "--" comment swallowed every
    -- statement; also the first command was missing REPAIR — MaxCompute syntax is
    -- MSCK REPAIR TABLE ... ADD PARTITIONS, as used correctly in the last statement.)

    -- Discover and register every partition directory under the table location.
    MSCK REPAIR TABLE ods_crawler_sites_warc_data_extract_result_external_hi ADD PARTITIONS;

    -- Register specific partitions only.
    ALTER TABLE ods_crawler_sites_warc_data_extract_result_external_hi
    ADD IF NOT EXISTS
        PARTITION (host = 'demo.com', status = 'error',   ymd = '20250624', hour = '09')
        PARTITION (host = 'demo.com', status = 'success', ymd = '20250624', hour = '09');

    -- Register a partition with an explicit custom location.
    ALTER TABLE log_table_external
    ADD PARTITION (year = '2016', month = '06', day = '01')
    LOCATION 'oss://oss-cn-hangzhou-internal.aliyuncs.com/bucket名称/oss-odps-test/log_data_customized/2016/06/01/';

    -- Map OSS directory names onto partition columns during discovery.
    MSCK REPAIR TABLE ods_crawler_sites_warc_data_extract_result_external_hi
    ADD PARTITIONS
    WITH PROPERTIES ('odps.msck.partition.column.mapping' = 'host:host,status:status,ymd:ymd');
  3. load和unload
  • load
sql
create table xxx lifecycle 10 AS SELECT '' AS msg ; load overwrite table xxx from location 'oss://xxx/' ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
  • unload
sql
SET odps.sql.unstructured.oss.commit.mode = true; --导出数据。 UNLOAD FROM ( SELECT * FROM xxx ) INTO LOCATION 'oss://xxx/' stored AS TEXTFILE ;
  4. 大字段导出
  • 建表增加参数
sql
tblproperties( 'mcfed.parquet.block.row.count.limit' = '100', 'mcfed.parquet.page.size.row.check.max' = '100', 'mcfed.parquet.page.size.row.check.min' = '100' )
  • 导出时增加参数
sql
set odps.isolation.session.enable=true; set odps.stage.mapper.mem=8192; set odps.stage.mapper.jvm.mem=6144;
  5. 复杂parquet字段导入
  • 使用Schemaless Query读取数据 通过to_json转字符串
sql
INSERT OVERWRITE TABLE tmp_data_0319_100 SELECT * FROM LOCATION 'oss://oss-cn-shanghai-internal.aliyuncs.com/xxx/' ( 'file_format'='parquet' ) ; SELECT to_json(data) FROM tmp_data_0319_100
  6. 设置导出文件数 set odps.sql.split.dop = {"*":10};

本文作者:ender

本文链接:

版权声明:本博客所有文章除特别声明外,均采用 BY-NC-SA 许可协议。转载请注明出处!