text格式
-- Textfile-format external table: each line under the OSS location is read
-- into the single `msg` string column.
CREATE EXTERNAL TABLE IF NOT EXISTS ods_crawler_google_keyword_bybatch_outtable_di (
    msg STRING
)
PARTITIONED BY (ymd STRING, batch STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS TEXTFILE
LOCATION 'oss://oss-cn-shanghai-internal.aliyuncs.com/xxx/';
json格式
-- JSON-format external table: JsonSerDe parses each JSON line; here the whole
-- record is exposed as the single `data` column (backquoted: reserved word).
CREATE EXTERNAL TABLE IF NOT EXISTS dialogue.dwd_ext_xingye_log_chatml_data (
    `data` STRING
)
PARTITIONED BY (
    data_type STRING,
    ymd STRING
)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.mapred.TextOutputFormat'
LOCATION 'oss://data-dlc/minimax-dialogue/data/chatml/';
parquet格式
-- Parquet-format external table over crawler HTML output on OSS.
CREATE EXTERNAL TABLE IF NOT EXISTS dwd_crawler_html_out_di (
    url        STRING,
    html       STRING,
    source_tag STRING,
    lang       STRING
)
PARTITIONED BY (ymd STRING)
STORED AS PARQUET
LOCATION 'oss://data-dlc/minimax-dialogue/data/chatml/data_type=html_data/';
自定义解析器
-- External table backed by a custom storage handler (WARC parser).
-- The handler class comes from the jar named in the USING clause.
CREATE EXTERNAL TABLE IF NOT EXISTS ods_crawler_sites_warc_data_extract_result_external_hi (
    warc_host      STRING,
    meta_info      STRING,
    final_url      STRING,
    status_code    STRING,
    data_dt        STRING,
    biz_code       STRING,
    target_url     STRING,
    warc_date      STRING,
    content_type   STRING,
    content_length STRING,
    warc_file_name STRING,
    content        STRING,
    import_msg     STRING
)
PARTITIONED BY (
    host   STRING,
    status STRING,
    ymd    STRING,
    hour   STRING
)
STORED BY 'com.crawler.udf.sites_warc.WarcStorageHandler'
WITH SERDEPROPERTIES (
    'max_file_size' = '157286400'  -- 150 MB cap per output file (bytes)
)
LOCATION 'oss://crawler-data-storage/Crawlerlee/'
USING 'crawler_udf-1.0-SNAPSHOT.jar';
-- Prevent MaxCompute from creating a ".odps" folder when committing OSS output.
SET odps.sql.unstructured.oss.commit.mode = true;

-- Write one day's filtered crawl pages into the parquet external table.
-- Columns map positionally onto (url, html, source_tag, lang).
INSERT OVERWRITE TABLE dwd_crawler_html_out_di PARTITION (ymd = '${P_DATE}')
SELECT
    a.source_url,
    a.html,
    a.biz_code,
    a.parse_lang
FROM dwd_crawler_content_filtered_detail_byscore_di a
WHERE
    a.ymd = '${P_DATE}'
    AND a.score_level != 'low'
    AND COALESCE(a.html, '') != '';
-- Discover and register every partition directory under the table location.
MSCK REPAIR TABLE ods_crawler_sites_warc_data_extract_result_external_hi ADD PARTITIONS;

-- Register specific partition paths only.
ALTER TABLE ods_crawler_sites_warc_data_extract_result_external_hi
ADD IF NOT EXISTS
    PARTITION (host = 'demo.com', status = 'error', ymd = '20250624', hour = '09')
    PARTITION (host = 'demo.com', status = 'success', ymd = '20250624', hour = '09');

-- Register a partition with an explicit OSS location.
ALTER TABLE log_table_external
ADD PARTITION (year = '2016', month = '06', day = '01')
LOCATION 'oss://oss-cn-hangzhou-internal.aliyuncs.com/bucket名称/oss-odps-test/log_data_customized/2016/06/01/';

-- Repair with an explicit mapping from partition columns to path segments.
MSCK REPAIR TABLE ods_crawler_sites_warc_data_extract_result_external_hi
ADD PARTITIONS
WITH PROPERTIES ('odps.msck.partition.column.mapping' = 'host:host,status:status,ymd:ymd');
-- Stage table with a 10-day lifecycle (single string column `msg`).
CREATE TABLE xxx LIFECYCLE 10 AS
SELECT '' AS msg;

-- Load parquet files from OSS into the stage table.
-- NOTE(review): MaxCompute LOAD requires an explicit STORED AS/STORED BY
-- clause; added STORED AS PARQUET to match the ParquetHiveSerDe above.
LOAD OVERWRITE TABLE xxx
FROM LOCATION 'oss://xxx/'
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS PARQUET;
-- Prevent MaxCompute from creating a ".odps" folder when committing OSS output.
SET odps.sql.unstructured.oss.commit.mode = true;

-- Export (UNLOAD) query results from the table to OSS as plain text files.
UNLOAD FROM (
    SELECT * FROM xxx
)
INTO LOCATION 'oss://xxx/'
STORED AS TEXTFILE;
-- TBLPROPERTIES clause fragment (append to a CREATE TABLE statement):
-- caps parquet row-group and page sizes by row count.
TBLPROPERTIES (
    'mcfed.parquet.block.row.count.limit' = '100',
    'mcfed.parquet.page.size.row.check.max' = '100',
    'mcfed.parquet.page.size.row.check.min' = '100'
)
-- Session tuning: enable session isolation and raise mapper memory
-- (values presumably in MB — confirm against MaxCompute flag docs).
SET odps.isolation.session.enable = true;
SET odps.stage.mapper.mem = 8192;
SET odps.stage.mapper.jvm.mem = 6144;
-- Query OSS parquet files directly (table-less LOCATION source) and load the
-- rows into an internal table.
INSERT OVERWRITE TABLE tmp_data_0319_100
SELECT * FROM
LOCATION 'oss://oss-cn-shanghai-internal.aliyuncs.com/xxx/'
(
    'file_format' = 'parquet'
);

-- Inspect the loaded rows as JSON.
SELECT
    to_json(data)
FROM tmp_data_0319_100;
set odps.sql.split.dop = {"*":10};

本文作者:ender
本文链接:
版权声明:本博客所有文章除特别声明外,均采用 BY-NC-SA 许可协议。转载请注明出处!