(pixiv-data-process/yada/13_pixiv_streamlined.ipynb)
Take a single path (local or S3) as input, build a list of all the files it contains, and upload the image-metadata pairing to S3:
(When the data volume is small enough, it can simply be used like this:)
# https://github.com/troph-team/build-it/blob/f996fe55a6fd2beda9e62a6624be0f0fe2a05848/buildit/sagemaker/parquet_splitter.py#L13
import os

from dataproc3.sagemaker import ParquetSplitter

# NOTE: MANIFEST_URI, IMAGE_S3_URI_COL, PARTS_UPLOAD_DIR, and PROC_BATCH_SIZE are
# config constants defined earlier in the notebook; the get_*/construct_* helpers
# are notebook utilities (a hedged sketch of them is given further below).

# get inventory files' s3 uris
inventory_s3_uris = get_inventory_s3_uris_from_manifest(MANIFEST_URI)

# get all the parquet files' s3 uris
df_inventory = get_s3_uris_from_inventory_parquets(
    inventory_s3_uris, keep_columns=["file_s3_uri", "size"]
)

# construct image and metadata pairs
df_pairs = construct_image_meta_pairs(df_inventory)
# upload split inventory files to s3: specify at least one of input_parquet_path or input_df
splitter = ParquetSplitter(
    # input_parquet_path=None,
    input_df=df_pairs,
    image_s3_uri_col=IMAGE_S3_URI_COL,
    s3_upload_dir=PARTS_UPLOAD_DIR,
    ignore_gifs=True,
    local_dir="./temp_parquet_chunks",
)
splitter.process_and_upload(rows_per_part=PROC_BATCH_SIZE)
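process_and_upload presumably splits the pairs DataFrame into parquet parts of PROC_BATCH_SIZE rows each and uploads them under PARTS_UPLOAD_DIR, so that downstream processing jobs can each pick up one part.

The get_*/construct_* helpers are defined elsewhere in the notebook. Below is a minimal sketch of what they might look like, assuming a standard S3 Inventory manifest.json + parquet layout and a same-stem ".json sidecar" convention for image metadata. The column names (bucket, key), file extensions, and pairing logic are illustrative assumptions, not the actual implementation, and reading parquet straight from s3:// paths with pandas assumes s3fs is installed.

# Hedged sketch only; signatures match the calls above, internals are assumed.
import json
import boto3
import pandas as pd

def get_inventory_s3_uris_from_manifest(manifest_uri: str) -> list:
    """Read an S3 Inventory manifest.json and return the S3 URIs of its parquet data files."""
    bucket, key = manifest_uri.removeprefix("s3://").split("/", 1)
    body = boto3.client("s3").get_object(Bucket=bucket, Key=key)["Body"].read()
    manifest = json.loads(body)
    dest_bucket = manifest["destinationBucket"].split(":::")[-1]  # "arn:aws:s3:::bucket" -> "bucket"
    return [f"s3://{dest_bucket}/{f['key']}" for f in manifest["files"]]

def get_s3_uris_from_inventory_parquets(parquet_uris, keep_columns=("file_s3_uri", "size")):
    """Concatenate the inventory parquet files into one DataFrame of object URIs and sizes."""
    frames = []
    for uri in parquet_uris:
        df = pd.read_parquet(uri)  # inventory parquets typically carry bucket / key / size columns
        df["file_s3_uri"] = "s3://" + df["bucket"] + "/" + df["key"]
        frames.append(df[list(keep_columns)])
    return pd.concat(frames, ignore_index=True)

def construct_image_meta_pairs(df_inventory):
    """Pair each image object with the metadata object that shares the same key stem."""
    df = df_inventory.assign(stem=df_inventory["file_s3_uri"].str.rsplit(".", n=1).str[0])
    is_image = df["file_s3_uri"].str.lower().str.endswith((".jpg", ".jpeg", ".png", ".gif", ".webp"))
    images, metas = df[is_image], df[df["file_s3_uri"].str.endswith(".json")]
    return images.merge(metas, on="stem", suffixes=("_image", "_meta"))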