pip install pyarrow fire tqdm
""" | |
crawl images: | |
pip install img2dataset==1.11.0 | |
img2dataset --url_list cc3m.tsv\ | |
--output_folder cc3m-img --input_format "tsv"\ | |
--url_col "url" --caption_col "caption"\ | |
--output_format files --resize_mode=no\ | |
--processes_count 10 --thread_count 64 --number_sample_per_shard 2000\ | |
--enable_wandb True --save_metadata False | |
img2dataset --url_list cc3m-val.tsv --input_format "tsv" \ | |
--url_col "url" --caption_col "caption" \ | |
--output_format files --output_folder cc3m-val-img \ | |
--image_size 512 --resize_only_if_bigger yes --resize_mode center_crop \ | |
--processes_count `nproc` --thread_count 16 --number_sample_per_shard 1000 \ | |
--enable_wandb True --wandb_project img2dataset --retries 5 | |
dataset=redcaps | |
dataset=cc12m | |
shard=4000 | |
processes=32 | |
threads=64 | |
dataset=cc3m | |
shard=2000 | |
processes=16 | |
threads=64 | |
dataset=sbu1m | |
shard=2000 | |
processes=16 | |
threads=32 | |
dataset=wit-v1-en-filtered | |
shard=1000 | |
processes=8 | |
threads=8 | |
dataset=wit-v1-en-noisy | |
shard=1000 | |
processes=8 | |
threads=8 | |
# set both processes and threads to 8 to avoid wikipedia has rate limiting | |
img2dataset --url_list ${dataset}.tsv.gz\ | |
--output_folder ${dataset}-img --input_format "tsv.gz"\ | |
--resize_only_if_bigger True --image_size 512\ | |
--url_col "url" --caption_col "caption"\ | |
--output_format files --resize_mode=keep_ratio\ | |
--processes_count ${processes} --thread_count ${threads} --number_sample_per_shard ${shard}\ | |
--enable_wandb True --save_metadata False --timeout 30 | |
preprocess wit-v1: | |
for i in `seq 1 9`; do | |
echo $i | |
gzip -dc wit_v1.train.all-0000$i-of-00010.tsv.gz | awk '{ if ($1 == "en") { print } }' > wit-v1-train-en-$i.tsv | |
done | |
usage: | |
python convert_pyarrow.py --dataset_dir cc3m-img | |
python convert_pyarrow.py --dataset_dir wit-v1-en-filtered-img | |
python convert_pyarrow.py --dataset_dir cc12m-img | |
python convert_pyarrow.py --dataset_dir sbu1m | |
""" | |
import fire | |
from pathlib import Path | |
from tqdm import tqdm | |
import pyarrow as pa | |
def iter_file(path): | |
for folder in path.glob('*'): | |
for file in folder.glob("*.jpg"): | |
yield file | |
def read_file(file, mod='rb'): | |
with open(file, mod) as f: | |
d = f.read() | |
return d | |
def batch_gen(ds_path, batch_size=4): | |
ids, texts, imgs = [], [], [] | |
ds_name = ds_path.stem | |
for img_file in iter_file(ds_path): | |
img_data = read_file(img_file) | |
pair_name = img_file.stem | |
pair_id = '-'.join([ds_name, pair_name]) | |
txt_file = img_file.with_suffix('.txt') | |
if not txt_file.exists(): | |
continue | |
text = read_file(txt_file, 'r') | |
ids.append(pair_id) | |
texts.append(text) | |
imgs.append(img_data) | |
if len(ids) == batch_size: | |
yield ids, texts, imgs | |
ids, texts, imgs = [], [], [] | |
if ids: | |
yield ids, texts, imgs | |
def process(dataset_dir, save_file=None, batch_size=1000): | |
dataset_path = Path(dataset_dir) | |
if save_file is None: | |
save_path = dataset_path.with_suffix('.arrow') | |
else: | |
save_path = Path(save_file) | |
save_path.parent.mkdir(parents=True, exist_ok=True) | |
count = 0 | |
p_bar = tqdm(batch_gen(dataset_path, batch_size)) | |
schema = pa.schema([('id', pa.string()), ('text', pa.string()), ('image', pa.binary())]) | |
with pa.OSFile(str(save_path), 'wb') as sink: | |
with pa.ipc.new_file(sink, schema) as writer: | |
for batch in p_bar: | |
ids, texts, imgs = batch | |
# print(ids, texts, len(imgs)) | |
p_id = pa.array(ids, type=pa.string()) | |
p_text = pa.array(texts, type=pa.string()) | |
p_img = pa.array(imgs, type=pa.binary()) | |
batch = pa.record_batch([p_id, p_text, p_img], schema) | |
writer.write(batch) | |
count += len(ids) | |
if count % batch_size == 0: | |
p_bar.set_description(f'processed {count}') | |
print(f'processed {count} in total') | |
def main(): | |
fire.Fire(process) | |
if __name__ == "__main__": | |
main() |
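For a quick sanity check after conversion, the written file can be memory-mapped and its row count inspected; a minimal sketch (the .arrow file name is an example; see read_arrow.py below for the full version):

import pyarrow as pa

# memory-map the converted file and verify row count and schema (file name assumed)
with pa.memory_map('cc3m-img.arrow', 'rb') as source:
    table = pa.ipc.open_file(source).read_all()
print(table.num_rows, table.schema)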
import json
import csv
from pathlib import Path

import fire
from tqdm.auto import tqdm


def process(redcaps_dir, out_file):
    # flatten every redcaps annotation json into a single (caption, url) tsv
    redcaps_path = Path(redcaps_dir)
    num_files = 0
    num_pairs = 0
    with open(out_file, 'w', newline='') as of:
        tsv_writer = csv.writer(of, delimiter='\t')
        tsv_writer.writerow(["caption", "url"])
        for anno_file in tqdm(redcaps_path.glob('*.json')):
            d = json.load(open(anno_file))
            num_files += 1
            for item in tqdm(d['annotations'], leave=False):
                url = item['url']
                caption = item['caption']
                num_pairs += 1
                tsv_writer.writerow([caption, url])
    print(f'processed {num_files} files, {num_pairs} pairs in total')


def main():
    fire.Fire(process)


if __name__ == "__main__":
    main()
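Assuming this converter is saved as redcaps_to_tsv.py (file name hypothetical), the resulting TSV plugs straight into the img2dataset commands above:

python redcaps_to_tsv.py --redcaps_dir annotations --out_file redcaps.tsv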
# parse wit en
# python parse.py -d wit-v1-train-en.tsv.gz -o wit-v1-en-noisy.tsv
import argparse
import csv
import gzip

from tqdm import tqdm

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--data_file", type=str, required=True)
    parser.add_argument("-o", "--output_file", type=str, required=True)
    args = parser.parse_args()
    data_file = args.data_file
    if data_file.endswith('.gz'):
        df = gzip.open(data_file, mode='rt', newline='')
    else:
        df = open(args.data_file)
    count = 0
    skipped = 0
    with open(args.output_file, 'w', newline='') as of:
        tsv_writer = csv.writer(of, delimiter='\t')
        tsv_writer.writerow(["caption", "url"])
        tsv_reader = csv.reader(df, delimiter="\t")
        p_bar = tqdm(tsv_reader)
        for row in p_bar:
            n_col = len(row)
            if n_col < 7:
                skipped += 1
                continue
            caption = row[6]
            url = row[2]
            if url:
                if caption:  # wit-v1-en-filtered
                    tsv_writer.writerow([caption, url])
                    count += 1
                else:
                    # fall back to the next caption columns to include noisier
                    # data, i.e. wit-v1-en-noisy
                    caption = row[7] if n_col > 7 else ''
                    caption += row[8] if n_col > 8 else ''
                    if caption:
                        tsv_writer.writerow([caption, url])
                        count += 1
                    else:
                        skipped += 1
            else:
                skipped += 1
            if count % 1000 == 0:
                p_bar.set_description(f"Processed {count}, skip {skipped}")
    df.close()
    print(f'all done, got {count}, skip {skipped}, total={count+skipped}')
# coco datasets https://cs.stanford.edu/people/karpathy/deepimagesent/
import json
from collections import defaultdict
from pathlib import Path

import fire
import pyarrow as pa
from tqdm import tqdm


def read_image(image_name, image_dir=Path.cwd(), mod='rb'):
    # COCO filenames look like COCO_train2014_000000123456.jpg, so the middle
    # token names the folder (train2014/val2014) the image lives in
    split = image_name.split('_')[1]
    if not isinstance(split, str):
        raise ValueError(f'{image_name}, {split}, {type(split)}')
    image_file = Path(image_dir, split, image_name)
    if not image_file.exists():
        raise ValueError(f'{image_file} does not exist!')
    with open(image_file, mod) as f:
        d = f.read()
    return d


def get_data(dataset_dir):
    dataset_path = Path(dataset_dir)
    coco_dataset = json.load(open(dataset_path / 'dataset_coco.json'))
    image_items = coco_dataset["images"]
    img2captions = defaultdict(list)
    img2split = dict()
    for item in tqdm(image_items):
        filename = item["filename"]
        img2split[filename] = item["split"]
        for i in item["sentences"]:
            img2captions[filename].append(i["raw"])
    # Karpathy splits: train and restval together form the training set
    train_split, val_split, test_split = {}, {}, {}
    for k, v in img2split.items():
        if v == 'train' or v == 'restval':
            train_split[k] = img2captions[k]
        elif v == 'val':
            val_split[k] = img2captions[k]
        else:
            assert v == 'test'
            test_split[k] = img2captions[k]
    return train_split, val_split, test_split


def batch_gen(data_split, image_dir='coco', batch_size=4):
    ids, texts, imgs = [], [], []
    for img_file, captions in data_split.items():
        img_data = read_image(img_file, image_dir)
        ids.append(img_file)
        texts.append(captions)
        imgs.append(img_data)
        if len(ids) == batch_size:
            yield ids, texts, imgs
            ids, texts, imgs = [], [], []
    if ids:
        yield ids, texts, imgs


def process_split(data_split, image_dir, save_file=None, batch_size=1000):
    save_path = Path(save_file)
    save_path.parent.mkdir(parents=True, exist_ok=True)
    count = 0
    p_bar = tqdm(batch_gen(data_split, image_dir, batch_size))
    # 'text' is list<string> because every image carries multiple captions
    schema = pa.schema(
        [('id', pa.string()), ('text', pa.list_(pa.string())), ('image', pa.binary())])
    with pa.OSFile(str(save_path), 'wb') as sink:
        with pa.ipc.new_file(sink, schema) as writer:
            for batch in p_bar:
                ids, texts, imgs = batch
                p_id = pa.array(ids, type=pa.string())
                p_text = pa.array(texts, type=pa.list_(pa.string()))
                p_img = pa.array(imgs, type=pa.binary())
                batch = pa.record_batch([p_id, p_text, p_img], schema)
                writer.write(batch)
                count += len(ids)
                if count % batch_size == 0:
                    p_bar.set_description(f'processed {count}')
    print(f'processed {count} in total')


def process(dataset_dir):
    print('loading dataset...')
    train_split, val_split, test_split = get_data(dataset_dir)
    print('processing train split...')
    process_split(train_split, dataset_dir,
                  save_file='coco-karpathy-train.arrow', batch_size=1000)
    print('processing val split...')
    process_split(val_split, dataset_dir,
                  save_file='coco-karpathy-val.arrow', batch_size=1000)
    print('processing test split...')
    process_split(test_split, dataset_dir,
                  save_file='coco-karpathy-test.arrow', batch_size=1000)
    print('all done...')


def main():
    fire.Fire(process)


if __name__ == "__main__":
    main()
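To spot-check the list<string> caption column, a minimal read-back sketch (using the val split written above):

import pyarrow as pa

# read the first record; 'text' holds the full caption list for that image
with pa.memory_map('coco-karpathy-val.arrow', 'rb') as source:
    table = pa.ipc.open_file(source).read_all()
row = table.slice(0, 1).to_pydict()
print(row['id'][0], row['text'][0])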
# download datasets and images from https://visualgenome.org/api/v0/api_home.html
import json
from collections import defaultdict
from pathlib import Path

import fire
import pyarrow as pa
from tqdm import tqdm


def read_image(image_name, image_dir=Path.cwd(), mod='rb'):
    # VG images are split across two download folders, so probe both
    img_f1 = Path(image_dir, 'VG_100K', f'{image_name}.jpg')
    img_f2 = Path(image_dir, 'VG_100K_2', f'{image_name}.jpg')
    if img_f1.exists():
        image_file = img_f1
    elif img_f2.exists():
        image_file = img_f2
    else:
        raise ValueError(f'neither VG_100K nor VG_100K_2 has {image_name}.jpg!')
    with open(image_file, mod) as f:
        d = f.read()
    return d


def get_data(dataset_dir):
    dataset_path = Path(dataset_dir)
    vg_dataset = json.load(open(dataset_path / 'region_descriptions.json'))
    # collect all region phrases per image id
    img2captions = defaultdict(list)
    for cap in tqdm(vg_dataset):
        cap = cap["regions"]
        for c in cap:
            img2captions[c["image_id"]].append(c['phrase'])
    return img2captions


def batch_gen(data_split, image_dir='vg', batch_size=4):
    ids, texts, imgs = [], [], []
    for img_file, captions in data_split.items():
        img_data = read_image(img_file, image_dir)
        ids.append(f'{image_dir}-{img_file}')
        texts.append(captions)
        imgs.append(img_data)
        if len(ids) == batch_size:
            yield ids, texts, imgs
            ids, texts, imgs = [], [], []
    if ids:
        yield ids, texts, imgs


def process_split(data_split, image_dir, save_file=None, batch_size=1000):
    save_path = Path(save_file)
    save_path.parent.mkdir(parents=True, exist_ok=True)
    count = 0
    p_bar = tqdm(batch_gen(data_split, image_dir, batch_size))
    schema = pa.schema(
        [('id', pa.string()), ('text', pa.list_(pa.string())), ('image', pa.binary())])
    with pa.OSFile(str(save_path), 'wb') as sink:
        with pa.ipc.new_file(sink, schema) as writer:
            for batch in p_bar:
                ids, texts, imgs = batch
                p_id = pa.array(ids, type=pa.string())
                p_text = pa.array(texts, type=pa.list_(pa.string()))
                p_img = pa.array(imgs, type=pa.binary())
                batch = pa.record_batch([p_id, p_text, p_img], schema)
                writer.write(batch)
                count += len(ids)
                if count % batch_size == 0:
                    p_bar.set_description(f'processed {count}')
    print(f'processed {count} in total')


def process(dataset_dir):
    print('loading dataset...')
    train_split = get_data(dataset_dir)
    print('processing vg train...')
    process_split(train_split, dataset_dir, save_file='vg-train.arrow', batch_size=1000)
    print('all done...')


def main():
    fire.Fire(process)


if __name__ == "__main__":
    main()
# python read_arrow.py --arrow_file cc3m-img.arrow --iterate
import io
import time

import fire
import pyarrow as pa
from PIL import Image
from tqdm import tqdm


def main(arrow_file, iterate=False):
    start = time.perf_counter()
    with pa.memory_map(arrow_file, 'rb') as source:
        pa_reader = pa.ipc.open_file(source)
        loaded_array = pa_reader.read_all()
    init = time.perf_counter()
    print(f"loaded {len(loaded_array)} in {init-start:.3f} s, "
          f"used mem: {pa.total_allocated_bytes() >> 20} MB")
    if iterate:
        start = time.perf_counter()
        count = 0
        for batch in tqdm(loaded_array.to_batches()):
            d = batch.to_pydict()
            for c1, c2, c3 in zip(d['id'], d['text'], d['image']):
                count += 1
        print(f'{count} iters in {time.perf_counter()-start:.3f} s')


# notebook example: decode and display a few images
# from IPython.display import display
# for i, t, img in zip(loaded_array[0], loaded_array[1], loaded_array[2]):
#     print(i, t)
#     image = Image.open(io.BytesIO(img.as_py()))
#     display(image)


if __name__ == "__main__":
    fire.Fire(main)
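The Arrow IPC file format also supports random access, so individual batches can be fetched without materializing the whole table; a short sketch (file name assumed):

import pyarrow as pa

with pa.memory_map('cc3m-img.arrow', 'rb') as source:
    reader = pa.ipc.open_file(source)
    print(reader.num_record_batches)  # number of batches the converter wrote
    batch = reader.get_batch(0)       # fetch just the first batch
    print(batch.num_rows)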
import json
from pathlib import Path

import fire
import pyarrow as pa
from tqdm.auto import tqdm

schema = pa.schema([("id", pa.string()), ("text", pa.string()), ("image", pa.binary())])


def read_file(file, mod="rb"):
    with open(file, mod) as f:
        d = f.read()
    return d


def batch_gen(annotation_path, images_path, batch_size=4):
    ids, texts, imgs = [], [], []
    for anno_file in tqdm(annotation_path.glob("*.json")):
        d = json.load(open(anno_file))
        for item in tqdm(d["annotations"], leave=False):
            image_id = item["image_id"]
            subreddit = item["subreddit"]
            img_file = images_path / subreddit / f"{image_id}.jpg"
            if not img_file.exists():
                # skip annotations whose image failed to download
                continue
            img_data = read_file(img_file)
            caption = item["caption"]
            pair_id = "-".join(["redcaps", subreddit, image_id])
            ids.append(pair_id)
            texts.append(caption)
            imgs.append(img_data)
            if len(ids) == batch_size:
                yield ids, texts, imgs
                ids, texts, imgs = [], [], []
    if ids:
        yield ids, texts, imgs


def process(redcaps_dir, save_file, images_dir=None, batch_size=1000):
    redcaps_path = Path(redcaps_dir)
    annotation_path = redcaps_path / "annotations"
    images_path = redcaps_path / "images" if images_dir is None else Path(images_dir)
    if save_file is None:
        save_path = redcaps_path.with_suffix(".arrow")
    else:
        save_path = Path(save_file)
    save_path.parent.mkdir(parents=True, exist_ok=True)
    count = 0
    p_bar = tqdm(batch_gen(annotation_path, images_path, batch_size))
    with pa.OSFile(str(save_path), "wb") as sink:
        with pa.ipc.new_file(sink, schema) as writer:
            for batch in p_bar:
                ids, texts, imgs = batch
                p_id = pa.array(ids, type=pa.string())
                p_text = pa.array(texts, type=pa.string())
                p_img = pa.array(imgs, type=pa.binary())
                batch = pa.record_batch([p_id, p_text, p_img], schema)
                writer.write(batch)
                count += len(ids)
                if count % batch_size == 0:
                    p_bar.set_description(f"processed {count}")
    print(f"processed {count} in total")


def main():
    fire.Fire(process)


if __name__ == "__main__":
    main()
# follow https://github.com/redcaps-dataset/redcaps-downloader
# Set --resize -1 to turn off resizing shorter edge (saves disk space).
# The annotation files are split into three alphabetical ranges so the
# downloads can run in parallel shells.
for ann_file in annotations/[a-c]*.json; do
    redcaps download-imgs -a $ann_file --save-to images --resize 512 -j 4
done
for ann_file in annotations/[d-n]*.json; do
    redcaps download-imgs -a $ann_file --save-to images --resize 512 -j 4
done
for ann_file in annotations/[o-z]*.json; do
    redcaps download-imgs -a $ann_file --save-to images --resize 512 -j 4
done
stats:

vg            108077     0.1M
coco          113287     0.11M
wit-filtered  1511216    1.51M
sbu1m         867217     0.87M
cc3m          2907484    2.91M
cc12m         10914482   10.91M
redcaps       11843894   11.84M

sbu1m loaded 867217 in 6.650 s

python convert_pyarrow.py --dataset_dir cc3m-img
processed 2907000: : 2908it [9:19:07, 11.54s/it]

python convert_pyarrow.py --dataset_dir img-datasets/wit-v1-en-filtered-img
processed 1511000: : 1512it [1:55:07, 4.57s/it]
processed 1511216 in total

python convert_pyarrow.py --dataset_dir cc12m-img
processed 10914000: : 10915it [14:38:39, 4.83s/it]
processed 10914484 in total

# 16 cores, model name: Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz
python read_arrow.py --arrow_file cc3m-img.arrow --iterate
loaded 2907484 in 3.985 s, used mem: 0 MB
100%|████████████████████| 2908/2908 [12:18<00:00, 3.94it/s]
2907484 iters in 738.930 s
# pack vg-train into a flat tar (alternative invocations; the extra
# --transform strips the second 'vg-' occurrence from member names)
tar --sort=name --transform='s,vg-,,2' --transform='s,/,-,g' --show-transformed-names -cvf vg.tar vg-train
tar --sort=name --transform='s,/,-,g' --show-transformed-names -cvf vg.tar vg-train
find vg-train -type f -print0 | tar --sort=name --transform='s,vg-,,2' --transform='s,/,-,g' --show-transformed-names -cvf vgt.tar --null -T -

# shard cc3m into tars of 200000 samples each with tarp, pairing every jpg
# with its caption txt under a flattened name
find cc3m -type f -name '*.jpg' -print0 | while read -d $'\0' image
do
    new=${image//\//-}
    echo "${new%.jpg}.txt file:${image%.jpg}.txt"
    echo "$new file:$image"
done |
sort |
tarp create -o - - |
tarp split -c 200000 -o 'cc3m-tars/cc3m-%06d.tar' -
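The resulting shards follow the WebDataset convention (matching .jpg/.txt basenames), so they can be streamed with the webdataset library; a minimal sketch, assuming 15 shards were written (adjust the brace range to what tarp actually produced):

import webdataset as wds

# stream (image, caption) pairs from the sharded tars; the {000000..000014}
# shard range is an assumption
dataset = (
    wds.WebDataset('cc3m-tars/cc3m-{000000..000014}.tar')
    .decode('pil')
    .to_tuple('jpg', 'txt')
)
for image, caption in dataset:
    print(image.size, caption)
    break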