Skip to content

Instantly share code, notes, and snippets.

@SqrtRyan
Created July 2, 2025 03:22
Show Gist options
  • Save SqrtRyan/c05658bad1b4d034d9c975b8fdc20b52 to your computer and use it in GitHub Desktop.
Save SqrtRyan/c05658bad1b4d034d9c975b8fdc20b52 to your computer and use it in GitHub Desktop.
from rp import *
# Root directory of the Envato dataset. The caption CSV, the "Videos_Millions"
# video folder, and the on-disk caches used in this file are all resolved
# relative to this path.
envato_dir = "/home/jupyter/CleanCode/Datasets/Envato"
@memoized
@file_cache_wrap(path_join(envato_dir, ".envato_with_hashfiles"))
def get_filtered_csv():
    """
    Load the captioned Envato CSV and keep only rows whose video is on disk.

    A "hash_paths" column is added to the table. Its values are derived from
    each row's ``video_url`` by passing the column through
    ``get_cache_file_paths`` and then ``get_file_names``; both intermediate
    results go through ``file_cache_call`` with their own ".lines" cache
    files. Rows are kept only when their "hash_paths" value appears in the
    directory listing of the "Videos_Millions" folder.

    NOTE(review): the ``memoized`` / ``file_cache_wrap`` decorators presumably
    cache the returned table in memory and on disk -- confirm against rp.

    Returns:
        The filtered table, including the added "hash_paths" column.
    """
    table = load_csv(
        path_join(envato_dir, "captioned_envato_3869336.csv"),
        show_progress=True,
        use_cache=True,
    )

    # Names of every entry currently present in the video folder.
    videos_dir = path_join(envato_dir, "Videos_Millions", show_progress=True)
    available = os.listdir(videos_dir)

    # Step 1: map every video URL to its cache file path.
    url_cache_paths = file_cache_call(
        path_join(envato_dir, ".csv_cache_filenames.lines"),
        get_cache_file_paths,
        table.video_url,
        show_progress=True,
    )

    # Step 2: turn those paths into the names stored in the new column.
    table["hash_paths"] = file_cache_call(
        path_join(envato_dir, ".hash_paths.lines"),
        get_file_names,
        url_cache_paths,
        show_progress=True,
    )

    on_disk = table.hash_paths.isin(available)
    return table[on_disk]
######
import decord, torch
def load_first_middle_last_frames(path: str) -> np.ndarray:
    """
    Decode the first, middle and last frames of the video at ``path``.

    The middle frame is the one at index ``len(video) // 2``.

    Args:
        path: Location of a video file to open with ``decord.VideoReader``.

    Returns:
        The three requested frames, in first / middle / last order, as
        returned by ``get_batch(...).asnumpy()``.
    """
    import decord

    reader = decord.VideoReader(path)
    num_frames = len(reader)

    # Indices of the three frames to pull out of the video.
    wanted = [0, num_frames // 2, num_frames - 1]
    return reader.get_batch(wanted).asnumpy()
# DEVICE = 0
def process_row_helper(row):
    """
    Compute a visibility-based score for one CSV row.

    Loads the first, middle and last frames of the row's video, passes them
    through ``remove_watermark``, runs ``run_cotracker`` on the result, and
    counts the entries that are flagged visible in the first and last frame
    but not in the middle one.

    Args:
        row: A row of the filtered CSV; ``row.hash_paths`` names the video
            file inside the "Videos_Millions" folder.

    Returns:
        dict: The row's fields plus a "score" key holding the count as an int.

    Note:
        Reads the module-level ``DEVICE`` to choose the device passed to
        ``run_cotracker``.
    """
    frames = load_first_middle_last_frames(
        path_join(envato_dir, "Videos_Millions", row.hash_paths)
    )

    # Imported lazily, only once the frames have been loaded successfully.
    from rp.git.remove_watermark.remove_watermark_envato import remove_watermark

    frames = remove_watermark(frames)
    _tracks, visible = run_cotracker(frames, grid_size=25, device=DEVICE)

    # Assumes `visible` unpacks into one boolean mask per input frame
    # (first, middle, last) -- TODO confirm against run_cotracker.
    first_vis, middle_vis, last_vis = visible
    reappearing = first_vis & last_vis & ~middle_vis
    score = int(reappearing.sum())

    return {**dict(row), "score": score}
def process_index(index):
    """
    Score the CSV row at position ``index`` and record it in ``results``.

    The scoring call is routed through ``file_cache_call`` using a per-row
    cache path located under the ".scored_rows_cache" directory. On success
    the scored row is stored in the module-level ``results`` dict, keyed by
    the row's "hash_paths" value.

    Any ``Exception`` raised while scoring or storing is printed in bold red
    and otherwise ignored; the function then returns normally.

    Args:
        index: Positional index into the module-level ``csv`` table.

    Returns:
        None.
    """
    entry = csv.iloc[index]
    scored_cache_dir = path_join(envato_dir, ".scored_rows_cache")
    entry_cache_path = get_cache_file_path(entry, cache_dir=scored_cache_dir)

    try:
        results[entry["hash_paths"]] = file_cache_call(
            entry_cache_path, process_row_helper, entry
        )
    except Exception as error:
        # Best effort: report the failure and carry on.
        fansi_print(error, "red bold")
# Driver: score every row of the filtered CSV.
# The statements below are order-dependent: process_index reads the
# module-level `csv` and `results`, and process_row_helper reads `DEVICE`.

# Table of rows whose videos were found on disk.
csv = get_filtered_csv()
# Device handed to run_cotracker by process_row_helper.
# NOTE(review): relies on the name `rp` being in scope (only `from rp import *`
# is visible in this file) -- confirm.
DEVICE=rp.select_torch_device(reserve=True)
# Row positions to process, passed through shuffled() (presumably a random order).
indices=shuffled(range(len(csv)))
# Filled in by process_index: row "hash_paths" value -> scored row.
results = {}
# Apply process_index to every entry of `indices`.
load_files(
process_index,
indices,
show_progress=True,
num_threads=0  # alternative left disabled by the author: get_num_gpus() * 8,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment