rolux · June 3, 2025 13:42 · nikitatishin5 · Apr 18, 2020 · doppiaeffe · May 23, 2020
diff --git a/culture_shock.py b/culture_shock.py
 # git clone https://github.com/NVlabs/stylegan2
 import os
 import numpy as np
 from scipy.interpolate import interp1d
 from scipy.io import wavfile
 import matplotlib.pyplot as plt
 import PIL.Image
 import moviepy.editor

 import dnnlib
 import dnnlib.tflib as tflib
 import pretrained_networks

 # Per-frame loudness envelope of each track, keyed by track name.
 audio = {}
 # Video frame rate; each envelope holds one value per video frame.
 fps = 60

 # https://www.google.com/search?q=death+grips+black+google+download
 # Turn every MP3 in data/ into a per-frame envelope scaled to a peak of 1.
 # NOTE: seed, duration and frames keep the values from the last file
 # processed and are read again further down the script.
 for mp3_filename in [f for f in os.listdir('data') if f.endswith('.mp3')]:
    mp3_filename = f'data/{mp3_filename}'
    wav_filename = mp3_filename[:-4] + '.wav'
    # Convert to WAV only when no converted file exists yet.
    if not os.path.exists(wav_filename):
        audio_clip = moviepy.editor.AudioFileClip(mp3_filename)
        audio_clip.write_audiofile(wav_filename, fps=44100, nbytes=2, codec='pcm_s16le')
    # Key = file name minus its first 15 and last 5 characters (presumably the
    # 'Culture Shock (' prefix and ').wav' suffix -- confirm against data/).
    track_name = os.path.basename(wav_filename)[15:-5]
    rate, signal = wavfile.read(wav_filename)
    signal = np.mean(signal, axis=1) # to mono
    signal = np.abs(signal)
    # The sample count is later used as the random seed for get_ws.
    seed = signal.shape[0]
    duration = signal.shape[0] / rate
    frames = int(np.ceil(duration * fps))
    samples_per_frame = signal.shape[0] / frames
    audio[track_name] = np.zeros(frames, dtype=signal.dtype)
    # Average the rectified signal over the samples that fall into each frame.
    for frame in range(frames):
        start = int(round(frame * samples_per_frame))
        stop = int(round((frame + 1) * samples_per_frame))
        audio[track_name][frame] = np.mean(signal[start:stop], axis=0)
    # Scale so the loudest frame is 1.
    audio[track_name] /= max(audio[track_name])

 # Save one envelope plot per track as data/<track>.png.
 for track in sorted(audio.keys()):
    fig = plt.figure(figsize=(8, 3))
    plt.title(track)
    plt.plot(audio[track])
    plt.savefig(f'data/{track}.png')
    # Close the figure once it is on disk; pyplot otherwise keeps every
    # figure alive for the rest of the run.
    plt.close(fig)

 # Pretrained StyleGAN2 networks (FFHQ, config-f, per the pickle name).
 network_pkl = 'gdrive:networks/stylegan2-ffhq-config-f.pkl'
 _G, _D, Gs = pretrained_networks.load_networks(network_pkl)

 # Run options for the full generator (not referenced again in the code shown here).
 Gs_kwargs = dnnlib.EasyDict()
 Gs_kwargs.output_transform = dict(func=tflib.convert_images_to_uint8, nchw_to_nhwc=True)
 Gs_kwargs.randomize_noise = False
 # Run options passed to Gs.components.synthesis.run when rendering frames.
 Gs_syn_kwargs = dnnlib.EasyDict()
 Gs_syn_kwargs.output_transform = dict(func=tflib.convert_images_to_uint8, nchw_to_nhwc=True)
 Gs_syn_kwargs.randomize_noise = False
 Gs_syn_kwargs.minibatch_size = 4
 # Synthesis-network variables whose name starts with 'noise'
 # (collected here but not used in the code shown).
 noise_vars = [var for name, var in Gs.components.synthesis.vars.items() if name.startswith('noise')]
 # The generator's 'dlatent_avg' variable; frames are pulled towards it and
 # normalize_vector matches its statistics.
 w_avg = Gs.get_var('dlatent_avg')

 def get_ws(n, frames, seed):
    """Return a smooth path of `frames` latent vectors through `n` random keys.

    `n` key vectors of 512 components are drawn from
    `np.random.RandomState(seed)` and spaced evenly along the path; the
    frames in between are filled in by quadratic interpolation. The result
    is cached in `data/ws_{n}_{frames}_{seed}.npy` and loaded from there on
    later calls with the same arguments.

    Args:
        n: number of random key vectors.
        frames: number of vectors in the returned path.
        seed: seed for the random key vectors.

    Returns:
        Array of shape (frames, 512).
    """
    filename = f'data/ws_{n}_{frames}_{seed}.npy'
    if os.path.exists(filename):
        return np.load(filename)
    src_ws = np.random.RandomState(seed).randn(n, 512)
    # Lay the key vectors out three times in a row and keep only the middle
    # repetition, so both ends of the path are interpolated against the
    # keys from the opposite end.
    x = np.linspace(0, 3*frames, 3*len(src_ws), endpoint=False)
    y = np.tile(src_ws, (3, 1))
    # Sample only the middle third (positions frames .. 2*frames - 1) and
    # interpolate all components in one call along axis 0, instead of
    # rebuilding the grids and the interpolator once per component.
    x_ = np.arange(frames, 2*frames, dtype=float)
    ws = interp1d(x, y, kind='quadratic', axis=0, fill_value='extrapolate')(x_)
    np.save(filename, ws)
    return ws

 def mix_styles(wa, wb, ivs):
    """Blend selected rows of `wb` into a copy of `wa`.

    `ivs` yields (index, weight) pairs. For each pair the indexed rows of the
    result become `wa * (1 - weight) + wb * weight`. Rows that no pair names
    are taken from `wa` unchanged, and `wa` itself is not modified.
    """
    mixed = np.array(wa, copy=True)
    for rows, weight in ivs:
        keep = 1 - weight
        mixed[rows] = keep * wa[rows] + weight * wb[rows]
    return mixed

 def normalize_vector(v):
    """Scale `v` by std(w_avg) / std(v), then shift it by mean(w_avg) - mean(v).

    The mean that is subtracted is that of the unscaled `v`.
    """
    target_std = np.std(w_avg)
    target_mean = np.mean(w_avg)
    scaled = v * target_std
    scaled = scaled / np.std(v)
    return scaled + target_mean - np.mean(v)

 def render_frame(t):
    """Render the video frame for time `t` (in seconds).

    The per-frame track envelopes in `audio` drive the latent vector: the
    instrumental level advances the position on the base path, 'FX' sets how
    far the base vector is pulled away from `w_avg`, 'Drums', 'E Drums' and
    'Synth' blend in the mix path on rows 0-3, 4-7 and 8-17, and 'Vocal'
    scales the `mouth_open` offset.

    Advances the module-level `base_index` on every call, so the result
    depends on the calls made before it.

    Returns:
        The synthesized image, resized to size x size, as a NumPy array.
    """
    global base_index
    # Builtin int instead of np.int, which was removed in NumPy 1.24.
    frame = np.clip(int(np.round(t * fps)), 0, frames - 1)
    # Louder instrumental passages move faster along the base path.
    base_index += base_speed * audio['Instrumental'][frame]**2
    base_w = base_ws[int(round(base_index)) % len(base_ws)]
    # Repeat the vector over 18 rows (presumably one per synthesis layer).
    base_w = np.tile(base_w, (18, 1))
    # Pull towards w_avg; the distance kept is 0.5 plus half the FX level.
    psi = 0.5 + audio['FX'][frame] / 2
    base_w = w_avg + (base_w - w_avg) * psi
    mix_w = np.tile(mix_ws[frame], (18, 1))
    mix_w = w_avg + (mix_w - w_avg) * 0.75
    # Each row range is blended towards mix_w by the level of one track.
    ranges = [range(0, 4), range(4, 8), range(8, 18)]
    values = [audio[track][frame] for track in ['Drums', 'E Drums', 'Synth']]
    w = mix_styles(base_w, mix_w, zip(ranges, values))
    w += mouth_open * audio['Vocal'][frame] * 1.5
    image = Gs.components.synthesis.run(np.stack([w]), **Gs_syn_kwargs)[0]
    image = PIL.Image.fromarray(image).resize((size, size), PIL.Image.LANCZOS)
    return np.array(image)
    
 # Rendered frames are resized to size x size pixels.
 size = 1080
 # Track length in whole seconds (rounded up); one random key vector per second.
 seconds = int(np.ceil(duration))
 # The base path holds `resolution` times as many vectors as there are video frames.
 resolution = 10
 base_frames = resolution * frames
 base_ws = get_ws(seconds, base_frames, seed)
 # Scaled so the per-frame increments applied in render_frame add up to
 # base_frames over all frames of the instrumental envelope.
 base_speed = base_frames / sum(audio['Instrumental']**2)
 base_index = 0
 # Second path with one vector per video frame, drawn from a different seed.
 mix_ws = get_ws(seconds, frames, seed + 1)
 # https://rolux.org/media/stylegan2/vectors/mouth_ratio.npy
 # Negated and rescaled to the statistics of w_avg; presumably a latent
 # direction for how open the mouth is -- confirm against the source.
 mouth_open = normalize_vector(-np.load('data/mouth_ratio.npy'))

 # Render every frame through render_frame and write the result, together
 # with the instrumental and vocal WAVs layered as one audio track, to MP4.
 mp4_filename = 'data/Culture Shock.mp4'
 video_clip = moviepy.editor.VideoClip(render_frame, duration=duration)
 audio_clip_i = moviepy.editor.AudioFileClip('data/Culture Shock (Instrumental).wav')
 audio_clip_v = moviepy.editor.AudioFileClip('data/Culture Shock (Vocal).wav')
 audio_clip = moviepy.editor.CompositeAudioClip([audio_clip_i, audio_clip_v])
 video_clip = video_clip.set_audio(audio_clip)
 video_clip.write_videofile(mp4_filename, fps=fps, codec='libx264', audio_codec='aac', bitrate='8M')
	# git clone https://github.com/NVlabs/stylegan2
	import os
	import numpy as np
	from scipy.interpolate import interp1d
	from scipy.io import wavfile
	import matplotlib.pyplot as plt
	import PIL.Image
	import moviepy.editor

	import dnnlib
	import dnnlib.tflib as tflib
	import pretrained_networks

	# Per-frame loudness envelope of each track, keyed by track name.
	audio = {}
	# Video frame rate; each envelope holds one value per video frame.
	fps = 60

	# https://www.google.com/search?q=death+grips+black+google+download
	# Turn every MP3 in data/ into a per-frame envelope scaled to a peak of 1.
	# NOTE: seed, duration and frames keep the values from the last file
	# processed and are read again further down the script.
	for mp3_filename in [f for f in os.listdir('data') if f.endswith('.mp3')]:
	    mp3_filename = f'data/{mp3_filename}'
	    wav_filename = mp3_filename[:-4] + '.wav'
	    # Convert to WAV only when no converted file exists yet.
	    if not os.path.exists(wav_filename):
	        audio_clip = moviepy.editor.AudioFileClip(mp3_filename)
	        audio_clip.write_audiofile(wav_filename, fps=44100, nbytes=2, codec='pcm_s16le')
	    # Key = file name minus its first 15 and last 5 characters (presumably
	    # the 'Culture Shock (' prefix and ').wav' suffix -- confirm).
	    track_name = os.path.basename(wav_filename)[15:-5]
	    rate, signal = wavfile.read(wav_filename)
	    signal = np.mean(signal, axis=1) # to mono
	    signal = np.abs(signal)
	    # The sample count is later used as the random seed for get_ws.
	    seed = signal.shape[0]
	    duration = signal.shape[0] / rate
	    frames = int(np.ceil(duration * fps))
	    samples_per_frame = signal.shape[0] / frames
	    audio[track_name] = np.zeros(frames, dtype=signal.dtype)
	    # Average the rectified signal over the samples that fall into each frame.
	    for frame in range(frames):
	        start = int(round(frame * samples_per_frame))
	        stop = int(round((frame + 1) * samples_per_frame))
	        audio[track_name][frame] = np.mean(signal[start:stop], axis=0)
	    # Scale so the loudest frame is 1.
	    audio[track_name] /= max(audio[track_name])

	# Save one envelope plot per track as data/<track>.png.
	for track in sorted(audio.keys()):
	    fig = plt.figure(figsize=(8, 3))
	    plt.title(track)
	    plt.plot(audio[track])
	    plt.savefig(f'data/{track}.png')
	    # Close the figure once it is on disk; pyplot otherwise keeps every
	    # figure alive for the rest of the run.
	    plt.close(fig)

	# Pretrained StyleGAN2 networks (FFHQ, config-f, per the pickle name).
	network_pkl = 'gdrive:networks/stylegan2-ffhq-config-f.pkl'
	_G, _D, Gs = pretrained_networks.load_networks(network_pkl)

	# Run options for the full generator (not referenced again in the code shown here).
	Gs_kwargs = dnnlib.EasyDict()
	Gs_kwargs.output_transform = dict(func=tflib.convert_images_to_uint8, nchw_to_nhwc=True)
	Gs_kwargs.randomize_noise = False
	# Run options passed to Gs.components.synthesis.run when rendering frames.
	Gs_syn_kwargs = dnnlib.EasyDict()
	Gs_syn_kwargs.output_transform = dict(func=tflib.convert_images_to_uint8, nchw_to_nhwc=True)
	Gs_syn_kwargs.randomize_noise = False
	Gs_syn_kwargs.minibatch_size = 4
	# Synthesis-network variables whose name starts with 'noise'
	# (collected here but not used in the code shown).
	noise_vars = [var for name, var in Gs.components.synthesis.vars.items() if name.startswith('noise')]
	# The generator's 'dlatent_avg' variable; frames are pulled towards it and
	# normalize_vector matches its statistics.
	w_avg = Gs.get_var('dlatent_avg')

	def get_ws(n, frames, seed):
	    """Return a smooth path of `frames` latent vectors through `n` random keys.

	    `n` key vectors of 512 components are drawn from
	    `np.random.RandomState(seed)` and spaced evenly along the path; the
	    frames in between are filled in by quadratic interpolation. The result
	    is cached in `data/ws_{n}_{frames}_{seed}.npy` and loaded from there on
	    later calls with the same arguments.

	    Args:
	        n: number of random key vectors.
	        frames: number of vectors in the returned path.
	        seed: seed for the random key vectors.

	    Returns:
	        Array of shape (frames, 512).
	    """
	    filename = f'data/ws_{n}_{frames}_{seed}.npy'
	    if os.path.exists(filename):
	        return np.load(filename)
	    src_ws = np.random.RandomState(seed).randn(n, 512)
	    # Lay the key vectors out three times in a row and keep only the middle
	    # repetition, so both ends of the path are interpolated against the
	    # keys from the opposite end.
	    x = np.linspace(0, 3*frames, 3*len(src_ws), endpoint=False)
	    y = np.tile(src_ws, (3, 1))
	    # Sample only the middle third (positions frames .. 2*frames - 1) and
	    # interpolate all components in one call along axis 0, instead of
	    # rebuilding the grids and the interpolator once per component.
	    x_ = np.arange(frames, 2*frames, dtype=float)
	    ws = interp1d(x, y, kind='quadratic', axis=0, fill_value='extrapolate')(x_)
	    np.save(filename, ws)
	    return ws

	def mix_styles(wa, wb, ivs):
	    """Blend selected rows of `wb` into a copy of `wa`.

	    `ivs` yields (index, weight) pairs. For each pair the indexed rows of
	    the result become `wa * (1 - weight) + wb * weight`. Rows that no pair
	    names are taken from `wa` unchanged, and `wa` itself is not modified.
	    """
	    w = np.copy(wa)
	    for i, v in ivs:
	        w[i] = wa[i] * (1 - v) + wb[i] * v
	    return w

	def normalize_vector(v):
	    """Scale `v` by std(w_avg) / std(v), then shift it by mean(w_avg) - mean(v).

	    The mean that is subtracted is that of the unscaled `v`.
	    """
	    return v * np.std(w_avg) / np.std(v) + np.mean(w_avg) - np.mean(v)

	def render_frame(t):
	    """Render the video frame for time `t` (in seconds).

	    The per-frame track envelopes in `audio` drive the latent vector: the
	    instrumental level advances the position on the base path, 'FX' sets
	    how far the base vector is pulled away from `w_avg`, 'Drums',
	    'E Drums' and 'Synth' blend in the mix path on rows 0-3, 4-7 and 8-17,
	    and 'Vocal' scales the `mouth_open` offset.

	    Advances the module-level `base_index` on every call, so the result
	    depends on the calls made before it.

	    Returns:
	        The synthesized image, resized to size x size, as a NumPy array.
	    """
	    global base_index
	    # Builtin int instead of np.int, which was removed in NumPy 1.24.
	    frame = np.clip(int(np.round(t * fps)), 0, frames - 1)
	    # Louder instrumental passages move faster along the base path.
	    base_index += base_speed * audio['Instrumental'][frame]**2
	    base_w = base_ws[int(round(base_index)) % len(base_ws)]
	    # Repeat the vector over 18 rows (presumably one per synthesis layer).
	    base_w = np.tile(base_w, (18, 1))
	    # Pull towards w_avg; the distance kept is 0.5 plus half the FX level.
	    psi = 0.5 + audio['FX'][frame] / 2
	    base_w = w_avg + (base_w - w_avg) * psi
	    mix_w = np.tile(mix_ws[frame], (18, 1))
	    mix_w = w_avg + (mix_w - w_avg) * 0.75
	    # Each row range is blended towards mix_w by the level of one track.
	    ranges = [range(0, 4), range(4, 8), range(8, 18)]
	    values = [audio[track][frame] for track in ['Drums', 'E Drums', 'Synth']]
	    w = mix_styles(base_w, mix_w, zip(ranges, values))
	    w += mouth_open * audio['Vocal'][frame] * 1.5
	    image = Gs.components.synthesis.run(np.stack([w]), **Gs_syn_kwargs)[0]
	    image = PIL.Image.fromarray(image).resize((size, size), PIL.Image.LANCZOS)
	    return np.array(image)

	# Rendered frames are resized to size x size pixels.
	size = 1080
	# Track length in whole seconds (rounded up); one random key vector per second.
	seconds = int(np.ceil(duration))
	# The base path holds `resolution` times as many vectors as there are video frames.
	resolution = 10
	base_frames = resolution * frames
	base_ws = get_ws(seconds, base_frames, seed)
	# Scaled so the per-frame increments applied in render_frame add up to
	# base_frames over all frames of the instrumental envelope.
	base_speed = base_frames / sum(audio['Instrumental']**2)
	base_index = 0
	# Second path with one vector per video frame, drawn from a different seed.
	mix_ws = get_ws(seconds, frames, seed + 1)
	# https://rolux.org/media/stylegan2/vectors/mouth_ratio.npy
	# Negated and rescaled to the statistics of w_avg; presumably a latent
	# direction for how open the mouth is -- confirm against the source.
	mouth_open = normalize_vector(-np.load('data/mouth_ratio.npy'))

	# Render every frame through render_frame and write the result, together
	# with the instrumental and vocal WAVs layered as one audio track, to MP4.
	mp4_filename = 'data/Culture Shock.mp4'
	video_clip = moviepy.editor.VideoClip(render_frame, duration=duration)
	audio_clip_i = moviepy.editor.AudioFileClip('data/Culture Shock (Instrumental).wav')
	audio_clip_v = moviepy.editor.AudioFileClip('data/Culture Shock (Vocal).wav')
	audio_clip = moviepy.editor.CompositeAudioClip([audio_clip_i, audio_clip_v])
	video_clip = video_clip.set_audio(audio_clip)
	video_clip.write_videofile(mp4_filename, fps=fps, codec='libx264', audio_codec='aac', bitrate='8M')