Skip to content

Instantly share code, notes, and snippets.

@fajela
Forked from rolux/culture_shock.py
Created August 26, 2020 18:15
Show Gist options
  • Save fajela/0be6ff63d1b8a420365a91bc46b5c247 to your computer and use it in GitHub Desktop.
Save fajela/0be6ff63d1b8a420365a91bc46b5c247 to your computer and use it in GitHub Desktop.
# git clone https://github.com/NVlabs/stylegan2
import os
import numpy as np
from scipy.interpolate import interp1d
from scipy.io import wavfile
import matplotlib.pyplot as plt
import PIL.Image
import moviepy.editor
import dnnlib
import dnnlib.tflib as tflib
import pretrained_networks
audio = {}
fps = 60
# https://www.google.com/search?q=death+grips+black+google+download
for mp3_filename in [f for f in os.listdir('data') if f.endswith('.mp3')]:
mp3_filename = f'data/{mp3_filename}'
wav_filename = mp3_filename[:-4] + '.wav'
if not os.path.exists(wav_filename):
audio_clip = moviepy.editor.AudioFileClip(mp3_filename)
audio_clip.write_audiofile(wav_filename, fps=44100, nbytes=2, codec='pcm_s16le')
track_name = os.path.basename(wav_filename)[15:-5]
rate, signal = wavfile.read(wav_filename)
signal = np.mean(signal, axis=1) # to mono
signal = np.abs(signal)
seed = signal.shape[0]
duration = signal.shape[0] / rate
frames = int(np.ceil(duration * fps))
samples_per_frame = signal.shape[0] / frames
audio[track_name] = np.zeros(frames, dtype=signal.dtype)
for frame in range(frames):
start = int(round(frame * samples_per_frame))
stop = int(round((frame + 1) * samples_per_frame))
audio[track_name][frame] = np.mean(signal[start:stop], axis=0)
audio[track_name] /= max(audio[track_name])
for track in sorted(audio.keys()):
plt.figure(figsize=(8, 3))
plt.title(track)
plt.plot(audio[track])
plt.savefig(f'data/{track}.png')
network_pkl = 'gdrive:networks/stylegan2-ffhq-config-f.pkl'
_G, _D, Gs = pretrained_networks.load_networks(network_pkl)
Gs_kwargs = dnnlib.EasyDict()
Gs_kwargs.output_transform = dict(func=tflib.convert_images_to_uint8, nchw_to_nhwc=True)
Gs_kwargs.randomize_noise = False
Gs_syn_kwargs = dnnlib.EasyDict()
Gs_syn_kwargs.output_transform = dict(func=tflib.convert_images_to_uint8, nchw_to_nhwc=True)
Gs_syn_kwargs.randomize_noise = False
Gs_syn_kwargs.minibatch_size = 4
noise_vars = [var for name, var in Gs.components.synthesis.vars.items() if name.startswith('noise')]
w_avg = Gs.get_var('dlatent_avg')
def get_ws(n, frames, seed):
filename = f'data/ws_{n}_{frames}_{seed}.npy'
if not os.path.exists(filename):
src_ws = np.random.RandomState(seed).randn(n, 512)
ws = np.empty((frames, 512))
for i in range(512):
# FIXME: retarded
x = np.linspace(0, 3*frames, 3*len(src_ws), endpoint=False)
y = np.tile(src_ws[:, i], 3)
x_ = np.linspace(0, 3*frames, 3*frames, endpoint=False)
y_ = interp1d(x, y, kind='quadratic', fill_value='extrapolate')(x_)
ws[:, i] = y_[frames:2*frames]
np.save(filename, ws)
else:
ws = np.load(filename)
return ws
def mix_styles(wa, wb, ivs):
w = np.copy(wa)
for i, v in ivs:
w[i] = wa[i] * (1 - v) + wb[i] * v
return w
def normalize_vector(v):
return v * np.std(w_avg) / np.std(v) + np.mean(w_avg) - np.mean(v)
def render_frame(t):
global base_index
frame = np.clip(np.int(np.round(t * fps)), 0, frames - 1)
base_index += base_speed * audio['Instrumental'][frame]**2
base_w = base_ws[int(round(base_index)) % len(base_ws)]
base_w = np.tile(base_w, (18, 1))
psi = 0.5 + audio['FX'][frame] / 2
base_w = w_avg + (base_w - w_avg) * psi
mix_w = np.tile(mix_ws[frame], (18, 1))
mix_w = w_avg + (mix_w - w_avg) * 0.75
ranges = [range(0, 4), range(4, 8), range(8, 18)]
values = [audio[track][frame] for track in ['Drums', 'E Drums', 'Synth']]
w = mix_styles(base_w, mix_w, zip(ranges, values))
w += mouth_open * audio['Vocal'][frame] * 1.5
image = Gs.components.synthesis.run(np.stack([w]), **Gs_syn_kwargs)[0]
image = PIL.Image.fromarray(image).resize((size, size), PIL.Image.LANCZOS)
return np.array(image)
size = 1080
seconds = int(np.ceil(duration))
resolution = 10
base_frames = resolution * frames
base_ws = get_ws(seconds, base_frames, seed)
base_speed = base_frames / sum(audio['Instrumental']**2)
base_index = 0
mix_ws = get_ws(seconds, frames, seed + 1)
# https://rolux.org/media/stylegan2/vectors/mouth_ratio.npy
mouth_open = normalize_vector(-np.load('data/mouth_ratio.npy'))
mp4_filename = 'data/Culture Shock.mp4'
video_clip = moviepy.editor.VideoClip(render_frame, duration=duration)
audio_clip_i = moviepy.editor.AudioFileClip('data/Culture Shock (Instrumental).wav')
audio_clip_v = moviepy.editor.AudioFileClip('data/Culture Shock (Vocal).wav')
audio_clip = moviepy.editor.CompositeAudioClip([audio_clip_i, audio_clip_v])
video_clip = video_clip.set_audio(audio_clip)
video_clip.write_videofile(mp4_filename, fps=fps, codec='libx264', audio_codec='aac', bitrate='8M')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment