Human Motion Diffusion Model (MDM) Usage Notes
Dependencies:
spacy (install with pip install spacy; the MDM setup instructions also fetch the English model via python -m spacy download en_core_web_sm)
Required companion project:
GitHub - EricGuo5513/HumanML3D: HumanML3D: A large and diverse 3d human motion-language dataset.
Preparing the dataset
HumanML3D/HumanML3D at main · EricGuo5513/HumanML3D · GitHub
Step 1: copy the HumanML3D/HumanML3D directory into the repo's dataset directory.
Step 2: extract texts.zip from the HumanML3D dataset into the texts directory (a quick layout check follows below).
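Before running inference it is worth sanity-checking that the files landed where the loader expects them. A minimal sketch; the exact file names are assumptions based on the usual HumanML3D/MDM layout, so adjust them to your checkout:

import os

# Hypothetical sanity check: these paths follow the common MDM/HumanML3D
# layout (normalization stats plus the extracted texts directory).
expected = [
    'dataset/HumanML3D/Mean.npy',
    'dataset/HumanML3D/Std.npy',
    'dataset/HumanML3D/test.txt',
    'dataset/HumanML3D/texts',
]
for p in expected:
    print(('OK      ' if os.path.exists(p) else 'MISSING ') + p)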
Online demo:
daanelson/motion_diffusion_model – Run with an API on Replicate
Consolidated inference script (the repo's sample/generate.py with the arguments hard-coded so it runs end to end in one click):
# This code is based on https://github.com/openai/guided-diffusion
"""
Generate a large batch of image samples from a model and save them as a large
numpy array. This can be used to produce samples for FID evaluation.
"""
from utils.fixseed import fixseed
import os
import numpy as np
import torch
from utils.parser_util import generate_args
from utils.model_util import create_model_and_diffusion, load_model_wo_clip
from utils import dist_util
from model.cfg_sampler import ClassifierFreeSampleModel
from data_loaders.get_data import get_dataset_loader
from data_loaders.humanml.scripts.motion_process import recover_from_ric
import data_loaders.humanml.utils.paramUtil as paramUtil
from data_loaders.humanml.utils.plot_script import plot_3d_motion
import shutil
from data_loaders.tensors import collate
def main():
    args = generate_args()
    fixseed(args.seed)
    # Hard-coded overrides so the script runs without CLI flags;
    # edit these to point at your own prompt file and checkpoint.
    args.text_prompt = r''
    args.input_text = r'assets/example_text_prompts.txt'
    args.model_path = r'humanml_trans_enc_512/model000200000.pt'
    out_path = args.output_dir
    name = os.path.basename(os.path.dirname(args.model_path))
    niter = os.path.basename(args.model_path).replace('model', '').replace('.pt', '')
    max_frames = 196 if args.dataset in ['kit', 'humanml'] else 60
    fps = 12.5 if args.dataset == 'kit' else 20
    n_frames = min(max_frames, int(args.motion_length*fps))
    is_using_data = not any([args.input_text, args.text_prompt, args.action_file, args.action_name])
    dist_util.setup_dist(args.device)
    if out_path == '':
        out_path = os.path.join(os.path.dirname(args.model_path),
                                'samples_{}_{}_seed{}'.format(name, niter, args.seed))
        if args.text_prompt != '':
            out_path += '_' + args.text_prompt.replace(' ', '_').replace('.', '')
        elif args.input_text != '':
            out_path += '_' + os.path.basename(args.input_text).replace('.txt', '').replace(' ', '_').replace('.', '')

    # this block must be called BEFORE the dataset is loaded
    if args.text_prompt != '':
        texts = [args.text_prompt]
        args.num_samples = 1
    elif args.input_text != '':
        assert os.path.exists(args.input_text)
        with open(args.input_text, 'r') as fr:
            texts = fr.readlines()
        texts = [s.replace('\n', '') for s in texts]
        args.num_samples = len(texts)
    elif args.action_name:
        action_text = [args.action_name]
        args.num_samples = 1
    elif args.action_file != '':
        assert os.path.exists(args.action_file)
        with open(args.action_file, 'r') as fr:
            action_text = fr.readlines()
        action_text = [s.replace('\n', '') for s in action_text]
        args.num_samples = len(action_text)

    assert args.num_samples <= args.batch_size, \
        f'Please either increase batch_size({args.batch_size}) or reduce num_samples({args.num_samples})'
    # So why do we need this check? In order to protect the GPU from a memory overload in the following line.
    # If your GPU can handle a batch size larger than the default, you can specify it through the --batch_size flag.
    # If it can't, and you still want to sample more prompts, run this script with different seeds
    # (specify through the --seed flag)
    args.batch_size = args.num_samples  # Sampling a single batch from the testset, with exactly args.num_samples
    print('Loading dataset...')
    data = load_dataset(args, max_frames, n_frames)
    total_num_samples = args.num_samples * args.num_repetitions

    print("Creating model and diffusion...")
    model, diffusion = create_model_and_diffusion(args, data)

    print(f"Loading checkpoints from [{args.model_path}]...")
    state_dict = torch.load(args.model_path, map_location='cpu')
    load_model_wo_clip(model, state_dict)

    if args.guidance_param != 1:
        model = ClassifierFreeSampleModel(model)   # wrapping model with the classifier-free sampler
    model.to(dist_util.dev())
    model.eval()  # disable random masking

    if is_using_data:
        iterator = iter(data)
        _, model_kwargs = next(iterator)
    else:
        collate_args = [{'inp': torch.zeros(n_frames), 'tokens': None, 'lengths': n_frames}] * args.num_samples
        is_t2m = any([args.input_text, args.text_prompt])
        if is_t2m:
            # t2m
            collate_args = [dict(arg, text=txt) for arg, txt in zip(collate_args, texts)]
        else:
            # a2m
            action = data.dataset.action_name_to_action(action_text)
            collate_args = [dict(arg, action=one_action, action_text=one_action_text) for
                            arg, one_action, one_action_text in zip(collate_args, action, action_text)]
        _, model_kwargs = collate(collate_args)

    all_motions = []
    all_lengths = []
    all_text = []

    for rep_i in range(args.num_repetitions):
        print(f'### Sampling [repetitions #{rep_i}]')

        # add CFG scale to batch
        if args.guidance_param != 1:
            model_kwargs['y']['scale'] = torch.ones(args.batch_size, device=dist_util.dev()) * args.guidance_param

        sample_fn = diffusion.p_sample_loop

        sample = sample_fn(
            model,
            # (args.batch_size, model.njoints, model.nfeats, n_frames),  # BUG FIX - this one caused a mismatch between training and inference
            (args.batch_size, model.njoints, model.nfeats, max_frames),  # BUG FIX
            clip_denoised=False,
            model_kwargs=model_kwargs,
            skip_timesteps=0,  # 0 is the default value - i.e. don't skip any step
            init_image=None,
            progress=True,
            dump_steps=None,
            noise=None,
            const_noise=False,
        )

        # Recover XYZ *positions* from HumanML3D vector representation
        if model.data_rep == 'hml_vec':
            n_joints = 22 if sample.shape[1] == 263 else 21
            sample = data.dataset.t2m_dataset.inv_transform(sample.cpu().permute(0, 2, 3, 1)).float()
            sample = recover_from_ric(sample, n_joints)
            sample = sample.view(-1, *sample.shape[2:]).permute(0, 2, 3, 1)

        rot2xyz_pose_rep = 'xyz' if model.data_rep in ['xyz', 'hml_vec'] else model.data_rep
        rot2xyz_mask = None if rot2xyz_pose_rep == 'xyz' else model_kwargs['y']['mask'].reshape(args.batch_size, n_frames).bool()
        sample = model.rot2xyz(x=sample, mask=rot2xyz_mask, pose_rep=rot2xyz_pose_rep, glob=True, translation=True,
                               jointstype='smpl', vertstrans=True, betas=None, beta=0, glob_rot=None,
                               get_rotations_back=False)

        if args.unconstrained:
            all_text += ['unconstrained'] * args.num_samples
        else:
            text_key = 'text' if 'text' in model_kwargs['y'] else 'action_text'
            all_text += model_kwargs['y'][text_key]

        all_motions.append(sample.cpu().numpy())
        all_lengths.append(model_kwargs['y']['lengths'].cpu().numpy())

        print(f"created {len(all_motions) * args.batch_size} samples")

    all_motions = np.concatenate(all_motions, axis=0)
    all_motions = all_motions[:total_num_samples]  # [bs, njoints, 6, seqlen]
    all_text = all_text[:total_num_samples]
    all_lengths = np.concatenate(all_lengths, axis=0)[:total_num_samples]

    if os.path.exists(out_path):
        shutil.rmtree(out_path)
    os.makedirs(out_path)

    npy_path = os.path.join(out_path, 'results.npy')
    print(f"saving results file to [{npy_path}]")
    np.save(npy_path,
            {'motion': all_motions, 'text': all_text, 'lengths': all_lengths,
             'num_samples': args.num_samples, 'num_repetitions': args.num_repetitions})
    with open(npy_path.replace('.npy', '.txt'), 'w') as fw:
        fw.write('\n'.join(all_text))
    with open(npy_path.replace('.npy', '_len.txt'), 'w') as fw:
        fw.write('\n'.join([str(l) for l in all_lengths]))

    print(f"saving visualizations to [{out_path}]...")
    skeleton = paramUtil.kit_kinematic_chain if args.dataset == 'kit' else paramUtil.t2m_kinematic_chain

    sample_files = []
    num_samples_in_out_file = 7

    sample_print_template, row_print_template, all_print_template, \
    sample_file_template, row_file_template, all_file_template = construct_template_variables(args.unconstrained)

    for sample_i in range(args.num_samples):
        rep_files = []
        for rep_i in range(args.num_repetitions):
            caption = all_text[rep_i*args.batch_size + sample_i]
            length = all_lengths[rep_i*args.batch_size + sample_i]
            motion = all_motions[rep_i*args.batch_size + sample_i].transpose(2, 0, 1)[:length]
            save_file = sample_file_template.format(sample_i, rep_i)
            print(sample_print_template.format(caption, sample_i, rep_i, save_file))
            animation_save_path = os.path.join(out_path, save_file)
            plot_3d_motion(animation_save_path, skeleton, motion, dataset=args.dataset, title=caption, fps=fps)
            # Credit for visualization: https://github.com/EricGuo5513/text-to-motion
            rep_files.append(animation_save_path)

        sample_files = save_multiple_samples(args, out_path,
                                             row_print_template, all_print_template, row_file_template, all_file_template,
                                             caption, num_samples_in_out_file, rep_files, sample_files, sample_i)

    abs_path = os.path.abspath(out_path)
    print(f'[Done] Results are at [{abs_path}]')
def save_multiple_samples(args, out_path, row_print_template, all_print_template, row_file_template, all_file_template,
                          caption, num_samples_in_out_file, rep_files, sample_files, sample_i):
    all_rep_save_file = row_file_template.format(sample_i)
    all_rep_save_path = os.path.join(out_path, all_rep_save_file)
    ffmpeg_rep_files = [f' -i {f} ' for f in rep_files]
    hstack_args = f' -filter_complex hstack=inputs={args.num_repetitions}' if args.num_repetitions > 1 else ''
    ffmpeg_rep_cmd = f'ffmpeg -y -loglevel warning ' + ''.join(ffmpeg_rep_files) + f'{hstack_args} {all_rep_save_path}'
    os.system(ffmpeg_rep_cmd)
    print(row_print_template.format(caption, sample_i, all_rep_save_file))
    sample_files.append(all_rep_save_path)

    if (sample_i + 1) % num_samples_in_out_file == 0 or sample_i + 1 == args.num_samples:
        # all_sample_save_file = f'samples_{(sample_i - len(sample_files) + 1):02d}_to_{sample_i:02d}.mp4'
        all_sample_save_file = all_file_template.format(sample_i - len(sample_files) + 1, sample_i)
        all_sample_save_path = os.path.join(out_path, all_sample_save_file)
        print(all_print_template.format(sample_i - len(sample_files) + 1, sample_i, all_sample_save_file))
        ffmpeg_rep_files = [f' -i {f} ' for f in sample_files]
        vstack_args = f' -filter_complex vstack=inputs={len(sample_files)}' if len(sample_files) > 1 else ''
        ffmpeg_rep_cmd = f'ffmpeg -y -loglevel warning ' + ''.join(
            ffmpeg_rep_files) + f'{vstack_args} {all_sample_save_path}'
        os.system(ffmpeg_rep_cmd)
        sample_files = []
    return sample_files
def construct_template_variables(unconstrained):
    row_file_template = 'sample{:02d}.mp4'
    all_file_template = 'samples_{:02d}_to_{:02d}.mp4'
    if unconstrained:
        sample_file_template = 'row{:02d}_col{:02d}.mp4'
        sample_print_template = '[{} row #{:02d} column #{:02d} | -> {}]'
        row_file_template = row_file_template.replace('sample', 'row')
        row_print_template = '[{} row #{:02d} | all columns | -> {}]'
        all_file_template = all_file_template.replace('samples', 'rows')
        all_print_template = '[rows {:02d} to {:02d} | -> {}]'
    else:
        sample_file_template = 'sample{:02d}_rep{:02d}.mp4'
        sample_print_template = '["{}" ({:02d}) | Rep #{:02d} | -> {}]'
        row_print_template = '[ "{}" ({:02d}) | all repetitions | -> {}]'
        all_print_template = '[samples {:02d} to {:02d} | all repetitions | -> {}]'

    return sample_print_template, row_print_template, all_print_template, \
           sample_file_template, row_file_template, all_file_template
def load_dataset(args, max_frames, n_frames):
    data = get_dataset_loader(name=args.dataset,
                              batch_size=args.batch_size,
                              num_frames=max_frames,
                              split='test',
                              hml_mode='text_only')
    if args.dataset in ['kit', 'humanml']:
        data.dataset.t2m_dataset.fixed_length = n_frames
    return data


if __name__ == "__main__":
    main()
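The script stores everything in a single results.npy, but since it saves a Python dict (see the np.save call above), it has to be loaded with allow_pickle=True and unwrapped with .item(). A minimal inspection sketch; the shape in the comment is an assumption for the humanml checkpoint:

import numpy as np

# results.npy holds a dict, not a bare array (see np.save in the script above)
results = np.load('results.npy', allow_pickle=True).item()
print(results['text'])           # one caption per generated sample
print(results['lengths'])        # valid frame count for each sample
print(results['motion'].shape)   # e.g. (num_samples * num_repetitions, 22, 3, 196) for humanml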
Rendering the SMPL mesh (the repo's visualize/render_mesh.py, with the input path hard-coded):
import argparse
import os
from visualize import vis_utils
import shutil
from tqdm import tqdm
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_path", type=str, default=r"E:\project\202404\motion-diffusion-model-main\humanml_trans_enc_512\samples_humanml_trans_enc_512_000200000_seed10_example_text_prompts\sample00_rep00.mp4", help='stick figure mp4 file to be rendered.')
    parser.add_argument("--cuda", type=bool, default=True, help='')
    parser.add_argument("--device", type=int, default=0, help='')
    params = parser.parse_args()

    assert params.input_path.endswith('.mp4')
    parsed_name = os.path.basename(params.input_path).replace('.mp4', '').replace('sample', '').replace('rep', '')
    sample_i, rep_i = [int(e) for e in parsed_name.split('_')]
    npy_path = os.path.join(os.path.dirname(params.input_path), 'results.npy')
    out_npy_path = params.input_path.replace('.mp4', '_smpl_params.npy')
    assert os.path.exists(npy_path)
    results_dir = params.input_path.replace('.mp4', '_obj')
    if os.path.exists(results_dir):
        shutil.rmtree(results_dir)
    os.makedirs(results_dir)

    npy2obj = vis_utils.npy2obj(npy_path, sample_i, rep_i,
                                device=params.device, cuda=params.cuda)

    print('Saving obj files to [{}]'.format(os.path.abspath(results_dir)))
    for frame_i in tqdm(range(npy2obj.real_num_frames)):
        npy2obj.save_obj(os.path.join(results_dir, 'frame{:03d}.obj'.format(frame_i)), frame_i)

    print('Saving SMPL params to [{}]'.format(os.path.abspath(out_npy_path)))
    npy2obj.save_npy(out_npy_path)
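Before importing the exported OBJ sequence into Blender or another DCC tool, each frame can be verified with a mesh library. A minimal sketch using trimesh (an extra dependency, not part of the MDM requirements); SMPL meshes should all share the same 6890-vertex topology:

import glob
import trimesh

# Directory name follows the *_obj convention from the script above
frames = sorted(glob.glob('sample00_rep00_obj/frame*.obj'))
for path in frames[:3]:
    mesh = trimesh.load(path, process=False)  # keep vertex order as exported
    print(path, mesh.vertices.shape, mesh.faces.shape)
print(f'{len(frames)} frames total')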
Visualization code:
The visualization code bundled with the repo does not display on Windows; the modified version below works there:
import math
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.animation import FuncAnimation, FFMpegFileWriter
from mpl_toolkits.mplot3d.art3d import Poly3DCollection
import mpl_toolkits.mplot3d.axes3d as p3
# import cv2
from textwrap import wrap
def list_cut_average(ll, intervals):
    if intervals == 1:
        return ll

    bins = math.ceil(len(ll) * 1.0 / intervals)
    ll_new = []
    for i in range(bins):
        l_low = intervals * i
        l_high = l_low + intervals
        l_high = l_high if l_high < len(ll) else len(ll)
        ll_new.append(np.mean(ll[l_low:l_high]))
    return ll_new


def plot_3d_motion(save_path, kinematic_tree, joints, title, dataset, figsize=(3, 3), fps=120, radius=3, vis_mode='default', gt_frames=[]):
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111, projection='3d')
    plt.tight_layout()
    title = '\n'.join(wrap(title, 20))

    colors_blue = ["#4D84AA", "#5B9965", "#61CEB9", "#34C1E2", "#80B79A"]  # GT color
    colors_orange = ["#DD5A37", "#D69E00", "#B75A39", "#FF6D00", "#DDB50E"]  # Generation color
    colors = colors_orange
    def init():
        ax.set_xlim3d([-radius / 2, radius / 2])
        ax.set_ylim3d([0, radius])
        ax.set_zlim3d([-radius / 3., radius * 2 / 3.])
        fig.suptitle(title, fontsize=10)
        ax.grid(False)  # newer matplotlib versions removed the grid(b=...) keyword
        return fig,
    def update(index):
        while len(ax.lines) > 0:
            ax.lines[0].remove()
        while len(ax.collections) > 0:
            ax.collections[0].remove()
        ax.view_init(elev=120, azim=-90)
        ax.dist = 7.5

        # Draw motion
        used_colors = colors_blue if index in gt_frames else colors
        for i, (chain, color) in enumerate(zip(kinematic_tree, used_colors)):
            linewidth = 4.0 if i < 5 else 2.0
            ax.plot3D(data[index, chain, 0], data[index, chain, 1], data[index, chain, 2], linewidth=linewidth, color=color)

        plt.axis('off')
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        ax.set_zticklabels([])
        return fig,

    data = joints.copy().reshape(len(joints), -1, 3)
    frame_number = data.shape[0]
    ani = FuncAnimation(fig, update, frames=frame_number, interval=1000 / fps, repeat=False, init_func=init)
    ani.save(save_path, fps=fps)
    # plt.show()
    plt.close()
if __name__ == '__main__':
    dataset = 'humanml'
    save_path = '0415.mp4'
    title = 'title'
    kinematic_tree = [[0, 2, 5, 8, 11], [0, 1, 4, 7, 10], [0, 3, 6, 9, 12, 15], [9, 14, 17, 19, 21], [9, 13, 16, 18, 20]]
    npz_data = np.load(r"E:\04151.npz", allow_pickle=True)
    joints = npz_data['joints_3d'].item()['data']
    joints /= 20  # scale down to fit the fixed plot radius
    plot_3d_motion(save_path, kinematic_tree, joints, title, dataset, figsize=(3, 3), fps=120, radius=3, vis_mode='default', gt_frames=[])
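Because plot_3d_motion only needs a (frames, joints, 3) array, it can be smoke-tested without the npz file above. A minimal sketch with synthetic joints; the 22-joint chain is the HumanML3D skeleton used throughout, and writing mp4 still requires ffmpeg on the PATH:

import numpy as np

# Synthetic motion: a vertical stick of 22 joints drifting along x for 60 frames,
# just to confirm that matplotlib + ffmpeg can write an mp4 on this machine.
chain = [[0, 2, 5, 8, 11], [0, 1, 4, 7, 10], [0, 3, 6, 9, 12, 15],
         [9, 14, 17, 19, 21], [9, 13, 16, 18, 20]]
joints = np.zeros((60, 22, 3), dtype=np.float32)
joints[:, :, 1] = np.linspace(0.0, 1.5, 22)[None, :]   # spread joints along y
joints[:, :, 0] += np.linspace(0.0, 1.0, 60)[:, None]  # root drifts along x
plot_3d_motion('smoke_test.mp4', chain, joints, 'smoke test', 'humanml', fps=20)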