3v324v23 committed on
Commit
28cda0c
1 Parent(s): 49ee521

Add config

Browse files
configs/txt2audio-cfm1-cfg-LargeDiT3.yaml ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 3.0e-06
3
+ target: ldm.models.diffusion.cfm1_audio.CFM
4
+ params:
5
+ linear_start: 0.00085
6
+ linear_end: 0.012
7
+ num_timesteps_cond: 1
8
+ log_every_t: 200
9
+ timesteps: 1000
10
+ first_stage_key: image
11
+ cond_stage_key: caption
12
+ mel_dim: 20
13
+ mel_length: 256
14
+ channels: 0
15
+ cond_stage_trainable: True
16
+ conditioning_key: crossattn
17
+ monitor: val/loss_simple_ema
18
+ scale_by_std: true
19
+ use_ema: false
20
+ scheduler_config:
21
+ target: ldm.lr_scheduler.LambdaLinearScheduler
22
+ params:
23
+ warm_up_steps:
24
+ - 10000
25
+ cycle_lengths:
26
+ - 10000000000000
27
+ f_start:
28
+ - 1.0e-06
29
+ f_max:
30
+ - 1.0
31
+ f_min:
32
+ - 1.0
33
+ unet_config:
34
+ target: ldm.modules.diffusionmodules.flag_large_dit.TxtFlagLargeImprovedDiTV2
35
+ params:
36
+ in_channels: 20
37
+ context_dim: 1024
38
+ hidden_size: 768
39
+ num_heads: 32
40
+ depth: 16
41
+ max_len: 1000
42
+
43
+ first_stage_config:
44
+ target: ldm.models.autoencoder1d.AutoencoderKL
45
+ params:
46
+ embed_dim: 20
47
+ monitor: val/rec_loss
48
+ ckpt_path: useful_ckpts/maa2/maa2.ckpt
49
+ ddconfig:
50
+ double_z: true
51
+ in_channels: 80
52
+ out_ch: 80
53
+ z_channels: 20
54
+ kernel_size: 5
55
+ ch: 384
56
+ ch_mult:
57
+ - 1
58
+ - 2
59
+ - 4
60
+ num_res_blocks: 2
61
+ attn_layers:
62
+ - 3
63
+ down_layers:
64
+ - 0
65
+ dropout: 0.0
66
+ lossconfig:
67
+ target: torch.nn.Identity
68
+ cond_stage_config:
69
+ target: ldm.modules.encoders.modules.FrozenCLAPFLANEmbedder
70
+ params:
71
+ weights_path: useful_ckpts/CLAP/CLAP_weights_2022.pth
72
+
73
+
74
+ lightning:
75
+ callbacks:
76
+ image_logger:
77
+ target: main.AudioLogger
78
+ params:
79
+ sample_rate: 16000
80
+ for_specs: true
81
+ increase_log_steps: false
82
+ batch_frequency: 5000
83
+ max_images: 8
84
+ melvmin: -5
85
+ melvmax: 1.5
86
+ vocoder_cfg:
87
+ target: vocoder.bigvgan.models.VocoderBigVGAN
88
+ params:
89
+ ckpt_vocoder: useful_ckpts/bigvnat
90
+ trainer:
91
+ benchmark: True
92
+ gradient_clip_val: 1.0
93
+ limit_val_batches: 0.0
94
+
95
+ modelcheckpoint:
96
+ params:
97
+ monitor: epoch
98
+ mode: max
99
+ save_top_k: 10
100
+ every_n_epochs: 5
101
+
102
+ data:
103
+ target: main.DataModuleFromConfig
104
+ params:
105
+ batch_size: 16
106
+ num_workers: 20
107
+ wrap: True
108
+ train:
109
+ target: ldm.data.txt_spec_dataset.spec_join_Dataset_Train
110
+ params:
111
+ dataset_cfg:
112
+ dataset_name: audiocaps_train_16000_struct2.tsv
113
+ spec_crop_len: 624
114
+ drop: 0.2
115
+
116
+ validation:
117
+ target: ldm.data.txt_spec_dataset.spec_join_Dataset_Valid
118
+ params:
119
+ dataset_cfg:
120
+ dataset_name: audiocaps_train_16000_struct2.tsv
121
+ spec_crop_len: 624
122
+ drop: 0.0
123
+
124
+
125
+ test_dataset:
126
+ target: ldm.data.joinaudiodataset_struct_sample_anylen.TestManifest
127
+ params:
128
+ manifest: data/audiocaps_test_struct.tsv
129
+ spec_crop_len: 624
configs/txt2music-cfm1-cfg-LargeDiT3.yaml ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 3.0e-06
3
+ target: ldm.models.diffusion.cfm1_audio.CFM
4
+ params:
5
+ linear_start: 0.00085
6
+ linear_end: 0.012
7
+ num_timesteps_cond: 1
8
+ log_every_t: 200
9
+ timesteps: 1000
10
+ first_stage_key: image
11
+ cond_stage_key: caption
12
+ mel_dim: 20
13
+ mel_length: 256
14
+ channels: 0
15
+ cond_stage_trainable: True
16
+ conditioning_key: crossattn
17
+ monitor: val/loss_simple_ema
18
+ scale_by_std: true
19
+ use_ema: false
20
+ scheduler_config:
21
+ target: ldm.lr_scheduler.LambdaLinearScheduler
22
+ params:
23
+ warm_up_steps:
24
+ - 10000
25
+ cycle_lengths:
26
+ - 10000000000000
27
+ f_start:
28
+ - 1.0e-06
29
+ f_max:
30
+ - 1.0
31
+ f_min:
32
+ - 1.0
33
+ unet_config:
34
+ target: ldm.modules.diffusionmodules.flag_large_dit.TxtFlagLargeImprovedDiTV2
35
+ params:
36
+ in_channels: 20
37
+ context_dim: 1024
38
+ hidden_size: 768
39
+ num_heads: 32
40
+ depth: 16
41
+ max_len: 1000
42
+
43
+ first_stage_config:
44
+ target: ldm.models.autoencoder1d.AutoencoderKL
45
+ params:
46
+ embed_dim: 20
47
+ monitor: val/rec_loss
48
+ ckpt_path: useful_ckpts/maa2/maa2.ckpt
49
+ ddconfig:
50
+ double_z: true
51
+ in_channels: 80
52
+ out_ch: 80
53
+ z_channels: 20
54
+ kernel_size: 5
55
+ ch: 384
56
+ ch_mult:
57
+ - 1
58
+ - 2
59
+ - 4
60
+ num_res_blocks: 2
61
+ attn_layers:
62
+ - 3
63
+ down_layers:
64
+ - 0
65
+ dropout: 0.0
66
+ lossconfig:
67
+ target: torch.nn.Identity
68
+ cond_stage_config:
69
+ target: ldm.modules.encoders.modules.FrozenFLANEmbedder
70
+ # params:
71
+ # version: /root/autodl-tmp/liuhuadai/audio_util/Make-An-Audio-3/useful_ckpts/flan-t5-large/pytorch_model.bin
72
+
73
+
74
+ lightning:
75
+ callbacks:
76
+ image_logger:
77
+ target: main.AudioLogger
78
+ params:
79
+ sample_rate: 16000
80
+ for_specs: true
81
+ increase_log_steps: false
82
+ batch_frequency: 5000
83
+ max_images: 8
84
+ melvmin: -5
85
+ melvmax: 1.5
86
+ vocoder_cfg:
87
+ target: vocoder.bigvgan.models.VocoderBigVGAN
88
+ params:
89
+ ckpt_vocoder: useful_ckpts/bigvnat
90
+ trainer:
91
+ benchmark: True
92
+ gradient_clip_val: 1.0
93
+ limit_val_batches: 0.0
94
+
95
+ modelcheckpoint:
96
+ params:
97
+ monitor: epoch
98
+ mode: max
99
+ save_top_k: 10
100
+ every_n_epochs: 5
101
+
102
+ data:
103
+ target: main.DataModuleFromConfig
104
+ params:
105
+ batch_size: 16
106
+ num_workers: 20
107
+ wrap: True
108
+ train:
109
+ target: ldm.data.txt_spec_dataset.spec_join_Dataset_audioset_Train
110
+ params:
111
+ dataset_cfg:
112
+ dataset_name: data/audioset_new_intern.tsv
113
+ spec_crop_len: 624
114
+ drop: 0.2
115
+
116
+ validation:
117
+ target: ldm.data.txt_spec_dataset.spec_join_Dataset_audioset_Valid
118
+ params:
119
+ dataset_cfg:
120
+ dataset_name: data/audioset_new_intern.tsv
121
+ spec_crop_len: 624
122
+ drop: 0.0
123
+
124
+
125
+ test_dataset:
126
+ target: ldm.data.joinaudiodataset_struct_sample_anylen.TestManifest
127
+ params:
128
+ manifest: data/musiccaps_test_16000_struct.tsv
129
+ spec_crop_len: 624
configs/video2audio-cfm1-cfg-LargeDiT1-moe.yaml ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 3.0e-06
3
+ target: ldm.models.diffusion.cfm1_audio.CFM
4
+ params:
5
+ linear_start: 0.00085
6
+ linear_end: 0.012
7
+ num_timesteps_cond: 1
8
+ log_every_t: 200
9
+ timesteps: 1000
10
+ first_stage_key: "mix_spec"
11
+ cond_stage_key: "mix_video_feat"
12
+ mel_dim: 20
13
+ mel_length: 256
14
+ channels: 0
15
+ cond_stage_trainable: True
16
+ conditioning_key: crossattn
17
+ monitor: val/loss_simple_ema
18
+ scale_by_std: true
19
+ use_ema: false
20
+ scheduler_config:
21
+ target: ldm.lr_scheduler.LambdaLinearScheduler
22
+ params:
23
+ warm_up_steps:
24
+ - 10000
25
+ cycle_lengths:
26
+ - 10000000000000
27
+ f_start:
28
+ - 1.0e-06
29
+ f_max:
30
+ - 1.0
31
+ f_min:
32
+ - 1.0
33
+ unet_config:
34
+ target: ldm.modules.diffusionmodules.flag_large_dit_moe.VideoFlagLargeDiT
35
+ params:
36
+ in_channels: 20
37
+ context_dim: 768
38
+ hidden_size: 768
39
+ num_heads: 32
40
+ depth: 16
41
+ max_len: 1000
42
+ num_experts: 4
43
+
44
+
45
+ first_stage_config:
46
+ target: ldm.models.autoencoder1d.AutoencoderKL
47
+ params:
48
+ embed_dim: 20
49
+ monitor: val/rec_loss
50
+ ckpt_path: /apdcephfs_intern/share_1316500/nlphuang/results/Text_to_audio/ldm_src/ckpt/epoch=000032.ckpt
51
+ ddconfig:
52
+ double_z: true
53
+ in_channels: 80
54
+ out_ch: 80
55
+ z_channels: 20
56
+ kernel_size: 5
57
+ ch: 384
58
+ ch_mult:
59
+ - 1
60
+ - 2
61
+ - 4
62
+ num_res_blocks: 2
63
+ attn_layers:
64
+ - 3
65
+ down_layers:
66
+ - 0
67
+ dropout: 0.0
68
+ lossconfig:
69
+ target: torch.nn.Identity
70
+ cond_stage_config:
71
+ target: ldm.modules.encoders.modules.Video_Feat_Encoder_NoPosembed
72
+ params:
73
+ origin_dim: 512
74
+ embed_dim: 768
75
+ seq_len: 40
76
+
77
+ lightning:
78
+ callbacks:
79
+ image_logger:
80
+ target: main.AudioLogger
81
+ params:
82
+ sample_rate: 16000
83
+ for_specs: true
84
+ increase_log_steps: false
85
+ batch_frequency: 5000
86
+ max_images: 8
87
+ melvmin: -5
88
+ melvmax: 1.5
89
+ vocoder_cfg:
90
+ target: vocoder.bigvgan.models.VocoderBigVGAN
91
+ params:
92
+ ckpt_vocoder: /apdcephfs_intern/share_1316500/nlphuang/results/Text_to_audio/ldm_src/ckpt/bigvnat
93
+ trainer:
94
+ benchmark: True
95
+ gradient_clip_val: 1.0
96
+ modelcheckpoint:
97
+ params:
98
+ monitor: epoch
99
+ mode: max
100
+ save_top_k: 10
101
+ every_n_epochs: 5
102
+
103
+ data:
104
+ target: main.DataModuleFromConfig
105
+ params:
106
+ batch_size: 16 # originally 220
107
+ num_workers: 10
108
+ wrap: True
109
+ train:
110
+ target: ldm.data.video_spec_maa2_dataset.audio_video_spec_fullset_Dataset_Train
111
+ params:
112
+ dataset_cfg:
113
+ dataset1:
114
+ dataset_name: VGGSound
115
+ data_dir: /apdcephfs_intern/share_1316500/nlphuang/data/video_to_audio/vggsound/
116
+ video_dir: /apdcephfs_intern/share_1316500/nlphuang/data/video_to_audio/vggsound/ # only needed for inference
117
+ split_txt_path: /apdcephfs_intern/share_1316500/nlphuang/data/video_to_audio/vggsound/split_txt
118
+
119
+ feat_type: CAVP_feat
120
+ sr: 16000
121
+ duration: 10
122
+ truncate: 131072
123
+ fps: 4
124
+ hop_len: 256
125
+ drop: 0.2
126
+
127
+ validation:
128
+ target: ldm.data.video_spec_maa2_dataset.audio_video_spec_fullset_Dataset_Valid
129
+ params:
130
+ dataset_cfg:
131
+ dataset1:
132
+ dataset_name: VGGSound
133
+ data_dir: /apdcephfs_intern/share_1316500/nlphuang/data/video_to_audio/vggsound/
134
+ video_dir: /apdcephfs_intern/share_1316500/nlphuang/data/video_to_audio/vggsound/ # only needed for inference
135
+ split_txt_path: /apdcephfs_intern/share_1316500/nlphuang/data/video_to_audio/vggsound/split_txt
136
+
137
+ feat_type: CAVP_feat
138
+ sr: 16000
139
+ duration: 10
140
+ truncate: 131072
141
+ fps: 4
142
+ hop_len: 256
data/audiocaps_test_struct.tsv ADDED
The diff for this file is too large to render. See raw diff
 
data/musiccaps_test_16000_struct.tsv ADDED
The diff for this file is too large to render. See raw diff