Skip to content

Commit ab85e6c

Browse files
committed
Updating Tacotron2_pyt (BatchNorm init fix), Resnet_tf (cosine LR),
Transformer_pyt (bugfix)
1 parent 55a03b8 commit ab85e6c

33 files changed

Lines changed: 274 additions & 91 deletions

PyTorch/SpeechSynthesis/Tacotron2/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM nvcr.io/nvidia/pytorch:19.06-py3
1+
FROM nvcr.io/nvidia/pytorch:19.07-py3
22

33
ADD . /workspace/tacotron2
44
WORKDIR /workspace/tacotron2

PyTorch/SpeechSynthesis/Tacotron2/README.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -417,7 +417,7 @@ Batch: 8/260 epoch 0
417417

418418
### Getting the data
419419

420-
The Tacotron 2 and WaveGlow models were trained on the LJSpeech-1.1 dataset.
420+
The Tacotron 2 and WaveGlow models were trained on the LJSpeech-1.1 dataset.
421421
This repository contains the `./scripts/prepare_dataset.sh` script which will automatically download and extract the whole dataset. By default, data will be extracted to the `./LJSpeech-1.1` directory. The dataset directory contains a `README` file, a `wavs` directory with all audio samples, and a file `metadata.csv` that contains audio file names and the corresponding transcripts.
422422

423423
#### Dataset guidelines
@@ -428,7 +428,7 @@ The LJSpeech dataset has 13,100 clips that amount to about 24 hours of speech. S
428428

429429
To use datasets different than the default LJSpeech dataset:
430430

431-
1. Prepare a directory with all audio files and pass it to the `--dataset-path` command-line option.
431+
1. Prepare a directory with all audio files and pass it to the `--dataset-path` command-line option.
432432

433433
2. Add two text files containing file lists: one for the training subset (`--training-files`) and one for the validation subset (`--validation-files`).
434434
The structure of the filelists should be as follows:
@@ -679,6 +679,10 @@ benchmarks from input tokens per second to output mel-spectrograms per second
679679
* Introduced batched inference
680680
* Included warmup in the inference script
681681

682+
August 2019
683+
* Fixed inference results
684+
* Fixed initialization of Batch Normalization
685+
682686
### Known issues
683687

684688
There are no known issues in this release.

PyTorch/SpeechSynthesis/Tacotron2/inference.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,7 @@ def parse_args(parser):
4646
Parse commandline arguments.
4747
"""
4848
parser.add_argument('-i', '--input', type=str, required=True,
49-
help='full path to the input text (phareses separated by new line); \
50-
if not provided then use default text')
49+
help='full path to the input text (phareses separated by new line)')
5150
parser.add_argument('-o', '--output', required=True,
5251
help='output folder to save audio (file per phrase)')
5352
parser.add_argument('--tacotron2', type=str,

PyTorch/SpeechSynthesis/Tacotron2/models.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,16 @@ def batchnorm_to_float(module):
5454
return module
5555

5656

57-
def get_model(model_name, model_config, to_cuda):
57+
def init_bn(module):
58+
if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
59+
if module.affine:
60+
module.weight.data.uniform_()
61+
for child in module.children():
62+
init_bn(child)
63+
64+
65+
def get_model(model_name, model_config, to_cuda,
66+
uniform_initialize_bn_weight=False):
5867
""" Code chooses a model based on name"""
5968
model = None
6069
if model_name == 'Tacotron2':
@@ -63,6 +72,10 @@ def get_model(model_name, model_config, to_cuda):
6372
model = WaveGlow(**model_config)
6473
else:
6574
raise NotImplementedError(model_name)
75+
76+
if uniform_initialize_bn_weight:
77+
init_bn(model)
78+
6679
if to_cuda:
6780
model = model.cuda()
6881
return model

PyTorch/SpeechSynthesis/Tacotron2/tacotron2/model.py

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ def __init__(self, encoder_n_convolutions,
206206
dilation=1, w_init_gain='relu'),
207207
nn.BatchNorm1d(encoder_embedding_dim))
208208
convolutions.append(conv_layer)
209-
self.convolutions = nn.ModuleList(convolutions)
209+
self.convolutions = nn.ModuleList(convolutions)
210210

211211
self.lstm = nn.LSTM(encoder_embedding_dim,
212212
int(encoder_embedding_dim / 2), 1,
@@ -231,17 +231,6 @@ def forward(self, x, input_lengths):
231231

232232
return outputs
233233

234-
def infer(self, x):
235-
for conv in self.convolutions:
236-
x = F.dropout(F.relu(conv(x)), 0.5, self.training)
237-
238-
x = x.transpose(1, 2)
239-
240-
self.lstm.flatten_parameters()
241-
outputs, _ = self.lstm(x)
242-
243-
return outputs
244-
245234

246235
class Decoder(nn.Module):
247236
def __init__(self, n_mel_channels, n_frames_per_step,

PyTorch/SpeechSynthesis/Tacotron2/train.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,8 @@ def parse_args(parser):
9696
help='Enable cudnn')
9797
training.add_argument('--cudnn-benchmark', action='store_true',
9898
help='Run cudnn benchmark')
99+
training.add_argument('--disable-uniform-initialize-bn-weight', action='store_true',
100+
help='disable uniform initialization of batchnorm layer weight')
99101

100102
optimization = parser.add_argument_group('optimization setup')
101103
optimization.add_argument(
@@ -343,7 +345,8 @@ def main():
343345

344346
model_config = models.get_model_config(model_name, args)
345347
model = models.get_model(model_name, model_config,
346-
to_cuda=True)
348+
to_cuda=True,
349+
uniform_initialize_bn_weight=not args.disable_uniform_initialize_bn_weight)
347350

348351
if not args.amp_run and distributed_run:
349352
model = DDP(model)

PyTorch/Translation/Transformer/Dockerfile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@
1515
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.06-py3
1616
FROM ${FROM_IMAGE_NAME}
1717

18+
WORKDIR /workspace
19+
RUN git clone https://github.com/NVIDIA/apex \
20+
&& cd apex \
21+
&& pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
1822
# Install Python dependencies
1923
RUN pip install --upgrade --no-cache-dir pip \
2024
&& pip install --no-cache-dir \

PyTorch/Translation/Transformer/README.md

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,7 @@ The following sections provide details on how we achieved our performance and ac
345345
#### Training accuracy results
346346

347347
In order to test the accuracy of our implementation, we have run experiments with different seeds for 100 epochs with batch size 5120 per GPU and learning rate 6e-4 in the pytorch-18.12-py3 Docker container. The plot below shows the BLEU score changes.<br/>
348-
![Accuracy plot](/BLEU.png)
348+
![Accuracy plot](./BLEU.png)
349349

350350
Running this code with the provided hyperparameters will allow you to achieve the following results. Our setup is a DGX-1 with 8x Tesla V100 16GB. We've verified our results after training 32 epochs to obtain multi-GPU and mixed precision scaling results.
351351

@@ -424,12 +424,18 @@ January 2019
424424
- initial commit, forked from [fairseq](https://github.com/pytorch/fairseq/commit/ac5fddfc691267285a84c81d39475411da5ed1c6)
425425

426426
May 2019:
427-
- adding mid-training SacreBLEU evaluation. Better handling of OOMs.
427+
- add mid-training [SacreBLEU](https://pypi.org/project/sacrebleu/1.2.10/) evaluation. Better handling of OOMs.
428428

429429
June 2019
430430
- new README
431-
- jit support added
431+
432+
July 2019
433+
- replace custom fused operators with jit functions
434+
435+
August 2019
436+
- add basic AMP support
432437

433438
## Known issues
434439

435440
- The course of training depends heavily on the random seed. There is high variance in the time required to reach a certain BLEU score, and the highest BLEU score observed varies between runs with different seeds.
441+
- Translations produced by training script during online evaluation may differ from those produced by `generate.py` script. It is probably a format conversion issue.

PyTorch/Translation/Transformer/fairseq/optim/fairseq_optimizer.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,8 @@ def optimizer(self):
4040
"""Return a torch.optim.optimizer.Optimizer instance."""
4141
if not hasattr(self, '_optimizer'):
4242
raise NotImplementedError
43-
if not isinstance(self._optimizer, torch.optim.Optimizer):
43+
#TODO: this shouldn't be dependent of args.amp
44+
if not isinstance(self._optimizer, torch.optim.Optimizer) and not self.args.amp:
4445
raise ValueError('_optimizer must be an instance of torch.optim.Optimizer')
4546
return self._optimizer
4647

PyTorch/Translation/Transformer/fairseq/options.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,9 @@ def get_parser(desc, default_task='translation'):
144144
parser.add_argument('--seed', default=1, type=int, metavar='N',
145145
help='pseudo random number generator seed')
146146
parser.add_argument('--fp16', action='store_true', help='use FP16')
147+
parser.add_argument('--amp', action='store_true', help='use Automatic Mixed Precision')
148+
parser.add_argument('--amp-level', type=str, default="O1", help='choose apm\'s optimization level')
149+
147150
parser.add_argument('--profile', type=int, default=None)
148151
# Task definitions can be found under fairseq/tasks/
149152
parser.add_argument(

0 commit comments

Comments
 (0)