Each piece of this function is covered in the other sections, so I’m just pasting the full constructor here for reference.

def make_model(
    src_vocab: torchtext.vocab.Vocab,  # Source-side mapping between tokens and indices
    tgt_vocab: torchtext.vocab.Vocab,  # Same, for the target side
    N: int = 6,                        # Number of encoder/decoder blocks
    d_model: int = 512,                # Model (embedding) dimension, shared by queries, keys, and values
    d_ff: int = 2048,                  # Hidden size of the position-wise feed-forward layer
    h: int = 8,                        # Number of attention heads
    dropout: float = 0.1,              # Dropout rate
) -> EncoderDecoder:
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy  # deep-copy so every layer gets its own independent parameters
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab),
    )
 
    # This detail was important in the original implementation:
    # initialize every weight matrix with Glorot / fan_avg (Xavier uniform).
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
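
As a quick sanity check, the constructor can be exercised with a toy configuration. The snippet below is only a sketch: it assumes the vocabularies really are torchtext Vocab objects (as annotated in the signature above) and that Embeddings and Generator accept them directly; the toy tokens and the reduced N=2 are placeholder values, not ones used elsewhere in this post.

from torchtext.vocab import build_vocab_from_iterator

# Hypothetical toy vocabulary, just to exercise the constructor.
toy_vocab = build_vocab_from_iterator(
    [["the", "cat", "sat"], ["a", "dog", "ran"]],
    specials=["<pad>", "<unk>"],
)

# Two encoder/decoder blocks instead of six keep the test model small.
tmp_model = make_model(toy_vocab, toy_vocab, N=2)

# Every matrix-shaped parameter should now be Xavier-initialized.
print(sum(p.numel() for p in tmp_model.parameters()), "trainable parameters")

If your version of the surrounding code instead passes plain vocabulary sizes (integers), the same check works with something like make_model(11, 11, N=2).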