Each part of this function is covered in the other sections, so I'm just pasting it here for reference.
import copy

import torch.nn as nn
import torchtext.vocab


def make_model(
    src_vocab: torchtext.vocab.Vocab,  # Mapping between tokens and indices
    tgt_vocab: torchtext.vocab.Vocab,  # Same
    N: int = 6,            # Number of encoder/decoder blocks
    d_model: int = 512,    # Model (embedding) dimension, shared by keys, queries, and values
    d_ff: int = 2048,      # Hidden size of the position-wise feed-forward layer
    h: int = 8,            # Number of attention heads
    dropout: float = 0.1,  # Dropout rate
) -> EncoderDecoder:
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab),
    )
    # This detail from the original code is important:
    # initialize parameters with Glorot / fan_avg (Xavier uniform).
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
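
If you want a quick sanity check that the pieces wire together, something like the sketch below works. The `src_vocab` and `tgt_vocab` names stand in for whatever vocabulary objects you built in the data section, and the shrunken hyperparameters are only there to keep the test cheap; this is not part of the model definition itself.

# Rough smoke test, assuming src_vocab / tgt_vocab were built elsewhere.
tiny_model = make_model(src_vocab, tgt_vocab, N=2, d_model=128, d_ff=512, h=4)

# Counting trainable parameters is a cheap way to confirm the model was
# assembled; the full-size configuration (N=6, d_model=512) lands in the
# tens of millions of parameters.
n_params = sum(p.numel() for p in tiny_model.parameters() if p.requires_grad)
print(f"{n_params:,} trainable parameters")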