From 6999b7ca9afb6ce0f5cf0aaddc65780100175390 Mon Sep 17 00:00:00 2001 From: kshitij Date: Wed, 18 Oct 2023 22:35:45 +0200 Subject: [PATCH] added pad token to tokenizer --- megatron/tokenizer/tokenizer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 98ad12629..d4089f2b5 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -231,6 +231,8 @@ def __init__(self, vocab_file): self.eod_id = self.tokenizer.token_to_id("<|endoftext|>") self.pad_id = self.tokenizer.token_to_id("<|padding|>") self.pad_token_id = self.pad_id + self._pad_token = self.pad_id + self.padding_side = "right" @property def vocab_size(self):