From 788af6f7bf95cf280cb89a3a704fc6e53eff19bf Mon Sep 17 00:00:00 2001 From: Jon Barker Date: Sat, 5 Aug 2023 11:16:39 -0700 Subject: [PATCH] Fix off by one error in document preprocessing --- tools/preprocess_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 399f93c10e..66977f2850 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -95,6 +95,7 @@ def encode(self, json_line): sentence_lens.append(len(sentence_ids)) if len(doc_ids) > 0 and self.args.append_eod: doc_ids.append(Encoder.tokenizer.eod) + sentence_lens[-1] += 1 ids[key] = doc_ids lens[key] = sentence_lens return ids, lens, len(json_line)