From e20927318b3a17de46636c980ccb11beaa0352f5 Mon Sep 17 00:00:00 2001
From: Geoffrey Martin-Noble
Date: Fri, 5 Jan 2024 22:27:18 +0000
Subject: [PATCH 1/2] gitignore further common output directories

Probably the most controversial addition here is `data/*/samples/*`.
I was using it to save sample prompts for datasets, and I'm happy to
drop it if its inclusion is not desired.

The rest are entries you'd commonly want to gitignore: venv
directories, VS Code workspaces, output directories (using the
directory names suggested by this codebase), and the default wandb
output directory.
---
 .gitignore | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.gitignore b/.gitignore
index a3c82a25b7..545dce4a0e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,9 @@ __pycache__/
 *.pt
 *.pyc
 input.txt
+*.venv
+*.code-workspace
+out/
+out-*/
+wandb/*
+data/*/samples/*

From fba53b8858e925c1fb7c8753c9d589638c0e042a Mon Sep 17 00:00:00 2001
From: Geoffrey Martin-Noble
Date: Fri, 5 Jan 2024 22:30:30 +0000
Subject: [PATCH 2/2] Generalize encode/decode for datasets

This fixes a TODO in sample.py to allow arbitrary encoding/decoding
schemes for different datasets. To do so, I switched from pickle to
dill, which extends pickle to support pickling functions, including
the globals they reference. dill is already a dependency of the
datasets package, so this doesn't add any new dependencies.
---
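A quick illustrative sketch (not part of the patch) of what dill buys
us here: plain pickle can't serialize the lambdas below, while dill
with recurse=True also serializes the globals they capture, so
meta.pkl stays self-contained. The toy stoi/itos tables stand in for
the real ones built in prepare.py:

    import dill

    # toy stand-ins for the stoi/itos tables built in prepare.py
    stoi = {'a': 0, 'b': 1}
    itos = {0: 'a', 1: 'b'}
    encode = lambda s: [stoi[c] for c in s]
    decode = lambda l: ''.join([itos[i] for i in l])

    # recurse=True recursively serializes the globals the functions
    # reference (stoi/itos), so the blob loads in a fresh process
    blob = dill.dumps({'encode': encode, 'decode': decode}, recurse=True)

    meta = dill.loads(blob)
    assert meta['decode'](meta['encode']('ab')) == 'ab'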
""" import os -import pickle +import dill import requests import numpy as np @@ -54,11 +54,11 @@ def decode(l): # save the meta information as well, to help us encode/decode later meta = { 'vocab_size': vocab_size, - 'itos': itos, - 'stoi': stoi, + 'encode': encode, + 'decode': decode, } with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f: - pickle.dump(meta, f) + dill.dump(meta, f, recurse=True) # length of dataset in characters: 1115394 # all the unique characters: diff --git a/sample.py b/sample.py index d25d6e0861..50b49de43c 100644 --- a/sample.py +++ b/sample.py @@ -2,7 +2,7 @@ Sample from a trained model """ import os -import pickle +import dill from contextlib import nullcontext import torch import tiktoken @@ -61,11 +61,8 @@ if load_meta: print(f"Loading meta from {meta_path}...") with open(meta_path, 'rb') as f: - meta = pickle.load(f) - # TODO want to make this more general to arbitrary encoder/decoder schemes - stoi, itos = meta['stoi'], meta['itos'] - encode = lambda s: [stoi[c] for c in s] - decode = lambda l: ''.join([itos[i] for i in l]) + meta = dill.load(f) + encode, decode = meta['encode'], meta['decode'] else: # ok let's assume gpt-2 encodings by default print("No meta.pkl found, assuming GPT-2 encodings...") diff --git a/train.py b/train.py index a482ab7f4e..cf3454b575 100644 --- a/train.py +++ b/train.py @@ -19,7 +19,7 @@ import os import time import math -import pickle +import dill from contextlib import nullcontext import numpy as np @@ -136,7 +136,7 @@ def get_batch(split): meta_vocab_size = None if os.path.exists(meta_path): with open(meta_path, 'rb') as f: - meta = pickle.load(f) + meta = dill.load(f) meta_vocab_size = meta['vocab_size'] print(f"found vocab_size = {meta_vocab_size} (inside {meta_path})")