Commit

Fix scripts.
knc6 committed Sep 17, 2024
1 parent bde0a6e commit ab8b8c9
Showing 5 changed files with 97 additions and 59 deletions.
3 changes: 2 additions & 1 deletion atomgpt/__init__.py
@@ -1,2 +1,3 @@
"""Version number."""
__version__ = "2024.6.8"

__version__ = "2024.9.8"
Empty file.
67 changes: 42 additions & 25 deletions atomgpt/forward_models/forward_models.py
@@ -30,6 +30,7 @@
 import sys
 import argparse
 from alignn.pretrained import get_figshare_model
+
 parser = argparse.ArgumentParser(
     description="Atomistic Generative Pre-trained Transformer."
 )
@@ -39,6 +40,7 @@
     help="Name of the config file",
 )

+
 class TrainingPropConfig(BaseSettings):
     """Training config defaults and validation."""

@@ -58,6 +60,8 @@ class TrainingPropConfig(BaseSettings):
     n_val: Optional[int] = None
     n_test: Optional[int] = None
     output_dir: str = "out_temp"
+    desc_type: str = "desc_3"
+    convert: bool = False  # raw files for false
     train_ratio: Optional[float] = None
     val_ratio: float = 0.1
     test_ratio: float = 0.1
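
The two fields added above (desc_type and convert) are read by run_atomgpt() below. A minimal sketch of a matching forward-model config.json, with illustrative values; only the keys visible in this diff are shown, the rest of TrainingPropConfig keeps its defaults:

# Sketch of a forward-model config.json; values are illustrative assumptions.
# Only desc_type and convert are new in this commit.
import json

config = {
    "id_prop_path": "id_prop.csv",  # CSV listing structure files and target values
    "output_dir": "out_temp",
    "desc_type": "desc_3",          # which text description from atoms.describe() to use
    "convert": True,                # True: build descriptions from raw POSCAR files
}
with open("config.json", "w") as f:
    json.dump(config, f, indent=2)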
@@ -274,39 +278,46 @@ def __getitem__(self, idx):
 # Example usage


-def run_atomgpt(config_file="config.json",convert=False):
+def run_atomgpt(config_file="config.json"):
     print("Running AtomGPT prop predictor.")
     run_path = os.path.abspath(config_file).split("config.json")[0]
-    print('PATH', run_path)
+    print("PATH", run_path)
     config = loadjson(config_file)
     config = TrainingPropConfig(**config)
     pprint.pprint(config)
     id_prop_path = config.id_prop_path
+    convert = config.convert
     if convert:
-        model = get_figshare_model(model_name="jv_formation_energy_peratom_alignn")
+        model = get_figshare_model(
+            model_name="jv_formation_energy_peratom_alignn"
+        )
     if ".zip" in id_prop_path:
         zp = zipfile.ZipFile(id_prop_path)
         dat = json.loads(zp.read(id_prop_path.split(".zip")[0]))
     elif ".csv" in id_prop_path:
-        with open(id_prop_path, "r") as f:
-            reader = csv.reader(f)
-            dt = [row for row in reader]
-
-        dat=[]
-        for i in tqdm(dt,total=len(dt)):
-            info={}
-            info['id']=i[0]
-            info['prop']=[float(j) for j in i[1:]] # float(i[1])
-            pth=os.path.join(run_path,info['id'])
-            if convert:
-                atoms=Atoms.from_poscar(pth)
-                lines=atoms.describe(model=model)['desc_3']
-            else:
-                with open(pth,"r") as f:
-                    lines=f.read()
-            info['desc']=lines
-            dat.append(info)
-
+        with open(id_prop_path, "r") as f:
+            reader = csv.reader(f)
+            dt = [row for row in reader]
+
+        dat = []
+        for i in tqdm(dt, total=len(dt)):
+            info = {}
+            info["id"] = i[0]
+            info["prop"] = [float(j) for j in i[1:]]  # float(i[1])
+            # pth=os.path.join(run_path,info['id'])
+            pth = os.path.join(
+                id_prop_path.split("id_prop.csv")[0], info["id"]
+            )
+            if convert:
+                atoms = Atoms.from_poscar(pth)
+                lines = atoms.describe(model=model)[config.desc_type]
+            else:
+
+                with open(pth, "r") as f:
+                    lines = f.read()
+            info["desc"] = lines
+            dat.append(info)
+
     else:
         dat = loadjson(id_prop_path)
     print("len", len(dat))
@@ -747,10 +758,16 @@ def run_atomgpt(config_file="config.json",convert=False):
     print("tot_time", tot_time)


-if __name__ == "__main__":
-    #output_dir = make_id_prop()
-    #output_dir="."
+def main():
     args = parser.parse_args(sys.argv[1:])
     run_atomgpt(config_file=args.config_name)
+
+
+if __name__ == "__main__":
+    # output_dir = make_id_prop()
+    # output_dir="."
+    # args = parser.parse_args(sys.argv[1:])
+    # run_atomgpt(config_file=args.config_name)
     # config_file="config.json"
     # )
+    main()
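
With the new main() wrapper, the forward predictor can be launched through the atomgpt_forward console script registered in setup.py or called programmatically. A minimal sketch of the latter, assuming a config.json in the working directory:

# Programmatic use of the refactored forward entry point; the config path is
# an assumed example, and run_atomgpt() reads id_prop.csv from the location
# named inside that config.
from atomgpt.forward_models.forward_models import run_atomgpt

run_atomgpt(config_file="config.json")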
72 changes: 43 additions & 29 deletions atomgpt/inverse_models/inverse_models.py
@@ -19,6 +19,7 @@
 from pydantic_settings import BaseSettings
 import sys
 import argparse
+
 parser = argparse.ArgumentParser(
     description="Atomistic Generative Pre-trained Transformer."
 )
@@ -28,6 +29,7 @@
     help="Name of the config file",
 )

+
 # Adapted from https://github.com/unslothai/unsloth
 class TrainingPropConfig(BaseSettings):
     """Training config defaults and validation."""
@@ -50,27 +52,27 @@ class TrainingPropConfig(BaseSettings):
 # d = loadjson('dft_3d_Tc_supercon.json')


-num_train = 2
-num_val = 2
-num_test = 2
-run_path = "atomgpt/examples/inverse_model/"
-id_prop_path = "id_prop.csv"
-fourbit_models = [
-    "unsloth/mistral-7b-bnb-4bit",
-    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
-    "unsloth/llama-2-7b-bnb-4bit",
-    "unsloth/llama-2-13b-bnb-4bit",
-    "unsloth/codellama-34b-bnb-4bit",
-    "unsloth/tinyllama-bnb-4bit",
-] # More models at https://huggingface.co/unsloth
+# num_train = 2
+# num_val = 2
+# num_test = 2
+# run_path = "atomgpt/examples/inverse_model/"
+# id_prop_path = "id_prop.csv"
+# fourbit_models = [
+# "unsloth/mistral-7b-bnb-4bit",
+# "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
+# "unsloth/llama-2-7b-bnb-4bit",
+# "unsloth/llama-2-13b-bnb-4bit",
+# "unsloth/codellama-34b-bnb-4bit",
+# "unsloth/tinyllama-bnb-4bit",
+# ] # More models at https://huggingface.co/unsloth

-nm = "unsloth/mistral-7b-bnb-4bit"
-nm = fourbit_models[-2]
-nm = fourbit_models[0]
+# nm = "unsloth/mistral-7b-bnb-4bit"
+# nm = fourbit_models[-2]
+# nm = fourbit_models[0]


 instruction = "Below is a description of a superconductor material."
-model_save_path = "lora_model_m"
+# model_save_path = "lora_model_m"

 alpaca_prompt1 = (
     '"""\n'
@@ -144,7 +146,7 @@ def formatting_prompts_func(examples):
     inputs = examples["input"]
     outputs = examples["output"]
     texts = []
-    EOS_TOKEN = '</s>'
+    EOS_TOKEN = "</s>"
     for instruction, input, output in zip(instructions, inputs, outputs):
         # Must add EOS_TOKEN, otherwise your generation will go on forever!
         text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
@@ -187,7 +189,7 @@ def text2atoms(response):
     return atoms


-def gen_atoms(prompt="", max_new_tokens=512,model='',tokenizer=''):
+def gen_atoms(prompt="", max_new_tokens=512, model="", tokenizer=""):
     inputs = tokenizer(
         [
             alpaca_prompt.format(
@@ -211,6 +213,8 @@ def gen_atoms(prompt="", max_new_tokens=512,model='',tokenizer=''):
         print(exp)
         pass
     return atoms
+
+
 #######################################


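The reformatted gen_atoms() signature takes the model and tokenizer explicitly rather than relying on globals. A rough standalone sketch of a call, in which the checkpoint name and prompt text are placeholders (run_atomgpt_inverse() below loads the model the same way and would normally use the fine-tuned weights):

# Illustrative standalone call of gen_atoms(); the model name and prompt are
# placeholders, not values taken from this repository.
from unsloth import FastLanguageModel
from atomgpt.inverse_models.inverse_models import gen_atoms

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/mistral-7b-bnb-4bit",
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)
atoms = gen_atoms(
    prompt="A superconductor with a transition temperature of 5 K.",
    model=model,
    tokenizer=tokenizer,
)
print(atoms)  # Atoms object parsed from the generated text
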
@@ -220,8 +224,8 @@ def run_atomgpt_inverse(config_file="config.json"):
     config = TrainingPropConfig(**config)
     pprint.pprint(config)
     id_prop_path = config.id_prop_path
-    num_train=config.num_train
-    num_test=config.num_test
+    num_train = config.num_train
+    num_test = config.num_test
     id_prop_path = os.path.join(run_path, id_prop_path)
     with open(id_prop_path, "r") as f:
         reader = csv.reader(f)
@@ -234,7 +238,8 @@ def run_atomgpt_inverse(config_file="config.json"):
         info["id"] = i[0]
         ids.append(i[0])
         info["prop"] = float(i[1]) # [float(j) for j in i[1:]] # float(i[1]
-        pth = os.path.join(run_path, info["id"])
+        # pth = os.path.join(run_path, info["id"])
+        pth = os.path.join(id_prop_path.split("id_prop.csv")[0], info["id"])
         atoms = Atoms.from_poscar(pth)
         info["atoms"] = atoms.to_dict()
         dat.append(info)
@@ -255,15 +260,17 @@ def run_atomgpt_inverse(config_file="config.json"):
     # m_test = make_alpaca_json(dataset=dft_3d, jids=test_ids, prop="Tc_supercon",include_jid=True)
     # dumpjson(data=m_val, filename="alpaca_Tc_supercon_test.json")

-    max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
+    max_seq_length = (
+        2048 # Choose any! We auto support RoPE Scaling internally!
+    )
     dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
     load_in_4bit = (
         True # Use 4bit quantization to reduce memory usage. Can be False.
     )

     # 4bit pre quantized models we support for 4x faster downloading + no OOMs.
     model, tokenizer = FastLanguageModel.from_pretrained(
-        model_name=nm, # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
+        model_name=config.model_name, # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
         max_seq_length=max_seq_length,
         dtype=dtype,
         load_in_4bit=load_in_4bit,
@@ -324,7 +331,7 @@ def run_atomgpt_inverse(config_file="config.json"):
             lr_scheduler_type="linear",
             seed=3407,
             output_dir="outputs",
-            num_train_epochs=5,
+            num_train_epochs=config.num_epochs,
             report_to="none",
         ),
     )
@@ -346,7 +353,9 @@ def run_atomgpt_inverse(config_file="config.json"):
     for i in tqdm(m_test):
         prompt = i["input"]
         print("prompt", prompt)
-        gen_mat = gen_atoms(prompt=i["input"],tokenizer=tokenizer,model=model)
+        gen_mat = gen_atoms(
+            prompt=i["input"], tokenizer=tokenizer, model=model
+        )
         target_mat = text2atoms("\n" + i["output"])
         print("target_mat", target_mat)
         print("genmat", gen_mat)
@@ -356,10 +365,15 @@ def run_atomgpt_inverse(config_file="config.json"):
         print()
     f.close()

-if __name__ == "__main__":
-    #output_dir = make_id_prop()
-    #output_dir="."
+
+def main():
     args = parser.parse_args(sys.argv[1:])
     run_atomgpt_inverse(config_file=args.config_name)
+
+
+if __name__ == "__main__":
+    # output_dir = make_id_prop()
+    # output_dir="."
     # config_file="config.json"
     # )
+    main()
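
Several values that were hard-coded at module level (the 4-bit model name, the number of training epochs, the train/test counts) now come from the config. A minimal sketch of an inverse-model config.json limited to the keys read in this diff; every value is illustrative:

# Sketch of an inverse-model config.json; values are illustrative assumptions,
# and only keys referenced in this commit are shown.
import json

config = {
    "id_prop_path": "id_prop.csv",
    "model_name": "unsloth/mistral-7b-bnb-4bit",  # passed to FastLanguageModel.from_pretrained
    "num_epochs": 5,   # replaces the hard-coded num_train_epochs=5
    "num_train": 2,
    "num_test": 2,
}
with open("config.json", "w") as f:
    json.dump(config, f, indent=2)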
14 changes: 10 additions & 4 deletions setup.py
@@ -5,7 +5,7 @@

 setuptools.setup(
     name="atomgpt",
-    version="2024.6.8",
+    version="2024.9.8",
     author="Kamal Choudhary",
     author_email="[email protected]",
     description="atomgpt",
@@ -17,13 +17,19 @@
         "pydantic_settings",
         "peft",
         "trl",
-        #"alignn",
         "triton",
         "torch",
-        "sentencepiece"
-
+        "sentencepiece",
+        "protobuf",
+        # "alignn",
     ],
     # scripts=["atomgpt/train_prop.py"],
+    entry_points={
+        "console_scripts": [
+            "atomgpt_forward=atomgpt.forward_models.forward_models:main",
+            "atomgpt_inverse=atomgpt.inverse_models.inverse_models:main",
+        ]
+    },
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/usnistgov/atomgpt",
