rhymes-ai · aria-hacker · Oct 1, 2024 · Sep 30, 2024 · Sep 30, 2024
diff --git a/README.md b/README.md
@@ -77,7 +77,7 @@ inputs = processor(text=text, images=image, return_tensors="pt")
 inputs["pixel_values"] = inputs["pixel_values"].to(model.dtype)
 inputs = {k: v.to(model.device) for k, v in inputs.items()}
 
-with torch.inference_mode():
+with torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.bfloat16):
     output = model.generate(
         **inputs,
         max_new_tokens=500,

diff --git a/aria/inference.py b/aria/inference.py
@@ -112,7 +112,7 @@ def inference(
     inputs["pixel_values"] = inputs["pixel_values"].to(model.dtype)
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
 
-    with torch.inference_mode():
+    with torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.bfloat16):
         output = model.generate(
             **inputs,
             max_new_tokens=500,

diff --git a/docs/inference.md b/docs/inference.md
@@ -38,7 +38,7 @@ inputs = processor(text=text, images=image, return_tensors="pt")
 inputs["pixel_values"] = inputs["pixel_values"].to(model.dtype)
 inputs = {k: v.to(model.device) for k, v in inputs.items()}
 
-with torch.inference_mode():
+with torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.bfloat16):
     output = model.generate(
         **inputs,
         max_new_tokens=500,

diff --git a/examples/nextqa/evaluation.py b/examples/nextqa/evaluation.py
@@ -88,7 +88,7 @@ def load_model_and_tokenizer(args):
 
 def process_batch(model, tokenizer, inputs, original_batch, prompts):
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
-    with torch.inference_mode():
+    with torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.bfloat16):
         output = model.generate(
             **inputs,
             max_new_tokens=20,

diff --git a/examples/nlvr2/evaluation.py b/examples/nlvr2/evaluation.py
@@ -90,7 +90,7 @@ def load_model_and_tokenizer(args):
 
 def process_batch(model, tokenizer, inputs, original_batch, prompts):
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
-    with torch.inference_mode():
+    with torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.bfloat16):
         output = model.generate(
             **inputs,
             max_new_tokens=50,

diff --git a/examples/refcoco/evaluation.py b/examples/refcoco/evaluation.py
@@ -92,7 +92,7 @@ def load_model_and_tokenizer(args):
 
 def process_batch(model, tokenizer, inputs, original_batch, prompts):
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
-    with torch.inference_mode():
+    with torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.bfloat16):
         output = model.generate(
             **inputs,
             max_new_tokens=50,

diff --git a/examples/refcoco/inference.py b/examples/refcoco/inference.py
@@ -87,7 +87,7 @@ def inference(
     inputs["pixel_values"] = inputs["pixel_values"].to(model.dtype)
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
 
-    with torch.inference_mode():
+    with torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.bfloat16):
         output = model.generate(
             **inputs,
             max_new_tokens=500,