diff --git a/aepsych/models/base.py b/aepsych/models/base.py
index 7a00b14b5..18ab9e939 100644
--- a/aepsych/models/base.py
+++ b/aepsych/models/base.py
@@ -10,7 +10,7 @@
 import time
 from collections.abc import Iterable
 from copy import deepcopy
-from typing import Any, Dict, List, Mapping, Optional, Protocol, Tuple, Union
+from typing import Any, Callable, Dict, List, Mapping, Optional, Protocol, Tuple, Union
 
 import gpytorch
 import numpy as np
@@ -91,7 +91,7 @@ def _get_extremum(
         extremum_type: str,
         locked_dims: Optional[Mapping[int, List[float]]],
         n_samples=1000,
-    ) -> Tuple[float, np.ndarray]:
+    ) -> Tuple[float, torch.Tensor]:
         pass
 
     def dim_grid(self, gridsize: int = 30) -> torch.Tensor:
@@ -131,14 +131,17 @@ def get_max(
         max_time: Optional[float] = None,
     ) -> Tuple[float, torch.Tensor]:
         """Return the maximum of the modeled function, subject to constraints
+
         Args:
-            locked_dims (Mapping[int, List[float]]): Dimensions to fix, so that the
-                inverse is along a slice of the full surface.
+            locked_dims (Mapping[int, List[float]], optional): Dimensions to fix, so that the
+                inverse is along a slice of the full surface. Defaults to None.
             probability_space (bool): Is y (and therefore the returned nearest_y) in
                 probability space instead of latent function space? Defaults to False.
-            n_samples int: number of coarse grid points to sample for optimization estimate.
+            n_samples (int): number of coarse grid points to sample for optimization estimate.
+            max_time (float, optional): Maximum time to spend optimizing. Defaults to None.
+
         Returns:
-            Tuple[float, np.ndarray]: Tuple containing the max and its location (argmax).
+            Tuple[float, torch.Tensor]: Tuple containing the max and its location (argmax).
         """
         locked_dims = locked_dims or {}
         _, _arg = get_extremum(
@@ -160,11 +163,13 @@ def get_min(
     ) -> Tuple[float, torch.Tensor]:
         """Return the minimum of the modeled function, subject to constraints
         Args:
-            locked_dims (Mapping[int, List[float]]): Dimensions to fix, so that the
+            locked_dims (Mapping[int, List[float]], optional): Dimensions to fix, so that the
                 inverse is along a slice of the full surface.
             probability_space (bool): Is y (and therefore the returned nearest_y) in
                 probability space instead of latent function space? Defaults to False.
-            n_samples int: number of coarse grid points to sample for optimization estimate.
+            n_samples (int): number of coarse grid points to sample for optimization estimate.
+            max_time (float, optional): Maximum time to spend optimizing. Defaults to None.
+
         Returns:
             Tuple[float, torch.Tensor]: Tuple containing the min and its location (argmin).
         """
@@ -191,12 +196,17 @@ def inv_query(
         """Query the model inverse.
         Return nearest x such that f(x) = queried y, and also return the
             value of f at that point.
+
         Args:
             y (float): Points at which to find the inverse.
-            locked_dims (Mapping[int, List[float]]): Dimensions to fix, so that the
+            locked_dims (Mapping[int, List[float]], optional): Dimensions to fix, so that the
                 inverse is along a slice of the full surface.
             probability_space (bool): Is y (and therefore the returned nearest_y) in
                 probability space instead of latent function space? Defaults to False.
+            n_samples (int): number of coarse grid points to sample for optimization estimate. Defaults to 1000.
+            max_time (float, optional): Maximum time to spend optimizing. Defaults to None.
+            weights (torch.Tensor, optional): Weights for the optimization. Defaults to None.
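+
+        Example (illustrative sketch; assumes ``model`` is a fitted AEPsych model):
+            >>> # stimulus whose detection probability is closest to 0.75
+            >>> nearest_y, nearest_x = model.inv_query(y=0.75, probability_space=True)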
+
         Returns:
             Tuple[float, torch.Tensor]: Tuple containing the value of f
             nearest to queried y and the x position of this value.
@@ -220,7 +230,7 @@ def inv_query(
 
     def get_jnd(
         self: ModelProtocol,
-        grid: Optional[Union[np.ndarray, torch.Tensor]] = None,
+        grid: Optional[torch.Tensor] = None,
         cred_level: Optional[float] = None,
         intensity_dim: int = -1,
         confsamps: int = 500,
@@ -239,20 +249,17 @@ def get_jnd(
         Both definitions are equivalent for linear psychometric functions.
 
         Args:
-            grid (Optional[np.ndarray], optional): Mesh grid over which to find the JND.
-                Defaults to a square grid of size as determined by aepsych.utils.dim_grid
+            grid (torch.Tensor, optional): Mesh grid over which to find the JND.
+                Defaults to a square grid of size as determined by aepsych.utils.dim_grid.
             cred_level (float, optional): Credible level for computing an interval.
                 Defaults to None, computing no interval.
-            intensity_dim (int, optional): Dimension over which to compute the JND.
+            intensity_dim (int): Dimension over which to compute the JND.
                 Defaults to -1.
-            confsamps (int, optional): Number of posterior samples to use for
+            confsamps (int): Number of posterior samples to use for
                 computing the credible interval. Defaults to 500.
-            method (str, optional): "taylor" or "step" method (see docstring).
+            method (str): "taylor" or "step" method (see docstring).
                 Defaults to "step".
 
-        Raises:
-            RuntimeError: for passing an unknown method.
-
         Returns:
             Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
                 either the mean JND, or a median, lower, upper tuple of the JND posterior.
@@ -316,6 +323,12 @@ def dim_grid(
         gridsize: int = 30,
         slice_dims: Optional[Mapping[int, float]] = None,
     ) -> torch.Tensor:
+        """Generate a grid based on lower, upper, and dim.
+
+        Args:
+            gridsize (int): Number of points in each dimension. Defaults to 30.
+            slice_dims (Mapping[int, float], optional): Dimensions to fix at a certain value. Defaults to None.
+        """
         return dim_grid(self.lb, self.ub, gridsize, slice_dims)
 
     def set_train_data(
@@ -325,9 +338,13 @@ def set_train_data(
         strict: bool = False,
     ):
         """
-        :param torch.Tensor inputs: The new training inputs.
-        :param torch.Tensor targets: The new training targets.
-        :param bool strict: (default False, ignored). Here for compatibility with
+        Set the training data for the model.
+
+        Args:
+            inputs (torch.Tensor, optional): The new training inputs.
+            targets (torch.Tensor, optional): The new training targets.
+            strict (bool): Default is False. Ignored, just for compatibility with
             input transformers.  TODO: actually use this arg or change input transforms
             to not require it.
         """
@@ -356,9 +373,16 @@ def _fit_mll(
         self,
         mll: MarginalLogLikelihood,
         optimizer_kwargs: Optional[Dict[str, Any]] = None,
-        optimizer=fit_gpytorch_mll_scipy,
+        optimizer: Callable = fit_gpytorch_mll_scipy,
         **kwargs,
     ) -> None:
+        """Fits the model by maximizing the marginal log likelihood.
+
+        Args:
+            mll (MarginalLogLikelihood): Marginal log likelihood object.
+            optimizer_kwargs (Dict[str, Any], optional): Keyword arguments for the optimizer.
+            optimizer (Callable): Optimizer to use. Defaults to fit_gpytorch_mll_scipy.
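+
+        Example (illustrative sketch; the ``maxfun`` option assumes the default
+        SciPy L-BFGS-B optimizer):
+            >>> mll = gpytorch.mlls.VariationalELBO(self.likelihood, self, num_data=len(train_y))
+            >>> self._fit_mll(mll, optimizer_kwargs={"options": {"maxfun": 100}})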
+ """ self.train() train_x, train_y = mll.model.train_inputs[0], mll.model.train_targets optimizer_kwargs = {} if optimizer_kwargs is None else optimizer_kwargs.copy() @@ -385,8 +409,19 @@ def _fit_mll( return res def p_below_threshold( - self, x, f_thresh - ) -> torch.Tensor: # Return a tensor instead of NumPy array + self, + x: torch.Tensor, + f_thresh: torch.Tensor + ) -> torch.Tensor: + """Compute the probability that the latent function is below a threshold. + + Args: + x (torch.Tensor): Points at which to evaluate the probability. + f_thresh (torch.Tensor): Threshold value. + + Returns: + torch.Tensor: Probability that the latent function is below the threshold. + """ f, var = self.predict(x) f_thresh = f_thresh.reshape(-1, 1) f = f.reshape(1, -1) @@ -400,11 +435,14 @@ class AEPsychModelDeviceMixin(AEPsychMixin): _train_inputs: Optional[Tuple[torch.Tensor]] _train_targets: Optional[torch.Tensor] - def set_train_data(self, inputs=None, targets=None, strict=False): - """ - :param torch.Tensor inputs: The new training inputs. - :param torch.Tensor targets: The new training targets. - :param bool strict: (default False, ignored). Here for compatibility with + def set_train_data(self, inputs: Optional[torch.Tensor] = None, targets: Optional[torch.Tensor] = None, strict: bool = False) -> None: + """Set the training data for the model. + + Args: + inputs (torch.Tensor, optional): The new training inputs X. + targets (torch.Tensor, optional): The new training targets Y. + strict (bool): Whether to strictly enforce the device of the inputs and targets. + input transformers. TODO: actually use this arg or change input transforms to not require it. """ @@ -417,12 +455,22 @@ def set_train_data(self, inputs=None, targets=None, strict=False): @property def device(self) -> torch.device: + """Get the device of the model. + + Returns: + torch.device: Device of the model. + """ # We assume all models have some parameters and all models will only use one device # notice that this has no setting, don't let users set device, use .to(). return next(self.parameters()).device @property def train_inputs(self) -> Optional[Tuple[torch.Tensor]]: + """Get the training inputs. + + Returns: + Optional[Tuple[torch.Tensor]]: Training inputs. + """ if self._train_inputs is None: return None @@ -434,6 +482,11 @@ def train_inputs(self) -> Optional[Tuple[torch.Tensor]]: @train_inputs.setter def train_inputs(self, train_inputs: Optional[Tuple[torch.Tensor]]) -> None: + """Set the training inputs. + + Args: + train_inputs (Tuple[torch.Tensor]): Training inputs. + """ if train_inputs is None: self._train_inputs = None else: @@ -446,6 +499,11 @@ def train_inputs(self, train_inputs: Optional[Tuple[torch.Tensor]]) -> None: @property def train_targets(self) -> Optional[torch.Tensor]: + """Get the training targets. + + Returns: + Optional[torch.Tensor]: Training targets. + """ if self._train_targets is None: return None @@ -456,6 +514,11 @@ def train_targets(self) -> Optional[torch.Tensor]: @train_targets.setter def train_targets(self, train_targets: Optional[torch.Tensor]) -> None: + """Set the training targets. + + Args: + train_targets (torch.Tensor, optional): Training targets. + """ if train_targets is None: self._train_targets = None else: diff --git a/aepsych/models/derivative_gp.py b/aepsych/models/derivative_gp.py index b338f5a7a..08691bd43 100644 --- a/aepsych/models/derivative_gp.py +++ b/aepsych/models/derivative_gp.py @@ -50,7 +50,7 @@ def __init__( is an observation of df/dx_i. 
             train_y (torch.Tensor): Training y points
             inducing_points (torch.Tensor): Inducing points to use
-            scales (Union[torch.Tensor, float], optional): Typical scale of each dimension
+            scales (Union[torch.Tensor, float]): Typical scale of each dimension
                 of input space (this is used to set the lengthscale prior).
                 Defaults to 1.0.
             mean_module (Mean, optional): A mean class that supports derivative
diff --git a/aepsych/models/gp_classification.py b/aepsych/models/gp_classification.py
index bdd8fdf0d..325fdd8cc 100644
--- a/aepsych/models/gp_classification.py
+++ b/aepsych/models/gp_classification.py
@@ -62,8 +62,8 @@ def __init__(
         """Initialize the GP Classification model
 
         Args:
-            lb torch.Tensor: Lower bounds of the parameters.
-            ub torch.Tensor: Upper bounds of the parameters.
+            lb (torch.Tensor): Lower bounds of the parameters.
+            ub (torch.Tensor): Upper bounds of the parameters.
             dim (int, optional): The number of dimensions in the parameter space. If None, it is inferred from the size
                 of lb and ub.
             mean_module (gpytorch.means.Mean, optional): GP mean class. Defaults to a constant with a normal prior.
@@ -140,7 +140,7 @@ def __init__(
 
     @classmethod
     def from_config(cls, config: Config) -> GPClassificationModel:
-        """Alternate constructor for GPClassification model.
+        """Alternate constructor for GPClassification model from a configuration.
 
         This is used when we recursively build a full sampling strategy
         from a configuration. TODO: document how this works in some tutorial.
@@ -196,6 +196,7 @@ def from_config(cls, config: Config) -> GPClassificationModel:
         )
 
     def _reset_hyperparameters(self) -> None:
+        """Reset hyperparameters to their initial values."""
         # warmstart_hyperparams affects hyperparams but not the variational strat,
         # so we keep the old variational strat (which is only refreshed
         # if warmstart_induc=False).
@@ -270,7 +271,7 @@ def sample(self, x: torch.Tensor, num_samples: int) -> torch.Tensor:
 
         Args:
             x (torch.Tensor): Points at which to sample.
-            num_samples (int, optional): Number of samples to return. Defaults to None.
+            num_samples (int): Number of samples to return.
             kwargs are ignored
 
         Returns:
@@ -286,7 +287,7 @@ def predict(
 
         Args:
             x (torch.Tensor): Points at which to predict from the model.
-            probability_space (bool, optional): Return outputs in units of
+            probability_space (bool): Return outputs in units of
                 response probability instead of latent function value. Defaults to False.
 
         Returns:
@@ -324,10 +325,23 @@ def predict(
         return promote_0d(fmean), promote_0d(fvar)
 
     def predict_probability(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Query the model for posterior mean and variance in probability space.
+
+        Args:
+            x (torch.Tensor): Points at which to predict from the model.
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: Posterior mean and variance at query points.
+        """
         return self.predict(x, probability_space=True)
 
     def update(self, train_x: torch.Tensor, train_y: torch.Tensor, **kwargs):
-        """Perform a warm-start update of the model from previous fit."""
+        """Perform a warm-start update of the model from previous fit.
+
+        Args:
+            train_x (torch.Tensor): Inputs.
+            train_y (torch.Tensor): Responses.
+        """
         return self.fit(
             train_x, train_y, warmstart_hyperparams=True, warmstart_induc=True, **kwargs
         )
@@ -349,6 +363,23 @@ def __init__(
         inducing_point_method: str = "auto",
         optimizer_options: Optional[Dict[str, Any]] = None,
     ) -> None:
+        """Initialize the GP Beta Regression model
+
+        Args:
+            lb (torch.Tensor): Lower bounds of the parameters.
+            ub (torch.Tensor): Upper bounds of the parameters.
+            dim (int, optional): The number of dimensions in the parameter space. If None, it is inferred from the size
+                of lb and ub. Defaults to None.
+            mean_module (gpytorch.means.Mean, optional): GP mean class. Defaults to a constant with a normal prior. Defaults to None.
+            covar_module (gpytorch.kernels.Kernel, optional): GP covariance kernel class. Defaults to scaled RBF with a
+                gamma prior.
+            likelihood (gpytorch.likelihood.Likelihood, optional): The likelihood function to use. If None defaults to
+                Beta likelihood.
+            inducing_size (int, optional): Number of inducing points. Defaults to 100.
+            max_fit_time (float, optional): The maximum amount of time, in seconds, to spend fitting the model. If None,
+                there is no limit to the fitting time. Defaults to None.
+            inducing_point_method (string): The method to use to select the inducing points. Defaults to "auto".
+        """
         if likelihood is None:
             likelihood = BetaLikelihood()
 
         super().__init__(
diff --git a/aepsych/models/gp_regression.py b/aepsych/models/gp_regression.py
index 6f954229f..a3be95cc6 100644
--- a/aepsych/models/gp_regression.py
+++ b/aepsych/models/gp_regression.py
@@ -7,7 +7,7 @@
 from __future__ import annotations
 
 from copy import deepcopy
-from typing import Any, Dict, Optional, Tuple, Union
+from typing import Any, Dict, Optional, Tuple
 
 import gpytorch
 import numpy as np
@@ -33,8 +33,8 @@ class GPRegressionModel(AEPsychModelDeviceMixin, ExactGP):
     def __init__(
         self,
-        lb: Union[np.ndarray, torch.Tensor],
-        ub: Union[np.ndarray, torch.Tensor],
+        lb: torch.Tensor,
+        ub: torch.Tensor,
         dim: Optional[int] = None,
         mean_module: Optional[gpytorch.means.Mean] = None,
         covar_module: Optional[gpytorch.kernels.Kernel] = None,
@@ -45,8 +45,8 @@ def __init__(
         """Initialize the GP regression model
 
         Args:
-            lb (Union[numpy.ndarray, torch.Tensor]): Lower bounds of the parameters.
-            ub (Union[numpy.ndarray, torch.Tensor]): Upper bounds of the parameters.
+            lb (torch.Tensor): Lower bounds of the parameters.
+            ub (torch.Tensor): Upper bounds of the parameters.
             dim (int, optional): The number of dimensions in the parameter space. If None, it is inferred from the size
                 of lb and ub.
             mean_module (gpytorch.means.Mean, optional): GP mean class. Defaults to a constant with a normal prior.
@@ -88,6 +88,14 @@ def __init__(
 
     @classmethod
     def construct_inputs(cls, config: Config) -> Dict:
+        """Construct inputs for the GP regression model from configuration.
+
+        Args:
+            config (Config): A configuration containing keys/values matching this class.
+
+        Returns:
+            Dict: Dictionary of inputs for the GP regression model.
+        """
         classname = cls.__name__
 
         lb = config.gettensor(classname, "lb")
@@ -133,7 +141,7 @@ def from_config(cls, config: Config) -> GPRegressionModel:
         from a configuration. TODO: document how this works in some tutorial.
 
         Args:
-            config (Config): A configuration containing keys/values matching this class
+            config (Config): A configuration containing keys/values matching this class.
 
         Returns:
             GPRegressionModel: Configured class instance.
@@ -159,8 +167,7 @@ def sample(
 
         Args:
             x (torch.Tensor): Points at which to sample.
-            num_samples (int, optional): Number of samples to return. Defaults to None.
-                kwargs are ignored
+            num_samples (int): Number of samples to return.
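+
+        Example (illustrative sketch; assumes a fitted ``GPRegressionModel``):
+            >>> samps = model.sample(x, num_samples=100)  # one row per posterior draw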
 
         Returns:
             torch.Tensor: Posterior samples [num_samples x dim]
@@ -168,7 +175,12 @@ def sample(
         return self.posterior(x).rsample(torch.Size([num_samples])).detach().squeeze()
 
     def update(self, train_x: torch.Tensor, train_y: torch.Tensor, **kwargs):
-        """Perform a warm-start update of the model from previous fit."""
+        """Perform a warm-start update of the model from previous fit.
+
+        Args:
+            train_x (torch.Tensor): Inputs.
+            train_y (torch.Tensor): Responses.
+        """
         return self.fit(train_x, train_y, **kwargs)
 
     def predict(self, x: torch.Tensor, **kwargs) -> Tuple[torch.Tensor, torch.Tensor]:
@@ -176,11 +188,9 @@ def predict(
 
         Args:
             x (torch.Tensor): Points at which to predict from the model.
-            probability_space (bool, optional): Return outputs in units of
-                response probability instead of latent function value. Defaults to False.
 
         Returns:
-            Tuple[np.ndarray, np.ndarray]: Posterior mean and variance at queries points.
+            Tuple[torch.Tensor, torch.Tensor]: Posterior mean and variance at query points.
         """
         with torch.no_grad():
             post = self.posterior(x)
diff --git a/aepsych/models/monotonic_projection_gp.py b/aepsych/models/monotonic_projection_gp.py
index f4c1b21f7..4e1573788 100644
--- a/aepsych/models/monotonic_projection_gp.py
+++ b/aepsych/models/monotonic_projection_gp.py
@@ -107,6 +107,27 @@ def __init__(
         inducing_point_method: str = "auto",
         optimizer_options: Optional[Dict[str, Any]] = None,
     ) -> None:
+        """Initialize the MonotonicProjectionGP model.
+
+        Args:
+            lb (torch.Tensor): Lower bounds of the parameters.
+            ub (torch.Tensor): Upper bounds of the parameters.
+            monotonic_dims (List[int]): A list of the dimensions on which monotonicity should
+                be enforced.
+            monotonic_grid_size (int): The size of the grid, s, in 1. above. Defaults to 20.
+            min_f_val (float, optional): If provided, maintains this minimum in the projection in 5. Defaults to None.
+            dim (int, optional): The number of dimensions in the parameter space. If None, it is inferred from the size
+                of lb and ub. Defaults to None.
+            mean_module (gpytorch.means.Mean, optional): GP mean class. Defaults to a constant with a normal prior. Defaults to None.
+            covar_module (gpytorch.kernels.Kernel, optional): GP covariance kernel class. Defaults to scaled RBF with a
+                gamma prior. Defaults to None.
+            likelihood (Likelihood, optional): The likelihood function to use. If None defaults to
+                Gaussian likelihood. Defaults to None.
+            inducing_size (int, optional): The number of inducing points to use. Defaults to None.
+            max_fit_time (float, optional): The maximum amount of time, in seconds, to spend fitting the model. If None,
+                there is no limit to the fitting time. Defaults to None.
+            inducing_point_method (string): The method to use to select the inducing points. Defaults to "auto".
+        """
         assert len(monotonic_dims) > 0
         self.monotonic_dims = [int(d) for d in monotonic_dims]
         self.mon_grid_size = monotonic_grid_size
@@ -130,6 +151,16 @@ def posterior(
         observation_noise: Union[bool, torch.Tensor] = False,
         **kwargs: Any,
     ) -> GPyTorchPosterior:
+        """Compute the posterior at X, projecting to enforce monotonicity.
+
+        Args:
+            X (torch.Tensor): The input points at which to compute the posterior.
+            observation_noise (Union[bool, torch.Tensor]): Whether or not to include the observation noise in the
+                posterior. Defaults to False.
+
+        Returns:
+            GPyTorchPosterior: The posterior at X.
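+
+        Example (illustrative sketch; ``X`` must carry no batch dimensions):
+            >>> post = model.posterior(X)
+            >>> mean = post.mean  # non-decreasing along the dims in ``monotonic_dims``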
+ """ # Augment X with monotonicity grid points, for each monotonic dim n, d = X.shape # Require no batch dimensions m = len(self.monotonic_dims) @@ -170,6 +201,15 @@ def posterior( return GPyTorchPosterior(mvn_proj) def sample(self, x: torch.Tensor, num_samples: int) -> torch.Tensor: + """Sample from the model. + + Args: + x (torch.Tensor): The input points at which to sample. + num_samples (int): The number of samples to draw. + + Returns: + torch.Tensor: The samples at x. + """ samps = super().sample(x=x, num_samples=num_samples) if self.min_f_val is not None: samps = samps.clamp(min=self.min_f_val) diff --git a/aepsych/models/monotonic_rejection_gp.py b/aepsych/models/monotonic_rejection_gp.py index 23ad9bc4a..b670136c7 100644 --- a/aepsych/models/monotonic_rejection_gp.py +++ b/aepsych/models/monotonic_rejection_gp.py @@ -8,7 +8,7 @@ from __future__ import annotations import warnings -from typing import Any, Dict, List, Optional, Sequence, Tuple, Union +from typing import Any, Dict, List, Optional, Sequence, Tuple import gpytorch import numpy as np @@ -29,7 +29,6 @@ from gpytorch.models import ApproximateGP from gpytorch.variational import CholeskyVariationalDistribution, VariationalStrategy from scipy.stats import norm -from torch import Tensor class MonotonicRejectionGP(AEPsychMixin, ApproximateGP): @@ -52,8 +51,8 @@ class MonotonicRejectionGP(AEPsychMixin, ApproximateGP): def __init__( self, monotonic_idxs: Sequence[int], - lb: Union[np.ndarray, torch.Tensor], - ub: Union[np.ndarray, torch.Tensor], + lb: torch.Tensor, + ub: torch.Tensor, dim: Optional[int] = None, mean_module: Optional[Mean] = None, covar_module: Optional[Kernel] = None, @@ -68,21 +67,22 @@ def __init__( """Initialize MonotonicRejectionGP. Args: - likelihood (str): Link function and likelihood. Can be 'probit-bernoulli' or - 'identity-gaussian'. - monotonic_idxs (List[int]): List of which columns of x should be given monotonicity + monotonic_idxs (Sequence[int]): List of which columns of x should be given monotonicity constraints. - fixed_prior_mean (Optional[float], optional): Fixed prior mean. If classification, should be the prior + lb (torch.Tensor): Lower bounds of the parameters. + ub (torch.Tensor): Upper bounds of the parameters. + dim (int, optional): The number of dimensions in the parameter space. If None, it is inferred from the size. + covar_module (Kernel, optional): Covariance kernel to use. Default is scaled RBF. + mean_module (Mean, optional): Mean module to use. Default is constant mean. + likelihood (str, optional): Link function and likelihood. Can be 'probit-bernoulli' or + 'identity-gaussian'. + fixed_prior_mean (float, optional): Fixed prior mean. If classification, should be the prior classification probability (not the latent function value). Defaults to None. - covar_module (Optional[Kernel], optional): Covariance kernel to use (default: scaled RBF). - mean_module (Optional[Mean], optional): Mean module to use (default: constant mean). - num_induc (int, optional): Number of inducing points for variational GP.]. Defaults to 25. - num_samples (int, optional): Number of samples for estimating posterior on preDict or + num_induc (int): Number of inducing points for variational GP.]. Defaults to 25. + num_samples (int): Number of samples for estimating posterior on preDict or acquisition function evaluation. Defaults to 250. - num_rejection_samples (int, optional): Number of samples used for rejection sampling. Defaults to 4096. 
-            acqf (MonotonicMCAcquisition, optional): Acquisition function to use for querying points. Defaults to MonotonicMCLSE.
-            objective (Optional[MCAcquisitionObjective], optional): Transformation of GP to apply before computing acquisition function. Defaults to identity transform for gaussian likelihood, probit transform for probit-bernoulli.
-            extra_acqf_args (Optional[Dict[str, object]], optional): Additional arguments to pass into the acquisition function. Defaults to None.
+            num_rejection_samples (int): Number of samples used for rejection sampling. Defaults to 4096.
+            inducing_point_method (str): Method for selecting inducing points. Defaults to "auto".
             optimizer_options (Dict[str, Any], optional): Optimizer options to pass to the SciPy optimizer during
                 fitting. Assumes we are using L-BFGS-B.
         """
@@ -152,12 +152,12 @@ def __init__(
             {"options": optimizer_options} if optimizer_options else {"options": {}}
         )
 
-    def fit(self, train_x: Tensor, train_y: Tensor, **kwargs) -> None:
+    def fit(self, train_x: torch.Tensor, train_y: torch.Tensor, **kwargs) -> None:
         """Fit the model
 
         Args:
-            train_x (Tensor): Training x points
-            train_y (Tensor): Training y points. Should be (n x 1).
+            train_x (torch.Tensor): Training x points
+            train_y (torch.Tensor): Training y points. Should be (n x 1).
         """
         self.set_train_data(train_x, train_y)
 
@@ -172,11 +172,19 @@ def fit(
 
     def _set_model(
         self,
-        train_x: Tensor,
-        train_y: Tensor,
-        model_state_dict: Optional[Dict[str, Tensor]] = None,
-        likelihood_state_dict: Optional[Dict[str, Tensor]] = None,
+        train_x: torch.Tensor,
+        train_y: torch.Tensor,
+        model_state_dict: Optional[Dict[str, torch.Tensor]] = None,
+        likelihood_state_dict: Optional[Dict[str, torch.Tensor]] = None,
     ) -> None:
+        """Sets the model with the given data and state dicts.
+
+        Args:
+            train_x (torch.Tensor): Training x points
+            train_y (torch.Tensor): Training y points. Should be (n x 1).
+            model_state_dict (Dict[str, torch.Tensor], optional): State dict for the model
+            likelihood_state_dict (Dict[str, torch.Tensor], optional): State dict for the likelihood
+        """
         train_x_aug = self._augment_with_deriv_index(train_x, 0)
         self.set_train_data(train_x_aug, train_y)
         # Set model parameters
@@ -191,16 +199,16 @@
         )
         mll = fit_gpytorch_mll(mll, optimizer_kwargs=self.optimizer_options)
 
-    def update(self, train_x: Tensor, train_y: Tensor, warmstart: bool = True) -> None:
+    def update(self, train_x: torch.Tensor, train_y: torch.Tensor, warmstart: bool = True) -> None:
         """
         Update the model with new data.
 
         Expects the full set of data, not the incremental new data.
 
         Args:
-            train_x (Tensor): Train X.
-            train_y (Tensor): Train Y. Should be (n x 1).
-            warmstart (bool): If True, warm-start model fitting with current parameters.
+            train_x (torch.Tensor): Train X.
+            train_y (torch.Tensor): Train Y. Should be (n x 1).
+            warmstart (bool): If True, warm-start model fitting with current parameters. Defaults to True.
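+
+        Example (illustrative sketch; pass the full cumulative dataset, not just
+        the newly collected trials):
+            >>> model.update(train_x=x_all, train_y=y_all, warmstart=True)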
""" if warmstart: model_state_dict = self.state_dict() @@ -217,15 +225,16 @@ def update(self, train_x: Tensor, train_y: Tensor, warmstart: bool = True) -> No def sample( self, - x: Tensor, + x: torch.Tensor, num_samples: Optional[int] = None, num_rejection_samples: Optional[int] = None, ) -> torch.Tensor: """Sample from monotonic GP Args: - x (Tensor): tensor of n points at which to sample - num_samples (int, optional): how many points to sample (default: self.num_samples) + x (torch.Tensor): tensor of n points at which to sample + num_samples (int, optional): how many points to sample. Default is self.num_samples. + num_rejection_samples (int): how many samples to use for rejection sampling. Default is self.num_rejection_samples. Returns: a Tensor of shape [n_samp, n] """ @@ -263,14 +272,16 @@ def sample( return samples_f def predict( - self, x: Tensor, probability_space: bool = False - ) -> Tuple[Tensor, Tensor]: + self, x: torch.Tensor, probability_space: bool = False + ) -> Tuple[torch.Tensor, torch.Tensor]: """Predict Args: - x: tensor of n points at which to predict. + x (torch.Tensor): tensor of n points at which to predict. + probability_space (bool): whether to return in probability space. Defaults to False. - Returns: tuple (f, var) where f is (n,) and var is (n,) + Returns: + Tuple[torch.Tensor, torch.Tensor]: Posterior mean and variance at query points. """ samples_f = self.sample(x) mean = torch.mean(samples_f, dim=0).squeeze() @@ -285,17 +296,35 @@ def predict( return mean, variance def predict_probability( - self, x: Union[torch.Tensor, np.ndarray] + self, x: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor]: + """Predict in probability space + + Args: + x (torch.Tensor): Points at which to predict. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: Posterior mean and variance at query points. + """ return self.predict(x, probability_space=True) - def _augment_with_deriv_index(self, x: Tensor, indx) -> Tensor: + def _augment_with_deriv_index(self, x: torch.Tensor, indx: int) -> torch.Tensor: + """Augment input with derivative index + + Args: + x (torch.Tensor): Input tensor + indx (int): Derivative index + + Returns: + torch.Tensor: Augmented tensor + """ return torch.cat( (x, indx * torch.ones(x.shape[0], 1)), dim=1, ) - def _get_deriv_constraint_points(self) -> Tensor: + def _get_deriv_constraint_points(self) -> torch.Tensor: + """Get derivative constraint points""" deriv_cp = torch.tensor([]) for i in self.monotonic_idxs: induc_i = self._augment_with_deriv_index(self.inducing_points, i + 1) @@ -304,6 +333,14 @@ def _get_deriv_constraint_points(self) -> Tensor: @classmethod def from_config(cls, config: Config) -> MonotonicRejectionGP: + """ Alternate constructor for MonotonicRejectionGP + + Args: + config (Config): a configuration containing keys/values matching this class + + Returns: + MonotonicRejectionGP: configured class instance + """ classname = cls.__name__ num_induc = config.gettensor(classname, "num_induc", fallback=25) num_samples = config.gettensor(classname, "num_samples", fallback=250) diff --git a/aepsych/models/multitask_regression.py b/aepsych/models/multitask_regression.py index aab0b396c..5e2b0faa0 100644 --- a/aepsych/models/multitask_regression.py +++ b/aepsych/models/multitask_regression.py @@ -46,13 +46,13 @@ def __init__( """Initialize multitask GPR model. Args: - num_outputs (int, optional): Number of tasks (outputs). Defaults to 2. - rank (int, optional): Rank of cross-task covariance. Lower rank is a simpler model. 
+            num_outputs (int): Number of tasks (outputs). Defaults to 2.
+            rank (int): Rank of cross-task covariance. Lower rank is a simpler model.
                 Should be less than or equal to num_outputs. Defaults to 1.
-            mean_module (Optional[gpytorch.means.Mean], optional): GP mean. Defaults to a constant mean.
-            covar_module (Optional[gpytorch.kernels.Kernel], optional): GP kernel module.
+            mean_module (gpytorch.means.Mean, optional): GP mean. Defaults to a constant mean.
+            covar_module (gpytorch.kernels.Kernel, optional): GP kernel module.
                 Defaults to scaled RBF kernel.
-            likelihood (Optional[gpytorch.likelihoods.Likelihood], optional): Likelihood
+            likelihood (gpytorch.likelihoods.Likelihood, optional): Likelihood
                 (should be a multitask-compatible likelihood). Defaults to multitask Gaussian likelihood.
         """
         self._num_outputs = num_outputs
@@ -79,12 +79,25 @@ def __init__(
 
     def forward(
         self, x: torch.Tensor
     ) -> gpytorch.distributions.MultitaskMultivariateNormal:
+        """ Evaluate GP.
+
+        Args:
+            x (torch.Tensor): Tensor of points at which GP should be evaluated.
+
+        Returns:
+            gpytorch.distributions.MultitaskMultivariateNormal: Distribution object
+                holding the mean and covariance at x."""
         mean_x = self.mean_module(x)
         covar_x = self.covar_module(x)
         return gpytorch.distributions.MultitaskMultivariateNormal(mean_x, covar_x)
 
     @classmethod
     def construct_inputs(cls, config: Config):
+        """Construct inputs for the Multitask GPR model from configuration.
+
+        Args:
+            config (Config): A configuration containing keys/values matching this class.
+        """
         classname = cls.__name__
         args = super().construct_inputs(config)
         args["num_outputs"] = config.getint(classname, "num_outputs", fallback=2)
@@ -97,7 +110,6 @@ class IndependentMultitaskGPRModel(GPRegressionModel):
     fitting a batch of independent GPRegression models. It wraps the GPyTorch tutorial here
     https://docs.gpytorch.ai/en/stable/examples/03_Multitask_Exact_GPs/Batch_Independent_Multioutput_GP.html
     with AEPsych API and convenience fitting / prediction methods.
-
     """
 
     _num_outputs = 1
@@ -117,11 +129,11 @@ def __init__(
         """Initialize independent multitask GPR model.
 
         Args:
-            num_outputs (int, optional): Number of tasks (outputs). Defaults to 2.
-            mean_module (Optional[gpytorch.means.Mean], optional): GP mean. Defaults to a constant mean.
-            covar_module (Optional[gpytorch.kernels.Kernel], optional): GP kernel module.
+            num_outputs (int): Number of tasks (outputs). Defaults to 2.
+            mean_module (gpytorch.means.Mean, optional): GP mean. Defaults to a constant mean.
+            covar_module (gpytorch.kernels.Kernel, optional): GP kernel module.
                 Defaults to scaled RBF kernel.
-            likelihood (Optional[gpytorch.likelihoods.Likelihood], optional): Likelihood
+            likelihood (gpytorch.likelihoods.Likelihood, optional): Likelihood
                 (should be a multitask-compatible likelihood). Defaults to multitask Gaussian likelihood.
         """
@@ -152,6 +164,15 @@ def __init__(
 
     def forward(
         self, x: torch.Tensor
     ) -> gpytorch.distributions.MultitaskMultivariateNormal:
+        """ Evaluate GP.
+
+        Args:
+            x (torch.Tensor): Tensor of points at which GP should be evaluated.
+
+        Returns:
+            gpytorch.distributions.MultitaskMultivariateNormal: Distribution object
+                holding the mean and covariance at x.
+        """
         base_mvn = super().forward(x)  # do transforms
         return gpytorch.distributions.MultitaskMultivariateNormal.from_batch_mvn(
             base_mvn
         )
 
     @classmethod
     def get_config_args(cls, config: Config) -> Dict[str, Any]:
+        """Get configuration arguments for the model.
+
+        Args:
+            config (Config): A configuration containing keys/values matching this class.
+
+        Returns:
+            Dict[str, Any]: Dictionary of configuration arguments.
+        """
         classname = cls.__name__
         args = super().get_config_args(config)
         args["num_outputs"] = config.getint(classname, "num_outputs", fallback=2)
diff --git a/aepsych/models/ordinal_gp.py b/aepsych/models/ordinal_gp.py
index c33ed60c2..1ccdc532f 100644
--- a/aepsych/models/ordinal_gp.py
+++ b/aepsych/models/ordinal_gp.py
@@ -5,8 +5,6 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Optional, Union
-
 import gpytorch
 import torch
 from aepsych.likelihoods import OrdinalLikelihood
@@ -27,6 +25,12 @@ class OrdinalGPModel(GPClassificationModel):
     outcome_type = "ordinal"
 
     def __init__(self, likelihood=None, *args, **kwargs):
+        """Initialize the OrdinalGPModel
+
+        Args:
+            likelihood (Likelihood): The likelihood function to use. If None defaults to
+                Ordinal likelihood.
+        """
         covar_module = kwargs.pop("covar_module", None)
         dim = kwargs.get("dim")
         if covar_module is None:
@@ -52,11 +56,28 @@ def __init__(self, likelihood=None, *args, **kwargs):
             **kwargs,
         )
 
     def predict_probs(self, xgrid: torch.Tensor) -> torch.Tensor:
+        """Predict probabilities of each ordinal level at xgrid
+
+        Args:
+            xgrid (torch.Tensor): Tensor of input points to predict at
+
+        Returns:
+            torch.Tensor: Tensor of probabilities of each ordinal level at xgrid
+        """
         fmean, fvar = self.predict(xgrid)
         return self.calculate_probs(fmean, fvar)
 
     def calculate_probs(self, fmean: torch.Tensor, fvar: torch.Tensor) -> torch.Tensor:
+        """Calculate probabilities of each ordinal level given a mean and variance
+
+        Args:
+            fmean (torch.Tensor): Mean of the latent function
+            fvar (torch.Tensor): Variance of the latent function
+
+        Returns:
+            torch.Tensor: Tensor of probabilities of each ordinal level
+        """
         fsd = torch.sqrt(1 + fvar)
         probs = torch.zeros(*fmean.size(), self.likelihood.n_levels)
diff --git a/aepsych/models/pairwise_probit.py b/aepsych/models/pairwise_probit.py
index ee5ce0d3c..3cb589417 100644
--- a/aepsych/models/pairwise_probit.py
+++ b/aepsych/models/pairwise_probit.py
@@ -30,9 +30,18 @@ class PairwiseProbitModel(PairwiseGP, AEPsychMixin):
     def _pairs_to_comparisons(
         self, x: torch.Tensor, y: torch.Tensor
     ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Takes x, y structured as pairs and judgments and
-        returns pairs and comparisons as PairwiseGP requires
+        """Convert pairs of points and their judgements to comparisons.
+
+        Args:
+            x (torch.Tensor): Tensor of shape (n, d, 2) where n is the number of pairs and d is the dimensionality of the
+                parameter space.
+            y (torch.Tensor): Tensor of shape (n,) where n is the number of pairs. Each element is 0 if the first point
+                in the pair is preferred, and 1 if the second point is preferred.
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: A tuple of tensors. The first tensor is of shape (n, d) and contains the
+                unique points in the pairs. The second tensor is of shape (n, 2) and contains the indices of the unique
+                points in the first tensor that correspond to the points in the pairs.
         """
         # This needs to take a unique over the feature dim by flattening
         # over pairs but not instances/batches.  This is actually tensor
@@ -65,6 +74,17 @@ def __init__(
         max_fit_time: Optional[float] = None,
         optimizer_options: Optional[Dict[str, Any]] = None,
     ) -> None:
+        """Initialize the PairwiseProbitModel
+
+        Args:
+            lb (torch.Tensor): Lower bounds of the parameters.
+            ub (torch.Tensor): Upper bounds of the parameters.
+            dim (int, optional): The number of dimensions in the parameter space. If None, it is inferred from the size
+                of lb and ub. Defaults to None.
+            covar_module (gpytorch.kernels.Kernel, optional): GP covariance kernel class. Defaults to scaled RBF with a
+                gamma prior. Defaults to None.
+            max_fit_time (float, optional): The maximum amount of time, in seconds, to spend fitting the model. Defaults to None.
+        """
         self.lb, self.ub, dim = _process_bounds(lb, ub, dim)
 
         self.max_fit_time = max_fit_time
@@ -104,12 +124,20 @@ def fit(
         optimizer_kwargs: Optional[Dict[str, Any]] = None,
         **kwargs,
     ) -> None:
+        """Fit the model to the training data.
+
+        Args:
+            train_x (torch.Tensor): Training x points.
+            train_y (torch.Tensor): Training y points.
+            optimizer_kwargs (Dict[str, Any], optional): Keyword arguments to pass to the optimizer. Defaults to None.
+        """
         if optimizer_kwargs is not None:
             if not "optimizer_kwargs" in optimizer_kwargs:
                 optimizer_kwargs = optimizer_kwargs.copy()
                 optimizer_kwargs.update(self.optimizer_options)
         else:
             optimizer_kwargs = {"options": self.optimizer_options}
+
         self.train()
         mll = PairwiseLaplaceMarginalLogLikelihood(self.likelihood, self)
         datapoints, comparisons = self._pairs_to_comparisons(train_x, train_y)
@@ -138,7 +166,13 @@ def fit(
     def update(
         self, train_x: torch.Tensor, train_y: torch.Tensor, warmstart: bool = True
     ) -> None:
-        """Perform a warm-start update of the model from previous fit."""
+        """Perform a warm-start update of the model from previous fit.
+
+        Args:
+            train_x (torch.Tensor): Train X.
+            train_y (torch.Tensor): Train Y.
+            warmstart (bool): If True, warm-start model fitting with current parameters. Defaults to True.
+        """
         self.fit(train_x, train_y)
 
     def predict(
@@ -148,6 +182,17 @@ def predict(
         num_samples: int = 1000,
         rereference: str = "x_min",
     ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Query the model for posterior mean and variance.
+
+        Args:
+            x (torch.Tensor): Points at which to predict from the model.
+            probability_space (bool): Return outputs in units of response probability instead of latent function value. Defaults to False.
+            num_samples (int): Number of samples to return. Defaults to 1000.
+            rereference (str): How to sample. Options are "x_min", "x_max", "f_min", "f_max". Defaults to "x_min".
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: Posterior mean and variance at query points.
+        """
         if rereference is not None:
             samps = self.sample(x, num_samples, rereference)
             fmean, fvar = samps.mean(0).squeeze(), samps.var(0).squeeze()
@@ -170,6 +215,17 @@ def predict_probability(
         num_samples: int = 1000,
         rereference: str = "x_min",
     ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Query the model for posterior mean and variance in probability space.
+
+        Args:
+            x (torch.Tensor): Points at which to predict from the model.
+            num_samples (int): Number of samples to return. Defaults to 1000.
+            rereference (str): How to sample. Options are "x_min", "x_max", "f_min", "f_max". Defaults to "x_min".
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: Posterior mean and variance at query points.
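+
+        Example (illustrative sketch; assumes a fitted ``PairwiseProbitModel``):
+            >>> pmean, pvar = model.predict_probability(x_grid)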
+ """ return self.predict( x, probability_space=True, num_samples=num_samples, rereference=rereference ) @@ -177,6 +233,16 @@ def predict_probability( def sample( self, x: torch.Tensor, num_samples: int, rereference: str = "x_min" ) -> torch.Tensor: + """Sample from the model model posterior. + + Args: + x (torch.Tensor): Points at which to sample. + num_samples (int): Number of samples to return. + rereference (str): How to sample. Options are "x_min", "x_max", "f_min", "f_max". Defaults to "x_min". + + Returns: + torch.Tensor: Posterior samples [num_samples x dim] + """ if len(x.shape) < 2: x = x.reshape(-1, 1) if rereference is None: @@ -204,7 +270,15 @@ def sample( return -samps + samps_ref @classmethod - def from_config(cls, config: Config) -> "PairwiseProbitModel": + def from_config(cls, config: Config) -> 'PairwiseProbitModel': + """Initialize the model from a config object. + + Args: + config (Config): a configuration containing keys/values matching this class + + Returns: + PairwiseProbitModel: Configured class instance. + """ classname = cls.__name__ mean_covar_factory = config.getobj( diff --git a/aepsych/models/semi_p.py b/aepsych/models/semi_p.py index 2d5f20f00..310569cf0 100644 --- a/aepsych/models/semi_p.py +++ b/aepsych/models/semi_p.py @@ -8,7 +8,7 @@ from __future__ import annotations from copy import deepcopy -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Dict, Optional, Tuple import gpytorch import numpy as np @@ -28,7 +28,6 @@ from gpytorch.likelihoods import BernoulliLikelihood, Likelihood from gpytorch.means import ConstantMean, ZeroMean from gpytorch.priors import GammaPrior -from torch import Tensor from torch.distributions import Normal # TODO: Implement a covar factory and analytic method for getting the lse @@ -46,6 +45,16 @@ def _hadamard_mvn_approx( MVN approximation to the hadamard product of GPs (from the SemiP paper, extending the zero-mean results in https://mathoverflow.net/questions/293955/normal-approximation-to-the-pointwise-hadamard-schur-product-of-two-multivariat) + + Args: + x_intensity (torch.Tensor): The intensity dimension + slope_mean (torch.Tensor): The mean of the slope GP + slope_cov (torch.Tensor): The covariance of the slope GP + offset_mean (torch.Tensor): The mean of the offset GP + offset_cov (torch.Tensor): The covariance of the offset GP + + Returns: + Tuple[torch.Tensor, torch.Tensor]: The mean and covariance of the approximated MVN """ offset_mean = offset_mean + x_intensity @@ -65,6 +74,14 @@ def _hadamard_mvn_approx( def semi_p_posterior_transform(posterior: GPyTorchPosterior) -> GPyTorchPosterior: + """Transform a posterior from a SemiP model to a Hadamard model. + + Args: + posterior (GPyTorchPosterior): The posterior to transform + + Returns: + GPyTorchPosterior: The transformed posterior. + """ batch_mean = posterior.mvn.mean batch_cov = posterior.mvn.covariance_matrix offset_mean = batch_mean[..., 0, :] @@ -90,6 +107,14 @@ def __init__( likelihood: LinearBernoulliLikelihood, Xi: torch.Tensor, ) -> None: + """Initialize a SemiPPosterior object. 
+
+        Args:
+            mvn (MultivariateNormal): The MVN object to use
+            likelihood (LinearBernoulliLikelihood): The likelihood object
+            Xi (torch.Tensor): The intensity dimension
+        """
+
         super().__init__(distribution=mvn)
         self.likelihood = likelihood
         self.Xi = Xi
@@ -97,12 +122,19 @@ def __init__(
     def rsample_from_base_samples(
         self,
         sample_shape: torch.Size,
-        base_samples: Tensor,
-    ) -> Tensor:
-        r"""Sample from the posterior (with gradients) using base samples.
+        base_samples: torch.Tensor,
+    ) -> torch.Tensor:
+        """Sample from the posterior (with gradients) using base samples.
 
         This is intended to be used with a sampler that produces the corresponding base
         samples, and enables acquisition optimization via Sample Average Approximation.
+
+        Args:
+            sample_shape (torch.Size): The desired shape of the samples
+            base_samples (torch.Tensor): The base samples
+
+        Returns:
+            torch.Tensor: The sampled values from the posterior distribution
         """
         return (
             super()
@@ -117,6 +149,15 @@ def rsample(
         sample_shape: Optional[torch.Size] = None,
         base_samples: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
+        """Sample from the posterior distribution using the reparameterization trick
+
+        Args:
+            sample_shape (torch.Size, optional): The desired shape of the samples. Defaults to None.
+            base_samples (torch.Tensor, optional): The base samples. Defaults to None.
+
+        Returns:
+            torch.Tensor: The sampled values from the posterior distribution.
+        """
         if base_samples is None:
             samps_ = super().rsample(sample_shape=sample_shape)
         else:
@@ -135,6 +176,15 @@ def sample_p(
         sample_shape: Optional[torch.Size] = None,
         base_samples: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
+        """Sample from the likelihood distribution of the modeled function.
+
+        Args:
+            sample_shape (torch.Size, optional): The desired shape of the samples. Defaults to None.
+            base_samples (torch.Tensor, optional): The base samples. Defaults to None.
+
+        Returns:
+            torch.Tensor: The sampled values from the likelihood distribution.
+        """
         kcsamps = self.rsample(sample_shape=sample_shape, base_samples=base_samples)
         return self.likelihood.p(function_samples=kcsamps, Xi=self.Xi).squeeze(-1)
 
@@ -143,6 +193,16 @@ def sample_f(
         sample_shape: Optional[torch.Size] = None,
         base_samples: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
+        """Sample from the function values of the modeled distribution.
+
+        Args:
+            sample_shape (torch.Size, optional): The desired shape of the samples. Defaults to None.
+            base_samples (torch.Tensor, optional): The base samples. Defaults to None.
+
+        Returns:
+            torch.Tensor: The sampled function values from the likelihood.
+        """
+
         kcsamps = self.rsample(sample_shape=sample_shape, base_samples=base_samples)
         return self.likelihood.f(function_samples=kcsamps, Xi=self.Xi).squeeze(-1)
 
@@ -152,6 +212,17 @@ def sample_thresholds(
         self,
         threshold_level: float,
         sample_shape: Optional[torch.Size] = None,
         base_samples: Optional[torch.Tensor] = None,
     ) -> SemiPThresholdObjective:
+        """Sample the thresholds based on the given threshold level.
+
+        Args:
+            threshold_level (float): The target threshold level for sampling.
+            sample_shape (torch.Size, optional): The desired shape of the samples. Defaults to None.
+            base_samples (torch.Tensor, optional): The base samples. Defaults to None.
+
+        Returns:
+            SemiPThresholdObjective: The sampled thresholds based on the threshold level.
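+
+        Example (illustrative sketch; draws 100 posterior samples of the
+        75%-performance threshold):
+            >>> thresholds = post.sample_thresholds(
+            ...     threshold_level=0.75, sample_shape=torch.Size([100])
+            ... )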
+ """ + fsamps = self.rsample(sample_shape=sample_shape, base_samples=base_samples) return SemiPThresholdObjective( likelihood=self.likelihood, target=threshold_level @@ -178,8 +249,8 @@ class SemiParametricGPModel(GPClassificationModel): def __init__( self, - lb: Union[np.ndarray, torch.Tensor], - ub: Union[np.ndarray, torch.Tensor], + lb: torch.Tensor, + ub: torch.Tensor, dim: Optional[int] = None, stim_dim: int = 0, mean_module: Optional[gpytorch.means.Mean] = None, @@ -194,18 +265,18 @@ def __init__( """ Initialize SemiParametricGP. Args: - Args: - lb (Union[numpy.ndarray, torch.Tensor]): Lower bounds of the parameters. - ub (Union[numpy.ndarray, torch.Tensor]): Upper bounds of the parameters. + lb (torch.Tensor): Lower bounds of the parameters. + ub (torch.Tensor): Upper bounds of the parameters. dim (int, optional): The number of dimensions in the parameter space. If None, it is inferred from the size - of lb and ub. + of lb and ub. Defaults to None. stim_dim (int): Index of the intensity (monotonic) dimension. Defaults to 0. mean_module (gpytorch.means.Mean, optional): GP mean class. Defaults to a constant with a normal prior. covar_module (gpytorch.kernels.Kernel, optional): GP covariance kernel class. Defaults to scaled RBF with a gamma prior. likelihood (gpytorch.likelihood.Likelihood, optional): The likelihood function to use. If None defaults to linear-Bernouli likelihood with probit link. - inducing_size (int): Number of inducing points. Defaults to 99. + slope_mean (float): The mean of the slope. Defaults to 2. + inducing_size (int, optional): Number of inducing points. Defaults to 99. max_fit_time (float, optional): The maximum amount of time, in seconds, to spend fitting the model. If None, there is no limit to the fitting time. inducing_point_method (string): The method to use to select the inducing points. Defaults to "auto". @@ -346,13 +417,15 @@ def sample( self, x: torch.Tensor, num_samples: int, - probability_space=False, + probability_space: bool = False, ) -> torch.Tensor: """Sample from underlying model. Args: - x ((n x d) torch.Tensor): Points at which to sample. - num_samples (int, optional): Number of samples to return. Defaults to None. + + x (torch.Tensor): `n x d` Points at which to sample. + num_samples (int): Number of samples to return. Defaults to None. + probability_space (bool): Whether to sample from the probability space (True) or the latent function. Defaults to False. kwargs are ignored Returns: @@ -374,7 +447,7 @@ def predict( Args: x (torch.Tensor): Points at which to predict from the model. - probability_space (bool, optional): Return outputs in units of + probability_space (bool): Return outputs in units of response probability instead of latent function value. Defaults to False. Returns: @@ -390,6 +463,15 @@ def predict( def posterior( self, X: torch.Tensor, posterior_transform: Optional[PosteriorTransform] = None ) -> SemiPPosterior: + """Get the posterior distribution at the given points. + + Args: + X (torch.Tensor): Points at which to evaluate the posterior. + posterior_transform (PosteriorTransform, optional): A transform to apply to the posterior. Defaults to None. + + Returns: + SemiPPosterior: The posterior distribution at the given points. + """ # Assume x is (b) x n x d if X.ndim > 3: raise ValueError @@ -452,8 +534,8 @@ def __init__( """ Initialize HadamardSemiPModel. Args: - lb (Union[numpy.ndarray, torch.Tensor]): Lower bounds of the parameters. - ub (Union[numpy.ndarray, torch.Tensor]): Upper bounds of the parameters. 
+            lb (torch.Tensor): Lower bounds of the parameters.
+            ub (torch.Tensor): Upper bounds of the parameters.
             dim (int, optional): The number of dimensions in the parameter space. If None, it is inferred from the size
                 of lb and ub.
             stim_dim (int): Index of the intensity (monotonic) dimension. Defaults to 0.
@@ -462,7 +544,8 @@ def __init__(
             offset_mean_module (gpytorch.means.Mean, optional): Mean module to use (default: constant mean) for offset.
             offset_covar_module (gpytorch.kernels.Kernel, optional): Covariance kernel to use (default: scaled RBF) for offset.
             likelihood (gpytorch.likelihood.Likelihood, optional): defaults to bernoulli with logistic input and a floor of .5
-            inducing_size (int): Number of inducing points. Defaults to 99.
+            slope_mean (float): The mean of the slope. Defaults to 2.
+            inducing_size (int, optional): Number of inducing points. Defaults to 99.
             max_fit_time (float, optional): The maximum amount of time, in seconds, to spend fitting the model. If None,
                 there is no limit to the fitting time.
             inducing_point_method (string): The method to use to select the inducing points. Defaults to "auto".
@@ -529,7 +612,7 @@ def __init__(
         self._fresh_likelihood_dict = deepcopy(self.likelihood.state_dict())
 
     def forward(self, x: torch.Tensor) -> MultivariateNormal:
-        """Forward pass for semip GP.
+        """Forward pass for HadamardSemiPModel GP.
 
         generates a k(c + x[:,stim_dim]) = kc + kx[:,stim_dim]
         mvn object where k and c are slope and offset GPs and x[:,stim_dim] are the intensity stimulus (x)
@@ -632,7 +715,7 @@ def predict(
 
         Args:
             x (torch.Tensor): Points at which to predict from the model.
-            probability_space (bool, optional): Return outputs in units of
+            probability_space (bool): Return outputs in units of
                 response probability instead of latent function value. Defaults to False.
 
         Returns:
diff --git a/aepsych/models/utils.py b/aepsych/models/utils.py
index 25aa21f0a..ca9559ca6 100644
--- a/aepsych/models/utils.py
+++ b/aepsych/models/utils.py
@@ -27,7 +27,6 @@
 from scipy.cluster.vq import kmeans2
 from scipy.special import owens_t
 from scipy.stats import norm
-from torch import Tensor
 from torch.distributions import Normal
 
 
@@ -46,6 +45,14 @@ def compute_p_quantile(
     A 95% CI for p can be computed as
     p_l = compute_p_quantile(f_mean, f_std, 0.025)
     p_u = compute_p_quantile(f_mean, f_std, 0.975)
+
+    Args:
+        f_mean (torch.Tensor): The mean of the latent function.
+        f_std (torch.Tensor): The standard deviation of the latent function.
+        alpha (Union[torch.Tensor, float]): The quantile to compute.
+
+    Returns:
+        torch.Tensor: The quantile of p.
     """
     norm = torch.distributions.Normal(0, 1)
     alpha = torch.tensor(alpha, dtype=f_mean.dtype)
@@ -56,9 +63,22 @@ def select_inducing_points(
     inducing_size: int,
     covar_module: Kernel = None,
    X: Optional[torch.Tensor] = None,
-    bounds: Optional[Union[torch.Tensor, np.ndarray]] = None,
+    bounds: Optional[torch.Tensor] = None,
     method: str = "auto",
 ) -> torch.Tensor:
+    """Select inducing points for GP model
+
+    Args:
+        inducing_size (int): Number of inducing points to select.
+        covar_module (Kernel): The kernel module to use for inducing point selection.
+        X (torch.Tensor, optional): The training data.
+        bounds (torch.Tensor, optional): The bounds of the input space.
+        method (str): The method to use for inducing point selection. One of
+            "pivoted_chol", "kmeans++", "auto", or "sobol".
+
+    Returns:
+        torch.Tensor: The selected inducing points.
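+
+    Example (illustrative sketch; ``train_x`` and ``kernel`` are assumed to be
+    defined elsewhere):
+        >>> points = select_inducing_points(
+        ...     inducing_size=100, covar_module=kernel, X=train_x, method="auto"
+        ... )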
+ """ with torch.no_grad(): assert ( method @@ -111,6 +131,15 @@ def select_inducing_points( def get_probability_space( likelihood: Likelihood, posterior: GPyTorchPosterior ) -> Tuple[torch.Tensor, torch.Tensor]: + """Get the mean and variance of the probability space for a given posterior + + Args: + likelihood (Likelihood): The likelihood function. + posterior (GPyTorchPosterior): The posterior to transform. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: The mean and variance of the probability space. + """ fmean = posterior.mean.squeeze() fvar = posterior.variance.squeeze() if isinstance(likelihood, BernoulliLikelihood): @@ -149,13 +178,15 @@ def get_extremum( """Return the extremum (min or max) of the modeled function Args: extremum_type (str): Type of extremum (currently 'min' or 'max'. - bounds (tensor): Lower and upper bounds of the search space. - locked_dims (Mapping[int, List[float]]): Dimensions to fix, so that the + bounds (torch.Tensor): Lower and upper bounds of the search space. + locked_dims (Mapping[int, List[float]], optional): Dimensions to fix, so that the inverse is along a slice of the full surface. n_samples (int): number of coarse grid points to sample for optimization estimate. - max_time (float): Maximum amount of time in seconds to spend optimizing. + posterior_transform (PosteriorTransform, optional): Posterior transform to apply to the model. + max_time (float, optional): Maximum amount of time in seconds to spend optimizing. + weights (torch.Tensor, optional): Weights to apply to the target value. Defaults to None. Returns: - Tuple[float, np.ndarray]: Tuple containing the min and its location (argmin). + Tuple[float, torch.Tensor]: Tuple containing the min and its location (argmin). """ locked_dims = locked_dims or {} @@ -202,17 +233,18 @@ def inv_query( Return nearest x such that f(x) = queried y, and also return the value of f at that point. Args: - y (float): Points at which to find the inverse. - bounds (tensor): Lower and upper bounds of the search space. - locked_dims (Mapping[int, List[float]]): Dimensions to fix, so that the - inverse is along a slice of the full surface. + y (Union[float, torch.Tensor]): Points at which to find the inverse. + bounds (torch.Tensor): Lower and upper bounds of the search space. + locked_dims (Mapping[int, List[float]], optional): Dimensions to fix, so that the + inverse is along a slice of the full surface. Defaults to None. probability_space (bool): Is y (and therefore the returned nearest_y) in probability space instead of latent function space? Defaults to False. - n_samples (int): number of coarse grid points to sample for optimization estimate. - max_time float: Maximum amount of time in seconds to spend optimizing. + n_samples (int): number of coarse grid points to sample for optimization estimate. Defaults to 1000. + max_time (float, optional): Maximum amount of time in seconds to spend optimizing. Defaults to None. + weights (torch.Tensor, optional): Weights to apply to the target value. Defaults to None. Returns: - Tuple[float, np.ndarray]: Tuple containing the value of f + Tuple[float, torch.Tensor]: Tuple containing the value of f nearest to queried y and the x position of this value. 
""" locked_dims = locked_dims or {} @@ -241,16 +273,39 @@ def inv_query( class TargetDistancePosteriorTransform(PosteriorTransform): def __init__( - self, target_value: Union[float, Tensor], weights: Optional[Tensor] = None + self, target_value: Union[float, torch.Tensor], weights: Optional[torch.Tensor] = None ) -> None: + """Initialize the TargetDistancePosteriorTransform + + Args: + target_value (Union[float, torch.Tensor]): The target value to transform the posterior to. + weights (torch.Tensor, optional): Weights to apply to the target value. Defaults to None. + """ super().__init__() self.target_value = target_value self.weights = weights - def evaluate(self, Y: Tensor) -> Tensor: + def evaluate(self, Y: torch.Tensor) -> torch.Tensor: + """Evaluate the squared distance from the target value. + + Args: + Y (torch.Tensor): The tensor to evaluate. + + Returns: + torch.Tensor: The squared distance from the target value. + """ return (Y - self.target_value) ** 2 - def _forward(self, mean: Tensor, var: Tensor) -> GPyTorchPosterior: + def _forward(self, mean: torch.Tensor, var: torch.Tensor) -> GPyTorchPosterior: + """Transform the posterior mean and variance based on the target value. + + Args: + mean (torch.Tensor): The posterior mean. + var (torch.Tensor): The posterior variance. + + Returns: + GPyTorchPosterior: The transformed posterior. + """ q, _ = mean.shape[-2:] batch_shape = mean.shape[:-2] @@ -265,6 +320,14 @@ def _forward(self, mean: Tensor, var: Tensor) -> GPyTorchPosterior: return GPyTorchPosterior(mvn) def forward(self, posterior: GPyTorchPosterior) -> GPyTorchPosterior: + """Transform the given posterior distribution to reflect the target distance. + + Args: + posterior (GPyTorchPosterior): The posterior to transform. + + Returns: + GPyTorchPosterior: The transformed posterior. + """ mean = posterior.mean var = posterior.variance return self._forward(mean, var) @@ -273,6 +336,14 @@ def forward(self, posterior: GPyTorchPosterior) -> GPyTorchPosterior: # Requires botorch approximate model to accept posterior transforms class TargetProbabilityDistancePosteriorTransform(TargetDistancePosteriorTransform): def forward(self, posterior: GPyTorchPosterior) -> GPyTorchPosterior: + """Transform the given posterior distribution to reflect the target probability distance. + + Args: + posterior (GPyTorchPosterior): The posterior to transform. + + Returns: + GPyTorchPosterior: The transformed posterior distribution reflecting the target probability distance. + """ pmean, pvar = get_probability_space(BernoulliLikelihood(), posterior) pmean = pmean.unsqueeze(-1).unsqueeze(-1) pvar = pvar.unsqueeze(-1).unsqueeze(-1)