Change BoxForceReciprocalGPU implementation to eliminate roundoff
LSchwiebert committed Aug 27, 2024
1 parent ff3f67a commit 7d5721a
Showing 3 changed files with 91 additions and 108 deletions.
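
Background on the title: with the old launch, many blocks contributed to the same atom's force via atomicAdd, and since atomicAdd imposes no ordering, the low bits of the floating-point sums could differ from run to run. The rewritten kernel dedicates one block to each charged atom and combines the per-thread partial sums with cub::BlockReduce, which adds in a fixed order. A minimal, standalone illustration (not GOMC code) of why summation order matters:

#include <cstdio>

int main() {
  // The same three terms, summed in two different groupings
  double a = 1.0e16, b = -1.0e16, c = 1.0;
  printf("%.1f\n", (a + b) + c); // prints 1.0
  printf("%.1f\n", a + (b + c)); // prints 0.0: c is absorbed into b
  return 0;
}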
26 changes: 13 additions & 13 deletions src/Ewald.cpp
@@ -1501,11 +1501,8 @@ void Ewald::BoxForceReciprocal(XYZArray const &molCoords,
   double constValue = ff.alpha[box] * M_2_SQRTPI;
 
 #ifdef GOMC_CUDA
-  bool *particleUsed;
-  particleUsed = new bool[atomForceRec.Count()];
-  memset((void *)particleUsed, false, atomForceRec.Count() * sizeof(bool));
+  std::vector<int> particleUsed;
 #if ENSEMBLE == GEMC || ENSEMBLE == GCMC
-  memset((void *)particleUsed, false, atomForceRec.Count() * sizeof(bool));
   MoleculeLookup::box_iterator thisMol = molLookup.BoxBegin(box);
   MoleculeLookup::box_iterator end = molLookup.BoxEnd(box);
   while (thisMol != end) {
@@ -1514,25 +1511,28 @@ void Ewald::BoxForceReciprocal(XYZArray const &molCoords,
     uint start = mols.MolStart(molIndex);
     for (uint p = 0; p < length; p++) {
       atomForceRec.Set(start + p, 0.0, 0.0, 0.0);
-      particleUsed[start + p] = true;
+      // Need to include only the charged atoms
+      if (particleCharge[start + p] != 0.0)
+        particleUsed.push_back(start + p);
     }
     molForceRec.Set(molIndex, 0.0, 0.0, 0.0);
     thisMol++;
   }
 #else
   // Only one box, so clear all atoms and molecules and mark all particles as
-  // Used
+  // used
   atomForceRec.Reset();
   molForceRec.Reset();
-  memset((void *)particleUsed, true, atomForceRec.Count() * sizeof(bool));
+  for (int i = 0; i < atomForceRec.Count(); ++i) {
+    particleUsed.push_back(i);
+  }
 #endif
 
-  CallBoxForceReciprocalGPU(
-      ff.particles->getCUDAVars(), atomForceRec, molForceRec, particleCharge,
-      particleMol, particleHasNoCharge, particleUsed, startMol, lengthMol,
-      ff.alpha[box], ff.alphaSq[box], constValue, imageSizeRef[box],
-      molCoords, currentAxes, box);
-  delete[] particleUsed;
+  CallBoxForceReciprocalGPU(ff.particles->getCUDAVars(), atomForceRec,
+                            molForceRec, particleCharge, particleMol,
+                            particleUsed, startMol, lengthMol, ff.alpha[box],
+                            ff.alphaSq[box], constValue, imageSizeRef[box],
+                            molCoords, currentAxes, box);
 #else
   // molecule iterator
   MoleculeLookup::box_iterator thisMol = molLookup.BoxBegin(box);
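Taken together, the host-side change above replaces the per-atom bool mask (plus the separate particleHasNoCharge test in the kernel) with a compacted list of charged-atom indices; the list's length later becomes the kernel's grid size. A standalone sketch of the same compaction idea (illustrative names, not part of the commit):

#include <vector>

// Gather the indices of charged atoms; the kernel is then launched with one
// block per entry, so uncharged atoms cost nothing on the GPU.
std::vector<int> CompactChargedAtoms(const std::vector<double> &charges) {
  std::vector<int> used;
  used.reserve(charges.size());
  for (int i = 0; i < (int)charges.size(); ++i)
    if (charges[i] != 0.0)
      used.push_back(i);
  return used;
}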
162 changes: 74 additions & 88 deletions src/GPU/CalculateEwaldCUDAKernel.cu
@@ -23,6 +23,7 @@ using namespace cub;
 
 #define IMAGES_PER_BLOCK 64
 #define PARTICLES_PER_BLOCK 64
+#define THREADS_PER_BLOCK 128
 
 #define FULL_MASK 0xffffffff
 
@@ -448,35 +449,23 @@ void CallBoxForceReciprocalGPU(
     VariablesCUDA *vars, XYZArray &atomForceRec, XYZArray &molForceRec,
     const std::vector<double> &particleCharge,
     const std::vector<int> &particleMol,
-    const std::vector<bool> &particleHasNoCharge, const bool *particleUsed,
+    const std::vector<int> &particleUsed,
     const std::vector<int> &startMol, const std::vector<int> &lengthMol,
     double alpha, double alphaSq, double constValue, uint imageSize,
     XYZArray const &molCoords, BoxDimensions const &boxAxes, int box) {
   int atomCount = atomForceRec.Count();
   int molCount = molForceRec.Count();
   double *gpu_particleCharge;
   int *gpu_particleMol;
-  bool *gpu_particleHasNoCharge, *gpu_particleUsed;
-  bool *arr_particleHasNoCharge = new bool[particleHasNoCharge.size()];
-  int *gpu_startMol, *gpu_lengthMol;
 
-  // particleHasNoCharge is stored in vector<bool>, so in order to copy it to
-  // GPU it needs to be stored in bool[], because std::vector<bool> does not
-  // necessarily store its elements as a contiguous array
-  for (int i = 0; i < particleHasNoCharge.size(); i++) {
-    arr_particleHasNoCharge[i] = particleHasNoCharge[i];
-  }
+  int *gpu_particleUsed, *gpu_startMol, *gpu_lengthMol;
 
-  // calculate block and grid sizes
-  dim3 threadsPerBlock(256, 1, 1);
-  int blocksPerGridX = (int)(atomCount / threadsPerBlock.x) + 1;
-  int blocksPerGridY = (int)(imageSize / IMAGES_PER_BLOCK) + 1;
-  dim3 blocksPerGrid(blocksPerGridX, blocksPerGridY, 1);
+  // calculate block and grid sizes
+  int threadsPerBlock = THREADS_PER_BLOCK;
+  int blocksPerGrid = particleUsed.size();
 
   CUMALLOC((void **)&gpu_particleCharge,
            particleCharge.size() * sizeof(double));
-  CUMALLOC((void **)&gpu_particleHasNoCharge,
-           particleHasNoCharge.size() * sizeof(bool));
-  CUMALLOC((void **)&gpu_particleUsed, atomCount * sizeof(bool));
+  CUMALLOC((void **)&gpu_particleUsed, atomCount * sizeof(int));
   CUMALLOC((void **)&gpu_startMol, startMol.size() * sizeof(int));
   CUMALLOC((void **)&gpu_lengthMol, lengthMol.size() * sizeof(int));
@@ -498,9 +487,7 @@ void CallBoxForceReciprocalGPU(
              sizeof(double) * particleCharge.size(), cudaMemcpyHostToDevice);
   cudaMemcpy(gpu_particleMol, &particleMol[0], sizeof(int) * particleMol.size(),
              cudaMemcpyHostToDevice);
-  cudaMemcpy(gpu_particleHasNoCharge, arr_particleHasNoCharge,
-             sizeof(bool) * particleHasNoCharge.size(), cudaMemcpyHostToDevice);
-  cudaMemcpy(gpu_particleUsed, particleUsed, sizeof(bool) * atomCount,
+  cudaMemcpy(gpu_particleUsed, &particleUsed[0], sizeof(int) * particleUsed.size(),
              cudaMemcpyHostToDevice);
   cudaMemcpy(vars->gpu_x, molCoords.x, sizeof(double) * atomCount,
              cudaMemcpyHostToDevice);
@@ -517,16 +504,16 @@
   BoxForceReciprocalGPU<<<blocksPerGrid, threadsPerBlock>>>(
       vars->gpu_aForceRecx, vars->gpu_aForceRecy, vars->gpu_aForceRecz,
       vars->gpu_mForceRecx, vars->gpu_mForceRecy, vars->gpu_mForceRecz,
-      gpu_particleCharge, gpu_particleMol, gpu_particleHasNoCharge,
-      gpu_particleUsed, gpu_startMol, gpu_lengthMol, alpha, alphaSq, constValue,
-      imageSize, vars->gpu_kxRef[box], vars->gpu_kyRef[box],
-      vars->gpu_kzRef[box], vars->gpu_x, vars->gpu_y, vars->gpu_z,
-      vars->gpu_prefactRef[box], vars->gpu_sumRnew[box], vars->gpu_sumInew[box],
-      vars->gpu_isFraction, vars->gpu_molIndex, vars->gpu_lambdaCoulomb,
-      vars->gpu_cell_x[box], vars->gpu_cell_y[box], vars->gpu_cell_z[box],
-      vars->gpu_Invcell_x[box], vars->gpu_Invcell_y[box],
-      vars->gpu_Invcell_z[box], vars->gpu_nonOrth, boxAxes.GetAxis(box).x,
-      boxAxes.GetAxis(box).y, boxAxes.GetAxis(box).z, box, atomCount);
+      gpu_particleCharge, gpu_particleMol, gpu_particleUsed, gpu_startMol,
+      gpu_lengthMol, alpha, alphaSq, constValue, imageSize,
+      vars->gpu_kxRef[box], vars->gpu_kyRef[box], vars->gpu_kzRef[box],
+      vars->gpu_x, vars->gpu_y, vars->gpu_z, vars->gpu_prefactRef[box],
+      vars->gpu_sumRnew[box], vars->gpu_sumInew[box], vars->gpu_isFraction,
+      vars->gpu_molIndex, vars->gpu_lambdaCoulomb, vars->gpu_cell_x[box],
+      vars->gpu_cell_y[box], vars->gpu_cell_z[box], vars->gpu_Invcell_x[box],
+      vars->gpu_Invcell_y[box], vars->gpu_Invcell_z[box], vars->gpu_nonOrth,
+      boxAxes.GetAxis(box).x, boxAxes.GetAxis(box).y, boxAxes.GetAxis(box).z,
+      box);
   cudaDeviceSynchronize();
   checkLastErrorCUDA(__FILE__, __LINE__);
 
@@ -544,9 +531,7 @@
              cudaMemcpyDeviceToHost);
 
   cudaDeviceSynchronize();
-  delete[] arr_particleHasNoCharge;
   CUFREE(gpu_particleCharge);
-  CUFREE(gpu_particleHasNoCharge);
   CUFREE(gpu_particleUsed);
   CUFREE(gpu_startMol);
   CUFREE(gpu_lengthMol);
@@ -556,70 +541,55 @@
 __global__ void BoxForceReciprocalGPU(
     double *gpu_aForceRecx, double *gpu_aForceRecy, double *gpu_aForceRecz,
     double *gpu_mForceRecx, double *gpu_mForceRecy, double *gpu_mForceRecz,
-    double *gpu_particleCharge, int *gpu_particleMol,
-    bool *gpu_particleHasNoCharge, bool *gpu_particleUsed, int *gpu_startMol,
-    int *gpu_lengthMol, double alpha, double alphaSq, double constValue,
-    int imageSize, double *gpu_kx, double *gpu_ky, double *gpu_kz,
-    double *gpu_x, double *gpu_y, double *gpu_z, double *gpu_prefact,
-    double *gpu_sumRnew, double *gpu_sumInew, bool *gpu_isFraction,
-    int *gpu_molIndex, double *gpu_lambdaCoulomb, double *gpu_cell_x,
-    double *gpu_cell_y, double *gpu_cell_z, double *gpu_Invcell_x,
-    double *gpu_Invcell_y, double *gpu_Invcell_z, int *gpu_nonOrth, double axx,
-    double axy, double axz, int box, int atomCount) {
-  __shared__ double shared_kvector[IMAGES_PER_BLOCK * 3];
-  int particleID = blockDim.x * blockIdx.x + threadIdx.x;
-  int offset_vector_index = blockIdx.y * IMAGES_PER_BLOCK;
-  int numberOfVectors = min(IMAGES_PER_BLOCK, imageSize - offset_vector_index);
-
-  if (threadIdx.x < numberOfVectors) {
-    shared_kvector[threadIdx.x * 3] = gpu_kx[offset_vector_index + threadIdx.x];
-    shared_kvector[threadIdx.x * 3 + 1] =
-        gpu_ky[offset_vector_index + threadIdx.x];
-    shared_kvector[threadIdx.x * 3 + 2] =
-        gpu_kz[offset_vector_index + threadIdx.x];
+    double *gpu_particleCharge, int *gpu_particleMol, const int *gpu_particleUsed,
+    int *gpu_startMol, int *gpu_lengthMol, double alpha, double alphaSq,
+    double constValue, int imageSize, double *gpu_kx, double *gpu_ky,
+    double *gpu_kz, double *gpu_x, double *gpu_y, double *gpu_z,
+    double *gpu_prefact, double *gpu_sumRnew, double *gpu_sumInew,
+    bool *gpu_isFraction, int *gpu_molIndex, double *gpu_lambdaCoulomb,
+    double *gpu_cell_x, double *gpu_cell_y, double *gpu_cell_z,
+    double *gpu_Invcell_x, double *gpu_Invcell_y, double *gpu_Invcell_z,
+    int *gpu_nonOrth, double axx, double axy, double axz, int box) {
+
+  __shared__ int particleID, moleculeID;
+  __shared__ double x, y, z, lambdaCoef, fixed;
+
+  if (threadIdx.x == 0) {
+    // The particleID is the atom that corresponds to this particleUsed entry
+    particleID = gpu_particleUsed[blockIdx.x];
+    moleculeID = gpu_particleMol[particleID];
+    x = gpu_x[particleID];
+    y = gpu_y[particleID];
+    z = gpu_z[particleID];
+    lambdaCoef = DeviceGetLambdaCoulomb(moleculeID, box, gpu_isFraction,
+                                        gpu_molIndex, gpu_lambdaCoulomb);
+    fixed = 2.0 * lambdaCoef * gpu_particleCharge[particleID];
   }
+  __syncthreads();
 
-  if (particleID >= atomCount || !gpu_particleUsed[particleID])
-    return;
   double forceX = 0.0, forceY = 0.0, forceZ = 0.0;
-  int moleculeID = gpu_particleMol[particleID];
-
-  if (gpu_particleHasNoCharge[particleID])
-    return;
-
-  double x = gpu_x[particleID];
-  double y = gpu_y[particleID];
-  double z = gpu_z[particleID];
-  double lambdaCoef = DeviceGetLambdaCoulomb(moleculeID, box, gpu_isFraction,
-                                             gpu_molIndex, gpu_lambdaCoulomb);
-
-  __syncthreads();
   // loop over images
-  for (int vectorIndex = 0; vectorIndex < numberOfVectors; vectorIndex++) {
-    double dot = x * shared_kvector[vectorIndex * 3] +
-                 y * shared_kvector[vectorIndex * 3 + 1] +
-                 z * shared_kvector[vectorIndex * 3 + 2];
+  for (int image = threadIdx.x; image < imageSize; image += THREADS_PER_BLOCK) {
+    double dot = x * gpu_kx[image] + y * gpu_ky[image] + z * gpu_kz[image];
    double dotsin, dotcos;
    sincos(dot, &dotsin, &dotcos);
-    double factor = 2.0 * gpu_particleCharge[particleID] *
-                    gpu_prefact[offset_vector_index + vectorIndex] *
-                    lambdaCoef *
-                    (dotsin * gpu_sumRnew[offset_vector_index + vectorIndex] -
-                     dotcos * gpu_sumInew[offset_vector_index + vectorIndex]);
 
-    forceX += factor * shared_kvector[vectorIndex * 3];
-    forceY += factor * shared_kvector[vectorIndex * 3 + 1];
-    forceZ += factor * shared_kvector[vectorIndex * 3 + 2];
+    double factor = fixed * gpu_prefact[image] * (dotsin * gpu_sumRnew[image] -
+                                                  dotcos * gpu_sumInew[image]);
+    forceX += factor * gpu_kx[image];
+    forceY += factor * gpu_ky[image];
+    forceZ += factor * gpu_kz[image];
   }
 
   // loop over other particles within the same molecule
-  if (blockIdx.y == 0) {
+  // Pick the thread most likely to exit the for loop early
+  if (threadIdx.x == THREADS_PER_BLOCK - 1) {
     double intraForce = 0.0, distSq = 0.0, dist = 0.0;
     double3 distVect;
     int lastParticleWithinSameMolecule =
         gpu_startMol[particleID] + gpu_lengthMol[particleID];
     for (int otherParticle = gpu_startMol[particleID];
-         otherParticle < lastParticleWithinSameMolecule; otherParticle++) {
+         otherParticle < lastParticleWithinSameMolecule; ++otherParticle) {
       if (particleID != otherParticle) {
         DeviceInRcut(distSq, distVect, gpu_x, gpu_y, gpu_z, particleID,
                      otherParticle, axx, axy, axz, *gpu_nonOrth, gpu_cell_x,
@@ -631,20 +601,36 @@ __global__ void BoxForceReciprocalGPU(
         double qiqj = gpu_particleCharge[particleID] *
                       gpu_particleCharge[otherParticle] * qqFactGPU;
         intraForce = qiqj * lambdaCoef * lambdaCoef / distSq;
-        intraForce *= ((erf(alpha * dist) / dist) - constValue * expConstValue);
+        intraForce *= (erf(alpha * dist) / dist) - constValue * expConstValue;
         forceX -= intraForce * distVect.x;
         forceY -= intraForce * distVect.y;
         forceZ -= intraForce * distVect.z;
       }
     }
   }
+  __syncthreads();
 
-  atomicAdd(&gpu_aForceRecx[particleID], forceX);
-  atomicAdd(&gpu_aForceRecy[particleID], forceY);
-  atomicAdd(&gpu_aForceRecz[particleID], forceZ);
-  atomicAdd(&gpu_mForceRecx[moleculeID], forceX);
-  atomicAdd(&gpu_mForceRecy[moleculeID], forceY);
-  atomicAdd(&gpu_mForceRecz[moleculeID], forceZ);
+  // Specialize BlockReduce for a 1D block of threads of type double
+  using BlockReduce = cub::BlockReduce<double, THREADS_PER_BLOCK>;
+
+  // Allocate shared memory for BlockReduce
+  __shared__ typename BlockReduce::TempStorage forceX_temp_storage;
+  __shared__ typename BlockReduce::TempStorage forceY_temp_storage;
+  __shared__ typename BlockReduce::TempStorage forceZ_temp_storage;
+
+  // Compute the block-wide sum for thread 0
+  double aggregateX = BlockReduce(forceX_temp_storage).Sum(forceX);
+  double aggregateY = BlockReduce(forceY_temp_storage).Sum(forceY);
+  double aggregateZ = BlockReduce(forceZ_temp_storage).Sum(forceZ);
+
+  if (threadIdx.x == 0) {
+    gpu_aForceRecx[particleID] = aggregateX;
+    gpu_aForceRecy[particleID] = aggregateY;
+    gpu_aForceRecz[particleID] = aggregateZ;
+    atomicAdd(&gpu_mForceRecx[moleculeID], aggregateX);
+    atomicAdd(&gpu_mForceRecy[moleculeID], aggregateY);
+    atomicAdd(&gpu_mForceRecz[moleculeID], aggregateZ);
+  }
 }
 
 __global__ void SwapReciprocalGPU(double *gpu_x, double *gpu_y, double *gpu_z,
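For reference, a self-contained sketch of the cub::BlockReduce pattern the new kernel relies on (illustrative kernel and names, not GOMC code). Every thread folds its share of the input into one partial sum, and the block combines the partials in a fixed order, so repeated runs see the same rounding, unlike a sequence of atomicAdds:

#include <cub/cub.cuh>

constexpr int kThreads = 128; // mirrors THREADS_PER_BLOCK above

__global__ void BlockSumDemo(const double *in, double *out, int n) {
  using BlockReduce = cub::BlockReduce<double, kThreads>;
  __shared__ typename BlockReduce::TempStorage temp;

  // One partial sum per thread, striding over the input
  double partial = 0.0;
  for (int i = threadIdx.x; i < n; i += kThreads)
    partial += in[i];

  // Fixed-order block-wide sum; the aggregate is valid only in thread 0
  double total = BlockReduce(temp).Sum(partial);
  if (threadIdx.x == 0)
    out[blockIdx.x] = total;
}

Launched as BlockSumDemo<<<1, kThreads>>>(d_in, d_out, n), this mirrors the kernel above: partial corresponds to forceX/forceY/forceZ, and the thread-0 write corresponds to the gpu_aForceRec stores.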
11 changes: 4 additions & 7 deletions src/GPU/CalculateEwaldCUDAKernel.cuh
@@ -19,8 +19,7 @@ void CallBoxForceReciprocalGPU(VariablesCUDA *vars,
                                XYZArray &molForceRec,
                                const std::vector<double> &particleCharge,
                                const std::vector<int> &particleMol,
-                               const std::vector<bool> &particleHasNoCharge,
-                               const bool *particleUsed,
+                               const std::vector<int> &particleUsed,
                                const std::vector<int> &startMol,
                                const std::vector<int> &lengthMol,
                                double alpha,
@@ -99,9 +98,8 @@ __global__ void BoxForceReciprocalGPU(double *gpu_aForceRecx,
                                       double *gpu_mForceRecz,
                                       double *gpu_particleCharge,
                                       int *gpu_particleMol,
-                                      bool *gpu_particleHasNoCharge,
-                                      bool *gpu_particleUsed,
-                                      int *gpu_startMol,
+                                      const int *gpu_particleUsed,
+                                      int *gpu_startMol,
                                       int *gpu_lengthMol,
                                       double alpha,
                                       double alphaSq,
Expand Down Expand Up @@ -129,8 +127,7 @@ __global__ void BoxForceReciprocalGPU(double *gpu_aForceRecx,
double axx,
double axy,
double axz,
int box,
int atomCount);
int box);

__global__ void BoxReciprocalSumsGPU(double * gpu_x,
double * gpu_y,
