Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Streamline DataManager -> TreePiece Coordination #183

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 7 additions & 16 deletions DataManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ void DataManager::init() {
root = NULL;
oldNumChunks = 0;
chunkRoots = NULL;
cleanupTreePieces = true;
#ifdef CUDA
treePiecesDone = 0;
treePiecesDonePrefetch = 0;
Expand Down Expand Up @@ -162,21 +161,20 @@ void DataManager::notifyPresence(Tree::GenericTreeNode *root, TreePiece *tp) {
/// \brief Clear registeredTreePieces on this node.
void DataManager::clearRegisteredPieces(const CkCallback& cb) {
registeredTreePieces.removeAll();
cleanupTreePieces = true;
contribute(cb);
}

#ifdef CUDA
// This gets called before a tree build happens and ensures that
// registeredTreePieces doesnt get cleared during combineLocalTrees
// if we are about to do a gravity calculation on the GPU
void DataManager::unmarkTreePiecesForCleanup(const CkCallback& cb) {
cleanupTreePieces = false;
contribute(cb);
void DataManager::assignCUDAStreams(const CkCallback& cb) {
int tpIdx;
for(int i = 0; i < registeredTreePieces.size(); i++) {
tpIdx = registeredTreePieces[i].treePiece->getIndex();
treePieces[tpIdx].assignCUDAStream((intptr_t) &streams[tpIdx % numStreams]);
}
contribute(cb);
}
#endif


/// \brief Build a local tree inside the node.
///
/// This will be an exact superset of all the trees in this
Expand Down Expand Up @@ -223,10 +221,6 @@ void DataManager::combineLocalTrees(CkReductionMsg *msg) {
}
root = buildProcessorTree(totalChares, &gtn[0]);

if (cleanupTreePieces) {
registeredTreePieces.removeAll();
}

#ifdef PRINT_MERGED_TREE
ostringstream dmName;
dmName << "dm_" << CkMyNode();
Expand Down Expand Up @@ -512,7 +506,6 @@ void DataManager::startLocalWalk() {
treePieces[in].commenceCalculateGravityLocal((intptr_t)d_localMoments,
(intptr_t)d_localParts,
(intptr_t)d_localVars,
(intptr_t)streams, numStreams,
sMoments, sCompactParts, sVarParts);
if(registeredTreePieces[0].treePiece->bEwald) {
EwaldMsg *msg = new (8*sizeof(int)) EwaldMsg;
Expand Down Expand Up @@ -1008,7 +1001,6 @@ void DataManager::transferParticleVarsBack(){
cudaFree(d_localVars);
cudaFree(d_remoteMoments);
cudaFree(d_remoteParts);
cleanupTreePieces = true;

#ifdef CUDA_PRINT_ERRORS
printf("transferParticleVarsBack: %s\n", cudaGetErrorString( cudaGetLastError() ) );
Expand Down Expand Up @@ -1069,7 +1061,6 @@ void DataManager::updateParticlesFreeMemory(UpdateParticlesStruct *data)
}
delete (data->cb);
delete data;
registeredTreePieces.length() = 0;
}
CmiUnlock(__nodelock);
}
Expand Down
5 changes: 1 addition & 4 deletions DataManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,6 @@ class DataManager : public CBase_DataManager {
/// A list of roots of the TreePieces in this node
// holds chare array indices of registered treepieces
CkVec<TreePieceDescriptor> registeredTreePieces;
/// Signal whether registeredTreePieces needs to be cleaned
/// when combining local trees
bool cleanupTreePieces;
#ifdef CUDA
//CkVec<int> registeredTreePieceIndices;
/// @brief counter for the number of tree nodes that are
Expand Down Expand Up @@ -190,6 +187,7 @@ class DataManager : public CBase_DataManager {
void createStreams(int _numStreams, const CkCallback& cb);
void donePrefetch(int chunk); // serialize remote chunk wrapper
void serializeLocalTree();
void assignCUDAStreams(const CkCallback& cb);

#ifdef GPU_LOCAL_TREE_WALK
void transformLocalTreeRecursive(GenericTreeNode *node, CkVec<CudaMultipoleMoments>& localMoments);
Expand Down Expand Up @@ -255,7 +253,6 @@ class DataManager : public CBase_DataManager {
std::map<NodeKey, int> &getCachedPartsOnGpuTable(){
return cachedPartsOnGpu;
}
void unmarkTreePiecesForCleanup(const CkCallback& cb);
#endif
// Functions used to create a tree inside the DataManager comprising
// all the trees in the TreePieces in the local node
Expand Down
4 changes: 2 additions & 2 deletions ParallelGravity.ci
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ mainmodule ParallelGravity {
entry void startLocalWalk();
entry void resumeRemoteChunk();
entry void createStreams(int _numStreams, const CkCallback& cb);
entry void unmarkTreePiecesForCleanup(const CkCallback& cb);
entry void assignCUDAStreams(const CkCallback& cb);
#endif
entry void initCooling(double dGmPerCcUnit, double dComovingGmPerCcUnit,
double dErgPerGmUnit, double dSecUnit, double dKpcUnit,
Expand Down Expand Up @@ -488,7 +488,6 @@ mainmodule ParallelGravity {
entry void commenceCalculateGravityLocal(intptr_t d_localMoments,
intptr_t d_localParts,
intptr_t d_localVars,
intptr_t streams, int numStreams,
size_t sMoments, size_t sCompactParts, size_t sVarParts);
#else
entry void commenceCalculateGravityLocal();
Expand Down Expand Up @@ -541,6 +540,7 @@ mainmodule ParallelGravity {

// jetley
#ifdef CUDA
entry void assignCUDAStream(intptr_t stream);
entry void continueStartRemoteChunk(int chunk, intptr_t d_remoteMoments, intptr_t d_remoteParts);
entry void fillGPUBuffer(intptr_t bufLocalParts,
intptr_t bufLocalMoments,
Expand Down
33 changes: 5 additions & 28 deletions ParallelGravity.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1716,13 +1716,7 @@ Main::loadBalance(int iPhase)
/// @param iPhase Active rung (or phase).
void Main::buildTree(int iPhase)
{
#ifdef CUDA
// If we are about to use the GPU, tell the data manager
// not to clean up its TreePiece list during combineLocalTrees
if (nActiveGrav >= param.nGpuMinParts) {
dMProxy.unmarkTreePiecesForCleanup(CkCallbackResumeThread());
}
#endif
dMProxy.clearRegisteredPieces(CkCallbackResumeThread());
#ifdef PUSH_GRAVITY
bool bDoPush = param.dFracPushParticles*nTotalParticles > nActiveGrav;
if(bDoPush) CkPrintf("[main] fracActive %f PUSH_GRAVITY\n", 1.0*nActiveGrav/nTotalParticles);
Expand All @@ -1734,6 +1728,10 @@ void Main::buildTree(int iPhase)
#else
treeProxy.buildTree(bucketSize, CkCallbackResumeThread());
#endif

#ifdef CUDA
dMProxy.assignCUDAStreams(CkCallbackResumeThread());
#endif
double tTB = CkWallTimer()-startTime;
timings[iPhase].tTBuild += tTB;
CkPrintf("took %g seconds.\n", tTB);
Expand Down Expand Up @@ -1816,13 +1814,6 @@ void Main::startGravity(const CkCallback& cbGravity, int iActiveRung,
else {
*startTime = CkWallTimer();
treeProxy.initAccel(iActiveRung, CkCallbackResumeThread());
#ifdef CUDA
// We didn't do gravity where the registered TreePieces on the
// DataManager normally get cleared. Clear them here instead.
if (nActiveGrav > param.nGpuMinParts) {
dMProxy.clearRegisteredPieces(CkCallbackResumeThread());
}
#endif
}
}

Expand Down Expand Up @@ -3663,13 +3654,6 @@ void Main::writeOutput(int iStep)
treeProxy.startSmooth(&pDen, 1, param.nSmooth, dfBall2OverSoft2,
CkCallbackResumeThread());
treeProxy.finishNodeCache(CkCallbackResumeThread());
#ifdef CUDA
// We didn't do gravity where the registered TreePieces on the
// DataManager normally get cleared. Clear them here instead.
if (nActiveGrav > param.nGpuMinParts) {
dMProxy.clearRegisteredPieces(CkCallbackResumeThread());
}
#endif
if(verbosity) {
ckout << " took " << (CkWallTimer() - startTime) << " seconds."
<< endl;
Expand Down Expand Up @@ -3706,13 +3690,6 @@ void Main::writeOutput(int iStep)
treeProxy.startSmooth(&pDenGas, 1, param.nSmooth, dfBall2OverSoft2,
CkCallbackResumeThread());
treeProxy.finishNodeCache(CkCallbackResumeThread());
#ifdef CUDA
// We didn't do gravity where the registered TreePieces on the
// DataManager normally get cleared. Clear them here instead.
if (nActiveGrav > param.nGpuMinParts) {
dMProxy.clearRegisteredPieces(CkCallbackResumeThread());
}
#endif
if(verbosity)
ckout << " took " << (CkWallTimer() - startTime) << " seconds."
<< endl;
Expand Down
4 changes: 3 additions & 1 deletion ParallelGravity.h
Original file line number Diff line number Diff line change
Expand Up @@ -1014,6 +1014,9 @@ class TreePiece : public CBase_TreePiece {
#endif

#ifdef CUDA
void assignCUDAStream(intptr_t stream) {
this->stream = *((cudaStream_t *) stream);
}
void continueStartRemoteChunk(int chunk, intptr_t d_remoteMoments, intptr_t d_remoteParts);
void fillGPUBuffer(intptr_t bufLocalParts,
intptr_t bufLocalMoments,
Expand Down Expand Up @@ -1869,7 +1872,6 @@ class TreePiece : public CBase_TreePiece {
void commenceCalculateGravityLocal(intptr_t d_localMoments,
intptr_t d_localParts,
intptr_t d_localVars,
intptr_t streams, int numStreams,
size_t sMoments, size_t sCompactParts, size_t sVarParts);
#else
void commenceCalculateGravityLocal();
Expand Down
4 changes: 1 addition & 3 deletions TreePiece.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5241,7 +5241,7 @@ void TreePiece::startGravity(int am, // the active mask for multistepping
if (!bUseCpu) {
dm->serializeLocalTree();
} else {
thisProxy[thisIndex].commenceCalculateGravityLocal(0, 0, 0, 0, 0, 0, 0, 0);
thisProxy[thisIndex].commenceCalculateGravityLocal(0, 0, 0, 0, 0, 0);
}
#else
thisProxy[thisIndex].commenceCalculateGravityLocal();
Expand Down Expand Up @@ -5301,13 +5301,11 @@ void TreePiece::initiatePrefetch(int chunk){
void TreePiece::commenceCalculateGravityLocal(intptr_t d_localMoments,
intptr_t d_localParts,
intptr_t d_localVars,
intptr_t streams, int numStreams,
size_t sMoments, size_t sCompactParts, size_t sVarParts) {
if (!bUseCpu) {
this->d_localMoments = (CudaMultipoleMoments *)d_localMoments;
this->d_localParts = (CompactPartData *)d_localParts;
this->d_localVars = (VariablePartData *)d_localVars;
this->stream = ((cudaStream_t *)streams)[thisIndex % numStreams];
this->sMoments = sMoments;
this->sCompactParts = sCompactParts;
this->sVarParts = sVarParts;
Expand Down
7 changes: 0 additions & 7 deletions feedback.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -320,13 +320,6 @@ void Main::StellarFeedback(double dTime, double dDelta)
treeProxy.startReSmooth(&pSHG, CkCallbackResumeThread());
#endif
treeProxy.finishNodeCache(CkCallbackResumeThread());
#ifdef CUDA
// We didn't do gravity where the registered TreePieces on the
// DataManager normally get cleared. Clear them here instead.
if (nActiveGrav > param.nGpuMinParts) {
dMProxy.clearRegisteredPieces(CkCallbackResumeThread());
}
#endif

#ifdef SPLITGAS
addDelParticles();//Don't forget to run an addDelParticles after a split
Expand Down
7 changes: 0 additions & 7 deletions starform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,13 +186,6 @@ void Main::FormStars(double dTime, double dDelta)
}

treeProxy.finishNodeCache(CkCallbackResumeThread());
#ifdef CUDA
// We didn't do gravity where the registered TreePieces on the
// DataManager normally get cleared. Clear them here instead.
if (nActiveGrav > param.nGpuMinParts) {
dMProxy.clearRegisteredPieces(CkCallbackResumeThread());
}
#endif

addDelParticles();
double tSF = CkWallTimer() - startTime;
Expand Down
Loading