diff --git a/DataManager.cpp b/DataManager.cpp index ecdc4dcb..a0f307b2 100644 --- a/DataManager.cpp +++ b/DataManager.cpp @@ -32,7 +32,6 @@ void DataManager::init() { root = NULL; oldNumChunks = 0; chunkRoots = NULL; - cleanupTreePieces = true; #ifdef CUDA treePiecesDone = 0; treePiecesDonePrefetch = 0; @@ -162,21 +161,20 @@ void DataManager::notifyPresence(Tree::GenericTreeNode *root, TreePiece *tp) { /// \brief Clear registeredTreePieces on this node. void DataManager::clearRegisteredPieces(const CkCallback& cb) { registeredTreePieces.removeAll(); - cleanupTreePieces = true; contribute(cb); } #ifdef CUDA -// This gets called before a tree build happens and ensures that -// registeredTreePieces doesnt get cleared during combineLocalTrees -// if we are about to do a gravity calculation on the GPU -void DataManager::unmarkTreePiecesForCleanup(const CkCallback& cb) { - cleanupTreePieces = false; - contribute(cb); +void DataManager::assignCUDAStreams(const CkCallback& cb) { + int tpIdx; + for(int i = 0; i < registeredTreePieces.size(); i++) { + tpIdx = registeredTreePieces[i].treePiece->getIndex(); + treePieces[tpIdx].assignCUDAStream((intptr_t) &streams[tpIdx % numStreams]); + } + contribute(cb); } #endif - /// \brief Build a local tree inside the node. /// /// This will be an exact superset of all the trees in this @@ -223,10 +221,6 @@ void DataManager::combineLocalTrees(CkReductionMsg *msg) { } root = buildProcessorTree(totalChares, >n[0]); - if (cleanupTreePieces) { - registeredTreePieces.removeAll(); - } - #ifdef PRINT_MERGED_TREE ostringstream dmName; dmName << "dm_" << CkMyNode(); @@ -512,7 +506,6 @@ void DataManager::startLocalWalk() { treePieces[in].commenceCalculateGravityLocal((intptr_t)d_localMoments, (intptr_t)d_localParts, (intptr_t)d_localVars, - (intptr_t)streams, numStreams, sMoments, sCompactParts, sVarParts); if(registeredTreePieces[0].treePiece->bEwald) { EwaldMsg *msg = new (8*sizeof(int)) EwaldMsg; @@ -1008,7 +1001,6 @@ void DataManager::transferParticleVarsBack(){ cudaFree(d_localVars); cudaFree(d_remoteMoments); cudaFree(d_remoteParts); - cleanupTreePieces = true; #ifdef CUDA_PRINT_ERRORS printf("transferParticleVarsBack: %s\n", cudaGetErrorString( cudaGetLastError() ) ); @@ -1069,7 +1061,6 @@ void DataManager::updateParticlesFreeMemory(UpdateParticlesStruct *data) } delete (data->cb); delete data; - registeredTreePieces.length() = 0; } CmiUnlock(__nodelock); } diff --git a/DataManager.h b/DataManager.h index 7ce2d459..33267c75 100644 --- a/DataManager.h +++ b/DataManager.h @@ -77,9 +77,6 @@ class DataManager : public CBase_DataManager { /// A list of roots of the TreePieces in this node // holds chare array indices of registered treepieces CkVec registeredTreePieces; - /// Signal whether registeredTreePieces needs to be cleaned - /// when combining local trees - bool cleanupTreePieces; #ifdef CUDA //CkVec registeredTreePieceIndices; /// @brief counter for the number of tree nodes that are @@ -190,6 +187,7 @@ class DataManager : public CBase_DataManager { void createStreams(int _numStreams, const CkCallback& cb); void donePrefetch(int chunk); // serialize remote chunk wrapper void serializeLocalTree(); + void assignCUDAStreams(const CkCallback& cb); #ifdef GPU_LOCAL_TREE_WALK void transformLocalTreeRecursive(GenericTreeNode *node, CkVec& localMoments); @@ -255,7 +253,6 @@ class DataManager : public CBase_DataManager { std::map &getCachedPartsOnGpuTable(){ return cachedPartsOnGpu; } - void unmarkTreePiecesForCleanup(const CkCallback& cb); #endif // Functions used to create a tree inside the DataManager comprising // all the trees in the TreePieces in the local node diff --git a/ParallelGravity.ci b/ParallelGravity.ci index 51ae1d1d..fe70259b 100644 --- a/ParallelGravity.ci +++ b/ParallelGravity.ci @@ -250,7 +250,7 @@ mainmodule ParallelGravity { entry void startLocalWalk(); entry void resumeRemoteChunk(); entry void createStreams(int _numStreams, const CkCallback& cb); - entry void unmarkTreePiecesForCleanup(const CkCallback& cb); + entry void assignCUDAStreams(const CkCallback& cb); #endif entry void initCooling(double dGmPerCcUnit, double dComovingGmPerCcUnit, double dErgPerGmUnit, double dSecUnit, double dKpcUnit, @@ -488,7 +488,6 @@ mainmodule ParallelGravity { entry void commenceCalculateGravityLocal(intptr_t d_localMoments, intptr_t d_localParts, intptr_t d_localVars, - intptr_t streams, int numStreams, size_t sMoments, size_t sCompactParts, size_t sVarParts); #else entry void commenceCalculateGravityLocal(); @@ -541,6 +540,7 @@ mainmodule ParallelGravity { // jetley #ifdef CUDA + entry void assignCUDAStream(intptr_t stream); entry void continueStartRemoteChunk(int chunk, intptr_t d_remoteMoments, intptr_t d_remoteParts); entry void fillGPUBuffer(intptr_t bufLocalParts, intptr_t bufLocalMoments, diff --git a/ParallelGravity.cpp b/ParallelGravity.cpp index 8624335f..05cb6df0 100644 --- a/ParallelGravity.cpp +++ b/ParallelGravity.cpp @@ -1716,13 +1716,7 @@ Main::loadBalance(int iPhase) /// @param iPhase Active rung (or phase). void Main::buildTree(int iPhase) { -#ifdef CUDA - // If we are about to use the GPU, tell the data manager - // not to clean up its TreePiece list during combineLocalTrees - if (nActiveGrav >= param.nGpuMinParts) { - dMProxy.unmarkTreePiecesForCleanup(CkCallbackResumeThread()); - } -#endif + dMProxy.clearRegisteredPieces(CkCallbackResumeThread()); #ifdef PUSH_GRAVITY bool bDoPush = param.dFracPushParticles*nTotalParticles > nActiveGrav; if(bDoPush) CkPrintf("[main] fracActive %f PUSH_GRAVITY\n", 1.0*nActiveGrav/nTotalParticles); @@ -1734,6 +1728,10 @@ void Main::buildTree(int iPhase) #else treeProxy.buildTree(bucketSize, CkCallbackResumeThread()); #endif + +#ifdef CUDA + dMProxy.assignCUDAStreams(CkCallbackResumeThread()); +#endif double tTB = CkWallTimer()-startTime; timings[iPhase].tTBuild += tTB; CkPrintf("took %g seconds.\n", tTB); @@ -1816,13 +1814,6 @@ void Main::startGravity(const CkCallback& cbGravity, int iActiveRung, else { *startTime = CkWallTimer(); treeProxy.initAccel(iActiveRung, CkCallbackResumeThread()); -#ifdef CUDA - // We didn't do gravity where the registered TreePieces on the - // DataManager normally get cleared. Clear them here instead. - if (nActiveGrav > param.nGpuMinParts) { - dMProxy.clearRegisteredPieces(CkCallbackResumeThread()); - } -#endif } } @@ -3663,13 +3654,6 @@ void Main::writeOutput(int iStep) treeProxy.startSmooth(&pDen, 1, param.nSmooth, dfBall2OverSoft2, CkCallbackResumeThread()); treeProxy.finishNodeCache(CkCallbackResumeThread()); -#ifdef CUDA - // We didn't do gravity where the registered TreePieces on the - // DataManager normally get cleared. Clear them here instead. - if (nActiveGrav > param.nGpuMinParts) { - dMProxy.clearRegisteredPieces(CkCallbackResumeThread()); - } -#endif if(verbosity) { ckout << " took " << (CkWallTimer() - startTime) << " seconds." << endl; @@ -3706,13 +3690,6 @@ void Main::writeOutput(int iStep) treeProxy.startSmooth(&pDenGas, 1, param.nSmooth, dfBall2OverSoft2, CkCallbackResumeThread()); treeProxy.finishNodeCache(CkCallbackResumeThread()); -#ifdef CUDA - // We didn't do gravity where the registered TreePieces on the - // DataManager normally get cleared. Clear them here instead. - if (nActiveGrav > param.nGpuMinParts) { - dMProxy.clearRegisteredPieces(CkCallbackResumeThread()); - } -#endif if(verbosity) ckout << " took " << (CkWallTimer() - startTime) << " seconds." << endl; diff --git a/ParallelGravity.h b/ParallelGravity.h index c1dcd3a9..91cbbf3a 100644 --- a/ParallelGravity.h +++ b/ParallelGravity.h @@ -1014,6 +1014,9 @@ class TreePiece : public CBase_TreePiece { #endif #ifdef CUDA + void assignCUDAStream(intptr_t stream) { + this->stream = *((cudaStream_t *) stream); + } void continueStartRemoteChunk(int chunk, intptr_t d_remoteMoments, intptr_t d_remoteParts); void fillGPUBuffer(intptr_t bufLocalParts, intptr_t bufLocalMoments, @@ -1869,7 +1872,6 @@ class TreePiece : public CBase_TreePiece { void commenceCalculateGravityLocal(intptr_t d_localMoments, intptr_t d_localParts, intptr_t d_localVars, - intptr_t streams, int numStreams, size_t sMoments, size_t sCompactParts, size_t sVarParts); #else void commenceCalculateGravityLocal(); diff --git a/TreePiece.cpp b/TreePiece.cpp index 4d2b8130..dccf43e7 100644 --- a/TreePiece.cpp +++ b/TreePiece.cpp @@ -5241,7 +5241,7 @@ void TreePiece::startGravity(int am, // the active mask for multistepping if (!bUseCpu) { dm->serializeLocalTree(); } else { - thisProxy[thisIndex].commenceCalculateGravityLocal(0, 0, 0, 0, 0, 0, 0, 0); + thisProxy[thisIndex].commenceCalculateGravityLocal(0, 0, 0, 0, 0, 0); } #else thisProxy[thisIndex].commenceCalculateGravityLocal(); @@ -5301,13 +5301,11 @@ void TreePiece::initiatePrefetch(int chunk){ void TreePiece::commenceCalculateGravityLocal(intptr_t d_localMoments, intptr_t d_localParts, intptr_t d_localVars, - intptr_t streams, int numStreams, size_t sMoments, size_t sCompactParts, size_t sVarParts) { if (!bUseCpu) { this->d_localMoments = (CudaMultipoleMoments *)d_localMoments; this->d_localParts = (CompactPartData *)d_localParts; this->d_localVars = (VariablePartData *)d_localVars; - this->stream = ((cudaStream_t *)streams)[thisIndex % numStreams]; this->sMoments = sMoments; this->sCompactParts = sCompactParts; this->sVarParts = sVarParts; diff --git a/feedback.cpp b/feedback.cpp index 72d29b76..512ffadf 100644 --- a/feedback.cpp +++ b/feedback.cpp @@ -320,13 +320,6 @@ void Main::StellarFeedback(double dTime, double dDelta) treeProxy.startReSmooth(&pSHG, CkCallbackResumeThread()); #endif treeProxy.finishNodeCache(CkCallbackResumeThread()); -#ifdef CUDA - // We didn't do gravity where the registered TreePieces on the - // DataManager normally get cleared. Clear them here instead. - if (nActiveGrav > param.nGpuMinParts) { - dMProxy.clearRegisteredPieces(CkCallbackResumeThread()); - } -#endif #ifdef SPLITGAS addDelParticles();//Don't forget to run an addDelParticles after a split diff --git a/starform.cpp b/starform.cpp index 5f504986..4acb53e5 100644 --- a/starform.cpp +++ b/starform.cpp @@ -186,13 +186,6 @@ void Main::FormStars(double dTime, double dDelta) } treeProxy.finishNodeCache(CkCallbackResumeThread()); -#ifdef CUDA - // We didn't do gravity where the registered TreePieces on the - // DataManager normally get cleared. Clear them here instead. - if (nActiveGrav > param.nGpuMinParts) { - dMProxy.clearRegisteredPieces(CkCallbackResumeThread()); - } -#endif addDelParticles(); double tSF = CkWallTimer() - startTime;