Skip to content

Commit

Permalink
Harvester / Remove records by harvester UUID
Browse files Browse the repository at this point in the history
When a harvester contains a lot of records, removing them takes a while and can even fail with heap space errors.

Try to improve performance by using delete-by-query (instead of looping over each record)
eg. 1500 records
* Select > Delete all = 2min
* Harvester > Remove records = 700ms

This will bypass events but maybe that is fine for harvested records?

Maybe there is a better JPA alternative for this kind of query?
  • Loading branch information
fxprunayre committed Oct 14, 2024
1 parent 9afd1bf commit d151b55
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -24,20 +24,17 @@
package org.fao.geonet.repository;

import java.util.List;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

import org.fao.geonet.domain.Metadata;
import org.springframework.data.jpa.repository.JpaSpecificationExecutor;
import org.springframework.data.jpa.repository.Modifying;
import org.springframework.data.jpa.repository.Query;
import org.springframework.data.repository.query.Param;
import org.springframework.transaction.annotation.Transactional;

/**
* Data Access object for the {@link Metadata} entities.
*
* <p>
* The use of this class is discouraged, you should use IMetadataUtils or IMetadataManager instead.
*
* @author Jesse
Expand All @@ -60,12 +57,12 @@ public interface MetadataRepository extends GeonetRepository<Metadata, Integer>,

/**
* Find all metadata by the metadata's uuid.
*
* <p>
* The result is a list rather than a single entity, and the method is annotated
* {@code @Nullable}, so callers should be prepared for a {@code null} result.
*
* @param uuid the uuid of the metadata to find; must not be {@code null}
* @return a list of metadata.
*/
@Nullable
List<Metadata> findAllByUuid(@Nonnull String uuid);
*
* @param uuid the uuid of the metadata to find
* @return a list of metadata.
*/
@Nullable
List<Metadata> findAllByUuid(@Nonnull String uuid);

/**
* Find all metadata harvested by the identified harvester.
Expand All @@ -76,7 +73,77 @@ public interface MetadataRepository extends GeonetRepository<Metadata, Integer>,
@Nonnull
List<Metadata> findAllByHarvestInfo_Uuid(@Nonnull String uuid);

/**
 * Count the metadata harvested by the identified harvester.
 *
 * <p>
 * Counting counterpart of {@code findAllByHarvestInfo_Uuid}: same filter, but only the number
 * of matching records is returned.
 *
 * @param uuid the uuid of the harvester; must not be {@code null}
 * @return the number of metadata records matching that harvester uuid
 */
int countByHarvestInfo_Uuid(@Nonnull String uuid);


/**
 * List the distinct sources of the metadata harvested by the identified harvester.
 *
 * <p>
 * Runs a native SQL query on the {@code metadata} table, filtering on its
 * {@code harvestuuid} column and returning the distinct values of its {@code source} column.
 *
 * @param harvesterUuid the uuid of the harvester whose records are inspected
 * @return the distinct {@code source} values of the matching records
 */
@Query(value = "SELECT distinct(source) FROM metadata WHERE harvestuuid = :harvesterUuid",
    nativeQuery = true)
List<String> findDistinctSourcesByHarvestInfo__uuid(@Param("harvesterUuid") String harvesterUuid);


/**
 * Delete, with a single native SQL statement, every {@code operationallowed} row whose
 * {@code metadataid} belongs to a metadata record harvested by the identified harvester.
 *
 * @param harvesterUuid the uuid of the harvester (matched against {@code metadata.harvestuuid})
 */
@Query(value = "DELETE FROM operationallowed WHERE metadataid IN (SELECT id FROM metadata WHERE harvestuuid = :harvesterUuid)",
nativeQuery = true)
@Modifying
void deleteAllOperationAllowedByHarvesterUuid(@Param("harvesterUuid") String harvesterUuid);

/**
 * Delete, with a single native SQL statement, every {@code metadatarating} row whose
 * {@code metadataid} belongs to a metadata record harvested by the identified harvester.
 *
 * @param harvesterUuid the uuid of the harvester (matched against {@code metadata.harvestuuid})
 */
@Query(value = "DELETE FROM metadatarating WHERE metadataid IN (SELECT id FROM metadata WHERE harvestuuid = :harvesterUuid)",
nativeQuery = true)
@Modifying
void deleteAllMetadataRatingByHarvesterUuid(@Param("harvesterUuid") String harvesterUuid);

/**
 * Delete, with a single native SQL statement, every {@code validation} row whose
 * {@code metadataid} belongs to a metadata record harvested by the identified harvester.
 *
 * @param harvesterUuid the uuid of the harvester (matched against {@code metadata.harvestuuid})
 */
@Query(value = "DELETE FROM validation WHERE metadataid IN (SELECT id FROM metadata WHERE harvestuuid = :harvesterUuid)",
nativeQuery = true)
@Modifying
void deleteAllValidationByHarvesterUuid(@Param("harvesterUuid") String harvesterUuid);

/**
 * Delete, with a single native SQL statement, every {@code usersavedselections} row that
 * points to a metadata record harvested by the identified harvester.
 *
 * <p>
 * Unlike the sibling delete queries, the link is made on the record uuid
 * ({@code metadatauuid} against {@code metadata.uuid}), not on the numeric id.
 *
 * @param harvesterUuid the uuid of the harvester (matched against {@code metadata.harvestuuid})
 */
@Query(value = "DELETE FROM usersavedselections WHERE metadatauuid IN (SELECT uuid FROM metadata WHERE harvestuuid = :harvesterUuid)",
nativeQuery = true)
@Modifying
void deleteAllUsersavedselectionsByHarvesterUuid(@Param("harvesterUuid") String harvesterUuid);

/**
 * Delete, with a single native SQL statement, every {@code metadatafiledownloads} row whose
 * {@code metadataid} belongs to a metadata record harvested by the identified harvester.
 *
 * @param harvesterUuid the uuid of the harvester (matched against {@code metadata.harvestuuid})
 */
@Query(value = "DELETE FROM metadatafiledownloads WHERE metadataid IN (SELECT id FROM metadata WHERE harvestuuid = :harvesterUuid)",
nativeQuery = true)
@Modifying
void deleteAllMetadatafiledownloadsByHarvesterUuid(@Param("harvesterUuid") String harvesterUuid);

/**
 * Delete, with a single native SQL statement, every {@code metadatafileuploads} row whose
 * {@code metadataid} belongs to a metadata record harvested by the identified harvester.
 *
 * @param harvesterUuid the uuid of the harvester (matched against {@code metadata.harvestuuid})
 */
@Query(value = "DELETE FROM metadatafileuploads WHERE metadataid IN (SELECT id FROM metadata WHERE harvestuuid = :harvesterUuid)",
nativeQuery = true)
@Modifying
void deleteAllMetadatafileuploadsByHarvesterUuid(@Param("harvesterUuid") String harvesterUuid);

/**
 * Delete, with a single native SQL statement, every {@code metadatastatus} row whose
 * {@code metadataid} belongs to a metadata record harvested by the identified harvester.
 *
 * @param harvesterUuid the uuid of the harvester (matched against {@code metadata.harvestuuid})
 */
@Query(value = "DELETE FROM metadatastatus WHERE metadataid IN (SELECT id FROM metadata WHERE harvestuuid = :harvesterUuid)",
nativeQuery = true)
@Modifying
void deleteAllMetadatastatusByHarvesterUuid(@Param("harvesterUuid") String harvesterUuid);

/**
 * Delete, with a single native SQL statement, every {@code metadatalink} row whose
 * {@code metadataid} belongs to a metadata record harvested by the identified harvester.
 *
 * @param harvesterUuid the uuid of the harvester (matched against {@code metadata.harvestuuid})
 */
@Query(value = "DELETE FROM metadatalink WHERE metadataid IN (SELECT id FROM metadata WHERE harvestuuid = :harvesterUuid)",
nativeQuery = true)
@Modifying
void deleteAllMetadatalinkByHarvesterUuid(@Param("harvesterUuid") String harvesterUuid);

/**
 * Delete, with a single native SQL statement, every {@code metadatacateg} row whose
 * {@code metadataid} belongs to a metadata record harvested by the identified harvester.
 *
 * @param harvesterUuid the uuid of the harvester (matched against {@code metadata.harvestuuid})
 */
@Query(value = "DELETE FROM metadatacateg WHERE metadataid IN (SELECT id FROM metadata WHERE harvestuuid = :harvesterUuid)",
nativeQuery = true)
@Modifying
void deleteAllMetadatacategByHarvesterUuid(@Param("harvesterUuid") String harvesterUuid);

/**
 * Delete, with a single native SQL statement, every {@code metadata} row harvested by the
 * identified harvester.
 *
 * <p>
 * Only the {@code metadata} table is touched by this statement; rows in other tables that
 * reference the deleted records are not removed here.
 *
 * @param harvesterUuid the uuid of the harvester (matched against {@code metadata.harvestuuid})
 */
@Query(value = "DELETE FROM metadata WHERE harvestuuid = :harvesterUuid",
nativeQuery = true)
@Modifying
void deleteAllMetadataByHarvesterUuid(@Param("harvesterUuid") String harvesterUuid);

/**
 * Delete every metadata record harvested by the identified harvester, together with the rows
 * of the related tables that reference those records.
 *
 * <p>
 * Each step is one native bulk DELETE, so records are not loaded or removed one by one.
 * The related tables are emptied first because their statements select the affected records
 * from the {@code metadata} table; the {@code metadata} rows themselves are deleted last.
 *
 * <p>
 * NOTE(review): the steps are independent modifying statements; confirm that callers invoke
 * this within a transaction so that a failure midway cannot leave a partial deletion.
 *
 * @param harvesterUuid the uuid of the harvester whose records must be removed
 */
default void deleteAllByHarvesterUuid(String harvesterUuid) {
    // Rows referencing the harvested records go first (each statement runs exactly once).
    deleteAllOperationAllowedByHarvesterUuid(harvesterUuid);
    deleteAllMetadataRatingByHarvesterUuid(harvesterUuid);
    deleteAllValidationByHarvesterUuid(harvesterUuid);
    deleteAllUsersavedselectionsByHarvesterUuid(harvesterUuid);
    deleteAllMetadatafiledownloadsByHarvesterUuid(harvesterUuid);
    deleteAllMetadatafileuploadsByHarvesterUuid(harvesterUuid);
    deleteAllMetadatastatusByHarvesterUuid(harvesterUuid);
    deleteAllMetadatalinkByHarvesterUuid(harvesterUuid);
    deleteAllMetadatacategByHarvesterUuid(harvesterUuid);
    // The records themselves last: the sub-selects above need them to still exist.
    deleteAllMetadataByHarvesterUuid(harvesterUuid);
}

@Query(value = "SELECT replace(data, :search, :replace) FROM metadata m " +
"WHERE uuid = :uuid",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,15 @@
import org.fao.geonet.kernel.AccessManager;
import org.fao.geonet.kernel.DataManager;
import org.fao.geonet.kernel.HarvestInfoProvider;
import org.fao.geonet.kernel.datamanager.IMetadataManager;
import org.fao.geonet.kernel.harvest.Common.OperResult;
import org.fao.geonet.kernel.harvest.harvester.AbstractHarvester;
import org.fao.geonet.kernel.harvest.harvester.AbstractParams;
import org.fao.geonet.kernel.harvest.harvester.HarversterJobListener;
import org.fao.geonet.kernel.search.EsSearchManager;
import org.fao.geonet.kernel.setting.HarvesterSettingsManager;
import org.fao.geonet.repository.HarvestHistoryRepository;
import org.fao.geonet.repository.MetadataRepository;
import org.fao.geonet.repository.specification.MetadataSpecs;
import org.fao.geonet.utils.Log;
import org.fao.geonet.utils.Xml;
Expand Down Expand Up @@ -83,6 +86,8 @@ public class HarvestManagerImpl implements HarvestInfoProvider, HarvestManager {
private ServiceContext context;
private boolean readOnly;
private ConfigurableApplicationContext applicationContext;
protected MetadataRepository metadataRepository;
protected EsSearchManager searchManager;
private Map<String, AbstractHarvester> hmHarvesters = new HashMap<>();
private Map<String, AbstractHarvester> hmHarvestLookup = new HashMap<>();

Expand All @@ -108,6 +113,8 @@ public ConfigurableApplicationContext getApplicationContext() {
public void init(ServiceContext context, boolean isReadOnly) throws Exception {
this.context = context;
this.dataMan = context.getBean(DataManager.class);
this.metadataRepository = context.getBean(MetadataRepository.class);
this.searchManager = context.getBean(EsSearchManager.class);
this.settingMan = context.getBean(HarvesterSettingsManager.class);
this.translationPackBuilder = context.getBean(TranslationPackBuilder.class);

Expand Down Expand Up @@ -689,9 +696,10 @@ public synchronized OperResult clearBatch(String id) throws Exception {
long elapsedTime = System.currentTimeMillis();

String harvesterUUID = ah.getParams().getUuid();
int numberOfRecordsRemoved = metadataRepository.countByHarvestInfo_Uuid(harvesterUUID);
metadataRepository.deleteAllByHarvesterUuid(harvesterUUID);
searchManager.delete(String.format("+harvesterUuid:\"%s\"", harvesterUUID));

final Specification<Metadata> specification = (Specification<Metadata>) MetadataSpecs.hasHarvesterUuid(harvesterUUID);
int numberOfRecordsRemoved = dataMan.batchDeleteMetadataAndUpdateIndex(specification);
ah.emptyResult();
elapsedTime = (System.currentTimeMillis() - elapsedTime) / 1000;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

package org.fao.geonet.kernel.harvest.harvester;

import com.google.common.io.Files;
import jeeves.server.UserSession;
import jeeves.server.context.ServiceContext;
import org.apache.commons.lang.StringUtils;
Expand All @@ -49,11 +50,13 @@
import org.fao.geonet.kernel.datamanager.IMetadataUtils;
import org.fao.geonet.kernel.harvest.Common.OperResult;
import org.fao.geonet.kernel.harvest.Common.Status;
import org.fao.geonet.kernel.search.EsSearchManager;
import org.fao.geonet.kernel.setting.HarvesterSettingsManager;
import org.fao.geonet.kernel.setting.SettingManager;
import org.fao.geonet.kernel.setting.Settings;
import org.fao.geonet.repository.GroupRepository;
import org.fao.geonet.repository.HarvestHistoryRepository;
import org.fao.geonet.repository.MetadataRepository;
import org.fao.geonet.repository.SortUtils;
import org.fao.geonet.repository.SourceRepository;
import org.fao.geonet.repository.UserRepository;
Expand Down Expand Up @@ -81,6 +84,7 @@
import java.io.File;
import java.io.IOException;
import java.net.UnknownHostException;
import java.nio.file.FileSystem;
import java.sql.SQLException;
import java.time.OffsetDateTime;
import java.time.ZoneOffset;
Expand Down Expand Up @@ -128,6 +132,8 @@ public abstract class AbstractHarvester<T extends HarvestResult, P extends Abstr
protected DataManager dataMan;
protected IMetadataManager metadataManager;
protected IMetadataUtils metadataUtils;
protected MetadataRepository metadataRepository;
protected EsSearchManager searchManager;

protected P params;
protected T result;
Expand Down Expand Up @@ -169,9 +175,11 @@ protected void setContext(ServiceContext context) {
this.context = context;
this.dataMan = context.getBean(DataManager.class);
this.metadataUtils = context.getBean(IMetadataUtils.class);
this.metadataRepository = context.getBean(MetadataRepository.class);
this.harvesterSettingsManager = context.getBean(HarvesterSettingsManager.class);
this.settingManager = context.getBean(SettingManager.class);
this.metadataManager = context.getBean(IMetadataManager.class);
this.searchManager = context.getBean(EsSearchManager.class);
}

public void add(Element node) throws BadInputEx, SQLException {
Expand Down Expand Up @@ -271,22 +279,18 @@ public void destroy() throws Exception {
if (lock.tryLock(10, TimeUnit.SECONDS)) {

doUnschedule();

final IMetadataUtils metadataRepository = context.getBean(IMetadataUtils.class);
final IMetadataUtils metadataUtils = context.getBean(IMetadataUtils.class);
final SourceRepository sourceRepository = context.getBean(SourceRepository.class);
final Resources resources = context.getBean(Resources.class);

final Specification<? extends AbstractMetadata> ownedByHarvester = Specification.where(MetadataSpecs.hasHarvesterUuid(getParams().getUuid()));
Set<String> sources = new HashSet<>();
for (Integer metadataId : metadataRepository.findAllIdsBy(ownedByHarvester)) {
sources.add(metadataUtils.findOne(metadataId).getSourceInfo().getSourceId());
metadataManager.deleteMetadata(context, "" + metadataId);
}
List<String> sources = metadataRepository.findDistinctSourcesByHarvestInfo__uuid(getParams().getUuid());
metadataRepository.deleteAllByHarvesterUuid(getParams().getUuid());
searchManager.delete(String.format("+harvesterUuid:\"%s\"", getParams().getUuid()));

// Remove all sources related to the harvestUuid if they are not linked to any record anymore
for (String sourceUuid : sources) {
Long ownedBySource =
metadataRepository.count(Specification.where(MetadataSpecs.hasSource(sourceUuid)));
metadataUtils.count(Specification.where(MetadataSpecs.hasSource(sourceUuid)));
if (ownedBySource == 0
&& !sourceUuid.equals(params.getUuid())
&& sourceRepository.existsById(sourceUuid)) {
Expand Down

0 comments on commit d151b55

Please sign in to comment.