Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MCR-2742 New ids found during enrichment are added to the idPool befo… #1681

Draft
wants to merge 2 commits into
base: 2021.06.x
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@

package org.mycore.mods.enrichment;

import java.nio.charset.StandardCharsets;
import java.net.URLDecoder;
import java.util.Locale;

import org.jaxen.JaxenException;
import org.jdom2.Element;
import org.mycore.common.MCRException;
Expand Down Expand Up @@ -47,14 +51,22 @@ public String getValue() {
return value;
}

/**
 * Reduces an identifier to a canonical form for comparison.
 * <p>
 * The text is lower-cased (English locale), URL-decoded as UTF-8 and stripped
 * of all hyphens, so that e.g. {@code 1234-5678} and {@code 12345678}, or
 * {@code 10.1002/%28issn%29x} and {@code 10.1002/(issn)x}, simplify to the same string.
 * Note that {@link URLDecoder} also turns {@code '+'} into a space.
 * <p>
 * If the text contains a {@code '%'} that does not start a valid escape sequence,
 * it cannot be URL-decoded; in that case the lower-cased text is used undecoded
 * instead of propagating the {@link IllegalArgumentException}, so that callers
 * such as {@code equals}/{@code hashCode} never fail on such identifiers.
 *
 * @param id the identifier text, must not be {@code null}
 * @return the simplified identifier
 */
public static String simplifyID(String id) {
    String lowered = id.toLowerCase(Locale.ENGLISH);
    String decoded;
    try {
        decoded = URLDecoder.decode(lowered, StandardCharsets.UTF_8);
    } catch (IllegalArgumentException malformedEscape) {
        // A literal '%' that is not a percent-escape: treat the text as not URL-encoded.
        decoded = lowered;
    }
    return decoded.replace("-", "");
}

/**
 * Returns the simplified form of this identifier, obtained by passing
 * its string representation through {@code simplifyID(String)}.
 *
 * @return the simplified identifier text
 */
public String simplifiedID() {
    final String rendered = toString();
    return simplifyID(rendered);
}

/**
 * Compares this identifier with another object.
 * Only instances of MCRIdentifier can be equal to this one.
 *
 * NOTE(review): this block comes from a rendered diff and carries two return
 * statements; presumably the first is the removed line and the second its
 * replacement - confirm against the actual file.
 */
@Override
public boolean equals(Object other) {
// Variant 1: equal when both toString() representations match exactly.
return (other instanceof MCRIdentifier && this.toString().equals(other.toString()));
// Variant 2: equal when both simplifiedID() values match.
return (other instanceof MCRIdentifier && this.simplifiedID().equals(((MCRIdentifier)other).simplifiedID()));
}

/**
 * Computes the hash code of this identifier from a string form of it.
 *
 * NOTE(review): this block comes from a rendered diff and carries two return
 * statements; presumably the first is the removed line and the second its
 * replacement - confirm against the actual file.
 */
@Override
public int hashCode() {
// Variant 1: hash of the toString() representation.
return toString().hashCode();
// Variant 2: hash of the simplifiedID() value.
return simplifiedID().hashCode();
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@

import java.nio.charset.StandardCharsets;
import java.net.URLDecoder;
import java.util.Locale;

import org.jdom2.Element;
import org.mycore.mods.enrichment.MCRIdentifier;

/**
* Compares and merges mods:identifier elements.
Expand All @@ -43,8 +43,7 @@ private String getType() {
}

/**
 * Returns a simplified form of this element's normalized text.
 *
 * NOTE(review): this block comes from a rendered diff and carries two return
 * statements; presumably the first (two-line) one is the removed code and the
 * second its replacement - confirm against the actual file.
 */
private String getSimplifiedID() {
// Variant 1: lower-case (English locale), URL-decode as UTF-8, then remove all hyphens.
return URLDecoder.decode(this.element.getTextNormalize().toLowerCase(Locale.ENGLISH),StandardCharsets.UTF_8)
.replace("-","");
// Variant 2: delegate the simplification to MCRIdentifier.simplifyID.
return MCRIdentifier.simplifyID(this.element.getTextNormalize());
}

@Override
Expand All @@ -60,7 +59,9 @@ public boolean isProbablySameAs(MCRMerger other) {

/**
 * Conditionally replaces this element's text with the text of the other merger's element.
 *
 * NOTE(review): this block comes from a rendered diff and carries two "if" headers;
 * presumably the single-line one is the removed code and the three-line one its
 * replacement - confirm against the actual file.
 */
@Override
public void mergeFrom(MCRMerger other) {
// Variant 1: take the other text when this text has no hyphen but the other one does.
if (!this.element.getText().contains("-") && other.element.getText().contains("-")) {
// Variant 2: same hyphen rule, or: URL-decoding (UTF-8) changes this text
// while it leaves the other text unchanged.
if ((!this.element.getText().contains("-") && other.element.getText().contains("-")) ||
(!URLDecoder.decode(this.element.getText(),StandardCharsets.UTF_8).equals(this.element.getText())
&& URLDecoder.decode(other.element.getText(),StandardCharsets.UTF_8).equals(other.element.getText()))) {
this.element.setText(other.element.getText());
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,15 @@ public void testMergeSame() throws Exception {
String a = "[mods:identifier[@type='issn']='12345678']";
String b = "[mods:identifier[@type='issn']='1234-5678']";
MCRMergerTest.test(a, b, b);
MCRMergerTest.test(b, a, b);
}

/**
 * Runs MCRMergerTest.test for a DOI given once with percent-encoded parentheses
 * and once with literal parentheses, in both argument orders; the literal form
 * is passed as the third argument each time.
 * NOTE(review): presumably the third argument is the expected merge result - confirm
 * against MCRMergerTest.
 */
@Test
public void testMergeURLEncoded() throws Exception {
    final String encoded = "[mods:identifier[@type='doi']='10.1002/%28issn%291521-3765']";
    final String plain = "[mods:identifier[@type='doi']='10.1002/(issn)1521-3765']";

    MCRMergerTest.test(encoded, plain, plain);
    MCRMergerTest.test(plain, encoded, plain);
}

@Test
Expand All @@ -51,5 +60,6 @@ public void testCaseInsensitiveDOIs() throws Exception {
String a = "[mods:identifier[@type='doi']='10.1530/EJE-21-1086']";
String b = "[mods:identifier[@type='doi']='10.1530/eje-21-1086']";
MCRMergerTest.test(a, b, a);
MCRMergerTest.test(b, a, b);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,14 @@
</before>
<enrichmentIteration>
<newIdentifiersFound>
<mods:identifier xmlns:mods="http://www.loc.gov/mods/v3" type="issn">1234-5678</mods:identifier>
<mods:identifier xmlns:mods="http://www.loc.gov/mods/v3" type="issn">1234-6789</mods:identifier>
<mods:identifier xmlns:mods="http://www.loc.gov/mods/v3" type="issn">1000-9999</mods:identifier>
<mods:identifier xmlns:mods="http://www.loc.gov/mods/v3" type="issn">1234-6789</mods:identifier>
<mods:identifier xmlns:mods="http://www.loc.gov/mods/v3" type="issn">1234-5678</mods:identifier>
</newIdentifiersFound>
<resolved from="DSC">
<mods:relatedItem xmlns:mods="http://www.loc.gov/mods/v3" type="host">
<mods:note>from data source C for ISSN 1234-5678</mods:note>
<mods:identifier type="issn">1234-5678</mods:identifier>
<mods:note>from data source C for ISSN 1234-6789</mods:note>
<mods:identifier type="issn">1234-6789</mods:identifier>
</mods:relatedItem>
</resolved>
<afterMerge>
Expand All @@ -84,13 +84,13 @@
<mods:note>host from data source B</mods:note>
<mods:identifier type="issn">1234-5678</mods:identifier>
<mods:identifier type="issn">1234-6789</mods:identifier>
<mods:note>from data source C for ISSN 1234-5678</mods:note>
<mods:note>from data source C for ISSN 1234-6789</mods:note>
</mods:relatedItem>
</afterMerge>
<resolved from="DSC">
<mods:relatedItem xmlns:mods="http://www.loc.gov/mods/v3" type="host">
<mods:note>from data source C for ISSN 1234-6789</mods:note>
<mods:identifier type="issn">1234-6789</mods:identifier>
<mods:note>from data source C for ISSN 1234-5678</mods:note>
<mods:identifier type="issn">1234-5678</mods:identifier>
</mods:relatedItem>
</resolved>
<afterMerge>
Expand All @@ -100,9 +100,9 @@
<mods:note>host from data source B</mods:note>
<mods:identifier type="issn">1234-5678</mods:identifier>
<mods:identifier type="issn">1234-6789</mods:identifier>
<mods:note>from data source C for ISSN 1234-5678</mods:note>
<mods:note>from data source C for ISSN 1234-6789</mods:note>
<mods:note>from data source C for ISSN 1234-5678</mods:note>
</mods:relatedItem>
</afterMerge>
</enrichmentIteration>
</debugEnrichment>
</debugEnrichment>
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
<mods:relatedItem type="host">
<mods:note>host from data source A</mods:note>
<mods:note>host from data source B</mods:note>
<mods:note>from data source C for ISSN 1234-5678</mods:note>
<mods:note>from data source C for ISSN 1234-6789</mods:note>
<mods:note>from data source C for ISSN 1234-5678</mods:note>
<mods:identifier type="issn">1000-9999</mods:identifier>
<mods:identifier type="issn">1234-5678</mods:identifier>
<mods:identifier type="issn">1234-6789</mods:identifier>
Expand Down