AccessionDuplicateFinder.java
/*
* Copyright 2022 Global Crop Diversity Trust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.gringlobal.worker.dupe;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.genesys.blocks.model.filters.StringFilter;
import org.gringlobal.custom.elasticsearch.SearchException;
import org.gringlobal.model.Accession;
import org.gringlobal.model.AccessionInvName;
import org.gringlobal.model.Inventory;
import org.gringlobal.service.AccessionService;
import org.gringlobal.service.filter.AccessionFilter;
import org.gringlobal.service.filter.SiteFilter;
import org.gringlobal.service.filter.TaxonomyGenusFilter;
import org.gringlobal.service.filter.TaxonomySpeciesFilter;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Page;
import org.springframework.data.domain.PageRequest;
import org.springframework.stereotype.Component;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
/**
 * {@link DuplicateFinder} implementation for {@link Accession} records.
 *
 * <p>Candidates are collected by running a series of independent queries through
 * {@link AccessionService#list}, one per identifying field of the target (DOI, FAO institute
 * number, accession number, accession number part 1, genus name, preferred name). Each candidate
 * is then scored by {@link #scoreHit(Accession, Hit)}.</p>
 *
 * <p>The helpers {@code toSafeEsQuery}, {@code notNullEquals}, {@code stringsAndNumbersCompare}
 * and {@code similarityScore} are inherited from the superclass and are not visible here;
 * NOTE(review): their exact semantics should be confirmed against {@code DuplicateFinder}.</p>
 */
@Component
@Slf4j
public class AccessionDuplicateFinder extends DuplicateFinder<Accession> {

    /** Page size used for every individual candidate query. */
    private static final int MAX_MATCHES_PER_QUERY = 20;

    @Autowired
    private AccessionService accessionService;

    /**
     * Returns the score threshold used by the superclass.
     *
     * @return the constant {@code 1000}
     */
    @Override
    protected double getBestScoreThreshold() {
        return 1000d;
    }

    /**
     * Collects accessions that may be duplicates of {@code target}.
     *
     * <p>One query is executed for every non-blank identifying field of the target. Each query
     * excludes the target itself, the ids in {@code excludedById} and every candidate found by
     * the preceding queries, so the returned list does not repeat accessions found earlier.
     * A failing query is logged and skipped; it does not abort the remaining queries.</p>
     *
     * @param target the accession to find duplicates for; must not be {@code null}
     * @param excludedById ids of accessions that must not be returned; may be {@code null} or empty
     * @return the candidates found, in the order the queries produced them; never {@code null}
     */
    @Override
    protected List<Accession> getCandidates(Accession target, Collection<Long> excludedById) {
        assert (target != null);
        log.info("Searching for duplicates of {}", target);

        List<Accession> candidates = new ArrayList<>(100);

        // Accession fields for search: doi, faoInstituteNumber, accessionNumber, accessionNumberPart1, genusName, preferredName
        var doi = target.getDoi();
        var site = target.getSite();
        var faoInstituteNumber = site != null ? site.getFaoInstituteNumber() : null;
        var accessionNumber = target.getAccessionNumber();
        var accessionNumberPart1 = target.getAccessionNumberPart1();
        var taxonomy = target.getTaxonomySpecies();
        var genus = taxonomy != null ? taxonomy.getTaxonomyGenus() : null;
        var genusName = genus != null ? genus.getGenusName() : null;
        var preferredName = target.getPreferredName();

        // NOTE: every filter must be created right before its query, because the exclusion list
        // built by getCandidatesFilter depends on the candidates found so far.

        // By doi
        if (StringUtils.isNotBlank(doi)) {
            AccessionFilter filter = getCandidatesFilter(target, excludedById, candidates);
            filter.doi().add(doi);
            collectMatches(filter, candidates);
        }

        // By faoInstituteNumber
        if (StringUtils.isNotBlank(faoInstituteNumber)) {
            AccessionFilter filter = getCandidatesFilter(target, excludedById, candidates);
            filter
                .site(new SiteFilter().faoInstituteNumber(new StringFilter().eq(Set.of(faoInstituteNumber))))
                ._text(toSafeEsQuery(faoInstituteNumber));
            collectMatches(filter, candidates);
        }

        // By accession number
        if (StringUtils.isNotBlank(accessionNumber)) {
            AccessionFilter filter = getCandidatesFilter(target, excludedById, candidates);
            filter.accessionNumber(Set.of(accessionNumber));
            collectMatches(filter, candidates);
        }

        // By accession number part 1
        if (StringUtils.isNotBlank(accessionNumberPart1)) {
            AccessionFilter filter = getCandidatesFilter(target, excludedById, candidates);
            filter
                .accessionNumberPart1(new StringFilter().eq(Set.of(accessionNumberPart1)))
                ._text(toSafeEsQuery(accessionNumberPart1));
            collectMatches(filter, candidates);
        }

        // By genus
        if (StringUtils.isNotBlank(genusName)) {
            AccessionFilter filter = getCandidatesFilter(target, excludedById, candidates);
            filter
                .taxonomySpecies(
                    new TaxonomySpeciesFilter().taxonomyGenus(new TaxonomyGenusFilter().genusName(new StringFilter().eq(Set.of(genusName))))
                )
                ._text(toSafeEsQuery(genusName));
            collectMatches(filter, candidates);
        }

        // By preferred name
        if (StringUtils.isNotBlank(preferredName)) {
            AccessionFilter filter = getCandidatesFilter(target, excludedById, candidates);
            filter
                .preferredName(new StringFilter().eq(Set.of(preferredName)))
                ._text(toSafeEsQuery(preferredName));
            collectMatches(filter, candidates);
        }

        return candidates;
    }

    /**
     * Runs one candidate query and appends the first page of results to {@code candidates}.
     * A {@link SearchException} is logged (with its stack trace) and otherwise ignored so that
     * the remaining queries can still run.
     *
     * @param filter the filter to query with
     * @param candidates the list the matches are appended to
     */
    private void collectMatches(AccessionFilter filter, List<Accession> candidates) {
        try {
            log.info("Filtering for aliases {}", filter);
            Page<Accession> matches = accessionService.list(filter, PageRequest.of(0, MAX_MATCHES_PER_QUERY));
            candidates.addAll(matches.getContent());
        } catch (SearchException e) {
            // Best effort: keep going, but keep the cause visible in the log
            log.warn("Candidate search failed: {}", e.getMessage(), e);
        }
    }

    /**
     * Scores a candidate against the target, starting from the score already present on the hit.
     *
     * <p>Bonuses added on top of {@code hit.score}:</p>
     * <ul>
     * <li>accession number (lower-cased): +500 when equal, otherwise
     * {@code stringsAndNumbersCompare(...) * 500}</li>
     * <li>DOI equal: +500</li>
     * <li>accession number part 1 (lower-cased) equal: +100</li>
     * <li>genus name equal: +100; species name equal: +200 (both only when the target and the
     * candidate have a taxonomy and the target taxonomy has a genus)</li>
     * <li>preferred name: +100 when equal, otherwise {@code similarityScore(...) * 100}</li>
     * <li>FAO institute number of the sites equal: +100</li>
     * <li>+100 for each target name whose plant name also appears among the names of the
     * candidate's inventories; the matching plant name is added to {@code hit.matches}</li>
     * </ul>
     *
     * <p>The inherited comparison helpers also receive {@code hit.matches}; presumably they
     * record what matched there (confirm against the superclass).</p>
     *
     * @param target the accession duplicates are searched for
     * @param hit the candidate hit; its {@code score} field is overwritten with the result
     * @return the computed score
     */
    @Override
    protected double scoreHit(Accession target, Hit<Accession> hit) {
        double score = hit.score;
        var candidate = hit.result;

        var targetAcceNumb = StringUtils.lowerCase(target.getAccessionNumber());
        var candidateAcceNumb = StringUtils.lowerCase(candidate.getAccessionNumber());
        if (notNullEquals(hit.matches, candidateAcceNumb, targetAcceNumb)) {
            score += 500;
        } else {
            score += stringsAndNumbersCompare(hit.matches, candidateAcceNumb, targetAcceNumb) * 500;
        }

        if (notNullEquals(hit.matches, candidate.getDoi(), target.getDoi())) {
            score += 500;
        }

        var targetAcceNumbPart1 = StringUtils.lowerCase(target.getAccessionNumberPart1());
        var candidateAcceNumbPart1 = StringUtils.lowerCase(candidate.getAccessionNumberPart1());
        if (notNullEquals(hit.matches, targetAcceNumbPart1, candidateAcceNumbPart1)) {
            score += 100;
        }

        var targetTaxonomy = target.getTaxonomySpecies();
        var candidateTaxonomy = candidate.getTaxonomySpecies();
        if (candidateTaxonomy != null && targetTaxonomy != null && targetTaxonomy.getTaxonomyGenus() != null) {
            // The candidate's genus may be missing as well: skip the genus comparison instead of
            // failing with a NullPointerException.
            var candidateGenus = candidateTaxonomy.getTaxonomyGenus();
            if (candidateGenus != null
                && notNullEquals(hit.matches, candidateGenus.getGenusName(), targetTaxonomy.getTaxonomyGenus().getGenusName())) {
                score += 100;
            }
            if (notNullEquals(hit.matches, candidateTaxonomy.getSpeciesName(), targetTaxonomy.getSpeciesName())) {
                score += 200;
            }
        }

        if (notNullEquals(hit.matches, candidate.getPreferredName(), target.getPreferredName())) {
            score += 100;
        } else {
            score += similarityScore(hit.matches, candidate.getPreferredName(), target.getPreferredName()) * 100;
        }

        var targetInstitute = target.getSite();
        var candidateInstitute = candidate.getSite();
        if (targetInstitute != null && candidateInstitute != null) {
            if (notNullEquals(hit.matches, candidateInstitute.getFaoInstituteNumber(), targetInstitute.getFaoInstituteNumber())) {
                score += 100;
            }
        }

        var targetInvNames = target.getNames();
        if (CollectionUtils.isNotEmpty(targetInvNames)) {
            var candidateInventories = candidate.getInventories();
            if (CollectionUtils.isNotEmpty(candidateInventories)) {
                var candidatePlantNames = candidateInventories.stream().map(Inventory::getNames)
                    .filter(Objects::nonNull)
                    .flatMap(Collection::stream)
                    .map(AccessionInvName::getPlantName)
                    .filter(Objects::nonNull)
                    .collect(Collectors.toSet());

                // Explicit loop: recording the match is a real side effect, not debugging output
                int matchedNames = 0;
                for (var targetName : targetInvNames) {
                    var targetPlantName = targetName.getPlantName();
                    if (targetPlantName != null && candidatePlantNames.contains(targetPlantName)) {
                        hit.matches.add(targetPlantName);
                        matchedNames++;
                    }
                }
                score += 100d * matchedNames;
            }
        }

        hit.score = score;
        return score;
    }

    /**
     * Creates the base filter for one candidate query. The filter's {@code NOT} part excludes,
     * by id, the target itself (when it has an id), the ids in {@code excludedById} and the
     * candidates already found. {@code NOT} is left unset when there is nothing to exclude.
     *
     * @param target the accession duplicates are searched for
     * @param excludedById ids that must not be returned; may be {@code null} or empty
     * @param candidates candidates found so far; may be {@code null} or empty
     * @return a new filter carrying only the exclusions
     */
    private AccessionFilter getCandidatesFilter(Accession target, Collection<Long> excludedById, List<Accession> candidates) {
        AccessionFilter filter = new AccessionFilter();
        if (target.getId() != null) {
            filter.NOT(new AccessionFilter());
            filter.NOT.id().add(target.getId()); // Not this
        }
        if (!CollectionUtils.isEmpty(excludedById)) {
            if (filter.NOT == null) {
                filter.NOT(new AccessionFilter());
            }
            filter.NOT.id().addAll(excludedById);
        }
        if (!CollectionUtils.isEmpty(candidates)) {
            if (filter.NOT == null) {
                filter.NOT(new AccessionFilter());
            }
            filter.NOT.id().addAll(candidates.stream().map(Accession::getId).collect(Collectors.toSet())); // Not already found
        }
        return filter;
    }
}