// CooperatorDuplicateFinder.java
/*
* Copyright 2021 Global Crop Diversity Trust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.gringlobal.worker.dupe;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.genesys.blocks.model.filters.StringFilter;
import org.gringlobal.custom.elasticsearch.SearchException;
import org.gringlobal.model.Cooperator;
import org.gringlobal.service.CooperatorService;
import org.gringlobal.service.filter.CooperatorFilter;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Page;
import org.springframework.data.domain.PageRequest;
import org.springframework.stereotype.Component;
/**
 * Finds potential duplicates of a {@link Cooperator}.
 *
 * <p>Candidates are collected with up to three searches through {@link CooperatorService}:
 * an exact match on email, a free-text search over email, names and organization, and a
 * free-text search over the first address line. Each candidate is then scored field by
 * field in {@code scoreHit}.</p>
 */
@Component
@Slf4j
public class CooperatorDuplicateFinder extends DuplicateFinder<Cooperator> {

    /** Maximum number of candidates requested from each individual search. */
    private static final int MAX_CANDIDATES_PER_SEARCH = 20;

    @Autowired
    private CooperatorService cooperatorService;

    /**
     * Score threshold reported to the base finder.
     *
     * @return 800
     */
    @Override
    protected double getBestScoreThreshold() {
        return 800d;
    }

    /**
     * Collects duplicate candidates for the target cooperator.
     *
     * <p>Every search excludes the target itself, the ids in {@code excludedById} and the
     * candidates found by earlier searches, so the returned list has no repeated hits.
     * A failing search is logged and skipped; the remaining searches still run.</p>
     *
     * @param target the cooperator to find duplicates of, must not be {@code null}
     * @param excludedById ids that must not be returned, may be {@code null} or empty
     * @return the candidates found, possibly empty
     */
    @Override
    protected List<Cooperator> getCandidates(Cooperator target, Collection<Long> excludedById) {
        assert (target != null);
        log.info("Searching for duplicates of {}", target);

        List<Cooperator> candidates = new ArrayList<>(MAX_CANDIDATES_PER_SEARCH);

        // By email (exact match)
        if (StringUtils.isNotBlank(target.getEmail())) {
            var filter = getCandidatesFilter(target, excludedById, candidates);
            filter.email = new StringFilter().eq(Set.of(target.getEmail()));
            collectMatches("email", filter, null, candidates);
        }

        // By names: joinWith renders null parts as empty strings, normalizeSpace removes the gaps
        var nameText = StringUtils.normalizeSpace(StringUtils.joinWith(" ",
            target.getEmail(),
            target.getFirstName(),
            target.getLastName(),
            target.getOrganization(),
            target.getOrganizationAbbrev()));
        if (StringUtils.isNotBlank(nameText)) {
            // Skipped when there is nothing to search for; a blank query carries no information
            collectMatches("names", getCandidatesFilter(target, excludedById, candidates), nameText, candidates);
        }

        // By first address line
        if (StringUtils.isNotBlank(target.getAddressLine1())) {
            collectMatches("address", getCandidatesFilter(target, excludedById, candidates), target.getAddressLine1(), candidates);
        }

        return candidates;
    }

    /**
     * Runs one candidate search and appends its first page of results to {@code candidates}.
     *
     * @param criterion short label of the search, used in log messages only
     * @param filter the prepared filter
     * @param freeText raw text for the filter's {@code _text} query, passed through
     *        {@code toSafeEsQuery} before use; {@code null} when the filter needs no text query
     * @param candidates the list to append matches to
     */
    private void collectMatches(String criterion, CooperatorFilter filter, String freeText, List<Cooperator> candidates) {
        try {
            if (freeText != null) {
                // Same treatment for every free-text query (names and address)
                filter._text = toSafeEsQuery(freeText);
            }
            log.info("Filtering for {} {}", criterion, filter);
            Page<Cooperator> matches = cooperatorService.list(filter, PageRequest.of(0, MAX_CANDIDATES_PER_SEARCH));
            candidates.addAll(matches.getContent());
        } catch (SearchException e) {
            // Best effort: one failed search must not abort the others. Keep the cause in the log.
            log.warn("Search for duplicate candidates by {} failed: {}", criterion, e.getMessage(), e);
        }
    }

    /**
     * Creates a filter that excludes records which must not be reported as candidates.
     *
     * @param target the target cooperator; its id is excluded when set
     * @param excludedById additional ids to exclude, may be {@code null} or empty
     * @param candidates candidates already found; their ids are excluded
     * @return a new filter with a {@code NOT} sub-filter on id when there is anything to
     *         exclude, otherwise an empty filter
     */
    private CooperatorFilter getCandidatesFilter(Cooperator target, Collection<Long> excludedById, List<Cooperator> candidates) {
        Set<Long> excludedIds = new HashSet<>();
        if (target.getId() != null) {
            excludedIds.add(target.getId()); // Not this
        }
        if (!CollectionUtils.isEmpty(excludedById)) {
            excludedIds.addAll(excludedById);
        }
        if (!CollectionUtils.isEmpty(candidates)) {
            excludedIds.addAll(candidates.stream().map(Cooperator::getId).collect(Collectors.toSet())); // Not already found
        }

        CooperatorFilter filter = new CooperatorFilter();
        if (!excludedIds.isEmpty()) {
            filter.NOT(new CooperatorFilter());
            filter.NOT.id(new HashSet<>());
            filter.NOT.id().addAll(excludedIds);
        }
        return filter;
    }

    /**
     * Scores a candidate against the target by adding per-field bonuses to the hit's
     * current score.
     *
     * <p>Weights: first name 250, last name 300 and organization 300 (full weight on an
     * exact match, otherwise weight times similarity); any email match across primary and
     * secondary addresses 300 (counted once); organization abbreviation 50; category code 20;
     * title, discipline code and job 10 each; each of the four primary/secondary address
     * pairings up to 200 by similarity.</p>
     *
     * @param target the cooperator duplicates are searched for
     * @param hit the search hit holding the candidate; its {@code score} is updated
     * @return the updated score, also stored in {@code hit.score}
     */
    @Override
    protected double scoreHit(Cooperator target, Hit<Cooperator> hit) {
        var candidate = hit.result;
        var score = hit.score;

        // could be Max | Maxim | Maxym | Maksim | Maksym
        score += exactOrSimilar(hit, candidate.getFirstName(), target.getFirstName(), 250);
        score += exactOrSimilar(hit, candidate.getLastName(), target.getLastName(), 300);

        // Any pairing of primary/secondary emails counts, but only once
        if (notNullEquals(hit.matches, candidate.getEmail(), target.getEmail())) {
            score += 300;
        } else if (notNullEquals(hit.matches, candidate.getSecondaryEmail(), target.getSecondaryEmail())) {
            score += 300;
        } else if (notNullEquals(hit.matches, candidate.getEmail(), target.getSecondaryEmail())) {
            score += 300;
        } else if (notNullEquals(hit.matches, candidate.getSecondaryEmail(), target.getEmail())) {
            score += 300;
        }

        if (notNullEquals(hit.matches, candidate.getTitle(), target.getTitle())) {
            score += 10;
        }
        // categoryCode is a codeValue
        if (notNullEquals(hit.matches, candidate.getCategoryCode(), target.getCategoryCode())) {
            score += 20;
        }
        // disciplineCode is a codeValue
        if (notNullEquals(hit.matches, candidate.getDisciplineCode(), target.getDisciplineCode())) {
            score += 10;
        }

        score += exactOrSimilar(hit, candidate.getOrganization(), target.getOrganization(), 300);

        if (notNullEquals(hit.matches, candidate.getOrganizationAbbrev(), target.getOrganizationAbbrev())) {
            score += 50;
        }
        if (notNullEquals(hit.matches, candidate.getJob(), target.getJob())) {
            score += 10;
        }

        // Compare address data. normalizeSpace collapses the gaps left by null parts and strips the ends.
        var candidateAddress1 = StringUtils.normalizeSpace(StringUtils.joinWith(" ", candidate.getAddressLine1(), candidate.getAddressLine2(), candidate.getAddressLine3(), candidate.getPostalIndex(), candidate.getCity()));
        var targetAddress1 = StringUtils.normalizeSpace(StringUtils.joinWith(" ", target.getAddressLine1(), target.getAddressLine2(), target.getAddressLine3(), target.getPostalIndex(), target.getCity()));
        var candidateAddress2 = StringUtils.normalizeSpace(StringUtils.joinWith(" ", candidate.getSecondaryAddressLine1(), candidate.getSecondaryAddressLine2(), candidate.getSecondaryAddressLine3(), candidate.getSecondaryPostalIndex(), candidate.getSecondaryCity()));
        var targetAddress2 = StringUtils.normalizeSpace(StringUtils.joinWith(" ", target.getSecondaryAddressLine1(), target.getSecondaryAddressLine2(), target.getSecondaryAddressLine3(), target.getSecondaryPostalIndex(), target.getSecondaryCity()));

        score += similarityScore(hit.matches, candidateAddress1, targetAddress1) * 200;
        score += similarityScore(hit.matches, candidateAddress1, targetAddress2) * 200;
        score += similarityScore(hit.matches, candidateAddress2, targetAddress1) * 200;
        score += similarityScore(hit.matches, candidateAddress2, targetAddress2) * 200;

        hit.score = score;
        return score;
    }

    /**
     * Returns the full weight when both values are equal according to {@code notNullEquals},
     * otherwise the weight multiplied by their {@code similarityScore}.
     *
     * @param hit the hit being scored; its {@code matches} are passed to the comparison helpers
     * @param candidateValue the candidate's value
     * @param targetValue the target's value
     * @param weight the score for an exact match
     * @return the score contribution of this field
     */
    private double exactOrSimilar(Hit<Cooperator> hit, String candidateValue, String targetValue, int weight) {
        if (notNullEquals(hit.matches, candidateValue, targetValue)) {
            return weight;
        }
        return similarityScore(hit.matches, candidateValue, targetValue) * weight;
    }
}