CooperatorDuplicateFinder.java

/*
 * Copyright 2021 Global Crop Diversity Trust
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gringlobal.worker.dupe;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.genesys.blocks.model.filters.StringFilter;
import org.gringlobal.custom.elasticsearch.SearchException;
import org.gringlobal.model.Cooperator;
import org.gringlobal.service.CooperatorService;
import org.gringlobal.service.filter.CooperatorFilter;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Page;
import org.springframework.data.domain.PageRequest;
import org.springframework.stereotype.Component;

/**
 * Cooperator Duplicate Finder.
 *
 * <p>
 * Candidates are gathered with up to three searches through {@link CooperatorService}: an exact match on
 * the email, a free-text search over email, names and organization, and a free-text search over the first
 * address line. Every hit is then scored in {@link #scoreHit(Cooperator, Hit)} by comparing its fields with
 * the target.
 * </p>
 */
@Component
@Slf4j
public class CooperatorDuplicateFinder extends DuplicateFinder<Cooperator> {

	/** Maximum number of records requested from each individual candidate search. */
	private static final int CANDIDATES_PER_SEARCH = 20;

	@Autowired
	private CooperatorService cooperatorService;

	/**
	 * Score threshold for cooperators.
	 *
	 * @return always {@code 800} (how the base class applies it is not visible in this file)
	 */
	@Override
	protected double getBestScoreThreshold() {
		return 800d;
	}

	/**
	 * Collect candidate duplicates of {@code target}.
	 *
	 * <p>
	 * Each search excludes the target itself, the records in {@code excludedById} and everything found by the
	 * previous searches, so the returned list holds no record twice. A failing search is logged and skipped;
	 * the remaining searches still run.
	 * </p>
	 *
	 * @param target the cooperator to find duplicates for
	 * @param excludedById IDs that must not be returned as candidates, may be {@code null} or empty
	 * @return the candidates found, possibly empty
	 */
	@Override
	protected List<Cooperator> getCandidates(Cooperator target, Collection<Long> excludedById) {
		assert (target != null);
		log.info("Searching for duplicates of {}", target);

		List<Cooperator> candidates = new ArrayList<>(CANDIDATES_PER_SEARCH);

		// 1. Exact match by email
		if (StringUtils.isNotBlank(target.getEmail())) {
			var filter = getCandidatesFilter(target, excludedById, candidates);
			filter.email = new StringFilter().eq(Set.of(target.getEmail()));
			searchInto(candidates, "email", filter);
		}

		// 2. Free-text match on email, names and organization.
		// Skipped when the target has none of these: a blank text query would not narrow the search at all.
		var nameText = joinNormalized(target.getEmail(), target.getFirstName(), target.getLastName(), target.getOrganization(),
			target.getOrganizationAbbrev());
		if (StringUtils.isNotBlank(nameText)) {
			var filter = getCandidatesFilter(target, excludedById, candidates);
			// NOTE(review): unlike the address text below, this text is not passed through toSafeEsQuery -- confirm whether it should be
			filter._text = nameText;
			searchInto(candidates, "names", filter);
		}

		// 3. Free-text match on the first address line
		if (StringUtils.isNotBlank(target.getAddressLine1())) {
			var filter = getCandidatesFilter(target, excludedById, candidates);
			filter._text = toSafeEsQuery(target.getAddressLine1());
			searchInto(candidates, "address", filter);
		}

		return candidates;
	}

	/**
	 * Run one candidate search and append the first page of results to {@code candidates}.
	 *
	 * @param candidates the list receiving the matches
	 * @param what short label of the search, used in log messages only
	 * @param filter the filter to search with
	 */
	private void searchInto(List<Cooperator> candidates, String what, CooperatorFilter filter) {
		try {
			log.info("Filtering for {} {}", what, filter);
			Page<Cooperator> matches = cooperatorService.list(filter, PageRequest.of(0, CANDIDATES_PER_SEARCH));
			candidates.addAll(matches.getContent());
		} catch (SearchException e) {
			// Best effort: one failing search must not prevent the others. Keep the cause in the log.
			log.warn("Search for duplicates by {} failed: {}", what, e.getMessage(), e);
		}
	}

	/**
	 * Create a filter that excludes the target, the explicitly excluded IDs and the candidates found so far.
	 *
	 * @param target the cooperator being checked; excluded when it has an ID
	 * @param excludedById IDs to exclude, may be {@code null} or empty
	 * @param candidates already found candidates, may be {@code null} or empty
	 * @return a new filter; its {@code NOT} part is set only when there is at least one ID to exclude
	 */
	private CooperatorFilter getCandidatesFilter(Cooperator target, Collection<Long> excludedById, List<Cooperator> candidates) {
		Set<Long> excludedIds = new HashSet<>();

		if (target.getId() != null) {
			excludedIds.add(target.getId()); // Not this
		}
		if (! CollectionUtils.isEmpty(excludedById)) {
			excludedIds.addAll(excludedById);
		}
		if (! CollectionUtils.isEmpty(candidates)) {
			// Not already found
			excludedIds.addAll(candidates.stream().map(Cooperator::getId).filter(id -> id != null).collect(Collectors.toSet()));
		}

		CooperatorFilter filter = new CooperatorFilter();
		if (! excludedIds.isEmpty()) {
			filter.NOT(new CooperatorFilter());
			filter.NOT.id(new HashSet<>());
			filter.NOT.id().addAll(excludedIds);
		}
		return filter;
	}

	/**
	 * Score hit.
	 *
	 * <p>
	 * Starts from the score already stored on the hit and adds points for every matching or similar field.
	 * The result is written back to {@code hit.score}.
	 * </p>
	 *
	 * @param target the target
	 * @param hit the hit
	 * @return the updated score of the hit
	 */
	@Override
	protected double scoreHit(Cooperator target, Hit<Cooperator> hit) {
		var candidate = hit.result;
		var score = hit.score;

		// Names: first name could be Max | Maxim | Maxym | Maksim | Maksym
		score += exactOrSimilar(hit, candidate.getFirstName(), target.getFirstName(), 250);
		score += exactOrSimilar(hit, candidate.getLastName(), target.getLastName(), 300);

		// Emails: any primary/secondary combination counts, so the result does not depend on
		// which of the two records happens to be the target.
		if (notNullEquals(hit.matches, candidate.getEmail(), target.getEmail())
			|| notNullEquals(hit.matches, candidate.getSecondaryEmail(), target.getSecondaryEmail())
			|| notNullEquals(hit.matches, candidate.getEmail(), target.getSecondaryEmail())
			|| notNullEquals(hit.matches, candidate.getSecondaryEmail(), target.getEmail())) {
			score += 300;
		}

		if (notNullEquals(hit.matches, candidate.getTitle(), target.getTitle())) {
			score += 10;
		}
		// categoryCode is a codeValue
		if (notNullEquals(hit.matches, candidate.getCategoryCode(), target.getCategoryCode())) {
			score += 20;
		}
		// disciplineCode is a codeValue
		if (notNullEquals(hit.matches, candidate.getDisciplineCode(), target.getDisciplineCode())) {
			score += 10;
		}

		score += exactOrSimilar(hit, candidate.getOrganization(), target.getOrganization(), 300);
		if (notNullEquals(hit.matches, candidate.getOrganizationAbbrev(), target.getOrganizationAbbrev())) {
			score += 50;
		}
		if (notNullEquals(hit.matches, candidate.getJob(), target.getJob())) {
			score += 10;
		}
		// Notes are currently not scored.

		/*
		 * Compare address data: every primary/secondary combination is compared.
		 */
		var candidateAddress1 = joinNormalized(candidate.getAddressLine1(), candidate.getAddressLine2(), candidate.getAddressLine3(),
			candidate.getPostalIndex(), candidate.getCity());
		var targetAddress1 = joinNormalized(target.getAddressLine1(), target.getAddressLine2(), target.getAddressLine3(),
			target.getPostalIndex(), target.getCity());
		var candidateAddress2 = joinNormalized(candidate.getSecondaryAddressLine1(), candidate.getSecondaryAddressLine2(),
			candidate.getSecondaryAddressLine3(), candidate.getSecondaryPostalIndex(), candidate.getSecondaryCity());
		var targetAddress2 = joinNormalized(target.getSecondaryAddressLine1(), target.getSecondaryAddressLine2(),
			target.getSecondaryAddressLine3(), target.getSecondaryPostalIndex(), target.getSecondaryCity());

		score += similarityScore(hit.matches, candidateAddress1, targetAddress1) * 200;
		score += similarityScore(hit.matches, candidateAddress1, targetAddress2) * 200;
		score += similarityScore(hit.matches, candidateAddress2, targetAddress1) * 200;
		score += similarityScore(hit.matches, candidateAddress2, targetAddress2) * 200;

		hit.score = score;
		return score;
	}

	/**
	 * Points for one text field: the full weight when both values are equal, otherwise the weight scaled by
	 * their similarity.
	 *
	 * @param hit the hit being scored
	 * @param candidateValue value of the candidate
	 * @param targetValue value of the target
	 * @param weight points awarded for an exact match
	 * @return the points to add to the score
	 */
	private double exactOrSimilar(Hit<Cooperator> hit, String candidateValue, String targetValue, double weight) {
		if (notNullEquals(hit.matches, candidateValue, targetValue)) {
			return weight;
		}
		return similarityScore(hit.matches, candidateValue, targetValue) * weight;
	}

	/**
	 * Join the parts with single spaces. Missing parts leave no gaps: runs of whitespace are collapsed and
	 * the result is trimmed.
	 *
	 * @param parts the values to join, individual values may be {@code null}
	 * @return the joined text, empty when there is nothing to join
	 */
	private static String joinNormalized(Object... parts) {
		// joinWith renders null parts as empty strings, normalizeSpace then removes the resulting gaps
		return StringUtils.normalizeSpace(StringUtils.joinWith(" ", parts));
	}
}