AccessionDuplicateFinder.java

/*
 * Copyright 2022 Global Crop Diversity Trust
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gringlobal.worker.dupe;

import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.genesys.blocks.model.filters.StringFilter;
import org.gringlobal.custom.elasticsearch.SearchException;
import org.gringlobal.model.Accession;
import org.gringlobal.model.AccessionInvName;
import org.gringlobal.model.Inventory;
import org.gringlobal.service.AccessionService;
import org.gringlobal.service.filter.AccessionFilter;
import org.gringlobal.service.filter.SiteFilter;
import org.gringlobal.service.filter.TaxonomyGenusFilter;
import org.gringlobal.service.filter.TaxonomySpeciesFilter;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Page;
import org.springframework.data.domain.PageRequest;
import org.springframework.stereotype.Component;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * Finds and scores potential duplicates of an {@link Accession}.
 *
 * <p>Candidates are collected by running a sequence of independent searches (DOI, FAO institute number,
 * accession number, accession number part 1, genus name, preferred name) through the {@link AccessionService}.
 * Each hit is then scored in {@link #scoreHit(Accession, Hit)} by comparing individual fields of the candidate
 * with the target.</p>
 */
@Component
@Slf4j
public class AccessionDuplicateFinder extends DuplicateFinder<Accession> {

	/** Maximum number of candidates fetched by each individual search. */
	private static final int MAX_CANDIDATES_PER_SEARCH = 20;

	@Autowired
	private AccessionService accessionService;

	/**
	 * Score threshold for this finder.
	 * NOTE(review): how the threshold is applied is decided by {@code DuplicateFinder} -- confirm there.
	 *
	 * @return the fixed threshold of 1000
	 */
	@Override
	protected double getBestScoreThreshold() {
		return 1000d;
	}

	/**
	 * Collects candidate duplicates of {@code target} by running one search per populated field of the target.
	 *
	 * <p>The searches run in a fixed order and every search excludes the target itself, the ids in
	 * {@code excludedById} and all candidates found by earlier searches, so the returned list does not contain
	 * the same accession twice. A failing search is logged and skipped; the remaining searches still run.</p>
	 *
	 * @param target the accession to find duplicates of, must not be {@code null}
	 * @param excludedById ids of accessions that must not be returned, may be {@code null} or empty
	 * @return the candidates found, possibly empty, never {@code null}
	 */
	@Override
	protected List<Accession> getCandidates(Accession target, Collection<Long> excludedById) {
		assert (target != null);
		log.info("Searching for duplicates of {}", target);

		List<Accession> candidates = new ArrayList<>(100);

		// Accession fields for search: doi, faoInstituteNumber, accessionNumber, accessionNumberPart1, genusName, preferredName

		var doi = target.getDoi();
		var site = target.getSite();
		var faoInstituteNumber = site != null ? site.getFaoInstituteNumber() : null;
		var accessionNumber = target.getAccessionNumber();
		var accessionNumberPart1 = target.getAccessionNumberPart1();
		var taxonomy = target.getTaxonomySpecies();
		var genus = taxonomy != null ? taxonomy.getTaxonomyGenus() : null;
		var genusName = genus != null ? genus.getGenusName() : null;
		var preferredName = target.getPreferredName();

		// The filter for each search must be created right before the search is executed:
		// it excludes whatever the previous searches have already added to the candidates.

		// By doi
		if (StringUtils.isNotBlank(doi)) {
			AccessionFilter filter = getCandidatesFilter(target, excludedById, candidates);
			filter.doi().add(doi);
			collectMatches(filter, candidates);
		}

		// By faoInstituteNumber
		if (StringUtils.isNotBlank(faoInstituteNumber)) {
			AccessionFilter filter = getCandidatesFilter(target, excludedById, candidates);
			filter
				.site(new SiteFilter().faoInstituteNumber(new StringFilter().eq(Set.of(faoInstituteNumber))))
				._text(toSafeEsQuery(faoInstituteNumber));
			collectMatches(filter, candidates);
		}

		// By accession number
		if (StringUtils.isNotBlank(accessionNumber)) {
			AccessionFilter filter = getCandidatesFilter(target, excludedById, candidates);
			filter.accessionNumber(Set.of(accessionNumber));
			collectMatches(filter, candidates);
		}

		// By accession number part 1
		if (StringUtils.isNotBlank(accessionNumberPart1)) {
			AccessionFilter filter = getCandidatesFilter(target, excludedById, candidates);
			filter
				.accessionNumberPart1(new StringFilter().eq(Set.of(accessionNumberPart1)))
				._text(toSafeEsQuery(accessionNumberPart1));
			collectMatches(filter, candidates);
		}

		// By genus
		if (StringUtils.isNotBlank(genusName)) {
			AccessionFilter filter = getCandidatesFilter(target, excludedById, candidates);
			filter
				.taxonomySpecies(
					new TaxonomySpeciesFilter().taxonomyGenus(new TaxonomyGenusFilter().genusName(new StringFilter().eq(Set.of(genusName))))
				)
				._text(toSafeEsQuery(genusName));
			collectMatches(filter, candidates);
		}

		// By preferred name
		if (StringUtils.isNotBlank(preferredName)) {
			AccessionFilter filter = getCandidatesFilter(target, excludedById, candidates);
			filter
				.preferredName(new StringFilter().eq(Set.of(preferredName)))
				._text(toSafeEsQuery(preferredName));
			collectMatches(filter, candidates);
		}

		return candidates;
	}

	/**
	 * Runs one candidate search and appends the first page of results to {@code candidates}.
	 * A {@link SearchException} is logged (with its stack trace) and otherwise ignored, so that one failing
	 * search does not prevent the other searches from contributing candidates.
	 *
	 * @param filter the fully configured filter to search with
	 * @param candidates the list to which the matches are added
	 */
	private void collectMatches(AccessionFilter filter, List<Accession> candidates) {
		try {
			log.info("Filtering for aliases {}", filter);
			Page<Accession> matches = accessionService.list(filter, PageRequest.of(0, MAX_CANDIDATES_PER_SEARCH));
			candidates.addAll(matches.getContent());

		} catch (SearchException e) {
			// Best effort: keep going, but keep the cause in the log
			log.warn("Candidate search failed: {}", e.getMessage(), e);
		}
	}

	/**
	 * Scores a candidate against the target, starting from the score already present on the hit.
	 *
	 * <p>Points added on top of {@code hit.score}:</p>
	 * <ul>
	 * <li>accession number (lowercased): 500 when equal, otherwise {@code stringsAndNumbersCompare(...) * 500}</li>
	 * <li>DOI equal: 500</li>
	 * <li>accession number part 1 (lowercased) equal: 100</li>
	 * <li>genus name equal: 100; species name equal: 200</li>
	 * <li>preferred name: 100 when equal, otherwise {@code similarityScore(...) * 100}</li>
	 * <li>FAO institute number of the site equal: 100</li>
	 * <li>100 for each plant name of the target that also occurs among the names of the candidate's inventories</li>
	 * </ul>
	 *
	 * <p>The comparison helpers receive {@code hit.matches}; shared plant names are added to it here. The final
	 * score is written back to {@code hit.score}.</p>
	 *
	 * @param target the accession for which duplicates are searched
	 * @param hit the candidate hit to score; its {@code score} and {@code matches} are updated
	 * @return the updated score of the hit
	 */
	@Override
	protected double scoreHit(Accession target, Hit<Accession> hit) {
		double score = hit.score;
		var candidate = hit.result;

		var targetAcceNumb = StringUtils.lowerCase(target.getAccessionNumber());
		var candidateAcceNumb = StringUtils.lowerCase(candidate.getAccessionNumber());
		if (notNullEquals(hit.matches, candidateAcceNumb, targetAcceNumb)) {
			score += 500;
		} else {
			score += stringsAndNumbersCompare(hit.matches, candidateAcceNumb, targetAcceNumb) * 500;
		}

		if (notNullEquals(hit.matches, candidate.getDoi(), target.getDoi())) {
			score += 500;
		}

		var targetAcceNumbPart1 = StringUtils.lowerCase(target.getAccessionNumberPart1());
		var candidateAcceNumbPart1 = StringUtils.lowerCase(candidate.getAccessionNumberPart1());
		if (notNullEquals(hit.matches, targetAcceNumbPart1, candidateAcceNumbPart1)) {
			score += 100;
		}

		var targetTaxonomy = target.getTaxonomySpecies();
		var candidateTaxonomy = candidate.getTaxonomySpecies();
		if (candidateTaxonomy != null && targetTaxonomy != null && targetTaxonomy.getTaxonomyGenus() != null) {
			// The candidate's genus may be missing too: guard it instead of failing with a NullPointerException
			var candidateGenus = candidateTaxonomy.getTaxonomyGenus();
			if (candidateGenus != null
				&& notNullEquals(hit.matches, candidateGenus.getGenusName(), targetTaxonomy.getTaxonomyGenus().getGenusName())) {
				score += 100;
			}
			if (notNullEquals(hit.matches, candidateTaxonomy.getSpeciesName(), targetTaxonomy.getSpeciesName())) {
				score += 200;
			}
		}

		if (notNullEquals(hit.matches, candidate.getPreferredName(), target.getPreferredName())) {
			score += 100;
		} else {
			score += similarityScore(hit.matches, candidate.getPreferredName(), target.getPreferredName()) * 100;
		}

		var targetInstitute = target.getSite();
		var candidateInstitute = candidate.getSite();
		if (targetInstitute != null && candidateInstitute != null) {
			if (notNullEquals(hit.matches, candidateInstitute.getFaoInstituteNumber(), targetInstitute.getFaoInstituteNumber())) {
				score += 100;
			}
		}

		var targetInvNames = target.getNames();
		if (CollectionUtils.isNotEmpty(targetInvNames)) {
			var candidateInventories = candidate.getInventories();
			if (CollectionUtils.isNotEmpty(candidateInventories)) {

				// All distinct plant names registered on any inventory of the candidate
				var candidatePlantNames = candidateInventories.stream().map(Inventory::getNames)
					.filter(Objects::nonNull)
					.flatMap(Collection::stream)
					.map(AccessionInvName::getPlantName)
					.filter(Objects::nonNull)
					.collect(Collectors.toSet());

				// Plain loop instead of Stream.peek(): recording the match is a required side effect
				int sharedPlantNames = 0;
				for (var targetInvName : targetInvNames) {
					var targetPlantName = targetInvName.getPlantName();
					if (targetPlantName != null && candidatePlantNames.contains(targetPlantName)) {
						hit.matches.add(targetPlantName);
						sharedPlantNames++;
					}
				}
				score += sharedPlantNames * 100d;
			}
		}

		hit.score = score;
		return score;
	}

	/**
	 * Creates the base filter for one candidate search. The filter's {@code NOT} part excludes the target itself
	 * (when it has an id), the explicitly excluded ids and the ids of the candidates found so far. When there is
	 * nothing to exclude, {@code NOT} is left unset.
	 *
	 * @param target the accession for which duplicates are searched
	 * @param excludedById ids to exclude, may be {@code null} or empty
	 * @param candidates candidates already found, may be {@code null} or empty
	 * @return a new filter carrying only the exclusions
	 */
	private AccessionFilter getCandidatesFilter(Accession target, Collection<Long> excludedById, List<Accession> candidates) {
		AccessionFilter filter = new AccessionFilter();

		if (target.getId() != null) {
			filter.NOT(new AccessionFilter());
			filter.NOT.id().add(target.getId()); // Not this
		}
		if (! CollectionUtils.isEmpty(excludedById)) {
			if (filter.NOT == null) {
				filter.NOT(new AccessionFilter());
			}

			filter.NOT.id().addAll(excludedById);
		}
		if (! CollectionUtils.isEmpty(candidates)) {
			if (filter.NOT == null) {
				filter.NOT(new AccessionFilter());
			}

			filter.NOT.id().addAll(candidates.stream().map(Accession::getId).collect(Collectors.toSet())); // Not already found
		}
		return filter;
	}
}