UsdaTaxonomyUpdater.java
/*
* Copyright 2026 Global Crop Diversity Trust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.gringlobal.worker;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.time.ZoneOffset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import javax.persistence.EntityManager;
import org.genesys.taxonomy.download.TaxonomyDownloader;
import org.genesys.taxonomy.gringlobal.component.CabReader;
import org.genesys.taxonomy.gringlobal.model.AuthorRow;
import org.genesys.taxonomy.gringlobal.model.FamilyRow;
import org.genesys.taxonomy.gringlobal.model.GenusRow;
import org.genesys.taxonomy.gringlobal.model.SpeciesRow;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.security.access.prepost.PreAuthorize;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;
import com.google.common.collect.Lists;
import com.opencsv.CSVReader;
import com.querydsl.jpa.impl.JPAQueryFactory;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.Strings;
import org.gringlobal.api.exception.InvalidApiUsageException;
import org.gringlobal.model.QTaxonomyGenus;
import org.gringlobal.model.QTaxonomySpecies;
import org.gringlobal.model.TaxonomyAuthor;
import org.gringlobal.model.TaxonomyFamily;
import org.gringlobal.model.TaxonomyGenus;
import org.gringlobal.model.TaxonomySpecies;
import org.gringlobal.persistence.TaxonomyAuthorRepository;
import org.gringlobal.persistence.TaxonomyFamilyRepository;
import org.gringlobal.persistence.TaxonomyGenusRepository;
import org.gringlobal.persistence.TaxonomySpeciesRepository;
/**
* The component downloads current GRIN Taxonomy database if no local copy
* exists and updates Family, Genus and Species tables in the local database.
*
* The matching is done on names only, local identifiers will not match GRIN
* Taxonomy IDs.
*
* @author Matija Obreza
*/
@Component
@Slf4j
public class UsdaTaxonomyUpdater {
// Genus name that triggers extra diagnostic print(...) output while genus rows are matched.
private static final String DEBUG_GENUS_NAME = "Allodissotis";
// Species name that triggers extra diagnostic print(...) output while species rows are matched.
private static final String DEBUG_SPECIES_NAME = "Neurachne alopecuroides";
// Repository for local TaxonomyFamily records.
@Autowired
private TaxonomyFamilyRepository taxonomyFamilyRepository;
// Repository for local TaxonomyGenus records.
@Autowired
private TaxonomyGenusRepository taxonomyGenusRepository;
// Repository for local TaxonomySpecies records.
@Autowired
private TaxonomySpeciesRepository taxonomySpeciesRepository;
// Repository for local TaxonomyAuthor records.
@Autowired
private TaxonomyAuthorRepository taxonomyAuthorRepository;
// Used for bulk update/delete statements that are executed directly against the database.
@Autowired
private JPAQueryFactory jpaQueryFactory;
// Folder holding the GRIN Taxonomy source files (taxonomy_family.txt, taxonomy_genus.txt, ...).
// The folder name is fixed; the disabled suffix below would have made it unique per run.
private File downloadFolder = new File(FileUtils.getTempDirectory(), "grin-taxonomy-source"); // + System.currentTimeMillis());
// Flushed explicitly after batch saves and bulk statements.
@Autowired
private EntityManager entityManager;
/**
 * Entry point of the updater: makes sure the GRIN Taxonomy source files are
 * available in the download folder and then synchronizes the local taxonomy
 * tables with them.
 *
 * <p>
 * Restricted to members of {@code GROUP_ADMINS}. The method is transactional.
 * </p>
 *
 * @throws Exception if obtaining the source data or updating the local
 *                   database fails
 */
@PreAuthorize("hasAuthority('GROUP_ADMINS')")
@Transactional
public void update() throws Exception {
  final String sourcePath = downloadFolder.getAbsolutePath();
  log.info("Updating GRIN taxonomy database from folder {}", sourcePath);

  // Step 1: fetch the source files unless they are already there
  downloadDataIfNeeded(downloadFolder);

  // Step 2: apply the source data to our tables
  updateLocalDatabase();

  log.warn("Taxonomy database updated successfully. Transaction will now be committed. This takes a long time if there are loads of updates!");
}
/**
 * The update starts with {@link TaxonomyFamily}, {@link TaxonomyGenus} and then
 * {@link TaxonomySpecies}, followed by {@link TaxonomyAuthor}. The entries from
 * source database are mapped to local identifiers. Apart from conflicting
 * species duplicates (see the species pre-scan below), no records are removed
 * from the local database.
 *
 * <p>
 * Each section follows the same pattern: load local records to memory, match
 * every source row to a local record (first by GRIN ID, then by names), copy
 * the source values, save in batches and finally fix up the self-references
 * (current family/genus/species, type genus of a family).
 * </p>
 *
 * <p>
 * Note: The update may update capitalization of names.
 * </p>
 *
 * @throws Exception if a source file cannot be read, or if the local data is
 *                   ambiguous and cannot be matched automatically
 */
private void updateLocalDatabase() throws Exception {
  // Source ("their") identifiers mapped to our entities
  Map<Long, TaxonomyFamily> famTheirsToOurs = new HashMap<>();
  Map<Long, TaxonomyGenus> genTheirsToOurs = new HashMap<>();
  Map<Long, TaxonomySpecies> speTheirsToOurs = new HashMap<>();
  // Map<Long, TaxonomyAuthor> authTheirsToOurs = new HashMap<>();

  // Their family id -> their type genus id; resolved after genera are saved
  Map<Long, Long> currentTypeGenus = new HashMap<>();

  {
    log.warn("Loading {} TaxonomyFamily records to memory...", taxonomyFamilyRepository.count());
    // Their family id -> their current family id
    Map<Long, Long> currentFamily = new HashMap<>();
    List<TaxonomyFamily> allFamilies = taxonomyFamilyRepository.findAll();
    final Map<Long, TaxonomyFamily> allFamiliesByGrinId = new HashMap<>();
    allFamilies.forEach(family -> {
      if (family.getGrinId() != null) {
        allFamiliesByGrinId.put(family.getGrinId(), family);
      }
    });

    List<TaxonomyFamily> toSave = new ArrayList<>();

    log.warn("Reading {}/taxonomy_family.txt", downloadFolder);
    // read taxonomy_family.txt
    try (CSVReader reader = CabReader.openCsvReader(new FileInputStream(new File(downloadFolder, "taxonomy_family.txt")), 0)) {
      var beanReader = CabReader.beanReader(FamilyRow.class, reader);
      beanReader.forEach(familyRow -> {
        TaxonomyFamily family = null;

        var other = allFamiliesByGrinId.get(familyRow.getTaxonomyFamilyId());
        if (other != null) {
          // Matched by GRIN ID
          family = other;
        } else {
          if (allFamilies.size() > 0) {
            // Match by names (case-insensitive)
            final List<TaxonomyFamily> narrow = allFamilies.stream()
              // filter
              .filter(m -> (
                Strings.CI.equals(m.getFamilyName(), familyRow.getFamilyName())
                && Strings.CI.equals(m.getFamilyAuthority(), familyRow.getFamilyAuthority())
                && Strings.CI.equals(m.getSubfamilyName(), familyRow.getSubfamilyName())
                && Strings.CI.equals(m.getTribeName(), familyRow.getTribeName())
                && Strings.CI.equals(m.getSubtribeName(), familyRow.getSubtribeName())
              ))
              // print
              .peek(m -> {
                log.debug("{} {} {} {} {}", m.getFamilyName(), m.getFamilyAuthority(), m.getSubfamilyName(), m.getTribeName(), m.getSubtribeName());
              })
              // collect
              .collect(Collectors.toList());

            if (narrow.size() == 1) {
              family = narrow.get(0);
            } else if (narrow.size() == 0) {
              log.debug("No matches found! Will create new entry.");
            } else {
              throw new InvalidApiUsageException("This shouldn't happen, your taxonomy_family needs cleaning: " + familyRow.getFamilyName());
            }
          }

          if (family == null) {
            family = new TaxonomyFamily();
            log.info("New family {} {} {} {} {} gid={}.", familyRow.getFamilyName(), familyRow.getFamilyAuthority(), familyRow.getSubfamilyName(), familyRow.getTribeName(), familyRow.getSubtribeName(), familyRow.getTaxonomyFamilyId());
          }
        }

        family.setGrinId(familyRow.getTaxonomyFamilyId());
        family.setFamilyName(familyRow.getFamilyName());
        family.setFamilyAuthority(familyRow.getFamilyAuthority());
        family.setSubfamilyName(familyRow.getSubfamilyName());
        family.setTribeName(familyRow.getTribeName());
        family.setSubtribeName(familyRow.getSubtribeName());
        family.setSuprafamilyRankCode(familyRow.getSuprafamilyRankCode());
        family.setSuprafamilyRankName(familyRow.getSuprafamilyRankName());
        family.setAlternateName(familyRow.getAlternateName());
        family.setFamilyTypeCode(familyRow.getFamilyTypeCode());
        family.setNote(familyRow.getNote());

        toSave.add(family);
        famTheirsToOurs.put(familyRow.getTaxonomyFamilyId(), family);
        currentFamily.put(familyRow.getTaxonomyFamilyId(), familyRow.getCurrentTaxonomyFamilyId());
        currentTypeGenus.put(familyRow.getTaxonomyFamilyId(), familyRow.getTypeTaxonomyGenusId());
      });
    }

    // Save updates
    Lists.partition(toSave, 100).forEach(batch -> {
      log.warn("Saving {} taxonomyFamily", batch.size());
      taxonomyFamilyRepository.saveAllAndFlush(batch);
      entityManager.flush();
    });
    toSave.clear();

    // Update references
    currentFamily.forEach((theirId, theirCurrentId) -> {
      var family = famTheirsToOurs.get(theirId);
      var current = famTheirsToOurs.get(theirCurrentId);
      if (current == null) {
        // The referenced family is not in the source file: nothing to point to
        log.warn("Current family with their id={} not found for family with their id={}. Keeping existing reference.", theirCurrentId, theirId);
        return;
      }
      if (family.getCurrentTaxonomyFamily() == null || !family.getCurrentTaxonomyFamily().getId().equals(current.getId())) {
        var reloaded = taxonomyFamilyRepository.findById(family.getId()).orElseThrow();
        reloaded.setCurrentTaxonomyFamily(taxonomyFamilyRepository.findById(current.getId()).orElseThrow());
        toSave.add(reloaded);
      }
    });

    // Save updates
    Lists.partition(toSave, 100).forEach(batch -> {
      log.warn("Saving {} taxonomyFamily", batch.size());
      taxonomyFamilyRepository.saveAllAndFlush(batch);
      entityManager.flush();
    });

    allFamilies.clear();
    toSave.clear();
    allFamiliesByGrinId.clear();
  }

  {
    // read taxonomy_genus.txt
    log.warn("Loading {} TaxonomyGenus records to memory...", taxonomyGenusRepository.count());
    // Group list of genera by the lookup key (see indexLookupKey) for faster lookups
    final LookupList<String, TaxonomyGenus> allGeneraIndex = new LookupList<>();
    final Map<Long, TaxonomyGenus> allGeneraByGrinId = new HashMap<>();
    taxonomyGenusRepository.findAll().forEach(genus -> {
      allGeneraIndex.add(indexLookupKey(genus), genus);
      if (genus.getGrinId() != null) {
        allGeneraByGrinId.put(genus.getGrinId(), genus);
      }
    });

    List<TaxonomyGenus> toSave = new ArrayList<>();
    // Their genus id -> their current genus id
    Map<Long, Long> currentGenus = new HashMap<>();

    log.warn("Reading {}/taxonomy_genus.txt", downloadFolder);

    // SCAN taxonomy_genus.txt to get TAXONOMY_GENUS_ID. Our records that have GRIN_ID that is not in the list
    // need to be updated with GRIN_ID = NULL.
    var grinIdsInFile = new LinkedHashSet<Long>(100);
    try (CSVReader reader = CabReader.openCsvReader(new FileInputStream(new File(downloadFolder, "taxonomy_genus.txt")), 0)) {
      var beanReader = CabReader.beanReader(GenusRow.class, reader);
      beanReader.forEach(row -> {
        var grinId = row.getTaxonomyGenusId();
        grinIdsInFile.add(grinId);

        // Find genera records by GRIN ID where the name had changed
        var genusName = row.getGenusName();
        var existingGenus = allGeneraByGrinId.get(grinId);
        if (existingGenus != null && ! Strings.CI.equals(genusName, existingGenus.getGenusName())) {
          log.warn("GRIN genus {} with id={} does not match what we have {} with id={}. Let try to update it.", genusName, grinId, existingGenus.getGenusName(), existingGenus.getId());
          applyGrinGenus(row, existingGenus, famTheirsToOurs);
          taxonomyGenusRepository.saveAndFlush(existingGenus);
        }
      });
    }
    log.warn("Found {} taxonomy_genus records", grinIdsInFile.size());

    // Scan current genera and clear grinId
    var missingGrinId = new LinkedList<Long>();
    allGeneraByGrinId.keySet().forEach(weHave -> {
      if (! grinIdsInFile.contains(weHave)) missingGrinId.add(weHave);
    });
    log.warn("We have {} taxonomy_genus with GRIN ID that are no longer in GRIN Taxonomy", missingGrinId.size());
    for (var miss : missingGrinId) {
      var missed = allGeneraByGrinId.get(miss);
      log.warn("Not in GRIN Taxonomy: {} {}", missed.getGrinId(), missed);
      jpaQueryFactory.update(QTaxonomyGenus.taxonomyGenus).where(
        QTaxonomyGenus.taxonomyGenus.grinId.in(missed.getGrinId())
      ).setNull(QTaxonomyGenus.taxonomyGenus.grinId)
      .execute();
      entityManager.flush();
      allGeneraByGrinId.remove(miss); // Remove from cache
      missed.setGrinId(null);
    }

    try (CSVReader reader = CabReader.openCsvReader(new FileInputStream(new File(downloadFolder, "taxonomy_genus.txt")), 0)) {
      var beanReader = CabReader.beanReader(GenusRow.class, reader);
      beanReader.forEach(genusRow -> {
        if (famTheirsToOurs.get(genusRow.getTaxonomyFamilyId()) == null) {
          // Genera of unknown families are skipped
          log.warn("No family with their id={}", genusRow.getTaxonomyFamilyId());
          return;
        }
        if (Strings.CI.equals(genusRow.getGenusName(), DEBUG_GENUS_NAME)) {
          print(">> Matching", genusRow);
        }

        TaxonomyGenus genus = null;

        var other = allGeneraByGrinId.get(genusRow.getTaxonomyGenusId());
        if (other != null) {
          // Matched by GRIN ID
          genus = other;
        } else {
          List<TaxonomyGenus> generaWithName = allGeneraIndex.get(indexLookupKey(genusRow));
          if (generaWithName != null) {
            if (genusRow.getGenusName().equals(DEBUG_GENUS_NAME)) {
              print(">> Looking for: ", genusRow);
            }
            List<TaxonomyGenus> narrow = generaWithName.stream()
              // print
              .peek(m -> {
                if (genusRow.getGenusName().equals(DEBUG_GENUS_NAME)) {
                  print("Candidate: ", m);
                }
              })
              // filter
              .filter(m -> (
                Objects.equals(m.getTaxonomyFamily().getId(), famTheirsToOurs.get(genusRow.getTaxonomyFamilyId()).getId())
                && Strings.CI.equals(m.getGenusName(), genusRow.getGenusName())
                && Strings.CI.equals(m.getGenusAuthority(), genusRow.getGenusAuthority())
                && Strings.CI.equals(m.getSubgenusName(), genusRow.getSubgenusName())
                && Strings.CI.equals(m.getSectionName(), genusRow.getSectionName())
                && Strings.CI.equals(m.getSubsectionName(), genusRow.getSubsectionName())
                && Strings.CI.equals(m.getSeriesName(), genusRow.getSeriesName())
                && Strings.CI.equals(m.getSubseriesName(), genusRow.getSubseriesName())
              ))
              // print
              .peek(m -> {
                if (m.getGenusName().equals(DEBUG_GENUS_NAME)) {
                  print("Match", m);
                }
                log.debug("{} {} {} {} {} {} {}", m.getGenusName(), m.getGenusAuthority(), m.getSubgenusName(), m.getSectionName(), m.getSubsectionName(), m.getSeriesName(), m.getSubseriesName());
              })
              // collect
              .collect(Collectors.toList());

            if (narrow.size() == 1) {
              genus = narrow.get(0);
            } else if (narrow.size() == 0) {
              genus = applyGrinGenus(genusRow, new TaxonomyGenus(), famTheirsToOurs);
              log.info("{} matches found for {} {} {} {} {} {} {}! Will create new entry.", narrow.size(), genus.getGenusName(), genus.getGenusAuthority(), genus
                .getSubgenusName(), genus.getSectionName(), genus.getSubsectionName(), genus.getSeriesName(), genus.getSubseriesName());
            } else {
              // Several candidates: try to disambiguate by hybrid and qualifying code
              print("Too many matches for:", genusRow);
              narrow.forEach(m -> print(">> ", m));
              var narrower = narrow.stream().filter(m -> (
                Strings.CI.equals(m.getHybridCode(), genusRow.getHybridCode())
                && Strings.CI.equals(m.getQualifyingCode(), genusRow.getQualifyingCode())
              )).collect(Collectors.toList());
              if (narrower.size() == 1) {
                genus = narrower.get(0);
              } else {
                throw new InvalidApiUsageException("This shouldn't happen, your taxonomy_genus needs cleaning: " + genusRow.getGenusName() + " " + genusRow.getGenusAuthority());
              }
            }
          } else {
            log.info("No existing genera for index={}", indexLookupKey(genusRow));
            genus = applyGrinGenus(genusRow, new TaxonomyGenus(), famTheirsToOurs);
            log.info("No matches found for {} {} {} {} {} {} {}! Will create new entry.", genus.getGenusName(), genus.getGenusAuthority(), genus
              .getSubgenusName(), genus.getSectionName(), genus.getSubsectionName(), genus.getSeriesName(), genus.getSubseriesName());
            // print("New taxonomy_genus", genus);
          }
        }

        if (Strings.CI.equals(genus.getGenusName(), DEBUG_GENUS_NAME)) {
          print(">> Updating", genus);
        }

        // genus.setGenusId(genusRow.getGenusId());
        // genus.setCurrentGenusId(genusRow.getCurrentGenusId());
        genus.setGrinId(genusRow.getGenusId());
        genus.setTaxonomyFamily(famTheirsToOurs.get(genusRow.getTaxonomyFamilyId()));
        if (genus.getTaxonomyFamily() == null) {
          log.warn("No family with their id={}", genusRow.getTaxonomyFamilyId());
          return;
        }
        genus.setQualifyingCode(genusRow.getQualifyingCode());
        genus.setHybridCode(genusRow.getHybridCode());
        genus.setGenusName(genusRow.getGenusName());
        genus.setGenusAuthority(genusRow.getGenusAuthority());
        genus.setSubgenusName(genusRow.getSubgenusName());
        genus.setSectionName(genusRow.getSectionName());
        genus.setSubsectionName(genusRow.getSubsectionName());
        genus.setSeriesName(genusRow.getSeriesName());
        genus.setSubseriesName(genusRow.getSubseriesName());
        genus.setNote(genusRow.getNote());
        // genus.setCreatedDate(genusRow.getCreatedDate());
        // genus.setModifiedDate(genusRow.getModifiedDate()); // Do not update @Versioned modifiedDate

        if (Strings.CI.equals(genus.getGenusName(), DEBUG_GENUS_NAME)) {
          print(">> Updated", genus);
        }

        toSave.add(genus);
        genTheirsToOurs.put(genusRow.getGenusId(), genus);
        currentGenus.put(genusRow.getTaxonomyGenusId(), genusRow.getCurrentTaxonomyGenusId());
      });
    }

    Lists.partition(toSave, 1000).forEach(batch -> {
      log.warn("Saving {} taxonomyGenus", batch.size());
      taxonomyGenusRepository.saveAllAndFlush(batch);
      entityManager.flush();
    });
    toSave.clear();

    // Update references
    currentGenus.forEach((theirId, theirCurrentId) -> {
      var genus = genTheirsToOurs.get(theirId);
      var current = genTheirsToOurs.get(theirCurrentId);
      if (current == null) {
        // The referenced genus is not in the source file (or was skipped): nothing to point to
        log.warn("Current genus with their id={} not found for genus with their id={}. Keeping existing reference.", theirCurrentId, theirId);
        return;
      }
      if (genus.getCurrentTaxonomyGenus() == null || !genus.getCurrentTaxonomyGenus().getId().equals(current.getId())) {
        var reloaded = taxonomyGenusRepository.findById(genus.getId()).orElseThrow();
        reloaded.setCurrentTaxonomyGenus(taxonomyGenusRepository.findById(current.getId()).orElseThrow());
        toSave.add(reloaded);
      }
    });

    // Save updates
    log.info("Updating {} genus references", toSave.size());
    Lists.partition(toSave, 1000).forEach(batch -> {
      log.warn("Saving {} taxonomyGenus", batch.size());
      taxonomyGenusRepository.saveAllAndFlush(batch);
      entityManager.flush();
    });
    toSave.clear();
    allGeneraIndex.clear();
    allGeneraByGrinId.clear();

    {
      // Now that genera are saved, resolve the type genus of each family
      List<TaxonomyFamily> toSaveFam = new ArrayList<>();
      currentTypeGenus.forEach((theirId, theirGenusId) -> {
        TaxonomyFamily family = famTheirsToOurs.get(theirId);
        if (theirGenusId == null) {
          // Source has no type genus: clear ours if set
          if (family.getTypeTaxonomyGenus() != null) {
            family = taxonomyFamilyRepository.findById(family.getId()).orElseThrow();
            family.setTypeTaxonomyGenus(null);
            toSaveFam.add(family);
          }
        } else {
          var typeGenus = genTheirsToOurs.get(theirGenusId);
          if (typeGenus == null) {
            // The type genus was not imported (not in the file or skipped)
            log.warn("Type genus with their genus_id={} not found for our taxonomy_family_id={}", theirGenusId, family.getId());
          } else if (family.getTypeTaxonomyGenus() == null || !family.getTypeTaxonomyGenus().getId().equals(typeGenus.getId())) {
            // Only update when the type genus is missing or differs
            family = taxonomyFamilyRepository.findById(family.getId()).orElseThrow();
            family.setTypeTaxonomyGenus(taxonomyGenusRepository.findById(typeGenus.getId()).orElseThrow());
            toSaveFam.add(family);
          }
        }
        if (family.getTypeTaxonomyGenus() == null && theirGenusId != null) {
          log.warn("Type genus is null: their genus_id={} our taxonomy_family_id={}", theirGenusId, family.getId());
        }
      });

      Lists.partition(toSaveFam, 100).forEach(batch -> {
        log.warn("Saving {} taxonomyFamily", batch.size());
        taxonomyFamilyRepository.saveAllAndFlush(batch);
        entityManager.flush();
      });
      currentTypeGenus.clear();
    }
  }

  {
    // read taxonomy_species.txt
    log.warn("Loading {} TaxonomySpecies records to memory...", taxonomySpeciesRepository.count());
    // Group list of species by epithet for faster lookups
    final LookupList<String, TaxonomySpecies> allSpeciesByEpithet = new LookupList<>();
    final Map<Long, TaxonomySpecies> allSpeciesByGrinId = new HashMap<>();
    taxonomySpeciesRepository.findAll().forEach(species -> {
      allSpeciesByEpithet.add(indexLookupKey(species), species);
      if (species.getGrinId() != null) {
        allSpeciesByGrinId.put(species.getGrinId(), species);
      }
    });

    log.warn("Reading {}/taxonomy_species.txt", downloadFolder);

    // SCAN taxonomy_species.txt to check for conflicts
    var grinIdsInFile = new LinkedHashSet<Long>(100);
    try (CSVReader reader = CabReader.openCsvReader(new FileInputStream(new File(downloadFolder, "taxonomy_species.txt")), 0)) {
      var beanReader = CabReader.beanReader(SpeciesRow.class, reader);
      beanReader.forEach(row -> {
        var grinId = row.getTaxonomySpeciesId();
        grinIdsInFile.add(grinId);

        // Find species records by GRIN ID where the name had changed
        var speciesName = row.getName();
        var existingSpecies = allSpeciesByGrinId.get(grinId);
        if (existingSpecies != null && ! Strings.CI.equals(speciesName, existingSpecies.getName())) {
          log.warn("GRIN species {} with id={} does not match what we have {} with id={}. Let try to update it.\n\t{}\n\t{}", speciesName, grinId, existingSpecies.getName(), existingSpecies.getId(), row, existingSpecies);

          // Do we have this already?
          var whatWeHave = allSpeciesByEpithet.getOrDefault(indexLookupKey(row), List.of()).stream().filter(m ->
            Strings.CI.equals(m.getName(), speciesName)
            && Strings.CI.equals(m.getNameAuthority(), row.getNameAuthority())
            && Strings.CI.equals(m.getProtologue(), row.getProtologue())
            && Strings.CI.equals(m.getSynonymCode(), row.getSynonymCode())
          ).collect(Collectors.toList());

          whatWeHave.forEach(existing -> {
            log.warn("For {} we have: {}", row.getName(), existing);
          });

          if (whatWeHave.size() == 1) {
            var candidate = whatWeHave.get(0);
            log.warn("We have a single existing record: {}.", candidate);
            if (candidate.getGrinId() == null) {
              // Get the ones that point to this
              var pointers = (List<TaxonomySpecies>) taxonomySpeciesRepository.findAll(QTaxonomySpecies.taxonomySpecies.currentTaxonomySpecies().eq(existingSpecies));
              // Delete the wrong one!
              log.warn("Deleting TaxonomySpecies and those {} that point to it: {}", pointers.size(), existingSpecies);
              jpaQueryFactory.delete(QTaxonomySpecies.taxonomySpecies)
                .where(
                  QTaxonomySpecies.taxonomySpecies.currentTaxonomySpecies().id.eq(existingSpecies.getId())
                  .or(QTaxonomySpecies.taxonomySpecies.id.eq(existingSpecies.getId()))
                )
                .execute();
              entityManager.flush();

              log.warn("Updating existing record with correct GRIN ID={}: {}", grinId, candidate);
              candidate.setGrinId(row.getTaxonomySpeciesId()); // Update existing GRIN ID!
              taxonomySpeciesRepository.saveAndFlush(candidate);

              // Update cache
              allSpeciesByEpithet.get(indexLookupKey(row)).remove(existingSpecies);
              pointers.forEach(pointer -> {
                allSpeciesByEpithet.get(indexLookupKey(pointer)).remove(pointer);
                if (pointer.getGrinId() != null) allSpeciesByGrinId.remove(pointer.getGrinId());
              });
              allSpeciesByGrinId.put(candidate.getGrinId(), candidate);
            } else {
              log.warn("Candidate already has GRIN ID={} {}", candidate.getGrinId(), candidate);
            }
          } else if (whatWeHave.size() == 0) {
            applyGrinSpecies(row, existingSpecies, genTheirsToOurs);
            taxonomySpeciesRepository.saveAndFlush(existingSpecies);
          } else {
            log.error("We have {} TaxonomySpecies candidates! This cannot be automatically fixed.", whatWeHave.size());
            throw new RuntimeException("Too many TaxonomySpecies candidates for " + row);
          }
        }
      });
    }
    log.warn("Found {} taxonomy_species records", grinIdsInFile.size());

    // Scan current species and clear grinId
    var missingGrinId = new LinkedList<Long>();
    allSpeciesByGrinId.keySet().forEach(weHave -> {
      if (! grinIdsInFile.contains(weHave)) missingGrinId.add(weHave);
    });
    log.warn("We have {} taxonomy_species with GRIN ID that are no longer in GRIN Taxonomy", missingGrinId.size());
    for (var miss : missingGrinId) {
      var missed = allSpeciesByGrinId.get(miss);
      log.warn("Not in GRIN Taxonomy {}, clearing grin_id for {}", missed.getGrinId(), missed);
      jpaQueryFactory.update(QTaxonomySpecies.taxonomySpecies).where(
        QTaxonomySpecies.taxonomySpecies.grinId.in(missed.getGrinId())
      ).setNull(QTaxonomySpecies.taxonomySpecies.grinId)
      .execute();
      entityManager.flush();
      allSpeciesByGrinId.remove(miss); // Remove from cache
      missed.setGrinId(null);
    }

    List<TaxonomySpecies> toSave = new ArrayList<>();
    // Their species id -> their current species id
    Map<Long, Long> currentSpecies = new HashMap<>();

    try (CSVReader reader = CabReader.openCsvReader(new FileInputStream(new File(downloadFolder, "taxonomy_species.txt")), 0)) {
      final AtomicInteger counter = new AtomicInteger(0);

      var beanReader = CabReader.beanReader(SpeciesRow.class, reader);
      beanReader.forEach(speciesRow -> {
        if (counter.incrementAndGet() % 1000 == 0) {
          log.warn("Read {} species rows", counter.get());
        }

        TaxonomySpecies species = null;

        var other = allSpeciesByGrinId.get(speciesRow.getTaxonomySpeciesId());
        if (other != null) {
          // Matched by GRIN ID
          species = other;
        } else {
          log.debug("No species with usda_id={}! Searching for {} {}", speciesRow.getTaxonomySpeciesId(), speciesRow.getName(), speciesRow.getNameAuthority());
          List<TaxonomySpecies> speciesForEpithet = allSpeciesByEpithet.get(indexLookupKey(speciesRow));
          if (speciesForEpithet != null) {
            if (Strings.CI.equals(speciesRow.getName(), DEBUG_SPECIES_NAME)) {
              print(">> Looking for", speciesRow);
            }
            List<TaxonomySpecies> narrow = speciesForEpithet.stream()
              // debug
              .peek(m -> {
                if (Strings.CI.equals(speciesRow.getName(), DEBUG_SPECIES_NAME)) {
                  print("Inspecting:", m);
                }
              })
              // filter
              .filter(m -> (
                Objects.equals(m.getTaxonomyGenus().getId(), genTheirsToOurs.get(speciesRow.getGenusId()).getId())
                && Strings.CI.equals(StringUtils.trimToNull(m.getName()), StringUtils.trimToNull(speciesRow.getName()))
                && Strings.CI.equals(StringUtils.trimToNull(m.getNameAuthority()), StringUtils.trimToNull(speciesRow.getNameAuthority()))
                && Strings.CI.equals(StringUtils.trimToNull(m.getSynonymCode()), StringUtils.trimToNull(speciesRow.getSynonymCode()))
                && Strings.CI.equals(StringUtils.trimToNull(m.getProtologue()), StringUtils.trimToNull(speciesRow.getProtologue()))
              ))
              // print
              .peek(m -> {
                if (Strings.CI.equals(speciesRow.getName(), DEBUG_SPECIES_NAME)) {
                  print("Potential match:", m);
                }
                log.debug("{} {}", m.getName(), m.getNameAuthority());
              })
              // gather
              .collect(Collectors.toList());

            if (narrow.size() == 1) {
              species = narrow.get(0);
            } else if (narrow.size() == 0) {
              if (Strings.CI.equals(speciesRow.getName(), DEBUG_SPECIES_NAME)) {
                print("No matches found, will add", speciesRow);
              }
              log.debug("No matches found for {} {}! Will create new entry.", speciesRow.getName(), speciesRow.getNameAuthority());
            } else {
              throw new InvalidApiUsageException("This shouldn't happen, your taxonomy_species needs cleaning: " + speciesRow.getName() + " " + speciesRow.getNameAuthority());
            }
          } else {
            log.debug("No species for epithet={}", speciesRow.getSpeciesName());
            if (Strings.CI.equals(speciesRow.getName(), DEBUG_SPECIES_NAME)) {
              print("Will add", speciesRow);
            }
          }
        }

        if (species != null && Strings.CI.equals(species.getName(), DEBUG_SPECIES_NAME)) {
          print(">> Updating", species);
        }

        species = applyGrinSpecies(speciesRow, species == null ? new TaxonomySpecies() : species, genTheirsToOurs);

        if (Strings.CI.equals(species.getName(), DEBUG_SPECIES_NAME)) {
          print(">> Updated", species);
        }

        toSave.add(species);
        speTheirsToOurs.put(speciesRow.getSpeciesId(), species);
        currentSpecies.put(speciesRow.getSpeciesId(), speciesRow.getCurrentTaxonomySpeciesId());
      });
    }

    Lists.partition(toSave, 1000).forEach(batch -> {
      log.warn("Saving {} taxonomySpecies", batch.size());
      taxonomySpeciesRepository.saveAllAndFlush(batch);
      entityManager.flush();
    });
    toSave.clear();

    log.warn("Tackling {} records and their referrences to current species", currentSpecies.size());
    // Update references
    currentSpecies.forEach((theirId, theirCurrentId) -> {
      var species = speTheirsToOurs.get(theirId);
      var current = speTheirsToOurs.get(theirCurrentId);
      // A missing current species results in the reference being cleared
      if (current == null || species.getCurrentTaxonomySpecies() == null || !species.getCurrentTaxonomySpecies().getId().equals(current.getId())) {
        species.setCurrentTaxonomySpecies(current);
        toSave.add(species);
      }
    });

    // Save updates
    log.info("Updating {} species references", toSave.size());
    Lists.partition(toSave, 1000).forEach(batch -> {
      log.warn("Saving {} taxonomySpecies", batch.size());
      taxonomySpeciesRepository.saveAllAndFlush(batch);
      entityManager.flush();
    });
    toSave.clear();
  }

  {
    log.warn("Reading {}/taxonomy_author.txt", downloadFolder);
    List<TaxonomyAuthor> allAuthors = taxonomyAuthorRepository.findAll();
    List<TaxonomyAuthor> toSave = new ArrayList<>();

    // Group authors by the lookup key (see indexLookupKey) for faster lookups
    final LookupList<String, TaxonomyAuthor> authorsLookup = new LookupList<>();
    allAuthors.forEach(author -> {
      authorsLookup.add(indexLookupKey(author), author);
    });

    try (CSVReader reader = CabReader.openCsvReader(new FileInputStream(new File(downloadFolder, "taxonomy_author.txt")), 0)) {
      var beanReader = CabReader.beanReader(AuthorRow.class, reader);
      beanReader.forEach(authorRow -> {
        // Start with a new record; replaced by the existing one if matched below
        TaxonomyAuthor author = new TaxonomyAuthor();
        author.setShortName(authorRow.getShortName());
        if (author.getShortName() == null) {
          log.warn("Missing shortName id={}", authorRow.getTaxonomyAuthorId());
          return;
        }

        List<TaxonomyAuthor> authorsByFirst = authorsLookup.get(indexLookupKey(author));
        if (authorsByFirst != null) {
          final TaxonomyAuthor compareTo = author;
          List<TaxonomyAuthor> narrow = authorsByFirst.stream()
            // filter
            .filter(m -> (
              Strings.CI.equals(StringUtils.trimToNull(m.getShortName()), StringUtils.trim(compareTo.getShortName()))
            ))
            // print
            .peek(m -> {
              log.debug("{}", m.getShortName());
            })
            // gather
            .collect(Collectors.toList());

          if (narrow.size() == 1) {
            author = narrow.get(0);
          } else if (narrow.size() == 0) {
            log.debug("{} matches found for {}! Will create new entry.", narrow.size(), author.getShortName());
          } else {
            narrow.forEach(match -> {
              log.warn("Found id={} short={} for input {}", match.getId(), match.getShortName(), compareTo.getShortName());
            });
            throw new InvalidApiUsageException("This shouldn't happen, your taxonomy_author needs cleaning: " + author.getShortName());
          }
        }

        author.setFullName(authorRow.getFullName());
        author.setFullNameExpandedDiacritic(authorRow.getFullNameExpandedDiacritic());
        author.setShortName(authorRow.getShortName());
        author.setShortNameExpandedDiacritic(authorRow.getShortNameExpandedDiacritic());
        author.setNote(authorRow.getNote());

        toSave.add(author);
        // authTheirsToOurs.put(authorRow.getTaxonomyAuthorId(), author);
      });
    }

    Lists.partition(toSave, 1000).forEach(batch -> {
      log.warn("Saving {} taxonomyAuthors", batch.size());
      taxonomyAuthorRepository.saveAllAndFlush(batch);
      entityManager.flush();
    });
    toSave.clear();
  }

  log.warn("Done.");
}
/**
 * Copies the values of a source genus row onto a genus entity.
 *
 * @param genusRow        the source row to read from
 * @param genus           the entity to update (new or existing)
 * @param famTheirsToOurs source family identifiers mapped to our families; used
 *                        to resolve the family of the genus
 * @return the same {@code genus} instance
 */
private TaxonomyGenus applyGrinGenus(GenusRow genusRow, TaxonomyGenus genus, Map<Long,TaxonomyFamily> famTheirsToOurs) {
  // Resolve our family for the family id used in the source
  final TaxonomyFamily ourFamily = famTheirsToOurs.get(genusRow.getTaxonomyFamilyId());

  // Identifier and codes
  genus.setGrinId(genusRow.getTaxonomyGenusId());
  genus.setQualifyingCode(genusRow.getQualifyingCode());
  genus.setHybridCode(genusRow.getHybridCode());

  // Names
  genus.setGenusName(genusRow.getGenusName());
  genus.setGenusAuthority(genusRow.getGenusAuthority());
  genus.setSubgenusName(genusRow.getSubgenusName());
  genus.setSectionName(genusRow.getSectionName());
  genus.setSubsectionName(genusRow.getSubsectionName());
  genus.setSeriesName(genusRow.getSeriesName());
  genus.setSubseriesName(genusRow.getSubseriesName());

  // Parent family (null when the family is not mapped)
  genus.setTaxonomyFamily(ourFamily);
  return genus;
}
/**
 * Copies the values of a source species row onto a species entity.
 *
 * <p>
 * The genus of the species must already be mapped. If it is not, the entity is
 * left untouched and the update is aborted with an exception, instead of
 * handing a half-updated entity without a genus back to the caller.
 * </p>
 *
 * <p>
 * Note: the name verification date is only overwritten when the source row has
 * one; an existing local value is kept otherwise.
 * </p>
 *
 * @param speciesRow      the source row to read from
 * @param species         the entity to update (new or existing)
 * @param genTheirsToOurs source genus identifiers mapped to our genera
 * @return the same {@code species} instance, never {@code null}
 * @throws IllegalStateException if the genus of the source row is not mapped
 */
private TaxonomySpecies applyGrinSpecies(SpeciesRow speciesRow, TaxonomySpecies species, Map<Long,TaxonomyGenus> genTheirsToOurs) {
  // Resolve the genus first so that nothing is modified when it is missing
  final TaxonomyGenus ourGenus = genTheirsToOurs.get(speciesRow.getGenusId());
  if (ourGenus == null) {
    log.warn("Missing genus for species id={} genus_id={}", speciesRow.getSpeciesId(), speciesRow.getGenusId());
    throw new IllegalStateException("Missing genus for species id=" + speciesRow.getSpeciesId() + " genus_id=" + speciesRow.getGenusId());
  }

  // species.setSpeciesId(speciesRow.getSpeciesId());
  // species.setCurrentSpeciesId(speciesRow.getCurrentSpeciesId());
  species.setGrinId(speciesRow.getTaxonomySpeciesId());
  species.setTaxonomyGenus(ourGenus);
  species.setNomenNumber(speciesRow.getNomenNumber() == null ? null : speciesRow.getNomenNumber().intValue());
  species.setIsSpecificHybrid(speciesRow.getIsSpecificHybrid());
  species.setSpeciesName(speciesRow.getSpeciesName());
  species.setSpeciesAuthority(speciesRow.getSpeciesAuthority());
  species.setIsSubspecificHybrid(speciesRow.getIsSubspecificHybrid());
  species.setSubspeciesName(speciesRow.getSubspeciesName());
  species.setSubspeciesAuthority(speciesRow.getSubspeciesAuthority());
  species.setIsVarietalHybrid(speciesRow.getIsVarietalHybrid());
  species.setVarietyName(speciesRow.getVarietyName());
  species.setVarietyAuthority(speciesRow.getVarietyAuthority());
  species.setIsSubvarietalHybrid(speciesRow.getIsSubvarietalHybrid());
  species.setSubvarietyName(speciesRow.getSubvarietyName());
  species.setSubvarietyAuthority(speciesRow.getSubvarietyAuthority());
  species.setIsFormaHybrid(speciesRow.getIsFormaHybrid());
  species.setFormaRankType(speciesRow.getFormaRankType());
  species.setFormaName(speciesRow.getFormaName());
  species.setFormaAuthority(speciesRow.getFormaAuthority());
  // species.setPrioritySite1(speciesRow.getPrioritySite1());
  // species.setPrioritySite2(speciesRow.getPrioritySite2());
  // species.setCurator1Id(speciesRow.getCurator1Id());
  // species.setCurator2Id(speciesRow.getCurator2Id());
  species.setRestrictionCode(speciesRow.getRestrictionCode());
  species.setLifeFormCode(speciesRow.getLifeFormCode());
  species.setCommonFertilizationCode(speciesRow.getCommonFertilizationCode());
  species.setIsNamePending(speciesRow.getIsNamePending());
  species.setSynonymCode(speciesRow.getSynonymCode());
  // species.setVerifierCooperator(speciesRow.getVerifierId());
  if (speciesRow.getNameVerifiedDate() != null) {
    // Source date has no zone; it is interpreted as UTC
    species.setNameVerifiedDate(speciesRow.getNameVerifiedDate().toInstant(ZoneOffset.UTC));
  }
  species.setName(speciesRow.getName());
  species.setNameAuthority(speciesRow.getNameAuthority());
  species.setProtologue(speciesRow.getProtologue());
  species.setProtologueVirtualPath(speciesRow.getProtologueVirtualPath());
  species.setNote(speciesRow.getNote());
  species.setSiteNote(speciesRow.getSiteNote());
  species.setAlternateName(speciesRow.getAlternateName());
  // species.setCreatedDate(speciesRow.getCreatedDate());
  // species.setModifiedDate(speciesRow.getModifiedDate()); // Do not update @Versioned modifiedDate
  return species;
}
/**
 * Logs a one-line INFO summary of a source species row.
 *
 * @param message prefix placed at the start of the log line
 * @param species the species row to describe; a blank synonym code is printed as an empty string
 */
private void print(String message, SpeciesRow species) {
    final String synonymCode = StringUtils.defaultIfBlank(species.getSynonymCode(), "");
    final Object speciesId = species.getTaxonomySpeciesId();
    final Object genusId = species.getTaxonomyGenusId();
    log.info("{} {} {} {} proto={} id={} tgid={}",
        message, synonymCode, species.getName(), species.getNameAuthority(), species.getProtologue(), speciesId, genusId);
}
/**
 * Logs a one-line INFO summary of a persisted species record, including its local and GRIN
 * identifiers and those of its genus.
 *
 * @param message prefix placed at the start of the log line
 * @param species the species to describe; when it has no genus, both genus identifiers are
 *                printed as the text "null"
 */
private void print(String message, TaxonomySpecies species) {
    final TaxonomyGenus genus = species.getTaxonomyGenus();
    Object genusId = "null";
    Object genusGrinId = "null";
    if (genus != null) {
        genusId = genus.getId();
        genusGrinId = genus.getGrinId();
    }
    final String synonymCode = StringUtils.defaultIfBlank(species.getSynonymCode(), "");
    log.info("{} {} {} {} proto={} id={}/{} tgid={}/{}",
        message, synonymCode, species.getName(), species.getNameAuthority(), species.getProtologue(),
        species.getId(), species.getGrinId(), genusId, genusGrinId);
}
/**
 * Builds the lookup key for a genus: at most the first three characters of its genus name,
 * lower-cased via {@code StringUtils.toRootLowerCase}.
 *
 * @param genus the genus to index
 * @return the key, or {@code null} when the genus name is {@code null}
 */
private String indexLookupKey(TaxonomyGenus genus) {
    final String prefix = StringUtils.substring(genus.getGenusName(), 0, 3);
    return StringUtils.toRootLowerCase(prefix);
}
/**
 * Builds the lookup key for a source genus row: at most the first three characters of its
 * genus name, lower-cased via {@code StringUtils.toRootLowerCase}.
 *
 * @param genus the genus row to index
 * @return the key, or {@code null} when the genus name is {@code null}
 */
private String indexLookupKey(GenusRow genus) {
    final String prefix = StringUtils.substring(genus.getGenusName(), 0, 3);
    return StringUtils.toRootLowerCase(prefix);
}
/**
 * Builds the lookup key for a species: its species name lower-cased via
 * {@code StringUtils.toRootLowerCase}.
 *
 * @param species the species to index
 * @return the key, or {@code null} when the species name is {@code null}
 */
private String indexLookupKey(TaxonomySpecies species) {
    final String speciesName = species.getSpeciesName();
    return StringUtils.toRootLowerCase(speciesName);
}
/**
 * Builds the lookup key for a source species row: its species name lower-cased via
 * {@code StringUtils.toRootLowerCase}.
 *
 * @param species the species row to index
 * @return the key, or {@code null} when the species name is {@code null}
 */
private String indexLookupKey(SpeciesRow species) {
    final String speciesName = species.getSpeciesName();
    return StringUtils.toRootLowerCase(speciesName);
}
/**
 * Builds the lookup key for an author: at most the first two characters of the short name,
 * lower-cased via {@code StringUtils.toRootLowerCase}.
 *
 * <p>Uses the null-safe {@code StringUtils.substring}, like the other {@code indexLookupKey}
 * overloads, so a missing short name or one shorter than two characters yields a key instead
 * of throwing.</p>
 *
 * @param author the author to index
 * @return the key; {@code null} when the short name is {@code null}, and the whole lower-cased
 *         short name when it has fewer than two characters
 */
private String indexLookupKey(TaxonomyAuthor author) {
    return StringUtils.toRootLowerCase(StringUtils.substring(author.getShortName(), 0, 2));
}
/**
 * Logs a one-line INFO summary of a source genus row: qualifying code, hybrid code directly
 * followed by the genus name, authority, the infrageneric names, and the family and genus ids.
 *
 * @param message prefix placed at the start of the log line
 * @param m       the genus row to describe; a blank hybrid code is printed as an empty string
 */
private void print(String message, GenusRow m) {
    final Object qualifyingCode = m.getQualifyingCode();
    final String hybridCode = StringUtils.defaultIfBlank(m.getHybridCode(), "");
    final Object familyId = m.getTaxonomyFamilyId();
    final Object genusId = m.getTaxonomyGenusId();
    log.info("{} {} {}{} {} {} {} {} {} {} tf={} gid={}",
        message, qualifyingCode, hybridCode, m.getGenusName(), m.getGenusAuthority(),
        m.getSubgenusName(), m.getSectionName(), m.getSubsectionName(), m.getSeriesName(), m.getSubseriesName(),
        familyId, genusId);
}
/**
 * Logs a one-line INFO summary of a persisted genus record: qualifying code, hybrid code
 * directly followed by the genus name, authority, the infrageneric names, the family id, and
 * the genus' local and GRIN identifiers.
 *
 * @param message prefix placed at the start of the log line
 * @param m       the genus to describe; a blank hybrid code is printed as an empty string and
 *                a missing family is logged as a {@code null} argument
 */
private void print(String message, TaxonomyGenus m) {
    final String hybridCode = StringUtils.defaultIfBlank(m.getHybridCode(), "");
    final var family = m.getTaxonomyFamily();
    Object familyId = null;
    if (family != null) {
        familyId = family.getId();
    }
    log.info("{} {} {}{} {} {} {} {} {} {} tf={} gid={}/{}",
        message, m.getQualifyingCode(), hybridCode, m.getGenusName(), m.getGenusAuthority(),
        m.getSubgenusName(), m.getSectionName(), m.getSubsectionName(), m.getSeriesName(), m.getSubseriesName(),
        familyId, m.getId(), m.getGrinId());
}
/**
 * Ensures that the taxonomy data files are present in {@code folder}, downloading and
 * unpacking the current GRIN-Taxonomy cabinet file when they are not.
 *
 * <p>The folder is created if it does not exist. A download is triggered only when either
 * {@code taxonomy_genus.txt} or {@code taxonomy_species.txt} is missing from the folder.</p>
 *
 * @param folder directory that should hold the unpacked taxonomy files
 * @throws IOException if the folder cannot be created, or if downloading, unpacking or deleting
 *                     the temporary cabinet file fails
 */
static void downloadDataIfNeeded(File folder) throws IOException {
    if (!folder.exists()) {
        log.warn("Making directory {}", folder.getAbsolutePath());
        if (!folder.mkdirs() || !folder.exists()) {
            throw new IOException("Failed to create data folder at " + folder.getAbsolutePath());
        }
    }

    // The two required files
    final File genusFile = new File(folder, "taxonomy_genus.txt");
    final File speciesFile = new File(folder, "taxonomy_species.txt");
    if (genusFile.exists() && speciesFile.exists()) {
        // Data already provided, nothing to download
        return;
    }

    log.warn("Taxonomy data not provided in {}, starting download", folder.getAbsolutePath());
    final TaxonomyDownloader dl = new TaxonomyDownloader();
    log.warn("Downloading GRIN-Taxonomy database to {}", folder.getAbsolutePath());
    final File downloadedCabFile = File.createTempFile("grin-", ".cab");
    try {
        dl.downloadCurrent(downloadedCabFile);
        TaxonomyDownloader.unpackCabinetFile(downloadedCabFile, folder, false);
    } finally {
        // Remove the temporary cabinet file even when download or unpacking fails,
        // otherwise every failed attempt leaves a file behind in the temp directory.
        if (downloadedCabFile.exists() && downloadedCabFile.canWrite()) {
            log.warn("Deleting downloaded file {}", downloadedCabFile.getAbsolutePath());
            FileUtils.forceDelete(downloadedCabFile);
        }
    }
}
/**
 * A map that groups values under a key: each key maps to the list of elements added for it.
 *
 * @param <K> type of the grouping key
 * @param <V> type of the grouped elements
 */
public static class LookupList<K, V> extends HashMap<K, List<V>> {
    private static final long serialVersionUID = 2452703619583443005L;

    /**
     * Appends {@code element} to the list registered for {@code key}, creating that list on
     * first use.
     *
     * @param key     the grouping key
     * @param element the element to add
     * @return the element that was added
     */
    public V add(K key, V element) {
        List<V> bucket = get(key);
        if (bucket == null) {
            bucket = new LinkedList<>();
            put(key, bucket);
        }
        bucket.add(element);
        return element;
    }
}
}