UsdaTaxonomyUpdater.java

/*
 * Copyright 2026 Global Crop Diversity Trust
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gringlobal.worker;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.time.ZoneOffset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import javax.persistence.EntityManager;

import org.genesys.taxonomy.download.TaxonomyDownloader;
import org.genesys.taxonomy.gringlobal.component.CabReader;
import org.genesys.taxonomy.gringlobal.model.AuthorRow;
import org.genesys.taxonomy.gringlobal.model.FamilyRow;
import org.genesys.taxonomy.gringlobal.model.GenusRow;
import org.genesys.taxonomy.gringlobal.model.SpeciesRow;

import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.security.access.prepost.PreAuthorize;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;

import com.google.common.collect.Lists;
import com.opencsv.CSVReader;
import com.querydsl.jpa.impl.JPAQueryFactory;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.Strings;
import org.gringlobal.api.exception.InvalidApiUsageException;
import org.gringlobal.model.QTaxonomyGenus;
import org.gringlobal.model.QTaxonomySpecies;
import org.gringlobal.model.TaxonomyAuthor;
import org.gringlobal.model.TaxonomyFamily;
import org.gringlobal.model.TaxonomyGenus;
import org.gringlobal.model.TaxonomySpecies;
import org.gringlobal.persistence.TaxonomyAuthorRepository;
import org.gringlobal.persistence.TaxonomyFamilyRepository;
import org.gringlobal.persistence.TaxonomyGenusRepository;
import org.gringlobal.persistence.TaxonomySpeciesRepository;

/**
 * The component downloads the current GRIN Taxonomy database if no local copy
 * exists and updates the Family, Genus, Species and Author tables in the local
 * database.
 * 
 * Local records are matched to GRIN Taxonomy records by their stored GRIN ID
 * first and by names otherwise; local primary keys are not expected to match
 * GRIN Taxonomy IDs.
 * 
 * @author Matija Obreza
 */
@Component
@Slf4j
public class UsdaTaxonomyUpdater {

	/** Genus name for which the genus matching code passes rows and candidates to {@code print(...)} for diagnostics. */
	private static final String DEBUG_GENUS_NAME = "Allodissotis";
	/** Species name for which the species matching code passes rows and candidates to {@code print(...)} for diagnostics. */
	private static final String DEBUG_SPECIES_NAME = "Neurachne alopecuroides";

	// Repositories for the four taxonomy tables maintained by this component
	@Autowired
	private TaxonomyFamilyRepository taxonomyFamilyRepository;
	@Autowired
	private TaxonomyGenusRepository taxonomyGenusRepository;
	@Autowired
	private TaxonomySpeciesRepository taxonomySpeciesRepository;
	@Autowired
	private TaxonomyAuthorRepository taxonomyAuthorRepository;
	// Used for bulk update/delete statements that bypass the repositories
	@Autowired
	private JPAQueryFactory jpaQueryFactory;

	/**
	 * Folder holding the GRIN Taxonomy source files ({@code taxonomy_*.txt}). It is a fixed
	 * "grin-taxonomy-source" folder under the temp directory (no per-run suffix is appended).
	 */
	private File downloadFolder = new File(FileUtils.getTempDirectory(), "grin-taxonomy-source");

	// Flushed explicitly after each batch save and bulk statement
	@Autowired
	private EntityManager entityManager;


	/**
	 * Entry point of the taxonomy synchronization: makes sure the GRIN Taxonomy
	 * source files are available in the download folder and then applies them to
	 * the local taxonomy tables.
	 *
	 * <p>
	 * The call requires the {@code GROUP_ADMINS} authority and is annotated as
	 * transactional.
	 * </p>
	 *
	 * @throws Exception when fetching the source data or updating the local
	 *         database fails
	 */
	@PreAuthorize("hasAuthority('GROUP_ADMINS')")
	@Transactional
	public void update() throws Exception {
		final File sourceFolder = downloadFolder;
		final String sourcePath = sourceFolder.getAbsolutePath();
		log.info("Updating GRIN taxonomy database from folder {}", sourcePath);

		// Step 1: obtain the source files (see downloadDataIfNeeded)
		downloadDataIfNeeded(sourceFolder);

		// Step 2: apply the source files to the local tables
		updateLocalDatabase();

		log.warn("Taxonomy database updated successfully. Transaction will now be committed. This takes a long time if there are loads of updates!");
	}

	/**
	 * Synchronizes the local taxonomy tables with the GRIN Taxonomy source files
	 * found in {@link #downloadFolder}.
	 *
	 * <p>
	 * The update processes {@link TaxonomyFamily}, then {@link TaxonomyGenus}, then
	 * {@link TaxonomySpecies} and finally {@link TaxonomyAuthor}. Family, genus and
	 * species rows are matched to local records by stored GRIN ID first and by a
	 * case-insensitive comparison of their names otherwise; authors are matched by
	 * short name. Rows without a match create new local records.
	 * </p>
	 *
	 * <p>
	 * Genus and species records whose GRIN ID no longer appears in the source
	 * files are kept, but their GRIN ID is cleared. Species records that carry a
	 * GRIN ID now belonging to a differently named record may be deleted when
	 * another local record already holds the correct name.
	 * </p>
	 *
	 * <p>
	 * Note: The update may update capitalization of names.
	 * </p>
	 *
	 * @throws Exception when a source file cannot be read or when several local
	 *         records match one source row
	 */
	private void updateLocalDatabase() throws Exception {
		// Maps from GRIN ("their") identifiers to local ("our") entities
		Map<Long, TaxonomyFamily> famTheirsToOurs = new HashMap<>();
		Map<Long, TaxonomyGenus> genTheirsToOurs = new HashMap<>();
		Map<Long, TaxonomySpecies> speTheirsToOurs = new HashMap<>();
//		Map<Long, TaxonomyAuthor> authTheirsToOurs = new HashMap<>();

		// GRIN family id -> GRIN id of its type genus (resolved after genera are loaded)
		Map<Long, Long> currentTypeGenus = new HashMap<>();

		{
			log.warn("Loading {} TaxonomyFamily records to memory...", taxonomyFamilyRepository.count());

			// GRIN family id -> GRIN id of the family it currently resolves to
			Map<Long, Long> currentFamily = new HashMap<>();
			List<TaxonomyFamily> allFamilies = taxonomyFamilyRepository.findAll();
			final Map<Long, TaxonomyFamily> allFamiliesByGrinId = new HashMap<>();
			allFamilies.forEach(family -> {
				if (family.getGrinId() != null) {
					allFamiliesByGrinId.put(family.getGrinId(), family);
				}
			});
			List<TaxonomyFamily> toSave = new ArrayList<>();

			log.warn("Reading {}/taxonomy_family.txt", downloadFolder);

			// read taxonomy_family.txt
			try (CSVReader reader = CabReader.openCsvReader(new FileInputStream(new File(downloadFolder, "taxonomy_family.txt")), 0)) {
				var beanReader = CabReader.beanReader(FamilyRow.class, reader);
				beanReader.forEach(familyRow -> {
					TaxonomyFamily family = null;
					var other = allFamiliesByGrinId.get(familyRow.getTaxonomyFamilyId());
					if (other != null) {
						// Matched by GRIN ID
						family = other;
					} else {
						if (!allFamilies.isEmpty()) {
							// Fall back to matching on names
							final List<TaxonomyFamily> narrow = allFamilies.stream()
								// filter
								.filter(m -> (
										Strings.CI.equals(m.getFamilyName(), familyRow.getFamilyName())
										&& Strings.CI.equals(m.getFamilyAuthority(), familyRow.getFamilyAuthority())
										&& Strings.CI.equals(m.getSubfamilyName(), familyRow.getSubfamilyName())
										&& Strings.CI.equals(m.getTribeName(), familyRow.getTribeName())
										&& Strings.CI.equals(m.getSubtribeName(), familyRow.getSubtribeName())
								))
								// print
								.peek(m -> {
									log.debug("{} {} {} {} {}", m.getFamilyName(), m.getFamilyAuthority(), m.getSubfamilyName(), m.getTribeName(), m.getSubtribeName());
								})
								// collect
								.collect(Collectors.toList());

							if (narrow.size() == 1) {
								family = narrow.get(0);
							} else if (narrow.isEmpty()) {
								log.debug("No matches found! Will create new entry.");
							} else {
								throw new InvalidApiUsageException("This shouldn't happen, your taxonomy_family needs cleaning: " + familyRow.getFamilyName());
							}
						}
						if (family == null) {
							family = new TaxonomyFamily();
							log.info("New family {} {} {} {} {} gid={}.", familyRow.getFamilyName(), familyRow.getFamilyAuthority(), familyRow.getSubfamilyName(), familyRow.getTribeName(), familyRow.getSubtribeName(), familyRow.getTaxonomyFamilyId());
						}
					}

					family.setGrinId(familyRow.getTaxonomyFamilyId());
					family.setFamilyName(familyRow.getFamilyName());
					family.setFamilyAuthority(familyRow.getFamilyAuthority());
					family.setSubfamilyName(familyRow.getSubfamilyName());
					family.setTribeName(familyRow.getTribeName());
					family.setSubtribeName(familyRow.getSubtribeName());

					family.setSuprafamilyRankCode(familyRow.getSuprafamilyRankCode());
					family.setSuprafamilyRankName(familyRow.getSuprafamilyRankName());
					family.setAlternateName(familyRow.getAlternateName());
					family.setFamilyTypeCode(familyRow.getFamilyTypeCode());
					family.setNote(familyRow.getNote());

					toSave.add(family);
					famTheirsToOurs.put(familyRow.getTaxonomyFamilyId(), family);
					currentFamily.put(familyRow.getTaxonomyFamilyId(), familyRow.getCurrentTaxonomyFamilyId());
					currentTypeGenus.put(familyRow.getTaxonomyFamilyId(), familyRow.getTypeTaxonomyGenusId());
				});
			}

			// Save updates
			Lists.partition(toSave, 100).forEach(batch -> {
				log.warn("Saving {} taxonomyFamily", batch.size());
				taxonomyFamilyRepository.saveAllAndFlush(batch);
				entityManager.flush();
			});
			toSave.clear();

			// Update references to the current family
			currentFamily.forEach((theirId, theirCurrentId) -> {
				var family = famTheirsToOurs.get(theirId);
				var current = famTheirsToOurs.get(theirCurrentId);
				var existing = family.getCurrentTaxonomyFamily();
				if (current == null) {
					// The referenced family is not in the source data: clear a stale
					// reference (same outcome as for species below) instead of failing
					// on current.getId()
					if (existing != null) {
						log.warn("Current family with their id={} not found for their id={}, clearing reference", theirCurrentId, theirId);
						var reloaded = taxonomyFamilyRepository.findById(family.getId()).orElseThrow();
						reloaded.setCurrentTaxonomyFamily(null);
						toSave.add(reloaded);
					}
				} else if (existing == null || !existing.getId().equals(current.getId())) {
					var reloaded = taxonomyFamilyRepository.findById(family.getId()).orElseThrow();
					reloaded.setCurrentTaxonomyFamily(taxonomyFamilyRepository.findById(current.getId()).orElseThrow());
					toSave.add(reloaded);
				}
			});
			// Save updates
			Lists.partition(toSave, 100).forEach(batch -> {
				log.warn("Saving {} taxonomyFamily", batch.size());
				taxonomyFamilyRepository.saveAllAndFlush(batch);
				entityManager.flush();
			});

			allFamilies.clear();
			toSave.clear();
			allFamiliesByGrinId.clear();
		}

		{
			// read taxonomy_genus.txt
			log.warn("Loading {} TaxonomyGenus records to memory...", taxonomyGenusRepository.count());
			// Index genera by lookup key for faster lookups
			final LookupList<String, TaxonomyGenus> allGeneraIndex = new LookupList<>();
			final Map<Long, TaxonomyGenus> allGeneraByGrinId = new HashMap<>();
			taxonomyGenusRepository.findAll().forEach(genus -> {
				allGeneraIndex.add(indexLookupKey(genus), genus);
				if (genus.getGrinId() != null) {
					allGeneraByGrinId.put(genus.getGrinId(), genus);
				}
			});

			List<TaxonomyGenus> toSave = new ArrayList<>();
			// GRIN genus id -> GRIN id of the genus it currently resolves to
			Map<Long, Long> currentGenus = new HashMap<>();

			log.warn("Reading {}/taxonomy_genus.txt", downloadFolder);

			// SCAN taxonomy_genus.txt to get TAXONOMY_GENUS_ID. Our records that have GRIN_ID that is not in the list
			// need to be updated with GRIN_ID = NULL.
			var grinIdsInFile = new LinkedHashSet<Long>(100);
			try (CSVReader reader = CabReader.openCsvReader(new FileInputStream(new File(downloadFolder, "taxonomy_genus.txt")), 0)) {
				var beanReader = CabReader.beanReader(GenusRow.class, reader);
				beanReader.forEach(row -> {
					var grinId = row.getTaxonomyGenusId();
					grinIdsInFile.add(grinId);
					// Find genera records by GRIN ID where the name had changed
					var genusName = row.getGenusName();
					var existingGenus = allGeneraByGrinId.get(grinId);
					if (existingGenus != null && ! Strings.CI.equals(genusName, existingGenus.getGenusName())) {
						log.warn("GRIN genus {} with id={} does not match what we have {} with id={}. Let try to update it.", genusName, grinId, existingGenus.getGenusName(), existingGenus.getId());
						applyGrinGenus(row, existingGenus, famTheirsToOurs);
						taxonomyGenusRepository.saveAndFlush(existingGenus);
					}
				});
			}
			log.warn("Found {} taxonomy_genus records", grinIdsInFile.size());
			// Scan current genera and clear grinId
			var missingGrinId = new LinkedList<Long>();
			allGeneraByGrinId.keySet().forEach(weHave -> {
				if (! grinIdsInFile.contains(weHave)) missingGrinId.add(weHave);
			});
			log.warn("We have {} taxonomy_genus with GRIN ID that are no longer in GRIN Taxonomy", missingGrinId.size());
			for (var miss : missingGrinId) {
				var missed = allGeneraByGrinId.get(miss);
				log.warn("Not in GRIN Taxonomy: {} {}", missed.getGrinId(), missed);

				jpaQueryFactory.update(QTaxonomyGenus.taxonomyGenus).where(
					QTaxonomyGenus.taxonomyGenus.grinId.in(missed.getGrinId())
				).setNull(QTaxonomyGenus.taxonomyGenus.grinId)
				.execute();
				entityManager.flush();
				allGeneraByGrinId.remove(miss); // Remove from cache
				missed.setGrinId(null);
			}

			try (CSVReader reader = CabReader.openCsvReader(new FileInputStream(new File(downloadFolder, "taxonomy_genus.txt")), 0)) {
				var beanReader = CabReader.beanReader(GenusRow.class, reader);
				beanReader.forEach(genusRow -> {
					// Resolve the local family once; rows without a known family are skipped
					final TaxonomyFamily ourFamily = famTheirsToOurs.get(genusRow.getTaxonomyFamilyId());
					if (ourFamily == null) {
						log.warn("No family with their id={}", genusRow.getTaxonomyFamilyId());
						return;
					}

					final boolean debugRow = Strings.CI.equals(genusRow.getGenusName(), DEBUG_GENUS_NAME);
					if (debugRow) {
						print(">> Matching", genusRow);
					}

					TaxonomyGenus genus = null;
					var other = allGeneraByGrinId.get(genusRow.getTaxonomyGenusId());
					if (other != null) {
						// Matched by GRIN ID
						genus = other;
					} else {
						List<TaxonomyGenus> generaWithName = allGeneraIndex.get(indexLookupKey(genusRow));
						if (generaWithName != null) {
							if (debugRow) {
								print(">> Looking for: ", genusRow);
							}
							List<TaxonomyGenus> narrow = generaWithName.stream()
								// print
								.peek(m -> {
									if (debugRow) {
										print("Candidate: ", m);
									}
								})
								// filter
								.filter(m -> (
									Objects.equals(m.getTaxonomyFamily().getId(), ourFamily.getId())
									&& Strings.CI.equals(m.getGenusName(), genusRow.getGenusName())
									&& Strings.CI.equals(m.getGenusAuthority(), genusRow.getGenusAuthority())
									&& Strings.CI.equals(m.getSubgenusName(), genusRow.getSubgenusName())
									&& Strings.CI.equals(m.getSectionName(), genusRow.getSectionName())
									&& Strings.CI.equals(m.getSubsectionName(), genusRow.getSubsectionName())
									&& Strings.CI.equals(m.getSeriesName(), genusRow.getSeriesName())
									&& Strings.CI.equals(m.getSubseriesName(), genusRow.getSubseriesName())
								))
								// print
								.peek(m -> {
									if (Strings.CI.equals(m.getGenusName(), DEBUG_GENUS_NAME)) {
										print("Match", m);
									}
									log.debug("{} {} {} {} {} {} {}", m.getGenusName(), m.getGenusAuthority(), m.getSubgenusName(), m.getSectionName(), m.getSubsectionName(), m.getSeriesName(), m.getSubseriesName());
								})
								// collect
								.collect(Collectors.toList());

							if (narrow.size() == 1) {
								genus = narrow.get(0);
							} else if (narrow.isEmpty()) {
								genus = applyGrinGenus(genusRow, new TaxonomyGenus(), famTheirsToOurs);
								log.info("{} matches found for {} {} {} {} {} {} {}! Will create new entry.", narrow.size(), genus.getGenusName(), genus.getGenusAuthority(), genus
									.getSubgenusName(), genus.getSectionName(), genus.getSubsectionName(), genus.getSeriesName(), genus.getSubseriesName());
							} else {
								// Several name matches: disambiguate by hybrid and qualifying codes
								print("Too many matches for:", genusRow);
								narrow.forEach(m -> print(">> ", m));
								var narrower = narrow.stream().filter(m -> (
									Strings.CI.equals(m.getHybridCode(), genusRow.getHybridCode())
									&& Strings.CI.equals(m.getQualifyingCode(), genusRow.getQualifyingCode())
								)).collect(Collectors.toList());
								if (narrower.size() == 1) {
									genus = narrower.get(0);
								} else {
									throw new InvalidApiUsageException("This shouldn't happen, your taxonomy_genus needs cleaning: " + genusRow.getGenusName() + " " + genusRow.getGenusAuthority());
								}
							}
						} else {
							log.info("No existing genera for index={}", indexLookupKey(genusRow));
							genus = applyGrinGenus(genusRow, new TaxonomyGenus(), famTheirsToOurs);
							log.info("No matches found for {} {} {} {} {} {} {}! Will create new entry.", genus.getGenusName(), genus.getGenusAuthority(), genus
									.getSubgenusName(), genus.getSectionName(), genus.getSubsectionName(), genus.getSeriesName(), genus.getSubseriesName());
							// print("New taxonomy_genus", genus);
						}
					}

					if (Strings.CI.equals(genus.getGenusName(), DEBUG_GENUS_NAME)) {
						print(">> Updating", genus);
					}

					genus.setGrinId(genusRow.getGenusId());
					// ourFamily is known to be non-null here (checked at the top of the row)
					genus.setTaxonomyFamily(ourFamily);

					genus.setQualifyingCode(genusRow.getQualifyingCode());
					genus.setHybridCode(genusRow.getHybridCode());
					genus.setGenusName(genusRow.getGenusName());
					genus.setGenusAuthority(genusRow.getGenusAuthority());
					genus.setSubgenusName(genusRow.getSubgenusName());
					genus.setSectionName(genusRow.getSectionName());
					genus.setSubsectionName(genusRow.getSubsectionName());
					genus.setSeriesName(genusRow.getSeriesName());
					genus.setSubseriesName(genusRow.getSubseriesName());
					genus.setNote(genusRow.getNote());

					// genus.setCreatedDate(genusRow.getCreatedDate());
					// genus.setModifiedDate(genusRow.getModifiedDate()); // Do not update @Versioned modifiedDate

					if (Strings.CI.equals(genus.getGenusName(), DEBUG_GENUS_NAME)) {
						print(">> Updated", genus);
					}

					toSave.add(genus);
					genTheirsToOurs.put(genusRow.getGenusId(), genus);
					currentGenus.put(genusRow.getTaxonomyGenusId(), genusRow.getCurrentTaxonomyGenusId());
				});
			}
			Lists.partition(toSave, 1000).forEach(batch -> {
				log.warn("Saving {} taxonomyGenus", batch.size());
				taxonomyGenusRepository.saveAllAndFlush(batch);
				entityManager.flush();
			});
			toSave.clear();

			// Update references to the current genus
			currentGenus.forEach((theirId, theirCurrentId) -> {
				var genus = genTheirsToOurs.get(theirId);
				var current = genTheirsToOurs.get(theirCurrentId);
				var existing = genus.getCurrentTaxonomyGenus();
				if (current == null) {
					// The referenced genus was not loaded (missing from the source data or
					// skipped above): clear a stale reference instead of failing on
					// current.getId()
					if (existing != null) {
						log.warn("Current genus with their id={} not found for their id={}, clearing reference", theirCurrentId, theirId);
						var reloaded = taxonomyGenusRepository.findById(genus.getId()).orElseThrow();
						reloaded.setCurrentTaxonomyGenus(null);
						toSave.add(reloaded);
					}
				} else if (existing == null || !existing.getId().equals(current.getId())) {
					var reloaded = taxonomyGenusRepository.findById(genus.getId()).orElseThrow();
					reloaded.setCurrentTaxonomyGenus(taxonomyGenusRepository.findById(current.getId()).orElseThrow());
					toSave.add(reloaded);
				}
			});
			// Save updates
			log.info("Updating {} genus references", toSave.size());
			Lists.partition(toSave, 1000).forEach(batch -> {
				log.warn("Saving {} taxonomyGenus", batch.size());
				taxonomyGenusRepository.saveAllAndFlush(batch);
				entityManager.flush();
			});

			toSave.clear();
			allGeneraIndex.clear();
			allGeneraByGrinId.clear();

			{
				// Now that genera are known, resolve the type genus of every family
				List<TaxonomyFamily> toSaveFam = new ArrayList<>();
				currentTypeGenus.forEach((theirId, theirGenusId) -> {
					TaxonomyFamily family = famTheirsToOurs.get(theirId);
					var typeGenus = theirGenusId == null ? null : genTheirsToOurs.get(theirGenusId);
					if (typeGenus == null) {
						// No type genus in the source, or it could not be resolved locally
						if (family.getTypeTaxonomyGenus() != null) {
							family = taxonomyFamilyRepository.findById(family.getId()).orElseThrow();
							family.setTypeTaxonomyGenus(null);
							toSaveFam.add(family);
						}
					} else if (family.getTypeTaxonomyGenus() == null || !family.getTypeTaxonomyGenus().getId().equals(typeGenus.getId())) {
						// Only write when the type genus is missing or differs from the source
						family = taxonomyFamilyRepository.findById(family.getId()).orElseThrow();
						family.setTypeTaxonomyGenus(taxonomyGenusRepository.findById(typeGenus.getId()).orElseThrow());
						toSaveFam.add(family);
					}
					if (family.getTypeTaxonomyGenus() == null && theirGenusId != null) {
						log.warn("Type genus is null: their genus_id={} our taxonomy_family_id={}", theirGenusId, family.getId());
					}
				});
				Lists.partition(toSaveFam, 100).forEach(batch -> {
					log.warn("Saving {} taxonomyFamily", batch.size());
					taxonomyFamilyRepository.saveAllAndFlush(batch);
					entityManager.flush();
				});

				currentTypeGenus.clear();
			}
		}


		{
			// read taxonomy_species.txt
			log.warn("Loading {} TaxonomySpecies records to memory...", taxonomySpeciesRepository.count());
			// Index species by lookup key for faster lookups
			final LookupList<String, TaxonomySpecies> allSpeciesByEpithet = new LookupList<>();
			final Map<Long, TaxonomySpecies> allSpeciesByGrinId = new HashMap<>();
			taxonomySpeciesRepository.findAll().forEach(species -> {
				allSpeciesByEpithet.add(indexLookupKey(species), species);
				if (species.getGrinId() != null) {
					allSpeciesByGrinId.put(species.getGrinId(), species);
				}
			});

			log.warn("Reading {}/taxonomy_species.txt", downloadFolder);

			// SCAN taxonomy_species.txt to check for conflicts
			var grinIdsInFile = new LinkedHashSet<Long>(100);
			try (CSVReader reader = CabReader.openCsvReader(new FileInputStream(new File(downloadFolder, "taxonomy_species.txt")), 0)) {
				var beanReader = CabReader.beanReader(SpeciesRow.class, reader);
				beanReader.forEach(row -> {
					var grinId = row.getTaxonomySpeciesId();
					grinIdsInFile.add(grinId);
					// Find species records by GRIN ID where the name had changed
					var speciesName = row.getName();
					var existingSpecies = allSpeciesByGrinId.get(grinId);
					if (existingSpecies != null && ! Strings.CI.equals(speciesName, existingSpecies.getName())) {
						log.warn("GRIN species {} with id={} does not match what we have {} with id={}. Let try to update it.\n\t{}\n\t{}", speciesName, grinId, existingSpecies.getName(), existingSpecies.getId(), row, existingSpecies);

						// Do we have this already?
						var whatWeHave = allSpeciesByEpithet.getOrDefault(indexLookupKey(row), List.of()).stream().filter(m -> 
							Strings.CI.equals(m.getName(), speciesName)
								&& Strings.CI.equals(m.getNameAuthority(), row.getNameAuthority())
								&& Strings.CI.equals(m.getProtologue(), row.getProtologue())
								&& Strings.CI.equals(m.getSynonymCode(), row.getSynonymCode())
						).collect(Collectors.toList());

						whatWeHave.forEach(existing -> {
							log.warn("For {} we have: {}", row.getName(), existing);
						});
						if (whatWeHave.size() == 1) {
							var candidate = whatWeHave.get(0);
							log.warn("We have a single existing record: {}.", candidate);
							if (candidate.getGrinId() == null) {
								// Get the ones that point to this
								// NOTE(review): if the candidate itself points to existingSpecies it is deleted
								// by the statement below as well - confirm this cannot happen.
								var pointers = (List<TaxonomySpecies>) taxonomySpeciesRepository.findAll(QTaxonomySpecies.taxonomySpecies.currentTaxonomySpecies().eq(existingSpecies));
								// Delete the wrong one!
								log.warn("Deleting TaxonomySpecies and those {} that point to it: {}", pointers.size(), existingSpecies);
								jpaQueryFactory.delete(QTaxonomySpecies.taxonomySpecies)
									.where(
										QTaxonomySpecies.taxonomySpecies.currentTaxonomySpecies().id.eq(existingSpecies.getId())
										.or(QTaxonomySpecies.taxonomySpecies.id.eq(existingSpecies.getId()))
									)
									.execute();
								entityManager.flush();
								log.warn("Updating existing record with correct GRIN ID={}: {}", grinId, candidate);
								candidate.setGrinId(row.getTaxonomySpeciesId()); // Update existing GRIN ID!
								taxonomySpeciesRepository.saveAndFlush(candidate);
								// Update cache: the deleted record is indexed under its own key, which
								// may differ from the key of the row, so remove it from both buckets
								var rowBucket = allSpeciesByEpithet.get(indexLookupKey(row));
								if (rowBucket != null) {
									rowBucket.remove(existingSpecies);
								}
								var ownBucket = allSpeciesByEpithet.get(indexLookupKey(existingSpecies));
								if (ownBucket != null) {
									ownBucket.remove(existingSpecies);
								}
								pointers.forEach(pointer -> {
									allSpeciesByEpithet.get(indexLookupKey(pointer)).remove(pointer);
									if (pointer.getGrinId() != null) allSpeciesByGrinId.remove(pointer.getGrinId());
								});
								allSpeciesByGrinId.put(candidate.getGrinId(), candidate);
							} else {
								log.warn("Candidate already has GRIN ID={} {}", candidate.getGrinId(), candidate);
							}
						} else if (whatWeHave.isEmpty()) {
							applyGrinSpecies(row, existingSpecies, genTheirsToOurs);
							taxonomySpeciesRepository.saveAndFlush(existingSpecies);
						} else {
							log.error("We have {} TaxonomySpecies candidates! This cannot be automatically fixed.", whatWeHave.size());
							throw new RuntimeException("Too many TaxonomySpecies candidates for " + row);
						}
					}
				});
			}
			log.warn("Found {} taxonomy_species records", grinIdsInFile.size());
			// Scan current species and clear grinId
			var missingGrinId = new LinkedList<Long>();
			allSpeciesByGrinId.keySet().forEach(weHave -> {
				if (! grinIdsInFile.contains(weHave)) missingGrinId.add(weHave);
			});
			log.warn("We have {} taxonomy_species with GRIN ID that are no longer in GRIN Taxonomy", missingGrinId.size());
			for (var miss : missingGrinId) {
				var missed = allSpeciesByGrinId.get(miss);
				log.warn("Not in GRIN Taxonomy {}, clearing grin_id for {}", missed.getGrinId(), missed);
				jpaQueryFactory.update(QTaxonomySpecies.taxonomySpecies).where(
					QTaxonomySpecies.taxonomySpecies.grinId.in(missed.getGrinId())
				).setNull(QTaxonomySpecies.taxonomySpecies.grinId)
				.execute();
				entityManager.flush();
				allSpeciesByGrinId.remove(miss); // Remove from cache
				missed.setGrinId(null);
			}

			List<TaxonomySpecies> toSave = new ArrayList<>();
			// GRIN species id -> GRIN id of the species it currently resolves to
			Map<Long, Long> currentSpecies = new HashMap<>();

			try (CSVReader reader = CabReader.openCsvReader(new FileInputStream(new File(downloadFolder, "taxonomy_species.txt")), 0)) {
				final AtomicInteger counter = new AtomicInteger(0);
				var beanReader = CabReader.beanReader(SpeciesRow.class, reader);
				beanReader.forEach(speciesRow -> {
					if (counter.incrementAndGet() % 1000 == 0) {
						log.warn("Read {} species rows", counter.get());
					}

					TaxonomySpecies species = null;
					var other = allSpeciesByGrinId.get(speciesRow.getTaxonomySpeciesId());
					if (other != null) {
						// Matched by GRIN ID
						species = other;
					} else {
						log.debug("No species with usda_id={}! Searching for {} {}", speciesRow.getTaxonomySpeciesId(), speciesRow.getName(), speciesRow.getNameAuthority());

						List<TaxonomySpecies> speciesForEpithet = allSpeciesByEpithet.get(indexLookupKey(speciesRow));
						if (speciesForEpithet != null) {
							if (Strings.CI.equals(speciesRow.getName(), DEBUG_SPECIES_NAME)) {
								print(">> Looking for", speciesRow);
							}

							List<TaxonomySpecies> narrow = speciesForEpithet.stream()
								// debug
								.peek(m -> {
									if (Strings.CI.equals(speciesRow.getName(), DEBUG_SPECIES_NAME)) {
										print("Inspecting:", m);
									}
								})
								// filter
								.filter(m -> (
									Objects.equals(m.getTaxonomyGenus().getId(), genTheirsToOurs.get(speciesRow.getGenusId()).getId())
									&& Strings.CI.equals(StringUtils.trimToNull(m.getName()), StringUtils.trimToNull(speciesRow.getName()))
									&& Strings.CI.equals(StringUtils.trimToNull(m.getNameAuthority()), StringUtils.trimToNull(speciesRow.getNameAuthority()))
									&& Strings.CI.equals(StringUtils.trimToNull(m.getSynonymCode()), StringUtils.trimToNull(speciesRow.getSynonymCode()))
									&& Strings.CI.equals(StringUtils.trimToNull(m.getProtologue()), StringUtils.trimToNull(speciesRow.getProtologue()))
								))
								// print
								.peek(m -> {
									if (Strings.CI.equals(speciesRow.getName(), DEBUG_SPECIES_NAME)) {
										print("Potential match:", m);
									}
									log.debug("{} {}", m.getName(), m.getNameAuthority());
								})
								// gather
								.collect(Collectors.toList());

							if (narrow.size() == 1) {
								species = narrow.get(0);
							} else if (narrow.isEmpty()) {
								if (Strings.CI.equals(speciesRow.getName(), DEBUG_SPECIES_NAME)) {
									print("No matches found, will add", speciesRow);
								}
								log.debug("No matches found for {} {}! Will create new entry.", speciesRow.getName(), speciesRow.getNameAuthority());
							} else {
								throw new InvalidApiUsageException("This shouldn't happen, your taxonomy_species needs cleaning: " + speciesRow.getName() + " " + speciesRow.getNameAuthority());
							}
						} else {
							log.debug("No species for epithet={}", speciesRow.getSpeciesName());
							if (Strings.CI.equals(speciesRow.getName(), DEBUG_SPECIES_NAME)) {
								print("Will add", speciesRow);
							}
						}
					}

					if (species != null && Strings.CI.equals(species.getName(), DEBUG_SPECIES_NAME)) {
						print(">> Updating", species);
					}

					// applyGrinSpecies returns null when the genus is unknown; the dereference
					// below then fails on purpose and aborts the update
					species = applyGrinSpecies(speciesRow, species == null ? new TaxonomySpecies() : species, genTheirsToOurs);

					if (Strings.CI.equals(species.getName(), DEBUG_SPECIES_NAME)) {
						print(">> Updated", species);
					}

					toSave.add(species);
					speTheirsToOurs.put(speciesRow.getSpeciesId(), species);
					currentSpecies.put(speciesRow.getSpeciesId(), speciesRow.getCurrentTaxonomySpeciesId());
				});
			}

			Lists.partition(toSave, 1000).forEach(batch -> {
				log.warn("Saving {} taxonomySpecies", batch.size());
				taxonomySpeciesRepository.saveAllAndFlush(batch);
				entityManager.flush();
			});
			toSave.clear();

			log.warn("Tackling {} records and their referrences to current species", currentSpecies.size());
			// Update references to the current species (an unresolved reference is stored as null)
			currentSpecies.forEach((theirId, theirCurrentId) -> {
				var species = speTheirsToOurs.get(theirId);
				var current = speTheirsToOurs.get(theirCurrentId);
				if (current == null || species.getCurrentTaxonomySpecies() == null || !species.getCurrentTaxonomySpecies().getId().equals(current.getId())) {
					species.setCurrentTaxonomySpecies(current);
					toSave.add(species);
				}
			});
			// Save updates
			log.info("Updating {} species references", toSave.size());
			Lists.partition(toSave, 1000).forEach(batch -> {
				log.warn("Saving {} taxonomySpecies", batch.size());
				taxonomySpeciesRepository.saveAllAndFlush(batch);
				entityManager.flush();
			});

			toSave.clear();
		}

		{
			log.warn("Reading {}/taxonomy_author.txt", downloadFolder);

			List<TaxonomyAuthor> allAuthors = taxonomyAuthorRepository.findAll();
			List<TaxonomyAuthor> toSave = new ArrayList<>();
			final LookupList<String, TaxonomyAuthor> authorsLookup = new LookupList<>();
			allAuthors.forEach(author -> {
				authorsLookup.add(indexLookupKey(author), author);
			});

			try (CSVReader reader = CabReader.openCsvReader(new FileInputStream(new File(downloadFolder, "taxonomy_author.txt")), 0)) {
				var beanReader = CabReader.beanReader(AuthorRow.class, reader);
				beanReader.forEach(authorRow -> {
					// Start with a fresh record; it is replaced below when a local match exists
					TaxonomyAuthor author = new TaxonomyAuthor();
					author.setShortName(authorRow.getShortName());

					if (author.getShortName() == null) {
						log.warn("Missing shortName id={}", authorRow.getTaxonomyAuthorId());
						return;
					}

					List<TaxonomyAuthor> authorsByFirst = authorsLookup.get(indexLookupKey(author));
					if (authorsByFirst != null) {
						final TaxonomyAuthor compareTo = author;
						List<TaxonomyAuthor> narrow = authorsByFirst.stream()
							// filter
							.filter(m -> (
								Strings.CI.equals(StringUtils.trimToNull(m.getShortName()), StringUtils.trim(compareTo.getShortName()))
							))
							// print
							.peek(m -> {
								log.debug("{}", m.getShortName());
							})
							// gather
							.collect(Collectors.toList());

						if (narrow.size() == 1) {
							author = narrow.get(0);
						} else if (narrow.isEmpty()) {
							log.debug("{} matches found for {}! Will create new entry.", narrow.size(), author.getShortName());
						} else {
							narrow.forEach(match -> {
								log.warn("Found id={} short={} for input {}", match.getId(), match.getShortName(), compareTo.getShortName());
							});
							throw new InvalidApiUsageException("This shouldn't happen, your taxonomy_author needs cleaning: " + author.getShortName());
						}
					}

					author.setFullName(authorRow.getFullName());
					author.setFullNameExpandedDiacritic(authorRow.getFullNameExpandedDiacritic());
					author.setShortName(authorRow.getShortName());
					author.setShortNameExpandedDiacritic(authorRow.getShortNameExpandedDiacritic());
					author.setNote(authorRow.getNote());

					toSave.add(author);
//					authTheirsToOurs.put(authorRow.getTaxonomyAuthorId(), author);
				});
			}
			Lists.partition(toSave, 1000).forEach(batch -> {
				log.warn("Saving {} taxonomyAuthors", batch.size());
				taxonomyAuthorRepository.saveAllAndFlush(batch);
				entityManager.flush();
			});
			toSave.clear();
		}

		log.warn("Done.");
	}

	/**
	 * Copies the attributes of a GRIN genus row onto a genus entity.
	 *
	 * @param genusRow source GRIN genus row
	 * @param genus entity that receives the values
	 * @param famTheirsToOurs families keyed by the family id used in the GRIN rows
	 * @return the same {@code genus} instance that was passed in
	 */
	private TaxonomyGenus applyGrinGenus(GenusRow genusRow, TaxonomyGenus genus, Map<Long,TaxonomyFamily> famTheirsToOurs) {
		// Identifier and codes
		genus.setGrinId(genusRow.getTaxonomyGenusId());
		genus.setQualifyingCode(genusRow.getQualifyingCode());
		genus.setHybridCode(genusRow.getHybridCode());

		// Genus name and authority
		genus.setGenusName(genusRow.getGenusName());
		genus.setGenusAuthority(genusRow.getGenusAuthority());

		// Infrageneric names
		genus.setSubgenusName(genusRow.getSubgenusName());
		genus.setSectionName(genusRow.getSectionName());
		genus.setSubsectionName(genusRow.getSubsectionName());
		genus.setSeriesName(genusRow.getSeriesName());
		genus.setSubseriesName(genusRow.getSubseriesName());

		// The family is null when the lookup has no entry for the row's family id
		final TaxonomyFamily family = famTheirsToOurs.get(genusRow.getTaxonomyFamilyId());
		genus.setTaxonomyFamily(family);

		return genus;
	}

	/**
	 * Copies the attributes of a GRIN species row onto a species entity.
	 * <p>
	 * Row fields that are deliberately not copied: priority sites, curators, verifier,
	 * created date and modified date (the latter must not be touched because it is versioned).
	 *
	 * @param speciesRow source GRIN species row
	 * @param species entity that receives the values
	 * @param genTheirsToOurs genera keyed by the genus id used in the GRIN rows
	 * @return the same {@code species} instance, or {@code null} when the row's genus is not in
	 *         {@code genTheirsToOurs}
	 */
	private TaxonomySpecies applyGrinSpecies(SpeciesRow speciesRow, TaxonomySpecies species, Map<Long,TaxonomyGenus> genTheirsToOurs) {
		species.setGrinId(speciesRow.getTaxonomySpeciesId());

		// Resolve the genus first: nothing else is copied when it cannot be found
		species.setTaxonomyGenus(genTheirsToOurs.get(speciesRow.getGenusId()));
		if (species.getTaxonomyGenus() == null) {
			log.warn("Missing genus for species id={} genus_id={}", speciesRow.getSpeciesId(), speciesRow.getGenusId());
			// Returning null is intentional so that the caller fails with an NPE
			return null;
		}

		// Nomen number is narrowed to an int when present
		final var rowNomenNumber = speciesRow.getNomenNumber();
		Integer nomenNumber = null;
		if (rowNomenNumber != null) {
			nomenNumber = rowNomenNumber.intValue();
		}
		species.setNomenNumber(nomenNumber);

		// Species
		species.setIsSpecificHybrid(speciesRow.getIsSpecificHybrid());
		species.setSpeciesName(speciesRow.getSpeciesName());
		species.setSpeciesAuthority(speciesRow.getSpeciesAuthority());

		// Subspecies
		species.setIsSubspecificHybrid(speciesRow.getIsSubspecificHybrid());
		species.setSubspeciesName(speciesRow.getSubspeciesName());
		species.setSubspeciesAuthority(speciesRow.getSubspeciesAuthority());

		// Variety
		species.setIsVarietalHybrid(speciesRow.getIsVarietalHybrid());
		species.setVarietyName(speciesRow.getVarietyName());
		species.setVarietyAuthority(speciesRow.getVarietyAuthority());

		// Subvariety
		species.setIsSubvarietalHybrid(speciesRow.getIsSubvarietalHybrid());
		species.setSubvarietyName(speciesRow.getSubvarietyName());
		species.setSubvarietyAuthority(speciesRow.getSubvarietyAuthority());

		// Forma
		species.setIsFormaHybrid(speciesRow.getIsFormaHybrid());
		species.setFormaRankType(speciesRow.getFormaRankType());
		species.setFormaName(speciesRow.getFormaName());
		species.setFormaAuthority(speciesRow.getFormaAuthority());

		// Codes and flags
		species.setRestrictionCode(speciesRow.getRestrictionCode());
		species.setLifeFormCode(speciesRow.getLifeFormCode());
		species.setCommonFertilizationCode(speciesRow.getCommonFertilizationCode());
		species.setIsNamePending(speciesRow.getIsNamePending());
		species.setSynonymCode(speciesRow.getSynonymCode());

		// The verification date is interpreted as UTC; an absent date leaves the entity value as is
		final var nameVerifiedDate = speciesRow.getNameVerifiedDate();
		if (nameVerifiedDate != null) {
			species.setNameVerifiedDate(nameVerifiedDate.toInstant(ZoneOffset.UTC));
		}

		// Full name, protologue and notes
		species.setName(speciesRow.getName());
		species.setNameAuthority(speciesRow.getNameAuthority());
		species.setProtologue(speciesRow.getProtologue());
		species.setProtologueVirtualPath(speciesRow.getProtologueVirtualPath());
		species.setNote(speciesRow.getNote());
		species.setSiteNote(speciesRow.getSiteNote());
		species.setAlternateName(speciesRow.getAlternateName());

		return species;
	}

	/**
	 * Logs a one-line summary of a GRIN species row at INFO level.
	 *
	 * @param message text placed at the start of the log line
	 * @param species row to describe
	 */
	private void print(String message, SpeciesRow species) {
		// A blank synonym code is printed as an empty string
		final String synonymCode = StringUtils.defaultIfBlank(species.getSynonymCode(), "");

		log.info("{} {} {} {} proto={} id={} tgid={}",
			message, synonymCode, species.getName(), species.getNameAuthority(),
			species.getProtologue(), species.getTaxonomySpeciesId(), species.getTaxonomyGenusId());
	}

	/**
	 * Logs a one-line summary of a species entity at INFO level, including the local and GRIN
	 * identifiers of the species and of its genus.
	 *
	 * @param message text placed at the start of the log line
	 * @param species entity to describe; its genus may be {@code null}
	 */
	private void print(String message, TaxonomySpecies species) {
		// Without a genus, both genus identifiers are printed as the text "null"
		Object genusId = "null";
		Object genusGrinId = "null";
		final TaxonomyGenus genus = species.getTaxonomyGenus();
		if (genus != null) {
			genusId = genus.getId();
			genusGrinId = genus.getGrinId();
		}

		// A blank synonym code is printed as an empty string
		final String synonymCode = StringUtils.defaultIfBlank(species.getSynonymCode(), "");

		log.info("{} {} {} {} proto={} id={}/{} tgid={}/{}",
			message, synonymCode, species.getName(), species.getNameAuthority(),
			species.getProtologue(), species.getId(), species.getGrinId(),
			genusId, genusGrinId);
	}

	/**
	 * Builds the lookup key of a genus entity: the first three characters of its genus name,
	 * lower-cased.
	 *
	 * @param genus entity to index
	 * @return the key, or {@code null} when the genus name is {@code null}
	 */
	private String indexLookupKey(TaxonomyGenus genus) {
		final String prefix = StringUtils.substring(genus.getGenusName(), 0, 3);
		return StringUtils.toRootLowerCase(prefix);
	}

	/**
	 * Builds the lookup key of a GRIN genus row: the first three characters of its genus name,
	 * lower-cased.
	 *
	 * @param genus row to index
	 * @return the key, or {@code null} when the genus name is {@code null}
	 */
	private String indexLookupKey(GenusRow genus) {
		final String prefix = StringUtils.substring(genus.getGenusName(), 0, 3);
		return StringUtils.toRootLowerCase(prefix);
	}

	/**
	 * Builds the lookup key of a species entity: its species name, lower-cased.
	 *
	 * @param species entity to index
	 * @return the key, or {@code null} when the species name is {@code null}
	 */
	private String indexLookupKey(TaxonomySpecies species) {
		final String speciesName = species.getSpeciesName();
		return StringUtils.toRootLowerCase(speciesName);
	}

	/**
	 * Builds the lookup key of a GRIN species row: its species name, lower-cased.
	 *
	 * @param species row to index
	 * @return the key, or {@code null} when the species name is {@code null}
	 */
	private String indexLookupKey(SpeciesRow species) {
		final String speciesName = species.getSpeciesName();
		return StringUtils.toRootLowerCase(speciesName);
	}

	/**
	 * Builds the lookup key of an author: the first two characters of the short name,
	 * lower-cased.
	 * <p>
	 * Uses the null-safe {@code StringUtils.substring} so that a short name with fewer than two
	 * characters yields a shorter key, and a missing short name yields a {@code null} key,
	 * instead of throwing.
	 *
	 * @param author author to index
	 * @return the key, or {@code null} when the short name is {@code null}
	 */
	private String indexLookupKey(TaxonomyAuthor author) {
		return StringUtils.toRootLowerCase(StringUtils.substring(author.getShortName(), 0, 2));
	}

	/**
	 * Logs a one-line summary of a GRIN genus row at INFO level.
	 *
	 * @param message text placed at the start of the log line
	 * @param m row to describe
	 */
	private void print(String message, GenusRow m) {
		// A blank hybrid code is printed as an empty string, directly in front of the genus name
		final String hybridCode = StringUtils.defaultIfBlank(m.getHybridCode(), "");

		log.info("{} {} {}{} {} {} {} {} {} {} tf={} gid={}",
			message, m.getQualifyingCode(), hybridCode, m.getGenusName(), m.getGenusAuthority(),
			m.getSubgenusName(), m.getSectionName(), m.getSubsectionName(), m.getSeriesName(),
			m.getSubseriesName(), m.getTaxonomyFamilyId(), m.getTaxonomyGenusId());
	}

	/**
	 * Logs a one-line summary of a genus entity at INFO level, including its family id and its
	 * local and GRIN identifiers.
	 *
	 * @param message text placed at the start of the log line
	 * @param m entity to describe; its family may be {@code null}
	 */
	private void print(String message, TaxonomyGenus m) {
		// The family id is logged as null when the genus has no family
		Object familyId = null;
		final TaxonomyFamily family = m.getTaxonomyFamily();
		if (family != null) {
			familyId = family.getId();
		}

		// A blank hybrid code is printed as an empty string, directly in front of the genus name
		final String hybridCode = StringUtils.defaultIfBlank(m.getHybridCode(), "");

		log.info("{} {} {}{} {} {} {} {} {} {} tf={} gid={}/{}",
			message, m.getQualifyingCode(), hybridCode, m.getGenusName(), m.getGenusAuthority(),
			m.getSubgenusName(), m.getSectionName(), m.getSubsectionName(), m.getSeriesName(),
			m.getSubseriesName(), familyId, m.getId(), m.getGrinId());
	}

	/**
	 * Makes sure the taxonomy data files are present in {@code folder}.
	 * <p>
	 * The folder is created when it does not exist. When {@code taxonomy_genus.txt} or
	 * {@code taxonomy_species.txt} is missing, the current GRIN-Taxonomy cabinet file is
	 * downloaded to a temporary file and unpacked into {@code folder}. The temporary file is
	 * removed afterwards, also when downloading or unpacking fails.
	 *
	 * @param folder directory that holds, or will receive, the taxonomy files
	 * @throws IOException if the folder or the temporary file cannot be created, or when thrown
	 *         by the download or unpack step
	 */
	static void downloadDataIfNeeded(File folder) throws IOException {
		if (!folder.exists()) {
			log.warn("Making directory {}", folder.getAbsolutePath());

			// mkdirs() returns false when the directory was created concurrently; only fail
			// when the directory is really not there
			if (!folder.mkdirs() && !folder.isDirectory()) {
				throw new IOException("Failed to create data folder at " + folder.getAbsolutePath());
			}
		}

		// The two required files
		final File genusFile = new File(folder, "taxonomy_genus.txt");
		final File speciesFile = new File(folder, "taxonomy_species.txt");

		if (!genusFile.exists() || !speciesFile.exists()) {
			log.warn("Taxonomy data not provided in {}, starting download", folder.getAbsolutePath());
			final TaxonomyDownloader dl = new TaxonomyDownloader();

			log.warn("Downloading GRIN-Taxonomy database to {}", folder.getAbsolutePath());
			final File downloadedCabFile = File.createTempFile("grin-", ".cab");
			try {
				dl.downloadCurrent(downloadedCabFile);
				TaxonomyDownloader.unpackCabinetFile(downloadedCabFile, folder, false);
			} finally {
				// Clean up in all cases so a failed download or unpack does not leave the
				// temporary file behind. Deletion is best-effort so that it cannot mask the
				// exception that got us here.
				if (downloadedCabFile.exists() && downloadedCabFile.canWrite()) {
					log.warn("Deleting downloaded file {}", downloadedCabFile.getAbsolutePath());
					if (!FileUtils.deleteQuietly(downloadedCabFile)) {
						log.warn("Could not delete downloaded file {}", downloadedCabFile.getAbsolutePath());
					}
				}
			}
		}
	}

	/**
	 * Implementation of a group-by list
	 *
	 * @param <K> key
	 * @param <V> value
	 */
	public static class LookupList<K, V> extends HashMap<K, List<V>> {
		private static final long serialVersionUID = 2452703619583443005L;

		public V add(K key, V element) {
			computeIfAbsent(key, k -> new LinkedList<>()).add(element);
			return element;
		}
	}
}