UsdaTaxonomyUpdater.java

/*
 * Copyright 2020 Global Crop Diversity Trust
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gringlobal.worker;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.time.ZoneOffset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

import javax.persistence.EntityManager;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.genesys.taxonomy.download.TaxonomyDownloader;
import org.genesys.taxonomy.gringlobal.component.CabReader;
import org.genesys.taxonomy.gringlobal.model.AuthorRow;
import org.genesys.taxonomy.gringlobal.model.FamilyRow;
import org.genesys.taxonomy.gringlobal.model.GenusRow;
import org.genesys.taxonomy.gringlobal.model.SpeciesRow;
import org.gringlobal.api.exception.InvalidApiUsageException;
import org.gringlobal.model.TaxonomyAuthor;
import org.gringlobal.model.TaxonomyFamily;
import org.gringlobal.model.TaxonomyGenus;
import org.gringlobal.model.TaxonomySpecies;
import org.gringlobal.persistence.TaxonomyAuthorRepository;
import org.gringlobal.persistence.TaxonomyFamilyRepository;
import org.gringlobal.persistence.TaxonomyGenusRepository;
import org.gringlobal.persistence.TaxonomySpeciesRepository;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.security.access.prepost.PreAuthorize;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;

import com.google.common.collect.Lists;
import com.opencsv.CSVReader;

import lombok.extern.slf4j.Slf4j;

/**
 * The component downloads the current GRIN Taxonomy database if no local copy
 * exists and updates the Family, Genus, Species and Author tables in the local
 * database.
 * 
 * Records are matched by their stored GRIN Taxonomy ID when the local record
 * has one, otherwise by names; local identifiers will not match GRIN Taxonomy
 * IDs.
 * 
 * @author Matija Obreza
 */
@Component
@Slf4j
public class UsdaTaxonomyUpdater {

	/** Genus name for which the matching steps are additionally logged through {@code print}. */
	private static final String DEBUG_GENUS_NAME = "Neurachne";
	/** Full species name for which the matching steps are additionally logged through {@code print}. */
	private static final String DEBUG_SPECIES_NAME = "Neurachne alopecuroides";

	// Repositories of the local taxonomy tables that are read and updated
	@Autowired
	private TaxonomyFamilyRepository taxonomyFamilyRepository;
	@Autowired
	private TaxonomyGenusRepository taxonomyGenusRepository;
	@Autowired
	private TaxonomySpeciesRepository taxonomySpeciesRepository;
	@Autowired
	private TaxonomyAuthorRepository taxonomyAuthorRepository;

	/** Folder in the system temporary directory holding the taxonomy_*.txt source files. */
	private File downloadFolder = new File(FileUtils.getTempDirectory(), "grin-taxonomy-source"); // + System.currentTimeMillis());

	/** Flushed after every saved batch during the update. */
	@Autowired
	private EntityManager entityManager;


	/**
	 * Refresh the local taxonomy tables from GRIN Taxonomy: make sure the source
	 * files are present in the download folder, then apply them to the local
	 * database. Runs in a single transaction and requires the
	 * {@code GROUP_ADMINS} authority.
	 * 
	 * @throws Exception when the source data cannot be obtained or applied
	 */
	@PreAuthorize("hasAuthority('GROUP_ADMINS')")
	@Transactional
	public void update() throws Exception {
		final File sourceFolder = this.downloadFolder;
		final String sourcePath = sourceFolder.getAbsolutePath();
		log.info("Updating GRIN taxonomy database from folder {}", sourcePath);

		// Fetch the data first, then synchronize the tables
		downloadDataIfNeeded(sourceFolder);
		this.updateLocalDatabase();

		log.warn("Taxonomy database updated successfully. Transaction will now be committed. This takes time!");
	}

	/**
	 * Synchronize the local taxonomy tables with the GRIN Taxonomy source files
	 * found in the download folder.
	 * 
	 * <p>
	 * The update starts with {@link TaxonomyFamily}, then {@link TaxonomyGenus},
	 * {@link TaxonomySpecies} and finally {@link TaxonomyAuthor}. A source record
	 * is mapped to the local record with the same GRIN identifier when there is
	 * one, otherwise to the single local record with matching (case-insensitive)
	 * names; when nothing matches a new local record is created. No records are
	 * removed from the local database.
	 * </p>
	 * 
	 * <p>
	 * Records are saved in batches first and their self-references ("current"
	 * family, genus, species) and the type genus of families are resolved in a
	 * second pass, once local identifiers exist.
	 * </p>
	 * 
	 * <p>
	 * Note: The update may update capitalization of names.
	 * </p>
	 * 
	 * @throws InvalidApiUsageException when more than one local record matches a
	 *         source record
	 * @throws Exception when a source file cannot be read
	 */
	private void updateLocalDatabase() throws Exception {
		log.info("Loading taxonomy_family.txt");
		// Source (GRIN) identifier -> local entity
		Map<Long, TaxonomyFamily> famTheirsToOurs = new HashMap<>();
		Map<Long, TaxonomyGenus> genTheirsToOurs = new HashMap<>();
		Map<Long, TaxonomySpecies> speTheirsToOurs = new HashMap<>();
//		Map<Long, TaxonomyAuthor> authTheirsToOurs = new HashMap<>();

		// Source family id -> source id of its type genus, resolved after genera are loaded
		Map<Long, Long> currentTypeGenus = new HashMap<>();

		{
			log.warn("Loading {}/taxonomy_family.txt", downloadFolder);

			// Source family id -> source id of its current family
			Map<Long, Long> currentFamily = new HashMap<>();
			List<TaxonomyFamily> allFamilies = taxonomyFamilyRepository.findAll();
			final Map<Long, TaxonomyFamily> allFamiliesByGrinId = new HashMap<>();
			allFamilies.forEach(family -> {
				if (family.getGrinId() != null) {
					allFamiliesByGrinId.put(family.getGrinId(), family);
				}
			});
			List<TaxonomyFamily> toSave = new ArrayList<>();
			// read taxonomy_family.txt
			try (CSVReader reader = CabReader.openCsvReader(new FileInputStream(new File(downloadFolder, "taxonomy_family.txt")), 0)) {
				var beanReader = CabReader.beanReader(FamilyRow.class, reader);
				beanReader.forEach(familyRow -> {
					// Template holding the source values, used for name matching
					TaxonomyFamily family = new TaxonomyFamily();
					family.setGrinId(familyRow.getTaxonomyFamilyId());
					// family.setId(familyRow.getTaxonomyFamilyId());
					// family.setTypeTaxonomyGenus(familyRow.getTypeTaxonomyGenusId());
					family.setFamilyName(familyRow.getFamilyName());
					family.setFamilyAuthority(familyRow.getFamilyAuthority());
					family.setSubfamilyName(familyRow.getSubfamilyName());
					family.setTribeName(familyRow.getTribeName());
					family.setSubtribeName(familyRow.getSubtribeName());

					var other = allFamiliesByGrinId.get(familyRow.getTaxonomyFamilyId());
					if (other != null) {
						// Known GRIN identifier: update that record
						family = other;
					} else {
						if (allFamilies.size() > 0) {
							final TaxonomyFamily compareTo = family;
							final List<TaxonomyFamily> narrow = allFamilies.stream()
								// filter
								.filter(m -> (
										StringUtils.equalsIgnoreCase(m.getFamilyName(), compareTo.getFamilyName())
										&& StringUtils.equalsIgnoreCase(m.getFamilyAuthority(), compareTo.getFamilyAuthority())
										&& StringUtils.equalsIgnoreCase(m.getSubfamilyName(), compareTo.getSubfamilyName())
										&& StringUtils.equalsIgnoreCase(m.getTribeName(), compareTo.getTribeName())
										&& StringUtils.equalsIgnoreCase(m.getSubtribeName(), compareTo.getSubtribeName())
								))
								// print
								.peek(m -> {
									log.debug("{} {} {} {} {}", m.getFamilyName(), m.getFamilyAuthority(), m.getSubfamilyName(), m.getTribeName(), m.getSubtribeName());
								})
								// collect
								.collect(Collectors.toList());

							if (narrow.size() == 1) {
								family = narrow.get(0);
							} else if (narrow.size() == 0) {
								log.debug("{} matches found! Will create new entry.", narrow.size());
							} else {
								throw new InvalidApiUsageException("This shouldn't happen, your taxonomy_family needs cleaning: " + family.getFamilyName());
							}
						}
					}

					// Apply source values to the matched (or new) record
					family.setGrinId(familyRow.getTaxonomyFamilyId());
					family.setFamilyName(familyRow.getFamilyName());
					family.setFamilyAuthority(familyRow.getFamilyAuthority());
					family.setSubfamilyName(familyRow.getSubfamilyName());
					family.setTribeName(familyRow.getTribeName());
					family.setSubtribeName(familyRow.getSubtribeName());

					family.setSuprafamilyRankCode(familyRow.getSuprafamilyRankCode());
					family.setSuprafamilyRankName(familyRow.getSuprafamilyRankName());
					family.setAlternateName(familyRow.getAlternateName());
					family.setFamilyTypeCode(familyRow.getFamilyTypeCode());
					family.setNote(familyRow.getNote());

					toSave.add(family);
					famTheirsToOurs.put(familyRow.getTaxonomyFamilyId(), family);
					currentFamily.put(familyRow.getTaxonomyFamilyId(), familyRow.getCurrentTaxonomyFamilyId());
					currentTypeGenus.put(familyRow.getTaxonomyFamilyId(), familyRow.getTypeTaxonomyGenusId());
				});
			}

			// Save updates
			Lists.partition(toSave, 100).forEach(batch -> {
				log.warn("Saving {} taxonomyFamily", batch.size());
				taxonomyFamilyRepository.saveAll(batch);
				entityManager.flush();
			});
			toSave.clear();

			// Update references
			currentFamily.forEach((theirId, theirCurrentId) -> {
				var family = famTheirsToOurs.get(theirId);
				// null when the source names no current family or it refers to an unknown record
				var current = famTheirsToOurs.get(theirCurrentId);
				if (current == null || family.getCurrentTaxonomyFamily() == null || !family.getCurrentTaxonomyFamily().getId().equals(current.getId())) {
					var reloaded = taxonomyFamilyRepository.findById(family.getId()).orElseThrow();
					// An unresolved current family clears the reference (as is done for species)
					// instead of failing on current.getId()
					reloaded.setCurrentTaxonomyFamily(current == null ? null : taxonomyFamilyRepository.findById(current.getId()).orElseThrow());
					toSave.add(reloaded);
				}
			});
			// Save updates
			Lists.partition(toSave, 100).forEach(batch -> {
				log.warn("Saving {} taxonomyFamily", batch.size());
				taxonomyFamilyRepository.saveAll(batch);
				entityManager.flush();
			});

			allFamilies.clear();
			toSave.clear();
			allFamiliesByGrinId.clear();
		}

		{
			// read taxonomy_genus.txt
			log.warn("Loading {}/taxonomy_genus.txt", downloadFolder);
			// Group list of genera by lookup key (see indexLookupKey) for faster lookups
			final LookupList<String, TaxonomyGenus> allGeneraIndex = new LookupList<>();
			final Map<Long, TaxonomyGenus> allGeneraByGrinId = new HashMap<>();
			taxonomyGenusRepository.findAll().forEach(genus -> {
				allGeneraIndex.add(indexLookupKey(genus), genus);
				if (genus.getGrinId() != null) {
					allGeneraByGrinId.put(genus.getGrinId(), genus);
				}
			});

			List<TaxonomyGenus> toSave = new ArrayList<>();
			// Source genus id -> source id of its current genus
			Map<Long, Long> currentGenus = new HashMap<>();

			try (CSVReader reader = CabReader.openCsvReader(new FileInputStream(new File(downloadFolder, "taxonomy_genus.txt")), 0)) {
				var beanReader = CabReader.beanReader(GenusRow.class, reader);
				beanReader.forEach(genusRow -> {
					// Template holding the source values, used for name matching
					TaxonomyGenus genus = new TaxonomyGenus();
					genus.setGrinId(genusRow.getTaxonomyGenusId());
					genus.setQualifyingCode(genusRow.getQualifyingCode());
					genus.setHybridCode(genusRow.getHybridCode());
					genus.setGenusName(genusRow.getGenusName());
					genus.setGenusAuthority(genusRow.getGenusAuthority());
					genus.setSubgenusName(genusRow.getSubgenusName());
					genus.setSectionName(genusRow.getSectionName());
					genus.setSubsectionName(genusRow.getSubsectionName());
					genus.setSeriesName(genusRow.getSeriesName());
					genus.setSubseriesName(genusRow.getSubseriesName());
					genus.setTaxonomyFamily(famTheirsToOurs.get(genusRow.getTaxonomyFamilyId()));
					if (genus.getTaxonomyFamily() == null) {
						// Genera of unknown families are skipped
						log.warn("No family with their id=" + genusRow.getTaxonomyFamilyId());
						return;
					}

					if (StringUtils.equalsIgnoreCase(genus.getGenusName(), DEBUG_GENUS_NAME)) {
						print(">> Matching", genus);
					}


					var other = allGeneraByGrinId.get(genusRow.getTaxonomyGenusId());
					if (other != null) {
						// Known GRIN identifier: update that record
						genus = other;
					} else {
						List<TaxonomyGenus> generaWithName = allGeneraIndex.get(indexLookupKey(genus));
						if (generaWithName != null) {
							final TaxonomyGenus compareTo = genus;

							if (compareTo.getGenusName().equals(DEBUG_GENUS_NAME)) {
								print(">> Looking for: ", compareTo);
							}
							List<TaxonomyGenus> narrow = generaWithName.stream()
								// print
								.peek(m -> {
									if (compareTo.getGenusName().equals(DEBUG_GENUS_NAME)) {
										print("Candidate: ", m);
									}
								})
								// filter
								.filter(m -> (
									Objects.equals(m.getTaxonomyFamily().getId(), compareTo.getTaxonomyFamily().getId())
									&& StringUtils.equalsIgnoreCase(m.getGenusName(), compareTo.getGenusName())
									&& StringUtils.equalsIgnoreCase(m.getGenusAuthority(), compareTo.getGenusAuthority())
									&& StringUtils.equalsIgnoreCase(m.getSubgenusName(), compareTo.getSubgenusName())
									&& StringUtils.equalsIgnoreCase(m.getSectionName(), compareTo.getSectionName())
									&& StringUtils.equalsIgnoreCase(m.getSubsectionName(), compareTo.getSubsectionName())
									&& StringUtils.equalsIgnoreCase(m.getSeriesName(), compareTo.getSeriesName())
									&& StringUtils.equalsIgnoreCase(m.getSubseriesName(), compareTo.getSubseriesName())
								))
								// print
								.peek(m -> {
									if (m.getGenusName().equals(DEBUG_GENUS_NAME)) {
										print("Match", m);
									}
									log.debug("{} {} {} {} {} {} {}", m.getGenusName(), m.getGenusAuthority(), m.getSubgenusName(), m.getSectionName(), m.getSubsectionName(), m.getSeriesName(), m.getSubseriesName());
								})
								// collect
								.collect(Collectors.toList());

							if (narrow.size() == 1) {
								genus = narrow.get(0);
							} else if (narrow.size() == 0) {
								log.info("{} matches found for {} {} {} {} {} {} {}! Will create new entry.", narrow.size(), genus.getGenusName(), genus.getGenusAuthority(), genus
									.getSubgenusName(), genus.getSectionName(), genus.getSubsectionName(), genus.getSeriesName(), genus.getSubseriesName());
							} else {
								// Several candidates: try to disambiguate by hybrid and qualifying codes
								print("Too many matches for:", compareTo);
								narrow.forEach(m -> print(">> ", m));
								var narrower = narrow.stream().filter(m -> (
									StringUtils.equalsIgnoreCase(m.getHybridCode(), compareTo.getHybridCode())
									&& StringUtils.equalsIgnoreCase(m.getQualifyingCode(), compareTo.getQualifyingCode())
								)).collect(Collectors.toList());
								if (narrower.size() == 1) {
									genus = narrower.get(0);
								} else {
									throw new InvalidApiUsageException("This shouldn't happen, your taxonomy_genus needs cleaning: " + genus.getGenusName() + " " + genus.getGenusAuthority());
								}
							}
						} else {
							log.info("No existing genera for index={}", indexLookupKey(genus));
							// print("New taxonomy_genus", genus);
						}
					}

					if (StringUtils.equalsIgnoreCase(genus.getGenusName(), DEBUG_GENUS_NAME)) {
						print(">> Updating", genus);
					}

					// genus.setGenusId(genusRow.getGenusId());
					// genus.setCurrentGenusId(genusRow.getCurrentGenusId());
					// NOTE(review): getGenusId() and getTaxonomyGenusId() are both used as the source
					// genus identifier in this block; presumably they return the same value -- confirm
					// against GenusRow.
					genus.setGrinId(genusRow.getGenusId());
					genus.setTaxonomyFamily(famTheirsToOurs.get(genusRow.getTaxonomyFamilyId()));
					if (genus.getTaxonomyFamily() == null) {
						log.warn("No family with their id=" + genusRow.getTaxonomyFamilyId());
						return;
					}

					genus.setQualifyingCode(genusRow.getQualifyingCode());
					genus.setHybridCode(genusRow.getHybridCode());
					genus.setGenusName(genusRow.getGenusName());
					genus.setGenusAuthority(genusRow.getGenusAuthority());
					genus.setSubgenusName(genusRow.getSubgenusName());
					genus.setSectionName(genusRow.getSectionName());
					genus.setSubsectionName(genusRow.getSubsectionName());
					genus.setSeriesName(genusRow.getSeriesName());
					genus.setSubseriesName(genusRow.getSubseriesName());
					genus.setNote(genusRow.getNote());

					// genus.setCreatedDate(genusRow.getCreatedDate());
					// genus.setModifiedDate(genusRow.getModifiedDate()); // Do not update @Versioned modifiedDate

					if (StringUtils.equalsIgnoreCase(genus.getGenusName(), DEBUG_GENUS_NAME)) {
						print(">> Updated", genus);
					}

					toSave.add(genus);
					genTheirsToOurs.put(genusRow.getGenusId(), genus);
					currentGenus.put(genusRow.getTaxonomyGenusId(), genusRow.getCurrentTaxonomyGenusId());
				});
			}
			Lists.partition(toSave, 1000).forEach(batch -> {
				log.warn("Saving {} taxonomyGenus", batch.size());
				taxonomyGenusRepository.saveAll(batch);
				entityManager.flush();
			});
			toSave.clear();

			// Update references
			currentGenus.forEach((theirId, theirCurrentId) -> {
				var genus = genTheirsToOurs.get(theirId);
				// null when the source names no current genus or it refers to a skipped/unknown record
				var current = genTheirsToOurs.get(theirCurrentId);
				if (current == null || genus.getCurrentTaxonomyGenus() == null || !genus.getCurrentTaxonomyGenus().getId().equals(current.getId())) {
					var reloaded = taxonomyGenusRepository.findById(genus.getId()).orElseThrow();
					// An unresolved current genus clears the reference (as is done for species)
					// instead of failing on current.getId()
					reloaded.setCurrentTaxonomyGenus(current == null ? null : taxonomyGenusRepository.findById(current.getId()).orElseThrow());
					toSave.add(reloaded);
				}
			});
			// Save updates
			log.info("Updating {} genus references", toSave.size());
			Lists.partition(toSave, 1000).forEach(batch -> {
				log.warn("Saving {} taxonomyGenus", batch.size());
				taxonomyGenusRepository.saveAll(batch);
				entityManager.flush();
			});

			toSave.clear();
			allGeneraIndex.clear();
			allGeneraByGrinId.clear();

			{
				// Genera now have local identifiers: resolve the type genus of each family
				List<TaxonomyFamily> toSaveFam = new ArrayList<>();
				currentTypeGenus.forEach((theirId, theirGenusId) -> {
					TaxonomyFamily family = famTheirsToOurs.get(theirId);
					if (theirGenusId == null) {
						// Source has no type genus: clear ours if set
						if (family.getTypeTaxonomyGenus() != null) {
							family = taxonomyFamilyRepository.findById(family.getId()).orElseThrow();
							family.setTypeTaxonomyGenus(null);
							toSaveFam.add(family);
						}
					} else {
						var typeGenus = genTheirsToOurs.get(theirGenusId);
						if (typeGenus == null) {
							// The referenced genus was skipped or is not in the source: nothing to point to
							log.warn("Type genus with their genus_id={} not found for our taxonomy_family_id={}", theirGenusId, family.getId());
						} else if (family.getTypeTaxonomyGenus() == null || !family.getTypeTaxonomyGenus().getId().equals(typeGenus.getId())) {
							// Update only when the type genus is missing or differs
							family = taxonomyFamilyRepository.findById(family.getId()).orElseThrow();
							family.setTypeTaxonomyGenus(taxonomyGenusRepository.findById(typeGenus.getId()).orElseThrow());
							toSaveFam.add(family);
						}
					}
					if (family.getTypeTaxonomyGenus() == null && theirGenusId != null) {
						log.warn("Type genus is null: their genus_id={} our taxonomy_family_id={}", theirGenusId, family.getId());
					}
				});
				Lists.partition(toSaveFam, 100).forEach(batch -> {
					log.warn("Saving {} taxonomyFamily", batch.size());
					taxonomyFamilyRepository.saveAll(batch);
					entityManager.flush();
				});

				currentTypeGenus.clear();
			}
		}


		{
			// read taxonomy_species.txt
			log.warn("Loading {}/taxonomy_species.txt", downloadFolder);
			// Group list of species by epithet for faster lookups
			final LookupList<String, TaxonomySpecies> allSpeciesByEpithet = new LookupList<>();
			final Map<Long, TaxonomySpecies> allSpeciesByGrinId = new HashMap<>();
			taxonomySpeciesRepository.findAll().forEach(species -> {
				allSpeciesByEpithet.add(StringUtils.toRootLowerCase(species.getSpeciesName()), species);
				if (species.getGrinId() != null) {
					allSpeciesByGrinId.put(species.getGrinId(), species);
				}
			});

			List<TaxonomySpecies> toSave = new ArrayList<>();
			// Source species id -> source id of its current species
			Map<Long, Long> currentSpecies = new HashMap<>();

			try (CSVReader reader = CabReader.openCsvReader(new FileInputStream(new File(downloadFolder, "taxonomy_species.txt")), 0)) {
				final AtomicInteger counter = new AtomicInteger(0);
				var beanReader = CabReader.beanReader(SpeciesRow.class, reader);
				beanReader.forEach(speciesRow -> {
					if (counter.incrementAndGet() % 1000 == 0) {
						log.warn("Read {} species rows", counter.get());
					}

					// Resolve the genus before matching: the name comparison below dereferences it
					final TaxonomyGenus ourGenus = genTheirsToOurs.get(speciesRow.getGenusId());
					if (ourGenus == null) {
						log.warn("Missing genus for species id={} genus_id={}", speciesRow.getSpeciesId(), speciesRow.getGenusId());
						return;
					}

					// Template holding the source values, used for name matching
					TaxonomySpecies species = new TaxonomySpecies();
					species.setGrinId(speciesRow.getTaxonomySpeciesId());
					species.setTaxonomyGenus(ourGenus);
					species.setNomenNumber(speciesRow.getNomenNumber() == null ? null : speciesRow.getNomenNumber().intValue());
					species.setSpeciesName(speciesRow.getSpeciesName());
					species.setName(speciesRow.getName());
					species.setNameAuthority(speciesRow.getNameAuthority());
					species.setProtologue(speciesRow.getProtologue());

					var other = allSpeciesByGrinId.get(speciesRow.getTaxonomySpeciesId());
					if (other != null) {
						// Known GRIN identifier: update that record
						species = other;
					} else {
						log.debug("No species with usda_id={}! Searching for {} {}", speciesRow.getTaxonomySpeciesId(), speciesRow.getName(), speciesRow.getNameAuthority());

						List<TaxonomySpecies> speciesForEpithet = allSpeciesByEpithet.get(StringUtils.toRootLowerCase(species.getSpeciesName()));
						final TaxonomySpecies compareTo = species;

						if (speciesForEpithet != null) {
							if (StringUtils.equalsIgnoreCase(species.getName(), DEBUG_SPECIES_NAME)) {
								print(">> Looking for", species);
							}

							List<TaxonomySpecies> narrow = speciesForEpithet.stream()
								// debug
								.peek(m -> {
									if (StringUtils.equalsIgnoreCase(compareTo.getName(), DEBUG_SPECIES_NAME)) {
										print("Inspecting:", m);
									}
								})
								// filter
								.filter(m -> (
									Objects.equals(m.getTaxonomyGenus().getId(), compareTo.getTaxonomyGenus().getId())
									&& StringUtils.equalsIgnoreCase(StringUtils.trimToNull(m.getName()), StringUtils.trimToNull(compareTo.getName()))
									&& StringUtils.equalsIgnoreCase(StringUtils.trimToNull(m.getNameAuthority()), StringUtils.trimToNull(compareTo.getNameAuthority()))
									&& StringUtils.equalsIgnoreCase(StringUtils.trimToNull(m.getSynonymCode()), StringUtils.trimToNull(compareTo.getSynonymCode()))
									&& StringUtils.equalsIgnoreCase(StringUtils.trimToNull(m.getProtologue()), StringUtils.trimToNull(compareTo.getProtologue()))
								))
								// print
								.peek(m -> {
									if (StringUtils.equalsIgnoreCase(compareTo.getName(), DEBUG_SPECIES_NAME)) {
										print("Potential match:", m);
									}
									log.debug("{} {}", m.getName(), m.getNameAuthority());
								})
								// gather
								.collect(Collectors.toList());

							if (narrow.size() == 1) {
								species = narrow.get(0);
							} else if (narrow.size() == 0) {
								if (StringUtils.equalsIgnoreCase(species.getName(), DEBUG_SPECIES_NAME)) {
									print("No matches found, will add", species);
								}
								log.debug("{} matches found for {} {}! Will create new entry.", narrow.size(), species.getName(), species.getNameAuthority());
							} else {
								throw new InvalidApiUsageException("This shouldn't happen, your taxonomy_species needs cleaning: " + species.getName() + " " + species.getNameAuthority());
							}
						} else {
							log.debug("No species for epithet={}", species.getSpeciesName());
							if (StringUtils.equalsIgnoreCase(species.getName(), DEBUG_SPECIES_NAME)) {
								print("Will add", species);
							}
						}
					}

					if (StringUtils.equalsIgnoreCase(species.getName(), DEBUG_SPECIES_NAME)) {
						print(">> Updating", species);
					}

					// species.setSpeciesId(speciesRow.getSpeciesId());
					// species.setCurrentSpeciesId(speciesRow.getCurrentSpeciesId());
					// Apply source values to the matched (or new) record
					species.setGrinId(speciesRow.getTaxonomySpeciesId());
					species.setTaxonomyGenus(ourGenus);

					species.setNomenNumber(speciesRow.getNomenNumber() == null ? null : speciesRow.getNomenNumber().intValue());
					species.setIsSpecificHybrid(speciesRow.getIsSpecificHybrid());
					species.setSpeciesName(speciesRow.getSpeciesName());
					species.setSpeciesAuthority(speciesRow.getSpeciesAuthority());
					species.setIsSubspecificHybrid(speciesRow.getIsSubspecificHybrid());
					species.setSubspeciesName(speciesRow.getSubspeciesName());
					species.setSubspeciesAuthority(speciesRow.getSubspeciesAuthority());
					species.setIsVarietalHybrid(speciesRow.getIsVarietalHybrid());
					species.setVarietyName(speciesRow.getVarietyName());
					species.setVarietyAuthority(speciesRow.getVarietyAuthority());
					species.setIsSubvarietalHybrid(speciesRow.getIsSubvarietalHybrid());
					species.setSubvarietyName(speciesRow.getSubvarietyName());
					species.setSubvarietyAuthority(speciesRow.getSubvarietyAuthority());
					species.setIsFormaHybrid(speciesRow.getIsFormaHybrid());
					species.setFormaRankType(speciesRow.getFormaRankType());
					species.setFormaName(speciesRow.getFormaName());
					species.setFormaAuthority(speciesRow.getFormaAuthority());
					// species.setPrioritySite1(speciesRow.getPrioritySite1());
					// species.setPrioritySite2(speciesRow.getPrioritySite2());
					// species.setCurator1Id(speciesRow.getCurator1Id());
					// species.setCurator2Id(speciesRow.getCurator2Id());
					species.setRestrictionCode(speciesRow.getRestrictionCode());
					species.setLifeFormCode(speciesRow.getLifeFormCode());
					species.setCommonFertilizationCode(speciesRow.getCommonFertilizationCode());
					species.setIsNamePending(speciesRow.getIsNamePending());
					species.setSynonymCode(speciesRow.getSynonymCode());
					// species.setVerifierCooperator(speciesRow.getVerifierId());
					if (speciesRow.getNameVerifiedDate() != null) {
						species.setNameVerifiedDate(speciesRow.getNameVerifiedDate().toInstant(ZoneOffset.UTC));
					}

					species.setName(speciesRow.getName());
					species.setNameAuthority(speciesRow.getNameAuthority());
					species.setProtologue(speciesRow.getProtologue());
					species.setProtologueVirtualPath(speciesRow.getProtologueVirtualPath());
					species.setNote(speciesRow.getNote());
					species.setSiteNote(speciesRow.getSiteNote());
					species.setAlternateName(speciesRow.getAlternateName());

					// species.setCreatedDate(speciesRow.getCreatedDate());
					// species.setModifiedDate(speciesRow.getModifiedDate()); // Do not update @Versioned modifiedDate

					if (StringUtils.equalsIgnoreCase(species.getName(), DEBUG_SPECIES_NAME)) {
						print(">> Updated", species);
					}

					toSave.add(species);
					// NOTE(review): getSpeciesId() is used as the key here while getTaxonomySpeciesId() is
					// used as the GRIN identifier above; presumably the same value -- confirm against SpeciesRow.
					speTheirsToOurs.put(speciesRow.getSpeciesId(), species);
					currentSpecies.put(speciesRow.getSpeciesId(), speciesRow.getCurrentTaxonomySpeciesId());
				});
			}

			Lists.partition(toSave, 1000).forEach(batch -> {
				log.warn("Saving {} taxonomySpecies", batch.size());
				taxonomySpeciesRepository.saveAll(batch);
				entityManager.flush();
			});
			toSave.clear();

			// Update references
			currentSpecies.forEach((theirId, theirCurrentId) -> {
				var species = speTheirsToOurs.get(theirId);
				var current = speTheirsToOurs.get(theirCurrentId);
				if (current == null || species.getCurrentTaxonomySpecies() == null || !species.getCurrentTaxonomySpecies().getId().equals(current.getId())) {
					// An unresolved current species clears the reference
					species.setCurrentTaxonomySpecies(current);
					toSave.add(species);
				}
			});
			// Save updates
			log.info("Updating {} species references", toSave.size());
			Lists.partition(toSave, 1000).forEach(batch -> {
				log.warn("Saving {} taxonomySpecies", batch.size());
				taxonomySpeciesRepository.saveAll(batch);
				entityManager.flush();
			});

			toSave.clear();
		}

		{
			log.warn("Loading {}/taxonomy_author.txt", downloadFolder);

			List<TaxonomyAuthor> allAuthors = taxonomyAuthorRepository.findAll();
			List<TaxonomyAuthor> toSave = new ArrayList<>();
			// Group authors by the first two characters of their short name for faster lookups
			final LookupList<String, TaxonomyAuthor> authorsLookup = new LookupList<>();
			allAuthors.forEach(author -> {
				// StringUtils.substring tolerates null and names shorter than two characters
				authorsLookup.add(StringUtils.substring(author.getShortName(), 0, 2), author);
			});

			try (CSVReader reader = CabReader.openCsvReader(new FileInputStream(new File(downloadFolder, "taxonomy_author.txt")), 0)) {
				var beanReader = CabReader.beanReader(AuthorRow.class, reader);
				beanReader.forEach(authorRow -> {
					TaxonomyAuthor author = new TaxonomyAuthor();
					author.setShortName(authorRow.getShortName());

					if (author.getShortName() == null) {
						log.warn("Missing shortName id={}", authorRow.getTaxonomyAuthorId());
						return;
					}

					List<TaxonomyAuthor> authorsByFirst = authorsLookup.get(StringUtils.substring(author.getShortName(), 0, 2));
					if (authorsByFirst != null) {
						final TaxonomyAuthor compareTo = author;
						List<TaxonomyAuthor> narrow = authorsByFirst.stream()
							// filter
							.filter(m -> (
								StringUtils.equalsIgnoreCase(StringUtils.trimToNull(m.getShortName()), StringUtils.trim(compareTo.getShortName()))
							))
							// print
							.peek(m -> {
								log.debug("{}", m.getShortName());
							})
							// gather
							.collect(Collectors.toList());

						if (narrow.size() == 1) {
							author = narrow.get(0);
						} else if (narrow.size() == 0) {
							log.debug("{} matches found for {}! Will create new entry.", narrow.size(), author.getShortName());
						} else {
							narrow.forEach(match -> {
								log.warn("Found id={} short={} for input {}", match.getId(), match.getShortName(), compareTo.getShortName());
							});
							throw new InvalidApiUsageException("This shouldn't happen, your taxonomy_author needs cleaning: " + author.getShortName());
						}
					}

					author.setFullName(authorRow.getFullName());
					author.setFullNameExpandedDiacritic(authorRow.getFullNameExpandedDiacritic());
					author.setShortName(authorRow.getShortName());
					author.setShortNameExpandedDiacritic(authorRow.getShortNameExpandedDiacritic());
					author.setNote(authorRow.getNote());

					toSave.add(author);
//					authTheirsToOurs.put(authorRow.getTaxonomyAuthorId(), author);
				});
			}
			Lists.partition(toSave, 1000).forEach(batch -> {
				log.warn("Saving {} taxonomyAuthors", batch.size());
				taxonomyAuthorRepository.saveAll(batch);
				entityManager.flush();
			});
			toSave.clear();
		}

		log.warn("Done.");
	}

	/**
	 * Log a one-line summary of a species at INFO level: synonym code, name,
	 * authority, protologue, local and GRIN identifiers, and the identifiers of
	 * its genus (the text "null" when no genus is set).
	 * 
	 * @param message prefix of the log line
	 * @param species the species to describe
	 */
	private void print(String message, TaxonomySpecies species) {
		final TaxonomyGenus genus = species.getTaxonomyGenus();

		// A missing genus is rendered as the text "null" for both identifiers
		final Object genusId;
		final Object genusGrinId;
		if (genus == null) {
			genusId = "null";
			genusGrinId = "null";
		} else {
			genusId = genus.getId();
			genusGrinId = genus.getGrinId();
		}

		final CharSequence synonymCode = StringUtils.defaultIfBlank(species.getSynonymCode(), "");

		log.info("{} {} {} {} proto={} id={}/{} tgid={}/{}",
			message, synonymCode, species.getName(), species.getNameAuthority(), species.getProtologue(),
			species.getId(), species.getGrinId(), genusId, genusGrinId);
	}

	/**
	 * Compute the key under which a genus is grouped in the genus lookup index:
	 * the first (at most) three characters of the genus name, lower-cased.
	 * 
	 * <p>
	 * The key is lower-cased because candidates are compared with
	 * {@code equalsIgnoreCase}; a case-sensitive key would keep names that differ
	 * only in capitalization from ever being compared.
	 * </p>
	 * 
	 * @param genus the genus to index or look up
	 * @return the lookup key, or {@code null} when the genus has no name
	 */
	private String indexLookupKey(TaxonomyGenus genus) {
		return StringUtils.toRootLowerCase(StringUtils.substring(genus.getGenusName(), 0, 3));
	}

	/**
	 * Log a one-line summary of a genus at INFO level: qualifying and hybrid
	 * codes, names of all ranks, the local identifier of its family (if set) and
	 * the local and GRIN identifiers of the genus.
	 * 
	 * @param message prefix of the log line
	 * @param m the genus to describe
	 */
	private void print(String message, TaxonomyGenus m) {
		final TaxonomyFamily family = m.getTaxonomyFamily();
		final Object familyId = family == null ? null : family.getId();
		final CharSequence hybridCode = StringUtils.defaultIfBlank(m.getHybridCode(), "");

		log.info("{} {} {}{} {} {} {} {} {} {} tf={} gid={}/{}", 
			message, m.getQualifyingCode(), hybridCode, m.getGenusName(), m.getGenusAuthority(),
			m.getSubgenusName(), m.getSectionName(), m.getSubsectionName(), m.getSeriesName(), m.getSubseriesName(),
			familyId, m.getId(), m.getGrinId());
	}

	/**
	 * Make sure the GRIN Taxonomy source files are available in {@code folder}.
	 * 
	 * <p>
	 * The folder is created when missing. When any of the files read by the
	 * update (family, genus, species, author) is absent, the current GRIN
	 * Taxonomy cabinet file is downloaded to a temporary file and unpacked into
	 * the folder. The temporary file is removed afterwards, also when the
	 * download or unpacking fails.
	 * </p>
	 * 
	 * @param folder the folder holding the taxonomy_*.txt files
	 * @throws IOException when the folder cannot be created or the data cannot be
	 *         downloaded or unpacked
	 */
	static void downloadDataIfNeeded(File folder) throws IOException {
		if (!folder.exists()) {
			log.warn("Making directory " + folder.getAbsolutePath());

			if (!folder.mkdirs() || !folder.exists()) {
				throw new IOException("Failed to create data folder at " + folder.getAbsolutePath());
			}
		}

		// Every file read by the update is required
		final String[] requiredFiles = { "taxonomy_family.txt", "taxonomy_genus.txt", "taxonomy_species.txt", "taxonomy_author.txt" };
		boolean allPresent = true;
		for (String fileName : requiredFiles) {
			if (!new File(folder, fileName).exists()) {
				allPresent = false;
				break;
			}
		}
		if (allPresent) {
			return;
		}

		log.warn("Taxonomy data not provided in {}, starting download", folder.getAbsolutePath());
		final TaxonomyDownloader dl = new TaxonomyDownloader();

		log.warn("Downloading GRIN-Taxonomy database to {}", folder.getAbsolutePath());
		final File downloadedCabFile = File.createTempFile("grin-", ".cab");
		try {
			dl.downloadCurrent(downloadedCabFile);
			TaxonomyDownloader.unpackCabinetFile(downloadedCabFile, folder, false);
		} finally {
			// Always clean up the temporary file; a failed delete must not mask the original error
			if (downloadedCabFile.exists() && downloadedCabFile.canWrite()) {
				log.warn("Deleting downloaded file {}", downloadedCabFile.getAbsolutePath());
				if (!FileUtils.deleteQuietly(downloadedCabFile)) {
					log.warn("Could not delete downloaded file {}", downloadedCabFile.getAbsolutePath());
				}
			}
		}
	}

	/**
	 * A map that groups elements into lists by key (a simple "group by").
	 *
	 * @param <K> type of the grouping key
	 * @param <V> type of the grouped elements
	 */
	public static class LookupList<K, V> extends HashMap<K, List<V>> {
		private static final long serialVersionUID = 2452703619583443005L;

		/**
		 * Append an element to the list registered for the key, creating the list
		 * on first use.
		 *
		 * @param key the grouping key
		 * @param element the element to add
		 * @return the added element
		 */
		public V add(K key, V element) {
			List<V> bucket = get(key);
			if (bucket == null) {
				bucket = new LinkedList<>();
				put(key, bucket);
			}
			bucket.add(element);
			return element;
		}
	}
}