// UsdaTaxonomyUpdater.java
/*
* Copyright 2020 Global Crop Diversity Trust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.gringlobal.worker;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.time.ZoneOffset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import javax.persistence.EntityManager;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.genesys.taxonomy.download.TaxonomyDownloader;
import org.genesys.taxonomy.gringlobal.component.CabReader;
import org.genesys.taxonomy.gringlobal.model.AuthorRow;
import org.genesys.taxonomy.gringlobal.model.FamilyRow;
import org.genesys.taxonomy.gringlobal.model.GenusRow;
import org.genesys.taxonomy.gringlobal.model.SpeciesRow;
import org.gringlobal.api.exception.InvalidApiUsageException;
import org.gringlobal.model.TaxonomyAuthor;
import org.gringlobal.model.TaxonomyFamily;
import org.gringlobal.model.TaxonomyGenus;
import org.gringlobal.model.TaxonomySpecies;
import org.gringlobal.persistence.TaxonomyAuthorRepository;
import org.gringlobal.persistence.TaxonomyFamilyRepository;
import org.gringlobal.persistence.TaxonomyGenusRepository;
import org.gringlobal.persistence.TaxonomySpeciesRepository;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.security.access.prepost.PreAuthorize;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;
import com.google.common.collect.Lists;
import com.opencsv.CSVReader;
import lombok.extern.slf4j.Slf4j;
/**
* The component downloads current GRIN Taxonomy database if no local copy
* exists and updates Family, Genus and Species tables in the local database.
*
* The matching is done on names only, local identifiers will not match GRIN
* Taxonomy IDs.
*
* @author Matija Obreza
*/
@Component
@Slf4j
public class UsdaTaxonomyUpdater {
// Genus name that triggers extra diagnostic print(...) output while genera are being matched.
private static final String DEBUG_GENUS_NAME = "Neurachne";
// Full species name that triggers extra diagnostic print(...) output while species are being matched.
private static final String DEBUG_SPECIES_NAME = "Neurachne alopecuroides";
// Repository for local taxonomy_family records.
@Autowired
private TaxonomyFamilyRepository taxonomyFamilyRepository;
// Repository for local taxonomy_genus records.
@Autowired
private TaxonomyGenusRepository taxonomyGenusRepository;
// Repository for local taxonomy_species records.
@Autowired
private TaxonomySpeciesRepository taxonomySpeciesRepository;
// Repository for local taxonomy_author records.
@Autowired
private TaxonomyAuthorRepository taxonomyAuthorRepository;
// Folder "grin-taxonomy-source" inside the system temp directory; the taxonomy_*.txt source files are read from here.
private File downloadFolder = new File(FileUtils.getTempDirectory(), "grin-taxonomy-source"); // + System.currentTimeMillis());
// Used to flush pending changes after every saved batch.
@Autowired
private EntityManager entityManager;
/**
 * Refreshes the local taxonomy tables from the GRIN Taxonomy source files,
 * fetching the source data first when the download folder does not provide it.
 * Requires the {@code GROUP_ADMINS} authority and is annotated {@code @Transactional}.
 *
 * @throws Exception when obtaining the source data or updating the local tables fails
 */
@PreAuthorize("hasAuthority('GROUP_ADMINS')")
@Transactional
public void update() throws Exception {
  final File sourceFolder = this.downloadFolder;
  final String sourcePath = sourceFolder.getAbsolutePath();
  log.info("Updating GRIN taxonomy database from folder {}", sourcePath);

  // Make sure the source files are present, then apply them
  UsdaTaxonomyUpdater.downloadDataIfNeeded(sourceFolder);
  this.updateLocalDatabase();

  log.warn("Taxonomy database updated successfully. Transaction will now be committed. This takes time!");
}
/**
 * Synchronizes the local taxonomy tables with the GRIN Taxonomy files found in
 * {@code downloadFolder}.
 *
 * <p>
 * The update starts with {@link TaxonomyFamily}, then {@link TaxonomyGenus},
 * {@link TaxonomySpecies} and finally {@link TaxonomyAuthor}. Families, genera
 * and species from the source database are mapped to local records by their
 * stored GRIN id, falling back to a case-insensitive comparison of names;
 * authors are matched by short name. Source rows without a local match are
 * added as new records. No records are removed from the local database.
 * </p>
 *
 * <p>
 * Note: The update may update capitalization of names.
 * </p>
 *
 * @throws Exception when a source file cannot be read, or (as
 *                   {@link InvalidApiUsageException}) when a source row matches
 *                   more than one local record
 */
private void updateLocalDatabase() throws Exception {
  log.info("Loading taxonomy_family.txt");
  // Source ("their") identifiers mapped to the matching local ("our") entities
  Map<Long, TaxonomyFamily> famTheirsToOurs = new HashMap<>();
  Map<Long, TaxonomyGenus> genTheirsToOurs = new HashMap<>();
  Map<Long, TaxonomySpecies> speTheirsToOurs = new HashMap<>();
  // Map<Long, TaxonomyAuthor> authTheirsToOurs = new HashMap<>();
  // Source family id -> source id of its type genus; applied once genera are loaded
  Map<Long, Long> currentTypeGenus = new HashMap<>();

  {
    log.warn("Loading {}/taxonomy_family.txt", downloadFolder);
    // Source family id -> source id of the family it currently resolves to
    Map<Long, Long> currentFamily = new HashMap<>();

    List<TaxonomyFamily> allFamilies = taxonomyFamilyRepository.findAll();
    final Map<Long, TaxonomyFamily> allFamiliesByGrinId = new HashMap<>();
    allFamilies.forEach(family -> {
      if (family.getGrinId() != null) {
        allFamiliesByGrinId.put(family.getGrinId(), family);
      }
    });
    List<TaxonomyFamily> toSave = new ArrayList<>();

    // read taxonomy_family.txt
    try (CSVReader reader = CabReader.openCsvReader(new FileInputStream(new File(downloadFolder, "taxonomy_family.txt")), 0)) {
      var beanReader = CabReader.beanReader(FamilyRow.class, reader);
      beanReader.forEach(familyRow -> {
        // Candidate holding the identifying fields used for name matching
        TaxonomyFamily family = new TaxonomyFamily();
        family.setGrinId(familyRow.getTaxonomyFamilyId());
        // family.setId(familyRow.getTaxonomyFamilyId());
        // family.setTypeTaxonomyGenus(familyRow.getTypeTaxonomyGenusId());
        family.setFamilyName(familyRow.getFamilyName());
        family.setFamilyAuthority(familyRow.getFamilyAuthority());
        family.setSubfamilyName(familyRow.getSubfamilyName());
        family.setTribeName(familyRow.getTribeName());
        family.setSubtribeName(familyRow.getSubtribeName());

        var other = allFamiliesByGrinId.get(familyRow.getTaxonomyFamilyId());
        if (other != null) {
          // Matched by GRIN id
          family = other;
        } else {
          if (!allFamilies.isEmpty()) {
            final TaxonomyFamily compareTo = family;
            final List<TaxonomyFamily> narrow = allFamilies.stream()
              // filter
              .filter(m -> (
                StringUtils.equalsIgnoreCase(m.getFamilyName(), compareTo.getFamilyName())
                && StringUtils.equalsIgnoreCase(m.getFamilyAuthority(), compareTo.getFamilyAuthority())
                && StringUtils.equalsIgnoreCase(m.getSubfamilyName(), compareTo.getSubfamilyName())
                && StringUtils.equalsIgnoreCase(m.getTribeName(), compareTo.getTribeName())
                && StringUtils.equalsIgnoreCase(m.getSubtribeName(), compareTo.getSubtribeName())
              ))
              // print
              .peek(m -> {
                log.debug("{} {} {} {} {}", m.getFamilyName(), m.getFamilyAuthority(), m.getSubfamilyName(), m.getTribeName(), m.getSubtribeName());
              })
              // collect
              .collect(Collectors.toList());

            if (narrow.size() == 1) {
              family = narrow.get(0);
            } else if (narrow.size() == 0) {
              log.debug("{} matches found! Will create new entry.", narrow.size());
            } else {
              throw new InvalidApiUsageException("This shouldn't happen, your taxonomy_family needs cleaning: " + family.getFamilyName());
            }
          }
        }

        // Apply source values to the matched (or new) record
        family.setGrinId(familyRow.getTaxonomyFamilyId());
        family.setFamilyName(familyRow.getFamilyName());
        family.setFamilyAuthority(familyRow.getFamilyAuthority());
        family.setSubfamilyName(familyRow.getSubfamilyName());
        family.setTribeName(familyRow.getTribeName());
        family.setSubtribeName(familyRow.getSubtribeName());
        family.setSuprafamilyRankCode(familyRow.getSuprafamilyRankCode());
        family.setSuprafamilyRankName(familyRow.getSuprafamilyRankName());
        family.setAlternateName(familyRow.getAlternateName());
        family.setFamilyTypeCode(familyRow.getFamilyTypeCode());
        family.setNote(familyRow.getNote());

        toSave.add(family);
        famTheirsToOurs.put(familyRow.getTaxonomyFamilyId(), family);
        currentFamily.put(familyRow.getTaxonomyFamilyId(), familyRow.getCurrentTaxonomyFamilyId());
        currentTypeGenus.put(familyRow.getTaxonomyFamilyId(), familyRow.getTypeTaxonomyGenusId());
      });
    }

    // Save updates
    Lists.partition(toSave, 100).forEach(batch -> {
      log.warn("Saving {} taxonomyFamily", batch.size());
      taxonomyFamilyRepository.saveAll(batch);
      entityManager.flush();
    });
    toSave.clear();

    // Update references
    currentFamily.forEach((theirId, theirCurrentId) -> {
      var family = famTheirsToOurs.get(theirId);
      var current = famTheirsToOurs.get(theirCurrentId);
      if (current == null) {
        // Nothing to point at: skip instead of dereferencing a missing record
        log.warn("No current family with their id={} for family with their id={}", theirCurrentId, theirId);
        return;
      }
      if (family.getCurrentTaxonomyFamily() == null || !family.getCurrentTaxonomyFamily().getId().equals(current.getId())) {
        var reloaded = taxonomyFamilyRepository.findById(family.getId()).orElseThrow();
        reloaded.setCurrentTaxonomyFamily(taxonomyFamilyRepository.findById(current.getId()).orElseThrow());
        toSave.add(reloaded);
      }
    });

    // Save updates
    Lists.partition(toSave, 100).forEach(batch -> {
      log.warn("Saving {} taxonomyFamily", batch.size());
      taxonomyFamilyRepository.saveAll(batch);
      entityManager.flush();
    });
    allFamilies.clear();
    toSave.clear();
    allFamiliesByGrinId.clear();
  }

  {
    // read taxonomy_genus.txt
    log.warn("Loading {}/taxonomy_genus.txt", downloadFolder);

    // Group list of genera by the start of the genus name for faster lookups
    final LookupList<String, TaxonomyGenus> allGeneraIndex = new LookupList<>();
    final Map<Long, TaxonomyGenus> allGeneraByGrinId = new HashMap<>();
    taxonomyGenusRepository.findAll().forEach(genus -> {
      allGeneraIndex.add(indexLookupKey(genus), genus);
      if (genus.getGrinId() != null) {
        allGeneraByGrinId.put(genus.getGrinId(), genus);
      }
    });

    List<TaxonomyGenus> toSave = new ArrayList<>();
    // Source genus id -> source id of the genus it currently resolves to
    Map<Long, Long> currentGenus = new HashMap<>();

    try (CSVReader reader = CabReader.openCsvReader(new FileInputStream(new File(downloadFolder, "taxonomy_genus.txt")), 0)) {
      var beanReader = CabReader.beanReader(GenusRow.class, reader);
      beanReader.forEach(genusRow -> {
        // A genus cannot be matched or stored without its family
        final TaxonomyFamily ourFamily = famTheirsToOurs.get(genusRow.getTaxonomyFamilyId());
        if (ourFamily == null) {
          log.warn("No family with their id={}", genusRow.getTaxonomyFamilyId());
          return;
        }

        // Candidate holding the identifying fields used for name matching
        TaxonomyGenus genus = new TaxonomyGenus();
        genus.setGrinId(genusRow.getTaxonomyGenusId());
        genus.setQualifyingCode(genusRow.getQualifyingCode());
        genus.setHybridCode(genusRow.getHybridCode());
        genus.setGenusName(genusRow.getGenusName());
        genus.setGenusAuthority(genusRow.getGenusAuthority());
        genus.setSubgenusName(genusRow.getSubgenusName());
        genus.setSectionName(genusRow.getSectionName());
        genus.setSubsectionName(genusRow.getSubsectionName());
        genus.setSeriesName(genusRow.getSeriesName());
        genus.setSubseriesName(genusRow.getSubseriesName());
        genus.setTaxonomyFamily(ourFamily);

        if (StringUtils.equalsIgnoreCase(genus.getGenusName(), DEBUG_GENUS_NAME)) {
          print(">> Matching", genus);
        }

        var other = allGeneraByGrinId.get(genusRow.getTaxonomyGenusId());
        if (other != null) {
          // Matched by GRIN id
          genus = other;
        } else {
          List<TaxonomyGenus> generaWithName = allGeneraIndex.get(indexLookupKey(genus));
          if (generaWithName != null) {
            final TaxonomyGenus compareTo = genus;
            // Constant-first equals: genus names may be null
            if (DEBUG_GENUS_NAME.equals(compareTo.getGenusName())) {
              print(">> Looking for: ", compareTo);
            }
            List<TaxonomyGenus> narrow = generaWithName.stream()
              // print
              .peek(m -> {
                if (DEBUG_GENUS_NAME.equals(compareTo.getGenusName())) {
                  print("Candidate: ", m);
                }
              })
              // filter
              .filter(m -> (
                Objects.equals(m.getTaxonomyFamily().getId(), compareTo.getTaxonomyFamily().getId())
                && StringUtils.equalsIgnoreCase(m.getGenusName(), compareTo.getGenusName())
                && StringUtils.equalsIgnoreCase(m.getGenusAuthority(), compareTo.getGenusAuthority())
                && StringUtils.equalsIgnoreCase(m.getSubgenusName(), compareTo.getSubgenusName())
                && StringUtils.equalsIgnoreCase(m.getSectionName(), compareTo.getSectionName())
                && StringUtils.equalsIgnoreCase(m.getSubsectionName(), compareTo.getSubsectionName())
                && StringUtils.equalsIgnoreCase(m.getSeriesName(), compareTo.getSeriesName())
                && StringUtils.equalsIgnoreCase(m.getSubseriesName(), compareTo.getSubseriesName())
              ))
              // print
              .peek(m -> {
                if (DEBUG_GENUS_NAME.equals(m.getGenusName())) {
                  print("Match", m);
                }
                log.debug("{} {} {} {} {} {} {}", m.getGenusName(), m.getGenusAuthority(), m.getSubgenusName(), m.getSectionName(), m.getSubsectionName(), m.getSeriesName(), m.getSubseriesName());
              })
              // collect
              .collect(Collectors.toList());

            if (narrow.size() == 1) {
              genus = narrow.get(0);
            } else if (narrow.size() == 0) {
              log.info("{} matches found for {} {} {} {} {} {} {}! Will create new entry.", narrow.size(), genus.getGenusName(), genus.getGenusAuthority(), genus
                .getSubgenusName(), genus.getSectionName(), genus.getSubsectionName(), genus.getSeriesName(), genus.getSubseriesName());
            } else {
              print("Too many matches for:", compareTo);
              narrow.forEach(m -> print(">> ", m));
              // Disambiguate by hybrid and qualifying codes
              var narrower = narrow.stream().filter(m -> (
                StringUtils.equalsIgnoreCase(m.getHybridCode(), compareTo.getHybridCode())
                && StringUtils.equalsIgnoreCase(m.getQualifyingCode(), compareTo.getQualifyingCode())
              )).collect(Collectors.toList());
              if (narrower.size() == 1) {
                genus = narrower.get(0);
              } else {
                throw new InvalidApiUsageException("This shouldn't happen, your taxonomy_genus needs cleaning: " + genus.getGenusName() + " " + genus.getGenusAuthority());
              }
            }
          } else {
            log.info("No existing genera for index={}", indexLookupKey(genus));
            // print("New taxonomy_genus", genus);
          }
        }

        if (StringUtils.equalsIgnoreCase(genus.getGenusName(), DEBUG_GENUS_NAME)) {
          print(">> Updating", genus);
        }

        // Apply source values to the matched (or new) record
        // genus.setGenusId(genusRow.getGenusId());
        // genus.setCurrentGenusId(genusRow.getCurrentGenusId());
        // NOTE(review): getGenusId() is used here and as the genTheirsToOurs key, while matching above
        // uses getTaxonomyGenusId() - presumably both return the same source id; confirm in GenusRow.
        genus.setGrinId(genusRow.getGenusId());
        genus.setTaxonomyFamily(ourFamily);
        genus.setQualifyingCode(genusRow.getQualifyingCode());
        genus.setHybridCode(genusRow.getHybridCode());
        genus.setGenusName(genusRow.getGenusName());
        genus.setGenusAuthority(genusRow.getGenusAuthority());
        genus.setSubgenusName(genusRow.getSubgenusName());
        genus.setSectionName(genusRow.getSectionName());
        genus.setSubsectionName(genusRow.getSubsectionName());
        genus.setSeriesName(genusRow.getSeriesName());
        genus.setSubseriesName(genusRow.getSubseriesName());
        genus.setNote(genusRow.getNote());
        // genus.setCreatedDate(genusRow.getCreatedDate());
        // genus.setModifiedDate(genusRow.getModifiedDate()); // Do not update @Versioned modifiedDate

        if (StringUtils.equalsIgnoreCase(genus.getGenusName(), DEBUG_GENUS_NAME)) {
          print(">> Updated", genus);
        }

        toSave.add(genus);
        genTheirsToOurs.put(genusRow.getGenusId(), genus);
        currentGenus.put(genusRow.getTaxonomyGenusId(), genusRow.getCurrentTaxonomyGenusId());
      });
    }

    Lists.partition(toSave, 1000).forEach(batch -> {
      log.warn("Saving {} taxonomyGenus", batch.size());
      taxonomyGenusRepository.saveAll(batch);
      entityManager.flush();
    });
    toSave.clear();

    // Update references
    currentGenus.forEach((theirId, theirCurrentId) -> {
      var genus = genTheirsToOurs.get(theirId);
      var current = genTheirsToOurs.get(theirCurrentId);
      if (genus == null || current == null) {
        // Nothing to link: skip instead of dereferencing a missing record
        log.warn("Cannot link genus with their id={} to current genus with their id={}", theirId, theirCurrentId);
        return;
      }
      if (genus.getCurrentTaxonomyGenus() == null || !genus.getCurrentTaxonomyGenus().getId().equals(current.getId())) {
        var reloaded = taxonomyGenusRepository.findById(genus.getId()).orElseThrow();
        reloaded.setCurrentTaxonomyGenus(taxonomyGenusRepository.findById(current.getId()).orElseThrow());
        toSave.add(reloaded);
      }
    });

    // Save updates
    log.info("Updating {} genus references", toSave.size());
    Lists.partition(toSave, 1000).forEach(batch -> {
      log.warn("Saving {} taxonomyGenus", batch.size());
      taxonomyGenusRepository.saveAll(batch);
      entityManager.flush();
    });
    toSave.clear();
    allGeneraIndex.clear();
    allGeneraByGrinId.clear();

    {
      // Families can be linked to their type genus now that genera are stored
      List<TaxonomyFamily> toSaveFam = new ArrayList<>();
      currentTypeGenus.forEach((theirId, theirGenusId) -> {
        TaxonomyFamily family = famTheirsToOurs.get(theirId);
        if (theirGenusId == null) {
          // Source declares no type genus: clear ours
          if (family.getTypeTaxonomyGenus() != null) {
            family = taxonomyFamilyRepository.findById(family.getId()).orElseThrow();
            family.setTypeTaxonomyGenus(null);
            toSaveFam.add(family);
          }
        } else {
          var typeGenus = genTheirsToOurs.get(theirGenusId);
          // Update only when the type genus is known locally and differs from the one on record
          if (typeGenus != null && (family.getTypeTaxonomyGenus() == null || !family.getTypeTaxonomyGenus().getId().equals(typeGenus.getId()))) {
            family = taxonomyFamilyRepository.findById(family.getId()).orElseThrow();
            family.setTypeTaxonomyGenus(taxonomyGenusRepository.findById(typeGenus.getId()).orElseThrow());
            toSaveFam.add(family);
          }
        }
        if (family.getTypeTaxonomyGenus() == null && theirGenusId != null) {
          log.warn("Type genus is null: their genus_id={} our taxonomy_family_id={}", theirGenusId, family.getId());
        }
      });

      Lists.partition(toSaveFam, 100).forEach(batch -> {
        log.warn("Saving {} taxonomyFamily", batch.size());
        taxonomyFamilyRepository.saveAll(batch);
        entityManager.flush();
      });
      currentTypeGenus.clear();
    }
  }

  {
    // read taxonomy_species.txt
    log.warn("Loading {}/taxonomy_species.txt", downloadFolder);

    // Group list of species by epithet for faster lookups
    final LookupList<String, TaxonomySpecies> allSpeciesByEpithet = new LookupList<>();
    final Map<Long, TaxonomySpecies> allSpeciesByGrinId = new HashMap<>();
    taxonomySpeciesRepository.findAll().forEach(species -> {
      allSpeciesByEpithet.add(StringUtils.toRootLowerCase(species.getSpeciesName()), species);
      if (species.getGrinId() != null) {
        allSpeciesByGrinId.put(species.getGrinId(), species);
      }
    });

    List<TaxonomySpecies> toSave = new ArrayList<>();
    // Source species id -> source id of the species it currently resolves to
    Map<Long, Long> currentSpecies = new HashMap<>();

    try (CSVReader reader = CabReader.openCsvReader(new FileInputStream(new File(downloadFolder, "taxonomy_species.txt")), 0)) {
      final AtomicInteger counter = new AtomicInteger(0);
      var beanReader = CabReader.beanReader(SpeciesRow.class, reader);
      beanReader.forEach(speciesRow -> {
        if (counter.incrementAndGet() % 1000 == 0) {
          log.warn("Read {} species rows", counter.get());
        }

        // Check the genus before matching: the name filter below dereferences it, and an
        // existing record must not be modified for a row that will be skipped.
        final TaxonomyGenus ourGenus = genTheirsToOurs.get(speciesRow.getGenusId());
        if (ourGenus == null) {
          log.warn("Missing genus for species id={} genus_id={}", speciesRow.getSpeciesId(), speciesRow.getGenusId());
          return;
        }

        // Candidate holding the identifying fields used for name matching
        TaxonomySpecies species = new TaxonomySpecies();
        species.setGrinId(speciesRow.getTaxonomySpeciesId());
        species.setTaxonomyGenus(ourGenus);
        species.setNomenNumber(speciesRow.getNomenNumber() == null ? null : speciesRow.getNomenNumber().intValue());
        species.setSpeciesName(speciesRow.getSpeciesName());
        species.setName(speciesRow.getName());
        species.setNameAuthority(speciesRow.getNameAuthority());
        species.setProtologue(speciesRow.getProtologue());
        // The name filter compares synonym codes, so the candidate must carry the source value
        species.setSynonymCode(speciesRow.getSynonymCode());

        var other = allSpeciesByGrinId.get(speciesRow.getTaxonomySpeciesId());
        if (other != null) {
          // Matched by GRIN id
          species = other;
        } else {
          log.debug("No species with usda_id={}! Searching for {} {}", speciesRow.getTaxonomySpeciesId(), speciesRow.getName(), speciesRow.getNameAuthority());
          List<TaxonomySpecies> speciesForEpithet = allSpeciesByEpithet.get(StringUtils.toRootLowerCase(species.getSpeciesName()));
          final TaxonomySpecies compareTo = species;
          if (speciesForEpithet != null) {
            if (StringUtils.equalsIgnoreCase(species.getName(), DEBUG_SPECIES_NAME)) {
              print(">> Looking for", species);
            }
            List<TaxonomySpecies> narrow = speciesForEpithet.stream()
              // debug
              .peek(m -> {
                if (StringUtils.equalsIgnoreCase(compareTo.getName(), DEBUG_SPECIES_NAME)) {
                  print("Inspecting:", m);
                }
              })
              // filter
              .filter(m -> (
                Objects.equals(m.getTaxonomyGenus().getId(), compareTo.getTaxonomyGenus().getId())
                && StringUtils.equalsIgnoreCase(StringUtils.trimToNull(m.getName()), StringUtils.trimToNull(compareTo.getName()))
                && StringUtils.equalsIgnoreCase(StringUtils.trimToNull(m.getNameAuthority()), StringUtils.trimToNull(compareTo.getNameAuthority()))
                && StringUtils.equalsIgnoreCase(StringUtils.trimToNull(m.getSynonymCode()), StringUtils.trimToNull(compareTo.getSynonymCode()))
                && StringUtils.equalsIgnoreCase(StringUtils.trimToNull(m.getProtologue()), StringUtils.trimToNull(compareTo.getProtologue()))
              ))
              // print
              .peek(m -> {
                if (StringUtils.equalsIgnoreCase(compareTo.getName(), DEBUG_SPECIES_NAME)) {
                  print("Potential match:", m);
                }
                log.debug("{} {}", m.getName(), m.getNameAuthority());
              })
              // gather
              .collect(Collectors.toList());

            if (narrow.size() == 1) {
              species = narrow.get(0);
            } else if (narrow.size() == 0) {
              if (StringUtils.equalsIgnoreCase(species.getName(), DEBUG_SPECIES_NAME)) {
                print("No matches found, will add", species);
              }
              log.debug("{} matches found for {} {}! Will create new entry.", narrow.size(), species.getName(), species.getNameAuthority());
            } else {
              throw new InvalidApiUsageException("This shouldn't happen, your taxonomy_species needs cleaning: " + species.getName() + " " + species.getNameAuthority());
            }
          } else {
            log.debug("No species for epithet={}", species.getSpeciesName());
            if (StringUtils.equalsIgnoreCase(species.getName(), DEBUG_SPECIES_NAME)) {
              print("Will add", species);
            }
          }
        }

        if (StringUtils.equalsIgnoreCase(species.getName(), DEBUG_SPECIES_NAME)) {
          print(">> Updating", species);
        }

        // Apply source values to the matched (or new) record
        // species.setSpeciesId(speciesRow.getSpeciesId());
        // species.setCurrentSpeciesId(speciesRow.getCurrentSpeciesId());
        species.setGrinId(speciesRow.getTaxonomySpeciesId());
        species.setTaxonomyGenus(ourGenus);
        species.setNomenNumber(speciesRow.getNomenNumber() == null ? null : speciesRow.getNomenNumber().intValue());
        species.setIsSpecificHybrid(speciesRow.getIsSpecificHybrid());
        species.setSpeciesName(speciesRow.getSpeciesName());
        species.setSpeciesAuthority(speciesRow.getSpeciesAuthority());
        species.setIsSubspecificHybrid(speciesRow.getIsSubspecificHybrid());
        species.setSubspeciesName(speciesRow.getSubspeciesName());
        species.setSubspeciesAuthority(speciesRow.getSubspeciesAuthority());
        species.setIsVarietalHybrid(speciesRow.getIsVarietalHybrid());
        species.setVarietyName(speciesRow.getVarietyName());
        species.setVarietyAuthority(speciesRow.getVarietyAuthority());
        species.setIsSubvarietalHybrid(speciesRow.getIsSubvarietalHybrid());
        species.setSubvarietyName(speciesRow.getSubvarietyName());
        species.setSubvarietyAuthority(speciesRow.getSubvarietyAuthority());
        species.setIsFormaHybrid(speciesRow.getIsFormaHybrid());
        species.setFormaRankType(speciesRow.getFormaRankType());
        species.setFormaName(speciesRow.getFormaName());
        species.setFormaAuthority(speciesRow.getFormaAuthority());
        // species.setPrioritySite1(speciesRow.getPrioritySite1());
        // species.setPrioritySite2(speciesRow.getPrioritySite2());
        // species.setCurator1Id(speciesRow.getCurator1Id());
        // species.setCurator2Id(speciesRow.getCurator2Id());
        species.setRestrictionCode(speciesRow.getRestrictionCode());
        species.setLifeFormCode(speciesRow.getLifeFormCode());
        species.setCommonFertilizationCode(speciesRow.getCommonFertilizationCode());
        species.setIsNamePending(speciesRow.getIsNamePending());
        species.setSynonymCode(speciesRow.getSynonymCode());
        // species.setVerifierCooperator(speciesRow.getVerifierId());
        if (speciesRow.getNameVerifiedDate() != null) {
          species.setNameVerifiedDate(speciesRow.getNameVerifiedDate().toInstant(ZoneOffset.UTC));
        }
        species.setName(speciesRow.getName());
        species.setNameAuthority(speciesRow.getNameAuthority());
        species.setProtologue(speciesRow.getProtologue());
        species.setProtologueVirtualPath(speciesRow.getProtologueVirtualPath());
        species.setNote(speciesRow.getNote());
        species.setSiteNote(speciesRow.getSiteNote());
        species.setAlternateName(speciesRow.getAlternateName());
        // species.setCreatedDate(speciesRow.getCreatedDate());
        // species.setModifiedDate(speciesRow.getModifiedDate()); // Do not update @Versioned modifiedDate

        if (StringUtils.equalsIgnoreCase(species.getName(), DEBUG_SPECIES_NAME)) {
          print(">> Updated", species);
        }

        toSave.add(species);
        speTheirsToOurs.put(speciesRow.getSpeciesId(), species);
        currentSpecies.put(speciesRow.getSpeciesId(), speciesRow.getCurrentTaxonomySpeciesId());
      });
    }

    Lists.partition(toSave, 1000).forEach(batch -> {
      log.warn("Saving {} taxonomySpecies", batch.size());
      taxonomySpeciesRepository.saveAll(batch);
      entityManager.flush();
    });
    toSave.clear();

    // Update references
    currentSpecies.forEach((theirId, theirCurrentId) -> {
      var species = speTheirsToOurs.get(theirId);
      var current = speTheirsToOurs.get(theirCurrentId);
      // When the current species is unknown locally, the reference is set to null (existing behavior)
      if (current == null || species.getCurrentTaxonomySpecies() == null || !species.getCurrentTaxonomySpecies().getId().equals(current.getId())) {
        species.setCurrentTaxonomySpecies(current);
        toSave.add(species);
      }
    });

    // Save updates
    log.info("Updating {} species references", toSave.size());
    Lists.partition(toSave, 1000).forEach(batch -> {
      log.warn("Saving {} taxonomySpecies", batch.size());
      taxonomySpeciesRepository.saveAll(batch);
      entityManager.flush();
    });
    toSave.clear();
  }

  {
    log.warn("Loading {}/taxonomy_author.txt", downloadFolder);

    List<TaxonomyAuthor> allAuthors = taxonomyAuthorRepository.findAll();
    List<TaxonomyAuthor> toSave = new ArrayList<>();
    // Group authors by the first two characters of the short name for faster lookups.
    // StringUtils.substring tolerates null and short names with fewer than two characters.
    final LookupList<String, TaxonomyAuthor> authorsLookup = new LookupList<>();
    allAuthors.forEach(author -> {
      authorsLookup.add(StringUtils.substring(author.getShortName(), 0, 2), author);
    });

    try (CSVReader reader = CabReader.openCsvReader(new FileInputStream(new File(downloadFolder, "taxonomy_author.txt")), 0)) {
      var beanReader = CabReader.beanReader(AuthorRow.class, reader);
      beanReader.forEach(authorRow -> {
        TaxonomyAuthor author = new TaxonomyAuthor();
        author.setShortName(authorRow.getShortName());
        if (author.getShortName() == null) {
          log.warn("Missing shortName id={}", authorRow.getTaxonomyAuthorId());
          return;
        }

        List<TaxonomyAuthor> authorsByFirst = authorsLookup.get(StringUtils.substring(author.getShortName(), 0, 2));
        if (authorsByFirst != null) {
          final TaxonomyAuthor compareTo = author;
          List<TaxonomyAuthor> narrow = authorsByFirst.stream()
            // filter
            .filter(m -> (
              StringUtils.equalsIgnoreCase(StringUtils.trimToNull(m.getShortName()), StringUtils.trim(compareTo.getShortName()))
            ))
            // print
            .peek(m -> {
              log.debug("{}", m.getShortName());
            })
            // gather
            .collect(Collectors.toList());

          if (narrow.size() == 1) {
            author = narrow.get(0);
          } else if (narrow.size() == 0) {
            log.debug("{} matches found for {}! Will create new entry.", narrow.size(), author.getShortName());
          } else {
            narrow.forEach(match -> {
              log.warn("Found id={} short={} for input {}", match.getId(), match.getShortName(), compareTo.getShortName());
            });
            throw new InvalidApiUsageException("This shouldn't happen, your taxonomy_author needs cleaning: " + author.getShortName());
          }
        }

        // Apply source values to the matched (or new) record
        author.setFullName(authorRow.getFullName());
        author.setFullNameExpandedDiacritic(authorRow.getFullNameExpandedDiacritic());
        author.setShortName(authorRow.getShortName());
        author.setShortNameExpandedDiacritic(authorRow.getShortNameExpandedDiacritic());
        author.setNote(authorRow.getNote());

        toSave.add(author);
        // authTheirsToOurs.put(authorRow.getTaxonomyAuthorId(), author);
      });
    }

    Lists.partition(toSave, 1000).forEach(batch -> {
      log.warn("Saving {} taxonomyAuthors", batch.size());
      taxonomyAuthorRepository.saveAll(batch);
      entityManager.flush();
    });
    toSave.clear();
  }

  log.warn("Done.");
}
/**
 * Logs a one-line summary of a species at INFO level: synonym code, name,
 * authority, protologue, local/GRIN ids and the local/GRIN ids of its genus.
 *
 * @param message prefix for the log line
 * @param species the species to describe
 */
private void print(String message, TaxonomySpecies species) {
  final TaxonomyGenus parentGenus = species.getTaxonomyGenus();

  // A missing genus is rendered as the text "null" for both ids
  final Object genusId;
  final Object genusGrinId;
  if (parentGenus != null) {
    genusId = parentGenus.getId();
    genusGrinId = parentGenus.getGrinId();
  } else {
    genusId = "null";
    genusGrinId = "null";
  }

  final String synonymCode = StringUtils.defaultIfBlank(species.getSynonymCode(), "");
  log.info("{} {} {} {} proto={} id={}/{} tgid={}/{}",
    message, synonymCode, species.getName(), species.getNameAuthority(), species.getProtologue(),
    species.getId(), species.getGrinId(), genusId, genusGrinId);
}
/**
 * Builds the key under which a genus is grouped in the genus lookup index: the
 * first three characters of the genus name, lower-cased with the root locale.
 *
 * <p>
 * Genus matching compares names case-insensitively, so the key must not depend
 * on capitalization; otherwise a local genus differing only in case would never
 * be offered as a candidate.
 * </p>
 *
 * @param genus the genus to index
 * @return the lookup key, or {@code null} when the genus has no name
 */
private String indexLookupKey(TaxonomyGenus genus) {
  return StringUtils.toRootLowerCase(StringUtils.substring(genus.getGenusName(), 0, 3));
}
/**
 * Logs a one-line summary of a genus at INFO level: qualifying and hybrid codes,
 * the name parts, the local id of its family and its local/GRIN ids.
 *
 * @param message prefix for the log line
 * @param m the genus to describe
 */
private void print(String message, TaxonomyGenus m) {
  final String hybridCode = StringUtils.defaultIfBlank(m.getHybridCode(), "");
  // Family id is logged as null when the genus has no family
  final Object familyId = (m.getTaxonomyFamily() == null ? null : m.getTaxonomyFamily().getId());

  log.info("{} {} {}{} {} {} {} {} {} {} tf={} gid={}/{}",
    message, m.getQualifyingCode(), hybridCode, m.getGenusName(), m.getGenusAuthority(),
    m.getSubgenusName(), m.getSectionName(), m.getSubsectionName(), m.getSeriesName(), m.getSubseriesName(),
    familyId, m.getId(), m.getGrinId());
}
/**
 * Ensures the GRIN Taxonomy source files are available in {@code folder}.
 *
 * <p>
 * The folder is created when missing. If any of the files read by
 * {@code updateLocalDatabase()} is absent, the current GRIN Taxonomy cabinet
 * file is downloaded to a temporary file and unpacked into the folder. The
 * temporary file is removed afterwards, also when download or unpacking fails.
 * </p>
 *
 * @param folder destination folder for the unpacked taxonomy files
 * @throws IOException when the folder cannot be created or the data cannot be downloaded or unpacked
 */
static void downloadDataIfNeeded(File folder) throws IOException {
  if (!folder.exists()) {
    log.warn("Making directory " + folder.getAbsolutePath());
    if (!folder.mkdirs() || !folder.exists()) {
      throw new IOException("Failed to create data folder at " + folder.getAbsolutePath());
    }
  }

  // Every file read by updateLocalDatabase() must be present
  final String[] requiredFiles = { "taxonomy_family.txt", "taxonomy_genus.txt", "taxonomy_species.txt", "taxonomy_author.txt" };
  boolean allPresent = true;
  for (String requiredFile : requiredFiles) {
    if (!new File(folder, requiredFile).exists()) {
      allPresent = false;
      break;
    }
  }
  if (allPresent) {
    return;
  }

  log.warn("Taxonomy data not provided in {}, starting download", folder.getAbsolutePath());
  final TaxonomyDownloader dl = new TaxonomyDownloader();
  log.warn("Downloading GRIN-Taxonomy database to {}", folder.getAbsolutePath());
  final File downloadedCabFile = File.createTempFile("grin-", ".cab");
  try {
    dl.downloadCurrent(downloadedCabFile);
    TaxonomyDownloader.unpackCabinetFile(downloadedCabFile, folder, false);
  } finally {
    // Always remove the temporary download; a failed cleanup must not mask the original error
    if (downloadedCabFile.exists() && downloadedCabFile.canWrite()) {
      log.warn("Deleting downloaded file {}", downloadedCabFile.getAbsolutePath());
      if (!FileUtils.deleteQuietly(downloadedCabFile)) {
        log.warn("Could not delete downloaded file {}", downloadedCabFile.getAbsolutePath());
      }
    }
  }
}
/**
 * A map from a key to the list of all elements registered under that key
 * (a simple group-by structure).
 *
 * @param <K> type of the grouping key
 * @param <V> type of the grouped elements
 */
public static class LookupList<K, V> extends HashMap<K, List<V>> {
  private static final long serialVersionUID = 2452703619583443005L;

  /**
   * Appends an element to the list stored under the key, creating the list
   * on first use.
   *
   * @param key the grouping key
   * @param element the element to register
   * @return the element that was added
   */
  public V add(K key, V element) {
    List<V> bucket = get(key);
    if (bucket == null) {
      bucket = new LinkedList<>();
      put(key, bucket);
    }
    bucket.add(element);
    return element;
  }
}
}