UsdaTaxonomyUpdater.java
- /*
- * Copyright 2020 Global Crop Diversity Trust
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- package org.gringlobal.worker;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.IOException;
- import java.time.ZoneOffset;
- import java.util.ArrayList;
- import java.util.HashMap;
- import java.util.LinkedList;
- import java.util.List;
- import java.util.Map;
- import java.util.Objects;
- import java.util.concurrent.atomic.AtomicInteger;
- import java.util.stream.Collectors;
- import javax.persistence.EntityManager;
- import org.apache.commons.io.FileUtils;
- import org.apache.commons.lang3.StringUtils;
- import org.genesys.taxonomy.download.TaxonomyDownloader;
- import org.genesys.taxonomy.gringlobal.component.CabReader;
- import org.genesys.taxonomy.gringlobal.model.AuthorRow;
- import org.genesys.taxonomy.gringlobal.model.FamilyRow;
- import org.genesys.taxonomy.gringlobal.model.GenusRow;
- import org.genesys.taxonomy.gringlobal.model.SpeciesRow;
- import org.gringlobal.api.exception.InvalidApiUsageException;
- import org.gringlobal.model.TaxonomyAuthor;
- import org.gringlobal.model.TaxonomyFamily;
- import org.gringlobal.model.TaxonomyGenus;
- import org.gringlobal.model.TaxonomySpecies;
- import org.gringlobal.persistence.TaxonomyAuthorRepository;
- import org.gringlobal.persistence.TaxonomyFamilyRepository;
- import org.gringlobal.persistence.TaxonomyGenusRepository;
- import org.gringlobal.persistence.TaxonomySpeciesRepository;
- import org.springframework.beans.factory.annotation.Autowired;
- import org.springframework.security.access.prepost.PreAuthorize;
- import org.springframework.stereotype.Component;
- import org.springframework.transaction.annotation.Transactional;
- import com.google.common.collect.Lists;
- import com.opencsv.CSVReader;
- import lombok.extern.slf4j.Slf4j;
- /**
- * The component downloads the current GRIN Taxonomy database if no local copy
- * exists and updates the Family, Genus, Species and Author tables in the local
- * database.
- *
- * Records are matched on the stored GRIN Taxonomy ID when a local record has
- * one, and on names otherwise. Local identifiers will not match GRIN Taxonomy
- * IDs.
- *
- * @author Matija Obreza
- */
- @Component
- @Slf4j
- public class UsdaTaxonomyUpdater {
- private static final String DEBUG_GENUS_NAME = "Neurachne";
- private static final String DEBUG_SPECIES_NAME = "Neurachne alopecuroides";
- @Autowired
- private TaxonomyFamilyRepository taxonomyFamilyRepository;
- @Autowired
- private TaxonomyGenusRepository taxonomyGenusRepository;
- @Autowired
- private TaxonomySpeciesRepository taxonomySpeciesRepository;
- @Autowired
- private TaxonomyAuthorRepository taxonomyAuthorRepository;
- private File downloadFolder = new File(FileUtils.getTempDirectory(), "grin-taxonomy-source"); // + System.currentTimeMillis());
- @Autowired
- private EntityManager entityManager;
- /**
- * Update local taxonomy tables with data from GRIN Taxonomy.
- *
- * @throws Exception
- */
- @PreAuthorize("hasAuthority('GROUP_ADMINS')")
- @Transactional
- public void update() throws Exception {
- log.info("Updating GRIN taxonomy database from folder {}", downloadFolder.getAbsolutePath());
- downloadDataIfNeeded(downloadFolder);
- updateLocalDatabase();
- log.warn("Taxonomy database updated successfully. Transaction will now be committed. This takes time!");
- }
- /**
- * The update starts with {@link TaxonomyFamily}, {@link TaxonomyGenus} and then
- * {@link TaxonomySpecies}. The entries from source database are mapped to local
- * identifiers. No records are removed from the local database.
- *
- * <p>
- * Note: The update may update capitalization of names.
- * </p>
- *
- * @throws Exception
- */
- private void updateLocalDatabase() throws Exception {
- log.info("Loading taxonomy_family.txt");
- Map<Long, TaxonomyFamily> famTheirsToOurs = new HashMap<>();
- Map<Long, TaxonomyGenus> genTheirsToOurs = new HashMap<>();
- Map<Long, TaxonomySpecies> speTheirsToOurs = new HashMap<>();
- // Map<Long, TaxonomyAuthor> authTheirsToOurs = new HashMap<>();
- Map<Long, Long> currentTypeGenus = new HashMap<>();
- {
- log.warn("Loading {}/taxonomy_family.txt", downloadFolder);
- Map<Long, Long> currentFamily = new HashMap<>();
- List<TaxonomyFamily> allFamilies = taxonomyFamilyRepository.findAll();
- final Map<Long, TaxonomyFamily> allFamiliesByGrinId = new HashMap<>();
- allFamilies.forEach(family -> {
- if (family.getGrinId() != null) {
- allFamiliesByGrinId.put(family.getGrinId(), family);
- }
- });
- List<TaxonomyFamily> toSave = new ArrayList<>();
- // read taxonomy_genus.txt
- try (CSVReader reader = CabReader.openCsvReader(new FileInputStream(new File(downloadFolder, "taxonomy_family.txt")), 0)) {
- var beanReader = CabReader.beanReader(FamilyRow.class, reader);
- beanReader.forEach(familyRow -> {
- TaxonomyFamily family = new TaxonomyFamily();
- family.setGrinId(familyRow.getTaxonomyFamilyId());
- // family.setId(familyRow.getTaxonomyFamilyId());
- // family.setTypeTaxonomyGenus(familyRow.getTypeTaxonomyGenusId());
- family.setFamilyName(familyRow.getFamilyName());
- family.setFamilyAuthority(familyRow.getFamilyAuthority());
- family.setSubfamilyName(familyRow.getSubfamilyName());
- family.setTribeName(familyRow.getTribeName());
- family.setSubtribeName(familyRow.getSubtribeName());
-
- var other = allFamiliesByGrinId.get(familyRow.getTaxonomyFamilyId());
- if (other != null) {
- family = other;
- } else {
- if (allFamilies.size() > 0) {
- final TaxonomyFamily compareTo = family;
- final List<TaxonomyFamily> narrow = allFamilies.stream()
- // filter
- .filter(m -> (
- StringUtils.equalsIgnoreCase(m.getFamilyName(), compareTo.getFamilyName())
- && StringUtils.equalsIgnoreCase(m.getFamilyAuthority(), compareTo.getFamilyAuthority())
- && StringUtils.equalsIgnoreCase(m.getSubfamilyName(), compareTo.getSubfamilyName())
- && StringUtils.equalsIgnoreCase(m.getTribeName(), compareTo.getTribeName())
- && StringUtils.equalsIgnoreCase(m.getSubtribeName(), compareTo.getSubtribeName())
- ))
- // print
- .peek(m -> {
- log.debug("{} {} {} {} {}", m.getFamilyName(), m.getFamilyAuthority(), m.getSubfamilyName(), m.getTribeName(), m.getSubtribeName());
- })
- // collect
- .collect(Collectors.toList());
-
- if (narrow.size() == 1) {
- family = narrow.get(0);
- } else if (narrow.size() == 0) {
- log.debug("{} matches found! Will create new entry.", narrow.size());
- } else {
- throw new InvalidApiUsageException("This shouldn't happen, your taxonomy_family needs cleaning: " + family.getFamilyName());
- }
- }
- }
- family.setGrinId(familyRow.getTaxonomyFamilyId());
- family.setFamilyName(familyRow.getFamilyName());
- family.setFamilyAuthority(familyRow.getFamilyAuthority());
- family.setSubfamilyName(familyRow.getSubfamilyName());
- family.setTribeName(familyRow.getTribeName());
- family.setSubtribeName(familyRow.getSubtribeName());
-
- family.setSuprafamilyRankCode(familyRow.getSuprafamilyRankCode());
- family.setSuprafamilyRankName(familyRow.getSuprafamilyRankName());
- family.setAlternateName(familyRow.getAlternateName());
- family.setFamilyTypeCode(familyRow.getFamilyTypeCode());
- family.setNote(familyRow.getNote());
-
- toSave.add(family);
- famTheirsToOurs.put(familyRow.getTaxonomyFamilyId(), family);
- currentFamily.put(familyRow.getTaxonomyFamilyId(), familyRow.getCurrentTaxonomyFamilyId());
- currentTypeGenus.put(familyRow.getTaxonomyFamilyId(), familyRow.getTypeTaxonomyGenusId());
- });
- }
- // Save updates
- Lists.partition(toSave, 100).forEach(batch -> {
- log.warn("Saving {} taxonomyFamily", batch.size());
- taxonomyFamilyRepository.saveAll(batch);
- entityManager.flush();;
- });
- toSave.clear();
- // Update references
- currentFamily.forEach((theirId, theirCurrentId) -> {
- var family = famTheirsToOurs.get(theirId);
- var current = famTheirsToOurs.get(theirCurrentId);
- if (current == null || family.getCurrentTaxonomyFamily() == null || !family.getCurrentTaxonomyFamily().getId().equals(current.getId())) {
- var reloaded = taxonomyFamilyRepository.findById(family.getId()).orElseThrow();
- reloaded.setCurrentTaxonomyFamily(taxonomyFamilyRepository.findById(current.getId()).orElseThrow());
- toSave.add(reloaded);
- }
- });
- // Save updates
- Lists.partition(toSave, 100).forEach(batch -> {
- log.warn("Saving {} taxonomyFamily", batch.size());
- taxonomyFamilyRepository.saveAll(batch);
- entityManager.flush();
- });
- allFamilies.clear();
- toSave.clear();
- allFamiliesByGrinId.clear();
- }
- {
- // read taxonomy_genus.txt
- log.warn("Loading {}/taxonomy_genus.txt", downloadFolder);
- // Group list of genera by family#id for faster lookups
- final LookupList<String, TaxonomyGenus> allGeneraIndex = new LookupList<>();
- final Map<Long, TaxonomyGenus> allGeneraByGrinId = new HashMap<>();
- taxonomyGenusRepository.findAll().forEach(genus -> {
- allGeneraIndex.add(indexLookupKey(genus), genus);
- if (genus.getGrinId() != null) {
- allGeneraByGrinId.put(genus.getGrinId(), genus);
- }
- });
- List<TaxonomyGenus> toSave = new ArrayList<>();
- Map<Long, Long> currentGenus = new HashMap<>();
- try (CSVReader reader = CabReader.openCsvReader(new FileInputStream(new File(downloadFolder, "taxonomy_genus.txt")), 0)) {
- var beanReader = CabReader.beanReader(GenusRow.class, reader);
- beanReader.forEach(genusRow -> {
- TaxonomyGenus genus = new TaxonomyGenus();
- genus.setGrinId(genusRow.getTaxonomyGenusId());
- genus.setQualifyingCode(genusRow.getQualifyingCode());
- genus.setHybridCode(genusRow.getHybridCode());
- genus.setGenusName(genusRow.getGenusName());
- genus.setGenusAuthority(genusRow.getGenusAuthority());
- genus.setSubgenusName(genusRow.getSubgenusName());
- genus.setSectionName(genusRow.getSectionName());
- genus.setSubsectionName(genusRow.getSubsectionName());
- genus.setSeriesName(genusRow.getSeriesName());
- genus.setSubseriesName(genusRow.getSubseriesName());
- genus.setTaxonomyFamily(famTheirsToOurs.get(genusRow.getTaxonomyFamilyId()));
- if (genus.getTaxonomyFamily() == null) {
- log.warn("No family with their id=" + genusRow.getTaxonomyFamilyId());
- return;
- }
- if (StringUtils.equalsIgnoreCase(genus.getGenusName(), DEBUG_GENUS_NAME)) {
- print(">> Matching", genus);
- }
- var other = allGeneraByGrinId.get(genusRow.getTaxonomyGenusId());
- if (other != null) {
- genus = other;
- } else {
- List<TaxonomyGenus> generaWithName = allGeneraIndex.get(indexLookupKey(genus));
- if (generaWithName != null) {
- final TaxonomyGenus compareTo = genus;
- if (compareTo.getGenusName().equals(DEBUG_GENUS_NAME)) {
- print(">> Looking for: ", compareTo);
- }
- List<TaxonomyGenus> narrow = generaWithName.stream()
- // print
- .peek(m -> {
- if (compareTo.getGenusName().equals(DEBUG_GENUS_NAME)) {
- print("Candidate: ", m);
- }
- })
- // filter
- .filter(m -> (
- Objects.equals(m.getTaxonomyFamily().getId(), compareTo.getTaxonomyFamily().getId())
- && StringUtils.equalsIgnoreCase(m.getGenusName(), compareTo.getGenusName())
- && StringUtils.equalsIgnoreCase(m.getGenusAuthority(), compareTo.getGenusAuthority())
- && StringUtils.equalsIgnoreCase(m.getSubgenusName(), compareTo.getSubgenusName())
- && StringUtils.equalsIgnoreCase(m.getSectionName(), compareTo.getSectionName())
- && StringUtils.equalsIgnoreCase(m.getSubsectionName(), compareTo.getSubsectionName())
- && StringUtils.equalsIgnoreCase(m.getSeriesName(), compareTo.getSeriesName())
- && StringUtils.equalsIgnoreCase(m.getSubseriesName(), compareTo.getSubseriesName())
- ))
- // print
- .peek(m -> {
- if (m.getGenusName().equals(DEBUG_GENUS_NAME)) {
- print("Match", m);
- }
- log.debug("{} {} {} {} {} {} {}", m.getGenusName(), m.getGenusAuthority(), m.getSubgenusName(), m.getSectionName(), m.getSubsectionName(), m.getSeriesName(), m.getSubseriesName());
- })
- // collect
- .collect(Collectors.toList());
- if (narrow.size() == 1) {
- genus = narrow.get(0);
- } else if (narrow.size() == 0) {
- log.info("{} matches found for {} {} {} {} {} {} {}! Will create new entry.", narrow.size(), genus.getGenusName(), genus.getGenusAuthority(), genus
- .getSubgenusName(), genus.getSectionName(), genus.getSubsectionName(), genus.getSeriesName(), genus.getSubseriesName());
- } else {
- print("Too many matches for:", compareTo);
- narrow.forEach(m -> print(">> ", m));
- var narrower = narrow.stream().filter(m -> (
- StringUtils.equalsIgnoreCase(m.getHybridCode(), compareTo.getHybridCode())
- && StringUtils.equalsIgnoreCase(m.getQualifyingCode(), compareTo.getQualifyingCode())
- )).collect(Collectors.toList());
- if (narrower.size() == 1) {
- genus = narrower.get(0);
- } else {
- throw new InvalidApiUsageException("This shouldn't happen, your taxonomy_genus needs cleaning: " + genus.getGenusName() + " " + genus.getGenusAuthority());
- }
- }
- } else {
- log.info("No existing genera for index={}", indexLookupKey(genus));
- // print("New taxonomy_genus", genus);
- }
- }
- if (StringUtils.equalsIgnoreCase(genus.getGenusName(), DEBUG_GENUS_NAME)) {
- print(">> Updating", genus);
- }
- // genus.setGenusId(genusRow.getGenusId());
- // genus.setCurrentGenusId(genusRow.getCurrentGenusId());
- genus.setGrinId(genusRow.getGenusId());
- genus.setTaxonomyFamily(famTheirsToOurs.get(genusRow.getTaxonomyFamilyId()));
- if (genus.getTaxonomyFamily() == null) {
- log.warn("No family with their id=" + genusRow.getTaxonomyFamilyId());
- return;
- }
- genus.setQualifyingCode(genusRow.getQualifyingCode());
- genus.setHybridCode(genusRow.getHybridCode());
- genus.setGenusName(genusRow.getGenusName());
- genus.setGenusAuthority(genusRow.getGenusAuthority());
- genus.setSubgenusName(genusRow.getSubgenusName());
- genus.setSectionName(genusRow.getSectionName());
- genus.setSubsectionName(genusRow.getSubsectionName());
- genus.setSeriesName(genusRow.getSeriesName());
- genus.setSubseriesName(genusRow.getSubseriesName());
- genus.setNote(genusRow.getNote());
- // genus.setCreatedDate(genusRow.getCreatedDate());
- // genus.setModifiedDate(genusRow.getModifiedDate()); // Do not update @Versioned modifiedDate
- if (StringUtils.equalsIgnoreCase(genus.getGenusName(), DEBUG_GENUS_NAME)) {
- print(">> Updated", genus);
- }
- toSave.add(genus);
- genTheirsToOurs.put(genusRow.getGenusId(), genus);
- currentGenus.put(genusRow.getTaxonomyGenusId(), genusRow.getCurrentTaxonomyGenusId());
- });
- }
- Lists.partition(toSave, 1000).forEach(batch -> {
- log.warn("Saving {} taxonomyGenus", batch.size());
- taxonomyGenusRepository.saveAll(batch);
- entityManager.flush();
- });
- toSave.clear();
- // Update references
- currentGenus.forEach((theirId, theirCurrentId) -> {
- var genus = genTheirsToOurs.get(theirId);
- var current = genTheirsToOurs.get(theirCurrentId);
- if (current == null || genus.getCurrentTaxonomyGenus() == null || !genus.getCurrentTaxonomyGenus().getId().equals(current.getId())) {
- var reloaded = taxonomyGenusRepository.findById(genus.getId()).orElseThrow();
- reloaded.setCurrentTaxonomyGenus(taxonomyGenusRepository.findById(current.getId()).orElseThrow());
- toSave.add(reloaded);
- }
- });
- // Save updates
- log.info("Updating {} genus references", toSave.size());
- Lists.partition(toSave, 1000).forEach(batch -> {
- log.warn("Saving {} taxonomyGenus", batch.size());
- taxonomyGenusRepository.saveAll(batch);
- entityManager.flush();
- });
- toSave.clear();
- allGeneraIndex.clear();
- allGeneraByGrinId.clear();
- {
- List<TaxonomyFamily> toSaveFam = new ArrayList<>();
- currentTypeGenus.forEach((theirId, theirGenusId) -> {
- TaxonomyFamily family = famTheirsToOurs.get(theirId);
- if (theirGenusId == null) {
- if (family.getTypeTaxonomyGenus() != null) {
- family = taxonomyFamilyRepository.findById(family.getId()).orElseThrow();
- family.setTypeTaxonomyGenus(null);
- toSaveFam.add(family);
- }
- } else {
- var typeGenus = genTheirsToOurs.get(theirGenusId);
- if (typeGenus == null || family.getTypeTaxonomyGenus() == null || family.getTypeTaxonomyGenus().getId().equals(typeGenus.getId())) {
- family = taxonomyFamilyRepository.findById(family.getId()).orElseThrow();
- family.setTypeTaxonomyGenus(taxonomyGenusRepository.findById(typeGenus.getId()).orElseThrow());
- toSaveFam.add(family);
- }
- }
- if (family.getTypeTaxonomyGenus() == null && theirGenusId != null) {
- log.warn("Type genus is null: their genus_id={} our taxonomy_family_id={}", theirGenusId, family.getId());
- }
- });
- Lists.partition(toSaveFam, 100).forEach(batch -> {
- log.warn("Saving {} taxonomyFamily", batch.size());
- taxonomyFamilyRepository.saveAll(batch);
- entityManager.flush();
- });
- currentTypeGenus.clear();
- }
- }
- {
- // read taxonomy_species.txt
- log.warn("Loading {}/taxonomy_species.txt", downloadFolder);
- // Group list of species by epithet for faster lookups
- final LookupList<String, TaxonomySpecies> allSpeciesByEpithet = new LookupList<>();
- final Map<Long, TaxonomySpecies> allSpeciesByGrinId = new HashMap<>();
- taxonomySpeciesRepository.findAll().forEach(species -> {
- allSpeciesByEpithet.add(StringUtils.toRootLowerCase(species.getSpeciesName()), species);
- if (species.getGrinId() != null) {
- allSpeciesByGrinId.put(species.getGrinId(), species);
- }
- });
- List<TaxonomySpecies> toSave = new ArrayList<>();
- Map<Long, Long> currentSpecies = new HashMap<>();
- try (CSVReader reader = CabReader.openCsvReader(new FileInputStream(new File(downloadFolder, "taxonomy_species.txt")), 0)) {
- final AtomicInteger counter = new AtomicInteger(0);
- var beanReader = CabReader.beanReader(SpeciesRow.class, reader);
- beanReader.forEach(speciesRow -> {
- if (counter.incrementAndGet() % 1000 == 0) {
- log.warn("Read {} species rows", counter.get());
- }
- TaxonomySpecies species = new TaxonomySpecies();
- species.setGrinId(speciesRow.getTaxonomySpeciesId());
- species.setTaxonomyGenus(genTheirsToOurs.get(speciesRow.getGenusId()));
- species.setNomenNumber(speciesRow.getNomenNumber() == null ? null : speciesRow.getNomenNumber().intValue());
- species.setSpeciesName(speciesRow.getSpeciesName());
- species.setName(speciesRow.getName());
- species.setNameAuthority(speciesRow.getNameAuthority());
- species.setProtologue(speciesRow.getProtologue());
- var other = allSpeciesByGrinId.get(speciesRow.getTaxonomySpeciesId());
- if (other != null) {
- species = other;
- } else {
- log.debug("No species with usda_id={}! Searching for {} {}", speciesRow.getTaxonomySpeciesId(), speciesRow.getName(), speciesRow.getNameAuthority());
- List<TaxonomySpecies> speciesForEpithet = allSpeciesByEpithet.get(StringUtils.toRootLowerCase(species.getSpeciesName()));
- final TaxonomySpecies compareTo = species;
- if (speciesForEpithet != null) {
- if (StringUtils.equalsIgnoreCase(species.getName(), DEBUG_SPECIES_NAME)) {
- print(">> Looking for", species);
- }
- List<TaxonomySpecies> narrow = speciesForEpithet.stream()
- // debug
- .peek(m -> {
- if (StringUtils.equalsIgnoreCase(compareTo.getName(), DEBUG_SPECIES_NAME)) {
- print("Inspecting:", m);
- }
- })
- // filter
- .filter(m -> (
- Objects.equals(m.getTaxonomyGenus().getId(), compareTo.getTaxonomyGenus().getId())
- && StringUtils.equalsIgnoreCase(StringUtils.trimToNull(m.getName()), StringUtils.trimToNull(compareTo.getName()))
- && StringUtils.equalsIgnoreCase(StringUtils.trimToNull(m.getNameAuthority()), StringUtils.trimToNull(compareTo.getNameAuthority()))
- && StringUtils.equalsIgnoreCase(StringUtils.trimToNull(m.getSynonymCode()), StringUtils.trimToNull(compareTo.getSynonymCode()))
- && StringUtils.equalsIgnoreCase(StringUtils.trimToNull(m.getProtologue()), StringUtils.trimToNull(compareTo.getProtologue()))
- ))
- // print
- .peek(m -> {
- if (StringUtils.equalsIgnoreCase(compareTo.getName(), DEBUG_SPECIES_NAME)) {
- print("Potential match:", m);
- }
- log.debug("{} {}", m.getName(), m.getNameAuthority());
- })
- // gather
- .collect(Collectors.toList());
-
- if (narrow.size() == 1) {
- species = narrow.get(0);
- } else if (narrow.size() == 0) {
- if (StringUtils.equalsIgnoreCase(species.getName(), DEBUG_SPECIES_NAME)) {
- print("No matches found, will add", species);
- }
- log.debug("{} matches found for {} {}! Will create new entry.", narrow.size(), species.getName(), species.getNameAuthority());
- } else {
- throw new InvalidApiUsageException("This shouldn't happen, your taxonomy_species needs cleaning: " + species.getName() + " " + species.getNameAuthority());
- }
- } else {
- log.debug("No species for epithet={}", species.getSpeciesName());
- if (StringUtils.equalsIgnoreCase(species.getName(), DEBUG_SPECIES_NAME)) {
- print("Will add", species);
- }
- }
- }
- if (StringUtils.equalsIgnoreCase(species.getName(), DEBUG_SPECIES_NAME)) {
- print(">> Updating", species);
- }
- // species.setSpeciesId(speciesRow.getSpeciesId());
- // species.setCurrentSpeciesId(speciesRow.getCurrentSpeciesId());
- species.setGrinId(speciesRow.getTaxonomySpeciesId());
- species.setTaxonomyGenus(genTheirsToOurs.get(speciesRow.getGenusId()));
- if (species.getTaxonomyGenus() == null) {
- log.warn("Missing genus for species id={} genus_id={}", speciesRow.getSpeciesId(), speciesRow.getGenusId());
- return;
- }
- species.setNomenNumber(speciesRow.getNomenNumber() == null ? null : speciesRow.getNomenNumber().intValue());
- species.setIsSpecificHybrid(speciesRow.getIsSpecificHybrid());
- species.setSpeciesName(speciesRow.getSpeciesName());
- species.setSpeciesAuthority(speciesRow.getSpeciesAuthority());
- species.setIsSubspecificHybrid(speciesRow.getIsSubspecificHybrid());
- species.setSubspeciesName(speciesRow.getSubspeciesName());
- species.setSubspeciesAuthority(speciesRow.getSubspeciesAuthority());
- species.setIsVarietalHybrid(speciesRow.getIsVarietalHybrid());
- species.setVarietyName(speciesRow.getVarietyName());
- species.setVarietyAuthority(speciesRow.getVarietyAuthority());
- species.setIsSubvarietalHybrid(speciesRow.getIsSubvarietalHybrid());
- species.setSubvarietyName(speciesRow.getSubvarietyName());
- species.setSubvarietyAuthority(speciesRow.getSubvarietyAuthority());
- species.setIsFormaHybrid(speciesRow.getIsFormaHybrid());
- species.setFormaRankType(speciesRow.getFormaRankType());
- species.setFormaName(speciesRow.getFormaName());
- species.setFormaAuthority(speciesRow.getFormaAuthority());
- // species.setPrioritySite1(speciesRow.getPrioritySite1());
- // species.setPrioritySite2(speciesRow.getPrioritySite2());
- // species.setCurator1Id(speciesRow.getCurator1Id());
- // species.setCurator2Id(speciesRow.getCurator2Id());
- species.setRestrictionCode(speciesRow.getRestrictionCode());
- species.setLifeFormCode(speciesRow.getLifeFormCode());
- species.setCommonFertilizationCode(speciesRow.getCommonFertilizationCode());
- species.setIsNamePending(speciesRow.getIsNamePending());
- species.setSynonymCode(speciesRow.getSynonymCode());
- // species.setVerifierCooperator(speciesRow.getVerifierId());
- if (speciesRow.getNameVerifiedDate() != null) {
- species.setNameVerifiedDate(speciesRow.getNameVerifiedDate().toInstant(ZoneOffset.UTC));
- }
- species.setName(speciesRow.getName());
- species.setNameAuthority(speciesRow.getNameAuthority());
- species.setProtologue(speciesRow.getProtologue());
- species.setProtologueVirtualPath(speciesRow.getProtologueVirtualPath());
- species.setNote(speciesRow.getNote());
- species.setSiteNote(speciesRow.getSiteNote());
- species.setAlternateName(speciesRow.getAlternateName());
- // species.setCreatedDate(speciesRow.getCreatedDate());
- // species.setModifiedDate(speciesRow.getModifiedDate()); // Do not update @Versioned modifiedDate
- if (StringUtils.equalsIgnoreCase(species.getName(), DEBUG_SPECIES_NAME)) {
- print(">> Updated", species);
- }
- toSave.add(species);
- speTheirsToOurs.put(speciesRow.getSpeciesId(), species);
- currentSpecies.put(speciesRow.getSpeciesId(), speciesRow.getCurrentTaxonomySpeciesId());
- });
- }
-
- Lists.partition(toSave, 1000).forEach(batch -> {
- log.warn("Saving {} taxonomySpecies", batch.size());
- taxonomySpeciesRepository.saveAll(batch);
- entityManager.flush();
- });
- toSave.clear();
-
- // Update references
- currentSpecies.forEach((theirId, theirCurrentId) -> {
- var species = speTheirsToOurs.get(theirId);
- var current = speTheirsToOurs.get(theirCurrentId);
- if (current == null || species.getCurrentTaxonomySpecies() == null || !species.getCurrentTaxonomySpecies().getId().equals(current.getId())) {
- species.setCurrentTaxonomySpecies(current);
- toSave.add(species);
- }
- });
- // Save updates
- log.info("Updating {} species references", toSave.size());
- Lists.partition(toSave, 1000).forEach(batch -> {
- log.warn("Saving {} taxonomySpecies", batch.size());
- taxonomySpeciesRepository.saveAll(batch);
- entityManager.flush();
- });
-
- toSave.clear();
- }
- {
- log.warn("Loading {}/taxonomy_author.txt", downloadFolder);
- List<TaxonomyAuthor> allAuthors = taxonomyAuthorRepository.findAll();
- List<TaxonomyAuthor> toSave = new ArrayList<>();
- final LookupList<String, TaxonomyAuthor> authorsLookup = new LookupList<>();
- allAuthors.forEach(author -> {
- authorsLookup.add(author.getShortName().substring(0, 2), author);
- });
- try (CSVReader reader = CabReader.openCsvReader(new FileInputStream(new File(downloadFolder, "taxonomy_author.txt")), 0)) {
- var beanReader = CabReader.beanReader(AuthorRow.class, reader);
- beanReader.forEach(authorRow -> {
- TaxonomyAuthor author = new TaxonomyAuthor();
- author.setShortName(authorRow.getShortName());
- if (author.getShortName() == null) {
- log.warn("Missing shortName id={}", authorRow.getTaxonomyAuthorId());
- return;
- }
- List<TaxonomyAuthor> authorsByFirst = authorsLookup.get(author.getShortName().substring(0, 2));
- if (authorsByFirst != null) {
- final TaxonomyAuthor compareTo = author;
- List<TaxonomyAuthor> narrow = authorsByFirst.stream()
- // filter
- .filter(m -> (
- StringUtils.equalsIgnoreCase(StringUtils.trimToNull(m.getShortName()), StringUtils.trim(compareTo.getShortName()))
- ))
- // print
- .peek(m -> {
- log.debug("{}", m.getShortName());
- })
- // gather
- .collect(Collectors.toList());
- if (narrow.size() == 1) {
- author = narrow.get(0);
- } else if (narrow.size() == 0) {
- log.debug("{} matches found for {}! Will create new entry.", narrow.size(), author.getShortName());
- } else {
- narrow.forEach(match -> {
- log.warn("Found id={} short={} for input {}", match.getId(), match.getShortName(), compareTo.getShortName());
- });
- throw new InvalidApiUsageException("This shouldn't happen, your taxonomy_author needs cleaning: " + author.getShortName());
- }
- }
- author.setFullName(authorRow.getFullName());
- author.setFullNameExpandedDiacritic(authorRow.getFullNameExpandedDiacritic());
- author.setShortName(authorRow.getShortName());
- author.setShortNameExpandedDiacritic(authorRow.getShortNameExpandedDiacritic());
- author.setNote(authorRow.getNote());
- toSave.add(author);
- // authTheirsToOurs.put(authorRow.getTaxonomyAuthorId(), author);
- });
- }
- Lists.partition(toSave, 1000).forEach(batch -> {
- log.warn("Saving {} taxonomyAuthors", batch.size());
- taxonomyAuthorRepository.saveAll(batch);
- entityManager.flush();
- });
- toSave.clear();
- }
- log.warn("Done.");
- }
- private void print(String message, TaxonomySpecies species) {
- TaxonomyGenus tg = species.getTaxonomyGenus();
- log.info("{} {} {} {} proto={} id={}/{} tgid={}/{}",
- message,
- StringUtils.defaultIfBlank(species.getSynonymCode(), ""),
- species.getName(), species.getNameAuthority(),
- species.getProtologue(),
- species.getId(), species.getGrinId(),
- (tg == null ? "null" : tg.getId()), (tg == null ? "null" : tg.getGrinId())
- );
- }
- private String indexLookupKey(TaxonomyGenus genus) {
- return StringUtils.substring(genus.getGenusName(), 0, 3);
- }
- private void print(String message, TaxonomyGenus m) {
- log.info("{} {} {}{} {} {} {} {} {} {} tf={} gid={}/{}",
- message,
- m.getQualifyingCode(),
- StringUtils.defaultIfBlank(m.getHybridCode(), ""), m.getGenusName(),
- m.getGenusAuthority(),
- m.getSubgenusName(),
- m.getSectionName(), m.getSubsectionName(),
- m.getSeriesName(), m.getSubseriesName(),
- (m.getTaxonomyFamily() == null ? null : m.getTaxonomyFamily().getId()), m.getId(), m.getGrinId());
- }
- static void downloadDataIfNeeded(File folder) throws IOException {
- if (!folder.exists()) {
- log.warn("Making directory " + folder.getAbsolutePath());
- if (!folder.mkdirs() || !folder.exists()) {
- throw new IOException("Failed to create data folder at " + folder.getAbsolutePath());
- }
- }
- // The two required files
- final File genusFile = new File(folder, "taxonomy_genus.txt");
- final File speciesFile = new File(folder, "taxonomy_species.txt");
- if (!genusFile.exists() || !speciesFile.exists()) {
- log.warn("Taxonomy data not provided in {}, starting download", folder.getAbsolutePath());
- final TaxonomyDownloader dl = new TaxonomyDownloader();
- log.warn("Downloading GRIN-Taxonomy database to {}", folder.getAbsolutePath());
- final File downloadedCabFile = File.createTempFile("grin-", ".cab");
- dl.downloadCurrent(downloadedCabFile);
- TaxonomyDownloader.unpackCabinetFile(downloadedCabFile, folder, false);
- if (downloadedCabFile.exists() && downloadedCabFile.canWrite()) {
- log.warn("Deleting downloaded file {}", downloadedCabFile.getAbsolutePath());
- FileUtils.forceDelete(downloadedCabFile);
- }
- }
- }
- /**
- * Implementation of a group-by list
- *
- * @param <K> key
- * @param <V> value
- */
- public static class LookupList<K, V> extends HashMap<K, List<V>> {
- private static final long serialVersionUID = 2452703619583443005L;
- public V add(K key, V element) {
- computeIfAbsent(key, k -> new LinkedList<>()).add(element);
- return element;
- }
- }
- }