biodiversity-viewer-data-pipeline-v1/1_data_preparation.R at main · uncg-ukraine/biodiversity-viewer-data-pipeline-v1 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# ----------------------------------
# The script takes a dataset with scientific names and conservation status info,
# and returns two files: list of two with species lookup (matching) results, and
# csv with names require manual editing.
#
# Inputs:
# full spreadsheet with Latin names of taxa and data on their inclusion/status in
# various red lists ("checklist-protected-species-....csv").
#
# Outputs:
# 1) matches.Rdata - list of two with the result of name matching.
#   [[1]] - "goodmatch" - Names whose matching is satisfactory - we will then look
#   for occurrences for these names using taxon keys.
#   [[2]] - "badmatch" - Names whose matching is supposed to be wrong/unsatisfactory.
#   We will then look for occurrences for these names using concatenated strings
#   of possible verbatim scientific names.
# 2) "higherrank.csv" - csv file with Latin names that need to be completed by all
#   possible verbatim scientific names available in GBIF manually.

# Environment preparation ####
rm(list = ls()) # Reset R`s brain

# Load libraries for data manipulations
library(tidyr)
library(dplyr)
library(rgbif)

# Load input data from csv files
df.fullredlist <- read.csv("./data/Species-protected-list-v27_cleaned_20240122.csv")

# Match taxonomic names against the GBIF Backbone Taxonomy ####
# Extract only taxon names and kingdom
name_data <- df.fullredlist %>%
  select(verbatimScientificName, kingdom) %>%
  rename(name = verbatimScientificName)


# Matching names against GBIF Backbone Taxonomy and tidying data
lookup_results <- name_backbone_checklist(name_data) %>%
  rename(verbatimScientificName = verbatim_name) %>%
  rename(ID = verbatim_index) %>%     # Assign internal IDs for each row (name)
  # Following steps are technical:
  relocate(ID) %>%                    # relocate ID variable to the 1st place
  relocate(usageKey, .after = ID) %>% # relocate key variable next to IDs
  relocate(verbatimScientificName, .after = usageKey) %>% # relocate verbatim name variable next to taxon key
  relocate(scientificName, .after = verbatimScientificName) %>% # relocate names
  relocate(verbatim_kingdom, .after = kingdom) %>%
  # select(1:11) %>%                    # Select only essential taxonomic information (w/o phylum, family)
  select(1:13) %>%                    # Select only essential taxonomic information
  mutate_if(is.character, as.factor)  # convert all character variables to factors


# Merge full checklist with lookup results
df.merged <- df.fullredlist %>% # get raw list
  left_join(lookup_results, by = join_by(verbatimScientificName)) %>% # join with normalized (matched) data
  # Following steps are technical:
  relocate(ID) %>% # relocate ID variable to the 1st place
  relocate(usageKey, .after = ID) %>% # relocate key variable next to IDs
  rename(taxonKey = usageKey) %>%
  relocate(scientificName, .after = verbatimScientificName) %>% # relocate names
  select(-kingdom.y,
         -phylum.y,
         -canonicalName,
         -rank,
         -confidence,
         -verbatim_kingdom) %>%    # get rid of the variables don't needed anymore
  rename(kingdom = kingdom.x) %>%  # rename kingdom
  rename(phylum = phylum.x) %>%    # rename phylum
  relocate(family, .after = class) # relocate family after other taxonomy fields


# Splitting full normalized data by matchType ####
# Names whose matching is satisfactory - we will then look for occurrences for
# these names using taxonKey field.
goodmatch <- df.merged %>%
  filter(matchType != "HIGHERRANK")

# Names whose matching is supposed to be wrong/unsatisfactory. We will then look
# for occurrences for these names using concatenated strings of possible verbatim
# scientific names.
badmatch <- df.merged %>%
  filter(matchType == "HIGHERRANK")

# Save outputs and close the session ####
# Save both parts as a named list
matches <- list(goodmatch, badmatch)
names(matches) <- c("goodmatch", "badmatch")
# Write data to the drive
save(matches, file = "./temp/matches.Rdata")

# Write the names with 'bad' matching to csv file (need manual reviewing before
# the next step).
badmatch %>%
  select(ID, verbatimScientificName) %>%
  write.csv("./temp/higherrank.csv")

# "higherrank.csv" contains names (with their internal IDs) that can not be searched
# for occurrences by taxon key efficiently. The only option left is to search by
# the verbatinScientificName field (names, assigned by data authors). Since each
# taxa can be represented in GBIF datasets with various verbatim (scientific) names,
# we need to edit "higherrank.csv" manually, adding all possible spellings for each name.
# The results must be saved as "higherrank_nameVariants.csv".
# Do not remove/rename current "higherrank_nameVariants.csv", unless you prepared
# the next version by yourself! It required a lot of manual work - be respectful.

rm(list = ls()) # Clean-up the Environment

# End of script ####