load packages & fonts & some functions

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyheatmaps)
library(dygraphs)
library(xts)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
library(hrbrthemes)
library(ggthemes)
library(viridis)
## Loading required package: viridisLite
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(wordcloud2)
library(ggtext)
## Warning: package 'ggtext' was built under R version 4.4.3
library(glue)
library(httr)
## 
## Attaching package: 'httr'
## 
## The following object is masked from 'package:plotly':
## 
##     config
library(jsonlite)
## 
## Attaching package: 'jsonlite'
## 
## The following object is masked from 'package:purrr':
## 
##     flatten
windowsFonts(NotoSansTC = "NotoSansTC-Regular", NotoSansCJKtcBlack = "NotoSansCJKtc-Black")

# function -- 讀取最近期的資料
latestPath <- function(path){
  latestOrder <- path %>% file.info() %>% .$ctime %>% which.max()
  return(path[latestOrder])
}

# function -- ceiling by digits as `round(digits = ...)`
ceiling_digits <- function(x, digits = 10){ceiling((x + 1)/digits)*digits}

load data

dat <- read_delim("data/specimens_dwc14.csv",
                  delim = ";",
                  escape_double = FALSE,
                  trim_ws = TRUE,
                  progress = TRUE)
## indexing specimens_dwc14.csv [------------------------------] 2.15GB/s, eta:  0sindexing specimens_dwc14.csv [==============================] 2.15GB/s, eta:  0s                                                                                                                                                                indexing specimens_dwc14.csv [============================] 142.80MB/s, eta:  0s                                                                                
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 498731 Columns: 81
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr  (53): GlobalUniqueIdentifier, BasisOfRecord, InstitutionCode, Collectio...
## dbl  (13): SpcmID, DupID, SeriID, DetID, CountryID, MinimumElevationInMeters...
## lgl  (11): InformationWithheld, SpecificEpithetInChinese, NomenclaturalCode,...
## dttm  (4): DateLastModified, EarliestDateCollected, LatestDateCollected, Dat...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# taiCOL taxa list ----
name_taicol <- read_csv(grep("^data/TaiCOL_taxon_.+\\.csv$", x = list.files(recursive = TRUE),value = TRUE) %>% latestPath())
## Rows: 95288 Columns: 42
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (30): taxon_id, simple_name, name_author, formatted_name, synonyms, for...
## dbl   (1): name_id
## lgl   (9): is_hybrid, is_endemic, is_fossil, is_terrestrial, is_freshwater, ...
## date  (2): created_at, updated_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

資料長這樣

dat %>% glimpse()
## Rows: 498,731
## Columns: 81
## $ SpcmID                       <dbl> 1, 2, 2, 2, 3, 3, 4, 5, 6, 7, 8, 9, 9, 10…
## $ DupID                        <dbl> 1, 1, 2, 3, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1,…
## $ SeriID                       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ DetID                        <dbl> 1, 334021, 334020, 334019, 4, 309515, 6, …
## $ GlobalUniqueIdentifier       <chr> "TAIF:PLANT:1:1:1:21688", "TAIF:PLANT:2:1…
## $ DateLastModified             <dttm> 2017-09-05 11:04:04, 2009-12-09 14:39:23…
## $ BasisOfRecord                <chr> "PreservedSpecimen", "PreservedSpecimen",…
## $ InstitutionCode              <chr> "TAIF", "TAIF", "TAIF", "TAIF", "TAIF", "…
## $ CollectionCode               <chr> "PLANT", "PLANT", "PLANT", "PLANT", "PLAN…
## $ CatalogNumber                <chr> "21688", "21181", "330969", "330970", "21…
## $ InformationWithheld          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Remarks                      <chr> "6,000 ft.", "Hassen-zan 7000ft.; yellow …
## $ RemarksInChinese             <chr> "6,000 ft.", "Hassen-zan 7000ft.; yellow …
## $ SPID                         <chr> "Eupa3", "Impa4", "Impa4", "Impa4", "Also…
## $ ScientificName               <chr> "Eupatorium formosanum Hayata", "Impatien…
## $ SpeciesNameInChinese         <chr> NA, "紫花鳳仙花", "紫花鳳仙花", "紫花鳳仙…
## $ HigherTaxon                  <chr> "Plantae, Magnoliophyta, Magnoliopsida, A…
## $ HigherTaxonInChinese         <chr> "植物界, 木蘭植物門, 木蘭綱, 菊目, 菊科, …
## $ Kingdom                      <chr> "Plantae", "Plantae", "Plantae", "Plantae…
## $ KingdomInChinese             <chr> "植物界", "植物界", "植物界", "植物界", "…
## $ Phylum                       <chr> "Magnoliophyta", "Magnoliophyta", "Magnol…
## $ PhylumInChinese              <chr> "木蘭植物門", "木蘭植物門", "木蘭植物門",…
## $ Class                        <chr> "Magnoliopsida", "Magnoliopsida", "Magnol…
## $ ClassInChinese               <chr> "木蘭綱", "木蘭綱", "木蘭綱", "木蘭綱", "…
## $ Order                        <chr> "Asterales", "Geraniales", "Geraniales", …
## $ OrderInChinese               <chr> "菊目", "牻牛兒苗目", "牻牛兒苗目", "牻牛…
## $ Family                       <chr> "Asteraceae", "Balsaminaceae", "Balsamina…
## $ FamilyInChinese              <chr> "菊科", "鳳仙花科", "鳳仙花科", "鳳仙花科…
## $ Genus                        <chr> "Eupatorium", "Impatiens", "Impatiens", "…
## $ GenusInChinese               <chr> "澤蘭屬", "鳳仙花屬", "鳳仙花屬", "鳳仙花…
## $ SpecificEpithet              <chr> "formosanum", "uniflora", "uniflora", "un…
## $ SpecificEpithetInChinese     <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ InfraspecificRank            <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ InfraspecificEpithet         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ AuthorYearOfScientificName   <chr> "Hayata", "Hayata", "Hayata", "Hayata", "…
## $ NomenclaturalCode            <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ IdentificationQualifier      <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ HigherGeography              <chr> "Taiwan (台灣省), Taiwan", "Taiwan (台灣…
## $ CountryID                    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Country                      <chr> "Taiwan", "Taiwan", "Taiwan", "Taiwan", "…
## $ CountryInChinese             <chr> "台灣", "台灣", "台灣", "台灣", "台灣", "…
## $ StateProvince                <chr> "Taiwan (台灣省)", "Taiwan (台灣省)", "Ta…
## $ County                       <chr> NA, NA, NA, NA, NA, NA, NA, "Miaoli Count…
## $ CountyInChinese              <chr> NA, NA, NA, NA, NA, NA, NA, "苗栗縣", "臺…
## $ SubCountyCantonInChinese     <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Locality                     <chr> "Mt. Arisan", "Hassen-zan", "Hassen-zan",…
## $ LocalityInChinese            <chr> "Mt. Arisan", "Hassen-zan", "Hassen-zan",…
## $ AlternativeLocalityInChinese <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ MinimumElevationInMeters     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 3705, 370…
## $ MaximumElevationInMeters     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 3705, 370…
## $ ValidDistributionFlag        <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ EarliestDateCollected        <dttm> 1927-10-08, 1927-08-18, 1927-08-18, 1927…
## $ LatestDateCollected          <dttm> 1927-10-08, 1927-08-18, 1927-08-18, 1927…
## $ DayOfYear                    <dbl> 281, 230, 230, 230, 240, 240, 230, 184, 2…
## $ Collector                    <chr> "Ryozo Kanehira", "Ryozo Kanehira", "Ryoz…
## $ CollectorInChinese           <chr> "金平 亮三", "金平 亮三", "金平 亮三", "…
## $ OtherCollectorInChinese      <chr> "Syunichi Sasaki (佐佐木 舜一)", NA, NA, …
## $ ImageURL                     <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ RelatedInformation           <chr> NA, NA, NA, NA, NA, NA, "Phenology: Spori…
## $ RelatedInformationInChinese  <chr> NA, NA, NA, NA, NA, NA, "物候: Sporing", …
## $ CatalogNumberNumeric         <dbl> 21688, 21181, 330969, 330970, 21560, 3077…
## $ IdentifiedBy                 <chr> NA, NA, NA, NA, NA, NA, NA, "Chia-Ying Hu…
## $ IdentifiedByInChinese        <chr> NA, NA, NA, NA, NA, NA, NA, "胡嘉穎", "林…
## $ DateIdentified               <dttm> NA, NA, NA, NA, NA, NA, 1927-08-18, 2001…
## $ CurrentDetermination         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ CollectorNumber              <chr> "s. n.", "s. n.", "s. n.", "s. n.", "s. n…
## $ VerbatimCollectingDate       <chr> "1927-Oct-08", "1927-Aug-18", "1927-Aug-1…
## $ VerbatimElevation            <chr> NA, NA, NA, NA, NA, NA, NA, NA, "3705 M",…
## $ Preparations                 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ PreparationsInChinese        <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ TypeStatus                   <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ TypeStatusInChinese          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Disposition                  <chr> "in collection", "in collection", "in col…
## $ DecimalLatitude              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 24.31667,…
## $ DecimalLongitude             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 121.4167,…
## $ GeodeticDatum                <chr> "TWD67", "TWD67", "TWD67", "TWD67", "TWD6…
## $ LocValid                     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ VerbatimCoordinates          <chr> NA, NA, NA, NA, NA, NA, NA, NA, "N 24/19/…
## $ VerbatimLatitude             <chr> NA, NA, NA, NA, NA, NA, NA, NA, "24/19/",…
## $ VerbatimLongitude            <chr> NA, NA, NA, NA, NA, NA, NA, NA, "121/25/"…
## $ VerbatimCoordinateSystem     <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…

基礎統計資料

統計至202502資料:

多少份館藏標本?
How many specimens in TAIF herbarium?

n_specimens <- dat %>%
  distinct(CatalogNumber) %>%
  lengths()
cat(n_specimens, "份\n")
## 492577 份

多少採集者貢獻標本?
How many collectors contribute specimens?

n_collector <- dat %>%
  distinct(Collector) %>%
  lengths()
cat(n_collector, "人\n")
## 4058 人

平均每人貢獻標本份數
The average number of specimens from each collector?

cat(round(n_specimens/n_collector, digits = 0), "份/人\n")
## 121 份/人

繪製時間分布

資料繪圖前處理

dat_timeS <- dat %>% summarise(count = n(), .by = c(EarliestDateCollected, Collector)) %>% arrange(desc(count))

# Since my time is currently a factor, I have to convert it to a date-time format!
dat_timeS$EarliestDateCollected <- ymd(dat_timeS$EarliestDateCollected)

# Then you can create the xts necessary to use dygraph
dat_timeS_dropNA <- dat_timeS %>%
  mutate(EarliestDateCollected = round_date(EarliestDateCollected, unit = "year")) %>% 
  drop_na(EarliestDateCollected) %>%
  summarise(count = sum(count), .by = c(EarliestDateCollected, Collector)) %>%
  arrange(EarliestDateCollected, desc(count))

dat_timeS_dropNA_plot <- dat_timeS_dropNA %>%
  summarise(yearCount = sum(count), .by = EarliestDateCollected) %>%
  left_join(dat_timeS_dropNA, .) %>%
  # If the collector collections by year less than 30 percent of total collections, the counts will add to others group.
  mutate(countRatio = count/yearCount, countIfOther = countRatio < 0.30) %>%
  # If the number of collectors by year more than 10,
  mutate(indexCount =index(count), .by = EarliestDateCollected) %>%
  # the counts will add to others group.
  mutate(collectorIfother = indexCount > 10) %>% 
  mutate(group_others = ifelse(countIfOther*collectorIfother, "others", Collector)) %>% 
  summarise(groupCount = sum(count), .by = c(EarliestDateCollected, group_others))
## Joining with `by = join_by(EarliestDateCollected)`
dat_timeS_dropNA_xts <-  dat_timeS_dropNA_plot %>%
  select(EarliestDateCollected, groupCount, group_others) %>% 
  pivot_wider(names_from = group_others, values_from = groupCount, values_fn = sum, values_fill = 0) %>%
  as.xts()
# Finally the plot

timeSeq <- "/"
collector_n <- (1:dim(dat_timeS_dropNA_xts)[2])

p <- dygraph(data = dat_timeS_dropNA_xts[timeSeq, collector_n]) %>%
  dyRangeSelector() %>%
  dyStackedBarGroup(name = dimnames(dat_timeS_dropNA_xts)[[2]][collector_n]) %>% 
  # dyCrosshair(direction = "both") %>%
  dyHighlight(highlightCircleSize = 5,
              highlightSeriesBackgroundAlpha = 0.8,
              highlightSeriesOpts = list(pointShape = "square"),
              hideOnMouseOut = FALSE)  %>%
  # dyRoller(rollPeriod = 1) %>%
  dyLegend(show = "onmouseover", labelsSeparateLines = TRUE, showZeroValues = FALSE) %>% 
  dyOptions(labelsUTC = FALSE,
            fillGraph = TRUE,
            fillAlpha = 0.5,
            drawGrid = FALSE,
            includeZero = FALSE,
            axisLineWidth = 1.5,
            colors = viridis::viridis(length(collector_n), direction = -1)) %>%
  dyAxis("y"
         # valueRange = c(0, dat_timeS_dropNA_xts[timeSeq, collector_n] %>% rowSums(na.rm = TRUE) %>% max() %>%  ceiling_digits(digits = 100))
  )

p
date_majorEvent <- c(
  "1904-01-01",
  "1924-01-01",
  "1945-01-01",
  "2000-01-01",
  "2024-01-01"
) %>% as.Date()

p <- dat_timeS_dropNA_plot %>%
  arrange(EarliestDateCollected, groupCount, group_others) %>% 
  # filter(EarliestDateCollected <= as.Date("1924-01-01"))  %>% 
  # filter(group_others %in% c("Cheng-Te Hsu", "(n/a)", "(unknown)", "Pi-Fong Lu", "others")) %>%
  ggplot(mapping = aes(x = EarliestDateCollected,
                       y = groupCount,
                       fill = fct_reorder2(group_others, EarliestDateCollected, groupCount),
                       text = fct_reorder2(group_others, EarliestDateCollected, groupCount))) +
  geom_vline(xintercept = as.Date("1904-01-01"), color = "grey40", lwd = 1.5, lty = 5) +
  geom_vline(xintercept = as.Date("1924-01-01"), color = "grey40", lwd = 1.5, lty = 5) +
  geom_vline(xintercept = as.Date("1945-01-01"), color = "grey40", lwd = 1.5, lty = 5) +
  # geom_vline(xintercept = as.Date("1972-01-01"), color = "grey40", lwd = 1.5, lty = 5) +
  # geom_vline(xintercept = as.Date("1979-01-01"), color = "grey40", lwd = 1.5, lty = 5) +
  # geom_vline(xintercept = as.Date("1993-01-01"), color = "grey40", lwd = 1.5, lty = 5) +
  # geom_vline(xintercept = as.Date("2003-01-01"), color = "grey40", lwd = 1.5, lty = 5) +
  geom_vline(xintercept = as.Date("2000-01-01"), color = "grey40", lwd = 1.5, lty = 5) +
  geom_vline(xintercept = as.Date("2024-01-01"), color = "grey40", lwd = 1.5, lty = 5) +
  geom_col() +
  scale_fill_viridis_d(direction = -1) +
  coord_cartesian(
    # ylim =
    #   c(0,
    #     dat_timeS_dropNA_plot %>%
    #       filter(EarliestDateCollected <= as.Date("1924-01-01")) %>% summarise(groupCountSum = sum(groupCount), .by = EarliestDateCollected) %>% pull(groupCountSum) %>% max() %>% ceiling_digits(digits = 100)
    #   ),
    expand = FALSE
  ) +
  # theme_ipsum() +
  labs(x = "yr", y = "#") +
  scale_x_date(breaks = c("1904-01-01", "1924-01-01", "1945-01-01", "2000-01-01", "2024-01-01") %>% as.Date(), minor_breaks = seq(ymd(18500101), ymd(20250101), by = "10 years"), date_labels = "%Y", limits = c("1848-01-01", "2025-01-01") %>% as.Date()) +
  scale_y_continuous(breaks = c(0, 2000, 10000, 20000, 30000), limits = c(0, 31000)) +
  theme_gdocs(
    # base_size = 28,
    base_family = "NotoSansCJKtcBlack") +
  scale_color_gdocs() +
  theme(
    axis.ticks.length.x = unit(0.8, units = "cm"),
    axis.minor.ticks.length.x = unit(0.5, units = "cm"),
    axis.ticks = element_line(linewidth = 2, color = "black"),
    axis.minor.ticks.x.bottom = element_line(linewidth = 1, color = "black"),
    axis.text = element_markdown(color = "black"),
    axis.title.y = element_text(angle = 0, color = "black", hjust = 1),
    axis.title.x = element_text(color = "black", hjust = 1),
    plot.background = element_rect(fill = "white", color = "white"),
    panel.background = element_rect(fill = "white"),
    plot.margin = unit(c(1,1,1,1), units = "cm"),
    legend.position="none"
  )
# guides(fill = "none")

# p
# ggsave(filename = "timeGap.png", device = "png", width = 80, height = 25,units = "cm", dpi = 600)

p <- ggplotly(p, tooltip = c("x","y","text"))
p

time-series calender heatmaps

source("https://raw.githubusercontent.com/iascchen/VisHealth/master/R/calendarHeat.R")
# http://www.columbia.edu/~sg3637/blog/Time_Series_Heatmaps.html

dat_calend <- dat %>%
  select(EarliestDateCollected) %>% 
  mutate(Year = EarliestDateCollected %>% year(),
         Month = EarliestDateCollected %>% month()) %>%
  group_by(Year, Month) %>% 
  summarise(count = n()) %>% 
  drop_na() %>% 
  arrange(Year, Month)
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
# mutate(EarliestDateCollected = as.Date(EarliestDateCollected)) %>% 
# drop_na() %>% 
# arrange(EarliestDateCollected)
# filter(lubridate::year(EarliestDateCollected) == lubridate::year(lubridate::ymd(20000101)))

# calendarHeat(dates = dat_calend$EarliestDateCollected, values = dat_calend$count)

p_calend <- dat_calend %>%
  # filter(Year <1945) %>% 
  # filter(Year >= 1945 & Year < 2000) %>%
  # filter(Year >= 2000) %>%
  ggplot(data = .) +
  geom_tile(mapping = aes(x = Year, y = Month, fill = count), color = "black") +
  scale_fill_gradient2(low = "white", high = "#004616") +
  scale_x_continuous(breaks = c(1904, 1924, 1945, 2000, 2024),
                     minor_breaks = seq(1850, 1945, 10)
                     # labels = c(1904, 1924, 1945, 2000, 2024)
  ) +
  scale_y_continuous(breaks = c(1, 3, 6, 9, 12), labels = c("Jan", "Mar", "Jun", "Sep", "Dec")) +
  coord_equal() +
  labs(x = "yr", y = "", fill = "#") +
  theme_void() +
  # theme(axis.text.y = element_text(size = 10))
  theme(
    axis.ticks.length.x = unit(0.1, units = "cm"),
    axis.minor.ticks.length.x = unit(0.5, units = "cm"),
    axis.ticks = element_line(linewidth = 1.5, color = "black"),
    axis.minor.ticks.x.bottom = element_line(linewidth = 1, color = "black"),
    axis.text = element_markdown(
      # size = unit(24, units = "cm"),
      color = "black"),
    axis.title.y = element_text(angle = 0, color = "black", hjust = 1),
    axis.title.x = element_text(color = "black", hjust = 1),
    plot.background = element_rect(fill = "#ffffff00", color = "#ffffff00"),
    panel.background = element_rect(fill = "#FFD700")
    # plot.margin = unit(c(1,1,1,1), units = "cm")
    # legend.position="none"
  )

p_calend

# ggsave(filename = "timeGap_calend.png", device = "png", width = 32, height = 10,units = "cm", dpi = 600)
# ggsave(filename = "timeGap_calend_2.png", device = "png", width = 12, height = 10,units = "cm", dpi = 600)
# ggsave(filename = "timeGap_calend_3.png", device = "png", width = 11, height = 10,units = "cm", dpi = 600)

文字雲

library(wordcloud2)
library(webshot)
# webshot::install_phantomjs()

wcloud_plot <- dat_timeS_dropNA %>%
  # filter(EarliestDateCollected < ymd(19040101)) %>%
  # filter(between(EarliestDateCollected, ymd(19040101),ymd(19230101))) %>%
  # filter(between(EarliestDateCollected, ymd(19240101),ymd(19440101))) %>%
  # filter(between(EarliestDateCollected, ymd(19450101),ymd(19990101))) %>%
  # filter(between(EarliestDateCollected, ymd(20000101),ymd(20250101))) %>%
  select(word = Collector, freq = count) %>%
  group_by(word) %>%
  summarise(freq = sum(freq)) %>%
  arrange(desc(freq)) %>% 
  wordcloud2(data = .,
             # color = "forestgreen",
             # color = "darkorange",
             fontWeight = 500, minRotation = 0.1, maxRotation = 0.9, shape = "circle")
# htmlwidgets::saveWidget(widget = wcloud_plot, file = "test_1904.html", selfcontained = FALSE)
# htmlwidgets::saveWidget(widget = wcloud_plot, file = "test_1924.html", selfcontained = FALSE)
# htmlwidgets::saveWidget(widget = wcloud_plot, file = "test_1945.html", selfcontained = FALSE)
# htmlwidgets::saveWidget(widget = wcloud_plot, file = "test_2000.html", selfcontained = FALSE)
# htmlwidgets::saveWidget(widget = wcloud_plot, file = "test_2024.html", selfcontained = FALSE)
wcloud_plot

標本數-屬排名(前30名)

rank_n <- 1:30
p_rankGenus <- dat %>%
  group_by(Genus) %>%
  summarise(count = n()) %>%
  left_join(.,
            by = "Genus",
            multiple = "first",
            dat %>% dplyr::distinct(Genus, GenusInChinese, ClassInChinese)
  ) %>%
  arrange(desc(count)) %>%
  mutate(Genus = glue::glue("{GenusInChinese}/<i>{Genus}</i>", .na = NULL)) %>%
  mutate(Genus = fct_reorder(Genus, count)) %>% 
  mutate(ClassInChinese = fct(ClassInChinese, levels = c("眼蟲綱","石松綱","木賊綱","松葉蕨綱","真蕨綱","蘇鐵綱","銀杏綱","買麻藤綱","松綱","木蘭綱","百合綱"))) %>%
  drop_na(Genus) %>%
  slice(rank_n) %>%
  ggplot(mapping = aes(x = Genus, y = count, fill = ClassInChinese)) +
  geom_col() +
  scale_fill_viridis(direction = 1, discrete = TRUE, option = "G") +
  labs(y = "#") +
  coord_flip(expand = FALSE, clip = "on") +
  theme_solarized(base_family = "NotoSansCJKtcBlack") +
  theme(
    axis.text.y = element_markdown(
      # size = 16,
      colour = "black"),
    axis.title.y = element_blank(),
    axis.title.x = element_text(colour = "black", hjust = 1),
    legend.text = element_text(colour = "black"),
    legend.title = element_blank()
  )

# p_rankGenus %>% ggsave(filename = "rankGenus.png", device = "png",units = "cm", width = 30, height = 18, dpi = 600)
p_rankGenus

cat(
  dat %>% group_by(Genus) %>% summarise(count = n()) %>% arrange(desc(count)) %>% mutate(Genus = fct_reorder(Genus, count)) %>% filter(Genus %>% is.na()) %>% pull(count),
  "份(未鑑定)\n"
)
## 8577 份(未鑑定)

標本數-物種排名(前30名) 大家最愛採

rankSci_plot <- dat %>%
  group_by(ScientificName) %>%
  summarise(count = n()) %>%
  left_join(.,
            by = "ScientificName",
            multiple = "first",
            dat %>% dplyr::distinct(ScientificName, ClassInChinese, SpeciesNameInChinese, GenusInChinese, Genus, SpecificEpithet, InfraspecificRank, InfraspecificEpithet, AuthorYearOfScientificName)
  ) %>%
  arrange(desc(count)) %>%
  # slice_head(n = 10) %>%
  rowwise(SpeciesNameInChinese, GenusInChinese) %>% 
  mutate(ScientificName_plot = replace_na(SpeciesNameInChinese, GenusInChinese)) %>%
  ungroup() %>% 
  mutate(ScientificName_plot2 = glue("<i>{Genus} {SpecificEpithet}</i>{InfraspecificRank}<i> {InfraspecificEpithet}</i>{AuthorYearOfScientificName}",.sep = " ", .na = "")) %>% 
  mutate(ScientificName_plot = glue("{ScientificName_plot}/{ScientificName_plot2}")) %>%
  mutate(ScientificName_plot = fct_reorder(ScientificName_plot, count)) %>%
  mutate(ClassInChinese = fct(ClassInChinese, levels = c("眼蟲綱","石松綱","木賊綱","松葉蕨綱","真蕨綱","蘇鐵綱","銀杏綱","買麻藤綱","松綱","木蘭綱","百合綱"))) %>%
  drop_na(ScientificName) %>%
  slice(rank_n) %>%
  ggplot(mapping = aes(x = ScientificName_plot, y = count, fill = ClassInChinese)) +
  geom_col() +
  scale_fill_viridis(direction = 1, discrete = TRUE, option = "G") +
  coord_flip(expand = FALSE, clip = "on") +
  labs(y = "#") +
  theme_solarized(
    # base_size = 12,
    base_family = "NotoSansCJKtcBlack") +
  theme(
    axis.text.y = element_markdown(
      # size = 16,
      colour = "black"),
    axis.title.y = element_blank(),
    axis.title.x = element_text(colour = "black", hjust = 1),
    legend.text = element_text(colour = "black"),
    legend.title = element_blank()
  )

# rankSci_plot %>% ggsave(filename = "rankSpecies.png", device = "png",units = "cm", width = 30, height = 18, dpi = 600)
rankSci_plot

cat(
  dat %>% group_by(ScientificName) %>% summarise(count = n()) %>% arrange(desc(count)) %>% mutate(ScientificName = fct_reorder(ScientificName, count)) %>% filter(ScientificName %>% is.na()) %>% pull(count),
  "份(未鑑定)\n"
)
## 8577 份(未鑑定)

資料空缺年

website: https://chestnut123tw.github.io/theme-year_datagap

dat_pie <- tibble(
  gold = 0,
  silver = dat %>% select(ScientificName, DecimalLatitude, DecimalLongitude, EarliestDateCollected) %>% drop_na() %>% nrow(),
  bronze = dat %>% nrow() - silver
)
dat_pie <- dat_pie %>% pivot_longer(cols = 1:3, names_to = "quality", values_to = "count") %>%
  mutate(quality = factor(quality, levels = c("gold", "silver","bronze"))) %>%
  mutate(colors = c('#FFF019', '#F1F1F1', '#AC6B2B'))

ggplot(dat_pie) +
  geom_bar(mapping = aes(x = "", y = count, fill = quality), stat="identity", width = 1, color = "black") +
  coord_polar("y", start = 0) +
  scale_fill_manual(values = dat_pie$colors) +
  theme_void()

分類群空缺

載入套件treemap

library(treemap)
# library(waffle) not yet...
# source("https://gist.githubusercontent.com/timelyportfolio/91e0b8c2b8cd4202baa2/raw/37663bf3de2168157e7f0c26a14422007ab843e8/as_rcdimple.R")

資料前處理…

# dat dataset 1 -- 直接使用組裝欄位 ----
dat_name <- dat %>%
  unite(col = simple_scientificName, Genus, SpecificEpithet, InfraspecificRank, InfraspecificEpithet, sep = " ", na.rm = TRUE) %>%
  select(simple_scientificName) %>% mutate(count = 1)

# # dat dataset 2 -- 以比對後taxa list ----
# dat_name <- dat_joinacceptedName %>%
#   select(simple_scientificName) %>% mutate(count = 1)

# taxa list dataset 1 -- pick out native, Tracheophyta, Plantae taxa from TaiCOL ----
name_trach <- name_taicol %>%
  filter(kingdom == "Plantae") %>%
  filter(rank %in% c("Species","Subspecies","Variety","Form","Hybrid Formula")) %>%
  filter(phylum == "Tracheophyta") %>%
  filter(alien_type == "native") %>% 
  distinct()
# taxa list dataset 2 -- pick out Tracheophyta, Plantae taxa from TaiCOL ----
# name_trach <- name_taicol %>%
#     filter(kingdom == "Plantae") %>%
#     filter(rank %in% c("Species","Subspecies","Variety","Form","Hybrid Formula")) %>%
#     filter(phylum == "Tracheophyta") %>% distinct()

# modifying data structure for plotting ----
name_trach_plot <- full_join(keep = TRUE,
                             x = dat_name,
                             y = name_trach,
                             by = join_by(simple_scientificName == simple_name)
) %>%
  replace_na(list(count = 0)) %>%
  arrange(count) %>%
  group_by(simple_name, taxon_id, class_c, family_c) %>% summarise(count = sum(count)) %>%
  arrange(class_c, family_c, simple_name) %>%
  select(taxon_id, simple_name, count, class_c, family_c) %>%
  drop_na() %>% 
  mutate(n_tile = 1) %>% 
  ungroup()
## `summarise()` has grouped output by 'simple_name', 'taxon_id', 'class_c'. You
## can override using the `.groups` argument.
cols_8 <- c("#F9F9F9","#CCE1D0","#9FC9A7","#6FB17E","#369953","#007F2E","#006222","#004616")  

分類群空缺-裸子植物

### 裸子植物 ----
name_trach_plot_gymno <- name_trach_plot %>%
  filter(class_c %in% c("松綱", "蘇鐵綱")) %>%
  # filter(class_c %in% c("木蘭植物綱")) %>%
  # filter(class_c %in% c("水龍骨綱", "石松綱")) %>%
  mutate(countPlot = if_else(count == 0, 0, NA)) %>%
  mutate(countPlot = if_else(count == 1, 1, countPlot)) %>%
  mutate(countPlot = if_else(between(count, 2, 10), 10, countPlot)) %>%
  mutate(countPlot = if_else(between(count, 11, 50), 50, countPlot)) %>%
  mutate(countPlot = if_else(between(count, 51, 100), 100, countPlot)) %>%
  mutate(countPlot = if_else(between(count, 101, 200), 200, countPlot)) %>%
  mutate(countPlot = if_else(between(count, 201, 400), 400, countPlot)) %>% 
  mutate(countPlot = if_else(between(count, 401, 800), 800, countPlot)) %>% 
  mutate(simple_name = simple_name %>% gsub(x=.," ", "\n"))

# png(filename = "nameTreemap.png", width = 40, height = 30,units = "cm", res = 600)
name_trach_plot_gymno %>%
  mutate(countPlot = countPlot %>% factor()) %>% 
  treemap(
    index = c("family_c", "simple_name"),
    vSize = "n_tile",
    vColor = "countPlot",
    type = "categorical",
    align.labels = list(
      c("right", "bottom"),
      c("left", "top")
    ),
    fontfamily.labels = "NotoSansCJKtcBlack",
    
    palette = cols_8,
    bg.labels = c("#ffffffaa"),
    fontsize.labels = c(36, 24),
    lowerbound.cex.labels = 0,
    border.lwds = c(5, 2),
    # border.col = ,
    title = "",
    title.legend = "#",
    reverse.legend = TRUE,
    overlap.labels = 1,
    inflate.labels = FALSE
  )

# dev.off()

分類群空缺-被子植物

### 被子植物--中位數前30少(不含蘭科) ----
name_trach_plot_angio <- name_trach_plot %>%
  # filter(class_c %in% c("松綱", "蘇鐵綱")) %>%
  filter(class_c %in% c("木蘭植物綱")) %>%
  # filter(class_c %in% c("水龍骨綱", "石松綱")) %>%
  mutate(countPlot = if_else(count == 0, 0, NA)) %>%
  mutate(countPlot = if_else(count == 1, 1, countPlot)) %>%
  mutate(countPlot = if_else(between(count, 2, 10), 10, countPlot)) %>%
  mutate(countPlot = if_else(between(count, 11, 50), 50, countPlot)) %>%
  mutate(countPlot = if_else(between(count, 51, 100), 100, countPlot)) %>%
  mutate(countPlot = if_else(between(count, 101, 200), 200, countPlot)) %>%
  mutate(countPlot = if_else(between(count, 201, 400), 400, countPlot)) %>% 
  mutate(countPlot = if_else(between(count, 401, 800), 800, countPlot)) %>% 
  mutate(simple_name = simple_name %>% gsub(x=.," ", "\n"))

# 被子植物科層級採集量中位數前30少
am_angioTop30 <- name_trach_plot_angio %>% group_by(family_c) %>% summarise(am = median(count)) %>% arrange(am) %>% slice_head(n = 30) %>% pull(family_c)

# png(filename = "nameTreemap_angio.png", width = 40, height = 30,units = "cm", res = 600)
name_trach_plot_angio %>%
  mutate(countPlot = countPlot %>% factor()) %>% 
  filter(family_c %in% am_angioTop30) %>% 
  filter(!family_c %in% "蘭科") %>%
  treemap(
    index = c("family_c", "simple_name"),
    vSize = "n_tile",
    vColor = "countPlot",
    type = "categorical",
    align.labels = list(
      c("right", "bottom"),
      c("left", "top")
    ),
    fontfamily.labels = "NotoSansCJKtcBlack",
    
    palette = cols_8,
    bg.labels = c("#ffffffaa"),
    fontsize.labels = c(36, 20),
    lowerbound.cex.labels = 0,
    border.lwds = c(5, 2),
    # border.col = ,
    title = "",
    title.legend = "#",
    reverse.legend = TRUE,
    overlap.labels = 1,
    inflate.labels = FALSE
  )

# dev.off()

分類群空缺-蕨類植物

### 蕨類植物 ----
name_trach_plot_fern <- name_trach_plot %>%
  # filter(class_c %in% c("松綱", "蘇鐵綱")) %>%
  # filter(class_c %in% c("木蘭植物綱")) %>%
  filter(class_c %in% c("水龍骨綱", "石松綱")) %>%
  mutate(countPlot = if_else(count == 0, 0, NA)) %>%
  mutate(countPlot = if_else(count == 1, 1, countPlot)) %>%
  mutate(countPlot = if_else(between(count, 2, 10), 10, countPlot)) %>%
  mutate(countPlot = if_else(between(count, 11, 50), 50, countPlot)) %>%
  mutate(countPlot = if_else(between(count, 51, 100), 100, countPlot)) %>%
  mutate(countPlot = if_else(between(count, 101, 200), 200, countPlot)) %>%
  mutate(countPlot = if_else(between(count, 201, 400), 400, countPlot)) %>% 
  mutate(countPlot = if_else(between(count, 401, 800), 800, countPlot)) %>% 
  mutate(simple_name = simple_name %>% gsub(x=.," ", "\n"))

# 蕨類植物科層級採集量中位數前30少
am_fernTop30 <- name_trach_plot_fern %>% group_by(family_c) %>% summarise(am = median(count)) %>% arrange(am) %>% slice_head(n = 30) %>% pull(family_c)

# png(filename = "nameTreemap_fern.png", width = 40, height = 30,units = "cm", res = 600)
name_trach_plot_fern %>%
  mutate(countPlot = countPlot %>% factor()) %>% 
  filter(family_c %in% am_fernTop30) %>% 
  treemap(
    index = c("family_c", "simple_name"),
    vSize = "n_tile",
    vColor = "countPlot",
    type = "categorical",
    align.labels = list(
      c("right", "bottom"),
      c("left", "top")
    ),
    fontfamily.labels = "NotoSansCJKtcBlack",
    
    palette = cols_8,
    bg.labels = c("#ffffffaa"),
    fontsize.labels = c(36, 20),
    lowerbound.cex.labels = 0,
    border.lwds = c(5, 2),
    # border.col = ,
    title = "",
    title.legend = "#",
    reverse.legend = TRUE,
    overlap.labels = 1,
    inflate.labels = FALSE
  )

# dev.off()

分類群空缺(佔原生維管束植物各分類群比例)
The proportion of taxa covered by the collection to native vascular plant taxa.

cat("原生維管束植物:", name_trach_plot %>% nrow(), "個學名\n")
## 原生維管束植物: 5194 個學名
cat("館藏原生維管束植物:", name_trach_plot %>% filter(count != 0) %>% nrow(), "個學名",
    (name_trach_plot %>% filter(count != 0) %>% nrow() / name_trach_plot %>% nrow()) %>% scales::label_percent()(.),
    "\n")
## 館藏原生維管束植物: 4527 個學名 87%
cat("原生維管束植物(裸子植物):", name_trach_plot %>% filter(class_c %in% c("松綱", "蘇鐵綱")) %>% nrow(), "個學名\n")
## 原生維管束植物(裸子植物): 34 個學名
cat("館藏原生維管束植物(裸子植物):", name_trach_plot %>% filter(class_c %in% c("松綱", "蘇鐵綱")) %>% filter(count != 0) %>% nrow(), "個學名",
    (name_trach_plot %>% filter(class_c %in% c("松綱", "蘇鐵綱")) %>% filter(count != 0) %>% nrow() / name_trach_plot %>% filter(class_c %in% c("松綱", "蘇鐵綱")) %>% nrow()) %>% scales::label_percent()(.),
    "\n")
## 館藏原生維管束植物(裸子植物): 31 個學名 91%
cat("原生維管束植物(被子植物):", name_trach_plot %>% filter(class_c %in% c("木蘭植物綱")) %>% nrow(), "個學名\n")
## 原生維管束植物(被子植物): 4217 個學名
cat("館藏原生維管束植物(被子植物):", name_trach_plot %>% filter(class_c %in% c("木蘭植物綱")) %>% filter(count != 0) %>% nrow(), "個學名",
    (name_trach_plot %>% filter(class_c %in% c("木蘭植物綱")) %>% filter(count != 0) %>% nrow() / name_trach_plot %>% filter(class_c %in% c("木蘭植物綱")) %>% nrow()) %>% scales::label_percent()(.),
    "\n")
## 館藏原生維管束植物(被子植物): 3640 個學名 86%
cat("原生維管束植物(蕨類植物):", name_trach_plot %>% filter(class_c %in% c("水龍骨綱", "石松綱")) %>% nrow(), "個學名\n")
## 原生維管束植物(蕨類植物): 943 個學名
cat("館藏原生維管束植物(蕨類植物):", name_trach_plot %>% filter(class_c %in% c("水龍骨綱", "石松綱")) %>% filter(count != 0) %>% nrow(), "個學名",
    (name_trach_plot %>% filter(class_c %in% c("水龍骨綱", "石松綱")) %>% filter(count != 0) %>% nrow() / name_trach_plot %>% filter(class_c %in% c("水龍骨綱", "石松綱")) %>% nrow()) %>% scales::label_percent()(.),
    "\n")
## 館藏原生維管束植物(蕨類植物): 856 個學名 91%

分類群空缺–學名對應nomenMatch v2

# function nMv2() -- nomemmatch v2 api ----
nMv2 <- function(name, source = "taicol", best = "yes", format = "csv", multiName = FALSE){
  if(multiName %>% isTRUE()){
    
    RETRY(verb = "GET",
        url = "https://match.taibif.tw/v2/api.php",
        query = list(names = name %>% pull() %>% paste0(collapse = "|"), source = source, best = best, format = format)
  )
    
  } else{
    
  RETRY(verb = "GET",
        url = "https://match.taibif.tw/v2/api.php",
        query = list(names = name, source = source, best = best, format = format)
  )
    
  }
}

# simple_scientificName (組裝欄位(Genus, SpecificEpithet, InfraspecificRank, InfraspecificEpithet)產生) to match -- solution 1 ----
dat_TaxonName <- dat %>%
  unite(col = simple_scientificName, Genus, SpecificEpithet, InfraspecificRank, InfraspecificEpithet, sep = " ", na.rm = TRUE) %>%
  select(simple_scientificName) %>%
  distinct() %>%
  filter(!simple_scientificName == "")

# scientificName to match -- solution 2 ----
# dat_TaxonName2 <- dat %>% 
#   select(ScientificName) %>% 
#   distinct() %>%
#   filter(!scientificName == "")


## query for requests to nomenMatch v2 api(TaiCOL) ----
# startT <- Sys.time()
# 
# dat_TaxonName_nMv2 <- tibble()
# batch_size <- 100
# n <- nrow(dat_TaxonName)
# batches <- split(dat_TaxonName, ceiling(seq_len(n) / batch_size))
# 
# results <- list()  # 所有 batch 的 list 容器
# for (j in seq_along(batches)) {
#   cat("Processing batch", j, "of", length(batches), "\n")
# 
#   batch <- batches[[j]]
# 
#   batch_result <-  suppressMessages(
#     batch %>% nMv2(multiName = TRUE) %>% content()
#   )
# 
#   results[[j]] <- batch_result
# 
#   rm(batch_result);gc()
#   if(j %% 10 == 0) Sys.sleep(1)
# }
# 
# # 最後將所有 batch 合併成一個結果
# dat_TaxonName_nMv2 <- bind_rows(results)
# 
# # 重新命名欄位及依input排序dat_TaxonName:simple_scientificName == dat_TaxonName_nMv2:search_term
# dat_TaxonName_nMv2 <- dat_TaxonName_nMv2 %>%
#   rename(simple_scientificName = search_term) %>%
#   arrange(simple_scientificName)
# 
# cat((Sys.time() - startT) %>% format())
# 
# # 存檔
# saveRDS(
#   object = dat_TaxonName_nMv2,
#   file = paste0("dat_TaxonName_nomenMatchv2_", today(tzone = "asia/taipei"), ".rds")
# )

## 重新讀入dat_TaxonName_nMv2 ----
dat_TaxonName_nMv2 <- readRDS(
  file = grep("^dat_TaxonName_nomenMatchv2_[0-9]{4}-[0-9]{2}-[0-9]{2}\\.rds", x = list.files(), value = TRUE) %>% latestPath()
)

## pick out match_type not "no match" in "Plantae" ----
Match_taicol <- dat_TaxonName_nMv2 %>%
  filter(match_type != "No match") %>%
  filter(kingdom == "Plantae") %>%
  select(simple_scientificName, simple_name, accepted_namecode, name_status, source)

# function taicolID2name() --- TaiCOL api ----
base_url_taicol_higherTaxa <- "https://api.taicol.tw/v2/higherTaxa"
taicolID2name <- function(taxonID_taicol){
  request <- RETRY(verb = "GET",
                   url = base_url_taicol_higherTaxa,
                   query = list(taxon_id = taxonID_taicol)
  ) %>%
    content(as = "text") %>%
    fromJSON() %>% .$data %>%
    rowwise() %>% mutate(rank = rank %>% casefold()) %>% ungroup() %>%
    suppressMessages()
  
  request <- request %>% filter(taxon_id == taxonID_taicol) %>%
    bind_cols(
      .,
      request %>%
        .[(which(request$taxon_id == taxonID_taicol) + 1) : nrow(request),] %>%
        select(rank, simple_name) %>%
        pivot_wider(., names_from = rank, values_from = simple_name)
    )
  
  request <- request %>%
    mutate(source = "taicol")
  
  return(request)
}

# for loop -- 取出除了"no match"以外的TaiCOL acceptedName ----
# Match_taicol_acceptedName <- tibble()
# for(i in 5842:nrow(Match_taicol)){
#   Match_taicol_acceptedName_i <- Match_taicol$accepted_namecode[i] %>%
#     taicolID2name() %>%
#     mutate(simple_scientificName = Match_taicol$simple_scientificName[i]) %>%
#     suppressMessages()
#   Match_taicol_acceptedName <- bind_rows(
#     Match_taicol_acceptedName_i,
#     Match_taicol_acceptedName
#   )
#   if(i %% 10 == 0){
#     cat(i, "/", nrow(Match_taicol), "\n")
#   }
#   if(i %% 100 == 0) Sys.sleep(1)
#   rm(Match_taicol_acceptedName_i);gc()
# }
# 
# # 存檔
# saveRDS(
#   object = Match_taicol_acceptedName,
#   file = paste0("Match_taicol_acceptedName_", today(tzone = "asia/taipei"), ".rds")
# )

# 重新讀入Match_taicol_acceptedName ----
Match_taicol_acceptedName <- readRDS(
  file = grep("^Match_taicol_acceptedName_.+\\.rds", x = list.files(), value = TRUE) %>% latestPath()
)

# 重新命名欄位及依input排序Match_taicol_acceptedName:simple_scientificName
Match_taicol_acceptedName <- Match_taicol_acceptedName %>%
  # select(-"NA") %>%
  mutate(simple_scientificName = as.character(simple_scientificName)) %>% 
  arrange(simple_scientificName)

## pick out match_type are "no match" in "Plantae" in TaiCOL,接續對應GBIF ----
noMatch <- dat_TaxonName_nMv2 %>% filter(match_type == "No match") %>% select(simple_scientificName)


# join ----
# 
dat_joinacceptedName <- dat %>%
  unite(col = simple_scientificName, Genus, SpecificEpithet, InfraspecificRank, InfraspecificEpithet, sep = " ", na.rm = TRUE) %>% 
  left_join(
  .,
  Match_taicol_acceptedName,
  by = "simple_scientificName",
  multiple = "first"
)