library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyheatmaps)
library(dygraphs)
library(xts)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
library(hrbrthemes)
library(ggthemes)
library(viridis)
## Loading required package: viridisLite
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(wordcloud2)
library(ggtext)
## Warning: package 'ggtext' was built under R version 4.4.3
library(glue)
library(httr)
##
## Attaching package: 'httr'
##
## The following object is masked from 'package:plotly':
##
## config
library(jsonlite)
##
## Attaching package: 'jsonlite'
##
## The following object is masked from 'package:purrr':
##
## flatten
windowsFonts(NotoSansTC = "NotoSansTC-Regular", NotoSansCJKtcBlack = "NotoSansCJKtc-Black")
# function -- 讀取最近期的資料
latestPath <- function(path){
latestOrder <- path %>% file.info() %>% .$ctime %>% which.max()
return(path[latestOrder])
}
# function -- ceiling by digits as `round(digits = ...)`
ceiling_digits <- function(x, digits = 10){ceiling((x + 1)/digits)*digits}
dat <- read_delim("data/specimens_dwc14.csv",
delim = ";",
escape_double = FALSE,
trim_ws = TRUE,
progress = TRUE)
## indexing specimens_dwc14.csv [------------------------------] 2.15GB/s, eta: 0sindexing specimens_dwc14.csv [==============================] 2.15GB/s, eta: 0s indexing specimens_dwc14.csv [============================] 142.80MB/s, eta: 0s
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 498731 Columns: 81
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (53): GlobalUniqueIdentifier, BasisOfRecord, InstitutionCode, Collectio...
## dbl (13): SpcmID, DupID, SeriID, DetID, CountryID, MinimumElevationInMeters...
## lgl (11): InformationWithheld, SpecificEpithetInChinese, NomenclaturalCode,...
## dttm (4): DateLastModified, EarliestDateCollected, LatestDateCollected, Dat...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# taiCOL taxa list ----
name_taicol <- read_csv(grep("^data/TaiCOL_taxon_.+\\.csv$", x = list.files(recursive = TRUE),value = TRUE) %>% latestPath())
## Rows: 95288 Columns: 42
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (30): taxon_id, simple_name, name_author, formatted_name, synonyms, for...
## dbl (1): name_id
## lgl (9): is_hybrid, is_endemic, is_fossil, is_terrestrial, is_freshwater, ...
## date (2): created_at, updated_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
資料長這樣
dat %>% glimpse()
## Rows: 498,731
## Columns: 81
## $ SpcmID <dbl> 1, 2, 2, 2, 3, 3, 4, 5, 6, 7, 8, 9, 9, 10…
## $ DupID <dbl> 1, 1, 2, 3, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1,…
## $ SeriID <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ DetID <dbl> 1, 334021, 334020, 334019, 4, 309515, 6, …
## $ GlobalUniqueIdentifier <chr> "TAIF:PLANT:1:1:1:21688", "TAIF:PLANT:2:1…
## $ DateLastModified <dttm> 2017-09-05 11:04:04, 2009-12-09 14:39:23…
## $ BasisOfRecord <chr> "PreservedSpecimen", "PreservedSpecimen",…
## $ InstitutionCode <chr> "TAIF", "TAIF", "TAIF", "TAIF", "TAIF", "…
## $ CollectionCode <chr> "PLANT", "PLANT", "PLANT", "PLANT", "PLAN…
## $ CatalogNumber <chr> "21688", "21181", "330969", "330970", "21…
## $ InformationWithheld <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Remarks <chr> "6,000 ft.", "Hassen-zan 7000ft.; yellow …
## $ RemarksInChinese <chr> "6,000 ft.", "Hassen-zan 7000ft.; yellow …
## $ SPID <chr> "Eupa3", "Impa4", "Impa4", "Impa4", "Also…
## $ ScientificName <chr> "Eupatorium formosanum Hayata", "Impatien…
## $ SpeciesNameInChinese <chr> NA, "紫花鳳仙花", "紫花鳳仙花", "紫花鳳仙…
## $ HigherTaxon <chr> "Plantae, Magnoliophyta, Magnoliopsida, A…
## $ HigherTaxonInChinese <chr> "植物界, 木蘭植物門, 木蘭綱, 菊目, 菊科, …
## $ Kingdom <chr> "Plantae", "Plantae", "Plantae", "Plantae…
## $ KingdomInChinese <chr> "植物界", "植物界", "植物界", "植物界", "…
## $ Phylum <chr> "Magnoliophyta", "Magnoliophyta", "Magnol…
## $ PhylumInChinese <chr> "木蘭植物門", "木蘭植物門", "木蘭植物門",…
## $ Class <chr> "Magnoliopsida", "Magnoliopsida", "Magnol…
## $ ClassInChinese <chr> "木蘭綱", "木蘭綱", "木蘭綱", "木蘭綱", "…
## $ Order <chr> "Asterales", "Geraniales", "Geraniales", …
## $ OrderInChinese <chr> "菊目", "牻牛兒苗目", "牻牛兒苗目", "牻牛…
## $ Family <chr> "Asteraceae", "Balsaminaceae", "Balsamina…
## $ FamilyInChinese <chr> "菊科", "鳳仙花科", "鳳仙花科", "鳳仙花科…
## $ Genus <chr> "Eupatorium", "Impatiens", "Impatiens", "…
## $ GenusInChinese <chr> "澤蘭屬", "鳳仙花屬", "鳳仙花屬", "鳳仙花…
## $ SpecificEpithet <chr> "formosanum", "uniflora", "uniflora", "un…
## $ SpecificEpithetInChinese <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ InfraspecificRank <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ InfraspecificEpithet <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ AuthorYearOfScientificName <chr> "Hayata", "Hayata", "Hayata", "Hayata", "…
## $ NomenclaturalCode <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ IdentificationQualifier <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ HigherGeography <chr> "Taiwan (台灣省), Taiwan", "Taiwan (台灣…
## $ CountryID <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Country <chr> "Taiwan", "Taiwan", "Taiwan", "Taiwan", "…
## $ CountryInChinese <chr> "台灣", "台灣", "台灣", "台灣", "台灣", "…
## $ StateProvince <chr> "Taiwan (台灣省)", "Taiwan (台灣省)", "Ta…
## $ County <chr> NA, NA, NA, NA, NA, NA, NA, "Miaoli Count…
## $ CountyInChinese <chr> NA, NA, NA, NA, NA, NA, NA, "苗栗縣", "臺…
## $ SubCountyCantonInChinese <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Locality <chr> "Mt. Arisan", "Hassen-zan", "Hassen-zan",…
## $ LocalityInChinese <chr> "Mt. Arisan", "Hassen-zan", "Hassen-zan",…
## $ AlternativeLocalityInChinese <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ MinimumElevationInMeters <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 3705, 370…
## $ MaximumElevationInMeters <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 3705, 370…
## $ ValidDistributionFlag <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ EarliestDateCollected <dttm> 1927-10-08, 1927-08-18, 1927-08-18, 1927…
## $ LatestDateCollected <dttm> 1927-10-08, 1927-08-18, 1927-08-18, 1927…
## $ DayOfYear <dbl> 281, 230, 230, 230, 240, 240, 230, 184, 2…
## $ Collector <chr> "Ryozo Kanehira", "Ryozo Kanehira", "Ryoz…
## $ CollectorInChinese <chr> "金平 亮三", "金平 亮三", "金平 亮三", "…
## $ OtherCollectorInChinese <chr> "Syunichi Sasaki (佐佐木 舜一)", NA, NA, …
## $ ImageURL <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ RelatedInformation <chr> NA, NA, NA, NA, NA, NA, "Phenology: Spori…
## $ RelatedInformationInChinese <chr> NA, NA, NA, NA, NA, NA, "物候: Sporing", …
## $ CatalogNumberNumeric <dbl> 21688, 21181, 330969, 330970, 21560, 3077…
## $ IdentifiedBy <chr> NA, NA, NA, NA, NA, NA, NA, "Chia-Ying Hu…
## $ IdentifiedByInChinese <chr> NA, NA, NA, NA, NA, NA, NA, "胡嘉穎", "林…
## $ DateIdentified <dttm> NA, NA, NA, NA, NA, NA, 1927-08-18, 2001…
## $ CurrentDetermination <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ CollectorNumber <chr> "s. n.", "s. n.", "s. n.", "s. n.", "s. n…
## $ VerbatimCollectingDate <chr> "1927-Oct-08", "1927-Aug-18", "1927-Aug-1…
## $ VerbatimElevation <chr> NA, NA, NA, NA, NA, NA, NA, NA, "3705 M",…
## $ Preparations <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ PreparationsInChinese <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ TypeStatus <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ TypeStatusInChinese <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Disposition <chr> "in collection", "in collection", "in col…
## $ DecimalLatitude <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 24.31667,…
## $ DecimalLongitude <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 121.4167,…
## $ GeodeticDatum <chr> "TWD67", "TWD67", "TWD67", "TWD67", "TWD6…
## $ LocValid <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ VerbatimCoordinates <chr> NA, NA, NA, NA, NA, NA, NA, NA, "N 24/19/…
## $ VerbatimLatitude <chr> NA, NA, NA, NA, NA, NA, NA, NA, "24/19/",…
## $ VerbatimLongitude <chr> NA, NA, NA, NA, NA, NA, NA, NA, "121/25/"…
## $ VerbatimCoordinateSystem <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
統計至202502資料:
n_specimens <- dat %>%
distinct(CatalogNumber) %>%
lengths()
cat(n_specimens, "份\n")
## 492577 份
n_collector <- dat %>%
distinct(Collector) %>%
lengths()
cat(n_collector, "人\n")
## 4058 人
cat(round(n_specimens/n_collector, digits = 0), "份/人\n")
## 121 份/人
資料繪圖前處理
dat_timeS <- dat %>% summarise(count = n(), .by = c(EarliestDateCollected, Collector)) %>% arrange(desc(count))
# Since my time is currently a factor, I have to convert it to a date-time format!
dat_timeS$EarliestDateCollected <- ymd(dat_timeS$EarliestDateCollected)
# Then you can create the xts necessary to use dygraph
dat_timeS_dropNA <- dat_timeS %>%
mutate(EarliestDateCollected = round_date(EarliestDateCollected, unit = "year")) %>%
drop_na(EarliestDateCollected) %>%
summarise(count = sum(count), .by = c(EarliestDateCollected, Collector)) %>%
arrange(EarliestDateCollected, desc(count))
dat_timeS_dropNA_plot <- dat_timeS_dropNA %>%
summarise(yearCount = sum(count), .by = EarliestDateCollected) %>%
left_join(dat_timeS_dropNA, .) %>%
# If the collector collections by year less than 30 percent of total collections, the counts will add to others group.
mutate(countRatio = count/yearCount, countIfOther = countRatio < 0.30) %>%
# If the number of collectors by year more than 10,
mutate(indexCount =index(count), .by = EarliestDateCollected) %>%
# the counts will add to others group.
mutate(collectorIfother = indexCount > 10) %>%
mutate(group_others = ifelse(countIfOther*collectorIfother, "others", Collector)) %>%
summarise(groupCount = sum(count), .by = c(EarliestDateCollected, group_others))
## Joining with `by = join_by(EarliestDateCollected)`
dat_timeS_dropNA_xts <- dat_timeS_dropNA_plot %>%
select(EarliestDateCollected, groupCount, group_others) %>%
pivot_wider(names_from = group_others, values_from = groupCount, values_fn = sum, values_fill = 0) %>%
as.xts()
# Finally the plot
timeSeq <- "/"
collector_n <- (1:dim(dat_timeS_dropNA_xts)[2])
p <- dygraph(data = dat_timeS_dropNA_xts[timeSeq, collector_n]) %>%
dyRangeSelector() %>%
dyStackedBarGroup(name = dimnames(dat_timeS_dropNA_xts)[[2]][collector_n]) %>%
# dyCrosshair(direction = "both") %>%
dyHighlight(highlightCircleSize = 5,
highlightSeriesBackgroundAlpha = 0.8,
highlightSeriesOpts = list(pointShape = "square"),
hideOnMouseOut = FALSE) %>%
# dyRoller(rollPeriod = 1) %>%
dyLegend(show = "onmouseover", labelsSeparateLines = TRUE, showZeroValues = FALSE) %>%
dyOptions(labelsUTC = FALSE,
fillGraph = TRUE,
fillAlpha = 0.5,
drawGrid = FALSE,
includeZero = FALSE,
axisLineWidth = 1.5,
colors = viridis::viridis(length(collector_n), direction = -1)) %>%
dyAxis("y"
# valueRange = c(0, dat_timeS_dropNA_xts[timeSeq, collector_n] %>% rowSums(na.rm = TRUE) %>% max() %>% ceiling_digits(digits = 100))
)
p
date_majorEvent <- c(
"1904-01-01",
"1924-01-01",
"1945-01-01",
"2000-01-01",
"2024-01-01"
) %>% as.Date()
p <- dat_timeS_dropNA_plot %>%
arrange(EarliestDateCollected, groupCount, group_others) %>%
# filter(EarliestDateCollected <= as.Date("1924-01-01")) %>%
# filter(group_others %in% c("Cheng-Te Hsu", "(n/a)", "(unknown)", "Pi-Fong Lu", "others")) %>%
ggplot(mapping = aes(x = EarliestDateCollected,
y = groupCount,
fill = fct_reorder2(group_others, EarliestDateCollected, groupCount),
text = fct_reorder2(group_others, EarliestDateCollected, groupCount))) +
geom_vline(xintercept = as.Date("1904-01-01"), color = "grey40", lwd = 1.5, lty = 5) +
geom_vline(xintercept = as.Date("1924-01-01"), color = "grey40", lwd = 1.5, lty = 5) +
geom_vline(xintercept = as.Date("1945-01-01"), color = "grey40", lwd = 1.5, lty = 5) +
# geom_vline(xintercept = as.Date("1972-01-01"), color = "grey40", lwd = 1.5, lty = 5) +
# geom_vline(xintercept = as.Date("1979-01-01"), color = "grey40", lwd = 1.5, lty = 5) +
# geom_vline(xintercept = as.Date("1993-01-01"), color = "grey40", lwd = 1.5, lty = 5) +
# geom_vline(xintercept = as.Date("2003-01-01"), color = "grey40", lwd = 1.5, lty = 5) +
geom_vline(xintercept = as.Date("2000-01-01"), color = "grey40", lwd = 1.5, lty = 5) +
geom_vline(xintercept = as.Date("2024-01-01"), color = "grey40", lwd = 1.5, lty = 5) +
geom_col() +
scale_fill_viridis_d(direction = -1) +
coord_cartesian(
# ylim =
# c(0,
# dat_timeS_dropNA_plot %>%
# filter(EarliestDateCollected <= as.Date("1924-01-01")) %>% summarise(groupCountSum = sum(groupCount), .by = EarliestDateCollected) %>% pull(groupCountSum) %>% max() %>% ceiling_digits(digits = 100)
# ),
expand = FALSE
) +
# theme_ipsum() +
labs(x = "yr", y = "#") +
scale_x_date(breaks = c("1904-01-01", "1924-01-01", "1945-01-01", "2000-01-01", "2024-01-01") %>% as.Date(), minor_breaks = seq(ymd(18500101), ymd(20250101), by = "10 years"), date_labels = "%Y", limits = c("1848-01-01", "2025-01-01") %>% as.Date()) +
scale_y_continuous(breaks = c(0, 2000, 10000, 20000, 30000), limits = c(0, 31000)) +
theme_gdocs(
# base_size = 28,
base_family = "NotoSansCJKtcBlack") +
scale_color_gdocs() +
theme(
axis.ticks.length.x = unit(0.8, units = "cm"),
axis.minor.ticks.length.x = unit(0.5, units = "cm"),
axis.ticks = element_line(linewidth = 2, color = "black"),
axis.minor.ticks.x.bottom = element_line(linewidth = 1, color = "black"),
axis.text = element_markdown(color = "black"),
axis.title.y = element_text(angle = 0, color = "black", hjust = 1),
axis.title.x = element_text(color = "black", hjust = 1),
plot.background = element_rect(fill = "white", color = "white"),
panel.background = element_rect(fill = "white"),
plot.margin = unit(c(1,1,1,1), units = "cm"),
legend.position="none"
)
# guides(fill = "none")
# p
# ggsave(filename = "timeGap.png", device = "png", width = 80, height = 25,units = "cm", dpi = 600)
p <- ggplotly(p, tooltip = c("x","y","text"))
p
source("https://raw.githubusercontent.com/iascchen/VisHealth/master/R/calendarHeat.R")
# http://www.columbia.edu/~sg3637/blog/Time_Series_Heatmaps.html
dat_calend <- dat %>%
select(EarliestDateCollected) %>%
mutate(Year = EarliestDateCollected %>% year(),
Month = EarliestDateCollected %>% month()) %>%
group_by(Year, Month) %>%
summarise(count = n()) %>%
drop_na() %>%
arrange(Year, Month)
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
# mutate(EarliestDateCollected = as.Date(EarliestDateCollected)) %>%
# drop_na() %>%
# arrange(EarliestDateCollected)
# filter(lubridate::year(EarliestDateCollected) == lubridate::year(lubridate::ymd(20000101)))
# calendarHeat(dates = dat_calend$EarliestDateCollected, values = dat_calend$count)
p_calend <- dat_calend %>%
# filter(Year <1945) %>%
# filter(Year >= 1945 & Year < 2000) %>%
# filter(Year >= 2000) %>%
ggplot(data = .) +
geom_tile(mapping = aes(x = Year, y = Month, fill = count), color = "black") +
scale_fill_gradient2(low = "white", high = "#004616") +
scale_x_continuous(breaks = c(1904, 1924, 1945, 2000, 2024),
minor_breaks = seq(1850, 1945, 10)
# labels = c(1904, 1924, 1945, 2000, 2024)
) +
scale_y_continuous(breaks = c(1, 3, 6, 9, 12), labels = c("Jan", "Mar", "Jun", "Sep", "Dec")) +
coord_equal() +
labs(x = "yr", y = "", fill = "#") +
theme_void() +
# theme(axis.text.y = element_text(size = 10))
theme(
axis.ticks.length.x = unit(0.1, units = "cm"),
axis.minor.ticks.length.x = unit(0.5, units = "cm"),
axis.ticks = element_line(linewidth = 1.5, color = "black"),
axis.minor.ticks.x.bottom = element_line(linewidth = 1, color = "black"),
axis.text = element_markdown(
# size = unit(24, units = "cm"),
color = "black"),
axis.title.y = element_text(angle = 0, color = "black", hjust = 1),
axis.title.x = element_text(color = "black", hjust = 1),
plot.background = element_rect(fill = "#ffffff00", color = "#ffffff00"),
panel.background = element_rect(fill = "#FFD700")
# plot.margin = unit(c(1,1,1,1), units = "cm")
# legend.position="none"
)
p_calend
# ggsave(filename = "timeGap_calend.png", device = "png", width = 32, height = 10,units = "cm", dpi = 600)
# ggsave(filename = "timeGap_calend_2.png", device = "png", width = 12, height = 10,units = "cm", dpi = 600)
# ggsave(filename = "timeGap_calend_3.png", device = "png", width = 11, height = 10,units = "cm", dpi = 600)
library(wordcloud2)
library(webshot)
# webshot::install_phantomjs()
wcloud_plot <- dat_timeS_dropNA %>%
# filter(EarliestDateCollected < ymd(19040101)) %>%
# filter(between(EarliestDateCollected, ymd(19040101),ymd(19230101))) %>%
# filter(between(EarliestDateCollected, ymd(19240101),ymd(19440101))) %>%
# filter(between(EarliestDateCollected, ymd(19450101),ymd(19990101))) %>%
# filter(between(EarliestDateCollected, ymd(20000101),ymd(20250101))) %>%
select(word = Collector, freq = count) %>%
group_by(word) %>%
summarise(freq = sum(freq)) %>%
arrange(desc(freq)) %>%
wordcloud2(data = .,
# color = "forestgreen",
# color = "darkorange",
fontWeight = 500, minRotation = 0.1, maxRotation = 0.9, shape = "circle")
# htmlwidgets::saveWidget(widget = wcloud_plot, file = "test_1904.html", selfcontained = FALSE)
# htmlwidgets::saveWidget(widget = wcloud_plot, file = "test_1924.html", selfcontained = FALSE)
# htmlwidgets::saveWidget(widget = wcloud_plot, file = "test_1945.html", selfcontained = FALSE)
# htmlwidgets::saveWidget(widget = wcloud_plot, file = "test_2000.html", selfcontained = FALSE)
# htmlwidgets::saveWidget(widget = wcloud_plot, file = "test_2024.html", selfcontained = FALSE)
wcloud_plot
rank_n <- 1:30
p_rankGenus <- dat %>%
group_by(Genus) %>%
summarise(count = n()) %>%
left_join(.,
by = "Genus",
multiple = "first",
dat %>% dplyr::distinct(Genus, GenusInChinese, ClassInChinese)
) %>%
arrange(desc(count)) %>%
mutate(Genus = glue::glue("{GenusInChinese}/<i>{Genus}</i>", .na = NULL)) %>%
mutate(Genus = fct_reorder(Genus, count)) %>%
mutate(ClassInChinese = fct(ClassInChinese, levels = c("眼蟲綱","石松綱","木賊綱","松葉蕨綱","真蕨綱","蘇鐵綱","銀杏綱","買麻藤綱","松綱","木蘭綱","百合綱"))) %>%
drop_na(Genus) %>%
slice(rank_n) %>%
ggplot(mapping = aes(x = Genus, y = count, fill = ClassInChinese)) +
geom_col() +
scale_fill_viridis(direction = 1, discrete = TRUE, option = "G") +
labs(y = "#") +
coord_flip(expand = FALSE, clip = "on") +
theme_solarized(base_family = "NotoSansCJKtcBlack") +
theme(
axis.text.y = element_markdown(
# size = 16,
colour = "black"),
axis.title.y = element_blank(),
axis.title.x = element_text(colour = "black", hjust = 1),
legend.text = element_text(colour = "black"),
legend.title = element_blank()
)
# p_rankGenus %>% ggsave(filename = "rankGenus.png", device = "png",units = "cm", width = 30, height = 18, dpi = 600)
p_rankGenus
cat(
dat %>% group_by(Genus) %>% summarise(count = n()) %>% arrange(desc(count)) %>% mutate(Genus = fct_reorder(Genus, count)) %>% filter(Genus %>% is.na()) %>% pull(count),
"份(未鑑定)\n"
)
## 8577 份(未鑑定)
rankSci_plot <- dat %>%
group_by(ScientificName) %>%
summarise(count = n()) %>%
left_join(.,
by = "ScientificName",
multiple = "first",
dat %>% dplyr::distinct(ScientificName, ClassInChinese, SpeciesNameInChinese, GenusInChinese, Genus, SpecificEpithet, InfraspecificRank, InfraspecificEpithet, AuthorYearOfScientificName)
) %>%
arrange(desc(count)) %>%
# slice_head(n = 10) %>%
rowwise(SpeciesNameInChinese, GenusInChinese) %>%
mutate(ScientificName_plot = replace_na(SpeciesNameInChinese, GenusInChinese)) %>%
ungroup() %>%
mutate(ScientificName_plot2 = glue("<i>{Genus} {SpecificEpithet}</i>{InfraspecificRank}<i> {InfraspecificEpithet}</i>{AuthorYearOfScientificName}",.sep = " ", .na = "")) %>%
mutate(ScientificName_plot = glue("{ScientificName_plot}/{ScientificName_plot2}")) %>%
mutate(ScientificName_plot = fct_reorder(ScientificName_plot, count)) %>%
mutate(ClassInChinese = fct(ClassInChinese, levels = c("眼蟲綱","石松綱","木賊綱","松葉蕨綱","真蕨綱","蘇鐵綱","銀杏綱","買麻藤綱","松綱","木蘭綱","百合綱"))) %>%
drop_na(ScientificName) %>%
slice(rank_n) %>%
ggplot(mapping = aes(x = ScientificName_plot, y = count, fill = ClassInChinese)) +
geom_col() +
scale_fill_viridis(direction = 1, discrete = TRUE, option = "G") +
coord_flip(expand = FALSE, clip = "on") +
labs(y = "#") +
theme_solarized(
# base_size = 12,
base_family = "NotoSansCJKtcBlack") +
theme(
axis.text.y = element_markdown(
# size = 16,
colour = "black"),
axis.title.y = element_blank(),
axis.title.x = element_text(colour = "black", hjust = 1),
legend.text = element_text(colour = "black"),
legend.title = element_blank()
)
# rankSci_plot %>% ggsave(filename = "rankSpecies.png", device = "png",units = "cm", width = 30, height = 18, dpi = 600)
rankSci_plot
cat(
dat %>% group_by(ScientificName) %>% summarise(count = n()) %>% arrange(desc(count)) %>% mutate(ScientificName = fct_reorder(ScientificName, count)) %>% filter(ScientificName %>% is.na()) %>% pull(count),
"份(未鑑定)\n"
)
## 8577 份(未鑑定)
website: https://chestnut123tw.github.io/theme-year_datagap
dat_pie <- tibble(
gold = 0,
silver = dat %>% select(ScientificName, DecimalLatitude, DecimalLongitude, EarliestDateCollected) %>% drop_na() %>% nrow(),
bronze = dat %>% nrow() - silver
)
dat_pie <- dat_pie %>% pivot_longer(cols = 1:3, names_to = "quality", values_to = "count") %>%
mutate(quality = factor(quality, levels = c("gold", "silver","bronze"))) %>%
mutate(colors = c('#FFF019', '#F1F1F1', '#AC6B2B'))
ggplot(dat_pie) +
geom_bar(mapping = aes(x = "", y = count, fill = quality), stat="identity", width = 1, color = "black") +
coord_polar("y", start = 0) +
scale_fill_manual(values = dat_pie$colors) +
theme_void()
載入套件treemap
library(treemap)
# library(waffle) not yet...
# source("https://gist.githubusercontent.com/timelyportfolio/91e0b8c2b8cd4202baa2/raw/37663bf3de2168157e7f0c26a14422007ab843e8/as_rcdimple.R")
資料前處理…
# dat dataset 1 -- 直接使用組裝欄位 ----
dat_name <- dat %>%
unite(col = simple_scientificName, Genus, SpecificEpithet, InfraspecificRank, InfraspecificEpithet, sep = " ", na.rm = TRUE) %>%
select(simple_scientificName) %>% mutate(count = 1)
# # dat dataset 2 -- 以比對後taxa list ----
# dat_name <- dat_joinacceptedName %>%
# select(simple_scientificName) %>% mutate(count = 1)
# taxa list dataset 1 -- pick out native, Tracheophyta, Plantae taxa from TaiCOL ----
name_trach <- name_taicol %>%
filter(kingdom == "Plantae") %>%
filter(rank %in% c("Species","Subspecies","Variety","Form","Hybrid Formula")) %>%
filter(phylum == "Tracheophyta") %>%
filter(alien_type == "native") %>%
distinct()
# taxa list dataset 2 -- pick out Tracheophyta, Plantae taxa from TaiCOL ----
# name_trach <- name_taicol %>%
# filter(kingdom == "Plantae") %>%
# filter(rank %in% c("Species","Subspecies","Variety","Form","Hybrid Formula")) %>%
# filter(phylum == "Tracheophyta") %>% distinct()
# modifying data structure for plotting ----
name_trach_plot <- full_join(keep = TRUE,
x = dat_name,
y = name_trach,
by = join_by(simple_scientificName == simple_name)
) %>%
replace_na(list(count = 0)) %>%
arrange(count) %>%
group_by(simple_name, taxon_id, class_c, family_c) %>% summarise(count = sum(count)) %>%
arrange(class_c, family_c, simple_name) %>%
select(taxon_id, simple_name, count, class_c, family_c) %>%
drop_na() %>%
mutate(n_tile = 1) %>%
ungroup()
## `summarise()` has grouped output by 'simple_name', 'taxon_id', 'class_c'. You
## can override using the `.groups` argument.
cols_8 <- c("#F9F9F9","#CCE1D0","#9FC9A7","#6FB17E","#369953","#007F2E","#006222","#004616")
分類群空缺-裸子植物
### 裸子植物 ----
name_trach_plot_gymno <- name_trach_plot %>%
filter(class_c %in% c("松綱", "蘇鐵綱")) %>%
# filter(class_c %in% c("木蘭植物綱")) %>%
# filter(class_c %in% c("水龍骨綱", "石松綱")) %>%
mutate(countPlot = if_else(count == 0, 0, NA)) %>%
mutate(countPlot = if_else(count == 1, 1, countPlot)) %>%
mutate(countPlot = if_else(between(count, 2, 10), 10, countPlot)) %>%
mutate(countPlot = if_else(between(count, 11, 50), 50, countPlot)) %>%
mutate(countPlot = if_else(between(count, 51, 100), 100, countPlot)) %>%
mutate(countPlot = if_else(between(count, 101, 200), 200, countPlot)) %>%
mutate(countPlot = if_else(between(count, 201, 400), 400, countPlot)) %>%
mutate(countPlot = if_else(between(count, 401, 800), 800, countPlot)) %>%
mutate(simple_name = simple_name %>% gsub(x=.," ", "\n"))
# png(filename = "nameTreemap.png", width = 40, height = 30,units = "cm", res = 600)
name_trach_plot_gymno %>%
mutate(countPlot = countPlot %>% factor()) %>%
treemap(
index = c("family_c", "simple_name"),
vSize = "n_tile",
vColor = "countPlot",
type = "categorical",
align.labels = list(
c("right", "bottom"),
c("left", "top")
),
fontfamily.labels = "NotoSansCJKtcBlack",
palette = cols_8,
bg.labels = c("#ffffffaa"),
fontsize.labels = c(36, 24),
lowerbound.cex.labels = 0,
border.lwds = c(5, 2),
# border.col = ,
title = "",
title.legend = "#",
reverse.legend = TRUE,
overlap.labels = 1,
inflate.labels = FALSE
)
# dev.off()
分類群空缺-被子植物
### 被子植物--中位數前30少(不含蘭科) ----
name_trach_plot_angio <- name_trach_plot %>%
# filter(class_c %in% c("松綱", "蘇鐵綱")) %>%
filter(class_c %in% c("木蘭植物綱")) %>%
# filter(class_c %in% c("水龍骨綱", "石松綱")) %>%
mutate(countPlot = if_else(count == 0, 0, NA)) %>%
mutate(countPlot = if_else(count == 1, 1, countPlot)) %>%
mutate(countPlot = if_else(between(count, 2, 10), 10, countPlot)) %>%
mutate(countPlot = if_else(between(count, 11, 50), 50, countPlot)) %>%
mutate(countPlot = if_else(between(count, 51, 100), 100, countPlot)) %>%
mutate(countPlot = if_else(between(count, 101, 200), 200, countPlot)) %>%
mutate(countPlot = if_else(between(count, 201, 400), 400, countPlot)) %>%
mutate(countPlot = if_else(between(count, 401, 800), 800, countPlot)) %>%
mutate(simple_name = simple_name %>% gsub(x=.," ", "\n"))
# 被子植物科層級採集量中位數前30少
am_angioTop30 <- name_trach_plot_angio %>% group_by(family_c) %>% summarise(am = median(count)) %>% arrange(am) %>% slice_head(n = 30) %>% pull(family_c)
# png(filename = "nameTreemap_angio.png", width = 40, height = 30,units = "cm", res = 600)
name_trach_plot_angio %>%
mutate(countPlot = countPlot %>% factor()) %>%
filter(family_c %in% am_angioTop30) %>%
filter(!family_c %in% "蘭科") %>%
treemap(
index = c("family_c", "simple_name"),
vSize = "n_tile",
vColor = "countPlot",
type = "categorical",
align.labels = list(
c("right", "bottom"),
c("left", "top")
),
fontfamily.labels = "NotoSansCJKtcBlack",
palette = cols_8,
bg.labels = c("#ffffffaa"),
fontsize.labels = c(36, 20),
lowerbound.cex.labels = 0,
border.lwds = c(5, 2),
# border.col = ,
title = "",
title.legend = "#",
reverse.legend = TRUE,
overlap.labels = 1,
inflate.labels = FALSE
)
# dev.off()
分類群空缺-蕨類植物
### 蕨類植物 ----
name_trach_plot_fern <- name_trach_plot %>%
# filter(class_c %in% c("松綱", "蘇鐵綱")) %>%
# filter(class_c %in% c("木蘭植物綱")) %>%
filter(class_c %in% c("水龍骨綱", "石松綱")) %>%
mutate(countPlot = if_else(count == 0, 0, NA)) %>%
mutate(countPlot = if_else(count == 1, 1, countPlot)) %>%
mutate(countPlot = if_else(between(count, 2, 10), 10, countPlot)) %>%
mutate(countPlot = if_else(between(count, 11, 50), 50, countPlot)) %>%
mutate(countPlot = if_else(between(count, 51, 100), 100, countPlot)) %>%
mutate(countPlot = if_else(between(count, 101, 200), 200, countPlot)) %>%
mutate(countPlot = if_else(between(count, 201, 400), 400, countPlot)) %>%
mutate(countPlot = if_else(between(count, 401, 800), 800, countPlot)) %>%
mutate(simple_name = simple_name %>% gsub(x=.," ", "\n"))
# 蕨類植物科層級採集量中位數前30少
am_fernTop30 <- name_trach_plot_fern %>% group_by(family_c) %>% summarise(am = median(count)) %>% arrange(am) %>% slice_head(n = 30) %>% pull(family_c)
# png(filename = "nameTreemap_fern.png", width = 40, height = 30,units = "cm", res = 600)
name_trach_plot_fern %>%
mutate(countPlot = countPlot %>% factor()) %>%
filter(family_c %in% am_fernTop30) %>%
treemap(
index = c("family_c", "simple_name"),
vSize = "n_tile",
vColor = "countPlot",
type = "categorical",
align.labels = list(
c("right", "bottom"),
c("left", "top")
),
fontfamily.labels = "NotoSansCJKtcBlack",
palette = cols_8,
bg.labels = c("#ffffffaa"),
fontsize.labels = c(36, 20),
lowerbound.cex.labels = 0,
border.lwds = c(5, 2),
# border.col = ,
title = "",
title.legend = "#",
reverse.legend = TRUE,
overlap.labels = 1,
inflate.labels = FALSE
)
# dev.off()
cat("原生維管束植物:", name_trach_plot %>% nrow(), "個學名\n")
## 原生維管束植物: 5194 個學名
cat("館藏原生維管束植物:", name_trach_plot %>% filter(count != 0) %>% nrow(), "個學名",
(name_trach_plot %>% filter(count != 0) %>% nrow() / name_trach_plot %>% nrow()) %>% scales::label_percent()(.),
"\n")
## 館藏原生維管束植物: 4527 個學名 87%
cat("原生維管束植物(裸子植物):", name_trach_plot %>% filter(class_c %in% c("松綱", "蘇鐵綱")) %>% nrow(), "個學名\n")
## 原生維管束植物(裸子植物): 34 個學名
cat("館藏原生維管束植物(裸子植物):", name_trach_plot %>% filter(class_c %in% c("松綱", "蘇鐵綱")) %>% filter(count != 0) %>% nrow(), "個學名",
(name_trach_plot %>% filter(class_c %in% c("松綱", "蘇鐵綱")) %>% filter(count != 0) %>% nrow() / name_trach_plot %>% filter(class_c %in% c("松綱", "蘇鐵綱")) %>% nrow()) %>% scales::label_percent()(.),
"\n")
## 館藏原生維管束植物(裸子植物): 31 個學名 91%
cat("原生維管束植物(被子植物):", name_trach_plot %>% filter(class_c %in% c("木蘭植物綱")) %>% nrow(), "個學名\n")
## 原生維管束植物(被子植物): 4217 個學名
cat("館藏原生維管束植物(被子植物):", name_trach_plot %>% filter(class_c %in% c("木蘭植物綱")) %>% filter(count != 0) %>% nrow(), "個學名",
(name_trach_plot %>% filter(class_c %in% c("木蘭植物綱")) %>% filter(count != 0) %>% nrow() / name_trach_plot %>% filter(class_c %in% c("木蘭植物綱")) %>% nrow()) %>% scales::label_percent()(.),
"\n")
## 館藏原生維管束植物(被子植物): 3640 個學名 86%
cat("原生維管束植物(蕨類植物):", name_trach_plot %>% filter(class_c %in% c("水龍骨綱", "石松綱")) %>% nrow(), "個學名\n")
## 原生維管束植物(蕨類植物): 943 個學名
cat("館藏原生維管束植物(蕨類植物):", name_trach_plot %>% filter(class_c %in% c("水龍骨綱", "石松綱")) %>% filter(count != 0) %>% nrow(), "個學名",
(name_trach_plot %>% filter(class_c %in% c("水龍骨綱", "石松綱")) %>% filter(count != 0) %>% nrow() / name_trach_plot %>% filter(class_c %in% c("水龍骨綱", "石松綱")) %>% nrow()) %>% scales::label_percent()(.),
"\n")
## 館藏原生維管束植物(蕨類植物): 856 個學名 91%
# function nMv2() -- nomemmatch v2 api ----
nMv2 <- function(name, source = "taicol", best = "yes", format = "csv", multiName = FALSE){
if(multiName %>% isTRUE()){
RETRY(verb = "GET",
url = "https://match.taibif.tw/v2/api.php",
query = list(names = name %>% pull() %>% paste0(collapse = "|"), source = source, best = best, format = format)
)
} else{
RETRY(verb = "GET",
url = "https://match.taibif.tw/v2/api.php",
query = list(names = name, source = source, best = best, format = format)
)
}
}
# simple_scientificName (組裝欄位(Genus, SpecificEpithet, InfraspecificRank, InfraspecificEpithet)產生) to match -- solution 1 ----
dat_TaxonName <- dat %>%
unite(col = simple_scientificName, Genus, SpecificEpithet, InfraspecificRank, InfraspecificEpithet, sep = " ", na.rm = TRUE) %>%
select(simple_scientificName) %>%
distinct() %>%
filter(!simple_scientificName == "")
# scientificName to match -- solution 2 ----
# dat_TaxonName2 <- dat %>%
# select(ScientificName) %>%
# distinct() %>%
# filter(!scientificName == "")
## query for requests to nomenMatch v2 api(TaiCOL) ----
# startT <- Sys.time()
#
# dat_TaxonName_nMv2 <- tibble()
# batch_size <- 100
# n <- nrow(dat_TaxonName)
# batches <- split(dat_TaxonName, ceiling(seq_len(n) / batch_size))
#
# results <- list() # 所有 batch 的 list 容器
# for (j in seq_along(batches)) {
# cat("Processing batch", j, "of", length(batches), "\n")
#
# batch <- batches[[j]]
#
# batch_result <- suppressMessages(
# batch %>% nMv2(multiName = TRUE) %>% content()
# )
#
# results[[j]] <- batch_result
#
# rm(batch_result);gc()
# if(j %% 10 == 0) Sys.sleep(1)
# }
#
# # 最後將所有 batch 合併成一個結果
# dat_TaxonName_nMv2 <- bind_rows(results)
#
# # 重新命名欄位及依input排序dat_TaxonName:simple_scientificName == dat_TaxonName_nMv2:search_term
# dat_TaxonName_nMv2 <- dat_TaxonName_nMv2 %>%
# rename(simple_scientificName = search_term) %>%
# arrange(simple_scientificName)
#
# cat((Sys.time() - startT) %>% format())
#
# # 存檔
# saveRDS(
# object = dat_TaxonName_nMv2,
# file = paste0("dat_TaxonName_nomenMatchv2_", today(tzone = "asia/taipei"), ".rds")
# )
## 重新讀入dat_TaxonName_nMv2 ----
dat_TaxonName_nMv2 <- readRDS(
file = grep("^dat_TaxonName_nomenMatchv2_[0-9]{4}-[0-9]{2}-[0-9]{2}\\.rds", x = list.files(), value = TRUE) %>% latestPath()
)
## pick out match_type not "no match" in "Plantae" ----
Match_taicol <- dat_TaxonName_nMv2 %>%
filter(match_type != "No match") %>%
filter(kingdom == "Plantae") %>%
select(simple_scientificName, simple_name, accepted_namecode, name_status, source)
# function taicolID2name() --- TaiCOL api ----
base_url_taicol_higherTaxa <- "https://api.taicol.tw/v2/higherTaxa"
taicolID2name <- function(taxonID_taicol){
request <- RETRY(verb = "GET",
url = base_url_taicol_higherTaxa,
query = list(taxon_id = taxonID_taicol)
) %>%
content(as = "text") %>%
fromJSON() %>% .$data %>%
rowwise() %>% mutate(rank = rank %>% casefold()) %>% ungroup() %>%
suppressMessages()
request <- request %>% filter(taxon_id == taxonID_taicol) %>%
bind_cols(
.,
request %>%
.[(which(request$taxon_id == taxonID_taicol) + 1) : nrow(request),] %>%
select(rank, simple_name) %>%
pivot_wider(., names_from = rank, values_from = simple_name)
)
request <- request %>%
mutate(source = "taicol")
return(request)
}
# for loop -- 取出除了"no match"以外的TaiCOL acceptedName ----
# Match_taicol_acceptedName <- tibble()
# for(i in 5842:nrow(Match_taicol)){
# Match_taicol_acceptedName_i <- Match_taicol$accepted_namecode[i] %>%
# taicolID2name() %>%
# mutate(simple_scientificName = Match_taicol$simple_scientificName[i]) %>%
# suppressMessages()
# Match_taicol_acceptedName <- bind_rows(
# Match_taicol_acceptedName_i,
# Match_taicol_acceptedName
# )
# if(i %% 10 == 0){
# cat(i, "/", nrow(Match_taicol), "\n")
# }
# if(i %% 100 == 0) Sys.sleep(1)
# rm(Match_taicol_acceptedName_i);gc()
# }
#
# # 存檔
# saveRDS(
# object = Match_taicol_acceptedName,
# file = paste0("Match_taicol_acceptedName_", today(tzone = "asia/taipei"), ".rds")
# )
# 重新讀入Match_taicol_acceptedName ----
Match_taicol_acceptedName <- readRDS(
file = grep("^Match_taicol_acceptedName_.+\\.rds", x = list.files(), value = TRUE) %>% latestPath()
)
# 重新命名欄位及依input排序Match_taicol_acceptedName:simple_scientificName
Match_taicol_acceptedName <- Match_taicol_acceptedName %>%
# select(-"NA") %>%
mutate(simple_scientificName = as.character(simple_scientificName)) %>%
arrange(simple_scientificName)
## pick out match_type are "no match" in "Plantae" in TaiCOL,接續對應GBIF ----
noMatch <- dat_TaxonName_nMv2 %>% filter(match_type == "No match") %>% select(simple_scientificName)
# join ----
#
dat_joinacceptedName <- dat %>%
unite(col = simple_scientificName, Genus, SpecificEpithet, InfraspecificRank, InfraspecificEpithet, sep = " ", na.rm = TRUE) %>%
left_join(
.,
Match_taicol_acceptedName,
by = "simple_scientificName",
multiple = "first"
)