This project uses webscraping, data analysis, and plotting tools in R
to access, wrangle, analyse, and plot Tokyo Olympics 2020 medal tally data to generate the graphics used in the below infographic.
Topics
- Webscraping data with R
- Making chord plots
- Adding custom images as sector labels for chord plots
Step-by-step instructions and code are detailed in the header tabs. All code, datasets, and source files are also available on Github.
R
session
## ─ Session info ────────
## setting
## version
## os
## system
## ui
## language
## collate
## ctype
## tz
## date
## value
## R version 3.5.0 (2018-04-23)
## macOS 10.14.6
## x86_64, darwin15.6.0
## X11
## (EN)
## en_US.UTF-8
## en_US.UTF-8
## Australia/Melbourne
## 2022-02-04
##
## ─ Packages ────────────
## ! package *
## assertthat
## base *
## cli
## codetools
## P compiler
## crayon
## P datasets *
## DBI
## digest
## dplyr *
## ellipsis
## evaluate
## fansi
## generics
## glue
## P graphics *
## P grDevices *
## htmltools
## knitr
## lifecycle
## magrittr
## P methods *
## pillar
## pkgconfig
## purrr
## R6
## rlang
## rmarkdown
## sessioninfo *
## P stats *
## stringi
## stringr
## tibble
## tidyselect
## P tools
## utf8
## P utils *
## vctrs
## withr
## xfun
## yaml
## version date
## 0.2.1 2019-03-21
## 3.5.0 2018-04-24
## 2.5.0 2021-04-26
## 0.2-16 2018-12-24
## 3.5.0 2018-04-24
## 1.4.1 2021-02-08
## 3.5.0 2018-04-24
## 1.1.1 2021-01-15
## 0.6.25 2020-02-23
## 1.0.6 2021-05-05
## 0.3.2 2021-04-29
## 0.14 2019-05-28
## 0.5.0 2021-05-25
## 0.1.0 2020-10-31
## 1.4.2 2020-08-27
## 3.5.0 2018-04-24
## 3.5.0 2018-04-24
## 0.4.0.9003 2020-05-05
## 1.36 2021-09-29
## 1.0.0 2021-02-15
## 2.0.1 2020-11-17
## 3.5.0 2018-04-24
## 1.6.1 2021-05-16
## 2.0.3 2019-09-22
## 0.3.4 2020-04-17
## 2.5.0 2020-10-28
## 0.4.11 2021-04-30
## 2.2 2020-05-31
## 1.1.1 2018-11-05
## 3.5.0 2018-04-24
## 1.4.6 2020-02-17
## 1.4.0 2019-02-10
## 3.1.2 2021-05-16
## 1.1.0 2020-05-11
## 3.5.0 2018-04-24
## 1.2.1 2021-03-12
## 3.5.0 2018-04-24
## 0.3.8 2021-04-29
## 2.4.2 2021-04-18
## 0.28 2021-11-04
## 2.2.1 2020-02-01
## lib
## [1]
## [?]
## [1]
## [2]
## [2]
## [1]
## [2]
## [1]
## [1]
## [1]
## [1]
## [1]
## [1]
## [1]
## [1]
## [2]
## [2]
## [1]
## [1]
## [1]
## [1]
## [2]
## [1]
## [1]
## [1]
## [1]
## [1]
## [1]
## [1]
## [2]
## [1]
## [1]
## [1]
## [1]
## [2]
## [1]
## [2]
## [1]
## [1]
## [1]
## [1]
## source
## CRAN (R 3.5.2)
## local
## CRAN (R 3.5.0)
## CRAN (R 3.5.2)
## local
## CRAN (R 3.5.0)
## local
## CRAN (R 3.5.0)
## CRAN (R 3.5.2)
## CRAN (R 3.5.0)
## CRAN (R 3.5.0)
## CRAN (R 3.5.2)
## CRAN (R 3.5.0)
## CRAN (R 3.5.0)
## CRAN (R 3.5.0)
## local
## local
## Github (rstudio/htmltools@984b39c)
## CRAN (R 3.5.0)
## CRAN (R 3.5.0)
## CRAN (R 3.5.0)
## local
## CRAN (R 3.5.0)
## CRAN (R 3.5.2)
## CRAN (R 3.5.0)
## CRAN (R 3.5.0)
## CRAN (R 3.5.0)
## CRAN (R 3.5.0)
## CRAN (R 3.5.0)
## local
## CRAN (R 3.5.2)
## CRAN (R 3.5.2)
## CRAN (R 3.5.0)
## CRAN (R 3.5.0)
## local
## CRAN (R 3.5.0)
## local
## CRAN (R 3.5.0)
## CRAN (R 3.5.0)
## CRAN (R 3.5.0)
## CRAN (R 3.5.2)
##
## [1] /Users/malishev/Library/R/3.5/library
## [2] /Library/Frameworks/R.framework/Versions/3.5/Resources/library
##
## P ── Loaded and on-disk path mismatch.
Load packages and set data parameters
suppressWarnings(pacman::p_load(here,
rvest, xml2, dplyr, circlize,
tidyr, stringr, purrr,
magick, reshape2))
# vars
# ----------------------------------------------------------
base_url <- "https://olympics.com/tokyo-2020/olympic-games"
event_url <- paste0(base_url,
"/en/results/all-sports/medal-standings.htm")
pictogram_url <- "https://www.theolympicdesign.com/olympic-design/pictograms/tokyo-2020/"
flag_url <- "https://www.countryflags.com/icons-overview/"
col_lab <- "#434343" # sector colour
colv_label <- c("Gold", "Silver",
"Bronze") # col labels
colv_pal <- c("#C09F68", "#C5C3C3",
"#AA7C64") # col hex
colv_df <- tibble(label = colv_label,
col = colv_pal)
height <- 10
width <- height # plot dims
Write some functions to webscrape the medal tally data from Olympics.com.
# webscrape funcs ------------------------------------------------
get_webdata <- function(att) event_url %>% read_html() %>% html_nodes(".dropdown-link") %>% html_attr(att) %>% return() # pull webdata func
get_pictogram <- function() pictogram_url %>% read_html() %>% html_nodes(".j-module") # pictogram text and img func
get_flag <- function() flag_url %>% read_html() %>% html_nodes(".thumb") # get flag func
img_convert <- function(img){ # add raster img as chord labels
imgr <- img %>% magick::image_read() %>% as.raster() # convert img to raster layer
imgr %>% return()}
# webscrape data ------------------------------------------------
# get all countries
country_names <- event_url %>% read_html() %>%
html_table() %>% .[[1]] %>%
pull("Team/NOC")
# get urls for each country
get_country_data <- function(){
event_url %>% read_html() %>%
html_node("table") %>% # get first table
html_nodes(".playerTag") %>%
html_node("a")
}
# full country names
country_total <- get_country_data() %>%
html_attr("href") %>%
str_replace_all("entries","medalist-by-sport") %>% # get 'by sport' links
str_replace_all("../../..",base_url)
# country url titles
country_title <- get_country_data() %>%
html_attr("title") %>%
str_remove_all("NOC Entries-") %>%
str_replace_all(" ","-") %>% str_to_lower()
# get all events
event_title <- event_url %>% read_html() %>%
html_nodes(".dropdown-link") %>%
html_text(trim = T)
# get urls for each event
event_total <- get_webdata("href") %>%
str_subset("results") %>%
str_replace_all("../../..",base_url)
# pictogram text
pictogram_text <- get_pictogram() %>%
html_nodes("span") %>% html_text() %>% unique
# hi-res event pictograms
pictogram_total <- get_pictogram() %>%
html_nodes("a") %>% html_attr("data-href") %>% na.omit() %>% .[-1] # remove first header img
# pictogram id
pictogram_id <- get_pictogram() %>%
html_nodes("img") %>% html_attr("data-image-id")
# final pictogram df
pictogram_df <- tibble("event" = pictogram_text,
"img" = pictogram_total)
# hi-res flags
flags_df <- tibble("name" = get_flag() %>% html_nodes("span") %>% html_text() %>% str_remove_all(" flag icon") %>% str_replace_all(" ","-") %>% str_to_lower(),
"flag" = get_flag() %>% html_nodes("img") %>% html_attr("src"))
The next step is turning the webscraped data into consistent and usable data formats for analysis in R
.
The below chunks access the individual web url for medal tally data of each country or Olympic event. The final section accesses a site for getting hi-res country flags to add to each plot.
Medal tally data by country. Webscraped data are stored in lists, then saved to the local dir.
# save country/event data
# to dir
# -------------------------------
# all country data
clist <- list() # store country medal data
for (cn in seq_along(country_title)) {
url <- country_total[cn]
mc <- url %>% read_html %>%
html_nodes(".medal-icon") %>%
html_attr("alt") %>%
as.numeric() # get medal count
d <- url %>% read_html() %>%
html_table(trim = T) %>%
.[[1]] %>% replace("Medal",
mc) %>% mutate(Country = country) %>%
select(Medal, Sport) %>%
group_by(Sport) %>%
count(Medal) %>% arrange(n)
ll <- list(d) # save df
nn <- country_title[cn] # name each entry
names(ll) <- nn
clist <- c(clist, ll)
message(nn, rep(" ", 10),
"\r", appendLF = F) # display status
flush.console()
}
saveRDS(clist, here::here("r",
"medals_country.Rda"))
Now the same for medal tally data by event.
# all event data
elist <- list() # store event medal data
event_title <- event_title[1:47] # retain just sports
for(cn in seq_along(event_title)){
url <- event_total[cn]
d <- url %>%
read_html() %>%
html_table(trim = T) %>% .[[1]] %>% # get first table
rename(.cols = 2:5, # rename cols
"Country" = 2,
"Gold" = 3,
"Silver" = 4,
"Bronze" = 5) %>%
mutate("Event" = event_title[cn] %>% str_to_sentence())
ll <- list(d) # save df
nn <- event_title[cn] # name each entry
names(ll) <- nn
elist <- c(elist,ll)
message(nn,rep(" ",10),"\r",appendLF = F) # display status
flush.console()
}
saveRDS(elist,here::here("r","medals_event.Rda"))
Finally, get hi-res country flags to add to each chord plot.
# hi-res flags
flags_df <- tibble(name = get_flag() %>%
html_nodes("span") %>%
html_text() %>% str_remove_all(" flag icon") %>%
str_replace_all(" ", "-") %>%
str_to_lower(), flag = get_flag() %>%
html_nodes("img") %>%
html_attr("src"))
saveRDS(flags_df, here::here("r",
"flags_df.Rda"))
Load the Olympic medal data. The code for getting these datasets is found in the Build datasets section.
elist <- here::here("data",
"medals_event.Rda") %>%
readRDS() # load tally by event
clist <- here::here("data",
"medals_country.Rda") %>%
readRDS() # load tally by country
flags_df <- here::here("data",
"flags_df.Rda") %>% readRDS() # hi-res country flags
colv_label <- c("Gold", "Silver",
"Bronze") # col labels
colv_pal <- c("#C09F68", "#C5C3C3",
"#AA7C64") # col hex
colv_df <- tibble(label = colv_label,
col = colv_pal)
Now plot and save the medal tally per sport for each country from the country dataset. Here is the order of steps.
- Convert the raw data into a usable table for the chord plot
- Match the colour palette to the chord plot sectors
- Plot the base chord plot
- Add custom sector labels
- Add a country label and flag stamp from the flags data frame to each plot (you can also use {ggflags
} here)
- Save each plot to your local dir
# plot tally by country ----------------------------------------------------------
lid <- 0 # add leaderboard id
for(countryid in clist %>% names){
# pull hires country flag
fid <- flags_df %>% filter(name %in% countryid) %>% pull(flag) %>% magick::image_read() %>% as.raster() # get selected country flag
fidn <- flags_df %>% filter(name %in% countryid) %>% pull(name) %>% str_sub(1,3) %>% str_to_upper()
d <- clist[[countryid]]
d <- d %>%
mutate_at("Medal", funs(case_when(
Medal == 1 ~ "Gold",
Medal == 2 ~ "Silver",
Medal == 3 ~ "Bronze"
)))
dtab <- d %>%
dcast(Sport~Medal, fill = 0) %>% # turn rows into cols
melt() %>% # stack df
arrange(desc(value)) %>% # rearrange
tidyr::uncount(value) %>% # expand table by number of each medal
dplyr::select(variable,Sport) %>% # reorder for colpal
mutate("variable" = factor(variable, levels = colv_label)) %>% arrange(variable) # order by medals
colpal <- c(colv_df %>% filter(label %in% dtab$variable) %>% pull,rep(col_lab,dtab[,2] %>% unique %>% length)) # match colpal to data length
# plot save
fh <- countryid
png(here::here("plots") %>% paste0("/",lid,"_",fh,".png"),width = width, height = height, units = "cm", bg = "transparent", res = 250)
# plot pars
par(mar = rep(2, 4),mfrow = c(1, 1),family = "HersheySans",font = 2) # plot pars
circos.clear()
circos.par(start.degree = -180, # chord setup
gap.degree = 2, track.margin = c(-0.1, 0.1),
points.overflow.warning = F,
track.height = 0.2)
circos.par(cell.padding =c(0.02, 0, 0.02, 0))
# plot
chordDiagram(dtab,
grid.col = colpal,
transparency = 0.3,
directional = 1, # 1 = link origin is from sectors
diffHeight = -0.05,
# link.border = "#FFFFFF",
annotationTrack = c("grid"
# ,"name" # to check name placment
),
annotationTrackHeight = c(0.05, 0.1),
big.gap = 5, small.gap = 2, # gaps between sectors
link.sort = F, link.decreasing = T, # define link overlap
link.largest.ontop = T,
preAllocateTracks = list(track.height = 0.1)
# symmetric = F
# scale = F # weight links equally
)
# add labels to chord
ylim <- 0.85
cex <- 0.6
circos.track(track.index = 1,
panel.fun = function(x,y){ # add text labels
sector_index = get.cell.meta.data("sector.numeric.index")
circos.text(x = CELL_META$xcenter,
y = ylim,
# remove medal labels
labels = ifelse(sector_index <= dtab[,1] %>% unique %>% length, NA, CELL_META$sector.index),
facing = "clockwise", niceFacing = T, cex = cex, col = col_lab
)}
, bg.border = NA) # set bg to NA
# add flag and flag id
xx <- 0.2; yy <- 0.8
grid.raster(fid, x=xx, y=yy, width=0.1, height = 0.075) # add country flag
text(-1, yy+0.16, fidn, col = col_lab, cex = 1, pos = 4)
dev.off() # close plot save
cat(rep("\n",3),"Plot saved as",fh %>% paste0(".png in"),here::here("plots"))
message("\n\n",countryid,appendLF = F) # display status
flush.console()
lid <- lid + 1
} # end loop
This generates a plot for each country appended with their ranked medal tally order.
It's also possible to use images as labels for the sectors in chord plots, as well as different images for each sector.
Option 1
Use the same image for each plot sector.
# first intialise plot
# then add custom images
# convert png to raster
img_convert <- function(img){ # add raster img as chord labels
imgr <- img %>% magick::image_read() %>% as.raster() # convert img to raster layer
imgr %>% return()}
imgr <- img_convert("image.png")
# add image and label to initialised plot
circos.track(track.index = 1,
panel.fun = function(x, y){
circos.raster(image = imgr, # add image
x = CELL_META$xcenter,
y = CELL_META$ycenter,
width = "1.5mm", height = "1.5mm",
facing = "downward")
circos.text(x = CELL_META$xcenter, # add text
y = CELL_META$ycenter,
labels = CELL_META$sector.index,
facing = "clockwise", niceFacing = T,
### for image and text
cex = 0.8, col = col_lab,
adj = c(0.2, 0) # label xy position
)}
, bg.border = NA) # set bg to NA
Option 2
For different images for each plot sector, I used an image list of rasters and indexed each list element to correspond to the appropriate plot sector data.
This example uses event data (Diving as an example) and plots each country's flag as a custom image for the respective sector of the chord plot.
First, select data to plot.
fh <- "Diving" # plot by event
source("medals_event.Rda")
d <- elist[[fh]]
dtab <- d %>% select(NOCCode,Gold,Silver,Bronze) %>% # convert df to table for chord
melt() %>%
uncount(value) %>% # expand table by number of each medal
select(variable,NOCCode) # reorder for colpal
colpal <- c(colv_pal,rep(col_lab,dtab[,2] %>% unique %>% length)) # match colpal to data length
Now we can set the plot parameters and initialise the plot. Once the base chord plot is drawn, we can then add text and image labels for each sector.
Setup plot parameters.
# plot save
require(circlize)
png(here::here("test") %>% paste0("/",fh,".png"),width = width, height = height, units = "cm", bg = "transparent", res = 250)
circos.clear()
par(mar = rep(2, 4),mfrow = c(1, 1),family = "HersheySans",font = 2) # plot pars
circos.par(start.degree = -185, # chord setup
gap.degree = 2, track.margin = c(-0.1, 0.1),
points.overflow.warning = F,
track.height = 0.2)
circos.par(cell.padding =c(0.02, 0, 0.02, 0))
Now plot the base chord plot.
# plot
chordDiagram(dtab,
grid.col = colpal,
transparency = 0.3,
directional = 1, # 1 = link origin is from sectors
diffHeight = -0.05,
link.border = NA, # add border
annotationTrack = c("grid"
# ,"name" # to check name placment
),
annotationTrackHeight = c(0.05, 0.1),
big.gap = 5, small.gap = 2, # gaps between sectors
link.sort = T, link.decreasing = T, # link overlap
link.largest.ontop = T,
preAllocateTracks = list(track.height = 0.1)
)
Once the plot is active, we can add custom labels and images. You can skip this step and use the names in your data table for the plot sectors by using annotationTrack = "name"
in the above plot code. However, for custom labels and/or images, we add these features to the base chord plot manually.
# add custom image per sector (here you can instead use {ggflags})
# select custom images
fid <- c("china","great-britain","united-states","canada","germany","australia","mexico","roc")
fidn <- flags_df %>% filter(name %in% fid) %>% arrange(name = factor(name,levels = fid)) %>% pull(flag)
# create img labels
imgl <- as.list(fidn) # match imgs to events in elist
imglist <- lapply(imgl,img_convert) # apply convert to raster func
imgtab <- c(as.list(rep(NA,3)),imglist) # add country and empty imgs for three medal sectors
names(imgtab) <- get.all.sector.index() # get names from plot sector indices
# plot params
ylim <- 0.7
im <- "4mm"
cex <- 0.5
circos.track(track.index = 1,
panel.fun = function(x,y){ # add text/img per sector
circos.raster(x = CELL_META$xcenter,
y = ylim,
image = imgtab[[CELL_META$sector.numeric.index]], # add image by indexing each cell sector from img df (imgtab)
width = im, height = im,
facing = "clockwise",niceFacing = T)
circos.text(x = CELL_META$xcenter,
y = ylim + 1.2,
# remove medal sector labels but keep other sectors
labels = ifelse(CELL_META$sector.numeric.index <= dtab[,1] %>% unique %>% length, NA, CELL_META$sector.index),
facing = "clockwise", niceFacing = T, cex = cex, col = col_lab
)}
, bg.border = NA) # set bg to NA
# add icon stamp
xx <- 0.2; yy <- 0.8
grid.raster(picid, x=xx, y=yy, width=0.1, height = 0.1) # add country flag
text(-1, yy+0.2, piclab, col = col_lab, cex = 1, pos = 4)
dev.off()
For this case study, you can also use the official Olympics pictograms as sector images
# save pictograms
# -----------------------------------------------
li <- 1
for (li in seq_along(pictogram_df$event)) {
img <- pictogram_df$img[li] %>%
img_convert()
png(here::here("legend") %>%
paste0("/", pictogram_df$event[li],
".png"), width = width,
height = height, units = "cm",
bg = "transparent",
res = 250)
par(mfrow = c(1, 6), mar = rep(0,
4))
plot.new()
grid.raster(img, 0.5,
0.5, width = 0.5,
height = 0.5) # add country flag
dev.off()
}
# get pictogram labels
slist <- list()
for (ci in seq_along(clist)) {
cd <- clist[[ci]] %>%
select(Sport)
slist <- c(slist, cd)
}
Because your image conversion function for your webscraped images converts images to rasters (in this case the Olympic pictograms and country flags), you can also add some code that replaces the base colours in the raster grid with custom colours.
img_convert <- function(img) {
imgr <- img %>% magick::image_read() %>%
as.raster() # convert img to raster layer
imgr[imgr == "#002163ff"] <- col_lab # change main img color
imgr %>% return()
}
You can also loop through each sector index of the plot and index the relevant image from your label/image data frame. If you want the label/image to be centered in the middle of the chord sector, you need to access the built-in get.cell.meta.data()
and get.all.sector.index()
functions in {circlize} (see below).
# rm origin (medal) labels and cycle through labels/imgs
circos.track(track.index = 1,
for(sl in seq_along(get.all.sector.index())){ # apply custom labels to sectors
xlim = get.cell.meta.data("xcenter")
circos.text(x = mean(xlim), # center label
y = ylim/2,
sector.index = get.all.sector.index()[sl],
labels = labtab[[sl]],
facing = "clockwise", niceFacing = T,
cex = 0.75, col = col_lab,
adj = c(0, 0.5) # nudge xy position
)}
, bg.border = NA) # set bg to NA
Download code from Github
The {circlize} package
Another data exploration by Matt Malishev