---
title: "R Notebook"
output: html_notebook
---
# Initial Setup
```{r}
# Call libraries
library(tidyverse)
library(odbc)
library(DBI)
library(dbplyr)
library(viridis)
# Set private credentials - these should be set in .Renviron file
kb_username <- Sys.getenv("KB_USERNAME")
kb_password <- Sys.getenv("KB_PASSWORD")
```
```{r}
# Open DB connection
con <- dbConnect(odbc(), "KB", uid=kb_username, pwd=kb_password)
```
# Preprocessing
First create a 'normalised' version of the Unpaywall dataset, using the following rules:
- DOI to lowercase and trim whitespace
- Article title to lowercase and trim whitespace
- Block first and last author names (e.g. Nicholas Fraser -> frasern). Remove
all non-alphabetical characters from surname and concatenate with first letter
of family name
- Journal name to lowercase and trim whitespace
```{r}
# Create table
dbGetQuery(con, read_file("sql/normalise_upw_14.sql"))
# Add index on DOI column
dbGetQuery(con, read_file("sql/normalise_upw_14_index.sql"))
# Create table
#dbGetQuery(con, read_file("sql/normalise_wos_12_17.sql"))
# Add index on DOI column
#dbGetQuery(con, read_file("sql/normalise_wos_12_17_index.sql"))
```
# DOI matches
```{r}
# Count total matches, distinct WOS DOIs, distinct WOS items (PK_ITEMS) and
# distinct Unpaywall DOIs
doi_matches <- dbGetQuery(con, read_file("sql/doi_matches.sql"))
```
# Publication Years
How do publication years differ between Unpaywall and WOS for articles matched with DOIs?
```{r}
# Count total matches as a function of WOS publication year (Unpaywall
# publication year = 2014).
#doi_matches_pubyears <- dbGetQuery(con, read_file("sql/doi_matches_publication_years.sql"))
doi_matches_pubyears %>%
ggplot(aes(x=WOS_YEAR, y=MATCHES)) +
geom_bar(stat="identity") +
labs(x="WOS Publication Year", y="DOI Matches") +
theme_bw()
```
# Article Types
```{r}
# Count matches between Unpaywall and WOS article types
#doi_matches_arttypes <- dbGetQuery(con, read_file("sql/doi_matches_article_types.sql"))
doi_matches_arttypes %>%
ggplot(aes(y=WOS_TYPE, x=UNPAYWALL_TYPE)) +
geom_point(aes(size=MATCHES)) +
labs(y="WOS Type", x="Unpaywall Type", size="DOI Matches") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_size(range=c(0.01,10),
breaks=c(100, 1000, 10000, 100000, 1000000),
labels = c(100, 1000, 10000, 100000, 1000000))
```
# Title distance
```{r}
# Matches a small sample (~1 %) of Unpaywall records to WOS records via DOI.
# Edit distance similarity and Jaro-Winkler similarity are calculated using
# Oracle built-in UTL_MATCH function
doi_matches_titles <- dbGetQuery(con, read_file("sql/doi_matches_title_distances.sql"))
doi_matches_titles %>%
select(EDIT_DISTANCE_SIMILARITY, JARO_WINKLER_SIMILARITY) %>%
gather("EDIT_DISTANCE_SIMILARITY", "JARO_WINKLER_SIMILARITY",
key="Method",
value="Similarity") %>%
ggplot(aes(x=Similarity, color=Method)) +
stat_ecdf(geom="step", pad=FALSE) +
labs(y="Cumulative Matched Percentage", x="Title Similarity") +
theme_bw() +
scale_y_continuous(breaks=seq(from=0, to=1, by=0.1), labels=scales::percent) +
scale_x_continuous(breaks=seq(from=0, to=100, by=10), limits=c(0,100))
# Compare title similarity of DOI-matches with similarity from a selection of
# randomly matched articles
#random_matches_titles <- dbGetQuery(con, read_file("sql/random_matches_title_distances.sql"))
random_matches_titles %>%
select(EDIT_DISTANCE_SIMILARITY, JARO_WINKLER_SIMILARITY) %>%
gather("EDIT_DISTANCE_SIMILARITY", "JARO_WINKLER_SIMILARITY",
key="Method",
value="Similarity") %>%
ggplot(aes(x=Similarity, color=Method)) +
stat_ecdf(geom="step", pad=FALSE) +
labs(y="Cumulative Matched Percentage", x="Title Similarity") +
theme_bw() +
scale_y_continuous(breaks=seq(from=0, to=1, by=0.1), labels=scales::percent) +
scale_x_continuous(breaks=seq(from=0, to=100, by=10), limits=c(0,100))
```
# Author Count
```{r}
#doi_matches_authorcounts <- dbGetQuery(con, read_file("sql/doi_matches_all_authorcounts.sql"))
doi_matches_authorcounts %>%
ggplot(aes(x=UNPAYWALL_AUTHORS, y=WOS_AUTHORS)) +
geom_tile(aes(fill=MATCHES)) +
labs(x="Unpaywall Author Count", y="WOS Author Count") +
scale_x_continuous(limits=c(0,100)) +
scale_y_continuous(limits=c(0,100)) +
scale_fill_viridis() +
theme_bw()
doi_matches_authorcounts %>%
mutate(difference = UNPAYWALL_AUTHORS-WOS_AUTHORS) %>%
group_by(difference) %>%
summarize(sum_matches = sum(MATCHES)) %>%
filter(difference > -10, difference < 10) %>%
ggplot(aes(x=difference, y=sum_matches)) +
geom_bar(stat='identity')
```
# Author Names
```{r}
# Create table of first author names from WOS where PUBYEAR = 2014
#dbGetQuery(con, read_file("sql/create_first_author_names_wos14.sql"))
# Add indexes to DOI
#dbGetQuery(con, "CREATE INDEX IDX_FIRST_AUTHOR_NAMES_WOS14 ON FIRST_AUTHOR_NAMES_WOS14(DOI)")
```