--- title: "R Notebook" output: html_notebook --- # Initial Setup ```{r} # Call libraries library(tidyverse) library(odbc) library(DBI) library(dbplyr) library(viridis) # Set private credentials - these should be set in .Renviron file kb_username <- Sys.getenv("KB_USERNAME") kb_password <- Sys.getenv("KB_PASSWORD") ``` ```{r} # Open DB connection con <- dbConnect(odbc(), "KB", uid=kb_username, pwd=kb_password) ``` # Preprocessing First create a 'normalised' version of the Unpaywall dataset, using the following rules: - DOI to lowercase and trim whitespace - Article title to lowercase and trim whitespace - Block first and last author names (e.g. Nicholas Fraser -> frasern). Remove all non-alphabetical characters from surname and concatenate with first letter of family name - Journal name to lowercase and trim whitespace ```{r} # Create table dbGetQuery(con, read_file("sql/normalise_upw_14.sql")) # Add index on DOI column dbGetQuery(con, read_file("sql/normalise_upw_14_index.sql")) # Create table #dbGetQuery(con, read_file("sql/normalise_wos_12_17.sql")) # Add index on DOI column #dbGetQuery(con, read_file("sql/normalise_wos_12_17_index.sql")) ``` # DOI matches ```{r} # Count total matches, distinct WOS DOIs, distinct WOS items (PK_ITEMS) and # distinct Unpaywall DOIs doi_matches <- dbGetQuery(con, read_file("sql/doi_matches.sql")) ``` # Publication Years How do publication years differ between Unpaywall and WOS for articles matched with DOIs? ```{r} # Count total matches as a function of WOS publication year (Unpaywall # publication year = 2014). #doi_matches_pubyears <- dbGetQuery(con, read_file("sql/doi_matches_publication_years.sql")) doi_matches_pubyears %>% ggplot(aes(x=WOS_YEAR, y=MATCHES)) + geom_bar(stat="identity") + labs(x="WOS Publication Year", y="DOI Matches") + theme_bw() ``` # Article Types ```{r} # Count matches between Unpaywall and WOS article types #doi_matches_arttypes <- dbGetQuery(con, read_file("sql/doi_matches_article_types.sql")) doi_matches_arttypes %>% ggplot(aes(y=WOS_TYPE, x=UNPAYWALL_TYPE)) + geom_point(aes(size=MATCHES)) + labs(y="WOS Type", x="Unpaywall Type", size="DOI Matches") + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + scale_size(range=c(0.01,10), breaks=c(100, 1000, 10000, 100000, 1000000), labels = c(100, 1000, 10000, 100000, 1000000)) ``` # Title distance ```{r} # Matches a small sample (~1 %) of Unpaywall records to WOS records via DOI. # Edit distance similarity and Jaro-Winkler similarity are calculated using # Oracle built-in UTL_MATCH function doi_matches_titles <- dbGetQuery(con, read_file("sql/doi_matches_title_distances.sql")) doi_matches_titles %>% select(EDIT_DISTANCE_SIMILARITY, JARO_WINKLER_SIMILARITY) %>% gather("EDIT_DISTANCE_SIMILARITY", "JARO_WINKLER_SIMILARITY", key="Method", value="Similarity") %>% ggplot(aes(x=Similarity, color=Method)) + stat_ecdf(geom="step", pad=FALSE) + labs(y="Cumulative Matched Percentage", x="Title Similarity") + theme_bw() + scale_y_continuous(breaks=seq(from=0, to=1, by=0.1), labels=scales::percent) + scale_x_continuous(breaks=seq(from=0, to=100, by=10), limits=c(0,100)) # Compare title similarity of DOI-matches with similarity from a selection of # randomly matched articles #random_matches_titles <- dbGetQuery(con, read_file("sql/random_matches_title_distances.sql")) random_matches_titles %>% select(EDIT_DISTANCE_SIMILARITY, JARO_WINKLER_SIMILARITY) %>% gather("EDIT_DISTANCE_SIMILARITY", "JARO_WINKLER_SIMILARITY", key="Method", value="Similarity") %>% ggplot(aes(x=Similarity, color=Method)) + stat_ecdf(geom="step", pad=FALSE) + labs(y="Cumulative Matched Percentage", x="Title Similarity") + theme_bw() + scale_y_continuous(breaks=seq(from=0, to=1, by=0.1), labels=scales::percent) + scale_x_continuous(breaks=seq(from=0, to=100, by=10), limits=c(0,100)) ``` # Author Count ```{r} #doi_matches_authorcounts <- dbGetQuery(con, read_file("sql/doi_matches_all_authorcounts.sql")) doi_matches_authorcounts %>% ggplot(aes(x=UNPAYWALL_AUTHORS, y=WOS_AUTHORS)) + geom_tile(aes(fill=MATCHES)) + labs(x="Unpaywall Author Count", y="WOS Author Count") + scale_x_continuous(limits=c(0,100)) + scale_y_continuous(limits=c(0,100)) + scale_fill_viridis() + theme_bw() doi_matches_authorcounts %>% mutate(difference = UNPAYWALL_AUTHORS-WOS_AUTHORS) %>% group_by(difference) %>% summarize(sum_matches = sum(MATCHES)) %>% filter(difference > -10, difference < 10) %>% ggplot(aes(x=difference, y=sum_matches)) + geom_bar(stat='identity') ``` # Author Names ```{r} # Create table of first author names from WOS where PUBYEAR = 2014 #dbGetQuery(con, read_file("sql/create_first_author_names_wos14.sql")) # Add indexes to DOI #dbGetQuery(con, "CREATE INDEX IDX_FIRST_AUTHOR_NAMES_WOS14 ON FIRST_AUTHOR_NAMES_WOS14(DOI)") ```