wos_unpaywall_matching/wos_unpaywall_matching.Rmd at master · njahn82/wos_unpaywall_matching · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
---
title: "R Notebook"
output: html_notebook
---

# Initial Setup

```{r}
# Call libraries
library(tidyverse)
library(odbc)
library(DBI)
library(dbplyr)
library(viridis)

# Set private credentials - these should be set in .Renviron file
kb_username <- Sys.getenv("KB_USERNAME")
kb_password <- Sys.getenv("KB_PASSWORD")

```

```{r}

# Open DB connection
con <- dbConnect(odbc(), "KB", uid=kb_username, pwd=kb_password)

```


# Preprocessing

First create a 'normalised' version of the Unpaywall dataset, using the following rules:

- DOI to lowercase and trim whitespace
- Article title to lowercase and trim whitespace
- Block first and last author names (e.g. Nicholas Fraser -> frasern). Remove
all non-alphabetical characters from surname and concatenate with first letter
of family name
- Journal name to lowercase and trim whitespace

```{r}
# Create table
dbGetQuery(con, read_file("sql/normalise_upw_14.sql"))
# Add index on DOI column
dbGetQuery(con, read_file("sql/normalise_upw_14_index.sql"))
# Create table
#dbGetQuery(con, read_file("sql/normalise_wos_12_17.sql"))
# Add index on DOI column
#dbGetQuery(con, read_file("sql/normalise_wos_12_17_index.sql"))

```


# DOI matches

```{r}

# Count total matches, distinct WOS DOIs, distinct WOS items (PK_ITEMS) and
# distinct Unpaywall DOIs

doi_matches <- dbGetQuery(con, read_file("sql/doi_matches.sql"))

```


# Publication Years

How do publication years differ between Unpaywall and WOS for articles matched with DOIs?

```{r}

# Count total matches as a function of WOS publication year (Unpaywall
# publication year = 2014).

#doi_matches_pubyears <- dbGetQuery(con, read_file("sql/doi_matches_publication_years.sql"))

doi_matches_pubyears %>%
  ggplot(aes(x=WOS_YEAR, y=MATCHES)) +
  geom_bar(stat="identity") +
  labs(x="WOS Publication Year", y="DOI Matches") +
  theme_bw()

```

# Article Types

```{r}

# Count matches between Unpaywall and WOS article types

#doi_matches_arttypes <- dbGetQuery(con, read_file("sql/doi_matches_article_types.sql"))

doi_matches_arttypes %>%
  ggplot(aes(y=WOS_TYPE, x=UNPAYWALL_TYPE)) +
  geom_point(aes(size=MATCHES)) +
  labs(y="WOS Type", x="Unpaywall Type", size="DOI Matches") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_size(range=c(0.01,10),
             breaks=c(100, 1000, 10000, 100000, 1000000),
             labels = c(100, 1000, 10000, 100000, 1000000))

```

# Title distance

```{r}

# Matches a small sample (~1 %) of Unpaywall records to WOS records via DOI.
# Edit distance similarity and Jaro-Winkler similarity are calculated using
# Oracle built-in UTL_MATCH function

doi_matches_titles <- dbGetQuery(con, read_file("sql/doi_matches_title_distances.sql"))

doi_matches_titles %>%
  select(EDIT_DISTANCE_SIMILARITY, JARO_WINKLER_SIMILARITY) %>%
  gather("EDIT_DISTANCE_SIMILARITY", "JARO_WINKLER_SIMILARITY",
         key="Method",
         value="Similarity") %>%
  ggplot(aes(x=Similarity, color=Method)) +
  stat_ecdf(geom="step", pad=FALSE) +
  labs(y="Cumulative Matched Percentage", x="Title Similarity") +
  theme_bw() +
  scale_y_continuous(breaks=seq(from=0, to=1, by=0.1), labels=scales::percent) +
  scale_x_continuous(breaks=seq(from=0, to=100, by=10), limits=c(0,100))

# Compare title similarity of DOI-matches with similarity from a selection of
# randomly matched articles

#random_matches_titles <- dbGetQuery(con, read_file("sql/random_matches_title_distances.sql"))

random_matches_titles %>%
  select(EDIT_DISTANCE_SIMILARITY, JARO_WINKLER_SIMILARITY) %>%
  gather("EDIT_DISTANCE_SIMILARITY", "JARO_WINKLER_SIMILARITY",
         key="Method",
         value="Similarity") %>%
  ggplot(aes(x=Similarity, color=Method)) +
  stat_ecdf(geom="step", pad=FALSE) +
  labs(y="Cumulative Matched Percentage", x="Title Similarity") +
  theme_bw() +
  scale_y_continuous(breaks=seq(from=0, to=1, by=0.1), labels=scales::percent) +
  scale_x_continuous(breaks=seq(from=0, to=100, by=10), limits=c(0,100))

```

# Author Count

```{r}

#doi_matches_authorcounts <- dbGetQuery(con, read_file("sql/doi_matches_all_authorcounts.sql"))

doi_matches_authorcounts %>%
  ggplot(aes(x=UNPAYWALL_AUTHORS, y=WOS_AUTHORS)) +
  geom_tile(aes(fill=MATCHES)) +
  labs(x="Unpaywall Author Count", y="WOS Author Count") +
  scale_x_continuous(limits=c(0,100)) +
  scale_y_continuous(limits=c(0,100)) +
  scale_fill_viridis() +
  theme_bw()

doi_matches_authorcounts %>%
  mutate(difference = UNPAYWALL_AUTHORS-WOS_AUTHORS) %>%
  group_by(difference) %>%
  summarize(sum_matches = sum(MATCHES)) %>%
  filter(difference > -10, difference < 10) %>%
  ggplot(aes(x=difference, y=sum_matches)) +
  geom_bar(stat='identity')

```

# Author Names

```{r}

# Create table of first author names from WOS where PUBYEAR = 2014
#dbGetQuery(con, read_file("sql/create_first_author_names_wos14.sql"))
# Add indexes to DOI
#dbGetQuery(con, "CREATE INDEX IDX_FIRST_AUTHOR_NAMES_WOS14 ON FIRST_AUTHOR_NAMES_WOS14(DOI)")


```