|
7 | 7 | import java.util.Locale; |
8 | 8 | import java.util.Objects; |
9 | 9 | import java.util.Optional; |
| 10 | +import java.util.stream.Collectors; |
10 | 11 |
|
11 | 12 | import org.jabref.logic.importer.FulltextFetcher; |
12 | 13 | import org.jabref.logic.net.URLDownload; |
| 14 | +import org.jabref.logic.util.strings.StringSimilarity; |
13 | 15 | import org.jabref.model.entry.BibEntry; |
14 | 16 | import org.jabref.model.entry.field.StandardField; |
15 | 17 | import org.jabref.model.entry.identifier.DOI; |
|
27 | 29 | * FulltextFetcher implementation that follows the DOI resolution redirects and scans for a full-text PDF URL. |
28 | 30 | */ |
29 | 31 | public class DoiResolution implements FulltextFetcher { |
30 | | - |
31 | 32 | private static final Logger LOGGER = LoggerFactory.getLogger(DoiResolution.class); |
32 | 33 |
|
33 | 34 | @Override |
34 | 35 | public Optional<URL> findFullText(BibEntry entry) throws IOException { |
35 | 36 | Objects.requireNonNull(entry); |
36 | | - Optional<URL> pdfLink = Optional.empty(); |
37 | 37 |
|
38 | 38 | Optional<DOI> doi = entry.getField(StandardField.DOI).flatMap(DOI::parse); |
39 | 39 |
|
40 | | - if (doi.isPresent()) { |
41 | | - String sciLink = doi.get().getURIAsASCIIString(); |
42 | | - |
43 | | - // follow all redirects and scan for a single pdf link |
44 | | - if (!sciLink.isEmpty()) { |
45 | | - try { |
46 | | - Connection connection = Jsoup.connect(sciLink); |
47 | | - // pretend to be a browser (agent & referrer) |
48 | | - connection.userAgent(URLDownload.USER_AGENT); |
49 | | - connection.referrer("http://www.google.com"); |
50 | | - connection.followRedirects(true); |
51 | | - connection.ignoreHttpErrors(true); |
52 | | - // some publishers are quite slow (default is 3s) |
53 | | - connection.timeout(10000); |
54 | | - |
55 | | - Document html = connection.get(); |
56 | | - |
57 | | - // scan for PDF |
58 | | - Elements elements = html.body().select("a[href]"); |
59 | | - List<Optional<URL>> links = new ArrayList<>(); |
60 | | - |
61 | | - for (Element element : elements) { |
62 | | - String href = element.attr("abs:href").toLowerCase(Locale.ENGLISH); |
63 | | - String hrefText = element.text().toLowerCase(Locale.ENGLISH); |
64 | | - // Only check if pdf is included in the link or inside the text |
65 | | - // ACM uses tokens without PDF inside the link |
66 | | - // See https://github.com/lehner/LocalCopy for more scrape ideas |
67 | | - if (element.attr("title").toLowerCase(Locale.ENGLISH).contains("pdf") && new URLDownload(href).isPdf()) { |
68 | | - return Optional.of(new URL(href)); |
69 | | - } |
70 | | - |
71 | | - if (href.contains("pdf") || hrefText.contains("pdf") && new URLDownload(href).isPdf()) { |
72 | | - links.add(Optional.of(new URL(href))); |
73 | | - } |
74 | | - } |
75 | | - // return if only one link was found (high accuracy) |
76 | | - if (links.size() == 1) { |
77 | | - LOGGER.info("Fulltext PDF found @ " + sciLink); |
78 | | - pdfLink = links.get(0); |
79 | | - } |
80 | | - } catch (UnsupportedMimeTypeException type) { |
81 | | - // this might be the PDF already as we follow redirects |
82 | | - if (type.getMimeType().startsWith("application/pdf")) { |
83 | | - return Optional.of(new URL(type.getUrl())); |
84 | | - } |
85 | | - LOGGER.warn("DoiResolution fetcher failed: ", type); |
86 | | - } catch (IOException e) { |
87 | | - LOGGER.warn("DoiResolution fetcher failed: ", e); |
| 40 | + if (!doi.isPresent()) { |
| 41 | + return Optional.empty(); |
| 42 | + } |
| 43 | + |
| 44 | + String doiLink = doi.get().getURIAsASCIIString(); |
| 45 | + if (doiLink.isEmpty()) { |
| 46 | + return Optional.empty(); |
| 47 | + } |
| 48 | + |
| 49 | + // follow all redirects and scan for a single pdf link |
| 50 | + try { |
| 51 | + Connection connection = Jsoup.connect(doiLink); |
| 52 | + // pretend to be a browser (agent & referrer) |
| 53 | + connection.userAgent(URLDownload.USER_AGENT); |
| 54 | + connection.referrer("http://www.google.com"); |
| 55 | + connection.followRedirects(true); |
| 56 | + connection.ignoreHttpErrors(true); |
| 57 | + // some publishers are quite slow (default is 3s) |
| 58 | + connection.timeout(10000); |
| 59 | + |
| 60 | + Document html = connection.get(); |
| 61 | + // scan for PDF |
| 62 | + Elements hrefElements = html.body().select("a[href]"); |
| 63 | + |
| 64 | + List<URL> links = new ArrayList<>(); |
| 65 | + for (Element element : hrefElements) { |
| 66 | + String href = element.attr("abs:href").toLowerCase(Locale.ENGLISH); |
| 67 | + String hrefText = element.text().toLowerCase(Locale.ENGLISH); |
| 68 | + // Only check if pdf is included in the link or inside the text |
| 69 | + // ACM uses tokens without PDF inside the link |
| 70 | + // See https://github.com/lehner/LocalCopy for more scrape ideas |
| 71 | + // link with "PDF" in title tag |
| 72 | + if (element.attr("title").toLowerCase(Locale.ENGLISH).contains("pdf") && new URLDownload(href).isPdf()) { |
| 73 | + return Optional.of(new URL(href)); |
88 | 74 | } |
| 75 | + |
| 76 | + if (href.contains("pdf") || hrefText.contains("pdf") && new URLDownload(href).isPdf()) { |
| 77 | + links.add(new URL(href)); |
| 78 | + } |
| 79 | + } |
| 80 | + |
| 81 | + // return if only one link was found (high accuracy) |
| 82 | + if (links.size() == 1) { |
| 83 | + LOGGER.info("Fulltext PDF found @ " + doiLink); |
| 84 | + return Optional.of(links.get(0)); |
| 85 | + } |
| 86 | + // return if links are similar or multiple links are similar |
| 87 | + return findSimilarLinks(links); |
| 88 | + } catch (UnsupportedMimeTypeException type) { |
| 89 | + // this might be the PDF already as we follow redirects |
| 90 | + if (type.getMimeType().startsWith("application/pdf")) { |
| 91 | + return Optional.of(new URL(type.getUrl())); |
89 | 92 | } |
| 93 | + LOGGER.warn("DoiResolution fetcher failed: ", type); |
| 94 | + } catch (IOException e) { |
| 95 | + LOGGER.warn("DoiResolution fetcher failed: ", e); |
| 96 | + } |
| 97 | + |
| 98 | + return Optional.empty(); |
| 99 | + } |
| 100 | + |
| 101 | + private Optional<URL> findSimilarLinks(List<URL> urls) { |
| 102 | + List<URL> distinctLinks = urls.stream().distinct().collect(Collectors.toList()); |
| 103 | + |
| 104 | + if (distinctLinks.isEmpty()) { |
| 105 | + return Optional.empty(); |
| 106 | + } |
| 107 | + // equal |
| 108 | + if (distinctLinks.size() == 1) { |
| 109 | + return Optional.of(distinctLinks.get(0)); |
90 | 110 | } |
91 | | - return pdfLink; |
| 111 | + // similar |
| 112 | + final String firstElement = distinctLinks.get(0).toString(); |
| 113 | + StringSimilarity similarity = new StringSimilarity(); |
| 114 | + List<URL> similarLinks = distinctLinks.stream().filter(elem -> similarity.isSimilar(firstElement, elem.toString())).collect(Collectors.toList()); |
| 115 | + if (similarLinks.size() == distinctLinks.size()) { |
| 116 | + return Optional.of(similarLinks.get(0)); |
| 117 | + } |
| 118 | + |
| 119 | + return Optional.empty(); |
92 | 120 | } |
93 | 121 |
|
94 | 122 | @Override |
|
0 commit comments