|
| 1 | +import re |
| 2 | +import json |
| 3 | +from pathlib import Path |
| 4 | +from collections import Counter |
| 5 | +from sota_extractor2.data.elastic import Reference2 |
| 6 | +from elasticsearch_dsl import connections |
| 7 | +from sota_extractor2.data.references import PReference, PAuthor, ReferenceStore |
| 8 | +from tqdm import tqdm |
| 9 | +from elasticsearch.helpers import bulk |
| 10 | +from elasticsearch_dsl.connections import connections |
| 11 | +import http.client |
| 12 | +import xml.etree.ElementTree as ET |
| 13 | + |
# required for bulk saving: large bulk responses can exceed http.client's
# default header limit (100), which would abort the indexing run.
http.client._MAXHEADERS = 1000

# Single shared Elasticsearch connection used by all elasticsearch_dsl
# documents and by the bulk helper below.
connections.create_connection(hosts=['elasticsearch'], timeout=20)

# Papers-with-Code dump (list of paper dicts: titles, abstracts, arxiv ids, ...)
papers_path = Path("/data/dblp/papers/papers-with-abstracts.json")
| 20 | + |
| 21 | + |
def read_pwc_papers(path):
    """Load the Papers-with-Code dump from *path*.

    Returns the parsed JSON content (a list of paper dicts).
    """
    # Explicit encoding: the dump is UTF-8 and parsing must not depend on
    # the container's locale settings.
    with open(path, "rt", encoding="utf-8") as f:
        return json.load(f)
| 25 | + |
| 26 | + |
# URL patterns for extracting arXiv ids and Papers-with-Code slugs.
# Dots in host names are escaped: previously "." matched any character,
# so e.g. "arxivXorg" would have been accepted as a valid host.
# arxiv_url_re also accepts a bare new-style id ("1706.03762"); the
# *_only_* variant requires a full arxiv.org URL.
arxiv_url_re = re.compile(r"^(?:https?://(?:www\.)?arxiv\.org/(?:abs|pdf|e-print)/)?(?P<arxiv_id>\d{4}\.\d+)(?:v\d+)?(?:\.pdf)?$")
arxiv_url_only_re = re.compile(r"^(?:https?://(?:www\.)?arxiv\.org/(?:abs|pdf|e-print)/)(?P<arxiv_id>\d{4}\.\d+)(?:v\d+)?(?:\.pdf)?$")
pwc_url_re = re.compile(r"^(?:https?://(?:www\.)?)paperswithcode\.com/paper/(?P<slug>[^/]*)/?$")
| 30 | + |
| 31 | + |
def from_paper_dict(paper):
    """Build a PReference from one Papers-with-Code JSON record."""
    authors = [
        PAuthor.from_fullname(name)
        for name in paper["authors"]
        if name.strip()
    ]

    # Prefer an explicit arxiv id; otherwise try to pull one out of the
    # abstract-page URL.
    arxiv_id = None
    if paper["arxiv_id"]:
        arxiv_id = paper["arxiv_id"]
    elif paper["url_abs"]:
        match = arxiv_url_re.match(paper["url_abs"])
        if match:
            arxiv_id = match.group("arxiv_id")

    # Strip trailing periods/spaces so titles compare consistently.
    title = paper["title"].rstrip(" .") if paper["title"] else None

    slug = None
    if paper["paper_url"]:
        match = pwc_url_re.match(paper["paper_url"])
        if match:
            slug = match.group("slug")

    return PReference(
        title=title,
        authors=authors,
        ptr=paper["url_pdf"] or paper["url_abs"],
        arxiv_id=arxiv_id,
        pwc_slug=slug,
        date=paper["date"],
        orig_ref=f"{', '.join(paper['authors'])}. {paper['title']}.",
    )
| 58 | + |
| 59 | + |
| 60 | +def _text(elem): return "".join(elem.itertext()) |
| 61 | + |
| 62 | + |
def from_paper_elem(elem):
    """Build a PReference from one DBLP XML record element."""
    author_names = [_text(a).strip() for a in elem.findall("author")]
    author_names = [name for name in author_names if name]
    authors = [PAuthor.from_fullname(name) for name in author_names]

    url = None
    arxiv_id = None
    for ee in elem.findall("ee"):
        ee_url = _text(ee)
        # Prefer open-access urls over whatever link came first...
        if url is None or "oa" in ee.attrib:
            url = ee_url
        # ...but an arXiv link beats everything: keep it and stop looking.
        match = arxiv_url_only_re.match(ee_url)
        if match:
            url = ee_url
            arxiv_id = match.group("arxiv_id")
            break

    title_elem = elem.find("title")
    title = _text(title_elem).rstrip(" .") if title_elem is not None else None

    return PReference(
        title=title,
        authors=authors,
        ptr=url,
        arxiv_id=arxiv_id,
        orig_ref=f"{', '.join(author_names)}. {title}.",
    )
| 88 | + |
| 89 | + |
def merge_references(p_references, elastic_references):
    """Fold parsed references into *elastic_references*, keyed by unique id.

    Unknown ids get a fresh Reference2 document; existing ones are extended
    via add_ref.  Mutates *elastic_references* in place.
    """
    occurrences = Counter(ref.unique_id() for ref in p_references)
    for p_ref in tqdm(p_references):
        uid = p_ref.unique_id()
        # Ignore papers whose id is suspiciously common -- these tend to be
        # generic titles such as "Editorial", "Preface", "Letter".
        if occurrences[uid] > 5:
            continue
        e_ref = elastic_references.get(uid)
        if not e_ref:
            e_ref = Reference2.from_ref(p_ref)
            elastic_references[uid] = e_ref
        e_ref.add_ref(p_ref)
| 103 | + |
| 104 | + |
def save_all(docs):
    """Bulk-index *docs* into Elasticsearch, 500 documents per request."""
    actions = (doc.to_dict(True) for doc in docs)
    bulk(connections.get_connection(), actions, chunk_size=500)
| 107 | + |
| 108 | + |
def get_elastic_references(unique_ids, chunk_size=1000):
    """Fetch existing Reference2 docs for *unique_ids* in mget batches.

    Returns a dict mapping unique id -> document; ids with no existing
    document are omitted.
    """
    found = {}
    for start in range(0, len(unique_ids), chunk_size):
        batch = unique_ids[start:start + chunk_size]
        for uid, ref in zip(batch, Reference2.mget(batch)):
            if ref:
                found[uid] = ref
    return found
| 120 | + |
| 121 | + |
def init_pwc():
    """Index all Papers-with-Code references into Elasticsearch."""
    # read list of ML papers (titles, abstracts, arxiv ids, etc.)
    raw_papers = read_pwc_papers(papers_path)

    # parse every record, then drop the ones without a usable unique id
    p_references = [from_paper_dict(paper) for paper in raw_papers]
    p_references = [ref for ref in p_references if ref.unique_id()]

    unique_ids = list({ref.unique_id() for ref in p_references})
    elastic_references = get_elastic_references(unique_ids)
    merge_references(p_references, elastic_references)
    save_all(elastic_references.values())
| 136 | + |
| 137 | + |
def init_dblp():
    """Index all DBLP references into Elasticsearch."""
    # dblp_xml = ET.parse(str(Path("/data") / "dblp" / "dblp-small-noent.xml"))
    dblp_xml = ET.parse(str(Path("/data") / "dblp" / "dblp-noent.xml"))
    root = dblp_xml.getroot()

    p_references = [from_paper_elem(elem) for elem in root]
    p_references = [ref for ref in p_references if ref.unique_id()]

    all_ids = list({ref.unique_id() for ref in p_references})
    # TODO: add references2 index initialization, then re-enable
    # get_elastic_references(all_ids) instead of starting from scratch
    elastic_references = {}

    merge_references(p_references, elastic_references)
    save_all(elastic_references.values())
| 151 | + |
# Script entry: ensure the index mapping exists, then ingest both sources.
# Reference2._index.delete()  # uncomment to drop and rebuild from scratch
Reference2.init()
init_dblp()
init_pwc()
0 commit comments