|
| 1 | +import re |
| 2 | +import json |
| 3 | +from pathlib import Path |
| 4 | +from collections import Counter |
| 5 | +from sota_extractor2.data.elastic import Reference2 |
| 6 | +from elasticsearch_dsl import connections |
| 7 | +from sota_extractor2.data.references import PReference, PAuthor, ReferenceStore |
| 8 | +from tqdm import tqdm |
| 9 | +from elasticsearch.helpers import bulk |
| 10 | +from elasticsearch_dsl.connections import connections |
| 11 | +import http.client |
| 12 | +import xml.etree.ElementTree as ET |
| 13 | + |
# required for bulk saving: large bulk responses can exceed http.client's
# default header limit (100), which would abort the indexing run.
http.client._MAXHEADERS = 1000

# Single shared Elasticsearch connection used by all elasticsearch_dsl
# documents and by the bulk helper below.
connections.create_connection(hosts=['elasticsearch'], timeout=20)

# Papers-with-Code dump (list of paper dicts: titles, abstracts, arxiv ids, ...)
papers_path = Path("/data/dblp/papers/papers-with-abstracts.json")
| 20 | + |
| 21 | + |
def read_pwc_papers(path):
    """Load the Papers-with-Code dump from *path*.

    Returns the parsed JSON content (a list of paper dicts).
    """
    # Explicit encoding: the dump is UTF-8 and parsing must not depend on
    # the container's locale settings.
    with open(path, "rt", encoding="utf-8") as f:
        return json.load(f)
| 25 | + |
| 26 | + |
# URL patterns for extracting arXiv ids and Papers-with-Code slugs.
# Dots in host names are escaped: previously "." matched any character,
# so e.g. "arxivXorg" would have been accepted as a valid host.
# arxiv_url_re also accepts a bare new-style id ("1706.03762"); the
# *_only_* variant requires a full arxiv.org URL.
arxiv_url_re = re.compile(r"^(?:https?://(?:www\.)?arxiv\.org/(?:abs|pdf|e-print)/)?(?P<arxiv_id>\d{4}\.\d+)(?:v\d+)?(?:\.pdf)?$")
arxiv_url_only_re = re.compile(r"^(?:https?://(?:www\.)?arxiv\.org/(?:abs|pdf|e-print)/)(?P<arxiv_id>\d{4}\.\d+)(?:v\d+)?(?:\.pdf)?$")
pwc_url_re = re.compile(r"^(?:https?://(?:www\.)?)paperswithcode\.com/paper/(?P<slug>[^/]*)/?$")
| 30 | + |
| 31 | + |
def from_paper_dict(paper):
    """Build a PReference from one Papers-with-Code JSON record."""
    authors = [
        PAuthor.from_fullname(name)
        for name in paper["authors"]
        if name.strip()
    ]

    # Prefer an explicit arxiv id; otherwise try to pull one out of the
    # abstract-page URL.
    arxiv_id = None
    if paper["arxiv_id"]:
        arxiv_id = paper["arxiv_id"]
    elif paper["url_abs"]:
        match = arxiv_url_re.match(paper["url_abs"])
        if match:
            arxiv_id = match.group("arxiv_id")

    # Strip trailing periods/spaces so titles compare consistently.
    title = paper["title"].rstrip(" .") if paper["title"] else None

    slug = None
    if paper["paper_url"]:
        match = pwc_url_re.match(paper["paper_url"])
        if match:
            slug = match.group("slug")

    return PReference(
        title=title,
        authors=authors,
        ptr=paper["url_pdf"] or paper["url_abs"],
        arxiv_id=arxiv_id,
        pwc_slug=slug,
        date=paper["date"],
        orig_ref=f"{', '.join(paper['authors'])}. {paper['title']}.",
    )
| 58 | + |
| 59 | + |
| 60 | +def _text(elem): return "".join(elem.itertext()) |
| 61 | + |
| 62 | + |
def from_paper_elem(elem):
    """Build a PReference from one DBLP XML record element."""
    author_names = [_text(a).strip() for a in elem.findall("author")]
    author_names = [name for name in author_names if name]
    authors = [PAuthor.from_fullname(name) for name in author_names]

    url = None
    arxiv_id = None
    for ee in elem.findall("ee"):
        ee_url = _text(ee)
        # Prefer open-access urls over whatever link came first...
        if url is None or "oa" in ee.attrib:
            url = ee_url
        # ...but an arXiv link beats everything: keep it and stop looking.
        match = arxiv_url_only_re.match(ee_url)
        if match:
            url = ee_url
            arxiv_id = match.group("arxiv_id")
            break

    title_elem = elem.find("title")
    title = _text(title_elem).rstrip(" .") if title_elem is not None else None

    return PReference(
        title=title,
        authors=authors,
        ptr=url,
        arxiv_id=arxiv_id,
        orig_ref=f"{', '.join(author_names)}. {title}.",
    )
| 88 | + |
| 89 | + |
def merge_references(p_references, elastic_references):
    """Fold parsed references into *elastic_references*, keyed by unique id.

    Unknown ids get a fresh Reference2 document; existing ones are extended
    via add_ref.  Mutates *elastic_references* in place.
    """
    occurrences = Counter(ref.unique_id() for ref in p_references)
    for p_ref in tqdm(p_references):
        uid = p_ref.unique_id()
        # Ignore papers whose id is suspiciously common -- these tend to be
        # generic titles such as "Editorial", "Preface", "Letter".
        if occurrences[uid] > 5:
            continue
        e_ref = elastic_references.get(uid)
        if not e_ref:
            e_ref = Reference2.from_ref(p_ref)
            elastic_references[uid] = e_ref
        e_ref.add_ref(p_ref)
| 103 | + |
| 104 | + |
def save_all(docs):
    """Bulk-index *docs* into Elasticsearch, 500 documents per request."""
    actions = (doc.to_dict(True) for doc in docs)
    bulk(connections.get_connection(), actions, chunk_size=500)
| 107 | + |
| 108 | + |
def get_elastic_references(unique_ids, chunk_size=1000):
    """Fetch existing Reference2 docs for *unique_ids* in mget batches.

    Returns a dict mapping unique id -> document; ids with no existing
    document are omitted.
    """
    found = {}
    for start in range(0, len(unique_ids), chunk_size):
        batch = unique_ids[start:start + chunk_size]
        for uid, ref in zip(batch, Reference2.mget(batch)):
            if ref:
                found[uid] = ref
    return found
| 120 | + |
| 121 | + |
def init_pwc():
    """Index all Papers-with-Code references into Elasticsearch."""
    # read list of ML papers (titles, abstracts, arxiv ids, etc.)
    raw_papers = read_pwc_papers(papers_path)

    # parse every record, then drop the ones without a usable unique id
    p_references = [from_paper_dict(paper) for paper in raw_papers]
    p_references = [ref for ref in p_references if ref.unique_id()]

    unique_ids = list({ref.unique_id() for ref in p_references})
    elastic_references = get_elastic_references(unique_ids)
    merge_references(p_references, elastic_references)
    save_all(elastic_references.values())
| 136 | + |
| 137 | + |
def init_dblp():
    """Index all DBLP references into Elasticsearch."""
    # dblp_xml = ET.parse(str(Path("/data") / "dblp" / "dblp-small-noent.xml"))
    dblp_xml = ET.parse(str(Path("/data") / "dblp" / "dblp-noent.xml"))
    root = dblp_xml.getroot()

    p_references = [from_paper_elem(elem) for elem in root]
    p_references = [ref for ref in p_references if ref.unique_id()]

    all_ids = list({ref.unique_id() for ref in p_references})
    # TODO: add references2 index initialization, then re-enable
    # get_elastic_references(all_ids) instead of starting from scratch
    elastic_references = {}

    merge_references(p_references, elastic_references)
    save_all(elastic_references.values())
| 151 | + |
# Script entry: ensure the index mapping exists, then ingest both sources.
# Reference2._index.delete()  # uncomment to drop and rebuild from scratch
Reference2.init()
init_dblp()
init_pwc()
0 commit comments