Skip to content

Commit 6b1548f

Browse files
committed
feat: add max-length, new tasks and fix corner cases
1 parent 2c2413a commit 6b1548f

File tree

4 files changed

+150
-31
lines changed

4 files changed

+150
-31
lines changed

README.md

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,10 +60,13 @@ We run this evaluation for various BeIR datasets with traditional chunking and o
6060
To split texts into chunks, we choose a straightforward method, which chunks the texts into strings of 256 tokens.
6161
Both the traditional and context-sensitive tests used the [jina-embeddings-v2-small-en](https://huggingface.co/jinaai/jina-embeddings-v2-small-en) model.
6262

63-
| Dataset | Traditional Chunking (nDCG@10) | Context-Sensitive Chunking (nDCG@10) |
64-
|-----------|--------------------------------|--------------------------------------|
65-
| SciFact | 64.20% | 66.10% |
66-
| TRECCOVID | TODO | TODO |
67-
68-
In (all|most|some) cases, context-sensitive chunking improved the score.
63+
| Dataset | AVG Document Length (characters) | Traditional Chunking (nDCG@10) | Context-Sensitive Chunking (nDCG@10) | No Chunking |
64+
|-----------|----------------------------------|--------------------------------|--------------------------------------|-------------|
65+
| SciFact | 1498.4 | 64.20% | **66.10%** | 63.89% |
66+
| TRECCOVID | 1116.7 | 63.36% | 64.70% | **65.18%** |
67+
| FiQA2018 | 767.2 | 33.25% | **33.84%** | 33.43% |
68+
| NFCorpus | 1589.8 | 23.46% | 29.98% | **30.40%** |
69+
| Quora | 62.2 | 87.19% | 87.19% | 87.19% |
70+
71+
In almost all cases, context-sensitive chunking improved the score (for Quora, whose documents are very short, the scores are identical). In some cases, it also outperforms encoding the whole document into a single embedding, while for other datasets, no chunking performs best. However, this only makes sense if one does not need to rank chunks. One can also see that the average length of the documents correlates with greater improvement in the nDCG scores through context-sensitive chunking.
6972

chunked_pooling/__init__.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,20 @@ def chunk_by_sentences(input_text: str, tokenizer: callable):
2929
return chunks, span_annotations
3030

3131

32-
def chunked_pooling(model_output: 'BatchEncoding', span_annotation: list):
32+
def chunked_pooling(
33+
model_output: 'BatchEncoding', span_annotation: list, max_length=None
34+
):
3335
token_embeddings = model_output[0]
3436
outputs = []
3537
for embeddings, annotations in zip(token_embeddings, span_annotation):
36-
if annotations[-1][1] > len(embeddings):
37-
raise RuntimeError(
38-
f'Not enough token embeddings {len(token_embeddings)} for your annotations {annotations}'
39-
)
38+
if (
39+
max_length is not None
40+
):  # remove annotations which go beyond the max-length of the model
41+
annotations = [
42+
(start, min(end, max_length - 1))
43+
for (start, end) in annotations
44+
if start < (max_length - 1)
45+
]
4046
pooled_embeddings = [
4147
embeddings[start:end].sum(dim=0) / (end - start)
4248
for start, end in annotations

chunked_pooling/chunked_eval_tasks.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,108 @@ def __init__(self, **kwargs):
7878
super().__init__(**kwargs)
7979

8080

81+
class NFCorpusChunked(AbsTaskChunkedRetrieval):
82+
metadata = TaskMetadata(
83+
name="NFCorpusChunked",
84+
dataset={
85+
"path": "mteb/nfcorpus",
86+
"revision": "ec0fa4fe99da2ff19ca1214b7966684033a58814",
87+
'name': 'NFCorpus',
88+
},
89+
description="NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval",
90+
reference="https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/",
91+
type="Retrieval",
92+
category="s2p",
93+
eval_splits=["test"],
94+
eval_langs=["eng-Latn"],
95+
main_score="ndcg_at_10",
96+
date=None,
97+
form=None,
98+
domains=None,
99+
task_subtypes=None,
100+
license=None,
101+
socioeconomic_status=None,
102+
annotations_creators=None,
103+
dialect=None,
104+
text_creation=None,
105+
bibtex_citation=None,
106+
n_samples=None,
107+
avg_character_length=None,
108+
)
109+
110+
def __init__(self, **kwargs):
111+
super().__init__(**kwargs)
112+
113+
114+
class QuoraChunked(AbsTaskChunkedRetrieval):
115+
metadata = TaskMetadata(
116+
name="QuoraChunked",
117+
dataset={
118+
"path": "mteb/quora",
119+
"revision": "e4e08e0b7dbe3c8700f0daef558ff32256715259",
120+
"name": "QuoraRetrieval",
121+
},
122+
description=(
123+
"QuoraRetrieval is based on questions that are marked as duplicates on the Quora platform. Given a"
124+
" question, find other (duplicate) questions."
125+
),
126+
reference="https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs",
127+
type="Retrieval",
128+
category="s2s",
129+
eval_splits=["dev", "test"],
130+
eval_langs=["eng-Latn"],
131+
main_score="ndcg_at_10",
132+
date=None,
133+
form=None,
134+
domains=None,
135+
task_subtypes=None,
136+
license=None,
137+
socioeconomic_status=None,
138+
annotations_creators=None,
139+
dialect=None,
140+
text_creation=None,
141+
bibtex_citation=None,
142+
n_samples=None,
143+
avg_character_length=None,
144+
)
145+
146+
def __init__(self, **kwargs):
147+
super().__init__(**kwargs)
148+
149+
150+
class FiQA2018Chunked(AbsTaskChunkedRetrieval):
151+
metadata = TaskMetadata(
152+
name="FiQA2018Chunked",
153+
description="Financial Opinion Mining and Question Answering",
154+
reference="https://sites.google.com/view/fiqa/",
155+
dataset={
156+
"path": "mteb/fiqa",
157+
"revision": "27a168819829fe9bcd655c2df245fb19452e8e06",
158+
'name': 'FiQA2018',
159+
},
160+
type="Retrieval",
161+
category="s2p",
162+
eval_splits=["train", "dev", "test"],
163+
eval_langs=["eng-Latn"],
164+
main_score="ndcg_at_10",
165+
date=None,
166+
form=None,
167+
domains=None,
168+
task_subtypes=None,
169+
license=None,
170+
socioeconomic_status=None,
171+
annotations_creators=None,
172+
dialect=None,
173+
text_creation=None,
174+
bibtex_citation=None,
175+
n_samples=None,
176+
avg_character_length=None,
177+
)
178+
179+
def __init__(self, **kwargs):
180+
super().__init__(**kwargs)
181+
182+
81183
class TRECCOVIDChunked(AbsTaskChunkedRetrieval):
82184
metadata = TaskMetadata(
83185
name='TRECCOVIDChunked',

chunked_pooling/mteb_chunked_eval.py

Lines changed: 28 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@
22
from typing import Any, Optional
33

44
import numpy as np
5+
import torch
56
from mteb.abstasks import AbsTask
67
from mteb.evaluation.evaluators import RetrievalEvaluator
78
from mteb.tasks import Retrieval
8-
99
from tqdm import tqdm
1010

1111
from chunked_pooling import chunked_pooling
@@ -78,7 +78,7 @@ def evaluate(self, model, split='test', **kwargs):
7878
return scores
7979

8080
def _evaluate_monolingual(
81-
self, model, corpus, queries, relevant_docs, lang=None, batch_size=8, **kwargs
81+
self, model, corpus, queries, relevant_docs, lang=None, batch_size=1, **kwargs
8282
):
8383
# split corpus into chunks
8484
if not self.chunked_pooling_enabled:
@@ -118,24 +118,32 @@ def _evaluate_monolingual(
118118
]
119119

120120
corpus_embs = []
121-
for inputs in tqdm(
122-
self._batch_inputs(
123-
list(zip(corpus_texts, chunk_annotations)), batch_size=batch_size
124-
),
125-
total=(len(corpus_texts) // batch_size),
126-
):
127-
text_inputs = [x[0] for x in inputs]
128-
annotations = [x[1] for x in inputs]
129-
model_inputs = self.tokenizer(
130-
text_inputs, return_tensors='pt', padding=True
131-
)
132-
if model.device.type == 'cuda':
133-
model_inputs = {
134-
k: v.to(model.device) for k, v in model_inputs.items()
135-
}
136-
model_outputs = model(**model_inputs)
137-
138-
corpus_embs.extend(chunked_pooling(model_outputs, annotations))
121+
with torch.no_grad():
122+
for inputs in tqdm(
123+
self._batch_inputs(
124+
list(zip(corpus_texts, chunk_annotations)),
125+
batch_size=batch_size,
126+
),
127+
total=(len(corpus_texts) // batch_size),
128+
):
129+
text_inputs = [x[0] for x in inputs]
130+
annotations = [x[1] for x in inputs]
131+
model_inputs = self.tokenizer(
132+
text_inputs,
133+
return_tensors='pt',
134+
padding=True,
135+
truncation=True,
136+
max_length=8192,
137+
)
138+
if model.device.type == 'cuda':
139+
model_inputs = {
140+
k: v.to(model.device) for k, v in model_inputs.items()
141+
}
142+
model_outputs = model(**model_inputs)
143+
output_embs = chunked_pooling(
144+
model_outputs, annotations, max_length=8192
145+
)
146+
corpus_embs.extend(output_embs)
139147

140148
max_chunks = max([len(x) for x in corpus_embs])
141149
k_values = self._calculate_k_values(max_chunks)

0 commit comments

Comments
 (0)