|
def chunk_by_sentences(input_text: str, tokenizer: callable):
    """
    Split the input text into sentences using the tokenizer's offset mapping.

    A sentence boundary is a '.' token that is either followed by whitespace
    (the next token's start offset lies after the '.' token's end offset) or
    is the last real token before the tokenizer's '[SEP]' marker.

    :param input_text: The text snippet to split into sentences
    :param tokenizer: The tokenizer to use; must support being called with
        ``return_offsets_mapping=True`` and provide ``convert_tokens_to_ids``
    :return: A tuple ``(chunks, span_annotations)`` where ``chunks`` is the
        list of sentence strings and ``span_annotations`` the corresponding
        (start_token_index, end_token_index) pairs
    """
    inputs = tokenizer(input_text, return_tensors='pt', return_offsets_mapping=True)
    punctuation_mark_id = tokenizer.convert_tokens_to_ids('.')
    sep_id = tokenizer.convert_tokens_to_ids('[SEP]')
    token_offsets = inputs['offset_mapping'][0]
    token_ids = inputs['input_ids'][0]
    chunk_positions = [
        (i, int(start + 1))
        for i, (token_id, (start, end)) in enumerate(zip(token_ids, token_offsets))
        if token_id == punctuation_mark_id
        # Bounds guard: a '.' as the very last token has no successor to
        # inspect; without this check the look-ahead below raises IndexError.
        and i + 1 < len(token_ids)
        and (
            # whitespace follows the '.' (gap between this token's end and
            # the next token's start) ...
            token_offsets[i + 1][0] - token_offsets[i][1] > 0
            # ... or the '.' is the final token before [SEP]
            or token_ids[i + 1] == sep_id
        )
    ]
    # Pair each boundary with the previous one; the (1, 0) sentinel makes the
    # first chunk start at character 0 / token index 1 (right after [CLS]).
    chunks = [
        input_text[x[1] : y[1]]
        for x, y in zip([(1, 0)] + chunk_positions[:-1], chunk_positions)
    ]
    span_annotations = [
        (x[0], y[0]) for (x, y) in zip([(1, 0)] + chunk_positions[:-1], chunk_positions)
    ]
    return chunks, span_annotations
| 30 | + |
| 31 | + |
def chunked_pooling(model_output: 'BatchEncoding', span_annotation: list):
    """
    Mean-pool token embeddings over annotated spans.

    For each sequence in the batch, averages the token embeddings within each
    (start, end) token span and detaches the result to a numpy array.

    :param model_output: Model output whose first element holds the token
        embeddings (batch, tokens, dim)
    :param span_annotation: Per-sequence lists of (start, end) token indices
    :return: A list (one entry per sequence) of lists of pooled numpy vectors
    """
    token_embeddings = model_output[0]
    pooled_outputs = []

    for seq_embeddings, spans in zip(token_embeddings, span_annotation):
        seq_pooled = []
        for begin, stop in spans:
            # Mean over the span's tokens, then move off-graph and off-device.
            span_mean = seq_embeddings[begin:stop].sum(dim=0) / (stop - begin)
            seq_pooled.append(span_mean.detach().cpu().numpy())
        pooled_outputs.append(seq_pooled)

    return pooled_outputs
0 commit comments