Skip to content

Commit 440beb3

Browse files
committed
feat: add example for chunked pooling
1 parent b588479 commit 440beb3

File tree

3 files changed

+287
-0
lines changed

3 files changed

+287
-0
lines changed

chunked_pooling/__init__.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
def chunk_by_sentences(input_text: str, tokenizer: callable):
    """
    Split the input text into sentences using the tokenizer.

    A sentence boundary is a '.' token that is the last token, is followed
    by the '[SEP]' token, or is followed by a character-offset gap
    (i.e. whitespace after the period in the original text).

    :param input_text: The text snippet to split into sentences
    :param tokenizer: The tokenizer to use; must support
        ``return_offsets_mapping=True``
    :return: A tuple containing the list of text chunks and their
        corresponding (start, end) token spans
    """
    inputs = tokenizer(input_text, return_tensors='pt', return_offsets_mapping=True)
    punctuation_mark_id = tokenizer.convert_tokens_to_ids('.')
    sep_id = tokenizer.convert_tokens_to_ids('[SEP]')
    token_offsets = inputs['offset_mapping'][0]
    token_ids = inputs['input_ids'][0]
    last_index = len(token_ids) - 1
    chunk_positions = [
        (i, int(start + 1))
        for i, (token_id, (start, end)) in enumerate(zip(token_ids, token_offsets))
        if token_id == punctuation_mark_id
        and (
            # Guard first: a trailing '.' is a boundary, and checking it
            # here prevents an IndexError on the i + 1 lookups below.
            i == last_index
            or token_ids[i + 1] == sep_id
            # Offset gap between '.' and the next token means whitespace
            # followed the period in the original text.
            or token_offsets[i + 1][0] - token_offsets[i][1] > 0
        )
    ]
    # Pair each boundary with the previous one; the sentinel (1, 0) starts
    # the first chunk at character 0 / token 1 (skipping [CLS]).
    chunks = [
        input_text[x[1] : y[1]]
        for x, y in zip([(1, 0)] + chunk_positions[:-1], chunk_positions)
    ]
    span_annotations = [
        (x[0], y[0]) for (x, y) in zip([(1, 0)] + chunk_positions[:-1], chunk_positions)
    ]
    return chunks, span_annotations
def chunked_pooling(model_output: 'BatchEncoding', span_annotation: list):
    """
    Mean-pool token embeddings over per-sequence chunk spans.

    :param model_output: model forward output; index 0 holds the token-level
        embeddings (one row of token vectors per batch item)
    :param span_annotation: for each batch item, a list of (start, end)
        token-index spans to pool over
    :return: for each batch item, a list of pooled numpy vectors, one per span
    """
    token_embeddings = model_output[0]
    outputs = []

    for seq_embeddings, spans in zip(token_embeddings, span_annotation):
        pooled_vectors = []
        for start, end in spans:
            # Average of the token vectors inside [start, end).
            mean_vector = seq_embeddings[start:end].sum(dim=0) / (end - start)
            pooled_vectors.append(mean_vector.detach().cpu().numpy())
        outputs.append(pooled_vectors)

    return outputs

0 commit comments

Comments
 (0)