|
def chunk_by_sentences(input_text: str, tokenizer: callable):
    """
    Split the input text into sentences using the tokenizer's offset mapping.

    A sentence boundary is a '.' token that is either followed by whitespace
    (the next token's start offset lies after the '.' token's end offset) or
    is the last real token before the tokenizer's '[SEP]' marker.

    :param input_text: The text snippet to split into sentences
    :param tokenizer: The tokenizer to use; must support being called with
        ``return_offsets_mapping=True`` and provide ``convert_tokens_to_ids``
    :return: A tuple ``(chunks, span_annotations)`` where ``chunks`` is the
        list of sentence strings and ``span_annotations`` the corresponding
        (start_token_index, end_token_index) pairs
    """
    inputs = tokenizer(input_text, return_tensors='pt', return_offsets_mapping=True)
    punctuation_mark_id = tokenizer.convert_tokens_to_ids('.')
    sep_id = tokenizer.convert_tokens_to_ids('[SEP]')
    token_offsets = inputs['offset_mapping'][0]
    token_ids = inputs['input_ids'][0]
    chunk_positions = [
        (i, int(start + 1))
        for i, (token_id, (start, end)) in enumerate(zip(token_ids, token_offsets))
        if token_id == punctuation_mark_id
        # Bounds guard: a '.' as the very last token has no successor to
        # inspect; without this check the look-ahead below raises IndexError.
        and i + 1 < len(token_ids)
        and (
            # whitespace follows the '.' (gap between this token's end and
            # the next token's start) ...
            token_offsets[i + 1][0] - token_offsets[i][1] > 0
            # ... or the '.' is the final token before [SEP]
            or token_ids[i + 1] == sep_id
        )
    ]
    # Pair each boundary with the previous one; the (1, 0) sentinel makes the
    # first chunk start at character 0 / token index 1 (right after [CLS]).
    chunks = [
        input_text[x[1] : y[1]]
        for x, y in zip([(1, 0)] + chunk_positions[:-1], chunk_positions)
    ]
    span_annotations = [
        (x[0], y[0]) for (x, y) in zip([(1, 0)] + chunk_positions[:-1], chunk_positions)
    ]
    return chunks, span_annotations
| 30 | + |
| 31 | + |
def chunked_pooling(model_output: 'BatchEncoding', span_annotation: list):
    """
    Mean-pool token embeddings over annotated spans.

    For each sequence in the batch, averages the token embeddings within each
    (start, end) token span and detaches the result to a numpy array.

    :param model_output: Model output whose first element holds the token
        embeddings (batch, tokens, dim)
    :param span_annotation: Per-sequence lists of (start, end) token indices
    :return: A list (one entry per sequence) of lists of pooled numpy vectors
    """
    token_embeddings = model_output[0]
    pooled_outputs = []

    for seq_embeddings, spans in zip(token_embeddings, span_annotation):
        seq_pooled = []
        for begin, stop in spans:
            # Mean over the span's tokens, then move off-graph and off-device.
            span_mean = seq_embeddings[begin:stop].sum(dim=0) / (stop - begin)
            seq_pooled.append(span_mean.detach().cpu().numpy())
        pooled_outputs.append(seq_pooled)

    return pooled_outputs
0 commit comments