Description
Some users may have noticed discrepancies between evaluation results obtained using the MTEB and CoIR frameworks. The primary reason for this difference lies in the handling of the title field in certain datasets within the CoIR benchmark.
In the MTEB evaluation framework, the title field is utilized by default, whereas CoIR does not incorporate this field in its standard evaluation. As a result, models evaluated under the MTEB framework may exhibit higher performance scores compared to CoIR. In the CoIR paper, the reported results exclude the title field.
How to Use the title Field in CoIR
To include the title field in CoIR evaluations, you can modify the encoding process as follows:
from transformers import AutoModel
import argparse
import logging
import coir
from coir.evaluation import COIR
from typing import List, Dict
import torch
from tqdm import tqdm
import numpy as np
# Logging setup
# Configure the root logger once at import time: INFO level, timestamped lines.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def get_detailed_instruct(task_description: str, query: str) -> str:
    """Compose the instruction-prefixed prompt the embedding model expects.

    Output format: "Instruct: <task>\nQuery: <query>".
    """
    return "Instruct: {}\nQuery: {}".format(task_description, query)
# Load the remote model with trust_remote_code=True.
# NOTE(review): trust_remote_code executes Python shipped with the checkpoint;
# only enable it for checkpoints you trust (here, the official Salesforce repo).
base_model = AutoModel.from_pretrained(
    "Salesforce/SFR-Embedding-Code-2B_R", trust_remote_code=True
)
# Define a subclass of the loaded model's class so we can override the
# CoIR encoding entry points (encode_queries / encode_corpus).
class CustomCodeXEmbedModel2B(base_model.__class__):
    """Model wrapper that makes CoIR evaluation include the corpus `title` field.

    CoIR's standard evaluation ignores `title` while MTEB includes it;
    overriding `encode_corpus` to prepend the title reproduces MTEB-style
    scoring (the subject of this issue).
    """
    # NOTE: the original passthrough __init__ (super().__init__ only) was
    # redundant and has been removed; inherited construction is unchanged.

    def encode_text(self, texts: List[str], batch_size: int = 12, max_length: int = 1024) -> np.ndarray:
        """Tokenize and embed `texts` in batches.

        Returns an (N, dim) float32 numpy array, one row per input text.
        """
        logging.info(f"Encoding {len(texts)} texts...")
        embeddings = []
        for i in tqdm(range(0, len(texts), batch_size), desc="Encoding batches", unit="batch"):
            batch_texts = texts[i:i + batch_size]
            encoded_input = self.tokenizer(
                batch_texts,
                padding=True,
                add_special_tokens=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt"
            )
            # Move inputs to the same device as the model parameters.
            device = next(self.model.parameters()).device
            encoded_input = {key: val.to(device) for key, val in encoded_input.items()}
            with torch.no_grad():
                model_output = self.model(**encoded_input)
            # last_token_pool is presumably provided by the remote model class
            # loaded with trust_remote_code — TODO confirm against the checkpoint.
            batch_embeddings = self.last_token_pool(model_output, encoded_input['attention_mask'])
            embeddings.append(batch_embeddings.cpu())
        embeddings = torch.cat(embeddings, dim=0)
        logging.info(f"Encoded {embeddings.shape[0]} embeddings.")
        return embeddings.to(dtype=torch.float32).numpy()

    def encode_queries(self, queries: List[str], batch_size: int = 12, max_length: int = 1024, **kwargs) -> np.ndarray:
        """Prefix each query string with the retrieval instruction, then embed.

        (Annotation fixed: queries are plain strings — they are interpolated
        directly into the instruction template — not dicts.)
        """
        task_description = "Given Code or Text, retrieve relevant content."
        all_queries = [get_detailed_instruct(task_description, query) for query in queries]
        return self.encode_text(all_queries, batch_size, max_length)

    def encode_corpus(self, corpus: List[Dict[str, str]], batch_size: int = 12, max_length: int = 1024, **kwargs) -> np.ndarray:
        """Embed corpus documents with the `title` field prepended to `text`.

        Uses dict.get so documents without a `title` key (or with an empty
        title) fall back to `text` alone instead of raising KeyError.
        """
        all_texts = [
            (doc["title"] + " " + doc["text"]) if doc.get("title") else doc["text"]
            for doc in corpus
        ]
        return self.encode_text(all_texts, batch_size, max_length)
# Instantiate the custom model.
# Re-downloads/loads the same checkpoint, but constructed through the subclass
# so the overridden encode_queries/encode_corpus are used during evaluation.
custom_model = CustomCodeXEmbedModel2B.from_pretrained(
    "Salesforce/SFR-Embedding-Code-2B_R", trust_remote_code=True
)
# Define the CoIR datasets to evaluate.
# FIX: "stackoverflow-qa" was listed twice in the original, which would run
# that evaluation twice; the list is now duplicate-free.
datasets = [
    "codetrans-dl", "stackoverflow-qa", "apps", "codefeedback-mt",
    "codefeedback-st", "cosqa", "synthetic-text2sql",
    "codesearchnet", "codesearchnet-ccr",
]
for dataset in datasets:
tasks = coir.get_tasks(tasks=[dataset])
evaluation = COIR(tasks=tasks, batch_size=16)
results = evaluation.run(custom_model, output_folder='results')
logging.info(f"Results for {dataset}: {results}")This issue is specific to the current implementation. In future versions of the coir package, the title field will be included as a default component in the evaluation process.