Source code for saga_llm_evaluation.helpers.retrieval_metrics

from langchain.evaluation import load_evaluator
from llama_index.core.evaluation import RetrieverEvaluator


class Relevance:
    def __init__(self, llm=None) -> None:
        """
        Relevance computes the relevance score of retrieved information relative to the query.
        In other words, it validates that the retrieved information is related to the query topic.

        Args:
            llm (LangChain BaseLanguageModel): model used for evaluation.
                If None, the model defaults to "gpt-4".
        """
        self.llm = llm
        self.criterion = {
            "retrieval_relevance": "Are the retrieved information related to the query?",
        }
        self.evaluator = load_evaluator(
            "criteria", criteria=self.criterion, llm=self.llm
        )
    def compute(self, contexts: list, query: str):
        """
        Computes the relevance score of the retrieved information relative to the query.
        In other words, it validates that the retrieved information is related to the query topic.

        Args:
            contexts (list): List of retrieved information.
            query (str): Query topic.

        Returns:
            dict: Relevance score for the candidate sentence. The dictionary contains
            the following keys:

            - score (int): Relevance score. Binary integer value (0 or 1), where 1
              indicates that the sentence is relevant and 0 that it is irrelevant.
            - value (str): Relevance value. "Y" or "N", where "Y" indicates that the
              sentence is relevant and "N" that it is irrelevant.
            - reasoning (str): Reasoning for the relevance score.
        """
        result = self.evaluator.evaluate_strings(prediction=contexts, input=query)
        return result
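
# Example usage (illustrative sketch, not part of this module). The model choice,
# the ChatOpenAI import path, and the sample query/contexts are assumptions made
# for the sake of the example:
#
#   from langchain_openai import ChatOpenAI
#   from saga_llm_evaluation.helpers.retrieval_metrics import Relevance
#
#   relevance = Relevance(llm=ChatOpenAI(model="gpt-4"))
#   result = relevance.compute(
#       contexts=["Paris has been the capital of France since 508 AD."],
#       query="What is the capital of France?",
#   )
#   print(result["score"], result["value"], result["reasoning"])
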
class Accuracy:
    def __init__(self, index, k=2) -> None:
        """
        Accuracy computes the accuracy of the retrieved information relative to the query.
        In other words, it validates that the retrieved information is correct.

        Args:
            index (llama_index.core.index.Index): Index containing the information to retrieve.
            k (int, optional): Number of similar items to retrieve. Defaults to 2.
        """
        self.retriever = index.as_retriever(similarity_top_k=k)
        self.retriever_evaluator = RetrieverEvaluator.from_metric_names(
            ["mrr", "hit_rate", "precision", "recall"], retriever=self.retriever
        )
    def compute(self, query: str, expected_ids: list) -> dict:
        """
        Computes a series of metrics for the retrieved information relative to the query.

        Args:
            query (str): Query topic.
            expected_ids (list): List of expected IDs of the retrieved information
                (nodes of the index).

        Returns:
            dict: Dictionary containing the following metrics:

            - mrr (float): Mean Reciprocal Rank (MRR).
            - hit_rate (float): Hit rate.
            - precision (float): Precision.
            - recall (float): Recall.
        """
        eval_result = self.retriever_evaluator.evaluate(
            query=query, expected_ids=expected_ids
        )
        return eval_result.metric_vals_dict
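
# Example usage (illustrative sketch, not part of this module). The nodes, their
# IDs, and the query are assumptions; building the index also assumes an embedding
# model is configured (OpenAI by default in llama_index):
#
#   from llama_index.core import VectorStoreIndex
#   from llama_index.core.schema import TextNode
#   from saga_llm_evaluation.helpers.retrieval_metrics import Accuracy
#
#   nodes = [
#       TextNode(text="Paris is the capital of France.", id_="node-paris"),
#       TextNode(text="Berlin is the capital of Germany.", id_="node-berlin"),
#   ]
#   index = VectorStoreIndex(nodes)
#
#   accuracy = Accuracy(index, k=2)
#   metrics = accuracy.compute(
#       query="What is the capital of France?",
#       expected_ids=["node-paris"],
#   )
#   print(metrics)  # keys: "mrr", "hit_rate", "precision", "recall"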