Source code for saga_llm_evaluation.helpers.retrieval_metrics

from langchain.evaluation import load_evaluator
from llama_index.core.evaluation import RetrieverEvaluator


class Relevance:
    def __init__(self, llm=None) -> None:
        """
        Relevance computes the relevance score of retrieved information relative to the query.
        In other words, it validates that the retrieved information is related to the query topic.

        Args:
            llm (LangChain BaseLanguageModel): model used for evaluation.
                If None, the model defaults to "gpt-4".
        """
        self.llm = llm
        self.criterion = {
            "retrieval_relevance": "Are the retrieved information related to the query?",
        }
        self.evaluator = load_evaluator(
            "criteria", criteria=self.criterion, llm=self.llm
        )
    def compute(self, contexts: list, query: str):
        """
        Computes the relevance score of the retrieved information relative to the query.
        In other words, it validates that the retrieved information is related to the query topic.

        Args:
            contexts (list): List of retrieved information.
            query (str): Query topic.

        Returns:
            dict: Relevance score for the candidate sentence. The dictionary contains
            the following keys:

            - score (int): Relevance score. Binary integer value (0 or 1), where 1
              indicates that the sentence is relevant and 0 that it is irrelevant.
            - value (str): Relevance value. "Y" or "N", where "Y" indicates that the
              sentence is relevant and "N" that it is irrelevant.
            - reasoning (str): Reasoning for the relevance score.
        """
        result = self.evaluator.evaluate_strings(prediction=contexts, input=query)
        return result
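
# Example usage (illustrative sketch, not part of this module). The model choice,
# the ChatOpenAI import path, and the sample query/contexts are assumptions made
# for the sake of the example:
#
#   from langchain_openai import ChatOpenAI
#   from saga_llm_evaluation.helpers.retrieval_metrics import Relevance
#
#   relevance = Relevance(llm=ChatOpenAI(model="gpt-4"))
#   result = relevance.compute(
#       contexts=["Paris has been the capital of France since 508 AD."],
#       query="What is the capital of France?",
#   )
#   print(result["score"], result["value"], result["reasoning"])
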
class Accuracy:
    def __init__(self, index, k=2) -> None:
        """
        Accuracy computes the accuracy of the retrieved information relative to the query.
        In other words, it validates that the retrieved information is correct.

        Args:
            index (llama_index.core.index.Index): Index containing the information to retrieve.
            k (int, optional): Number of similar items to retrieve. Defaults to 2.
        """
        self.retriever = index.as_retriever(similarity_top_k=k)
        self.retriever_evaluator = RetrieverEvaluator.from_metric_names(
            ["mrr", "hit_rate", "precision", "recall"], retriever=self.retriever
        )
    def compute(self, query: str, expected_ids: list) -> dict:
        """
        Computes a series of metrics for the retrieved information relative to the query.

        Args:
            query (str): Query topic.
            expected_ids (list): List of expected IDs of the retrieved information
                (nodes of the index).

        Returns:
            dict: Dictionary containing the following metrics:

            - mrr (float): Mean Reciprocal Rank (MRR).
            - hit_rate (float): Hit rate.
            - precision (float): Precision.
            - recall (float): Recall.
        """
        eval_result = self.retriever_evaluator.evaluate(
            query=query, expected_ids=expected_ids
        )
        return eval_result.metric_vals_dict
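
# Example usage (illustrative sketch, not part of this module). The nodes, their
# IDs, and the query are assumptions; building the index also assumes an embedding
# model is configured (OpenAI by default in llama_index):
#
#   from llama_index.core import VectorStoreIndex
#   from llama_index.core.schema import TextNode
#   from saga_llm_evaluation.helpers.retrieval_metrics import Accuracy
#
#   nodes = [
#       TextNode(text="Paris is the capital of France.", id_="node-paris"),
#       TextNode(text="Berlin is the capital of Germany.", id_="node-berlin"),
#   ]
#   index = VectorStoreIndex(nodes)
#
#   accuracy = Accuracy(index, k=2)
#   metrics = accuracy.compute(
#       query="What is the capital of France?",
#       expected_ids=["node-paris"],
#   )
#   print(metrics)  # keys: "mrr", "hit_rate", "precision", "recall"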