[fix] log_final_results #258 #259

Merged: 2 commits on Jun 11, 2024
38 changes: 37 additions & 1 deletion .github/.test_durations
@@ -100,5 +100,41 @@
"tests/utilization/utils/test_batch_sampler.py::test_dcbs": 0.8340403449999485,
"tests/utilization/utils/test_batch_sampler.py::test_dcbs_auto_batching": 0.06662718100005804,
"tests/utilization/utils/test_batch_sampler.py::test_dcbs_few_shot": 0.0838686949998646,
"tests/utilization/utils/test_batch_sampler.py::test_dcbs_few_shot_prefix_caching": 3.93439717199999
"tests/utilization/utils/test_batch_sampler.py::test_dcbs_few_shot_prefix_caching": 3.93439717199999,
"tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample1:api-False-no_split]": 0.002944755367934704,
"tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample1:api-False-split]": 0.0031265132129192352,
"tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample1:api-True-no_split]": 0.002978229895234108,
"tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample1:api-True-split]": 0.003244483843445778,
"tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample1:local-False-no_split]": 0.00300593301653862,
"tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample1:local-False-split]": 0.003051883541047573,
"tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample1:local-True-no_split]": 0.00298389233648777,
"tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample1:local-True-split]": 0.011334518902003765,
"tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample2:api-False-no_split]": 0.00297668669372797,
"tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample2:api-False-split]": 0.003215758129954338,
"tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample2:api-True-no_split]": 0.0030325893312692642,
"tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample2:api-True-split]": 0.003189575858414173,
"tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample2:local-False-no_split]": 0.0030073318630456924,
"tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample2:local-False-split]": 0.003115130588412285,
"tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample2:local-True-no_split]": 0.002978302538394928,
"tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample2:local-True-split]": 0.0031450027599930763,
"tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:legacy:sample1:local-False-no_split]": 0.002832619473338127,
"tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:legacy:sample1:local-False-split]": 0.0028070490807294846,
"tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:legacy:sample1:local-True-no_split]": 0.0029166433960199356,
"tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:legacy:sample1:local-True-split]": 0.005571841262280941,
"tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:legacy:sample2:local-False-no_split]": 0.0028258198872208595,
"tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:legacy:sample2:local-False-split]": 0.0027440031990408897,
"tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:legacy:sample2:local-True-no_split]": 0.0027814237400889397,
"tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:legacy:sample2:local-True-split]": 0.0028041303157806396,
"tests/utilization/utils/test_log_results.py::test_log_final_results[get_ppl:acc_norm:legacy:sample1:local-False-no_split]": 0.0029122764244675636,
"tests/utilization/utils/test_log_results.py::test_log_final_results[get_ppl:acc_norm:legacy:sample1:local-False-split]": 0.0029731355607509613,
"tests/utilization/utils/test_log_results.py::test_log_final_results[get_ppl:acc_norm:legacy:sample1:local-True-no_split]": 0.002978702075779438,
"tests/utilization/utils/test_log_results.py::test_log_final_results[get_ppl:acc_norm:legacy:sample1:local-True-split]": 0.0030855443328619003,
"tests/utilization/utils/test_log_results.py::test_log_final_results[get_ppl:no_norm:legacy:sample1:local-False-no_split]": 0.002977837808430195,
"tests/utilization/utils/test_log_results.py::test_log_final_results[get_ppl:no_norm:legacy:sample1:local-False-split]": 0.003017907030880451,
"tests/utilization/utils/test_log_results.py::test_log_final_results[get_ppl:no_norm:legacy:sample1:local-True-no_split]": 0.002957606688141823,
"tests/utilization/utils/test_log_results.py::test_log_final_results[get_ppl:no_norm:legacy:sample1:local-True-split]": 0.0033153872936964035,
"tests/utilization/utils/test_log_results.py::test_log_final_results[get_prob:no_norm:legacy:sample1:local-False-no_split]": 0.002739163115620613,
"tests/utilization/utils/test_log_results.py::test_log_final_results[get_prob:no_norm:legacy:sample1:local-False-split]": 0.002798774279654026,
"tests/utilization/utils/test_log_results.py::test_log_final_results[get_prob:no_norm:legacy:sample1:local-True-no_split]": 0.0028306040912866592,
"tests/utilization/utils/test_log_results.py::test_log_final_results[get_prob:no_norm:legacy:sample1:local-True-split]": 0.002850011922419071
}
16 changes: 13 additions & 3 deletions tests/dry_test/test_datasets.py
@@ -1,7 +1,9 @@
import nltk
import pytest

from .fixtures import run_evaluate
from utilization.utils.logging import list_datasets

from .fixtures import *

nltk.download('punkt')

@@ -20,7 +22,7 @@
"commonsenseqa": [],
"copa": [],
"coqa": "skip",
# "crows_pairs": "does not support api model",
"crows_pairs": "does not support api model",
"drop": [],
"gaokao": [],
"gsm8k": [],
@@ -57,7 +59,7 @@
"webq": [],
"wic": [],
"winogender": [],
# "winograd": "does not support api model",
"winograd": "does not support api model",
"winogrande": [],
"wmt16:de-en": [],
"wsc": [],
@@ -82,6 +84,14 @@
)
}

datasets_to_test = set(list_datasets()) - {
"wmt10", "wmt13", "wmt14", "wmt15", "wmt16", "wmt17", "wmt18", "wmt19", "wmt21", "agieval_cot",
"agieval_single_choice"
}
for dataset in datasets_to_test:
if dataset not in datasets:
datasets[dataset] = []


@pytest.mark.parametrize("dataset, extra_args", datasets.items())
def test_datasets_dry_run(run_evaluate, dataset, extra_args):
2 changes: 1 addition & 1 deletion tests/dry_test/test_models.py
@@ -1,7 +1,7 @@
import pytest
import torch

from .fixtures import run_evaluate
from .fixtures import *

models = {
"gpt-3.5-turbo": ["--openai_api_key", "fake_key"],
2 changes: 1 addition & 1 deletion tests/utilization/model/test_apply_prompt_template.py
@@ -1,7 +1,7 @@
from utilization.chat_templates import DEFAULT_CHAT_TEMPLATE
from utilization.model.model_utils.conversation import Conversation, ConversationFormatter

from ..fixtures import conversation
from ..fixtures import *


def test_base(conversation: Conversation):
2 changes: 1 addition & 1 deletion tests/utilization/model/test_to_model_prompt.py
@@ -2,7 +2,7 @@

from utilization.model.model_utils.conversation import Conversation, ConversationFormatter

from ..fixtures import conversation
from ..fixtures import *

model_evaluation_methods = {
("generation", False): (
2 changes: 1 addition & 1 deletion tests/utilization/utils/test_batch_sampler.py
@@ -1,6 +1,6 @@
import sys

from ..fixtures import get_dataset_collection
from ..fixtures import *

sys.path.append('.')
from utilization.model.huggingface_model import HuggingFaceModel
141 changes: 141 additions & 0 deletions tests/utilization/utils/test_log_results.py
@@ -0,0 +1,141 @@
import json
import sys
from copy import deepcopy

import pandas as pd

from ..fixtures import *

sys.path.append('.')
from utilization.model.model_utils.conversation import Conversation, ConversationFormatter
from utilization.utils.log_results import log_final_results


def get_conv(split):
if split:
msg = [{
"role": "user",
"content": "This is the example input of the model."
}, {
"role": "assistant",
"content": "This is the sample output of the model."
}, {
"role": "user",
"content": "This is the input of the model."
}]
else:
msg = [{"role": "user", "content": "This is the input of the model."}]
return Conversation(
msg,
formatter=ConversationFormatter.from_chat_template("base"),
model_evaluation_method="generation",
split=split,
)


data = {
("generation", "no_split", "legacy"): [("This is the input of the model.",)],
("generation", "split", "legacy"): [("This is", " a splitted sentence.")],
("generation", "split", "conv"): [get_conv(True)],
("generation", "no_split", "conv"): [get_conv(False)],
("get_ppl", "no_split", "legacy"): [("Source parts of get_ppl", " target parts 1 of get_ppl"),
("Source parts of get_ppl", " target parts 2 of get_ppl")],
("get_ppl", "split", "legacy"):
[("Source parts of get_ppl", " can be splitted, but not", " target parts 1 of get_ppl"),
("Source parts of get_ppl", " can be splitted, but not", " target parts 2 of get_ppl")],
("get_prob", "no_split", "legacy"): [("This is the get_prob input of the model", 2)],
("get_prob", "split", "legacy"): [("The get_prob input of the model", " can also be splitted.", 2)],
}
methods = [
"generation:no_norm:legacy:sample1:local",
"generation:no_norm:conv:sample1:api",
"generation:no_norm:conv:sample1:local",
"generation:no_norm:legacy:sample2:local",
"generation:no_norm:conv:sample2:api",
"generation:no_norm:conv:sample2:local",
"get_prob:no_norm:legacy:sample1:local",
"get_ppl:no_norm:legacy:sample1:local",
"get_ppl:acc_norm:legacy:sample1:local",
]


@pytest.mark.parametrize("split", ["split", "no_split"])
@pytest.mark.parametrize("multiple_source", [True, False])
@pytest.mark.parametrize("methods", methods)
def test_log_final_results(split, multiple_source, methods):

eval_method, use_normalization, use_conversation, sample_num, local = methods.split(":")
use_normalization = use_normalization == "acc_norm"
sample_num = int(sample_num[-1])

def set_subset(l: dict):
l["subset"] = "subset_name"

eval_data = data[eval_method, split, use_conversation]
if eval_method == "get_ppl":
raw = [(0.5, 10), (1.0, 10)]  # (perplexity, target length) per option
processed = [1] # index 0
op_num = 2
elif eval_method == "get_prob":
raw = [[0.1, 0.9]] # probabilities
processed = [1] # index 0
op_num = 2
elif eval_method == "generation":
raw = ["This is the model's raw prediction."]
processed = ["prediction"]
op_num = 1

if use_normalization:
no_num = 2
else:
no_num = 1

series = log_final_results(
raw_predictions=raw * sample_num * no_num,
processed_predictions=processed * sample_num,
evaluation_instances=deepcopy(eval_data) * sample_num * no_num,
score_lists={"Metric": [True]}, # score_lists have already been aggregated along self-concsistency
multiple_source=multiple_source,
model_evaluation_method=eval_method,
use_normalization=use_normalization,
option_nums=[op_num] * sample_num,
len_evaluation_data=1,
sample_num=sample_num,
references=["reference"],
local_model=local == "local",
)
series.apply(set_subset)
print(series)
json_str = pd.concat([series]).to_json(orient="records", indent=4, force_ascii=False)

unmarsheled = json.loads(json_str)
print(json_str)
print(unmarsheled)
assert len(unmarsheled) == 1
assert unmarsheled[0]["index"] == 0
assert unmarsheled[0]["subset"] == "subset_name"
if eval_method == "get_ppl" and not multiple_source:
source = "".join(eval_data[0][:-1])
assert unmarsheled[0]["source"] == source
assert unmarsheled[0]["option"] == [" target parts 1 of get_ppl", " target parts 2 of get_ppl"]
assert unmarsheled[0]["perplexity"] == [0.5, 1.0]
elif eval_method == "get_ppl" and multiple_source:
source = "".join(eval_data[0][:-1])
assert unmarsheled[0]["source"] == [source, source]
assert unmarsheled[0]["option"] == " target parts 1 of get_ppl"
assert unmarsheled[0]["perplexity"] == [0.5, 1.0]
elif eval_method == "get_prob":
source = "".join(eval_data[0][:-1])
assert unmarsheled[0]["source"] == source
assert unmarsheled[0]["probabilites"] == [0.1, 0.9]
elif eval_method == "generation":
if use_conversation == "conv" and local == "local":
source = eval_data[0].apply_prompt_template()
elif use_conversation == "conv" and local == "api":
source = eval_data[0].messages
else:
source = "".join(eval_data[0])
assert unmarsheled[0]["source"] == source
assert unmarsheled[0]["raw_prediction"] == ["This is the model's raw prediction."] * sample_num
assert unmarsheled[0]["reference"] == "reference"
assert unmarsheled[0]["metric"]["Metric"] == True
2 changes: 1 addition & 1 deletion utilization/metric/gpteval.py
@@ -49,7 +49,7 @@ def __init__(self, multi_turn=False, type: Literal["single", "pairwise"] = "sing
def __call__(self, predictions, references):

# load gpteval model after the predictions of dataset are generated
from ..model import load_model
from ..load_model import load_model

self.model = load_model(self.model_args)
self.model.set_generation_args()
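
For reference, the corrected deferred import in absolute form; from utilization/metric/gpteval.py the relative path ..load_model resolves to the top-level utilization.load_model module (the old path pointed at utilization.model):

# Equivalent absolute form of the fixed import inside GPTEval.__call__:
from utilization.load_model import load_model
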
23 changes: 18 additions & 5 deletions utilization/model/model_utils/conversation.py
@@ -228,13 +228,26 @@ def to_model_prompts(
class Conversation(_HFConversation):

def __init__(
self, messages: Union[str, List[Dict[str, str]], None] = None, conversation_id=None, **deprecated_kwargs
self,
messages: Union[str, List[Dict[str, str]], None] = None,
conversation_id=None,
num_turns: int = 1,
num_shots: int = 0,
num_options: int = 1,
multi_turn_users: Optional[List[str]] = None,
formatter: Optional[ConversationFormatter] = None,
model_evaluation_method: Optional[Literal["get_ppl", "get_prob", "generation", "user_defined"]] = None,
split: Optional[bool] = None,
**deprecated_kwargs
):
super().__init__(messages, conversation_id, **deprecated_kwargs)
self.num_turns = 1
self.num_shots = 0
self.num_options = 1
self.mt_users = []
self.num_turns = num_turns
self.num_shots = num_shots
self.num_options = num_options
self.mt_users = [] if multi_turn_users is None else multi_turn_users
self.formatter = formatter
self.model_evaluation_method = model_evaluation_method
self.split = split

@classmethod
def from_chat(cls, *, user: Optional[str] = None, assistant: Optional[str] = None) -> "Conversation":
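
With the widened __init__ above, a Conversation can be fully configured in a single call; this is exactly what the new test's get_conv() helper does. A minimal sketch using only arguments that appear in this PR:

from utilization.model.model_utils.conversation import Conversation, ConversationFormatter

# Mirrors get_conv(split=False) in tests/utilization/utils/test_log_results.py.
conv = Conversation(
    [{"role": "user", "content": "This is the input of the model."}],
    formatter=ConversationFormatter.from_chat_template("base"),
    model_evaluation_method="generation",
    split=False,
)
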