Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove Git LFS from repo #10

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
Expand All @@ -33,5 +32,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
videos/clustering_explanation.mp4 filter=lfs diff=lfs merge=lfs -text
videos/sts_explanation.mp4 filter=lfs diff=lfs merge=lfs -text
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
corpus.jsonl
corpus.jsonl.gz
results_dataset_to_upload
*.pyc
*.pyc
videos/*.mp4
26 changes: 18 additions & 8 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,16 @@
from models import ModelManager
from ui import build_side_by_side_ui_anon, build_side_by_side_ui_anon_sts, build_side_by_side_ui_anon_clustering, build_side_by_side_ui_named, build_side_by_side_ui_named_sts, build_side_by_side_ui_named_clustering, build_single_model_ui, build_single_model_ui_sts, build_single_model_ui_clustering


# download the videos
from huggingface_hub import hf_hub_url
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At runtime we download the videos from Huggingface. No need to keep them in this repo as I assume they are static and people won't be iterating on them.

for file_to_download in ["sts_explanation.mp4", "clustering_explanation.mp4"]:
file_url = hf_hub_url(repo_id="mteb/arena-videos", repo_type="dataset", endpoint=None, filename=file_to_download)
# download it to videos/ folder using wget
os.system(f"wget {file_url} -O videos/{file_to_download}")



acknowledgment_md = """
### Acknowledgment
We thank X, Y, Z, [Contextual AI](https://contextual.ai/) and [Hugging Face](https://huggingface.co/) for their generous sponsorship. If you'd like to sponsor us, please get in [touch](mailto:[email protected]).
Expand Down Expand Up @@ -39,15 +49,15 @@ def load_elo_results(elo_results_dir):
elo_results_dir = Path(elo_results_dir)
elo_results_file = {}
leaderboard_table_file = {}
for file in elo_results_dir.glob('elo_results_*.pkl'):
if 'clustering' in file.name:
elo_results_file['clustering'] = file
elif 'retrieval' in file.name:
elo_results_file['retrieval'] = file
elif 'sts' in file.name:
elo_results_file['sts'] = file
for folder in elo_results_dir.glob('elo_results_*'):
if 'clustering' in folder.name:
elo_results_file['clustering'] = folder
elif 'retrieval' in folder.name:
elo_results_file['retrieval'] = folder
elif 'sts' in folder.name:
elo_results_file['sts'] = folder
else:
raise ValueError(f"Unknown file name: {file.name}")
raise ValueError(f"Unknown folder name: {folder.name}")
for file in elo_results_dir.glob('*_leaderboard.csv'):
if 'clustering' in file.name:
leaderboard_table_file['clustering'] = file
Expand Down
124 changes: 122 additions & 2 deletions arena_elo/elo_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import plotly.express as px
from tqdm import tqdm
from datasets import load_dataset
import plotly
import os

from .basic_stats import get_log_files
from .clean_battle_data import clean_battle_data
Expand Down Expand Up @@ -337,6 +339,125 @@ def pretty_print_elo_rating(rating):
print(f"{i+1:2d}, {model:25s}, {rating[model]:.0f}")



def write_out_results(item: dict, item_name: str):
    """
    Recursively serialise *item* into a folder tree rooted at *item_name*.

    Due to their complex structure, containers become subfolders and each
    leaf value is written as a single ``default.*`` file inside its own
    folder, so the object can be reconstructed by ``load_results``:

    - ``list``                       -> subfolders named ``0`` .. ``N-1``
    - ``dict``                       -> one subfolder per key (a list value
                                        ``key`` is flattened to siblings
                                        ``key0``, ``key1``, ...)
    - ``pd.DataFrame`` / ``pd.Series`` -> ``default.jsonl`` (records, lines)
    - plotly ``Figure``              -> ``default.png``
    - ``str`` / numbers / ``None``   -> ``default.txt`` (empty file for None)

    Args:
        item (dict): The item to save (any of the supported types above).
        item_name (str): Filesystem path used as the root folder.

    Returns:
        None

    Raises:
        Exception: If an unsupported type is encountered.
    """
    main_folder = item_name

    def _write_leaf(content: str) -> None:
        # Every scalar leaf lives alone in its own folder as default.txt.
        print(f"Saving {main_folder} to {main_folder}/default.txt")
        os.makedirs(main_folder, exist_ok=True)
        with open(f"{main_folder}/default.txt", "w") as f:
            f.write(content)

    if isinstance(item, list):
        # Folder names 0..N-1 let load_results recover the ordering.
        for i, v in enumerate(item):
            write_out_results(v, os.path.join(main_folder, str(i)))

    elif isinstance(item, dict):
        for key, value in item.items():
            if isinstance(value, list):
                # Flatten "key -> list" into sibling folders key0, key1, ...
                for i, v in enumerate(value):
                    write_out_results(v, os.path.join(main_folder, key + str(i)))
            else:
                # dicts and scalars recurse identically under the key folder.
                write_out_results(value, os.path.join(main_folder, key))

    elif isinstance(item, pd.DataFrame):
        print(f"Saving {main_folder} to {main_folder}/default.jsonl")
        os.makedirs(main_folder, exist_ok=True)
        item.reset_index().to_json(f"{main_folder}/default.jsonl", orient="records", lines=True)

    elif isinstance(item, pd.Series):
        print(f"Saving {main_folder} to {main_folder}/default.jsonl")
        os.makedirs(main_folder, exist_ok=True)
        item.to_frame().reset_index().to_json(f"{main_folder}/default.jsonl", orient="records", lines=True)

    elif isinstance(item, str):
        _write_leaf(item)

    elif item is None:
        # An empty file marks a serialised None.
        _write_leaf("")

    elif isinstance(item, (bool, int, float)):
        # bool/int previously raised "Unknown type"; serialise them like
        # floats. NOTE: load_results parses numeric text back as float, so
        # ints round-trip as floats and bools as the strings "True"/"False".
        _write_leaf(str(item))

    # Checked after the plain-data branches so that serialising ordinary
    # results never evaluates the plotly name unless a figure is present.
    elif isinstance(item, plotly.graph_objs._figure.Figure):
        print(f"Saving {main_folder} to {main_folder}/default.png")
        os.makedirs(main_folder, exist_ok=True)
        item.write_image(f"{main_folder}/default.png")

    else:
        print(main_folder)
        raise Exception(f"Unknown type {type(item)}")



def load_results(data_path):
    """
    Do the reverse of `write_out_results` to reconstruct the item.

    Folder-layout conventions (mirroring the writer):

    - a folder whose entries are exactly ``0`` .. ``N-1``  -> list
    - a folder holding a single ``default.*`` file         -> that leaf value
    - any other folder                                     -> dict of entries
    - ``*.jsonl`` file  -> ``pd.DataFrame`` (restores the "index" column)
    - ``*.png`` file    -> ``None`` (figures are write-only)
    - other files       -> float if parseable, ``None`` if empty, else str

    Args:
        data_path (str): The path to the data to load.

    Returns:
        The reconstructed object (dict, list, DataFrame, float, str or None).
    """
    if os.path.isdir(data_path):
        entries = list(os.listdir(data_path))

        # A folder named 0..N-1 encodes a list (see write_out_results).
        if set(entries) == {str(i) for i in range(len(entries))}:
            return [load_results(os.path.join(data_path, str(i))) for i in range(len(entries))]

        # BUG FIX: previously only a lone "default.jsonl" was unwrapped, so
        # scalar leaves (saved as <key>/default.txt) and figures
        # (<key>/default.png) came back wrapped as {"default.txt": value}
        # instead of the value itself. Any single "default.*" file is the
        # leaf payload of this folder.
        if len(entries) == 1 and entries[0].startswith("default."):
            return load_results(os.path.join(data_path, entries[0]))

        # The dict case: one entry per original key.
        return {name: load_results(os.path.join(data_path, name)) for name in entries}

    elif data_path.endswith(".png"):
        # Figures cannot be reconstructed from the rendered image.
        return None

    elif data_path.endswith(".jsonl"):
        df = pd.read_json(data_path, orient="records", lines=True)
        if "index" in df.columns:
            df = df.set_index("index")
        return df

    else:
        with open(data_path, "r") as f:
            data = f.read()

        try:
            return float(data)
        except ValueError:
            pass  # not a float

        # An empty file marks a serialised None; anything else is a string.
        return None if data == "" else data


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--clean-battle-file", type=str)
Expand Down Expand Up @@ -383,5 +504,4 @@ def pretty_print_elo_rating(rating):
"anony": anony_results,
"full": full_results,
}
with open(f"elo_results_{cutoff_date}.pkl", "wb") as fout:
pickle.dump(results, fout)
write_out_results(results, f"elo_results_{cutoff_date}")
6 changes: 3 additions & 3 deletions arena_elo/generate_leaderboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pandas as pd
import pickle
from yaml import safe_load
from .elo_analysis import load_results

RENAME_KEYS = {
"organization": "Organization",
Expand All @@ -15,7 +16,7 @@
}

def main(
elo_rating_pkl: str,
elo_rating_folder: str,
output_csv: str
):

Expand All @@ -31,8 +32,7 @@ def main(
if key in model_info[model]:
model_info[model][RENAME_KEYS[key]] = model_info[model].pop(key)

with open(elo_rating_pkl, "rb") as fin:
elo_rating_results = pickle.load(fin)
elo_rating_results = load_results(elo_rating_folder)
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Results are now saved and loaded as folders, but it does make the commits quite long as each dataframe is a separate file... sorry!


anony_elo_rating_results = elo_rating_results["anony"]
full_elo_rating_results = elo_rating_results["full"]
Expand Down
15 changes: 9 additions & 6 deletions arena_elo/update_elo.sh
Original file line number Diff line number Diff line change
@@ -1,22 +1,25 @@
#!/bin/bash

# Ensure the output root exists. `mkdir -p` creates intermediate directories,
# so a separate `mkdir -p results` beforehand is redundant.
mkdir -p results/latest
Comment on lines 3 to +4
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably don't need mkdir -p results since we have the new line?

# For battle data

for task in "retrieval" "clustering" "sts"; do
python -m arena_elo.clean_battle_data --task_name $task
battle_cutoff_date=`cat cut_off_date.txt` && rm cut_off_date.txt && echo "$task battle data last updated on $battle_cutoff_date"
mkdir -p ./results/$battle_cutoff_date
cp clean_battle_${task}_$battle_cutoff_date.json ./results/latest/clean_battle_$task.json
mv clean_battle_${task}_$battle_cutoff_date.json ./results/$battle_cutoff_date/clean_results_${task}.json
python3 -m arena_elo.elo_analysis --clean-battle-file ./results/$battle_cutoff_date/clean_results_${task}.json --num-bootstrap 1
mv ./elo_results_$battle_cutoff_date.pkl ./results/$battle_cutoff_date/elo_results_${task}.pkl
python -m arena_elo.generate_leaderboard \
--elo_rating_pkl "./results/$battle_cutoff_date/elo_results_${task}.pkl" \
--output_csv "./results/$battle_cutoff_date/${task}_leaderboard.csv"
mv ./elo_results_$battle_cutoff_date ./results/$battle_cutoff_date/elo_results_${task}
cmd="""python -m arena_elo.generate_leaderboard \
--elo_rating_folder "./results/$battle_cutoff_date/elo_results_${task}" \
--output_csv "./results/$battle_cutoff_date/${task}_leaderboard.csv""""
echo $cmd
eval $cmd
mkdir -p ./results/latest
cp ./results/$battle_cutoff_date/${task}_leaderboard.csv ./results/latest/${task}_leaderboard.csv
cp ./results/$battle_cutoff_date/elo_results_${task}.pkl ./results/latest/elo_results_${task}.pkl
cp -R ./results/$battle_cutoff_date/elo_results_${task} ./results/latest/elo_results_${task}
echo "$task leaderboard updated"
done

5 changes: 2 additions & 3 deletions leaderboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import numpy as np
import pandas as pd

from arena_elo.elo_analysis import load_results

leader_component_values = [None] * 5

Expand Down Expand Up @@ -77,9 +78,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
md = "Loading ..."
p1 = p2 = p3 = p4 = None
else:
with open(elo_results_file, "rb") as fin:
elo_results = pickle.load(fin)

elo_results = load_results(elo_results_file)
anony_elo_results = elo_results["anony"]
anony_arena_df = anony_elo_results["leaderboard_table_df"]
p1 = anony_elo_results["win_fraction_heatmap"]
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ gritlm
mteb
plotly
umap-learn
kaleido
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Apparently is needed to write Plotly plots to file. We could not save to file, but it was saved in the pickle files, so I thought we might as well write it to file for now.

3 changes: 0 additions & 3 deletions results/20240606/elo_results_clustering.pkl

This file was deleted.

3 changes: 0 additions & 3 deletions results/20240606/elo_results_retrieval.pkl

This file was deleted.

3 changes: 0 additions & 3 deletions results/20240606/elo_results_sts.pkl

This file was deleted.

3 changes: 0 additions & 3 deletions results/20240614/elo_results_clustering.pkl

This file was deleted.

3 changes: 0 additions & 3 deletions results/20240614/elo_results_retrieval.pkl

This file was deleted.

3 changes: 0 additions & 3 deletions results/20240614/elo_results_sts.pkl

This file was deleted.

3 changes: 0 additions & 3 deletions results/20240615/elo_results_clustering.pkl

This file was deleted.

3 changes: 0 additions & 3 deletions results/20240616/elo_results_clustering.pkl

This file was deleted.

3 changes: 0 additions & 3 deletions results/20240616/elo_results_retrieval.pkl

This file was deleted.

3 changes: 0 additions & 3 deletions results/20240616/elo_results_sts.pkl

This file was deleted.

3 changes: 0 additions & 3 deletions results/20240617/elo_results_clustering.pkl

This file was deleted.

3 changes: 0 additions & 3 deletions results/20240617/elo_results_retrieval.pkl

This file was deleted.

3 changes: 0 additions & 3 deletions results/20240617/elo_results_sts.pkl

This file was deleted.

3 changes: 0 additions & 3 deletions results/20240618/elo_results_retrieval.pkl

This file was deleted.

3 changes: 0 additions & 3 deletions results/20240620/elo_results_clustering.pkl

This file was deleted.

3 changes: 0 additions & 3 deletions results/20240620/elo_results_sts.pkl

This file was deleted.

3 changes: 0 additions & 3 deletions results/20240624/elo_results_clustering.pkl

This file was deleted.

3 changes: 0 additions & 3 deletions results/20240624/elo_results_retrieval.pkl

This file was deleted.

4 changes: 2 additions & 2 deletions results/20240704/clustering_leaderboard.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
key,Model,Arena Elo rating (anony),Arena Elo rating (full),MTEB Overall Avg,MTEB Retrieval Avg,MTEB Clustering Avg,MTEB STS Avg,License,Organization,Link
sentence-transformers/all-MiniLM-L6-v2,sentence-transformers/all-MiniLM-L6-v2,1051.7304902423814,1036.2219844563203,56.26,41.95,42.35,78.9,Apache-2.0,Sentence Transformers,https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
intfloat/multilingual-e5-small,intfloat/multilingual-e5-small,948.2695097576186,963.7780155436797,57.87,46.64,37.08,79.1,MIT License,Microsoft,https://huggingface.co/intfloat/multilingual-e5-small
sentence-transformers/all-MiniLM-L6-v2,sentence-transformers/all-MiniLM-L6-v2,1051.7304902424,1036.2219844563,56.26,41.95,42.35,78.9,Apache-2.0,Sentence Transformers,https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
intfloat/multilingual-e5-small,intfloat/multilingual-e5-small,948.2695097576,963.7780155437,57.87,46.64,37.08,79.1,MIT License,Microsoft,https://huggingface.co/intfloat/multilingual-e5-small
3 changes: 0 additions & 3 deletions results/20240704/elo_results_clustering.pkl

This file was deleted.

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"index":0,"sentence-transformers\/all-MiniLM-L6-v2":1094.4361603648,"intfloat\/multilingual-e5-small":905.5638396352}
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"index":"sentence-transformers\/all-MiniLM-L6-v2","0":1051.7304902424}
{"index":"intfloat\/multilingual-e5-small","0":948.2695097576}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
987.0071375154072
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1012.9928624845928
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
2024-07-04 12:16:37 PDT
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1720120597.967
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
| Rank | Model | Elo Rating | Description |
| --- | --- | --- | --- |
| 1 | 🥇 [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | 1052 | all-MiniLM-L6-v2 by Sentence Transformers |
| 2 | 🥈 [intfloat/multilingual-e5-small](https://huggingface.co/intfloat/multilingual-e5-small) | 948 | multilingual-e5-small by Microsoft |
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"index":"intfloat\/multilingual-e5-small","rating":948.2695097576,"variance":null,"rating_q975":905.5638396352,"rating_q025":905.5638396352,"num_battles":24}
{"index":"sentence-transformers\/all-MiniLM-L6-v2","rating":1051.7304902424,"variance":null,"rating_q975":1094.4361603648,"rating_q025":1094.4361603648,"num_battles":24}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
bt
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading