diff --git a/README.md b/README.md
index 0b5bfe7eef66985658d68bb1f20c6e7ff6748bd5..3881567c61de444f21ce97677b5a815a1371b40d 100644
--- a/README.md
+++ b/README.md
@@ -8,9 +8,35 @@ Setup the conda env
 ```
 conda create -n llm_persp python=3.9
 pip install -r requirements.txt 
-pip install git+https://github.com/huggingface/transformers@c612628045821f909020f7eb6784c79700813eda
+
+# install transformers from source (main branch)
+pip install git+https://github.com/huggingface/transformers.git
+pip install -i https://test.pypi.org/simple/ bitsandbytes
+conda install cudatoolkit -y
+```
+
+For OpenAssistant, create a new env with the pinned transformers commit
+```
+conda create --name llm_persp_oa --clone llm_persp
+conda activate llm_persp_oa
+pip install git+https://github.com/huggingface/transformers@d04ec99bec8a0b432fc03ed60cea9a1a20ebaf3c
 ```
 
+or, equivalently, install the pinned transformers commit from a local clone:
+```
+git clone https://github.com/huggingface/transformers.git
+cd transformers
+git checkout d04ec99bec8a0b432fc03ed60cea9a1a20ebaf3c
+pip install .
+```
 
 ### Install llama if you want to run LLaMa models (this step is not needed to recreate experiments in the paper)
 Initialize and fetch the llama submodule
diff --git a/evaluate.py b/evaluate.py
index fdca153eaa59e12bfeb8df453f40f5e0c04f1588..ee0ea665c09478efcb32092706f66aeabceb3998 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -4,13 +4,22 @@ import random
 import re
 import json
 from collections import defaultdict
+import os
 
 import matplotlib.pyplot as plt
 import tiktoken
 
 import torch
 import sys
-from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList, TextStreamer
+
+hostname = os.uname()[1]
+if hostname == "PTB-09003439":
+    hf_cache_dir = "/home/flowers-user/.cache/huggingface"
+else:
+    hf_cache_dir = "/gpfsscratch/rech/imi/utu57ed/.cache/huggingface"
+
+os.environ['TRANSFORMERS_CACHE'] = hf_cache_dir
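+# note: the cache dir must be set before transformers is imported (the env var is read at import time)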
+from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList, TextStreamer, pipeline
 import transformers
 
 import json
@@ -23,6 +32,18 @@ def print_chat_messages(messages):
         print(f"{msg['role'].upper()} : {msg['content']}")
     print("*********************")
 
+def parse_hf_outputs(output, tokenizer, answers):
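+    """Extract per-answer scores from the first generated token, the most probable answer, and the logprobs."""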
+    # extract the score for each possible answer
+    option_scores = {ans: output.scores[0][0, tokenizer.convert_tokens_to_ids(ans)] for ans in answers}
+
+    # take the most probable answer as the generation
+    generation = max(option_scores, key=option_scores.get)
+
+    # extract logprobs
+    lprobs = [float(option_scores[a]) for a in answers]
+
+    return option_scores, generation, lprobs
+
 
 def create_simulated_messages(conv, last="user"):
     # simulate a conversation between two LLMs
@@ -78,7 +99,6 @@ timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
 print("timestamp:", timestamp)
 
 import openai
-import os
 import numpy as np
 import pandas as pd
 import time
@@ -114,7 +134,7 @@ def get_prompt_skeleton(subject, experiment_name, args):
     if sum(map(bool, [args.profile, args.lotr_character, args.music_expert_genre, args.hobby])) > 1:
         raise ValueError("Multiple ways of inducing a perspective are defined.")
 
-    if "pvq" in experiment_name or "hofstede" in experiment_name or "big5" in experiment_name:
+    if "pvq" in experiment_name or "hofstede" in experiment_name or "big5" in experiment_name or "mmlu" in experiment_name:
 
         # Natural language profile is only defined for dictionary profile
         if not args.profile_dict:
@@ -142,6 +162,10 @@ def get_prompt_skeleton(subject, experiment_name, args):
             else:
                 raise ValueError(f"Data dir name is ill-defined {args.data_dir}")
 
+        elif "mmlu" in experiment_name:
+            assert "mmlu" in args.data_dir
+            test_name = "mmlu"
+
         else:
             raise ValueError(f"Experiment name is ill-defined {args.experiment_name}")
 
@@ -155,6 +179,11 @@ def get_prompt_skeleton(subject, experiment_name, args):
             questionnaire_description = ""
             questionnaire_description_empty = True
 
+        elif test_name == "mmlu":
+            # MMLU uses the standard multiple-choice instruction as its description
+            questionnaire_description = "The following are multiple choice questions (with answers)."
+            questionnaire_description_empty = False
+
         elif test_name in ["big5_50", "big5_100"]:
             # VSM questionnaire doesn't have a description
             questionnaire_description = "Mark how much you agree with each statement."
@@ -454,7 +483,7 @@ def get_prompt_skeleton(subject, experiment_name, args):
 
         if args.system_message:
             prompts = {
-                "system": f"{set_perspective_str}".rstrip(), # remove newline from the end
+                "system": f"{set_perspective_str}".rstrip(),  # remove newline from the end
                 "intro": f"{questionnaire_description}\n\n" if not questionnaire_description_empty else questionnaire_description,
                 "query": f"\n{query_str}",
             }
@@ -469,40 +498,6 @@ def get_prompt_skeleton(subject, experiment_name, args):
                 "query": f"\n{query_str}",
             }
 
-    elif "mmlu" in experiment_name:
-        raise DeprecationWarning("Deprecated")
-        if args.natural_language_profile:
-            raise NotImplementedError("Natural language profile not implemented.")
-
-        if args.perspective_amount not in ["extreme", "slight"]:
-            raise NotImplementedError("Perspective amount not implemented")
-
-        questionnaire_description = "The following are multiple choice questions (with answers)."
-        prompts = {
-            "intro": f"{questionnaire_description}\n\n",
-            "query": f"\nAnswer:"
-        }
-
-        if args.profile_dict:
-            # we define the perspective through a profile
-            profile_string = "\n".join([
-                f"\t{k}:{v}" for k, v in args.profile_dict.items()
-            ])
-
-            if args.direct_perspective:
-                set_perspective_str = "You are a person with the following profile:"
-                query_str = "Answer:"
-
-            else:
-                set_perspective_str = "The following is a questionnaire (with answers) given to a person with the following profile:"
-                query_str = "Answer (from the person):"
-
-            prompts = {
-                "intro": f"{set_perspective_str}\n"+
-                         f"{profile_string}\n\n",
-                "query": f"\n{query_str}",
-            }
-
     elif subject == "political_compass":
         raise DeprecationWarning("Deprecated")
 
@@ -742,7 +737,7 @@ def eval(args, subject, engine, dev_df, test_df, permutations_dict, llm_generato
     gpt_token_counter = 0
 
     if args.simulate_conversation_theme and not (args.system_message and engine in ["gpt-3.5-turbo", "gpt-3.5-turbo-0301", "gpt-4-0314", "gpt-3.5-turbo-0613", "gpt-4-0613"]):
-        raise NotImplementedError("simulated conversation is only implemented with direct message and GPT chat models.")
+        raise NotImplementedError("simulated conversation is only implemented with system message and GPT chat models.")
 
     for i in range(test_df.shape[0]):
         if i % 10 == 0:
@@ -878,39 +873,12 @@ def eval(args, subject, engine, dev_df, test_df, permutations_dict, llm_generato
                                     "history": "What is the significance of the battle of Hastings. Answer in two sentences.", # slight collapse
                                     "chess": "1. e4",
                                     "grammar": "Can you check this sentence for grammar? \n Whilst Jane was waiting to meet hers child their nose started bleeding.",
-                                    # "code": "Can you write a bash script that changes the extensions of all the .jpg files to .jpeg.", # collapse
-                                    #
-                                    #
-                                    #
-                                    #
-                                    #
                                     "religion": "Why do we believe in the assumption of Mary? Please answer in two sentences.",
                                     "tax": "Can you explain how can my company pay less tax in two sentences?",
-                                    # "vgame": ""...,
                                     "vacation": "I was thinking of going surfing is that relaxing? Please reply briefly.",
                                 }
 
-                                themes = {
-                                    "poem": "writing a poem together",
-                                    "joke": "telling a joke",
-                                    "history": "talking about history",
-                                    # slight collapse
-                                    "chess": "playing chess",
-                                    "grammar": "asking for grammar corrections",
-                                    # "code": "Can you write a bash script that changes the extensions of all the .jpg files to .jpeg.", # collapse
-                                    #
-                                    #
-                                    #
-                                    #
-                                    #
-                                    "religion": "discussing about religion",
-                                    "tax": "trying to see how to save money for my company",
-                                    "vgame": ...,
-                                    "vacation": "considering options for vacation",
-                                }
-
                                 opening_question = opening_questions_for_themes[args.simulate_conversation_theme]
-                                theme = themes[args.simulate_conversation_theme]
 
                                 conversation = [opening_question]
 
@@ -931,8 +899,7 @@ def eval(args, subject, engine, dev_df, test_df, permutations_dict, llm_generato
                                     else:
                                         assert simulated_conv_messages[0]['role'] == "assistant"
                                         simulated_conv_messages = [
-                                            {"role": "system", "content": f"You are simulating a human using a chatbot."} # v1
-                                            # {"role": "system", "content": f"You are simulating a human using a chatbot and {theme}."} # v2
+                                            {"role": "system", "content": f"You are simulating a human using a chatbot."}
                                         ] + simulated_conv_messages
 
                                         engine_ = "gpt-4-0613"
@@ -1076,6 +1043,57 @@ def eval(args, subject, engine, dev_df, test_df, permutations_dict, llm_generato
             # extract logprobs
             lprobs = [float(option_scores[a]) for a in answers]
 
+        elif engine in ["zephyr-7b-beta"]:
+            if args.simulate_conversation_theme:
+                raise NotImplementedError("Simulated conversation not implemented for zephyr.")
+
+            tokenizer, model = llm_generator
+
+            assert ("system" in prompt_skeleton) == args.system_message
+
+            messages = []
+            if prompt_skeleton.get("system", "") != "":
+                messages.append({"role": "system", "content": prompt_skeleton["system"]})
+
+            messages.append({"role": "user", "content": prompt})
+            print_chat_messages(messages)
+
+            prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
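+            # generate a single token and keep its scores; parse_hf_outputs compares the scores
+            # of the answer-option tokens to pick the most probable answer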
+            output = model.generate(
+                **inputs,
+                max_new_tokens=1,
+                # temperature=0.0001,
+                do_sample=False,
+                top_p=1.0,
+                return_dict_in_generate=True,
+                output_scores=True
+            )
+            option_scores, generation, lprobs = parse_hf_outputs(output=output, tokenizer=tokenizer, answers=answers)
+
         elif engine in ["openassistant_rlhf2_llama30b"]:
             if args.generative_qa:
                 raise NotImplementedError("Generative QA not implemented for OpenAI non-ChatGPT models.")
@@ -1093,6 +1111,7 @@ def eval(args, subject, engine, dev_df, test_df, permutations_dict, llm_generato
             tokenizer, model = llm_generator
 
             inputs = tokenizer(prompt, return_tensors='pt').to('cuda')
+            if "token_type_ids" in inputs:
+                del inputs["token_type_ids"]
             start_time = time.time()
             output = model.generate(
                 **inputs,
@@ -1105,16 +1124,14 @@ def eval(args, subject, engine, dev_df, test_df, permutations_dict, llm_generato
             )
             print("inference time:", time.time()-start_time)
 
-            # extract the score for each possible answer
-            option_scores = {
-                ans: output.scores[0][0, tokenizer.convert_tokens_to_ids(ans)] for ans in answers
-            }
+            option_scores, generation, lprobs = parse_hf_outputs(output=output, tokenizer=tokenizer, answers=answers)
 
-            # take the most probable answer as the generation
-            generation = max(option_scores, key=option_scores.get)
+            # sanity check against the previous inline implementation; todo: remove once confirmed
+            option_scores_ = {ans: output.scores[0][0, tokenizer.convert_tokens_to_ids(ans)] for ans in answers}
+            generation_ = max(option_scores_, key=option_scores_.get)
+            lprobs_ = [float(option_scores_[a]) for a in answers]
 
-            # extract logprobs
-            lprobs = [float(option_scores[a]) for a in answers]
+            assert (option_scores_, generation_, lprobs_) == (option_scores, generation, lprobs)
 
         elif engine in ["stablevicuna"]:
             # todo: combine with stablelm
@@ -1147,16 +1164,11 @@ def eval(args, subject, engine, dev_df, test_df, permutations_dict, llm_generato
             if args.simulate_conversation_theme:
                 raise NotImplementedError("noisy conversation not implemented for user message")
 
-            # extract the score for each possible answer
-            option_scores = {
-                ans: output.scores[0][0, tokenizer.convert_tokens_to_ids(ans)] for ans in answers
-            }
-
-            # take the most probable answer as the generation
-            generation = max(option_scores, key=option_scores.get)
-
-            # extract logprobs
-            lprobs = [float(option_scores[a]) for a in answers]
+            option_scores, generation, lprobs = parse_hf_outputs(output=output, tokenizer=tokenizer, answers=answers)
+            option_scores_ = {ans: output.scores[0][0, tokenizer.convert_tokens_to_ids(ans)] for ans in answers}
+            generation_ = max(option_scores_, key=option_scores_.get)
+            lprobs_ = [float(option_scores_[a]) for a in answers]
+            assert (option_scores_, generation_, lprobs_) == (option_scores, generation, lprobs)
 
         elif engine in ["up_llama_60b_instruct", "up_llama2_70b_instruct_v2"]:
             if args.simulate_conversation_theme:
@@ -1170,7 +1182,9 @@ def eval(args, subject, engine, dev_df, test_df, permutations_dict, llm_generato
             tokenizer, model = llm_generator
 
             inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-            # del inputs["token_type_ids"]
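+            # some tokenizers return token_type_ids, which model.generate() does not accept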
+            if "token_type_ids" in inputs:
+                del inputs["token_type_ids"]
+
             streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 
             output = model.generate(
@@ -1178,7 +1192,7 @@ def eval(args, subject, engine, dev_df, test_df, permutations_dict, llm_generato
                 streamer=streamer,
                 use_cache=True,
                 max_new_tokens=1,
-                temperature=0.0001,
+                # temperature=0.0001,
                 do_sample=False,
                 top_p=1.0,
                 return_dict_in_generate=True,
@@ -1186,17 +1200,7 @@ def eval(args, subject, engine, dev_df, test_df, permutations_dict, llm_generato
             )
             # output_text = tokenizer.decode(output[0], skip_special_tokens=True)
 
-            # extract the score for each possible answer
-            option_scores = {
-                ans: output.scores[0][0, tokenizer.convert_tokens_to_ids(ans)] for ans in answers
-            }
-            print("option_scores:", option_scores)
-
-            # take the most probable answer as the generation
-            generation = max(option_scores, key=option_scores.get)
-
-            # extract logprobs
-            lprobs = [float(option_scores[a]) for a in answers]
+            option_scores, generation, lprobs = parse_hf_outputs(output=output, tokenizer=tokenizer, answers=answers)
 
         elif engine in ["rp_incite_7b_instruct", "rp_incite_7b_chat"]:
             if args.system_message:
@@ -1214,18 +1218,11 @@ def eval(args, subject, engine, dev_df, test_df, permutations_dict, llm_generato
                 return_dict_in_generate=True, output_scores=True
             )
 
-            # extract the score for each possible answer
-            option_scores = {
-                ans: output.scores[0][0, tokenizer.convert_tokens_to_ids(ans)] for ans in answers
-            }
-
-            print("option_scores:", option_scores)
-
-            # take the most probable answer as the generation
-            generation = max(option_scores, key=option_scores.get)
-
-            # extract logprobs
-            lprobs = [float(option_scores[a]) for a in answers]
+            option_scores, generation, lprobs = parse_hf_outputs(output=output, tokenizer=tokenizer, answers=answers)
+            option_scores_ = {ans: output.scores[0][0, tokenizer.convert_tokens_to_ids(ans)] for ans in answers}
+            generation_ = max(option_scores_, key=option_scores_.get)
+            lprobs_ = [float(option_scores_[a]) for a in answers]
+            assert (option_scores_, generation_, lprobs_) == (option_scores, generation, lprobs)
 
         elif engine in ["stablelm"]:
             if args.simulate_conversation_theme:
@@ -1265,16 +1262,11 @@ def eval(args, subject, engine, dev_df, test_df, permutations_dict, llm_generato
                 output_scores=True
             )
 
-            # extract the score for each possible answer
-            option_scores = {
-                ans: output.scores[0][0, tokenizer.convert_tokens_to_ids(ans)] for ans in answers
-            }
-
-            # take the most probable answer as the generation
-            generation = max(option_scores, key=option_scores.get)
-
-            # extract logprobs
-            lprobs = [float(option_scores[a]) for a in answers]
+            option_scores, generation, lprobs = parse_hf_outputs(output=output, tokenizer=tokenizer, answers=answers)
+            option_scores_ = {ans: output.scores[0][0, tokenizer.convert_tokens_to_ids(ans)] for ans in answers}
+            generation_ = max(option_scores_, key=option_scores_.get)
+            lprobs_ = [float(option_scores_[a]) for a in answers]
+            assert (option_scores_, generation_, lprobs_) == (option_scores, generation, lprobs)
 
         else:
             raise ValueError(f"Not recognized model {engine}.")
@@ -1385,17 +1377,23 @@ def main(args):
         assert "data_mmlu" in args.data_dir
 
         subjects_to_evaluate = [
-            "high_school_biology",
-            "high_school_chemistry",
-            "high_school_computer_science",
-            "high_school_european_history",
-            "high_school_geography",
-            "high_school_mathematics",
-            "high_school_physics",
-            "high_school_psychology",
-            "high_school_us_history",
-            "high_school_world_history",
+            'high_school_biology',
+            'high_school_chemistry',
+            'high_school_computer_science',
+            'high_school_european_history',
+            'high_school_geography',
+            'high_school_government_and_politics',
+            'high_school_macroeconomics',
+            'high_school_mathematics',
+            'high_school_microeconomics',
+            'high_school_physics',
+            'high_school_psychology',
+            'high_school_statistics',
+            'high_school_us_history',
+            'high_school_world_history'
         ]
+
+        assert set(subjects_to_evaluate).issubset(subjects)
         subjects = subjects_to_evaluate
 
     if "mmlu_college" in args.experiment_name:
@@ -1412,8 +1410,10 @@ def main(args):
         assert set(subjects_to_evaluate).issubset(subjects)
         subjects = subjects_to_evaluate
 
+        # the explicit list and the assert above replace filtering subjects by "college"
         # subjects = [s for s in subjects if "college" in s]
 
+
     if "data_hofstede" == args.data_dir:
         assert "hofstede" in args.experiment_name
 
@@ -1479,9 +1479,22 @@ def main(args):
                 max_seq_len=2048,
                 max_batch_size=1,
             )
-        elif engine in ["up_llama_60b_instruct", "up_llama2_70b_instruct_v2"]:
 
-            hf_cache_dir = "/gpfsscratch/rech/imi/utu57ed/.cache/huggingface"
+        elif engine in ["zephyr-7b-beta"]:
+            print("Loading zephyr-7b-beta")
+            zephyr_pipe = transformers.pipeline(
+                "text-generation",
+                model="HuggingFaceH4/zephyr-7b-beta",
+                torch_dtype=torch.bfloat16,
+                device_map="auto"
+            )
+
+            tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta", cache_dir=hf_cache_dir, device_map="auto")
+            model = AutoModelForCausalLM.from_pretrained("HuggingFaceH4/zephyr-7b-beta", torch_dtype=torch.bfloat16, device_map="auto", cache_dir=hf_cache_dir)
+
+            llm_generator = (tokenizer, model, zephyr_pipe)
+
+        elif engine in ["up_llama_60b_instruct", "up_llama2_70b_instruct_v2"]:
 
             if engine == "up_llama2_70b_instruct_v2":
                 tokenizer = AutoTokenizer.from_pretrained("upstage/Llama-2-70b-instruct-v2", cache_dir=hf_cache_dir)
@@ -1494,7 +1507,6 @@ def main(args):
                 )
 
             elif engine == "up_llama_60b_instruct":
-                hf_cache_dir = "/gpfsscratch/rech/imi/utu57ed/.cache/huggingface"
 
                 tokenizer = AutoTokenizer.from_pretrained("upstage/llama-65b-instruct", cache_dir=hf_cache_dir)
                 model = AutoModelForCausalLM.from_pretrained(
@@ -1505,14 +1517,12 @@ def main(args):
                     rope_scaling={"type": "dynamic", "factor": 2}  # allows handling of longer inputs
                 )
 
-            print("Loaded.")
             llm_generator = (tokenizer, model)
 
 
-
         elif engine in ["falcon-40b", "falcon-40b-instruct"]:
+            raise NotImplementedError("Falcon not implemented.")
             model_name = f"tiiuae/{engine}"
-            hf_cache_dir = "/gpfsscratch/rech/imi/utu57ed/.cache/huggingface"
 
             tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=hf_cache_dir)
             model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=hf_cache_dir, trust_remote_code=True)
@@ -1546,7 +1556,6 @@ def main(args):
             if engine == "rp_incite_7b_instruct":
 
                 # init
-                hf_cache_dir = "/gpfsscratch/rech/imi/utu57ed/.cache/huggingface"
                 tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-INCITE-7B-Instruct", cache_dir=hf_cache_dir)
                 model = AutoModelForCausalLM.from_pretrained("togethercomputer/RedPajama-INCITE-7B-Instruct", torch_dtype=torch.float16, cache_dir=hf_cache_dir)
 
@@ -1558,7 +1567,6 @@ def main(args):
             elif engine == "rp_incite_7b_chat":
 
                 # init
-                hf_cache_dir = "/gpfsscratch/rech/imi/utu57ed/.cache/huggingface"
                 tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-INCITE-7B-Chat", cache_dir=hf_cache_dir)
                 model = AutoModelForCausalLM.from_pretrained("togethercomputer/RedPajama-INCITE-7B-Chat", torch_dtype=torch.float16, cache_dir=hf_cache_dir)
                 model = model.to('cuda:0')
@@ -1566,16 +1574,15 @@ def main(args):
             else:
                 raise ValueError("Unknown model.")
 
-            print("Loaded.")
             llm_generator = (tokenizer, model)
 
+
         elif engine in ["stablelm", "stablevicuna", "openassistant_rlhf2_llama30b"]:
 
             if engine == "stablelm":
                 print("Loading stable-lm-tuned-alpha-7b.")
-                stablelm_cache_dir = "/gpfswork/rech/imi/utu57ed/stablelm_models"
-                tokenizer = AutoTokenizer.from_pretrained("StabilityAI/stablelm-tuned-alpha-7b", cache_dir=stablelm_cache_dir)
-                model = AutoModelForCausalLM.from_pretrained("StabilityAI/stablelm-tuned-alpha-7b", cache_dir=stablelm_cache_dir)
+                tokenizer = AutoTokenizer.from_pretrained("StabilityAI/stablelm-tuned-alpha-7b", cache_dir=hf_cache_dir)
+                model = AutoModelForCausalLM.from_pretrained("StabilityAI/stablelm-tuned-alpha-7b", cache_dir=hf_cache_dir)
 
             elif engine == "stablevicuna":
                 print("Loading stable-vicuna-13b.")
@@ -1594,31 +1601,40 @@ def main(args):
                 raise NotImplementedError(f"{engine} not supported")
 
             model.half().cuda()
-            print("Loaded.")
             llm_generator = (tokenizer, model)
 
         else:
             llm_generator = None
 
+        print(f"Loaded model: {args.engine}.")
+
         all_cors = []
 
         # list because of permutations
-        subj_acc = []
-        subj_len = []
-        metrics = []
-        answers = []
+        subj_acc = [{} for _ in range(args.permutations)]
+        subj_len = [{} for _ in range(args.permutations)]
+        metrics = [{} for _ in range(args.permutations)]
+        answers = [{} for _ in range(args.permutations)]
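+        # pre-allocate one dict per permutation instead of appending inside the loop below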
 
         for subject in subjects:
             if args.ntrain >= 1:
                 # create in-context examples from dev
-                dev_df = pd.read_csv(os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None)[:args.ntrain]
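+                # keep_default_na=False: strings like "None" or "NA" stay as strings instead of being parsed as NaN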
+                dev_df = pd.read_csv(
+                    os.path.join(args.data_dir, "dev", subject + "_dev.csv"),
+                    header=None,
+                    keep_default_na=False,
+                )[:args.ntrain]
                 # if the question contains \n in the csv it will get parsed as \\n, we revert it back here to be newline
                 dev_df[0][:] = dev_df[0][:].str.replace("\\n", "\n")
 
             else:
                 dev_df = None
 
-            test_df = pd.read_csv(os.path.join(args.data_dir, args.eval_set, subject + f"_{args.eval_set}.csv"), header=None)
+            test_df = pd.read_csv(
+                os.path.join(args.data_dir, args.eval_set, subject + f"_{args.eval_set}.csv"),
+                header=None,
+                keep_default_na=False,
+            )
             # if the question contains \n in the csv it will get parsed as \\n, we revert it back here to be newline
             test_df[0][:] = test_df[0][:].str.replace("\\n", "\n")
 
@@ -1654,10 +1670,10 @@ def main(args):
             for perm_i, permutations_dict in enumerate(permutations_dicts):
                 print(f"PERMUTATION {perm_i}")
 
-                subj_acc.append({})
-                subj_len.append({})
-                metrics.append({})
-                answers.append({})
 
                 cors, acc, probs, preds, gpt_tokens = eval(
                     args=args,
@@ -1816,13 +1832,12 @@ def main(args):
 
                 plot_dict(m, savefile=os.path.join(dump_results_dir, f"plot_{subj}.png"))
 
+        # plot mean accuracy over subjects
+        plot_dict(mean_subj_acc, savefile=os.path.join(dump_results_dir, "plot_mean_acc.png"))
+
         if not os.path.exists(dump_results_dir):
             os.mkdir(dump_results_dir)
 
-        # if not all([(np.mean([m['pvq_male'][k] for m in metrics]) == mean_metrics['pvq_male'][k]) for k in mean_metrics['pvq_male'].keys()]):
-        #     print("The metrics are not consistent.")
-        #     from IPython import embed; embed();
-
         json_dump_path = os.path.join(dump_results_dir, 'results.json')
 
         with open(json_dump_path, 'w') as fp:
diff --git a/models_stat_test.py b/models_stat_test.py
index 47edf2a3c04e7d1eb153268f7b7a415e85937ea9..b6244aba2cf8164f895dbbde1c1de43be755ca49 100644
--- a/models_stat_test.py
+++ b/models_stat_test.py
@@ -6,10 +6,24 @@ from collections import defaultdict
 import scipy.stats as stats
 from termcolor import colored
 
+data = defaultdict(dict)
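+# data[model]["<test>_res<S|U><2|3>"] -> results dir (System/User message, 2nd/3rd person)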
 # use t-tests to compare
 
+## Zephyr
+data["zephyr"]["pvq_resS2"] = "results_neurips/results_nat_lang_prof_pvq_test_zephyr-7b-beta_perm_50_System_msg_2nd_prs/"
+data["zephyr"]["pvq_resS3"] = "results_neurips/results_nat_lang_prof_pvq_test_zephyr-7b-beta_perm_50_System_msg_3rd_prs/"
+data["zephyr"]["pvq_resU2"] = "results_neurips/results_nat_lang_prof_pvq_test_zephyr-7b-beta_perm_50_User_msg_2nd_prs/"
+data["zephyr"]["pvq_resU3"] = "results_neurips/results_nat_lang_prof_pvq_test_zephyr-7b-beta_perm_50_User_msg_3rd_prs/"
+data["zephyr"]["hof_resS2"] = "results_neurips/results_nat_lang_prof_hofstede_test_zephyr-7b-beta_perm_50_System_msg_2nd_prs/"
+data["zephyr"]["hof_resS3"] = "results_neurips/results_nat_lang_prof_hofstede_test_zephyr-7b-beta_perm_50_System_msg_3rd_prs/"
+data["zephyr"]["hof_resU2"] = "results_neurips/results_nat_lang_prof_hofstede_test_zephyr-7b-beta_perm_50_User_msg_2nd_prs/"
+data["zephyr"]["hof_resU3"] = "results_neurips/results_nat_lang_prof_hofstede_test_zephyr-7b-beta_perm_50_User_msg_3rd_prs/"
+data["zephyr"]["big5_resS2"] = "results_neurips/results_nat_lang_prof_big5_test_zephyr-7b-beta_perm_50_System_msg_2nd_prs/"
+data["zephyr"]["big5_resS3"] = "results_neurips/results_nat_lang_prof_big5_test_zephyr-7b-beta_perm_50_System_msg_3rd_prs/"
+data["zephyr"]["big5_resU2"] = "results_neurips/results_nat_lang_prof_big5_test_zephyr-7b-beta_perm_50_User_msg_2nd_prs/"
+data["zephyr"]["big5_resU3"] = "results_neurips/results_nat_lang_prof_big5_test_zephyr-7b-beta_perm_50_User_msg_3rd_prs/"
+
 ## GPT4
-data=defaultdict(dict)
 # data["gpt4"]["pvq_resS2"] = "results_neurips/results_nat_lang_prof_pvq_test_gpt-4-0314_perm_50_System_msg_2nd_prs/"
 data["gpt4"]["pvq_resS3"] = "results_neurips/results_nat_lang_prof_pvq_test_gpt-4-0314_perm_50_System_msg_3rd_prs/"
 # data["gpt4"]["pvq_resU2"] = "results_neurips/results_nat_lang_prof_pvq_test_gpt-4-0314_perm_50_User_msg_2nd_prs/"
@@ -172,24 +186,24 @@ data["ada"]["big5_resU2"]="results_neurips/results_nat_lang_prof_big5_test_ada_p
 data["ada"]["big5_resU3"]="results_neurips/results_nat_lang_prof_big5_test_ada_perm_50_User_msg_3rd_prs/"
 
 
-models = ["gpt4", "gpt35m", "gpt35j", "gpt35in", "upllama2","upllama1", "oa", "stvic", "stlm", "llama", "rpchat", "rpincite", "curie", "babbage", "ada"]
+models = ["zephyr", "gpt4", "gpt35m", "gpt35j", "gpt35in", "upllama2","upllama1", "oa", "stvic", "stlm", "llama", "rpchat", "rpincite", "curie", "babbage", "ada"]
 msg = ["S", "U"]
 prs = ["2", "3"]
 
 # pvq
-# questionnaires = ["pvq"]
-# comparisons = [("gpt35m", m) for m in models]
-# label_1 = "pvq_resU2"
+questionnaires = ["pvq"]
+comparisons = [("gpt35m", m) for m in models]
+label_best = "pvq_resU2"
 
 # hof
-# questionnaires = ["hof"]
-# comparisons = [("upllama1", m) for m in models]
-# label_1 = "hof_resU3"
+questionnaires = ["hof"]
+comparisons = [("upllama1", m) for m in models]
+label_best = "hof_resU3"
 #
 # # big5
 questionnaires = ["big5"]
 comparisons = [("gpt35j", m) for m in models]
-label_1 = "big5_resS3"
+label_best = "big5_resS3"
 
 
 # replace paths with data from alignments.json
@@ -219,6 +233,7 @@ for model in models:
                         # Append the data to the list
                         json_data.extend(load_data)
                 data[model][label] = json_data
+
 p_limit = 0.05 / 15
 
 print("p-limit: {}".format(p_limit))
@@ -237,7 +252,7 @@ for mod_1, mod_2 in comparisons:
                 if label not in data[mod_1] or label not in data[mod_2]:
                     continue
 
-                a=data[mod_1][label_1]
+                a=data[mod_1][label_best]
                 b=data[mod_2][label]
 
                 pvalue = stats.ttest_ind(a=a, b=b, equal_var=False).pvalue
diff --git a/requirements.txt b/requirements.txt
index d4bea5b40ca0b1e020b1c31a7001b732704c4b0c..d2029efed2cda9b5966e35e8faefe99cf473a8bd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -86,8 +86,18 @@ wcwidth==0.2.6
 wsproto==1.2.0
 yarl==1.8.2
 zipp==3.15.0
+tiktoken==0.5.1
+matplotlib
+openai
+pandas
+bs4
+selenium
+torch==1.13.1
+accelerate==0.18.0
+sentencepiece==0.1.98
+protobuf==3.20.1
 # matplotlib==3.7.1
 # tiktoken==0.4.0
 # pandas==2.0.1
 # selenium==4.9.1
 # bs4
diff --git a/visualization_scripts/bar_viz.py b/visualization_scripts/bar_viz.py
index a37c06869e2b1595e7f84dfc5a5860044e297567..b5ab15c94b3a55f7503515c4c6cc3f3a49e5e553 100644
--- a/visualization_scripts/bar_viz.py
+++ b/visualization_scripts/bar_viz.py
@@ -13,11 +13,34 @@ import scikit_posthocs as sp
 import pandas as pd
 import pingouin as pg
 from itertools import combinations
-from scipy.stats import pearsonr, spearmanr
+from scipy.stats import pearsonr, spearmanr, ConstantInputWarning
 
 def is_strictly_increasing(lst):
     return all(x <= y for x, y in zip(lst, lst[1:]))
 
+def print_dict_values(d):
+    keys = list(d.keys()) + ["Mean"]
+    vals = list(d.values()) + [np.mean(list(d.values()))]
+    print("\t".join([f"{k:<10}" for k in keys]))
+    print("\t".join([f"{v:<10.2f}" for v in vals]))
+
+import warnings
+# raise ConstantInputWarning as an error so constant inputs can be handled explicitly below
+warnings.filterwarnings('error', category=ConstantInputWarning)
+def compute_correlation(scores_1, scores_2, spearman):
+    try:
+        if spearman:
+            correlation, _ = spearmanr(scores_1, scores_2)
+        else:
+            correlation, _ = pearsonr(scores_1, scores_2)
+
+    except ConstantInputWarning:
+        # not ideal but no other way
+        if len(set(scores_1)) == 1 and len(set(scores_2)) == 1:
+            # both constant
+            correlation = 1.0
+        else:
+            # one constant
+            correlation = 0.0
+
+    return correlation
+
 
 def dir_to_label(directory):
 
@@ -162,10 +185,10 @@ def print_correlation_stats(cs):
         "STD": np.std(cs),
         "Min": np.min(cs),
         "Max": np.max(cs),
-        "Perc 25": np.percentile(cs, 25),
-        "Perc 75": np.percentile(cs, 75),
-        "Skew": skew(cs, axis=0, bias=True),
-        "Kurtosis": kurtosis(cs, axis=0, bias=True)
+        # "Perc 25": np.percentile(cs, 25),
+        # "Perc 75": np.percentile(cs, 75),
+        # "Skew": skew(cs, axis=0, bias=True),
+        # "Kurtosis": kurtosis(cs, axis=0, bias=True)
     }
 
     headers = "\t".join(statistics.keys())
@@ -428,6 +451,7 @@ if __name__ == '__main__':
 
     normalized_evaluation_data = []
     notnorm_evaluation_data = []
+    vals = []
 
     parser = argparse.ArgumentParser()
     parser.add_argument('directories', nargs='+', help='directories containing results.json files')
@@ -458,9 +482,9 @@ if __name__ == '__main__':
 
 
     ignore = [
-        # "religion",
-        # "tax",
-        # "vacation",
+        "religion",
+        "tax",
+        "vacation",
 
         # "grammar",
         # "poem",
@@ -502,6 +526,8 @@ if __name__ == '__main__':
     ignore_patterns = ["gen_space", "gen_w_space"]
     print("Ignoring patterns: ", ignore_patterns)
 
+    rank_order_stability = True
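+    # toggles the rank-order stability correlation analysis further below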
+
     for substring in ignore_patterns:
         directories = [d for d in directories if substring not in d]
 
@@ -515,10 +541,16 @@ if __name__ == '__main__':
         test_set_name = "big5_50"
     elif "big5_100" in directories[0]:
         test_set_name = "big5_100"
+    elif "mmlu" in directories[0]:
+        raise NotImplementedError("not implemented for mmlu")
+        test_set_name = "college_biology"
+        # test_set_name = "college_chemistry"
+        # test_set_name = "college_computer_science"
+        # test_set_name = "college_medicine"
+        # test_set_name = "college_physics"
     else:
         test_set_name = "pvq_male"
 
-
     dir_2_data = {}
     for i, directory in enumerate(directories):
         if not os.path.isdir(directory):
@@ -527,11 +559,11 @@ if __name__ == '__main__':
         if not os.path.isfile(results_json_path):
             continue
 
+
         with open(results_json_path, 'r') as f:
             data = json.load(f)
         dir_2_data[directory] = data
 
-
     if test_set_name == "pvq_male":
         test_set_values = [
             'Conformity',
@@ -562,6 +594,8 @@ if __name__ == '__main__':
             "Agreeableness",
             "Conscientiousness"
         ]
+    elif "college_" in test_set_name:
+        test_set_values = ["accuracy"]
 
     primary_value_alignments = []
     mean_vars = []
@@ -676,8 +710,23 @@ if __name__ == '__main__':
     normalized_evaluation_data = np.array(normalized_evaluation_data)
     notnorm_evaluation_data = np.array(notnorm_evaluation_data)
 
-    # mean over value dimensions
-    # mean_variance = variances.mean()
+    # per value/trait: mean over perspectives and permutations
+    per_value_persp_mean = notnorm_evaluation_data.mean(axis=0).mean(axis=0)
+    # per value/trait: std over perspectives, averaged over permutations
+    per_value_persp_std = notnorm_evaluation_data.std(axis=0).mean(axis=0)
+    # per value/trait: std over both perspectives and permutations
+    per_value_std_std = notnorm_evaluation_data.std(axis=(0, 1))
+
+    print("Per value std of perspectives")
+    for v_, m_, std_, std_std_ in zip(test_set_values, per_value_persp_mean, per_value_persp_std, per_value_std_std):
+        print("{:<15} \tM: {:.2f} \tSD (avg perm): {:.2f} \tSD (sd perm) {:.2f}".format(v_, m_, std_, std_std_))
+    print("{:<15} \tM: {:.2f} \tSD (avg perm): {:.2f} \tSD (sd perm) {:.2f}".format(
+        "Mean",
+        np.mean(per_value_persp_mean),
+        np.mean(per_value_persp_std),
+        np.mean(per_value_std_std)
+    ))
+
 
     # print(f"Mean (over values) Variance (over perspectives): {mean_variance}")
     perm_var = normalized_evaluation_data.var(1).mean()
@@ -689,6 +738,7 @@ if __name__ == '__main__':
     # mean(permut) -> var (persp) -> mean (values)
     print(f"Perspective Var - mean (over values/traits) of var (over perspectives) of mean (over perm) (*10^3): {round(persp_var * (10 ** 3), 2)}")
 
+
     # all_evaluation_data = (n_perspectives, n_permutations, n_values/traits)
     # we do a separate anova for each value/trait
     assert normalized_evaluation_data.shape[-1] == len(test_set_values)
@@ -705,130 +755,57 @@ if __name__ == '__main__':
     print("testing p-value: ", group_p_limit)
 
     if test_set_name == "pvq_male":
-        # PVQ centering - is this reccomented in this manual? # todo: do properly all questions
+        # PVQ centering - is this recommended in the manual? # todo: do this properly for all questions
         notnorm_evaluation_data = notnorm_evaluation_data - np.repeat(notnorm_evaluation_data.mean(2)[:,:, np.newaxis], 10, axis=2)
 
+    # Cohen's d for every perspective pair, collected per value/trait
+    cohens_ds = defaultdict(list)
+
     for val_i, val in enumerate(test_set_values):
         # value_data = normalized_evaluation_data[:, :, val_i] # value_data = (n_perspectives, n_permutations)
         value_data = notnorm_evaluation_data[:, :, val_i] # value_data = (n_perspectives, n_permutations)
 
         print(f"\n----------------{val}---------------------")
 
-        assume_assumptions=True
-
-        if assume_assumptions:
-            # it's usually ok to violate assumptions
-            levene_ok = True
-            gaussian = True
-
-        else:
-            # for assumptions
-            alpha = 0.001
-
-            # LEVENE test for equal variance (assumption for anova and tukey)
-            levene_test_stat, levene_p_value = stats.levene(*value_data)
-            if levene_p_value < alpha:  # or whatever alpha level you choose
-                levene_ok = False
-                print(f"Levene not OK (p={levene_p_value}) - not same variances")
-
-            else:
-                levene_ok = True
-                print(f"Levene OK (p={levene_p_value}) - same variances")
-
-            # Shapiro - normality
-            gaussian = []
-            ps = []
-            for v_d in value_data:
-                _, p = shapiro(v_d)
-                ps.append(p)
-
-                if p < alpha:
-                    gaussian.append(False)
-                else:
-                    gaussian.append(True)
-
-                # plt.hist(v_d)
-                # plt.title(f"p = {p} - {'' if gaussian[-1] else 'not'} gaussian")
-                # plt.show()
-
-            gaussian = all(gaussian)
-            # print(f"Gaussian (p={ps}): {gaussian}")
-            print(f"Gaussian (Shapiro) (p={np.round(ps, 5)}): {gaussian}")
-
-        if gaussian:
-            if levene_ok:
-                # standard ANOVA
-                testname = "ANOVA"
-                _, pvalue = stats.f_oneway(*value_data)
-            else:
-                # Welch ANOVA
-                test_name = "Welch ANOVA"
-                df = pd.DataFrame({'score': np.array(value_data).flatten(),
-                                   'group': np.repeat(labels, repeats=50)})
-
-                p_value = pg.welch_anova(dv='score', between='group', data=df)["p-unc"].values
-                assert len(p_value) == 1
-                p_value = p_value[0]
-
-        else:
-            # kruskal
-            testname = "Krushal-Wallis (H-test)"
-            _, pvalue = stats.kruskal(*value_data)
-
+        # standard ANOVA
+        _, pvalue = stats.f_oneway(*value_data)
 
         if pvalue >= group_p_limit:
             different_distr.append(False)
-            print(colored(f"p({testname}): {pvalue} > {group_p_limit}", "red"))
+            print(colored(f"p(ANOVA): {pvalue} > {group_p_limit}", "red"))
+            posthoc_test = False
         else:
             different_distr.append(True)
-            print(colored(f"p({testname}): {pvalue} < {group_p_limit}", "green"))
+            print(colored(f"p(ANOVA): {pvalue} < {group_p_limit}", "green"))
+            posthoc_test = True
 
-            pairs_ind = list(itertools.combinations(range(len(value_data)), 2))
+        # do posthocs and compute cohen's ds
+        pairs_ind = list(itertools.combinations(range(len(value_data)), 2))
 
-            if gaussian and levene_ok:
-                # Tukey
-                tukey_res=tukey_hsd(*value_data)
+        if posthoc_test:
+            tukey_res = tukey_hsd(*value_data)  # Tukey post-hoc test
+            print("\tTukey (p limit) :", group_p_limit)
 
-                # print("\tTukey (p limit) :", p_limit)
-                print("\tTukey (p limit) :", group_p_limit)
-                for pair_ind in pairs_ind:
+        print("\tTukey (p limit) :", group_p_limit)
+        for pair_ind in pairs_ind:
 
-                    res_pvalue = tukey_res.pvalue[pair_ind[0], pair_ind[1]]
+            # cohen's d
+            d = cohen_d(value_data[pair_ind[0]], value_data[pair_ind[1]])
+            cohens_ds[val].append(d)
 
-                    if res_pvalue < p_limit:
-                        d = cohen_d(value_data[pair_ind[0]], value_data[pair_ind[1]])
-                        print("\t{} - {} : p -> {:.8f} d -> {:.8f}".format(labels[pair_ind[0]], labels[pair_ind[1]], res_pvalue, d))
+            # posthoc
+            if posthoc_test:
+                res_pvalue = tukey_res.pvalue[pair_ind[0], pair_ind[1]]
+                if res_pvalue < p_limit:
+                    print("\t{} - {} : p -> {:.8f} d -> {:.8f}".format(
+                        labels[pair_ind[0]],
+                        labels[pair_ind[1]],
+                        res_pvalue, d)
+                    )
 
-            else:
-                # Dunn
-                dunn_res = sp.posthoc_dunn(value_data, p_adjust='bonferroni')
-                for pair_ind in pairs_ind:
-                    res_pvalue = dunn_res.iloc[pair_ind[0], pair_ind[1]]
-                    if res_pvalue < p_limit:
-                        print("\t{} - {} : {}".format(labels[pair_ind[0]], labels[pair_ind[1]], res_pvalue))
 
 
-            # # bonferroni correction
-            # n_comparisons = len(pairs_ind)
-            # bonf_p_limit = p_limit / len(pairs_ind)
-            #
-            # print(f"\tBonferroni corrected p : {p_limit} -> {bonf_p_limit}")
-            # print(f"\t{'t-test' if gaussian else 'u-test'} p < {bonf_p_limit} for: ")
-            #
-            # for pair_ind in pairs_ind:
-            #     if gaussian:
-            #         # welch or standard t-test (welch - leven_ok - equal_var:True)
-            #         res_pvalue = stats.ttest_ind(a=value_data[pair_ind[0]], b=value_data[pair_ind[1]], equal_var=levene_ok).pvalue
-            #     else:
-            #         from IPython import embed; embed();
-            #
-            #         # Mann-Whitney U
-            #         res_pvalue = stats.mannwhitneyu(x=value_data[pair_ind[0]], y=value_data[pair_ind[1]]).pvalue
-            #
-            #     pair_labels = (labels[pair_ind[0]], labels[pair_ind[1]])
-            #     if res_pvalue < bonf_p_limit:
-            #         print("\t{} -> {} < {}".format(pair_labels, res_pvalue, bonf_p_limit))
-
+    avg_abs_cohens_ds = {k: np.mean(np.abs(v)) for k, v in cohens_ds.items()}
+    print(colored("Average absolute Cohen's d per value/trait:", "green"))
+    print_dict_values(avg_abs_cohens_ds)
 
     # plot bars
     for distr_i, different in enumerate(different_distr):
@@ -866,135 +843,86 @@ if __name__ == '__main__':
     print()
 
     ########################
-    # CORRELATIONS
+    # RANK ORDER STABILITY CORRELATIONS
     ########################
+    if rank_order_stability:
+        print(colored("Pearson Correlation (between perspectives) - permutation order", "green"))
+        corrs = defaultdict(list)
 
-    print(colored("Pearson Correlation (between perspectives) - permutation order", "green"))
-    corrs = defaultdict(list)
+        perm_orders = defaultdict(dict)
 
-    perm_orders = defaultdict(dict)
+        for dir_1, dir_2 in combinations(directories, 2):
 
-    for dir_1, dir_2 in combinations(directories, 2):
+            lab_1 = dir_to_label(dir_1)
+            lab_2 = dir_to_label(dir_2)
 
-        lab_1 = dir_to_label(dir_1)
-        lab_2 = dir_to_label(dir_2)
+            # print(f"{lab_1} vs {lab_2}")
+            for key in keys:
 
-        print(f"{lab_1} vs {lab_2}")
-        for key in keys:
+                scores_1 = [d[test_set_name][key] for d in dir_2_data[dir_1]["per_permutation_metrics"]]
+                scores_2 = [d[test_set_name][key] for d in dir_2_data[dir_2]["per_permutation_metrics"]]
 
-            scores_1 = [d[test_set_name][key] for d in dir_2_data[dir_1]["per_permutation_metrics"]]
-            scores_2 = [d[test_set_name][key] for d in dir_2_data[dir_2]["per_permutation_metrics"]]
+                correlation = compute_correlation(scores_1, scores_2, spearman)
 
+                corrs[key].append(correlation)
 
-            if spearman:
-                correlation, _ = spearmanr(scores_1, scores_2)
-            else:
-                correlation, _ = pearsonr(scores_1, scores_2)
+                # print(f"\t{key} : {correlation}")
 
-            if np.isnan(correlation):
-                # Constant values in one group. Correlation is undefined and artificially set to 0
-                correlation = 0.0
+        all_corrs = list(itertools.chain(*corrs.values()))
+        print_aggregated_correlation_stats(all_corrs)
 
-            corrs[key].append(correlation)
+        permutation_order_stabilities = {}
+        for key in keys:
+            permutation_order_stabilities[key] = np.mean(corrs[key])
 
-            print(f"\t{key} : {correlation}")
+        print("\nPermutation order stability due to perspective change")
+        print_dict_values(permutation_order_stabilities)
 
-    for key in keys:
-        print(f"{key}:")
-        print("\tAvg Pearson corr:", np.mean(corrs[key]))
-        print("\tMin Pearson corr:", np.min(corrs[key]))
 
-    all_corrs = list(itertools.chain(*corrs.values()))
-    print_aggregated_correlation_stats(all_corrs)
+        print("--------------------------------------------------")
 
-    print("--------------------------------------------------")
+        # order of perspectives
+        print(colored("Pearson Correlation (between permutations) - perspective order", "green"))
+        corrs = defaultdict(list)
+        persp_orders = defaultdict(dict)
 
-    # order of perspectives
-    print(colored("Pearson Correlation (between permutations) - perspective order", "green"))
-    corrs = defaultdict(list)
-    persp_orders = defaultdict(dict)
 
-    n_perms = len(dir_2_data[dir_1]["per_permutation_metrics"])
+        n_perms = len(dir_2_data[directories[0]]["per_permutation_metrics"])
 
-    assert n_perms == 50
-    assert n_perms == len(dir_2_data[dir_2]["per_permutation_metrics"])
+        assert n_perms == 50
+        assert all(len(dir_2_data[d]["per_permutation_metrics"]) == n_perms for d in directories)
+
+        for perm_1, perm_2 in combinations(range(n_perms), 2):
+
+            for key in keys:
+                scores_1 = [dir_2_data[d]["per_permutation_metrics"][perm_1][test_set_name][key] for d in directories]
+                scores_2 = [dir_2_data[d]["per_permutation_metrics"][perm_2][test_set_name][key] for d in directories]
 
-    for perm_1, perm_2 in combinations(range(n_perms), 2):
+                correlation = compute_correlation(scores_1, scores_2, spearman)
+                corrs[key].append(correlation)
 
+        all_corrs = list(itertools.chain(*corrs.values()))
+        print_aggregated_correlation_stats(all_corrs)
+
+        perspective_order_stabilities = {}
         for key in keys:
-            scores_1 = [dir_2_data[d]["per_permutation_metrics"][perm_1][test_set_name][key] for d in directories]
-            scores_2 = [dir_2_data[d]["per_permutation_metrics"][perm_2][test_set_name][key] for d in directories]
+            perspective_order_stabilities[key] = np.mean(corrs[key])
 
-            if spearman:
-                correlation, _ = spearmanr(scores_1, scores_2)
-            else:
-                correlation, _ = pearsonr(scores_1, scores_2)
-
-            if np.isnan(correlation):
-                # Constant values in one group. Correlation is undefined and artificially set to 0
-                correlation = 0.0
-
-            corrs[key].append(correlation)
-
-    for key in keys:
-        print(key)
-        print("\tAvg Pearson corr:", np.mean(corrs[key]))
-        print("\tMin Pearson corr:", np.min(corrs[key]))
-
-    all_corrs = list(itertools.chain(*corrs.values()))
-
-    print_aggregated_correlation_stats(all_corrs)
-
-    # print("--------------------------------------------------")
-    # # order of perspectives - avg over permutations first (average scores)
-    # print(colored("Pearson Correlation (between perspectives) - values order (average scores)", "green"))
-    # corrs = list()
-    # key_orders = defaultdict(dict)
-    # persp_scores = defaultdict(dict)
-    #
-    # n_perms = len(dir_2_data[dir_1]["per_permutation_metrics"])
-    #
-    # assert n_perms == 50
-    # assert n_perms == len(dir_2_data[dir_2]["per_permutation_metrics"])
-    #
-    # for d in directories:
-    #     for key in keys:
-    #         persp_scores[d][key] = np.mean([dir_2_data[d]["per_permutation_metrics"][i][test_set_name][key] for i in range(50)])
-    #
-    #     # keys in order
-    #     key_orders[d] = [k for k, v in sorted(persp_scores[d].items(), key=lambda item: item[1])]
-    #
-    # for dir1, dir2 in combinations(directories, 2):
-    #
-    #     if compare_scores:
-    #         # key scores in order
-    #         values1 = [persp_scores[dir1][k] for k in key_orders[dir1]]
-    #         assert is_strictly_increasing(values1)
-    #         # same order of keys (values/traits) with other scores
-    #         values2 = [persp_scores[dir2][k] for k in key_orders[dir1]]
-    #
-    #     else:
-    #         mapping = {v: i for (i, v) in enumerate(key_orders[dir1])}
-    #
-    #         values1 = [mapping[v] for v in key_orders[dir1]]
-    #         values2 = [mapping[v] for v in key_orders[dir2]]
-    #
-    #     if spearman:
-    #         correlation, _ = spearmanr(values1, values2)
-    #     else:
-    #         correlation, _ = pearsonr(values1, values2)
-    #
-    #     if np.isnan(correlation):
-    #         assert compare_scores
-    #         # Constant values in one group. Correlation is undefined and artificially set to 0
-    #         correlation = 0.0
-    #
-    #     print(f"\t{dir_to_label(dir1)} - {dir_to_label(dir2)} : {correlation}")
-    #     corrs.append(correlation)
-    #
-    # print_aggregated_correlation_stats(corrs)
+        print("\nPerspective order stability due to permutation change")
+        print_dict_values(perspective_order_stabilities)
+
+        print(colored("\nAverage rank-order stability due to permutation change", "green"))
+        # average order stability
+        avg_order_stabilities = {}
+        for k in perspective_order_stabilities.keys():
+            avg_order_stabilities[k] = (permutation_order_stabilities[k] + perspective_order_stabilities[k]) / 2
 
+        print_dict_values(avg_order_stabilities)
 
+
+    ### IPSATIVE CORRELATIONS
+    print("\n\n--------------------------------------------------")
+    print("IPSATIVE CORELATIONS")
     print("--------------------------------------------------")
 
     # order of perspectives - avg over permutations last (average r)
@@ -1003,10 +931,9 @@ if __name__ == '__main__':
     key_orders = defaultdict(dict)
     persp_scores = defaultdict(lambda : defaultdict(dict))
 
-    n_perms = len(dir_2_data[dir_1]["per_permutation_metrics"])
+    n_perms = len(dir_2_data[directories[0]]["per_permutation_metrics"])
 
     assert n_perms == 50
-    assert n_perms == len(dir_2_data[dir_2]["per_permutation_metrics"])
 
     for p_i in range(50):
         for d in directories:
@@ -1022,14 +949,16 @@ if __name__ == '__main__':
         for p_i in range(50):
             scores_1 = [persp_scores[p_i][dir1][k] for k in keys]
             scores_2 = [persp_scores[p_i][dir2][k] for k in keys]
-            if spearman:
-                correlation, _ = spearmanr(scores_1, scores_2)
-            else:
-                correlation, _ = pearsonr(scores_1, scores_2)
 
-            if np.isnan(correlation):
-                # Constant values in one group. Correlation is undefined and artificially set to 0
-                correlation = 0.0
+            correlation = compute_correlation(scores_1, scores_2, spearman)
 
             correlation_list.append(correlation)