diff --git a/README.md b/README.md
index 8604a535d38941aa4645377d7d85940f1c951290..72b37eb9c62c4629c53822d66c2cdc993831ede6 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,14 @@
 
 ## Installation
 
-Setup you conda env
+Initialize and fetch the llama submodule
+```
+git submodule update --init --recursive
+```
+
+
+Set up the conda env
 ```
 conda create -n llm_persp python=3.9
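+conda activate llm_persp  # activate the new env before installing dependencies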
 pip install -r requirements.txt 
@@ -11,8 +18,8 @@ pip install -r requirements.txt
 
 
 Install llama
-``` pip install -r <llama_path>/requirements.txt```
-``` pip install -e <llama_path>```
+```
+pip install -r llama/requirements.txt
+pip install -e llama/
+```
 
 Set up llama_dir in evaluate.py - the dir with checkpoints and encoder
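+
+For example (hypothetical path, adapt to your setup):
+```
+llama_dir = "/path/to/llama_files/"  # directory containing the checkpoint folders (e.g. 7B/) and tokenizer.model
+```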
 
diff --git a/evaluate.py b/evaluate.py
index 48102edb9bbb10f7617f6b8ab525ea517ae91860..7abe4bfca4819804c9febea40799a526859c2325 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -3,10 +3,23 @@ import datetime
 import random
 import re
 import json
+from collections import defaultdict
 
 import matplotlib.pyplot as plt
 import tiktoken
 
+import torch
+import sys
+from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList
+
+def map_choice_to_number(letter, permutations_dict):
+    # A-F -> 1-6
+    # find index of letter in choices and add 1
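+    # e.g. (hypothetical mapping) permutations_dict = {"A": 2, ...} means displayed option "A" shows the
+    # 3rd original option, so map_choice_to_number("A", permutations_dict) returns 3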
+
+    number = permutations_dict[letter] + 1
+    # assert number == choices.index(letter) + 1
+    return number
+
 
 def plot_dict(data, savefile=None):
     # Get the keys and values from the dictionary
@@ -56,123 +69,325 @@ choices = ["A", "B", "C", "D", "E", "F"]
 
 llama_dir = "/gpfswork/rech/imi/utu57ed/llama/llama_files/"
 
-def get_prompts_skeleton(subject, experiment_name, profile_dict):
+high_level_cat_dict = {
+    "Hedonism,Stimulation,Self-Direction": "Openness to Change",
+    "Universalism,Benevolence": "Self-Transcendence",
+    "Conformity,Tradition,Security": "Conservation",
+    "Power,Achievement": "Self-Enhancement"
+}
 
-    if "hofstede" in experiment_name:
-        if args.direct_perspective:
-            raise NotImplementedError("Profile not implemented.")
 
+def get_prompt_skeleton(subject, experiment_name, args):
 
-        # default
-        prompts = {
-            "intro": "The following is a questionnaire (with answers)\n\n",
-            "query": f"\nAnswer:",
-        }
-        if profile_dict:
+    profile_dict = args.profile_dict
 
-            # we define the perspective through a profile
-            profile_string = "\n".join([
-                f"\t{k}:{v}" for k, v in profile_dict.items()
-            ])
+    if sum(map(bool, [args.profile, args.lotr_character, args.music_expert_genre, args.hobby])) > 1:
+        raise ValueError("Multiple ways of inducing a perspective are defined.")
 
-            prompts = {
-                "intro": "The following is a questionnaire (with answers) given to a person with the following profile:\n"+
-                         f"{profile_string}\n\n",
-                "query": f"\nAnswer (from the person):",
-            }
+    if "pvq" in experiment_name or "hofstede" in experiment_name or "big5" in experiment_name:
 
-    elif "pvq" in experiment_name:
+        # Natural language profile is only defined for dictionary profile
+        if not args.profile_dict:
+            if args.natural_language_profile:
+                raise NotImplementedError("Natural language profile not implemented.")
 
-        questionniare_description = "Here we briefly describe some people. Please read each description and think about how much each person is or is not like you. Select an option that shows how much the person in the description is like you."
-        # questionniare_description = "Here we briefly describe some people. Please read each description and think about how much each person is or is not like you. Put an X in the box to the right that shows how much the person in the description is like you.",
+        if "pvq" in experiment_name:
+            assert "pvq" in args.data_dir
+            # todo: extract this elsewhere to make it cleaner
+            test_name = "pvq"
 
-        # default
-        prompts = {
-            "intro": questionniare_description + "\n\n",
-            "query": f"\nAnswer:",
-        }
+        elif "hofstede" in experiment_name:
+            assert "hofstede" in args.data_dir
+            test_name = "hofstede"
+
+        elif "big5" in experiment_name:
+            if args.data_dir == "data_big5_50":
+                test_name = "big5_50"
+            elif args.data_dir == "data_big5_100":
+                test_name = "big5_100"
+            else:
+                raise ValueError(f"Data dir name is ill-defined {args.data_dir}")
+
+        else:
+            raise ValueError(f"Experiment name is ill-defined {args.experiment_name}")
+
+        # setup the questionnaire description
+        if test_name == "pvq":
+            questionnaire_description = "Here we briefly describe some people. Please read each description and think about how much each person is or is not like you. Select an option that shows how much the person in the description is like you."
+
+        elif test_name == "hofstede":
+            # VSM questionnaire doesn't have a description
+            questionnaire_description = ""
+
+        elif test_name in ["big5_50", "big5_100"]:
+            # Big Five items just use a short agreement instruction
+            questionnaire_description = "Mark how much you agree with each statement."
 
-        if not profile_dict:
+        # setup the set_perspective_str
+        if args.profile_dict:
+            if args.perspective_amount not in ["extreme", "slight"] and list(args.profile_dict.keys()) != ["Primary values"]:
+                raise NotImplementedError('Perspective amount not implemented for keys other than "Primary values".')
+
+            # source pvq: https://scholarworks.gvsu.edu/cgi/viewcontent.cgi?article=1116&context=orpc
+            # source hofstede: https://geerthofstede.com/wp-content/uploads/2016/07/Manual-VSM-2013.pdf
+
+            if args.natural_language_profile:
+
+                # we define the profile in natural language
+                assert set(args.profile_dict.keys()) == {'Primary values'}
+
+                # extract primary values, add space after commas and replace the last comma with "and"
+                primary_values_str = args.profile_dict["Primary values"]
+
+                if args.add_high_level_categories:
+                    if test_name == "pvq":
+                        primary_values_str += f",{high_level_cat_dict[args.profile_dict['Primary values']]}"
+                    else:
+                        raise ValueError("High level categories are implemented only for pvq.")
+
+                primary_values_str = primary_values_str.replace(",", ", ")
+                primary_values_str = ", and ".join(primary_values_str.rsplit(', ', 1))
+
+                if args.natural_language_profile_detail == "high":
+                    primary_values = args.profile_dict["Primary values"]
+                    if test_name == "pvq":
+                        values_description = "Here are the explanations of those values:\n" + \
+                                             ("\t- Self-Direction : independent thought and action - choosing, creating, exploring.\n" if "Self-Direction" in primary_values else "") + \
+                                             ("\t- Stimulation : excitement, novelty, and challenge in life.\n" if "Stimulation" in primary_values else "") + \
+                                             ("\t- Hedonism : pleasure or sensuous gratification for oneself.\n" if "Hedonism" in primary_values else "") + \
+                                             ("\t- Achievement : personal success through demonstrating competence according to social standards.\n" if "Achievement" in primary_values else "") + \
+                                             ("\t- Power : social status and prestige, control or dominance over people and resources.\n" if "Power" in primary_values else "") + \
+                                             ("\t- Security : safety, harmony, and stability of society, of relationships, and of self.\n" if "Security" in primary_values else "") + \
+                                             ("\t- Conformity : restraint of actions, inclinations, and impulses likely to upset or harm others and violate social expectations or norms.\n" if "Conformity" in primary_values else "") + \
+                                             ("\t- Tradition : respect, commitment, and acceptance of the customs and ideas that one's culture or religion provides.\n" if "Tradition" in primary_values else "") + \
+                                             ("\t- Benevolence : preserving and enhancing the welfare of those with whom one is in frequent personal contact (the ‘in-group’).\n" if "Benevolence" in primary_values else "") + \
+                                             ("\t- Universalism : understanding, appreciation, tolerance, and protection for the welfare of all people and for nature.\n" if "Universalism" in primary_values else "")
+
+                    elif test_name == "hofstede":
+                        values_description = "Here are the explanations of those values:\n" + \
+                                             ("\t- Power distance: Power Distance is defined as the extent to which the less powerful members of institutions and organizations within a society expect and accept that power is distributed unequally.\n" if "Power distance" in primary_values else "") + \
+                                             ("\t- Individualism vs Collectivism: Individualism is the opposite of Collectivism. Individualism stands for a society in which the ties between individuals are loose: a person is expected to look after himself or herself and his or her immediate family only. Collectivism stands for a society in which people from birth onwards are integrated into strong, cohesive in-groups, which continue to protect them throughout their lifetime in exchange for unquestioning loyalty.\n" if "Individualism" in primary_values else "") + \
+                                             ("\t- Masculinity vs Femininity: Masculinity is the opposite of Femininity. Masculinity stands for a society in which social gender roles are clearly distinct: men are supposed to be assertive, tough, and focused on material success; women are supposed to be more modest, tender, and concerned with the quality of life. Femininity stands for a society in which social gender roles overlap: both men and women are supposed to be modest, tender, and concerned with the quality of life.\n"  if "Masculinity" in primary_values else "") + \
+                                             ("\t- Uncertainty avoidance: Uncertainty Avoidance is defined as the extent to which the members of institutions and organizations within a society feel threatened by uncertain, unknown, ambiguous, or unstructured situations.\n" if "Uncertainty avoidance" in primary_values else "") + \
+                                             ("\t- Short vs Long-term orientation: Long Term Orientation is the opposite of Short Term Orientation. Long Term Orientation stands for a society which fosters virtues oriented towards future rewards, in particular adaptation, perseverance and thrift. Short Term orientation stands for a society which fosters virtues related to the past and present, in particular respect for tradition, preservation of “face”, and fulfilling social obligations. \n" if "Long-term orientation" in primary_values else "")  + \
+                                             ("\t- Indulgence vs Restraint: Indulgence stands for a society which allows relatively free gratification of some desires and feelings, especially those that have to do with leisure, merrymaking with friends, spending, consumption and sex. Its opposite pole, Restraint, stands for a society which controls such gratification, and where people feel less able to enjoy their lives.\n" if "Indulgence" in primary_values else "")
+
+                    elif test_name in ["big5_50", "big5_100"]:
+                        raise NotImplementedError(f"Descriptions not implemented for {test_name}.")
+
+                    else:
+                        raise NotImplementedError(f"High detail option not implemented for {test_name}")
+
+                elif args.natural_language_profile_detail == "no":
+                    values_description = ""
+                else:
+                    raise ValueError(f"Variable natural_language_profile_detail is ill-defined ({args.natural_language_profile_detail}).")
+
+                # perspective amount
+                if args.perspective_amount == "extreme":
+                    amount_str = "extremely more importance"
+                elif args.perspective_amount == "slight":
+                    amount_str = "slightly more importance"
+                else:
+                    amount_str = "the most importance"
+
+                if args.direct_perspective:
+                    set_perspective_str = f"You are a person attributing {amount_str} to high {primary_values_str}."
+                    if values_description:
+                        set_perspective_str += f" {values_description}"
+                    else:
+                        set_perspective_str += "\n"
+
+                else:
+                    set_perspective_str = f"The following is a questionnaire (with answers) given to a person attributing {amount_str} to high {primary_values_str}."
+                    if values_description:
+                        set_perspective_str += f" {values_description}"
+                    else:
+                        set_perspective_str += "\n"
+
+            else:
+
+                if args.perspective_amount in ["extreme", "slight"]:
+                    amount_str = f"({args.perspective_amount})"
+                else:
+                    amount_str = ""
+
+                # we define the perspective through a key:value profile
+
+                # only using profile for primary values at the moment -> todo: remove this assert later
+                assert set(args.profile_dict.keys()) == {'Primary values'}  # we add ":high "
+                kv_profile_string = "\n".join([
+                    f"\t{k}{amount_str}:high {v}" for k, v in args.profile_dict.items()
+                ])
+                if args.add_high_level_categories:
+                    kv_profile_string += f",{high_level_cat_dict[args.profile_dict['Primary values']]}"
+
+                if args.direct_perspective:
+                    set_perspective_str = "You are a person with the following profile:\n" + \
+                                      f"{kv_profile_string}\n"
+
+                else:
+                    set_perspective_str = "The following is a questionnaire (with answers) given to a person with the following profile:\n" + \
+                                      f"{kv_profile_string}\n"
+
+            # query string
             if args.direct_perspective:
-                raise NotImplementedError("Profile not implemented.")
+                query_str = "Answer:"
+            else:
+                query_str = "Answer (from the person):"
 
-        if "hitler" in experiment_name:
-            prompts = {
-                "intro": "The following is a questionnaire (with answers) given to Adolf Hitler.\n\n"+
-                         questionniare_description + "\n\n",
-                "query": f"\nAnswer (from Adolf Hitler):",
-            }
+        elif args.lotr_character is not None:
 
-        elif "gandhi" in experiment_name:
-            prompts = {
-                "intro": "The following is a questionnaire (with answers) given to Mahatma Gandhi.\n\n"+
-                         questionniare_description + "\n\n",
-                "query": f"\nAnswer (from Mahatma Gandhi):",
-            }
+            lotr_character_name = args.lotr_character
+            assert lotr_character_name in [
+                "Gandalf",
+                "Frodo",
+                "Sauron",
+                "Aragorn",
+                "Pippin",
+            ]
 
-        elif "dorian" in experiment_name:
-            prompts = {
-                "intro": "The following is a questionnaire (with answers) given to Dorian Gray.\n\n"+
-                         questionniare_description + "\n\n",
-                "query": f"\nAnswer (from Dorian Gray):",
-            }
+            if args.direct_perspective:
+                set_perspective_str = f"You are {lotr_character_name} from The Lord of the Rings."
 
-        elif profile_dict:
-            # we define the perspective through a profile
-            profile_string = "\n".join([
-                f"\t{k}:{v}" for k, v in profile_dict.items()
-            ])
+            else:
+                set_perspective_str = f"The following is a questionnaire (with answers) given to {lotr_character_name} from The Lord of the Rings."
 
+            if args.natural_language_profile:
+                raise NotImplementedError("Natural language profile not implemented.")
+
+            # query string
             if args.direct_perspective:
-                set_profile_str = "You are a person with the following profile:"
                 query_str = "Answer:"
+            else:
+                query_str = f"Answer (from {lotr_character_name}):"
+
+        elif args.hobby is not None:
+
+            hobby = args.hobby
+            assert hobby in [
+                 "singing in a church choir",
+                 "gambling",
+                 "playing chess",
+                 "volunteering at a homeless shelter",
+                 "playing in a rock band",
+                 "car racing"
+            ]
+
+            if args.direct_perspective:
+                set_perspective_str = f"You enjoy {hobby}."
 
             else:
-                set_profile_str = "The following is a questionnaire (with answers) given to a person with the following profile:"
+                set_perspective_str = f"The following is a questionnaire (with answers) given to a person who enjoys {hobby}."
+
+            if args.natural_language_profile:
+                raise NotImplementedError("Natural language profile not implemented.")
+
+            # query string
+            if args.direct_perspective:
+                query_str = "Answer:"
+            else:
                 query_str = "Answer (from the person):"
 
+        elif args.music_expert_genre is not None:
+
+            music_expert_genre = args.music_expert_genre
+            assert music_expert_genre in [
+                "rap",
+                "hip-hop",
+                "jazz",
+                "classical",
+                "heavy metal",
+                "reggae",
+                "rock",
+                "gospel"
+            ]
+
+            if args.direct_perspective:
+                # set_perspective_str = f"You are an expert in {music_expert_genre}."
+                set_perspective_str = f"You are an AI expert in {music_expert_genre} music."
+
+            else:
+                set_perspective_str = f"The following is a questionnaire (with answers) given to an expert AI in {music_expert_genre} music."
+
+            if args.natural_language_profile:
+                raise NotImplementedError("Natural language profile not implemented.")
+
+            # query string
+            if args.direct_perspective:
+                query_str = "Answer:"
+            else:
+                query_str = f"Answer (from a {music_expert_genre} expert):"
+
+        else:
+            raise ValueError("Undefined perspective.")
+
+        if args.system_message:
             prompts = {
-                "intro": f"{set_profile_str}\n"+
-                         f"{profile_string}\n\n"+
-                         f"{questionniare_description}\n\n",
+                "system": f"{set_perspective_str}".rstrip(), # remove newline from the end
+                "intro": f"{questionnaire_description}\n\n" if questionnaire_description else "",
+                "query": f"\n{query_str}",
+            }
+
+        else:
+            if args.separator and not args.direct_perspective:
+                set_perspective_str += "\n" + "-"*200
+
+            prompts = {
+                "intro": f"{set_perspective_str}\n\n" +
+                         (f"{questionnaire_description}\n\n" if questionnaire_description else ""),  # if questionnaire_description is empty don't add newlines
                 "query": f"\n{query_str}",
             }
 
     elif "mmlu" in experiment_name:
+        raise DeprecationWarning("Deprecated")
+        if args.natural_language_profile:
+            raise NotImplementedError("Natural language profile not implemented.")
+
+        if args.perspective_amount not in ["extreme", "slight"]:
+            raise NotImplementedError("Perspective amount not implemented")
 
-        questionniare_description = "The following are multiple choice questions (with answers)."
+        questionnaire_description = "The following are multiple choice questions (with answers)."
         prompts = {
-            "intro": f"{questionniare_description}\n\n",
+            "intro": f"{questionnaire_description}\n\n",
             "query": f"\nAnswer:"
         }
 
-        if profile_dict:
+        if args.profile_dict:
             # we define the perspective through a profile
             profile_string = "\n".join([
-                f"\t{k}:{v}" for k, v in profile_dict.items()
+                f"\t{k}:{v}" for k, v in args.profile_dict.items()
             ])
 
             if args.direct_perspective:
-                set_profile_str = "You are a person with the following profile:"
+                set_perspective_str = "You are a person with the following profile:"
                 query_str = "Answer:"
 
             else:
-                set_profile_str = "The following is a questionnaire (with answers) given to a person with the following profile:"
+                set_perspective_str = "The following is a questionnaire (with answers) given to a person with the following profile:"
                 query_str = "Answer (from the person):"
 
             prompts = {
-                "intro": f"{set_profile_str}\n"+
+                "intro": f"{set_perspective_str}\n"+
                          f"{profile_string}\n\n",
                 "query": f"\n{query_str}",
             }
 
-
     elif subject == "political_compass":
-        if profile_dict:
+        raise DeprecationWarning("Deprecated")
+
+        if args.profile_dict:
            raise NotImplementedError("Profile not implemented.")
 
         if args.direct_perspective:
             raise NotImplementedError("Profile not implemented.")
+
+        if args.natural_language_profile:
+            raise NotImplementedError("Natural language profile not implemented.")
+
+        if args.perspective_amount not in ["extreme", "slight"]:
+            raise NotImplementedError("Perspective amount not implemented")
         # default
         prompts = {
             "intro": "",
@@ -228,19 +443,26 @@ def get_prompts_skeleton(subject, experiment_name, profile_dict):
             }
 
     elif "tomi" in subject:
+        raise DeprecationWarning("Deprecated")
+
+        if args.direct_perspective:
+            raise NotImplementedError("Profile not implemented.")
+
+        if args.natural_language_profile:
+            raise NotImplementedError("Natural language profile not implemented.")
+
+        if args.perspective_amount not in ["extreme", "slight"]:
+            raise NotImplementedError("Perspective amount not implemented")
 
         prompts = {
             "intro": "",
             "query": "\nAnswer:"
         }
 
-        if args.direct_perspective:
-            raise NotImplementedError("Profile not implemented.")
-
-        if profile_dict:
+        if args.profile_dict:
             # we define the perspective through a profile
             profile_string = "\n".join([
-                f"\t{k}:{v}" for k, v in profile_dict.items()
+                f"\t{k}:{v}" for k, v in args.profile_dict.items()
             ])
 
             prompts = {
@@ -249,80 +471,15 @@ def get_prompts_skeleton(subject, experiment_name, profile_dict):
                 "query": f"\nAnswer (from the person):",
             }
 
-        # if "tomi_default" in experiment_name:
-        #     # in-context examples are not from the same age as the query age
-        #     prompts = {
-        #         "intro": "",
-        #         "query": "\nAnswer:"
-        #     }
-        #
-        # if "tomi_age" in experiment_name:
-        #     numbers = [int(num) for num in re.findall(r'\d+', experiment_name)]
-        #     age = ["3", "5", "10", "15", "30", "50"][numbers[0]]
-        #
-        #     # in-context examples are not from the same age as the query age
-        #     prompts = {
-        #         # "intro": "The following is a text comprehension test with answers from a {} year old.\n\n".format(age),
-        #         "intro": "The following is a text comprehension test with answers from people of various ages.\n\n".format(age),
-        #         "query": "\nAnswer (from a {} year old):".format(age)
-        #     }
-        #
-        # if "tomi_default_new" in experiment_name:
-        #     # in-context examples are not from the same age as the query age
-        #     prompts = {
-        #         "intro": "The following are multiple choice questions (with answers).\n\n",
-        #         "query": "\nAnswer:"
-        #     }
-        #
-        # if "tomi_age_new" in experiment_name:
-        #     numbers = [int(num) for num in re.findall(r'\d+', experiment_name)]
-        #     age = ["3", "5", "10", "15", "30", "50"][numbers[0]]
-        #
-        #     # in-context examples are not from the same age as the query age
-        #     prompts = {
-        #         "intro": "The following are multiple choice questions (with answers from {} year old human).\n\n".format(age),
-        #         "query": "\nAnswer (from a {} year old):".format(age),
-        #     }
-        #
-        # if "tomi_dog" in experiment_name:
-        #     # in-context examples are not from the same age as the query age
-        #     prompts = {
-        #         "intro": "The following is a text comprehension test with answers from a dog.\n\n",
-        #         "query": "\nAnswer (from a dog):"
-        #     }
-
     else:
-        if profile_dict:
-            raise NotImplementedError("Profile not implemented.")
-
-        if args.direct_perspective:
-            raise NotImplementedError("Profile not implemented.")
-
-        # default
-        prompts = {
-            "intro": "The following are multiple choice questions (with answers) about {}.\n\n".format(
-                format_subject(subject)),
-            "query": "\nAnswer:"
-        }
-
-        if experiment_name == "spec_prof":
-            prompts = {
-                "intro": "The following is an interview with a professor of {}.\n\n".format(format_subject(subject)),
-                "query": "\nAnswer from the professor:"
-            }
-
-        if experiment_name == "unspec_prof":
-            prompts = {
-                "intro": "The following is an interview with a professor.\n\n",
-                "query": "\nAnswer from the professor:"
-            }
+        raise DeprecationWarning("Deprecated")
 
     return prompts
 
-def dummy_lprobs_from_generation(response):
-    # lprobs (todo: this is hardcoded)
-    lprobs = [-100] * len(choices)
-    for i, op in enumerate(choices):
+
+def dummy_lprobs_from_generation(response, answers):
+    lprobs = [-100] * len(answers)
+    for i, op in enumerate(answers):
         if op in response:
             lprobs[i] = -0.01
 
@@ -336,6 +493,7 @@ def softmax(x):
     softmax = numerator/denominator
     return softmax
 
+
 def format_subject(subject):
     l = subject.split("_")
     s = ""
@@ -344,58 +502,78 @@ def format_subject(subject):
     return s
 
 
-def format_example(df, idx, subject, experiment_name, include_answer=True):
-    prompt = df.iloc[idx, 0]
+def format_example(df, idx, subject, experiment_name, args, permutations_dict, include_answer=True):
+    prompt = df.iloc[idx, 0]  # add question to prompt
     k = df.shape[1] - 2
 
+
+    # extract options
     num_options = 0
+    options_strings = []
     for j in range(k):
         op_str = df.iloc[idx, j+1]
 
         if op_str == "undef":
             continue
 
-        prompt += "\n{}. {}".format(choices[j], op_str)
-        num_options += 1
-
-    # makes in-context examples be from a 50-year-old
-    if "tomi_age" in experiment_name and include_answer:
-
-        # include_answer is true -> this is an in-context example
-        # create dummy examples -> change the age to 50, so that the correct answers make sense
+        options_strings.append(op_str)
 
-        query_str = get_prompts_skeleton(subject=subject, experiment_name=experiment_name, profile_dict=args.profile_dict)["query"]
-        assert "year old" in query_str
-        new_query_str = re.sub(r'\d+', '50', query_str)
+        num_options += 1
 
-        prompt += new_query_str
+    for ch in choices[:num_options]:
+        prompt += "\n{}. {}".format(ch, options_strings[permutations_dict[ch]])
 
-    else:
-        # add query prompt (ex. "\nAnswer:")
-        prompt += get_prompts_skeleton(subject=subject, experiment_name=experiment_name, profile_dict=args.profile_dict)["query"]
+    prompt_skeleton = get_prompt_skeleton(subject=subject, experiment_name=experiment_name, args=args)
+    prompt += prompt_skeleton["query"]
 
     if include_answer:
         prompt += " {}\n\n".format(df.iloc[idx, k + 1])
-    return prompt, num_options
 
-def gen_prompt(train_df, subject, experiment_name, k=-1):
+    return prompt, num_options, prompt_skeleton
+
+# legacy
+# def format_example_(df, idx, subject, experiment_name, args, permutations_dict, include_answer=True):
+#     prompt = df.iloc[idx, 0]  # add question to prompt
+#     k = df.shape[1] - 2
+#
+#     # extract options
+#     num_options = 0
+#     for j in range(k):
+#         op_str = df.iloc[idx, j + 1]
+#
+#         if op_str == "undef":
+#             continue
+#
+#         prompt += "\n{}. {}".format(choices[j], op_str)
+#         num_options += 1
+#
+#     prompt_skeleton = get_prompt_skeleton(subject=subject, experiment_name=experiment_name, args=args)
+#     prompt += prompt_skeleton["query"]
+#
+#     if include_answer:
+#         prompt += " {}\n\n".format(df.iloc[idx, k + 1])
+#
+#     return prompt, num_options, prompt_skeleton
+
+
+def gen_prompt(train_df, subject, experiment_name, args, permutations_dict, k=-1):
 
     # get intro prompt (ex. "The following are .... \n\n" )
-    prompt = get_prompts_skeleton(subject=subject, experiment_name=experiment_name, profile_dict=args.profile_dict)["intro"]
+    prompt = get_prompt_skeleton(subject=subject, experiment_name=experiment_name, args=args)["intro"]
 
     if k == -1:
         k = train_df.shape[0]
     for i in range(k):
-        example_prompt, _ = format_example(train_df, i, subject=subject, experiment_name=experiment_name)
+        example_prompt, _, _ = format_example(train_df, i, subject=subject, experiment_name=experiment_name, args=args, permutations_dict=permutations_dict)
         prompt += example_prompt
     return prompt
 
 
-def eval(args, subject, engine, dev_df, test_df, llama_generator=None):
+def eval(args, subject, engine, dev_df, test_df, permutations_dict, llm_generator=None):
     cors = []
     all_probs = []
     all_answers = []
-    answers = choices[:test_df.shape[1]-2]
+    # answers = choices[:test_df.shape[1]-2]
 
     gpt_token_counter = 0
 
@@ -405,60 +583,79 @@ def eval(args, subject, engine, dev_df, test_df, llama_generator=None):
 
         # get prompt and make sure it fits
         k = args.ntrain
-        prompt_end, n_options = format_example(test_df, i, subject=subject, experiment_name=args.experiment_name, include_answer=False)
-        train_prompt = gen_prompt(dev_df, subject, experiment_name=args.experiment_name, k=k)
+        prompt_end, n_options, prompt_skeleton = format_example(test_df, i, subject=subject, experiment_name=args.experiment_name, include_answer=False, args=args, permutations_dict=permutations_dict)
+        train_prompt = gen_prompt(dev_df, subject, experiment_name=args.experiment_name, k=k, args=args, permutations_dict=permutations_dict)
         prompt = train_prompt + prompt_end
 
+        # all questions have the same number of options
+        assert test_df.shape[1]-2 == n_options
+        answers = choices[:n_options]
+
         #  crop to 2048 tokens
         #  this is used when the prompt is too long it feeds the most possible number of examples that fit
         while crop(prompt) != prompt:
             k -= 1
-            train_prompt = gen_prompt(dev_df, subject, k)
+            train_prompt = gen_prompt(dev_df, subject, experiment_name=args.experiment_name, k=k, args=args, permutations_dict=permutations_dict)
             prompt = train_prompt + prompt_end
 
         label = test_df.iloc[i, test_df.shape[1]-1]
-        assert label in choices + ["undef"]
+        assert label in answers + ["undef"]
 
         if args.estimate_gpt_tokens:
             encoder = tiktoken.encoding_for_model('gpt-3.5-turbo-0301')
             assert encoder == tiktoken.encoding_for_model('gpt-4-0314')
-            gpt_token_counter += len(encoder.encode(prompt)) + 1  # prompt + 1 generated token
+            gpt_token_counter += len(encoder.encode(prompt_skeleton.get("system", "")+prompt)) + 1  # prompt + 1 generated token
 
         if engine == "dummy":
-            generation = random.choice([f"{c} ba" for c in choices])
-            lprobs = dummy_lprobs_from_generation(generation)
+            generation = random.choice([f"{c} ba" for c in answers])
+            lprobs = dummy_lprobs_from_generation(generation, answers)
 
         elif engine == "interactive":
             # ask the user to choose
             generation = input(f"{prompt}")
-            lprobs = dummy_lprobs_from_generation(generation)
+            lprobs = dummy_lprobs_from_generation(generation, answers)
 
         elif engine in ["llama_7B", "llama_13B", "llama_30B", "llama_65B"]:
+            if args.system_message:
+                raise ValueError("System message is not supported in LLaMA models.")
+
+            if args.match_tokens_with_space:
+                raise NotImplementedError("Tokens with space not implemented.")
+
             if args.generative_qa:
-                result = llama_generator.generate(
+                results, _ = llm_generator.generate(
                     [prompt],
                     max_gen_len=5,
                     temperature=0,
-                )[0]
+                    cap_if_too_long=True,
+                )
+                result = results[0]
 
                 generation = remove_prefix(result, prompt)
-                lprobs = dummy_lprobs_from_generation(generation)
+                lprobs = dummy_lprobs_from_generation(generation, answers)
 
             else:
-                results, top_logprobs = llama_generator.generate_next_token(
+                generated_cont, top_logprobs = llm_generator.generate(
                     [prompt],
-                    max_gen_len=1,
+                    max_gen_len=1,  # only one token
                     temperature=0,
-                    logprobs=100
+                    logprobs=100,
+                    cap_if_too_long=True,
                 )
 
-                # result = results[0]
                 top_logprobs = top_logprobs[0]
-
                 lprobs = []
+
+                answers_lprobs = {}
                 for ans in answers:
-                    print(f"ans {ans} : {top_logprobs.get(ans, -100)}")
-                    lprobs.append(top_logprobs.get(ans, -100))
+                    answer_lprob = top_logprobs.get(ans, -100)
+
+                    answers_lprobs[ans] = answer_lprob
+                    lprobs.append(answer_lprob)
+                    print(f"ans {ans} : {answer_lprob}")
+
+                # take the answer with the highest log prob
+                generation = max(answers_lprobs, key=answers_lprobs.get)
 
         elif engine in ["gpt-3.5-turbo", "gpt-4", "gpt-3.5-turbo-0301", "gpt-4-0314"]:
             while True:
@@ -474,30 +671,28 @@ def eval(args, subject, engine, dev_df, test_df, llama_generator=None):
                         # get the encoding for each letter in choices
                         if args.match_tokens_with_space:
                             logit_bias = {
-                                encoder.encode(f" {c}")[0]: 100 for c in choices[:n_options]
+                                encoder.encode(f" {c}")[0]: 100 for c in answers
                             }
 
                         else:
                             logit_bias = {
-                                encoder.encode(c)[0]: 100 for c in choices[:n_options]
+                                encoder.encode(c)[0]: 100 for c in answers
                             }
 
-
                     if args.system_message:
-                        assert prompt == train_prompt+prompt_end
-
-                        # user message
+                        # system message
                         c = openai.ChatCompletion.create(
                             model=engine,
                             messages=[
-                                {"role": "system", "content": train_prompt},
+                                {"role": "system", "content": prompt_skeleton["system"]},
                                 # {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
-                                {"role": "user", "content": prompt_end}
+                                {"role": "user", "content": prompt}
                             ],
                             max_tokens=max_tokens,
                             n=1,
                             temperature=0,
                             logit_bias=logit_bias,
+                            request_timeout=30,
                         )
 
                     else:
@@ -513,6 +708,7 @@ def eval(args, subject, engine, dev_df, test_df, llama_generator=None):
                             n=1,
                             temperature=0,
                             logit_bias=logit_bias,
+                            request_timeout=30,
                         )
 
                     break
@@ -525,11 +721,10 @@ def eval(args, subject, engine, dev_df, test_df, llama_generator=None):
             generation = c['choices'][0]['message']['content']
 
             if args.generative_qa:
-                if generation not in choices:
-                    raise ValueError("Generation is not in choices and gqa is not used. Potential problem with logit bias?")
-
+                if generation not in answers:
+                    raise ValueError(f"Generation is not in answers {answers} and gqa is not used. Potential problem with logit bias?")
 
-            lprobs = dummy_lprobs_from_generation(generation)
+            lprobs = dummy_lprobs_from_generation(generation, answers)
 
         elif engine in ["text-davinci-003", "text-davinci-002", "text-davinci-001", "curie", "babbage", "ada"]:
 
@@ -561,12 +756,132 @@ def eval(args, subject, engine, dev_df, test_df, llama_generator=None):
                     # print("Warning: {} not found. Artificially adding log prob of -100.".format(ans))
                     lprobs.append(-100)
 
+        elif engine in ["openassistant_rlhf2_llama30b"]:
+            if args.generative_qa:
+                raise NotImplementedError("Generative QA not implemented for OpenAssistant models.")
+
+            if args.system_message:
+                prompt = f'<prefix>{prompt_skeleton["system"]}</prefix><human>{prompt}<bot>'
+
+            else:
+                # prompt = f"<prefix></prefix><human>{prompt}<bot>"
+                prompt = f'<human>{prompt}<bot>'
+
+            tokenizer, model = llm_generator
+
+            inputs = tokenizer(prompt, return_tensors='pt').to('cuda')
+            output = model.generate(
+                **inputs,
+                max_new_tokens=1,
+                do_sample=False,
+                temperature=0.001,
+                top_p=1.0,
+                return_dict_in_generate=True,
+                output_scores=True
+            )
+
+            # extract the score for each possible answer
+            option_scores = {
+                ans: output.scores[0][0, tokenizer.convert_tokens_to_ids(ans)] for ans in answers
+            }
+
+            # take the most probable answer as the generation
+            generation = max(option_scores, key=option_scores.get)
+
+            # extract logprobs
+            lprobs = [float(option_scores[a]) for a in answers]
+
+        elif engine in ["stablevicuna"]:
+            # todo: combine with stablelm
+
+            if args.generative_qa:
+                raise NotImplementedError("Generative QA not implemented for StableVicuna.")
+
+            if args.system_message:
+                raise NotImplementedError("System message not implemented.")
+            else:
+                prompt = f"### Human: {prompt}\n### Assistant:"
+
+            tokenizer, model = llm_generator
+
+            inputs = tokenizer(prompt, return_tensors='pt').to('cuda')
+            output = model.generate(
+                **inputs,
+                max_new_tokens=1,
+                do_sample=False,
+                temperature=0.001,
+                top_p=1.0,
+                return_dict_in_generate=True,
+                output_scores=True
+            )
+
+            # extract the score for each possible answer
+            option_scores = {
+                ans: output.scores[0][0, tokenizer.convert_tokens_to_ids(ans)] for ans in answers
+            }
+
+            # take the most probable answer as the generation
+            generation = max(option_scores, key=option_scores.get)
+
+            # extract logprobs
+            lprobs = [float(option_scores[a]) for a in answers]
+
+        elif engine in ["stablelm"]:
+
+            if args.generative_qa:
+                raise NotImplementedError("Generative QA not implemented for StableLM.")
+
+            tokenizer, model = llm_generator
+
+            class StopOnTokens(StoppingCriteria):
+                def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+                    stop_ids = [50278, 50279, 50277, 1, 0]
+                    for stop_id in stop_ids:
+                        if input_ids[0][-1] == stop_id:
+                            return True
+                    return False
+
+            if args.system_message:
+                system_prompt = prompt_skeleton["system"]
+                user_prompt = prompt
+                prompt = f"<|SYSTEM|>{system_prompt}<|USER|>{user_prompt}<|ASSISTANT|>"
+
+            else:
+                # prompt = f"<|SYSTEM|><|USER|>{prompt}<|ASSISTANT|>"
+                prompt = f"<|USER|>{prompt}<|ASSISTANT|>"
+
+            inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+
+            output = model.generate(
+                **inputs,
+                max_new_tokens=1,
+                temperature=0.001,
+                do_sample=False,
+                stopping_criteria=StoppingCriteriaList([StopOnTokens()]),
+                return_dict_in_generate=True,
+                output_scores=True
+            )
+
+            # extract the score for each possible answer
+            option_scores = {
+                ans: output.scores[0][0, tokenizer.convert_tokens_to_ids(ans)] for ans in answers
+            }
+
+            # take the most probable answer as the generation
+            generation = max(option_scores, key=option_scores.get)
+
+            # extract logprobs
+            lprobs = [float(option_scores[a]) for a in answers]
+
         else:
-            raise ValueError(f"Not recotnized model {engine}.")
+            raise ValueError(f"Not recognized model {engine}.")
 
         if args.verbose:
             if args.system_message:
-                print(f"Prompt(System):\n{train_prompt}")
+                print(f"Prompt(System):\n{prompt_skeleton['system']}")
                 print(f"Prompt(User):\n{prompt}")
 
             else:
@@ -577,7 +892,7 @@ def eval(args, subject, engine, dev_df, test_df, llama_generator=None):
         if args.generative_qa:
 
             first_generated_letter = generation.strip()[:1]
-            if first_generated_letter in choices:
+            if first_generated_letter in answers:
                 pred = first_generated_letter
             else:
                 pred = "other"
@@ -588,12 +903,12 @@ def eval(args, subject, engine, dev_df, test_df, llama_generator=None):
 
         else:
             pred = {
-                i: c for i, c in enumerate(choices)
+                i: c for i, c in enumerate(answers)
             }[np.argmax(lprobs)]
             cor = pred == label
 
         if args.verbose:
-            print(f"Pred:{pred} (Generation:{generation})")
+            print(f"Pred:{pred} (Generation:{generation}; Score: {map_choice_to_number(pred, permutations_dict)})")
 
         if args.verbose:
             print("Correct: ", cor)
@@ -625,6 +940,7 @@ def remove_prefix(s, pref):
         return s[len(pref):]
     return s
 
+
 def main(args):
     engines = args.engine
     subjects = sorted([f.split("_test.csv")[0] for f in os.listdir(os.path.join(args.data_dir, "test")) if "_test.csv" in f])
@@ -682,7 +998,6 @@ def main(args):
         ]
         subjects = subjects_to_evaluate
 
-
     if "mmlu_college" in args.experiment_name:
         assert "data_mmlu" in args.data_dir
 
@@ -722,13 +1037,23 @@ def main(args):
     for engine in engines:
         print("engine:", engine)
         # dump results dir
-        dump_results_dir = os.path.join(args.save_dir, "_".join(
-            [args.experiment_name, engine, args.data_dir, f"ntrain_{args.ntrain}_" + f"profile_{args.profile}" if args.profile else "", timestamp]))
+        dump_results_dir = os.path.join(args.save_dir, "_".join([
+            args.experiment_name,
+            engine,
+            args.data_dir,
+            f"permutations_{args.permutations}",
+            f"ntrain_{args.ntrain}",
+            f"lotr_character_{args.lotr_character}" if args.lotr_character else "",
+            f"music_expert_{args.music_expert_genre}" if args.music_expert_genre else "",
+            f"hobby_{args.hobby}" if args.hobby else "",
+            f"profile_{args.profile}" if args.profile else "", timestamp
+        ]))
         os.makedirs(dump_results_dir, exist_ok=True)
 
         if engine in ["llama_7B", "llama_13B", "llama_30B", "llama_65B"]:
-            # load llama
-            from example_llama import setup_model_parallel, load
+            # todo: these functions should be moved to the llama submodule
+            from llama import setup_model_parallel, load
+
             local_rank, world_size = setup_model_parallel()
 
             if local_rank > 0:
@@ -741,18 +1066,51 @@ def main(args):
             llama_tokenizer_path = os.path.join(llama_dir, "tokenizer.model")
 
             # load model
-            llama_generator = load(llama_ckpt_dir, llama_tokenizer_path, local_rank, world_size)
+            llm_generator = load(
+                llama_ckpt_dir,
+                llama_tokenizer_path,
+                local_rank,
+                world_size,
+                max_seq_len=2048,
+                max_batch_size=1,
+            )
+        elif engine in ["stablelm", "stablevicuna", "openassistant_rlhf2_llama30b"]:
+            hf_cache_dir = "/gpfswork/rech/imi/utu57ed/stablelm_models"
+
+            if engine == "stablelm":
+                print("Loading stable-lm-tuned-alpha-7b.")
+                tokenizer = AutoTokenizer.from_pretrained("StabilityAI/stablelm-tuned-alpha-7b", cache_dir=hf_cache_dir)
+                model = AutoModelForCausalLM.from_pretrained("StabilityAI/stablelm-tuned-alpha-7b", cache_dir=hf_cache_dir)
+
+            elif engine == "stablevicuna":
+                print("Loading stable-vicuna-13b.")
+                tokenizer = AutoTokenizer.from_pretrained("/gpfswork/rech/imi/utu57ed/hf_stable_vicuna_13b")
+                model = AutoModelForCausalLM.from_pretrained("/gpfswork/rech/imi/utu57ed/hf_stable_vicuna_13b")
+
+            elif engine == "openassistant_rlhf2_llama30b":
+                print("Loading openassistant-rlhf2-llama30b.")
+                tokenizer = AutoTokenizer.from_pretrained("/gpfswork/rech/imi/utu57ed/oasst-rlhf-2-llama-30b-7k-steps-xor/oasst-rlhf-2-llama-30b-7k-steps")
+                print("tokenizer loaded")
+                model = AutoModelForCausalLM.from_pretrained("/gpfswork/rech/imi/utu57ed/oasst-rlhf-2-llama-30b-7k-steps-xor/oasst-rlhf-2-llama-30b-7k-steps")
+                print("model loaded")
+
+            else:
+                raise NotImplementedError(f"{engine} not supported")
+
+            model.half().cuda()
+            print("Loaded.")
+            llm_generator = (tokenizer, model)
 
         else:
-            llama_generator = None
+            llm_generator = None
 
         all_cors = []
 
-        subj_acc = {}
-        subj_len = {}
-
-        metrics = {}
-        answers = {}
+        # list because of permutations
+        subj_acc = []
+        subj_len = []
+        metrics = []
+        answers = []
 
         for subject in subjects:
             if args.ntrain >= 1:
@@ -768,96 +1126,189 @@ def main(args):
             # if the question contains \n in the csv it will get parsed as \\n, we revert it back here to be newline
             test_df[0][:] = test_df[0][:].str.replace("\\n", "\n")
 
-            # print("Example of prompt:\n")
-            # print(get_prompts_skeleton(subject, experiment_name=args.experiment_name)["intro"])
-
-            cors, acc, probs, preds, gpt_tokens = eval(args, subject, engine, dev_df, test_df, llama_generator=llama_generator)
-            all_cors.append(cors)
-            gpt_tokens_total += gpt_tokens
+            if args.permutations > 1:
+                if "hofstede" in args.data_dir:
+                    n_options = 5
+                elif "pvq" in args.data_dir:
+                    n_options = 6
+                elif "big5" in args.data_dir:
+                    n_options = 5
+                else:
+                    raise NotImplementedError(f"Permutations not implemented for data_dir {args.data_dir}.")
 
-            subj_acc[subject] = acc
-            subj_len[subject] = len(test_df)
-            answers[subject] = preds
+                import itertools
+                all_permutations = list(itertools.permutations(range(n_options)))
 
-            def map_choice_to_number(letter):
-                # A-F -> 1-6
-                # find index of letter in choices and add 1
-                number = choices.index(letter) + 1
+                original_state = random.getstate()  # save the original state
+                random.seed(1)
+                permutations = random.sample(all_permutations, args.permutations)
+                random.setstate(original_state)
+                print("permutations_hash:", np.array(permutations)[:, :3].sum())
 
-                if letter in "ABCDEF":
-                    assert number == ord(letter) - ord('A') + 1
-
-                return number
-
-            if "hofstede" in args.data_dir:
-                assert "hofstede" in args.experiment_name
-
-                preds_values = np.vectorize(map_choice_to_number)(preds)
-
-                # from the manual (question indices start from 1)
-                # power_distance = 35(m07 – m02) + 25(m20 – m23) + C(pd)
-                # individualism = 35(m04 – m01) + 35(m09 – m06) + C(ic)
-                # masculinity = 35(m05 – m03) + 35(m08 – m10) + C(mf)
-                # uncertainty_avoidance = 40(m18 - m15) + 25(m21 – m24) + C(ua)
-                # long_term_orientation = 40(m13 – m14) + 25(m19 – m22) + C(ls)
-                # indulgence = 35(m12 – m11) + 40(m17 – m16) + C(ir)
-
-                # indices start from 0
-                metrics[subject] = {
-                    "power_distance": 35*(preds_values[6] - preds_values[1]) + 25*(preds_values[19] - preds_values[22]),
-                    "individualism": 35*(preds_values[3] - preds_values[0]) + 35*(preds_values[8] - preds_values[5]),
-                    "masculinity": 35*(preds_values[4] - preds_values[2]) + 35*(preds_values[7] - preds_values[9]),
-                    "uncertainty_avoidance": 40*(preds_values[17] - preds_values[14]) + 25*(preds_values[20] - preds_values[23]),
-                    "long_term_orientation": 40*(preds_values[12] - preds_values[13]) + 25*(preds_values[18] - preds_values[21]),
-                    "indulgence": 35*(preds_values[11] - preds_values[10]) + 40*(preds_values[16] - preds_values[15])
-                }
-                metrics[subject] = {k: float(v) for k, v in metrics[subject].items()}
+                permutations_dicts = [
+                    dict(zip(choices, perm)) for perm in permutations
+                ]
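+                # e.g. a sampled dict {"A": 3, "B": 0, "C": 4, "D": 1, "E": 2} (illustrative) means the option
+                # displayed as "A" is the 4th original option; scoring maps it back via map_choice_to_number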
 
-            elif "pvq" in args.data_dir:
-                assert "pvq" in args.experiment_name
-
-                # pvq is evaluated by averaging scored based on different values
-                preds_values = np.vectorize(map_choice_to_number)(preds)
-
-                profile_values_idx_json = os.path.join(os.path.join(args.data_dir, "raw"), "values.json")
-                with open(profile_values_idx_json) as f:
-                    profile_values_idx = json.load(f)
-                profile_values_idx = {k: np.array(v)-1 for k, v in profile_values_idx.items() if k != "_comment"}
-
-                metrics[subject] = {}
-
-                # mean_values = preds_values.mean()
-
-                for profile_value, idxs in profile_values_idx.items():
-                    # metrics[subject][profile_value] = preds_values[idxs].mean() - mean_values
-                    metrics[subject][profile_value] = preds_values[idxs].mean()
-
-            elif "political_compass" in args.data_dir:
-                # political compas is evaluated using the website
-                resp_df = test_df.copy()
-                resp_df[5] = preds
-                preds_csv_file = f"./results/political_compass/preds_{args.experiment_name}_{engine}_{timestamp}.csv"
-                resp_df.to_csv(preds_csv_file, header=None, index=False)
-                print(f"preds saved to '{preds_csv_file}")
-
-                evaluate_csv_file(preds_csv_file)
+                # permutations_dict = {choices[len(choices)-1-i]: i for i, c in enumerate(choices)}  # reverse
 
             else:
-                metrics[subject] = {
-                    "accuracy": subj_acc[subject]
-                }
-
-            test_df["{}_correct".format(engine)] = cors
-            for j in range(probs.shape[1]):
-                choice = choices[j]
-                test_df["{}_choice{}_probs".format(engine, choice)] = probs[:, j]
-
-            if args.log:
-                test_df.to_csv(os.path.join(args.save_dir, "results_{}".format(engine), "{}.csv".format(subject)), index=None)
+                # in order
+                permutations_dicts = [{choices[i]: i for i, c in enumerate(choices)}]
+
+            for perm_i, permutations_dict in enumerate(permutations_dicts):
+                subj_acc.append({})
+                subj_len.append({})
+                metrics.append({})
+                answers.append({})
+
+                cors, acc, probs, preds, gpt_tokens = eval(
+                    args=args,
+                    subject=subject,
+                    engine=engine,
+                    dev_df=dev_df,
+                    test_df=test_df,
+                    permutations_dict=permutations_dict,
+                    llm_generator=llm_generator
+                )
+                all_cors.append(cors)
+                gpt_tokens_total += gpt_tokens
+
+                subj_acc[perm_i][subject] = acc
+                subj_len[perm_i][subject] = len(test_df)
+                preds_values = np.vectorize(map_choice_to_number)(preds, permutations_dict)
+                answers[perm_i][subject] = list(zip(preds, map(int, preds_values)))
+
+                if "hofstede" in args.data_dir:
+                    assert "hofstede" in args.experiment_name
+
+                    # preds_values_ = np.vectorize(map_choice_to_number)(preds, permutations_dict)
+                    # assert all(preds_values_ == preds_values)
+
+                    # from the manual (question indices start from 1)
+                    # power_distance = 35(m07 - m02) + 25(m20 - m23) + C(pd)
+                    # individualism = 35(m04 - m01) + 35(m09 - m06) + C(ic)
+                    # masculinity = 35(m05 - m03) + 35(m08 - m10) + C(mf)
+                    # uncertainty_avoidance = 40(m18 - m15) + 25(m21 - m24) + C(ua)
+                    # long_term_orientation = 40(m13 - m14) + 25(m19 - m22) + C(ls)
+                    # indulgence = 35(m12 - m11) + 40(m17 - m16) + C(ir)
+
+                    # indices start from 0
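+                    # note: the additive C(..) constants from the manual are omitted below;
+                    # they would only shift each dimension by a constant offset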
+                    metrics[perm_i][subject] = {
+                        "Power Distance": 35*(preds_values[6] - preds_values[1]) + 25*(preds_values[19] - preds_values[22]),
+                        "Individualism": 35*(preds_values[3] - preds_values[0]) + 35*(preds_values[8] - preds_values[5]),
+                        "Masculinity": 35*(preds_values[4] - preds_values[2]) + 35*(preds_values[7] - preds_values[9]),
+                        "Uncertainty Avoidance": 40*(preds_values[17] - preds_values[14]) + 25*(preds_values[20] - preds_values[23]),
+                        "Long-Term Orientation": 40*(preds_values[12] - preds_values[13]) + 25*(preds_values[18] - preds_values[21]),
+                        "Indulgence": 35*(preds_values[11] - preds_values[10]) + 40*(preds_values[16] - preds_values[15])
+                    }
+                    metrics[perm_i][subject] = {k: float(v) for k, v in metrics[perm_i][subject].items()}
+
+                elif "big5" in args.data_dir:
+
+                    # items are given in the following order
+                    # positive items for Neuroticism, neg items for Neuroticism,
+                    # positive items for Extraversion, neg items for Extraversion,
+                    # ...
+                    # positive items for Conscientiousness, neg items for Conscientiousness
+
+                    if "data_big5_50" == args.data_dir:
+                        items_per_chunk = 5
+                    elif "data_big5_100" == args.data_dir:
+                        items_per_chunk = 10
+                    else:
+                        raise ValueError(f"data_dir {args.data_dir} not supported.")
+
+                    # separate answers into chunks
+                    chunks = [preds_values[st:st + items_per_chunk] for st in range(0, len(preds_values), items_per_chunk)]
+
+                    # pos item score - A = 1, F = 5
+                    # neg item score - A = 5, F = 1
+                    # reverse-score negative items - neg_i = 6 - neg_i
+                    # i.e. total: sum(pos_its) + 6*items_per_chunk - sum(neg_its)
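+                    # e.g., with items scored 1-5, each trait score falls in [10, 50] for
+                    # data_big5_50 (items_per_chunk = 5) and in [20, 100] for data_big5_100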
+                    metrics[perm_i][subject] = {
+                        "Neuroticism": chunks[0].sum() + 6*items_per_chunk - chunks[1].sum(),
+                        "Extraversion": chunks[2].sum() + 6*items_per_chunk - chunks[3].sum(),
+                        "Openness to Experience": chunks[4].sum() + 6*items_per_chunk - chunks[5].sum(),
+                        "Agreeableness": chunks[6].sum() + 6*items_per_chunk - chunks[7].sum(),
+                        "Conscientiousness": chunks[8].sum() + 6*items_per_chunk - chunks[9].sum()
+                    }
+                    metrics[perm_i][subject] = {k: float(v) for k, v in metrics[perm_i][subject].items()}
+
+                elif "pvq" in args.data_dir:
+                    assert "pvq" in args.experiment_name
+
+                    # pvq is evaluated by averaging scores based on different values
+                    # preds_values_ = np.vectorize(map_choice_to_number)(preds, permutations_dict)
+                    # assert all(preds_values_ == preds_values)
+
+                    profile_values_idx_json = os.path.join(os.path.join(args.data_dir, "raw"), "values.json")
+                    with open(profile_values_idx_json) as f:
+                        profile_values_idx = json.load(f)
+                    profile_values_idx = {k: np.array(v)-1 for k, v in profile_values_idx.items() if k != "_comment"}
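+                    # values.json maps each value name to 1-based item indices (hence the -1 shift above)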
+
+                    metrics[perm_i][subject] = {}
+
+                    # mean_values = preds_values.mean()
+
+                    for profile_value, idxs in profile_values_idx.items():
+                        # metrics[subject][profile_value] = preds_values[idxs].mean() - mean_values
+                        metrics[perm_i][subject][profile_value] = preds_values[idxs].mean()
+
+                elif "political_compass" in args.data_dir:
+                    # political compass is evaluated using the website
+                    resp_df = test_df.copy()
+                    resp_df[5] = preds
+                    preds_csv_file = f"./results/political_compass/preds_{args.experiment_name}_{engine}_perm_{perm_i}_{timestamp}.csv"
+                    resp_df.to_csv(preds_csv_file, header=None, index=False)
+                    print(f"preds saved to '{preds_csv_file}'")
+
+                    evaluate_csv_file(preds_csv_file)
+
+                else:
+                    metrics[perm_i][subject] = {
+                        "accuracy": subj_acc[perm_i][subject]
+                    }
+
+                res_test_df = test_df.copy()
+                res_test_df["{}_correct".format(engine)] = cors
+                for j in range(probs.shape[1]):
+                    choice = choices[j]
+                    res_test_df["{}_choice{}_probs".format(engine, choice)] = probs[:, j]
+
+                if args.log:
+                    res_test_df.to_csv(os.path.join(args.save_dir, "results_{}".format(engine), "{}.csv".format(subject)), index=None)
+
+        # aggregate scores over permutations
+
+        # aggregate accuracies to means
+        mean_subj_acc = defaultdict(list)
+        for subj_acc_perm in subj_acc:
+            for k, v in subj_acc_perm.items():
+                mean_subj_acc[k].append(v)
+        mean_subj_acc = {k: np.mean(v) for k, v in mean_subj_acc.items()}
+
+        # assert all permutations have the same lengths and take the first
+        assert all(subj_len[0] == s for s in subj_len)
+        subj_len = subj_len[0]
+
+        # remap from a list of per-permutation metric dicts to a dict of lists
+        mean_metrics = defaultdict(lambda: defaultdict(list))
+        for metrics_perm in metrics:
+            for subj, subj_metrics in metrics_perm.items():
+                for metric, value in subj_metrics.items():
+                    mean_metrics[subj][metric].append(value)
+
+        # average metrics
+        mean_metrics = {
+            subj: {
+                metric: np.mean(values) for metric, values in subj_metrics.items()
+            } for subj, subj_metrics in mean_metrics.items()
+        }
 
         weighted_acc = np.mean(np.concatenate(all_cors))
 
-        for subj, m in metrics.items():
+        # save results
+        for subj, m in mean_metrics.items():
             if m:
                 print("Subject: ", subj)
                 for metric, score in m.items():
@@ -872,11 +1323,12 @@ def main(args):
         with open(json_dump_path, 'w') as fp:
             json.dump(
             {
-                **subj_acc,
+                **mean_subj_acc,
                 **{
                     "average": weighted_acc
                 },
-                "metrics": metrics,
+                "metrics": mean_metrics,
+                "per_permutation_metrics": metrics,
                 "answers": answers,
                 **{
                     "params": vars(args)
@@ -888,7 +1340,7 @@ def main(args):
         print("")
         print("Average accuracy per subject.")
         for subject in subjects:
-            print("{} accuracy ({}): {:.3f}".format(subject, subj_len[subject], subj_acc[subject]))
+            print("{} accuracy ({}): {:.3f}".format(subject, subj_len[subject], mean_subj_acc[subject]))
 
         print("Average accuracy: {:.3f}".format(weighted_acc))
 
@@ -903,6 +1355,7 @@ def main(args):
 
 
 if __name__ == "__main__":
+
     parser = argparse.ArgumentParser()
     parser.add_argument("--ntrain", "-k", type=int, default=5)
     parser.add_argument("--data_dir", "-d", type=str, default="data")
@@ -914,6 +1367,8 @@ if __name__ == "__main__":
         "gpt-3.5-turbo-0301",
         "gpt-4-0314",
         "llama_7B", "llama_13B", "llama_30B", "llama_65B",
+        "stablelm", "stablevicuna",
+        "openassistant_rlhf2_llama30b"
     ], default=["davinci", "curie", "babbage", "ada"], nargs="+")
     parser.add_argument('--profile', type=str, help='Profile definition in format "k:v;k:v;k:v", ex. "age:35;interests:reading books"')
     parser.add_argument("--generative_qa", "-gqa", action="store_true", help="Use generative question answering instead of MCQ.")
@@ -923,9 +1378,23 @@ if __name__ == "__main__":
     parser.add_argument("--cold-run", "-cr", action="store_true")
     parser.add_argument("--estimate-gpt-tokens", "-t", action="store_true")
     parser.add_argument("--match-tokens-with-space", action="store_true")
-    parser.add_argument("--eval-set", type=str, default="test", choices=["test","val"])
-
+    parser.add_argument("--eval-set", type=str, default="test", choices=["test", "val"])
+    parser.add_argument("--natural-language-profile", "-nlp", action="store_true", help="If true a profile will be defined in natural language as opposed to key value pairs.")
+    parser.add_argument("--natural-language-profile-detail", type=str, default=None, choices=["no", "high"])
+    parser.add_argument("--perspective-amount", type=str, default="medium", choices=["extreme", "medium", "slight"])
+    parser.add_argument("--lotr-character", type=str, default=None, choices=[
+        "Gandalf",
+        "Frodo",
+        "Sauron",
+        "Aragorn",
+        "Pippin",
+    ])
+    parser.add_argument("--music-expert-genre", type=str, default=None)  # todo: add choices
+    parser.add_argument("--hobby", type=str, default=None)  # todo: add choices
     parser.add_argument("--log", "-l", type=bool, default=False)  # doesn't work well for multiproc (bigger llama models) # remove this parameter?
+    parser.add_argument("--permutations", "-p", type=int, default=1)
+    parser.add_argument("--separator", action="store_true")
+    parser.add_argument("--add-high-level-categories", action="store_true")
     args = parser.parse_args()
 
     profile = {}
@@ -935,14 +1404,43 @@ if __name__ == "__main__":
             profile[key] = value
 
         args.profile_dict = profile
+
+        print(f"Profile:\n{profile}")
+
     else:
         args.profile_dict = None
 
-    print(f"Profile:\n{profile}")
+    if args.lotr_character is not None:
+        print("LotR character: ", args.lotr_character)
+
+    if args.estimate_gpt_tokens:
+        if args.engine[0] not in ["gpt-4-0314", "gpt-3.5-turbo-0301", "dummy"]:
+            raise ValueError("Only gpt-4, gpt-3.5, and dummy engines support estimating GPT tokens")
+
     if args.cold_run:
+        print("2nd person:", args.direct_perspective)
+        print("System message:", args.system_message)
         # just used to show the profile to be used
         exit()
 
+    if not args.separator:
+        raise ValueError("No separator set; use --separator.")
+
+    if ("gpt-3.5" in args.engine and args.permutations > 50) or ("gpt-4" in args.engine and args.permutations > 5):
+        raise ValueError(f"Are you sure you want to use {args.permutations} with {args.engine}??")
+
+    # Asserts for NeurIPS
+    if args.profile:
+        assert args.natural_language_profile
+        assert args.natural_language_profile_detail == "no"
+
+    assert args.separator
+    assert args.ntrain == 0
+
+    assert sum(map(bool, [args.profile, args.lotr_character, args.music_expert_genre, args.hobby])) == 1
+
+    if "pvq" in args.data_dir and args.profile:
+        assert args.add_high_level_categories
 
     main(args)
 
diff --git a/price_estimate.py b/price_estimate.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9f7c7d5e19930d335997e2dcfae3e6d05bfb57d
--- /dev/null
+++ b/price_estimate.py
@@ -0,0 +1,80 @@
+# PVQ (lotr) - one eval:
+# total GPT tokens used: 5200
+#         gpt-4 ~ 0.2080 dollars
+#         gpt-3.5 ~ 0.0104 dollars
+#         davinci ~ 0.1040 dollars
+#         curie ~ 0.0104 dollars
+#         babbage ~ 0.0026 dollars
+#         ada ~ 0.0021 dollars
+
+# PVQ (high detail) - one eval:
+# total GPT tokens used: 7840
+#         gpt-4 ~ 0.3136 dollars
+#         gpt-3.5 ~ 0.0157 dollars
+#         davinci ~ 0.1568 dollars
+#         curie ~ 0.0157 dollars
+#         babbage ~ 0.0039 dollars
+#         ada ~ 0.0031 dollars
+
+
+# one eval
+
+# n_tokens_per_persp = 5200
+# gpt_4 = 0.208
+# gpt_35 = 0.01
+# davinci = 0.1
+
+# n_tokens_per_persp = 7840
+# gpt_4 = 0.3136
+# gpt_35 = 0.0157
+# davinci = 0.1568
+
+# n_tokens_per_persp = 5560
+# gpt_4 = 0.2224
+# gpt_35 = 0.0111
+# davinci = 0.1112
+
+# price per token
+gpt_4 = 0.03/1000
+gpt_35 = 0.002/1000
+davinci = 0.02/1000
+
+
+# 1. PVQ: lotr 5 + prim 0  -> 5 ( prim is in 3.)
+# exp1 = 0
+exp1 = 5 * 4640
+
+# 2. PVQ:music 6 +  hobbies: 5 -> 11
+# exp2 = 0
+exp2 = 6 * 4600 + 5 * 4520
+
+# 3. message person: (PVQ: 4 HOF: 6 B5: 5) x 4 settings = 15*4 -> 60
+pvq_3 = 4*4*5040*0
+hof_3 = 4*6*2200*0
+big5_3 = 4*5*3500*0 # 50 items
+big5_100_3 = 4*5*7083*0  # 100 items
+
+exp3 = pvq_3 + hof_3 + big5_3 + big5_100_3
+
+# 4. smooth: (PVQ: 4 HOF: 6 B5: 5) x 2 settings = 15*2 -> 30 (one is covered in 3.)
+# pvq = 2* 4 * 5320  # 2nd system
+pvq_4 = 2*4*5760*0
+hof_4 = 2*6*2200*0
+big5_4 = 2*5*3500*0  # 50 items
+big5_100_4 = 2*5*7083*0  # 100 items
+exp4 = pvq_4 + hof_4 + big5_4 + big5_100_4
+
+
+n_permutations = 5
+print("n_permutations:", n_permutations)
+
+total_persp_x_tokens = sum([exp1, exp2, exp3, exp4])
+
+total_persp_x_tokens = exp3
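+# note: the assignment above keeps only experiment 3; terms multiplied by 0 further up are toggled off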
+
+total_tokens = n_permutations * total_persp_x_tokens
+
+price_gpt4 = gpt_4 * total_tokens
+price_gpt35 = gpt_35 * total_tokens
+price_davinci = davinci * total_tokens
+print(f"Total price:\n\tGPT4: {price_gpt4}\n\tGPT35: {price_gpt35}\n\tDavinci: {price_davinci}")
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 7f51ca3895f59b5dd1c2529eb3cedda4feb50af0..6dba8723d421bdd33fe9db70e37257076990c39c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -85,4 +85,5 @@ urllib3==1.26.15
 wcwidth==0.2.6
 wsproto==1.2.0
 yarl==1.8.2
-zipp==3.15.0
\ No newline at end of file
+zipp==3.15.0
+git+https://github.com/huggingface/transformers@c612628045822f909020f7eb6784c79700813eda
diff --git a/run_dummy.sh b/run_dummy.sh
new file mode 100644
index 0000000000000000000000000000000000000000..eb2b9ba32d0bae2239336ec1d596d988ba513ec5
--- /dev/null
+++ b/run_dummy.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+#values_list=(
+#  "Power Distance"
+#  "Masculinity"
+#  "Uncertainty Avoidance"
+#  "Long-Term Orientation"
+#  "Indulgence"
+#  "Individualism"
+#)
+
+values_list=(
+  "Neuroticism"
+  "Extraversion"
+  "Openness to Experience"
+  "Agreeableness"
+  "Conscientiousness"
+)
+
+for val in "${values_list[@]}"; do
+
+profile="Primary values:$val"
+
+#openassistant_rlhf2_llama30b
+
+#torchrun --nproc_per_node 4 \
+#--engine llama_30B \
+
+python evaluate.py \
+--permutations 1 \
+--ntrain 0 \
+--data_dir data_big5_100 \
+--save_dir results/results_big5_100_test_gpt4 \
+--engine gpt-4-0314 \
+--experiment_name big5_test \
+--perspective-amount "extreme" \
+--profile "$profile" \
+--separator \
+--natural-language-profile \
+--natural-language-profile-detail "no" \
+--estimate-gpt-tokens \
+--direct-perspective \
+--system-message \
+--verbose
+
+#--add-high-level-categories \
+
+done
diff --git a/run_neurips_big5.sh b/run_neurips_big5.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f2b045ac4c7a86ef054c8a4dde7c6412baff5616
--- /dev/null
+++ b/run_neurips_big5.sh
@@ -0,0 +1,156 @@
+#!/bin/bash
+
+####################################
+### 3. System/User 2nd/3rd person
+####################################
+
+#### Big 5
+########################
+
+##### Sys: 2nd 3rd | Usr: 2nd 3rd
+# GPT-4 (5)
+# GPT-3.5 (5)
+# GPT-3.5 (50)
+# OA (50) & 3.098 & 2.429 & 2.859 & 4.927
+# StableVicuna (50) & n/a & n/a & 2.15 & 3.35
+# StableLM (50) & 0.0 & 0.003 & 0.205 & -0.041
+
+## System message , 2nd person
+#big5_values_list=(
+#  "Neuroticism"
+#  "Extraversion"
+#  "Openness to Experience"
+#  "Agreeableness"
+#  "Conscientiousness"
+#)
+#
+#message_options=(
+##  "System"
+#  "User"
+#)
+#
+#person_options=(
+##  "2nd"
+#  "3rd"
+#)
+#
+##ENGINE="gpt-4-0314"
+##ENGINE="gpt-3.5-turbo-0301"
+#ENGINE="openassistant_rlhf2_llama30b"
+##ENGINE="stablevicuna"
+##ENGINE="stablelm"
+#
+#PERMUTATIONS=50
+#
+#for message in "${message_options[@]}"; do
+#for person in "${person_options[@]}"; do
+#for vals in "${big5_values_list[@]}"; do
+#
+#SAVE_DIR="results_neurips/results_nat_lang_prof_big5_test_"$ENGINE"_perm_"$PERMUTATIONS"_"$message"_msg_"$person"_prs"
+#mkdir -p $SAVE_DIR
+#
+#python -u evaluate.py \
+#--permutations $PERMUTATIONS \
+#--save_dir $SAVE_DIR \
+#--engine "$ENGINE" \
+#--data_dir data_big5_50 \
+#--experiment_name big5_test \
+#--separator \
+#--ntrain 0 \
+#$(if [ "$message" == "System" ]; then echo "--system-message"; fi) \
+#$(if [ "$person" == "2nd" ]; then echo "--direct-perspective"; fi) \
+#--profile "Primary values:$vals" \
+#--natural-language-profile \
+#--natural-language-profile-detail "no" \
+#--perspective-amount "extreme" \
+#--verbose  2>&1 | tee -a $SAVE_DIR/log.txt
+#
+#done
+#done
+#done
+
+
+#####################
+## 4. Smoothness
+#####################
+
+#### Big5
+#############
+
+# Best
+# GPT-4 (5)
+# GPT-3.5 (10) S2
+# GPT-3.5 (50)
+# OA (50) U2
+# StableVicuna (50) U2
+# StableLM (50) U2
+
+perspective_intensity_list=(
+#  "slight"
+  "medium"
+#  "extreme"
+)
+
+# User message, 3rd person (for GPT-3.5)
+big5_values_list=(
+  "Neuroticism"
+  "Extraversion"
+  "Openness to Experience"
+  "Agreeableness"
+  "Conscientiousness"
+)
+
+
+
+#ENGINE="gpt-4-0314"
+#ENGINE="gpt-3.5-turbo-0301"
+ENGINE="openassistant_rlhf2_llama30b"
+#ENGINE="stablevicuna"
+#ENGINE="stablelm"
+#ENGINE="dummy"
+
+message="User"
+person="3rd"
+
+# Results
+# Slight Medium
+#GPT4
+#GPT35
+#OA & 3.223
+#StableVicuna & 1.695 & 3.179 &
+#StableLM & -0.06 & 0.003 &
+#
+
+PERMUTATIONS=50
+
+echo "$ENGINE with $PERMUTATIONS permutations"
+
+for intensity in "${perspective_intensity_list[@]}"; do
+
+echo "Intensity: $intensity"
+
+for vals in "${big5_values_list[@]}"; do
+
+SAVE_DIR="results_neurips/results_nat_lang_prof_big5_test_"$ENGINE"_perm_"$PERMUTATIONS"_"$message"_msg_"$person"_prs_intensity_"$intensity""
+mkdir -p $SAVE_DIR
+
+echo "Save dir $SAVE_DIR"
+
+python evaluate.py \
+--permutations $PERMUTATIONS \
+--save_dir $SAVE_DIR \
+--engine "$ENGINE" \
+--data_dir data_big5_50 \
+--experiment_name big5_test \
+--separator \
+--ntrain 0 \
+--profile "Primary values:$vals" \
+--natural-language-profile \
+--natural-language-profile-detail "no" \
+$(if [ "$message" == "System" ]; then echo "--system-message"; fi) \
+$(if [ "$person" == "2nd" ]; then echo "--direct-perspective"; fi) \
+--perspective-amount "$intensity" \
+--verbose  2>&1 | tee -a $SAVE_DIR/log.txt
+
+done
+done
diff --git a/run_neurips_hof.sh b/run_neurips_hof.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0b7307ba59ccecc170730e35cf04b4ccbc71c785
--- /dev/null
+++ b/run_neurips_hof.sh
@@ -0,0 +1,161 @@
+#!/bin/bash
+
+####################################
+### 3. System/User 2nd/3rd person
+####################################
+
+#### HOFSTEDE
+########################
+
+##### Sys: 2nd 3rd | Usr: 2nd 3rd
+# GPT-4 (5)
+# GPT-3.5 (5) 69.033 & 102.533 & 128.5 & 122.733
+# GPT-3.5 (50) 80.59 & 99.623 & 124.99 & 110.713
+# OA (50)  4.63 & 13.79 & 20.57 & 24.083
+# StableVicuna (50)  & n/a & n/a & -1.76 & 3.01
+# StableLM (50) & -2.367 & 2.743 & 1.943 & 2.287
+
+# System message , 2nd person
+hof_values_list=(
+  "Power Distance"
+  "Masculinity"
+  "Uncertainty Avoidance"
+  "Long-Term Orientation"
+  "Indulgence"
+  "Individualism"
+)
+
+message_options=(
+  "System"
+  "User"
+)
+
+person_options=(
+  "2nd"
+  "3rd"
+)
+
+ENGINE="gpt-4-0314"
+#ENGINE="gpt-3.5-turbo-0301"
+#ENGINE="openassistant_rlhf2_llama30b"
+#ENGINE="stablevicuna"
+#ENGINE="stablelm"
+#ENGINE="dummy"
+
+PERMUTATIONS=5
+
+for message in "${message_options[@]}"; do
+for person in "${person_options[@]}"; do
+for vals in "${hof_values_list[@]}"; do
+
+SAVE_DIR="results_neurips/results_nat_lang_prof_hofstede_test_"$ENGINE"_perm_"$PERMUTATIONS"_"$message"_msg_"$person"_prs"
+mkdir -p $SAVE_DIR
+
+python -u evaluate.py \
+--permutations $PERMUTATIONS \
+--save_dir $SAVE_DIR \
+--engine "$ENGINE" \
+--data_dir data_hofstede \
+--experiment_name hofstede_test \
+--separator \
+--ntrain 0 \
+$(if [ "$message" == "System" ]; then echo "--system-message"; fi) \
+$(if [ "$person" == "2nd" ]; then echo "--direct-perspective"; fi) \
+--profile "Primary values:$vals" \
+--natural-language-profile \
+--natural-language-profile-detail "no" \
+--perspective-amount "extreme" \
+--verbose  2>&1 | tee -a $SAVE_DIR/log.txt
+
+done
+done
+done
+
+
+#####################
+## 4. Smoothness
+#####################
+
+
+
+#### HOFSTEDE
+#############
+
+# Best
+# GPT-4 (5)
+# GPT-3.5 (10) S2
+# GPT-3.5 (50)
+# OA (50) U2
+# StableVicuna (50) U2
+# StableLM (50) U2
+
+#perspective_intensity_list=(
+#  "slight"
+#  "medium"
+##  "extreme"
+#)
+#
+## User message, 3rd person (for GPT-3.5)
+#hof_values_list=(
+#  "Power Distance"
+#  "Masculinity"
+#  "Uncertainty Avoidance"
+#  "Long-Term Orientation"
+#  "Indulgence"
+#  "Individualism"
+#)
+#
+#
+#
+##ENGINE="gpt-4-0314"
+##ENGINE="gpt-3.5-turbo-0301"
+##ENGINE="openassistant_rlhf2_llama30b"
+##ENGINE="stablevicuna"
+##ENGINE="stablelm"
+#ENGINE="dummy"
+#
+#message="System"
+#person="3rd"
+#
+## Results
+## Slight Medium
+##GPT4
+##GPT35
+##OA
+##StableVicuna & 8.08 & 4.94
+##StableLM & 0.723 & -0.933
+##
+#
+#PERMUTATIONS=50
+#
+#echo "$ENGINE with $PERMUTATIONS permutations"
+#
+#for intensity in "${perspective_intensity_list[@]}"; do
+#
+#echo "Intensity: $intensity"
+#
+#for vals in "${hof_values_list[@]}"; do
+#
+#SAVE_DIR="results_neurips/results_nat_lang_prof_hofstede_test_"$ENGINE"_perm_"$PERMUTATIONS"_"$message"_msg_"$person"_prs_intensity_"$intensity""
+#mkdir -p $SAVE_DIR
+#
+#echo "Save dir $SAVE_DIR"
+#
+#python evaluate.py \
+#--permutations $PERMUTATIONS \
+#--save_dir $SAVE_DIR \
+#--engine "$ENGINE" \
+#--data_dir data_hofstede \
+#--experiment_name hofstede_test \
+#--separator \
+#--ntrain 0 \
+#--profile "Primary values:$vals" \
+#--natural-language-profile \
+#--natural-language-profile-detail "no" \
+#$(if [ "$message" == "System" ]; then echo "--system-message"; fi) \
+#$(if [ "$person" == "2nd" ]; then echo "--direct-perspective"; fi) \
+#--perspective-amount "$intensity" \
+#--verbose  2>&1 | tee -a $SAVE_DIR/log.txt
+#
+#done
+#done
diff --git a/run_neurips_pvq.sh b/run_neurips_pvq.sh
new file mode 100644
index 0000000000000000000000000000000000000000..5546c07fb37988e58c792838e953d70946334009
--- /dev/null
+++ b/run_neurips_pvq.sh
@@ -0,0 +1,278 @@
+#!/bin/bash
+
+
+###########################################
+# 1. Implied values (directly / indirectly)
+###########################################
+
+# LoTR characters
+##################
+
+# 2nd person (direct perspective); System message
+
+#ENGINE="gpt-4-0314"
+##ENGINE="openassistant_rlhf2_llama30b"
+##ENGINE="stablevicuna"
+##ENGINE="stablelm"
+##ENGINE="dummy"
+##ENGINE="gpt-3.5-turbo-0301"
+#
+#lotr_characters=(
+#  "Gandalf"
+#  "Frodo"
+#  "Sauron"
+#  "Aragorn"
+#  "Pippin"
+#)
+#
+#PERMUTATIONS=1
+#message="System"
+#person="2nd"
+#
+#for character in "${lotr_characters[@]}"; do
+#
+#SAVE_DIR="results_neurips/results_lotr_pvq_test_"$ENGINE"_perm_"$PERMUTATIONS"_"$message"_msg_"$person"_prs"
+#mkdir -p $SAVE_DIR
+#
+#python -u evaluate.py \
+#--permutations $PERMUTATIONS \
+#--save_dir $SAVE_DIR \
+#--engine "$ENGINE" \
+#--data_dir data_pvq \
+#--experiment_name pvq_test \
+#--separator \
+#--ntrain 0 \
+#$(if [ "$message" == "System" ]; then echo "--system-message"; fi) \
+#$(if [ "$person" == "2nd" ]; then echo "--direct-perspective"; fi) \
+#--estimate-gpt-tokens \
+#--add-high-level-categories \
+#--lotr-character "$character" \
+#--verbose  2>&1 | tee -a $SAVE_DIR/log.txt
+#
+#done
+
+#############################
+# 2. Non-implied values
+#############################
+
+### Music AI Experts
+####################
+#
+#music_genre_list=(
+#  "hip-hop"
+#  "jazz"
+#  "classical"
+#  "heavy metal"
+#  "reggae"
+#  "gospel"
+#)
+#
+#message="System"
+#person="2nd"
+#PERMUTATIONS=1
+#ENGINE="gpt-4-0314"
+#
+#for music_genre in "${music_genre_list[@]}"; do
+#
+#SAVE_DIR="results_neurips/results_AI_music_expert_pvq_test_"$ENGINE"_perm_"$PERMUTATIONS"_"$message"_msg_"$person"_prs"
+#
+#python evaluate.py \
+#--permutations $PERMUTATIONS \
+#--save_dir $SAVE_DIR \
+#--engine "$ENGINE" \
+#--data_dir data_pvq \
+#--experiment_name pvq_test \
+#--separator \
+#--ntrain 0 \
+#--verbose \
+#--estimate-gpt-tokens \
+#$(if [ "$message" == "System" ]; then echo "--system-message"; fi) \
+#$(if [ "$person" == "2nd" ]; then echo "--direct-perspective"; fi) \
+#--music-expert-genre  "$music_genre"
+#
+#done
+
+# Hobbies
+#################
+hobbies_list=(
+  "singing in a church choir"
+  "gambling"
+  "playing chess"
+  "volunteering at a homeless shelter"
+  "car racing"
+)
+
+message="System"
+person="2nd"
+PERMUTATIONS=1
+ENGINE="gpt-4-0314"
+
+for hobby in "${hobbies_list[@]}"; do
+
+SAVE_DIR="results_neurips/results_AI_music_expert_pvq_test_"$ENGINE"_perm_"$PERMUTATIONS"_"$message"_msg_"$person"_prs"
+
+python evaluate.py \
+--permutations $PERMUTATIONS \
+--save_dir $SAVE_DIR \
+--engine "$ENGINE" \
+--data_dir data_pvq \
+--experiment_name pvq_test \
+--separator \
+--ntrain 0 \
+--verbose \
+--estimate-gpt-tokens \
+$(if [ "$message" == "System" ]; then echo "--system-message"; fi) \
+$(if [ "$person" == "2nd" ]; then echo "--direct-perspective"; fi) \
+--hobby  "$hobby"
+
+done
+
+####################################
+### 3. System/User 2nd/3rd person
+####################################
+
+#### PVQ
+########################
+
+#### Sys: 2nd 3rd | Usr: 2nd 3rd
+# GPT-4 (5)  & 2.309 & 2.439 & 2.095 & 2.226 -> not fair
+# GPT-3.5 (10) & 3.162 & 2.715 & 3.122 & 2.729 -> fair
+# GPT-3.5 (50) & 3.403 & 2.803 & 3.202 & 2.82
+# OA (50) & 0.619 & 0.698 & 0.979 & 0.647
+# StableVicuna (50)  & n/a & n/a & 0.328 & 0.168
+# StableLM (50) & -0.029 & -0.009 & 0.029 & -0.001
+
+## System message , 2nd person
+#pvq_values_list=(
+#  "Hedonism,Stimulation,Self-Direction"
+#  "Universalism,Benevolence"
+#  "Conformity,Tradition,Security"
+#  "Power,Achievement"
+#)
+#
+#message_options=(
+#  "System"
+#  "User"
+#)
+#
+#person_options=(
+#  "2nd"
+#  "3rd"
+#)
+#
+##ENGINE="gpt-4-0314"
+#ENGINE="gpt-3.5-turbo-0301"
+##ENGINE="openassistant_rlhf2_llama30b"
+##ENGINE="dummy"
+#
+#PERMUTATIONS=50
+#
+#for message in "${message_options[@]}"; do
+#for person in "${person_options[@]}"; do
+#for vals in "${pvq_values_list[@]}"; do
+#
+#SAVE_DIR="results_neurips/results_nat_lang_prof_pvq_test_"$ENGINE"_perm_"$PERMUTATIONS"_"$message"_msg_"$person"_prs"
+#mkdir -p $SAVE_DIR
+#
+#python -u evaluate.py \
+#--permutations $PERMUTATIONS \
+#--save_dir $SAVE_DIR \
+#--engine "$ENGINE" \
+#--data_dir data_pvq \
+#--experiment_name pvq_test \
+#--separator \
+#--ntrain 0 \
+#$(if [ "$message" == "System" ]; then echo "--system-message"; fi) \
+#$(if [ "$person" == "2nd" ]; then echo "--direct-perspective"; fi) \
+#--profile "Primary values:$vals" \
+#--natural-language-profile \
+#--natural-language-profile-detail "no" \
+#--perspective-amount "extreme" \
+#--add-high-level-categories \
+#--verbose  2>&1 | tee -a $SAVE_DIR/log.txt
+#
+#done
+#done
+#done
+
+
+#####################
+## 4. Smoothness
+#####################
+
+
+#### PVQ
+#############
+
+# Best
+# GPT-3.5 (50) S2
+# OA (50) U2
+# StableVicuna (50) U2
+# StableLM (50) U2
+
+#perspective_intensity_list=(
+#  "slight"
+#  "medium"
+##  "extreme"
+#)
+#
+## User message, 3rd person (for GPT-3.5)
+#pvq_values_list=(
+#  "Hedonism,Stimulation,Self-Direction"
+#  "Universalism,Benevolence"
+#  "Conformity,Tradition,Security"
+#  "Power,Achievement"
+#)
+#
+#
+#
+##ENGINE="gpt-4-0314"
+##ENGINE="gpt-3.5-turbo-0301"
+##ENGINE="openassistant_rlhf2_llama30b"
+##ENGINE="stablevicuna"
+##ENGINE="stablelm"
+#ENGINE="dummy"
+#
+## Results
+## Slight Medium
+##GPT35 2.458 & 3.258
+##OA 0.804 & 0.867
+##StableVicuna 0.194 & 0.328
+##StableLM 0.008 & 0.036
+#
+#PERMUTATIONS=50
+#message="System"
+#person="2nd"
+#
+#echo "$ENGINE with $PERMUTATIONS permutations"
+#
+#for intensity in "${perspective_intensity_list[@]}"; do
+#
+#echo "Intensity: $intensity"
+#
+#for vals in "${pvq_values_list[@]}"; do
+#
+#SAVE_DIR="results_neurips/results_nat_lang_prof_pvq_test_"$ENGINE"_perm_"$PERMUTATIONS"_"$message"_msg_"$person"_prs_intensity_"$intensity""
+#mkdir -p $SAVE_DIR
+#
+#echo "Save dir $SAVE_DIR"
+#
+#python evaluate.py \
+#--permutations $PERMUTATIONS \
+#--save_dir $SAVE_DIR \
+#--engine "$ENGINE" \
+#--data_dir data_pvq \
+#--experiment_name pvq_test \
+#--separator \
+#--ntrain 0 \
+#--profile "Primary values:$vals" \
+#--natural-language-profile \
+#--natural-language-profile-detail "no" \
+#--perspective-amount "$intensity" \
+#--add-high-level-categories \
+#$(if [ "$message" == "System" ]; then echo "--system-message"; fi) \
+#$(if [ "$person" == "2nd" ]; then echo "--direct-perspective"; fi) \
+#--verbose  2>&1 | tee -a $SAVE_DIR/log.txt
+#
+#done
+#done
diff --git a/visualization_scripts/bar_viz.py b/visualization_scripts/bar_viz.py
index beff108fae69a4d67a4ca031c3b11daaebeb79c4..37a67dd2c902a7c38e451b7f2d530ecbd2538a45 100644
--- a/visualization_scripts/bar_viz.py
+++ b/visualization_scripts/bar_viz.py
@@ -3,8 +3,30 @@ import json
 import matplotlib.pyplot as plt
 import re
 import numpy as np
+from termcolor import colored
 
-all_values = []
+all_values_ = []
+def extract_value(directory, key="_lotr_character_"):
+    label = os.path.basename(directory)
+    if key in label:
+        start_index = label.find(key) + len(key)
+
+    elif "_ntrain_" in label:
+        start_index = label.find("ntrain_") + len("ntrain_") + 1
+
+    else:
+        start_index = 0
+
+    if "__2023" in label:
+        end_index = label.find("__2023")
+    elif "_2023" in label:
+        end_index = label.find("_2023")
+    else:
+        end_index = len(label)
+
+    label = label[start_index:end_index]
+
+    return label
 
 def extract_profile(directory):
     label = os.path.basename(directory)
@@ -48,9 +70,7 @@ def extract_by_key(directory, key="Hobbies"):
         return 'Unknown'
 
 
-def plot_baseline(ax, directory, offset, keys_to_plot=None, subj=None, bar_width=1.0, min_bar_size=0.1, horizontal_bar=False, value_limit=250):
-    with open(os.path.join(directory, 'results.json'), 'r') as f:
-        data = json.load(f)
+def plot_baseline(data, ax, directory, offset, keys_to_plot=None, subj=None, bar_width=1.0, min_bar_size=0.1, horizontal_bar=False, value_limit=250):
 
     if subj:
         draw_metrics = data['metrics'][subj]
@@ -150,7 +170,22 @@ def plot_baseline(ax, directory, offset, keys_to_plot=None, subj=None, bar_width
     # key = "hobbies"
 
     # label = extract_by_key(directory, key=key)
-    label = extract_profile(directory)
+    if "profile" in directory:
+        label = extract_profile(directory)
+
+    elif "lotr_character" in directory:
+        label = extract_value(directory, "_lotr_character_")
+    elif "music_expert" in directory:
+        label = extract_value(directory, "_music_expert_")
+    elif "music_AI_experts" in directory:
+        label = extract_value(directory, "_music_expert_")
+    elif "hobby" in directory:
+        label = extract_value(directory, "_hobby_")
+    else:
+        label = os.path.basename(directory)
+
+    label = label.strip("_")
+
     x_values = [key_indices[key] + offset for key in keys]
 
     x_values = [v+bar_width/2 for v in x_values]
@@ -190,7 +225,7 @@ def plot_baseline(ax, directory, offset, keys_to_plot=None, subj=None, bar_width
     v_to_add = []
     x_to_add = []
 
-    all_values.append(values)
+    all_values_.append(values)
 
     for ind, v in enumerate(values):
         if abs(v) < bar_width/2:
@@ -205,8 +240,29 @@ def plot_baseline(ax, directory, offset, keys_to_plot=None, subj=None, bar_width
         ax.barh(x_values, values, label=label, height=bar_width, color=color_for_label(label))
 
     else:
-        ax.bar(x_values, values, label=label, width=bar_width, color=color_for_label(label))
-        #, facecolor=color_for_label(label), edgecolor=color_for_edge(label), linewidth=2)
+        if figure_draw:
+            labels = [label] * len(values)
+
+            bar_color_dict = {
+                'Conformity': "orange",
+                'Tradition': "orange",
+                'Benevolence': "green",
+                'Universalism': "green",
+                'Self-Direction': "blue",
+                'Stimulation': "blue",
+                'Hedonism': "blue",
+                'Achievement': "red",
+                'Power': "red",
+                'Security': "orange"
+            }
+
+            for i, (value, label, key) in enumerate(zip(values, labels, keys)):
+                print(key)
+                plt.bar(x_values[i], value, label=label, width=bar_width, color=bar_color_dict[key])
+
+        else:
+            ax.bar(x_values, values, label=label, width=bar_width, color=color_for_label(label))
+            #, facecolor=color_for_label(label), edgecolor=color_for_edge(label), linewidth=2)
 
     if args.horizontal_bar:
         assert all([-value_limit <= v <= value_limit for v in values])
@@ -217,22 +273,36 @@ def plot_baseline(ax, directory, offset, keys_to_plot=None, subj=None, bar_width
 
     else:
         # set y-axis limits
-        ax.set_ylim([0, max([1, *values])+0.1])
+        if test_set_name == "pvq_male":
+            ax.set_ylim([0, 6.1])
+
+        elif test_set_name == "hofstede":
+            ax.set_ylim([-350, 350])
+
+        elif test_set_name == "big5_50":
+            ax.set_ylim([0, 55])
+
+        elif test_set_name == "big5_100":
+            ax.set_ylim([0, 110])
+
+        else:
+            ax.set_ylim([0, max([6, *values])+0.1])
 
         ax.set_xlabel('Values', fontsize=15)
         ax.set_ylabel('Scores', fontsize=15)
 
-    if "gpt-3.5-turbo-0301" in directory:
-        ax.set_title("gpt-3.5-turbo-0301")
-
-    elif "gpt-4-0314" in directory:
-        ax.set_title("gpt-4-0314")
+    # if "gpt-3.5-turbo-0301" in directory:
+    #     ax.set_title("gpt-3.5-turbo-0301")
+    #
+    # elif "gpt-4-0314" in directory:
+    #     ax.set_title("gpt-4-0314")
 
     if not args.separate_legend:
         ax.legend(loc="best", fontsize=15)
 
     return keys
 
+figure_draw = False
 
 if __name__ == '__main__':
     import argparse
@@ -252,6 +322,9 @@ if __name__ == '__main__':
     bar_width = 0.10
     bar_margin = 1.2
 
+    if figure_draw:
+        bar_width = 0.9
+
     fig, ax = plt.subplots(figsize=(12, 6))
 
     all_bars_width = len(args.directories) * (bar_width*bar_margin)  # bars with margins
@@ -289,18 +362,129 @@ if __name__ == '__main__':
     for substring in ignore_patterns:
         directories = [d for d in directories if substring not in d]
 
+    directories = [d for d in directories if os.path.isfile(os.path.join(d, 'results.json'))]
+
+    if "pvq_test" in directories[0] or "pvq" in directories[0]:
+        test_set_name = "pvq_male"
+    elif "hofstede" in directories[0]:
+        test_set_name = "hofstede"
+    elif "big5_50" in directories[0]:
+        test_set_name = "big5_50"
+    elif "big5_100" in directories[0]:
+        test_set_name = "big5_100"
+    else:
+        test_set_name = "pvq_male"
+
+    current_max_y = 0
+
+    dir_2_data = {}
+
     for i, directory in enumerate(directories):
+        if not os.path.isdir(directory):
+            continue
+
+        results_json_path = os.path.join(directory, 'results.json')
+        if not os.path.isfile(results_json_path):
+            continue
+
+        with open(results_json_path, 'r') as f:
+            data = json.load(f)
 
         offset = -all_bars_width/2 + (i/len(args.directories))*all_bars_width
-        keys_ = plot_baseline(ax, directory, offset, keys_to_plot=keys_to_plot, bar_width=bar_width, min_bar_size=0.05, horizontal_bar=args.horizontal_bar)
+        keys_ = plot_baseline(data, ax, directory, offset, keys_to_plot=keys_to_plot, bar_width=bar_width, min_bar_size=0.05, horizontal_bar=args.horizontal_bar)
+
+        dir_2_data[directory] = data
 
         # check that keys are the same in all the baselines
         assert keys is None or keys_ == keys
         keys = keys_
 
     # variance over baselines per value
-    variances = np.array(all_values).var(axis=0)
-    # assert len(variances) != len(directories)
+    variances_ = np.stack([list(d["metrics"][test_set_name].values()) for d in dir_2_data.values()]).var(axis=0)  # todo: remove variances_
+
+    variances = np.array(all_values_).var(axis=0)
+    assert all(variances_ == variances)
+
+    if test_set_name == "pvq_male":
+        test_set_values = [
+            'Conformity',
+            'Tradition',
+            'Benevolence',
+            'Universalism',
+            'Self-Direction',
+            'Stimulation',
+            'Hedonism',
+            'Achievement',
+            'Power',
+            'Security'
+        ]
+    elif test_set_name == "hofstede":
+        test_set_values = [
+            "Power Distance",
+            "Masculinity",
+            "Uncertainty Avoidance",
+            "Long-Term Orientation",
+            "Indulgence",
+            "Individualism"
+        ]
+    elif test_set_name in ["big5_50", "big5_100"]:
+        test_set_values = [
+            "Neuroticism",
+            "Extraversion",
+            "Openness to Experience",
+            "Agreeableness",
+            "Conscientiousness"
+        ]
+
+    primary_value_alignments = []
+    if all(["Primary Values".lower() in d.lower() for d in directories]):
+        for dir, data in dir_2_data.items():
+
+            profile = {}
+            # extract values from profile string
+            if "params" in data:
+                profile_str = data['params']['profile']
+            else:
+                profile_str = dir[dir.rindex("profile"):dir.index("_2023")]
+
+            for item in profile_str.split(';'):
+                key, value = item.split(':')
+                profile[key] = value
+
+            if "Primary values" not in profile:
+                raise ValueError(f"Primary values are not in the profile: {profile}.")
+
+            primary_values = profile["Primary values"].split(",")
+
+            # map_prim_values = {
+            #     "long term orientation": "long_term_orientation",
+            #     "power distance": 'power_distance',
+            #     "uncertainty avoidance": "uncertainty_avoidance",
+            # }
+            # primary_values = [map_prim_values.get(p, p) for p in primary_values]
+
+            assert all([prim_v in test_set_values for prim_v in primary_values])
+
+            # compute the metrics avg_{prim_values} - avg_{other_values}
+            # avg_primary_values = np.mean([data['metrics'][test_set_name][val] for val in primary_values])
+            # avg_other_values = np.mean([data['metrics'][test_set_name][val] for val in list(set(test_set_values) - set(primary_values))])
+            # primary_value_alignment = avg_primary_values - avg_other_values
+
+            for perm_metrics in data["per_permutation_metrics"]:
+                avg_primary_values = np.mean([perm_metrics[test_set_name][val] for val in primary_values])
+                avg_other_values = np.mean(
+                    [perm_metrics[test_set_name][val] for val in list(set(test_set_values) - set(primary_values))])
+                perm_alignment = avg_primary_values - avg_other_values
+                print("permutation alignment: ", perm_alignment)
+                primary_value_alignments.append(perm_alignment)
+
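+            # average alignment over this directory's permutations (the entries appended just above)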
+            perspective_value_alignment = np.mean(primary_value_alignments[-len(data["per_permutation_metrics"]):])
+            print(f"Primary value alignment for {primary_values}: {perspective_value_alignment}.")
+
+        # todo: confirm this
+        mean_primary_value_alignment = np.mean(primary_value_alignments)
+        print(colored(f"Mean primary value alignment (over all): {mean_primary_value_alignment}", "green"))
 
     # mean over value dimensions
     mean_variance = variances.mean()
@@ -309,10 +493,9 @@ if __name__ == '__main__':
 
     if args.horizontal_bar:
 
-        # Set the yticks labels on the left side
+        # Set the y-ticks labels on the left side
         y_locs = list(range(len(keys)))
 
-
         # right labels
         key_to_hofstede_label_right = {
             "power_distance": "high power distance",
@@ -348,6 +531,10 @@ if __name__ == '__main__':
         ax.set_xticks(range(len(keys)))
         ax.set_xticklabels(keys, rotation=45)
 
+    if figure_draw:
+        ax.legend().remove()
+        ax.set_title("")
+
     if args.save:
         for ext in ["png", "svg"]:
             savepath = f"visualizations/{args.filename}.{ext}"
@@ -366,7 +553,6 @@ if __name__ == '__main__':
                 ax_legend.set_xticks([])
                 ax_legend.set_yticks([])
 
-
                 # ax.set_frame_on(False)
                 # ax.xaxis.set_visible(False)
                 # ax.yaxis.set_visible(False)