From 898832c1a589e55cf8dc1cb7a846cf225e387e17 Mon Sep 17 00:00:00 2001
From: KOVAC Grgur <grgur.kovac@inria.fr>
Date: Tue, 30 Apr 2024 15:27:46 +0200
Subject: [PATCH] Refactor models as configs

---
 .gitignore                                    |    3 +-
 README.md                                     |  313 +++-
 .../run_dummy.sh                              |    0
 .../run_iclr_big5.sh                          |    0
 .../run_iclr_hof.sh                           |    0
 .../run_iclr_mmlu.sh                          |    0
 .../run_iclr_pvq.sh                           |    0
 .../run_neurips_big5.sh                       |    0
 .../run_neurips_hof.sh                        |    0
 .../run_neurips_pvq.sh                        |    0
 .../run_plosone_pvq.sh                        |    0
 .../run_plosone_ult.sh                        |    0
 campaign_data_analysis.py                     | 1306 +++++++++++++
 campaign_evaluations.py                       |  176 +-
 categories.py                                 |   66 -
 compile_evaluations.py                        |  178 --
 crop.py                                       |  155 --
 estimate_tokens.py                            |   29 -
 evaluate.py                                   | 1614 -----------------
 evaluate_v3.py                                |  863 +++++++++
 iclr_evaluations.sh                           |   99 -
 ipsative_stat_test.py                         |   86 -
 models/__init__.py                            |   47 +
 models/configs/Mistral-7B-Instruct-v0.1.json  |   15 +
 models/configs/Mistral-7B-Instruct-v0.2.json  |   15 +
 models/configs/Mistral-7B-v0.1.json           |   15 +
 .../Mixtral-8x22B-Instruct-v0.1-4b.json       |   13 +
 .../Mixtral-8x7B-Instruct-v0.1-4b.json        |   16 +
 .../configs/Mixtral-8x7B-Instruct-v0.1.json   |   16 +
 models/configs/Mixtral-8x7B-v0.1-4b.json      |   16 +
 models/configs/Mixtral-8x7B-v0.1.json         |   16 +
 models/configs/Qwen-14B.json                  |   15 +
 models/configs/Qwen-72B.json                  |   15 +
 models/configs/Qwen-7B.json                   |   15 +
 models/configs/Qwen1.5-72B-Chat.json          |   17 +
 models/configs/command_r_plus.json            |   21 +
 models/configs/dummy.json                     |   10 +
 models/configs/gpt-3.5-turbo-0125.json        |   12 +
 models/configs/gpt-3.5-turbo-1106.json        |   12 +
 models/configs/interactive.json               |   10 +
 models/configs/llama_2_13b.json               |   21 +
 models/configs/llama_2_13b_chat.json          |   21 +
 models/configs/llama_2_70b.json               |   21 +
 models/configs/llama_2_70b_chat.json          |   21 +
 models/configs/llama_2_7b.json                |   21 +
 models/configs/llama_2_7b_chat.json           |   21 +
 models/configs/llama_3_70b_instruct.json      |   20 +
 models/configs/llama_3_8b_instruct.json       |   20 +
 models/configs/phi-1.json                     |   15 +
 models/configs/phi-2.json                     |   15 +
 models/configs/phi-3.json                     |   16 +
 models/configs/zephyr-7b-beta.json            |   19 +
 models/dummymodel.py                          |   57 +
 models/huggingfacemodel.py                    |  264 +++
 models/interactivemodel.py                    |   52 +
 models/model.py                               |   19 +
 models/openaimodel.py                         |  112 ++
 models/utils.py                               |  152 ++
 models_stat_test.py                           |  267 ---
 requirements.txt                              |    9 +-
 run_campaign_seeds.sh                         |  125 ++
 run_campaign_sim_conv_no_pop.sh               |    8 +-
 run_campaign_sim_conv_pvq_seeds.sh            |   27 +-
 run_dummy.sh                                  |  177 +-
 run_local.sh                                  |   60 +-
 run_single.sh                                 |  179 +-
 tokens_estimate.py                            |   39 -
 utils.py                                      |   33 +-
 visualization_scripts/data_analysis.py        |   33 +-
 69 files changed, 4086 insertions(+), 2942 deletions(-)
 rename {backup_run_scripts => backup_scripts}/run_dummy.sh (100%)
 rename {backup_run_scripts => backup_scripts}/run_iclr_big5.sh (100%)
 rename {backup_run_scripts => backup_scripts}/run_iclr_hof.sh (100%)
 rename {backup_run_scripts => backup_scripts}/run_iclr_mmlu.sh (100%)
 rename {backup_run_scripts => backup_scripts}/run_iclr_pvq.sh (100%)
 rename {backup_run_scripts => backup_scripts}/run_neurips_big5.sh (100%)
 rename {backup_run_scripts => backup_scripts}/run_neurips_hof.sh (100%)
 rename {backup_run_scripts => backup_scripts}/run_neurips_pvq.sh (100%)
 rename {backup_run_scripts => backup_scripts}/run_plosone_pvq.sh (100%)
 rename {backup_run_scripts => backup_scripts}/run_plosone_ult.sh (100%)
 create mode 100644 campaign_data_analysis.py
 delete mode 100644 categories.py
 delete mode 100644 compile_evaluations.py
 delete mode 100644 crop.py
 delete mode 100644 estimate_tokens.py
 delete mode 100644 evaluate.py
 create mode 100644 evaluate_v3.py
 delete mode 100644 iclr_evaluations.sh
 delete mode 100644 ipsative_stat_test.py
 create mode 100644 models/__init__.py
 create mode 100644 models/configs/Mistral-7B-Instruct-v0.1.json
 create mode 100644 models/configs/Mistral-7B-Instruct-v0.2.json
 create mode 100644 models/configs/Mistral-7B-v0.1.json
 create mode 100644 models/configs/Mixtral-8x22B-Instruct-v0.1-4b.json
 create mode 100644 models/configs/Mixtral-8x7B-Instruct-v0.1-4b.json
 create mode 100644 models/configs/Mixtral-8x7B-Instruct-v0.1.json
 create mode 100644 models/configs/Mixtral-8x7B-v0.1-4b.json
 create mode 100644 models/configs/Mixtral-8x7B-v0.1.json
 create mode 100644 models/configs/Qwen-14B.json
 create mode 100644 models/configs/Qwen-72B.json
 create mode 100644 models/configs/Qwen-7B.json
 create mode 100644 models/configs/Qwen1.5-72B-Chat.json
 create mode 100644 models/configs/command_r_plus.json
 create mode 100644 models/configs/dummy.json
 create mode 100644 models/configs/gpt-3.5-turbo-0125.json
 create mode 100644 models/configs/gpt-3.5-turbo-1106.json
 create mode 100644 models/configs/interactive.json
 create mode 100644 models/configs/llama_2_13b.json
 create mode 100644 models/configs/llama_2_13b_chat.json
 create mode 100644 models/configs/llama_2_70b.json
 create mode 100644 models/configs/llama_2_70b_chat.json
 create mode 100644 models/configs/llama_2_7b.json
 create mode 100644 models/configs/llama_2_7b_chat.json
 create mode 100644 models/configs/llama_3_70b_instruct.json
 create mode 100644 models/configs/llama_3_8b_instruct.json
 create mode 100644 models/configs/phi-1.json
 create mode 100644 models/configs/phi-2.json
 create mode 100644 models/configs/phi-3.json
 create mode 100644 models/configs/zephyr-7b-beta.json
 create mode 100644 models/dummymodel.py
 create mode 100644 models/huggingfacemodel.py
 create mode 100644 models/interactivemodel.py
 create mode 100644 models/model.py
 create mode 100644 models/openaimodel.py
 create mode 100644 models/utils.py
 delete mode 100644 models_stat_test.py
 create mode 100644 run_campaign_seeds.sh
 delete mode 100644 tokens_estimate.py

diff --git a/.gitignore b/.gitignore
index a1abe7c..78916b5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,9 +1,8 @@
 visualizations/
-models/
 data/*
 logs/*
 .idea/
-results/
+results*
 .cache/
 *__pycache__*
 backup_run_scripts/
diff --git a/README.md b/README.md
index 7f942c6..e921802 100644
--- a/README.md
+++ b/README.md
@@ -6,16 +6,120 @@ This codebase is based on MMLU codebase. - link
 
 Setup the conda env
 ```
-conda create -n llm_stability python=3.9
-conda activate llm_persp
+conda create -n llm_stability python=3.10
+conda activate llm_stability
 cd test/
 pip install -r requirements.txt 
 ```
 
+For phi-1 and phi-2
+```
+conda create -n llm_stability_phi python=3.10
+conda activate llm_stability_phi
+cd test/
+pip install -r requirements.txt 
+pip install transformers==4.37.0
+```
+
+## Setup environment variables
+The rest of this guide will use the dummy model, which is a random baseline. For other models, you may wish to set various environment variables.
+
+- To use OpenAI models, set the `OPENAI_API_KEY` env variable:
+```commandline
+export OPENAI_API_KEY="<your_key>"
+```
+- To use huggingface models, set the `HF_HOME` env variable to define your cache directory:
+
+```commandline
+export HF_HOME="$HOME/.cache/huggingface"
+```
+
+- To use huggingface gated models, set the `HF_TOKEN` env variable:
+```commandline
+export HF_TOKEN="<your_token>"
+```
+
+
+
+## Evaluating and computing the stability
+
+### Minimal example
+
+You can run one evaluation with the following command:
+
+```
+theme="joke"
+python -u evaluate_v3.py \
+--engine "dummy" \
+--experiment_name pvq_test \
+--data_dir data/data_pvq \
+--simulated-population-type tolkien_characters \
+--simulated-conversation-theme $theme \
+--simulated-conversation-n-messages 3 \
+--permute-options-seed "testing_seed" \
+--simulated-human-knows-persona \
+--save_dir test_results/pvq_tolkien_dummy_$theme \
+--permute-options \
+--pvq-version "pvq_auto" \
+--verbose
+```
+
+This will evaluate a dummy (random) model simulating tolkien characters on PVQ.
+It will save the results into a timestamped directory, e.g.: ```test_results/pvq_tolkien_dummy_chess_2024_04_29_19_54_19/results.json```
+
+Now lets run the same command for "joke":
+
+```
+theme="joke"
+python -u evaluate_v3.py \
+--engine "dummy" \
+--experiment_name pvq_test \
+--data_dir data/data_pvq \
+--simulated-population-type tolkien_characters \
+--simulated-conversation-theme $theme \
+--simulated-conversation-n-messages 3 \
+--permute-options-seed "testing_seed" \
+--simulated-human-knows-persona \
+--save_dir test_results/pvq_tolkien_dummy_$theme \
+--permute-options \
+--pvq-version "pvq_auto" \
+--verbose
+```
 
-# Evaluating a model
+Now lets run the same command for "grammar":
+```
+theme="grammar"
+python -u evaluate_v3.py \
+--engine "dummy" \
+--experiment_name pvq_test \
+--data_dir data/data_pvq \
+--simulated-population-type tolkien_characters \
+--simulated-conversation-theme $theme \
+--simulated-conversation-n-messages 3 \
+--permute-options-seed "testing_seed" \
+--simulated-human-knows-persona \
+--save_dir test_results/pvq_tolkien_dummy_$theme \
+--permute-options \
+--pvq-version "pvq_auto" \
+--verbose
+```
+
+Great, now we have three results and can compute the stability of the model with the following command:
+```
+python ./visualization_scripts/data_analysis.py test_results/pvq_tolkien_dummy_*
+```
+
+This should give stabilities close to zero because we are testing a dummy model, which selects answers at random:
+```
+------------------------
+Aggregated metrics
+------------------------
+Rank-Order      Ipsative
+0.0051          0.0074
+```
 
-The ``run_single.sh`` contains an example of how to evaluate a models.
+You can see examples of other settings in the ``run_single.sh`` script.
+The sole purpose of this script is to make this tutorial easier to follow.
 
 It requires setting 7 parameters, which are by default set to:
 ```
@@ -28,99 +132,188 @@ It requires to set 7 parameters, which are by default set to:
 7. Experiment name:test
 ```
 
-You can modify those parameters inside the script (following the comments).
-
+Following the comments in the script, you can modify those parameters.
 
-From the test directory, run
+From the test directory, you can run
 ```
 bash run_single.sh
 ```
 
 This will evaluate a dummy model, which chooses random answers on the PVQ questionnaire.
 
-# Running all experiments
 
+## Campaign evaluations and stability computation
+
+In the previous example, we showed how to run a single evaluation: administering a questionnaire to a simulated population with **one** model, **one** conversation topic, and **one** seed.
+
+In practice, we want to run those evaluations as a campaign (evaluate on **many** topics and **many** seeds).
+We can do this with the ``run_campaign_seeds.sh`` script.
+
+This script accepts two arguments:
+ - model: e.g. ``dummy`` (to see all available models, run ``ls models/configs/``)
+ - experiment_type: as defined in the following table
 
-All the experiments in the paper are shown in ```run_campain*.sh``` scripts.
+| experiment_type | task     | simulated population |
+|-----------------|----------|----------------------|
+| pvq_tolk        | PVQ      | tolkien characters   |
+| pvq_fam         | PVQ      | real-world personas  |
+| religion        | religion | real-world personas  |
+| don             | donation | tolkien characters   |
+| bag             | stealing | tolkien characters   |
 
-These are slurm scripts and enable parallel evaluation of different topics and seeds. These scripts require an argument, which defines the model.
-The following command evaluates the Mistral-7B-Instruct-v0.2 model (```model_idx=7```):
+We can use this script on a regular or a slurm-based machine to run 25 evaluations: 5 seeds (answer permutations) x 5 conversation topics at once.
 
+- Regular machine:
+
+    This will run the 25 evaluations sequentially:
+    ```commandline
+    for i in {0..24}; do SLURM_ARRAY_TASK_ID=$i bash run_campaign_seeds.sh dummy pvq_tolk ; done
+    ```
+
+    This will run the 25 evaluations in parallel:
+    ```commandline
+    for i in {0..24}; do SLURM_ARRAY_TASK_ID=$i bash run_campaign_seeds.sh dummy pvq_tolk & done; wait
+    ```
+
+- Slurm-based machine:
+    
+    Make sure to modify your slurm config at the top of ``run_campaign_seeds.sh``.
+    This will launch 25 parallel jobs:
+    ```commandline
+    sbatch run_campaign_seeds.sh dummy pvq_tolk
+    ```
+
+After evaluating the dummy model with any of the above commands, you should have the following folder structure:
+```commandline
+results/stability_default_params_pvq_tolkien_characters/dummy/seed_0
+results/stability_default_params_pvq_tolkien_characters/dummy/seed_2
+results/stability_default_params_pvq_tolkien_characters/dummy/seed_4
+results/stability_default_params_pvq_tolkien_characters/dummy/seed_6
+results/stability_default_params_pvq_tolkien_characters/dummy/seed_8
 ```
-sbatch run_campaign_sim_conv_pvq_seeds.sh <model_idx>
+Each seed directory should contain 5 subdirectories, each with a results.json file, e.g.:
+```commandline
+results/stability_default_params_pvq_tolkien_characters/dummy/seed_0/theme_chess_2024_04_29_20_33_03/results.json
+results/stability_default_params_pvq_tolkien_characters/dummy/seed_0/theme_grammar_2024_04_29_20_23_19/results.json
+results/stability_default_params_pvq_tolkien_characters/dummy/seed_0/theme_history_2024_04_29_20_33_06/results.json
+results/stability_default_params_pvq_tolkien_characters/dummy/seed_0/theme_joke_2024_04_29_20_33_07/results.json
+results/stability_default_params_pvq_tolkien_characters/dummy/seed_0/theme_poem_2024_04_29_20_33_05/results.json
 ```
 
-Here is a list of models and their indices (this correponds to the index in the model list in run_campain*.sh scripts):
+In other words, the following command should return 25:
+
+```
+ls  results/stability_default_params_pvq_tolkien_characters/dummy/seed_*/*/results.json | wc -l
+```
 
-| Model | model_idx |
-|-------|----------------|
-|llama_2_7b| 0 |
-|llama_2_13b| 1 |
-|llama_2_7b_chat| 2 |
-|llama_2_13b_chat| 3 |
-|zephyr-7b-beta| 4 |
-|Mistral-7B-v0.1| 5 |
-|Mistral-7B-Instruct-v0.1| 6 |
-|Mistral-7B-Instruct-v0.2| 7 |
-|llama_2_70b| 8 |
-|llama_2_70b_chat| 9 |
-|Mixtral-8x7B-v0.1-4b| 10 |
-|Mixtral-8x7B-Instruct-v0.1-4b| 11 |
-|Mixtral-8x7B-v0.1| 12 |
-|Mixtral-8x7B-Instruct-v0.1|13 |
-|phi-2| 14 |
-|phi-1| 15 |
-|Qwen-72B| 17 |
-|Qwen-14B| 18 |
-|Qwen-7B| 19 |
+We can compute the stability for one seed with the `data_analysis.py` script:
+```commandline
+python ./visualization_scripts/data_analysis.py results/stability_default_params_pvq_tolkien_characters/dummy/seed_0/*
+```
 
-Those scripts also require setting the population and the questionnaire. They can easily be changed following the scripts comments.
-By default, they are set to fictional characters and PVQ:
+As we are using a dummy model, we should get stabilities close to zero:
 ```
-## PVQ - tolkien characters
-test_tag="pvq"
-experiment_name="pvq_test"
-data_dir="data_pvq"
-population_type="tolkien_characters"
+------------------------
+Aggregated metrics
+------------------------
+Rank-Order      Ipsative
+-0.0030         -0.0088
 ```
+TIP: you can add the `--no-ips` argument to the `data_analysis.py` call to compute only the Rank-Order stability (this is much faster).
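+For example, to compute only the Rank-Order stability for seed 0 of the dummy model:
+```commandline
+python ./visualization_scripts/data_analysis.py results/stability_default_params_pvq_tolkien_characters/dummy/seed_0/* --no-ips
+```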
 
 
-The scripts are used for various experiments as follows:
+We can also analyze many seeds and models at once with the `campaign_data_analysis.py` script.
+It takes the following arguments:
+- `--fig-name` defines the experiment type to analyze (the script looks in the corresponding results subdirectory); the options are `tolk_ro_t`, `fam_ro_t`, `religion_t`, `don_t`, `bag_t`
+- `--assert-n-contexts 5` asserts that each seed contains 5 conversation topics
+- `--all-models` evaluates all models in the `./models/configs` directory; if you do not set this argument, you can manually define the models list on line 33 of the script
 
-Experiments with simulated populations: ```run_campaign_sim_conv_pvq_seeds.sh``` 
 
-Experiments with simulated populations and increasing conversation length:
-```run_campaign_sim_conv_pvq_msgs.sh```
+```commandline
+python campaign_data_analysis.py --fig-name tolk_ro_t --assert-n-contexts 5 --all-models
+```
 
-Experiments with no persona instructions: ```run_campaign_sim_conv_no_pop.sh```
+Towards the end of the output, you should see a line like `random: -0.00029 +/- 0.005`; again, the dummy model has near-zero stability.
+The displayed figure should show one bar (it will not be easily visible, as its value is ~0.0).
 
-Ablation study on the system message with LLaMa-2 models: ```run_campaign_sim_conv_pvq_NO_SYSTEM.sh```
 
+## Recreating the results
 
-## Non-slurm machine
+Using the procedure described above, you can evaluate the models from the paper with the `run_campaign_seeds.sh` script,
+and then analyze the results with the `campaign_data_analysis.py` script.
 
-The ```run_campain*.sh``` scripts can be run on a regular machine my manually setting the ```SLURM_ARRAY_TAK_ID''' variable as follows:
+We recommend starting with the religion task, as it is the smallest one.
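+For example, on a slurm-based machine, recreating the religion results could look as follows (replace `<model_name>` with any config name from ``models/configs/``):
+```commandline
+# evaluate one model on the religion task (5 seeds x 5 conversation topics)
+sbatch run_campaign_seeds.sh <model_name> religion
+
+# after all models have been evaluated, compute and plot the stabilities
+python campaign_data_analysis.py --fig-name religion_t --assert-n-contexts 5 --all-models
+```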
 
-1. Check the slurm array size parameter
+## Adding a new model
 
+Most models on the huggingface hub can be added by simply adding a new config file.
+```commandline
+touch ./models/configs/mymodelname.json
 ```
-grep "$SBATCH --array=" run_campaign_sim_conv_pvq_seeds.sh
+
+This assumes that the model can be used in the standard way as follows:
 ```
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model = AutoModelForCausalLM.from_pretrained(model_id, **load_args)
+tokenizer = AutoTokenizer.from_pretrained(model_id, **load_args)
+prompt = "Hello"
 
-The expected output is:
-```#SBATCH --array=0-29 # themes x n_seeds -> 6x5```
-This means that slurm would run **30 parallel jobs** corresponding to 6 themes (5 + no theme) and 5 seeds.
+# for chat models
+input_ids = tokenizer.apply_chat_template(
+    [{"role": "user", "content": prompt}], return_tensors="pt", add_generation_prompt=True
+).to(model.device)
 
-2. Run the jobs manually
+# for base models
+input_ids = tokenizer(prompt, return_tensors="pt").to(model.device).input_ids
 
-You can run the 30 evaluations sequentially on a regular machine as follows:
-```
-for i in {0..29}; do SLURM_ARRAY_TASK_ID=$i bash run_campaign_sim_conv_pvq_seeds.sh <model_idx> ; done
+output_seq = model.generate(input_ids=input_ids, **generation_args, return_dict_in_generate=True, output_scores=True, stopping_criteria=stopping_criteria)
+response = tokenizer.decode(output_seq.sequences[0][len(input_ids[0]):], skip_special_tokens=True)
 ```
+You can refer to `models/huggingfacemodel.py` for more details.
 
-or in parallel as follows:
+Here is an example of the config file for the LLaMa-2-7b model:
+```
+{
+  "model_class": "HuggingFaceModel",
+  "model_id": "meta-llama/Llama-2-7b-hf",
+  "system_message": true,
+  "base_model_template": true,
+  "load_args": {
+    "torch_dtype": "torch.float16",
+    "trust_remote_code": true,
+    "device_map": "auto",
+    "token": "HF_TOKEN"
+  },
+  "generation_args": {
+    "max_new_tokens": 100,
+    "do_sample": true,
+    "top_p": 0.9,
+    "top_k": 50,
+    "temperature": 0.6,
+    "repetition_penalty": 1.2,
+    "num_beams": 1
+  }
+}
 ```
-for i in {0..29}; do SLURM_ARRAY_TASK_ID=$i bash run_campaign_sim_conv_pvq_seeds.sh <model_idx> & done
+It should be filled as follows:
+- `model_class` - should be `"HuggingFaceModel"` unless you want to define your own class (which should be placed in ``models/``)
+- `model_id` - the huggingface hub model id
+- `base_model_template` - `true` if the model is a base model, `false` if the model is chat- or instruct-tuned (i.e. its tokenizer has the `apply_chat_template` function)
+- `system_message` - `true` if the model accepts a system message or if it is a base model
+- `load_args` - arguments that will be passed to `AutoTokenizer.from_pretrained` and `AutoModelForCausalLM.from_pretrained` in addition to the `model_id` 
+- `generation_args` - arguments that will be passed to the `generate` function while simulating conversations
+
+Minor points:
+- The string "HF_TOKEN" is automatically replaced with the token stored in the `HF_TOKEN` environment variable.
+- The string "torch.float16" is parsed to the torch.float16 value.
+
+For additional details refer to `models/__init__.py` (`create_model` and `load_model_args` methods).
+
+
+After correctly filling in the config file, the new model can be passed as the `engine` argument to evaluate_v3.py (the name of the json file without the extension).
+You should be able to evaluate it like any other model.
+For example, using:
+```commandline
+sbatch run_campaign_seeds.sh mymodelname pvq_tolk
 ```
 
 
diff --git a/backup_run_scripts/run_dummy.sh b/backup_scripts/run_dummy.sh
similarity index 100%
rename from backup_run_scripts/run_dummy.sh
rename to backup_scripts/run_dummy.sh
diff --git a/backup_run_scripts/run_iclr_big5.sh b/backup_scripts/run_iclr_big5.sh
similarity index 100%
rename from backup_run_scripts/run_iclr_big5.sh
rename to backup_scripts/run_iclr_big5.sh
diff --git a/backup_run_scripts/run_iclr_hof.sh b/backup_scripts/run_iclr_hof.sh
similarity index 100%
rename from backup_run_scripts/run_iclr_hof.sh
rename to backup_scripts/run_iclr_hof.sh
diff --git a/backup_run_scripts/run_iclr_mmlu.sh b/backup_scripts/run_iclr_mmlu.sh
similarity index 100%
rename from backup_run_scripts/run_iclr_mmlu.sh
rename to backup_scripts/run_iclr_mmlu.sh
diff --git a/backup_run_scripts/run_iclr_pvq.sh b/backup_scripts/run_iclr_pvq.sh
similarity index 100%
rename from backup_run_scripts/run_iclr_pvq.sh
rename to backup_scripts/run_iclr_pvq.sh
diff --git a/backup_run_scripts/run_neurips_big5.sh b/backup_scripts/run_neurips_big5.sh
similarity index 100%
rename from backup_run_scripts/run_neurips_big5.sh
rename to backup_scripts/run_neurips_big5.sh
diff --git a/backup_run_scripts/run_neurips_hof.sh b/backup_scripts/run_neurips_hof.sh
similarity index 100%
rename from backup_run_scripts/run_neurips_hof.sh
rename to backup_scripts/run_neurips_hof.sh
diff --git a/backup_run_scripts/run_neurips_pvq.sh b/backup_scripts/run_neurips_pvq.sh
similarity index 100%
rename from backup_run_scripts/run_neurips_pvq.sh
rename to backup_scripts/run_neurips_pvq.sh
diff --git a/backup_run_scripts/run_plosone_pvq.sh b/backup_scripts/run_plosone_pvq.sh
similarity index 100%
rename from backup_run_scripts/run_plosone_pvq.sh
rename to backup_scripts/run_plosone_pvq.sh
diff --git a/backup_run_scripts/run_plosone_ult.sh b/backup_scripts/run_plosone_ult.sh
similarity index 100%
rename from backup_run_scripts/run_plosone_ult.sh
rename to backup_scripts/run_plosone_ult.sh
diff --git a/campaign_data_analysis.py b/campaign_data_analysis.py
new file mode 100644
index 0000000..b7dff95
--- /dev/null
+++ b/campaign_data_analysis.py
@@ -0,0 +1,1306 @@
+import sys
+import glob
+import math
+import subprocess
+import json
+import os
+import itertools
+import numpy as np
+import matplotlib.pyplot as plt
+import hashlib
+import checksumdir
+import inspect
+import scipy.stats as st
+from termcolor import cprint
+import argparse
+
+# use this command to convert SVGs to PDFs
+# for f in *; do DISPLAY= inkscape $f --export-pdf="${f%.*}.pdf"; done
+# crop pdf images with
+# sudo apt-get install texlive-extra-utils
+# for f in *_Fig.pdf; do pdfcrop $f $f; done
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--no-show", action="store_true")
+parser.add_argument("--fig-name", type=str, default="test")
+parser.add_argument("--assert-n-contexts", type=int, default=-1, help="Set to <0 for no asserts")
+parser.add_argument("--all-models", action="store_true")
+args = parser.parse_args()
+
+if args.all_models:
+    models = sorted([os.path.splitext(m)[0] for m in os.listdir("./models/configs")])  # strip the ".json" extension
+else:
+    models = [
+        "llama_2_7b",
+        "llama_2_13b",
+        "llama_2_70b",
+        "llama_2_7b_chat",
+        "llama_2_13b_chat",
+        "llama_2_70b_chat",
+        "Mistral-7B-v0.1",
+        "Mistral-7B-Instruct-v0.1",
+        "Mistral-7B-Instruct-v0.2",
+        "zephyr-7b-beta",
+        "Mixtral-8x7B-v0.1-4b",
+        "Mixtral-8x7B-Instruct-v0.1-4b",
+        "Mixtral-8x7B-v0.1",
+        "Mixtral-8x7B-Instruct-v0.1",
+        "phi-1",
+        "phi-2",
+        "phi-3",
+        "Qwen-7B",
+        "Qwen-14B",
+        "Qwen-72B",
+        "Qwen1.5-72B-Chat",
+        "gpt-3.5-turbo-1106",
+        "gpt-3.5-turbo-0125",
+    ]
+
+assert len(set(models)) == len(models)
+
+
+def model_2_family(model):
+    model_lower = model.lower()
+    if "llama_2" in model_lower:
+        return "LLaMa-2"
+    elif "mixtral" in model_lower:
+        return "Mixtral"
+    elif "mistral" in model_lower or "zephyr" in model_lower:
+       return "Mistral"
+    elif "phi" in model_lower:
+        return "Phi"
+    elif "qwen" in model_lower:
+        return "Qwen"
+    elif "gpt" in model_lower:
+        return "GPT"
+    elif "dummy" == model_lower:
+        return "dummy"
+    elif "random" == model_lower:
+        return "random"
+    else:
+        return model
+
+
+family_2_color = {
+    "LLaMa-2": "blue",
+    "Mixtral": "orange",
+    "Mistral": "green",
+    "Phi": "red",
+    "Qwen": "purple",
+    "GPT": "black",
+    "dummy": "brown",
+    "random": "brown"
+}
+
+family_2_linestyle = {
+    "LLaMa-2": ":",
+    "Mixtral": "-",
+    "Mistral": "dashdot",
+    "Phi":  (0, (3, 5, 1, 5, 1, 5)),
+    "Qwen": "--",
+    # "GPT": "-",
+    # "dummy": "-"
+}
+
+def FDR(scores):
+    from scipy.stats import ttest_ind
+    from statsmodels.stats.multitest import multipletests
+
+    # Compute pairwise t-tests
+    n_models = scores.shape[0]
+    p_values = np.ones((n_models, n_models))  # Initialize a matrix of p-values
+
+    for i in range(n_models):
+        for j in range(i + 1, n_models):  # No need to test against itself or repeat comparisons
+            stat, p_value = ttest_ind(scores[i], scores[j])
+            p_values[i, j] = p_value
+            p_values[j, i] = p_value  # Symmetric matrix
+
+    # Flatten the p-value matrix and remove ones to prepare for FDR correction
+    p_values_flat = p_values[np.tril_indices(n_models)]
+    # Apply FDR correction
+    reject, p_values_corrected, _, _ = multipletests(p_values_flat, alpha=0.05, method='fdr_bh')
+
+    # Reshape the corrected p-values back into a matrix
+    p_values_corrected_matrix = np.zeros((n_models, n_models))
+    p_values_corrected_matrix[np.tril_indices(n_models)] = p_values_corrected
+    p_values_corrected_matrix += p_values_corrected_matrix.T  # Make symmetric
+
+    return p_values_corrected_matrix
+
+def plot_comparison_matrix(models, p_values_matrix, figure_name, title="Model Comparison"):
+    fig, ax = plt.subplots(figsize=(8, 6))
+    cax = ax.matshow(p_values_matrix, cmap='gray_r')
+
+    # Setting axes labels
+    ax.set_xticks(range(len(models)))
+    ax.set_yticks(range(len(models)))
+    ax.set_xticklabels(models, rotation=90)
+    ax.set_yticklabels(models)
+
+    # Title and color bar
+    plt.title(title)
+    # fig.colorbar(cax)
+    plt.tight_layout()
+
+    fig_path = f'visualizations/{figure_name}_comparison.pdf'
+    print(f"save to: {fig_path}")
+    plt.savefig(fig_path)
+
+    if not args.no_show:
+        plt.show()  # show the figure
+
+    plt.close()
+
+
+def legend_without_duplicate_labels(ax, loc="best", title=None, legend_loc=None):
+    handles, labels = ax.get_legend_handles_labels()
+    unique = [(h, l) for i, (h, l) in enumerate(zip(handles, labels)) if l not in labels[:i]]
+    # axs[plt_i].legend(bbox_to_anchor=legend_loc, loc="best")
+    if legend_loc:
+        loc="upper left"
+    else:
+        loc="best"
+
+    ax.legend(*zip(*unique), loc=loc, title=title, fontsize=legend_fontsize, title_fontsize=legend_fontsize, bbox_to_anchor=legend_loc)
+
+def get_all_ipsative_corrs_str(default_profile):
+
+    if default_profile is None:
+        return "All_Ipsative_corrs"
+    else:
+        return "All_Ipsative_corrs_default_profile"
+
+
+def get_all_ro_corrs_str(RO_neutral, paired_data_dir):
+    assert RO_neutral != paired_data_dir
+    if RO_neutral:
+        return "All_Neutral_Rank-Order_stabilities"
+    elif paired_data_dir:
+        return "All_Proxy_stabilities"
+    else:
+        return "All_Rank-Order_stabilities"
+
+def run_analysis(eval_script_path, data_dir, assert_n_contexts=None, default_profile=None, paired_data_dir=None, RO_neutral=False, RO_neutral_data_dir=None, no_ips=False):
+    # run evaluation script
+    command = f"python {eval_script_path} --result-json-stdout {'--assert-n-dirs ' + str(assert_n_contexts) if assert_n_contexts else ''} {f'--default-profile {default_profile}' if default_profile is not None else ''} {data_dir}/* {f'--paired-dirs {paired_data_dir}/*/*' if paired_data_dir is not None else ''} {f'--neutral-ranks --neutral-dir {RO_neutral_data_dir}' if RO_neutral else ''} {'--no-ips' if no_ips else ''}"
+    print("Command: ", command)
+    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    stdout, stderr = process.communicate()
+
+    if stderr:
+        command = f"python {eval_script_path} --result-json-stdout {'--assert-n-dirs ' + str(assert_n_contexts) if assert_n_contexts else ''} {f'--default-profile {default_profile}' if default_profile is not None else ''} {data_dir}/*/* {f'--paired-dirs {paired_data_dir}/*/*' if paired_data_dir is not None else ''} {f'--neutral-ranks --neutral-dir {RO_neutral_data_dir}' if RO_neutral else ''} {'--no-ips' if no_ips else ''}"
+        print("(old savedir detected runing Command: ", command)
+        process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        stdout, stderr = process.communicate()
+
+    # parse json outputs
+    results = json.loads(stdout)
+
+    all_ipsative_corrs_str = get_all_ipsative_corrs_str(default_profile)
+    results[all_ipsative_corrs_str] = np.array(results[all_ipsative_corrs_str])
+
+    return results
+
+all_data_dirs = []
+
+
+
+
+
+
+x_label_map = {
+    "dummy": "random",
+    "llama_2_7b":  "LLaMa_2_7b",
+    "llama_2_13b": "LLaMa_2_13b",
+    "llama_2_70b": "LLaMa_2_70b",
+    "llama_2_7b_chat": "LLaMa_2_7b_chat",
+    "llama_2_13b_chat": "LLaMa_2_13b_chat",
+    "llama_2_70b_chat": "LLaMa_2_70b_chat",
+    "phi-2": "Phi-2",
+    "phi-1": "Phi-1",
+
+}
+x_label_map = {**x_label_map, **{k: k.replace("_msgs", "") for k in ["1_msgs", "3_msgs", "5_msgs", "7_msgs", "9_msgs"]}}
+
+x_label_map = {**x_label_map, **{
+    "gpt-3.5-turbo-1106": "GPT-3.5-1106",
+    "gpt-3.5-turbo-0125": "GPT-3.5-0125",
+}}
+
+
+# Define the results directory
+# sim conv
+
+add_legend = False
+bars_as_plot = False
+label_ = None
+
+results_dir = "results"
+# experiment_dirs = [
+#     "sim_conv_pvq_tolkien_characters_seeds",
+#     # "sim_conv_pvq_famous_people_seeds",
+#     # "sim_conv_pvq_tolkien_characters_seeds_NO_SYSTEM",
+#     # "sim_conv_tolkien_donation_tolkien_characters_seeds",
+# ]
+# if "permutations_msgs" in experiment_dirs[0]:
+#     seed_strings = [f"{i}_msgs/_seed" for i in range(1, 10, 2)]  # msgs (show trends
+#     # seed_strings = ["3_msgs/_seed"] # ips (only n=3)
+# else:
+#     seed_strings = [f"{i}_seed" for i in range(1, 10, 2)]
+#     # seed_strings = [f"{i}_seed" for i in range(3, 10, 2)]
+
+add_tolkien_ipsative_curve = True
+bar_plots = False
+
+metric = "Rank-Order"
+# metric = "Ipsative"
+
+ci_ticks = False
+
+# Check whether at least one argument was passed
+figure_name = args.fig_name
+
+# list of options
+# figure_name = "tolk_ro_t"
+# figure_name = "fam_ro_t"
+# figure_name = "no_pop_ips"
+# figure_name = "no_pop_msgs"
+# figure_name = "tolk_ro_msgs"
+# figure_name = "religion_t"
+# figure_name = "don_t"
+# figure_name = "bag_t"
+# figure_name = "paired_tolk_ro_uni"
+# figure_name = "paired_tolk_ro_ben"
+# figure_name = "paired_tolk_ro_pow"
+# figure_name = "paired_tolk_ro_ach"
+# app
+# figure_name = "tolk_ips_msgs"
+# figure_name = "tolk_ips_msgs_default_prof"
+# figure_name = "tolk_ro_msgs_neutral"
+# figure_name = "llama_sys_no_sys"
+
+rotatation_x_labels = 0
+
+legend_fontsize = 18
+human_data_fontsize = 12
+xticks_fontsize = 15
+yticks_fontsize = 15
+y_label_fontsize = 25
+x_label_fontsize = 20
+title_fontsize = 18
+
+interval_figsize_x = 8
+interval_figsize_y = 7
+
+round_y_lab = 1
+
+show_human_change = False
+legend_loc = None
+
+legend_title = "LLM families"
+
+title=None
+
+default_profile = None
+
+add_tolkien_ro_curve = False
+add_tolkien_ipsative_curve = False
+
+left_adjust = None
+paired_dir = None
+y_label = None
+
+RO_neutral = False
+
+# FDR rest
+FDR_test = True
+
+# Families legend
+families_plot = False
+fam_min_y, fam_max_y = -0.1, 0.8
+
+if figure_name == "no_pop_msgs":
+    experiment_dirs = ["sim_conv_pvq_permutations_msgs"]
+    seed_strings = [f"{i}_msgs/_seed" for i in range(1, 10, 2)]  # msgs (show trends
+
+    FDR_test = False
+
+    add_tolkien_ipsative_curve = True
+    bar_plots = False
+    models = [
+        "Mixtral-8x7B-Instruct-v0.1",
+        "Mixtral-8x7B-Instruct-v0.1-4b",  # 6h
+        "zephyr-7b-beta",
+        "Mistral-7B-Instruct-v0.2",
+        "Mistral-7B-Instruct-v0.1",
+        "Qwen-72B",
+        "Qwen-14B",
+        "Qwen-7B",
+        "llama_2_70b_chat",  # 2 gpu
+        "llama_2_70b",  # 2 gpu
+        "phi-2",
+        "gpt-3.5-turbo-0125",
+    ]
+    metric = "Ipsative"
+    human_change_xloc = -1.0
+    msgs_ro_tolk = False
+
+    min_y, max_y = -0.1, 1.0  # IPS
+    legend_fontsize = 22
+    xticks_fontsize = 20
+    yticks_fontsize = 20
+    y_label_fontsize = 40
+    x_label_fontsize = 30
+    interval_figsize_x = 14
+    interval_figsize_y = 7
+
+elif figure_name == "tolk_ips_msgs_default_prof":
+
+    # Messages on Ips Tolkien
+    models = ["1_msgs", "3_msgs", "5_msgs", "7_msgs", "9_msgs"]
+    experiment_dirs = ["sim_conv_pvq_tolkien_characters_msgs/Mixtral-8x7B-Instruct-v0.1"]
+    seed_strings = [f"{i}_seed" for i in range(1, 10, 2)]
+    y_label = "Stability"
+
+    bar_plots = True
+    bars_as_plot = True
+    add_tolkien_ro_curve = True
+    add_tolkien_ipsative_curve = False
+    msgs_ro_tolk = True
+    legend_title = None
+    legend_fontsize = 14
+
+    label_ = "Ipsative stability (with\n  the default profile)"
+
+    metric = "Ipsative_default_profile"
+    default_profile = "results/sim_conv_pvq_permutations_msgs/Mixtral-8x7B-Instruct-v0.1/9_msgs/_seed/results_sim_conv_permutations_Mixtral-8x7B-Instruct-v0.1/pvq_test_Mixtral-8x7B-Instruct-v0.1_data_pvq_pvq_auto__permutations_50_permute_options_5_no_profile_True_format_chat___2024_02_14_20_47_27"
+    add_legend = True
+    human_change_xloc = 6.8
+    show_human_change = False
+
+    min_y, max_y = 0.3, 0.8  # IPS
+    round_y_lab = 2
+
+    left_adjust = 0.15
+
+    interval_figsize_x = 6
+    interval_figsize_y = 6
+
+    # PLOSONE
+    interval_figsize_x = 6
+    interval_figsize_y = 4
+
+elif figure_name == "no_pop_ips":
+    experiment_dirs = ["sim_conv_pvq_permutations_msgs"]
+    seed_strings = ["3_msgs/_seed"]  # ips (only n=3)
+
+    add_tolkien_ipsative_curve = False
+    bar_plots = True
+    add_legend = True
+    metric = "Ipsative"
+    human_change_xloc = -1.0
+    msgs_ro_tolk = False
+
+    show_human_change = True
+
+    human_data_fontsize = 8
+    xticks_fontsize = 17
+    yticks_fontsize = 20
+    legend_fontsize = 25
+    rotatation_x_labels = 90
+    y_label_fontsize = 30
+
+    # legend_loc = (0.2, 0.45)
+    legend_loc = (1, 1)
+
+    interval_figsize_x = 14
+    interval_figsize_y = 7
+
+    human_data_fontsize = 14
+
+    min_y, max_y = -0.1, 1.0  # IPS
+
+elif figure_name.startswith("tolk_ro_t"):
+
+    experiment_dirs = ["stability_default_params_pvq_tolkien_characters"]
+    seed_strings = [f"seed_{i}" for i in range(0, 9, 2)]
+
+    add_tolkien_ipsative_curve = False
+    bar_plots = True
+    add_legend = True
+    legend_loc = (0.001, 0.99)
+    metric = "Rank-Order"
+    human_change_xloc = 6.8
+    msgs_ro_tolk = False
+    show_human_change = True
+    legend_fontsize = 22
+    rotatation_x_labels = 90
+
+    xticks_fontsize = 15
+    yticks_fontsize = 18
+
+    min_y, max_y = -0.1, 0.8  # RO
+
+elif figure_name.startswith("religion_t"):
+
+    rotatation_x_labels = 90
+
+    # title = "Religion stability of real world persons"
+    # title = "(C)"
+
+    # experiment_dirs = ["sim_conv_religion_famous_people_seeds"]
+    # seed_strings = [f"{i}_seed" for i in range(1, 10, 2)]
+    #
+    # experiment_dirs = ["RERUN_sim_conv_religion_famous_people_seeds"]
+    # seed_strings = [f"{i}_seed" for i in range(0, 9, 2)]
+    # models = [
+    #     "llama_2_7b",
+    #     "llama_2_13b",
+    #     "llama_2_70b",  # 2 gpu
+    #     "llama_2_7b_chat",
+    #     "llama_2_13b_chat",
+    #     # "llama_2_70b_chat",  # 2 gpu
+    #     "Mistral-7B-v0.1",
+    #     "Mistral-7B-Instruct-v0.1",
+    #     "Mistral-7B-Instruct-v0.2",
+    #     "zephyr-7b-beta",
+    #     "Mixtral-8x7B-v0.1-4b",  # 6h
+    #     "Mixtral-8x7B-Instruct-v0.1-4b",  # 6h
+    #     "Mixtral-8x7B-v0.1",
+    #     "Mixtral-8x7B-Instruct-v0.1",
+    #     "phi-1",
+    #     "phi-2",
+    #     "Qwen-7B",
+    #     "Qwen-14B",
+    #     "Qwen-72B",
+    #     "gpt-3.5-turbo-1106",
+    #     "gpt-3.5-turbo-0125",
+    #
+    # ]
+    #
+    # experiment_dirs = ["stability_religion_famous_people"]
+    # seed_strings = [f"seed_{i}" for i in range(0, 9, 2)]
+
+
+    experiment_dirs = ["stability_default_params_religion_famous_people"]
+    seed_strings = [f"seed_{i}" for i in range(0, 9, 2)]
+
+    add_tolkien_ipsative_curve = False
+    bar_plots = True
+    add_legend = False
+
+    metric = "Rank-Order"
+    msgs_ro_tolk = False
+    show_human_change = False
+    legend_fontsize = 22
+
+    xticks_fontsize = 15
+    yticks_fontsize = 18
+
+    min_y, max_y = -0.1, 0.8  # RO
+
+elif figure_name.startswith("paired_tolk_ro"):
+
+    if figure_name.endswith("uni"):
+        value_to_pair = "Universalism"
+        letter = "(A)"
+    elif figure_name.endswith("ben"):
+        value_to_pair = "Benevolence"
+        letter = "(B)"
+    elif figure_name.endswith("pow"):
+        value_to_pair = "Power"
+        letter = "(C)"
+    elif figure_name.endswith("ach"):
+        value_to_pair = "Achievement"
+        letter = "(D)"
+    else:
+        raise ValueError(f"Undefined figure name: {figure_name}")
+
+    # title = f"{letter} {value_to_pair}"
+
+    # y_label = f"Rank-Order stability\n{value_to_pair}-Donation"
+    y_label = f"Rank-Order stability\nwith donation"
+
+    # experiment_dirs = ["sim_conv_pvq_tolkien_characters_seeds"]
+    # paired_dir = "sim_conv_tolkien_donation_tolkien_characters_seeds"
+    # seed_strings = [f"{i}_seed" for i in range(1, 10, 2)]
+
+    experiment_dirs = ["RERUN_sim_conv_pvq_tolkien_characters_seeds"]
+    paired_dir = "RERUN_sim_conv_tolkien_donation_tolkien_characters_seeds"
+    seed_strings = [f"{i}_seed" for i in range(0, 9, 2)]
+
+    add_tolkien_ipsative_curve = False
+    bar_plots = True
+
+    if value_to_pair == "Universalism":
+        add_legend = True
+        legend_fontsize = 20
+    else:
+        add_legend = False
+
+    metric = "Rank-Order"
+    msgs_ro_tolk = False
+    show_human_change = False
+    human_change_xloc = 6.8
+    rotatation_x_labels = 90
+
+    xticks_fontsize = 15
+    yticks_fontsize = 18
+
+    left_adjust = 0.2
+
+    if value_to_pair in ["Power", "Achievement"]:
+        min_y, max_y = -0.5, 0.1
+    else:
+        min_y, max_y = -0.1, 0.5
+
+elif figure_name.startswith("fam_ro_t"):
+
+    # title = "Personal value stability of real world personas with PVQ"
+    # title = "(B)"
+
+    # experiment_dirs = ["sim_conv_pvq_famous_people_seeds"]
+    # seed_strings = [f"{i}_seed" for i in range(1, 10, 2)]
+
+    # experiment_dirs = ["RERUN_sim_conv_pvq_famous_people_seeds"]
+    # seed_strings = [f"{i}_seed" for i in range(0, 9, 2)]
+
+    experiment_dirs = ["stability_default_params_pvq_famous_people"]
+    seed_strings = [f"seed_{i}" for i in range(0, 9, 2)]
+
+    add_tolkien_ipsative_curve = False
+    bar_plots = True
+    add_legend = False
+    metric = "Rank-Order"
+    human_change_xloc = 6.8
+    msgs_ro_tolk = False
+
+    show_human_change = True
+    rotatation_x_labels = 90
+
+    xticks_fontsize = 15
+    yticks_fontsize = 18
+
+    min_y, max_y = -0.1, 0.8  # RO
+
+elif figure_name.startswith("don_t"):
+
+    # title = "Donation stability of fictional characters"
+    # title = "(A)"
+    # experiment_dirs = ["sim_conv_tolkien_donation_tolkien_characters_seeds"]
+    # seed_strings = [f"{i}_seed" for i in range(1, 10, 2)]
+
+    # experiment_dirs = ["RERUN_sim_conv_tolkien_donation_tolkien_characters_seeds"]
+    # seed_strings = [f"{i}_seed" for i in range(0, 9, 2)]
+
+    experiment_dirs = ["stability_default_params_tolkien_donation_tolkien_characters"]
+    seed_strings = [f"seed_{i}" for i in range(0, 9, 2)]
+
+    # models = [
+    #     "llama_2_7b",
+    #     "llama_2_13b",
+    #     "llama_2_70b",  # pushed on GPU
+    #     "llama_2_7b_chat",
+    #     "llama_2_13b_chat",
+    #     "llama_2_70b_chat",  # pushed on GPU
+    #     "Mistral-7B-v0.1",
+    #     "Mistral-7B-Instruct-v0.1",
+    #     "Mistral-7B-Instruct-v0.2",
+    #     "zephyr-7b-beta",
+    #     "Mixtral-8x7B-v0.1-4b",  # 6h  # not on GPU still fail??
+    #     "Mixtral-8x7B-Instruct-v0.1-4b",  # 6h # not on GPU stil fail??
+    #     "Mixtral-8x7B-v0.1", # 3 good  #pushed on GPU
+    #     "Mixtral-8x7B-Instruct-v0.1",# pushed on GPU
+    #     "phi-1",
+    #     "phi-2",
+    #     "Qwen-7B",
+    #     "Qwen-14B",
+    #     "Qwen-72B",  # nije uspio nista izgenerirat, ali svi load (znaci zapne na prvoj generaciji?) pushed on GPU
+    #     "gpt-3.5-turbo-1106",
+    #     "gpt-3.5-turbo-0125",
+    # ]
+
+    add_tolkien_ipsative_curve = False
+    bar_plots = True
+    metric = "Rank-Order"
+    human_change_xloc = 6.8
+    msgs_ro_tolk = False
+    rotatation_x_labels = 90
+
+    xticks_fontsize = 15
+    yticks_fontsize = 18
+
+    min_y, max_y = -0.1, 0.8  # RO
+
+elif figure_name.startswith("bag_t"):
+
+    # title = "Stealing stability of fictional characters"
+    # title = "(B)"
+
+    # experiment_dirs = ["sim_conv_tolkien_bag_tolkien_characters_seeds"]
+    # seed_strings = [f"{i}_seed" for i in range(1, 10, 2)]
+
+    experiment_dirs = ["RERUN_sim_conv_tolkien_bag_tolkien_characters_seeds"]
+    seed_strings = [f"{i}_seed" for i in range(0, 9, 2)]
+
+    add_tolkien_ipsative_curve = False
+    bar_plots = True
+    add_legend = True
+    metric = "Rank-Order"
+    human_change_xloc = 6.8
+    msgs_ro_tolk = False
+    rotatation_x_labels = 90
+
+    xticks_fontsize = 15
+    yticks_fontsize = 18
+
+    min_y, max_y = -0.1, 0.8  # RO
+
+elif figure_name == "tolk_ro_msgs":
+    # Messages on Rank-Order Tolkien
+    models = ["1_msgs", "3_msgs", "5_msgs", "7_msgs", "9_msgs"]
+    experiment_dirs = ["sim_conv_pvq_tolkien_characters_msgs/Mixtral-8x7B-Instruct-v0.1"]
+    seed_strings = [f"{i}_seed" for i in range(1, 10, 2)]
+
+    bar_plots = True
+    bars_as_plot = False
+    add_tolkien_ipsative_curve = False
+    msgs_ro_tolk = True
+
+    metric = "Rank-Order"
+    human_change_xloc = 6.8
+    interval_figsize_x = 14
+    interval_figsize_y = 7
+
+    xticks_fontsize = 25
+    yticks_fontsize = 25
+    y_label_fontsize = 35
+    x_label_fontsize = 30
+
+    round_y_lab = 2
+    min_y, max_y = 0.25, 0.5  # RO
+
+elif figure_name == "tolk_ro_msgs_neutral":
+    # Messages on Rank-Order Tolkien
+    models = ["1_msgs", "3_msgs", "5_msgs", "7_msgs", "9_msgs"]
+    experiment_dirs = ["sim_conv_pvq_tolkien_characters_msgs/Mixtral-8x7B-Instruct-v0.1"]
+    seed_strings = [f"{i}_seed" for i in range(1, 10, 2)]
+
+    RO_neutral_dir = "sim_conv_pvq_tolkien_characters_seeds/Mixtral-8x7B-Instruct-v0.1"
+    RO_neutral = True
+
+    bar_plots = True
+
+    bars_as_plot = True
+    add_tolkien_ipsative_curve = False
+    add_tolkien_ro_curve = True
+    msgs_ro_tolk = True
+
+    add_legend = True
+    legend_title=None
+    label_ = "Rank-Order stability\n  (with the neutral order)"
+
+    metric = "Rank-Order"
+    human_change_xloc = 6.8
+    interval_figsize_x = 14
+    interval_figsize_y = 7
+
+    xticks_fontsize = 25
+    yticks_fontsize = 25
+    y_label_fontsize = 35
+    x_label_fontsize = 30
+
+    round_y_lab = 2
+    min_y, max_y = 0.30, 0.58  # RO
+
+elif figure_name == "tolk_ips_msgs":
+    # Messages on Ips Tolkien
+    models = ["1_msgs", "3_msgs", "5_msgs", "7_msgs", "9_msgs"]
+    experiment_dirs = ["sim_conv_pvq_tolkien_characters_msgs/Mixtral-8x7B-Instruct-v0.1"]
+    seed_strings = [f"{i}_seed" for i in range(1, 10, 2)]
+
+    bar_plots = True
+    bars_as_plot = True
+    add_tolkien_ipsative_curve = False
+    msgs_ro_tolk = True
+
+    metric = "Ipsative"
+    human_change_xloc = 6.8
+
+    min_y, max_y = -0.1, 1  # IPS
+
+elif figure_name == "llama_sys_no_sys":
+    families_plot = False
+    # title = "Personal value stability of fictional characters with PVQ"
+
+    experiment_dirs = [
+        # "sim_conv_pvq_tolkien_characters_seeds",
+        # "sim_conv_pvq_tolkien_characters_seeds_NO_SYSTEM",
+        ""
+    ]
+
+    seed_strings = [f"{i}_seed" for i in range(1, 10, 2)]
+    models = [
+        "sim_conv_pvq_tolkien_characters_seeds/llama_2_7b_chat",
+        "sim_conv_pvq_tolkien_characters_seeds/llama_2_13b_chat",
+        "sim_conv_pvq_tolkien_characters_seeds/llama_2_70b_chat",  # 2 gpu
+        "sim_conv_pvq_tolkien_characters_seeds_NO_SYSTEM/llama_2_7b_chat",
+        "sim_conv_pvq_tolkien_characters_seeds_NO_SYSTEM/llama_2_13b_chat",
+        "sim_conv_pvq_tolkien_characters_seeds_NO_SYSTEM/llama_2_70b_chat",  # 2 gpu
+    ]
+    x_label_map = {
+        "sim_conv_pvq_tolkien_characters_seeds/llama_2_7b_chat": "llama_2_7b_chat_sys",
+        "sim_conv_pvq_tolkien_characters_seeds/llama_2_13b_chat": "llama_2_13b_chat_sys",
+        "sim_conv_pvq_tolkien_characters_seeds/llama_2_70b_chat": "llama_2_70b_chat_sys",  # 2 gpu
+        "sim_conv_pvq_tolkien_characters_seeds_NO_SYSTEM/llama_2_7b_chat": "llama_2_7b_chat_no_sys",
+        "sim_conv_pvq_tolkien_characters_seeds_NO_SYSTEM/llama_2_13b_chat": "llama_2_13b_chat_no_sys",
+        "sim_conv_pvq_tolkien_characters_seeds_NO_SYSTEM/llama_2_70b_chat": "llama_2_70b_chat_no_sys",  # 2 gpu
+    }
+
+    add_tolkien_ipsative_curve = False
+    bar_plots = True
+    add_legend = False
+    metric = "Rank-Order"
+    human_change_xloc = -0.5
+    msgs_ro_tolk = False
+    show_human_change = True
+    legend_fontsize = 22
+    rotatation_x_labels = 90
+    show_human_changea = False
+
+    xticks_fontsize = 15
+    yticks_fontsize = 18
+
+    min_y, max_y = -0.1, 0.8  # RO
+
+else:
+    raise ValueError("Unknown figure name")
+    # scratch
+    # results_dir = "results"
+    # experiment_dirs = ["Temp_GS_religion_famous_people_seeds"]
+    # models = ["dummy"]
+    # seed_strings = ["temp_0.4", "temp_0.7", "temp_1.0", "temp_1.5"]
+    #
+    # add_tolkien_ipsative_curve = False
+    # bar_plots = True
+    # add_legend = False
+    #
+    # metric = "Rank-Order"
+    # msgs_ro_tolk = False
+    # show_human_change = False
+    # legend_fontsize = 22
+    #
+    # xticks_fontsize = 15
+    # yticks_fontsize = 18
+    #
+    # min_y, max_y = -0.1, 0.8  # RO
+
+    rotatation_x_labels = 90
+
+    # models = ["dummy"]
+    experiment_dirs = ["stability_default_params_religion_famous_people"]
+    seed_strings = [f"seed_{i}" for i in range(0, 9, 2)]
+
+    add_tolkien_ipsative_curve = False
+    bar_plots = True
+    add_legend = False
+
+    metric = "Rank-Order"
+    msgs_ro_tolk = False
+    show_human_change = False
+    legend_fontsize = 22
+
+    xticks_fontsize = 15
+    yticks_fontsize = 18
+
+    min_y, max_y = -0.1, 0.8  # RO
+
+if y_label is None:
+    y_label = metric + " stability (r)"
+
+if add_tolkien_ipsative_curve:
+    with open("tolkien_ipsative_curve_cache.json", "r") as f:
+        tolkien_ipsative_curve = json.load(f)
+
+if add_tolkien_ro_curve:
+    with open("tolkien_ro_curve_cache.json", "r") as f:
+        tolkien_ro_curve = json.load(f)
+
+
+# confidence = 0.95
+
+n_comp = math.comb(len(models), 2)  # n comparisons
+
+print("N_comp:", n_comp)
+
+confidence = 0.95
+
+
+if args.assert_n_contexts < 0:
+    args.assert_n_contexts = None
+else:
+    cprint(f"Asserting {args.assert_n_contexts} contexts.", "green")
+
+# prefix = "results_pvq_sim_conv_famous_people"
+# prefix = "results_ult_sim_conv_famous_people"
+
+
+data = {}
+for experiment_dir in experiment_dirs:
+    print(f"{experiment_dir}")
+    data[experiment_dir] = {}
+    for model in models:
+        print(f"\t{model}")
+
+
+        data[experiment_dir][model] = {}
+        for seed_str in seed_strings:
+            data[experiment_dir][model][seed_str] = {}
+
+            # data_dir = os.path.join("results", experiment_dir, model, seed_str)
+            data_dir = os.path.join(results_dir, experiment_dir, model, seed_str)
+
+            if paired_dir:
+                paired_data_dir = os.path.join("results", paired_dir, model, seed_str)
+            else:
+                paired_data_dir = None
+
+            if RO_neutral:
+                RO_neutral_data_dir = os.path.join("results", RO_neutral_dir, seed_str)
+            else:
+                RO_neutral_data_dir = None
+
+            if len(glob.glob(data_dir+"/*/*.json")) < 3 and len(glob.glob(data_dir + "/*/*/*.json")) < 3:
+                print(f"No evaluation found at {data_dir}.")
+                # no evaluations
+                eval_data = dict(zip(["Mean-Level", "Rank-Order", "Ipsative"], [np.nan, np.nan, np.nan]))
+
+            else:
+                no_ips = metric != "Ipsative"
+                # compute hash
+                eval_script_path = "./visualization_scripts/data_analysis.py"
+                with open(eval_script_path, 'rb') as file_obj: eval_script = str(file_obj.read())
+                hash = hashlib.sha256("-".join(
+                    [eval_script, inspect.getsource(run_analysis), checksumdir.dirhash(data_dir),
+                     str(args.assert_n_contexts), str(False),
+                     str(default_profile), str(paired_data_dir),
+                     str(RO_neutral), str(RO_neutral_data_dir),
+                     str(no_ips)
+                     ]).encode()).hexdigest()
+                cache_path = f".cache/{hash}.json"
+
+                # check for cache
+                if os.path.isfile(cache_path):
+                    with open(cache_path) as f:
+                        print("\t\tLoading from cache")
+                        eval_data = json.load(f)
+
+                else:
+                    print("\t\tEvaluating")
+                    eval_data = run_analysis(
+                        eval_script_path=eval_script_path, data_dir=data_dir, assert_n_contexts=args.assert_n_contexts,
+                        default_profile=default_profile,
+                        paired_data_dir=paired_data_dir, RO_neutral=RO_neutral, RO_neutral_data_dir=RO_neutral_data_dir,
+                        no_ips=no_ips,
+                    )
+
+                with open(cache_path, 'w') as fp:
+
+                    class NumpyEncoder(json.JSONEncoder):
+                        def default(self, obj):
+                            if isinstance(obj, np.ndarray):
+                                return obj.tolist()
+                            return json.JSONEncoder.default(self, obj)
+
+                    json.dump(eval_data, fp, cls=NumpyEncoder)
+
+            data[experiment_dir][model][seed_str] = eval_data.copy()
+
+            keys_to_print = ["Mean-Level", "Rank-Order", "Ipsative"]
+            metrs_str = {k: np.round(v, 2) for k, v in data[experiment_dir][model][seed_str].items() if k in keys_to_print}
+            print(f"\t\t- {seed_str} : {metrs_str}")
+
+
+human_data_color = "black"
+
+# # ips
+# legend_fontsize = 8
+# human_data_fontsize = 5.5
+
+human_change_10_12 = {
+    "Mean-Level": None,
+    "Rank-Order": 0.569,
+    "Ipsative": 0.66
+}
+
+human_change_20_28 = {
+    "Mean-Level": 0.11,
+    "Rank-Order": 0.657,
+    "Ipsative": 0.59,
+}
+
+human_change_20_24 = {
+    "Mean-Level": 0.19,
+    "Rank-Order": 0.69,
+    "Ipsative": 0.59,
+}
+
+human_change_24_28 = {
+    "Mean-Level": 0.11,
+    "Rank-Order": 0.77,
+    "Ipsative": 0.65,
+}
+
+num_plots = len(experiment_dirs)
+num_cols = min(len(experiment_dirs), 3)  # Adjust this as needed for a better layout
+num_rows = num_plots // num_cols + (num_plots % num_cols > 0)
+
+print(f"Metric: {metric}")
+
+
+fig, axs = plt.subplots(num_rows, num_cols, figsize=(interval_figsize_x * num_cols, interval_figsize_y * num_rows))
+
+if num_cols == 1:
+    axs=[axs]
+else:
+    axs = axs.flatten()
+
+all_ipsative_corrs_str = get_all_ipsative_corrs_str(default_profile)
+all_ro_corrs_str = get_all_ro_corrs_str(RO_neutral, paired_data_dir)
+
+from collections import defaultdict
+family_data = defaultdict(list)
+
+for plt_i, experiment_dir in enumerate(experiment_dirs):
+
+    if show_human_change:
+
+        if default_profile:
+            metric_human = "Ipsative"
+        else:
+            metric_human = metric
+
+        axs[plt_i].axhline(y=human_change_10_12[metric_human], color=human_data_color, linestyle=':', zorder=0)
+        axs[plt_i].text(human_change_xloc, human_change_10_12[metric_human] + 0.01, "Human value stability between ages 10 and 12",
+                        fontsize=human_data_fontsize, color=human_data_color)
+
+        axs[plt_i].axhline(y=human_change_20_28[metric_human], color=human_data_color, linestyle=':', zorder=0)
+        axs[plt_i].text(human_change_xloc, human_change_20_28[metric_human] + 0.01, "Human value stability between ages 20 and 28",
+                        fontsize=human_data_fontsize, color=human_data_color)
+
+
+    if bar_plots:
+
+        plt.subplots_adjust(left=left_adjust, top=0.90, bottom=0.5, hspace=0.8)
+        xs = models
+        xs = [x_label_map.get(x, x) for x in xs]
+
+        if figure_name.startswith("paired_tolk_ro"):
+            scores = np.array([[data[experiment_dir][model][seed_str]['Proxy_stability'][value_to_pair] for seed_str in seed_strings] for model in models])
+
+        elif RO_neutral:
+            assert metric == "Rank-Order"
+            scores = np.array([[data[experiment_dir][model][seed_str]["Neutral_Rank-Order"] for seed_str in seed_strings] for model in models])
+        else:
+            scores = np.array([[data[experiment_dir][model][seed_str][metric] for seed_str in seed_strings] for model in models])
+
+        if figure_name.startswith("paired_tolk_ro"):
+            # reorganize this
+            all_scores = [
+                np.array(list(itertools.chain(*[
+                    (itertools.chain(*data[experiment_dir][model][seed_str][all_ro_corrs_str][value_to_pair].values()) if all_ro_corrs_str in data[experiment_dir][model][seed_str] else [np.nan])
+                    for seed_str in seed_strings
+                ]))) for model in models
+            ]
+
+        else:
+            # reorganize this
+            all_scores = [
+                np.array(list(itertools.chain(*[
+                    (itertools.chain(*data[experiment_dir][model][seed_str][all_ro_corrs_str].values()) if all_ro_corrs_str in data[experiment_dir][model][seed_str] else [np.nan])
+                    for seed_str in seed_strings
+                ]))) for model in models
+            ]
+
+        assert len(models) == len(scores)
+
+        for model, m_scores in zip(models, scores):
+            family_data[model_2_family(model)].append(m_scores)
+
+        ys = scores.mean(axis=1)
+
+        # get the right side of the CI
+        if "sim_conv_pvq_permutations_msgs" in experiment_dir:
+            assert metric == "Ipsative"
+            assert len(seed_strings) == 1  # you should use plots, not bars
+            # [n_models, context_pairs, pop_size]
+            all_corrs = np.array([data[experiment_dir][model][seed_strings[0]][all_ipsative_corrs_str] for model in models])
+            all_corrs = all_corrs.mean(1)  # mean over pairs
+
+            c2 = np.array([st.t.interval(confidence, len(a) - 1, loc=np.mean(a), scale=st.sem(a))[1] for a in all_corrs])
+            tick_len_ci = c2 - ys  # half the conf interval
+
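+            # standard error over the simulated personas; used for error bars / shading below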
+            tick_len_se = np.array(list(st.sem(a) for a in all_corrs))
+
+            scores = all_corrs
+
+        elif metric == "Ipsative":
+            n_msgs = models
+            # [n_msgs, n_seeds, n_pairs, n_personas]
+            all_corrs = np.array([[
+                data[experiment_dir][n_msg][seed_str][all_ipsative_corrs_str] for seed_str in seed_strings
+            ] for n_msg in n_msgs])
+
+            all_corrs = all_corrs.mean(2)  # mean over context pairs
+
+            # pool seeds and personas: the interval / SE below is computed over both
+            all_corrs = all_corrs.reshape(len(n_msgs), -1)
+
+            c2 = np.array([st.t.interval(confidence, len(a) - 1, loc=np.mean(a), scale=st.sem(a))[1] for a in all_corrs])
+            tick_len_ci = c2 - ys  # half the conf interval
+            tick_len_ci = None  # discard the CI computed above
+
+            tick_len_se = np.array([st.sem(a) for a in all_corrs])
+
+            scores = all_corrs
+
+        else:
+            assert metric == "Rank-Order"
+
+            # 5 seeds = 5 samples
+            c2 = np.array([st.t.interval(confidence, len(a) - 1, loc=np.mean(a), scale=st.sem(a))[1] for a in scores])
+
+            # 5 seeds x 10 values x (5 ch 2) = 500 samples
+            # assert np.allclose(np.array(all_scores).mean(1), ys)
+            # c2 = np.array([st.t.interval(confidence, len(a) - 1, loc=np.mean(a), scale=st.sem(a))[1] for a in all_scores])
+            tick_len_ci = c2 - ys  # half the conf interval
+
+            print("Error bars")
+            tick_len_se = np.array([st.sem(a) for a in scores])
+
+            assert all((tick_len_se >= 0) | np.isnan(tick_len_se))
+
+        if np.isnan(ys).all():
+            raise ValueError("All models are nan.")
+
+        print("Results")
+        for x, y, t in zip(xs, ys, tick_len_se):
+            print(f"{x}: {y:.5f} +/- {t:.3f}")
+
+        if bars_as_plot:
+            # used for msgs
+            axs[plt_i].plot(xs, ys, label=label_)
+            axs[plt_i].fill_between(xs, ys - tick_len_se, ys + tick_len_se, alpha=0.3)
+
+            if metric == "Ipsative" and figure_name == "tolk_ips_msgs":
+                tolkien_ipsative_curve = {
+                    "xs": list(xs),
+                    "ys": list(ys),
+                    "tick_len": list(tick_len_se),
+                }
+
+                cprint("SAVING Ipsative Tolkien Mixtral-Instruct stability to CACHE", "red")
+                with open("tolkien_ipsative_curve_cache.json", "w") as f:
+                    json.dump(tolkien_ipsative_curve, f)
+
+            # load ro
+            if add_tolkien_ro_curve:
+                cprint("Loading Rank-order Tolkien Mixtral-Instruct stability from CACHE", "red")
+
+                xs = np.array(tolkien_ro_curve["xs"])
+                xs = [x_label_map.get(x, x) for x in xs]
+                ys = np.array(tolkien_ro_curve["ys"])
+                shade_len = np.array(tolkien_ro_curve["tick_len"])
+
+                lab_ = "Rank-Order stability\n  (between contexts)"
+                col_ = "black"
+                axs[plt_i].plot(xs, ys, label=lab_, color=col_)
+                axs[plt_i].fill_between(xs, ys - shade_len, ys + shade_len, alpha=0.3, color=col_)
+
+            if add_tolkien_ipsative_curve:
+                cprint("Loading Ipsative Tolkien Mixtral-Instruct stability from CACHE", "red")
+
+                xs = np.array(tolkien_ipsative_curve["xs"])
+                xs = [x_label_map.get(x, x) for x in xs]
+                ys = np.array(tolkien_ipsative_curve["ys"])
+                shade_len = np.array(tolkien_ipsative_curve["tick_len"])
+
+                lab_ = "Ipsative stability (between contexts)"
+                col_ = "brown"
+                axs[plt_i].plot(xs, ys, label=lab_, color=col_, zorder=0)
+                axs[plt_i].fill_between(xs, ys - shade_len, ys + shade_len, alpha=0.3, color=col_, zorder=0)
+
+        else:
+            if msgs_ro_tolk:
+                axs[plt_i].bar(xs, ys, yerr=tick_len_se)
+
+                if metric == "Rank-Order" and figure_name == "tolk_ro_msgs":
+                    tolkien_ro_curve = {
+                        "xs": list(xs),
+                        "ys": list(ys),
+                        "tick_len": list(tick_len_se),
+                    }
+
+                    cprint("SAVING Rank-order Tolkien Mixtral-Instruct stability to CACHE", "red")
+                    with open("tolkien_ro_curve_cache.json", "w") as f:
+                        json.dump(tolkien_ro_curve, f)
+
+            else:
+                cs = [family_2_color.get(model_2_family(x), "black") for x in xs]
+                labs = [model_2_family(x) for x in xs]
+                axs[plt_i].bar(xs, ys, yerr=tick_len_se, color=cs, label=labs)
+                if ci_ticks:
+                    # axs[plt_i].bar(xs, ys, yerr=tick_len_ci, color=cs, label=labs)
+                    axs[plt_i].scatter(xs, ys+tick_len_ci, marker="x", color="black", s=20, lw=0.8)
+                    axs[plt_i].scatter(xs, ys-tick_len_ci, marker="x", color="black", s=20, lw=0.8)
+
+                assert len(experiment_dirs) == 1
+
+        axs[plt_i].set_ylim(min_y, max_y)
+        axs[plt_i].set_xticklabels([x_label_map.get(m, m) for m in models], rotation=rotatation_x_labels, fontsize=xticks_fontsize)
+        axs[plt_i].set_yticklabels(map(lambda x: np.round(x, round_y_lab), axs[plt_i].get_yticks()), fontsize=yticks_fontsize)
+
+        axs[plt_i].set_ylabel(y_label, fontsize=y_label_fontsize)
+
+        if title:
+            axs[plt_i].set_title(title, fontsize=title_fontsize)
+
+        if msgs_ro_tolk:
+            axs[plt_i].set_ylim(min_y, max_y)
+            axs[plt_i].set_xlabel("Simulated conversation length (n)", fontsize=x_label_fontsize)
+
+        if add_legend:
+            legend_without_duplicate_labels(axs[plt_i], loc="best", title=legend_title, legend_loc=legend_loc)
+
+    else:
+
+        if add_tolkien_ipsative_curve:
+            cprint("Loading Ipsative Rank-order Tolkien Mixtral-Instruct stability from CACHE", "red")
+
+            assert metric == "Ipsative"
+            assert not bars_as_plot
+            xs = np.array(tolkien_ipsative_curve["xs"])
+            xs = [x_label_map.get(x,x) for x in xs]
+            ys = np.array(tolkien_ipsative_curve["ys"])
+            shade_len = np.array(tolkien_ipsative_curve["tick_len"])
+
+            m_ = "Mixtral-8x7B-Instruct-v0.1"
+            lab_ = x_label_map.get(m_, m_)+" (fict. char.)"
+            axs[plt_i].plot(xs, ys, label=lab_, linestyle="-.", color="black")
+            axs[plt_i].fill_between(xs, ys - shade_len, ys + shade_len, alpha=0.3, color="black")
+
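+        # line plots: stability as a function of simulated conversation length, one curve per model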
+        for model in models:
+            xs = seed_strings
+            ys = [data[experiment_dir][model][msg][metric] for msg in seed_strings]
+
+            # [n_msgs, context pairs, n_pop]
+            all_corrs = np.array([data[experiment_dir][model][msg][all_ipsative_corrs_str] for msg in seed_strings])
+            all_corrs = np.mean(all_corrs, axis=1)  # average over context pairs
+            c2 = np.array([
+                st.t.interval(confidence, len(msg_corrs) - 1, loc=np.mean(msg_corrs), scale=st.sem(msg_corrs))[1] for msg_corrs in all_corrs
+            ])
+            shade_len = c2 - ys  # half the conf interval
+
+            print(f"{model}: {ys}")
+
+            family = model_2_family(model)
+            # linestyle = family_2_linestyle[family]
+            linestyle = "-"
+
+            xs = [x.replace("/_seed", "") for x in xs]
+            xs = [x_label_map.get(x, x) for x in xs]
+
+            # ugly patch to make colors nicer
+            c = "black" if "gpt" in model else None
+
+            line, = axs[plt_i].plot(xs, ys, label=x_label_map.get(model, model), linestyle=linestyle, color=c)
+            line_color = line.get_color()
+            axs[plt_i].fill_between(xs, ys - shade_len, ys + shade_len, alpha=0.3, color=line_color)
+
+        max_y = 0.8 if metric == "Rank-Order" else 1.0
+        axs[plt_i].set_ylim(-0.1, max_y)
+        axs[plt_i].set_xlim(0, len(seed_strings) - 1)
+        axs[plt_i].set_ylabel(y_label, fontsize=y_label_fontsize)
+        axs[plt_i].set_xlabel("Simulated conversation length (n)", fontsize=x_label_fontsize)
+        # axs[plt_i].set_title(experiment_dir.replace("sim_conv_", "").replace("_seeds", ""))
+        if add_legend:
+            axs[plt_i].legend(bbox_to_anchor=(1.04, 1), loc="best", fontsize=legend_fontsize)
+
+        axs[plt_i].set_xticklabels(axs[plt_i].get_xticks(), rotation=rotatation_x_labels, fontsize=xticks_fontsize)
+        axs[plt_i].set_yticklabels(map(lambda x: np.round(x, round_y_lab), axs[plt_i].get_yticks()), fontsize=yticks_fontsize)
+
+        plt.subplots_adjust(left=0.1, top=0.95, bottom=0.2, hspace=0.8)
+
+# Hide any unused subplots
+for j in range(plt_i + 1, num_rows * num_cols):
+    axs[j].axis('off')
+
+plt.tight_layout()
+
+# fig_path = f'visualizations/{figure_name}.svg'
+fig_path = f'visualizations/{figure_name}.pdf'
+print(f"save to: {fig_path}")
+plt.savefig(fig_path)
+
+if not args.no_show:
+    plt.show()  # show the figure
+plt.close()
+# plt.draw()
+
+
+if FDR_test:
+    # FDR: false-discovery-rate corrected p-values for all pairwise model comparisons
+    p_values_corrected_matrix = FDR(scores)
+
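+    # binary_matrix[i, j] == 1 where the FDR-corrected p-value is below 0.05 (significant difference)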
+    binary_matrix = (p_values_corrected_matrix < 0.05).astype(int)
+    models_labels = [x_label_map.get(m, m) for m in models]
+    plot_comparison_matrix(models_labels, binary_matrix, figure_name, title=title)
+
+if families_plot:
+    # fam_conf = 0.05
+
+    n_comp = math.comb(len(family_data), 2)
+
+    # Sidak correction (alternative): fam_confidence = (1 - 0.05) ** (1 / n_comp)
+    # Bonferroni correction over the n_comp pairwise family comparisons
+    fam_confidence = 1 - 0.05 / n_comp
+
+    families = list(family_data.keys())
+    family_scores = np.array([np.array(family_data[f]).mean(axis=0) for f in families])  # average models within each family -> [n_families, n_seeds]
+    family_means = family_scores.mean(axis=1)
+    family_CIs = np.array([st.t.interval(fam_confidence, len(a) - 1, loc=np.mean(a), scale=st.sem(a))[1] for a in family_scores])
+    family_tick_len_ci = family_CIs - family_means  # half the conf interval
+    colors = [family_2_color.get(f, "black") for f in families]
+    family_tick_len_se = np.array([st.sem(a) for a in family_scores])
+
+    plt.gca().set_title(title)
+    plt.bar(families, family_means, yerr=family_tick_len_se, color=colors, label=families)
+
+    if ci_ticks:
+        plt.scatter(families, family_means + family_tick_len_ci, marker="x", color="black", s=20, lw=0.8)
+        plt.scatter(families, family_means - family_tick_len_ci, marker="x", color="black", s=20, lw=0.8)
+
+    plt.gca().set_xticklabels(families, rotation=rotatation_x_labels, fontsize=xticks_fontsize)
+    plt.gca().set_yticklabels(map(lambda x: np.round(x, round_y_lab), plt.gca().get_yticks()), fontsize=yticks_fontsize)
+    plt.gca().set_ylabel(y_label, fontsize=y_label_fontsize)
+
+    plt.tight_layout()
+
+    plt.gca().set_ylim(fam_min_y, fam_max_y)
+
+    fig_path = f'visualizations/{figure_name}_families.svg'
+    print(f"save to: {fig_path}")
+    plt.savefig(fig_path)
+
+    if not args.no_show:
+        plt.show()  # show the figure
+
+    plt.close()
+
diff --git a/campaign_evaluations.py b/campaign_evaluations.py
index 1d2c7ed..3ab7907 100644
--- a/campaign_evaluations.py
+++ b/campaign_evaluations.py
@@ -22,7 +22,8 @@ import argparse
 
 parser = argparse.ArgumentParser()
 parser.add_argument("--no-show", action="store_true")
-parser.add_argument("--fig-name", type=str, default="tolk_ro_t")
+parser.add_argument("--fig-name", type=str, default="test")
+parser.add_argument("--assert-n-contexts", type=int, default=5, help="Set to <0 for no asserts")
 args = parser.parse_args()
 
 
@@ -117,16 +118,28 @@ def plot_comparison_matrix(models, p_values_matrix, figure_name, title="Model Co
     plt.close()
 
 
-def legend_without_duplicate_labels(ax, loc="best", title=None, legend_loc=None):
+def legend_without_duplicate_labels(ax, loc="best", title=None, legend_loc=None, legend_path=None):
     handles, labels = ax.get_legend_handles_labels()
     unique = [(h, l) for i, (h, l) in enumerate(zip(handles, labels)) if l not in labels[:i]]
+
     # axs[plt_i].legend(bbox_to_anchor=legend_loc, loc="best")
     if legend_loc:
-        loc="upper left"
+        loc = "upper left"
+    if not legend_path:
+        ax.legend(*zip(*unique), loc=loc, title=title, fontsize=legend_fontsize, title_fontsize=legend_fontsize, bbox_to_anchor=legend_loc)
     else:
-        loc="best"
+        # figsize = (6, 1)  # horizontal legend (ncols = 6)
+        figsize = (1, 6)  # vertical legend (ncols = 1)
+
+        # save legend separately
+        fig_leg = plt.figure(figsize=figsize)
+        ax_leg = fig_leg.add_subplot(111)
+        ax_leg.legend(*zip(*unique), loc=loc, title=title, fontsize=legend_fontsize, title_fontsize=legend_fontsize, bbox_to_anchor=legend_loc, ncols=figsize[0])
+        ax_leg.axis('off')  # Hide the axes
+        # Save the figure containing only the legend
+        fig_leg.savefig(legend_path, bbox_inches='tight')
+        print(f"Legend saved to: {legend_path}")
 
-    ax.legend(*zip(*unique), loc=loc, title=title, fontsize=legend_fontsize, title_fontsize=legend_fontsize, bbox_to_anchor=legend_loc)
 
 def get_all_ipsative_corrs_str(default_profile):
 
@@ -190,6 +203,25 @@ models = [
     # "dummy"
 ]
 
+models_ft = [
+    "Mistral-7B-v0.1",
+    "Mistral-7B-Instruct-v0.1",
+    "Mistral-7B-Instruct-v0.2",
+    # "Mistral-7B-v0.1_ft_roleplay_filtered_chars_lora_batch_size_16_rank_256",  # lora is just a new train -> unk_token fixed
+    # "Mistral-7B-v0.1_ft_roleplay_filtered_chars_batch_size_16_rank_256",
+    # "Mistral-7B-v0.1_ft_NO_INSTR_TEMPL_roleplay_filtered_chars_batch_size_16_rank_256",
+    # "Mistral-7B-v0.1_ft_NO_INSTR_TEMPL_LOAD_INSTRUCT_roleplay_filtered_chars_batch_size_16_rank_256",
+    # "Mistral-7B-v0.1_ft_roleplay_filtered_chars_lora_target_all_lin_and_train_ml_headbatch_size_16_rank_256",
+    # "Mistral-7B-v0.1_ft_roleplay_filtered_chars_no_peft_batch_size_16_rank_256",
+    # # new params
+    # "Mistral-7B-v0.1_ft_roleplay_filtered_chars_lora_target_all_lin_and_train_ml_head_batch_size_8_rank_64_lr_0.0002_train_on_all",
+    # "Mistral-7B-v0.1_ft_roleplay_filtered_chars_no_peft_batch_size_8_rank_64_lr_2e-05_train_on_all",
+    # "Mistral-7B-v0.1_ft_roleplay_batch_size_16_rank_256",
+    # "Mistral-7B-v0.1_ft_roleplay_batch_size_16_rank_256",
+    # "Mistral-7B-v0.1_ft_NO_INSTR_TEMPL_LOAD_INSTRUCT_roleplay_filtered_chars_batch_size_16_rank_256",
+    # "Mistral-7B-v0.1_ft_NO_INSTR_TEMPL_roleplay_filtered_chars_batch_size_16_rank_256",
+]
+
 plot_models = [
     "Mixtral-8x7B-Instruct-v0.1",
     "Mixtral-8x7B-Instruct-v0.1-4b",  # 6h
@@ -208,13 +240,13 @@ plot_models = [
 
 x_label_map = {
     "dummy": "random",
-    "Mixtral-8x7B-v0.1-4b": "Mixtral-Base-4b",
-    "Mixtral-8x7B-Instruct-v0.1-4b": "Mixtral-Instruct-4b",
-    "Mixtral-8x7B-v0.1": "Mixtral-Base",
-    "Mixtral-8x7B-Instruct-v0.1": "Mixtral-Instruct",
-    "Mistral-7B-v0.1": "Mistral-Base",
-    "Mistral-7B-Instruct-v0.1": "Mistral-Instruct-v0.1",
-    "Mistral-7B-Instruct-v0.2": "Mistral-Instruct-v0.2",
+    # "Mixtral-8x7B-v0.1-4b": "Mixtral-Base-4b",
+    # "Mixtral-8x7B-Instruct-v0.1-4b": "Mixtral-Instruct-4b",
+    # "Mixtral-8x7B-v0.1": "Mixtral-Base",
+    # "Mixtral-8x7B-Instruct-v0.1": "Mixtral-Instruct",
+    # "Mistral-7B-v0.1": "Mistral-Base",
+    # "Mistral-7B-Instruct-v0.1": "Mistral-Instruct-v0.1",
+    # "Mistral-7B-Instruct-v0.2": "Mistral-Instruct-v0.2",
     "llama_2_7b":  "LLaMa_2_7b",
     "llama_2_13b": "LLaMa_2_13b",
     "llama_2_70b": "LLaMa_2_70b",
@@ -230,7 +262,7 @@ x_label_map = {
 x_label_map = {**x_label_map, **{k: k.replace("_msgs", "") for k in ["1_msgs", "3_msgs", "5_msgs", "7_msgs", "9_msgs"]}}
 
 x_label_map = {**x_label_map, **{
-    m:m.replace("_batch_size_16_rank_256", "").replace("Mistral-7B-v0.1","Mistral-base").replace("_filtered_chars","") for m in models_ft}
+    m:m.replace("_batch_size_16_rank_256", "").replace("_filtered_chars", "") for m in models_ft}
 }
 
 x_label_map = {**x_label_map, **{
@@ -246,6 +278,7 @@ add_legend = False
 bars_as_plot = False
 label_ = None
 
+results_dir = "results"
 experiment_dirs = [
     "sim_conv_pvq_permutations_msgs",
     # "sim_conv_pvq_tolkien_characters_seeds",
@@ -307,6 +340,7 @@ round_y_lab = 1
 
 show_human_change = False
 legend_loc = None
+save_legend_separately = False
 
 legend_title = "LLM families"
 
@@ -330,7 +364,6 @@ FDR_test = True
 families_plot = False
 fam_min_y, fam_max_y = -0.1, 0.8
 
-
 if figure_name == "no_pop_msgs":
     experiment_dirs = ["sim_conv_pvq_permutations_msgs"]
     seed_strings = [f"{i}_msgs/_seed" for i in range(1, 10, 2)]  # msgs (show trends
@@ -343,6 +376,7 @@ if figure_name == "no_pop_msgs":
     metric = "Ipsative"
     human_change_xloc = -1.0
     msgs_ro_tolk = False
+    add_legend = True
 
     min_y, max_y = -0.1, 1.0  # IPS
     legend_fontsize = 22
@@ -428,16 +462,20 @@ elif figure_name.startswith("tolk_ro_t"):
     # title = "Personal value stability of fictional characters with PVQ"
     # title = "(A)"
 
-    experiment_dirs = ["sim_conv_pvq_tolkien_characters_seeds"]
-    seed_strings = [f"{i}_seed" for i in range(1, 10, 2)]
+    # experiment_dirs = ["sim_conv_pvq_tolkien_characters_seeds"]
+    # seed_strings = [f"{i}_seed" for i in range(1, 10, 2)]
 
     experiment_dirs = ["RERUN_sim_conv_pvq_tolkien_characters_seeds"]
     seed_strings = [f"{i}_seed" for i in range(0, 9, 2)]
 
     add_tolkien_ipsative_curve = False
     bar_plots = True
-    add_legend = True
+
+    add_legend = False
+    save_legend_separately = False
+
     legend_loc = (0.001, 0.99)
+
     metric = "Rank-Order"
     human_change_xloc = 6.8
     msgs_ro_tolk = False
@@ -466,6 +504,9 @@ elif figure_name.startswith("religion_t"):
     experiment_dirs = ["RERUN_sim_conv_religion_famous_people_seeds"]
     seed_strings = [f"{i}_seed" for i in range(0, 9, 2)]
 
+    results_dir = "results"
+    experiment_dirs = ["RERUN_sim_conv_religion_famous_people_seeds"]
+
     add_tolkien_ipsative_curve = False
     bar_plots = True
     add_legend = False
@@ -502,20 +543,17 @@ elif figure_name.startswith("paired_tolk_ro"):
     # y_label = f"Rank-Order stability\n{value_to_pair}-Donation"
     y_label = f"Rank-Order stability\nwith donation"
 
-    experiment_dirs = ["sim_conv_pvq_tolkien_characters_seeds"]
-    paired_dir = "sim_conv_tolkien_donation_tolkien_characters_seeds"
+    # experiment_dirs = ["sim_conv_pvq_tolkien_characters_seeds"]
+    # paired_dir = "sim_conv_tolkien_donation_tolkien_characters_seeds"
+    # seed_strings = [f"{i}_seed" for i in range(1, 10, 2)]
 
-    seed_strings = [f"{i}_seed" for i in range(1, 10, 2)]
+    experiment_dirs = ["RERUN_sim_conv_pvq_tolkien_characters_seeds"]
+    paired_dir = "RERUN_sim_conv_tolkien_donation_tolkien_characters_seeds"
+    seed_strings = [f"{i}_seed" for i in range(0, 9, 2)]
 
     add_tolkien_ipsative_curve = False
     bar_plots = True
 
-    if value_to_pair == "Universalism":
-        add_legend = True
-        legend_fontsize = 20
-    else:
-        add_legend = False
-
     metric = "Rank-Order"
     msgs_ro_tolk = False
     show_human_change = False
@@ -575,10 +613,6 @@ elif figure_name.startswith("don_t"):
     experiment_dirs = ["RERUN_sim_conv_tolkien_donation_tolkien_characters_seeds"]
     seed_strings = [f"{i}_seed" for i in range(0, 9, 2)]
 
-    add_legend = True
-    legend_loc = (0.01, 0.99)
-
-
     add_tolkien_ipsative_curve = False
     bar_plots = True
     metric = "Rank-Order"
@@ -608,6 +642,7 @@ elif figure_name.startswith("bag_t"):
     add_tolkien_ipsative_curve = False
     bar_plots = True
     add_legend = False
+    save_legend_separately = False
     metric = "Rank-Order"
     human_change_xloc = 6.8
     msgs_ro_tolk = False
@@ -659,7 +694,7 @@ elif figure_name == "tolk_ro_msgs_neutral":
     msgs_ro_tolk = True
 
     add_legend = True
-    legend_title=None
+    legend_title = None
     label_ = "Rank-Order stability\n  (with the neutral order)"
 
     metric = "Rank-Order"
@@ -734,8 +769,39 @@ elif figure_name == "llama_sys_no_sys":
     yticks_fontsize = 18
 
     min_y, max_y = -0.1, 0.8  # RO
+
 else:
-    raise ValueError(f"Unknown figure name {figure_name}.")
+
+    rotatation_x_labels = 90
+
+    # results_dir = "test"
+    # experiment_dirs = ["refactor_RERUN_sim_conv_religion_famous_people_seeds"]
+    # experiment_dirs = ["refactor2_RERUN_sim_conv_religion_famous_people_seeds"]
+
+    # results_dir = "results"
+    # experiment_dirs = ["RERUN_sim_conv_religion_famous_people_seeds"]
+
+    # seed_strings = [f"{i}_seed" for i in range(0, 9, 2)]
+    # seed_strings = [f"0_seed"]
+
+    results_dir = "results"
+    experiment_dirs = ["Temp_GS_religion_famous_people_seeds"]
+    models = ["dummy"]
+    seed_strings = ["temp_0.4", "temp_0.7", "temp_1.0", "temp_1.3"]
+
+    add_tolkien_ipsative_curve = False
+    bar_plots = True
+    add_legend = False
+
+    metric = "Rank-Order"
+    msgs_ro_tolk = False
+    show_human_change = False
+    legend_fontsize = 22
+
+    xticks_fontsize = 15
+    yticks_fontsize = 18
+
+    min_y, max_y = -0.1, 0.8  # RO
 
 if y_label is None:
     y_label = metric + " stability (r)"
@@ -758,15 +824,10 @@ print("N_comp:", n_comp)
 confidence = 0.95
 
 
-# assert_n_contexts = None
-assert_n_contexts = 5
-
-
-# assert_n_contexts = 6
-# assert_n_contexts = 4
-
-if assert_n_contexts:
-    cprint(f"Asserting {assert_n_contexts} contexts.", "green")
+if args.assert_n_contexts < 0:
+    args.assert_n_contexts = None
+else:
+    cprint(f"Asserting {args.assert_n_contexts} contexts.", "green")
 
 # prefix = "results_pvq_sim_conv_famous_people"
 # prefix = "results_ult_sim_conv_famous_people"
@@ -784,7 +845,9 @@ for experiment_dir in experiment_dirs:
         for seed_str in seed_strings:
             data[experiment_dir][model][seed_str] = {}
 
-            data_dir = os.path.join("results", experiment_dir, model, seed_str)
+            # data_dir = os.path.join("results", experiment_dir, model, seed_str)
+            data_dir = os.path.join(results_dir, experiment_dir, model, seed_str)
+
             if paired_dir:
                 paired_data_dir = os.path.join("results", paired_dir, model, seed_str)
             else:
@@ -807,7 +870,7 @@ for experiment_dir in experiment_dirs:
                 with open(eval_script_path, 'rb') as file_obj: eval_script = str(file_obj.read())
                 hash = hashlib.sha256("-".join(
                     [eval_script, inspect.getsource(run_analysis), checksumdir.dirhash(data_dir),
-                     str(assert_n_contexts), str(False),
+                     str(args.assert_n_contexts), str(False),
                      str(default_profile), str(paired_data_dir),
                      str(RO_neutral), str(RO_neutral_data_dir),
                      str(no_ips)
@@ -823,7 +886,7 @@ for experiment_dir in experiment_dirs:
                 else:
                     print("\t\tEvaluating")
                     eval_data = run_analysis(
-                        eval_script_path=eval_script_path, data_dir=data_dir, assert_n_contexts=assert_n_contexts,
+                        eval_script_path=eval_script_path, data_dir=data_dir, assert_n_contexts=args.assert_n_contexts,
                         default_profile=default_profile,
                         paired_data_dir=paired_data_dir, RO_neutral=RO_neutral, RO_neutral_data_dir=RO_neutral_data_dir,
                         no_ips=no_ips,
@@ -1082,10 +1145,6 @@ for plt_i, experiment_dir in enumerate(experiment_dirs):
                     axs[plt_i].scatter(xs, ys-tick_len_ci, marker="x", color="black", s=20, lw=0.8)
 
                 assert len(experiment_dirs) == 1
-                with open(f"tables/{figure_name}.txt", "w") as f:
-                    f.write(f"Model & Mean & SE & CI \\\\\n")
-                    for x_, y_, er_, ci_ in zip(xs, ys, tick_len_se, tick_len_ci):
-                        f.write(f"{x_} & {y_:.2f} & {er_:.2f} & {y_-ci_:.2f} - {y_+ci_:.2f} \\\\\n")
 
         axs[plt_i].set_ylim(min_y, max_y)
         axs[plt_i].set_xticklabels([x_label_map.get(m, m) for m in models], rotation=rotatation_x_labels, fontsize=xticks_fontsize)
@@ -1100,8 +1159,17 @@ for plt_i, experiment_dir in enumerate(experiment_dirs):
             axs[plt_i].set_ylim(min_y, max_y)
             axs[plt_i].set_xlabel("Simulated conversation length (n)", fontsize=x_label_fontsize)
 
-        if add_legend:
-            legend_without_duplicate_labels(axs[plt_i], loc="best", title=legend_title, legend_loc=legend_loc)
+        if add_legend or save_legend_separately:
+
+            if save_legend_separately:
+                legend_path = f'visualizations/families_legend.pdf'
+            else:
+                legend_path = None
+
+            legend_without_duplicate_labels(axs[plt_i], loc="best", title=legend_title, legend_loc=legend_loc, legend_path=legend_path)
+
+            if save_legend_separately:
+                exit()
 
     else:
 
@@ -1154,7 +1222,8 @@ for plt_i, experiment_dir in enumerate(experiment_dirs):
         axs[plt_i].set_ylabel(y_label, fontsize=y_label_fontsize)
         axs[plt_i].set_xlabel("Simulated conversation length (n)", fontsize=x_label_fontsize)
         # axs[plt_i].set_title(experiment_dir.replace("sim_conv_", "").replace("_seeds", ""))
-        axs[plt_i].legend(bbox_to_anchor=(1.04, 1), loc="best", fontsize=legend_fontsize)
+        if add_legend:
+            axs[plt_i].legend(bbox_to_anchor=(1.04, 1), loc="best", fontsize=legend_fontsize)
 
         axs[plt_i].set_xticklabels(axs[plt_i].get_xticks(), rotation=rotatation_x_labels, fontsize=xticks_fontsize)
         axs[plt_i].set_yticklabels(map(lambda  x: np.round(x, round_y_lab), axs[plt_i].get_yticks()), fontsize=yticks_fontsize)
@@ -1167,7 +1236,6 @@ for j in range(plt_i + 1, num_rows * num_cols):
 
 plt.tight_layout()
 
-# fig_path = f'visualizations/{figure_name}.svg'
 fig_path = f'visualizations/{figure_name}.pdf'
 print(f"save to: {fig_path}")
 plt.savefig(fig_path)
@@ -1230,7 +1298,3 @@ if families_plot:
 
     plt.close()
 
-    with open(f"tables/{figure_name}_families.txt", "w") as f:
-        f.write(f"Model & Mean & SE & CI \\\\\n")
-        for x_, y_, er_, ci_ in zip(families, family_means, family_tick_len_se, family_tick_len_ci):
-            f.write(f"{x_} & {y_:.2f} & {er_:.2f} & {y_ - ci_:.2f} - {y_ + ci_:.2f} \\\\\n")
diff --git a/categories.py b/categories.py
deleted file mode 100644
index 08c5aac..0000000
--- a/categories.py
+++ /dev/null
@@ -1,66 +0,0 @@
-subcategories = {
-    "abstract_algebra": ["math"],
-    "anatomy": ["health"],
-    "astronomy": ["physics"],
-    "business_ethics": ["business"],
-    "clinical_knowledge": ["health"],
-    "college_biology": ["biology"],
-    "college_chemistry": ["chemistry"],
-    "college_computer_science": ["computer science"],
-    "college_mathematics": ["math"],
-    "college_medicine": ["health"],
-    "college_physics": ["physics"],
-    "computer_security": ["computer science"],
-    "conceptual_physics": ["physics"],
-    "econometrics": ["economics"],
-    "electrical_engineering": ["engineering"],
-    "elementary_mathematics": ["math"],
-    "formal_logic": ["philosophy"],
-    "global_facts": ["other"],
-    "high_school_biology": ["biology"],
-    "high_school_chemistry": ["chemistry"],
-    "high_school_computer_science": ["computer science"],
-    "high_school_european_history": ["history"],
-    "high_school_geography": ["geography"],
-    "high_school_government_and_politics": ["politics"],
-    "high_school_macroeconomics": ["economics"],
-    "high_school_mathematics": ["math"],
-    "high_school_microeconomics": ["economics"],
-    "high_school_physics": ["physics"],
-    "high_school_psychology": ["psychology"],
-    "high_school_statistics": ["math"],
-    "high_school_us_history": ["history"],
-    "high_school_world_history": ["history"],
-    "human_aging": ["health"],
-    "human_sexuality": ["culture"],
-    "international_law": ["law"],
-    "jurisprudence": ["law"],
-    "logical_fallacies": ["philosophy"],
-    "machine_learning": ["computer science"],
-    "management": ["business"],
-    "marketing": ["business"],
-    "medical_genetics": ["health"],
-    "miscellaneous": ["other"],
-    "moral_disputes": ["philosophy"],
-    "moral_scenarios": ["philosophy"],
-    "nutrition": ["health"],
-    "philosophy": ["philosophy"],
-    "prehistory": ["history"],
-    "professional_accounting": ["other"],
-    "professional_law": ["law"],
-    "professional_medicine": ["health"],
-    "professional_psychology": ["psychology"],
-    "public_relations": ["politics"],
-    "security_studies": ["politics"],
-    "sociology": ["culture"],
-    "us_foreign_policy": ["politics"],
-    "virology": ["health"],
-    "world_religions": ["philosophy"],
-}
-
-categories = {
-    "STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering"],
-    "humanities": ["history", "philosophy", "law"],
-    "social sciences": ["politics", "culture", "economics", "geography", "psychology"],
-    "other (business, health, misc.)": ["other", "business", "health"],
-}
diff --git a/compile_evaluations.py b/compile_evaluations.py
deleted file mode 100644
index 2d477dd..0000000
--- a/compile_evaluations.py
+++ /dev/null
@@ -1,178 +0,0 @@
-import subprocess
-import json
-import os
-from itertools import chain
-import numpy as np
-
-
-def run_analysis(data_dir, prefix, model, assert_n_contexts=None):
-    # run evaluation script
-    # print(f"Path: {data_dir}/{prefix}_{model}")
-    command = f"python visualization_scripts/data_analysis.py {'--assert-n-dirs ' + assert_n_contexts if assert_n_contexts else ''} results/{data_dir}/{prefix}_{model}/*"
-    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-    stdout, stderr = process.communicate()
-
-    if stderr:
-        print("Error:", stderr.decode())
-
-    # extract the results
-    output_lines = stdout.decode().split('\n')[-3:-1]  # Adjust indices to capture the last two lines correctly
-
-    metrics = output_lines[0].split("\t")
-
-    if metrics != ["Mean-Level", "Rank-Order", "Ipsative"]:
-        metrics = ["Mean-Level", "Rank-Order", "Ipsative"]
-        values = [np.nan, np.nan, np.nan]
-    else:
-        values = output_lines[1].split("\t\t")
-
-
-    results = dict(zip(metrics, values))
-
-    return results
-
-all_models = []
-all_data_dirs = []
-
-# Define the models
-models = [
-    "llama_2_7b",
-    "llama_2_13b",
-    "llama_2_70b",  # 2 gpu
-    "llama_2_7b_chat",
-    "llama_2_13b_chat",
-    "llama_2_70b_chat",  # 2 gpu
-    "Mistral-7B-v0.1",
-    "Mistral-7B-Instruct-v0.1",
-    "Mistral-7B-Instruct-v0.2",
-    "zephyr-7b-beta",
-    "Mixtral-8x7B-v0.1-4b",  # 6h
-    "Mixtral-8x7B-Instruct-v0.1-4b"  # 6h
-]
-all_models.extend(models)
-# Define the results directory
-# sim conv
-data_dirs_prefixes = [
-    ("results_sim_conv_tolkien_simulated_human_knows_persona", "results_pvq_sim_conv_tolkien_characters"),
-    ("results_sim_conv_v2_simulated_human_knows_persona", "results_pvq_sim_conv_famous_people"),
-]
-all_data_dirs.extend(list(zip(*data_dirs_prefixes))[0])
-
-
-assert_n_contexts = 5
-# prefix = "results_pvq_sim_conv_famous_people"
-# prefix = "results_ult_sim_conv_famous_people"
-
-
-# # # sim conv - ultimatum
-# data_dirs = [
-#     "results_ultimatum_sim_conv_v2_perm",
-# ]
-# assert_n_contexts = 5
-# prefix = "results_pvq_sim_conv_tolkien_characters"
-
-
-
-
-data = {}
-for data_dir, prefix in data_dirs_prefixes:
-    print(f"EXPERIMENT: {data_dir}")
-    data[data_dir] = {}
-
-    for model in models:
-        data[data_dir][model] = run_analysis(data_dir, prefix, model, assert_n_contexts=None)
-
-        print(model.ljust(35, ' ') + " - " + str(data[data_dir][model]))
-
-
-# cacheing
-with open('test.json', 'w') as fp:
-    json.dump(data, fp)
-
-with open('test.json') as f:
-    data = json.load(f)
-
-
-# MERGE
-merge = False
-if merge:
-    merge_dict = {
-        "pvq": ["results_sim_conv_tolkien_simulated_human_knows_persona", "results_sim_conv_v2_simulated_human_knows_persona"],
-        "ult": ["results_tolkien_ultimatum_sim_conv_v2_perm", "results_regular_ultimatum_sim_conv_v2"]
-    }
-    data_ = {}
-    for k, v in merge_dict.items():
-        models = list(data[v[0]].keys())
-        metrics = list(list(data[v[0]].values())[0].keys())
-
-        data_[k] = {
-            model: {
-                metric: np.mean([float(data[v[0]][model][metric]),float(data[v[1]][model][metric])]) for metric in metrics
-            } for model in models
-        }
-    data = data_
-    all_data_dirs = merge_dict.keys()
-
-
-
-
-# DRAW PLOTS
-import matplotlib.pyplot as plt
-
-data_dirs_2_labels = {
-    # "results_sim_conv_v2_perm": "tolkien",
-    "results_sim_conv_tolkien_simulated_human_knows_persona": "tolkien_pvq",
-    "results_tolkien_ultimatum_sim_conv_v2_perm": "tolkien_ult",
-    "results_sim_conv_v2_simulated_human_knows_persona": "famous_pvq",
-    "results_regular_ultimatum_sim_conv_v2": "famous_ult"
-    # "results_sim_conv_v2_perm_op_only": "option order ch.",
-    # "results_sim_conv_v2_perm": "topic + option order ch.",
-    # "results_sim_conv_v2_perm_base_format": "topic + option order ch.",
-    # "results_weather_v2": "weather change",
-    # "results_weather_v2_perm_op_only": "option order ch.",
-    # "results_weather_v2_perm_op": "option order + weather ch.",
-}
-
-biggest_human_change = {
-    "Rank-Order": 0.57,
-    "Ipsative": 0.59,
-}
-
-num_models = len(models)
-num_cols = 3  # Adjust this as needed for a better layout
-num_rows = num_models // num_cols + (num_models % num_cols > 0)
-
-for metric in ["Rank-Order", "Ipsative"]:
-
-    # Create a figure with subplots
-    fig, axs = plt.subplots(num_rows, num_cols, figsize=(15, 5 * num_rows))  # Adjust figsize as needed
-    axs = axs.flatten()  # Flatten the array of axes for easy indexing
-
-    for i, model in enumerate(all_models):
-        data_dirs_ = [d for d in all_data_dirs if model in data[d]]
-        rank_order_values = [float(data[data_dir][model][metric]) for data_dir in data_dirs_]
-        min_size = 0.03  # because it's invisible otherwise
-        rank_order_values = [min_size if -min_size < value < min_size else value for value in rank_order_values]
-
-        data_dir_labels = [data_dirs_2_labels.get(d,d) for d in data_dirs_]
-        axs[i].bar(data_dir_labels, rank_order_values, color=['red', 'green', 'blue'])
-        # axs[i].set_xlabel('Experiment')
-        axs[i].set_ylabel(metric + " stability (r)")
-        axs[i].set_title(f'{model}')
-        axs[i].set_ylim(-0.5, 1)
-        axs[i].tick_params(axis='x')
-
-        # humans
-        axs[i].axhline(y=biggest_human_change[metric], color='gray', linestyle='--')
-
-    # Hide any unused subplots
-    for j in range(i + 1, num_rows * num_cols):
-        axs[j].axis('off')
-
-    fig.suptitle(f'{metric} Stability')
-    plt.tight_layout()
-    plt.subplots_adjust(top=0.90, bottom=0.05, hspace=0.8)
-    plt.savefig(f'visualizations/{metric}_all_models.png')
-    plt.savefig(f'visualizations/{metric}_all_models.svg')
-    plt.show()  # Sh
-    plt.close()
diff --git a/crop.py b/crop.py
deleted file mode 100644
index dd586c3..0000000
--- a/crop.py
+++ /dev/null
@@ -1,155 +0,0 @@
-"""Byte pair encoding utilities (Adapted from the official GPT-2 GitHub repository)"""
-import json
-import os
-import regex as re
-import requests
-import sys
-
-from functools import lru_cache
-from tqdm import tqdm
-
-
-def _get_encoder(subdir):
-    print("Downloading encoder and vocab to ", subdir)
-    for filename in ['encoder.json', 'vocab.bpe']:
-        r = requests.get("https://openaipublic.blob.core.windows.net/gpt-2/" + subdir + "/" + filename, stream=True)
-        with open(os.path.join(subdir, filename), 'wb') as f:
-            file_size = int(r.headers["content-length"])
-            chunk_size = 1000
-            with tqdm(ncols=100, desc="Fetching " + filename, total=file_size, unit_scale=True) as pbar:
-                # 1k for chunk_size, since Ethernet packet size is around 1500 bytes
-                for chunk in r.iter_content(chunk_size=chunk_size):
-                    f.write(chunk)
-                    pbar.update(chunk_size)
-
-@lru_cache()
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a signficant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-def get_pairs(word):
-    """Return set of symbol pairs in a word.
-
-    Word is represented as tuple of symbols (symbols being variable-length strings).
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
-
-class Encoder:
-    def __init__(self, encoder, bpe_merges, errors='replace'):
-        self.encoder = encoder
-        self.decoder = {v:k for k,v in self.encoder.items()}
-        self.errors = errors # how to handle errors in decoding
-        self.byte_encoder = bytes_to_unicode()
-        self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}
-        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
-        self.cache = {}
-
-        # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
-        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
-
-    def bpe(self, token):
-        if token in self.cache:
-            return self.cache[token]
-        word = tuple(token)
-        pairs = get_pairs(word)
-
-        if not pairs:
-            return token
-
-        while True:
-            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                    new_word.extend(word[i:j])
-                    i = j
-                except:
-                    new_word.extend(word[i:])
-                    break
-
-                if word[i] == first and i < len(word)-1 and word[i+1] == second:
-                    new_word.append(first+second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = ' '.join(word)
-        self.cache[token] = word
-        return word
-
-    def encode(self, text):
-        bpe_tokens = []
-        for token in re.findall(self.pat, text):
-            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
-            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
-        return bpe_tokens
-
-    def decode(self, tokens):
-        text = ''.join([self.decoder[token] for token in tokens])
-        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
-        return text
-
-def get_encoder(model_name):
-    subdir = os.path.join("models", model_name)
-    if not os.path.exists(subdir):
-        os.makedirs(subdir)
-    if not os.path.exists(os.path.join(subdir, 'encoder.json')):
-        _get_encoder(subdir)
-
-    subdir = subdir.replace('\\','/') # needed for Windows
-
-    with open(os.path.join(subdir, 'encoder.json'), 'r') as f:
-        encoder = json.load(f)
-    with open(os.path.join(subdir, 'vocab.bpe'), 'r', encoding="utf-8") as f:
-        bpe_data = f.read()
-    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
-    return Encoder(
-        encoder=encoder,
-        bpe_merges=bpe_merges,
-    )
-
-enc = get_encoder('124M')
-
-def crop_prompt(prompt: str):
-    global enc
-
-    cropped_prompt = enc.decode(enc.encode(prompt)[:2048])
-    return cropped_prompt
-
-def crop(s):
-    prompt = crop_prompt(s)
-    return prompt
-
diff --git a/estimate_tokens.py b/estimate_tokens.py
deleted file mode 100644
index a3d00f7..0000000
--- a/estimate_tokens.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import tiktoken
-import argparse
-import os
-
-parser = argparse.ArgumentParser(description="Estimate the number of tokens in a text file using OpenAI's API.")
-parser.add_argument("file_path", metavar="file_path", type=str, help="The path to the text file to process.")
-args = parser.parse_args()
-
-if not os.path.exists(args.file_path):
-    print("File not found.")
-    exit()
-
-
-with open(args.file_path, "r") as f:
-    text = f.read()
-
-
-encoder = tiktoken.encoding_for_model('gpt-3.5-turbo-0301')
-assert encoder == tiktoken.encoding_for_model('gpt-4-0314')
-
-n_tokens = len(encoder.encode(text))
-
-print("total GPT tokens used: {}".format(n_tokens))
-print(f"\tgpt-4 ~ {0.04 * n_tokens / 1000:.4f} dollars")
-print(f"\tgpt-3.5 ~ {0.002 * n_tokens / 1000:.4f} dollars")
-print(f"\tdavinci ~ {0.02 * n_tokens / 1000:.4f} dollars")
-print(f"\tcurie ~ {0.002 * n_tokens / 1000:.4f} dollars")
-print(f"\tbabagge ~ {0.0005 * n_tokens / 1000:.4f} dollars")
-print(f"\tada ~ {0.0004 * n_tokens / 1000:.4f} dollars")
diff --git a/evaluate.py b/evaluate.py
deleted file mode 100644
index 8a3ef3e..0000000
--- a/evaluate.py
+++ /dev/null
@@ -1,1614 +0,0 @@
-import argparse
-import random
-from collections import defaultdict
-import os
-import json
-import hashlib
-import time
-import datetime
-import itertools
-import string
-
-from termcolor import colored
-
-from utils import *
-
-
-import numpy as np
-import pandas as pd
-import torch
-import tiktoken
-
-from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList, BitsAndBytesConfig
-from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
-
-from personas.utils import simulated_participant_to_name
-
-from tenacity import (
-    retry,
-    stop_after_attempt,
-    wait_random_exponential,
-)  # for exponential backoff
-
-
-@retry(wait=wait_random_exponential(min=10, max=30), stop=stop_after_attempt(10))
-def completions_with_backoff(client, **kwargs):
-    return client.chat.completions.create(**kwargs)
-
-
-hf_cache_dir = get_hf_cache_dir()
-os.environ['TRANSFORMERS_CACHE'] = hf_cache_dir
-
-
-def construct_messages(prompt, system_message, messages_conv=None, add_query_str=True):
-
-    set_persona_str = prompt["set_persona_str"]
-    questionnaire_description = prompt["questionnaire_description"]
-
-    user_prompt = f"{questionnaire_description}\n\n" if questionnaire_description else ""
-    user_prompt += prompt["item_str"]
-
-    if add_query_str:
-        user_prompt += "\n"+prompt["query_str"]
-
-    if system_message or messages_conv:
-        # multiple messages
-        messages = []
-        if set_persona_str:
-            messages.append({
-                "role": "system" if system_message else "user",
-                "content": set_persona_str
-            })
-
-        if messages_conv:
-            messages.extend(messages_conv)
-
-        messages.append({"role": "user", "content": user_prompt})
-
-        if not system_message:
-            # USER, USER -> USER, AS:"OK", USER
-            messages = fix_alternating_msg_order(messages)
-
-    else:
-
-        full_prompt = f"{set_persona_str}\n\n" if set_persona_str else ""
-
-        if args.separator:
-            full_prompt += "-" * 200 + "\n"
-
-        full_prompt += user_prompt
-
-        messages = [
-            {"role": "user", "content": full_prompt}
-        ]
-
-    return messages
-
-
-def apply_base_model_template(
-        messages,
-        assistant_label,
-        user_label,
-        system_label,
-        add_generation_prompt=True,
-        return_stop_words=False,
-):
-
-    formatted_conversation = ""
-
-    labels_dict = {
-        "ASSISTANT": assistant_label,
-        "SYSTEM": system_label,
-        "USER": user_label,
-    }
-
-    assert assistant_label != ""
-    assert user_label != ""
-    assert system_label != ""
-
-    for msg in messages:
-        label = labels_dict[msg['role'].upper()]
-        formatted_conversation += f"{label}:{msg['content']}"
-        formatted_conversation += "\n"
-
-    if add_generation_prompt:
-        formatted_conversation += f"{labels_dict['ASSISTANT']}:"
-
-    if return_stop_words:
-        return formatted_conversation, [f"\n{l}:" for l in labels_dict.values()]
-    else:
-        return formatted_conversation
-
-
-# take the theme starter
-opening_questions_for_themes = {
-    "poem": "Hello, let's write a poem together. You start by the first verse I'll add the second one, and so on.",
-    "joke": "Tell me a joke.",
-    "history": "What is the significance of the battle of Hastings. Answer in two sentences.",  # slight collapse
-    "chess": "1. e4",
-    "grammar": "Can you check this sentence for grammar? \n Whilst Jane was waiting to meet hers friend their nose started bleeding.",
-}
-
-
-def create_permutation_dicts(args, n_options, choices, num_questions, population_size=None):
-
-    if args.permute_options:
-
-        # sample permutations based on given seed -> should correspond to different contexts
-        original_state = random.getstate()  # save the original state
-        random.seed(args.permute_options_seed)
-
-        if len(set(n_options)) == 1:
-
-            if n_options[0] > 9:
-                raise ValueError("Number of options too big. Refactor code below to use it.")
-
-            all_permutations = list(itertools.permutations(range(n_options[0])))
-
-            permutations = random.choices(all_permutations, k=num_questions*population_size)
-            permutations = [permutations[part_i:part_i+num_questions] for part_i in range(population_size)]
-
-        else:
-            # not all questions have the same number of options
-
-            # string seed to int seed
-            int_seed = int(hashlib.md5(args.permute_options_seed.encode('utf-8')).hexdigest(), 16)
-            rng = np.random.default_rng(seed=int_seed)
-
-            permutations = [
-                [tuple(rng.permutation(n_options_q)) for n_options_q in n_options] for _ in range(population_size)
-            ]
-
-        permutations_dicts = [
-            [
-                dict(zip(choices, perm)) for perm in part_perms
-            ] for part_perms in permutations
-        ]
-
-        # revert original state
-        random.setstate(original_state)
-
-    else:
-        if len(set(n_options)) == 1:
-            permutations_dicts = [
-                [{choices[i]: i for i, c in enumerate(choices[:n_opt])} for n_opt in n_options]
-            ] * population_size
-
-    return permutations_dicts
-
-
-def parse_hf_outputs(output, tokenizer, answers):
-
-    answer_tokens = extract_answer_tokens(answers, tokenizer)  # todo: repetitive -> extract
-
-    option_scores = {
-        ans: max([output.scores[0][0, ind] for ind in answer_tokens[ans]])
-        for ans in answers
-    }
-
-    # take the most probable answer as the generation
-    generation = max(option_scores, key=option_scores.get)
-
-    # extract logprobs
-    lprobs = [float(option_scores[a]) for a in answers]
-
-    # todo: check that ' A' are one token and check for those as well and not "unk"
-    encoded_ans = [tokenizer.encode(ans, add_special_tokens=False)[0] for ans in answers]
-    option_scores = {enc_a: output.scores[0][0, enc_a] for enc_a in encoded_ans}
-
-    return option_scores, generation, lprobs
-
-
-def create_simulated_messages(conv, last="user"):
-    # simulate a conversation between two LLMs
-    if last == "user":
-        # last role is user
-        sim_conv = list(zip(["user", "assistant"] * (len(conv) // 2 + 1), conv[::-1]))[::-1]
-    elif last == "assistant":
-        # last role is assistant
-        sim_conv = list(zip(["assistant", "user"] * (len(conv) // 2 + 1), conv[::-1]))[::-1]
-    else:
-        raise ValueError("last must be either user or assistant")
-
-    sim_conv_messages = [{"role": role, "content": msg} for role, msg in sim_conv]
-
-    return sim_conv_messages
-
-
-def fix_alternating_msg_order(messages):
-
-    if len(messages) <= 1:
-        return messages
-
-    # roles must iterate, and start with user, so we add fixes
-    if messages[0]['role'] == "system" and messages[1]['role'] == "assistant":
-        # insert empty user message
-        messages.insert(1, {"role": "user", "content": ""})
-
-    if messages[0]['role'] == "user" and messages[1]['role'] == "user":
-        # first message sets the persona, second sets the topic
-        # insert artificial message of the model accepting the persona
-        messages.insert(1, {"role": "assistant", "content": "OK"})
-
-    return messages
-
-
-class StoppingCriteriaSub(StoppingCriteria):
-    def __init__(self, stops, tokenizer, original_input_ids):
-        super().__init__()
-        self.stops = [s.upper() for s in stops]
-        self.tokenizer = tokenizer
-        self.original_input_ids = original_input_ids
-
-    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
-        generated_ids = input_ids[0][len(self.original_input_ids[0]):]
-        generation = self.tokenizer.decode(generated_ids).upper()
-        return any([stop in generation for stop in self.stops])
-
-
-def simulate_conversation(args, engine, sim_engine, model_set_persona_string=None, llm_generator=None, simulated_participant=None):
-
-    if llm_generator is not None:
-        tokenizer, model = llm_generator
-
-    opening_question = opening_questions_for_themes[args.simulate_conversation_theme]
-
-    conversation = [opening_question]
-
-    # simulate conversation
-    assert args.simulated_conversation_n_messages % 2 == 1  # must be odd so that the last one is GPT as simulated persona
-
-    for msg_i in range(args.simulated_conversation_n_messages):
-        if args.verbose:
-            print(f"Simulted conv msg {msg_i}")
-
-        # assign roles to messages - alternating, last one user
-        simulated_conv_messages = create_simulated_messages(conversation, last="user")
-        simulated_participant_name = simulated_participant_to_name(simulated_participant, args.simulated_population_type)
-        labels_dict = {
-            "persona": {
-                "assistant_label": simulated_participant_name.upper(),
-                "user_label": "USER",
-                "system_label": "CONTEXT"
-            },
-            "human": {
-                "assistant_label": "HUMAN",
-                "user_label": f"{simulated_participant_name.upper()} (CHATBOT)" if args.simulated_human_knows_persona else "CHATBOT",
-                "system_label": "CONTEXT"
-            }
-        }
-        stop_words_up = [f"\n{v}:" for v in labels_dict["persona"].values()] + [f"\n{v}:" for v in labels_dict["human"].values()]
-        # also add similar words wo whitespace ex. GANDALF (CHATBOT) and GANDALF(CHATBOT)
-        stop_words_up += [s.replace(" ", "") for s in stop_words_up if " " in s]
-
-        if msg_i % 2 == 0:
-            # even -> gpt as a persona
-            assert simulated_conv_messages[0]['role'] == "user"
-
-            if model_set_persona_string:
-                simulated_conv_messages = [{
-                    "role": "system" if args.system_message else "user",
-                    "content": model_set_persona_string
-                }] + simulated_conv_messages
-
-            engine_ = engine
-            assistant_label = labels_dict["persona"]["assistant_label"]
-            user_label = labels_dict["persona"]["user_label"]
-            system_label = labels_dict["persona"]["system_label"]
-
-        else:
-            # gpt as human
-            assert simulated_conv_messages[0]['role'] == "assistant"
-
-            # user doesn't know the chatbots persona -> change this?
-            if args.base_model_template:
-                if args.simulated_human_knows_persona:
-                    sys_msg = f"The following is a conversation between a human and a chatbot. The chatbot is pretending to be {simulated_participant_name}. The human's every reply must be in one sentence only."
-                else:
-                    sys_msg = f"The following is a conversation between a human and a chatbot. The human's every reply must be in one sentence only."
-            else:
-                if args.simulated_human_knows_persona:
-                    sys_msg = f"You are simulating a human using a chatbot. The chatbot is pretending to be {simulated_participant_name}. Your every reply must be in one sentence only."
-                else:
-                    sys_msg = f"You are simulating a human using a chatbot. Your every reply must be in one sentence only."
-
-            simulated_conv_messages = [{
-                "role": "system" if args.system_message else "user",
-                "content": sys_msg
-            }] + simulated_conv_messages
-
-            engine_ = sim_engine
-
-            assistant_label = labels_dict["human"]["assistant_label"]
-            user_label = labels_dict["human"]["user_label"]
-            system_label = labels_dict["human"]["system_label"]
-
-        if not args.base_model_template:
-            simulated_conv_messages = fix_alternating_msg_order(simulated_conv_messages)
-
-        if engine_ == "dummy":
-            response = f"Dummy simulated message no. {msg_i}. This is a filler message it same some extra text so as to help estimate the number of tokens. As the gpt generations is set to 100 tokens max. Here we aim to also 100 tokens message. I am repeating it now. This is a filler message it same some extra text so as to help estimate the number of tokens. As the gpt generations is set to 100 tokens max. Here we aim to also 100 tokens message."
-
-        elif "gpt" in engine_:
-            assert not args.base_model_template
-
-            if args.verbose:
-                print_chat_messages(simulated_conv_messages)
-
-            if args.azure_openai:
-                # time.sleep(0.1)
-                c = completions_with_backoff(
-                    client=model,
-                    model=openai_2_azure_tag[engine_],
-                    messages=simulated_conv_messages,
-                    max_tokens=100,
-                    n=1,
-                    temperature=1.0,
-                )
-
-            else:
-                # todo: add backoff
-                c = model.chat.completions.create(
-                    model=engine_,
-                    messages=simulated_conv_messages,
-                    max_tokens=100,
-                    n=1,
-                    temperature=1.0,
-                )
-            response = c.choices[0].message.content
-
-        elif "llama_2" in engine_:
-
-            if args.base_model_template:
-                assert args.system_message
-                formatted_prompt, stop_words = apply_base_model_template(
-                    simulated_conv_messages,
-                    assistant_label=assistant_label,
-                    user_label=user_label,
-                    system_label=system_label,
-                    add_generation_prompt=True,
-                    return_stop_words=True
-                )
-                input_ids = tokenizer(formatted_prompt, return_tensors="pt").to(model.device).input_ids
-                assert all([w.upper() in stop_words_up for w in stop_words])
-                stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stop_words_up, tokenizer, input_ids)])
-
-                print(f"\n>>>>>>>>>>>>FORMATTED<<<>>>PROMPT<<<<<<<<<<<<\n{formatted_prompt}\n>>>>>>>>>>><<<<<<<<<<<\n")
-
-            else:
-                input_ids = tokenizer.apply_chat_template(simulated_conv_messages, return_tensors="pt", add_generation_prompt=True).to(model.device)
-                if args.verbose:
-                    print_chat_messages(simulated_conv_messages)
-                stopping_criteria = None
-
-            output_seq = model.generate(
-                input_ids=input_ids,
-                max_new_tokens=100,
-                do_sample=True,
-                top_p=0.9,
-                top_k=50,
-                temperature=0.6,
-                repetition_penalty=1.2,
-                num_beams=1,
-                return_dict_in_generate=True,
-                output_scores=True,
-                stopping_criteria=stopping_criteria
-            )
-            response = tokenizer.decode(output_seq.sequences[0][len(input_ids[0]):], skip_special_tokens=True)
-
-        elif engine_ in ["phi-2", "phi-1.5", "phi-1", "Qwen-72B", "Qwen-14B", "Qwen-7B"]:
-
-            if args.base_model_template:
-                assert args.system_message
-                formatted_prompt, stop_words = apply_base_model_template(
-                    simulated_conv_messages,
-                    assistant_label=assistant_label,
-                    user_label=user_label,
-                    system_label=system_label,
-                    add_generation_prompt=True,
-                    return_stop_words=True
-                )
-                input_ids = tokenizer(formatted_prompt, return_tensors="pt").to(model.device).input_ids
-                assert all([w.upper() in stop_words_up for w in stop_words])
-                stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stop_words_up, tokenizer, input_ids)])
-
-                print(f"\n>>>>>>>>>>>>FORMATTED<<<>>>PROMPT<<<<<<<<<<<<\n{formatted_prompt}\n>>>>>>>>>>><<<<<<<<<<<\n")
-
-            else:
-                input_ids = tokenizer.apply_chat_template(simulated_conv_messages, return_tensors="pt", add_generation_prompt=True).to(model.device)
-                if args.verbose:
-                    print_chat_messages(simulated_conv_messages)
-                stopping_criteria=None
-
-            output_seq = model.generate(
-                input_ids=input_ids,
-                max_new_tokens=100,
-                return_dict_in_generate=True,
-                output_scores=True,
-                stopping_criteria=stopping_criteria
-            )
-            response = tokenizer.decode(output_seq.sequences[0][len(input_ids[0]):], skip_special_tokens=True)
-
-        elif "zephyr" in engine_ or "Mixtral" in engine_ or "Mistral" in engine_:
-            
-            # for params: https://huggingface.co/blog/mixtral
-            # for params: https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha
-
-            if args.base_model_template:
-                assert args.system_message
-                formatted_prompt, stop_words = apply_base_model_template(
-                    simulated_conv_messages,
-                    assistant_label=assistant_label,
-                    user_label=user_label,
-                    system_label=system_label,
-                    add_generation_prompt=True,
-                    return_stop_words=True
-                )
-                input_ids = tokenizer(formatted_prompt, return_tensors="pt").to(model.device).input_ids
-                assert all([w.upper() in stop_words_up for w in stop_words])
-                stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stop_words_up, tokenizer, input_ids)])
-
-                if args.verbose:
-                    print(f"\n>>>>>>>>>>>>FORMATTED<<<>>>PROMPT<<<<<<<<<<<<\n{formatted_prompt}\n>>>>>>>>>>><<<<<<<<<<<\n")
-
-            else:
-                input_ids = tokenizer.apply_chat_template(simulated_conv_messages, return_tensors="pt", add_generation_prompt=True).to(model.device)
-                if args.verbose:
-                    print_chat_messages(simulated_conv_messages)
-                stopping_criteria = None
-
-            output_seq = model.generate(
-                input_ids=input_ids,
-                max_new_tokens=100,
-                do_sample=True,
-                temperature=0.7,
-                top_p=0.95,
-                top_k=50,
-                return_dict_in_generate=True,
-                output_scores=True,
-                stopping_criteria=stopping_criteria
-            )
-            response = tokenizer.decode(output_seq.sequences[0][len(input_ids[0]):], skip_special_tokens=True)
-
-        else:
-            raise NotImplementedError(f"Simulated conversations not implemented for {engine_}")
-
-        if args.base_model_template:
-            response_up = response.upper()
-            stop_word_ind = np.min([response_up.index(sw) if sw in response_up else np.inf for sw in stop_words_up])
-            if stop_word_ind != np.inf:
-                stop_word_ind = int(stop_word_ind)
-                response = response[:stop_word_ind]
-
-        conversation.append(response)
-
-        if args.verbose:
-            print(f"--> {response}")
-
-        messages_conv = create_simulated_messages(conversation, last="assistant")
-        messages_conv_hash = hash_chat_conv(messages_conv)
-
-
-    return messages_conv, messages_conv_hash
-
-
-def map_choice_to_number(letter, permutations_dict):
-    # A-F -> 1-6
-    # find index of letter in choices and add 1
-    number = permutations_dict[letter] + 1
-    return number
-
-def map_number_to_choice(number, inv_permutations_dict):
-    choice = inv_permutations_dict[number-1]
-    return choice
-
-
-timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
-print("timestamp:", timestamp)
-
-
-hf_token = os.environ["HF_TOKEN"]
-hidden_token = hf_token[:6] + "*" * (len(hf_token)-12) + hf_token[-6:]
-print("HF TOKEN:", hidden_token )
-
-
-# choices = ["A", "B", "C", "D", "E", "F"]
-choices = list(string.ascii_uppercase)
-
-llama_dir = "/gpfswork/rech/imi/utu57ed/llama/llama_files/"
-
-def get_prompt_skeleton(experiment_name, args, simulated_participant):
-
-    if "pvq" in experiment_name:
-        assert "pvq" in args.data_dir
-        questionnaire_description = "Here we briefly describe some people. Please read each description and think about how much each person is or is not like you. Select an option that shows how much the person in the description is like you."
-        questionnaire_description_empty = False
-
-    elif "donation" in experiment_name:
-        assert "donation" in args.data_dir
-        questionnaire_description = ""
-        questionnaire_description_empty = True
-
-    elif "bag" in experiment_name:
-        assert "bag" in args.data_dir
-        questionnaire_description = ""
-        questionnaire_description_empty = True
-
-    elif "religion" in experiment_name:
-        assert "religion" in args.data_dir
-        questionnaire_description = ""
-        questionnaire_description_empty = True
-
-    else:
-        raise ValueError(f"Experiment name is ill-defined {args.experiment_name}")
-
-    if args.base_model_template:
-        prefix = "The following is a conversation with"
-    else:
-        prefix = "You are"
-
-    if args.simulated_population_type in ["famous_people"]:
-        set_persona_str = f"{prefix} {simulated_participant}"
-    elif args.simulated_population_type in ["tolkien_characters"]:
-        set_persona_str = f"{prefix} {simulated_participant} from J. R. R. Tolkien's Middle-earth legendarium."
-    elif args.simulated_population_type == "anes":
-        set_persona_str = f"{prefix} a person with the following profile.\n\n'''\n{simulated_participant}'''"
-    elif args.simulated_population_type == "llm_personas":
-        set_persona_str = f"{prefix} a person with the following profile.\n\n'''\n{simulated_participant}'''"
-    elif args.simulated_population_type == "user_personas":
-        if args.base_model_template:
-            raise NotImplementedError("base model template not implemented for user_personas simulated population.")
-        set_persona_str = f"{prefix} talking to a person with the following profile.\n\n'''\n{simulated_participant}'''"
-    elif args.simulated_population_type == "permutations":
-        set_persona_str = ""
-    else:
-        raise ValueError("Unknown population type")
-
-    if args.format == "chat":
-        if args.query_prompt:
-            query_str = args.query_prompt
-        else:
-            query_str = "Answer: ("
-
-    else:
-        raise ValueError(f"Undefined format {args.format}.")
-
-    assert questionnaire_description_empty == (questionnaire_description == "")
-
-    prompt_skeleton = {
-        "set_persona_str": set_persona_str,  # remove newline from the end
-        "questionnaire_description": questionnaire_description,
-        "query_str": f"{query_str}",
-    }
-
-    return prompt_skeleton
-
-
-def dummy_lprobs_from_generation(response, answers, label_2_text_option_dict):
-
-
-    def find_first_match(response, labels_strings, case_insensitive):
-
-        if case_insensitive:
-            labels_strings = [(l, s.lower()) for l, s in labels_strings]
-            response = response.lower()
-
-        for l, s in labels_strings:
-            if s in response:
-                return l, s
-
-        return None, None
-
-    # first try to match substrings
-    # sort from longest to shortest (to avoid substrings, "Like me" vs "A little like me")
-    labels_text_options = sorted(label_2_text_option_dict.items(), key=lambda x: len(x[1]), reverse=True)
-    label, option = find_first_match(response, labels_text_options, case_insensitive=True)
-
-    if option is not None:
-        lprobs = [-0.01 if a == label else -100 for a in answers]
-        return lprobs
-
-    def find_matches(strings):
-        lprobs = [-100] * len(strings)
-        for i, op in enumerate(strings):
-            if op in response:
-                lprobs[i] = -0.01
-
-        match = any([lp > -100 for lp in lprobs])
-
-        return lprobs, match
-
-
-    # look for 'A.' -> change to A)
-    lprobs, match = find_matches([f"{a}." for a in answers])
-    if match:
-        return lprobs
-
-    # look for "A "
-        lprobs, match = find_matches([f"{a} " for a in answers])
-    if match:
-        return lprobs
-
-    # look for "A"
-    lprobs, _ = find_matches(answers)
-    return lprobs
-
-
-def softmax(x):
-    z = x - max(x)
-    numerator = np.exp(z)
-    denominator = np.sum(numerator)
-    softmax = numerator/denominator
-    return softmax
-
-
-def format_subject(subject):
-    l = subject.split("_")
-    s = ""
-    for entry in l:
-        s += " " + entry
-    return s
-
-
-def format_example(df, idx, subject, experiment_name, args, permutations_dict, simulated_participant, include_answer=True):
-    # an item contains a question and suggested answers
-    item_str = df.iloc[idx, 0]
-    k = df.shape[1] - 2
-
-    # extract options
-    num_options = 0
-    options_strings = []
-    for j in range(k):
-        op_str = df.iloc[idx, j+1]
-
-        if op_str == "undef":
-            continue
-
-        options_strings.append(op_str)
-
-        num_options += 1
-
-    if args.format == "chat":
-        for ch in choices[:num_options]:
-            item_str += "\n({}) {}".format(ch, options_strings[permutations_dict[ch]])
-
-    else:
-        raise ValueError(f"Undefined textual format {args.format}.")
-
-    prompt = get_prompt_skeleton(
-        experiment_name=experiment_name, args=args, simulated_participant=simulated_participant
-    )
-
-    prompt["item_str"] = item_str
-
-    # query_in_reply will put query in the models response, if not add it to prompt here
-
-    # if not args.query_in_reply:
-    #     item_str += "\n"+prompt_skeleton["query"]
-
-    if include_answer:
-        prompt["answer"] = df.iloc[idx, k + 1]
-        # item_str += " {}\n\n".format(df.iloc[idx, k + 1])
-
-    # return prompt, num_options, prompt_skeleton
-
-    return prompt, num_options
-
-
-def hash_chat_conv(msgs_conv):
-    json_string = json.dumps(msgs_conv)
-
-    # Create a SHA256 hash of the string
-    hash_object = hashlib.sha256(json_string.encode())
-
-    # Get the hexadecimal representation of the hash
-    hex_dig = hash_object.hexdigest()
-
-    return hex_dig
-
-
-def eval(args, subject, engine, dev_df, test_df, participant_perm_dicts, llm_generator=None, simulated_participant=None):
-    cors = []
-    all_probs = []
-    all_lprobs = []
-    all_answers = []
-    all_generations = []
-    all_scores = []
-
-    # hashing for simulated conversations
-    messages_conv = None
-    messages_conv_hash = None
-
-    gpt_token_counter = {"input": 0, "output": 0}
-
-    assert test_df.shape[0] == len(participant_perm_dicts)
-
-    for item_i, permutations_dict in enumerate(participant_perm_dicts):
-        inv_permutations_dict = {v: k for k, v in permutations_dict.items()}
-
-        if item_i % 10 == 0:
-            print(f"Eval progress: {item_i}/{test_df.shape[0]}")
-
-        #  e.g. A -> A little like me
-        label_2_text_option_dict = {
-            label: test_df.iloc[item_i, score+1] for label, score in permutations_dict.items()
-        }
-        prompt, n_options = format_example(
-            test_df, item_i,
-            subject=subject,
-            experiment_name=args.experiment_name,
-            include_answer=False,
-            args=args,
-            permutations_dict=permutations_dict,
-            simulated_participant=simulated_participant
-        )
-
-        skip_generation = False
-
-        assert n_options == len(permutations_dict)
-        answers = choices[:n_options]
-
-        assert all([a in permutations_dict for a in answers])
-
-        label = test_df.iloc[item_i, test_df.shape[1]-1]
-        assert label in answers + ["undef"]
-
-        if args.estimate_gpt_tokens:
-            gpt_tokenizer = tiktoken.get_encoding("cl100k_base")
-        else:
-            gpt_tokenizer = None
-
-
-        if args.simulate_conversation_theme:
-
-            set_persona_str = prompt["set_persona_str"]
-            if messages_conv is None:
-                if args.verbose:
-                    print("SIMULATING CONVERSATION")
-
-                messages_conv, messages_conv_hash = simulate_conversation(
-                    args=args,
-                    engine=engine,
-                    sim_engine=engine,
-                    model_set_persona_string=set_persona_str,
-                    simulated_participant=simulated_participant,
-                    llm_generator=llm_generator
-                )
-
-                if args.estimate_gpt_tokens:
-                    # topic setting msg
-                    current_input_tokens = len(gpt_tokenizer.encode(messages_conv[0]['content']))
-
-                    for msg_i in range(1, len(messages_conv)):
-                        current_output_tokens = len(gpt_tokenizer.encode(messages_conv[msg_i]['content']))
-                        gpt_token_counter['input'] += current_input_tokens
-                        gpt_token_counter['output'] += current_output_tokens
-
-                        # add for next message
-                        current_input_tokens += current_output_tokens
-
-            else:
-                if args.verbose:
-                    print("LOADING CACHED CONVERSATION")
-                assert hash_chat_conv(messages_conv) == messages_conv_hash
-
-        if args.estimate_gpt_tokens:
-            # gpt params
-            messages = construct_messages(
-                prompt=prompt,
-                system_message=True,
-                messages_conv=messages_conv if args.simulate_conversation_theme else None,
-                add_query_str=True,  # not query_in_reply
-            )
-            n_input_tokens = sum([len(gpt_tokenizer.encode(msg['content'])) for msg in messages])
-
-            gpt_token_counter['input'] += n_input_tokens
-            gpt_token_counter['output'] += 1
-
-        if engine == "dummy":
-
-            messages = construct_messages(
-                prompt=prompt,
-                system_message=args.system_message,
-                messages_conv=messages_conv if args.simulate_conversation_theme else None,
-                add_query_str=not args.query_in_reply,
-            )
-
-            if args.query_in_reply:
-                messages += [{
-                    "role": "assistant",
-                    "content": prompt['query_str']
-                }]
-
-            formatted_prompt = apply_base_model_template(
-                messages,
-                add_generation_prompt=True,
-                assistant_label=simulated_participant_to_name(
-                    simulated_participant, args.simulated_population_type).upper(),
-                user_label="USER",
-                system_label="CONTEXT"
-            )
-
-            if args.verbose:
-                print(f"************************\nFORMATTED PROMPT:\n{formatted_prompt}\n******************")
-
-            # generation = messages[-2]['content'][messages[-2]['content'].index(") War") - 1:][:1]
-            generation = random.choice([f"{c}" for c in answers])
-
-            # import re
-            # generation = messages[-2]['content'][messages[-2]['content'].index(") a few hours per day") - 1:][:1]
-
-            # if re.search("\) You receive: 85", messages[-2]['content']):
-            #     generation = messages[-2]['content'][messages[-2]['content'].index(") You receive: 85") - 1:][:1]
-            # elif re.search("\) You receive: 100", messages[-2]['content']):
-            #     generation = messages[-2]['content'][messages[-2]['content'].index(") You receive: 100") - 1:][:1]
-            # else:
-            #     generation = random.choice([f"{c}" for c in answers])
-
-            lprobs = dummy_lprobs_from_generation(generation, answers, label_2_text_option_dict)
-
-        elif engine == "interactive":
-            # ask the user to choose
-            generation = input(f"{prompt}")
-
-            lprobs = dummy_lprobs_from_generation(generation, answers, label_2_text_option_dict)
-
-        elif engine in ["zephyr-7b-beta"] or "Mistral-7B" in engine or "Mixtral" in engine:
-
-            tokenizer, model = llm_generator
-
-
-            messages = construct_messages(
-                prompt=prompt,
-                system_message=args.system_message,
-                messages_conv=messages_conv if args.simulate_conversation_theme else None,
-                add_query_str=not args.query_in_reply,
-            )
-
-            if args.base_model_template:
-                formatted_prompt = apply_base_model_template(
-                    messages,
-                    add_generation_prompt=True,
-                    assistant_label=simulated_participant_to_name(simulated_participant, args.simulated_population_type).upper(),
-                    user_label="USER",
-                    system_label="CONTEXT"
-                )
-
-            else:
-                formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
-            if args.query_in_reply:
-                formatted_prompt += f"{prompt['query_str']}"
-
-            if args.verbose:
-                print(f"************************\nFORMATTED PROMPT:\n{formatted_prompt}\n******************")
-
-            inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
-
-            output = model.generate(
-                **inputs,
-                max_new_tokens=1,
-                # temperature=0.0001,
-                do_sample=False,
-                top_p=1.0,
-                return_dict_in_generate=True,
-                output_scores=True
-            )
-            option_scores, generation, lprobs = parse_hf_outputs(output=output, tokenizer=tokenizer, answers=answers)
-
-        elif engine in [
-            *[f"llama_2_{s}_chat" for s in ["7b", "13b", "70b"]],
-            *[f"llama_2_{s}" for s in ["7b", "13b", "70b"]],
-            *["phi-2", "phi-1.5", "phi-1"],
-            *[f"Qwen-{s}" for s in ["72B", "14B", "7B"]],
-            *[f"Qwen-{s}-Chat" for s in ["72B", "14B", "7B"]],
-        ]:
-
-            tokenizer, model = llm_generator
-
-            messages = construct_messages(
-                prompt=prompt,
-                system_message=args.system_message,
-                messages_conv=messages_conv if args.simulate_conversation_theme else None,
-                add_query_str=not args.query_in_reply,
-            )
-
-            if args.base_model_template:
-                formatted_prompt = apply_base_model_template(
-                    messages,
-                    add_generation_prompt=True,
-                    assistant_label=simulated_participant_to_name(simulated_participant, args.simulated_population_type).upper(),
-                    user_label="USER",
-                    system_label="CONTEXT"
-                )
-
-            else:
-                formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
-            if args.query_in_reply:
-
-                formatted_prompt += f"{prompt['query_str']}"
-
-            if args.verbose:
-                print(f"************************\nFORMATTED PROMPT:\n{formatted_prompt}\n******************")
-
-            inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
-
-            # token match
-            output = model.generate(
-                **inputs,
-                max_new_tokens=1,
-                # do_sample=True,
-                # temperature=0.5,
-                # top_p=0.9,
-                # top_k=50,
-                # num_beams=1,
-                # repetition_penalty=1.2,
-                return_dict_in_generate=True,
-                output_scores=True
-            )
-
-            option_scores, generation, lprobs = parse_hf_outputs(output=output, tokenizer=tokenizer, answers=answers)
-
-        elif "gpt-3.5-turbo" in engine or "gpt-4" in engine:
-            if args.query_in_reply:
-                raise ValueError("Can't use query_in_reply with gpt models.")
-
-            if args.base_model_template:
-                raise ValueError("base_model_template not supported for gpt models")
-
-            tokenizer, model = llm_generator
-
-            messages = construct_messages(
-                prompt=prompt,
-                system_message=args.system_message,
-                messages_conv=messages_conv if args.simulate_conversation_theme else None,
-                add_query_str=True,
-            )
-
-            if args.verbose:
-                print_chat_messages(messages)
-
-            encoder = tiktoken.encoding_for_model(engine)
-
-            # get the encoding for each letter in choices
-            logit_bias = {encoder.encode(c)[0]: 100 for c in answers}
-
-            if args.azure_openai:
-                # time.sleep(0.05)
-                c = completions_with_backoff(
-                    client=model,
-                    model=openai_2_azure_tag[engine],
-                    messages=messages,
-                    max_tokens=1,
-                    n=1,
-                    temperature=0,
-                    logit_bias=logit_bias,
-                )
-            else:
-                c = model.chat.completions.create(
-                    model=engine,
-                    messages=messages,
-                    max_tokens=1,
-                    n=1,
-                    temperature=0,
-                    logit_bias=logit_bias,
-                )
-
-            generation = c.choices[0].message.content
-
-            lprobs = dummy_lprobs_from_generation(generation, answers, label_2_text_option_dict)
-
-        else:
-            raise ValueError(f"Not recognized model {engine}.")
-
-        probs = softmax(np.array(lprobs))
-        pred = {i: c for i, c in enumerate(answers)}[np.argmax(lprobs)]
-        cor = pred == label
-        score = map_choice_to_number(pred, permutations_dict)
-
-        if args.verbose:
-            print(colored(f"Pred:{pred} (Generation:{generation}; Score: {score})", "green"))
-            print("------------------")
-
-        cors.append(cor)
-        all_lprobs.append(lprobs)
-        all_probs.append(probs)
-        all_answers.append(pred)
-        all_generations.append(generation)
-        all_scores.append(score)
-
-    acc = np.mean(cors)
-    cors = np.array(cors)
-    all_scores = np.array(all_scores)
-
-    if args.estimate_gpt_tokens:
-        estimate_and_print_gpt_prices(gpt_token_counter, engine)
-
-    return cors, acc, all_probs, all_lprobs, all_answers, all_scores, all_generations, gpt_token_counter
-
-
-def main(args):
-    engine = args.engine
-    print("Engine:", engine)
-
-    subjects = sorted([f.split("_test.csv")[0] for f in os.listdir(os.path.join(args.data_dir, "test")) if "_test.csv" in f])
-
-    # dump results dir
-    dump_results_dir = os.path.join(args.save_dir, "_".join([
-        args.experiment_name,
-        engine,
-        os.path.basename(args.data_dir),
-        f"permutations_{args.permutations}" if args.permutations > 1 else "",
-        f"permute_options_{args.permute_options_seed}" if args.permute_options else "",
-        f"format_{args.format}",
-        f"simulate_conv_{args.simulate_conversation_theme}" if args.simulate_conversation_theme else "",
-        timestamp
-    ]))
-
-    if not args.overwrite:
-        # check for previous versions and break if found
-        import glob
-        prev_versions = glob.glob(dump_results_dir.removesuffix(timestamp)+"*/results.json")
-        if len(prev_versions) > 0:
-            raise RuntimeError(f"Previous versions of this run were found: {prev_versions}")
-
-    os.makedirs(dump_results_dir, exist_ok=True)
-    print("Savedir: ", dump_results_dir)
-
-    # Data preparation
-    if len(subjects) == 0:
-        raise ValueError("No subjects found.")
-
-    if "data_pvq" in args.data_dir:
-        assert "pvq" in args.experiment_name
-
-        # assert set(subjects_to_evaluate).issubset(subjects)
-        subjects = ["pvq_auto"]
-
-    print("Args:", args)
-    print("Subjects:", subjects)
-
-    gpt_tokens_total = {"input": 0, "output": 0}
-
-    if engine in ["zephyr-7b-beta"]:
-        print("Loading zephyr-7b-beta")
-        tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta", cache_dir=hf_cache_dir, device_map="auto")
-        model = AutoModelForCausalLM.from_pretrained("HuggingFaceH4/zephyr-7b-beta", torch_dtype=torch.bfloat16, device_map="auto", cache_dir=hf_cache_dir)
-
-        llm_generator = (tokenizer, model)
-
-    elif engine in [
-        *[f"Qwen-{s}" for s in ["72B", "14B", "7B"]],
-        *[f"Qwen-{s}-Chat" for s in ["72B", "14B", "7B"]]
-    ]:
-        tokenizer = AutoTokenizer.from_pretrained(f"Qwen/{engine}", trust_remote_code=True, cache_dir=hf_cache_dir)
-        model = AutoModelForCausalLM.from_pretrained(f"Qwen/{engine}", device_map="auto", trust_remote_code=True, cache_dir=hf_cache_dir).eval()
-        llm_generator = (tokenizer, model)
-
-    elif engine in ["phi-1", "phi-1.5", "phi-2"]:
-        tokenizer = AutoTokenizer.from_pretrained(f"microsoft/{engine}", trust_remote_code=True, cache_dir=hf_cache_dir)
-        model = AutoModelForCausalLM.from_pretrained(f"microsoft/{engine}", trust_remote_code=True, cache_dir=hf_cache_dir, device_map="cuda")
-        llm_generator = (tokenizer, model)
-
-    elif "Mistral-7B" in engine and "_ft_" in engine:
-
-        ft_model_path = f"./results_ft/{engine}/final"
-
-        if "no_peft" in engine:
-            model = AutoModelForCausalLM.from_pretrained(
-                ft_model_path,
-                device_map="auto",
-                cache_dir=hf_cache_dir
-            )
-            tokenizer = AutoTokenizer.from_pretrained(ft_model_path, cache_dir=hf_cache_dir)
-
-        else:
-
-            lora_config = LoraConfig.from_pretrained(ft_model_path)
-            base_model = lora_config.base_model_name_or_path
-
-            if "LOAD_INSTRUCT" in ft_model_path:
-                print(colored("LOADING INSTRUCT MODEL, which is different from the trained base model... Only for testing!!", "red"))
-                base_model = "mistralai/Mistral-7B-Instruct-v0.2"
-
-            print(f"Loading {engine}")
-
-            bnb_config_ = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_quant_type="nf4",
-                bnb_4bit_compute_dtype=torch.bfloat16,
-                bnb_4bit_use_double_quant=False,
-            )
-            model = AutoModelForCausalLM.from_pretrained(
-                base_model,
-                quantization_config=bnb_config_,
-                torch_dtype=torch.bfloat16,
-                device_map="auto",
-                trust_remote_code=True,
-                cache_dir=hf_cache_dir
-            )
-            print("Loaded base model: ", base_model)
-            model = prepare_model_for_kbit_training(model)
-            model = PeftModel.from_pretrained(model, ft_model_path)
-            print("Loaded peft from ", ft_model_path)
-
-
-            try:
-                tokenizer = AutoTokenizer.from_pretrained(ft_model_path, trust_remote_code=True, cache_dir=hf_cache_dir)
-                print("Loaded tokenizer from ", ft_model_path)
-            except:
-                tokenizer = AutoTokenizer.from_pretrained(
-                    lora_config.base_model_name_or_path, trust_remote_code=True, cache_dir=hf_cache_dir)
-                print("Loaded tokenizer from ", lora_config.base_model_name_or_path)
-
-        llm_generator = (tokenizer, model)
-
-    elif engine in [
-        "Mistral-7B-v0.1",
-        "Mistral-7B-Instruct-v0.2",
-        "Mistral-7B-Instruct-v0.1",
-    ]:
-        print(f"Loading {engine}")
-        tokenizer = AutoTokenizer.from_pretrained(f"mistralai/{engine}", cache_dir=hf_cache_dir, device_map="auto")
-        model = AutoModelForCausalLM.from_pretrained(f"mistralai/{engine}", device_map="auto", cache_dir=hf_cache_dir)
-
-        llm_generator = (tokenizer, model)
-        
-    elif engine in [
-        "Mixtral-8x7B-v0.1",
-        "Mixtral-8x7B-Instruct-v0.1",
-    ]:
-        print(f"Loading {engine}")
-        tokenizer = AutoTokenizer.from_pretrained(f"mistralai/{engine}", cache_dir=hf_cache_dir, device_map="auto")
-        model = AutoModelForCausalLM.from_pretrained(f"mistralai/{engine}", device_map="auto", cache_dir=hf_cache_dir, torch_dtype=torch.float16)
-
-        llm_generator = (tokenizer, model)
-
-    elif engine in [
-        "Mixtral-8x7B-v0.1-4b",
-        "Mixtral-8x7B-Instruct-v0.1-4b",
-    ]:
-        model_name = engine.removesuffix("-4b")
-        print(f"Loading {engine} -> {model_name}")
-        tokenizer = AutoTokenizer.from_pretrained(f"mistralai/{model_name}", cache_dir=hf_cache_dir, device_map="auto")
-        # model = AutoModelForCausalLM.from_pretrained(f"mistralai/{model_name}", device_map="auto", cache_dir=hf_cache_dir, load_in_4bit=True, attn_implementation="flash_attention_2")
-        model = AutoModelForCausalLM.from_pretrained(f"mistralai/{model_name}", device_map="auto", cache_dir=hf_cache_dir, load_in_4bit=True)
-
-        llm_generator = (tokenizer, model)
-
-    elif engine in [
-        *[f"llama_2_{s}_chat" for s in ["7b", "13b", "70b"]],
-        *[f"llama_2_{s}" for s in ["7b", "13b", "70b"]],
-    ]:
-
-        print("Loading llama 2")
-        import re
-
-        model_size = re.findall(r"_(\d+b)", engine)[0]
-        chat = "chat" in engine
-
-        hf_model_name = f"meta-llama/Llama-2-{model_size}-{'chat-' if chat else ''}hf"
-
-        tokenizer = AutoTokenizer.from_pretrained(hf_model_name, token=hf_token, cache_dir=hf_cache_dir)
-
-        if not chat:
-            # monkey patch
-            hf_model_name_chat = f"meta-llama/Llama-2-{model_size}-chat-hf"
-            tokenizer_chat = AutoTokenizer.from_pretrained(hf_model_name_chat, token=hf_token, cache_dir=hf_cache_dir)
-            tokenizer.apply_chat_template = tokenizer_chat.apply_chat_template
-
-        model = AutoModelForCausalLM.from_pretrained(hf_model_name, torch_dtype=torch.float16, token=hf_token, device_map="auto", cache_dir=hf_cache_dir)
-
-        llm_generator = (tokenizer, model)
-
-    elif "gpt" in engine:
-
-        if args.azure_openai:
-            print(colored("Using Azure OPENAI API", "red"))
-            from openai import AzureOpenAI
-
-            if engine == "gpt-3.5-turbo-0125":
-                model = AzureOpenAI(
-                    azure_endpoint="https://petunia-grgur.openai.azure.com/",
-                    api_key=os.getenv("AZURE_OPENAI_KEY_gpt_35_turbo_0125"),
-                    api_version="2024-02-15-preview"
-                )
-
-            elif engine == "gpt-3.5-turbo-1106":
-                model = AzureOpenAI(
-                    azure_endpoint="https://petunia-grgur-gpt-35-turbo-1106.openai.azure.com/",
-                    api_key=os.getenv("AZURE_OPENAI_KEY_gpt_35_turbo_1106"),
-                    api_version="2024-02-15-preview"
-                )
-            else:
-                raise NotImplementedError("Azure endpoint not found.")
-
-        else:
-            print(colored("Using OPENAI API", "red"))
-            from openai import OpenAI
-            openai_api_key = os.environ["OPENAI_API_KEY"]
-            hidden_key = openai_api_key[:4] + "*" * (len(openai_api_key) - 8) + openai_api_key[-4:]
-            print(f"OPENAI KEY: {hidden_key}")
-            model = OpenAI(api_key=openai_api_key)
-
-        tokenizer = tiktoken.get_encoding("cl100k_base")
-        llm_generator = (tokenizer, model)
-
-    elif engine in ["dummy", "interactive"]:
-        llm_generator = None
-
-    else:
-        raise ValueError(f"Undefined model: {engine}")
-
-    print(f"Loaded model: {args.engine}.")
-
-    if "pvq" in args.data_dir:
-        max_n_options = 6
-    elif "donation" in args.data_dir:
-        max_n_options = 6
-    elif "bag" in args.data_dir:
-        max_n_options = 6
-    elif "religion" in args.data_dir:
-        max_n_options = 5
-    else:
-        raise ValueError(f"Undefined number of options for data in {args.data_dir}.")
-
-    if args.simulated_population_type == "permutations":
-        simulated_population = [None]*args.permutations
-        simulated_population_genders = (["M", "F"]*int(np.ceil(args.permutations/2)))[:args.permutations]
-
-    elif args.simulated_population_type == "tolkien_characters":
-        # https://en.wikipedia.org/wiki/List_of_Middle-earth_characters
-        # 50 characters with the longest wikipedia page
-        with open("personas/tolkien_characters/tolkien_characters.txt") as f:
-            simulated_population = [name.rstrip() for name in f.readlines()]
-
-        with open("personas/tolkien_characters/tolkien_characters_genders.txt") as f:
-            simulated_population_genders = [g.rstrip() for g in f.readlines()]
-
-    elif args.simulated_population_type == "famous_people":
-        # source: https://www.biographyonline.net/people/famous-100.html
-        with open("personas/famous_people/famous_people.txt") as f:
-            simulated_population = [name.rstrip() for name in f.readlines()]
-
-        with open("personas/famous_people/famous_people_genders.txt") as f:
-            simulated_population_genders = [g.rstrip() for g in f.readlines()]
-
-
-    all_cors = []
-
-    # list because of permutations
-    subj_acc = [{} for _ in range(len(simulated_population))]
-    subj_lprobs = [{} for _ in range(len(simulated_population))]
-    subj_len = [{} for _ in range(len(simulated_population))]
-    metrics = [{} for _ in range(len(simulated_population))]
-    answers = [{} for _ in range(len(simulated_population))]
-    generations = [{} for _ in range(len(simulated_population))]
-
-    # evaluate model
-    for subject in subjects:
-
-        dev_df = None
-        if subject == "pvq_auto":
-            if not simulated_population_genders:
-                raise ValueError("Simulated population genders are not defined.")
-
-            test_df_dict = {}
-            test_df_dict["F"] = pd.read_csv(
-                os.path.join(args.data_dir, args.eval_set, f"pvq_female_{args.eval_set}.csv"),
-                header=None, keep_default_na=False,
-            )
-
-            test_df_dict["M"] = pd.read_csv(
-                os.path.join(args.data_dir, args.eval_set, f"pvq_male_{args.eval_set}.csv"),
-                header=None, keep_default_na=False,
-            )
-
-            # if the question contains \n in the csv it will get parsed as \\n, we revert it back here to be newline
-            test_df_dict["F"][0][:] = test_df_dict["F"][0][:].str.replace("\\n", "\n")
-            test_df_dict["M"][0][:] = test_df_dict["M"][0][:].str.replace("\\n", "\n")
-
-            assert len(test_df_dict["F"]) == len(test_df_dict["M"])
-            assert test_df_dict["F"].shape == test_df_dict["M"].shape
-
-            num_questions = len(test_df_dict["F"])
-            assert max_n_options == test_df_dict["F"].shape[1] - 2
-            n_options = [max_n_options] * num_questions
-
-        else:
-
-            test_df = pd.read_csv(
-                os.path.join(args.data_dir, args.eval_set, subject + f"_{args.eval_set}.csv"),
-                header=None,
-                keep_default_na=False,
-                dtype=str
-            )
-            n_options = [max_n_options]*len(test_df)
-
-            # if the question contains \n in the csv it will get parsed as \\n, we revert it back here to be newline
-            test_df[0][:] = test_df[0][:].str.replace("\\n", "\n")
-
-            num_questions = len(test_df)
-
-        permutations_dicts = create_permutation_dicts(
-            args,
-            n_options,
-            choices,
-            num_questions=num_questions,
-            population_size=len(simulated_population)
-        )
-
-        assert len(permutations_dicts) == len(simulated_population)
-        assert all([len(part_d) == num_questions for part_d in permutations_dicts])
-
-        # evaluate over population
-        for sim_part_i, (simulated_participant, simulated_participant_gender, participant_perm_dicts) in enumerate(zip(simulated_population, simulated_population_genders, permutations_dicts)):
-            print(f"Simulated participant {sim_part_i}/{len(simulated_population)}")
-
-            if subject == "pvq_auto":
-                test_df = test_df_dict[simulated_participant_gender]
-
-            cors, acc, eval_probs, eval_lprobs, preds, preds_values, gens, gpt_tokens = eval(
-                args=args,
-                subject=subject,
-                engine=engine,
-                dev_df=dev_df,
-                test_df=test_df,
-                participant_perm_dicts=participant_perm_dicts,
-                llm_generator=llm_generator,
-                simulated_participant=simulated_participant,
-            )
-            all_cors.append(cors)
-            gpt_tokens_total['input'] += gpt_tokens['input']
-            gpt_tokens_total['output'] += gpt_tokens['output']
-
-            subj_acc[sim_part_i][subject] = acc
-            subj_lprobs[sim_part_i][subject] = eval_lprobs
-            subj_len[sim_part_i][subject] = num_questions
-            answers[sim_part_i][subject] = list(zip(preds, map(int, preds_values)))
-            generations[sim_part_i][subject] = gens
-
-            if "pvq" in args.data_dir:
-                assert "pvq" in args.experiment_name
-
-                profile_values_idx_json = os.path.join(os.path.join(args.data_dir, "raw"), "values.json")
-
-                with open(profile_values_idx_json) as f:
-                    profile_values_idx = json.load(f)
-
-                profile_values_idx = {k: np.array(v)-1 for k, v in profile_values_idx.items() if k != "_comment"}
-
-                metrics[sim_part_i][subject] = {}
-
-                for profile_value, idxs in profile_values_idx.items():
-                    metrics[sim_part_i][subject][profile_value] = preds_values[idxs].mean() # legacy: todo: remove and save those below
-
-            elif "tolkien_donation" in args.data_dir:
-                assert "donation" in args.experiment_name
-
-                groups = ["elves", "dwarves", "orcs", "humans", "hobbits"]
-
-                donated = (preds_values-1)*2
-                group_donations = np.split(donated, len(groups))
-                assert set([len(g) for g in group_donations]) == {20}
-
-                metrics[sim_part_i][subject] = {
-                    f"Donation {g}": np.mean(g_d) for g, g_d in zip(groups, group_donations)
-                }
-
-            elif "tolkien_bag" in args.data_dir:
-                assert "bag" in args.experiment_name
-
-                groups = ["elves", "dwarves", "orcs", "humans", "hobbits"]
-                group_bag = np.split(preds_values, len(groups))
-                assert set([len(g) for g in group_bag]) == {20}
-
-                metrics[sim_part_i][subject] = {
-                    f"Return {g}": np.mean(g_d) for g, g_d in zip(groups, group_bag)
-                }
-
-            elif "religion" in args.data_dir:
-                assert "religion" in args.experiment_name
-
-                metrics[sim_part_i][subject] = {
-                    f"religion time": np.mean(preds_values)
-                }
-
-            else:
-                raise NotImplementedError("Evaluation not implemented")
-
-        # aggregate to means
-        mean_subj_acc = defaultdict(list)
-        for subj_acc_perm in subj_acc:
-            for k, v in subj_acc_perm.items():
-                mean_subj_acc[k].append(v)
-        mean_subj_acc = {k: np.mean(v) for k,v in mean_subj_acc.items()}
-
-        # assert they are all the same and take the first
-        assert all(subj_len[0] == s for s in subj_len)
-        subj_len = subj_len[0]
-
-        # remap from list of metrics to metrics with lists
-        mean_metrics = defaultdict(lambda: defaultdict(list))
-        for metrics_perm in metrics:
-            for subj, subj_metrics in metrics_perm.items():
-                for metric, value in subj_metrics.items():
-                    mean_metrics[subj][metric].append(value)
-
-        # average metrics
-        mean_metrics = {
-            subj: {
-                metric: np.mean(values) for metric, values in subj_metrics.items()
-            } for subj, subj_metrics in mean_metrics.items()
-        }
-
-        weighted_acc = np.mean(np.concatenate(all_cors))
-
-        pop_metrics = {}
-
-
-        # save results
-        for subj, m in mean_metrics.items():
-            if m:
-                print("Subject: ", subj)
-                for metric, score in m.items():
-                    print(f"{metric} : {score}")
-
-
-        if not os.path.exists(dump_results_dir):
-            os.mkdir(dump_results_dir)
-
-        json_dump_path = os.path.join(dump_results_dir, 'results.json')
-
-        with open(json_dump_path, 'w') as fp:
-            json.dump({
-                "args": vars(args),
-                **mean_subj_acc,
-                **{
-                    "average": weighted_acc
-                },
-                "metrics": mean_metrics,
-                "pop_metrics": pop_metrics,
-                "per_permutation_metrics": metrics,  # legacy todo: remove and update var_viz
-                "per_simulated_participant_metrics": metrics,
-                "simulated_population": simulated_population,
-                "generations": generations,
-                "answers": answers,
-                "lprobs": subj_lprobs,
-                **{
-                    "params": vars(args)
-                }
-            }, fp, indent=4)
-
-        print(f"Results saved to {json_dump_path}")
-
-        print("")
-        print("Average accuracy per subject.")
-        for subject in subjects:
-            print("{} accuracy ({}): {:.3f}".format(subject, subj_len[subject], mean_subj_acc[subject]))
-
-        print("Average accuracy: {:.3f}".format(weighted_acc))
-
-        if pop_metrics:
-            print("pop metrics:", pop_metrics['all']['hist'])
-
-    if args.estimate_gpt_tokens:
-        estimate_and_print_gpt_prices(gpt_tokens_total, engine)
-
-
-if __name__ == "__main__":
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--data_dir", "-d", type=str, required=True)
-    parser.add_argument("--save_dir", "-s", type=str, default="results/results_test")
-    parser.add_argument("--experiment_name", "-n", type=str, default="")
-    parser.add_argument("--pvq-version", type=str, default="pvq_auto", choices=["pvq_auto"])
-    parser.add_argument("--engine", "-e", type=str, default="dummy")
-    parser.add_argument("--format", type=str, default="chat", choices=["chat"])
-    parser.add_argument('--profile', type=str, help='Profile definition in format "k:v;k:v;k:v", ex. "age:35;interests:reading books"')
-    parser.add_argument("--query-in-reply", action="store_true", help="Force the query string as the beginning of the model's reply.")
-    parser.add_argument("--base-model-template", action="store_true")
-    parser.add_argument("--query-prompt", "-qp", type=str, help='Custom query prompt; e.g. use "Answer (as ONE letter):" where applicable.')
-    parser.add_argument("--verbose", "-v", action="store_true")
-    parser.add_argument("--system-message", "-sm", action="store_true")
-    parser.add_argument("--assert-params", action="store_true")
-    parser.add_argument("--cold-run", "-cr", action="store_true")
-    parser.add_argument("--estimate-gpt-tokens", "-t", action="store_true")
-    parser.add_argument("--eval-set", type=str, default="test", choices=["test", "val"])
-    parser.add_argument("--simulate-conversation-theme", type=str, default=None)
-    parser.add_argument("--simulated-conversation-n-messages", type=int, default=5)
-    parser.add_argument("--permute-options", "-po", action="store_true")
-    parser.add_argument("--azure-openai", action="store_true")
-    parser.add_argument("--simulated-human-knows-persona", action="store_true")
-    parser.add_argument("--simulated-population-type", "-pop", type=str, default="tolkien_characters", choices=["permutations", "tolkien_characters", "famous_people", "llm_personas", "user_personas", "anes"])
-    parser.add_argument("--permutations", "-p", type=int, default=1)  # permutations as a population type
-    parser.add_argument("--permute-options-seed", type=str)
-    parser.add_argument("--separator", action="store_true")
-    parser.add_argument("--overwrite", action="store_true")
-    args = parser.parse_args()
-
-    assert args.azure_openai
-    assert args.pvq_version == "pvq_auto"
-    assert args.format == "chat"
-
-    if not args.data_dir.startswith("data"):
-        raise ValueError(f"data_dir should be inside data, and it's {args.data_dir}")
-
-    if args.assert_params:
-        # check parameters for models
-        if "gpt" in args.engine and "instruct" not in args.engine:
-            assert args.system_message
-            assert not args.query_in_reply
-
-        if args.engine in ["phi-2", "phi-1.5", "phi-1", "Qwen-72B", "Qwen-14B", "Qwen-7B"]:
-            # phi is a base model
-            assert args.query_in_reply
-            assert args.system_message
-            assert args.base_model_template
-
-        if ("llama_2" in args.engine and "chat" in args.engine) or "zephyr" in args.engine:
-            assert args.system_message
-            assert args.query_in_reply
-            assert not args.base_model_template
-
-        if "llama_2" in args.engine and "chat" not in args.engine:
-            # base llama_2 model
-            assert args.system_message
-            assert args.query_in_reply
-            assert args.base_model_template
-
-        if "Mistral" in args.engine or "Mixtral" in args.engine:
-            assert args.query_in_reply
-
-            if "Instruct" in args.engine or "ft_roleplay" in args.engine:
-                assert not args.system_message
-                assert not args.base_model_template
-            else:
-                # base model
-                assert args.system_message
-                assert args.base_model_template
-
-    if args.base_model_template:
-        if not args.system_message:
-            raise ValueError("Use system-message with base_model_template -> system is parsed to 'CONTEXT:' ")
-
-    if args.simulated_population_type == "permutations":
-        if args.simulated_human_knows_persona:
-            raise ValueError("simulated_human_knows_persona cannot be used with the permutations simulated population type")
-    else:
-        if args.simulate_conversation_theme and not args.simulated_human_knows_persona:
-            raise ValueError("Use simulated_human_knows_persona.")
-
-    if args.simulate_conversation_theme in ["None", "none"]:
-        args.simulate_conversation_theme = None
-
-    if args.estimate_gpt_tokens:
-        if "gpt" not in args.engine and args.engine != "dummy":
-            raise ValueError("Only gpt-4, gpt-3.5, and dummy engines support estimating GPT tokens")
-
-    if args.permute_options and args.permute_options_seed is None:
-        raise ValueError("Permute options string should be defined for stability")
-
-    if args.cold_run:
-        print("System message:", args.system_message)
-        # just used to show the profile to be used
-        exit()
-
-    if ("gpt-3.5" in args.engine and args.permutations > 50) or ("gpt-4" in args.engine and args.permutations > 5):
-        raise ValueError(f"Are you sure you want to use {args.permutations} with {args.engine}??")
-
-    # assert for plosone gpt or query_in_reply for other models
-    # because query_in_reply can't be implemented for GPTs
-    assert args.query_in_reply or "gpt" in args.engine or "dummy" in args.engine
-
-    if "gpt" in args.engine:
-        if args.query_in_reply:
-            raise ValueError("Can't use query in reply with gpt models")
-
-    start_time = time.time()
-    main(args)
-    end_time = time.time()
-    print("Elapsed time:", str(datetime.timedelta(seconds=end_time-start_time)).split(".")[0])
-
-
diff --git a/evaluate_v3.py b/evaluate_v3.py
new file mode 100644
index 0000000..d75e610
--- /dev/null
+++ b/evaluate_v3.py
@@ -0,0 +1,863 @@
+import argparse
+import random
+from collections import defaultdict
+from pathlib import Path
+import json
+import hashlib
+import time
+import datetime
+import itertools
+import string
+
+from termcolor import colored
+
+from utils import *
+
+from models import *
+
+import numpy as np
+import pandas as pd
+import torch
+import tiktoken
+
+from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList, BitsAndBytesConfig
+from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
+
+from personas.utils import simulated_participant_to_name
+
+
+hf_cache_dir = get_hf_cache_dir()
+os.environ['HF_HOME'] = hf_cache_dir
+
+
+# opening question used to start the simulated conversation for each theme
+opening_questions_for_themes = {
+    "poem": "Hello, let's write a poem together. You start by the first verse I'll add the second one, and so on.",
+    "joke": "Tell me a joke.",
+    "history": "What is the significance of the battle of Hastings. Answer in two sentences.",  # slight collapse
+    "chess": "1. e4",
+    "grammar": "Can you check this sentence for grammar? \n Whilst Jane was waiting to meet hers friend their nose started bleeding.",
+}
+
+
+def create_permutation_dicts(args, n_options, choices, num_questions, population_size=None):
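+    # For each simulated participant, build one dict per question mapping choice
+    # letters ("A", "B", ...) to option indices. With --permute-options these are
+    # random permutations seeded by --permute-options-seed; otherwise identity
+    # mappings shared by all participants.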
+
+    if args.permute_options:
+
+        # sample permutations based on given seed -> should correspond to different contexts
+        original_state = random.getstate()  # save the original state
+        random.seed(args.permute_options_seed)
+
+        if len(set(n_options)) == 1:
+
+            if n_options[0] > 9:
+                raise ValueError("Number of options too big. Refactor code below to use it.")
+
+            all_permutations = list(itertools.permutations(range(n_options[0])))
+
+            permutations = random.choices(all_permutations, k=num_questions*population_size)
+            permutations = [permutations[part_i:part_i+num_questions] for part_i in range(population_size)]
+
+        else:
+            # not all questions have the same number of options
+
+            # string seed to int seed
+            int_seed = int(hashlib.md5(args.permute_options_seed.encode('utf-8')).hexdigest(), 16)
+            rng = np.random.default_rng(seed=int_seed)
+
+            permutations = [
+                [tuple(rng.permutation(n_options_q)) for n_options_q in n_options] for _ in range(population_size)
+            ]
+
+        permutations_dicts = [
+            [
+                dict(zip(choices, perm)) for perm in part_perms
+            ] for part_perms in permutations
+        ]
+
+        # revert original state
+        random.setstate(original_state)
+
+    else:
+        if len(set(n_options)) != 1:
+            raise NotImplementedError("Unpermuted option orders are only implemented for a uniform number of options.")
+
+        permutations_dicts = [
+            [{c: i for i, c in enumerate(choices[:n_opt])} for n_opt in n_options]
+        ] * population_size
+
+    return permutations_dicts
+
+
+def create_simulated_messages(conv, last="user"):
+    # assign alternating user/assistant roles to the conversation turns, so that the final message has the role given by `last`
+    if last == "user":
+        # last role is user
+        sim_conv = list(zip(["user", "assistant"] * (len(conv) // 2 + 1), conv[::-1]))[::-1]
+    elif last == "assistant":
+        # last role is assistant
+        sim_conv = list(zip(["assistant", "user"] * (len(conv) // 2 + 1), conv[::-1]))[::-1]
+    else:
+        raise ValueError("last must be either user or assistant")
+
+    sim_conv_messages = [{"role": role, "content": msg} for role, msg in sim_conv]
+
+    return sim_conv_messages
+
+
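+# Stopping criterion that halts generation once any stop word (compared
+# case-insensitively) appears in the decoded continuation of the prompt.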
+class StoppingCriteriaSub(StoppingCriteria):
+    def __init__(self, stops, tokenizer, original_input_ids):
+        super().__init__()
+        self.stops = [s.upper() for s in stops]
+        self.tokenizer = tokenizer
+        self.original_input_ids = original_input_ids
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
+        generated_ids = input_ids[0][len(self.original_input_ids[0]):]
+        generation = self.tokenizer.decode(generated_ids).upper()
+        return any([stop in generation for stop in self.stops])
+
+
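+# Simulate a short themed conversation by alternating between the evaluated model
+# acting as the persona (even turns) and a model acting as a simulated human user
+# (odd turns), starting from the theme's opening question.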
+def simulate_conversation(args, engine, sim_engine, model_set_persona_string=None, llm_generator=None, simulated_participant=None):
+
+    opening_question = opening_questions_for_themes[args.simulated_conversation_theme]
+
+    conversation = [opening_question]
+
+    # simulate conversation
+    assert args.simulated_conversation_n_messages % 2 == 1  # must be odd so that the last one is GPT as simulated persona
+
+    for msg_i in range(args.simulated_conversation_n_messages):
+        if args.verbose:
+            print(f"Simulated conv msg {msg_i}")
+
+        # assign roles to messages - alternating, last one user
+        simulated_conv_messages = create_simulated_messages(conversation, last="user")
+        simulated_participant_name = simulated_participant_to_name(simulated_participant, args.simulated_population_type)
+        labels_dict = {
+            "persona": {
+                "assistant_label": simulated_participant_name.upper(),
+                "user_label": "USER",
+                "system_label": "CONTEXT"
+            },
+            "human": {
+                "assistant_label": "HUMAN",
+                "user_label": f"{simulated_participant_name.upper()} (CHATBOT)" if args.simulated_human_knows_persona else "CHATBOT",
+                "system_label": "CONTEXT"
+            }
+        }
+        stop_words_up = [f"\n{v}:" for v in labels_dict["persona"].values()] + [f"\n{v}:" for v in labels_dict["human"].values()]
+        # also add similar words wo whitespace ex. GANDALF (CHATBOT) and GANDALF(CHATBOT)
+        stop_words_up += [s.replace(" ", "") for s in stop_words_up if " " in s]
+
+        if msg_i % 2 == 0:
+            # even -> gpt as a persona
+            assert simulated_conv_messages[0]['role'] == "user"
+
+            if model_set_persona_string:
+                simulated_conv_messages = [{
+                    "role": "system" if llm_generator.system_message else "user",
+                    "content": model_set_persona_string
+                }] + simulated_conv_messages
+
+            engine_ = engine
+            assistant_label = labels_dict["persona"]["assistant_label"]
+            user_label = labels_dict["persona"]["user_label"]
+            system_label = labels_dict["persona"]["system_label"]
+
+        else:
+            # gpt as human
+            assert simulated_conv_messages[0]['role'] == "assistant"
+
+            # the user doesn't know the chatbot's persona -> change this?
+            # if args.base_model_template:
+            if llm_generator.base_model_template:
+                if args.simulated_human_knows_persona:
+                    sys_msg = f"The following is a conversation between a human and a chatbot. The chatbot is pretending to be {simulated_participant_name}. The human's every reply must be in one sentence only."
+                else:
+                    sys_msg = f"The following is a conversation between a human and a chatbot. The human's every reply must be in one sentence only."
+            else:
+                if args.simulated_human_knows_persona:
+                    sys_msg = f"You are simulating a human using a chatbot. The chatbot is pretending to be {simulated_participant_name}. Your every reply must be in one sentence only."
+                else:
+                    sys_msg = f"You are simulating a human using a chatbot. Your every reply must be in one sentence only."
+
+            simulated_conv_messages = [{
+                "role": "system" if llm_generator.system_message else "user",
+                "content": sys_msg
+            }] + simulated_conv_messages
+
+            assistant_label = labels_dict["human"]["assistant_label"]
+            user_label = labels_dict["human"]["user_label"]
+            system_label = labels_dict["human"]["system_label"]
+
+        # if not args.base_model_template:
+        if not llm_generator.base_model_template:
+            simulated_conv_messages = fix_alternating_msg_order(simulated_conv_messages)
+
+        response = llm_generator.generate(
+            messages=simulated_conv_messages,
+            assistant_label=assistant_label,
+            user_label=user_label,
+            system_label=system_label,
+            stop_words_up=stop_words_up
+        )
+
+        if args.verbose:
+            print_chat_messages(simulated_conv_messages)
+
+
+        # llm_generator_type = type(llm_generator)
+        # if llm_generator_type == HuggingFaceModel:
+        #     response = llm_generator.generate(
+        #         messages=simulated_conv_messages,
+        #         generation_kwargs=dict(
+        #             max_new_tokens=args.simulated_conversation_msg_max_tokens,
+        #             do_sample=True,
+        #             top_p=args.simulated_conversation_top_p,
+        #             temperature=args.simulated_conversation_temp,
+        #             # top_k=50,
+        #             # repetition_penalty=1.2,  # logit / (T * penalty*bool(token present) )
+        #             num_beams=1,
+        #         ),
+        #         assistant_label=assistant_label,
+        #         user_label=user_label,
+        #         system_label=system_label,
+        #         stop_words_up=stop_words_up
+        #     )
+        #
+        # elif llm_generator_type == OpenAIModel:
+        #     response = llm_generator.generate(
+        #         messages=simulated_conv_messages,
+        #         generation_kwargs=dict(
+        #             max_tokens=args.simulated_conversation_msg_max_tokens,
+        #             top_p=args.simulated_conversation_top_p,
+        #             temperature=args.simulated_conversation_temp,
+        #             # not the same as hf repetition_penalty
+        #             # presence_penalty=0.2,  # logit - penalty*bool(token present)
+        #             n=1,
+        #         )
+        #     )
+        # elif llm_generator_type in [InteractiveModel, DummyModel]:
+        #     response = llm_generator.generate()
+        #
+        # else:
+        #     raise NotImplementedError(f"Simulated conversations not implemented for {engine_}")
+
+        # if args.base_model_template:
+        if llm_generator.base_model_template:
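+            # base-model templates may keep writing the other speaker's turn;
+            # truncate at the first stop label, e.g. "Sure.\nUSER: hi" -> "Sure."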
+            response_up = response.upper()
+            stop_word_ind = np.min([response_up.index(sw) if sw in response_up else np.inf for sw in stop_words_up])
+            if stop_word_ind != np.inf:
+                stop_word_ind = int(stop_word_ind)
+                response = response[:stop_word_ind]
+
+        conversation.append(response)
+
+        if args.verbose:
+            print(f"--> {response}")
+
+        messages_conv = create_simulated_messages(conversation, last="assistant")
+        messages_conv_hash = hash_chat_conv(messages_conv)
+
+    return messages_conv, messages_conv_hash
+
+
+def map_choice_to_number(letter, permutations_dict):
+    # A-F -> 1-6
+    # find index of letter in choices and add 1
+    number = permutations_dict[letter] + 1
+    return number
+
+
+def map_number_to_choice(number, inv_permutations_dict):
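+    # 1-6 -> A-F (inverse of map_choice_to_number)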
+    choice = inv_permutations_dict[number-1]
+    return choice
+
+
+timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
+print("timestamp:", timestamp)
+
+
+hf_token = os.environ["HF_TOKEN"]
+hidden_token = hf_token[:6] + "*" * (len(hf_token)-12) + hf_token[-6:]
+print("HF TOKEN:", hidden_token)
+
+
+# choices = ["A", "B", "C", "D", "E", "F"]
+choices = list(string.ascii_uppercase)
+
+llama_dir = "/gpfswork/rech/imi/utu57ed/llama/llama_files/"
+
+
+def get_prompt_skeleton(experiment_name, args, simulated_participant, base_model_template):
+
+    if "pvq" in experiment_name:
+        assert "pvq" in args.data_dir
+        questionnaire_description = "Here we briefly describe some people. Please read each description and think about how much each person is or is not like you. Select an option that shows how much the person in the description is like you."
+        questionnaire_description_empty = False
+
+    elif "donation" in experiment_name:
+        assert "donation" in args.data_dir
+        questionnaire_description = ""
+        questionnaire_description_empty = True
+
+    elif "bag" in experiment_name:
+        assert "bag" in args.data_dir
+        questionnaire_description = ""
+        questionnaire_description_empty = True
+
+    elif "religion" in experiment_name:
+        assert "religion" in args.data_dir
+        questionnaire_description = ""
+        questionnaire_description_empty = True
+
+    else:
+        raise ValueError(f"Experiment name is ill-defined {args.experiment_name}")
+
+    if base_model_template:
+        prefix = "The following is a conversation with"
+    else:
+        prefix = "You are"
+
+    if args.simulated_population_type in ["famous_people"]:
+        set_persona_str = f"{prefix} {simulated_participant}"
+    elif args.simulated_population_type in ["tolkien_characters"]:
+        set_persona_str = f"{prefix} {simulated_participant} from J. R. R. Tolkien's Middle-earth legendarium."
+    elif args.simulated_population_type == "permutations":
+        set_persona_str = ""
+    else:
+        raise ValueError("Unknown population type")
+
+    if args.query_prompt:
+        query_str = args.query_prompt
+    else:
+        query_str = "Answer: ("
+
+    assert questionnaire_description_empty == (questionnaire_description == "")
+
+    prompt_skeleton = {
+        "set_persona_str": set_persona_str,  # remove newline from the end
+        "questionnaire_description": questionnaire_description,
+        "query_str": f"{query_str}",
+    }
+
+    return prompt_skeleton
+
+
+def format_example(df, idx, experiment_name, args, permutations_dict, simulated_participant, base_model_template=None):
+    # an item contains a question and suggested answers
+    item_str = df.iloc[idx, 0]
+    k = df.shape[1] - 2
+
+    # extract options
+    num_options = 0
+    options_strings = []
+    for j in range(k):
+        op_str = df.iloc[idx, j+1]
+
+        if op_str == "undef":
+            continue
+
+        options_strings.append(op_str)
+
+        num_options += 1
+
+    for ch in choices[:num_options]:
+        item_str += "\n({}) {}".format(ch, options_strings[permutations_dict[ch]])
+
+    prompt = get_prompt_skeleton(
+        experiment_name=experiment_name,
+        args=args,
+        simulated_participant=simulated_participant,
+        base_model_template=base_model_template
+    )
+
+    prompt["item_str"] = item_str
+
+    return prompt, num_options
+
+
+def hash_chat_conv(msgs_conv):
+    json_string = json.dumps(msgs_conv)
+
+    # Create a SHA256 hash of the string
+    hash_object = hashlib.sha256(json_string.encode())
+
+    # Get the hexadecimal representation of the hash
+    hex_dig = hash_object.hexdigest()
+
+    return hex_dig
+
+
+def eval(args, engine, test_df, participant_perm_dicts, llm_generator=None, simulated_participant=None):
+    cors = []
+    all_probs = []
+    all_lprobs = []
+    all_answers = []
+    all_generations = []
+    all_scores = []
+
+    # hashing for simulated conversations
+    messages_conv = None
+    messages_conv_hash = None
+
+    gpt_token_counter = {"input": 0, "output": 0}
+
+    assert test_df.shape[0] == len(participant_perm_dicts)
+
+    for item_i, permutations_dict in enumerate(participant_perm_dicts):
+        inv_permutations_dict = {v: k for k, v in permutations_dict.items()}
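+        # permutations_dict maps a displayed letter to an option index; the inverse maps back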
+
+        if item_i % 20 == 0:
+            print(f"Eval progress: {item_i}/{test_df.shape[0]}")
+
+        #  e.g. A -> A little like me
+        label_2_text_option_dict = {
+            label: test_df.iloc[item_i, score+1] for label, score in permutations_dict.items()
+        }
+        prompt, n_options = format_example(
+            test_df, item_i,
+            experiment_name=args.experiment_name,
+            args=args,
+            permutations_dict=permutations_dict,
+            simulated_participant=simulated_participant,
+            base_model_template=llm_generator.base_model_template
+        )
+
+        assert n_options == len(permutations_dict)
+        answers = choices[:n_options]
+
+        assert all([a in permutations_dict for a in answers])
+
+        label = test_df.iloc[item_i, test_df.shape[1]-1]
+        assert label in answers + ["undef"]
+
+        if args.estimate_gpt_tokens:
+            gpt_tokenizer = tiktoken.get_encoding("cl100k_base")
+        else:
+            gpt_tokenizer = None
+
+        if args.simulated_conversation_theme:
+
+            set_persona_str = prompt["set_persona_str"]
+            if messages_conv is None:
+                if args.verbose:
+                    print("SIMULATING CONVERSATION")
+
+                messages_conv, messages_conv_hash = simulate_conversation(
+                    args=args,
+                    engine=engine,
+                    sim_engine=engine,
+                    model_set_persona_string=set_persona_str,
+                    simulated_participant=simulated_participant,
+                    llm_generator=llm_generator,
+                )
+
+                if args.estimate_gpt_tokens:
+                    # topic setting msg
+                    current_input_tokens = len(gpt_tokenizer.encode(messages_conv[0]['content']))
+
+                    for msg_i in range(1, len(messages_conv)):
+                        current_output_tokens = len(gpt_tokenizer.encode(messages_conv[msg_i]['content']))
+                        gpt_token_counter['input'] += current_input_tokens
+                        gpt_token_counter['output'] += current_output_tokens
+
+                        # add for next message
+                        current_input_tokens += current_output_tokens
+
+            else:
+                if args.verbose:
+                    print("LOADING CACHED CONVERSATION")
+                assert hash_chat_conv(messages_conv) == messages_conv_hash
+
+        if args.estimate_gpt_tokens:
+            # gpt params
+            messages = construct_messages(
+                prompt=prompt,
+                system_message=True,
+                messages_conv=messages_conv if args.simulated_conversation_theme else None,
+            )
+            n_input_tokens = sum([len(gpt_tokenizer.encode(msg['content'])) for msg in messages])
+
+            gpt_token_counter['input'] += n_input_tokens
+            gpt_token_counter['output'] += 1
+
+        messages = construct_messages(
+            prompt=prompt,
+            system_message=llm_generator.system_message,
+            messages_conv=messages_conv if args.simulated_conversation_theme else None,
+        )
+
+        if args.verbose:
+            print_chat_messages(messages)
+
+        generation, lprobs = llm_generator.predict(
+            messages=messages,
+            answers=answers,
+            label_2_text_option_dict=label_2_text_option_dict,
+            query_string=prompt['query_str'],
+            assistant_label=simulated_participant_to_name(simulated_participant, args.simulated_population_type).upper()
+        )
+
+        probs = softmax(np.array(lprobs))
+        pred = answers[int(np.argmax(lprobs))]
+        cor = pred == label
+        score = map_choice_to_number(pred, permutations_dict)
+
+        if args.verbose:
+            print(colored(f"Pred:{pred} (Generation:{generation}; Score: {score})", "green"))
+            print("------------------")
+
+        cors.append(cor)
+        all_lprobs.append(lprobs)
+        all_probs.append(probs)
+        all_answers.append(pred)
+        all_generations.append(generation)
+        all_scores.append(score)
+
+    cors = np.array(cors)
+    all_scores = np.array(all_scores)
+
+    if args.estimate_gpt_tokens:
+        estimate_and_print_gpt_prices(gpt_token_counter, engine)
+
+    return cors, all_probs, all_lprobs, all_answers, all_scores, all_generations, gpt_token_counter
+
+
+def main(args):
+    engine = args.engine
+    print("Engine:", engine)
+
+    subjects = sorted([f.split("_test.csv")[0] for f in os.listdir(os.path.join(args.data_dir, "test")) if "_test.csv" in f])
+
+    # add timestamp to dir_name
+    dump_results_dir = Path(args.save_dir)
+    dump_results_dir = dump_results_dir.with_name(dump_results_dir.name+f"_{timestamp}")
+
+    if not args.overwrite:
+        # strip the timestamp suffix (str.rstrip removes characters, not a suffix)
+        prev_jsons = list(dump_results_dir.parent.glob(dump_results_dir.name[:-len(timestamp)] + "*/results.json"))
+        if len(prev_jsons) > 0:
+            raise RuntimeError(f"Previous versions of this run were found: {prev_jsons}")
+
+    else:
+        old_jsons = list(Path(dump_results_dir).parent.glob(f"*{args.permute_options_seed}*{args.simulated_conversation_theme}*/results.json"))
+
+        for old_json in old_jsons:
+            new_json = old_json.parent / "results_old.json.backup"
+            old_json.rename(new_json)
+            print(f"Renamed: {old_json} --> {new_json}")
+
+    os.makedirs(dump_results_dir, exist_ok=True)
+    print("Savedir: ", dump_results_dir)
+
+    # Data preparation
+    if len(subjects) == 0:
+        raise ValueError("No subjects found.")
+
+    if "data_pvq" in args.data_dir:
+        assert "pvq" in args.experiment_name
+
+        # assert set(subjects_to_evaluate).issubset(subjects)
+        subjects = ["pvq_auto"]
+
+    print("Args:", args)
+    print("Subjects:", subjects)
+
+    gpt_tokens_total = {"input": 0, "output": 0}
+
+    llm_generator = create_model(
+        engine,
+        additional_model_args={"use_azure": args.azure_openai}
+    )
+
+    if "pvq" in args.data_dir:
+        max_n_options = 6
+    elif "donation" in args.data_dir:
+        max_n_options = 6
+    elif "bag" in args.data_dir:
+        max_n_options = 6
+    elif "religion" in args.data_dir:
+        max_n_options = 5
+    else:
+        raise ValueError(f"Undefined number of options for data in {args.data_dir}.")
+
+    if args.simulated_population_type == "permutations":
+        simulated_population = [None]*args.permutations
+        simulated_population_genders = (["M", "F"]*int(np.ceil(args.permutations/2)))[:args.permutations]
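+        # gender here only selects which PVQ questionnaire variant (male/female wording) is used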
+
+    elif args.simulated_population_type == "tolkien_characters":
+        # https://en.wikipedia.org/wiki/List_of_Middle-earth_characters
+        # 50 characters with the longest wikipedia page
+        with open("personas/tolkien_characters/tolkien_characters.txt") as f:
+            simulated_population = [name.rstrip() for name in f.readlines()]
+
+        with open("personas/tolkien_characters/tolkien_characters_genders.txt") as f:
+            simulated_population_genders = [g.rstrip() for g in f.readlines()]
+
+    elif args.simulated_population_type == "famous_people":
+        # source: https://www.biographyonline.net/people/famous-100.html
+        with open("personas/famous_people/famous_people.txt") as f:
+            simulated_population = [name.rstrip() for name in f.readlines()]
+
+        with open("personas/famous_people/famous_people_genders.txt") as f:
+            simulated_population_genders = [g.rstrip() for g in f.readlines()]
+
+    all_cors = []
+
+    # list because of permutations
+    subj_lprobs = [{} for _ in range(len(simulated_population))]
+    subj_len = [{} for _ in range(len(simulated_population))]
+    metrics = [{} for _ in range(len(simulated_population))]
+    answers = [{} for _ in range(len(simulated_population))]
+    generations = [{} for _ in range(len(simulated_population))]
+
+    # evaluate model
+    for subject in subjects:
+
+        if subject == "pvq_auto":
+            if not simulated_population_genders:
+                raise ValueError("Simulated population genders is not defined.")
+
+            test_df_dict = {}
+            test_df_dict["F"] = pd.read_csv(
+                os.path.join(args.data_dir, args.eval_set, f"pvq_female_{args.eval_set}.csv"),
+                header=None, keep_default_na=False,
+            )
+
+            test_df_dict["M"] = pd.read_csv(
+                os.path.join(args.data_dir, args.eval_set, f"pvq_male_{args.eval_set}.csv"),
+                header=None, keep_default_na=False,
+            )
+
+            # if the question contains \n in the csv it will get parsed as \\n, we revert it back here to be newline
+            test_df_dict["F"][0][:] = test_df_dict["F"][0][:].str.replace("\\n", "\n")
+            test_df_dict["M"][0][:] = test_df_dict["M"][0][:].str.replace("\\n", "\n")
+
+            assert len(test_df_dict["F"]) == len(test_df_dict["M"])
+            assert test_df_dict["F"].shape == test_df_dict["M"].shape
+
+            num_questions = len(test_df_dict["F"])
+            assert max_n_options == test_df_dict["F"].shape[1] - 2
+            n_options = [max_n_options] * num_questions
+
+        else:
+
+            test_df = pd.read_csv(
+                os.path.join(args.data_dir, args.eval_set, subject + f"_{args.eval_set}.csv"),
+                header=None,
+                keep_default_na=False,
+                dtype=str
+            )
+            n_options = [max_n_options]*len(test_df)
+
+            # if the question contains \n in the csv it will get parsed as \\n, we revert it back here to be newline
+            test_df[0][:] = test_df[0][:].str.replace("\\n", "\n")
+
+            num_questions = len(test_df)
+
+        permutations_dicts = create_permutation_dicts(
+            args,
+            n_options,
+            choices,
+            num_questions=num_questions,
+            population_size=len(simulated_population)
+        )
+
+        assert len(permutations_dicts) == len(simulated_population)
+        assert all([len(part_d) == num_questions for part_d in permutations_dicts])
+
+        pop_start_time = time.time()
+        # evaluate over population
+        for sim_part_i, (simulated_participant, simulated_participant_gender, participant_perm_dicts) in enumerate(zip(simulated_population, simulated_population_genders, permutations_dicts)):
+
+            if sim_part_i > 0:
+                eta = estimate_eta(start_time=pop_start_time, progress=sim_part_i/len(simulated_population))
+                eta_str = "ETA: {:.0f}h {:.0f}m {:.2f}s".format(*secs_2_hms(eta))
+
+            else:
+                eta_str = ""
+
+            print(f"Simulated participant {sim_part_i}/{len(simulated_population)} {eta_str}")
+
+            if subject == "pvq_auto":
+                test_df = test_df_dict[simulated_participant_gender]
+
+            cors, eval_probs, eval_lprobs, preds, preds_values, gens, gpt_tokens = eval(
+                args=args,
+                engine=engine,
+                test_df=test_df,
+                participant_perm_dicts=participant_perm_dicts,
+                llm_generator=llm_generator,
+                simulated_participant=simulated_participant,
+            )
+            all_cors.append(cors)
+            gpt_tokens_total['input'] += gpt_tokens['input']
+            gpt_tokens_total['output'] += gpt_tokens['output']
+
+            subj_lprobs[sim_part_i][subject] = eval_lprobs
+            subj_len[sim_part_i][subject] = num_questions
+            answers[sim_part_i][subject] = list(zip(preds, map(int, preds_values)))
+            generations[sim_part_i][subject] = gens
+
+            if "pvq" in args.data_dir:
+                assert "pvq" in args.experiment_name
+
+                profile_values_idx_json = os.path.join(os.path.join(args.data_dir, "raw"), "values.json")
+
+                with open(profile_values_idx_json) as f:
+                    profile_values_idx = json.load(f)
+
+                profile_values_idx = {k: np.array(v)-1 for k, v in profile_values_idx.items() if k != "_comment"}
+
+                metrics[sim_part_i][subject] = {}
+
+                for profile_value, idxs in profile_values_idx.items():
+                    metrics[sim_part_i][subject][profile_value] = preds_values[idxs].mean() # legacy: todo: remove and save those below
+
+            elif "tolkien_donation" in args.data_dir:
+                assert "donation" in args.experiment_name
+
+                groups = ["elves", "dwarves", "orcs", "humans", "hobbits"]
+
+                donated = (preds_values-1)*2
+                group_donations = np.split(donated, len(groups))
+                assert set([len(g) for g in group_donations]) == {20}
+
+                metrics[sim_part_i][subject] = {
+                    f"Donation {g}": np.mean(g_d) for g, g_d in zip(groups, group_donations)
+                }
+
+            elif "tolkien_bag" in args.data_dir:
+                assert "bag" in args.experiment_name
+
+                groups = ["elves", "dwarves", "orcs", "humans", "hobbits"]
+                group_bag = np.split(preds_values, len(groups))
+                assert set([len(g) for g in group_bag]) == {20}
+
+                metrics[sim_part_i][subject] = {
+                    f"Return {g}": np.mean(g_d) for g, g_d in zip(groups, group_bag)
+                }
+
+            elif "religion" in args.data_dir:
+                assert "religion" in args.experiment_name
+
+                metrics[sim_part_i][subject] = {
+                    f"religion time": np.mean(preds_values)
+                }
+
+            else:
+                raise NotImplementedError("Evaluation not implemented")
+
+        # assert all entries are the same and take the first
+        assert all(subj_len[0] == s for s in subj_len)
+        subj_len = subj_len[0]
+
+        # remap from list of metrics to metrics with lists
+        mean_metrics = defaultdict(lambda: defaultdict(list))
+        for metrics_perm in metrics:
+            for subj, subj_metrics in metrics_perm.items():
+                for metric, value in subj_metrics.items():
+                    mean_metrics[subj][metric].append(value)
+
+        # average metrics
+        mean_metrics = {
+            subj: {
+                metric: np.mean(values) for metric, values in subj_metrics.items()
+            } for subj, subj_metrics in mean_metrics.items()
+        }
+
+        pop_metrics = {}
+
+        # save results
+        for subj, m in mean_metrics.items():
+            if m:
+                print("Subject: ", subj)
+                for metric, score in m.items():
+                    print(f"{metric} : {score}")
+
+        if not os.path.exists(dump_results_dir):
+            os.mkdir(dump_results_dir)
+
+        json_dump_path = os.path.join(dump_results_dir, 'results.json')
+
+        with open(json_dump_path, 'w') as fp:
+            json.dump({
+                "args": vars(args),
+                "metrics": mean_metrics,
+                "pop_metrics": pop_metrics,
+                "per_permutation_metrics": metrics,  # legacy todo: remove and update var_viz
+                "per_simulated_participant_metrics": metrics,
+                "simulated_population": simulated_population,
+                "generations": generations,
+                "answers": answers,
+                "lprobs": subj_lprobs,
+                "params": vars(args)
+            }, fp, indent=4)
+
+        print(f"Results saved to {json_dump_path}")
+
+        if pop_metrics:
+            print("pop metrics:", pop_metrics['all']['hist'])
+
+    if args.estimate_gpt_tokens:
+        estimate_and_print_gpt_prices(gpt_tokens_total, engine)
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data_dir", "-d", type=str, required=True)
+    parser.add_argument("--save_dir", "-s", type=str, default="results/results_test")
+    parser.add_argument("--experiment_name", "-n", type=str, default="")
+    parser.add_argument("--pvq-version", type=str, default="pvq_auto", choices=["pvq_auto"])
+    parser.add_argument("--engine", "-e", type=str, default="dummy")
+    parser.add_argument("--format", type=str, default="chat", choices=["chat"])
+    parser.add_argument('--profile', type=str, help='Profile definition in format "k:v;k:v;k:v", ex. "age:35;interests:reading books"')
+    parser.add_argument("--query-prompt", "-qp", type=str, help="Use Answer(as ONE letter): where applicable.")
+    parser.add_argument("--verbose", "-v", action="store_true")
+    parser.add_argument("--assert-params", action="store_true")
+    parser.add_argument("--estimate-gpt-tokens", "-t", action="store_true")
+    parser.add_argument("--eval-set", type=str, default="test", choices=["test", "val"])
+    # parser.add_argument("--simulated-conversation-msg-max-tokens", type=int, default=100)
+    # parser.add_argument("--simulated-conversation-top-p", type=float, default=0.9)
+    # parser.add_argument("--simulated-conversation-temp", type=float, default=0.7)
+    parser.add_argument("--simulated-conversation-theme", type=str, default=None)
+    parser.add_argument("--simulated-conversation-n-messages", type=int, default=5)
+    parser.add_argument("--permute-options", "-po", action="store_true")
+    parser.add_argument("--azure-openai", action="store_true")
+    parser.add_argument("--simulated-human-knows-persona", action="store_true")
+    parser.add_argument("--simulated-population-type", "-pop", type=str, default="tolkien_characters", choices=["permutations", "tolkien_characters", "famous_people", "llm_personas", "user_personas", "anes"])
+    parser.add_argument("--permutations", "-p", type=int, default=1)  # permutations as a population type
+    parser.add_argument("--permute-options-seed", type=str)
+    parser.add_argument("--overwrite", action="store_true")
+    args = parser.parse_args()
+
+    assert args.pvq_version == "pvq_auto"
+
+    if not args.data_dir.startswith("data"):
+        raise ValueError(f"data_dir should be inside data, and it's {args.data_dir}")
+
+    if args.simulated_population_type == "permutations":
+        if args.simulated_human_knows_persona:
+            raise ValueError("Use simulated_human_knows_persona cannot be used with permutations sim. population type")
+    else:
+        if args.simulated_conversation_theme and not args.simulated_human_knows_persona:
+            raise ValueError("Use simulated_human_knows_persona.")
+
+    if args.simulated_conversation_theme in ["None", "none"]:
+        args.simulated_conversation_theme = None
+
+    if args.estimate_gpt_tokens:
+        if "gpt" not in args.engine and args.engine != "dummy":
+            raise ValueError("Only gpt-4 gpt-3 and dummy support estimating GPT tokens")
+
+    if args.permute_options and args.permute_options_seed is None:
+        raise ValueError("Permute options string should be defined for stability")
+
+    if ("gpt-3.5" in args.engine and args.permutations > 50) or ("gpt-4" in args.engine and args.permutations > 5):
+        raise ValueError(f"Are you sure you want to use {args.permutations} with {args.engine}??")
+
+    start_time = time.time()
+    main(args)
+    end_time = time.time()
+    print("Elapsed time:", str(datetime.timedelta(seconds=end_time-start_time)).split(".")[0])
\ No newline at end of file
diff --git a/iclr_evaluations.sh b/iclr_evaluations.sh
deleted file mode 100644
index 919e405..0000000
--- a/iclr_evaluations.sh
+++ /dev/null
@@ -1,99 +0,0 @@
-# table of formats
-# correlations
-#python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_format_gpt-3.5-turbo-0613_perm_50_format/* --save --filename pvq_formats_gpt35j_50 | grep -E -A 1 "^Mean|pvq_formats"
-#python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_format_gpt-3.5-turbo-0301_perm_50_format/* --save --filename pvq_formats_gpt35m_50 | grep -E -A 1 "^Mean|pvq_formats"
-#python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_format_up_llama2_70b_instruct_v2_perm_50_format/* --save --filename pvq_formats_llama2_50 | grep -E -A 1 "^Mean|pvq_formats"
-#python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_format_up_llama_60b_instruct_perm_50_format/* --save --filename pvq_formats_llama_50 | grep -E -A 1 "^Mean|pvq_formats"
-#python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_format_openassistant_rlhf2_llama30b_perm_50_format/* --save --filename pvq_formats_oa_50 | grep -E -A 1 "^Mean|pvq_formats"
-#python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_format_zephyr-7b-beta_perm_50_format/* --save --filename pvq_formats_zephyr_50 | grep -E -A 1 "^Mean|pvq_formats"
-
-## Mean SD
-#python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_format_gpt-3.5-turbo-0613_perm_50_format/* --save --filename pvq_formats_gpt35j_50 | grep '^Mean.*SD (avg per\|pvq_format'
-#python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_format_gpt-3.5-turbo-0301_perm_50_format/* --save --filename pvq_formats_gpt35m_50 | grep '^Mean.*SD (avg per\|pvq_format'
-#python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_format_up_llama2_70b_instruct_v2_perm_50_format/* --save --filename pvq_formats_llama2_50 | grep '^Mean.*SD (avg per\|pvq_format'
-#python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_format_up_llama_60b_instruct_perm_50_format/* --save --filename pvq_formats_llama_50 | grep '^Mean.*SD (avg per\|pvq_format'
-#python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_format_openassistant_rlhf2_llama30b_perm_50_format/* --save --filename pvq_formats_oa_50 | grep '^Mean.*SD (avg per\|pvq_format'
-#python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_format_zephyr-7b-beta_perm_50_format/* --save --filename pvq_formats_zephyr_50 | grep '^Mean.*SD (avg per\|pvq_format'
-
-# Rank-order stability
-#python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_format_gpt-3.5-turbo-0613_perm_50_format/* --save --filename pvq_formats_gpt35j_50 | grep -A 2 '^Average rank-order\|pvq_format'
-#python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_format_gpt-3.5-turbo-0301_perm_50_format/* --save --filename pvq_formats_gpt35m_50 | grep -A 2 '^Average rank-order\|pvq_format'
-#python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_format_up_llama2_70b_instruct_v2_perm_50_format/* --save --filename pvq_formats_llama2_50 | grep -A 2 '^Average rank-order\|pvq_format'
-#python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_format_up_llama_60b_instruct_perm_50_format/* --save --filename pvq_formats_llama_50 | grep -A 2 '^Average rank-order\|pvq_format'
-#python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_format_openassistant_rlhf2_llama30b_perm_50_format/* --save --filename pvq_formats_oa_50 | grep -A 2 '^Average rank-order\|pvq_format'
-#python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_format_zephyr-7b-beta_perm_50_format/* --save --filename pvq_formats_zephyr_50 | grep -A 2 '^Average rank-order\|pvq_format'
-
-# AVG cohen's ds
-python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_format_gpt-3.5-turbo-0613_perm_50_format/* --save --filename pvq_formats_gpt35j_50 | grep -A 2 '^Average absolute\|pvq_format'
-python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_format_gpt-3.5-turbo-0301_perm_50_format/* --save --filename pvq_formats_gpt35m_50 | grep -A 2 '^Average absolute\|pvq_format'
-python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_format_up_llama2_70b_instruct_v2_perm_50_format/* --save --filename pvq_formats_llama2_50 | grep -A 2 '^Average absolute\|pvq_format'
-python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_format_up_llama_60b_instruct_perm_50_format/* --save --filename pvq_formats_llama_50 | grep -A 2 '^Average absolute\|pvq_format'
-python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_format_openassistant_rlhf2_llama30b_perm_50_format/* --save --filename pvq_formats_oa_50 | grep -A 2 '^Average absolute\|pvq_format'
-python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_format_zephyr-7b-beta_perm_50_format/* --save --filename pvq_formats_zephyr_50 | grep -A 2 '^Average absolute\|pvq_format'
-
-
-##GPTm
-
-#PVQ
-#python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_sim_conv_gpt-3.5-turbo-0301_perm_50_theme/* --save --filename pvq_sim_conv_gpt35m_50
-#python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_format_gpt-3.5-turbo-0301_perm_50_format/* --save --filename pvq_formats_gpt35m_50
-#python visualization_scripts/bar_viz.py results_iclr/results_AI_wiki_context_v2_no_separator_music_expert_pvq_test_gpt-3.5-turbo-0301_perm_50_User_msg_3rd_prs/* --save --filename pvq_wiki_gpt35m_50
-
-## HOF
-#python visualization_scripts/bar_viz.py results_iclr/results_hofstede_test_sim_conv_gpt-3.5-turbo-0301_perm_50_theme/* --save --filename hof_sim_conv_gpt35m_50
-#python visualization_scripts/bar_viz.py results_iclr/results_hofstede_test_format_gpt-3.5-turbo-0301_perm_50_format/* --save --filename hof_formats_gpt35m_50
-#python visualization_scripts/bar_viz.py results_iclr/results_AI_wiki_context_v2_no_separator_music_expert_hofstede_test_gpt-3.5-turbo-0301_perm_50_User_msg_3rd_prs/* --save --filename hof_wiki_gpt35m_50
-
-# BIG5
-#python visualization_scripts/bar_viz.py results_iclr/results_big5_test_sim_conv_gpt-3.5-turbo-0301_perm__theme/* --save --filename big5_sim_conv_gpt35m_50
-#python visualization_scripts/bar_viz.py results_iclr/results_big5_test_format_gpt-3.5-turbo-0301_perm_50_format/* --save --filename big5_formats_gpt35m_50
-#python visualization_scripts/bar_viz.py results_iclr/results_AI_wiki_context_v2_no_separator_music_expert_big5_test_gpt-3.5-turbo-0301_perm_50_User_msg_3rd_prs/* --save --filename big5_wiki_gpt35m_50
-
-
-#GPTj
-# PVQ
-#python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_sim_conv_gpt-3.5-turbo-0613_perm_50_theme/* --save --filename pvq_sim_conv_gpt35j_50
-#python visualization_scripts/bar_viz.py results_iclr/results_pvq_test_format_gpt-3.5-turbo-0613_perm_50_format/* --save --filename pvq_formats_gpt35j_50
-#python visualization_scripts/bar_viz.py results_iclr/results_AI_wiki_context_v2_no_separator_music_expert_pvq_test_gpt-3.5-turbo-0613_perm_50_User_msg_3rd_prs/* --save --filename pvq_wiki_gpt35j_50
-
-# HOF
-#python visualization_scripts/bar_viz.py results_iclr/results_hofstede_test_sim_conv_gpt-3.5-turbo-0613_perm_50_theme/* --save --filename hof_sim_conv_gpt35j_50
-#python visualization_scripts/bar_viz.py results_iclr/results_hofstede_test_format_gpt-3.5-turbo-0613_perm_50_format/* --save --filename hof_formats_gpt35j_50
-#python visualization_scripts/bar_viz.py results_iclr/results_AI_wiki_context_v2_no_separator_music_expert_hofstede_test_gpt-3.5-turbo-0613_perm_50_User_msg_3rd_prs/* --save --filename hof_wiki_gpt35j_50
-
-
-
-# NOTE: remember to change ylims in bar_viz.py
-#        if test_set_name == "pvq_male":
-#            ax.set_ylim([-3, 3]) # append
-#            # ax.set_ylim([-2.5, 2.5])
-#
-#        elif test_set_name == "hofstede":
-#            ax.set_ylim([-350, 350]) # append
-#            # ax.set_ylim([-150, 150])
-#
-#        elif test_set_name == "big5_50":
-#            ax.set_ylim([0, 55])
-#
-#        elif test_set_name == "big5_100":
-#            ax.set_ylim([0, 110])
-
-# gpt4
-pvq_resS3=`python visualization_scripts/bar_viz.py results_neurips/results_nat_lang_prof_pvq_test_gpt-4-0314_perm_50_System_msg_3rd_prs/* --save --filename neurips_plots/pvq_gpt4_50_S3`
-hof_resU3=`python visualization_scripts/bar_viz.py results_neurips/results_nat_lang_prof_hofstede_test_gpt-4-0314_perm_50_User_msg_3rd_prs/* --save --filename neurips_plots/hof_gpt4_50_U3`
-big5_resU3=`python visualization_scripts/bar_viz.py results_neurips/results_nat_lang_prof_big5_test_gpt-4-0314_perm_50_User_msg_3rd_prs/* --save --filename neurips_plots/big5_gpt4_50_U3`
-
-# gpt35m
-pvq_resS2=`python visualization_scripts/bar_viz.py results_neurips/results_nat_lang_prof_pvq_test_gpt-3.5-turbo-0301_perm_50_System_msg_2nd_prs/* --save --filename neurips_plots/pvq_gpt35m_50_S2`
-hof_resU2=`python visualization_scripts/bar_viz.py results_neurips/results_nat_lang_prof_hofstede_test_gpt-3.5-turbo-0301_perm_50_User_msg_2nd_prs/* --save --filename neurips_plots/hof_gpt35m_50_U2`
-big5_resU2=`python visualization_scripts/bar_viz.py results_neurips/results_nat_lang_prof_big5_test_gpt-3.5-turbo-0301_perm_50_User_msg_2nd_prs/* --save --filename neurips_plots/big5_gpt35m_50_U2`
-
-# gpt35j
-pvq_resS2=`python visualization_scripts/bar_viz.py results_neurips/results_nat_lang_prof_pvq_test_gpt-3.5-turbo-0613_perm_50_System_msg_2nd_prs/* --save --filename neurips_plots/pvq_gpt35j_50_S2`
-hof_resS3=`python visualization_scripts/bar_viz.py results_neurips/results_nat_lang_prof_hofstede_test_gpt-3.5-turbo-0613_perm_50_System_msg_3rd_prs/* --save --filename neurips_plots/hof_gpt35j_50_S3`
-big5_resS3=`python visualization_scripts/bar_viz.py results_neurips/results_nat_lang_prof_big5_test_gpt-3.5-turbo-0613_perm_50_System_msg_3rd_prs/* --save --filename neurips_plots/big5_gpt35j_50_S3`
-
-# Upllama
-pvq_resU2=`python visualization_scripts/bar_viz.py results_neurips/results_nat_lang_prof_pvq_test_up_llama_60b_instruct_perm_50_User_msg_2nd_prs/* --save --filename neurips_plots/pvq_upllama_50_U2`
-hof_resU3=`python visualization_scripts/bar_viz.py results_neurips/results_nat_lang_prof_hofstede_test_up_llama_60b_instruct_perm_50_User_msg_3rd_prs/* --save --filename neurips_plots/hof_upllama_50_U3`
-big5_resU3=`python visualization_scripts/bar_viz.py results_neurips/results_nat_lang_prof_big5_test_up_llama_60b_instruct_perm_50_User_msg_3rd_prs/* --save --filename neurips_plots/big5_upllama_50_U3`
diff --git a/ipsative_stat_test.py b/ipsative_stat_test.py
deleted file mode 100644
index 1977ad5..0000000
--- a/ipsative_stat_test.py
+++ /dev/null
@@ -1,86 +0,0 @@
-#! python3
-
-from pathlib import Path
-import json
-from collections import defaultdict
-import scipy.stats as stats
-from termcolor import colored
-
-# Ipsative
-# mean, STD, n
-ipsative_human_change = 0.59, 0.25, 270
-
-# Simulated conversations
-data_sim_conv = {}
-data_sim_conv["chess_grammar"] = (0.78, 0.12, 50)
-data_sim_conv["chess_history"] = (0.70, 0.19, 50)
-data_sim_conv["chess_joke"] = (0.48, 0.27, 50)
-data_sim_conv["chess_poem"] = (0.87, 0.07, 50)
-data_sim_conv["grammar_history"] = (0.70, 0.18, 50)
-data_sim_conv["grammar_joke"] = (0.40, 0.32, 50)
-data_sim_conv["grammar_poem"] = (0.90, 0.06, 50)
-data_sim_conv["history_joke"] = (0.52, 0.30, 50)
-data_sim_conv["history_poem"] = (0.74, 0.20, 50)
-data_sim_conv["joke_poem"] = (0.42, 0.32, 50)
-
-# Textual formats
-data_text_format = {}
-data_text_format["chat_code_cpp"] = (0.05, 0.23, 50)
-data_text_format["chat_code_py"] = (0.31, 0.28, 50)
-data_text_format["chat_conf_toml"] = (0.86, 0.06, 50)
-data_text_format["chat_latex"] = (0.68, 0.19, 50)
-data_text_format["code_cpp_code_py"] = (0.30, 0.35, 50)
-data_text_format["code_cpp_conf_toml"] = (0.08, 0.24, 50)
-data_text_format["code_cpp_latex"] = (0.20, 0.32, 50)
-data_text_format["code_py_conf_toml"] = (0.33, 0.31, 50)
-data_text_format["code_py_latex"] = (0.55, 0.27, 50)
-data_text_format["conf_toml_latex"] = (0.68, 0.23, 50)
-
-# Wikipedia articles
-data_wiki = {}
-data_wiki["classical_gospel"] = (0.84, 0.09, 50)
-data_wiki["classical_heavy_metal"] = (0.80, 0.10, 50)
-data_wiki["classical_hip_hop"] = (0.86, 0.07, 50)
-data_wiki["classical_jazz"] = (0.82, 0.11, 50)
-data_wiki["classical_reggae"] = (0.84, 0.09, 50)
-data_wiki["gospel_heavy_metal"] = (0.77, 0.09, 50)
-data_wiki["gospel_hip_hop"] = (0.83, 0.08, 50)
-data_wiki["gospel_jazz"] = (0.79, 0.10, 50)
-data_wiki["gospel_reggae"] = (0.88, 0.06, 50)
-data_wiki["heavy_metal_hip_hop"] = (0.91, 0.05, 50)
-data_wiki["heavy_metal_jazz"] = (0.92, 0.05, 50)
-data_wiki["heavy_metal_reggae"] = (0.86, 0.09, 50)
-data_wiki["hip_hop_jazz"] = (0.90, 0.05, 50)
-data_wiki["hip_hop_reggae"] = (0.88, 0.08, 50)
-data_wiki["jazz_reggae"] = (0.89, 0.06, 50)
-
-human_mean, human_std, human_nobs = ipsative_human_change
-p_limit = 0.05
-
-for data in [data_sim_conv, data_text_format, data_wiki]:
-    # we only compare those with llm correlation < human correlation
-    data = {key: value for key, value in data.items() if value[0] < human_mean}
-
-    if len(data) == 0:
-        continue
-
-    # bonferroni correction
-    p_limit_bonf = p_limit / len(data)
-
-    print("------------")
-    for key, value in data.items():
-        llm_mean, llm_std, llm_nobs = value
-        pvalue = stats.ttest_ind_from_stats(
-            mean1=human_mean,
-            std1=human_std,
-            nobs1=human_nobs,
-            mean2=llm_mean,
-            std2=llm_std,
-            nobs2=llm_nobs,
-        ).pvalue
-
-        if pvalue < p_limit_bonf:
-            print(colored(f"{key} - Mean: {llm_mean} p={pvalue:.5f}", "green"))
-        else:
-            print(f"{key} - Mean: {llm_mean} p={pvalue:.5f}")
-
diff --git a/models/__init__.py b/models/__init__.py
new file mode 100644
index 0000000..1bb69c2
--- /dev/null
+++ b/models/__init__.py
@@ -0,0 +1,47 @@
+import os
+import torch
+import json
+import importlib
+
+from .utils import *
+from .model import Model
+from .dummymodel import DummyModel
+from .interactivemodel import InteractiveModel
+from .openaimodel import OpenAIModel
+from .huggingfacemodel import HuggingFaceModel, LLama3Model, Mixtral8x22BModel
+
+hf_token = os.environ["HF_TOKEN"]
+
+def load_model_args(model_name):
+
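+    # Configs live in models/configs/<model_name>.json and typically define
+    # "model_class", "model_id", "system_message", "base_model_template",
+    # "load_args" and "generation_args" (see e.g. models/configs/dummy.json).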
+    try:
+        with open(f'./models/configs/{model_name}.json', 'r') as file:
+            model_args = json.load(file)
+
+    except FileNotFoundError:
+        raise FileNotFoundError(f"The configuration file for {model_name} could not be found.")
+
+    if 'load_args' in model_args:
+        # parse hf token
+        if "token" in model_args['load_args']:
+            if model_args['load_args']['token'] == "HF_TOKEN":
+                model_args['load_args']['token'] = hf_token
+
+        # parse torch.dtype (e.g. "torch.float16" -> torch.float16) without eval
+        if "torch_dtype" in model_args['load_args']:
+            if model_args['load_args']['torch_dtype'].startswith("torch."):
+                model_args['load_args']['torch_dtype'] = getattr(torch, model_args['load_args']['torch_dtype'].split(".", 1)[1])
+
+    # load model class
+    my_module = importlib.import_module("models")
+    ModelClass = getattr(my_module, model_args['model_class'])
+
+    return ModelClass, model_args
+
+
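+# Illustrative usage (model_name must match a JSON file in models/configs):
+#   llm = create_model("dummy", additional_model_args={"use_azure": False})
+# returns an instance of the class named in that config's "model_class" field.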
+def create_model(model_name, additional_model_args=None):
+
+    from mergedeep import merge
+    ModelClass, model_args = load_model_args(model_name)
+    model_args = merge(model_args or {}, additional_model_args or {})
+    return ModelClass(**model_args)
\ No newline at end of file
diff --git a/models/configs/Mistral-7B-Instruct-v0.1.json b/models/configs/Mistral-7B-Instruct-v0.1.json
new file mode 100644
index 0000000..903ff50
--- /dev/null
+++ b/models/configs/Mistral-7B-Instruct-v0.1.json
@@ -0,0 +1,15 @@
+{
+  "model_class": "HuggingFaceModel",
+  "model_id": "mistralai/Mistral-7B-Instruct-v0.1",
+  "system_message": false,
+  "base_model_template": false,
+  "load_args": {
+    "trust_remote_code": true,
+    "device_map": "auto",
+    "token": "HF_TOKEN"
+  },
+  "generation_args": {
+    "max_new_tokens": 100,
+    "do_sample": true
+  }
+}
diff --git a/models/configs/Mistral-7B-Instruct-v0.2.json b/models/configs/Mistral-7B-Instruct-v0.2.json
new file mode 100644
index 0000000..6fd9222
--- /dev/null
+++ b/models/configs/Mistral-7B-Instruct-v0.2.json
@@ -0,0 +1,15 @@
+{
+  "model_class": "HuggingFaceModel",
+  "model_id": "mistralai/Mistral-7B-Instruct-v0.2",
+  "system_message": false,
+  "base_model_template": false,
+  "load_args": {
+    "trust_remote_code": true,
+    "device_map": "auto",
+    "token": "HF_TOKEN"
+  },
+  "generation_args": {
+    "max_new_tokens": 100,
+    "do_sample": true
+  }
+}
diff --git a/models/configs/Mistral-7B-v0.1.json b/models/configs/Mistral-7B-v0.1.json
new file mode 100644
index 0000000..2322fcb
--- /dev/null
+++ b/models/configs/Mistral-7B-v0.1.json
@@ -0,0 +1,15 @@
+{
+  "model_class": "HuggingFaceModel",
+  "model_id": "mistralai/Mistral-7B-v0.1",
+  "system_message": true,
+  "base_model_template": true,
+  "load_args": {
+    "trust_remote_code": true,
+    "device_map": "auto",
+    "token": "HF_TOKEN"
+  },
+  "generation_args": {
+    "max_new_tokens": 100,
+    "do_sample": true
+  }
+}
diff --git a/models/configs/Mixtral-8x22B-Instruct-v0.1-4b.json b/models/configs/Mixtral-8x22B-Instruct-v0.1-4b.json
new file mode 100644
index 0000000..9879930
--- /dev/null
+++ b/models/configs/Mixtral-8x22B-Instruct-v0.1-4b.json
@@ -0,0 +1,13 @@
+{
+  "model_class": "Mixtral8x22BModel",
+  "model_id": "mistralai/Mixtral-8x22B-Instruct-v0.1",
+  "system_message": false,
+  "base_model_template": false,
+  "load_args": {
+    "load_in_4bit": true
+  },
+  "generation_args": {
+    "max_new_tokens": 100,
+    "do_sample": true
+  }
+}
diff --git a/models/configs/Mixtral-8x7B-Instruct-v0.1-4b.json b/models/configs/Mixtral-8x7B-Instruct-v0.1-4b.json
new file mode 100644
index 0000000..b71fe49
--- /dev/null
+++ b/models/configs/Mixtral-8x7B-Instruct-v0.1-4b.json
@@ -0,0 +1,16 @@
+{
+  "model_class": "HuggingFaceModel",
+  "model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+  "system_message": false,
+  "base_model_template": false,
+  "load_args": {
+    "trust_remote_code": true,
+    "load_in_4bit": true,
+    "device_map": "auto",
+    "token": "HF_TOKEN"
+  },
+  "generation_args": {
+    "max_new_tokens": 100,
+    "do_sample": true
+  }
+}
diff --git a/models/configs/Mixtral-8x7B-Instruct-v0.1.json b/models/configs/Mixtral-8x7B-Instruct-v0.1.json
new file mode 100644
index 0000000..4a733b9
--- /dev/null
+++ b/models/configs/Mixtral-8x7B-Instruct-v0.1.json
@@ -0,0 +1,16 @@
+{
+  "model_class": "HuggingFaceModel",
+  "model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+  "system_message": false,
+  "base_model_template": false,
+  "load_args": {
+    "trust_remote_code": true,
+    "torch_dtype": "torch.float16",
+    "device_map": "auto",
+    "token": "HF_TOKEN"
+  },
+  "generation_args": {
+    "max_new_tokens": 100,
+    "do_sample": true
+  }
+}
diff --git a/models/configs/Mixtral-8x7B-v0.1-4b.json b/models/configs/Mixtral-8x7B-v0.1-4b.json
new file mode 100644
index 0000000..0ee57ea
--- /dev/null
+++ b/models/configs/Mixtral-8x7B-v0.1-4b.json
@@ -0,0 +1,16 @@
+{
+  "model_class": "HuggingFaceModel",
+  "model_id": "mistralai/Mixtral-8x7B-v0.1",
+  "system_message": true,
+  "base_model_template": true,
+  "load_args": {
+    "trust_remote_code": true,
+    "load_in_4bit": true,
+    "device_map": "auto",
+    "token": "HF_TOKEN"
+  },
+  "generation_args": {
+    "max_new_tokens": 100,
+    "do_sample": true
+  }
+}
diff --git a/models/configs/Mixtral-8x7B-v0.1.json b/models/configs/Mixtral-8x7B-v0.1.json
new file mode 100644
index 0000000..ccc7098
--- /dev/null
+++ b/models/configs/Mixtral-8x7B-v0.1.json
@@ -0,0 +1,16 @@
+{
+  "model_class": "HuggingFaceModel",
+  "model_id": "mistralai/Mixtral-8x7B-v0.1",
+  "system_message": true,
+  "base_model_template": true,
+  "load_args": {
+    "trust_remote_code": true,
+    "torch_dtype": "torch.float16",
+    "device_map": "auto",
+    "token": "HF_TOKEN"
+  },
+  "generation_args": {
+    "max_new_tokens": 100,
+    "do_sample": true
+  }
+}
diff --git a/models/configs/Qwen-14B.json b/models/configs/Qwen-14B.json
new file mode 100644
index 0000000..58f6002
--- /dev/null
+++ b/models/configs/Qwen-14B.json
@@ -0,0 +1,15 @@
+{
+  "model_class": "HuggingFaceModel",
+  "model_id": "Qwen/Qwen-14B",
+  "system_message": true,
+  "base_model_template": true,
+  "load_args": {
+    "trust_remote_code": true,
+    "device_map": "auto",
+    "token": "HF_TOKEN"
+  },
+  "generation_args": {
+    "max_new_tokens": 100,
+    "do_sample": true
+  }
+}
diff --git a/models/configs/Qwen-72B.json b/models/configs/Qwen-72B.json
new file mode 100644
index 0000000..72ba99e
--- /dev/null
+++ b/models/configs/Qwen-72B.json
@@ -0,0 +1,15 @@
+{
+  "model_class": "HuggingFaceModel",
+  "model_id": "Qwen/Qwen-72B",
+  "system_message": true,
+  "base_model_template": true,
+  "load_args": {
+    "trust_remote_code": true,
+    "device_map": "auto",
+    "token": "HF_TOKEN"
+  },
+  "generation_args": {
+    "max_new_tokens": 100,
+    "do_sample": true
+  }
+}
diff --git a/models/configs/Qwen-7B.json b/models/configs/Qwen-7B.json
new file mode 100644
index 0000000..b8a8c04
--- /dev/null
+++ b/models/configs/Qwen-7B.json
@@ -0,0 +1,15 @@
+{
+  "model_class": "HuggingFaceModel",
+  "model_id": "Qwen/Qwen-7B",
+  "system_message": true,
+  "base_model_template": true,
+  "load_args": {
+    "trust_remote_code": true,
+    "device_map": "auto",
+    "token": "HF_TOKEN"
+  },
+  "generation_args": {
+    "max_new_tokens": 100,
+    "do_sample": true
+  }
+}
diff --git a/models/configs/Qwen1.5-72B-Chat.json b/models/configs/Qwen1.5-72B-Chat.json
new file mode 100644
index 0000000..247cf50
--- /dev/null
+++ b/models/configs/Qwen1.5-72B-Chat.json
@@ -0,0 +1,17 @@
+{
+  "model_class": "HuggingFaceModel",
+  "model_id": "Qwen/Qwen1.5-72B-Chat",
+  "system_message": true,
+  "base_model_template": false,
+  "load_args": {
+    "trust_remote_code": true,
+    "device_map": "auto",
+    "torch_dtype": "auto",
+    "token": "HF_TOKEN"
+  },
+  "tokenizer_load_args": {},
+  "generation_args": {
+    "max_new_tokens": 100,
+    "do_sample": true
+  }
+}
diff --git a/models/configs/command_r_plus.json b/models/configs/command_r_plus.json
new file mode 100644
index 0000000..9f78783
--- /dev/null
+++ b/models/configs/command_r_plus.json
@@ -0,0 +1,21 @@
+{
+  "model_class": "HuggingFaceModel",
+  "model_id": "CohereForAI/c4ai-command-r-plus",
+  "system_message": true,
+  "base_model_template": false,
+  "load_args": {
+    "torch_dtype": "torch.float16",
+    "trust_remote_code": true,
+    "device_map": "auto",
+    "token": "HF_TOKEN"
+  },
+  "generation_args": {
+    "max_new_tokens": 100,
+    "do_sample": true,
+    "top_p": 0.9,
+    "top_k": 50,
+    "temperature": 0.6,
+    "repetition_penalty": 1.2,
+    "num_beams": 1
+  }
+}
diff --git a/models/configs/dummy.json b/models/configs/dummy.json
new file mode 100644
index 0000000..a693eba
--- /dev/null
+++ b/models/configs/dummy.json
@@ -0,0 +1,10 @@
+{
+  "model_class": "DummyModel",
+  "model_id": "dummy",
+  "system_message": true,
+  "base_model_template": true,
+  "load_args": {
+    "trust_remote_code": true,
+    "device_map": "auto"
+  }
+}
\ No newline at end of file
diff --git a/models/configs/gpt-3.5-turbo-0125.json b/models/configs/gpt-3.5-turbo-0125.json
new file mode 100644
index 0000000..277969c
--- /dev/null
+++ b/models/configs/gpt-3.5-turbo-0125.json
@@ -0,0 +1,12 @@
+{
+  "model_class": "OpenAIModel",
+  "model_id": "gpt-3.5-turbo-0125",
+  "system_message": true,
+  "base_model_template": false,
+  "load_args": {},
+  "generation_args": {
+    "max_tokens": 100,
+    "n": 1,
+    "temperature": 1.0
+  }
+}
diff --git a/models/configs/gpt-3.5-turbo-1106.json b/models/configs/gpt-3.5-turbo-1106.json
new file mode 100644
index 0000000..901cf71
--- /dev/null
+++ b/models/configs/gpt-3.5-turbo-1106.json
@@ -0,0 +1,12 @@
+{
+  "model_class": "OpenAIModel",
+  "model_id": "gpt-3.5-turbo-1106",
+  "system_message": true,
+  "base_model_template": false,
+  "load_args": {},
+  "generation_args": {
+    "max_tokens": 100,
+    "n": 1,
+    "temperature": 1.0
+  }
+}
diff --git a/models/configs/interactive.json b/models/configs/interactive.json
new file mode 100644
index 0000000..9301655
--- /dev/null
+++ b/models/configs/interactive.json
@@ -0,0 +1,10 @@
+{
+  "model_class": "InteractiveModel",
+  "model_id": "interactive",
+  "system_message": true,
+  "base_model_template": true,
+  "load_args": {
+    "trust_remote_code": true,
+    "device_map": "auto"
+  }
+}
\ No newline at end of file
diff --git a/models/configs/llama_2_13b.json b/models/configs/llama_2_13b.json
new file mode 100644
index 0000000..8987a3d
--- /dev/null
+++ b/models/configs/llama_2_13b.json
@@ -0,0 +1,21 @@
+{
+  "model_class": "HuggingFaceModel",
+  "model_id": "meta-llama/Llama-2-13b-hf",
+  "system_message": true,
+  "base_model_template": true,
+  "load_args": {
+    "torch_dtype": "torch.float16",
+    "trust_remote_code": true,
+    "device_map": "auto",
+    "token": "HF_TOKEN"
+  },
+  "generation_args": {
+    "max_new_tokens": 100,
+    "do_sample": true,
+    "top_p": 0.9,
+    "top_k": 50,
+    "temperature": 0.6,
+    "repetition_penalty": 1.2,
+    "num_beams": 1
+  }
+}
diff --git a/models/configs/llama_2_13b_chat.json b/models/configs/llama_2_13b_chat.json
new file mode 100644
index 0000000..d4fb915
--- /dev/null
+++ b/models/configs/llama_2_13b_chat.json
@@ -0,0 +1,21 @@
+{
+  "model_class": "HuggingFaceModel",
+  "model_id": "meta-llama/Llama-2-13b-chat-hf",
+  "system_message": true,
+  "base_model_template": false,
+  "load_args": {
+    "torch_dtype": "torch.float16",
+    "trust_remote_code": true,
+    "device_map": "auto",
+    "token": "HF_TOKEN"
+  },
+  "generation_args": {
+    "max_new_tokens": 100,
+    "do_sample": true,
+    "top_p": 0.9,
+    "top_k": 50,
+    "temperature": 0.6,
+    "repetition_penalty": 1.2,
+    "num_beams": 1
+  }
+}
diff --git a/models/configs/llama_2_70b.json b/models/configs/llama_2_70b.json
new file mode 100644
index 0000000..9a25285
--- /dev/null
+++ b/models/configs/llama_2_70b.json
@@ -0,0 +1,21 @@
+{
+  "model_class": "HuggingFaceModel",
+  "model_id": "meta-llama/Llama-2-70b-hf",
+  "system_message": true,
+  "base_model_template": true,
+  "load_args": {
+    "torch_dtype": "torch.float16",
+    "trust_remote_code": true,
+    "device_map": "auto",
+    "token": "HF_TOKEN"
+  },
+  "generation_args": {
+    "max_new_tokens": 100,
+    "do_sample": true,
+    "top_p": 0.9,
+    "top_k": 50,
+    "temperature": 0.6,
+    "repetition_penalty": 1.2,
+    "num_beams": 1
+  }
+}
diff --git a/models/configs/llama_2_70b_chat.json b/models/configs/llama_2_70b_chat.json
new file mode 100644
index 0000000..7872b67
--- /dev/null
+++ b/models/configs/llama_2_70b_chat.json
@@ -0,0 +1,21 @@
+{
+  "model_class": "HuggingFaceModel",
+  "model_id": "meta-llama/Llama-2-70b-chat-hf",
+  "system_message": true,
+  "base_model_template": false,
+  "load_args": {
+    "torch_dtype": "torch.float16",
+    "trust_remote_code": true,
+    "device_map": "auto",
+    "token": "HF_TOKEN"
+  },
+  "generation_args": {
+    "max_new_tokens": 100,
+    "do_sample": true,
+    "top_p": 0.9,
+    "top_k": 50,
+    "temperature": 0.6,
+    "repetition_penalty": 1.2,
+    "num_beams": 1
+  }
+}
diff --git a/models/configs/llama_2_7b.json b/models/configs/llama_2_7b.json
new file mode 100644
index 0000000..8bd9e7e
--- /dev/null
+++ b/models/configs/llama_2_7b.json
@@ -0,0 +1,21 @@
+{
+  "model_class": "HuggingFaceModel",
+  "model_id": "meta-llama/Llama-2-7b-hf",
+  "system_message": true,
+  "base_model_template": true,
+  "load_args": {
+    "torch_dtype": "torch.float16",
+    "trust_remote_code": true,
+    "device_map": "auto",
+    "token": "HF_TOKEN"
+  },
+  "generation_args": {
+    "max_new_tokens": 100,
+    "do_sample": true,
+    "top_p": 0.9,
+    "top_k": 50,
+    "temperature": 0.6,
+    "repetition_penalty": 1.2,
+    "num_beams": 1
+  }
+}
diff --git a/models/configs/llama_2_7b_chat.json b/models/configs/llama_2_7b_chat.json
new file mode 100644
index 0000000..a2d3344
--- /dev/null
+++ b/models/configs/llama_2_7b_chat.json
@@ -0,0 +1,21 @@
+{
+  "model_class": "HuggingFaceModel",
+  "model_id": "meta-llama/Llama-2-7b-chat-hf",
+  "system_message": true,
+  "base_model_template": false,
+  "load_args": {
+    "torch_dtype": "torch.float16",
+    "trust_remote_code": true,
+    "device_map": "auto",
+    "token": "HF_TOKEN"
+  },
+  "generation_args": {
+    "max_new_tokens": 100,
+    "do_sample": true,
+    "top_p": 0.9,
+    "top_k": 50,
+    "temperature": 0.6,
+    "repetition_penalty": 1.2,
+    "num_beams": 1
+  }
+}
diff --git a/models/configs/llama_3_70b_instruct.json b/models/configs/llama_3_70b_instruct.json
new file mode 100644
index 0000000..14558f0
--- /dev/null
+++ b/models/configs/llama_3_70b_instruct.json
@@ -0,0 +1,20 @@
+{
+  "model_class": "LLama3Model",
+  "model_id": "meta-llama/Meta-Llama-3-70B-Instruct",
+  "system_message": true,
+  "base_model_template": false,
+  "load_args": {
+    "torch_dtype": "torch.bfloat16",
+    "device_map": "auto",
+    "token": "HF_TOKEN"
+  },
+  "tokenizer_load_args": {},
+  "generation_args": {
+    "max_new_tokens": 100,
+    "do_sample": true,
+    "top_p": 0.9,
+    "eos_token_id": "defined in LLama3Model",
+    "temperature": 0.6,
+    "num_beams": 1
+  }
+}
diff --git a/models/configs/llama_3_8b_instruct.json b/models/configs/llama_3_8b_instruct.json
new file mode 100644
index 0000000..028167d
--- /dev/null
+++ b/models/configs/llama_3_8b_instruct.json
@@ -0,0 +1,20 @@
+{
+  "model_class": "LLama3Model",
+  "model_id": "meta-llama/Meta-Llama-3-8B-Instruct",
+  "system_message": true,
+  "base_model_template": false,
+  "load_args": {
+    "torch_dtype": "torch.bfloat16",
+    "device_map": "auto",
+    "token": "HF_TOKEN"
+  },
+  "tokenizer_load_args": {},
+  "generation_args": {
+    "max_new_tokens": 100,
+    "do_sample": true,
+    "top_p": 0.9,
+    "eos_token_id": "defined in LLama3Model",
+    "temperature": 0.6,
+    "num_beams": 1
+  }
+}
diff --git a/models/configs/phi-1.json b/models/configs/phi-1.json
new file mode 100644
index 0000000..7208ac5
--- /dev/null
+++ b/models/configs/phi-1.json
@@ -0,0 +1,15 @@
+{
+  "model_class": "HuggingFaceModel",
+  "model_id": "microsoft/phi-1",
+  "system_message": true,
+  "base_model_template": true,
+  "load_args": {
+    "token": "HF_TOKEN",
+    "torch_dtype": "auto",
+    "device_map": "cuda"
+  },
+  "generation_args": {
+    "max_new_tokens": 100,
+    "do_sample": true
+  }
+}
\ No newline at end of file
diff --git a/models/configs/phi-2.json b/models/configs/phi-2.json
new file mode 100644
index 0000000..664eeb6
--- /dev/null
+++ b/models/configs/phi-2.json
@@ -0,0 +1,15 @@
+{
+  "model_class": "HuggingFaceModel",
+  "model_id": "microsoft/phi-2",
+  "system_message": true,
+  "base_model_template": true,
+  "load_args": {
+    "token": "HF_TOKEN",
+    "torch_dtype": "auto",
+    "device_map": "cuda"
+  },
+  "generation_args": {
+    "max_new_tokens": 100,
+    "do_sample": true
+  }
+}
\ No newline at end of file
diff --git a/models/configs/phi-3.json b/models/configs/phi-3.json
new file mode 100644
index 0000000..30f74e2
--- /dev/null
+++ b/models/configs/phi-3.json
@@ -0,0 +1,16 @@
+{
+  "model_class": "HuggingFaceModel",
+  "model_id": "microsoft/phi-3-mini-128k-instruct",
+  "system_message": false,
+  "base_model_template": false,
+  "load_args": {
+    "token": "HF_TOKEN",
+    "trust_remote_code": true,
+    "torch_dtype": "auto",
+    "device_map": "cuda"
+  },
+  "generation_args": {
+    "max_new_tokens": 100,
+    "do_sample": true
+  }
+}
diff --git a/models/configs/zephyr-7b-beta.json b/models/configs/zephyr-7b-beta.json
new file mode 100644
index 0000000..c059b1d
--- /dev/null
+++ b/models/configs/zephyr-7b-beta.json
@@ -0,0 +1,19 @@
+{
+  "model_class": "HuggingFaceModel",
+  "model_id": "HuggingFaceH4/zephyr-7b-beta",
+  "system_message": true,
+  "base_model_template": false,
+  "load_args": {
+    "torch_dtype": "torch.bfloat16",
+    "trust_remote_code": true,
+    "device_map": "auto",
+    "token": "HF_TOKEN"
+  },
+  "generation_args": {
+    "max_new_tokens": 100,
+    "do_sample": true,
+    "temperature": 0.7,
+    "top_p": 0.95,
+    "top_k": 50
+  }
+}
diff --git a/models/dummymodel.py b/models/dummymodel.py
new file mode 100644
index 0000000..3ed8244
--- /dev/null
+++ b/models/dummymodel.py
@@ -0,0 +1,57 @@
+import random
+
+from .model import Model
+from .utils import *
+
+class DummyModel(Model):
+    def __init__(self, model_id, *args, **kwargs):
+
+        super(DummyModel, self).__init__(
+            model_id=model_id,
+            *args, **kwargs
+        )
+
+    def generate(self, *args, **kwargs):
+        response = f"Dummy simulated message. This is a filler message it same some extra text so as to help estimate the number of tokens. As the gpt generations is set to 100 tokens max. Here we aim to also 100 tokens message. I am repeating it now. This is a filler message it same some extra text so as to help estimate the number of tokens. As the gpt generations is set to 100 tokens max. Here we aim to also 100 tokens message."
+        return response
+
+    def predict(
+        self,
+        messages,
+        answers,
+        label_2_text_option_dict=None,
+        query_string=None,
+        assistant_label=None,
+        user_label="USER",
+        system_label="CONTEXT",
+        *args, **kwargs
+    ):
+        if label_2_text_option_dict is None:
+            raise ValueError("label_2_text_option_dict must be provided")
+
+        if assistant_label is None:
+            raise ValueError("assistant_label must be provided. ")
+
+        formatted_prompt = apply_base_model_template(
+            messages,
+            add_generation_prompt=True,
+            assistant_label=assistant_label,
+            user_label="USER",
+            system_label="CONTEXT"
+        )
+
+        messages += [{
+            "role": "assistant",
+            "content": query_string
+        }]
+
+        if self.verbose:
+            print(f"************************\nFORMATTED PROMPT:\n{formatted_prompt}\n******************")
+
+        # import re
+        # generation = messages[-2]['content'][messages[-2]['content'].index(") a few hours per day") - 1:][:1]
+        generation = random.choice([f"{c}" for c in answers])
+
+        lprobs = dummy_lprobs_from_generation(generation, answers, label_2_text_option_dict)
+
+        return generation, lprobs
diff --git a/models/huggingfacemodel.py b/models/huggingfacemodel.py
new file mode 100644
index 0000000..605dc76
--- /dev/null
+++ b/models/huggingfacemodel.py
@@ -0,0 +1,264 @@
+import os
+from .model import Model
+from .utils import *
+import time
+
+def get_hf_cache_dir():
+    return os.environ['HF_HOME']
+
+
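+# HF_HOME must already be set; re-export it before importing transformers so the model cache location below is picked up.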
+hf_cache_dir = get_hf_cache_dir()
+os.environ['HF_HOME'] = hf_cache_dir
+
+from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList
+
+import torch
+
+
+class StoppingCriteriaSub(StoppingCriteria):
+    def __init__(self, stops, tokenizer, original_input_ids):
+        super().__init__()
+        self.stops = [s.upper() for s in stops]
+        self.tokenizer = tokenizer
+        self.original_input_ids = original_input_ids
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
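+        # Decode only the newly generated tokens and stop once any stop word appears (case-insensitive).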
+        generated_ids = input_ids[0][len(self.original_input_ids[0]):]
+        generation = self.tokenizer.decode(generated_ids).upper()
+        return any([stop in generation for stop in self.stops])
+
+
+class HuggingFaceModel(Model):
+
+    def __init__(
+            self,
+            model_id,
+            base_model_template,
+            system_message,
+            load_args=None,
+            generation_args=None,
+            tokenizer_load_args=None,
+            *args,
+            **kwargs
+    ):
+        super(HuggingFaceModel, self).__init__(
+            model_id=model_id,
+            base_model_template=base_model_template,
+            system_message=system_message,
+            *args, **kwargs
+        )
+
+        if load_args is None:
+            self.load_args = {}
+        else:
+            self.load_args = load_args
+        print("Model Load args:", self.load_args)
+
+        if tokenizer_load_args is None:
+            self.tokenizer_load_args = self.load_args
+        else:
+            self.tokenizer_load_args = tokenizer_load_args
+
+        print("Tokenizer Load args:", self.tokenizer_load_args)
+
+        if generation_args is None:
+            self.generation_args = {}
+        else:
+            self.generation_args = generation_args
+        print("Generation args:", self.generation_args)
+
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.model_id,
+            **self.tokenizer_load_args,
+            cache_dir=hf_cache_dir,
+        )
+        start_time = time.time()
+        self.model = AutoModelForCausalLM.from_pretrained(
+            self.model_id,
+            **self.load_args,
+            cache_dir=hf_cache_dir
+        ).eval()
+        end_time = time.time()
+
+        if self.verbose:
+            print("Model loading time: {}h {}m {}s".format(*secs_2_hms(end_time-start_time)))
+
+    def extract_answer_tokens(self, answers):
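+        # Collect all vocabulary token ids that decode exactly to one of the answer labels.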
+        answer_tokens = {a: [] for a in answers}
+        for tok_ind in range(len(self.tokenizer)):
+            tok = self.tokenizer.decode([tok_ind])
+            if tok in answers:
+                answer_tokens[tok].append(tok_ind)
+
+        return answer_tokens
+
+    def parse_hf_outputs(self, output, answers):
+
+        answer_tokens = self.extract_answer_tokens(answers)  # todo: repetitive -> extract
+
+        option_scores = {
+            ans: max([output.scores[0][0, ind] for ind in answer_tokens[ans]])
+            for ans in answers
+        }
+
+        # take the most probable answer as the generation
+        generation = max(option_scores, key=option_scores.get)
+
+        # extract logprobs
+        lprobs = [float(option_scores[a]) for a in answers]
+
+        # todo: check that ' A' are one token and check for those as well and not "unk"
+        encoded_ans = [self.tokenizer.encode(ans, add_special_tokens=False)[0] for ans in answers]
+        option_scores = {enc_a: output.scores[0][0, enc_a] for enc_a in encoded_ans}
+
+        return option_scores, generation, lprobs
+
+    def predict(
+        self,
+        messages,
+        answers,
+        query_string=None,
+        assistant_label=None,
+        user_label="USER",
+        system_label="CONTEXT",
+        *args, **kwargs
+    ):
+
+        if self.base_model_template:
+            if assistant_label is None:
+                raise ValueError("assistant_label must be provided with base model template.")
+
+            formatted_prompt = apply_base_model_template(
+                messages,
+                add_generation_prompt=True,
+                assistant_label=assistant_label,
+                user_label="USER",
+                system_label="CONTEXT"
+            )
+
+        else:
+            formatted_prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+        formatted_prompt += query_string
+
+        inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.model.device)
+
+        # token match
+        output = self.model.generate(
+            **inputs,
+            max_new_tokens=1,
+            return_dict_in_generate=True,
+            output_scores=True
+        )
+
+        _, generation, lprobs = self.parse_hf_outputs(output=output, answers=answers)
+
+        if self.verbose:
+            print(f"************************\nFORMATTED PROMPT:\n{formatted_prompt}\n->{generation}\n******************")
+        return generation, lprobs
+
+    def generate(
+            self,
+            messages,
+            additional_generation_args=None,
+            assistant_label=None,
+            user_label=None,
+            system_label=None,
+            stop_words_up=None
+    ):
+        if self.base_model_template:
+            if not self.system_message:
+                raise ValueError("system_message must be used with base model template")
+
+            if assistant_label is None:
+                raise ValueError("assistant_label must be defined with base model template")
+
+            if user_label is None:
+                raise ValueError("user_label must be defined with base model template")
+
+            if system_label is None:
+                raise ValueError("system_label must be defined with base model template")
+
+            if stop_words_up is None:
+                raise ValueError("stop_words_up must be defined with base model template (Uppercase stop words)")
+
+            formatted_prompt, stop_words = apply_base_model_template(
+                messages,
+                assistant_label=assistant_label,
+                user_label=user_label,
+                system_label=system_label,
+                add_generation_prompt=True,
+                return_stop_words=True
+            )
+            input_ids = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.model.device).input_ids
+            assert all([w.upper() in stop_words_up for w in stop_words])
+            stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stop_words_up, self.tokenizer, input_ids)])
+
+            if self.verbose:
+                print(f"\n>>>>>>>>>>>>FORMATTED<<<>>>PROMPT<<<<<<<<<<<<\n{formatted_prompt}")
+
+        else:
+            input_ids = self.tokenizer.apply_chat_template(
+                messages,
+                return_tensors="pt",
+                add_generation_prompt=True
+            ).to(self.model.device)
+
+            if self.verbose:
+                print_chat_messages(messages)
+
+            stopping_criteria = None
+
+        if additional_generation_args is not None:
+            generation_args = {**self.generation_args, **additional_generation_args}
+        else:
+            generation_args = self.generation_args
+
+        output_seq = self.model.generate(
+            input_ids=input_ids,
+            **generation_args,
+            return_dict_in_generate=True,
+            output_scores=True,
+            stopping_criteria=stopping_criteria
+        )
+
+        response = self.tokenizer.decode(output_seq.sequences[0][len(input_ids[0]):], skip_special_tokens=True)
+        if self.verbose:
+            print(f"->{response}")
+        return response
+
+
+class LLama3Model(HuggingFaceModel):
+
+    def __init__(self, *args, **kwargs):
+        super(LLama3Model, self).__init__(*args, **kwargs)
+
+        self.generation_args["eos_token_id"] = [
+            self.tokenizer.eos_token_id,
+            self.tokenizer.convert_tokens_to_ids("<|eot_id|>")
+        ]
+
+from mistral_common.tokens.instruct.normalize import ChatCompletionRequest
+from mistral_common.protocol.instruct.messages import (
+    AssistantMessage,
+    UserMessage,
+)
+
+def to_mistral_msg(msg):
+
+    if msg['role'] == 'user':
+        return UserMessage(content=msg['content'])
+    elif msg['role'] == 'assistant':
+        return AssistantMessage(content=msg['content'])
+    else:
+        raise ValueError(f"Undefined message role {msg['role']}")
+
+
+class Mixtral8x22BModel(HuggingFaceModel):
+
+    def generate(self, messages, *args, **kwargs):
+        if not self.base_model_template:
+            mistral_query = ChatCompletionRequest(messages=list(map(to_mistral_msg, messages)), model="test")
+            messages = mistral_query.model_dump()['messages']
+
+        return super(Mixtral8x22BModel, self).generate(messages, *args, **kwargs)
diff --git a/models/interactivemodel.py b/models/interactivemodel.py
new file mode 100644
index 0000000..c92ebb0
--- /dev/null
+++ b/models/interactivemodel.py
@@ -0,0 +1,52 @@
+from .model import Model
+from .utils import *
+
+
+class InteractiveModel(Model):
+
+    def __init__(self, model_id, *args, **kwargs):
+
+        super(InteractiveModel, self).__init__(
+            model_id=model_id,
+            *args, **kwargs
+        )
+
+    def generate(self, *args, **kwargs):
+        response = f"Dummy simulated message. This is a filler message it same some extra text so as to help estimate the number of tokens. As the gpt generations is set to 100 tokens max. Here we aim to also 100 tokens message. I am repeating it now. This is a filler message it same some extra text so as to help estimate the number of tokens. As the gpt generations is set to 100 tokens max. Here we aim to also 100 tokens message."
+        return response
+
+    def predict(
+        self,
+        messages,
+        answers,
+        label_2_text_option_dict=None,
+        query_string=None,
+        assistant_label=None,
+        user_label="USER",
+        system_label="CONTEXT",
+        *args, **kwargs
+    ):
+        if label_2_text_option_dict is None:
+            raise ValueError("label_2_text_option_dict must be provided")
+
+        if assistant_label is None:
+            raise ValueError("assistant_label must be provided. ")
+
+        formatted_prompt = apply_base_model_template(
+            messages,
+            add_generation_prompt=True,
+            assistant_label=assistant_label,
+            user_label=user_label,
+            system_label=system_label
+        )
+
+        messages += [{
+            "role": "assistant",
+            "content": query_string
+        }]
+        generation = input(f"{formatted_prompt}")
+
+        lprobs = dummy_lprobs_from_generation(generation, answers, label_2_text_option_dict)
+
+        return generation, lprobs
+
diff --git a/models/model.py b/models/model.py
new file mode 100644
index 0000000..ee828cc
--- /dev/null
+++ b/models/model.py
@@ -0,0 +1,19 @@
+from abc import ABC, abstractmethod
+
+
+class Model(ABC):
+
+    def __init__(self, model_id, base_model_template, system_message, verbose=True, **kwargs):
+        self.model_id = model_id
+        self.base_model_template = base_model_template
+        self.system_message = system_message
+        self.verbose = verbose
+
+    @abstractmethod
+    def predict(self, messages, answer, *args, **kwargs):
+        raise NotImplementedError("Not implemented")
+
+    @abstractmethod
+    def generate(self, messages, generation_kwargs=None, *args, **kwargs):
+        raise NotImplementedError("Not implemented")
+
diff --git a/models/openaimodel.py b/models/openaimodel.py
new file mode 100644
index 0000000..8bc2028
--- /dev/null
+++ b/models/openaimodel.py
@@ -0,0 +1,112 @@
+import os
+from termcolor import colored
+
+from tenacity import retry, stop_after_attempt, wait_random_exponential
+
+from openai import AzureOpenAI
+from openai import OpenAI
+import tiktoken
+
+
+from .model import Model
+from .utils import *
+
+
+class OpenAIModel(Model):
+
+    openai_2_azure_tag = {
+        "gpt-3.5-turbo-0125": "gpt-35-turbo-0125",
+        "gpt-3.5-turbo-1106": "gpt-35-turbo-1106"
+    }
+
+    def __init__(self, model_id, use_azure, generation_args, *args, **kwargs):
+
+        super(OpenAIModel, self).__init__(model_id, *args, **kwargs)
+
+        self.azure_id = self.openai_2_azure_tag.get(self.model_id, self.model_id)
+        self.use_azure = use_azure
+
+        if generation_args is None:
+            self.generation_args = {}
+        else:
+            self.generation_args = generation_args
+
+        if self.use_azure:
+            print(colored("Using Azure OPENAI API", "red"))
+
+            if self.model_id == "gpt-3.5-turbo-0125":
+                self.model = AzureOpenAI(
+                    azure_endpoint="https://petunia-grgur.openai.azure.com/",
+                    api_key=os.getenv("AZURE_OPENAI_KEY_gpt_35_turbo_0125"),
+                    api_version="2024-02-15-preview"
+                )
+
+            elif self.model_id == "gpt-3.5-turbo-1106":
+                self.model = AzureOpenAI(
+                    azure_endpoint="https://petunia-grgur-gpt-35-turbo-1106.openai.azure.com/",
+                    api_key=os.getenv("AZURE_OPENAI_KEY_gpt_35_turbo_1106"),
+                    api_version="2024-02-15-preview"
+                )
+            else:
+                raise NotImplementedError("Azure endpoint not found.")
+
+        else:
+            print(colored("Using OPENAI API", "red"))
+            self.model = OpenAI(api_key=os.environ['OPENAI_API_KEY'])
+
+        self.tokenizer = tiktoken.get_encoding("cl100k_base")
+
+    @retry(wait=wait_random_exponential(min=10, max=30), stop=stop_after_attempt(10))
+    def completions_with_backoff(self, **kwargs):
+        return self.model.chat.completions.create(**kwargs)
+
+    def predict(
+        self,
+        messages,
+        answers,
+        query_string=None,
+        label_2_text_option_dict=None,
+        *args, **kwargs
+    ):
+        if label_2_text_option_dict is None:
+            raise ValueError("label_2_text_option_dict must be provided")
+
+        messages.append({"role": "assistant", "content": query_string})
+
+        # get the encoding for each letter in choices
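+        # A logit bias of 100 effectively forces the single output token to be one of these labels.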
+        logit_bias = {self.tokenizer.encode(c)[0]: 100 for c in answers}
+
+        c = self.completions_with_backoff(
+            model=self.azure_id if self.use_azure else self.model_id,
+            messages=messages,
+            max_tokens=1,
+            n=1,
+            temperature=0,
+            logit_bias=logit_bias,
+        )
+
+        generation = c.choices[0].message.content
+        lprobs = dummy_lprobs_from_generation(generation, answers, label_2_text_option_dict)
+
+        return generation, lprobs
+
+    def generate(self, messages, additional_generation_args=None, *args, **kwargs):
+
+        if additional_generation_args is not None:
+            generation_args = {**self.generation_args, **additional_generation_args}
+        else:
+            generation_args = self.generation_args
+
+        c = self.completions_with_backoff(
+            model=self.azure_id if self.use_azure else self.model_id,
+            messages=messages,
+            **generation_args,
+        )
+
+        response = c.choices[0].message.content
+
+        if response is None:
+            response = " "
+
+        return response
+
diff --git a/models/utils.py b/models/utils.py
new file mode 100644
index 0000000..ff2ccbc
--- /dev/null
+++ b/models/utils.py
@@ -0,0 +1,152 @@
+
+def print_chat_messages(messages):
+    print("*********************")
+    print("Messages:")
+    for msg in messages:
+        print(f"{msg['role'].upper()} : {msg['content']}")
+    print("*********************")
+
+def fix_alternating_msg_order(messages):
+
+    if len(messages) <= 1:
+        return messages
+
+    # roles must iterate, and start with user, so we add fixes
+    if messages[0]['role'] == "system" and messages[1]['role'] == "assistant":
+        # insert empty user message
+        messages.insert(1, {"role": "user", "content": ""})
+
+    if messages[0]['role'] == "user" and messages[1]['role'] == "user":
+        # first message sets the persona, second sets the topic
+        # insert artificial message of the model accepting the persona
+        messages.insert(1, {"role": "assistant", "content": "OK"})
+
+    return messages
+
+def construct_messages(prompt, system_message, messages_conv=None):
+
+    set_persona_str = prompt["set_persona_str"]
+    questionnaire_description = prompt["questionnaire_description"]
+
+    user_prompt = f"{questionnaire_description}\n\n" if questionnaire_description else ""
+    user_prompt += prompt["item_str"]
+
+    if system_message or messages_conv:
+        # multiple messages
+        messages = []
+        if set_persona_str:
+            messages.append({
+                "role": "system" if system_message else "user",
+                "content": set_persona_str
+            })
+
+        if messages_conv:
+            messages.extend(messages_conv)
+
+        messages.append({"role": "user", "content": user_prompt})
+
+        if not system_message:
+            # USER, USER -> USER, AS:"OK", USER
+            messages = fix_alternating_msg_order(messages)
+
+    else:
+
+        full_prompt = f"{set_persona_str}\n\n" if set_persona_str else ""
+
+        full_prompt += user_prompt
+
+        messages = [
+            {"role": "user", "content": full_prompt}
+        ]
+
+    return messages
+
+
+def apply_base_model_template(
+        messages,
+        assistant_label,
+        user_label,
+        system_label,
+        add_generation_prompt=True,
+        return_stop_words=False,
+):
+
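+    # Render the conversation as plain "LABEL:content" lines for models without a chat template.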
+    formatted_conversation = ""
+
+    labels_dict = {
+        "ASSISTANT": assistant_label,
+        "SYSTEM": system_label,
+        "USER": user_label,
+    }
+
+    assert assistant_label != ""
+    assert user_label != ""
+    assert system_label != ""
+
+    for msg in messages:
+        label = labels_dict[msg['role'].upper()]
+        formatted_conversation += f"{label}:{msg['content']}"
+        formatted_conversation += "\n"
+
+    if add_generation_prompt:
+        formatted_conversation += f"{labels_dict['ASSISTANT']}:"
+
+    if return_stop_words:
+        return formatted_conversation, [f"\n{l}:" for l in labels_dict.values()]
+    else:
+        return formatted_conversation
+
+def dummy_lprobs_from_generation(response, answers, label_2_text_option_dict):
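+    # Heuristically map a free-form response to pseudo log-probs over the answer labels:
+    # -0.01 for a detected match, -100 otherwise.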
+
+    def find_first_match(response, labels_strings, case_insensitive):
+
+        if case_insensitive:
+            labels_strings = [(l, s.lower()) for l, s in labels_strings]
+            response = response.lower()
+
+        for l, s in labels_strings:
+            if s in response:
+                return l, s
+
+        return None, None
+
+    # first try to match substrings
+    # sort from longest to shortest (to avoid substrings, "Like me" vs "A little like me")
+    labels_text_options = sorted(label_2_text_option_dict.items(), key=lambda x: len(x[1]), reverse=True)
+    label, option = find_first_match(response, labels_text_options, case_insensitive=True)
+
+    if option is not None:
+        lprobs = [-0.01 if a == label else -100 for a in answers]
+        return lprobs
+
+    def find_matches(strings):
+        lprobs = [-100] * len(strings)
+        for i, op in enumerate(strings):
+            if op in response:
+                lprobs[i] = -0.01
+
+        match = any([lp > -100 for lp in lprobs])
+
+        return lprobs, match
+
+    # look for 'A.' -> change to A)
+    lprobs, match = find_matches([f"{a}." for a in answers])
+    if match:
+        return lprobs
+
+    # look for "A "
+    lprobs, _ = find_matches([f"{a} " for a in answers])
+    if match:
+        return lprobs
+
+    # look for "A"
+    lprobs, _ = find_matches(answers)
+    return lprobs
+
+
+
+def secs_2_hms(s):
+    minutes, seconds = divmod(s, 60)
+    hours, minutes = divmod(minutes, 60)
+    return hours, minutes, seconds
+
diff --git a/models_stat_test.py b/models_stat_test.py
deleted file mode 100644
index b6244ab..0000000
--- a/models_stat_test.py
+++ /dev/null
@@ -1,267 +0,0 @@
-#! python3
-
-from pathlib import Path
-import json
-from collections import defaultdict
-import scipy.stats as stats
-from termcolor import colored
-
-data=defaultdict(dict)
-# use t-tests to compare
-
-## Zephyr
-data["zephyr"]["pvq_resS2"] = "results_neurips/results_nat_lang_prof_pvq_test_zephyr-7b-beta_perm_50_System_msg_2nd_prs/"
-data["zephyr"]["pvq_resS3"] = "results_neurips/results_nat_lang_prof_pvq_test_zephyr-7b-beta_perm_50_System_msg_3rd_prs/"
-data["zephyr"]["pvq_resU2"] = "results_neurips/results_nat_lang_prof_pvq_test_zephyr-7b-beta_perm_50_User_msg_2nd_prs/"
-data["zephyr"]["pvq_resU3"] = "results_neurips/results_nat_lang_prof_pvq_test_zephyr-7b-beta_perm_50_User_msg_3rd_prs/"
-data["zephyr"]["hof_resS2"] = "results_neurips/results_nat_lang_prof_hofstede_test_zephyr-7b-beta_perm_50_System_msg_2nd_prs/"
-data["zephyr"]["hof_resS3"] = "results_neurips/results_nat_lang_prof_hofstede_test_zephyr-7b-beta_perm_50_System_msg_3rd_prs/"
-data["zephyr"]["hof_resU2"] = "results_neurips/results_nat_lang_prof_hofstede_test_zephyr-7b-beta_perm_50_User_msg_2nd_prs/"
-data["zephyr"]["hof_resU3"] = "results_neurips/results_nat_lang_prof_hofstede_test_zephyr-7b-beta_perm_50_User_msg_3rd_prs/"
-data["zephyr"]["big5_resS2"] = "results_neurips/results_nat_lang_prof_big5_test_zephyr-7b-beta_perm_50_System_msg_2nd_prs/"
-data["zephyr"]["big5_resS3"] = "results_neurips/results_nat_lang_prof_big5_test_zephyr-7b-beta_perm_50_System_msg_3rd_prs/"
-data["zephyr"]["big5_resU2"] = "results_neurips/results_nat_lang_prof_big5_test_zephyr-7b-beta_perm_50_User_msg_2nd_prs/"
-data["zephyr"]["big5_resU3"] = "results_neurips/results_nat_lang_prof_big5_test_zephyr-7b-beta_perm_50_User_msg_3rd_prs/"
-
-## GPT4
-# data["gpt4"]["pvq_resS2"] = "results_neurips/results_nat_lang_prof_pvq_test_gpt-4-0314_perm_50_System_msg_2nd_prs/"
-data["gpt4"]["pvq_resS3"] = "results_neurips/results_nat_lang_prof_pvq_test_gpt-4-0314_perm_50_System_msg_3rd_prs/"
-# data["gpt4"]["pvq_resU2"] = "results_neurips/results_nat_lang_prof_pvq_test_gpt-4-0314_perm_50_User_msg_2nd_prs/"
-# data["gpt4"]["pvq_resU3"] = "results_neurips/results_nat_lang_prof_pvq_test_gpt-4-0314_perm_50_User_msg_3rd_prs/"
-# data["gpt4"]["hof_resS2"] = "results_neurips/results_nat_lang_prof_hofstede_test_gpt-4-0314_perm_50_System_msg_2nd_prs/"
-# data["gpt4"]["hof_resS3"] = "results_neurips/results_nat_lang_prof_hofstede_test_gpt-4-0314_perm_50_System_msg_3rd_prs/"
-# data["gpt4"]["hof_resU2"] = "results_neurips/results_nat_lang_prof_hofstede_test_gpt-4-0314_perm_50_User_msg_2nd_prs/"
-data["gpt4"]["hof_resU3"] = "results_neurips/results_nat_lang_prof_hofstede_test_gpt-4-0314_perm_50_User_msg_3rd_prs/"
-# data["gpt4"]["big5_resS2"] = "results_neurips/results_nat_lang_prof_big5_test_gpt-4-0314_perm_50_System_msg_2nd_prs/"
-# data["gpt4"]["big5_resS3"] = "results_neurips/results_nat_lang_prof_big5_test_gpt-4-0314_perm_50_System_msg_3rd_prs/"
-# data["gpt4"]["big5_resU2"] = "results_neurips/results_nat_lang_prof_big5_test_gpt-4-0314_perm_50_User_msg_2nd_prs/"
-data["gpt4"]["big5_resU3"] = "results_neurips/results_nat_lang_prof_big5_test_gpt-4-0314_perm_50_User_msg_3rd_prs/"
-
-## GPT35m
-data["gpt35m"]["pvq_resS2"] = "results_neurips/results_nat_lang_prof_pvq_test_gpt-3.5-turbo-0301_perm_50_System_msg_2nd_prs/"
-data["gpt35m"]["pvq_resS3"] = "results_neurips/results_nat_lang_prof_pvq_test_gpt-3.5-turbo-0301_perm_50_System_msg_3rd_prs/"
-data["gpt35m"]["pvq_resU2"] = "results_neurips/results_nat_lang_prof_pvq_test_gpt-3.5-turbo-0301_perm_50_User_msg_2nd_prs/"
-data["gpt35m"]["pvq_resU3"] = "results_neurips/results_nat_lang_prof_pvq_test_gpt-3.5-turbo-0301_perm_50_User_msg_3rd_prs/"
-data["gpt35m"]["hof_resS2"] = "results_neurips/results_nat_lang_prof_hofstede_test_gpt-3.5-turbo-0301_perm_50_System_msg_2nd_prs/"
-data["gpt35m"]["hof_resS3"] = "results_neurips/results_nat_lang_prof_hofstede_test_gpt-3.5-turbo-0301_perm_50_System_msg_3rd_prs/"
-data["gpt35m"]["hof_resU2"] = "results_neurips/results_nat_lang_prof_hofstede_test_gpt-3.5-turbo-0301_perm_50_User_msg_2nd_prs/"
-data["gpt35m"]["hof_resU3"] = "results_neurips/results_nat_lang_prof_hofstede_test_gpt-3.5-turbo-0301_perm_50_User_msg_3rd_prs/"
-data["gpt35m"]["big5_resS2"] = "results_neurips/results_nat_lang_prof_big5_test_gpt-3.5-turbo-0301_perm_50_System_msg_2nd_prs/"
-data["gpt35m"]["big5_resS3"] = "results_neurips/results_nat_lang_prof_big5_test_gpt-3.5-turbo-0301_perm_50_System_msg_3rd_prs/"
-data["gpt35m"]["big5_resU2"] = "results_neurips/results_nat_lang_prof_big5_test_gpt-3.5-turbo-0301_perm_50_User_msg_2nd_prs/"
-data["gpt35m"]["big5_resU3"] = "results_neurips/results_nat_lang_prof_big5_test_gpt-3.5-turbo-0301_perm_50_User_msg_3rd_prs/"
-
-## GPT35j
-data["gpt35j"]["pvq_resS2"] = "results_neurips/results_nat_lang_prof_pvq_test_gpt-3.5-turbo-0613_perm_50_System_msg_2nd_prs/"
-data["gpt35j"]["pvq_resS3"] = "results_neurips/results_nat_lang_prof_pvq_test_gpt-3.5-turbo-0613_perm_50_System_msg_3rd_prs/"
-data["gpt35j"]["pvq_resU2"] = "results_neurips/results_nat_lang_prof_pvq_test_gpt-3.5-turbo-0613_perm_50_User_msg_2nd_prs/"
-data["gpt35j"]["pvq_resU3"] = "results_neurips/results_nat_lang_prof_pvq_test_gpt-3.5-turbo-0613_perm_50_User_msg_3rd_prs/"
-data["gpt35j"]["hof_resS2"] = "results_neurips/results_nat_lang_prof_hofstede_test_gpt-3.5-turbo-0613_perm_50_System_msg_2nd_prs/"
-data["gpt35j"]["hof_resS3"] = "results_neurips/results_nat_lang_prof_hofstede_test_gpt-3.5-turbo-0613_perm_50_System_msg_3rd_prs/"
-data["gpt35j"]["hof_resU2"] = "results_neurips/results_nat_lang_prof_hofstede_test_gpt-3.5-turbo-0613_perm_50_User_msg_2nd_prs/"
-data["gpt35j"]["hof_resU3"] = "results_neurips/results_nat_lang_prof_hofstede_test_gpt-3.5-turbo-0613_perm_50_User_msg_3rd_prs/"
-data["gpt35j"]["big5_resS2"] = "results_neurips/results_nat_lang_prof_big5_test_gpt-3.5-turbo-0613_perm_50_System_msg_2nd_prs/"
-data["gpt35j"]["big5_resS3"] = "results_neurips/results_nat_lang_prof_big5_test_gpt-3.5-turbo-0613_perm_50_System_msg_3rd_prs/"
-data["gpt35j"]["big5_resU2"] = "results_neurips/results_nat_lang_prof_big5_test_gpt-3.5-turbo-0613_perm_50_User_msg_2nd_prs/"
-data["gpt35j"]["big5_resU3"] = "results_neurips/results_nat_lang_prof_big5_test_gpt-3.5-turbo-0613_perm_50_User_msg_3rd_prs/"
-
-## upstage llama 2
-data["upllama2"]["pvq_resS2"] = "results_neurips/results_nat_lang_prof_pvq_test_up_llama2_70b_instruct_v2_perm_50_System_msg_2nd_prs/"
-data["upllama2"]["pvq_resS3"] = "results_neurips/results_nat_lang_prof_pvq_test_up_llama2_70b_instruct_v2_perm_50_System_msg_3rd_prs/"
-data["upllama2"]["pvq_resU2"] = "results_neurips/results_nat_lang_prof_pvq_test_up_llama2_70b_instruct_v2_perm_50_User_msg_2nd_prs/"
-data["upllama2"]["pvq_resU3"] = "results_neurips/results_nat_lang_prof_pvq_test_up_llama2_70b_instruct_v2_perm_50_User_msg_3rd_prs/"
-data["upllama2"]["hof_resS2"] = "results_neurips/results_nat_lang_prof_hofstede_test_up_llama2_70b_instruct_v2_perm_50_System_msg_2nd_prs/"
-data["upllama2"]["hof_resS3"] = "results_neurips/results_nat_lang_prof_hofstede_test_up_llama2_70b_instruct_v2_perm_50_System_msg_3rd_prs/"
-data["upllama2"]["hof_resU2"] = "results_neurips/results_nat_lang_prof_hofstede_test_up_llama2_70b_instruct_v2_perm_50_User_msg_2nd_prs/"
-data["upllama2"]["hof_resU3"] = "results_neurips/results_nat_lang_prof_hofstede_test_up_llama2_70b_instruct_v2_perm_50_User_msg_3rd_prs/"
-data["upllama2"]["big5_resS2"] = "results_neurips/results_nat_lang_prof_big5_test_up_llama2_70b_instruct_v2_perm_50_System_msg_2nd_prs/"
-data["upllama2"]["big5_resS3"] = "results_neurips/results_nat_lang_prof_big5_test_up_llama2_70b_instruct_v2_perm_50_System_msg_3rd_prs/"
-data["upllama2"]["big5_resU2"] = "results_neurips/results_nat_lang_prof_big5_test_up_llama2_70b_instruct_v2_perm_50_User_msg_2nd_prs/"
-data["upllama2"]["big5_resU3"] = "results_neurips/results_nat_lang_prof_big5_test_up_llama2_70b_instruct_v2_perm_50_User_msg_3rd_prs/"
-
-## upstage llama 1
-data["upllama1"]["pvq_resS2"] = "results_neurips/results_nat_lang_prof_pvq_test_up_llama_60b_instruct_perm_50_System_msg_2nd_prs/"
-data["upllama1"]["pvq_resS3"] = "results_neurips/results_nat_lang_prof_pvq_test_up_llama_60b_instruct_perm_50_System_msg_3rd_prs/"
-data["upllama1"]["pvq_resU2"] = "results_neurips/results_nat_lang_prof_pvq_test_up_llama_60b_instruct_perm_50_User_msg_2nd_prs/"
-data["upllama1"]["pvq_resU3"] = "results_neurips/results_nat_lang_prof_pvq_test_up_llama_60b_instruct_perm_50_User_msg_3rd_prs/"
-data["upllama1"]["hof_resS2"] = "results_neurips/results_nat_lang_prof_hofstede_test_up_llama_60b_instruct_perm_50_System_msg_2nd_prs/"
-data["upllama1"]["hof_resS3"] = "results_neurips/results_nat_lang_prof_hofstede_test_up_llama_60b_instruct_perm_50_System_msg_3rd_prs/"
-data["upllama1"]["hof_resU2"] = "results_neurips/results_nat_lang_prof_hofstede_test_up_llama_60b_instruct_perm_50_User_msg_2nd_prs/"
-data["upllama1"]["hof_resU3"] = "results_neurips/results_nat_lang_prof_hofstede_test_up_llama_60b_instruct_perm_50_User_msg_3rd_prs/"
-data["upllama1"]["big5_resS2"] = "results_neurips/results_nat_lang_prof_big5_test_up_llama_60b_instruct_perm_50_System_msg_2nd_prs/"
-data["upllama1"]["big5_resS3"] = "results_neurips/results_nat_lang_prof_big5_test_up_llama_60b_instruct_perm_50_System_msg_3rd_prs/"
-data["upllama1"]["big5_resU2"] = "results_neurips/results_nat_lang_prof_big5_test_up_llama_60b_instruct_perm_50_User_msg_2nd_prs/"
-data["upllama1"]["big5_resU3"] = "results_neurips/results_nat_lang_prof_big5_test_up_llama_60b_instruct_perm_50_User_msg_3rd_prs/"
-
-## OpenAssistant
-data["oa"]["pvq_resS2"]="results_neurips/results_nat_lang_prof_pvq_test_openassistant_rlhf2_llama30b_perm_50_System_msg_2nd_prs/"
-data["oa"]["pvq_resS3"]="results_neurips/results_nat_lang_prof_pvq_test_openassistant_rlhf2_llama30b_perm_50_System_msg_3rd_prs/"
-data["oa"]["pvq_resU2"]="results_neurips/results_nat_lang_prof_pvq_test_openassistant_rlhf2_llama30b_perm_50_User_msg_2nd_prs/"
-data["oa"]["pvq_resU3"]="results_neurips/results_nat_lang_prof_pvq_test_openassistant_rlhf2_llama30b_perm_50_User_msg_3rd_prs/"
-data["oa"]["hof_resS2"]="results_neurips/results_nat_lang_prof_hofstede_test_openassistant_rlhf2_llama30b_perm_50_System_msg_2nd_prs/"
-data["oa"]["hof_resS3"]="results_neurips/results_nat_lang_prof_hofstede_test_openassistant_rlhf2_llama30b_perm_50_System_msg_3rd_prs/"
-data["oa"]["hof_resU2"]="results_neurips/results_nat_lang_prof_hofstede_test_openassistant_rlhf2_llama30b_perm_50_User_msg_2nd_prs/"
-data["oa"]["hof_resU3"]="results_neurips/results_nat_lang_prof_hofstede_test_openassistant_rlhf2_llama30b_perm_50_User_msg_3rd_prs/"
-data["oa"]["big5_resS2"]="results_neurips/results_nat_lang_prof_big5_test_openassistant_rlhf2_llama30b_perm_50_System_msg_2nd_prs/"
-data["oa"]["big5_resS3"]="results_neurips/results_nat_lang_prof_big5_test_openassistant_rlhf2_llama30b_perm_50_System_msg_3rd_prs/"
-data["oa"]["big5_resU2"]="results_neurips/results_nat_lang_prof_big5_test_openassistant_rlhf2_llama30b_perm_50_User_msg_2nd_prs/"
-data["oa"]["big5_resU3"]="results_neurips/results_nat_lang_prof_big5_test_openassistant_rlhf2_llama30b_perm_50_User_msg_3rd_prs/"
-
-### StableVicuna
-data["stvic"]["pvq_resU2"]="results_neurips/results_nat_lang_prof_pvq_test_stablevicuna_perm_50_User_msg_2nd_prs/"
-data["stvic"]["pvq_resU3"]="results_neurips/results_nat_lang_prof_pvq_test_stablevicuna_perm_50_User_msg_3rd_prs/"
-data["stvic"]["hof_resU2"]="results_neurips/results_nat_lang_prof_hofstede_test_stablevicuna_perm_50_User_msg_2nd_prs/"
-data["stvic"]["hof_resU3"]="results_neurips/results_nat_lang_prof_hofstede_test_stablevicuna_perm_50_User_msg_3rd_prs/"
-data["stvic"]["big5_resU2"]="results_neurips/results_nat_lang_prof_big5_test_stablevicuna_perm_50_User_msg_2nd_prs/"
-data["stvic"]["big5_resU3"]="results_neurips/results_nat_lang_prof_big5_test_stablevicuna_perm_50_User_msg_3rd_prs/"
-
-## StableLM
-data["stlm"]["pvq_resS2"]="results_neurips/results_nat_lang_prof_pvq_test_stablelm_perm_50_System_msg_2nd_prs/"
-data["stlm"]["pvq_resS3"]="results_neurips/results_nat_lang_prof_pvq_test_stablelm_perm_50_System_msg_3rd_prs/"
-data["stlm"]["pvq_resU2"]="results_neurips/results_nat_lang_prof_pvq_test_stablelm_perm_50_User_msg_2nd_prs/"
-data["stlm"]["pvq_resU3"]="results_neurips/results_nat_lang_prof_pvq_test_stablelm_perm_50_User_msg_3rd_prs/"
-data["stlm"]["hof_resS2"]="results_neurips/results_nat_lang_prof_hofstede_test_stablelm_perm_50_System_msg_2nd_prs/"
-data["stlm"]["hof_resS3"]="results_neurips/results_nat_lang_prof_hofstede_test_stablelm_perm_50_System_msg_3rd_prs/"
-data["stlm"]["hof_resU2"]="results_neurips/results_nat_lang_prof_hofstede_test_stablelm_perm_50_User_msg_2nd_prs/"
-data["stlm"]["hof_resU3"]="results_neurips/results_nat_lang_prof_hofstede_test_stablelm_perm_50_User_msg_3rd_prs/"
-data["stlm"]["big5_resS2"]="results_neurips/results_nat_lang_prof_big5_test_stablelm_perm_50_System_msg_2nd_prs/"
-data["stlm"]["big5_resS3"]="results_neurips/results_nat_lang_prof_big5_test_stablelm_perm_50_System_msg_3rd_prs/"
-data["stlm"]["big5_resU2"]="results_neurips/results_nat_lang_prof_big5_test_stablelm_perm_50_User_msg_2nd_prs/"
-data["stlm"]["big5_resU3"]="results_neurips/results_nat_lang_prof_big5_test_stablelm_perm_50_User_msg_3rd_prs/"
-
-## LLaMa 65B
-data["llama"]["pvq_resU2"]="results_neurips/results_nat_lang_prof_pvq_test_llama_65B_perm_50__msg_2nd_prs/"
-data["llama"]["pvq_resU3"]="results_neurips/results_nat_lang_prof_pvq_test_llama_65B_perm_50__msg_3rd_prs/"
-data["llama"]["hof_resU2"]="results_neurips/results_nat_lang_prof_hofstede_test_llama_65B_perm_50__msg_2nd_prs/"
-data["llama"]["hof_resU3"]="results_neurips/results_nat_lang_prof_hofstede_test_llama_65B_perm_50__msg_3rd_prs/"
-data["llama"]["big5_resU2"]="results_neurips/results_nat_lang_prof_big5_test_llama_65B_perm_50__msg_2nd_prs/"
-data["llama"]["big5_resU3"]="results_neurips/results_nat_lang_prof_big5_test_llama_65B_perm_50__msg_3rd_prs/"
-
-## RP Chat
-data["rpchat"]["pvq_resU2"]="results_neurips/results_nat_lang_prof_pvq_test_rp_incite_7b_chat_perm_50_User_msg_2nd_prs/"
-data["rpchat"]["pvq_resU3"]="results_neurips/results_nat_lang_prof_pvq_test_rp_incite_7b_chat_perm_50_User_msg_3rd_prs/"
-data["rpchat"]["hof_resU2"]="results_neurips/results_nat_lang_prof_hofstede_test_rp_incite_7b_chat_perm_50_User_msg_2nd_prs/"
-data["rpchat"]["hof_resU3"]="results_neurips/results_nat_lang_prof_hofstede_test_rp_incite_7b_chat_perm_50_User_msg_3rd_prs/"
-data["rpchat"]["big5_resU2"]="results_neurips/results_nat_lang_prof_big5_test_rp_incite_7b_chat_perm_50_User_msg_2nd_prs/"
-data["rpchat"]["big5_resU3"]="results_neurips/results_nat_lang_prof_big5_test_rp_incite_7b_chat_perm_50_User_msg_3rd_prs/"
-
-## RP Instruct
-data["rpinstruct"]["pvq_resU2"]="results_neurips/results_nat_lang_prof_pvq_test_rp_incite_7b_instruct_perm_50_User_msg_2nd_prs/"
-data["rpinstruct"]["pvq_resU3"]="results_neurips/results_nat_lang_prof_pvq_test_rp_incite_7b_instruct_perm_50_User_msg_3rd_prs/"
-data["rpinstruct"]["hof_resU2"]="results_neurips/results_nat_lang_prof_hofstede_test_rp_incite_7b_instruct_perm_50_User_msg_2nd_prs/"
-data["rpinstruct"]["hof_resU3"]="results_neurips/results_nat_lang_prof_hofstede_test_rp_incite_7b_instruct_perm_50_User_msg_3rd_prs/"
-data["rpinstruct"]["big5_resU2"]="results_neurips/results_nat_lang_prof_big5_test_rp_incite_7b_instruct_perm_50_User_msg_2nd_prs/"
-data["rpinstruct"]["big5_resU3"]="results_neurips/results_nat_lang_prof_big5_test_rp_incite_7b_instruct_perm_50_User_msg_3rd_prs/"
-
-## gpt-3.5-turbo-instruct
-data["gpt35in"]["pvq_resU2"]="results_neurips/results_nat_lang_prof_pvq_test_gpt-3.5-turbo-instruct-0914_perm_50_User_msg_2nd_prs/"
-data["gpt35in"]["pvq_resU3"]="results_neurips/results_nat_lang_prof_pvq_test_gpt-3.5-turbo-instruct-0914_perm_50_User_msg_3rd_prs/"
-data["gpt35in"]["hof_resU2"]="results_neurips/results_nat_lang_prof_hofstede_test_gpt-3.5-turbo-instruct-0914_perm_50_User_msg_2nd_prs/"
-data["gpt35in"]["hof_resU3"]="results_neurips/results_nat_lang_prof_hofstede_test_gpt-3.5-turbo-instruct-0914_perm_50_User_msg_3rd_prs/"
-data["gpt35in"]["big5_resU2"]="results_neurips/results_nat_lang_prof_big5_test_gpt-3.5-turbo-instruct-0914_perm_50_User_msg_2nd_prs/"
-data["gpt35in"]["big5_resU3"]="results_neurips/results_nat_lang_prof_big5_test_gpt-3.5-turbo-instruct-0914_perm_50_User_msg_3rd_prs/"
-
-## Curie
-data["curie"]["pvq_resU2"]="results_neurips/results_nat_lang_prof_pvq_test_curie_perm_50_User_msg_2nd_prs/"
-data["curie"]["pvq_resU3"]="results_neurips/results_nat_lang_prof_pvq_test_curie_perm_50_User_msg_3rd_prs/"
-data["curie"]["hof_resU2"]="results_neurips/results_nat_lang_prof_hofstede_test_curie_perm_50_User_msg_2nd_prs/"
-data["curie"]["hof_resU3"]="results_neurips/results_nat_lang_prof_hofstede_test_curie_perm_50_User_msg_3rd_prs/"
-data["curie"]["big5_resU2"]="results_neurips/results_nat_lang_prof_big5_test_curie_perm_50_User_msg_2nd_prs/"
-data["curie"]["big5_resU3"]="results_neurips/results_nat_lang_prof_big5_test_curie_perm_50_User_msg_3rd_prs/"
-
-## Babbage
-data["babbage"]["pvq_resU2"]="results_neurips/results_nat_lang_prof_pvq_test_babbage_perm_50_User_msg_2nd_prs/"
-data["babbage"]["pvq_resU3"]="results_neurips/results_nat_lang_prof_pvq_test_babbage_perm_50_User_msg_3rd_prs/"
-data["babbage"]["hof_resU2"]="results_neurips/results_nat_lang_prof_hofstede_test_babbage_perm_50_User_msg_2nd_prs/"
-data["babbage"]["hof_resU3"]="results_neurips/results_nat_lang_prof_hofstede_test_babbage_perm_50_User_msg_3rd_prs/"
-data["babbage"]["big5_resU2"]="results_neurips/results_nat_lang_prof_big5_test_babbage_perm_50_User_msg_2nd_prs/"
-data["babbage"]["big5_resU3"]="results_neurips/results_nat_lang_prof_big5_test_babbage_perm_50_User_msg_3rd_prs/"
-
-## Ada
-data["ada"]["pvq_resU2"]="results_neurips/results_nat_lang_prof_pvq_test_ada_perm_50_User_msg_2nd_prs/"
-data["ada"]["pvq_resU3"]="results_neurips/results_nat_lang_prof_pvq_test_ada_perm_50_User_msg_3rd_prs/"
-data["ada"]["hof_resU2"]="results_neurips/results_nat_lang_prof_hofstede_test_ada_perm_50_User_msg_2nd_prs/"
-data["ada"]["hof_resU3"]="results_neurips/results_nat_lang_prof_hofstede_test_ada_perm_50_User_msg_3rd_prs/"
-data["ada"]["big5_resU2"]="results_neurips/results_nat_lang_prof_big5_test_ada_perm_50_User_msg_2nd_prs/"
-data["ada"]["big5_resU3"]="results_neurips/results_nat_lang_prof_big5_test_ada_perm_50_User_msg_3rd_prs/"
-
-
-models = ["zephyr", "gpt4", "gpt35m", "gpt35j", "gpt35in", "upllama2","upllama1", "oa", "stvic", "stlm", "llama", "rpchat", "rpincite", "curie", "babbage", "ada"]
-msg = ["S", "U"]
-prs = ["2", "3"]
-
-# pvq
-questionnaires = ["pvq"]
-comparisons = [("gpt35m", m) for m in models]
-label_best = "pvq_resU2"
-
-# hof
-questionnaires = ["hof"]
-comparisons = [("upllama1", m) for m in models]
-label_best = "hof_resU3"
-#
-# # big5
-questionnaires = ["big5"]
-comparisons = [("gpt35j", m) for m in models]
-label_best = "big5_resS3"
-
-
-# replace paths with data from alignments.json
-for model in models:
-    for quest in questionnaires:
-        for m in msg:
-
-            for p in prs:
-                label = quest + "_res" + m + p
-                if label not in data[model]:
-                    continue
-                pa = Path(data[model][label])
-
-                json_paths= list(pa.glob("*/alignments.json"))
-
-                if len(json_paths) == 0:
-                    print("No JSON files found in", pa)
-                    continue
-
-                json_data = []
-
-                for json_path in json_paths:
-                    # Open each JSON file
-                    with open(json_path, 'r') as f:
-                        # Load the JSON data from the file
-                        load_data = json.load(f)
-                        # Append the data to the list
-                        json_data.extend(load_data)
-                data[model][label] = json_data
-
-p_limit = 0.05 / 15
-
-print("p-limit: {}".format(p_limit))
-
-for mod_1, mod_2 in comparisons:
-    print("-> {} vs {}:".format(mod_1, mod_2))
-
-    for quest in questionnaires:
-        print(f"\t-> {quest}:")
-        for m in msg:
-            if (mod_1 == "stvic" or mod_2 == "stvic") and m == "S":
-                continue
-
-            for p in prs:
-                label = quest + "_res" + m + p
-                if label not in data[mod_1] or label not in data[mod_2]:
-                    continue
-
-                a=data[mod_1][label_best]
-                b=data[mod_2][label]
-
-                pvalue = stats.ttest_ind(a=a, b=b, equal_var=False).pvalue
-
-                if pvalue < p_limit:
-                    mark = "*"
-                    color = "green"
-                else:
-                    mark = " "
-                    color = "red"
-
-                print(colored(f"\t {mark} {label.split('_')[1]} -> {pvalue}", color=color))
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index a9f8bf1..c581f4d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,19 +1,20 @@
 termcolor==2.4.0
 bitsandbytes==0.43.0
 transformers==4.36.2
-peft==0.10.0
 accelerate==0.28.0
-trl==0.8.1
 datasets==2.16.0
 sentencepiece==0.2.0
 IPython==8.18.1
 tiktoken==0.6.0
 einops==0.7.0 # Qwen
-transformers_stream_generator==0.0.5 # Qwen
+transformers_stream_generator==0.0.4 # Qwen
 matplotlib==3.8.3
 checksumdir==1.2.0
 scipy==1.12.0
 termcolor==2.4.0
 statsmodels==0.14.1
 openai==1.14.3
-tiktoken==0.6.0
\ No newline at end of file
+tiktoken==0.6.0
+tenacity==8.2.3
+mergedeep==1.3.4
+mistral-common==1.0.2
\ No newline at end of file
diff --git a/run_campaign_seeds.sh b/run_campaign_seeds.sh
new file mode 100644
index 0000000..e3e939b
--- /dev/null
+++ b/run_campaign_seeds.sh
@@ -0,0 +1,125 @@
+#!/bin/bash
+#SBATCH -A imi@a100
+#SBATCH -C a100
+#SBATCH --time=02:29:59
+#SBATCH --gres=gpu:2
+#SBATCH --array=0-24 # themes x n_seeds -> 6x5 (0-24 wo None, 0-29 for all)
+#SBATCH -o slurm_logs/sb_log_%A_%a.out
+#SBATCH -e slurm_logs/sb_log_%A_%a.err
+##SBATCH --qos=qos_gpu-dev
+
+##########################################################
+# Set the questionnaire and population (using the second command argument)
+##########################################################
+
+experiment_setting=$2
+
+# Define the configuration based on the experiment_setting
+case "$experiment_setting" in
+  pvq_tolk)
+    test_tag="pvq"
+    experiment_name="pvq_test"
+    data_dir="data_pvq"
+    population_type="tolkien_characters"
+    ;;
+  pvq_fam)
+    test_tag="pvq"
+    experiment_name="pvq_test"
+    data_dir="data_pvq"
+    population_type="famous_people"
+    ;;
+  don)
+    test_tag="tolkien_donation"
+    experiment_name="tolkien_donation_test"
+    data_dir="data_tolkien_donation"
+    population_type="tolkien_characters"
+    ;;
+  bag)
+    test_tag="tolkien_bag"
+    experiment_name="tolkien_bag_test"
+    data_dir="data_tolkien_bag"
+    population_type="tolkien_characters"
+    ;;
+  religion)
+    test_tag="religion"
+    experiment_name="religion_test"
+    data_dir="data_religion"
+    population_type="famous_people"
+    ;;
+  *)
+    echo "Invalid experiment_setting. Please use one of the following: pvq_tolk, pvq_fam, don, bag, religion."
+    exit 1
+    ;;
+esac
+
+# Print the selected configuration
+echo "test_tag=$test_tag"
+echo "experiment_name=$experiment_name"
+echo "data_dir=$data_dir"
+echo "population_type=$population_type"
+
+
+# Extract parameters: theme and seed
+##########################################################
+themes=("grammar" "joke" "poem" "history" "chess" "None")
+seed_list=(0 2 4 6 8)
+
+seed_list_len=${#seed_list[@]}
+
+
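+# Map the SLURM array index onto a (theme, seed) pair.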
+theme_i=$(( SLURM_ARRAY_TASK_ID / $seed_list_len ))
+seed_i=$(( SLURM_ARRAY_TASK_ID % $seed_list_len ))
+
+
+theme="${themes[$theme_i]}"
+seed="${seed_list[$seed_i]}"
+
+permute_options_seed="$seed"_"$theme_i"
+
+# Other params
+##########################################################
+engine="$1"
+n_msgs=3
+
+echo "ID:"$SLURM_ARRAY_TASK_ID
+echo "Theme:"$theme
+echo "Seed:"$seed
+echo "Seed str:"$permute_options_seed
+echo "Evaluation:$engine:$theme:$permute_options_seed:$n_msgs:$test_tag:$population_type"
+
+# Setup the experiments directories
+##########################################################
+SUBDIR="stability_default_params_${test_tag}_${population_type}/${engine}/seed_${seed}/theme_${theme}"
+
+SAVE_DIR="results/"$SUBDIR
+LOG_DIR="logs/"$SUBDIR
+
+# Start the experiment
+##########################################################
+mkdir -p $LOG_DIR
+
+source $HOME/.bashrc
+
+if [[ "$engine" == "phi-1" || "$engine" == "phi-2" || "$engine" == "Qwen1.5*"  || "$engine" == "llama_3*" || "$engine" == "command_r_plus" ]]; then
+    conda activate llm_stability_phi
+else
+    conda activate llm_stability
+fi
+
+
+
+python -u evaluate_v3.py \
+  --simulated-population-type $population_type \
+  --simulated-conversation-theme $theme \
+  --simulated-human-knows-persona \
+  --simulated-conversation-n-messages $n_msgs \
+  --permute-options \
+  --permute-options-seed "$permute_options_seed" \
+  --save_dir $SAVE_DIR \
+  --engine "$engine" \
+  --data_dir data/$data_dir \
+  --experiment_name $experiment_name \
+  --pvq-version "pvq_auto" \
+  --azure-openai \
+  --assert-params \
+  --verbose  2>&1 | tee -a $LOG_DIR/log_$permute_options_seed.txt
\ No newline at end of file
diff --git a/run_campaign_sim_conv_no_pop.sh b/run_campaign_sim_conv_no_pop.sh
index b82bd56..979e605 100644
--- a/run_campaign_sim_conv_no_pop.sh
+++ b/run_campaign_sim_conv_no_pop.sh
@@ -64,14 +64,13 @@ all_engines=(
   "Mixtral-8x7B-Instruct-v0.1"
   "phi-2"
   "phi-1"
-  "phi-1.5"
   "Qwen-72B"
   "Qwen-14B"
   "Qwen-7B"
   "Qwen-72B-Chat"
   "dummy"
   "gpt-3.5-turbo-0125"
-#  "gpt-3.5-turbo-1106"
+  "gpt-3.5-turbo-1106"
 #  "gpt-3.5-turbo-0613"
 #  "gpt-3.5-turbo-0301"
 )
@@ -79,7 +78,6 @@ all_engines=(
 # Select engine based on provided index
 engine="${all_engines[$1]}"
 
-
 echo "Evaluation:$engine:$theme:$permute_options_seed:$n_msgs"
 
 SUBDIR="sim_conv_"$test_tag"_"$population_type"_msgs/"$engine"/"$n_msgs"_msgs/"$seed"_seed/results_sim_conv_"$population_type"_"$engine
@@ -236,8 +234,8 @@ elif [[ $engine == *"gpt"* ]] ; then
     --data_dir data/$data_dir \
     --experiment_name $experiment_name \
     --pvq-version "pvq_auto" \
-    --azure-openai \
-    --verbose  2>&1 | tee -a $LOG_DIR/log_$permute_options_seed.txt
+    --verbose \
+    --azure-openai 2>&1 | tee -a $LOG_DIR/log_$permute_options_seed.txt
 
 
 else
diff --git a/run_campaign_sim_conv_pvq_seeds.sh b/run_campaign_sim_conv_pvq_seeds.sh
index a1d1bcd..5cf3626 100644
--- a/run_campaign_sim_conv_pvq_seeds.sh
+++ b/run_campaign_sim_conv_pvq_seeds.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 #SBATCH -A imi@a100
 #SBATCH -C a100
-#SBATCH --time=01:29:59
+#SBATCH --time=01:59:59
 #SBATCH --gres=gpu:2
 #SBATCH --array=0-24 # themes x n_seeds -> 6x5 (0-24 wo None, 0-29 for all)
 ##SBATCH --array=0-4 # just grammar
@@ -89,9 +89,6 @@ seed_i=$(( SLURM_ARRAY_TASK_ID % $seed_list_len ))
 theme="${themes[$theme_i]}"
 seed="${seed_list[$seed_i]}"
 
-echo "Theme_i:"$theme_i
-echo "Seed_i:"$seed_i
-
 echo "Theme:"$theme
 echo "Seed:"$seed
 
@@ -118,23 +115,31 @@ all_engines=(
   "Mixtral-8x7B-Instruct-v0.1"
   "phi-2"
   "phi-1"
-  "phi-1.5"
   "Qwen-72B"
   "Qwen-14B"
   "Qwen-7B"
-  "Qwen-72B-Chat"
+  "Mistral-7B-v0.1_ft_roleplay_filtered_chars_lora_batch_size_16_rank_256"
+  "Mistral-7B-v0.1_ft_roleplay_filtered_chars_no_peft_batch_size_16_rank_256"
+  "gpt-3.5-turbo-0125"
+  "gpt-3.5-turbo-1106"
+  "Mistral-7B-v0.1_ft_NO_INSTR_TEMPL_roleplay_filtered_chars_batch_size_16_rank_256"
+  "Mistral-7B-v0.1_ft_NO_INSTR_TEMPL_LOAD_INSTRUCT_roleplay_filtered_chars_batch_size_16_rank_256"
+  "Mistral-7B-v0.1_ft_roleplay_filtered_chars_lora_target_all_lin_and_train_ml_headbatch_size_16_rank_256"
+  "Mistral-7B-v0.1_ft_roleplay_filtered_chars_lora_target_all_lin_and_train_ml_head_batch_size_8_rank_64_lr_0.0002_train_on_all"
+  "Mistral-7B-v0.1_ft_roleplay_filtered_chars_no_peft_batch_size_8_rank_64_lr_2e-05_train_on_all"
+  "Mistral-7B-Instruct-v0.2_ft_roleplay_batch_size_16_rank_256"
   "dummy"
 )
 
 # Select engine based on provided index
 engine="${all_engines[$1]}"
 
-
 echo "Evaluation:$engine:$theme:$permute_options_seed:$n_msgs"
 
 SUBDIR="RERUN_sim_conv_"$test_tag"_"$population_type"_seeds/"$engine"/"$seed"_seed/results_sim_conv_"$population_type"_"$engine"_msgs_"$n_msgs
 SAVE_DIR="results/"$SUBDIR
-#SAVE_DIR="test_results/"$SUBDIR
+
+SAVE_DIR="test/"$SUBDIR
 LOG_DIR="logs/"$SUBDIR
 
 mkdir -p $SAVE_DIR
@@ -148,7 +153,7 @@ if [[ $engine == *"Mistral"* ]] || [[ $engine == *"Mixtral"* ]]; then
 
   echo "Mistral or Mixtral: $engine"
 
-  if [[ $engine == *"Instruct"* ]] ; then
+  if [[ $engine == *"Instruct"* ]] || [[ $engine == *"ft_roleplay"* ]] ; then
     # INSTRUCT MODELS
 
     # mistral, mixtral -> no sys; query
@@ -292,10 +297,10 @@ elif [[ $engine == *"gpt"* ]] ; then
     --experiment_name $experiment_name \
     --pvq-version "pvq_auto" \
     --azure-openai \
+    --overwrite \
+    --verbose \
     --assert-params 2>&1 | tee -a $LOG_DIR/log_$permute_options_seed.txt
 
-#    --verbose  2>&1 | tee -a $LOG_DIR/log_$permute_options_seed.txt
-
 else
   echo "Undefined engine: $engine"
 fi
\ No newline at end of file
diff --git a/run_dummy.sh b/run_dummy.sh
index cb7c472..5342a81 100644
--- a/run_dummy.sh
+++ b/run_dummy.sh
@@ -1,79 +1,124 @@
-##!/bin/bash
+#!/bin/bash
 
-engines=(
-  "dummy"
-#  "zephyr-7b-beta"
-#  "Mistral-7B-v0.1"
-#  "Mistral-7B-Instruct-v0.1"
-#  "Mistral-7B-Instruct-v0.2"
-#  "Mixtral-8x7B-Instruct-v0.1"
-#  "Mixtral-8x7B-Instruct-v0.1-4b"
-#  "Mixtral-8x7B-v0.1-4b"
-#  "Mixtral-8x7B-v0.1"
-#  "Mixtral-8x7B-Instruct-v0.1"
-#  "zephyr-7b-beta"
-#  "llama_2_7b_chat" # 2 gpu
-#  "falcon-7b"
-#  "phi-2"
-#  "phi-1.5"
-#  "phi-1"
-#  "Qwen-72B"
-#  "Qwen-14B"
-#  "Qwen-7B"
-#  "Qwen-72B-Chat"
-)
+##########################################################
+# Set the questionnaire and population (via experiment_setting below)
+##########################################################
 
-for engine in "${engines[@]}"; do
+theme="chess"
+n_msgs=3
+permute_options_seed="test"
+#engine="dummy"
+#engine="interactive"
+#engine="gpt-3.5-turbo-0125"
+#engine="phi-3"
+#engine="Qwen1.5-72B-Chat"
+#engine="Mistral-7B-Instruct-v0.1"
+#engine="Mixtral-8x7B-Instruct-v0.1-4b"
+engine="Mixtral-8x22B-Instruct-v0.1-4b"
+#engine="llama_3_8b_instruct"
+#engine="llama_3_70b_instruct"
+#experiment_setting=religion
 
-# Tolkien characters
-#--simulated-population-type tolkien_characters \
+#engine="llama_2_7b"
+#engine="llama_2_13b"
+#engine="llama_2_70b"
+#engine="llama_2_7b_chat"
+#engine="llama_2_13b_chat"
+#engine="llama_2_70b_chat"
+#engine="Mistral-7B-v0.1"
+#engine="Mistral-7B-Instruct-v0.1"
+#engine="Mistral-7B-Instruct-v0.2"
+#engine="zephyr-7b-beta"
+#engine="Mixtral-8x7B-v0.1-4b"
+#engine="Mixtral-8x7B-Instruct-v0.1-4b"
+#engine="Mixtral-8x7B-v0.1"
+#engine="Mixtral-8x7B-Instruct-v0.1"
+#engine="phi-1"
+#engine="phi-2"
+#engine="Qwen-7B"
+#engine="Qwen-14B"
+#engine="Qwen-72B"
+#engine="gpt-3.5-turbo-1106"
+#engine="gpt-3.5-turbo-0125"
 
-# Real world personas
-#--simulated-population-type famous_people \
 
-# No personas
-#--simulated-population-type permutations \
-#--permutations 50 \
+experiment_setting=pvq_tolk
 
+# Define the configuration based on the experiment_setting
+case "$experiment_setting" in
+  pvq_tolk)
+    test_tag="pvq"
+    experiment_name="pvq_test"
+    data_dir="data_pvq"
+    population_type="tolkien_characters"
+    ;;
+  pvq_fam)
+    test_tag="pvq"
+    experiment_name="pvq_test"
+    data_dir="data_pvq"
+    population_type="famous_people"
+    ;;
+  don)
+    test_tag="tolkien_donation"
+    experiment_name="tolkien_donation_test"
+    data_dir="data_tolkien_donation"
+    population_type="tolkien_characters"
+    ;;
+  bag)
+    test_tag="tolkien_bag"
+    experiment_name="tolkien_bag_test"
+    data_dir="data_tolkien_bag"
+    population_type="tolkien_characters"
+    ;;
+  religion)
+    test_tag="religion"
+    experiment_name="religion_test"
+    data_dir="data_religion"
+    population_type="famous_people"
+    ;;
+  *)
+    echo "Invalid experiment_setting. Please use one of the following: pvq_tolk, pvq_fam, don, bag, religion."
+    exit 1
+    ;;
+esac
 
-# Questionnaire
-# PVQ
-#--data_dir data/data_pvq \
-#--experiment_name pvq_test \
+# Print the selected configuration
+echo "test_tag=$test_tag"
+echo "experiment_name=$experiment_name"
+echo "data_dir=$data_dir"
+echo "population_type=$population_type"
+echo "engine=$engine"
+#####################################################
 
-# tolkien donation
-#--data_dir data/data_tolkien_donation \
-#--experiment_name tolkien_donation_test \
 
-# tolkien stealing
-#--data_dir data/data_pvq \
-#--experiment_name pvq_test \
+SUBDIR="test/"$engine"/"$seed"_seed/results_sim_conv_"$population_type"_"$engine"_msgs_"$n_msgs
+SAVE_DIR="test_results/"$SUBDIR
+LOG_DIR="test_logs/"$SUBDIR
 
-#--data_dir data/data_tolkien_bag \
-#--experiment_name tolkien_bag_test \
+#mkdir -p $SAVE_DIR
+mkdir -p $LOG_DIR
 
-#--data_dir data/data_svo \
-#--experiment_name svo_test \
+source $HOME/.bashrc
 
-python -u evaluate.py \
---simulated-population-type tolkien_characters \
---simulate-conversation-theme "None" \
---simulated-human-knows-persona \
---simulated-conversation-n-messages 3 \
---permute-options \
---permute-options-seed "5" \
---format chat \
---save_dir results/test/test \
---engine $engine \
---query-in-reply \
---data_dir data/data_religion \
---experiment_name religion_test \
---pvq-version "pvq_auto" \
---no-profile \
---direct-perspective \
---base-model-template \
---system-message \
---verbose
-
-done
+if [[ "$engine" == "phi-1" || "$engine" == "phi-2" || "$engine" == "Qwen1.5*"  || "$engine" == "llama_3*" || "$engine" == "command_r_plus" ]]; then
+    conda activate llm_stability_phi
+else
+    conda activate llm_stability
+fi
 
+python -u evaluate_v3.py \
+  --simulated-population-type $population_type \
+  --simulated-conversation-theme $theme \
+  --simulated-human-knows-persona \
+  --simulated-conversation-n-messages $n_msgs \
+  --permute-options \
+  --permute-options-seed "$permute_options_seed" \
+  --format chat \
+  --save_dir $SAVE_DIR \
+  --engine "$engine" \
+  --data_dir data/$data_dir \
+  --experiment_name $experiment_name \
+  --pvq-version "pvq_auto" \
+  --azure-openai \
+  --assert-params \
+  --verbose  2>&1 | tee -a $LOG_DIR/log_$permute_options_seed.txt
\ No newline at end of file
diff --git a/run_local.sh b/run_local.sh
index 53b0b18..aa532b3 100644
--- a/run_local.sh
+++ b/run_local.sh
@@ -1,18 +1,35 @@
 #!/bin/bash
 
-# Total tasks and tasks per run
-total_tasks=25
-experiment_setting="don"
-
-# Loop over total tasks in steps of tasks_per_run
-for ((i=total_tasks-1; i>=0; i-=1)); do
-  if [ $((i % 2)) -eq 0 ]; then
-    SLURM_ARRAY_TASK_ID=$i bash run_campaign_sim_conv_pvq_seeds.sh 23 $experiment_setting --verbose
-  else
-    SLURM_ARRAY_TASK_ID=$i bash run_campaign_sim_conv_pvq_seeds.sh 23 $experiment_setting
-  fi
+for model in "gpt-3.5-turbo-0125" "gpt-3.5-turbo-1106"
+do
+  for i in {0..24}
+  do
+#    SLURM_ARRAY_TASK_ID=$i bash run_campaign_gs.sh $model # GS
+    SLURM_ARRAY_TASK_ID=$i bash run_campaign_seeds.sh $model don # eval
+  done
 done
 
+#SLURM_ARRAY_TASK_ID=0 bash run_campaign_seeds.sh "Mixtral-8x7B-v0.1-4b" $model religion # eval
+
+exit
+
+
+
+#
+## Total tasks and tasks per run
+#total_tasks=25
+#experiment_setting="bag"
+#
+## Loop over total tasks in steps of tasks_per_run
+#for ((i=0; i<total_tasks; i+=1)); do
+##  if [ $((i % 2)) -eq 0 ]; then
+#  if [ $i -eq 0 ]; then
+#    SLURM_ARRAY_TASK_ID=$i bash run_campaign_sim_conv_pvq_seeds.sh 23 $experiment_setting --verbose
+#  else
+#    SLURM_ARRAY_TASK_ID=$i bash run_campaign_sim_conv_pvq_seeds.sh 23 $experiment_setting
+#  fi
+#done
+
 ## Total tasks and tasks per run
 #total_tasks=25
 #tasks_per_run=1
@@ -43,21 +60,16 @@ done
 # NO pop - it goes backward to get shorter msgs results quicker
 #############
 
-## Loop over total tasks in steps of tasks_per_run
-##for ((i=0; i<total_tasks; i+=tasks_per_run)); do
+# Loop over total tasks in steps of tasks_per_run
+#for ((i=0; i<total_tasks; i+=tasks_per_run)); do
+
+# backwards
 #for ((i=total_tasks-1; i>=0; i-=tasks_per_run)); do
-#  # Launch parallel tasks for the current block
-#  for ((j=i-tasks_per_run+1; j<=i && j<total_tasks; j++)); do
-#    if [ "$j" -eq "$i" ]; then
-#      SLURM_ARRAY_TASK_ID=$j bash run_campaign_sim_conv_no_pop.sh 22 --verbose &
-#    else
-#      SLURM_ARRAY_TASK_ID=$j bash run_campaign_sim_conv_no_pop.sh 22 &
-#    fi
-#  done
+# 15 - 20 : just msgs =3
+
+#for ((i=3; i<25; i+=5)); do # Launch parallel tasks for the current block
+#  SLURM_ARRAY_TASK_ID=$i bash run_campaign_sim_conv_no_pop.sh 23
 #
-#  # Wait for the current block of tasks to finish
-#  wait
-#  echo "Block $((($tasks_per_run - $i  / $tasks_per_run + 1) - 1)) of $(( (total_tasks + tasks_per_run - 1) / tasks_per_run )) completed."
 #done
 #
 #echo "All tasks have been completed."
diff --git a/run_single.sh b/run_single.sh
index 17f515a..594bedb 100644
--- a/run_single.sh
+++ b/run_single.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
 
+
 # 6 themes
 themes=(
   "grammar"
@@ -50,7 +51,6 @@ permute_options_seed="$seed"_"$theme_i"
 #engine="Mixtral-8x7B-Instruct-v0.1"
 #engine="phi-2"
 #engine="phi-1"
-#engine="phi-1.5"
 #engine="Qwen-72B"
 #engine="Qwen-14B"
 #engine="Qwen-7B"
@@ -108,161 +108,26 @@ mkdir -p $LOG_DIR
 
 source $HOME/.bashrc
 
-conda activate llm_persp
-
-
-if [[ $engine == *"Mistral"* ]] || [[ $engine == *"Mixtral"* ]]; then
-
-  echo "Mistral or Mixtral: $engine"
-
-  if [[ $engine == *"Instruct"* ]] ; then
-    # INSTRUCT MODELS
-
-    # mistral, mixtral -> no sys; query
-    python -u evaluate.py \
-      --simulated-population-type $population_type \
-      --simulate-conversation-theme $theme \
-      --simulated-human-knows-persona \
-      --simulated-conversation-n-messages $n_msgs \
-      --permute-options \
-      --permute-options-seed "$permute_options_seed" \
-      --format chat \
-      --save_dir $SAVE_DIR \
-      --engine "$engine" \
-      --query-in-reply \
-      --data_dir data/$data_dir \
-      --experiment_name $experiment_name \
-      --pvq-version "pvq_auto" \
-      --no-profile \
-      --assert-params \
-      --verbose  2>&1 | tee -a $LOG_DIR/log_$permute_options_seed.txt
-
-  else
-    # BASE MODELS
-
-    # mistral, mixtral -> no sys; query
-    python -u evaluate.py \
-      --simulated-population-type $population_type \
-      --simulate-conversation-theme $theme \
-      --simulated-human-knows-persona \
-      --simulated-conversation-n-messages $n_msgs \
-      --permute-options \
-      --permute-options-seed "$permute_options_seed" \
-      --format chat \
-      --save_dir $SAVE_DIR \
-      --engine "$engine" \
-      --query-in-reply \
-      --system-message \
-      --base-model-template \
-      --data_dir data/$data_dir \
-      --experiment_name $experiment_name \
-      --pvq-version "pvq_auto" \
-      --no-profile \
-      --assert-params \
-      --verbose  2>&1 | tee -a $LOG_DIR/log_$permute_options_seed.txt
-  fi
-
-elif [[ $engine == *"phi"* ]] || [[ $engine == "Qwen-"*"B" ]]; then
-
-    # all phi models are BASE and qwen base
-    python -u evaluate.py \
-      --simulated-population-type $population_type \
-      --simulate-conversation-theme $theme \
-      --simulated-human-knows-persona \
-      --simulated-conversation-n-messages $n_msgs \
-      --permute-options \
-      --permute-options-seed "$permute_options_seed" \
-      --format chat \
-      --save_dir $SAVE_DIR \
-      --engine "$engine" \
-      --query-in-reply \
-      --system-message \
-      --base-model-template \
-      --data_dir data/$data_dir \
-      --experiment_name $experiment_name \
-      --pvq-version "pvq_auto" \
-      --no-profile \
-      --assert-params \
-      --verbose  2>&1 | tee -a $LOG_DIR/log_$permute_options_seed.txt
-
-elif [[ $engine == *"zephyr"* ]] || [[ $engine == *"llama_2"* ]] || [[ $engine == "dummy" ]]; then
-
-  echo "Zephyr or LLaMa: $engine"
-
-  if [[ $engine == *"llama_2"* ]] && [[ $engine != *"chat"* ]]; then
-    # BASE MODELS
-
-    # llama_base_model
-    python -u evaluate.py \
-      --simulated-population-type $population_type \
-      --simulate-conversation-theme $theme \
-      --simulated-human-knows-persona \
-      --simulated-conversation-n-messages $n_msgs \
-      --permute-options \
-      --permute-options-seed "$permute_options_seed" \
-      --format chat \
-      --save_dir $SAVE_DIR \
-      --engine "$engine" \
-      --query-in-reply \
-      --system-message \
-      --base-model-template \
-      --data_dir data/$data_dir \
-      --experiment_name $experiment_name \
-      --pvq-version "pvq_auto" \
-      --no-profile \
-      --assert-params \
-      --verbose  2>&1 | tee -a $LOG_DIR/log_$permute_options_seed.txt
-
-  else
-
-    # INSTUCT, DPO models
-    # zephyr, llama -> sys ; query
-    python -u evaluate.py \
-      --simulated-population-type $population_type \
-      --simulate-conversation-theme $theme \
-      --simulated-human-knows-persona \
-      --simulated-conversation-n-messages $n_msgs \
-      --permute-options \
-      --permute-options-seed "$permute_options_seed" \
-      --format chat \
-      --save_dir $SAVE_DIR \
-      --engine "$engine" \
-      --query-in-reply \
-      --system-message \
-      --data_dir data/$data_dir \
-      --experiment_name $experiment_name \
-      --pvq-version "pvq_auto" \
-      --no-profile \
-      --assert-params \
-      --verbose  2>&1 | tee -a $LOG_DIR/log_$permute_options_seed.txt
-
-  fi
-
-
-elif [[ $engine == *"gpt"* ]] ; then
-
-  echo "GPTs: $engine"
-
-  # gpts -> sys ; no query
-  python -u evaluate.py \
-    --simulated-population-type $population_type \
-    --simulate-conversation-theme $theme \
-    --simulated-human-knows-persona \
-    --simulated-conversation-n-messages $n_msgs \
-    --permute-options-seed "$permute_options_seed" \
-    --permute-options \
-    --format chat \
-    --save_dir $SAVE_DIR \
-    --engine "$engine" \
-    --system-message \
-    --data_dir data/$data_dir \
-    --experiment_name $experiment_name \
-    --pvq-version "pvq_auto" \
-    --no-profile \
-    --assert-params \
-    --verbose  2>&1 | tee -a $LOG_DIR/log_$permute_options_seed.txt
-
 
+if [[ "$engine" == "phi-1" || "$engine" == "phi-2" ]]; then
+    conda activate llm_stability_phi
 else
-  echo "Undefined engine: $engine"
-fi
\ No newline at end of file
+    conda activate llm_stability
+fi
+
+
+python -u evaluate_v3.py \
+  --engine "$engine" \
+  --experiment_name $experiment_name \
+  --data_dir data/$data_dir \
+  --simulated-population-type $population_type \
+  --simulated-conversation-theme $theme \
+  --simulated-conversation-n-messages $n_msgs \
+  --permute-options-seed "$permute_options_seed" \
+  --simulated-human-knows-persona \
+  --save_dir $SAVE_DIR \
+  --permute-options \
+  --pvq-version "pvq_auto" \
+  --azure-openai \
+  --assert-params \
+  --verbose  2>&1 | tee -a $LOG_DIR/log_$permute_options_seed.txt
diff --git a/tokens_estimate.py b/tokens_estimate.py
deleted file mode 100644
index 5b796f9..0000000
--- a/tokens_estimate.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import os
-import pandas as pd
-import tiktoken
-
-encoder = tiktoken.encoding_for_model('gpt-4')
-
-def count_tokens(file_path):
-
-    # Read CSV file
-    df = pd.read_csv(file_path)
-
-    # Join all text in the dataframe into a single string
-    text = ' '.join(df.values.flatten().astype(str))
-
-    # Tokenize and return the number of tokens
-    tokens = list(encoder.encode(text))
-    return len(tokens)
-
-
-def count_tokens_in_dir(dir_path):
-    # Initialize total token count
-    total_tokens = 0
-    # Iterate over all CSV files in the directory
-    for file_name in os.listdir(dir_path):
-        if file_name.endswith('.csv'):
-            file_path = os.path.join(dir_path, file_name)
-            tokens = count_tokens(file_path)
-            print(f"Filename {file_name} - tokens: {tokens}")
-            total_tokens += tokens
-
-    return total_tokens
-
-
-# Usage
-dir_path = './data_mmlu/test'
-n_tokens = count_tokens_in_dir(dir_path)
-print(f"Total GPT-4 tokens in the CSV files: {n_tokens}")
-print(f"Total price for GPT-4: {n_tokens/1000 * 0.03}")
-print(f"Total price for GPT-35: {n_tokens/1000 * 0.002}")
diff --git a/utils.py b/utils.py
index 5539296..54229fa 100644
--- a/utils.py
+++ b/utils.py
@@ -1,4 +1,20 @@
 import os
+import time
+import numpy as np
+
+def secs_2_hms(s):
+    minutes, seconds = divmod(s, 60)
+    hours, minutes = divmod(minutes, 60)
+    return hours, minutes, seconds
+
+
+def estimate_eta(start_time, progress):
+    # estimate ETA
+    curr_time = time.time()
+    elapsed_time = curr_time - start_time
+    eta = (elapsed_time / progress) * (1 - progress)
+    return eta
+
 
 def print_chat_messages(messages):
     print("*********************")
@@ -13,14 +29,14 @@ def remove_prefix(s, pref):
         return s[len(pref):]
     return s
 
-def extract_answer_tokens(answers, tokenizer):
-    answer_tokens = {a: [] for a in answers}
-    for tok_ind in range(len(tokenizer)):
-        tok = tokenizer.decode([tok_ind])
-        if tok in answers:
-            answer_tokens[tok].append(tok_ind)
 
-    return answer_tokens
+def softmax(x):
+    z = x - max(x)
+    numerator = np.exp(z)
+    denominator = np.sum(numerator)
+    softmax = numerator/denominator
+    return softmax
+
 
 def get_hf_cache_dir():
     hostname = os.uname()[1]
@@ -31,6 +47,8 @@ def get_hf_cache_dir():
     else:
         hf_cache_dir = "/gpfsscratch/rech/imi/utu57ed/.cache/huggingface"
     return hf_cache_dir
+
+
 def estimate_and_print_gpt_prices(gpt_tokens_count, engine):
     assert gpt_tokens_count.keys() == {"input", "output"}
 
@@ -73,6 +91,7 @@ def estimate_and_print_gpt_prices(gpt_tokens_count, engine):
         tot_price = price_input + price_ouput
         print(f"\t{gpt_eng} ~ {tot_price:.2f}$ (in: {price_input:.2f}$ out: {price_ouput:.2f}$)")
 
+
 openai_2_azure_tag = {
     "gpt-3.5-turbo-0125": "gpt-35-turbo-0125",
     "gpt-3.5-turbo-1106": "gpt-35-turbo-1106"
diff --git a/visualization_scripts/data_analysis.py b/visualization_scripts/data_analysis.py
index 25f9682..b912ffa 100644
--- a/visualization_scripts/data_analysis.py
+++ b/visualization_scripts/data_analysis.py
@@ -150,7 +150,7 @@ def compute_ipsative_stability(dir_2_data, keys, default_profile=None):
 
     return mean_ipsative_stability, part_scores, ips_part_stabilities, ips_part_dir_stabilities, all_corrs
 
-def compute_paired_rank_order_stability(dir_2_data_1, dir_2_data_2, key_1, key_2, test_set_name_1, test_set_name_2, verbose=False, directories_1=None, directories_2=None):
+def compute_paired_rank_order_stability(dir_2_data_1, dir_2_data_2, key_1, key_2, test_set_name_1, test_set_name_2, verbose=False):
 
     if verbose:
         print(colored("\n\n--------------------------------------------------", "green"))
@@ -161,9 +161,14 @@ def compute_paired_rank_order_stability(dir_2_data_1, dir_2_data_2, key_1, key_2
 
     corrs = []
 
-    if directories_1 is None or directories_2 is None:
-        directories_1, directories_2 = list(dir_2_data_1.keys()), list(dir_2_data_2.keys())
+    # order paired_dirs so they match directories_1 by conversation theme
+    directories_1, dirs_1_themes = zip(
+        *[(dir_1, data['args']['simulate_conversation_theme']) for dir_1, data in dir_2_data_1.items()]
+    )
+    theme_to_dir_2 = {data['args']['simulate_conversation_theme']: dir_2 for dir_2, data in dir_2_data_2.items()}
+    directories_2 = [theme_to_dir_2[theme] for theme in dirs_1_themes]
 
+    # pair directories
     dir_pairs = zip(directories_1, directories_2)
     dir_2_data = {**dir_2_data_1, **dir_2_data_2}
 
@@ -666,23 +671,23 @@ if __name__ == '__main__':
         print(f"IPsative stability is not computed because there only one metric {keys}.")
 
     if args.default_profile:
-        dir_2_data_neut_prof = load_data([args.default_profile])
-        assert len(dir_2_data_neut_prof.keys()) == 1
-        dir_neut_prof = list(dir_2_data_neut_prof.keys())[0]
+        dir_2_data_paired = load_data([args.default_profile])
+        assert len(dir_2_data_paired.keys()) == 1
+        dir_neut_prof = list(dir_2_data_paired.keys())[0]
 
         # per participant normalize the neutral profile
         normalized_scores = []
         for key in keys:
 
-            scores_neut_prof = np.array([d[test_set_name][key] for d in dir_2_data_neut_prof[dir_neut_prof]["per_simulated_participant_metrics"]])
+            scores_neut_prof = np.array([d[test_set_name][key] for d in dir_2_data_paired[dir_neut_prof]["per_simulated_participant_metrics"]])
 
             if "pvq" in test_set_name:
                 # extract per participant average answer
                 average_part_answer_neut_prof = np.array([
-                    np.array(d[test_set_name])[:, 1].astype(float).mean() for d in dir_2_data_neut_prof[dir_neut_prof]["answers"]
+                    np.array(d[test_set_name])[:, 1].astype(float).mean() for d in dir_2_data_paired[dir_neut_prof]["answers"]
                 ])
                 scores_neut_prof_ = scores_neut_prof - average_part_answer_neut_prof
-                scores_neut_prof = per_part_normalized_scores(dir_2_data_neut_prof, dir_neut_prof, test_set_name, key)
+                scores_neut_prof = per_part_normalized_scores(dir_2_data_paired, dir_neut_prof, test_set_name, key)
                 assert all(scores_neut_prof_ == scores_neut_prof)
 
                 normalized_scores.append(scores_neut_prof.copy())
@@ -762,14 +767,8 @@ if __name__ == '__main__':
         print("\n\nPaired Rank-Order stability\n")
         args.paired_dirs = [d for d in args.paired_dirs if "results.json" in os.listdir(d)]
 
-        for dir_, ben_ in zip(args.directories, args.paired_dirs):
-            # model and seed should be analogous
-            assert dir_.split("/")[2:4] == ben_.split("/")[2:4]
+        dir_2_data_paired = load_data(args.paired_dirs)
 
-            # same conversation theme
-            assert dir_.split("chat__")[1].split("202")[0] == ben_.split("chat__")[1].split("202")[0]
-
-        dir_2_data_neut_prof = load_data(args.paired_dirs)
 
         values_names = ["Benevolence", "Universalism", "Power", "Achievement", "Tradition", "Conformity", "Security",
                         "Self-Direction", "Stimulation", "Hedonism"]
@@ -781,7 +780,7 @@ if __name__ == '__main__':
             all_proxy_stabs[value] = {}
             for race in ["elves", 'dwarves', 'orcs', 'humans', 'hobbits']:
                 mean_paired_rank_order_stability, _, _, all_proxy_stabs_ = compute_paired_rank_order_stability(
-                    dir_2_data, dir_2_data_neut_prof,
+                    dir_2_data, dir_2_data_paired,
                     value, f"Donation {race}",
                     "pvq_auto", "tolkien_donation"
                 )
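
A small illustrative sketch of the theme-based pairing introduced in compute_paired_rank_order_stability above; the directory names and the exact dictionary shapes are assumptions based only on the keys visible in this hunk.

    # assumed shape: {directory: {"args": {"simulate_conversation_theme": ...}, ...}}
    dir_2_data_1 = {
        "runs/grammar_a": {"args": {"simulate_conversation_theme": "grammar"}},
        "runs/chess_a":   {"args": {"simulate_conversation_theme": "chess"}},
    }
    dir_2_data_2 = {
        "runs/chess_b":   {"args": {"simulate_conversation_theme": "chess"}},
        "runs/grammar_b": {"args": {"simulate_conversation_theme": "grammar"}},
    }

    # same pairing logic as the patched function: align the second set of
    # directories to the first by conversation theme rather than dict order
    directories_1, dirs_1_themes = zip(
        *[(d, data["args"]["simulate_conversation_theme"]) for d, data in dir_2_data_1.items()]
    )
    theme_to_dir_2 = {data["args"]["simulate_conversation_theme"]: d for d, data in dir_2_data_2.items()}
    directories_2 = [theme_to_dir_2[t] for t in dirs_1_themes]

    print(list(zip(directories_1, directories_2)))
    # [('runs/grammar_a', 'runs/grammar_b'), ('runs/chess_a', 'runs/chess_b')]
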
-- 
GitLab