diff --git a/README.md b/README.md
index 6bae99cfa650f9a461dbbce3bd7a99debbe43121..57c8cf65b4257006959da60bdbe5f3df3c61d151 100644
--- a/README.md
+++ b/README.md
@@ -49,7 +49,7 @@ You can run one evaluation with the following command
 
 ```
 theme="joke"
-python -u evaluate_v3.py \
+python -u evaluate.py \
 --engine "dummy" \
 --experiment_name pvq_test \
 --data_dir data/data_pvq \
@@ -71,7 +71,7 @@ Now lets run the same command for "joke":
 
 ```
 theme="joke"
-python -u evaluate_v3.py \
+python -u evaluate.py \
 --engine "dummy" \
 --experiment_name pvq_test \
 --data_dir data/data_pvq \
@@ -89,7 +89,7 @@ python -u evaluate_v3.py \
-Now lets run the same command for "grammar":
+Now let's run the same command for "grammar":
 ```
 theme="grammar"
-python -u evaluate_v3.py \
+python -u evaluate.py \
 --engine "dummy" \
 --experiment_name pvq_test \
 --data_dir data/data_pvq \
diff --git a/campaign_data_analysis.py b/campaign_data_analysis.py
index 61069c7eaba21e898fb8b4547b5129859e094941..2d16bf649aa27acf28d64e9e892cb9355df4dfdf 100644
--- a/campaign_data_analysis.py
+++ b/campaign_data_analysis.py
@@ -248,17 +248,18 @@ y_label = None
 FDR_test = True
 fam_min_y, fam_max_y = -0.1, 0.8
 
+seed_strings = [f"seed_{i}" for i in range(0, 9, 2)]
+
 if figure_name == "tolk_ro_t":
 
     experiment_dirs = ["stability_default_params_pvq_tolkien_characters"]
-    seed_strings = [f"seed_{i}" for i in range(0, 9, 2)]
 
     add_legend = True
     legend_loc = (0.001, 0.99)
     metric = "Rank-Order"
     human_change_xloc = 6.8
     show_human_change = True
-    legend_fontsize = 22
+    legend_fontsize = 18
     rotatation_x_labels = 90
 
     xticks_fontsize = 15
@@ -269,9 +270,8 @@ if figure_name == "tolk_ro_t":
 elif figure_name == "fam_ro_t":
 
     experiment_dirs = ["stability_default_params_pvq_famous_people"]
-    seed_strings = [f"seed_{i}" for i in range(0, 9, 2)]
 
-    add_legend = True
+    add_legend = False
     metric = "Rank-Order"
     human_change_xloc = 6.8
 
@@ -281,14 +281,13 @@ elif figure_name == "fam_ro_t":
     xticks_fontsize = 15
     yticks_fontsize = 18
 
-    min_y, max_y = -0.1, 1.0  # RO
+    min_y, max_y = -0.1, 0.8  # RO
 
 elif figure_name == "religion_t":
 
     rotatation_x_labels = 90
 
     experiment_dirs = ["stability_default_params_religion_famous_people"]
-    seed_strings = [f"seed_{i}" for i in range(0, 9, 2)]
 
     add_legend = False
 
@@ -304,7 +303,6 @@ elif figure_name == "religion_t":
 elif figure_name == "don_t":
 
     experiment_dirs = ["stability_default_params_tolkien_donation_tolkien_characters"]
-    seed_strings = [f"seed_{i}" for i in range(0, 9, 2)]
 
     metric = "Rank-Order"
     human_change_xloc = 6.8
@@ -318,7 +316,6 @@ elif figure_name == "don_t":
 elif figure_name.startswith("bag_t"):
 
     experiment_dirs = ["stability_default_params_tolkien_bag_tolkien_characters"]
-    seed_strings = [f"{i}_seed" for i in range(0, 9, 2)]
 
     add_legend = True
     metric = "Rank-Order"
@@ -339,7 +336,6 @@ elif figure_name == "no_pop_ips":
     show_human_change = True
     human_change_xloc = -1.0
 
-
     human_data_fontsize = 8
     xticks_fontsize = 17
     yticks_fontsize = 20
@@ -378,7 +374,6 @@ elif figure_name.startswith("paired_tolk_ro"):
 
     experiment_dirs = ["stability_default_params_pvq_tolkien_characters"]
     paired_dir = "stability_default_params_tolkien_donation_tolkien_characters"
-    seed_strings = [f"seed_{i}" for i in range(0, 9, 2)]
 
     if value_to_pair == "Universalism":
         add_legend = True
diff --git a/evaluate_v3.py b/evaluate.py
similarity index 91%
rename from evaluate_v3.py
rename to evaluate.py
index d75e610d58ce27827d5ebaca657ae229e89a94e1..c52e2647b95c62f351cb470adfeef6fb8aebf181 100644
--- a/evaluate_v3.py
+++ b/evaluate.py
@@ -34,7 +34,7 @@ os.environ['HF_HOME'] = hf_cache_dir
 opening_questions_for_themes = {
     "poem": "Hello, let's write a poem together. You start by the first verse I'll add the second one, and so on.",
     "joke": "Tell me a joke.",
-    "history": "What is the significance of the battle of Hastings. Answer in two sentences.",  # slight collapse
+    "history": "What is the significance of the battle of Hastings. Answer in two sentences.",
     "chess": "1. e4",
     "grammar": "Can you check this sentence for grammar? \n Whilst Jane was waiting to meet hers friend their nose started bleeding.",
 }
@@ -116,9 +116,7 @@ class StoppingCriteriaSub(StoppingCriteria):
         return any([stop in generation for stop in self.stops])
 
 
-def simulate_conversation(args, engine, sim_engine, model_set_persona_string=None, llm_generator=None, simulated_participant=None):
-
-    opening_question = opening_questions_for_themes[args.simulated_conversation_theme]
+def simulate_conversation(args, opening_question, model_set_persona_string=None, llm_generator=None, simulated_participant=None):
 
     conversation = [opening_question]
 
@@ -158,7 +156,6 @@ def simulate_conversation(args, engine, sim_engine, model_set_persona_string=Non
                     "content": model_set_persona_string
                 }] + simulated_conv_messages
 
-            engine_ = engine
             assistant_label = labels_dict["persona"]["assistant_label"]
             user_label = labels_dict["persona"]["user_label"]
             system_label = labels_dict["persona"]["system_label"]
@@ -167,8 +164,6 @@ def simulate_conversation(args, engine, sim_engine, model_set_persona_string=Non
             # gpt as human
             assert simulated_conv_messages[0]['role'] == "assistant"
 
-            # user doesn't know the chatbots persona -> change this?
-            # if args.base_model_template:
             if llm_generator.base_model_template:
                 if args.simulated_human_knows_persona:
                     sys_msg = f"The following is a conversation between a human and a chatbot. The chatbot is pretending to be {simulated_participant_name}. The human's every reply must be in one sentence only."
@@ -189,7 +184,6 @@ def simulate_conversation(args, engine, sim_engine, model_set_persona_string=Non
             user_label = labels_dict["human"]["user_label"]
             system_label = labels_dict["human"]["system_label"]
 
-        # if not args.base_model_template:
         if not llm_generator.base_model_template:
             simulated_conv_messages = fix_alternating_msg_order(simulated_conv_messages)
 
@@ -204,45 +198,6 @@ def simulate_conversation(args, engine, sim_engine, model_set_persona_string=Non
         if args.verbose:
             print_chat_messages(simulated_conv_messages)
 
-
-        # llm_generator_type = type(llm_generator)
-        # if llm_generator_type == HuggingFaceModel:
-        #     response = llm_generator.generate(
-        #         messages=simulated_conv_messages,
-        #         generation_kwargs=dict(
-        #             max_new_tokens=args.simulated_conversation_msg_max_tokens,
-        #             do_sample=True,
-        #             top_p=args.simulated_conversation_top_p,
-        #             temperature=args.simulated_conversation_temp,
-        #             # top_k=50,
-        #             # repetition_penalty=1.2,  # logit / (T * penalty*bool(token present) )
-        #             num_beams=1,
-        #         ),
-        #         assistant_label=assistant_label,
-        #         user_label=user_label,
-        #         system_label=system_label,
-        #         stop_words_up=stop_words_up
-        #     )
-        #
-        # elif llm_generator_type == OpenAIModel:
-        #     response = llm_generator.generate(
-        #         messages=simulated_conv_messages,
-        #         generation_kwargs=dict(
-        #             max_tokens=args.simulated_conversation_msg_max_tokens,
-        #             top_p=args.simulated_conversation_top_p,
-        #             temperature=args.simulated_conversation_temp,
-        #             # not the same as hf repetition_penalty
-        #             # presence_penalty=0.2,  # logit - penalty*bool(token present)
-        #             n=1,
-        #         )
-        #     )
-        # elif llm_generator_type in [InteractiveModel, DummyModel]:
-        #     response = llm_generator.generate()
-        #
-        # else:
-        #     raise NotImplementedError(f"Simulated conversations not implemented for {engine_}")
-
-        # if args.base_model_template:
         if llm_generator.base_model_template:
             response_up = response.upper()
             stop_word_ind = np.min([response_up.index(sw) if sw in response_up else np.inf for sw in stop_words_up])
@@ -387,7 +342,7 @@ def hash_chat_conv(msgs_conv):
     return hex_dig
 
 
-def eval(args, engine, test_df, participant_perm_dicts, llm_generator=None, simulated_participant=None):
+def eval(args, engine, test_df, participant_perm_dicts, llm_generator=None, simulated_participant=None, simulated_conversation_theme=None):
     cors = []
     all_probs = []
     all_lprobs = []
@@ -435,7 +390,7 @@ def eval(args, engine, test_df, participant_perm_dicts, llm_generator=None, simu
         else:
             gpt_tokenizer = None
 
-        if args.simulated_conversation_theme:
+        if simulated_conversation_theme:
 
             set_persona_str = prompt["set_persona_str"]
             if messages_conv is None:
@@ -444,8 +399,7 @@ def eval(args, engine, test_df, participant_perm_dicts, llm_generator=None, simu
 
                 messages_conv, messages_conv_hash = simulate_conversation(
                     args=args,
-                    engine=engine,
-                    sim_engine=engine,
+                    opening_question=opening_questions_for_themes[simulated_conversation_theme],
                     model_set_persona_string=set_persona_str,
                     simulated_participant=simulated_participant,
                     llm_generator=llm_generator,
@@ -473,7 +427,7 @@ def eval(args, engine, test_df, participant_perm_dicts, llm_generator=None, simu
             messages = construct_messages(
                 prompt=prompt,
                 system_message=True,
-                messages_conv=messages_conv if args.simulated_conversation_theme else None,
+                messages_conv=messages_conv if simulated_conversation_theme else None,
             )
             n_input_tokens = sum([len(gpt_tokenizer.encode(msg['content'])) for msg in messages])
 
@@ -483,7 +437,7 @@ def eval(args, engine, test_df, participant_perm_dicts, llm_generator=None, simu
         messages = construct_messages(
             prompt=prompt,
             system_message=llm_generator.system_message,
-            messages_conv=messages_conv if args.simulated_conversation_theme else None,
+            messages_conv=messages_conv if simulated_conversation_theme else None,
         )
 
         if args.verbose:
@@ -687,6 +641,7 @@ def main(args):
                 participant_perm_dicts=participant_perm_dicts,
                 llm_generator=llm_generator,
                 simulated_participant=simulated_participant,
+                simulated_conversation_theme=args.simulated_conversation_theme,
             )
             all_cors.append(cors)
             gpt_tokens_total['input'] += gpt_tokens['input']
@@ -826,8 +781,8 @@ if __name__ == "__main__":
     parser.add_argument("--permute-options", "-po", action="store_true")
     parser.add_argument("--azure-openai", action="store_true")
     parser.add_argument("--simulated-human-knows-persona", action="store_true")
-    parser.add_argument("--simulated-population-type", "-pop", type=str, default="tolkien_characters", choices=["permutations", "tolkien_characters", "famous_people", "llm_personas", "user_personas", "anes"])
-    parser.add_argument("--permutations", "-p", type=int, default=1)  # permutations as a population type
+    parser.add_argument("--simulated-population-type", "-pop", type=str, default="tolkien_characters", choices=["permutations", "tolkien_characters", "famous_people"])
+    parser.add_argument("--permutations", "-p", type=int, default=50)
     parser.add_argument("--permute-options-seed", type=str)
     parser.add_argument("--overwrite", action="store_true")
     args = parser.parse_args()
@@ -854,9 +809,6 @@ if __name__ == "__main__":
     if args.permute_options and args.permute_options_seed is None:
         raise ValueError("Permute options string should be defined for stability")
 
-    if ("gpt-3.5" in args.engine and args.permutations > 50) or ("gpt-4" in args.engine and args.permutations > 5):
-        raise ValueError(f"Are you sure you want to use {args.permutations} with {args.engine}??")
-
     start_time = time.time()
     main(args)
     end_time = time.time()
diff --git a/run_campaign_msgs.sh b/run_campaign_msgs.sh
new file mode 100644
index 0000000000000000000000000000000000000000..3e7c2e90915aa93832a97aa2dcd53feb80e10b97
--- /dev/null
+++ b/run_campaign_msgs.sh
@@ -0,0 +1,96 @@
+#!/bin/bash
+#SBATCH -A imi@a100
+#SBATCH -C a100
+#SBATCH --time=06:30:00
+#SBATCH --gres=gpu:2
+#SBATCH --array=0-24 # themes x n_msgs -> 5x5 (the "None" no-context default is excluded)
+#SBATCH -o slurm_logs/sb_log_%A_%a.out
+#SBATCH -e slurm_logs/sb_log_%A_%a.err
+##SBATCH --qos=qos_gpu-dev
+
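+# Usage: sbatch run_campaign_msgs.sh <engine>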
+##########################################################
+# Set the questionnaire and population (hardcoded for this campaign)
+##########################################################
+
+test_tag="pvq"
+experiment_name="pvq_test"
+data_dir="data_pvq"
+population_type="permutations"
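+# "permutations" uses answer-option permutations as the simulated population (no personas); see --simulated-population-type in evaluate.py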
+
+# Print the selected configuration
+echo "test_tag=$test_tag"
+echo "experiment_name=$experiment_name"
+echo "data_dir=$data_dir"
+echo "population_type=$population_type"
+
+
+# Extract parameters: theme and n_msgs
+##########################################################
+themes=("grammar" "joke" "poem" "history" "chess" "None") # "None" is unused with --array=0-24
+
+n_msgs_list=(9 7 5 3 1) # 5 values
+n_msgs_len=${#n_msgs_list[@]}
+
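+# Map SLURM_ARRAY_TASK_ID onto a (theme, n_msgs) pair: task_id = theme_i * n_msgs_len + msgs_i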
+theme_i=$(( SLURM_ARRAY_TASK_ID / $n_msgs_len ))
+msgs_i=$(( SLURM_ARRAY_TASK_ID % $n_msgs_len ))
+
+theme="${themes[$theme_i]}"
+n_msgs="${n_msgs_list[$msgs_i]}"
+
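+# Tie the option-permutation seed to the theme index so each theme reuses a fixed permutation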
+permute_options_seed=$theme_i
+
+# Other params
+##########################################################
+engine="$1"
+
+echo "ID:"$SLURM_ARRAY_TASK_ID
+echo "Theme:"$theme
+echo "Seed:"$seed
+echo "Seed str:"$permute_options_seed
+echo "Evaluation:$engine:$theme:$permute_options_seed:$n_msgs:$test_tag:$population_type"
+
+# Setup the experiments directories
+##########################################################
+SUBDIR="stability_default_params_${test_tag}_${population_type}_msgs/${engine}/${n_msgs}_msgs/${seed}_seed/theme_${theme}"
+
+SAVE_DIR="results/"$SUBDIR
+LOG_DIR="logs/"$SUBDIR
+
+# Start the experiment
+##########################################################
+mkdir -p $LOG_DIR
+
+source $HOME/.bashrc
+
+## define the conda env to use
+case "$engine" in
+    phi-1|phi-2|Qwen1.5*|llama_3*|command_r_plus*|Mixtral-8x22B*)
+        conda activate llm_stability_phi
+        ;;
+    *)
+        conda activate llm_stability
+        ;;
+esac
+
+echo "SLURM_JOB_ID: "$SLURM_JOB_ID"_"$SLURM_ARRAY_TASK_ID | tee -a $LOG_DIR/log_$permute_options_seed.txt
+
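+# Run the evaluation; stdout and stderr are appended to the per-permutation-seed log via tee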
+python -u evaluate.py \
+  --simulated-population-type $population_type \
+  --simulated-conversation-theme $theme \
+  --simulated-conversation-n-messages $n_msgs \
+  --permute-options \
+  --permute-options-seed "$permute_options_seed" \
+  --save_dir $SAVE_DIR \
+  --engine "$engine" \
+  --data_dir data/$data_dir \
+  --experiment_name $experiment_name \
+  --pvq-version "pvq_auto" \
+  --azure-openai \
+  --assert-params \
+  --verbose 2>&1 | tee -a $LOG_DIR/log_$permute_options_seed.txt
diff --git a/run_campaign_seeds.sh b/run_campaign_seeds.sh
index ef891e03fded0db32e545d8658e715c622cb1e64..496bd481af000b751d955614439dbfff845d1544 100644
--- a/run_campaign_seeds.sh
+++ b/run_campaign_seeds.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 #SBATCH -A imi@a100
 #SBATCH -C a100
-#SBATCH --time=02:29:59
+#SBATCH --time=06:30:00
 #SBATCH --gres=gpu:2
 #SBATCH --array=0-24 # themes x n_seeds -> 6x5 (0-24 wo None, 0-29 for all)
 #SBATCH -o slurm_logs/sb_log_%A_%a.out
@@ -46,6 +46,12 @@ case "$experiment_setting" in
     data_dir="data_religion"
     population_type="famous_people"
     ;;
+  no_pop)
+    test_tag="pvq"
+    experiment_name="pvq_test"
+    data_dir="data_pvq"
+    population_type="permutations"
+    ;;
   *)
     echo "Invalid experiment_setting. Please use one of the following: pvq_tolk, pvq_fam, don, bag, religion."
     exit 1
@@ -78,7 +84,10 @@ permute_options_seed="$seed"_"$theme_i"
 # Other params
 ##########################################################
 engine="$1"
 n_msgs=3
+#n_msgs=9
+#n_msgs=19
+#n_msgs=29
 
 echo "ID:"$SLURM_ARRAY_TASK_ID
 echo "Theme:"$theme
@@ -88,7 +97,13 @@ echo "Evaluation:$engine:$theme:$permute_options_seed:$n_msgs:$test_tag:$population_type"
 
 # Setup the experiments directories
 ##########################################################
-SUBDIR="stability_default_params_${test_tag}_${population_type}/${engine}/seed_${seed}/theme_${theme}"
+#SUBDIR="stability_default_params_${test_tag}_${population_type}/${engine}/seed_${seed}/theme_${theme}"
+#SUBDIR="stability_default_params_${test_tag}_${population_type}_${n_msgs}_msgs/${engine}/seed_${seed}/theme_${theme}"
+
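+# n_msgs=3 keeps the original results directory name; longer contexts get an "_msgs" suffix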
+if [ $n_msgs -eq 3 ]; then
+  SUBDIR="stability_default_params_${test_tag}_${population_type}/${engine}/seed_${seed}/theme_${theme}"
+else
+  SUBDIR="stability_default_params_${test_tag}_${population_type}_${n_msgs}_msgs/${engine}/seed_${seed}/theme_${theme}"
+fi
 
 SAVE_DIR="results/"$SUBDIR
 LOG_DIR="logs/"$SUBDIR
@@ -111,7 +126,7 @@
 
 
 
-python -u evaluate_v3.py \
+python -u evaluate.py \
   --simulated-population-type $population_type \
   --simulated-conversation-theme $theme \
   --simulated-human-knows-persona \
diff --git a/run_single.sh b/run_single.sh
index 594bedb7bcfc4f357c85671c0c4b138c152e615e..caf71e9dffe53bcbe4f1268bc540171688305f2b 100644
--- a/run_single.sh
+++ b/run_single.sh
@@ -116,7 +116,7 @@ else
 fi
 
 
-python -u evaluate_v3.py \
+python -u evaluate.py \
   --engine "$engine" \
   --experiment_name $experiment_name \
   --data_dir data/$data_dir \