diff --git a/README.md b/README.md
index 6bae99cfa650f9a461dbbce3bd7a99debbe43121..57c8cf65b4257006959da60bdbe5f3df3c61d151 100644
--- a/README.md
+++ b/README.md
@@ -49,7 +49,7 @@ You can run one evaluation with the following command
 
 ```
 theme="joke"
-python -u evaluate_v3.py \
+python -u evaluate.py \
     --engine "dummy" \
     --experiment_name pvq_test \
     --data_dir data/data_pvq \
@@ -71,7 +71,7 @@ Now lets run the same command for "joke":
 
 ```
 theme="joke"
-python -u evaluate_v3.py \
+python -u evaluate.py \
     --engine "dummy" \
     --experiment_name pvq_test \
     --data_dir data/data_pvq \
@@ -89,7 +89,7 @@ python -u evaluate_v3.py \
 Now lets run the same command for "grammar":
 ```
 theme="grammar"
-python -u evaluate_v3.py \
+python -u evaluate.py \
     --engine "dummy" \
     --experiment_name pvq_test \
     --data_dir data/data_pvq \
diff --git a/campaign_data_analysis.py b/campaign_data_analysis.py
index 61069c7eaba21e898fb8b4547b5129859e094941..2d16bf649aa27acf28d64e9e892cb9355df4dfdf 100644
--- a/campaign_data_analysis.py
+++ b/campaign_data_analysis.py
@@ -248,17 +248,18 @@ y_label = None
 FDR_test = True
 
 fam_min_y, fam_max_y = -0.1, 0.8
+seed_strings = [f"seed_{i}" for i in range(0, 9, 2)]
+
 if figure_name == "tolk_ro_t":
     experiment_dirs = ["stability_default_params_pvq_tolkien_characters"]
-    seed_strings = [f"seed_{i}" for i in range(0, 9, 2)]
 
     add_legend = True
     legend_loc = (0.001, 0.99)
 
     metric = "Rank-Order"
     human_change_xloc = 6.8
     show_human_change = True
-    legend_fontsize = 22
+    legend_fontsize = 18
 
     rotatation_x_labels = 90
     xticks_fontsize = 15
@@ -269,9 +270,8 @@ if figure_name == "tolk_ro_t":
 
 elif figure_name == "fam_ro_t":
     experiment_dirs = ["stability_default_params_pvq_famous_people"]
-    seed_strings = [f"seed_{i}" for i in range(0, 9, 2)]
 
-    add_legend = True
+    add_legend = False
 
     metric = "Rank-Order"
     human_change_xloc = 6.8
@@ -281,14 +281,13 @@ elif figure_name == "fam_ro_t":
     xticks_fontsize = 15
     yticks_fontsize = 18
 
-    min_y, max_y = -0.1, 1.0 # RO
+    min_y, max_y = -0.1, 0.8 # RO
 
 elif figure_name == "religion_t":
     rotatation_x_labels = 90
 
     experiment_dirs = ["stability_default_params_religion_famous_people"]
-    seed_strings = [f"seed_{i}" for i in range(0, 9, 2)]
 
     add_legend = False
@@ -304,7 +303,6 @@ elif figure_name == "religion_t":
 
 elif figure_name == "don_t":
     experiment_dirs = ["stability_default_params_tolkien_donation_tolkien_characters"]
-    seed_strings = [f"seed_{i}" for i in range(0, 9, 2)]
 
     metric = "Rank-Order"
     human_change_xloc = 6.8
@@ -318,7 +316,6 @@ elif figure_name == "don_t":
 
 elif figure_name.startswith("bag_t"):
     experiment_dirs = ["stability_default_params_tolkien_bag_tolkien_characters"]
-    seed_strings = [f"{i}_seed" for i in range(0, 9, 2)]
 
     add_legend = True
     metric = "Rank-Order"
@@ -339,7 +336,6 @@ elif figure_name == "no_pop_ips":
 
     show_human_change = True
     human_change_xloc = -1.0
-    human_data_fontsize = 8
 
     xticks_fontsize = 17
     yticks_fontsize = 20
@@ -378,7 +374,6 @@ elif figure_name.startswith("paired_tolk_ro"):
 
     experiment_dirs = ["stability_default_params_pvq_tolkien_characters"]
     paired_dir = "stability_default_params_tolkien_donation_tolkien_characters"
-    seed_strings = [f"seed_{i}" for i in range(0, 9, 2)]
 
     if value_to_pair == "Universalism":
         add_legend = True
diff --git a/evaluate_v3.py b/evaluate.py
similarity index 91%
rename from evaluate_v3.py
rename to evaluate.py
index d75e610d58ce27827d5ebaca657ae229e89a94e1..c52e2647b95c62f351cb470adfeef6fb8aebf181 100644
--- a/evaluate_v3.py
+++ b/evaluate.py
@@ -34,7 +34,7 @@ os.environ['HF_HOME'] = hf_cache_dir
 opening_questions_for_themes = {
     "poem": "Hello, let's write a poem together. You start by the first verse I'll add the second one, and so on.",
     "joke": "Tell me a joke.",
-    "history": "What is the significance of the battle of Hastings. Answer in two sentences.", # slight collapse
+    "history": "What is the significance of the battle of Hastings. Answer in two sentences.",
     "chess": "1. e4",
     "grammar": "Can you check this sentence for grammar? \n Whilst Jane was waiting to meet hers friend their nose started bleeding.",
 }
@@ -116,9 +116,7 @@ class StoppingCriteriaSub(StoppingCriteria):
         return any([stop in generation for stop in self.stops])
 
 
-def simulate_conversation(args, engine, sim_engine, model_set_persona_string=None, llm_generator=None, simulated_participant=None):
-
-    opening_question = opening_questions_for_themes[args.simulated_conversation_theme]
+def simulate_conversation(args, opening_question, model_set_persona_string=None, llm_generator=None, simulated_participant=None):
 
     conversation = [opening_question]
 
@@ -158,7 +156,6 @@ def simulate_conversation(args, engine, sim_engine, model_set_persona_string=Non
             "content": model_set_persona_string
         }] + simulated_conv_messages
 
-        engine_ = engine
         assistant_label = labels_dict["persona"]["assistant_label"]
         user_label = labels_dict["persona"]["user_label"]
         system_label = labels_dict["persona"]["system_label"]
@@ -167,8 +164,6 @@ def simulate_conversation(args, engine, sim_engine, model_set_persona_string=Non
         # gpt as human
         assert simulated_conv_messages[0]['role'] == "assistant"
 
-        # user doesn't know the chatbots persona -> change this?
-        # if args.base_model_template:
         if llm_generator.base_model_template:
             if args.simulated_human_knows_persona:
                 sys_msg = f"The following is a conversation between a human and a chatbot. The chatbot is pretending to be {simulated_participant_name}. The human's every reply must be in one sentence only."
@@ -189,7 +184,6 @@ def simulate_conversation(args, engine, sim_engine, model_set_persona_string=Non
         user_label = labels_dict["human"]["user_label"]
         system_label = labels_dict["human"]["system_label"]
 
-        # if not args.base_model_template:
         if not llm_generator.base_model_template:
             simulated_conv_messages = fix_alternating_msg_order(simulated_conv_messages)
 
@@ -204,45 +198,6 @@ def simulate_conversation(args, engine, sim_engine, model_set_persona_string=Non
 
     if args.verbose:
         print_chat_messages(simulated_conv_messages)
-
-    # llm_generator_type = type(llm_generator)
-    # if llm_generator_type == HuggingFaceModel:
-    #     response = llm_generator.generate(
-    #         messages=simulated_conv_messages,
-    #         generation_kwargs=dict(
-    #             max_new_tokens=args.simulated_conversation_msg_max_tokens,
-    #             do_sample=True,
-    #             top_p=args.simulated_conversation_top_p,
-    #             temperature=args.simulated_conversation_temp,
-    #             # top_k=50,
-    #             # repetition_penalty=1.2, # logit / (T * penalty*bool(token present) )
-    #             num_beams=1,
-    #         ),
-    #         assistant_label=assistant_label,
-    #         user_label=user_label,
-    #         system_label=system_label,
-    #         stop_words_up=stop_words_up
-    #     )
-    #
-    # elif llm_generator_type == OpenAIModel:
-    #     response = llm_generator.generate(
-    #         messages=simulated_conv_messages,
-    #         generation_kwargs=dict(
-    #             max_tokens=args.simulated_conversation_msg_max_tokens,
-    #             top_p=args.simulated_conversation_top_p,
-    #             temperature=args.simulated_conversation_temp,
-    #             # not the same as hf repetition_penalty
-    #             # presence_penalty=0.2, # logit - penalty*bool(token present)
-    #             n=1,
-    #         )
-    #     )
-    # elif llm_generator_type in [InteractiveModel, DummyModel]:
-    #     response = llm_generator.generate()
-    #
-    # else:
-    #     raise NotImplementedError(f"Simulated conversations not implemented for {engine_}")
-
-    # if args.base_model_template:
     if llm_generator.base_model_template:
         response_up = response.upper()
         stop_word_ind = np.min([response_up.index(sw) if sw in response_up else np.inf for sw in stop_words_up])
@@ -387,7 +342,7 @@ def hash_chat_conv(msgs_conv):
     return hex_dig
 
 
-def eval(args, engine, test_df, participant_perm_dicts, llm_generator=None, simulated_participant=None):
+def eval(args, engine, test_df, participant_perm_dicts, llm_generator=None, simulated_participant=None, simulated_conversation_theme=None):
     cors = []
     all_probs = []
     all_lprobs = []
@@ -435,7 +390,7 @@
     else:
         gpt_tokenizer = None
 
-    if args.simulated_conversation_theme:
+    if simulated_conversation_theme:
         set_persona_str = prompt["set_persona_str"]
 
         if messages_conv is None:
@@ -444,8 +399,7 @@
 
             messages_conv, messages_conv_hash = simulate_conversation(
                 args=args,
-                engine=engine,
-                sim_engine=engine,
+                opening_question=opening_questions_for_themes[simulated_conversation_theme],
                 model_set_persona_string=set_persona_str,
                 simulated_participant=simulated_participant,
                 llm_generator=llm_generator,
@@ -473,7 +427,7 @@
         messages = construct_messages(
             prompt=prompt,
             system_message=True,
-            messages_conv=messages_conv if args.simulated_conversation_theme else None,
+            messages_conv=messages_conv if simulated_conversation_theme else None,
         )
 
         n_input_tokens = sum([len(gpt_tokenizer.encode(msg['content'])) for msg in messages])
@@ -483,7 +437,7 @@
         messages = construct_messages(
             prompt=prompt,
             system_message=llm_generator.system_message,
-            messages_conv=messages_conv if args.simulated_conversation_theme else None,
+            messages_conv=messages_conv if simulated_conversation_theme else None,
         )
 
         if args.verbose:
@@ -687,6 +641,7 @@
             participant_perm_dicts=participant_perm_dicts,
             llm_generator=llm_generator,
             simulated_participant=simulated_participant,
+            simulated_conversation_theme=args.simulated_conversation_theme,
         )
         all_cors.append(cors)
         gpt_tokens_total['input'] += gpt_tokens['input']
@@ -826,8 +781,8 @@
     parser.add_argument("--permute-options", "-po", action="store_true")
     parser.add_argument("--azure-openai", action="store_true")
    parser.add_argument("--simulated-human-knows-persona", action="store_true")
-    parser.add_argument("--simulated-population-type", "-pop", type=str, default="tolkien_characters", choices=["permutations", "tolkien_characters", "famous_people", "llm_personas", "user_personas", "anes"])
-    parser.add_argument("--permutations", "-p", type=int, default=1) # permutations as a population type
+    parser.add_argument("--simulated-population-type", "-pop", type=str, default="tolkien_characters", choices=["permutations", "tolkien_characters", "famous_people"])
+    parser.add_argument("--permutations", "-p", type=int, default=50)
     parser.add_argument("--permute-options-seed", type=str)
     parser.add_argument("--overwrite", action="store_true")
     args = parser.parse_args()
@@ -854,9 +809,6 @@
     if args.permute_options and args.permute_options_seed is None:
         raise ValueError("Permute options string should be defined for stability")
 
-    if ("gpt-3.5" in args.engine and args.permutations > 50) or ("gpt-4" in args.engine and args.permutations > 5):
-        raise ValueError(f"Are you sure you want to use {args.permutations} with {args.engine}??")
-
     start_time = time.time()
     main(args)
     end_time = time.time()
diff --git a/run_campaign_msgs.sh b/run_campaign_msgs.sh
new file mode 100644
index 0000000000000000000000000000000000000000..3e7c2e90915aa93832a97aa2dcd53feb80e10b97
--- /dev/null
+++ b/run_campaign_msgs.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+#SBATCH -A imi@a100
+#SBATCH -C a100
+#SBATCH --time=06:30:00
+#SBATCH --gres=gpu:2
+#SBATCH --array=0-24 # themes x n_msg -> 5x5 (no default profile, only contexts)
+#SBATCH -o slurm_logs/sb_log_%A_%a.out
+#SBATCH -e slurm_logs/sb_log_%A_%a.err
+##SBATCH --qos=qos_gpu-dev
+
+##########################################################
+# Set the questionnaire and population (using the second command argument)
+##########################################################
+
+test_tag="pvq"
+experiment_name="pvq_test"
+data_dir="data_pvq"
+population_type="permutations"
+
+# Print the selected configuration
+echo "test_tag=$test_tag"
+echo "experiment_name=$experiment_name"
+echo "data_dir=$data_dir"
+echo "population_type=$population_type"
+
+
+# Extract parameters: theme and seed
+##########################################################
+themes=("grammar" "joke" "poem" "history" "chess" "None")
+
+n_msgs_list=(9 7 5 3 1) # 5
+n_msgs_len=${#n_msgs_list[@]}
+
+theme_i=$(( SLURM_ARRAY_TASK_ID / $n_msgs_len ))
+msgs_i=$(( SLURM_ARRAY_TASK_ID % $n_msgs_len ))
+
+theme="${themes[$theme_i]}"
+n_msgs="${n_msgs_list[$msgs_i]}"
+
+permute_options_seed=$theme_i
+
+# Other params
+##########################################################
+engine="$1"
+
+echo "ID:"$SLURM_ARRAY_TASK_ID
+echo "Theme:"$theme
+echo "Seed:"$seed
+echo "Seed str:"$permute_options_seed
+echo "Evaluation:$engine:$theme:$permute_options_seed:$n_msgs:$test_tag:$population_type"
+
+# Setup the experiments directories
+##########################################################
+SUBDIR="stability_default_params_${test_tag}_${population_type}_msgs/${engine}/${n_msgs}_msgs/${seed}_seed/theme_${theme}"
+
+SAVE_DIR="results/"$SUBDIR
+LOG_DIR="logs/"$SUBDIR
+
+# Start the experiment
+##########################################################
+mkdir -p $LOG_DIR
+
+source $HOME/.bashrc
+
+## define the conda env to use
+case "$engine" in
+  phi-1|phi-2|Qwen1.5*|llama_3*|command_r_plus*|Mixtral-8x22B*)
+    conda activate llm_stability_phi
+    ;;
+  *)
+    conda activate llm_stability
+    ;;
+esac
+
+echo "SLURM_JOB_ID: "$SLURM_JOB_ID"_"$SLURM_ARRAY_TASK_ID | tee -a $LOG_DIR/log_$permute_options_seed.txt
+
+python -u evaluate.py \
+    --simulated-population-type $population_type \
+    --simulated-conversation-theme $theme \
+    --simulated-conversation-n-messages $n_msgs \
+    --permute-options \
+    --permute-options-seed "$permute_options_seed" \
+    --save_dir $SAVE_DIR \
+    --engine "$engine" \
+    --data_dir data/$data_dir \
+    --experiment_name $experiment_name \
+    --pvq-version "pvq_auto" \
+    --azure-openai \
+    --assert-params \
+    --verbose 2>&1 | tee -a $LOG_DIR/log_$permute_options_seed.txt
\ No newline at end of file
diff --git a/run_campaign_seeds.sh b/run_campaign_seeds.sh
index ef891e03fded0db32e545d8658e715c622cb1e64..496bd481af000b751d955614439dbfff845d1544 100644
--- a/run_campaign_seeds.sh
+++ b/run_campaign_seeds.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 #SBATCH -A imi@a100
 #SBATCH -C a100
-#SBATCH --time=02:29:59
+#SBATCH --time=06:30:00
 #SBATCH --gres=gpu:2
 #SBATCH --array=0-24 # themes x n_seeds -> 6x5 (0-24 wo None, 0-29 for all)
 #SBATCH -o slurm_logs/sb_log_%A_%a.out
 #SBATCH -e slurm_logs/sb_log_%A_%a.err
@@ -46,6 +46,12 @@ case "$experiment_setting" in
     data_dir="data_religion"
     population_type="famous_people"
     ;;
+  no_pop)
+    test_tag="pvq"
+    experiment_name="pvq_test"
+    data_dir="data_pvq"
+    population_type="permutations"
+    ;;
   *)
     echo "Invalid experiment_setting. Please use one of the following: pvq_tolk, pvq_fam, don, bag, religion."
     exit 1
@@ -78,7 +84,10 @@ permute_options_seed="$seed"_"$theme_i"
 # Other params
 ##########################################################
 engine="$1"
-n_msgs=3
+ n_msgs=3
+# n_msgs=9
+# n_msgs=19
+#n_msgs=29
 
 echo "ID:"$SLURM_ARRAY_TASK_ID
 echo "Theme:"$theme
@@ -88,7 +97,14 @@ echo "Evaluation:$engine:$theme:$permute_options_seed:$n_msgs:$test_tag:$populat
 
 # Setup the experiments directories
 ##########################################################
-SUBDIR="stability_default_params_${test_tag}_${population_type}/${engine}/seed_${seed}/theme_${theme}"
+#SUBDIR="stability_default_params_${test_tag}_${population_type}/${engine}/seed_${seed}/theme_${theme}"
+#SUBDIR="stability_default_params_${test_tag}_${population_type}_${n_msgs}_msgs/${engine}/seed_${seed}/theme_${theme}"
+
+if [ $n_msgs -eq 3 ]; then
+  SUBDIR="stability_default_params_${test_tag}_${population_type}/${engine}/seed_${seed}/theme_${theme}"
+else
+  SUBDIR="stability_default_params_${test_tag}_${population_type}_${n_msgs}_msgs/${engine}/seed_${seed}/theme_${theme}"
+fi
 
 SAVE_DIR="results/"$SUBDIR
 LOG_DIR="logs/"$SUBDIR
@@ -111,7 +127,7 @@ esac
 
 
 
-python -u evaluate_v3.py \
+python -u evaluate.py \
     --simulated-population-type $population_type \
    --simulated-conversation-theme $theme \
     --simulated-human-knows-persona \
diff --git a/run_single.sh b/run_single.sh
index 594bedb7bcfc4f357c85671c0c4b138c152e615e..caf71e9dffe53bcbe4f1268bc540171688305f2b 100644
--- a/run_single.sh
+++ b/run_single.sh
@@ -116,7 +116,7 @@ else
 fi
 
 
-python -u evaluate_v3.py \
+python -u evaluate.py \
     --engine "$engine" \
     --experiment_name $experiment_name \
     --data_dir data/$data_dir \