diff --git a/README.md b/README.md index 0b5bfe7eef66985658d68bb1f20c6e7ff6748bd5..3881567c61de444f21ce97677b5a815a1371b40d 100644 --- a/README.md +++ b/README.md @@ -8,9 +8,35 @@ Setup the conda env ``` conda create -n llm_persp python=3.9 pip install -r requirements.txt -pip install git+https://github.com/huggingface/transformers@c612628045821f909020f7eb6784c79700813eda + +# install transformers +pip install git+https://github.com/huggingface/transformers.git +pip install -i https://test.pypi.org/simple/ bitsandbytes +conda install cudatoolkit -y +``` + +For openassistant create new env +``` +conda create --name llm_persp_oa --clone llm_persp +pip install git+https://github.com/huggingface/transformers@d04ec99bec8a0b432fc03ed60cea9a1a20ebaf3c ``` +[//]: # (or) + +[//]: # (```) + +[//]: # (git clone https://github.com/huggingface/transformers.git) + +[//]: # (cd transformers) + +[//]: # (git checkout d04ec99bec8a0b432fc03ed60cea9a1a20ebaf3c) + +[//]: # (pip install .) + +[//]: # (```) + + + ### Install llama if you want to run LLaMa models (this step is not needed to recreate experiments in the paper) Initialize and fetch the llama submodule diff --git a/evaluate.py b/evaluate.py index fdca153eaa59e12bfeb8df453f40f5e0c04f1588..ee0ea665c09478efcb32092706f66aeabceb3998 100644 --- a/evaluate.py +++ b/evaluate.py @@ -4,13 +4,22 @@ import random import re import json from collections import defaultdict +import os import matplotlib.pyplot as plt import tiktoken import torch import sys -from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList, TextStreamer + +hostname = os.uname()[1] +if hostname == "PTB-09003439": + hf_cache_dir = "/home/flowers-user/.cache/huggingface" +else: + hf_cache_dir = "/gpfsscratch/rech/imi/utu57ed/.cache/huggingface" + +os.environ['TRANSFORMERS_CACHE'] = hf_cache_dir +from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList, TextStreamer, pipeline import transformers import json @@ -23,6 +32,18 @@ def print_chat_messages(messages): print(f"{msg['role'].upper()} : {msg['content']}") print("*********************") +def parse_hf_outputs(output, tokenizer, answers): + # extract the score for each possible answer + option_scores = {ans: output.scores[0][0, tokenizer.convert_tokens_to_ids(ans)] for ans in answers} + + # take the most probable answer as the generation + generation = max(option_scores, key=option_scores.get) + + # extract logprobs + lprobs = [float(option_scores[a]) for a in answers] + + return option_scores, generation, lprobs + def create_simulated_messages(conv, last="user"): # simulate a conversation between two LLMs @@ -78,7 +99,6 @@ timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S") print("timestamp:", timestamp) import openai -import os import numpy as np import pandas as pd import time @@ -114,7 +134,7 @@ def get_prompt_skeleton(subject, experiment_name, args): if sum(map(bool, [args.profile, args.lotr_character, args.music_expert_genre, args.hobby])) > 1: raise ValueError("Multiple ways of inducing a perspective are defined.") - if "pvq" in experiment_name or "hofstede" in experiment_name or "big5" in experiment_name: + if "pvq" in experiment_name or "hofstede" in experiment_name or "big5" in experiment_name or "mmlu" in experiment_name: # Natural language profile is only defined for dictionary profile if not args.profile_dict: @@ -142,6 +162,10 @@ def get_prompt_skeleton(subject, experiment_name, args): else: raise ValueError(f"Data dir name is 
ill-defined {args.data_dir}") + elif "mmlu" in experiment_name: + assert "mmlu" in args.data_dir + test_name = "mmlu" + else: raise ValueError(f"Experiment name is ill-defined {args.experiment_name}") @@ -155,6 +179,11 @@ def get_prompt_skeleton(subject, experiment_name, args): questionnaire_description = "" questionnaire_description_empty = True + elif test_name == "mmlu": + # VSM questionnaire doesn't have a description + questionnaire_description = "The following are multiple choice questions (with answers)." + questionnaire_description_empty = False + elif test_name in ["big5_50", "big5_100"]: # VSM questionnaire doesn't have a description questionnaire_description = "Mark how much you agree with each statement." @@ -454,7 +483,7 @@ def get_prompt_skeleton(subject, experiment_name, args): if args.system_message: prompts = { - "system": f"{set_perspective_str}".rstrip(), # remove newline from the end + "system": f"{set_perspective_str}".rstrip(), # remove newline from the end "intro": f"{questionnaire_description}\n\n" if not questionnaire_description_empty else questionnaire_description, "query": f"\n{query_str}", } @@ -469,40 +498,6 @@ def get_prompt_skeleton(subject, experiment_name, args): "query": f"\n{query_str}", } - elif "mmlu" in experiment_name: - raise DeprecationWarning("Deprecated") - if args.natural_language_profile: - raise NotImplementedError("Natural language profile not implemented.") - - if args.perspective_amount not in ["extreme", "slight"]: - raise NotImplementedError("Perspective amount not implemented") - - questionnaire_description = "The following are multiple choice questions (with answers)." - prompts = { - "intro": f"{questionnaire_description}\n\n", - "query": f"\nAnswer:" - } - - if args.profile_dict: - # we define the perspective through a profile - profile_string = "\n".join([ - f"\t{k}:{v}" for k, v in args.profile_dict.items() - ]) - - if args.direct_perspective: - set_perspective_str = "You are a person with the following profile:" - query_str = "Answer:" - - else: - set_perspective_str = "The following is a questionnaire (with answers) given to a person with the following profile:" - query_str = "Answer (from the person):" - - prompts = { - "intro": f"{set_perspective_str}\n"+ - f"{profile_string}\n\n", - "query": f"\n{query_str}", - } - elif subject == "political_compass": raise DeprecationWarning("Deprecated") @@ -742,7 +737,7 @@ def eval(args, subject, engine, dev_df, test_df, permutations_dict, llm_generato gpt_token_counter = 0 if args.simulate_conversation_theme and not (args.system_message and engine in ["gpt-3.5-turbo", "gpt-3.5-turbo-0301", "gpt-4-0314", "gpt-3.5-turbo-0613", "gpt-4-0613"]): - raise NotImplementedError("simulated conversation is only implemented with direct message and GPT chat models.") + raise NotImplementedError("simulated conversation is only implemented with system message and GPT chat models.") for i in range(test_df.shape[0]): if i % 10 == 0: @@ -878,39 +873,12 @@ def eval(args, subject, engine, dev_df, test_df, permutations_dict, llm_generato "history": "What is the significance of the battle of Hastings. Answer in two sentences.", # slight collapse "chess": "1. e4", "grammar": "Can you check this sentence for grammar? \n Whilst Jane was waiting to meet hers child their nose started bleeding.", - # "code": "Can you write a bash script that changes the extensions of all the .jpg files to .jpeg.", # collapse - # - # - # - # - # "religion": "Why do we believe in the assumption of Mary? 
Please answer in two sentences.", "tax": "Can you explain how can my company pay less tax in two sentences?", - # "vgame": ""..., "vacation": "I was thinking of going surfing is that relaxing? Please reply briefly.", } - themes = { - "poem": "writing a poem together", - "joke": "telling a joke", - "history": "talking about history", - # slight collapse - "chess": "playing chess", - "grammar": "asking for grammar corrections", - # "code": "Can you write a bash script that changes the extensions of all the .jpg files to .jpeg.", # collapse - # - # - # - # - # - "religion": "discussing about religion", - "tax": "trying to see how to save money for my company", - "vgame": ..., - "vacation": "considering options for vacation", - } - opening_question = opening_questions_for_themes[args.simulate_conversation_theme] - theme = themes[args.simulate_conversation_theme] conversation = [opening_question] @@ -931,8 +899,7 @@ def eval(args, subject, engine, dev_df, test_df, permutations_dict, llm_generato else: assert simulated_conv_messages[0]['role'] == "assistant" simulated_conv_messages = [ - {"role": "system", "content": f"You are simulating a human using a chatbot."} # v1 - # {"role": "system", "content": f"You are simulating a human using a chatbot and {theme}."} # v2 + {"role": "system", "content": f"You are simulating a human using a chatbot."} ] + simulated_conv_messages engine_ = "gpt-4-0613" @@ -1076,6 +1043,57 @@ def eval(args, subject, engine, dev_df, test_df, permutations_dict, llm_generato # extract logprobs lprobs = [float(option_scores[a]) for a in answers] + elif engine in ["zephyr-7b-beta"]: + if args.simulate_conversation_theme: + raise NotImplementedError("Simulated conversation not implemented for zephyr.") + + tokenizer, model, pipe = llm_generator + + assert ("system" in prompt_skeleton) == args.system_message + + messages = [] + if prompt_skeleton.get("system", "") != "": + messages.append({"role": "system", "content": prompt_skeleton["system"]}) + + messages.append({"role": "user", "content": prompt}) + print_chat_messages(messages) + + prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + inputs = tokenizer(prompt, return_tensors="pt").to(model.device) + output = model.generate( + **inputs, + max_new_tokens=1, + # temperature=0.0001, + do_sample=False, + top_p=1.0, + return_dict_in_generate=True, + output_scores=True + ) + option_scores, generation, lprobs = parse_hf_outputs(output=output, tokenizer=tokenizer, answers=answers) + + # ### todo: remove below + # tokenizers and models are the same, pipe adds whitespace in encoding + # prompt_ = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + # output_ = pipe( + # prompt_, + # max_new_tokens=1, + # do_sample=False, + # # temperature=0.0001, + # top_p=1.0, + # # return_full_text=True, + # # return_all_scores=True, + # # return_dict_in_generate=True, + # # output_scores=True + # ) + # generated_text = tokenizer.decode(output['sequences'][0], skip_special_tokens=False) + # generated_text_ = output_[0]['generated_text'] + # + # generated_text = tokenizer.decode(output['sequences'][0], skip_special_tokens=False).split("assistant|>")[1] + # generated_text_ = output_[0]['generated_text'].split("assistant|>")[1] + # if not generated_text == generated_text_: + # from IPython import embed; embed(); + + elif engine in ["openassistant_rlhf2_llama30b"]: if args.generative_qa: raise NotImplementedError("Generative QA not implemented for OpenAI non-ChatGPT 
models.") @@ -1093,6 +1111,7 @@ def eval(args, subject, engine, dev_df, test_df, permutations_dict, llm_generato tokenizer, model = llm_generator inputs = tokenizer(prompt, return_tensors='pt').to('cuda') + del inputs["token_type_ids"] start_time = time.time() output = model.generate( **inputs, @@ -1105,16 +1124,14 @@ def eval(args, subject, engine, dev_df, test_df, permutations_dict, llm_generato ) print("inference time:", time.time()-start_time) - # extract the score for each possible answer - option_scores = { - ans: output.scores[0][0, tokenizer.convert_tokens_to_ids(ans)] for ans in answers - } + option_scores, generation, lprobs = parse_hf_outputs(output=output, tokenizer=tokenizer, answers=answers) - # take the most probable answer as the generation - generation = max(option_scores, key=option_scores.get) + # todo: remove if assert doesn't fail + option_scores_ = { ans: output.scores[0][0, tokenizer.convert_tokens_to_ids(ans)] for ans in answers } + generation_ = max(option_scores, key=option_scores.get) + lprobs_ = [float(option_scores[a]) for a in answers] - # extract logprobs - lprobs = [float(option_scores[a]) for a in answers] + assert (option_scores_, generation_, lprobs_) == (option_scores, generation, lprobs) elif engine in ["stablevicuna"]: # todo: combine with stablelm @@ -1147,16 +1164,11 @@ def eval(args, subject, engine, dev_df, test_df, permutations_dict, llm_generato if args.simulate_conversation_theme: raise NotImplementedError("noisy conversation not implemented for user message") - # extract the score for each possible answer - option_scores = { - ans: output.scores[0][0, tokenizer.convert_tokens_to_ids(ans)] for ans in answers - } - - # take the most probable answer as the generation - generation = max(option_scores, key=option_scores.get) - - # extract logprobs - lprobs = [float(option_scores[a]) for a in answers] + option_scores, generation, lprobs = parse_hf_outputs(output=output, tokenizer=tokenizer, answers=answers) + option_scores_ = { ans: output.scores[0][0, tokenizer.convert_tokens_to_ids(ans)] for ans in answers } + generation_ = max(option_scores, key=option_scores.get) + lprobs_ = [float(option_scores[a]) for a in answers] + assert (option_scores_, generation_, lprobs_) == (option_scores, generation, lprobs) elif engine in ["up_llama_60b_instruct", "up_llama2_70b_instruct_v2"]: if args.simulate_conversation_theme: @@ -1170,7 +1182,9 @@ def eval(args, subject, engine, dev_df, test_df, permutations_dict, llm_generato tokenizer, model = llm_generator inputs = tokenizer(prompt, return_tensors="pt").to(model.device) - # del inputs["token_type_ids"] + if "token_type_ids" in inputs: + del inputs["token_type_ids"] + streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) output = model.generate( @@ -1178,7 +1192,7 @@ def eval(args, subject, engine, dev_df, test_df, permutations_dict, llm_generato streamer=streamer, use_cache=True, max_new_tokens=1, - temperature=0.0001, + # temperature=0.0001, do_sample=False, top_p=1.0, return_dict_in_generate=True, @@ -1186,17 +1200,7 @@ def eval(args, subject, engine, dev_df, test_df, permutations_dict, llm_generato ) # output_text = tokenizer.decode(output[0], skip_special_tokens=True) - # extract the score for each possible answer - option_scores = { - ans: output.scores[0][0, tokenizer.convert_tokens_to_ids(ans)] for ans in answers - } - print("option_scores:", option_scores) - - # take the most probable answer as the generation - generation = max(option_scores, key=option_scores.get) - - # extract 
logprobs - lprobs = [float(option_scores[a]) for a in answers] + option_scores, generation, lprobs = parse_hf_outputs(output=output, tokenizer=tokenizer, answers=answers) elif engine in ["rp_incite_7b_instruct", "rp_incite_7b_chat"]: if args.system_message: @@ -1214,18 +1218,11 @@ def eval(args, subject, engine, dev_df, test_df, permutations_dict, llm_generato return_dict_in_generate=True, output_scores=True ) - # extract the score for each possible answer - option_scores = { - ans: output.scores[0][0, tokenizer.convert_tokens_to_ids(ans)] for ans in answers - } - - print("option_scores:", option_scores) - - # take the most probable answer as the generation - generation = max(option_scores, key=option_scores.get) - - # extract logprobs - lprobs = [float(option_scores[a]) for a in answers] + option_scores, generation, lprobs = parse_hf_outputs(output=output, tokenizer=tokenizer, answers=answers) + option_scores_ = { ans: output.scores[0][0, tokenizer.convert_tokens_to_ids(ans)] for ans in answers } + generation_ = max(option_scores, key=option_scores.get) + lprobs_ = [float(option_scores[a]) for a in answers] + assert (option_scores_, generation_, lprobs_) == (option_scores, generation, lprobs) elif engine in ["stablelm"]: if args.simulate_conversation_theme: @@ -1265,16 +1262,11 @@ def eval(args, subject, engine, dev_df, test_df, permutations_dict, llm_generato output_scores=True ) - # extract the score for each possible answer - option_scores = { - ans: output.scores[0][0, tokenizer.convert_tokens_to_ids(ans)] for ans in answers - } - - # take the most probable answer as the generation - generation = max(option_scores, key=option_scores.get) - - # extract logprobs - lprobs = [float(option_scores[a]) for a in answers] + option_scores, generation, lprobs = parse_hf_outputs(output=output, tokenizer=tokenizer, answers=answers) + option_scores_ = { ans: output.scores[0][0, tokenizer.convert_tokens_to_ids(ans)] for ans in answers } + generation_ = max(option_scores, key=option_scores.get) + lprobs_ = [float(option_scores[a]) for a in answers] + assert (option_scores_, generation_, lprobs_) == (option_scores, generation, lprobs) else: raise ValueError(f"Not recognized model {engine}.") @@ -1385,17 +1377,23 @@ def main(args): assert "data_mmlu" in args.data_dir subjects_to_evaluate = [ - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_mathematics", - "high_school_physics", - "high_school_psychology", - "high_school_us_history", - "high_school_world_history", + 'high_school_biology', + 'high_school_chemistry', + 'high_school_computer_science', + 'high_school_european_history', + 'high_school_geography', + 'high_school_government_and_politics', + 'high_school_macroeconomics', + 'high_school_mathematics', + 'high_school_microeconomics', + 'high_school_physics', + 'high_school_psychology', + 'high_school_statistics', + 'high_school_us_history', + 'high_school_world_history' ] + + assert set(subjects_to_evaluate).issubset(subjects) subjects = subjects_to_evaluate if "mmlu_college" in args.experiment_name: @@ -1412,8 +1410,10 @@ def main(args): assert set(subjects_to_evaluate).issubset(subjects) subjects = subjects_to_evaluate + # assert all college subjects are taken # subjects = [s for s in subjects if "college" in s] + if "data_hofstede" == args.data_dir: assert "hofstede" in args.experiment_name @@ -1479,9 +1479,22 @@ def main(args): max_seq_len=2048, max_batch_size=1, ) - elif engine in 
["up_llama_60b_instruct", "up_llama2_70b_instruct_v2"]: - hf_cache_dir = "/gpfsscratch/rech/imi/utu57ed/.cache/huggingface" + elif engine in ["zephyr-7b-beta"]: + print("Loading zephyr-7b-beta") + zephyr_pipe = transformers.pipeline( + "text-generation", + model="HuggingFaceH4/zephyr-7b-beta", + torch_dtype=torch.bfloat16, + device_map="auto" + ) + + tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta", cache_dir=hf_cache_dir, device_map="auto") + model = AutoModelForCausalLM.from_pretrained("HuggingFaceH4/zephyr-7b-beta", torch_dtype=torch.bfloat16, device_map="auto", cache_dir=hf_cache_dir) + + llm_generator = (tokenizer, model, zephyr_pipe) + + elif engine in ["up_llama_60b_instruct", "up_llama2_70b_instruct_v2"]: if engine == "up_llama2_70b_instruct_v2": tokenizer = AutoTokenizer.from_pretrained("upstage/Llama-2-70b-instruct-v2", cache_dir=hf_cache_dir) @@ -1494,7 +1507,6 @@ def main(args): ) elif engine == "up_llama_60b_instruct": - hf_cache_dir = "/gpfsscratch/rech/imi/utu57ed/.cache/huggingface" tokenizer = AutoTokenizer.from_pretrained("upstage/llama-65b-instruct", cache_dir=hf_cache_dir) model = AutoModelForCausalLM.from_pretrained( @@ -1505,14 +1517,12 @@ def main(args): rope_scaling={"type": "dynamic", "factor": 2} # allows handling of longer inputs ) - print("Loaded.") llm_generator = (tokenizer, model) - elif engine in ["falcon-40b", "falcon-40b-instruct"]: + raise NotImplementedError("Falcon not implemented.") model_name = f"tiiuae/{engine}" - hf_cache_dir = "/gpfsscratch/rech/imi/utu57ed/.cache/huggingface" tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=hf_cache_dir) model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=hf_cache_dir, trust_remote_code=True) @@ -1546,7 +1556,6 @@ def main(args): if engine == "rp_incite_7b_instruct": # init - hf_cache_dir = "/gpfsscratch/rech/imi/utu57ed/.cache/huggingface" tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-INCITE-7B-Instruct", cache_dir=hf_cache_dir) model = AutoModelForCausalLM.from_pretrained("togethercomputer/RedPajama-INCITE-7B-Instruct", torch_dtype=torch.float16, cache_dir=hf_cache_dir) @@ -1558,7 +1567,6 @@ def main(args): elif engine == "rp_incite_7b_chat": # init - hf_cache_dir = "/gpfsscratch/rech/imi/utu57ed/.cache/huggingface" tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-INCITE-7B-Chat", cache_dir=hf_cache_dir) model = AutoModelForCausalLM.from_pretrained("togethercomputer/RedPajama-INCITE-7B-Chat", torch_dtype=torch.float16, cache_dir=hf_cache_dir) model = model.to('cuda:0') @@ -1566,16 +1574,15 @@ def main(args): else: raise ValueError("Unknown model.") - print("Loaded.") llm_generator = (tokenizer, model) + elif engine in ["stablelm", "stablevicuna", "openassistant_rlhf2_llama30b"]: if engine == "stablelm": print("Loading stable-lm-tuned-alpha-7b.") - stablelm_cache_dir = "/gpfswork/rech/imi/utu57ed/stablelm_models" - tokenizer = AutoTokenizer.from_pretrained("StabilityAI/stablelm-tuned-alpha-7b", cache_dir=stablelm_cache_dir) - model = AutoModelForCausalLM.from_pretrained("StabilityAI/stablelm-tuned-alpha-7b", cache_dir=stablelm_cache_dir) + tokenizer = AutoTokenizer.from_pretrained("StabilityAI/stablelm-tuned-alpha-7b", cache_dir=hf_cache_dir) + model = AutoModelForCausalLM.from_pretrained("StabilityAI/stablelm-tuned-alpha-7b", cache_dir=hf_cache_dir) elif engine == "stablevicuna": print("Loading stable-vicuna-13b.") @@ -1594,31 +1601,40 @@ def main(args): raise NotImplementedError(f"{engine} not supported") 
model.half().cuda() - print("Loaded.") llm_generator = (tokenizer, model) else: llm_generator = None + print(f"Loaded model: {args.engine}.") + all_cors = [] # list because of permutations - subj_acc = [] - subj_len = [] - metrics = [] - answers = [] + subj_acc = [{} for _ in range(args.permutations)] + subj_len = [{} for _ in range(args.permutations)] + metrics = [{} for _ in range(args.permutations)] + answers = [{} for _ in range(args.permutations)] for subject in subjects: if args.ntrain >= 1: # create in-context examples from dev - dev_df = pd.read_csv(os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None)[:args.ntrain] + dev_df = pd.read_csv( + os.path.join(args.data_dir, "dev", subject + "_dev.csv"), + header=None, + keep_default_na=False, + )[:args.ntrain] # if the question contains \n in the csv it will get parsed as \\n, we revert it back here to be newline dev_df[0][:] = dev_df[0][:].str.replace("\\n", "\n") else: dev_df = None - test_df = pd.read_csv(os.path.join(args.data_dir, args.eval_set, subject + f"_{args.eval_set}.csv"), header=None) + test_df = pd.read_csv( + os.path.join(args.data_dir, args.eval_set, subject + f"_{args.eval_set}.csv"), + header=None, + keep_default_na=False, + ) # if the question contains \n in the csv it will get parsed as \\n, we revert it back here to be newline test_df[0][:] = test_df[0][:].str.replace("\\n", "\n") @@ -1654,10 +1670,10 @@ def main(args): for perm_i, permutations_dict in enumerate(permutations_dicts): print(f"PERMUTATION {perm_i}") - subj_acc.append({}) - subj_len.append({}) - metrics.append({}) - answers.append({}) + # subj_acc.append({}) + # subj_len.append({}) + # metrics.append({}) + # answers.append({}) cors, acc, probs, preds, gpt_tokens = eval( args=args, @@ -1816,13 +1832,12 @@ def main(args): plot_dict(m, savefile=os.path.join(dump_results_dir, f"plot_{subj}.png")) + # accuracy + plot_dict(mean_subj_acc, savefile=os.path.join(dump_results_dir, f"plot_mean_acc.png")) + if not os.path.exists(dump_results_dir): os.mkdir(dump_results_dir) - # if not all([(np.mean([m['pvq_male'][k] for m in metrics]) == mean_metrics['pvq_male'][k]) for k in mean_metrics['pvq_male'].keys()]): - # print("The metrics are not consistent.") - # from IPython import embed; embed(); - json_dump_path = os.path.join(dump_results_dir, 'results.json') with open(json_dump_path, 'w') as fp: diff --git a/models_stat_test.py b/models_stat_test.py index 47edf2a3c04e7d1eb153268f7b7a415e85937ea9..b6244aba2cf8164f895dbbde1c1de43be755ca49 100644 --- a/models_stat_test.py +++ b/models_stat_test.py @@ -6,10 +6,24 @@ from collections import defaultdict import scipy.stats as stats from termcolor import colored +data=defaultdict(dict) # use t-tests to compare +## Zephyr +data["zephyr"]["pvq_resS2"] = "results_neurips/results_nat_lang_prof_pvq_test_zephyr-7b-beta_perm_50_System_msg_2nd_prs/" +data["zephyr"]["pvq_resS3"] = "results_neurips/results_nat_lang_prof_pvq_test_zephyr-7b-beta_perm_50_System_msg_3rd_prs/" +data["zephyr"]["pvq_resU2"] = "results_neurips/results_nat_lang_prof_pvq_test_zephyr-7b-beta_perm_50_User_msg_2nd_prs/" +data["zephyr"]["pvq_resU3"] = "results_neurips/results_nat_lang_prof_pvq_test_zephyr-7b-beta_perm_50_User_msg_3rd_prs/" +data["zephyr"]["hof_resS2"] = "results_neurips/results_nat_lang_prof_hofstede_test_zephyr-7b-beta_perm_50_System_msg_2nd_prs/" +data["zephyr"]["hof_resS3"] = "results_neurips/results_nat_lang_prof_hofstede_test_zephyr-7b-beta_perm_50_System_msg_3rd_prs/" +data["zephyr"]["hof_resU2"] = 
"results_neurips/results_nat_lang_prof_hofstede_test_zephyr-7b-beta_perm_50_User_msg_2nd_prs/" +data["zephyr"]["hof_resU3"] = "results_neurips/results_nat_lang_prof_hofstede_test_zephyr-7b-beta_perm_50_User_msg_3rd_prs/" +data["zephyr"]["big5_resS2"] = "results_neurips/results_nat_lang_prof_big5_test_zephyr-7b-beta_perm_50_System_msg_2nd_prs/" +data["zephyr"]["big5_resS3"] = "results_neurips/results_nat_lang_prof_big5_test_zephyr-7b-beta_perm_50_System_msg_3rd_prs/" +data["zephyr"]["big5_resU2"] = "results_neurips/results_nat_lang_prof_big5_test_zephyr-7b-beta_perm_50_User_msg_2nd_prs/" +data["zephyr"]["big5_resU3"] = "results_neurips/results_nat_lang_prof_big5_test_zephyr-7b-beta_perm_50_User_msg_3rd_prs/" + ## GPT4 -data=defaultdict(dict) # data["gpt4"]["pvq_resS2"] = "results_neurips/results_nat_lang_prof_pvq_test_gpt-4-0314_perm_50_System_msg_2nd_prs/" data["gpt4"]["pvq_resS3"] = "results_neurips/results_nat_lang_prof_pvq_test_gpt-4-0314_perm_50_System_msg_3rd_prs/" # data["gpt4"]["pvq_resU2"] = "results_neurips/results_nat_lang_prof_pvq_test_gpt-4-0314_perm_50_User_msg_2nd_prs/" @@ -172,24 +186,24 @@ data["ada"]["big5_resU2"]="results_neurips/results_nat_lang_prof_big5_test_ada_p data["ada"]["big5_resU3"]="results_neurips/results_nat_lang_prof_big5_test_ada_perm_50_User_msg_3rd_prs/" -models = ["gpt4", "gpt35m", "gpt35j", "gpt35in", "upllama2","upllama1", "oa", "stvic", "stlm", "llama", "rpchat", "rpincite", "curie", "babbage", "ada"] +models = ["zephyr", "gpt4", "gpt35m", "gpt35j", "gpt35in", "upllama2","upllama1", "oa", "stvic", "stlm", "llama", "rpchat", "rpincite", "curie", "babbage", "ada"] msg = ["S", "U"] prs = ["2", "3"] # pvq -# questionnaires = ["pvq"] -# comparisons = [("gpt35m", m) for m in models] -# label_1 = "pvq_resU2" +questionnaires = ["pvq"] +comparisons = [("gpt35m", m) for m in models] +label_best = "pvq_resU2" # hof -# questionnaires = ["hof"] -# comparisons = [("upllama1", m) for m in models] -# label_1 = "hof_resU3" +questionnaires = ["hof"] +comparisons = [("upllama1", m) for m in models] +label_best = "hof_resU3" # # # big5 questionnaires = ["big5"] comparisons = [("gpt35j", m) for m in models] -label_1 = "big5_resS3" +label_best = "big5_resS3" # replace paths with data from alignments.json @@ -219,6 +233,7 @@ for model in models: # Append the data to the list json_data.extend(load_data) data[model][label] = json_data + p_limit = 0.05 / 15 print("p-limit: {}".format(p_limit)) @@ -237,7 +252,7 @@ for mod_1, mod_2 in comparisons: if label not in data[mod_1] or label not in data[mod_2]: continue - a=data[mod_1][label_1] + a=data[mod_1][label_best] b=data[mod_2][label] pvalue = stats.ttest_ind(a=a, b=b, equal_var=False).pvalue diff --git a/requirements.txt b/requirements.txt index d4bea5b40ca0b1e020b1c31a7001b732704c4b0c..d2029efed2cda9b5966e35e8faefe99cf473a8bd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -86,8 +86,18 @@ wcwidth==0.2.6 wsproto==1.2.0 yarl==1.8.2 zipp==3.15.0 +tiktoken==0.5.1 +matplotlib +openai +pandas +bs +selenium +torch==1.13.1 +accelerate==0.18.0 +sentencepiece==0.1.98 +protobuf==3.20.1 # matplotlib==3.7.1 -# tiktoken==0.4.0 +# tiktoken==0.4.0kk # pandas==2.0.1 # selenium==4.9.1 # bs4 diff --git a/visualization_scripts/bar_viz.py b/visualization_scripts/bar_viz.py index a37c06869e2b1595e7f84dfc5a5860044e297567..b5ab15c94b3a55f7503515c4c6cc3f3a49e5e553 100644 --- a/visualization_scripts/bar_viz.py +++ b/visualization_scripts/bar_viz.py @@ -13,11 +13,34 @@ import scikit_posthocs as sp import pandas as pd import pingouin as pg from 
itertools import combinations -from scipy.stats import pearsonr, spearmanr +from scipy.stats import pearsonr, spearmanr, ConstantInputWarning def is_strictly_increasing(lst): return all(x <= y for x, y in zip(lst, lst[1:])) +def print_dict_values(d): + print("\t".join([f"{k:<10}" for k in list(d.keys()) + ["Mean"]])) + print("\t\t".join([f"{np.round(s, 2):.2}" for s in list(d.values()) + [np.mean(list(d.values()))]])) + +warnings.filterwarnings('error', category=ConstantInputWarning) +def compute_correlation(scores_1, scores_2, spearman): + try: + if spearman: + correlation, _ = spearmanr(scores_1, scores_2) + else: + correlation, _ = pearsonr(scores_1, scores_2) + + except ConstantInputWarning: + # not ideal but no other way + if len(set(scores_1)) == 1 and len(set(scores_2)) == 1: + # both constant + correlation = 1.0 + else: + # one constant + correlation = 0.0 + + return correlation + def dir_to_label(directory): @@ -162,10 +185,10 @@ def print_correlation_stats(cs): "STD": np.std(cs), "Min": np.min(cs), "Max": np.max(cs), - "Perc 25": np.percentile(cs, 25), - "Perc 75": np.percentile(cs, 75), - "Skew": skew(cs, axis=0, bias=True), - "Kurtosis": kurtosis(cs, axis=0, bias=True) + # "Perc 25": np.percentile(cs, 25), + # "Perc 75": np.percentile(cs, 75), + # "Skew": skew(cs, axis=0, bias=True), + # "Kurtosis": kurtosis(cs, axis=0, bias=True) } headers = "\t".join(statistics.keys()) @@ -428,6 +451,7 @@ if __name__ == '__main__': normalized_evaluation_data = [] notnorm_evaluation_data = [] + vals = [] parser = argparse.ArgumentParser() parser.add_argument('directories', nargs='+', help='directories containing results.json files') @@ -458,9 +482,9 @@ if __name__ == '__main__': ignore = [ - # "religion", - # "tax", - # "vacation", + "religion", + "tax", + "vacation", # "grammar", # "poem", @@ -502,6 +526,8 @@ if __name__ == '__main__': ignore_patterns = ["gen_space", "gen_w_space"] print("Ignoring patterns: ", ignore_patterns) + rank_order_stability = True + for substring in ignore_patterns: directories = [d for d in directories if substring not in d] @@ -515,10 +541,16 @@ if __name__ == '__main__': test_set_name = "big5_50" elif "big5_100" in directories[0]: test_set_name = "big5_100" + elif "mmlu" in directories[0]: + raise NotImplementedError("not implemented for mmlu") + test_set_name = "college_biology" + # test_set_name = "college_chemistry" + # test_set_name = "college_computer_science" + # test_set_name = "college_medicine" + # test_set_name = "college_physics" else: test_set_name = "pvq_male" - dir_2_data = {} for i, directory in enumerate(directories): if not os.path.isdir(directory): @@ -527,11 +559,11 @@ if __name__ == '__main__': if not os.path.isfile(results_json_path): continue + with open(results_json_path, 'r') as f: data = json.load(f) dir_2_data[directory] = data - if test_set_name == "pvq_male": test_set_values = [ 'Conformity', @@ -562,6 +594,8 @@ if __name__ == '__main__': "Agreeableness", "Conscientiousness" ] + elif "college_" in test_set_name: + test_set_values = ["accuracy"] primary_value_alignments = [] mean_vars = [] @@ -676,8 +710,23 @@ if __name__ == '__main__': normalized_evaluation_data = np.array(normalized_evaluation_data) notnorm_evaluation_data = np.array(notnorm_evaluation_data) - # mean over value dimensions - # mean_variance = variances.mean() + # for each value we compute the mean and var over perspectives + per_value_persp_mean = notnorm_evaluation_data.mean(axis=0).mean(axis=0) + # avg over permutation + per_value_persp_std = 
notnorm_evaluation_data.std(axis=0).mean(axis=0) + # std over both perspectives and permutation + per_value_std_std = notnorm_evaluation_data.std(axis=(0,1)) + + print("Per value std of perspectives") + for v_, m_, std_, std_std_ in zip(test_set_values, per_value_persp_mean, per_value_persp_std, per_value_std_std): + print("{:<15} \tM: {:.2f} \tSD (avg perm): {:.2f} \tSD (sd perm) {:.2f}".format(v_, m_, std_, std_std_)) + print("{:<15} \tM: {:.2f} \tSD (avg perm): {:.2f} \tSD (sd perm) {:.2f}".format( + "Mean", + np.mean(per_value_persp_mean), + np.mean(per_value_persp_std), + np.mean(per_value_std_std) + )) + # print(f"Mean (over values) Variance (over perspectives): {mean_variance}") perm_var = normalized_evaluation_data.var(1).mean() @@ -689,6 +738,7 @@ if __name__ == '__main__': # mean(permut) -> var (persp) -> mean (values) print(f"Perspective Var - mean (over values/traits) of var (over perspectives) of mean (over perm) (*10^3): {round(persp_var * (10 ** 3), 2)}") + # all_evaluation_data = (n_perspectives, n_permutations, n_values/traits) # we do a separate anova for each value/trait assert normalized_evaluation_data.shape[-1] == len(test_set_values) @@ -705,130 +755,57 @@ if __name__ == '__main__': print("testing p-value: ", group_p_limit) if test_set_name == "pvq_male": - # PVQ centering - is this reccomented in this manual? # todo: do properly all questions + # PVQ centering - is this reccomended in this manual? # todo: do properly all questions notnorm_evaluation_data = notnorm_evaluation_data - np.repeat(notnorm_evaluation_data.mean(2)[:,:, np.newaxis], 10, axis=2) + cohens_ds=defaultdict(list) + for val_i, val in enumerate(test_set_values): # value_data = normalized_evaluation_data[:, :, val_i] # value_data = (n_perspectives, n_permutations) value_data = notnorm_evaluation_data[:, :, val_i] # value_data = (n_perspectives, n_permutations) print(f"\n----------------{val}---------------------") - assume_assumptions=True - - if assume_assumptions: - # it's usually ok to violate assumptions - levene_ok = True - gaussian = True - - else: - # for assumptions - alpha = 0.001 - - # LEVENE test for equal variance (assumption for anova and tukey) - levene_test_stat, levene_p_value = stats.levene(*value_data) - if levene_p_value < alpha: # or whatever alpha level you choose - levene_ok = False - print(f"Levene not OK (p={levene_p_value}) - not same variances") - - else: - levene_ok = True - print(f"Levene OK (p={levene_p_value}) - same variances") - - # Shapiro - normality - gaussian = [] - ps = [] - for v_d in value_data: - _, p = shapiro(v_d) - ps.append(p) - - if p < alpha: - gaussian.append(False) - else: - gaussian.append(True) - - # plt.hist(v_d) - # plt.title(f"p = {p} - {'' if gaussian[-1] else 'not'} gaussian") - # plt.show() - - gaussian = all(gaussian) - # print(f"Gaussian (p={ps}): {gaussian}") - print(f"Gaussian (Shapiro) (p={np.round(ps, 5)}): {gaussian}") - - if gaussian: - if levene_ok: - # standard ANOVA - testname = "ANOVA" - _, pvalue = stats.f_oneway(*value_data) - else: - # Welch ANOVA - test_name = "Welch ANOVA" - df = pd.DataFrame({'score': np.array(value_data).flatten(), - 'group': np.repeat(labels, repeats=50)}) - - p_value = pg.welch_anova(dv='score', between='group', data=df)["p-unc"].values - assert len(p_value) == 1 - p_value = p_value[0] - - else: - # kruskal - testname = "Krushal-Wallis (H-test)" - _, pvalue = stats.kruskal(*value_data) - + # standard ANOVA + _, pvalue = stats.f_oneway(*value_data) if pvalue >= group_p_limit: different_distr.append(False) - 
print(colored(f"p({testname}): {pvalue} > {group_p_limit}", "red")) + print(colored(f"p(ANOVA): {pvalue} > {group_p_limit}", "red")) + posthoc_test=False else: different_distr.append(True) - print(colored(f"p({testname}): {pvalue} < {group_p_limit}", "green")) + print(colored(f"p(ANOVA): {pvalue} < {group_p_limit}", "green")) + posthoc_test=True - pairs_ind = list(itertools.combinations(range(len(value_data)), 2)) + # do posthocs and compute cohen's ds + pairs_ind = list(itertools.combinations(range(len(value_data)), 2)) - if gaussian and levene_ok: - # Tukey - tukey_res=tukey_hsd(*value_data) + if posthoc_test: + tukey_res=tukey_hsd(*value_data) # Tukey test - # print("\tTukey (p limit) :", p_limit) - print("\tTukey (p limit) :", group_p_limit) - for pair_ind in pairs_ind: + print("\tTukey (p limit) :", group_p_limit) + for pair_ind in pairs_ind: - res_pvalue = tukey_res.pvalue[pair_ind[0], pair_ind[1]] + # cohen's d + d = cohen_d(value_data[pair_ind[0]], value_data[pair_ind[1]]) + cohens_ds[val].append(d) - if res_pvalue < p_limit: - d = cohen_d(value_data[pair_ind[0]], value_data[pair_ind[1]]) - print("\t{} - {} : p -> {:.8f} d -> {:.8f}".format(labels[pair_ind[0]], labels[pair_ind[1]], res_pvalue, d)) + # posthoc + if posthoc_test: + res_pvalue = tukey_res.pvalue[pair_ind[0], pair_ind[1]] + if res_pvalue < p_limit: + print("\t{} - {} : p -> {:.8f} d -> {:.8f}".format( + labels[pair_ind[0]], + labels[pair_ind[1]], + res_pvalue, d) + ) - else: - # Dunn - dunn_res = sp.posthoc_dunn(value_data, p_adjust='bonferroni') - for pair_ind in pairs_ind: - res_pvalue = dunn_res.iloc[pair_ind[0], pair_ind[1]] - if res_pvalue < p_limit: - print("\t{} - {} : {}".format(labels[pair_ind[0]], labels[pair_ind[1]], res_pvalue)) - # # bonferroni correction - # n_comparisons = len(pairs_ind) - # bonf_p_limit = p_limit / len(pairs_ind) - # - # print(f"\tBonferroni corrected p : {p_limit} -> {bonf_p_limit}") - # print(f"\t{'t-test' if gaussian else 'u-test'} p < {bonf_p_limit} for: ") - # - # for pair_ind in pairs_ind: - # if gaussian: - # # welch or standard t-test (welch - leven_ok - equal_var:True) - # res_pvalue = stats.ttest_ind(a=value_data[pair_ind[0]], b=value_data[pair_ind[1]], equal_var=levene_ok).pvalue - # else: - # from IPython import embed; embed(); - # - # # Mann-Whitney U - # res_pvalue = stats.mannwhitneyu(x=value_data[pair_ind[0]], y=value_data[pair_ind[1]]).pvalue - # - # pair_labels = (labels[pair_ind[0]], labels[pair_ind[1]]) - # if res_pvalue < bonf_p_limit: - # print("\t{} -> {} < {}".format(pair_labels, res_pvalue, bonf_p_limit)) - + avg_abs_cohens_ds = {k:np.mean(np.abs(v)) for k,v in cohens_ds.items()} + print(colored("Average absolute cohen's ds:", "green")) + print_dict_values(avg_abs_cohens_ds) # plot bars for distr_i, different in enumerate(different_distr): @@ -866,135 +843,86 @@ if __name__ == '__main__': print() ######################## - # CORRELATIONS + # RANK ORDER STABILITY CORRELATIONS ######################## + if rank_order_stability: + print(colored("Pearson Correlation (between perspectives) - permutation order", "green")) + corrs = defaultdict(list) - print(colored("Pearson Correlation (between perspectives) - permutation order", "green")) - corrs = defaultdict(list) + perm_orders = defaultdict(dict) - perm_orders = defaultdict(dict) + for dir_1, dir_2 in combinations(directories, 2): - for dir_1, dir_2 in combinations(directories, 2): + lab_1 = dir_to_label(dir_1) + lab_2 = dir_to_label(dir_2) - lab_1 = dir_to_label(dir_1) - lab_2 = dir_to_label(dir_2) + # 
print(f"{lab_1} vs {lab_2}") + for key in keys: - print(f"{lab_1} vs {lab_2}") - for key in keys: + scores_1 = [d[test_set_name][key] for d in dir_2_data[dir_1]["per_permutation_metrics"]] + scores_2 = [d[test_set_name][key] for d in dir_2_data[dir_2]["per_permutation_metrics"]] - scores_1 = [d[test_set_name][key] for d in dir_2_data[dir_1]["per_permutation_metrics"]] - scores_2 = [d[test_set_name][key] for d in dir_2_data[dir_2]["per_permutation_metrics"]] + correlation = compute_correlation(scores_1, scores_2, spearman) + corrs[key].append(correlation) - if spearman: - correlation, _ = spearmanr(scores_1, scores_2) - else: - correlation, _ = pearsonr(scores_1, scores_2) + # print(f"\t{key} : {correlation}") - if np.isnan(correlation): - # Constant values in one group. Correlation is undefined and artificially set to 0 - correlation = 0.0 + all_corrs = list(itertools.chain(*corrs.values())) + print_aggregated_correlation_stats(all_corrs) - corrs[key].append(correlation) + permutation_order_stabilities = {} + for key in keys: + permutation_order_stabilities[key] = np.mean(corrs[key]) - print(f"\t{key} : {correlation}") + print("\nPermutation order stability due to perspective change") + print_dict_values(permutation_order_stabilities) - for key in keys: - print(f"{key}:") - print("\tAvg Pearson corr:", np.mean(corrs[key])) - print("\tMin Pearson corr:", np.min(corrs[key])) - all_corrs = list(itertools.chain(*corrs.values())) - print_aggregated_correlation_stats(all_corrs) + print("--------------------------------------------------") - print("--------------------------------------------------") + # order of perspectives + print(colored("Pearson Correlation (between permutations) - perspective order", "green")) + corrs = defaultdict(list) + persp_orders = defaultdict(dict) - # order of perspectives - print(colored("Pearson Correlation (between permutations) - perspective order", "green")) - corrs = defaultdict(list) - persp_orders = defaultdict(dict) - n_perms = len(dir_2_data[dir_1]["per_permutation_metrics"]) + n_perms = len(dir_2_data[dir_1]["per_permutation_metrics"]) - assert n_perms == 50 - assert n_perms == len(dir_2_data[dir_2]["per_permutation_metrics"]) + assert n_perms == 50 + assert n_perms == len(dir_2_data[dir_2]["per_permutation_metrics"]) + + for perm_1, perm_2 in combinations(range(n_perms), 2): + + for key in keys: + scores_1 = [dir_2_data[d]["per_permutation_metrics"][perm_1][test_set_name][key] for d in directories] + scores_2 = [dir_2_data[d]["per_permutation_metrics"][perm_2][test_set_name][key] for d in directories] - for perm_1, perm_2 in combinations(range(n_perms), 2): + correlation = compute_correlation(scores_1, scores_2, spearman) + corrs[key].append(correlation) + all_corrs = list(itertools.chain(*corrs.values())) + print_aggregated_correlation_stats(all_corrs) + + perspective_order_stabilities = {} for key in keys: - scores_1 = [dir_2_data[d]["per_permutation_metrics"][perm_1][test_set_name][key] for d in directories] - scores_2 = [dir_2_data[d]["per_permutation_metrics"][perm_2][test_set_name][key] for d in directories] + perspective_order_stabilities[key] = np.mean(corrs[key]) - if spearman: - correlation, _ = spearmanr(scores_1, scores_2) - else: - correlation, _ = pearsonr(scores_1, scores_2) - - if np.isnan(correlation): - # Constant values in one group. 
Correlation is undefined and artificially set to 0 - correlation = 0.0 - - corrs[key].append(correlation) - - for key in keys: - print(key) - print("\tAvg Pearson corr:", np.mean(corrs[key])) - print("\tMin Pearson corr:", np.min(corrs[key])) - - all_corrs = list(itertools.chain(*corrs.values())) - - print_aggregated_correlation_stats(all_corrs) - - # print("--------------------------------------------------") - # # order of perspectives - avg over permutations first (average scores) - # print(colored("Pearson Correlation (between perspectives) - values order (average scores)", "green")) - # corrs = list() - # key_orders = defaultdict(dict) - # persp_scores = defaultdict(dict) - # - # n_perms = len(dir_2_data[dir_1]["per_permutation_metrics"]) - # - # assert n_perms == 50 - # assert n_perms == len(dir_2_data[dir_2]["per_permutation_metrics"]) - # - # for d in directories: - # for key in keys: - # persp_scores[d][key] = np.mean([dir_2_data[d]["per_permutation_metrics"][i][test_set_name][key] for i in range(50)]) - # - # # keys in order - # key_orders[d] = [k for k, v in sorted(persp_scores[d].items(), key=lambda item: item[1])] - # - # for dir1, dir2 in combinations(directories, 2): - # - # if compare_scores: - # # key scores in order - # values1 = [persp_scores[dir1][k] for k in key_orders[dir1]] - # assert is_strictly_increasing(values1) - # # same order of keys (values/traits) with other scores - # values2 = [persp_scores[dir2][k] for k in key_orders[dir1]] - # - # else: - # mapping = {v: i for (i, v) in enumerate(key_orders[dir1])} - # - # values1 = [mapping[v] for v in key_orders[dir1]] - # values2 = [mapping[v] for v in key_orders[dir2]] - # - # if spearman: - # correlation, _ = spearmanr(values1, values2) - # else: - # correlation, _ = pearsonr(values1, values2) - # - # if np.isnan(correlation): - # assert compare_scores - # # Constant values in one group. 
Correlation is undefined and artificially set to 0 - # correlation = 0.0 - # - # print(f"\t{dir_to_label(dir1)} - {dir_to_label(dir2)} : {correlation}") - # corrs.append(correlation) - # - # print_aggregated_correlation_stats(corrs) + print("\nPerspective order stability due to permutation change") + print_dict_values(perspective_order_stabilities) + + print(colored("\nAverage rank-order stability due to permutation change", "green")) + # average order stability + avg_order_stabilities = {} + for k in perspective_order_stabilities.keys(): + avg_order_stabilities[k] = (permutation_order_stabilities[k] + perspective_order_stabilities[k]) / 2 + print_dict_values(avg_order_stabilities) + + ### IPSATIVE CORRELATIONS + print("\n\n--------------------------------------------------") + print("IPSATIVE CORELATIONS") print("--------------------------------------------------") # order of perspectives - avg over permutations last (average r) @@ -1003,10 +931,9 @@ if __name__ == '__main__': key_orders = defaultdict(dict) persp_scores = defaultdict(lambda : defaultdict(dict)) - n_perms = len(dir_2_data[dir_1]["per_permutation_metrics"]) + n_perms = len(dir_2_data[directories[0]]["per_permutation_metrics"]) assert n_perms == 50 - assert n_perms == len(dir_2_data[dir_2]["per_permutation_metrics"]) for p_i in range(50): for d in directories: @@ -1022,14 +949,16 @@ if __name__ == '__main__': for p_i in range(50): scores_1 = [persp_scores[p_i][dir1][k] for k in keys] scores_2 = [persp_scores[p_i][dir2][k] for k in keys] - if spearman: - correlation, _ = spearmanr(scores_1, scores_2) - else: - correlation, _ = pearsonr(scores_1, scores_2) - if np.isnan(correlation): - # Constant values in one group. Correlation is undefined and artificially set to 0 - correlation = 0.0 + correlation = compute_correlation(scores_1, scores_2, spearman) + # if spearman: + # correlation, _ = spearmanr(scores_1, scores_2) + # else: + # correlation, _ = pearsonr(scores_1, scores_2) + # + # if np.isnan(correlation): + # # Constant values in one group. Correlation is undefined and artificially set to 0 + # correlation = 0.0 correlation_list.append(correlation)
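
Reviewer note: the last hunk above replaces the inline Pearson/Spearman code with the new `compute_correlation` helper added near the top of `visualization_scripts/bar_viz.py`. Below is a minimal, standalone sketch of that helper's constant-input behaviour (assuming SciPy ≥ 1.9, which provides `scipy.stats.ConstantInputWarning`); the `spearman=False` default and the demo under `__main__` are illustrative additions, not part of the patch.

```
# Standalone sketch mirroring the compute_correlation helper from the patch.
# scipy emits ConstantInputWarning when an input is constant; the helper escalates
# that warning to an exception and falls back to 1.0 (both inputs constant) or
# 0.0 (only one input constant), instead of letting NaN propagate.
import warnings
from scipy.stats import pearsonr, spearmanr, ConstantInputWarning

# turn the warning into an exception so the except-branch below can catch it
warnings.filterwarnings("error", category=ConstantInputWarning)

def compute_correlation(scores_1, scores_2, spearman=False):
    try:
        if spearman:
            correlation, _ = spearmanr(scores_1, scores_2)
        else:
            correlation, _ = pearsonr(scores_1, scores_2)
    except ConstantInputWarning:
        # correlation is undefined for constant input; mirror the patch's convention
        if len(set(scores_1)) == 1 and len(set(scores_2)) == 1:
            correlation = 1.0   # both constant
        else:
            correlation = 0.0   # one constant
    return correlation

if __name__ == "__main__":
    print(compute_correlation([1, 2, 3], [2, 4, 6]))  # 1.0, perfectly correlated
    print(compute_correlation([1, 1, 1], [2, 4, 6]))  # 0.0, one side constant
    print(compute_correlation([1, 1, 1], [5, 5, 5]))  # 1.0, both sides constant
```

Escalating `ConstantInputWarning` to an exception keeps the undefined-correlation case explicit rather than silently producing NaN, which is what the removed `np.isnan(correlation)` checks previously had to patch over.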