diff --git a/README.md b/README.md
index ece39972a0d286bd7eb125b31c538fd2a2d29edd..5559071951c48791ead819f1ca06db92a0400ecb 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,10 @@ Results of the test:
 | [UnifiedQA](https://arxiv.org/abs/2005.00700) | Khashabi et al., 2020 | 45.6 | 56.6 | 40.2 | 54.6 | 48.9
 | [GPT-3](https://arxiv.org/abs/2005.14165) (175B, few-shot) | Brown et al., 2020 | 40.8 | 50.4 | 36.7 | 48.8 | 43.9
 | [GPT-3](https://arxiv.org/abs/2005.14165) (6.7B, fine-tuned) | Brown et al., 2020 | 42.1 | 49.2 | 35.1 | 46.9 | 43.2
+| [flan-T5-large](https://arxiv.org/abs/2210.11416) | Chung et al., 2022 | 39.1 | 49.1 | 33.2 | 47.4 | 41.9
+| [flan-T5-base](https://arxiv.org/abs/2210.11416) | Chung et al., 2022 | 34.0 | 38.1 | 27.6 | 37.0 | 34.2
 | [GPT-2](https://arxiv.org/abs/2005.14165) | Radford et al., 2019 | 32.8 | 33.3 | 30.2 | 33.1 | 32.4
+| [flan-T5-small](https://arxiv.org/abs/2210.11416) | Chung et al., 2022 | 29.9 | 30.9 | 27.5 | 29.7 | 29.5
 | Random Baseline | N/A | 25.0 | 25.0 | 25.0 | 25.0 | 25.0

diff --git a/evaluate_flan.py b/evaluate_flan.py
index c4a22f478f93b4b3144d72e61453eaefa4403377..8afeb323a93adb9f54e39e65b442ef17a36fdbed 100644
--- a/evaluate_flan.py
+++ b/evaluate_flan.py
@@ -3,6 +3,7 @@ import os
 import torch
 import numpy as np
 import pandas as pd
+from categories import subcategories, categories
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 import time

@@ -58,7 +59,6 @@ def eval(args, subject, model, tokenizer, dev_df, test_df):
         train_prompt = gen_prompt(dev_df, subject, k)
         prompt = train_prompt + prompt_end
         input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
-        print(input_ids.shape[-1])

         label = test_df.iloc[i, test_df.shape[1] - 1]

@@ -128,6 +128,10 @@ def main(args):
         os.makedirs(os.path.join(args.save_dir, "results_{}".format(args.model)))

     all_cors = []
+    subcat_cors = {
+        subcat: [] for subcat_lists in subcategories.values() for subcat in subcat_lists
+    }
+    cat_cors = {cat: [] for cat in categories}

     for subject in subjects:
         dev_df = pd.read_csv(
@@ -138,6 +142,12 @@ def main(args):
         )

         cors, acc, probs = eval(args, subject, model, tokenizer, dev_df, test_df)
+        subcats = subcategories[subject]
+        for subcat in subcats:
+            subcat_cors[subcat].append(cors)
+            for key in categories.keys():
+                if subcat in categories[key]:
+                    cat_cors[key].append(cors)
         all_cors.append(cors)

         test_df["{}_correct".format(args.model)] = cors
@@ -151,6 +161,13 @@ def main(args):
             index=None,
         )

+    for subcat in subcat_cors:
+        subcat_acc = np.mean(np.concatenate(subcat_cors[subcat]))
+        print("Average accuracy {:.3f} - {}".format(subcat_acc, subcat))
+
+    for cat in cat_cors:
+        cat_acc = np.mean(np.concatenate(cat_cors[cat]))
+        print("Average accuracy {:.3f} - {}".format(cat_acc, cat))
     weighted_acc = np.mean(np.concatenate(all_cors))
     print("Average accuracy: {:.3f}".format(weighted_acc))
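Note on the aggregation this patch adds to `main()`: it assumes `categories.py` (imported at the top of the file) exposes two mappings, `subcategories` (subject name → list of subcategory names) and `categories` (category name → list of subcategory names). The snippet below is a minimal, self-contained sketch of that roll-up using made-up subjects and correctness vectors, not the real mappings, which cover all 57 MMLU subjects.

```python
import numpy as np

# Illustrative stand-ins for the real mappings in categories.py.
subcategories = {
    "abstract_algebra": ["math"],        # subject -> subcategory names
    "high_school_physics": ["physics"],
    "philosophy": ["philosophy"],
}
categories = {
    "STEM": ["math", "physics"],         # category -> subcategory names
    "humanities": ["philosophy"],
}

# Hypothetical per-subject 0/1 correctness vectors, as eval() returns in cors.
results = {
    "abstract_algebra": np.array([1, 0, 1]),
    "high_school_physics": np.array([0, 1]),
    "philosophy": np.array([1, 1, 0, 1]),
}

subcat_cors = {sc: [] for scs in subcategories.values() for sc in scs}
cat_cors = {cat: [] for cat in categories}

# Same roll-up as the patch: each subject's vector is filed under its
# subcategory, and under every category that lists that subcategory.
for subject, cors in results.items():
    for subcat in subcategories[subject]:
        subcat_cors[subcat].append(cors)
        for key in categories:
            if subcat in categories[key]:
                cat_cors[key].append(cors)

for cat, cors_list in cat_cors.items():
    cat_acc = np.mean(np.concatenate(cors_list))
    print("Average accuracy {:.3f} - {}".format(cat_acc, cat))
```

Concatenating the per-subject vectors before averaging weights each subcategory and category score by question count, which matches how the existing `weighted_acc` is computed from `all_cors`.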