
Commit b060521d authored by BERNIER Fabien

[~] Reorganization for text

parent fdb6b441
@@ -7,7 +7,8 @@ from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from fixout.core_text import FixOutText, EnsembleOutText
from fixout.core_text import EnsembleOutText
from fixout.lime_text_global import TextExplainer
from time import time
@@ -29,25 +30,23 @@ model.fit(X_train, y_train)
# evaluating our model
print("Accuracy:", model.score(X_test, y_test))
"""
# explaining our model
sensitive_words = list(map(lambda x: [x], ["com", "nigel", "of", "host", "library", "canrem", "symposium", "desks", "edu"]))
fixout = FixOutText(X_train, y_train, sensitives=sensitive_words, max_features=-1)
fair_flag, words_weights, actual_sensitive, explainer = fixout.is_fair(model)
explainer = TextExplainer(model.predict_proba)
explainer.global_explanation(X_test, n_samples=250)
for word, contrib in explainer.get_top_k(k=10):
    print(word, '\t', contrib)
# correcting fairness if necessary
# if fair_flag :
# print("Model is fair ! \o/")
# else :
# print("Model not fair, " + " ".join(actual_sensitive) + " in the main features...")
sensitive_words = list(map(lambda x: [x], ["com", "nigel", "of", "host", "library", "canrem", "symposium", "desks", "edu"]))
ensemble = EnsembleOutText(model, sensitive_words)
ensemble.fit(X_train, y_train)
print("Ensemble accuracy:", ensemble.score(X_test, y_test))
fair_flag_ens, words_weights_ens, actual_sensitive_ens, explainer_ens = fixout.is_fair(ensemble)
# explaining the ensemble model
ensemble_explainer = TextExplainer(ensemble.predict_proba)
ensemble_explainer.global_explanation(X_test, n_samples=250)
ensemble2 = EnsembleOutText(rf, sensitive_words, tokenizer=vectorizer.transform)
ensemble2.fit(X_train, y_train)
print("Ensemble accuracy:", ensemble2.score(X_test, y_test))
fair_flag_ens2, words_weights_ens2, actual_sensitive_ens2, explainer_ens2 = fixout.is_fair(ensemble2)
"""
\ No newline at end of file
for word, contrib in explainer.get_top_k(k=10):
    print(word, '\t', contrib)
\ No newline at end of file
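Taken together, the new flow in this example is: fit a probabilistic text model, wrap its predict_proba in a TextExplainer, build a global explanation from a sample of documents, and read off the top contributing words. A minimal self-contained sketch of that flow, assuming a 20newsgroups-style setup (the dataset, categories, and pipeline below are illustrative, not the hidden top of this script):

# Minimal sketch of the new TextExplainer flow shown in the diff above;
# dataset, categories, and pipeline are assumptions for illustration.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from fixout.lime_text_global import TextExplainer

data = fetch_20newsgroups(subset="train", categories=["sci.med", "sci.space"])
X_train, y_train = data.data, data.target

# any pipeline exposing predict_proba on raw strings works here
model = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000))
model.fit(X_train, y_train)

explainer = TextExplainer(model.predict_proba, class_names=data.target_names)
explainer.global_explanation(X_train, n_samples=50)
for word, contrib in explainer.get_top_k(k=10):
    print(word, '\t', contrib)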
fixout/__init__.py

# from .core import FixOut
from .core_text import FixOutText
from .core_text import EnsembleOutText
\ No newline at end of file
fixout/core_text.py

@@ -5,38 +5,13 @@ import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.base import ClassifierMixin
from fixout.lime_text_global import fairness_eval
from copy import deepcopy
import numpy as np
import re
class FixOutText:

    def __init__(self, X, y, sensitives, exp=fairness_eval, max_features=10, sampling_size=None):
        self.exp = exp
        self.X = X
        # encode the labels and keep the class names for the explainer
        le = LabelEncoder()
        self.y = le.fit_transform(y)
        self.class_names = le.classes_
        self.sensitive_f = sensitives
        self.max_features = max_features
        if sampling_size is None:
            # by default, we set the sampling_size to 10% of the total number of instances
            self.sampling_size = len(X) // 10
        else:
            self.sampling_size = sampling_size

    def is_fair(self, model):
        actual_sensitive, fair_flag, words_weights, explainer = self.exp(
            model, self.X, self.max_features, self.sensitive_f, self.class_names, self.sampling_size)
        return fair_flag, words_weights, actual_sensitive, explainer
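For contrast with the new TextExplainer flow, the whole FixOutText API funnels through is_fair. A hedged sketch of the call pattern this commit retires (X_train, y_train, and model are placeholders for objects built earlier, as in the example script):

# Pre-commit FixOutText usage; import refers to the version before this change.
from fixout.core_text import FixOutText

sensitive_words = list(map(lambda x: [x], ["host", "edu"]))  # one list per word
fixout = FixOutText(X_train, y_train, sensitives=sensitive_words, max_features=10)
fair_flag, words_weights, actual_sensitive, explainer = fixout.is_fair(model)
if not fair_flag:
    # two or more sensitive words ranked among the top contributions
    print("Model not fair, " + " ".join(actual_sensitive) + " in the main features...")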
class EnsembleOutText(ClassifierMixin):
    """Class for ensemble models
    ...

fixout/lime_text_global.py

"""
Implements LIME_Global. Verifies whether sensitive features have high contributions.
"""
from collections import Counter
import pandas as pd
import numpy as np
from fixout.lime.lime_text import LimeTextExplainer
def fairness_eval(model, train, max_features, sensitive_features, class_names, sample_size):
    explainer, explanations = features_contributions(model.predict_proba, train, class_names, sample_size)
    # accumulate each word's LIME weight over all explained instances
    contributions = Counter()
    vocab_counter = Counter()
    for exp in explanations:
        vocab = exp.domain_mapper.indexed_string.inverse_vocab
        words_weights = {vocab[i]: weight for i, weight in exp.local_exp[1]}
        contributions.update(words_weights)
        # count in how many explanations each word appears
        for i, _ in exp.local_exp[1]:
            vocab_counter.update({vocab[i]: 1})
    # for w in contributions:
    #     contributions[w] /= vocab_counter[w]
    actual_sensitive, is_fair, df = fairness_valid_top(contributions, sensitive_features, max_features)
    return actual_sensitive, is_fair, df, explainer
def features_contributions(predict_fn, train, class_names, sample_size, kernel_width=5):
    explainer = LimeTextExplainer(class_names=class_names, kernel_width=kernel_width)
    if sample_size > len(train):
        sample_size = len(train)
    # sample distinct instance indices from the full training set
    indexes = np.random.choice(len(train), sample_size, replace=False)
    explanations = [explainer.explain_instance(train[i].lower(), predict_fn, num_features=1000) for i in indexes]
    # sp_obj = submodular_pick.SubmodularPick(explainer, train, predict_fn, sample_size=sample_size,
    #                                         num_features=1000, clusters=clusters)
    # explanations = sp_obj.sp_explanations
    return explainer, explanations
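Each returned explanation is a standard LIME text explanation; the attribute accesses used in fairness_eval (domain_mapper.indexed_string.inverse_vocab, local_exp) are how the word/weight pairs are recovered. A small inspection sketch (model, training data, and class names are illustrative placeholders):

# Peeking at one explanation produced by features_contributions
explainer, explanations = features_contributions(
    model.predict_proba, X_train, class_names=["med", "space"], sample_size=25)
exp = explanations[0]
vocab = exp.domain_mapper.indexed_string.inverse_vocab  # word index -> word
for i, weight in exp.local_exp[1][:5]:  # (index, weight) pairs for class 1
    print(vocab[i], weight)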
def fairness_valid_top(contributions, sensitive_features, max_features):
    actual_sensitive = []
    ans_data = []
    # rank words by the magnitude of their aggregated contribution
    sorted_dict = sorted(contributions.items(), key=lambda x: abs(x[1]), reverse=True)
    if max_features is None or max_features < 0:
        max_features = len(sorted_dict)
    for i in range(max_features):
        feature, value = sorted_dict[i]
        ans_data.append([i, feature, value])
        if feature in sensitive_features:
            actual_sensitive.append(feature)
    df = pd.DataFrame(ans_data, columns=["Index", "Word", "Contribution"])
    # fair iff fewer than 2 sensitive words rank among the top features
    return actual_sensitive, len(actual_sensitive) < 2, df
\ No newline at end of file
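The fairness criterion is simply "fewer than two sensitive words among the top max_features contributions". A toy run of the decision rule (contribution values are hypothetical; the sensitive words are passed as a flat list here so the membership test matches):

# Toy check of the decision rule in fairness_valid_top
from collections import Counter
contributions = Counter({"edu": 0.9, "host": -0.7, "treatment": 0.4})
actual_sensitive, is_fair, df = fairness_valid_top(
    contributions, ["edu", "host"], max_features=3)
print(is_fair)   # False: "edu" and "host" both reach the top 3
print(df)        # Index / Word / Contribution table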
class TextExplainer:

    def __init__(self, predict_fn, X_data=None, **kwargs):
        self.predict_fn = predict_fn
        self.X_data = X_data
        self.explainer = LimeTextExplainer(**kwargs)
        self.explanations = None

    def explain_instances(self, X):
        # compute one LIME explanation per instance, as a {word: weight} dict
        explanations = []
        for instance in X:
            exp = self.explainer.explain_instance(
                instance.lower(),
                self.predict_fn
            )
            vocab = exp.domain_mapper.indexed_string.inverse_vocab
            local_exp = {vocab[i]: weight for i, weight in exp.local_exp[1]}
            explanations.append(local_exp)
        return explanations

    def global_explanation(self, X_data=None, n_samples=5000):
        if X_data is None:
            X_data = self.X_data
        # explain a random subset of at most n_samples instances
        I = np.arange(len(X_data))
        np.random.shuffle(I)
        I = I[:n_samples]
        X = np.array(X_data)[I]
        explanations = self.explain_instances(X)
        # sum the local word weights into one global contribution per word
        contributions = Counter()
        for local_exp in explanations:
            contributions.update(local_exp)
        self.explanations = contributions
        return self

    def get_top_k(self, k=10):
        # global_explanation must have been called first
        if self.explanations is None:
            return None
        top = sorted(self.explanations.items(), key=lambda x: abs(x[1]), reverse=True)[:k]
        return top
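Constructor kwargs are forwarded verbatim to LimeTextExplainer, and an X_data corpus stored at construction becomes the default for global_explanation. A usage sketch (model, X_test, and the class names are illustrative placeholders):

# TextExplainer with a default corpus and explicit LIME options
explainer = TextExplainer(
    model.predict_proba,
    X_data=X_test,              # default corpus for global_explanation()
    class_names=["alt.atheism", "soc.religion.christian"],
    kernel_width=5,
)
explainer.global_explanation(n_samples=250)  # samples from X_data
for word, contrib in explainer.get_top_k(k=10):
    print(word, '\t', contrib)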