Commit 87f841fc authored by BERNIER Fabien

[~] bug fix for shap submodular pick

parent db822020
@@ -4,33 +4,36 @@ Implements SHAP_Global.
 import shap
 from lime_global import normalize
-from lime import submodular_pick
+from shap_submodular_pick import ShapSubmodularPick
 import numpy as np
 import pandas as pd
 
-clusters=10
+clusters=50
 
-def with_submodular(predict_fn, train, feature_names, max_features, class_names, categorical_features, categorical_names, sample_size, kernel_width=3):
+def with_submodular(predict_fn, train, samples, feature_names, max_features, class_names, categorical_features, categorical_names, sample_size, kernel_width=3):
     # self.explanations.append(explainer.explain_instance(data[i], predict_fn, num_features=num_features,))
-    explainer = shap.KernelExplainer(predict_fn,train)
+    explainer = shap.KernelExplainer(predict_fn, samples)
     num_features = len(feature_names) if max_features <= 0 else max_features
-    sp_obj = submodular_pick.SubmodularPick(explainer, train, predict_fn, sample_size=sample_size, num_features=num_features, clusters=clusters)
+    sp_obj = ShapSubmodularPick(explainer, train, predict_fn, sample_size=sample_size, num_exps_desired=clusters, num_features=num_features)
     return explainer, sp_obj
 
 def features_contributions(predict_fn, train, feature_names, max_features, class_names, categorical_features, categorical_names, sample_size, kernel_width=3):
     choosen = shap.sample(train, sample_size)
-    sample = shap.kmeans(train, clusters)
-    explainer = shap.KernelExplainer(predict_fn,sample)
-    shap_values = explainer.shap_values(choosen)
+    samples = shap.kmeans(train, 10)
+    # explainer = shap.KernelExplainer(predict_fn,sample)
+    # shap_values = explainer.shap_values(choosen)
+    explainer, sp_obj = with_submodular(lambda x: predict_fn(x)[:,0], choosen, samples, feature_names, max_features, class_names, categorical_features, categorical_names, sample_size, kernel_width)
+    shap_values = sp_obj.sp_explanations
     return explainer, shap_values
 
 def fairness_eval(model, train, max_features, sensitive_features, feature_names, class_names, categorical_features, categorical_names, sample_size, threshold=None):
@@ -42,7 +45,7 @@ def fairness_eval(model, train, max_features, sensitive_features, feature_names,
     contributions = {}
     for i in range(n_features):
-        contributions[i] = sum(shap_values[0][:,i])
+        contributions[i] = sum(shap_values[:,i])
     actual_sensitive, df = fairness_valid_top(contributions, feature_names, sensitive_features, max_features)
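For reference, a minimal sketch of the flow this hunk sets up: a KernelExplainer built on a k-means background summary, a sampled set of instances to explain, and the per-feature contributions that fairness_eval now aggregates with sum(shap_values[:,i]). The scikit-learn dataset and model below are illustrative assumptions, not part of this commit.

import shap
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

# Illustrative data and model; the patched code is model-agnostic
X, y = load_breast_cancer(return_X_y=True)
model = LogisticRegression(max_iter=1000).fit(X, y)

predict_fn = lambda x: model.predict_proba(x)[:, 0]   # single output column, as in the patch
background = shap.kmeans(X, 10)                        # k-means summary used as KernelExplainer background
explainer = shap.KernelExplainer(predict_fn, background)

chosen = shap.sample(X, 100)                           # instances actually explained
shap_values = explainer.shap_values(chosen)            # shape: (100, n_features)

# Global per-feature contribution, as computed in fairness_eval after the fix
contributions = {i: shap_values[:, i].sum() for i in range(X.shape[1])}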
shap_submodular_pick.py (new file)
import numpy as np
import warnings


class ShapSubmodularPick:
    def __init__(self,
                 explainer,
                 data,
                 predict_fn,
                 method='sample',
                 sample_size=1000,
                 num_exps_desired=5,
                 **kwargs):
        """
        Args:
            data: a numpy array where each row is a single input into predict_fn
            predict_fn: prediction function. For classifiers, this should be a
                function that takes a numpy array and outputs prediction
                probabilities. For regressors, this takes a numpy array and
                returns the predictions. For ScikitClassifiers, this is
                `classifier.predict_proba()`. For ScikitRegressors, this
                is `regressor.predict()`. The prediction function needs to work
                on multiple feature vectors (the vectors randomly perturbed
                from the data_row).
            method: the method used to generate candidate explanations.
                method == 'sample' samples the data uniformly at random;
                the sample size is given by sample_size. If method == 'full',
                explanations are generated for the entire data.
            sample_size: the number of instances to explain if method == 'sample'
            num_exps_desired: the number of explanation objects returned
            num_features: maximum number of features present in explanation

        Sets value:
            sp_explanations: a list of explanation objects that has a high coverage
            explanations: all the candidate explanations saved for potential future use.
        """
        # Parse args
        if method == 'sample':
            if sample_size > len(data):
                warnings.warn("Requested sample size larger than "
                              "size of input data. Using all data")
                sample_size = len(data)
            np.random.shuffle(data)
            data = data[:sample_size]
        elif method == 'full':
            sample_size = len(data)
        else:
            raise ValueError('Method must be \'sample\' or \'full\'')
        num_exps_desired = min(num_exps_desired, sample_size)

        # Generate explanations: one row of SHAP values per selected instance
        # (assumes a single-output predict_fn, so shap_values returns one array)
        self.explanations = explainer.shap_values(data, **kwargs)
        W = abs(np.array(self.explanations))
        # Create the global importance vector, I_j described in the paper
        importance = np.sum(W, axis=0)**.5

        # Now run the SP-LIME greedy algorithm: at each step, add the instance
        # that maximises importance-weighted feature coverage of the picked set V
        remaining_indices = set(range(len(self.explanations)))
        V = []
        for _ in range(num_exps_desired):
            best = 0
            best_ind = None
            current = 0
            for i in remaining_indices:
                current = np.dot(
                    (np.sum(W[V + [i]], axis=0) > 0), importance
                )  # coverage function
                if current >= best:
                    best = current
                    best_ind = i
            V.append(best_ind)
            remaining_indices -= {best_ind}

        self.sp_explanations = self.explanations[V]
        self.V = V
\ No newline at end of file
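A minimal usage sketch of the class above, assuming the file is importable as shap_submodular_pick; the iris data and random-forest model are illustrative assumptions only, not part of this commit.

import shap
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from shap_submodular_pick import ShapSubmodularPick

# Illustrative model and data
X, y = load_iris(return_X_y=True)
clf = RandomForestClassifier(n_estimators=50).fit(X, y)
predict_fn = lambda x: clf.predict_proba(x)[:, 0]      # single output column, as in the calling code

explainer = shap.KernelExplainer(predict_fn, shap.kmeans(X, 10))
sp = ShapSubmodularPick(explainer, X, predict_fn,
                        method='sample', sample_size=60, num_exps_desired=10)

print(sp.V)                      # indices of the picked instances within the sampled data
print(sp.sp_explanations.shape)  # (10, n_features): SHAP rows of the picked instances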