Commit 42076923 authored by BERNIER Fabien

[+] experiments with hate speech

parent 35a58651
@@ -8,11 +8,12 @@ from nltk import pos_tag
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
# Import data
df = pd.read_csv("datasets/english_variant.csv")
df = pd.read_csv("examples/datasets/english_variant.csv")
class_names = np.array(["SA", "AA"])
@@ -64,7 +65,9 @@ for tweet in df.tweet:
X = df.tweet.to_numpy()
y = (df["group"] == "AA").to_numpy(dtype=np.uint8)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)
print(len(X))
# X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)
X_train, X_test, y_train, y_test = X[:10000], X[10000:], y[:10000], y[10000:]
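# fixed index split: the first 10000 rows are used for training, the rest for testing (replaces the random 80/20 split above)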
vectorizer = TfidfVectorizer(
tokenizer=tokenize,
@@ -80,11 +83,11 @@ vectorizer = TfidfVectorizer(
max_df=0.75
)
# lr = LogisticRegression(class_weight='balanced')
rf = RandomForestClassifier(n_estimators=100)
lr = LogisticRegression(class_weight='balanced')
# rf = RandomForestClassifier(n_estimators=100)
# training the model
model = make_pipeline(vectorizer, rf)
model = make_pipeline(vectorizer, lr)
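# the pipeline is now built on the balanced logistic regression; the random-forest variant is kept commented out above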
model.fit(X_train, y_train)
# evaluating the model
@@ -93,6 +96,7 @@ print("Accuracy:", accuracy_score(y_test, pred))
print(class_names[model.predict(["piece of cake", "piece of shit"])])
# explaining the model
# fixout = FixOutText(X, y, sensitives=["black", "white", "bitch"], max_features=-1)
# t0 = time()
@@ -10,68 +10,81 @@ from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
print("Training embeddings...")
from examples.word_clustering import word_clustering_aa, word_clustering_sa
print("Ok let's go")
from copy import deepcopy
from time import time
from fixout.core_text import FixOutText, EnsembleOutText
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
hsdata = pd.read_csv("datasets/hate_speech.csv")
hsdata = pd.read_csv("datasets/hate_speech_eg.csv")
class_names = np.array(["ok", "hate speech"])
stopwords = stopwords.words("english")
stopwords.extend(["#ff", "ff", "rt"])
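# "rt" (retweet) and "#ff"/"ff" (Follow Friday) are Twitter-specific tokens added to the stopword list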
stemmer = PorterStemmer()
def preprocess(text_string):
"""
Accepts a text string and returns a cleaned copy:
1) runs of whitespace are collapsed to a single space
2) urls are removed
3) @-mentions are removed
This strips link- and user-specific tokens without caring
about the specific people or pages mentioned.
"""
space_pattern = '\s+'
giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
mention_regex = '@[\w\-]+'
parsed_text = re.sub(space_pattern, ' ', text_string)
parsed_text = re.sub(giant_url_regex, '', parsed_text)
parsed_text = re.sub(mention_regex, '', parsed_text)
return parsed_text
#
# def preprocess(text_string):
# """
# Accepts a text string and replaces:
# 1) urls with URLHERE
# 2) lots of whitespace with one instance
# 3) mentions with MENTIONHERE
#
# This allows us to get standardized counts of urls and mentions
# Without caring about specific people mentioned
# """
# space_pattern = '\s+'
# giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
# '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
# mention_regex = '@[\w\-]+'
# parsed_text = re.sub(space_pattern, ' ', text_string)
# parsed_text = re.sub(giant_url_regex, '', parsed_text)
# parsed_text = re.sub(mention_regex, '', parsed_text)
# return parsed_text
#
#
def tokenize(tweet):
"""Removes punctuation & excess whitespace, sets to lowercase,
and stems tweets. Returns a list of stemmed tokens."""
tweet = " ".join(re.split("[^a-zA-Z]*", tweet.lower())).strip()
tokens = [stemmer.stem(t) for t in tweet.split()]
return tokens
#
#
# def basic_tokenize(tweet):
# """Same as tokenize but without the stemming"""
# tweet = " ".join(re.split("[^a-zA-Z.,!?]*", tweet.lower())).strip()
# return tweet.split()
#
# tweet_tags = []
# for tweet in hsdata.tweet:
# tokens = basic_tokenize(preprocess(tweet))
# tags = pos_tag(tokens)
# tag_list = [x[1] for x in tags]
# tag_str = " ".join(tag_list)
# tweet_tags.append(tag_str)
def basic_tokenize(tweet):
"""Same as tokenize but without the stemming"""
tweet = " ".join(re.split("[^a-zA-Z.,!?]*", tweet.lower())).strip()
return tweet.split()
tweet_tags = []
for tweet in hsdata.tweet:
tokens = basic_tokenize(preprocess(tweet))
tags = pos_tag(tokens)
tag_list = [x[1] for x in tags]
tag_str = " ".join(tag_list)
tweet_tags.append(tag_str)
X = hsdata.tweet.to_numpy()
y = (hsdata["class"] != 2).to_numpy(dtype=np.uint8)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)
ev = hsdata["english_variant"]
print("Clustering data...")
X[ev == "SA"] = list(map(word_clustering_sa, X[ev == "SA"]))
X[ev == "AA"] = list(map(word_clustering_aa, X[ev == "AA"]))
print("Data is clustered!")
# X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)
X_train, X_test, y_train, y_test = X[:19000], X[19000:], y[:19000], y[19000:]
ev_train, ev_test = ev[:19000], ev[19000:]
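# fixed index split; ev_train/ev_test keep the English-variant labels aligned with the same rows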
vectorizer = TfidfVectorizer(
tokenizer=tokenize,
preprocessor=preprocess,
ngram_range=(1, 3),
stop_words=stopwords,
use_idf=True,
@@ -88,51 +101,62 @@ lr = LogisticRegression(class_weight='balanced')
# training the model
model = make_pipeline(vectorizer, lr)
model_sa = deepcopy(model)
model_aa = deepcopy(model)
model.fit(X_train, y_train)
model_sa.fit(X_train[ev_train == "SA"], y_train[ev_train == "SA"])
model_aa.fit(X_train[ev_train == "AA"], y_train[ev_train == "AA"])
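# model is trained on all tweets; model_sa and model_aa are contextual models trained only on their own variant's tweets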
# evaluating the model
pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, pred))
print("SA Accuracy:", model_sa.score(X_test[ev_test == "SA"], y_test[ev_test == "SA"]))
print("AA Accuracy:", model_aa.score(X_test[ev_test == "AA"], y_test[ev_test == "AA"]))
# ~~~~~~~~~~~~~~~~~~~~~~~~~~ checking parity ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
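# parity check: compare the predicted hate-speech rate for SA tweets vs AA tweets,
# overall and on the word-specific datasets below
# (e.g. if 30 of 100 SA tweets are predicted as class 1, then P(hate_speech | SA) = 0.30)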
def get_dataset_predictions(model, dataset_filename) :
def get_dataset_predictions(model_sa, model_aa, dataset_filename) :
variant = pd.read_csv(dataset_filename)
tweets = variant["tweet"].to_numpy()
for i in range(tweets.shape[0]): tweets[i] = tweets[i][1:-1]
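# drop the first and last character of each tweet (presumably stray quoting around the stored strings)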
groups = variant["group"]
return model.predict(tweets), tweets, groups
tweets_sa = tweets[variant["group"] == "SA"]
tweets_aa = tweets[variant["group"] == "AA"]
return model_sa.predict(tweets_sa), model_aa.predict(tweets_aa)
def check_parity(preds, tweets, groups, word=None):
if word is not None :
I = list(map(lambda t: word in t, tweets))
groups = groups[I]
preds = preds[I]
preds_aa = preds[groups == "AA"]
preds_sa = preds[groups == "SA"]
hate_rate_aa = len(preds_aa[preds_aa == 1]) / len(preds_aa)
def check_parity(preds_sa, preds_aa, word=None):
hate_rate_sa = len(preds_sa[preds_sa == 1]) / len(preds_sa)
print(f"[{word}] P(hate_speech | AA) = {hate_rate_aa}")
hate_rate_aa = len(preds_aa[preds_aa == 1]) / len(preds_aa)
print(f"[{word}] P(hate_speech | SA) = {hate_rate_sa}")
print(f"[{word}] P(hate_speech | AA) = {hate_rate_aa}")
preds, tweets, groups = get_dataset_predictions(model, "datasets/english_variant.csv")
check_parity(preds, tweets, groups)
btch_preds, btch_tweets, btch_groups = get_dataset_predictions(model, "datasets/english_variant_btch.csv")
check_parity(btch_preds, btch_tweets, btch_groups)
ngga_preds, ngga_tweets, ngga_groups = get_dataset_predictions(model, "datasets/english_variant_ngga.csv")
check_parity(ngga_preds, ngga_tweets, ngga_groups)
print("Original model")
preds_sa, preds_aa = get_dataset_predictions(model, model, "datasets/english_variant.csv")
check_parity(preds_sa, preds_aa, None)
preds_sa, preds_aa = get_dataset_predictions(model, model, "datasets/english_variant_btch.csv")
check_parity(preds_sa, preds_aa, "b*tch")
preds_sa, preds_aa = get_dataset_predictions(model, model, "datasets/english_variant_ngga.csv")
check_parity(preds_sa, preds_aa, "n*gga")
print("Contextual model")
preds_sa, preds_aa = get_dataset_predictions(model_sa, model_aa, "datasets/english_variant.csv")
check_parity(preds_sa, preds_aa, None)
preds_sa, preds_aa = get_dataset_predictions(model_sa, model_aa, "datasets/english_variant_btch.csv")
check_parity(preds_sa, preds_aa, "b*tch")
preds_sa, preds_aa = get_dataset_predictions(model_sa, model_aa, "datasets/english_variant_ngga.csv")
check_parity(preds_sa, preds_aa, "n*gga")
# ~~~~~~~~~~~~~~~~~~~~~~~~~~ Applying EnsembleOut ~~~~~~~~~~~~~~~~~~~~~~~~~
sensitive_words = ['lol', 'amp', 'haha', 'bout', 'im', 'u', 'tho', 'yea', 'lmao', 'finna', 'honestly']
ensemble = EnsembleOutText(model, sensitive_words)
ensemble.fit(X_train, y_train)
print("Ensemble accuracy:", accuracy_score(y_test, ensemble.predict(X_test)))
preds, tweets, groups = get_dataset_predictions(ensemble, "datasets/english_variant.csv")
check_parity(preds, tweets, groups)
btch_preds, btch_tweets, btch_groups = get_dataset_predictions(ensemble, "datasets/english_variant_btch.csv")
check_parity(btch_preds, btch_tweets, btch_groups)
ngga_preds, ngga_tweets, ngga_groups = get_dataset_predictions(ensemble, "datasets/english_variant_ngga.csv")
check_parity(ngga_preds, ngga_tweets, ngga_groups)
\ No newline at end of file
# sensitive_words = ['lol', 'amp', 'haha', 'bout', 'im', 'u', 'tho', 'yea', 'lmao', 'finna', 'honestly']
# ensemble = EnsembleOutText(model, sensitive_words)
# ensemble.fit(X_train, y_train)
# print("Ensemble accuracy:", accuracy_score(y_test, ensemble.predict(X_test)))
#
# preds, tweets, groups = get_dataset_predictions(ensemble, "datasets/english_variant.csv")
# check_parity(preds, tweets, groups)
# btch_preds, btch_tweets, btch_groups = get_dataset_predictions(ensemble, "datasets/english_variant_btch.csv")
# check_parity(btch_preds, btch_tweets, btch_groups)
# ngga_preds, ngga_tweets, ngga_groups = get_dataset_predictions(ensemble, "datasets/english_variant_ngga.csv")
# check_parity(ngga_preds, ngga_tweets, ngga_groups)
\ No newline at end of file
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering, AffinityPropagation
import nltk
import re  # used by preprocess() and tokenize() below
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag
from copy import deepcopy
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
hsdata = pd.read_csv("datasets/english_variant_100k.csv")
class_names = np.array(["ok", "hate speech"])
stopwords = nltk.corpus.stopwords.words("english")
stopwords.extend(["#ff", "ff", "rt"])
stemmer = PorterStemmer()
def preprocess(text_string):
"""
Accepts a text string and returns a cleaned copy:
1) runs of whitespace are collapsed to a single space
2) urls are removed
3) @-mentions are removed
4) the literal substring "&" is removed
This strips link- and user-specific tokens without caring
about the specific people or pages mentioned.
"""
space_pattern = '\s+'
giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
mention_regex = '@[\w\-]+'
parsed_text = re.sub(space_pattern, ' ', text_string)
parsed_text = re.sub(giant_url_regex, '', parsed_text)
parsed_text = re.sub(mention_regex, '', parsed_text)
parsed_text = parsed_text.replace("&", "")
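# illustrative example: "@user check https://t.co/x   now" -> " check  now"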
return parsed_text
def tokenize(tweet):
"""Removes punctuation & excess whitespace, sets to lowercase,
and stems tweets. Returns a list of stemmed tokens."""
tweet = " ".join(re.split("[^a-zA-Z]*", tweet.lower())).strip()
tokens = [stemmer.stem(t) for t in tweet.split()]
return tokens
def basic_tokenize(tweet):
"""Same as tokenize but without the stemming"""
tweet = " ".join(re.split("[^a-zA-Z.,!?]*", tweet.lower())).strip()
return tweet.split()
tweet_tags = []
for tweet in hsdata.tweet:
tokens = basic_tokenize(preprocess(tweet))
tags = pos_tag(tokens)
tag_list = [x[1] for x in tags]
tag_str = " ".join(tag_list)
tweet_tags.append(tag_str)
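# tweet_tags collects a POS-tag string per tweet; it is not used further in this file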
tweets_aa = list(map(lambda s: tokenize(preprocess(s)), hsdata.tweet[hsdata["group"] == "AA"]))
tweets_sa = list(map(lambda s: tokenize(preprocess(s)), hsdata.tweet[hsdata["group"] == "SA"]))
embedding_aa = Word2Vec(sentences=tweets_aa)
embedding_sa = Word2Vec(sentences=tweets_sa)
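# a separate word2vec embedding is trained for AA tweets and for SA tweets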
def get_vocab(embedding):
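# gather every vocabulary word and its vector from a trained embedding (uses the pre-gensim-4 .wv.vocab attribute)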
vectors = []
vocab = []
for w in embedding.wv.vocab:
vocab.append(w)
vectors.append(embedding[w])
vectors = np.array(vectors)
vocab = np.array(vocab)
return vocab, vectors
def get_clustering_function(embedding):
vocab, vectors = get_vocab(embedding)
kmeans = AffinityPropagation(damping=0.5)
kmeans.fit(vectors)
kmeans.n_clusters = len(kmeans.cluster_centers_)
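# AffinityPropagation chooses the number of clusters itself; the count is stored on the estimator
# for convenience (the variable is named kmeans but holds an AffinityPropagation model)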
mapping = []
for i in range(kmeans.n_clusters):
center = kmeans.cluster_centers_[i]
word = embedding.wv.similar_by_vector(center)[0][0]
mapping.append((set(vocab[kmeans.labels_ == i]), word))
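# each cluster is represented by the vocabulary word closest to its center; word_to_cluster maps a token to that representative (unknown tokens pass through unchanged)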
def word_to_cluster(word):
for s, w in mapping:
if word in s:
return w
return word
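# word_clustering rewrites a raw tweet: preprocess, tokenize, then replace each token by its cluster representative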
def word_clustering(string):
tok = tokenize(preprocess(string))
return " ".join(map(word_to_cluster, tok))
return word_clustering, kmeans, vocab, vectors
word_clustering_aa, kmeans_aa, vocab_aa, vectors_aa = get_clustering_function(embedding_aa)
word_clustering_sa, kmeans_sa, vocab_sa, vectors_sa = get_clustering_function(embedding_sa)
# word_clustering_aa("You are very happy today ! ;)")
#
# print([len(kmeans_aa.labels_[kmeans_aa.labels_ == i]) for i in range(40)])
# print([len(kmeans_sa.labels_[kmeans_sa.labels_ == i]) for i in range(40)])
#
# len(kmeans_sa.cluster_centers_)