From 4b92a841da0305fe074fb2f7b42c551324d3c066 Mon Sep 17 00:00:00 2001
From: SHEIKH Imran <imran.sheikh@inria.fr>
Date: Fri, 30 Oct 2020 14:32:44 +0100
Subject: [PATCH] python 2.X to 3.X

---
 README.md                                      |  6 ++--
 .../err2unk/errdet/saus_feats_for_predict.py   | 24 +++++++-------
 local/err2unk/errdet/saus_feats_for_train.py   | 32 +++++++++----------
 local/err2unk/errdet/tag_with_3c_tagger.py     | 18 ++++-------
 .../errdet/train_3c_error_mlp_on_dev.py        | 20 ++++--------
 .../errdet/train_3c_error_tagger_on_dev.py     | 20 ++++--------
 6 files changed, 48 insertions(+), 72 deletions(-)

diff --git a/README.md b/README.md
index 0c0dc1e..627f566 100644
--- a/README.md
+++ b/README.md
@@ -27,9 +27,9 @@ Readers interested in the high level design and experimental evaluation of these
   - (more) un-transcribed speech data.
   - a dev set containing application specific transcribed speech data
 - Err2Unk based training requires:
-  - Python 2.7 (Python 3.X support is on ToDo list but should be straightforward for users/developers/contributors.)
-  - Python sklearn and matplotlib libraries
-  - the [Keras](https://keras.io/) Python library (verion 2.3.1) to train neural network models for STT error detection.
+  - Python 3.X (Python 2.7 is not supported.)
+  - the Python sklearn library
+  - the [Keras v2.3.1](https://keras.io/) and [TensorFlow v2.0.0](https://www.tensorflow.org) Python libraries to train neural network models for STT error detection.
   - the [kenlm](https://github.com/kpu/kenlm) Python module to extract language model related features for error detector.
 - Dialogue state based training requires:
   - the [SRILM](http://www.speech.sri.com/projects/srilm/) tool. If you have installed Kaldi, you can install the SRILM tool with the *tools/extras/install\_srilm.sh* script in your Kaldi installation.
diff --git a/local/err2unk/errdet/saus_feats_for_predict.py b/local/err2unk/errdet/saus_feats_for_predict.py
index 8dc7c8b..68a9d68 100644
--- a/local/err2unk/errdet/saus_feats_for_predict.py
+++ b/local/err2unk/errdet/saus_feats_for_predict.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 # -*- coding: utf-8 -*-
 
 # Copyright © 2020 INRIA (Imran Sheikh)
@@ -6,7 +6,6 @@
 
 import sys, os, re, numpy
 import kenlm
-from itertools import izip
 
 def getProb(lm, phrase, bosFlag):
     prob = 0
@@ -48,16 +47,16 @@ with open(words_file) as fp:
         word, wid = line.split(' ');
         vocab.append(word)
 
-print "\nExtracting features from confusion networks and writing to " + args.out_saus_feats
+print("\nExtracting features from confusion networks and writing to %s" % (args.out_saus_feats))
 
 with open(args.out_saus_feats,'w') as outfile:
     for r, d, f in os.walk(args.saus_dir):
-        for file in f:
-            m = re.search(r'saus\-wtimes\.\d+$', file)
-            if m is not None:
-                fnum = file.split('.')[-1]
-                with open(os.path.join(r, file)) as fp, open(os.path.join(r, 'saut-wtimes.'+fnum)) as fp2:
-                    for line in fp:
+        for file in f:
+            m = re.search(r'saus\-wtimes\.\d+$', file)
+            if m is not None:
+                fnum = file.split('.')[-1]
+                with open(os.path.join(r, file)) as fp, open(os.path.join(r, 'saut-wtimes.'+fnum)) as fp2:
+                    for line in fp:
                         #line = line.replace('\n', ' ').replace(' [ 0 1 ] ', ' ') # remove all epsilon only tokens from ASR
                         line = line.replace('\n', ' ')
 
@@ -79,7 +78,7 @@ with open(args.out_saus_feats,'w') as outfile:
                         uttid2, binTText = line2.split(' ', 1)
                         binTUttid = uttid+'_'+str(bid)
                         if uttid2 != binTUttid:
-                            print "Error: %s != %s" %(uttid2, binTUttid)
+                            print("Error: %s != %s" %(uttid2, binTUttid))
                             sys.exit()
                         binTText = binTText.strip(' ')
                         _binsT = binTText.split(' ; ')
@@ -93,7 +92,7 @@ with open(args.out_saus_feats,'w') as outfile:
                             continue
 
                         j=0
-                        for bin, binT in izip(bins, binsT):
+                        for bin, binT in zip(bins, binsT):
                             arcs = bin.split(' ')
 
                             featvec = []
@@ -180,7 +179,7 @@ with open(args.out_saus_feats,'w') as outfile:
                                 ff = -10.0
                             featvec.append(ff)
 
-                            vals = numpy.array(map(float,arcs[1::2]))
+                            vals = numpy.array(list(map(float,arcs[1::2])))
                             ff = -1*numpy.sum(vals*numpy.log10(vals)) #12 local entropy of word posteriors in a confusion slot
                             featvec.append(ff)
                             ff = numpy.std(vals) #13 standard deviation of word posteriors in a confusion slot
@@ -214,7 +213,6 @@ with open(args.out_saus_feats,'w') as outfile:
 
                             fvecstr = "%s_%d,%s,%d,%s" %(uttid, j, " ".join(map(str, featvec)).strip(), -1, "NA")
-                            #print fvecstr
                             outfile.write(fvecstr+"\n")
 
                             j = j+1
diff --git a/local/err2unk/errdet/saus_feats_for_train.py b/local/err2unk/errdet/saus_feats_for_train.py
index af9d976..36aa76e 100644
--- a/local/err2unk/errdet/saus_feats_for_train.py
+++ b/local/err2unk/errdet/saus_feats_for_train.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 # -*- coding: utf-8 -*-
 
 # Copyright © 2020 INRIA (Imran Sheikh)
@@ -6,7 +6,6 @@
 
 import sys, os, re, numpy
 import kenlm
-from itertools import izip
 
 def getProb(lm, phrase, bosFlag):
     prob = 0
@@ -83,16 +82,16 @@ with open(args.saus_ref_align) as fp:
         errorLabels[uttid] = labels # Note: label 0 reserved for padding
         refWords[uttid] = refs
 
-print "\nExtracting features from confusion networks and writing to " + args.out_saus_feats_n_labs
+print("\nExtracting features from confusion networks and writing to %s" % (args.out_saus_feats_n_labs))
 
 with open(args.out_saus_feats_n_labs,'w') as outfile:
     for r, d, f in os.walk(args.saus_dir):
-        for file in f:
-            m = re.search(r'saus\-wtimes\.\d+$', file)
-            if m is not None:
-                fnum = file.split('.')[-1]
-                with open(os.path.join(r, file)) as fp, open(os.path.join(r, 'saut-wtimes.'+fnum)) as fp2:
-                    for line in fp:
+        for file in f:
+            m = re.search(r'saus\-wtimes\.\d+$', file)
+            if m is not None:
+                fnum = file.split('.')[-1]
+                with open(os.path.join(r, file)) as fp, open(os.path.join(r, 'saut-wtimes.'+fnum)) as fp2:
+                    for line in fp:
                         line = line.replace('\n', ' ')
 
                         uttid, binText = line.split(' ', 1)
@@ -116,7 +115,7 @@ with open(args.out_saus_feats_n_labs,'w') as outfile:
                         uttid2, binTText = line2.split(' ', 1)
                         binTUttid = uttid+'_'+str(bid)
                         if uttid2 != binTUttid:
-                            print "Error: %s != %s" %(uttid2, binTUttid)
+                            print("Error: %s != %s" %(uttid2, binTUttid))
                             sys.exit()
                         binTText = binTText.strip(' ')
                         _binsT = binTText.split(' ; ')
@@ -130,13 +129,13 @@ with open(args.out_saus_feats_n_labs,'w') as outfile:
                             continue
 
                         if (len(labels) != len(bins)):
-                            print "Error: %s %d != %d %d" %(uttid, len(labels), len(bins), len(binsT))
-                            print binText
-                            print labels
+                            print("Error: %s %d != %d %d" %(uttid, len(labels), len(bins), len(binsT)))
+                            print(binText)
+                            print(labels)
                             sys.exit()
 
                         j=0
-                        for bin, binT in izip(bins, binsT):
+                        for bin, binT in zip(bins, binsT):
                             arcs = bin.split(' ')
 
                             featvec = []
@@ -223,8 +222,8 @@ with open(args.out_saus_feats_n_labs,'w') as outfile:
                                 ff = -10.0
                             featvec.append(ff)
 
-                            vals = numpy.array(map(float,arcs[1::2]))
+                            vals = numpy.array(list(map(float, arcs[1::2])))
                             ff = -1*numpy.sum(vals*numpy.log10(vals)) #12 local entropy of word posteriors in a confusion slot
                             featvec.append(ff)
                             ff = numpy.std(vals) #13 standard deviation of word posteriors in a confusion slot
                             featvec.append(ff)
@@ -257,7 +256,6 @@ with open(args.out_saus_feats_n_labs,'w') as outfile:
 
                             fvecstr = "%s_%d,%s,%d,%s" %(uttid, j, " ".join(map(str, featvec)).strip(), labels[j], refs[j])
-                            #print fvecstr
                             outfile.write(fvecstr+"\n")
 
                             j = j+1
diff --git a/local/err2unk/errdet/tag_with_3c_tagger.py b/local/err2unk/errdet/tag_with_3c_tagger.py
index bea0a98..6ebb738 100644
--- a/local/err2unk/errdet/tag_with_3c_tagger.py
+++ b/local/err2unk/errdet/tag_with_3c_tagger.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 # -*- coding: utf-8 -*-
 
 # Copyright © 2020 INRIA (Imran Sheikh)
@@ -11,10 +11,6 @@ from keras.layers import Dense, LSTM, Bidirectional, TimeDistributed, Masking
 from keras import backend as K
 
-import matplotlib
-matplotlib.use('Agg')
-import matplotlib.pyplot as plt
-
 from sklearn.preprocessing import StandardScaler
 
 from keras.models import model_from_json
 
@@ -96,17 +92,17 @@ def prepFeats(featfile, maxlen, scaler, numclasses):
 def printPreds(yp, ids):
     for i in range(len(yp)):
         if len(yp[i]) != len(ids[i]):
-            print "Error:" + str(len(yp[i])) + " " + str(len(ids[i]))
-        print ids[i][0].rsplit('_',1)[0] + ",",
+            print("Error: %d %d" % (len(yp[i]), len(ids[i])))
+        print(ids[i][0].rsplit('_',1)[0] + ",", end=' ')
         for j in range(len(yp[i])):
-            print yp[i][j],
-        print ""
+            print(yp[i][j], end=' ')
+        print("")
 
 def printPredsToFile(yp, ids, fpath):
     fp = open(fpath, "w")
     for i in range(len(yp)):
         if len(yp[i]) != len(ids[i]):
-            print "Error:" + str(len(yp[i])) + " " + str(len(ids[i]))
+            print("Error: %d %d" %(len(yp[i]), len(ids[i])))
         fp.write(ids[i][0].rsplit('_',1)[0] + ", ")
         for j in range(len(yp[i])):
             fp.write(str(yp[i][j]) + " ")
@@ -148,7 +144,7 @@ for layer in new_model.layers:
     except:
         print("Could not transfer weights for layer {}".format(layer.name))
 
-print new_model.summary()
+new_model.summary()
 
 pred = new_model.predict(testX, verbose=0)
 yt, yp = getPredictions(testX, testY, pred)
diff --git a/local/err2unk/errdet/train_3c_error_mlp_on_dev.py b/local/err2unk/errdet/train_3c_error_mlp_on_dev.py
index f97f319..d095873 100644
--- a/local/err2unk/errdet/train_3c_error_mlp_on_dev.py
+++ b/local/err2unk/errdet/train_3c_error_mlp_on_dev.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 # -*- coding: utf-8 -*-
 
 # Copyright © 2020 INRIA (Imran Sheikh)
@@ -14,10 +14,6 @@ from keras.callbacks import Callback
 import tensorflow as tf
 from keras import backend as K
 
-import matplotlib
-matplotlib.use('Agg')
-import matplotlib.pyplot as plt
-
 from sklearn.metrics import classification_report, recall_score, precision_score, f1_score
 from sklearn.preprocessing import StandardScaler
 from sklearn.model_selection import train_test_split
@@ -61,8 +57,8 @@ class EarlyStoppingAtMaxF1(tf.keras.callbacks.Callback):
                 yt.append(numpy.argmax(self.validation_data[1][i]))
                 yp.append(numpy.argmax(y_val_pred[i]))
             f1=f1_score(yt, yp, average='micro')
-            print 'Epoch %d f1: %f' %(epoch, f1)
-            print classification_report(yt, yp, digits=4)
+            print('Epoch %d f1: %f' %(epoch, f1))
+            print(classification_report(yt, yp, digits=4))
             self.f1_scores.append(f1)
 
             current = round(f1, 3)
@@ -78,7 +74,7 @@ class EarlyStoppingAtMaxF1(tf.keras.callbacks.Callback):
                 if self.wait >= self.patience:
                     self.stopped_epoch = epoch
                     self.model.stop_training = True
-                    print('Restoring model weights from the end of the best epoch: ' + str(self.bestEpoch))
+                    print('Restoring model weights from the end of the best epoch: %s' % (str(self.bestEpoch)))
                     self.model.set_weights(self.best_weights)
 
     def on_train_end(self, logs=None):
@@ -161,7 +157,7 @@ pickle.dump(scaler, open(args.outdir + '/scaler.pkl','wb'))
 (trainX_, trainY_) = prepFeats(args.featsfile, scaler, numclasses)
 indim=trainX_.shape[1]
 
-print "Forming train-dev split from given train set..."
+print("Forming train-dev split from given train set...") trainX, devX, trainY, devY = train_test_split(trainX_, trainY_, test_size=0.33, shuffle= True) model = Sequential() @@ -177,8 +173,4 @@ model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accur history = model.fit(trainX, trainY, validation_data=(devX, devY), epochs=200, batch_size=32, verbose=2, callbacks=[es]) -model.save(mTag) -plt.plot(history.history['loss'], label='train') -plt.plot(history.history['val_loss'], label='dev') -plt.savefig(args.outdir +'/loss.png') -plt.clf() +model.save(mTag) diff --git a/local/err2unk/errdet/train_3c_error_tagger_on_dev.py b/local/err2unk/errdet/train_3c_error_tagger_on_dev.py index 89c9a2a..87967ee 100644 --- a/local/err2unk/errdet/train_3c_error_tagger_on_dev.py +++ b/local/err2unk/errdet/train_3c_error_tagger_on_dev.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 # -*- coding: utf-8 -*- # Copyright © 2020 INRIA (Imran Sheikh) @@ -14,10 +14,6 @@ from keras.callbacks import Callback import tensorflow as tf from keras import backend as K -import matplotlib -matplotlib.use('Agg') -import matplotlib.pyplot as plt - from sklearn.metrics import classification_report, recall_score, precision_score, f1_score from sklearn.preprocessing import StandardScaler from sklearn.model_selection import train_test_split @@ -63,8 +59,8 @@ class EarlyStoppingAtMaxF1(tf.keras.callbacks.Callback): yt.append(numpy.argmax(self.validation_data[1][i,j])) yp.append(numpy.argmax(y_val_pred[i,j])) f1=f1_score(yt, yp, average='micro') - print 'Epoch %d f1: %f' %(epoch, f1) - print classification_report(yt, yp, digits=4) + print('Epoch %d f1: %f' %(epoch, f1)) + print(classification_report(yt, yp, digits=4)) self.f1_scores.append(f1) current = round(f1, 3) @@ -80,7 +76,7 @@ class EarlyStoppingAtMaxF1(tf.keras.callbacks.Callback): if self.wait >= self.patience: self.stopped_epoch = epoch self.model.stop_training = True - print('Restoring model weights from the end of the best epoch: ' + str(self.bestEpoch)) + print('Restoring model weights from the end of the best epoch: %s' % (str(self.bestEpoch))) self.model.set_weights(self.best_weights) def on_train_end(self, logs=None): @@ -192,7 +188,7 @@ pickle.dump(scaler, open(args.outdir + '/scaler.pkl','wb')) indim=trainX_.shape[2] -print "Forming train-dev split from input feature set..." +print("Forming train-dev split from input feature set...") trainX, devX, trainY, devY = train_test_split(trainX_, trainY_, test_size=0.33, shuffle= True) model = Sequential() @@ -207,8 +203,4 @@ model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accur history = model.fit(trainX, trainY, validation_data=(devX, devY), epochs=200, batch_size=32, verbose=2, callbacks=[es]) -model.save(mTag) -plt.plot(history.history['loss'], label='train') -plt.plot(history.history['val_loss'], label='dev') -plt.savefig(args.outdir +'/loss-graph.png') -plt.clf() +model.save(mTag) \ No newline at end of file -- GitLab