From 4b92a841da0305fe074fb2f7b42c551324d3c066 Mon Sep 17 00:00:00 2001
From: SHEIKH Imran <imran.sheikh@inria.fr>
Date: Fri, 30 Oct 2020 14:32:44 +0100
Subject: [PATCH] python 2.X to 3.X

---
 README.md                                      |  6 ++--
 .../err2unk/errdet/saus_feats_for_predict.py   | 24 +++++++-------
 local/err2unk/errdet/saus_feats_for_train.py   | 32 +++++++++----------
 local/err2unk/errdet/tag_with_3c_tagger.py     | 18 ++++-------
 .../errdet/train_3c_error_mlp_on_dev.py        | 20 ++++--------
 .../errdet/train_3c_error_tagger_on_dev.py     | 20 ++++--------
 6 files changed, 48 insertions(+), 72 deletions(-)

diff --git a/README.md b/README.md
index 0c0dc1e..627f566 100644
--- a/README.md
+++ b/README.md
@@ -27,9 +27,9 @@ Readers interested in the high level design and experimental evaluation of these
   - (more) un-transcribed speech data.
   - a dev set containing application specific transcribed speech data
 - Err2Unk based training requires:
-  - Python 2.7 (Python 3.X support is on ToDo list but should be straightforward for users/developers/contributors.)
-  - Python sklearn and matplotlib libraries
-  - the [Keras](https://keras.io/) Python library (verion 2.3.1) to train neural network models for STT error detection.
+  - Python 3.X (Python 2.7 is not supported.)
+  - the Python sklearn library
+  - the [Keras v2.3.1](https://keras.io/) and [TensorFlow v2.0.0](https://www.tensorflow.org) Python libraries to train neural network models for STT error detection.
   - the [kenlm](https://github.com/kpu/kenlm) Python module to extract language model related features for error detector.
 - Dialogue state based training requires:
   - the [SRILM](http://www.speech.sri.com/projects/srilm/) tool. If you have installed Kaldi, you can install the SRILM tool with the *tools/extras/install\_srilm.sh* script in your Kaldi installation.
diff --git a/local/err2unk/errdet/saus_feats_for_predict.py b/local/err2unk/errdet/saus_feats_for_predict.py
index 8dc7c8b..68a9d68 100644
--- a/local/err2unk/errdet/saus_feats_for_predict.py
+++ b/local/err2unk/errdet/saus_feats_for_predict.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 # -*- coding: utf-8 -*-
 
 # Copyright © 2020 INRIA (Imran Sheikh)
@@ -6,7 +6,6 @@
 
 import sys, os, re, numpy
 import kenlm
-from itertools import izip
 
 def getProb(lm, phrase, bosFlag):
     prob = 0
@@ -48,16 +47,16 @@ with open(words_file) as fp:
         word, wid = line.split(' ');
         vocab.append(word)
 
-print "\nExtracting features from confusion networks and writing to " + args.out_saus_feats
+print("\nExtracting features from confusion networks and writing to %s" % (args.out_saus_feats))
 
 with open(args.out_saus_feats,'w') as outfile:
     for r, d, f in os.walk(args.saus_dir):
-        for file in f:
-            m = re.search(r'saus\-wtimes\.\d+$', file)
-            if m is not None:
-                fnum = file.split('.')[-1]
-                with open(os.path.join(r, file)) as fp, open(os.path.join(r, 'saut-wtimes.'+fnum)) as fp2:
-                    for line in fp:
+        for file in f:
+            m = re.search(r'saus\-wtimes\.\d+$', file)
+            if m is not None:
+                fnum = file.split('.')[-1]
+                with open(os.path.join(r, file)) as fp, open(os.path.join(r, 'saut-wtimes.'+fnum)) as fp2:
+                    for line in fp:
                         #line = line.replace('\n', ' ').replace(' [ 0 1 ] ', ' ') # remove all epsilon only tokens from ASR
                         line = line.replace('\n', ' ')
 
@@ -79,7 +78,7 @@ with open(args.out_saus_feats,'w') as outfile:
                         uttid2, binTText = line2.split(' ', 1)
                         binTUttid = uttid+'_'+str(bid)
                         if uttid2 != binTUttid:
-                            print "Error: %s != %s" %(uttid2, binTUttid)
+                            print("Error: %s != %s" %(uttid2, binTUttid))
                             sys.exit()
                         binTText = binTText.strip(' ')
                         _binsT = binTText.split(' ; ')
@@ -93,7 +92,7 @@ with open(args.out_saus_feats,'w') as outfile:
                             continue
 
                         j=0
-                        for bin, binT in izip(bins, binsT):
+                        for bin, binT in zip(bins, binsT):
                             arcs = bin.split(' ')
 
                             featvec = []
@@ -180,7 +179,7 @@ with open(args.out_saus_feats,'w') as outfile:
                                 ff = -10.0
                             featvec.append(ff)
 
-                            vals = numpy.array(map(float,arcs[1::2]))
+                            vals = numpy.array(list(map(float,arcs[1::2])))
                             ff = -1*numpy.sum(vals*numpy.log10(vals)) #12 local entropy of word posteriors in a confusion slot
                             featvec.append(ff)
                             ff = numpy.std(vals) #13 standard deviation of word posteriors in a confusion slot
@@ -214,7 +213,6 @@ with open(args.out_saus_feats,'w') as outfile:
 
                             fvecstr = "%s_%d,%s,%d,%s" %(uttid, j, " ".join(map(str, featvec)).strip(), -1, "NA")
-                            #print fvecstr
                             outfile.write(fvecstr+"\n")
 
                             j = j+1
diff --git a/local/err2unk/errdet/saus_feats_for_train.py b/local/err2unk/errdet/saus_feats_for_train.py
index af9d976..36aa76e 100644
--- a/local/err2unk/errdet/saus_feats_for_train.py
+++ b/local/err2unk/errdet/saus_feats_for_train.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 # -*- coding: utf-8 -*-
 
 # Copyright © 2020 INRIA (Imran Sheikh)
@@ -6,7 +6,6 @@
 
 import sys, os, re, numpy
 import kenlm
-from itertools import izip
 
 def getProb(lm, phrase, bosFlag):
     prob = 0
@@ -83,16 +82,16 @@ with open(args.saus_ref_align) as fp:
         errorLabels[uttid] = labels # Note: label 0 reserved for padding
         refWords[uttid] = refs
 
-print "\nExtracting features from confusion networks and writing to " + args.out_saus_feats_n_labs
+print("\nExtracting features from confusion networks and writing to %s" % (args.out_saus_feats_n_labs))
 
 with open(args.out_saus_feats_n_labs,'w') as outfile:
     for r, d, f in os.walk(args.saus_dir):
-        for file in f:
-            m = re.search(r'saus\-wtimes\.\d+$', file)
-            if m is not None:
-                fnum = file.split('.')[-1]
-                with open(os.path.join(r, file)) as fp, open(os.path.join(r, 'saut-wtimes.'+fnum)) as fp2:
-                    for line in fp:
+        for file in f:
+            m = re.search(r'saus\-wtimes\.\d+$', file)
+            if m is not None:
+                fnum = file.split('.')[-1]
+                with open(os.path.join(r, file)) as fp, open(os.path.join(r, 'saut-wtimes.'+fnum)) as fp2:
+                    for line in fp:
                         line = line.replace('\n', ' ')
 
                         uttid, binText = line.split(' ', 1)
@@ -116,7 +115,7 @@ with open(args.out_saus_feats_n_labs,'w') as outfile:
                         uttid2, binTText = line2.split(' ', 1)
                         binTUttid = uttid+'_'+str(bid)
                         if uttid2 != binTUttid:
-                            print "Error: %s != %s" %(uttid2, binTUttid)
+                            print("Error: %s != %s" %(uttid2, binTUttid))
                             sys.exit()
                         binTText = binTText.strip(' ')
                         _binsT = binTText.split(' ; ')
@@ -130,13 +129,13 @@ with open(args.out_saus_feats_n_labs,'w') as outfile:
                             continue
 
                         if (len(labels) != len(bins)):
-                            print "Error: %s %d != %d %d" %(uttid, len(labels), len(bins), len(binsT))
-                            print binText
-                            print labels
+                            print("Error: %s %d != %d %d" %(uttid, len(labels), len(bins), len(binsT)))
+                            print(binText)
+                            print(labels)
                             sys.exit()
 
                         j=0
-                        for bin, binT in izip(bins, binsT):
+                        for bin, binT in zip(bins, binsT):
                             arcs = bin.split(' ')
 
                             featvec = []
@@ -223,8 +222,8 @@ with open(args.out_saus_feats_n_labs,'w') as outfile:
                                 ff = -10.0
                             featvec.append(ff)
 
-                            vals = numpy.array(map(float,arcs[1::2]))
+                            vals = numpy.array(list(map(float, arcs[1::2])))
                             ff = -1*numpy.sum(vals*numpy.log10(vals)) #12 local entropy of word posteriors in a confusion slot
                             featvec.append(ff)
                             ff = numpy.std(vals) #13 standard deviation of word posteriors in a confusion slot
                             featvec.append(ff)
@@ -257,7 +256,6 @@ with open(args.out_saus_feats_n_labs,'w') as outfile:
 
                             fvecstr = "%s_%d,%s,%d,%s" %(uttid, j, " ".join(map(str, featvec)).strip(), labels[j], refs[j])
-                            #print fvecstr
                             outfile.write(fvecstr+"\n")
 
                             j = j+1
diff --git a/local/err2unk/errdet/tag_with_3c_tagger.py b/local/err2unk/errdet/tag_with_3c_tagger.py
index bea0a98..6ebb738 100644
--- a/local/err2unk/errdet/tag_with_3c_tagger.py
+++ b/local/err2unk/errdet/tag_with_3c_tagger.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 # -*- coding: utf-8 -*-
 
 # Copyright © 2020 INRIA (Imran Sheikh)
@@ -11,10 +11,6 @@ from keras.layers import Dense, LSTM, Bidirectional, TimeDistributed, Masking
 from keras import backend as K
 
-import matplotlib
-matplotlib.use('Agg')
-import matplotlib.pyplot as plt
-
 from sklearn.preprocessing import StandardScaler
 
 from keras.models import model_from_json
 
@@ -96,17 +92,17 @@ def prepFeats(featfile, maxlen, scaler, numclasses):
 def printPreds(yp, ids):
     for i in range(len(yp)):
         if len(yp[i]) != len(ids[i]):
-            print "Error:" + str(len(yp[i])) + " " + str(len(ids[i]))
-        print ids[i][0].rsplit('_',1)[0] + ",",
+            print("Error: %d %d" % (len(yp[i]), len(ids[i])))
+        print(ids[i][0].rsplit('_',1)[0] + ",", end=' ')
         for j in range(len(yp[i])):
-            print yp[i][j],
-        print ""
+            print(yp[i][j], end=' ')
+        print("")
 
 def printPredsToFile(yp, ids, fpath):
     fp = open(fpath, "w")
     for i in range(len(yp)):
         if len(yp[i]) != len(ids[i]):
-            print "Error:" + str(len(yp[i])) + " " + str(len(ids[i]))
+            print("Error: %d %d" %(len(yp[i]), len(ids[i])))
         fp.write(ids[i][0].rsplit('_',1)[0] + ", ")
         for j in range(len(yp[i])):
             fp.write(str(yp[i][j]) + " ")
@@ -148,7 +144,7 @@ for layer in new_model.layers:
     except:
         print("Could not transfer weights for layer {}".format(layer.name))
 
-print new_model.summary()
+new_model.summary()
 
 pred = new_model.predict(testX, verbose=0)
 yt, yp = getPredictions(testX, testY, pred)
diff --git a/local/err2unk/errdet/train_3c_error_mlp_on_dev.py b/local/err2unk/errdet/train_3c_error_mlp_on_dev.py
index f97f319..d095873 100644
--- a/local/err2unk/errdet/train_3c_error_mlp_on_dev.py
+++ b/local/err2unk/errdet/train_3c_error_mlp_on_dev.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 # -*- coding: utf-8 -*-
 
 # Copyright © 2020 INRIA (Imran Sheikh)
@@ -14,10 +14,6 @@ from keras.callbacks import Callback
 import tensorflow as tf
 from keras import backend as K
 
-import matplotlib
-matplotlib.use('Agg')
-import matplotlib.pyplot as plt
-
 from sklearn.metrics import classification_report, recall_score, precision_score, f1_score
 from sklearn.preprocessing import StandardScaler
 from sklearn.model_selection import train_test_split
@@ -61,8 +57,8 @@ class EarlyStoppingAtMaxF1(tf.keras.callbacks.Callback):
                 yt.append(numpy.argmax(self.validation_data[1][i]))
                 yp.append(numpy.argmax(y_val_pred[i]))
             f1=f1_score(yt, yp, average='micro')
-            print 'Epoch %d f1: %f' %(epoch, f1)
-            print classification_report(yt, yp, digits=4)
+            print('Epoch %d f1: %f' %(epoch, f1))
+            print(classification_report(yt, yp, digits=4))
             self.f1_scores.append(f1)
 
             current = round(f1, 3)
@@ -78,7 +74,7 @@ class EarlyStoppingAtMaxF1(tf.keras.callbacks.Callback):
                 if self.wait >= self.patience:
                     self.stopped_epoch = epoch
                     self.model.stop_training = True
-                    print('Restoring model weights from the end of the best epoch: ' + str(self.bestEpoch))
+                    print('Restoring model weights from the end of the best epoch: %s' % (str(self.bestEpoch)))
                     self.model.set_weights(self.best_weights)
 
     def on_train_end(self, logs=None):
@@ -161,7 +157,7 @@ pickle.dump(scaler, open(args.outdir + '/scaler.pkl','wb'))
 (trainX_, trainY_) = prepFeats(args.featsfile, scaler, numclasses)
 indim=trainX_.shape[1]
 
-print "Forming train-dev split from given train set..."
+print("Forming train-dev split from given train set...") trainX, devX, trainY, devY = train_test_split(trainX_, trainY_, test_size=0.33, shuffle= True) model = Sequential() @@ -177,8 +173,4 @@ model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accur history = model.fit(trainX, trainY, validation_data=(devX, devY), epochs=200, batch_size=32, verbose=2, callbacks=[es]) -model.save(mTag) -plt.plot(history.history['loss'], label='train') -plt.plot(history.history['val_loss'], label='dev') -plt.savefig(args.outdir +'/loss.png') -plt.clf() +model.save(mTag) diff --git a/local/err2unk/errdet/train_3c_error_tagger_on_dev.py b/local/err2unk/errdet/train_3c_error_tagger_on_dev.py index 89c9a2a..87967ee 100644 --- a/local/err2unk/errdet/train_3c_error_tagger_on_dev.py +++ b/local/err2unk/errdet/train_3c_error_tagger_on_dev.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 # -*- coding: utf-8 -*- # Copyright © 2020 INRIA (Imran Sheikh) @@ -14,10 +14,6 @@ from keras.callbacks import Callback import tensorflow as tf from keras import backend as K -import matplotlib -matplotlib.use('Agg') -import matplotlib.pyplot as plt - from sklearn.metrics import classification_report, recall_score, precision_score, f1_score from sklearn.preprocessing import StandardScaler from sklearn.model_selection import train_test_split @@ -63,8 +59,8 @@ class EarlyStoppingAtMaxF1(tf.keras.callbacks.Callback): yt.append(numpy.argmax(self.validation_data[1][i,j])) yp.append(numpy.argmax(y_val_pred[i,j])) f1=f1_score(yt, yp, average='micro') - print 'Epoch %d f1: %f' %(epoch, f1) - print classification_report(yt, yp, digits=4) + print('Epoch %d f1: %f' %(epoch, f1)) + print(classification_report(yt, yp, digits=4)) self.f1_scores.append(f1) current = round(f1, 3) @@ -80,7 +76,7 @@ class EarlyStoppingAtMaxF1(tf.keras.callbacks.Callback): if self.wait >= self.patience: self.stopped_epoch = epoch self.model.stop_training = True - print('Restoring model weights from the end of the best epoch: ' + str(self.bestEpoch)) + print('Restoring model weights from the end of the best epoch: %s' % (str(self.bestEpoch))) self.model.set_weights(self.best_weights) def on_train_end(self, logs=None): @@ -192,7 +188,7 @@ pickle.dump(scaler, open(args.outdir + '/scaler.pkl','wb')) indim=trainX_.shape[2] -print "Forming train-dev split from input feature set..." +print("Forming train-dev split from input feature set...") trainX, devX, trainY, devY = train_test_split(trainX_, trainY_, test_size=0.33, shuffle= True) model = Sequential() @@ -207,8 +203,4 @@ model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accur history = model.fit(trainX, trainY, validation_data=(devX, devY), epochs=200, batch_size=32, verbose=2, callbacks=[es]) -model.save(mTag) -plt.plot(history.history['loss'], label='train') -plt.plot(history.history['val_loss'], label='dev') -plt.savefig(args.outdir +'/loss-graph.png') -plt.clf() +model.save(mTag) \ No newline at end of file -- GitLab