Commit cb907980 authored by SHEIKH Imran

version 0.1

parent 685f1dc5
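# run.pl executes all jobs on the local machine; to use a cluster, point
# train_cmd/decode_cmd at queue.pl (or another parallelization wrapper)
# with the appropriate queue options.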
export train_cmd=run.pl
export decode_cmd=run.pl
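# ignore everything in this directory except this .gitignore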
*
*/
!.gitignore
[COMPRISE](https://www.compriseh2020.eu) sample [Kaldi](http://kaldi-asr.org) recipe to train [Chain](https://kaldi-asr.org/doc/chain.html) models on the [Let's Go dataset](https://dialrc.github.io/LetsGoDataset/).
----
## Prerequisites
- This recipe re-uses binaries and scripts from the [Kaldi](http://kaldi-asr.org) toolkit, so you should have Kaldi pre-installed on your system.
- It also requires the [kaldi_lm](http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz) tool, which can be installed with the *tools/extras/install\_kaldi_lm.sh* script in your Kaldi installation (see the example below).
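
For example, assuming `$KALDI_ROOT` points to your Kaldi checkout:
>`cd $KALDI_ROOT/tools && extras/install_kaldi_lm.sh`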
----
## Setup
- Ensure that you have a working Kaldi installation.
- Ensure that you either follow the directory structure of this repository or have modified *path.sh*, *cmd.sh* and the scripts in *local/* accordingly (a minimal *path.sh* sketch is given below).
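
A minimal *path.sh*, assuming a standard Kaldi checkout (adjust `KALDI_ROOT` to your installation), might look like:
```bash
export KALDI_ROOT=/path/to/kaldi
export PATH=$PWD/utils:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
. $KALDI_ROOT/tools/config/common_path.sh
export LC_ALL=C
```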
----
## Typical Usage Steps
The steps below assume you are following the directory structure of this repository.
### Prepare for training
>`bash egs/letsgo-15d/prepare_data.sh letsgo_raw_dir letsgo_transcript_2008_2009_v4.csv`

Note that [prepare_data.sh](prepare_data.sh) uses the perl scripts [local/getCleanTranscripts.pl](local/getCleanTranscripts.pl) and [local/getTrainDevTestSplits.pl](local/getTrainDevTestSplits.pl), whose hard-coded variables determine the choice of train, dev and test sets. Similarly, the [local/updateLexicon.pl](local/updateLexicon.pl) script uses the pre-built [local/oovs.lexicon.txt](local/oovs.lexicon.txt). The expected transcript CSV format is sketched below.
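
Based on how [local/getCleanTranscripts.pl](local/getCleanTranscripts.pl) parses its input, each CSV line is expected to hold a comma-separated utterance id (a relative *.raw* path), transcript and dialog state, roughly:
```
<yyyymm>/<yyyymmdd>/<call_id>/<utt_id>.raw,<transcript>,<dialog_state>
```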
### Train tri3 model
>`bash egs/letsgo-15d/train_tri3.sh letsgo_raw_dir`

Note that [train_tri3.sh](train_tri3.sh) uses fixed locations for *datadir*, *confdir*, *expdir* and *egslocal*.
### Train chain model
>`bash egs/letsgo-15d/train_chain.sh --datadir data/letsgo-15d-seed --expdir exp/letsgo-15d-seed --localdir egs/letsgo-15d-seed/local --confdir egs/letsgo-15d-seed/conf`
### Decode using chain model
>`bash egs/letsgo-15d/decode_lats_with_chain.sh --datadir data/letsgo-15d-seed --expdir exp/letsgo-15d-seed --testset split/unsup`
--sample-frequency=8000
--use-energy=false # only non-default option.
# config for high-resolution MFCC features, intended for neural network training
# Note: we keep all cepstra, so it has the same info as filterbank features,
# but MFCC is more easily compressible (because less correlated) which is why
# we prefer this method.
--sample-frequency=8000
--use-energy=false # use average of log energy, not energy.
--num-mel-bins=40 # similar to Google's setup.
--num-ceps=40 # there is no dimensionality reduction.
--low-freq=20 # low cutoff frequency for mel bins... there might still be
# some information at the low end.
--high-freq=-400 # high cutoff frequency, relative to the Nyquist of 4000 (=3600)
# configuration file for apply-cmvn-online, used in the script ../local/online/run_online_decoding_nnet2.sh
#!/bin/bash
# Derived software, Copyright © 2020 INRIA (Imran Sheikh)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
#
#
# Based on Kaldi (kaldi/egs/wsj/s5/local/chain/tuning/run_tdnn_1b.sh), Copyright 2019 © Johns Hopkins University (author: Daniel Povey)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
# https://github.com/kaldi-asr/kaldi/blob/master/COPYING
echo "$0 $@" # Print the command line for logging
. ./cmd.sh
. ./path.sh
expdir=
datadir=
testset=split/unsup
stage=0
nj=40
nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium.
affix=1f # affix for the TDNN directory, e.g. "1a" or "1b", in case we change the configuration.
# parse_options.sh must be sourced after the defaults above, so that
# command-line options such as --expdir can override them.
. ./utils/parse_options.sh
tree_dir=$expdir/chain${nnet3_affix}/tree_a_sp
dir=$expdir/chain${nnet3_affix}/tdnn${affix}_sp
# training chunk-options
chunk_width=140,100,160
# we don't need extra left/right context for TDNN systems.
chunk_left_context=0
chunk_right_context=0
if [ $stage -le 0 ]; then
for split in ${testset}; do
utils/copy_data_dir.sh $datadir/$split $datadir/${split}_hires
# extract mfcc, compute cmvn
steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \
--cmd "$train_cmd" $datadir/${split}_hires
steps/compute_cmvn_stats.sh $datadir/${split}_hires
utils/fix_data_dir.sh $datadir/${split}_hires
# extract iVectors for the test data
nspk=$(wc -l <$datadir/${split}_hires/spk2utt)
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj "${nspk}" \
$datadir/${split}_hires $expdir/nnet3${nnet3_affix}/extractor \
$expdir/nnet3${nnet3_affix}/ivectors_${split}_hires
done
fi
if [ $stage -le 1 ]; then
frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
for split in ${testset}; do
data_affix=$(echo $split | sed "s/\//_/")
nspk=$nj #$(wc -l <$datadir/${split}_hires/spk2utt) # $nj gives consistent output
for lmtype in tg; do
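# chain models are decoded with acoustic scale 1.0; --post-decode-acwt 10.0
# scales up the acoustic scores in the output lattices so that the usual
# LM-weight range applies at scoring time.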
steps/nnet3/decode.sh \
--acwt 1.0 --post-decode-acwt 10.0 \
--extra-left-context $chunk_left_context \
--extra-right-context $chunk_right_context \
--extra-left-context-initial 0 \
--extra-right-context-final 0 \
--frames-per-chunk $frames_per_chunk \
--nj $nspk --cmd "$decode_cmd" --num-threads 4 \
--online-ivector-dir $expdir/nnet3${nnet3_affix}/ivectors_${split}_hires \
$tree_dir/graph_${lmtype} $datadir/${split}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1
done
done
fi
#!/bin/bash
# Derived software, Copyright © 2020 INRIA (Imran Sheikh)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
#
#
# Based on Kaldi (kaldi/egs/wsj/s5/local/wsj_format_local_lms.sh), Copyright 2019 © Johns Hopkins University (author: Daniel Povey, Guoguo Chen)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
# https://github.com/kaldi-asr/kaldi/blob/master/COPYING
lang_suffix=
datadir=
echo "$0 $@" # Print the command line for logging
. ./path.sh
. utils/parse_options.sh || exit 1;
lm_srcdir_3g=$datadir/local/local_lm/3gram
[ ! -d "$lm_srcdir_3g" ] && echo "No such dir $lm_srcdir_3g" && exit 1;
for d in $datadir/lang${lang_suffix}_test_tg; do
rm -r $d 2>/dev/null
cp -r $datadir/lang${lang_suffix} $d
done
lang=$datadir/lang${lang_suffix}
# Check a few files that we have to use.
for f in words.txt oov.int; do
if [[ ! -f $lang/$f ]]; then
echo "$0: no such file $lang/$f"
exit 1;
fi
done
# Be careful: this time we dispense with the grep -v '<s> <s>' so this might
# not work for LMs generated from all toolkits.
gunzip -c $lm_srcdir_3g/lm_unpruned.gz | \
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$lang/words.txt - $datadir/lang${lang_suffix}_test_tg/G.fst || exit 1;
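# fstisstochastic prints the min/max deviation from stochasticity; values
# near zero are fine, and pruned or interpolated LMs may deviate more.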
fstisstochastic $datadir/lang${lang_suffix}_test_tg/G.fst
exit 0;
#!/usr/bin/perl
# Copyright © 2020 INRIA (Imran Sheikh)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
if($#ARGV != 2){
print STDERR "USAGE: $0 <signal_parent_directory> <trascriptions.csv> <out_dir>\n";
exit(0);
}
use File::Spec;
my %signalPath;
my @dirs = (File::Spec->rel2abs($ARGV[0]));
while (@dirs) {
my $thisdir = shift @dirs;
opendir my $dh, $thisdir or die "can't opendir '$thisdir': $!";
while (my $entry = readdir $dh) {
next if $entry eq '.';
next if $entry eq '..';
my $fullname = "$thisdir/$entry";
if (-d $fullname ){
push @dirs, $fullname;
}elsif($fullname =~ m/\.raw$/){
if($fullname =~ m/\d+\/\d\d\d\.raw$/){
my @tmp = split(/\//,$fullname);
my $fname = $tmp[$#tmp-3]."/".$tmp[$#tmp-2]."/".$tmp[$#tmp-1]."/".$tmp[$#tmp];
$signalPath{$fname} = $fullname;
#print "$fname ",$fullname,"\n";
}
}
}
}
my $outdir = $ARGV[2];
$outdir =~ s/\/$//;
my %spk2utt;
my $txtfile = "$outdir/text.unsorted";
my $wavscpfile = "$outdir/wav.scp.unsorted";
my $utt2spkfile = "$outdir/utt2spk.unsorted";
open(TEXTF, '>', $txtfile) or die $!;
open(WAVSCP, '>', $wavscpfile) or die $!;
open(UTT2SPK, '>', $utt2spkfile) or die $!;
open my $FH, "<", $ARGV[1] or die "can't read open '$ARGV[1]': $OS_ERROR";
while (my $line = <$FH>) {
my ($id, $txt, $ds) = split(/\,/, $line);
my $newid = $id;
$newid =~ s/\//_/g;
my $spkrid = $newid;
$spkrid =~ s/\_\d\d\d\.raw$//;
$newid =~ s/\.raw$//;
if(exists $signalPath{$id}){
if(length($txt)<=0){
print TEXTF $newid," <unk>\n";
}else{
print TEXTF $newid," ",$txt."\n";
}
print WAVSCP $newid," sox -r 8000 -e signed-integer -b 16 -c 1 ",$signalPath{$id}," -t wav - | \n";
print UTT2SPK $newid," ",$spkrid,"\n";
if(exists $spk2utt{$spkrid}){
$spk2utt{$spkrid} = "$spk2utt{$spkrid} $newid";
}else{
$spk2utt{$spkrid} = "$newid";
}
}else{
print "Warning: Missing wav file corresponding to $id\n";
}
}
close(TEXTF);
close(UTT2SPK);
close(WAVSCP);
close($FH);
my $spk2uttfile = "$outdir/spk2utt.unsorted";
open(SPK2UTT, '>', $spk2uttfile) or die $!;
foreach my $spkr (keys %spk2utt){
my $list = $spk2utt{$spkr};
$list =~ s/^\s+//;
my @tmp = split(/\s+/, $list);
my @sorted = sort @tmp;
my $utts = join(" ", @sorted);
print SPK2UTT "$spkr $utts\n";
}
close(SPK2UTT);
exit(0);
#!/usr/bin/perl
# Copyright © 2020 INRIA (Imran Sheikh)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
if($#ARGV != 1){
print STDERR "USAGE: $0 <orig_annotations> <letsgo_raw_directory>\n";
exit(1);
}
### some hard inits to choose amount of training data
my $trainYm = "200810";
my $trainD = 15;
my $devTestYm = "200909";
my (@uttArray, @rawArray, @gtArray);
my $lastCallPath = "NULL";
my $currCallPath = "NULL";
open my $FH, "<", $ARGV[0] or die "can't read open '$ARGV[0]': $OS_ERROR";
while (my $line = <$FH>) {
$line =~ s/\s+$//;
if($line =~ m/^id.+purpose$/){
next;
}else{
$line =~ s/^\.\///;
my @fields = split(/\,/, $line);
my $rawfile = $fields[0];
my $gt = "$fields[4]";
$gt =~ s/[a-z*]\%[a-z]*/ <unk> /g;
$gt =~ s/%+/ <unk> /g;
my @words = split(/\s+/, $gt);
my $newgt = "";
foreach my $w (@words){
if($w =~ m/^\d+[a-z]+$/){
#print STDERR $w,"\n";
my $wa = $w;
$w =~ s/^\d+//;
$wa =~ s/[a-z]+$//;
$newgt = $newgt." $wa";
}elsif($w =~ m/^[a-z]+\d+$/){
my $wa = $w;
$wa =~ s/\d+$//;
$w =~ s/^[a-z]+//;
$newgt = $newgt." $wa";
}
$newgt = $newgt." $w";
}
$gt = $newgt;
$gt =~ s/^\s//;
$currCallPath = $rawfile;
$currCallPath =~ s/\/\d\d\d\.raw//;
if(($lastCallPath eq "NULL") || ($currCallPath eq $lastCallPath)){
push(@uttArray, $line);
push(@rawArray, $rawfile);
push(@gtArray, $gt);
}else{
my $pdir = $lastCallPath;
$pdir =~ s/\d\d\/.+$//;
my $lastCallId = $lastCallPath;
$lastCallId =~ s/\//-/;
my $logfile = $ARGV[1]."/".$pdir."/".$lastCallPath."/LetsGoPublic-".$lastCallId."-dialog.log";
if (-e $logfile) {
my @states = getDialogStates($logfile);
if($#uttArray != $#states){ # inconsistency in dialog.log for this call
#print STDERR "----$lastCallPath $#uttArray $#states-----\n";
}else{
for(my $i=0; $i<=$#states; $i++){
if(($gtArray[$i] eq "NON_UNDERSTANDABLE") || ($gtArray[$i] =~ m/dtmf\_/ )){
next;
}
my $rawFilePath = $pdir."/".$rawArray[$i];
#my $size = -s $rawFilePath;
#$size = $size / (2 * 8000); # 16bit8kHz raw file
#print $uttArray[$i].",".$states[$i].",".$size."\n";
my ($ym, $ymd, $c, $u) = split(/\//,$rawFilePath);
my $m = $ym;
$m =~ s/^\d\d\d\d//;
$m = int($m);
my $d = $ymd;
$d =~ s/^\d\d\d\d\d\d//;
$d = int($d);
if((($ym eq $trainYm) && ($d<=$trainD)) || ($ym eq $devTestYm)){
print $rawFilePath.",".$gtArray[$i].",".$states[$i]."\n";
}
}
}
}else{
#print "xxx $logfile\n";
}
@uttArray = ();
@rawArray = ();
@gtArray = ();
push(@uttArray, $line);
push(@rawArray, $rawfile);
push(@gtArray, $gt);
}
$lastCallPath = $currCallPath;
}
}
close($FH);
# Parse a LetsGo dialog.log file and return, for each new user input turn,
# the dialog_state that was active at that point.
sub getDialogStates {
my $filename = $_[0];
#print "===getDialogStates=== ".$filename."\n";
my @retstates;
my $currState = "";
open my $FH, "<", $filename or die "can't read open '$filename': $OS_ERROR";
while (my $line = <$FH>) {
if($line =~ m/^dialog_state \= /){
$currState = $line;
$currState =~ s/^dialog_state \= //;
$currState =~ s/\s*$//;
#print $line;
}elsif($line =~ m/New user input \[User\:/){
push(@retstates,$currState);
#print $line;
}else{
next;
}
}
#print "===returning getDialogStates=== $filename\n";
return @retstates;
}
#!/usr/bin/perl
# Copyright © 2020 INRIA (Imran Sheikh)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
if($#ARGV != 1){
print STDERR "USAGE: $0 <trascriptions.csv> <outdir>\n";
exit(0);
}
### some hard inits to choose amount of training data
my $trainSetMonths = " 200810 ";
my $devTestMonth = "200909"; # Dev set will be calls from 1-15 of this month and Test set will be calls from 16-end of this month
my $outdir = $ARGV[1];
$outdir =~ s/\/$//;
my $trainfile = "$outdir/train.transcriptions.csv";
my $devfile = "$outdir/dev.transcriptions.csv";
my $testfile = "$outdir/test.transcriptions.csv";
open(TRAIN, '>', $trainfile) or die $!;
open(DEV, '>', $devfile) or die $!;
open(TEST, '>', $testfile) or die $!;
open my $FH2, "<", $ARGV[0] or die "can't read open '$ARGV[0]': $!";
while (my $line = <$FH2>) {
$line =~ s/^\.\///;
my ($id, $txt, $ds) = split(/\,/, $line);
my ($ym, $ymd, $c, $u) = split(/\//,$id);
my $m = $ym;
$m =~ s/^\d\d\d\d//;
$m = int($m);
my $d = $ymd;
$d =~ s/^\d\d\d\d\d\d//;
$d = int($d);
#print "$ym $m $d\n";
if($trainSetMonths =~ m/ $ym /){
if($d<=15){
print TRAIN $line;
}
}elsif($ym eq $devTestMonth){
if($d<=15){
print DEV $line;
}else{
print TEST $line;
}
}
}
close($FH2);
close(TRAIN);
close(DEV);
close(TEST);
#!/usr/bin/perl
# Copyright © 2020 INRIA (Imran Sheikh)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
if($#ARGV != 1){
print STDERR "USAGE: $0 <trascriptions.csv> <cmudict>\n";
exit(0);
}
my %lexicon;
open my $FH, "<", $ARGV[1] or die "can't read open '$ARGV[1]': $OS_ERROR";
while (my $line = <$FH>) {
$line =~ s/\s+$//;
$line = lc($line);
my ($wrd, $lex) = split(/\s+/, $line, 2);
$wrd =~ s/\(\d+\)$//;
$lex =~ s/\d//g;
if(exists($lexicon{$wrd})){
$lexicon{$wrd} = $lexicon{$wrd}."|".$lex;
}else{
$lexicon{$wrd} = $lex;
}
}
close($FH);
my %wfreq;
open my $FH, "<", $ARGV[0] or die "can't read open '$ARGV[0]': $OS_ERROR";
while (my $line = <$FH>) {
$line =~ s/\s+$//;
my ($id, $txt, $ds) = split(/\,/, $line);
my @wrds = split(/\s+/, $txt);
foreach my $wrd(@wrds){
if(exists($wfreq{$wrd})){
$wfreq{$wrd} += 1;
}else{
$wfreq{$wrd} = 1;
}
}
}
close($FH);
foreach my $key (sort (keys(%wfreq))) {
if($key eq "<unk>"){
print "<unk> oov\n";
}else{
if(exists($lexicon{$key})){
my @prons = split(/\|/, $lexicon{$key});
foreach my $pron (@prons){
print "$key $pron\n";
}
}else{
print "$key <add_manually>\n";
}
}
}
#!/bin/bash
# Derived software, Copyright © 2019 INRIA (Imran Sheikh)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
#
#
# Based on Kaldi (kaldi/egs/wsj/s5/local/nnet3/run_ivector_common.sh), Copyright 2019 © Johns Hopkins University (author: Daniel Povey, Karel Vesely, Hossein Hadian, Tom Ko)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
# https://github.com/kaldi-asr/kaldi/blob/master/COPYING
set -e -o pipefail
# This script is called from scripts like local/nnet3/run_tdnn.sh and
# local/chain/run_tdnn.sh (and may eventually be called by more scripts). It
# contains the common feature preparation and iVector-related parts of the
# script. See those scripts for examples of usage.
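# Illustrative invocation (option values are hypothetical and depend on
# your directory layout):
#   local/nnet3/run_ivector_common.sh --expdir exp/letsgo-15d-seed \
#     --pardata data/letsgo-15d-seed --confdir egs/letsgo-15d-seed/conf \
#     --train-set split/train --test-sets "split/dev split/test" --gmm tri3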
stage=0
nj=30
expdir=
pardata=
confdir=
train_set= # you might set this to e.g. split/Train.
test_sets=
gmm=tri4b # the GMM dir, built on the same type of features you are training on;
# it should contain alignments for 'train_set'.
num_threads_ubm=32
nnet3_affix= # affix for $expdir/nnet3 directory to put iVector stuff in (e.g.
# in the tedlium recipe it's _cleaned).
. ./cmd.sh
. ./path.sh
. utils/parse_options.sh
gmm_dir=$expdir/${gmm}
ali_dir=$expdir/${gmm}_ali_${train_set}_sp
for f in ${pardata}/${train_set}/feats.scp ${gmm_dir}/final.mdl; do
if [ ! -f $f ]; then
echo "$0: expected file $f to exist"
exit 1
fi
done
if [ $stage -le 2 ] && [ -f ${pardata}/${train_set}_sp_hires/feats.scp ]; then
echo "$0: ${pardata}/${train_set}_sp_hires/feats.scp already exists."
echo " ... Please either remove it, or rerun this script with stage > 2."
exit 1
fi
if [ $stage -le 1 ]; then
echo "$0: preparing directory for speed-perturbed data"
utils/data/perturb_data_dir_speed_3way.sh ${pardata}/${train_set} ${pardata}/${train_set}_sp
fi
if [ $stage -le 2 ]; then
echo "$0: creating high-resolution MFCC features"
# this shows how you can split across multiple file-systems. we'll split the
# MFCC dir across multiple locations. You might want to be careful here, if you
# have multiple copies of Kaldi checked out and run the same recipe, not to let
# them overwrite each other.
mfccdir=${pardata}/${train_set}_sp_hires/data
for datadir in ${train_set}_sp ${test_sets}; do
utils/copy_data_dir.sh ${pardata}/$datadir ${pardata}/${datadir}_hires
done
# do volume-perturbation on the training data prior to extracting hires
# features; this helps make trained nnets more invariant to test data volume.
utils/data/perturb_data_dir_volume.sh ${pardata}/${train_set}_sp_hires
for datadir in ${train_set}_sp ${test_sets}; do
steps/make_mfcc.sh --nj $nj --mfcc-config $confdir/mfcc_hires.conf \
--cmd "$train_cmd" ${pardata}/${datadir}_hires
steps/compute_cmvn_stats.sh ${pardata}/${datadir}_hires
utils/fix_data_dir.sh ${pardata}/${datadir}_hires
done
fi
if [ $stage -le 3 ]; then
echo "$0: computing a subset of data to train the diagonal UBM."
mkdir -p $expdir/nnet3${nnet3_affix}/diag_ubm
temp_data_root=$expdir/nnet3${nnet3_affix}/diag_ubm
# train a diagonal UBM using a subset of about a quarter of the data
num_utts_total=$(wc -l <${pardata}/${train_set}_sp_hires/utt2spk)
num_utts=$((num_utts_total/4))
utils/data/subset_data_dir.sh ${pardata}/${train_set}_sp_hires \