turtlebot/speech-rec.sh at master · icrl/turtlebot · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/bin/bash

# Usage info
# Print the command-line help text to stdout.
# Takes no arguments; the program name shown in the usage line is the
# basename of $0.
show_help() {
cat << EOF
  Usage: ${0##*/} [-h] [-i INFILE] [-d DURATION] [-r RATE] [-l LANGUAGE] [-k KEY]

  Record an utterance and send audio data to Google for speech recognition.

       -h|--help               display this help and exit.
       -i|--input     INFILE   use INFILE instead of recording a stream with sox or parecord.
       -d|--duration  FLOAT    recording duration in seconds (Default: 3).
       -l|--language  STRING   set transcription language (Default: en_US).
                               Other languages: fr_FR, de_DE, es_ES, ...
       -r|--rate      INTEGER  Sampling rate of audio data (Default: 16000, if data is to be recorded).
                               If -i|--input is used, the sampling rate must be supplied by the user.
       -k|--key       STRING   Google Speech Recognition Key.

EOF
}

# Built-in defaults; the -k, -l and -d command-line options override them.
# NOTE(review): a speech API key is committed here in plain text. Please
# replace it with your own key (or pass one with -k|--key).
KEY="AIzaSyAcalCzUvPmmJ7CZBFOEWx2Z1ZSn4Vs1gg"
LANGUAGE="en_US"
DURATION="3"


# Record a single-channel utterance into a file.
# Arguments:
#   $1 - recording duration in seconds
#   $2 - sampling rate
#   $3 - path of the file to write
# Uses sox's `rec` when it is available and falls back to `parecord`
# (stopped by `timeout` after the requested duration) otherwise.
# Returns the exit status of the command that was run.
record() {
    # Locals, so the caller's DURATION/SRATE/INFILE globals are not overwritten.
    local duration=$1
    local srate=$2
    local outfile=$3

    if command -v rec >/dev/null 2>&1; then
        # try to record audio with sox
        rec -q -c 1 -r "$srate" "$outfile" trim 0 "$duration"
    else
        # fallback to parecord
        timeout "$duration" parecord "$outfile" --file-format=flac --rate="$srate" --channels=1
    fi
}

# parse parameters
# Parse the command-line options into the globals INFILE, DURATION, SRATE,
# LANGUAGE and KEY.
# Arguments: the script's positional parameters.
# Exits 0 after printing the help text for -h|--help; exits 1 (message on
# stderr) for an unknown parameter or for an option whose value is missing.
parse_args() {
    local opt
    while [[ $# -ge 1 ]]; do
        opt="$1"
        case "$opt" in
            -h|--help)
                show_help
                exit 0
                ;;
            -i|--input|-d|--duration|-r|--rate|-l|--language|-k|--key)
                # Every one of these options consumes the following argument;
                # refuse to continue when there is none instead of silently
                # storing an empty value.
                if [[ $# -lt 2 ]]; then
                    >&2 echo "ERROR: option '$opt' requires a value. Type $0 -h for more information."
                    exit 1
                fi
                case "$opt" in
                    -i|--input)    INFILE="$2" ;;
                    -d|--duration) DURATION="$2" ;;
                    -r|--rate)     SRATE="$2" ;;
                    -l|--language) LANGUAGE="$2" ;;
                    -k|--key)      KEY="$2" ;;
                esac
                # drop the value; the option itself is dropped below
                shift
                ;;
            *)
                >&2 echo "Unknown parameter '$opt'. Type $0 -h for more information."
                exit 1
                ;;
        esac
        shift
    done
}

parse_args "$@"

# Refuse to continue with an empty duration or language (e.g. after
# `-d ""`). The diagnostics go to stderr and the script exits with status 1.
if [[ -z "$DURATION" ]]; then
    >&2 echo "ERROR: empty or invalid value for duration."
    exit 1
fi

if [[ -z "$LANGUAGE" ]]; then
    >&2 echo "ERROR: empty value for language."
    exit 1
fi

# Choose the audio source. Without an input file a new utterance is recorded
# into ./records/ under a timestamped name (sampling rate defaults to 16000);
# with an input file the sampling rate must have been given explicitly.
if [[ -z "$INFILE" ]]; then
    INFILE="./records/record_$(date "+%Y%b%d_%H-%M-%S").flac"
    if [[ -z "$SRATE" ]]; then
        SRATE=16000
    fi
    # Make sure the target directory exists before recording into it.
    if ! mkdir -p -- "${INFILE%/*}"; then
        >&2 echo "ERROR: cannot create directory ${INFILE%/*}."
        exit 1
    fi
    echo "Say something..."
    echo ""
    # Quoted so that every value reaches record() as exactly one argument.
    # The exit status is deliberately not checked: on the parecord fallback
    # the recorder is ended by `timeout`, which reports a non-zero status.
    record "$DURATION" "$SRATE" "$INFILE"
else
    if [[ -z "$SRATE" ]]; then
        >&2 echo "ERROR: no sampling rate specified for input file."
        exit 1
    fi
    if [[ ! -r "$INFILE" ]]; then
        >&2 echo "ERROR: cannot read input file $INFILE."
        exit 1
    fi

    echo "Try to recognize speech from file $INFILE"
    echo ""
fi

# POST the audio file to the speech API and capture the reply in RESULT.
# A failed request is reported on stderr; processing then continues with
# whatever was received, so the script still ends with status 0.
if ! RESULT=$(wget -q --post-file "$INFILE" --header="Content-Type: audio/x-flac; rate=$SRATE" -O - "https://www.google.com/speech-api/v2/recognize?client=chromium&lang=$LANGUAGE&key=$KEY"); then
    >&2 echo "ERROR: request to the speech recognition service failed."
fi

# Keep only reply lines that contain a transcript, put each comma-separated
# field on its own line, and strip braces, brackets, commas and quotes.
# NOTE: '\n' in the sed replacement relies on GNU sed.
CLEANED=$(printf '%s\n' "$RESULT" | grep "transcript.*}" | sed 's/,/\n/g;s/[{,},"]//g;s/\[//g;s/\]//g;s/:/: /g')

# transcript and confidence fields, one per line
FILTERED=$(printf '%s\n' "$CLEANED" | grep -o -i -e "transcript.*" -e "confidence:.*")

# confidence fields only
CONFIDENCE=$(printf '%s\n' "$CLEANED" | grep -o -i -e "confidence:.*")

if [[ -z "$FILTERED" ]]; then
    # nothing recognized: leave an empty words.log behind
    : > "words.log"
    >&2 echo "Google was unable to recognize any speech in audio data"
else
    # NOTE(review): only the header and two blank lines go to stdout; the
    # recognized text itself is written to words.log (replacing old content).
    echo "Recognition result:"
    echo ""
    echo ""
    printf '%s\n' "$FILTERED" > "words.log"
fi

exit 0