Транскрипция Google Speech обрывается? - PullRequest
0 голосов
/ 09 октября 2018

Я заметил, что Google Speech в последнее время ведёт себя иначе.

Например, в этом файле говорящий произносит «123457» на тайском языке. Раньше запись правильно расшифровывалась как «123457», но по какой-то причине теперь API возвращает «12345».

Файл flac: http://s000.tinyupload.com/index.php?file_id=76277841017264777654

Код Python:

import argparse
import base64
import json
import sys
from googleapiclient import discovery
import httplib2
from oauth2client.client import GoogleCredentials
# URL template handed to discovery.build() as discoveryServiceUrl.
# The {api} and {apiVersion} placeholders are presumably filled in by the
# client library -- confirm against the googleapiclient documentation.
DISCOVERY_URL = 'https://{api}.googleapis.com/$discovery/rest?version={apiVersion}'

def get_speech_service():
    """Build an authorized client object for the ``speech`` API, version ``v1beta1``.

    Application-default credentials are scoped to ``cloud-platform`` and used
    to authorize a fresh ``httplib2.Http`` instance, which is then handed to
    ``discovery.build`` together with ``DISCOVERY_URL``.

    Returns:
        Whatever ``discovery.build`` returns for the speech API.
    """
    scopes = ['https://www.googleapis.com/auth/cloud-platform']
    scoped_credentials = (
        GoogleCredentials.get_application_default().create_scoped(scopes)
    )

    authorized_http = httplib2.Http()
    scoped_credentials.authorize(authorized_http)

    speech_client = discovery.build(
        'speech',
        'v1beta1',
        http=authorized_http,
        discoveryServiceUrl=DISCOVERY_URL,
    )
    return speech_client
def english_numeric(number_as_string):
    """Replace spelled-out English digits with numerals and drop all spaces.

    Matching is a plain, case-sensitive substring replacement, so a digit
    word embedded in a longer word is replaced as well.

    Args:
        number_as_string: text such as ``"one five four three"``.

    Returns:
        The text with every digit word replaced by its digit and every
        space removed, e.g. ``"1543"``.
    """
    # The position of each word in this tuple is its numeric value, so
    # "zero" must come first; listing it last shifted every digit by one.
    digit_words = (
        "zero", "one", "two", "three", "four",
        "five", "six", "seven", "eight", "nine",
    )
    converted = number_as_string
    for value, word in enumerate(digit_words):
        converted = converted.replace(word, str(value))
    return converted.replace(" ", "")
def transcribe_audio(speech_file, languageCode="th-TH", encoding="FLAC"):
    """Transcribe the given audio file with a ``syncrecognize`` request.

    Args:
        speech_file: the name of the audio file.
        languageCode: BCP-47 language tag sent in the request config.
        encoding: audio encoding name sent in the request config.

    Returns:
        The API response serialized as a JSON string (non-ASCII characters
        are kept as-is). If the request itself fails, the error is reported
        on stderr and the JSON encoding of an empty string (``'""'``) is
        returned. If the service client cannot be built, setup hints are
        printed and ``None`` is returned.
    """
    with open(speech_file, 'rb') as speech:
        speech_content = base64.b64encode(speech.read())

    # Catch Exception rather than using a bare ``except`` so that
    # KeyboardInterrupt / SystemExit still propagate.
    try:
        service = get_speech_service()
    except Exception as exc:
        print("probably didn't do EXPORT")
        print("i.e. export GCLOUD_PROJECT=project-id")
        print("i.e. export GOOGLE_APPLICATION_CREDENTIALS=/path/to/crednetials.json")

        print("Error message: {}".format(type(exc)))
        return

    service_request = service.speech().syncrecognize(
        body={
            'config': {
                # Encoding name chosen by the caller (FLAC by default);
                # sampleRate is not sent.
                'encoding': encoding,
                'languageCode': languageCode,  # a BCP-47 language tag
                # NOTE(review): sent as the string "true", not a boolean, and
                # this looks like a v1 field rather than a v1beta1 one --
                # confirm against the RecognitionConfig reference.
                'enableWordTimeOffsets': "true",
            },
            'audio': {
                'content': speech_content.decode('UTF-8'),
            },
        })

    # Best effort: a failed request still yields a JSON string ('""'), but
    # the failure is reported instead of being swallowed silently.
    response = ""
    try:
        response = service_request.execute()
    except Exception as exc:
        print("Speech request failed: {!r}".format(exc), file=sys.stderr)

    return json.dumps(response, ensure_ascii=False)

def audio_json_to_text(audio_transcription, include_confidence=True):
    """Extract the first transcript from a parsed speech response.

    Args:
        audio_transcription: parsed response mapping; the text is read from
            ``["results"][0]["alternatives"][0]["transcript"]``.
        include_confidence: when true and the alternative carries a
            ``confidence`` value, append it as ``" (0.123)"``.

    Returns:
        The transcript, optionally followed by the confidence, or
        ``"COULD_NOT_BE_TRANSCRIBED"`` when the response is empty, has no
        ``results`` key, or lacks a first result/alternative/transcript.
    """
    fallback = "COULD_NOT_BE_TRANSCRIBED"
    if not audio_transcription or "results" not in audio_transcription:
        return fallback

    # An empty "results" list or a missing key used to raise
    # IndexError/KeyError; treat such responses as "not transcribed".
    try:
        best = audio_transcription["results"][0]["alternatives"][0]
        transcript = best["transcript"]
    except (IndexError, KeyError, TypeError):
        return fallback

    # Only append the confidence when the response actually contains one.
    confidence = best.get("confidence") if include_confidence else None
    if confidence is None:
        return "{}".format(transcript)
    return "{} {}".format(transcript, '(%.3f)' % float(confidence))

if __name__ == '__main__':
    # Usage: <script> AUDIO_FILE [LANGUAGE_CODE]
    # The language code is optional; when omitted, transcribe_audio's own
    # default is used instead of failing with an IndexError.
    if len(sys.argv) < 2:
        sys.exit("usage: {} AUDIO_FILE [LANGUAGE_CODE]".format(sys.argv[0]))

    flac = sys.argv[1]
    if len(sys.argv) > 2:
        print(transcribe_audio(flac, languageCode=sys.argv[2]))
    else:
        print(transcribe_audio(flac))
...