Browse Source

functions are added to perform forced_alignment using novoapi. results can be written in novo70 or IPA.

master
yemaozi88 3 years ago
parent
commit
1622655542
  1. 1
      .gitignore
  2. BIN
      .vs/acoustic_model/v15/.suo
  3. BIN
      acoustic_model/__pycache__/defaultfiles.cpython-36.pyc
  4. 10
      acoustic_model/defaultfiles.py
  5. 2
      acoustic_model/htk_vs_kaldi.py
  6. 71
      acoustic_model/novoapi_forced_alignment.py
  7. 39
      acoustic_model/novoapi_functions.py

1
.gitignore

@ -3,6 +3,7 @@
## important ##
.acoustic_model/forced_alignment_novo.py
.acoustic_model/novoapi_functions.py
# User-specific files
*.suo

BIN
.vs/acoustic_model/v15/.suo

BIN
acoustic_model/__pycache__/defaultfiles.cpython-36.pyc

10
acoustic_model/defaultfiles.py

@ -36,8 +36,14 @@ fame_s5_dir = os.path.join(fame_dir, 's5')
fame_corpus_dir = os.path.join(fame_dir, 'corpus')
experiments_dir = r'c:\OneDrive\Research\rug\experiments'
stimmen_transcription_xlsx = os.path.join(experiments_dir, 'stimmen', 'data', 'Frisian Variants Picture Task Stimmen.xlsx')
stimmen_data_dir = os.path.join(experiments_dir, 'stimmen', 'data')
stimmen_dir = os.path.join(experiments_dir, 'stimmen')
stimmen_data_dir = os.path.join(stimmen_dir, 'data')
# 44.1 kHz
#stimmen_wav_dir = os.path.join(stimmen_dir, 'wav')
# 16 kHz
stimmen_wav_dir = r'c:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\corpus\stimmen'
stimmen_transcription_xlsx = os.path.join(stimmen_data_dir, 'Frisian Variants Picture Task Stimmen.xlsx')
phonelist_friesian_txt = os.path.join(experiments_dir, 'friesian', 'acoustic_model', 'config', 'phonelist_friesian.txt')
novo_api_dir = os.path.join(WSL_dir, 'python-novo-api', 'novoapi')

2
acoustic_model/htk_vs_kaldi.py

@ -256,7 +256,7 @@ if make_kaldi_lexicon_txt:
# f.write("{0},{1}\n".format(key,c[key]))
for key, value in c.most_common(option_num):
# make possible pronounciation variant list.
# make possible pronunciation variant list.
pronvar_list = am_func.fame_pronunciation_variant(key)
for pronvar_ in pronvar_list:

71
acoustic_model/novoapi_forced_alignment.py

@ -37,11 +37,15 @@
# Aki Kunikoshi
# 428968@gmail.com
#
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import argparse
import json
from novoapi.backend import session
import novoapi_functions
import defaultfiles as default
# username / password cannot be passed as arguments...
p = argparse.ArgumentParser()
@ -51,68 +55,11 @@ p.add_argument("--user", default='martijn.wieling')
p.add_argument("--password", default='fa0Thaic')
args = p.parse_args()
wav_file = 'c:\\OneDrive\\WSL\\test\\onetwothree.wav'
rec = session.Recognizer(grammar_version="1.0", lang="nl", snodeid=101, user=args.user, password=args.password, keepopen=True) # , modeldir=modeldir)
grammar = {
"type": "confusion_network",
"version": "1.0",
"data": {
"kind": "sequence",
"elements": [{
"kind": "word",
"pronunciation": [{
"phones": ["wv",
"a1",
"n"],
"id": 0
},
{
"phones": ["wv",
"uh1",
"n"],
"id": 1
}],
"label": "one"
},
{
"kind": "word",
"pronunciation": [{
"phones": ["t",
"uw1"],
"id": 0
}],
"label": "two"
},
{
"kind": "word",
"pronunciation": [{
"phones": ["t",
"r",
"iy1"],
"id": 0
},
{
"phones": ["s",
"r",
"iy1"],
"id": 1
}],
"label": "three"
}]
},
"return_objects": ["grammar"],
"phoneset": "novo70"
}
res = rec.setgrammar(grammar)
#print "Set grammar result", res
#res = rec.recognize_wav("test/onetwothree.wav")
res = rec.recognize_wav(wav_file)
#print "Recognition result:", json.dumps(res.export(), indent=4)
#wav_file = 'c:\\OneDrive\\WSL\\test\\onetwothree.wav'
wav_file = os.path.join(default.stimmen_wav_dir, 'pg_pauw_2206_0fjd8.wav')
# list of the pronunciations for the word
word = 'pauw'
pronunciation_ipa = ['pau', 'pɑu']
grammar = novoapi_functions.make_grammar(word, pronunciation_ipa)
result = novoapi_functions.forced_alignment(wav_file, word, pronunciation_ipa)
pronunciation_ipa, pronunciation_novo70, llh = novoapi_functions.result2pronunciation(result, word)

39
acoustic_model/novoapi_functions.py

@ -1,7 +1,14 @@
## this script should be used only by Aki Kunikoshi.
import numpy as np
import argparse
import json
from novoapi.backend import session
import defaultfiles as default
def load_phonset():
translation_key_ipa2novo70 = dict()
translation_key_novo702ipa = dict()
@ -112,7 +119,7 @@ def make_grammar(word, pronunciation_ipa):
grammer_data_elements0_pronunciation = []
for id, ipa in enumerate(pronunciation_ipa):
novo70 = novoapi_functions.ipa2novo70(ipa)
novo70 = ipa2novo70(ipa)
grammer_data_elements0_pronunciation.append({
"phones": novo70.split(),
"id": id
@ -135,4 +142,32 @@ def make_grammar(word, pronunciation_ipa):
"phoneset": "novo70"
}
return grammar
return grammar
def forced_alignment(wav_file, word, pronunciation_ipa):
    """Recognize *wav_file* against a grammar built for *word*.

    A grammar is built with ``make_grammar(word, pronunciation_ipa)``, set on
    a ``session.Recognizer`` and the wav file is then recognized with it.

    Args:
        wav_file: path of the wav file handed to ``rec.recognize_wav``.
        word: word label, forwarded to ``make_grammar``.
        pronunciation_ipa: pronunciation candidates, forwarded to
            ``make_grammar``.

    Returns:
        The value of ``export()`` called on the recognition result.
    """
    # Local import so this block does not depend on the file-level imports.
    import os

    ### IMPORTANT ###
    # because of this function, this script should not be uploaded / shared.
    # username / password cannot be passed as arguments...
    # NOTE(review): the fallback credentials below are hard-coded in source.
    # They are kept only for backward compatibility; prefer setting
    # NOVOAPI_USER / NOVOAPI_PASSWORD (or --user / --password) and remove
    # the literals once callers have migrated.
    p = argparse.ArgumentParser()
    p.add_argument("--user", default=os.environ.get('NOVOAPI_USER', 'martijn.wieling'))
    p.add_argument("--password", default=os.environ.get('NOVOAPI_PASSWORD', 'fa0Thaic'))
    # This function is called from other scripts, so options that belong to
    # the host program must be ignored instead of aborting with SystemExit.
    args, _ = p.parse_known_args()

    rec = session.Recognizer(grammar_version="1.0", lang="nl", snodeid=101, user=args.user, password=args.password, keepopen=True) # , modeldir=modeldir)

    grammar = make_grammar(word, pronunciation_ipa)
    # The return value of setgrammar is not used; only the call matters here.
    rec.setgrammar(grammar)

    result = rec.recognize_wav(wav_file)
    return result.export()
def result2pronunciation(result, word):
    """Extract the pronunciation recognized for *word* from *result*.

    Args:
        result: sequence of entries, each indexed with the keys 'label',
            'llh' and 'phones' (presumably the value returned by
            forced_alignment() -- confirm against the caller).
        word: label of the entry to look up; the first matching entry is used.

    Returns:
        A tuple ``(pronunciation_ipa, pronunciation_novo70, llh)`` where
        ``pronunciation_novo70`` is the list of 'label' values of the entry's
        phones, ``pronunciation_ipa`` is that list mapped through
        ``novo702ipa`` and ``llh`` is the entry's 'llh' value.

    Raises:
        IndexError: if no entry in *result* has the label *word*.
    """
    matched = next((entry for entry in result if entry['label'] == word), None)
    if matched is None:
        # Same exception type as indexing an empty match list, but with a
        # message that tells which word was missing.
        raise IndexError("no entry labelled {!r} in the result".format(word))

    llh = matched['llh']
    pronunciation_novo70 = [phone['label'] for phone in matched['phones']]
    pronunciation_ipa = [novo702ipa(phone) for phone in pronunciation_novo70]
    return pronunciation_ipa, pronunciation_novo70, llh
Loading…
Cancel
Save