Browse Source

Cleaned up the INTERSPEECH-related code.

master
yemaozi88 4 years ago
parent
commit
eb65543781
  1. BIN
      .vs/accent_classification/v15/.suo
  2. 4
      accent_classification.sln
  3. BIN
      accent_classification/__pycache__/data_manipulation.cpython-36.pyc
  4. BIN
      accent_classification/__pycache__/evaluation.cpython-36.pyc
  5. BIN
      accent_classification/__pycache__/speaker_based_functions.cpython-36.pyc
  6. 8
      accent_classification/accent_classification.pyproj
  7. 9
      accent_classification/audio2db.py
  8. 0
      accent_classification/classifier.py
  9. 0
      accent_classification/config.ini
  10. 0
      accent_classification/data_io.py
  11. 0
      accent_classification/data_manipulation.py
  12. 0
      accent_classification/evaluation.py
  13. 0
      accent_classification/manipulate_db.py
  14. 0
      accent_classification/output_confusion_matrix.py
  15. 0
      accent_classification/sentence_based.py
  16. 267
      accent_classification/speaker_based.py
  17. 32
      accent_classification/speaker_based_functions.py
  18. 0
      accent_classification/test_code.py
  19. 0
      accent_classification/word_based.py
  20. 326
      dialect_identification/speaker_based.py
  21. BIN
      output/confusion_matrix_2regions.npy
  22. BIN
      output/confusion_matrix_2regions.png
  23. BIN
      output/confusion_matrix_2regions_normalized.png
  24. BIN
      output/confusion_matrix_3regions.npy
  25. BIN
      output/confusion_matrix_3regions.png
  26. BIN
      output/confusion_matrix_3regions_normalized.png
  27. BIN
      output/pidlist_2regions_test.npy
  28. BIN
      output/pidlist_2regions_train.npy
  29. BIN
      output/pidlist_3regions_eval.npy
  30. BIN
      output/pidlist_3regions_test.npy
  31. BIN
      output/pidlist_3regions_train.npy
  32. BIN
      output/pred_per_pid_2regions.npy
  33. BIN
      output/pred_per_pid_3regions.npy

BIN
.vs/accent_classification/v15/.suo

Binary file not shown.

4
dialect_identification.sln → accent_classification.sln

@ -3,8 +3,6 @@ Microsoft Visual Studio Solution File, Format Version 12.00 @@ -3,8 +3,6 @@ Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.26730.12
MinimumVisualStudioVersion = 10.0.40219.1
Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "dialect_identification", "dialect_identification\dialect_identification.pyproj", "{FE1B1358-ADBE-4446-AFFD-A0802D13D15B}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{5A4286D1-F037-43D4-90F8-05C5CCC0CA30}"
ProjectSection(SolutionItems) = preProject
..\..\forced-alignment\forced_alignment\convert_phone_set.py = ..\..\forced-alignment\forced_alignment\convert_phone_set.py
@ -20,6 +18,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution @@ -20,6 +18,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution
..\..\forced-alignment\forced_alignment\test_environment.py = ..\..\forced-alignment\forced_alignment\test_environment.py
EndProjectSection
EndProject
Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "accent_classification", "accent_classification\accent_classification.pyproj", "{FE1B1358-ADBE-4446-AFFD-A0802D13D15B}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU

BIN
accent_classification/__pycache__/data_manipulation.cpython-36.pyc

Binary file not shown.

BIN
accent_classification/__pycache__/evaluation.cpython-36.pyc

Binary file not shown.

BIN
accent_classification/__pycache__/speaker_based_functions.cpython-36.pyc

Binary file not shown.

8
dialect_identification/dialect_identification.pyproj → accent_classification/accent_classification.pyproj

@ -5,7 +5,7 @@ @@ -5,7 +5,7 @@
<ProjectGuid>fe1b1358-adbe-4446-affd-a0802d13d15b</ProjectGuid>
<ProjectTypeGuids>{a41c8ea1-112a-4a2d-9f91-29557995525f};{888888a0-9f3d-457c-b088-3a5042f75d52}</ProjectTypeGuids>
<ProjectHome>.</ProjectHome>
<StartupFile>output_confusion_matrix.py</StartupFile>
<StartupFile>speaker_based.py</StartupFile>
<SearchPath>
</SearchPath>
<WorkingDirectory>.</WorkingDirectory>
@ -22,6 +22,8 @@ @@ -22,6 +22,8 @@
<EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
</PropertyGroup>
<ItemGroup>
<Compile Include="data_io.py" />
<Compile Include="data_manipulation.py" />
<Compile Include="manipulate_db.py">
<SubType>Code</SubType>
</Compile>
@ -29,9 +31,6 @@ @@ -29,9 +31,6 @@
<SubType>Code</SubType>
</Compile>
<Compile Include="classifier.py" />
<Compile Include="dataManipulation.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="output_confusion_matrix.py">
<SubType>Code</SubType>
</Compile>
@ -53,7 +52,6 @@ @@ -53,7 +52,6 @@
<Compile Include="word_based.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="dataIO.py" />
</ItemGroup>
<ItemGroup>
<Content Include="config.ini" />

9
dialect_identification/audio2db.py → accent_classification/audio2db.py

@ -1,6 +1,5 @@ @@ -1,6 +1,5 @@
import os
import sys
import configparser
import numpy as np
import pypyodbc
@ -20,16 +19,10 @@ sys.path.append(forced_alignment_module) @@ -20,16 +19,10 @@ sys.path.append(forced_alignment_module)
from forced_alignment import forced_alignment
## check if forced-alignment work in each sentence
## delete all automatically generated pronunciations
#from forced_alignment import pronunciations
#pronunciations.delete_all_g2p_entries()
#wav_file = wav_dir + '\\10\\' + regionLabels[0] + '\\9935-1464218044-1951631.wav'
#script_file = script_dir + '\\script10.txt'
#with open(script_file, 'r') as fin:
# script = fin.readline()
#fa = forced_alignment(wav_file, script)
## make database connection
param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";"

0
dialect_identification/classifier.py → accent_classification/classifier.py

0
dialect_identification/config.ini → accent_classification/config.ini

0
dialect_identification/data_io.py → accent_classification/data_io.py

0
dialect_identification/data_manipulation.py → accent_classification/data_manipulation.py

0
dialect_identification/evaluation.py → accent_classification/evaluation.py

0
dialect_identification/manipulate_db.py → accent_classification/manipulate_db.py

0
dialect_identification/output_confusion_matrix.py → accent_classification/output_confusion_matrix.py

0
dialect_identification/sentence_based.py → accent_classification/sentence_based.py

267
accent_classification/speaker_based.py

@ -0,0 +1,267 @@ @@ -0,0 +1,267 @@
import os
import sys
import configparser
import pypyodbc
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
repo_dir = 'C:\\Users\\Aki\\source\\repos\\accent_classification'
curr_dir = repo_dir + '\\accent_classification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), curr_dir))
import data_manipulation as mani
import evaluation as eval
import speaker_based_functions as sb_func
## ======================= user define =======================
# Settings that select the experiment and the input/output locations.
sentence_num_max = 10
config_file = curr_dir + '\\config.ini'
output_dir = repo_dir + '\\output'

# make train/test set: 1, load: 0
make_train_test_set = 0

# specify which experiment to be performed.
# - 3: groningen vs oost_overijssel vs limburg
# - 2: groningen vs limburg
experiment_type = 2

region_labels3 = ['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg']
region_labels2 = ['Groningen_and_Drenthe', 'Limburg']


## ======================= data preparation =======================
## load variables from the ini file
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so an empty result means the ini file was not loaded.
if not config.read(config_file):
    raise FileNotFoundError('cannot read config file: ' + config_file)
MDB_file = config['sentence_based']['fileMDB']

## connect to the database
pypyodbc.lowercase = False
param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + MDB_file + ";"
conn = pypyodbc.connect(param)
try:
    cursor = conn.cursor()

    ## get data from Access database
    # data format
    # 0: filename
    # 1: pid
    # 2: region
    # 3: ID (unique word_id)
    # 4: sentence_id
    # 5: word_id
    # 6: word
    # 7: pronunciation
    cursor.execute("""\
{CALL dataset_with_cities}
""")
    data = np.array(cursor.fetchall())

    ## get the list of pronunciation variant (pronvarList) from Access database
    # pronvarList format
    # 0: ID (unique word_id)
    # 1: word
    # 2: pronvar
    cursor.execute("""\
{CALL pronunciation_variant}
""")
    pronvarList = np.array(cursor.fetchall())
finally:
    # release the database connection even when one of the queries fails.
    conn.close()
## Build one LabelBinarizer per unique word ID for X (= pronunciation variant).
## Entry i of LB_list is fitted on the distinct pronunciations (column 7 of
## data) of the rows whose unique word ID (column 3) equals i + 1.
unique_wordID_list = data[:, 3].astype(int)
LB_list = [
    preprocessing.LabelBinarizer().fit(
        np.unique(data[unique_wordID_list == word_id, 7]))
    for word_id in range(1, max(unique_wordID_list) + 1)
]

## Build the LabelEncoder/LabelBinarizer objects for y (= region),
## one pair for the 3-region labels and one pair for the 2-region labels.
LE_y3 = preprocessing.LabelEncoder().fit(region_labels3)
LE_y2 = preprocessing.LabelEncoder().fit(region_labels2)
LB_y3 = preprocessing.LabelBinarizer().fit(region_labels3)
LB_y2 = preprocessing.LabelBinarizer().fit(region_labels2)
## ======================= make train/eval/test set or load =======================
## The unique (pid, region) pairs are counted per region; the size of the
## smallest region caps the number of speakers drawn from every region.
pidlist3 = np.unique(data[:, (1, 2)], axis=0)
sample_num_max = min(Counter(pidlist3[:, 1]).values())

if make_train_test_set == 1:
    ## build the split region by region and store it.
    train_parts, eval_parts, test_parts = [], [], []
    for region_label in region_labels3:
        region_pids = pidlist3[pidlist3[:, 1] == region_label, :]
        balanced_pids, sampled_idx = mani.extractRandomSample(
            region_pids, sample_num_max)

        # hold out 20% of the speakers for test, then 10% of the rest for eval.
        train_eval_pids, test_pids = train_test_split(
            balanced_pids, test_size=0.2, random_state=0)
        train_pids, eval_pids = train_test_split(
            train_eval_pids, test_size=0.1, random_state=0)

        train_parts.append(train_pids)
        eval_parts.append(eval_pids)
        test_parts.append(test_pids)

    # stack the per-region parts in the order of region_labels3.
    pidlist3_train = np.concatenate(train_parts, axis=0)
    pidlist3_eval = np.concatenate(eval_parts, axis=0)
    pidlist3_test = np.concatenate(test_parts, axis=0)
    del region_label, region_pids, balanced_pids, sampled_idx
    del train_eval_pids, train_pids, eval_pids, test_pids
    del train_parts, eval_parts, test_parts

    np.save(output_dir + "\\pidlist3_train.npy", pidlist3_train)
    np.save(output_dir + "\\pidlist3_eval.npy", pidlist3_eval)
    np.save(output_dir + "\\pidlist3_test.npy", pidlist3_test)

    if experiment_type == 2:
        # the 2-region training list is derived from train + eval together.
        pidlist2_train = sb_func.groningen_vs_limburg(
            np.concatenate([pidlist3_train, pidlist3_eval], axis=0))
        pidlist2_test = sb_func.groningen_vs_limburg(pidlist3_test)
        np.save(output_dir + "\\pidlist2_train", pidlist2_train)
        np.save(output_dir + "\\pidlist2_test", pidlist2_test)
else:
    ## reuse the split stored by an earlier run.
    pidlist3_train = np.load(output_dir + "\\pidlist3_train.npy")
    pidlist3_eval = np.load(output_dir + "\\pidlist3_eval.npy")
    pidlist3_test = np.load(output_dir + "\\pidlist3_test.npy")
    if experiment_type == 2:
        pidlist2_train = np.load(output_dir + "\\pidlist2_train.npy")
        pidlist2_test = np.load(output_dir + "\\pidlist2_test.npy")

## extract corresponding data using pid
data3_train = sb_func.extractPid(pidlist3_train, data)
data3_eval = sb_func.extractPid(pidlist3_eval, data)
data3_test = sb_func.extractPid(pidlist3_test, data)
if experiment_type == 2:
    data2 = np.array(data)
    data2_train = sb_func.extractPid(pidlist2_train, data2)
    data2_test = sb_func.extractPid(pidlist2_test, data2)
## ======================= experiments =======================
## specify the dataset
# train vs eval
#trainData = data3_train
#testData = data3_eval
#testPID = pidlist3_eval
#LB = LB_y3
#LE = LE_y3
#region_labels = region_labels3

# train+eval vs test
if experiment_type == 3:
    trainData = np.r_[data3_train, data3_eval]
    testData = data3_test
    testPID = pidlist3_test
    LB = LB_y3
    LE = LE_y3
    region_labels = region_labels3
elif experiment_type == 2:
    trainData = data2_train
    testData = data2_test
    testPID = pidlist2_test
    LB = LB_y2
    LE = LE_y2
    region_labels = region_labels2
else:
    # fail here with a clear message instead of a NameError on trainData below.
    raise ValueError(
        'experiment_type should be 2 or 3, got {!r}'.format(experiment_type))

## check the number of utterance
#data_all = np.r_[trainData, testData]
#filenames = np.c_[data_all[:, 0], data_all[:, 2]]
#filenames_unique = np.unique(filenames, axis=0)
#Counter(filenames_unique[:, 1])

## output filenames
fileComparison = output_dir + "\\algorithm_comparison.csv"
filePerformance = output_dir + "\\sentence-level.csv"
fileConfusionMatrix = output_dir + "\\confusion_matrix.csv"

## compare classification algorithms for the sentence-classifiers.
#sb_func.compare_sentence_level_classifiers(trainData, LB_list, LE, fileComparison)

## train sentence-level classifiers.
model_list, score_list, confusion_matrix_list = sb_func.train_sentence_level_classifiers(
    trainData, LB_list, LE, filePerformance)

## prediction over evaluation data per each sentence-level classifier.
pred_per_sentence = sb_func.prediction_per_sentence(testData, model_list, LB_list, LE)

## combine sentence-level classifiers
pred_per_pid_majority = sb_func.prediction_per_pid_majority(testPID, pred_per_sentence)

## confusion matrix
# NOTE(review): columns 1 and 2 are presumably the true and the predicted
# region, in that order - verify against sb_func.prediction_per_pid_majority.
confusionMatrix_majority = confusion_matrix(
    pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=region_labels)

## output
accuracy = accuracy_score(pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2])
print('accuracy: {}%'.format(accuracy * 100))
cm = confusionMatrix_majority
print(cm)

# NOTE(review): the file names carry a fixed "2" suffix whatever the value of
# experiment_type, so a 3-region run writes to the same files - confirm intent.
np.save(output_dir + "\\pred_per_pid2.npy", pred_per_pid_majority)
np.save(output_dir + "\\confusion_matrix2.npy", cm)

32
dialect_identification/speaker_based_functions.py → accent_classification/speaker_based_functions.py

@ -14,7 +14,7 @@ from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis @@ -14,7 +14,7 @@ from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
import dataManipulation as mani
import data_manipulation as mani
import evaluation as eval
@ -338,34 +338,6 @@ def prediction_per_pid_weighted(pidlist_eval, prediction, weight, LB_y, LE_y): @@ -338,34 +338,6 @@ def prediction_per_pid_weighted(pidlist_eval, prediction, weight, LB_y, LE_y):
return np.array(prediction_per_pid)
def saxon_vs_limburg(pidlist3):
    """Turn a 3-region pidlist into a balanced Low_Saxon-vs-Limburg pidlist.

    Rows labelled 'Groningen_and_Drenthe' or 'Oost_Overijsel-Gelderland' are
    pooled, a random sample of them as large as the number of 'Limburg' rows
    is drawn, and the sampled rows are relabelled 'Low_Saxon'.

    Args:
        pidlist3: 2-D array whose column 1 holds the region label
            (column 0 is presumably the pid - verify against the caller).

    Returns:
        The 'Limburg' rows followed by the sampled rows relabelled
        'Low_Saxon'.
    """
    low_saxon_label = 'Low_Saxon'
    regions = pidlist3[:, 1]

    # a row belongs to Low Saxon when it carries either of the two labels.
    is_saxon = (regions == 'Groningen_and_Drenthe') | (regions == 'Oost_Overijsel-Gelderland')
    limburg_rows = pidlist3[regions == 'Limburg', :]

    # extract the same amount of samples as Limburg.
    limburg_count = Counter(regions)['Limburg']
    saxon_rows, _ = mani.extractRandomSample(pidlist3[is_saxon, :], limburg_count)
    saxon_rows[:, 1] = low_saxon_label

    return np.concatenate([limburg_rows, saxon_rows], axis=0)
def groningen_vs_limburg(pidlist3):
"""convert a pidlist for 3 regions into that for 2 regions.
@ -374,7 +346,7 @@ def groningen_vs_limburg(pidlist3): @@ -374,7 +346,7 @@ def groningen_vs_limburg(pidlist3):
2 regions include ['Groningen_and_Drenthe', 'Limburg']
"""
regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
regionLabels = ['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg']
pidlist_groningen = pidlist3[pidlist3[:, 1] == regionLabels[0], :]
pidlist_limburg = pidlist3[pidlist3[:, 1] == regionLabels[1], :]

0
dialect_identification/test_code.py → accent_classification/test_code.py

0
dialect_identification/word_based.py → accent_classification/word_based.py

326
dialect_identification/speaker_based.py

@ -1,326 +0,0 @@ @@ -1,326 +0,0 @@
import os
import sys
import configparser
import pypyodbc
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))
import dataManipulation as mani
import evaluation as eval
import speaker_based_functions as sb_func
#####################
## USER DEFINE ##
#####################
# Settings that select the experiment and the input/output locations.
sentenceNumMax = 10
configFile = currDir + '\\config.ini'
dirOut = currDir + '\\result'
# make train/test set: 1, load: 0
makeTrainTestSet = 0
# convert 3 regions to 2 regions: 1, load: 0
conv3to2region = 0
# 3 regions: 0
# saxon vs limburg: 1
# groningen vs limburg: 2
experiment_type = 2
regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
# a bit useless error handling.
#assert (experiment_type in (0, 1, 2)), "experiment type should be 0, 1 or 2."
# NOTE(review): indentation is not visible in this view. If the second
# assignment below sits at top level, it replaces the Low_Saxon labels even
# when experiment_type == 1 - confirm against the original file.
if experiment_type == 1:
regionLabels2 = ['Low_Saxon', 'Limburg']
regionLabels2 = ['Groningen_and_Drenthe', 'Limburg']
##########################
## DATA PREPARATION ##
##########################
## load init file
config = configparser.ConfigParser()
config.sections()
config.read(configFile)
dirFeature = config['sentence_based']['dirFeature']
fileMDB = config['sentence_based']['fileMDB']
## database connection
pypyodbc.lowercase = False
param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";"
conn = pypyodbc.connect(param)
cursor = conn.cursor()
## get data from Access database
# data format
# 0: filename
# 1: pid
# 2: region
# 3: ID (unique word_id)
# 4: sentence_id
# 5: word_id
# 6: word
# 7: pronunciation
SQL_string = """\
{CALL dataset_with_cities}
"""
cursor.execute(SQL_string)
rows = cursor.fetchall()
data = np.array(rows)
#dataNumMax = data.shape[0]
#uniqueWordIDmax = max(data[:, 3].astype(int))
del SQL_string, rows
## make list of LabelBinarizer object per word.
# for X
# get pronvarList from Access database
# pronvarList format
# 0: ID (unique word_id)
# 1: word
# 2: pronvar
SQL_string = """\
{CALL pronunciation_variant}
"""
cursor.execute(SQL_string)
rows = cursor.fetchall()
pronvarList = np.array(rows)
del SQL_string, rows
# one LabelBinarizer per unique word ID (1..max), fitted on the distinct
# pronunciations (column 7 of data) found for that ID.
LBlist = []
#uniqueWordIDlist = pronvarList[:, 0].astype(int)
uniqueWordIDlist = data[:, 3].astype(int)
uniqueWordIDmax = max(uniqueWordIDlist)
for uniqueWordID in range(1, uniqueWordIDmax+1):
pronvar = data[uniqueWordIDlist == uniqueWordID, 7]
#pronvar = pronvarList[pronvarList[:, 0] == uniqueWordID, 2]
LB = preprocessing.LabelBinarizer()
LB.fit(np.unique(pronvar))
LBlist.append(LB)
# for y (=region)
# encoders for the 3-region labels (LE_y, LB_y) and the 2-region labels (LE_y2, LB_y2).
LE_y = preprocessing.LabelEncoder()
LE_y.fit(regionLabels)
LE_y2 = preprocessing.LabelEncoder()
LE_y2.fit(regionLabels2)
LB_y = preprocessing.LabelBinarizer()
LB_y.fit(regionLabels)
LB_y2 = preprocessing.LabelBinarizer()
LB_y2.fit(regionLabels2)
# drop the loop temporaries from the module namespace.
del uniqueWordID, uniqueWordIDmax, pronvar, LB
#################
## ITERATION ##
#################
#CM_majority = np.zeros((1, 9)).astype(int)
#CM_weighted = np.zeros((1, 9)).astype(int)
#for iter in range(0, 1):
# print(iter)
## make balanced dataset
# unique (pid, region) pairs taken from columns 1 and 2 of data.
pidlist = np.unique(data[:, (1, 2)], axis=0)
# count number of samples
# the size of the smallest region is used as the sample size for every region.
pidlistCounter = Counter(pidlist[:, 1])
sampleNumMax = min(pidlistCounter.values())
del pidlistCounter
## make train/eval/test set or load
if makeTrainTestSet==1:
pidlist_train = []
pidlist_eval = []
pidlist_test = []
for regionNum in range(0, len(regionLabels)):
regionName = regionLabels[regionNum]
pidlist_per_region_ = pidlist[pidlist[:, 1]==regionLabels[regionNum], :]
pidlist_per_region, idx = mani.extractRandomSample(
pidlist_per_region_, sampleNumMax)
# split dataset into train, eval and test.
# 20% is held out for test first, then 10% of the remainder for eval;
# both splits use random_state = 0.
[pidlist_per_region_train, pidlist_per_region_test] = train_test_split(
pidlist_per_region, test_size = 0.2, random_state = 0)
[pidlist_per_region_train, pidlist_per_region_eval] = train_test_split(
pidlist_per_region_train, test_size = 0.1, random_state = 0)
# append numpy arrays
if regionNum == 0:
pidlist_train = pidlist_per_region_train
pidlist_eval = pidlist_per_region_eval
pidlist_test = pidlist_per_region_test
else:
pidlist_train = np.r_[pidlist_train, pidlist_per_region_train]
pidlist_eval = np.r_[pidlist_eval, pidlist_per_region_eval]
pidlist_test = np.r_[pidlist_test, pidlist_per_region_test]
del regionNum, regionName
del pidlist_per_region_, pidlist_per_region, idx
del pidlist_per_region_train, pidlist_per_region_eval, pidlist_per_region_test
np.save(dirOut + "\\pidlist_train.npy", pidlist_train)
np.save(dirOut + "\\pidlist_eval.npy", pidlist_eval)
np.save(dirOut + "\\pidlist_test.npy", pidlist_test)
else:
pidlist_train = np.load(dirOut + "\\pidlist_train.npy")
pidlist_eval = np.load(dirOut + "\\pidlist_eval.npy")
pidlist_test = np.load(dirOut + "\\pidlist_test.npy")
## make dataset for 2 regions or load
if conv3to2region==1:
# the 2-region training list is derived from train + eval together.
pidlist2_train_ = np.r_[pidlist_train, pidlist_eval]
if experiment_type == 1:
pidlist2_train = sb_func.saxon_vs_limburg(pidlist2_train_)
pidlist2_test = sb_func.saxon_vs_limburg(pidlist_test)
np.save(dirOut + "\\pidlist2_saxon_vs_limburg_train", pidlist2_train)
np.save(dirOut + "\\pidlist2_saxon_vs_limburg_test", pidlist2_test)
elif experiment_type == 2:
pidlist2_train = sb_func.groningen_vs_limburg(pidlist2_train_)
pidlist2_test = sb_func.groningen_vs_limburg(pidlist_test)
np.save(dirOut + "\\pidlist2_groningen_vs_limburg_train", pidlist2_train)
np.save(dirOut + "\\pidlist2_groningen_vs_limburg_test", pidlist2_test)
del pidlist2_train_
else:
if experiment_type == 1:
pidlist2_train = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_train.npy")
pidlist2_test = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_test.npy")
elif experiment_type == 2:
pidlist2_train = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_train.npy")
pidlist2_test = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_test.npy")
## train/test data
if experiment_type == 0:
# Groningen vs Overijsel vs Limburg
data_train = sb_func.extractPid(pidlist_train, data)
data_eval = sb_func.extractPid(pidlist_eval, data)
data_test = sb_func.extractPid(pidlist_test, data)
elif experiment_type == 1 or experiment_type == 2:
# data2 is a copy of data, so the relabelling below leaves data untouched.
data2 = np.array(data)
if experiment_type == 1:
# relabel the region column (2) of every Groningen / Oost_Overijsel row as regionLabels2[0].
for row, row2 in zip(data, data2):
if row[2] == regionLabels[0] or row[2] == regionLabels[2]:
row2[2] = regionLabels2[0]
data2_train = sb_func.extractPid(pidlist2_train, data2)
data2_test = sb_func.extractPid(pidlist2_test, data2)
#####################################
## EXPERIMENTS START FROM HERE ##
#####################################
## actual training
# train vs eval
#trainData = data_train
#testData = data_eval
#testPID = pidlist_eval
#LB = LB_y
#LE = LE_y
#regionLabels = regionLabels3
# train+eval vs test
if experiment_type == 0:
trainData = np.r_[data_train, data_eval]
testData = data_test
testPID = pidlist_test
LB = LB_y
LE = LE_y
elif experiment_type == 1 or experiment_type == 2:
# 2 region: saxon vs limburg/ groningen vs limburg
trainData = data2_train
testData = data2_test
testPID = pidlist2_test
LB = LB_y2
LE = LE_y2
# from here on regionLabels refers to the 2-region label list.
regionLabels = regionLabels2
# check the number of utterance
allData = np.r_[trainData, testData]
filenames = np.c_[allData[:, 0], allData[:, 2]]
filenames_unique = np.unique(filenames, axis=0)
# NOTE(review): the Counter result is not stored or printed, so this line only
# shows a value when run interactively.
Counter(filenames_unique[:, 1])
fileComparison = dirOut + "\\algorithm_comparison.csv"
filePerformance = dirOut + "\\sentence-level.csv"
fileConfusionMatrix = dirOut + "\\confusion_matrix.csv"
## compare classification algorithms for the sentence-classifiers.
#sb_func.compare_sentence_level_classifiers(trainData, LBlist, LE, fileComparison)
## train sentence-level classifiers.
modelList, scoreList, confusionMatrixList = sb_func.train_sentence_level_classifiers(
trainData, LBlist, LE, filePerformance)
## prediction over evaluation data per each sentence-level classifier.
pred_per_sentence = sb_func.prediction_per_sentence(testData, modelList, LBlist, LE)
## combine sentence-level classifiers
pred_per_pid_majority = sb_func.prediction_per_pid_majority(testPID, pred_per_sentence)
## majority vote (weighted)
#weight = sb_func.calc_weight(confusionMatrixList)
#pred_per_pid_weighted = sb_func.prediction_per_pid_weighted(testPID, pred_per_sentence, weight, LB, LE)
### confusion matrix
# the label order of the matrix is spelled out per experiment instead of reusing regionLabels.
# NOTE(review): the 2-region branch also runs for experiment_type == 1, whose
# label list contains 'Low_Saxon' rather than 'Groningen_and_Drenthe' - confirm.
if experiment_type == 0:
confusionMatrix_majority = confusion_matrix(
pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg'])
else:
confusionMatrix_majority = confusion_matrix(
pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Limburg'])
#confusionMatrix_weighted = confusion_matrix(
# pred_per_pid_weighted[:, 1], pred_per_pid_weighted[:, 2], labels=regionLabels)
## output
accuracy = accuracy_score(pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], normalize=True, sample_weight=None)
print('accuracy: {}%'.format(accuracy * 100))
cm = confusionMatrix_majority
print(cm)
np.save(dirOut + "\\pred_per_pid.npy", pred_per_pid_majority)
np.save(dirOut + "\\confusion_matrix.npy", cm)
#fout = open(fileConfusionMatrix, "w")
#fout.write('< confusion matrix for majority vote in evaluation set >\n')
#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_majority', regionLabels)
#fout.write('< confusion matrix for weighted vote in evaluation set >\n')
#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_weighted', regionLabels)
#fout.write('\n')
#fout.close()
##### iteration finish #####
# the database connection opened during data preparation is closed only here, at the end of the script.
conn.close()
#np.savetxt(dirOut + '\\cm_majority.csv', CM_majority, delimiter=',')
#np.savetxt(dirOut + '\\cm_weighted.csv', CM_weighted, delimiter=',')

BIN
output/confusion_matrix_2regions.npy

Binary file not shown.

BIN
output/confusion_matrix_2regions.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

BIN
output/confusion_matrix_2regions_normalized.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

BIN
output/confusion_matrix_3regions.npy

Binary file not shown.

BIN
output/confusion_matrix_3regions.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

BIN
output/confusion_matrix_3regions_normalized.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 33 KiB

BIN
output/pidlist_2regions_test.npy

Binary file not shown.

BIN
output/pidlist_2regions_train.npy

Binary file not shown.

BIN
output/pidlist_3regions_eval.npy

Binary file not shown.

BIN
output/pidlist_3regions_test.npy

Binary file not shown.

BIN
output/pidlist_3regions_train.npy

Binary file not shown.

BIN
output/pred_per_pid_2regions.npy

Binary file not shown.

BIN
output/pred_per_pid_3regions.npy

Binary file not shown.
Loading…
Cancel
Save