cleaned up the INTERSPEECH-related code.

master
yemaozi88 5 years ago
parent a1379caced
commit eb65543781

@@ -3,8 +3,6 @@ Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.26730.12
MinimumVisualStudioVersion = 10.0.40219.1
Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "dialect_identification", "dialect_identification\dialect_identification.pyproj", "{FE1B1358-ADBE-4446-AFFD-A0802D13D15B}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{5A4286D1-F037-43D4-90F8-05C5CCC0CA30}"
ProjectSection(SolutionItems) = preProject
..\..\forced-alignment\forced_alignment\convert_phone_set.py = ..\..\forced-alignment\forced_alignment\convert_phone_set.py
@@ -20,6 +18,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution
..\..\forced-alignment\forced_alignment\test_environment.py = ..\..\forced-alignment\forced_alignment\test_environment.py
EndProjectSection
EndProject
Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "accent_classification", "accent_classification\accent_classification.pyproj", "{FE1B1358-ADBE-4446-AFFD-A0802D13D15B}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU

@@ -5,7 +5,7 @@
<ProjectGuid>fe1b1358-adbe-4446-affd-a0802d13d15b</ProjectGuid>
<ProjectTypeGuids>{a41c8ea1-112a-4a2d-9f91-29557995525f};{888888a0-9f3d-457c-b088-3a5042f75d52}</ProjectTypeGuids>
<ProjectHome>.</ProjectHome>
<StartupFile>output_confusion_matrix.py</StartupFile>
<StartupFile>speaker_based.py</StartupFile>
<SearchPath>
</SearchPath>
<WorkingDirectory>.</WorkingDirectory>
@@ -22,6 +22,8 @@
<EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
</PropertyGroup>
<ItemGroup>
<Compile Include="data_io.py" />
<Compile Include="data_manipulation.py" />
<Compile Include="manipulate_db.py">
<SubType>Code</SubType>
</Compile>
@@ -29,9 +31,6 @@
<SubType>Code</SubType>
</Compile>
<Compile Include="classifier.py" />
<Compile Include="dataManipulation.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="output_confusion_matrix.py">
<SubType>Code</SubType>
</Compile>
@@ -53,7 +52,6 @@
<Compile Include="word_based.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="dataIO.py" />
</ItemGroup>
<ItemGroup>
<Content Include="config.ini" />

@@ -1,6 +1,5 @@
import os
import sys
import configparser
import numpy as np
import pypyodbc
@@ -20,16 +19,10 @@ sys.path.append(forced_alignment_module)
from forced_alignment import forced_alignment
## check if forced-alignment works in each sentence
## delete all automatically generated pronunciations
#from forced_alignment import pronunciations
#pronunciations.delete_all_g2p_entries()
#wav_file = wav_dir + '\\10\\' + regionLabels[0] + '\\9935-1464218044-1951631.wav'
#script_file = script_dir + '\\script10.txt'
#with open(script_file, 'r') as fin:
# script = fin.readline()
#fa = forced_alignment(wav_file, script)
## make database connection
param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";"

@@ -0,0 +1,267 @@
import os
import sys
import configparser
import pypyodbc
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
repo_dir = 'C:\\Users\\Aki\\source\\repos\\accent_classification'
curr_dir = repo_dir + '\\accent_classification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), curr_dir))
import data_manipulation as mani
import evaluation as eval
import speaker_based_functions as sb_func
## ======================= user-defined settings =======================
sentence_num_max = 10
config_file = curr_dir + '\\config.ini'
output_dir = repo_dir + '\\output'
# make train/test set: 1, load: 0
make_train_test_set = 0
# specify which experiment is to be performed.
# - 3: groningen vs oost_overijssel vs limburg
# - 2: groningen vs limburg
experiment_type = 2
region_labels3 = ['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg']
region_labels2 = ['Groningen_and_Drenthe', 'Limburg']
## ======================= data preparation =======================
## load variables from the ini file
config = configparser.ConfigParser()
config.sections()
config.read(config_file)
MDB_file = config['sentence_based']['fileMDB']
## connect to the database
pypyodbc.lowercase = False
param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + MDB_file + ";"
conn = pypyodbc.connect(param)
cursor = conn.cursor()
## get data from Access database
# data format
# 0: filename
# 1: pid
# 2: region
# 3: ID (unique word_id)
# 4: sentence_id
# 5: word_id
# 6: word
# 7: pronunciation
SQL_string = """\
{CALL dataset_with_cities}
"""
cursor.execute(SQL_string)
rows = cursor.fetchall()
data = np.array(rows)
del SQL_string, rows
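## --- illustrative sketch (not part of the commit) ------------------------
## A toy example of the column layout documented above; filenames, words and
## pronunciations are invented purely to show how the array is indexed later.
import numpy as np  # repeated so the sketch runs on its own
toy_data = np.array([
    ['rec-0001.wav', 'p001', 'Limburg',               '1', '1', '1', 'goed', 'G u t'],
    ['rec-0002.wav', 'p002', 'Groningen_and_Drenthe', '1', '1', '1', 'goed', 'x u t'],
])
print(toy_data[:, 2])  # region column
print(toy_data[:, 7])  # pronunciation column
## --------------------------------------------------------------------------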
## get the list of pronunciation variants (pronvarList) from the Access database
# pronvarList format
# 0: ID (unique word_id)
# 1: word
# 2: pronvar
SQL_string = """\
{CALL pronunciation_variant}
"""
cursor.execute(SQL_string)
rows = cursor.fetchall()
pronvarList = np.array(rows)
del SQL_string, rows
conn.close()
## make a list of LabelBinarizer objects, one per word, for X (=pronunciation variant).
LB_list = []
unique_wordID_list = data[:, 3].astype(int)
unique_wordID_max = max(unique_wordID_list)
for unique_wordID in range(1, unique_wordID_max+1):
pronvar = data[unique_wordID_list == unique_wordID, 7]
LB = preprocessing.LabelBinarizer()
LB.fit(np.unique(pronvar))
LB_list.append(LB)
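## --- illustrative sketch (not part of the commit) ------------------------
## What one of the per-word binarizers does: each pronunciation variant of a
## word is mapped to a one-hot vector. The variants below are invented.
from sklearn import preprocessing  # repeated so the sketch runs on its own
toy_LB = preprocessing.LabelBinarizer()
toy_LB.fit(['G u d', 'G u t', 'x u t'])
print(toy_LB.classes_)               # alphabetically sorted variants
print(toy_LB.transform(['x u t']))   # -> [[0 0 1]]
## --------------------------------------------------------------------------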
## make LabelEncoder/LabelBinarizer objects for y (=region).
LE_y3 = preprocessing.LabelEncoder()
LE_y3.fit(region_labels3)
LE_y2 = preprocessing.LabelEncoder()
LE_y2.fit(region_labels2)
LB_y3 = preprocessing.LabelBinarizer()
LB_y3.fit(region_labels3)
LB_y2 = preprocessing.LabelBinarizer()
LB_y2.fit(region_labels2)
del unique_wordID, unique_wordID_max, pronvar, LB
## ======================= make train/eval/test set or load =======================
## find the smallest group to balance the number of samples per group.
pidlist3 = np.unique(data[:, (1, 2)], axis=0)
pidlist3_counter = Counter(pidlist3[:, 1])
sample_num_max = min(pidlist3_counter.values())
del pidlist3_counter
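## --- illustrative sketch (not part of the commit) ------------------------
## How the balancing above works: count speakers per region and cap every
## region at the size of the smallest one. The counts are invented.
from collections import Counter  # repeated so the sketch runs on its own
toy_counter = Counter({'Groningen_and_Drenthe': 60,
                       'Oost_Overijsel-Gelderland': 45,
                       'Limburg': 50})
print(min(toy_counter.values()))  # 45 speakers would be drawn from each region
## --------------------------------------------------------------------------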
## make train/eval/test set or load them.
if make_train_test_set==1:
pidlist3_train = []
pidlist3_eval = []
pidlist3_test = []
for region_num in range(0, len(region_labels3)):
region_name = region_labels3[region_num]
pidlist3_per_region_ = pidlist3[pidlist3[:, 1]==region_labels3[region_num], :]
pidlist3_per_region, idx = mani.extractRandomSample(
pidlist3_per_region_, sample_num_max)
# split dataset into train, eval and test.
[pidlist3_per_region_train, pidlist3_per_region_test] = train_test_split(
pidlist3_per_region, test_size = 0.2, random_state = 0)
[pidlist3_per_region_train, pidlist3_per_region_eval] = train_test_split(
pidlist3_per_region_train, test_size = 0.1, random_state = 0)
# append numpy arrays.
if region_num == 0:
pidlist3_train = pidlist3_per_region_train
pidlist3_eval = pidlist3_per_region_eval
pidlist3_test = pidlist3_per_region_test
else:
pidlist3_train = np.r_[pidlist3_train, pidlist3_per_region_train]
pidlist3_eval = np.r_[pidlist3_eval, pidlist3_per_region_eval]
pidlist3_test = np.r_[pidlist3_test, pidlist3_per_region_test]
del region_num, region_name
del pidlist3_per_region_, pidlist3_per_region, idx
del pidlist3_per_region_train, pidlist3_per_region_eval, pidlist3_per_region_test
np.save(output_dir + "\\pidlist3_train.npy", pidlist3_train)
np.save(output_dir + "\\pidlist3_eval.npy", pidlist3_eval)
np.save(output_dir + "\\pidlist3_test.npy", pidlist3_test)
if experiment_type == 2:
pidlist2_train_ = np.r_[pidlist3_train, pidlist3_eval]
pidlist2_train = sb_func.groningen_vs_limburg(pidlist2_train_)
pidlist2_test = sb_func.groningen_vs_limburg(pidlist3_test)
np.save(output_dir + "\\pidlist2_train", pidlist2_train)
np.save(output_dir + "\\pidlist2_test", pidlist2_test)
del pidlist2_train_
else:
pidlist3_train = np.load(output_dir + "\\pidlist3_train.npy")
pidlist3_eval = np.load(output_dir + "\\pidlist3_eval.npy")
pidlist3_test = np.load(output_dir + "\\pidlist3_test.npy")
if experiment_type == 2:
pidlist2_train = np.load(output_dir + "\\pidlist2_train.npy")
pidlist2_test = np.load(output_dir + "\\pidlist2_test.npy")
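## --- illustrative sketch (not part of the commit) ------------------------
## The nested splits above give roughly 72% train / 8% eval / 20% test per
## region: with e.g. 50 speakers, test_size=0.2 leaves 40, and test_size=0.1
## of those 40 leaves 36 for training and 4 for evaluation.
import numpy as np
from sklearn.model_selection import train_test_split  # repeated so the sketch runs on its own
toy_pids = np.arange(50)
toy_train, toy_test = train_test_split(toy_pids, test_size=0.2, random_state=0)
toy_train, toy_eval = train_test_split(toy_train, test_size=0.1, random_state=0)
print(len(toy_train), len(toy_eval), len(toy_test))  # 36 4 10
## --------------------------------------------------------------------------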
## extract corresponding data using pid
data3_train = sb_func.extractPid(pidlist3_train, data)
data3_eval = sb_func.extractPid(pidlist3_eval, data)
data3_test = sb_func.extractPid(pidlist3_test, data)
if experiment_type == 2:
data2 = np.array(data)
data2_train = sb_func.extractPid(pidlist2_train, data2)
data2_test = sb_func.extractPid(pidlist2_test, data2)
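## --- illustrative sketch (not part of the commit) ------------------------
## Assumption: sb_func.extractPid keeps the rows of `data` whose pid
## (column 1) occurs in the given pidlist. A rough stand-in could look like:
import numpy as np  # repeated so the sketch runs on its own
def extract_pid_sketch(pidlist, data_array):
    # boolean mask: True where the row's pid is one of the selected pids
    mask = np.isin(data_array[:, 1], pidlist[:, 0])
    return data_array[mask, :]
## --------------------------------------------------------------------------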
## ======================= experiments =======================
## specify the dataset
# train vs eval
#trainData = data3_train
#testData = data3_eval
#testPID = pidlist3_eval
#LB = LB_y3
#LE = LE_y3
#region_labels = region_labels3
# train+eval vs test
if experiment_type == 3:
trainData = np.r_[data3_train, data3_eval]
testData = data3_test
testPID = pidlist3_test
LB = LB_y3
LE = LE_y3
region_labels = region_labels3
elif experiment_type == 2:
trainData = data2_train
testData = data2_test
testPID = pidlist2_test
LB = LB_y2
LE = LE_y2
region_labels = region_labels2
## check the number of utterances
#data_all = np.r_[trainData, testData]
#filenames = np.c_[data_all[:, 0], data_all[:, 2]]
#filenames_unique = np.unique(filenames, axis=0)
#Counter(filenames_unique[:, 1])
## output filenames
fileComparison = output_dir + "\\algorithm_comparison.csv"
filePerformance = output_dir + "\\sentence-level.csv"
fileConfusionMatrix = output_dir + "\\confusion_matrix.csv"
## compare classification algorithms for the sentence-classifiers.
#sb_func.compare_sentence_level_classifiers(trainData, LB_list, LE, fileComparison)
## train sentence-level classifiers.
model_list, score_list, confusion_matrix_list = sb_func.train_sentence_level_classifiers(
trainData, LB_list, LE, filePerformance)
## prediction over the evaluation data for each sentence-level classifier.
pred_per_sentence = sb_func.prediction_per_sentence(testData, model_list, LB_list, LE)
## combine sentence-level classifiers
pred_per_pid_majority = sb_func.prediction_per_pid_majority(testPID, pred_per_sentence)
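## --- illustrative sketch (not part of the commit) ------------------------
## Assumption: prediction_per_pid_majority takes, per speaker (pid), a
## majority vote over that speaker's sentence-level predictions. In essence:
from collections import Counter  # repeated so the sketch runs on its own
toy_votes = ['Limburg', 'Limburg', 'Groningen_and_Drenthe']
print(Counter(toy_votes).most_common(1)[0][0])  # -> 'Limburg'
## --------------------------------------------------------------------------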
## confusion matrix
confusionMatrix_majority = confusion_matrix(
pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=region_labels)
## output
accuracy = accuracy_score(pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], normalize=True, sample_weight=None)
print('accuracy: {}%'.format(accuracy * 100))
cm = confusionMatrix_majority
print(cm)
np.save(output_dir + "\\pred_per_pid2.npy", pred_per_pid_majority)
np.save(output_dir + "\\confusion_matrix2.npy", cm)
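## --- illustrative sketch (not part of the commit) ------------------------
## The saved confusion matrix can be normalised row-wise (recall per region
## when rows hold the true labels); shown here on an invented 2x2 matrix.
import numpy as np  # repeated so the sketch runs on its own
toy_cm = np.array([[18, 2],
                   [4, 16]])
print(toy_cm / toy_cm.sum(axis=1, keepdims=True))  # each row sums to 1
## --------------------------------------------------------------------------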

@@ -14,7 +14,7 @@ from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
import dataManipulation as mani
import data_manipulation as mani
import evaluation as eval
@@ -338,34 +338,6 @@ def prediction_per_pid_weighted(pidlist_eval, prediction, weight, LB_y, LE_y):
return np.array(prediction_per_pid)
def saxon_vs_limburg(pidlist3):
"""convert a pidlist for 3 regions into that for 2 regions.
Notes:
3 regions include ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
2 regions include ['Limburg', 'Low_Saxon']
where Low_Saxon = 'Groningen_and_Drenthe' + 'Oost_Overijsel-Gelderland'
samples are randomly chosen so that each class has the same amount of data.
"""
regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
regionLabels2 = ['Low_Saxon', 'Limburg']
index_saxon = np.any([pidlist3[:, 1] == regionLabels[0], pidlist3[:, 1] == regionLabels[2]], axis=0)
pidlist_saxon_ = pidlist3[index_saxon, :]
pidlist_limburg = pidlist3[pidlist3[:, 1] == regionLabels[1], :]
# extract the same amount of samples as Limburg.
pidlistCounter3 = Counter(pidlist3[:, 1])
pidlist_saxon, idx = mani.extractRandomSample(pidlist_saxon_, pidlistCounter3['Limburg'])
pidlist_saxon[:, 1] = regionLabels2[0]
pidlist2 = np.r_[pidlist_limburg, pidlist_saxon]
#pidlistCounter2 = Counter(pidlist2[:, 1])
return pidlist2
def groningen_vs_limburg(pidlist3):
"""convert a pidlist for 3 regions into that for 2 regions.
@@ -374,7 +346,7 @@ def groningen_vs_limburg(pidlist3):
2 regions include ['Groningen_and_Drenthe', 'Limburg']
"""
regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
regionLabels = ['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg']
pidlist_groningen = pidlist3[pidlist3[:, 1] == regionLabels[0], :]
pidlist_limburg = pidlist3[pidlist3[:, 1] == regionLabels[1], :]

@@ -1,326 +0,0 @@
import os
import sys
import configparser
import pypyodbc
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))
import dataManipulation as mani
import evaluation as eval
import speaker_based_functions as sb_func
#####################
## USER DEFINE ##
#####################
sentenceNumMax = 10
configFile = currDir + '\\config.ini'
dirOut = currDir + '\\result'
# make train/test set: 1, load: 0
makeTrainTestSet = 0
# convert 3 regions to 2 regions: 1, load: 0
conv3to2region = 0
# 3 regions: 0
# saxon vs limburg: 1
# groningen vs limburg: 2
experiment_type = 2
regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
# a bit useless error handling.
#assert (experiment_type in (0, 1, 2)), "experiment type should be 0, 1 or 2."
if experiment_type == 1:
regionLabels2 = ['Low_Saxon', 'Limburg']
regionLabels2 = ['Groningen_and_Drenthe', 'Limburg']
##########################
## DATA PREPARATION ##
##########################
## load init file
config = configparser.ConfigParser()
config.sections()
config.read(configFile)
dirFeature = config['sentence_based']['dirFeature']
fileMDB = config['sentence_based']['fileMDB']
## database connection
pypyodbc.lowercase = False
param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";"
conn = pypyodbc.connect(param)
cursor = conn.cursor()
## get data from Access database
# data format
# 0: filename
# 1: pid
# 2: region
# 3: ID (unique word_id)
# 4: sentence_id
# 5: word_id
# 6: word
# 7: pronunciation
SQL_string = """\
{CALL dataset_with_cities}
"""
cursor.execute(SQL_string)
rows = cursor.fetchall()
data = np.array(rows)
#dataNumMax = data.shape[0]
#uniqueWordIDmax = max(data[:, 3].astype(int))
del SQL_string, rows
## make list of LabelBinarizer object per word.
# for X
# get pronvarList from Access database
# pronvarList format
# 0: ID (unique word_id)
# 1: word
# 2: pronvar
SQL_string = """\
{CALL pronunciation_variant}
"""
cursor.execute(SQL_string)
rows = cursor.fetchall()
pronvarList = np.array(rows)
del SQL_string, rows
LBlist = []
#uniqueWordIDlist = pronvarList[:, 0].astype(int)
uniqueWordIDlist = data[:, 3].astype(int)
uniqueWordIDmax = max(uniqueWordIDlist)
for uniqueWordID in range(1, uniqueWordIDmax+1):
pronvar = data[uniqueWordIDlist == uniqueWordID, 7]
#pronvar = pronvarList[pronvarList[:, 0] == uniqueWordID, 2]
LB = preprocessing.LabelBinarizer()
LB.fit(np.unique(pronvar))
LBlist.append(LB)
# for y (=region)
LE_y = preprocessing.LabelEncoder()
LE_y.fit(regionLabels)
LE_y2 = preprocessing.LabelEncoder()
LE_y2.fit(regionLabels2)
LB_y = preprocessing.LabelBinarizer()
LB_y.fit(regionLabels)
LB_y2 = preprocessing.LabelBinarizer()
LB_y2.fit(regionLabels2)
del uniqueWordID, uniqueWordIDmax, pronvar, LB
#################
## ITERATION ##
#################
#CM_majority = np.zeros((1, 9)).astype(int)
#CM_weighted = np.zeros((1, 9)).astype(int)
#for iter in range(0, 1):
# print(iter)
## make balanced dataset
pidlist = np.unique(data[:, (1, 2)], axis=0)
# count number of samples
pidlistCounter = Counter(pidlist[:, 1])
sampleNumMax = min(pidlistCounter.values())
del pidlistCounter
## make train/eval/test set or load
if makeTrainTestSet==1:
pidlist_train = []
pidlist_eval = []
pidlist_test = []
for regionNum in range(0, len(regionLabels)):
regionName = regionLabels[regionNum]
pidlist_per_region_ = pidlist[pidlist[:, 1]==regionLabels[regionNum], :]
pidlist_per_region, idx = mani.extractRandomSample(
pidlist_per_region_, sampleNumMax)
# split dataset into train, eval and test.
[pidlist_per_region_train, pidlist_per_region_test] = train_test_split(
pidlist_per_region, test_size = 0.2, random_state = 0)
[pidlist_per_region_train, pidlist_per_region_eval] = train_test_split(
pidlist_per_region_train, test_size = 0.1, random_state = 0)
# append numpy arrays
if regionNum == 0:
pidlist_train = pidlist_per_region_train
pidlist_eval = pidlist_per_region_eval
pidlist_test = pidlist_per_region_test
else:
pidlist_train = np.r_[pidlist_train, pidlist_per_region_train]
pidlist_eval = np.r_[pidlist_eval, pidlist_per_region_eval]
pidlist_test = np.r_[pidlist_test, pidlist_per_region_test]
del regionNum, regionName
del pidlist_per_region_, pidlist_per_region, idx
del pidlist_per_region_train, pidlist_per_region_eval, pidlist_per_region_test
np.save(dirOut + "\\pidlist_train.npy", pidlist_train)
np.save(dirOut + "\\pidlist_eval.npy", pidlist_eval)
np.save(dirOut + "\\pidlist_test.npy", pidlist_test)
else:
pidlist_train = np.load(dirOut + "\\pidlist_train.npy")
pidlist_eval = np.load(dirOut + "\\pidlist_eval.npy")
pidlist_test = np.load(dirOut + "\\pidlist_test.npy")
## make dataset for 2 regions or load
if conv3to2region==1:
pidlist2_train_ = np.r_[pidlist_train, pidlist_eval]
if experiment_type == 1:
pidlist2_train = sb_func.saxon_vs_limburg(pidlist2_train_)
pidlist2_test = sb_func.saxon_vs_limburg(pidlist_test)
np.save(dirOut + "\\pidlist2_saxon_vs_limburg_train", pidlist2_train)
np.save(dirOut + "\\pidlist2_saxon_vs_limburg_test", pidlist2_test)
elif experiment_type == 2:
pidlist2_train = sb_func.groningen_vs_limburg(pidlist2_train_)
pidlist2_test = sb_func.groningen_vs_limburg(pidlist_test)
np.save(dirOut + "\\pidlist2_groningen_vs_limburg_train", pidlist2_train)
np.save(dirOut + "\\pidlist2_groningen_vs_limburg_test", pidlist2_test)
del pidlist2_train_
else:
if experiment_type == 1:
pidlist2_train = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_train.npy")
pidlist2_test = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_test.npy")
elif experiment_type == 2:
pidlist2_train = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_train.npy")
pidlist2_test = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_test.npy")
## train/test data
if experiment_type == 0:
# Groningen vs Overijsel vs Limburg
data_train = sb_func.extractPid(pidlist_train, data)
data_eval = sb_func.extractPid(pidlist_eval, data)
data_test = sb_func.extractPid(pidlist_test, data)
elif experiment_type == 1 or experiment_type == 2:
data2 = np.array(data)
if experiment_type == 1:
for row, row2 in zip(data, data2):
if row[2] == regionLabels[0] or row[2] == regionLabels[2]:
row2[2] = regionLabels2[0]
data2_train = sb_func.extractPid(pidlist2_train, data2)
data2_test = sb_func.extractPid(pidlist2_test, data2)
#####################################
## EXPERIMENTS START FROM HERE ##
#####################################
## actual training
# train vs eval
#trainData = data_train
#testData = data_eval
#testPID = pidlist_eval
#LB = LB_y
#LE = LE_y
#regionLabels = regionLabels3
# train+eval vs test
if experiment_type == 0:
trainData = np.r_[data_train, data_eval]
testData = data_test
testPID = pidlist_test
LB = LB_y
LE = LE_y
elif experiment_type == 1 or experiment_type == 2:
# 2 region: saxon vs limburg/ groningen vs limburg
trainData = data2_train
testData = data2_test
testPID = pidlist2_test
LB = LB_y2
LE = LE_y2
regionLabels = regionLabels2
# check the number of utterance
allData = np.r_[trainData, testData]
filenames = np.c_[allData[:, 0], allData[:, 2]]
filenames_unique = np.unique(filenames, axis=0)
Counter(filenames_unique[:, 1])
fileComparison = dirOut + "\\algorithm_comparison.csv"
filePerformance = dirOut + "\\sentence-level.csv"
fileConfusionMatrix = dirOut + "\\confusion_matrix.csv"
## compare classification algorithms for the sentence-classifiers.
#sb_func.compare_sentence_level_classifiers(trainData, LBlist, LE, fileComparison)
## train sentence-level classifiers.
modelList, scoreList, confusionMatrixList = sb_func.train_sentence_level_classifiers(
trainData, LBlist, LE, filePerformance)
## prediction over evaluation data per each sentence-level classifier.
pred_per_sentence = sb_func.prediction_per_sentence(testData, modelList, LBlist, LE)
## combine sentence-level classifiers
pred_per_pid_majority = sb_func.prediction_per_pid_majority(testPID, pred_per_sentence)
## majority vote (weighted)
#weight = sb_func.calc_weight(confusionMatrixList)
#pred_per_pid_weighted = sb_func.prediction_per_pid_weighted(testPID, pred_per_sentence, weight, LB, LE)
### confusion matrix
if experiment_type == 0:
confusionMatrix_majority = confusion_matrix(
pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg'])
else:
confusionMatrix_majority = confusion_matrix(
pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Limburg'])
#confusionMatrix_weighted = confusion_matrix(
# pred_per_pid_weighted[:, 1], pred_per_pid_weighted[:, 2], labels=regionLabels)
## output
accuracy = accuracy_score(pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], normalize=True, sample_weight=None)
print('accuracy: {}%'.format(accuracy * 100))
cm = confusionMatrix_majority
print(cm)
np.save(dirOut + "\\pred_per_pid.npy", pred_per_pid_majority)
np.save(dirOut + "\\confusion_matrix.npy", cm)
#fout = open(fileConfusionMatrix, "w")
#fout.write('< confusion matrix for majority vote in evaluation set >\n')
#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_majority', regionLabels)
#fout.write('< confusion matrix for weighted vote in evaluation set >\n')
#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_weighted', regionLabels)
#fout.write('\n')
#fout.close()
##### iteration finish #####
conn.close()
#np.savetxt(dirOut + '\\cm_majority.csv', CM_majority, delimiter=',')
#np.savetxt(dirOut + '\\cm_weighted.csv', CM_weighted, delimiter=',')

Binary files not shown (11 binary files changed; four added image previews: 18 KiB, 21 KiB, 22 KiB, 33 KiB).