Cleaned up the INTERSPEECH-related code.
parent a1379caced
commit eb65543781
@@ -0,0 +1,267 @@
import os
import sys
import configparser

import pypyodbc
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

repo_dir = 'C:\\Users\\Aki\\source\\repos\\accent_classification'
curr_dir = repo_dir + '\\accent_classification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), curr_dir))
import data_manipulation as mani
import evaluation as eval
import speaker_based_functions as sb_func


## ======================= user define =======================
sentence_num_max = 10
config_file = curr_dir + '\\config.ini'
output_dir = repo_dir + '\\output'

# make train/test set: 1, load: 0
make_train_test_set = 0

# specify which experiment is to be performed.
# - 3: groningen vs oost_overijssel vs limburg
# - 2: groningen vs limburg
experiment_type = 2

region_labels3 = ['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg']
region_labels2 = ['Groningen_and_Drenthe', 'Limburg']


## ======================= data preparation =======================

## load variables from the ini file
config = configparser.ConfigParser()
config.sections()
config.read(config_file)
MDB_file = config['sentence_based']['fileMDB']
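# The ini file is expected to provide the Access database path under a
# [sentence_based] section; a minimal sketch (path purely illustrative):
#
#   [sentence_based]
#   fileMDB = C:\path\to\speech_database.mdb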


## connect to the database
pypyodbc.lowercase = False
param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + MDB_file + ";"
conn = pypyodbc.connect(param)
cursor = conn.cursor()


## get data from the Access database
# data format
# 0: filename
# 1: pid
# 2: region
# 3: ID (unique word_id)
# 4: sentence_id
# 5: word_id
# 6: word
# 7: pronunciation
SQL_string = """\
{CALL dataset_with_cities}
"""
cursor.execute(SQL_string)

rows = cursor.fetchall()
data = np.array(rows)
del SQL_string, rows


## get the list of pronunciation variants (pronvarList) from the Access database
# pronvarList format
# 0: ID (unique word_id)
# 1: word
# 2: pronvar
SQL_string = """\
{CALL pronunciation_variant}
"""
cursor.execute(SQL_string)
rows = cursor.fetchall()
pronvarList = np.array(rows)
del SQL_string, rows

conn.close()


## make a list of LabelBinarizer objects, one per word, for X (=pronunciation variant).
LB_list = []
unique_wordID_list = data[:, 3].astype(int)
unique_wordID_max = max(unique_wordID_list)
for unique_wordID in range(1, unique_wordID_max+1):
    pronvar = data[unique_wordID_list == unique_wordID, 7]
    LB = preprocessing.LabelBinarizer()
    LB.fit(np.unique(pronvar))
    LB_list.append(LB)
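
# Each fitted LabelBinarizer one-hot encodes the pronunciation variants seen for
# one word. Illustrative only (the actual variants come from the data): with
# three variants fitted in alphabetical order,
#   LB_list[0].transform(['variant_b'])  ->  array([[0, 1, 0]])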


## make LabelEncoder/LabelBinarizer objects for y (=region).
LE_y3 = preprocessing.LabelEncoder()
LE_y3.fit(region_labels3)
LE_y2 = preprocessing.LabelEncoder()
LE_y2.fit(region_labels2)

LB_y3 = preprocessing.LabelBinarizer()
LB_y3.fit(region_labels3)
LB_y2 = preprocessing.LabelBinarizer()
LB_y2.fit(region_labels2)

del unique_wordID, unique_wordID_max, pronvar, LB


## ======================= make train/eval/test set or load =======================

## find the smallest group to balance the number of samples per group.
pidlist3 = np.unique(data[:, (1, 2)], axis=0)
pidlist3_counter = Counter(pidlist3[:, 1])
sample_num_max = min(pidlist3_counter.values())
del pidlist3_counter
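
# Illustrative only: if the Counter gave {'Groningen_and_Drenthe': 80,
# 'Limburg': 60, 'Oost_Overijsel-Gelderland': 70}, sample_num_max would be 60,
# i.e. every region is capped at 60 speakers.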


## make train/eval/test set or load them.
if make_train_test_set == 1:
    pidlist3_train = []
    pidlist3_eval = []
    pidlist3_test = []
    for region_num in range(0, len(region_labels3)):
        region_name = region_labels3[region_num]

        pidlist3_per_region_ = pidlist3[pidlist3[:, 1]==region_labels3[region_num], :]
        pidlist3_per_region, idx = mani.extractRandomSample(
            pidlist3_per_region_, sample_num_max)

        # split dataset into train, eval and test.
        [pidlist3_per_region_train, pidlist3_per_region_test] = train_test_split(
            pidlist3_per_region, test_size = 0.2, random_state = 0)
        [pidlist3_per_region_train, pidlist3_per_region_eval] = train_test_split(
            pidlist3_per_region_train, test_size = 0.1, random_state = 0)
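
        # Note: splitting off 20% for test and then 10% of the remainder for eval
        # leaves roughly 72% train / 8% eval / 20% test per region.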

        # append numpy arrays.
        if region_num == 0:
            pidlist3_train = pidlist3_per_region_train
            pidlist3_eval = pidlist3_per_region_eval
            pidlist3_test = pidlist3_per_region_test
        else:
            pidlist3_train = np.r_[pidlist3_train, pidlist3_per_region_train]
            pidlist3_eval = np.r_[pidlist3_eval, pidlist3_per_region_eval]
            pidlist3_test = np.r_[pidlist3_test, pidlist3_per_region_test]
    del region_num, region_name
    del pidlist3_per_region_, pidlist3_per_region, idx
    del pidlist3_per_region_train, pidlist3_per_region_eval, pidlist3_per_region_test
    np.save(output_dir + "\\pidlist3_train.npy", pidlist3_train)
    np.save(output_dir + "\\pidlist3_eval.npy", pidlist3_eval)
    np.save(output_dir + "\\pidlist3_test.npy", pidlist3_test)

    if experiment_type == 2:
        pidlist2_train_ = np.r_[pidlist3_train, pidlist3_eval]

        pidlist2_train = sb_func.groningen_vs_limburg(pidlist2_train_)
        pidlist2_test = sb_func.groningen_vs_limburg(pidlist3_test)
        np.save(output_dir + "\\pidlist2_train", pidlist2_train)
        np.save(output_dir + "\\pidlist2_test", pidlist2_test)

        del pidlist2_train_
else:
    pidlist3_train = np.load(output_dir + "\\pidlist3_train.npy")
    pidlist3_eval = np.load(output_dir + "\\pidlist3_eval.npy")
    pidlist3_test = np.load(output_dir + "\\pidlist3_test.npy")

    if experiment_type == 2:
        pidlist2_train = np.load(output_dir + "\\pidlist2_train.npy")
        pidlist2_test = np.load(output_dir + "\\pidlist2_test.npy")


## extract the corresponding data using pid
data3_train = sb_func.extractPid(pidlist3_train, data)
data3_eval = sb_func.extractPid(pidlist3_eval, data)
data3_test = sb_func.extractPid(pidlist3_test, data)

if experiment_type == 2:
    data2 = np.array(data)
    data2_train = sb_func.extractPid(pidlist2_train, data2)
    data2_test = sb_func.extractPid(pidlist2_test, data2)


## ======================= experiments =======================

## specify the dataset

# train vs eval
#trainData = data3_train
#testData = data3_eval
#testPID = pidlist3_eval
#LB = LB_y3
#LE = LE_y3
#region_labels = region_labels3

# train+eval vs test
if experiment_type == 3:
    trainData = np.r_[data3_train, data3_eval]
    testData = data3_test
    testPID = pidlist3_test
    LB = LB_y3
    LE = LE_y3
    region_labels = region_labels3

elif experiment_type == 2:
    trainData = data2_train
    testData = data2_test
    testPID = pidlist2_test
    LB = LB_y2
    LE = LE_y2
    region_labels = region_labels2

## check the number of utterances
#data_all = np.r_[trainData, testData]
#filenames = np.c_[data_all[:, 0], data_all[:, 2]]
#filenames_unique = np.unique(filenames, axis=0)
#Counter(filenames_unique[:, 1])


## output filenames
fileComparison = output_dir + "\\algorithm_comparison.csv"
filePerformance = output_dir + "\\sentence-level.csv"
fileConfusionMatrix = output_dir + "\\confusion_matrix.csv"


## compare classification algorithms for the sentence-level classifiers.
#sb_func.compare_sentence_level_classifiers(trainData, LB_list, LE, fileComparison)


## train sentence-level classifiers.
model_list, score_list, confusion_matrix_list = sb_func.train_sentence_level_classifiers(
    trainData, LB_list, LE, filePerformance)
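# The three lists presumably hold, per sentence-level classifier, the trained
# model, its score and its confusion matrix (as the variable names suggest);
# the exact contents are defined in the speaker_based_functions module.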


## prediction over the test data for each sentence-level classifier.
pred_per_sentence = sb_func.prediction_per_sentence(testData, model_list, LB_list, LE)


## combine sentence-level classifiers by majority vote.
pred_per_pid_majority = sb_func.prediction_per_pid_majority(testPID, pred_per_sentence)
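
# prediction_per_pid_majority presumably assigns each speaker (pid) the region
# predicted by the most sentence-level classifiers. A minimal sketch of that idea,
# assuming rows of pred_per_sentence look like [pid, true_region, prediction]:
#
#   def majority_vote(pid, rows):
#       votes = [r[2] for r in rows if r[0] == pid]
#       return Counter(votes).most_common(1)[0][0]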


## confusion matrix
confusionMatrix_majority = confusion_matrix(
    pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=region_labels)
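
# With confusion_matrix(y_true, y_pred, labels=region_labels), rows correspond to
# the true regions and columns to the predicted ones, both ordered as in
# region_labels; here column 1 of pred_per_pid_majority is passed as y_true and
# column 2 as y_pred.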


## output
accuracy = accuracy_score(pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], normalize=True, sample_weight=None)
print('accuracy: {}%'.format(accuracy * 100))

cm = confusionMatrix_majority
print(cm)

np.save(output_dir + "\\pred_per_pid2.npy", pred_per_pid_majority)
np.save(output_dir + "\\confusion_matrix2.npy", cm)
@@ -1,326 +0,0 @@
import os
import sys
import configparser

import pypyodbc
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))
import dataManipulation as mani
import evaluation as eval
import speaker_based_functions as sb_func


#####################
##  USER DEFINE   ##
#####################
sentenceNumMax = 10
configFile = currDir + '\\config.ini'
dirOut = currDir + '\\result'

# make train/test set: 1, load: 0
makeTrainTestSet = 0
# convert 3 regions to 2 regions: 1, load: 0
conv3to2region = 0

# 3 regions: 0
# saxon vs limburg: 1
# groningen vs limburg: 2
experiment_type = 2

regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']

# a bit useless error handling.
#assert (experiment_type in (0, 1, 2)), "experiment type should be 0, 1 or 2."
if experiment_type == 1:
    regionLabels2 = ['Low_Saxon', 'Limburg']
regionLabels2 = ['Groningen_and_Drenthe', 'Limburg']


##########################
##  DATA PREPARATION   ##
##########################

## load the ini file
config = configparser.ConfigParser()
config.sections()
config.read(configFile)
dirFeature = config['sentence_based']['dirFeature']
fileMDB = config['sentence_based']['fileMDB']


## database connection
pypyodbc.lowercase = False
param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";"
conn = pypyodbc.connect(param)
cursor = conn.cursor()


## get data from the Access database
# data format
# 0: filename
# 1: pid
# 2: region
# 3: ID (unique word_id)
# 4: sentence_id
# 5: word_id
# 6: word
# 7: pronunciation
SQL_string = """\
{CALL dataset_with_cities}
"""
cursor.execute(SQL_string)

rows = cursor.fetchall()
data = np.array(rows)
#dataNumMax = data.shape[0]
#uniqueWordIDmax = max(data[:, 3].astype(int))
del SQL_string, rows


## make a list of LabelBinarizer objects, one per word.
# for X
# get pronvarList from the Access database
# pronvarList format
# 0: ID (unique word_id)
# 1: word
# 2: pronvar
SQL_string = """\
{CALL pronunciation_variant}
"""
cursor.execute(SQL_string)
rows = cursor.fetchall()
pronvarList = np.array(rows)
del SQL_string, rows


LBlist = []
#uniqueWordIDlist = pronvarList[:, 0].astype(int)
uniqueWordIDlist = data[:, 3].astype(int)
uniqueWordIDmax = max(uniqueWordIDlist)
for uniqueWordID in range(1, uniqueWordIDmax+1):
    pronvar = data[uniqueWordIDlist == uniqueWordID, 7]
    #pronvar = pronvarList[pronvarList[:, 0] == uniqueWordID, 2]
    LB = preprocessing.LabelBinarizer()
    LB.fit(np.unique(pronvar))
    LBlist.append(LB)

# for y (=region)
LE_y = preprocessing.LabelEncoder()
LE_y.fit(regionLabels)
LE_y2 = preprocessing.LabelEncoder()
LE_y2.fit(regionLabels2)

LB_y = preprocessing.LabelBinarizer()
LB_y.fit(regionLabels)
LB_y2 = preprocessing.LabelBinarizer()
LB_y2.fit(regionLabels2)

del uniqueWordID, uniqueWordIDmax, pronvar, LB


#################
##  ITERATION  ##
#################
#CM_majority = np.zeros((1, 9)).astype(int)
#CM_weighted = np.zeros((1, 9)).astype(int)
#for iter in range(0, 1):
#    print(iter)

## make balanced dataset
pidlist = np.unique(data[:, (1, 2)], axis=0)

# count number of samples
pidlistCounter = Counter(pidlist[:, 1])
sampleNumMax = min(pidlistCounter.values())
del pidlistCounter


## make train/eval/test set or load
if makeTrainTestSet==1:
    pidlist_train = []
    pidlist_eval = []
    pidlist_test = []
    for regionNum in range(0, len(regionLabels)):
        regionName = regionLabels[regionNum]

        pidlist_per_region_ = pidlist[pidlist[:, 1]==regionLabels[regionNum], :]
        pidlist_per_region, idx = mani.extractRandomSample(
            pidlist_per_region_, sampleNumMax)

        # split dataset into train, eval and test.
        [pidlist_per_region_train, pidlist_per_region_test] = train_test_split(
            pidlist_per_region, test_size = 0.2, random_state = 0)
        [pidlist_per_region_train, pidlist_per_region_eval] = train_test_split(
            pidlist_per_region_train, test_size = 0.1, random_state = 0)

        # append numpy arrays
        if regionNum == 0:
            pidlist_train = pidlist_per_region_train
            pidlist_eval = pidlist_per_region_eval
            pidlist_test = pidlist_per_region_test
        else:
            pidlist_train = np.r_[pidlist_train, pidlist_per_region_train]
            pidlist_eval = np.r_[pidlist_eval, pidlist_per_region_eval]
            pidlist_test = np.r_[pidlist_test, pidlist_per_region_test]
    del regionNum, regionName
    del pidlist_per_region_, pidlist_per_region, idx
    del pidlist_per_region_train, pidlist_per_region_eval, pidlist_per_region_test
    np.save(dirOut + "\\pidlist_train.npy", pidlist_train)
    np.save(dirOut + "\\pidlist_eval.npy", pidlist_eval)
    np.save(dirOut + "\\pidlist_test.npy", pidlist_test)
else:
    pidlist_train = np.load(dirOut + "\\pidlist_train.npy")
    pidlist_eval = np.load(dirOut + "\\pidlist_eval.npy")
    pidlist_test = np.load(dirOut + "\\pidlist_test.npy")


## make dataset for 2 regions or load
if conv3to2region==1:
    pidlist2_train_ = np.r_[pidlist_train, pidlist_eval]

    if experiment_type == 1:
        pidlist2_train = sb_func.saxon_vs_limburg(pidlist2_train_)
        pidlist2_test = sb_func.saxon_vs_limburg(pidlist_test)
        np.save(dirOut + "\\pidlist2_saxon_vs_limburg_train", pidlist2_train)
        np.save(dirOut + "\\pidlist2_saxon_vs_limburg_test", pidlist2_test)

    elif experiment_type == 2:
        pidlist2_train = sb_func.groningen_vs_limburg(pidlist2_train_)
        pidlist2_test = sb_func.groningen_vs_limburg(pidlist_test)
        np.save(dirOut + "\\pidlist2_groningen_vs_limburg_train", pidlist2_train)
        np.save(dirOut + "\\pidlist2_groningen_vs_limburg_test", pidlist2_test)

    del pidlist2_train_
else:
    if experiment_type == 1:
        pidlist2_train = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_train.npy")
        pidlist2_test = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_test.npy")

    elif experiment_type == 2:
        pidlist2_train = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_train.npy")
        pidlist2_test = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_test.npy")


## train/test data
if experiment_type == 0:
    # Groningen vs Overijsel vs Limburg
    data_train = sb_func.extractPid(pidlist_train, data)
    data_eval = sb_func.extractPid(pidlist_eval, data)
    data_test = sb_func.extractPid(pidlist_test, data)

elif experiment_type == 1 or experiment_type == 2:
    data2 = np.array(data)

    if experiment_type == 1:
        for row, row2 in zip(data, data2):
            if row[2] == regionLabels[0] or row[2] == regionLabels[2]:
                row2[2] = regionLabels2[0]

    data2_train = sb_func.extractPid(pidlist2_train, data2)
    data2_test = sb_func.extractPid(pidlist2_test, data2)


#####################################
##  EXPERIMENTS START FROM HERE   ##
#####################################

## actual training
# train vs eval
#trainData = data_train
#testData = data_eval
#testPID = pidlist_eval
#LB = LB_y
#LE = LE_y
#regionLabels = regionLabels3

# train+eval vs test
if experiment_type == 0:
    trainData = np.r_[data_train, data_eval]
    testData = data_test
    testPID = pidlist_test
    LB = LB_y
    LE = LE_y
elif experiment_type == 1 or experiment_type == 2:
    # 2 regions: saxon vs limburg / groningen vs limburg
    trainData = data2_train
    testData = data2_test
    testPID = pidlist2_test
    LB = LB_y2
    LE = LE_y2
    regionLabels = regionLabels2


# check the number of utterances
allData = np.r_[trainData, testData]
filenames = np.c_[allData[:, 0], allData[:, 2]]
filenames_unique = np.unique(filenames, axis=0)
Counter(filenames_unique[:, 1])


fileComparison = dirOut + "\\algorithm_comparison.csv"
filePerformance = dirOut + "\\sentence-level.csv"
fileConfusionMatrix = dirOut + "\\confusion_matrix.csv"

## compare classification algorithms for the sentence-level classifiers.
#sb_func.compare_sentence_level_classifiers(trainData, LBlist, LE, fileComparison)

## train sentence-level classifiers.
modelList, scoreList, confusionMatrixList = sb_func.train_sentence_level_classifiers(
    trainData, LBlist, LE, filePerformance)

## prediction over the test data for each sentence-level classifier.
pred_per_sentence = sb_func.prediction_per_sentence(testData, modelList, LBlist, LE)

## combine sentence-level classifiers
pred_per_pid_majority = sb_func.prediction_per_pid_majority(testPID, pred_per_sentence)

## majority vote (weighted)
#weight = sb_func.calc_weight(confusionMatrixList)
#pred_per_pid_weighted = sb_func.prediction_per_pid_weighted(testPID, pred_per_sentence, weight, LB, LE)

### confusion matrix
if experiment_type == 0:
    confusionMatrix_majority = confusion_matrix(
        pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg'])
else:
    confusionMatrix_majority = confusion_matrix(
        pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Limburg'])

#confusionMatrix_weighted = confusion_matrix(
#    pred_per_pid_weighted[:, 1], pred_per_pid_weighted[:, 2], labels=regionLabels)


## output
accuracy = accuracy_score(pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], normalize=True, sample_weight=None)
print('accuracy: {}%'.format(accuracy * 100))

cm = confusionMatrix_majority
print(cm)

np.save(dirOut + "\\pred_per_pid.npy", pred_per_pid_majority)
np.save(dirOut + "\\confusion_matrix.npy", cm)

#fout = open(fileConfusionMatrix, "w")
#fout.write('< confusion matrix for majority vote in evaluation set >\n')
#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_majority', regionLabels)
#fout.write('< confusion matrix for weighted vote in evaluation set >\n')
#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_weighted', regionLabels)
#fout.write('\n')
#fout.close()


##### iteration finish #####
conn.close()
#np.savetxt(dirOut + '\\cm_majority.csv', CM_majority, delimiter=',')
#np.savetxt(dirOut + '\\cm_weighted.csv', CM_weighted, delimiter=',')