I will post again on this project later to summarize everything that I learned, and hopefully clean up the code a bit now that I’m not under a time constraint to just get enough done to turn in! Also, now that I’ve submitted my work, any advice on the approach is welcome!
The 4 classification algorithms I wrote were:
- Naive Bayes
- Bayes
- Gaussian Kernel Density Estimator
- K-Nearest Neighbor
I’m really proud of myself for understanding these enough and ramping up quickly enough on Python to be able to finish in the week or so I had to work on it late-night. The code for the final two classifiers is below. It’s clear I still have a lot of learning to do, but I finished enough to get results to compare and turn in, so I’m happy with how far I got. Cleanup and improved code & efficiency can come later!
import numpy as np

# --- Load the data sets for the GKDE classifier -------------------------------
# Pass the file names straight to np.loadtxt instead of bare open(...) calls,
# so numpy opens *and closes* the files itself (the original leaked the file
# handles returned by open()).
data_train = np.loadtxt("train.txt")
data_test = np.loadtxt("test.txt")
Xtrn = data_train[:, 0:2]  # first 2 columns of training set (features)
Ytrn = data_train[:, 2]    # last column, 1/0 class labels
Xtst = data_test[:, 0:2]   # first 2 columns of test set (features)
Ytst = data_test[:, 2]     # last column, 1/0 class labels
print("Length of training set: %d " % len(Xtrn))
print("Length of test set: %d " % len(Xtst))
# --- Split the training points by class label ---------------------------------
# The original preallocated lists of size len(Xtrn)/2, which silently assumed
# each class holds exactly half of the training data (and would raise an
# IndexError otherwise).  Boolean masks produce the same rows, in the same
# order, for any class balance.
Xtrn_0 = Xtrn[Ytrn == 0]  # all training points labeled class 0
Xtrn_1 = Xtrn[Ytrn == 1]  # all training points labeled class 1
# --- Gaussian kernel density estimate of p(x | class) -------------------------
# 2-D Gaussian kernel with bandwidth h:
#   K(x, x') = 1 / (2*pi*h^2) * exp(-||x - x'||^2 / (2*h^2))
# BUG FIX: the original wrote "... / 2*h*h", which Python parses as
# (.../2) * h * h -- multiplying by h^2 instead of dividing by 2*h^2 -- and
# it also left the h^2 factor out of the normalizing constant.  Both are
# corrected here.
h = 1.6  # kernel bandwidth for the "manual" GKDE estimate
norm_const = 1.0 / (2 * np.pi * h * h)  # (2*pi*h^2)^(d/2) with d = 2

prob0 = []  # estimated density p(x | class 0) for every test point
for x in range(len(Xtst)):
    sum0 = 0.0
    for r in range(len(Xtrn_0)):
        sq_dist = np.linalg.norm(Xtst[x] - Xtrn_0[r]) ** 2
        sum0 += norm_const * np.exp(-sq_dist / (2 * h * h))
    prob0.append(sum0 / len(Xtrn_0))

prob1 = []  # estimated density p(x | class 1) for every test point
for x in range(len(Xtst)):
    sum1 = 0.0
    for r in range(len(Xtrn_1)):
        sq_dist = np.linalg.norm(Xtst[x] - Xtrn_1[r]) ** 2
        sum1 += norm_const * np.exp(-sq_dist / (2 * h * h))
    prob1.append(sum1 / len(Xtrn_1))
# Assign each test point to the class with the larger estimated density.
# Ties (prob0 == prob1) fall through to class 1, same as the original rule.
Xprob_GKDE = [0 if p0 > p1 else 1 for p0, p1 in zip(prob0, prob1)]

# Score the GKDE predictions against the true test labels.
correct_class_new = sum(1 for pred, truth in zip(Xprob_GKDE, Ytst) if pred == truth)
print("Correct GKDE: %d" % correct_class_new)
print("Incorrect GKDE: %d" % (len(Xtst) - correct_class_new))
import numpy as np
import random

# --- Load the data sets for the K-Nearest-Neighbor classifier -----------------
# Pass the file names straight to np.loadtxt instead of bare open(...) calls,
# so numpy opens *and closes* the files itself (the original leaked the file
# handles returned by open()).
data_train = np.loadtxt("train.txt")
data_test = np.loadtxt("test.txt")
Xtrn = data_train[:, 0:2]  # first 2 columns of training set (features)
Ytrn = data_train[:, 2]    # last column, 1/0 class labels
Xtst = data_test[:, 0:2]   # first 2 columns of test set (features)
Ytst = data_test[:, 2]     # last column, 1/0 class labels
print("Length of training set: %d " % len(Xtrn))
print("Length of test set: %d " % len(Xtst))
#count items in each class
class0_count = 0
class1_count = 0
for train_items in data_train:
if train_items[2] == 0:
class0_count +=1
elif train_items[2] == 1:
class1_count +=1
print("Training points in Class 0: %d" % class0_count)
print("Training points in Class 1: %d" % class1_count)
#probability of each class
pc0 = class0_count / len(Xtrn)
pc1 = class1_count / len(Xtrn)
# --- K-Nearest-Neighbor classification of the test set ------------------------
# For each test point: measure its distance to every training point, take the
# k nearest, and turn the neighbor counts into class posteriors via Bayes rule.
#
# FIXES vs. the original:
#   * range(k-1) inspected only the k-1 nearest neighbors even though the
#     k-th neighbor's distance defined the volume; all k are counted now.
#   * The "volume" used 4*pi*r^2 (a sphere's surface area); in 2-D the region
#     out to radius r is a disc of area pi*r^2.  (The volume cancels out of
#     the posterior comparison either way, so only the intermediate values
#     change, not which class wins.)
#   * Stale comment claimed "3 nearest" while k is 11.
k = 11
print("%d Nearest Neighbors" % k)
xClass = []
for i in range(len(Xtst)):
    # Distance from this test point to every training point, vectorized.
    dists = np.linalg.norm(Xtrn - Xtst[i], axis=1)
    # Stable sort keeps the original tie-breaking: on equal distances the
    # lower training index comes first, as with Python's stable list sort.
    order = np.argsort(dists, kind="stable")

    # 2-D "volume" (disc area) out to the k-th nearest training point.
    KVol = np.pi * np.square(dists[order[k - 1]])

    # Class membership among all k nearest neighbors.
    nearest_labels = Ytrn[order[:k]]
    KClass0 = int(np.sum(nearest_labels == 0))
    KClass1 = int(np.sum(nearest_labels == 1))

    # Likelihoods, evidence, and posteriors.  KVol and the class counts
    # cancel algebraically, so this reduces to comparing KClass0 vs KClass1
    # (a majority vote among the k neighbors).
    pXc0 = KClass0 / (class0_count * KVol)
    pXc1 = KClass1 / (class1_count * KVol)
    pX = k / (len(Xtrn) * KVol)
    pc0X = (pXc0 * pc0) / pX
    pc1X = (pXc1 * pc1) / pX

    if pc0X > pc1X:
        xClass.append(0)  # posterior for class 0 is higher
    elif pc0X < pc1X:
        xClass.append(1)  # posterior for class 1 is higher
    else:
        # Equal posteriors: break the tie randomly, as before.
        xClass.append(random.randint(0, 1))
# Score the KNN predictions against the true test labels.
correct_class = sum(1 for pred, truth in zip(xClass, Ytst) if pred == truth)
print("Correct Class: %d" % correct_class)
print("Incorrect Class: %d" % (len(Xtst) - correct_class))