from sklearn import preprocessing
import numpy as np


def IntOrZero(value):
    """Return ``int(value)``, or 0 when the value cannot be converted.

    Args:
        value: Anything ``int()`` accepts, e.g. a numeric string or a number.

    Returns:
        The converted integer, or 0 if ``int()`` rejects the value.
    """
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures fall back to the default; a bare
        # ``except`` would also swallow KeyboardInterrupt and SystemExit.
        return 0

def IntOrTen(value):
    """Return ``int(value)``, or 10 when the value cannot be converted.

    Args:
        value: Anything ``int()`` accepts, e.g. a numeric string or a number.

    Returns:
        The converted integer, or 10 if ``int()`` rejects the value.
    """
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures fall back to the default; a bare
        # ``except`` would also swallow KeyboardInterrupt and SystemExit.
        return 10

def FloatOr9999(value):
    """Return ``float(value)``, or 9999.0 when the value cannot be converted.

    Args:
        value: Anything ``float()`` accepts, e.g. a numeric string or a number.

    Returns:
        The converted float, or 9999.0 if ``float()`` rejects the value.
    """
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures fall back to the default; a bare
        # ``except`` would also swallow KeyboardInterrupt and SystemExit.
        return 9999.00



print('Importing data...')
sample = []
target = []
zip_one = []
rec_type = []
pref_school = []
column_headers = []  # names of the numeric columns only (no categorical data)

# NOTE: rows are split on every comma, so a quoted field that itself contains
# a comma would shift all later columns.
# The context manager closes the file even when a row fails to parse.
with open('ProjectData3_2013b.csv') as f:
    for i, line in enumerate(f):
        # Split once per line instead of once per field access.
        fields = line.split(',')

        if i == 0:
            # Header row: collect the column names in exactly the order the
            # values are appended to each sample row below.
            column_headers.extend(fields[9:12])
            # Single columns are appended whole; extending with
            # ``fields[n][0]`` stored only the first character of the name.
            column_headers.append(fields[13])
            column_headers.extend(fields[18:24])
            # presumably the last column of the file, so drop the newline
            column_headers.append(fields[28].strip())
            column_headers.append(fields[26])
            column_headers.append(fields[27])
            column_headers.extend(fields[14:18])
            column_headers.extend(fields[24:26])
            continue

        if not line.strip():
            # A blank line (e.g. at the end of the file) carries no record.
            continue

        sample_row = []
        # columns 9-11 -> int, or 0 when not parseable (zeros will skew stats);
        # the earlier notes mention assigned flag / zip5 here - TODO confirm
        sample_row.extend(IntOrZero(v) for v in fields[9:12])
        # miles from hburg -> float, or 9999 when missing (will skew stats)
        sample_row.append(FloatOr9999(fields[13]))
        # solicitations and appeals
        sample_row.extend(IntOrZero(v) for v in fields[18:24])
        # number of events attended: parse the whole field. Taking only its
        # first character truncated multi-digit counts (12 became 1); int()
        # already ignores the trailing newline.
        sample_row.append(int(fields[28]))
        # years since added to system
        sample_row.append(FloatOr9999(fields[26]))
        # years since record/contact modified
        sample_row.append(FloatOr9999(fields[27]))
        # zip1 -> int or 10; kept separately for "1 of k" encoding later
        zip_one.append([IntOrTen(fields[12])])
        # primary record type string
        rec_type.append([fields[3]])
        # pref school code string
        pref_school.append([fields[5]])
        # has pref address, has business address, has phone, has email
        sample_row.extend(int(v) for v in fields[14:18])
        # recent/current parent, ever been employee
        sample_row.extend(int(v) for v in fields[24:26])

        # add the row we built to the sample list, and its label to target
        sample.append(sample_row)
        target.append(int(fields[1]))

#convert into structured array
#use object for all
sample = np.array(sample, dtype=object)

#convert zip_one to 1 of k encoding, then add to existing columns
enc = preprocessing.OneHotEncoder()
zip1 = enc.fit_transform(zip_one).toarray()
# Show which zip1 values received a column. For this single-column input the
# sorted unique values match what ``enc.active_features_`` reported, and that
# attribute no longer exists in current scikit-learn releases.
print(np.unique(zip_one))
sample = np.append(sample, zip1, axis=1)

#convert record type string category to int, then to 1 of k encoding, then add to existing columns
labelenc = preprocessing.LabelEncoder()
# LabelEncoder expects a 1-D sequence; rec_type holds one-element lists.
rec_num = labelenc.fit_transform(np.ravel(rec_type))
# Print every class in code order (code 0 first) instead of assuming exactly
# 10 classes, which raised on fewer and hid the rest on more.
print(labelenc.classes_)
# OneHotEncoder needs a 2-D column: one row per record
rec_num = rec_num.reshape(-1, 1)
enc = preprocessing.OneHotEncoder()
rec = enc.fit_transform(rec_num).toarray()
sample = np.append(sample, rec, axis=1)

#convert pref school string category to int, then to 1 of k encoding, then add to existing columns
labelenc = preprocessing.LabelEncoder()
school_num = labelenc.fit_transform(np.ravel(pref_school))
# Same as above: list all classes rather than assuming exactly 11.
print(labelenc.classes_)
school_num = school_num.reshape(-1, 1)
enc = preprocessing.OneHotEncoder()
school = enc.fit_transform(school_num).toarray()
sample = np.append(sample, school, axis=1)


# Split the rows by label: label 0 goes to class 0, anything else to class 1.
zero_rows = []
one_rows = []
for row, label in zip(sample, target):
    bucket = zero_rows if label == 0 else one_rows
    bucket.append(row)

class0data = np.array(zero_rows, dtype=object)
class1data = np.array(one_rows, dtype=object)
# Each per-class set gets a constant label vector of matching length.
class0target = np.zeros(len(class0data), dtype=np.int8)
class1target = np.ones(len(class1data), dtype=np.int8)

#convert target to array
target = np.array(target)
print(sample.shape, target.shape)
print(class0data.shape, class1data.shape)

# Preview a few rows (first 18 columns) next to their labels.
print('\nSample Data:')
for row_index in (1, 1000):
    print(sample[row_index, 0:18], target[row_index])

print('\nSample Class 0 Data:')
print(class0data[100, 0:18], class0target[100])

print('\nSample Class 1 Data:')
print(class1data[100, 0:18], class1target[100])

#build cross-validation data sets
# train_test_split lives in sklearn.model_selection (scikit-learn >= 0.18);
# the old sklearn.cross_validation module was removed in 0.20, so it is only
# used as a fallback for older installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

#ONLY USE FIRST 18 COLUMNS
# NOTE(review): the import loop appends 19 numeric values per row
# (3 + 1 + 6 + 1 + 1 + 1 + 4 + 2), so this slice leaves out the last numeric
# column as well as every one-hot column - confirm that is intended.
feature_cols = slice(0, 18)

# Hold out 20% of the rows for testing.
sample_train, sample_test, target_train, target_test = train_test_split(
    sample, target, test_size=0.20)

#train the random forest classifier
# NOTE(review): the forest is fitted but never scored below - only the SVM is.
forest = RandomForestClassifier(n_estimators=50)
forest = forest.fit(sample_train[:, feature_cols], target_train)

mysvm = SVC()
mysvm = mysvm.fit(sample_train[:, feature_cols], target_train)

#test the model on various sets
trnresult = mysvm.score(sample_train[:, feature_cols], target_train)
tstresult = mysvm.score(sample_test[:, feature_cols], target_test)
# The per-class sets are taken from the full sample, so these two scores
# include rows the model was trained on.
class0result = mysvm.score(class0data[:, feature_cols], class0target)
class1result = mysvm.score(class1data[:, feature_cols], class1target)

print('\nThe training score is: %f' % trnresult)
print('The testing score is: %f' % tstresult)
print('The class 0 test score is: %f' % class0result)
print('The class 1 test score is: %f' % class1result)