"""Load ``ProjectData3_2013b.csv``, build a feature matrix and score an SVM.

The script reads the CSV with a plain ``split(',')`` (no quoting support),
converts selected columns to numbers, one-hot encodes three categorical
columns, and then fits a random forest and an SVM on the first 18 numeric
columns.  Scores for the train split, the test split and each target class
are printed at the end.
"""
import numpy as np
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only provide the legacy module.
    from sklearn.cross_validation import train_test_split


# Conversion failures the helpers below translate into a default value.
_CONVERSION_ERRORS = (TypeError, ValueError, OverflowError)

# Columns of the feature matrix handed to the classifiers.
# NOTE(review): each row built below holds 19 numeric values
# (3 + 1 + 6 + 1 + 1 + 1 + 4 + 2), so this slice leaves out the last numeric
# column (CSV column 25) as well as every one-hot column -- confirm intended.
FEATURE_COLUMNS = slice(0, 18)


def IntOrZero(value):
    """Return ``int(value)``, or 0 when *value* cannot be converted."""
    try:
        return int(value)
    except _CONVERSION_ERRORS:
        return 0


def IntOrTen(value):
    """Return ``int(value)``, or 10 when *value* cannot be converted."""
    try:
        return int(value)
    except _CONVERSION_ERRORS:
        return 10


def FloatOr9999(value):
    """Return ``float(value)``, or 9999.0 when *value* cannot be converted.

    The 9999 placeholder is a real number, so it will skew any statistics
    computed over the column.
    """
    try:
        return float(value)
    except _CONVERSION_ERRORS:
        return 9999.00


print('Importing data...')

sample = []
target = []
zip_one = []
rec_type = []
pref_school = []
column_headers = []  # names of the numeric columns only, in sample_row order

with open('ProjectData3_2013b.csv') as f:
    for i, line in enumerate(f):
        fields = line.split(',')

        if i == 0:
            # Header row: record the column names in the same order the
            # numeric values are appended to each sample row below.
            column_headers.extend(fields[9:12])
            column_headers.append(fields[13])
            column_headers.extend(fields[18:24])
            column_headers.append(fields[28].strip())
            column_headers.append(fields[26])
            column_headers.append(fields[27])
            column_headers.extend(fields[14:18])
            column_headers.extend(fields[24:26])
            continue

        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue

        sample_row = []
        # Columns 9-11 as int, 0 when not parseable (the 0 placeholder will
        # skew statistics on these columns).
        sample_row.extend(map(IntOrZero, fields[9:12]))
        # Miles from hburg as float, 9999 when not parseable.
        sample_row.append(FloatOr9999(fields[13]))
        # Solicitations and appeals.
        sample_row.extend(map(IntOrZero, fields[18:24]))
        # Number of events attended.
        # NOTE(review): only the FIRST CHARACTER of column 28 is converted,
        # so a multi-digit value such as "12" is stored as 1.  Kept as-is to
        # preserve the existing feature values -- confirm whether
        # IntOrZero(fields[28]) was intended.
        sample_row.append(int(fields[28][0]))
        # Years since added to system.
        sample_row.append(FloatOr9999(fields[26]))
        # Years since record/contact modified.
        sample_row.append(FloatOr9999(fields[27]))
        # Has pref address, has business address, has phone, has email.
        sample_row.extend(map(int, fields[14:18]))
        # Recent/current parent, ever been employee.
        sample_row.extend(map(int, fields[24:26]))
        sample.append(sample_row)

        # Categorical columns are collected separately (one single-item list
        # per row) and one-hot encoded after the whole file has been read.
        zip_one.append([IntOrTen(fields[12])])  # zip1, 10 when not parseable
        rec_type.append([fields[3]])            # primary record type string
        pref_school.append([fields[5]])         # pref school code string

        target.append(int(fields[1]))

# Object dtype keeps the mixed int/float values exactly as parsed.
sample = np.array(sample, dtype=object)

# zip1 -> 1-of-k encoding, appended as extra columns.
enc = preprocessing.OneHotEncoder()
zip1 = enc.fit_transform(zip_one).toarray()
# Newer scikit-learn releases expose categories_ instead of active_features_.
zip_categories = getattr(enc, 'active_features_', None)
if zip_categories is None:
    zip_categories = enc.categories_
print(zip_categories)
sample = np.append(sample, zip1, 1)

# Record type string -> integer label -> 1-of-k encoding, appended as columns.
labelenc = preprocessing.LabelEncoder()
rec_num = labelenc.fit_transform(np.ravel(rec_type))
# classes_[k] is the string encoded as integer k; printing it shows the whole
# mapping regardless of how many distinct values the data contains.
print(labelenc.classes_)
rec_num = rec_num.reshape(-1, 1)  # OneHotEncoder expects a 2-D column
enc = preprocessing.OneHotEncoder()
rec = enc.fit_transform(rec_num).toarray()
sample = np.append(sample, rec, 1)

# Pref school string -> integer label -> 1-of-k encoding, appended as columns.
labelenc = preprocessing.LabelEncoder()
school_num = labelenc.fit_transform(np.ravel(pref_school))
print(labelenc.classes_)
school_num = school_num.reshape(-1, 1)
enc = preprocessing.OneHotEncoder()
school = enc.fit_transform(school_num).toarray()
sample = np.append(sample, school, 1)

target = np.array(target)

# Split the rows by target value so each class can be scored on its own.
class0data = sample[target == 0]
class1data = sample[target != 0]
class0target = np.zeros(len(class0data), dtype=np.int8)
class1target = np.ones(len(class1data), dtype=np.int8)

print(sample.shape, target.shape)
print(class0data.shape, class1data.shape)
print('\nSample Data:')
print(sample[1, FEATURE_COLUMNS], target[1])
print(sample[1000, FEATURE_COLUMNS], target[1000])
print('\nSample Class 0 Data:')
print(class0data[100, FEATURE_COLUMNS], class0target[100])
print('\nSample Class 1 Data:')
print(class1data[100, FEATURE_COLUMNS], class1target[100])

# Hold out 20% of the rows for testing.
sample_train, sample_test, target_train, target_test = train_test_split(
    sample, target, test_size=0.20)

# NOTE(review): the forest is fitted but never scored below; only the SVM
# results are reported.
forest = RandomForestClassifier(n_estimators=50)
forest = forest.fit(sample_train[:, FEATURE_COLUMNS], target_train)

mysvm = SVC()
mysvm = mysvm.fit(sample_train[:, FEATURE_COLUMNS], target_train)

# Score the SVM on the train split, the test split and each class.  The
# per-class sets are taken from the full data, so they include training rows.
trnresult = mysvm.score(sample_train[:, FEATURE_COLUMNS], target_train)
tstresult = mysvm.score(sample_test[:, FEATURE_COLUMNS], target_test)
class0result = mysvm.score(class0data[:, FEATURE_COLUMNS], class0target)
class1result = mysvm.score(class1data[:, FEATURE_COLUMNS], class1target)

print('\nThe training score is: %f' % trnresult)
print('The testing score is: %f' % tstresult)
print('The class 0 test score is: %f' % class0result)
print('The class 1 test score is: %f' % class1result)