"""Explore heart-disease patient data and compare two scikit-learn classifiers.

The script loads 'processed.cleveland.data', prints a few summary statistics,
trains a 5-nearest-neighbour classifier and a decision tree on a train/test
split of that data, and then scores both trained models against
'processed.va.data' and 'processed.hungarian.data'.

Missing values are marked with '?' in the data files and are replaced with the
sentinel MISSING_SENTINEL so that every feature column is numeric.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pnd
from sklearn import datasets, neighbors, tree

try:
    from sklearn import model_selection
except ImportError:
    # Older scikit-learn releases shipped these helpers as `cross_validation`.
    from sklearn import cross_validation as model_selection

# The data files have no header line, so column names are supplied here.
header_row = ['age', 'sex', 'pain', 'BP', 'chol', 'fbs', 'ecg', 'maxhr',
              'eiang', 'eist', 'slope', 'vessels', 'thal', 'diagnosis']

# Value substituted for '?' (missing) entries.
# NOTE(review): a large sentinel such as 99.0 acts as an outlier for the
# distance-based nearest-neighbour model; imputation may be preferable.
MISSING_SENTINEL = 99.0


def load_test_set(path):
    """Load an additional data file for testing the trained classifiers.

    Reads *path* with the shared column names, adds a 'diag_int' column
    (1 when 'diagnosis' > 0, else 0) and replaces '?' entries with
    MISSING_SENTINEL.
    """
    frame = pnd.read_csv(path, names=header_row)
    frame['diag_int'] = (frame['diagnosis'] > 0).astype(int)
    return frame.replace(to_replace='?', value=MISSING_SENTINEL)


# --- Cleveland data: load and explore --------------------------------------
heart = pnd.read_csv('processed.cleveland.data', names=header_row)

# display bp column
# print(heart['BP'])
# display count of people of each age
# print(heart['age'].value_counts())

# Count of patients per diagnosis value.
print(heart['diagnosis'].value_counts())

# Filter to only those diagnosed with heart disease (diagnosis > 0).
has_hd_check = heart['diagnosis'] > 0
has_hd_patients = heart[has_hd_check]
# print(has_hd_patients)

# Add the diagnosis flag to the dataframe, as a boolean and as an integer.
heart['diag_bool'] = has_hd_check
heart['diag_int'] = has_hd_check.astype(int)

# Fraction of people diagnosed, by chest pain type.
has_hd_pain_counts = has_hd_patients['pain'].value_counts()
all_pain_counts = heart['pain'].value_counts()
print(has_hd_pain_counts / all_pain_counts.astype(float))

# Plot fraction diagnosed with heart disease by pain type.
# toplot = has_hd_pain_counts/all_pain_counts.astype(float)
# toplot.plot(label=['4','3','2','1'],kind='bar')
# plt.show()

# display first 3 lines of data
# print(heart[:3])
# print(heart.dtypes)

# 'vessels' and 'thal' are read as objects because of '?' values; replace the
# marker with the sentinel and convert both columns to floats.
for column in ('vessels', 'thal'):
    heart[column] = heart[column].apply(
        lambda value: MISSING_SENTINEL if value == "?" else value
    ).astype(float)

# display first 3 lines of data
# print(heart[:3])
print(heart.dtypes)

# --- Additional datasets used only for testing -----------------------------
heart_va = load_test_set('processed.va.data')
heart_hu = load_test_set('processed.hungarian.data')

# Feature columns run from 'age' through 'thal'; the target is 'diag_int'.
features = heart.loc[:, 'age':'thal']
target = heart.loc[:, 'diag_int']

# Hold out a third of the Cleveland data. The fixed random_state makes the
# split reproducible, so both classifiers are evaluated on the same rows.
heart_train, heart_test, goal_train, goal_test = (
    model_selection.train_test_split(
        features, target, test_size=0.33, random_state=0
    )
)

# goal_test keeps the original (shuffled) row labels while predictions are
# numbered 0..n-1, so the actual labels are stored positionally to make the
# element-wise comparison below line up row by row.
goal_test_df = pnd.DataFrame({'actual': np.asarray(goal_test)})

# --- Classification with scikit-learn nearest neighbours -------------------
# for i in range (1,25):
# learned i = 5 has best result here
clf = neighbors.KNeighborsClassifier(n_neighbors=5, weights='distance')

# Train the classifier on the partial dataset.
clf.fit(heart_train, goal_train)
heart_test_results = pnd.DataFrame(
    clf.predict(heart_test), columns=['predict']
)
heart_test_results['correct'] = (
    heart_test_results['predict'] == goal_test_df['actual']
)

# Results of the nearest-neighbours classification test.
print("")
print("Nearest Neighbors (5) Result 1:")
print(heart_test_results['correct'].value_counts())
print(clf.score(heart_test, goal_test))

print("Nearest Neighbors (5) Cross-Validation:")
scores = model_selection.cross_val_score(clf, features, target, cv=5)
print(scores)

# Try the trained model with the other datasets.
print("Trained Nearest Neighbors (5) Applied to VA Data:")
heart_va_results = clf.predict(heart_va.loc[:, 'age':'thal'])
print(clf.score(heart_va.loc[:, 'age':'thal'], heart_va.loc[:, 'diag_int']))
print("Trained Nearest Neighbors (5) Applied to Hungarian Data:")
heart_hu_results = clf.predict(heart_hu.loc[:, 'age':'thal'])
print(clf.score(heart_hu.loc[:, 'age':'thal'], heart_hu.loc[:, 'diag_int']))

# --- Classification with scikit-learn decision tree ------------------------
clf2 = tree.DecisionTreeClassifier()

# Train the classifier on the same partial dataset as above.
clf2.fit(heart_train, goal_train)

# Put the predictions into a dataframe and determine how many were
# classified correctly.
heart_test_results = pnd.DataFrame(
    clf2.predict(heart_test), columns=['predict']
)
heart_test_results['correct'] = (
    heart_test_results['predict'] == goal_test_df['actual']
)

# Print results of the decision tree classification test.
print("")
print("Decision Tree Result 1:")
print(heart_test_results['correct'].value_counts())
print(clf2.score(heart_test, goal_test))

# Try the scikit-learn cross validation function.
print("Decision Tree Cross-Validation:")
scores = model_selection.cross_val_score(clf2, features, target, cv=5)
print(scores)

# Test classifier with other data (note: many values missing in these files).
print("Trained Decision Tree Applied to VA Data:")
heart_va_results = clf2.predict(heart_va.loc[:, 'age':'thal'])
print(clf2.score(heart_va.loc[:, 'age':'thal'], heart_va.loc[:, 'diag_int']))
print("Trained Decision Tree Applied to Hungarian Data:")
heart_hu_results = clf2.predict(heart_hu.loc[:, 'age':'thal'])
print(clf2.score(heart_hu.loc[:, 'age':'thal'], heart_hu.loc[:, 'diag_int']))