#import the Cleveland heart patient data file using pandas, creating a header row
#since file doesn't have column names
import pandas as pnd
#column names for the 14 fields of each record, in file order
header_row = [
    'age', 'sex', 'pain', 'BP', 'chol', 'fbs', 'ecg',
    'maxhr', 'eiang', 'eist', 'slope', 'vessels', 'thal', 'diagnosis',
]
#header=None: the first line of the file is data, not column names
heart = pnd.read_csv('processed.cleveland.data', header=None, names=header_row)

#display bp column
#print(heart['BP'])

#display count of people of each age
#print(heart['age'].value_counts())

#show how many patients fall under each diagnosis value, then flag every
#patient whose diagnosis value is above 0
print(heart['diagnosis'].value_counts())
has_hd_check = heart['diagnosis'].gt(0)
#rows of the flagged patients only (taken before the flag columns are added)
has_hd_patients = heart.loc[has_hd_check]
#print(has_hd_patients)
#store the flag on the dataframe as a boolean column...
heart['diag_bool'] = has_hd_check
#...and as a 0/1 integer column
heart['diag_int'] = has_hd_check.astype(int)


#fraction of patients diagnosed, broken down by chest pain type:
#count of flagged patients per pain type divided by count of all patients per pain type
all_pain_counts = heart['pain'].value_counts()
has_hd_pain_counts = has_hd_patients['pain'].value_counts()
print(has_hd_pain_counts.div(all_pain_counts.astype(float)))

#import matplotlib for plotting
import matplotlib.pyplot as plt
#plot percentage diagnosed w/heart disease by pain type
#toplot = has_hd_pain_counts/all_pain_counts.astype(float)
#toplot.plot(label=['4','3','2','1'],kind='bar')
#plt.show()

#display first 3 lines of data
#print(heart[:3])
#print(heart.dtypes)
#need to turn thal and vessels into floats (currently objects because of ? values)
import numpy as np
#swap every "?" placeholder for the sentinel 99.0, then cast the column to float
for _column in ('vessels', 'thal'):
    heart[_column] = heart[_column].map(
        lambda raw: 99.0 if raw == "?" else raw
    ).astype(float)
#print(heart['vessels'])
#display first 3 lines of data
#print(heart[:3])
#confirm the resulting column types
print(heart.dtypes)

#import and modify VA dataset for testing
#'?' is parsed as NaN at read time so pandas loads every feature column as a
#number; replacing '?' after loading leaves object columns that mix numeric
#strings with floats
heart_va = pnd.read_csv('processed.va.data', names=header_row, na_values='?')
#0/1 target: 1 when the diagnosis value is above 0
has_hd_check = heart_va['diagnosis'] > 0
heart_va['diag_int'] = has_hd_check.astype(int)
#missing values become the sentinel 99.0 (the value this script uses for '?')
heart_va = heart_va.fillna(99.0)

#import and modify hungarian dataset for testing
#'?' is parsed as NaN at read time so pandas loads every feature column as a
#number; replacing '?' after loading leaves object columns that mix numeric
#strings with floats
heart_hu = pnd.read_csv('processed.hungarian.data', names=header_row, na_values='?')
#0/1 target: 1 when the diagnosis value is above 0
has_hd_check = heart_hu['diagnosis'] > 0
heart_hu['diag_int'] = has_hd_check.astype(int)
#missing values become the sentinel 99.0 (the value this script uses for '?')
heart_hu = heart_hu.fillna(99.0)

#classification with scikit-learn nearest neighbor
from sklearn import neighbors, datasets
#sklearn.cross_validation was renamed to sklearn.model_selection (0.18) and
#later removed (0.20); bind the new module to the old name so the rest of the
#script keeps working, and fall back to the old module on legacy installs
try:
    from sklearn import model_selection as cross_validation
except ImportError:
    from sklearn import cross_validation
#for i in range (1,25):
#learned i = 5 has best result here
#arguments are named because recent scikit-learn releases accept 'weights'
#only as a keyword; weights='distance' weights neighbours by inverse distance
clf = neighbors.KNeighborsClassifier(n_neighbors=5, weights='distance')
#train the classifier on partial dataset (33% held out, fixed seed)
heart_train, heart_test, goal_train, goal_test = cross_validation.train_test_split(
    heart.loc[:,'age':'thal'], heart.loc[:,'diag_int'], test_size=0.33, random_state=0)
clf.fit(heart_train, goal_train)

#put predictions and true labels side by side to count correct classifications
heart_test_results = pnd.DataFrame(clf.predict(heart_test), columns=['predict'])
#list(...) drops the shuffled row labels (and the series name) that goal_test
#carries when it is a pandas Series, so 'actual' is filled positionally and
#lines up row-for-row with 'predict'
goal_test_df = pnd.DataFrame(list(goal_test), columns=['actual'])
heart_test_results['correct'] = heart_test_results['predict'] == goal_test_df['actual']

#results of nearest neighbors classification test
print("")
print("Nearest Neighbors (5) Result 1:")
print(heart_test_results['correct'].value_counts())
print(clf.score(heart_test, goal_test))
print("Nearest Neighbors (5) Cross-Validation:")
scores = cross_validation.cross_val_score(clf, heart.loc[:,'age':'thal'], heart.loc[:,'diag_int'], cv=5)
print(scores)
#try with other datasets
print("Trained Nearest Neighbors (5) Applied to VA Data:")
heart_va_results = clf.predict(heart_va.loc[:,'age':'thal'])
print(clf.score(heart_va.loc[:,'age':'thal'], heart_va.loc[:,'diag_int']))
print("Trained Nearest Neighbors (5) Applied to Hungarian Data:")
heart_hu_results = clf.predict(heart_hu.loc[:,'age':'thal'])
print(clf.score(heart_hu.loc[:,'age':'thal'], heart_hu.loc[:,'diag_int']))



#classification with scikit-learn decision tree
from sklearn import tree
clf2 = tree.DecisionTreeClassifier()
#train the classifier on partial dataset (33% held out, fixed seed)
heart_train, heart_test, goal_train, goal_test = cross_validation.train_test_split(
    heart.loc[:,'age':'thal'], heart.loc[:,'diag_int'], test_size=0.33, random_state=0)
clf2.fit(heart_train, goal_train)

#put the results into a dataframe and determine how many were classified correctly
heart_test_results = pnd.DataFrame(clf2.predict(heart_test), columns=['predict'])
#list(...) drops the shuffled row labels (and the series name) that goal_test
#carries when it is a pandas Series, so 'actual' is filled positionally and
#lines up row-for-row with 'predict'
goal_test_df = pnd.DataFrame(list(goal_test), columns=['actual'])
heart_test_results['correct'] = heart_test_results['predict'] == goal_test_df['actual']

#print results of decision tree classification test
print("")
print("Decision Tree Result 1:")
print(heart_test_results['correct'].value_counts())
print(clf2.score(heart_test, goal_test))

#try the scikit-learn cross validation function
print("Decision Tree Cross-Validation:")
scores = cross_validation.cross_val_score(clf2, heart.loc[:,'age':'thal'], heart.loc[:,'diag_int'], cv=5)
print(scores)

#test classifier with other data (note: many values missing in these files)
print("Trained Decision Tree Applied to VA Data:")
heart_va_results = clf2.predict(heart_va.loc[:,'age':'thal'])
print(clf2.score(heart_va.loc[:,'age':'thal'], heart_va.loc[:,'diag_int']))
print("Trained Decision Tree Applied to Hungarian Data:")
heart_hu_results = clf2.predict(heart_hu.loc[:,'age':'thal'])
print(clf2.score(heart_hu.loc[:,'age':'thal'], heart_hu.loc[:,'diag_int']))