#classifier compare to confusion matrix for tutorial 66 maXbox in a 2D array
#classifier visualisation matrix for starter tutorial 67/68 maXbox ML V
# need pydotplus & graphviz, config path at scripttime, line, #locs:316
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D
from sklearn import linear_model
from sklearn.datasets.samples_generator import make_regression
from scipy.stats.stats import pearsonr
# Install: pip install spacy && python -m spacy download en
import os, spacy, time
# Load English tokenizer, tagger, parser, NER and word vectors
#nlp = spacy.load('en_core_web_sm')
#nlp = spacy.load('de_core_news_sm')
# Absolute directory of this script.
basePath = os.path.dirname(os.path.abspath(__file__))
print('base path is: '+basePath)
# Append the Graphviz bin directory to PATH for this process
# (presumably so the Graphviz executables can be found - confirm the install path).
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
# Raw string: the Windows backslashes are literal path separators. Without the
# r-prefix, "\m", "\D" and "\k" are invalid escape sequences that newer Python
# versions warn about. The resulting value is unchanged.
keywordtok = r"C:\maXbox\mX46210\DataScience\kwlisgt803_textanalysis_spacy2Output5.txt"
"""
# Import Dataset
dataset = pd.read_csv(keywordtok,sep = ', ')
# Process whole documents
text = open(keywordtok).read()
doc = nlp(text)
# Find named entities, phrases and concepts
for entity in doc.ents:
    print(entity.text, entity.label_)
"""
from wordcloud import WordCloud
#from nltk.corpus import stopwords
# It turned out that word clouds can be built considerably faster and with
# less memory from a frequency array. That is why the corresponding dicts
# were created right at the top, in the spaCy loop.
# In[33]:
#file_n2.pyclassifier_compare2confusion2.py/KeywordsMax_20180215_CuBr3.txt").read()
#
# Disabled word-cloud demo (was a bare triple-quoted string in the original):
#   file_content=open (keywordtok).read()
#   wc = WordCloud(background_color="white", max_words=30)
#   #wc.generate_from_frequencies(dataset)
#   wc.generate(file_content)
#   plt.figure(figsize=(12,12))
#   plt.title("awordcloud of machine learning")
#   plt.imshow(wc, interpolation='bilinear')
#   plt.axis("off");


def plotPredictions(clf):
    """Scatter-plot the module-level samples ``X`` coloured by their labels ``y``.

    Columns 0/1 and columns 2/3 of ``X`` are drawn as two point clouds in
    the same figure, then over-drawn once per legend colour so that the
    legend gets handles.

    Args:
        clf: A fitted classifier exposing ``predict``.
    """
    targets = ['Class 0', 'Class 1']
    colors = ['purple', 'yellow']
    handles = []

    # NOTE(review): the predictions are computed but never drawn - the points
    # below are coloured by the true labels ``y``. Confirm whether the
    # prediction result was meant to be plotted.
    clf.predict(np.c_[X])

    plt.figure(figsize=(8, 6))
    # ``np.float`` was removed from NumPy; the builtin ``float`` is the
    # drop-in replacement and yields the same float64 array.
    label_colors = y.astype(float)
    column_pairs = ((0, 1), (2, 3))
    for col_x, col_y in column_pairs:
        plt.scatter(X[:, col_x], X[:, col_y], s=600, label=targets, c=label_colors)
    # Same drawing order as before: pair (0, 1) in every colour, then pair (2, 3).
    for col_x, col_y in column_pairs:
        for color in colors:
            handles.append(
                plt.scatter(X[:, col_x], X[:, col_y], label=targets, c=color, s=150))
    plt.legend(handles, targets, loc=4)
    plt.show()


def print_accuracy(f, data=None, labels=None):
    """Print and return the percentage of samples that ``f`` labels correctly.

    Args:
        f: Callable mapping a sample array to an array of predicted labels
            (for example ``estimator.predict``).
        data: Samples passed to ``f``. Defaults to the module-level ``X``.
        labels: Expected labels. Defaults to the module-level ``y``.

    Returns:
        The accuracy in percent, ``100 * (number of matches) / len(labels)``.
    """
    if data is None:
        data = X
    if labels is None:
        labels = y
    # Evaluate the predictor once and reuse the value for printing and returning.
    accuracy = 100*np.sum(f(data) == labels)/len(labels)
    # flush=True pushes the line out immediately; it replaces the fixed
    # time.sleep(0.5) pause the original used "to let print get out".
    print("Accuracy = {0}%".format(accuracy), flush=True)
    #print("Accuracy = {0}%".format(100*np.sum(f(X_test) == Y_test)/len(Y_test)))
    return accuracy


def plot3D():
    """Show a fixed six-point 3D scatter plot coloured by a hard-coded class list."""
    # Imported for its side effect: older Matplotlib releases only register
    # the '3d' projection once mplot3d has been imported.
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

    colormap = np.array(['red', 'lime'])
    targets2 = [0,0,1,1,0,1]
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    x = [1,3,5,7,10,9]
    y = [2,4,6,8,8,7]
    z = [3,5,7,9,6,5]
    # Index the colour array with the class ids to get one colour per point.
    ax.scatter(x, y, z, c=colormap[targets2], marker='o', s=300)
    ax.set_xlabel('A-x ')
    ax.set_ylabel('B-y ')
    ax.set_zlabel('C-z ')
    plt.show()


# data frame builder @main --------------------------
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QA
from sklearn.cluster import KMeans

# Six samples: four feature columns (A-D) followed by the class label (0/1).
datlist2 = [[1,2,3,4,0],
            [3,4,5,6,0],
            [5,6,7,8,1],
            [7,8,9,10,1],
            [10,8,6,4,0],
            [9,7,5,3,1]]
            #[9,7,5,3,1]]
arr2 = np.array(datlist2, dtype='float')
print(arr2,'\n')
y = arr2[0:,4]      # last column: class labels
X = arr2[0:,0:4]    # first four columns: features
features = ['A','B','C','D']
targets = ['class 0', 'class 1']
print(y,'\n',X,'\n')
df = pd.DataFrame.from_records(arr2, columns=features+['Class'])
print(df.columns)
#y.columns = ['Targets']
print(df.head(6),'\n')
print(df.describe(),'\n')

# start with 7 classifier --------------------------
svm = LinearSVC(random_state=100, C=2)  #C for regularization
y_pred = svm.fit(X,y).predict(X)
print('linear svm score1: ',svm.score(X,y))
print('score2: ',accuracy_score(y, y_pred))
print('score3: ',print_accuracy(svm.predict))
print('predict single sample ',svm.predict([[7,8,9,8]]))
#0.8333333333333334
print(confusion_matrix(y, y_pred))
print("Numbs of mislabeled points out of total %d points : %d" % (X.shape[0],(y != y_pred).sum()))
print(y)
print(y_pred)
print('classification report: \n',classification_report(y,y_pred,target_names=targets))

# train test split
X_train,X_test, y_train,y_test = train_test_split(X, y, test_size=0.4, random_state=0)
print('train_test_split(X, y): \n')
print(X_train)
print(X_test)
# Now fit the model on the training data and evaluate it on the held-out part.
svm.fit(X_train,y_train)
y_pred = svm.predict(X_test)
print('class test report: \n',classification_report(y_test,y_pred,target_names=targets))
print('class test Score:', svm.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred))

# plot the test model (disabled; was a bare triple-quoted string):
#   plt.scatter(y_test, y_pred, 500)
#   plt.xlabel('True Values')
#   plt.ylabel('Predictions')
#   plt.show()
df.iloc[0:,0:4].plot.hist(density=False)
plt.show()

# Cluster Classification Demo
kmean = KMeans(n_clusters=2,init='k-means++',max_iter=100, n_init=1, random_state=15)
kmean.fit(X)
print('\n',kmean.cluster_centers_)
print('kmean.clusters \n',np.unique(kmean.labels_, return_counts=True))
print('kmeanclusters thinks: \n',kmean.labels_)
centroids = kmean.cluster_centers_

# Set the size of the plot
plt.figure(figsize=(14,7))
targets2 = [0,0,1,1,0,1]
y2 = pd.DataFrame(targets2)
y2.columns = ['Targets']
# Create a colormap
colormap = np.array(['red', 'lime'])

# Plot the original classifications (features A/B)
plt.subplot(1, 2, 1)
plt.scatter(df.A, df.B, c=colormap[y2.Targets], s=400)
plt.title('Real Classification')
# Plot the model's classifications (features A/B)
plt.subplot(1, 2, 2)
plt.scatter(df.A, df.B, c=colormap[kmean.labels_], s=400)
plt.title('K-Mean Classification')
#plt.show()
# Mark the cluster centres (first two coordinates) on the right-hand subplot.
plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=600, linewidths=50,
            color='black', zorder=10)

# Same two subplots again, now with features C/D.
plt.subplot(1, 2, 1)
plt.scatter(df.C, df.D, c=colormap[y2.Targets], s=400)
plt.title('Real Classification')
# Plot the model's classifications (features C/D)
plt.subplot(1, 2, 2)
plt.scatter(df.C, df.D, c=colormap[kmean.labels_], s=400)
plt.title('K-Means Classification')
plt.show()
plot3D()

# The fix, we convert all the 1s to 0s and 0s to 1s.
predY = np.choose(kmean.labels_, [1, 0]).astype(np.int64)
print(kmean.labels_)
print(predY)
#df.iloc[0:,0:4].plot.kde()
# target class labels
#df.iloc[0:,4:5].plot.kde()
#>>> d = np.random.normal(loc=15, scale=3, size=500)
#>>> d = np.random.laplace(loc=15, scale=3, size=500)
#sns.distplot(d, fit=stats.laplace, kde=False)

clf = SVC(random_state=100, C=5)
y_pred = clf.fit(X,y).predict(X)
print('supportvectormachine score1: ',clf.score(X,y))
print('score2: ',accuracy_score(y, y_pred))
print(confusion_matrix(y, y_pred))
#plotPredictions(clf)
print('classification report: \n',classification_report(y,y_pred,target_names=targets))

clf = GaussianNB()
y_pred = clf.fit(X,y).predict(X)
print('gaussian nb score2: ',accuracy_score(y, y_pred))
print(confusion_matrix(y, y_pred))

clf = MLPClassifier(alpha=1, random_state=100)
y_pred = clf.fit(X,y).predict(X)
print('mlperceptron score2: ',accuracy_score(y, y_pred))
print(confusion_matrix(y, y_pred))

clf = KNeighborsClassifier(n_neighbors=3)
y_pred = clf.fit(X,y).predict(X)
print('kneighbors score2: ',accuracy_score(y, y_pred))
print(confusion_matrix(y, y_pred))
#plotPredictions(clf)

clf = DecisionTreeClassifier(random_state=100,max_depth=5)
y_pred = clf.fit(X,y).predict(X)
print('decision tree score2: ',accuracy_score(y, y_pred))
print(confusion_matrix(y, y_pred))

# io.StringIO replaces sklearn.externals.six.StringIO, which current
# scikit-learn releases no longer ship; export_graphviz only needs a
# text buffer to write the DOT source into.
from io import StringIO
import pydotplus
from sklearn import tree
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data, feature_names=features)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
print(graph,dot_data, basePath)
#Image(graph.create_png())
# os.path.join avoids the invalid "\m" escape of the original string
# concatenation and uses the separator of the running platform.
graph.write_png(os.path.join(basePath, 'maxboxdecisiontree_graph3.png'))

clf = QA()
y_pred = clf.fit(X,y).predict(X)
print('\n QuadDiscriminantAnalysis score2: ',accuracy_score(y, y_pred))
print(confusion_matrix(y, y_pred))
Xtest = [9,6,10,12]
#Xtest= [Xtest]
#Xtest = np.array(Xtest).reshape(1, -1)
# predict() expects a 2D array, hence the single sample is wrapped in a list.
y_pred = clf.predict([Xtest])
print('QA predict Xtest: ',y_pred)

print('pearson correlation, coeff:, p-value:')
# Correlate every feature column with its right-hand neighbour.
for i in range(X.shape[1] - 1):
    print(pearsonr(X[:,i],X[:,i+1]))
#plt.show()
print('pearson ref test: ',pearsonr([1,3,5,7,10,9],[2,4,6,8,8,7]))
a = [1,4,6,5]
b = [1,2,3,2]
print('pearson ref test: ',pearsonr(a,b),'\n')

corr = np.corrcoef(X, rowvar=0)  # correlation matrix
w, v = np.linalg.eig(corr)       # eigen values & eigen vectors
print('eigenvalues & eigenvector:')
print(w)
print(v)
print('correlation matrix: \n',df.corr(),'\n')
basePath = os.path.dirname(os.path.abspath(__file__))
print(basePath)

#----app_template_loaded_code----
#----File newtemplate.txt not exists - now saved!----
#""" w X . y
# File "C:\Users\max\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\utils\validation.py", line 441, in check_array
# "if it contains a single sample.".format(array))
#ValueError: Expected 2D array, got 1D array instead:
#array=[ 1.2 4. ].
#Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
# Label Encoder to turn those columns into numbers. I'd like to use# OneHotEncoder to take it o
#https://stackoverflow.com/questions/47957151/error-expected-2d-array-got-1d-array-instead-using-onehotencoder
#https://www.esecurityplanet.com/views/article.php/1501001/Security-Threat-Correlation-The-Next-Battlefield.htm
# res = cache.get(item)
#TypeError: unhashable type: 'slice'