WLE-classify
HUNG HUO-SU
09/23/2015
Practical Machine Learning Project 09/24/2015
WLE Data analysis
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
training_csv = read.csv("pml-training.csv")
#Partition the original training data (with classe) into two parts:
#70% for training the model and 30% for verification
inTrain <- createDataPartition(y=training_csv$classe, p=0.7, list=FALSE)
training <- training_csv[inTrain,]
testing <- training_csv[-inTrain,]
#We focus only on the accelerometer measurements and ignore the other sensors
#training_accel contains only the accelerometer data, without classe
#training_accel_classe contains the accelerometer data plus classe.
training_accel <- training[grep("^accel", colnames(training))]
training_accel_classe <- cbind(training_accel, training$classe)
colnames(training_accel_classe)[ncol(training_accel_classe)] <- "classe"
colnames(training_accel_classe)[ncol(training_accel_classe)]
## [1] "classe"
#Use the Random Forest method to train a model called modelFit_rf_70
modelFit_rf_70 <- train(training_accel_classe$classe ~ ., data=training_accel_classe, method="rf", prof=TRUE)
## Loading required package: randomForest
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
#The resampling accuracy is over 90%, and mtry = 2 is the best tuning value.
modelFit_rf_70
## Random Forest
##
## 13737 samples
## 12 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 13737, 13737, 13737, 13737, 13737, 13737, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa Accuracy SD Kappa SD
## 2 0.9305551 0.9121133 0.003224900 0.004081289
## 7 0.9210913 0.9001473 0.003691518 0.004695061
## 12 0.9033667 0.8777115 0.005626041 0.007144838
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
#Use confusionMatrix to verify the model accuracy
#The accuracy of the Random Forest model on the hold-out set is over 0.9
confusionMatrix(testing$classe, predict(modelFit_rf_70, testing))
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1613 8 26 23 4
## B 43 1039 35 14 8
## C 13 37 969 6 1
## D 19 4 47 889 5
## E 5 15 6 10 1046
##
## Overall Statistics
##
## Accuracy : 0.9441
## 95% CI : (0.9379, 0.9498)
## No Information Rate : 0.2877
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9293
## Mcnemar's Test P-Value : 9.14e-12
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9527 0.9420 0.8947 0.9437 0.9831
## Specificity 0.9854 0.9791 0.9881 0.9848 0.9925
## Pos Pred Value 0.9636 0.9122 0.9444 0.9222 0.9667
## Neg Pred Value 0.9810 0.9865 0.9765 0.9892 0.9963
## Prevalence 0.2877 0.1874 0.1840 0.1601 0.1808
## Detection Rate 0.2741 0.1766 0.1647 0.1511 0.1777
## Detection Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Balanced Accuracy 0.9691 0.9605 0.9414 0.9643 0.9878
#Store whether each prediction was correct in a predRight column
pred <- predict(modelFit_rf_70, testing)
testing$predRight <- pred==testing$classe
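#A minimal check of the hold-out performance (a small sketch; the names
#holdout_accuracy and oos_error_estimate are introduced only here): predRight
#is TRUE for correct predictions, so its mean is the hold-out accuracy and one
#minus that estimates the out-of-sample error.
holdout_accuracy <- mean(testing$predRight)
oos_error_estimate <- 1 - holdout_accuracy
c(accuracy = holdout_accuracy, oos_error = oos_error_estimate)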
#Predict the answers for pml-testing.csv and collect the results
testing_csv = read.csv("pml-testing.csv")
answers <- predict(modelFit_rf_70, testing_csv)
answers
## [1] B A C A A E D D A A B C B A E E A B B B
## Levels: A B C D E
#Use the pml_write_files() function to create one answer file for each of the
#20 problems
pml_write_files = function(x){
  n = length(x)
  for(i in 1:n){
    filename = paste0("problem_id_",i,".txt")
    write.table(x[i], file=filename, quote=FALSE, row.names=FALSE, col.names=FALSE)
  }
}
pml_write_files(answers)
#Although we get over 90% accuracy on the testing data held out from 30% of
#the original data, we still want to know which conditions cause the
#prediction errors.
#We use the correlation matrix between the accelerometer variables and look
#for pairs with low correlation, in order to show the test results graphically.
#For example, the column with the most TRUE values is accel_forearm_y, and we
#look for the variable least correlated with it.
min(abs(cor(training_accel[which(training_accel_classe$classe == "A"),])))
## [1] 0.01247164
#The min_cor_rcname() function retrieves the row/column names of the minimal
#correlation value for each classe
min_cor_rcname <- function(Class)
{
  #Absolute correlation matrix of the accelerometer variables for one classe
  mdat <- abs(cor(training_accel[which(training_accel_classe$classe == Class),]))
  #Locate the smallest entry and convert the flat index back to row/column
  index <- which.min(mdat)
  k <- arrayInd(index, dim(mdat))
  rr <- rownames(mdat)[k[,1]]
  cc <- colnames(mdat)[k[,2]]
  print(rr)
  print(cc)
}
min_cor_rcname("A")
## [1] "accel_belt_y"
## [1] "accel_belt_x"
min_cor_rcname("B")
## [1] "accel_forearm_x"
## [1] "accel_dumbbell_y"
min_cor_rcname("C")
## [1] "accel_forearm_y"
## [1] "accel_arm_y"
min_cor_rcname("D")
## [1] "accel_forearm_y"
## [1] "accel_belt_z"
min_cor_rcname("E")
## [1] "accel_forearm_z"
## [1] "accel_dumbbell_y"
#Divide the testing data by classe, because we want to observe the errors for
#each class separately
testing_A <- testing[which(testing$classe == "A"),]
testing_B <- testing[which(testing$classe == "B"),]
testing_C <- testing[which(testing$classe == "C"),]
testing_D <- testing[which(testing$classe == "D"),]
testing_E <- testing[which(testing$classe == "E"),]
#Plot graphs for each Classe A,B,C,D,E
qplot(accel_belt_x, accel_belt_y, colour=predict(modelFit_rf_70, testing_A), data=testing_A, main="Class A")
qplot(accel_dumbbell_x, accel_belt_z, colour=predict(modelFit_rf_70, testing_B), data=testing_B, main="Class B")
qplot(accel_belt_y, accel_belt_x, colour=predict(modelFit_rf_70, testing_C), data=testing_C, main="Class C")
qplot(accel_belt_x, accel_forearm_x, colour=predict(modelFit_rf_70, testing_D), data=testing_D, main="Class D")
qplot(accel_dumbbell_y, accel_forearm_z, colour=predict(modelFit_rf_70, testing_E), data=testing_E, main="Class E")
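#A possible refinement of these plots (a sketch, not one of the original
#figures): predRight flags the rows the model got wrong, so the class-A plot
#can be restricted to the misclassified points, which makes the errors easier
#to inspect. testing_A_err is a helper name used only in this sketch.
testing_A_err <- testing_A[!testing_A$predRight,]
qplot(accel_belt_x, accel_belt_y, colour=predict(modelFit_rf_70, testing_A_err),
      data=testing_A_err, main="Class A - misclassified points only")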
Summary
• 1. The Random Forest algorithm has high accuracy, but its performance is poor: it needs a lot of time to train the model (a sketch of a faster setup follows this summary).
• 2. Most errors happen near the center of each group of each class, yet those points are still misclassified. This may be caused by overfitting, so it would be better to reduce the features before training the model with the Random Forest method (see the same sketch below).
• 3. The graphs we generated imply the following:
– Some misclassified A observations are predicted as B.
– Some misclassified B observations are predicted as A or C.
– Some misclassified C observations are predicted as A.
– Some misclassified D observations are predicted as A.
– Some misclassified E observations are predicted as B.
• 4. According to the "Weight Lifting Exercises Dataset" page at http://groupware.les.inf.puc-rio.br/har, the classes are:
– Class A - exactly according to the specification
– Class B - throwing the elbows to the front
– Class C - lifting the dumbbell only halfway
– Class D - lowering the dumbbell only halfway
– Class E - throwing the hips to the front
• 5. When we do the specified exercise, making the mistake of throwing our hips to the front might also push our elbows to the front at the same time.
• 6. The most important variable by Gini importance is accel_belt_z, followed by accel_dumbbell_y (a varImp() sketch follows below).
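A sketch for points 1 and 2 (an illustrative follow-up, not something run in this report): caret can replace the default bootstrap resampling with a small number of cross-validation folds and shrink the feature space with PCA before the forest is grown, which usually shortens training time. The names ctrl and modelFit_rf_cv are introduced only for this sketch.
ctrl <- trainControl(method="cv", number=5)
modelFit_rf_cv <- train(classe ~ ., data=training_accel_classe,
                        method="rf", trControl=ctrl, preProcess="pca")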
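A sketch for point 6: for a model trained with method="rf", caret's varImp() reports the randomForest Gini-based importance (MeanDecreaseGini), scaled to 0-100 by default, so the ranking claimed above can be inspected directly.
varImp(modelFit_rf_70)
#Equivalently, the raw Gini importance from the underlying forest object:
randomForest::importance(modelFit_rf_70$finalModel)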
