SlideShare ist ein Scribd-Unternehmen logo
1 von 3
Downloaden Sie, um offline zu lesen
proj1v2.txt
# Load the raw 1987 flight records from a local CSV file and inspect them.
d <- read.csv("C:/Users/Gaohong/1987.csv")
View(d)  # open the interactive data viewer
str(d)   # print column names and types

# Optional (disabled): restrict the analysis to a single month.
# d <- subset(d, Month == "12")

# Drop columns by position. Every removal shifts the positions of the columns
# that follow it, so the order of these statements matters.
d <- d[, -(20:29)]  # remove useless variables
# d <- d[, -1]      # (disabled) remove month
d <- d[, -1]        # remove year
d <- d[, -10]       # remove TailNum
# NOTE(review): the original comment on the next line also read
# "remove TailNum", duplicating the line above; confirm which column
# position 12 actually holds at this point.
d <- d[, -12]

# Keep only complete rows; `data` is the cleaned table used from here on.
d <- na.omit(d)
data <- d
library(gmodels)
# Summarize the data.
# Split the cleaned table by position into the columns summarized numerically
# and the columns cross-tabulated as categories.
data.num <- data[, c(12, 4:7, 10:11, 13, 16)]
data.cat <- data[, c(1:3, 8:9, 14:15)]

# Per-month mean and standard deviation of every numeric column.
aggregate(data.num, by = list(data$Month), FUN = mean, na.rm = TRUE)
aggregate(data.num, by = list(data$Month), FUN = sd, na.rm = TRUE)

# Cross-tabulate Month against each categorical column, reporting row
# proportions only.
for (cat_name in c("DayofMonth", "DayOfWeek", "UniqueCarrier",
                   "Origin", "Dest")) {
  CrossTable(
    data.cat$Month, data.cat[[cat_name]],
    prop.r = TRUE, prop.c = FALSE, prop.t = FALSE, prop.chisq = FALSE,
    dnn = c("Month", cat_name)
  )
}
# Verify that the delay figures are internally consistent: the recorded
# ArrDelay is compared with DepDelay plus the difference between the actual
# and the scheduled (CRS) elapsed time.
tdc3 <- data.num$DepDelay +
  (data.num$ActualElapsedTime - data.num$CRSElapsedTime)  # expected ArrDelay
length(which(tdc3 != data.num$ArrDelay))  # 78 when originally run

# Delete observations whose delay calculation is inconsistent: blank out the
# mismatching ArrDelay values, then drop every incomplete row.
data.num$ArrDelay <- ifelse(data.num$ArrDelay == tdc3, data.num$ArrDelay, NA)
data.num <- na.omit(data.num)

# Create the class variable ArrivedLate: "Delayed" when ArrDelay >= 15,
# otherwise "On-Time".
data.num$ArrivedLate <- factor(
  ifelse(data.num$ArrDelay >= 15, "D", "O"),
  levels = c("D", "O"),
  labels = c("Delayed", "On-Time")
)
# KNN classification
# Build the KNN table: column 10 (the ArrivedLate label added above) goes
# first, followed by columns 2 through 9 as features. Column 1 is left out.
knndata <- data.num[, c(10, 2:9)]
#' Min-max normalization
#'
#' Rescales a numeric vector linearly so that its minimum maps to 0 and its
#' maximum maps to 1.
#'
#' @param x A numeric vector.
#' @return A numeric vector the same length as `x`. When every element of `x`
#'   is identical the range is zero, and a vector of zeros is returned rather
#'   than the `NaN` values that 0 / 0 would produce. Missing values in `x`
#'   propagate to the whole result, because `min()` and `max()` are called
#'   without `na.rm`.
normalize <- function(x) {
  lo <- min(x)
  span <- max(x) - lo
  # Constant input: avoid dividing by a zero range.
  if (!is.na(span) && span == 0) {
    return(0 * (x - lo))
  }
  (x - lo) / span
}
# Rescale each feature column (columns 2 through 9 of knndata) with
# normalize() and collect the results in a new data frame.
knndata_n <- as.data.frame(
  lapply(knndata[, 2:9], normalize)
)
# confirm that normalization worked
Page 1
proj1v2.txt
summary(knndata_n$DepTime)

# Create training and test data: the first 900000 rows train the classifier
# and every remaining row is held out for testing.
# The test range was previously written `900001:130616`, which R expands as a
# descending sequence that selects rows already used for training. The upper
# bound is now taken from the data itself.
# NOTE(review): confirm that "all remaining rows" is the intended test set.
n_train <- 900000
n_total <- nrow(knndata_n)
stopifnot(n_total > n_train)
train_rows <- seq_len(n_train)
test_rows <- (n_train + 1):n_total

my_train <- knndata_n[train_rows, ]
my_test <- knndata_n[test_rows, ]

# Create labels for training and test data (column 1 of knndata is the label).
my_train_labels <- knndata[train_rows, 1]
my_test_labels <- knndata[test_rows, 1]

library(class)

# Fit k-nearest-neighbour classifiers for three neighbourhood sizes.
my_test_pred1 <- knn(
  train = my_train, test = my_test, cl = my_train_labels, k = 5
)
my_test_pred2 <- knn(
  train = my_train, test = my_test, cl = my_train_labels, k = 15
)
my_test_pred3 <- knn(
  train = my_train, test = my_test, cl = my_train_labels, k = 21
)

# Confusion tables: true test labels against each set of predictions.
CrossTable(x = my_test_labels, y = my_test_pred1, prop.chisq = FALSE)
CrossTable(x = my_test_labels, y = my_test_pred2, prop.chisq = FALSE)
CrossTable(x = my_test_labels, y = my_test_pred3, prop.chisq = FALSE)
## Examine the correlation between ArrDelay and the other quantitative
## variables in the dataset.
# Exploring relationships among features: correlation matrix of the first
# nine columns of data.num.
cor(data.num[1:9])
# Regression tree
# Randomly assign 70% of the rows to training; the rest form the test set.
# No seed is set, so the split differs from run to run.
idxs.r <- sample(seq_len(nrow(data.num)), as.integer(0.7 * nrow(data.num)))

# create the training and test datasets
al1987.num_train <- data.num[idxs.r, ]
al1987.num_test <- data.num[-idxs.r, ]

# Regression tree using rpart.
# ArrivedLate is computed directly from ArrDelay, so it is withheld from the
# predictors; leaving it in would leak the target into the model.
library(rpart)
tree_cols <- setdiff(names(al1987.num_train), "ArrivedLate")
al1987.rpart <- rpart(ArrDelay ~ ., data = al1987.num_train[, tree_cols])

# get basic information about the tree
al1987.rpart
# get more detailed information about the tree
summary(al1987.rpart)

library(rpart.plot)
# a basic decision tree diagram
rpart.plot(al1987.rpart, digits = 3)
# a few adjustments to the diagram
rpart.plot(
  al1987.rpart,
  digits = 4, fallen.leaves = TRUE, type = 3, extra = 101
)

# Predict ArrDelay for the held-out rows.
p.rpart <- predict(al1987.rpart, al1987.num_test)
# compare the distribution of predicted values vs. actual values
summary(p.rpart)
summary(al1987.num_test$ArrDelay)
# Compare the correlation. The value 0.8239499 was recorded from a run that
# still had ArrivedLate among the predictors, so expect a different figure.
cor(p.rpart, al1987.num_test$ArrDelay)
## Use the ROC curve to further examine the effectiveness of the model for
## prediction.
Page 2
proj1v2.txt
library(ROCR)

# Build the ROCR prediction object for the k = 5 classifier.
# - The true labels are my_test_labels; the name my_test_pred1_labels used
#   previously is not defined anywhere in this script.
# - prediction() needs numeric scores, but knn() returns a factor of
#   predicted classes, so each test row is scored 1 when predicted "Delayed"
#   and 0 otherwise.
# - label.ordering lists the negative class first, making "Delayed" the
#   positive class.
# With 0/1 scores the ROC curve has a single interior point; calling knn()
# with prob = TRUE would be needed for a graded score.
knn_scores <- as.numeric(my_test_pred1 == "Delayed")
pred <- prediction(
  predictions = knn_scores,
  labels = my_test_labels,
  label.ordering = c("On-Time", "Delayed")
)

# ROC curve: true positive rate against false positive rate.
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf, main = "ROC curve for airline delay filter", col = "blue", lwd = 2)
# add a reference line to the graph
abline(a = 0, b = 1, lwd = 2, lty = 2)

# calculate AUC
perf.auc <- performance(pred, measure = "auc")
str(perf.auc)
as.numeric(perf.auc@y.values)
Page 3

Weitere ähnliche Inhalte

Was ist angesagt?

Manipulating Data using base R package
Manipulating Data using base R package Manipulating Data using base R package
Manipulating Data using base R package Rupak Roy
 
Day 1d R structures & objects: matrices and data frames.pptx
Day 1d   R structures & objects: matrices and data frames.pptxDay 1d   R structures & objects: matrices and data frames.pptx
Day 1d R structures & objects: matrices and data frames.pptxAdrien Melquiond
 
Day 1b R structures objects.pptx
Day 1b   R structures   objects.pptxDay 1b   R structures   objects.pptx
Day 1b R structures objects.pptxAdrien Melquiond
 
DPLYR package in R
DPLYR package in RDPLYR package in R
DPLYR package in RBimba Pawar
 
Manipulating data with dates
Manipulating data with datesManipulating data with dates
Manipulating data with datesRupak Roy
 
Patterns in Terraform 12+13: Data, Transformations and Resources
Patterns in Terraform 12+13: Data, Transformations and ResourcesPatterns in Terraform 12+13: Data, Transformations and Resources
Patterns in Terraform 12+13: Data, Transformations and ResourcesKatie Reese
 
Day 5b statistical functions.pptx
Day 5b   statistical functions.pptxDay 5b   statistical functions.pptx
Day 5b statistical functions.pptxAdrien Melquiond
 
20140427 parallel programming_zlobin_lecture11
20140427 parallel programming_zlobin_lecture1120140427 parallel programming_zlobin_lecture11
20140427 parallel programming_zlobin_lecture11Computer Science Club
 
Data handling in r
Data handling in rData handling in r
Data handling in rAbhik Seal
 
Introduction to data.table in R
Introduction to data.table in RIntroduction to data.table in R
Introduction to data.table in RPaul Richards
 
Stata cheat sheet: data transformation
Stata  cheat sheet: data transformationStata  cheat sheet: data transformation
Stata cheat sheet: data transformationTim Essam
 
Vectors data frames
Vectors data framesVectors data frames
Vectors data framesFAO
 
Stata cheatsheet transformation
Stata cheatsheet transformationStata cheatsheet transformation
Stata cheatsheet transformationLaura Hughes
 
ggtimeseries-->ggplot2 extensions
ggtimeseries-->ggplot2 extensions ggtimeseries-->ggplot2 extensions
ggtimeseries-->ggplot2 extensions Dr. Volkan OBAN
 

Was ist angesagt? (20)

Manipulating Data using base R package
Manipulating Data using base R package Manipulating Data using base R package
Manipulating Data using base R package
 
Day 1d R structures & objects: matrices and data frames.pptx
Day 1d   R structures & objects: matrices and data frames.pptxDay 1d   R structures & objects: matrices and data frames.pptx
Day 1d R structures & objects: matrices and data frames.pptx
 
Day 1b R structures objects.pptx
Day 1b   R structures   objects.pptxDay 1b   R structures   objects.pptx
Day 1b R structures objects.pptx
 
Day 2 repeats.pptx
Day 2 repeats.pptxDay 2 repeats.pptx
Day 2 repeats.pptx
 
DPLYR package in R
DPLYR package in RDPLYR package in R
DPLYR package in R
 
Day 2b i/o.pptx
Day 2b   i/o.pptxDay 2b   i/o.pptx
Day 2b i/o.pptx
 
Manipulating data with dates
Manipulating data with datesManipulating data with dates
Manipulating data with dates
 
Dplyr and Plyr
Dplyr and PlyrDplyr and Plyr
Dplyr and Plyr
 
R code for data manipulation
R code for data manipulationR code for data manipulation
R code for data manipulation
 
Patterns in Terraform 12+13: Data, Transformations and Resources
Patterns in Terraform 12+13: Data, Transformations and ResourcesPatterns in Terraform 12+13: Data, Transformations and Resources
Patterns in Terraform 12+13: Data, Transformations and Resources
 
Day 5b statistical functions.pptx
Day 5b   statistical functions.pptxDay 5b   statistical functions.pptx
Day 5b statistical functions.pptx
 
20140427 parallel programming_zlobin_lecture11
20140427 parallel programming_zlobin_lecture1120140427 parallel programming_zlobin_lecture11
20140427 parallel programming_zlobin_lecture11
 
Data handling in r
Data handling in rData handling in r
Data handling in r
 
Introduction to data.table in R
Introduction to data.table in RIntroduction to data.table in R
Introduction to data.table in R
 
Stata cheat sheet: data transformation
Stata  cheat sheet: data transformationStata  cheat sheet: data transformation
Stata cheat sheet: data transformation
 
19 tables
19 tables19 tables
19 tables
 
Vectors data frames
Vectors data framesVectors data frames
Vectors data frames
 
Rsplit apply combine
Rsplit apply combineRsplit apply combine
Rsplit apply combine
 
Stata cheatsheet transformation
Stata cheatsheet transformationStata cheatsheet transformation
Stata cheatsheet transformation
 
ggtimeseries-->ggplot2 extensions
ggtimeseries-->ggplot2 extensions ggtimeseries-->ggplot2 extensions
ggtimeseries-->ggplot2 extensions
 

Andere mochten auch

Проект Соколова
Проект СоколоваПроект Соколова
Проект СоколоваSlava Barsov
 
Design Thinking Plus Solution For Digital Transformation mrh cxc
Design Thinking Plus Solution For Digital Transformation mrh cxcDesign Thinking Plus Solution For Digital Transformation mrh cxc
Design Thinking Plus Solution For Digital Transformation mrh cxcClient X Client
 
Oakley and Retail Pro®: recognizing, anticipating, and fulfilling needs
Oakley and Retail Pro®: recognizing, anticipating, and fulfilling needsOakley and Retail Pro®: recognizing, anticipating, and fulfilling needs
Oakley and Retail Pro®: recognizing, anticipating, and fulfilling needsRetail Pro International, LLC
 

Andere mochten auch (7)

Проект Соколова
Проект СоколоваПроект Соколова
Проект Соколова
 
CAN Capital
CAN CapitalCAN Capital
CAN Capital
 
Sandeep desai cv
Sandeep desai cvSandeep desai cv
Sandeep desai cv
 
Design Thinking Plus Solution For Digital Transformation mrh cxc
Design Thinking Plus Solution For Digital Transformation mrh cxcDesign Thinking Plus Solution For Digital Transformation mrh cxc
Design Thinking Plus Solution For Digital Transformation mrh cxc
 
Profetas
ProfetasProfetas
Profetas
 
INVITACIÓN A MATRICULARSE
INVITACIÓN A MATRICULARSEINVITACIÓN A MATRICULARSE
INVITACIÓN A MATRICULARSE
 
Oakley and Retail Pro®: recognizing, anticipating, and fulfilling needs
Oakley and Retail Pro®: recognizing, anticipating, and fulfilling needsOakley and Retail Pro®: recognizing, anticipating, and fulfilling needs
Oakley and Retail Pro®: recognizing, anticipating, and fulfilling needs
 

Ähnlich wie proj1v2

R programming intro with examples
R programming intro with examplesR programming intro with examples
R programming intro with examplesDennis
 
Seminar PSU 09.04.2013 - 10.04.2013 MiFIT, Arbuzov Vyacheslav
Seminar PSU 09.04.2013 - 10.04.2013 MiFIT, Arbuzov VyacheslavSeminar PSU 09.04.2013 - 10.04.2013 MiFIT, Arbuzov Vyacheslav
Seminar PSU 09.04.2013 - 10.04.2013 MiFIT, Arbuzov VyacheslavVyacheslav Arbuzov
 
Rcommands-for those who interested in R.
Rcommands-for those who interested in R.Rcommands-for those who interested in R.
Rcommands-for those who interested in R.Dr. Volkan OBAN
 
R is a very flexible and powerful programming language, as well as a.pdf
R is a very flexible and powerful programming language, as well as a.pdfR is a very flexible and powerful programming language, as well as a.pdf
R is a very flexible and powerful programming language, as well as a.pdfannikasarees
 
R Programming.pptx
R Programming.pptxR Programming.pptx
R Programming.pptxkalai75
 
The Very ^ 2 Basics of R
The Very ^ 2 Basics of RThe Very ^ 2 Basics of R
The Very ^ 2 Basics of RWinston Chen
 
Data Manipulation with Numpy and Pandas in PythonStarting with N
Data Manipulation with Numpy and Pandas in PythonStarting with NData Manipulation with Numpy and Pandas in PythonStarting with N
Data Manipulation with Numpy and Pandas in PythonStarting with NOllieShoresna
 
A quick introduction to R
A quick introduction to RA quick introduction to R
A quick introduction to RAngshuman Saha
 
Practical data science_public
Practical data science_publicPractical data science_public
Practical data science_publicLong Nguyen
 
Basic R Data Manipulation
Basic R Data ManipulationBasic R Data Manipulation
Basic R Data ManipulationChu An
 
Bsc cs ii dfs u-1 introduction to data structure
Bsc cs ii dfs u-1 introduction to data structureBsc cs ii dfs u-1 introduction to data structure
Bsc cs ii dfs u-1 introduction to data structureRai University
 

Ähnlich wie proj1v2 (20)

R programming
R programmingR programming
R programming
 
R code for data manipulation
R code for data manipulationR code for data manipulation
R code for data manipulation
 
R programming intro with examples
R programming intro with examplesR programming intro with examples
R programming intro with examples
 
NCCU: Statistics in the Criminal Justice System, R basics and Simulation - Pr...
NCCU: Statistics in the Criminal Justice System, R basics and Simulation - Pr...NCCU: Statistics in the Criminal Justice System, R basics and Simulation - Pr...
NCCU: Statistics in the Criminal Justice System, R basics and Simulation - Pr...
 
3 Data Structure in R
3 Data Structure in R3 Data Structure in R
3 Data Structure in R
 
Seminar PSU 09.04.2013 - 10.04.2013 MiFIT, Arbuzov Vyacheslav
Seminar PSU 09.04.2013 - 10.04.2013 MiFIT, Arbuzov VyacheslavSeminar PSU 09.04.2013 - 10.04.2013 MiFIT, Arbuzov Vyacheslav
Seminar PSU 09.04.2013 - 10.04.2013 MiFIT, Arbuzov Vyacheslav
 
Rcommands-for those who interested in R.
Rcommands-for those who interested in R.Rcommands-for those who interested in R.
Rcommands-for those who interested in R.
 
R is a very flexible and powerful programming language, as well as a.pdf
R is a very flexible and powerful programming language, as well as a.pdfR is a very flexible and powerful programming language, as well as a.pdf
R is a very flexible and powerful programming language, as well as a.pdf
 
R for Statistical Computing
R for Statistical ComputingR for Statistical Computing
R for Statistical Computing
 
R Programming.pptx
R Programming.pptxR Programming.pptx
R Programming.pptx
 
The Very ^ 2 Basics of R
The Very ^ 2 Basics of RThe Very ^ 2 Basics of R
The Very ^ 2 Basics of R
 
Introduction to R
Introduction to RIntroduction to R
Introduction to R
 
Data Manipulation with Numpy and Pandas in PythonStarting with N
Data Manipulation with Numpy and Pandas in PythonStarting with NData Manipulation with Numpy and Pandas in PythonStarting with N
Data Manipulation with Numpy and Pandas in PythonStarting with N
 
R교육1
R교육1R교육1
R교육1
 
A quick introduction to R
A quick introduction to RA quick introduction to R
A quick introduction to R
 
Practical data science_public
Practical data science_publicPractical data science_public
Practical data science_public
 
Basic R Data Manipulation
Basic R Data ManipulationBasic R Data Manipulation
Basic R Data Manipulation
 
R language introduction
R language introductionR language introduction
R language introduction
 
Bsc cs ii dfs u-1 introduction to data structure
Bsc cs ii dfs u-1 introduction to data structureBsc cs ii dfs u-1 introduction to data structure
Bsc cs ii dfs u-1 introduction to data structure
 
R
RR
R
 

proj1v2

  • 1. proj1v2.txt #Reading the data d <- read.csv("C:/Users/Gaohong/1987.csv") View(d) str(d) # d = subset(d, Month == "12") d = d[, -20:-29]# remove useless variables #d = d[, -1] # remove month d = d[, -1] # remove year d = d[, -10] # remove TailNum d = d[, -12] # remove TailNum d<-na.omit(d) data<-d library(gmodels) #Summarize the data. data.num<-data[, c(12,4,5,6,7,10,11,13,16)] data.cat<-data[, c(1,2,3,8,9,14,15)] aggregate(data.num, list(data$Month), mean, na.rm=TRUE) aggregate(data.num, list(data$Month), sd, na.rm=TRUE) CrossTable(data.cat$Month, data.cat$DayofMonth,prop.r=TRUE, prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE,dnn = c('Month','DayofMonth')) CrossTable(data.cat$Month, data.cat$DayOfWeek,prop.r=TRUE, prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE,dnn = c('Month','DayOfWeek')) CrossTable(data.cat$Month, data.cat$UniqueCarrier,prop.r=TRUE, prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE,dnn = c('Month','UniqueCarrier')) CrossTable(data.cat$Month, data.cat$Origin,prop.r=TRUE, prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE,dnn = c('Month','Origin')) CrossTable(data.cat$Month, data.cat$Dest,prop.r=TRUE, prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE,dnn = c('Month','Dest')) # verify that the time delay calculations are correct tdc3=data.num$DepDelay+(data.num$ActualElapsedTime-data.num$CRSElapsedTime) # ArrDelay length(which(tdc3!=data.num$ArrDelay)) #78 # delete observation that the delay calculations are incorrect data.num$ArrDelay <- ifelse(data.num$ArrDelay == tdc3,data.num$ArrDelay, NA) data.num<-na.omit(data.num) #Create a variable ArrivedLate data.num$ArrivedLate <- ifelse(data.num$ArrDelay >= 15 ,"D", "O") data.num$ArrivedLate <- factor(data.num$ArrivedLate, levels = c("D", "O"),labels = c("Delayed", "On-Time")) #KNN Classification #removing not useful column. 
knndata=data.num[, c(10, 2, 3,4,5,6,7,8,9)] # create normalization function normalize <- function(x) { return ((x - min(x)) / (max(x) - min(x))) } knndata_n <- as.data.frame(lapply(knndata[2:9], normalize)) # confirm that normalization worked Page 1
  • 2. proj1v2.txt summary(knndata_n$DepTime) # create training and test data my_train <- knndata_n[1:900000, ] my_test <- knndata_n[900001:130616, ] # create labels for training and test data my_train_labels <- knndata[1:900000, 1] my_test_labels <- knndata[900001:130616, 1] library(class) my_test_pred1 <- knn(train = my_train, test = my_test,cl = my_train_labels, k=5) my_test_pred2 <- knn(train = my_train, test = my_test,cl = my_train_labels, k=15) my_test_pred3 <- knn(train = my_train, test = my_test,cl = my_train_labels, k=21) CrossTable(x = my_test_labels, y = my_test_pred1 ,prop.chisq=FALSE) CrossTable(x = my_test_labels, y = my_test_pred2 ,prop.chisq=FALSE) CrossTable(x = my_test_labels, y = my_test_pred3 ,prop.chisq=FALSE) ## Examine the correlation between ArrDelay and the other quantitative variables in the dataset. # exploring relationships among features: correlation matrix cor(data.num[c(1, 2, 3,4,5,6,7,8,9)]) # Regression Tree idxs.r <- sample(1:nrow(data.num), as.integer(0.7*nrow(data.num))) # create the training and test datasets # Regression Tree al1987.num_train <- data.num[idxs.r,] al1987.num_test <- data.num[-idxs.r,] # regression tree using rpart library(rpart) al1987.rpart <- rpart(ArrDelay ~ ., data = al1987.num_train) # get basic information about the tree al1987.rpart # get more detailed information about the tree summary(al1987.rpart) library(rpart.plot) # a basic decision tree diagram rpart.plot(al1987.rpart, digits = 3) # a few adjustments to the diagram rpart.plot(al1987.rpart, digits = 4, fallen.leaves = TRUE, type = 3, extra = 101) p.rpart <- predict(al1987.rpart, al1987.num_test) # compare the distribution of predicted values vs. actual values summary(p.rpart) summary(al1987.num_test$ArrDelay) # compare the correlation cor(p.rpart, al1987.num_test$ArrDelay) # 0.8239499 ## Use the ROC curve to further examine the effectiveness of the model for prediction. Page 2
  • 3. proj1v2.txt library(ROCR) pred <- prediction(predictions = my_test_pred1,labels = my_test_pred1_labels) # ROC curves perf <- performance(pred, measure = "tpr", x.measure = "fpr") plot(perf, main = "ROC curve for airline delay filter", col = "blue", lwd = 2) # add a reference line to the graph abline(a = 0, b = 1, lwd = 2, lty = 2) # calculate AUC perf.auc <- performance(pred, measure = "auc") str(perf.auc) as.numeric(perf.auc@y.values) Page 3