# 1. proj1v2.txt
# Reading the data
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
d <- read.csv("C:/Users/Gaohong/1987.csv")
# View() opens a data viewer window, so only call it in an interactive session;
# this keeps the script usable via Rscript / source() in batch mode.
if (interactive()) {
  View(d)
}
str(d)
# d <- subset(d, Month == "12")
# Columns are dropped by position, one step at a time; every removal shifts the
# positions of the columns to its right, so the order of these lines matters.
d <- d[, -(20:29)] # remove useless variables (columns 20 to 29)
# d <- d[, -1] # remove month
d <- d[, -1] # remove year
d <- d[, -10] # remove TailNum
# The original comment here repeated "remove TailNum", but TailNum is already
# gone at this point: this drops whatever now sits at position 12
# (presumably AirTime -- TODO confirm against the CSV header).
d <- d[, -12]
# Keep only complete rows.
d <- na.omit(d)
data <- d
library(gmodels)
# Summarize the data.
# Split the cleaned data frame into the columns treated as numeric and the
# columns treated as categorical (selected by position).
data.num <- data[, c(12, 4:7, 10, 11, 13, 16)]
data.cat <- data[, c(1:3, 8, 9, 14, 15)]
# Per-month mean and standard deviation of every numeric column.
aggregate(data.num, by = list(data$Month), FUN = mean, na.rm = TRUE)
aggregate(data.num, by = list(data$Month), FUN = sd, na.rm = TRUE)
# Cross-tabulate Month against each of the other categorical variables,
# showing row proportions only.
invisible(lapply(
  c("DayofMonth", "DayOfWeek", "UniqueCarrier", "Origin", "Dest"),
  function(col_name) {
    CrossTable(
      data.cat$Month, data.cat[[col_name]],
      prop.r = TRUE, prop.c = FALSE, prop.t = FALSE, prop.chisq = FALSE,
      dnn = c("Month", col_name)
    )
  }
))
# Verify that the time delay calculations are correct: the recorded ArrDelay
# should equal DepDelay + (ActualElapsedTime - CRSElapsedTime).
tdc3 <- data.num$DepDelay +
  (data.num$ActualElapsedTime - data.num$CRSElapsedTime)
# Number of rows where the recorded and recomputed delays disagree
# (the original author noted 78).
length(which(tdc3 != data.num$ArrDelay))
# Delete observations whose delay calculation is inconsistent: mark them NA,
# then drop every row containing an NA.
data.num$ArrDelay <- ifelse(data.num$ArrDelay == tdc3, data.num$ArrDelay, NA)
data.num <- na.omit(data.num)
# Create a variable ArrivedLate: "Delayed" when ArrDelay >= 15, else "On-Time".
data.num$ArrivedLate <- ifelse(data.num$ArrDelay >= 15, "D", "O")
data.num$ArrivedLate <- factor(
  data.num$ArrivedLate,
  levels = c("D", "O"),
  labels = c("Delayed", "On-Time")
)
# KNN classification
# Build the KNN input: the class label (column 10, ArrivedLate) comes first,
# followed by columns 2 to 9; column 1 (presumably ArrDelay, from which the
# label is derived) is left out.
knndata <- data.num[, c(10, 2:9)]
# Min-max normalization: rescale a numeric vector linearly onto [0, 1].
#   x: numeric vector.
# Returns a numeric vector of the same length as x. A constant vector (zero
# range) is mapped to all zeros instead of NaN (0 / 0), so that a degenerate
# feature does not inject NaN into later distance computations.
# Any NA in x still makes the whole result NA, because min()/max() return NA.
normalize <- function(x) {
  lo <- min(x)
  span <- max(x) - lo
  if (!is.na(span) && span == 0) {
    return(rep(0, length(x)))
  }
  (x - lo) / span
}
# Normalize the eight feature columns (columns 2 to 9 of knndata; column 1 is
# the class label) and collect them in a new data frame.
knndata_n <- as.data.frame(lapply(knndata[2:9], normalize))
# (page break in the original listing: "Page 1" / "2. proj1v2.txt")
# confirm that normalization worked
summary(knndata_n$DepTime)
# create training and test data
# The first n_train rows are used for training and all remaining rows for
# testing. The original upper bound (900001:130616) produced a descending
# sequence that re-selected training rows, so the test set overlapped the
# training set; the bound is now taken from the data itself.
n_train <- 900000
stopifnot(nrow(knndata_n) > n_train)
train_rows <- seq_len(n_train)
test_rows <- (n_train + 1):nrow(knndata_n)
my_train <- knndata_n[train_rows, ]
my_test <- knndata_n[test_rows, ]
# create labels for training and test data (column 1 of knndata is the label)
my_train_labels <- knndata[train_rows, 1]
my_test_labels <- knndata[test_rows, 1]
library(class)
# Fit k-nearest-neighbour classifiers for three choices of k.
my_test_pred1 <- knn(train = my_train, test = my_test, cl = my_train_labels, k = 5)
my_test_pred2 <- knn(train = my_train, test = my_test, cl = my_train_labels, k = 15)
my_test_pred3 <- knn(train = my_train, test = my_test, cl = my_train_labels, k = 21)
# Confusion tables: actual labels (rows) against predicted labels (columns).
CrossTable(x = my_test_labels, y = my_test_pred1, prop.chisq = FALSE)
CrossTable(x = my_test_labels, y = my_test_pred2, prop.chisq = FALSE)
CrossTable(x = my_test_labels, y = my_test_pred3, prop.chisq = FALSE)
## Examine the correlation between ArrDelay and the other quantitative
## variables in the dataset.
# exploring relationships among features: correlation matrix of columns 1 to 9
# of data.num (column 10, the ArrivedLate factor, is excluded)
cor(data.num[1:9])
# Regression Tree
# Fix the RNG seed so the random 70/30 split below can be reproduced.
set.seed(1987)
idxs.r <- sample(seq_len(nrow(data.num)), as.integer(0.7 * nrow(data.num)))
# create the training and test datasets
al1987.num_train <- data.num[idxs.r, ]
al1987.num_test <- data.num[-idxs.r, ]
# regression tree using rpart
library(rpart)
# ArrivedLate is computed directly from ArrDelay (ArrDelay >= 15), so using it
# as a predictor would leak the target into the model; exclude it explicitly.
al1987.rpart <- rpart(ArrDelay ~ . - ArrivedLate, data = al1987.num_train)
# get basic information about the tree
al1987.rpart
# get more detailed information about the tree
summary(al1987.rpart)
library(rpart.plot)
# a basic decision tree diagram
rpart.plot(al1987.rpart, digits = 3)
# a few adjustments to the diagram
rpart.plot(al1987.rpart, digits = 4, fallen.leaves = TRUE, type = 3, extra = 101)
# predict ArrDelay for the held-out rows
p.rpart <- predict(al1987.rpart, al1987.num_test)
# compare the distribution of predicted values vs. actual values
summary(p.rpart)
summary(al1987.num_test$ArrDelay)
# compare the correlation
# (the original script recorded 0.8239499 here; that figure came from an
# unseeded split with ArrivedLate still among the predictors, so expect a
# different value now)
cor(p.rpart, al1987.num_test$ArrDelay)
## Use the ROC curve to further examine the effectiveness of the model for
## prediction.
# (page break in the original listing: "Page 2" / "3. proj1v2.txt")
library(ROCR)
# ROCR needs numeric scores, but knn() returns a factor of predicted classes.
# Score each test row 1 when the k = 5 model predicts "Delayed" and 0 otherwise,
# and tell ROCR that "Delayed" is the positive class.
# NOTE(review): with hard 0/1 scores the ROC curve has a single interior point;
# for a graded curve, rerun knn() with prob = TRUE and use the vote proportions.
knn_scores <- as.numeric(my_test_pred1 == "Delayed")
# The original referenced an undefined object (my_test_pred1_labels); the true
# test labels are held in my_test_labels.
pred <- prediction(
  predictions = knn_scores,
  labels = my_test_labels,
  label.ordering = c("On-Time", "Delayed")
)
# ROC curves
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf, main = "ROC curve for airline delay filter", col = "blue", lwd = 2)
# add a reference line to the graph
abline(a = 0, b = 1, lwd = 2, lty = 2)
# calculate AUC
perf.auc <- performance(pred, measure = "auc")
str(perf.auc)
as.numeric(perf.auc@y.values)
# Page 3