# Example 02
#
# Taken from David Mease Lecture Notes
# uses stats202log.txt

# ---- Warm up: create data ----

# Build two small numeric vectors and combine them into a data frame,
# then demonstrate column, row, cell and row-range indexing.
aa <- c(1, 10, 12)
aa
aa + 10
length(aa)
bb <- c(2, 6, 79)
my_data_set <- data.frame(attributeA = aa, attributeB = bb)
my_data_set
my_data_set[, 1]    # first column
my_data_set[1, ]    # first row
my_data_set[3, 2]   # row 3, column 2
my_data_set[1:2, ]  # first two rows

# ---- Sample data ----
#
# Create a random subset of the data for analysis:
# draw 10 samples with replacement from the 7th column.

# What directory are you in and where do you want to be?
getwd()
lecture_dir <- "C:/Users/PatriciaHoffman/Desktop/MLHacker/DavidMease/Lectures/Lecture02"
# Only switch directories when the lecture folder exists, so the script does
# not abort on machines where the data files sit in the current directory.
if (dir.exists(lecture_dir)) {
  setwd(lecture_dir)
} else {
  message("Lecture directory not found; using current working directory.")
}

# Space-separated log file without a header row; "-" marks a missing field.
data <- read.csv("stats202log.txt", sep = " ", header = FALSE,
                 na.strings = "-")
data[, 7]

# Use the actual number of rows rather than a hard-coded count.
n_rows <- nrow(data)

?sample
my_seq <- seq_len(n_rows)
sam <- sample(my_seq, 10, replace = TRUE)
?seq
my_seq
sam
my_sample <- data[sam, 7]
my_sample

# Absolute error of the sample mean over repeated random samples.
#
# values:      numeric vector treated as the full population.
# sample_size: number of observations drawn, with replacement, per sample.
# n_reps:      number of samples to draw.
#
# Returns a numeric vector of length n_reps; element k is
# abs(mean(sample k) - mean(values)).
sample_mean_abs_errors <- function(values, sample_size, n_reps = 10000) {
  true_mean <- mean(values)
  vapply(
    seq_len(n_reps),
    function(k) {
      picked <- sample.int(length(values), sample_size, replace = TRUE)
      abs(mean(values[picked]) - true_mean)
    },
    numeric(1)
  )
}

# Sample repeatedly and see what happens.
# NOTE(review): mean() returns NA if column 7 contains missing values
# (fields read as "-") -- confirm the column is complete.
real_mean <- mean(data[, 7])
?rep
store_diff <- sample_mean_abs_errors(data[, 7], sample_size = 10)
mean(store_diff)

# Change the sample size from 10 to 100:
# with the larger sample size the mean is more accurate.
store_diff <- sample_mean_abs_errors(data[, 7], sample_size = 100)
mean(store_diff)

# ---- Plotting examples ----

exam_scores <- read.csv("exam_scores.csv", header = FALSE)

# ---- Histograms ----
#
# Make a frequency histogram for the exam scores using bins of width 10,
# beginning at 120 and ending at 200.
hist(exam_scores[, 1],
     breaks = seq(120, 200, by = 10),
     col = "red",
     xlab = "Exam Scores",
     ylab = "Frequency",
     main = "Exam Score Histogram")

# Frequency polygon for the exam scores, using the same bins
# (width 10, from 120 to 200). hist(plot = FALSE) only computes the bins.
my_hist <- hist(exam_scores[, 1],
                breaks = seq(120, 200, by = 10),
                plot = FALSE)
counts <- my_hist$counts
breaks <- my_hist$breaks

# x: bin midpoints (breaks + 5), extended by one empty bin on each side
# (115 below, 205 above) so the polygon starts and ends at zero.
plot(c(115, breaks + 5), c(0, counts, 0),
     pch = 19,
     xlab = "Exam Scores",
     ylab = "Frequency",
     main = "Frequency Polygon")
# Connect the frequency-polygon points on the current plot.
lines(c(115, breaks + 5), c(0, counts, 0))

# ---- Empirical cumulative distribution function (ecdf) ----
#
# A cumulative distribution function (CDF) gives the probability that a
# point is less than or equal to a value.
# For each observed value, the empirical cumulative distribution function
# (ECDF) gives the fraction of points that are less than or equal to it.
#
# A plot of the ECDF is sometimes called an ogive.
plot(ecdf(exam_scores[, 1]),
     verticals = TRUE,
     do.p = FALSE,
     main = "ECDF for Exam Scores",
     xlab = "Exam Scores",
     ylab = "Cumulative Percent")

# ---- Relative frequency polygons ----
#
# Plot relative frequency polygons for both the first and second exams
# on the same graph, with a legend.
more_exam_scores <- read.csv("more_exam_scores.csv", header = FALSE)

my_new_hist <- hist(more_exam_scores[, 1],
                    breaks = seq(100, 200, by = 10),
                    plot = FALSE)
new_counts <- my_new_hist$counts
new_breaks <- my_new_hist$breaks

# Divide by the number of binned scores instead of a hard-coded class size,
# so the relative frequencies always sum to one.
new_rel_freq <- new_counts / sum(new_counts)
rel_freq <- counts / sum(counts)

# Second exam: bin midpoints padded with an empty bin on each side.
plot(c(95, new_breaks + 5), c(0, new_rel_freq, 0),
     pch = 19,
     xlab = "Exam Scores",
     ylab = "Relative Frequency",
     main = "Relative Frequency Polygons",
     ylim = c(0, .30))
lines(c(95, new_breaks + 5), c(0, new_rel_freq, 0), lty = 2)

# First and second exam on the same graph, with a legend.
points(c(115, breaks + 5), c(0, rel_freq, 0), col = "blue", pch = 19)
lines(c(115, breaks + 5), c(0, rel_freq, 0), col = "blue", lty = 1)
legend(110, .25, c("Exam 2", "Exam 1"),
       col = c("black", "blue"), lty = c(2, 1), pch = 19)

# ---- ECDF for both exams ----
plot(ecdf(exam_scores[, 1]),
     verticals = TRUE,
     do.p = FALSE,
     main = "ECDF for Exam Scores",
     xlab = "Exam Scores",
     ylab = "Cumulative Percent",
     xlim = c(100, 200))
lines(ecdf(more_exam_scores[, 1]),
      verticals = TRUE,
      do.p = FALSE,
      col.h = "red", col.v = "red", lwd = 4)
legend(110, .6, c("Exam 1", "Exam 2"),
       col = c("black", "red"), lwd = c(1, 4))

# ---- Scatter plots ----
data <- read.csv("exams_and_names.csv")
plot(data$Exam.1, data$Exam.2,
     xlim = c(100, 200), ylim = c(100, 200),
     pch = 19,
     main = "Exam Scores", xlab = "Exam 1", ylab = "Exam 2")
# Reference line y = x (intercept 0, slope 1).
abline(a = 0, b = 1)

# Label the points of students who scored below 150 on Exam 1.
low_exam1 <- data$Exam.1 < 150
text(data$Exam.1[low_exam1],
     data$Exam.2[low_exam1],
     labels = data$Student[low_exam1],
     adj = 1)
# Click points on the current scatter plot to label them with student names.
# identify() waits for mouse input, so only call it in an interactive session.
if (interactive()) {
  identify(data$Exam.1, data$Exam.2, labels = data$Student)
}

# Add noise: shift every score by a uniform random amount in [-0.5, 0.5].
# One draw per row, rather than a hard-coded count of 40.
n_students <- nrow(data)
data$Exam.1 <- data$Exam.1 + runif(n_students) - .5
data$Exam.2 <- data$Exam.2 + runif(n_students) - .5

# Plot again with the noisy scores.
plot(data$Exam.1, data$Exam.2,
     xlim = c(100, 200), ylim = c(100, 200),
     pch = 19,
     main = "Exam Scores", xlab = "Exam 1", ylab = "Exam 2")
# Reference line y = x (intercept 0, slope 1).
abline(a = 0, b = 1)