Professional Documents
Culture Documents
x=read.csv(file.choose(),header=TRUE)  # use header=FALSE if the file has no header row
y=x$name of column or variable
plot(x,y,type="l")
hist(y)
z=c(10,15,20)
Cluster
rownames(P)=P$column name
m=dist(as.matrix(P))
hc=hclust(m)
plot(hc)
Scatterplot
extract minimum 2 variables to plot
scatterplot(X~Y,data=name of file,xlab=" ",ylab=" ",main=" ")
res=lm(X~Y)
res=signif(residuals(res),5)
res (type res and press Enter to print the residuals rounded to 5 significant digits)
Pie chart
X=c()
Y=c()
pie(X,labels=Y)
Creating functions
# Arithmetic mean of x: the total of its elements divided by how many there are.
myfun <- function(x) {
  total <- sum(x)
  count <- length(x)
  total / count
}
d=c(5,10,15,20)
myfun(d)
Linear regression
plot(xaxis name,yaxis name,main="heading")
cor(x,y)
data=lm(yaxis~xaxis)
summary(data)
attributes(data)
data$coef
abline(data)
confint(data,level=0.95)  # level is any confidence level between 0 and 1, e.g. 0.95 for 95%
anova(data)
Checking linear regression
after continuing above steps
# Set the 2 x 2 plotting grid BEFORE calling plot(): par() only affects plots
# drawn after it, so the layout must be in place for the plots of `data`
# to share one page.
par(mfrow = c(2, 2))
plot(data)
Statistics
replace(x, list, values)
scrub(x, where, min, max, isvalue,newvalue)
x <- as.matrix()
x%in%y
all(x%in%y)
all(x)
max(x, na.rm=TRUE)
var(x, na.rm=TRUE)
sd(x, na.rm=TRUE)
mad(x, na.rm=TRUE)
fivenum(x, na.rm=TRUE)
table(x)
scale(data,scale=FALSE)
cumsum(x)
rev(x)
cor(x,y,use="pair")
aov(x~y,data=datafile)
aov.ex1 = aov(DV~IV,data=data.ex1)
aov.ex2 = aov(DV~IV1*IV2,data=data.ex2)
summary(aov.ex1)
print(model.tables(aov.ex1,"means"),digits=3)
boxplot(DV~IV,data=data.ex1)
lm(x~y,data=dataset)
t(X)
X %*% Y
solve(A)
solve(A,B)
Table
table(train$Survived)
prop.table(table(train$Survived))
table(<data_variable_1>, <data_variable_2>)
prop.table(table(train$Child,train$Survived),1)
tapply(variable1,var2,mean)
which.min/which.max
subset(filename,var1 >1000)
sd(variablename,na.rm=TRUE)
count()
# Split Train by Group: for each group number i, store the rows where
# Train$Group == i in a variable named result<i> (result1, result2, ...).
# `max` must hold the largest group number before this runs; it is not
# assigned in this snippet.
for (i in seq_len(max)) {
  file_name <- paste0("result", i)
  file_name1 <- subset(Train, Train$Group == i)
  assign(file_name, file_name1)
}

# Copy every result<i> object into a second variable named A<i>.
for (i in seq_len(max)) {
  filename <- paste0("A", i)
  # get() looks the object up by its name string.
  assign(filename, get(paste0("result", i)))
}
object=summary(filename)
write.csv(t(as.matrix(object)), file="name.csv")
colnames(data)[colnames(data)=="old_name"] <- "new_name"
paste0()
substr
Train$columnname=NULL
Used for
For input of a CSV file
extracting variable or whole column
plotting graph of x and y, joined with lines (type="l"); use type="p" for points
Frequency distribution
assigning values to z
pearson correlation
to fit linear regression
to check summary of linear regression
to see the names and class
to extract coefficient
to plot a line
to improve plotting
to create anova
very imp
Commands
# Parse the timestamp strings into date-time objects.
t.str <- strptime(Timeseriesmin$TimeSeries, "%Y-%m-%d %H:%M:%S")
# Time of day in seconds: hours * 3600 + minutes * 60 + seconds.
S.str <- as.numeric(format(t.str, "%H")) * 60 * 60 +
  as.numeric(format(t.str, "%M")) * 60 +
  as.numeric(format(t.str, "%S"))
# Time of day in hours: hours + minutes / 60 (seconds are not included).
h.str <- as.numeric(format(t.str, "%H")) +
  as.numeric(format(t.str, "%M")) / 60
as.Date(Train$DOB, "%d-%b-%Y")
data$Transaction_Year <- format(data$Transaction_Date, "%Y")
DateConvert = as.Date(strptime(mvt$Date, "%m/%d/%y %H:%M"))
Used for
Conversion into proper form
to convert time into secs
to convert time into hrs
to convert into date format
to extract year from date format
to extract date from timestamp
calculating age
Sequence no.
Name
Packages
Loading of dataset
Combining of dataset
Exploration of data
Data cleaning
Feature engineering
10
Model Building
11
Codes
library(data.table)
library(dplyr)
library(ggplot2)
library(randomForest)
library(caret)
library(dummies)
test$Loan_Status<- "N"
combi<- rbind(train , test)
str(train)
summary()
Exploring categorical variables
table()
Plotting
Timeseriesmin[complete.cases(Timeseriesmin), ]
Addition of new variable
# New column ls: row-wise sum of the ApplicantIncome and CoapplicantIncome columns.
combi$ls <- combi$ApplicantIncome + combi$CoapplicantIncome
Description
to replace NA values
to replace blank values
to replace with median
to replace with mean
onlinecode
onlinecode