# Professional Documents / Culture Documents (source-site navigation text)
# Text-mining classification of book reviews: tm preprocessing + caret/RWeka/rminer models
# Install required packages (run once; package names must be quoted strings).
install.packages("tm")
install.packages("wordcloud")
install.packages("RWeka")
install.packages("SnowballC")
install.packages("caret")
install.packages("rminer")
install.packages("kernlab")
install.packages("rpart")

# Load the libraries for this session.
library(tm)         # Corpus, tm_map, DocumentTermMatrix
library(wordcloud)  # word-cloud plot
library(RWeka)      # Weka classifiers (NB, MLP, J48, IBk)
library(SnowballC)  # stemming backend for tm
library(caret)      # createDataPartition, confusionMatrix
library(rminer)     # mmetric
library(kernlab)    # ksvm
library(rpart)
# Point R at the folder containing BOOK1.csv.
# NOTE(review): the path was blank in the original transcript — fill in before running.
setwd(".")  # TODO: replace "." with the actual data directory

# Read the labelled reviews; keep the review text as character
# (the original "stringFactors" is not an argument — it is stringsAsFactors).
book_reviews <- read.csv("BOOK1.csv", header = TRUE, stringsAsFactors = FALSE)
str(book_reviews)

# The class label must be a factor for the classifiers below.
book_reviews$Label <- factor(book_reviews$Label)
summary(book_reviews)
nrow(book_reviews)

# Class distribution of the full data set.
prop.table(table(book_reviews$Label))
# Reproducible stratified split: 50% train, and the remaining half split
# into two equal test sets (test1, test2).
set.seed(100)
inTrain <- createDataPartition(y = book_reviews$Label, p = 0.50, list = FALSE)
# Original read "book_reviews[inTrain]" — without the comma that selects COLUMNS,
# not rows; row indexing needs the trailing comma.
train_m  <- book_reviews[inTrain, ]
testdata <- book_reviews[-inTrain, ]

# Original drew these indices from book_reviews but applied them to testdata
# (half the rows), producing out-of-range selections; partition testdata itself.
inTest  <- createDataPartition(y = testdata$Label, p = 0.50, list = FALSE)
test1_m <- testdata[inTest, ]
test2_m <- testdata[-inTest, ]

nrow(train_m)
summary(train_m)
nrow(test1_m)
summary(test1_m)
nrow(test2_m)
summary(test2_m)

# Confirm class proportions are preserved across all three splits.
prop.table(table(train_m$Label))
prop.table(table(test1_m$Label))
prop.table(table(test2_m$Label))
# Build a corpus from the training reviews: one document per review.
train_corpus_m <- Corpus(VectorSource(train_m$review))
length(train_corpus_m)
?Corpus
?VectorSource

# Equivalent construction via DataframeSource (overwrites the corpus above).
train_corpus_m <- Corpus(DataframeSource(as.matrix(train_m$review)))
?DataframeSource
length(train_corpus_m)
train_m$review[1]

# Step 1: lower-case every document.
Step1 <- tm_map(train_corpus_m, tolower)
Step1[1]
Step1[[1]]            # transcript read "Step[[1]]" — typo for Step1
Step1[2:3]
inspect(Step1[2:3])   # transcript misspelled "insepct"
inspect(head(Step1, 3))
# t <- tolower(TEXT)  # NOTE(review): TEXT is undefined in this file — illustrative only
# Step 2: strip digits.
Step2 <- tm_map(Step1, removeNumbers)
Step2[[1]]
inspect(Step2[1:2])        # "insepct" typo fixed
inspect(head(Step2, 3))

# Step 3: remove stop words. The two calls are equivalent:
# stopwords() defaults to the English list.
Step3 <- tm_map(Step2, removeWords, stopwords("english"))
Step3 <- tm_map(Step2, removeWords, stopwords())
Step3[[1]]

# Custom stop-word removal. The transcript used bare identifiers
# (two, movie, film, films) — removeWords expects character vectors.
Step3a <- tm_map(Step3, removeWords, "two")  # transcript assigned Step3 but printed Step3a
Step3a[[1]]
Step3b <- tm_map(Step3, removeWords, c("two", "movie", "film", "films"))
Step3b[[1]]
# NOTE(review): mystopwords was undefined in the transcript; defined here to match Step3b.
mystopwords <- c("two", "movie", "film", "films")
Step3c <- tm_map(Step3, removeWords, mystopwords)
Step3c[[1]]

# Step 4: drop punctuation; Step 5: collapse repeated whitespace.
Step4 <- tm_map(Step3c, removePunctuation)
Step4[[1]]
Step5 <- tm_map(Step4, stripWhitespace)
Step5[[1]]
# Small stemming demo on four toy sentences. The transcript lacked c(), quotes,
# a closing paren, and mixed the Texts/texts casing — normalized to `texts`.
texts <- c("I am member of the XYZ association",
           "apply for our open associate position",
           "xyz memorial lecture takes place on wednesday",
           "vote for the most popular lecturer")
corpus <- Corpus(DataframeSource(data.frame(texts)))
corpus.temp <- tm_map(corpus, stemDocument, language = "english")
corpus.temp[[2]][1]   # "associate" stems to "associ"

# Step 6: stem the cleaned training corpus.
Step6 <- tm_map(Step5, stemDocument, language = "english")
Step6[[1]]
# Document-term matrix from the fully cleaned, stemmed corpus.
train_dtm_m <- DocumentTermMatrix(Step6)
dim(train_dtm_m)

# Equivalent one-shot: preprocess inside DocumentTermMatrix.
# The transcript's "stringWhitespace" is not a tm control option — it is stripWhitespace.
train_dtm_m <- DocumentTermMatrix(
  train_corpus_m,
  control = list(removeNumbers = TRUE, removePunctuation = TRUE,
                 stripWhitespace = TRUE, tolower = TRUE,
                 stopwords = TRUE, stemming = TRUE)
)
dim(train_dtm_m)

# Keep only terms appearing in at least 20% of documents (sparsity <= 0.80).
train_rmspa_m <- removeSparseTerms(train_dtm_m, 0.80)
?removeSparseTerms
dim(train_rmspa_m)

# Mean term frequency, sorted descending ("train_rmsapa_m" was a typo).
mean_train <- sort(colMeans(as.matrix(train_rmspa_m)), decreasing = TRUE)
mean_train[1:20]
average_top20 <- mean(mean_train[1:20])
average_top20
barplot(mean_train[1:20], border = NA, las = 3,
        xlab = "top 20 words", ylab = "Frequency", ylim = c(0, 3))
# Same pipeline on a second corpus.
# NOTE(review): train_corpus_m2 is never created in this file — presumably a second
# preprocessing variant; define it before running this section.
train_dtm_m2 <- DocumentTermMatrix(
  train_corpus_m2,
  control = list(removeNumbers = TRUE, removePunctuation = TRUE,
                 stripWhitespace = TRUE, tolower = TRUE,
                 stopwords = TRUE, stemming = TRUE)
)
dim(train_dtm_m2)
train_rmspa_m2 <- removeSparseTerms(train_dtm_m2, 0.80)
dim(train_rmspa_m2)

# "train_rmsapa_m2" in the transcript was a typo for train_rmspa_m2.
mean_train_m2 <- sort(colMeans(as.matrix(train_rmspa_m2)), decreasing = TRUE)
mean_train_m2
average_top20_m2 <- mean(mean_train_m2[1:20])
average_top20_m2
barplot(mean_train_m2[1:20], border = NA, las = 3,
        xlab = "top 20 words", ylab = "Frequency", ylim = c(0, 3))

# Word cloud of the 30 most frequent stems.
# "brewer,pal(8,Dark2)" was a transcription of RColorBrewer's brewer.pal(8, "Dark2").
wordcloud(names(mean_train_m2[1:30]), mean_train_m2[1:30],
          scale = c(5, 1), colors = brewer.pal(8, "Dark2"))
?wordcloud
# Bag-of-words feature matrix for training; response y = review label.
# The transcript mixed "Train_BoWfreq"/"train_data_m"/"Summary" casings —
# R is case-sensitive, so names are normalized to lower case here.
train_BoWfreq <- as.matrix(train_rmspa_m)
train_data_m <- data.frame(y = train_m$Label, x = train_BoWfreq)
summary(train_data_m)
str(train_data_m)

# Training vocabulary (dictionary) from the pruned DTM; test sets are
# restricted to these terms so feature columns line up with training.
train_BoW_m <- findFreqTerms(train_rmspa_m)
length(train_BoW_m)

# Test-set 1 features built with the training dictionary.
test1_corpus_m <- Corpus(DataframeSource(as.matrix(test1_m$review)))
BoW_test1_m <- DocumentTermMatrix(
  test1_corpus_m,
  control = list(removeNumbers = TRUE, removePunctuation = TRUE,
                 stripWhitespace = TRUE, tolower = TRUE,
                 stopwords = TRUE, stemming = TRUE,
                 dictionary = train_BoW_m)
)
str(BoW_test1_m)
dim(BoW_test1_m)
test1_BoWfreq_m <- as.matrix(BoW_test1_m)
test1_data_m <- data.frame(y = test1_m$Label, x = test1_BoWfreq_m)
str(test1_data_m)
summary(test1_data_m)
# Conditional-inference tree (party package).
library(party)
BoW_ctree_m <- ctree(y ~ ., data = train_data_m)
summary(BoW_ctree_m)
plot(BoW_ctree_m)
plot(BoW_ctree_m, type = "simple")

# Evaluate on test set 1. "Positive" must match a level of the Label factor.
test1Pred <- predict(BoW_ctree_m, newdata = test1_data_m)
confusionMatrix(test1Pred, test1_data_m[, 1], positive = "Positive",
                dnn = c("Prediction", "True"))
# rminer's metric function is mmetric ("nmetric" was a typo) and takes the
# true labels first, predictions second.
mmetric(test1_data_m[, 1], test1Pred, c("ACC", "TPR", "PRECISION", "F1"))
# Weka classifiers via RWeka. make_Weka_classifier requires the Weka class
# name — the transcript left the argument blank.
library(RWeka)
NB <- make_Weka_classifier("weka/classifiers/bayes/NaiveBayes")
BoW_NB_m <- NB(y ~ ., data = train_data_m)

# SVM via kernlab ("Library" is not a function — library() is case-sensitive).
library(kernlab)
# NOTE(review): BoW_ksvm_m is never trained in the visible code — e.g.
# BoW_ksvm_m <- ksvm(y ~ ., data = train_data_m) must run before predicting.
test1Pred <- predict(BoW_ksvm_m, newdata = test1_data_m)
confusionMatrix(test1Pred, test1_data_m[, 1], positive = "Positive",
                dnn = c("Prediction", "True"))
mmetric(test1_data_m[, 1], test1Pred, c("ACC", "TPR", "PRECISION", "F1"))

# Multi-layer perceptron via Weka.
library(RWeka)
MLP <- make_Weka_classifier("weka/classifiers/functions/MultilayerPerceptron")
# NOTE(review): BoW_MLP_m is never trained in the visible code — e.g.
# BoW_MLP_m <- MLP(y ~ ., data = train_data_m).
test1Pred <- predict(BoW_MLP_m, newdata = test1_data_m)
confusionMatrix(test1Pred, test1_data_m[, 1], positive = "Positive",
                dnn = c("Prediction", "True"))
mmetric(test1_data_m[, 1], test1Pred, c("ACC", "TPR", "PRECISION", "F1"))

# Weka's own evaluation on held-out data (numFolds = 0 -> no cross-validation).
# NOTE(review): BoW_IBk_m (k-NN) is also never trained in the visible code.
evaluate_Weka_classifier(BoW_IBk_m, test1_data_m, numFolds = 0,
                         complexity = FALSE, seed = 1, class = TRUE)
# Test-set 2 features, restricted to the training dictionary.
test2_corpus_m <- Corpus(DataframeSource(as.matrix(test2_m$review)))
BoW_test2_m <- DocumentTermMatrix(
  test2_corpus_m,
  control = list(removeNumbers = TRUE, removePunctuation = TRUE,
                 stripWhitespace = TRUE, tolower = TRUE,
                 stopwords = TRUE, stemming = TRUE,
                 dictionary = train_BoW_m)
)
dim(BoW_test2_m)
test2_BoWfreq_m <- as.matrix(BoW_test2_m)   # transcript's "tBoW_test2_m" was a typo
summary(test2_BoWfreq_m)
test2_data_m <- data.frame(y = test2_m$Label, x = test2_BoWfreq_m)

# Helper: predict on test2 and print confusion matrix + rminer metrics
# (replaces seven identical copy-pasted blocks; "nmetric" -> mmetric,
# with true labels first, predictions second).
evaluate_on_test2 <- function(model) {
  pred <- predict(model, newdata = test2_data_m)
  print(confusionMatrix(pred, test2_data_m[, 1], positive = "Positive",
                        dnn = c("Prediction", "True")))
  print(mmetric(test2_data_m[, 1], pred, c("ACC", "TPR", "PRECISION", "F1")))
  invisible(pred)
}

# Evaluate every model on test set 2.
# NOTE(review): BoW_J48_m, BoW_C50_m, BoW_ksvm_m, BoW_MLP_m and BoW_IBk_m are
# never trained in the visible code — they must exist before this section runs.
test2Pred <- evaluate_on_test2(BoW_ctree_m)
test2Pred <- evaluate_on_test2(BoW_J48_m)
evaluate_Weka_classifier(BoW_J48_m, test2_data_m, numFolds = 0,
                         complexity = FALSE, seed = 1, class = TRUE)
test2Pred <- evaluate_on_test2(BoW_C50_m)
test2Pred <- evaluate_on_test2(BoW_NB_m)
test2Pred <- evaluate_on_test2(BoW_ksvm_m)
test2Pred <- evaluate_on_test2(BoW_MLP_m)
test2Pred <- evaluate_on_test2(BoW_IBk_m)
evaluate_Weka_classifier(BoW_IBk_m, test2_data_m, numFolds = 0,
                         complexity = FALSE, seed = 1, class = TRUE)