在做中文文本聚类时,用 tm 包把数据转换为文档-词矩阵之后,词项全部变成了乱码。应该是 Rwordseg 分词之后的文本编码格式不对,可是不知道该怎么改,求助。
# Load the song data from the Excel workbook.
song <- read.xlsx("C:\\Users\\silencewille\\Desktop\\lunwen\\data\\julei.xlsx")

# Segment each lyric with Rwordseg, one call per element of song$lyric.
# seq_along() is used instead of 1:length(x) so that an empty column yields
# an empty list rather than iterating over c(1, 0).
# NOTE(review): nature = TRUE presumably attaches part-of-speech tags to the
# returned words -- confirm against the Rwordseg documentation.
lyric_words <- lapply(
  seq_along(song$lyric),
  function(i) segmentCN(song$lyric[i], nature = TRUE)
)

# Build the corpus from the segmented lyrics. This must reference
# `lyric_words` (the object created above); the previous name `lyric_word`
# was never defined in this script.
wordcorpus <- Corpus(VectorSource(lyric_words))

# Build the document-term matrix.
# NOTE(review): the locale is switched only here, after segmentation and
# corpus construction; if the garbled terms persist, try moving this call to
# the top of the script -- TODO confirm.
Sys.setlocale(locale = "Chinese")
dtm1 <- DocumentTermMatrix(
  wordcorpus,
  control = list(
    wordLengths = c(1, Inf),
    bounds = list(global = c(5, Inf)),
    removeNumbers = TRUE,
    weighting = weightTf,
    # NOTE(review): `encoding` does not appear among the control options
    # documented for tm's termFreq()/DocumentTermMatrix(), so it may have no
    # effect on the output encoding -- verify.
    encoding = "UTF-8"
  )
)