// Read the input corpus from the local file system; every line is handled as one sentence.
val lines = sc.textFile("file:///D:/data/solr.txt")
val hashingTF = new mllib.feature.HashingTF()

// The stop-word setup does not depend on the sentence being processed, so it is done
// once here rather than inside the loop (the original re-read the dictionary from HDFS
// and re-registered the filters for every input line).
// Arrays.asList converts the collected Array[String] to a java.util.List explicitly,
// so no implicit Scala-to-Java collection conversion is required.
FilterModifWord.insertStopWords(
  java.util.Arrays.asList(sc.textFile("hdfs:/svm/stopword.dic").collect(): _*))
// (3) Remove stop words by part of speech; "w" is the nature tag for punctuation.
FilterModifWord.insertStopNatures("w", null)

// Segment each line on the driver (the RDD is collected first), record every produced
// term in termMap under its HashingTF index, and wrap the terms in a RawDataRecord.
val sentences = lines.collect().map { sents =>
  // Every record gets the same constant label.
  val label = "1"
  // Strip tab characters before segmentation.
  val sentence = sents.replaceAll("\t", "")
  println(sentence)
  // NOTE(review): this is the call that throws the reported
  // ArrayIndexOutOfBoundsException (raised inside ansj's Analysis.analysisStr).
  // The cause is not visible from this code — TODO confirm that the ansj_seg version
  // and its companion library/dictionary files are compatible.
  val temp = ToAnalysis.parse(sentence)
  // Apply the stop-word / stop-nature filters registered above.
  val filter = FilterModifWord.modifResult(temp)
  val message = (0 until filter.size()).map(i => filter.get(i).getName).toArray
  // Side effect only: remember which term maps to which hashed feature index.
  message.foreach { word =>
    termMap.put(hashingTF.indexOf(word), word)
  }
  RawDataRecord(label, message)
}
16/12/17 17:30:45 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 63 ms on localhost (1/1)
16/12/17 17:30:45 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
16/12/17 17:30:45 INFO DAGScheduler: ResultStage 0 (collect at seg_local.scala:33) finished in 0.102 s
16/12/17 17:30:45 INFO DAGScheduler: Job 0 finished: collect at seg_local.scala:33, took 0.146047 s
目前的分词器大部分都是单机服务器进行分词,或者使用hadoop mapreduce对存储在hdfs中大量的数据文本进行分词。由于mapreduce的速度较慢,相对spark来说代码书写较繁琐。
16/12/17 17:30:45 INFO BlockManagerInfo: Removed broadcast_1_piece0 on 172.16.110.10:49409 in memory (size: 1850.0 B, free: 1992.9 MB)
16/12/17 17:30:46 INFO DICLOG: init user userLibrary ok path is : D:\Intellij\tsf_lda\library\default.dic
16/12/17 17:30:46 INFO DICLOG: init ambiguityLibrary ok!
16/12/17 17:30:46 INFO DICLOG: init core library ok use time :304
Exception in thread "main" java.lang.ArrayIndexOutOfBoundsException: 3
at org.ansj.splitWord.Analysis.analysisStr(Analysis.java:115)
at org.ansj.splitWord.Analysis.parseStr(Analysis.java:222)
at org.ansj.splitWord.analysis.ToAnalysis.parse(ToAnalysis.java:103)
at tsf_lda.seg_local$$anonfun$1.apply(seg_local.scala:38)
at tsf_lda.seg_local$$anonfun$1.apply(seg_local.scala:33)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
at tsf_lda.seg_local$.main(seg_local.scala:33)
at tsf_lda.seg_local.main(seg_local.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:483)
at com.intellij.rt.execution.application.AppMain.main(AppMain.java:147)