public final class JavaWordCount {
    // Token separator: one-or-more whitespace or punctuation characters.
    // Compiled once and reused (Pattern compilation is expensive).
    private static final Pattern SPACE = Pattern.compile("(\\s|\\p{Punct})+");

    /**
     * Counts word occurrences in the given input path using Spark.
     *
     * <p>Usage: {@code JavaWordCount [inputPath] [wordFilter]}
     * <ul>
     *   <li>{@code inputPath}  — file or directory to read; defaults to
     *       {@code D:/logs/} when omitted (preserves the original behavior).</li>
     *   <li>{@code wordFilter} — optional; when supplied, only occurrences of
     *       exactly this word are counted (e.g. pass {@code 2017} to count
     *       only how many times "2017" appears).</li>
     * </ul>
     *
     * @param args command-line arguments as described above
     * @throws Exception if the Spark job fails
     */
    public static void main(String[] args) throws Exception {
        // Point Spark at a local Hadoop layout (winutils) for Windows runs.
        System.setProperty("hadoop.home.dir",
                "F:\\TDDownload\\spark-2.1.0-bin-hadoop2.7\\spark-2.1.0-bin-hadoop2.7");

        // Use the caller's arguments when present instead of unconditionally
        // overwriting them (the original clobbered args with a hard-coded path).
        final String inputPath = args.length > 0 ? args[0] : "D:/logs/";
        // Optional: restrict counting to one exact word.
        final String wordFilter = args.length > 1 ? args[1] : null;

        SparkSession spark = SparkSession.builder().appName("JavaWordCount").getOrCreate();
        try {
            JavaRDD<String> lines = spark.read().textFile(inputPath).javaRDD();

            // Java 8 lambdas replace the verbose anonymous inner classes.
            JavaRDD<String> words =
                    lines.flatMap((String s) -> Arrays.asList(SPACE.split(s)).iterator());

            // Filter BEFORE the shuffle so only matching records are counted
            // and moved across the network — this is how you count a single
            // word (e.g. "2017") without tallying everything else.
            if (wordFilter != null) {
                words = words.filter(wordFilter::equals);
            }

            JavaPairRDD<String, Integer> counts =
                    words.mapToPair((String s) -> new Tuple2<>(s, 1))
                         .reduceByKey((Integer a, Integer b) -> a + b);

            List<Tuple2<String, Integer>> output = counts.collect();
            for (Tuple2<String, Integer> tuple : output) {
                System.out.println(tuple._1() + ": " + tuple._2());
            }
        } finally {
            // Always release the SparkSession, even when the job throws.
            spark.stop();
        }
    }
}
Spark 可以只统计某一个指定的词，而不把所有的词都统计一遍吗？
我想知道日志文件中 "2017" 这个词出现了多少次，不需要其他词的统计结果——这个需求在 Spark 中可以实现吗？（可以：在计数之前用 filter 只保留等于 "2017" 的单词即可。）