前2天刚刚拿到了这本书,今天有点空闲时间来玩玩代码。
第一个例子:用来将一个大txt文本分割成若干小文本,并进行全角与半角的转码。
跑了一下,虽然简单还是遇到了很多问题。
代码如下:
package com.mask.demo_A; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.HashMap; public class FilePreprocess { public static void preprocess(File file, String outputDir) { try { splitToSmallFiles(charactorProcess(file, outputDir + "output.all"), outputDir); } catch (Exception e) { e.printStackTrace(); } } /** * 对文件中的字符进行全角/半角转换 * * @param file * 原始文件 * @param destFile * 目标地址 * @return */ public static File charactorProcess(File file, String destFile) { try { BufferedWriter writer = new BufferedWriter(new FileWriter(destFile)); BufferedReader reader = new BufferedReader(new FileReader(file)); String line = reader.readLine(); while (line != null) { String newLine = replace(line); writer.write(newLine); writer.newLine(); line = reader.readLine(); } reader.close(); writer.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return new File(destFile); } /** * 拆分小文件 * * @param file * 源文件 * @param outputPath * 目标文件地址 * @throws IOException */ public static void splitToSmallFiles(File file, String outputPath) throws IOException { int filePointer = 0; int MAX_SIZE = 10240; BufferedWriter writer = null; BufferedReader reader = new BufferedReader(new FileReader(file)); StringBuffer buffer = new StringBuffer(); String line = reader.readLine(); while (line != null) { buffer.append(line).append("\r\n"); if (buffer.toString().getBytes().length >= MAX_SIZE) { writer = new BufferedWriter(new FileWriter(outputPath + "output" + filePointer + ".txt")); writer.write(buffer.toString()); writer.close(); filePointer++; buffer = new StringBuffer(); } line = reader.readLine(); } writer = new BufferedWriter(new FileWriter(outputPath + "output" + filePointer + ".txt")); writer.write(buffer.toString()); writer.close(); } /** * 全角/半角转换 * * @return */ @SuppressWarnings( { "unchecked", "unused" }) private static String replace(String line) { System.out.println("old line = " + line); HashMap map = new HashMap(); map.put(",", ","); map.put("。", "."); map.put("《", "<"); map.put("》", ">"); map.put("‖", "|"); map.put("〈", "<"); map.put("〉", ">"); map.put("〔", "["); map.put("〕", "]"); map.put("?", "?"); map.put("“", "\""); map.put("”", "\""); map.put(":", ":"); map.put("、", ","); map.put("(", "("); map.put(")", ")"); map.put("【", "["); map.put("】", "]"); map.put("—", "-"); map.put("~", "~"); map.put("!", "!"); map.put("‘", "'"); map.put("①", "1"); map.put("②", "2"); map.put("③", "3"); map.put("④", "4"); map.put("⑤", "5"); map.put("⑥", "6"); map.put("⑦", "7"); map.put("⑧", "8"); map.put("⑨", "9"); for (int i = 0; i < line.length(); i++) { String charAt = line.substring(i, i + 1); if (map.get(charAt) != null) { line.replace(charAt, (String) map.get(charAt)); } } System.out.println("new line = " + line); return line; } @SuppressWarnings("static-access") public static void main(String[] args) throws Exception { String inputFile = "f:\\book.txt"; String outputDir = "f:\\temp\\"; if (!new File(outputDir).exists()) { new File(outputDir).mkdir(); } FilePreprocess process = new FilePreprocess(); process.preprocess(new File(inputFile), outputDir); } }
遇到的第一个问题就是发现分隔出来的小文件全部都是代码。
查了一下问题 好像是因为下载的文件编码不是UTF-8,而我是以此方式运行的程序。于是使用editplus将文本另存为utf-8的。再次运行 OK 分隔出来的是正常的中文。
后来又发现 字符并没有转换成功,全角依然还在。调试半天~~~无效。郁闷啊!囧!
这里希望哪位大侠看见了,还请帮个忙!