qizi456258
2012-06-21 16:42
浏览 243
已采纳

java网页抓取问题

在这个网站中:http://wwwapps.ups.com/WebTracking/track?HTMLVersion=5.0&loc=zh_CN&Requester=UPSHome&WBPM_lid=homepage%2Fct1.html_pnl_trk&trackNums=H8947154378&track.x=%E8%BF%BD%E8%B8%AA

  需要抓取:页面中的运输进程的部分 该运输进程 查看源码为一个div层 (

) 点击层后,URL地址改变为:http://wwwapps.ups.com/WebTracking/detail 。因为抓取的信息需要第一个链接中的 H8947154378 参数,所以URL改变后就不知道怎么抓取了。

  通过普通抓取,只能抓取到层中的第一条数据和最后一条数据;用火狐和其他浏览器查看第一个页面的源码,也只有div中的第一条数据和最后一条。
  /**
   * Fetches a web page and returns its raw HTML content, truncated to
   * {@code maxLength} characters.
   *
   * @param strUrl         the URL to fetch
   * @param strPostRequest extra body data; when non-empty the request is sent
   *                       as a POST with this string as the body
   * @param maxLength      maximum number of characters to read; {@code <= 0}
   *                       means unlimited
   * @return the raw page content (trimmed), or an error message in Chinese
   *         when the fetch fails
   */
  public String getPageContent(String strUrl, String strPostRequest, int maxLength) {
    // Accumulates the page content as it is read character by character.
    StringBuffer buffer = new StringBuffer();
    System.setProperty("sun.net.client.defaultConnectTimeout", "5000");
    System.setProperty("sun.net.client.defaultReadTimeout", "5000");
    HttpURLConnection hConnect = null;
    BufferedReader rd = null;
    try {
      URL newUrl = new URL(strUrl);
      hConnect = (HttpURLConnection) newUrl.openConnection();
      // Extra data for a POST request.
      if (strPostRequest.length() > 0) {
        hConnect.setDoOutput(true);
        // Explicit UTF-8 instead of the platform default charset.
        OutputStreamWriter out = new OutputStreamWriter(hConnect.getOutputStream(), "UTF-8");
        try {
          out.write(strPostRequest);
          out.flush();
        } finally {
          out.close();
        }
      }
      // Read the response body.
      rd = new BufferedReader(new InputStreamReader(hConnect.getInputStream(), "utf-8"));
      int ch;
      for (int length = 0; (ch = rd.read()) > -1 && (maxLength <= 0 || length < maxLength); length++) {
        buffer.append((char) ch);
      }
      // BUG FIX: String is immutable — the original discarded the replaceAll()
      // result, so the tag/entity stripping never took effect. The entity
      // pattern also had stray literal slashes ("//&..."); "&[a-zA-Z]{1,10};"
      // matches HTML entities such as "&nbsp;".
      String s = buffer.toString()
          .replaceAll("&[a-zA-Z]{1,10};", "")
          .replaceAll("<[^>]*>", "");
      System.out.println(s);
      return buffer.toString().trim();
    } catch (Exception e) {
      // Log instead of swallowing silently, then keep the original contract
      // of returning an error string.
      e.printStackTrace();
      return "错误:读取网页失败!";
    } finally {
      // BUG FIX: the original leaked the reader and connection whenever an
      // exception was thrown before the close calls were reached.
      if (rd != null) {
        try {
          rd.close();
        } catch (IOException ignored) {
          // best effort — nothing useful to do on close failure
        }
      }
      if (hConnect != null) {
        hConnect.disconnect();
      }
    }
  }
  /**
   * Entry point: fetches the UPS tracking page (which carries the tracking
   * number H8947154378 as a query parameter) and then the detail page.
   *
   * NOTE(review): fetching url2 in a fresh object without carrying over the
   * session cookies from the first request will not reach the same detail
   * data — the tracking number lives in the server-side session. See the
   * accepted answer for a working flow.
   */
  public static void main(String[] args) {
    // Tracking page — the tracking number is in the query string.
    String url = "http://wwwapps.ups.com/WebTracking/track?HTMLVersion=5.0&loc=zh_CN&Requester=UPSHome&WBPM_lid=homepage%2Fct1.html_pnl_trk&trackNums=H8947154378&track.x=%E8%BF%BD%E8%B8%AA";

    // Detail page — reached after clicking the progress div; no parameters.
    String url2 = "http://wwwapps.ups.com/WebTracking/detail";

    Test trackFetcher = new Test();
    trackFetcher.getPageContent(url, "post", 100500);

    Test3 detailFetcher = new Test3();
    detailFetcher.getPageContent(url2, "post", 100500);

    System.out.print("已经执行!");
  }
  上面是我写的普通抓取办法
  想请教大家 是否有其他解决办法 没有公开的API接口

  • 点赞
  • 写回答
  • 关注问题
  • 收藏
  • 邀请回答

4条回答 默认 最新

  • wayne_ren 2012-06-22 11:15
    已采纳

    [color=blue][b]这里是使用HttpClient和nekohtml的完整实现,能够完整抓取出来运输进程一览:[/b][/color]

    [code="java"]
    /**
     * Scrapes the UPS tracking site using HttpClient and NekoHTML.
     *
     * Flow: (1) GET the tracking page and save it locally, (2) extract the
     * hidden form fields the site expects, (3) POST them to the detail page,
     * (4) parse the shipment-progress table into {@code DetailBean} rows.
     */
    public class UpsDetail {

        // Local file the tracking page HTML is saved to (test convenience).
        private static final String HTML_TACK_HTML = "html/tack.html";
        // Local file the detail page HTML is saved to.
        private static final String HTML_DETAIL_HTML = "html/detail.html";

        // Whitespace-stripping pattern; compiled once instead of on every
        // deleteSpace() call.
        private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s*|\t|\r|\n");

        // Tracking page URL — carries the tracking number as a parameter.
        private static String url1 = "http://wwwapps.ups.com/WebTracking/track?HTMLVersion=5.0&loc=zh_CN&Requester=UPSHome&WBPM_lid=homepage%2Fct1.html_pnl_trk&trackNums=H8947154378&track.x=%E8%BF%BD%E8%B8%AA";
        // Detail page URL — reached via POST with hidden fields from page 1.
        private static String url2 = "http://wwwapps.ups.com/WebTracking/detail";

        public static void main(String[] args) {

            try {

                // Fetch the tracking page HTML and save it.
                getHtml(url1, HTML_TACK_HTML, null);

                // Extract the hidden-field parameters required by the
                // detail-page request.
                Map<String, String> data = getHiddenValue(HTML_TACK_HTML);

                // Fetch the shipment-progress (detail) page HTML.
                getHtml(url2, HTML_DETAIL_HTML, data);

                // Parse the shipment progress rows.
                List<DetailBean> list = getDetailList(HTML_DETAIL_HTML);

                // Print the full shipment progress.
                DetailBean bean = null;
                System.out.println("地点" + "\t" + "日期" + "\t" + "当地时间" + "\t" + "处理");
                for (int i = 0; i < list.size(); i++) {
                    bean = list.get(i);
                    System.out.println(bean.getLocation() + "\t" + bean.getDate() + "\t" + bean.getTime() + "\t" + bean.getOperation());
                }

            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        /**
         * Parses the saved detail-page HTML and returns one DetailBean per
         * 4-cell table row (location, date, time, operation).
         */
        private static List<DetailBean> getDetailList(String html) throws Exception {
            List<DetailBean> list = new ArrayList<DetailBean>();

            DOMParser parser = new DOMParser();
            parser.parse(html);
            Node node = parser.getDocument();

            Node tb = XPathAPI.selectSingleNode(node, "//TABLE[@class='dataTable']");
            // NOTE(review): "//TR/TD" is evaluated from the document root, not
            // relative to tb — it works here because the progress table is the
            // only dataTable; use ".//TR/TD" to scope it to tb if that changes.
            NodeList tdlist = XPathAPI.selectNodeList(tb, "//TR/TD");

            // Cells come in groups of four per progress row.
            int line = 0;
            while (line < tdlist.getLength() / 4) {
                DetailBean bean = new DetailBean();

                bean.setLocation(deleteSpace(tdlist.item(line * 4 + 0).getTextContent()));
                bean.setDate(deleteSpace(tdlist.item(line * 4 + 1).getTextContent()));
                bean.setTime(deleteSpace(tdlist.item(line * 4 + 2).getTextContent()));
                bean.setOperation(deleteSpace(tdlist.item(line * 4 + 3).getTextContent()));

                line++;

                list.add(bean);
            }

            return list;
        }

        /**
         * Collects the hidden INPUT fields (session key, tracking number,
         * etc.) that the detail page requires as POST parameters.
         */
        private static Map<String, String> getHiddenValue(String html) throws Exception {
            Map<String, String> data = new HashMap<String, String>();

            // Names of the hidden fields we need, lower-cased for a
            // case-insensitive match.
            List<String> params = new ArrayList<String>();
            params.add("loc".toLowerCase());
            params.add("USER_HISTORY_LIST".toLowerCase());
            params.add("progressIsLoaded".toLowerCase());
            params.add("refresh_sii".toLowerCase());
            params.add("showSpPkgProg1".toLowerCase());
            params.add("datakey".toLowerCase());
            params.add("HIDDEN_FIELD_SESSION".toLowerCase());
            params.add("trackNums".toLowerCase());

            DOMParser parser = new DOMParser();
            parser.parse(html);
            Node node = parser.getDocument();

            NodeList nodeList = XPathAPI.selectNodeList(node, "//INPUT");
            for (int i = 0; i < nodeList.getLength(); i++) {
                Element e = (Element) nodeList.item(i);
                if ("hidden".equalsIgnoreCase(e.getAttribute("type"))
                        && params.contains(e.getAttribute("name").toLowerCase())) {
                    data.put(e.getAttribute("name"), e.getAttribute("value"));
                }
            }

            System.out.println("订单编号:" + data.get("trackNums"));
            return data;
        }

        /**
         * Fetches a URL and writes the response body to a local file.
         *
         * @param url      the URL to fetch
         * @param filename local file to save the HTML into
         * @param data     POST parameters; {@code null} means a plain GET
         */
        private static void getHtml(String url, String filename, Map<String, String> data) throws Exception {

            // Create an HTTP client.
            DefaultHttpClient client = new DefaultHttpClient();

            HttpResponse res = null;
            if (data == null) {
                // Plain GET request.
                HttpGet get = new HttpGet(url);
                res = client.execute(get);
            } else {

                // Follow 301/302 redirects even when the default strategy
                // declines (the site redirects the POST).
                client.setRedirectStrategy(new DefaultRedirectStrategy() {
                        public boolean isRedirected(HttpRequest request, HttpResponse response, HttpContext context)  {
                            boolean isRedirect = false;
                            try {
                                isRedirect = super.isRedirected(request, response, context);
                            } catch (ProtocolException e) {
                                e.printStackTrace();
                            }
                            if (!isRedirect) {
                                int responseCode = response.getStatusLine().getStatusCode();
                                if (responseCode == 301 || responseCode == 302) {
                                    return true;
                                }
                            }
                            return isRedirect;
                        }
                    });

                // Build the POST form entity.
                // FIX: typed entrySet loop instead of a raw Iterator over the
                // key set (removes the unchecked cast and a lookup per key).
                List<NameValuePair> formparams = new ArrayList<NameValuePair>();
                for (Map.Entry<String, String> entry : data.entrySet()) {
                    formparams.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
                }
                UrlEncodedFormEntity entity = new UrlEncodedFormEntity(formparams, "UTF-8");

                // Execute the POST request.
                HttpPost post = new HttpPost(url);
                post.setEntity(entity);
                res = client.execute(post);
            }

            // Full status line, e.g. "HTTP/1.1 200 OK".
            System.out.println(res.getStatusLine().toString());

            // Save the response body to a local file (kept for debugging;
            // the content could also be parsed directly).
            if (res.getEntity() != null) {
                String result = EntityUtils.toString(res.getEntity());
                createHtmlFile(filename, result);
            }

            // Drain the entity stream.
            EntityUtils.consume(res.getEntity());

            // Release the connection.
            client.getConnectionManager().shutdown();
        }

        /**
         * Writes the given HTML to a local file as UTF-8.
         */
        private static void createHtmlFile(String filename, String data) throws Exception {
            File file = new File(filename);
            OutputStream os = new FileOutputStream(file);
            // FIX: close in finally so the stream is not leaked when the
            // write throws.
            try {
                os.write(data.getBytes("UTF-8"));
            } finally {
                os.close();
            }
        }

        /**
         * Strips all whitespace (including tabs and line breaks) from a cell
         * value.
         */
        private static String deleteSpace(String in) {
            Matcher re = WHITESPACE_PATTERN.matcher(in);

            return re.replaceAll("");
        }

    }
    [/code]

    其中用到的DetailBean
    [code="java"]
    /**
     * Value object holding one row of the UPS shipment-progress table.
     * Plain mutable JavaBean: no-arg constructor plus getter/setter pairs.
     */
    public class DetailBean {

        private String location;   // where the package was handled (地点)
        private String date;       // calendar date of the event (日期)
        private String time;       // local time of the event (当地时间)
        private String operation;  // what was done with the package (处理)

        public String getLocation() {
            return this.location;
        }

        public void setLocation(String location) {
            this.location = location;
        }

        public String getDate() {
            return this.date;
        }

        public void setDate(String date) {
            this.date = date;
        }

        public String getTime() {
            return this.time;
        }

        public void setTime(String time) {
            this.time = time;
        }

        public String getOperation() {
            return this.operation;
        }

        public void setOperation(String operation) {
            this.operation = operation;
        }
    }
    [/code]

    点赞 评论
  • leixw0102 2012-06-21 17:05

    用htmlparser 或者jsoup过滤下网页,获取你想要的

    点赞 评论
  • Android面试专栏 2012-06-22 09:42

    去掉后面的参数不行吗?

    点赞 评论
  • iteye_878 2012-06-22 11:34

    注意运输进程点击后提交的是POST请求,
    虽然这个网址http://wwwapps.ups.com/WebTracking/detail后面没参数了,但他是POST请求,里面藏着2个cookie要传给网站。
    UPS_SHARED_SESSION:
    webappcommon.cclamp.usb.acceptsCookie
    我想H8947154378 参数已经通过session/cookie藏在里面了。所以你要在第一个网页先找到以上两个cookie参数。并通过POST方式提交给第二个网页。

    建议用firebug监控每次提交网页的动作(GET,POST)到底传哪些参数给网站,以及网站返回给你哪些信息(set-cookie)

    单用JDK搞定这类工作会很累,建议用一些第三方类包,比如httpclient抓网页,htmlparser解析html(建议前2个),或者用webharvest搞定(但这个是写xml,初学者会比较累)。可选用的开源包很多,比自己写来的方便多了。

    同时建议了解一下http协议,不然做这类东西会云里雾里,知其然不知其所以然。了解了http协议,你就知道为什么要这样GET,POST了,cookie,session的作用。

    点赞 评论

相关推荐 更多相似问题