微风载着阳光在流浪 2017-05-08 02:35 采纳率: 100%
浏览 1976
已采纳

神箭手爬虫新手问题:抓取不到数据(全部代码如下)

 // Entry page of the site; it is the only URL listed in scanUrls below.
 var url = "http://finance.ccb.com/cn/finance/product.html";

// Crawler configuration object; it is handed to `new Crawler(configs)` at the end of the script.
var configs = {
    enableProxy: true,
    enableJS : true,

    domains: ["finance.ccb.com"],
    scanUrls: [url],
    // The same pattern is registered as both a content URL and a helper URL.
    // NOTE(review): the pattern pins one specific jsoncallback value and leaves the dot in "ccb.com" unescaped.
    contentUrlRegexes:  [/http:\/\/finance\.ccb.com\/cc_webtran\/queryFinanceProdList\.gsp\?jsoncallback=jQuery1910025284838050604774_1494164835256/],
    helperUrlRegexes: [/http:\/\/finance\.ccb.com\/cc_webtran\/queryFinanceProdList\.gsp\?jsoncallback=jQuery1910025284838050604774_1494164835256/],

    // One repeated "items" field with two children ("name" and "money"), all selected with XPath strings.
    // NOTE(review): per the accepted answer on this page, the endpoint returns JSON/JSONP rather than a DOM,
    // so these XPath selectors would not match anything — confirm against the actual response.
    fields: [
        {
            name: "items",
            //selector: "//*[@id='list1']/table/tbody",
            // NOTE(review): the value compared against @id looks like a list of CSS classes — verify.
            selector : "//div[@id='pro_tab pro_tab1 clearfix']",
            repeated : true,
            children : [
                {
                    name: "name",
                    alias: "理财产品名称",
                    //selector: "//td[contains(@class,'list_title')]text()",
                    //selector:"//*[@id='list1']/table/tbody/tr[2]/td[1]/div/div/a",
                    // NOTE(review): the "/text()" step sits inside the predicate brackets, which is probably not intended.
                    selector : "//a[contains(@class,'AcqProductName AcqProductClick')/text()]",
                    required : true
                },
                {
                    name: "money",
                    alias: "起购金额",
                    //selector: "//*[@id='list1']/table/tbody/tr[2]/td[2]"
                    selector:"//td[@id='list_time']/text()"
                }
            ]
        }
    ]
};

// Scan-page callback: queues page 1 of the product-list endpoint as a POST request,
// logs "debug 1", and returns false.
// NOTE(review): returning false presumably disables automatic link discovery on this page — confirm in the platform docs.
configs.onProcessScanPage = function (page, content, site) {
    // The URL carries a hard-coded jsoncallback query parameter.
    var helperUrl = "http://finance.ccb.com/cc_webtran/queryFinanceProdList.gsp?jsoncallback=jQuery1910025284838050604774_1494164835256";
    var options = {
        method : "POST",
        data: {
                        "queryForm.provinceId": "110",
                        "queryForm.brand": "03",
                        pageNo: 1,
                        pageSize: 12
        }
    }
    site.addUrl(helperUrl, options);
    console.log("debug 1");
    return false;
};

// Helper-page callback: reads the page number of the request that produced this page and,
// while it is below totalPage, queues the next page of the same endpoint as a POST request.
// Always returns false.
configs.onProcessHelperPage = function (page, content, site) {
    // pageNo is read back from the POST data of the current request.
    var currentPage = page.request.data.pageNo;
    console.log("debug 2 currentPage="+parseInt(currentPage));
    //var totalPage = extract(content, "//span[contains(@class,'ebdp-pc4promote-pageturn-totalpage')]/b");
    // Assigned without `var`, so this creates/overwrites a global variable.
    totalPage =4; // temporary measure: the total page count still needs to be scraped

    totalPage = parseInt(totalPage);
    if(currentPage<totalPage){
        var helperUrl = "http://finance.ccb.com/cc_webtran/queryFinanceProdList.gsp?jsoncallback=jQuery1910025284838050604774_1494164835256";
        var options = {
            method : "POST",
            data: {
                        "queryForm.provinceId": "110",
                        "queryForm.brand": "03",
                        // NOTE(review): currentPage is not parsed here; if it is a string this concatenates instead of adding — verify.
                        pageNo: currentPage+1,
                        pageSize: 12
            },
            // NOTE(review): presumably keeps this request from being dropped as a duplicate of the same URL — confirm in the platform docs.
            reserve : true
        }
         site.addUrl(helperUrl, options);       
    }    
    return false;
};

// Content-page callback: does no work of its own and always returns false.
// NOTE(review): presumably this tells the crawler not to discover further links from content pages — confirm in the platform docs.
configs.onProcessContentPage = function (page, content, site) {
    return false;
};

// Post-extraction hook: for "items.money" and "items.duration" it removes <b> and </b> tags
// from the extracted value; every other field is returned unchanged.
// NOTE(review): no field named "duration" is configured in this script.
configs.afterExtractField = function (fieldName, data, page, site) {
    if(fieldName=="items.money" || fieldName=="items.duration"){
        // Calls .replace directly, so this throws when data is null/undefined or not a string.
        data = data.replace(/<\/?b>/g,"");
        return data;
    }
    return data;
};

// Build the crawler from the configuration (including the callbacks attached above) and start it.
var crawler = new Crawler(configs);
crawler.start();

  • 写回答

1条回答 默认 最新

  • Go 旅城通票 2017-05-08 07:21
    关注

    你内容地址是contentUrlRegexes: [/http:\/\/finance.ccb.com\/cc_webtran\/queryFinanceProdList.gsp\?jsoncallback=jQuery1910025284838050604774_1494164835256/],这个页面返回的内容是JSON,不是dom对象,selector应该配置JsonPath,selectorType: SelectorType.JsonPath

    而且需要去掉?jsoncallback=......这个参数,这样返回的才是json数据,否则返回的是jsonp,而这个接口没有jsonp的处理方法。改成下面的代码就行了: var url = "http://finance.ccb.com/cn/finance/product.html";

    图片说明

     // Crawler configuration for the finance.ccb.com product list.
     // `url` (the scan page) must already be defined before this statement runs.
     var configs = {
        enableProxy: true,
        enableJS : true,

        domains: ["finance.ccb.com"],
        scanUrls: [url],

        // The JSONP callback parameter is deliberately left out of the pattern so the
        // endpoint is requested without it. The same pattern is registered as both a
        // content URL and a helper URL. Every dot in the host name is escaped so that
        // it only matches a literal ".".
        contentUrlRegexes:  [/http:\/\/finance\.ccb\.com\/cc_webtran\/queryFinanceProdList\.gsp/],
        helperUrlRegexes: [/http:\/\/finance\.ccb\.com\/cc_webtran\/queryFinanceProdList\.gsp/],

        fields: [
            {
                // Repeated field selected with JsonPath: one item per entry of ProdList.
                name: "items",
                selectorType: SelectorType.JsonPath,
                selector : "$.ProdList[*]",
                repeated : true,
                // NOTE(review): the two child selectors use different relative forms
                // (".name" vs "purFloorAmt"); confirm which form the platform expects.
                children : [
                    {
                        name: "name",
                        alias: "理财产品名称",
                        selector : ".name",
                        required : true,
                        selectorType: SelectorType.JsonPath
                    },
                    {
                        name: "money",
                        alias: "起购金额",
                        selector: "purFloorAmt",
                        selectorType: SelectorType.JsonPath
                    }
                ]
            }
        ]
    };
    
    
    
    // Scan-page callback: enqueues page 1 of the product-list endpoint (requested
    // without the JSONP callback parameter) as a POST request, logs "debug 1",
    // and returns false.
    // NOTE(review): returning false presumably disables automatic link discovery
    // on the scan page — confirm in the platform docs.
    configs.onProcessScanPage = function (page, content, site) {
        var firstPageRequest = {
            method: "POST",
            data: {
                "queryForm.provinceId": "110",
                "queryForm.brand": "03",
                pageNo: 1,
                pageSize: 12
            }
        };

        site.addUrl(
            "http://finance.ccb.com/cc_webtran/queryFinanceProdList.gsp",
            firstPageRequest
        );
        console.log("debug 1");

        return false;
    };
    
    // Helper-page callback: reads the page number of the request that produced
    // this page and, while it is below the total page count, enqueues the next
    // page of the same endpoint as a POST request. Always returns false.
    configs.onProcessHelperPage = function (page, content, site) {
        // Parse once with an explicit radix so that a string pageNo (e.g. "1")
        // is compared and incremented numerically instead of being concatenated.
        var currentPage = parseInt(page.request.data.pageNo, 10);
        console.log("debug 2 currentPage=" + currentPage);

        //var totalPage = extract(content, "//span[contains(@class,'ebdp-pc4promote-pageturn-totalpage')]/b");
        // Temporary measure: the total page count is hard-coded.
        // TODO: read the real page count from the response instead.
        // Declared with `var` so it no longer leaks into the global scope.
        var totalPage = 4;

        // A NaN page number fails this comparison, so nothing is enqueued for it.
        if (currentPage < totalPage) {
            var helperUrl = "http://finance.ccb.com/cc_webtran/queryFinanceProdList.gsp"; // JSONP callback parameter removed on purpose
            var options = {
                method : "POST",
                data: {
                    "queryForm.provinceId": "110",
                    "queryForm.brand": "03",
                    pageNo: currentPage + 1,
                    pageSize: 12
                },
                // NOTE(review): presumably keeps this request from being dropped as a
                // duplicate of the same URL — confirm in the platform docs.
                reserve : true
            };
            site.addUrl(helperUrl, options);
        }
        return false;
    };
    
    // Content-page callback: performs no processing and unconditionally reports false.
    // NOTE(review): presumably this tells the crawler not to discover further
    // links from content pages — confirm in the platform docs.
    configs.onProcessContentPage = function (page, content, site) {
        var result = false;
        return result;
    };
    
    // Post-extraction hook: for "items.money" and "items.duration" it returns the
    // extracted value as a string with any <b> / </b> tags removed; every other
    // field is returned unchanged.
    //
    // A missing value (null, undefined or "") yields the placeholder "NaN".
    // Non-string values such as a numeric amount coming from JSON are converted
    // with String() first, so the hook does not throw on them and a numeric 0 is
    // kept as "0" instead of being replaced by the placeholder.
    configs.afterExtractField = function (fieldName, data, page, site) {
        if (fieldName !== "items.money" && fieldName !== "items.duration") {
            return data;
        }
        if (data === null || data === undefined || data === "") {
            return "NaN";
        }
        return String(data).replace(/<\/?b>/g, "");
    };
    
    // Build the crawler from the configuration (including the callbacks attached above) and start it.
    var crawler = new Crawler(configs);
    crawler.start();
    
    本回答被题主选为最佳回答 , 对您是否有帮助呢?
    评论

报告相同问题?

悬赏问题

  • ¥15 执行 virtuoso 命令后,界面没有,cadence 启动不起来
  • ¥50 comfyui下连接animatediff节点生成视频质量非常差的原因
  • ¥20 有关区间dp的问题求解
  • ¥15 多电路系统共用电源的串扰问题
  • ¥15 slam rangenet++配置
  • ¥15 有没有研究水声通信方面的帮我改俩matlab代码
  • ¥15 ubuntu子系统密码忘记
  • ¥15 保护模式-系统加载-段寄存器
  • ¥15 电脑桌面设定一个区域禁止鼠标操作
  • ¥15 求NPF226060磁芯的详细资料