// Product listing page; it is the only entry in scanUrls.
var url = "http://finance.ccb.com/cn/finance/product.html";

// Crawler configuration: which URLs are treated as content/helper pages and
// which fields are extracted from them.
var configs = {
    enableProxy: true,
    enableJS: true,
    domains: ["finance.ccb.com"],
    scanUrls: [url],
    // Both patterns match the product-list endpoint. Every literal dot is
    // escaped so that "." cannot stand in for an arbitrary character.
    contentUrlRegexes: [/http:\/\/finance\.ccb\.com\/cc_webtran\/queryFinanceProdList\.gsp\?jsoncallback=jQuery1910025284838050604774_1494164835256/],
    helperUrlRegexes: [/http:\/\/finance\.ccb\.com\/cc_webtran\/queryFinanceProdList\.gsp\?jsoncallback=jQuery1910025284838050604774_1494164835256/],
    fields: [
        {
            name: "items",
            // NOTE(review): the value looks like a class list rather than an id
            // (it contains spaces) - confirm against the page markup.
            // NOTE(review): these are XPath selectors; they can only match if
            // the content URL returns HTML - confirm the response format.
            selector: "//div[@id='pro_tab pro_tab1 clearfix']",
            repeated: true,
            children: [
                {
                    name: "name",
                    // alias text: "wealth-management product name"
                    alias: "理财产品名称",
                    // The predicate is closed before the /text() step; with
                    // /text() inside the brackets the expression applies a
                    // path step to the boolean returned by contains().
                    selector: "//a[contains(@class,'AcqProductName AcqProductClick')]/text()",
                    required: true
                },
                {
                    name: "money",
                    // alias text: "minimum purchase amount"
                    alias: "起购金额",
                    selector: "//td[@id='list_time']/text()"
                }
            ]
        }
    ]
};
/**
 * Scan-page hook: queues page 1 of the product-list endpoint as a POST
 * request, logs a debug marker and returns false.
 * NOTE(review): returning false presumably stops the crawler from
 * discovering links on the scan page by itself - confirm with the platform docs.
 *
 * @param page    unused here
 * @param content unused here
 * @param site    object whose addUrl(url, options) receives the queued request
 * @returns {boolean} always false
 */
configs.onProcessScanPage = function (page, content, site) {
    var firstPageRequest = {
        method: "POST",
        data: {
            "queryForm.provinceId": "110",
            "queryForm.brand": "03",
            pageNo: 1,
            pageSize: 12
        }
    };

    site.addUrl(
        "http://finance.ccb.com/cc_webtran/queryFinanceProdList.gsp?jsoncallback=jQuery1910025284838050604774_1494164835256",
        firstPageRequest
    );
    console.log("debug 1");

    return false;
};
/**
 * Helper-page hook: reads the page number of the request that produced this
 * page and, while it is below the total page count, queues the next page of
 * the product list as a POST request.
 *
 * @param page    page.request.data.pageNo is the page number that was requested
 * @param content unused here
 * @param site    object whose addUrl(url, options) receives the queued request
 * @returns {boolean} always false
 */
configs.onProcessHelperPage = function (page, content, site) {
    // Parse once, with an explicit radix, and use the number everywhere:
    // if pageNo comes back as a string, "1" + 1 would yield "11" instead of 2.
    var currentPage = parseInt(page.request.data.pageNo, 10);
    console.log("debug 2 currentPage=" + currentPage);

    // Temporary measure: the total number of pages is hard-coded.
    // TODO: read the real page count from the response.
    // Declared with var so it no longer leaks as an implicit global.
    var totalPage = 4;

    if (currentPage < totalPage) {
        var helperUrl = "http://finance.ccb.com/cc_webtran/queryFinanceProdList.gsp?jsoncallback=jQuery1910025284838050604774_1494164835256";
        var options = {
            method: "POST",
            data: {
                "queryForm.provinceId": "110",
                "queryForm.brand": "03",
                pageNo: currentPage + 1,
                pageSize: 12
            },
            // NOTE(review): presumably keeps the same URL from being
            // de-duplicated across pages - confirm with the platform docs.
            reserve: true
        };
        site.addUrl(helperUrl, options);
    }
    return false;
};
/**
 * Content-page hook: does no work of its own and always returns false.
 * NOTE(review): false presumably tells the crawler not to discover further
 * links from content pages - confirm with the platform docs.
 *
 * @returns {boolean} always false
 */
configs.onProcessContentPage = function (page, content, site) {
    return false;
};
/**
 * Post-extraction hook: strips <b> and </b> tags from the values of the
 * "items.money" and "items.duration" fields; every other field is returned
 * unchanged.
 *
 * @param fieldName dotted field name, e.g. "items.money"
 * @param data      extracted value; may be null/undefined when nothing matched
 * @param page      unused here
 * @param site      unused here
 * @returns the cleaned string, or `data` unchanged when no cleanup applies
 */
configs.afterExtractField = function (fieldName, data, page, site) {
    var needsCleanup = fieldName === "items.money" || fieldName === "items.duration";

    // A field that matched nothing is not a string; calling .replace on it
    // would throw, so such values are passed through untouched.
    if (!needsCleanup || typeof data !== "string") {
        return data;
    }
    return data.replace(/<\/?b>/g, "");
};
// Build a crawler from `configs` and start it.
// `Crawler` is not defined in this script; presumably it is supplied by the
// crawler platform's runtime - confirm.
var crawler = new Crawler(configs);
crawler.start();
神箭手爬虫新手问题:抓取不到数据(全部代码如下)
- 写回答
- 好问题 0 提建议
- 追加酬金
- 关注问题
- 邀请回答
-
1条回答 默认 最新
- Go 旅城通票 2017-05-08 07:21关注
你内容地址是contentUrlRegexes: [/http:\/\/finance.ccb.com\/cc_webtran\/queryFinanceProdList.gsp\?jsoncallback=jQuery1910025284838050604774_1494164835256/],这个页面返回的内容是JSON,不是dom对象,selector应该配置JsonPath,selectorType: SelectorType.JsonPath
而且需要去掉?jsoncallback=......这个参数,这样返回的才是json数据,否则返回的是jsonp,而这个接口没有jsonp处理方法。改成下面的代码就行了: var url = "http://finance.ccb.com/cn/finance/product.html";
// Corrected crawler script. It was collapsed onto one line, where the first
// "//" comment swallowed everything after it; it is laid out statement by
// statement here. `url` is the variable declared just before this script.
var configs = {
    enableProxy: true,
    enableJS: true,
    domains: ["finance.ccb.com"],
    scanUrls: [url],
    // Note: the JSONP callback parameter is removed from both patterns.
    contentUrlRegexes: [/http:\/\/finance\.ccb\.com\/cc_webtran\/queryFinanceProdList\.gsp/],
    helperUrlRegexes: [/http:\/\/finance\.ccb\.com\/cc_webtran\/queryFinanceProdList\.gsp/],
    fields: [
        {
            name: "items",
            // JSON selector
            selectorType: SelectorType.JsonPath,
            selector: "$.ProdList[*]",
            repeated: true,
            // NOTE(review): the two child selectors use different forms
            // (".name" vs "purFloorAmt") - confirm which one the JsonPath
            // engine expects.
            children: [
                {
                    name: "name",
                    // alias text: "wealth-management product name"
                    alias: "理财产品名称",
                    selector: ".name",
                    required: true,
                    selectorType: SelectorType.JsonPath
                },
                {
                    name: "money",
                    // alias text: "minimum purchase amount"
                    alias: "起购金额",
                    selector: "purFloorAmt",
                    selectorType: SelectorType.JsonPath
                }
            ]
        }
    ]
};

// Scan-page hook: queues page 1 of the product list as a POST request.
configs.onProcessScanPage = function (page, content, site) {
    // Note: the JSONP callback parameter is removed.
    var helperUrl = "http://finance.ccb.com/cc_webtran/queryFinanceProdList.gsp";
    var options = {
        method: "POST",
        data: {
            "queryForm.provinceId": "110",
            "queryForm.brand": "03",
            pageNo: 1,
            pageSize: 12
        }
    };
    site.addUrl(helperUrl, options);
    console.log("debug 1");
    return false;
};

// Helper-page hook: queues the next page while the current one is below the
// total page count.
configs.onProcessHelperPage = function (page, content, site) {
    // Parse once with an explicit radix: a string pageNo would otherwise be
    // concatenated ("1" + 1 === "11") when computing the next page.
    var currentPage = parseInt(page.request.data.pageNo, 10);
    console.log("debug 2 currentPage=" + currentPage);

    // Temporary measure: the total number of pages is hard-coded.
    // TODO: read the real page count from the response.
    // Declared with var so it does not leak as an implicit global.
    var totalPage = 4;

    if (currentPage < totalPage) {
        // Note: the JSONP callback parameter is removed.
        var helperUrl = "http://finance.ccb.com/cc_webtran/queryFinanceProdList.gsp";
        var options = {
            method: "POST",
            data: {
                "queryForm.provinceId": "110",
                "queryForm.brand": "03",
                pageNo: currentPage + 1,
                pageSize: 12
            },
            reserve: true
        };
        site.addUrl(helperUrl, options);
    }
    return false;
};

// Content-page hook: no extra work; always returns false.
configs.onProcessContentPage = function (page, content, site) {
    return false;
};

// Post-extraction hook: strips <b>/</b> tags from the money and duration
// fields; a missing value is replaced by the string 'NaN' before stripping.
configs.afterExtractField = function (fieldName, data, page, site) {
    if (fieldName === "items.money" || fieldName === "items.duration") {
        data = (data || 'NaN').replace(/<\/?b>/g, "");
        return data;
    }
    return data;
};

var crawler = new Crawler(configs);
crawler.start();
本回答被题主选为最佳回答 , 对您是否有帮助呢?解决 无用评论 打赏 举报
悬赏问题
- ¥15 执行 virtuoso 命令后,界面没有,cadence 启动不起来
- ¥50 comfyui下连接animatediff节点生成视频质量非常差的原因
- ¥20 有关区间dp的问题求解
- ¥15 多电路系统共用电源的串扰问题
- ¥15 slam rangenet++配置
- ¥15 有没有研究水声通信方面的帮我改俩matlab代码
- ¥15 ubuntu子系统密码忘记
- ¥15 保护模式-系统加载-段寄存器
- ¥15 电脑桌面设定一个区域禁止鼠标操作
- ¥15 求NPF226060磁芯的详细资料