php pthreads集成的thread类 没法停止进程啊,一直占着内存,关掉网页也不行。后来试了忽视断开然后判断是否断开再unset exit,也不行。
还试了pool类 worker类 都没用 求救
php pthreads内存泄漏无法停止线程
- 写回答
- 好问题 0 提建议
- 追加酬金
- 关注问题
- 邀请回答
-
1条回答 默认 最新
- haolyx 2016-11-01 04:50关注
代码
<?php namespace app\noveladmin\controller; set_time_limit(0); ignore_user_abort(true); //断线后仍运作,以判断是否断线或关闭网页,来停止所有线程并清除临时数据 ob_start();//开启缓冲区 $buffer = ini_get('output_buffering'); echo str_repeat(' ',$buffer+1);//防止浏览器缓存 ob_end_flush();//关闭缓存 //use一些thinkphp的类 use \think\Controller; use \think\Request; use \think\Log; class Spider extends Controller { public $config; public $param; public function __construct() { echo '<title>EGO小说爬虫</title><style>body{margin:8px !important;background: #fff; font-family: "Microsoft Yahei","Helvetica Neue",Helvetica,Arial,sans-serif; color: #333; font-size: 16px;}</style>';//css if(!function_exists('curl_init')) { $this->error('不支持CURL,请检查环境并安装CURL');//检查curl是否支持,没检查pthreads但一定要装pthreads! } $this->param = Request::instance()->request();//获取输入参数 $this->config = require(ROOT_PATH.'spider_config/config.php');//引用爬虫配置文件 $configKey = array('key','default_novel_id','default_thread_num','default_rule_name');//配置文件中必须有的键名 foreach($configKey as $keyName) { //采用foreach找出缺少了哪个配置项 if(!array_key_exists($keyName,$this->config)) { //缺少项 $this->error('配置文件错误,缺少'.$keyName); } } if(Request::instance()->has('thread','request')) { $this->param['thread'] = intval($this->param['thread']);//线程数取整 } if(Request::instance()->has('startid','request')) { $this->param['startid'] = intval($this->param['startid']);//起始小说ID取整 } } public function index() { $input = array('thread' => '','rule' => '','key' => '','startid' => '');//定义三个键,防止报错(不存在键) $input = array_merge($input,$this->param);//合并两个数组 将存在的输入参数添加到input中 echo 'Spider -- A PHP Novel Spider By:Eric<br><form action="spiderstart" method="post">爬虫线程数:<input name="thread" value="'.$input['thread'].'" placeholder="不填默认为'.$this->config['default_thread_num'].'" /><br />规则名:<input name="rule" value="'.$input['rule'].'" placeholder="不填默认为'.$this->config['default_rule_name'].'" /><br />密钥:<input name="key" value="'.$input['key'].'" placeholder="必填" /><br />起始小说ID:<input name="startid" value="'.$input['startid'].'" placeholder="不填默认为'.$this->config['default_novel_id'].'" /><br /><input type="submit" value="启动爬虫" /></form>'; } public function spiderstart() { //启动爬虫 echo '<div style="height:30px;position:fixed;top:0px;width:100%;text-align:center;background:blue;color:white;" id="RunInfo"></div><script>function GoToBottom(){document.body.scrollTop=document.body.scrollHeight;}</script>'; echo '<div style="height:30px;"> </div>正在初始化爬虫<br /><br />'; ob_flush(); flush(); usleep(300000);//暂停0.3秒,无实际用途 echo '正在校验并配置参数<br /><br />'; ob_flush(); flush(); if(!Request::instance()->has('key','request') || $this->param['key'] != $this->config['key']) { $this->error('密钥错误!');//密钥错误 } if(empty($this->param['thread']) || $this->param['thread'] < 0) { $threadNum = $this->config['default_thread_num'];//未输入thread或thread不为正整数,使用默认配置 } else { $threadNum = $this->param['thread'];//使用输入参数 } if(!Request::instance()->has('rule','request') || $this->param['rule'] == '') { $ruleName = $this->config['default_rule_name'];//未输入rule或rule为空,使用默认配置 } else { $ruleName = $this->param['rule'];//使用输入参数 } if(empty($this->param['startid']) || $this->param['startid'] < 0) { $novelId = $this->config['default_novel_id'];//未输入startid或startid不为正整数,使用默认配置 } else { $novelId = $this->param['startid'];//使用输入参数 } usleep(300000);//暂停0.3秒,无实际用途 echo '参数配置完成<br /><br />正在读取并解析规则文件<br/><br/>'; ob_flush(); flush(); $rulePath = ROOT_PATH.'spider_config/rule/'.$ruleName.'.xml';//规则文件路径 if(!file_exists($rulePath)) { //判断规则文件是否存在 $this->error('规则文件不存在!'); } try { $ruleFile = fopen($rulePath,'r'); $ruleString = fread($ruleFile,filesize($rulePath));//读取规则文件 fclose($ruleFile); } //Exception加\防止使用Thinkphp的Exception类 catch(\Exception $e) { $this->error('无法读取规则文件'); } $ruleList = array(); $ruleTag = array('SiteName','SiteUrl','BookUrl','Title','Charset','NotFound','UserAgent','ProxyList','Writer','Image','Status','StatusC','Tag');//规则中的标签 foreach($ruleTag as $tagName) { //foreach+正则解析XML,只读取需要的标签 if(!preg_match('#<'.$tagName.'>(.*)</'.$tagName.'>#',$ruleString,$matchContent)) { //未匹配到该标签 $this->error('规则错误,缺少'.$tagName); } else{ $ruleList[$tagName] = $matchContent[1];//匹配到标签,添加到数组中 } } usleep(300000);//暂停0.3秒,无实际用途 echo '读取并解析配置文件成功!<br /><br />初始化结束!配置信息:<br />线程数:'.$threadNum.'<br />规则名:'.$ruleName.'<br />规则文件:'.$rulePath.'<br />抓取网站:'.$ruleList['SiteName'].'<br />网站URL:'.$ruleList['SiteUrl'].'<br /><br />正在启动线程<br /><br />'; ob_flush(); flush(); //创建线程池 for($pNum = 1;$pNum <= $threadNum;$pNum++) { $pool[] = new crawl($pNum);//初始化爬虫线程,先全部初始化好再启动 } //启动所有线程,使其处于工作状态 foreach ($pool as $tid => $wthread) { $tid++;//线程id foreach($ruleTag as $tagName) { //foreach遍历,设置该线程抓取规则,还有一种方法是传递配置文本ruleString,让crawl类来解析,这样省事,但是需要花费些时间(然而也没多久) $tRuleName = 'rule'.$tagName;//拼接属性名,属性名:rule标签名,如ruleTitle,ruleCharset /*if(property_exists('crawl',$tRuleName)){ //判断是否存在该属性,如果存在就赋值 $wthread->$tRuleName = $ruleList[$tagName]; }*/ $objectVars = get_object_vars($wthread);//获取类中的属性,上面的property_exists不知道为啥全部返回false,只好用get_objects_vars了 if(array_key_exists($tRuleName,$objectVars)) { $wthread->$tRuleName = $ruleList[$tagName];//判断类中是否存在该属性,如果存在就赋值 } } $wthread->start(); echo '线程'.$tid.'已启动!<br />'; ob_flush(); flush(); } echo '所有线程已启动,正在派发任务!<br />'; ob_flush(); flush(); usleep(300000);//暂停0.3秒 while (true) { foreach ($pool as $worker) { //run不为true则说明线程空闲 if($worker->run != true) { echo $worker->echoString;//线程空闲后输出返回信息 $worker->echoString = ''; ob_flush(); flush(); //end为true代表完成此次任务,需要派发任务,否则直接启动即可 if($worker->end == true){ $taskUrl = str_replace('{d}',$novelId,$ruleList['BookUrl']);//根据规则生成被抓取页面URL $novelId++;//小说ID + 1 $worker->end = false;//设置爬虫状态 $worker->desUrl = $taskUrl;//派发任务 $worker->type = 0;//设置type为详情页 echo '线程'.$worker->id.'已派发任务,开始抓取 '.$taskUrl.'<br />'; ob_flush(); flush(); } $worker->run = true;//开始工作 } } if(connection_status() != 0 || connection_aborted()) { //退出了爬虫 foreach($pool as $tid => $worker) { $worker->isKill = true;//通知线程停止工作,似乎没用,不知道为什么 unset($pool[$tid]);//看网上有用unset的,但是实测不行,还是无法结束线程释放内存 //不知道是线程没停止呢,还是停止了没释放内存呢? } /*for($tid = 0;$tid < count($pool);$tid++) { $pool[$tid]->isKill = true; unset($pool[$tid]); }*/ //break; exit;//break,exit都不行,线程就是没法结束掉 } usleep(300000); } } } class crawl extends \Thread { //为了防止使用Thinkphp中的类,extends \Thread时,Thread前必须加\ public $run; public $id; public $echoString; public $isKill; public $desUrl;//小说详情页URL,任务起始URL,每个任务由详情页开始 //抓取规则 具体说明看自带规则中的注释 public $ruleCharset; public $ruleTitle; public $ruleUserAgent; public $ruleProxyList; public $ruleNotFound; public $ruleWriter; public $ruleStatus; public $ruleImage; public $ruleStatusC; public $ruleTag; public $end;//此本小说抓取是否完成 public $type;//0:详情页 1:内容页 2:章节页 public function __construct($id) { //初始化,给变量赋默认值 $this->id = $id; $this->end = true;//默认还没执行任务所以为true以获取任务 $this->isKill = false; $this->echoString = ''; $this->url = ''; $this->type = 0; } public function fetch($url = '',$charset = 'UTF-8',$proxy = '',$ua = '') { //仅采用curl方式,需支持curl $ch = curl_init(); //设置CURL curl_setopt($ch, CURLOPT_URL, $url); curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); curl_setopt($ch, CURLOPT_HEADER, 0); //执行抓取 $result = curl_exec($ch); //获取HTTP状态码 if(curl_getinfo($ch,CURLINFO_HTTP_CODE) == 404) { $result = '404'; } //释放CURL curl_close($ch); return mb_convert_encoding($result,'UTF-8',$charset);//编码转换为UTF-8 } public function run() { while($this->isKill == false){ if($this->run == true) { //type为0,抓取详情页 type为1,抓取章节页 type为2,抓取内容页 switch($this->type) { case 0: $result = $this->fetch($this->desUrl,$this->ruleCharset,$this->ruleProxyList,$this->ruleUserAgent);//抓取页面内容 if($result == '404' || preg_match('#'.$this->ruleNotFound.'#',$result)) { //小说不存在 $this->echoString = '线程'.$this->id.'抓取详情页失败:小说不存在!<br />'; $this->end = true;//结束此ID的采集 } else { $novelInfo = array();//小说详情信息数组 //匹配小说信息 preg_match('#'.$this->ruleTitle.'#',$result,$matchTitle); $novelInfo['title'] = $matchTitle[1]; preg_match('#'.$this->ruleWriter.'#',$result,$matchWriter); $novelInfo['writer'] = $matchWriter[1]; preg_match('#'.$this->ruleStatus.'#',$result,$matchStatus); if($matchStatus[1] == $this->ruleStatusC) { $novelInfo['status'] = '1';//已完结 $statusText = '已完结'; } else { $novelInfo['status'] = '0';//连载中 $statusText = '连载中'; } preg_match('#'.$this->ruleImage.'#',$result,$matchImage); $novelInfo['image'] = $matchImage[1]; preg_match('#'.$this->ruleTag.'#',$result,$matchTag); $novelInfo['tag'] = $matchTag[1]; foreach($novelInfo as $key => $value) { if($value == '' && $key != 'image') { //除了封面其余有未匹配到,就抓取失败并重新抓取,封面为空则使用默认封面(数据库里封面留空即可,页面中用onerror指定默认封面) } } $this->echoString = '线程'.$this->id.'抓取详情页完成!小说:'.$novelInfo['title'].' 封面:<img src="'.$novelInfo['image'].'" style="width:50px;height:65px;" /> 类别:'.$novelInfo['tag'].' 作者:'.$novelInfo['writer'].' 状态:'.$statusText.'<br />'; $this->type = 1;//设置type为章节页 } break; case 1: break; case 2: break; } $this->run = false;//运行结束 } else { while($this->run == false && $this->isKill == false){ //等待运行 usleep(300000); } } } $this->kill();//销毁自身 } } ?>
解决 无用评论 打赏 举报
悬赏问题
- ¥15 素材场景中光线烘焙后灯光失效
- ¥15 请教一下各位,为什么我这个没有实现模拟点击
- ¥15 执行 virtuoso 命令后,界面没有,cadence 启动不起来
- ¥50 comfyui下连接animatediff节点生成视频质量非常差的原因
- ¥20 有关区间dp的问题求解
- ¥15 多电路系统共用电源的串扰问题
- ¥15 slam rangenet++配置
- ¥15 有没有研究水声通信方面的帮我改俩matlab代码
- ¥15 ubuntu子系统密码忘记
- ¥15 保护模式-系统加载-段寄存器