php pthreads内存泄漏无法停止线程

php pthreads集成的thread类没法停止进程啊，一直占着内存，关掉网页也不行。后来试了忽视断开然后判断是否断开再unset exit，也不行。
还试了pool类 worker类都没用求救

写回答
好问题 0 提建议
追加酬金
关注问题
分享
邀请回答
编辑收藏删除结题
收藏举报

1条回答默认最新

haolyx 2016-11-01 04:50

关注

代码

 <?php
namespace app\noveladmin\controller;
set_time_limit(0);
ignore_user_abort(true); //断线后仍运作，以判断是否断线或关闭网页，来停止所有线程并清除临时数据
ob_start();//开启缓冲区
$buffer = ini_get('output_buffering');
echo str_repeat(' ',$buffer+1);//防止浏览器缓存
ob_end_flush();//关闭缓存
//use一些thinkphp的类
use \think\Controller;
use \think\Request;
use \think\Log;

class Spider extends Controller
{
    public $config;
    public $param;

    public function __construct()
    {
        echo '<title>EGO小说爬虫</title><style>body{margin:8px !important;background: #fff; font-family: "Microsoft Yahei","Helvetica Neue",Helvetica,Arial,sans-serif; color: #333; font-size: 16px;}</style>';//css
        if(!function_exists('curl_init')) {
            $this->error('不支持CURL，请检查环境并安装CURL');//检查curl是否支持，没检查pthreads但一定要装pthreads！
        }
        $this->param = Request::instance()->request();//获取输入参数
        $this->config = require(ROOT_PATH.'spider_config/config.php');//引用爬虫配置文件
        $configKey = array('key','default_novel_id','default_thread_num','default_rule_name');//配置文件中必须有的键名
        foreach($configKey as $keyName) {
            //采用foreach找出缺少了哪个配置项
            if(!array_key_exists($keyName,$this->config)) {
                //缺少项
                $this->error('配置文件错误，缺少'.$keyName);
            }
        }
        if(Request::instance()->has('thread','request')) {
            $this->param['thread'] = intval($this->param['thread']);//线程数取整
        }
        if(Request::instance()->has('startid','request')) {
            $this->param['startid'] = intval($this->param['startid']);//起始小说ID取整
        }
    }

    public function index()
    {
        $input = array('thread' => '','rule' => '','key' => '','startid' => '');//定义三个键，防止报错（不存在键）
        $input = array_merge($input,$this->param);//合并两个数组 将存在的输入参数添加到input中
        echo 'Spider -- A PHP Novel Spider By:Eric<br><form action="spiderstart" method="post">爬虫线程数：<input name="thread" value="'.$input['thread'].'" placeholder="不填默认为'.$this->config['default_thread_num'].'" /><br />规则名：<input name="rule" value="'.$input['rule'].'" placeholder="不填默认为'.$this->config['default_rule_name'].'" /><br />密钥：<input name="key" value="'.$input['key'].'" placeholder="必填" /><br />起始小说ID：<input name="startid" value="'.$input['startid'].'" placeholder="不填默认为'.$this->config['default_novel_id'].'" /><br /><input type="submit" value="启动爬虫" /></form>';
    }

    public function spiderstart()
    {
        //启动爬虫
        echo '<div style="height:30px;position:fixed;top:0px;width:100%;text-align:center;background:blue;color:white;" id="RunInfo"></div><script>function GoToBottom(){document.body.scrollTop=document.body.scrollHeight;}</script>';
        echo '<div style="height:30px;">&nbsp;</div>正在初始化爬虫<br /><br />';
        ob_flush();
        flush();
        usleep(300000);//暂停0.3秒，无实际用途
        echo '正在校验并配置参数<br /><br />';
        ob_flush();
        flush();
        if(!Request::instance()->has('key','request') || $this->param['key'] != $this->config['key']) {
            $this->error('密钥错误！');//密钥错误
        }
        if(empty($this->param['thread']) || $this->param['thread'] < 0) {
            $threadNum = $this->config['default_thread_num'];//未输入thread或thread不为正整数，使用默认配置
        } else {
            $threadNum = $this->param['thread'];//使用输入参数
        }
        if(!Request::instance()->has('rule','request') || $this->param['rule'] == '') {
            $ruleName = $this->config['default_rule_name'];//未输入rule或rule为空，使用默认配置
        } else {
            $ruleName = $this->param['rule'];//使用输入参数
        }
        if(empty($this->param['startid']) || $this->param['startid'] < 0) {
            $novelId = $this->config['default_novel_id'];//未输入startid或startid不为正整数，使用默认配置
        } else {
            $novelId = $this->param['startid'];//使用输入参数
        }
        usleep(300000);//暂停0.3秒，无实际用途
        echo '参数配置完成<br /><br />正在读取并解析规则文件<br/><br/>';
        ob_flush();
        flush();
        $rulePath = ROOT_PATH.'spider_config/rule/'.$ruleName.'.xml';//规则文件路径
        if(!file_exists($rulePath)) {
            //判断规则文件是否存在
            $this->error('规则文件不存在！');
        }
        try {           
            $ruleFile = fopen($rulePath,'r');
            $ruleString = fread($ruleFile,filesize($rulePath));//读取规则文件
            fclose($ruleFile);
        }
        //Exception加\防止使用Thinkphp的Exception类
        catch(\Exception $e) {
            $this->error('无法读取规则文件');
        }
        $ruleList = array();
        $ruleTag = array('SiteName','SiteUrl','BookUrl','Title','Charset','NotFound','UserAgent','ProxyList','Writer','Image','Status','StatusC','Tag');//规则中的标签
        foreach($ruleTag as $tagName) {
            //foreach+正则解析XML，只读取需要的标签
            if(!preg_match('#<'.$tagName.'>(.*)</'.$tagName.'>#',$ruleString,$matchContent)) {
                //未匹配到该标签
                $this->error('规则错误，缺少'.$tagName);
            } else{
                $ruleList[$tagName] = $matchContent[1];//匹配到标签，添加到数组中
            }
        }
        usleep(300000);//暂停0.3秒，无实际用途
        echo '读取并解析配置文件成功！<br /><br />初始化结束！配置信息：<br />线程数：'.$threadNum.'<br />规则名：'.$ruleName.'<br />规则文件：'.$rulePath.'<br />抓取网站：'.$ruleList['SiteName'].'<br />网站URL：'.$ruleList['SiteUrl'].'<br /><br />正在启动线程<br /><br />';
        ob_flush();
        flush();
        //创建线程池
        for($pNum = 1;$pNum <= $threadNum;$pNum++) {
            $pool[] = new crawl($pNum);//初始化爬虫线程，先全部初始化好再启动
        }
        //启动所有线程,使其处于工作状态
        foreach ($pool as $tid => $wthread) {
            $tid++;//线程id
            foreach($ruleTag as $tagName) {
                //foreach遍历，设置该线程抓取规则，还有一种方法是传递配置文本ruleString，让crawl类来解析，这样省事，但是需要花费些时间（然而也没多久）
                $tRuleName = 'rule'.$tagName;//拼接属性名，属性名：rule标签名，如ruleTitle，ruleCharset
                /*if(property_exists('crawl',$tRuleName)){
                    //判断是否存在该属性，如果存在就赋值
                    $wthread->$tRuleName = $ruleList[$tagName];
                }*/
                $objectVars = get_object_vars($wthread);//获取类中的属性，上面的property_exists不知道为啥全部返回false，只好用get_objects_vars了
                if(array_key_exists($tRuleName,$objectVars)) {
                    $wthread->$tRuleName = $ruleList[$tagName];//判断类中是否存在该属性，如果存在就赋值
                }
            }
            $wthread->start();
            echo '线程'.$tid.'已启动！<br />';
            ob_flush();
            flush();
        }
        echo '所有线程已启动，正在派发任务！<br />';
        ob_flush();
        flush();
        usleep(300000);//暂停0.3秒
        while (true) {
            foreach ($pool as $worker) {
                //run不为true则说明线程空闲
                if($worker->run != true) {
                    echo $worker->echoString;//线程空闲后输出返回信息
                    $worker->echoString = '';
                    ob_flush();
                    flush();
                    //end为true代表完成此次任务，需要派发任务，否则直接启动即可
                    if($worker->end == true){
                        $taskUrl = str_replace('{d}',$novelId,$ruleList['BookUrl']);//根据规则生成被抓取页面URL
                        $novelId++;//小说ID + 1
                        $worker->end = false;//设置爬虫状态
                        $worker->desUrl = $taskUrl;//派发任务
                        $worker->type = 0;//设置type为详情页
                        echo '线程'.$worker->id.'已派发任务，开始抓取 '.$taskUrl.'<br />';
                        ob_flush();
                        flush();
                    }
                    $worker->run = true;//开始工作
                }
            }
            if(connection_status() != 0 || connection_aborted()) {
                //退出了爬虫
                foreach($pool as $tid => $worker) {
                    $worker->isKill = true;//通知线程停止工作，似乎没用，不知道为什么
                    unset($pool[$tid]);//看网上有用unset的，但是实测不行，还是无法结束线程释放内存
                    //不知道是线程没停止呢，还是停止了没释放内存呢？
                }
                /*for($tid = 0;$tid < count($pool);$tid++) {
                    $pool[$tid]->isKill = true;
                    unset($pool[$tid]);
                }*/
                //break;
                exit;//break,exit都不行，线程就是没法结束掉
            }
            usleep(300000);
        }
    }
}

class crawl extends \Thread {
    //为了防止使用Thinkphp中的类，extends \Thread时，Thread前必须加\
    public $run;
    public $id;
    public $echoString;
    public $isKill;
    public $desUrl;//小说详情页URL，任务起始URL，每个任务由详情页开始
    //抓取规则 具体说明看自带规则中的注释
    public $ruleCharset;
    public $ruleTitle;
    public $ruleUserAgent;
    public $ruleProxyList;
    public $ruleNotFound;
    public $ruleWriter;
    public $ruleStatus;
    public $ruleImage;
    public $ruleStatusC;
    public $ruleTag;
    public $end;//此本小说抓取是否完成
    public $type;//0：详情页 1：内容页 2：章节页

    public function __construct($id)
    {
        //初始化，给变量赋默认值
        $this->id = $id;
        $this->end = true;//默认还没执行任务所以为true以获取任务
        $this->isKill = false;
        $this->echoString = '';
        $this->url = '';
        $this->type = 0;
    }

    public function fetch($url = '',$charset = 'UTF-8',$proxy = '',$ua = '')
    {
        //仅采用curl方式，需支持curl
        $ch = curl_init();
        //设置CURL
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        //执行抓取
        $result = curl_exec($ch);
        //获取HTTP状态码
        if(curl_getinfo($ch,CURLINFO_HTTP_CODE) == 404) {
            $result = '404';
        }
        //释放CURL
        curl_close($ch);
        return mb_convert_encoding($result,'UTF-8',$charset);//编码转换为UTF-8
    }

    public function run()
    {
        while($this->isKill == false){
            if($this->run == true) {
                //type为0，抓取详情页 type为1，抓取章节页 type为2，抓取内容页
                switch($this->type) {
                    case 0:
                    $result = $this->fetch($this->desUrl,$this->ruleCharset,$this->ruleProxyList,$this->ruleUserAgent);//抓取页面内容
                    if($result == '404' || preg_match('#'.$this->ruleNotFound.'#',$result)) {
                        //小说不存在
                        $this->echoString = '线程'.$this->id.'抓取详情页失败：小说不存在！<br />';
                        $this->end = true;//结束此ID的采集
                    } else {
                        $novelInfo = array();//小说详情信息数组
                        //匹配小说信息
                        preg_match('#'.$this->ruleTitle.'#',$result,$matchTitle);
                        $novelInfo['title'] = $matchTitle[1];
                        preg_match('#'.$this->ruleWriter.'#',$result,$matchWriter);
                        $novelInfo['writer'] = $matchWriter[1];
                        preg_match('#'.$this->ruleStatus.'#',$result,$matchStatus);
                        if($matchStatus[1] == $this->ruleStatusC) {
                            $novelInfo['status'] = '1';//已完结
                            $statusText = '已完结';
                        } else {
                            $novelInfo['status'] = '0';//连载中
                            $statusText = '连载中';
                        }
                        preg_match('#'.$this->ruleImage.'#',$result,$matchImage);
                        $novelInfo['image'] = $matchImage[1];
                        preg_match('#'.$this->ruleTag.'#',$result,$matchTag);
                        $novelInfo['tag'] = $matchTag[1];
                        foreach($novelInfo as $key => $value) {
                            if($value == '' && $key != 'image') {
                                //除了封面其余有未匹配到，就抓取失败并重新抓取，封面为空则使用默认封面（数据库里封面留空即可，页面中用onerror指定默认封面）
                            }
                        }
                        $this->echoString = '线程'.$this->id.'抓取详情页完成！小说：'.$novelInfo['title'].' 封面：<img src="'.$novelInfo['image'].'" style="width:50px;height:65px;" /> 类别：'.$novelInfo['tag'].' 作者：'.$novelInfo['writer'].' 状态：'.$statusText.'<br />';
                        $this->type = 1;//设置type为章节页
                    }
                    break;
                    case 1:
                    break;
                    case 2:
                    break;
                }
                $this->run = false;//运行结束
            } else {
                while($this->run == false && $this->isKill == false){
                    //等待运行
                    usleep(300000);
                }
            }            
        }
        $this->kill();//销毁自身
    }
}
?>

报告相同问题？

关注问题

PHP Pthreads无法将值传递给线程中的子对象 php
2015-09-15 15:10

回答 2 已采纳 Finally I found the answer. But I am not quite sure if its a correct behaviour. Your suggestions a
从cron运行时PHP pthreads失败了 linux php
2016-10-29 20:38

回答 1 已采纳 After all of that, it seems that the issue was with regards to user permissions. I was setting thi
PHP pThreads IIS：浏览器中的意外输出 php
2015-06-25 17:21

回答 1 已采纳 As krakjoe mentioned in your issue, strange things can happen when different threads work with the
PHP中使用Pthread进行并行编程-基础
2020-09-01 02:04

culi3182的博客从pthreads v3开始，此建议已得到执行，因此现在您根本无法在Web服务器环境中使用它。造成这种情况的两个突出原因是： It is not safe to use multiple threads in such an environment (causing IO issues, ...
PHP pThreads - 你如何执行垃圾收集？ php
2014-04-28 12:32

回答 1 已采纳 Why Why should I collect anyway ? The Worker threads provided by pthreads require that the progr
PHP线程编译不起作用 linux php
2018-03-13 00:30

回答 1 已采纳 I am attempting to use pthreads with Apache FPM. You can't. Find a way to work without them.
PHP Pthread似乎有随机内存管理和内存泄漏[关闭] php
2015-10-27 19:49

回答 1 已采纳 So it turns out this wasn't coming from pthreads. Instead the multithread just made the issue mor
PHP-FPM进程池探秘
2017-10-16 19:53

熊猫猛男的博客 PHP 7.2+ pthreads 扩展提供了Thread、Worker、Threaded 对象，使得创建、读取、写入以及执行多线程成为可能，并可以在多个线程之间进行同步控制；pthreads 多线程开发也仅限于命令行模式，不能用于 web 服务器环境...
PHP线程告诉JavaScript将睡眠 ajax javascript php
2016-08-05 10:13

回答 2 已采纳 Solution: I used the javascript function setInterval with the time that user set, who runs a funct
如何在CentOS 6.7的PHP 5.6.13中启用pthreads？ php
2015-09-17 01:16

回答 1 已采纳 For some reason there is two different Versions, the one you installed which cause a conflict with
PHP pthreads很奇怪 php
2014-03-07 08:22

回答 2 已采纳 Standard output cannot be guaranteed, Zend provides no API to ensure you get the right stream, it
PHP与Angular详细对比帮助你选择合适的项目技术
2023-12-29 10:46

WPHunter的博客本文将解决关于PHP与Angular的争论，并帮助您为下一个项目选择最合适的技术。
关联数组作为PHP中的对象属性（使用pthreads） php
2016-02-13 12:46

回答 1 已采纳 Since GetInfo does not descend from pthreads (it is not Threaded): $this->get_info_object = $o
高级工程师面试 - PHP
2023-04-18 17:01

凯丨的博客 PHP高级工程师面试
PHP面试题2021和2022面试、跳槽必备大全！
2021-07-19 14:38

程序员张小妍的博客一、php面试前言每个Phper在应聘的时候，都会有php面试与笔试。除了口语表达能力之外，还有一点就是实力，这也是你的php面试题所要体现的！那么提前掌握最新的PHP面试题，必然能使你在求职过程中事半功倍！以下...
php定时器使用,PHP定时器的说明
2021-03-25 10:48

weixin_39606396的博客这篇文章主要介绍了关于PHP定时器的说明，有着一定的参考价值，现在分享给大家，有需要的朋友可以参考一下常见的定时器有两种：一种周期性定时执行，例如每天的凌晨三点出报表；另一种在指定时间后执行(一次)，例如...
PHP七天定时器,PHP定时器的说明
2021-04-07 08:21

weixin_39882271的博客这篇文章主要介绍了关于PHP定时器的说明，有着一定的参考价值，现在分享给大家，有需要的朋友可以参考一下常见的定时器有两种：一种周期性定时执行，例如每天的凌晨三点出报表；另一种在指定时间后执行(一次)，例如...
mysql多进程 php_PHP多进程编程
2021-02-08 06:02

闻省的博客在日常开发中，我们经常会遇到需要使用脚本处理一些数据，在数据量比较大的情况下，我们可以采用并行的方式处理，比如说：1....i++))do/usr/bin/php multiprocessTest.php &donewait但是这种方式依赖外...
一份PHP学习大纲
2021-12-07 17:56

水行云起的博客内存管理 zend内存池变量 zval、zend_value hashtable gc refcount 编译过程词法分析lexer -> tokens 语法分析parser -> AST 编译compiler -> opcode opcache 执行过程（简单归纳，重点学习...
PHP多进程基于Redis实现轻量级延迟队列
2020-01-21 08:07

lxw1844912514的博客内存泄露检测有必要: 所有的内存分配在底层都是调用了brk或者mmap，只要程序只有大量brk或者mmap的系统调用，内存泄露可能性非常高 ,检测命令: strace -c -p pid | grep -P 'mmap| brk' 4.检测程序的系统调用情况：...
没有解决我的问题, 去提问

悬赏问题

¥15 素材场景中光线烘焙后灯光失效
¥15 请教一下各位，为什么我这个没有实现模拟点击
¥15 执行 virtuoso 命令后，界面没有，cadence 启动不起来
¥50 comfyui下连接animatediff节点生成视频质量非常差的原因
¥20 有关区间dp的问题求解
¥15 多电路系统共用电源的串扰问题
¥15 slam rangenet++配置
¥15 有没有研究水声通信方面的帮我改俩matlab代码
¥15 ubuntu子系统密码忘记
¥15 保护模式-系统加载-段寄存器

码龄粉丝数原力等级 --

php pthreads内存泄漏无法停止线程

1条回答默认最新

码龄粉丝数原力等级 --

悬赏问题

php pthreads内存泄漏无法停止线程

1条回答 默认 最新

悬赏问题

1条回答默认最新