So, I have hit a dead end here. I have tried everything I know to isolate a memory leak, and from what I've gathered, it seems to be related to the use of pthreads for multithreading this script.
I am in the process of writing a bot for Wikipedia, and I am nearing completion. Functionally, the program is sound and works as expected in both multithreaded and single-threaded modes. The memory leak only occurs when multithreading is switched on.
Both versions use exactly the same functions on the same script/file, to allow for easy/consistent debugging.
The threading engine is below.
//Multithread engine
//This thread class allows for asynchronous function calls. This is useful for functions that consume time and can run in the background.
//Caution must be exercised to ensure that the functions are thread safe.
/**
 * Thread that invokes a callable with a list of arguments and keeps the
 * callable's return value in $result.
 */
class AsyncFunctionCall extends Thread {
    /** @var callable Callable invoked by run(). */
    protected $method;
    /** @var array Arguments passed to the callable. */
    protected $params;
    /** @var mixed Return value of the callable; null until run() stores one. */
    public $result;

    /**
     * @param callable $method Callable to invoke from run().
     * @param array    $params Arguments for the callable.
     */
    public function __construct( $method, $params ) {
        $this->method = $method;
        $this->params = $params;
        $this->result = null;
    }

    /**
     * Invokes the stored callable and records what it returned.
     *
     * @return bool True when the callable returned a truthy value, false otherwise.
     */
    public function run() {
        $outcome = call_user_func_array( $this->method, $this->params );
        $this->result = $outcome;
        return (bool) $outcome;
    }

    /**
     * Creates a thread for the given callable and starts it.
     *
     * @param callable $method Callable to run in the background.
     * @param array    $params Arguments for the callable.
     * @return AsyncFunctionCall|false The started thread, or false when start() failed.
     */
    public static function call( $method, $params ) {
        $thread = new AsyncFunctionCall( $method, $params );
        if( $thread->start() ) {
            return $thread;
        }
        // $method may be an array callable or a Closure, neither of which can be
        // interpolated into a string safely. is_callable() in syntax-only mode
        // yields a printable name and leaves plain string callables unchanged.
        is_callable( $method, true, $methodName );
        echo "Unable to initiate background function $methodName!
";
        return false;
    }
}
// Analyze multiple pages simultaneously and edit them.
/**
 * Pool job that runs analyzePage() for one page and stores the returned
 * value in $result.
 *
 * Every constructor argument is stored on the job and forwarded unchanged to
 * analyzePage() when run() executes.
 */
class ThreadedBot extends Collectable {
    // Page identity and the archive list handed to analyzePage().
    protected $page,
        $pageid,
        $alreadyArchived,
        // Configuration values forwarded verbatim to analyzePage().
        $ARCHIVE_ALIVE,
        $TAG_OVERRIDE,
        $ARCHIVE_BY_ACCESSDATE,
        $TOUCH_ARCHIVE,
        $DEAD_ONLY,
        $NOTIFY_ERROR_ON_TALK,
        $NOTIFY_ON_TALK,
        $TALK_MESSAGE_HEADER,
        $TALK_MESSAGE,
        $TALK_ERROR_MESSAGE_HEADER,
        $TALK_ERROR_MESSAGE,
        $DEADLINK_TAGS,
        $CITATION_TAGS,
        $IGNORE_TAGS,
        $ARCHIVE_TAGS,
        $VERIFY_DEAD,
        $LINK_SCAN;
    /** @var mixed Whatever analyzePage() returned; set by run(). */
    public $result;

    /**
     * Stores every argument for later use by run(). The parameter order
     * mirrors the argument order of analyzePage().
     */
    public function __construct($page, $pageid, $alreadyArchived, $ARCHIVE_ALIVE, $TAG_OVERRIDE, $ARCHIVE_BY_ACCESSDATE, $TOUCH_ARCHIVE, $DEAD_ONLY, $NOTIFY_ERROR_ON_TALK, $NOTIFY_ON_TALK, $TALK_MESSAGE_HEADER, $TALK_MESSAGE, $TALK_ERROR_MESSAGE_HEADER, $TALK_ERROR_MESSAGE, $DEADLINK_TAGS, $CITATION_TAGS, $IGNORE_TAGS, $ARCHIVE_TAGS, $VERIFY_DEAD, $LINK_SCAN) {
        $this->page = $page;
        $this->pageid = $pageid;
        $this->alreadyArchived = $alreadyArchived;

        $this->ARCHIVE_ALIVE = $ARCHIVE_ALIVE;
        $this->TAG_OVERRIDE = $TAG_OVERRIDE;
        $this->ARCHIVE_BY_ACCESSDATE = $ARCHIVE_BY_ACCESSDATE;
        $this->TOUCH_ARCHIVE = $TOUCH_ARCHIVE;
        $this->DEAD_ONLY = $DEAD_ONLY;

        $this->NOTIFY_ERROR_ON_TALK = $NOTIFY_ERROR_ON_TALK;
        $this->NOTIFY_ON_TALK = $NOTIFY_ON_TALK;
        $this->TALK_MESSAGE_HEADER = $TALK_MESSAGE_HEADER;
        $this->TALK_MESSAGE = $TALK_MESSAGE;
        $this->TALK_ERROR_MESSAGE_HEADER = $TALK_ERROR_MESSAGE_HEADER;
        $this->TALK_ERROR_MESSAGE = $TALK_ERROR_MESSAGE;

        $this->DEADLINK_TAGS = $DEADLINK_TAGS;
        $this->CITATION_TAGS = $CITATION_TAGS;
        $this->IGNORE_TAGS = $IGNORE_TAGS;
        $this->ARCHIVE_TAGS = $ARCHIVE_TAGS;

        $this->VERIFY_DEAD = $VERIFY_DEAD;
        $this->LINK_SCAN = $LINK_SCAN;
    }

    /**
     * Raises the memory limit, prints the limit and current usage, analyzes
     * the page, flags the job as garbage and finally drops all stored inputs.
     */
    public function run() {
        ini_set( 'memory_limit', '1G' );

        // Diagnostic line: configured limit followed by current usage in MB.
        $limit = ini_get( 'memory_limit' );
        $usedMegabytes = memory_get_usage( true )/1024/1024;
        echo $limit."; ".$usedMegabytes." MB
";

        $stats = analyzePage( $this->page, $this->pageid, $this->alreadyArchived, $this->ARCHIVE_ALIVE, $this->TAG_OVERRIDE, $this->ARCHIVE_BY_ACCESSDATE, $this->TOUCH_ARCHIVE, $this->DEAD_ONLY, $this->NOTIFY_ERROR_ON_TALK, $this->NOTIFY_ON_TALK, $this->TALK_MESSAGE_HEADER, $this->TALK_MESSAGE, $this->TALK_ERROR_MESSAGE_HEADER, $this->TALK_ERROR_MESSAGE, $this->DEADLINK_TAGS, $this->CITATION_TAGS, $this->IGNORE_TAGS, $this->ARCHIVE_TAGS, $this->VERIFY_DEAD, $this->LINK_SCAN );
        $this->result = $stats;

        // Flag the job as finished before releasing its inputs.
        $this->setGarbage();

        // Every input is first overwritten with null and then unset, in the
        // same two passes; $result is deliberately left in place.
        $inputs = array(
            'page', 'pageid', 'alreadyArchived',
            'ARCHIVE_ALIVE', 'TAG_OVERRIDE', 'ARCHIVE_BY_ACCESSDATE', 'TOUCH_ARCHIVE', 'DEAD_ONLY',
            'NOTIFY_ERROR_ON_TALK', 'NOTIFY_ON_TALK',
            'TALK_MESSAGE_HEADER', 'TALK_MESSAGE', 'TALK_ERROR_MESSAGE_HEADER', 'TALK_ERROR_MESSAGE',
            'DEADLINK_TAGS', 'CITATION_TAGS', 'IGNORE_TAGS', 'ARCHIVE_TAGS',
            'VERIFY_DEAD', 'LINK_SCAN',
        );
        foreach( $inputs as $input ) {
            $this->$input = null;
        }
        foreach( $inputs as $input ) {
            unset( $this->$input );
        }
    }
}
This block here in the body of the program calls the threading engine.
// Analyze every page in $pages, either inline (WORKERS === false) or through a
// worker pool, then fold the per-page statistics into the running totals and
// persist the list of archived links to $dlaaLocation.
if( WORKERS === false ) {
    // Single-threaded path: analyze, accumulate and persist after every page.
    foreach( $pages as $tid => $tpage ) {
        $pagesAnalyzed++;
        $stats = analyzePage( $tpage['title'], $tpage['pageid'], $alreadyArchived, $ARCHIVE_ALIVE, $TAG_OVERRIDE, $ARCHIVE_BY_ACCESSDATE, $TOUCH_ARCHIVE, $DEAD_ONLY, $NOTIFY_ERROR_ON_TALK, $NOTIFY_ON_TALK, $TALK_MESSAGE_HEADER, $TALK_MESSAGE, $TALK_ERROR_MESSAGE_HEADER, $TALK_ERROR_MESSAGE, $DEADLINK_TAGS, $CITATION_TAGS, $IGNORE_TAGS, $ARCHIVE_TAGS, $VERIFY_DEAD, $LINK_SCAN );
        if( $stats['pagemodified'] === true ) $pagesModified++;
        $linksAnalyzed += $stats['linksanalyzed'];
        $linksArchived += $stats['linksarchived'];
        $linksFixed += $stats['linksrescued'];
        $linksTagged += $stats['linkstagged'];
        $alreadyArchived = array_merge( $stats['newlyArchived'], $alreadyArchived );
        $failedToArchive = array_merge( $failedToArchive, $stats['archiveProblems'] );
        $allerrors = array_merge( $allerrors, $stats['errors'] );
        file_put_contents( $dlaaLocation, serialize( $alreadyArchived ) );
    }
} else {
    //for( $i = 0; $i < count( $pages ); $i += $workerLimit ) {
    $workerQueue = new Pool( $workerLimit );
    //$tpages = array_slice( $pages, $i, $workerLimit );
    // Every job is constructed with the current $alreadyArchived, and all jobs
    // are submitted before any result is collected.
    // NOTE(review): if each job holds its own copy of that list, memory use
    // grows with the number of queued pages -- confirm against pthreads docs.
    foreach( $pages as $tid => $tpage ) {
        $pagesAnalyzed++;
        echo "Submitted {$tpage['title']}, job ".($tid+1)." for analyzing...
";
        $workerQueue->submit( new ThreadedBot( $tpage['title'], $tpage['pageid'], $alreadyArchived, $ARCHIVE_ALIVE, $TAG_OVERRIDE, $ARCHIVE_BY_ACCESSDATE, $TOUCH_ARCHIVE, $DEAD_ONLY, $NOTIFY_ERROR_ON_TALK, $NOTIFY_ON_TALK, $TALK_MESSAGE_HEADER, $TALK_MESSAGE, $TALK_ERROR_MESSAGE_HEADER, $TALK_ERROR_MESSAGE, $DEADLINK_TAGS, $CITATION_TAGS, $IGNORE_TAGS, $ARCHIVE_TAGS, $VERIFY_DEAD, $LINK_SCAN ) );
    }
    $workerQueue->shutdown();
    $workerQueue->collect(
        function( $thread ) {
            global $pagesModified, $linksAnalyzed, $linksArchived, $linksFixed, $linksTagged, $alreadyArchived, $failedToArchive, $allerrors;
            $stats = $thread->result;
            // A job whose run() never stored a result (for example because its
            // worker died with a fatal error) has no statistics array. Merging
            // it would turn $alreadyArchived into a non-array and the archive
            // file would then be overwritten with that value, so skip it.
            if( is_array( $stats ) ) {
                if( $stats['pagemodified'] === true ) $pagesModified++;
                $linksAnalyzed += $stats['linksanalyzed'];
                $linksArchived += $stats['linksarchived'];
                $linksFixed += $stats['linksrescued'];
                $linksTagged += $stats['linkstagged'];
                $alreadyArchived = array_merge( $stats['newlyArchived'], $alreadyArchived );
                $failedToArchive = array_merge( $failedToArchive, $stats['archiveProblems'] );
                $allerrors = array_merge( $allerrors, $stats['errors'] );
            } else {
                echo "Skipping a job that produced no result.
";
            }
            return $thread->isGarbage();
        });
    echo "!!!!!!!!!!!!!!Links analyzed so far: $linksAnalyzed
";
    file_put_contents( $dlaaLocation, serialize( $alreadyArchived ) );
    //$workerQueue = null;
    //unset( $workerQueue );
    //}
}
As you can see above, the if statement decides whether to multithread or single-thread. Some notes: $workerLimit = 20; all resources initialized in functions are closed, nullified, and unset; there is no memory leak as a result of function calls; memory_limit has been confirmed to be at 1G; workers will eventually crash with an OOM fatal error; the memory allocation seems to be randomly distributed among the workers; each worker gradually uses more and more memory; the script itself reaches 700 MB according to Task Manager before crashing; and finally, the more workers I add, the faster each worker crashes, and 100 workers cause an immediate crash.
Here's a segment of the output.
Analyzed Stanley Hartt (8742961)
Rescued: 0; Tagged dead: 0; Archived: 0; Max System Memory Used: 1.25 MB
PHP Fatal error: Out of memory (allocated 46661632) (tried to allocate 6557907 bytes) in C:\Users\Maximilian Doerr\Documents\GitHub\Cyberbot_II\deadlink.php on line 1259
Fatal error: Out of memory (allocated 46661632) (tried to allocate 6557907 bytes) in C:\Users\Maximilian Doerr\Documents\GitHub\Cyberbot_II\deadlink.php on line 1259
Analyzed High-explosive anti-tank warhead (255968)
Rescued: 0; Tagged dead: 0; Archived: 5; Max System Memory Used: 22.75 MB
PHP Fatal error: Out of memory (allocated 14680064) (tried to allocate 6341940 bytes) in C:\Users\Maximilian Doerr\Documents\GitHub\Cyberbot_II\deadlink.php on line 1261
Fatal error: Out of memory (allocated 14680064) (tried to allocate 6341940 bytes) in C:\Users\Maximilian Doerr\Documents\GitHub\Cyberbot_II\deadlink.php on line 1261
PHP Fatal error: Out of memory (allocated 6291456) (tried to allocate 5243257 bytes) in C:\Users\Maximilian Doerr\Documents\GitHub\Cyberbot_II\deadlink.php on line 1259
Fatal error: Out of memory (allocated 6291456) (tried to allocate 5243257 bytes) in C:\Users\Maximilian Doerr\Documents\GitHub\Cyberbot_II\deadlink.php on line 1259
PHP Fatal error: Out of memory (allocated 7864320) (tried to allocate 5245685 bytes) in C:\Users\Maximilian Doerr\Documents\GitHub\Cyberbot_II\deadlink.php on line 1259
Fatal error: Out of memory (allocated 7864320) (tried to allocate 5245685 bytes) in C:\Users\Maximilian Doerr\Documents\GitHub\Cyberbot_II\deadlink.php on line 1259
Analyzed Nadezhda Tylik (2896780)
Rescued: 0; Tagged dead: 0; Archived: 5; Max System Memory Used: 2.75 MB
This is my first time multithreading so I'm new to this, so I would appreciate any help and suggestions, and if you have more questions, just ask. :-)