duandiaoqian5795 2016-01-18 09:24
浏览 23

Sphider索引(pdf,xls,doc,...)mysql中奇怪的lambda字符

im using sphider, to create some kind of fulltext search above shared network drive, im almost done, but i get stucked on this. Im indexing txt,pdf, xls, etc file content, but i noticed, that in mysql DB i have stored some characters called lambda_[number] or some combinations with other words, but always its lambda word. It sounds like some unprintable character, or other little grinch to me.

So i decided to remove this pain by utf8_encode(preg_replace('/[\x00-\x1F\x80-\xFF]/', '', $fileOutput)); but without succes. My DB collation is utf8_general_ci. Any ideas how to escape this little troll?

and whole file parser code for better understand what am i doing and all parsing functions, but issue will be probably with escaping.

    function readTextFile($filePath) {
    $fileContent = file_get_contents($filePath);

    return $fileContent;
}

// -------------- START PARSE MS OFFICE FILES BLOCK 

function parsePPT($filename) {
// This approach uses detection of the string "chr(0f).Hex_value.chr(0x00).chr(0x00).chr(0x00)" to find text strings, which are then terminated by another NUL chr(0x00). [1] Get text between delimiters [2]
    $fileHandle = fopen($filename, "r");
    $line = @fread($fileHandle, filesize($filename));
    $lines = explode(chr(0x0f), $line);
    $outtext = '';

    foreach ($lines as $thisline) {
        if (strpos($thisline, chr(0x00) . chr(0x00) . chr(0x00)) == 1) {
            $text_line = substr($thisline, 4);
            $end_pos = strpos($text_line, chr(0x00));
            $text_line = substr($text_line, 0, $end_pos);
            $text_line = preg_replace("/[^a-zA-Z0-9\s\,\.\-
\t@\/\_\(\)]/", "", $text_line);
            if (strlen($text_line) > 1) {
                $outtext.= substr($text_line, 0, $end_pos) . "
";
            }
        }
    }
    return $outtext;
}

function pptx2text($filename) {
    $zip = new ZipArchive;

    // Open received archive file
    if (true === $zip->open($filename)) {

        // If done, search for the data file in the archive
        $dia = 1;
        $data = array();
        $output = "";

        while (($index = $zip->locateName("ppt/slides/slide$dia.xml") ) !== false) {
            $data[$dia] = $zip->getFromIndex($index);
            $xml = str_replace("</a:t>", " </a:t>", $data[$dia]);
            $output.=$xml;
            $dia++;
        }

        $zip->close();
        return strip_tags($output);
    } else {
        return "";
    }
}

function xlsx2text($filename) {
    $zip = new ZipArchive;
    // Open received archive file
    if (true === $zip->open($filename)) {

        // If done, search for the data file in the archive
        $dia = 1;
        $data = array();
        $output = "";

        while (($index = $zip->locateName("xl/worksheets/sheet$dia.xml") ) !== false) {
            $data[$dia] = $zip->getFromIndex($index);
            //$pageContent .= $data[$dia];
            $xml = str_replace("</a:t>", " </a:t>", $data[$dia]);
            $output.=$xml;
            $dia++;
        }

        $zip->close();
        return strip_tags($output);
    } else {
        return "";
    }
}

function docx2text($filename) {
    return readZippedXML($filename, "word/document.xml");
}

function readZippedXML($archiveFile, $dataFile) {
    // Create new ZIP archive  
    $zip = new ZipArchive;

    // Open received archive file  
    if (true === $zip->open($archiveFile)) {
        // If done, search for the data file in the archive  
        if (($index = $zip->locateName($dataFile)) !== false) {
            // If found, read it to the string  
            $data = $zip->getFromIndex($index);
            $data = str_replace("></", "> </", $data);
            // Close archive file
            $zip->close();
            // Load XML from a string  
            // Skip errors and warnings  
            $xml = new DOMDocument();
            $xml->loadXML($data, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
            //$xml = DOMDocument::loadXML($data, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
            // Return data without XML formatting tags
            return strip_tags($xml->saveXML());
            return $xml;
        }
        $zip->close();
    }
    // In case of failure return empty string  
    return "";
}



function parsePDF($fileName){
    require('tools/pdf2text.php');
    $pdfClass = new PDF2Text();
    $pdfClass->setFilename($fileName);
    $pdfClass->decodePDF();
    return $pdfClass->output();
}

// -------------- END PARSE MS OFFICE FILES BLOCK 


$fileType = filter_input(INPUT_GET, 'fileType');
$filePath = filter_input(INPUT_GET, 'filePath');

$serverUri = $_SERVER['REQUEST_URI'];
$_SERVER['REQUEST_URI'] = "testval";
$secondUri = $_SERVER['REQUEST_URI'];
$fileTitle = trim(str_replace("\\", " ",$filePath));

$fileOutput = "<html><head><title>".$fileTitle."</title></head><body>";

switch ($fileType) {
    case 'txt':
        $fileOutput .= readTextFile($filePath);
        break;
    case 'pptx':
        $fileOutput .= pptx2text($filePath);
        break;
    case 'docx':
        $fileOutput .= docx2text($filePath);
        break;
    case 'xlsx':
        $fileOutput .= xlsx2text($filePath);
        break;
    case 'ppt':
        $fileOutput .= parsePPT($filePath);
        break;
    case 'pdf':
        $fileOutput .= parsePDF($filePath);
        break;
    default:
        return false;
}
$fileOutput .= "</body>";
echo utf8_encode(preg_replace('/[\x00-\x1F\x80-\xFF]/', '', $fileOutput));
  • 写回答

0条回答 默认 最新

    报告相同问题?

    悬赏问题

    • ¥15 关于#matlab#的问题:在模糊控制器中选出线路信息,在simulink中根据线路信息生成速度时间目标曲线(初速度为20m/s,15秒后减为0的速度时间图像)我想问线路信息是什么
    • ¥15 banner广告展示设置多少时间不怎么会消耗用户价值
    • ¥16 mybatis的代理对象无法通过@Autowired装填
    • ¥15 可见光定位matlab仿真
    • ¥15 arduino 四自由度机械臂
    • ¥15 wordpress 产品图片 GIF 没法显示
    • ¥15 求三国群英传pl国战时间的修改方法
    • ¥15 matlab代码代写,需写出详细代码,代价私
    • ¥15 ROS系统搭建请教(跨境电商用途)
    • ¥15 AIC3204的示例代码有吗,想用AIC3204测量血氧,找不到相关的代码。