Indexer v3 Rewrite part two, update uses of indexer
This commit is contained in:
parent
00803e5628
commit
9b41be2446
|
@ -24,6 +24,7 @@ if ( $OPTS->isError() ) {
|
|||
}
|
||||
$CLEAR = false;
|
||||
$QUIET = false;
|
||||
$INDEXER = null;
|
||||
foreach ($OPTS->options as $key => $val) {
|
||||
switch ($key) {
|
||||
case 'h':
|
||||
|
@ -66,6 +67,9 @@ function _usage() {
|
|||
|
||||
function _update(){
|
||||
global $conf;
|
||||
global $INDEXER;
|
||||
|
||||
$INDEXER = idx_get_indexer();
|
||||
|
||||
$data = array();
|
||||
_quietecho("Searching pages... ");
|
||||
|
@ -78,25 +82,47 @@ function _update(){
|
|||
}
|
||||
|
||||
function _index($id){
|
||||
global $INDEXER;
|
||||
global $CLEAR;
|
||||
global $QUIET;
|
||||
|
||||
// if not cleared only update changed and new files
|
||||
if(!$CLEAR){
|
||||
$idxtag = metaFN($id,'.indexed');
|
||||
if(@file_exists($idxtag)){
|
||||
if(io_readFile($idxtag) == idx_get_version()){
|
||||
$last = @filemtime(metaFN($id,'.indexed'));
|
||||
$last = @filemtime($idxtag);
|
||||
if($last > @filemtime(wikiFN($id))) return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
_lock();
|
||||
_quietecho("$id... ");
|
||||
idx_addPage($id);
|
||||
io_saveFile(metaFN($id,'.indexed'), idx_get_version());
|
||||
$body = '';
|
||||
$data = array($id, $body);
|
||||
$evt = new Doku_Event('INDEXER_PAGE_ADD', $data);
|
||||
if ($evt->advise_before()) $data[1] = $data[1] . " " . rawWiki($id);
|
||||
$evt->advise_after();
|
||||
unset($evt);
|
||||
list($id,$body) = $data;
|
||||
$said = false;
|
||||
while(true) {
|
||||
$result = $INDEXER->addPageWords($id, $body);
|
||||
if ($result == "locked") {
|
||||
if($said){
|
||||
_quietecho(".");
|
||||
}else{
|
||||
_quietecho("Waiting for lockfile (max. 5 min)");
|
||||
$said = true;
|
||||
}
|
||||
sleep(15);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ($result)
|
||||
io_saveFile(metaFN($id,'.indexed'), idx_get_version());
|
||||
_quietecho("done.\n");
|
||||
_unlock();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -141,7 +167,7 @@ function _clearindex(){
|
|||
_lock();
|
||||
_quietecho("Clearing index... ");
|
||||
io_saveFile($conf['indexdir'].'/page.idx','');
|
||||
io_saveFile($conf['indexdir'].'/title.idx','');
|
||||
//io_saveFile($conf['indexdir'].'/title.idx','');
|
||||
$dir = @opendir($conf['indexdir']);
|
||||
if($dir!==false){
|
||||
while(($f = readdir($dir)) !== false){
|
||||
|
@ -150,6 +176,7 @@ function _clearindex(){
|
|||
@unlink($conf['indexdir']."/$f");
|
||||
}
|
||||
}
|
||||
@unlink($conf['indexdir'].'/lengths.idx');
|
||||
_quietecho("done.\n");
|
||||
_unlock();
|
||||
}
|
||||
|
|
|
@ -45,7 +45,7 @@ class Sitemapper {
|
|||
|
||||
dbglog("Sitemapper::generate(): using $sitemap"); // FIXME: Only in debug mode
|
||||
|
||||
$pages = idx_getIndex('page', '');
|
||||
$pages = idx_get_indexer()->getPages();
|
||||
dbglog('Sitemapper::generate(): creating sitemap using '.count($pages).' pages');
|
||||
$items = array();
|
||||
|
||||
|
|
|
@ -36,19 +36,21 @@ function ft_pageSearch($query,&$highlight){
|
|||
* @author Kazutaka Miyasaka <kazmiya@gmail.com>
|
||||
*/
|
||||
function _ft_pageSearch(&$data) {
|
||||
$Indexer = idx_get_indexer();
|
||||
|
||||
// parse the given query
|
||||
$q = ft_queryParser($data['query']);
|
||||
$q = ft_queryParser($Indexer, $data['query']);
|
||||
$data['highlight'] = $q['highlight'];
|
||||
|
||||
if (empty($q['parsed_ary'])) return array();
|
||||
|
||||
// lookup all words found in the query
|
||||
$lookup = idx_lookup($q['words']);
|
||||
$lookup = $Indexer->lookup($q['words']);
|
||||
|
||||
// get all pages in this dokuwiki site (!: includes nonexistent pages)
|
||||
$pages_all = array();
|
||||
foreach (idx_getIndex('page', '') as $id) {
|
||||
$pages_all[trim($id)] = 0; // base: 0 hit
|
||||
foreach ($Indexer->getPages() as $id) {
|
||||
$pages_all[$id] = 0; // base: 0 hit
|
||||
}
|
||||
|
||||
// process the query
|
||||
|
@ -126,15 +128,12 @@ function _ft_pageSearch(&$data) {
|
|||
* evaluates the instructions of the found pages
|
||||
*/
|
||||
function ft_backlinks($id){
|
||||
global $conf;
|
||||
$swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
|
||||
$stopwords = @file_exists($swfile) ? file($swfile) : array();
|
||||
|
||||
$result = array();
|
||||
|
||||
// quick lookup of the pagename
|
||||
// FIXME use metadata key lookup
|
||||
$page = noNS($id);
|
||||
$matches = idx_lookup(idx_tokenizer($page,$stopwords)); // pagename may contain specials (_ or .)
|
||||
$matches = idx_lookup(idx_tokenizer($page)); // pagename may contain specials (_ or .)
|
||||
$docs = array_keys(ft_resultCombine(array_values($matches)));
|
||||
$docs = array_filter($docs,'isVisiblePage'); // discard hidden pages
|
||||
if(!count($docs)) return $result;
|
||||
|
@ -168,17 +167,14 @@ function ft_backlinks($id){
|
|||
* Aborts after $max found results
|
||||
*/
|
||||
function ft_mediause($id,$max){
|
||||
global $conf;
|
||||
$swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
|
||||
$stopwords = @file_exists($swfile) ? file($swfile) : array();
|
||||
|
||||
if(!$max) $max = 1; // need to find at least one
|
||||
|
||||
$result = array();
|
||||
|
||||
// quick lookup of the mediafile
|
||||
// FIXME use metadata key lookup
|
||||
$media = noNS($id);
|
||||
$matches = idx_lookup(idx_tokenizer($media,$stopwords));
|
||||
$matches = idx_lookup(idx_tokenizer($media));
|
||||
$docs = array_keys(ft_resultCombine(array_values($matches)));
|
||||
if(!count($docs)) return $result;
|
||||
|
||||
|
@ -229,7 +225,6 @@ function ft_pageLookup($id, $in_ns=false, $in_title=false){
|
|||
}
|
||||
|
||||
function _ft_pageLookup(&$data){
|
||||
global $conf;
|
||||
// split out original parameters
|
||||
$id = $data['id'];
|
||||
if (preg_match('/(?:^| )@(\w+)/', $id, $matches)) {
|
||||
|
@ -239,29 +234,27 @@ function _ft_pageLookup(&$data){
|
|||
|
||||
$in_ns = $data['in_ns'];
|
||||
$in_title = $data['in_title'];
|
||||
|
||||
$pages = array_map('rtrim', idx_getIndex('page', ''));
|
||||
$titles = array_map('rtrim', idx_getIndex('title', ''));
|
||||
// check for corrupt title index #FS2076
|
||||
if(count($pages) != count($titles)){
|
||||
$titles = array_fill(0,count($pages),'');
|
||||
@unlink($conf['indexdir'].'/title.idx'); // will be rebuilt in inc/init.php
|
||||
}
|
||||
$pages = array_combine($pages, $titles);
|
||||
|
||||
$cleaned = cleanID($id);
|
||||
|
||||
$Indexer = idx_get_indexer();
|
||||
$page_idx = $Indexer->getPages();
|
||||
|
||||
$pages = array();
|
||||
if ($id !== '' && $cleaned !== '') {
|
||||
foreach ($pages as $p_id => $p_title) {
|
||||
if ((strpos($in_ns ? $p_id : noNSorNS($p_id), $cleaned) === false) &&
|
||||
(!$in_title || (stripos($p_title, $id) === false)) ) {
|
||||
unset($pages[$p_id]);
|
||||
foreach ($page_idx as $p_id) {
|
||||
if ((strpos($in_ns ? $p_id : noNSorNS($p_id), $cleaned) !== false)) {
|
||||
if (!isset($pages[$p_id]))
|
||||
$pages[$p_id] = p_get_first_heading($p_id, false);
|
||||
}
|
||||
}
|
||||
//if ($in_title)
|
||||
// $titles = $Indexer->lookupKey('title', "*$id*");
|
||||
}
|
||||
if (isset($ns)) {
|
||||
foreach (array_keys($pages) as $p_id) {
|
||||
if (strpos($p_id, $ns) !== 0) {
|
||||
unset($pages[$p_id]);
|
||||
foreach ($page_idx as $p_id) {
|
||||
if (strpos($p_id, $ns) === 0) {
|
||||
if (!isset($pages[$p_id]))
|
||||
$pages[$p_id] = p_get_first_heading($p_id, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -494,11 +487,7 @@ function ft_resultComplement($args) {
|
|||
* @author Andreas Gohr <andi@splitbrain.org>
|
||||
* @author Kazutaka Miyasaka <kazmiya@gmail.com>
|
||||
*/
|
||||
function ft_queryParser($query){
|
||||
global $conf;
|
||||
$swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
|
||||
$stopwords = @file_exists($swfile) ? file($swfile) : array();
|
||||
|
||||
function ft_queryParser($Indexer, $query){
|
||||
/**
|
||||
* parse a search query and transform it into intermediate representation
|
||||
*
|
||||
|
@ -544,7 +533,7 @@ function ft_queryParser($query){
|
|||
if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) {
|
||||
// phrase-include and phrase-exclude
|
||||
$not = $matches[1] ? 'NOT' : '';
|
||||
$parsed = $not.ft_termParser($matches[2], $stopwords, false, true);
|
||||
$parsed = $not.ft_termParser($Indexer, $matches[2], false, true);
|
||||
} else {
|
||||
// fix incomplete phrase
|
||||
$term = str_replace('"', ' ', $term);
|
||||
|
@ -591,10 +580,10 @@ function ft_queryParser($query){
|
|||
$parsed .= '(N+:'.$matches[1].')';
|
||||
} elseif (preg_match('/^-(.+)$/', $token, $matches)) {
|
||||
// word-exclude
|
||||
$parsed .= 'NOT('.ft_termParser($matches[1], $stopwords).')';
|
||||
$parsed .= 'NOT('.ft_termParser($Indexer, $matches[1]).')';
|
||||
} else {
|
||||
// word-include
|
||||
$parsed .= ft_termParser($token, $stopwords);
|
||||
$parsed .= ft_termParser($Indexer, $token);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -728,18 +717,18 @@ function ft_queryParser($query){
|
|||
*
|
||||
* @author Kazutaka Miyasaka <kazmiya@gmail.com>
|
||||
*/
|
||||
function ft_termParser($term, &$stopwords, $consider_asian = true, $phrase_mode = false) {
|
||||
function ft_termParser($Indexer, $term, $consider_asian = true, $phrase_mode = false) {
|
||||
$parsed = '';
|
||||
if ($consider_asian) {
|
||||
// successive asian characters need to be searched as a phrase
|
||||
$words = preg_split('/('.IDX_ASIAN.'+)/u', $term, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
|
||||
foreach ($words as $word) {
|
||||
if (preg_match('/'.IDX_ASIAN.'/u', $word)) $phrase_mode = true;
|
||||
$parsed .= ft_termParser($word, $stopwords, false, $phrase_mode);
|
||||
$parsed .= ft_termParser($Indexer, $word, false, $phrase_mode);
|
||||
}
|
||||
} else {
|
||||
$term_noparen = str_replace(array('(', ')'), ' ', $term);
|
||||
$words = idx_tokenizer($term_noparen, $stopwords, true);
|
||||
$words = $Indexer->tokenizer($term_noparen, true);
|
||||
|
||||
// W_: no need to highlight
|
||||
if (empty($words)) {
|
||||
|
|
|
@ -97,7 +97,8 @@ class Doku_Indexer {
|
|||
* @author Andreas Gohr <andi@splitbrain.org>
|
||||
*/
|
||||
public function addPageWords($page, $text) {
|
||||
$this->_lock();
|
||||
if (!$this->_lock())
|
||||
return "locked";
|
||||
|
||||
// load known documents
|
||||
$page_idx = $this->_addIndexKey('page', '', $page);
|
||||
|
@ -348,12 +349,12 @@ class Doku_Indexer {
|
|||
* in the returned list is an array with the page names as keys and the
|
||||
* number of times that token appears on the page as value.
|
||||
*
|
||||
* @param array $tokens list of words to search for
|
||||
* @param arrayref $tokens list of words to search for
|
||||
* @return array list of page names with usage counts
|
||||
* @author Tom N Harris <tnharris@whoopdedo.org>
|
||||
* @author Andreas Gohr <andi@splitbrain.org>
|
||||
*/
|
||||
public function lookup($tokens) {
|
||||
public function lookup(&$tokens) {
|
||||
$result = array();
|
||||
$wids = $this->_getIndexWords($tokens, $result);
|
||||
if (empty($wids)) return array();
|
||||
|
@ -397,10 +398,11 @@ class Doku_Indexer {
|
|||
* @param string $key name of the metadata key to look for
|
||||
* @param string $value search term to look for
|
||||
* @param callback $func comparison function
|
||||
* @return array list with page names
|
||||
* @return array list with page names, keys are query values if more than one given
|
||||
* @author Tom N Harris <tnharris@whoopdedo.org>
|
||||
*/
|
||||
public function lookupKey($key, $value, $func=null) {
|
||||
return array();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -411,12 +413,12 @@ class Doku_Indexer {
|
|||
* The $result parameter can be used to merge the index locations with
|
||||
* the appropriate query term.
|
||||
*
|
||||
* @param array $words The query terms.
|
||||
* @param arrayref $words The query terms.
|
||||
* @param arrayref $result Set to word => array("length*id" ...)
|
||||
* @return array Set to length => array(id ...)
|
||||
* @author Tom N Harris <tnharris@whoopdedo.org>
|
||||
*/
|
||||
private function _getIndexWords($words, &$result) {
|
||||
private function _getIndexWords(&$words, &$result) {
|
||||
$tokens = array();
|
||||
$tokenlength = array();
|
||||
$tokenwild = array();
|
||||
|
@ -807,7 +809,7 @@ class Doku_Indexer {
|
|||
* @return object a Doku_Indexer
|
||||
* @author Tom N Harris <tnharris@whoopdedo.org>
|
||||
*/
|
||||
function & idx_get_indexer() {
|
||||
function idx_get_indexer() {
|
||||
static $Indexer = null;
|
||||
if (is_null($Indexer)) {
|
||||
$Indexer = new Doku_Indexer();
|
||||
|
@ -841,10 +843,23 @@ function & idx_get_stopwords() {
|
|||
* Locking is handled internally.
|
||||
*
|
||||
* @param string $page name of the page to index
|
||||
* @param boolean $verbose print status messages
|
||||
* @return boolean the function completed successfully
|
||||
* @author Tom N Harris <tnharris@whoopdedo.org>
|
||||
*/
|
||||
function idx_addPage($page) {
|
||||
function idx_addPage($page, $verbose=false) {
|
||||
// check if indexing needed
|
||||
$idxtag = metaFN($page,'.indexed');
|
||||
if(@file_exists($idxtag)){
|
||||
if(trim(io_readFile($idxtag)) == idx_get_version()){
|
||||
$last = @filemtime($idxtag);
|
||||
if($last > @filemtime(wikiFN($ID))){
|
||||
if ($verbose) print("Indexer: index for $page up to date".DOKU_LF);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
$body = '';
|
||||
$data = array($page, $body);
|
||||
$evt = new Doku_Event('INDEXER_PAGE_ADD', $data);
|
||||
|
@ -853,8 +868,19 @@ function idx_addPage($page) {
|
|||
unset($evt);
|
||||
list($page,$body) = $data;
|
||||
|
||||
$Indexer =& idx_get_indexer();
|
||||
return $Indexer->addPageWords($page, $body);
|
||||
$Indexer = idx_get_indexer();
|
||||
$result = $Indexer->addPageWords($page, $body);
|
||||
if ($result == "locked") {
|
||||
if ($verbose) print("Indexer: locked".DOKU_LF);
|
||||
return false;
|
||||
}
|
||||
if ($result)
|
||||
io_saveFile(metaFN($page,'.indexed'), idx_get_version());
|
||||
if ($verbose) {
|
||||
print("Indexer: finished".DOKU_LF);
|
||||
return true;
|
||||
}
|
||||
return $result;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -866,11 +892,11 @@ function idx_addPage($page) {
|
|||
* Important: No ACL checking is done here! All results are
|
||||
* returned, regardless of permissions
|
||||
*
|
||||
* @param array $words list of words to search for
|
||||
* @param arrayref $words list of words to search for
|
||||
* @return array list of pages found, associated with the search terms
|
||||
*/
|
||||
function idx_lookup($words) {
|
||||
$Indexer =& idx_get_indexer();
|
||||
function idx_lookup(&$words) {
|
||||
$Indexer = idx_get_indexer();
|
||||
return $Indexer->lookup($words);
|
||||
}
|
||||
|
||||
|
@ -879,7 +905,7 @@ function idx_lookup($words) {
|
|||
*
|
||||
*/
|
||||
function idx_tokenizer($string, $wc=false) {
|
||||
$Indexer =& idx_get_indexer();
|
||||
$Indexer = idx_get_indexer();
|
||||
return $Indexer->tokenizer($string, $wc);
|
||||
}
|
||||
|
||||
|
|
|
@ -276,6 +276,7 @@ function init_files(){
|
|||
}
|
||||
|
||||
# create title index (needs to have same length as page.idx)
|
||||
/*
|
||||
$file = $conf['indexdir'].'/title.idx';
|
||||
if(!@file_exists($file)){
|
||||
$pages = file($conf['indexdir'].'/page.idx');
|
||||
|
@ -290,6 +291,7 @@ function init_files(){
|
|||
nice_die("$file is not writable. Check your permissions settings!");
|
||||
}
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -134,41 +134,8 @@ function runIndexer(){
|
|||
|
||||
if(!$ID) return false;
|
||||
|
||||
// check if indexing needed
|
||||
$idxtag = metaFN($ID,'.indexed');
|
||||
if(@file_exists($idxtag)){
|
||||
if(trim(io_readFile($idxtag)) == idx_get_version()){
|
||||
$last = @filemtime($idxtag);
|
||||
if($last > @filemtime(wikiFN($ID))){
|
||||
print "runIndexer(): index for $ID up to date".NL;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// try to aquire a lock
|
||||
$lock = $conf['lockdir'].'/_indexer.lock';
|
||||
while(!@mkdir($lock,$conf['dmode'])){
|
||||
usleep(50);
|
||||
if(time()-@filemtime($lock) > 60*5){
|
||||
// looks like a stale lock - remove it
|
||||
@rmdir($lock);
|
||||
print "runIndexer(): stale lock removed".NL;
|
||||
}else{
|
||||
print "runIndexer(): indexer locked".NL;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if($conf['dperm']) chmod($lock, $conf['dperm']);
|
||||
|
||||
// do the work
|
||||
idx_addPage($ID);
|
||||
|
||||
// we're finished - save and free lock
|
||||
io_saveFile(metaFN($ID,'.indexed'), idx_get_version());
|
||||
@rmdir($lock);
|
||||
print "runIndexer(): finished".NL;
|
||||
return true;
|
||||
return idx_addPage($ID, true);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -355,9 +355,8 @@ class dokuwiki_xmlrpc_server extends IXR_IntrospectionServer {
|
|||
*/
|
||||
function listPages(){
|
||||
$list = array();
|
||||
$pages = array_filter(array_filter(idx_getIndex('page', ''),
|
||||
'isVisiblePage'),
|
||||
'page_exists');
|
||||
$pages = idx_get_indexer()->getPages();
|
||||
$pages = array_filter(array_filter($pages,'isVisiblePage'),'page_exists');
|
||||
|
||||
foreach(array_keys($pages) as $idx) {
|
||||
$perm = auth_quickaclcheck($pages[$idx]);
|
||||
|
@ -552,27 +551,7 @@ class dokuwiki_xmlrpc_server extends IXR_IntrospectionServer {
|
|||
unlock($id);
|
||||
|
||||
// run the indexer if page wasn't indexed yet
|
||||
if(!@file_exists(metaFN($id, '.indexed'))) {
|
||||
// try to aquire a lock
|
||||
$lock = $conf['lockdir'].'/_indexer.lock';
|
||||
while(!@mkdir($lock,$conf['dmode'])){
|
||||
usleep(50);
|
||||
if(time()-@filemtime($lock) > 60*5){
|
||||
// looks like a stale lock - remove it
|
||||
@rmdir($lock);
|
||||
}else{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if($conf['dperm']) chmod($lock, $conf['dperm']);
|
||||
|
||||
// do the work
|
||||
idx_addPage($id);
|
||||
|
||||
// we're finished - save and free lock
|
||||
io_saveFile(metaFN($id,'.indexed'), idx_get_version());
|
||||
@rmdir($lock);
|
||||
}
|
||||
idx_addPage($id);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue