dokuwiki/inc/search.php

589 lines
18 KiB
PHP

<?php
/**
* DokuWiki search functions
*
* @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
* @author Andreas Gohr <andi@splitbrain.org>
*/
use dokuwiki\Utf8\PhpString;
use dokuwiki\File\MediaFile;
use dokuwiki\Utf8\Sort;
/**
 * Walk a directory tree and hand every entry to a callback
 *
 * Starting at $base/$dir, every directory entry whose name does not begin
 * with a dot or an underscore is collected. Directories are passed to the
 * callback first (type 'd'); when the callback returns a truthy value the
 * directory is walked recursively with an incremented level. Afterwards the
 * plain files of the current directory are passed to the callback (type 'f').
 *
 * The callback is invoked as $func(&$data, $base, $path, $type, $lvl, $opts)
 * where $path is the entry's path relative to $base.
 *
 * @param array    &$data The results of the search are stored here
 * @param string   $base  Where to start the search
 * @param callback $func  Callback (function name or array with object,method)
 * @param array    $opts  option array will be given to the Callback
 * @param string   $dir   Current directory beyond $base
 * @param int      $lvl   Recursion Level
 * @param mixed    $sort  'natural' to use natural order sorting (default);
 *                        'date' to sort by filemtime; leave empty to skip sorting.
 * @throws RuntimeException when $base is empty or the filesystem root
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function search(&$data, $base, $func, $opts, $dir = '', $lvl = 1, $sort = 'natural')
{
    // safeguard against runaways #1452
    if ($base == '' || $base == '/') {
        throw new RuntimeException('No valid $base passed to search() - possible misconfiguration or bug');
    }

    // a directory that cannot be opened is silently skipped
    $here = $base . '/' . $dir;
    $handle = @opendir($here);
    if (!$handle) return;

    $subdirs = [];    // directories, relative to $base
    $entries = [];    // plain files, relative to $base
    $entryPaths = []; // the same plain files as full paths (needed for mtime sorting)

    while (($name = readdir($handle)) !== false) {
        // skip hidden files and upper dirs
        if (preg_match('/^[\._]/', $name)) continue;

        $relative = $dir . '/' . $name;
        $absolute = $here . '/' . $name;
        if (is_dir($absolute)) {
            $subdirs[] = $relative;
        } else {
            $entries[] = $relative;
            $entryPaths[] = $absolute;
        }
    }
    closedir($handle);

    if (!empty($sort)) {
        if ($sort == 'date') {
            // newest files first; $entries is reordered alongside the mtimes
            @array_multisort(array_map('filemtime', $entryPaths), SORT_NUMERIC, SORT_DESC, $entries);
        } else /* natural */ {
            Sort::asortFN($entries);
        }
        Sort::asortFN($subdirs);
    }

    // directories go to the callback first; it decides whether we descend
    foreach ($subdirs as $subdir) {
        $descend = call_user_func_array($func, [&$data, $base, $subdir, 'd', $lvl, $opts]);
        if ($descend) {
            search($data, $base, $func, $opts, $subdir, $lvl + 1, $sort);
        }
    }

    // then the files of this directory; their return value is ignored
    foreach ($entries as $entry) {
        call_user_func_array($func, [&$data, $base, $entry, 'f', $lvl, $opts]);
    }
}
/**
* The following functions are userfunctions to use with the search
* function above. This function is called for every found file or
* directory. When a directory is given to the function it has to
* decide if this directory should be traversed (true) or not (false)
* The function has to accept the following parameters:
*
* array &$data - Reference to the result data structure
* string $base - Base usually $conf['datadir']
* string $file - current file or directory relative to $base
* string $type - Type either 'd' for directory or 'f' for file
* int $lvl - Current recursion depth
* array $opts - option array as given to search()
*
* return values for files are ignored
*
* All functions should check the ACL for document READ rights
* namespaces (directories) are NOT checked (when sneaky_index is 0) as this
* would break the recursion (You can have a nonreadable dir over a readable
* one deeper nested) also make sure to check the file type (for example
* in case of lockfiles).
*/
/**
 * Searches for pages beginning with the given query
 *
 * Thin wrapper around search_universal() that lists page files whose ID
 * starts with $opts['query'], either at the very beginning of the ID or
 * directly after a namespace separator.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param array   $data
 * @param string  $base
 * @param string  $file
 * @param string  $type
 * @param integer $lvl
 * @param array   $opts only 'query' is read; all other options are replaced
 *
 * @return bool
 */
function search_qsearch(&$data, $base, $file, $type, $lvl, $opts)
{
    $query = (string) ($opts['query'] ?? '');

    $opts = [
        // search_universal() adds the '/' delimiters around idmatch itself, so the
        // expression must not carry a delimiter of its own: a trailing '/' would
        // be parsed as an unknown modifier and the match would always fail
        'idmatch' => '(^|:)' . preg_quote($query, '/'),
        'listfiles' => true,
        'pagesonly' => true
    ];
    return search_universal($data, $base, $file, $type, $lvl, $opts);
}
/**
 * Build the browsable index of pages
 *
 * Wrapper around search_universal() that lists namespaces and (unless
 * $opts['nofiles'] is set) pages. Directories lying on the path of the
 * currently viewed namespace $opts['ns'] get an unlimited depth (0); every
 * other directory gets depth -1, which search_universal() treats as
 * "depth reached", so it is not recursed into.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param array   $data
 * @param string  $base
 * @param string  $file
 * @param string  $type
 * @param integer $lvl
 * @param array   $opts reads 'ns' and 'nofiles'
 *
 * @return bool
 */
function search_index(&$data, $base, $file, $type, $lvl, $opts)
{
    global $conf;

    $currentNs = $opts['ns'] ?? '';

    // Hacky, should rather use recmatch:
    // is $file the current namespace path or one of its ancestors?
    $onPath = preg_match('#^' . preg_quote($file, '#') . '(/|$)#', '/' . $currentNs);

    $universalOpts = [
        'pagesonly' => true,
        'listdirs' => true,
        'listfiles' => empty($opts['nofiles']),
        'sneakyacl' => $conf['sneaky_index'],
        'depth' => $onPath ? 0 : -1,
    ];
    return search_universal($data, $base, $file, $type, $lvl, $universalOpts);
}
/**
 * List all namespaces
 *
 * Delegates to search_universal() with directory listing as the only
 * enabled option; whatever is passed in $opts is discarded.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param array   $data
 * @param string  $base
 * @param string  $file
 * @param string  $type
 * @param integer $lvl
 * @param array   $opts ignored
 *
 * @return bool
 */
function search_namespaces(&$data, $base, $file, $type, $lvl, $opts)
{
    return search_universal($data, $base, $file, $type, $lvl, ['listdirs' => true]);
}
/**
 * List all mediafiles in a namespace
 *
 *   $opts['depth']   recursion level, 0 for all
 *   $opts['showmsg'] shows message if invalid media id is used
 *   $opts['skipacl'] skip acl checking
 *   $opts['pattern'] check given pattern
 *   $opts['hash']    add hashes to result list
 *
 * Every accepted file is appended to $data as an array with the keys
 * id, perm, file, size, mtime, writable and isimg, plus meta (a JpegMeta
 * instance) for jpg/jpeg/gif/png files and hash when requested.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param array   $data
 * @param string  $base
 * @param string  $file
 * @param string  $type
 * @param integer $lvl
 * @param array   $opts
 *
 * @return bool for directories: whether to recurse; for files always false
 */
function search_media(&$data, $base, $file, $type, $lvl, $opts)
{
    // directories are never listed, we only decide whether to descend
    if ($type == 'd') {
        // no depth limit set: recurse forever; otherwise stop once the limit is reached
        return empty($opts['depth']) || !(substr_count($file, '/') >= $opts['depth']);
    }

    $mediaId = pathID($file, true);
    if ($mediaId !== cleanID($mediaId)) {
        if (!empty($opts['showmsg'])) {
            msg(hsc($mediaId) . ' is not a valid file name for DokuWiki - skipped', -1);
        }
        return false; // skip non-valid files
    }

    //check ACL for namespace (we have no ACL for mediafiles)
    $perm = auth_quickaclcheck(getNS($mediaId) . ':*');
    if (empty($opts['skipacl']) && $perm < AUTH_READ) {
        return false;
    }

    //check pattern filter
    if (!empty($opts['pattern']) && !@preg_match($opts['pattern'], $mediaId)) {
        return false;
    }

    $fullPath = $base . '/' . $file;
    $isImage = (bool) preg_match("/\.(jpe?g|gif|png)$/", $file);

    $entry = [
        'id' => $mediaId,
        'perm' => $perm,
        'file' => PhpString::basename($file),
        'size' => filesize($fullPath),
        'mtime' => filemtime($fullPath),
        'writable' => is_writable($fullPath),
        'isimg' => $isImage,
    ];
    if ($isImage) {
        $entry['meta'] = new JpegMeta($fullPath);
    }
    if (!empty($opts['hash'])) {
        $entry['hash'] = md5(io_readFile(mediaFN($mediaId), false));
    }

    $data[] = $entry;
    return false;
}
/**
 * List all mediafiles in a namespace
 *
 *   $opts['depth']   recursion level, 0 for all
 *   $opts['showmsg'] shows message if invalid media id is used
 *   $opts['skipacl'] skip acl checking
 *   $opts['pattern'] check given pattern
 *   $opts['hash']    add hashes to result list (accepted for parity with
 *                    search_media(), but not evaluated by this function)
 *
 * Every accepted file is appended to $data as a MediaFile instance.
 *
 * @todo This is a temporary copy of search_media returning a list of MediaFile instances
 *
 * @param array   $data
 * @param string  $base
 * @param string  $file
 * @param string  $type
 * @param integer $lvl
 * @param array   $opts
 *
 * @return bool for directories: whether to recurse; for files always false
 */
function search_mediafiles(&$data, $base, $file, $type, $lvl, $opts)
{
    //we do nothing with directories
    if ($type == 'd') {
        if (empty($opts['depth'])) return true; // recurse forever
        $depth = substr_count($file, '/');
        if ($depth >= $opts['depth']) return false; // depth reached
        return true;
    }

    $id = pathID($file, true);
    // strict comparison, as in search_media(): loose comparison would treat
    // differing numeric-looking strings as equal
    if ($id !== cleanID($id)) {
        // 'showmsg' is optional, so it must not be read unguarded
        if (!empty($opts['showmsg'])) {
            msg(hsc($id) . ' is not a valid file name for DokuWiki - skipped', -1);
        }
        return false; // skip non-valid files
    }

    //check ACL for namespace (we have no ACL for mediafiles)
    $perm = auth_quickaclcheck(getNS($id) . ':*');
    if (empty($opts['skipacl']) && $perm < AUTH_READ) {
        return false;
    }

    //check pattern filter
    if (!empty($opts['pattern']) && !@preg_match($opts['pattern'], $id)) {
        return false;
    }

    $data[] = new MediaFile($id);
    return false;
}
/**
 * This function just lists documents (for RSS namespace export)
 *
 * Appends ['id' => <page id>] to $data for every readable .txt file.
 * Directories are rejected, so search() never descends below the
 * directory it was started in.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param array   $data
 * @param string  $base
 * @param string  $file
 * @param string  $type
 * @param integer $lvl
 * @param array   $opts
 *
 * @return bool always false
 */
function search_list(&$data, $base, $file, $type, $lvl, $opts)
{
    // no recursion into directories, and only .txt files are of interest
    if ($type == 'd' || !str_ends_with($file, '.txt')) {
        return false;
    }

    // list the page only when the current user may read it
    $pageId = pathID($file);
    if (auth_quickaclcheck($pageId) >= AUTH_READ) {
        $data[] = ['id' => $pageId];
    }
    return false;
}
/**
 * Quicksearch for searching matching pagenames
 *
 * $opts['query'] is the search query. It is matched as a plain substring
 * against the file path relative to $base; matching pages the current user
 * may read are appended to $data as ['id' => <page id>].
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param array   $data
 * @param string  $base
 * @param string  $file
 * @param string  $type
 * @param integer $lvl
 * @param array   $opts
 *
 * @return bool true for directories (always recurse); for files the value is
 *              ignored by search()
 */
function search_pagename(&$data, $base, $file, $type, $lvl, $opts)
{
    // always recurse into directories; skip everything that is not a .txt file
    if ($type == 'd' || !str_ends_with($file, '.txt')) {
        return true;
    }

    // nothing to match without a query
    if (empty($opts['query'])) {
        return true;
    }

    //simple stringmatching
    $needle = (string) $opts['query'];
    if (strpos($file, $needle) === false) {
        return true;
    }

    //check ACL
    $pageId = pathID($file);
    if (auth_quickaclcheck($pageId) < AUTH_READ) {
        return false;
    }

    $data[] = ['id' => $pageId];
    return true;
}
/**
 * Just lists all documents
 *
 *   $opts['depth']   recursion level, 0 for all
 *   $opts['hash']    do md5 sum of content?
 *   $opts['skipacl'] list everything regardless of ACL
 *
 * Each listed page is appended to $data as an array with the keys
 * id, rev, mtime, size and (when requested) hash.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param array   $data
 * @param string  $base
 * @param string  $file
 * @param string  $type
 * @param integer $lvl
 * @param array   $opts
 *
 * @return bool for directories: whether to recurse; for files the value is
 *              ignored by search()
 */
function search_allpages(&$data, $base, $file, $type, $lvl, $opts)
{
    $isDir = ($type == 'd');

    $maxDepth = $opts['depth'] ?? 0;
    if ($maxDepth > 0) {
        $segments = count(explode('/', ltrim($file, '/')));
        // a directory sitting at the limit is not entered anymore,
        // a file is only dropped when it lies beyond the limit
        $tooDeep = $isDir ? ($segments >= $maxDepth) : ($segments > $maxDepth);
        if ($tooDeep) {
            return false; // depth reached
        }
    }

    //we do nothing with directories
    if ($isDir) {
        return true;
    }

    //only search txt files
    if (!str_ends_with($file, '.txt')) {
        return true;
    }

    $pageId = pathID($file);
    if (empty($opts['skipacl']) && auth_quickaclcheck($pageId) < AUTH_READ) {
        return false;
    }

    $fullPath = $base . '/' . $file;
    $modified = filemtime($fullPath);

    // the file's modification time serves as both revision and mtime
    $page = [
        'id' => $pageId,
        'rev' => $modified,
        'mtime' => $modified,
        'size' => filesize($fullPath),
    ];
    if (!empty($opts['hash'])) {
        $page['hash'] = md5(trim(rawWiki($pageId)));
    }

    $data[] = $page;
    return true;
}
/* ------------- helper functions below -------------- */
/**
 * fulltext sort
 *
 * Comparison callback for usort() on the data structure created by
 * search_fulltext. Entries with a higher 'count' come first; entries with
 * the same count are ordered by comparing their 'id' with Sort::strcmp().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param array $a
 * @param array $b
 *
 * @return int
 */
function sort_search_fulltext($a, $b)
{
    // descending by count
    if ($a['count'] > $b['count']) return -1;
    if ($a['count'] < $b['count']) return 1;

    // same count: fall back to the ID
    return Sort::strcmp($a['id'], $b['id']);
}
/**
 * translates a document path to an ID
 *
 * The path is passed through utf8_decodeFN(), slashes become namespace
 * separators, a trailing ".txt" is dropped unless $keeptxt is set, and
 * leading/trailing colons are stripped.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @todo move to pageutils
 *
 * @param string $path
 * @param bool   $keeptxt keep a trailing .txt extension in the ID
 *
 * @return string
 */
function pathID($path, $keeptxt = false)
{
    $id = str_replace('/', ':', utf8_decodeFN($path));
    if (!$keeptxt) {
        $id = preg_replace('#\.txt$#', '', $id);
    }
    return trim($id, ':');
}
/**
 * This is a very universal callback for the search() function, replacing
 * many of the former individual functions at the cost of a more complex
 * setup.
 *
 * How the function behaves, depends on the options passed in the $opts
 * array, where the following settings can be used.
 *
 * depth      int     recursion depth. 0 for unlimited                       (default: 0)
 * keeptxt    bool    keep .txt extension for IDs                            (default: false)
 * listfiles  bool    include files in listing                               (default: false)
 * listdirs   bool    include namespaces in listing                          (default: false)
 * pagesonly  bool    restrict files to pages                                (default: false)
 * skipacl    bool    do not check for READ permission                       (default: false)
 * sneakyacl  bool    don't recurse into nonreadable dirs                    (default: false)
 * hash       bool    create MD5 hash for files                              (default: false)
 * meta       bool    return file metadata                                   (default: false)
 * filematch  string  match files against this regexp                        (default: '', so accept everything)
 * idmatch    string  match full ID against this regexp                      (default: '', so accept everything)
 * dirmatch   string  match directory against this regexp when adding        (default: '', so accept everything)
 * nsmatch    string  match namespace against this regexp when adding        (default: '', so accept everything)
 * recmatch   string  match directory against this regexp when recursing     (default: '', so accept everything)
 * showmsg    bool    warn about non-ID files                                (default: false)
 * showhidden bool    show hidden files(e.g. by hidepages config) too        (default: false)
 * firsthead  bool    return first heading for pages                         (default: false)
 *
 * All *match expressions are wrapped in '/' delimiters by this function and
 * therefore have to be given without delimiters.
 *
 * @param array  &$data - Reference to the result data structure
 * @param string $base  - Base usually $conf['datadir']
 * @param string $file  - current file or directory relative to $base
 * @param string $type  - Type either 'd' for directory or 'f' for file
 * @param int    $lvl   - Current recursion depth
 * @param array  $opts  - option array as given to search()
 * @return bool if this directory should be traversed (true) or not (false)
 *              return value is ignored for files
 *
 * @author Andreas Gohr <gohr@cosmocode.de>
 */
function search_universal(&$data, $base, $file, $type, $lvl, $opts)
{
    $isDir = ($type == 'd');

    // get ID and check if it is a valid one (directories always keep their full name)
    $item = ['id' => pathID($file, ($isDir || !empty($opts['keeptxt'])))];
    if ($item['id'] !== cleanID($item['id'])) {
        if (!empty($opts['showmsg'])) {
            msg(hsc($item['id']) . ' is not a valid file name for DokuWiki - skipped', -1);
        }
        return false; // skip non-valid files
    }
    $item['ns'] = getNS($item['id']);

    // decide whether recursing into this directory is wanted
    $return = true;
    if ($isDir) {
        if (!empty($opts['depth']) && substr_count($file, '/') >= $opts['depth']) {
            $return = false; // depth reached
        }
        // recmatch is only consulted for directories we would otherwise enter
        if ($return && !empty($opts['recmatch']) && !preg_match('/' . $opts['recmatch'] . '/', $file)) {
            return false; // doesn't match
        }
    }

    // check ACL
    if (!empty($opts['skipacl'])) {
        $item['perm'] = AUTH_DELETE;
    } elseif ($isDir) {
        $item['perm'] = auth_quickaclcheck($item['id'] . ':*');
    } else {
        $item['perm'] = auth_quickaclcheck($item['id']); //FIXME check namespace for media files
    }

    // are we done here maybe?
    if ($isDir) {
        if (empty($opts['listdirs'])) {
            return $return;
        }
        //neither list nor recurse forbidden items:
        if (empty($opts['skipacl']) && !empty($opts['sneakyacl']) && $item['perm'] < AUTH_READ) {
            return false;
        }
        if (!empty($opts['dirmatch']) && !preg_match('/' . $opts['dirmatch'] . '/', $file)) {
            return $return;
        }
        if (!empty($opts['nsmatch']) && !preg_match('/' . $opts['nsmatch'] . '/', $item['ns'])) {
            return $return;
        }
    } else {
        if (empty($opts['listfiles'])) {
            return $return;
        }
        if (empty($opts['skipacl']) && $item['perm'] < AUTH_READ) {
            return $return;
        }
        if (!empty($opts['pagesonly']) && !str_ends_with($file, '.txt')) {
            return $return;
        }
        if (empty($opts['showhidden']) && isHiddenPage($item['id'])) {
            return $return;
        }
        if (!empty($opts['filematch']) && !preg_match('/' . $opts['filematch'] . '/', $file)) {
            return $return;
        }
        if (!empty($opts['idmatch']) && !preg_match('/' . $opts['idmatch'] . '/', $item['id'])) {
            return $return;
        }
    }

    // still here? prepare the item
    $item['type'] = $type;
    $item['level'] = $lvl;
    $item['open'] = $return;

    if (!empty($opts['meta'])) {
        $fullPath = $base . '/' . $file;
        $item['file'] = PhpString::basename($file);
        $item['size'] = filesize($fullPath);
        $item['mtime'] = filemtime($fullPath);
        $item['rev'] = $item['mtime'];
        $item['writable'] = is_writable($fullPath);
        $item['executable'] = is_executable($fullPath);
    }

    if ($type == 'f') {
        if (!empty($opts['hash'])) {
            $item['hash'] = md5(io_readFile($base . '/' . $file, false));
        }
        if (!empty($opts['firsthead'])) {
            $item['title'] = p_get_first_heading($item['id'], METADATA_DONT_RENDER);
        }
    }

    // finally add the item
    $data[] = $item;
    return $return;
}
//Setup VIM: ex: et ts=4 :