added stopword support to the indexer, added indexer webbug

darcs-hash:20050814181035-7ad00-ed5d879d29fcee7f925f806456675605b058966a.gz
This commit is contained in:
Andreas Gohr 2005-08-14 20:10:35 +02:00
parent 48665d389b
commit 7367b36877
17 changed files with 1334 additions and 3 deletions

View File

@ -24,6 +24,12 @@
function idx_getPageWords($page){
global $conf;
$word_idx = file($conf['cachedir'].'/word.idx');
$swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
if(@file_exists($swfile)){
$stopwords = file($swfile);
}else{
$stopwords = array();
}
// split page into words
$body = rawWiki($page);
@ -60,13 +66,17 @@ function idx_getPageWords($page){
// checking minimum word-size (excepting numbers)
if(!is_numeric($word)) {
if(strlen($word) < 3) { #FIXME add config option for max wordsize
if(strlen($word) < 3) {
$doit = false;
continue;
}
}
//FIXME add stopword check
// stopword check
if(is_int(array_search("$word\n",$stopwords))){
$doit = false;
continue;
}
// get word ID
$wid = array_search("$word\n",$word_idx);

View File

@ -71,6 +71,7 @@
// make real paths and check them
init_paths();
init_files();
// automatic upgrade to script versions of certain files
scriptify(DOKU_CONF.'users.auth');
@ -92,13 +93,29 @@ function init_paths(){
'changelog' => 'changes.log');
foreach($paths as $c => $p){
if(!$conf[$c]) $conf[$c] = $conf['savedir'].'/'.$p;
$conf[$c] = init_path($conf[$c]);
if(!$conf[$c]) die("$c does not exist or isn't writable. Check config!");
}
}
/**
 * Checks the existence of certain files and creates them if missing
 *
 * Makes sure the index files word.idx, page.idx and index.idx are
 * present in $conf['cachedir']. Missing files are created empty;
 * files that already exist are left untouched.
 *
 * A file that can not be created is skipped: fopen() reports the
 * problem itself and the remaining files are still processed.
 */
function init_files(){
  global $conf;

  $files = array( $conf['cachedir'].'/word.idx',
                  $conf['cachedir'].'/page.idx',
                  $conf['cachedir'].'/index.idx' );

  foreach($files as $file){
    if(@file_exists($file)) continue;

    // append mode creates the file without ever truncating it
    $fh = fopen($file,'a');
    // fopen() returns false on failure - never hand that to fclose()
    if($fh === false) continue;
    fclose($fh);
  }
}
/**
* returns absolute path
*

115
inc/lang/cs/stopwords.txt Normal file
View File

@ -0,0 +1,115 @@
# This is a list of words the indexer ignores, one word per line
# When you edit this file be sure to use UNIX line endings (single newline)
# No need to include words shorter than 3 chars - these are ignored anyway
# This list is based upon the ones found at http://www.ranks.nl/stopwords/
dnes
timto
budes
budem
byli
jses
muj
svym
tomto
tohle
tuto
tyto
jej
zda
proc
mate
tato
kam
tohoto
kdo
kteri
nam
tom
tomuto
mit
nic
proto
kterou
byla
toho
protoze
asi
nasi
napiste
coz
tim
takze
svych
jeji
svymi
jste
tedy
teto
bylo
kde
prave
nad
nejsou
pod
tema
mezi
pres
pak
vam
ani
kdyz
vsak
jsem
tento
clanku
clanky
aby
jsme
pred
pta
jejich
byl
jeste
bez
take
pouze
prvni
vase
ktera
nas
novy
tipy
pokud
muze
design
strana
jeho
sve
jine
zpravy
nove
neni
vas
jen
podle
zde
clanek
email
byt
vice
bude
jiz
nez
ktery
ktere
nebo
ten
tak
pri
jsou
jak
dalsi
ale
jako
zpet
pro

88
inc/lang/da/stopwords.txt Normal file
View File

@ -0,0 +1,88 @@
# This is a list of words the indexer ignores, one word per line
# When you edit this file be sure to use UNIX line endings (single newline)
# No need to include words shorter than 3 chars - these are ignored anyway
# This list is based upon the ones found at http://www.ranks.nl/stopwords/
alle
andet
andre
begge
den
denne
der
deres
det
dette
dig
din
dog
eller
end
ene
eneste
enhver
fem
fire
flere
fleste
for
fordi
forrige
fra
før
god
han
hans
har
hendes
her
hun
hvad
hvem
hver
hvilken
hvis
hvor
hvordan
hvorfor
hvornår
ikke
ind
ingen
intet
jeg
jeres
kan
kom
kommer
lav
lidt
lille
man
mand
mange
med
meget
men
mens
mere
mig
ned
nogen
noget
nyt
nær
næste
næsten
otte
over
seks
ses
som
stor
store
syv
til
tre
var

122
inc/lang/de/stopwords.txt Normal file
View File

@ -0,0 +1,122 @@
# This is a list of words the indexer ignores, one word per line
# When you edit this file be sure to use UNIX line endings (single newline)
# No need to include words shorter than 3 chars - these are ignored anyway
# This list is based upon the ones found at http://www.ranks.nl/stopwords/
aber
als
auch
auf
aus
bei
bin
bis
bist
dadurch
daher
darum
das
daß
dass
dein
deine
dem
den
der
des
dessen
deshalb
die
dies
dieser
dieses
doch
dort
durch
ein
eine
einem
einen
einer
eines
euer
eure
für
hatte
hatten
hattest
hattet
hier
hinter
ich
ihr
ihre
ist
jede
jedem
jeden
jeder
jedes
jener
jenes
jetzt
kann
kannst
können
könnt
machen
mein
meine
mit
muß
mußt
musst
müssen
müßt
nach
nachdem
nein
nicht
nun
oder
seid
sein
seine
sich
sie
sind
soll
sollen
sollst
sollt
sonst
soweit
sowie
und
unser
unsere
unter
vom
von
vor
wann
warum
was
weiter
weitere
wenn
wer
werde
werden
werdet
weshalb
wie
wieder
wieso
wir
wird
wirst
woher
wohin
zum
zur
über

28
inc/lang/en/stopwords.txt Normal file
View File

@ -0,0 +1,28 @@
# This is a list of words the indexer ignores, one word per line
# When you edit this file be sure to use UNIX line endings (single newline)
# No need to include words shorter than 3 chars - these are ignored anyway
# This list is based upon the ones found at http://www.ranks.nl/stopwords/
about
are
and
you
your
them
their
com
for
from
how
that
the
this
was
what
when
where
who
will
with
und
the
www

171
inc/lang/es/stopwords.txt Normal file
View File

@ -0,0 +1,171 @@
# This is a list of words the indexer ignores, one word per line
# When you edit this file be sure to use UNIX line endings (single newline)
# No need to include words shorter than 3 chars - these are ignored anyway
# This list is based upon the ones found at http://www.ranks.nl/stopwords/
una
unas
unos
uno
sobre
todo
también
tras
otro
algún
alguno
alguna
algunos
algunas
ser
soy
eres
somos
sois
estoy
esta
estamos
estais
estan
como
para
atras
porque
por
qué
estado
estaba
ante
antes
siendo
ambos
pero
poder
puede
puedo
podemos
podeis
pueden
fui
fue
fuimos
fueron
hacer
hago
hace
hacemos
haceis
hacen
cada
fin
incluso
primero
desde
conseguir
consigo
consigue
consigues
conseguimos
consiguen
voy
va
vamos
vais
van
vaya
gueno
tener
tengo
tiene
tenemos
teneis
tienen
las
los
aqui
mio
tuyo
ellos
ellas
nos
nosotros
vosotros
vosotras
dentro
solo
solamente
saber
sabes
sabe
sabemos
sabeis
saben
ultimo
largo
bastante
haces
muchos
aquellos
aquellas
sus
entonces
tiempo
verdad
verdadero
verdadera
cierto
ciertos
cierta
ciertas
intentar
intento
intenta
intentas
intentamos
intentais
intentan
dos
bajo
arriba
encima
usar
uso
usas
usa
usamos
usais
usan
emplear
empleo
empleas
emplean
empleamos
empleais
valor
muy
era
eras
eramos
eran
modo
bien
cual
cuando
donde
mientras
quien
con
entre
sin
trabajo
trabajar
trabajas
trabaja
trabajamos
trabajais
trabajan
podria
podrias
podriamos
podrian
podriais
aquel

111
inc/lang/fr/stopwords.txt Normal file
View File

@ -0,0 +1,111 @@
# This is a list of words the indexer ignores, one word per line
# When you edit this file be sure to use UNIX line endings (single newline)
# No need to include words shorter than 3 chars - these are ignored anyway
# This list is based upon the ones found at http://www.ranks.nl/stopwords/
alors
aucuns
aussi
autre
avant
avec
avoir
bon
car
cela
ces
ceux
chaque
comme
comment
dans
des
dedans
dehors
depuis
deux
devrait
doit
donc
dos
droite
début
elle
elles
encore
essai
est
fait
faites
fois
font
force
haut
hors
ici
ils
juste
les
leur
maintenant
mais
mes
mine
moins
mon
mot
même
nommés
notre
nous
nouveaux
par
parce
parole
pas
personnes
peut
peu
pièce
plupart
pour
pourquoi
quand
que
quel
quelle
quelles
quels
qui
sans
ses
seulement
sien
son
sont
sous
soyez
sujet
sur
tandis
tellement
tels
tes
ton
tous
tout
trop
très
valeur
voie
voient
vont
votre
vous
ça
étaient
état
étions
été
être

28
inc/lang/hu/stopwords.txt Normal file
View File

@ -0,0 +1,28 @@
# This is a list of words the indexer ignores, one word per line
# When you edit this file be sure to use UNIX line endings (single newline)
# No need to include words shorter than 3 chars - these are ignored anyway
# This list is based upon the ones found at http://www.ranks.nl/stopwords/
egy
fel
meg
át
ide
oda
szét
össze
vissza
hát
és
vagy
hogy
van
lesz
volt
csak
nem
igen
mint
én
ők
ön

119
inc/lang/it/stopwords.txt Normal file
View File

@ -0,0 +1,119 @@
# This is a list of words the indexer ignores, one word per line
# When you edit this file be sure to use UNIX line endings (single newline)
# No need to include words shorter than 3 chars - these are ignored anyway
# This list is based upon the ones found at http://www.ranks.nl/stopwords/
adesso
alla
allo
allora
altre
altri
altro
anche
ancora
avere
aveva
avevano
ben
buono
che
chi
cinque
comprare
con
consecutivi
consecutivo
cosa
cui
del
della
dello
dentro
deve
devo
doppio
due
ecco
fare
fine
fino
fra
gente
giu
hai
hanno
indietro
invece
lavoro
lei
loro
lui
lungo
meglio
molta
molti
molto
nei
nella
noi
nome
nostro
nove
nuovi
nuovo
oltre
ora
otto
peggio
pero
persone
piu
poco
primo
promesso
qua
quarto
quasi
quattro
quello
questo
qui
quindi
quinto
rispetto
sara
secondo
sei
sembra
sembrava
senza
sette
sia
siamo
siete
solo
sono
sopra
soprattutto
sotto
stati
stato
stesso
su
subito
sul
sulla
tanto
tempo
terzo
tra
tre
triplo
ultimo
una
uno
va
vai
voi
volte
vostro

37
inc/lang/nl/stopwords.txt Normal file
View File

@ -0,0 +1,37 @@
# This is a list of words the indexer ignores, one word per line
# When you edit this file be sure to use UNIX line endings (single newline)
# No need to include words shorter than 3 chars - these are ignored anyway
# This list is based upon the ones found at http://www.ranks.nl/stopwords/
aan
als
bij
dan
dat
die
dit
een
had
heb
hem
het
hij
hoe
hun
kan
men
met
mij
nog
ons
ook
tot
uit
van
was
wat
wel
wij
zal
zei
zij
zou

108
inc/lang/no/stopwords.txt Normal file
View File

@ -0,0 +1,108 @@
# This is a list of words the indexer ignores, one word per line
# When you edit this file be sure to use UNIX line endings (single newline)
# No need to include words shorter than 3 chars - these are ignored anyway
# This list is based upon the ones found at http://www.ranks.nl/stopwords/
alle
andre
arbeid
begge
bort
bra
bruke
denne
der
deres
det
din
disse
eller
ene
eneste
enhver
enn
folk
for
fordi
forsøke
fra
før
først
gjorde
gjøre
god
hadde
han
hans
hennes
her
hva
hvem
hver
hvilken
hvis
hvor
hvordan
hvorfor
ikke
inn
innen
kan
kunne
lage
lang
lik
like
makt
mange
med
meg
meget
men
mens
mer
mest
min
mye
måte
navn
nei
når
også
opp
oss
over
part
punkt
rett
riktig
samme
sant
siden
sist
skulle
slik
slutt
som
start
stille
tid
til
tilbake
tilstand
under
uten
var
ved
verdi
vil
ville
vite
vår
være
vært

75
inc/lang/pl/stopwords.txt Normal file
View File

@ -0,0 +1,75 @@
# This is a list of words the indexer ignores, one word per line
# When you edit this file be sure to use UNIX line endings (single newline)
# No need to include words shorter than 3 chars - these are ignored anyway
# This list is based upon the ones found at http://www.ranks.nl/stopwords/
aby
ale
bardziej
bardzo
bez
bowiem
będzie
czy
czyli
dla
dlatego
gdy
gdzie
ich
innych
jak
jako
jednak
jego
jej
jest
jeszcze
kiedy
kilka
która
które
którego
której
który
których
którym
którzy
lub
między
mnie
nad
nam
nas
naszego
naszych
nawet
nich
nie
nim
oraz
pod
poza
przed
przede
przez
przy
się
sobie
swoje
tak
takie
tam
te
tego
tej
ten
tych
tylko
tym
wiele
wielu
więc
wszystkich
wszystkim
wszystko
zawsze

View File

@ -0,0 +1,141 @@
# This is a list of words the indexer ignores, one word per line
# When you edit this file be sure to use UNIX line endings (single newline)
# No need to include words shorter than 3 chars - these are ignored anyway
# This list is based upon the ones found at http://www.ranks.nl/stopwords/
último
acerca
agora
algumas
alguns
ali
ambos
antes
apontar
aquela
aquelas
aquele
aqueles
aqui
atrás
bem
bom
cada
caminho
cima
com
como
comprido
conhecido
corrente
das
debaixo
dentro
desde
desligado
deve
devem
deverá
direita
diz
dizer
dois
dos
ela
ele
eles
enquanto
então
está
estão
estado
estar
estará
este
estes
esteve
estive
estivemos
estiveram
fará
faz
fazer
fazia
fez
fim
foi
fora
horas
iniciar
inicio
irá
ista
iste
isto
ligado
maioria
maiorias
mais
mas
mesmo
meu
muito
muitos
nós
não
nome
nosso
novo
onde
outro
para
parte
pegar
pelo
pessoas
pode
poderá
podia
por
porque
povo
primeiro
quê
qual
qualquer
quando
quem
quieto
são
saber
sem
ser
seu
somente
têm
tal
também
tem
tempo
tenho
tentar
tentaram
tente
tentei
teu
teve
tipo
tive
todos
trabalhar
trabalho
uma
umas
uns
usa
usar
valor
veja
ver
verdade
verdadeiro
você

141
inc/lang/pt/stopwords.txt Normal file
View File

@ -0,0 +1,141 @@
# This is a list of words the indexer ignores, one word per line
# When you edit this file be sure to use UNIX line endings (single newline)
# No need to include words shorter than 3 chars - these are ignored anyway
# This list is based upon the ones found at http://www.ranks.nl/stopwords/
último
acerca
agora
algumas
alguns
ali
ambos
antes
apontar
aquela
aquelas
aquele
aqueles
aqui
atrás
bem
bom
cada
caminho
cima
com
como
comprido
conhecido
corrente
das
debaixo
dentro
desde
desligado
deve
devem
deverá
direita
diz
dizer
dois
dos
ela
ele
eles
enquanto
então
está
estão
estado
estar
estará
este
estes
esteve
estive
estivemos
estiveram
fará
faz
fazer
fazia
fez
fim
foi
fora
horas
iniciar
inicio
irá
ista
iste
isto
ligado
maioria
maiorias
mais
mas
mesmo
meu
muito
muitos
nós
não
nome
nosso
novo
onde
outro
para
parte
pegar
pelo
pessoas
pode
poderá
podia
por
porque
povo
primeiro
quê
qual
qualquer
quando
quem
quieto
são
saber
sem
ser
seu
somente
têm
tal
também
tem
tempo
tenho
tentar
tentaram
tente
tentei
teu
teve
tipo
tive
todos
trabalhar
trabalho
uma
umas
uns
usa
usar
valor
veja
ver
verdade
verdadeiro
você

View File

@ -867,4 +867,22 @@ function tpl_img($maxwidth=900,$maxheight=700){
print '</a>';
}
/**
 * Prints a 1x1 pixel image tag whose source is the indexer script
 *
 * The image URL points at lib/exe/indexer.php and carries the
 * current page ID, so loading the "image" is what triggers the
 * indexer for that page.
 *
 * Should be called somewhere at the very end of the main.php
 * template
 */
function tpl_indexerWebBug(){
  global $ID;

  // attribute order is kept: src, width, height, alt
  $attributes = buildAttributes(array(
    'src'    => DOKU_BASE.'lib/exe/indexer.php?id='.urlencode($ID),
    'width'  => 1,
    'height' => 1,
    'alt'    => '',
  ));

  print "<img $attributes />";
}
//Setup VIM: ex: et ts=2 enc=utf-8 :

View File

@ -126,5 +126,7 @@
</div>
<?php /*old includehook*/ @include(dirname(__FILE__).'/footer.html')?>
<?php tpl_indexerWebBug()?>
</body>
</html>