LuceneSearch.php
1 <?php
2 /**
3  * wCMF - wemove Content Management Framework
4  * Copyright (C) 2005-2020 wemove digital solutions GmbH
5  *
6  * Licensed under the terms of the MIT License.
7  *
8  * See the LICENSE file distributed with this work for
9  * additional information.
10  */
11 namespace wcmf\lib\search\impl;
12 
24 use ZendSearch\Lucene\Analysis\Analyzer\Analyzer;
25 use ZendSearch\Lucene\Analysis\TokenFilter\StopWords;
26 use ZendSearch\Lucene\Index\Term;
27 use ZendSearch\Lucene\Lucene;
28 use ZendSearch\Lucene\Search\Query\Wildcard;
29 use ZendSearch\Lucene\Search\QueryParser;
30 use ZendSearch\Lucene\Search\Weight\Boolean;
31 
32 /**
33  * LuceneSearch provides access to the search based on ZendSearch/Lucene.
34  * The search index is stored in the location that is defined by the parameter 'indexPath'.
35  * To manage PersistentObjects in the index use the methods LuceneSearch::addToIndex()
36  * and LuceneSearch::deleteFromIndex() and LuceneSearch::commitIndex().
37  * The method LuceneSearch::getIndex() offers direct access to the search index
38  * for advanced operations.
39  *
40  * @author Niko <enikao@users.sourceforge.net>
41  */
42 class LuceneSearch implements IndexedSearch {
43 
  // IndexStrategy used to build index documents and to encode values
  private $indexStrategy;
  // Directory of the search index including a trailing slash (see setIndexPath())
  private $indexPath = '';
  // Whether afterCommit() updates the index on committed transactions
  private $liveUpdate = true;
  // Passed to loadTranslation() when translations are loaded for indexing
  private $defaultLanguageFallback = true;
  // The index instance opened by getIndex() or created by resetIndex()
  private $index;
  // Set to true by addToIndex()/deleteFromIndex(), evaluated by commitIndex()
  private $indexIsDirty = false;

  // Logger shared by all instances, initialized in the constructor
  private static $logger = null;
52 
53  /**
54  * Constructor
55  */
56  public function __construct() {
57  if (self::$logger == null) {
58  self::$logger = LogManager::getLogger(__CLASS__);
59  }
60  $this->indexStrategy = new DefaultIndexStrategy();
61 
62  // listen to object change events
63  ObjectFactory::getInstance('eventManager')->addListener(TransactionEvent::NAME,
64  [$this, 'afterCommit']);
65  }
66 
67  /**
68  * Destructor
69  */
70  public function __destruct() {
71  $this->commitIndex(false);
72  ObjectFactory::getInstance('eventManager')->removeListener(TransactionEvent::NAME,
73  [$this, 'afterCommit']);
74  }
75 
76  /**
77  * Set the IndexStrategy instance.
78  * @param $indexStrategy
79  */
80  public function setIndexStrategy(IndexStrategy $indexStrategy) {
81  $this->indexStrategy = $indexStrategy;
82  }
83 
84  /**
85  * Set the path to the search index.
86  * @param $indexPath Directory relative to main
87  */
88  public function setIndexPath($indexPath) {
89  $fileUtil = new FileUtil();
90  $this->indexPath = $fileUtil->realpath(WCMF_BASE.$indexPath).'/';
91  $fileUtil->mkdirRec($this->indexPath);
92  if (!is_writeable($this->indexPath)) {
93  throw new ConfigurationException("Index path '".$indexPath."' is not writeable.");
94  }
95  if (self::$logger->isDebugEnabled()) {
96  self::$logger->debug("Lucene index location: ".$this->indexPath);
97  }
98  }
99 
100  /**
101  * Get the path to the search index.
102  * @return String
103  */
104  public function getIndexPath() {
105  return $this->indexPath;
106  }
107 
108  /**
109  * Set if the search index should update itself, when
110  * persistent objects are created/updated/deleted.
111  * @param $liveUpdate Boolean
112  */
113  public function setLiveUpdate($liveUpdate) {
114  $this->liveUpdate = $liveUpdate;
115  }
116 
117  /**
118  * Get if the search index should update itself, when
119  * persistent objects are created/updated/deleted.
120  * @return Boolean
121  */
122  public function getLiveUpdate() {
123  return $this->liveUpdate;
124  }
125 
126  /**
127  * Set if the search index should use the default language, if a translation is missing
128  * @param $useDefaultLanguage Boolean
129  */
130  public function setDefaultLanguageFallback($defaultLanguageFallback) {
131  $this->defaultLanguageFallback = $defaultLanguageFallback;
132  }
133 
134  /**
135  * @see Search::check()
136  */
137  public function check($word) {
138  $message = ObjectFactory::getInstance('message');
139  // check for length and stopwords
140  if (strlen($word) < 3) {
141  return ($message->getText("The search term is too short"));
142  }
143  if (in_array($word, $this->getStopWords())) {
144  return ($message->getText("The search terms are too common"));
145  }
146  return true;
147  }
148 
149  /**
150  * @see Search::find()
151  */
152  public function find($searchTerm, PagingInfo $pagingInfo=null, $createSummary=true) {
153  $results = [];
154  $index = $this->getIndex(false);
155  if ($index) {
156  $persistenceFacade = ObjectFactory::getInstance('persistenceFacade');
157  $query = QueryParser::parse($searchTerm, 'UTF-8');
158  if (self::$logger->isDebugEnabled()) {
159  self::$logger->debug("Search term: ".$searchTerm);
160  self::$logger->debug("Parsed query: ".$query);
161  self::$logger->debug("Optimized query: ".$query->rewrite($index)->optimize($index));
162  }
163  try {
164  $hits = $index->find($query);
165  if (self::$logger->isDebugEnabled()) {
166  self::$logger->debug("Hits: ".sizeof($hits));
167  }
168  if ($pagingInfo != null && $pagingInfo->getPageSize() > 0) {
169  $pagingInfo->setTotalCount(sizeof($hits));
170  $hits = array_slice($hits, $pagingInfo->getOffset(), $pagingInfo->getPageSize());
171  }
172  foreach($hits as $hit) {
173  if (self::$logger->isDebugEnabled()) {
174  self::$logger->debug(['oid' => $hit->oid, 'lang' => $hit->lang, 'document_id' => $hit->document_id, 'score' => $hit->score]);
175  }
176  $oidStr = $hit->oid;
177  $oid = ObjectId::parse($oidStr);
178 
179  $summary = '';
180  if ($createSummary) {
181  // get the summary with highlighted text
182  $highlightedRegex = '/((<b style="color:black;background-color:#[0-9a-f]{6}">)+)([^<]+?)((<\/b>)+)/';
183  $obj = $persistenceFacade->load($oid);
184  if ($obj) {
185  $valueNames = $obj->getValueNames(true);
186  foreach ($valueNames as $curValueName) {
187  $inputType = $obj->getValueProperty($curValueName, 'input_type');
188  $value = $obj->getValue($curValueName);
189  if (!is_object($value) && !is_array($value)) {
190  $value = $this->indexStrategy->encodeValue($value, $inputType);
191  if (strlen($value) > 0) {
192  $highlighted = @$query->htmlFragmentHighlightMatches(strip_tags($value), 'UTF-8');
193  $matches = [];
194  if (preg_match($highlightedRegex, $highlighted, $matches)) {
195  $hitStr = $matches[3];
196  $highlighted = preg_replace($highlightedRegex, ' <em class="highlighted">$3</em> ', $highlighted);
197  $highlighted = trim(preg_replace('/&#13;|[\n\r\t]/', ' ', $highlighted));
198  $excerpt = StringUtil::excerpt($highlighted, $hitStr, 300, '');
199  $summary = $excerpt;
200  break;
201  }
202  }
203  }
204  }
205  }
206  }
207  $results[$oidStr] = [
208  'oid' => $oidStr,
209  'score' => $hit->score,
210  'summary' => $summary
211  ];
212  }
213  }
214  catch (\Exception $ex) {
215  // do nothing, return empty result
216  }
217  }
218  return $results;
219  }
220 
221  /**
222  * @see Search::isSearchable()
223  */
224  public function isSearchable(PersistentObject $obj) {
225  return (boolean) $obj->getProperty('isSearchable');
226  }
227 
228  /**
229  * @see IndexedSearch::resetIndex()
230  */
231  public function resetIndex() {
232  $indexPath = $this->getIndexPath();
233  return Lucene::create($indexPath);
234  }
235 
236  /**
237  * @see IndexedSearch::commitIndex()
238  */
239  public function commitIndex($optimize = true) {
240  if (self::$logger->isDebugEnabled()) {
241  self::$logger->debug("Commit index");
242  }
243  if ($this->indexIsDirty) {
244  $index = $this->getIndex(false);
245  $index->commit();
246  if ($optimize) {
247  $index->optimize();
248  }
249  }
250  }
251 
252  /**
253  * @see IndexedSearch::optimizeIndex()
254  */
255  public function optimizeIndex() {
256  $index = $this->getIndex(false);
257  $index->optimize();
258  }
259 
260  /**
261  * @see IndexedSearch::addToIndex()
262  */
263  public function addToIndex(PersistentObject $obj) {
264  if ($this->isSearchable($obj)) {
265  $index = $this->getIndex();
266  $oidStr = $obj->getOID()->__toString();
267 
268  $this->deleteFromIndex($obj->getOID());
269 
270  // create document for each language
271  $localization = ObjectFactory::getInstance('localization');
272  foreach ($localization->getSupportedLanguages() as $language => $languageName) {
273  // load translation
274  $indexObj = $localization->loadTranslation($obj, $language, $this->defaultLanguageFallback);
275 
276  if (self::$logger->isDebugEnabled()) {
277  self::$logger->debug("Add/Update index for: ".$oidStr." language:".$language);
278  }
279 
280  $doc = $this->indexStrategy->getDocument($indexObj, $language);
281  if ($doc) {
282  $index->addDocument($doc);
283  }
284  }
285  $this->indexIsDirty = true;
286  }
287  }
288 
289  /**
290  * @see IndexedSearch::deleteFromIndex()
291  */
292  public function deleteFromIndex(ObjectId $oid) {
293  if (self::$logger->isDebugEnabled()) {
294  self::$logger->debug("Delete from index: ".$oid);
295  }
296  $index = $this->getIndex();
297 
298  $term = new Term($oid->__toString(), 'oid');
299  $docIds = $index->termDocs($term);
300  foreach ($docIds as $id) {
301  $index->delete($id);
302  }
303  $this->indexIsDirty = true;
304  }
305 
306  /**
307  * Listen to TransactionEvents
308  * @param $event TransactionEvent instance
309  */
310  public function afterCommit(TransactionEvent $event) {
311  if ($this->liveUpdate && $event->getPhase() == TransactionEvent::AFTER_COMMIT) {
312  $persistenceFacade = ObjectFactory::getInstance('persistenceFacade');
313  // add inserted/updated objects
314  foreach (array_merge(array_values($event->getInsertedOids()), $event->getUpdatedOids()) as $oid) {
315  $object = $persistenceFacade->load(ObjectId::parse($oid));
316  if ($object) {
317  $this->addToIndex($object);
318  }
319  else {
320  self::$logger->warn("Could not index object with oid ".$oid." because it does not exist");
321  }
322  }
323  // remove deleted objects
324  foreach ($event->getDeletedOids() as $oid) {
325  $this->deleteFromIndex(ObjectId::parse($oid));
326  }
327  }
328  }
329 
330  /**
331  * Get the search index.
332  * @param $create Boolean whether to create the index, if it does not exist (default: _true_)
333  * @return An instance of ZendSearch/SearchIndexInterface or null
334  */
335  private function getIndex($create = true) {
336  if (!$this->index || $create) {
337  $indexPath = $this->getIndexPath();
338 
339  $analyzer = new LuceneUtf8Analyzer();
340 
341  // add stop words filter
342  $stopWords = $this->getStopWords();
343  $stopWordsFilter = new StopWords($stopWords);
344  $analyzer->addFilter($stopWordsFilter);
345 
346  Analyzer::setDefault($analyzer);
347  Wildcard::setMinPrefixLength(0);
348  QueryParser::setDefaultEncoding('UTF-8');
349  QueryParser::setDefaultOperator(QueryParser::B_AND);
350 
351  try {
352  $this->index = Lucene::open($indexPath);
353  //$this->index->setMaxMergeDocs(5);
354  //$this->index->setMergeFactor(5);
355  }
356  catch (\Exception $ex) {
357  $this->index = $this->resetIndex();
358  }
359  }
360  return $this->index;
361  }
362 
363  /**
364  * Get a list of words that are forbidden to search for
365  * @return Array
366  */
367  protected function getStopWords() {
368  return explode("\n", $GLOBALS['STOP_WORDS']);
369  }
370 }
371 
/**
 * Standard german/english stop words taken from Lucene's StopAnalyzer.
 * One word per line, german words first. Each word is listed only once,
 * even if it is a stop word in both languages.
 */
$GLOBALS['STOP_WORDS'] = <<<'EOD'
ein
einer
eine
eines
einem
einen
der
die
das
dass
daß
du
er
sie
es
was
wer
wie
wir
und
oder
ohne
mit
am
im
in
aus
auf
ist
sein
war
wird
ihr
ihre
ihres
als
für
von
dich
dir
mich
mir
mein
kein
durch
wegen
a
an
and
are
as
at
be
but
by
for
if
into
is
it
no
not
of
on
or
s
such
t
that
the
their
then
there
these
they
this
to
will
with
EOD;
461 ?>
getStopWords()
Get a list of words that are forbidden to search for.
setDefaultLanguageFallback($defaultLanguageFallback)
Set if the search index should use the default language, if a translation is missing.
setIndexPath($indexPath)
Set the path to the search index.
DefaultIndexStrategy implements indexing of PersistentObject values and might be customized by overri...
setIndexStrategy(IndexStrategy $indexStrategy)
Set the IndexStrategy instance.
getOID()
Get the object id of the PersistentObject.
__toString()
Get a string representation of the object id.
Definition: ObjectId.php:215
addToIndex(PersistentObject $obj)
getIndexPath()
Get the path to the search index.
IndexedSearch implementations are used to search entity objects in a search index.
StringUtil provides support for string manipulation.
Definition: StringUtil.php:18
const AFTER_COMMIT
An AFTER_COMMIT event occurs after the transaction is committed.
TransactionEvent instances are fired at different phases of a transaction.
static excerpt($string, $phrase, $radius=100)
Create an excerpt from the given text around the given phrase code based on: http://stackoverflow....
Definition: StringUtil.php:195
ObjectId is the unique identifier of an object.
Definition: ObjectId.php:28
isSearchable(PersistentObject $obj)
ConfigurationException signals an exception in the configuration.
static parse($oid)
Parse a serialized object id string into an ObjectId instance.
Definition: ObjectId.php:135
getPhase()
Get the phase at which the event occurred.
IndexStrategy defines the interface for indexing implementations.
LuceneSearch provides access to the search based on ZendSearch/Lucene.
getLiveUpdate()
Get if the search index should update itself, when persistent objects are created/updated/deleted.
afterCommit(TransactionEvent $event)
Listen to TransactionEvents.
FileUtil provides basic support for file functionality like HTTP file upload.
Definition: FileUtil.php:22
static getLogger($name)
Get the logger with the given name.
Definition: LogManager.php:37
static getInstance($name, $dynamicConfiguration=[])
getUpdatedOids()
Get the list of oids of updated objects.
setLiveUpdate($liveUpdate)
Set if the search index should update itself, when persistent objects are created/updated/deleted.
find($searchTerm, PagingInfo $pagingInfo=null, $createSummary=true)
PagingInfo contains information about a paged list.
Definition: PagingInfo.php:18
PersistentObject defines the interface of all persistent objects.
getDeletedOids()
Get the list of oids of deleted objects.
$GLOBALS['STOP_WORDS']
Standard german/english stop words taken from Lucene's StopAnalyzer.
getInsertedOids()
Get the map of oids of inserted objects.
LogManager is used to retrieve Logger instances.
Definition: LogManager.php:20
getProperty($name)
Get the value of a named property in the object.
ObjectFactory implements the service locator pattern by wrapping a Factory instance and providing sta...