LuceneSearch.php
1 <?php
2 /**
3  * wCMF - wemove Content Management Framework
4  * Copyright (C) 2005-2015 wemove digital solutions GmbH
5  *
6  * Licensed under the terms of the MIT License.
7  *
8  * See the LICENSE file distributed with this work for
9  * additional information.
10  */
11 namespace wcmf\lib\search\impl;
12 
23 
/**
 * LuceneSearch provides access to the search based on Zend_Search_Lucene.
 * The search index is stored in the location that is defined by the parameter 'indexPath'.
 * To manage PersistentObjects in the index use the methods LuceneSearch::addToIndex(),
 * LuceneSearch::deleteFromIndex() and LuceneSearch::commitIndex().
 * The index instance itself is opened lazily and kept for the lifetime of
 * this object (see LuceneSearch::getIndex()).
 *
 * @author Niko <enikao@users.sourceforge.net>
 */
class LuceneSearch implements IndexedSearch {

  /**
   * Pattern matching the highlight markup that find() expects from
   * Zend_Search_Lucene_Search_Query::htmlFragmentHighlightMatches().
   * Group 3 contains the highlighted text.
   */
  private static $_highlightPattern = '/((<b style="color:black;background-color:#[0-9a-f]{6}">)+)([^<]+?)((<\/b>)+)/';

  private $_indexPath = '';
  private $_liveUpdate = true;
  private $_index;
  private $_indexIsDirty = false;

  private static $_logger = null;

  /**
   * Constructor
   */
  public function __construct() {
    if (self::$_logger === null) {
      self::$_logger = LogManager::getLogger(__CLASS__);
    }
    // listen to object change events
    ObjectFactory::getInstance('eventManager')->addListener(StateChangeEvent::NAME,
      array($this, 'stateChanged'));
  }

  /**
   * Destructor
   */
  public function __destruct() {
    // write pending changes without the (expensive) optimization step
    $this->commitIndex(false);
    ObjectFactory::getInstance('eventManager')->removeListener(StateChangeEvent::NAME,
      array($this, 'stateChanged'));
  }

  /**
   * Set the path to the search index. The directory is created if necessary.
   * An index that was opened for a previous path is committed and released.
   * @param $indexPath Directory relative to main
   * @throws ConfigurationException if the directory is not writable
   */
  public function setIndexPath($indexPath) {
    if ($this->_index) {
      // the cached index belongs to the old location
      $this->commitIndex(false);
      $this->_index = null;
    }
    $fileUtil = new FileUtil();
    $this->_indexPath = $fileUtil->realpath(WCMF_BASE.$indexPath).'/';
    $fileUtil->mkdirRec($this->_indexPath);
    if (!is_writable($this->_indexPath)) {
      throw new ConfigurationException("Index path '".$indexPath."' is not writeable.");
    }
    self::$_logger->debug("Lucene index location: ".$this->_indexPath);
  }

  /**
   * Get the path to the search index.
   * @return String
   */
  public function getIndexPath() {
    return $this->_indexPath;
  }

  /**
   * Set if the search index should update itself, when
   * persistent objects are created/updated/deleted.
   * @param $liveUpdate Boolean
   */
  public function setLiveUpdate($liveUpdate) {
    $this->_liveUpdate = $liveUpdate;
  }

  /**
   * Get if the search index should update itself, when
   * persistent objects are created/updated/deleted.
   * @return Boolean
   */
  public function getLiveUpdate() {
    return $this->_liveUpdate;
  }

  /**
   * @see Search::check()
   * @param $word The search word to validate
   * @return True, if the word may be searched for, or a localized error message
   */
  public function check($word) {
    $message = ObjectFactory::getInstance('message');
    // check for length and stopwords
    if (strlen($word) < 3) {
      return $message->getText("The search term is too short");
    }
    if (in_array((string)$word, $this->getStopWords(), true)) {
      return $message->getText("The search terms are too common");
    }
    return true;
  }

  /**
   * @see Search::find()
   * @param $searchTerm The query string (passed to the Lucene query parser)
   * @param $pagingInfo PagingInfo instance (optional). If its page size is
   *   greater than 0, the total count is set on it and only the requested
   *   slice of hits is processed.
   * @return Associative array with the object id strings as keys and arrays
   *   with the keys 'oid', 'score', 'summary' as values. Hits whose object
   *   can't be loaded are omitted.
   */
  public function find($searchTerm, PagingInfo $pagingInfo=null) {
    $results = array();
    $index = $this->getIndex(false);
    if (!$index) {
      return $results;
    }

    $persistenceFacade = ObjectFactory::getInstance('persistenceFacade');
    $query = \Zend_Search_Lucene_Search_QueryParser::parse($searchTerm, 'UTF-8');
    try {
      $hits = $index->find($query);
      if ($pagingInfo != null && $pagingInfo->getPageSize() > 0) {
        $pagingInfo->setTotalCount(count($hits));
        $hits = array_slice($hits, $pagingInfo->getOffset(), $pagingInfo->getPageSize());
      }
      foreach ($hits as $hit) {
        $oidStr = $hit->oid;
        $obj = $persistenceFacade->load(ObjectId::parse($oidStr));
        if ($obj) {
          $results[$oidStr] = array(
            'oid' => $oidStr,
            'score' => $hit->score,
            'summary' => $this->createSummary($obj, $query)
          );
        }
      }
    }
    // NOTE: the class name must be fully qualified, an unqualified name
    // is resolved relative to the current namespace
    catch (\Exception $ex) {
      // best effort: return the results collected so far
      self::$_logger->debug("Search failed: ".$ex->getMessage());
    }
    return $results;
  }

  /**
   * @see Search::isSearchable()
   */
  public function isSearchable(PersistentObject $obj) {
    return (bool)$obj->getProperty('is_searchable');
  }

  /**
   * @see IndexedSearch::resetIndex()
   * @return The newly created index, which is also used for subsequent operations
   */
  public function resetIndex() {
    $this->configureLucene();
    // release a previously opened index before its files are replaced
    $this->_index = null;
    $this->_index = \Zend_Search_Lucene::create($this->getIndexPath());
    $this->_indexIsDirty = false;
    return $this->_index;
  }

  /**
   * @see IndexedSearch::commitIndex()
   * @param $optimize Boolean whether to optimize the index after committing (default: _true_)
   */
  public function commitIndex($optimize = true) {
    self::$_logger->debug("Commit index");
    if (!$this->_indexIsDirty) {
      return;
    }
    $index = $this->getIndex(false);
    if ($index) {
      $index->commit();
      if ($optimize) {
        $index->optimize();
      }
      // all pending changes are written now
      $this->_indexIsDirty = false;
    }
  }

  /**
   * @see IndexedSearch::optimizeIndex()
   */
  public function optimizeIndex() {
    $index = $this->getIndex(false);
    if ($index) {
      $index->optimize();
    }
  }

  /**
   * @see IndexedSearch::addToIndex()
   *
   * Documents that already exist for the object are removed first, then one
   * document per supported language is added. Objects that are not
   * searchable are ignored.
   *
   * NOTE: Documents are found by the term of their 'oid' field. Documents
   * written by a version that stored this field unindexed can't be found
   * that way, such an index should be rebuilt.
   */
  public function addToIndex(PersistentObject $obj) {
    if (!$this->isSearchable($obj)) {
      return;
    }
    $index = $this->getIndex();
    $oidStr = $obj->getOID()->__toString();

    // remove the outdated documents once, before any new document is added
    // (doing this per language could remove documents that were just added)
    $this->deleteDocuments($index, $oidStr);
    $this->_indexIsDirty = true;

    // create document for each language
    $localization = ObjectFactory::getInstance('localization');
    foreach ($localization->getSupportedLanguages() as $language => $languageName) {
      // load translation
      $indexObj = $localization->loadTranslation($obj, $language, false);

      if (self::$_logger->isDebugEnabled()) {
        self::$_logger->debug("Add/Update index for: ".$oidStr." language:".$language);
      }
      $index->addDocument($this->createDocument($obj, $indexObj, $oidStr, $language));
    }
  }

  /**
   * @see IndexedSearch::deleteFromIndex()
   */
  public function deleteFromIndex(PersistentObject $obj) {
    if (!$this->isSearchable($obj)) {
      return;
    }
    if (self::$_logger->isDebugEnabled()) {
      self::$_logger->debug("Delete from index: ".$obj->getOID());
    }
    $this->deleteDocuments($this->getIndex(), $obj->getOID()->__toString());
    $this->_indexIsDirty = true;
  }

  /**
   * Listen to StateChangeEvents. If live update is enabled, objects are
   * (re-)indexed when they change from new/dirty to clean and removed
   * from the index when they are deleted.
   * @param $event StateChangeEvent instance
   */
  public function stateChanged(StateChangeEvent $event) {
    if (!$this->_liveUpdate) {
      return;
    }
    $object = $event->getObject();
    $oldState = $event->getOldValue();
    $newState = $event->getNewValue();

    $wasModified = ($oldState == PersistentObject::STATE_NEW ||
      $oldState == PersistentObject::STATE_DIRTY);
    if ($wasModified && $newState == PersistentObject::STATE_CLEAN) {
      $this->addToIndex($object);
    }
    elseif ($newState == PersistentObject::STATE_DELETED) {
      $this->deleteFromIndex($object);
    }
  }

  /**
   * Get the search index. The index is opened on first access and reused
   * afterwards.
   * @param $create Boolean whether to create the index, if it can't be opened (default: _true_)
   * @return An instance of Zend_Search_Lucene_Interface or null, if the
   *   index can't be opened and $create is false
   */
  private function getIndex($create = true) {
    if (!$this->_index) {
      $this->configureLucene();
      try {
        $this->_index = \Zend_Search_Lucene::open($this->getIndexPath());
      }
      catch (\Zend_Search_Lucene_Exception $ex) {
        $this->_index = $create ? $this->resetIndex() : null;
      }
    }
    return $this->_index;
  }

  /**
   * Set up the Lucene defaults used for indexing and query parsing:
   * UTF-8 analyzer with stop word filter, UTF-8 query encoding,
   * AND as default operator and no minimum wildcard prefix length.
   */
  private function configureLucene() {
    $analyzer = new Analyzer();

    // add stop words filter
    $analyzer->addFilter(
      new \Zend_Search_Lucene_Analysis_TokenFilter_StopWords($this->getStopWords()));

    \Zend_Search_Lucene_Analysis_Analyzer::setDefault($analyzer);
    \Zend_Search_Lucene_Search_Query_Wildcard::setMinPrefixLength(0);
    \Zend_Search_Lucene_Search_QueryParser::setDefaultEncoding('UTF-8');
    \Zend_Search_Lucene_Search_QueryParser::setDefaultOperator(\Zend_Search_Lucene_Search_QueryParser::B_AND);
  }

  /**
   * Remove all documents from the index, whose 'oid' field has the given value.
   * @param $index The index instance
   * @param $oidStr The serialized object id
   */
  private function deleteDocuments($index, $oidStr) {
    $term = new \Zend_Search_Lucene_Index_Term($oidStr, 'oid');
    foreach ($index->termDocs($term) as $docId) {
      $index->delete($docId);
    }
  }

  /**
   * Create the index document for one language version of an object.
   * @param $obj The original object (provides the type)
   * @param $indexObj The translated object (provides the values)
   * @param $oidStr The serialized object id of the original object
   * @param $language The language code or null
   * @return Zend_Search_Lucene_Document instance
   */
  private function createDocument(PersistentObject $obj, $indexObj, $oidStr, $language) {
    $doc = new \Zend_Search_Lucene_Document();

    // the oid is stored (find() reads it from the hits) and must be
    // indexed as well, because documents are looked up by its term
    $doc->addField(\Zend_Search_Lucene_Field::keyword('oid', $oidStr, 'UTF-8'));
    $doc->addField($this->createUnstoredKeyword('type', $obj->getType()));
    if ($language != null) {
      $doc->addField($this->createUnstoredKeyword('lang', $language));
    }

    foreach ($indexObj->getValueNames(true) as $valueName) {
      $inputType = $indexObj->getValueProperty($valueName, 'input_type');
      $value = $indexObj->getValue($valueName);
      if (is_object($value) || is_array($value)) {
        // only scalar values are indexed
        continue;
      }
      $value = $this->encodeValue($value, $inputType);
      if (preg_match('/^text|^f?ckeditor/', (string)$inputType)) {
        // text is tokenized without markup
        $doc->addField(\Zend_Search_Lucene_Field::unStored($valueName, strip_tags($value), 'UTF-8'));
      }
      else {
        $doc->addField($this->createUnstoredKeyword($valueName, $value));
      }
    }
    return $doc;
  }

  /**
   * Create a keyword field that is indexed, but not stored in the index.
   * @param $name The field name
   * @param $value The field value
   * @return Zend_Search_Lucene_Field instance
   */
  private function createUnstoredKeyword($name, $value) {
    $field = \Zend_Search_Lucene_Field::keyword($name, $value, 'UTF-8');
    $field->isStored = false;
    return $field;
  }

  /**
   * Create the summary for a search hit: an excerpt around the first
   * highlighted match in the first scalar value that contains one.
   * @param $obj The object that was found
   * @param $query The parsed query, used for highlighting
   * @return String (empty, if no value contains a highlighted match)
   */
  private function createSummary($obj, $query) {
    foreach ($obj->getValueNames(true) as $valueName) {
      $inputType = $obj->getValueProperty($valueName, 'input_type');
      $value = $obj->getValue($valueName);
      if (is_object($value) || is_array($value)) {
        continue;
      }
      $value = $this->encodeValue($value, $inputType);
      if (strlen($value) == 0) {
        continue;
      }
      // warnings are suppressed deliberately here (as before)
      $highlighted = @$query->htmlFragmentHighlightMatches(strip_tags($value), 'UTF-8');
      $matches = array();
      if (preg_match(self::$_highlightPattern, $highlighted, $matches)) {
        $hitStr = $matches[3];
        $highlighted = preg_replace(self::$_highlightPattern, ' <em class="highlighted">$3</em> ', $highlighted);
        $highlighted = trim(preg_replace('/&#13;|[\n\r\t]/', ' ', $highlighted));
        return StringUtil::excerpt($highlighted, $hitStr, 300, '');
      }
    }
    return '';
  }

  /**
   * Encode the given value according to the input type. HTML entities are
   * decoded for (f)ckeditor input types, the result is always trimmed.
   * @param $value
   * @param $inputType
   * @return String
   */
  protected function encodeValue($value, $inputType) {
    // explicit casts, because null is a possible value/input type
    $value = (string)$value;
    if (preg_match('/^f?ckeditor/', (string)$inputType)) {
      $value = html_entity_decode($value, ENT_QUOTES, 'UTF-8');
    }
    return trim($value);
  }

  /**
   * Get a list of words that are forbidden to search for. The list is
   * read from $GLOBALS['STOP_WORDS'] (one word per line), surrounding
   * whitespace, empty lines and duplicates are removed.
   * @return Array
   */
  protected function getStopWords() {
    $raw = isset($GLOBALS['STOP_WORDS']) ? $GLOBALS['STOP_WORDS'] : '';
    // accept any line ending, the list may come from a file with CRLF endings
    $lines = preg_split('/\r\n|\r|\n/', $raw);
    $words = array_filter(array_map('trim', $lines), 'strlen');
    return array_values(array_unique($words));
  }
}
373 
/**
 * Case insensitive UTF-8 analyzer (text and numbers), which ignores the
 * encoding requested by the caller.
 */
class Analyzer extends \Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive {
  /**
   * Set the text to be tokenized. The input is always treated as UTF-8.
   * @param $data The text to tokenize
   * @param $encoding Not used, the parent is always called with 'UTF-8'
   */
  public function setInput($data, $encoding = '') {
    $forcedEncoding = 'UTF-8';
    parent::setInput($data, $forcedEncoding);
  }
}
382 
/**
 * Standard german/english stop words taken from Lucene's StopAnalyzer.
 * The words are stored as one string with one word per line (separated
 * by "\n", no trailing newline). The order of the words and the repeated
 * entries are kept as they are.
 */
$GLOBALS['STOP_WORDS'] = implode("\n", array_merge(
  // german
  array(
    'ein', 'einer', 'eine', 'eines', 'einem', 'einen',
    'der', 'die', 'das', 'dass', 'daß',
    'du', 'er', 'sie', 'es',
    'was', 'wer', 'wie', 'wir',
    'und', 'oder', 'ohne', 'mit',
    'am', 'im', 'in', 'aus', 'auf',
    'ist', 'sein', 'war', 'wird',
    'ihr', 'ihre', 'ihres',
    'als', 'für', 'von', 'mit',
    'dich', 'dir', 'mich', 'mir',
    'mein', 'sein', 'kein',
    'durch', 'wegen', 'wird'
  ),
  // english
  array(
    'a', 'an', 'and', 'are', 'as', 'at',
    'be', 'but', 'by',
    'for', 'if', 'in', 'into', 'is', 'it',
    'no', 'not', 'of', 'on', 'or',
    's', 'such', 't',
    'that', 'the', 'their', 'then', 'there', 'these', 'they', 'this',
    'to', 'was', 'will', 'with'
  )
));
472 ?>
stateChanged(StateChangeEvent $event)
Listen to StateChangeEvents.
setIndexPath($indexPath)
Set the path to the search index.
getType()
Get the type of the object.
getOID()
Get the object id of the PersistentObject.
getStopWords()
Get a list of words that are forbidden to search for.
static excerpt($string, $phrase, $radius=100)
Create an excerpt from the given text around the given phrase code based on: http://stackoverflow.com/questions/1292121/how-to-generate-the-snippet-like-generated-by-google-with-php-and-mysql.
Definition: StringUtil.php:95
getObject()
Get the object whose state has changed.
getProperty($name)
Get the value of a named property in the object.
StateChangeEvent signals a change of the state of a PersistentObject instance.
static getLogger($name)
Get the logger with the given name.
Definition: LogManager.php:35
addToIndex(PersistentObject $obj)
find($searchTerm, PagingInfo $pagingInfo=null)
deleteFromIndex(PersistentObject $obj)
IndexedSearch implementations are used to search entity objects in a search index.
static getInstance($name, $dynamicConfiguration=array())
PagingInfo contains information about a paged list.
Definition: PagingInfo.php:18
static parse($oid)
Parse a serialized object id string into an ObjectId instance.
Definition: ObjectId.php:144
LuceneSearch provides access to the search based on Zend_Search_Lucene.
getIndexPath()
Get the path to the search index.
isSearchable(PersistentObject $obj)
FileUtil provides basic support for file functionality like HTTP file upload.
Definition: FileUtil.php:22
getLiveUpdate()
Get if the search index should update itself, when persistent objects are created/updated/deleted.
setInput($data, $encoding= '')
Override method to make sure we are using utf-8.
ConfigurationException signals an exception in the configuration.
setLiveUpdate($liveUpdate)
Set if the search index should update itself, when persistent objects are created/updated/deleted.
encodeValue($value, $inputType)
Encode the given value according to the input type.
$GLOBALS['STOP_WORDS']
Standard german/english stop words taken from Lucene's StopAnalyzer.
PersistentObject defines the interface of all persistent objects.