/**
 * Regroup the direct children of a collection into sentence models.
 *
 * Consecutive children whose type ID is listed in $wgOMSentenceObjectTypes
 * are gathered into WOMSentenceModel instances. A text child is further
 * split at every sentence break found in its wiki text, each break opening
 * a new sentence. Any other child closes the current sentence and is kept
 * as-is; if it is a collection (and not an HTML tag) it is processed
 * recursively first. Finally the collection is reset and refilled with the
 * regrouped objects.
 *
 * @param WikiObjectModelCollection $wom collection to restructure in place
 */
private static function parseSentences( WikiObjectModelCollection $wom ) {
	global $wgOMSentenceObjectTypes;

	$rebuilt = array();
	// Sentence currently being filled; null means no sentence is open.
	$currentSentence = null;

	foreach ( $wom->getObjects() as $child ) {
		if ( !in_array( $child->getTypeID(), $wgOMSentenceObjectTypes ) ) {
			// Not sentence material: close any open sentence.
			$currentSentence = null;
			// HTML tags are a special case and are not descended into.
			if ( $child->getTypeID() != WOM_TYPE_HTMLTAG
				&& $child instanceof WikiObjectModelCollection
			) {
				self::parseSentences( $child );
			}
			$rebuilt[] = $child;
			continue;
		}

		if ( $currentSentence === null ) {
			$currentSentence = new WOMSentenceModel();
			$rebuilt[] = $currentSentence;
		}
		$currentSentence->insertObject( $child );

		// Only plain text can contain sentence breaks.
		if ( $child->getTypeID() != WOM_TYPE_TEXT ) {
			continue;
		}

		$wikitext = $child->getWikiText();
		$total = strlen( $wikitext );
		// FIXME: sentence algorithm here; for now a break is either
		//   - a newline followed by a non-newline character, or
		//   - [.?!], optionally followed by a quote or 2-6 apostrophes,
		//     then a space or tab.
		// Other languages (e.g. Chinese) still need to be considered.
		$found = preg_match_all(
			'/(\n[^\n])|(([\.!?](([\'"]|\'{2,6})?))([ \t]))/',
			$wikitext,
			$breaks,
			PREG_SET_ORDER | PREG_OFFSET_CAPTURE
		);
		if ( !$found ) {
			continue;
		}

		$start = 0;
		// Text object whose content is trimmed to the current sentence.
		$piece = $child;
		foreach ( $breaks as $hit ) {
			// Group 1 has a non-negative offset only for the newline form;
			// there the cut falls right after the newline. Otherwise the
			// cut falls after the whole match, trailing whitespace included.
			$cut = ( $hit[1][1] >= 0 )
				? $hit[0][1] + 1
				: $hit[0][1] + strlen( $hit[0][0] );

			$piece->setText( substr( $wikitext, $start, $cut - $start ) );
			$start = $cut;

			// NOTE(review): when the break lands exactly at the end of the
			// text the current sentence stays open, so following
			// sentence-type objects are appended to it - confirm intended.
			if ( $cut == $total ) {
				break;
			}

			// Open the next sentence with the remaining text; a later
			// break trims it further.
			$currentSentence = new WOMSentenceModel();
			$rebuilt[] = $currentSentence;
			$piece = new WOMTextModel( substr( $wikitext, $cut ) );
			$currentSentence->insertObject( $piece );
		}
	}

	// Replace the collection's content with the regrouped objects.
	$wom->reset();
	foreach ( $rebuilt as $item ) {
		$wom->insertObject( $item );
	}
}