} elseif (empty($sruQuery)) { returnDiagnostic(7, "query"); } elseif (empty($sruVersion)) { returnDiagnostic(7, "version"); } elseif ($sruVersion != "1.1") { returnDiagnostic(5, "1.1"); } elseif (!preg_match("#^((srw_)?mods|info:srw/schema/1/mods-v3\\.2|http://www\\.loc\\.gov/mods/v3)\$#i", $sruRecordSchema) and !preg_match("#^((oai_|srw_)?dc|info:srw/schema/1/dc-v1\\.1|http://purl\\.org/dc/elements/1\\.1/)\$#i", $sruRecordSchema)) { returnDiagnostic(66, $sruRecordSchema); } elseif (!preg_match("/^xml\$/i", $sruRecordPacking)) { returnDiagnostic(71, "Only 'recordPacking=xml' is supported"); } elseif (!empty($sruRecordXPath)) { returnDiagnostic(72, ""); } elseif (!empty($sruSortKeys)) { returnDiagnostic(80, ""); } elseif (!empty($sruResultSetTTL)) { returnDiagnostic(50, ""); } else { // use an appropriate default stylesheet: if ($exportStylesheet == "DEFAULT") { if (preg_match("#^((oai_|srw_)?dc|info:srw/schema/1/dc-v1\\.1|http://purl\\.org/dc/elements/1\\.1/)\$#i", $sruRecordSchema)) { // simple Dublin Core was requested as record schema $exportStylesheet = "srwdc2html.xsl"; } else { // use a stylesheet that's appropriate for SRW+MODS XML: $exportStylesheet = "srwmods2html.xsl"; } } // // NOTE: the generation of SQL queries (or parts of) should REALLY be modular and be moved to separate dedicated functions! // CONSTRUCT SQL QUERY: // TODO: build the complete SQL query using functions 'buildFROMclause()' and 'buildORDERclause()' // Note: the 'verifySQLQuery()' function that gets called by 'search.php' to process query data with "$formType = sqlSearch" will add the user-specific fields to the 'SELECT' clause
// in '$contentTypeCharset' (which is defined in 'ini.inc.php'): setHeaderContentType("application/opensearchdescription+xml", $contentTypeCharset); // function 'setHeaderContentType()' is defined in 'include.inc.php' echo openSearchDescription($exportStylesheet); // function 'openSearchDescription()' is defined in 'opensearch.inc.php' } elseif (preg_match("/^suggest\$/i", $operation) and preg_match("/^(html|json)\$/i", $recordSchema)) { // Set the appropriate mimetype & set the character encoding to the one given // in '$contentTypeCharset' (which is defined in 'ini.inc.php'): setHeaderContentType($exportContentType, $contentTypeCharset); echo searchSuggestions($cqlQuery, $query); } elseif (!isset($_REQUEST['query']) and !isset($_REQUEST['recordSchema']) and !isset($_REQUEST['maximumRecords']) and !isset($_REQUEST['startRecord']) and !isset($_REQUEST['stylesheet'])) { showQueryPage($operation, $viewType, $showRows, $rowOffset); } elseif (empty($cqlQuery)) { returnDiagnostic(7, "query"); } elseif (!preg_match("/^((atom|rss)([ _]?xml)?|srw([ _]?(mods|dc))?([ _]?xml)?|html|json)\$/i", $recordSchema)) { returnDiagnostic(66, $recordSchema); } else { // Write the current OpenSearch/CQL query into a session variable: // (this session variable is used by functions 'atomCollection()' and 'citeRecords()' (in 'cite_html.php') to re-establish the original OpenSearch/CQL query; // function 'atomCollection()' uses the OpenSearch/CQL query to output 'opensearch.php' URLs instead of 'show.php' URLs) saveSessionVariable("cqlQuery", $cqlQuery); // function 'saveSessionVariable()' is defined in 'include.inc.php' // Build the correct query URL: // (we skip unnecessary parameters here since function 'generateURL()' and 'show.php' will use their default values for them) $queryParametersArray = array("where" => $query, "submit" => $displayType, "viewType" => $viewType, "exportStylesheet" => $exportStylesheet); // NOTE: The 'show.php' script allows anonymous users to query the 'cite_key' 
field (if a valid 'userID' is included in the query URL). // However, this requires that the cite key is passed in the 'cite_key' URL parameter. Since 'opensearch.php' uses the 'where' // parameter to pass its query, anonymous querying of the 'cite_key' field currently does not work for 'opensearch.php'. But // querying of user-specific fields will work if a user is logged in. if (isset($_SESSION['loginEmail'])) { // we only include the 'userID' parameter if the user is logged in
/**
 * Parse a single-clause CQL query ("[set.]index relation term") and translate
 * it into a search array holding one SQL WHERE clause fragment.
 *
 * Only one "index relation term" triple is handled; relation modifiers and
 * multiple query parts connected with boolean operators aren't supported yet.
 *
 * If the query carries no recognizable index/relation prefix, a default one is
 * prepended: 'main_fields all' for '$operation=suggest', 'cql.serverChoice all'
 * otherwise. An unknown context set, an unknown index, or a malformed 'within'
 * term causes 'returnDiagnostic()' to be called, followed by 'exit'.
 *
 * @param string $sruVersion SRU version of the request (not used within this function)
 * @param string $sruQuery   the CQL query string; an empty query yields a clause
 *                           that matches every record ("serial RLIKE '.+'")
 * @param string $operation  if it equals "suggest" (case-insensitive), search terms
 *                           get a trailing '*' mask appended (unless relation is 'exact')
 *
 * @return array array with a single element of form
 *               array("_boolean" => "", "_query" => array(array("_boolean" => "", "_query" => <WHERE clause part>)))
 */
function parseCQL($sruVersion, $sruQuery, $operation = "")
{
    global $word, $patternModifiers; // defined in 'transtab_unicode_charset.inc.php' and 'transtab_latin1_charset.inc.php'

    // map CQL indexes to refbase field names:
    $indexNamesArray = mapCQLIndexes();

    $searchArray = array(); // initialize array that will hold information about context set, index name, relation and search value
    $searchSubArray1 = array();

    // --------------------------------

    if (!empty($sruQuery)) {
        // Matches "<index><relation><term>"; group 1 = relation (incl. surrounding spaces), group 2 = search term.
        // The 's' modifier lets '.*' span line breaks, so that a multi-line query doesn't leak
        // trailing lines into the extracted relation or index name.
        $queryPattern = '/^[^\\" <>=]+( +(?:all|any|exact|within) +| *(?:<>|<=|>=|<|>|=) *)(.*)/s';

        $isSuggestOperation = preg_match("/^suggest\$/i", $operation);

        // check for presence of context set/index name and any of the main relations:
        if (!preg_match($queryPattern, $sruQuery)) {
            // if no context set/index name and relation was given we'll add meaningful defaults
            // ('main_fields' isn't yet supported for regular OpenSearch queries, so it's used for suggestions only):
            if ($isSuggestOperation) {
                $sruQuery = "main_fields all " . $sruQuery;
            } else {
                $sruQuery = "cql.serverChoice all " . $sruQuery;
            }
        }

        // at this point the pattern is guaranteed to match:
        preg_match($queryPattern, $sruQuery, $queryParts);

        // extract the main relation (relation modifiers aren't supported yet!) and the search term:
        $mainRelation = trim($queryParts[1]);
        $searchTerm = $queryParts[2];

        // extract the context set:
        if (preg_match('/^([^\\" <>=.]+)\\./', $sruQuery, $contextSetParts)) {
            $contextSet = $contextSetParts[1];
        } else {
            $contextSet = ""; // use the default context set
        }

        // extract the index:
        // (if the pattern doesn't match, the unchanged query is reported as unsupported index below)
        $indexName = preg_replace('/^(?:[^\\" <>=.]+\\.)?([^\\" <>=.]+).*/s', '\\1', $sruQuery);

        // ----------------

        // return a fatal diagnostic if the CQL query does contain an unrecognized 'set.index' identifier:
        // (a) verify that the given context set (if any) is recognized:
        //     (compared against "" instead of using 'empty()' so that a context set of "0" isn't mistaken for "none")
        if ($contextSet !== "") {
            $contextSetIndexConnector = ".";
            $contextSetLabel = "context set '" . $contextSet . "'";

            if (!preg_match("/^(dc|bath|rec|bib|cql)\$/", $contextSet)) {
                returnDiagnostic(15, $contextSet); // unsupported context set (function 'returnDiagnostic()' is defined in 'opensearch.php' and 'sru.php')
                exit;
            }
        } else {
            $contextSetIndexConnector = "";
            $contextSetLabel = "empty context set";
        }

        // (b) verify that the given 'set.index' term is recognized:
        $indexKey = $contextSet . $contextSetIndexConnector . $indexName;

        if (!isset($indexNamesArray[$indexKey])) {
            // check whether the index is known at all (without or within any of the supported context sets):
            $indexKnownElsewhere = false;

            foreach (array("", "dc.", "bath.", "rec.", "bib.", "cql.") as $setPrefix) {
                if (isset($indexNamesArray[$setPrefix . $indexName])) {
                    $indexKnownElsewhere = true;
                    break;
                }
            }

            if ($indexKnownElsewhere) {
                returnDiagnostic(10, "Unsupported combination of " . $contextSetLabel . " with index '" . $indexName . "'"); // unsupported combination of context set & index
            } else {
                returnDiagnostic(16, $indexName); // unsupported index
            }

            exit;
        }

        $fieldName = $indexNamesArray[$indexKey];

        // ----------------

        // remove slashes from search term if 'magic_quotes_gpc = On':
        $searchTerm = stripSlashesIfMagicQuotes($searchTerm); // function 'stripSlashesIfMagicQuotes()' is defined in 'include.inc.php'

        // remove any leading or trailing quotes from the search term:
        // (note that multiple query parts connected with boolean operators aren't supported yet!)
        $searchTerm = preg_replace('/^\\"/', '', $searchTerm);
        $searchTerm = preg_replace('/\\"$/', '', $searchTerm);

        // Keep the literal term for relations that compare plain values ('exact', 'within', '<', '<=', '>', '>=').
        // The regex-escaped variant built below must only be used with (NOT) RLIKE, otherwise characters such
        // as '.' or '-' would end up in the compared value with a preceding backslash.
        $literalSearchTerm = $searchTerm;

        // OpenSearch search suggestions ('$operation=suggest'): since CQL matches full words (not sub-strings),
        // we need to make sure that every search term ends with the '*' masking character:
        // (the possessive quantifier '++' prevents backtracking into the word, which would otherwise
        //  turn an already masked term like 'foo*' into 'fo*o*')
        if ($isSuggestOperation and $mainRelation != "exact") {
            $searchTerm = preg_replace("/([{$word}]++)(?![?*^])/{$patternModifiers}", "\\1*", $searchTerm);
        }

        // escape meta characters (including '/' that is used as delimiter for the PCRE replace functions below and which gets passed as second argument):
        $searchTerm = preg_quote($searchTerm, "/"); // escape special regular expression characters: . \ + * ? [ ^ ] $ ( ) { } = ! < > | :

        // account for CQL anchoring ('^') and masking ('*' and '?') characters:
        // NOTE: in the code block above we quote everything to escape possible meta characters,
        //       so all special chars in the block below have to be matched in their escaped form!
        //       (The expression '\\\\' in the patterns below describes only *one* backslash! -> '\'.
        //        The reason for this is that before the regex engine can interpret the \\ into \, PHP interprets it.
        //        Thus, you have to escape your backslashes twice: once for PHP, and once for the regex engine.)
        //
        // more info about masking characters in CQL:  <http://zing.z3950.org/cql/intro.html#6>
        // more info about word anchoring in CQL:      <http://zing.z3950.org/cql/intro.html#6.1>

        // recognize any anchor at the beginning of a search term (like '^foo'):
        // (in CQL, a word beginning with ^ must be the first in its field)
        $searchTerm = preg_replace('/(^| )\\\\\\^/', '\\1^', $searchTerm);

        // convert any anchor at the end of a search term (like 'foo^') to the correct MySQL variant ('foo$'):
        // (in CQL, a word ending with ^ must be the last in its field)
        $searchTerm = preg_replace('/\\\\\\^( |$)/', '$\\1', $searchTerm);

        // recognize any masking ('*' and '?') characters:
        // Note: by "character" we do refer to *word* characters here, i.e., any character that is not a space or punctuation character;
        //       however, it's not clear if the masking characters '*' and '?' should also include non-word characters!
        $searchTerm = preg_replace('/(?<!\\\\)\\\\\\*/', '[^[:space:][:punct:]]*', $searchTerm); // a single asterisk ('*') is used to mask zero or more characters
        $searchTerm = preg_replace('/(?<!\\\\)\\\\\\?/', '[^[:space:][:punct:]]', $searchTerm); // a single question mark ('?') is used to mask a single character, thus N consecutive question-marks means mask N characters

        // ----------------

        // construct the WHERE clause:
        // NOTE: For word-matching relations (like 'all', 'any' or '=') we could also use the word boundaries
        //       '[[:<:]]' and '[[:>:]]' which would be more (too?) restrictive.
        $wordStart = '(^|[[:space:][:punct:]])';
        $wordEnd = '([[:space:][:punct:]]|$)';

        $whereClausePart = $fieldName; // start WHERE clause with field name

        switch ($mainRelation) {
            case "all": // every given word must occur in the field
                if (preg_match("/ /", $searchTerm)) {
                    $whereClauseSubPartsArray = array();

                    foreach (preg_split("/ +/", $searchTerm) as $searchTermItem) {
                        $whereClauseSubPartsArray[] = " RLIKE " . quote_smart($wordStart . $searchTermItem . $wordEnd);
                    }

                    $whereClausePart .= implode(" AND " . $fieldName, $whereClauseSubPartsArray);
                } else {
                    $whereClausePart .= " RLIKE " . quote_smart($wordStart . $searchTerm . $wordEnd);
                }
                break;

            case "any": // at least one of the given words must occur in the field
                $searchTerm = splitAndMerge("/ +/", "|", $searchTerm); // function 'splitAndMerge()' is defined in 'include.inc.php'
                $whereClausePart .= " RLIKE " . quote_smart($wordStart . "(" . $searchTerm . ")" . $wordEnd);
                break;

            case "exact": // 'exact' is used for exact string matching, i.e., it matches field contents exactly
                $whereClausePart .= " = " . quote_smart($literalSearchTerm);
                break;

            case "within": // range search, requires a lower and an upper bound
                if (preg_match("/[^ ]+ [^ ]+/", $literalSearchTerm)) {
                    $rangeBoundsArray = preg_split("/ +/", $literalSearchTerm);

                    $whereClausePart .= " >= " . quote_smart($rangeBoundsArray[0])
                                      . " AND " . $fieldName
                                      . " <= " . quote_smart($rangeBoundsArray[1]);
                } else {
                    returnDiagnostic(36, "Search term requires two space-separated dimensions. Example: dc.date within \"2004 2005\"");
                    exit;
                }
                break;

            case "=": // matches full words (not sub-strings); '=' is used for word adjacency, the words appear in that order with no others intervening
                $whereClausePart .= " RLIKE " . quote_smart($wordStart . $searchTerm . $wordEnd);
                break;

            case "<>": // negated full-word match
                $whereClausePart .= " NOT RLIKE " . quote_smart($wordStart . $searchTerm . $wordEnd);
                break;

            case "<":
            case "<=":
            case ">":
            case ">=":
                // plain value comparison, the relation is passed through as SQL operator
                $whereClausePart .= " " . $mainRelation . " " . quote_smart($literalSearchTerm);
                break;
        }

        $searchSubArray1[] = array("_boolean" => "", "_query" => $whereClausePart);
    } else {
        // no query given: match all records
        $searchSubArray1[] = array("_boolean" => "", "_query" => "serial RLIKE " . quote_smart(".+"));
    }

    // --------------------------------

    if (!empty($searchSubArray1)) {
        $searchArray[] = array("_boolean" => "", "_query" => $searchSubArray1);
    }

    return $searchArray;
}