# Extract text from a resource's file and save it to the configured metadata field.
#
# $ref       - reference of the resource the text belongs to.
# $extension - file extension; selects which extraction method is used.
# $path      - optional alternate file to read (for example, in the case of unoconv).
#              When empty, the path is looked up with get_resource_path().
#
# Returns false when the antiword / pdftotext executable cannot be found;
# otherwise there is no explicit return value.
function extract_text($ref, $extension, $path = "")
{
    global $extracted_text_field, $antiword_path, $pdftotext_path;
    global $zip_contents_field, $zip_contents_field_crop;

    $text = "";
    if ($path == "") {
        $path = get_resource_path($ref, true, "", false, $extension);
    }

    # Microsoft Word extraction using AntiWord.
    if ($extension == "doc" && isset($antiword_path)) {
        $command = $antiword_path . "/antiword";
        if (!file_exists($command)) {
            $command = $antiword_path . "\\antiword.exe";
        }
        if (!file_exists($command)) {
            debug("ERROR: Antiword executable not found at '{$antiword_path}'");
            return false;
        }
        # escapeshellarg() rather than hand-written quotes, so a path containing
        # quotes or other shell metacharacters cannot break out of the argument.
        $text = run_command($command . " -m UTF-8 " . escapeshellarg($path));
    }

    # Zip-based XML formats: Microsoft OfficeOpen (docx, xlsx) and OpenOffice (odt, ods, odp).
    # This is not perfect and needs some work, but does at least extract indexable content.
    # Maps each extension to the archive member that holds the document content.
    $zipped_xml_members = array(
        "docx" => "word/document.xml",
        "xlsx" => "xl/sharedStrings.xml",
        "odt"  => "content.xml",
        "ods"  => "content.xml",
        "odp"  => "content.xml",
    );
    if (is_string($extension) && isset($zipped_xml_members[$extension])) {
        # NOTE: $path is overwritten with its shell-quoted form, so the
        # "textextraction" hook below receives the quoted path for these types.
        # This matches the previous behaviour and is kept for plugin compatibility.
        $path = escapeshellarg($path);
        $text = run_command("unzip -p {$path} \"" . $zipped_xml_members[$extension] . "\"");

        # Remove tags, but add newlines as appropriate (without this, separate
        # text blocks are joined together with no spaces).
        $text = str_replace("<", "\n<", $text);
        $text = trim(strip_tags($text));

        # Condense multiple line breaks.
        while (strpos($text, "\n\n") !== false) {
            $text = str_replace("\n\n", "\n", $text);
        }
    }

    # PDF extraction using pdftotext (part of the XPDF project).
    if (($extension == "pdf" || $extension == "ai") && isset($pdftotext_path)) {
        $command = $pdftotext_path . "/pdftotext";
        if (!file_exists($command)) {
            $command = $pdftotext_path . "\\pdftotext.exe";
        }
        if (!file_exists($command)) {
            debug("ERROR: pdftotext executable not found at '{$pdftotext_path}'");
            return false;
        }
        $text = run_command($command . " -enc UTF-8 " . escapeshellarg($path) . " -");
    }

    # HTML extraction.
    if ($extension == "html" || $extension == "htm") {
        $text = strip_tags(file_get_contents($path));
    }

    # TXT extraction.
    if ($extension == "txt") {
        $text = file_get_contents($path);
    }

    # Zip files - store the archive listing and map it to the zip contents field.
    if ($extension == "zip") {
        # As above, $path is replaced by its quoted form and the hook below sees that value.
        $path = escapeshellarg($path);
        $text = run_command("unzip -l {$path}");

        # Remove the first few lines according to $zip_contents_field_crop in config.
        # The setting is a line count, so it is used directly as the slice offset
        # (calling count() on it was a bug: it is not an array).
        $crop = (int) $zip_contents_field_crop;
        if ($crop > 0) {
            $text = implode("\n", array_slice(explode("\n", $text), $crop));
        }

        if (isset($zip_contents_field)) {
            $extracted_text_field = $zip_contents_field;
        }
    }

    hook("textextraction", "all", array($extension, $path));

    # Save the extracted text.
    if ($text != "") {
        # A plugin may supply replacement text; an empty result leaves the text untouched.
        $modified_text = hook("modifiedextractedtext", '', array($text));
        if (!empty($modified_text)) {
            $text = $modified_text;
        }

        # Save text.
        update_field($ref, $extracted_text_field, $text);

        # Update XML metadata dump file.
        update_xml_metadump($ref);
    }
}
# Save all submitted data for collection $collection; this is for the
# 'edit multiple resources' feature.
#
# $collection - reference of the collection whose resources are being edited.
#
# Reads the submitted form values through getval()/getvalescaped() and, for every
# section the user ticked, applies the change to each resource in the collection:
# metadata fields, related resources, archive status, access level, resource type,
# location and map zoom. Finishes by refreshing the XML metadata dump of each resource.
#
# Returns true when the "altercollist" hook hands back an empty list (nothing to
# save); otherwise there is no explicit return value.
function save_resource_data_multi($collection)
{
    global $auto_order_checkbox, $filename_field;

    $list = get_collection_resources($collection);

    # Alter the collection list to spare some resources when saving multiple, if needed.
    $tmp = hook("altercollist", "", array("save_resource_data_multi", $list));
    if (is_array($tmp)) {
        if (count($tmp) > 0) {
            $list = $tmp;
        } else {
            return true;
        }
    }

    # Everything below indexes $list from 0 upwards, so normalise the keys in case
    # the list arrived with gaps (for example a filtered array returned by the hook).
    if (is_array($list)) {
        $list = array_values($list);
    }
    $resource_count = is_array($list) ? count($list) : 0;

    # The field definitions are taken from the first resource. With an empty
    # collection there is no first resource, so there are no fields to process.
    $fields = array();
    if ($resource_count > 0) {
        $fields = get_resource_field_data($list[0], true);
        if (!is_array($fields)) {
            $fields = array();
        }
    }

    $expiry_field_edited = false;

    # Loop through the field data and save (if necessary).
    foreach ($fields as $fieldinfo) {
        $fref  = $fieldinfo["ref"];
        $ftype = $fieldinfo["type"];

        # Skip fields that were neither ticked for editing nor claimed by a plugin.
        if (getval("editthis_field_" . $fref, "") == ""
            && !hook("save_resource_data_multi_field_decision", "", array($fref))) {
            continue;
        }

        if ($ftype == 2) {
            # Construct the value from the ticked boxes.
            # Note: it seems wrong to start with a comma, but this ensures it is treated
            # as a comma separated list by split_keywords(), so if just one item is
            # selected it still does individual word adding, so 'South Asia' is split
            # to 'South Asia','South','Asia'.
            $val = ",";
            $options = trim_array(explode(",", $fieldinfo["options"]));
            if ($auto_order_checkbox) {
                sort($options);
            }
            foreach ($options as $option) {
                $name = $fref . "_" . md5($option);
                if (getval($name, "") == "yes") {
                    if ($val != ",") {
                        $val .= ",";
                    }
                    $val .= $option;
                }
            }
        } elseif ($ftype == 4 || $ftype == 6 || $ftype == 10) {
            # Date/expiry date type: construct the value from the date dropdowns.
            # Each finer-grained part is only appended when the coarser one was supplied.
            $val = sprintf("%04d", getvalescaped("field_" . $fref . "-y", ""));
            if ((int) $val <= 0) {
                $val = "";
            } elseif (($field = getvalescaped("field_" . $fref . "-m", "")) != "") {
                $val .= "-" . $field;
                if (($field = getvalescaped("field_" . $fref . "-d", "")) != "") {
                    $val .= "-" . $field;
                    if (($field = getval("field_" . $fref . "-h", "")) != "") {
                        $val .= " " . $field . ":";
                        if (($field = getvalescaped("field_" . $fref . "-i", "")) != "") {
                            $val .= $field;
                        } else {
                            $val .= "00";
                        }
                    }
                }
            }
        } elseif ($ftype == 3) {
            $val = getvalescaped("field_" . $fref, "");
            # If it doesn't already start with a comma, add one.
            if (substr($val, 0, 1) != ',') {
                $val = ',' . $val;
            }
        } else {
            $val = getvalescaped("field_" . $fref, "");
        }
        $origval = $val;

        # The edit mode is the same for every resource, so read it once per field.
        $mode = getval("modeselect_" . $fref, "");

        # Loop through all the resources and save.
        # NOTE: $val intentionally carries over from one resource to the next (it is
        # not reset to $origval), matching the long-standing behaviour of this function.
        for ($m = 0; $m < $resource_count; $m++) {
            $ref = $list[$m];

            # Work out existing field value.
            $existing = escape_check(sql_value("select value from resource_data where resource='$ref' and resource_type_field='" . $fref . "'", ""));

            # Find and replace mode? Perform the find and replace.
            if ($mode == "FR") {
                $val = str_replace(
                    getvalescaped("find_" . $fref, ""),
                    getvalescaped("replace_" . $fref, ""),
                    $existing
                );
            }

            # Append text/option(s) mode?
            if ($mode == "AP") {
                if ($ftype != 2 && $ftype != 3) {
                    # Automatically append a space when appending text types.
                    $val = $existing . " " . $origval;
                } else {
                    # Checkbox/dropdown types can just append immediately (a comma will
                    # already be present at the beginning of $origval).
                    $val = $existing . $origval;
                }
            }

            # Prepend text/option(s) mode?
            if ($mode == "PP") {
                if ($fref == $filename_field) {
                    # Use an underscore if editing filename.
                    $val = rtrim($origval, "_") . "_" . trim($existing);
                } else {
                    # Automatically add a space when prepending text types.
                    $val = $origval . " " . $existing;
                }
            }

            # Remove text/option(s) mode?
            if ($mode == "RM") {
                $val = str_replace($origval, "", $existing);
            }

            $val = strip_leading_comma($val);

            # Nothing to do when the value matches what is already on record
            # (backslashes are removed from $val before comparing).
            if ($existing == str_replace("\\", "", $val)) {
                continue;
            }

            # This value is different from the value we have on record.
            # Write this edit to the log.
            resource_log($ref, 'm', $fref, "", $existing, $val);

            # Expiry field? Set that expiry date(s) have changed so the expiry
            # notification flag will be reset later in this function.
            if ($ftype == 6) {
                $expiry_field_edited = true;
            }

            # If this is a 'joined' field we need to add it to the resource column.
            $joins = get_resource_table_joins();
            if (in_array($fref, $joins)) {
                sql_query("update resource set field" . $fref . "='" . escape_check($val) . "' where ref='$ref'");
            }

            # Purge existing data, then insert the new data.
            sql_query("delete from resource_data where resource='$ref' and resource_type_field='" . $fref . "'");
            sql_query("insert into resource_data(resource,resource_type_field,value) values('$ref','" . $fref . "','" . escape_check($val) . "')");

            $oldval = $existing;
            $newval = $val;
            if ($ftype == 3) {
                # Prepend a comma when indexing dropdowns.
                $newval = "," . $val;
                $oldval = "," . $oldval;
            }

            # Swap the keyword mappings from the old value to the new one.
            if ($fieldinfo["keywords_index"] == 1) {
                # Date field? These need indexing differently.
                $is_date = ($ftype == 4 || $ftype == 6);
                remove_keyword_mappings($ref, i18n_get_indexable($oldval), $fref, $fieldinfo["partial_index"], $is_date);
                add_keyword_mappings($ref, i18n_get_indexable($newval), $fref, $fieldinfo["partial_index"], $is_date);
            }
        }
    }

    # Also save related resources field.
    if (getval("editthis_related", "") != "") {
        $related = explode(",", getvalescaped("related", ""));

        # Make sure all submitted values are numeric.
        $ok = array();
        foreach ($related as $candidate) {
            $candidate = trim($candidate);
            if (is_numeric($candidate)) {
                $ok[] = $candidate;
            }
        }

        for ($m = 0; $m < $resource_count; $m++) {
            $ref = $list[$m];

            # Remove existing related items.
            sql_query("delete from resource_related where resource='$ref' or related='$ref'");

            if (count($ok) > 0) {
                sql_query("insert into resource_related(resource,related) values ($ref," . join("),(" . $ref . ",", $ok) . ")");
            }
        }
    }

    # Also update archive status.
    if (getval("editthis_status", "") != "") {
        $notifyrefs = array();
        $archive = getvalescaped("archive", 0);

        for ($m = 0; $m < $resource_count; $m++) {
            $ref = $list[$m];
            $oldarchive = sql_value("select archive value from resource where ref='$ref'", 0);

            if ($oldarchive != $archive) {
                sql_query("update resource set archive='" . $archive . "' where ref='$ref'");

                # Log.
                resource_log($ref, "s", 0, "", $oldarchive, $archive);

                if ($oldarchive == -2 && $archive == -1) {
                    # Notify the admin users of this change.
                    $notifyrefs[] = $ref;
                }
            }
        }

        if (count($notifyrefs) > 0) {
            # Notify the admin users of any submitted resources.
            notify_user_contributed_submitted($notifyrefs);
        }
    }

    # Expiry field(s) edited? Reset the notification flag so that warnings are sent
    # again when the date is reached.
    if ($expiry_field_edited && $resource_count > 0) {
        sql_query("update resource set expiry_notification_sent=0 where ref in (" . join(",", $list) . ")");
    }

    # Also update access level.
    if (getval("editthis_access", "") != "") {
        $access = getvalescaped("access", 0);

        for ($m = 0; $m < $resource_count; $m++) {
            $ref = $list[$m];
            $oldaccess = sql_value("select access value from resource where ref='$ref'", "");

            if ($access != $oldaccess) {
                sql_query("update resource set access='$access' where ref='$ref'");
                resource_log($ref, "a", 0, "", $oldaccess, $access);
            }

            # For access level 3 (custom) - also save custom permissions.
            if ($access == 3) {
                save_resource_custom_access($ref);
            }
        }
    }

    # Update resource type?
    if (getval("editresourcetype", "") != "") {
        for ($m = 0; $m < $resource_count; $m++) {
            $ref = $list[$m];
            update_resource_type($ref, getvalescaped("resource_type", ""));
        }
    }

    # Update location?
    if (getval("editlocation", "") != "") {
        $location = explode(",", getvalescaped("location", ""));
        if ($resource_count > 0) {
            if (count($location) == 2) {
                # Casting to float keeps the unquoted SQL values numeric.
                $geo_lat = (float) $location[0];
                $geo_long = (float) $location[1];
                sql_query("update resource set geo_lat=$geo_lat,geo_long=$geo_long where ref in (" . join(",", $list) . ")");
            } elseif (getvalescaped("location", "") == "") {
                sql_query("update resource set geo_lat=null,geo_long=null where ref in (" . join(",", $list) . ")");
            }
        }
    }

    # Update mapzoom?
    if (getval("editmapzoom", "") != "") {
        $mapzoom = getvalescaped("mapzoom", "");
        if ($resource_count > 0) {
            if ($mapzoom == "") {
                sql_query("update resource set mapzoom=null where ref in (" . join(",", $list) . ")");
            } elseif (is_numeric($mapzoom)) {
                # The value is placed in the statement unquoted, so string escaping
                # alone does not protect it; only numeric input is accepted.
                sql_query("update resource set mapzoom=$mapzoom where ref in (" . join(",", $list) . ")");
            }
            # Anything else is not a valid zoom level and is ignored.
        }
    }

    hook("saveextraresourcedata", "", array($list));

    # Update XML metadata dump file for all edited resources.
    for ($m = 0; $m < $resource_count; $m++) {
        update_xml_metadump($list[$m]);
    }

    hook("aftersaveresourcedata");
}
#
# update_xml_metadump.php
#
#
# Update XML metadump files in filestore from scratch.
#
# Request parameters:
#   ref   - optional; restricts the run to a single resource.
#   start - optional; zero-based offset into the ordered resource list at which to
#           begin. Anything that is not a non-negative number is treated as 0.
#
include "../../include/db.php";
include "../../include/general.php";
include "../../include/authenticate.php";
if (!checkperm("a")) {
    exit("Permission denied");
}
include "../../include/resource_functions.php";
include "../../include/image_processing.php";

# Optional filter: only process the requested resource.
$sql = "";
if (getval("ref", "") != "") {
    $sql = "where r.ref='" . getvalescaped("ref", "", true) . "'";
}

# Allow up to five hours for the run.
set_time_limit(60 * 60 * 5);

echo "<pre><strong>\nUpdating XML metadata dump files...</strong>\n\n";

# $start is used as an array offset below, so it must be a non-negative integer.
# is_numeric() alone also lets negative and fractional values through.
$start = getval('start', '0');
$start = is_numeric($start) ? max(0, (int) $start) : 0;

$resources = sql_query("select r.ref,u.username,u.fullname from resource r left outer join user u on r.created_by=u.ref {$sql} order by ref");
$total = count($resources);

for ($n = $start; $n < $total; $n++) {
    $ref = $resources[$n]["ref"];
    update_xml_metadump($ref);

    # Report progress straight away so long runs show activity in the browser.
    echo "Done {$ref} ({$n}/" . $total . ")<br />\n";
    flush();
}