# Extract indexable text from a resource file and save it to the configured metadata field.
#
# $ref       - reference of the resource the text is saved against.
# $extension - file extension; selects the extraction method (doc, docx, xlsx, odt, ods, odp, pdf, ai, html, htm, txt, zip).
# $path      - optional alternate file to read (for example, in the case of unoconv). When empty the path is
#              looked up with get_resource_path().
#
# Returns false when a required external executable (antiword / pdftotext) cannot be found; otherwise returns nothing.
function extract_text($ref, $extension, $path = "")
{
    global $extracted_text_field, $antiword_path, $pdftotext_path, $zip_contents_field, $zip_contents_field_crop;
    $text = "";
    if ($path == "") {
        $path = get_resource_path($ref, true, "", false, $extension);
    }
    # Microsoft Word extraction using AntiWord.
    if ($extension == "doc" && isset($antiword_path)) {
        # Try the Unix style binary name first, then the Windows one.
        $command = $antiword_path . "/antiword";
        if (!file_exists($command)) {
            $command = $antiword_path . "\\antiword.exe";
        }
        if (!file_exists($command)) {
            debug("ERROR: Antiword executable not found at '{$antiword_path}'");
            return false;
        }
        # Escape the file argument the same way the zip based branches below do, rather than hand-quoting it.
        $text = run_command($command . " -m UTF-8 " . escapeshellarg($path));
    }
    # Microsoft OfficeOpen (docx,xlsx) extraction
    # This is not perfect and needs some work, but does at least extract indexable content.
    if ($extension == "docx" || $extension == "xlsx") {
        # NOTE: $path is overwritten with its shell-escaped form here (and in the ODT and ZIP branches),
        # so the "textextraction" hook below receives the escaped value. Kept as-is for compatibility.
        $path = escapeshellarg($path);
        # DOCX/XLSX files are zip files; the content is in word/document.xml or xl/sharedStrings.xml.
        # We extract this then remove tags.
        switch ($extension) {
            case "xlsx":
                $text = run_command("unzip -p {$path} \"xl/sharedStrings.xml\"");
                break;
            case "docx":
                $text = run_command("unzip -p {$path} \"word/document.xml\"");
                break;
        }
        # Remove tags, but add newlines as appropriate (without this, separate text blocks are joined together with no spaces).
        $text = str_replace("<", "\n<", $text);
        $text = trim(strip_tags($text));
        # Condense multiple line breaks.
        while (strpos($text, "\n\n") !== false) {
            $text = str_replace("\n\n", "\n", $text);
        }
    }
    # OpenOffice Text (ODT), Spreadsheet (ODS) and Presentation (ODP)
    if ($extension == "odt" || $extension == "ods" || $extension == "odp") {
        $path = escapeshellarg($path);
        # These files are zip files and the content is in content.xml.
        # We extract this then remove tags.
        $text = run_command("unzip -p {$path} \"content.xml\"");
        # Remove tags, but add newlines as appropriate (without this, separate text blocks are joined together with no spaces).
        $text = str_replace("<", "\n<", $text);
        $text = trim(strip_tags($text));
        # Condense multiple line breaks.
        while (strpos($text, "\n\n") !== false) {
            $text = str_replace("\n\n", "\n", $text);
        }
    }
    # PDF extraction using pdftotext (part of the XPDF project)
    if (($extension == "pdf" || $extension == "ai") && isset($pdftotext_path)) {
        $command = $pdftotext_path . "/pdftotext";
        if (!file_exists($command)) {
            $command = $pdftotext_path . "\\pdftotext.exe";
        }
        if (!file_exists($command)) {
            debug("ERROR: pdftotext executable not found at '{$pdftotext_path}'");
            return false;
        }
        # The trailing "-" is passed to pdftotext as its output argument.
        $text = run_command($command . " -enc UTF-8 " . escapeshellarg($path) . " -");
    }
    # HTML extraction
    if ($extension == "html" || $extension == "htm") {
        $text = strip_tags(file_get_contents($path));
    }
    # TXT extraction
    if ($extension == "txt") {
        $text = file_get_contents($path);
    }
    if ($extension == "zip") {
        # Zip files - store the archive listing.
        $path = escapeshellarg($path);
        $text = run_command("unzip -l {$path}");
        # Remove the first few lines according to $zip_contents_field_crop in config.
        # The setting is a number of lines, so it is used directly as the amount to drop
        # (calling count() on it is an error for a non-countable value).
        $crop = (int) $zip_contents_field_crop;
        if ($crop > 0) {
            $text = join("\n", array_slice(explode("\n", $text), $crop));
        }
        # Zip listings go to their own field when one is configured.
        # NOTE: this overwrites the global $extracted_text_field for the rest of the request.
        if (isset($zip_contents_field)) {
            $extracted_text_field = $zip_contents_field;
        }
    }
    hook("textextraction", "all", array($extension, $path));
    # Save the extracted text.
    if ($text != "") {
        # A hook may replace the extracted text; an empty result leaves it untouched.
        $modified_text = hook("modifiedextractedtext", '', array($text));
        if (!empty($modified_text)) {
            $text = $modified_text;
        }
        # Save text
        update_field($ref, $extracted_text_field, $text);
        # Update XML metadata dump file.
        update_xml_metadump($ref);
    }
}
# Example #2
# 0
function save_resource_data_multi($collection)
	{
	# Save all submitted data for collection $collection, this is for the 'edit multiple resources' feature.
	# Submitted values are read with getval()/getvalescaped() and applied to every resource in the collection:
	# metadata fields, related resources, archive status, access level, resource type, location and map zoom.
	# Returns true when the "altercollist" hook hands back an empty list (nothing to save); otherwise returns nothing.
	global $auto_order_checkbox, $filename_field;

	$list=get_collection_resources($collection);

	# Alter the collection list to spare some resources when saving multiple, if a plugin needs to.
	# An empty array returned by the hook means there is nothing left to save.
	$tmp=hook("altercollist","",array("save_resource_data_multi",$list));
	if (is_array($tmp))
		{
		if (count($tmp)>0) {$list=$tmp;} else {return true;}
		}

	# The field definitions are read from the first resource in the list. With an empty list there is no
	# resource to read them from, so skip the lookup rather than indexing a missing element.
	$fields=array();
	if (count($list)>0)
		{
		$ref=$list[0];
		$fields=get_resource_field_data($ref,true);
		}
	$expiry_field_edited=false;
	$joins=null; # List of 'joined' fields, loaded on first use.

	# Loop through the field data and save (if necessary)
	for ($n=0;$n<count($fields);$n++)
		{
		if (getval("editthis_field_" . $fields[$n]["ref"],"")!="" || hook("save_resource_data_multi_field_decision","",array($fields[$n]["ref"])))
			{
			if ($fields[$n]["type"]==2)
				{
				# construct the value from the ticked boxes
				$val=","; # Note: it seems wrong to start with a comma, but this ensures it is treated as a comma separated list by split_keywords(), so if just one item is selected it still does individual word adding, so 'South Asia' is split to 'South Asia','South','Asia'.
				$options=trim_array(explode(",",$fields[$n]["options"]));
				if ($auto_order_checkbox) {sort($options);}
				
				for ($m=0;$m<count($options);$m++)
					{
					# Each checkbox is submitted as <field ref>_<md5 of option text> with the value "yes" when ticked.
					$name=$fields[$n]["ref"] . "_" . md5($options[$m]);
					if (getval($name,"")=="yes")
						{
						if ($val!=",") {$val.=",";}
						$val.=$options[$m];
						}
					}
				}
			elseif ($fields[$n]["type"]==4 || $fields[$n]["type"]==6 || $fields[$n]["type"]==10)
				{
				# date/expiry date type, construct the value from the date dropdowns
				# (-y year, -m month, -d day, -h hour, -i minute); each part is only added when the previous one was supplied.
				$val=sprintf("%04d", getvalescaped("field_" . $fields[$n]["ref"] . "-y",""));
				if ((int)$val<=0) 
					{
					$val="";
					}
				elseif (($field=getvalescaped("field_" . $fields[$n]["ref"] . "-m",""))!="") 
					{
					$val.="-" . $field;
					if (($field=getvalescaped("field_" . $fields[$n]["ref"] . "-d",""))!="") 
						{
						$val.="-" . $field;
						if (($field=getval("field_" . $fields[$n]["ref"] . "-h",""))!="")
							{
							$val.=" " . $field . ":";
							if (($field=getvalescaped("field_" . $fields[$n]["ref"] . "-i",""))!="") 
								{
									$val.=$field;
								} 
							else 
								{
									# No minutes submitted, default to the top of the hour.
									$val.="00";
								}
							}
						}
					}
				}
			elseif ($fields[$n]["type"] == 3)
				{
				$val=getvalescaped("field_" . $fields[$n]["ref"],"");				
				// if it doesn't already start with a comma, add one
				if (substr($val,0,1) != ',')
					{
					$val = ','.$val;
					}
				}
			else
				{
				$val=getvalescaped("field_" . $fields[$n]["ref"],"");
				}
			$origval=$val;

			# The edit mode is the same for every resource, so read it once per field.
			# FR = find and replace, AP = append, PP = prepend, RM = remove; anything else replaces the value.
			$mode=getval("modeselect_" . $fields[$n]["ref"],"");

			# Loop through all the resources and save.
			for ($m=0;$m<count($list);$m++)
				{
				$ref=$list[$m];

				# Work out existing field value.
				$existing=escape_check(sql_value("select value from resource_data where resource='$ref' and resource_type_field='" . $fields[$n]["ref"] . "'",""));
				
				if ($mode=="FR")
					{
					# Find and replace mode. Perform the find and replace on the existing value.
					$val=str_replace
						(
						getvalescaped("find_" . $fields[$n]["ref"],""),
						getvalescaped("replace_" . $fields[$n]["ref"],""),
						$existing
						);
					}
				elseif ($mode=="AP")
					{
					# Append text/option(s) mode.
					if ($fields[$n]["type"]!=2 && $fields[$n]["type"]!=3)
						{
						# Automatically append a space when appending text types.
						$val=$existing . " " . $origval;
						}
					else
						{
						# Checkbox/dropdown types can just append immediately (a comma will already be present at the beginning of $origval).
						$val=$existing . $origval;
						}
					}
				elseif ($mode=="PP")
					{
					# Prepend text/option(s) mode.
					if ($fields[$n]["ref"]==$filename_field)
						{
						$val=rtrim($origval,"_")."_".trim($existing); // use an underscore if editing filename.
						}
					else
						{
						# Automatically add a space when prepending text types.
						$val=$origval . " " . $existing;
						}
					}
				elseif ($mode=="RM")
					{
					# Remove text/option(s) mode.
					$val=str_replace($origval,"",$existing);
					}
					
				$val=strip_leading_comma($val);		
				#echo "<li>existing=$existing, new=$val";
				if ($existing!=str_replace("\\","",$val))
					{
					# This value is different from the value we have on record.
					
					# Write this edit to the log.
					resource_log($ref,'m',$fields[$n]["ref"],"",$existing,$val);
		
					# Expiry field? Set that expiry date(s) have changed so the expiry notification flag will be reset later in this function.
					if ($fields[$n]["type"]==6) {$expiry_field_edited=true;}
				
					# If this is a 'joined' field we need to add it to the resource column
					if ($joins===null) {$joins=get_resource_table_joins();}
					if (in_array($fields[$n]["ref"],$joins))
						{
						sql_query("update resource set field".$fields[$n]["ref"]."='".escape_check($val)."' where ref='$ref'");
						}		
						
					# Purge existing data and keyword mappings, decrease keyword hitcounts.
					sql_query("delete from resource_data where resource='$ref' and resource_type_field='" . $fields[$n]["ref"] . "'");
					
					# Insert new data and keyword mappings, increase keyword hitcounts.
					sql_query("insert into resource_data(resource,resource_type_field,value) values('$ref','" . $fields[$n]["ref"] . "','" . escape_check($val) . "')");
		
					$oldval=$existing;
					$newval=$val;
					
					if ($fields[$n]["type"]==3)
						{
						# Prepend a comma when indexing dropdowns
						$newval="," . $val;
						$oldval="," . $oldval;
						}
					
					if ($fields[$n]["keywords_index"]==1)
						{
						# Date field? These need indexing differently.
						$is_date=($fields[$n]["type"]==4 || $fields[$n]["type"]==6); 
						remove_keyword_mappings($ref,i18n_get_indexable($oldval),$fields[$n]["ref"],$fields[$n]["partial_index"],$is_date);
						add_keyword_mappings($ref,i18n_get_indexable($newval),$fields[$n]["ref"],$fields[$n]["partial_index"],$is_date);
						}
					}
				}
			}
		}
		
	# Also save related resources field
	if (getval("editthis_related","")!="")
		{
		$related=explode(",",getvalescaped("related",""));
		# Make sure all submitted values are numeric
		$ok=array();
		for ($n=0;$n<count($related);$n++)
			{
			if (is_numeric(trim($related[$n]))) {$ok[]=trim($related[$n]);}
			}

		for ($m=0;$m<count($list);$m++)
			{
			$ref=$list[$m];
			sql_query("delete from resource_related where resource='$ref' or related='$ref'"); # remove existing related items
			# Build a single multi-row insert: ($ref,a),($ref,b),...
			if (count($ok)>0) {sql_query("insert into resource_related(resource,related) values ($ref," . join("),(" . $ref . ",",$ok) . ")");}
			}
		}

	# Also update archive status
	if (getval("editthis_status","")!="")
		{
		$notifyrefs=array();
		$archive=getvalescaped("archive",0);
		for ($m=0;$m<count($list);$m++)
			{
			$ref=$list[$m];
			$oldarchive=sql_value("select archive value from resource where ref='$ref'",0);
			
			if ($oldarchive!=$archive)
				{
				sql_query("update resource set archive='" . $archive . "' where ref='$ref'");

				# Log
				resource_log($ref,"s",0,"",$oldarchive,$archive);

				if ($oldarchive==-2 && $archive==-1)
					{
					# Notify the admin users of this change.
					$notifyrefs[]=$ref;
					}
				}
			}
		if (count($notifyrefs)>0)
			{
			# Notify the admin users of any submitted resources.
			notify_user_contributed_submitted($notifyrefs);
			}
		}
	
	# Expiry field(s) edited? Reset the notification flag so that warnings are sent again when the date is reached.
	if ($expiry_field_edited)
		{
		if (count($list)>0)
			{
			sql_query("update resource set expiry_notification_sent=0 where ref in (" . join(",",$list) . ")");
			}
		}
	
	# Also update access level
	if (getval("editthis_access","")!="")
		{
		$access=getvalescaped("access",0);
		for ($m=0;$m<count($list);$m++)
			{
			$ref=$list[$m];
			$oldaccess=sql_value("select access value from resource where ref='$ref'","");
			
			if ($access!=$oldaccess)
				{
				sql_query("update resource set access='$access' where ref='$ref'");
				
				resource_log($ref,"a",0,"",$oldaccess,$access);
				}
			
			# For access level 3 (custom) - also save custom permissions
			if ($access==3) {save_resource_custom_access($ref);}
			}
		}
	
	# Update resource type?
	if (getval("editresourcetype","")!="")
		{
		for ($m=0;$m<count($list);$m++)
			{
			$ref=$list[$m];
			update_resource_type($ref,getvalescaped("resource_type",""));
			}
		}
		
	# Update location?
	if (getval("editlocation","")!="")
		{
		# Location is submitted as "<latitude>,<longitude>"; an empty value clears the stored coordinates.
		$location=explode(",",getvalescaped("location",""));
		if (count($list)>0) 
			{
			if (count($location)==2)
				{
				$geo_lat=(float)$location[0];
				$geo_long=(float)$location[1];
				sql_query("update resource set geo_lat=$geo_lat,geo_long=$geo_long where ref in (" . join(",",$list) . ")");
				}
			elseif (getvalescaped("location","")=="")
				{
				sql_query("update resource set geo_lat=null,geo_long=null where ref in (" . join(",",$list) . ")");
				}
			}
		}

	# Update mapzoom?
	if (getval("editmapzoom","")!="")
		{
		$mapzoom=getvalescaped("mapzoom","");
		if (count($list)>0)
			{
			if ($mapzoom=="")
				{
				sql_query("update resource set mapzoom=null where ref in (" . join(",",$list) . ")");
				}
			elseif (is_numeric($mapzoom))
				{
				# The value is placed into the SQL unquoted, so quote-escaping alone does not protect it.
				# Only numeric input is accepted and it is cast to an integer before use.
				$mapzoom=(int)$mapzoom;
				sql_query("update resource set mapzoom=$mapzoom where ref in (" . join(",",$list) . ")");
				}
			# Any other (non-numeric) value is ignored and the stored zoom is left unchanged.
			}
		}

	hook("saveextraresourcedata","",array($list));
		
	# Update XML metadata dump file for all edited resources.
	for ($m=0;$m<count($list);$m++)
		{
		update_xml_metadump($list[$m]);
		}
	
	hook("aftersaveresourcedata");	
	}
#
# update_xml_metadump.php
#
#
# update XML metadump files in filestore from scratch
#
include "../../include/db.php";
include "../../include/general.php";
include "../../include/authenticate.php";
# This script may only be run by users holding the "a" permission.
if (!checkperm("a")) {
    exit("Permission denied");
}
include "../../include/resource_functions.php";
include "../../include/image_processing.php";

# Optional 'ref' parameter: restrict the run to a single resource.
# NOTE(review): the third getvalescaped() argument presumably forces a numeric value - confirm against its definition.
$sql = "";
if (getval("ref", "") != "") {
    $sql = "where r.ref='" . getvalescaped("ref", "", true) . "'";
}

# Allow up to five hours, as every resource may be processed.
set_time_limit(60 * 60 * 5);
echo "<pre><strong>\nUpdating XML metadata dump files...</strong>\n\n";

# Optional 'start' parameter: zero-based position in the result list to resume from.
# is_numeric() alone also accepts negative and fractional input, which is not a valid array index,
# so the value is additionally clamped to a non-negative integer.
$start = getval('start', '0');
$start = is_numeric($start) ? max(0, (int) $start) : 0;

$resources = sql_query("select r.ref,u.username,u.fullname from resource r left outer join user u on r.created_by=u.ref {$sql} order by ref");
$total = count($resources);
for ($n = $start; $n < $total; $n++) {
    $ref = $resources[$n]["ref"];
    update_xml_metadump($ref);
    echo "Done {$ref} ({$n}/" . $total . ")<br />\n";
    # Push progress to the browser as each resource completes.
    flush();
}