/**
 * Given an array of droplets, populate each droplet's `identity_id` from the
 * `identities` table, creating the identities that do not exist yet.
 *
 * Droplets that already carry an `identity_id` are skipped. The array is
 * modified in place.
 *
 * @param   array  $droplets  Droplets; every entry without `identity_id` is read for
 *                            `channel`, `identity_orig_id`, `identity_name`,
 *                            `identity_username` and `identity_avatar`
 * @return  void
 */
public static function get_identities(array &$droplets)
{
    if (empty($droplets))
    {
        return;
    }

    // Map identity hash => keys of the droplets that share that identity
    $identities_idx = array();
    foreach ($droplets as $key => $droplet)
    {
        if (isset($droplet['identity_id']))
        {
            continue;
        }

        $hash = md5($droplet['channel'].$droplet['identity_orig_id']);
        if (empty($identities_idx[$hash]))
        {
            $identities_idx[$hash] = array();
        }
        $identities_idx[$hash][] = $key;
    }

    if (empty($identities_idx))
    {
        return;
    }

    // __CLASS__ yields the same name the argument-less get_class() did
    Swiftriver_Mutex::obtain(__CLASS__, 3600);

    try
    {
        // Find those that exist
        $found = DB::select('hash', 'id')
            ->from('identities')
            ->where('hash', 'IN', array_keys($identities_idx))
            ->execute()
            ->as_array();

        // Update the found entries and drop them from the index so that
        // only the identities still to be created remain
        foreach ($found as $row)
        {
            if ( ! isset($identities_idx[$row['hash']]))
            {
                // Unexpected or duplicate row; nothing left to update for it
                continue;
            }

            foreach ($identities_idx[$row['hash']] as $key)
            {
                $droplets[$key]['identity_id'] = $row['id'];
            }
            unset($identities_idx[$row['hash']]);
        }

        // Create the missing entries
        if ( ! empty($identities_idx))
        {
            // Get a range of IDs to be used in inserting the new identities
            $base_id = self::get_ids(count($identities_idx));

            $query = DB::insert('identities', array(
                'id',
                'hash',
                'channel',
                'identity_orig_id',
                'identity_name',
                'identity_username',
                'identity_avatar',
            ));

            foreach ($identities_idx as $hash => $keys)
            {
                // All droplets under one hash share the identity; the details
                // of the last one are used for the new row
                $droplet = NULL;
                foreach ($keys as $key)
                {
                    $droplet = $droplets[$key];
                    $droplets[$key]['identity_id'] = $base_id;
                }

                $query->values(array(
                    'id' => $base_id++,
                    'hash' => $hash,
                    'channel' => $droplet['channel'],
                    'identity_orig_id' => $droplet['identity_orig_id'],
                    'identity_name' => $droplet['identity_name'],
                    'identity_username' => $droplet['identity_username'],
                    'identity_avatar' => $droplet['identity_avatar'],
                ));
            }

            $query->execute();
        }
    }
    catch (Exception $e)
    {
        // Do not leave the lock held when a query fails
        Swiftriver_Mutex::release(__CLASS__);
        throw $e;
    }

    Swiftriver_Mutex::release(__CLASS__);
}
/**
 * Creates or updates river tag trends from the given array.
 *
 * Each trend is identified by the md5 of its `river_id`, `date_pub`, `tag`
 * and `tag_type`. Trends whose hash already exists in `river_tag_trends`
 * have their stored `count` incremented by the given `count`; all others
 * are inserted as new rows.
 *
 * @param   array  $trends  Trends, each with `river_id`, `date_pub`, `tag`,
 *                          `tag_type` and `count`
 * @return  void
 */
public static function create_from_array($trends)
{
    if (empty($trends))
    {
        return;
    }

    // __CLASS__ yields the same name the argument-less get_class() did
    Swiftriver_Mutex::obtain(__CLASS__, 3600);

    try
    {
        // Map trend hash => key of the trend in $trends.
        // NOTE(review): when two entries share a hash only the last one is
        // kept - presumably callers pre-aggregate; confirm.
        $trends_idx = array();
        foreach (array_keys($trends) as $key)
        {
            $trend = $trends[$key];
            $hash = md5($trend['river_id'].$trend['date_pub'].$trend['tag'].$trend['tag_type']);

            // Written through the key rather than a foreach reference so no
            // dangling reference to the last element survives the loop
            $trends[$key]['hash'] = $hash;
            $trends_idx[$hash] = $key;
        }

        // Find the trends that already exist by their hash
        $found = DB::select('hash', 'id')
            ->from('river_tag_trends')
            ->where('hash', 'IN', array_keys($trends_idx))
            ->execute()
            ->as_array();

        // Build the derived table used to update existing trends and
        // remove them from the trends_idx
        $update_rows = array();
        foreach ($found as $row)
        {
            if ( ! isset($trends_idx[$row['hash']]))
            {
                // Unexpected or duplicate row; already handled
                continue;
            }

            // The count is concatenated into raw SQL, so force it to an integer
            $count = (int) $trends[$trends_idx[$row['hash']]]['count'];
            $update_rows[] = 'select '.$row['id'].' id, '.$count.' count';
            unset($trends_idx[$row['hash']]);
        }

        if ( ! empty($update_rows))
        {
            $query = "UPDATE `river_tag_trends` JOIN (".implode(' union all ', $update_rows).") a "
                ."USING (`id`) SET `river_tag_trends`.`count` = `river_tag_trends`.`count` + `a`.`count`";
            DB::query(Database::UPDATE, $query)->execute();
        }

        if ( ! empty($trends_idx))
        {
            // Get a range of IDs to be used in inserting the new trends
            $base_id = Model_River_Tag_Trend::get_ids(count($trends_idx));

            // Insert into the river_tag_trends table
            $query = DB::insert('river_tag_trends', array(
                'id',
                'hash',
                'river_id',
                'date_pub',
                'tag',
                'tag_type',
                'count',
            ));

            foreach ($trends_idx as $hash => $key)
            {
                $query->values(array(
                    'id' => $base_id++,
                    'hash' => $trends[$key]['hash'],
                    'river_id' => $trends[$key]['river_id'],
                    'date_pub' => $trends[$key]['date_pub'],
                    'tag' => $trends[$key]['tag'],
                    'tag_type' => $trends[$key]['tag_type'],
                    'count' => $trends[$key]['count'],
                ));
            }

            $query->execute();
        }
    }
    catch (Exception $e)
    {
        // Do not leave the lock held when a query fails
        Swiftriver_Mutex::release(__CLASS__);
        throw $e;
    }

    Swiftriver_Mutex::release(__CLASS__);
}
/**
 * Populate IDs into the drops' tags, creating the tags that are missing
 * from the `tags` table.
 *
 * A drop may carry a `tags` array whose entries are hashes containing the
 * tag name and type, e.g.
 *
 *     $tag = array('tag_name' => 'bubba', 'tag_type' => 'junk');
 *
 * Entries that already have an `id` are skipped. For the others `tag_name`
 * is trimmed and `tag_type` is trimmed and lower-cased in place, and an `id`
 * key is added. The drops array is modified in place.
 *
 * @param   array  $drops  Drops whose `tags` entries should receive IDs
 * @return  void
 */
public static function get_tags(&$drops)
{
    if (empty($drops))
    {
        return;
    }

    // Generate the tag hashes and create an index of the given tags:
    // hash => tag details plus the (drop key, tag key) pairs that use it
    $tags_idx = array();
    foreach ($drops as $drop_key => &$drop)
    {
        if ( ! isset($drop['tags']))
        {
            continue;
        }

        foreach ($drop['tags'] as $tag_key => &$tag)
        {
            if (isset($tag['id']))
            {
                continue;
            }

            $tag['tag_name'] = trim($tag['tag_name']);
            $tag['tag_type'] = strtolower(trim($tag['tag_type']));
            $hash = md5($tag['tag_name'].$tag['tag_type']);

            if (empty($tags_idx[$hash]))
            {
                $tags_idx[$hash]['tag_name'] = $tag['tag_name'];
                $tags_idx[$hash]['tag_type'] = $tag['tag_type'];
                $tags_idx[$hash]['keys'] = array();
            }
            $tags_idx[$hash]['keys'][] = array($drop_key, $tag_key);
        }
        // Break the reference so later writes cannot alias the last tag
        unset($tag);
    }
    // Break the reference so later writes cannot alias the last drop
    unset($drop);

    if (empty($tags_idx))
    {
        return;
    }

    // __CLASS__ yields the same name the argument-less get_class() did
    Swiftriver_Mutex::obtain(__CLASS__, 3600);

    try
    {
        // Find those that exist
        $found = DB::select('hash', 'id')
            ->from('tags')
            ->where('hash', 'IN', array_keys($tags_idx))
            ->execute()
            ->as_array();

        // Update the found entries and drop them from the index so that
        // only the tags still to be created remain
        foreach ($found as $row)
        {
            if ( ! isset($tags_idx[$row['hash']]))
            {
                // Unexpected or duplicate row; nothing left to update for it
                continue;
            }

            foreach ($tags_idx[$row['hash']]['keys'] as $keys)
            {
                list($drop_key, $tag_key) = $keys;
                $drops[$drop_key]['tags'][$tag_key]['id'] = $row['id'];
            }
            unset($tags_idx[$row['hash']]);
        }

        if ( ! empty($tags_idx))
        {
            // Get a range of IDs to be used in inserting the new tags
            $base_id = self::get_ids(count($tags_idx));

            $query = DB::insert('tags', array('id', 'hash', 'tag', 'tag_canonical', 'tag_type'));

            foreach ($tags_idx as $hash => $value)
            {
                foreach ($value['keys'] as $keys)
                {
                    list($drop_key, $tag_key) = $keys;
                    $drops[$drop_key]['tags'][$tag_key]['id'] = $base_id;
                }

                $query->values(array(
                    'id' => $base_id++,
                    'hash' => $hash,
                    'tag' => $value['tag_name'],
                    'tag_canonical' => strtolower($value['tag_name']),
                    'tag_type' => $value['tag_type'],
                ));
            }

            $query->execute();
        }
    }
    catch (Exception $e)
    {
        // Do not leave the lock held when a query fails
        Swiftriver_Mutex::release(__CLASS__);
        throw $e;
    }

    Swiftriver_Mutex::release(__CLASS__);
}
/**
 * Populate IDs into the drops' links, creating the links that are missing
 * from the `links` table.
 *
 * A drop may carry a `links` array whose entries are hashes with a `url`
 * key. Entries that already have an `id` are skipped; the others get an
 * `id` key added. The drops array is modified in place.
 *
 * @param   array  $drops  Drops whose `links` entries should receive IDs
 * @return  void
 */
public static function get_links(&$drops)
{
    if (empty($drops))
    {
        return;
    }

    // Generate the url hashes and create an index of the given links:
    // hash => url plus the (drop key, link key) pairs that use it.
    // Nothing is written to the drops here, so iterate by value.
    $links_idx = array();
    foreach ($drops as $drop_key => $drop)
    {
        if ( ! isset($drop['links']))
        {
            continue;
        }

        foreach ($drop['links'] as $link_key => $link)
        {
            if (isset($link['id']))
            {
                continue;
            }

            $hash = md5($link['url']);
            if (empty($links_idx[$hash]))
            {
                $links_idx[$hash]['url'] = $link['url'];
                $links_idx[$hash]['keys'] = array();
            }
            $links_idx[$hash]['keys'][] = array($drop_key, $link_key);
        }
    }

    if (empty($links_idx))
    {
        return;
    }

    // __CLASS__ yields the same name the argument-less get_class() did
    Swiftriver_Mutex::obtain(__CLASS__, 3600);

    try
    {
        // Find those that exist
        $found = DB::select('hash', 'id')
            ->from('links')
            ->where('hash', 'IN', array_keys($links_idx))
            ->execute()
            ->as_array();

        // Update the found entries and drop them from the index so that
        // only the links still to be created remain
        foreach ($found as $row)
        {
            if ( ! isset($links_idx[$row['hash']]))
            {
                // Unexpected or duplicate row; nothing left to update for it
                continue;
            }

            foreach ($links_idx[$row['hash']]['keys'] as $keys)
            {
                list($drop_key, $link_key) = $keys;
                $drops[$drop_key]['links'][$link_key]['id'] = $row['id'];
            }
            unset($links_idx[$row['hash']]);
        }

        if ( ! empty($links_idx))
        {
            // Get a range of IDs to be used in inserting the new links
            $base_id = self::get_ids(count($links_idx));

            $query = DB::insert('links', array('id', 'hash', 'url'));

            foreach ($links_idx as $hash => $value)
            {
                foreach ($value['keys'] as $keys)
                {
                    list($drop_key, $link_key) = $keys;
                    $drops[$drop_key]['links'][$link_key]['id'] = $base_id;
                }

                $query->values(array(
                    'id' => $base_id++,
                    'hash' => $hash,
                    'url' => $value['url'],
                ));
            }

            $query->execute();
        }
    }
    catch (Exception $e)
    {
        // Do not leave the lock held when a query fails
        Swiftriver_Mutex::release(__CLASS__);
        throw $e;
    }

    Swiftriver_Mutex::release(__CLASS__);
}
/**
 * Populate the droplet metadata tables.
 *
 * For the given drops this links drops to rivers (`rivers_droplets`), tags
 * (`droplets_tags`), links (`droplets_links`), media (`droplets_media`,
 * `media_thumbnails`) and places (`droplets_places`), updates the rivers'
 * `max_drop_id` and `drop_count`, sets the drops' `original_url`,
 * `droplet_image` and processing flags, and finally hands the resulting
 * tag/place trends to Model_River_Tag_Trend::create_from_array().
 *
 * @param   array  $drops  Drop array. Every drop is read for `id`; the keys
 *                         `river_id` (array of river IDs), `tags`, `links`,
 *                         `media`, `places`, `semantics_complete` and
 *                         `media_complete` are optional. `droplet_date_pub`
 *                         is read for drops that get added to a river.
 * @return  void
 */
public static function add_metadata(&$drops)
{
    // Nothing to do, and the `IN` lookups below would be built from an empty list
    if (empty($drops))
    {
        return;
    }

    // Build queries for creating entries in the meta tables droplet_tags, droplet_links and so on
    $drops_idx = array();
    $river_check_query = NULL;
    $tags_ref = array();
    $tag_values = NULL;
    $tag_check_query = NULL;
    $semantics_complete = array();
    $link_values = NULL;
    $drop_links = NULL;
    $media_values = NULL;
    $drop_images = NULL;
    $media_thumbnail_values = NULL;
    $media_complete = array();
    $places_ref = array();
    $place_values = NULL;
    $place_check_query = NULL;

    foreach ($drops as $key => $drop)
    {
        // Create an in memory drop reference (drop id => key in $drops)
        $drops_idx[$drop['id']] = $key;

        // Place drops into rivers
        if (isset($drop['river_id']))
        {
            foreach ($drop['river_id'] as $river_id)
            {
                // Subquery to find new river drops
                $subquery = DB::select(
                    array(DB::expr($drop['id']), 'droplet_id'),
                    array(DB::expr($river_id), 'river_id')
                );

                if ( ! $river_check_query)
                {
                    $river_check_query = $subquery;
                }
                else
                {
                    $river_check_query = $subquery->union($river_check_query, TRUE);
                }
            }
        }

        // Create a query to insert tags into droplets_tags
        if (isset($drop['tags']))
        {
            foreach ($drop['tags'] as $tag)
            {
                // Create an in memory tag id -> tag detail reference
                $tags_ref[$tag['id']] = array(
                    'tag_name' => $tag['tag_name'],
                    'tag_type' => $tag['tag_type'],
                );

                // Values for the drop tags insert query
                if ($tag_values)
                {
                    $tag_values .= ',';
                }
                $tag_values .= '('.$drop['id'].','.$tag['id'].')';

                // Subquery to find new tags
                $subquery = DB::select(
                    array(DB::expr($drop['id']), 'droplet_id'),
                    array(DB::expr($tag['id']), 'tag_id')
                );

                if ( ! $tag_check_query)
                {
                    $tag_check_query = $subquery;
                }
                else
                {
                    $tag_check_query = $subquery->union($tag_check_query, TRUE);
                }
            }
        }

        // Find drops that have complete semantic processing
        if (isset($drop['semantics_complete']))
        {
            $semantics_complete[] = $drop['id'];
        }

        if (isset($drop['links']))
        {
            foreach ($drop['links'] as $link)
            {
                if ($link_values)
                {
                    $link_values .= ',';
                }
                $link_values .= '('.$drop['id'].','.$link['id'].')';

                // Store drop original link for updating the droplets table
                if (isset($link['id']) and isset($link['original_url']) and $link['original_url'])
                {
                    if ($drop_links)
                    {
                        $drop_links .= ' union all ';
                    }
                    $drop_links .= 'select '.$link['id'].' drop_link, '.$drop['id'].' id';
                }
            }
        }

        if (isset($drop['media']))
        {
            foreach ($drop['media'] as $media)
            {
                if ($media_values)
                {
                    $media_values .= ',';
                }
                $media_values .= '('.$drop['id'].','.$media['id'].')';

                // The flag is optional; a missing key means "not the drop image"
                if ( ! empty($media['droplet_image']))
                {
                    if ($drop_images)
                    {
                        $drop_images .= ' union all ';
                    }
                    $drop_images .= 'select '.$media['id'].' droplet_image, '.$drop['id'].' id';
                }

                if (isset($media['thumbnails']))
                {
                    foreach ($media['thumbnails'] as $thumbnail)
                    {
                        if ($media_thumbnail_values)
                        {
                            $media_thumbnail_values .= ',';
                        }

                        // The URL goes into raw SQL: let the database driver
                        // escape and quote it rather than relying on addslashes()
                        $media_thumbnail_values .= '('.$media['id'].','.$thumbnail['size'].','
                            .Database::instance()->escape($thumbnail['url']).')';
                    }
                }
            }
        }

        // Find drops that have completed media processing
        if (isset($drop['media_complete']))
        {
            $media_complete[] = $drop['id'];
        }

        if (isset($drop['places']))
        {
            foreach ($drop['places'] as $place)
            {
                // Create an in memory place id -> place detail reference
                $places_ref[$place['id']] = array('place_name' => $place['place_name']);

                if ($place_values)
                {
                    $place_values .= ',';
                }
                $place_values .= '('.$drop['id'].','.$place['id'].')';

                // Subquery to find new places
                $subquery = DB::select(
                    array(DB::expr($drop['id']), 'droplet_id'),
                    array(DB::expr($place['id']), 'place_id')
                );

                if ( ! $place_check_query)
                {
                    $place_check_query = $subquery;
                }
                else
                {
                    $place_check_query = $subquery->union($place_check_query, TRUE);
                }
            }
        }
    }

    $drop_ids = array_keys($drops_idx);

    // Find river drops already in the DB. This runs before any insert below,
    // so it only contains the associations that existed beforehand.
    $existing_river_drops = DB::select('droplet_id', 'river_id')
        ->from('rivers_droplets')
        ->where('droplet_id', 'IN', $drop_ids)
        ->execute()
        ->as_array();

    // Find the new river drops we are just about to add
    $new_river_drops = NULL;
    $max_river_drop_ids = array();
    if ($river_check_query)
    {
        Swiftriver_Mutex::obtain('rivers_droplets', 3600);

        try
        {
            $sub = DB::select('droplet_id', 'river_id')
                ->from('rivers_droplets')
                ->where('droplet_id', 'IN', $drop_ids);

            $new_river_drops = DB::select('droplet_id', 'river_id')
                ->distinct(TRUE)
                ->from(array($river_check_query, 'a'))
                ->where(DB::expr('(`droplet_id`, `river_id`)'), 'NOT IN', $sub)
                ->execute()
                ->as_array();

            if ( ! empty($new_river_drops))
            {
                $base_id = Model_Droplet::get_ids(count($new_river_drops), 'rivers_droplets');

                // Insert into the rivers_droplets table
                $query = DB::insert('rivers_droplets', array('id', 'river_id', 'droplet_id', 'droplet_date_pub'));
                foreach ($new_river_drops as $new_river_drop)
                {
                    $drops_key = $drops_idx[$new_river_drop['droplet_id']];
                    $date_pub = $drops[$drops_key]['droplet_date_pub'];
                    $river_id = $new_river_drop['river_id'];
                    $id = $base_id++;

                    $query->values(array(
                        'id' => $id,
                        'river_id' => $river_id,
                        'droplet_id' => $new_river_drop['droplet_id'],
                        'droplet_date_pub' => $date_pub,
                    ));

                    // Track the highest new id and the number of new drops per river
                    if ( ! isset($max_river_drop_ids[$river_id]))
                    {
                        $max_river_drop_ids[$river_id] = array('max_id' => $id, 'count' => 1);
                    }
                    else
                    {
                        $max_river_drop_ids[$river_id]['count'] += 1;
                        if ($id > $max_river_drop_ids[$river_id]['max_id'])
                        {
                            $max_river_drop_ids[$river_id]['max_id'] = $id;
                        }
                    }
                }
                $query->execute();

                $max_river_drops_query = NULL;
                foreach ($max_river_drop_ids as $rid => $stats)
                {
                    if ($max_river_drops_query)
                    {
                        $max_river_drops_query .= ' union all ';
                    }
                    $max_river_drops_query .= 'select '.$rid.' id, '.$stats['max_id'].' max_id, '.$stats['count'].' cnt';
                }

                // Update river max_drop_id
                $update_rivers_sql = "UPDATE `rivers` JOIN (".$max_river_drops_query.") a "
                    ."USING (`id`) SET `rivers`.`max_drop_id` = `a`.`max_id` "
                    ."WHERE `rivers`.`max_drop_id` < `a`.`max_id`";
                DB::query(Database::UPDATE, $update_rivers_sql)->execute();

                // Update drop count
                $update_rivers_sql = "UPDATE `rivers` JOIN (".$max_river_drops_query.") a "
                    ."USING (`id`) SET `rivers`.`drop_count` = `rivers`.`drop_count` + `a`.`cnt` ";
                DB::query(Database::UPDATE, $update_rivers_sql)->execute();
            }
        }
        catch (Exception $e)
        {
            // Do not leave the lock held when a query fails
            Swiftriver_Mutex::release('rivers_droplets');
            throw $e;
        }

        Swiftriver_Mutex::release('rivers_droplets');
    }

    // Find the new drop tags (must run before the insert below)
    $new_drop_tags = array();
    if ($tag_check_query)
    {
        $sub = DB::select('droplet_id', 'tag_id')
            ->from('droplets_tags')
            ->where('droplet_id', 'IN', $drop_ids);

        $new_tags = DB::select('droplet_id', 'tag_id')
            ->from(array($tag_check_query, 'a'))
            ->where(DB::expr('(`droplet_id`, `tag_id`)'), 'NOT IN', $sub)
            ->execute()
            ->as_array();

        foreach ($new_tags as $new_tag)
        {
            if ( ! isset($new_drop_tags[$new_tag['droplet_id']]))
            {
                $new_drop_tags[$new_tag['droplet_id']] = array();
            }
            $new_drop_tags[$new_tag['droplet_id']][] = array(
                'id' => $new_tag['tag_id'],
                'tag' => $tags_ref[$new_tag['tag_id']]['tag_name'],
                'tag_type' => $tags_ref[$new_tag['tag_id']]['tag_type'],
            );
        }
    }

    // Update droplets tags
    $all_drop_tags = array();
    if ($tag_values)
    {
        $insert_tags_sql = "INSERT IGNORE INTO `droplets_tags` (`droplet_id`, `tag_id`) "
            ."VALUES ".$tag_values;
        DB::query(Database::INSERT, $insert_tags_sql)->execute();

        // Get all tags(new + existing) for drops that have just been added
        // to a river
        if ($new_river_drops)
        {
            $drop_tags = DB::select('droplet_id', 'tags.id', 'tag', 'tag_type')
                ->from('droplets_tags')
                ->join('tags', 'INNER')
                ->on('droplets_tags.tag_id', '=', 'tags.id')
                ->where('droplet_id', 'IN', $drop_ids)
                ->execute()
                ->as_array();

            foreach ($drop_tags as $drop_tag)
            {
                if ( ! isset($all_drop_tags[$drop_tag['droplet_id']]))
                {
                    $all_drop_tags[$drop_tag['droplet_id']] = array();
                }
                $all_drop_tags[$drop_tag['droplet_id']][] = array(
                    'id' => $drop_tag['id'],
                    'tag' => $drop_tag['tag'],
                    'tag_type' => $drop_tag['tag_type'],
                );
            }
        }
    }

    // Update drops that completed semantic processing
    if ( ! empty($semantics_complete))
    {
        DB::update('droplets')
            ->set(array('processing_status' => DB::expr('processing_status | '.self::PROCESSING_FLAG_SEMANTICS)))
            ->where("id", "IN", $semantics_complete)
            ->execute();
    }

    // Update droplet links
    if ($link_values)
    {
        $insert_links_sql = "INSERT IGNORE INTO `droplets_links` (`droplet_id`, `link_id`) "
            ."VALUES ".$link_values;
        DB::query(Database::INSERT, $insert_links_sql)->execute();
    }

    // Set the drop's original url
    if ($drop_links)
    {
        $update_orig_url_sql = "UPDATE `droplets` JOIN (".$drop_links.") a "
            ."USING (`id`) SET `droplets`.`original_url` = `a`.`drop_link`";
        DB::query(Database::UPDATE, $update_orig_url_sql)->execute();
    }

    // Update droplet media
    if ($media_values)
    {
        $insert_media_sql = "INSERT IGNORE INTO `droplets_media` (`droplet_id`, `media_id`) "
            ."VALUES ".$media_values;
        DB::query(Database::INSERT, $insert_media_sql)->execute();
    }

    // Insert thumbnails
    if ($media_thumbnail_values)
    {
        $insert_thumbnail_sql = "INSERT IGNORE INTO `media_thumbnails` (`media_id`, `size`, `url`) "
            ."VALUES ".$media_thumbnail_values;
        DB::query(Database::INSERT, $insert_thumbnail_sql)->execute();
    }

    // Set drop image
    if ($drop_images)
    {
        $update_images_sql = "UPDATE `droplets` JOIN (".$drop_images.") a "
            ."USING (`id`) SET `droplets`.`droplet_image` = `a`.`droplet_image`";
        DB::query(Database::UPDATE, $update_images_sql)->execute();
    }

    // Update drops that completed media processing
    if ( ! empty($media_complete))
    {
        DB::update('droplets')
            ->set(array('processing_status' => DB::expr('processing_status | '.self::PROCESSING_FLAG_MEDIA)))
            ->where("id", "IN", $media_complete)
            ->execute();
    }

    // Find the new drop places (must run before the insert below)
    $new_drop_places = array();
    if ($place_check_query)
    {
        $sub = DB::select('droplet_id', 'place_id')
            ->from('droplets_places')
            ->where('droplet_id', 'IN', $drop_ids);

        $new_places = DB::select('droplet_id', 'place_id')
            ->from(array($place_check_query, 'a'))
            ->where(DB::expr('(`droplet_id`, `place_id`)'), 'NOT IN', $sub)
            ->execute()
            ->as_array();

        foreach ($new_places as $new_place)
        {
            if ( ! isset($new_drop_places[$new_place['droplet_id']]))
            {
                $new_drop_places[$new_place['droplet_id']] = array();
            }
            // Places are reported to the trends code as tags of type 'place'
            $new_drop_places[$new_place['droplet_id']][] = array(
                'id' => $new_place['place_id'],
                'tag' => $places_ref[$new_place['place_id']]['place_name'],
                'tag_type' => 'place',
            );
        }
    }

    // Update droplet places
    $all_drop_places = array();
    if ($place_values)
    {
        $insert_places_sql = "INSERT IGNORE INTO `droplets_places` (`droplet_id`, `place_id`) "
            ."VALUES ".$place_values;
        DB::query(Database::INSERT, $insert_places_sql)->execute();

        // Get all places(new + existing) for drops that have just been added
        // to a river
        if ($new_river_drops)
        {
            $drop_places = DB::select('droplet_id', 'places.id', 'place_name')
                ->from('droplets_places')
                ->join('places', 'INNER')
                ->on('droplets_places.place_id', '=', 'places.id')
                ->where('droplet_id', 'IN', $drop_ids)
                ->execute()
                ->as_array();

            foreach ($drop_places as $drop_place)
            {
                if ( ! isset($all_drop_places[$drop_place['droplet_id']]))
                {
                    $all_drop_places[$drop_place['droplet_id']] = array();
                }
                $all_drop_places[$drop_place['droplet_id']][] = array(
                    'id' => $drop_place['id'],
                    'tag' => $drop_place['place_name'],
                    'tag_type' => 'place',
                );
            }
        }
    }

    // Update trends
    $trends = array();

    // Update for drops already in the DB but we are adding new tags
    if ($existing_river_drops)
    {
        $trends = array_merge($trends, self::get_tag_trends($existing_river_drops, $new_drop_tags, $drops, $drops_idx));
        $trends = array_merge($trends, self::get_tag_trends($existing_river_drops, $new_drop_places, $drops, $drops_idx));
    }

    // Update for drops that already exist in the DB with tags but we are adding
    // the drop into a new river
    if ($new_river_drops)
    {
        $trends = array_merge($trends, self::get_tag_trends($new_river_drops, $all_drop_tags, $drops, $drops_idx));
        $trends = array_merge($trends, self::get_tag_trends($new_river_drops, $all_drop_places, $drops, $drops_idx));
    }

    if ( ! empty($trends))
    {
        Model_River_Tag_Trend::create_from_array($trends);
    }
}
/**
 * Forks off the callback into a separate process.
 *
 * The calling process returns from this method right after the first fork.
 * The first child forks again: the "second parent" obtains this class's
 * mutex, signals the second child to run the callback and holds the mutex
 * until a child process terminates, preventing other instances of this
 * class from running in the meantime. The double fork is done to allow the
 * callback to obtain another mutex if need be.
 *
 * Requires the CLI SAPI and the pcntl and posix extensions; when any of
 * them is missing an error is logged and nothing is forked.
 *
 * @param   callable  $callback  Installed as the SIGUSR1 handler in the second child
 * @return  void
 */
private function do_fork($callback)
{
    // The signals used below require cli mode
    if (php_sapi_name() != 'cli')
    {
        Kohana::$log->add(Log::ERROR, "CLI mode is required");
        return;
    }

    // Fork process to do the crawl if pcntl is installed
    if ( ! function_exists('pcntl_fork'))
    {
        Kohana::$log->add(Log::ERROR, "PCNTL is required");
        return;
    }

    // posix_kill() is used to signal the child; check for it up front rather
    // than failing with a fatal error after both forks have happened
    if ( ! function_exists('posix_kill'))
    {
        Kohana::$log->add(Log::ERROR, "POSIX is required");
        return;
    }

    $pid = pcntl_fork();

    if ($pid == -1)
    {
        Kohana::$log->add(Log::ERROR, "Forking failed.");
    }
    elseif ($pid == 0)
    {
        // Fork again
        // This second parent will hold the crawl mutex
        // so that child processes can obtain other locks

        // Install signal handlers before forking so the second child inherits them
        declare (ticks=1); // How often to check for signals

        // Run callable when OK received from parent
        pcntl_signal(SIGUSR1, $callback);

        // Exit when NACK received from parent.
        pcntl_signal(SIGUSR2, function ($signo) {
            exit;
        });

        $pid = pcntl_fork();

        // Force reconnection. Both parent and child
        // processes will open their own connection
        // once they start.
        Database::instance()->disconnect();

        if ($pid == -1)
        {
            Kohana::$log->add(Log::ERROR, "Second fork failed.");
        }
        elseif ($pid == 0)
        {
            // Second child
            // Wait for signal from parent to proceed; the work itself
            // happens in the signal handlers installed above
            while (TRUE)
            {
                sleep(60);
            }
        }
        else
        {
            // Second parent
            try
            {
                // __CLASS__ yields the same name the argument-less get_class() did
                Swiftriver_Mutex::obtain(__CLASS__);

                // Flush the log, then signal child to proceed
                Kohana::$log->write();
                posix_kill($pid, SIGUSR1);
            }
            catch (SwiftRiver_Exception_Mutex $e)
            {
                // Signal child to exit
                Kohana::$log->add(Log::ERROR, "Unable to obtain mutex");
                posix_kill($pid, SIGUSR2);
                exit;
            }

            // Hold the mutex until a child process has terminated
            pcntl_wait($status);

            Swiftriver_Mutex::release(__CLASS__);
        }
    }
}