/**
 * Parse a recipe page with the schema.org microdata parser, then rebuild the
 * ingredient list from the children of <div id="ingredients">.
 *
 * @param string $html Raw HTML of the recipe page.
 * @param string $url  URL the page was fetched from (forwarded to the base parser).
 * @return object Recipe returned by RecipeParser_Parser_MicrodataSchema::parse(),
 *                with its ingredients replaced.
 */
public static function parse($html, $url) {
    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    // Keep libxml quiet about mismatched tags in real-world markup.
    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // OVERRIDES for epicurious

    // Ingredients: <strong> elements name a section, <ul> elements hold its items.
    $recipe->resetIngredients();
    foreach ($xpath->query('//div[@id = "ingredients"]/*') as $element) {
        switch ($element->nodeName) {
            case 'strong':
                $recipe->addIngredientsSection(
                    RecipeParser_Text::formatSectionName($element->nodeValue)
                );
                break;

            case 'ul':
                // Only <li> children carry ingredient text.
                foreach ($element->childNodes as $item) {
                    if ($item->nodeName != 'li') {
                        continue;
                    }
                    $recipe->appendIngredient(trim($item->nodeValue));
                }
                break;
        }
    }

    return $recipe;
}
/**
 * Parse a MyRecipes page: start from the data-vocabulary microdata parser,
 * then override the photo URL, ingredients and credits from the page markup.
 *
 * @param string $html Raw HTML of the recipe page.
 * @param string $url  URL the page was fetched from (forwarded to the base parser).
 * @return object Recipe returned by RecipeParser_Parser_MicrodataDataVocabulary::parse(),
 *                with the overrides applied.
 */
public static function parse($html, $url) {
    $recipe = RecipeParser_Parser_MicrodataDataVocabulary::parse($html, $url);

    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Photo URL, use larger version found on MyRecipes
    $recipe->photo_url = str_replace('-l.jpg', '-x.jpg', $recipe->photo_url);

    // Ingredients
    $recipe->resetIngredients();
    $nodes = $xpath->query('//div[@class="recipeDetails"]/ul');
    // item(0) is null when the list is absent; only walk it when the query matched.
    if ($nodes->length) {
        foreach ($nodes->item(0)->childNodes as $li) {
            if ($li->nodeName != 'li') {
                continue;
            }
            $text = RecipeParser_Text::formatAsOneLine($li->nodeValue);
            if ($li->getAttribute('itemprop') == 'ingredient') {
                // Strip the savings widget text that is embedded in ingredient items.
                $text = trim(str_replace('$Click to see savings', '', $text));
                $recipe->appendIngredient($text);
            } else {
                // Items without the itemprop are treated as section headings.
                $text = RecipeParser_Text::formatSectionName($text);
                $recipe->addIngredientsSection($text);
            }
        }
    }

    // Credits
    $nodes = $xpath->query('//*[@itemprop="author"]');
    if ($nodes->length) {
        $recipe->credits = trim($nodes->item(0)->nodeValue);
    }

    return $recipe;
}
/**
 * Parse a recipe page with the schema.org microdata parser, then override the
 * ingredients, notes and photo URL from page-specific markup.
 *
 * @param string $html Raw HTML of the recipe page.
 * @param string $url  URL the page was fetched from; also passed to formatPhotoUrl().
 * @return object Recipe returned by RecipeParser_Parser_MicrodataSchema::parse(),
 *                with the overrides applied.
 */
public static function parse($html, $url) {
    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Ingredients
    $recipe->resetIngredients();
    foreach ($xpath->query('//div[@id = "recipe-ingredients"]/*') as $element) {
        if ($element->nodeName == 'ul') {
            // Regular case: the ingredients of the current section.
            foreach ($xpath->query('./li[@class = "ingredient"]', $element) as $item) {
                $recipe->appendIngredient(trim($item->nodeValue));
            }
            continue;
        }
        if ($element->nodeName != 'p') {
            continue;
        }

        $text = trim($element->nodeValue);
        $rows = explode("\n", $text);

        if (count($rows) <= 1) {
            // A one-line <p> is a plain section heading; its ingredients
            // follow in later <ul> elements.
            $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($text));
            continue;
        }

        // Older recipes jumble everything into one multi-line <p>: the first
        // line is the section heading, the remaining lines are its ingredients.
        $heading = trim(array_shift($rows));
        $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($heading));
        foreach ($rows as $row) {
            $recipe->appendIngredient(trim($row));
        }
    }

    // Notes
    $noteNodes = $xpath->query('//*[@id="recipe-intronote"]');
    if ($noteNodes->length) {
        $recipe->notes = RecipeParser_Text::formatAsParagraphs($noteNodes->item(0)->nodeValue);
    }

    // Photo URL to replace og:image
    $imageNodes = $xpath->query('//img[@itemprop="image"]');
    if ($imageNodes->length) {
        $src = $imageNodes->item(0)->getAttribute("src");
        $recipe->photo_url = RecipeParser_Text::formatPhotoUrl($src, $url);
    }

    return $recipe;
}
/**
 * Parse a recipe page with the microformat parser and, when the page has a
 * <dl id="stages"> block, rebuild the ingredients as one section per stage.
 *
 * @param string $html Raw HTML of the recipe page.
 * @param string $url  URL the page was fetched from (forwarded to the base parser).
 * @return object Recipe returned by RecipeParser_Parser_Microformat::parse(),
 *                possibly with its ingredients replaced.
 */
public static function parse($html, $url) {
    // Start from everything the generic hrecipe parser can find.
    $recipe = RecipeParser_Parser_Microformat::parse($html, $url);

    // Keep libxml quiet about mismatched tags.
    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Multi-stage ingredients: leave the generic result alone unless stages exist.
    $stageNodes = $xpath->query('//dl[@id="stages"]/*');
    if (!$stageNodes->length) {
        return $recipe;
    }

    $recipe->resetIngredients();
    foreach ($stageNodes as $stageNode) {
        switch ($stageNode->nodeName) {
            case 'dt':
                // <dt> holds the stage name.
                $recipe->addIngredientsSection(
                    RecipeParser_Text::formatSectionName($stageNode->nodeValue)
                );
                break;

            case 'dd':
                // <dd> holds the ingredients of that stage.
                foreach ($xpath->query('.//*[@class="ingredient"]', $stageNode) as $item) {
                    $recipe->appendIngredient(trim($item->nodeValue));
                }
                break;
        }
    }

    return $recipe;
}
/**
 * Parse a recipe page with the schema.org microdata parser, then rebuild the
 * ingredients from the "recipe-ingredients" container, trying two known layouts.
 *
 * @param string $html Raw HTML of the recipe page.
 * @param string $url  URL the page was fetched from (forwarded to the base parser).
 * @return object Recipe returned by RecipeParser_Parser_MicrodataSchema::parse(),
 *                with its ingredients replaced.
 */
public static function parse($html, $url) {
    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    // Keep libxml quiet about mismatched tags.
    libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Ingredients
    $recipe->resetIngredients();

    // First layout; fall back to the second when it yields nothing.
    $elements = $xpath->query('//*[@id="recipe-ingredients"]//div[@class="view-content"]/*');
    if (!$elements || !$elements->length) {
        $elements = $xpath->query('//*[@id="recipe-ingredients"]//div[@class="ingredient-lists separator-serated tab-content"]/*');
    }

    foreach ($elements as $element) {
        if ($element->nodeName == 'h3') {
            // <h3> names a section.
            $recipe->addIngredientsSection(
                RecipeParser_Text::formatSectionName($element->nodeValue)
            );
            continue;
        }
        if ($element->nodeName != 'ul') {
            continue;
        }
        // Every child node of the list is passed on as an ingredient line.
        foreach ($element->childNodes as $child) {
            $recipe->appendIngredient(RecipeParser_Text::formatAsOneLine($child->nodeValue));
        }
    }

    return $recipe;
}
/**
 * Parse a recipe page with the microformat parser, then override the yield,
 * prep/cook/total times and ingredients from class-based markup.
 *
 * @param string $html Raw HTML of the recipe page.
 * @param string $url  URL the page was fetched from (forwarded to the base parser).
 * @return object Recipe returned by RecipeParser_Parser_Microformat::parse(),
 *                with the overrides applied.
 */
public static function parse($html, $url) {
    $recipe = RecipeParser_Parser_Microformat::parse($html, $url);

    // Turn off libxml errors to prevent mismatched tag warnings.
    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Yield
    $nodes = $xpath->query('//*[contains(concat(" ", normalize-space(@class), " "), " yield ")]');
    if ($nodes->length) {
        $line = $nodes->item(0)->nodeValue;
        $recipe->yield = RecipeParser_Text::formatYield($line);
    }

    // Times: the ISO 8601 duration is read from the "title" attribute of the
    // SECOND <span> under each time container.
    $time_classes = array(
        'prepTime'        => 'prep',
        'rspec-cook-time' => 'cook',
        'totaltime'       => 'total',
    );
    foreach ($time_classes as $class => $time_key) {
        $nodes = $xpath->query('//*[contains(concat(" ", normalize-space(@class), " "), " ' . $class . ' ")]/span');
        // Require two spans: item(1) is null otherwise and calling
        // getAttribute() on it would be a fatal error.
        if ($nodes->length > 1) {
            $line = $nodes->item(1)->getAttribute("title");
            $recipe->time[$time_key] = RecipeParser_Text::iso8601ToMinutes($line);
        }
    }

    // Ingredients
    $recipe->resetIngredients();
    $ing_nodes = $xpath->query('//*[contains(concat(" ", normalize-space(@class), " "), " ingredients ")]/*');
    foreach ($ing_nodes as $ing_node) {
        // Dividers carry the section name.
        if ($ing_node->getAttribute('class') == "ingr-divider") {
            $line = RecipeParser_Text::formatSectionName($ing_node->nodeValue);
            $recipe->addIngredientsSection($line);
            continue;
        }

        // Extract ingredients from inside of <ul class="ingredientsList">.
        // Child nodes should all be <li>.
        if ($ing_node->nodeName == 'ul') {
            foreach ($ing_node->childNodes as $node) {
                $line = trim($node->nodeValue);
                $recipe->appendIngredient($line);
            }
        }
    }

    return $recipe;
}
/**
 * Parse a Cooks.com recipe page from scratch: title from <title>, and
 * ingredients, section titles and instructions from the hrecipe table.
 *
 * @param string $html Raw HTML of the recipe page.
 * @param string $url  URL the page was fetched from (not used by this parser).
 * @return RecipeParser_Recipe Newly built recipe.
 */
public static function parse($html, $url) {
    $recipe = new RecipeParser_Recipe();

    // Keep libxml quiet about mismatched tags.
    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Title: the <title> text with the site branding removed from either end.
    $titleNodes = $doc->getElementsByTagName('title');
    if ($titleNodes->length) {
        $title = $titleNodes->item(0)->nodeValue;
        $title = trim(str_replace("Cooks.com - Recipe - ", "", $title));
        $title = trim(str_replace(" - Recipe - Cooks.com", "", $title));
        $recipe->title = $title;
    }

    // These <div>s hold all ingredients, section titles, and instructions.
    foreach ($xpath->query('//table[@class="hrecipe"]//td/div') as $block) {
        // Ingredient blocks are recognised by their inline black text style.
        if (stripos($block->getAttribute("style"), "color: BLACK;") !== false) {
            foreach ($xpath->query('./span[@class = "ingredient"]', $block) as $span) {
                $recipe->appendIngredient($span->nodeValue);
            }
            continue;
        }

        $class = $block->getAttribute('class');

        // Instructions: one instruction per child node.
        if ($class == "instructions") {
            foreach ($block->childNodes as $part) {
                $recipe->appendInstruction(RecipeParser_Text::formatAsOneLine($part->nodeValue));
            }
            continue;
        }

        // Section title: always opens an ingredients section, and also an
        // instructions section once some instructions have been collected.
        if ($class == "section") {
            $heading = RecipeParser_Text::formatSectionName($block->nodeValue);
            $recipe->addIngredientsSection($heading);
            if (count($recipe->instructions) > 0) {
                $recipe->addInstructionsSection($heading);
            }
        }
    }

    return $recipe;
}
/**
 * Parse a recipe page with the microformat parser, then override the yield,
 * ingredients and instructions from page-specific markup.
 *
 * @param string $html Raw HTML of the recipe page.
 * @param string $url  URL the page was fetched from (forwarded to the base parser).
 * @return object Recipe returned by RecipeParser_Parser_Microformat::parse(),
 *                with the overrides applied.
 */
public static function parse($html, $url) {
    // Start from everything the generic hrecipe parser can find.
    $recipe = RecipeParser_Parser_Microformat::parse($html, $url);

    // Keep libxml quiet about mismatched tags.
    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Yield: the "value" attribute of the element named "resizeTo", as servings.
    $resizeNodes = $xpath->query('//*[@name="resizeTo"]');
    if ($resizeNodes->length) {
        $servings = trim($resizeNodes->item(0)->getAttribute("value"));
        $recipe->yield = RecipeParser_Text::formatYield($servings . " servings");
    }

    // Ingredients: join the text of each child node with single spaces.
    $recipe->resetIngredients();
    foreach ($xpath->query('//*[contains(concat(" ", normalize-space(@class), " "), " ingredient ")]') as $ingredient) {
        $pieces = array();
        foreach ($ingredient->childNodes as $piece) {
            $pieces[] = $piece->nodeValue;
        }
        // Joining leaves a stray space before semicolons; tidy it up.
        $text = str_replace(" ; ", "; ", implode(' ', $pieces));
        $recipe->appendIngredient(RecipeParser_Text::formatAsOneLine($text));
    }

    // Instructions: paragraphs equal to their upper-cased form are section headings.
    $recipe->resetInstructions();
    foreach ($xpath->query('//div[@class="display-field"]/p') as $paragraph) {
        $text = trim($paragraph->nodeValue);
        if ($text == strtoupper($text)) {
            $recipe->addInstructionsSection(RecipeParser_Text::formatSectionName($text));
            continue;
        }
        $recipe->appendInstruction($text);
    }

    return $recipe;
}
/**
 * Parse a recipe page with the schema.org microdata parser, then rebuild the
 * ingredients from the grouped lists under #ingredients and pick up the notes.
 *
 * @param string $html Raw HTML of the recipe page.
 * @param string $url  URL the page was fetched from (forwarded to the base parser).
 * @return object Recipe returned by RecipeParser_Parser_MicrodataSchema::parse(),
 *                with the overrides applied.
 */
public static function parse($html, $url) {
    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Ingredients: each "group" has an optional <h3> heading and <li> items.
    $recipe->resetIngredients();
    foreach ($xpath->query('//*[@id="ingredients"]//*[@class="group"]') as $group) {
        $headings = $xpath->query('.//h3', $group);
        if ($headings->length) {
            $heading = RecipeParser_Text::formatSectionName($headings->item(0)->nodeValue);
            // Skip headings that format down to nothing.
            if (!empty($heading)) {
                $recipe->addIngredientsSection($heading);
            }
        }

        foreach ($xpath->query('.//li', $group) as $item) {
            $recipe->appendIngredient(RecipeParser_Text::formatAsOneLine($item->nodeValue));
        }
    }

    // Notes: first matching element, with the "Cook's Note" label removed.
    $noteNodes = $xpath->query('.//*[@class = "body-c note-text"]');
    if ($noteNodes->length) {
        $note = str_replace("Cook's Note", '', $noteNodes->item(0)->nodeValue);
        $recipe->notes = trim($note);
    }

    return $recipe;
}
/**
 * Parse a page from The Daily Meal: start from the data-vocabulary microdata
 * parser, then add ingredient lines from a plain list and, when no yield was
 * found, read the servings from the Edamam data table.
 *
 * @param string $html Raw HTML of the recipe page.
 * @param string $url  URL the page was fetched from (forwarded to the base parser).
 * @return object Recipe returned by RecipeParser_Parser_MicrodataDataVocabulary::parse(),
 *                with the additions applied.
 */
public static function parse($html, $url) {
    $recipe = RecipeParser_Parser_MicrodataDataVocabulary::parse($html, $url);

    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Some ingredient lines on The Daily Meal do not adhere to the usual
    // microdata formatting, so a regular list inside the higher-level
    // ingredients div is scanned as well.
    // NOTE(review): this is described as a fallback, yet it runs when
    // $recipe->ingredients is NOT empty -- confirm against the structure of
    // RecipeParser_Recipe::$ingredients before changing the condition.
    if (!empty($recipe->ingredients)) {
        foreach ($xpath->query("//div[@class='content']/div[@class='ingredient']/ul/li") as $item) {
            $text = RecipeParser_Text::formatAsOneLine($item->nodeValue);
            if (empty($text)) {
                continue;
            }
            if (!RecipeParser_Text::matchSectionName($text)) {
                $recipe->appendIngredient($text);
                continue;
            }
            $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($text));
        }
    }

    // The Daily Meal provides servings details via Edamam's plugin.
    if (!$recipe->yield) {
        $cells = $xpath->query("//table[@class='edamam-data']/tr[2]/td[2]");
        if ($cells->length) {
            $recipe->yield = RecipeParser_Text::formatYield($cells->item(0)->nodeValue);
        }
    }

    return $recipe;
}
/**
 * Parse a recipe page with the microformat parser, then rebuild the
 * description, ingredients, instructions and yield from <div id="recipe">.
 *
 * Layout relied on: italic paragraphs form the description, the <blockquote>
 * holds the ingredients, and the <p> elements after it are the instructions.
 *
 * @param string $html Raw HTML of the recipe page.
 * @param string $url  URL the page was fetched from (forwarded to the base parser).
 * @return object Recipe returned by RecipeParser_Parser_Microformat::parse(),
 *                with the overrides applied.
 */
public static function parse($html, $url) {
    $recipe = RecipeParser_Parser_Microformat::parse($html, $url);

    // Keep libxml quiet about mismatched tags.
    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Description: italic paragraphs, minus "Adapted from" attributions,
    // separated by blank lines.
    $paragraphs = array();
    foreach ($xpath->query('//div[@id="recipe"]/p/i') as $italic) {
        $text = trim($italic->nodeValue);
        if (strpos($text, "Adapted from") === false) {
            $paragraphs[] = $text;
        }
    }
    $recipe->description = trim(implode("\n\n", $paragraphs));

    // Ingredients: first gather candidate lines from the blockquote paragraphs.
    $recipe->resetIngredients();
    $candidates = array();
    foreach ($xpath->query('//div[@id="recipe"]/blockquote/p') as $paragraph) {
        foreach ($paragraph->childNodes as $part) {
            $text = trim($part->nodeValue);
            $tag = $part->nodeName;

            if ($tag == "strong" || $tag == "b") {
                // Bold text is a heading; make sure it ends up with a colon
                // so it is recognised as a section name below.
                if (strpos($text, ":") === false) {
                    $text .= ":";
                }
                $candidates[] = $text;
            } elseif ($tag == "#text" || $tag == "div" || $tag == "p") {
                $candidates[] = $text;
            }
        }
    }

    // Then classify each line as a section heading or an ingredient.
    foreach ($candidates as $candidate) {
        if (RecipeParser_Text::matchSectionName($candidate)) {
            $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($candidate));
            continue;
        }
        $recipe->appendIngredient(RecipeParser_Text::formatAsOneLine($candidate));
    }

    // Instructions: the <p> elements that come after the ingredients blockquote.
    $recipe->resetInstructions();
    $seenBlockquote = false;
    foreach ($xpath->query('//div[@id="recipe"]/*') as $element) {
        if ($element->nodeName == "blockquote") {
            $seenBlockquote = true;
            continue;
        }
        if ($element->nodeName != "p" || !$seenBlockquote) {
            continue;
        }

        $text = trim($element->nodeValue);

        // Stop at the "Adapted" notes or at any <p> carrying a class attribute.
        if (stripos($text, "Adapted from") !== false || $element->getAttribute("class")) {
            break;
        }

        // A paragraph starting with "Serves " is the yield, not an instruction.
        if (stripos($text, "Serves ") === 0) {
            $recipe->yield = RecipeParser_Text::formatYield($text);
            continue;
        }

        $recipe->appendInstruction(RecipeParser_Text::formatAsOneLine($element->nodeValue));
    }

    return $recipe;
}
/**
 * Parse a recipe page from scratch using its itemprop attributes and
 * page-specific classes: title, time, yield, ingredients, instructions,
 * notes and photo.
 *
 * @param string $html Raw HTML of the recipe page.
 * @param string $url  URL the page was fetched from; used to absolutise the photo URL.
 * @return RecipeParser_Recipe Newly built recipe.
 */
public static function parse($html, $url) {
    $recipe = new RecipeParser_Recipe();

    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Title
    $titleNodes = $xpath->query('//h1[@itemprop="name"]');
    if ($titleNodes->length) {
        $recipe->title = trim($titleNodes->item(0)->nodeValue);
    }

    // Times and yield
    // <time datetime="PT35M" itemprop="prepTime">
    // NOTE(review): the element is marked prepTime but the value is stored as
    // the total time -- presumably intentional for this site; confirm.
    $timeNodes = $xpath->query('//time[@itemprop="prepTime"]');
    if ($timeNodes->length) {
        $timeText = $timeNodes->item(0)->textContent;
        if ($timeText) {
            $recipe->time['total'] = RecipeParser_Text::mixedTimeToMinutes($timeText);
        }
    }

    $yieldNodes = $xpath->query('//*[@itemprop="recipeYield"]');
    if ($yieldNodes->length) {
        $recipe->yield = RecipeParser_Text::formatYield($yieldNodes->item(0)->nodeValue);
    }

    // Ingredients: skip the list heading, which carries the same itemprop.
    foreach ($xpath->query('//*[@itemprop="ingredients"]') as $item) {
        $text = trim($item->nodeValue);
        if ($text == "Ingredients") {
            continue;
        }
        $recipe->appendIngredient($text);
    }

    // Instructions: a step may start a new section, as decided by splitDirections().
    foreach ($xpath->query('//span[@class = "steps-list__item__text"]') as $step) {
        $text = RecipeParser_Text::stripLeadingNumbers(trim($step->nodeValue));
        $parts = self::splitDirections($text);
        if ($parts['section']) {
            $parts['section'] = RecipeParser_Text::formatSectionName($parts['section']);
            $recipe->addInstructionsSection($parts['section']);
        }
        $recipe->appendInstruction($parts['direction']);
    }

    // Notes: all note paragraphs joined with " | ".
    $noteNodes = $xpath->query('//div[@class = "recipe-notes__content"]/div/p');
    if ($noteNodes->length) {
        $notes = array();
        foreach ($noteNodes as $noteNode) {
            $notes[] = trim($noteNode->nodeValue);
        }
        $recipe->notes = implode(' | ', $notes);
    }

    // Photo: the second carousel image, unless it is a known placeholder.
    $imageNodes = $xpath->query('//img[@class = "recipe-carousel__recipe__img"]');
    if ($imageNodes && $imageNodes->item(1)) {
        $src = $imageNodes->item(1)->getAttribute('src');
        $isDefault = strpos($src, 'default-recipe-image.gif') !== false;
        $isPlaceholder = strpos($src, 'placeholder.gif') !== false;
        if (!$isDefault && !$isPlaceholder) {
            $recipe->photo_url = RecipeParser_Text::relativeToAbsolute($src, $url);
        }
    }

    return $recipe;
}
/**
 * Checks RecipeParser_Text::formatSectionName() against a table of
 * (expected, input) pairs, asserted in order.
 */
public function test_format_section_name() {
    $cases = array(
        // Pass through as original
        array("Cake", "Cake"),
        // Title case single word, strip colon, trim whitespace.
        array("Cake", " CAKE: "),
        // Remove leading "for".
        array("Cake", "For Cake"),
        // Remove leading "for the".
        array("Cake", "For the cake"),
        // Upper-case only the first word (until we have a better way of doing this).
        array("Cake frosting", "Cake Frosting"),
    );

    foreach ($cases as $case) {
        $expected = $case[0];
        $input = $case[1];
        $this->assertEquals($expected, RecipeParser_Text::formatSectionName($input));
    }
}
/**
 * Parse a recipe page with the microformat parser, then override the title,
 * instructions, photo, times, yield and ingredients from itemprop markup.
 *
 * Every lookup is guarded: when an element is missing, the value found by the
 * base parser is kept rather than dereferencing a null node.
 *
 * @param string $html Raw HTML of the recipe page.
 * @param string $url  URL the page was fetched from; used to absolutise the photo URL.
 * @return object Recipe returned by RecipeParser_Parser_Microformat::parse(),
 *                with the overrides applied.
 */
public static function parse($html, $url) {
    // Get all of the standard hrecipe stuff we can find.
    $recipe = RecipeParser_Parser_Microformat::parse($html, $url);

    // Turn off libxml errors to prevent mismatched tag warnings.
    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    $recipe->resetIngredients();

    // Title
    $nodes = $xpath->query('.//*[@itemprop="name"]');
    if ($nodes->length) {
        $recipe->title = trim($nodes->item(0)->nodeValue);
    }

    // Instructions
    $nodes = $xpath->query('//li[@itemprop="recipeInstructions"]/*');
    foreach ($nodes as $sub) {
        $line = trim($sub->nodeValue);
        $line = RecipeParser_Text::stripLeadingNumbers($line);
        $recipe->appendInstruction($line);
    }

    // Photo
    $nodes = $xpath->query('.//*[@itemprop="image"]');
    if ($nodes->length) {
        $photo_url = $nodes->item(0)->getAttribute('src');
        $recipe->photo_url = RecipeParser_Text::relativeToAbsolute($photo_url, $url);
    }

    // Meta data: only read when the metadata wrapper is present.
    $nodes = $xpath->query('//div[@class="recipe-metadata-wrap"]/*');
    if ($nodes->length) {
        // Prep and cook times are ISO 8601 durations in the "content" attribute.
        $time_props = array('prepTime' => 'prep', 'cookTime' => 'cook');
        foreach ($time_props as $itemprop => $time_key) {
            $time_nodes = $xpath->query('.//*[@itemprop="' . $itemprop . '"]');
            if ($time_nodes->length && $time_nodes->item(0)->hasAttribute('content')) {
                $value = trim($time_nodes->item(0)->getAttribute('content'));
                $recipe->time[$time_key] = RecipeParser_Text::iso8601ToMinutes($value);
            }
        }
        $recipe->time['total'] = $recipe->time['cook'] + $recipe->time['prep'];

        $yield_nodes = $xpath->query('.//*[@itemprop="recipeYield"]');
        if ($yield_nodes->length) {
            $value = trim($yield_nodes->item(0)->nodeValue);
            $recipe->yield = RecipeParser_Text::formatYield($value);
        }
    }

    // Multi-stage ingredients: <h3> names a section, <ul> holds its items.
    $nodes = $xpath->query('//div[@class="recipe-ingredients-wrapper"]/*');
    foreach ($nodes as $node) {
        if ($node->nodeName == 'h3') {
            $value = RecipeParser_Text::formatSectionName($node->nodeValue);
            $recipe->addIngredientsSection($value);
        } elseif ($node->nodeName == 'ul') {
            $subs = $xpath->query('.//li[@itemprop="ingredients"]', $node);
            foreach ($subs as $sub) {
                $recipe->appendIngredient(trim($sub->nodeValue));
            }
        }
    }

    return $recipe;
}
/**
 * Generic schema.org/Recipe microdata parser.
 *
 * Finds the first element whose itemtype mentions schema.org/Recipe (either
 * capitalisation) and reads title, description, times, yield, ingredients,
 * instructions, photo and credits. When no such element exists, an empty
 * recipe is returned.
 *
 * @param string $html Raw HTML of the recipe page.
 * @param string $url  URL the page was fetched from; passed to formatPhotoUrl().
 * @return RecipeParser_Recipe Newly built recipe.
 */
public static function parse($html, $url) {
    $recipe = new RecipeParser_Recipe();

    libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    $roots = $xpath->query('//*[contains(@itemtype, "//schema.org/Recipe") or contains(@itemtype, "//schema.org/recipe")]');
    if (!$roots->length) {
        // No microdata on this page: nothing to parse.
        return $recipe;
    }
    $microdata = $roots->item(0);

    // Title
    $found = $xpath->query('.//*[@itemprop="name"]', $microdata);
    if ($found->length) {
        $recipe->title = RecipeParser_Text::formatTitle(trim($found->item(0)->nodeValue));
    }

    // Summary
    $found = $xpath->query('.//*[@itemprop="description"]', $microdata);
    if ($found->length) {
        $recipe->description = RecipeParser_Text::formatAsParagraphs($found->item(0)->nodeValue);
    }

    // Times: prefer the machine-readable "content" or "datetime" attribute
    // (ISO 8601), falling back to the element's visible text.
    $timeProps = array('prepTime' => 'prep', 'cookTime' => 'cook', 'totalTime' => 'total');
    foreach ($timeProps as $prop => $slot) {
        $found = $xpath->query('.//*[@itemprop="' . $prop . '"]', $microdata);
        if (!$found->length) {
            continue;
        }
        $element = $found->item(0);

        $duration = $element->getAttribute('content');
        if (!$duration) {
            $duration = $element->getAttribute('datetime');
        }
        if ($duration) {
            $minutes = RecipeParser_Text::iso8601ToMinutes($duration);
        } else {
            $minutes = RecipeParser_Times::toMinutes(trim($element->nodeValue));
        }

        if ($minutes) {
            $recipe->time[$slot] = $minutes;
        }
    }

    // Yield: try the canonical itemprop first, then the all-lowercase spelling.
    $found = $xpath->query('.//*[@itemprop="recipeYield"]', $microdata);
    if (!$found->length) {
        $found = $xpath->query('.//*[@itemprop="recipeyield"]', $microdata);
    }
    if ($found->length) {
        $element = $found->item(0);
        $yieldText = $element->hasAttribute('content')
            ? $element->getAttribute('content')
            : $element->nodeValue;
        $recipe->yield = RecipeParser_Text::formatYield($yieldText);
    }

    // Ingredients (searched document-wide, not only inside the microdata root).
    foreach ($xpath->query('//*[@itemprop="ingredients"]') as $item) {
        $text = RecipeParser_Text::formatAsOneLine($item->nodeValue);
        if (empty($text)) {
            continue;
        }
        if (strlen($text) > 150) {
            // probably a mistake, like a run-on of existing ingredients?
            continue;
        }
        if (RecipeParser_Text::matchSectionName($text)) {
            $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($text));
        } else {
            $recipe->appendIngredient($text);
        }
    }

    // Instructions: try progressively looser markup patterns, first hit wins.
    // 1. <li> tags for each instruction.
    $steps = $xpath->query('//*[@itemprop="recipeInstructions"]//li');
    if (!$steps->length) {
        // 2. Direct children of "recipeInstructions".
        $steps = $xpath->query('//*[@itemprop="recipeInstructions"]/*');
    }
    if (!$steps->length) {
        // 3. An "instruction" class on each line.
        $steps = $xpath->query('.//*[@itemprop="recipeInstructions"]//*[contains(concat(" ", normalize-space(@class), " "), " instruction ")]');
    }
    if ($steps->length) {
        RecipeParser_Text::parseInstructionsFromNodes($steps, $recipe);
    } else {
        // 4. Either several recipeInstructions nodes, or one node with a blob of text.
        $steps = $xpath->query('.//*[@itemprop="recipeInstructions"]');
        if ($steps->length > 1) {
            RecipeParser_Text::parseInstructionsFromNodes($steps, $recipe);
        } elseif ($steps->length == 1) {
            RecipeParser_Text::parseInstructionsFromBlob($steps->item(0)->nodeValue, $recipe);
        }
    }

    // Photo: Open Graph image first, then the microdata image element, then
    // an <img> nested inside it.
    $photo_url = "";
    $found = $xpath->query('//meta[@property="og:image"]');
    if ($found->length) {
        $photo_url = $found->item(0)->getAttribute('content');
    }
    if (!$photo_url) {
        $found = $xpath->query('.//*[@itemprop="image"]', $microdata);
        if ($found->length) {
            $photo_url = $found->item(0)->getAttribute('src');
        }
    }
    if (!$photo_url) {
        $found = $xpath->query('.//*[@itemprop="image"]//img', $microdata);
        if ($found->length) {
            $photo_url = $found->item(0)->getAttribute('src');
        }
    }
    if ($photo_url) {
        $recipe->photo_url = RecipeParser_Text::formatPhotoUrl($photo_url, $url);
    }

    // Credits: author, overridden by publisher when both are present.
    // NOTE(review): publisher taking precedence over author looks unusual --
    // confirm this ordering is intended.
    $credit = "";
    $found = $xpath->query('.//*[@itemprop="author"]', $microdata);
    if ($found->length) {
        $credit = $found->item(0)->nodeValue;
    }
    $found = $xpath->query('.//*[@itemprop="publisher"]', $microdata);
    if ($found->length) {
        $credit = $found->item(0)->nodeValue;
    }
    $recipe->credits = RecipeParser_Text::formatCredits($credit);

    return $recipe;
}
/**
 * Parse a recipe page with the schema.org microdata parser, then override the
 * yield, description, notes, ingredients, instructions and photo URL from
 * page-specific markup.
 *
 * @param string $html Raw HTML of the recipe page.
 * @param string $url  URL the page was fetched from; used to absolutise the photo URL.
 * @return object Recipe returned by RecipeParser_Parser_MicrodataSchema::parse(),
 *                with the overrides applied.
 */
public static function parse($html, $url) {
    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Yield
    $nodes = $xpath->query('//li[@class="credit"]');
    foreach ($nodes as $node) {
        $line = $node->nodeValue;
        if (stripos($line, "servings") !== false) {
            // Lazy ".*?" so the whole number is captured: a greedy ".*" would
            // swallow leading digits and turn "Servings: 12" into "2".
            $line = preg_replace("/servings\\:?.*?(\\d+)/i", "\$1", $line);
            $line = RecipeParser_Text::formatYield($line);
            $recipe->yield = $line;
        }
    }

    // Description
    $nodes = $xpath->query('//*[@itemprop="page-dek"]');
    if ($nodes->length) {
        $line = $nodes->item(0)->nodeValue;
        $line = RecipeParser_Text::formatAsOneLine($line);
        $recipe->description = $line;
    }

    // Notes: every note element, separated by blank lines.
    $line = "";
    $nodes = $xpath->query('//*[@class="note-text"]');
    foreach ($nodes as $node) {
        $line .= trim($node->nodeValue) . "\n\n";
    }
    $line = rtrim($line);
    $recipe->notes = $line;

    // Ingredients: each "components-group" has an optional header and its items.
    $recipe->resetIngredients();
    $sections = $xpath->query('//*[@class="components-group"]');
    foreach ($sections as $section_node) {
        $section_nodes = $xpath->query('.//*[@class="components-group-header"]', $section_node);
        if ($section_nodes->length) {
            $line = $section_nodes->item(0)->nodeValue;
            $line = RecipeParser_Text::formatSectionName($line);
            if (!empty($line)) {
                $recipe->addIngredientsSection($line);
            }
        }

        $ing_nodes = $xpath->query('.//*[@class="components-item"]', $section_node);
        foreach ($ing_nodes as $node) {
            $line = RecipeParser_Text::formatAsOneLine($node->nodeValue);
            $recipe->appendIngredient($line);
        }
    }

    // Instructions
    $recipe->resetInstructions();
    $nodes = $xpath->query('//*[@class="directions-item"]');
    foreach ($nodes as $node) {
        $line = RecipeParser_Text::formatAsOneLine($node->nodeValue);
        $recipe->appendInstruction($line);
    }

    // Photo URL: taken from the image's "data-original" attribute.
    $nodes = $xpath->query('//img[@itemprop="image"]');
    if ($nodes->length) {
        $photo_url = $nodes->item(0)->getAttribute("data-original");
        $recipe->photo_url = RecipeParser_Text::relativeToAbsolute($photo_url, $url);
    }

    return $recipe;
}
/**
 * Parse a recipe page with the schema.org microdata parser, then override
 * the prep/total time labels, the total time in minutes and the ingredients.
 *
 * @param string $html Raw HTML of the recipe page.
 * @param string $url  URL the page was fetched from (forwarded to the base parser).
 * @return object Recipe returned by RecipeParser_Parser_MicrodataSchema::parse(),
 *                with the overrides applied.
 */
public static function parse($html, $url) {
    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    // Keep libxml quiet about mismatched tags.
    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // OVERRIDES for epicurious

    // Prep Times: a summary entry mentioning ACTIVE feeds prep_time, one
    // mentioning TOTAL feeds total_time; the value is the text of its <span>.
    foreach ($xpath->query('//*[@class="summary_data"]') as $summary) {
        if (preg_match('/ACTIVE/', $summary->nodeValue)) {
            $property = 'prep_time';
        } elseif (preg_match('/TOTAL/', $summary->nodeValue)) {
            $property = 'total_time';
        } else {
            continue;
        }
        foreach ($summary->childNodes as $child) {
            if ($child->nodeName == "span") {
                $recipe->$property = RecipeParser_Text::formatAsOneLine($child->nodeValue);
            }
        }
    }

    // Total Time
    $totalNodes = $xpath->query('//*[@itemprop="totalTime"]');
    if ($totalNodes->length) {
        $duration = $totalNodes->item(0)->getAttribute("content");
        $recipe->time['total'] = RecipeParser_Text::iso8601ToMinutes($duration);
    }

    // Ingredients: <strong> elements name a section, <ul> elements hold its items.
    $recipe->resetIngredients();
    foreach ($xpath->query('//div[@id = "ingredients"]/*') as $element) {
        if ($element->nodeName == 'strong') {
            $recipe->addIngredientsSection(
                RecipeParser_Text::formatSectionName($element->nodeValue)
            );
            continue;
        }
        if ($element->nodeName != 'ul') {
            continue;
        }
        // Only <li> children of <ul class="ingredientsList"> carry ingredient text.
        foreach ($element->childNodes as $item) {
            if ($item->nodeName == 'li') {
                $recipe->appendIngredient(trim($item->nodeValue));
            }
        }
    }

    return $recipe;
}
/**
 * Parse a recipe page with the schema.org microdata parser, then override
 * the yield and rebuild the instructions from a <br>-separated block.
 *
 * @param string $html Raw HTML of the recipe page.
 * @param string $url  URL the page was fetched from (forwarded to the base parser).
 * @return object Recipe returned by RecipeParser_Parser_MicrodataSchema::parse(),
 *                with the overrides applied.
 */
public static function parse($html, $url) {
    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Yield
    foreach ($xpath->query('//*[@class="prep_box"]') as $box) {
        if (preg_match("/Number of Servings: (\\d+)/", $box->nodeValue, $m)) {
            $recipe->yield = RecipeParser_Text::formatYield($m[1]);
        }
    }

    // Instructions
    $recipe->resetInstructions();
    $containers = $xpath->query('//*[@itemprop="recipeInstructions"]');
    if (!$containers->length) {
        return $recipe;
    }

    // The instructions are one piece of HTML with <br> tags between steps.
    // nodeValue alone would lose those breaks, so the text is rebuilt by hand:
    // <br> nodes become a literal "<br>" marker and every other node (#text,
    // <a href="">, ...) contributes its trimmed nodeValue.
    $markup = "";
    foreach ($containers->item(0)->childNodes as $child) {
        if ($child->nodeName == "br") {
            $markup .= "<br>";
            continue;
        }
        $text = trim($child->nodeValue);
        if (!empty($text)) {
            $markup .= $text;
        }
    }

    foreach (explode("<br>", $markup) as $step) {
        if (empty($step)) {
            continue;
        }
        if (RecipeParser_Text::matchSectionName($step)) {
            $recipe->addInstructionsSection(RecipeParser_Text::formatSectionName($step));
            continue;
        }

        $step = RecipeParser_Text::formatAsOneLine($step);
        $step = RecipeParser_Text::stripLeadingNumbers($step);

        // Drop boilerplate lines that are not part of the method.
        if (stripos($step, "Recipe submitted by SparkPeople") === 0) {
            continue;
        }
        if (stripos($step, "Number of Servings:") === 0) {
            continue;
        }
        $recipe->appendInstruction($step);
    }

    return $recipe;
}
/**
 * Parse a page marked up with the hRecipe microformat.
 *
 * The first element whose class list contains "hrecipe" (or, failing that,
 * "hRecipe") is used as the recipe container. If neither exists an empty
 * recipe is returned.
 *
 * @param string $html Page markup, expected to be UTF-8.
 * @param string $url  Page URL, used to resolve a relative photo URL.
 * @return RecipeParser_Recipe
 */
public static function parse($html, $url)
{
    $recipe = new RecipeParser_Recipe();

    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Locate the recipe container, trying the lowercase class name first.
    $hrecipe = null;
    $container_queries = array(
        './/*[contains(concat(" ", normalize-space(@class), " "), " hrecipe ")]',
        './/*[contains(concat(" ", normalize-space(@class), " "), " hRecipe ")]',
    );
    foreach ($container_queries as $container_query) {
        $matches = $xpath->query($container_query);
        if ($matches->length) {
            $hrecipe = $matches->item(0);
            break;
        }
    }
    if (!$hrecipe) {
        return $recipe;
    }

    // Title
    $found = $xpath->query('.//*[contains(concat(" ", normalize-space(@class), " "), " fn ")]', $hrecipe);
    if ($found->length) {
        $recipe->title = RecipeParser_Text::formatTitle($found->item(0)->nodeValue);
    }

    // Summary
    $found = $xpath->query('.//*[@class="summary"]', $hrecipe);
    if ($found->length) {
        $recipe->description = RecipeParser_Text::formatAsParagraphs($found->item(0)->nodeValue);
    }

    // Credits
    $found = $xpath->query('.//*[@class="author"]', $hrecipe);
    if ($found->length) {
        $recipe->credits = RecipeParser_Text::formatCredits($found->item(0)->nodeValue);
    }

    // Photo: the "photo" element itself may carry src, otherwise look for an <img> inside it.
    $photo_url = "";
    $found = $xpath->query('.//*[contains(concat(" ", normalize-space(@class), " "), " photo ")]', $hrecipe);
    if ($found->length) {
        $photo_url = $found->item(0)->getAttribute('src');
    }
    if (!$photo_url) {
        $found = $xpath->query('.//*[contains(concat(" ", normalize-space(@class), " "), " photo ")]//img', $hrecipe);
        if ($found->length) {
            $photo_url = $found->item(0)->getAttribute('src');
        }
    }
    if ($photo_url) {
        $recipe->photo_url = RecipeParser_Text::relativeToAbsolute($photo_url, $url);
    }

    // Yield
    $found = $xpath->query('.//*[@class="yield"]', $hrecipe);
    if ($found->length) {
        $recipe->yield = RecipeParser_Text::formatYield($found->item(0)->nodeValue);
    }

    // Prep Times: prefer the ISO 8601 value-title, fall back to free text.
    $found = $xpath->query('.//*[@class="prepTime"]//*[@class="value-title"]', $hrecipe);
    if ($found->length) {
        $iso = $found->item(0)->getAttribute('title');
        $recipe->time['prep'] = RecipeParser_Text::iso8601ToMinutes($iso);
    } else {
        $found = $xpath->query('.//*[@class="preptime"]', $hrecipe);
        if ($found->length) {
            $recipe->time['prep'] = RecipeParser_Times::toMinutes($found->item(0)->nodeValue);
        }
    }

    // Cook Times
    $found = $xpath->query('.//*[@class="cookTime"]//*[@class="value-title"]', $hrecipe);
    if ($found->length) {
        $iso = $found->item(0)->getAttribute('title');
        $recipe->time['cook'] = RecipeParser_Text::iso8601ToMinutes($iso);
    } else {
        $found = $xpath->query('.//*[@class="cooktime"]', $hrecipe);
        if ($found->length) {
            $recipe->time['cook'] = RecipeParser_Times::toMinutes($found->item(0)->nodeValue);
        }
    }

    // Total Time / Duration
    $found = $xpath->query('.//*[@class="totalTime"]//*[@class="value-title"]', $hrecipe);
    if ($found->length) {
        $iso = $found->item(0)->getAttribute('title');
        $recipe->time['total'] = RecipeParser_Text::iso8601ToMinutes($iso);
    } else {
        $found = $xpath->query('.//*[@class="duration"]//*[@class="value-title"]', $hrecipe);
        if ($found->length) {
            $iso = $found->item(0)->getAttribute('title');
            $recipe->time['total'] = RecipeParser_Text::iso8601ToMinutes($iso);
        } else {
            $found = $xpath->query('.//*[@class="duration"]', $hrecipe);
            if ($found->length) {
                $recipe->time['total'] = RecipeParser_Times::toMinutes($found->item(0)->nodeValue);
            }
        }
    }

    // Ingredients (note: searched across the whole document, not just the container)
    $ingredient_nodes = $xpath->query('//*[contains(concat(" ", normalize-space(@class), " "), " ingredient ")]');
    foreach ($ingredient_nodes as $ingredient_node) {
        $text = RecipeParser_Text::formatAsOneLine(trim($ingredient_node->nodeValue));

        // Skip lines that contain no word-like characters (sometimes used as section dividers).
        if (!preg_match("/\\w/", $text)) {
            continue;
        }

        // Section name delineated with dashes (e.g. "---Cake---") or ending with a colon.
        if (preg_match('/^\\-+([^\\-]{1}.*[^\\-]{1})\\-+$/', $text, $m)
            || preg_match('/^(.+)\\:$/', $text, $m)
        ) {
            $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($m[1]));
            continue;
        }

        $recipe->appendIngredient($text);
    }

    // Instructions: try progressively looser strategies and stop at the first that matches.

    // 1. <li> elements denote each step of the instructions.
    $steps = $xpath->query('//*[contains(concat(" ", normalize-space(@class), " "), " instructions ")]//li');
    if ($steps->length) {
        RecipeParser_Text::parseInstructionsFromNodes($steps, $recipe);
        return $recipe;
    }

    // 2. An "instruction" class marks each step of the instructions.
    $query = '//*[contains(concat(" ", normalize-space(@class), " "), " instructions ")]'
        . '//*[contains(concat(" ", normalize-space(@class), " "), " instruction ")]';
    $steps = $xpath->query($query);
    if ($steps->length) {
        RecipeParser_Text::parseInstructionsFromNodes($steps, $recipe);
        return $recipe;
    }

    // 3. Default. Multiple instructions nodes, or one with a blob of text.
    $steps = $xpath->query('//*[contains(concat(" ", normalize-space(@class), " "), " instructions ")]');
    if ($steps->length > 1) {
        RecipeParser_Text::parseInstructionsFromNodes($steps, $recipe);
    } elseif ($steps->length == 1) {
        RecipeParser_Text::parseInstructionsFromBlob($steps->item(0)->nodeValue, $recipe);
    }

    return $recipe;
}
/**
 * Parse a recipe page via the schema.org microdata parser, then override
 * times, yield, ingredients, notes and description from the page's
 * "recipePart*" markup.
 *
 * @param string $html Page markup, expected to be UTF-8.
 * @param string $url  Page URL; forwarded to the microdata parser.
 * @return mixed Recipe object returned by RecipeParser_Parser_MicrodataSchema::parse().
 */
public static function parse($html, $url)
{
    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Times
    // Each <li> is read positionally: child 1 holds the label, child 3 the value.
    // Items that do not have both children are skipped rather than dereferencing null.
    $nodes = $xpath->query('//*[@class="recipePartAttributes recipePartPrimaryAttributes"]//li');
    foreach ($nodes as $node) {
        $label_node = $node->childNodes->item(1);
        $value_node = $node->childNodes->item(3);
        if (!$label_node || !$value_node) {
            continue;
        }

        $label = trim($label_node->nodeValue);
        if ($label == "Prep Time") {
            $recipe->time['prep'] = RecipeParser_Times::toMinutes(trim($value_node->nodeValue));
        } elseif ($label == "Total Time") {
            $recipe->time['total'] = RecipeParser_Times::toMinutes(trim($value_node->nodeValue));
        }
    }

    // Yield
    $nodes = $xpath->query('//*[@class="recipePartAttributes recipePartSecondaryAttributes"]//li');
    foreach ($nodes as $node) {
        $label_node = $node->childNodes->item(1);
        $value_node = $node->childNodes->item(3);
        if (!$label_node || !$value_node) {
            continue;
        }

        if (trim($label_node->nodeValue) == "Servings") {
            $recipe->yield = RecipeParser_Text::formatYield(trim($value_node->nodeValue));
        }
    }

    // Ingredients
    $recipe->resetIngredients();
    $groups = $xpath->query('//*[@class="recipePartIngredientGroup"]');
    foreach ($groups as $group) {
        // An <h2> inside the group, when present, names the section.
        $nodes = $xpath->query('.//h2', $group);
        if ($nodes->length) {
            $line = RecipeParser_Text::formatSectionName($nodes->item(0)->nodeValue);
            $recipe->addIngredientsSection($line);
        }

        $nodes = $xpath->query('.//*[@itemprop="ingredients"]', $group);
        foreach ($nodes as $node) {
            $line = RecipeParser_Text::formatAsOneLine($node->nodeValue);
            $recipe->appendIngredient($line);
        }
    }

    // Notes / footnotes
    $notes = array();
    $nodes = $xpath->query('//div[@class="recipePartTipsInfo"]');
    foreach ($nodes as $node) {
        $notes[] = trim($node->nodeValue);
    }
    $recipe->notes = RecipeParser_Text::formatAsParagraphs(implode("\n\n", $notes));

    // Fix description
    $recipe->description = trim(preg_replace("/Servings \\# \\d+/", "", $recipe->description));

    return $recipe;
}
/**
 * Parse a nytimes.com recipe page.
 *
 * Two page layouts are handled, selected by URL: pages under
 * "www.nytimes.com/recipes/" (the "Recipes" section) and everything else
 * (treated as Dining section articles).
 *
 * @param string $html Page markup, expected to be UTF-8.
 * @param string $url  Page URL; selects the layout and is passed to formatPhotoUrl().
 * @return RecipeParser_Recipe
 */
public static function parse($html, $url)
{
    $recipe = new RecipeParser_Recipe();

    // DOM setup is identical for both layouts, so do it once.
    libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    if (strpos($url, "www.nytimes.com/recipes/") !== false) {
        //
        // "RECIPES" SECTION
        //

        // Title
        $nodes = $xpath->query('//h1[@class="recipe-title recipeName"]');
        if ($nodes->length) {
            $recipe->title = RecipeParser_Text::formatTitle($nodes->item(0)->nodeValue);
        }

        // Yield
        $nodes = $xpath->query('//*[@itemprop="recipeYield"]');
        if ($nodes->length) {
            $recipe->yield = RecipeParser_Text::formatYield($nodes->item(0)->nodeValue);
        }

        // Ingredients
        $nodes = $xpath->query('//div[@class="ingredientsGroup"]/*');
        foreach ($nodes as $node) {
            if ($node->nodeName == "h3") {
                // A plain "Ingredients" heading is not a section name.
                $value = trim($node->nodeValue);
                if (!preg_match('/^Ingredients:?$/i', $value)) {
                    $value = RecipeParser_Text::formatSectionName($value);
                    $recipe->addIngredientsSection($value);
                }
            } else {
                foreach ($node->childNodes as $child) {
                    $recipe->appendIngredient(trim($child->nodeValue));
                }
            }
        }

        // Instructions
        $nodes = $xpath->query('//*[@itemprop="recipeInstructions"]/dd');
        foreach ($nodes as $node) {
            $value = RecipeParser_Text::formatAsOneLine($node->nodeValue);
            $recipe->appendInstruction($value);
        }

        // Notes
        if (!$recipe->notes) {
            $nodes = $xpath->query('//div[@class="yieldNotesGroup"]//*[@class="note"]');
            if ($nodes->length) {
                $value = trim($nodes->item(0)->nodeValue);
                $value = preg_replace("/^Notes?:?\\s*/i", '', $value);
                $recipe->notes = trim($value);
            }
        }

        return $recipe;
    }

    //
    // DINING SECTION RECIPES
    //

    // Title
    $nodes = $xpath->query('//div[@id = "article"]//h1');
    if ($nodes->length) {
        $recipe->title = trim($nodes->item(0)->nodeValue);
    }

    // Time and Yield
    $nodes = $xpath->query('//div[@id = "article"]//p');
    foreach ($nodes as $node) {
        $text = trim($node->nodeValue);
        if (preg_match('/^Yield:? (.+)/', $text, $m)) {
            $recipe->yield = RecipeParser_Text::formatYield($m[1]);
        } elseif (preg_match('/^Time:? (.+)/', $text, $m)) {
            // Strip a leading "About" and anything from " plus" onward before converting.
            $str = trim($m[1]);
            $str = preg_replace('/About (.+)/', '$1', $str);
            $str = preg_replace('/(.+) plus.*/', '$1', $str);
            $recipe->time['total'] = RecipeParser_Times::toMinutes($str);
        }
    }

    // Ingredients
    $nodes = $xpath->query('//div[@class="recipeIngredientsList"]/p');
    foreach ($nodes as $node) {
        $line = trim($node->nodeValue);

        // Section names are written in all caps.
        if ($line && $line == strtoupper($line)) {
            $line = RecipeParser_Text::formatSectionName($line);
            $recipe->addIngredientsSection($line);
            continue;
        }

        $recipe->appendIngredient($line);
    }

    // Instructions and notes
    $nodes = $xpath->query('//div[@class="articleBody"]//p');
    if (!$nodes->length) {
        $nodes = $xpath->query('//div[@id="articleBody"]//p');
    }

    $notes = '';
    $in_notes_section = false;
    foreach ($nodes as $node) {
        $line = trim($node->nodeValue);

        // Skip some of the useless lines
        if (preg_match('/^(Adapted from|Time|Yield)/i', $line)) {
            continue;
        }

        // Instructions start with line numbers
        if (!$in_notes_section && preg_match('/^\\d+\\./', $line)) {
            $line = RecipeParser_Text::stripLeadingNumbers($line);
            $recipe->appendInstruction($line);
            continue;
        }

        // Look for lines that start the notes section; once in it, every
        // following paragraph is treated as a note.
        $note = '';
        if (preg_match('/^Notes?:?(.*)/i', $line, $m)) {
            $in_notes_section = true;
            $note = trim($m[1]);
        } elseif ($in_notes_section) {
            $note = $line;
        }

        if ($note) {
            $notes .= $note . "\n\n";
        }
    }

    if ($notes) {
        // Some unnecessary spaces: collapse runs of spaces to a single one.
        // (The previous single-space-for-single-space replacement did nothing.)
        $notes = preg_replace('/ {2,}/', ' ', $notes);
        $recipe->notes = trim($notes);
    }

    // Photo
    $nodes = $xpath->query('//div[@class="image"]//img');
    if ($nodes->length) {
        $photo_url = $nodes->item(0)->getAttribute('src');
        $photo_url = str_replace('-articleInline.jpg', '-popup.jpg', $photo_url);
        $recipe->photo_url = RecipeParser_Text::formatPhotoUrl($photo_url, $url);
    }

    return $recipe;
}
/**
 * Parse a recipe page via the schema.org microdata parser, then apply
 * About.com-specific overrides for title, credits, ingredients and instructions.
 *
 * @param string $html Page markup, expected to be UTF-8.
 * @param string $url  Page URL; forwarded to the microdata parser.
 * @return mixed Recipe object returned by RecipeParser_Parser_MicrodataSchema::parse().
 */
public static function parse($html, $url)
{
    // Turn off libxml errors to prevent mismatched tag warnings.
    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    // OVERRIDES FOR ABOUT.COM

    // Title
    $nodes = $xpath->query('//*[@itemprop="headline name"]');
    if ($nodes->length) {
        $value = trim($nodes->item(0)->nodeValue);
        $recipe->title = RecipeParser_Text::formatTitle($value);
    }

    // Credits
    $nodes = $xpath->query('//*[@itemprop="author"]//*[@itemprop="name"]');
    if ($nodes->length) {
        $line = $nodes->item(0)->nodeValue;
        $recipe->credits = RecipeParser_Text::formatCredits($line . ", About.com");
    }

    // Ingredients
    $recipe->resetIngredients();
    $nodes = $xpath->query('//*[@itemprop="ingredients"]');
    foreach ($nodes as $node) {
        $value = RecipeParser_Text::formatAsOneLine($node->nodeValue);

        // An ingredient whose first child is <strong> or <b> is a section heading.
        // The node may have no children at all, so guard before reading nodeName.
        $first_child = $node->childNodes->item(0);
        $first_name = $first_child ? $first_child->nodeName : '';

        if (RecipeParser_Text::matchSectionName($value)
            || $first_name == "strong"
            || $first_name == "b"
        ) {
            $value = RecipeParser_Text::formatSectionName($value);
            $recipe->addIngredientsSection($value);
        } else {
            $recipe->appendIngredient($value);
        }
    }

    // Instructions
    $recipe->resetInstructions();
    $nodes = $xpath->query('//div[@itemprop="recipeInstructions"]');
    foreach ($nodes as $node) {
        $text = trim($node->nodeValue);
        $lines = preg_split("/[\n\r]+/", $text);

        // Walk the lines bottom-up, so that when several lines start with
        // "Makes " the one nearest the top is assigned last and wins.
        for ($i = count($lines) - 1; $i >= 0; $i--) {
            $lines[$i] = trim($lines[$i]);

            // Remove ends of lines that have the word "recipes" squashed up against
            // another word, which seems to happen with long lists of related
            // recipe links.
            // Remove lines that have the phrase "Xxxxx Recipes and More".
            // Remove lines that have the phrase "Xxxxx Recipes | Xxxxx".
            // Remove mentions of newsletters.
            $lines[$i] = preg_replace("/(.*)recipes\\w/i", "\$1", $lines[$i]);
            $lines[$i] = preg_replace("/(.*)More .* Recipes.*/", "\$1", $lines[$i]);
            $lines[$i] = preg_replace("/(.*)Recipes and More.*/", "\$1", $lines[$i]);
            $lines[$i] = preg_replace("/(.*)Recipes \\| .*/", "\$1", $lines[$i]);
            $lines[$i] = preg_replace("/(.*)Recipe Newsletter.*/", "\$1", $lines[$i]);

            // Look for a line in the instructions that looks like a yield.
            if (strpos($lines[$i], "Makes ") === 0) {
                $recipe->yield = substr($lines[$i], 6);
                $lines[$i] = '';
            }
        }

        foreach ($lines as $line) {
            $line = trim($line);
            if (empty($line)) {
                continue;
            }
            if (strtolower($line) == "preparation") {
                continue;
            }

            // Match section names that read something like "---For the cake: Raise the oven temperature..."
            if (preg_match("/^(?:-{2,})?For the (.+)\\: (.*)\$/i", $line, $m)) {
                $section = RecipeParser_Text::formatSectionName($m[1]);
                $recipe->addInstructionsSection($section);

                // Reset the value of $line, without the section name.
                $line = ucfirst($m[2]);
            }

            $recipe->appendInstruction($line);
        }
    }

    return $recipe;
}
/**
 * Parse a page marked up with data-vocabulary RDFa attributes
 * (property="v:..." / rel="v:...").
 *
 * @param string $html Page markup, expected to be UTF-8.
 * @param string $url  Page URL, passed to formatPhotoUrl() for the photo.
 * @return RecipeParser_Recipe
 */
public static function parse($html, $url)
{
    $recipe = new RecipeParser_Recipe();

    libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Title
    $matches = $xpath->query('//*[@property="v:name"]');
    if ($matches->length) {
        $recipe->title = trim($matches->item(0)->nodeValue);
    }

    // Summary
    $matches = $xpath->query('//*[@property="v:summary"]');
    if ($matches->length) {
        $recipe->description = trim($matches->item(0)->nodeValue);
    }

    // Times: a "content" attribute is read as ISO 8601, otherwise the element text is parsed.
    $time_properties = array(
        'v:prepTime'  => 'prep',
        'v:cookTime'  => 'cook',
        'v:totalTime' => 'total',
    );
    foreach ($time_properties as $property => $time_key) {
        $matches = $xpath->query('//*[@property="' . $property . '"]');
        if (!$matches->length) {
            continue;
        }

        $element = $matches->item(0);
        $minutes = $element->getAttribute('content');
        if ($minutes) {
            $minutes = RecipeParser_Text::iso8601ToMinutes($minutes);
        } else {
            $minutes = RecipeParser_Times::toMinutes(trim($element->nodeValue));
        }

        if ($minutes) {
            $recipe->time[$time_key] = $minutes;
        }
    }

    // Yield
    $matches = $xpath->query('//*[@property="v:yield"]');
    if ($matches->length) {
        $text = preg_replace('/\\s+/', ' ', trim($matches->item(0)->nodeValue));
        $recipe->yield = RecipeParser_Text::formatYield($text);
    }

    // Ingredients (data-vocabulary)
    foreach ($xpath->query('//*[@rel="v:ingredient"]') as $ingredient) {
        $text = RecipeParser_Text::formatAsOneLine($ingredient->nodeValue);
        if (empty($text)) {
            continue;
        }

        if (RecipeParser_Text::matchSectionName($text)) {
            $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($text));
        } else {
            $recipe->appendIngredient($text);
        }
    }

    // Instructions: try each strategy in turn until one yields nodes.
    $found = false;

    // Some sites will use an "instruction" property for each line.
    $steps = $xpath->query('//*[@property="v:instructions"]//*[@property="v:instruction"]');
    if ($steps->length) {
        RecipeParser_Text::parseInstructionsFromNodes($steps, $recipe);
        $found = true;
    }

    // Look for markup that uses <li> or <p> tags for each instruction (<li> is tried first).
    foreach (array("li", "p") as $tag) {
        if ($found) {
            break;
        }
        $steps = $xpath->query('//*[@property="v:instructions"]//' . $tag);
        if ($steps->length) {
            RecipeParser_Text::parseInstructionsFromNodes($steps, $recipe);
            $found = true;
        }
    }

    // Either multiple instructions nodes, or one node with a blob of text.
    if (!$found) {
        $steps = $xpath->query('//*[@property="v:instructions"]');
        if ($steps->length > 1) {
            RecipeParser_Text::parseInstructionsFromNodes($steps, $recipe);
        } elseif ($steps->length == 1) {
            RecipeParser_Text::parseInstructionsFromBlob($steps->item(0)->nodeValue, $recipe);
        }
    }

    // Photo: the rel="v:photo" element itself, or an <img> nested inside it.
    $photo_url = "";
    $matches = $xpath->query('//*[@rel="v:photo"]');
    if ($matches->length) {
        $photo_url = $matches->item(0)->getAttribute('src');
    }
    if (!$photo_url) {
        $matches = $xpath->query('//*[@rel="v:photo"]//img');
        if ($matches->length) {
            $photo_url = $matches->item(0)->getAttribute('src');
        }
    }
    if ($photo_url) {
        $recipe->photo_url = RecipeParser_Text::formatPhotoUrl($photo_url, $url);
    }

    // Credits
    $matches = $xpath->query('//*[@property="v:author"]');
    if ($matches->length) {
        $recipe->credits = RecipeParser_Text::formatCredits($matches->item(0)->nodeValue);
    }

    return $recipe;
}
/**
 * Parse a recipe page via the microformat parser, then apply About.com-specific
 * overrides for prep/cook times and instructions.
 *
 * @param string $html Page markup, expected to be UTF-8.
 * @param string $url  Page URL; forwarded to the microformat parser.
 * @return mixed Recipe object returned by RecipeParser_Parser_Microformat::parse().
 */
public static function parse($html, $url)
{
    // Turn off libxml errors to prevent mismatched tag warnings.
    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    $recipe = RecipeParser_Parser_Microformat::parse($html, $url);

    // OVERRIDES FOR ABOUT.COM

    // Cook times
    // (Total time is provided as part of microformat markup for About.com.)
    foreach ($xpath->query('//div[@id = "articlebody"]/h3') as $heading) {
        $text = trim(preg_replace('/[\\s\\"]+/', ' ', $heading->nodeValue));

        if (preg_match("/prep time\\:(.+)/i", $text, $m)) {
            $recipe->time['prep'] = RecipeParser_Times::toMinutes($m[1]);
        } elseif (preg_match("/cook time\\:(.+)/i", $text, $m)) {
            $recipe->time['cook'] = RecipeParser_Times::toMinutes($m[1]);
        }
    }

    // Instructions
    $recipe->resetInstructions();

    // Applied in order to every line:
    //  - cut ends of lines that have the word "recipes" squashed up against
    //    another word (seems to happen with long lists of related recipe links),
    //  - cut "More Xxxxx Recipes", "Xxxxx Recipes and More" and
    //    "Xxxxx Recipes | Xxxxx" phrases,
    //  - cut mentions of newsletters.
    $cleanup_patterns = array(
        "/(.*)recipes\\w/i",
        "/(.*)More .* Recipes.*/",
        "/(.*)Recipes and More.*/",
        "/(.*)Recipes \\| .*/",
        "/(.*)Recipe Newsletter.*/",
    );

    foreach ($xpath->query('//div[@class = "instructions"]') as $block) {
        $lines = preg_split("/[\n\r]+/", trim($block->nodeValue));

        // Processed bottom-up, so when several lines start with "Makes " the
        // one nearest the top is assigned last and wins.
        $index = count($lines);
        while (--$index >= 0) {
            $cleaned = preg_replace($cleanup_patterns, "\$1", trim($lines[$index]));

            // Look for a line in the instructions that looks like a yield.
            if (strpos($cleaned, "Makes ") === 0) {
                $recipe->yield = substr($cleaned, 6);
                $cleaned = '';
            }

            $lines[$index] = $cleaned;
        }

        foreach ($lines as $step) {
            $step = trim($step);
            if (empty($step)) {
                continue;
            }

            // Match section names that read something like "---For the cake: Raise the oven temperature..."
            if (preg_match("/^(?:-{2,})?For the (.+)\\: (.*)\$/i", $step, $m)) {
                $recipe->addInstructionsSection(RecipeParser_Text::formatSectionName($m[1]));

                // Continue with the remainder of the line, without the section name.
                $step = ucfirst($m[2]);
            }

            $recipe->appendInstruction($step);
        }
    }

    return $recipe;
}
/**
 * Parse a recipe page that uses "recipe_ingredients" / "recipe_instructions"
 * list markup, with the title in div#rightCol.
 *
 * @param string $html Page markup, expected to be UTF-8.
 * @param string $url  Page URL, passed to formatPhotoUrl() for the photo.
 * @return RecipeParser_Recipe
 */
public static function parse($html, $url)
{
    $recipe = new RecipeParser_Recipe();

    libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Title
    $nodes = $xpath->query('//div[@id="rightCol"]/h1');
    if ($nodes->length) {
        $recipe->title = trim($nodes->item(0)->nodeValue);
    }

    // Yield
    $nodes = $xpath->query('//h4[@class="detailHeader"]');
    if ($nodes->length) {
        $line = trim($nodes->item(0)->nodeValue);
        $recipe->yield = RecipeParser_Text::formatYield($line);
    }

    // Notes
    $nodes = $xpath->query('//div[@class="dek"]');
    if ($nodes->length) {
        $recipe->notes = trim($nodes->item(0)->nodeValue);
    }

    // Ingredients
    $nodes = $xpath->query('//ul[@class="recipe_ingredients"]/li');
    foreach ($nodes as $node) {
        $line = trim($node->nodeValue);

        // Section names have class="ingredientSectionTitle",
        // ingredients themselves have no class.
        if ($node->hasAttributes()) {
            $line = RecipeParser_Text::formatSectionName($line);
            $recipe->addIngredientsSection($line);
            continue;
        }

        // Add spaces between quantities and units
        $line = preg_replace('/(\\d+)([A-Za-z]+)/', "\$1 \$2", $line);
        // Remove spaces before commas (not sure why this happens in their HTML)
        $line = str_replace(' ,', ',', $line);
        // Condense multiple spaces. (The previous replacement swapped a single
        // space for a single space and therefore did nothing.)
        $line = preg_replace('/ {2,}/', ' ', $line);

        $recipe->appendIngredient($line);
    }

    // Instructions
    $nodes = $xpath->query('//ol[@class="recipe_instructions"]/li');
    foreach ($nodes as $node) {
        $line = trim($node->nodeValue);
        $line = RecipeParser_Text::stripLeadingNumbers($line);
        $recipe->appendInstruction($line);
    }

    // Photo: prefer img.recipeImg, fall back to img#splashImage.
    $nodes = $xpath->query('//img[@class="recipeImg"]');
    if (!$nodes->length) {
        // Second option for where to find recipe image
        $nodes = $xpath->query('//img[@id="splashImage"]');
    }
    if ($nodes->length) {
        $photo_url = $nodes->item(0)->getAttribute('src');
        $recipe->photo_url = RecipeParser_Text::formatPhotoUrl($photo_url, $url);
    }

    return $recipe;
}
/**
 * Parse a recipe page via the data-vocabulary microdata parser, then override
 * ingredients (from the "IngredientSet" elements) and instructions (from the
 * itemprop="instructions" element, whose steps are separated by <br> tags).
 *
 * @param string $html Page markup, expected to be UTF-8.
 * @param string $url  Page URL; forwarded to the microdata parser.
 * @return mixed Recipe object returned by RecipeParser_Parser_MicrodataDataVocabulary::parse().
 */
public static function parse($html, $url)
{
    $recipe = RecipeParser_Parser_MicrodataDataVocabulary::parse($html, $url);

    // Turn off libxml errors to prevent mismatched tag warnings.
    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Ingredients
    $recipe->resetIngredients();
    foreach ($xpath->query('//*[@id="v_ingredients"]//*[@id="IngredientSet"]') as $set) {
        // Optional heading naming this set of ingredients.
        $headings = $xpath->query('.//*[@id="IngredientHeading"]', $set);
        if ($headings->length) {
            $heading = RecipeParser_Text::formatSectionName($headings->item(0)->nodeValue);
            $recipe->addIngredientsSection($heading);
        }

        foreach ($xpath->query('.//*[@id="IngredientLine"]', $set) as $entry) {
            $recipe->appendIngredient(RecipeParser_Text::formatAsOneLine($entry->nodeValue));
        }
    }

    // Instructions
    $recipe->resetInstructions();
    $containers = $xpath->query('//*[@itemprop="instructions"]');
    if (!$containers->length) {
        return $recipe;
    }

    // The container separates steps with <br> tags, which nodeValue would drop.
    // Walk the direct children instead: <br> becomes a literal "<br>" marker,
    // <b> text is tagged with a "SECTION:" prefix, and any other node (text,
    // links, ...) contributes its text when it holds a non-whitespace character.
    $markup = "";
    foreach ($containers->item(0)->childNodes as $child) {
        switch ($child->nodeName) {
            case "br":
                $markup .= "<br>";
                break;
            case "b":
                $markup .= "SECTION:" . $child->nodeValue;
                break;
            default:
                $text = $child->nodeValue;
                if (preg_match("/\\S/", $text)) {
                    $markup .= $text;
                }
        }
    }

    foreach (explode("<br>", $markup) as $piece) {
        if (strpos($piece, "SECTION:") === 0) {
            $name = RecipeParser_Text::formatSectionName(substr($piece, 8));
            $recipe->addInstructionsSection($name);
            continue;
        }

        $piece = RecipeParser_Text::formatAsOneLine($piece);
        $piece = RecipeParser_Text::stripLeadingNumbers($piece);
        if (stripos($piece, "yield:") === 0) {
            continue;
        }
        $recipe->appendInstruction($piece);
    }

    return $recipe;
}
/**
 * Parse a recipe page whose content lives as loosely structured siblings
 * inside div#detail_content: headings mark where the ingredients and
 * instructions start, and the lists that follow hold the items.
 *
 * @param string $html Page markup, expected to be UTF-8.
 * @param string $url  Page URL, passed to formatPhotoUrl() for the photo.
 * @return RecipeParser_Recipe
 */
public static function parse($html, $url)
{
    // Turn off libxml errors to prevent mismatched tag warnings.
    libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    $recipe = new RecipeParser_Recipe();

    // Title
    $nodes = $xpath->query('//div[@id="detail_content"]/h1');
    if ($nodes->length) {
        $recipe->title = trim($nodes->item(0)->nodeValue);
    }

    // Yield and Times
    $nodes = $xpath->query('//p[@id="yield"]');
    if ($nodes->length) {
        $line = trim($nodes->item(0)->nodeValue);
        $recipe->yield = RecipeParser_Text::formatYield($line);
    }

    // Notes, instructions, and ingredients are not very well structured.
    // Walk the direct children in document order and track which part we are in.
    $found_ingredients = false;
    $found_instructions = false;
    $nodes = $xpath->query('//div[@id="detail_content"]/*');
    foreach ($nodes as $node) {
        // Notes -- Weird, but this is the only <p> that doesn't have attributes
        // on the tag.
        if ($node->nodeName == 'p' && !$node->hasAttributes()) {
            $recipe->notes = trim($node->nodeValue);
            continue;
        }

        // Ingredients/instructions markers
        if ($node->nodeName == 'h5') {
            $line = strtolower(trim($node->nodeValue));
            if ($line == 'ingredients') {
                $found_ingredients = true;
                continue;
            } elseif ($line == 'instructions') {
                $found_instructions = true;
            }
        }

        // Ingredients
        if ($found_ingredients && !$found_instructions) {
            if ($node->nodeName == 'h6') {
                $line = RecipeParser_Text::formatSectionName(trim($node->nodeValue));
                $recipe->addIngredientsSection($line);
            } elseif ($node->nodeName == 'ul') {
                foreach ($node->childNodes as $sub) {
                    $line = trim($sub->nodeValue);

                    // Add spaces between quantities and units
                    $line = preg_replace('/(\\d+)([A-Za-z]+)/', "\$1 \$2", $line);
                    // Remove spaces before commas (not sure why this happens in their HTML)
                    $line = str_replace(' ,', ',', $line);
                    // Condense multiple spaces. (The previous replacement swapped a
                    // single space for a single space and therefore did nothing.)
                    $line = preg_replace('/ {2,}/', ' ', $line);

                    $recipe->appendIngredient($line);
                }
            }
        }

        // Instructions
        if ($found_instructions && $node->nodeName == 'ul') {
            foreach ($node->childNodes as $sub) {
                $line = trim($sub->nodeValue);
                $line = RecipeParser_Text::stripLeadingNumbers($line);
                $recipe->appendInstruction($line);
            }
        }
    }

    // Photo
    $nodes = $xpath->query('//img[@class="detail"]');
    if ($nodes->length) {
        $photo_url = $nodes->item(0)->getAttribute('src');
        $recipe->photo_url = RecipeParser_Text::formatPhotoUrl($photo_url, $url);
    }

    return $recipe;
}
/**
 * Parse a Bravo TV recipe page (title, times, description, yield, notes,
 * ingredients, directions, credits and photo).
 *
 * @param string $html Page markup, expected to be UTF-8.
 * @param string $url  Page URL, passed to formatPhotoUrl() for the photo.
 * @return RecipeParser_Recipe
 */
public static function parse($html, $url)
{
    $recipe = new RecipeParser_Recipe();

    libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Title
    $nodes = $xpath->query('//h3[@class = "title"]');
    if ($nodes->length) {
        $recipe->title = trim($nodes->item(0)->nodeValue);
    }

    // Cook times
    // Each metadata <li> pairs an <h5> label with a <p> value.
    $nodes = $xpath->query('//div[@class = "recipe-metadata"]/ul/li');
    foreach ($nodes as $node) {
        $key = null;
        $value = null;
        foreach ($node->childNodes as $sub_node) {
            if ($sub_node->nodeName == 'h5') {
                $key = trim($sub_node->nodeValue);
            }
            if ($sub_node->nodeName == 'p') {
                $value = trim($sub_node->nodeValue);
            }
        }

        // Inspect keys/values we've found.
        if ($key == 'Total Time:') {
            $value = self::cleanupTime($value);
            $recipe->time['total'] = RecipeParser_Times::toMinutes($value);
        }
        if ($key == 'Prep Time:') {
            $value = self::cleanupTime($value);
            $recipe->time['prep'] = RecipeParser_Times::toMinutes($value);
        }
    }

    // <dd>-based times, when present, override the metadata list values above.
    $dd_queries = array(
        '//dd[@class = "preptime"]' => 'prep',
        '//dd[@class = "cooktime"]' => 'cook',
        '//dd[@class = "duration totaltime special"]' => 'total',
    );
    foreach ($dd_queries as $dd_query => $time_key) {
        $node_list = $xpath->query($dd_query);
        if ($node_list->length) {
            $recipe->time[$time_key] = RecipeParser_Times::toMinutes($node_list->item(0)->nodeValue);
        }
    }

    // Ingredients, Yield, Description, Notes, etc.
    // The body is a flat run of <h4> titles, each followed by <div class="section"> content.
    $nodes = $xpath->query('//div[@class = "recipe-body"]/*');
    $section_title = null;
    foreach ($nodes as $node) {
        // Section titles
        if ($node->nodeName == 'h4') {
            $section_title = trim(strtolower($node->nodeValue));
            continue;
        }

        // Only <div class="section"> nodes carry content.
        if ($node->nodeName != 'div' || $node->getAttribute('class') != 'section') {
            continue;
        }

        if (!$section_title) {
            // Description should be first text, before any section titles.
            $value = $node->nodeValue;
            $value = preg_replace("/^(Drink\\:|Top Chef).*\$/m", '', $value);
            $value = str_replace("\n\n", "\n", $value);
            $recipe->description = trim($value);
        } elseif ($section_title == 'yield') {
            $recipe->yield = trim($node->nodeValue);
        } elseif ($section_title == 'notes') {
            $value = trim($node->nodeValue);
            $recipe->notes = str_replace("\n\n", "\n", $value);
        } elseif ($section_title == 'ingredients') {
            foreach ($node->childNodes as $sub_node) {
                if ($sub_node->nodeName == 'h5') {
                    $value = RecipeParser_Text::formatSectionName($sub_node->nodeValue);
                    $recipe->addIngredientsSection($value);
                } elseif ($sub_node->nodeName == 'ul') {
                    foreach ($sub_node->childNodes as $li_node) {
                        $recipe->appendIngredient(trim($li_node->nodeValue));
                    }
                }
            }
        } elseif ($section_title == 'directions') {
            foreach ($node->childNodes as $sub_node) {
                $value = trim($sub_node->nodeValue);

                // Section titles appear in all-caps, or end with a colon.
                if ($value && ($value == strtoupper($value) || preg_match('/:$/', $value))) {
                    $value = RecipeParser_Text::formatSectionName($value);
                    $recipe->addInstructionsSection($value);
                } else {
                    $value = RecipeParser_Text::stripLeadingNumbers($value);
                    $recipe->appendInstruction($value);
                }
            }
        }
    }

    // Source / Chef
    $nodes = $xpath->query('//div[@class = "recipe-sidebar"]/div/*');
    $section_title = null;
    $chef_name = null;
    $show_name = 'Bravo TV';
    foreach ($nodes as $node) {
        if ($node->nodeName == 'h4') {
            $section_title = strtolower(trim($node->nodeValue));
            continue;
        }
        if ($node->nodeName == 'small'
            && ($section_title == 'chef' || $section_title == 'author')
        ) {
            $chef_name = trim($node->nodeValue);
            break;
        }
    }

    // The first paragraph of a section may name the show; the longer name is checked first.
    $nodes = $xpath->query('//div[@class = "section"]/p[1]');
    if ($nodes->length) {
        $value = $nodes->item(0)->nodeValue;
        if (strpos($value, 'Top Chef Masters') !== false) {
            $show_name = 'Top Chef Masters';
        } elseif (strpos($value, 'Top Chef') !== false) {
            $show_name = 'Top Chef';
        }
    }

    // Without a chef name, credit the show alone instead of producing ", <show>".
    $recipe->credits = $chef_name ? $chef_name . ', ' . $show_name : $show_name;

    // Photo
    $nodes = $xpath->query('//div[@class = "recipe-header clearfix"]//img');
    if ($nodes->length) {
        $photo_url = $nodes->item(0)->getAttribute('src');
        $photo_url = str_replace('/medium/', '/original/', $photo_url);
        $recipe->photo_url = RecipeParser_Text::formatPhotoUrl($photo_url, $url);
    }

    return $recipe;
}
/**
 * Parse a recipe page: start from the generic schema.org microdata parser,
 * then rebuild ingredients and instructions from the site-specific markup
 * and drop the photo if it turns out to be the chef's portrait.
 *
 * @param string $html Raw HTML of the recipe page (converted from UTF-8 below).
 * @param string $url  URL of the page; forwarded to the microdata parser.
 *
 * @return mixed The recipe object returned by
 *               RecipeParser_Parser_MicrodataSchema::parse(), with overrides applied.
 */
public static function parse($html, $url)
{
    // Get all of the standard microdata stuff we can find.
    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    // Turn off libxml errors to prevent mismatched tag warnings.
    libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Ingredients: discard whatever the microdata parser found and re-read
    // them from the <ul> lists inside the ingredients column.
    $recipe->resetIngredients();
    $nodes = $xpath->query('//div[@class="col6 ingredients"]/*');
    foreach ($nodes as $node) {
        if ($node->nodeName != 'ul') {
            continue;
        }

        foreach ($node->childNodes as $ing_node) {
            // Only <li> children matter; this also skips whitespace text nodes,
            // which do not support getAttribute().
            if ($ing_node->nodeName != 'li') {
                continue;
            }

            $line = trim($ing_node->nodeValue);

            // An empty <li> trivially equals its own upper-cased form and
            // would otherwise be recorded as a blank section title.
            if ($line === '') {
                continue;
            }

            if ($ing_node->getAttribute("itemprop") == "ingredients") {
                // Section titles might be all-uppercase ingredients. Strict
                // comparison avoids PHP's numeric-string juggling
                // (e.g. "1e3" == "1E3").
                if ($line === strtoupper($line)) {
                    $line = RecipeParser_Text::formatSectionName($line);
                    $recipe->addIngredientsSection($line);
                    continue;
                }

                // Skip boilerplate lines that are not real ingredients.
                if (stripos($line, "copyright") !== false
                    || stripos($line, "recipe follows") !== false
                ) {
                    continue;
                }

                $line = RecipeParser_Text::formatAsOneLine($line);
                $recipe->appendIngredient($line);
            } elseif ($ing_node->getAttribute("class") == "subtitle") {
                // Explicitly marked section titles.
                $line = RecipeParser_Text::formatSectionName($line);
                $recipe->addIngredientsSection($line);
            }
        }
    }

    // Instructions: <span> children are section names, <p> children are steps.
    $recipe->resetInstructions();
    $nodes = $xpath->query('//*[@itemprop="recipeInstructions"]/*');
    foreach ($nodes as $node) {
        if ($node->nodeName == "span") {
            $line = RecipeParser_Text::formatSectionName($node->nodeValue);
            $recipe->addInstructionsSection($line);
        } elseif ($node->nodeName == "p") {
            $line = RecipeParser_Text::formatAsOneLine($node->nodeValue);
            // Paragraphs starting with "Photograph" are skipped rather than
            // stored as instruction steps.
            if (!preg_match("/^Photograph/i", $line)) {
                $recipe->appendInstruction($line);
            }
        }
    }

    // See if we've captured a chef's photo, and delete it (if so).
    if ($recipe->photo_url) {
        $nodes = $xpath->query('//a[@itemprop="url"]/img[@itemprop="image"]');
        if ($nodes->length > 0) {
            // Kept in its own variable so the $url parameter is not clobbered.
            $chef_photo_url = $nodes->item(0)->getAttribute("src");
            if ($recipe->photo_url == $chef_photo_url) {
                $recipe->photo_url = "";
            }
        }
    }

    return $recipe;
}
/**
 * Parse a recipe marked up with data-vocabulary.org Recipe microdata.
 *
 * Locates the first element whose itemtype is
 * "http://data-vocabulary.org/Recipe" and reads title, summary, times,
 * yield, ingredients, instructions, photo and credits from it. When no such
 * element exists, an empty recipe object is returned.
 *
 * @param string $html Raw HTML of the recipe page (converted from UTF-8 below).
 * @param string $url  URL of the page; used to resolve a relative photo URL.
 *
 * @return RecipeParser_Recipe The populated (or empty) recipe.
 */
public static function parse($html, $url)
{
    $recipe = new RecipeParser_Recipe();

    // Collect libxml errors internally instead of emitting warnings.
    libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Find the top-level node for Recipe microdata.
    $nodes = $xpath->query('//*[@itemtype="http://data-vocabulary.org/Recipe"]');
    if (!$nodes->length) {
        return $recipe;
    }
    $microdata = $nodes->item(0);

    // Title
    $nodes = $xpath->query('.//*[@itemprop="name"]', $microdata);
    if ($nodes->length) {
        $value = $nodes->item(0)->nodeValue;
        $recipe->title = RecipeParser_Text::formatTitle($value);
    }

    // Summary
    $nodes = $xpath->query('.//*[@itemprop="summary"]', $microdata);
    if ($nodes->length) {
        $recipe->description = trim($nodes->item(0)->nodeValue);
    }

    // Times: prefer the machine-readable "datetime" or "content" attribute,
    // otherwise fall back to the element's visible text.
    $searches = array('prepTime' => 'prep', 'cookTime' => 'cook', 'totalTime' => 'total');
    foreach ($searches as $itemprop => $time_key) {
        $nodes = $xpath->query('.//*[@itemprop="' . $itemprop . '"]', $microdata);
        if (!$nodes->length) {
            continue;
        }
        $time_node = $nodes->item(0);

        if ($value = $time_node->getAttribute('datetime')) {
            $value = RecipeParser_Text::iso8601ToMinutes($value);
        } elseif ($value = $time_node->getAttribute('content')) {
            $value = RecipeParser_Text::iso8601ToMinutes($value);
        } else {
            $value = trim($time_node->nodeValue);
            $value = RecipeParser_Times::toMinutes($value);
        }

        if ($value) {
            $recipe->time[$time_key] = $value;
        }
    }

    // Yield ("yield", falling back to "servingSize")
    $line = "";
    $nodes = $xpath->query('.//*[@itemprop="yield"]', $microdata);
    if (!$nodes->length) {
        $nodes = $xpath->query('.//*[@itemprop="servingSize"]', $microdata);
    }
    if ($nodes->length) {
        $line = trim($nodes->item(0)->nodeValue);
    }
    if ($line) {
        $line = preg_replace('/\\s+/', ' ', $line);
        $recipe->yield = RecipeParser_Text::formatYield($line);
    }

    // Ingredients: (data-vocabulary) itemprop first, then non-standard
    // id/class based list markup.
    $nodes = $xpath->query('.//*[@itemprop="ingredient"]', $microdata);
    if (!$nodes->length) {
        // non-standard
        $nodes = $xpath->query('.//*[@id="ingredients"]//li', $microdata);
    }
    if (!$nodes->length) {
        // non-standard
        $nodes = $xpath->query('.//*[@class="ingredients"]//li', $microdata);
    }
    foreach ($nodes as $node) {
        $value = RecipeParser_Text::formatAsOneLine($node->nodeValue);
        if (empty($value)) {
            continue;
        }
        if (RecipeParser_Text::matchSectionName($value)) {
            $value = RecipeParser_Text::formatSectionName($value);
            $recipe->addIngredientsSection($value);
        } else {
            $recipe->appendIngredient($value);
        }
    }

    // Instructions

    // Look for markup that uses <li> tags for each instruction.
    $nodes = $xpath->query('.//*[@itemprop="instructions"]//li', $microdata);

    // Some sites will use an "instruction" class for each line. The container
    // is matched under both "instructions" (the property every other lookup
    // here uses) and "instruction" (the only name this lookup matched before),
    // so previously matching pages keep working.
    if (!$nodes->length) {
        $nodes = $xpath->query(
            './/*[@itemprop="instructions" or @itemprop="instruction"]'
            . '//*[contains(concat(" ", normalize-space(@class), " "), " instruction ")]',
            $microdata
        );
    }

    if ($nodes->length) {
        RecipeParser_Text::parseInstructionsFromNodes($nodes, $recipe);
    } else {
        // Either multiple instructions nodes, or one node with a blob of text.
        $nodes = $xpath->query('.//*[@itemprop="instructions"]', $microdata);
        if ($nodes->length > 1) {
            // Multiple nodes
            RecipeParser_Text::parseInstructionsFromNodes($nodes, $recipe);
        } elseif ($nodes->length == 1) {
            // Blob
            $str = $nodes->item(0)->nodeValue;
            RecipeParser_Text::parseInstructionsFromBlob($str, $recipe);
        }
    }

    // Photo
    $photo_url = "";

    // Try the Open Graph image first; this is searched document-wide rather
    // than inside the microdata node.
    $nodes = $xpath->query('//meta[@property="og:image"]');
    if ($nodes->length) {
        $photo_url = $nodes->item(0)->getAttribute('content');
    }

    if (!$photo_url) {
        $nodes = $xpath->query('.//*[@itemprop="photo"]', $microdata);
        if ($nodes->length) {
            $photo_node = $nodes->item(0);
            if ($photo_node->hasAttribute('src')) {
                $photo_url = $photo_node->getAttribute('src');
            } elseif ($photo_node->hasAttribute('content')) {
                $photo_url = $photo_node->getAttribute('content');
            }
        }
    }

    if (!$photo_url) {
        // for <img> as sub-node of itemprop="photo"
        $nodes = $xpath->query('.//*[@itemprop="photo"]//img', $microdata);
        if ($nodes->length) {
            $photo_url = $nodes->item(0)->getAttribute('src');
        }
    }

    if ($photo_url) {
        $recipe->photo_url = RecipeParser_Text::relativeToAbsolute($photo_url, $url);
    }

    // Credits
    $nodes = $xpath->query('.//*[@itemprop="author"]', $microdata);
    if ($nodes->length) {
        $line = $nodes->item(0)->nodeValue;
        $recipe->credits = RecipeParser_Text::formatCredits($line);
    }

    return $recipe;
}