/**
 * Build a recipe from a page that marks up its content with "field-*" classes.
 *
 * Reads the title (#page-title), hands-on and total times, yield,
 * ingredients, instruction list items and the og:image photo.
 *
 * @param string $html Markup of the recipe page; converted from UTF-8 to HTML entities before loading.
 * @param string $url  Page address, handed to relativeToAbsolute() together with the photo URL.
 * @return RecipeParser_Recipe Recipe holding whichever fields were found.
 */
public static function parse($html, $url) {
    $recipe = new RecipeParser_Recipe();

    // Keep libxml quiet about mismatched tags.
    libxml_use_internal_errors(true);

    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $xpath = new DOMXPath($dom);

    // Title
    $titleNodes = $xpath->query('//*[@id="page-title"]');
    if ($titleNodes->length) {
        $recipe->title = RecipeParser_Text::formatTitle($titleNodes->item(0)->nodeValue);
    }

    // Times: each node carries a label followed by the duration text.
    foreach ($xpath->query('//*[@class="field-recipe-time"]') as $timeNode) {
        $text = RecipeParser_Text::formatAsOneLine($timeNode->nodeValue);
        if (strpos($text, "Hands-On Time") !== false) {
            $text = str_replace("Hands-On Time ", "", $text);
            $recipe->time["prep"] = RecipeParser_Times::toMinutes($text);
        } elseif (strpos($text, "Total Time") !== false) {
            $text = str_replace("Total Time ", "", $text);
            $recipe->time["total"] = RecipeParser_Times::toMinutes($text);
        }
    }

    // Yield
    $yieldNodes = $xpath->query('//*[@class="field-yield"]');
    if ($yieldNodes->length) {
        $recipe->yield = RecipeParser_Text::formatYield($yieldNodes->item(0)->nodeValue);
    }

    // Ingredients
    foreach ($xpath->query('//*[@class="field-ingredients"]') as $ingredientNode) {
        $recipe->appendIngredient(RecipeParser_Text::formatAsOneLine($ingredientNode->nodeValue));
    }

    // Instructions
    foreach ($xpath->query('//*[@class="field-instructions"]//li') as $stepNode) {
        $recipe->appendInstruction(RecipeParser_Text::formatAsOneLine($stepNode->nodeValue));
    }

    // Photo
    $imageNodes = $xpath->query('//*[@property="og:image"]');
    if ($imageNodes->length) {
        $imageUrl = $imageNodes->item(0)->getAttribute('content');
        $recipe->photo_url = RecipeParser_Text::relativeToAbsolute($imageUrl, $url);
    }

    return $recipe;
}
/**
 * Parse a recipe with the generic microformat parser, then override fields
 * from the page's <section role="main"> element and its photo markup.
 *
 * @param string $html Markup of the recipe page.
 * @param string $url  Page address, used when resolving the photo URL.
 * @return RecipeParser_Recipe Result of the microformat parse with overrides applied.
 */
public static function parse($html, $url) {
    $recipe = RecipeParser_Parser_Microformat::parse($html, $url);

    libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $xpath = new DOMXPath($dom);

    $mainSections = $xpath->query('//section[@role="main"]');
    if ($mainSections->length) {
        $main = $mainSections->item(0);

        // The title is not marked up with class="fn"; take the first <h1>.
        $headings = $xpath->query('.//h1', $main);
        if ($headings->length) {
            $recipe->title = RecipeParser_Text::formatTitle($headings->item(0)->nodeValue);
        }

        // Yield: class names are conflated ("info yield"). The itemprop
        // variant is checked second and wins when both are present.
        $yieldQueries = array('.//*[@class="info yield"]', './/span[@itemprop="recipeYield"]');
        foreach ($yieldQueries as $yieldQuery) {
            $found = $xpath->query($yieldQuery, $main);
            if ($found->length) {
                $recipe->yield = RecipeParser_Text::formatYield($found->item(0)->nodeValue);
            }
        }

        // Prep and total times: class names are conflated here too.
        $timeQueries = array(
            'prep' => './/*[@class="info preptime"]',
            'total' => './/*[@class="info duration"]',
        );
        foreach ($timeQueries as $timeKey => $timeQuery) {
            $found = $xpath->query($timeQuery, $main);
            if ($found->length) {
                $recipe->time[$timeKey] = RecipeParser_Times::toMinutes($found->item(0)->nodeValue);
            }
        }
    }

    // Photo
    $images = $xpath->query('//section[@class="content-unit"]/img');
    if ($images->length) {
        $src = $images->item(0)->getAttribute('src');
        if ($src) {
            $recipe->photo_url = RecipeParser_Text::relativeToAbsolute($src, $url);
        }
    }

    // Drop title intros, e.g. "Sunday Dinner: Pork Ribs" becomes "Pork Ribs".
    if (strpos($recipe->title, ": ") !== false) {
        $recipe->title = preg_replace("/^[^:]+: (.+)/", "\$1", $recipe->title);
    }

    return $recipe;
}
/**
 * Create a "bookmark" recipe struct for a page, carrying only its URL,
 * a title and (when available) the og:image photo.
 *
 * Title preference: og:title meta content, then the <title> tag, then the
 * fallback string "Recipe from {$url}".
 *
 * @param string $html Markup of the page.
 * @param string $url  Page address; stored on the struct and passed to formatPhotoUrl().
 * @return RecipeStruct Struct with status "bookmark".
 */
public static function getBookmarkAsRecipeStruct($html, $url) {
    // Keep libxml quiet about mismatched tags.
    libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $xpath = new DOMXPath($dom);

    // The result is stored as a bookmark.
    $recipe = new RecipeStruct();
    $recipe->url = $url;
    $recipe->status = "bookmark";

    // Candidate title from the <title> tag.
    $fromTitleTag = "";
    $titleTags = $xpath->query('//title');
    if ($titleTags->length) {
        $candidate = RecipeParser_Text::formatTitle($titleTags->item(0)->nodeValue);
        if ($candidate) {
            $fromTitleTag = $candidate;
        }
    }

    // Candidate title from the Open Graph meta tag.
    $fromOpenGraph = "";
    $ogTitles = $xpath->query('//meta[@property="og:title"]');
    if ($ogTitles->length) {
        $candidate = RecipeParser_Text::formatTitle($ogTitles->item(0)->getAttribute("content"));
        if ($candidate) {
            $fromOpenGraph = $candidate;
        }
    }

    // Pick the best available title.
    if ($fromOpenGraph) {
        $recipe->title = $fromOpenGraph;
    } elseif ($fromTitleTag) {
        $recipe->title = $fromTitleTag;
    } else {
        $recipe->title = "Recipe from {$url}";
    }

    // Image from the Open Graph tag.
    $ogImages = $xpath->query('//meta[@property="og:image"]');
    if ($ogImages->length) {
        $imageUrl = $ogImages->item(0)->getAttribute("content");
        if ($imageUrl) {
            $recipe->photo_url = RecipeParser_Text::formatPhotoUrl($imageUrl, $url);
        }
    }

    return $recipe;
}
/**
 * Parse a MyRecipes page: run the data-vocabulary microdata parser, then
 * fill in or adjust title, photo size, credits, times and ingredient text.
 *
 * @param string $html Markup of the recipe page.
 * @param string $url  Page address, forwarded to the microdata parser.
 * @return RecipeParser_Recipe Microdata result with the overrides applied.
 */
public static function parse($html, $url) {
    $recipe = RecipeParser_Parser_MicrodataDataVocabulary::parse($html, $url);

    libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $xpath = new DOMXPath($dom);

    // Fall back to the og:title meta tag when microdata gave no title.
    if (!$recipe->title) {
        $ogTitles = $xpath->query('//meta[@property="og:title"]');
        if ($ogTitles->length) {
            $recipe->title = RecipeParser_Text::formatTitle($ogTitles->item(0)->getAttribute("content"));
        }
    }

    // Photo URL: switch to the larger image variant found on MyRecipes.
    $recipe->photo_url = str_replace('-l.jpg', '-x.jpg', $recipe->photo_url);

    // Credits come from a "More from <source>" heading.
    $headings = $xpath->query('//*[@class="link-list"]/h4');
    if ($headings->length) {
        $heading = trim($headings->item(0)->nodeValue);
        if (strpos($heading, "More from") === 0) {
            $recipe->credits = str_replace("More from ", "", $heading);
        }
    }

    // Times: lower-cased text starting with "prep: ", "cook: " or "total: ".
    $prefixes = array('prep' => 'prep: ', 'cook' => 'cook: ', 'total' => 'total: ');
    foreach ($xpath->query('//*[@class="recipe-time-info"]') as $timeNode) {
        $text = trim(strtolower($timeNode->nodeValue));
        foreach ($prefixes as $timeKey => $prefix) {
            if (strpos($text, $prefix) !== 0) {
                continue;
            }
            // The stripped text is what later prefixes are tested against.
            $text = str_replace($prefix, "", $text);
            $recipe->time[$timeKey] = RecipeParser_Times::toMinutes($text);
        }
    }

    // Trim "$Click to see savings" style suffixes from each ingredient.
    // These don't come through in the curl'ed test files.
    $sectionCount = count($recipe->ingredients);
    for ($s = 0; $s < $sectionCount; $s++) {
        $itemCount = count($recipe->ingredients[$s]['list']);
        for ($k = 0; $k < $itemCount; $k++) {
            $dollarAt = strpos($recipe->ingredients[$s]['list'][$k], "\$");
            // Only cut when "$" is present and not the very first character.
            if ($dollarAt !== false && $dollarAt > 0) {
                $recipe->ingredients[$s]['list'][$k] = substr($recipe->ingredients[$s]['list'][$k], 0, $dollarAt);
            }
        }
    }

    return $recipe;
}
/**
 * Build a recipe from page-wide itemprop attributes (name, description,
 * author, prepTime, totalTime, recipeyield, ingredients,
 * recipeinstructions) plus the og:image meta tag.
 *
 * @param string $html Markup of the recipe page.
 * @param string $url  Page address (not used by this parser).
 * @return RecipeParser_Recipe Recipe holding whichever fields were found.
 */
public static function parse($html, $url) {
    $recipe = new RecipeParser_Recipe();

    // Keep libxml quiet about mismatched tags.
    libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $xpath = new DOMXPath($dom);

    // Title
    $found = $xpath->query('//h1[@itemprop="name"]');
    if ($found->length) {
        $recipe->title = RecipeParser_Text::formatTitle($found->item(0)->nodeValue);
    }

    // Description
    $found = $xpath->query('//*[@itemprop="description"]');
    if ($found->length) {
        $recipe->description = RecipeParser_Text::formatAsOneLine($found->item(0)->nodeValue);
    }

    // Author
    $found = $xpath->query('//span[@itemprop="author"]');
    if ($found->length) {
        $recipe->credits = RecipeParser_Text::formatCredits($found->item(0)->nodeValue);
    }

    // Prep and total times are read from the "content" attribute.
    $timeProps = array('prep' => 'prepTime', 'total' => 'totalTime');
    foreach ($timeProps as $timeKey => $prop) {
        $found = $xpath->query('//*[@itemprop="' . $prop . '"]');
        if ($found->length) {
            $duration = $found->item(0)->getAttribute("content");
            $recipe->time[$timeKey] = RecipeParser_Text::iso8601ToMinutes($duration);
        }
    }

    // Yield
    $found = $xpath->query('//*[@itemprop="recipeyield"]');
    if ($found->length) {
        $recipe->yield = RecipeParser_Text::formatYield($found->item(0)->nodeValue);
    }

    // Ingredients
    foreach ($xpath->query('//*[@itemprop="ingredients"]') as $ingredientNode) {
        $recipe->appendIngredient(RecipeParser_Text::formatAsOneLine($ingredientNode->nodeValue));
    }

    // Instructions
    foreach ($xpath->query('//*[@itemprop="recipeinstructions"]/li') as $stepNode) {
        $recipe->appendInstruction(RecipeParser_Text::formatAsOneLine($stepNode->nodeValue));
    }

    // Photo: the og:image content is stored as-is.
    $found = $xpath->query('//meta[@property="og:image"]');
    if ($found->length) {
        $recipe->photo_url = $found->item(0)->getAttribute("content");
    }

    return $recipe;
}
/**
 * formatTitle() should return "Bananas Foster" for a padded title that
 * ends with the word "Recipe".
 */
public function test_title_ends_recipe() {
    $rawTitle = " Bananas Foster Recipe ";
    $formatted = RecipeParser_Text::formatTitle($rawTitle);

    $this->assertEquals("Bananas Foster", $formatted);
}
/**
 * Parse a page that carries schema.org/Recipe microdata.
 *
 * Nothing is extracted unless an element whose itemtype contains
 * "//schema.org/Recipe" (or ".../recipe") exists. Title, description,
 * times, yield, image and credits are looked up beneath that element;
 * ingredients, instructions and the og:image tag are searched
 * document-wide.
 *
 * @param string $html Markup of the recipe page.
 * @param string $url  Page address, passed to formatPhotoUrl() with the photo URL.
 * @return RecipeParser_Recipe Recipe holding whichever fields were found.
 */
public static function parse($html, $url) {
    $recipe = new RecipeParser_Recipe();

    libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $xpath = new DOMXPath($dom);

    // Locate the top-level microdata node; without it there is nothing to parse.
    $roots = $xpath->query('//*[contains(@itemtype, "//schema.org/Recipe") or contains(@itemtype, "//schema.org/recipe")]');
    $microdata = $roots->length ? $roots->item(0) : null;
    if (!$microdata) {
        return $recipe;
    }

    // Title
    $found = $xpath->query('.//*[@itemprop="name"]', $microdata);
    if ($found->length) {
        $recipe->title = RecipeParser_Text::formatTitle(trim($found->item(0)->nodeValue));
    }

    // Summary
    $found = $xpath->query('.//*[@itemprop="description"]', $microdata);
    if ($found->length) {
        $recipe->description = RecipeParser_Text::formatAsParagraphs($found->item(0)->nodeValue);
    }

    // Times: prefer an ISO 8601 "content" attribute, then "datetime",
    // and finally the element's visible text.
    $timeProps = array('prepTime' => 'prep', 'cookTime' => 'cook', 'totalTime' => 'total');
    foreach ($timeProps as $prop => $timeKey) {
        $found = $xpath->query('.//*[@itemprop="' . $prop . '"]', $microdata);
        if (!$found->length) {
            continue;
        }
        $timeNode = $found->item(0);
        $iso = $timeNode->getAttribute('content');
        if (!$iso) {
            $iso = $timeNode->getAttribute('datetime');
        }
        if ($iso) {
            $minutes = RecipeParser_Text::iso8601ToMinutes($iso);
        } else {
            $minutes = RecipeParser_Times::toMinutes(trim($timeNode->nodeValue));
        }
        if ($minutes) {
            $recipe->time[$timeKey] = $minutes;
        }
    }

    // Yield: accept both "recipeYield" and the lower-case spelling.
    $found = $xpath->query('.//*[@itemprop="recipeYield"]', $microdata);
    if (!$found->length) {
        $found = $xpath->query('.//*[@itemprop="recipeyield"]', $microdata);
    }
    if ($found->length) {
        $yieldNode = $found->item(0);
        $yieldText = $yieldNode->hasAttribute('content')
            ? $yieldNode->getAttribute('content')
            : $yieldNode->nodeValue;
        $recipe->yield = RecipeParser_Text::formatYield($yieldText);
    }

    // Ingredients (searched across the whole document).
    foreach ($xpath->query('//*[@itemprop="ingredients"]') as $ingredientNode) {
        $text = RecipeParser_Text::formatAsOneLine($ingredientNode->nodeValue);
        if (empty($text)) {
            continue;
        }
        if (strlen($text) > 150) {
            // Probably a mistake, like a run-on of existing ingredients.
            continue;
        }
        if (RecipeParser_Text::matchSectionName($text)) {
            $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($text));
        } else {
            $recipe->appendIngredient($text);
        }
    }

    // Instructions: try progressively looser node selections.
    // 1. <li> tags for each instruction.
    $steps = $xpath->query('//*[@itemprop="recipeInstructions"]//li');
    if (!$steps->length) {
        // 2. Direct children of "recipeInstructions".
        $steps = $xpath->query('//*[@itemprop="recipeInstructions"]/*');
    }
    if (!$steps->length) {
        // 3. Elements carrying an "instruction" class.
        $steps = $xpath->query('.//*[@itemprop="recipeInstructions"]//*[contains(concat(" ", normalize-space(@class), " "), " instruction ")]');
    }
    if ($steps->length) {
        RecipeParser_Text::parseInstructionsFromNodes($steps, $recipe);
    } else {
        // 4. Either several recipeInstructions nodes, or one node holding a blob of text.
        $containers = $xpath->query('.//*[@itemprop="recipeInstructions"]');
        if ($containers->length > 1) {
            RecipeParser_Text::parseInstructionsFromNodes($containers, $recipe);
        } elseif ($containers->length == 1) {
            RecipeParser_Text::parseInstructionsFromBlob($containers->item(0)->nodeValue, $recipe);
        }
    }

    // Photo: og:image first, then itemprop="image" itself, then an <img> beneath it.
    $photo = "";
    $found = $xpath->query('//meta[@property="og:image"]');
    if ($found->length) {
        $photo = $found->item(0)->getAttribute('content');
    }
    if (!$photo) {
        $found = $xpath->query('.//*[@itemprop="image"]', $microdata);
        if ($found->length) {
            $photo = $found->item(0)->getAttribute('src');
        }
    }
    if (!$photo) {
        $found = $xpath->query('.//*[@itemprop="image"]//img', $microdata);
        if ($found->length) {
            $photo = $found->item(0)->getAttribute('src');
        }
    }
    if ($photo) {
        $recipe->photo_url = RecipeParser_Text::formatPhotoUrl($photo, $url);
    }

    // Credits: a publisher, when present, replaces the author.
    $creditText = "";
    foreach (array('author', 'publisher') as $creditProp) {
        $found = $xpath->query('.//*[@itemprop="' . $creditProp . '"]', $microdata);
        if ($found->length) {
            $creditText = $found->item(0)->nodeValue;
        }
    }
    $recipe->credits = RecipeParser_Text::formatCredits($creditText);

    return $recipe;
}
/**
 * Parse an Allrecipes page.
 *
 * Starts from the schema.org microdata parse, then fills gaps for the
 * custom-recipe template that does not use schema.org/Recipe: title,
 * yield, times, ingredients and instructions. Finally drops a trailing
 * "All done!" instruction and upgrades the photo URL to a larger size.
 *
 * @param string $html Markup of the recipe page.
 * @param string $url  Page address, forwarded to the microdata parser.
 * @return RecipeParser_Recipe Microdata result with the overrides applied.
 */
public static function parse($html, $url) {
    // Collect all of the standard microdata first.
    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    // Keep libxml quiet about mismatched tags.
    libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $xpath = new DOMXPath($dom);

    // Title
    if (!$recipe->title) {
        $headings = $xpath->query('//h1[@itemprop="name"]');
        if ($headings->length) {
            $recipe->title = RecipeParser_Text::formatTitle($headings->item(0)->nodeValue);
        }
    }

    // Yield (stored exactly as found)
    if (!$recipe->yield) {
        $yields = $xpath->query('//div[@class = "servings-form"]//span[@class = "yield yieldform"]');
        if ($yields->length) {
            $recipe->yield = $yields->item(0)->nodeValue;
        }
    }

    // Times are located by element id.
    $timeIds = array('liPrep' => 'prep', 'liCook' => 'cook', 'liTotal' => 'total');
    foreach ($timeIds as $elementId => $timeKey) {
        $found = $xpath->query('.//*[@id="' . $elementId . '"]');
        if (!$found->length) {
            continue;
        }
        $text = RecipeParser_Text::formatAsOneLine($found->item(0)->nodeValue);
        $text = trim(preg_replace("/(COOK|PREP|READY IN)/", "", $text));
        $minutes = RecipeParser_Times::toMinutes($text);
        if ($minutes) {
            $recipe->time[$timeKey] = $minutes;
        }
    }

    // Ingredients, only when microdata produced none.
    if (count($recipe->ingredients[0]["list"]) == 0) {
        $items = $xpath->query('//li[contains(concat(" ", normalize-space(@class), " "), " ingredient ")]');
        foreach ($items as $item) {
            $text = trim(strip_tags($item->nodeValue));
            // A line ending in ":" names a section.
            if (preg_match("/^(.+):\$/", $text, $match)) {
                $recipe->addIngredientsSection(ucfirst(strtolower($match[1])));
            } elseif ($text) {
                $recipe->appendIngredient($text);
            }
        }
    }

    // Instructions, only when microdata produced none.
    if (count($recipe->instructions[0]["list"]) == 0) {
        foreach ($xpath->query('//div[@class="directions"]//ol/li') as $item) {
            $text = RecipeParser_Text::formatAsOneLine($item->nodeValue);
            if (preg_match("/^(.+):\$/", $text, $match)) {
                $recipe->addInstructionsSection(ucfirst(strtolower($match[1])));
            } elseif ($text) {
                $recipe->appendInstruction($text);
            }
        }
    }

    // Remove the useless "All done!" line at the end of the instructions.
    $lastSection = count($recipe->instructions) - 1;
    $lastStep = count($recipe->instructions[$lastSection]['list']) - 1;
    if ($lastStep >= 0) {
        if (strpos($recipe->instructions[$lastSection]['list'][$lastStep], "All done!") === 0) {
            unset($recipe->instructions[$lastSection]['list'][$lastStep]);
        }
    }

    // Photo URL: request the larger image variants.
    if ($recipe->photo_url) {
        $larger = str_replace('/userphoto/small/', '/userphoto/big/', $recipe->photo_url);
        $larger = str_replace('/userphotos/140x140/', '/userphotos/250x250/', $larger);
        $recipe->photo_url = $larger;
    }

    return $recipe;
}
/**
 * Parse a page that carries data-vocabulary.org/Recipe microdata.
 *
 * Nothing is extracted unless an element with
 * itemtype="http://data-vocabulary.org/Recipe" exists; apart from the
 * og:image lookup, every field is searched beneath that element.
 *
 * @param string $html Markup of the recipe page.
 * @param string $url  Page address, passed to relativeToAbsolute() with the photo URL.
 * @return RecipeParser_Recipe Recipe holding whichever fields were found.
 */
public static function parse($html, $url) {
    $recipe = new RecipeParser_Recipe();

    libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $xpath = new DOMXPath($dom);

    // Find the top-level node for Recipe microdata.
    $roots = $xpath->query('//*[@itemtype="http://data-vocabulary.org/Recipe"]');
    $microdata = $roots->length ? $roots->item(0) : null;
    if (!$microdata) {
        return $recipe;
    }

    // Title
    $found = $xpath->query('.//*[@itemprop="name"]', $microdata);
    if ($found->length) {
        $recipe->title = RecipeParser_Text::formatTitle($found->item(0)->nodeValue);
    }

    // Summary
    $found = $xpath->query('.//*[@itemprop="summary"]', $microdata);
    if ($found->length) {
        $recipe->description = trim($found->item(0)->nodeValue);
    }

    // Times: prefer an ISO 8601 "datetime" attribute, then "content",
    // and finally the element's visible text.
    $timeProps = array('prepTime' => 'prep', 'cookTime' => 'cook', 'totalTime' => 'total');
    foreach ($timeProps as $prop => $timeKey) {
        $found = $xpath->query('.//*[@itemprop="' . $prop . '"]', $microdata);
        if (!$found->length) {
            continue;
        }
        $timeNode = $found->item(0);
        $iso = $timeNode->getAttribute('datetime');
        if (!$iso) {
            $iso = $timeNode->getAttribute('content');
        }
        if ($iso) {
            $minutes = RecipeParser_Text::iso8601ToMinutes($iso);
        } else {
            $minutes = RecipeParser_Times::toMinutes(trim($timeNode->nodeValue));
        }
        if ($minutes) {
            $recipe->time[$timeKey] = $minutes;
        }
    }

    // Yield: "yield" first, "servingSize" only when no yield element exists.
    $yieldText = "";
    $found = $xpath->query('.//*[@itemprop="yield"]', $microdata);
    if (!$found->length) {
        $found = $xpath->query('.//*[@itemprop="servingSize"]', $microdata);
    }
    if ($found->length) {
        $yieldText = trim($found->item(0)->nodeValue);
    }
    if ($yieldText) {
        $yieldText = preg_replace('/\\s+/', ' ', $yieldText);
        $recipe->yield = RecipeParser_Text::formatYield($yieldText);
    }

    // Ingredients: the data-vocabulary property first, then two non-standard fallbacks.
    $items = $xpath->query('.//*[@itemprop="ingredient"]', $microdata);
    if (!$items || !$items->length) {
        $items = $xpath->query('.//*[@id="ingredients"]//li', $microdata);
    }
    if (!$items || !$items->length) {
        $items = $xpath->query('.//*[@class="ingredients"]//li', $microdata);
    }
    foreach ($items as $item) {
        $text = RecipeParser_Text::formatAsOneLine($item->nodeValue);
        if (empty($text)) {
            continue;
        }
        if (RecipeParser_Text::matchSectionName($text)) {
            $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($text));
        } else {
            $recipe->appendIngredient($text);
        }
    }

    // Instructions: try progressively looser node selections.
    // 1. <li> tags for each instruction.
    $steps = $xpath->query('.//*[@itemprop="instructions"]//li', $microdata);
    if (!$steps->length) {
        // 2. Elements carrying an "instruction" class.
        $steps = $xpath->query('.//*[@itemprop="instruction"]//*[contains(concat(" ", normalize-space(@class), " "), " instruction ")]', $microdata);
    }
    if ($steps->length) {
        RecipeParser_Text::parseInstructionsFromNodes($steps, $recipe);
    } else {
        // 3. Either several instructions nodes, or one node holding a blob of text.
        $containers = $xpath->query('.//*[@itemprop="instructions"]', $microdata);
        if ($containers->length > 1) {
            RecipeParser_Text::parseInstructionsFromNodes($containers, $recipe);
        } elseif ($containers->length == 1) {
            RecipeParser_Text::parseInstructionsFromBlob($containers->item(0)->nodeValue, $recipe);
        }
    }

    // Photo: og:image first, then itemprop="photo" (src, else content),
    // then an <img> beneath it.
    $photo = "";
    $found = $xpath->query('//meta[@property="og:image"]');
    if ($found->length) {
        $photo = $found->item(0)->getAttribute('content');
    }
    if (!$photo) {
        $found = $xpath->query('.//*[@itemprop="photo"]', $microdata);
        if ($found->length) {
            $photoNode = $found->item(0);
            if ($photoNode->hasAttribute('src')) {
                $photo = $photoNode->getAttribute('src');
            } elseif ($photoNode->hasAttribute('content')) {
                $photo = $photoNode->getAttribute('content');
            }
        }
    }
    if (!$photo) {
        $found = $xpath->query('.//*[@itemprop="photo"]//img', $microdata);
        if ($found->length) {
            $photo = $found->item(0)->getAttribute('src');
        }
    }
    if ($photo) {
        $recipe->photo_url = RecipeParser_Text::relativeToAbsolute($photo, $url);
    }

    // Credits
    $found = $xpath->query('.//*[@itemprop="author"]', $microdata);
    if ($found->length) {
        $recipe->credits = RecipeParser_Text::formatCredits($found->item(0)->nodeValue);
    }

    return $recipe;
}
/**
 * Parse a recipe with the generic microformat parser, then fill in any
 * field it left empty (title, yield, ingredients, instructions, photo)
 * from site-specific markup.
 *
 * @param string $html Markup of the recipe page.
 * @param string $url  Page address, forwarded to the microformat parser.
 * @return RecipeParser_Recipe Microformat result with the gaps filled where possible.
 */
public static function parse($html, $url) {
    $recipe = RecipeParser_Parser_Microformat::parse($html, $url);

    libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $xpath = new DOMXPath($dom);

    // Title
    if (!$recipe->title) {
        $names = $xpath->query('//div[@itemprop="name"]');
        if ($names->length) {
            $recipe->title = RecipeParser_Text::formatTitle($names->item(0)->nodeValue);
        }
    }

    // Yield: the first boxed <div> whose text starts with "makes" (any case).
    if (!$recipe->yield) {
        foreach ($xpath->query('//div[@class="box"]/div') as $box) {
            $text = trim($box->nodeValue);
            if (stripos($text, "makes") === 0) {
                $recipe->yield = RecipeParser_Text::formatYield($text);
                break;
            }
        }
    }

    // Ingredients: the <li> children of the first ingredients list. Their
    // text is joined with "<br>" and split again on the same marker.
    if (count($recipe->ingredients[0]["list"]) == 0) {
        $lists = $xpath->query('//ul[@class="ingredients"]');
        if ($lists->length) {
            $joined = "";
            foreach ($lists->item(0)->childNodes as $child) {
                if ($child->nodeName == "li") {
                    $joined .= $child->nodeValue . "<br>";
                }
            }
            foreach (explode("<br>", $joined) as $piece) {
                $recipe->appendIngredient(RecipeParser_Text::formatAsOneLine($piece));
            }
        }
    }

    // Instructions
    if (count($recipe->instructions[0]["list"]) == 0) {
        foreach ($xpath->query('//div[@class="instructions"]/ol/li') as $step) {
            $recipe->appendInstruction(RecipeParser_Text::formatAsOneLine($step->nodeValue));
        }
    }

    // Photo: the first og:image whose URL contains "wp-content".
    if (!$recipe->photo_url) {
        foreach ($xpath->query('//meta[@property="og:image"]') as $meta) {
            $candidate = $meta->getAttribute("content");
            if (strpos($candidate, "wp-content") !== false) {
                $recipe->photo_url = $candidate;
                break;
            }
        }
    }

    return $recipe;
}
/**
 * Build a recipe from a page that uses "rTitle fn", "yield", "prepTime",
 * "rspec-cook-time", "totaltime", "ingredient" and "instructions" classes.
 *
 * Each time value is read from the "title" attribute of the SECOND <span>
 * beneath the matching element; a time is skipped when fewer than two
 * spans exist.
 *
 * @param string $html Markup of the recipe page.
 * @param string $url  Page address, passed to relativeToAbsolute() with the photo link.
 * @return RecipeParser_Recipe Recipe holding whichever fields were found.
 */
public static function parse($html, $url) {
    $recipe = new RecipeParser_Recipe();

    // Turn off libxml errors to prevent mismatched tag warnings.
    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Title
    $nodes = $xpath->query('//*[@class="rTitle fn"]');
    if ($nodes->length) {
        $recipe->title = RecipeParser_Text::formatTitle($nodes->item(0)->nodeValue);
    }

    // Yield
    $nodes = $xpath->query('//*[contains(concat(" ", normalize-space(@class), " "), " yield ")]');
    if ($nodes->length) {
        $recipe->yield = RecipeParser_Text::formatYield($nodes->item(0)->nodeValue);
    }

    // Times. The value lives on the second span, so at least two spans are
    // required: item(1) returns null otherwise, and calling getAttribute()
    // on it would be a fatal error.
    $time_classes = array('prepTime' => 'prep', 'rspec-cook-time' => 'cook', 'totaltime' => 'total');
    foreach ($time_classes as $class_name => $time_key) {
        $nodes = $xpath->query('//*[contains(concat(" ", normalize-space(@class), " "), " ' . $class_name . ' ")]/span');
        if ($nodes->length > 1) {
            $line = $nodes->item(1)->getAttribute("title");
            $recipe->time[$time_key] = RecipeParser_Text::iso8601ToMinutes($line);
        }
    }

    // Ingredients
    $nodes = $xpath->query('//*[@class="ingredient"]');
    foreach ($nodes as $node) {
        $line = RecipeParser_Text::formatAsOneLine($node->nodeValue);
        $recipe->appendIngredient($line);
    }

    // Instructions: flatten the children of the first "instructions" element
    // into one blob, marking the end of every <p> with a blank line.
    $nodes = $xpath->query('//*[@class="instructions"]');
    if ($nodes->length) {
        $blob = "";
        foreach ($nodes->item(0)->childNodes as $node) {
            $blob .= RecipeParser_Text::formatAsOneLine($node->nodeValue) . " ";
            if ($node->nodeName == "p") {
                $blob .= "\n\n";
            }
        }

        // Minor cleanup: stray spaces before punctuation, then doubled spaces.
        $blob = str_replace(" , ", ", ", $blob);
        $blob = str_replace(" . ", ". ", $blob);
        $blob = str_replace("  ", " ", $blob);

        foreach (explode("\n\n", $blob) as $line) {
            $line = RecipeParser_Text::formatAsOneLine($line);
            $recipe->appendInstruction($line);
        }
    }

    // Photo
    $nodes = $xpath->query('//a[@class="img-enlarge"]');
    if ($nodes->length) {
        $photo_url = $nodes->item(0)->getAttribute("href");
        $recipe->photo_url = RecipeParser_Text::relativeToAbsolute($photo_url, $url);
    }

    return $recipe;
}
/**
 * Parse an About.com recipe page.
 *
 * Runs the schema.org microdata parser, then overrides the title and
 * credits and rebuilds the ingredient and instruction lists. Instruction
 * text is scrubbed of related-recipe link residue, and a line starting
 * with "Makes " is moved into the yield.
 *
 * @param string $html Markup of the recipe page.
 * @param string $url  Page address, forwarded to the microdata parser.
 * @return RecipeParser_Recipe Microdata result with the About.com overrides applied.
 */
public static function parse($html, $url) {
    // Turn off libxml errors to prevent mismatched tag warnings.
    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    // OVERRIDES FOR ABOUT.COM

    // Title
    $nodes = $xpath->query('//*[@itemprop="headline name"]');
    if ($nodes->length) {
        $value = trim($nodes->item(0)->nodeValue);
        $recipe->title = RecipeParser_Text::formatTitle($value);
    }

    // Credits
    $nodes = $xpath->query('//*[@itemprop="author"]//*[@itemprop="name"]');
    if ($nodes->length) {
        $line = $nodes->item(0)->nodeValue;
        $recipe->credits = RecipeParser_Text::formatCredits($line . ", About.com");
    }

    // Ingredients
    $recipe->resetIngredients();
    $nodes = $xpath->query('//*[@itemprop="ingredients"]');
    foreach ($nodes as $node) {
        $value = RecipeParser_Text::formatAsOneLine($node->nodeValue);

        // An ingredient whose first child is <strong> or <b> is treated as
        // a section heading. firstChild is null for an element without
        // children, so guard before reading nodeName.
        $first_child = $node->firstChild;
        $starts_bold = $first_child !== null
            && ($first_child->nodeName == "strong" || $first_child->nodeName == "b");

        if (RecipeParser_Text::matchSectionName($value) || $starts_bold) {
            $value = RecipeParser_Text::formatSectionName($value);
            $recipe->addIngredientsSection($value);
        } else {
            $recipe->appendIngredient($value);
        }
    }

    // Instructions
    $recipe->resetInstructions();

    // Applied in order to every line; each keeps only the text before the match:
    //  - "recipes" squashed up against another word (long lists of related
    //    recipe links),
    //  - "More ... Recipes", "Xxxxx Recipes and More", "Xxxxx Recipes | Xxxxx",
    //  - mentions of newsletters.
    $junk_patterns = array(
        "/(.*)recipes\\w/i",
        "/(.*)More .* Recipes.*/",
        "/(.*)Recipes and More.*/",
        "/(.*)Recipes \\| .*/",
        "/(.*)Recipe Newsletter.*/",
    );

    $nodes = $xpath->query('//div[@itemprop="recipeInstructions"]');
    foreach ($nodes as $node) {
        $text = trim($node->nodeValue);
        $lines = preg_split("/[\n\r]+/", $text);

        // Walk backwards so that, when several lines look like a yield,
        // the earliest one is the value that sticks.
        for ($i = count($lines) - 1; $i >= 0; $i--) {
            $lines[$i] = preg_replace($junk_patterns, "\$1", trim($lines[$i]));

            // Look for a line in the instructions that looks like a yield.
            if (strpos($lines[$i], "Makes ") === 0) {
                $recipe->yield = substr($lines[$i], 6);
                $lines[$i] = '';
            }
        }

        foreach ($lines as $line) {
            $line = trim($line);
            if (empty($line)) {
                continue;
            }
            if (strtolower($line) == "preparation") {
                continue;
            }

            // Match section names that read something like
            // "---For the cake: Raise the oven temperature..."
            if (preg_match("/^(?:-{2,})?For the (.+)\\: (.*)\$/i", $line, $m)) {
                $section = RecipeParser_Text::formatSectionName($m[1]);
                $recipe->addInstructionsSection($section);

                // Continue with the line minus the section name.
                $line = ucfirst($m[2]);
            }

            $recipe->appendInstruction($line);
        }
    }

    return $recipe;
}
/**
 * Parse a New York Times recipe page.
 *
 * Two templates are handled, chosen by URL: pages under
 * "www.nytimes.com/recipes/" (the "Recipes" section) and everything else
 * (Dining section articles).
 *
 * @param string $html Markup of the recipe page.
 * @param string $url  Page address; selects the template and is passed to formatPhotoUrl().
 * @return RecipeParser_Recipe Recipe holding whichever fields were found.
 */
public static function parse($html, $url) {
    // The document setup is identical for both templates.
    $recipe = new RecipeParser_Recipe();

    libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    if (strpos($url, "www.nytimes.com/recipes/") !== false) {
        //
        // "RECIPES" SECTION
        //

        // Title
        $nodes = $xpath->query('//h1[@class="recipe-title recipeName"]');
        if ($nodes->length) {
            $recipe->title = RecipeParser_Text::formatTitle($nodes->item(0)->nodeValue);
        }

        // Yield
        $nodes = $xpath->query('//*[@itemprop="recipeYield"]');
        if ($nodes->length) {
            $recipe->yield = RecipeParser_Text::formatYield($nodes->item(0)->nodeValue);
        }

        // Ingredients: <h3> children are section headings (except a plain
        // "Ingredients" heading); any other child contributes one
        // ingredient per child node.
        $nodes = $xpath->query('//div[@class="ingredientsGroup"]/*');
        foreach ($nodes as $node) {
            if ($node->nodeName == "h3") {
                $value = trim($node->nodeValue);
                if (!preg_match('/^Ingredients:?$/i', $value)) {
                    $value = RecipeParser_Text::formatSectionName($value);
                    $recipe->addIngredientsSection($value);
                }
            } else {
                foreach ($node->childNodes as $child) {
                    $recipe->appendIngredient(trim($child->nodeValue));
                }
            }
        }

        // Instructions
        $nodes = $xpath->query('//*[@itemprop="recipeInstructions"]/dd');
        foreach ($nodes as $node) {
            $recipe->appendInstruction(RecipeParser_Text::formatAsOneLine($node->nodeValue));
        }

        // Notes
        if (!$recipe->notes) {
            $nodes = $xpath->query('//div[@class="yieldNotesGroup"]//*[@class="note"]');
            if ($nodes->length) {
                $value = trim($nodes->item(0)->nodeValue);
                $value = preg_replace("/^Notes?:?\\s*/i", '', $value);
                $recipe->notes = trim($value);
            }
        }

    } else {
        //
        // DINING SECTION RECIPES
        //

        // Title
        $nodes = $xpath->query('//div[@id = "article"]//h1');
        if ($nodes->length) {
            $recipe->title = trim($nodes->item(0)->nodeValue);
        }

        // Time and Yield
        $nodes = $xpath->query('//div[@id = "article"]//p');
        foreach ($nodes as $node) {
            $text = trim($node->nodeValue);
            if (preg_match('/^Yield:? (.+)/', $text, $m)) {
                $recipe->yield = RecipeParser_Text::formatYield($m[1]);
            } elseif (preg_match('/^Time:? (.+)/', $text, $m)) {
                // Reduce "About 1 hour plus chilling" to "1 hour".
                $str = trim($m[1]);
                $str = preg_replace('/About (.+)/', '$1', $str);
                $str = preg_replace('/(.+) plus.*/', '$1', $str);
                $recipe->time['total'] = RecipeParser_Times::toMinutes($str);
            }
        }

        // Ingredients
        $nodes = $xpath->query('//div[@class="recipeIngredientsList"]/p');
        foreach ($nodes as $node) {
            $line = trim($node->nodeValue);

            // A non-empty line that equals its own upper-casing is a section name.
            if ($line && $line == strtoupper($line)) {
                $line = RecipeParser_Text::formatSectionName($line);
                $recipe->addIngredientsSection($line);
                continue;
            }

            $recipe->appendIngredient($line);
        }

        // Instructions and notes
        $nodes = $xpath->query('//div[@class="articleBody"]//p');
        if (!$nodes->length) {
            $nodes = $xpath->query('//div[@id="articleBody"]//p');
        }

        $notes = '';
        $in_notes_section = false;
        foreach ($nodes as $node) {
            $line = trim($node->nodeValue);

            // Skip some of the useless lines
            if (preg_match('/^(Adapted from|Time|Yield)/i', $line)) {
                continue;
            }

            // Instructions start with line numbers
            if (!$in_notes_section && preg_match('/^\\d+\\./', $line)) {
                $line = RecipeParser_Text::stripLeadingNumbers($line);
                $recipe->appendInstruction($line);
                continue;
            }

            // A "Note:" line opens the notes section; every line after it
            // is collected as a note.
            $note = '';
            if (preg_match('/^Notes?:?(.*)/i', $line, $m)) {
                $in_notes_section = true;
                $note = trim($m[1]);
            } elseif ($in_notes_section) {
                $note = $line;
            }
            if ($note) {
                $notes .= $note . "\n\n";
            }
        }
        if ($notes) {
            // Collapse doubled spaces (some unnecessary spaces in the source text).
            $notes = str_replace("  ", " ", $notes);
            $recipe->notes = trim($notes);
        }

        // Photo: swap the inline article image for the larger popup variant.
        $nodes = $xpath->query('//div[@class="image"]//img');
        if ($nodes->length) {
            $photo_url = $nodes->item(0)->getAttribute('src');
            $photo_url = str_replace('-articleInline.jpg', '-popup.jpg', $photo_url);
            $recipe->photo_url = RecipeParser_Text::formatPhotoUrl($photo_url, $url);
        }
    }

    return $recipe;
}
/**
 * Build a recipe from hRecipe microformat markup found in an HTML page.
 *
 * The first element carrying the "hrecipe" class token (or, failing that,
 * "hRecipe") is used as the container for title, summary, author, photo,
 * yield and times. Ingredients and instructions are searched for across the
 * whole document. If no container is found, the freshly constructed recipe
 * is returned untouched.
 *
 * @param string $html Markup of the page to parse.
 * @param string $url  Page URL; handed to RecipeParser_Text::relativeToAbsolute() for the photo.
 *
 * @return RecipeParser_Recipe
 */
public static function parse($html, $url)
{
    $recipe = new RecipeParser_Recipe();

    // Have libxml collect markup errors internally while the page is loaded.
    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($dom);

    // Locate the microformat container, preferring the lowercase class name.
    $root = null;
    $rootQueries = array(
        './/*[contains(concat(" ", normalize-space(@class), " "), " hrecipe ")]',
        './/*[contains(concat(" ", normalize-space(@class), " "), " hRecipe ")]',
    );
    foreach ($rootQueries as $rootQuery) {
        $candidates = $xpath->query($rootQuery);
        if ($candidates->length) {
            $root = $candidates->item(0);
            break;
        }
    }
    if (!$root) {
        return $recipe;
    }

    // Title
    $hits = $xpath->query('.//*[contains(concat(" ", normalize-space(@class), " "), " fn ")]', $root);
    if ($hits->length) {
        $recipe->title = RecipeParser_Text::formatTitle($hits->item(0)->nodeValue);
    }

    // Summary
    $hits = $xpath->query('.//*[@class="summary"]', $root);
    if ($hits->length) {
        $recipe->description = RecipeParser_Text::formatAsParagraphs($hits->item(0)->nodeValue);
    }

    // Credits
    $hits = $xpath->query('.//*[@class="author"]', $root);
    if ($hits->length) {
        $recipe->credits = RecipeParser_Text::formatCredits($hits->item(0)->nodeValue);
    }

    // Photo: try the "photo" element's own src first, then an <img> nested inside it.
    $photoPath = './/*[contains(concat(" ", normalize-space(@class), " "), " photo ")]';
    $photoHolders = $xpath->query($photoPath, $root);
    $photoUrl = $photoHolders->length ? $photoHolders->item(0)->getAttribute('src') : "";
    if (!$photoUrl) {
        $nestedImages = $xpath->query($photoPath . '//img', $root);
        if ($nestedImages->length) {
            $photoUrl = $nestedImages->item(0)->getAttribute('src');
        }
    }
    if ($photoUrl) {
        $recipe->photo_url = RecipeParser_Text::relativeToAbsolute($photoUrl, $url);
    }

    // Yield
    $hits = $xpath->query('.//*[@class="yield"]', $root);
    if ($hits->length) {
        $recipe->yield = RecipeParser_Text::formatYield($hits->item(0)->nodeValue);
    }

    // Prep, cook and total times. Each key lists its lookups in priority
    // order; the flag says whether the value is read from a "title"
    // attribute (and converted with iso8601ToMinutes) or from the node text
    // (and converted with toMinutes). The first lookup that matches wins.
    $timeLookups = array(
        'prep' => array(
            array('.//*[@class="prepTime"]//*[@class="value-title"]', true),
            array('.//*[@class="preptime"]', false),
        ),
        'cook' => array(
            array('.//*[@class="cookTime"]//*[@class="value-title"]', true),
            array('.//*[@class="cooktime"]', false),
        ),
        'total' => array(
            array('.//*[@class="totalTime"]//*[@class="value-title"]', true),
            array('.//*[@class="duration"]//*[@class="value-title"]', true),
            array('.//*[@class="duration"]', false),
        ),
    );
    foreach ($timeLookups as $timeKey => $lookups) {
        foreach ($lookups as $lookup) {
            list($timeQuery, $fromTitleAttribute) = $lookup;
            $hits = $xpath->query($timeQuery, $root);
            if (!$hits->length) {
                continue;
            }
            $first = $hits->item(0);
            if ($fromTitleAttribute) {
                $recipe->time[$timeKey] = RecipeParser_Text::iso8601ToMinutes($first->getAttribute('title'));
            } else {
                $recipe->time[$timeKey] = RecipeParser_Times::toMinutes($first->nodeValue);
            }
            break;
        }
    }

    // Ingredients (searched document-wide, not just inside the container).
    $sectionPatterns = array(
        // Section name delineated with dashes. E.g. "---Cake---"
        '/^\\-+([^\\-]{1}.*[^\\-]{1})\\-+$/',
        // Section name with trailing colon.
        '/^(.+)\\:$/',
    );
    $ingredientNodes = $xpath->query('//*[contains(concat(" ", normalize-space(@class), " "), " ingredient ")]');
    foreach ($ingredientNodes as $ingredientNode) {
        $text = RecipeParser_Text::formatAsOneLine(trim($ingredientNode->nodeValue));

        // Lines without any word-like character are sometimes used as dividers; drop them.
        if (!preg_match("/\\w/", $text)) {
            continue;
        }

        foreach ($sectionPatterns as $sectionPattern) {
            if (preg_match($sectionPattern, $text, $captured)) {
                $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($captured[1]));
                continue 2;
            }
        }

        $recipe->appendIngredient($text);
    }

    // Instructions (also document-wide). Prefer <li> steps, then elements
    // with the "instruction" class token, inside an "instructions" element.
    $instructionsPath = '//*[contains(concat(" ", normalize-space(@class), " "), " instructions ")]';
    $stepNodes = $xpath->query($instructionsPath . '//li');
    if (!$stepNodes->length) {
        $stepNodes = $xpath->query(
            $instructionsPath . '//*[contains(concat(" ", normalize-space(@class), " "), " instruction ")]'
        );
    }

    if ($stepNodes->length) {
        RecipeParser_Text::parseInstructionsFromNodes($stepNodes, $recipe);
        return $recipe;
    }

    // Default: several "instructions" elements, or a single one holding a blob of text.
    $containers = $xpath->query($instructionsPath);
    if ($containers->length > 1) {
        RecipeParser_Text::parseInstructionsFromNodes($containers, $recipe);
    } elseif ($containers->length == 1) {
        RecipeParser_Text::parseInstructionsFromBlob($containers->item(0)->nodeValue, $recipe);
    }

    return $recipe;
}
/**
 * Parse a recipe from a page whose recipe content sits under the element with id="recipe".
 *
 * The title comes from an <h3> directly below that element. Every other
 * direct child is either read as the yield (when its text contains
 * "Serves") or has its child nodes flattened into a text blob that is passed
 * to RecipeParser_Text::parseIngredientsAndInstructionsFromBlob(). The photo
 * URL is read from the og:image <meta> tag.
 *
 * @param string $html Markup of the page to parse.
 * @param string $url  Page URL (not used by this parser).
 *
 * @return RecipeParser_Recipe
 */
public static function parse($html, $url)
{
    $recipe = new RecipeParser_Recipe();

    // Turn off libxml errors to prevent mismatched tag warnings.
    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Title
    $nodes = $xpath->query('//*[@id="recipe"]/h3');
    if ($nodes->length) {
        $recipe->title = RecipeParser_Text::formatTitle($nodes->item(0)->nodeValue);
    }

    // Instructions and Ingredients
    $nodes = $xpath->query('//*[@id="recipe"]/*');
    $blob = "";
    foreach ($nodes as $node) {
        // Skip title
        if ($node->nodeName == "h3") {
            continue;
        }

        // Get servings. Compare against false explicitly: strpos() returns 0
        // when the text starts with "Serves", and 0 must count as a match.
        $line = $node->nodeValue;
        if (strpos($line, "Serves") !== false
            && preg_match("/.*(Serves.+)\$/m", $line, $m)
        ) {
            $recipe->yield = RecipeParser_Text::formatYield($m[1]);
            continue;
        }

        // Add child nodes to blob
        foreach ($node->childNodes as $child) {
            $line = trim($child->nodeValue);
            switch ($child->nodeName) {
                case "strong":
                    // Bold text stays on the same line as whatever follows it.
                    $blob .= $line . " ";
                    break;

                case "em":
                    // Emphasised text gets a trailing colon if it has none.
                    if (strpos($line, ":") === false) {
                        $line .= ":";
                    }
                    $blob .= $line . "\n\n";
                    break;

                case "#text":
                case "div":
                case "span":
                case "p":
                    // Skip stand-alone bullet characters. "continue 2" targets the
                    // enclosing foreach; a bare "continue" inside switch behaves
                    // like "break" and raises a warning on PHP 7.3+.
                    if ($line == "•") {
                        continue 2;
                    }
                    $blob .= $line . "\n\n";
                    break;
            }
        }
    }
    RecipeParser_Text::parseIngredientsAndInstructionsFromBlob($blob, $recipe);

    // Photo
    $nodes = $xpath->query('//meta[@property="og:image"]');
    if ($nodes->length) {
        $recipe->photo_url = $nodes->item(0)->getAttribute("content");
    }

    return $recipe;
}