/**
 * Parse a recipe page: start from the generic microdata result, then rebuild
 * the ingredient list and override the notes and photo from page-specific markup.
 *
 * @param string $html Page markup; converted from UTF-8 to HTML entities before loading.
 * @param string $url  Page URL, handed to the microdata parser and the photo URL formatter.
 * @return mixed Whatever RecipeParser_Parser_MicrodataSchema::parse() returned, after the overrides.
 */
public static function parse($html, $url)
{
    // Baseline recipe from the generic microdata parser.
    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    // Keep libxml from emitting warnings for malformed markup.
    libxml_use_internal_errors(true);
    $encoded = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . $encoded);
    $finder = new DOMXPath($dom);

    // Ingredients: discard what the microdata pass found and rebuild from the
    // direct children of the ingredients container.
    $recipe->resetIngredients();
    foreach ($finder->query('//div[@id = "recipe-ingredients"]/*') as $element) {
        if ($element->nodeName == 'ul') {
            // Ingredients of the current section.
            foreach ($finder->query('./li[@class = "ingredient"]', $element) as $item) {
                $recipe->appendIngredient(trim($item->nodeValue));
            }
            continue;
        }

        if ($element->nodeName != 'p') {
            continue;
        }

        // A <p> is a section header. Older recipes jumble the whole section
        // into one <p>: the first line is the header and every following line
        // is an ingredient. A single-line <p> simply has no following lines.
        $rows = explode("\n", trim($element->nodeValue));
        $heading = trim(array_shift($rows));
        $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($heading));
        foreach ($rows as $row) {
            $recipe->appendIngredient(trim($row));
        }
    }

    // Notes
    $noteNodes = $finder->query('//*[@id="recipe-intronote"]');
    if ($noteNodes->length) {
        $recipe->notes = RecipeParser_Text::formatAsParagraphs($noteNodes->item(0)->nodeValue);
    }

    // Photo URL to replace og:image
    $imageNodes = $finder->query('//img[@itemprop="image"]');
    if ($imageNodes->length) {
        $source = $imageNodes->item(0)->getAttribute("src");
        $recipe->photo_url = RecipeParser_Text::formatPhotoUrl($source, $url);
    }

    return $recipe;
}
/**
 * Parse a Food52 recipe page: take the generic microdata result, then
 * override the credits and the notes.
 *
 * @param string $html Page markup; converted from UTF-8 to HTML entities before loading.
 * @param string $url  Page URL, handed to the microdata parser.
 * @return mixed Whatever RecipeParser_Parser_MicrodataSchema::parse() returned, after the overrides.
 */
public static function parse($html, $url)
{
    // Get all of the standard microdata stuff we can find.
    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    // Turn off libxml errors to prevent mismatched tag warnings.
    libxml_use_internal_errors(true);
    $encoded = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . $encoded);
    $finder = new DOMXPath($dom);

    // ---- OVERRIDES

    // Credits: always attribute to Food52, keeping any parsed credit in parentheses.
    $recipe->credits = $recipe->credits
        ? "Food52 (" . $recipe->credits . ")"
        : "Food52";

    // Notes: rebuild the text from the direct children of the first note span
    // so that <br> elements become line breaks.
    $noteText = "";
    $noteSpans = $finder->query('.//span[@class="recipe-note"]');
    if ($noteSpans->length) {
        $textBearing = array("#text", "span", "strong", "b", "em", "i", "a");
        // Walk 'childNodes' (not an XPath element query) so #text nodes are included.
        foreach ($noteSpans->item(0)->childNodes as $child) {
            if ($child->nodeName === "br") {
                $noteText .= "\n";
            } elseif (in_array($child->nodeName, $textBearing, true)) {
                $noteText .= $child->nodeValue . " ";
            }
            // Any other node type is ignored.
        }
    }

    // Drop the leading "Author Notes:" label before formatting.
    $noteText = preg_replace("/^Author Notes:\\s*/", "", $noteText);
    $recipe->notes = RecipeParser_Text::formatAsParagraphs($noteText);

    return $recipe;
}
/**
 * RecipeParser_Text::formatAsParagraphs() should turn text with stray tabs,
 * trailing spaces and mixed \n / \r\n line breaks into three single-line
 * paragraphs separated by one blank line each.
 */
public function test_format_paragraphs()
{
    $messy = "\tThis is the\n\tfirst paragraph. \n\t \nThis is \nthe second. \r\n \n \n And this is\r\nthe third. ";
    $expected = "This is the first paragraph.\n\nThis is the second.\n\nAnd this is the third.";

    $actual = RecipeParser_Text::formatAsParagraphs($messy);

    $this->assertEquals($expected, $actual);
}
/**
 * Generic parser for pages marked up with schema.org Recipe microdata.
 *
 * Finds the first element whose itemtype contains "//schema.org/Recipe"
 * (or the lowercase "recipe" variant) and reads title, description, times,
 * yield, ingredients, instructions, photo and credits. When no such element
 * exists, an empty recipe is returned.
 *
 * @param string $html Page markup; converted from UTF-8 to HTML entities before loading.
 * @param string $url  Page URL, handed to the photo URL formatter.
 * @return RecipeParser_Recipe
 */
public static function parse($html, $url)
{
    $recipe = new RecipeParser_Recipe();

    libxml_use_internal_errors(true);
    $encoded = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . $encoded);
    $xpath = new DOMXPath($dom);

    // Locate the microdata root; nothing else is parsed without it.
    $roots = $xpath->query('//*[contains(@itemtype, "//schema.org/Recipe") or contains(@itemtype, "//schema.org/recipe")]');
    if (!$roots->length) {
        return $recipe;
    }
    $microdata = $roots->item(0);

    // Title
    $titleNodes = $xpath->query('.//*[@itemprop="name"]', $microdata);
    if ($titleNodes->length) {
        $recipe->title = RecipeParser_Text::formatTitle(trim($titleNodes->item(0)->nodeValue));
    }

    // Summary
    $summaryNodes = $xpath->query('.//*[@itemprop="description"]', $microdata);
    if ($summaryNodes->length) {
        $recipe->description = RecipeParser_Text::formatAsParagraphs($summaryNodes->item(0)->nodeValue);
    }

    // Times: prefer an ISO 8601 value in "content", then "datetime", and fall
    // back to parsing the element's visible text.
    $timeProps = array('prepTime' => 'prep', 'cookTime' => 'cook', 'totalTime' => 'total');
    foreach ($timeProps as $itemprop => $time_key) {
        $timeNodes = $xpath->query('.//*[@itemprop="' . $itemprop . '"]', $microdata);
        if (!$timeNodes->length) {
            continue;
        }
        $timeNode = $timeNodes->item(0);

        $iso = $timeNode->getAttribute('content');
        if (!$iso) {
            $iso = $timeNode->getAttribute('datetime');
        }

        if ($iso) {
            $minutes = RecipeParser_Text::iso8601ToMinutes($iso);
        } else {
            $minutes = RecipeParser_Times::toMinutes(trim($timeNode->nodeValue));
        }

        // Only store a time that parsed to something truthy.
        if ($minutes) {
            $recipe->time[$time_key] = $minutes;
        }
    }

    // Yield (itemprop may be spelled "recipeYield" or "recipeyield")
    $yieldNodes = $xpath->query('.//*[@itemprop="recipeYield"]', $microdata);
    if (!$yieldNodes->length) {
        $yieldNodes = $xpath->query('.//*[@itemprop="recipeyield"]', $microdata);
    }
    if ($yieldNodes->length) {
        $yieldNode = $yieldNodes->item(0);
        $yieldText = $yieldNode->hasAttribute('content')
            ? $yieldNode->getAttribute('content')
            : $yieldNode->nodeValue;
        $recipe->yield = RecipeParser_Text::formatYield($yieldText);
    }

    // Ingredients (searched across the whole document, not just the microdata root)
    foreach ($xpath->query('//*[@itemprop="ingredients"]') as $ingredientNode) {
        $text = RecipeParser_Text::formatAsOneLine($ingredientNode->nodeValue);

        // Skip blanks, and anything over 150 bytes: probably a mistake, like a
        // run-on of existing ingredients.
        if (empty($text) || strlen($text) > 150) {
            continue;
        }

        if (RecipeParser_Text::matchSectionName($text)) {
            $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($text));
        } else {
            $recipe->appendIngredient($text);
        }
    }

    // Instructions: try progressively looser node selections and use the first
    // one that matches anything.
    //   1. <li> tags anywhere under "recipeInstructions"
    $steps = $xpath->query('//*[@itemprop="recipeInstructions"]//li');
    if (!$steps->length) {
        //   2. direct children of "recipeInstructions"
        $steps = $xpath->query('//*[@itemprop="recipeInstructions"]/*');
    }
    if (!$steps->length) {
        //   3. elements carrying an "instruction" class for each line
        $steps = $xpath->query('.//*[@itemprop="recipeInstructions"]//*[contains(concat(" ", normalize-space(@class), " "), " instruction ")]');
    }

    if ($steps->length) {
        RecipeParser_Text::parseInstructionsFromNodes($steps, $recipe);
    } else {
        //   4. either several recipeInstructions nodes, or one node holding a blob of text
        $containers = $xpath->query('.//*[@itemprop="recipeInstructions"]');
        if ($containers->length > 1) {
            RecipeParser_Text::parseInstructionsFromNodes($containers, $recipe);
        } elseif ($containers->length == 1) {
            RecipeParser_Text::parseInstructionsFromBlob($containers->item(0)->nodeValue, $recipe);
        }
    }

    // Photo: Open Graph image first, then the itemprop="image" element itself,
    // then an <img> nested inside it.
    $photo_url = "";
    $ogNodes = $xpath->query('//meta[@property="og:image"]');
    if ($ogNodes->length) {
        $photo_url = $ogNodes->item(0)->getAttribute('content');
    }
    if (!$photo_url) {
        $imageNodes = $xpath->query('.//*[@itemprop="image"]', $microdata);
        if ($imageNodes->length) {
            $photo_url = $imageNodes->item(0)->getAttribute('src');
        }
    }
    if (!$photo_url) {
        $nestedImages = $xpath->query('.//*[@itemprop="image"]//img', $microdata);
        if ($nestedImages->length) {
            $photo_url = $nestedImages->item(0)->getAttribute('src');
        }
    }
    if ($photo_url) {
        $recipe->photo_url = RecipeParser_Text::formatPhotoUrl($photo_url, $url);
    }

    // Credits: a publisher element, when present, replaces the author.
    $creditLine = "";
    $authorNodes = $xpath->query('.//*[@itemprop="author"]', $microdata);
    if ($authorNodes->length) {
        $creditLine = $authorNodes->item(0)->nodeValue;
    }
    $publisherNodes = $xpath->query('.//*[@itemprop="publisher"]', $microdata);
    if ($publisherNodes->length) {
        $creditLine = $publisherNodes->item(0)->nodeValue;
    }
    $recipe->credits = RecipeParser_Text::formatCredits($creditLine);

    return $recipe;
}
/**
 * Parse a recipe page: take the generic microdata result, then override
 * times, yield, ingredients, notes and description from page-specific markup.
 *
 * Time and yield values are read from <li> elements by child position: the
 * label is expected at child index 1 and the value at child index 3.
 * NOTE(review): those positions presumably rely on whitespace text nodes
 * between the elements; confirm against live markup. An <li> lacking either
 * child is skipped, so no value is parsed from missing markup.
 *
 * @param string $html Page markup; converted from UTF-8 to HTML entities before loading.
 * @param string $url  Page URL, handed to the microdata parser.
 * @return mixed Whatever RecipeParser_Parser_MicrodataSchema::parse() returned, after the overrides.
 */
public static function parse($html, $url)
{
    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    // Keep libxml from emitting warnings for malformed markup.
    libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Times
    $nodes = $xpath->query('//*[@class="recipePartAttributes recipePartPrimaryAttributes"]//li');
    foreach ($nodes as $node) {
        // DOMNodeList::item() returns null for an out-of-range index; skip
        // any <li> that does not have both the label and the value child.
        $label = $node->childNodes->item(1);
        $detail = $node->childNodes->item(3);
        if (!$label || !$detail) {
            continue;
        }

        $name = trim($label->nodeValue);
        if ($name === "Prep Time") {
            $recipe->time['prep'] = RecipeParser_Times::toMinutes(trim($detail->nodeValue));
        } elseif ($name === "Total Time") {
            $recipe->time['total'] = RecipeParser_Times::toMinutes(trim($detail->nodeValue));
        }
    }

    // Yield
    $nodes = $xpath->query('//*[@class="recipePartAttributes recipePartSecondaryAttributes"]//li');
    foreach ($nodes as $node) {
        $label = $node->childNodes->item(1);
        $detail = $node->childNodes->item(3);
        if (!$label || !$detail) {
            continue;
        }

        if (trim($label->nodeValue) === "Servings") {
            $recipe->yield = RecipeParser_Text::formatYield(trim($detail->nodeValue));
        }
    }

    // Ingredients: discard what the microdata pass found and rebuild per group,
    // using each group's first <h2> (if any) as the section name.
    $recipe->resetIngredients();
    $groups = $xpath->query('//*[@class="recipePartIngredientGroup"]');
    foreach ($groups as $group) {
        $nodes = $xpath->query('.//h2', $group);
        if ($nodes->length) {
            $line = RecipeParser_Text::formatSectionName($nodes->item(0)->nodeValue);
            $recipe->addIngredientsSection($line);
        }

        $nodes = $xpath->query('.//*[@itemprop="ingredients"]', $group);
        foreach ($nodes as $node) {
            $line = RecipeParser_Text::formatAsOneLine($node->nodeValue);
            $recipe->appendIngredient($line);
        }
    }

    // Notes / footnotes: one paragraph per tips block.
    $notes = array();
    $nodes = $xpath->query('//div[@class="recipePartTipsInfo"]');
    foreach ($nodes as $node) {
        $notes[] = trim($node->nodeValue);
    }
    $recipe->notes = RecipeParser_Text::formatAsParagraphs(implode("\n\n", $notes));

    // Fix description: strip any "Servings # <number>" text from it.
    $recipe->description = trim(preg_replace("/Servings \\# \\d+/", "", $recipe->description));

    return $recipe;
}
/**
 * Generic parser for pages marked up with the hRecipe microformat.
 *
 * Finds the first element with class "hrecipe" (or "hRecipe") and reads title,
 * description, credits, photo, yield, times, ingredients and instructions.
 * When no such element exists, an empty recipe is returned.
 *
 * @param string $html Page markup; converted from UTF-8 to HTML entities before loading.
 * @param string $url  Page URL, used to make the photo URL absolute.
 * @return RecipeParser_Recipe
 */
public static function parse($html, $url)
{
    $recipe = new RecipeParser_Recipe();

    libxml_use_internal_errors(true);
    $encoded = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . $encoded);
    $xpath = new DOMXPath($dom);

    // Locate the hrecipe root, trying the lowercase class name first.
    $roots = $xpath->query('.//*[contains(concat(" ", normalize-space(@class), " "), " hrecipe ")]');
    if (!$roots->length) {
        $roots = $xpath->query('.//*[contains(concat(" ", normalize-space(@class), " "), " hRecipe ")]');
    }
    if (!$roots->length) {
        return $recipe;
    }
    $hrecipe = $roots->item(0);

    // Title
    $titleNodes = $xpath->query('.//*[contains(concat(" ", normalize-space(@class), " "), " fn ")]', $hrecipe);
    if ($titleNodes->length) {
        $recipe->title = RecipeParser_Text::formatTitle($titleNodes->item(0)->nodeValue);
    }

    // Summary
    $summaryNodes = $xpath->query('.//*[@class="summary"]', $hrecipe);
    if ($summaryNodes->length) {
        $recipe->description = RecipeParser_Text::formatAsParagraphs($summaryNodes->item(0)->nodeValue);
    }

    // Credits
    $authorNodes = $xpath->query('.//*[@class="author"]', $hrecipe);
    if ($authorNodes->length) {
        $recipe->credits = RecipeParser_Text::formatCredits($authorNodes->item(0)->nodeValue);
    }

    // Photo: the "photo" element's own src, else an <img> nested inside it.
    $photo_url = "";
    $photoNodes = $xpath->query('.//*[contains(concat(" ", normalize-space(@class), " "), " photo ")]', $hrecipe);
    if ($photoNodes->length) {
        $photo_url = $photoNodes->item(0)->getAttribute('src');
    }
    if (!$photo_url) {
        $nestedImages = $xpath->query('.//*[contains(concat(" ", normalize-space(@class), " "), " photo ")]//img', $hrecipe);
        if ($nestedImages->length) {
            $photo_url = $nestedImages->item(0)->getAttribute('src');
        }
    }
    if ($photo_url) {
        $recipe->photo_url = RecipeParser_Text::relativeToAbsolute($photo_url, $url);
    }

    // Yield
    $yieldNodes = $xpath->query('.//*[@class="yield"]', $hrecipe);
    if ($yieldNodes->length) {
        $recipe->yield = RecipeParser_Text::formatYield($yieldNodes->item(0)->nodeValue);
    }

    // Prep time: ISO 8601 value-title first, free text second.
    $found = $xpath->query('.//*[@class="prepTime"]//*[@class="value-title"]', $hrecipe);
    if ($found->length) {
        $recipe->time['prep'] = RecipeParser_Text::iso8601ToMinutes($found->item(0)->getAttribute('title'));
    } else {
        $found = $xpath->query('.//*[@class="preptime"]', $hrecipe);
        if ($found->length) {
            $recipe->time['prep'] = RecipeParser_Times::toMinutes($found->item(0)->nodeValue);
        }
    }

    // Cook time: same two-step lookup.
    $found = $xpath->query('.//*[@class="cookTime"]//*[@class="value-title"]', $hrecipe);
    if ($found->length) {
        $recipe->time['cook'] = RecipeParser_Text::iso8601ToMinutes($found->item(0)->getAttribute('title'));
    } else {
        $found = $xpath->query('.//*[@class="cooktime"]', $hrecipe);
        if ($found->length) {
            $recipe->time['cook'] = RecipeParser_Times::toMinutes($found->item(0)->nodeValue);
        }
    }

    // Total time / duration: totalTime value-title, then duration value-title,
    // then the duration element's free text.
    $found = $xpath->query('.//*[@class="totalTime"]//*[@class="value-title"]', $hrecipe);
    if (!$found->length) {
        $found = $xpath->query('.//*[@class="duration"]//*[@class="value-title"]', $hrecipe);
    }
    if ($found->length) {
        $recipe->time['total'] = RecipeParser_Text::iso8601ToMinutes($found->item(0)->getAttribute('title'));
    } else {
        $found = $xpath->query('.//*[@class="duration"]', $hrecipe);
        if ($found->length) {
            $recipe->time['total'] = RecipeParser_Times::toMinutes($found->item(0)->nodeValue);
        }
    }

    // Ingredients (searched across the whole document, not just the hrecipe root)
    $ingredientNodes = $xpath->query('//*[contains(concat(" ", normalize-space(@class), " "), " ingredient ")]');
    foreach ($ingredientNodes as $ingredientNode) {
        $text = RecipeParser_Text::formatAsOneLine(trim($ingredientNode->nodeValue));

        // Skip lines that contain no word-like characters (sometimes used as section dividers).
        if (!preg_match("/\\w/", $text)) {
            continue;
        }

        // A section name is either wrapped in dashes (e.g. "---Cake---") or
        // ends with a colon; the dash form is checked first.
        $isSection = preg_match('/^\\-+([^\\-]{1}.*[^\\-]{1})\\-+$/', $text, $m)
            || preg_match('/^(.+)\\:$/', $text, $m);

        if ($isSection) {
            $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($m[1]));
        } else {
            $recipe->appendIngredient($text);
        }
    }

    // Instructions: try progressively looser node selections and use the first
    // one that matches anything.
    //   1. <li> for each step under an "instructions" element
    $steps = $xpath->query('//*[contains(concat(" ", normalize-space(@class), " "), " instructions ")]//li');
    if (!$steps->length) {
        //   2. an "instruction" class on each step
        $query = '//*[contains(concat(" ", normalize-space(@class), " "), " instructions ")]'
               . '//*[contains(concat(" ", normalize-space(@class), " "), " instruction ")]';
        $steps = $xpath->query($query);
    }

    if ($steps->length) {
        RecipeParser_Text::parseInstructionsFromNodes($steps, $recipe);
    } else {
        //   3. default: several "instructions" nodes, or one holding a blob of text
        $containers = $xpath->query('//*[contains(concat(" ", normalize-space(@class), " "), " instructions ")]');
        if ($containers->length > 1) {
            RecipeParser_Text::parseInstructionsFromNodes($containers, $recipe);
        } elseif ($containers->length == 1) {
            RecipeParser_Text::parseInstructionsFromBlob($containers->item(0)->nodeValue, $recipe);
        }
    }

    return $recipe;
}