/**
 * Parse a recipe page whose content is tagged with "field-*" class names.
 *
 * @param string $html Page markup; converted from UTF-8 to HTML entities before loading.
 * @param string $url  Page address, handed to RecipeParser_Text::relativeToAbsolute()
 *                     together with the photo URL.
 * @return RecipeParser_Recipe Recipe populated with whichever fields were found.
 */
public static function parse($html, $url) {
    // Keep libxml errors internal so mismatched tags do not surface as warnings.
    libxml_use_internal_errors(true);

    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $finder = new DOMXPath($dom);
    $recipe = new RecipeParser_Recipe();

    // Title
    $title_nodes = $finder->query('//*[@id="page-title"]');
    if ($title_nodes->length > 0) {
        $recipe->title = RecipeParser_Text::formatTitle($title_nodes->item(0)->nodeValue);
    }

    // Times: each node is labelled either "Hands-On Time" or "Total Time".
    foreach ($finder->query('//*[@class="field-recipe-time"]') as $time_node) {
        $text = RecipeParser_Text::formatAsOneLine($time_node->nodeValue);
        if (strpos($text, "Hands-On Time") !== false) {
            $text = str_replace("Hands-On Time ", "", $text);
            $recipe->time["prep"] = RecipeParser_Times::toMinutes($text);
        } elseif (strpos($text, "Total Time") !== false) {
            $text = str_replace("Total Time ", "", $text);
            $recipe->time["total"] = RecipeParser_Times::toMinutes($text);
        }
    }

    // Yield
    $yield_nodes = $finder->query('//*[@class="field-yield"]');
    if ($yield_nodes->length > 0) {
        $recipe->yield = RecipeParser_Text::formatYield($yield_nodes->item(0)->nodeValue);
    }

    // Ingredients
    foreach ($finder->query('//*[@class="field-ingredients"]') as $ingredient_node) {
        $recipe->appendIngredient(RecipeParser_Text::formatAsOneLine($ingredient_node->nodeValue));
    }

    // Instructions
    foreach ($finder->query('//*[@class="field-instructions"]//li') as $step_node) {
        $recipe->appendInstruction(RecipeParser_Text::formatAsOneLine($step_node->nodeValue));
    }

    // Photo
    $image_nodes = $finder->query('//*[@property="og:image"]');
    if ($image_nodes->length > 0) {
        $image_src = $image_nodes->item(0)->getAttribute('content');
        $recipe->photo_url = RecipeParser_Text::relativeToAbsolute($image_src, $url);
    }

    return $recipe;
}
/**
 * Parse a recipe page laid out as a table with class "hrecipe"; the title
 * cleanup strips the "Cooks.com" markers from the <title> text.
 *
 * @param string $html Page markup; converted from UTF-8 to HTML entities before loading.
 * @param string $url  Page address (not used by this parser).
 * @return RecipeParser_Recipe Recipe populated with whichever fields were found.
 */
public static function parse($html, $url) {
    // Keep libxml errors internal so mismatched tags do not surface as warnings.
    libxml_use_internal_errors(true);

    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $finder = new DOMXPath($dom);
    $recipe = new RecipeParser_Recipe();

    // Title: take the <title> text and strip the site markers, one after the other.
    $title_nodes = $dom->getElementsByTagName('title');
    if ($title_nodes->length > 0) {
        $title = $title_nodes->item(0)->nodeValue;
        foreach (array("Cooks.com - Recipe - ", " - Recipe - Cooks.com") as $marker) {
            $title = trim(str_replace($marker, "", $title));
        }
        $recipe->title = $title;
    }

    // Ingredients, section titles and instructions all live in these divs;
    // the "style" and "class" attributes tell them apart.
    foreach ($finder->query('//table[@class="hrecipe"]//td/div') as $block) {

        // Ingredients sit in a div styled with black text.
        if (stripos($block->getAttribute("style"), "color: BLACK;") !== false) {
            foreach ($finder->query('./span[@class = "ingredient"]', $block) as $span) {
                $recipe->appendIngredient($span->nodeValue);
            }
            continue;
        }

        $class_name = $block->getAttribute('class');

        // Instructions: one instruction per child node.
        if ($class_name == "instructions") {
            foreach ($block->childNodes as $piece) {
                $recipe->appendInstruction(RecipeParser_Text::formatAsOneLine($piece->nodeValue));
            }
            continue;
        }

        // Section title: always opens an ingredients section, and also an
        // instructions section once at least one instruction exists.
        if ($class_name == "section") {
            $section_name = RecipeParser_Text::formatSectionName($block->nodeValue);
            $recipe->addIngredientsSection($section_name);
            if (count($recipe->instructions) > 0) {
                $recipe->addInstructionsSection($section_name);
            }
        }
    }

    return $recipe;
}
/**
 * Section names that are empty or whitespace-only must not add entries:
 * the recipe is expected to end up with two ingredient entries and one
 * instruction entry.
 */
public function testMultipleEmptySections() {
    $recipe = new RecipeParser_Recipe();

    // One named ingredient section, then blank section names that should be ignored.
    $recipe->addIngredientsSection('Pasta');
    $recipe->appendIngredient('1 lb Spaghetti');
    $recipe->addIngredientsSection('');
    $recipe->addIngredientsSection(' ');
    $recipe->appendIngredient('1 C Water');

    // Only blank instruction section names, followed by a single instruction.
    $recipe->addInstructionsSection('');
    $recipe->addInstructionsSection(' ');
    $recipe->addInstructionsSection(' ');
    $recipe->appendInstruction('Heat water in large pot.');

    $this->assertEquals(2, count($recipe->ingredients));
    $this->assertEquals(1, count($recipe->instructions));
}
/**
 * Parse a recipe page whose content sits under <div id="detail_content">.
 *
 * Notes, ingredients and instructions are not individually marked up, so the
 * children of the content div are walked in document order and <h5> headings
 * ("Ingredients" / "Instructions") switch the parser between modes.
 *
 * @param string $html Page markup; converted from UTF-8 to HTML entities before loading.
 * @param string $url  Page address, handed to RecipeParser_Text::formatPhotoUrl()
 *                     together with the photo URL.
 * @return RecipeParser_Recipe Recipe populated with whichever fields were found.
 */
public static function parse($html, $url) {
    // Turn off libxml errors to prevent mismatched tag warnings.
    libxml_use_internal_errors(true);

    $doc = new DOMDocument();
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    $recipe = new RecipeParser_Recipe();

    // Title
    $nodes = $xpath->query('//div[@id="detail_content"]/h1');
    if ($nodes->length) {
        $recipe->title = trim($nodes->item(0)->nodeValue);
    }

    // Yield
    $nodes = $xpath->query('//p[@id="yield"]');
    if ($nodes->length) {
        $line = trim($nodes->item(0)->nodeValue);
        $recipe->yield = RecipeParser_Text::formatYield($line);
    }

    // Notes, instructions, and ingredients are not very well structured.
    $found_ingredients = false;
    $found_instructions = false;

    $nodes = $xpath->query('//div[@id="detail_content"]/*');
    foreach ($nodes as $node) {

        // Notes -- the only <p> that has no attributes on the tag.
        if ($node->nodeName == 'p' && !$node->hasAttributes()) {
            $recipe->notes = trim($node->nodeValue);
            continue;
        }

        // Ingredients/instructions markers
        if ($node->nodeName == 'h5') {
            $line = strtolower(trim($node->nodeValue));
            if ($line == 'ingredients') {
                $found_ingredients = true;
                continue;
            } elseif ($line == 'instructions') {
                $found_instructions = true;
            }
        }

        // Ingredients
        if ($found_ingredients && !$found_instructions) {
            if ($node->nodeName == 'h6') {
                $line = trim($node->nodeValue);
                $line = RecipeParser_Text::formatSectionName($line);
                $recipe->addIngredientsSection($line);
            } elseif ($node->nodeName == 'ul') {
                foreach ($node->childNodes as $sub) {
                    $line = trim($sub->nodeValue);

                    // Add spaces between quantities and units
                    $line = preg_replace('/(\\d+)([A-Za-z]+)/', "\$1 \$2", $line);
                    // Remove spaces before commas (not sure why this happens in their HTML)
                    $line = str_replace(' ,', ',', $line);
                    // Condense multiple spaces. The previous str_replace() swapped a
                    // single space for a single space and so changed nothing.
                    $line = preg_replace('/ {2,}/', ' ', $line);

                    $recipe->appendIngredient($line);
                }
            }
        }

        // Instructions
        if ($found_instructions && $node->nodeName == 'ul') {
            foreach ($node->childNodes as $sub) {
                $line = trim($sub->nodeValue);
                $line = RecipeParser_Text::stripLeadingNumbers($line);
                $recipe->appendInstruction($line);
            }
        }
    }

    // Photo
    $nodes = $xpath->query('//img[@class="detail"]');
    if ($nodes->length) {
        $photo_url = $nodes->item(0)->getAttribute('src');
        $recipe->photo_url = RecipeParser_Text::formatPhotoUrl($photo_url, $url);
    }

    return $recipe;
}
/**
 * Parse a recipe page that uses "recipe_ingredients" / "recipe_instructions"
 * lists and a title inside <div id="rightCol">.
 *
 * @param string $html Page markup; converted from UTF-8 to HTML entities before loading.
 * @param string $url  Page address, handed to RecipeParser_Text::formatPhotoUrl()
 *                     together with the photo URL.
 * @return RecipeParser_Recipe Recipe populated with whichever fields were found.
 */
public static function parse($html, $url) {
    $recipe = new RecipeParser_Recipe();

    // Keep libxml errors internal so malformed markup does not raise warnings.
    libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Title
    $nodes = $xpath->query('//div[@id="rightCol"]/h1');
    if ($nodes->length) {
        $recipe->title = trim($nodes->item(0)->nodeValue);
    }

    // Yield
    $nodes = $xpath->query('//h4[@class="detailHeader"]');
    if ($nodes->length) {
        $line = trim($nodes->item(0)->nodeValue);
        $recipe->yield = RecipeParser_Text::formatYield($line);
    }

    // Notes
    $nodes = $xpath->query('//div[@class="dek"]');
    if ($nodes->length) {
        $recipe->notes = trim($nodes->item(0)->nodeValue);
    }

    // Ingredients
    $nodes = $xpath->query('//ul[@class="recipe_ingredients"]/li');
    foreach ($nodes as $node) {
        $line = trim($node->nodeValue);

        // Section names have class="ingredientSectionTitle",
        // ingredients themselves have no class.
        if ($node->hasAttributes()) {
            $line = RecipeParser_Text::formatSectionName($line);
            $recipe->addIngredientsSection($line);
            continue;
        }

        // Add spaces between quantities and units
        $line = preg_replace('/(\\d+)([A-Za-z]+)/', "\$1 \$2", $line);
        // Remove spaces before commas (not sure why this happens in their HTML)
        $line = str_replace(' ,', ',', $line);
        // Condense multiple spaces. The previous str_replace() swapped a
        // single space for a single space and so changed nothing.
        $line = preg_replace('/ {2,}/', ' ', $line);

        $recipe->appendIngredient($line);
    }

    // Instructions
    $nodes = $xpath->query('//ol[@class="recipe_instructions"]/li');
    foreach ($nodes as $node) {
        $line = trim($node->nodeValue);
        $line = RecipeParser_Text::stripLeadingNumbers($line);
        $recipe->appendInstruction($line);
    }

    // Photo: prefer the recipe image, fall back to the splash image.
    $nodes = $xpath->query('//img[@class="recipeImg"]');
    if (!$nodes->length) {
        $nodes = $xpath->query('//img[@id="splashImage"]');
    }
    if ($nodes->length) {
        $photo_url = $nodes->item(0)->getAttribute('src');
        $recipe->photo_url = RecipeParser_Text::formatPhotoUrl($photo_url, $url);
    }

    return $recipe;
}
/**
 * Parse a recipe page annotated with itemprop attributes (name, description,
 * author, prepTime, totalTime, recipeyield, ingredients, recipeinstructions).
 *
 * @param string $html Page markup; converted from UTF-8 to HTML entities before loading.
 * @param string $url  Page address (not used by this parser).
 * @return RecipeParser_Recipe Recipe populated with whichever fields were found.
 */
public static function parse($html, $url) {
    // Keep libxml errors internal so mismatched tags do not surface as warnings.
    libxml_use_internal_errors(true);

    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $finder = new DOMXPath($dom);
    $recipe = new RecipeParser_Recipe();

    // Title
    $found = $finder->query('//h1[@itemprop="name"]');
    if ($found->length > 0) {
        $recipe->title = RecipeParser_Text::formatTitle($found->item(0)->nodeValue);
    }

    // Description
    $found = $finder->query('//*[@itemprop="description"]');
    if ($found->length > 0) {
        $recipe->description = RecipeParser_Text::formatAsOneLine($found->item(0)->nodeValue);
    }

    // Author
    $found = $finder->query('//span[@itemprop="author"]');
    if ($found->length > 0) {
        $recipe->credits = RecipeParser_Text::formatCredits($found->item(0)->nodeValue);
    }

    // Prep time: the duration is read from the "content" attribute.
    $found = $finder->query('//*[@itemprop="prepTime"]');
    if ($found->length > 0) {
        $duration = $found->item(0)->getAttribute("content");
        $recipe->time['prep'] = RecipeParser_Text::iso8601ToMinutes($duration);
    }

    // Total time
    $found = $finder->query('//*[@itemprop="totalTime"]');
    if ($found->length > 0) {
        $duration = $found->item(0)->getAttribute("content");
        $recipe->time['total'] = RecipeParser_Text::iso8601ToMinutes($duration);
    }

    // Yield
    $found = $finder->query('//*[@itemprop="recipeyield"]');
    if ($found->length > 0) {
        $recipe->yield = RecipeParser_Text::formatYield($found->item(0)->nodeValue);
    }

    // Ingredients
    foreach ($finder->query('//*[@itemprop="ingredients"]') as $ingredient_node) {
        $recipe->appendIngredient(RecipeParser_Text::formatAsOneLine($ingredient_node->nodeValue));
    }

    // Instructions
    foreach ($finder->query('//*[@itemprop="recipeinstructions"]/li') as $step_node) {
        $recipe->appendInstruction(RecipeParser_Text::formatAsOneLine($step_node->nodeValue));
    }

    // Photo: the og:image content is stored as-is.
    $found = $finder->query('//meta[@property="og:image"]');
    if ($found->length > 0) {
        $recipe->photo_url = $found->item(0)->getAttribute("content");
    }

    return $recipe;
}
/**
 * Parse a recipe page that uses itemprop attributes for title, time, yield
 * and ingredients, and "steps-list" / "recipe-notes" / "recipe-carousel"
 * classes for instructions, notes and photos.
 *
 * @param string $html Page markup; converted from UTF-8 to HTML entities before loading.
 * @param string $url  Page address, handed to RecipeParser_Text::relativeToAbsolute()
 *                     together with the photo URL.
 * @return RecipeParser_Recipe Recipe populated with whichever fields were found.
 */
public static function parse($html, $url) {
    // Keep libxml errors internal so malformed markup does not raise warnings.
    libxml_use_internal_errors(true);

    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $finder = new DOMXPath($dom);
    $recipe = new RecipeParser_Recipe();

    // Title
    $found = $finder->query('//h1[@itemprop="name"]');
    if ($found->length > 0) {
        $recipe->title = trim($found->item(0)->nodeValue);
    }

    // Time, e.g. <time datetime="PT35M" itemprop="prepTime">
    // NOTE(review): the prepTime element is stored as the *total* time -- confirm this is intended.
    $found = $finder->query('//time[@itemprop="prepTime"]');
    if ($found->length > 0) {
        $time_text = $found->item(0)->textContent;
        if ($time_text) {
            $recipe->time['total'] = RecipeParser_Text::mixedTimeToMinutes($time_text);
        }
    }

    // Yield
    $found = $finder->query('//*[@itemprop="recipeYield"]');
    if ($found->length > 0) {
        $recipe->yield = RecipeParser_Text::formatYield($found->item(0)->nodeValue);
    }

    // Ingredients: skip the literal "Ingredients" entry.
    foreach ($finder->query('//*[@itemprop="ingredients"]') as $ingredient_node) {
        $text = trim($ingredient_node->nodeValue);
        if ($text == "Ingredients") {
            continue;
        }
        $recipe->appendIngredient($text);
    }

    // Instructions: a step may carry a section name, split off by splitDirections().
    foreach ($finder->query('//span[@class = "steps-list__item__text"]') as $step_node) {
        $text = RecipeParser_Text::stripLeadingNumbers(trim($step_node->nodeValue));
        $pieces = self::splitDirections($text);
        if ($pieces['section']) {
            $pieces['section'] = RecipeParser_Text::formatSectionName($pieces['section']);
            $recipe->addInstructionsSection($pieces['section']);
        }
        $recipe->appendInstruction($pieces['direction']);
    }

    // Notes: paragraphs are joined with " | ".
    $found = $finder->query('//div[@class = "recipe-notes__content"]/div/p');
    $collected = array();
    if ($found->length > 0) {
        foreach ($found as $paragraph) {
            $collected[] = trim($paragraph->nodeValue);
        }
        $recipe->notes = implode(' | ', $collected);
    }

    // Photo: uses the second carousel image (index 1) and ignores placeholder images.
    $found = $finder->query('//img[@class = "recipe-carousel__recipe__img"]');
    if ($found && $found->item(1)) {
        $image_src = $found->item(1)->getAttribute('src');
        $is_default = strpos($image_src, 'default-recipe-image.gif') !== false;
        $is_placeholder = strpos($image_src, 'placeholder.gif') !== false;
        if (!$is_default && !$is_placeholder) {
            $recipe->photo_url = RecipeParser_Text::relativeToAbsolute($image_src, $url);
        }
    }

    return $recipe;
}
/**
 * Parse a recipe page that uses class names such as "rTitle fn", "yield",
 * "prepTime", "ingredient" and "instructions".
 *
 * @param string $html Page markup; converted from UTF-8 to HTML entities before loading.
 * @param string $url  Page address, handed to RecipeParser_Text::relativeToAbsolute()
 *                     together with the photo URL.
 * @return RecipeParser_Recipe Recipe populated with whichever fields were found.
 */
public static function parse($html, $url) {
    $recipe = new RecipeParser_Recipe();

    // Turn off libxml errors to prevent mismatched tag warnings.
    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Title
    $nodes = $xpath->query('//*[@class="rTitle fn"]');
    if ($nodes->length) {
        $recipe->title = RecipeParser_Text::formatTitle($nodes->item(0)->nodeValue);
    }

    // Yield
    $nodes = $xpath->query('//*[contains(concat(" ", normalize-space(@class), " "), " yield ")]');
    if ($nodes->length) {
        $recipe->yield = RecipeParser_Text::formatYield($nodes->item(0)->nodeValue);
    }

    // Times: the ISO 8601 value is read from the "title" attribute of the
    // SECOND <span> under each time container.
    $time_classes = array(
        'prepTime'        => 'prep',
        'rspec-cook-time' => 'cook',
        'totaltime'       => 'total',
    );
    foreach ($time_classes as $class_name => $time_key) {
        $nodes = $xpath->query(
            '//*[contains(concat(" ", normalize-space(@class), " "), " ' . $class_name . ' ")]/span'
        );
        // Require two spans: item(1) is null when only one matched, and
        // calling getAttribute() on null is a fatal error.
        if ($nodes->length > 1) {
            $line = $nodes->item(1)->getAttribute("title");
            $recipe->time[$time_key] = RecipeParser_Text::iso8601ToMinutes($line);
        }
    }

    // Ingredients
    $nodes = $xpath->query('//*[@class="ingredient"]');
    foreach ($nodes as $node) {
        $recipe->appendIngredient(RecipeParser_Text::formatAsOneLine($node->nodeValue));
    }

    // Instructions: glue all child nodes into one blob, putting a blank line
    // after each <p>, then split the blob back on those blank lines.
    $nodes = $xpath->query('//*[@class="instructions"]');
    if ($nodes->length) {
        $blob = "";
        foreach ($nodes->item(0)->childNodes as $node) {
            $blob .= RecipeParser_Text::formatAsOneLine($node->nodeValue) . " ";
            if ($node->nodeName == "p") {
                $blob .= "\n\n";
            }
        }

        // Minor cleanup
        $blob = str_replace(" , ", ", ", $blob);
        $blob = str_replace(" . ", ". ", $blob);
        // Collapse runs of spaces. The previous str_replace() swapped a single
        // space for a single space and so changed nothing. Newlines are left
        // alone so the paragraph separators survive.
        $blob = preg_replace('/ {2,}/', ' ', $blob);

        foreach (explode("\n\n", $blob) as $line) {
            $recipe->appendInstruction(RecipeParser_Text::formatAsOneLine($line));
        }
    }

    // Photo
    $nodes = $xpath->query('//a[@class="img-enlarge"]');
    if ($nodes->length) {
        $photo_url = $nodes->item(0)->getAttribute("href");
        $recipe->photo_url = RecipeParser_Text::relativeToAbsolute($photo_url, $url);
    }

    return $recipe;
}
/**
 * Parse a recipe page built from "recipe-metadata", "recipe-body" and
 * "recipe-sidebar" blocks; credits combine a chef name with a show name
 * ("Bravo TV", "Top Chef" or "Top Chef Masters").
 *
 * @param string $html Page markup; converted from UTF-8 to HTML entities before loading.
 * @param string $url  Page address, handed to RecipeParser_Text::formatPhotoUrl()
 *                     together with the photo URL.
 * @return RecipeParser_Recipe Recipe populated with whichever fields were found.
 */
public static function parse($html, $url) {
    $recipe = new RecipeParser_Recipe();

    // Keep libxml errors internal so malformed markup does not raise warnings.
    libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Title
    $nodes = $xpath->query('//h3[@class = "title"]');
    if ($nodes->length) {
        $recipe->title = trim($nodes->item(0)->nodeValue);
    }

    // Cook times: each metadata <li> pairs an <h5> label with a <p> value.
    $nodes = $xpath->query('//div[@class = "recipe-metadata"]/ul/li');
    foreach ($nodes as $node) {
        $key = null;
        $value = null;
        foreach ($node->childNodes as $sub_node) {
            if ($sub_node->nodeName == 'h5') {
                $key = trim($sub_node->nodeValue);
            }
            if ($sub_node->nodeName == 'p') {
                $value = trim($sub_node->nodeValue);
            }
        }

        // Inspect keys/values we've found.
        if ($key == 'Total Time:') {
            $recipe->time['total'] = RecipeParser_Times::toMinutes(self::cleanupTime($value));
        } elseif ($key == 'Prep Time:') {
            $recipe->time['prep'] = RecipeParser_Times::toMinutes(self::cleanupTime($value));
        }
    }

    // Alternate markup: <dd> elements. When present these overwrite the
    // values collected from the metadata list above.
    $dd_classes = array(
        'preptime'                   => 'prep',
        'cooktime'                   => 'cook',
        'duration totaltime special' => 'total',
    );
    foreach ($dd_classes as $class_name => $time_key) {
        $node_list = $xpath->query('//dd[@class = "' . $class_name . '"]');
        if ($node_list->length) {
            $recipe->time[$time_key] = RecipeParser_Times::toMinutes($node_list->item(0)->nodeValue);
        }
    }

    // Ingredients, Yield, Description, Notes, etc.
    // <h4> headings name the section that the following <div class="section"> belongs to.
    $nodes = $xpath->query('//div[@class = "recipe-body"]/*');
    $section_title = null;
    foreach ($nodes as $node) {

        // Section titles
        if ($node->nodeName == 'h4') {
            $section_title = trim(strtolower($node->nodeValue));
            continue;
        }

        // Only <div class="section"> nodes carry content.
        if ($node->nodeName != 'div' || $node->getAttribute('class') != 'section') {
            continue;
        }

        if (!$section_title) {
            // Description should be first text, before any section titles.
            $value = $node->nodeValue;
            $value = preg_replace("/^(Drink\\:|Top Chef).*\$/m", '', $value);
            $value = str_replace("\n\n", "\n", $value);
            $recipe->description = trim($value);

        } elseif ($section_title == 'yield') {
            $recipe->yield = trim($node->nodeValue);

        } elseif ($section_title == 'notes') {
            $value = trim($node->nodeValue);
            $recipe->notes = str_replace("\n\n", "\n", $value);

        } elseif ($section_title == 'ingredients') {
            foreach ($node->childNodes as $sub_node) {
                if ($sub_node->nodeName == 'h5') {
                    $value = RecipeParser_Text::formatSectionName($sub_node->nodeValue);
                    $recipe->addIngredientsSection($value);
                } elseif ($sub_node->nodeName == 'ul') {
                    foreach ($sub_node->childNodes as $li_node) {
                        $recipe->appendIngredient(trim($li_node->nodeValue));
                    }
                }
            }

        } elseif ($section_title == 'directions') {
            foreach ($node->childNodes as $sub_node) {
                $value = trim($sub_node->nodeValue);

                // Section titles appear in all-caps or end with a colon.
                if ($value && ($value == strtoupper($value) || preg_match('/:$/', $value))) {
                    $value = RecipeParser_Text::formatSectionName($value);
                    $recipe->addInstructionsSection($value);
                } else {
                    $value = RecipeParser_Text::stripLeadingNumbers($value);
                    $recipe->appendInstruction($value);
                }
            }
        }
    }

    // Source / Chef: the first <small> under a "chef" or "author" heading.
    $nodes = $xpath->query('//div[@class = "recipe-sidebar"]/div/*');
    $section_title = null;
    $chef_name = null;
    $show_name = 'Bravo TV';
    foreach ($nodes as $node) {
        if ($node->nodeName == 'h4') {
            $section_title = strtolower(trim($node->nodeValue));
            continue;
        }
        if ($node->nodeName == 'small'
            && ($section_title == 'chef' || $section_title == 'author')
        ) {
            $chef_name = trim($node->nodeValue);
            break;
        }
    }

    // Show name: check the longer name first, since "Top Chef" is a prefix of it.
    $nodes = $xpath->query('//div[@class = "section"]/p[1]');
    if ($nodes->length) {
        $value = $nodes->item(0)->nodeValue;
        if (strpos($value, 'Top Chef Masters') !== false) {
            $show_name = 'Top Chef Masters';
        } elseif (strpos($value, 'Top Chef') !== false) {
            $show_name = 'Top Chef';
        }
    }

    // Avoid a dangling ", <show>" credit when no chef name was found.
    $recipe->credits = $chef_name ? $chef_name . ', ' . $show_name : $show_name;

    // Photo: swap the medium rendition for the original one.
    $nodes = $xpath->query('//div[@class = "recipe-header clearfix"]//img');
    if ($nodes->length) {
        $photo_url = $nodes->item(0)->getAttribute('src');
        $photo_url = str_replace('/medium/', '/original/', $photo_url);
        $recipe->photo_url = RecipeParser_Text::formatPhotoUrl($photo_url, $url);
    }

    return $recipe;
}
/**
 * Parse a New York Times recipe page.
 *
 * Two layouts are handled: pages whose URL contains "www.nytimes.com/recipes/"
 * (the "Recipes" section, with itemprop markup) and everything else (treated
 * as Dining section articles, where the content is found by text patterns).
 *
 * @param string $html Page markup; converted from UTF-8 to HTML entities before loading.
 * @param string $url  Page address; selects the layout and is handed to
 *                     RecipeParser_Text::formatPhotoUrl() for Dining pages.
 * @return RecipeParser_Recipe Recipe populated with whichever fields were found.
 */
public static function parse($html, $url) {
    // The DOM setup is identical for both layouts, so it is done once here.
    $recipe = new RecipeParser_Recipe();

    libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    if (strpos($url, "www.nytimes.com/recipes/") !== false) {

        //
        // "RECIPES" SECTION
        //

        // Title
        $nodes = $xpath->query('//h1[@class="recipe-title recipeName"]');
        if ($nodes->length) {
            $recipe->title = RecipeParser_Text::formatTitle($nodes->item(0)->nodeValue);
        }

        // Yield
        $nodes = $xpath->query('//*[@itemprop="recipeYield"]');
        if ($nodes->length) {
            $recipe->yield = RecipeParser_Text::formatYield($nodes->item(0)->nodeValue);
        }

        // Ingredients: <h3> children are section names (the generic
        // "Ingredients" heading is skipped); other children hold the items.
        $nodes = $xpath->query('//div[@class="ingredientsGroup"]/*');
        foreach ($nodes as $node) {
            if ($node->nodeName == "h3") {
                $value = trim($node->nodeValue);
                if (!preg_match('/^Ingredients:?$/i', $value)) {
                    $value = RecipeParser_Text::formatSectionName($value);
                    $recipe->addIngredientsSection($value);
                }
            } else {
                foreach ($node->childNodes as $child) {
                    $recipe->appendIngredient(trim($child->nodeValue));
                }
            }
        }

        // Instructions
        $nodes = $xpath->query('//*[@itemprop="recipeInstructions"]/dd');
        foreach ($nodes as $node) {
            $recipe->appendInstruction(RecipeParser_Text::formatAsOneLine($node->nodeValue));
        }

        // Notes
        if (!$recipe->notes) {
            $nodes = $xpath->query('//div[@class="yieldNotesGroup"]//*[@class="note"]');
            if ($nodes->length) {
                $value = trim($nodes->item(0)->nodeValue);
                $value = preg_replace("/^Notes?:?\\s*/i", '', $value);
                $recipe->notes = trim($value);
            }
        }

    } else {

        //
        // DINING SECTION RECIPES
        //

        // Title
        $nodes = $xpath->query('//div[@id = "article"]//h1');
        if ($nodes->length) {
            $recipe->title = trim($nodes->item(0)->nodeValue);
        }

        // Time and Yield
        $nodes = $xpath->query('//div[@id = "article"]//p');
        foreach ($nodes as $node) {
            $text = trim($node->nodeValue);
            if (preg_match('/^Yield:? (.+)/', $text, $m)) {
                $recipe->yield = RecipeParser_Text::formatYield($m[1]);
            } elseif (preg_match('/^Time:? (.+)/', $text, $m)) {
                // Drop a leading "About" and anything from " plus" onwards.
                $str = trim($m[1]);
                $str = preg_replace('/About (.+)/', '$1', $str);
                $str = preg_replace('/(.+) plus.*/', '$1', $str);
                $recipe->time['total'] = RecipeParser_Times::toMinutes($str);
            }
        }

        // Ingredients
        $nodes = $xpath->query('//div[@class="recipeIngredientsList"]/p');
        foreach ($nodes as $node) {
            $line = trim($node->nodeValue);

            // Section names appear in all-caps.
            if ($line && $line == strtoupper($line)) {
                $line = RecipeParser_Text::formatSectionName($line);
                $recipe->addIngredientsSection($line);
                continue;
            }

            $recipe->appendIngredient($line);
        }

        // Instructions and notes
        $nodes = $xpath->query('//div[@class="articleBody"]//p');
        if (!$nodes->length) {
            $nodes = $xpath->query('//div[@id="articleBody"]//p');
        }

        $notes = '';
        $in_notes_section = false;
        foreach ($nodes as $node) {
            $line = trim($node->nodeValue);

            // Skip some of the useless lines
            if (preg_match('/^(Adapted from|Time|Yield)/i', $line)) {
                continue;
            }

            // Instructions start with line numbers
            if (!$in_notes_section && preg_match('/^\\d+\\./', $line)) {
                $line = RecipeParser_Text::stripLeadingNumbers($line);
                $recipe->appendInstruction($line);
                continue;
            }

            // A "Note:" line opens the notes section; once open, every
            // following paragraph is treated as part of the notes.
            $note = '';
            if (preg_match('/^Notes?:?(.*)/i', $line, $m)) {
                $in_notes_section = true;
                $note = trim($m[1]);
            } elseif ($in_notes_section) {
                $note = $line;
            }
            if ($note) {
                $notes .= $note . "\n\n";
            }
        }
        if ($notes) {
            // Collapse unnecessary runs of spaces. The previous str_replace()
            // swapped a single space for a single space and so changed nothing.
            $notes = preg_replace('/ {2,}/', ' ', $notes);
            $recipe->notes = trim($notes);
        }

        // Photo: swap the inline rendition for the popup one.
        $nodes = $xpath->query('//div[@class="image"]//img');
        if ($nodes->length) {
            $photo_url = $nodes->item(0)->getAttribute('src');
            $photo_url = str_replace('-articleInline.jpg', '-popup.jpg', $photo_url);
            $recipe->photo_url = RecipeParser_Text::formatPhotoUrl($photo_url, $url);
        }
    }

    return $recipe;
}
/**
 * Parse a recipe page that uses itemprop attributes for title, total time,
 * yield and image, and "ingredients" / "directions" element ids for the
 * ingredient and instruction lists.
 *
 * @param string $html Page markup; converted from UTF-8 to HTML entities before loading.
 * @param string $url  Page address, handed to RecipeParser_Text::formatPhotoUrl()
 *                     together with the photo URL.
 * @return RecipeParser_Recipe Recipe populated with whichever fields were found.
 */
public static function parse($html, $url) {
    // Keep libxml errors internal so malformed markup does not raise warnings.
    libxml_use_internal_errors(true);

    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $finder = new DOMXPath($dom);
    $recipe = new RecipeParser_Recipe();

    // Title
    $found = $finder->query('//h1[@itemprop="name"]');
    if ($found->length > 0) {
        $recipe->title = trim($found->item(0)->nodeValue);
    }

    // Total time, e.g. <meta content="PT3H30M" itemprop="totalTime">
    $found = $finder->query('//meta[@itemprop="totalTime"]');
    if ($found->length > 0) {
        $duration = $found->item(0)->getAttribute('content');
        if ($duration) {
            $recipe->time['total'] = RecipeParser_Text::iso8601ToMinutes($duration);
        }
    }

    // Yield
    $found = $finder->query('//*[@itemprop="recipeYield"]');
    if ($found->length > 0) {
        $recipe->yield = RecipeParser_Text::formatYield($found->item(0)->nodeValue);
    }

    // Ingredients: <h2> children name sections (the generic "Ingredients"
    // heading is skipped); <ol> children hold the items in li/span.
    foreach ($finder->query('//div[@id = "ingredients"]/*') as $child) {
        if ($child->nodeName == 'h2') {
            $heading = RecipeParser_Text::formatSectionName(trim($child->nodeValue));
            if ($heading != "Ingredients") {
                $recipe->addIngredientsSection($heading);
            }
            continue;
        }
        if ($child->nodeName == 'ol') {
            foreach ($finder->query('./li/span', $child) as $item) {
                $recipe->appendIngredient(trim($item->nodeValue));
            }
        }
    }

    // Instructions: a step may carry a section name, split off by splitDirections().
    foreach ($finder->query('//div[@id = "directions"]/ol/li') as $step_node) {
        $text = RecipeParser_Text::stripLeadingNumbers(trim($step_node->nodeValue));
        $pieces = self::splitDirections($text);
        if ($pieces['section']) {
            $pieces['section'] = RecipeParser_Text::formatSectionName($pieces['section']);
            $recipe->addInstructionsSection($pieces['section']);
        }
        $recipe->appendInstruction($pieces['direction']);
    }

    // Notes
    $found = $finder->query('//div[@id = "directions"]/div[@id = "endnotes"]');
    if ($found->length > 0) {
        $recipe->notes = trim($found->item(0)->nodeValue);
    }

    // Photo: ignore the site's default and placeholder images.
    $found = $finder->query('//img[@itemprop="image"]');
    if ($found && $found->item(0)) {
        $image_src = $found->item(0)->getAttribute('src');
        $is_default = strpos($image_src, 'default-recipe-image.gif') !== false;
        $is_placeholder = strpos($image_src, 'placeholder.gif') !== false;
        if (!$is_default && !$is_placeholder) {
            $recipe->photo_url = RecipeParser_Text::formatPhotoUrl($image_src, $url);
        }
    }

    return $recipe;
}