/**
 * Parse a recipe page, then fill in notes and enlarge the photo URL.
 *
 * Starts from the result of RecipeParser_Parser_MicrodataSchema::parse() and
 * overwrites the `notes` and `photo_url` properties.
 *
 * @param string $html Page markup.
 * @param string $url  Passed through to the schema parser.
 * @return object Recipe returned by the schema parser, with the overrides applied.
 */
public static function parse($html, $url)
{
    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    // Collect libxml errors internally rather than emitting warnings.
    libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $xpath = new DOMXPath($dom);

    // Notes -- non-standard cook times / baking temps plus any tips that
    // follow the instructions.
    $collected = array();

    // Tip list items, without a leading "Tip" / "Note" label.
    foreach ($xpath->query('//*[@class="recipeTips"]//li') as $tip) {
        $text = RecipeParser_Text::formatAsOneLine($tip->nodeValue);
        $collected[] = preg_replace("/^(Tip|Note)\\s*(.*)\$/", "\$2", $text);
    }

    // "type" entries inside recipeInfo, except the "Makes:" line.
    foreach ($xpath->query('//*[@class="recipeInfo"]//*[@class="type"]') as $info) {
        $text = RecipeParser_Text::formatAsOneLine($info->nodeValue);
        if (strpos($text, "Makes:") === false) {
            $collected[] = $text;
        }
    }

    $recipe->notes = implode("\n\n", $collected);

    // Swap the "l_" filename prefix for "550_" to request larger dimensions.
    $recipe->photo_url = preg_replace("/\\/l_([^\\/]+)/", "/550_\$1", $recipe->photo_url);

    return $recipe;
}
/**
 * Parse a recipe page, rebuilding ingredients and overriding notes and photo.
 *
 * Starts from RecipeParser_Parser_MicrodataSchema::parse(), then re-reads the
 * ingredient list from div#recipe-ingredients, the notes from
 * #recipe-intronote, and the photo from the first img[itemprop="image"].
 *
 * @param string $html Page markup.
 * @param string $url  Page address; also handed to formatPhotoUrl().
 * @return object Recipe returned by the schema parser, with the overrides applied.
 */
public static function parse($html, $url)
{
    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    libxml_use_internal_errors(true);
    $encoded = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . $encoded);
    $xpath = new DOMXPath($dom);

    // Ingredients
    $recipe->resetIngredients();
    foreach ($xpath->query('//div[@id = "recipe-ingredients"]/*') as $child) {
        switch ($child->nodeName) {
            case 'p':
                $text = trim($child->nodeValue);
                $rows = explode("\n", $text);

                if (count($rows) > 1) {
                    // Older recipes jumble everything into one <p>: treat the
                    // first line as the section header, the rest as ingredients.
                    foreach ($rows as $idx => $row) {
                        $row = trim($row);
                        if ($idx === 0) {
                            $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($row));
                        } else {
                            $recipe->appendIngredient($row);
                        }
                    }
                } else {
                    // Plain section header; its ingredients follow in <ul> elements.
                    $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($text));
                }
                break;

            case 'ul':
                foreach ($xpath->query('./li[@class = "ingredient"]', $child) as $item) {
                    $recipe->appendIngredient(trim($item->nodeValue));
                }
                break;
        }
    }

    // Notes
    $intro = $xpath->query('//*[@id="recipe-intronote"]');
    if ($intro->length) {
        $recipe->notes = RecipeParser_Text::formatAsParagraphs($intro->item(0)->nodeValue);
    }

    // Photo URL to replace og:image
    $images = $xpath->query('//img[@itemprop="image"]');
    if ($images->length) {
        $src = $images->item(0)->getAttribute("src");
        $recipe->photo_url = RecipeParser_Text::formatPhotoUrl($src, $url);
    }

    return $recipe;
}
/**
 * Parse a recipe page, replacing ingredients with a staged list if present.
 *
 * Starts from RecipeParser_Parser_Microformat::parse(). When the page has a
 * <dl id="stages">, each <dt> becomes an ingredient section and each
 * class="ingredient" element inside a <dd> becomes an ingredient.
 *
 * @param string $html Page markup.
 * @param string $url  Passed through to the microformat parser.
 * @return object Recipe returned by the microformat parser, possibly with ingredients replaced.
 */
public static function parse($html, $url)
{
    // Baseline: everything the generic hrecipe parser can find.
    $recipe = RecipeParser_Parser_Microformat::parse($html, $url);

    // Collect libxml errors internally to avoid mismatched-tag warnings.
    libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $xpath = new DOMXPath($dom);

    // Multi-stage ingredients
    $stages = $xpath->query('//dl[@id="stages"]/*');
    if (!$stages->length) {
        return $recipe;
    }

    $recipe->resetIngredients();
    foreach ($stages as $stage) {
        if ($stage->nodeName === 'dt') {
            $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($stage->nodeValue));
        } elseif ($stage->nodeName === 'dd') {
            foreach ($xpath->query('.//*[@class="ingredient"]', $stage) as $item) {
                $recipe->appendIngredient(trim($item->nodeValue));
            }
        }
    }

    return $recipe;
}
/**
 * Parse a recipe page, then clean up the photo URL and re-read the yield.
 *
 * Starts from RecipeParser_Parser_MicrodataSchema::parse(). A photo URL that
 * points at one of the FDC logo images is blanked; any other photo URL has
 * "/thumbs/" swapped for "/large/". The yield is taken from the first
 * class="yield" element, or failing that from the "value" element inside
 * class="servings".
 *
 * @param string $html Page markup.
 * @param string $url  Passed through to the schema parser.
 * @return object Recipe returned by the schema parser, with the overrides applied.
 */
public static function parse($html, $url)
{
    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    // Turn off libxml errors to prevent mismatched tag warnings.
    libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Photo -- skip logo if it was used in place of photo. The string
    // functions only run when there is a photo URL to inspect.
    if ($recipe->photo_url) {
        $is_logo = strpos($recipe->photo_url, "FDC_Logo_vertical.png") !== false
            || strpos($recipe->photo_url, "FDC_share-logo.png") !== false;

        $recipe->photo_url = $is_logo
            ? ''
            : str_replace("/thumbs/", "/large/", $recipe->photo_url);
    }

    // Yield -- prefer 'yield', fall back to the number of 'servings'.
    $nodes = $xpath->query('//*[@class="yield"]');
    if (!$nodes->length) {
        $nodes = $xpath->query('//*[@class="servings"]//*[@class="value"]');
    }
    if ($nodes->length) {
        $recipe->yield = RecipeParser_Text::formatYield($nodes->item(0)->nodeValue);
    }

    return $recipe;
}
/**
 * Parse a recipe page and rebuild its ingredient list (epicurious overrides).
 *
 * Starts from RecipeParser_Parser_MicrodataSchema::parse(), then walks the
 * direct children of div#ingredients: <strong> elements open a new
 * ingredient section, and the <li> children of each <ul> become ingredients.
 *
 * @param string $html Page markup.
 * @param string $url  Passed through to the schema parser.
 * @return object Recipe returned by the schema parser, with ingredients replaced.
 */
public static function parse($html, $url)
{
    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    // Collect libxml errors internally to avoid mismatched-tag warnings.
    libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $xpath = new DOMXPath($dom);

    // Ingredients
    $recipe->resetIngredients();
    foreach ($xpath->query('//div[@id = "ingredients"]/*') as $child) {
        $tag = $child->nodeName;

        if ($tag === 'strong') {
            // Section name
            $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($child->nodeValue));
        } elseif ($tag === 'ul') {
            // Only <li> children count; text nodes between them are skipped.
            foreach ($child->childNodes as $item) {
                if ($item->nodeName !== 'li') {
                    continue;
                }
                $recipe->appendIngredient(trim($item->nodeValue));
            }
        }
    }

    return $recipe;
}
/**
 * Parse a recipe page, then override photo, ingredients and credits.
 *
 * Starts from RecipeParser_Parser_MicrodataDataVocabulary::parse(). The
 * ingredient list is rebuilt from the first <ul> directly under
 * div.recipeDetails: <li itemprop="ingredient"> items become ingredients,
 * any other <li> becomes a section name.
 *
 * @param string $html Page markup.
 * @param string $url  Passed through to the data-vocabulary parser.
 * @return object Recipe returned by the data-vocabulary parser, with the overrides applied.
 */
public static function parse($html, $url)
{
    $recipe = RecipeParser_Parser_MicrodataDataVocabulary::parse($html, $url);

    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Photo URL, use larger version found on MyRecipes
    $recipe->photo_url = str_replace('-l.jpg', '-x.jpg', $recipe->photo_url);

    // Ingredients
    $recipe->resetIngredients();
    $lists = $xpath->query('//div[@class="recipeDetails"]/ul');

    // item(0) is null when the list is missing, so only walk it when the
    // query actually matched something.
    if ($lists->length) {
        foreach ($lists->item(0)->childNodes as $li) {
            if ($li->nodeName != 'li') {
                continue;
            }

            $text = RecipeParser_Text::formatAsOneLine($li->nodeValue);
            if ($li->getAttribute('itemprop') == 'ingredient') {
                $text = trim(str_replace('$Click to see savings', '', $text));
                $recipe->appendIngredient($text);
            } else {
                $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($text));
            }
        }
    }

    // Credits
    $nodes = $xpath->query('//*[@itemprop="author"]');
    if ($nodes->length) {
        $recipe->credits = trim($nodes->item(0)->nodeValue);
    }

    return $recipe;
}
/**
 * Parse a recipe page and rebuild its ingredient list.
 *
 * Starts from RecipeParser_Parser_MicrodataSchema::parse(), then looks under
 * #recipe-ingredients for one of two known container layouts. <h3> children
 * become ingredient sections; every child node of a <ul> is appended as an
 * ingredient.
 *
 * @param string $html Page markup.
 * @param string $url  Passed through to the schema parser.
 * @return object Recipe returned by the schema parser, with ingredients replaced.
 */
public static function parse($html, $url)
{
    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    // Collect libxml errors internally to avoid mismatched-tag warnings.
    libxml_use_internal_errors(true);
    $encoded = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . $encoded);
    $xpath = new DOMXPath($dom);

    // Ingredients
    $recipe->resetIngredients();

    // Container layouts, tried in order; the first one with matches wins.
    $layouts = array(
        '//*[@id="recipe-ingredients"]//div[@class="view-content"]/*',
        '//*[@id="recipe-ingredients"]//div[@class="ingredient-lists separator-serated tab-content"]/*',
    );
    $nodes = null;
    foreach ($layouts as $layout) {
        $nodes = $xpath->query($layout);
        if ($nodes && $nodes->length) {
            break;
        }
    }

    foreach ($nodes as $node) {
        if ($node->nodeName === 'h3') {
            $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($node->nodeValue));
        } elseif ($node->nodeName === 'ul') {
            foreach ($node->childNodes as $entry) {
                $recipe->appendIngredient(RecipeParser_Text::formatAsOneLine($entry->nodeValue));
            }
        }
    }

    return $recipe;
}
/**
 * Build a recipe from scratch out of a page's "field-*" markup.
 *
 * Reads the title (#page-title), prep/total times (field-recipe-time),
 * yield (field-yield), ingredients (field-ingredients), instructions
 * (<li> under field-instructions) and the og:image photo.
 *
 * @param string $html Page markup.
 * @param string $url  Page address, used to resolve the photo URL.
 * @return RecipeParser_Recipe
 */
public static function parse($html, $url)
{
    $recipe = new RecipeParser_Recipe();

    // Collect libxml errors internally to avoid mismatched-tag warnings.
    libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $xpath = new DOMXPath($dom);

    // Title
    $title_nodes = $xpath->query('//*[@id="page-title"]');
    if ($title_nodes->length) {
        $recipe->title = RecipeParser_Text::formatTitle($title_nodes->item(0)->nodeValue);
    }

    // Times -- each entry is labelled either "Hands-On Time" or "Total Time".
    foreach ($xpath->query('//*[@class="field-recipe-time"]') as $time_node) {
        $text = RecipeParser_Text::formatAsOneLine($time_node->nodeValue);
        if (strpos($text, "Hands-On Time") !== false) {
            $recipe->time["prep"] = RecipeParser_Times::toMinutes(str_replace("Hands-On Time ", "", $text));
        } elseif (strpos($text, "Total Time") !== false) {
            $recipe->time["total"] = RecipeParser_Times::toMinutes(str_replace("Total Time ", "", $text));
        }
    }

    // Yield
    $yield_nodes = $xpath->query('//*[@class="field-yield"]');
    if ($yield_nodes->length) {
        $recipe->yield = RecipeParser_Text::formatYield($yield_nodes->item(0)->nodeValue);
    }

    // Ingredients
    foreach ($xpath->query('//*[@class="field-ingredients"]') as $ingredient) {
        $recipe->appendIngredient(RecipeParser_Text::formatAsOneLine($ingredient->nodeValue));
    }

    // Instructions
    foreach ($xpath->query('//*[@class="field-instructions"]//li') as $step) {
        $recipe->appendInstruction(RecipeParser_Text::formatAsOneLine($step->nodeValue));
    }

    // Photo
    $image_nodes = $xpath->query('//*[@property="og:image"]');
    if ($image_nodes->length) {
        $src = $image_nodes->item(0)->getAttribute('content');
        $recipe->photo_url = RecipeParser_Text::relativeToAbsolute($src, $url);
    }

    return $recipe;
}
/**
 * Parse a recipe page, then override title, yield, times and photo.
 *
 * Starts from RecipeParser_Parser_Microformat::parse(). Title, yield and
 * times are re-read from inside the first <section role="main">; the photo
 * comes from the first <img> directly under section.content-unit. Finally a
 * "Something: " lead-in is dropped from the title.
 *
 * @param string $html Page markup.
 * @param string $url  Page address, used to resolve the photo URL.
 * @return object Recipe returned by the microformat parser, with the overrides applied.
 */
public static function parse($html, $url)
{
    $recipe = RecipeParser_Parser_Microformat::parse($html, $url);

    libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $xpath = new DOMXPath($dom);

    $main_sections = $xpath->query('//section[@role="main"]');
    if ($main_sections->length) {
        $main = $main_sections->item(0);

        // Title is not marked up with class="fn"
        $found = $xpath->query('.//h1', $main);
        if ($found->length) {
            $recipe->title = RecipeParser_Text::formatTitle($found->item(0)->nodeValue);
        }

        // Yield -- class names are conflated; the itemprop variant, checked
        // second, wins when both are present.
        $found = $xpath->query('.//*[@class="info yield"]', $main);
        if ($found->length) {
            $recipe->yield = RecipeParser_Text::formatYield($found->item(0)->nodeValue);
        }
        $found = $xpath->query('.//span[@itemprop="recipeYield"]', $main);
        if ($found->length) {
            $recipe->yield = RecipeParser_Text::formatYield($found->item(0)->nodeValue);
        }

        // Prep time -- class names are conflated
        $found = $xpath->query('.//*[@class="info preptime"]', $main);
        if ($found->length) {
            $recipe->time['prep'] = RecipeParser_Times::toMinutes($found->item(0)->nodeValue);
        }

        // Total time / duration -- class names are conflated
        $found = $xpath->query('.//*[@class="info duration"]', $main);
        if ($found->length) {
            $recipe->time['total'] = RecipeParser_Times::toMinutes($found->item(0)->nodeValue);
        }
    }

    // Photo
    $images = $xpath->query('//section[@class="content-unit"]/img');
    if ($images->length) {
        $src = $images->item(0)->getAttribute('src');
        if ($src) {
            $recipe->photo_url = RecipeParser_Text::relativeToAbsolute($src, $url);
        }
    }

    // Remove recipe title intros -- e.g. "Sunday Dinner: Pork Ribs" becomes "Pork Ribs"
    if (strpos($recipe->title, ": ") !== false) {
        $recipe->title = preg_replace("/^[^:]+: (.+)/", "\$1", $recipe->title);
    }

    return $recipe;
}
/**
 * Build a bookmark-status RecipeStruct from a page's title and og:image.
 *
 * The title is the og:title meta content when it formats to a non-empty
 * string, otherwise the <title> tag, otherwise "Recipe from {$url}".
 *
 * @param string $html Page markup.
 * @param string $url  Page address; stored on the struct and used for the photo URL.
 * @return RecipeStruct
 */
public static function getBookmarkAsRecipeStruct($html, $url)
{
    // Collect libxml errors internally to avoid mismatched-tag warnings.
    libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $xpath = new DOMXPath($dom);

    // This recipe will be stored as a bookmark
    $recipe = new RecipeStruct();
    $recipe->url = $url;
    $recipe->status = "bookmark";

    // Candidate title from the <title> tag
    $from_tag = "";
    $found = $xpath->query('//title');
    if ($found->length) {
        $candidate = RecipeParser_Text::formatTitle($found->item(0)->nodeValue);
        if ($candidate) {
            $from_tag = $candidate;
        }
    }

    // Candidate title from the Open Graph meta tag
    $from_og = "";
    $found = $xpath->query('//meta[@property="og:title"]');
    if ($found->length) {
        $candidate = RecipeParser_Text::formatTitle($found->item(0)->getAttribute("content"));
        if ($candidate) {
            $from_og = $candidate;
        }
    }

    // Which title string to use?
    if ($from_og) {
        $recipe->title = $from_og;
    } elseif ($from_tag) {
        $recipe->title = $from_tag;
    } else {
        $recipe->title = "Recipe from {$url}";
    }

    // Get image from Open Graph tag
    $found = $xpath->query('//meta[@property="og:image"]');
    if ($found->length) {
        $src = $found->item(0)->getAttribute("content");
        if ($src) {
            $recipe->photo_url = RecipeParser_Text::formatPhotoUrl($src, $url);
        }
    }

    return $recipe;
}
/**
 * Parse a recipe page, then override yield, times and ingredients.
 *
 * Starts from RecipeParser_Parser_Microformat::parse(). Each time is read
 * from the `title` attribute of the SECOND <span> under the matching
 * container (prepTime, rspec-cook-time, totaltime) and converted with
 * iso8601ToMinutes(). A time is left untouched when that second span is
 * missing.
 *
 * @param string $html Page markup.
 * @param string $url  Passed through to the microformat parser.
 * @return object Recipe returned by the microformat parser, with the overrides applied.
 */
public static function parse($html, $url)
{
    $recipe = RecipeParser_Parser_Microformat::parse($html, $url);

    // Turn off libxml errors to prevent mismatched tag warnings.
    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Yield
    $nodes = $xpath->query('//*[contains(concat(" ", normalize-space(@class), " "), " yield ")]');
    if ($nodes->length) {
        $recipe->yield = RecipeParser_Text::formatYield($nodes->item(0)->nodeValue);
    }

    // Times -- recipe time key => class token of the container element.
    $time_classes = array(
        'prep'  => 'prepTime',
        'cook'  => 'rspec-cook-time',
        'total' => 'totaltime',
    );
    foreach ($time_classes as $key => $class) {
        $spans = $xpath->query('//*[contains(concat(" ", normalize-space(@class), " "), " ' . $class . ' ")]/span');

        // The value lives on the second span. item(1) is null when fewer
        // than two spans matched, so test the node itself rather than the
        // list length.
        $span = $spans->item(1);
        if ($span !== null) {
            $recipe->time[$key] = RecipeParser_Text::iso8601ToMinutes($span->getAttribute("title"));
        }
    }

    // Ingredients
    $recipe->resetIngredients();
    $ing_nodes = $xpath->query('//*[contains(concat(" ", normalize-space(@class), " "), " ingredients ")]/*');
    foreach ($ing_nodes as $ing_node) {
        // Dividers carry the section names.
        if ($ing_node->getAttribute('class') == "ingr-divider") {
            $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($ing_node->nodeValue));
            continue;
        }

        // Every child node of a <ul> is appended as an ingredient
        // (child nodes should all be <li>).
        if ($ing_node->nodeName == 'ul') {
            foreach ($ing_node->childNodes as $node) {
                $recipe->appendIngredient(trim($node->nodeValue));
            }
        }
    }

    return $recipe;
}
/**
 * Parse a recipe page, then patch title, photo, credits, times and ingredients.
 *
 * Starts from RecipeParser_Parser_MicrodataDataVocabulary::parse(). Fills a
 * missing title from og:title, swaps the photo suffix "-l.jpg" for "-x.jpg",
 * reads credits from a "More from ..." heading, reads prep/cook/total times
 * from recipe-time-info elements, and truncates each ingredient at its first
 * "$" (when that "$" is not the first character).
 *
 * @param string $html Page markup.
 * @param string $url  Passed through to the data-vocabulary parser.
 * @return object Recipe returned by the data-vocabulary parser, with the overrides applied.
 */
public static function parse($html, $url)
{
    $recipe = RecipeParser_Parser_MicrodataDataVocabulary::parse($html, $url);

    libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $xpath = new DOMXPath($dom);

    // Title missing?
    if (!$recipe->title) {
        $meta = $xpath->query('//meta[@property="og:title"]');
        if ($meta->length) {
            $recipe->title = RecipeParser_Text::formatTitle($meta->item(0)->getAttribute("content"));
        }
    }

    // Photo URL, use larger version found on MyRecipes
    $recipe->photo_url = str_replace('-l.jpg', '-x.jpg', $recipe->photo_url);

    // Credits
    $headings = $xpath->query('//*[@class="link-list"]/h4');
    if ($headings->length) {
        $heading = trim($headings->item(0)->nodeValue);
        if (strpos($heading, "More from") === 0) {
            $recipe->credits = str_replace("More from ", "", $heading);
        }
    }

    // Times -- recipe time key => lower-case label prefix.
    $labels = array('prep' => 'prep: ', 'cook' => 'cook: ', 'total' => 'total: ');
    foreach ($xpath->query('//*[@class="recipe-time-info"]') as $time_node) {
        $text = trim(strtolower($time_node->nodeValue));
        foreach ($labels as $key => $label) {
            if (strpos($text, $label) !== 0) {
                continue;
            }
            // The stripped text is carried into the remaining label checks.
            $text = str_replace($label, "", $text);
            $recipe->time[$key] = RecipeParser_Times::toMinutes($text);
        }
    }

    // Clean up each of the ingredients to remove "$Click to see savings".
    // These don't come through in the curl'ed test files.
    $section_count = count($recipe->ingredients);
    for ($s = 0; $s < $section_count; $s++) {
        $item_count = count($recipe->ingredients[$s]['list']);
        for ($k = 0; $k < $item_count; $k++) {
            $cut = strpos($recipe->ingredients[$s]['list'][$k], "\$");
            if ($cut > 0) {
                $recipe->ingredients[$s]['list'][$k] = substr($recipe->ingredients[$s]['list'][$k], 0, $cut);
            }
        }
    }

    return $recipe;
}
/**
 * Build a recipe from scratch out of a Cooks.com-style hrecipe table.
 *
 * The title comes from the <title> tag with the "Cooks.com" boilerplate
 * removed. Ingredients, section titles and instructions all live in <div>
 * elements under table.hrecipe and are told apart by their style/class
 * attributes.
 *
 * @param string $html Page markup.
 * @param string $url  Not used by this method.
 * @return RecipeParser_Recipe
 */
public static function parse($html, $url)
{
    $recipe = new RecipeParser_Recipe();

    // Collect libxml errors internally to avoid mismatched-tag warnings.
    libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $xpath = new DOMXPath($dom);

    // Title
    $title_tags = $dom->getElementsByTagName('title');
    if ($title_tags->length) {
        $title = $title_tags->item(0)->nodeValue;
        $title = trim(str_replace("Cooks.com - Recipe - ", "", $title));
        $title = trim(str_replace(" - Recipe - Cooks.com", "", $title));
        $recipe->title = $title;
    }

    // These divs hold all ingredients, section titles, and instructions.
    foreach ($xpath->query('//table[@class="hrecipe"]//td/div') as $block) {
        // Ingredients: a div styled with black text.
        if (stripos($block->getAttribute("style"), "color: BLACK;") !== false) {
            foreach ($xpath->query('./span[@class = "ingredient"]', $block) as $item) {
                $recipe->appendIngredient($item->nodeValue);
            }
            continue;
        }

        $class = $block->getAttribute('class');

        if ($class === "instructions") {
            // Each child node becomes one instruction.
            foreach ($block->childNodes as $piece) {
                $recipe->appendInstruction(RecipeParser_Text::formatAsOneLine($piece->nodeValue));
            }
        } elseif ($class === "section") {
            // Section title; it also opens an instructions section once any
            // instructions have been collected.
            $section = RecipeParser_Text::formatSectionName($block->nodeValue);
            $recipe->addIngredientsSection($section);
            if (count($recipe->instructions) > 0) {
                $recipe->addInstructionsSection($section);
            }
        }
    }

    return $recipe;
}
/**
 * Parse a recipe page, then override photo, yield and ingredients.
 *
 * Starts from RecipeParser_Parser_MicrodataSchema::parse(). The yield is
 * assembled from option.select-title plus p.yieldUnits-txt (defaulting to
 * "servings" when that paragraph is empty); ingredients are re-read from
 * the children of div[class="pod ingredients"].
 *
 * @param string $html Page markup.
 * @param string $url  Passed through to the schema parser.
 * @return object Recipe returned by the schema parser, with the overrides applied.
 */
public static function parse($html, $url)
{
    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    // Collect libxml errors internally to avoid mismatched-tag warnings.
    libxml_use_internal_errors(true);
    $encoded = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . $encoded);
    $xpath = new DOMXPath($dom);

    // Photo -- skip logo if it was used in place of photo
    if (strpos($recipe->photo_url, "FDC_Logo_vertical.png") !== false) {
        $recipe->photo_url = '';
    }
    if ($recipe->photo_url) {
        $recipe->photo_url = str_replace("/thumbs/", "/large/", $recipe->photo_url);
    }

    // Yield -- quantity first, then the unit text.
    $yield_text = '';
    $quantity = $xpath->query('//option[@class="select-title"]');
    if ($quantity->length) {
        $yield_text .= trim($quantity->item(0)->nodeValue);
    }
    $units = $xpath->query('//p[@class="yieldUnits-txt"]');
    if ($units->length) {
        $unit_text = trim($units->item(0)->nodeValue);
        $yield_text .= ' ' . ($unit_text ? $unit_text : 'servings');
    }
    $recipe->yield = trim($yield_text);

    // Ingredients (custom because of duplicate class attributes for "ingredients")
    $recipe->resetIngredients();
    foreach ($xpath->query('//div[@class = "pod ingredients"]/*') as $child) {
        // <h3> contains ingredient section names
        if ($child->nodeName === 'h3') {
            $recipe->addIngredientsSection(ucfirst(trim(strtolower($child->nodeValue))));
        }

        // Each <li> child of a <ul> is one ingredient.
        if ($child->nodeName === 'ul') {
            foreach ($child->childNodes as $item) {
                if ($item->nodeName !== 'li') {
                    continue;
                }
                $recipe->appendIngredient(RecipeParser_Text::formatAsOneLine($item->nodeValue));
            }
        }
    }

    return $recipe;
}
/**
 * Parse a recipe page whose recipe lives inside span[itemprop="articleBody"].
 *
 * Starts from RecipeParser_Parser_MicrodataSchema::parse(), then overrides
 * the title (first <strong> inside an <h3>), the yield (first paragraph
 * starting with "Yield" or "Serve"), appends ingredients (<ul>/<li>) and
 * instructions (<ol>/<li>), and sets the photo from og:image.
 *
 * @param string $html Page markup.
 * @param string $url  Passed through to the schema parser.
 * @return object Recipe returned by the schema parser, with the overrides applied.
 */
public static function parse($html, $url)
{
    // Baseline: everything the standard microdata parser can find.
    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    // Collect libxml errors internally to avoid mismatched-tag warnings.
    libxml_use_internal_errors(true);
    $encoded = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . $encoded);
    $xpath = new DOMXPath($dom);

    // ---- OVERRIDES

    // Title
    $headings = $xpath->query('//h3//strong');
    if ($headings->length) {
        $recipe->title = RecipeParser_Text::formatAsOneLine($headings->item(0)->nodeValue);
    }

    // Yield -- the first paragraph that starts with "Yield" or "Serve".
    foreach ($xpath->query('//span[@itemprop="articleBody"]//p') as $paragraph) {
        $text = trim($paragraph->nodeValue);
        if (strpos($text, "Yield") === 0 || strpos($text, "Serve") === 0) {
            $recipe->yield = RecipeParser_Text::formatYield($text);
            break;
        }
    }

    // Ingredients
    foreach ($xpath->query('//span[@itemprop="articleBody"]//ul/li') as $item) {
        $recipe->appendIngredient(RecipeParser_Text::formatAsOneLine($item->nodeValue));
    }

    // Instructions
    foreach ($xpath->query('//span[@itemprop="articleBody"]//ol/li') as $step) {
        $recipe->appendInstruction(RecipeParser_Text::formatAsOneLine($step->nodeValue));
    }

    // Image -- content of the first og:image meta tag.
    $images = $xpath->query('//meta[@property="og:image"]');
    if ($images->length) {
        $recipe->photo_url = $images->item(0)->getAttribute("content");
    }

    return $recipe;
}
/**
 * Parse a recipe page, then tidy the title, instructions and description.
 *
 * Starts from RecipeParser_Parser_MicrodataSchema::parse(). Strips a trailing
 * " Recipe" / " Recipe - CHOW.com" from the title, removes leading step
 * numbers from instructions, falls back to page markup when the first
 * ingredient or instruction section is empty, and removes boilerplate lines
 * from the description.
 *
 * @param string $html Page markup.
 * @param string $url  Passed through to the schema parser.
 * @return object Recipe returned by the schema parser, with the overrides applied.
 */
public static function parse($html, $url)
{
    // Baseline: all of the standard bits we can find.
    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    // Collect libxml errors internally to avoid mismatched-tag warnings.
    libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $xpath = new DOMXPath($dom);

    // Titles include "recipe"
    if (preg_match("/ Recipe( - CHOW.com)?\$/", $recipe->title)) {
        $stripped = preg_replace("/(.*) Recipe( - CHOW.com)?\$/", "\$1", $recipe->title);
        $recipe->title = trim($stripped);
    }

    // Strip leading numbers from instructions
    $section_count = count($recipe->instructions);
    for ($s = 0; $s < $section_count; $s++) {
        $step_count = count($recipe->instructions[$s]['list']);
        for ($k = 0; $k < $step_count; $k++) {
            $recipe->instructions[$s]['list'][$k] = preg_replace(
                "/^\\d+(\\w.*)\$/",
                "\$1",
                $recipe->instructions[$s]['list'][$k]
            );
        }
    }

    // Ingredients (if none parsed)
    if (!count($recipe->ingredients[0]['list'])) {
        foreach ($xpath->query('//*[@id="ingredients_list"]//li') as $item) {
            $recipe->appendIngredient(RecipeParser_Text::formatAsOneLine($item->nodeValue));
        }
    }

    // Instructions (if none parsed)
    if (!count($recipe->instructions[0]['list'])) {
        foreach ($xpath->query('//*[@itemprop="recipeInstructions"]') as $step) {
            $recipe->appendInstruction(RecipeParser_Text::formatAsOneLine($step->nodeValue));
        }
    }

    // Cleanup description: drop boilerplate lines, then collapse the gaps.
    if ($recipe->description) {
        $text = preg_replace(
            "/^(Read our review of|This (dish|recipe) was featured as part|See more recipes) .*\$/m",
            "",
            $recipe->description
        );
        $text = preg_replace("/[\r\n]{3,}/", "\n\n", $text);
        $recipe->description = trim($text);
    }

    return $recipe;
}
/**
 * Cleanup for clipped HTML prior to parsing with RecipeParser.
 *
 * Normalizes line breaks to "\n", turns non-breaking spaces (the "&nbsp;"
 * and "&#160;" entities and the raw UTF-8 byte pair C2 A0) into plain
 * spaces, maps code points U+0096 and U+0097 to "-", and finally passes the
 * markup through RecipeParser_Text::stripTagAndContents('script', ...).
 *
 * @param string $html HTML
 * @return string HTML
 */
public static function cleanupClippedRecipeHtml($html)
{
    // Normalize line breaks
    $html = preg_replace('/(\\r\\n|\\r)/', "\n", $html);

    // Get rid of non-breaking spaces: named entity, numeric entity, and the
    // UTF-8 encoded character itself.
    $html = str_replace(array('&nbsp;', '&#160;', "\xC2\xA0"), ' ', $html);

    // ndash, mdash (bonappetit). With the /u modifier preg_replace() returns
    // null when the subject is not valid UTF-8; keep the input in that case
    // instead of discarding the whole document.
    $dashed = preg_replace('/[\\x{0096}-\\x{0097}]/u', '-', $html);
    if ($dashed !== null) {
        $html = $dashed;
    }

    // Strip out script tags so they don't accidentally get executed if we ever
    // display clipped content to end-users.
    $html = RecipeParser_Text::stripTagAndContents('script', $html);

    return $html;
}
/**
 * Parse a recipe page whose body mixes yield, ingredients and instructions.
 *
 * Starts from RecipeParser_Parser_MicrodataDataVocabulary::parse(), then
 * walks the child nodes of the first body field item. A node mentioning
 * makes/yields/serves/servings together with a digit sets the yield (when
 * none is set yet). The literal markers "INGREDIENTS" and "INSTRUCTIONS"
 * switch what the following nodes are appended as.
 *
 * @param string $html Page markup.
 * @param string $url  Passed through to the data-vocabulary parser.
 * @return object Recipe returned by the data-vocabulary parser, with the additions applied.
 */
public static function parse($html, $url)
{
    $recipe = RecipeParser_Parser_MicrodataDataVocabulary::parse($html, $url);

    libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $xpath = new DOMXPath($dom);

    // Yield, Ingredients, Instructions
    $bodies = $xpath->query('//*[@class="field field-name-body field-type-text-with-summary field-label-hidden"]//*[@class="field-item even"]');
    if (!$bodies->length) {
        return $recipe;
    }

    $seen_ingredients = false;
    $seen_instructions = false;

    foreach ($bodies->item(0)->childNodes as $child) {
        $text = trim($child->nodeValue);

        // Yield
        $looks_like_yield = preg_match("/(makes|yields|serves|servings)/i", $text)
            && preg_match("/\\d/", $text);
        if (!$recipe->yield && $looks_like_yield) {
            $recipe->yield = RecipeParser_Text::formatYield($text);
            continue;
        }

        // Marker lines switch the mode and are not stored themselves.
        if ($text === "INGREDIENTS") {
            $seen_ingredients = true;
            continue;
        }
        if ($text === "INSTRUCTIONS") {
            $seen_instructions = true;
            continue;
        }

        // Nothing is collected until the ingredients marker has been seen.
        if (!$seen_ingredients) {
            continue;
        }

        $text = RecipeParser_Text::formatAsOneLine($text);
        if ($seen_instructions) {
            $recipe->appendInstruction(RecipeParser_Text::stripLeadingNumbers($text));
        } else {
            $recipe->appendIngredient($text);
        }
    }

    return $recipe;
}
/**
 * Parse a recipe page and rebuild its instructions.
 *
 * Starts from RecipeParser_Parser_MicrodataSchema::parse(), clears the
 * instructions, and appends one instruction per
 * itemprop="recipeInstructions" element.
 *
 * @param string $html Page markup.
 * @param string $url  Passed through to the schema parser.
 * @return object Recipe returned by the schema parser, with instructions replaced.
 */
public static function parse($html, $url)
{
    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $xpath = new DOMXPath($dom);

    // Overrides for data that isn't captured by their implementation of Schema.org.

    // Instructions
    $recipe->resetInstructions();
    foreach ($xpath->query('//*[@itemprop="recipeInstructions"]') as $step) {
        $recipe->appendInstruction(RecipeParser_Text::formatAsOneLine($step->nodeValue));
    }

    return $recipe;
}
/**
 * Parse a recipe page and rebuild its ingredient list.
 *
 * Starts from RecipeParser_Parser_MicrodataRdfDataVocabulary::parse(),
 * clears the ingredients, and appends one ingredient per element whose
 * class attribute is exactly "ingredient".
 *
 * @param string $html Page markup.
 * @param string $url  Passed through to the RDF data-vocabulary parser.
 * @return object Recipe returned by the RDF data-vocabulary parser, with ingredients replaced.
 */
public static function parse($html, $url)
{
    $recipe = RecipeParser_Parser_MicrodataRdfDataVocabulary::parse($html, $url);

    libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $xpath = new DOMXPath($dom);

    // Ingredients
    $recipe->resetIngredients();
    foreach ($xpath->query('//*[@class="ingredient"]') as $item) {
        $recipe->appendIngredient(RecipeParser_Text::formatAsOneLine($item->nodeValue));
    }

    return $recipe;
}
/**
 * Parse a recipe page, then set Food52 credits and the author notes.
 *
 * Starts from RecipeParser_Parser_MicrodataSchema::parse(). Credits become
 * "Food52" or "Food52 (<existing credits>)". Notes are assembled from the
 * direct children of the first span.recipe-note, with <br> turned into line
 * breaks, and always overwrite `notes` (with whatever formatAsParagraphs()
 * returns for an empty string when no note span exists).
 *
 * @param string $html Page markup.
 * @param string $url  Passed through to the schema parser.
 * @return object Recipe returned by the schema parser, with the overrides applied.
 */
public static function parse($html, $url)
{
    // Baseline: everything the standard microdata parser can find.
    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    // Collect libxml errors internally to avoid mismatched-tag warnings.
    libxml_use_internal_errors(true);
    $encoded = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . $encoded);
    $xpath = new DOMXPath($dom);

    // ---- OVERRIDES

    // Credits
    $recipe->credits = $recipe->credits
        ? "Food52 (" . $recipe->credits . ")"
        : "Food52";

    // Notes
    $note_text = "";
    $inline_tags = array("#text", "span", "strong", "b", "em", "i", "a");

    $note_spans = $xpath->query('.//span[@class="recipe-note"]');
    if ($note_spans->length) {
        // Walk childNodes (not an element query) so #text nodes are included.
        foreach ($note_spans->item(0)->childNodes as $piece) {
            if ($piece->nodeName === "br") {
                $note_text .= "\n";
            } elseif (in_array($piece->nodeName, $inline_tags, true)) {
                $note_text .= $piece->nodeValue . " ";
            }
        }
    }

    $note_text = preg_replace("/^Author Notes:\\s*/", "", $note_text);
    $recipe->notes = RecipeParser_Text::formatAsParagraphs($note_text);

    return $recipe;
}
/**
 * Parse a recipe page, then override yield, ingredients and instructions.
 *
 * Starts from RecipeParser_Parser_Microformat::parse(). The yield is the
 * `value` attribute of the first element named "resizeTo" plus " servings".
 * Ingredients are rebuilt from elements carrying the "ingredient" class
 * token. Instructions are rebuilt from the <p> children of
 * div.display-field, where a paragraph equal to its own upper-cased form
 * opens a new instructions section.
 *
 * @param string $html Page markup.
 * @param string $url  Passed through to the microformat parser.
 * @return object Recipe returned by the microformat parser, with the overrides applied.
 */
public static function parse($html, $url)
{
    // Baseline: everything the standard hrecipe parser can find.
    $recipe = RecipeParser_Parser_Microformat::parse($html, $url);

    // Collect libxml errors internally to avoid mismatched-tag warnings.
    libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $xpath = new DOMXPath($dom);

    // Yield
    $resize_inputs = $xpath->query('//*[@name="resizeTo"]');
    if ($resize_inputs->length) {
        $servings = trim($resize_inputs->item(0)->getAttribute("value")) . " servings";
        $recipe->yield = RecipeParser_Text::formatYield($servings);
    }

    // Ingredients -- join the text of each child node with single spaces.
    $recipe->resetIngredients();
    $items = $xpath->query('//*[contains(concat(" ", normalize-space(@class), " "), " ingredient ")]');
    foreach ($items as $item) {
        $pieces = array();
        foreach ($item->childNodes as $piece) {
            $pieces[] = $piece->nodeValue;
        }
        $text = str_replace(" ; ", "; ", implode(' ', $pieces));
        $recipe->appendIngredient(RecipeParser_Text::formatAsOneLine($text));
    }

    // Instructions
    $recipe->resetInstructions();
    foreach ($xpath->query('//div[@class="display-field"]/p') as $paragraph) {
        $text = trim($paragraph->nodeValue);
        if ($text == strtoupper($text)) {
            $recipe->addInstructionsSection(RecipeParser_Text::formatSectionName($text));
        } else {
            $recipe->appendInstruction($text);
        }
    }

    return $recipe;
}
/**
 * Parse a recipe page, then rebuild grouped ingredients and read the notes.
 *
 * Starts from RecipeParser_Parser_MicrodataSchema::parse(). Ingredients are
 * always cleared, then refilled from the class="group" elements under
 * #ingredients: the first <h3> of a group names the section (when it formats
 * to a non-empty string) and each <li> is an ingredient. Notes come from the
 * first element with class "body-c note-text", minus the "Cook's Note" label.
 *
 * @param string $html Page markup.
 * @param string $url  Passed through to the schema parser.
 * @return object Recipe returned by the schema parser, with the overrides applied.
 */
public static function parse($html, $url)
{
    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
    $xpath = new DOMXPath($dom);

    // Ingredients
    $recipe->resetIngredients();
    foreach ($xpath->query('//*[@id="ingredients"]//*[@class="group"]') as $group) {
        // Section heading
        $headings = $xpath->query('.//h3', $group);
        if ($headings->length) {
            $section = RecipeParser_Text::formatSectionName($headings->item(0)->nodeValue);
            if (!empty($section)) {
                $recipe->addIngredientsSection($section);
            }
        }

        // Section ingredients
        foreach ($xpath->query('.//li', $group) as $item) {
            $recipe->appendIngredient(RecipeParser_Text::formatAsOneLine($item->nodeValue));
        }
    }

    // Notes
    $note_nodes = $xpath->query('.//*[@class = "body-c note-text"]');
    if ($note_nodes->length) {
        $recipe->notes = trim(str_replace("Cook's Note", '', $note_nodes->item(0)->nodeValue));
    }

    return $recipe;
}
/**
 * Parse a recipe page, filling in what Recipe.com's microformat markup omits.
 *
 * Starts from RecipeParser_Parser_Microformat::parse(). Falls back to the
 * "heading1" element for a missing title and to the <img> under "PB10" for
 * a missing photo, then overrides the yield (class="servingsize") and the
 * credits ("partnerName", with a "Recipe from" lead-in removed).
 *
 * @param string $html Page markup.
 * @param string $url  Page address; also handed to formatPhotoUrl().
 * @return object Recipe returned by the microformat parser, with the overrides applied.
 */
public static function parse($html, $url)
{
    $recipe = RecipeParser_Parser_Microformat::parse($html, $url);

    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // --- Items not properly defined in Recipe.com's microformat markup.

    // Title -- Fallback if "fn" is not defined.
    if (!$recipe->title) {
        $nodes = $xpath->query('//*[contains(concat(" ", normalize-space(@class), " "), " heading1 ")]');
        if ($nodes->length) {
            $recipe->title = trim($nodes->item(0)->nodeValue);
        }
    }

    // Photo -- Fallback if "photo" is not defined.
    if (!$recipe->photo_url) {
        $nodes = $xpath->query('//*[contains(concat(" ", normalize-space(@class), " "), " PB10 ")]/img');
        if ($nodes->length) {
            // This is a static method, so there is no $this; the page address
            // is the $url argument, which is kept intact by using a separate
            // local for the image source.
            $photo_url = $nodes->item(0)->getAttribute('src');
            $recipe->photo_url = RecipeParser_Text::formatPhotoUrl($photo_url, $url);
        }
    }

    // Yield
    $nodes = $xpath->query('//*[@class="servingsize"]');
    if ($nodes->length) {
        $recipe->yield = RecipeParser_Text::formatYield(trim($nodes->item(0)->nodeValue));
    }

    // Credits
    $nodes = $xpath->query('//*[contains(concat(" ", normalize-space(@class), " "), " partnerName ")]');
    if ($nodes->length) {
        $line = RecipeParser_Text::formatAsOneLine($nodes->item(0)->nodeValue);
        $line = preg_replace('/\\s*Recipe from\\s+(.*)$/', "\$1", $line);
        $recipe->credits = trim($line);
    }

    return $recipe;
}
/**
 * Parse a recipe page, starting from the data-vocabulary.org microdata
 * result, then reading ingredient lines from a plain list and the yield
 * from the Edamam data table.
 *
 * @param string $html Raw HTML of the recipe page.
 * @param string $url  URL the HTML came from (forwarded to the base parser).
 * @return mixed The recipe object returned by
 *               RecipeParser_Parser_MicrodataDataVocabulary::parse(), with
 *               ingredients and yield possibly extended.
 */
public static function parse($html, $url)
{
    $recipe = RecipeParser_Parser_MicrodataDataVocabulary::parse($html, $url);

    // Collect libxml parse errors internally rather than emitting warnings.
    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    //
    // Some ingredient lines on this site do not adhere to the usual
    // microdata formatting, so a regular list inside the higher-level
    // ingredients div is read as well.
    //
    // NOTE(review): this runs when $recipe->ingredients is NOT empty, which
    // reads oddly for a "fallback". Whether the condition is inverted depends
    // on the initial shape of $recipe->ingredients -- confirm against the
    // recipe class before changing it.
    //
    if (!empty($recipe->ingredients)) {
        $items = $xpath->query("//div[@class='content']/div[@class='ingredient']/ul/li");
        foreach ($items as $item) {
            $text = RecipeParser_Text::formatAsOneLine($item->nodeValue);
            if (empty($text)) {
                continue;
            }

            // Lines that look like section names start a new section;
            // everything else is an ingredient.
            if (RecipeParser_Text::matchSectionName($text)) {
                $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($text));
                continue;
            }
            $recipe->appendIngredient($text);
        }
    }

    //
    // Servings details are provided via Edamam's plugin table; only used
    // when the base parser found no yield.
    //
    if (!$recipe->yield) {
        $cells = $xpath->query("//table[@class='edamam-data']/tr[2]/td[2]");
        if ($cells->length) {
            $recipe->yield = RecipeParser_Text::formatYield($cells->item(0)->nodeValue);
        }
    }

    return $recipe;
}
/**
 * Return the HTML for a recipe URL, using a file cache when possible.
 *
 * A cached file is reused when it exists, is non-empty, and was modified
 * less than three days ago (86400 * 3 seconds). Otherwise the page is
 * downloaded, cleaned up, prefixed with a metadata comment, and written
 * to the cache file.
 *
 * @param string $url URL of the recipe page.
 * @return mixed Contents of the cached file, or the freshly downloaded and
 *               processed HTML.
 */
public static function downloadRecipeWithCache($url)
{
    // Cache lifetime in seconds (three days).
    $cache_ttl = 86400 * 3;

    // Target filename
    $filename = FileUtil::tempFilenameFromUrl($url);

    $cache_is_fresh = file_exists($filename)
        && filesize($filename) > 0
        && time() - filemtime($filename) < $cache_ttl;

    if ($cache_is_fresh) {
        error_log("Found file in cache: {$filename}");
        return file_get_contents($filename);
    }

    // Fetch and cleanup the HTML
    error_log("Downloading recipe from url: {$url}");
    $downloaded = FileUtil::downloadPage($url);
    $downloaded = RecipeParser_Text::forceUTF8($downloaded);
    $downloaded = RecipeParser_Text::cleanupClippedRecipeHtml($downloaded);

    // Prepend the metadata comment to the HTML
    $comments = RecipeParser_Text::getRecipeMetadataComment($url, "curl");
    $html = $comments . "\n\n" . $downloaded;

    error_log("Saving recipe to file {$filename}");
    file_put_contents($filename, $html);

    return $html;
}
/**
 * Parse a recipe page, starting from the data-vocabulary.org microdata
 * result, then rebuilding ingredients and instructions from the
 * "ingredients-box" and "method-box" containers.
 *
 * @param string $html Raw HTML of the recipe page.
 * @param string $url  URL the HTML came from (forwarded to the base parser).
 * @return mixed The recipe object returned by
 *               RecipeParser_Parser_MicrodataDataVocabulary::parse(), with
 *               ingredients and instructions replaced.
 */
public static function parse($html, $url)
{
    // Start with everything the standard microdata parser can find.
    $recipe = RecipeParser_Parser_MicrodataDataVocabulary::parse($html, $url);

    // Turn off libxml errors to prevent mismatched tag warnings.
    libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Ingredients
    $recipe->resetIngredients();
    $items = $xpath->query('//div[@id="ingredients-box"]//ul/li');
    foreach ($items as $item) {
        $text = $item->nodeValue;

        // List items without an itemprop attribute are section headings.
        if (!$item->getAttribute("itemprop")) {
            $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($text));
            continue;
        }

        $recipe->appendIngredient(RecipeParser_Text::formatAsOneLine($text));
    }

    // Instructions: one step per non-empty paragraph.
    $recipe->resetInstructions();
    $paragraphs = $xpath->query('//*[@id="method-box"]//p');
    foreach ($paragraphs as $paragraph) {
        $step = RecipeParser_Text::formatAsOneLine($paragraph->nodeValue);
        if ($step) {
            $recipe->appendInstruction($step);
        }
    }

    return $recipe;
}
/**
 * Parse a recipe page, starting from the schema.org microdata result, then
 * picking up the editor's note and overriding the photo URL from the
 * image <meta> tag.
 *
 * @param string $html Raw HTML of the recipe page.
 * @param string $url  URL the HTML came from (forwarded to the base parser).
 * @return mixed The recipe object returned by
 *               RecipeParser_Parser_MicrodataSchema::parse(), with notes and
 *               photo_url possibly replaced.
 */
public static function parse($html, $url)
{
    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    // Collect libxml parse errors internally rather than emitting warnings.
    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Notes: first editor-note div, with the "Editor's Note:" label removed.
    $note_nodes = $xpath->query('//div[@class="rd_editornote margin_bottom"]');
    if ($note_nodes->length) {
        $note = RecipeParser_Text::formatAsOneLine($note_nodes->item(0)->nodeValue);
        $recipe->notes = preg_replace("/Editor's Note:\\s+/", "", $note);
    }

    // Override image with the content of <meta itemprop="image">.
    $image_nodes = $xpath->query('//meta[@itemprop="image"]');
    if ($image_nodes->length) {
        $recipe->photo_url = $image_nodes->item(0)->getAttribute("content");
    }

    return $recipe;
}
/**
 * Parse a recipe page, starting from the schema.org microdata result, then
 * rebuilding ingredients (with sections) and instructions from the page
 * markup, and clearing the photo if it matches the linked profile image.
 *
 * @param string $html Raw HTML of the recipe page.
 * @param string $url  URL the HTML came from (forwarded to the base parser).
 * @return mixed The recipe object returned by
 *               RecipeParser_Parser_MicrodataSchema::parse(), with
 *               ingredients, instructions and photo_url adjusted.
 */
public static function parse($html, $url)
{
    // Start with everything the standard microdata parser can find.
    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    // Turn off libxml errors to prevent mismatched tag warnings.
    libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Ingredients
    $recipe->resetIngredients();
    $containers = $xpath->query('//div[@class="col6 ingredients"]/*');
    foreach ($containers as $container) {
        // Only <ul> children of the ingredients div carry ingredient data.
        if ($container->nodeName != 'ul') {
            continue;
        }

        foreach ($container->childNodes as $child) {
            // Skip text nodes and anything else that is not an <li>.
            if ($child->nodeName != 'li') {
                continue;
            }

            if ($child->getAttribute("itemprop") == "ingredients") {
                $text = trim($child->nodeValue);

                // A line that equals its own uppercase form is treated as
                // a section title.
                if ($text == strtoupper($text)) {
                    $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($text));
                    continue;
                }

                // Drop copyright lines and "recipe follows" pointers.
                $is_copyright = stripos($text, "copyright") !== false;
                if ($is_copyright || stripos($text, "recipe follows") !== false) {
                    continue;
                }

                $recipe->appendIngredient(RecipeParser_Text::formatAsOneLine($text));
            } elseif ($child->getAttribute("class") == "subtitle") {
                // Explicit section titles
                $title = RecipeParser_Text::formatSectionName(trim($child->nodeValue));
                $recipe->addIngredientsSection($title);
            }
        }
    }

    // Instructions: <span> children are section names, <p> children are steps.
    $recipe->resetInstructions();
    $blocks = $xpath->query('//*[@itemprop="recipeInstructions"]/*');
    foreach ($blocks as $block) {
        if ($block->nodeName == "span") {
            $recipe->addInstructionsSection(RecipeParser_Text::formatSectionName($block->nodeValue));
            continue;
        }
        if ($block->nodeName != "p") {
            continue;
        }

        // Paragraphs starting with "Photograph" are not appended as steps.
        $step = RecipeParser_Text::formatAsOneLine($block->nodeValue);
        if (!preg_match("/^Photograph/i", $step)) {
            $recipe->appendInstruction($step);
        }
    }

    // See if we've captured a chef's photo, and delete it (if so).
    if ($recipe->photo_url) {
        $chef_images = $xpath->query('//a[@itemprop="url"]/img[@itemprop="image"]');
        if ($chef_images->length > 0) {
            $chef_photo = $chef_images->item(0)->getAttribute("src");
            if ($recipe->photo_url == $chef_photo) {
                $recipe->photo_url = "";
            }
        }
    }

    return $recipe;
}
/**
 * Parse a recipe page, starting from the schema.org microdata result, then
 * reading the yield from the "prep_box" text and rebuilding instructions
 * from a <br>-separated instructions element.
 *
 * @param string $html Raw HTML of the recipe page.
 * @param string $url  URL the HTML came from (forwarded to the base parser).
 * @return mixed The recipe object returned by
 *               RecipeParser_Parser_MicrodataSchema::parse(), with yield and
 *               instructions adjusted.
 */
public static function parse($html, $url)
{
    $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

    // Collect libxml parse errors internally rather than emitting warnings.
    libxml_use_internal_errors(true);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    $doc = new DOMDocument();
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    $xpath = new DOMXPath($doc);

    // Yield: the last prep_box containing "Number of Servings: N" wins.
    foreach ($xpath->query('//*[@class="prep_box"]') as $box) {
        if (preg_match("/Number of Servings: (\\d+)/", $box->nodeValue, $m)) {
            $recipe->yield = RecipeParser_Text::formatYield($m[1]);
        }
    }

    // Instructions
    $recipe->resetInstructions();
    $containers = $xpath->query('//*[@itemprop="recipeInstructions"]');
    if (!$containers->length) {
        return $recipe;
    }

    // The instructions are one element whose steps are separated by <br>
    // tags. Reading nodeValue of the whole element would lose those breaks,
    // so walk the direct children: <br> nodes become a literal "<br>"
    // marker, and every other node (text, links, ...) contributes its
    // trimmed nodeValue.
    $buffer = "";
    foreach ($containers->item(0)->childNodes as $child) {
        if ($child->nodeName == "br") {
            $buffer .= "<br>";
            continue;
        }
        $text = trim($child->nodeValue);
        if (!empty($text)) {
            $buffer .= $text;
        }
    }

    foreach (explode("<br>", $buffer) as $step) {
        if (empty($step)) {
            continue;
        }

        if (RecipeParser_Text::matchSectionName($step)) {
            $recipe->addInstructionsSection(RecipeParser_Text::formatSectionName($step));
            continue;
        }

        $step = RecipeParser_Text::formatAsOneLine($step);
        $step = RecipeParser_Text::stripLeadingNumbers($step);

        // Skip boilerplate lines that are not actual steps.
        $is_attribution = stripos($step, "Recipe submitted by SparkPeople") === 0;
        if ($is_attribution || stripos($step, "Number of Servings:") === 0) {
            continue;
        }

        $recipe->appendInstruction($step);
    }

    return $recipe;
}