Ejemplo n.º 1
0
 /**
  * Parse a recipe page, replacing the ingredient list produced by the
  * schema.org microdata parser.
  *
  * The recipe is first built by RecipeParser_Parser_MicrodataSchema::parse();
  * its ingredients are then reset and rebuilt from the element children of
  * <div id="ingredients">.
  *
  * @param string $html Raw HTML of the page.
  * @param string $url  URL of the page; forwarded to the base parser.
  * @return object Recipe object obtained from the base parser, with ingredients rebuilt.
  */
 public static function parse($html, $url)
 {
     $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

     // Keep libxml quiet about mismatched tags in real-world HTML.
     libxml_use_internal_errors(true);
     $dom = new DOMDocument();
     $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
     $finder = new DOMXPath($dom);

     // Site-specific override: rebuild the ingredient list from scratch.
     $recipe->resetIngredients();
     foreach ($finder->query('//div[@id = "ingredients"]/*') as $child) {
         switch ($child->nodeName) {
             case 'strong':
                 // <strong> elements hold the ingredient section names.
                 $heading = RecipeParser_Text::formatSectionName($child->nodeValue);
                 $recipe->addIngredientsSection($heading);
                 break;
             case 'ul':
                 // Only the <li> children of the list are ingredient lines.
                 foreach ($child->childNodes as $item) {
                     if ($item->nodeName == 'li') {
                         $recipe->appendIngredient(trim($item->nodeValue));
                     }
                 }
                 break;
         }
     }

     return $recipe;
 }
Ejemplo n.º 2
0
 /**
  * Parse a recipe page on top of the data-vocabulary microdata parser.
  *
  * Starts from RecipeParser_Parser_MicrodataDataVocabulary::parse(), then
  * swaps the photo URL suffix, rebuilds the ingredients from the first <ul>
  * directly under a <div class="recipeDetails">, and reads the credits from
  * the first element carrying itemprop="author".
  *
  * @param string $html Raw HTML of the page.
  * @param string $url  URL of the page; forwarded to the base parser.
  * @return object Recipe object obtained from the base parser, with overrides applied.
  */
 public static function parse($html, $url)
 {
     $recipe = RecipeParser_Parser_MicrodataDataVocabulary::parse($html, $url);
     // Collect libxml parse errors internally instead of emitting warnings.
     libxml_use_internal_errors(true);
     $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
     $doc = new DOMDocument();
     $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
     $xpath = new DOMXPath($doc);
     // Photo URL: replace the "-l.jpg" suffix with "-x.jpg" (the larger version).
     $recipe->photo_url = str_replace('-l.jpg', '-x.jpg', $recipe->photo_url);
     // Ingredients
     $recipe->resetIngredients();
     $nodes = $xpath->query('//div[@class="recipeDetails"]/ul');
     // item(0) is null when the page has no matching <ul>; skip the loop in
     // that case instead of dereferencing null.
     $list = $nodes->length ? $nodes->item(0) : null;
     if ($list) {
         foreach ($list->childNodes as $li) {
             if ($li->nodeName != 'li') {
                 continue;
             }
             $text = RecipeParser_Text::formatAsOneLine($li->nodeValue);
             if ($li->getAttribute('itemprop') == 'ingredient') {
                 // Strip the savings-widget label that leaks into the item text.
                 $text = trim(str_replace('$Click to see savings', '', $text));
                 $recipe->appendIngredient($text);
             } else {
                 // <li> items without the ingredient itemprop are section headings.
                 $text = RecipeParser_Text::formatSectionName($text);
                 $recipe->addIngredientsSection($text);
             }
         }
     }
     // Credits
     $nodes = $xpath->query('//*[@itemprop="author"]');
     if ($nodes->length) {
         $recipe->credits = trim($nodes->item(0)->nodeValue);
     }
     return $recipe;
 }
Ejemplo n.º 3
0
 /**
  * Parse a recipe page, overriding ingredients, notes and photo URL on top
  * of the schema.org microdata parser.
  *
  * Ingredients are rebuilt from the element children of
  * <div id="recipe-ingredients">: <p> elements supply section names (or a
  * whole section when they span several lines) and <ul> elements supply
  * <li class="ingredient"> lines.
  *
  * @param string $html Raw HTML of the page.
  * @param string $url  URL of the page; forwarded to the base parser and to formatPhotoUrl().
  * @return object Recipe object obtained from the base parser, with overrides applied.
  */
 public static function parse($html, $url)
 {
     $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

     libxml_use_internal_errors(true);
     $dom = new DOMDocument();
     $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
     $finder = new DOMXPath($dom);

     // Ingredients
     $recipe->resetIngredients();
     foreach ($finder->query('//div[@id = "recipe-ingredients"]/*') as $element) {
         if ($element->nodeName == 'p') {
             $text = trim($element->nodeValue);
             $rows = array_map('trim', explode("\n", $text));
             if (count($rows) > 1) {
                 // Older markup packs a whole section into one <p>: the first
                 // line is the section name, the remaining lines are its
                 // ingredients.
                 $heading = array_shift($rows);
                 $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($heading));
                 foreach ($rows as $row) {
                     $recipe->appendIngredient($row);
                 }
             } else {
                 // A single-line <p> is just a section name; its ingredients
                 // follow in later <ul> elements.
                 $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($text));
             }
         } elseif ($element->nodeName == 'ul') {
             foreach ($finder->query('./li[@class = "ingredient"]', $element) as $item) {
                 $recipe->appendIngredient(trim($item->nodeValue));
             }
         }
     }

     // Notes
     $noteNodes = $finder->query('//*[@id="recipe-intronote"]');
     if ($noteNodes->length) {
         $recipe->notes = RecipeParser_Text::formatAsParagraphs($noteNodes->item(0)->nodeValue);
     }

     // Photo URL taken from the image itemprop, replacing the og:image value.
     $imageNodes = $finder->query('//img[@itemprop="image"]');
     if ($imageNodes->length) {
         $source = $imageNodes->item(0)->getAttribute("src");
         $recipe->photo_url = RecipeParser_Text::formatPhotoUrl($source, $url);
     }

     return $recipe;
 }
Ejemplo n.º 4
0
 /**
  * Parse a recipe page using the microformat parser, then rebuild the
  * ingredients when the page offers a multi-stage list.
  *
  * If <dl id="stages"> has element children, the ingredients are reset and
  * re-read: each <dt> becomes a section name and each <dd> contributes every
  * descendant with class="ingredient". Otherwise the base result is returned
  * untouched.
  *
  * @param string $html Raw HTML of the page.
  * @param string $url  URL of the page; forwarded to the base parser.
  * @return object Recipe object obtained from the base parser.
  */
 public static function parse($html, $url)
 {
     // Start from whatever the generic hrecipe parser can find.
     $recipe = RecipeParser_Parser_Microformat::parse($html, $url);

     // Keep libxml quiet about mismatched tags.
     libxml_use_internal_errors(true);
     $dom = new DOMDocument();
     $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
     $finder = new DOMXPath($dom);

     $stages = $finder->query('//dl[@id="stages"]/*');
     if (!$stages->length) {
         return $recipe;
     }

     // Multi-stage ingredients replace the flat list from the base parser.
     $recipe->resetIngredients();
     foreach ($stages as $stage) {
         if ($stage->nodeName == 'dt') {
             $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($stage->nodeValue));
         } elseif ($stage->nodeName == 'dd') {
             foreach ($finder->query('.//*[@class="ingredient"]', $stage) as $item) {
                 $recipe->appendIngredient(trim($item->nodeValue));
             }
         }
     }

     return $recipe;
 }
Ejemplo n.º 5
0
 /**
  * Parse a recipe page, rebuilding ingredients on top of the schema.org
  * microdata parser.
  *
  * Two container layouts under the element with id="recipe-ingredients" are
  * tried in order; the second is used only when the first yields no nodes.
  * <h3> children become section names and every child node of a <ul>
  * becomes an ingredient line.
  *
  * @param string $html Raw HTML of the page.
  * @param string $url  URL of the page; forwarded to the base parser.
  * @return object Recipe object obtained from the base parser, with ingredients rebuilt.
  */
 public static function parse($html, $url)
 {
     $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

     // Keep libxml quiet about mismatched tags.
     libxml_use_internal_errors(true);
     $dom = new DOMDocument();
     $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
     $finder = new DOMXPath($dom);

     // Ingredients
     $recipe->resetIngredients();
     $blocks = $finder->query('//*[@id="recipe-ingredients"]//div[@class="view-content"]/*');
     if (!$blocks || !$blocks->length) {
         // Alternate layout.
         $blocks = $finder->query('//*[@id="recipe-ingredients"]//div[@class="ingredient-lists separator-serated tab-content"]/*');
     }

     foreach ($blocks as $block) {
         switch ($block->nodeName) {
             case 'h3':
                 $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($block->nodeValue));
                 break;
             case 'ul':
                 foreach ($block->childNodes as $entry) {
                     $recipe->appendIngredient(RecipeParser_Text::formatAsOneLine($entry->nodeValue));
                 }
                 break;
         }
     }

     return $recipe;
 }
 /**
  * Parse a recipe page, overriding yield, times and ingredients on top of
  * the microformat parser.
  *
  * Times are read from the "title" attribute of the second <span> child of
  * the prepTime / rspec-cook-time / totaltime containers. A time is left
  * untouched when that second <span> is absent.
  *
  * @param string $html Raw HTML of the page.
  * @param string $url  URL of the page; forwarded to the base parser.
  * @return object Recipe object obtained from the base parser, with overrides applied.
  */
 public static function parse($html, $url)
 {
     $recipe = RecipeParser_Parser_Microformat::parse($html, $url);
     // Turn off libxml errors to prevent mismatched tag warnings.
     libxml_use_internal_errors(true);
     $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
     $doc = new DOMDocument();
     $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
     $xpath = new DOMXPath($doc);
     // Yield
     $nodes = $xpath->query('//*[contains(concat(" ", normalize-space(@class), " "), " yield ")]');
     if ($nodes->length) {
         $line = $nodes->item(0)->nodeValue;
         $recipe->yield = RecipeParser_Text::formatYield($line);
     }
     // Times: CSS class of the container => key in $recipe->time.
     $time_classes = array('prepTime' => 'prep', 'rspec-cook-time' => 'cook', 'totaltime' => 'total');
     foreach ($time_classes as $class => $time_key) {
         $nodes = $xpath->query('//*[contains(concat(" ", normalize-space(@class), " "), " ' . $class . ' ")]/span');
         // The duration lives on the SECOND span. item(1) is null when fewer
         // than two spans match, so test the node itself rather than
         // $nodes->length to avoid calling getAttribute() on null.
         $span = $nodes->item(1);
         if ($span) {
             $line = $span->getAttribute("title");
             $recipe->time[$time_key] = RecipeParser_Text::iso8601ToMinutes($line);
         }
     }
     // Ingredients
     $recipe->resetIngredients();
     $ing_nodes = $xpath->query('//*[contains(concat(" ", normalize-space(@class), " "), " ingredients ")]/*');
     foreach ($ing_nodes as $ing_node) {
         // Elements whose class is exactly "ingr-divider" are section headings.
         if ($ing_node->getAttribute('class') == "ingr-divider") {
             $line = RecipeParser_Text::formatSectionName($ing_node->nodeValue);
             $recipe->addIngredientsSection($line);
             continue;
         }
         // Ingredient lines are the child nodes of each <ul>.
         if ($ing_node->nodeName == 'ul') {
             foreach ($ing_node->childNodes as $node) {
                 $line = trim($node->nodeValue);
                 $recipe->appendIngredient($line);
             }
             continue;
         }
     }
     return $recipe;
 }
Ejemplo n.º 7
0
 /**
  * Parse a Cooks.com-style recipe page into a new RecipeParser_Recipe.
  *
  * The title comes from the <title> tag with the site boilerplate removed.
  * Ingredients, section titles and instructions all live in <div> elements
  * inside <table class="hrecipe"> cells and are told apart by their inline
  * style and class attributes.
  *
  * @param string $html Raw HTML of the page.
  * @param string $url  URL of the page (not used by this parser).
  * @return RecipeParser_Recipe
  */
 public static function parse($html, $url)
 {
     $recipe = new RecipeParser_Recipe();

     // Keep libxml quiet about mismatched tags.
     libxml_use_internal_errors(true);
     $dom = new DOMDocument();
     $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
     $finder = new DOMXPath($dom);

     // Title: strip the site name whether it appears as a prefix or a suffix.
     $titleTags = $dom->getElementsByTagName('title');
     if ($titleTags->length) {
         $title = $titleTags->item(0)->nodeValue;
         $title = trim(str_replace("Cooks.com - Recipe - ", "", $title));
         $title = trim(str_replace(" - Recipe - Cooks.com", "", $title));
         $recipe->title = $title;
     }

     // Each <div> is classified first by its style attribute, then by class.
     foreach ($finder->query('//table[@class="hrecipe"]//td/div') as $block) {
         $style = $block->getAttribute("style");

         if (stripos($style, "color: BLACK;") !== false) {
             // Black-text div: holds the ingredient spans.
             foreach ($finder->query('./span[@class = "ingredient"]', $block) as $span) {
                 $recipe->appendIngredient($span->nodeValue);
             }
         } elseif ($block->getAttribute('class') == "instructions") {
             // Instructions: one entry per child node.
             foreach ($block->childNodes as $piece) {
                 $recipe->appendInstruction(RecipeParser_Text::formatAsOneLine($piece->nodeValue));
             }
         } elseif ($block->getAttribute("class") == "section") {
             // Section title: always opens an ingredients section, and also an
             // instructions section once some instructions have been collected.
             $heading = RecipeParser_Text::formatSectionName($block->nodeValue);
             $recipe->addIngredientsSection($heading);
             if (count($recipe->instructions) > 0) {
                 $recipe->addInstructionsSection($heading);
             }
         }
     }

     return $recipe;
 }
Ejemplo n.º 8
0
 /**
  * Parse a recipe page, overriding yield, ingredients and instructions on
  * top of the microformat parser.
  *
  * @param string $html Raw HTML of the page.
  * @param string $url  URL of the page; forwarded to the base parser.
  * @return object Recipe object obtained from the base parser, with overrides applied.
  */
 public static function parse($html, $url)
 {
     // Start from whatever the generic hrecipe parser can find.
     $recipe = RecipeParser_Parser_Microformat::parse($html, $url);

     // Keep libxml quiet about mismatched tags.
     libxml_use_internal_errors(true);
     $dom = new DOMDocument();
     $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
     $finder = new DOMXPath($dom);

     // Yield: the "value" attribute of the element named "resizeTo", with
     // " servings" appended before formatting.
     $resizers = $finder->query('//*[@name="resizeTo"]');
     if ($resizers->length) {
         $servings = trim($resizers->item(0)->getAttribute("value")) . " servings";
         $recipe->yield = RecipeParser_Text::formatYield($servings);
     }

     // Ingredients: join the text of each child node with single spaces.
     $recipe->resetIngredients();
     $ingredientNodes = $finder->query('//*[contains(concat(" ", normalize-space(@class), " "), " ingredient ")]');
     foreach ($ingredientNodes as $ingredientNode) {
         $pieces = array();
         foreach ($ingredientNode->childNodes as $piece) {
             $pieces[] = $piece->nodeValue;
         }
         // Joining with spaces leaves " ; " around semicolons; tighten it.
         $text = str_replace(" ; ", "; ", implode(' ', $pieces));
         $recipe->appendIngredient(RecipeParser_Text::formatAsOneLine($text));
     }

     // Instructions: a paragraph that equals its own upper-cased form is
     // treated as a section heading.
     $recipe->resetInstructions();
     foreach ($finder->query('//div[@class="display-field"]/p') as $paragraph) {
         $text = trim($paragraph->nodeValue);
         if ($text == strtoupper($text)) {
             $recipe->addInstructionsSection(RecipeParser_Text::formatSectionName($text));
         } else {
             $recipe->appendInstruction($text);
         }
     }

     return $recipe;
 }
Ejemplo n.º 9
0
 /**
  * Parse a recipe page, rebuilding ingredients and notes on top of the
  * schema.org microdata parser.
  *
  * Ingredients come from every class="group" element under the element with
  * id="ingredients": the first <h3> inside a group names the section and
  * each <li> is an ingredient line.
  *
  * @param string $html Raw HTML of the page.
  * @param string $url  URL of the page; forwarded to the base parser.
  * @return object Recipe object obtained from the base parser, with overrides applied.
  */
 public static function parse($html, $url)
 {
     $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

     libxml_use_internal_errors(true);
     $dom = new DOMDocument();
     $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
     $finder = new DOMXPath($dom);

     // Ingredients
     $recipe->resetIngredients();
     foreach ($finder->query('//*[@id="ingredients"]//*[@class="group"]') as $group) {
         // Optional section heading.
         $headings = $finder->query('.//h3', $group);
         if ($headings->length) {
             $heading = RecipeParser_Text::formatSectionName($headings->item(0)->nodeValue);
             if (!empty($heading)) {
                 $recipe->addIngredientsSection($heading);
             }
         }
         // Ingredient lines of this group.
         foreach ($finder->query('.//li', $group) as $item) {
             $recipe->appendIngredient(RecipeParser_Text::formatAsOneLine($item->nodeValue));
         }
     }

     // Notes: drop the "Cook's Note" label from the note text.
     $noteNodes = $finder->query('.//*[@class = "body-c note-text"]');
     if ($noteNodes->length) {
         $note = str_replace("Cook's Note", '', $noteNodes->item(0)->nodeValue);
         $recipe->notes = trim($note);
     }

     return $recipe;
 }
Ejemplo n.º 10
0
 /**
  * Parse a recipe page (The Daily Meal) on top of the data-vocabulary
  * microdata parser, adding ingredient lines from a plain list and a yield
  * taken from the Edamam data table.
  *
  * @param string $html Raw HTML of the page.
  * @param string $url  URL of the page; forwarded to the base parser.
  * @return object Recipe object obtained from the base parser, with additions applied.
  */
 public static function parse($html, $url)
 {
     $recipe = RecipeParser_Parser_MicrodataDataVocabulary::parse($html, $url);

     libxml_use_internal_errors(true);
     $dom = new DOMDocument();
     $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
     $finder = new DOMXPath($dom);

     // Some ingredient lines on this site do not follow the usual microdata
     // formatting, so a regular list inside the higher-level ingredient div
     // is read as well.
     // NOTE(review): this runs when $recipe->ingredients is NOT empty, which
     // reads oddly for a "fall back" -- confirm against the Recipe class.
     if (!empty($recipe->ingredients)) {
         $items = $finder->query("//div[@class='content']/div[@class='ingredient']/ul/li");
         foreach ($items as $item) {
             $text = RecipeParser_Text::formatAsOneLine($item->nodeValue);
             if (!empty($text)) {
                 if (RecipeParser_Text::matchSectionName($text)) {
                     $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($text));
                 } else {
                     $recipe->appendIngredient($text);
                 }
             }
         }
     }

     // Servings come from the Edamam plugin table (second cell of the second
     // row), used only when the base parser found no yield.
     if (!$recipe->yield) {
         $cells = $finder->query("//table[@class='edamam-data']/tr[2]/td[2]");
         if ($cells->length) {
             $recipe->yield = RecipeParser_Text::formatYield($cells->item(0)->nodeValue);
         }
     }

     return $recipe;
 }
Ejemplo n.º 11
0
 /**
  * Parse a recipe page, rebuilding description, ingredients and
  * instructions on top of the microformat parser.
  *
  * All content is read from <div id="recipe">: italic paragraphs form the
  * description, the <blockquote> holds the ingredients, and the <p>
  * elements after the blockquote are the instructions.
  *
  * @param string $html Raw HTML of the page.
  * @param string $url  URL of the page; forwarded to the base parser.
  * @return object Recipe object obtained from the base parser, with overrides applied.
  */
 public static function parse($html, $url)
 {
     $recipe = RecipeParser_Parser_Microformat::parse($html, $url);

     // Keep libxml quiet about mismatched tags.
     libxml_use_internal_errors(true);
     $dom = new DOMDocument();
     $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
     $finder = new DOMXPath($dom);

     // Description: every italic paragraph except "Adapted from" credits,
     // separated by blank lines.
     $paragraphs = array();
     foreach ($finder->query('//div[@id="recipe"]/p/i') as $italic) {
         $text = trim($italic->nodeValue);
         if (strpos($text, "Adapted from") === false) {
             $paragraphs[] = $text;
         }
     }
     $recipe->description = trim(implode("\n\n", $paragraphs));

     // Ingredients: first flatten the blockquote paragraphs into lines.
     $recipe->resetIngredients();
     $rows = array();
     foreach ($finder->query('//div[@id="recipe"]/blockquote/p') as $paragraph) {
         foreach ($paragraph->childNodes as $piece) {
             $kind = $piece->nodeName;
             $text = trim($piece->nodeValue);
             if ($kind == "strong" || $kind == "b") {
                 // Bold text is a section heading; make sure it ends up with a colon.
                 if (strpos($text, ":") === false) {
                     $text .= ":";
                 }
                 $rows[] = $text;
             } elseif ($kind == "#text" || $kind == "div" || $kind == "p") {
                 $rows[] = $text;
             }
         }
     }
     // Then sort each line into section headings and ingredient lines.
     foreach ($rows as $row) {
         if (RecipeParser_Text::matchSectionName($row)) {
             $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($row));
         } else {
             $recipe->appendIngredient(RecipeParser_Text::formatAsOneLine($row));
         }
     }

     // Instructions: the <p> elements that follow the ingredients blockquote.
     $recipe->resetInstructions();
     $afterIngredients = false;
     foreach ($finder->query('//div[@id="recipe"]/*') as $block) {
         if ($block->nodeName == "blockquote") {
             $afterIngredients = true;
             continue;
         }
         if ($block->nodeName != "p" || !$afterIngredients) {
             continue;
         }
         $text = trim($block->nodeValue);
         // Stop at the "Adapted from" credit or at any <p> that has a class
         // attribute.
         if (stripos($text, "Adapted from") !== false || $block->getAttribute("class")) {
             break;
         }
         // A paragraph starting with "Serves " is the yield, not a step.
         if (stripos($text, "Serves ") === 0) {
             $recipe->yield = RecipeParser_Text::formatYield($text);
             continue;
         }
         $recipe->appendInstruction(RecipeParser_Text::formatAsOneLine($block->nodeValue));
     }

     return $recipe;
 }
Ejemplo n.º 12
0
 /**
  * Parse a recipe page into a new RecipeParser_Recipe using itemprop
  * attributes plus site-specific classes for steps, notes and photo.
  *
  * @param string $html Raw HTML of the page.
  * @param string $url  URL of the page; used to resolve the photo URL.
  * @return RecipeParser_Recipe
  */
 public static function parse($html, $url)
 {
     $recipe = new RecipeParser_Recipe();

     libxml_use_internal_errors(true);
     $dom = new DOMDocument();
     $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
     $finder = new DOMXPath($dom);

     // Title
     $titleNodes = $finder->query('//h1[@itemprop="name"]');
     if ($titleNodes->length) {
         $recipe->title = trim($titleNodes->item(0)->nodeValue);
     }

     // Time, e.g. <time datetime="PT35M" itemprop="prepTime">.
     // NOTE(review): the prepTime text is stored under time['total'] --
     // confirm that this is intentional for this site.
     $timeNodes = $finder->query('//time[@itemprop="prepTime"]');
     if ($timeNodes->length) {
         $timeText = $timeNodes->item(0)->textContent;
         if ($timeText) {
             $recipe->time['total'] = RecipeParser_Text::mixedTimeToMinutes($timeText);
         }
     }

     // Yield
     $yieldNodes = $finder->query('//*[@itemprop="recipeYield"]');
     if ($yieldNodes->length) {
         $recipe->yield = RecipeParser_Text::formatYield($yieldNodes->item(0)->nodeValue);
     }

     // Ingredients: skip the literal "Ingredients" heading.
     foreach ($finder->query('//*[@itemprop="ingredients"]') as $ingredientNode) {
         $text = trim($ingredientNode->nodeValue);
         if ($text != "Ingredients") {
             $recipe->appendIngredient($text);
         }
     }

     // Instructions: each step may be split into an optional section name
     // and the direction itself.
     foreach ($finder->query('//span[@class = "steps-list__item__text"]') as $stepNode) {
         $step = RecipeParser_Text::stripLeadingNumbers(trim($stepNode->nodeValue));
         $pieces = self::splitDirections($step);
         if ($pieces['section']) {
             $pieces['section'] = RecipeParser_Text::formatSectionName($pieces['section']);
             $recipe->addInstructionsSection($pieces['section']);
         }
         $recipe->appendInstruction($pieces['direction']);
     }

     // Notes: all note paragraphs joined with " | ".
     $noteNodes = $finder->query('//div[@class = "recipe-notes__content"]/div/p');
     if ($noteNodes->length) {
         $collected = array();
         foreach ($noteNodes as $noteNode) {
             $collected[] = trim($noteNode->nodeValue);
         }
         $recipe->notes = implode(' | ', $collected);
     }

     // Photo: the second carousel image, unless it is a placeholder.
     $images = $finder->query('//img[@class = "recipe-carousel__recipe__img"]');
     $image = $images ? $images->item(1) : null;
     if ($image) {
         $source = $image->getAttribute('src');
         $isDefault = strpos($source, 'default-recipe-image.gif') !== false;
         $isPlaceholder = strpos($source, 'placeholder.gif') !== false;
         if (!$isDefault && !$isPlaceholder) {
             $recipe->photo_url = RecipeParser_Text::relativeToAbsolute($source, $url);
         }
     }

     return $recipe;
 }
 /**
  * Checks RecipeParser_Text::formatSectionName() against a table of
  * raw input => expected section name pairs.
  */
 public function test_format_section_name()
 {
     $cases = array(
         // Passed through unchanged.
         "Cake" => "Cake",
         // Title-cased single word, colon stripped, whitespace trimmed.
         " CAKE: " => "Cake",
         // Leading "for" removed.
         "For Cake" => "Cake",
         // Leading "for the" removed.
         "For the cake" => "Cake",
         // Only the first word is upper-cased (until there is a better way of doing this).
         "Cake Frosting" => "Cake frosting",
     );

     foreach ($cases as $raw => $expected) {
         $this->assertEquals($expected, RecipeParser_Text::formatSectionName($raw));
     }
 }
Ejemplo n.º 14
0
 /**
  * Parse a recipe page, overriding title, instructions, photo, times,
  * yield and ingredients on top of the microformat parser.
  *
  * Every override is applied only when its element is present in the page;
  * a missing element leaves the value from the base parser untouched
  * instead of dereferencing a null node.
  *
  * @param string $html Raw HTML of the page.
  * @param string $url  URL of the page; forwarded to the base parser and used to resolve the photo URL.
  * @return object Recipe object obtained from the base parser, with overrides applied.
  */
 public static function parse($html, $url)
 {
     // Get all of the standard hrecipe stuff we can find.
     $recipe = RecipeParser_Parser_Microformat::parse($html, $url);
     // Turn off libxml errors to prevent mismatched tag warnings.
     libxml_use_internal_errors(true);
     $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
     $doc = new DOMDocument();
     $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
     $xpath = new DOMXPath($doc);
     $recipe->resetIngredients();
     // Title
     $nodes = $xpath->query('.//*[@itemprop="name"]');
     if ($nodes->length) {
         $recipe->title = trim($nodes->item(0)->nodeValue);
     }
     // Instructions
     $nodes = $xpath->query('//li[@itemprop="recipeInstructions"]/*');
     foreach ($nodes as $sub) {
         $line = trim($sub->nodeValue);
         $line = RecipeParser_Text::stripLeadingNumbers($line);
         $recipe->appendInstruction($line);
     }
     // Photo
     $nodes = $xpath->query('.//*[@itemprop="image"]');
     if ($nodes->length) {
         $photo_url = $nodes->item(0)->getAttribute('src');
         $recipe->photo_url = RecipeParser_Text::relativeToAbsolute($photo_url, $url);
     }
     // Meta data: read only when the metadata wrapper has element children.
     $nodes = $xpath->query('//div[@class="recipe-metadata-wrap"]/*');
     if ($nodes->length) {
         // itemprop => key in $recipe->time; the duration is the "content" attribute.
         $searches = array('prepTime' => 'prep', 'cookTime' => 'cook');
         foreach ($searches as $itemprop => $time_key) {
             $found = $xpath->query('.//*[@itemprop="' . $itemprop . '"]');
             if ($found->length && $found->item(0)->hasAttribute('content')) {
                 $value = trim($found->item(0)->getAttribute('content'));
                 $recipe->time[$time_key] = RecipeParser_Text::iso8601ToMinutes($value);
             }
         }
         // Total time is derived as cook + prep.
         $recipe->time['total'] = $recipe->time['cook'] + $recipe->time['prep'];
         $found = $xpath->query('.//*[@itemprop="recipeYield"]');
         if ($found->length) {
             $value = trim($found->item(0)->nodeValue);
             $recipe->yield = RecipeParser_Text::formatYield($value);
         }
     }
     // Multi-stage ingredients: <h3> starts a section, <ul> holds its items.
     $nodes = $xpath->query('//div[@class="recipe-ingredients-wrapper"]/*');
     foreach ($nodes as $node) {
         if ($node->nodeName == 'h3') {
             $value = RecipeParser_Text::formatSectionName($node->nodeValue);
             $recipe->addIngredientsSection($value);
         } elseif ($node->nodeName == 'ul') {
             $subs = $xpath->query('.//li[@itemprop="ingredients"]', $node);
             foreach ($subs as $sub) {
                 $value = trim($sub->nodeValue);
                 $recipe->appendIngredient($value);
             }
         }
     }
     return $recipe;
 }
Ejemplo n.º 15
0
 /**
  * Parse a page marked up with schema.org Recipe microdata into a new
  * RecipeParser_Recipe.
  *
  * The first element whose itemtype contains "//schema.org/Recipe" (or the
  * lower-case "recipe") is used as the microdata root. When no such element
  * exists the recipe is returned with nothing filled in by this method.
  *
  * @param string $html Raw HTML of the page.
  * @param string $url  URL of the page; passed to formatPhotoUrl() for the photo.
  * @return RecipeParser_Recipe
  */
 public static function parse($html, $url)
 {
     $recipe = new RecipeParser_Recipe();
     // Collect libxml parse errors internally instead of emitting warnings.
     libxml_use_internal_errors(true);
     $doc = new DOMDocument();
     $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
     $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
     $xpath = new DOMXPath($doc);
     // Locate the microdata root element (first match only).
     $microdata = null;
     $nodes = $xpath->query('//*[contains(@itemtype, "//schema.org/Recipe") or contains(@itemtype, "//schema.org/recipe")]');
     if ($nodes->length) {
         $microdata = $nodes->item(0);
     }
     // Parse elements
     if ($microdata) {
         // Title
         $nodes = $xpath->query('.//*[@itemprop="name"]', $microdata);
         if ($nodes->length) {
             $value = trim($nodes->item(0)->nodeValue);
             $recipe->title = RecipeParser_Text::formatTitle($value);
         }
         // Summary
         $nodes = $xpath->query('.//*[@itemprop="description"]', $microdata);
         if ($nodes->length) {
             $value = $nodes->item(0)->nodeValue;
             $value = RecipeParser_Text::formatAsParagraphs($value);
             $recipe->description = $value;
         }
         // Times: itemprop name => key in $recipe->time.
         $searches = array('prepTime' => 'prep', 'cookTime' => 'cook', 'totalTime' => 'total');
         foreach ($searches as $itemprop => $time_key) {
             $nodes = $xpath->query('.//*[@itemprop="' . $itemprop . '"]', $microdata);
             if ($nodes->length) {
                 // Preference order: "content" attribute, then "datetime"
                 // attribute, then the element's text.
                 if ($value = $nodes->item(0)->getAttribute('content')) {
                     $value = RecipeParser_Text::iso8601ToMinutes($value);
                 } else {
                     if ($value = $nodes->item(0)->getAttribute('datetime')) {
                         $value = RecipeParser_Text::iso8601ToMinutes($value);
                     } else {
                         $value = trim($nodes->item(0)->nodeValue);
                         $value = RecipeParser_Times::toMinutes($value);
                     }
                 }
                 // Only store a truthy result.
                 if ($value) {
                     $recipe->time[$time_key] = $value;
                 }
             }
         }
         // Yield: try "recipeYield" first, then the all-lower-case spelling.
         $nodes = $xpath->query('.//*[@itemprop="recipeYield"]', $microdata);
         if (!$nodes->length) {
             $nodes = $xpath->query('.//*[@itemprop="recipeyield"]', $microdata);
         }
         if ($nodes->length) {
             if ($nodes->item(0)->hasAttribute('content')) {
                 $line = $nodes->item(0)->getAttribute('content');
             } else {
                 $line = $nodes->item(0)->nodeValue;
             }
             $recipe->yield = RecipeParser_Text::formatYield($line);
         }
         // Ingredients
         // Note: this query is document-wide ("//"), not scoped to $microdata.
         $nodes = $xpath->query('//*[@itemprop="ingredients"]');
         foreach ($nodes as $node) {
             $value = $node->nodeValue;
             $value = RecipeParser_Text::formatAsOneLine($value);
             if (empty($value)) {
                 continue;
             }
             if (strlen($value) > 150) {
                 // probably a mistake, like a run-on of existing ingredients?
                 continue;
             }
             if (RecipeParser_Text::matchSectionName($value)) {
                 $value = RecipeParser_Text::formatSectionName($value);
                 $recipe->addIngredientsSection($value);
             } else {
                 $recipe->appendIngredient($value);
             }
         }
         // Instructions: the strategies below are tried in order and the
         // first one that finds nodes wins ($found short-circuits the rest).
         $found = false;
         // Look for markup that uses <li> tags for each instruction.
         // (Document-wide query, not scoped to $microdata.)
         if (!$found) {
             $nodes = $xpath->query('//*[@itemprop="recipeInstructions"]//li');
             if ($nodes->length) {
                 RecipeParser_Text::parseInstructionsFromNodes($nodes, $recipe);
                 $found = true;
             }
         }
         // Look for instructions as direct descendants of "recipeInstructions".
         // (Document-wide query, not scoped to $microdata.)
         if (!$found) {
             $nodes = $xpath->query('//*[@itemprop="recipeInstructions"]/*');
             if ($nodes->length) {
                 RecipeParser_Text::parseInstructionsFromNodes($nodes, $recipe);
                 $found = true;
             }
         }
         // Some sites will use an "instruction" class for each line.
         if (!$found) {
             $nodes = $xpath->query('.//*[@itemprop="recipeInstructions"]//*[contains(concat(" ", normalize-space(@class), " "), " instruction ")]');
             if ($nodes->length) {
                 RecipeParser_Text::parseInstructionsFromNodes($nodes, $recipe);
                 $found = true;
             }
         }
         // Either multiple recipeInstructions nodes, or one node with a blob of text.
         if (!$found) {
             $nodes = $xpath->query('.//*[@itemprop="recipeInstructions"]');
             if ($nodes->length > 1) {
                 // Multiple nodes
                 RecipeParser_Text::parseInstructionsFromNodes($nodes, $recipe);
                 $found = true;
             } else {
                 if ($nodes->length == 1) {
                     // Blob
                     $str = $nodes->item(0)->nodeValue;
                     RecipeParser_Text::parseInstructionsFromBlob($str, $recipe);
                     $found = true;
                 }
             }
         }
         // Photo: og:image first, then the itemprop="image" element's own
         // src, then an <img> nested inside it. The first non-empty URL wins.
         $photo_url = "";
         if (!$photo_url) {
             // try to find open graph url
             $nodes = $xpath->query('//meta[@property="og:image"]');
             if ($nodes->length) {
                 $photo_url = $nodes->item(0)->getAttribute('content');
             }
         }
         if (!$photo_url) {
             $nodes = $xpath->query('.//*[@itemprop="image"]', $microdata);
             if ($nodes->length) {
                 $photo_url = $nodes->item(0)->getAttribute('src');
             }
         }
         if (!$photo_url) {
             // for <img> as sub-node of the itemprop="image" element
             $nodes = $xpath->query('.//*[@itemprop="image"]//img', $microdata);
             if ($nodes->length) {
                 $photo_url = $nodes->item(0)->getAttribute('src');
             }
         }
         if ($photo_url) {
             $recipe->photo_url = RecipeParser_Text::formatPhotoUrl($photo_url, $url);
         }
         // Credits: author first; a publisher, when present, overwrites it.
         $line = "";
         $nodes = $xpath->query('.//*[@itemprop="author"]', $microdata);
         if ($nodes->length) {
             $line = $nodes->item(0)->nodeValue;
         }
         $nodes = $xpath->query('.//*[@itemprop="publisher"]', $microdata);
         if ($nodes->length) {
             $line = $nodes->item(0)->nodeValue;
         }
         $recipe->credits = RecipeParser_Text::formatCredits($line);
     }
     return $recipe;
 }
Ejemplo n.º 16
0
 /**
  * Parse a recipe page by taking the schema.org microdata result as a baseline
  * and then overriding yield, description, notes, ingredients, instructions
  * and photo URL from this site's own markup.
  *
  * @param string $html Page HTML (converted from UTF-8 to HTML entities before loading).
  * @param string $url  URL of the page; used to make the photo URL absolute.
  * @return mixed The recipe object produced by RecipeParser_Parser_MicrodataSchema::parse().
  */
 public static function parse($html, $url)
 {
     $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);
     libxml_use_internal_errors(true);
     $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
     $doc = new DOMDocument();
     $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
     $xpath = new DOMXPath($doc);
     // Yield
     $nodes = $xpath->query('//li[@class="credit"]');
     foreach ($nodes as $node) {
         $line = $node->nodeValue;
         if (stripos($line, "servings") !== false) {
             // Skip only non-digits between the label and the number. A greedy
             // ".*" here would swallow all but the last digit, turning
             // "Servings: 12" into "2".
             $line = preg_replace("/servings\\:?\\D*(\\d+)/i", "\$1", $line);
             $line = RecipeParser_Text::formatYield($line);
             $recipe->yield = $line;
         }
     }
     // Description
     $nodes = $xpath->query('//*[@itemprop="page-dek"]');
     if ($nodes->length) {
         $line = $nodes->item(0)->nodeValue;
         $line = RecipeParser_Text::formatAsOneLine($line);
         $recipe->description = $line;
     }
     // Notes: every "note-text" element becomes one paragraph.
     $line = "";
     $nodes = $xpath->query('//*[@class="note-text"]');
     foreach ($nodes as $node) {
         $line .= trim($node->nodeValue) . "\n\n";
     }
     $line = rtrim($line);
     $recipe->notes = $line;
     // Ingredients: discard the microdata result and rebuild from the
     // "components-group" blocks, each of which may carry a section header.
     $recipe->resetIngredients();
     $sections = $xpath->query('//*[@class="components-group"]');
     foreach ($sections as $section_node) {
         $section_nodes = $xpath->query('.//*[@class="components-group-header"]', $section_node);
         if ($section_nodes->length) {
             $line = $section_nodes->item(0)->nodeValue;
             $line = RecipeParser_Text::formatSectionName($line);
             if (!empty($line)) {
                 $recipe->addIngredientsSection($line);
             }
         }
         $ing_nodes = $xpath->query('.//*[@class="components-item"]', $section_node);
         foreach ($ing_nodes as $node) {
             $line = $node->nodeValue;
             $line = RecipeParser_Text::formatAsOneLine($line);
             $recipe->appendIngredient($line);
         }
     }
     // Instructions
     $recipe->resetInstructions();
     $nodes = $xpath->query('//*[@class="directions-item"]');
     foreach ($nodes as $node) {
         $line = RecipeParser_Text::formatAsOneLine($node->nodeValue);
         $recipe->appendInstruction($line);
     }
     // Photo URL: read from the image's "data-original" attribute.
     $nodes = $xpath->query('//img[@itemprop="image"]');
     if ($nodes->length) {
         $photo_url = $nodes->item(0)->getAttribute("data-original");
         $recipe->photo_url = RecipeParser_Text::relativeToAbsolute($photo_url, $url);
     }
     return $recipe;
 }
Ejemplo n.º 17
0
 /**
  * Epicurious-specific parser: uses the schema.org microdata result as a
  * baseline, then overrides the prep/total time labels, total minutes and the
  * ingredient list from Epicurious markup.
  *
  * @param string $html Page HTML.
  * @param string $url  Page URL (forwarded to the microdata parser).
  * @return mixed The recipe object produced by RecipeParser_Parser_MicrodataSchema::parse().
  */
 public static function parse($html, $url)
 {
     $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

     // Turn off libxml errors to prevent mismatched tag warnings.
     libxml_use_internal_errors(true);
     $dom = new DOMDocument();
     $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
     $xpath = new DOMXPath($dom);

     // OVERRIDES for epicurious

     // Prep Times: each "summary_data" element is labelled ACTIVE or TOTAL and
     // holds its value in <span> children (the last <span> wins).
     // NOTE(review): other parsers in this file store times under
     // $recipe->time[...]; confirm that prep_time / total_time are intended here.
     foreach ($xpath->query('//*[@class="summary_data"]') as $summary) {
         $label = $summary->nodeValue;
         if (preg_match('/ACTIVE/', $label)) {
             $target = 'prep_time';
         } elseif (preg_match('/TOTAL/', $label)) {
             $target = 'total_time';
         } else {
             continue;
         }
         foreach ($summary->childNodes as $child) {
             if ($child->nodeName != "span") {
                 continue;
             }
             $recipe->{$target} = RecipeParser_Text::formatAsOneLine($child->nodeValue);
         }
     }

     // Total Time: ISO 8601 duration from the itemprop's "content" attribute.
     $total_nodes = $xpath->query('//*[@itemprop="totalTime"]');
     if ($total_nodes->length) {
         $iso = $total_nodes->item(0)->getAttribute("content");
         $recipe->time['total'] = RecipeParser_Text::iso8601ToMinutes($iso);
     }

     // Ingredients: rebuilt from the direct children of #ingredients.
     $recipe->resetIngredients();
     foreach ($xpath->query('//div[@id = "ingredients"]/*') as $element) {
         if ($element->nodeName == 'strong') {
             // <strong> contains ingredient section names
             $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($element->nodeValue));
         } elseif ($element->nodeName == 'ul') {
             // <ul class="ingredientsList">: only its <li> children are ingredients
             foreach ($element->childNodes as $item) {
                 if ($item->nodeName == 'li') {
                     $recipe->appendIngredient(trim($item->nodeValue));
                 }
             }
         }
     }

     return $recipe;
 }
Ejemplo n.º 18
0
 /**
  * Parse a recipe page by taking the schema.org microdata result as a baseline
  * and then overriding the yield and the instructions. The instructions live
  * in a single "recipeInstructions" element whose steps are separated by <br>.
  *
  * @param string $html Page HTML.
  * @param string $url  Page URL (forwarded to the microdata parser).
  * @return mixed The recipe object produced by RecipeParser_Parser_MicrodataSchema::parse().
  */
 public static function parse($html, $url)
 {
     $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);
     libxml_use_internal_errors(true);
     $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
     $doc = new DOMDocument();
     $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
     $xpath = new DOMXPath($doc);
     // Yield
     $nodes = $xpath->query('//*[@class="prep_box"]');
     foreach ($nodes as $node) {
         $line = $node->nodeValue;
         if (preg_match("/Number of Servings: (\\d+)/", $line, $m)) {
             $recipe->yield = RecipeParser_Text::formatYield($m[1]);
         }
     }
     // Instructions
     $recipe->resetInstructions();
     $nodes = $xpath->query('//*[@itemprop="recipeInstructions"]');
     if ($nodes->length) {
         // This is a piece of HTML that has <br> tags for breaks in each instruction.
         // Rather than just taking nodeValue of the whole element, keep the <br> tags
         // as explicit markers and pass every other child (#text, <a href="">, ...)
         // through as its nodeValue.
         $str = "";
         foreach ($nodes->item(0)->childNodes as $child) {
             if ($child->nodeName == "br") {
                 $str .= "<br>";
             } else {
                 // Append the text untrimmed: trimming each child separately glued
                 // the words on either side of an inline element together
                 // ("Mix the <a>flour</a> and" became "Mix theflourand").
                 $str .= $child->nodeValue;
             }
         }
         foreach (explode("<br>", $str) as $line) {
             // Trim per line instead of per child node.
             $line = trim($line);
             if ($line === '') {
                 continue;
             }
             if (RecipeParser_Text::matchSectionName($line)) {
                 $line = RecipeParser_Text::formatSectionName($line);
                 $recipe->addInstructionsSection($line);
                 continue;
             }
             $line = RecipeParser_Text::formatAsOneLine($line);
             $line = RecipeParser_Text::stripLeadingNumbers($line);
             // Drop site boilerplate that is mixed into the instructions block.
             if (stripos($line, "Recipe submitted by SparkPeople") === 0) {
                 continue;
             }
             if (stripos($line, "Number of Servings:") === 0) {
                 continue;
             }
             $recipe->appendInstruction($line);
         }
     }
     return $recipe;
 }
Ejemplo n.º 19
0
 /**
  * Parse a page marked up with "hrecipe" / "hRecipe" class names.
  *
  * Finds the first element carrying the "hrecipe" (or, failing that,
  * "hRecipe") class and reads title, summary, credits, photo, yield and times
  * from inside it. Ingredients and instructions are looked up document-wide.
  * When no such element exists an empty recipe is returned.
  *
  * @param string $html Page HTML.
  * @param string $url  Page URL; used to make the photo URL absolute.
  * @return RecipeParser_Recipe
  */
 public static function parse($html, $url)
 {
     $recipe = new RecipeParser_Recipe();

     libxml_use_internal_errors(true);
     $dom = new DOMDocument();
     $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
     $xpath = new DOMXPath($dom);

     // XPath predicate: the class attribute contains %s as a whole
     // whitespace-separated token.
     $has_class = 'contains(concat(" ", normalize-space(@class), " "), " %s ")';
     $relative = './/*[' . $has_class . ']';
     $absolute = '//*[' . $has_class . ']';

     // Locate the container, trying the lowercase class name first.
     $hrecipe = null;
     foreach (array('hrecipe', 'hRecipe') as $root_class) {
         $matches = $xpath->query(sprintf($relative, $root_class));
         if ($matches->length) {
             $hrecipe = $matches->item(0);
             break;
         }
     }
     if (!$hrecipe) {
         return $recipe;
     }

     // Title
     $matches = $xpath->query(sprintf($relative, 'fn'), $hrecipe);
     if ($matches->length) {
         $recipe->title = RecipeParser_Text::formatTitle($matches->item(0)->nodeValue);
     }

     // Summary
     $matches = $xpath->query('.//*[@class="summary"]', $hrecipe);
     if ($matches->length) {
         $recipe->description = RecipeParser_Text::formatAsParagraphs($matches->item(0)->nodeValue);
     }

     // Credits
     $matches = $xpath->query('.//*[@class="author"]', $hrecipe);
     if ($matches->length) {
         $recipe->credits = RecipeParser_Text::formatCredits($matches->item(0)->nodeValue);
     }

     // Photo: "src" of the "photo" element itself, else of an <img> inside it.
     $photo_query = sprintf($relative, 'photo');
     $photo_src = "";
     $matches = $xpath->query($photo_query, $hrecipe);
     if ($matches->length) {
         $photo_src = $matches->item(0)->getAttribute('src');
     }
     if (!$photo_src) {
         $matches = $xpath->query($photo_query . '//img', $hrecipe);
         if ($matches->length) {
             $photo_src = $matches->item(0)->getAttribute('src');
         }
     }
     if ($photo_src) {
         $recipe->photo_url = RecipeParser_Text::relativeToAbsolute($photo_src, $url);
     }

     // Yield
     $matches = $xpath->query('.//*[@class="yield"]', $hrecipe);
     if ($matches->length) {
         $recipe->yield = RecipeParser_Text::formatYield($matches->item(0)->nodeValue);
     }

     // Prep and cook times: prefer an ISO 8601 "value-title" title attribute,
     // otherwise fall back to the text of the lowercase-class element.
     $time_classes = array(
         'prep' => array('prepTime', 'preptime'),
         'cook' => array('cookTime', 'cooktime'),
     );
     foreach ($time_classes as $time_key => $classes) {
         list($iso_class, $text_class) = $classes;
         $matches = $xpath->query('.//*[@class="' . $iso_class . '"]//*[@class="value-title"]', $hrecipe);
         if ($matches->length) {
             $iso = $matches->item(0)->getAttribute('title');
             $recipe->time[$time_key] = RecipeParser_Text::iso8601ToMinutes($iso);
             continue;
         }
         $matches = $xpath->query('.//*[@class="' . $text_class . '"]', $hrecipe);
         if ($matches->length) {
             $recipe->time[$time_key] = RecipeParser_Times::toMinutes($matches->item(0)->nodeValue);
         }
     }

     // Total Time / Duration: "totalTime" value-title, then "duration"
     // value-title, then the plain text of "duration".
     $total_from_iso = false;
     foreach (array('totalTime', 'duration') as $total_class) {
         $matches = $xpath->query('.//*[@class="' . $total_class . '"]//*[@class="value-title"]', $hrecipe);
         if ($matches->length) {
             $iso = $matches->item(0)->getAttribute('title');
             $recipe->time['total'] = RecipeParser_Text::iso8601ToMinutes($iso);
             $total_from_iso = true;
             break;
         }
     }
     if (!$total_from_iso) {
         $matches = $xpath->query('.//*[@class="duration"]', $hrecipe);
         if ($matches->length) {
             $recipe->time['total'] = RecipeParser_Times::toMinutes($matches->item(0)->nodeValue);
         }
     }

     // Ingredients (searched across the whole document).
     foreach ($xpath->query(sprintf($absolute, 'ingredient')) as $ingredient) {
         $text = RecipeParser_Text::formatAsOneLine(trim($ingredient->nodeValue));
         // Skip lines that contain no word-like characters (sometimes used as section dividers).
         if (!preg_match("/\\w/", $text)) {
             continue;
         }
         // Section names: delineated with dashes (e.g. "---Cake---") or ending in a colon.
         if (preg_match('/^\\-+([^\\-]{1}.*[^\\-]{1})\\-+$/', $text, $m) || preg_match('/^(.+)\\:$/', $text, $m)) {
             $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($m[1]));
             continue;
         }
         $recipe->appendIngredient($text);
     }

     // Instructions (searched across the whole document). Try per-step markup
     // first: <li> items, then elements with an "instruction" class.
     $instructions_query = sprintf($absolute, 'instructions');
     $step_queries = array(
         $instructions_query . '//li',
         $instructions_query . '//*[' . sprintf($has_class, 'instruction') . ']',
     );
     $parsed = false;
     foreach ($step_queries as $step_query) {
         $steps = $xpath->query($step_query);
         if ($steps->length) {
             RecipeParser_Text::parseInstructionsFromNodes($steps, $recipe);
             $parsed = true;
             break;
         }
     }
     // Default. Multiple instructions nodes, or one with a blob of text.
     if (!$parsed) {
         $containers = $xpath->query($instructions_query);
         if ($containers->length > 1) {
             RecipeParser_Text::parseInstructionsFromNodes($containers, $recipe);
         } elseif ($containers->length == 1) {
             RecipeParser_Text::parseInstructionsFromBlob($containers->item(0)->nodeValue, $recipe);
         }
     }

     return $recipe;
 }
Ejemplo n.º 20
0
 /**
  * Parse a recipe page by taking the schema.org microdata result as a baseline
  * and then overriding times, yield, ingredients, notes and description from
  * the site's "recipePart*" markup.
  *
  * Attribute lists are <li> elements whose label is read from child node 1 and
  * whose value is read from child node 3. Items that lack either child are
  * skipped, so a malformed item neither raises a warning nor overwrites a
  * previously parsed value with one derived from an empty string.
  *
  * @param string $html Page HTML.
  * @param string $url  Page URL (forwarded to the microdata parser).
  * @return mixed The recipe object produced by RecipeParser_Parser_MicrodataSchema::parse().
  */
 public static function parse($html, $url)
 {
     $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);
     libxml_use_internal_errors(true);
     $doc = new DOMDocument();
     $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
     $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
     $xpath = new DOMXPath($doc);
     // Times
     $nodes = $xpath->query('//*[@class="recipePartAttributes recipePartPrimaryAttributes"]//li');
     foreach ($nodes as $node) {
         $label_node = $node->childNodes->item(1);
         $value_node = $node->childNodes->item(3);
         // DOMNodeList::item() returns null for an out-of-range index.
         if (!$label_node || !$value_node) {
             continue;
         }
         $label = trim($label_node->nodeValue);
         $line = trim($value_node->nodeValue);
         if ($label === "Prep Time") {
             $recipe->time['prep'] = RecipeParser_Times::toMinutes($line);
         } elseif ($label === "Total Time") {
             $recipe->time['total'] = RecipeParser_Times::toMinutes($line);
         }
     }
     // Yield
     $nodes = $xpath->query('//*[@class="recipePartAttributes recipePartSecondaryAttributes"]//li');
     foreach ($nodes as $node) {
         $label_node = $node->childNodes->item(1);
         $value_node = $node->childNodes->item(3);
         if (!$label_node || !$value_node) {
             continue;
         }
         if (trim($label_node->nodeValue) === "Servings") {
             $line = trim($value_node->nodeValue);
             $recipe->yield = RecipeParser_Text::formatYield($line);
         }
     }
     // Ingredients: rebuilt per group; a group's <h2>, when present, names the section.
     $recipe->resetIngredients();
     $groups = $xpath->query('//*[@class="recipePartIngredientGroup"]');
     foreach ($groups as $group) {
         $nodes = $xpath->query('.//h2', $group);
         if ($nodes->length) {
             $line = $nodes->item(0)->nodeValue;
             $line = RecipeParser_Text::formatSectionName($line);
             $recipe->addIngredientsSection($line);
         }
         $nodes = $xpath->query('.//*[@itemprop="ingredients"]', $group);
         foreach ($nodes as $node) {
             $line = $node->nodeValue;
             $line = RecipeParser_Text::formatAsOneLine($line);
             $recipe->appendIngredient($line);
         }
     }
     // Notes / footnotes: one paragraph per "recipePartTipsInfo" block.
     $notes = array();
     $nodes = $xpath->query('//div[@class="recipePartTipsInfo"]');
     foreach ($nodes as $node) {
         $line = trim($node->nodeValue);
         $notes[] = $line;
     }
     $recipe->notes = implode("\n\n", $notes);
     $recipe->notes = RecipeParser_Text::formatAsParagraphs($recipe->notes);
     // Fix description: strip the "Servings # N" fragment from it.
     $recipe->description = trim(preg_replace("/Servings \\# \\d+/", "", $recipe->description));
     return $recipe;
 }
Ejemplo n.º 21
0
 /**
  * Parse a New York Times recipe page.
  *
  * Two page layouts are handled: URLs containing "www.nytimes.com/recipes/"
  * (the "Recipes" section) and everything else (Dining section articles).
  * Both start from an empty recipe and the same DOM/XPath setup.
  *
  * @param string $html Page HTML.
  * @param string $url  Page URL; selects the layout and is passed along when
  *                     formatting the photo URL.
  * @return RecipeParser_Recipe
  */
 public static function parse($html, $url)
 {
     $recipe = new RecipeParser_Recipe();
     libxml_use_internal_errors(true);
     $dom = new DOMDocument();
     $dom->loadHTML('<?xml encoding="UTF-8">' . mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
     $xpath = new DOMXPath($dom);

     $is_recipes_section = strpos($url, "www.nytimes.com/recipes/") !== false;

     if ($is_recipes_section) {
         //
         // "RECIPES" SECTION
         //

         // Title
         $found = $xpath->query('//h1[@class="recipe-title recipeName"]');
         if ($found->length) {
             $recipe->title = RecipeParser_Text::formatTitle($found->item(0)->nodeValue);
         }

         // Yield
         $found = $xpath->query('//*[@itemprop="recipeYield"]');
         if ($found->length) {
             $recipe->yield = RecipeParser_Text::formatYield($found->item(0)->nodeValue);
         }

         // Ingredients: <h3> elements are section headings (a bare
         // "Ingredients" heading is ignored); the children of any other
         // element are appended as ingredients.
         foreach ($xpath->query('//div[@class="ingredientsGroup"]/*') as $element) {
             if ($element->nodeName != "h3") {
                 foreach ($element->childNodes as $item) {
                     $recipe->appendIngredient(trim($item->nodeValue));
                 }
                 continue;
             }
             $heading = trim($element->nodeValue);
             if (!preg_match('/^Ingredients:?$/i', $heading)) {
                 $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($heading));
             }
         }

         // Instructions
         foreach ($xpath->query('//*[@itemprop="recipeInstructions"]/dd') as $step) {
             $recipe->appendInstruction(RecipeParser_Text::formatAsOneLine($step->nodeValue));
         }

         // Notes
         if (!$recipe->notes) {
             $found = $xpath->query('//div[@class="yieldNotesGroup"]//*[@class="note"]');
             if ($found->length) {
                 $text = trim($found->item(0)->nodeValue);
                 $text = preg_replace("/^Notes?:?\\s*/i", '', $text);
                 $recipe->notes = trim($text);
             }
         }

         return $recipe;
     }

     //
     // DINING SECTION RECIPES
     //

     // Title
     $found = $xpath->query('//div[@id = "article"]//h1');
     if ($found->length) {
         $recipe->title = trim($found->item(0)->nodeValue);
     }

     // Time and Yield are plain paragraphs starting with "Yield" / "Time".
     foreach ($xpath->query('//div[@id = "article"]//p') as $paragraph) {
         $text = trim($paragraph->nodeValue);
         if (preg_match('/^Yield:? (.+)/', $text, $m)) {
             $recipe->yield = RecipeParser_Text::formatYield($m[1]);
             continue;
         }
         if (preg_match('/^Time:? (.+)/', $text, $m)) {
             // Drop a leading "About" and anything from " plus" onwards.
             $duration = trim($m[1]);
             $duration = preg_replace('/About (.+)/', '$1', $duration);
             $duration = preg_replace('/(.+) plus.*/', '$1', $duration);
             $recipe->time['total'] = RecipeParser_Times::toMinutes($duration);
         }
     }

     // Ingredients: a non-empty line that equals its own uppercase form is a section name.
     foreach ($xpath->query('//div[@class="recipeIngredientsList"]/p') as $paragraph) {
         $text = trim($paragraph->nodeValue);
         if ($text && $text == strtoupper($text)) {
             $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($text));
         } else {
             $recipe->appendIngredient($text);
         }
     }

     // Instructions and notes share the article body paragraphs.
     $paragraphs = $xpath->query('//div[@class="articleBody"]//p');
     if (!$paragraphs->length) {
         $paragraphs = $xpath->query('//div[@id="articleBody"]//p');
     }
     $collected_notes = '';
     $inside_notes = false;
     foreach ($paragraphs as $paragraph) {
         $text = trim($paragraph->nodeValue);
         // Skip some of the useless lines
         if (preg_match('/^(Adapted from|Time|Yield)/i', $text)) {
             continue;
         }
         // Instructions start with line numbers (until the notes section begins).
         if (!$inside_notes && preg_match('/^\\d+\\./', $text)) {
             $recipe->appendInstruction(RecipeParser_Text::stripLeadingNumbers($text));
             continue;
         }
         // A "Note(s)" prefix opens the notes section; every later line belongs to it.
         if (preg_match('/^Notes?:?(.*)/i', $text, $m)) {
             $inside_notes = true;
             $note = trim($m[1]);
         } elseif ($inside_notes) {
             $note = $text;
         } else {
             $note = '';
         }
         if ($note) {
             $collected_notes .= $note . "\n\n";
         }
     }
     if ($collected_notes) {
         // Some unnecessary spaces
         $collected_notes = str_replace("  ", " ", $collected_notes);
         $recipe->notes = trim($collected_notes);
     }

     // Photo: swap the inline-article image for its "popup" variant.
     $found = $xpath->query('//div[@class="image"]//img');
     if ($found->length) {
         $src = $found->item(0)->getAttribute('src');
         $src = str_replace('-articleInline.jpg', '-popup.jpg', $src);
         $recipe->photo_url = RecipeParser_Text::formatPhotoUrl($src, $url);
     }

     return $recipe;
 }
Ejemplo n.º 22
0
 /**
  * About.com-specific parser: uses the schema.org microdata result as a
  * baseline, then overrides title, credits, ingredients and instructions.
  *
  * Note that the HTML handed to the microdata parser is the copy that has
  * already been converted to HTML entities here.
  *
  * @param string $html Page HTML.
  * @param string $url  Page URL (forwarded to the microdata parser).
  * @return mixed The recipe object produced by RecipeParser_Parser_MicrodataSchema::parse().
  */
 public static function parse($html, $url)
 {
     // Turn off libxml errors to prevent mismatched tag warnings.
     libxml_use_internal_errors(true);
     $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
     $doc = new DOMDocument();
     $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
     $xpath = new DOMXPath($doc);
     $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);
     // OVERRIDES FOR ABOUT.COM
     // Title
     $nodes = $xpath->query('//*[@itemprop="headline name"]');
     if ($nodes->length) {
         $value = trim($nodes->item(0)->nodeValue);
         $recipe->title = RecipeParser_Text::formatTitle($value);
     }
     // Credits
     $nodes = $xpath->query('//*[@itemprop="author"]//*[@itemprop="name"]');
     if ($nodes->length) {
         $line = $nodes->item(0)->nodeValue;
         $recipe->credits = RecipeParser_Text::formatCredits($line . ", About.com");
     }
     // Ingredients
     $recipe->resetIngredients();
     $nodes = $xpath->query('//*[@itemprop="ingredients"]');
     foreach ($nodes as $node) {
         $value = $node->nodeValue;
         $value = RecipeParser_Text::formatAsOneLine($value);
         // An ingredient whose first child is <strong> or <b> is a section name.
         // firstChild is null for an empty element, so guard before reading
         // nodeName instead of dereferencing a missing node.
         $first_child = $node->firstChild;
         $first_name = $first_child ? $first_child->nodeName : '';
         if (RecipeParser_Text::matchSectionName($value) || $first_name == "strong" || $first_name == "b") {
             $value = RecipeParser_Text::formatSectionName($value);
             $recipe->addIngredientsSection($value);
         } else {
             $recipe->appendIngredient($value);
         }
     }
     // Instructions
     $recipe->resetInstructions();
     $nodes = $xpath->query('//div[@itemprop="recipeInstructions"]');
     foreach ($nodes as $node) {
         $text = trim($node->nodeValue);
         $lines = preg_split("/[\n\r]+/", $text);
         // Walk the lines last-to-first: when several lines start with "Makes ",
         // the earliest one is assigned last and therefore ends up as the yield.
         for ($i = count($lines) - 1; $i >= 0; $i--) {
             $lines[$i] = trim($lines[$i]);
             // Remove ends of lines that have the word "recipes" squashed up against
             // another word, which seems to happen with long lists of related
             // recipe links.
             // Remove lines that have the phrase "Xxxxx Recipes and More".
             // Remove lines that have the phrase "Xxxxx Recipes | Xxxxx".
             // Remove mentions of newsletters.
             $lines[$i] = preg_replace("/(.*)recipes\\w/i", "\$1", $lines[$i]);
             $lines[$i] = preg_replace("/(.*)More .* Recipes.*/", "\$1", $lines[$i]);
             $lines[$i] = preg_replace("/(.*)Recipes and More.*/", "\$1", $lines[$i]);
             $lines[$i] = preg_replace("/(.*)Recipes \\| .*/", "\$1", $lines[$i]);
             $lines[$i] = preg_replace("/(.*)Recipe Newsletter.*/", "\$1", $lines[$i]);
             // Look for a line in the instructions that looks like a yield and
             // blank it out so it is not emitted as an instruction.
             if (strpos($lines[$i], "Makes ") === 0) {
                 $recipe->yield = substr($lines[$i], 6);
                 $lines[$i] = '';
             }
         }
         foreach ($lines as $line) {
             $line = trim($line);
             if (empty($line)) {
                 continue;
             }
             if (strtolower($line) == "preparation") {
                 continue;
             }
             // Match section names that read something like "---For the cake: Raise the oven temperature..."
             if (preg_match("/^(?:-{2,})?For the (.+)\\: (.*)\$/i", $line, $m)) {
                 $section = $m[1];
                 $section = RecipeParser_Text::formatSectionName($section);
                 $recipe->addInstructionsSection($section);
                 // Reset the value of $line, without the section name.
                 $line = ucfirst($m[2]);
             }
             $recipe->appendInstruction($line);
         }
     }
     return $recipe;
 }
 public static function parse($html, $url)
 {
     $recipe = new RecipeParser_Recipe();
     libxml_use_internal_errors(true);
     $doc = new DOMDocument();
     $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
     $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
     $xpath = new DOMXPath($doc);
     // Title
     $nodes = $xpath->query('//*[@property="v:name"]');
     if ($nodes->length) {
         $recipe->title = trim($nodes->item(0)->nodeValue);
     }
     // Summary
     $nodes = $xpath->query('//*[@property="v:summary"]');
     if ($nodes->length) {
         $value = trim($nodes->item(0)->nodeValue);
         $recipe->description = $value;
     }
     // Times
     $searches = array('v:prepTime' => 'prep', 'v:cookTime' => 'cook', 'v:totalTime' => 'total');
     foreach ($searches as $itemprop => $time_key) {
         $nodes = $xpath->query('//*[@property="' . $itemprop . '"]');
         if ($nodes->length) {
             if ($value = $nodes->item(0)->getAttribute('content')) {
                 $value = RecipeParser_Text::iso8601ToMinutes($value);
             } else {
                 $value = trim($nodes->item(0)->nodeValue);
                 $value = RecipeParser_Times::toMinutes($value);
             }
             if ($value) {
                 $recipe->time[$time_key] = $value;
             }
         }
     }
     // Yield
     $nodes = $xpath->query('//*[@property="v:yield"]');
     if ($nodes->length) {
         $line = trim($nodes->item(0)->nodeValue);
         $line = preg_replace('/\\s+/', ' ', $line);
         $recipe->yield = RecipeParser_Text::formatYield($line);
     }
     // Ingredients
     $nodes = null;
     // (data-vocabulary)
     $nodes = $xpath->query('//*[@rel="v:ingredient"]');
     foreach ($nodes as $node) {
         $value = $node->nodeValue;
         $value = RecipeParser_Text::formatAsOneLine($value);
         if (empty($value)) {
             continue;
         }
         if (RecipeParser_Text::matchSectionName($value)) {
             $value = RecipeParser_Text::formatSectionName($value);
             $recipe->addIngredientsSection($value);
         } else {
             $recipe->appendIngredient($value);
         }
     }
     // Instructions
     $found = false;
     // Some sites will use an "instruction" class for each line.
     if (!$found) {
         $nodes = $xpath->query('//*[@property="v:instructions"]//*[@property="v:instruction"]');
         if ($nodes->length) {
             RecipeParser_Text::parseInstructionsFromNodes($nodes, $recipe);
             $found = true;
         }
     }
     // Look for markup that uses <li>, <p> or other tags for each instruction.
     $search_sub_nodes = array("p", "li");
     while (!$found && ($tag = array_pop($search_sub_nodes))) {
         $nodes = $xpath->query('//*[@property="v:instructions"]//' . $tag);
         if ($nodes->length) {
             RecipeParser_Text::parseInstructionsFromNodes($nodes, $recipe);
             $found = true;
         }
     }
     // Either multiple instrutions nodes, or one node with a blob of text.
     if (!$found) {
         $nodes = $xpath->query('//*[@property="v:instructions"]');
         if ($nodes->length > 1) {
             // Multiple nodes
             RecipeParser_Text::parseInstructionsFromNodes($nodes, $recipe);
             $found = true;
         } else {
             if ($nodes->length == 1) {
                 // Blob
                 $str = $nodes->item(0)->nodeValue;
                 RecipeParser_Text::parseInstructionsFromBlob($str, $recipe);
                 $found = true;
             }
         }
     }
     // Photo
     $photo_url = "";
     $nodes = $xpath->query('//*[@rel="v:photo"]');
     if ($nodes->length) {
         $photo_url = $nodes->item(0)->getAttribute('src');
     }
     if (!$photo_url) {
         // for <img> as sub-node of rel="v:photo"
         $nodes = $xpath->query('//*[@rel="v:photo"]//img');
         if ($nodes->length) {
             $photo_url = $nodes->item(0)->getAttribute('src');
         }
     }
     if ($photo_url) {
         $recipe->photo_url = RecipeParser_Text::formatPhotoUrl($photo_url, $url);
     }
     // Credits
     $nodes = $xpath->query('//*[@property="v:author"]');
     if ($nodes->length) {
         $line = $nodes->item(0)->nodeValue;
         $recipe->credits = RecipeParser_Text::formatCredits($line);
     }
     return $recipe;
 }
Ejemplo n.º 24
0
 /**
  * Parse an About.com recipe page.
  *
  * Runs the generic microformat parser first, then overrides the prep/cook
  * times and the instructions using About.com-specific markup.
  *
  * @param string $html Page markup to parse.
  * @param string $url  Passed through to RecipeParser_Parser_Microformat::parse().
  * @return mixed The recipe object returned by the microformat parser, with
  *               the overrides below applied.
  */
 public static function parse($html, $url)
 {
     // Keep libxml quiet about mismatched tags in real-world markup.
     libxml_use_internal_errors(true);
     $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
     $doc = new DOMDocument();
     $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
     $xpath = new DOMXPath($doc);

     // The microformat parser is handed the entity-encoded markup.
     $recipe = RecipeParser_Parser_Microformat::parse($html, $url);

     // OVERRIDES FOR ABOUT.COM

     // Prep and cook times are read from <h3> headings in the article body.
     // Total time is provided as part of the microformat markup.
     $time_patterns = array(
         'prep' => "/prep time\\:(.+)/i",
         'cook' => "/cook time\\:(.+)/i",
     );
     foreach ($xpath->query('//div[@id = "articlebody"]/h3') as $heading) {
         $text = trim(preg_replace('/[\\s\\"]+/', ' ', $heading->nodeValue));
         foreach ($time_patterns as $time_key => $pattern) {
             if (preg_match($pattern, $text, $m)) {
                 $recipe->time[$time_key] = RecipeParser_Times::toMinutes($m[1]);
                 // A heading is either a prep time or a cook time, never both.
                 break;
             }
         }
     }

     // Instructions
     $recipe->resetInstructions();

     // Applied in this order to every instruction line:
     //  - cut at the word "recipes" squashed up against another word (long
     //    lists of related recipe links),
     //  - cut at "More Xxxxx Recipes", "Recipes and More", "Recipes | Xxxxx",
     //  - cut at mentions of the recipe newsletter.
     $cleanup_patterns = array(
         "/(.*)recipes\\w/i",
         "/(.*)More .* Recipes.*/",
         "/(.*)Recipes and More.*/",
         "/(.*)Recipes \\| .*/",
         "/(.*)Recipe Newsletter.*/",
     );

     foreach ($xpath->query('//div[@class = "instructions"]') as $block) {
         $lines = preg_split("/[\n\r]+/", trim($block->nodeValue));

         // First pass, last line to first: clean each line and pull out a
         // yield. Walking backwards means that when several lines start with
         // "Makes ", the earliest one is the value that sticks.
         $idx = count($lines) - 1;
         while ($idx >= 0) {
             $cleaned = trim($lines[$idx]);
             foreach ($cleanup_patterns as $pattern) {
                 $cleaned = preg_replace($pattern, "\$1", $cleaned);
             }
             if (strpos($cleaned, "Makes ") === 0) {
                 // Looks like a yield; record it and blank the line out.
                 $recipe->yield = substr($cleaned, 6);
                 $cleaned = '';
             }
             $lines[$idx] = $cleaned;
             $idx--;
         }

         // Second pass, in document order: emit sections and instructions.
         foreach ($lines as $text) {
             $text = trim($text);
             if (empty($text)) {
                 continue;
             }
             // Section names read something like
             // "---For the cake: Raise the oven temperature..."
             if (preg_match("/^(?:-{2,})?For the (.+)\\: (.*)\$/i", $text, $m)) {
                 $recipe->addInstructionsSection(RecipeParser_Text::formatSectionName($m[1]));
                 // Keep only the text that follows the section name.
                 $text = ucfirst($m[2]);
             }
             $recipe->appendInstruction($text);
         }
     }

     return $recipe;
 }
 /**
  * Parse a recipe page by scraping site-specific markup.
  *
  * Reads the title (div#rightCol > h1), yield (h4.detailHeader), notes
  * (div.dek), ingredients (ul.recipe_ingredients), instructions
  * (ol.recipe_instructions) and photo (img.recipeImg, falling back to
  * img#splashImage).
  *
  * @param string $html Page markup to parse.
  * @param string $url  Page URL, handed to RecipeParser_Text::formatPhotoUrl()
  *                     together with the photo's src attribute.
  * @return RecipeParser_Recipe
  */
 public static function parse($html, $url)
 {
     $recipe = new RecipeParser_Recipe();

     // Keep libxml quiet about mismatched tags in real-world markup.
     libxml_use_internal_errors(true);
     $doc = new DOMDocument();
     $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
     $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
     $xpath = new DOMXPath($doc);

     // Title
     $nodes = $xpath->query('//div[@id="rightCol"]/h1');
     if ($nodes->length) {
         $recipe->title = trim($nodes->item(0)->nodeValue);
     }

     // Yield
     $nodes = $xpath->query('//h4[@class="detailHeader"]');
     if ($nodes->length) {
         $recipe->yield = RecipeParser_Text::formatYield(trim($nodes->item(0)->nodeValue));
     }

     // Notes
     $nodes = $xpath->query('//div[@class="dek"]');
     if ($nodes->length) {
         $recipe->notes = trim($nodes->item(0)->nodeValue);
     }

     // Ingredients
     foreach ($xpath->query('//ul[@class="recipe_ingredients"]/li') as $node) {
         $line = trim($node->nodeValue);

         // Section names have class="ingredientSectionTitle",
         // ingredients themselves have no attributes at all.
         if ($node->hasAttributes()) {
             $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($line));
             continue;
         }

         // Add spaces between quantities and units
         $line = preg_replace('/(\\d+)([A-Za-z]+)/', "\$1 \$2", $line);
         // Remove spaces before commas (not sure why this happens in their HTML)
         $line = str_replace(' ,', ',', $line);
         // Condense runs of spaces. A single str_replace('  ', ' ') pass only
         // halves each run, so three or more spaces would survive as two.
         $line = preg_replace('/ {2,}/', ' ', $line);
         $recipe->appendIngredient($line);
     }

     // Instructions
     foreach ($xpath->query('//ol[@class="recipe_instructions"]/li') as $node) {
         $line = RecipeParser_Text::stripLeadingNumbers(trim($node->nodeValue));
         $recipe->appendInstruction($line);
     }

     // Photo: use the first location that yields an <img>.
     $photo_queries = array('//img[@class="recipeImg"]', '//img[@id="splashImage"]');
     foreach ($photo_queries as $query) {
         $nodes = $xpath->query($query);
         if ($nodes->length) {
             $photo_url = $nodes->item(0)->getAttribute('src');
             $recipe->photo_url = RecipeParser_Text::formatPhotoUrl($photo_url, $url);
             break;
         }
     }

     return $recipe;
 }
Ejemplo n.º 26
0
 /**
  * Parse a recipe page, starting from the data-vocabulary microdata parser
  * and then rebuilding ingredients and instructions from page-specific markup.
  *
  * Ingredients come from the "IngredientSet" elements under #v_ingredients;
  * instructions come from the first itemprop="instructions" element, whose
  * steps are separated by <br> tags and whose section names are in <b> tags.
  *
  * @param string $html Page markup to parse.
  * @param string $url  Passed through to
  *                     RecipeParser_Parser_MicrodataDataVocabulary::parse().
  * @return mixed The recipe object returned by the microdata parser, with the
  *               overrides below applied.
  */
 public static function parse($html, $url)
 {
     $recipe = RecipeParser_Parser_MicrodataDataVocabulary::parse($html, $url);

     // Keep libxml quiet about mismatched tags in real-world markup.
     libxml_use_internal_errors(true);
     $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
     $doc = new DOMDocument();
     $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
     $xpath = new DOMXPath($doc);

     // Ingredients: each set has an optional heading followed by its lines.
     $recipe->resetIngredients();
     $sets = $xpath->query('//*[@id="v_ingredients"]//*[@id="IngredientSet"]');
     foreach ($sets as $set) {
         $headings = $xpath->query('.//*[@id="IngredientHeading"]', $set);
         if ($headings->length) {
             $title = RecipeParser_Text::formatSectionName($headings->item(0)->nodeValue);
             $recipe->addIngredientsSection($title);
         }
         foreach ($xpath->query('.//*[@id="IngredientLine"]', $set) as $item) {
             $recipe->appendIngredient(RecipeParser_Text::formatAsOneLine($item->nodeValue));
         }
     }

     // Instructions
     $recipe->resetInstructions();
     $nodes = $xpath->query('//*[@itemprop="instructions"]');
     if (!$nodes->length) {
         return $recipe;
     }

     // The instructions are one piece of HTML with <br> tags between steps.
     // nodeValue alone would lose those breaks, so the children are walked
     // one by one: <br> nodes are written back as a literal "<br>" separator,
     // <b> nodes are tagged as section names, and anything else (#text,
     // <a href="">, ...) contributes its nodeValue unless it is pure
     // whitespace.
     $buffer = "";
     foreach ($nodes->item(0)->childNodes as $child) {
         switch ($child->nodeName) {
             case "br":
                 $buffer .= "<br>";
                 break;
             case "b":
                 $buffer .= "SECTION:" . $child->nodeValue;
                 break;
             default:
                 $text = $child->nodeValue;
                 if (preg_match("/\\S/", $text)) {
                     $buffer .= $text;
                 }
         }
     }

     foreach (explode("<br>", $buffer) as $piece) {
         if (strpos($piece, "SECTION:") === 0) {
             // Drop the 8-character "SECTION:" tag added above.
             $title = RecipeParser_Text::formatSectionName(substr($piece, 8));
             $recipe->addInstructionsSection($title);
             continue;
         }

         $step = RecipeParser_Text::formatAsOneLine($piece);
         $step = RecipeParser_Text::stripLeadingNumbers($step);
         // Lines starting with "yield:" are not instructions.
         if (stripos($step, "yield:") === 0) {
             continue;
         }
         $recipe->appendInstruction($step);
     }

     return $recipe;
 }
 /**
  * Parse a recipe page whose content lives in a loosely structured
  * div#detail_content.
  *
  * The title is the <h1> and the yield is p#yield. Notes, ingredients and
  * instructions are recovered by walking the direct children of the container
  * in document order: <h5> headings reading "ingredients" / "instructions"
  * switch the current mode, and the <ul> lists (plus <h6> section names in
  * ingredients mode) are collected according to that mode.
  *
  * @param string $html Page markup to parse.
  * @param string $url  Page URL, handed to RecipeParser_Text::formatPhotoUrl()
  *                     together with the photo's src attribute.
  * @return RecipeParser_Recipe
  */
 public static function parse($html, $url)
 {
     // Turn off libxml errors to prevent mismatched tag warnings.
     libxml_use_internal_errors(true);
     $doc = new DOMDocument();
     $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
     $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
     $xpath = new DOMXPath($doc);
     $recipe = new RecipeParser_Recipe();

     // Title
     $nodes = $xpath->query('//div[@id="detail_content"]/h1');
     if ($nodes->length) {
         $recipe->title = trim($nodes->item(0)->nodeValue);
     }

     // Yield
     $nodes = $xpath->query('//p[@id="yield"]');
     if ($nodes->length) {
         $recipe->yield = RecipeParser_Text::formatYield(trim($nodes->item(0)->nodeValue));
     }

     // Notes, instructions, and ingredients are not very well structured.
     $found_ingredients = false;
     $found_instructions = false;
     foreach ($xpath->query('//div[@id="detail_content"]/*') as $node) {
         $tag = $node->nodeName;

         // Notes -- weird, but this is the only <p> that doesn't have
         // attributes on the tag.
         if ($tag == 'p' && !$node->hasAttributes()) {
             $recipe->notes = trim($node->nodeValue);
             continue;
         }

         // Mode markers
         if ($tag == 'h5') {
             $heading = strtolower(trim($node->nodeValue));
             if ($heading == 'ingredients') {
                 $found_ingredients = true;
                 continue;
             }
             if ($heading == 'instructions') {
                 $found_instructions = true;
             }
         }

         // Ingredients: only between the two markers.
         if ($found_ingredients && !$found_instructions) {
             if ($tag == 'h6') {
                 $line = RecipeParser_Text::formatSectionName(trim($node->nodeValue));
                 $recipe->addIngredientsSection($line);
             } elseif ($tag == 'ul') {
                 foreach ($node->childNodes as $sub) {
                     $line = trim($sub->nodeValue);
                     // Add spaces between quantities and units
                     $line = preg_replace('/(\\d+)([A-Za-z]+)/', "\$1 \$2", $line);
                     // Remove spaces before commas (not sure why this happens in their HTML)
                     $line = str_replace(' ,', ',', $line);
                     // Condense runs of spaces. A single
                     // str_replace('  ', ' ') pass only halves each run, so
                     // three or more spaces would survive as two.
                     $line = preg_replace('/ {2,}/', ' ', $line);
                     $recipe->appendIngredient($line);
                 }
             }
         }

         // Instructions: every <ul> after the "instructions" marker.
         if ($found_instructions && $tag == 'ul') {
             foreach ($node->childNodes as $sub) {
                 $line = RecipeParser_Text::stripLeadingNumbers(trim($sub->nodeValue));
                 $recipe->appendInstruction($line);
             }
         }
     }

     // Photo
     $nodes = $xpath->query('//img[@class="detail"]');
     if ($nodes->length) {
         $photo_url = $nodes->item(0)->getAttribute('src');
         $recipe->photo_url = RecipeParser_Text::formatPhotoUrl($photo_url, $url);
     }

     return $recipe;
 }
Ejemplo n.º 28
0
 /**
  * Parse a Bravo TV recipe page by scraping its markup.
  *
  * Extracts the title, prep/cook/total times, description, yield, notes,
  * ingredients, instructions, credits ("<chef>, <show>") and photo.
  *
  * @param string $html Page markup to parse.
  * @param string $url  Page URL, handed to RecipeParser_Text::formatPhotoUrl()
  *                     together with the photo's src attribute.
  * @return RecipeParser_Recipe
  */
 public static function parse($html, $url)
 {
     $recipe = new RecipeParser_Recipe();

     // Keep libxml quiet about mismatched tags in real-world markup.
     libxml_use_internal_errors(true);
     $doc = new DOMDocument();
     $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
     $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
     $xpath = new DOMXPath($doc);

     // Title
     $nodes = $xpath->query('//h3[@class = "title"]');
     if ($nodes->length) {
         $recipe->title = trim($nodes->item(0)->nodeValue);
     }

     // Cook times: each metadata <li> pairs an <h5> label with a <p> value.
     $nodes = $xpath->query('//div[@class = "recipe-metadata"]/ul/li');
     foreach ($nodes as $node) {
         $key = null;
         $value = null;
         foreach ($node->childNodes as $sub_node) {
             if ($sub_node->nodeName == 'h5') {
                 $key = trim($sub_node->nodeValue);
             } elseif ($sub_node->nodeName == 'p') {
                 $value = trim($sub_node->nodeValue);
             }
         }
         if ($key == 'Total Time:') {
             $recipe->time['total'] = RecipeParser_Times::toMinutes(self::cleanupTime($value));
         } elseif ($key == 'Prep Time:') {
             $recipe->time['prep'] = RecipeParser_Times::toMinutes(self::cleanupTime($value));
         }
     }

     // Times given as <dd> elements override the metadata list above.
     $dd_times = array(
         'preptime' => 'prep',
         'cooktime' => 'cook',
         'duration totaltime special' => 'total',
     );
     foreach ($dd_times as $class => $time_key) {
         $node_list = $xpath->query('//dd[@class = "' . $class . '"]');
         if ($node_list->length) {
             $recipe->time[$time_key] = RecipeParser_Times::toMinutes($node_list->item(0)->nodeValue);
         }
     }

     // Ingredients, Yield, Description, Notes, etc.
     // The recipe body alternates <h4> section titles with
     // <div class="section"> content; the most recent title decides how the
     // following div is interpreted.
     $nodes = $xpath->query('//div[@class = "recipe-body"]/*');
     $section_title = null;
     foreach ($nodes as $node) {
         if ($node->nodeName == 'h4') {
             $section_title = trim(strtolower($node->nodeValue));
             continue;
         }

         // Only <div class="section"> nodes carry content.
         if ($node->nodeName != 'div' || $node->getAttribute('class') != 'section') {
             continue;
         }

         if (!$section_title) {
             // Description is the first text, before any section title.
             $value = $node->nodeValue;
             $value = preg_replace("/^(Drink\\:|Top Chef).*\$/m", '', $value);
             $value = str_replace("\n\n", "\n", $value);
             $recipe->description = trim($value);
         } elseif ($section_title == 'yield') {
             $recipe->yield = trim($node->nodeValue);
         } elseif ($section_title == 'notes') {
             $value = trim($node->nodeValue);
             $recipe->notes = str_replace("\n\n", "\n", $value);
         } elseif ($section_title == 'ingredients') {
             foreach ($node->childNodes as $sub_node) {
                 if ($sub_node->nodeName == 'h5') {
                     $value = RecipeParser_Text::formatSectionName($sub_node->nodeValue);
                     $recipe->addIngredientsSection($value);
                 } elseif ($sub_node->nodeName == 'ul') {
                     foreach ($sub_node->childNodes as $li_node) {
                         $recipe->appendIngredient(trim($li_node->nodeValue));
                     }
                 }
             }
         } elseif ($section_title == 'directions') {
             foreach ($node->childNodes as $sub_node) {
                 $value = trim($sub_node->nodeValue);
                 // Section titles appear in all-caps or end with a colon.
                 if ($value && ($value == strtoupper($value) || preg_match('/:$/', $value))) {
                     $value = RecipeParser_Text::formatSectionName($value);
                     $recipe->addInstructionsSection($value);
                 } else {
                     $value = RecipeParser_Text::stripLeadingNumbers($value);
                     $recipe->appendInstruction($value);
                 }
             }
         }
     }

     // Source / Chef: the first <small> under a "chef" or "author" heading.
     $nodes = $xpath->query('//div[@class = "recipe-sidebar"]/div/*');
     $section_title = null;
     $chef_name = null;
     foreach ($nodes as $node) {
         if ($node->nodeName == 'h4') {
             $section_title = strtolower(trim($node->nodeValue));
             continue;
         }
         if ($node->nodeName == 'small'
             && ($section_title == 'chef' || $section_title == 'author')) {
             $chef_name = trim($node->nodeValue);
             break;
         }
     }

     // Show name defaults to the network unless the first paragraph of a
     // section names a specific show. "Top Chef Masters" must be tested
     // before "Top Chef" because the latter is a prefix of the former.
     $show_name = 'Bravo TV';
     $nodes = $xpath->query('//div[@class = "section"]/p[1]');
     if ($nodes->length) {
         $value = $nodes->item(0)->nodeValue;
         if (strpos($value, 'Top Chef Masters') !== false) {
             $show_name = 'Top Chef Masters';
         } elseif (strpos($value, 'Top Chef') !== false) {
             $show_name = 'Top Chef';
         }
     }

     // Only prefix the chef when one was actually found; otherwise the
     // credits would start with a dangling ", ".
     $recipe->credits = $chef_name ? $chef_name . ', ' . $show_name : $show_name;

     // Photo: swap the medium rendition for the original-size one.
     $nodes = $xpath->query('//div[@class = "recipe-header clearfix"]//img');
     if ($nodes->length) {
         $photo_url = $nodes->item(0)->getAttribute('src');
         $photo_url = str_replace('/medium/', '/original/', $photo_url);
         $recipe->photo_url = RecipeParser_Text::formatPhotoUrl($photo_url, $url);
     }

     return $recipe;
 }
Ejemplo n.º 29
0
 /**
  * Parse a recipe page, starting from the schema.org microdata parser and
  * then rebuilding ingredients and instructions from page-specific markup.
  *
  * Also clears the recipe photo when it is the same image as the one found
  * under a[itemprop="url"] > img[itemprop="image"], which the page uses for
  * the chef's photo.
  *
  * @param string $html Page markup to parse.
  * @param string $url  Passed through to
  *                     RecipeParser_Parser_MicrodataSchema::parse().
  * @return mixed The recipe object returned by the microdata parser, with the
  *               overrides below applied.
  */
 public static function parse($html, $url)
 {
     // Get all of the standard microdata stuff we can find.
     $recipe = RecipeParser_Parser_MicrodataSchema::parse($html, $url);

     // Turn off libxml errors to prevent mismatched tag warnings.
     libxml_use_internal_errors(true);
     $doc = new DOMDocument();
     $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
     $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
     $xpath = new DOMXPath($doc);

     // Ingredients
     $recipe->resetIngredients();
     $nodes = $xpath->query('//div[@class="col6 ingredients"]/*');
     foreach ($nodes as $node) {
         // Ingredients are only found inside <ul> <li>.
         if ($node->nodeName != 'ul') {
             continue;
         }
         foreach ($node->childNodes as $ing_node) {
             // Skip text nodes and anything else that is not an <li>.
             if ($ing_node->nodeName != 'li') {
                 continue;
             }
             $line = trim($ing_node->nodeValue);

             if ($ing_node->getAttribute("itemprop") == "ingredients") {
                 // An empty <li> would otherwise pass the all-uppercase test
                 // below ('' == strtoupper('')) and be registered as an
                 // empty section title.
                 if ($line === '') {
                     continue;
                 }
                 // Section titles might be all uppercase ingredients
                 if ($line == strtoupper($line)) {
                     $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($line));
                     continue;
                 }
                 // Drop copyright notices and "recipe follows" pointers.
                 if (stripos($line, "copyright") !== false
                     || stripos($line, "recipe follows") !== false) {
                     continue;
                 }
                 $recipe->appendIngredient(RecipeParser_Text::formatAsOneLine($line));
             } elseif ($ing_node->getAttribute("class") == "subtitle") {
                 // Explicitly marked section titles
                 $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($line));
             }
         }
     }

     // Instructions: <span> children are section names, <p> children are steps.
     $recipe->resetInstructions();
     $nodes = $xpath->query('//*[@itemprop="recipeInstructions"]/*');
     foreach ($nodes as $node) {
         if ($node->nodeName == "span") {
             $recipe->addInstructionsSection(RecipeParser_Text::formatSectionName($node->nodeValue));
         } elseif ($node->nodeName == "p") {
             $line = RecipeParser_Text::formatAsOneLine($node->nodeValue);
             // Photo credits are not instructions.
             if (!preg_match("/^Photograph/i", $line)) {
                 $recipe->appendInstruction($line);
             }
         }
     }

     // See if we've captured a chef's photo, and delete it (if so).
     if ($recipe->photo_url) {
         $nodes = $xpath->query('//a[@itemprop="url"]/img[@itemprop="image"]');
         if ($nodes->length > 0) {
             // Kept in its own variable so the $url parameter is not clobbered.
             $chef_photo_url = $nodes->item(0)->getAttribute("src");
             if ($recipe->photo_url == $chef_photo_url) {
                 $recipe->photo_url = "";
             }
         }
     }

     return $recipe;
 }
 /**
  * Parse a page marked up with data-vocabulary.org Recipe microdata.
  *
  * Locates the first element whose itemtype is
  * "http://data-vocabulary.org/Recipe" and reads title, summary, times,
  * yield, ingredients, instructions, photo and credits from it. When no such
  * element exists the recipe is returned unpopulated.
  *
  * @param string $html Page markup to parse.
  * @param string $url  Page URL, handed to
  *                     RecipeParser_Text::relativeToAbsolute() together with
  *                     the photo URL.
  * @return RecipeParser_Recipe
  */
 public static function parse($html, $url)
 {
     $recipe = new RecipeParser_Recipe();

     // Keep libxml quiet about mismatched tags in real-world markup.
     libxml_use_internal_errors(true);
     $doc = new DOMDocument();
     $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
     $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
     $xpath = new DOMXPath($doc);

     // Find the top-level node for Recipe microdata; nothing to do without it.
     $roots = $xpath->query('//*[@itemtype="http://data-vocabulary.org/Recipe"]');
     if (!$roots->length) {
         return $recipe;
     }
     $microdata = $roots->item(0);

     // Title
     $nodes = $xpath->query('.//*[@itemprop="name"]', $microdata);
     if ($nodes->length) {
         $recipe->title = RecipeParser_Text::formatTitle($nodes->item(0)->nodeValue);
     }

     // Summary
     $nodes = $xpath->query('.//*[@itemprop="summary"]', $microdata);
     if ($nodes->length) {
         $recipe->description = trim($nodes->item(0)->nodeValue);
     }

     // Times: prefer an ISO 8601 value from the datetime attribute, then the
     // content attribute, and only then fall back to the element's text.
     $time_props = array('prepTime' => 'prep', 'cookTime' => 'cook', 'totalTime' => 'total');
     foreach ($time_props as $itemprop => $time_key) {
         $nodes = $xpath->query('.//*[@itemprop="' . $itemprop . '"]', $microdata);
         if (!$nodes->length) {
             continue;
         }
         $time_node = $nodes->item(0);
         $iso = $time_node->getAttribute('datetime');
         if (!$iso) {
             $iso = $time_node->getAttribute('content');
         }
         if ($iso) {
             $minutes = RecipeParser_Text::iso8601ToMinutes($iso);
         } else {
             $minutes = RecipeParser_Times::toMinutes(trim($time_node->nodeValue));
         }
         if ($minutes) {
             $recipe->time[$time_key] = $minutes;
         }
     }

     // Yield: "yield" wins when present, otherwise "servingSize".
     $yield_text = "";
     foreach (array('yield', 'servingSize') as $itemprop) {
         $nodes = $xpath->query('.//*[@itemprop="' . $itemprop . '"]', $microdata);
         if ($nodes->length) {
             $yield_text = trim($nodes->item(0)->nodeValue);
             break;
         }
     }
     if ($yield_text) {
         $yield_text = preg_replace('/\\s+/', ' ', $yield_text);
         $recipe->yield = RecipeParser_Text::formatYield($yield_text);
     }

     // Ingredients: try the data-vocabulary property first, then two
     // non-standard layouts; the first query that matches anything is used.
     $ingredient_queries = array(
         './/*[@itemprop="ingredient"]',
         './/*[@id="ingredients"]//li',
         './/*[@class="ingredients"]//li',
     );
     $nodes = null;
     foreach ($ingredient_queries as $query) {
         $nodes = $xpath->query($query, $microdata);
         if ($nodes && $nodes->length) {
             break;
         }
     }
     foreach ($nodes as $node) {
         $text = RecipeParser_Text::formatAsOneLine($node->nodeValue);
         if (empty($text)) {
             continue;
         }
         if (RecipeParser_Text::matchSectionName($text)) {
             $recipe->addIngredientsSection(RecipeParser_Text::formatSectionName($text));
         } else {
             $recipe->appendIngredient($text);
         }
     }

     // Instructions, per-line markup first:
     //  1. <li> tags inside the instructions element,
     //  2. elements carrying an "instruction" class.
     // NOTE(review): the second query looks inside itemprop="instruction"
     // (singular) rather than "instructions" -- confirm that is intended.
     $per_line_queries = array(
         './/*[@itemprop="instructions"]//li',
         './/*[@itemprop="instruction"]//*[contains(concat(" ", normalize-space(@class), " "), " instruction ")]',
     );
     $parsed = false;
     foreach ($per_line_queries as $query) {
         $nodes = $xpath->query($query, $microdata);
         if ($nodes->length) {
             RecipeParser_Text::parseInstructionsFromNodes($nodes, $recipe);
             $parsed = true;
             break;
         }
     }
     // Otherwise: either multiple instructions nodes, or one node with a
     // blob of text.
     if (!$parsed) {
         $nodes = $xpath->query('.//*[@itemprop="instructions"]', $microdata);
         if ($nodes->length > 1) {
             RecipeParser_Text::parseInstructionsFromNodes($nodes, $recipe);
         } elseif ($nodes->length == 1) {
             RecipeParser_Text::parseInstructionsFromBlob($nodes->item(0)->nodeValue, $recipe);
         }
     }

     // Photo: the page-wide Open Graph image is tried first.
     $photo_url = "";
     $nodes = $xpath->query('//meta[@property="og:image"]');
     if ($nodes->length) {
         $photo_url = $nodes->item(0)->getAttribute('content');
     }
     if (!$photo_url) {
         // itemprop="photo" carrying the URL itself (src, else content)
         $nodes = $xpath->query('.//*[@itemprop="photo"]', $microdata);
         if ($nodes->length) {
             $photo_node = $nodes->item(0);
             if ($photo_node->hasAttribute('src')) {
                 $photo_url = $photo_node->getAttribute('src');
             } elseif ($photo_node->hasAttribute('content')) {
                 $photo_url = $photo_node->getAttribute('content');
             }
         }
     }
     if (!$photo_url) {
         // <img> as a sub-node of itemprop="photo"
         $nodes = $xpath->query('.//*[@itemprop="photo"]//img', $microdata);
         if ($nodes->length) {
             $photo_url = $nodes->item(0)->getAttribute('src');
         }
     }
     if ($photo_url) {
         $recipe->photo_url = RecipeParser_Text::relativeToAbsolute($photo_url, $url);
     }

     // Credits
     $nodes = $xpath->query('.//*[@itemprop="author"]', $microdata);
     if ($nodes->length) {
         $recipe->credits = RecipeParser_Text::formatCredits($nodes->item(0)->nodeValue);
     }

     return $recipe;
 }