<?php /*=======================================================================+ | PHP Universal Feed Parser | +------------------------------------------------------------------------/ Author : Anis uddin Ahmad <admin@ajaxray.com> Web : http://www.ajaxray.com Publish Date : March 24, 2008 LICENSE ---------------------------------------------------------------------- PHP Universal Feed Parser 1.0 - A PHP class to parse RSS 1.0, RSS 2.0 and ATOM 1.0 feed. Copyright (C) 2008 Anis uddin Ahmad <admin@ajaxray.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License (GPL) as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. To read the license please visit http://www.gnu.org/copyleft/gpl.html ======================================================================= HOW TO USE ----------------------------------------------------------------------- It's very easy to use. Just follow this 3 steps: 1. Include the file include('FeedParser.php'); 2. Create an object of FeedParser class $Parser = new FeedParser(); 3. Parse the URL you want to featch $Parser->parse('http://www.sitepoint.com/rss.php'); Done. Now you can use this functions to get various information of parsed feed: 1. $Parser->getChannels() - To get all channel elements as array 2. $Parser->getItems() - To get all feed elements as array 3. $Parser->getChannel($name) - To get a channel element by name 4. $Parser->getItem($index) - To get a feed element as array by it's index 5. $Parser->getTotalItems() - To get the number of total feed elements 6. $Parser->getFeedVersion() - To get the detected version of parsed feed 7. $Parser->getParsedUrl() - To get the parsed feed URL ======================================================================= IMPORTANT NOTES ----------------------------------------------------------------------- 1. All array keys are must be UPPERCASE 2. All dates are converted to timestamp 3. Attributes of a tag will be found under TAGNAME_ATTRS index example: Attributes of $item['GUID'] will be found as $item['GUID_ATTRS'] 4. The tags which have subtags will be an array and sub tags will be found as it's element example: IMAGE tag in RSS 2.0 ======================================================================== EXAMPLES ----------------------------------------------------------------------- To see more details and examples, please visit: http://www.ajaxray.com/blog/2008/05/02/php-universal-feed-parser-lightweight-php-class-for-parsing-rss-and-atom-feeds/ ======================================================================== */ /** * PHP Univarsel Feed Parser class * * Parses RSS 1.0, RSS2.0 and ATOM Feed * * @license GNU General Public License (GPL) * @author Anis uddin Ahmad <admin@ajaxray.com> * @link http://www.ajaxray.com/blog/2008/05/02/php-universal-feed-parser-lightweight-php-class-for-parsing-rss-and-atom-feeds/ */ class FeedParser{ private $xmlParser = null; private $insideItem = array(); // Keep track of current position in tag tree private $currentTag = null; // Last entered tag name private $currentAttr = null; // Attributes array of last entered tag private $namespaces = array( 'http://purl.org/rss/1.0/' => 'RSS 1.0', 'http://purl.org/rss/1.0/modules/content/' => 'RSS 2.0', 'http://www.w3.org/2005/Atom' => 'ATOM 1', ); // Namespaces to detact feed version private $itemTags = array('ITEM','ENTRY'); // List of tag names which holds a feed item private $channelTags = array('CHANNEL','FEED'); // List of tag names which holds all channel elements private $dateTags = array('UPDATED','PUBDATE','DC:DATE'); private $hasSubTags = array('IMAGE','AUTHOR'); // List of tag names which have sub tags private $channels = array(); private $items = array(); private $itemIndex = 0; private $url = null; // The parsed url private $version = null; // Detected feed version /** * Constructor - Initialize and set event handler functions to xmlParser */ function __construct() { $this->xmlParser = xml_parser_create(); xml_set_object($this->xmlParser, $this); xml_set_element_handler($this->xmlParser, "startElement", "endElement"); xml_set_character_data_handler($this->xmlParser, "characterData"); } /*-----------------------------------------------------------------------+ | Public functions. Use to parse feed and get informations. | +-----------------------------------------------------------------------*/ /** * Get all channel elements * * @access public * @return array - All chennels as associative array */ public function getChannels() { return $this->channels; } /** * Get all feed items * * @access public * @return array - All feed items as associative array */ public function getItems() { return $this->items; } /** * Get total number of feed items * * @access public * @return number */ public function getTotalItems() { return count($this->items); } /** * Get a feed item by index * * @access public * @param number index of feed item * @return array feed item as associative array of it's elements */ public function getItem($index) { if($index < $this->getTotalItems()) { return $this->items[$index]; } else { throw new Exception("Item index is learger then total items."); return false; } } /** * Get a channel element by name * * @access public * @param string the name of channel tag * @return string */ public function getChannel($tagName) { if(array_key_exists(strtoupper($tagName), $this->channels)) { return $this->channels[strtoupper($tagName)]; } else { throw new Exception("Channel tag $tagName not found."); return false; } } /** * Get the parsed URL * * @access public * @return string */ public function getParsedUrl() { if(empty($this->url)) { throw new Exception("Feed URL is not set yet."); return FALSE; } else { return $this->url; } } /** * Get the detected Feed version * * @access public * @return string */ public function getFeedVersion() { return $this->version; } /** * Parses a feed url * * @access public * @param srting teh feed url * @return void */ public function parse($url) { $this->url = $url; $URLContent = $this->getUrlContent(); if($URLContent) { $segments = str_split($URLContent, 4096); foreach($segments as $index=>$data) { $lastPiese = ((count($segments)-1) == $index)? true : false; xml_parse($this->xmlParser, $data, $lastPiese) or die(sprintf("XML error: %s at line %d", xml_error_string(xml_get_error_code($this->xmlParser)), xml_get_current_line_number($this->xmlParser))); } xml_parser_free($this->xmlParser); } else { die('Sorry! cannot load the feed url.'); } if(empty($this->version)) { die('Sorry! cannot detect the feed version.'); } } // End public functions ------------------------------------------------- /*-----------------------------------------------------------------------+ | Private functions. Be careful to edit them. | +-----------------------------------------------------------------------*/ /** * Load the whole contents of a RSS/ATOM page * * @access private * @return string */ private function getUrlContent() { if(empty($this->url)) { throw new Exception("URL to parse is empty!."); return false; } if($content = @file_get_contents($this->url)) { return $content; } else { $ch = curl_init(); curl_setopt($ch, CURLOPT_URL, $this->url); curl_setopt($ch, CURLOPT_HEADER, false); curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); $content = curl_exec($ch); $error = curl_error($ch); curl_close($ch); if(empty($error)) { return $content; } else { throw new Exception("Erroe occured while loading url by cURL. <br />\n" . $error) ; return false; } } } /** * Handle the start event of a tag while parsing * * @access private * @param object the xmlParser object * @param string name of currently entering tag * @param array array of attributes * @return void */ private function startElement($parser, $tagName, $attrs) { if(!$this->version) { $this->findVersion($tagName, $attrs); } array_push($this->insideItem, $tagName); $this->currentTag = $tagName; $this->currentAttr = $attrs; } /** * Handle the end event of a tag while parsing * * @access private * @param object the xmlParser object * @param string name of currently ending tag * @return void */ private function endElement($parser, $tagName) { if (in_array($tagName, $this->itemTags)) { $this->itemIndex++; } array_pop($this->insideItem); $this->currentTag = $this->insideItem[count($this->insideItem)-1]; } /** * Handle character data of a tag while parsing * * @access private * @param object the xmlParser object * @param string tag value * @return void */ private function characterData($parser, $data) { //Converting all date formats to timestamp if(in_array($this->currentTag, $this->dateTags)) { $data = strtotime($data); } if($this->inChannel()) { // If has subtag, make current element an array and assign subtags as it's element if(in_array($this->getParentTag(), $this->hasSubTags)) { if(! is_array($this->channels[$this->getParentTag()])) { $this->channels[$this->getParentTag()] = array(); } $this->channels[$this->getParentTag()][$this->currentTag] .= strip_tags($this->unhtmlentities((trim($data)))); return; } else { if(! in_array($this->currentTag, $this->hasSubTags)) { $this->channels[$this->currentTag] .= strip_tags($this->unhtmlentities((trim($data)))); } } if(!empty($this->currentAttr)) { $this->channels[$this->currentTag . '_ATTRS'] = $this->currentAttr; //If the tag has no value if(strlen($this->channels[$this->currentTag]) < 2) { //If there is only one attribute, assign the attribute value as channel value if(count($this->currentAttr) == 1) { foreach($this->currentAttr as $attrVal) { $this->channels[$this->currentTag] = $attrVal; } } //If there are multiple attributes, assign the attributs array as channel value else { $this->channels[$this->currentTag] = $this->currentAttr; } } } } elseif($this->inItem()) { // If has subtag, make current element an array and assign subtags as it's elements if(in_array($this->getParentTag(), $this->hasSubTags)) { if(! is_array($this->items[$this->itemIndex][$this->getParentTag()])) { $this->items[$this->itemIndex][$this->getParentTag()] = array(); } $this->items[$this->itemIndex][$this->getParentTag()][$this->currentTag] .= strip_tags($this->unhtmlentities((trim($data)))); return; } else { if(! in_array($this->currentTag, $this->hasSubTags)) { $this->items[$this->itemIndex][$this->currentTag] .= strip_tags($this->unhtmlentities((trim($data)))); } } if(!empty($this->currentAttr)) { $this->items[$this->itemIndex][$this->currentTag . '_ATTRS'] = $this->currentAttr; //If the tag has no value if(strlen($this->items[$this->itemIndex][$this->currentTag]) < 2) { //If there is only one attribute, assign the attribute value as feed element's value if(count($this->currentAttr) == 1) { foreach($this->currentAttr as $attrVal) { $this->items[$this->itemIndex][$this->currentTag] = $attrVal; } } //If there are multiple attributes, assign the attribute array as feed element's value else { $this->items[$this->itemIndex][$this->currentTag] = $this->currentAttr; } } } } } /** * Find out the feed version * * @access private * @param string name of current tag * @param array array of attributes * @return void */ private function findVersion($tagName, $attrs) { $namespace = array_values($attrs); foreach($this->namespaces as $value =>$version) { if(in_array($value, $namespace)) { $this->version = $version; return; } } } private function getParentTag() { return $this->insideItem[count($this->insideItem) - 2]; } /** * Detect if current position is in channel element * * @access private * @return bool */ private function inChannel() { if($this->version == 'RSS 1.0') { if(in_array('CHANNEL', $this->insideItem) && $this->currentTag != 'CHANNEL') return TRUE; } elseif($this->version == 'RSS 2.0') { if(in_array('CHANNEL', $this->insideItem) && !in_array('ITEM', $this->insideItem) && $this->currentTag != 'CHANNEL') return TRUE; } elseif($this->version == 'ATOM 1') { if(in_array('FEED', $this->insideItem) && !in_array('ENTRY', $this->insideItem) && $this->currentTag != 'FEED') return TRUE; } return FALSE; } /** * Detect if current position is in Item element * * @access private * @return bool */ private function inItem() { if($this->version == 'RSS 1.0' || $this->version == 'RSS 2.0') { if(in_array('ITEM', $this->insideItem) && $this->currentTag != 'ITEM') return TRUE; } elseif($this->version == 'ATOM 1') { if(in_array('ENTRY', $this->insideItem) && $this->currentTag != 'ENTRY') return TRUE; } return FALSE; } //This function is taken from lastRSS /** * Replace HTML entities &something; by real characters * * * @access private * @author Vojtech Semecky <webmaster@oslab.net> * @link http://lastrss.oslab.net/ * @param string * @return string */ private function unhtmlentities($string) { // Get HTML entities table $trans_tbl = get_html_translation_table (HTML_ENTITIES, ENT_QUOTES); // Flip keys<==>values $trans_tbl = array_flip ($trans_tbl); // Add support for ' entity (missing in HTML_ENTITIES) $trans_tbl += array(''' => "'"); // Replace entities by values return strtr ($string, $trans_tbl); } } //End class FeedParser