PHP Classes

File: FeedFinder.php

Recommend this page to a friend!
  Classes of Manel Zaera  >  Feed Finder  >  FeedFinder.php  >  Download  
File: FeedFinder.php
Role: Class source
Content type: text/plain
Description: FeedFinder class file
Class: Feed Finder
Check whether a URL has an RSS or Atom feed
Author: By Manel Zaera
Last change: Solve wrong absolute URL construction in getAbsoluteUrl
Date: 14 years ago
Size: 11,308 bytes
 

Contents

Class file image Download
<?php
/*
 * FeedFinder
 * Author: Manel Zaera (manelzaera@gmail.com)
 * Description: Singleton class to find syndication feeds in
 *              a website.
 * Creation date: 2007-11-06
 *
 * Modified by:
 * 2007-11-16 - Manel Zaera (manelzaera@gmail.com) - Reduce number of URL fetches
 * 2007-11-21 - Manel Zaera (manelzaera@gmail.com) - Return array of feeds indicating the type of each feed
 * 2007-11-24 - Manel Zaera (manelzaera@gmail.com) - Add feed title to returned array
 * 2007-12-03 - Manel Zaera (manelzaera@gmail.com) - Solve parse errors, avoid warning messages on HTML loading
 * 2007-12-28 - Manel Zaera (manelzaera@gmail.com) - Recognize partial feed URLs from HTML document header and get feeds from OPML file
 * 2008-01-17 - Manel Zaera (manelzaera@gmail.com) - Fix wrong absolute URL construction in getAbsoluteUrl
 *
 * This work is published under the GPL license (http://www.gnu.org/copyleft/gpl.html)
 *
 */

/**
 * Finds syndication feeds (RSS 1.0, RSS 2.0, Atom, OPML lists) behind a URL.
 *
 * The class is a singleton: obtain it through FeedFinder::getInstance().
 * Every public method is best-effort: network or parse failures are reported
 * as "no feed" (false, FEED_TYPE_NONE or an empty array), not as exceptions.
 */
class FeedFinder
{
    private static $sInstance = null;

    const MIME_RSS = 'application/rss+xml';
    const MIME_ATOM = 'application/atom+xml';
    const XMLNS_RSS1 = 'http://purl.org/rss/1.0/';
    const XMLNS_ATOM1 = 'http://www.w3.org/2005/Atom';
    const XMLNS_ATOM2 = 'http://purl.org/atom/ns#';

    // Feed array element constants
    const FEED_FIELD_TYPE = 'type';
    const FEED_FIELD_URL = 'url';
    const FEED_FIELD_TITLE = 'title';

    // Feed type constants
    const FEED_TYPE_NONE = 0;
    const FEED_TYPE_RSS1 = 1;
    const FEED_TYPE_RSS2 = 2;
    const FEED_TYPE_ATOM = 3;
    const FEED_TYPE_OPML = 4;

    /**
     * Private constructor: instances are only created by getInstance().
     */
    private function __construct()
    {
    }

    /**
     * Gets the unique class instance
     *
     * @return FeedFinder The shared instance
     */
    public static function getInstance()
    {
        if (self::$sInstance === null) {
            self::$sInstance = new self();
        }
        return self::$sInstance;
    }

    /**
     * Get the feeds discovered from a URL
     *
     * @param $aUrl URL that can be a feed, an OPML feed list, or an HTML
     *              document that contains one or more feed references
     *
     * @return Array of found feeds; empty when nothing is found. Each element
     *         is an array with 'url', 'type' and 'title' entries.
     */
    public function getFeeds($aUrl)
    {
        $aaFeeds = array();
        $aDoc = null;
        try {
            $aDoc = $this->prepareXmlReader($aUrl);
            $aType = $this->typeOfDoc($aDoc);
            if ($aType == self::FEED_TYPE_OPML) {
                // The reader must still be at the root element here, so the
                // title is deliberately NOT read for OPML documents.
                $aaFeeds = $this->getFeedsFromOpml($aDoc);
            } elseif ($aType != self::FEED_TYPE_NONE) {
                $aaFeeds[] = array(
                    self::FEED_FIELD_TYPE => $aType,
                    self::FEED_FIELD_URL => $aUrl,
                    self::FEED_FIELD_TITLE => $this->getFeedTitle($aDoc)
                );
            } else {
                // Not a feed URL -> find feeds in document
                $aaFeeds = $this->discoverFeeds($aUrl);
            }
        } catch (Exception $aEx) {
            // Best effort: an unreadable URL simply yields no feeds
        }
        $this->closeReader($aDoc);
        return $aaFeeds;
    }

    /**
     * Check if a URL is a feed URL (RSS 1.0, RSS 2.0 or Atom)
     *
     * @param $aUrl URL to analyze
     *
     * @return Boolean
     */
    public function isFeed($aUrl)
    {
        return $this->checkDocument($aUrl, 'isFeedDoc');
    }

    /**
     * Check if a URL is RSS 1.0 or RSS 2.0 feed
     *
     * @param $aUrl URL to analyze
     *
     * @return Boolean
     */
    public function isRss($aUrl)
    {
        return $this->checkDocument($aUrl, 'isRssDoc');
    }

    /**
     * Check if a URL is a RSS 1.0 feed
     *
     * @param $aUrl URL to analyze
     *
     * @return Boolean
     */
    public function isRss1($aUrl)
    {
        return $this->checkDocument($aUrl, 'isRss1Doc');
    }

    /**
     * Check if a URL is a RSS 2.0 feed
     *
     * @param $aUrl URL to analyze
     *
     * @return Boolean
     */
    public function isRss2($aUrl)
    {
        return $this->checkDocument($aUrl, 'isRss2Doc');
    }

    /**
     * Check if a URL is an Atom feed
     *
     * @param $aUrl URL to analyze
     *
     * @return Boolean
     */
    public function isAtom($aUrl)
    {
        return $this->checkDocument($aUrl, 'isAtomDoc');
    }

    /**
     * Check if a URL is an OPML feed list
     *
     * @param $aUrl URL to analyze
     *
     * @return Boolean
     */
    public function isOpml($aUrl)
    {
        return $this->checkDocument($aUrl, 'isOpmlDoc');
    }

    /**
     * Get the type of feed for a Url
     *
     * @param $aUrl URL to check
     *
     * @return Number One of the FEED_TYPE_* constants of this class
     */
    public function typeOf($aUrl)
    {
        $aType = self::FEED_TYPE_NONE;
        $aDoc = null;
        try {
            $aDoc = $this->prepareXmlReader($aUrl);
            $aType = $this->typeOfDoc($aDoc);
        } catch (Exception $aEx) {
            $aType = self::FEED_TYPE_NONE;
        }
        $this->closeReader($aDoc);
        return $aType;
    }

    /**
     * Open a URL, run one of the private is*Doc() checks on it and close it.
     *
     * @param $aUrl      URL to analyze
     * @param $aDocCheck Name of the private method of this class that
     *                   receives the positioned XMLReader and returns a boolean
     *
     * @return Boolean Result of the check; false if the URL cannot be read
     */
    private function checkDocument($aUrl, $aDocCheck)
    {
        $zResult = false;
        $aDoc = null;
        try {
            $aDoc = $this->prepareXmlReader($aUrl);
            $zResult = (bool) $this->$aDocCheck($aDoc);
        } catch (Exception $aEx) {
            $zResult = false;
        }
        // Closed here (not inside the try) so a failure never leaks the reader
        $this->closeReader($aDoc);
        return $zResult;
    }

    /**
     * Close a reader if one was actually created.
     *
     * @param $aDoc XMLReader instance or null
     */
    private function closeReader($aDoc)
    {
        if ($aDoc instanceof XMLReader) {
            @$aDoc->close();
        }
    }

    /**
     * Look for feeds within an HTML document
     *
     * Scans the <link> elements for rel="alternate" entries whose type is the
     * RSS or Atom MIME type, then fetches each referenced URL to confirm that
     * it really is a feed.
     *
     * @param $aUrl URL of document
     *
     * @return Array of feeds ('url', 'type' and 'title' entries)
     */
    private function discoverFeeds($aUrl)
    {
        $aFeeds = array();
        if (trim((string) $aUrl) === '') {
            return $aFeeds;
        }
        try {
            $aDocument = new DOMDocument();
            // Avoid HTML load warnings
            @$aDocument->loadHTMLFile($aUrl);
            $aElements = $aDocument->getElementsByTagName('link');
            foreach ($aElements as $aElement) {
                // rel is a space separated, case-insensitive token list
                $aRelTokens = preg_split(
                    '/\s+/',
                    strtolower($aElement->getAttribute('rel')),
                    -1,
                    PREG_SPLIT_NO_EMPTY
                );
                if (!in_array('alternate', $aRelTokens, true)) {
                    continue;
                }
                $aAttrType = strtolower(trim($aElement->getAttribute('type')));
                if ($aAttrType != self::MIME_RSS && $aAttrType != self::MIME_ATOM) {
                    continue;
                }
                $aRawHref = trim($aElement->getAttribute('href'));
                if ($aRawHref === '') {
                    continue;
                }

                $aDoc = null;
                try {
                    $aHref = $this->getAbsoluteUrl($aRawHref, $aUrl);
                    $aDoc = $this->prepareXmlReader($aHref);
                    $aType = $this->typeOfDoc($aDoc);
                    if ($aType != self::FEED_TYPE_NONE) {
                        $aFeeds[] = array(
                            self::FEED_FIELD_TYPE => $aType,
                            self::FEED_FIELD_URL => $aHref,
                            self::FEED_FIELD_TITLE => $this->getFeedTitle($aDoc)
                        );
                    }
                } catch (Exception $aEx) {
                    // Unreadable link target: skip it, keep scanning the rest
                }
                $this->closeReader($aDoc);
            }
        } catch (Exception $aEx) {
            // Document could not be analyzed: return what was found so far
        }
        return $aFeeds;
    }

    /**
     * Get the type of a loaded document
     *
     * @param $aDoc Loaded XML document, positioned at first element
     *
     * @return Number One of the FEED_TYPE_* constants of this class
     */
    private function typeOfDoc($aDoc)
    {
        if ($this->isRss1Doc($aDoc)) {
            return self::FEED_TYPE_RSS1;
        }
        if ($this->isRss2Doc($aDoc)) {
            return self::FEED_TYPE_RSS2;
        }
        if ($this->isAtomDoc($aDoc)) {
            return self::FEED_TYPE_ATOM;
        }
        if ($this->isOpmlDoc($aDoc)) {
            return self::FEED_TYPE_OPML;
        }
        return self::FEED_TYPE_NONE;
    }

    /**
     * Check if a loaded document is an RSS 1.0, RSS 2.0 or Atom feed
     *
     * @param $aDoc Loaded XML document, positioned at first element
     */
    private function isFeedDoc($aDoc)
    {
        return ($this->isRssDoc($aDoc) || $this->isAtomDoc($aDoc));
    }

    /**
     * Check if a loaded document is RSS 1.0 or RSS 2.0 feed
     *
     * @param $aDoc Loaded XML document, positioned at first element
     */
    private function isRssDoc($aDoc)
    {
        return ($this->isRss2Doc($aDoc) || $this->isRss1Doc($aDoc));
    }

    /**
     * Check if a loaded document is a RSS 2.0 feed
     *
     * @param $aDoc Loaded XML document, positioned at first element
     */
    private function isRss2Doc($aDoc)
    {
        return ($aDoc->name == 'rss');
    }

    /**
     * Check if a loaded document is a RSS 1.0 feed
     *
     * @param $aDoc Loaded XML document, positioned at first element
     */
    private function isRss1Doc($aDoc)
    {
        return ($aDoc->name == 'rdf:RDF')
            && ($aDoc->getAttribute('xmlns') == self::XMLNS_RSS1);
    }

    /**
     * Check if a loaded document is an Atom feed
     *
     * @param $aDoc Loaded XML document, positioned at first element
     */
    private function isAtomDoc($aDoc)
    {
        return ($aDoc->name == 'feed'
            && ($aDoc->namespaceURI == self::XMLNS_ATOM1
                || $aDoc->namespaceURI == self::XMLNS_ATOM2));
    }

    /**
     * Check if a loaded document is an OPML feed list
     *
     * @param $aDoc Loaded XML document, positioned at first element
     */
    private function isOpmlDoc($aDoc)
    {
        return ($aDoc->name == 'opml');
    }

    /**
     * Get an XMLReader object from a URL and positions it at the first element
     * of the document
     *
     * @param $aUrl URL to get the XMLReader object from
     *
     * @return XMLReader Reader positioned at the first element (or at the end
     *                   of input when the content is not parseable XML)
     *
     * @throws Exception When the URL is empty or cannot be opened. Failing
     *                   here keeps the callers' catch (Exception) blocks in
     *                   control instead of reading from an unopened reader.
     */
    private function prepareXmlReader($aUrl)
    {
        if (trim((string) $aUrl) === '') {
            throw new InvalidArgumentException('Empty URL');
        }
        $aDoc = new XMLReader();
        if (!$aDoc->open($aUrl)) {
            throw new RuntimeException('Unable to open ' . $aUrl);
        }
        $this->readNextXmlElement($aDoc);
        return $aDoc;
    }

    /**
     * Get the feed title from a loaded XML document
     *
     * Starting at the current node, moves forward to the first element named
     * 'title' and returns its text content. Plain text and CDATA titles are
     * both supported.
     *
     * @param $aDoc XML document
     *
     * @return String Title text, or '' when no title element is found
     */
    private function getFeedTitle($aDoc)
    {
        try {
            do {
                if ($aDoc->nodeType == XMLReader::ELEMENT
                    && $aDoc->name == self::FEED_FIELD_TITLE) {
                    // readString() collects text AND CDATA children, so a
                    // <title><![CDATA[...]]></title> is not skipped.
                    return (string) @$aDoc->readString();
                }
            } while ($this->readNextXmlElement($aDoc, XMLReader::ELEMENT));
        } catch (Exception $aEx) {
            // Treated as "no title"
        }
        return '';
    }

    /**
     * Advance the reader to the next node of the requested type
     *
     * @param $aDoc      XML document
     * @param $aNodeType XMLReader node type constant to stop at
     *
     * @return Boolean false when the end of input (or a parse error) was
     *                 reached before such a node was found
     */
    private function readNextXmlElement($aDoc, $aNodeType = XMLReader::ELEMENT)
    {
        // Avoid reading nodes like XML Stylesheet, etc.
        do {
            // Parse warnings are silenced on purpose: non-XML input is expected
            $zReadOk = @$aDoc->read();
        } while ($zReadOk && $aDoc->nodeType != $aNodeType);
        return $zReadOk;
    }

    /**
     * Get the absolute URL for a feed
     *
     * Handles absolute hrefs (returned unchanged), protocol-relative hrefs
     * ('//host/path'), root-relative hrefs ('/path'), query-only hrefs
     * ('?a=b') and relative paths. Relative paths are appended to the base
     * URL path treated as a directory (no RFC 3986 "remove last segment"
     * step and no dot-segment resolution), as this class has always done.
     *
     * @param $aHref Feed URL (partial or absolute)
     * @param $aUrl  Url to use as base URL if $aHref is not an absolute one
     *
     * @return String Absolute URL, or $aHref unchanged if it cannot be resolved
     */
    private function getAbsoluteUrl($aHref, $aUrl)
    {
        $aHref = trim((string) $aHref);
        $aHrefParts = parse_url($aHref);
        if (!is_array($aHrefParts)) {
            return $aHref;
        }
        // An absolute URL has some value in 'scheme' part
        if (isset($aHrefParts['scheme']) && $aHrefParts['scheme'] != '') {
            return $aHref;
        }

        $aUrlParts = parse_url((string) $aUrl);
        if (!is_array($aUrlParts)
            || !isset($aUrlParts['scheme'])
            || !isset($aUrlParts['host'])) {
            return $aHref;
        }

        // Protocol-relative href: only the scheme comes from the base URL
        if (isset($aHrefParts['host'])) {
            return $aUrlParts['scheme'] . ':' . $aHref;
        }

        $aAuthority = $aUrlParts['scheme'] . '://' . $aUrlParts['host'];
        if (isset($aUrlParts['port'])) {
            $aAuthority .= ':' . $aUrlParts['port'];
        }

        $aPath = isset($aHrefParts['path']) ? $aHrefParts['path'] : '';
        $aQuery = isset($aHrefParts['query']) ? $aHrefParts['query'] : '';
        $aBasePath = isset($aUrlParts['path']) ? $aUrlParts['path'] : '';

        if ($aPath != '' && $aPath[0] == '/') {
            // Root-relative: replaces the whole base path
            $aAbsUrl = $aAuthority . $aPath;
        } elseif ($aPath == '') {
            // Query-only (or empty) href: keep the base path as it is
            $aAbsUrl = $aAuthority . ($aBasePath != '' ? $aBasePath : '/');
        } else {
            $aAbsUrl = $aAuthority . $aBasePath;
            if (substr($aAbsUrl, -1) != '/') {
                $aAbsUrl .= '/';
            }
            $aAbsUrl .= $aPath;
        }

        if ($aQuery != '') {
            $aAbsUrl .= '?' . $aQuery;
        }
        return $aAbsUrl;
    }

    /**
     * Get feeds from a OPML feed list
     *
     * @param $aDoc XML document positioned at (or before) the OPML body
     *
     * @return Array of feeds ('url', 'type' and 'title' entries)
     */
    private function getFeedsFromOpml($aDoc)
    {
        if (!$this->goToOpmlBody($aDoc)) {
            return array();
        }
        return $this->getOpmlBodyFeeds($aDoc);
    }

    /**
     * Seek the XML document to OPML body element
     *
     * @param $aDoc XML document
     *
     * @return Boolean true if the reader is now on the 'body' element
     */
    private function goToOpmlBody($aDoc)
    {
        try {
            do {
                if ($aDoc->name == 'body') {
                    return true;
                }
            } while ($this->readNextXmlElement($aDoc, XMLReader::ELEMENT));
        } catch (Exception $aEx) {
            // Treated as "body not found"
        }
        return false;
    }

    /**
     * Get the feeds in OPML document body
     *
     * Every 'outline' element with a non-empty xmlUrl attribute becomes one
     * entry. The feed type is determined by fetching the xmlUrl, so this
     * performs one request per listed feed; unreachable feeds are kept with
     * type FEED_TYPE_NONE.
     *
     * @param $aDoc XML document positioned at the OPML body element
     *
     * @return Array of feeds ('url', 'type' and 'title' entries)
     */
    private function getOpmlBodyFeeds($aDoc)
    {
        $aaFeeds = array();
        try {
            while ($this->readNextXmlElement($aDoc, XMLReader::ELEMENT)) {
                if ($aDoc->name != 'outline') {
                    continue;
                }
                $aXmlUrl = $aDoc->getAttribute('xmlUrl');
                if ($aXmlUrl == '') {
                    continue;
                }
                // 'title' is optional in OPML outlines; 'text' is the usual label
                $aTitle = $aDoc->getAttribute('title');
                if ($aTitle == '') {
                    $aTitle = (string) $aDoc->getAttribute('text');
                }
                // typeOf() handles its own failures and yields FEED_TYPE_NONE
                $aFeedType = @$this->typeOf($aXmlUrl);
                $aaFeeds[] = array(
                    self::FEED_FIELD_TYPE => $aFeedType,
                    self::FEED_FIELD_URL => $aXmlUrl,
                    self::FEED_FIELD_TITLE => $aTitle
                );
            }
        } catch (Exception $aEx) {
            // Return the feeds collected so far
        }
        return $aaFeeds;
    }
}