#native_company# #native_desc#
#native_cta#

Extract Links

By Winston Huang
on October 31, 2003

Version: 0.3

Type: Function

Category: HTML

License: GNU Library General Public License (LGPL)

Description: Get all links from any web page just by specifying the URL.

<?php
/* 	#################################################################
	// VERSION 0.3

	RETRIEVES LINKS FROM A URL
	1) Include this file.
	2) $ex = new Extractor('http://www.somesite.com'); $ex->ExtractLinks(''); $links = $ex->links;
	3) You now have an array of all unique links at the URL specified.
	4) You can now filter the links you want returned in the second parameter.
    5) Added error checking if the page happened to have no links, in which case
       the array value doesn't exist.
    6) Added case sensitive filtering option. Defaults to CaSe SeNsItIvE = true.
    7) Rebuilt into a class architecture. GetLinks is now ExtractLinks. GetInput
       is now ConvertToHTML. Added DisplayHTML.

	NOTES
	// Ignores javascript, links with email addresses, etc.
	// Filters are case sensitive and must use web escape characters to match
	   unless specified by boolean parameter false. i.e.  & = &amp;
	
	COMPATIBILITY
	PHP 4.0.0+
		
	MODIFIED BY												ACTION
	Winston Huang, Netdiversity Inc.						(4) - (7)
	[email protected]

	LICENSE: LGPL -  If you modify this work, you must recontribute at
	https://phpbuilder.com/snippet/submit.php. 
 	#################################################################
*/
/**
 * Fetches a web page and collects the unique hyperlinks found in it.
 *
 * Usage:
 *   $ex = new Extractor('http://www.somesite.com');
 *   $ex->ExtractLinks('');
 *   // $ex->links now holds the absolute URLs found on the page.
 */
class Extractor
{
	/** Host part of the URL last accepted by ConvertToHTML() ('' if none). */
	var $domain;
	/** Scheme ('http' or 'https') used to absolutise relative links. */
	var $scheme = 'http';
	/** Raw page content that ExtractLinks() scans. */
	var $html;
	/** Links collected so far; accumulates across ExtractLinks() calls. */
	var $links;

	/**
	 * Initialises the state and, when a URL is given, downloads it.
	 *
	 * @param string $url Page to fetch; '' creates an empty extractor whose
	 *                    $html can be assigned by the caller.
	 */
	function __construct($url = '')
	{
		$this->domain = '';
		$this->scheme = 'http';
		$this->html = '';
		$this->links = array();
		if ((string) $url !== '')
			$this->ConvertToHTML($url);
	}

	/**
	 * PHP 4 style constructor kept so that PHP 4 and any explicit
	 * Extractor()/parent::Extractor() calls keep working. PHP 5+ uses
	 * __construct() and treats this as an ordinary method.
	 */
	function Extractor($url = '')
	{
		$this->__construct($url);
	}

	/**
	 * Downloads $url into $this->html (at most 500000 bytes) and records the
	 * scheme and host it was fetched from.
	 *
	 * @param string $url An http:// or https:// URL.
	 * @return bool true on success; false when the URL has no recognisable
	 *              host or cannot be opened.
	 */
	function ConvertToHTML($url)
	{
		// Host ends at the first '/', '?' or '#'; a port, if any, stays part of it.
		if (!preg_match('_(https?)://([^/?#]+)_i', (string) $url, $parts))
		{
			$this->domain = '';
			return false;
		}
		$this->scheme = strtolower($parts[1]);
		$this->domain = $parts[2];

		$fp = fopen($url, 'r');
		if (!$fp)
			return false;

		// A single fread() on a network stream returns as soon as one packet
		// is available, so keep reading until EOF or the size cap is reached.
		$limit = 500000;
		$buffer = '';
		while (!feof($fp) && strlen($buffer) < $limit)
		{
			$chunk = fread($fp, $limit - strlen($buffer));
			if ($chunk === false || $chunk === '')
				break; // read error or timeout: keep what was received
			$buffer .= $chunk;
		}
		fclose($fp);
		$this->html = $buffer;
		return true;
	}

	/**
	 * Echoes the stored page content.
	 *
	 * @return bool true if something was printed, false when $html is empty.
	 */
	function DisplayHTML()
	{
		if ((string) $this->html === '')
			return false;
		echo $this->html;
		return true;
	}

	/**
	 * Scans $this->html for <a ... href=...> targets and appends them, made
	 * absolute and de-duplicated, to $this->links.
	 *
	 * The filter is applied to the raw href value, before it is made absolute.
	 * Links containing 'javascript:' are skipped. When no link has been
	 * collected at all, $this->links becomes array('No Data').
	 *
	 * @param string $filter    Substring a link must contain; '' keeps all.
	 * @param bool   $sensitive Whether the filter match is case sensitive.
	 * @return bool Always true.
	 */
	function ExtractLinks($filter, $sensitive = true)
	{
		// Anchor tag, then an href whose value is made of URL-safe characters,
		// optionally quoted, terminated by a space, quote or '>'.
		$lookfor = '/<a\s.*?href=[ \t"\']*([-.,%_()|=~;+:?&\/a-zA-Z0-9]+)[ "\'>]/i';
		$found = array();
		if (preg_match_all($lookfor, (string) $this->html, $data))
			$found = $data[1];

		$filter = (string) $filter;
		$needle = $sensitive ? $filter : strtolower($filter);

		foreach ($found as $v)
		{
			if ($filter !== '')
			{
				$haystack = $sensitive ? $v : strtolower($v);
				if (strpos($haystack, $needle) === false)
					continue;
			}
			if (stristr($v, 'javascript:') !== false)
				continue; // ignore - contains javascript

			if (strncmp($v, '//', 2) === 0)
			{
				// protocol-relative link: borrow the page's scheme
				$v = $this->scheme . ':' . $v;
			}
			elseif (!preg_match('_^https?://_i', $v))
			{
				// relative link: resolve against the page's host
				$sep = (strncmp($v, '/', 1) === 0) ? '' : '/';
				$v = $this->scheme . '://' . $this->domain . $sep . $v;
			}
			$this->links[] = $v;
		}

		if (count($this->links))
			$this->links = array_values(array_unique($this->links));
		else
			$this->links[] = 'No Data';
		return true;
	}
}
?>