Version: 0.3
Type: Function
Category: HTML
License: GNU Library Public License
Description: Get all links from any web page just by specifying the URL.
<?php /* #################################################################
// VERSION 0.3
RETRIEVES LINKS FROM A URL
1) Include this file.
2) $links=GetLinks('http://www.somesite.com', '');
3) You now have an array of all unique links at the URL specified.
4) You can now filter the links you want returned in the second parameter.
5) Added error checking if the page happened to have no links, in which case the array value doesn't exist.
6) Added case sensitive filtering option. Defaults to CaSe SeNsItIvE = true.
7) Rebuilt into a class architecture. GetLinks is now ExtractLinks. GetInput is now ConvertToHTML. Added DisplayHTML.
NOTES
// Ignores javascript, links with email addresses, etc.
// Filters are case sensitive and must use web escape characters to match unless specified by boolean parameter false. i.e. & = &amp;
COMPATIBILITY
PHP 4.0.0+
MODIFIED BY ACTION
Winston Huang, Netdiversity Inc. (4) - (7)
[email protected]
LICENSE: LGPL - If you modify this work, you must recontribute at https://phpbuilder.com/snippet/submit.php.
################################################################# */

/**
 * Fetches an HTML page over HTTP(S) and extracts the hyperlinks it contains.
 *
 * Typical use:
 *   $extractor = new Extractor('http://www.somesite.com/');
 *   $extractor->ExtractLinks('');
 *   // $extractor->links now holds the unique, absolute links that were found.
 *
 * Runs on PHP 5.3 through PHP 8.x: it uses __construct() and foreach instead
 * of the PHP 4 style constructor and each(), both of which no longer work on
 * PHP 8.
 */
class Extractor
{
    /** @var string Host part of the last URL accepted by ConvertToHTML(), or '' if none. */
    public $domain = '';

    /** @var string Scheme ('http' or 'https') of that URL; used to absolutise relative links. */
    public $scheme = 'http';

    /** @var string Raw page source that ExtractLinks() scans and DisplayHTML() prints. */
    public $html = '';

    /** @var array List of links collected by ExtractLinks(). */
    public $links = array();

    /**
     * @param string $url Optional page to fetch immediately. When empty, nothing is
     *                    fetched and $html stays '' (it can be assigned directly).
     */
    public function __construct($url = '')
    {
        $url = (string) $url;
        if ($url !== '') {
            $this->ConvertToHTML($url);
        }
    }

    /**
     * Downloads the page at $url into $this->html and records its scheme and host.
     *
     * @param string $url      Absolute URL; must start with http:// or https://.
     * @param int    $maxBytes Upper bound on the number of bytes read from the page.
     *
     * @return bool true when the page was read, false when the URL is not an
     *              http(s) URL or the stream could not be opened or read.
     */
    public function ConvertToHTML($url, $maxBytes = 500000)
    {
        $url = (string) $url;

        // Anchored so that only real http(s) URLs reach fopen(); the host is
        // everything up to the first '/', '?' or '#'.
        if (preg_match('~^(https?)://([^/?#]+)~i', $url, $parts) !== 1) {
            $this->domain = '';
            return false;
        }
        $this->scheme = strtolower($parts[1]);
        $this->domain = $parts[2];

        $fp = fopen($url, 'r');
        if ($fp === false) {
            return false;
        }

        // A single fread() on a network stream returns as soon as one packet
        // is available, which truncates the page; stream_get_contents() keeps
        // reading until EOF or until $maxBytes bytes have been read.
        $body = stream_get_contents($fp, (int) $maxBytes);
        fclose($fp);

        if ($body === false) {
            return false;
        }

        $this->html = $body;
        return true;
    }

    /**
     * Echoes the stored page source.
     *
     * @return bool true if something was printed, false if $html is empty.
     */
    public function DisplayHTML()
    {
        if (strlen((string) $this->html) === 0) {
            return false;
        }

        echo $this->html;
        return true;
    }

    /**
     * Scans $this->html for <a ... href=...> targets and appends them to $this->links.
     *
     * The filter is applied to the href text exactly as it appears in the page
     * source (before it is made absolute, and without decoding HTML entities).
     * Links containing "javascript:" are skipped. Relative links are resolved
     * against the site root ($scheme://$domain/), not against the directory of
     * the fetched page. After the scan $links is de-duplicated; if it is still
     * empty, the single placeholder entry 'No Data' is stored instead.
     * Results accumulate across calls because $links is never reset here.
     *
     * @param string $filter    Substring a link must contain to be kept; '' keeps everything.
     * @param bool   $sensitive Whether the filter comparison is case sensitive.
     *
     * @return bool Always true.
     */
    public function ExtractLinks($filter, $sensitive = true)
    {
        $filter = (string) $filter;

        // '#' delimiters avoid having to escape the '/' inside the character
        // class; [^>]*? keeps the match inside a single <a ...> tag.
        $pattern = '#<a\s[^>]*?href=[ "\']*([-.,%_()|=~;+:?&/a-zA-Z0-9]+)[ "\'>]#i';
        $found = preg_match_all($pattern, (string) $this->html, $matches);
        $candidates = $found ? $matches[1] : array();

        foreach ($candidates as $link) {
            if ($filter !== '') {
                $hit = $sensitive ? strpos($link, $filter) : stripos($link, $filter);
                if ($hit === false) {
                    continue;
                }
            }

            $absolute = $this->absoluteLink($link);
            if ($absolute !== null) {
                $this->links[] = $absolute;
            }
        }

        if (count($this->links)) {
            $this->links = array_values(array_unique($this->links));
        } else {
            $this->links[] = 'No Data';
        }

        return true;
    }

    /**
     * Turns one raw href value into an absolute URL, or null if it must be ignored.
     *
     * @param string $link Non-empty href text captured by ExtractLinks().
     *
     * @return string|null
     */
    private function absoluteLink($link)
    {
        if (stripos($link, 'javascript:') !== false) {
            return null;
        }

        // Protocol-relative link: borrow the scheme of the fetched page.
        if (strncmp($link, '//', 2) === 0) {
            return $this->scheme . ':' . $link;
        }

        // Already carries a URI scheme (http:, https:, ftp:, ...): keep untouched.
        if (preg_match('~^[a-z][a-z0-9+.-]*:~i', $link) === 1) {
            return $link;
        }

        $separator = (strncmp($link, '/', 1) === 0) ? '' : '/';

        return $this->scheme . '://' . $this->domain . $separator . $link;
    }
}
?>