Version: 1.0 (stable)
Type: Class
Category: File Management
License: GNU General Public License
Description: Page Indexer is a class for indexing all words of a web document into a MySQL database.
It is very easy to use and VERY powerful! It will only return actual words and throw away everything else that isn’t a valid word.
The class works with both PHP4 and PHP5!
Any comments, bugs, suggestions etc can be sent to my e-mail (which is found in the class).
I would really appreciate it if you sent me an e-mail when you use the class, telling me why you use it and what you think of it.
Enjoy! It’s a fine release =)
<?php
/**
 * @name:    class PageIndexer
 * @file:    classPageIndexer.php
 * @author:  Niklas Forsberg
 * @email:   [email protected]
 * @version: 1.0
 *
 *
 * @description:
 *  Class for indexing words from a specified web document and
 *  inserting them into a specified MySQL database.
 *
 *
 * @origin:
 *  This class was originally written by Niklas Forsberg as a
 *  test project for deliverance to SailSoft.
 *
 *
 * @usage:
 *  To index a web document, simply include this class in your
 *  PHP document by using require_once("classPageIndexer.php").
 *
 *  In your PHP document, include this line and edit the options
 *  to suit your needs:
 *
 *  new PageIndexer(str URL, str MySQL-host:str MySQL-port, str username, str password, str DB-name, str silent mode)
 *
 *  For example, if you want to put all words (that are recognized by this class)
 *  from the Google start page into your MySQL database named "webindexer", then use:
 *
 *  $IndexPage = new PageIndexer("http://www.google.com", "localhost:3306", "myuser", "mypass", "webindexer", 0);
 *  (Where myuser and mypass is the login to get access to the database "webindexer" and 0 is for displaying
 *  all error messages and notifications that may occur (read below for more information about the silent mode)).
 *
 *  Silent mode is a way to put all errors and announcements, which the class may produce, aside; which means that
 *  no errors or other messages will be printed out by the class. Simply use either 1 for running the class in
 *  silent mode or 0 for running in normal mode (which means that the class will print out any errors and
 *  announcements).
 *
 *  Directives: All directives except the port number are needed!
 *              If you don't specify a port number, the class will use the default
 *              port for MySQL which is "3306".
 *              If you don't want to use any port number, simply remove the :nr-clause like:
 *              ...google.com", "localhost", "myuser...
 *
 *
 * @support:
 *  PHP: The class supports PHP4 and later (as well as PHP5).
 *       NOTE: it uses the mysql_* functions (ext/mysql); on runtimes where that
 *       extension is not available a compatible replacement has to be provided.
 *  OS:  Independent
 *  SQL: MySQL Server (at least v3.04)
 *       (though it can work with older releases as well, try it out!)
 *
 *  You also need a MySQL database with the following table structure:
 *
 *  CREATE TABLE `tblord` (
 *    `iKeyOrd` int(11) unsigned NOT NULL auto_increment,
 *    `sOrd` varchar(50) NOT NULL default '',
 *    PRIMARY KEY  (`iKeyOrd`)
 *  ) ENGINE=MyISAM DEFAULT CHARSET=latin1;
 *
 *  CREATE TABLE `tblsida` (
 *    `iKeySida` int(11) unsigned NOT NULL auto_increment,
 *    `sWebsida` varchar(255) default NULL,
 *    PRIMARY KEY  (`iKeySida`)
 *  ) ENGINE=MyISAM DEFAULT CHARSET=latin1;
 *
 *  CREATE TABLE `tblsida_ord` (
 *    `iKeySida` int(11) NOT NULL default '0',
 *    `iKeyOrd` int(11) NOT NULL default '0'
 *  ) ENGINE=MyISAM DEFAULT CHARSET=latin1;
 *
 */

/**
 * License:
 *
 * Copyright (C) 2005 Niklas Forsberg
 *
 * This file is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This file is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this file; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

// ****
// **** class PageIndexer
// ****

/**
 * Class for handling methods for indexing web documents.
 *
 * Constructing an instance immediately connects to the database, fetches the
 * page, extracts its words and stores them; there is no separate "run" call.
 */
class PageIndexer
{
    /** Full URL of the web document that should be indexed
     *  (like: http://www.mysite.com/news.html). */
    var $pageURL;

    /** 1 to suppress all messages printed by the class, anything else to print them. */
    var $silentMode;

    /** ID number (tblsida.iKeySida) of the document in the database, once known. */
    var $pageID;

    /** Hostname of the database server. */
    var $dbHost;

    /** Connection resource returned by mysql_connect(). */
    var $dbOpen;

    /** Port number (or socket path) used for the database connection; defaults to '3306'. */
    var $dbPort;

    /** Name of the database where the index is stored. */
    var $dbName;

    /** Username used to connect to the database. */
    var $dbUser;

    /** Password for the user in $dbUser. */
    var $dbPass;

    /** Unformatted contents of the document at $pageURL. */
    var $pageContents;

    /** Plain text extracted from the body of the document at $pageURL. */
    var $parsedContents;

    /** Array of unique, normalised words produced by ParsePage(). */
    var $wordsIndex;

    /** constructor __construct(str URL, str dbhost, str dbuser, str dbpass, str dbname, int silent)
     *  Stores the settings, connects to the database and indexes the page.
     *
     *  $db_host may be "host" or "host:port"; when the port part is missing or
     *  empty the documented default '3306' is used.
     *
     *  Declared before the PHP4-style constructor below so that runtimes which
     *  only honour __construct() still run the initialisation.
     */
    function __construct($url, $db_host, $db_user, $db_pass, $db_name, $silent)
    {
        $this->pageURL    = $url;
        $this->silentMode = $silent;
        $this->dbUser     = $db_user;
        $this->dbPass     = $db_pass;
        $this->dbName     = $db_name;

        /** Split "host:port" into its two parts (limit 2 keeps a socket path intact). */
        $host_address = explode(":", $db_host, 2);
        $this->dbHost = $host_address[0];

        /** Fall back to the default MySQL port instead of reading a missing array element. */
        if (isset($host_address[1]) && $host_address[1] !== '') {
            $this->dbPort = $host_address[1];
        } else {
            $this->dbPort = '3306';
        }

        /** Connect to the database, then fetch and index the document. */
        $this->DBConnect($this->dbHost, $this->dbPort, $this->dbUser, $this->dbPass, $this->dbName);
        $this->GetPage($this->pageURL);
    }

    /** constructor PageIndexer(str URL, str dbhost, str dbuser, str dbpass, str dbname, int silent)
     *  PHP4-style constructor, kept so the class still initialises on PHP4 and
     *  so existing explicit calls to this method keep working. Delegates to __construct().
     */
    function PageIndexer($url, $db_host, $db_user, $db_pass, $db_name, $silent)
    {
        $this->__construct($url, $db_host, $db_user, $db_pass, $db_name, $silent);
    }

    /** method GiveMsg(str message)
     *  Prints an announcement or error message, unless the class runs in silent mode
     *  (object silentMode equal to 1), in which case nothing is printed.
     */
    function GiveMsg($msg)
    {
        if ($this->silentMode != 1) {
            print $msg;
        }
    }

    /** method DBConnect(str hostname, str portnr, str username, str password, str dbname)
     *  Connects to the MySQL server and selects the database in object dbName.
     *
     *  The parameters are accepted for backward compatibility; as before, the
     *  connection is made with the values stored on the object by the constructor.
     *  On failure a message is sent to GiveMsg() and the script is terminated.
     */
    function DBConnect($db_host, $db_port, $db_user, $db_pass, $db_name)
    {
        $this->dbOpen = @mysql_connect($this->dbHost . ':' . $this->dbPort, $this->dbUser, $this->dbPass);

        /** Check the connection first: selecting a database without a valid link would
         *  make ext/mysql try a default connection instead of reporting the failure. */
        if (!$this->dbOpen || !@mysql_select_db($this->dbName, $this->dbOpen)) {
            $msg  = "Could not connect to database <font color=red>'" . $this->dbName . "'</font>.";
            $msg .= "<br>\n";
            $msg .= "<br>\n";
            $this->GiveMsg($msg);

            /** Kill script and return the MySQL error. */
            die('Reason: ' . ($this->dbOpen ? mysql_error($this->dbOpen) : mysql_error()));
        }
    }

    /** method GetPage(str URL)
     *  Checks that the document in object pageURL can be opened and, if so, hands it
     *  to ParsePage(). Returns false (after sending a message) when it cannot be opened.
     *
     *  The $url parameter is accepted for backward compatibility; object pageURL is used.
     */
    function GetPage($url)
    {
        $handle = @fopen($this->pageURL, 'r');

        if (!$handle) {
            $this->GiveMsg("The specified URL ('<font color=red>" . htmlspecialchars($this->pageURL, ENT_QUOTES) . "</font>') does not exist!");
            return false;
        }

        /** The handle was only needed as an existence probe; release it right away. */
        fclose($handle);

        $this->ParsePage($this->pageURL);
    }

    /** method CheckWord(str word)
     *  Returns true when the string is an acceptable word, i.e. consists only of
     *  word characters and - ' / . @ : ;  Examples of valid words:
     *    foo
     *    foo-bar
     *    0123
     *    foo:bar
     */
    function CheckWord($word)
    {
        return (preg_match('~^[-\'/.@:;\w]+$~', $word) === 1);
    }

    /** method ParseWord(str word)
     *  Normalises a raw token: lower-cases it, strips leading and trailing
     *  non-word characters (ending punctuation etc.) and removes any character
     *  that is not allowed inside a word.
     */
    function ParseWord($word)
    {
        $word = preg_replace('~^\W+~', '', strtolower($word));
        $word = preg_replace('~\W+$~', '', $word);
        $word = preg_replace('~[^-\'/.@:;\w]+~', '', $word);
        return $word;
    }

    /** method ExtractText(str html)
     *  Returns the visible text of the body element of an HTML document.
     *  Returns an empty string when no body element is found.
     */
    function ExtractText($html)
    {
        /** Keep only what is inside the body element (captured in group 2). */
        $body_array = array();
        preg_match("@ <body([^>]*)> ( (?> [^<]* ) (?> (?! </?body> ) < [^<]*)* ) </body>@imsx", $html, $body_array);
        $body = isset($body_array[2]) ? $body_array[2] : '';

        /** Remove script elements including their contents, since strip_tags()
         *  would otherwise leave the script source behind as text. */
        $body = preg_replace("@ <script([^>]*)> ( (?> [^<]* ) (?> (?! </?script> ) < [^<]*)*) </script> @imsx", "", $body);

        /** Put a blank between words that are only separated by tags (like word1<br>word2).
         *  The look-ahead leaves the second word unconsumed so that chains such as
         *  a<br>b<br>c are split at every tag. */
        $body = preg_replace('/(\w)(?:<[^>]+>)+(?=\w)/', '$1 ', $body);

        /** Strip the remaining tags and decode HTML entities. */
        $body = html_entity_decode(strip_tags($body));

        /** Drop any named entities that html_entity_decode() left behind. */
        $body = preg_replace('@&\w+;@', '', $body);

        /** Separate words that are joined by a slash between two letters (foo/bar). */
        $body = preg_replace('@([a-zA-Z])/(?=[a-zA-Z])@', '$1 ', $body);

        return $body;
    }

    /** method SplitWords(str text)
     *  Splits plain text on whitespace, normalises every token with ParseWord(),
     *  drops tokens rejected by CheckWord() and removes duplicates.
     *  Array keys are not renumbered.
     */
    function SplitWords($text)
    {
        $tokens = preg_split('/\s+/ms', $text);
        $words  = array_filter(array_map(array($this, 'ParseWord'), $tokens), array($this, 'CheckWord'));

        /** A word only has to be stored once per document. */
        return array_unique($words);
    }

    /** method ParsePage(str URL)
     *  Downloads the document in object pageURL, extracts its words into object
     *  wordsIndex and passes them on to IndexWords(). Returns false (after sending
     *  a message) when the document cannot be read.
     *
     *  The $url parameter is accepted for backward compatibility; object pageURL is used.
     */
    function ParsePage($url)
    {
        $contents = @file_get_contents($this->pageURL);

        if ($contents === false) {
            $this->GiveMsg("The specified URL ('<font color=red>" . htmlspecialchars($this->pageURL, ENT_QUOTES) . "</font>') does not exist!");
            return false;
        }

        $this->pageContents   = $contents;
        $this->parsedContents = $this->ExtractText($this->pageContents);
        $this->wordsIndex     = $this->SplitWords($this->parsedContents);

        /** Hand the words over to get them into the database. */
        $this->IndexWords($this->wordsIndex, $this->pageURL);
    }

    /** method EscapeSql(str value)
     *  Escapes a value for use inside a quoted SQL string on the open connection.
     */
    function EscapeSql($value)
    {
        return mysql_real_escape_string($value, $this->dbOpen);
    }

    /** method IndexWords(array words, str URL)
     *  Stores the words in object wordsIndex for the document in object pageURL:
     *  updates the existing index when the document is already known, otherwise
     *  creates a new one.
     *
     *  The parameters are accepted for backward compatibility; object state is used.
     */
    function IndexWords($words, $url)
    {
        /** Check to see if the web page has been indexed before. */
        $safe_url = $this->EscapeSql($this->pageURL);
        $result   = mysql_query("SELECT iKeySida FROM tblsida WHERE sWebsida='$safe_url'", $this->dbOpen);

        $known = false;
        if ($result) {
            while ($row = mysql_fetch_array($result)) {
                $this->pageID = $row['iKeySida'];
                $known = true;
            }
        }

        if ($known) {
            $this->UpdateIndex($this->pageID, $this->pageURL);
        } else {
            $this->AddIndex($this->pageURL, $this->wordsIndex);
        }
    }

    /** method AddIndex(str URL, array words)
     *  Adds a new document record for object pageURL and links every word in
     *  object wordsIndex to it, creating word records that do not exist yet.
     *  When there are no words a message is sent and the script is terminated
     *  (the document record has been created by then, as before).
     *
     *  The parameters are accepted for backward compatibility; object state is used.
     */
    function AddIndex($url, $words)
    {
        /** Create a new record in the database for the web document. */
        $safe_url = $this->EscapeSql($this->pageURL);
        mysql_query("INSERT INTO tblsida (sWebsida) VALUES ('$safe_url')", $this->dbOpen);

        /** Get the last inserted ID number (the actual document ID). */
        $new_index_id = (int) mysql_insert_id($this->dbOpen);

        /** If $this->wordsIndex is empty, show a message. */
        if (!$this->wordsIndex) {
            $this->GiveMsg("No words were found on the page you selected to index.");
            exit;
        }

        foreach ($this->wordsIndex as $word) {
            /** Escape the word with the connection-aware function before using it in SQL. */
            $safe_word = $this->EscapeSql($word);

            /** Look the word up in the words table. */
            $match = mysql_query("SELECT iKeyOrd FROM tblord WHERE sOrd='$safe_word'", $this->dbOpen);

            if ($match && mysql_num_rows($match) != 0) {
                /** The word already exists, so only a connector/bridge between the
                 *  document and each matching word record is needed. */
                while ($word_row = mysql_fetch_array($match)) {
                    $word_id = (int) $word_row['iKeyOrd'];
                    mysql_query("INSERT INTO tblsida_ord (iKeySida, iKeyOrd) VALUES ('$new_index_id', '$word_id')", $this->dbOpen);
                }
            } else {
                /** Unknown word: add it to the words table together with a new connector. */
                mysql_query("INSERT INTO tblord (sOrd) VALUES ('$safe_word')", $this->dbOpen);
                $new_word_id = (int) mysql_insert_id($this->dbOpen);
                mysql_query("INSERT INTO tblsida_ord (iKeySida, iKeyOrd) VALUES ('$new_index_id', '$new_word_id')", $this->dbOpen);
            }
        }

        /** If everything went okay, print out an OK message. */
        $this->GiveMsg("The document '<font color=green>" . htmlspecialchars($this->pageURL, ENT_QUOTES) . "</font>' have been successfully indexed.");
    }

    /** method UpdateIndex(str pageid, str URL)
     *  Replaces the stored index of the document in object pageID: removes its
     *  connectors, removes words that are no longer referenced by any document,
     *  removes the document record and finally re-creates everything via AddIndex().
     *
     *  The parameters are accepted for backward compatibility; object state is used.
     */
    function UpdateIndex($pid, $url)
    {
        $page_id = (int) $this->pageID;

        /** Get the word connectors/bridges of this document. */
        $connectors = mysql_query("SELECT iKeySida, iKeyOrd FROM tblsida_ord WHERE iKeySida = $page_id", $this->dbOpen);

        if ($connectors) {
            while ($conn_row = mysql_fetch_array($connectors)) {
                $_page_id = (int) $conn_row['iKeySida']; // existing document ID
                $_word_id = (int) $conn_row['iKeyOrd'];  // existing word ID

                /** Delete the connector/bridge between this document and the word. */
                mysql_query("DELETE FROM tblsida_ord WHERE iKeySida=$_page_id AND iKeyOrd=$_word_id", $this->dbOpen);

                /** If no other connector references the word any more,
                 *  remove the word itself to save database space. */
                $still_used = mysql_query("SELECT * FROM tblsida_ord WHERE iKeyOrd=$_word_id", $this->dbOpen);
                if ($still_used && mysql_num_rows($still_used) == 0) {
                    mysql_query("DELETE FROM tblord WHERE iKeyOrd=$_word_id", $this->dbOpen);
                }
            }
        }

        /** Delete the document index from the database. */
        mysql_query("DELETE FROM tblsida WHERE iKeySida=$page_id", $this->dbOpen);

        /** Re-create a new page index in the database for the web page. */
        $this->AddIndex($this->pageURL, $this->wordsIndex);
    }
}
?>