Version: 0.90b
Type: Function
Category: Algorithms
License: GNU General Public License
Description: This code parses any formatted text that is tagged with < and >. Look at the code for some explanation. It is being heavily updated at the moment (I need it as an OFX parser, more on that later), so it might be worth checking out. Arguments probably won't work, because I have not yet tested them (I do not need them when parsing OFX).
<?php
// This code has been published under GPL version 2 or higher
// so you are free to modify etc etc
// Please drop me a line when you use it in your code or
// when you modify it
// I will be making changes to it and include more comments
// [email protected]
// btw NO WARRANTIES
//
// function parsehtml
// based on the libhtmlparse library by
// Mooneer Salem ([email protected])
// should be rewritten to be an extension to php4
//
// Version 0.90b
//
// The parser reports what it finds through callback functions that the
// including script has to define:
//   startCallback($tag, $args, $numargs)   opening tag
//   endCallback($tag)                      closing tag
//   declCallback($tag, $args, $numargs)    declaration (<!NAME ...>)
//   startCommentCallback() / commentCallback($comment) / endCommentCallback()
//   textStartCallback() / textCallback($text) / textEndCallback()

/**
 * Debug print helper: tags are not easy to output in a browser, so the
 * value is entity-encoded before it is printed.
 *
 * @param string $txt Label printed in front of the value (printed as-is).
 * @param string $var Value to print; HTML special characters are converted
 *                    to entities.
 * @return void Output is written with printf().
 */
function dp($txt, $var)
{
    $trans = get_html_translation_table(HTML_ENTITIES);
    $encoded = strtr((string) $var, $trans);
    printf("<br>%s is now: %s", $txt, $encoded);
}

/**
 * Returns the offset of the next '<' that opens markup.
 *
 * A '<' that is followed by a space, or that is the last character of the
 * input, is treated as literal text and skipped.
 *
 * @param string $html Input being parsed.
 * @param int    $from Offset to start searching at (must be <= $len).
 * @param int    $len  Length of $html.
 * @return int Offset of the '<', or $len when no further markup exists.
 */
function parseFindMarkupStart($html, $from, $len)
{
    $i = $from;
    while (($i = strpos($html, '<', $i)) !== false) {
        if ($i + 1 < $len && $html[$i + 1] !== ' ') {
            return $i;
        }
        $i++;
    }

    return $len;
}

/**
 * Returns the offset of the '>' that closes a start tag or declaration.
 *
 * A '>' inside a single- or double-quoted argument value does not end the
 * tag; a quote preceded by a backslash does not end the quoted value.
 *
 * @param string $html Input being parsed.
 * @param int    $from Offset just behind the tag name.
 * @param int    $len  Length of $html.
 * @return int Offset of the closing '>', or $len when the tag never closes.
 */
function parseFindTagEnd($html, $from, $len)
{
    $quote = '';
    for ($i = $from; $i < $len; $i++) {
        $ch = $html[$i];
        if ($quote !== '') {
            if ($ch === $quote && $html[$i - 1] !== '\\') {
                $quote = '';
            }
        } elseif ($ch === '"' || $ch === "'") {
            $quote = $ch;
        } elseif ($ch === '>') {
            return $i;
        }
    }

    // A quote was opened but never closed: do not swallow the rest of the
    // document, fall back to the first plain '>' behind the tag name.
    $plain = strpos($html, '>', $from);

    return $plain === false ? $len : $plain;
}

/**
 * Parses text tagged with < and > and reports every piece it finds through
 * the callback functions listed at the top of this file.
 *
 * - "<!-- ... -->" calls startCommentCallback(), commentCallback() with the
 *   comment body (surrounding whitespace removed) and endCommentCallback().
 * - "</name>" calls endCallback('name').
 * - "<!NAME ...>" calls declCallback('NAME', "", 0).
 * - "<name ...>" calls startCallback('name', "", 0).
 * - Anything else is text: textStartCallback(), textCallback($text),
 *   textEndCallback(). Line breaks at the start of a text run are dropped
 *   and runs that contain only whitespace are not reported.
 *
 * Arguments inside a tag are only scanned to find the end of the tag; they
 * are not passed on yet, so the callbacks always receive "" and 0 for them.
 *
 * @param string $html The tagged text to parse.
 * @return void
 */
function parse($html)
{
    $html = (string) $html;
    $len = strlen($html);
    $pos = 0;

    while ($pos < $len) {
        $next = parseFindMarkupStart($html, $pos, $len);

        if ($next > $pos) {
            // Text up to the next tag (or up to the end of the input).
            $text = ltrim(substr($html, $pos, $next - $pos), "\r\n");
            if (trim($text) !== '') {
                textStartCallback();
                textCallback($text);
                textEndCallback();
            }
            $pos = $next;
            continue;
        }

        // $html[$pos] is '<' and at least one more character follows it.
        $after = $pos + 1;

        if (substr($html, $pos, 4) === '<!--') {
            $bodyStart = $pos + 4;
            $close = strpos($html, '-->', $bodyStart);
            if ($close === false) {
                // Unterminated comment: it runs to the end of the input.
                $body = (string) substr($html, $bodyStart);
                $pos = $len;
            } else {
                $body = (string) substr($html, $bodyStart, $close - $bodyStart);
                $pos = $close + 3;
            }
            startCommentCallback();
            commentCallback(trim($body));
            endCommentCallback();
            continue;
        }

        if ($html[$after] === '/') {
            $nameStart = $after + 1;
            $close = strpos($html, '>', $nameStart);
            if ($close === false) {
                $close = $len;
            }
            endCallback(trim((string) substr($html, $nameStart, $close - $nameStart)));
            $pos = $close + 1;
            continue;
        }

        // Start tag or declaration: the name ends at whitespace or '>'.
        $nameLen = strcspn($html, " \t\r\n>", $after);
        $tag = (string) substr($html, $after, $nameLen);
        $close = parseFindTagEnd($html, $after + $nameLen, $len);

        if ($tag !== '' && $tag[0] === '!') {
            declCallback((string) substr($tag, 1), "", 0);
        } else {
            startCallback($tag, "", 0);
        }
        $pos = $close + 1;
    }

    return;
}