ParseHTML

The following code demonstrates how to parse a html file looking for



Begin Tag

End Tag

Raw Text



The following routine demonstrates how to parse a html file.



I welcome feed back to improve the routine, if you have any suggestions/hints please let me know.



rgds



Si Carter





---------------- BEGIN CODE BLOCK ------------------------



unit HTMLParse;

(***************************************************************************



    HTMLParse



    Purpose: Parse a html file to extract tags and plain text.



    Copyright © 2003 - TECT Software Ltd. All Rights Reserved.



    All code remains the property of TECT Software Ltd and may not

    be changed without permission. Use of this code is granted to

    any developer for private, open source or commercial applications.



    No warranty expressed or implied. Use at own risk.





    Contact:

      WEB - www.tectsoft.com

      EMail - support@tectsoft.com



    Copyright Notice Must Remain With File.



    Visit www.tectsoft.com for *low cost* developer friendly web hosting.



    Requires:

      FastStrings from http://www.droopyeyes.com



    Usage:

      See www.howtodothings.com for demo usage.



****************************************************************************)



interface



uses Classes, FastStringFuncs, FastStrings;



type

  TTagType = (ttBeginTag, ttEndTag, ttRawText);

  THTMLParseProc = procedure(const HTMLData: string; TagType: TTagType;

    Parameters: TStrings);



procedure ParseHTML(const HTML: string; ParseProc: THTMLParseProc);



implementation



uses SysUtils;



const

  (* NOTE: download the file below, the following codes are wrong when

 displayed in a browser like this :-) *)


  THTMLReplaceWords: array[0..4] of array[0..1] of string = ((' ', ' '),

    ('&', '&'), ('<', '<'), ('>', '>'), ('"', '"'));



procedure ParseHTML(const HTML: string; ParseProc: THTMLParseProc);



  procedure CallTagProc(IsTag: Boolean; HTMLData: string);

  var

    s: string;

    sl: TStringList;

    I: Integer;

  begin

    HTMLData := Trim(HTMLData);

    if Length(HTMLData) > 0 then

    begin

      if IsTag then

      begin

        if Pos(' ', HTMLData) > 0 then


          s := Trim(Copy(HTMLData, 1, Pos(' ', HTMLData)))

        else

          s := Trim(HTMLData);



        sl := TStringList.Create;

        try

          sl.Text := Trim(Copy(HTMLData, Length(s) + 1, length(HTMLData)));

          sl.Text := Trim(FastReplace(sl.Text, ';', #13));

          sl.Text := Trim(FastReplace(sl.Text, '" ', #13));

          sl.Text := Trim(FastReplace(sl.Text, '"', ''));



          if LeftStr(s, 1) = '/' then

            THTMLParseProc(ParseProc)(uppercase(s), ttEndTag, sl)

          else

            THTMLParseProc(ParseProc)(UpperCase(s), ttBeginTag, sl);

        finally

          sl.Free;

        end;

      end else

      begin

        for I := 0 to 4 do

          HTMLData := FastReplace(HTMLData, THTMLReplaceWords[I, 0],

            THTMLReplaceWords[I, 1]);



        THTMLParseProc(ParseProc)(HTMLData, ttRawText, nil);

      end;

    end;

  end;



var

  s: string;

  P: PChar;

begin

  Assert(Assigned(ParseProc));

  P := PChar(HTML);

  s := '';



  while P^ <> #0 do

  begin

    case P^ of

      '<':

        begin

          CallTagProc(False, s);

          s := '';

        end;

      '>':

        begin

          CallTagProc(True, s);

          s := '';

        end;

    else

      s := s + P^;

    end; //case

    Inc(P);

  end;

end;



end.





---------------- END CODE BLOCK ------------------------



Downloads:



Source File: HTMLParse.pas



Demo:

htmlparse.zip



All tested using D6.

 

Share this article!

Follow us!

Find more helpful articles: