Parsing URIs

A fast and easy way to parse URIs, as seen in commercial libraries

NOTE: if you have an Internet library then you already have URI parsing functions; however, this may serve as an alternative, a way to check how the parsing algorithm works, a way to spend an article credit, a way to flame somebody else's coding, etc.



In Indy (Internet Direct), for URI parsing check the TIdURI class in the IdURI unit.



In TurboPower Internet Professional, see the IpParseURL function in the IpMisc unit.



I don't know about ICS, though.





A URI is the way an internet address presents itself; most protocols use one. A URI has the following syntax:



[PROTOCOL + ://]HOST[:PORT][SUBDIRS][DOCUMENT][#+BOOKMARK]



Now here is the approach I came up with for URI parsing; the code is commented.



<---------------------BEGIN CODE--------------------------->

{Tell whether ACharacter is one of the decimal digits '0'..'9'.}
function IsNumber(ACharacter: Char): Boolean;
begin
  // The ten digits are contiguous in the character set, so a range test
  // is enough.
  Result := (ACharacter >= '0') and (ACharacter <= '9');
end;



{Split the parameter part of a URL into a TStrings, one entry per
 '&'-separated parameter.

 EG. http://search.yahoo.com/bin/search?p=britney+spears
     Params  = 'p=britney+spears'
     Dest[0] = 'p=britney+spears'

 Params - the text that follows the '?' of a URL
 Dest   - list that receives the entries; it is cleared first. When Dest
          is nil the procedure does nothing.

 The text after the last '&' is always added, even when it is empty.
 Use TStrings.Names and TStrings.Values to read names and values.}
procedure SplitParams(const Params: String; Dest: TStrings);
var
  SegStart, i: Integer;
begin
  if Dest = nil then Exit;

  Dest.Clear;

  // Walk the text once, emitting the segment that ends at each '&'
  SegStart := 1;
  for i := 1 to Length(Params) do
    if Params[i] = '&' then
    begin
      Dest.Add(Copy(Params, SegStart, i - SegStart));
      SegStart := i + 1;
    end;

  // Whatever follows the last separator (possibly nothing) is the final entry
  Dest.Add(Copy(Params, SegStart, Length(Params) - SegStart + 1));
end;



{Locate the query part of a URL and split it into parameters.

 URL      - the address to inspect
 Params   - receives one entry per parameter (filled by SplitParams);
            may be nil when only the position is wanted
 QueryPos - set to the 1-based index of the first '?' in URL, or to 0
            when the URL has no query

 When the URL has no query, Params is emptied so that entries from an
 earlier call do not survive. A '#bookmark' that follows the query is
 not treated as part of the parameters.}
procedure ProcessQuery(const URL: String; Params: TStrings; var QueryPos: Integer);
var
  Query: String;
  HashPos: Integer;
begin
  QueryPos := Pos('?', URL);

  if QueryPos = 0 then
  begin
    // No query: do not leave stale parameters in the caller's list
    if Assigned(Params) then
      Params.Clear;
    Exit;
  end;

  // Anything after the '? ' belongs to the query...
  Query := Copy(URL, QueryPos + 1, Length(URL) - QueryPos);

  // ...except a trailing bookmark, which would otherwise be glued onto
  // the last parameter's value
  HashPos := Pos('#', Query);
  if HashPos > 0 then
    Query := Copy(Query, 1, HashPos - 1);

  SplitParams(Query, Params);
end;



{Given a URL, return the document name.

 URL    - the address to inspect
 DocPos - always assigned: the 1-based index in URL where the document
          starts. When there is no document it is the index just past
          the host/path part (i.e. the position of the first '?' or '#',
          or Length(URL)+1), so Copy(URL, 1, DocPos-1) is everything
          that precedes the document.

 Special cases:
  1) http://www.hello.com or www.hello.com
     there is no document (the server will pick one when queried),
     so '' is returned
  2) http://www.hi.com/index.php
     http://www.hi.com/index.php#thisis
     http://www.hi.com/index.php#thisis?howareyou=hacker
     http://www.hi.com/index.php?howareyou=hacker
     the document is index.php; bookmark and query are ignored}
function QueryDocument(const URL: String; var DocPos: Integer): String;
var
  PathEnd, HostStart, Mark, i: Integer;
begin
  Result := '';

  // The host/path part stops at the first '?' or '#', whichever comes
  // first; a '/' beyond that point is not a path separator.
  PathEnd := Length(URL);
  Mark := Pos('?', URL);
  if (Mark > 0) and (Mark - 1 < PathEnd) then
    PathEnd := Mark - 1;
  Mark := Pos('#', URL);
  if (Mark > 0) and (Mark - 1 < PathEnd) then
    PathEnd := Mark - 1;

  // Skip 'protocol://' so its slashes are not taken for the path.
  // A '://' that only appears inside the query or bookmark is ignored.
  HostStart := 1;
  Mark := Pos('://', URL);
  if (Mark > 0) and (Mark + 2 <= PathEnd) then
    HostStart := Mark + 3;

  // Walk back to the last '/' of the path. The bound check comes first
  // so URL[i] is never read outside the string.
  i := PathEnd;
  while (i >= HostStart) and (URL[i] <> '/') do
    Dec(i);

  if i < HostStart then
  begin
    // No '/' after the host: URL of the form www.hello.com, no document
    DocPos := PathEnd + 1;
    Exit;
  end;

  DocPos := i + 1;
  Result := Copy(URL, DocPos, PathEnd - i);
end;



{Query the port number of a URL, if any.

 EG. www.hello.com:8000
     http://www.hello.com:8000/index.htm
 QueryPort returns 8000 for both.

 URL - the address to inspect

 Returns the port as an Integer, or 0 when the URL carries no port or
 the text after the ':' is not a plain decimal number. Only the
 'host[:port]' part is examined, so a ':' in the path, query or
 bookmark is never taken for a port.}
function QueryPort(const URL: String): Integer;
var
  Authority, Digits: String;
  Idx, i, Code: Integer;
begin
  Result := 0;
  Authority := URL;

  // Query and bookmark never hold the port; remove them first so that a
  // ':' or '/' inside them cannot confuse the steps below.
  Idx := Pos('?', Authority);
  if Idx > 0 then
    Delete(Authority, Idx, Length(Authority));
  Idx := Pos('#', Authority);
  if Idx > 0 then
    Delete(Authority, Idx, Length(Authority));

  // Drop a leading 'protocol://'. Requiring its '/' to be the first '/'
  // of the text rejects a '://' that merely occurs inside the path.
  Idx := Pos('://', Authority);
  if (Idx > 0) and (Pos('/', Authority) = Idx + 1) then
    Delete(Authority, 1, Idx + 2);

  // Cut the path off, leaving 'host[:port]'
  Idx := Pos('/', Authority);
  if Idx > 0 then
    Delete(Authority, Idx, Length(Authority));

  // The port follows the LAST ':' (earlier ones may belong to
  // 'user:password@' or to a bracketed IPv6 literal).
  Idx := Length(Authority);
  while (Idx > 0) and (Authority[Idx] <> ':') do
    Dec(Idx);
  if Idx = 0 then
    Exit;   // no port number, exit gracefully with 0

  Digits := Copy(Authority, Idx + 1, Length(Authority) - Idx);
  if Digits = '' then
    Exit;

  // Val also accepts signs and '$' hex prefixes; a port is digits only
  for i := 1 to Length(Digits) do
    if (Digits[i] < '0') or (Digits[i] > '9') then
      Exit;

  Val(Digits, Result, Code);
  if Code <> 0 then
    Result := 0;   // e.g. the number does not fit in an Integer
end;



{Query a bookmark in a document.

 EG. www.hello.com/index.htm#notwelcome
     www.hello.com/index.htm#notwelcome?howareyou=hacker
 QueryBookmark returns 'notwelcome' for both.

 URL - the address to inspect

 Returns the text that follows the first '#', up to (not including) a
 '?' that comes after it, or '' when the URL has no '#'.}
function QueryBookmark(const URL: String): String;
var
  HashPos, LastPos, i: Integer;
begin
  // A string Result is not cleared automatically; set it before any Exit
  Result := '';

  HashPos := Pos('#', URL);
  if HashPos = 0 then Exit;

  // The bookmark runs to the end of the URL unless a query follows it.
  // Only a '?' AFTER the '#' counts; one before it belongs to a query
  // that precedes the bookmark.
  LastPos := Length(URL);
  for i := HashPos + 1 to Length(URL) do
    if URL[i] = '?' then
    begin
      LastPos := i - 1;
      Break;
    end;

  Result := Copy(URL, HashPos + 1, LastPos - HashPos);
end;



{Split a URI into its parts by calling the Query* routines.

 URI         - the address to parse
 Host        - receives everything in URI that precedes the document
               (the text before the position QueryDocument reports), so
               it includes the protocol, port and sub directories as
               written
 Document    - receives the result of QueryDocument
 BookMark    - receives the result of QueryBookmark
 Port        - receives the result of QueryPort converted to text
 QueryParams - passed to ProcessQuery to receive the query parameters}
procedure ParseURI(const URI: String; var Host, Document, BookMark, Port: String; QueryParams: TStrings);
var
  QueryPos, DocPos: Integer;
begin
  ProcessQuery(URI, QueryParams, QueryPos);
  BookMark := QueryBookmark(URI);
  Port := IntToStr(QueryPort(URI));

  // Start from a known value in case QueryDocument returns without
  // assigning DocPos; otherwise Host would be cut at a random position.
  DocPos := 0;
  Document := QueryDocument(URI, DocPos);

  if DocPos > 0 then
    Host := Copy(URI, 1, DocPos - 1)
  else if QueryPos > 0 then
    // No document position reported: fall back to the text before '?'
    Host := Copy(URI, 1, QueryPos - 1)
  else
    Host := URI;
end;

<---------------------END CODE----------------------------->



By the way, this algorithm can be extended, expanded, etc., and I am working on it. If you have any suggestions, comments or criticism, drop a comment.



KNOWN BUGS:



  * According to some sources, the Pos function is not as fast as it should be and cannot be used for reverse string searching.

  * Yes, I know this isn't the best algorithm for URI parsing; as of this writing I am working on enhancements, code extensions, etc.

 

Share this article!

Follow us!

Find more helpful articles: