Docs GODI Archive
Projects Blog Link DB

Search GODI:


More options
File lib/ocaml/pkg-lib/netstring/nethtml.mli GODI Package godi-ocamlnet
Library netstring
 
   Nethtml.html    nethtml.cmi_pretty    nethtml.mli    Sources  
(* $Id: nethtml.mli 1013 2006-10-02 13:20:18Z gerd $
 * ----------------------------------------------------------------------
 *
 *)

(** Parsing of HTML *)


(** The type [document] represents parsed HTML documents:
 *
 * {ul
 * {- [Element (name, args, subnodes)] is an element node for an element of
 *   type [name] (i.e. written [<name ...>...</name>]) with arguments [args]
 *   and subnodes [subnodes] (the material within the element). The arguments
 *   are simply name/value pairs. Entity references (something like [&xy;])
 *   occurring in the values are {b not} resolved.
 *
 *   Arguments without values (e.g. [<select name="x" multiple>]: here,
 *   [multiple] is such an argument) are represented as [(name,name)], i.e. the
 *   name is also returned as value.
 *
 *   As argument names are case-insensitive, the names are all lowercase.}
 * {- [Data s] is a character data node. Again, entity references are kept
 *   as such in [s]; they are not replaced by what they mean.}
 * }
 *
 * Character encodings: The parser is restricted to ASCII-compatible
 * encodings (see the function {!Netconversion.is_ascii_compatible} for
 * a definition). In order to read other encodings, the text must
 * first be recoded to an ASCII-compatible encoding (example below).
 * Names of elements and attributes must additionally be ASCII-only.
 *)
type document =
    Element of (string  *  (string*string) list  *  document list)
      (* (name, args, subnodes); the parentheses make the argument of
       * [Element] a single triple rather than three separate arguments *)
  | Data of string
      (* character data, entity references left unresolved *)
;;


(** We also need a type that declares how to handle the various tags.
 * This is called a "simplified DTD", as it is derived from SGML DTDs,
 * but simplified to the extent used in the HTML definition.
 *)

(* Now follows the type definition of simplified DTDs. *)

type element_class =         (* What is the class of an element? *)
  [ `Inline
  | `Block
  | `Essential_block
  | `None
  | `Everywhere
  ]
;;
(** Element classes are a property used in the HTML DTD. For our purposes,
 * we define element classes simply as an enumeration:
 * - [`Inline] is the class of inline HTML elements
 * - [`Block] is the class of block HTML elements
 * - [`Essential_block] is a sub-class of [`Block] with the additional
 *   property that every start tag must be explicitly ended, i.e. a missing
 *   end tag is not added implicitly
 * - [`None] means that the members of the class are neither block nor
 *   inline elements, but have to be handled specially
 * - [`Everywhere] means that the members of the class can occur everywhere,
 *   regardless of whether a model constraint allows them or not.
 *)


type model_constraint =      (* The constraint the subelements must fulfill *)
  [ `Inline
  | `Block
  | `Flow                                            (* = `Inline or `Block *)
  | `Empty
  | `Any
  | `Special
  | `Elements of string list             (* Enumeration of allowed elements *)
  | `Or of (model_constraint * model_constraint)
  | `Except of (model_constraint * model_constraint)
  | `Sub_exclusions of (string list * model_constraint)
  ]
;;
(** Model constraints define the possible sub elements of an element:
 * - [`Inline]: The sub elements must belong to the class [`Inline]
 * - [`Block]: The sub elements must be members of the classes [`Block] or
 *   [`Essential_block]
 * - [`Flow]: The sub elements must belong to the classes [`Inline], [`Block],
 *   or [`Essential_block]
 * - [`Empty]: There are no sub elements
 * - [`Any]: Any sub element is allowed
 * - [`Special]: The element has special content (e.g. [<script>]).
 *   Functionally equivalent to [`Empty]
 * - [`Elements l]: Only these enumerated elements may occur as sub elements
 * - [`Or(m1,m2)]: One of the constraints [m1] or [m2] must hold
 * - [`Except(m1,m2)]: The constraint [m1] must hold, and [m2] must not hold
 * - [`Sub_exclusions(l,m)]: The constraint [m] must hold; furthermore,
 *   the elements enumerated in list [l] are not allowed as direct or
 *   indirect subelements, even if [m] or the model of a subelement would
 *   allow them. The difference from [`Except(m, `Elements l)] is that the
 *   exclusion is inherited by the subelements. The [`Sub_exclusions]
 *   expression must be toplevel, i.e. it must not occur within an [`Or],
 *   [`Except], or another [`Sub_exclusions] expression.
 *
 * Note that the members of the class [`Everywhere] are allowed everywhere,
 * regardless of whether the model constraint allows them or not.
 *
 * Note that certain aspects are not modeled:
 * - [#PCDATA]: We do not specify where PCDATA is allowed and where not.
 * - Order, Number: We specify neither the order in which the sub elements
 *   must occur nor how often they can occur
 * - Inclusions: DTDs may describe that an element exceptionally
 *   allows a list of elements in all sub elements.
 * - Optional tags: Whether start or end tags can be omitted (to some extent,
 *   this can be expressed with [`Essential_block], however)
 *)

type simplified_dtd =
    (string * (element_class * model_constraint)) list;;
(** A [simplified_dtd] is an association list of tuples
 *  [(element_name, (element_class, constraint))]: For every [element_name]
 *  it is declared that the element is a member of [element_class], and that
 *  its sub elements must satisfy [constraint].
 *
 *  It is not allowed to have several entries for the same element.
 *)

val html40_dtd : simplified_dtd
  (** The (transitional) HTML 4.0 DTD, expressed as [simplified_dtd].
   * This is the DTD that [parse_document] uses when no [dtd] argument
   * is passed.
   *)

val relaxed_html40_dtd : simplified_dtd
  (** A relaxed version of the HTML 4.0 DTD that better matches common
   * practice. In particular, this DTD additionally allows inline
   * elements to span blocks. For example,
   * {[ <B>text1 <P>text2 ]}
   * is parsed as
   * {[ <B>text1 <P>text2</P></B> ]}
   * and not as
   * {[ <B>text1 </B><P>text2</P> ]}
   * \- the latter is more correct (and parsed by [html40_dtd]), but is not what
   * users expect.
   *
   * Note that this is still not what many browsers implement. For example,
   * Netscape treats most inline tags specially: [<B>] switches bold on,
   * [</B>] switches bold off. For example,
   * {[ <A href='a'>text1<B>text2<A href='b'>text3 ]}
   * is parsed as
   * {[ <A href='a'>text1<B>text2</B></A><B><A href='b'>text3</A></B> ]}
   * \- there is an extra [B] element around the second anchor! (You can
   * see what Netscape parses by loading a page into the "Composer".)
   * IMHO it is questionable to consider inline tags as switches because
   * this is totally outside of the HTML specification, and browsers may
   * differ on that point.
   *
   * Furthermore, several elements are turned into essential blocks:
   * [TABLE], [UL], [OL], and [DL]. David Fox reported a problem with structures
   * like:
   * {[ <TABLE><TR><TD><TABLE><TR><TD>x</TD></TD></TR></TABLE>y</TD></TR></TABLE> ]}
   * i.e. the [TD] of the inner table has two end tags. Without additional
   * help, the second [</TD>] would close the outer table cell. Because of
   * this problem, tables are now essential blocks, meaning that it is not
   * allowed to implicitly add a missing [</TABLE>]; every table element has
   * to be explicitly ended. This rule seems to be what many browsers implement.
   *)

val parse_document : ?dtd:simplified_dtd ->            (* default: html40_dtd *)
                     ?return_declarations:bool ->      (* default: false *)
                     ?return_pis:bool ->               (* default: false *)
                     ?return_comments:bool ->          (* default: false *)
                     Lexing.lexbuf ->
                       document list
  (** Parses the HTML document from a [lexbuf] and returns it. 
   * 
   * @param dtd specifies the DTD to use. By default, [html40_dtd] is used, which
   *   is based on the transitional HTML 4.0 DTD
   * @param return_declarations if set, the parser returns [<!...>] declarations
   *   as [Element("!",["contents",c],[])] nodes, where [c] is the string inside
   *   [<!] and [>]. - By default, declarations are skipped.
   * @param return_pis if set, the parser returns [<?...>] (or [<?...?>]) processing
   *   instructions as [Element("?",["contents",c],[])] nodes, where [c] is the
   *   string inside [<?] and [>] (or [?>]). - By default, processing instructions
   *   are skipped.