Docs GODI Archive
Projects Blog Link DB

Search GODI:


More options
File lib/ocaml/pkg-lib/netstring/neturl.mli GODI Package godi-ocamlnet
Library netstring
 
   Neturl.html    neturl.cmi_pretty    neturl.mli    Sources  
(* $Id: neturl.mli 1003 2006-09-24 15:17:15Z gerd $
 * ----------------------------------------------------------------------
 *
 *)

(* This module already uses O'Caml-3 features. *)

(** Uniform Resource Locators (URLs)
 *
 * {b Contents}
 *
 * - {!Neturl.interface}
 *
 * The tutorial has been moved to {!Neturl_tut}.
 *)

(** {1:interface Interface}
 *
 * This module provides functions to parse URLs, to print URLs, to
 * store URLs, to modify URLs, and to apply relative URLs.
 *
 * URLs are strings formed according to pattern (1) or (2):
 *
 * + [scheme://user;userparams:password\@host:port/path;params?query#fragment]
 * + [scheme:other;params?query#fragment]
 *
 * The word at the beginning of the URL identifies the URL scheme
 * (such as "http" or "file"). Depending on the scheme, not all of the
 * parts are allowed, or parts may be omitted. This module defines the
 * type [url_syntax] whose values describe which parts are allowed/required/
 * not allowed for a concrete URL scheme (see below).
 *
 * Not all characters are allowed in a URL. Some characters are allowed,
 * but have the special task to separate the various parts of the URL
 * (reserved characters).
 * However, it is possible to include even invalid or reserved characters
 * as normal content by applying the [%]-encoding on these characters:
 * A ['%'] indicates that an encoded character follows, and the character
 * is denoted by a two-digit hexadecimal number (e.g. [%2f] for ['/']).
 * In the following descriptions, the term "encoded string" means a string
 * containing such [%]-encoded characters, and the "decoded string" means a
 * string not containing such characters.
 * See the module {!Netencoding.Url} for functions encoding or decoding
 * strings.
 *
 * The type [url] describes values storing the components of a URL,
 * and the [url_syntax] for the URL. In general, the components are
 * stored as encoded strings; however, the [%]-encoding is not
 * applicable to all components.
 *
 * For convenience, the functions creating, modifying, and accessing
 * URLs can handle both encoded and decoded strings. In order to
 * avoid errors, the functions pass strings even in their decoded form.
 *
 * Note that there is currently no function to compare URLs. The
 * canonical comparison ( [=] ) is not applicable because the same URL
 * may be written in different ways.
 *
 * Note that nothing is said about the character set/encoding of URLs.
 * Some protocols and standards prefer UTF-8 as fundamental encoding
 * and apply the [%]-encoding on top of it; i.e. the byte sequence
 * representing a character in UTF-8 is [%]-encoded. 
 *
 * {b Standards Compliance}
 *
 * This module implements RFC 1738 and RFC 1808. There is also a newer
 * RFC, 2396, updating the former RFCs, but this module is not fully 
 * compatible with RFC 2396. The following (minor) problems may occur:
 *
 * - The module escapes more characters than needed. All characters that
 *   are "unsafe" or "reserved" in either RFC document are escaped.
 * - URL parameters (appended with a ";") are handled as in RFCs 1738/1808.
 *   In RFC 2396, every path component may have parameters, and the
 *   algorithm to resolve relative URLs is different in this point.
 *   If it is required to apply RFC 2396, one can disable URL parameters
 *   in the syntax, and extract them from the path by a self-written
 *   postprocessor. Usually, this is only required for [imap] URLs.
 *
 * In one point, RFC 2396 is preferred:
 *
 * - Authorities may be terminated by a question mark, as in
 *   ["http://host?query"]. This is illegal in RFC 1738. The consequence
 *   is, however, that question marks in user strings must be escaped.
 *)

exception Malformed_URL
(** Raised by a number of functions of this module when they encounter a
 * badly formed URL. For example, [extract_url_scheme] raises it when
 * no scheme name can be found in its argument.
 *)

val extract_url_scheme : string -> string
  (** [extract_url_scheme s] returns the URL scheme from [s], the string
   * representation of a URL.
   * E.g. [extract_url_scheme "http://host/path" = "http"].
   * The scheme name is always converted to lowercase characters.
   *
   * @raise Malformed_URL if the scheme name is not found.
   *)

(** Describes how a [url_syntax] treats one particular part of a URL:
 * whether the part is recognized at all, and if so, whether it is
 * optional or mandatory.
 *)
type url_syntax_option =
    Url_part_not_recognized  (** The part, even if there, is not even recognized *)
  | Url_part_allowed         (** The part can be present (it is optional) *)
  | Url_part_required        (** The part must be present *)


type url_syntax =
    { url_enable_scheme    : url_syntax_option;
      url_enable_user      : url_syntax_option;
      url_enable_user_param: url_syntax_option;
      url_enable_password  : url_syntax_option;
      url_enable_host      : url_syntax_option;
      url_enable_port      : url_syntax_option;
      url_enable_path      : url_syntax_option;
      url_enable_param     : url_syntax_option;
      url_enable_query     : url_syntax_option;
      url_enable_fragment  : url_syntax_option;
      url_enable_other     : url_syntax_option;
      url_accepts_8bits    : bool;
      url_is_valid         : url -> bool;
      url_enable_relative  : bool;
    }
(** Values of type [url_syntax] describe which components of a URL are
 * recognized, which are allowed (and optional), and which are required.
 * Each [url_enable_*] field of type [url_syntax_option] makes this
 * choice for one component.
 * Not all combinations are valid; the predicate expressed by the
 * function [url_syntax_is_valid] must hold.
 *
 * The function [url_is_valid] is applied when a fresh URL is created
 * and must return [true]. This function makes it possible to add an
 * arbitrary validity criterion to [url_syntax]. (Note that the URL
 * passed to this function is not fully working; you can safely assume
 * that the accessor functions [url_scheme] etc. can be applied to it.)
 *
 * Switch [url_accepts_8bits]: If [true], the bytes with code 128 to
 * 255 are treated like alphanumeric characters; if [false] these bytes
 * are illegal (but it is still possible to include such bytes in their
 * encoded form: [%80] to [%FF]).
 *
 * Switch [url_enable_relative]: If [true], the syntax allows relative
 * URLs in principle. Actually, parsing of relative URLs is possible
 * when the optional parts are flagged as [Url_part_allowed] and not
 * as [Url_part_required]. However, it is useful to specify URL syntaxes
 * always as absolute URLs, and to weaken them on demand when a relative
 * URL is found by the parser. This switch enables that. In particular,
 * the function [partial_url_syntax] checks this flag.
 *)

and url
 (** Values of type [url] describe concrete URLs. Every URL must have
 * a fundamental [url_syntax], and it is only possible to create URLs
 * conforming to the syntax. See [make_url] for further information.
 *)
;;



val url_syntax_is_valid : url_syntax -> bool
  (** Checks whether the passed [url_syntax] is valid. A syntax is valid
   * if all of the following conditions hold:
   *
   * - If passwords are recognized, users (and hosts) must be recognized, too
   * - If ports are recognized, hosts must be recognized, too
   * - If users are recognized, hosts must be recognized, too
   * - Either the syntax recognizes one of the phrases
   *   \{ user, password, host, port, path \}, or the syntax recognizes
   *   the phrase 'other'.
   *)


val partial_url_syntax : url_syntax -> url_syntax
  (** Transforms the syntax into another syntax where all required parts
   * (flagged as [Url_part_required]) are changed into optional parts
   * (flagged as [Url_part_allowed]).
   *
   * The description of the [url_enable_relative] switch of [url_syntax]
   * states that this function checks that switch.
   *)


(* Note that none of the following url_syntaxes allows 8bit bytes. *)

val null_url_syntax   : url_syntax
  (** A URL syntax that recognizes nothing. Use this as base for your own
   * definitions, e.g.
   * {[
   * let my_syntax = { null_url_syntax with
   *                     url_enable_host = Url_part_required; ... }
   * ]}
   *
   * Not every combination of fields is valid; a syntax derived this way
   * must satisfy the predicate [url_syntax_is_valid].
   *)

val ip_url_syntax : url_syntax
  (** Syntax for IP based protocols. This syntax allows scheme, user,
   * password, host, port, path, param, query, fragment, but not "other".
   * It does not accept 8 bit bytes (see the switch [url_accepts_8bits]
   * of [url_syntax]).
   *)

val common_url_syntax : (string, url_syntax) Hashtbl.t
  (** Syntax descriptions for common URL schemes. The key of the hashtable
   * is the scheme name, and the value is the corresponding syntax.
   *
   * - ["file"]: scheme, host?, path
   * - ["ftp"]: scheme, user?, password?, host, port?, path?, param?
   *   Note: param is not checked.
   * - ["http"], ["https"]: 
   *   scheme, user?, password?, host, port?, path?, query?
   * - ["mailto"]: scheme, other, query? (RFC 2368)
   * - ["pop"], ["pops"]: scheme, user?, user_param?, password?, host, port?
   *   Note: user_param is not checked.
   *   (RFC 2384)
   * - ["imap"], ["imaps"]: scheme, user?, user_param?, password?, host, port?,
   *   path?, query? (RFC 2192)
   *   Note: "param" is intentionally not recognized to get the resolution of
   *   relative URLs as described in the RFC. When analysing this kind of URL,
   *   it is recommended to re-parse it with "param" enabled.
   * - ["news"]: scheme, other (RFC 1738)
   * - ["nntp"], ["nntps"]: scheme, host, port?, path (with two components)