(* $Id: neturl.mli 1003 2006-09-24 15:17:15Z gerd $
* ----------------------------------------------------------------------
*
*)
(* This module applies already O'Caml-3 features. *)
(** Uniform Resource Locators (URLs)
*
* {b Contents}
*
* - {!Neturl.interface}
*
* The tutorial has been moved to {!Neturl_tut}.
*)
(** {1:interface Interface}
*
* This module provides functions to parse URLs, to print URLs, to
* store URLs, to modify URLs, and to apply relative URLs.
*
* URLs are strings formed according to pattern (1) or (2):
*
* + [scheme://user;userparams:password\@host:port/path;params?query#fragment]
* + [scheme:other;params?query#fragment]
*
* The word at the beginning of the URL identifies the URL scheme
* (such as "http" or "file"). Depending on the scheme, not all of the
* parts are allowed, or parts may be omitted. This module defines the
* type [url_syntax] whose values describe which parts are allowed/required/
* not allowed for a concrete URL scheme (see below).
*
* Not all characters are allowed in a URL. Some characters are allowed,
* but have the special task to separate the various parts of the URL
* (reserved characters).
* However, it is possible to include even invalid or reserved characters
* as normal content by applying the [%]-encoding on these characters:
* A ['%'] indicates that an encoded character follows, and the character
* is denoted by a two-digit hexadecimal number (e.g. [%2f] for ['/']).
* In the following descriptions, the term "encoded string" means a string
* containing such [%]-encoded characters, and the "decoded string" means a
* string not containing such characters.
* See the module {!Netencoding.Url} for functions encoding or decoding
* strings.
*
* The type [url] describes values storing the components of a URL,
* and the [url_syntax] for the URL. In general, the components are
* stored as encoded strings; however, not for all components the
* [%]-encoding is applicable.
*
* For convenience, the functions creating, modifying, and accessing
* URLs can handle both encoded and decoded strings. In order to
* avoid errors, the functions pass strings even in their decoded form.
*
* Note that there is currently no function to compare URLs. The
* canonical comparison ( [=] ) is not applicable because the same URL
* may be written in different ways.
*
* Note that nothing is said about the character set/encoding of URLs.
* Some protocols and standards prefer UTF-8 as fundamental encoding
* and apply the [%]-encoding on top of it; i.e. the byte sequence
* representing a character in UTF-8 is [%]-encoded.
*
* {b Standards Compliance}
*
* This module implements RFC 1738 and RFC 1808. There is also a newer
* RFC, 2396, updating the former RFCs, but this module is not fully
* compatible with RFC 2396. The following (minor) problems may occur:
*
* - The module escapes more characters than needed. All characters that
* are "unsafe" or "reserved" in either RFC document are escaped.
* - URL parameters (appended with a ";") are handled as in RFCs 1738/1808.
* In RFC 2396, every path component may have parameters, and the
* algorithm to resolve relative URLs is different in this point.
* If it is required to apply RFC 2396, one can disable URL parameters
* in the syntax, and extract them from the path by a self-written
* postprocessor. Usually, this is only required for [imap] URLs.
*
* In one point, RFC 2396 is preferred:
*
* - Authorities may be terminated by a question mark, as in
* ["http://host?query"]. This is illegal in RFC 1738. The consequence
* is, however, that question marks in user strings must be escaped.
*)
exception Malformed_URL
(** Raised by a number of functions of this module when they encounter
* a badly formed URL; for example, [extract_url_scheme] raises it when
* no scheme name can be found in its argument.
*)
val extract_url_scheme : string -> string
(** [extract_url_scheme s] returns the URL scheme found in [s], the
* string representation of a URL.
* E.g. [extract_url_scheme "http://host/path" = "http"].
* The scheme name is always converted to lowercase characters.
*
* @raise Malformed_URL if the scheme name is not found.
*)
(** Tells how a single component of a URL is treated by a [url_syntax].
* All the [url_enable_*] fields of [url_syntax] have this type.
*)
type url_syntax_option =
Url_part_not_recognized (** The part is not recognized at all, even if it occurs in the URL *)
| Url_part_allowed (** The part is optional: it can be present, but need not be *)
| Url_part_required (** The part is mandatory: it must be present *)
type url_syntax =
{ url_enable_scheme : url_syntax_option;
url_enable_user : url_syntax_option;
url_enable_user_param: url_syntax_option;
url_enable_password : url_syntax_option;
url_enable_host : url_syntax_option;
url_enable_port : url_syntax_option;
url_enable_path : url_syntax_option;
url_enable_param : url_syntax_option;
url_enable_query : url_syntax_option;
url_enable_fragment : url_syntax_option;
url_enable_other : url_syntax_option;
url_accepts_8bits : bool;
url_is_valid : url -> bool;
url_enable_relative : bool;
}
(** Values of type [url_syntax] describe which components of a URL are
* recognized, which are allowed (and optional), and which are required.
* Not all combinations are valid; the predicate expressed by the
* function [url_syntax_is_valid] must hold.
*
* The fields [url_enable_scheme] to [url_enable_other] each select the
* treatment of one URL component (scheme, user, user parameters,
* password, host, port, path, parameters, query, fragment, and the
* "other" part of URLs that do not follow the [//host/path] pattern)
* by means of a [url_syntax_option].
*
* The function [url_is_valid] is applied when a fresh URL is created
* and must return [true]. This function makes it possible to add an
* arbitrary validity criterion to [url_syntax]. (Note that the URL passed
* to this function is not fully working; you can safely assume that the
* accessor functions [url_scheme] etc. can be applied to it.)
*
* Switch [url_accepts_8bits]: If [true], the bytes with code 128 to
* 255 are treated like alphanumeric characters; if [false] these bytes
* are illegal (but it is still possible to include such bytes in their
* encoded form: [%80] to [%FF]).
*
* Switch [url_enable_relative]: If [true], the syntax allows relative
* URLs in principle. Actually, parsing of relative URLs is possible
* when the optional parts are flagged as [Url_part_allowed] and not
* as [Url_part_required]. However, it is useful to specify URL syntaxes
* always as absolute URLs, and to weaken them on demand when a relative
* URL is found by the parser. This switch enables that. In particular,
* the function [partial_url_syntax] checks this flag.
*)
and url
(** Values of type [url] describe concrete URLs. The type is abstract in
* this interface. Every URL must have a fundamental [url_syntax], and it
* is only possible to create URLs conforming to the syntax. See
* [make_url] for further information.
*)
;;
val url_syntax_is_valid : url_syntax -> bool
(** [url_syntax_is_valid syn] checks whether the syntax [syn] is valid,
* and returns [true] if so. Being valid means:
* - If passwords are recognized, users (and hosts) must be recognized, too
* - If ports are recognized, hosts must be recognized, too
* - If users are recognized, hosts must be recognized, too
* - Either the syntax recognizes one of the phrases
* \{ user, password, host, port, path \}, or the syntax recognizes
* the phrase 'other'.
*)
val partial_url_syntax : url_syntax -> url_syntax
(** [partial_url_syntax syn] transforms the syntax [syn] into another
* syntax where all required parts are changed into optional parts.
* The input syntax is not modified; a new value is returned.
*
* See also the switch [url_enable_relative] of [url_syntax], which
* this function is documented to check (NOTE(review): the exact effect
* of the flag is not visible in this interface - confirm against the
* implementation).
*)
(* Note that none of the following url_syntaxes allows 8-bit bytes. *)
val null_url_syntax : url_syntax
(** A URL syntax that recognizes nothing. It is intended as the base
* for your own definitions: override only the fields you need, e.g.
* {[
* let my_syntax = { null_url_syntax with
* url_enable_host = Url_part_required; ... }
* ]}
*)
val ip_url_syntax : url_syntax
(** Syntax for IP based protocols. This syntax allows the components
* scheme, user, password, host, port, path, param, query, and fragment,
* but not "other".
* It does not accept 8 bit bytes.
*)
val common_url_syntax : (string, url_syntax) Hashtbl.t
(** Syntax descriptions for common URL schemes. The key of the hashtable
* is the scheme name, and the value is the corresponding syntax.
*
* - ["file"]: scheme, host?, path
* - ["ftp"]: scheme, user?, password?, host, port?, path?, param?
* Note: param is not checked.
* - ["http"], ["https"]:
* scheme, user?, password?, host, port?, path?, query?
* - ["mailto"]: scheme, other, query? (RFC 2368)
* - ["pop"], ["pops"]: scheme, user?, user_param?, password?, host, port?
* Note: user_param is not checked.
* (RFC 2384)
* - ["imap"], ["imaps"]: scheme, user?, user_param?, password?, host, port?,
* path?, query? (RFC 2192)
* Note: "param" is intentionally not recognized to get the resolution of
* relative URLs as described in the RFC. When analysing this kind of URL,
* it is recommended to re-parse it with "param" enabled.
* - ["news"]: scheme, other (RFC 1738)
* - ["nntp"], ["nntps"]: scheme, host, port?, path (with two components)