(* $Id: mimestring.mli 1003 2006-09-24 15:17:15Z gerd $
* ----------------------------------------------------------------------
*
*)
(** Low-level functions to parse and print mail and MIME messages
*
* [Mimestring] contains a lot of functions to scan and print strings
* formatted as MIME messages. For a higher-level view on this topic,
* see the [Netmime] module.
*
* {b Contents}
* - {!Mimestring.headers}
* - {!Mimestring.structured_values}
* - {!Mimestring.parsers_for_structured_values}
* - {!Mimestring.printers_for_structured_values}
* - {!Mimestring.scanning_mime}
* - {!Mimestring.helpers_mime}
*
*)
(* *********************************************************************)
(* Collection of auxiliary functions to parse MIME headers *)
(* *********************************************************************)
(* See also the module Netmime for high-level MIME functions *)
(** {1:headers Parsing and Printing Mail Headers} *)
val scan_header :
?downcase:bool -> (* default: true *)
?unfold:bool -> (* default: true *)
?strip:bool -> (* default: false *)
string -> start_pos:int -> end_pos:int ->
((string * string) list * int)
(** [let params, header_end_pos = scan_header s ~start_pos ~end_pos]:
*
* Scans the mail header that begins at position [start_pos] in the string
* [s] and that must end somewhere before position [end_pos]. [end_pos] is
* intended to be the position of the character following the end of the
* body of the MIME message.
*
* Returns the parameters of the header as [(name,value)] pairs (in
* [params]), and in [header_end_pos] the position of the character
* directly following the header (i.e. after the blank line separating
* the header from the body).
*
* The following normalizations can be applied to the result:
* - (D) The names are converted to lowercase characters
* - (U) Newline characters (CR and LF) in the middle of the header fields
* are removed
* - (S) Whitespace at the beginning and at the end of field values is
* removed
*
* The default is to apply all three normalizations (D), (U), and (S)
* (for historic reasons). The three arguments [downcase], [unfold],
* and [strip] control which normalizations are performed (and, also for
* historic reasons, this is not what you would expect - backwards
* compatibility can sometimes be a burden):
*
* - If [downcase], do (D); if [not downcase], don't do (D).
* - If [unfold], do (U); if [not unfold], don't do (U).
* - If [unfold || strip], do (S); if [not unfold && not strip],
* don't do (S)
* - Defaults: [downcase], [unfold], [not strip].
*
* This means that [unfold] not only removes CR/LF from the field value,
* but also removes whitespace at the beginning and at the end of the
* field value. With [strip] but without [unfold], CR/LF occurring
* somewhere within the field value is kept, but all whitespace (including
* CR/LF) at the beginning of the field value and at the end of the
* field value is still deleted. Note that if you only want (S)
* you have to pass [~unfold:false] and [~strip:true].
*
* The rules to postprocess mail messages in MIME format are {b not}
* applied (e.g. encoding transformations as indicated by RFC 2047).
*
* The function fails if the header strongly violates the header
* format. (Some minor deviations are tolerated, e.g. it is sufficient
* to separate lines by only LF instead of CRLF.)
*
* {b The Format of Mail Messages}
*
* Messages
* consist of a header and a body; the first empty line separates the two
* parts. The header contains lines "{i param-name}[:] {i param-value}" where
* the param-name must begin on column 0 of the line, and the "[:]"
* separates the name and the value. So the format is roughly:
*
* {[
* param1-name: param1-value
* ...
* paramN-name: paramN-value
* _
* body ]}
*
* (Where "_" denotes an empty line.)
*
* This function expects in [start_pos] the position of the first character
* of [param1-name] in the string, and in [end_pos] the position of the
* character following [body]. It returns as [header_end_pos] the position
* where [body] begins. Furthermore, all parameters the function finds in
* the header are returned in [params].
*
* {b Details}
*
* Note that parameter values are restricted; you cannot represent
* arbitrary strings. The following problems can arise:
* - Values cannot begin with whitespace characters, because there
* may be an arbitrary number of whitespace characters between the "[:]"
* and the value.
* - Values (and names of parameters, too) must only be formed of
* 7 bit ASCII characters. (If this is not enough, the MIME standard
* provides the extension RFC 2047, which allows header values to
* be composed of arbitrary characters of arbitrary character sets.
* See below how to decode such characters in values returned by
* this function.)
* - Header values may be broken into several lines. Continuation
* lines must begin with whitespace characters. This means that values
* must not contain line breaks as a semantic part of the value.
* It may also mean that {i one} whitespace character is not
* distinguishable from {i several} whitespace characters.
* - Header lines must not be longer than 78 characters (soft limit) or
* 998 characters (hard limit). Values that
* would result in longer lines must be broken into several lines.
* This means that you cannot represent strings that contain too few
* whitespace characters.
* (Note: The soft limit exists so that user agents do not have problems
* with long lines. The hard limit means that transfer agents sometimes
* do not transfer longer lines correctly.)
* - Some old gateways pad the lines with spaces at the end of the lines.
*
* This implementation of a mail scanner tolerates a number of
* deviations from the standard: long lines are not rejected; 8 bit
* values are generally accepted; lines may be terminated by LF only
* instead of CRLF.
*
* Furthermore, the transformations (D), (U), and (S) can be performed,
* resulting in values that are simpler to process.
*
* {b Compatibility}
*
* This function can parse all mail headers that conform to RFC 822 or
* RFC 2822.
*
* There may still be problems, however, as RFC 822 allows some crazy
* representations that are not actually used in practice.
* In particular, RFC 822 allows backslashes to be used to "indicate"
* that a CRLF sequence is semantically meant as a line break. As this
* function normally deletes CRLFs, it is not possible to recognize such
* indicators in the result of the function.
*)
val read_header :
?downcase:bool ->
?unfold:bool ->
?strip:bool ->
Netstream.in_obj_stream ->
(string * string) list
(** [read_header stream] reads a mail header from [stream].
*
* This function expects that the current position of the passed
* [in_obj_stream] is the first byte of the header. The function scans the
* header and returns it. Afterwards, the stream position is after
* the header and the terminating empty line (i.e. at the beginning of
* the message body).
*
* The options [downcase], [unfold], and [strip] have the same meaning
* as in {!Mimestring.scan_header}.
*
* {b Example}
*
* To read the header of the mail message "[file.txt]":
*
* {[
* let ch = Netchannels.input_channel (open_in "file.txt") in
* let stream = Netstream.input_stream ch in
* let header = read_header stream in
* stream#close_in() (* no need to close ch *)
* ]}
*)
val write_header :
?soft_eol:string -> (* default: "\r\n" *)
?eol:string -> (* default: "\r\n" *)
Netchannels.out_obj_channel ->
(string * string) list ->
unit
(** This function writes the header to the passed [out_obj_channel]. The
* empty line following the header is also written.
*
* Exact output format:
* {ul
* {- The header is not folded, i.e. no additional CRLF sequences
* are inserted into the header to avoid long header lines.
* In order to produce correct headers, the necessary CRLF bytes
* must already exist in the field values. (You can use the
* function [write_value] below for this.)}
* {-