(* Compiling a lexer definition *)

#open "syntax";;
#open "sort";;

(* Deep abstract syntax for regular expressions *)

type regexp =
    Empty
  | Chars of int
  | Action of int
  | Seq of regexp * regexp
  | Alt of regexp * regexp
  | Star of regexp
;;

(* From shallow to deep syntax *)

let encode_lexdef (Lexdef(_, ld)) =
  let chars = ref []
  and chars_count = ref (-1)
  and actions = ref []
  and actions_count = ref (-1) in
    let rec encode_regexp = function
        Epsilon -> Empty
      | Characters cl ->
          chars := cl :: !chars; incr chars_count; Chars(!chars_count)
      | Sequence(r1,r2) ->
          Seq(encode_regexp r1, encode_regexp r2)
      | Alternative(r1,r2) ->
          Alt(encode_regexp r1, encode_regexp r2)
      | Repetition r ->
          Star (encode_regexp r) in
    let encode_casedef =
      it_list
       (fun reg (expr,act) ->
         incr actions_count;
         let act_num = !actions_count in
           actions := (act_num, act) :: !actions;
           Alt(reg, Seq(encode_regexp expr, Action act_num)))
       Empty in
    let name_regexp_list =
      map (fun (name, casedef) -> (name, encode_casedef casedef)) ld
    in
      (vect_of_list (rev !chars)), name_regexp_list, !actions
;;

(* To generate directly a NFA from a regular expression.
   Confer Aho-Sethi-Ullman, dragon book, chap. 3 *)

type transition =
    OnChars of int
  | ToAction of int
;;

let merge_trans = merge (fun
    (OnChars n1)  (OnChars n2)  -> n1<n2
  | (ToAction n1) (ToAction n2) -> n1<n2
  | (ToAction _)  (OnChars _)   -> true
  | (OnChars _)   (ToAction _)  -> false)
;;

let rec nullable = function
    Empty      -> true
  | Chars _    -> false
  | Action _   -> false
  | Seq(r1,r2) -> nullable r1 & nullable r2
  | Alt(r1,r2) -> nullable r1 or nullable r2
  | Star r     -> true
;;

let rec firstpos = function
    Empty      -> []
  | Chars pos  -> [OnChars pos]
  | Action act -> [ToAction act]
  | Seq(r1,r2) -> if nullable r1
                  then merge_trans (firstpos r1) (firstpos r2)
                  else firstpos r1
  | Alt(r1,r2) -> merge_trans (firstpos r1) (firstpos r2)
  | Star r     -> firstpos r
;;

let rec lastpos = function
    Empty      -> []
  | Chars pos  -> [OnChars pos]
  | Action act -> [ToAction act]
  | Seq(r1,r2) -> if nullable r2
                  then merge_trans (lastpos r1) (lastpos r2)
                  else lastpos r2
  | Alt(r1,r2) -> merge_trans (lastpos r1) (lastpos r2)
  | Star r     -> lastpos r
;;

let followpos size name_regexp_list =
  let v = make_vect size [] in
    let fill_pos first = function
        OnChars pos -> v.(pos) <- merge_trans first v.(pos); ()
      | ToAction _  -> () in
    let rec fill = function
        Seq(r1,r2) ->
          fill r1; fill r2;
          do_list (fill_pos (firstpos r2)) (lastpos r1)
      | Alt(r1,r2) ->
          fill r1; fill r2
      | Star r ->
          fill r;
          do_list (fill_pos (firstpos r)) (lastpos r)
      | _ -> () in
    do_list (fun (name, regexp) -> fill regexp) name_regexp_list;
    v
;;

let no_action = 32767;;

let split_trans_set = it_list
  (fun (act, pos_set as act_pos_set) ->
     function OnChars pos   -> (act, pos :: pos_set)
         |    ToAction act1 -> if act1 < act then (act1, pos_set)
                                             else act_pos_set)
  (no_action, [])
;;

let memory  = ref ([] : (transition list * int) list)
and todo    = ref ([] : (transition list * int) list)
and next    = ref 0
;;

let reset_state_mem () =
  memory := []; todo := []; next := 0; ()
;;

let get_state st = 
  try
    assoc st !memory
  with Not_found ->
    let (_, nbr as st_nbr) = (st, !next) in
      incr next;
      memory := st_nbr :: !memory;
      todo := st_nbr :: !todo;
      nbr
;;

let rec map_on_states f =
  match !todo with
    []  -> []
  | (st,i)::r -> todo := r; let res = f st in (res,i) :: map_on_states f
;;

let number_of_states () =
  !next
;;

let goto_state = function
    [] -> Backtrack
  | ps -> Goto (get_state ps)
;;

let make_dfa lexdef =
  let (chars, name_regexp_list, actions) =
    encode_lexdef lexdef in
  let follow =
    followpos (vect_length chars) name_regexp_list in
  let transition_from pos_set = 
    let tr = make_vect 256 []
    and shift = make_vect 256 Backtrack in
      do_list
        (fun pos ->
          do_list
            (fun c ->
               tr.(int_of_char c) <-
                 merge_trans tr.(int_of_char c) follow.(pos))
            chars.(pos))
        pos_set;
      for i = 0 to 255 do
        shift.(i) <- goto_state tr.(i)
      done;
      shift in
  let translate_state state =
    match split_trans_set state with
      n, [] -> Perform n
    | n, ps -> Shift( (if n == no_action then No_remember else Remember n),
                      transition_from ps) in
  reset_state_mem();
  let initial_states =
    map (fun (name, regexp) -> (name, get_state(firstpos regexp)))
        name_regexp_list in
  let states =
    map_on_states translate_state in
  let v =
    make_vect (number_of_states()) (Perform 0)
  in
    do_list (fun (auto, i) -> v.(i) <- auto) states;
    reset_state_mem();
    (initial_states, v, actions)
;;
