Matt McDonnell's Blog

Blogging using Ocaml

Thu Jul 13 09:58:41 BST 2006

Write an Ocamllex/Ocamlyacc program to parse text from my log files into a blog format. Get the lexer to recognise dates in the standard Unix date format, and take this to represent the start of a new blog entry. Immediately following the date can come an optional tags field, stating the categories that the entry should be filed under. This is terminated by two newlines and then the blog entry proper appears, until the next date field at the start of a line by itself.

As a first attempt just use a modified key/value parser, replacing the delimiter character with a date string, and adding entries to the list as tuples of (date string, tags string, entry string). This is fairly trivial to accomplish, since the parser only needs to recognise five tokens: DATE, TAG, ENTRY, EOL (end of line) and EOF (end of file).

Once the text file is parsed into a list of entry tuples, the entries themselves can be filtered through Markdown to convert the formatted text into HTML. Initially this can be done by an OS command that calls the Markdown Perl script, but ideally a Markdown parser should be written in OCaml, so that it can be compiled into the overall program.

blogLex.mll:

{ 
  (* The token constructors returned by the rule actions (DATE, TAG, ENTRY,
     EOL, EOF) are taken from the parser module. *)
  open BlogParse
  (* Line counter, starting at 1. The rule actions increment it each time
     they consume a newline, and the lexing error message reports it. *)
  let line=ref 1
}

(* Named regular expressions used by the [token] rule.

   Together they describe the standard Unix date format, for example
   "Thu Jul 13 09:58:41 BST 2006", which marks the start of a blog entry. *)
let digit = ['0'-'9']
let number = digit+
let ws = ['\t' ' ']
let day = "Sun" | "Mon" | "Tue" | "Wed" | "Thu" | "Fri" | "Sat"
(* All twelve month abbreviations must be listed: a date line whose month is
   not in this set does not match [date] and is lexed as ordinary entry text
   instead of starting a new entry. *)
let month = "Jan" | "Feb" | "Mar" | "Apr" | "May" | "Jun" | "Jul" | "Aug"
            | "Sep" | "Oct" | "Nov" | "Dec"
let time = digit digit ':' digit digit ':' digit digit
(* Time zone abbreviation: two or three upper-case letters. *)
let tz = ['A'-'Z']['A'-'Z'] | ['A'-'Z']['A'-'Z']['A'-'Z']
(* day, month, day of month, time, time zone, year *)
let date = day ws+ month ws+ number ws+ time ws+ tz ws+ number
let delim = date (* Use date on line by itself to signal each
                         new entry *)
let punct = ['.' ',' '?' '-']
(* A non-empty run of characters that are not newlines. *)
let gentext = [^'\n']+

(* [token lexbuf] returns the next token for the blog parser.

   - A date followed by a newline yields [DATE date_string].
   - A line of the form "%name: value" yields [TAG (name, value)].
   - Any other non-empty run of non-newline characters yields [ENTRY text].
   - A bare newline yields [EOL]; end of input yields [EOF].

   ocamllex picks the longest match, so the DATE and TAG rules (which also
   consume the trailing newline) take priority over ENTRY on lines they
   match.

   The tag name may not contain ':' and the tag value starts at the first
   non-blank character after the colon. This makes the substrings bound to
   [tag] and [tagstr] unique; with patterns that overlap, the choice of
   binding is not determined. A '%' line whose value is empty or only
   whitespace is therefore lexed as ENTRY text rather than as a TAG.

   Raises [Failure] with the offending lexeme and the current line number if
   no other rule matches. *)
rule token = parse
  | (date as key) '\n'
      { incr line; DATE key }
  | '%' ([^'\n' ':']+ as tag) ':' ws*
    ([^'\n' '\t' ' '] [^'\n']* as tagstr) '\n'
      { incr line; TAG (tag, tagstr) }
  | gentext as str
      { ENTRY str }
  | '\n'
      { incr line; EOL }
  | _ {failwith((Lexing.lexeme lexbuf) ^ 
                 ": mistake at line " ^
                 string_of_int !line)}
  | eof  {EOF}

{
}

blogParse.mly:

%{
  (* No helper code is needed by the semantic actions. *)
%}

/* Tokens without a payload: end of line and end of file. */
%token EOL EOF
/* DATE carries the date string that opens an entry. */
%token <string> DATE 
/* ENTRY carries a run of body text. */
%token <string> ENTRY
/* TAG carries a (tag name, tag value) pair. */
%token <string * string> TAG
/* NOTE(review): SUBJECT is declared but is neither used by any grammar rule
   nor returned by the lexer in this listing; confirm before relying on it. */
%token <string> SUBJECT
%start main
/* main yields one (date, tag list, body text) triple per entry. */
%type <(string * ((string * string) list) * string) list> main
%%
%%

/* main: the whole input, i.e. a sequence of dated entries terminated by EOF.
   The result lists the entries in the order they appear in the input. */
main:
  | kvpair main { $1 :: $2 }
  | EOF         { [] }
;

/* kvpair: one blog entry as a (date, tags, body) triple.
   The body fragments are gathered in a list and joined once, so the cost
   stays linear in the length of the entry. */
kvpair:
  | DATE entries      { ($1, [], String.concat "" $2) }
  | DATE tags entries { ($1, $2, String.concat "" $3) }
;

/* tags: one or more TAG tokens directly after the date, in input order. */
tags:
  | TAG tags { $1 :: $2 }
  | TAG      { [$1] }
;

/* entry: one fragment of body text; EOL is turned back into a newline. */
entry:
  | ENTRY { $1 }
  | EOL   { "\n" }
;

/* entries: the body fragments of an entry, in input order.
   The list may be empty, so a date line that is immediately followed by
   another date or by the end of the file gives an entry with an empty body
   instead of a syntax error. */
entries:
  | /* empty */   { [] }
  | entry entries { $1 :: $2 }
;

blogmain.ml:

(* [main ()] parses a blog log from standard input and writes it to standard
   output.

   Each entry is printed as "##" followed by its date, then its tags (one per
   line, with any "subject" tag first and rendered as a "###" heading), then
   the entry text. Entries are printed in the reverse of their order in the
   input.

   Exceptions raised by the lexer or parser are not caught here. *)
let main () =
  let lexbuf = Lexing.from_channel stdin in
  let parselist = BlogParse.main BlogLex.token lexbuf in
  (* Sort the tags in order of precedence, subject > any other tag.
     Ranking each tag gives a consistent ordering (two "subject" tags compare
     equal), and the stable sort keeps tags of equal rank in input order. *)
  let sort_tags taglst =
    let rank (tag, _) = if tag = "subject" then 0 else 1 in
    List.stable_sort
      (fun a b -> (compare : int -> int -> int) (rank a) (rank b))
      taglst in
  (* Render one tag: the subject becomes a heading, others "name: value". *)
  let print_tag (tag, tagstr) = match tag with
    | "subject" -> "###" ^ tagstr
    | _ -> tag ^ ": " ^ tagstr in
  (* Render all tags of an entry, one per line, in precedence order. *)
  let extract_tags taglst =
    String.concat ""
      (List.map (fun tag -> print_tag tag ^ "\n") (sort_tags taglst)) in
  List.iter
    (fun (k, tlst, v) ->
      Printf.printf "##%s\n%s\n%s\n" k (extract_tags tlst) v)
    (List.rev parselist);;

(* Program entry point: run [main] under [Printexc.print], so that an
   uncaught exception is printed on standard error before it is re-raised. *)
let () = Printexc.print main ()

[code]

[permlink]

code (24)

cooling (4)

erlang (5)

ideas (19)

lisp (1)

me (11)

notes (4)

ocaml (1)

physics (45)

politics (5)

qo (7)

security (2)

unix (6)

vim (3)

Matt's Blog

Blogging using Ocaml

Thu Jul 13 09:58:41 BST 2006