Source File csv/csv.ils

    1 ;;; csv.ils --- Simple CSV parser (Excel-style, text/csv)
    2 
    3 ;; Copyright (C) 2012-2013  Damien Diederen
    4 
    5 ;; @author   Damien Diederen <dd@crosstwine.com>
    6 ;; @keywords csv
    7 
    8 ;; All Rights Reserved.
    9 ;;
   10 ;; NOTICE: All information, intellectual and technical concepts
   11 ;; contained herein are, and remain the property of Damien Diederen
   12 ;; and his suppliers, if any.  Dissemination of this information or
   13 ;; reproduction of this material is strictly forbidden unless prior
   14 ;; written permission is obtained from Damien Diederen.
   15 
   16 ;;; Commentary:
   17 
   18 ;; Package csv (global prefix VedaCsv) provides a simple parser
   19 ;; matching the syntax produced by Microsoft Excel in a U.S. locale.
   20 ;;
   21 ;; Basic usage:
   22 ;;
   23 ;;     $ cat Book1.csv
   24 ;;     A,1
   25 ;;     B,2
   26 ;;     "crazy""column
   27 ;;     with CR and all!",3
   28 ;;
   29 ;;     $ $CDSHOME/tools/dfII/bin/skill
   30 ;;     (load "pkg/csv/csv.ils")
   31 ;;     |- t
   32 ;;     (VedaCsvParseFile "./Book1.csv")
   33 ;;     |- (("A" "1")
   34 ;;     |   ("B" "2")
   35 ;;     |   ("crazy\"column\nwith CR and all!" "3"))
   36 
   37 ;;; Code:
   38 
   39 ;; VedaCsvParser holds the mutable state of a single CSV parse.
   40 ;;
      ;; Slots:
      ;;   filename  name used by Error to prefix diagnostics; may be nil,
      ;;             in which case Error substitutes "<input>".
      ;;   line      current line number, starting at 1; ParseCharStream
      ;;             increments it for every line feed it reads.
      ;;   cell      characters of the cell being collected, most recent
      ;;             first (see PushToCell and FinishCell).
      ;;   row       finished cell strings of the current row, most recent
      ;;             first (see FinishCell and FinishRow).
      ;;   rows      finished rows, most recent first (see FinishRow and
      ;;             Rows).
      ;;
   41 ;; @ignore Internal
   42 (defclass VedaCsvParser ()
   43   ((filename @initarg filename)
   44    (line     @initform 1)
   45    (cell     @initform nil)
   46    (row      @initform nil)
   47    (rows     @initform nil)))
   48 
   49 ;; Package entry points.
      ;;
      ;; The three globals are defined here as nil placeholders.  The let
      ;; block that follows assigns the actual functions to them with setq,
      ;; which lets the helper functions and constants it defines stay
      ;; scoped to that block.
   50 
   51 (define VedaCsvParseLines nil)
   52 (define VedaCsvParsePort nil)
   53 (define VedaCsvParseFile nil)
   54 
   55 ;; Scope/hide utility functions and related data.
   56 (let ()
   57   ;; Constants used during parsing.
        ;;
        ;; Each ascii* constant is the character value for one significant
        ;; character; the state functions compare the characters delivered by
        ;; the character source against them with eq.
   58 
        ;; The double quote that opens/closes a quoted cell.
   59   (define asciiQuotationMark
   60       (getchar "\"" 1))
   61 
        ;; The cell separator.
   62   (define asciiComma
   63       (getchar "," 1))
   64 
        ;; Line feed (code 0x0a): ends a row, and is counted by
        ;; ParseCharStream to track the current line number.
   65   (define asciiLineFeed
   66       (intToChar 0xa))
   67 
        ;; Carriage return (code 0x0d): ignored by CellState outside of
        ;; quoted cells.
   68   (define asciiCarriageReturn
   69       (intToChar 0x0d))
   70 
        ;; Values that terminate a row in CellState: nil, which the character
        ;; source returns at EOF, and line feed.  Tested with memq.
   71   (define endOfRow
   72       (list nil asciiLineFeed))
   73 
   74   ;; Create a parser instance for one parse run.
        ;;
        ;; filename is stored in the filename slot of the new VedaCsvParser
        ;; (it may be nil); Error reads it to prefix diagnostics.  The other
        ;; slots start from the initforms declared on the class.
   75   (defun MakeParser (filename)
   76     (makeInstance 'VedaCsvParser ?filename filename))
   77 
   78   ;; Parser state manipulation.
   79 
   80   (defun FinishCell (p)
          ;; Close the cell being collected: p->cell holds its characters
          ;; most recent first, so put them back in reading order, join them
          ;; into a string, and add that string to the front of p->row.  The
          ;; character buffer is then emptied for the next cell.
   81     (let ((chars (reverse p->cell)))
   82       (p->row = (cons (buildString chars "") p->row))
   83       (p->cell = nil)))
   85 
   86   (defun FinishRow (p)
          ;; Close the current row: p->row holds its cells most recent first,
          ;; so restore reading order and add the row to the front of p->rows.
          ;; Both the row and the cell buffers are then cleared.
   87     (let ((cells (reverse p->row)))
   88       (p->rows = (cons cells p->rows))
   89       (p->cell = nil)
   90       (p->row = nil)))
   91 
   92   (defun Rows (p)
          ;; Return the rows collected so far in input order.  FinishRow adds
          ;; each finished row to the front of p->rows, hence the reverse.
   93     (reverse p->rows))
   94 
   95   (defun PushToCell (p c "gs")
          ;; Add character c to the cell being collected.  Characters are
          ;; kept most recent first; FinishCell restores reading order.
   96     (p->cell = (cons c p->cell)))
   98 
   99   (defun Error (p msg "gt")
          ;; Signal a parse error.  The message is prefixed with
          ;; "<filename>:<line>", where the file name falls back to "<input>"
          ;; when the parser was created without one.
  100     (letseq ((source (or p->filename "<input>"))
  101              (lineno (sprintf nil "%L" p->line))
  102              (prefix (buildString (list source lineno) ":")))
  103       (error "%s: %s" prefix msg)))
  105 
  106   ;; State functions.
  107 
  108   ;; State: the cursor sits at the start of a row, either at the very
  109   ;; beginning of the input or right after a line feed.
        ;;
        ;; At EOF (c is nil) there is nothing left to collect and nil is
        ;; returned, which stops the parse.  Otherwise the row and cell
        ;; buffers are cleared and c is handled as the first character of a
        ;; cell; the result is whatever CellState returns.
  110   (defun NewRowState (p c)
  111     (when c
  112       (p->cell = nil)
  113       (p->row = nil)
  114       (CellState p c)))
  115 
  116   ;; State: the cursor is collecting the characters of an unquoted cell.
        ;;
        ;; Returns the state function to apply to the next character, or nil
        ;; once EOF has been consumed.
  117   (defun CellState (p c)
  118     (cond
  119       ((memq c endOfRow)
  120        ;; Line feed or EOF (nil): close the pending cell and its row.
             ;; Only a line feed leaves more input to read.
  121        (FinishCell p)
  122        (FinishRow p)
  123        (and c NewRowState))
  124       ((eq c asciiCarriageReturn)
  125        ;; Carriage returns outside of quotes are dropped.
  126        CellState)
  127       ((eq c asciiQuotationMark)
  128        QuotedCellState)
  129       ((eq c asciiComma)
  130        (FinishCell p)
  131        CellState)
  132       (t
  133        (PushToCell p c)
  134        CellState)))
  135 
  136   ;; State: an opening quotation mark has been seen and the cursor is
  137   ;; collecting characters inside the quoted section.
        ;;
        ;; EOF in this state is an error.  A quotation mark hands over to
        ;; QuotedCellStateSeenQuote; any other character is added to the cell
        ;; verbatim.
  138   (defun QuotedCellState (p c)
  139     (unless c
  140       (Error p "Unexpected EOF within quoted CSV cell."))
  141     (if (eq c asciiQuotationMark)
  142         QuotedCellStateSeenQuote
  143         (progn
  144           (PushToCell p c)
  145           QuotedCellState)))
  147 
  148   ;; State: a quotation mark has just been read inside a quoted
  149   ;; section.  A second one right after it stands for a literal
  150   ;; quotation mark; anything else means the quoted section is over.
  151   (defun QuotedCellStateSeenQuote (p c)
  152     (if (neq c asciiQuotationMark)
  153         ;; The section was closed: treat c as ordinary cell input.
              (CellState p c)
  154         (progn
  155           (PushToCell p c)
  156           QuotedCellState)))
  158 
  159   ;; Parsing functions
  160 
  161   ;; Run the state machine over the characters produced by cs, a
  162   ;; function of no arguments returning the next character symbol, or
  163   ;; nil at EOF.  Returns the parsed rows in input order.
  164   ;;
  165   ;; filename, when passed and non-nil, is only used in diagnostics.
  166   (defun ParseCharStream (cs @optional filename)
  167     (let ((parser (MakeParser filename))
  168           (step NewRowState)
  169           ch)
  170       ;; Each state function returns its successor; nil ends the parse.
          (while step
  171         (setq ch (funcall cs))
  172         (when (eq ch asciiLineFeed) (parser->line = (parser->line + 1)))
  173         (setq step (funcall step parser ch)))
  174       (Rows parser)))
  175 
  176   ;; Build a character source over a list of strings, as described in
  177   ;; ParseLines: the characters of each string in turn, with a line
        ;; feed between consecutive strings, then nil (EOF) forever after.
        ;; An empty list yields a source that is at EOF right away.
  178   (defun MakeLinesCharStream (lines)
  179     (if (null lines)
  180         (lambda () nil)
  181         (letseq ((text (car lines))
  182                  (pending (cdr lines))
  183                  (idx 1)
  184                  (len (strlen text)))
  185           (lambda ()
  186             (cond
  187               ((idx <= len)
                   ;; Still inside the current string: deliver and advance.
  188                (let ((ch (getchar text idx)))
  189                  (setq idx (idx + 1))
  190                  ch))
  191               (pending
                   ;; Current string exhausted: switch to the next one and
                   ;; deliver the line feed that separates the two.
  192                (setq text (car pending))
  193                (setq pending (cdr pending))
  194                (setq idx 1)
  195                (setq len (strlen text))
  196                asciiLineFeed)
  197               (t nil))))))
  198 
  199   ;; ParseLines parses a list of string "lines" as a CSV stream.  The
  200   ;; list of strings is interpreted as if joined by:
  201   ;;
  202   ;;     (buildString lines "\n")
  203   ;;
  204   ;; but the code doesn't do that to avoid triggering string length
  205   ;; limits.  Note that each "line" can also embed one or more \n
  206   ;; characters.
  207   ;;
        ;; No file name is passed to the parser, so diagnostics are prefixed
        ;; with "<input>".
        ;;
  208   ;; Cf. ParseFile for more information and return specification.
  209   (defun ParseLines (lines "l")
  210     (ParseCharStream (MakeLinesCharStream lines)))
  211 
        ;; Publish ParseLines under its global name.
  212   (setq VedaCsvParseLines ParseLines)
  213 
  214   ;; ParsePort parses text read from port as a CSV stream.  Parsing
  215   ;; finishes when EOF is reached, but the port is not closed.
  216   ;;
        ;; Characters are pulled from port one at a time with getc.  No file
        ;; name is passed to the parser, so diagnostics are prefixed with
        ;; "<input>".
        ;;
  217   ;; Cf. ParseFile for more information and return specification.
  218   (defun ParsePort (port "p")
  219     (ParseCharStream (lambda () (getc port))))
  220 
        ;; Publish ParsePort under its global name.
  221   (setq VedaCsvParsePort ParsePort)
  222 
  223   ;; ParseFile parses filename as a CSV stream, returning a list of
  224   ;; lists of strings; the upper-left corner of a spreadsheet filled
  225   ;; with cell addresses would parse as:
  226   ;;
  227   ;;     (("A1" "B1" "C1")
  228   ;;      ("A2" "B2" "C2")
  229   ;;      ("A3" "B3" "C3"))
  230   ;;
  231   ;; Escape sequences and in-cell carriage returns are supported, but
  232   ;; no data interpretation is done besides parsing to strings.
  233   ;;
  234   ;; Errors are thrown if filename cannot be opened for reading, or when
  235   ;; EOF is encountered within a quoted cell.  The port opened on
        ;; filename is closed before returning, in the error case as well.
  236   ;;
  237   ;; "Staggered" spreadsheets, or streams ending without a final
  238   ;; newline, are not error conditions; they produce the "obvious"
  239   ;; result:
  240   ;;
  241   ;;     a,b,c\n         (("a" "b" "c")
  242   ;;     d,e\n      =>    ("d" "e")
  243   ;;     f,g,h\n          ("f" "g" "h")
  244   ;;     i                ("i"))
  245   (defun ParseFile (filename "t")
  246     (let ((port (or (infile filename)
  247                     (error "Unable to open %L for reading" filename))))
  248       ;; unwindProtect runs (close port) whether parsing returns normally
            ;; or signals an error (e.g. EOF within a quoted cell), so the
            ;; port is never left open.  Its value is that of the protected
            ;; form, i.e. the parsed rows.
            ;; NOTE: requires a SKILL release that provides unwindProtect.
  249       (unwindProtect
  250           (ParseCharStream (lambda () (getc port)) filename)
  251         (close port))))
  252 
  253   (setq VedaCsvParseFile ParseFile))
  254 
  255 ;;; csv.ils ends here