Source File csv/csv.ils

    1 ;;; csv.ils --- Simple CSV parser (Excel-style, text/csv)
    2 
    3 ;; Copyright (C) 2012-2013  Damien Diederen
    4 
    5 ;; @author   Damien Diederen <dd@crosstwine.com>
    6 ;; @keywords csv
    7 
    8 ;; All Rights Reserved.
    9 ;;
   10 ;; NOTICE: All information, intellectual and technical concepts
   11 ;; contained herein are, and remain the property of Damien Diederen
   12 ;; and his suppliers, if any.  Dissemination of this information or
   13 ;; reproduction of this material is strictly forbidden unless prior
   14 ;; written permission is obtained from Damien Diederen.
   15 
   16 ;;; Commentary:
   17 
;; Package csv (global prefix VedaCsv) provides a simple parser
;; matching the syntax produced by Microsoft Excel in a U.S. locale.
   20 ;;
   21 ;; Basic usage:
   22 ;;
   23 ;;     $ cat Book1.csv
   24 ;;     A,1
   25 ;;     B,2
   26 ;;     "crazy""column
   27 ;;     with CR and all!",3
   28 ;;
   29 ;;     $ $CDSHOME/tools/dfII/bin/skill
   30 ;;     (load "pkg/csv/csv.ils")
   31 ;;     |- t
   32 ;;     (VedaCsvParseFile "./Book1.csv")
   33 ;;     |- (("A" "1")
   34 ;;     |   ("B" "2")
   35 ;;     |   ("crazy\"column\nwith CR and all!" "3"))
   36 
   37 ;;; Code:
   38 
   39 ;; Parser holds the CSV parser state.
   40 ;;
   41 ;; @ignore Internal
   42 (defclass VedaCsvParser ()
   43   ((filename @initarg filename)
   44    (line     @initform 1)
   45    (cell     @initform nil)
   46    (row      @initform nil)
   47    (rows     @initform nil)))
   48 
   49 ;; Scope/hide utility functions and related data.
   50 (let ()
   51   ;; Export exports function fn as a global symbol with the VedaCsv
   52   ;; prefix.
   53   (defun Export (suffix fn)
   54     (putd (concat 'VedaCsv suffix) fn))
   55 
   56   ;; Constants used during parsing.
   57 
   58   (define asciiQuotationMark (intToChar 0x22))
   59 
   60   (define asciiComma (intToChar 0x2c))
   61 
   62   (define asciiLineFeed (intToChar 0xa))
   63 
   64   (define asciiCarriageReturn (intToChar 0x0d))
   65 
   66   (define endOfRow (list nil asciiLineFeed))
   67 
   68   ;; Create a parser instance
   69   (defun MakeParser (filename)
   70     (makeInstance 'VedaCsvParser ?filename filename))
   71 
   72   ;; Parser state manipulation.
   73 
   74   (defun FinishCell (p)
   75     (letseq ((str (buildString (reverse p->cell) ""))
   76              (newrow (cons str p->row)))
   77       (p->row = newrow)
   78       (p->cell = nil)))
   79 
   80   (defun FinishRow (p)
   81     (let ((newrows (cons (reverse p->row) p->rows)))
   82       (p->rows = newrows)
   83       (p->row = nil)
   84       (p->cell = nil)))
   85 
   86   (defun Rows (p)
   87     (reverse p->rows))
   88 
   89   (defun PushToCell (p c)
   90     (let ((newcell (cons c p->cell)))
   91       (p->cell = newcell)))
   92 
   93   (defun Error (p msg)
   94     (let (pieces)
   95       (push (sprintf nil "%L" p->line) pieces)
   96       (push (or p->filename "<input>") pieces)
   97       (let ((prefix (buildString pieces ":")))
   98         (error "%s: %s" prefix msg))))
   99 
  100   ;; State functions.
  101 
  ;; NewRowState: the cursor is at the start of a row, either at the
  ;; very beginning of the input or right after a line feed.
  ;;
  ;; At EOF (c is nil) nothing is recorded and nil is returned, which
  ;; stops the parse.  Otherwise the row and cell buffers are reset
  ;; and c is handled as the first character of a cell.
  (defun NewRowState (p c)
    (when c
      (p->row = nil)
      (p->cell = nil)
      (CellState p c)))
  109 
  ;; CellState: the cursor is collecting characters of an unquoted
  ;; cell.  Returns the state function to apply to the next character,
  ;; or nil once EOF has been handled.
  (defun CellState (p c)
    (cond
      ((null c)
       ;; EOF: flush the pending cell and row, then stop.
       (FinishCell p)
       (FinishRow p)
       nil)
      ((eq c asciiLineFeed)
       ;; End of row: flush and expect a new row next.
       (FinishCell p)
       (FinishRow p)
       NewRowState)
      ((eq c asciiCarriageReturn)
       ;; Carriage returns outside quotes are dropped.
       CellState)
      ((eq c asciiQuotationMark)
       ;; Opening quote: switch to quoted collection.
       QuotedCellState)
      ((eq c asciiComma)
       ;; Cell separator: flush the cell and keep collecting.
       (FinishCell p)
       CellState)
      (t
       (PushToCell p c)
       CellState)))
  129 
  ;; QuotedCellState: an opening quotation mark has been seen and the
  ;; cursor is collecting characters inside the quoted section.  Every
  ;; character except a quotation mark is taken literally; EOF here is
  ;; an error.
  (defun QuotedCellState (p c)
    (cond
      ((null c)
       (Error p "Unexpected EOF within quoted CSV cell."))
      ((eq c asciiQuotationMark)
       ;; Either the closing quote or the first half of an escaped
       ;; quote; the next character decides.
       QuotedCellStateSeenQuote)
      (t
       (PushToCell p c)
       QuotedCellState)))
  141 
  ;; QuotedCellStateSeenQuote: a quotation mark was seen inside a
  ;; quoted section.  A second quotation mark means an escaped literal
  ;; quote and quoted collection continues; anything else means the
  ;; first one closed the section, so c is handled as an unquoted
  ;; character.
  (defun QuotedCellStateSeenQuote (p c)
    (if (eq c asciiQuotationMark)
        (progn
          (PushToCell p c)
          QuotedCellState)
        (CellState p c)))
  152 
  153   ;; Parsing functions
  154 
  ;; ParseCharStream drives the state machine over the characters
  ;; produced by cs, a function of no arguments returning the next
  ;; character symbol, or nil at EOF.  Returns the parsed rows as a
  ;; list of lists of strings.
  ;;
  ;; If passed and non-nil, filename may be used for diagnostic
  ;; purposes.
  (defun ParseCharStream (cs @optional filename)
    (let ((p (MakeParser filename))
          (state NewRowState)
          c)
      ;; Each state function returns the next state, or nil to stop.
      (while state
        (setq c (funcall cs))
        ;; Track the line number for error messages.
        (when (eq c asciiLineFeed)
          (p->line = (p->line + 1)))
        (setq state (funcall state p c)))
      (Rows p)))
  169 
  ;; MakeLinesCharStream returns a character source (a function of no
  ;; arguments) over the list of strings lines, as described in
  ;; ParseLines: the characters of each string in turn, with a line
  ;; feed between consecutive strings, then nil for EOF.
  (defun MakeLinesCharStream (lines)
    (if (null lines)
        (lambda () nil)
        (letseq ((line (car lines))
                 (pending (cdr lines))
                 (end (strlen line))
                 (index 1))
          (lambda ()
            (cond
              ((index <= end)
               ;; Next character of the current line.
               (let ((ch (getchar line index)))
                 (setq index (index + 1))
                 ch))
              (pending
               ;; Current line exhausted: move to the next one and
               ;; emit the separating line feed.
               (setq line (car pending))
               (setq pending (cdr pending))
               (setq index 1)
               (setq end (strlen line))
               asciiLineFeed)
              (t
               ;; No more lines: EOF.
               nil))))))
  192 
  ;; ParseLines parses a list of string "lines" as a CSV stream.  The
  ;; list of strings is interpreted as if joined by:
  ;;
  ;;     (buildString lines "\n")
  ;;
  ;; but the strings are consumed one at a time instead, to avoid
  ;; triggering string length limits.  Note that each "line" can also
  ;; embed one or more \n characters.
  ;;
  ;; Cf. ParseFile for more information and return specification.
  (defun ParseLines (lines)
    (let ((cs (MakeLinesCharStream lines)))
      (ParseCharStream cs)))

  (Export 'ParseLines ParseLines)
  207 
  ;; ParsePort parses the text read from port as a CSV stream, one
  ;; character at a time via getc.  Parsing finishes when EOF is
  ;; reached; the port is left open for the caller to close.
  ;;
  ;; Cf. ParseFile for more information and return specification.
  (defun ParsePort (port)
    (let ((nextChar (lambda () (getc port))))
      (ParseCharStream nextChar)))

  (Export 'ParsePort ParsePort)
  216 
  ;; ParseFile parses filename as a CSV stream, returning a list of
  ;; lists of strings; the upper-left corner of a spreadsheet filled
  ;; with cell addresses would parse as:
  ;;
  ;;     (("A1" "B1" "C1")
  ;;      ("A2" "B2" "C2")
  ;;      ("A3" "B3" "C3"))
  ;;
  ;; Escaped quotation marks and line breaks inside quoted cells are
  ;; supported, but no data interpretation is done besides parsing to
  ;; strings.
  ;;
  ;; Errors are thrown if filename cannot be opened for reading, or
  ;; when EOF is encountered within a quoted cell.  The file is closed
  ;; in both the normal and the error case.
  ;;
  ;; "Staggered" spreadsheets, or streams ending without a trailing
  ;; newline, are not error conditions; they produce the "obvious"
  ;; result:
  ;;
  ;;     a,b,c\n         (("a" "b" "c")
  ;;     d,e\n      =>    ("d" "e")
  ;;     f,g,h\n          ("f" "g" "h")
  ;;     i                ("i"))
  (defun ParseFile (filename)
    (let ((port (or (infile filename)
                    (error "Unable to open %L for reading" filename))))
      ;; unwindProtect evaluates the cleanup form whether parsing
      ;; returns normally or signals an error, so the port is not
      ;; leaked when the parser reports a malformed file.
      (unwindProtect
        (ParseCharStream (lambda () (getc port)) filename)
        (close port))))

  ;; The final parenthesis below closes the enclosing (let () ...).
  (Export 'ParseFile ParseFile))
  248 
  249 ;;; csv.ils ends here